Full Text Search Example/Tour

You can use FTS5 with normal SQL as documented. This example shows APSW specific functionality and extras.

#!/usr/bin/env python3

# This code uses Python's optional typing annotations.  You can
# ignore them and do not need to use them.  If you do use them
# then you must include this future annotations line first.
from __future__ import annotations

from typing import Optional, Iterator, Any

from pprint import pprint
import re
import functools

import apsw
import apsw.ext

# The three FTS5 specific modules
import apsw.fts5
import apsw.fts5aux
import apsw.fts5query

Is FTS5 available?

FTS5 is included as part of the SQLite library (usually).

print("FTS5 available:", "ENABLE_FTS5" in apsw.compile_options)
FTS5 available: True

Content setup

The connection to use. The database has a table with recipes.

connection = apsw.Connection("recipes.db")

# The content
print(
    connection.execute(
        "SELECT sql FROM sqlite_schema WHERE name='recipes'"
    ).get
)

print(connection.execute("SELECT COUNT(*) FROM recipes").get)
CREATE TABLE recipes(name, ingredients, description)
173278

Create/open search table

Create a table containing the search index using recipes as an external content table.

if not connection.table_exists("main", "search"):
    # create does all the hard work
    search_table = apsw.fts5.Table.create(
        connection,
        # The table will be named 'search'
        "search",
        # External content table name.  It has to be in the same
        # database.
        content="recipes",
        # We want the same columns as recipes, so pass `None`.
        columns=None,
        # Triggers ensure that changes to the content table
        # are reflected in the search table
        generate_triggers=True,
        # Use APSW recommended tokenization
        tokenize=[
            # simplify tokenizer
            "simplify",
            # removes case distinction
            "casefold",
            "true",
            # removes accents, uses compatibility codepoints
            "strip",
            "true",
            # unicodewords tokenizer - uses the Unicode algorithm for
            # word boundaries
            "unicodewords",
        ],
        # There are many more options you can set
    )

else:
    # Already exists so just the name is needed
    search_table = apsw.fts5.Table(connection, "search")

# You should use this to get the table name when formatting SQL
# queries, as table names can't be supplied using bindings.  It will
# correctly quote the schema (attached database name) and the table
# name no matter what characters, spaces etc are used.

print("quoted name", search_table.quoted_table_name)
quoted name "main"."search"
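
As a sketch of that in use, here is a count query with the quoted name interpolated into the SQL, since table names can't be supplied as bindings.

# the quoted name is interpolated; values would still use bindings
search_rows = connection.execute(
    f"SELECT count(*) FROM {search_table.quoted_table_name}"
).get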

Table structure and statistics

Examine the structure, options, and statistics

pprint(search_table.structure)

# rank is one of several options you can read or change
print(f"{search_table.config_rank()=}")

# some statistics
print(f"{search_table.row_count=}")

print(f"{search_table.tokens_per_column=}")
FTS5TableStructure(name='search',
                   columns=('name', 'ingredients', 'description'),
                   unindexed=set(),
                   tokenize=('simplify',
                             'casefold',
                             'true',
                             'strip',
                             'true',
                             'unicodewords'),
                   prefix=set(),
                   content='recipes',
                   content_rowid='_ROWID_',
                   contentless_delete=None,
                   contentless_unindexed=None,
                   columnsize=True,
                   tokendata=False,
                   locale=False,
                   detail='full')
search_table.config_rank()='bm25()'
search_table.row_count=173278
search_table.tokens_per_column=[743406, 8064731, 3824644]
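
config_rank() shows the current ranking function. As a sketch (assuming the table really is named search in the main database), you could weight name matches more heavily using the standard FTS5 rank configuration option; the three bm25 weights correspond to the name, ingredients, and description columns.

# Sketch: make name matches count 10x in bm25 ranking
connection.execute(
    "INSERT INTO search(search, rank) "
    "VALUES('rank', 'bm25(10.0, 1.0, 1.0)')"
)
# config_rank() should now report the new ranking function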

Content

Use upsert() to add or change existing content and delete() to delete a row. They understand external content tables and will do the operations there, then the triggers will update the search index. row_by_id() gets one or more columns from a row, and also handles external content tables.

# upsert returns the rowid of an insert or change.
rowid = search_table.upsert(
    # you can use positional parameters so this goes to the first column
    "This ends up in the name field",
    # and keywords
    description="This ends up in the description",
)

print(f"{search_table.row_by_id(rowid, 'name')=}")

# modify that row
search_table.upsert(ingredients="some stuff", rowid=rowid)

# And delete our test row
search_table.delete(rowid)
search_table.row_by_id(rowid, 'name')='This ends up in the name field'

Searching with SQL

You need to specify what should be returned, the FTS query, and ORDER BY rank to get the best results first.

sql = """
   SELECT
   -- snippet takes a lot of options!
   snippet(search, -1, '►', '◄', '...', 10) as Snippet
   -- Pass the query like this
   FROM search(?)
   -- Best matches first
   ORDER BY rank
   -- Top 3
   LIMIT 3"""

for query in (
    "cream",
    "orange NOT zest",
    "name:honey",
    "pomegranate OR olives",
):
    print(f"{query=}")
    print(apsw.ext.format_query_table(connection, sql, (query,)))
query='cream'
┌───────────────────────────────────────────────────────────┐
│                          Snippet                          │
├───────────────────────────────────────────────────────────┤
│ Find the recipe for Sour ►Cream◄ Ice ►Cream◄ and other... │
├───────────────────────────────────────────────────────────┤
│ 1 (8 ounce) package ►cream◄ cheese                        │
│ 1 cup white sugar...                                      │
├───────────────────────────────────────────────────────────┤
│ Want a Ice ►Cream◄ and Party?                             │
│ Here's a Ice-►Cream◄...                                   │
└───────────────────────────────────────────────────────────┘

query='orange NOT zest'
┌───────────────────────────────────────────────────┐
│                      Snippet                      │
├───────────────────────────────────────────────────┤
│ ice                                               │
│ 750ml blood ►orange◄ , or ►orange◄ juice          │
│ 750ml soda water...                               │
├───────────────────────────────────────────────────┤
│ 1 tablespoon olive oil                            │
│ 4 (4 ounce) fillets ►orange◄ roughy...            │
├───────────────────────────────────────────────────┤
│ ...►orange◄-flavored liqueur                      │
│ Extra ►orange◄ slices for serving, or ►orange◄... │
└───────────────────────────────────────────────────┘

query='name:honey'
┌────────────────────────────────────────────────────────────────────────┐
│                                Snippet                                 │
│ Wildflower-►Honey◄ Semifreddo with ►Honey◄ Sesame Wafers               │
│ Roast Chicken with Saffron, Hazelnuts, and ►Honey◄                     │
│ Recipe Flash: Ginger-►Honey◄ Chicken WingsGinger-►Honey◄ Chicken Wings │
└────────────────────────────────────────────────────────────────────────┘

query='pomegranate OR olives'
┌─────────────────────────────────────────────────────────────────────────────┐
│                                   Snippet                                   │
├─────────────────────────────────────────────────────────────────────────────┤
│ A new way to serve ►olives◄ - marinated in ►pomegranate◄ juice...           │
├─────────────────────────────────────────────────────────────────────────────┤
│ Purple kale, oranges, feta cheese and ►olives◄ topped with ►pomegranate◄... │
├─────────────────────────────────────────────────────────────────────────────┤
│ ...juice                                                                    │
│ Seeds from a ►pomegranate◄                                                  │
│ 25g dried apricot pieces                                                    │
│ Quarter...                                                                  │
└─────────────────────────────────────────────────────────────────────────────┘

Search method

search() provides a Pythonic API that gives information about each matching row, best matches first.

for row in search_table.search("lemon OR guava"):
    # Note how you see overall query info (it is two phrases) and
    # information about the matched row (how many tokens in each
    # column), and which columns each phrase was found in
    pprint(row)
    # only show the first matching row
    break

# Inspect first matching row
name, description = search_table.row_by_id(
    row.rowid, ("name", "description")
)
print((name, description))
MatchInfo(query_info=QueryInfo(phrases=(('lemon',), ('guava',))),
          rowid=1642796043941632884,
          column_size=[3, 24, 5],
          phrase_columns=[[1], [0, 2]])
("P.L's Guava Jelly", 'Delicious home-made guava jam')

Query correction and suggestion

If the query contains words that don’t exist or are very rare (likely a spelling mistake), then you can offer an alternate query that will probably improve the results.

for query in (
    "jalapno",
    # query structure is maintained
    "orange AND guice",
    # column names are corrected too
    "nyme:(minced OR oyl NOT peenut)",
    # None is returned if all terms are ok
    "sweet onion",
):
    suggest = search_table.query_suggest(query)
    print(f"{query=} {suggest=}")
query='jalapno' suggest='jalapeño'
query='orange AND guice' suggest='orange juice'
query='nyme:(minced OR oyl NOT peenut)' suggest='name: (minced OR oil NOT peanut)'
query='sweet onion' suggest=None
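
A sketch of a typical “did you mean” flow built on this: search with the suggestion when there is one, otherwise with the original query (the input here is made up).

user_query = "jalapno AND cheese"
suggestion = search_table.query_suggest(user_query)
# fall back to the original query when no suggestion is returned
for row in search_table.search(suggestion or user_query):
    best_match_name = search_table.row_by_id(row.rowid, "name")
    break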

Working with tokens

Document and query text is processed into tokens, with matches found based on those tokens. Tokens are not visible to the user.

Typically they correspond to “words” in the text, but with upper/lower case neutralized, punctuation removed, and marks and accents stripped.

# See what happens with sample text
text = "Don't 2.245e5 Run-Down Déjà 你好世界😂❤️🤣"
# tokenization happens on UTF8
utf8 = text.encode()

# Note offsets into the utf8.  Those could be used to highlight the
# original.
pprint(search_table.tokenize(utf8))

# For our table, the most popular tokens, and what percent of rows
# they are in
print("\nMost popular by row count")
row_count = search_table.row_count
for token, count in search_table.token_doc_frequency():
    print(f"{token:20}{count/row_count:.0%}")

# Most popular tokens, based on total token count
print("\nMost popular by token count")
token_count = search_table.token_count
for token, count in search_table.token_frequency():
    print(f"{token:20}{count/token_count:.0%}")

# Find what text produced a token, by looking at 5 rows.
token = "jalapeno"
text = search_table.text_for_token(token, 5)
print(f"\nText for {token=} is {text}")
[(0, 5, 'dont'),
 (6, 13, '2245e5'),
 (14, 17, 'run'),
 (18, 22, 'down'),
 (23, 29, 'deja'),
 (30, 33, '你'),
 (33, 36, '好'),
 (36, 39, '世'),
 (39, 42, '界'),
 (42, 46, '😂'),
 (46, 52, '❤'),
 (52, 56, '🤣')]

Most popular by row count
1                   92%
2                   80%
and                 72%
a                   59%
cup                 56%
cups                53%
4                   49%
with                47%
3                   47%
the                 43%

Most popular by token count
1                   8%
2                   4%
cups                2%
cup                 2%
and                 2%
1⁄2                 2%
4                   1%
teaspoon            1%
a                   1%
3                   1%

Text for token='jalapeno' is Jalapeño

Key Tokens and More Like

key_tokens() finds tokens represented in a row, but rare in other rows. This is purely statistical and has no idea of the meaning or relationship between tokens.

more_like() is given some rowids, extracts their key tokens, and starts a search with them, excluding the rowids already seen. This lets you provide “infinite scrolling” starting from one or more rows, providing additional similar content.

Both methods let you specify particular columns, or all columns (the default).

# A randomly selected row ...
bbq_rowid = 1642796066805404445
# ... for bbq sauce
print(search_table.row_by_id(bbq_rowid, "name"))

# Note how each token gets a score, with bigger numbers meaning the
# token is more unique
pprint(search_table.key_tokens(bbq_rowid, columns=["name"], limit=3))

# More like based on the ingredients column
for count, match_info in enumerate(
    search_table.more_like([bbq_rowid], columns="ingredients")
):
    # Show the name for each
    print(search_table.row_by_id(match_info.rowid, "name"))
    # We could save each of these rowids and at the end do another
    # more_like including them.  Stop after a few for now.
    if count == 5:
        break
Thick and Sticky Barbecue Sauce Recipe
[(0.0002008032128514056, 'sticky'),
 (0.00015475085112968121, 'barbecue'),
 (3.077870113881194e-05, 'thick')]
Indian Chicken Rub
Berbere
Sarah's Chili Seasoning Mix
Jenga Ribs
Indian Spice Baked Chicken
Valentine's Day Steak Rub
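
A sketch of the “infinite scrolling” idea from the comment above: remember every rowid shown so far and feed them all back into the next more_like call so they are excluded.

seen = [bbq_rowid]
for _ in range(2):  # two more "pages" for this sketch
    page = []
    for count, match_info in enumerate(
        search_table.more_like(seen, columns="ingredients")
    ):
        page.append(match_info.rowid)
        if count == 4:  # five rows per page
            break
    # already seen rows are excluded from the next round
    seen.extend(page)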

Autocomplete

You often want to show results after just a few letters have been typed, before a complete word has been entered. This is done by indexing sequences of a few letters, called ngrams. Ngrams are never shown to the user, although you can see them in the snippets below.

if not connection.table_exists("main", "autocomplete"):
    # create does all the hard work
    autocomplete = apsw.fts5.Table.create(
        connection,
        # The table will be named 'autocomplete'
        "autocomplete",
        # External content table name.  It has to be in the same
        # database.
        content="recipes",
        # We want the same columns as recipes, so pass `None`.
        columns=None,
        # Triggers ensure that changes to the content table
        # are reflected in the search table
        generate_triggers=True,
        # Use APSW recommended tokenization
        tokenize=[
            # simplify tokenizer
            "simplify",
            # removes case distinction
            "casefold",
            "true",
            # removes accents, uses compatibility codepoints
            "strip",
            "true",
            # ngram tokenizer
            "ngram",
            # How big is each sequence?  This is how many letters have
            # to be typed before any match is possible.  Smaller values
            # result in larger indexes.
            "ngrams",
            "3",
        ],
        # There are many more options you can set
    )

else:
    # Already exists so just the name is needed
    autocomplete = apsw.fts5.Table(connection, "autocomplete")

# do some queries against autocomplete index
sql = """
   SELECT
   -- snippet takes a lot of options!
   snippet(autocomplete, -1, '►', '◄', '...', 10) as Snippet
   -- Pass the query like this
   FROM autocomplete(?)
   -- Best matches first
   ORDER BY rank
   -- Top 3
   LIMIT 3"""

for query in (
    "eam",
    "ora",
    "name:ney",
    "emo jui",
    "barbecue",
):
    print(f"{query=}")
    print(apsw.ext.format_query_table(connection, sql, (query,)))
query='eam'
┌───────────────────┐
│      Snippet      │
│ Sour Cr►eam◄ I... │
│ Cr►eam◄less Cr... │
│ Cr►eam◄ cheese... │
└───────────────────┘

query='ora'
┌───────────────────┐
│      Snippet      │
│ Blood ►ora◄nge... │
│ ►Ora◄nge cake ... │
│ ►Ora◄nge Deser... │
└───────────────────┘

query='name:ney'
┌──────────────────────┐
│       Snippet        │
│ Ho►ney◄ed-Mang...    │
│ ...r-Ho►ney◄ Semi... │
│ ...h Ho►ney◄ed-Ma... │
└──────────────────────┘

query='emo jui'
┌────────────────────────┐
│        Snippet         │
│ ... L►emo◄n ►Jui◄ce... │
│ ... l►emo◄n ►jui◄ce... │
│ ... l►emo◄n ►jui◄ce... │
└────────────────────────┘

query='barbecue'
┌───────────────────┐
│      Snippet      │
│ Classic ►Barb◄... │
│ ►Barbecue◄ Por... │
│ ►Barbecue◄d Ba... │
└───────────────────┘

Auxiliary functions

`Auxiliary functions <https://sqlite.org/fts5.html#_auxiliary_functions_>`__ are called for each matching row. They can be used to provide `ranking (sorting) <https://www.sqlite.org/fts5.html#sorting_by_auxiliary_function_results>`__ for determining better matches, or to return information about the match and row such as highlights and snippets.

They are called with FTS5ExtensionApi as the first parameter, followed by any function arguments, and return a value. This example shows all the information available during a query.

def row_match(
    api: apsw.FTS5ExtensionApi, *args: apsw.SQLiteValue
) -> apsw.SQLiteValue:
    print("row_match called with", args)
    # Show what information is available from the api
    print(f"{api.rowid=}")
    print(f"{api.row_count=}")
    print(f"{api.column_count=}")
    for col in range(api.column_count):
        print(f"  {col=} {api.column_size(col)=}")
        print(f"  {col=} {api.column_total_size(col)=}")
    print(f"{api.inst_count=}")
    for inst in range(api.inst_count):
        print(f"  {inst=} {api.inst_tokens(inst)=}")
    print(f"{api.phrases=}")
    for phrase in range(len(api.phrases)):
        # which columns the phrase is found in
        print(f"  {phrase=} {api.phrase_columns(phrase)=}")
        # which token numbers
        print(f"  {phrase=} {api.phrase_locations(phrase)=}")
    # the offsets of phrase 3 in column 2
    print(f"{api.phrase_column_offsets(3, 2)=}")

    # note the text is the utf-8 encoded bytes
    print(f"{api.column_text(0)=}")

    # we can get a tokenization of text, useful if you want to extract
    # the original text, add snippets/highlights etc
    print("Tokenized with UTF-8 offsets")
    pprint(api.tokenize(api.column_text(2), api.column_locale(2)))

    # query_phrase is useful for finding out how common a phrase is.
    counts = [0] * len(api.phrases)
    for phrase in range(len(api.phrases)):
        api.query_phrase(phrase, phrase_count, (phrase, counts))

    for i, phrase in enumerate(api.phrases):
        print(f"Phrase {phrase=} occurs { counts[i]:,} times")

    return 7


# This is used as the callback from query_phrase above.  Note that the
# api instance in this call is different from the one in the function above.
def phrase_count(api: apsw.FTS5ExtensionApi, closure):
    phrase, counts = closure

    # increment counts for this phrase
    counts[phrase] += 1
    if counts[phrase] < 5:
        # Show call info the first 4 times for each phrase
        print(f"phrase_count called {api.rowid=} {api.phrases=}")

    # we could do more sophisticated work such as counting how many
    # times it occurs (api.phrase_locations) or which columns
    # (api.phrase_columns).


connection.register_fts5_function("row_match", row_match)

# A deliberately complex query to make the api interesting
query = (
    """("BoiLed eGGs" OR CinnaMON) OR NEAR (drink Cup, 5) NOT Oran*"""
)

# Make all the code above run.  Note how the table name has to be
# the first parameter to our function in the SQL
connection.execute(
    "SELECT row_match(search, 5, 'hello') FROM search(?) order by rank",
    (query,),
)
row_match called with (5, 'hello')
api.rowid=1642795772696450313
api.row_count=173279
api.column_count=3
  col=0 api.column_size(col)=3
  col=0 api.column_total_size(col)=743413
  col=1 api.column_size(col)=4
  col=1 api.column_total_size(col)=8064731
  col=2 api.column_size(col)=9
  col=2 api.column_total_size(col)=3824650
api.inst_count=2
  inst=0 api.inst_tokens(inst)=('boiled', 'eggs')
  inst=1 api.inst_tokens(inst)=('boiled', 'eggs')
api.phrases=(('boiled', 'eggs'), ('cinnamon',), ('drink',), ('cup',), ('oran',))
  phrase=0 api.phrase_columns(phrase)=(0, 2)
  phrase=0 api.phrase_locations(phrase)=[[1], [], [7]]
  phrase=1 api.phrase_columns(phrase)=()
  phrase=1 api.phrase_locations(phrase)=[[], [], []]
  phrase=2 api.phrase_columns(phrase)=()
  phrase=2 api.phrase_locations(phrase)=[[], [], []]
  phrase=3 api.phrase_columns(phrase)=()
  phrase=3 api.phrase_locations(phrase)=[[], [], []]
  phrase=4 api.phrase_columns(phrase)=()
  phrase=4 api.phrase_locations(phrase)=[[], [], []]
api.phrase_column_offsets(3, 2)=[]
api.column_text(0)=b'Hard Boiled Eggs'
Tokenized with UTF-8 offsets
[(0, 3, 'the'),
 (4, 9, 'right'),
 (10, 13, 'way'),
 (14, 16, 'to'),
 (17, 21, 'cook'),
 (22, 29, 'perfect'),
 (30, 34, 'hard'),
 (35, 41, 'boiled'),
 (42, 46, 'eggs')]
phrase_count called api.rowid=1642795733280177125 api.phrases=(('boiled', 'eggs'),)
phrase_count called api.rowid=1642795734218860660 api.phrases=(('boiled', 'eggs'),)
phrase_count called api.rowid=1642795742364372196 api.phrases=(('boiled', 'eggs'),)
phrase_count called api.rowid=1642795748917697308 api.phrases=(('boiled', 'eggs'),)
phrase_count called api.rowid=1642795722012755793 api.phrases=(('cinnamon',),)
phrase_count called api.rowid=1642795722012755797 api.phrases=(('cinnamon',),)
phrase_count called api.rowid=1642795722012755806 api.phrases=(('cinnamon',),)
phrase_count called api.rowid=1642795722012755828 api.phrases=(('cinnamon',),)
phrase_count called api.rowid=1642795722012755795 api.phrases=(('drink',),)
phrase_count called api.rowid=1642795722012819365 api.phrases=(('drink',),)
phrase_count called api.rowid=1642795725826572496 api.phrases=(('drink',),)
phrase_count called api.rowid=1642795725826588872 api.phrases=(('drink',),)
phrase_count called api.rowid=1642795721317011550 api.phrases=(('cup',),)
phrase_count called api.rowid=1642795721317011555 api.phrases=(('cup',),)
phrase_count called api.rowid=1642795721369682815 api.phrases=(('cup',),)
phrase_count called api.rowid=1642795721369683223 api.phrases=(('cup',),)
phrase_count called api.rowid=1642795722012755799 api.phrases=(('oran',),)
phrase_count called api.rowid=1642795722012755999 api.phrases=(('oran',),)
phrase_count called api.rowid=1642795722012757203 api.phrases=(('oran',),)
phrase_count called api.rowid=1642795722012819341 api.phrases=(('oran',),)
Phrase phrase=('boiled', 'eggs') occurs 321 times
Phrase phrase=('cinnamon',) occurs 12,000 times
Phrase phrase=('drink',) occurs 1,925 times
Phrase phrase=('cup',) occurs 96,655 times
Phrase phrase=('oran',) occurs 8,181 times
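
Auxiliary functions can also be used for ranking. A deliberately simplistic sketch: score each row by how many phrase instances matched and sort by that instead of rank; in practice bm25() is almost always a better choice.

def inst_rank(api: apsw.FTS5ExtensionApi) -> float:
    # more matched instances scores higher
    return float(api.inst_count)


connection.register_fts5_function("inst_rank", inst_rank)

top_names = connection.execute(
    "SELECT name FROM search(?) ORDER BY inst_rank(search) DESC LIMIT 3",
    ("cinnamon OR honey",),
).fetchall()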

Query parsing and manipulation

apsw.fts5query lets you programmatically create, update, and parse queries. There are three forms of query.

# This is the query as accepted by FTS5.
print("query")
print(query)

# That can be parsed into the structure
parsed = apsw.fts5query.parse_query_string(query)
print("\nparsed")
pprint(parsed)

# The parsed form is a little unwieldy to work with so a dict based
# form is available.
as_dict = apsw.fts5query.to_dict(parsed)
print("\nas_dict")
pprint(as_dict)

# Make some changes - delete the first query
del as_dict["queries"][0]

as_dict["queries"].append(
    {
        # add a columnfilter
        "@": "COLUMNFILTER",
        "filter": "include",
        "columns": ["name", "description"],
        # The sub queries are just strings.   The module knows what
        # you mean and will convert them into AND
        "query": ["some thing blue", "sunshine"],
    }
)
print("\nmodified as_dict")
pprint(as_dict)

# Turn it into parsed form
parsed = apsw.fts5query.from_dict(as_dict)
print("\nnew parsed")
pprint(parsed)

# Turn the parsed form back into a query string
query = apsw.fts5query.to_query_string(parsed)
print("\nnew query")
print(query)
query
("BoiLed eGGs" OR CinnaMON) OR NEAR (drink Cup, 5) NOT Oran*

parsed
OR(queries=[OR(queries=[PHRASE(phrase='BoiLed eGGs',
                               initial=False,
                               prefix=False,
                               plus=None),
                        PHRASE(phrase='CinnaMON',
                               initial=False,
                               prefix=False,
                               plus=None)]),
            NOT(match=NEAR(phrases=[PHRASE(phrase='drink',
                                           initial=False,
                                           prefix=False,
                                           plus=None),
                                    PHRASE(phrase='Cup',
                                           initial=False,
                                           prefix=False,
                                           plus=None)],
                           distance=5),
                no_match=PHRASE(phrase='Oran',
                                initial=False,
                                prefix=True,
                                plus=None))])

as_dict
{'@': 'OR',
 'queries': [{'@': 'OR',
              'queries': [{'@': 'PHRASE', 'phrase': 'BoiLed eGGs'},
                          {'@': 'PHRASE', 'phrase': 'CinnaMON'}]},
             {'@': 'NOT',
              'match': {'@': 'NEAR',
                        'distance': 5,
                        'phrases': [{'@': 'PHRASE', 'phrase': 'drink'},
                                    {'@': 'PHRASE', 'phrase': 'Cup'}]},
              'no_match': {'@': 'PHRASE', 'phrase': 'Oran', 'prefix': True}}]}

modified as_dict
{'@': 'OR',
 'queries': [{'@': 'NOT',
              'match': {'@': 'NEAR',
                        'distance': 5,
                        'phrases': [{'@': 'PHRASE', 'phrase': 'drink'},
                                    {'@': 'PHRASE', 'phrase': 'Cup'}]},
              'no_match': {'@': 'PHRASE', 'phrase': 'Oran', 'prefix': True}},
             {'@': 'COLUMNFILTER',
              'columns': ['name', 'description'],
              'filter': 'include',
              'query': ['some thing blue', 'sunshine']}]}

new parsed
OR(queries=[NOT(match=NEAR(phrases=[PHRASE(phrase='drink',
                                           initial=False,
                                           prefix=False,
                                           plus=None),
                                    PHRASE(phrase='Cup',
                                           initial=False,
                                           prefix=False,
                                           plus=None)],
                           distance=5),
                no_match=PHRASE(phrase='Oran',
                                initial=False,
                                prefix=True,
                                plus=None)),
            COLUMNFILTER(columns=['name', 'description'],
                         filter='include',
                         query=AND(queries=[PHRASE(phrase='some thing blue',
                                                   initial=False,
                                                   prefix=False,
                                                   plus=None),
                                            PHRASE(phrase='sunshine',
                                                   initial=False,
                                                   prefix=False,
                                                   plus=None)]))])

new query
NEAR(drink Cup, 5) NOT Oran* OR {name description}: ("some thing blue" sunshine)
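
The dict form is also convenient for building queries from scratch, for example turning a list of user supplied words into an OR query (a sketch).

words = ["lemon", "guava"]
built = apsw.fts5query.from_dict(
    {
        "@": "OR",
        "queries": [{"@": "PHRASE", "phrase": word} for word in words],
    }
)
# produces a query string usable with FROM search(?)
or_query = apsw.fts5query.to_query_string(built)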

Tokenizers

Tokenizers convert text into the tokens used to find matching rows. They work on UTF8 input, providing the beginning and end offsets for each token. They can also provide more than one token at the same position, for example if you wanted both first and 1st.

Tokenizers and their arguments are specified as the ‘tokenize’ option when creating an FTS5 table. You can also call them directly from a connection. APSW provides several tokenizers, but let's look at unicode61, the default SQLite tokenizer.

tokenizer = connection.fts5_tokenizer("unicode61")

test_text = """🤦🏼‍♂️ v1.2 Grey Ⅲ ColOUR! Don't jump -  🇫🇮你好世界 Straße
    हैलो वर्ल्ड Déjà vu Résumé SQLITE_ERROR"""

# Call the tokenizer to do a tokenization, supplying the reason
# and no locale.
pprint(
    tokenizer(
        test_text.encode("utf8"), apsw.FTS5_TOKENIZE_DOCUMENT, None
    )
)


# Make a function to show output
def show_tokens(
    text,
    tokenizer_name,
    tokenizer_args=None,
    reason=apsw.FTS5_TOKENIZE_DOCUMENT,
):
    print(f"\n{text=:s}")
    print(f"{tokenizer_name=} {tokenizer_args=}")

    tokenizer = connection.fts5_tokenizer(
        tokenizer_name, tokenizer_args
    )
    # exclude the offsets since they clutter the output
    pprint(
        tokenizer(
            text.encode("utf8"),
            reason,
            None,
            include_offsets=False,
        )
    )
    print()


show_tokens("v1.2 SQLITE_ERROR", "unicode61")

# We want the version number and symbol kept together, so use
# the tokenchars parameter.  Many tokenizers take parameters.
show_tokens("v1.2 SQLITE_ERROR", "unicode61", ["tokenchars", "_."])

# Tokenizers can also be chained together.  The porter tokenizer takes
# existing tokens and turns them into a base.  The rightmost tokenizer
# generates tokens, while ones to the left transform them.  This ensures
# you can search for variations of words without having to get them
# exactly right.
show_tokens(
    "Likes liked likely liking cat cats colour color",
    "porter",
    ["unicode61", "tokenchars", "_"],
)
[(0, 8, '🤦🏼'),
 (18, 20, 'v1'),
 (21, 22, '2'),
 (23, 27, 'grey'),
 (28, 31, 'ⅲ'),
 (32, 38, 'colour'),
 (40, 43, 'don'),
 (44, 45, 't'),
 (46, 50, 'jump'),
 (62, 74, '你好世界'),
 (75, 82, 'straße'),
 (87, 90, 'ह'),
 (93, 96, 'ल'),
 (100, 106, 'वर'),
 (109, 112, 'ल'),
 (115, 118, 'ड'),
 (119, 125, 'deja'),
 (126, 128, 'vu'),
 (129, 137, 'resume'),
 (138, 144, 'sqlite'),
 (145, 150, 'error')]

text=v1.2 SQLITE_ERROR
tokenizer_name='unicode61' tokenizer_args=None
[('v1',), ('2',), ('sqlite',), ('error',)]


text=v1.2 SQLITE_ERROR
tokenizer_name='unicode61' tokenizer_args=['tokenchars', '_.']
[('v1.2',), ('sqlite_error',)]


text=Likes liked likely liking cat cats colour color
tokenizer_name='porter' tokenizer_args=['unicode61', 'tokenchars', '_']
[('like',),
 ('like',),
 ('like',),
 ('like',),
 ('cat',),
 ('cat',),
 ('colour',),
 ('color',)]

apsw.fts5.UnicodeWordsTokenizer

apsw.fts5.UnicodeWordsTokenizer() does word segmentation using the Unicode TR29 algorithm, which works well across languages. It understands when punctuation is part of a word (like in don't), that numbers include punctuation, as well as emoji and regional indicators.

connection.register_fts5_tokenizer(
    "unicodewords", apsw.fts5.UnicodeWordsTokenizer
)

# unicode61 doesn't understand grapheme clusters or
# punctuation in words, or other languages
show_tokens(test_text, "unicode61")

# unicodewords has you covered
show_tokens(test_text, "unicodewords")
text=🤦🏼‍♂️ v1.2 Grey Ⅲ ColOUR! Don't jump -  🇫🇮你好世界 Straße
    हैलो वर्ल्ड Déjà vu Résumé SQLITE_ERROR
tokenizer_name='unicode61' tokenizer_args=None
[('🤦🏼',),
 ('v1',),
 ('2',),
 ('grey',),
 ('ⅲ',),
 ('colour',),
 ('don',),
 ('t',),
 ('jump',),
 ('你好世界',),
 ('straße',),
 ('ह',),
 ('ल',),
 ('वर',),
 ('ल',),
 ('ड',),
 ('deja',),
 ('vu',),
 ('resume',),
 ('sqlite',),
 ('error',)]


text=🤦🏼‍♂️ v1.2 Grey Ⅲ ColOUR! Don't jump -  🇫🇮你好世界 Straße
    हैलो वर्ल्ड Déjà vu Résumé SQLITE_ERROR
tokenizer_name='unicodewords' tokenizer_args=None
[('🤦🏼\u200d♂️',),
 ('v1.2',),
 ('Grey',),
 ('Ⅲ',),
 ('ColOUR',),
 ("Don't",),
 ('jump',),
 ('🇫🇮',),
 ('你',),
 ('好',),
 ('世',),
 ('界',),
 ('Straße',),
 ('हैलो',),
 ('वर्ल्ड',),
 ('Déjà',),
 ('vu',),
 ('Résumé',),
 ('SQLITE_ERROR',)]

apsw.fts5.SimplifyTokenizer

You may have noticed that there are accents (diacritics) and mixed case in the tokens in the example above. It is convenient to remove those. The apsw.fts5.SimplifyTokenizer() can neutralize case and remove accents and marks, so you can use it to filter your own or other tokenizers.

connection.register_fts5_tokenizer(
    "simplify", apsw.fts5.SimplifyTokenizer
)

show_tokens(
    test_text,
    "simplify",
    [
        # casefold is for case insensitive comparisons
        "casefold",
        "1",
        # strip decomposes codepoints to remove accents
        # and marks, and uses compatibility codepoints,
        # an example is Roman numeral Ⅲ becomes III,
        "strip",
        "1",
        # Use unicodewords to get the tokens to simplify
        "unicodewords",
    ],
)
text=🤦🏼‍♂️ v1.2 Grey Ⅲ ColOUR! Don't jump -  🇫🇮你好世界 Straße
    हैलो वर्ल्ड Déjà vu Résumé SQLITE_ERROR
tokenizer_name='simplify' tokenizer_args=['casefold', '1', 'strip', '1', 'unicodewords']
[('🤦♂',),
 ('v12',),
 ('grey',),
 ('iii',),
 ('colour',),
 ('dont',),
 ('jump',),
 ('🇫🇮',),
 ('你',),
 ('好',),
 ('世',),
 ('界',),
 ('strasse',),
 ('हल',),
 ('वरलड',),
 ('deja',),
 ('vu',),
 ('resume',),
 ('sqliteerror',)]

Your own tokenizer

We will define our own tokenizer to be the same as above, but without all those parameters in the table definition. A tokenizer takes the connection and a list of string parameters.

def my_tokenizer(
    con: apsw.Connection, params: list[str]
) -> apsw.FTS5Tokenizer:
    # we take no params
    if params:
        raise ValueError("Expected no parameters")

    # Same as above, but no longer in our SQL
    return con.fts5_tokenizer(
        "simplify",
        ["casefold", "1", "strip", "1", "unicodewords"],
    )


connection.register_fts5_tokenizer("mine", my_tokenizer)

# Produces same result as above
show_tokens(test_text, "mine")
text=🤦🏼‍♂️ v1.2 Grey Ⅲ ColOUR! Don't jump -  🇫🇮你好世界 Straße
    हैलो वर्ल्ड Déjà vu Résumé SQLITE_ERROR
tokenizer_name='mine' tokenizer_args=None
[('🤦♂',),
 ('v12',),
 ('grey',),
 ('iii',),
 ('colour',),
 ('dont',),
 ('jump',),
 ('🇫🇮',),
 ('你',),
 ('好',),
 ('世',),
 ('界',),
 ('strasse',),
 ('हल',),
 ('वरलड',),
 ('deja',),
 ('vu',),
 ('resume',),
 ('sqliteerror',)]

Your own tokenizer, part 2

We’ll make one entirely our own, not building on any existing tokenizer. Tokenizers operate on UTF8 and byte offsets. The apsw.fts5.StringTokenizer() decorator lets you operate on str instead and handles the mapping. apsw.fts5.parse_tokenizer_args() makes it easy to handle parameters.

@apsw.fts5.StringTokenizer
def atokenizer(
    con: apsw.Connection, params: list[str]
) -> apsw.FTS5Tokenizer:
    # What we accept
    ta = apsw.fts5.TokenizerArgument
    spec = {
        # two choices
        "big": ta(choices=("ship", "plane")),
        # default value only
        "small": "hello",
        # conversion
        "block": ta(default=2, convertor=int),
    }

    options = apsw.fts5.parse_tokenizer_args(spec, con, params)

    # show what options we got
    print(f"{options=}")

    def tokenize(text: str, reason: int, locale: str | None):
        # See apsw.fts5.tokenize_reasons for mapping from text to number
        print(f"{reason=}")
        # if a locale table and value was used
        print(f"{locale=}")
        # break string in groups of 'block' characters
        for start in range(0, len(text), options["block"]):
            token = text[start : start + options["block"]]
            yield start, start + len(token), token

    return tokenize


connection.register_fts5_tokenizer("atokenizer", atokenizer)

# show full return - note offsets are utf8 bytes
tok = connection.fts5_tokenizer(
    "atokenizer", ["big", "plane", "block", "5"]
)
pprint(tok(test_text.encode("utf8"), apsw.FTS5_TOKENIZE_AUX, None))
options={'big': 'plane', 'block': 5, 'small': 'hello'}
reason=8
locale=None
[(0, 17, '🤦🏼\u200d♂️'),
 (17, 22, ' v1.2'),
 (22, 27, ' Grey'),
 (27, 34, ' Ⅲ Co'),
 (34, 39, 'lOUR!'),
 (39, 44, " Don'"),
 (44, 49, 't jum'),
 (49, 54, 'p -  '),
 (54, 71, '🇫🇮你好世'),
 (71, 78, '界 Str'),
 (78, 84, 'aße\n '),
 (84, 93, '   है'),
 (93, 106, 'लो वर'),
 (106, 119, '्ल्ड '),
 (119, 126, 'Déjà '),
 (126, 132, 'vu Ré'),
 (132, 138, 'sumé '),
 (138, 143, 'SQLIT'),
 (143, 148, 'E_ERR'),
 (148, 150, 'OR')]

apsw.fts5.RegexTokenizer

We can use regular expressions. Unlike the other tokenizers, the pattern is not passed as a SQL level parameter because there would be a confusing amount of backslashes, square brackets, and other quoting going on.

pattern = r"\d+"  # digits
flags = re.ASCII  # only ascii recognised
tokenizer = functools.partial(
    apsw.fts5.RegexTokenizer, pattern=pattern, flags=flags
)
connection.register_fts5_tokenizer("my_regex", tokenizer)

# ASCII/Arabic and non-ascii digits
text = "text2abc 3.14 tamil ௦௧௨௩௪ bengali ০১২৩৪ arabic01234"

show_tokens(text, "my_regex")
text=text2abc 3.14 tamil ௦௧௨௩௪ bengali ০১২৩৪ arabic01234
tokenizer_name='my_regex' tokenizer_args=None
[('2',), ('3',), ('14',), ('01234',)]

apsw.fts5.RegexPreTokenizer

Use regular expressions to extract tokens of interest such as identifiers, and then use a different tokenizer on the text between the regular expression matches. Contrast this with RegexTokenizer above, which ignores text that doesn't match the pattern.

# For this example our identifiers are two digits slash two letters
text = "73/RS is larger than 65/ST"

# See what unicodewords does
show_tokens(text, "unicodewords")

# Setup RegexPreTokenizer
pattern = r"[0-9][0-9]/[A-Z][A-Z]"
tokenizer = functools.partial(
    apsw.fts5.RegexPreTokenizer, pattern=pattern
)
connection.register_fts5_tokenizer("myids", tokenizer)

# extract myids, leaving the other text to unicodewords
show_tokens(text, "myids", ["unicodewords"])
text=73/RS is larger than 65/ST
tokenizer_name='unicodewords' tokenizer_args=None
[('73',), ('RS',), ('is',), ('larger',), ('than',), ('65',), ('ST',)]


text=73/RS is larger than 65/ST
tokenizer_name='myids' tokenizer_args=['unicodewords']
[('73/RS',), ('is',), ('larger',), ('than',), ('65/ST',)]

HTML tokenizer

HTMLTokenizer() extracts the text from HTML and passes it on to another tokenizer.

text = """<title>Big&amp;Small</title><p>Hello <b>world</b>.  B&Agrave;&#97;"""

show_tokens(text, "html", ["unicodewords"])
text=<title>Big&amp;Small</title><p>Hello <b>world</b>.  B&Agrave;&#97;
tokenizer_name='html' tokenizer_args=['unicodewords']
[('Big',), ('Small',), ('Hello',), ('world',), ('BÀa',)]

JSON tokenizer

JSONTokenizer() extracts strings from JSON for processing by another tokenizer.

import json

data = {
    "name": "A car🚗",
    "items": ["one", 2, "three", {"four": "five"}],
}

text = json.dumps(data, indent=True)

# Keys can be extracted
show_tokens(text, "json", ["include_keys", "1", "unicodewords"])

# or ignored
show_tokens(text, "json", ["include_keys", "0", "unicodewords"])
text={
 "name": "A car\ud83d\ude97",
 "items": [
  "one",
  2,
  "three",
  {
   "four": "five"
  }
 ]
}
tokenizer_name='json' tokenizer_args=['include_keys', '1', 'unicodewords']
[('name',),
 ('A',),
 ('car',),
 ('🚗',),
 ('items',),
 ('one',),
 ('three',),
 ('four',),
 ('five',)]


text={
 "name": "A car\ud83d\ude97",
 "items": [
  "one",
  2,
  "three",
  {
   "four": "five"
  }
 ]
}
tokenizer_name='json' tokenizer_args=['include_keys', '0', 'unicodewords']
[('A',), ('car',), ('🚗',), ('one',), ('three',), ('five',)]

Synonym tokenizer

SynonymTokenizer() is useful for adding colocated tokens (synonyms).

text = "one Colour first big dog"

# We use a dict
synonyms = {
    "colour": "color",
    "first": "1st",
    "dog": ["puppy", "canine", "k9"],
}

tokenizer = apsw.fts5.SynonymTokenizer(synonyms.get)

connection.register_fts5_tokenizer(
    "synonyms",
    tokenizer,
)

# It is to the left of simplify so we don't have to match all the
# different cases. By default the synonyms tokenizer only applies when
# tokenizing queries.
show_tokens(
    text,
    "synonyms",
    ["simplify", "casefold", "1", "unicodewords"],
    reason=apsw.FTS5_TOKENIZE_QUERY,
)
text=one Colour first big dog
tokenizer_name='synonyms' tokenizer_args=['simplify', 'casefold', '1', 'unicodewords']
[('one',),
 ('colour', 'color'),
 ('first', '1st'),
 ('big',),
 ('dog', 'puppy', 'canine', 'k9')]

Stopwords tokenizer

StopWordsTokenizer() removes tokens from the stream. Tokens that appear in almost every row aren’t useful for finding good matches, and increase the size of the index.

@apsw.fts5.StopWordsTokenizer
def ignore(token: str) -> bool:
    # Return True if the token should be ignored.  These are common
    # English tokens that appear in almost all rows of English
    # content.
    return token in {
        "to",
        "the",
        "and",
        "for",
        "you",
        "of",
        "on",
        "is",
    }


text = "On the green hills and blue skies you see forever"

connection.register_fts5_tokenizer("ignore", ignore)

# It is to the left of simplify so we don't have to match all the
# different cases.
show_tokens(
    text, "ignore", ["simplify", "casefold", "1", "unicodewords"]
)
text=On the green hills and blue skies you see forever
tokenizer_name='ignore' tokenizer_args=['simplify', 'casefold', '1', 'unicodewords']
[('green',), ('hills',), ('blue',), ('skies',), ('see',), ('forever',)]

Transform tokenizer

TransformTokenizer() modifies tokens.

@apsw.fts5.TransformTokenizer
def transform(token: str) -> str:
    # we remove trailing 's' and 'ing'
    if token.endswith("s"):
        return token[:-1]
    if token.endswith("ing"):
        return token[:-3]
    return token


text = "Chickens playing towards talking plays talks"

connection.register_fts5_tokenizer("transform", transform)

# It is to the left of simplify so we don't have to match all the
# different cases.
show_tokens(
    text, "transform", ["simplify", "casefold", "1", "unicodewords"]
)
text=Chickens playing towards talking plays talks
tokenizer_name='transform' tokenizer_args=['simplify', 'casefold', '1', 'unicodewords']
[('chicken',), ('play',), ('toward',), ('talk',), ('play',), ('talk',)]
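
These wrapper tokenizers chain, so as a sketch you could combine the ones registered above: transform wraps ignore (the stop words filter), which wraps simplify, which gets its tokens from unicodewords.

combined = connection.fts5_tokenizer(
    "transform",
    ["ignore", "simplify", "casefold", "1", "unicodewords"],
)
# unicodewords generates the tokens, then each wrapper to its left
# modifies the stream in turn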

Cleanup

We can now close the connection, but it is optional.

connection.close()