Full Text Search Example/Tour
You can use FTS5 from normal SQL as documented by SQLite. This example shows the APSW specific functionality and extras.
#!/usr/bin/env python3
# This code uses Python's optional typing annotations. You can
# ignore them and do not need to use them. If you do use them
# then you must include this future annotations line first.
from __future__ import annotations
from typing import Optional, Iterator, Any
from pprint import pprint
import re
import functools
import apsw
import apsw.ext
# The three FTS5 specific modules
import apsw.fts5
import apsw.fts5aux
import apsw.fts5query
Is FTS5 available?
FTS5 is included as part of the SQLite library (usually).
print("FTS5 available:", "ENABLE_FTS5" in apsw.compile_options)
FTS5 available: True
Content setup
The connection to use. The database has a table with recipes.
connection = apsw.Connection("recipes.db")
# The content
print(
connection.execute(
"SELECT sql FROM sqlite_schema WHERE name='recipes'"
).get
)
print(connection.execute("SELECT COUNT(*) FROM recipes").get)
CREATE TABLE recipes(name, ingredients, description)
173278
Create/open search table
Create a table containing the search index using recipes as an external content table.
if not connection.table_exists("main", "search"):
# create does all the hard work
search_table = apsw.fts5.Table.create(
connection,
# The table will be named 'search'
"search",
# External content table name. It has to be in the same
# database.
content="recipes",
    # We want the same columns as recipes, so pass `None`.
columns=None,
# Triggers ensure that changes to the content table
# are reflected in the search table
generate_triggers=True,
# Use APSW recommended tokenization
tokenize=[
# simplify tokenizer
"simplify",
# removes case distinction
"casefold",
"true",
# removes accents, uses compatibility codepoints
"strip",
"true",
# unicode algorithm for word boundaries tokenizer
"unicodewords",
],
# There are many more options you can set
)
else:
# Already exists so just the name is needed
search_table = apsw.fts5.Table(connection, "search")
# You should use this to get the table name when formatting SQL
# queries as they can't use bindings. It will correctly quote the
# schema (attached database name) and the table name no matter what
# characters, spaces etc are used.
print("quoted name", search_table.quoted_table_name)
quoted name "main"."search"
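As a minimal sketch of the idea, the quoted name can be dropped straight into SQL you format yourself, such as a plain count over the search table.
# A sketch: format the quoted name into the SQL yourself, since table
# names can't be passed as bindings
print(
    connection.execute(
        f"SELECT COUNT(*) FROM {search_table.quoted_table_name}"
    ).get
)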
Table structure and statistics
Examine the structure, options, and statistics.
pprint(search_table.structure)
# rank is one of several options you can read or change
print(f"{search_table.config_rank()=}")
# some statistics
print(f"{search_table.row_count=}")
print(f"{search_table.tokens_per_column=}")
FTS5TableStructure(name='search',
columns=('name', 'ingredients', 'description'),
unindexed=set(),
tokenize=('simplify',
'casefold',
'true',
'strip',
'true',
'unicodewords'),
prefix=set(),
content='recipes',
content_rowid='_ROWID_',
contentless_delete=None,
contentless_unindexed=None,
columnsize=True,
tokendata=False,
locale=False,
detail='full')
search_table.config_rank()='bm25()'
search_table.row_count=173278
search_table.tokens_per_column=[743406, 8064731, 3824644]
Content
Use upsert() to add or change existing content, and delete() to delete a row. They understand
external content tables and will do the operations there, then the triggers will update the
search index. row_by_id() gets one or more columns from a row, and also handles external
content tables.
# upsert returns the rowid of an insert or change.
rowid = search_table.upsert(
# you can use positional parameters so this goes to the first column
"This ends up in the name field",
# and keywords
description="This ends up in the description",
)
print(f"{search_table.row_by_id(rowid, 'name')=}")
# modify that row
search_table.upsert(ingredients="some stuff", rowid=rowid)
# And delete our test row
search_table.delete(rowid)
search_table.row_by_id(rowid, 'name')='This ends up in the name field'
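As a hedged sketch, the generated triggers mean that changes made directly to the recipes table with ordinary SQL also show up in the search index, with no extra work needed.
# A sketch relying on the generated triggers: insert into the content
# table directly and the search index sees it
connection.execute(
    "INSERT INTO recipes VALUES('trigger test', 'nothing', 'nothing')"
)
rowid = connection.last_insert_rowid()
print(search_table.row_by_id(rowid, "name"))
# remove the test row from both the content table and the index
search_table.delete(rowid)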
Searching with SQL
You need to specify what should be returned, the FTS query, and ORDER BY rank to get the best matches first.
sql = """
SELECT
-- snippet takes a lot of options!
snippet(search, -1, '►', '◄', '...', 10) as Snippet
-- Pass the query like this
FROM search(?)
-- Best matches first
ORDER BY rank
-- Top 3
LIMIT 3"""
for query in (
"cream",
"orange NOT zest",
"name:honey",
"pomegranate OR olives",
):
print(f"{query=}")
print(apsw.ext.format_query_table(connection, sql, (query,)))
query='cream'
┌───────────────────────────────────────────────────────────┐
│ Snippet │
├───────────────────────────────────────────────────────────┤
│ Find the recipe for Sour ►Cream◄ Ice ►Cream◄ and other... │
├───────────────────────────────────────────────────────────┤
│ 1 (8 ounce) package ►cream◄ cheese │
│ 1 cup white sugar... │
├───────────────────────────────────────────────────────────┤
│ Want a Ice ►Cream◄ and Party? │
│ Here's a Ice-►Cream◄... │
└───────────────────────────────────────────────────────────┘
query='orange NOT zest'
┌───────────────────────────────────────────────────┐
│ Snippet │
├───────────────────────────────────────────────────┤
│ ice │
│ 750ml blood ►orange◄ , or ►orange◄ juice │
│ 750ml soda water... │
├───────────────────────────────────────────────────┤
│ 1 tablespoon olive oil │
│ 4 (4 ounce) fillets ►orange◄ roughy... │
├───────────────────────────────────────────────────┤
│ ...►orange◄-flavored liqueur │
│ Extra ►orange◄ slices for serving, or ►orange◄... │
└───────────────────────────────────────────────────┘
query='name:honey'
┌────────────────────────────────────────────────────────────────────────┐
│ Snippet │
│ Wildflower-►Honey◄ Semifreddo with ►Honey◄ Sesame Wafers │
│ Roast Chicken with Saffron, Hazelnuts, and ►Honey◄ │
│ Recipe Flash: Ginger-►Honey◄ Chicken WingsGinger-►Honey◄ Chicken Wings │
└────────────────────────────────────────────────────────────────────────┘
query='pomegranate OR olives'
┌─────────────────────────────────────────────────────────────────────────────┐
│ Snippet │
├─────────────────────────────────────────────────────────────────────────────┤
│ A new way to serve ►olives◄ - marinated in ►pomegranate◄ juice... │
├─────────────────────────────────────────────────────────────────────────────┤
│ Purple kale, oranges, feta cheese and ►olives◄ topped with ►pomegranate◄... │
├─────────────────────────────────────────────────────────────────────────────┤
│ ...juice │
│ Seeds from a ►pomegranate◄ │
│ 25g dried apricot pieces │
│ Quarter... │
└─────────────────────────────────────────────────────────────────────────────┘
Search method
search() provides a Pythonic API that gives information about each matching row, best matches first.
for row in search_table.search("lemon OR guava"):
# Note how you see overall query info (it is two phrases) and
# information about the matched row (how many tokens in each
# column), and which columns each phrase was found in
pprint(row)
# only show the first matching row
break
# Inspect first matching row
name, description = search_table.row_by_id(
row.rowid, ("name", "description")
)
print((name, description))
MatchInfo(query_info=QueryInfo(phrases=(('lemon',), ('guava',))),
rowid=1642796043941632884,
column_size=[3, 24, 5],
phrase_columns=[[1], [0, 2]])
("P.L's Guava Jelly", 'Delicious home-made guava jam')
Query correction and suggestion
If the query contains words that don’t exist in the index or are very rare (likely a spelling difference), then you can provide an alternate query that will probably improve the results.
for query in (
"jalapno",
# query structure is maintained
"orange AND guice",
# column names are corrected too
"nyme:(minced OR oyl NOT peenut)",
# None is returned if all terms are ok
"sweet onion",
):
suggest = search_table.query_suggest(query)
print(f"{query=} {suggest=}")
query='jalapno' suggest='jalapeño'
query='orange AND guice' suggest='orange juice'
query='nyme:(minced OR oyl NOT peenut)' suggest='name: (minced OR oil NOT peanut)'
query='sweet onion' suggest=None
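A small sketch of how you might apply a suggestion: fall back to the original query when no suggestion is returned.
# A sketch: use the suggestion if there is one, otherwise the original
query = "orange AND guice"
suggest = search_table.query_suggest(query)
for row in search_table.search(suggest or query):
    print(search_table.row_by_id(row.rowid, "name"))
    # just show the best match
    break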
Working with tokens
Document and query text is processed into tokens, with matches found based on those tokens. Tokens are not visible to the user.
Typically they correspond to “words” in the text, but with upper/lower case neutralized, punctuation removed, and marks and accents removed.
# See what happens with sample text
text = "Don't 2.245e5 Run-Down Déjà 你好世界😂❤️🤣"
# tokenization happens on UTF8
utf8 = text.encode()
# Note offsets into the utf8. Those could be used to highlight the
# original.
pprint(search_table.tokenize(utf8))
# For our table, the most popular tokens, and what percent of rows
# they are in
print("\nMost popular by row count")
row_count = search_table.row_count
for token, count in search_table.token_doc_frequency():
print(f"{token:20}{count/row_count:.0%}")
# Most popular tokens, based on total token count
print("\nMost popular by token count")
token_count = search_table.token_count
for token, count in search_table.token_frequency():
print(f"{token:20}{count/token_count:.0%}")
# Find what text produced a token, by looking at 5 rows.
token = "jalapeno"
text = search_table.text_for_token(token, 5)
print(f"\nText for {token=} is {text}")
[(0, 5, 'dont'),
(6, 13, '2245e5'),
(14, 17, 'run'),
(18, 22, 'down'),
(23, 29, 'deja'),
(30, 33, '你'),
(33, 36, '好'),
(36, 39, '世'),
(39, 42, '界'),
(42, 46, '😂'),
(46, 52, '❤'),
(52, 56, '🤣')]
Most popular by row count
1 92%
2 80%
and 72%
a 59%
cup 56%
cups 53%
4 49%
with 47%
3 47%
the 43%
Most popular by token count
1 8%
2 4%
cups 2%
cup 2%
and 2%
1⁄2 2%
4 1%
teaspoon 1%
a 1%
3 1%
Text for token='jalapeno' is Jalapeño
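As a short sketch, the byte offsets let you recover exactly which part of the original text produced each token, which is how you could do your own highlighting. Note that a single position may carry more than one token.
# A sketch: map each token back to the original text using the
# UTF-8 byte offsets
for start, end, *tokens in search_table.tokenize(utf8):
    print(tokens, "came from", utf8[start:end].decode())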
Key Tokens and More Like
key_tokens() finds tokens represented in a row, but rare in other rows. This is purely
statistical and has no idea of the meaning or relationship between tokens.
more_like() is given some rowids, extracts their key tokens, and starts a search with them,
excluding the rowids already seen. This lets you provide “infinite scrolling” starting from
one or more rows, showing additional similar content.
Both methods let you specify particular columns, or all columns (the default).
# A randomly selected row ...
bbq_rowid = 1642796066805404445
# ... for bbq sauce
print(search_table.row_by_id(bbq_rowid, "name"))
# Note how each token gets a score, with bigger numbers meaning the
# token is more unique
pprint(search_table.key_tokens(bbq_rowid, columns=["name"], limit=3))
# More like based on the ingredients column
for count, match_info in enumerate(
search_table.more_like([bbq_rowid], columns="ingredients")
):
# Show the name for each
print(search_table.row_by_id(match_info.rowid, "name"))
# We could save each of these rowids and at the end do another
# more_like including them. Stop after a few for now.
if count == 5:
break
Thick and Sticky Barbecue Sauce Recipe
[(0.0002008032128514056, 'sticky'),
(0.00015475085112968121, 'barbecue'),
(3.077870113881194e-05, 'thick')]
Indian Chicken Rub
Berbere
Sarah's Chili Seasoning Mix
Jenga Ribs
Indian Spice Baked Chicken
Valentine's Day Steak Rub
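A hedged sketch of the “infinite scrolling” pattern described above: keep track of the rowids already shown and pass them all to the next more_like call.
# A sketch: each batch excludes the rowids we have already seen
seen = [bbq_rowid]
for batch in range(2):
    print(f"batch {batch}")
    for count, match_info in enumerate(
        search_table.more_like(seen, columns="ingredients")
    ):
        print("   ", search_table.row_by_id(match_info.rowid, "name"))
        seen.append(match_info.rowid)
        if count == 2:
            break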
Autocomplete
You often want to show results after just a few letters have been typed, before a complete
word has been entered. This is done by indexing sequences of a few letters, called ngrams.
Ngrams are never shown to the user, although you can see them in the snippets below.
if not connection.table_exists("main", "autocomplete"):
# create does all the hard work
autocomplete = apsw.fts5.Table.create(
connection,
        # The table will be named 'autocomplete'
"autocomplete",
# External content table name. It has to be in the same
# database.
content="recipes",
    # We want the same columns as recipes, so pass `None`.
columns=None,
# Triggers ensure that changes to the content table
# are reflected in the search table
generate_triggers=True,
# Use APSW recommended tokenization
tokenize=[
# simplify tokenizer
"simplify",
# removes case distinction
"casefold",
"true",
# removes accents, uses compatibility codepoints
"strip",
"true",
# ngram tokenizer
"ngram",
# How big is each sequence? This is how many letters have
# to be typed before any match is possible. Smaller values
# result in larger indexes.
"ngrams",
"3",
],
# There are many more options you can set
)
else:
# Already exists so just the name is needed
autocomplete = apsw.fts5.Table(connection, "autocomplete")
# do some queries against autocomplete index
sql = """
SELECT
-- snippet takes a lot of options!
snippet(autocomplete, -1, '►', '◄', '...', 10) as Snippet
-- Pass the query like this
FROM autocomplete(?)
-- Best matches first
ORDER BY rank
-- Top 3
LIMIT 3"""
for query in (
"eam",
"ora",
"name:ney",
"emo jui",
"barbecue",
):
print(f"{query=}")
print(apsw.ext.format_query_table(connection, sql, (query,)))
query='eam'
┌───────────────────┐
│ Snippet │
│ Sour Cr►eam◄ I... │
│ Cr►eam◄less Cr... │
│ Cr►eam◄ cheese... │
└───────────────────┘
query='ora'
┌───────────────────┐
│ Snippet │
│ Blood ►ora◄nge... │
│ ►Ora◄nge cake ... │
│ ►Ora◄nge Deser... │
└───────────────────┘
query='name:ney'
┌──────────────────────┐
│ Snippet │
│ Ho►ney◄ed-Mang... │
│ ...r-Ho►ney◄ Semi... │
│ ...h Ho►ney◄ed-Ma... │
└──────────────────────┘
query='emo jui'
┌────────────────────────┐
│ Snippet │
│ ... L►emo◄n ►Jui◄ce... │
│ ... l►emo◄n ►jui◄ce... │
│ ... l►emo◄n ►jui◄ce... │
└────────────────────────┘
query='barbecue'
┌───────────────────┐
│ Snippet │
│ Classic ►Barb◄... │
│ ►Barbecue◄ Por... │
│ ►Barbecue◄d Ba... │
└───────────────────┘
Auxiliary functions
Auxiliary functions are called for each matching row. They can be used to provide ranking (sorting) for determining better matches, or to return information about the match and row such as highlights and snippets.
They are called with FTS5ExtensionApi as the first parameter, then any function arguments, and return a value.
This example shows all the information available during a query.
def row_match(
api: apsw.FTS5ExtensionApi, *args: apsw.SQLiteValue
) -> apsw.SQLiteValue:
print("row_match called with", args)
# Show what information is available from the api
print(f"{api.rowid=}")
print(f"{api.row_count=}")
print(f"{api.column_count=}")
for col in range(api.column_count):
print(f" {col=} {api.column_size(col)=}")
print(f" {col=} {api.column_total_size(col)=}")
print(f"{api.inst_count=}")
for inst in range(api.inst_count):
print(f" {inst=} {api.inst_tokens(inst)=}")
print(f"{api.phrases=}")
for phrase in range(len(api.phrases)):
# which columns the phrase is found in
print(f" {phrase=} {api.phrase_columns(phrase)=}")
# which token numbers
print(f" {phrase=} {api.phrase_locations(phrase)=}")
# the offsets of phrase 3 in column 2
print(f"{api.phrase_column_offsets(3, 2)=}")
# note the text is the utf-8 encoded bytes
print(f"{api.column_text(0)=}")
# we can get a tokenization of text, useful if you want to extract
# the original text, add snippets/highlights etc
print("Tokenized with UTF-8 offsets")
pprint(api.tokenize(api.column_text(2), api.column_locale(2)))
# query_phrase is useful for finding out how common a phrase is.
counts = [0] * len(api.phrases)
for phrase in range(len(api.phrases)):
api.query_phrase(phrase, phrase_count, (phrase, counts))
for i, phrase in enumerate(api.phrases):
print(f"Phrase {phrase=} occurs { counts[i]:,} times")
return 7
# This is used as the callback from query_phrase above. Note that the
# api instance in this call is different than the above function.
def phrase_count(api: apsw.FTS5ExtensionApi, closure):
phrase, counts = closure
# increment counts for this phrase
counts[phrase] += 1
if counts[phrase] < 5:
# Show call info the first 4 times for each phrase
print(f"phrase_count called {api.rowid=} {api.phrases=}")
# we could do more sophisticated work such as counting how many
# times it occurs (api.phrase_locations) or which columns
# (api.phrase_columns).
connection.register_fts5_function("row_match", row_match)
# A deliberately complex query to make the api interesting
query = (
"""("BoiLed eGGs" OR CinnaMON) OR NEAR (drink Cup, 5) NOT Oran*"""
)
# Run the query, which calls all the code above. Note how the table name
# has to be the first parameter to our function in the SQL
connection.execute(
"SELECT row_match(search, 5, 'hello') FROM search(?) order by rank",
(query,),
)
row_match called with (5, 'hello')
api.rowid=1642795772696450313
api.row_count=173279
api.column_count=3
col=0 api.column_size(col)=3
col=0 api.column_total_size(col)=743413
col=1 api.column_size(col)=4
col=1 api.column_total_size(col)=8064731
col=2 api.column_size(col)=9
col=2 api.column_total_size(col)=3824650
api.inst_count=2
inst=0 api.inst_tokens(inst)=('boiled', 'eggs')
inst=1 api.inst_tokens(inst)=('boiled', 'eggs')
api.phrases=(('boiled', 'eggs'), ('cinnamon',), ('drink',), ('cup',), ('oran',))
phrase=0 api.phrase_columns(phrase)=(0, 2)
phrase=0 api.phrase_locations(phrase)=[[1], [], [7]]
phrase=1 api.phrase_columns(phrase)=()
phrase=1 api.phrase_locations(phrase)=[[], [], []]
phrase=2 api.phrase_columns(phrase)=()
phrase=2 api.phrase_locations(phrase)=[[], [], []]
phrase=3 api.phrase_columns(phrase)=()
phrase=3 api.phrase_locations(phrase)=[[], [], []]
phrase=4 api.phrase_columns(phrase)=()
phrase=4 api.phrase_locations(phrase)=[[], [], []]
api.phrase_column_offsets(3, 2)=[]
api.column_text(0)=b'Hard Boiled Eggs'
Tokenized with UTF-8 offsets
[(0, 3, 'the'),
(4, 9, 'right'),
(10, 13, 'way'),
(14, 16, 'to'),
(17, 21, 'cook'),
(22, 29, 'perfect'),
(30, 34, 'hard'),
(35, 41, 'boiled'),
(42, 46, 'eggs')]
phrase_count called api.rowid=1642795733280177125 api.phrases=(('boiled', 'eggs'),)
phrase_count called api.rowid=1642795734218860660 api.phrases=(('boiled', 'eggs'),)
phrase_count called api.rowid=1642795742364372196 api.phrases=(('boiled', 'eggs'),)
phrase_count called api.rowid=1642795748917697308 api.phrases=(('boiled', 'eggs'),)
phrase_count called api.rowid=1642795722012755793 api.phrases=(('cinnamon',),)
phrase_count called api.rowid=1642795722012755797 api.phrases=(('cinnamon',),)
phrase_count called api.rowid=1642795722012755806 api.phrases=(('cinnamon',),)
phrase_count called api.rowid=1642795722012755828 api.phrases=(('cinnamon',),)
phrase_count called api.rowid=1642795722012755795 api.phrases=(('drink',),)
phrase_count called api.rowid=1642795722012819365 api.phrases=(('drink',),)
phrase_count called api.rowid=1642795725826572496 api.phrases=(('drink',),)
phrase_count called api.rowid=1642795725826588872 api.phrases=(('drink',),)
phrase_count called api.rowid=1642795721317011550 api.phrases=(('cup',),)
phrase_count called api.rowid=1642795721317011555 api.phrases=(('cup',),)
phrase_count called api.rowid=1642795721369682815 api.phrases=(('cup',),)
phrase_count called api.rowid=1642795721369683223 api.phrases=(('cup',),)
phrase_count called api.rowid=1642795722012755799 api.phrases=(('oran',),)
phrase_count called api.rowid=1642795722012755999 api.phrases=(('oran',),)
phrase_count called api.rowid=1642795722012757203 api.phrases=(('oran',),)
phrase_count called api.rowid=1642795722012819341 api.phrases=(('oran',),)
Phrase phrase=('boiled', 'eggs') occurs 321 times
Phrase phrase=('cinnamon',) occurs 12,000 times
Phrase phrase=('drink',) occurs 1,925 times
Phrase phrase=('cup',) occurs 96,655 times
Phrase phrase=('oran',) occurs 8,181 times
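As a sketch of using an auxiliary function for ranking (the name hit_count is our own choice, not part of APSW): count the phrase hits in each row and sort by that instead of the default bm25.
# A sketch: a trivial ranking function counting phrase hits in the row
def hit_count(api: apsw.FTS5ExtensionApi) -> int:
    total = 0
    for phrase in range(len(api.phrases)):
        # phrase_locations gives the matching token positions per column
        total += sum(len(locs) for locs in api.phrase_locations(phrase))
    return total
connection.register_fts5_function("hit_count", hit_count)
# auxiliary functions can be used to order the results
for (name,) in connection.execute(
    "SELECT name FROM search(?) ORDER BY hit_count(search) DESC LIMIT 3",
    ("cinnamon",),
):
    print(name)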
Query parsing and manipulation
apsw.fts5query lets you programmatically create, update, and parse queries. There are three
forms of a query.
# This is the query as accepted by FTS5.
print("query")
print(query)
# That can be parsed into the structure
parsed = apsw.fts5query.parse_query_string(query)
print("\nparsed")
pprint(parsed)
# The parsed form is a little unwieldy to work with so a dict based
# form is available.
as_dict = apsw.fts5query.to_dict(parsed)
print("\nas_dict")
pprint(as_dict)
# Make some changes - delete the first query
del as_dict["queries"][0]
as_dict["queries"].append(
{
# add a columnfilter
"@": "COLUMNFILTER",
"filter": "include",
"columns": ["name", "description"],
# The sub queries are just strings. The module knows what
# you mean and will convert them into AND
"query": ["some thing blue", "sunshine"],
}
)
print("\nmodified as_dict")
pprint(as_dict)
# Turn it into parsed form
parsed = apsw.fts5query.from_dict(as_dict)
print("\nnew parsed")
pprint(parsed)
# Turn the parsed form back into a query string
query = apsw.fts5query.to_query_string(parsed)
print("\nnew query")
print(query)
query
("BoiLed eGGs" OR CinnaMON) OR NEAR (drink Cup, 5) NOT Oran*
parsed
OR(queries=[OR(queries=[PHRASE(phrase='BoiLed eGGs',
initial=False,
prefix=False,
plus=None),
PHRASE(phrase='CinnaMON',
initial=False,
prefix=False,
plus=None)]),
NOT(match=NEAR(phrases=[PHRASE(phrase='drink',
initial=False,
prefix=False,
plus=None),
PHRASE(phrase='Cup',
initial=False,
prefix=False,
plus=None)],
distance=5),
no_match=PHRASE(phrase='Oran',
initial=False,
prefix=True,
plus=None))])
as_dict
{'@': 'OR',
'queries': [{'@': 'OR',
'queries': [{'@': 'PHRASE', 'phrase': 'BoiLed eGGs'},
{'@': 'PHRASE', 'phrase': 'CinnaMON'}]},
{'@': 'NOT',
'match': {'@': 'NEAR',
'distance': 5,
'phrases': [{'@': 'PHRASE', 'phrase': 'drink'},
{'@': 'PHRASE', 'phrase': 'Cup'}]},
'no_match': {'@': 'PHRASE', 'phrase': 'Oran', 'prefix': True}}]}
modified as_dict
{'@': 'OR',
'queries': [{'@': 'NOT',
'match': {'@': 'NEAR',
'distance': 5,
'phrases': [{'@': 'PHRASE', 'phrase': 'drink'},
{'@': 'PHRASE', 'phrase': 'Cup'}]},
'no_match': {'@': 'PHRASE', 'phrase': 'Oran', 'prefix': True}},
{'@': 'COLUMNFILTER',
'columns': ['name', 'description'],
'filter': 'include',
'query': ['some thing blue', 'sunshine']}]}
new parsed
OR(queries=[NOT(match=NEAR(phrases=[PHRASE(phrase='drink',
initial=False,
prefix=False,
plus=None),
PHRASE(phrase='Cup',
initial=False,
prefix=False,
plus=None)],
distance=5),
no_match=PHRASE(phrase='Oran',
initial=False,
prefix=True,
plus=None)),
COLUMNFILTER(columns=['name', 'description'],
filter='include',
query=AND(queries=[PHRASE(phrase='some thing blue',
initial=False,
prefix=False,
plus=None),
PHRASE(phrase='sunshine',
initial=False,
prefix=False,
plus=None)]))])
new query
NEAR(drink Cup, 5) NOT Oran* OR {name description}: ("some thing blue" sunshine)
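As a small sketch, the rebuilt query string is a normal FTS5 query and can be used anywhere one is expected, such as the search method from earlier.
# A sketch: run the rebuilt query and show the best match, if any
for row in search_table.search(query):
    print(search_table.row_by_id(row.rowid, "name"))
    break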
Tokenizers
Tokenizers convert text into the tokens used to find matching rows. They work on UTF8 input,
providing the beginning and end offsets for each token. They can also provide more than one
token at the same position, for example if you wanted both first and 1st.
Tokenizers and their arguments are specified as the ‘tokenize’ option when creating an FTS5
table. You can also call them directly from a connection. APSW provides several tokenizers,
but let’s look at unicode61, the default SQLite tokenizer.
tokenizer = connection.fts5_tokenizer("unicode61")
test_text = """🤦🏼♂️ v1.2 Grey Ⅲ ColOUR! Don't jump - 🇫🇮你好世界 Straße
हैलो वर्ल्ड Déjà vu Résumé SQLITE_ERROR"""
# Call the tokenizer to do a tokenization, supplying the reason
# and no locale.
pprint(
tokenizer(
test_text.encode("utf8"), apsw.FTS5_TOKENIZE_DOCUMENT, None
)
)
# Make a function to show output
def show_tokens(
text,
tokenizer_name,
tokenizer_args=None,
reason=apsw.FTS5_TOKENIZE_DOCUMENT,
):
print(f"\n{text=:s}")
print(f"{tokenizer_name=} {tokenizer_args=}")
tokenizer = connection.fts5_tokenizer(
tokenizer_name, tokenizer_args
)
# exclude the offsets since they clutter the output
pprint(
tokenizer(
text.encode("utf8"),
reason,
None,
include_offsets=False,
)
)
print()
show_tokens("v1.2 SQLITE_ERROR", "unicode61")
# We want the version number and symbol kept together, so use
# the tokenchars parameter. Many tokenizers take parameters.
show_tokens("v1.2 SQLITE_ERROR", "unicode61", ["tokenchars", "_."])
# Tokenizers can also be chained together. The porter tokenizer takes
# existing tokens and turns them into a base. The rightmost tokenizer
# generates tokens, while ones to the left transform them. This ensures
# you can search for variations of words without having to get them
# exactly right.
show_tokens(
"Likes liked likely liking cat cats colour color",
"porter",
["unicode61", "tokenchars", "_"],
)
[(0, 8, '🤦🏼'),
(18, 20, 'v1'),
(21, 22, '2'),
(23, 27, 'grey'),
(28, 31, 'ⅲ'),
(32, 38, 'colour'),
(40, 43, 'don'),
(44, 45, 't'),
(46, 50, 'jump'),
(62, 74, '你好世界'),
(75, 82, 'straße'),
(87, 90, 'ह'),
(93, 96, 'ल'),
(100, 106, 'वर'),
(109, 112, 'ल'),
(115, 118, 'ड'),
(119, 125, 'deja'),
(126, 128, 'vu'),
(129, 137, 'resume'),
(138, 144, 'sqlite'),
(145, 150, 'error')]
text=v1.2 SQLITE_ERROR
tokenizer_name='unicode61' tokenizer_args=None
[('v1',), ('2',), ('sqlite',), ('error',)]
text=v1.2 SQLITE_ERROR
tokenizer_name='unicode61' tokenizer_args=['tokenchars', '_.']
[('v1.2',), ('sqlite_error',)]
text=Likes liked likely liking cat cats colour color
tokenizer_name='porter' tokenizer_args=['unicode61', 'tokenchars', '_']
[('like',),
('like',),
('like',),
('like',),
('cat',),
('cat',),
('colour',),
('color',)]
apsw.fts5.UnicodeWordsTokenizer
apsw.fts5.UnicodeWordsTokenizer() does word segmentation using the Unicode TR29 algorithm,
which works well across languages. It understands when punctuation is part of a word as in
don’t, that numbers include punctuation, and it handles emoji and regional indicators.
connection.register_fts5_tokenizer(
"unicodewords", apsw.fts5.UnicodeWordsTokenizer
)
# unicode61 doesn't understand grapheme clusters or
# punctuation in words, or other languages
show_tokens(test_text, "unicode61")
# unicodewords has you covered
show_tokens(test_text, "unicodewords")
text=🤦🏼♂️ v1.2 Grey Ⅲ ColOUR! Don't jump - 🇫🇮你好世界 Straße
हैलो वर्ल्ड Déjà vu Résumé SQLITE_ERROR
tokenizer_name='unicode61' tokenizer_args=None
[('🤦🏼',),
('v1',),
('2',),
('grey',),
('ⅲ',),
('colour',),
('don',),
('t',),
('jump',),
('你好世界',),
('straße',),
('ह',),
('ल',),
('वर',),
('ल',),
('ड',),
('deja',),
('vu',),
('resume',),
('sqlite',),
('error',)]
text=🤦🏼♂️ v1.2 Grey Ⅲ ColOUR! Don't jump - 🇫🇮你好世界 Straße
हैलो वर्ल्ड Déjà vu Résumé SQLITE_ERROR
tokenizer_name='unicodewords' tokenizer_args=None
[('🤦🏼\u200d♂️',),
('v1.2',),
('Grey',),
('Ⅲ',),
('ColOUR',),
("Don't",),
('jump',),
('🇫🇮',),
('你',),
('好',),
('世',),
('界',),
('Straße',),
('हैलो',),
('वर्ल्ड',),
('Déjà',),
('vu',),
('Résumé',),
('SQLITE_ERROR',)]
apsw.fts5.SimplifyTokenizer
You may have noticed that there are accents (diacritics) and
mixed case in the tokens in the example above. It is
convenient to remove those. The apsw.fts5.SimplifyTokenizer()
can neutralize case and remove accents and marks, so you can use it
to filter your own or other tokenizers.
connection.register_fts5_tokenizer(
"simplify", apsw.fts5.SimplifyTokenizer
)
show_tokens(
test_text,
"simplify",
[
# casefold is for case insensitive comparisons
"casefold",
"1",
# strip decomposes codepoints to remove accents
# and marks, and uses compatibility codepoints,
        # for example the Roman numeral Ⅲ becomes III
"strip",
"1",
# Use unicodewords to get the tokens to simplify
"unicodewords",
],
)
text=🤦🏼♂️ v1.2 Grey Ⅲ ColOUR! Don't jump - 🇫🇮你好世界 Straße
हैलो वर्ल्ड Déjà vu Résumé SQLITE_ERROR
tokenizer_name='simplify' tokenizer_args=['casefold', '1', 'strip', '1', 'unicodewords']
[('🤦♂',),
('v12',),
('grey',),
('iii',),
('colour',),
('dont',),
('jump',),
('🇫🇮',),
('你',),
('好',),
('世',),
('界',),
('strasse',),
('हल',),
('वरलड',),
('deja',),
('vu',),
('resume',),
('sqliteerror',)]
Your own tokenizer
We will define our own tokenizer that does the same as above, but without all those parameters in the table definition. A tokenizer takes the connection and a list of string parameters.
def my_tokenizer(
con: apsw.Connection, params: list[str]
) -> apsw.FTS5Tokenizer:
# we take no params
if params:
raise ValueError("Expected no parameters")
# Same as above, but no longer in our SQL
return con.fts5_tokenizer(
"simplify",
["casefold", "1", "strip", "1", "unicodewords"],
)
connection.register_fts5_tokenizer("mine", my_tokenizer)
# Produces same result as above
show_tokens(test_text, "mine")
text=🤦🏼♂️ v1.2 Grey Ⅲ ColOUR! Don't jump - 🇫🇮你好世界 Straße
हैलो वर्ल्ड Déjà vu Résumé SQLITE_ERROR
tokenizer_name='mine' tokenizer_args=None
[('🤦♂',),
('v12',),
('grey',),
('iii',),
('colour',),
('dont',),
('jump',),
('🇫🇮',),
('你',),
('好',),
('世',),
('界',),
('strasse',),
('हल',),
('वरलड',),
('deja',),
('vu',),
('resume',),
('sqliteerror',)]
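A hedged sketch of using the registered name as the tokenize option when creating a table, as described earlier. It uses a separate temporary database and a hypothetical table named demo so we don't build another index over the recipes data; the tokenizers "mine" depends on are registered on that connection too.
# A sketch: register the tokenizers "mine" depends on, then "mine"
# itself, on a separate connection and use it in the tokenize option
demo_connection = apsw.Connection("")
demo_connection.register_fts5_tokenizer(
    "simplify", apsw.fts5.SimplifyTokenizer
)
demo_connection.register_fts5_tokenizer(
    "unicodewords", apsw.fts5.UnicodeWordsTokenizer
)
demo_connection.register_fts5_tokenizer("mine", my_tokenizer)
demo_table = apsw.fts5.Table.create(
    demo_connection, "demo", columns=["content"], tokenize=["mine"]
)
demo_table.upsert("DON'T forget the Résumé")
for row in demo_table.search("dont resume"):
    print(demo_table.row_by_id(row.rowid, "content"))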
Your own tokenizer, part 2
We’ll make one entirely our own, not building on any existing tokenizer. Tokenizers operate
on UTF8 and byte offsets. The apsw.fts5.StringTokenizer() decorator lets you operate on str
instead and handles the mapping. apsw.fts5.parse_tokenizer_args() makes it easy to handle
parameters.
@apsw.fts5.StringTokenizer
def atokenizer(
con: apsw.Connection, params: list[str]
) -> apsw.FTS5Tokenizer:
# What we accept
ta = apsw.fts5.TokenizerArgument
spec = {
# two choices
"big": ta(choices=("ship", "plane")),
# default value only
"small": "hello",
# conversion
"block": ta(default=2, convertor=int),
}
options = apsw.fts5.parse_tokenizer_args(spec, con, params)
# show what options we got
print(f"{options=}")
def tokenize(text: str, reason: int, locale: str | None):
# See apsw.fts5.tokenize_reasons for mapping from text to number
print(f"{reason=}")
# if a locale table and value was used
print(f"{locale=}")
# break string in groups of 'block' characters
for start in range(0, len(text), options["block"]):
token = text[start : start + options["block"]]
yield start, start + len(token), token
return tokenize
connection.register_fts5_tokenizer("atokenizer", atokenizer)
# show full return - note offsets are utf8 bytes
tok = connection.fts5_tokenizer(
"atokenizer", ["big", "plane", "block", "5"]
)
pprint(tok(test_text.encode("utf8"), apsw.FTS5_TOKENIZE_AUX, None))
options={'big': 'plane', 'block': 5, 'small': 'hello'}
reason=8
locale=None
[(0, 17, '🤦🏼\u200d♂️'),
(17, 22, ' v1.2'),
(22, 27, ' Grey'),
(27, 34, ' Ⅲ Co'),
(34, 39, 'lOUR!'),
(39, 44, " Don'"),
(44, 49, 't jum'),
(49, 54, 'p - '),
(54, 71, '🇫🇮你好世'),
(71, 78, '界 Str'),
(78, 84, 'aße\n '),
(84, 93, ' है'),
(93, 106, 'लो वर'),
(106, 119, '्ल्ड '),
(119, 126, 'Déjà '),
(126, 132, 'vu Ré'),
(132, 138, 'sumé '),
(138, 143, 'SQLIT'),
(143, 148, 'E_ERR'),
(148, 150, 'OR')]
apsw.fts5.RegexTokenizer
We can use regular expressions. Unlike the other tokenizers, the pattern is not passed as a
SQL level parameter because there would be a confusing amount of backslashes, square brackets
and other quoting going on.
pattern = r"\d+" # digits
flags = re.ASCII # only ascii recognised
tokenizer = functools.partial(
apsw.fts5.RegexTokenizer, pattern=pattern, flags=flags
)
connection.register_fts5_tokenizer("my_regex", tokenizer)
# ASCII/Arabic and non-ascii digits
text = "text2abc 3.14 tamil ௦௧௨௩௪ bengali ০১২৩৪ arabic01234"
show_tokens(text, "my_regex")
text=text2abc 3.14 tamil ௦௧௨௩௪ bengali ০১২৩৪ arabic01234
tokenizer_name='my_regex' tokenizer_args=None
[('2',), ('3',), ('14',), ('01234',)]
apsw.fts5.RegexPreTokenizer
Use regular expressions to extract tokens of interest such as identifiers, and then use a different tokenizer on the text between the matches. Contrast with RegexTokenizer above, which ignores text not matching the pattern.
# For this example our identifiers are two digits slash two letters
text = "73/RS is larger than 65/ST"
# See what unicodewords does
show_tokens(text, "unicodewords")
# Setup RegexPreTokenizer
pattern = r"[0-9][0-9]/[A-Z][A-Z]"
tokenizer = functools.partial(
apsw.fts5.RegexPreTokenizer, pattern=pattern
)
connection.register_fts5_tokenizer("myids", tokenizer)
# extract myids, leaving the other text to unicodewords
show_tokens(text, "myids", ["unicodewords"])
text=73/RS is larger than 65/ST
tokenizer_name='unicodewords' tokenizer_args=None
[('73',), ('RS',), ('is',), ('larger',), ('than',), ('65',), ('ST',)]
text=73/RS is larger than 65/ST
tokenizer_name='myids' tokenizer_args=['unicodewords']
[('73/RS',), ('is',), ('larger',), ('than',), ('65/ST',)]
HTML tokenizer
The HTMLTokenizer() will pass on the extracted text to another tokenizer.
text = """<title>Big&Small</title><p>Hello <b>world</b>. BÀa"""
show_tokens(text, "html", ["unicodewords"])
text=<title>Big&Small</title><p>Hello <b>world</b>. BÀa
tokenizer_name='html' tokenizer_args=['unicodewords']
[('Big',), ('Small',), ('Hello',), ('world',), ('BÀa',)]
JSON tokenizer
JSONTokenizer() extracts strings from JSON for processing by another tokenizer.
import json
data = {
"name": "A car🚗",
"items": ["one", 2, "three", {"four": "five"}],
}
text = json.dumps(data, indent=True)
# Keys can be extracted
show_tokens(text, "json", ["include_keys", "1", "unicodewords"])
# or ignored
show_tokens(text, "json", ["include_keys", "0", "unicodewords"])
text={
"name": "A car\ud83d\ude97",
"items": [
"one",
2,
"three",
{
"four": "five"
}
]
}
tokenizer_name='json' tokenizer_args=['include_keys', '1', 'unicodewords']
[('name',),
('A',),
('car',),
('🚗',),
('items',),
('one',),
('three',),
('four',),
('five',)]
text={
"name": "A car\ud83d\ude97",
"items": [
"one",
2,
"three",
{
"four": "five"
}
]
}
tokenizer_name='json' tokenizer_args=['include_keys', '0', 'unicodewords']
[('A',), ('car',), ('🚗',), ('one',), ('three',), ('five',)]
Synonym tokenizer
SynonymTokenizer() is useful for adding colocated tokens.
text = "one Colour first big dog"
# We use a dict
synonyms = {
"colour": "color",
"first": "1st",
"dog": ["puppy", "canine", "k9"],
}
tokenizer = apsw.fts5.SynonymTokenizer(synonyms.get)
connection.register_fts5_tokenizer(
"synonyms",
tokenizer,
)
# It is to the left of simplify so we don't have to match all the
# different cases. By default the synonyms tokenizer only applies when
# tokenizing queries.
show_tokens(
text,
"synonyms",
["simplify", "casefold", "1", "unicodewords"],
reason=apsw.FTS5_TOKENIZE_QUERY,
)
text=one Colour first big dog
tokenizer_name='synonyms' tokenizer_args=['simplify', 'casefold', '1', 'unicodewords']
[('one',),
('colour', 'color'),
('first', '1st'),
('big',),
('dog', 'puppy', 'canine', 'k9')]
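A quick sketch confirming the default behaviour mentioned above: tokenizing the same text as a document does not add the synonyms.
# A sketch: no synonyms are added for document tokenization by default
show_tokens(
    text,
    "synonyms",
    ["simplify", "casefold", "1", "unicodewords"],
    reason=apsw.FTS5_TOKENIZE_DOCUMENT,
)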
Stopwords tokenizer
StopWordsTokenizer() removes tokens from the stream. Tokens that appear in almost every row
aren’t useful for finding good matches, and they increase the size of the index.
@apsw.fts5.StopWordsTokenizer
def ignore(token: str) -> bool:
# Return True if the token should be ignored. These are common
# English tokens that appear in almost all rows of English
# content.
return token in {
"to",
"the",
"and",
"for",
"you",
"of",
"on",
"is",
}
text = "On the green hills and blue skies you see forever"
connection.register_fts5_tokenizer("ignore", ignore)
# It is to the left of simplify so we don't have to match all the
# different cases.
show_tokens(
text, "ignore", ["simplify", "casefold", "1", "unicodewords"]
)
text=On the green hills and blue skies you see forever
tokenizer_name='ignore' tokenizer_args=['simplify', 'casefold', '1', 'unicodewords']
[('green',), ('hills',), ('blue',), ('skies',), ('see',), ('forever',)]
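A hedged sketch of deriving the stop word list from the table statistics instead of hard coding it, treating tokens present in a large share of rows as stop words. The 40% threshold is an arbitrary choice for illustration.
# A sketch: tokens appearing in over 40% of rows become stop words
common_tokens = {
    token
    for token, count in search_table.token_doc_frequency()
    if count / search_table.row_count > 0.4
}
print(common_tokens)
@apsw.fts5.StopWordsTokenizer
def ignore_common(token: str) -> bool:
    return token in common_tokens
connection.register_fts5_tokenizer("ignore_common", ignore_common)
show_tokens(
    text, "ignore_common", ["simplify", "casefold", "1", "unicodewords"]
)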
Transform tokenizer
TransformTokenizer() modifies tokens.
@apsw.fts5.TransformTokenizer
def transform(token: str) -> str:
# we remove trailing 's' and 'ing'
if token.endswith("s"):
return token[:-1]
if token.endswith("ing"):
return token[:-3]
return token
text = "Chickens playing towards talking plays talks"
connection.register_fts5_tokenizer("transform", transform)
# It is to the left of simplify so we don't have to match all the
# different cases.
show_tokens(
text, "transform", ["simplify", "casefold", "1", "unicodewords"]
)
text=Chickens playing towards talking plays talks
tokenizer_name='transform' tokenizer_args=['simplify', 'casefold', '1', 'unicodewords']
[('chicken',), ('play',), ('toward',), ('talk',), ('play',), ('talk',)]
Cleanup
We can now close the connection, but it is optional.
connection.close()