#!/usr/bin/env python3
"""
:mod:`apsw.unicode` - Up to date Unicode aware methods and lookups
This module helps with :doc:`textsearch` and general Unicode,
addressing the following:
* The standard library :mod:`unicodedata` has limited information
available (eg no information about emoji), and is only updated to
new `Unicode versions
<https://www.unicode.org/versions/enumeratedversions.html>`__ on a
new Python version.
* Multiple consecutive codepoints can combine into a single user
perceived character (grapheme cluster), such as combining accents,
vowels and marks in some writing systems, variant selectors, joiners
and linkers, etc. That means you can't use indexes into
:class:`str` safely without potentially breaking them.
* The standard library provides no help in splitting text into
grapheme clusters, words, and sentences, or into breaking text into
multiple lines.
* Text processing is performance sensitive - FTS5 easily handles
hundreds of megabytes to gigabytes of text, and so should this
module. It also affects the latency of each query as that is
tokenized, and results can highlight words and sentences.
This module is independent of the main apsw module, and loading it
does not load any database functionality. The majority of the
functionality is implemented in C for size and performance reasons.
See :data:`unicode_version` for the implemented version.
Grapheme cluster, word, and sentence splitting
==============================================
`Unicode Technical Report #29
<https://www.unicode.org/reports/tr29/>`__ rules for finding
grapheme clusters, words, and sentences are implemented. TR29
specifies break points which can be found via
:func:`grapheme_next_break`, :func:`word_next_break`, and
:func:`sentence_next_break`.
Building on those are iterators providing optional offsets and the
text. This is used for tokenization (getting character and word
boundaries correct), and for result highlighting (showing
words/sentences before and after match).
Line break splitting
====================
`Unicode Technical Report #14
<https://www.unicode.org/reports/tr14/>`__ rules for finding where
text can be broken and resumed on the next line are implemented. TR14 specifies
break points which can be found via :func:`line_break_next_break`.
Building on those are iterators providing optional offsets and the
text. This is used for :func:`text_wrap`.
Unicode lookups
===============
* Category information :func:`category`
* Is an emoji or similar :func:`is_extended_pictographic`
* Flag characters :func:`is_regional_indicator`
* Codepoint names :func:`codepoint_name`
Case folding, accent removal
============================
* :func:`casefold` is used to do case insensitive comparisons.
* :func:`strip` is used to remove accents, marks, punctuation,
joiners etc
Helpers
=======
These are aware of grapheme cluster boundaries which Python's builtin
string operations are not. The text width functions take into account
how wide the text is when displayed on most terminals.
* :func:`grapheme_length` to get the number of grapheme clusters
in a string
* :func:`grapheme_substr` to get substrings using grapheme cluster
indexing
* :func:`grapheme_startswith` and :func:`grapheme_endswith`
* :func:`grapheme_find` to find a substring
* :func:`split_lines` to split text into lines using all the
Unicode hard line break codepoints
* :func:`text_width` to count how wide the text is
* :func:`expand_tabs` to expand tabs using text width
* :func:`text_width_substr` to extract substrings based on text width
* :func:`text_wrap` to wrap paragraphs using Unicode words, line
breaking, and text width
* :func:`guess_paragraphs` to establish paragraph boundaries for
text that has line breaks in paragraphs like many plain text
and similar markup formats.
Size
====
Using the `ICU <https://icu.unicode.org/>`__ `extension
<https://pypi.org/project/PyICU/>`__ is 5MB of code that then
links to shared libraries containing another 5MB of code, and 30MB
of data. This module is 0.5MB, 5 to 50% faster, and has no
dependencies. (ICU includes numerous extra customisations,
formatting, locale helpers etc.)
Performance
===========
There are some pure Python alternatives, with less functionality.
They take 5 to 15 times more CPU time to process the same text.
Use ``python3 -m apsw.unicode benchmark --help``.
"""
from __future__ import annotations
from typing import Iterator, Iterable, Any
import re
### BEGIN UNICODE UPDATE SECTION ###
unicode_version = "16.0"
"""The `Unicode version <https://www.unicode.org/versions/enumeratedversions.html>`__
that the rules and data tables implement"""
class _Category:
Cc = 2**0
Cf = 2**1
Cn = 2**2
Co = 2**3
Cs = 2**4
Extended_Pictographic = 2**5
Ll = 2**6
Lm = 2**7
Lo = 2**8
Lt = 2**9
Lu = 2**10
Mc = 2**11
Me = 2**12
Mn = 2**13
Nd = 2**14
Nl = 2**15
No = 2**16
Pc = 2**17
Pd = 2**18
Pe = 2**19
Pf = 2**20
Pi = 2**21
Po = 2**22
Ps = 2**23
Regional_Indicator = 2**24
Sc = 2**25
Sk = 2**26
Sm = 2**27
So = 2**28
WIDTH_INVALID = 2**29
WIDTH_TWO = 2**30
WIDTH_ZERO = 2**31
Zl = 2**32
Zp = 2**33
Zs = 2**34
### END UNICODE UPDATE SECTION ###
from . import _unicode
assert unicode_version == _unicode.unicode_version
_unicode_category = _unicode.category_category
def category(codepoint: int | str) -> str:
"""Returns the `general category <https://en.wikipedia.org/wiki/Unicode_character_property#General_Category>`__ - eg ``Lu`` for Letter Uppercase
    See :data:`apsw.fts5.unicode_categories` for the mapping of category
    codes to descriptions.
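
    A rough sketch of expected values::

        category("A")     # "Lu" - Letter Uppercase
        category(0x20AC)  # "Sc" - Symbol Currency (the Euro sign)
    """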
cat = _unicode_category(codepoint)
if cat & _Category.Lu:
return "Lu" # Letter Uppercase
elif cat & _Category.Ll:
return "Ll" # Letter Lowercase
elif cat & _Category.Lt:
return "Lt" # Letter Titlecase
elif cat & _Category.Lm:
return "Lm" # Letter Modifier
elif cat & _Category.Lo:
return "Lo" # Letter Other
elif cat & _Category.Mn:
return "Mn" # Mark NonSpacing
elif cat & _Category.Mc:
return "Mc" # Mark SpacingCombining
elif cat & _Category.Me:
return "Me" # Mark Enclosing
elif cat & _Category.Nd:
return "Nd" # Number DecimalDigit
elif cat & _Category.Nl:
return "Nl" # Number Letter
elif cat & _Category.No:
return "No" # Number Other
elif cat & _Category.Pc:
return "Pc" # Punctuation Connector
elif cat & _Category.Pd:
return "Pd" # Punctuation Dash
elif cat & _Category.Ps:
return "Ps" # Punctuation Open
elif cat & _Category.Pe:
return "Pe" # Punctuation Close
elif cat & _Category.Pi:
return "Pi" # Punctuation InitialQuote
elif cat & _Category.Pf:
return "Pf" # Punctuation FinalQuote
elif cat & _Category.Po:
return "Po" # Punctuation Other
elif cat & _Category.Sm:
return "Sm" # Symbol Math
elif cat & _Category.Sc:
return "Sc" # Symbol Currency
elif cat & _Category.Sk:
return "Sk" # Symbol Modifier
elif cat & _Category.So:
return "So" # Symbol Other
elif cat & _Category.Zs:
return "Zs" # Separator Space
elif cat & _Category.Zl:
return "Zl" # Separator Line
elif cat & _Category.Zp:
return "Zp" # Separator Paragraph
elif cat & _Category.Cc:
return "Cc" # Other Control
elif cat & _Category.Cf:
return "Cf" # Other Format
elif cat & _Category.Cs:
return "Cs" # Other Surrogate
elif cat & _Category.Co:
return "Co" # Other PrivateUse
assert cat & _Category.Cn
return "Cn" # Other NotAssigned
def is_extended_pictographic(text: str) -> bool:
"Returns True if any of the text has the extended pictographic property (Emoji and similar)"
return _unicode.has_category(text, 0, len(text), _Category.Extended_Pictographic)
def is_regional_indicator(text: str) -> bool:
"Returns True if any of the text is one of the 26 `regional indicators <https://en.wikipedia.org/wiki/Regional_indicator_symbol>`__ used in pairs to represent country flags"
return _unicode.has_category(text, 0, len(text), _Category.Regional_Indicator)
def casefold(text: str) -> str:
"""Returns the text for equality comparison without case distinction
Case folding maps text to a canonical form where case differences
are removed allowing case insensitive comparison. Unlike upper,
lower, and title case, the result is not intended to be displayed
to people.
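
    A small illustration (standard Unicode case folding maps ``ß`` to
    ``ss``)::

        casefold("Straße")  # "strasse"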
"""
return _unicode.casefold(text)
def strip(text: str) -> str:
"""Returns the text for less exact comparison with accents, punctuation, marks etc removed
It will strip diacritics leaving the underlying characters so ``áççéñțś`` becomes ``accents``,
punctuation so ``e.g.`` becomes ``eg`` and ``don't`` becomes ``dont``, marks so ``देवनागरी``
becomes ``दवनगर``, as well as all spacing, formatting, `variation selectors
<https://en.wikipedia.org/wiki/Variation_Selectors_%28Unicode_block%29>`__ and similar codepoints.
Codepoints are also converted to their compatibility representation. For example
the single codepoint Roman numeral ``Ⅲ`` becomes ``III`` (three separate regular upper case `I`),
and ``🄷🄴🄻🄻🄾`` becomes ``HELLO``.
The resulting text should not be shown to people, and is intended for doing relaxed equality
comparisons, at the expense of false positives when the accents, marks, punctuation etc were
intended.
You should do :func:`case folding <casefold>` after this.
Emoji are preserved but variation selectors, `fitzpatrick <https://en.wikipedia.org/wiki/Emoji#Skin_color>`__
and `joiners <https://en.wikipedia.org/wiki/Zero-width_joiner>`__ are stripped.
`Regional indicators <https://en.wikipedia.org/wiki/Regional_indicator_symbol>`__ are preserved.
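
    A small illustration using the examples above::

        strip("áççéñțś")  # "accents"
        strip("don't")    # "dont"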
"""
return _unicode.strip(text)
def split_lines(text: str, offset: int = 0) -> Iterator[str]:
"""Each line, using hard line break rules
    This is an iterator yielding a line at a time. The lines
    yielded will not include the hard line break characters.
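
    A minimal sketch of usage::

        text = "hello" + chr(10) + "world"  # embedded hard line break
        list(split_lines(text))             # ["hello", "world"]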
"""
lt = len(text)
while offset < lt:
end = _unicode.line_next_hard_break(text, offset)
for hard in range(-1, offset - end - 1, -1):
if ord(text[end + hard]) not in _unicode.hard_breaks:
yield text[offset : end + hard + 1]
break
else:
# it was entirely hard break chars
yield ""
offset = end
def expand_tabs(text: str, tabsize: int = 8, invalid: str = ".") -> str:
"""Turns tabs into spaces aligning on tabsize boundaries, similar to :meth:`str.expandtabs`
This is aware of grapheme clusters and text width. Codepoints
that have an invalid width are also replaced by ``invalid``.
Control characters are an example of an invalid character. Line
breaks are replaced with newline.
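
    A quick sketch (tab stops every 8 columns by default)::

        expand_tabs("ab" + chr(9) + "c")  # "ab      c" - padded to column 8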
"""
res: list[str] = []
for line in split_lines(text):
# short cut
if "\t" not in line and text_width(line) >= 0:
res.append(line)
continue
# work on it cluster by cluster
clusters: list[str] = []
pos: int = 0
for gr in grapheme_iter(line):
if gr != "\t":
w = text_width(gr)
if w < 0:
gr = invalid
w = text_width(gr)
pos += w
clusters.append(gr)
else:
# str.expandtabs allows zero and negative numbers
incr = tabsize - (pos % tabsize) if tabsize > 0 else 0
clusters.append(" " * incr)
pos += incr
res.append("".join(clusters))
return "\n".join(res) + ("\n" if len(res) > 1 else "")
def grapheme_length(text: str, offset: int = 0) -> int:
"Returns number of grapheme clusters in the text. Unicode aware version of len"
return _unicode.grapheme_length(text, offset)
def grapheme_substr(text: str, start: int | None = None, stop: int | None = None) -> str:
"""Like ``text[start:end]`` but in grapheme cluster units
    ``start`` and ``stop`` can be negative to index from the end, or
    outside the bounds of the text, but are never an invalid
    combination (you get an empty string returned).
To get one grapheme cluster, make stop one more than start.
For example to get the 3rd last grapheme cluster::
grapheme_substr(text, -3, -3 + 1)
"""
return _unicode.grapheme_substr(text, start, stop)
def grapheme_endswith(text: str, substring: str) -> bool:
"Returns True if `text` ends with `substring` being aware of grapheme cluster boundaries"
# match str.endswith
if len(substring) == 0:
return True
if text.endswith(substring):
# it must end with the same codepoints, but also has to start at
# a grapheme cluster boundary
expected = len(text) - len(substring)
boundary = 0
while boundary < expected:
boundary = _unicode.grapheme_next_break(text, boundary)
return boundary == expected
return False
def grapheme_startswith(text: str, substring: str) -> bool:
"Returns True if `text` starts with `substring` being aware of grapheme cluster boundaries"
# match str.startswith
if len(substring) == 0:
return True
if text.startswith(substring):
# it must start with the same codepoints, but also has to end at
# a grapheme cluster boundary
expected = len(substring)
boundary = 0
while boundary < expected:
boundary = _unicode.grapheme_next_break(text, boundary)
return boundary == expected
return False
def grapheme_find(text: str, substring: str, start: int = 0, end: int | None = None) -> int:
"""Returns the offset in text where substring can be found, being aware of grapheme clusters.
The start and end of the substring have to be at a grapheme cluster boundary.
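
    A sketch of the difference from :meth:`str.find`, using ``é`` written
    as ``e`` plus a combining acute accent (two codepoints, one cluster)::

        text = "e" + chr(0x301) + "x"
        text.find("e")            # 0 - str is unaware of the cluster
        grapheme_find(text, "e")  # -1 - "e" alone is not a whole cluster
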
:param start: Where in text to start the search (default beginning)
:param end: Where to stop the search exclusive (default remaining text)
:returns: offset into text, or -1 if not found or substring is zero length
"""
# C version is 7.5X faster than Python version
return _unicode.grapheme_find(text, substring, start, end if end is not None else len(text))
def text_width(text: str, offset: int = 0) -> int:
"""Returns how many columns the text would be if displayed in a terminal
You should :func:`split_lines` first and then operate on each line
separately.
    If the `text` contains new lines, control characters, or similar
    unrepresentable codepoints then -1 is returned.

    Terminals aren't entirely consistent with each other, and Unicode
    has many kinds of codepoints and combinations, so this is right
    the vast majority of the time, but not always.
Note that web browsers do variable widths even in monospaced
sections like ``<pre>`` so they won't always agree with the terminal
either.
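
    A rough sketch (assuming a typical terminal, where CJK codepoints
    are double width)::

        text_width("abc")   # 3
        text_width("日本")  # 4 - each codepoint is two columns wide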
"""
# Some benchmarks in seconds running on 45MB of the UNDR
# 8.34 wcwidth Python module
# 8.14 The implementation of this in Python
# 0.20 Calling libc wcswidth via ctypes
# 0.14 The Python converted to C
return _unicode.text_width(text, offset)
def text_width_substr(text: str, width: int, offset: int = 0) -> tuple[int, str]:
"""Extracts substring width or less wide being aware of grapheme cluster boundaries.
For example you could use this to get a substring that is 80 (or
less) wide.
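
    A sketch of fitting text into 8 columns::

        text_width_substr("hello world", 8)  # (8, "hello wo")
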
:returns: A tuple of how wide the substring is, and the substring"""
if not isinstance(width, int) or width < 1:
raise ValueError("width must be an int at least 1")
width_so_far = 0
accepted = offset
for _, end, grapheme in grapheme_iter_with_offsets(text, offset):
seg_width = text_width(grapheme)
if seg_width < 0:
raise ValueError(f"text contains invalid codepoints {grapheme=}")
if width_so_far + seg_width <= width:
width_so_far += seg_width
accepted = end
else:
break
if width_so_far == width:
break
return width_so_far, text[offset:accepted]
def guess_paragraphs(text: str, tabsize: int = 8) -> str:
"""Given text that contains paragraphs containing newlines, guesses where the paragraphs end.
The returned :class:`str` will have ``\n`` removed where it was
determined to not mark a paragraph end.
.. code-block:: output
If you have text like this, where paragraphs have newlines in
them, then each line gets wrapped separately by text_wrap.
This function tries to guess where the paragraphs end.
Blank lines like above are definite.
Indented lines that continue preserving the indent
are considered the same paragraph, and a change of indent
(in or out) is a new paragraph.
So this will be a new paragraph,
And this will be a new paragraph.
* Punctuation/numbers at the start of line
followed by indented text are considered the same
paragraph
2. So this is a new paragraph, while
this line is part of the line above
3. Optional numbers followed by punctuation then space
- are considered new paragraphs
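
    A typical usage sketch, re-wrapping text whose paragraphs contain
    newlines (``raw_text`` being a hypothetical input string)::

        for line in text_wrap(guess_paragraphs(raw_text), width=60):
            print(line)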
"""
# regex to match what looks like an (optionally numbered) list
# item
list_item_re = r"^(?P<indent>\s*[0-9+=,\.*:-]+\s+).*"
# what we turn definite end of paragraph into
parasep = "\u2029"
# Force unicode end of line, form feed, next line to parasep
text = text.replace("\u2028", parasep)
text = text.replace("\u000d", parasep)
text = text.replace("\u0085", parasep)
# tabify
text = expand_tabs(text, tabsize)
# Fix Windows EOL
text = text.replace("\r\n", "\n")
# Any stray CR become parasep
text = text.replace("\r", parasep)
# Two newlines is definite
text = text.replace("\n\n", parasep + parasep)
paragraphs: list[str] = []
def append_paragraph(p: list[str]) -> None:
# appends the list of strings as a paragraph
# but we have to strip any indent from second and
# succeeding line
not_first = [line.lstrip(" ") for line in p[1:]]
paragraphs.append(" ".join([p[0]] + not_first))
# each segment is one or more paragraphs
for segment in text.split(parasep):
if "\n" not in segment:
paragraphs.append(segment)
continue
para: list[str] = []
for line in segment.split("\n"):
if not para:
# this is definitely a new paragraph
para.append(line)
continue
# optional spaces, followed by digits|punctuation followed by space
# is considered a new paragraph as a list item.
if re.match(list_item_re, line):
if para:
append_paragraph(para)
para = [line]
continue
# Does indent match previous line
if len(line) - len(line.lstrip(" ")) == len(para[-1]) - len(para[-1].lstrip(" ")):
para.append(line)
continue
# Does indent match previous line as a list item indent?
mo = re.match(list_item_re, para[-1])
if mo:
if len(mo.group("indent")) == len(line) - len(line.lstrip(" ")):
para.append(line)
continue
# new paragraph
append_paragraph(para)
para = [line]
continue
if para:
append_paragraph(para)
# turn back into newline as the expected delimiter
return "\n".join(paragraphs) + "\n"
def text_wrap(
text: str,
width: int = 70,
*,
tabsize: int = 8,
hyphen: str = "-",
combine_space: bool = True,
invalid: str = "?",
) -> Iterator[str]:
"""Similar to :func:`textwrap.wrap` but Unicode grapheme cluster and line break aware
.. note::
Newlines in the text are treated as end of paragraph. If your text has paragraphs
with newlines in them, then call :func:`guess_paragraphs` first.
:param text: string to process
:param width: width of yielded lines, if rendered using a monospace font such as to a terminal
:param tabsize: Tab stop spacing as tabs are expanded
:param hyphen: Used to show a segment was broken because it was wider than ``width``
    :param combine_space: Leading space on each line (the indent) is always preserved. Elsewhere
        runs of multiple spaces are combined into one space.
:param invalid: If invalid codepoints are encountered such as control characters and surrogates
then they are replaced with this.
This yields one line of :class:`str` at a time, which will be
exactly ``width`` when output to a terminal. It will be right
padded with spaces if necessary and not have a trailing newline.
:func:`apsw.ext.format_query_table` uses this method to ensure
each column is the desired width.
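
    A minimal usage sketch::

        for line in text_wrap("a few words to wrap over lines", width=16):
            print(line)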
"""
hyphen_width = text_width(hyphen)
text = expand_tabs(text, tabsize, invalid)
for line in split_lines(text):
accumulated: list[str] = []
line_width = 0
indent = None
space = False
for segment in line_break_iter(line):
if indent is None:
indent = " " * (len(segment) - len(segment.lstrip(" "))) if segment[0] == " " else ""
if len(indent) >= width - hyphen_width:
# make space for double width char if indent wider than width
indent = indent[: max(0, width - hyphen_width - 2)]
accumulated = [indent]
line_width = len(indent)
if line_width:
if len(indent) != len(segment): # there was spaces and text
segment = segment[line_width:]
else:
continue
if combine_space:
new_segment = segment.rstrip(" ")
new_space = new_segment != segment
# we want to prepend a space if the previous segment
# ended in space
segment = (" " if space else "") + new_segment
space = new_space
seg_width = text_width(segment)
assert seg_width >= 0
while line_width + seg_width > width:
if len(accumulated) == 1: # only indent present
if combine_space and segment[0] == " ":
# we added a space, but don't need it on new line
segment = segment[1:]
# hyphenate too long
hyphen_out = hyphen
desired = width - hyphen_width - line_width
if desired < 1:
hyphen_out = ""
desired = width - line_width
seg_width, substr = text_width_substr(segment, desired)
if seg_width == 0:
# the first grapheme cluster is wider than desired so
# we will display '*' instead for that first grapheme cluster
segment = grapheme_substr(segment, 1)
substr = "*" * desired
else:
segment = segment[len(substr) :]
if desired - seg_width: # did we get less than asked for?
substr += " " * (desired - seg_width)
yield indent + substr + hyphen_out
accumulated = [indent]
line_width = len(indent)
seg_width = text_width(segment)
continue
yield "".join(accumulated) + " " * (width - line_width)
if combine_space and segment[0] == " ":
# we added a space, but don't need it on new line
segment = segment[1:]
seg_width -= 1
accumulated = [indent]
line_width = len(indent)
continue
if segment:
accumulated.append(segment)
line_width += seg_width
if len(accumulated) == 1:
# only indent
yield " " * width
else:
yield "".join(accumulated) + " " * (width - line_width)
def codepoint_name(codepoint: int | str) -> str | None:
"""Name or ``None`` if it doesn't have one
For example codepoint 65 is named ``LATIN CAPITAL LETTER A``
while codepoint U+D1234 is not assigned and would return
``None``.
"""
return _unicode.codepoint_name(codepoint)
def version_added(codepoint: int | str) -> str | None:
"Returns the unicode version the codepoint was added"
return _unicode.version_added(codepoint)
version_dates = {
# Extracted from https://www.unicode.org/history/publicationdates.html
"16.0": (2024, 9, 10),
"15.1": (2023, 9, 12),
"15.0": (2022, 9, 13),
"14.0": (2021, 9, 14),
"13.0": (2020, 3, 10),
"12.1": (2019, 5, 7),
"12.0": (2019, 3, 5),
"11.0": (2018, 6, 5),
"10.0": (2017, 6, 20),
"9.0": (2016, 6, 21),
"8.0": (2015, 6, 17),
"7.0": (2014, 6, 16),
"6.3": (2013, 9, 30),
"6.2": (2012, 9, 26),
"6.1": (2012, 1, 31),
"6.0": (2010, 10, 11),
"5.2": (2009, 10, 1),
"5.1": (2008, 4, 4),
"5.0": (2006, 7, 14),
"4.1": (2005, 3, 31),
# These releases have no day, so we use the first of the month
"4.0": (2003, 4, 1),
"3.2": (2002, 3, 1),
"3.1": (2001, 3, 1),
"3.0": (1999, 9, 1),
"2.1": (1998, 5, 1),
"2.0": (1996, 7, 1),
"1.1": (1993, 6, 1),
"1.0": (1991, 10, 1),
}
"""Release date (year, month, day) for each unicode version
intended for use with :meth:`version_added`"""
def grapheme_next_break(text: str, offset: int = 0) -> int:
"""Returns end of Grapheme cluster / User Perceived Character
For example regional indicators are in pairs, and a base codepoint
can be combined with zero or more additional codepoints providing
diacritics, marks, and variations. Break points are defined in
the `TR29 spec
<https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules>`__.
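
    A sketch using ``é`` written as ``e`` plus a combining acute accent::

        text = "e" + chr(0x301) + "x"
        grapheme_next_break(text)     # 2 - the accent extends the cluster
        grapheme_next_break(text, 2)  # 3
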
:param text: The text to examine
:param offset: The first codepoint to examine
:returns: Index of first codepoint not part of the grapheme cluster
starting at offset. You should extract ``text[offset:span]``
"""
return _unicode.grapheme_next_break(text, offset)
def grapheme_next(text: str, offset: int = 0) -> tuple[int, int]:
"Returns span of next grapheme cluster"
end = grapheme_next_break(text, offset)
return offset, end
def grapheme_iter(text: str, offset: int = 0) -> Iterator[str]:
"Iterator providing text of each grapheme cluster"
lt = len(text)
meth = _unicode.grapheme_next_break
start = offset
while offset < lt:
offset = meth(text, offset)
yield text[start:offset]
start = offset
def grapheme_iter_with_offsets(text: str, offset: int = 0) -> Iterator[tuple[int, int, str]]:
"Iterator providing start, end, text of each grapheme cluster"
lt = len(text)
meth = _unicode.grapheme_next_break
start = offset
while offset < lt:
offset = meth(text, offset)
yield (start, offset, text[start:offset])
start = offset
def grapheme_iter_with_offsets_filtered(
text: str, offset: int = 0, *, categories: Iterable[str], emoji: bool = False, regional_indicator: bool = False
) -> Iterator[tuple[int, int, str]]:
"Iterator providing start, end, text of each grapheme cluster, providing it includes codepoints from categories, emoji, or regional indicator"
mask = _cats_to_mask(categories, emoji, regional_indicator)
lt = len(text)
meth = _unicode.grapheme_next_break
catcheck = _unicode.has_category
while offset < lt:
end = meth(text, offset)
if catcheck(text, offset, end, mask):
yield (offset, end, text[offset:end])
offset = end
def word_next_break(text: str, offset: int = 0) -> int:
"""Returns end of next word or non-word
Finds the next break point according to the `TR29 spec
<https://www.unicode.org/reports/tr29/#Word_Boundary_Rules>`__.
Note that the segment returned may be a word, or a non-word
(spaces, punctuation etc). Use :func:`word_next` to get words.
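
    A sketch of walking the break points::

        text = "hello world"
        word_next_break(text)     # 5 - end of "hello"
        word_next_break(text, 5)  # 6 - the space is its own segment
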
:param text: The text to examine
:param offset: The first codepoint to examine
:returns: Next break point
"""
return _unicode.word_next_break(text, offset)
_cats_to_mask_mapping = {
"Lu": _Category.Lu,
"Ll": _Category.Ll,
"Lt": _Category.Lt,
"Lm": _Category.Lm,
"Lo": _Category.Lo,
"Mn": _Category.Mn,
"Mc": _Category.Mc,
"Me": _Category.Me,
"Nd": _Category.Nd,
"Nl": _Category.Nl,
"No": _Category.No,
"Pc": _Category.Pc,
"Pd": _Category.Pd,
"Ps": _Category.Ps,
"Pe": _Category.Pe,
"Pi": _Category.Pi,
"Pf": _Category.Pf,
"Po": _Category.Po,
"Sm": _Category.Sm,
"Sc": _Category.Sc,
"Sk": _Category.Sk,
"So": _Category.So,
"Zs": _Category.Zs,
"Zl": _Category.Zl,
"Zp": _Category.Zp,
"Cc": _Category.Cc,
"Cf": _Category.Cf,
"Cs": _Category.Cs,
"Co": _Category.Co,
"Cn": _Category.Cn,
}
def _cats_to_mask(categories: Iterable[str], emoji: bool, regional_indicator: bool) -> int:
mask = 0
for cat in categories:
mask |= _cats_to_mask_mapping[cat]
if emoji:
mask |= _Category.Extended_Pictographic
if regional_indicator:
mask |= _Category.Regional_Indicator
return mask
word_default_categories = {"Lu", "Ll", "Lt", "Lm", "Lo", "Nd", "Nl", "No"}
"Default categories for selecting word segments - letters and numbers"
def word_next(
text: str,
offset: int = 0,
*,
categories: Iterable[str] = word_default_categories,
emoji: bool = False,
regional_indicator: bool = False,
) -> tuple[int, int]:
"""Returns span of next word
A segment is considered a word if it contains at least one codepoint corresponding
to any of the `categories`, plus:
* emoji (Extended_Pictographic in Unicode specs)
* regional indicator - two character sequence for flags like 🇧🇷🇨🇦
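
    A sketch showing non-word segments being skipped (a hypothetical but
    expected value)::

        word_next("  hello world")  # (2, 7) - the span of "hello"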
"""
mask = _cats_to_mask(categories, emoji, regional_indicator)
lt = len(text)
meth = _unicode.word_next_break
catcheck = _unicode.has_category
while offset < lt:
end = meth(text, offset)
if catcheck(text, offset, end, mask):
return offset, end
offset = end
return offset, offset
def word_iter(
text: str,
offset: int = 0,
*,
categories: Iterable[str] = word_default_categories,
emoji: bool = False,
regional_indicator: bool = False,
) -> Iterator[str]:
"Iterator providing text of each word"
mask = _cats_to_mask(categories, emoji, regional_indicator)
lt = len(text)
meth = _unicode.word_next_break
catcheck = _unicode.has_category
while offset < lt:
end = meth(text, offset)
if catcheck(text, offset, end, mask):
yield text[offset:end]
offset = end
def word_iter_with_offsets(
text: str,
offset: int = 0,
*,
categories: Iterable[str] = word_default_categories,
emoji: bool = False,
regional_indicator: bool = False,
) -> Iterator[tuple[int, int, str]]:
"Iterator providing start, end, text of each word"
mask = _cats_to_mask(categories, emoji, regional_indicator)
lt = len(text)
meth = _unicode.word_next_break
catcheck = _unicode.has_category
while offset < lt:
end = meth(text, offset)
if catcheck(text, offset, end, mask):
yield (offset, end, text[offset:end])
offset = end
def sentence_next_break(text: str, offset: int = 0) -> int:
"""Returns end of sentence location.
Finds the next break point according to the `TR29 spec
<https://www.unicode.org/reports/tr29/#Sentence_Boundary_Rules>`__.
Note that the segment returned includes leading and trailing white space.
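
    A rough sketch (the trailing space stays with the sentence)::

        sentence_next_break("First sentence. Second one.")  # 16
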
:param text: The text to examine
:param offset: The first codepoint to examine
:returns: Next break point
"""
return _unicode.sentence_next_break(text, offset)
def sentence_next(text: str, offset: int = 0) -> tuple[int, int]:
"""Returns span of next sentence"""
lt = len(text)
meth = _unicode.sentence_next_break
while offset < lt:
end = meth(text, offset=offset)
return offset, end
return offset, offset
def sentence_iter(text: str, offset: int = 0) -> Iterator[str]:
"Iterator providing text of each sentence"
lt = len(text)
meth = _unicode.sentence_next_break
while offset < lt:
end = meth(text, offset)
yield text[offset:end]
offset = end
def sentence_iter_with_offsets(text: str, offset: int = 0) -> Iterator[tuple[int, int, str]]:
"Iterator providing start, end, text of each sentence"
lt = len(text)
meth = _unicode.sentence_next_break
while offset < lt:
end = meth(text, offset)
yield (offset, end, text[offset:end])
offset = end
def line_break_next_break(text: str, offset: int = 0) -> int:
"""Returns next opportunity to break a line
Finds the next break point according to the `TR14 spec
<https://www.unicode.org/reports/tr14/#LB1>`__.
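
    A rough sketch (TR14 keeps trailing spaces with the preceding
    segment)::

        line_break_next_break("hello world")  # 6 - can break before "world"
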
:param text: The text to examine
:param offset: The first codepoint to examine
:returns: Next break point
"""
return _unicode.line_next_break(text, offset)
def line_break_next(text: str, offset: int = 0) -> tuple[int, int]:
"""Returns span of next line"""
lt = len(text)
meth = _unicode.line_next_break
while offset < lt:
end = meth(text, offset=offset)
return offset, end
return offset, offset
def line_break_iter(text: str, offset: int = 0) -> Iterator[str]:
"Iterator providing text of each line"
lt = len(text)
meth = _unicode.line_next_break
while offset < lt:
end = meth(text, offset)
yield text[offset:end]
offset = end
def line_break_iter_with_offsets(text: str, offset: int = 0) -> Iterator[tuple[int, int, str]]:
"Iterator providing start, end, text of each line"
lt = len(text)
meth = _unicode.line_next_break
while offset < lt:
end = meth(text, offset)
yield (offset, end, text[offset:end])
offset = end
if __name__ == "__main__":
import argparse
import os
import sys
import atexit
import apsw.fts5
    # We output text that non-Unicode capable terminals can't handle
sys.stdout.reconfigure(errors="replace")
width = 80
if sys.stdout.isatty():
width = os.get_terminal_size(sys.stdout.fileno()).columns
parser = argparse.ArgumentParser()
parser.add_argument(
"-cc",
"--compact-codepoints",
dest="compact_codepoints",
action="store_true",
default=False,
help="Only show hex codepoint values, not full details",
)
subparsers = parser.add_subparsers(required=True)
p = subparsers.add_parser("breaktest", help="Run Unicode test file")
p.set_defaults(function="breaktest")
p.add_argument("-v", default=False, action="store_true", dest="verbose", help="Show each line as it is tested")
p.add_argument("--fail-fast", default=False, action="store_true", help="Exit on first test failure")
p.add_argument(
"--fail-codepoints-separator",
default=" ",
help="What to separate the list of codepoints with on failure. Useful for long test strings [%(default)s]",
)
p.add_argument("test", choices=("grapheme", "word", "sentence", "line_break"), help="What to test")
p.add_argument(
"file",
help="break test text file. They can be downloaded from https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/",
type=argparse.FileType("rt", encoding="utf8"),
)
p = subparsers.add_parser("show", help="Run against provided text")
p.set_defaults(function="show")
p.add_argument("show", choices=("grapheme", "word", "sentence", "line_break"), help="What to show [%(default)s]")
p.add_argument("--text-file", type=argparse.FileType("rt", encoding="utf8"))
p.add_argument(
"--categories",
default="L* N*",
help="For word, which segments are included. You can use wildcards and ! for negation [%(default)s]",
)
p.add_argument(
"--emoji",
default=False,
action="store_true",
help="For word, if emoji segments are included [%(default)s]",
)
p.add_argument(
"--regional-indicator",
default=False,
action="store_true",
help="For word, if regional indicator segments are included [%(default)s]",
)
p.add_argument("text", nargs="*", help="Text to segment unless --text-file used")
p = subparsers.add_parser("codepoint", help="Show information about codepoints")
p.add_argument("text", nargs="+", help="If a hex constant then use that value, otherwise treat as text")
p.set_defaults(function="codepoint")
p = subparsers.add_parser(
"benchmark",
help="Measure how long segmentation takes to iterate each segment",
)
p.set_defaults(function="benchmark")
p.add_argument(
"--size",
type=float,
default=50,
help="How many million characters (codepoints) of text to use [%(default)s]",
)
p.add_argument("--seed", type=int, default=0, help="Random seed to use [%(default)s]")
p.add_argument(
"--others",
help="A comma separated list of other packages to also benchmark. Use 'all' to get all available ones. Supported are grapheme, uniseg, pyicu",
)
p.add_argument(
"text_file",
type=argparse.FileType("rt", encoding="utf8"),
help="""Text source to use.
The provided text will be repeatedly duplicated and shuffled then
appended until the sized amount of text is available.
A suggested source of text is to download the Universal Declaration of
Human Rights Corpus from https://www.nltk.org/nltk_data/ and
concatenate all the files together. It contains the same text
in most written languages.
""",
)
p = subparsers.add_parser(
"textwrap",
help="""Wrap text to fit the specified number of columns. Each line output
will be padded with spaces to the width.""",
)
p.set_defaults(function="textwrap")
p.add_argument(
"--measurement",
default="apsw.unicode",
choices=["wcswidth-c", "wcswidth-py"],
help="""Instead of using the builtin function for measuring how wide text is
use the C library function wcswidth, or use the wcwidth Python package wcswidth function""",
)
p.add_argument(
"--invalid",
default="?",
help="Replacement for invalid codepoints such as control characters and surrogates [%(default)s]",
)
p.add_argument(
"--width",
type=int,
default=width,
help="How many columns to wrap to [%(default)s]",
)
p.add_argument("--tabsize", type=int, default=8, help="Tabstop size [%(default)s]")
p.add_argument("--hyphen", default="-", help="Text to use when a segment is longer that width [%(default)s]")
p.add_argument(
"--no-combine-space",
dest="combine_space",
default=True,
action="store_false",
help="Disable combining multiple spaces into one. Note that leading indents are always preserved",
)
p.add_argument(
"--start", default="", help="Text output at the beginning of each line. It counts against the width"
)
p.add_argument("--end", default="", help="Text output at the end of each line. It counts against the width")
p.add_argument(
"--use-stdlib",
default=False,
action="store_true",
help="Uses the system textwrap library instead. hyphen is ignored and start/end are applied by this code.",
)
p.add_argument(
"--guess-paragraphs",
default=False,
action="store_true",
help="Guess if newlines in text are the same paragraphs. See the doc for apsw.unicode.guess_paragraphs for details",
)
p.add_argument(
"text_file",
type=argparse.FileType("rt", encoding="utf8"),
help="""Text source to use encoded in UTF8. Newlines are considered to delimit each paragraph, so consider --guess-paragraphs.
Use a name of a single dash to read from standard input.""",
)
p = subparsers.add_parser("casefold", help="Does casefolding on text")
p.set_defaults(function="casefold")
p.add_argument("input", default=sys.stdin, type=argparse.FileType("rt", encoding="utf8"), help="Input text [stdin]")
p.add_argument(
"output", default=sys.stdout, type=argparse.FileType("wt", encoding="utf8"), help="Output text [stdout]"
)
p = subparsers.add_parser("strip", help="Strips accents, uses compatibility codepoints etc")
p.set_defaults(function="strip")
p.add_argument("input", default=sys.stdin, type=argparse.FileType("rt", encoding="utf8"), help="Input text [stdin]")
p.add_argument(
"output", default=sys.stdout, type=argparse.FileType("wt", encoding="utf8"), help="Output text [stdout]"
)
p = subparsers.add_parser(
"breaktestgen",
help="""Extracts data strings to be added to test suite""",
)
p.set_defaults(function="breaktestgen")
p.add_argument("grapheme", type=argparse.FileType("rt", encoding="utf8"), help="Grapheme break test file")
p.add_argument("word", type=argparse.FileType("rt", encoding="utf8"), help="Word break test file")
p.add_argument("sentence", type=argparse.FileType("rt", encoding="utf8"), help="Sentence break test file")
p.add_argument("line_break", type=argparse.FileType("rt", encoding="utf8"), help="Line break test file")
p = subparsers.add_parser(
"width-check",
help="""Check how this terminal differs from width database.
Any differences are reported to stdout in csv format so you should redirect output to a file.
Cursor positioning ANSI sequences are used. Do not type in the terminal while it is running.
It takes about a minute to run with most terminals, 1 hour for kitty, and 21+ hours for gnome
terminals.""",
)
p.set_defaults(function="widthcheck")
options = parser.parse_args()
def codepoint_details(kind, c: str, counter=None) -> str:
if options.compact_codepoints:
return f"U+{ord(c):04x}"
name = str(codepoint_name(c))
cat = category(ord(c))
counter = f"#{counter}:" if counter is not None else ""
name += f" ({ cat } { apsw.fts5.unicode_categories[cat] })"
uni_cat = " | ".join(_unicode.category_name(kind, ord(c)))
return "{" + f"{counter}U+" + ("%04X" % ord(c)) + f" {name} : { uni_cat }" + "}"
if options.function == "show":
if not options.text_file and not options.text:
p.error("You must specify at least --text-file or text arguments")
params = {
"categories": apsw.fts5.convert_unicode_categories(options.categories),
"emoji": options.emoji,
"regional_indicator": options.regional_indicator,
}
if options.show != "word":
params = {}
text = ""
if options.text_file:
text += options.text_file.read()
if options.text:
if text:
text += " "
text += " ".join(options.text)
next_func = globals()[f"{ options.show }_next"]
counter = 0
offset = 0
while offset < len(text):
begin, end = next_func(text, offset, **params)
print(
f"#{ counter } offset { offset } span { begin }-{ end } codepoints { end - begin } value: { text[begin:end] }"
)
for i in range(begin, end):
print(" ", codepoint_details(options.show, text[i]))
offset = end
counter += 1
elif options.function == "textwrap":
# stop debug interpreter whining about file not being closed
atexit.register(lambda: options.text_file.close())
if options.measurement == "wcswidth-c":
import ctypes
import ctypes.util
libc = ctypes.cdll.LoadLibrary(ctypes.util.find_library("c")) # C library on Unix/Linux platforms
if not hasattr(libc, "wcswidth"):
sys.exit("C library does not have wcswidth function")
libc.wcswidth.argtypes = [ctypes.c_wchar_p, ctypes.c_size_t]
libc.wcswidth.restype = ctypes.c_int
def _text_width(text, offset=0):
return libc.wcswidth(text[offset:], len(text) * 10)
# shenanigans so sphinx doesn't try to document these
setattr(sys.modules[__name__], "text_width", _text_width)
elif options.measurement == "wcswidth-py":
import wcwidth
def _text_width(text, offset=0):
return wcwidth.wcswidth(text[offset:])
setattr(sys.modules[__name__], "text_width", _text_width)
width = options.width
width = width - text_width(options.start) - text_width(options.end)
text = options.text_file.read()
if options.guess_paragraphs:
text = guess_paragraphs(text)
if options.use_stdlib:
import textwrap
for line in textwrap.wrap(
text,
width,
tabsize=options.tabsize,
drop_whitespace=options.combine_space,
replace_whitespace=False,
):
for line in line.splitlines():
padding = max(0, width - text_width(line))
print(f"{options.start}{line}{' ' * padding}{options.end}")
else:
for line in text_wrap(
text,
width,
tabsize=options.tabsize,
hyphen=options.hyphen,
combine_space=options.combine_space,
invalid=options.invalid,
):
print(f"{options.start}{line}{options.end}")
elif options.function == "breaktest":
import difflib
# stop debug interpreter whining about file not being closed
atexit.register(lambda: options.file.close())
next_break_func = globals()[f"{ options.test }_next_break"]
# ::TODO:: add option that inserts LB_CM/ZWJ chars after every
# codepoint (except BK/CR/NL etc) to verify LB9 is always done
ok = "÷"
not_ok = "\u00d7"
passed: int = 0
fails: list[str] = []
for line_num, line in enumerate(options.file, 1):
orig_line = line
if not line.strip() or line.startswith("#"):
continue
line = line.split("#")[0].strip().split()
if options.verbose:
print(f"{ line_num }: { orig_line.rstrip() }")
expect = not_ok if options.test == "line_break" else ok
assert line[0] == expect, f"Line { line_num } doesn't start with { expect }!"
assert line[-1] == ok, f"Line { line_num } doesn't end with { ok }!"
line = line[1:]
text = ""
breaks: list[int] = []
while line:
c = line.pop(0)
if c == not_ok:
continue
if c == ok:
breaks.append(len(text))
continue
text += chr(int(c, 16))
def add_failinfo():
fails.append(orig_line.strip())
codepoints = []
for counter, c in enumerate(text):
codepoints.append(codepoint_details(options.test, c, counter))
fails.append(options.fail_codepoints_separator.join(codepoints))
fails.append("")
offset = 0
seen: list[int] = []
lf = len(fails)
while offset < len(text):
span = next_break_func(text, offset)
if span not in breaks:
fails.append(
f"Line { line_num } got unexpected break at { span } - expected are { breaks }. Seen { seen }"
)
add_failinfo()
break
seen.append(span)
offset = span
if options.fail_fast and fails:
break
if len(fails) != lf:
continue
if set(seen) != set(breaks):
fails.append(f"Line { line_num } got breaks at { seen } expected at { breaks }")
if max(len(seen), len(breaks)) > 5:
# use difflib to show difference
sm = difflib.SequenceMatcher(a=seen, b=breaks)
for tag, a1, a2, b1, b2 in sm.get_opcodes():
if tag == "equal":
continue
if a1 != a2:
fails[-1] += f"\n seen {tag} {seen[a1:a2]}"
if b1 != b2:
fails[-1] += f"\n expected {tag} {breaks[b1:b2]}"
add_failinfo()
if options.fail_fast and fails:
break
passed += 1
if fails:
print(f"{ len(fails)//4 } tests failed, {passed:,} passed:", file=sys.stderr)
for fail in fails:
print(fail, file=sys.stderr)
sys.exit(2)
else:
print(f"{passed:,} passed")
elif options.function == "codepoint":
codepoints = []
for t in options.text:
try:
codepoints.append(int(t, 16))
except ValueError:
codepoints.extend(ord(c) for c in t)
def uniname(cp):
return str(codepoint_name(cp))
def deets(cp):
cat = category(cp)
return f"{ uniname(cp) } { cat }: { apsw.fts5.unicode_categories[cat] }"
for i, cp in enumerate(codepoints):
print(f"#{ i } U+{ cp:04X} - ", end="")
try:
print(chr(cp))
except UnicodeEncodeError:
print()
added = version_added(cp)
year = f"({version_dates[added][0]})" if added is not None else ""
print(f"Name: { deets(cp) } Version: { added } { year }")
mangled = []
for mangle in casefold(chr(cp)), strip(chr(cp)):
if not mangle:
mangled.append("(nothing)")
else:
mangled.append(", ".join(f"U+{ ord(v):04X} {uniname(ord(v))}" for v in mangle))
print(f"casefold: { mangled[0] } stripped: { mangled[1] }")
print(
f"Width: { text_width(chr(cp)) } "
f"TR29 grapheme: { ' | '.join(_unicode.category_name('grapheme', cp)) } "
f"word: { ' | '.join(_unicode.category_name('word', cp )) } "
f"sentence: { ' | '.join(_unicode.category_name('sentence', cp)) } "
f"TR14 line break: { ' | '.join(_unicode.category_name('line_break', cp)) }"
)
print()
elif options.function == "benchmark":
import random
import time
random.seed(options.seed)
base_text = options.text_file.read()
text = base_text
# these are the non-ascii codepoints used in the various break tests
interesting = "".join(
chr(int(x, 16))
for x in """0085 00A0 00AD 01BB 0300 0308 034F 0378 05D0 0600 062D 0631
0644 0645 0646 064A 064E 0650 0651 0661 0671 06DD 070F 0710 0712 0717
0718 0719 071D 0721 072A 072B 072C 0900 0903 0904 0915 0924 092F 093C
094D 0A03 0D4E 1100 1160 11A8 200D 2018 2019 201C 201D 2060 231A 2701
3002 3031 5B57 5B83 AC00 AC01 1F1E6 1F1E7 1F1E8 1F1E9 1F3FF 1F476
1F6D1""".split()
)
        # make interesting be 0.1% of base text
        base_text += interesting * int(len(base_text) * 0.001 / len(interesting))
tests: list[Any] = [
(
"apsw.unicode",
unicode_version,
(
("grapheme", grapheme_iter),
("word", word_iter),
("sentence", sentence_iter),
("line", line_break_iter),
),
)
]
if options.others:
if options.others == "all":
ok = []
try:
import uniseg
ok.append("uniseg")
except ImportError:
pass
try:
import grapheme
ok.append("grapheme")
except ImportError:
pass
try:
import icu
ok.append("pyicu")
except ImportError:
pass
if ok:
options.others = ",".join(ok)
else:
options.others = None
if options.others:
for package in options.others.split(","):
package = package.strip()
if package == "grapheme":
import grapheme
import grapheme.finder
tests.append(
("grapheme", grapheme.UNICODE_VERSION, (("grapheme", grapheme.finder.GraphemeIterator),))
)
elif package == "uniseg":
import uniseg
import uniseg.graphemecluster
import uniseg.wordbreak
import uniseg.sentencebreak
import uniseg.linebreak
                # note that uniseg word breaking doesn't determine
                # which segments are words, so you just get all
                # segments
tests.append(
(
"uniseg",
uniseg.unidata_version,
(
("grapheme", uniseg.graphemecluster.grapheme_clusters),
("word", uniseg.wordbreak.words),
("sentence", uniseg.sentencebreak.sentences),
("line", uniseg.linebreak.line_break_units),
),
)
)
elif package == "pyicu":
import icu
import functools
                # The API only returns break points, so make it match
                # the others. It also uses its own UTF-16 based
                # strings so there is some conversion overhead
def icu_iterate(kind, text):
icu_it = getattr(icu.BreakIterator, f"create{kind}Instance")(icu.Locale.getEnglish())
icu_str = icu.UnicodeString(text)
icu_it.setText(icu_str)
offset = 0
for pos in icu_it:
yield str(icu_str[offset:pos])
offset = pos
tests.append(
(
"pyicu",
icu.UNICODE_VERSION,
(
("grapheme", functools.partial(icu_iterate, "Character")),
("word", functools.partial(icu_iterate, "Word")),
("sentence", functools.partial(icu_iterate, "Sentence")),
("line", functools.partial(icu_iterate, "Line")),
),
)
)
else:
sys.exit(f"Unknown third party package to benchmark '{package}'")
print(f"Expanding text to { options.size } million chars ...", end="", flush=True)
while len(text) < options.size * 1_000_000:
text += "".join(random.sample(base_text, len(base_text)))
text = text[: int(options.size * 1_000_000)]
print("\nResults in codepoints per second processed, returning each segment. Higher is faster.")
for name, version, parts in tests:
print(f"\nBenchmarking {name:20s} unicode version { version }")
for kind, func in parts:
print(f"{kind:>8}", end=" ", flush=True)
count = 0
offset = 0
start = time.process_time_ns()
exc = None
try:
for _ in func(text):
count += 1
except Exception as exc2:
exc = exc2
end = time.process_time_ns()
if exc is not None:
print(f" EXCEPTION {exc!r}")
else:
seconds = (end - start) / 1e9
print(f"codepoints per second: { int(len(text)/seconds): 12,d} segments: {count: 11,d}")
elif options.function == "casefold":
options.output.write(casefold(options.input.read()))
elif options.function == "strip":
options.output.write(strip(options.input.read()))
elif options.function == "breaktestgen":
# char used to mark ok and not in the files
ok = "÷"
not_ok = "\u00d7"
def get_strings(fh):
for line in fh:
if not line.strip() or line.startswith("#"):
continue
line = line.split("#")[0].strip().split()
line.pop(0) # remove initial marker
line.pop(-1) # and final
text = ""
while line:
c = line.pop(0)
if c == not_ok:
continue
elif c == ok:
text += c
else:
text += chr(int(c, 16))
assert text[-1] != "÷"
yield text
def fmt(text):
res = ""
for c in text:
if category(c) in {"Lu", "Ll", "Nd", "Nl", "Pd", "Sm", "Sc", "So", "Zs"}:
res += c
else:
c = ord(c)
if c <= 0xFFFF:
res += f"\\u{c:04X}"
else:
res += f"\\U{c:08X}"
return '"' + res + '"'
for name in ("grapheme", "word", "sentence", "line_break"):
lines = list(get_strings(getattr(options, name)))
lines.sort(key=lambda l: len(l))
print(f'"{name}":')
print("(")
        # we always take the shortest and longest
print(fmt(lines.pop(0)), ",")
print(fmt(lines.pop(-1)), ",")
# and 20 of the rest
for offset in range(len(lines) // 20, len(lines), len(lines) // 20):
print(fmt(lines[offset]), ",")
print("),")
elif options.function == "widthcheck":
import wcwidth # pip install wcwidth
if sys.platform != "win32":
import atexit
import ctypes, ctypes.util
libc = ctypes.cdll.LoadLibrary(ctypes.util.find_library("c"))
libc.wcswidth.argtypes = [ctypes.c_wchar_p, ctypes.c_size_t]
libc.wcswidth.restype = ctypes.c_int
tty_in = open("/dev/tty", "r")
tty_out = open("/dev/tty", "w")
import tty
import termios
term_mode = termios.tcgetattr(tty_in)
def finish():
termios.tcsetattr(tty_in, termios.TCSAFLUSH, term_mode)
print("", flush=True, file=tty_out)
atexit.register(finish)
tty.setraw(tty_in)
else:
import ctypes, msvcrt
kernel32 = ctypes.windll.kernel32
# we need console handle (stdout is often redirected)
h_tty_out = kernel32.CreateFileW("CONOUT$", 0x80000000 | 0x40000000, 0x00000001|0x00000002, None, 3, 0, None)
assert h_tty_out != -1
# Convince it raw bytes are utf8
res = kernel32.SetConsoleCP(65001) # CP_UTF8
assert res # zero means failure
res = kernel32.SetConsoleOutputCP(65001) # CP_UTF8
assert res # zero means failure
# enable ansi processing
res = kernel32.SetConsoleMode(h_tty_out, 5)
assert res # zero means failure
# fake i/o interfaces using classes as namespace, not instances
class tty_out:
def write(data):
data = data.encode("utf8")
res = kernel32.WriteFile(h_tty_out, data, len(data), None, None)
assert res # zero means failure
def flush():
# we do no buffering so flush is a no-op
pass
class tty_in:
def read(how_much):
res = ""
while len(res) < how_much:
res += msvcrt.getwch()
return res
# fake out wcwidth C API
class libc:
def wcswidth(s, n):
# give same value as apsw.unicode
return text_width(s)
def finish():
# utf8 etc above are process local so we don't need to
# reset the terminal
pass
def get_pos():
print("\x1b[6n", flush=True, file=tty_out, end="")
x = tty_in.read(2)
assert x == "\x1b[" # something else was typed
r = ""
while True:
c = tty_in.read(1)
if c == "R":
break
r += c
return list(int(part) for part in r.split(";"))
def set_pos(pos):
print(f"\x1b[{pos[0]};{pos[1]}H", flush=True, file=tty_out, end="")
print("\r\n", flush=True, file=tty_out)
errors = []
start_pos = get_pos()
print(f"{0:06X} -> ", flush=True, end="", file=tty_out)
out_pos = get_pos()
for cp in range(0, sys.maxunicode + 1):
# surrogates can't be output
if 0xD800 <= cp <= 0xDFFF:
continue
set_pos(start_pos)
print(f"{cp:06X} -> ", flush=True, end="", file=tty_out)
set_pos(out_pos)
text = "a" + chr(cp) + "b"
if cp == 0 or (text_width(text) < 0 and libc.wcswidth(text, 1000) < 0 and wcwidth.wcswidth(text) < 0):
continue
print(text, end="", flush=True, file=tty_out)
new_pos = get_pos()
width = new_pos[1] - out_pos[1] - 2 if new_pos[0] == out_pos[0] else -1
if width != text_width(chr(cp)):
errors.append([cp, width])
finish()
if errors:
import csv
w = csv.writer(sys.stdout)
w.writerow(
[
"codepoint",
"hex",
"width",
"text_width",
"wcswidth_c",
"wcswidth_py",
"name",
"version_added",
"category",
]
)
for row in errors:
cp = row[0]
w.writerow(
[
cp,
f"{cp:04X}",
row[1],
text_width(chr(cp)),
libc.wcswidth(chr(cp), 1000),
wcwidth.wcswidth(chr(cp)),
str(codepoint_name(cp)),
version_added(cp),
category(cp),
]
)