diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6cc0bfb..b92eb48 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,8 @@
 
 ## 0.4.0
 + Added `ValueProperty`, abstract superclass for `ConfigProperty`
-+ Changed the behavior of `makelist()`: now it can also decorate a callable, converting its return type to a list
++ \[BREAKING] Changed the behavior of `makelist()`: it is now also a decorator that converts the wrapped callable's return value to a list (pass `wrap=False` to get the previous behavior)
++ New module `lex` with functions `symbol_table()` and `lex()` to make tokenization easier
 + Added `addattr()`
 
 ## 0.3.6
diff --git a/src/suou/__init__.py b/src/suou/__init__.py
index 9d12a76..94d793b 100644
--- a/src/suou/__init__.py
+++ b/src/suou/__init__.py
@@ -27,13 +27,16 @@
 from .itertools import makelist, kwargs_prefix, ltuple, rtuple, additem
 from .i18n import I18n, JsonI18n, TomlI18n
 from .snowflake import Snowflake, SnowflakeGen
-__version__ = "0.4.0-dev27"
+__version__ = "0.4.0-dev28"
 
 __all__ = (
-    'Siq', 'SiqCache', 'SiqType', 'SiqGen', 'StringCase',
-    'MissingConfigError', 'MissingConfigWarning', 'ConfigOptions', 'ConfigParserConfigSource', 'ConfigSource', 'ConfigValue', 'EnvConfigSource', 'DictConfigSource',
-    'deprecated', 'not_implemented', 'Wanted', 'Incomplete', 'jsonencode', 'ltuple', 'rtuple',
-    'makelist', 'kwargs_prefix', 'I18n', 'JsonI18n', 'TomlI18n', 'cb32encode', 'cb32decode', 'count_ones', 'mask_shift',
-    'want_bytes', 'want_str', 'version', 'b2048encode', 'split_bits', 'join_bits', 'b2048decode',
-    'Snowflake', 'SnowflakeGen', 'ssv_list', 'additem', 'b32lencode', 'b32ldecode', 'b64encode', 'b64decode'
+    'ConfigOptions', 'ConfigParserConfigSource', 'ConfigSource', 'ConfigValue',
+    'DictConfigSource', 'EnvConfigSource', 'I18n', 'Incomplete', 'JsonI18n',
+    'MissingConfigError', 'MissingConfigWarning', 'Siq', 'SiqCache', 'SiqGen',
+    'SiqType', 'Snowflake', 'SnowflakeGen', 'StringCase', 'TomlI18n', 'Wanted',
+    'additem', 'b2048decode', 'b2048encode', 'b32ldecode', 'b32lencode',
+    'b64encode', 'b64decode', 'cb32encode', 'cb32decode', 'count_ones',
+    'deprecated', 'join_bits', 'jsonencode', 'kwargs_prefix', 'ltuple',
+    'makelist', 'mask_shift', 'not_implemented', 'rtuple', 'split_bits',
+    'ssv_list', 'want_bytes', 'want_str'
 )
diff --git a/src/suou/exceptions.py b/src/suou/exceptions.py
index bc71037..e6382c0 100644
--- a/src/suou/exceptions.py
+++ b/src/suou/exceptions.py
@@ -14,7 +14,7 @@
 This software is distributed on an "AS IS" BASIS, WITHOUT
 WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 """
-
+from .functools import deprecated
 
 class MissingConfigError(LookupError):
     """
@@ -30,4 +30,15 @@ class MissingConfigWarning(MissingConfigError, Warning):
     """
     A required config property is missing, and the application is assuming a default value.
     """
-    pass
\ No newline at end of file
+    pass
+
+
+class LexError(SyntaxError):
+    """
+    Illegal character or sequence found in the token stream.
+    """
+
+class InconsistencyError(RuntimeError):
+    """
+    The program has reached a state it is never supposed to be in.
+    """
diff --git a/src/suou/lex.py b/src/suou/lex.py
new file mode 100644
index 0000000..086023f
--- /dev/null
+++ b/src/suou/lex.py
@@ -0,0 +1,84 @@
+"""
+Utilities for tokenization of text.
+
+---
+"""
+
+import re
+from dataclasses import dataclass
+from re import Match
+from typing import Any, Callable, Iterable
+
+from .exceptions import InconsistencyError, LexError
+from .itertools import makelist
+
+
+@dataclass
+class TokenSym:
+    pattern: str
+    label: str
+    cast: Callable[[str], Any] | None = None
+    discard: bool = False
+
+    # convenience methods below
+    def match(self, s: str, index: int = 0) -> Match[str] | None:
+        return re.compile(self.pattern).match(s, index)
+
+
+@makelist
+def symbol_table(*args: tuple | TokenSym, whitespace: str | None = None):
+    """
+    Make a symbol table from a list of tuples (or ready-made TokenSym objects).
+
+    Tuples are in the form (pattern, label[, cast]) where:
+    - [] means optional
+    - pattern is a regular expression (r-string syntax advised)
+    - label is a constant string
+    - cast is a function applied to the matched text
+
+    Need to strip whitespace? Pass the whitespace= keyword parameter.
+    """
+    for arg in args:
+        if isinstance(arg, TokenSym):
+            pass
+        elif isinstance(arg, tuple):
+            arg = TokenSym(*arg)
+        else:
+            raise TypeError(f'invalid type {arg.__class__.__name__!r}')
+        yield arg
+    if whitespace:
+        # discard runs of the given whitespace characters
+        yield TokenSym('[' + re.escape(whitespace) + ']+', '', discard=True)
+
+
+def ilex(text: str, table: Iterable[TokenSym], *, whitespace: bool = False):
+    """
+    Tokenize a text into (label, value) pairs, given a token table (iterable of TokenSym).
+
+    ilex() returns a generator; lex() returns a list.
+
+    table should be the result of symbol_table().
+    """
+    i = 0
+    while i < len(text):
+        mo = None
+        for sym in table:
+            if mo := sym.match(text, i):
+                if not sym.discard:
+                    mtext = mo.group(0)
+                    if callable(sym.cast):
+                        mtext = sym.cast(mtext)
+                    yield (sym.label, mtext)
+                elif whitespace:
+                    # surface discarded (whitespace) tokens on request
+                    yield (None, mo.group(0))
+                break
+        if mo is None:
+            raise LexError(f'illegal character near {text[i:i+5]!r}')
+        if i == mo.end(0):
+            # a zero-width match would loop forever
+            raise InconsistencyError
+        i = mo.end(0)
+
+
+lex = makelist(ilex)
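
For reference, a minimal usage sketch of the new `suou.lex` API added above (not part of the diff; the token table and input below are illustrative):

```python
# Illustrative example only: a tiny arithmetic lexer built with symbol_table()/lex().
from suou.lex import symbol_table, lex

# (pattern, label[, cast]) tuples; whitespace= appends a discarded whitespace token.
table = symbol_table(
    (r'\d+', 'NUMBER', int),
    (r'[+*/-]', 'OP'),
    (r'[A-Za-z_]\w*', 'NAME'),
    whitespace=' \t\n',
)

print(lex('price * 2 + 10', table))
# expected: [('NAME', 'price'), ('OP', '*'), ('NUMBER', 2), ('OP', '+'), ('NUMBER', 10)]
```

An unmatched character raises `LexError`; whitespace between tokens is dropped unless `ilex(..., whitespace=True)` is used, in which case it is yielded with a `None` label.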