new module lex

Yusur 2025-07-17 19:45:43 +02:00
parent 4a2e8d3343
commit ee36616b43
4 changed files with 109 additions and 10 deletions


@@ -3,7 +3,8 @@
 ## 0.4.0
 + Added `ValueProperty`, abstract superclass for `ConfigProperty`
-+ Changed the behavior of `makelist()`: now it can also decorate a callable, converting its return type to a list
++ \[BREAKING] Changed the behavior of `makelist()`: now it's also a decorator, converting its return type to a list (revertable with `wrap=False`)
++ New module `lex` with functions `symbol_table()` and `lex()` — make tokenization more affordable
 + Added `addattr()`
 ## 0.3.6
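
For context on the `makelist()` change noted above: used as a decorator it wraps a callable (typically a generator function) so that calling it returns a list, which is how the new `lex` module uses it further down. A minimal sketch under that assumption; `naturals_up_to()` is a made-up example function, not part of the library:

from suou.itertools import makelist

@makelist
def naturals_up_to(n: int):
    # a generator function; the decorator materializes its output into a list
    yield from range(n)

print(naturals_up_to(3))   # expected: [0, 1, 2]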


@@ -27,13 +27,16 @@ from .itertools import makelist, kwargs_prefix, ltuple, rtuple, additem
 from .i18n import I18n, JsonI18n, TomlI18n
 from .snowflake import Snowflake, SnowflakeGen
-__version__ = "0.4.0-dev27"
+__version__ = "0.4.0-dev28"
 __all__ = (
-    'Siq', 'SiqCache', 'SiqType', 'SiqGen', 'StringCase',
-    'MissingConfigError', 'MissingConfigWarning', 'ConfigOptions', 'ConfigParserConfigSource', 'ConfigSource', 'ConfigValue', 'EnvConfigSource', 'DictConfigSource',
-    'deprecated', 'not_implemented', 'Wanted', 'Incomplete', 'jsonencode', 'ltuple', 'rtuple',
-    'makelist', 'kwargs_prefix', 'I18n', 'JsonI18n', 'TomlI18n', 'cb32encode', 'cb32decode', 'count_ones', 'mask_shift',
-    'want_bytes', 'want_str', 'version', 'b2048encode', 'split_bits', 'join_bits', 'b2048decode',
-    'Snowflake', 'SnowflakeGen', 'ssv_list', 'additem', 'b32lencode', 'b32ldecode', 'b64encode', 'b64decode'
+    'ConfigOptions', 'ConfigParserConfigSource', 'ConfigSource', 'ConfigValue',
+    'DictConfigSource', 'EnvConfigSource', 'I18n', 'Incomplete', 'JsonI18n',
+    'MissingConfigError', 'MissingConfigWarning', 'Siq', 'SiqCache', 'SiqGen',
+    'SiqType', 'Snowflake', 'SnowflakeGen', 'StringCase', 'TomlI18n', 'Wanted',
+    'additem', 'b2048decode', 'b2048encode', 'b32ldecode', 'b32lencode',
+    'b64encode', 'b64decode', 'cb32encode', 'cb32decode', 'count_ones',
+    'deprecated', 'join_bits', 'jsonencode', 'kwargs_prefix', 'ltuple',
+    'makelist', 'mask_shift', 'not_implemented', 'rtuple', 'split_bits',
+    'ssv_list', 'want_bytes', 'want_str'
 )


@@ -14,7 +14,7 @@ This software is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 """
-from .functools import deprecated
 
 class MissingConfigError(LookupError):
     """
@@ -30,4 +30,15 @@ class MissingConfigWarning(MissingConfigError, Warning):
     """
     A required config property is missing, and the application is assuming a default value.
     """
     pass
+
+
+class LexError(SyntaxError):
+    """
+    Illegal character or sequence found in the token stream.
+    """
+
+class InconsistencyError(RuntimeError):
+    """
+    This program is in a state which it's not supposed to be in.
+    """

src/suou/lex.py (new file, 84 lines)

@@ -0,0 +1,84 @@
"""
Utilities for tokenization of text.
---
"""
from re import Match
from dataclasses import dataclass
import re
from typing import Any, Callable, Iterable
from .exceptions import InconsistencyError, LexError
from .itertools import makelist
@dataclass
class TokenSym:
pattern: str
label: str
cast: Callable[[str], Any] | None = None
discard: bool = False
# convenience methods below
def match(self, s: str, index: int = 0) -> Match[str] | None:
return re.compile(self.pattern, 0).match(s, index)
@makelist
def symbol_table(*args: Iterable[tuple | TokenSym], whitespace: str | None = None):
"""
Make a symbol table from a list of tuples.
Tokens are in form (pattern, label[, cast]) where:
- [] means optional
- pattern is a regular expression (r-string syntax advised)
- label is a constant string
- cast is a function
Need to strip whitespace? Pass the whitespace= keyword parameter.
"""
for arg in args:
if isinstance(arg, TokenSym):
pass
elif isinstance(arg, tuple):
arg = TokenSym(*arg)
else:
raise TypeError(f'invalid type {arg.__class__.__name__!r}')
yield arg
if whitespace:
yield TokenSym('[' + re.escape(whitespace) + ']+', '', discard=True)
def ilex(text: str, table: Iterable[TokenSym], *, whitespace = False):
"""
Return a text as a list of tokens, given a token table (iterable of TokenSym).
ilex() returns a generator; lex() returns a list.
table must be a result from symbol_table().
"""
i = 0
while i < len(text):
mo = None
for sym in table:
if mo := re.compile(sym.pattern).match(text, i):
if not sym.discard:
mtext = mo.group(0)
if callable(sym.cast):
mtext = sym.cast(mtext)
yield (sym.label, mtext)
elif whitespace:
yield (None, mo.group(0))
break
if mo is None:
raise LexError(f'illegal character near {text[i:i+5]!r}')
if i == mo.end(0):
raise InconsistencyError
i = mo.end(0)
lex = makelist(ilex)
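
To illustrate the `symbol_table()` format documented in its docstring, here is a hedged usage sketch. The token labels, patterns, and the `TABLE` name are illustrative; only `symbol_table()` and the `(pattern, label[, cast])` tuple form come from the module above:

from suou.lex import symbol_table

# a small table for arithmetic-style input; the third tuple element
# is the optional cast applied to the matched text
TABLE = symbol_table(
    (r'\d+', 'NUMBER', int),
    (r'[+*()-]', 'OP'),
    (r'[A-Za-z_]\w*', 'NAME'),
    whitespace=' \t\n',   # strip blanks between tokens
)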
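
Continuing the sketch, `lex()` turns a string into `(label, value)` pairs using that table and raises the `LexError` added to the exceptions module above when no pattern matches. Input and expected output are illustrative:

from suou.lex import lex
from suou.exceptions import LexError

tokens = lex('12 + 30*4', TABLE)
print(tokens)
# expected with the table above:
# [('NUMBER', 12), ('OP', '+'), ('NUMBER', 30), ('OP', '*'), ('NUMBER', 4)]

try:
    lex('12 € 3', TABLE)
except LexError as exc:
    # no pattern matches '€', so the lexer reports the offending position
    print('lexing failed:', exc)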