new module lex

This commit is contained in:
Yusur 2025-07-17 19:45:43 +02:00
parent 4a2e8d3343
commit ee36616b43
4 changed files with 109 additions and 10 deletions

View file

@ -3,7 +3,8 @@
## 0.4.0
+ Added `ValueProperty`, abstract superclass for `ConfigProperty`
+ Changed the behavior of `makelist()`: now it can also decorate a callable, converting its return type to a list
+ \[BREAKING] Changed the behavior of `makelist()`: it now also works as a decorator, converting the decorated callable's return value to a list (revertable with `wrap=False`)
+ New module `lex` with functions `symbol_table()` and `lex()`, to make tokenization easier
+ Added `addattr()`
## 0.3.6

View file

@ -27,13 +27,16 @@ from .itertools import makelist, kwargs_prefix, ltuple, rtuple, additem
from .i18n import I18n, JsonI18n, TomlI18n
from .snowflake import Snowflake, SnowflakeGen
__version__ = "0.4.0-dev27"
__version__ = "0.4.0-dev28"
__all__ = (
'Siq', 'SiqCache', 'SiqType', 'SiqGen', 'StringCase',
'MissingConfigError', 'MissingConfigWarning', 'ConfigOptions', 'ConfigParserConfigSource', 'ConfigSource', 'ConfigValue', 'EnvConfigSource', 'DictConfigSource',
'deprecated', 'not_implemented', 'Wanted', 'Incomplete', 'jsonencode', 'ltuple', 'rtuple',
'makelist', 'kwargs_prefix', 'I18n', 'JsonI18n', 'TomlI18n', 'cb32encode', 'cb32decode', 'count_ones', 'mask_shift',
'want_bytes', 'want_str', 'version', 'b2048encode', 'split_bits', 'join_bits', 'b2048decode',
'Snowflake', 'SnowflakeGen', 'ssv_list', 'additem', 'b32lencode', 'b32ldecode', 'b64encode', 'b64decode'
'ConfigOptions', 'ConfigParserConfigSource', 'ConfigSource', 'ConfigValue',
'DictConfigSource', 'EnvConfigSource', 'I18n', 'Incomplete', 'JsonI18n',
'MissingConfigError', 'MissingConfigWarning', 'Siq', 'SiqCache', 'SiqGen',
'SiqType', 'Snowflake', 'SnowflakeGen', 'StringCase', 'TomlI18n', 'Wanted',
'additem', 'b2048decode', 'b2048encode', 'b32ldecode', 'b32lencode',
'b64encode', 'b64decode', 'cb32encode', 'cb32decode', 'count_ones',
'deprecated', 'join_bits', 'jsonencode', 'kwargs_prefix', 'ltuple',
'makelist', 'mask_shift', 'not_implemented', 'rtuple', 'split_bits',
'ssv_list', 'want_bytes', 'want_str'
)

View file

@ -14,7 +14,7 @@ This software is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
"""
from .functools import deprecated
class MissingConfigError(LookupError):
"""
@ -31,3 +31,14 @@ class MissingConfigWarning(MissingConfigError, Warning):
A required config property is missing, and the application is assuming a default value.
"""
pass
class LexError(SyntaxError):
    """An illegal character or sequence was found in the token stream."""
class InconsistencyError(RuntimeError):
    """The program has reached a state it is not supposed to be in."""

84
src/suou/lex.py Normal file
View file

@ -0,0 +1,84 @@
"""
Utilities for tokenization of text.
---
"""
from re import Match
from dataclasses import dataclass
import re
from typing import Any, Callable, Iterable
from .exceptions import InconsistencyError, LexError
from .itertools import makelist
@dataclass
class TokenSym:
pattern: str
label: str
cast: Callable[[str], Any] | None = None
discard: bool = False
# convenience methods below
def match(self, s: str, index: int = 0) -> Match[str] | None:
return re.compile(self.pattern, 0).match(s, index)
@makelist
def symbol_table(*args: tuple | TokenSym, whitespace: str | None = None):
    """
    Make a symbol table from tuples and/or TokenSym instances.

    Each positional argument is either a TokenSym, kept as is, or a tuple
    that is unpacked into the TokenSym constructor.
    Tuples are in form (pattern, label[, cast]) where:
    - [] means optional
    - pattern is a regular expression (r-string syntax advised)
    - label is a constant string
    - cast is a function

    Need to strip whitespace? Pass the whitespace= keyword parameter: a string
    holding the characters to treat as whitespace. When it is non-empty, one
    extra symbol is appended after all the others; it matches one or more of
    those characters, has an empty label and is flagged as discarded.

    Symbols are produced in argument order.
    NOTE(review): the @makelist decorator is expected to collect them into a
    list — confirm against suou.itertools.

    Raises TypeError when a positional argument is neither a tuple nor a
    TokenSym.
    """
    for arg in args:
        if isinstance(arg, tuple):
            arg = TokenSym(*arg)
        elif not isinstance(arg, TokenSym):
            raise TypeError(f'invalid type {arg.__class__.__name__!r}')
        yield arg
    if whitespace:
        # The characters are escaped so that they are taken literally inside
        # the character class.
        yield TokenSym('[' + re.escape(whitespace) + ']+', '', discard=True)
def ilex(text: str, table: "Iterable[TokenSym]", *, whitespace: bool = False):
    """
    Lazily split a text into tokens, given a token table (iterable of TokenSym).

    ilex() returns a generator; lex() returns a list.

    table is normally a result from symbol_table(). It is snapshotted and its
    patterns are compiled once, when the generator is first advanced, so
    one-shot iterables are accepted as well.

    At every position the symbols are tried in table order and the first one
    whose pattern matches wins. Each match is yielded as a (label, value)
    tuple, where value is the matched text, passed through the symbol's cast
    when that is callable. Matches of symbols flagged as discard are skipped,
    unless whitespace is true: then they are yielded as (None, matched_text).

    Raises LexError when no symbol matches at the current position.
    Raises InconsistencyError when a symbol matches the empty string, since
    the scan could not advance.
    """
    # The table is scanned again for every token: materialize it so that
    # generators are not exhausted after the first token, and compile each
    # pattern only once instead of once per attempt.
    compiled = [(re.compile(sym.pattern), sym) for sym in table]
    pos = 0
    end = len(text)
    while pos < end:
        for regex, sym in compiled:
            mo = regex.match(text, pos)
            if mo is not None:
                break
        else:
            # no symbol accepted the text at this position
            raise LexError(f'illegal character near {text[pos:pos+5]!r}')
        mtext = mo.group(0)
        if not sym.discard:
            if callable(sym.cast):
                mtext = sym.cast(mtext)
            yield (sym.label, mtext)
        elif whitespace:
            yield (None, mtext)
        if mo.end(0) == pos:
            # a zero-length match would loop forever on the same position
            raise InconsistencyError(
                f'pattern {sym.pattern!r} matched an empty string at position {pos}'
            )
        pos = mo.end(0)
# List-returning counterpart of ilex(), obtained by wrapping the generator
# function with makelist().
# NOTE(review): presumably makelist() collects the yielded tokens into a list,
# as the ilex() docstring states — confirm against suou.itertools.
lex = makelist(ilex)