new module lex
This commit is contained in:
parent
4a2e8d3343
commit
ee36616b43
4 changed files with 109 additions and 10 deletions
|
|
@ -3,7 +3,8 @@
|
|||
## 0.4.0
|
||||
|
||||
+ Added `ValueProperty`, abstract superclass for `ConfigProperty`
|
||||
+ Changed the behavior of `makelist()`: now it can also decorate a callable, converting its return type to a list
|
||||
+ \[BREAKING] Changed the behavior of `makelist()`: now it's also a decorator, converting its return type to a list (revertable with `wrap=False`)
|
||||
+ New module `lex` with functions `symbol_table()` and `lex()` — makes tokenization more affordable
|
||||
+ Added `addattr()`
|
||||
|
||||
## 0.3.6
|
||||
|
|
|
|||
|
|
@ -27,13 +27,16 @@ from .itertools import makelist, kwargs_prefix, ltuple, rtuple, additem
|
|||
from .i18n import I18n, JsonI18n, TomlI18n
|
||||
from .snowflake import Snowflake, SnowflakeGen
|
||||
|
||||
__version__ = "0.4.0-dev27"
|
||||
__version__ = "0.4.0-dev28"
|
||||
|
||||
__all__ = (
|
||||
'Siq', 'SiqCache', 'SiqType', 'SiqGen', 'StringCase',
|
||||
'MissingConfigError', 'MissingConfigWarning', 'ConfigOptions', 'ConfigParserConfigSource', 'ConfigSource', 'ConfigValue', 'EnvConfigSource', 'DictConfigSource',
|
||||
'deprecated', 'not_implemented', 'Wanted', 'Incomplete', 'jsonencode', 'ltuple', 'rtuple',
|
||||
'makelist', 'kwargs_prefix', 'I18n', 'JsonI18n', 'TomlI18n', 'cb32encode', 'cb32decode', 'count_ones', 'mask_shift',
|
||||
'want_bytes', 'want_str', 'version', 'b2048encode', 'split_bits', 'join_bits', 'b2048decode',
|
||||
'Snowflake', 'SnowflakeGen', 'ssv_list', 'additem', 'b32lencode', 'b32ldecode', 'b64encode', 'b64decode'
|
||||
'ConfigOptions', 'ConfigParserConfigSource', 'ConfigSource', 'ConfigValue',
|
||||
'DictConfigSource', 'EnvConfigSource', 'I18n', 'Incomplete', 'JsonI18n',
|
||||
'MissingConfigError', 'MissingConfigWarning', 'Siq', 'SiqCache', 'SiqGen',
|
||||
'SiqType', 'Snowflake', 'SnowflakeGen', 'StringCase', 'TomlI18n', 'Wanted',
|
||||
'additem', 'b2048decode', 'b2048encode', 'b32ldecode', 'b32lencode',
|
||||
'b64encode', 'b64decode', 'cb32encode', 'cb32decode', 'count_ones',
|
||||
'deprecated', 'join_bits', 'jsonencode', 'kwargs_prefix', 'ltuple',
|
||||
'makelist', 'mask_shift', 'not_implemented', 'rtuple', 'split_bits',
|
||||
'ssv_list', 'want_bytes', 'want_str'
|
||||
)
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ This software is distributed on an "AS IS" BASIS,
|
|||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
"""
|
||||
|
||||
|
||||
from .functools import deprecated
|
||||
|
||||
class MissingConfigError(LookupError):
|
||||
"""
|
||||
|
|
@ -31,3 +31,14 @@ class MissingConfigWarning(MissingConfigError, Warning):
|
|||
A required config property is missing, and the application is assuming a default value.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class LexError(SyntaxError):
    """
    Raised when the token stream contains a character or sequence
    that is not legal.
    """
|
||||
|
||||
class InconsistencyError(RuntimeError):
    """
    Raised when the program finds itself in a state it is never
    supposed to reach.
    """
|
||||
|
|
|
|||
84
src/suou/lex.py
Normal file
84
src/suou/lex.py
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
"""
|
||||
Utilities for tokenization of text.
|
||||
|
||||
---
|
||||
"""
|
||||
|
||||
from re import Match
|
||||
|
||||
|
||||
from dataclasses import dataclass
|
||||
import re
|
||||
from typing import Any, Callable, Iterable
|
||||
|
||||
from .exceptions import InconsistencyError, LexError
|
||||
|
||||
from .itertools import makelist
|
||||
|
||||
|
||||
@dataclass
|
||||
class TokenSym:
|
||||
pattern: str
|
||||
label: str
|
||||
cast: Callable[[str], Any] | None = None
|
||||
discard: bool = False
|
||||
|
||||
# convenience methods below
|
||||
def match(self, s: str, index: int = 0) -> Match[str] | None:
|
||||
return re.compile(self.pattern, 0).match(s, index)
|
||||
|
||||
@makelist
def symbol_table(*args: tuple | TokenSym, whitespace: str | None = None):
    """
    Make a symbol table from a sequence of token descriptions.

    Each positional argument is either a ready-made TokenSym or a tuple
    in the form (pattern, label[, cast]) where:
    - [] means optional
    - pattern is a regular expression (r-string syntax advised)
    - label is a constant string
    - cast is a function applied to the matched text

    Tuples are unpacked positionally into TokenSym, so any further item
    is forwarded to the remaining TokenSym fields.

    Need to strip whitespace? Pass the whitespace= keyword parameter:
    a string of characters to skip. A discardable symbol matching one
    or more of those characters is appended after all other symbols.

    The entries are yielded in the order given; the makelist decorator
    is applied on top of this generator.

    Raises TypeError when an argument is neither a tuple nor a TokenSym.
    """
    for arg in args:
        if isinstance(arg, tuple):
            arg = TokenSym(*arg)
        elif not isinstance(arg, TokenSym):
            raise TypeError(f'invalid type {arg.__class__.__name__!r}')
        yield arg

    if whitespace:
        # escape every character so that ']', '^', '-' and '\\' stay
        # literal inside the character class
        yield TokenSym('[' + re.escape(whitespace) + ']+', '', discard=True)
|
||||
|
||||
|
||||
|
||||
def ilex(text: str, table: Iterable[TokenSym], *, whitespace = False):
    """
    Tokenize a text lazily, given a token table (iterable of TokenSym).

    ilex() returns a generator; lex() returns a list.

    table should be a result from symbol_table(). Symbols are tried in
    table order and the first one matching at the current position wins.

    Yields (label, value) pairs, where value is the matched text, passed
    through the symbol's cast when that is callable. Matches of symbols
    flagged as discard are dropped, unless whitespace is true: in that
    case they are yielded as (None, matched_text).

    Raises LexError when no symbol matches at the current position, and
    InconsistencyError when a symbol matches the empty string (the
    tokenizer would otherwise never advance).
    """
    # The table is scanned once per token: materialize it up front so a
    # one-shot iterator is not exhausted after the first token, and compile
    # every pattern a single time instead of on each attempt.
    compiled = [(re.compile(sym.pattern), sym) for sym in table]

    i = 0
    end = len(text)
    while i < end:
        for regex, sym in compiled:
            mo = regex.match(text, i)
            if mo is not None:
                break
        else:
            raise LexError(f'illegal character near {text[i:i+5]!r}')

        if not sym.discard:
            mtext = mo.group(0)
            if callable(sym.cast):
                mtext = sym.cast(mtext)
            yield (sym.label, mtext)
        elif whitespace:
            yield (None, mo.group(0))

        if i == mo.end(0):
            # zero-length match: no progress is possible from here
            raise InconsistencyError(
                f'empty match for token {sym.label!r} at position {i}'
            )
        i = mo.end(0)
|
||||
|
||||
# lex() is ilex() wrapped by makelist(); per the ilex() docstring it
# returns a list instead of a generator.
lex = makelist(ilex)
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue