new module lex
This commit is contained in:
parent
4a2e8d3343
commit
ee36616b43
4 changed files with 109 additions and 10 deletions
|
|
@ -3,7 +3,8 @@
|
||||||
## 0.4.0
|
## 0.4.0
|
||||||
|
|
||||||
+ Added `ValueProperty`, abstract superclass for `ConfigProperty`
|
+ Added `ValueProperty`, abstract superclass for `ConfigProperty`
|
||||||
+ Changed the behavior of `makelist()`: now it can also decorate a callable, converting its return type to a list
|
+ \[BREAKING] Changed the behavior of `makelist()`: now it's also a decorator, converting its return type to a list (revertable with `wrap=False`)
|
||||||
|
+ New module `lex` with functions `symbol_table()` and `lex()`, to make tokenization more affordable
|
||||||
+ Added `addattr()`
|
+ Added `addattr()`
|
||||||
|
|
||||||
## 0.3.6
|
## 0.3.6
|
||||||
|
|
|
||||||
|
|
@ -27,13 +27,16 @@ from .itertools import makelist, kwargs_prefix, ltuple, rtuple, additem
|
||||||
from .i18n import I18n, JsonI18n, TomlI18n
|
from .i18n import I18n, JsonI18n, TomlI18n
|
||||||
from .snowflake import Snowflake, SnowflakeGen
|
from .snowflake import Snowflake, SnowflakeGen
|
||||||
|
|
||||||
__version__ = "0.4.0-dev27"
|
__version__ = "0.4.0-dev28"
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
'Siq', 'SiqCache', 'SiqType', 'SiqGen', 'StringCase',
|
'ConfigOptions', 'ConfigParserConfigSource', 'ConfigSource', 'ConfigValue',
|
||||||
'MissingConfigError', 'MissingConfigWarning', 'ConfigOptions', 'ConfigParserConfigSource', 'ConfigSource', 'ConfigValue', 'EnvConfigSource', 'DictConfigSource',
|
'DictConfigSource', 'EnvConfigSource', 'I18n', 'Incomplete', 'JsonI18n',
|
||||||
'deprecated', 'not_implemented', 'Wanted', 'Incomplete', 'jsonencode', 'ltuple', 'rtuple',
|
'MissingConfigError', 'MissingConfigWarning', 'Siq', 'SiqCache', 'SiqGen',
|
||||||
'makelist', 'kwargs_prefix', 'I18n', 'JsonI18n', 'TomlI18n', 'cb32encode', 'cb32decode', 'count_ones', 'mask_shift',
|
'SiqType', 'Snowflake', 'SnowflakeGen', 'StringCase', 'TomlI18n', 'Wanted',
|
||||||
'want_bytes', 'want_str', 'version', 'b2048encode', 'split_bits', 'join_bits', 'b2048decode',
|
'additem', 'b2048decode', 'b2048encode', 'b32ldecode', 'b32lencode',
|
||||||
'Snowflake', 'SnowflakeGen', 'ssv_list', 'additem', 'b32lencode', 'b32ldecode', 'b64encode', 'b64decode'
|
'b64encode', 'b64decode', 'cb32encode', 'cb32decode', 'count_ones',
|
||||||
|
'deprecated', 'join_bits', 'jsonencode', 'kwargs_prefix', 'ltuple',
|
||||||
|
'makelist', 'mask_shift', 'not_implemented', 'rtuple', 'split_bits',
|
||||||
|
'ssv_list', 'want_bytes', 'want_str'
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,7 @@ This software is distributed on an "AS IS" BASIS,
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from .functools import deprecated
|
||||||
|
|
||||||
class MissingConfigError(LookupError):
|
class MissingConfigError(LookupError):
|
||||||
"""
|
"""
|
||||||
|
|
@ -31,3 +31,14 @@ class MissingConfigWarning(MissingConfigError, Warning):
|
||||||
A required config property is missing, and the application is assuming a default value.
|
A required config property is missing, and the application is assuming a default value.
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class LexError(SyntaxError):
    """
    An illegal character or sequence was encountered in the token stream.
    """
|
||||||
|
|
||||||
|
class InconsistencyError(RuntimeError):
    """
    The program has reached a state it is never supposed to be in.
    """
|
||||||
|
|
|
||||||
84
src/suou/lex.py
Normal file
84
src/suou/lex.py
Normal file
|
|
@ -0,0 +1,84 @@
|
||||||
|
"""
|
||||||
|
Utilities for tokenization of text.
|
||||||
|
|
||||||
|
---
|
||||||
|
"""
|
||||||
|
|
||||||
|
from re import Match
|
||||||
|
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
import re
|
||||||
|
from typing import Any, Callable, Iterable
|
||||||
|
|
||||||
|
from .exceptions import InconsistencyError, LexError
|
||||||
|
|
||||||
|
from .itertools import makelist
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class TokenSym:
|
||||||
|
pattern: str
|
||||||
|
label: str
|
||||||
|
cast: Callable[[str], Any] | None = None
|
||||||
|
discard: bool = False
|
||||||
|
|
||||||
|
# convenience methods below
|
||||||
|
def match(self, s: str, index: int = 0) -> Match[str] | None:
|
||||||
|
return re.compile(self.pattern, 0).match(s, index)
|
||||||
|
|
||||||
|
@makelist
def symbol_table(*args: tuple | TokenSym, whitespace: str | None = None):
    """
    Make a symbol table from a sequence of tuples and/or TokenSym objects.

    Each positional argument is either a ready-made TokenSym, passed through
    unchanged, or a tuple in the form (pattern, label[, cast]) where:
    - [] means optional
    - pattern is a regular expression (r-string syntax advised)
    - label is a constant string
    - cast is a function
    Tuples are unpacked positionally into TokenSym, so a fourth element,
    if given, becomes the discard flag.

    Need to strip whitespace? Pass the whitespace= keyword parameter: its
    characters are escaped and turned into a character class, appended as
    the last symbol with an empty label and discard=True.

    Raises TypeError when an argument is neither a tuple nor a TokenSym.

    The symbols are yielded in order; the makelist decorator is expected to
    collect them into a list (see makelist() for the exact contract).
    """
    for arg in args:
        if not isinstance(arg, TokenSym):
            if not isinstance(arg, tuple):
                raise TypeError(f'invalid type {arg.__class__.__name__!r}')
            # positional unpacking: (pattern, label[, cast[, discard]])
            arg = TokenSym(*arg)
        yield arg
    if whitespace:
        # one-or-more of the given characters, emitted last so that every
        # explicit symbol gets a chance to match first
        yield TokenSym('[' + re.escape(whitespace) + ']+', '', discard=True)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def ilex(text: str, table: Iterable["TokenSym"], *, whitespace: bool = False):
    """
    Lazily split a text into tokens, given a token table (iterable of TokenSym).

    ilex() returns a generator; lex() returns a list.

    table is typically the result of symbol_table(), but any iterable of
    TokenSym works: it is read once up front, so one-shot iterators are safe.
    Symbols are tried in table order and the first one matching at the
    current position wins.

    Yields (label, value) pairs, where value is the matched text, passed
    through the symbol's cast when that is callable. Matches of symbols
    flagged discard are skipped, unless whitespace is true, in which case
    they are yielded as (None, matched_text).

    Raises LexError when no symbol matches at the current position, and
    InconsistencyError when the winning symbol matched an empty string
    (the scan could not advance).
    """
    # Read the table exactly once and compile each pattern up front: the
    # table is scanned again for every token, which would exhaust a one-shot
    # iterator and recompile the same patterns over and over.
    compiled = [(re.compile(sym.pattern), sym) for sym in table]
    i = 0
    while i < len(text):
        mo = None
        for regex, sym in compiled:
            mo = regex.match(text, i)
            if mo is None:
                continue
            if not sym.discard:
                mtext = mo.group(0)
                if callable(sym.cast):
                    mtext = sym.cast(mtext)
                yield (sym.label, mtext)
            elif whitespace:
                yield (None, mo.group(0))
            break
        if mo is None:
            raise LexError(f'illegal character near {text[i:i+5]!r}')
        if i == mo.end(0):
            # zero-length match: looping again would never terminate
            raise InconsistencyError
        i = mo.end(0)
|
||||||
|
|
||||||
|
# lex() is ilex() wrapped by makelist, for callers who want a list instead of a generator.
lex = makelist(ilex)
|
||||||
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue