From 946973f732218f147e0f5f5c459a101c3da79c0d Mon Sep 17 00:00:00 2001 From: Mattia Succurro Date: Tue, 17 Jun 2025 20:13:40 +0200 Subject: [PATCH] add Snowflake support, change behavior of kwargs_prefix(), fix padding bug in base32 functions --- CHANGELOG.md | 6 +- src/suou/__init__.py | 8 +- src/suou/codecs.py | 37 +++++++- src/suou/iding.py | 33 +++++-- src/suou/itertools.py | 18 +++- src/suou/snowflake.py | 190 +++++++++++++++++++++++++++++++++++++++++ src/suou/sqlalchemy.py | 47 ++++++++-- 7 files changed, 315 insertions(+), 24 deletions(-) create mode 100644 src/suou/snowflake.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ce0f75..4d0719b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,13 +2,17 @@ ## 0.3.0 +- Fixed `cb32decode()` and `b32ldecode()` doing wrong padding — **UNSOLVED in 0.2.x** which is out of support, effective immediately +- **Changed behavior** of `kwargs_prefix()` which now removes keys from original mapping by default - Add SQLAlchemy auth loaders i.e. `sqlalchemy.require_auth_base()`, `flask_sqlalchemy`. What auth loaders do is loading user token and signature into app +- Add `sqlalchemy.create_session()` - Implement `UserSigner()` - Improve JSON handling in `flask_restx` - Add base2048 (i.e. [BIP-39](https://github.com/bitcoin/bips/blob/master/bip-0039.mediawiki)) codec -- Add `split_bits()`, `join_bits()`, `ltuple()`, `rtuple()` +- Add `split_bits()`, `join_bits()`, `ltuple()`, `rtuple()`, `ssv_list()` - Add `markdown` extensions +- Add Snowflake manipulation utilities ## 0.2.3 diff --git a/src/suou/__init__.py b/src/suou/__init__.py index 8b87268..935ce43 100644 --- a/src/suou/__init__.py +++ b/src/suou/__init__.py @@ -17,20 +17,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
""" from .iding import Siq, SiqCache, SiqType, SiqGen -from .codecs import StringCase, cb32encode, cb32decode, jsonencode, want_bytes, want_str, b2048encode, b2048decode +from .codecs import StringCase, cb32encode, cb32decode, jsonencode, want_bytes, want_str, b2048encode, b2048decode, ssv_list from .bits import count_ones, mask_shift, split_bits, join_bits from .configparse import MissingConfigError, MissingConfigWarning, ConfigOptions, ConfigParserConfigSource, ConfigSource, DictConfigSource, ConfigValue, EnvConfigSource from .functools import deprecated, not_implemented from .classtools import Wanted, Incomplete from .itertools import makelist, kwargs_prefix, ltuple, rtuple from .i18n import I18n, JsonI18n, TomlI18n +from .snowflake import Snowflake, SnowflakeGen -__version__ = "0.3.0-dev22" +__version__ = "0.3.0-dev24" __all__ = ( 'Siq', 'SiqCache', 'SiqType', 'SiqGen', 'StringCase', 'MissingConfigError', 'MissingConfigWarning', 'ConfigOptions', 'ConfigParserConfigSource', 'ConfigSource', 'ConfigValue', 'EnvConfigSource', 'DictConfigSource', 'deprecated', 'not_implemented', 'Wanted', 'Incomplete', 'jsonencode', 'ltuple', 'rtuple', 'makelist', 'kwargs_prefix', 'I18n', 'JsonI18n', 'TomlI18n', 'cb32encode', 'cb32decode', 'count_ones', 'mask_shift', - 'want_bytes', 'want_str', 'version', 'b2048encode', 'split_bits', 'join_bits', 'b2048decode' + 'want_bytes', 'want_str', 'version', 'b2048encode', 'split_bits', 'join_bits', 'b2048decode', + 'Snowflake', 'SnowflakeGen', 'ssv_list' ) diff --git a/src/suou/codecs.py b/src/suou/codecs.py index 5f74867..2bee255 100644 --- a/src/suou/codecs.py +++ b/src/suou/codecs.py @@ -162,7 +162,7 @@ def cb32decode(val: bytes | str) -> str: ''' Decode bytes from Crockford Base32. 
''' - return base64.b32decode(want_bytes(val).upper().translate(CROCKFORD_TO_B32) + b'=' * ((5 - len(val) % 5) % 5)) + return base64.b32decode(want_bytes(val).upper().translate(CROCKFORD_TO_B32) + b'=' * ((8 - len(val) % 8) % 8)) def b32lencode(val: bytes) -> str: ''' @@ -174,7 +174,7 @@ def b32ldecode(val: bytes | str) -> bytes: ''' Decode a lowercase base32 encoded byte sequence. Padding is managed automatically. ''' - return base64.b32decode(want_bytes(val).upper() + b'=' * ((5 - len(val) % 5) % 5)) + return base64.b32decode(want_bytes(val).upper() + b'=' * ((8 - len(val) % 8) % 8)) def b64encode(val: bytes, *, strip: bool = True) -> str: ''' @@ -229,6 +229,35 @@ def jsonencode(obj: dict, *, skipkeys: bool = True, separators: tuple[str, str] jsondecode = deprecated('just use json.loads()')(json.loads) +def ssv_list(s: str, *, sep_chars = ',;') -> list[str]: + """ + Parse values from a Space Separated Values (SSV) string. + + By default, values are split on spaces, commas (,) and semicolons (;), configurable + with the sep_chars= argument. + + Double quotes (") can be used to allow spaces, commas etc. in values. Doubled double + quotes ("") are parsed as literal double quotes. + + Useful for environment variables: pass it to ConfigValue() as the cast= argument. + """ + sep_re = r'\s+|\s*[' + re.escape(sep_chars) + r']\s*' + parts = s.split('"') + parts[::2] = [re.split(sep_re, x) for x in parts[::2]] + l: list[str] = parts[0].copy() + for i in range(1, len(parts), 2): + p0, *pt = parts[i+1] + # two "strings" sandwiching each other case + if i < len(parts)-2 and parts[i] and parts[i+2] and not p0 and not pt: + p0 = '"' + l[-1] += ('"' if parts[i] == '' else parts[i]) + p0 + l.extend(pt) + if l and l[0] == '': + l.pop(0) + if l and l[-1] == '': + l.pop() + return l + class StringCase(enum.Enum): """ Enum values used by regex validators and storage converters. 
@@ -237,7 +266,7 @@ class StringCase(enum.Enum): LOWER = case insensitive, force lowercase UPPER = case insensitive, force uppercase IGNORE = case insensitive, leave as is, use lowercase in comparison - IGNORE_UPPER = same as above, but use uppercase il comparison + IGNORE_UPPER = same as above, but use uppercase in comparison """ AS_IS = 0 LOWER = FORCE_LOWER = 1 @@ -264,5 +293,5 @@ class StringCase(enum.Enum): __all__ = ( 'cb32encode', 'cb32decode', 'b32lencode', 'b32ldecode', 'b64encode', 'b64decode', 'jsonencode' - 'StringCase', 'want_bytes', 'want_str', 'jsondecode' + 'StringCase', 'want_bytes', 'want_str', 'jsondecode', 'ssv_list' ) \ No newline at end of file diff --git a/src/suou/iding.py b/src/suou/iding.py index dba591c..a188d04 100644 --- a/src/suou/iding.py +++ b/src/suou/iding.py @@ -40,7 +40,7 @@ import os from typing import Iterable, override import warnings -from .functools import not_implemented, deprecated +from .functools import deprecated from .codecs import b32lencode, b64encode, cb32encode @@ -220,6 +220,9 @@ class SiqCache: return self._cache.pop(0) class Siq(int): + """ + Representation of a SIQ as an integer. + """ def to_bytes(self, length: int = 14, byteorder = 'big', *, signed: bool = False) -> bytes: return super().to_bytes(length, byteorder, signed=signed) @classmethod @@ -230,17 +233,22 @@ class Siq(int): def to_base64(self, length: int = 15, *, strip: bool = True) -> str: return b64encode(self.to_bytes(length), strip=strip) - def to_cb32(self)-> str: + def to_cb32(self) -> str: return cb32encode(self.to_bytes(15, 'big')) to_crockford = to_cb32 def to_hex(self) -> str: return f'{self:x}' def to_oct(self) -> str: return f'{self:o}' - @deprecated('use str() instead') - def to_dec(self) -> str: - return f'{self}' - + def to_b32l(self) -> str: + """ + This is NOT the URI serializer! 
+ """ + return b32lencode(self.to_bytes(15, 'big')) + def __str__(self) -> str: + return int.__str__(self) + to_dec = deprecated('use str() instead')(__str__) + @override def __format__(self, opt: str, /) -> str: try: @@ -256,7 +264,9 @@ class Siq(int): case '0c': return '0' + self.to_cb32() case 'd' | '': - return int.__str__(self) + return int.__repr__(self) + case 'l': + return self.to_b32l() case 'o' | 'x': return int.__format__(self, opt) case 'u': @@ -287,6 +297,15 @@ class Siq(int): def __repr__(self): return f'{self.__class__.__name__}({super().__repr__()})' + # convenience methods + def timestamp(self): + return (self >> 56) / (1 << 16) + + def shard_id(self): + return (self >> 48) % 256 + + def domain_name(self): + return (self >> 16) % 0xffffffff __all__ = ( 'Siq', 'SiqCache', 'SiqType', 'SiqGen' diff --git a/src/suou/itertools.py b/src/suou/itertools.py index dad51f4..e4bee80 100644 --- a/src/suou/itertools.py +++ b/src/suou/itertools.py @@ -50,11 +50,25 @@ def rtuple(seq: Iterable[_T], size: int, /, pad = None) -> tuple: return seq -def kwargs_prefix(it: dict[str, Any], prefix: str) -> dict[str, Any]: +def kwargs_prefix(it: dict[str, Any], prefix: str, *, remove = True, keep_prefix = False) -> dict[str, Any]: ''' Subset of keyword arguments. Useful for callable wrapping. + + By default, it removes arguments from original kwargs as well. You can prevent by + setting remove=False. + + By default, specified prefix is removed from each key of the returned + dictionary; keep_prefix=True keeps the prefix on keys. 
''' - return {k.removeprefix(prefix): v for k, v in it.items() if k.startswith(prefix)} + keys = [k for k in it.keys() if k.startswith(prefix)] + + ka = dict() + for k in keys: + ka[k if keep_prefix else k.removeprefix(prefix)] = it[k] + if remove: + for k in keys: + it.pop(k) + return ka diff --git a/src/suou/snowflake.py b/src/suou/snowflake.py new file mode 100644 index 0000000..bc56354 --- /dev/null +++ b/src/suou/snowflake.py @@ -0,0 +1,190 @@ +""" +Utilities for Snowflake-like identifiers. + +Here for applications that benefit from their use. I (sakuragasaki46) +recommend using SIQ (.iding) when applicable; there are also utilities to +convert snowflakes into SIQs in .migrate. + +--- + +Copyright (c) 2025 Sakuragasaki46. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +See LICENSE for the specific language governing permissions and +limitations under the License. + +This software is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +""" + + +from __future__ import annotations +import os +from threading import Lock +import time +from typing import override +import warnings + +from .migrate import SnowflakeSiqMigrator +from .iding import SiqType +from .codecs import b32ldecode, b32lencode, b64encode, cb32encode +from .functools import deprecated + + +class SnowflakeGen: + """ + Implements a generator of Snowflake IDs (i.e. the ones in use at Twitter / Discord). 
+ + Discord snowflakes are in this format: + tttttttt tttttttt tttttttt tttttttt + tttttttt ttddddds sssspppp pppppppp + + where: + t: timestamp (in milliseconds) — 42 bits + d: local ID — 5 bits + s: shard ID — 5 bits + p: progressive counter — 12 bits + + Converter takes local ID and shard ID as one; latter 8 bits are taken for + the shard ID, while the former 2 are added to timestamp, taking advantage of + more precision — along with up to 2 most significant bits of progressive counter. + + The constructor takes an epoch argument, since snowflakes, due to + optimization requirements, are based on a different epoch (e.g. + Jan 1, 2015 for Discord); epoch is wanted as seconds since Unix epoch + (i.e. midnight of Jan 1, 1970). + """ + epoch: int + local_id: int + shard_id: int + counter: int + last_gen_ts: int + + TS_ACCURACY = 1000 + + + def __init__(self, epoch: int, local_id: int = 0, shard_id: int | None = None, + last_id: int = 0 + ): + self.epoch = epoch + self.local_id = local_id + self.shard_id = (shard_id or os.getpid()) % 32 + self.counter = 0 + self.last_gen_ts = min(last_id >> 22, self.cur_timestamp()) + def cur_timestamp(self) -> int: + return int((time.time() - self.epoch) * self.TS_ACCURACY) + def generate(self, /, n: int = 1): + """ + Generate one or more snowflakes. + The generated ids are returned as integers. + Bulk generation is supported. + + Returns as an iterator, to allow generation “on the fly”. + To get a scalar or a list, use .generate_one() or next(), or + .generate_list() or list(.generate()), respectively. + + Warning: the function **may block**. 
+ """ + now = self.cur_timestamp() + if now < self.last_gen_ts: + time.sleep((self.last_gen_ts - now) / (1 << 16)) + elif now > self.last_gen_ts: + self.counter = 0 + while n: + if self.counter >= 4096: + while (now := self.cur_timestamp()) <= self.last_gen_ts: + time.sleep(1 / (1 << 16)) + with Lock(): + self.counter %= 1 << 16 + # XXX the lock is here "just in case", MULTITHREADED GENERATION IS NOT ADVISED! + with Lock(): + siq = ( + (now << 22) | + ((self.local_id % 32) << 17) | + ((self.shard_id % 32) << 12) | + (self.counter % (1 << 12)) + ) + n -= 1 + self.counter += 1 + yield siq + def generate_one(self, /, typ: SiqType) -> int: + return next(self.generate(typ, 1)) + def generate_list(self, /, typ: SiqType, n: int = 1) -> list[int]: + return list(self.generate(typ, n)) + + +class Snowflake(int): + """ + Representation of a Snowflake as an integer. + """ + + def to_bytes(self, length: int = 14, byteorder = "big", *, signed: bool = False) -> bytes: + return super().to_bytes(length, byteorder, signed=signed) + @classmethod + def from_bytes(cls, b: bytes, byteorder = 'big', *, signed: bool = False) -> Snowflake: + if len(b) != 8: + warnings.warn('Snowflakes are exactly 8 bytes long', BytesWarning) + return super().from_bytes(b, byteorder, signed=signed) + + def to_base64(self, length: int = 9, *, strip: bool = True) -> str: + return b64encode(self.to_bytes(length), strip=strip) + def to_cb32(self)-> str: + return cb32encode(self.to_bytes(9, 'big')) + to_crockford = to_cb32 + def to_hex(self) -> str: + return f'{self:x}' + def to_oct(self) -> str: + return f'{self:o}' + def to_b32l(self) -> str: + return b32lencode(self.to_bytes(10, 'big')).lstrip('a') + @classmethod + def from_b32l(cls, val: str) -> Snowflake: + if val.startswith('_'): + ## support for negative Snowflakes + return -cls.from_b32l(val.lstrip('_')) + return Snowflake.from_bytes(b32ldecode(val.ljust(16, 'a'))[-8:]) + + @override + def __format__(self, opt: str, /) -> str: + try: + return 
self.format(opt) + except ValueError: + return super().__format__(opt) + def format(self, opt: str, /) -> str: + match opt: + case 'b': + return self.to_base64() + case 'c': + return self.to_cb32() + case '0c': + return '0' + self.to_cb32() + case 'd' | '': + return int.__repr__(self) + case 'l': + return self.to_b32l() + case 'o' | 'x': + return int.__format__(self, opt) + case _: + raise ValueError(f'unknown format: {opt!r}') + + def __str__(self) -> str: + return int.__str__(self) + to_dec = deprecated('use str() instead')(__str__) + + def __repr__(self): + return f'{self.__class__.__name__}({super().__repr__()})' + + def to_siq(self, domain: str, epoch: int, target_type: SiqType, **kwargs): + """ + Convenience method for conversion to SIQ. + + (!) This does not check for existence! Always do the check yourself. + """ + return SnowflakeSiqMigrator(domain, epoch, **kwargs).to_siq(self, target_type) + + + +__all__ = ( + 'Snowflake', 'SnowflakeGen' +) \ No newline at end of file diff --git a/src/suou/sqlalchemy.py b/src/suou/sqlalchemy.py index 3886d0d..71fd5ca 100644 --- a/src/suou/sqlalchemy.py +++ b/src/suou/sqlalchemy.py @@ -20,13 +20,14 @@ from abc import ABCMeta, abstractmethod from functools import wraps from typing import Any, Callable, Iterable, Never, TypeVar import warnings -from sqlalchemy import CheckConstraint, Date, Dialect, ForeignKey, LargeBinary, Column, MetaData, SmallInteger, String, select, text +from sqlalchemy import BigInteger, CheckConstraint, Date, Dialect, ForeignKey, LargeBinary, Column, MetaData, SmallInteger, String, create_engine, select, text from sqlalchemy.orm import DeclarativeBase, Session, declarative_base as _declarative_base +from .snowflake import SnowflakeGen from .itertools import kwargs_prefix, makelist from .signing import HasSigner, UserSigner from .codecs import StringCase -from .functools import deprecated +from .functools import deprecated, not_implemented from .iding import SiqType, SiqCache from .classtools import 
Incomplete, Wanted @@ -36,7 +37,7 @@ _T = TypeVar('_T') # Not to be confused with SiqType. IdType = LargeBinary(16) - +@not_implemented def sql_escape(s: str, /, dialect: Dialect) -> str: """ Escape a value for SQL embedding, using SQLAlchemy's literal processors. @@ -49,7 +50,18 @@ def sql_escape(s: str, /, dialect: Dialect) -> str: raise TypeError('invalid data type') -def id_column(typ: SiqType, *, primary_key: bool = True): +def create_session(url: str) -> Session: + """ + Create a session on the fly, given a database URL. Useful for + contextless environments, such as Python REPL. + + Heads up: a function with the same name exists in core sqlalchemy, but behaves + completely differently!! + """ + engine = create_engine(url) + return Session(bind = engine) + +def id_column(typ: SiqType, *, primary_key: bool = True, **kwargs): """ Marks a column which contains a SIQ. """ @@ -60,9 +72,27 @@ def id_column(typ: SiqType, *, primary_key: bool = True): return idgen.generate().to_bytes() return new_id if primary_key: - return Incomplete(Column, IdType, primary_key = True, default = Wanted(new_id_factory)) + return Incomplete(Column, IdType, primary_key = True, default = Wanted(new_id_factory), **kwargs) else: - return Incomplete(Column, IdType, unique = True, nullable = False, default = Wanted(new_id_factory)) + return Incomplete(Column, IdType, unique = True, nullable = False, default = Wanted(new_id_factory), **kwargs) + +def snowflake_column(*, primary_key: bool = True, **kwargs): + """ + Same as id_column() but with snowflakes. + + XXX this is meant ONLY as means of transition; for new stuff, use id_column() and SIQ. + """ + def new_id_factory(owner: DeclarativeBase) -> Callable: + epoch = owner.metadata.info['snowflake_epoch'] + # more arguments will be passed on (?) 
+ idgen = SnowflakeGen(epoch) + def new_id() -> bytes: + return idgen.generate() + return new_id + if primary_key: + return Incomplete(Column, BigInteger, primary_key = True, default = Wanted(new_id_factory), **kwargs) + else: + return Incomplete(Column, BigInteger, unique = True, nullable = False, default = Wanted(new_id_factory), **kwargs) def match_constraint(col_name: str, regex: str, /, dialect: str = 'default', constraint_name: str | None = None) -> CheckConstraint: @@ -99,9 +129,12 @@ def declarative_base(domain_name: str, master_secret: bytes, metadata: dict | No metadata = dict() if 'info' not in metadata: metadata['info'] = dict() + # snowflake metadata + snowflake_kwargs = kwargs_prefix(kwargs, 'snowflake_', remove=True) metadata['info'].update( domain_name = domain_name, - secret_key = master_secret + secret_key = master_secret, + **{f'snowflake_{k}': v for k, v in snowflake_kwargs} ) Base = _declarative_base(metadata=MetaData(**metadata), **kwargs) return Base