From c27630c3d6b3e37b66b355ed53705f05bbb885ce Mon Sep 17 00:00:00 2001 From: Yusur Princeps Date: Wed, 29 Oct 2025 09:28:59 +0100 Subject: [PATCH] 0.7.4 add test and docs to .iding --- CHANGELOG.md | 8 +- README.md | 6 +- aliases/sakuragasaki46_suou/pyproject.toml | 2 +- docs/conf.py | 2 +- docs/iding.rst | 197 +++++++++++++++++++++ docs/index.rst | 1 + docs/sqlalchemy.rst | 3 +- src/suou/__init__.py | 2 +- src/suou/glue.py | 19 +- src/suou/iding.py | 17 +- src/suou/luck.py | 2 +- tests/test_iding.py | 35 ++++ 12 files changed, 278 insertions(+), 16 deletions(-) create mode 100644 docs/iding.rst create mode 100644 tests/test_iding.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a9562f..e4ebb12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,15 @@ # Changelog +## 0.7.4 + ++ Delay release of `@glue()` ++ Add docs and some tests to `.iding` ++ Fix bug in `SiqGen()` that may prevent generation in short amounts of time + ## 0.7.3 + Fixed some broken imports in `.sqlalchemy` -+ Stage `@glue()` for release in 0.8.0 ++ Stage `@glue()` for release in 0.9.0 + Add docs to `.sqlalchemy` ## 0.7.2 diff --git a/README.md b/README.md index 8f931dc..5b9a797 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,9 @@ Good morning, my brother! Welcome **SUOU** (**S**IS **U**nified **O**bject **U**nderarmor), the Python library which speeds up and makes it pleasing to develop API, database schemas and stuff in Python. 
It provides utilities such as: -* [SIQ](https://yusur.moe/protocols/siq.html) -* signing and generation of access tokens, on top of [ItsDangerous](https://github.com/pallets/itsdangerous) -* helpers for use in Flask, SQLAlchemy, and other popular frameworks +* SIQ ([specification](https://yusur.moe/protocols/siq.html) - [copy](https://suou.readthedocs.io/en/latest/iding.html)) +* signing and generation of access tokens, on top of [ItsDangerous](https://github.com/pallets/itsdangerous) *not tested and not working* +* helpers for use in Flask, [SQLAlchemy](https://suou.readthedocs.io/en/latest/sqlalchemy.html), and other popular frameworks * i forgor 💀 **It is not an ORM** nor a replacement of it; it works along existing ORMs (currently only SQLAlchemy is supported lol). diff --git a/aliases/sakuragasaki46_suou/pyproject.toml b/aliases/sakuragasaki46_suou/pyproject.toml index 52e3d6b..f53eeb0 100644 --- a/aliases/sakuragasaki46_suou/pyproject.toml +++ b/aliases/sakuragasaki46_suou/pyproject.toml @@ -10,7 +10,7 @@ license = "Apache-2.0" readme = "README.md" dependencies = [ - "suou==0.7.2", + "suou==0.7.4", "itsdangerous", "toml", "pydantic", diff --git a/docs/conf.py b/docs/conf.py index 5d415f2..8ff904e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -31,7 +31,7 @@ autodoc_mock_imports = [ "toml", "starlette", "itsdangerous", - "pydantic", + #"pydantic", "quart_schema" ] diff --git a/docs/iding.rst b/docs/iding.rst new file mode 100644 index 0000000..38eaa69 --- /dev/null +++ b/docs/iding.rst @@ -0,0 +1,197 @@ + +IDing +===== + +.. currentmodule:: suou.iding + +... + +SIQ +--- + +The main point of the SUOU library is to provide an implementation for the methods of SIS, a protocol for information exchange that is still being defined, +and of which SUOU is the reference implementation. + +The key element is the ID format called SIQ, a 112-bit identifier format. + +Here follows an extract from the `specification`_: + +.. _specification: + +Why SIQ? +******** + +.. 
highlights:: + I needed unique, compact, decentralized, reproducible and sortable identifiers for my applications. + + Something I could reliably use as database key, as well as being fit for my purposes, in the context of a larger project, a federated protocol. + +Why not ... +*********** + +.. highlights:: + * **Serial numbers**? They are relative. If they needed to be absolute, they would have to be issued by a single central authority for everyone else. Unacceptable for a decentralized protocol. + * **Username-domain identifiers**? Despite them being in use in other decentralized protocols (such as ActivityPub and Matrix), they are immutable and bound to a single domain. It means, the system sees different domains or usernames as different users. Users can't change their username after registration, therefore forcing them to carry an unpleasant or cringe handle for the rest of their life. + * **UUID**'s? UUIDs are unreliable. Most services use UUIDv4's, which are just opaque sequences of random bytes, and definitely not optimal as database keys. Other versions exist (such as the timestamp-based `UUIDv7 <https://uuidv7.org>`_), however they still miss something needed for cross-domain uniqueness. In any case, UUIDs need to waste some bits to specify their "protocol". + * **Snowflakes**? Snowflakes would be a good choice, and are the inspiration for SIQ themselves. However, 64 bits are not enough for our use case, and Snowflake is *already making the necessary sacrifices* to ensure everything fits into 64 bits (i.e. the epoch got significantly moved forward). + * **Content hashes**? They are based on content, therefore they require content to be immutable and undeletable. Also: collisions. + * **PLC**'s (i.e. the ones in use at BlueSky)? `The implementation is cryptic <https://github.com/did-method-plc/did-method-plc>`_. Moreover, it requires a central authority, and BlueSky is, as of now, holding the role of the sole authority. 
The resulting identifier is apparently random as well, therefore unorderable. + * **ULID**'s? They are just UUIDv4's with a timestamp. Sortable? Yes. Predictable? No, random bits rely on the assumption of being generated on a single host — i.e. centralization. Think of them as yet another attempt at UUIDv7's. + +Anatomy of a SIQ +**************** + + +SIQ's are **112 bit** binary strings. Why 112? Why not 128? Idk, felt like it. Maybe to save space. Maybe because I could fit it into UUID some day — UUID already reserves some bits for the protocol. + +Those 112 bits split up into: + +* 56 bits of **timestamp**; +* 8 bits of process ("**shard**") information; +* 32 bits of **domain** hash; +* 16 bits of **serial** and **qualifier**. + +Here is a diagram of a typical SIQ layout: + +``` +0: tttttttt tttttttt tttttttt tttttttt tttttttt +40: uuuuuuuu uuuuuuuu ssssssss dddddddd dddddddd +80: dddddddd dddddddd nnnnnnnn nnqqqqqq + +where: +t : timestamp -- seconds +u : timestamp -- fraction seconds +s : shard +d : domain hash +n : progressive +q : qualifier (variable width, in fact) +``` + +Timestamp +********* + +SIQ uses 56 bits for storing timestamp: + +- **40 bits** for **seconds**; +- **16 bits** for **fraction seconds**. + +There is no need to explain `why I need no less than 40 bits for seconds <https://en.wikipedia.org/wiki/Year_2038_problem>`_. + +Most standards — including Snowflake and ULID — store timestamp in *milliseconds*. It means the system needs to make a division by 1000 to retrieve the seconds value. + +But 1000 is almost 1024, right? So the last ten bits can safely be ignored and we easily obtain a UNIX timestamp by doing a right shi-  wait. + +It's more comfortable to assume that 1024 is nearly 1000. *Melius abundare quam deficere*. And injective mapping is there. + +But rounding? Truncation? Here comes the purpose of the 6 additional trailing bits: precision control. Bits from dividing milliseconds o'clock are different from those from rounding microseconds. 
+ +Yes, most systems can't go beyond milliseconds for accuracy — standard Java is like that. But detecting platform accuracy is beyond my scope. + +There are other factors to ensure uniqueness: *domain* and *shard* bits. + +Domain, shard +************* + +The temporal uniqueness is ensured by timestamp. However, in a distributed, federated system there is the chance for the same ID to get generated twice by two different subjects. + +Therefore, *spatial* uniqueness must be enforced in some way. + +Since SIQ's are going to be used the most in web applications, a way to differentiate *spatially* different applications is via the **domain name**. + +I decided to reserve **32 bits** for the domain hash. + +The algorithm of choice is **SHA-256** for its well-known diffusion and collision resistance. However, 256 bits are too much to fit into a SIQ! So, the last 4 bytes are taken. + +*...* + +Development and testing environments may safely set all the domain bits to 0. + +Qualifiers +********** + +The last 16 bits are special, in a way that makes those identifiers unique, and you can tell what is what just by looking at them. + +Inspired by programming language implementations, such as OCaml and early JavaScript, a distinguishing bit affix differentiates among types of heterogeneous entities: + +* terminal entities (leaves) end in ``1``. This includes content blobs, array elements, and relationships; +* non-leaves end in ``0``. + +The full assignment scheme (managed by me) looks like this: + +------------------------------------------------------- +Suffix Usage +======================================================= +``x00000`` user account +``x10000`` application (e.g. 
API, client, bot, form) +``x01000`` event, task +``x11000`` product, subscription +``x00100`` user group, membership, role +``x10100`` collection, feed +``x01100`` invite +``x11100`` *unassigned* +``x00010`` tag, category +``x10010`` *unassigned* +``x01010`` channel (guild, live chat, forum, wiki~) +``x11010`` *unassigned* +``xx0110`` thread, page +``xx1110`` message, post, revision +``xxx001`` 3+ fk relationship +``xxx101`` many-to-many, hash array element +``xxx011`` array element (one to many) +``xxx111`` content +-------------------------------------------------------- + + +The leftover bits are used as progressive serials, incremented as generation continues, and usually reset when timestamp is incremented. + +Like with snowflakes and ULID's, if you happen to run out of serials, you need to wait till timestamp changes. Usually around 15 microseconds. + +Storage +******* + +It is advised to store them in databases as *16 byte binary strings*. + +- In MySQL/MariaDB, it's ``VARBINARY(16)``. + +The two extra bytes are to ease alignment, and possible expansion of timestamp range — even though it would not be an issue until some years after 10,000 CE. + +It is possible to fit them into UUID's (specifically, UUIDv8's — custom ones), taking advantage of databases and libraries implementing a UUID type — e.g. PostgreSQL. + +Unfortunately, nobody wants to deal with storing arbitrarily long integers — lots of issues pop up by going beyond 64 bits. 128 bit integers are not natively supported in most places. Let alone 112 bit ones. + +(end of extract) + +Implementation +************** + +.. autoclass:: Siq + +.. autoclass:: SiqGen + +.. automethod:: SiqGen.__init__ +.. automethod:: SiqGen.generate + +Snowflake +--------- + +SUOU also implements \[the Discord flavor of\] Snowflake ID's. 
+ +This flavor of Snowflake requires an epoch date, and consists of: +* 42 bits of timestamp, with millisecond precision; +* 10 bits for, respectively, worker ID (5 bits) and shard ID (5 bits); +* 12 bits incremented progressively. + + +.. autoclass:: suou.snowflake.Snowflake + +.. autoclass:: suou.snowflake.SnowflakeGen + + +Other ID formats +---------------- + +Other ID formats (such as UUID's, ULID's) are implemented by other libraries. + +In particular, Python itself has support for UUID in the Standard Library. + + diff --git a/docs/index.rst b/docs/index.rst index b84454f..12e5d40 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -14,4 +14,5 @@ ease programmer's QoL and write shorter and cleaner code that works. :maxdepth: 2 sqlalchemy + iding api \ No newline at end of file diff --git a/docs/sqlalchemy.rst b/docs/sqlalchemy.rst index 7cea449..a1a78ac 100644 --- a/docs/sqlalchemy.rst +++ b/docs/sqlalchemy.rst @@ -38,7 +38,8 @@ Column pairs Misc ---- -.. autofunction:: BitSelector +.. autoclass:: BitSelector + .. autofunction:: match_constraint .. autofunction:: a_relationship .. 
autofunction:: declarative_base diff --git a/src/suou/__init__.py b/src/suou/__init__.py index 96bc8aa..db3e41a 100644 --- a/src/suou/__init__.py +++ b/src/suou/__init__.py @@ -37,7 +37,7 @@ from .redact import redact_url_password from .http import WantsContentType from .color import chalk -__version__ = "0.7.3" +__version__ = "0.7.4" __all__ = ( 'ConfigOptions', 'ConfigParserConfigSource', 'ConfigSource', 'ConfigValue', diff --git a/src/suou/glue.py b/src/suou/glue.py index 1d97318..db08aaf 100644 --- a/src/suou/glue.py +++ b/src/suou/glue.py @@ -18,6 +18,7 @@ import importlib from types import ModuleType from functools import wraps +from suou.classtools import MISSING from suou.functools import future @@ -32,28 +33,36 @@ class FakeModule(ModuleType): raise AttributeError(f'Module {self.__name__} not found; this feature is not available ({self._exc})') from self._exc -@future(version = "0.8.0") +@future(version = "0.9.0") def glue(*modules): """ Helper for "glue" code -- it imports the given modules and passes them as keyword arguments to the wrapped functions. - NEW 0.8.0 + NEW 0.9.0 """ module_dict = dict() + imports_succeeded = True for module in modules: try: module_dict[module] = importlib.import_module(module) except Exception as e: + imports_succeeded = False module_dict[module] = FakeModule(module, e) def decorator(func): @wraps(func) def wrapper(*a, **k): - k.update(module_dict) - return func(*a, **k) + try: + result = func(*a, **k) + except Exception: + if not imports_succeeded: + ## XXX return an iterable? A Fake****? 
+ return MISSING + raise + return result return wrapper return decorator # This module is experimental and therefore not re-exported into __init__ -__all__ = ('glue',) \ No newline at end of file +__all__ = ('glue', 'FakeModule') \ No newline at end of file diff --git a/src/suou/iding.py b/src/suou/iding.py index 2fe2364..a2e0c37 100644 --- a/src/suou/iding.py +++ b/src/suou/iding.py @@ -31,6 +31,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. from __future__ import annotations import base64 import binascii +import datetime import enum from functools import cached_property import hashlib @@ -40,6 +41,8 @@ import os from typing import Iterable, override import warnings +from suou.calendar import want_timestamp + from .functools import deprecated from .codecs import b32lencode, b64encode, cb32decode, cb32encode, want_str @@ -120,20 +123,30 @@ class SiqGen: """ Implement a SIS-compliant SIQ generator. """ - __slots__ = ('domain_hash', 'last_gen_ts', 'counters', 'shard_id', '__weakref__') + __slots__ = ('domain_hash', 'last_gen_ts', 'counters', 'shard_id', '_test_cur_ts', '__weakref__') domain_hash: int last_gen_ts: int shard_id: int counters: dict[SiqType, int] + _test_cur_timestamp: int | None def __init__(self, domain: str, last_siq: int = 0, local_id: int | None = None, shard_id: int | None = None): self.domain_hash = make_domain_hash(domain, local_id) + self._test_cur_ts = None ## test only self.last_gen_ts = min(last_siq >> 56, self.cur_timestamp()) self.counters = dict() self.shard_id = (shard_id or os.getpid()) % 256 def cur_timestamp(self) -> int: + if self._test_cur_ts is not None: + return self._test_cur_ts return int(time.time() * (1 << 16)) + def set_cur_timestamp(self, value: datetime.datetime): + """ + Intended to be used by tests only! Do not use in production! 
+ """ + self._test_cur_ts = int(want_timestamp(value) * 2 ** 16) + self.last_gen_ts = int(want_timestamp(value) * 2 ** 16) def generate(self, /, typ: SiqType, n: int = 1) -> Iterable[int]: """ Generate one or more SIQ's. @@ -152,7 +165,7 @@ class SiqGen: elif now > self.last_gen_ts: self.counters[typ] = 0 while n: - idseq = typ.prepend(self.counters[typ]) + idseq = typ.prepend(self.counters.setdefault(typ, 0)) if idseq >= (1 << 16): while (now := self.cur_timestamp()) <= self.last_gen_ts: time.sleep(1 / (1 << 16)) diff --git a/src/suou/luck.py b/src/suou/luck.py index 1ea9039..78b58f8 100644 --- a/src/suou/luck.py +++ b/src/suou/luck.py @@ -1,5 +1,5 @@ """ -Fortune' RNG and esoterism. +Fortune, RNG and esoterism. NEW 0.7.0 diff --git a/tests/test_iding.py b/tests/test_iding.py new file mode 100644 index 0000000..630b180 --- /dev/null +++ b/tests/test_iding.py @@ -0,0 +1,35 @@ + + +import datetime +import unittest + +from suou.iding import Siq, SiqType, SiqGen, make_domain_hash + + +class TestIding(unittest.TestCase): + def setUp(self) -> None: + ... + def tearDown(self) -> None: + ... 
+ def test_generation(self): + gen = SiqGen('0', shard_id=256) + gen.set_cur_timestamp(datetime.datetime(2020,1,1)) + i1 = gen.generate_one(SiqType.CONTENT) + self.assertEqual(i1, 7451106619238957490390643507207) + i2_16 = gen.generate_list(SiqType.CONTENT, 15) + self.assertIsInstance(i2_16, list) + self.assertEqual(i2_16[0], i1 + 8) + self.assertEqual(i2_16[14], i1 + 120) + + gen.set_cur_timestamp(datetime.datetime(2021, 1, 1)) + i17 = gen.generate_one(SiqType.CONTENT) + self.assertEqual(i17, 7600439181106854559196223897735) + + def test_domain_hash(self): + self.assertEqual(make_domain_hash('0'), 0) + self.assertEqual(make_domain_hash('example.com'), 2261653831) + + def test_representation(self): + i1 = Siq(7451106619238957490390643507207) + self.assertEqual(i1.to_hex(), "5e0bd2f0000000000000000007") + self.assertEqual(i1.to_did(), "did:siq:iuxvojaaf4c6s6aaaaaaaaaaaaaah") \ No newline at end of file