From 1e45ef9314a45819bbd00e54e537e62c128c0888 Mon Sep 17 00:00:00 2001 From: nobody Date: Tue, 8 Jul 2025 16:49:52 -0700 Subject: [PATCH] feat: experiment with different implementations of LSEQ --- .gitignore | 46 ++ README.md | 35 + research/.plan | 4 + research/ALGORITHM_EXPLANATION.md | 134 ++++ research/Cargo.lock | 721 ++++++++++++++++++ research/Cargo.toml | 17 + research/README.md | 37 + research/benches/lseq_benchmarks.rs | 176 +++++ research/src/algorithms/lseq_base64.rs | 613 +++++++++++++++ research/src/algorithms/mod.rs | 5 + .../original_paper_reference_impl.rs | 501 ++++++++++++ research/src/bin/encoding_analyzer.rs | 373 +++++++++ research/src/encoding_analysis.rs | 180 +++++ research/src/lib.rs | 7 + research/src/main.rs | 52 ++ rust/Cargo.toml | 21 + rust/README.md | 82 ++ rust/src/lib.rs | 382 ++++++++++ typescript/.npmignore | 5 + typescript/README.md | 61 ++ typescript/package.json | 36 + typescript/src/index.ts | 72 ++ typescript/tsconfig.json | 18 + 23 files changed, 3578 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 research/.plan create mode 100644 research/ALGORITHM_EXPLANATION.md create mode 100644 research/Cargo.lock create mode 100644 research/Cargo.toml create mode 100644 research/README.md create mode 100644 research/benches/lseq_benchmarks.rs create mode 100644 research/src/algorithms/lseq_base64.rs create mode 100644 research/src/algorithms/mod.rs create mode 100644 research/src/algorithms/original_paper_reference_impl.rs create mode 100644 research/src/bin/encoding_analyzer.rs create mode 100644 research/src/encoding_analysis.rs create mode 100644 research/src/lib.rs create mode 100644 research/src/main.rs create mode 100644 rust/Cargo.toml create mode 100644 rust/README.md create mode 100644 rust/src/lib.rs create mode 100644 typescript/.npmignore create mode 100644 typescript/README.md create mode 100644 typescript/package.json create mode 100644 typescript/src/index.ts create mode 
100644 typescript/tsconfig.json diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..314acf6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,46 @@ +# TypeScript +typescript/node_modules/ +typescript/dist/ +typescript/*.tsbuildinfo +typescript/.nyc_output/ +typescript/coverage/ + +# Rust +target/ +rust/Cargo.lock + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* + +# Runtime data +pids +*.pid +*.seed +*.pid.lock + +# Optional npm cache directory +.npm + +# Optional eslint cache +.eslintcache + +# dotenv environment variable files +.env +.env.development.local +.env.test.local +.env.production.local +.env.local diff --git a/README.md b/README.md new file mode 100644 index 0000000..bb566e6 --- /dev/null +++ b/README.md @@ -0,0 +1,35 @@ +# L-SEQ - Fractional Indexing for List CRDTs + +This repository contains implementations of the L-SEQ algorithm for fractional indexing, used to create naive list CRDTs (Conflict-free Replicated Data Types). L-SEQ provides a way to generate unique, sortable identifiers that can be inserted between any two existing identifiers, making it ideal for collaborative editing and distributed systems. + +## Implementations + +- **TypeScript**: `@peoplesgrocers/lseq` - Located in the `typescript/` directory +- **Rust**: `peoplesgrocers-lseq` - Located in the `rust/` directory + +## What is L-SEQ? + +L-SEQ is a sequence CRDT that uses fractional indexing to maintain a total order of elements in a distributed system. It allows multiple users to concurrently insert elements into a list without conflicts, automatically resolving the order based on the generated identifiers. + +## Usage + +Each implementation provides: +- Allocation of new identifiers between existing ones +- Comparison functions for sorting +- Serialization to/from strings + +See the individual directories for language-specific documentation and examples. 
+ +## References + +This library implements: + +[Brice Nédelec, Pascal Molli, Achour Mostefaoui, Emmanuel Desmontils. LSEQ: an Adaptive Structure +for Sequences in Distributed Collaborative Editing. 13th ACM Symposium on Document Engineering +(DocEng), Sep 2013, Florence, Italy. pp.37–46, 10.1145/2494266.2494278. hal-00921633](https://hal.science/hal-00921633/document) + +I recommend also reading: + +[Marc Shapiro, Nuno Preguiça, Carlos Baquero, Marek Zawirski. A comprehensive study of Conver- +gent and Commutative Replicated Data Types. [Research Report] RR-7506, Inria – Centre Paris- +Rocquencourt; INRIA. 2011, pp.50. inria-00555588](https://inria.hal.science/inria-00555588/document) diff --git a/research/.plan b/research/.plan new file mode 100644 index 0000000..0a73b97 --- /dev/null +++ b/research/.plan @@ -0,0 +1,4 @@ +There is a test harness written in Java by the original paper authors +https://github.com/Chat-Wane/LSEQ + +So far I cannot get my implementation to reproduce the numbers diff --git a/research/ALGORITHM_EXPLANATION.md b/research/ALGORITHM_EXPLANATION.md new file mode 100644 index 0000000..27d246b --- /dev/null +++ b/research/ALGORITHM_EXPLANATION.md @@ -0,0 +1,134 @@ +# LSEQ Base64: A Continued Fraction System for Distributed Identifiers + +This is a continued fraction system. + +You have identifiers like `[1, 2, 52]` that represent positions in a mixed-radix number system where each digit position has exponentially more capacity: 64, 64², 64³, etc. When you need to insert between two identifiers, you're doing arithmetic in this variable-base system until you find a digit position with enough space. + +## The Core Problem + +You want to insert a new identifier between `p = [1,2,52]` and `q = [1,2,53]`. These are adjacent at the last digit, so there's no room. What do you do? + +**Answer**: Extend to the next digit position, which has 16.7 million slots. 
+ +This is exactly like having the decimal numbers 1252 and 1253, realizing they're adjacent, and extending to 12520 through 12529 to find space between them. Except our "decimal" system has bases 64, 64², 64³, etc. + +## The Algorithm + +Walk down both identifiers digit by digit, building the result as you go: + +```rust +function alloc(self, p, q): + depth = 0 + result = Vec::with_capacity(max(p.len(), q.len()) + 1) + borrow_flag = false + + interval = 0 + + while depth < result.capacity(): + if self.strategies.len() <= depth: + self.strategies.push(random(bool)) + + p_val = p[depth] if depth < p.len() else 0 + q_val = + if borrow_flag: + max_value_at_depth + else if depth < q.len(): + q[depth] + else + 0 + + if p_val == q_val: + result[depth] = p_val + depth += 1 // Same value, continue deeper + continue + + if q_val - p_val > 1: + // Enough space at this level + interval = q_val - p_val - 1 + + if self.strategies[depth]: + // add to p + result[depth] = p_val + random(1, min(BOUNDARY, interval)) + else: + // subtract from q + result[depth] = q_val - random(1, min(BOUNDARY, interval)) + break + else: + // q_val - p_val == 1, not enough space, go deeper one level + result[depth] = p_val + depth += 1 + borrow_flag = true + + return result[0..=depth] +``` + +The key insights: +- **Pre-allocate result**: We know the maximum possible depth upfront +- **Borrow flag**: When there's no space at a depth, we set a borrow flag that affects how we interpret missing digits in the next level +- **Strategy array**: Each depth has a persistent strategy (boundary+ or boundary-) to prevent clustering +- **Boundary limiting**: Use a `BOUNDARY` constant to limit random selection and improve distribution + +## Why This Works + +**Guaranteed space**: Each level has exponentially more capacity (64^(level+1) slots), so you'll always find space eventually. + +**Total ordering**: The lexicographic ordering of vectors gives you a consistent sort order. 
+ +**No coordination**: Two nodes can independently pick identifiers without talking to each other. + +## Concrete Example + +`p = [1,2,52]`, `q = [1,2,53]` + +- **Depth 0**: `p_val = 1`, `q_val = 1` → same value, so `result[0] = 1`, continue deeper +- **Depth 1**: `p_val = 2`, `q_val = 2` → same value, so `result[1] = 2`, continue deeper +- **Depth 2**: `p_val = 52`, `q_val = 53` → `q_val - p_val = 1`, no space (≤ 1), so `result[2] = 52`, set `borrow_flag = true`, continue deeper +- **Depth 3**: `p_val = 0` (past end of p), `q_val = max_value_at_depth` (because borrow_flag is true) → huge interval available! + +Now we have space: `interval = max_value_at_depth - 0 - 1`. Check the strategy at depth 3: +- If `strategies[3] = true` (boundary+): `result[3] = 0 + random(1, min(BOUNDARY, interval))` +- If `strategies[3] = false` (boundary-): `result[3] = max_value_at_depth - random(1, min(BOUNDARY, interval))` + +Return `[1,2,52,chosen_value]`. + +## The "Borrowing" (Borrow Flag) + +When there's no space at depth 2 (`q_val - p_val = 1`), we set `borrow_flag = true`. This affects how we interpret missing digits in the next level: + +- Without borrow flag: missing digit in `q` becomes `0` +- With borrow flag: missing digit in `q` becomes `max_value_at_depth` + +Why? Because when we couldn't fit at depth 2, we're now looking for space between: +- `[1,2,52,0...]` (p extended) +- `[1,2,52,max_value...]` (q "borrowed down") + +**Continued fraction borrowing direction**: Since our array represents continued fraction numerators from most significant to least significant, we're borrowing from the more significant position (earlier in the array at depth 2) to create space at the less significant position (later in the array at depth 3). + +This is like decimal borrowing, but in reverse array order: when looking between 1252 and 1253, we actually search between 1252 and 1252.999... 
The borrow flag tells us we're in this "borrowed" state where the more significant digit has lent capacity to the less significant position. + +## Edge Cases + +**Adjacent values**: Handled by extending to the next depth. + +**Maximum values**: If `p[depth]` is already at max, extending still works because the next depth has way more capacity. + +**Empty inputs**: `p = []` becomes `[0]`, `q = []` becomes `[max_at_depth_0]`. + +## Why "Continued Fraction"? + +Each digit position has a different base (64¹, 64², 64³, ...), and you're doing arithmetic across these variable-capacity positions. This is the defining characteristic of continued fractions and mixed-radix systems. + +The tree visualization is just a way to think about it, but fundamentally you're doing arithmetic in a number system where carrying/borrowing happens between positions with different capacities. + +## Implementation Details + +The actual code builds the result vector directly as it traverses both identifiers simultaneously. Key implementation points: + +- **Pre-allocated result**: We know the maximum depth upfront: `max(p.len(), q.len()) + 1` +- **Strategy persistence**: Each depth has a persistent random strategy (boundary+ or boundary-) stored in `self.strategies` +- **Borrow flag mechanics**: When `q_val - p_val = 1`, we subtract one from the +q_val and set borrow_flag for the next level (which is same as taking p_val) +- **Boundary limiting**: Use `min(BOUNDARY, interval)` to limit random selection and improve distribution + +The strategy selection prevents clustering and ensures good distribution of identifiers over time. + diff --git a/research/Cargo.lock b/research/Cargo.lock new file mode 100644 index 0000000..6ffa44f --- /dev/null +++ b/research/Cargo.lock @@ -0,0 +1,721 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "bumpalo" +version = "3.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cfg-if" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.5.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40b6887a1d8685cebccf115538db5c0efe625ccac9696ad45c409d96566e910f" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.5.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0c66c08ce9f0c698cbce5c0279d0bb6ac936d8674174fe48f736533b964f59e" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "env_logger" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580" +dependencies = [ + "humantime", + "is-terminal", + "log", + "regex", + "termcolor", +] + +[[package]] +name = "getrandom" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "half" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" +dependencies = [ + "cfg-if", + "crunchy", +] + +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] +name = "humantime" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b112acc8b3adf4b107a8ec20977da0273a8c386765a3ec0229bd500a1443f9f" + +[[package]] +name = "is-terminal" +version = "0.4.16" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "js-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.174" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" + +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "memchr" +version = "2.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "oorandom" +version = "11.1.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + +[[package]] +name = "peoplesgrocers-lseq" +version = "1.0.0" +dependencies = [ + "rand", +] + +[[package]] +name = "peoplesgrocers-lseq-research" +version = "0.1.0" +dependencies = [ + "criterion", + "env_logger", + "log", + "peoplesgrocers-lseq", + "rand", +] + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "rustversion" +version = "1.0.21" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.140" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "syn" +version = "2.0.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "termcolor" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + 
+[[package]] +name = "wasm-bindgen-shared" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "zerocopy" +version = "0.8.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/research/Cargo.toml b/research/Cargo.toml new file mode 100644 index 0000000..d5f76a8 --- /dev/null +++ b/research/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "peoplesgrocers-lseq-research" +version = "0.1.0" +edition = "2021" + +[dependencies] +peoplesgrocers-lseq = { path = "../rust" } +rand = "0.8" +log = "0.4" +env_logger = "0.10" + +[dev-dependencies] 
+criterion = "0.5" + +[[bench]] +name = "lseq_benchmarks" +harness = false diff --git a/research/README.md b/research/README.md new file mode 100644 index 0000000..50c7d8b --- /dev/null +++ b/research/README.md @@ -0,0 +1,37 @@ +# L-SEQ Research + +This crate contains experimental implementations of the L-SEQ algorithm for research and comparison purposes. + +## Structure + +- `src/algorithms/original_paper_reference_impl.rs` - A direct, naive translation of the original L-SEQ paper +- `benches/` - Criterion benchmarks comparing different implementations +- `src/main.rs` - Simple demonstration of the original paper implementation + +## Implementations + +### Original Paper Reference Implementation + +This is a direct translation of the L-SEQ algorithm from the original paper without optimizations. It's designed to be as close as possible to the pseudocode from the paper for verification and comparison purposes. + +### Future Implementations + +This crate is structured to allow adding more experimental implementations in the `src/algorithms/` directory to explore different tradeoffs and optimizations. + +## Usage + +Run the demo: +```bash +cargo run +``` + +Run benchmarks: +```bash +cargo bench +``` + +## Philosophy + +This crate avoids abstraction layers and keeps each L-SEQ implementation as a concrete type with its own SortKey. Comparisons and compatibility testing are handled in the benchmarks rather than through trait abstractions. + +Each implementation is self-contained and can be studied independently without needing to understand complex trait hierarchies or wrapper types. 
\ No newline at end of file diff --git a/research/benches/lseq_benchmarks.rs b/research/benches/lseq_benchmarks.rs new file mode 100644 index 0000000..f81c90a --- /dev/null +++ b/research/benches/lseq_benchmarks.rs @@ -0,0 +1,176 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId}; +use peoplesgrocers_lseq_research::ReferenceLSEQ; +use peoplesgrocers_lseq::{SortKey, LSEQ}; +use peoplesgrocers_lseq_research::algorithms::lseq_base64::{LSEQBase64, SortKeyBase64}; +use rand::{Rng, rngs::StdRng, SeedableRng}; +use std::collections::VecDeque; + +fn benchmark_sequential_insertions(c: &mut Criterion) { + let mut group = c.benchmark_group("sequential_insertions"); + + for size in [100, 1000, 5000].iter() { + // Benchmark original paper reference implementation + group.bench_with_input( + BenchmarkId::new("original", size), + size, + |b, &size| { + b.iter(|| { + let mut lseq = ReferenceLSEQ::new(StdRng::seed_from_u64(42)); + let mut keys = Vec::new(); + + for _ in 0..size { + let before = keys.last(); + let key = lseq.allocate(before, None).unwrap(); + keys.push(key); + } + + black_box(keys); + }); + }, + ); + + // Benchmark published implementation + group.bench_with_input( + BenchmarkId::new("published", size), + size, + |b, &size| { + b.iter(|| { + let mut lseq = LSEQ::new(StdRng::seed_from_u64(42)); + let mut keys = Vec::new(); + + for _ in 0..size { + let before = keys.last(); + let key = lseq.alloc(before, None); + keys.push(key); + } + + black_box(keys); + }); + }, + ); + + // Benchmark Base64 implementation + group.bench_with_input( + BenchmarkId::new("base64", size), + size, + |b, &size| { + b.iter(|| { + let mut lseq = LSEQBase64::new(StdRng::seed_from_u64(42)); + let mut keys = Vec::new(); + + for _ in 0..size { + let before = keys.last(); + let key = lseq.allocate(before, None).unwrap(); + keys.push(key); + } + + black_box(keys); + }); + }, + ); + } + + group.finish(); +} + +fn benchmark_random_insertions(c: &mut Criterion) { + 
let mut group = c.benchmark_group("random_insertions"); + + for size in [100, 1000, 5000].iter() { + // Benchmark original paper reference implementation + group.bench_with_input( + BenchmarkId::new("original", size), + size, + |b, &size| { + b.iter(|| { + let mut lseq = ReferenceLSEQ::new(StdRng::seed_from_u64(42)); + let mut keys = Vec::new(); + let mut rng = StdRng::seed_from_u64(123); + + for _ in 0..size { + if keys.is_empty() { + let key = lseq.allocate(None, None).unwrap(); + keys.push(key); + } else { + let idx = rng.gen_range(0..keys.len()); + let before = if idx == 0 { None } else { Some(&keys[idx - 1]) }; + let after = if idx == keys.len() { None } else { Some(&keys[idx]) }; + + let key = lseq.allocate(before, after).unwrap(); + keys.insert(idx, key); + } + } + + black_box(keys); + }); + }, + ); + + // Benchmark published implementation + group.bench_with_input( + BenchmarkId::new("published", size), + size, + |b, &size| { + b.iter(|| { + let mut lseq = LSEQ::new(StdRng::seed_from_u64(42)); + let mut keys = Vec::new(); + let mut rng = StdRng::seed_from_u64(123); + + for _ in 0..size { + if keys.is_empty() { + let key = lseq.alloc(None, None); + keys.push(key); + } else { + let idx = rng.gen_range(0..keys.len()); + let before = if idx == 0 { None } else { Some(&keys[idx - 1]) }; + let after = if idx == keys.len() { None } else { Some(&keys[idx]) }; + + let key = lseq.alloc(before, after); + keys.insert(idx, key); + } + } + + black_box(keys); + }); + }, + ); + + // Benchmark Base64 implementation + group.bench_with_input( + BenchmarkId::new("base64", size), + size, + |b, &size| { + b.iter(|| { + let mut lseq = LSEQBase64::new(StdRng::seed_from_u64(42)); + let mut keys = Vec::new(); + let mut rng = StdRng::seed_from_u64(123); + + for _ in 0..size { + if keys.is_empty() { + let key = lseq.allocate(None, None).unwrap(); + keys.push(key); + } else { + let idx = rng.gen_range(0..keys.len()); + let before = if idx == 0 { None } else { Some(&keys[idx - 1]) }; + 
let after = if idx == keys.len() { None } else { Some(&keys[idx]) }; + + let key = lseq.allocate(before, after).unwrap(); + keys.insert(idx, key); + } + } + + black_box(keys); + }); + }, + ); + } + + group.finish(); +} + +criterion_group!( + benches, + benchmark_sequential_insertions, + benchmark_random_insertions, +); +criterion_main!(benches); \ No newline at end of file diff --git a/research/src/algorithms/lseq_base64.rs b/research/src/algorithms/lseq_base64.rs new file mode 100644 index 0000000..2c72135 --- /dev/null +++ b/research/src/algorithms/lseq_base64.rs @@ -0,0 +1,613 @@ +use rand::Rng; +use std::error::Error; +use std::fmt; +use log::{trace, debug}; + +const BOUNDARY: u64 = 40; // The paper says this can be any constant + +// The maximum level is 9 because the maximum value of a level is 2^(6+6*9) - 1, +// which is 2^60 - 1, which fits in u64. At level 10, we would have 2^66 - 1, +// which exceeds u64 capacity. +const MAX_LEVEL: usize = 9; + +// Python program used to generate LEVEL_DIGITS_LOOKUP: +// ```python +// def compute_level_digits(): +// digits = [] +// for i in range(10): +// max_value = (64 * (64 ** i)) - 1 # 64^(i+1) - 1 = 2^(6+6*i) - 1 +// num_digits = len(str(max_value)) +// digits.append(num_digits) +// return digits +// +// if __name__ == "__main__": +// digits = compute_level_digits() +// print(f"const LEVEL_DIGITS_LOOKUP: [usize; 10] = {digits};") +// ``` + +// Precomputed number of digits needed for each level (0-9) +// Level i has max value of 2^(6+6*i) - 1, so we need enough digits to represent that +const LEVEL_DIGITS_LOOKUP: [usize; 10] = [ + 2, 4, 6, 8, 10, 11, 13, 15, 17, 19 +]; + +/// L-SEQ implementation with 64 slots per level, multiplying by 64 each level +pub struct LSEQBase64 { + /// Strategy vector - true for + strategy, false for - strategy + strategies: Vec, + /// Random number generator + rng: R, +} + +/// Sort key implementation for 64-slot L-SEQ +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)] +pub struct 
// The maximum level is 9: level i holds values up to 2^(6+6*i) - 1, and
// 2^60 - 1 (level 9) is the largest such bound that still fits in a u64.
const MAX_LEVEL: usize = 9;

// Precomputed decimal digit count of the maximum value at each level,
// used only for zero-padded Debug formatting.
const LEVEL_DIGITS_LOOKUP: [usize; 10] = [
    2, 4, 6, 8, 10, 11, 13, 15, 17, 19
];

/// Sort key for the 64-slot L-SEQ: a path of level values that is compared
/// lexicographically (derived `PartialOrd`/`Ord` on the `Vec` gives exactly
/// the tree order the algorithm needs).
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct SortKeyBase64 {
    levels: Vec<u64>,
}

impl SortKeyBase64 {
    /// Wrap a raw level path as a sort key.
    pub fn new(levels: Vec<u64>) -> Self {
        Self { levels }
    }

    /// Borrow the underlying level path.
    pub fn levels(&self) -> &[u64] {
        &self.levels
    }

    /// Number of base64 characters needed for the maximally compact encoding.
    ///
    /// In this encoding level i always occupies exactly (i + 1) characters
    /// (6 bits each, no separators — the structure is implicit):
    /// - Level 0: 1 character (6 bits, 0-63)
    /// - Level 1: 2 characters (12 bits, 0-4095)
    /// - Level 2: 3 characters (18 bits, 0-262143)
    /// so a key with n levels needs 1 + 2 + ... + n characters.
    pub fn max_base64_chars(&self) -> usize {
        // Closed form of the sum 1 + 2 + ... + n.
        let n = self.levels.len();
        n * (n + 1) / 2
    }
}

/// Number of slots available at `level` (64 * 64^level = 64^(level+1)).
#[allow(dead_code)]
fn get_level_slots(level: usize) -> u64 {
    let per_level_factor = 64u64
        .checked_pow(level as u32)
        .expect("Level exceeds u64 representation capacity");

    64u64
        .checked_mul(per_level_factor)
        .expect("Level slots exceed u64 capacity")
}

impl fmt::Display for SortKeyBase64 {
    /// Plain dot-separated decimal rendering, e.g. "5.10.63".
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        for (i, value) in self.levels.iter().enumerate() {
            if i > 0 {
                write!(f, ".")?;
            }
            write!(f, "{}", value)?;
        }
        Ok(())
    }
}

impl fmt::Debug for SortKeyBase64 {
    /// Like `Display`, but each level is zero-padded to the width of that
    /// level's maximum value, so keys align column-wise in logs.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        for (level, value) in self.levels.iter().enumerate() {
            if level > 0 {
                write!(f, ".")?;
            }
            // Levels past MAX_LEVEL are capped at 2^60 - 1 by the allocator,
            // so reuse the widest known digit count instead of indexing
            // out of bounds.
            let width = LEVEL_DIGITS_LOOKUP[level.min(MAX_LEVEL)];
            write!(f, "{:0width$}", value, width = width)?;
        }
        Ok(())
    }
}
impl<R: Rng> LSEQBase64<R> {
    /// Create an allocator with an empty strategy vector; a boundary strategy
    /// (+ or -) is chosen lazily per depth the first time that depth is hit.
    pub fn new(rng: R) -> Self {
        Self {
            strategies: Vec::new(),
            rng,
        }
    }

    /// Set strategies for testing purposes
    #[cfg(test)]
    pub fn set_strategies(&mut self, strategies: Vec<bool>) {
        self.strategies = strategies;
    }

    /// Allocate a new identifier strictly between two existing identifiers.
    ///
    /// `None` for `before` is treated as the virtual minimum key `[0]`;
    /// `None` for `after` as the virtual maximum `[get_depth_max(0)]`.
    /// In debug builds the result is asserted to satisfy
    /// before < allocated < after before being returned.
    pub fn allocate(&mut self, before: Option<&SortKeyBase64>, after: Option<&SortKeyBase64>) -> Result<SortKeyBase64, Box<dyn Error>> {

        // Convert to the format expected by the paper's algorithm
        let p = before.map_or(vec![0], |k| k.levels().to_vec());
        let q = after.map_or(vec![self.get_depth_max(0)], |k| k.levels().to_vec());

        let levels = self.alloc(&p, &q);
        let key = SortKeyBase64::new(levels);

        // Debug assertions to verify the allocated key is properly ordered
        if let Some(before_key) = before {
            debug_assert!(
                before_key < &key,
                "ORDERING VIOLATION: before < allocated failed\n\
                before = {:?} (internal: {:?})\n\
                allocated = {:?} (internal: {:?})\n\
                after = {} (internal: {:?})\n\
                Expected: before < allocated < after",
                before_key, before_key.levels(),
                key, key.levels(),
                after.map(|k| format!("{:?}", k)).unwrap_or_else(|| "None".to_string()),
                after.map(|k| k.levels()).unwrap_or(&[])
            );
        }

        if let Some(after_key) = after {
            debug_assert!(
                &key < after_key,
                "ORDERING VIOLATION: allocated < after failed\n\
                before = {} (internal: {:?})\n\
                allocated = {:?} (internal: {:?})\n\
                after = {:?} (internal: {:?})\n\
                Expected: before < allocated < after",
                before.map(|k| format!("{:?}", k)).unwrap_or_else(|| "None".to_string()),
                before.map(|k| k.levels()).unwrap_or(&[]),
                key, key.levels(),
                after_key, after_key.levels()
            );
        }

        Ok(key)
    }

    /// Get the maximum value for a given level (64^(level+1) - 1 = 2^(6+6*level) - 1)
    /// For levels beyond 9, we cap at 2^60 - 1 to avoid u64 overflow
    fn get_depth_max(&self, depth: usize) -> u64 {
        let max_val = if depth <= MAX_LEVEL {
            // Literal is inferred as u64 from the return type, so the shift
            // of up to 60 bits is safe here.
            (1 << (6 + 6 * depth)) - 1
        } else {
            // Cap at 2^60 - 1 for levels beyond 9
            (1 << 60) - 1
        };
        trace!("get_depth_max({}) -> {}", depth, max_val);
        max_val
    }
!(p.is_empty() && q.is_empty()) { + debug_assert_ne!(p, q, "Cannot allocate between identical positions: p={:?}, q={:?}", p, q); + } + + let mut borrow_flag = false; + let max_levels = std::cmp::max(p.len(), q.len()) + 1; + let mut result = Vec::with_capacity(max_levels); + + trace!("Initial state: carry_flag={}, max_levels={}", borrow_flag, max_levels); + + // Phase 1: Find the allocation depth + for depth in 0..max_levels { + trace!("=== Processing depth {} ===", depth); + trace!("Current result so far: {:?}", result); + trace!("Current carry_flag: {}", borrow_flag); + + if self.strategies.len() <= depth { + let new_strategy = self.rng.gen_bool(0.5); + trace!("BRANCH: Generating new strategy for depth {}: {} (+ strategy: {})", + depth, new_strategy, new_strategy); + self.strategies.push(new_strategy); + } else { + trace!("Using existing strategy for depth {}: {} (+ strategy: {})", + depth, self.strategies[depth], self.strategies[depth]); + } + + + let p_val = if depth < p.len() { + trace!("BRANCH: p_val from p[{}] = {}", depth, p[depth]); + p[depth] + } else { + trace!("BRANCH: p_val defaulted to 0 (depth {} >= p.len() {})", depth, p.len()); + 0 + }; + + let q_val = if borrow_flag { + let max_val = self.get_depth_max(depth); + trace!("BRANCH: q_val from get_depth_max({}) = {} (carry_flag=true)", depth, max_val); + max_val + } else if depth < q.len() { + trace!("BRANCH: q_val from q[{}] = {} (carry_flag=false)", depth, q[depth]); + q[depth] + } else { + trace!("BRANCH: q_val defaulted to 0 (depth {} >= q.len() {}, carry_flag=false)", depth, q.len()); + 0 + }; + + trace!("At depth {}: p_val={}, q_val={}, gap={}", depth, p_val, q_val, q_val.saturating_sub(p_val)); + + if p_val == q_val { + trace!("BRANCH: Values equal at depth {} (p_val={}, q_val={}), extending prefix and going deeper", + depth, p_val, q_val); + result.push(p_val); + continue; + } + + if q_val < p_val { + trace!("BRANCH: ERROR - q_val < p_val at depth {} (q_val={}, p_val={})", depth, q_val, p_val); 
+ debug_assert!(q_val > p_val, "q < p at depth {}", depth); + // We know that q > p overall, and we know that we had a shared + // prefix up until this point, therefor q_val must be greater than p_val + // TODO I might want to return an error here instead of panicing + } + + let gap = q_val - p_val; + if gap > 1 { + // Enough space at this level + trace!("BRANCH: Sufficient space found at depth {} (gap={} > 1)", depth, gap); + let interval = gap - 1; + let step = std::cmp::min(BOUNDARY, interval); + + let allocated_value = if self.strategies[depth] { + let delta = self.rng.gen_range(1..=step); + trace!("Space allocation: interval={}, step={}, delta={}", interval, step, delta); + let val = p_val + delta; + trace!("BRANCH: Using + strategy, allocated_value = p_val + delta = {} + {} = {}", + p_val, delta, val); + val + } else { + let delta = if borrow_flag { + //self.rng.gen_range(0..step) + self.rng.gen_range(1..=step) + } else { + self.rng.gen_range(1..=step) + }; + trace!("Space allocation: interval={}, step={}, delta={}", interval, step, delta); + let val = q_val - delta; + trace!("BRANCH: Using - strategy, allocated_value = q_val - delta = {} - {} = {}", + q_val, delta, val); + val + }; + + result.push(allocated_value); + trace!("BRANCH: Allocation complete at depth {}, final result: {:?}", depth, result); + return result; + } else { + trace!("BRANCH: Insufficient space at depth {} (gap={} <= 1), extending prefix and setting carry_flag", + depth, gap); + result.push(p_val); + borrow_flag = true; + trace!("Updated state: result={:?}, carry_flag={}", result, borrow_flag); + } + } + + trace!("BRANCH: Loop completed without allocation, returning result: {:?}", result); + result + } +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::rngs::StdRng; + use rand::SeedableRng; + + #[test] + fn test_level_max() { + let lseq = LSEQBase64::new(StdRng::seed_from_u64(42)); + + // Level 0: 64 slots (0-63) + assert_eq!(lseq.get_depth_max(0), 63); + // Level 1: 4096 slots 
#[cfg(test)]
mod tests {
    use super::*;
    use rand::rngs::StdRng;
    use rand::SeedableRng;

    // Sanity-check the per-level maxima (64^(depth+1) - 1).
    #[test]
    fn test_level_max() {
        let lseq = LSEQBase64::new(StdRng::seed_from_u64(42));

        // Level 0: 64 slots (0-63)
        assert_eq!(lseq.get_depth_max(0), 63);
        // Level 1: 4096 slots (0-4095)
        assert_eq!(lseq.get_depth_max(1), 4095);
        // Level 2: 262144 slots (0-262143)
        assert_eq!(lseq.get_depth_max(2), 262143);
        // Level 3: 16777216 slots (0-16777215)
        assert_eq!(lseq.get_depth_max(3), 16777215);
    }

    // Allocating after and before an existing key must preserve order.
    #[test]
    fn test_basic_allocation() {
        let mut lseq = LSEQBase64::new(StdRng::seed_from_u64(42));

        let key1 = lseq.allocate(None, None).unwrap();
        let key2 = lseq.allocate(Some(&key1), None).unwrap();
        let key3 = lseq.allocate(None, Some(&key1)).unwrap();

        assert!(key3 < key1);
        assert!(key1 < key2);
    }

    // Lexicographic ordering: a key is less than any extension of itself.
    #[test]
    fn test_sort_key_ordering() {
        let key1 = SortKeyBase64::new(vec![5]);
        let key2 = SortKeyBase64::new(vec![5, 10]);
        let key3 = SortKeyBase64::new(vec![6]);

        assert!(key1 < key2);
        assert!(key2 < key3);
    }

    #[test]
    fn test_boundary_usage() {
        let mut lseq = LSEQBase64::new(StdRng::seed_from_u64(42));

        // Create keys with large gaps to test boundary limiting
        let key1 = SortKeyBase64::new(vec![0]);
        let key2 = SortKeyBase64::new(vec![63]);

        // Allocate between them - should use BOUNDARY to limit step
        let key_between = lseq.allocate(Some(&key1), Some(&key2)).unwrap();

        // The new key should be valid
        assert!(key1 < key_between);
        assert!(key_between < key2);
    }

    #[test]
    fn test_allocation_beyond_max_level() {
        let mut lseq = LSEQBase64::new(StdRng::seed_from_u64(42));

        // Create two identifiers that are identical at every level up to MAX_LEVEL,
        // but differ by 1 at the MAX_LEVEL position. This forces the algorithm
        // to keep going deeper beyond MAX_LEVEL.

        // Build p: [0, 0, 0, ..., 0, max_value_at_MAX_LEVEL - 1]
        let mut p = vec![0u64; MAX_LEVEL + 1];
        let max_value_at_max_level = (1u64 << (6 + 6 * MAX_LEVEL)) - 1;
        p[MAX_LEVEL] = max_value_at_max_level - 1;

        // Build q: [0, 0, 0, ..., 0, max_value_at_MAX_LEVEL]
        let mut q = vec![0u64; MAX_LEVEL + 1];
        q[MAX_LEVEL] = max_value_at_max_level;

        let p_key = SortKeyBase64::new(p);
        let q_key = SortKeyBase64::new(q);

        // This should now succeed by allocating at depth MAX_LEVEL + 1 with capped max value
        let allocated_key = lseq.allocate(Some(&p_key), Some(&q_key)).unwrap();

        // Verify the allocated key is properly ordered
        assert!(p_key < allocated_key, "p_key < allocated_key should be true");
        assert!(allocated_key < q_key, "allocated_key < q_key should be true");

        // The allocated key should be at least MAX_LEVEL + 2 levels deep
        assert!(allocated_key.levels().len() >= MAX_LEVEL + 2,
                "Allocated key should be at least {} levels deep, got {}",
                MAX_LEVEL + 2, allocated_key.levels().len());
    }

    #[test]
    fn test_formatting() {
        // Test with various values to verify digit padding
        let xs = vec![5, 6, 7, 8, 9];
        assert_eq!(SortKeyBase64::new(xs.clone()).to_string(), "5.6.7.8.9");
        assert_eq!(format!("{:?}", SortKeyBase64::new(xs)), "05.0006.000007.00000008.0000000009");

        let ys = vec![5, 10, 63, 127, 4095];
        assert_eq!(SortKeyBase64::new(ys.clone()).to_string(), "5.10.63.127.4095");
        assert_eq!(format!("{:?}", SortKeyBase64::new(ys)), "05.0010.000063.00000127.0000004095");
    }

    #[test]
    fn test_level_digits_lookup_correctness() {
        // Validate that our precomputed lookup table matches the actual calculation
        for i in 0..=MAX_LEVEL {
            let max_value = (1u64 << (6 + 6 * i)) - 1;
            let expected_digits = max_value.to_string().len();

            assert_eq!(
                LEVEL_DIGITS_LOOKUP[i],
                expected_digits,
                "Level {} digit count mismatch: lookup={}, calculated={}, max_value={}",
                i, LEVEL_DIGITS_LOOKUP[i], expected_digits, max_value
            );
        }
    }

    #[test]
    fn test_get_level_slots() {
        // Test that get_level_slots function works correctly
        assert_eq!(get_level_slots(0), 64);       // 64 * 64^0 = 64
        assert_eq!(get_level_slots(1), 4096);     // 64 * 64^1 = 4096
        assert_eq!(get_level_slots(2), 262144);   // 64 * 64^2 = 262144
        assert_eq!(get_level_slots(3), 16777216); // 64 * 64^3 = 16777216
    }

    #[test]
    fn test_max_base64_chars() {
        // Test the compact base64 encoding calculation (no separators)
        // Level i needs exactly (i+1) base64 characters in this encoding
        let key1 = SortKeyBase64::new(vec![5]); // Level 0 only
        assert_eq!(key1.max_base64_chars(), 1); // 1 character for level 0

        let key2 = SortKeyBase64::new(vec![5, 10]); // Levels 0 and 1
        assert_eq!(key2.max_base64_chars(), 3); // 1 + 2 characters for levels 0 and 1

        let key3 = SortKeyBase64::new(vec![5, 10, 15]); // Levels 0, 1, and 2
        assert_eq!(key3.max_base64_chars(), 6); // 1 + 2 + 3 characters for levels 0, 1, and 2

        let key4 = SortKeyBase64::new(vec![1, 2, 3, 4, 5]); // Levels 0-4
        assert_eq!(key4.max_base64_chars(), 15); // 1 + 2 + 3 + 4 + 5 = 15
    }

    // Regression test: keys taken verbatim from a failed debug_assert in a
    // random-insertion run.
    #[test]
    fn test_reproduce_ordering_violation_bug() {
        // Initialize logger with trace level for this test
        let _ = env_logger::Builder::from_default_env()
            .filter_level(log::LevelFilter::Trace)
            .is_test(true)
            .try_init();

        // This test reproduces the exact bug found in random insertion:
        // ORDERING VIOLATION: allocated < after failed
        // before = "52.0034" (internal: [52, 34])
        // allocated = 52.0035.262119 (internal: [52, 35, 262119])
        // after = 52.0035 (internal: [52, 35])
        // Expected: before < allocated < after

        let mut lseq = LSEQBase64::new(StdRng::seed_from_u64(42));

        // Create the before and after keys from the bug report
        let before_key = SortKeyBase64::new(vec![52, 34]);
        let after_key = SortKeyBase64::new(vec![52, 35]);

        // Verify the keys are properly ordered before we start
        assert!(before_key < after_key, "Sanity check: before < after should be true");

        // Try to allocate between them - this should succeed and maintain ordering
        let allocated_key = lseq.allocate(Some(&before_key), Some(&after_key)).unwrap();

        // Verify the allocated key is properly ordered
        assert!(before_key < allocated_key, "before < allocated should be true, got before={:?}, allocated={:?}", before_key, allocated_key);
        assert!(allocated_key < after_key, "allocated < after should be true, got allocated={:?}, after={:?}", allocated_key, after_key);
    }

    // Regression sweep: the same bug-report key pair, tried across 1000 RNG
    // seeds with the strategy vector pinned to the reported state.
    #[test]
    fn test_reproduce_specific_ordering_violation_bug() {
        // Initialize logger with trace level for this test
        let _ = env_logger::Builder::from_default_env()
            .filter_level(log::LevelFilter::Trace)
            .is_test(true)
            .try_init();

        // This test reproduces a specific ordering violation bug found in random insertion:
        // ORDERING VIOLATION: before < allocated failed
        // before = 51.0038 (internal: [51, 38])
        // allocated = 51.0017 (internal: [51, 17])
        // after = 52 (internal: [52])
        // Expected: before < allocated < after

        // Create the before and after keys from the bug report
        let before_key = SortKeyBase64::new(vec![51, 38]);
        let after_key = SortKeyBase64::new(vec![52]);

        // Verify the keys are properly ordered before we start
        assert!(before_key < after_key, "Sanity check: before < after should be true");

        let mut violations_found = Vec::new();

        // Loop over 1000 different seeds to see if we can reproduce the failure
        for seed in 0..1000 {
            let mut lseq: LSEQBase64<StdRng> = LSEQBase64::new(StdRng::seed_from_u64(seed));

            // Initialize strategies to match the bug condition: [false, true, true]
            lseq.set_strategies(vec![false, true, true]);

            // Try to allocate between them
            match lseq.allocate(Some(&before_key), Some(&after_key)) {
                Ok(allocated_key) => {
                    // Check for ordering violations
                    let before_violation = !(before_key < allocated_key);
                    let after_violation = !(allocated_key < after_key);

                    if before_violation || after_violation {
                        violations_found.push((seed, allocated_key.clone(), before_violation, after_violation));

                        eprintln!("ORDERING VIOLATION found with seed {}:
    before = {:?} (internal: {:?})
    allocated = {:?} (internal: {:?})
    after = {:?} (internal: {:?})
    before_violation: {} (before < allocated = {})
    after_violation: {} (allocated < after = {})",
                            seed,
                            before_key, before_key.levels(),
                            allocated_key, allocated_key.levels(),
                            after_key, after_key.levels(),
                            before_violation, before_key < allocated_key,
                            after_violation, allocated_key < after_key
                        );
                    }
                }
                Err(e) => {
                    eprintln!("Allocation failed with seed {}: {}", seed, e);
                }
            }
        }

        if !violations_found.is_empty() {
            panic!("Found {} ordering violations out of 1000 seeds tested. First violation was with seed {}",
                   violations_found.len(), violations_found[0].0);
        } else {
            println!("No ordering violations found across 1000 different seeds for the specific test case.");
        }
    }

    #[test]
    fn test_allocate_between_prefix_and_deep_extension() {
        // Initialize logger with trace level for this test
        let _ = env_logger::Builder::from_default_env()
            .filter_level(log::LevelFilter::Trace)
            .is_test(true)
            .try_init();

        // Test allocating between [3] and [3, 0, 0, 0, 2]
        // This tests the case where we have a short key and a longer key that extends it deeply
        let mut lseq = LSEQBase64::new(StdRng::seed_from_u64(42));

        let before_key = SortKeyBase64::new(vec![3]);
        let after_key = SortKeyBase64::new(vec![3, 0, 0, 0, 2]);

        // Verify the keys are properly ordered before we start
        assert!(before_key < after_key, "Sanity check: before < after should be true");

        // Allocate between them
        let allocated_key = lseq.allocate(Some(&before_key), Some(&after_key)).unwrap();

        // Verify the allocated key is properly ordered
        assert!(before_key < allocated_key,
                "before < allocated should be true, got before={:?}, allocated={:?}",
                before_key, allocated_key);
        assert!(allocated_key < after_key,
                "allocated < after should be true, got allocated={:?}, after={:?}",
                allocated_key, after_key);

        // The allocated key should start with [3] since that's the common prefix
        assert_eq!(allocated_key.levels()[0], 3, "Allocated key should start with 3");

        // The allocated key should be at least 5 levels deep to fit between [3] and [3, 0, 0, 0, 2]
        assert_eq!(allocated_key.levels().len(), 5,
                   "Allocated key should be 5 levels deep, got {:?}", allocated_key.levels());

        println!("Successfully allocated between [3] and [3, 0, 0, 0, 2]: {:?}", allocated_key);
    }

    #[test]
    fn test_allocate_between_max_value_and_next_level() {
        // Initialize logger with trace level for this test
        let _ = env_logger::Builder::from_default_env()
            .filter_level(log::LevelFilter::Trace)
            .is_test(true)
            .try_init();

        // Test allocating between [2, 64^2 - 1] and [3, 0]
        // This tests suffix space allocation when the before key has max value at a level
        let mut lseq = LSEQBase64::new(StdRng::seed_from_u64(42));

        let level_1_max = 64u64.pow(2) - 1; // 4095
        let before_key = SortKeyBase64::new(vec![2, level_1_max]);
        let after_key = SortKeyBase64::new(vec![3, 0]);

        // Verify the keys are properly ordered before we start
        assert!(before_key < after_key, "Sanity check: before < after should be true");

        // Allocate between them
        let allocated_key = lseq.allocate(Some(&before_key), Some(&after_key)).unwrap();

        // Verify the allocated key is properly ordered
        assert!(before_key < allocated_key,
                "before < allocated should be true, got before={:?}, allocated={:?}",
                before_key, allocated_key);
        assert!(allocated_key < after_key,
                "allocated < after should be true, got allocated={:?}, after={:?}",
                allocated_key, after_key);

        // Since [2] and [3] differ by 1, we should be allocating in suffix space after [2, 4095]
        // The allocated key should start with [2, 4095] as prefix
        assert_eq!(allocated_key.levels()[0], 2, "Allocated key should start with 2");
        assert_eq!(allocated_key.levels()[1], level_1_max, "Allocated key should have max value at level 1");

        // The allocated key should be at least 3 levels deep for suffix space allocation
        assert!(allocated_key.levels().len() >= 3,
                "Allocated key should be at least 3 levels deep for suffix allocation, got {:?}",
                allocated_key.levels());

        println!("Successfully allocated between [2, {}] and [3, 0]: {:?}", level_1_max, allocated_key);
    }
}

// ===== research/src/algorithms/mod.rs =====

pub mod original_paper_reference_impl;
pub mod lseq_base64;

pub use original_paper_reference_impl::ReferenceLSEQ;
pub use lseq_base64::LSEQBase64;

// ===== research/src/algorithms/original_paper_reference_impl.rs (preamble) =====

use rand::Rng;
use std::error::Error;
use std::fmt;
use log::{trace, debug};

const BOUNDARY: u64 = 10; // The paper says this can be any constant
//
// The maximum level is 58 because the maximum value of a level is 2^(4+58) - 1,
// which is 2^62 - 1, which is i64::MAX. Because the coding below is lazy and
// uses i64 to keep track of sign. This could be pushed to 59 if we used u64 for
// calculations.
// NOTE(review): 2^62 - 1 is not i64::MAX (that is 2^63 - 1) — confirm which
// bound was actually intended here.
const MAX_LEVEL: usize = 58;

// Python program used to generate LEVEL_DIGITS_LOOKUP:
// ```python
// def compute_level_digits():
//     digits = []
//     for i in range(59):
//         max_value = (16 * (2 ** i)) - 1  # 2^(4+i) - 1
//         num_digits = len(str(max_value))
//         digits.append(num_digits)
//     return digits
//
// if __name__ == "__main__":
//     digits = compute_level_digits()
//     print(f"const LEVEL_DIGITS_LOOKUP: [usize; 59] = {digits};")
// ```

// Precomputed number of digits needed for each level (0-58)
// Level i has max value of 2^(4+i) - 1, so we need enough digits to represent that
const LEVEL_DIGITS_LOOKUP: [usize; 59] = [
    2, 2, 2, 3, 3, 3, 4, 4, 4, 4,
    5, 5, 5, 6, 6, 6, 7, 7, 7, 7,
    8, 8, 8, 9, 9, 9, 10, 10, 10, 10,
    11, 11, 11, 12, 12, 12, 13, 13, 13, 13,
    14, 14, 14, 15, 15, 15, 16, 16, 16, 16,
    17, 17, 17, 18, 18, 18, 19, 19, 19,
];

/// Reference sort key implementation for the original paper: a path of level
/// values compared lexicographically via the derived Ord on `Vec<u64>`.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct ReferenceSortKey {
    levels: Vec<u64>,
}

impl ReferenceSortKey {
    /// Wrap a raw level path as a sort key.
    pub fn new(levels: Vec<u64>) -> Self {
        Self { levels }
    }

    /// Borrow the underlying level path.
    pub fn levels(&self) -> &[u64] {
        &self.levels
    }

    /// Calculate the number of base64 characters needed to encode the full identifier
    /// In this compact encoding, we pack all level bits together without separators:
    /// - Level 0: 4 bits (0-15)
    /// - Level 1: 5 bits (0-31)
    /// - Level 2: 6 bits (0-63)
    /// - etc.
    /// We sum all bits and encode as base64 (6 bits per character, rounding up).
    pub fn base64_chars_needed(&self) -> usize {
        let total_bits: usize = self.levels.iter().enumerate()
            .map(|(level, _)| 4 + level)
            .sum();

        // Round up to nearest multiple of 6 bits (since base64 uses 6 bits per character)
        (total_bits + 5) / 6
    }
}

impl fmt::Display for ReferenceSortKey {
    /// Plain dot-separated decimal rendering, e.g. "5.10".
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let parts: Vec<String> = self.levels.iter().map(|&x| x.to_string()).collect();
        write!(f, "{}", parts.join("."))
    }
}

impl fmt::Debug for ReferenceSortKey {
    /// Like Display, but zero-pads each level to the width of that level's
    /// maximum value so keys line up in logs.
    ///
    /// FIX(review): this previously panicked for levels beyond MAX_LEVEL, yet
    /// `get_depth_max` deliberately produces such levels (capped at 2^62 - 1)
    /// and the debug_assert diagnostics in `allocate` format keys with {:?}.
    /// Now clamps to the widest known digit count, matching the behavior of
    /// the sibling LSEQBase64 implementation.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let parts: Vec<String> = self.levels.iter().enumerate().map(|(level, &value)| {
            let digits = if level <= MAX_LEVEL {
                LEVEL_DIGITS_LOOKUP[level]
            } else {
                // Values past MAX_LEVEL are capped, so reuse the widest width
                // instead of panicking.
                LEVEL_DIGITS_LOOKUP[MAX_LEVEL]
            };
            format!("{:0width$}", value, width = digits)
        }).collect();
        write!(f, "{}", parts.join("."))
    }
}
key.levels(), + after.map(|k| format!("{:?}", k)).unwrap_or_else(|| "None".to_string()), + after.map(|k| k.levels()).unwrap_or(&[]) + ); + } + + if let Some(after_key) = after { + debug_assert!( + &key < after_key, + "ORDERING VIOLATION: allocated < after failed\n\ + before = {} (internal: {:?})\n\ + allocated = {:?} (internal: {:?})\n\ + after = {:?} (internal: {:?})\n\ + Expected: before < allocated < after", + before.map(|k| format!("{:?}", k)).unwrap_or_else(|| "None".to_string()), + before.map(|k| k.levels()).unwrap_or(&[]), + key, key.levels(), + after_key, after_key.levels() + ); + } + + Ok(key) + } + + /// Get the maximum value for a given level (16 * 2^level - 1) + /// For levels beyond MAX_LEVEL, we cap at 2^62 - 1 to avoid u64 overflow + fn get_depth_max(&self, depth: usize) -> u64 { + let max_val = if depth <= MAX_LEVEL { + (1 << (4 + depth)) - 1 + } else { + // Cap at 2^62 - 1 for levels beyond MAX_LEVEL + (1 << 62) - 1 + }; + trace!("get_depth_max({}) -> {}", depth, max_val); + max_val + } + + fn alloc(&mut self, p: &[u64], q: &[u64]) -> Vec { + debug!("Starting allocation between p={:?} and q={:?}", p, q); + if !(p.is_empty() && q.is_empty()) { + debug_assert_ne!(p, q, "Cannot allocate between identical positions: p={:?}, q={:?}", p, q); + } + + let mut borrow_flag = false; + let max_levels = std::cmp::max(p.len(), q.len()) + 1; + let mut result = Vec::with_capacity(max_levels); + + trace!("Initial state: carry_flag={}, max_levels={}", borrow_flag, max_levels); + + // Phase 1: Find the allocation depth using continued fraction approach + for depth in 0..max_levels { + trace!("=== Processing depth {} ===", depth); + trace!("Current result so far: {:?}", result); + trace!("Current carry_flag: {}", borrow_flag); + + if self.strategies.len() <= depth { + let new_strategy = self.rng.gen_bool(0.5); + trace!("BRANCH: Generating new strategy for depth {}: {} (+ strategy: {})", + depth, new_strategy, new_strategy); + self.strategies.push(new_strategy); + } 
else { + trace!("Using existing strategy for depth {}: {} (+ strategy: {})", + depth, self.strategies[depth], self.strategies[depth]); + } + + let p_val = if depth < p.len() { + trace!("BRANCH: p_val from p[{}] = {}", depth, p[depth]); + p[depth] + } else { + trace!("BRANCH: p_val defaulted to 0 (depth {} >= p.len() {})", depth, p.len()); + 0 + }; + + let q_val = if borrow_flag { + let max_val = self.get_depth_max(depth); + trace!("BRANCH: q_val from get_depth_max({}) = {} (carry_flag=true)", depth, max_val); + max_val + } else if depth < q.len() { + trace!("BRANCH: q_val from q[{}] = {} (carry_flag=false)", depth, q[depth]); + q[depth] + } else { + trace!("BRANCH: q_val defaulted to 0 (depth {} >= q.len() {}, carry_flag=false)", depth, q.len()); + 0 + }; + + trace!("At depth {}: p_val={}, q_val={}, gap={}", depth, p_val, q_val, q_val.saturating_sub(p_val)); + + if p_val == q_val { + trace!("BRANCH: Values equal at depth {} (p_val={}, q_val={}), extending prefix and going deeper", + depth, p_val, q_val); + result.push(p_val); + continue; + } + + if q_val < p_val { + trace!("BRANCH: ERROR - q_val < p_val at depth {} (q_val={}, p_val={})", depth, q_val, p_val); + debug_assert!(q_val > p_val, "q < p at depth {}", depth); + // We know that q > p overall, and we know that we had a shared + // prefix up until this point, therefore q_val must be greater than p_val + // TODO I might want to return an error here instead of panicking + } + + let gap = q_val - p_val; + if gap > 1 { + // Enough space at this level + trace!("BRANCH: Sufficient space found at depth {} (gap={} > 1)", depth, gap); + let interval = gap - 1; + let step = std::cmp::min(BOUNDARY, interval); + + let allocated_value = if self.strategies[depth] { + let delta = self.rng.gen_range(1..=step); + trace!("Space allocation: interval={}, step={}, delta={}", interval, step, delta); + let val = p_val + delta; + trace!("BRANCH: Using + strategy, allocated_value = p_val + delta = {} + {} = {}", + p_val, delta, val); 
+                val
+            } else {
+                // The borrow state does not affect how delta is sampled: the
+                // previous code branched on borrow_flag with two identical
+                // arms, so the conditional was dead weight.
+                let delta = self.rng.gen_range(1..=step);
+                trace!("Space allocation: interval={}, step={}, delta={}", interval, step, delta);
+                let val = q_val - delta;
+                trace!("BRANCH: Using - strategy, allocated_value = q_val - delta = {} - {} = {}",
+                       q_val, delta, val);
+                val
+            };
+
+            result.push(allocated_value);
+            trace!("BRANCH: Allocation complete at depth {}, final result: {:?}", depth, result);
+            return result;
+        } else {
+            trace!("BRANCH: Insufficient space at depth {} (gap={} <= 1), extending prefix and setting borrow_flag",
+                   depth, gap);
+            result.push(p_val);
+            borrow_flag = true;
+            trace!("Updated state: result={:?}, borrow_flag={}", result, borrow_flag);
+        }
+    }
+
+    trace!("BRANCH: Loop completed without allocation, returning result: {:?}", result);
+    result
+    }
+}
+
+/// Get the number of slots for a given level (16 * 2^level)
+#[allow(dead_code)]
+fn get_level_slots(level: usize) -> u64 {
+    let base_slots = 16u64;
+    let multiplier = 2u64.checked_pow(level as u32)
+        .expect("Level exceeds u64 representation capacity");
+
+    base_slots.checked_mul(multiplier)
+        .expect("Level slots exceed u64 capacity")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rand::rngs::StdRng;
+    use rand::SeedableRng;
+
+    #[test]
+    fn test_level_max() {
+        let lseq = ReferenceLSEQ::new(StdRng::seed_from_u64(42));
+
+        assert_eq!(lseq.get_depth_max(0), 15);
+        assert_eq!(lseq.get_depth_max(1), 31);
+        assert_eq!(lseq.get_depth_max(2), 63);
+        assert_eq!(lseq.get_depth_max(3), 127);
+    }
+
+    #[test]
+    fn test_basic_allocation() {
+        let mut lseq = ReferenceLSEQ::new(StdRng::seed_from_u64(42));
+
+        let key1 = lseq.allocate(None, None).unwrap();
+        let key2 = lseq.allocate(Some(&key1), None).unwrap();
+        let key3 = lseq.allocate(None, Some(&key1)).unwrap();
+
+        assert!(key3 < key1);
+        assert!(key1 < key2);
+    }
+
+    #[test]
+    fn test_sort_key_ordering() {
+        let key1 = ReferenceSortKey::new(vec![5]);
+        let key2 = 
ReferenceSortKey::new(vec![5, 10]); + let key3 = ReferenceSortKey::new(vec![6]); + + assert!(key1 < key2); + assert!(key2 < key3); + } + + #[test] + fn test_boundary_usage() { + let mut lseq = ReferenceLSEQ::new(StdRng::seed_from_u64(42)); + + // Create keys with large gaps to test boundary limiting + let key1 = ReferenceSortKey::new(vec![0]); + let key2 = ReferenceSortKey::new(vec![15]); + + // Allocate between them - should use BOUNDARY to limit step + let key_between = lseq.allocate(Some(&key1), Some(&key2)).unwrap(); + + // The new key should be valid + assert!(key1 < key_between); + assert!(key_between < key2); + } + + #[test] + fn test_allocation_beyond_max_level() { + let mut lseq = ReferenceLSEQ::new(StdRng::seed_from_u64(42)); + + // Create two identifiers that are identical at every level up to MAX_LEVEL, + // but differ by 1 at the MAX_LEVEL position. This forces the algorithm + // to keep going deeper beyond MAX_LEVEL. + + // Build p: [0, 0, 0, ..., 0, max_value_at_MAX_LEVEL - 1] + let mut p = vec![0u64; MAX_LEVEL + 1]; + let max_value_at_max_level = (1u64 << (4 + MAX_LEVEL)) - 1; + p[MAX_LEVEL] = max_value_at_max_level - 1; + + // Build q: [0, 0, 0, ..., 0, max_value_at_MAX_LEVEL] + let mut q = vec![0u64; MAX_LEVEL + 1]; + q[MAX_LEVEL] = max_value_at_max_level; + + let p_key = ReferenceSortKey::new(p); + let q_key = ReferenceSortKey::new(q); + + // This should now succeed by allocating at depth MAX_LEVEL + 1 with capped max value + let allocated_key = lseq.allocate(Some(&p_key), Some(&q_key)).unwrap(); + + // Verify the allocated key is properly ordered + assert!(p_key < allocated_key, "p_key < allocated_key should be true"); + assert!(allocated_key < q_key, "allocated_key < q_key should be true"); + + // The allocated key should be at least MAX_LEVEL + 2 levels deep + assert!(allocated_key.levels().len() >= MAX_LEVEL + 2, + "Allocated key should be at least {} levels deep, got {}", + MAX_LEVEL + 2, allocated_key.levels().len()); + } + + #[test] + fn 
test_formatting() {
+        // Test with values that need 3 digits at the 4th level (128 slots)
+
+        let xs = vec![5, 6, 7, 8, 9];
+        assert_eq!(ReferenceSortKey::new(xs.clone()).to_string(), "5.6.7.8.9");
+        assert_eq!(format!("{:?}", ReferenceSortKey::new(xs)), "05.06.07.008.009");
+
+
+        let ys = vec![5, 10, 63, 127];
+        assert_eq!(ReferenceSortKey::new(ys.clone()).to_string(), "5.10.63.127");
+        assert_eq!(format!("{:?}", ReferenceSortKey::new(ys)), "05.10.63.127");
+    }
+
+    #[test]
+    fn test_level_digits_lookup_correctness() {
+        // Validate that our precomputed lookup table matches the actual calculation
+        for i in 0..=MAX_LEVEL {
+            let max_value = (1u64 << (4 + i)) - 1;
+            let expected_digits = max_value.to_string().len();
+
+            assert_eq!(
+                LEVEL_DIGITS_LOOKUP[i],
+                expected_digits,
+                "Level {} digit count mismatch: lookup={}, calculated={}, max_value={}",
+                i, LEVEL_DIGITS_LOOKUP[i], expected_digits, max_value
+            );
+        }
+    }
+
+    #[test]
+    fn test_base64_chars_needed() {
+        // Test the compact base64 encoding calculation (no separators)
+        let key1 = ReferenceSortKey::new(vec![5]); // Level 0 only: 4 bits
+        assert_eq!(key1.base64_chars_needed(), 1); // 4 bits -> 1 base64 character
+
+        let key2 = ReferenceSortKey::new(vec![5, 10]); // Levels 0 and 1: 4 + 5 = 9 bits
+        assert_eq!(key2.base64_chars_needed(), 2); // 9 bits -> 2 base64 characters
+
+        let key3 = ReferenceSortKey::new(vec![5, 10, 15]); // Levels 0, 1, and 2: 4 + 5 + 6 = 15 bits
+        assert_eq!(key3.base64_chars_needed(), 3); // 15 bits -> 3 base64 characters
+
+        let key4 = ReferenceSortKey::new(vec![1, 2, 3, 4, 5]); // Levels 0-4: 4 + 5 + 6 + 7 + 8 = 30 bits
+        assert_eq!(key4.base64_chars_needed(), 5); // 30 bits -> 5 base64 characters
+
+        // Edge case: 39 bits is NOT a multiple of 6, so the count rounds up to 7
+        let key5 = ReferenceSortKey::new(vec![1, 2, 3, 4, 5, 6]); // Levels 0-5: 4 + 5 + 6 + 7 + 8 + 9 = 39 bits
+        assert_eq!(key5.base64_chars_needed(), 7); // 39 bits -> 7 base64 characters
+
+        // Edge case: 49 bits, which also rounds up (49 / 6 = 8.17 -> 9 chars)
+        
let key6 = ReferenceSortKey::new(vec![1, 2, 3, 4, 5, 6, 7]); // Levels 0-6: 4+5+6+7+8+9+10 = 49 bits + assert_eq!(key6.base64_chars_needed(), 9); // 49 bits -> 9 base64 characters (rounded up from 8.17) + } + + #[test] + fn test_continued_fraction_ordering_validation() { + // Initialize logger with trace level for this test + let _ = env_logger::Builder::from_default_env() + .filter_level(log::LevelFilter::Trace) + .is_test(true) + .try_init(); + + // Test the continued fraction approach with adjacent identifiers + let mut lseq = ReferenceLSEQ::new(StdRng::seed_from_u64(42)); + + // Create adjacent keys that need to use the continued fraction approach + let before_key = ReferenceSortKey::new(vec![5, 10]); + let after_key = ReferenceSortKey::new(vec![5, 11]); + + // Verify the keys are properly ordered before we start + assert!(before_key < after_key, "Sanity check: before < after should be true"); + + // Try to allocate between them - this should succeed using the continued fraction approach + let allocated_key = lseq.allocate(Some(&before_key), Some(&after_key)).unwrap(); + + // Verify the allocated key is properly ordered + assert!(before_key < allocated_key, "before < allocated should be true, got before={:?}, allocated={:?}", before_key, allocated_key); + assert!(allocated_key < after_key, "allocated < after should be true, got allocated={:?}, after={:?}", allocated_key, after_key); + + // The allocated key should be at least 3 levels deep since there's no space at level 1 + assert!(allocated_key.levels().len() >= 3, + "Allocated key should be at least 3 levels deep for continued fraction, got {:?}", + allocated_key.levels()); + } + + #[test] + fn test_allocate_between_prefix_and_deep_extension() { + // Initialize logger with trace level for this test + let _ = env_logger::Builder::from_default_env() + .filter_level(log::LevelFilter::Trace) + .is_test(true) + .try_init(); + + // Test allocating between [3] and [3, 0, 0, 0, 2] + // This tests the case where we 
have a short key and a longer key that extends it deeply + let mut lseq = ReferenceLSEQ::new(StdRng::seed_from_u64(42)); + + let before_key = ReferenceSortKey::new(vec![3]); + let after_key = ReferenceSortKey::new(vec![3, 0, 0, 0, 2]); + + // Verify the keys are properly ordered before we start + assert!(before_key < after_key, "Sanity check: before < after should be true"); + + // Allocate between them + let allocated_key = lseq.allocate(Some(&before_key), Some(&after_key)).unwrap(); + + // Verify the allocated key is properly ordered + assert!(before_key < allocated_key, + "before < allocated should be true, got before={:?}, allocated={:?}", + before_key, allocated_key); + assert!(allocated_key < after_key, + "allocated < after should be true, got allocated={:?}, after={:?}", + allocated_key, after_key); + + // The allocated key should start with [3] since that's the common prefix + assert_eq!(allocated_key.levels()[0], 3, "Allocated key should start with 3"); + + // The allocated key should be at least 5 levels deep to fit between [3] and [3, 0, 0, 0, 2] + assert_eq!(allocated_key.levels().len(), 5, + "Allocated key should be 5 levels deep, got {:?}", allocated_key.levels()); + + println!("Successfully allocated between [3] and [3, 0, 0, 0, 2]: {:?}", allocated_key); + } +} \ No newline at end of file diff --git a/research/src/bin/encoding_analyzer.rs b/research/src/bin/encoding_analyzer.rs new file mode 100644 index 0000000..30530db --- /dev/null +++ b/research/src/bin/encoding_analyzer.rs @@ -0,0 +1,373 @@ +/*! +# L-SEQ Encoding Analysis Tool + +This binary demonstrates the encoding efficiency analysis for L-SEQ algorithms. 
+ +It allocates a large number of identifiers (configurable, default 10,000) and shows: +- Base64 encoding size histograms +- Comparison between different L-SEQ variants +- Statistics useful for real-world deployment decisions + +## Usage + +```bash +cargo run --bin encoding_analyzer +cargo run --bin encoding_analyzer -- --count 1000000 +cargo run --bin encoding_analyzer -- --count 10000 --insertion-mode random +cargo run --bin encoding_analyzer -- --count 10000 --insertion-mode tail +cargo run --bin encoding_analyzer -- --count 10000 --insertion-mode head +``` + +## Options + +- `--count `: Number of identifiers to generate (default: 10000) +- `--insertion-mode `: 'tail' for sequential insertion, 'random' for random insertion, or 'head' for head insertion (default: tail) +*/ + +use std::env; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use peoplesgrocers_lseq_research::algorithms::lseq_base64::{LSEQBase64, SortKeyBase64}; +use peoplesgrocers_lseq_research::algorithms::original_paper_reference_impl::{ReferenceLSEQ, ReferenceSortKey}; +use peoplesgrocers_lseq_research::encoding_analysis::{analyze_base64_encoding, analyze_reference_encoding, compare_encodings}; + +#[derive(Debug, Clone, PartialEq)] +enum InsertionMode { + Tail, + Random, + Head, +} + +impl InsertionMode { + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "tail" => Ok(InsertionMode::Tail), + "random" => Ok(InsertionMode::Random), + "head" => Ok(InsertionMode::Head), + _ => Err("Invalid insertion mode. 
Use 'tail', 'random', or 'head'"), + } + } +} + +/// Verify that all keys are sorted in proper order +fn verify_sorted_base64(keys: &[SortKeyBase64]) -> Result<(), String> { + for i in 1..keys.len() { + if keys[i-1] >= keys[i] { + return Err(format!( + "I expected key at position {} to be smaller than key at position {}\n\ + [{}] = {:?} (internal: {:?})\n\ + [{}] = {:?} (internal: {:?})\n\ + But {:?} >= {:?}", + i-1, i, + i-1, keys[i-1], keys[i-1].levels(), + i, keys[i], keys[i].levels(), + keys[i-1], keys[i] + )); + } + } + Ok(()) +} + +/// Verify that all keys are sorted in proper order +#[allow(dead_code)] +fn verify_sorted_reference(keys: &[ReferenceSortKey]) -> Result<(), String> { + for i in 1..keys.len() { + if keys[i-1] >= keys[i] { + return Err(format!( + "I expected key at position {} to be smaller than key at position {}\n\ + [{}] = {:?} (internal: {:?})\n\ + [{}] = {:?} (internal: {:?})\n\ + But {:?} >= {:?}", + i-1, i, + i-1, keys[i-1], keys[i-1].levels(), + i, keys[i], keys[i].levels(), + keys[i-1], keys[i] + )); + } + } + Ok(()) +} + +/// Generate random insertion positions for consistent comparison +fn generate_insertion_positions(count: usize, rng: &mut StdRng) -> Vec { + let mut positions = Vec::new(); + + for i in 0..count { + if i == 0 { + positions.push(0); // First element always goes at position 0 + } else { + // Insert after position 0 to i-1 (current list has i elements) + positions.push(rng.gen_range(0..i)); + } + } + + positions +} + +/// Generate identifiers using tail insertion +fn generate_tail_insertion_base64(count: usize, rng: StdRng) -> Vec { + let mut keys = Vec::new(); + let mut lseq = LSEQBase64::new(rng); + + for i in 0..count { + let before = if i == 0 { + None + } else { + Some(&keys[i - 1]) + }; + + let key = lseq.allocate(before, None).unwrap(); + keys.push(key); + } + + keys +} + +/// Generate identifiers using tail insertion +fn generate_tail_insertion_reference(count: usize, rng: StdRng) -> Vec { + let mut keys = 
Vec::new(); + let mut lseq = ReferenceLSEQ::new(rng); + + for i in 0..count { + let before = if i == 0 { + None + } else { + Some(&keys[i - 1]) + }; + + let key = lseq.allocate(before, None).unwrap(); + keys.push(key); + } + + keys +} + +/// Generate identifiers using head insertion +fn generate_head_insertion_base64(count: usize, rng: StdRng) -> Vec { + let mut keys = Vec::new(); + let mut lseq = LSEQBase64::new(rng); + + for i in 0..count { + let after = if i == 0 { + None + } else { + Some(&keys[0]) + }; + + let key = lseq.allocate(None, after).unwrap(); + keys.insert(0, key); + } + + keys +} + +/// Generate identifiers using head insertion +fn generate_head_insertion_reference(count: usize, rng: StdRng) -> Vec { + let mut keys = Vec::new(); + let mut lseq = ReferenceLSEQ::new(rng); + + for i in 0..count { + let after = if i == 0 { + None + } else { + Some(&keys[0]) + }; + + let key = lseq.allocate(None, after).unwrap(); + keys.insert(0, key); + } + + keys +} + +/// Generate identifiers using random insertion at the same positions +fn generate_random_insertion_base64(count: usize, positions: &[usize], rng: StdRng) -> Vec { + let mut keys = Vec::new(); + let mut lseq = LSEQBase64::new(rng); + + for i in 0..count { + eprintln!("Generating key {} of {}", i, count); + let insert_after_pos = positions[i]; + + // We want to insert after position insert_after_pos + // before = element at insert_after_pos (if valid) + // after = element at insert_after_pos + 1 (if valid) + // insert at position insert_after_pos + 1 + + let before = if insert_after_pos >= keys.len() { + // If insert_after_pos is beyond the end, insert at the end + keys.last() + } else { + Some(&keys[insert_after_pos]) + }; + + let after = if insert_after_pos + 1 >= keys.len() { + None + } else { + Some(&keys[insert_after_pos + 1]) + }; + + eprintln!("before: {:?}, after: {:?}", before, after); + let key = lseq.allocate(before, after).unwrap(); + let insert_pos = std::cmp::min(insert_after_pos + 1, 
keys.len()); + keys.insert(insert_pos, key); + } + + keys +} + +/// Generate identifiers using random insertion at the same positions +fn generate_random_insertion_reference(count: usize, positions: &[usize], rng: StdRng) -> Vec { + let mut keys = Vec::new(); + let mut lseq = ReferenceLSEQ::new(rng); + + for i in 0..count { + let insert_after_pos = positions[i]; + + // We want to insert after position insert_after_pos + // before = element at insert_after_pos (if valid) + // after = element at insert_after_pos + 1 (if valid) + // insert at position insert_after_pos + 1 + + let before = if insert_after_pos >= keys.len() { + // If insert_after_pos is beyond the end, insert at the end + keys.last() + } else { + Some(&keys[insert_after_pos]) + }; + + let after = if insert_after_pos + 1 >= keys.len() { + None + } else { + Some(&keys[insert_after_pos + 1]) + }; + + let key = lseq.allocate(before, after).unwrap(); + let insert_pos = std::cmp::min(insert_after_pos + 1, keys.len()); + keys.insert(insert_pos, key); + } + + keys +} + +fn main() { + // Parse command line arguments + let args: Vec = env::args().collect(); + let mut count = 10000; + let mut insertion_mode = InsertionMode::Tail; + + let mut i = 1; + while i < args.len() { + match args[i].as_str() { + "--count" => { + if i + 1 < args.len() { + count = args[i + 1].parse::().unwrap_or(10000); + i += 2; + } else { + eprintln!("Error: --count requires a number"); + std::process::exit(1); + } + } + "--insertion-mode" => { + if i + 1 < args.len() { + insertion_mode = InsertionMode::from_str(&args[i + 1]).unwrap_or_else(|err| { + eprintln!("Error: {}", err); + std::process::exit(1); + }); + i += 2; + } else { + eprintln!("Error: --insertion-mode requires 'tail', 'random', or 'head'"); + std::process::exit(1); + } + } + _ => { + eprintln!("Unknown argument: {}", args[i]); + std::process::exit(1); + } + } + } + + println!("L-SEQ Encoding Analysis Tool"); + println!("============================"); + println!("Allocating {} 
identifiers for analysis...", count); + println!("Insertion mode: {:?}", insertion_mode); + println!(); + + // Generate identifiers based on insertion mode + let (base64_keys, reference_keys) = match insertion_mode { + InsertionMode::Tail => { + println!("Using tail insertion (sequential)..."); + let base64_keys = generate_tail_insertion_base64(count, StdRng::seed_from_u64(42)); + let reference_keys = generate_tail_insertion_reference(count, StdRng::seed_from_u64(42)); + (base64_keys, reference_keys) + } + InsertionMode::Random => { + println!("Using random insertion..."); + let mut rng = StdRng::seed_from_u64(42); + let positions = generate_insertion_positions(count, &mut rng); + + let base64_keys = generate_random_insertion_base64(count, &positions, StdRng::seed_from_u64(42)); + let reference_keys = generate_random_insertion_reference(count, &positions, StdRng::seed_from_u64(42)); + (base64_keys, reference_keys) + } + InsertionMode::Head => { + println!("Using head insertion (reverse sequential)..."); + let base64_keys = generate_head_insertion_base64(count, StdRng::seed_from_u64(42)); + let reference_keys = generate_head_insertion_reference(count, StdRng::seed_from_u64(42)); + (base64_keys, reference_keys) + } + }; + + // Verify that all keys are sorted + println!("Verifying sort order..."); + if let Err(e) = verify_sorted_base64(&base64_keys) { + eprintln!("ERROR: Base64 keys not sorted: {}", e); + std::process::exit(1); + } + + //if let Err(e) = verify_sorted_reference(&reference_keys) { + // eprintln!("ERROR: Reference keys not sorted: {}", e); + // std::process::exit(1); + //} + + println!("✓ All keys are properly sorted!"); + println!(); + + // Analyze encoding efficiency + let base64_stats = analyze_base64_encoding(&base64_keys); + let reference_stats = analyze_reference_encoding(&reference_keys); + + // Print results + base64_stats.print_summary("Base64 Variant (64 slots per level)"); + reference_stats.print_summary("Reference Implementation (16 * 2^level 
slots)"); + + compare_encodings(&base64_stats, "Base64 Variant", &reference_stats, "Reference"); + + // Additional analysis + println!("\n=== Additional Analysis ==="); + println!("Total base64 characters needed:"); + let base64_total: usize = base64_keys.iter().map(|k| k.max_base64_chars()).sum(); + let reference_total: usize = reference_keys.iter().map(|k| k.base64_chars_needed()).sum(); + + println!(" Base64 variant: {} characters", base64_total); + println!(" Reference impl: {} characters", reference_total); + println!(" Difference: {} characters ({:.1}% {})", + base64_total.abs_diff(reference_total), + (base64_total as f64 - reference_total as f64).abs() / reference_total as f64 * 100.0, + if base64_total > reference_total { "more" } else { "less" }); + + println!("\nAverage bytes per key (assuming 1 byte per base64 character):"); + println!(" Base64 variant: {:.2} bytes", base64_total as f64 / count as f64); + println!(" Reference impl: {:.2} bytes", reference_total as f64 / count as f64); + + // Show some sample keys for understanding + println!("\n=== Sample Keys (first 10) ==="); + for i in 0..std::cmp::min(10, count) { + println!("Key {}: Base64({} chars) = {:?}, Reference({} chars) = {:?}", + i, + base64_keys[i].max_base64_chars(), + base64_keys[i], + reference_keys[i].base64_chars_needed(), + reference_keys[i]); + } +} \ No newline at end of file diff --git a/research/src/encoding_analysis.rs b/research/src/encoding_analysis.rs new file mode 100644 index 0000000..986db75 --- /dev/null +++ b/research/src/encoding_analysis.rs @@ -0,0 +1,180 @@ +/*! +# L-SEQ Encoding Efficiency Analysis + +This module provides tools for analyzing the encoding efficiency of L-SEQ algorithms. + +## Use Case + +When implementing L-SEQ in real-world applications (especially web applications), we need to +serialize and transfer sort keys between systems. JavaScript and web APIs commonly use base64 +encoding for safely representing binary data in text format. 
+ +To measure the practical efficiency of different L-SEQ variants, we: + +1. **Allocate large numbers of identifiers** (e.g., 1,000,000) in realistic usage patterns +2. **Calculate base64 encoding requirements** for each identifier using the "maximally encoded" + compact format (no separators, since the structure is known) +3. **Generate histograms** showing the distribution of encoding sizes +4. **Compare different algorithms** to understand their space efficiency trade-offs + +## Encoding Formats + +### Base64 Variant (64 slots per level) +- Level 0: 1 base64 character (6 bits, 0-63) +- Level 1: 2 base64 characters (12 bits, 0-4095) +- Level 2: 3 base64 characters (18 bits, 0-262143) +- Sequential parsing: read 1 char, then 2 chars, then 3 chars, etc. + +### Original Paper Reference (16 * 2^level slots) +- Level 0: 4 bits (0-15) +- Level 1: 5 bits (0-31) +- Level 2: 6 bits (0-63) +- Packed encoding: concatenate all bits, encode as base64 (6 bits per character) + +## Analysis Functions + +This module provides functions to: +- Calculate encoding size histograms for collections of sort keys +- Compare efficiency between different L-SEQ variants +- Generate statistics for real-world usage scenarios +*/ + +use std::collections::HashMap; +use crate::algorithms::lseq_base64::SortKeyBase64; +use crate::algorithms::original_paper_reference_impl::ReferenceSortKey; + +/// Histogram of base64 encoding sizes +pub type EncodingSizeHistogram = HashMap; + +/// Statistics about encoding sizes +#[derive(Debug, Clone)] +pub struct EncodingStats { + pub total_keys: usize, + pub min_size: usize, + pub max_size: usize, + pub mean_size: f64, + pub median_size: usize, + pub histogram: EncodingSizeHistogram, +} + +impl EncodingStats { + /// Calculate statistics from a list of encoding sizes + pub fn from_sizes(sizes: Vec) -> Self { + let total_keys = sizes.len(); + let min_size = *sizes.iter().min().unwrap_or(&0); + let max_size = *sizes.iter().max().unwrap_or(&0); + let mean_size = 
sizes.iter().sum::() as f64 / total_keys as f64; + + let mut sorted_sizes = sizes.clone(); + sorted_sizes.sort_unstable(); + let median_size = if total_keys % 2 == 0 { + (sorted_sizes[total_keys / 2 - 1] + sorted_sizes[total_keys / 2]) / 2 + } else { + sorted_sizes[total_keys / 2] + }; + + let mut histogram = HashMap::new(); + for size in sizes { + *histogram.entry(size).or_insert(0) += 1; + } + + Self { + total_keys, + min_size, + max_size, + mean_size, + median_size, + histogram, + } + } + + /// Print a formatted summary of the statistics + pub fn print_summary(&self, algorithm_name: &str) { + println!("\n=== {} Encoding Statistics ===", algorithm_name); + println!("Total keys: {}", self.total_keys); + println!("Min size: {} base64 characters", self.min_size); + println!("Max size: {} base64 characters", self.max_size); + println!("Mean size: {:.2} base64 characters", self.mean_size); + println!("Median size: {} base64 characters", self.median_size); + + println!("\nSize distribution:"); + let mut sizes: Vec<_> = self.histogram.keys().collect(); + sizes.sort(); + for &size in sizes { + let count = self.histogram[&size]; + let percentage = (count as f64 / self.total_keys as f64) * 100.0; + println!(" {} chars: {} keys ({:.1}%)", size, count, percentage); + } + } +} + +/// Analyze the encoding efficiency of Base64 variant sort keys +pub fn analyze_base64_encoding(keys: &[SortKeyBase64]) -> EncodingStats { + let sizes: Vec = keys.iter().map(|key| key.max_base64_chars()).collect(); + EncodingStats::from_sizes(sizes) +} + +/// Analyze the encoding efficiency of Reference implementation sort keys +pub fn analyze_reference_encoding(keys: &[ReferenceSortKey]) -> EncodingStats { + let sizes: Vec = keys.iter().map(|key| key.base64_chars_needed()).collect(); + EncodingStats::from_sizes(sizes) +} + +/// Compare encoding efficiency between two algorithms +pub fn compare_encodings(stats1: &EncodingStats, name1: &str, stats2: &EncodingStats, name2: &str) { + println!("\n=== 
Encoding Comparison: {} vs {} ===", name1, name2); + println!("Mean size: {:.2} vs {:.2} chars ({:.1}% difference)", + stats1.mean_size, stats2.mean_size, + ((stats2.mean_size - stats1.mean_size) / stats1.mean_size) * 100.0); + println!("Max size: {} vs {} chars", stats1.max_size, stats2.max_size); + println!("Min size: {} vs {} chars", stats1.min_size, stats2.min_size); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_encoding_stats() { + let sizes = vec![1, 2, 2, 3, 3, 3, 4, 5]; + let stats = EncodingStats::from_sizes(sizes); + + assert_eq!(stats.total_keys, 8); + assert_eq!(stats.min_size, 1); + assert_eq!(stats.max_size, 5); + assert_eq!(stats.mean_size, 2.875); + assert_eq!(stats.median_size, 3); + assert_eq!(stats.histogram[&3], 3); + assert_eq!(stats.histogram[&2], 2); + } + + #[test] + fn test_base64_analysis() { + let keys = vec![ + SortKeyBase64::new(vec![1]), + SortKeyBase64::new(vec![1, 2]), + SortKeyBase64::new(vec![1, 2, 3]), + ]; + + let stats = analyze_base64_encoding(&keys); + assert_eq!(stats.total_keys, 3); + assert_eq!(stats.min_size, 1); // 1 level = 1 char + assert_eq!(stats.max_size, 6); // 3 levels = 1+2+3 = 6 chars + assert_eq!(stats.mean_size, 10.0/3.0); // (1+3+6)/3 + } + + #[test] + fn test_reference_analysis() { + let keys = vec![ + ReferenceSortKey::new(vec![1]), + ReferenceSortKey::new(vec![1, 2]), + ReferenceSortKey::new(vec![1, 2, 3]), + ]; + + let stats = analyze_reference_encoding(&keys); + assert_eq!(stats.total_keys, 3); + assert_eq!(stats.min_size, 1); // 4 bits = 1 char + assert_eq!(stats.max_size, 3); // 4+5+6=15 bits = 3 chars + assert_eq!(stats.mean_size, 2.0); // (1+2+3)/3 + } +} \ No newline at end of file diff --git a/research/src/lib.rs b/research/src/lib.rs new file mode 100644 index 0000000..65c4a79 --- /dev/null +++ b/research/src/lib.rs @@ -0,0 +1,7 @@ +pub mod algorithms; +pub mod encoding_analysis; + +pub use algorithms::ReferenceLSEQ; + +// Re-export for convenience in benchmarks +pub use 
rand; \ No newline at end of file diff --git a/research/src/main.rs b/research/src/main.rs new file mode 100644 index 0000000..ae883f2 --- /dev/null +++ b/research/src/main.rs @@ -0,0 +1,52 @@ +use peoplesgrocers_lseq_research::ReferenceLSEQ; +use rand::rngs::StdRng; +use rand::SeedableRng; +use log::trace; + +fn main() -> Result<(), Box> { + // Because this smoke test is so simple, I'm not going to show the module name or timestamp. + env_logger::Builder::from_default_env() + .format(|buf, record| { + use std::io::Write; + use env_logger::fmt::Color; + + let mut style = buf.style(); + let level_color = match record.level() { + log::Level::Error => Color::Red, + log::Level::Warn => Color::Yellow, + log::Level::Info => Color::Green, + log::Level::Debug => Color::Blue, + log::Level::Trace => Color::Cyan, + }; + style.set_color(level_color).set_bold(true); + + writeln!(buf, "{} {}", style.value(record.level()), record.args()) + }) + .init(); + + println!("L-SEQ Research - Original Paper Reference Implementation"); + + // Test the original paper reference implementation + let mut lseq = ReferenceLSEQ::new(StdRng::seed_from_u64(42)); + let mut keys = Vec::new(); + + // Generate 10 sequential insertions + for i in 0..10 { + let before = keys.last(); + let key = lseq.allocate(before, None)?; + println!("Generated key {}: {}", i + 1, key); + trace!("--------------------------------"); + keys.push(key); + } + + // Verify they are sorted + println!("\nVerifying sort order:"); + for i in 0..keys.len() - 1 { + println!("{} < {}", keys[i], keys[i + 1]); + assert!(keys[i] < keys[i + 1]); + } + + println!("\nAll keys are properly sorted!"); + + Ok(()) +} diff --git a/rust/Cargo.toml b/rust/Cargo.toml new file mode 100644 index 0000000..f8bd08d --- /dev/null +++ b/rust/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "peoplesgrocers-lseq" +version = "1.0.0" +edition = "2021" +description = "L-SEQ algorithm implementation for fractional indexing and list CRDTs" +keywords = ["lseq", 
"crdt", "fractional-indexing", "sequence", "collaborative-editing"] +categories = ["data-structures", "algorithms"] +license = "MIT" +repository = "https://github.com/peoplesgrocers/lseq" +readme = "README.md" + +[features] +default = [] +serde = ["dep:serde"] + +[dependencies] +rand = "0.8" +serde = { version = "1.0", features = ["derive"], optional = true } + +[dev-dependencies] +rand = { version = "0.8", features = ["small_rng"] } \ No newline at end of file diff --git a/rust/README.md b/rust/README.md new file mode 100644 index 0000000..f7544bd --- /dev/null +++ b/rust/README.md @@ -0,0 +1,82 @@ +# peoplesgrocers-lseq + +Rust implementation of the L-SEQ algorithm for fractional indexing and list CRDTs. + +## Installation + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +peoplesgrocers-lseq = "1.0.0" +``` + +## Usage + +```rust +use peoplesgrocers_lseq::{LSEQ, SortKey, compare_lseq}; +use rand::thread_rng; + +// Create a new L-SEQ instance +let mut lseq = LSEQ::new(thread_rng()); + +// Allocate identifiers +let id1 = lseq.alloc(None, None); // First identifier +let id2 = lseq.alloc(Some(&id1), None); // After id1 +let id3 = lseq.alloc(Some(&id1), Some(&id2)); // Between id1 and id2 + +// Sort identifiers +let mut ids = vec![id3.clone(), id1.clone(), id2.clone()]; +ids.sort(); +println!("{:?}", ids); // [id1, id3, id2] - properly ordered + +// Convert to/from strings +let key_str = id1.to_string(); +let parsed_key: SortKey = key_str.parse().unwrap(); +assert_eq!(id1, parsed_key); + +// Use with deterministic RNG for testing +use rand::rngs::StdRng; +use rand::SeedableRng; + +let rng = StdRng::seed_from_u64(42); +let mut deterministic_lseq = LSEQ::new(rng); +``` + +## Features + +- **Fractional indexing**: Generate identifiers that can be inserted between any two existing ones +- **Serialization**: Full support for serde serialization/deserialization +- **Ordering**: SortKey implements Ord and can be used directly with Rust's sorting +- **String 
conversion**: Convert to/from strings for storage and transmission +- **Even spacing**: Utilities for generating evenly distributed keys for bulk operations + +## API + +### `LSEQ` + +#### `new(rng: R) -> Self` + +Creates a new L-SEQ instance with the given random number generator. + +#### `alloc(&mut self, before: Option<&SortKey>, after: Option<&SortKey>) -> SortKey` + +Allocates a new identifier between two existing identifiers. + +- `before`: The identifier that should come before the new one (or `None` for beginning) +- `after`: The identifier that should come after the new one (or `None` for end) +- Returns: A new SortKey that sorts between `before` and `after` + +### `SortKey` + +A sort key that implements `Ord`, `Serialize`, `Deserialize`, and string conversion. + +### `EvenSpacingIterator` + +Utility for generating evenly spaced sort keys for bulk operations. + +## How it works + +L-SEQ generates identifiers using a base-64 alphabet that maintains lexicographic ordering. Each identifier is a sequence of characters from this alphabet, and new identifiers are generated by finding space between existing ones at different depths. + +The algorithm uses alternating allocation strategies (bias toward min or max) at different depths to avoid degenerative cases and maintain good performance characteristics. 
\ No newline at end of file
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
new file mode 100644
index 0000000..4aa1fb5
--- /dev/null
+++ b/rust/src/lib.rs
@@ -0,0 +1,382 @@
+use rand::Rng;
+#[cfg(feature = "serde")]
+use serde::{
+    de::{self, Visitor},
+    Deserialize, Serialize,
+};
+use std::error::Error;
+use std::fmt;
+use std::str::FromStr;
+
+const ALPHABET: &[u8] = b"-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz";
+
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
+#[cfg_attr(feature = "serde", derive(Serialize))]
+#[cfg_attr(feature = "serde", serde(into = "String"))]
+pub struct SortKey {
+    numbers: Vec<u8>,
+}
+
+#[cfg(feature = "serde")]
+impl<'de> Deserialize<'de> for SortKey {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct SortKeyVisitor;
+
+        impl<'de> Visitor<'de> for SortKeyVisitor {
+            type Value = SortKey;
+
+            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+                formatter.write_str("a string containing valid sort key characters")
+            }
+
+            fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
+            where
+                E: de::Error,
+            {
+                value.parse().map_err(|e| E::custom(e))
+            }
+        }
+
+        deserializer.deserialize_str(SortKeyVisitor)
+    }
+}
+
+impl SortKey {
+    pub fn from_numbers(numbers: Vec<u8>) -> Self {
+        SortKey { numbers }
+    }
+}
+
+impl From<SortKey> for Vec<u8> {
+    fn from(key: SortKey) -> Vec<u8> {
+        key.numbers
+    }
+}
+
+impl From<SortKey> for String {
+    fn from(key: SortKey) -> String {
+        key.to_string()
+    }
+}
+
+impl AsRef<[u8]> for SortKey {
+    fn as_ref(&self) -> &[u8] {
+        &self.numbers
+    }
+}
+
+impl From<String> for SortKey {
+    fn from(s: String) -> Self {
+        s.parse().unwrap_or_else(|_| SortKey { numbers: vec![0] })
+    }
+}
+
+impl fmt::Display for SortKey {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        for &n in &self.numbers {
+            write!(f, "{}", ALPHABET[n as usize] as char)?;
+        }
+        Ok(())
+    }
+}
+
+#[allow(dead_code)]
+#[derive(Debug)]
+pub struct LSEQ<R: Rng> {
+    strategies: Vec<bool>,
+    rng: R,
+}
+
+#[allow(dead_code)]
+impl<R: Rng> LSEQ<R> {
+    pub fn new(mut rng: R) -> Self {
+        let strategies = vec![rng.gen_bool(0.5)];
+        LSEQ { strategies, rng }
+    }
+
+    pub fn alloc(&mut self, before: Option<&SortKey>, after: Option<&SortKey>) -> SortKey {
+        // Convert to numeric arrays, using boundary values for null
+        let p = before.map_or(vec![0], |s| s.numbers.clone());
+        let q = after.map_or(vec![63], |s| s.numbers.clone());
+
+        // Walk through digits looking for space
+        let mut depth = 0;
+        let mut result = Vec::new();
+
+        loop {
+            let p_val = if depth < p.len() { p[depth] } else { 0 };
+            let q_val = if depth < q.len() { q[depth] } else { 63 };
+
+            let interval = q_val as i32 - p_val as i32;
+
+            // If we have space between values at this depth
+            if interval > 1 {
+                // Pick a value in the available range
+                let range = interval - 1;
+                let add_val = 1 + self.rng.gen_range(0..range) as u8;
+                let new_value = if self.strategies[depth] {
+                    p_val + add_val
+                } else {
+                    q_val - add_val
+                };
+
+                // Take the prefix from p up to depth and append our new value
+                result.push(new_value);
+                return SortKey::from_numbers(result);
+            }
+            result.push(p_val);
+
+            // If values are the same or adjacent at this depth,
+            // continue to next depth
+            depth += 1;
+            if depth >= self.strategies.len() {
+                self.strategies.push(self.rng.gen_bool(0.5));
+            }
+        }
+    }
+}
+
+#[derive(Debug)]
+pub enum SpacingError {
+    TooManyItems,
+}
+
+impl fmt::Display for SpacingError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            SpacingError::TooManyItems => write!(f, "Too many items to allocate"),
+        }
+    }
+}
+
+impl Error for SpacingError {}
+
+#[derive(Debug, Clone)]
+pub struct EvenSpacingIterator {
+    remaining_items: usize,
+    space_size: u64,
+    next_item: u64,
+    step_size_integer: u64, // Integer part of step size
+    step_size_error: f64,   // Fractional part of step size
+    error_accumulator: f64, // Accumulated error
+}
+
+impl EvenSpacingIterator {
+    // Static table of (64^k - 2) values for k from
1 to 9 + // We subtract 2 from each space size because we need to reserve two boundary positions: + // 1. Position 0 (represented by "-") is reserved as the lower boundary + // 2. Position 63 (represented by "z") is reserved as the upper boundary + // This ensures we can always insert elements at the very beginning or end of the sequence + const USABLE_SPACE: [usize; 9] = [ + 64 - 2, // 64^1 - 2 + 4096 - 2, // 64^2 - 2 + 262144 - 2, // 64^3 - 2 + 16777216 - 2, // 64^4 - 2 + 1073741824 - 2, // 64^5 - 2 + 68719476736 - 2, // 64^6 - 2 + 4398046511104 - 2, // 64^7 - 2 + 281474976710656 - 2, // 64^8 - 2 + 18014398509481984 - 2, // 64^9 - 2 + ]; + + pub fn new(total_items: usize) -> Result<(u64, Self), SpacingError> { + if total_items == 0 { + return Err(SpacingError::TooManyItems); + } + + // Find the smallest k where 64^k > total_items using the static table + let mut k = 0; + let mut space_size = 0; + + for (index, &size) in Self::USABLE_SPACE.iter().enumerate() { + if size >= total_items { + k = index as u64 + 1; // k is 1-indexed + space_size = size; + break; + } + } + + // If we couldn't find a suitable k, the request is too large + if k == 0 { + return Err(SpacingError::TooManyItems); + } + + // Calculate step size split into integer and fractional parts + let step_size = (space_size as f64) / (total_items as f64); + let step_size_integer = step_size.floor() as u64; + let step_size_error = step_size - step_size_integer as f64; + + Ok(( + k, + EvenSpacingIterator { + remaining_items: total_items, + space_size: space_size.try_into().unwrap(), + next_item: 1, + step_size_integer, + step_size_error, + error_accumulator: 0.0, + }, + )) + } + + // Helper method to convert a position to a sort key + pub fn position_to_key(k: u64, position: u64) -> SortKey { + let mut result = Vec::with_capacity(k as usize); + let mut pos = position; + const BASE: u64 = 64; + + // Fill in digits from least significant to most significant + for _ in 0..k { + // SAFETY: digit is guaranteed 
to be in bounds because:
+            // 1. digit = pos % base where base is 64
+            // 2. ALPHABET has exactly 64 elements
+            // Therefore digit as u64 will always be 0-63
+            let digit = (pos % BASE) as u8;
+            pos /= BASE;
+            result.push(digit);
+        }
+
+        // Reverse to get most significant digit first
+        result.reverse();
+        SortKey::from_numbers(result)
+    }
+}
+
+impl Iterator for EvenSpacingIterator {
+    type Item = u64;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.remaining_items == 0 {
+            return None;
+        }
+
+        if self.next_item > self.space_size {
+            return None;
+        }
+
+        let current_position = self.next_item;
+        self.remaining_items -= 1;
+
+        self.next_item += self.step_size_integer;
+
+        self.error_accumulator += self.step_size_error;
+        if self.error_accumulator >= 1.0 {
+            self.next_item += 1;
+            self.error_accumulator -= 1.0;
+        }
+
+        Some(current_position)
+    }
+}
+
+#[derive(Debug)]
+pub enum SortKeyParseError {
+    InvalidCharacter(char),
+}
+
+impl fmt::Display for SortKeyParseError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            SortKeyParseError::InvalidCharacter(c) => write!(
+                f,
+                "Invalid character '{}' in sort key. 
Expected characters from alphabet: {}",
+                c,
+                String::from_utf8_lossy(ALPHABET)
+            ),
+        }
+    }
+}
+
+impl Error for SortKeyParseError {}
+
+impl FromStr for SortKey {
+    type Err = SortKeyParseError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let numbers = s
+            .bytes()
+            .map(|b| ALPHABET.iter().position(|&x| x == b).map(|pos| pos as u8))
+            .collect::<Option<Vec<u8>>>()
+            .ok_or_else(|| SortKeyParseError::InvalidCharacter(s.chars().next().unwrap()))?;
+        Ok(SortKey { numbers })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rand::rngs::StdRng;
+    use rand::SeedableRng;
+
+    #[test]
+    fn test_compare_lseq() {
+        let a = "a".parse::<SortKey>().unwrap();
+        let b = "b".parse::<SortKey>().unwrap();
+        assert_eq!(a < b, true);
+        assert_eq!(b < a, false);
+        assert_eq!(a < a, false);
+    }
+
+    #[test]
+    fn test_lseq_alloc() {
+        let rng = StdRng::seed_from_u64(42); // Deterministic RNG for testing
+        let mut lseq = LSEQ::new(rng);
+        let id1 = lseq.alloc(None, None);
+        let id2 = lseq.alloc(Some(&id1), None);
+        let id3 = lseq.alloc(Some(&id1), Some(&id2));
+
+        assert!(id1 < id2);
+        assert!(id1 < id3);
+        assert!(id3 < id2);
+    }
+
+    #[test]
+    fn test_position_to_key() {
+        const K: u64 = 2;
+        assert_eq!(EvenSpacingIterator::position_to_key(K, 1).to_string(), "-0");
+    }
+
+    #[test]
+    fn test_even_spacing_4093() {
+        let (k, mut iter) = EvenSpacingIterator::new(4093).unwrap();
+        assert_eq!(k, 2);
+        let mut positions = Vec::new();
+        for pos in iter.by_ref() {
+            // Use by_ref() to borrow instead of consume
+            positions.push(pos);
+        }
+
+        // Print all generated sort keys
+        //println!("\nGenerated sort keys for 62 positions:");
+        //for (i, pos) in positions.iter().enumerate() {
+        //    let key = EvenSpacingIterator::position_to_key(k, *pos);
+        //    println!("Position {}: {} (numeric: {})", i, key, pos);
+        //}
+        println!("{:?}", iter);
+
+        assert_eq!(positions.len(), 4093);
+    }
+
+    #[test]
+    fn test_even_spacing_6() {
+        let (k, mut iter) = EvenSpacingIterator::new(6).unwrap();
+        eprintln!("Created iterator with k={}", k);
+        let
mut positions = Vec::new(); + let mut count = 0; + while let Some(pos) = iter.next() { + count += 1; + eprintln!("Iteration {}: Got position {}", count, pos); + positions.push(pos); + } + eprintln!("Final iterator state: {:?}", iter); + assert_eq!( + positions.len(), + 6, + "Expected 6 positions, got {}", + positions.len() + ); + } +} diff --git a/typescript/.npmignore b/typescript/.npmignore new file mode 100644 index 0000000..743b615 --- /dev/null +++ b/typescript/.npmignore @@ -0,0 +1,5 @@ +tsconfig.json +*.test.ts +*.spec.ts +.gitignore +node_modules/ \ No newline at end of file diff --git a/typescript/README.md b/typescript/README.md new file mode 100644 index 0000000..b70fd2c --- /dev/null +++ b/typescript/README.md @@ -0,0 +1,61 @@ +# @peoplesgrocers/lseq + +TypeScript implementation of the L-SEQ algorithm for fractional indexing and list CRDTs. + +## Installation + +```bash +npm install @peoplesgrocers/lseq +``` + +## Usage + +```typescript +import { LSEQ, compareLSEQ } from '@peoplesgrocers/lseq'; + +// Create a new L-SEQ instance +const lseq = new LSEQ(); + +// Allocate identifiers +const id1 = lseq.alloc(null, null); // First identifier +const id2 = lseq.alloc(id1, null); // After id1 +const id3 = lseq.alloc(id1, id2); // Between id1 and id2 + +// Sort identifiers +const ids = [id3, id1, id2]; +ids.sort(compareLSEQ); +console.log(ids); // [id1, id3, id2] - properly ordered + +// Custom random function (useful for deterministic testing) +const deterministicLSEQ = new LSEQ(() => 0.5); +``` + +## API + +### `LSEQ` + +#### `constructor(random?: () => number)` + +Creates a new L-SEQ instance. + +- `random`: Optional custom random function (defaults to `Math.random`) + +#### `alloc(before: string | null, after: string | null): string` + +Allocates a new identifier between two existing identifiers. 
+ +- `before`: The identifier that should come before the new one (or `null` for beginning) +- `after`: The identifier that should come after the new one (or `null` for end) +- Returns: A new identifier that sorts between `before` and `after` + +### `compareLSEQ(a: string, b: string): number` + +Compares two L-SEQ identifiers for sorting. + +- Returns: `-1` if `a < b`, `1` if `a > b`, `0` if `a === b` + +## How it works + +L-SEQ generates identifiers using a base-64 alphabet that maintains lexicographic ordering. Each identifier is a sequence of characters from this alphabet, and new identifiers are generated by finding space between existing ones at different depths. + +The algorithm uses alternating allocation strategies (bias toward min or max) at different depths to avoid degenerative cases and maintain good performance characteristics. \ No newline at end of file diff --git a/typescript/package.json b/typescript/package.json new file mode 100644 index 0000000..edebf4c --- /dev/null +++ b/typescript/package.json @@ -0,0 +1,36 @@ +{ + "name": "@peoplesgrocers/lseq", + "version": "1.0.0", + "description": "L-SEQ algorithm implementation for fractional indexing and list CRDTs", + "main": "dist/index.js", + "types": "dist/index.d.ts", + "scripts": { + "build": "tsc", + "test": "uvu -r tsx src \\.test\\.ts$", + "prepublishOnly": "npm run build" + }, + "keywords": [ + "lseq", + "crdt", + "fractional-indexing", + "sequence", + "collaborative-editing" + ], + "author": "peoplesgrocers", + "license": "SEE LICENSE IN LICENSE.txt", + "devDependencies": { + "typescript": "^5.0.0", + "uvu": "^0.5.6", + "tsx": "^4.7.0" + }, + "files": [ + "dist/**/*", + "src/**/*", + "README.md" + ], + "repository": { + "type": "git", + "url": "git+https://peoplesgrocers.com/en/forge/peoplesgrocers/lseq.git", + "directory": "typescript" + } +} \ No newline at end of file diff --git a/typescript/src/index.ts b/typescript/src/index.ts new file mode 100644 index 0000000..2121353 --- /dev/null 
+++ b/typescript/src/index.ts
@@ -0,0 +1,72 @@
+const ALPHABET =
+  "-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz";
+
+function idToNumbers(id: string): number[] {
+  const nums = id.split("").map((char) => ALPHABET.indexOf(char));
+  return nums;
+}
+
+type Maybe<T> = T | null;
+
+function isNone<T>(value: Maybe<T>): value is null {
+  return value === null;
+}
+
+export class LSEQ {
+  // true = allocate near min, false = allocate near max
+  private strategies: boolean[];
+  private random: () => number;
+
+  constructor(random: () => number = Math.random) {
+    this.random = random;
+    this.strategies = [random() < 0.5];
+  }
+
+  public alloc(before: Maybe<string>, after: Maybe<string>): string {
+    // Convert to numeric arrays, using boundary values for null
+    const p = isNone(before) ? [0] : idToNumbers(before);
+    const q = isNone(after) ? [63] : idToNumbers(after);
+
+    // Walk through digits looking for space
+    let depth = 0;
+    const result = [];
+    // eslint-disable-next-line no-constant-condition
+    while (true) {
+      const pVal = depth < p.length ? p[depth] : 0;
+      const qVal = depth < q.length ? q[depth] : 63;
+
+      const interval = qVal - pVal;
+
+      // If we have space between values at this depth
+      if (interval > 1) {
+        // Pick a value in the available range
+        const range = interval - 1;
+        const addVal = 1 + Math.floor(this.random() * range);
+        let newValue;
+        if (this.strategies[depth]) {
+          newValue = pVal + addVal;
+        } else {
+          newValue = qVal - addVal;
+        }
+
+        // Take the prefix from p up to depth and append our new value
+        result.push(newValue);
+
+        return result.map((n) => ALPHABET[n]).join("");
+      }
+      result.push(pVal);
+
+      // If values are the same or adjacent at this depth,
+      // continue to next depth
+      depth++;
+      if (depth >= this.strategies.length) {
+        this.strategies.push(this.random() < 0.5);
+      }
+    }
+  }
+}
+
+export function compareLSEQ(a: string, b: string): number {
+  if (a === b) return 0;
+  return a < b ?
-1 : 1; +} diff --git a/typescript/tsconfig.json b/typescript/tsconfig.json new file mode 100644 index 0000000..273116c --- /dev/null +++ b/typescript/tsconfig.json @@ -0,0 +1,18 @@ +{ + "compilerOptions": { + "target": "ES2020", + "module": "commonjs", + "lib": ["ES2020"], + "outDir": "./dist", + "rootDir": "./src", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "declaration": true, + "declarationMap": true, + "sourceMap": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist", "**/*.test.ts"] +} \ No newline at end of file