1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
//! Persistent Adaptive Radix Trie (PART) Dictionary
//!
//! This module implements a disk-based dictionary using a hybrid of:
//! - **Adaptive Radix Tree (ART)**: For optimal trie traversal with adaptive node sizes
//! - **B-trie Buckets**: For efficient leaf storage with multiple strings per disk page
//!
//! # Architecture
//!
//! ```text
//! ┌─────────────────────────────────────────────────────────────┐
//! │ PersistentARTrie<V> │
//! ├─────────────────────────────────────────────────────────────┤
//! │ Index Layer (ART) │
//! │ - Node4: 1-4 children (linear scan) │
//! │ - Node16: 5-16 children (SIMD accelerated) │
//! │ - Node48: 17-48 children (index array) │
//! │ - Node256: 49-256 children (direct array) │
//! ├─────────────────────────────────────────────────────────────┤
//! │ Leaf Layer (B-trie Buckets) │
//! │ - StringBucket: 8KB pages with multiple strings │
//! │ - Binary search within buckets │
//! │ - B-trie style splits when full │
//! ├─────────────────────────────────────────────────────────────┤
//! │ Storage Layer │
//! │ - BufferManager: LRU cache with Clock eviction │
//! │ - DiskManager: Memory-mapped 256KB blocks │
//! │ - WAL: Write-ahead logging for crash recovery │
//! └─────────────────────────────────────────────────────────────┘
//! ```
//!
//! # Key Features
//!
//! - **Pointer Swizzling**: Lazy loading - nodes start as disk references,
//! swizzled to memory pointers on access
//! - **Path Compression**: Up to 12 bytes of prefix stored inline
//! - **Adaptive Node Sizes**: Optimal memory/performance trade-off
//! - **SIMD Acceleration**: SSE4.1 for Node16 key lookup
//! - **Crash Recovery**: WAL with redo-only recovery
//!
//! # Usage
//!
//! ```text
//! use libdictenstein::persistent_artrie::PersistentARTrie;
//!
//! // Create a new persistent dictionary
//! let mut dict = PersistentARTrie::create("words.part")?;
//!
//! // Insert terms
//! dict.insert("hello", ())?;
//! dict.insert("world", ())?;
//!
//! // Query with Levenshtein automaton
//! let transducer = Transducer::new(&dict, Algorithm::Standard);
//! for result in transducer.query("helo", 1) {
//! println!("{}: distance {}", result.term, result.distance);
//! }
//! ```
//!
//! # Feature Flag
//!
//! Enable with the `persistent-artrie` feature:
//!
//! ```toml
//! [dependencies]
//! liblevenshtein = { version = "0.8", features = ["persistent-artrie"] }
//! ```
//!
//! # References
//!
//! - [B-tries for disk-based string management](https://link.springer.com/article/10.1007/s00778-008-0094-1)
//! (Askitis & Zobel, VLDBJ 2009)
//! - [The Adaptive Radix Tree](https://db.in.tum.de/~leis/papers/ART.pdf)
//! (Leis et al., ICDE 2013)
//! - [Persistent Storage of ART in DuckDB](https://duckdb.org/2022/07/27/art-storage)
//! (DuckDB, 2022)
//!
//! # ACID Guarantees
//!
//! This implementation provides ACID properties for reliable persistent storage:
//!
//! ## Atomicity
//!
//! - **Single Operations**: Individual insert/remove operations are atomic
//! - **Document Transactions**: Multi-term transactions via [`DocumentTransaction`] provide
//! all-or-nothing semantics - either all terms are committed or none are
//! - **WAL Logging**: Operations are logged to WAL before application, ensuring atomicity
//! across crashes
//!
//! ## Consistency
//!
//! - **Trie Invariants**: ART structure invariants (node types, child counts, path compression)
//! are maintained after every operation
//! - **CRC32 Checksums**: WAL records include CRC32 checksums for integrity verification
//! - **Recovery Validation**: Crash recovery validates checksums and replays only valid records
//!
//! ## Isolation
//!
//! The implementation uses read-write locks for thread safety:
//!
//! | Isolation Level | Dirty Read | Non-Repeatable Read | Phantom Read |
//! |-----------------|------------|---------------------|--------------|
//! | RwLock (default)| No | No | No |
//! | MVCC-Lite | No | No | Possible* |
//!
//! *MVCC-Lite uses epoch-based snapshots for reads, allowing concurrent writes.
//! Phantoms are possible if new terms are inserted between snapshot creation and read.
//!
//! ## Durability
//!
//! Durability is controlled by [`DurabilityPolicy`]:
//!
//! | Policy | fsync Behavior | Guarantee | Use Case |
//! |-------------|---------------------|--------------|----------------------------------|
//! | `Immediate` | Before public mutation/commit acknowledgement | Full | ACID compliance (default) |
//! | `GroupCommit` | Batched by coordinator or blocking sync fallback | Full | High-throughput workloads |
//! | `Periodic` | At checkpoints only | Bounded loss | Performance-critical |
//! | `None` | Never | None | Testing only |
//!
//! The default `Immediate` policy ensures that acknowledged public mutations and
//! committed transactions are durable on disk. The `GroupCommit` policy batches
//! fsync calls when a coordinator is installed and otherwise falls back to a
//! blocking sync to preserve full durability acknowledgements. `Periodic` trades
//! some durability for performance.
// Core modules (storage foundation)
//
// `error` has been relocated to `crate::persistent_artrie_core::error`; it is
// re-exported here under its original path so existing call-sites work
// unchanged after the core extraction.
pub use crateerror;
pub use crateswizzled_ptr;
// Block storage abstraction + memory-mapped and io_uring backends (relocated to core).
pub use crateblock_storage;
pub use cratedisk_manager;
pub use crateio_uring_disk_manager;
pub use cratebuffer_manager;
// Arena allocation for efficient node storage
// Compact variable-width encoding
//
// Relocated to `crate::persistent_artrie_core::compact_encoding`; re-exported
// here for backward-compatible call-sites.
pub use cratecompact_encoding;
// ART node types
// Path compression operations
// Compaction config/stats/progress (Phase-5 split out of dict_impl).
// Document transaction data types (Phase-5 split out of dict_impl).
// Parallel-merge extension trait (Phase-5 split out of dict_impl).
// Lock-free CAS cluster (Phase-5 split out of dict_impl).
// Thin production-write-path router for the lock-free overlay (the SOLE
// representation since L3.3). The byte seam impl of the shared
// `LockFreeOverlay<ByteKey, V, S>` trait lives here: `route_overlay()` + the
// overlay publishers (the owned tree is deleted).
pub
// M2a byte LockFreeOverlay correspondence + reestablish round-trip (in-crate
// because the read-engine skins + the `LockFreeOverlay` trait are `pub(crate)`).
// Byte read/write routing + reject correspondence tests (in-crate: the routed
// public reads/writes are exercised against the overlay, the SOLE representation —
// every constructor installs it, so route_overlay() is universally true).
// Phase 5/6 byte overlay-eviction correspondence tests (OE3/OE5/OE8 twins of char's
// `overlay_eviction_driver_correspondence`). In-crate because they drive the lifted
// `pub(crate)` `OverlayEvictable` primitives + the `pub(crate)` `evict_overlay_nodes`
// byte driver + the `bench_*` eviction surface, and inspect the overlay-internal state
// (an OnDisk overlay child after eviction, the M-2a stamp 1c guard).
// Per-monomorph value-route for the byte counter increment (increment_bytes) — the
// byte twin of char's `lockfree_value_route`. SAFE `Any` dispatch to the `<u64, S>`
// durable primitives; `None` for non-u64 `V` (caller runs the general overlay
// value-CAS path).
pub
// Document-transaction execution methods (Phase-5 split out of dict_impl).
// Atomic read-modify-write operations (Phase-5 split out of dict_impl).
// ARTrie + EvictableARTrie trait impls for SharedARTrie<V> (Phase-5 split out of dict_impl).
// Public iteration API (iter / iter_strings / iter_prefix wrappers).
// IoUringDiskManager-specific constructors (Phase-5 split out of dict_impl).
// Dictionary / MappedDictionary / Debug trait impls (Phase-5 split out of dict_impl).
// PersistentARTrie::compact (file-rewrite compaction) — Phase-5 split out of dict_impl.
// Persistence/durability/stats public API (Phase-5 split out of dict_impl).
// Byte seam impl of the shared OverlayCheckpoint route-split (M2b). The overlay is
// the SOLE representation (`route_overlay()` universally true), so the route-split
// always runs the overlay arm.
pub
// Byte overlay fault-in primitive (`load_overlay_node_from_disk`) + the SAFE
// `OverlayFaulter` impl that lets the overlay-backed `DictionaryNode` resolve
// `Child::OnDisk` overlay children during a graph walk. F7 BLOCKER-1.
pub
// MmapDiskManager-specific constructors (Phase-5 split out of dict_impl).
// Public mutation API (insert/remove/batch wrappers) — Phase-5 split out of dict_impl.
// Disk-loading helpers (load_root_from_disk + variants) — Phase-5 split out of dict_impl.
// Merge API (merge_from/merge_replace/merge_from_batched*) — Phase-5 split out of dict_impl.
// Disk-ref resolution + prefetch helpers — Phase-5 split out of dict_impl.
// On-disk serialization helpers (persist_to_disk + serialize_*) — Phase-5 split out of dict_impl.
// Arena-aware prefix iteration (navigate_to_prefix_with_arena, collect_terms_with_arena,
// iter_prefix_with_arena, iter_prefix_with_values_and_arena) — Phase-5 split out of dict_impl.
// Cursor-based prefix iteration (iter_prefix_from_cursor) — Phase-5 split.
// F5 (Slice 3): the direct dense→overlay reopen loader — `load_root_immutable`
// (eager-load owned + iterative walk-converter owned→overlay). Gated OFF by default
// (`LockFreeOverlay::USE_F5_REOPEN_LOADER`); see `docs/design/slice3-f5-loader-impl.md`.
pub
// Page-aware prefix-iteration result types (Phase-5 split out of dict_impl).
// DFS iterators (TermIterator + TermValueIterator + IterState).
// Node serialization
// B-trie string buckets
// Bucket ↔ ART transitions
// Dictionary node implementation
// Dictionary trait implementation
// Zipper implementation
// Write-ahead log for crash recovery (relocated to core)
pub use cratewal;
// WAL management trait for shared WAL operations
pub use cratewal_managed;
// Crash recovery (relocated to core)
pub use craterecovery;
// Epoch-based checkpoint metadata/tracking (relocated to core)
pub use crateepoch;
// Group commit for WAL batching (relocated to core)
pub use crategroup_commit;
// Prefetching for DFS traversal
pub use crateprefetch;
// Concurrency controls - optimistic lock coupling (relocated to core)
pub use crateconcurrency;
// Traversal context for block caching
// Dirty tracking for incremental checkpoints (relocated to core)
pub use cratedirty_tracker;
// Hash-based deduplication for space efficiency
// Relative offset encoding for space-efficient child pointers
// Memory pressure monitoring for proactive eviction
pub use cratememory_monitor;
// Memory pressure-driven node eviction (relocated to core)
pub use crateeviction;
// Adaptive buffer pool sizing
pub use crateadaptive_pool;
// Per-node logging for near-instant recovery
// Version-based checkpoint management
pub use crateversion_checkpoint;
// MVCC-lite read transactions
// Version garbage collection
pub use crateversion_gc;
// Re-exports for convenience
pub use ;
pub use ;
pub use ;
// Bucket types
pub use ;
// Transition types
//
// L3.3c: the owned bucket↔ART transition surface (`art_node_to_bucket`,
// `bucket_to_art_node`, `should_convert_bucket_to_art`, `should_merge_art_to_bucket`,
// `ArtToBucketResult`, `BucketToArtResult`, `TransitionError`) was deleted with the
// owned tree. `ChildNode` (the disk-decode child pointer) survives.
pub use ChildNode;
// Node types
pub use PersistentARTrieNode;
// Dictionary types
pub use ;
// Parallel merge extension trait
pub use SharedARTrieParallelExt;
/// Thread-safe handle for `PersistentARTrie`.
///
/// **F4 (the lock collapse):** this is now a bare `Arc<PersistentARTrie<V,S>>` —
/// the outer `RwLock` is DELETED. Overlay reads AND writes are fully lock-free;
/// the only operations that still need mutual exclusion take dedicated inner locks
/// (`checkpoint_lock`, the wrapped `root` `RwLock` for the dormant owned path, the
/// `eviction_coordinator` `Mutex`, `merge_lock`) — never the handle. A
/// backward-compatible `.read()`/`.write()` API is preserved by
/// [`SharedTrieAccess`](crate::persistent_artrie_core::shared_access::SharedTrieAccess)
/// (both return a transparent guard that derefs to `&T`; there is no lock), so the
/// ~270 historical call sites and the `liblevenshtein-rust` sibling compile
/// unchanged against the collapsed type.
pub type SharedARTrie<V, S = MmapDiskManager> = Arc;
pub use crateSharedTrieAccess;
// F4: the concrete `.read()/.write()` shim impl on the collapsed byte handle.
// CONCRETE (not a blanket `Arc<T>`) so it never shadows the inherent
// `RwLock::{read,write}` on the crate's `Arc<RwLock<…>>` manager handles.
// Arena-aware iteration types
pub use ;
// Per-document transaction types
pub use ;
// Compaction types
pub use ;
// Zipper types
pub use PersistentARTrieZipper;
pub use ;
pub use ;
pub use ;
pub use IoUringDiskManager;
// Arena types
pub use ;
pub use ;
// Compact encoding types
pub use ;
// WAL types
pub use ;
// Async WAL types for concurrent writes during sync
pub use ;
// io_uring-based WAL fsync backend (Linux-only, requires `io-uring-backend` feature)
pub use IoUringFsync;
// WAL management trait for shared WAL operations
pub use ;
// Group commit types (opt-in feature for slower storage)
pub use ;
// Recovery types
pub use ;
// Epoch-based checkpointing types
pub use ;
// Prefetch types
pub use ;
// Concurrency types
pub use ;
// Traversal context types
pub use ;
// Dirty tracker types
pub use ;
// Deduplication types
pub use ;
// Relative encoding types
pub use ;
// Memory pressure monitoring types
pub use ;
// Adaptive buffer pool sizing types
pub use ;
// Eviction types for bounded-memory operation
pub use ;
// Per-node logging types
pub use ;
// Version checkpoint types (Phase 7)
pub use ;
// MVCC-lite read transaction types (Phase 8)
pub use ;
// Version garbage collection types (Phase 9)
pub use ;
/// Maximum key length supported (64KB - 1)
pub const MAX_KEY_LENGTH: usize = 65535;
/// Maximum value size supported (limited by bucket size)
pub const MAX_VALUE_SIZE: usize = 8192;
/// Default buffer pool size (256 pages = 64MB)
pub const DEFAULT_BUFFER_POOL_SIZE: usize = 256;
/// B-trie bucket page size (8KB)
pub const BUCKET_PAGE_SIZE: usize = 8192;
/// Maximum entries per bucket before split
pub const MAX_BUCKET_ENTRIES: usize = 256;
/// Path compression prefix maximum length
pub const MAX_PREFIX_LENGTH: usize = 12;