sochdb_storage/
lib.rs

1// Copyright 2025 Sushanth (https://github.com/sushanthpy)
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! SochDB Storage Layer
16//!
17//! Log-Structured Column Store (LSCS) with transaction-aware WAL for TOON-native data.
18//!
19//! ## Runtime Modes
20//!
21//! This crate supports two runtime modes:
22//!
23//! ### Embedded Sync Mode (like SQLite)
24//!
25//! For embedded deployments without async runtime:
26//!
27//! ```toml
28//! sochdb-storage = { version = "...", default-features = false, features = ["embedded-sync"] }
29//! ```
30//!
31//! Benefits:
32//! - ~500KB smaller binary
33//! - No async runtime overhead
34//! - Simpler embedded integration
35//!
36//! ### Async Mode (default, for servers)
37//!
38//! For server deployments with async I/O:
39//!
40//! ```toml
41//! sochdb-storage = { version = "..." }  # async enabled by default
42//! ```
43//!
44//! Benefits:
45//! - Better scalability for concurrent connections
46//! - Non-blocking I/O for server workloads
47//!
48//! ## Novel Components
49//!
50//! - **LSCS** (`lscs`): Log-Structured Column Store - columnar variant of LSM with
51//!   schema-aware compression and column-aware compaction for reduced write amplification.
52//!
53//! - **Transaction WAL** (`txn_wal`): ACID-compliant Write-Ahead Log with transaction
54//!   boundaries, commit/abort markers, and crash recovery.
55//!
56//! - **StorageEngine Trait** (`storage_engine`): Pluggable storage backend abstraction
57//!   enabling 80% I/O reduction for columnar projections (Task 1).
58//!
59//! - **Page Manager** (`page_manager`): TOON file format with magic header and O(1)
60//!   page allocation (Task 8).
61//!
62//! - **Columnar Compression** (`columnar_compression`): Type-aware encoding with
63//!   dictionary, RLE, and delta compression for 2-4× storage reduction (Task 9).
64//!
65//! ## Utility Components
66//!
67//! - **Bloom Filters** (`bloom`): Probabilistic existence checks
68//! - **Block Checksums** (`block_checksum`): Data integrity validation
69//! - **Compression** (`compression`): LZ4/Zstd compression
70//! - **Sketches** (`sketches`): Approximate algorithms (HyperLogLog, CountMin, DDSketch)
71
72// New TOON-native storage components
73pub mod actor; // Actor-based connection manager (mm.md Task 7.2)
74pub mod aries_recovery; // ARIES-style crash recovery (Task 1)
75pub mod checkpoint; // ARIES-style checkpointing with WAL truncation (mm.md Task 1.4)
76pub mod columnar_compression;
77pub mod database; // Database Kernel (shared by embedded + server)
78pub mod durable_storage; // Fully wired durable storage with MVCC
79pub mod ffi; // FFI bindings for Python SDK
80pub mod group_commit; // Event-driven Group Commit (Task 4)
81pub mod hlc; // Hybrid Logical Clock for commit timestamps (mm.md Task 1.3)
82pub mod hybrid_store; // PAX hybrid row-column storage (mm.md Task 4.1)
83pub mod ipc; // IPC Protocol with multiplexing (mm.md Task 7.1)
84#[cfg(unix)]
85pub mod ipc_server; // Unix Socket IPC Server (Task 3)
86pub mod learned_index_integration;
87pub mod lscs;
88pub mod mvcc_new;
89pub mod mvcc_snapshot;
90pub mod page_manager;
91pub mod production_wal; // Production WAL with ARIES recovery (mm.md Task 3)
92pub mod ssi; // Serializable Snapshot Isolation (Task 2)
93pub mod storage_engine;
94pub mod streaming_iterator; // Streaming Iterator Architecture (mm.md Task 4)
95pub mod transaction; // Unified Transaction Coordinator trait and types
96pub mod txn_arena; // Transaction-scoped arena with zero-copy key/value plumbing
97pub mod txn_wal;
98pub mod wal_integration;
99pub mod zero_copy_safety; // Zero-Copy Validation Layer (Task 5)
100
101// Performance optimization modules
102pub mod adaptive_learned_index;
103pub mod adaptive_memtable; // Adaptive memtable sizing with memory pressure (Task 10)
104pub mod deferred_index; // Deferred sorted index with LSM-style compaction (Rec 2)
105pub mod dirty_tracking; // Batched dirty tracking with MPSC queue
106pub mod index_policy; // Per-table index policy
107pub mod batch_wal; // Batched WAL with vectored I/O (Task 3)
108pub mod key_buffer; // Cache-line aligned key buffer (Task 2)
109pub mod lockfree_memtable; // Lock-free read path with hazard pointers (Task 4)
110pub mod packed_row; // Unified row storage with delta encoding (Task 1)
111
112// PhD-Level Architectural Optimizations (December 2025)
113pub mod clr_learned_index; // CLR Learned Index for sorted runs (Task 3)
114pub mod lockfree_epoch; // Lock-Free Epoch Tracking (Task 3)
115pub mod hierarchical_ts; // Hierarchical Timestamp Oracle (Task 9)
116pub mod shard_coalesced; // Shard-Coalesced Batch DashMap (Task 6)
117pub mod polymorphic_value; // Polymorphic Value Encoding (Task 12)
118pub mod epoch_arena; // Epoch-Partitioned Key Arena (Task 1)
119pub mod stratified_skiplist; // Stratified SkipList with Deferred Promotion (Task 2)
120pub mod columnar_wal; // Columnar WAL Layout (Task 4)
121pub mod generational_slab; // Generational Slab Allocator (Task 5)
122pub mod rl_workload; // RL Workload Classifier (Task 10)
123#[cfg(unix)]
124pub mod io_uring_wal; // io_uring WAL Submission (Task 11)
125
126// New performance modules (Recommendations 1-9)
127pub mod cow_btree; // Copy-on-Write B-Tree for ordered access (Recommendation 5)
128pub mod epoch_mvcc; // Epoch-based MVCC for O(log E) version lookup (Recommendation 7)
129pub mod page_cache; // Application-level page cache with Clock-Pro (Recommendation 8)
130pub mod row_format; // Slot-based columnar row storage (Recommendation 1)
131pub mod tiered_memtable; // Tiered MemTable with deferred sorting (Recommendation 3)
132pub mod tournament_tree; // K-way merge with tournament tree (Task 2)
133pub mod vectorized_scan; // SIMD-accelerated vectorized scan engine (Recommendation 2)
134pub mod zero_copy_serde; // Zero-copy serialization for WAL (Recommendation 6)
135
136// Namespace and multi-tenancy support (Task 3)
137pub mod namespace; // Namespace routing and on-disk layout
138
139// Core utilities
140pub mod backend;
141pub mod backup;
142pub mod block_checksum;
143pub mod bloom;
144pub mod compression;
145pub mod dict_compression;
146pub mod direct_io;
147#[cfg(unix)]
148pub mod io_uring;
149pub mod manifest;
150pub mod memory;
151pub mod parallel_merge;
152pub mod payload;
153pub mod prefetch;
154pub mod sketches;
155pub mod two_level_index;
156pub mod validation;
157pub mod version_store;
158pub mod zero_copy;
159
160// Re-exports for new components
161pub use columnar_compression::{
162    ColumnEncoder, DeltaEncoder, DictionaryEncoder, EncodingStats, EncodingType, RleEncoder,
163};
164pub use learned_index_integration::{
165    HybridIndex, IndexManager, IndexType, KeyStats, PointLookupExecutor,
166};
167pub use lscs::{
168    ColumnDef, ColumnGroup, ColumnType, ColumnarMemtable, Lscs, LscsConfig, LscsRecoveryStats,
169    LscsStats, TableSchema,
170};
171#[allow(deprecated)]
172pub use mvcc_snapshot::{
173    MvccStore, Snapshot as MvccSnapshot, Timestamp, TransactionManager, TxnId, TxnStatus,
174    VersionChain, VersionInfo,
175};
176pub use page_manager::{
177    DEFAULT_PAGE_SIZE, DbHeader, FORMAT_VERSION, FreePageHeader, PageId, PageManager,
178    PageManagerStats, PageType, SOCHDB_MAGIC,
179};
180pub use storage_engine::{
181    ColumnId, ColumnIterator, Row, RowId, StorageEngine, StorageEngineType, StorageStats,
182    TxnHandle, open_storage_engine,
183};
184pub use transaction::{
185    DurabilityLevel, IsolationLevel, RecoveryStats as TxnRecoveryStats, TransactionCoordinator,
186    TransactionHandle,
187};
188pub use txn_wal::{CrashRecoveryStats, TxnWal, TxnWalBuffer, TxnWalEntry, TxnWalStats};
189pub use wal_integration::{
190    GroupCommitBuffer, MvccTransactionManager, RecoveryStats, Transaction, TxnState, 
191    WalStorageManager,
192};
193
194// Re-exports for performance optimization modules
195pub use adaptive_learned_index::{AdaptiveLearnedIndex, LearnedIndexStats, PiecewiseLinearModel};
196pub use adaptive_memtable::{
197    AdaptiveMemtableConfig, AdaptiveMemtableSizer, AdaptiveMemtableStats,
198    DEFAULT_BASE_SIZE, MAX_MEMTABLE_SIZE, MIN_MEMTABLE_SIZE,
199};
200pub use batch_wal::{
201    BatchAccumulator, BatchedWalReader, BatchedWalStats, BatchedWalWriter, ConcurrentBatchedWal,
202    DEFAULT_MAX_BATCH_BYTES, DEFAULT_MAX_BATCH_SIZE,
203};
204pub use clr_learned_index::{ClrIndex, ClrLookupResult, ClrStats, IndexedSortedRun};
205pub use key_buffer::{
206    ArenaKey,
207    ArenaKeyHandle,
208    BatchKeyGenerator,
209    InternedTablePrefix,
210    // Arena allocation for high-throughput key operations
211    KeyArena,
212    KeyBuffer,
213    MAX_KEY_LENGTH,
214};
215pub use lockfree_memtable::{
216    HazardDomain,
217    INLINE_VALUE_SIZE,
218    LockFreeMemTable,
219    LockFreeVersion,
220    LockFreeVersionChain,
221    // Inline value storage for reduced memory indirection
222    ValueStorage,
223};
224pub use packed_row::{
225    PackedColumnDef, PackedColumnType, PackedRow, PackedRowBuilder, PackedTableSchema,
226};
227
228// Re-exports for utilities
229pub use backend::{LocalFsBackend, ObjectMetadata, StorageBackend};
230pub use backup::{BackupManager, BackupMetadata};
231pub use block_checksum::{
232    BlockChecksumConfig, BlockChecksumStats, BlockType as BlockChecksumType, BlockWriter, ChecksummedBlock,
233};
234pub use bloom::{BlockedBloomFilter, BloomFilter, LevelAdaptiveFPR, UnifiedBloomFilter};
235pub use compression::{CompressionEngine, CompressionStats, StorageTier};
236pub use manifest::{FileMetadata, LsmState, Manifest, VersionEdit};
237pub use memory::{MemoryBudget, MemoryTracker, WriteBufferManager, WriteBufferStats};
238pub use mvcc_new::{
239    ColumnGroupRef, ReadVersion, Snapshot, SnapshotGuard, VersionGuard, VersionSet,
240    VersionSetStats, VersionSetStatsSnapshot,
241};
242pub use payload::{CompressionType, PayloadStats, PayloadStore};
243pub use sketches::{AdaptiveSketch, CountMinSketch, DDSketch, ExponentialHistogram, HyperLogLog};
244pub use two_level_index::{
245    BlockIndexEntry, BlockIndexReader, FencePointer, TemporalKey, TwoLevelIndex,
246};
247pub use validation::{SSTableValidator, validate_sstable_file};
248
249// Re-exports for durable storage
250pub use durable_storage::{ArenaMvccMemTable, DurableStorage, MvccMemTable, TransactionMode};
251
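252// Super Version and Copy-on-Write Version Set (mm.md Task 1), plus the concurrent ART,
252// SSTable, WAL segment, compaction policy, and optimized scan modules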
253pub mod version_set;
254pub mod concurrent_art;
255pub mod sstable;
256pub mod wal_segment;
257pub mod compaction_policy;
258pub mod optimized_scan;
259
260// Re-exports for new performance modules (Recommendations 1-9)
261pub use version_set::{
262    FileMetadata as VersionFileMetadata, ImmutableMemTable, ImmutableMemTableRef,
263    LevelMetadata, SuperVersion, SuperVersionHandle, VersionSet as CowVersionSet,
264};
265pub use concurrent_art::ConcurrentART;
266pub use sstable::{
267    BlockBuilder, BlockIterator, BlockHandle, BlockType,
268    FilterPolicy, BloomFilterPolicy, RibbonFilterPolicy, XorFilterPolicy, FilterReader,
269    SSTableFormat, Header, Footer, Section, SectionType,
270    SSTableBuilder, SSTableBuilderOptions, SSTableBuilderResult,
271    SSTable, TableMetadata, ReadOptions, BlockCache,
272};
273pub use wal_segment::{
274    WalSegmentManager, SegmentConfig, SegmentHeader, SegmentMetadata,
275    CheckpointRecord, SegmentStats, RecoveryIterator, WalEntry,
276};
277pub use compaction_policy::{
278    CompactionConfig, CompactionFile, CompactionJob, CompactionPicker,
279    CompactionPriority, CompactionReason, CompactionState, CompactionStats,
280    CompactionStrategy, LeveledCompactionPicker, RetentionConfig,
281    UniversalCompactionPicker, VersionPruner,
282};
283pub use optimized_scan::{
284    EntrySource, FileRange, LevelFiles, RangeScanner, ScanConfig, ScanStats,
285    TournamentTree, VersionedEntry,
286};
287pub use cow_btree::{BTreeEntry, BTreeSnapshot, CowBTree, Node, SearchResult};
288pub use epoch_mvcc::{
289    CommitResult, EpochManager, EpochMvccStore, EpochSnapshot, EpochTransaction,
290    EpochVersionChain, GcStats, StoreStats, VersionEntry,
291};
292pub use page_cache::{CacheStats, ClockProCache, CachedPage, PageId as CachePageId, PageState};
293pub use row_format::{Slot, SlotRow, SlotRowArena, SlotRowHandle, SlotRowFlags};
294pub use tiered_memtable::{HotEntry, SortedBatch, TieredMemTable};
295pub use vectorized_scan::{
296    ColumnVector, ComparisonOp, Int64Comparison, VectorBatch, VectorPredicate,
297    VectorizedScanConfig, VectorizedScanStats, DEFAULT_BATCH_SIZE,
298    // SoA + Late Materialization (80/20 optimization)
299    SimdVisibilityFilter, SoaBatch, SoaScanIterator, SoaScanStats, SoaSource,
300    StreamingScanIterator, ValueHandle, VersionedSlice,
301};
302pub use zero_copy_serde::{
303    FieldDescriptor, MmapWalReader, SerdeStats, WalBatchReader, WalBatchWriter,
304    WalEntryBuilder, WalEntryHeader, WalEntryReader, WalEntryType, ZeroCopyHeader,
305    FORMAT_VERSION as SERDE_FORMAT_VERSION, HEADER_SIZE as SERDE_HEADER_SIZE, ZERO_COPY_MAGIC,
306};
307
308// Re-exports for transaction arena and zero-copy plumbing
309pub use txn_arena::{
310    ArenaWriteSet, BytesRef, KeyFingerprint, TxnArena, TxnWriteBuffer, WriteOp,
311};
312
313// Re-exports for dirty tracking with batching
314pub use dirty_tracking::{
315    BatchedDirtyTracker, DirtyEvent, DirtyTrackingStats, TxnDirtyBuffer,
316};
317
318// Re-exports for per-table index policy
319pub use index_policy::{
320    BalancedTableIndex, IndexPolicy, SortedRun, TableIndexConfig, TableIndexRegistry,
321};
322
323// Re-exports for database kernel
324pub use database::{
325    ColumnDef as DbColumnDef,
326    ColumnType as DbColumnType,
327    ColumnarQueryResult, // SIMD-friendly columnar result format
328    Database,
329    DatabaseConfig,
330    GroupCommitSettings,
331    QueryBuilder,
332    QueryResult,
333    QueryRowIterator,
334    RecoveryStats as DbRecoveryStats,
335    Stats as DbStats,
336    SyncMode,
337    TableSchema as DbTableSchema,
338    TxnHandle as KernelTxnHandle,
339    VectorSearchResult,
340};