sochdb_storage/
lib.rs

1// Copyright 2025 Sushanth (https://github.com/sushanthpy)
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! SochDB Storage Layer
16//!
17//! Log-Structured Column Store (LSCS) with transaction-aware WAL for TOON-native data.
18//!
19//! ## Runtime Modes
20//!
21//! This crate supports two runtime modes:
22//!
23//! ### Embedded Sync Mode (like SQLite)
24//!
25//! For embedded deployments without an async runtime:
26//!
27//! ```toml
28//! sochdb-storage = { version = "...", default-features = false, features = ["embedded-sync"] }
29//! ```
30//!
31//! Benefits:
32//! - ~500KB smaller binary
33//! - No async runtime overhead
34//! - Simpler embedded integration
35//!
36//! ### Async Mode (default, for servers)
37//!
38//! For server deployments with async I/O:
39//!
40//! ```toml
41//! sochdb-storage = { version = "..." }  # async enabled by default
42//! ```
43//!
44//! Benefits:
45//! - Better scalability for concurrent connections
46//! - Non-blocking I/O for server workloads
47//!
48//! ## Novel Components
49//!
50//! - **LSCS** (`lscs`): Log-Structured Column Store - columnar variant of LSM with
51//!   schema-aware compression and column-aware compaction for reduced write amplification.
52//!
53//! - **Transaction WAL** (`txn_wal`): ACID-compliant Write-Ahead Log with transaction
54//!   boundaries, commit/abort markers, and crash recovery.
55//!
56//! - **StorageEngine Trait** (`storage_engine`): Pluggable storage backend abstraction
57//!   enabling 80% I/O reduction for columnar projections (Task 1).
58//!
59//! - **Page Manager** (`page_manager`): TOON file format with magic header and O(1)
60//!   page allocation (Task 8).
61//!
62//! - **Columnar Compression** (`columnar_compression`): Type-aware encoding with
63//!   dictionary, RLE, and delta compression for 2-4× storage reduction (Task 9).
64//!
65//! ## Utility Components
66//!
67//! - **Bloom Filters** (`bloom`): Probabilistic existence checks
68//! - **Block Checksums** (`block_checksum`): Data integrity validation
69//! - **Compression** (`compression`): LZ4/Zstd compression
70//! - **Sketches** (`sketches`): Approximate algorithms (HyperLogLog, CountMin, DDSketch)
71
72// New TOON-native storage components
73pub mod actor; // Actor-based connection manager (mm.md Task 7.2)
74pub mod aries_recovery; // ARIES-style crash recovery (Task 1)
75pub mod checkpoint; // ARIES-style checkpointing with WAL truncation (mm.md Task 1.4)
76pub mod columnar_compression; // Type-aware columnar encoding: dictionary, RLE, delta (Task 9)
77pub mod database; // Database Kernel (shared by embedded + server)
78pub mod durable_storage; // Fully wired durable storage with MVCC
79pub mod ffi; // FFI bindings for Python SDK
80pub mod group_commit; // Event-driven Group Commit (Task 4)
81pub mod hlc; // Hybrid Logical Clock for commit timestamps (mm.md Task 1.3)
82pub mod hybrid_store; // PAX hybrid row-column storage (mm.md Task 4.1)
83pub mod ipc; // IPC Protocol with multiplexing (mm.md Task 7.1)
84#[cfg(unix)]
85pub mod ipc_server; // Unix Socket IPC Server (Task 3)
86pub mod learned_index_integration;
87pub mod lock; // Advisory file locking for database exclusivity
88pub mod lscs; // Log-Structured Column Store (LSCS)
89pub mod mvcc_new;
90pub mod mvcc_snapshot;
91pub mod page_manager; // TOON file format with magic header and page allocation (Task 8)
92pub mod production_wal; // Production WAL with ARIES recovery (mm.md Task 3)
93pub mod ssi; // Serializable Snapshot Isolation (Task 2)
94pub mod storage_engine; // Pluggable storage backend abstraction (Task 1)
95pub mod streaming_iterator; // Streaming Iterator Architecture (mm.md Task 4)
96pub mod transaction; // Unified Transaction Coordinator trait and types
97pub mod txn_arena; // Transaction-scoped arena with zero-copy key/value plumbing
98pub mod txn_wal; // Transaction WAL with commit/abort markers and crash recovery
99pub mod wal_fencing; // Epoch-based WAL fencing for split-brain detection
100pub mod wal_integration;
101pub mod zero_copy_safety; // Zero-Copy Validation Layer (Task 5)
102
103// Performance optimization modules
104pub mod adaptive_learned_index;
105pub mod adaptive_memtable; // Adaptive memtable sizing with memory pressure (Task 10)
106pub mod deferred_index; // Deferred sorted index with LSM-style compaction (Rec 2)
107pub mod dirty_tracking; // Batched dirty tracking with MPSC queue
108pub mod index_policy; // Per-table index policy
109pub mod queue_index; // Queue-optimized index structure (Task: Queue Index Policy)
110pub mod batch_wal; // Batched WAL with vectored I/O (Task 3)
111pub mod key_buffer; // Cache-line aligned key buffer (Task 2)
112pub mod lockfree_memtable; // Lock-free read path with hazard pointers (Task 4)
113pub mod packed_row; // Unified row storage with delta encoding (Task 1)
114
115// PhD-Level Architectural Optimizations (December 2025)
116pub mod clr_learned_index; // CLR Learned Index for sorted runs (Task 3)
117pub mod lockfree_epoch; // Lock-Free Epoch Tracking (Task 3)
118pub mod hierarchical_ts; // Hierarchical Timestamp Oracle (Task 9)
119pub mod shard_coalesced; // Shard-Coalesced Batch DashMap (Task 6)
120pub mod polymorphic_value; // Polymorphic Value Encoding (Task 12)
121pub mod epoch_arena; // Epoch-Partitioned Key Arena (Task 1)
122pub mod stratified_skiplist; // Stratified SkipList with Deferred Promotion (Task 2)
123pub mod columnar_wal; // Columnar WAL Layout (Task 4)
124pub mod generational_slab; // Generational Slab Allocator (Task 5)
125pub mod rl_workload; // RL Workload Classifier (Task 10)
// NOTE(review): io_uring is a Linux-only kernel interface, but this gate is `cfg(unix)`,
// which also matches macOS/BSD — confirm the module gates or stubs itself on non-Linux targets.
126#[cfg(unix)]
127pub mod io_uring_wal; // io_uring WAL Submission (Task 11)
128
129// New performance modules (Recommendations 1-9)
130pub mod cow_btree; // Copy-on-Write B-Tree for ordered access (Recommendation 5)
131pub mod epoch_mvcc; // Epoch-based MVCC for O(log E) version lookup (Recommendation 7)
132pub mod page_cache; // Application-level page cache with Clock-Pro (Recommendation 8)
133pub mod row_format; // Slot-based columnar row storage (Recommendation 1)
134pub mod tiered_memtable; // Tiered MemTable with deferred sorting (Recommendation 3)
135pub mod tournament_tree; // K-way merge with tournament tree (Task 2)
136pub mod vectorized_scan; // SIMD-accelerated vectorized scan engine (Recommendation 2)
137pub mod zero_copy_serde; // Zero-copy serialization for WAL (Recommendation 6)
138
139// Namespace and multi-tenancy support (Task 3)
140pub mod namespace; // Namespace routing and on-disk layout
141
142// Core utilities
143pub mod backend;
144pub mod backup;
145pub mod block_checksum;
146pub mod bloom;
147pub mod compression;
148pub mod dict_compression;
149pub mod direct_io;
// NOTE(review): io_uring is a Linux-only kernel interface, but this gate is `cfg(unix)`,
// which also matches macOS/BSD — confirm the module gates or stubs itself on non-Linux targets.
150#[cfg(unix)]
151pub mod io_uring;
152pub mod manifest;
153pub mod memory;
154pub mod parallel_merge;
155pub mod payload;
156pub mod prefetch;
157pub mod sketches;
158pub mod two_level_index;
159pub mod validation;
160pub mod version_store;
161pub mod zero_copy;
162
163// Re-exports for new components
164pub use columnar_compression::{
165    ColumnEncoder, DeltaEncoder, DictionaryEncoder, EncodingStats, EncodingType, RleEncoder,
166};
167pub use learned_index_integration::{
168    HybridIndex, IndexManager, IndexType, KeyStats, PointLookupExecutor,
169};
170pub use lscs::{
171    ColumnDef, ColumnGroup, ColumnType, ColumnarMemtable, Lscs, LscsConfig, LscsRecoveryStats,
172    LscsStats, TableSchema,
173};
// NOTE(review): `allow(deprecated)` suggests at least one item re-exported from `mvcc_snapshot`
// is marked `#[deprecated]`; presumably kept for backward compatibility — confirm and name it.
174#[allow(deprecated)]
175pub use mvcc_snapshot::{
176    MvccStore, Snapshot as MvccSnapshot, Timestamp, TransactionManager, TxnId, TxnStatus,
177    VersionChain, VersionInfo,
178};
179pub use page_manager::{
180    DEFAULT_PAGE_SIZE, DbHeader, FORMAT_VERSION, FreePageHeader, PageId, PageManager,
181    PageManagerStats, PageType, SOCHDB_MAGIC,
182};
183pub use storage_engine::{
184    ColumnId, ColumnIterator, Row, RowId, StorageEngine, StorageEngineType, StorageStats,
185    TxnHandle, open_storage_engine,
186};
187pub use transaction::{
188    DurabilityLevel, IsolationLevel, RecoveryStats as TxnRecoveryStats, TransactionCoordinator,
189    TransactionHandle,
190};
191pub use txn_wal::{CrashRecoveryStats, TxnWal, TxnWalBuffer, TxnWalEntry, TxnWalStats};
192pub use wal_integration::{
193    GroupCommitBuffer, MvccTransactionManager, RecoveryStats, Transaction, TxnState, 
194    WalStorageManager,
195};
196
197// Re-exports for performance optimization modules
198pub use adaptive_learned_index::{AdaptiveLearnedIndex, LearnedIndexStats, PiecewiseLinearModel};
199pub use adaptive_memtable::{
200    AdaptiveMemtableConfig, AdaptiveMemtableSizer, AdaptiveMemtableStats,
201    DEFAULT_BASE_SIZE, MAX_MEMTABLE_SIZE, MIN_MEMTABLE_SIZE,
202};
203pub use batch_wal::{
204    BatchAccumulator, BatchedWalReader, BatchedWalStats, BatchedWalWriter, ConcurrentBatchedWal,
205    DEFAULT_MAX_BATCH_BYTES, DEFAULT_MAX_BATCH_SIZE,
206};
207pub use clr_learned_index::{ClrIndex, ClrLookupResult, ClrStats, IndexedSortedRun};
208pub use key_buffer::{
209    ArenaKey,
210    ArenaKeyHandle,
211    BatchKeyGenerator,
212    InternedTablePrefix,
213    // Arena allocation for high-throughput key operations
214    KeyArena,
215    KeyBuffer,
216    MAX_KEY_LENGTH,
217};
218pub use lockfree_memtable::{
219    HazardDomain,
220    INLINE_VALUE_SIZE,
221    LockFreeMemTable,
222    LockFreeVersion,
223    LockFreeVersionChain,
224    // Inline value storage for reduced memory indirection
225    ValueStorage,
226};
227pub use packed_row::{
228    PackedColumnDef, PackedColumnType, PackedRow, PackedRowBuilder, PackedTableSchema,
229};
230
231// Re-exports for utilities
232pub use backend::{LocalFsBackend, ObjectMetadata, StorageBackend};
233pub use backup::{BackupManager, BackupMetadata};
234pub use block_checksum::{
235    BlockChecksumConfig, BlockChecksumStats, BlockType as BlockChecksumType, BlockWriter, ChecksummedBlock,
236};
237pub use bloom::{BlockedBloomFilter, BloomFilter, LevelAdaptiveFPR, UnifiedBloomFilter};
238pub use compression::{CompressionEngine, CompressionStats, StorageTier};
239pub use manifest::{FileMetadata, LsmState, Manifest, VersionEdit};
240pub use memory::{MemoryBudget, MemoryTracker, WriteBufferManager, WriteBufferStats};
241pub use mvcc_new::{
242    ColumnGroupRef, ReadVersion, Snapshot, SnapshotGuard, VersionGuard, VersionSet,
243    VersionSetStats, VersionSetStatsSnapshot,
244};
245pub use payload::{CompressionType, PayloadStats, PayloadStore};
246pub use sketches::{AdaptiveSketch, CountMinSketch, DDSketch, ExponentialHistogram, HyperLogLog};
247pub use two_level_index::{
248    BlockIndexEntry, BlockIndexReader, FencePointer, TemporalKey, TwoLevelIndex,
249};
250pub use validation::{SSTableValidator, validate_sstable_file};
251
252// Re-exports for durable storage
253pub use durable_storage::{ArenaMvccMemTable, DurableStorage, MvccMemTable, TransactionMode};
254
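255// Super Version and Copy-on-Write Version Set (mm.md Task 1), declared together with the
// concurrent ART, SSTable, WAL segment, compaction policy, and optimized scan modules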
256pub mod version_set;
257pub mod concurrent_art;
258pub mod sstable;
259pub mod wal_segment;
260pub mod compaction_policy;
261pub mod optimized_scan;
262
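263// Re-exports for the modules declared directly above (version_set, concurrent_art, sstable,
// wal_segment, compaction_policy, optimized_scan) and for the new performance modules
// (Recommendations 1-9)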
264pub use version_set::{
265    FileMetadata as VersionFileMetadata, ImmutableMemTable, ImmutableMemTableRef,
266    LevelMetadata, SuperVersion, SuperVersionHandle, VersionSet as CowVersionSet,
267};
268pub use concurrent_art::ConcurrentART;
269pub use sstable::{
270    BlockBuilder, BlockIterator, BlockHandle, BlockType,
271    FilterPolicy, BloomFilterPolicy, RibbonFilterPolicy, XorFilterPolicy, FilterReader,
272    SSTableFormat, Header, Footer, Section, SectionType,
273    SSTableBuilder, SSTableBuilderOptions, SSTableBuilderResult,
274    SSTable, TableMetadata, ReadOptions, BlockCache,
275};
276pub use wal_segment::{
277    WalSegmentManager, SegmentConfig, SegmentHeader, SegmentMetadata,
278    CheckpointRecord, SegmentStats, RecoveryIterator, WalEntry,
279};
280pub use compaction_policy::{
281    CompactionConfig, CompactionFile, CompactionJob, CompactionPicker,
282    CompactionPriority, CompactionReason, CompactionState, CompactionStats,
283    CompactionStrategy, LeveledCompactionPicker, RetentionConfig,
284    UniversalCompactionPicker, VersionPruner,
285};
286pub use optimized_scan::{
287    EntrySource, FileRange, LevelFiles, RangeScanner, ScanConfig, ScanStats,
288    TournamentTree, VersionedEntry,
289};
290pub use cow_btree::{BTreeEntry, BTreeSnapshot, CowBTree, Node, SearchResult};
291pub use epoch_mvcc::{
292    CommitResult, EpochManager, EpochMvccStore, EpochSnapshot, EpochTransaction,
293    EpochVersionChain, GcStats, StoreStats, VersionEntry,
294};
295pub use page_cache::{CacheStats, ClockProCache, CachedPage, PageId as CachePageId, PageState};
296pub use row_format::{Slot, SlotRow, SlotRowArena, SlotRowHandle, SlotRowFlags};
297pub use tiered_memtable::{HotEntry, SortedBatch, TieredMemTable};
298pub use vectorized_scan::{
299    ColumnVector, ComparisonOp, Int64Comparison, VectorBatch, VectorPredicate,
300    VectorizedScanConfig, VectorizedScanStats, DEFAULT_BATCH_SIZE,
301    // SoA + Late Materialization (80/20 optimization)
302    SimdVisibilityFilter, SoaBatch, SoaScanIterator, SoaScanStats, SoaSource,
303    StreamingScanIterator, ValueHandle, VersionedSlice,
304};
305pub use zero_copy_serde::{
306    FieldDescriptor, MmapWalReader, SerdeStats, WalBatchReader, WalBatchWriter,
307    WalEntryBuilder, WalEntryHeader, WalEntryReader, WalEntryType, ZeroCopyHeader,
308    FORMAT_VERSION as SERDE_FORMAT_VERSION, HEADER_SIZE as SERDE_HEADER_SIZE, ZERO_COPY_MAGIC,
309};
310
311// Re-exports for transaction arena and zero-copy plumbing
312pub use txn_arena::{
313    ArenaWriteSet, BytesRef, KeyFingerprint, TxnArena, TxnWriteBuffer, WriteOp,
314};
315
316// Re-exports for dirty tracking with batching
317pub use dirty_tracking::{
318    BatchedDirtyTracker, DirtyEvent, DirtyTrackingStats, TxnDirtyBuffer,
319};
320
321// Re-exports for per-table index policy
322pub use index_policy::{
323    BalancedTableIndex, IndexPolicy, SortedRun, TableIndexConfig, TableIndexRegistry,
324};
325
326// Re-exports for queue-optimized index structure
327pub use queue_index::{
328    CompositeQueueKey, QueueIndex, QueueIndexConfig, QueueIndexStats, QueueTableRegistry,
329};
330
331// Re-exports for database kernel
332pub use database::{
333    ColumnDef as DbColumnDef,
334    ColumnType as DbColumnType,
335    ColumnarQueryResult, // SIMD-friendly columnar result format
336    Database,
337    DatabaseConfig,
338    GroupCommitSettings,
339    QueryBuilder,
340    QueryResult,
341    QueryRowIterator,
342    RecoveryStats as DbRecoveryStats,
343    Stats as DbStats,
344    SyncMode,
345    TableSchema as DbTableSchema,
346    TxnHandle as KernelTxnHandle,
347    VectorSearchResult,
348};