sochdb_storage/
lib.rs

1// Copyright 2025 Sushanth (https://github.com/sushanthpy)
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! SochDB Storage Layer
16//!
17//! Log-Structured Column Store (LSCS) with transaction-aware WAL for TOON-native data.
18//!
19//! ## Runtime Modes
20//!
21//! This crate supports two runtime modes:
22//!
23//! ### Embedded Sync Mode (like SQLite)
24//!
25//! For embedded deployments without async runtime:
26//!
27//! ```toml
28//! sochdb-storage = { version = "...", default-features = false, features = ["embedded-sync"] }
29//! ```
30//!
31//! Benefits:
32//! - ~500KB smaller binary
33//! - No async runtime overhead
34//! - Simpler embedded integration
35//!
36//! ### Async Mode (default, for servers)
37//!
38//! For server deployments with async I/O:
39//!
40//! ```toml
41//! sochdb-storage = { version = "..." }  # async enabled by default
42//! ```
43//!
44//! Benefits:
45//! - Better scalability for concurrent connections
46//! - Non-blocking I/O for server workloads
47//!
48//! ## Novel Components
49//!
50//! - **LSCS** (`lscs`): Log-Structured Column Store - columnar variant of LSM with
51//!   schema-aware compression and column-aware compaction for reduced write amplification.
52//!
53//! - **Transaction WAL** (`txn_wal`): ACID-compliant Write-Ahead Log with transaction
54//!   boundaries, commit/abort markers, and crash recovery.
55//!
56//! - **StorageEngine Trait** (`storage_engine`): Pluggable storage backend abstraction
57//!   enabling 80% I/O reduction for columnar projections (Task 1).
58//!
59//! - **Page Manager** (`page_manager`): TOON file format with magic header and O(1)
60//!   page allocation (Task 8).
61//!
62//! - **Columnar Compression** (`columnar_compression`): Type-aware encoding with
63//!   dictionary, RLE, and delta compression for 2-4× storage reduction (Task 9).
64//!
65//! ## Utility Components
66//!
67//! - **Bloom Filters** (`bloom`): Probabilistic existence checks
68//! - **Block Checksums** (`block_checksum`): Data integrity validation
69//! - **Compression** (`compression`): LZ4/Zstd compression
70//! - **Sketches** (`sketches`): Approximate algorithms (HyperLogLog, CountMin, DDSketch)
71
72// New TOON-native storage components
73pub mod actor; // Actor-based connection manager (mm.md Task 7.2)
74pub mod aries_recovery; // ARIES-style crash recovery (Task 1)
75pub mod checkpoint; // ARIES-style checkpointing with WAL truncation (mm.md Task 1.4)
76pub mod columnar_compression; // Type-aware dictionary/RLE/delta column encoding (Task 9)
77pub mod database; // Database Kernel (shared by embedded + server)
78pub mod durable_storage; // Fully wired durable storage with MVCC
79pub mod ffi; // FFI bindings for Python SDK
80pub mod group_commit; // Event-driven Group Commit (Task 4)
81pub mod hlc; // Hybrid Logical Clock for commit timestamps (mm.md Task 1.3)
82pub mod hybrid_store; // PAX hybrid row-column storage (mm.md Task 4.1)
83pub mod ipc; // IPC Protocol with multiplexing (mm.md Task 7.1)
84#[cfg(unix)]
85pub mod ipc_server; // Unix Socket IPC Server (Task 3)
86pub mod learned_index_integration;
87pub mod lock; // Advisory file locking for database exclusivity
88pub mod lscs; // Log-Structured Column Store (LSCS)
89pub mod mvcc_new;
90pub mod mvcc_snapshot;
91pub mod page_manager; // TOON file format with magic header and O(1) page allocation (Task 8)
92pub mod production_wal; // Production WAL with ARIES recovery (mm.md Task 3)
93pub mod ssi; // Serializable Snapshot Isolation (Task 2)
94pub mod storage_engine; // Pluggable storage backend abstraction (Task 1)
95pub mod streaming_iterator; // Streaming Iterator Architecture (mm.md Task 4)
96pub mod transaction; // Unified Transaction Coordinator trait and types
97pub mod txn_arena; // Transaction-scoped arena with zero-copy key/value plumbing
98pub mod txn_wal; // Transaction WAL with commit/abort markers and crash recovery
99pub mod wal_fencing; // Epoch-based WAL fencing for split-brain detection
100pub mod wal_integration;
101pub mod zero_copy_safety; // Zero-Copy Validation Layer (Task 5)
102
103// Performance optimization modules
104pub mod adaptive_learned_index;
105pub mod adaptive_memtable; // Adaptive memtable sizing with memory pressure (Task 10)
106pub mod deferred_index; // Deferred sorted index with LSM-style compaction (Rec 2)
107pub mod dirty_tracking; // Batched dirty tracking with MPSC queue
108pub mod index_policy; // Per-table index policy
109pub mod batch_wal; // Batched WAL with vectored I/O (Task 3)
110pub mod key_buffer; // Cache-line aligned key buffer (Task 2)
111pub mod lockfree_memtable; // Lock-free read path with hazard pointers (Task 4)
112pub mod packed_row; // Unified row storage with delta encoding (Task 1)
113
114// PhD-Level Architectural Optimizations (December 2025)
115pub mod clr_learned_index; // CLR Learned Index for sorted runs (Task 3)
116pub mod lockfree_epoch; // Lock-Free Epoch Tracking (Task 3)
117pub mod hierarchical_ts; // Hierarchical Timestamp Oracle (Task 9)
118pub mod shard_coalesced; // Shard-Coalesced Batch DashMap (Task 6)
119pub mod polymorphic_value; // Polymorphic Value Encoding (Task 12)
120pub mod epoch_arena; // Epoch-Partitioned Key Arena (Task 1)
121pub mod stratified_skiplist; // Stratified SkipList with Deferred Promotion (Task 2)
122pub mod columnar_wal; // Columnar WAL Layout (Task 4)
123pub mod generational_slab; // Generational Slab Allocator (Task 5)
124pub mod rl_workload; // RL Workload Classifier (Task 10)
125#[cfg(unix)]
126pub mod io_uring_wal; // io_uring WAL Submission (Task 11)
127
128// New performance modules (Recommendations 1-9)
129pub mod cow_btree; // Copy-on-Write B-Tree for ordered access (Recommendation 5)
130pub mod epoch_mvcc; // Epoch-based MVCC for O(log E) version lookup (Recommendation 7)
131pub mod page_cache; // Application-level page cache with Clock-Pro (Recommendation 8)
132pub mod row_format; // Slot-based columnar row storage (Recommendation 1)
133pub mod tiered_memtable; // Tiered MemTable with deferred sorting (Recommendation 3)
134pub mod tournament_tree; // K-way merge with tournament tree (Task 2)
135pub mod vectorized_scan; // SIMD-accelerated vectorized scan engine (Recommendation 2)
136pub mod zero_copy_serde; // Zero-copy serialization for WAL (Recommendation 6)
137
138// Namespace and multi-tenancy support (Task 3)
139pub mod namespace; // Namespace routing and on-disk layout
140
141// Core utilities
142pub mod backend;
143pub mod backup;
144pub mod block_checksum; // Data integrity validation via block checksums
145pub mod bloom; // Bloom filters for probabilistic existence checks
146pub mod compression; // LZ4/Zstd compression
147pub mod dict_compression;
148pub mod direct_io;
149#[cfg(unix)]
150pub mod io_uring;
151pub mod manifest;
152pub mod memory;
153pub mod parallel_merge;
154pub mod payload;
155pub mod prefetch;
156pub mod sketches; // Approximate algorithms (HyperLogLog, CountMin, DDSketch)
157pub mod two_level_index;
158pub mod validation;
159pub mod version_store;
160pub mod zero_copy;
161
162// Re-exports for new components
163pub use columnar_compression::{
164    ColumnEncoder, DeltaEncoder, DictionaryEncoder, EncodingStats, EncodingType, RleEncoder,
165};
166pub use learned_index_integration::{
167    HybridIndex, IndexManager, IndexType, KeyStats, PointLookupExecutor,
168};
169pub use lscs::{
170    ColumnDef, ColumnGroup, ColumnType, ColumnarMemtable, Lscs, LscsConfig, LscsRecoveryStats,
171    LscsStats, TableSchema,
172};
173#[allow(deprecated)]
174pub use mvcc_snapshot::{
175    MvccStore, Snapshot as MvccSnapshot, Timestamp, TransactionManager, TxnId, TxnStatus,
176    VersionChain, VersionInfo,
177};
178pub use page_manager::{
179    DEFAULT_PAGE_SIZE, DbHeader, FORMAT_VERSION, FreePageHeader, PageId, PageManager,
180    PageManagerStats, PageType, SOCHDB_MAGIC,
181};
182pub use storage_engine::{
183    ColumnId, ColumnIterator, Row, RowId, StorageEngine, StorageEngineType, StorageStats,
184    TxnHandle, open_storage_engine,
185};
186pub use transaction::{
187    DurabilityLevel, IsolationLevel, RecoveryStats as TxnRecoveryStats, TransactionCoordinator,
188    TransactionHandle,
189};
190pub use txn_wal::{CrashRecoveryStats, TxnWal, TxnWalBuffer, TxnWalEntry, TxnWalStats};
191pub use wal_integration::{
192    GroupCommitBuffer, MvccTransactionManager, RecoveryStats, Transaction, TxnState, 
193    WalStorageManager,
194};
195
196// Re-exports for performance optimization modules
197pub use adaptive_learned_index::{AdaptiveLearnedIndex, LearnedIndexStats, PiecewiseLinearModel};
198pub use adaptive_memtable::{
199    AdaptiveMemtableConfig, AdaptiveMemtableSizer, AdaptiveMemtableStats,
200    DEFAULT_BASE_SIZE, MAX_MEMTABLE_SIZE, MIN_MEMTABLE_SIZE,
201};
202pub use batch_wal::{
203    BatchAccumulator, BatchedWalReader, BatchedWalStats, BatchedWalWriter, ConcurrentBatchedWal,
204    DEFAULT_MAX_BATCH_BYTES, DEFAULT_MAX_BATCH_SIZE,
205};
206pub use clr_learned_index::{ClrIndex, ClrLookupResult, ClrStats, IndexedSortedRun};
207pub use key_buffer::{
208    ArenaKey,
209    ArenaKeyHandle,
210    BatchKeyGenerator,
211    InternedTablePrefix,
212    // Arena allocation for high-throughput key operations
213    KeyArena,
214    KeyBuffer,
215    MAX_KEY_LENGTH,
216};
217pub use lockfree_memtable::{
218    HazardDomain,
219    INLINE_VALUE_SIZE,
220    LockFreeMemTable,
221    LockFreeVersion,
222    LockFreeVersionChain,
223    // Inline value storage for reduced memory indirection
224    ValueStorage,
225};
226pub use packed_row::{
227    PackedColumnDef, PackedColumnType, PackedRow, PackedRowBuilder, PackedTableSchema,
228};
229
230// Re-exports for utilities
231pub use backend::{LocalFsBackend, ObjectMetadata, StorageBackend};
232pub use backup::{BackupManager, BackupMetadata};
233pub use block_checksum::{
234    BlockChecksumConfig, BlockChecksumStats, BlockType as BlockChecksumType, BlockWriter, ChecksummedBlock,
235};
236pub use bloom::{BlockedBloomFilter, BloomFilter, LevelAdaptiveFPR, UnifiedBloomFilter};
237pub use compression::{CompressionEngine, CompressionStats, StorageTier};
238pub use manifest::{FileMetadata, LsmState, Manifest, VersionEdit};
239pub use memory::{MemoryBudget, MemoryTracker, WriteBufferManager, WriteBufferStats};
240pub use mvcc_new::{
241    ColumnGroupRef, ReadVersion, Snapshot, SnapshotGuard, VersionGuard, VersionSet,
242    VersionSetStats, VersionSetStatsSnapshot,
243};
244pub use payload::{CompressionType, PayloadStats, PayloadStore};
245pub use sketches::{AdaptiveSketch, CountMinSketch, DDSketch, ExponentialHistogram, HyperLogLog};
246pub use two_level_index::{
247    BlockIndexEntry, BlockIndexReader, FencePointer, TemporalKey, TwoLevelIndex,
248};
249pub use validation::{SSTableValidator, validate_sstable_file};
250
251// Re-exports for durable storage
252pub use durable_storage::{ArenaMvccMemTable, DurableStorage, MvccMemTable, TransactionMode};
253
254// Super Version and Copy-on-Write Version Set (mm.md Task 1)
255pub mod version_set;
256pub mod concurrent_art;
257pub mod sstable;
258pub mod wal_segment;
259pub mod compaction_policy;
260pub mod optimized_scan;
261
262// Re-exports for new performance modules (Recommendations 1-9)
263pub use version_set::{
264    FileMetadata as VersionFileMetadata, ImmutableMemTable, ImmutableMemTableRef,
265    LevelMetadata, SuperVersion, SuperVersionHandle, VersionSet as CowVersionSet,
266};
267pub use concurrent_art::ConcurrentART;
268pub use sstable::{
269    BlockBuilder, BlockIterator, BlockHandle, BlockType,
270    FilterPolicy, BloomFilterPolicy, RibbonFilterPolicy, XorFilterPolicy, FilterReader,
271    SSTableFormat, Header, Footer, Section, SectionType,
272    SSTableBuilder, SSTableBuilderOptions, SSTableBuilderResult,
273    SSTable, TableMetadata, ReadOptions, BlockCache,
274};
275pub use wal_segment::{
276    WalSegmentManager, SegmentConfig, SegmentHeader, SegmentMetadata,
277    CheckpointRecord, SegmentStats, RecoveryIterator, WalEntry,
278};
279pub use compaction_policy::{
280    CompactionConfig, CompactionFile, CompactionJob, CompactionPicker,
281    CompactionPriority, CompactionReason, CompactionState, CompactionStats,
282    CompactionStrategy, LeveledCompactionPicker, RetentionConfig,
283    UniversalCompactionPicker, VersionPruner,
284};
285pub use optimized_scan::{
286    EntrySource, FileRange, LevelFiles, RangeScanner, ScanConfig, ScanStats,
287    TournamentTree, VersionedEntry,
288};
289pub use cow_btree::{BTreeEntry, BTreeSnapshot, CowBTree, Node, SearchResult};
290pub use epoch_mvcc::{
291    CommitResult, EpochManager, EpochMvccStore, EpochSnapshot, EpochTransaction,
292    EpochVersionChain, GcStats, StoreStats, VersionEntry,
293};
294pub use page_cache::{CacheStats, ClockProCache, CachedPage, PageId as CachePageId, PageState};
295pub use row_format::{Slot, SlotRow, SlotRowArena, SlotRowHandle, SlotRowFlags};
296pub use tiered_memtable::{HotEntry, SortedBatch, TieredMemTable};
297pub use vectorized_scan::{
298    ColumnVector, ComparisonOp, Int64Comparison, VectorBatch, VectorPredicate,
299    VectorizedScanConfig, VectorizedScanStats, DEFAULT_BATCH_SIZE,
300    // SoA + Late Materialization (80/20 optimization)
301    SimdVisibilityFilter, SoaBatch, SoaScanIterator, SoaScanStats, SoaSource,
302    StreamingScanIterator, ValueHandle, VersionedSlice,
303};
304pub use zero_copy_serde::{
305    FieldDescriptor, MmapWalReader, SerdeStats, WalBatchReader, WalBatchWriter,
306    WalEntryBuilder, WalEntryHeader, WalEntryReader, WalEntryType, ZeroCopyHeader,
307    FORMAT_VERSION as SERDE_FORMAT_VERSION, HEADER_SIZE as SERDE_HEADER_SIZE, ZERO_COPY_MAGIC,
308};
309
310// Re-exports for transaction arena and zero-copy plumbing
311pub use txn_arena::{
312    ArenaWriteSet, BytesRef, KeyFingerprint, TxnArena, TxnWriteBuffer, WriteOp,
313};
314
315// Re-exports for dirty tracking with batching
316pub use dirty_tracking::{
317    BatchedDirtyTracker, DirtyEvent, DirtyTrackingStats, TxnDirtyBuffer,
318};
319
320// Re-exports for per-table index policy
321pub use index_policy::{
322    BalancedTableIndex, IndexPolicy, SortedRun, TableIndexConfig, TableIndexRegistry,
323};
324
325// Re-exports for database kernel
326pub use database::{
327    ColumnDef as DbColumnDef,
328    ColumnType as DbColumnType,
329    ColumnarQueryResult, // SIMD-friendly columnar result format
330    Database,
331    DatabaseConfig,
332    GroupCommitSettings,
333    QueryBuilder,
334    QueryResult,
335    QueryRowIterator,
336    RecoveryStats as DbRecoveryStats,
337    Stats as DbStats,
338    SyncMode,
339    TableSchema as DbTableSchema,
340    TxnHandle as KernelTxnHandle,
341    VectorSearchResult,
342};