Skip to main content

sochdb_storage/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! SochDB Storage Layer
19//!
20//! Log-Structured Column Store (LSCS) with transaction-aware WAL for TOON-native data.
21//!
22//! ## Runtime Modes
23//!
24//! This crate supports two runtime modes:
25//!
26//! ### Embedded Sync Mode (like SQLite)
27//!
28//! For embedded deployments without async runtime:
29//!
30//! ```toml
31//! sochdb-storage = { version = "...", default-features = false, features = ["embedded-sync"] }
32//! ```
33//!
34//! Benefits:
35//! - ~500KB smaller binary
36//! - No async runtime overhead
37//! - Simpler embedded integration
38//!
39//! ### Async Mode (default, for servers)
40//!
41//! For server deployments with async I/O:
42//!
43//! ```toml
44//! sochdb-storage = { version = "..." }  # async enabled by default
45//! ```
46//!
47//! Benefits:
48//! - Better scalability for concurrent connections
49//! - Non-blocking I/O for server workloads
50//!
51//! ## Novel Components
52//!
53//! - **LSCS** (`lscs`): Log-Structured Column Store - columnar variant of LSM with
54//!   schema-aware compression and column-aware compaction for reduced write amplification.
55//!
56//! - **Transaction WAL** (`txn_wal`): ACID-compliant Write-Ahead Log with transaction
57//!   boundaries, commit/abort markers, and crash recovery.
58//!
59//! - **StorageEngine Trait** (`storage_engine`): Pluggable storage backend abstraction
60//!   enabling 80% I/O reduction for columnar projections (Task 1).
61//!
62//! - **Page Manager** (`page_manager`): TOON file format with magic header and O(1)
63//!   page allocation (Task 8).
64//!
65//! - **Columnar Compression** (`columnar_compression`): Type-aware encoding with
66//!   dictionary, RLE, and delta compression for 2-4× storage reduction (Task 9).
67//!
68//! ## Utility Components
69//!
70//! - **Bloom Filters** (`bloom`): Probabilistic existence checks
71//! - **Block Checksums** (`block_checksum`): Data integrity validation
72//! - **Compression** (`compression`): LZ4/Zstd compression
73//! - **Sketches** (`sketches`): Approximate algorithms (HyperLogLog, CountMin, DDSketch)
74
75// New TOON-native storage components
76pub mod actor; // Actor-based connection manager (mm.md Task 7.2)
77pub mod admission_control; // Admission control with cost model + tenant fairness (Task 6)
78pub mod aries_recovery; // ARIES-style crash recovery (Task 1)
79pub mod cdc; // WAL-derived Change Data Capture (T1)
80pub mod checkpoint; // ARIES-style checkpointing with WAL truncation (mm.md Task 1.4)
81pub mod columnar_compression; // Type-aware encoding: dictionary, RLE, and delta compression (Task 9)
82pub mod correctness_testing; // Property-based correctness testing (Task 13)
83pub mod database; // Database Kernel (shared by embedded + server)
84pub mod durable_storage; // Fully wired durable storage with MVCC
85pub mod durability_contract; // Durability contract hardening (Task 4)
86pub mod encryption; // Data-at-rest encryption (AES-256-GCM-SIV, Enterprise Security)
87pub mod ffi; // FFI bindings for Python SDK
88pub mod group_commit; // Event-driven Group Commit (Task 4)
89pub mod hlc; // Hybrid Logical Clock for commit timestamps (mm.md Task 1.3)
90pub mod hybrid_store; // PAX hybrid row-column storage (mm.md Task 4.1)
91pub mod io_isolation; // I/O isolation policy with cache partitioning (Task 5)
92pub mod ipc; // IPC Protocol with multiplexing (mm.md Task 7.1)
93#[cfg(unix)]
94pub mod ipc_server; // Unix Socket IPC Server (Task 3)
95pub mod learned_index_integration; // Learned index integration (exports HybridIndex, IndexManager, PointLookupExecutor)
96pub mod lock; // Advisory file locking for database exclusivity
97pub mod lscs; // Log-Structured Column Store (LSCS)
98pub mod mvcc_concurrent; // Concurrent MVCC for multi-reader single-writer (Task: Concurrent Embedded)
99pub mod mvcc_new; // Version set and snapshot guards (exports VersionSet, Snapshot, ReadVersion)
100pub mod mvcc_snapshot; // Exports MvccStore, TransactionManager, VersionChain (re-exported under #[allow(deprecated)])
101pub mod page_manager; // TOON file format with magic header and O(1) page allocation (Task 8)
102pub mod pitr; // Point-in-Time Recovery with WAL archiving (Task 11)
103pub mod production_wal; // Production WAL with ARIES recovery (mm.md Task 3)
104pub mod ssi; // Serializable Snapshot Isolation (Task 2)
105pub mod ssi_scaling; // SSI scaling guardrails with range locks (Task 7)
106pub mod storage_engine; // Pluggable storage backend abstraction (StorageEngine trait, Task 1)
107pub mod streaming_iterator; // Streaming Iterator Architecture (mm.md Task 4)
108pub mod transaction; // Unified Transaction Coordinator trait and types
109pub mod txn_arena; // Transaction-scoped arena with zero-copy key/value plumbing
110pub mod txn_wal; // Transaction WAL with commit/abort markers and crash recovery
111pub mod upgrade_contract; // Upgrade compatibility contract (Task 12)
112pub mod wal_fencing; // Epoch-based WAL fencing for split-brain detection
113pub mod wal_integration; // WAL + MVCC wiring (exports WalStorageManager, MvccTransactionManager, GroupCommitBuffer)
114pub mod zero_copy_safety; // Zero-Copy Validation Layer (Task 5)
115
116// Performance optimization modules
117pub mod adaptive_learned_index; // Adaptive learned index (exports AdaptiveLearnedIndex, PiecewiseLinearModel)
118pub mod adaptive_memtable; // Adaptive memtable sizing with memory pressure (Task 10)
119pub mod deferred_index; // Deferred sorted index with LSM-style compaction (Rec 2)
120pub mod dirty_tracking; // Batched dirty tracking with MPSC queue
121pub mod index_policy; // Per-table index policy
122pub mod queue_index; // Queue-optimized index structure (Task: Queue Index Policy)
123pub mod batch_wal; // Batched WAL with vectored I/O (Task 3)
124pub mod key_buffer; // Cache-line aligned key buffer (Task 2)
125pub mod lockfree_memtable; // Lock-free read path with hazard pointers (Task 4)
126pub mod packed_row; // Unified row storage with delta encoding (Task 1)
127
128// PhD-Level Architectural Optimizations (December 2025)
129pub mod clr_learned_index; // CLR Learned Index for sorted runs (Task 3)
130pub mod lockfree_epoch; // Lock-Free Epoch Tracking (Task 3)
131pub mod hierarchical_ts; // Hierarchical Timestamp Oracle (Task 9)
132pub mod shard_coalesced; // Shard-Coalesced Batch DashMap (Task 6)
133pub mod polymorphic_value; // Polymorphic Value Encoding (Task 12)
134pub mod epoch_arena; // Epoch-Partitioned Key Arena (Task 1)
135pub mod stratified_skiplist; // Stratified SkipList with Deferred Promotion (Task 2)
136pub mod columnar_wal; // Columnar WAL Layout (Task 4)
137pub mod generational_slab; // Generational Slab Allocator (Task 5)
138pub mod rl_workload; // RL Workload Classifier (Task 10)
139#[cfg(unix)] // NOTE(review): io_uring is a Linux kernel interface, but this gate also admits other Unix targets (macOS, BSD) - confirm the module handles them
140pub mod io_uring_wal; // io_uring WAL Submission (Task 11)
141
142// New performance modules (Recommendations 1-9)
143pub mod cow_btree; // Copy-on-Write B-Tree for ordered access (Recommendation 5)
144pub mod epoch_mvcc; // Epoch-based MVCC for O(log E) version lookup (Recommendation 7)
145pub mod page_cache; // Application-level page cache with Clock-Pro (Recommendation 8)
146pub mod row_format; // Slot-based columnar row storage (Recommendation 1)
147pub mod tiered_memtable; // Tiered MemTable with deferred sorting (Recommendation 3)
148pub mod tournament_tree; // K-way merge with tournament tree (Task 2)
149pub mod vectorized_scan; // SIMD-accelerated vectorized scan engine (Recommendation 2)
150pub mod zero_copy_serde; // Zero-copy serialization for WAL (Recommendation 6)
151
152// Namespace and multi-tenancy support (Task 3)
153pub mod namespace; // Namespace routing and on-disk layout
154
155// Core utilities
156pub mod backend; // Storage backend abstraction (exports StorageBackend, LocalFsBackend, ObjectMetadata)
157pub mod backup; // Backup support (exports BackupManager, BackupMetadata)
158pub mod block_checksum; // Block checksums: data integrity validation
159pub mod bloom; // Bloom filters: probabilistic existence checks
160pub mod compression; // LZ4/Zstd compression
161pub mod dict_compression;
162pub mod direct_io;
163#[cfg(unix)] // NOTE(review): io_uring is a Linux kernel interface, but this gate also admits other Unix targets (macOS, BSD) - confirm the module handles them
164pub mod io_uring;
165pub mod manifest; // Exports Manifest, LsmState, VersionEdit, FileMetadata
166pub mod memory; // Exports MemoryBudget, MemoryTracker, WriteBufferManager
167pub mod parallel_merge;
168pub mod payload; // Exports PayloadStore, PayloadStats, CompressionType
169pub mod prefetch;
170pub mod sketches; // Approximate algorithms (HyperLogLog, CountMin, DDSketch)
171pub mod two_level_index; // Exports TwoLevelIndex, BlockIndexReader, FencePointer, TemporalKey
172pub mod validation; // SSTable validation (exports SSTableValidator, validate_sstable_file)
173pub mod version_store;
174pub mod zero_copy;
175
176// Re-exports for new components
177pub use columnar_compression::{
178    ColumnEncoder, DeltaEncoder, DictionaryEncoder, EncodingStats, EncodingType, RleEncoder,
179};
180pub use learned_index_integration::{
181    HybridIndex, IndexManager, IndexType, KeyStats, PointLookupExecutor,
182};
183pub use lscs::{
184    ColumnDef, ColumnGroup, ColumnType, ColumnarMemtable, Lscs, LscsConfig, LscsRecoveryStats,
185    LscsStats, TableSchema,
186};
187#[allow(deprecated)]
188pub use mvcc_snapshot::{
189    MvccStore, Snapshot as MvccSnapshot, Timestamp, TransactionManager, TxnId, TxnStatus,
190    VersionChain, VersionInfo,
191};
192pub use page_manager::{
193    DEFAULT_PAGE_SIZE, DbHeader, FORMAT_VERSION, FreePageHeader, PageId, PageManager,
194    PageManagerStats, PageType, SOCHDB_MAGIC,
195};
196pub use storage_engine::{
197    ColumnId, ColumnIterator, Row, RowId, StorageEngine, StorageEngineType, StorageStats,
198    TxnHandle, open_storage_engine,
199};
200pub use transaction::{
201    DurabilityLevel, IsolationLevel, RecoveryStats as TxnRecoveryStats, TransactionCoordinator,
202    TransactionHandle,
203};
204pub use txn_wal::{CrashRecoveryStats, TxnWal, TxnWalBuffer, TxnWalEntry, TxnWalStats};
205pub use wal_integration::{
206    GroupCommitBuffer, MvccTransactionManager, RecoveryStats, Transaction, TxnState, 
207    WalStorageManager,
208};
209
210// Re-exports for performance optimization modules
211pub use adaptive_learned_index::{AdaptiveLearnedIndex, LearnedIndexStats, PiecewiseLinearModel};
212pub use adaptive_memtable::{
213    AdaptiveMemtableConfig, AdaptiveMemtableSizer, AdaptiveMemtableStats,
214    DEFAULT_BASE_SIZE, MAX_MEMTABLE_SIZE, MIN_MEMTABLE_SIZE,
215};
216pub use batch_wal::{
217    BatchAccumulator, BatchedWalReader, BatchedWalStats, BatchedWalWriter, ConcurrentBatchedWal,
218    DEFAULT_MAX_BATCH_BYTES, DEFAULT_MAX_BATCH_SIZE,
219};
220pub use clr_learned_index::{ClrIndex, ClrLookupResult, ClrStats, IndexedSortedRun};
221pub use key_buffer::{
222    ArenaKey,
223    ArenaKeyHandle,
224    BatchKeyGenerator,
225    InternedTablePrefix,
226    // Arena allocation for high-throughput key operations
227    KeyArena,
228    KeyBuffer,
229    MAX_KEY_LENGTH,
230};
231pub use lockfree_memtable::{
232    HazardDomain,
233    INLINE_VALUE_SIZE,
234    LockFreeMemTable,
235    LockFreeVersion,
236    LockFreeVersionChain,
237    // Inline value storage for reduced memory indirection
238    ValueStorage,
239};
240pub use packed_row::{
241    PackedColumnDef, PackedColumnType, PackedRow, PackedRowBuilder, PackedTableSchema,
242};
243
244// Re-exports for utilities
245pub use backend::{LocalFsBackend, ObjectMetadata, StorageBackend};
246pub use backup::{BackupManager, BackupMetadata};
247pub use block_checksum::{
248    BlockChecksumConfig, BlockChecksumStats, BlockType as BlockChecksumType, BlockWriter, ChecksummedBlock,
249};
250pub use bloom::{BlockedBloomFilter, BloomFilter, LevelAdaptiveFPR, UnifiedBloomFilter};
251pub use compression::{CompressionEngine, CompressionStats, StorageTier};
252pub use manifest::{FileMetadata, LsmState, Manifest, VersionEdit};
253pub use memory::{MemoryBudget, MemoryTracker, WriteBufferManager, WriteBufferStats};
254pub use mvcc_new::{
255    ColumnGroupRef, ReadVersion, Snapshot, SnapshotGuard, VersionGuard, VersionSet,
256    VersionSetStats, VersionSetStatsSnapshot,
257};
258pub use payload::{CompressionType, PayloadStats, PayloadStore};
259pub use sketches::{AdaptiveSketch, CountMinSketch, DDSketch, ExponentialHistogram, HyperLogLog};
260pub use two_level_index::{
261    BlockIndexEntry, BlockIndexReader, FencePointer, TemporalKey, TwoLevelIndex,
262};
263pub use validation::{SSTableValidator, validate_sstable_file};
264
265// Re-exports for durable storage
266pub use durable_storage::{ArenaMvccMemTable, DurableStorage, EphemeralHandle, MvccMemTable, TransactionMode};
267
268// Re-exports for concurrent MVCC (Task: Concurrent Embedded)
269pub use mvcc_concurrent::{
270    ConcurrentMvcc, HlcTimestamp, ReaderSlot, 
271    ConcurrentVersionChain, ConcurrentVersionEntry,
272    VersionStore, VersionStoreStats, WriterGuard,
273};
274
275// Version set, ART, SSTable, WAL segment, compaction, and scan modules
276pub mod version_set; // Super Version and Copy-on-Write Version Set (mm.md Task 1)
277pub mod concurrent_art; // Exports ConcurrentART
278pub mod sstable; // SSTable format, builder, filter policies, and block cache types
279pub mod wal_segment; // WAL segments (exports WalSegmentManager, RecoveryIterator, CheckpointRecord)
280pub mod compaction_policy; // Compaction pickers (leveled, universal), retention config, and version pruning
281pub mod optimized_scan; // Range scanning (exports RangeScanner, TournamentTree, ScanConfig)
282
283// Re-exports for the version set, ART, SSTable, WAL segment, compaction, and scan modules, followed by the new performance modules (Recommendations 1-9)
284pub use version_set::{
285    FileMetadata as VersionFileMetadata, ImmutableMemTable, ImmutableMemTableRef,
286    LevelMetadata, SuperVersion, SuperVersionHandle, VersionSet as CowVersionSet,
287};
288pub use concurrent_art::ConcurrentART;
289pub use sstable::{
290    BlockBuilder, BlockIterator, BlockHandle, BlockType,
291    FilterPolicy, BloomFilterPolicy, RibbonFilterPolicy, XorFilterPolicy, FilterReader,
292    SSTableFormat, Header, Footer, Section, SectionType,
293    SSTableBuilder, SSTableBuilderOptions, SSTableBuilderResult,
294    SSTable, TableMetadata, ReadOptions, BlockCache,
295};
296pub use wal_segment::{
297    WalSegmentManager, SegmentConfig, SegmentHeader, SegmentMetadata,
298    CheckpointRecord, SegmentStats, RecoveryIterator, WalEntry,
299};
300pub use compaction_policy::{
301    CompactionConfig, CompactionFile, CompactionJob, CompactionPicker,
302    CompactionPriority, CompactionReason, CompactionState, CompactionStats,
303    CompactionStrategy, LeveledCompactionPicker, RetentionConfig,
304    UniversalCompactionPicker, VersionPruner,
305};
306pub use optimized_scan::{
307    EntrySource, FileRange, LevelFiles, RangeScanner, ScanConfig, ScanStats,
308    TournamentTree, VersionedEntry,
309};
310pub use cow_btree::{BTreeEntry, BTreeSnapshot, CowBTree, Node, SearchResult};
311pub use epoch_mvcc::{
312    CommitResult, EpochManager, EpochMvccStore, EpochSnapshot, EpochTransaction,
313    EpochVersionChain, GcStats, StoreStats, VersionEntry,
314};
315pub use page_cache::{CacheStats, ClockProCache, CachedPage, PageId as CachePageId, PageState};
316pub use row_format::{Slot, SlotRow, SlotRowArena, SlotRowHandle, SlotRowFlags};
317pub use tiered_memtable::{HotEntry, SortedBatch, TieredMemTable};
318pub use vectorized_scan::{
319    ColumnVector, ComparisonOp, Int64Comparison, VectorBatch, VectorPredicate,
320    VectorizedScanConfig, VectorizedScanStats, DEFAULT_BATCH_SIZE,
321    // SoA + Late Materialization (80/20 optimization)
322    SimdVisibilityFilter, SoaBatch, SoaScanIterator, SoaScanStats, SoaSource,
323    StreamingScanIterator, ValueHandle, VersionedSlice,
324};
325pub use zero_copy_serde::{
326    FieldDescriptor, MmapWalReader, SerdeStats, WalBatchReader, WalBatchWriter,
327    WalEntryBuilder, WalEntryHeader, WalEntryReader, WalEntryType, ZeroCopyHeader,
328    FORMAT_VERSION as SERDE_FORMAT_VERSION, HEADER_SIZE as SERDE_HEADER_SIZE, ZERO_COPY_MAGIC,
329};
330
331// Re-exports for transaction arena and zero-copy plumbing
332pub use txn_arena::{
333    ArenaWriteSet, BytesRef, KeyFingerprint, TxnArena, TxnWriteBuffer, WriteOp,
334};
335
336// Re-exports for dirty tracking with batching
337pub use dirty_tracking::{
338    BatchedDirtyTracker, DirtyEvent, DirtyTrackingStats, TxnDirtyBuffer,
339};
340
341// Re-exports for per-table index policy
342pub use index_policy::{
343    BalancedTableIndex, IndexPolicy, SortedRun, TableIndexConfig, TableIndexRegistry,
344};
345
346// Re-exports for queue-optimized index structure
347pub use queue_index::{
348    CompositeQueueKey, QueueIndex, QueueIndexConfig, QueueIndexStats, QueueTableRegistry,
349};
350
351// Re-exports for CDC engine
352pub use cdc::{CdcConfig, CdcEmitter, CdcError, CdcEvent, CdcLog, CdcOperation, CdcSubscriber};
353
354// Re-exports for database kernel
355pub use database::{
356    ColumnDef as DbColumnDef,
357    ColumnType as DbColumnType,
358    ColumnarQueryResult, // SIMD-friendly columnar result format
359    Database,
360    DatabaseConfig,
361    GroupCommitSettings,
362    QueryBuilder,
363    QueryResult,
364    QueryRowIterator,
365    RecoveryStats as DbRecoveryStats,
366    Stats as DbStats,
367    SyncMode,
368    TableSchema as DbTableSchema,
369    TxnHandle as KernelTxnHandle,
370    VectorSearchResult,
371};