Skip to main content

sochdb_storage/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! SochDB Storage Layer
19//!
20//! Log-Structured Column Store (LSCS) with transaction-aware WAL for TOON-native data.
21//!
22//! ## Runtime Modes
23//!
24//! This crate supports two runtime modes:
25//!
26//! ### Embedded Sync Mode (like SQLite)
27//!
28//! For embedded deployments without async runtime:
29//!
30//! ```toml
31//! sochdb-storage = { version = "...", default-features = false, features = ["embedded-sync"] }
32//! ```
33//!
34//! Benefits:
35//! - ~500KB smaller binary
36//! - No async runtime overhead
37//! - Simpler embedded integration
38//!
39//! ### Async Mode (default, for servers)
40//!
41//! For server deployments with async I/O:
42//!
43//! ```toml
44//! sochdb-storage = { version = "..." }  # async enabled by default
45//! ```
46//!
47//! Benefits:
48//! - Better scalability for concurrent connections
49//! - Non-blocking I/O for server workloads
50//!
51//! ## Novel Components
52//!
53//! - **LSCS** (`lscs`): Log-Structured Column Store - columnar variant of LSM with
54//!   schema-aware compression and column-aware compaction for reduced write amplification.
55//!
56//! - **Transaction WAL** (`txn_wal`): ACID-compliant Write-Ahead Log with transaction
57//!   boundaries, commit/abort markers, and crash recovery.
58//!
59//! - **StorageEngine Trait** (`storage_engine`): Pluggable storage backend abstraction
60//!   enabling 80% I/O reduction for columnar projections (Task 1).
61//!
62//! - **Page Manager** (`page_manager`): TOON file format with magic header and O(1)
63//!   page allocation (Task 8).
64//!
65//! - **Columnar Compression** (`columnar_compression`): Type-aware encoding with
66//!   dictionary, RLE, and delta compression for 2-4× storage reduction (Task 9).
67//!
68//! ## Utility Components
69//!
70//! - **Bloom Filters** (`bloom`): Probabilistic existence checks
71//! - **Block Checksums** (`block_checksum`): Data integrity validation
72//! - **Compression** (`compression`): LZ4/Zstd compression
73//! - **Sketches** (`sketches`): Approximate algorithms (HyperLogLog, CountMin, DDSketch)
74
75// New TOON-native storage components
76pub mod actor; // Actor-based connection manager (mm.md Task 7.2)
77pub mod aries_recovery; // ARIES-style crash recovery (Task 1)
78pub mod checkpoint; // ARIES-style checkpointing with WAL truncation (mm.md Task 1.4)
79pub mod columnar_compression;
80pub mod database; // Database Kernel (shared by embedded + server)
81pub mod durable_storage; // Fully wired durable storage with MVCC
82pub mod ffi; // FFI bindings for Python SDK
83pub mod group_commit; // Event-driven Group Commit (Task 4)
84pub mod hlc; // Hybrid Logical Clock for commit timestamps (mm.md Task 1.3)
85pub mod hybrid_store; // PAX hybrid row-column storage (mm.md Task 4.1)
86pub mod ipc; // IPC Protocol with multiplexing (mm.md Task 7.1)
87#[cfg(unix)]
88pub mod ipc_server; // Unix Socket IPC Server (Task 3)
89pub mod learned_index_integration;
90pub mod lock; // Advisory file locking for database exclusivity
91pub mod lscs;
92pub mod mvcc_concurrent; // Concurrent MVCC for multi-reader single-writer (Task: Concurrent Embedded)
93pub mod mvcc_new;
94pub mod mvcc_snapshot;
95pub mod page_manager;
96pub mod production_wal; // Production WAL with ARIES recovery (mm.md Task 3)
97pub mod ssi; // Serializable Snapshot Isolation (Task 2)
98pub mod storage_engine;
99pub mod streaming_iterator; // Streaming Iterator Architecture (mm.md Task 4)
100pub mod transaction; // Unified Transaction Coordinator trait and types
101pub mod txn_arena; // Transaction-scoped arena with zero-copy key/value plumbing
102pub mod txn_wal;
103pub mod wal_fencing; // Epoch-based WAL fencing for split-brain detection
104pub mod wal_integration;
105pub mod zero_copy_safety; // Zero-Copy Validation Layer (Task 5)
106
107// Performance optimization modules
108pub mod adaptive_learned_index;
109pub mod adaptive_memtable; // Adaptive memtable sizing with memory pressure (Task 10)
110pub mod deferred_index; // Deferred sorted index with LSM-style compaction (Rec 2)
111pub mod dirty_tracking; // Batched dirty tracking with MPSC queue
112pub mod index_policy; // Per-table index policy
113pub mod queue_index; // Queue-optimized index structure (Task: Queue Index Policy)
114pub mod batch_wal; // Batched WAL with vectored I/O (Task 3)
115pub mod key_buffer; // Cache-line aligned key buffer (Task 2)
116pub mod lockfree_memtable; // Lock-free read path with hazard pointers (Task 4)
117pub mod packed_row; // Unified row storage with delta encoding (Task 1)
118
119// PhD-Level Architectural Optimizations (December 2025)
120pub mod clr_learned_index; // CLR Learned Index for sorted runs (Task 3)
121pub mod lockfree_epoch; // Lock-Free Epoch Tracking (Task 3)
122pub mod hierarchical_ts; // Hierarchical Timestamp Oracle (Task 9)
123pub mod shard_coalesced; // Shard-Coalesced Batch DashMap (Task 6)
124pub mod polymorphic_value; // Polymorphic Value Encoding (Task 12)
125pub mod epoch_arena; // Epoch-Partitioned Key Arena (Task 1)
126pub mod stratified_skiplist; // Stratified SkipList with Deferred Promotion (Task 2)
127pub mod columnar_wal; // Columnar WAL Layout (Task 4)
128pub mod generational_slab; // Generational Slab Allocator (Task 5)
129pub mod rl_workload; // RL Workload Classifier (Task 10)
130#[cfg(unix)]
131pub mod io_uring_wal; // io_uring WAL Submission (Task 11)
132
133// New performance modules (Recommendations 1-9)
134pub mod cow_btree; // Copy-on-Write B-Tree for ordered access (Recommendation 5)
135pub mod epoch_mvcc; // Epoch-based MVCC for O(log E) version lookup (Recommendation 7)
136pub mod page_cache; // Application-level page cache with Clock-Pro (Recommendation 8)
137pub mod row_format; // Slot-based columnar row storage (Recommendation 1)
138pub mod tiered_memtable; // Tiered MemTable with deferred sorting (Recommendation 3)
139pub mod tournament_tree; // K-way merge with tournament tree (Task 2)
140pub mod vectorized_scan; // SIMD-accelerated vectorized scan engine (Recommendation 2)
141pub mod zero_copy_serde; // Zero-copy serialization for WAL (Recommendation 6)
142
143// Namespace and multi-tenancy support (Task 3)
144pub mod namespace; // Namespace routing and on-disk layout
145
146// Core utilities
147pub mod backend;
148pub mod backup;
149pub mod block_checksum;
150pub mod bloom;
151pub mod compression;
152pub mod dict_compression;
153pub mod direct_io;
154#[cfg(unix)]
155pub mod io_uring;
156pub mod manifest;
157pub mod memory;
158pub mod parallel_merge;
159pub mod payload;
160pub mod prefetch;
161pub mod sketches;
162pub mod two_level_index;
163pub mod validation;
164pub mod version_store;
165pub mod zero_copy;
166
167// Re-exports for new components
168pub use columnar_compression::{
169    ColumnEncoder, DeltaEncoder, DictionaryEncoder, EncodingStats, EncodingType, RleEncoder,
170};
171pub use learned_index_integration::{
172    HybridIndex, IndexManager, IndexType, KeyStats, PointLookupExecutor,
173};
174pub use lscs::{
175    ColumnDef, ColumnGroup, ColumnType, ColumnarMemtable, Lscs, LscsConfig, LscsRecoveryStats,
176    LscsStats, TableSchema,
177};
178#[allow(deprecated)]
179pub use mvcc_snapshot::{
180    MvccStore, Snapshot as MvccSnapshot, Timestamp, TransactionManager, TxnId, TxnStatus,
181    VersionChain, VersionInfo,
182};
183pub use page_manager::{
184    DEFAULT_PAGE_SIZE, DbHeader, FORMAT_VERSION, FreePageHeader, PageId, PageManager,
185    PageManagerStats, PageType, SOCHDB_MAGIC,
186};
187pub use storage_engine::{
188    ColumnId, ColumnIterator, Row, RowId, StorageEngine, StorageEngineType, StorageStats,
189    TxnHandle, open_storage_engine,
190};
191pub use transaction::{
192    DurabilityLevel, IsolationLevel, RecoveryStats as TxnRecoveryStats, TransactionCoordinator,
193    TransactionHandle,
194};
195pub use txn_wal::{CrashRecoveryStats, TxnWal, TxnWalBuffer, TxnWalEntry, TxnWalStats};
196pub use wal_integration::{
197    GroupCommitBuffer, MvccTransactionManager, RecoveryStats, Transaction, TxnState, 
198    WalStorageManager,
199};
200
201// Re-exports for performance optimization modules
202pub use adaptive_learned_index::{AdaptiveLearnedIndex, LearnedIndexStats, PiecewiseLinearModel};
203pub use adaptive_memtable::{
204    AdaptiveMemtableConfig, AdaptiveMemtableSizer, AdaptiveMemtableStats,
205    DEFAULT_BASE_SIZE, MAX_MEMTABLE_SIZE, MIN_MEMTABLE_SIZE,
206};
207pub use batch_wal::{
208    BatchAccumulator, BatchedWalReader, BatchedWalStats, BatchedWalWriter, ConcurrentBatchedWal,
209    DEFAULT_MAX_BATCH_BYTES, DEFAULT_MAX_BATCH_SIZE,
210};
211pub use clr_learned_index::{ClrIndex, ClrLookupResult, ClrStats, IndexedSortedRun};
212pub use key_buffer::{
213    ArenaKey,
214    ArenaKeyHandle,
215    BatchKeyGenerator,
216    InternedTablePrefix,
217    // Arena allocation for high-throughput key operations
218    KeyArena,
219    KeyBuffer,
220    MAX_KEY_LENGTH,
221};
222pub use lockfree_memtable::{
223    HazardDomain,
224    INLINE_VALUE_SIZE,
225    LockFreeMemTable,
226    LockFreeVersion,
227    LockFreeVersionChain,
228    // Inline value storage for reduced memory indirection
229    ValueStorage,
230};
231pub use packed_row::{
232    PackedColumnDef, PackedColumnType, PackedRow, PackedRowBuilder, PackedTableSchema,
233};
234
235// Re-exports for utilities
236pub use backend::{LocalFsBackend, ObjectMetadata, StorageBackend};
237pub use backup::{BackupManager, BackupMetadata};
238pub use block_checksum::{
239    BlockChecksumConfig, BlockChecksumStats, BlockType as BlockChecksumType, BlockWriter, ChecksummedBlock,
240};
241pub use bloom::{BlockedBloomFilter, BloomFilter, LevelAdaptiveFPR, UnifiedBloomFilter};
242pub use compression::{CompressionEngine, CompressionStats, StorageTier};
243pub use manifest::{FileMetadata, LsmState, Manifest, VersionEdit};
244pub use memory::{MemoryBudget, MemoryTracker, WriteBufferManager, WriteBufferStats};
245pub use mvcc_new::{
246    ColumnGroupRef, ReadVersion, Snapshot, SnapshotGuard, VersionGuard, VersionSet,
247    VersionSetStats, VersionSetStatsSnapshot,
248};
249pub use payload::{CompressionType, PayloadStats, PayloadStore};
250pub use sketches::{AdaptiveSketch, CountMinSketch, DDSketch, ExponentialHistogram, HyperLogLog};
251pub use two_level_index::{
252    BlockIndexEntry, BlockIndexReader, FencePointer, TemporalKey, TwoLevelIndex,
253};
254pub use validation::{SSTableValidator, validate_sstable_file};
255
256// Re-exports for durable storage
257pub use durable_storage::{ArenaMvccMemTable, DurableStorage, MvccMemTable, TransactionMode};
258
259// Re-exports for concurrent MVCC (Task: Concurrent Embedded)
260pub use mvcc_concurrent::{
261    ConcurrentMvcc, HlcTimestamp, ReaderSlot, 
262    ConcurrentVersionChain, ConcurrentVersionEntry,
263    VersionStore, VersionStoreStats, WriterGuard,
264};
265
266// Super Version and Copy-on-Write Version Set (mm.md Task 1), plus the concurrent_art, sstable, wal_segment, compaction_policy, and optimized_scan modules
267pub mod version_set;
268pub mod concurrent_art;
269pub mod sstable;
270pub mod wal_segment;
271pub mod compaction_policy;
272pub mod optimized_scan;
273
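274// Re-exports for the version_set, concurrent_art, sstable, wal_segment, compaction_policy, and optimized_scan modules, followed by the new performance modules (Recommendations 1-9)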
275pub use version_set::{
276    FileMetadata as VersionFileMetadata, ImmutableMemTable, ImmutableMemTableRef,
277    LevelMetadata, SuperVersion, SuperVersionHandle, VersionSet as CowVersionSet,
278};
279pub use concurrent_art::ConcurrentART;
280pub use sstable::{
281    BlockBuilder, BlockIterator, BlockHandle, BlockType,
282    FilterPolicy, BloomFilterPolicy, RibbonFilterPolicy, XorFilterPolicy, FilterReader,
283    SSTableFormat, Header, Footer, Section, SectionType,
284    SSTableBuilder, SSTableBuilderOptions, SSTableBuilderResult,
285    SSTable, TableMetadata, ReadOptions, BlockCache,
286};
287pub use wal_segment::{
288    WalSegmentManager, SegmentConfig, SegmentHeader, SegmentMetadata,
289    CheckpointRecord, SegmentStats, RecoveryIterator, WalEntry,
290};
291pub use compaction_policy::{
292    CompactionConfig, CompactionFile, CompactionJob, CompactionPicker,
293    CompactionPriority, CompactionReason, CompactionState, CompactionStats,
294    CompactionStrategy, LeveledCompactionPicker, RetentionConfig,
295    UniversalCompactionPicker, VersionPruner,
296};
297pub use optimized_scan::{
298    EntrySource, FileRange, LevelFiles, RangeScanner, ScanConfig, ScanStats,
299    TournamentTree, VersionedEntry,
300};
301pub use cow_btree::{BTreeEntry, BTreeSnapshot, CowBTree, Node, SearchResult};
302pub use epoch_mvcc::{
303    CommitResult, EpochManager, EpochMvccStore, EpochSnapshot, EpochTransaction,
304    EpochVersionChain, GcStats, StoreStats, VersionEntry,
305};
306pub use page_cache::{CacheStats, ClockProCache, CachedPage, PageId as CachePageId, PageState};
307pub use row_format::{Slot, SlotRow, SlotRowArena, SlotRowHandle, SlotRowFlags};
308pub use tiered_memtable::{HotEntry, SortedBatch, TieredMemTable};
309pub use vectorized_scan::{
310    ColumnVector, ComparisonOp, Int64Comparison, VectorBatch, VectorPredicate,
311    VectorizedScanConfig, VectorizedScanStats, DEFAULT_BATCH_SIZE,
312    // SoA + Late Materialization (80/20 optimization)
313    SimdVisibilityFilter, SoaBatch, SoaScanIterator, SoaScanStats, SoaSource,
314    StreamingScanIterator, ValueHandle, VersionedSlice,
315};
316pub use zero_copy_serde::{
317    FieldDescriptor, MmapWalReader, SerdeStats, WalBatchReader, WalBatchWriter,
318    WalEntryBuilder, WalEntryHeader, WalEntryReader, WalEntryType, ZeroCopyHeader,
319    FORMAT_VERSION as SERDE_FORMAT_VERSION, HEADER_SIZE as SERDE_HEADER_SIZE, ZERO_COPY_MAGIC,
320};
321
322// Re-exports for transaction arena and zero-copy plumbing
323pub use txn_arena::{
324    ArenaWriteSet, BytesRef, KeyFingerprint, TxnArena, TxnWriteBuffer, WriteOp,
325};
326
327// Re-exports for dirty tracking with batching
328pub use dirty_tracking::{
329    BatchedDirtyTracker, DirtyEvent, DirtyTrackingStats, TxnDirtyBuffer,
330};
331
332// Re-exports for per-table index policy
333pub use index_policy::{
334    BalancedTableIndex, IndexPolicy, SortedRun, TableIndexConfig, TableIndexRegistry,
335};
336
337// Re-exports for queue-optimized index structure
338pub use queue_index::{
339    CompositeQueueKey, QueueIndex, QueueIndexConfig, QueueIndexStats, QueueTableRegistry,
340};
341
342// Re-exports for database kernel
343pub use database::{
344    ColumnDef as DbColumnDef,
345    ColumnType as DbColumnType,
346    ColumnarQueryResult, // SIMD-friendly columnar result format
347    Database,
348    DatabaseConfig,
349    GroupCommitSettings,
350    QueryBuilder,
351    QueryResult,
352    QueryRowIterator,
353    RecoveryStats as DbRecoveryStats,
354    Stats as DbStats,
355    SyncMode,
356    TableSchema as DbTableSchema,
357    TxnHandle as KernelTxnHandle,
358    VectorSearchResult,
359};