Skip to main content

sochdb_storage/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! SochDB Storage Layer
19//!
20//! Log-Structured Column Store (LSCS) with transaction-aware WAL for TOON-native data.
21//!
22//! ## Runtime Modes
23//!
24//! This crate supports two runtime modes:
25//!
26//! ### Embedded Sync Mode (like SQLite)
27//!
28//! For embedded deployments without async runtime:
29//!
30//! ```toml
31//! sochdb-storage = { version = "...", default-features = false, features = ["embedded-sync"] }
32//! ```
33//!
34//! Benefits:
35//! - ~500KB smaller binary
36//! - No async runtime overhead
37//! - Simpler embedded integration
38//!
39//! ### Async Mode (default, for servers)
40//!
41//! For server deployments with async I/O:
42//!
43//! ```toml
44//! sochdb-storage = { version = "..." }  # async enabled by default
45//! ```
46//!
47//! Benefits:
48//! - Better scalability for concurrent connections
49//! - Non-blocking I/O for server workloads
50//!
51//! ## Novel Components
52//!
53//! - **LSCS** (`lscs`): Log-Structured Column Store - columnar variant of LSM with
54//!   schema-aware compression and column-aware compaction for reduced write amplification.
55//!
56//! - **Transaction WAL** (`txn_wal`): ACID-compliant Write-Ahead Log with transaction
57//!   boundaries, commit/abort markers, and crash recovery.
58//!
59//! - **StorageEngine Trait** (`storage_engine`): Pluggable storage backend abstraction
60//!   enabling 80% I/O reduction for columnar projections (Task 1).
61//!
62//! - **Page Manager** (`page_manager`): TOON file format with magic header and O(1)
63//!   page allocation (Task 8).
64//!
65//! - **Columnar Compression** (`columnar_compression`): Type-aware encoding with
66//!   dictionary, RLE, and delta compression for 2-4× storage reduction (Task 9).
67//!
68//! ## Utility Components
69//!
70//! - **Bloom Filters** (`bloom`): Probabilistic existence checks
71//! - **Block Checksums** (`block_checksum`): Data integrity validation
72//! - **Compression** (`compression`): LZ4/Zstd compression
73//! - **Sketches** (`sketches`): Approximate algorithms (HyperLogLog, CountMin, DDSketch)
74
75// New TOON-native storage components
76pub mod actor; // Actor-based connection manager (mm.md Task 7.2)
77pub mod admission_control; // Admission control with cost model + tenant fairness (Task 6)
78#[cfg(feature = "experimental")]
79pub mod aries_recovery; // ARIES-style crash recovery (Task 1) [quarantined: unwired]
80pub mod cdc; // WAL-derived Change Data Capture (T1)
81#[cfg(feature = "experimental")]
82pub mod checkpoint; // ARIES-style checkpointing with WAL truncation (mm.md Task 1.4) [quarantined: unwired]
83pub mod columnar_compression; // Type-aware columnar encoding: dictionary, RLE, delta (Task 9)
84pub mod correctness_testing; // Property-based correctness testing (Task 13)
85pub mod database; // Database Kernel (shared by embedded + server)
86pub mod durability_contract; // Durability contract hardening (Task 4)
87pub mod durable_storage; // Fully wired durable storage with MVCC
88#[cfg(feature = "experimental")]
89pub mod encryption; // Data-at-rest encryption (AES-256-GCM-SIV) [quarantined: unwired]
90pub mod ffi; // FFI bindings for Python SDK
91pub mod group_commit; // Event-driven Group Commit (Task 4)
92pub mod hlc; // Hybrid Logical Clock for commit timestamps (mm.md Task 1.3)
93pub mod hybrid_store; // PAX hybrid row-column storage (mm.md Task 4.1)
94pub mod io_isolation; // I/O isolation policy with cache partitioning (Task 5)
95pub mod ipc; // IPC Protocol with multiplexing (mm.md Task 7.1)
96#[cfg(unix)]
97pub mod ipc_server; // Unix Socket IPC Server (Task 3)
98pub mod learned_index_integration;
99pub mod lock; // Advisory file locking for database exclusivity
100pub mod lscs; // Log-Structured Column Store (columnar variant of LSM)
101pub mod mvcc_concurrent; // Concurrent MVCC for multi-reader single-writer (Task: Concurrent Embedded)
102pub mod mvcc_new;
103pub mod mvcc_snapshot;
104pub mod page_manager; // TOON file format with magic header and page allocation (Task 8)
105#[cfg(feature = "experimental")]
106pub mod pitr; // Point-in-Time Recovery with WAL archiving (Task 11) [quarantined: unwired]
107#[cfg(feature = "experimental")]
108pub mod production_wal; // Production WAL with ARIES recovery (mm.md Task 3) [quarantined: unwired]
109pub mod ssi; // Serializable Snapshot Isolation (Task 2)
110pub mod ssi_scaling; // SSI scaling guardrails with range locks (Task 7)
111pub mod storage_engine; // StorageEngine trait: pluggable storage backend abstraction (Task 1)
112pub mod streaming_iterator; // Streaming Iterator Architecture (mm.md Task 4)
113pub mod supervisor; // Supervised background workers (panic-contained restart) (Task 4)
114pub mod transaction; // Unified Transaction Coordinator trait and types
115pub mod txn_arena; // Transaction-scoped arena with zero-copy key/value plumbing
116pub mod txn_wal; // Transaction WAL with commit/abort markers and crash recovery
117pub mod upgrade_contract; // Upgrade compatibility contract (Task 12)
118#[cfg(feature = "experimental")]
119pub mod wal_fencing; // Epoch-based WAL fencing for split-brain detection [quarantined: unwired]
120pub mod wal_integration;
121pub mod zero_copy_safety; // Zero-Copy Validation Layer (Task 5)
122
123// Performance optimization modules
124pub mod adaptive_learned_index;
125pub mod adaptive_memtable; // Adaptive memtable sizing with memory pressure (Task 10)
126pub mod batch_wal; // Batched WAL with vectored I/O (Task 3)
127pub mod deferred_index; // Deferred sorted index with LSM-style compaction (Rec 2)
128pub mod dirty_tracking; // Batched dirty tracking with MPSC queue
129pub mod index_policy; // Per-table index policy
130pub mod key_buffer; // Cache-line aligned key buffer (Task 2)
131pub mod lockfree_memtable; // Lock-free read path with hazard pointers (Task 4)
132pub mod packed_row; // Unified row storage with delta encoding (Task 1)
133pub mod queue_index; // Queue-optimized index structure (Task: Queue Index Policy)
134
135// PhD-Level Architectural Optimizations (December 2025)
136pub mod clr_learned_index; // CLR Learned Index for sorted runs (Task 3)
137#[cfg(feature = "experimental")]
138pub mod columnar_wal; // Columnar WAL Layout (Task 4) [quarantined: unwired]
139pub mod epoch_arena; // Epoch-Partitioned Key Arena (Task 1)
140pub mod generational_slab; // Generational Slab Allocator (Task 5)
141#[cfg(feature = "experimental")]
142pub mod hierarchical_ts; // Hierarchical Timestamp Oracle (Task 9) [quarantined: unwired]
143#[cfg(all(unix, feature = "experimental"))]
144pub mod io_uring_wal; // io_uring WAL Submission (Task 11) [quarantined: unwired]
145pub mod lockfree_epoch; // Lock-Free Epoch Tracking (Task 3)
146pub mod polymorphic_value; // Polymorphic Value Encoding (Task 12)
147#[cfg(feature = "experimental")]
148pub mod rl_workload; // RL Workload Classifier (Task 10) [quarantined: unwired]
149pub mod shard_coalesced; // Shard-Coalesced Batch DashMap (Task 6)
150pub mod stratified_skiplist; // Stratified SkipList with Deferred Promotion (Task 2)
151
152// New performance modules (Recommendations 1-9)
153pub mod cow_btree; // Copy-on-Write B-Tree for ordered access (Recommendation 5)
154pub mod epoch_mvcc; // Epoch-based MVCC for O(log E) version lookup (Recommendation 7)
155pub mod page_cache; // Application-level page cache with Clock-Pro (Recommendation 8)
156pub mod row_format; // Slot-based columnar row storage (Recommendation 1)
157pub mod tiered_memtable; // Tiered MemTable with deferred sorting (Recommendation 3)
158pub mod tournament_tree; // K-way merge with tournament tree (Task 2)
159pub mod vectorized_scan; // SIMD-accelerated vectorized scan engine (Recommendation 2)
160pub mod zero_copy_serde; // Zero-copy serialization for WAL (Recommendation 6)
161
162// Namespace and multi-tenancy support (Task 3)
163pub mod lazy_namespace; // Per-namespace lazy hydrate/evict
164pub mod namespace; // Namespace routing and on-disk layout
165pub mod object_store_tier; // Object-storage cold tier for immutable segments
166
167// Core utilities
168pub mod backend;
169pub mod backup;
170pub mod block_checksum; // Block checksums: data integrity validation
171pub mod bloom; // Bloom filters: probabilistic existence checks
172pub mod compression; // LZ4/Zstd compression
173pub mod dict_compression;
174pub mod direct_io;
175#[cfg(unix)]
176pub mod io_uring;
177pub mod manifest;
178pub mod memory;
179pub mod parallel_merge;
180pub mod payload;
181pub mod prefetch;
182pub mod sketches; // Approximate algorithms (HyperLogLog, CountMin, DDSketch)
183pub mod two_level_index;
184pub mod validation;
185pub mod version_store;
186pub mod zero_copy;
187
188// Re-exports for new components
189pub use columnar_compression::{
190    ColumnEncoder, DeltaEncoder, DictionaryEncoder, EncodingStats, EncodingType, RleEncoder,
191};
192pub use learned_index_integration::{
193    HybridIndex, IndexManager, IndexType, KeyStats, PointLookupExecutor,
194};
195pub use lscs::{
196    ColumnDef, ColumnGroup, ColumnType, ColumnarMemtable, Lscs, LscsConfig, LscsRecoveryStats,
197    LscsStats, TableSchema,
198};
199#[allow(deprecated)] // NOTE(review): presumably some mvcc_snapshot items below are marked #[deprecated]; confirm which, and why they are still re-exported
200pub use mvcc_snapshot::{
201    MvccStore, Snapshot as MvccSnapshot, Timestamp, TransactionManager, TxnId, TxnStatus,
202    VersionChain, VersionInfo,
203};
204pub use page_manager::{
205    DEFAULT_PAGE_SIZE, DbHeader, FORMAT_VERSION, FreePageHeader, PageId, PageManager,
206    PageManagerStats, PageType, SOCHDB_MAGIC,
207};
208pub use storage_engine::{
209    ColumnId, ColumnIterator, Row, RowId, StorageEngine, StorageEngineType, StorageStats,
210    TxnHandle, open_storage_engine,
211};
212pub use transaction::{
213    DurabilityLevel, IsolationLevel, RecoveryStats as TxnRecoveryStats, TransactionCoordinator,
214    TransactionHandle,
215};
216pub use txn_wal::{CrashRecoveryStats, TxnWal, TxnWalBuffer, TxnWalEntry, TxnWalStats};
217pub use wal_integration::{
218    GroupCommitBuffer, MvccTransactionManager, RecoveryStats, Transaction, TxnState,
219    WalStorageManager,
220};
221
222// Re-exports for performance optimization modules
223pub use adaptive_learned_index::{AdaptiveLearnedIndex, LearnedIndexStats, PiecewiseLinearModel};
224pub use adaptive_memtable::{
225    AdaptiveMemtableConfig, AdaptiveMemtableSizer, AdaptiveMemtableStats, DEFAULT_BASE_SIZE,
226    MAX_MEMTABLE_SIZE, MIN_MEMTABLE_SIZE,
227};
228pub use batch_wal::{
229    BatchAccumulator, BatchedWalReader, BatchedWalStats, BatchedWalWriter, ConcurrentBatchedWal,
230    DEFAULT_MAX_BATCH_BYTES, DEFAULT_MAX_BATCH_SIZE,
231};
232pub use clr_learned_index::{ClrIndex, ClrLookupResult, ClrStats, IndexedSortedRun};
233pub use key_buffer::{
234    ArenaKey,
235    ArenaKeyHandle,
236    BatchKeyGenerator,
237    InternedTablePrefix,
238    // Arena allocation for high-throughput key operations
239    KeyArena,
240    KeyBuffer,
241    MAX_KEY_LENGTH,
242};
243pub use lockfree_memtable::{
244    HazardDomain,
245    INLINE_VALUE_SIZE,
246    LockFreeMemTable,
247    LockFreeVersion,
248    LockFreeVersionChain,
249    // Inline value storage for reduced memory indirection
250    ValueStorage,
251};
252pub use packed_row::{
253    PackedColumnDef, PackedColumnType, PackedRow, PackedRowBuilder, PackedTableSchema,
254};
255
256// Re-exports for utilities
257pub use backend::{LocalFsBackend, ObjectMetadata, StorageBackend};
258pub use backup::{BackupManager, BackupMetadata};
259pub use block_checksum::{
260    BlockChecksumConfig, BlockChecksumStats, BlockType as BlockChecksumType, BlockWriter,
261    ChecksummedBlock,
262};
263pub use bloom::{BlockedBloomFilter, BloomFilter, LevelAdaptiveFPR, UnifiedBloomFilter};
264pub use compression::{CompressionEngine, CompressionStats, StorageTier};
265pub use manifest::{FileMetadata, LsmState, Manifest, VersionEdit};
266pub use memory::{MemoryBudget, MemoryTracker, WriteBufferManager, WriteBufferStats};
267pub use mvcc_new::{
268    ColumnGroupRef, ReadVersion, Snapshot, SnapshotGuard, VersionGuard, VersionSet,
269    VersionSetStats, VersionSetStatsSnapshot,
270};
271pub use payload::{CompressionType, PayloadStats, PayloadStore};
272pub use sketches::{AdaptiveSketch, CountMinSketch, DDSketch, ExponentialHistogram, HyperLogLog};
273pub use two_level_index::{
274    BlockIndexEntry, BlockIndexReader, FencePointer, TemporalKey, TwoLevelIndex,
275};
276pub use validation::{SSTableValidator, validate_sstable_file};
277
278// Re-exports for durable storage
279pub use durable_storage::{
280    ArenaMvccMemTable, DurableStorage, EphemeralHandle, MvccMemTable, TransactionMode,
281};
282
283// Re-exports for concurrent MVCC (Task: Concurrent Embedded)
284pub use mvcc_concurrent::{
285    ConcurrentMvcc, ConcurrentVersionChain, ConcurrentVersionEntry, HlcTimestamp, ReaderSlot,
286    VersionStore, VersionStoreStats, WriterGuard,
287};
288
289// Super Version and Copy-on-Write Version Set (mm.md Task 1)
290pub mod compaction_policy; // Compaction pickers (leveled, universal) and version pruning
291pub mod concurrent_art; // Provides ConcurrentART
292pub mod optimized_scan; // Range scanning (RangeScanner, TournamentTree)
293pub mod sstable; // SSTable format: builder, blocks, filter policies
294pub mod version_set; // SuperVersion and VersionSet (re-exported as CowVersionSet)
295pub mod wal_segment; // WAL segment management (WalSegmentManager, RecoveryIterator)
296
297// Re-exports for new performance modules (Recommendations 1-9)
298pub use compaction_policy::{
299    CompactionConfig, CompactionFile, CompactionJob, CompactionPicker, CompactionPriority,
300    CompactionReason, CompactionState, CompactionStats, CompactionStrategy,
301    LeveledCompactionPicker, RetentionConfig, UniversalCompactionPicker, VersionPruner,
302};
303pub use concurrent_art::ConcurrentART;
304pub use cow_btree::{BTreeEntry, BTreeSnapshot, CowBTree, Node, SearchResult};
305pub use epoch_mvcc::{
306    CommitResult, EpochManager, EpochMvccStore, EpochSnapshot, EpochTransaction, EpochVersionChain,
307    GcStats, StoreStats, VersionEntry,
308};
309pub use lazy_namespace::{LazyNamespaceConfig, LazyNamespaceTable};
310pub use object_store_tier::{ObjectStoreTier, ObjectStoreTierConfig, SegmentDescriptor};
311pub use optimized_scan::{
312    EntrySource, FileRange, LevelFiles, RangeScanner, ScanConfig, ScanStats, TournamentTree,
313    VersionedEntry,
314};
315pub use page_cache::{CacheStats, CachedPage, ClockProCache, PageId as CachePageId, PageState};
316pub use row_format::{Slot, SlotRow, SlotRowArena, SlotRowFlags, SlotRowHandle};
317pub use sstable::{
318    BlockBuilder, BlockCache, BlockHandle, BlockIterator, BlockType, BloomFilterPolicy,
319    FilterPolicy, FilterReader, Footer, Header, ReadOptions, RibbonFilterPolicy, SSTable,
320    SSTableBuilder, SSTableBuilderOptions, SSTableBuilderResult, SSTableFormat, Section,
321    SectionType, TableMetadata, XorFilterPolicy,
322};
323pub use tiered_memtable::{HotEntry, SortedBatch, TieredMemTable};
324pub use vectorized_scan::{
325    ColumnVector,
326    ComparisonOp,
327    DEFAULT_BATCH_SIZE,
328    Int64Comparison,
329    // SoA + Late Materialization (80/20 optimization)
330    SimdVisibilityFilter,
331    SoaBatch,
332    SoaScanIterator,
333    SoaScanStats,
334    SoaSource,
335    StreamingScanIterator,
336    ValueHandle,
337    VectorBatch,
338    VectorPredicate,
339    VectorizedScanConfig,
340    VectorizedScanStats,
341    VersionedSlice,
342};
343pub use version_set::{
344    FileMetadata as VersionFileMetadata, ImmutableMemTable, ImmutableMemTableRef, LevelMetadata,
345    SuperVersion, SuperVersionHandle, VersionSet as CowVersionSet,
346};
347pub use wal_segment::{
348    CheckpointRecord, RecoveryIterator, SegmentConfig, SegmentHeader, SegmentMetadata,
349    SegmentStats, WalEntry, WalSegmentManager,
350};
351pub use zero_copy_serde::{
352    FORMAT_VERSION as SERDE_FORMAT_VERSION, FieldDescriptor, HEADER_SIZE as SERDE_HEADER_SIZE,
353    MmapWalReader, SerdeStats, WalBatchReader, WalBatchWriter, WalEntryBuilder, WalEntryHeader,
354    WalEntryReader, WalEntryType, ZERO_COPY_MAGIC, ZeroCopyHeader,
355};
356
357// Re-exports for transaction arena and zero-copy plumbing
358pub use txn_arena::{ArenaWriteSet, BytesRef, KeyFingerprint, TxnArena, TxnWriteBuffer, WriteOp};
359
360// Re-exports for dirty tracking with batching
361pub use dirty_tracking::{BatchedDirtyTracker, DirtyEvent, DirtyTrackingStats, TxnDirtyBuffer};
362
363// Re-exports for per-table index policy
364pub use index_policy::{
365    BalancedTableIndex, IndexPolicy, SortedRun, TableIndexConfig, TableIndexRegistry,
366};
367
368// Re-exports for queue-optimized index structure
369pub use queue_index::{
370    CompositeQueueKey, QueueIndex, QueueIndexConfig, QueueIndexStats, QueueTableRegistry,
371};
372
373// Re-exports for CDC engine
374pub use cdc::{CdcConfig, CdcEmitter, CdcError, CdcEvent, CdcLog, CdcOperation, CdcSubscriber};
375
376// Re-exports for database kernel
377pub use database::{
378    ColumnDef as DbColumnDef,
379    ColumnType as DbColumnType,
380    ColumnarQueryResult, // SIMD-friendly columnar result format
381    Database,
382    DatabaseConfig,
383    GroupCommitSettings,
384    QueryBuilder,
385    QueryResult,
386    QueryRowIterator,
387    RecoveryStats as DbRecoveryStats,
388    Stats as DbStats,
389    SyncMode,
390    TableSchema as DbTableSchema,
391    TxnHandle as KernelTxnHandle,
392    VectorSearchResult,
393};