Skip to main content

sochdb_storage/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! SochDB Storage Layer
19//!
20//! Log-Structured Column Store (LSCS) with transaction-aware WAL for TOON-native data.
21//!
22//! ## Runtime Modes
23//!
24//! This crate supports two runtime modes:
25//!
26//! ### Embedded Sync Mode (like SQLite)
27//!
28//! For embedded deployments without async runtime:
29//!
30//! ```toml
31//! sochdb-storage = { version = "...", default-features = false, features = ["embedded-sync"] }
32//! ```
33//!
34//! Benefits:
35//! - ~500KB smaller binary
36//! - No async runtime overhead
37//! - Simpler embedded integration
38//!
39//! ### Async Mode (default, for servers)
40//!
41//! For server deployments with async I/O:
42//!
43//! ```toml
44//! sochdb-storage = { version = "..." }  # async enabled by default
45//! ```
46//!
47//! Benefits:
48//! - Better scalability for concurrent connections
49//! - Non-blocking I/O for server workloads
50//!
51//! ## Novel Components
52//!
53//! - **LSCS** (`lscs`): Log-Structured Column Store - columnar variant of LSM with
54//!   schema-aware compression and column-aware compaction for reduced write amplification.
55//!
56//! - **Transaction WAL** (`txn_wal`): ACID-compliant Write-Ahead Log with transaction
57//!   boundaries, commit/abort markers, and crash recovery.
58//!
59//! - **StorageEngine Trait** (`storage_engine`): Pluggable storage backend abstraction
60//!   enabling 80% I/O reduction for columnar projections (Task 1).
61//!
62//! - **Page Manager** (`page_manager`): TOON file format with magic header and O(1)
63//!   page allocation (Task 8).
64//!
65//! - **Columnar Compression** (`columnar_compression`): Type-aware encoding with
66//!   dictionary, RLE, and delta compression for 2-4× storage reduction (Task 9).
67//!
68//! ## Utility Components
69//!
70//! - **Bloom Filters** (`bloom`): Probabilistic existence checks
71//! - **Block Checksums** (`block_checksum`): Data integrity validation
72//! - **Compression** (`compression`): LZ4/Zstd compression
73//! - **Sketches** (`sketches`): Approximate algorithms (HyperLogLog, CountMin, DDSketch)
74
75// New TOON-native storage components
76pub mod actor; // Actor-based connection manager (mm.md Task 7.2)
77pub mod admission_control; // Admission control with cost model + tenant fairness (Task 6)
78#[cfg(feature = "experimental")]
79pub mod aries_recovery; // ARIES-style crash recovery (Task 1) [quarantined: unwired]
80pub mod cdc; // WAL-derived Change Data Capture (T1)
81#[cfg(feature = "experimental")]
82pub mod checkpoint; // ARIES-style checkpointing with WAL truncation (mm.md Task 1.4) [quarantined: unwired]
83pub mod columnar_compression;
84pub mod correctness_testing; // Property-based correctness testing (Task 13)
85pub mod database; // Database Kernel (shared by embedded + server)
86pub mod durability_contract; // Durability contract hardening (Task 4)
87pub mod durable_storage; // Fully wired durable storage with MVCC
88pub mod encryption; // Data-at-rest encryption (AES-256-GCM-SIV envelope) — now wired (Task 3B)
89pub mod ffi;
90pub mod group_commit; // Event-driven Group Commit (Task 4)
91pub mod hlc; // Hybrid Logical Clock for commit timestamps (mm.md Task 1.3)
92pub mod hybrid_store; // PAX hybrid row-column storage (mm.md Task 4.1)
93pub mod io_isolation; // I/O isolation policy with cache partitioning (Task 5)
94pub mod ipc; // IPC Protocol with multiplexing (mm.md Task 7.1)
95#[cfg(unix)]
96pub mod ipc_server; // Unix Socket IPC Server (Task 3)
97pub mod keyring; // KEK/DEK envelope: HKDF-derived DEK, wrapped + persisted, fail-closed (Task 3B)
98pub mod learned_index_integration;
99pub mod lock; // Advisory file locking for database exclusivity
100pub mod lscs;
101pub mod mvcc_concurrent; // Concurrent MVCC for multi-reader single-writer (Task: Concurrent Embedded)
102#[deprecated(
103    note = "Unused duplicate; live MVCC is mvcc_concurrent::ConcurrentMvcc + durable_storage::MvccMemTable. Scheduled for removal (Task 2 consolidation)."
104)]
105pub mod mvcc_new;
106pub mod mvcc_snapshot;
107pub mod page_manager;
108#[cfg(feature = "experimental")]
109pub mod pitr; // Point-in-Time Recovery with WAL archiving (Task 11) [quarantined: unwired]
110#[cfg(feature = "experimental")]
111pub mod production_wal; // Production WAL with ARIES recovery (mm.md Task 3) [quarantined: unwired]
112pub mod ssi; // Serializable Snapshot Isolation (Task 2)
113#[deprecated(
114    note = "Unused; SSI lives in ssi/MvccManager. Scheduled for removal (Task 2 consolidation)."
115)]
116pub mod ssi_scaling; // SSI scaling guardrails with range locks (Task 7)
117pub mod storage_engine;
118pub mod streaming_iterator; // Streaming Iterator Architecture (mm.md Task 4)
119pub mod supervisor; // Supervised background workers (panic-contained restart) (Task 4)
120pub mod transaction; // Unified Transaction Coordinator trait and types
121pub mod txn_arena; // Transaction-scoped arena with zero-copy key/value plumbing
122pub mod txn_wal;
123pub mod upgrade_contract; // Upgrade compatibility contract (Task 12)
124#[cfg(feature = "experimental")]
125pub mod wal_fencing; // Epoch-based WAL fencing for split-brain detection [quarantined: unwired]
126pub mod wal_integration;
127pub mod zero_copy_safety; // Zero-Copy Validation Layer (Task 5) // FFI bindings for Python SDK
128
129// Performance optimization modules
130#[deprecated(
131    note = "Unused duplicate; live learned index is learned_index_integration. Scheduled for removal (Task 2 consolidation)."
132)]
133pub mod adaptive_learned_index;
134#[deprecated(
135    note = "Unused duplicate memtable; live memtables are lscs::ColumnarMemtable + durable_storage::MvccMemTable. Scheduled for removal (Task 2 consolidation)."
136)]
137pub mod adaptive_memtable; // Adaptive memtable sizing with memory pressure (Task 10)
138pub mod batch_wal; // Batched WAL with vectored I/O (Task 3)
139pub mod deferred_index; // Deferred sorted index with LSM-style compaction (Rec 2)
140pub mod dirty_tracking; // Batched dirty tracking with MPSC queue
141pub mod index_policy; // Per-table index policy
142pub mod key_buffer; // Cache-line aligned key buffer (Task 2)
143#[deprecated(
144    note = "Unused duplicate memtable; live memtables are lscs::ColumnarMemtable + durable_storage::MvccMemTable. Scheduled for removal (Task 2 consolidation)."
145)]
146pub mod lockfree_memtable; // Lock-free read path with hazard pointers (Task 4)
147pub mod packed_row; // Unified row storage with delta encoding (Task 1)
148pub mod queue_index; // Queue-optimized index structure (Task: Queue Index Policy)
149
150// PhD-Level Architectural Optimizations (December 2025)
151#[deprecated(
152    note = "Unused duplicate; live learned index is learned_index_integration. Scheduled for removal (Task 2 consolidation)."
153)]
154pub mod clr_learned_index; // CLR Learned Index for sorted runs (Task 3)
155#[cfg(feature = "experimental")]
156pub mod columnar_wal; // Columnar WAL Layout (Task 4) [quarantined: unwired]
157pub mod epoch_arena; // Epoch-Partitioned Key Arena (Task 1)
158pub mod generational_slab; // Generational Slab Allocator (Task 5)
159#[cfg(feature = "experimental")]
160pub mod hierarchical_ts; // Hierarchical Timestamp Oracle (Task 9) [quarantined: unwired]
161#[cfg(all(unix, feature = "experimental"))]
162pub mod io_uring_wal; // [quarantined: unwired]
163pub mod lockfree_epoch; // Lock-Free Epoch Tracking (Task 3)
164pub mod polymorphic_value; // Polymorphic Value Encoding (Task 12)
165#[cfg(feature = "experimental")]
166pub mod rl_workload; // RL Workload Classifier (Task 10) [quarantined: unwired]
167pub mod shard_coalesced; // Shard-Coalesced Batch DashMap (Task 6)
168#[deprecated(
169    note = "Unused duplicate memtable; live memtables are lscs::ColumnarMemtable + durable_storage::MvccMemTable. Scheduled for removal (Task 2 consolidation)."
170)]
171pub mod stratified_skiplist; // Stratified SkipList with Deferred Promotion (Task 2) // io_uring WAL Submission (Task 11)
172
173// New performance modules (Recommendations 1-9)
174pub mod cow_btree; // Copy-on-Write B-Tree for ordered access (Recommendation 5)
175pub mod epoch_mvcc; // Epoch-based MVCC for O(log E) version lookup (Recommendation 7)
176pub mod page_cache; // Application-level page cache with Clock-Pro (Recommendation 8)
177pub mod row_format; // Slot-based columnar row storage (Recommendation 1)
178pub mod tiered_memtable; // Tiered MemTable with deferred sorting (Recommendation 3)
179pub mod tournament_tree; // K-way merge with tournament tree (Task 2)
180pub mod vectorized_scan; // SIMD-accelerated vectorized scan engine (Recommendation 2)
181pub mod zero_copy_serde; // Zero-copy serialization for WAL (Recommendation 6)
182
183// Namespace and multi-tenancy support (Task 3)
184pub mod lazy_namespace; // Per-namespace lazy hydrate/evict
185pub mod namespace; // Namespace routing and on-disk layout
186pub mod object_store_tier; // Object-storage cold tier for immutable segments
187
188// Core utilities
189pub mod backend;
190pub mod backup;
191pub mod block_checksum;
192pub mod bloom;
193pub mod compression;
194pub mod dict_compression;
195pub mod direct_io;
196#[cfg(unix)]
197pub mod io_uring;
198pub mod manifest;
199pub mod memory;
200pub mod parallel_merge;
201pub mod payload;
202pub mod prefetch;
203pub mod sketches;
204pub mod two_level_index;
205pub mod validation;
206pub mod version_store;
207pub mod zero_copy;
208
209// Re-exports for new components
210pub use columnar_compression::{
211    ColumnEncoder, DeltaEncoder, DictionaryEncoder, EncodingStats, EncodingType, RleEncoder,
212};
213pub use learned_index_integration::{
214    HybridIndex, IndexManager, IndexType, KeyStats, PointLookupExecutor,
215};
216pub use lscs::{
217    ColumnDef, ColumnGroup, ColumnType, ColumnarMemtable, Lscs, LscsConfig, LscsRecoveryStats,
218    LscsStats, TableSchema,
219};
220#[allow(deprecated)]
221pub use mvcc_snapshot::{
222    MvccStore, Snapshot as MvccSnapshot, Timestamp, TransactionManager, TxnId, TxnStatus,
223    VersionChain, VersionInfo,
224};
225pub use page_manager::{
226    DEFAULT_PAGE_SIZE, DbHeader, FORMAT_VERSION, FreePageHeader, PageId, PageManager,
227    PageManagerStats, PageType, SOCHDB_MAGIC,
228};
229pub use storage_engine::{
230    ColumnId, ColumnIterator, Row, RowId, StorageEngine, StorageEngineType, StorageStats,
231    TxnHandle, open_storage_engine,
232};
233pub use transaction::{
234    DurabilityLevel, IsolationLevel, RecoveryStats as TxnRecoveryStats, TransactionCoordinator,
235    TransactionHandle,
236};
237pub use txn_wal::{CrashRecoveryStats, TxnWal, TxnWalBuffer, TxnWalEntry, TxnWalStats};
238pub use wal_integration::{
239    GroupCommitBuffer, MvccTransactionManager, RecoveryStats, Transaction, TxnState,
240    WalStorageManager,
241};
242
243// Re-exports for performance optimization modules
244#[allow(deprecated)]
245pub use adaptive_learned_index::{AdaptiveLearnedIndex, LearnedIndexStats, PiecewiseLinearModel};
246#[allow(deprecated)]
247pub use adaptive_memtable::{
248    AdaptiveMemtableConfig, AdaptiveMemtableSizer, AdaptiveMemtableStats, DEFAULT_BASE_SIZE,
249    MAX_MEMTABLE_SIZE, MIN_MEMTABLE_SIZE,
250};
251pub use batch_wal::{
252    BatchAccumulator, BatchedWalReader, BatchedWalStats, BatchedWalWriter, ConcurrentBatchedWal,
253    DEFAULT_MAX_BATCH_BYTES, DEFAULT_MAX_BATCH_SIZE,
254};
255#[allow(deprecated)]
256pub use clr_learned_index::{ClrIndex, ClrLookupResult, ClrStats, IndexedSortedRun};
257pub use key_buffer::{
258    ArenaKey,
259    ArenaKeyHandle,
260    BatchKeyGenerator,
261    InternedTablePrefix,
262    // Arena allocation for high-throughput key operations
263    KeyArena,
264    KeyBuffer,
265    MAX_KEY_LENGTH,
266};
267#[allow(deprecated)]
268pub use lockfree_memtable::{
269    HazardDomain,
270    INLINE_VALUE_SIZE,
271    LockFreeMemTable,
272    LockFreeVersion,
273    LockFreeVersionChain,
274    // Inline value storage for reduced memory indirection
275    ValueStorage,
276};
277pub use packed_row::{
278    PackedColumnDef, PackedColumnType, PackedRow, PackedRowBuilder, PackedTableSchema,
279};
280
281// Re-exports for utilities
282pub use backend::{LocalFsBackend, ObjectMetadata, StorageBackend};
283pub use backup::{BackupManager, BackupMetadata};
284pub use block_checksum::{
285    BlockChecksumConfig, BlockChecksumStats, BlockType as BlockChecksumType, BlockWriter,
286    ChecksummedBlock,
287};
288pub use bloom::{BlockedBloomFilter, BloomFilter, LevelAdaptiveFPR, UnifiedBloomFilter};
289pub use compression::{CompressionEngine, CompressionStats, StorageTier};
290pub use manifest::{FileMetadata, LsmState, Manifest, VersionEdit};
291pub use memory::{MemoryBudget, MemoryTracker, WriteBufferManager, WriteBufferStats};
292#[allow(deprecated)]
293pub use mvcc_new::{
294    ColumnGroupRef, ReadVersion, Snapshot, SnapshotGuard, VersionGuard, VersionSet,
295    VersionSetStats, VersionSetStatsSnapshot,
296};
297pub use payload::{CompressionType, PayloadStats, PayloadStore};
298pub use sketches::{AdaptiveSketch, CountMinSketch, DDSketch, ExponentialHistogram, HyperLogLog};
299pub use two_level_index::{
300    BlockIndexEntry, BlockIndexReader, FencePointer, TemporalKey, TwoLevelIndex,
301};
302pub use validation::{SSTableValidator, validate_sstable_file};
303
304// Re-exports for durable storage
305pub use durable_storage::{
306    ArenaMvccMemTable, DurableStorage, EphemeralHandle, MvccMemTable, TransactionMode,
307};
308
309// ============================================================================
310// Truth-in-capabilities: durability feature matrix (Task 3A)
311// ============================================================================
312
/// Durability features actually wired into THIS build's live storage path.
///
/// Prose like "production-grade" must not be read as implying features that are
/// quarantined behind the empty, non-default `experimental` feature and
/// unreferenced by the live write/recovery path. Query this matrix instead of
/// trusting documentation strings.
///
/// The type is a plain `Copy` value of boolean flags. `Hash` is derived alongside
/// the equality traits so a matrix can be used as a set member or map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct DurabilityCapabilities {
    /// Crash-consistent WAL recovery (txn_wal / RecoveryStats / durability_contract). Live.
    pub crash_recovery: bool,
    /// At-rest encryption (AES-256-GCM-SIV envelope). Wired into the live WAL path
    /// (Task 3B): inactive by default, active per-database when a key is configured.
    /// The build-level `durability_capabilities()` reports the DEFAULT (false);
    /// query `DurableStorage::durability_capabilities()` for the live per-instance
    /// state.
    pub at_rest_encryption: bool,
    /// Point-in-time recovery via WAL archiving. `pitr` module — substrate landing
    /// incrementally (Task 3B); reported true per-instance once archiving is active.
    pub point_in_time_recovery: bool,
    /// ARIES-style checkpointing. `aries_recovery` / `checkpoint` modules, quarantined/unwired.
    pub aries_checkpoint: bool,
    /// Epoch-based WAL fencing (split-brain detection). `wal_fencing` module, quarantined/unwired.
    pub wal_fencing: bool,
}
337
338/// The DEFAULT durability capabilities of the current build — a function of what
339/// is actually wired, not of documentation. At-rest encryption is now wired into
340/// the live WAL path (Task 3B) but is INACTIVE unless a key is configured, so the
341/// build default reports it `false`. For the live per-database state (which
342/// reflects whether encryption is actually active on that instance), call
343/// [`durable_storage::DurableStorage::durability_capabilities`].
344pub const fn durability_capabilities() -> DurabilityCapabilities {
345    DurabilityCapabilities {
346        crash_recovery: true,
347        at_rest_encryption: false,
348        point_in_time_recovery: false,
349        aries_checkpoint: false,
350        wal_fencing: false,
351    }
352}
353
#[cfg(test)]
mod durability_capabilities_tests {
    use super::{DurabilityCapabilities, durability_capabilities};

    /// The build-default matrix must advertise only what the live path provides
    /// without any per-database configuration.
    #[test]
    fn live_build_durability_matrix_is_honest() {
        let caps = durability_capabilities();
        // The one durability guarantee the live path actually provides.
        assert!(
            caps.crash_recovery,
            "live path must provide crash-consistent WAL recovery"
        );
        // Wired into the live WAL path, but inactive until a key is configured —
        // so the build-level default must still report it as absent.
        assert!(
            !caps.at_rest_encryption,
            "at-rest encryption is inactive by default and must not be advertised"
        );
        // Not active on the default live build — must NOT be advertised as present.
        assert!(
            !caps.point_in_time_recovery,
            "point-in-time recovery must not be advertised on the default build"
        );
        assert!(
            !caps.aries_checkpoint,
            "ARIES checkpointing is quarantined/unwired and must not be advertised"
        );
        assert!(
            !caps.wal_fencing,
            "WAL fencing is quarantined/unwired and must not be advertised"
        );
    }

    /// Pins the `const fn` contract: the matrix must stay usable in `const` context.
    #[test]
    fn durability_matrix_is_const_evaluable() {
        const CAPS: DurabilityCapabilities = durability_capabilities();
        assert_eq!(CAPS, durability_capabilities());
    }
}
373
374// Re-exports for concurrent MVCC (Task: Concurrent Embedded)
375pub use mvcc_concurrent::{
376    ConcurrentMvcc, ConcurrentVersionChain, ConcurrentVersionEntry, HlcTimestamp, ReaderSlot,
377    VersionStore, VersionStoreStats, WriterGuard,
378};
379
380// Super Version and Copy-on-Write Version Set (mm.md Task 1)
381pub mod compaction_policy;
382pub mod concurrent_art;
383pub mod optimized_scan;
384pub mod sstable;
385pub mod version_set;
386pub mod wal_segment;
387
388// Re-exports for new performance modules (Recommendations 1-9)
389pub use compaction_policy::{
390    CompactionConfig, CompactionFile, CompactionJob, CompactionPicker, CompactionPriority,
391    CompactionReason, CompactionState, CompactionStats, CompactionStrategy,
392    LeveledCompactionPicker, RetentionConfig, UniversalCompactionPicker, VersionPruner,
393};
394pub use concurrent_art::ConcurrentART;
395pub use cow_btree::{BTreeEntry, BTreeSnapshot, CowBTree, Node, SearchResult};
396pub use epoch_mvcc::{
397    CommitResult, EpochManager, EpochMvccStore, EpochSnapshot, EpochTransaction, EpochVersionChain,
398    GcStats, StoreStats, VersionEntry,
399};
400pub use lazy_namespace::{LazyNamespaceConfig, LazyNamespaceTable};
401pub use object_store_tier::{ObjectStoreTier, ObjectStoreTierConfig, SegmentDescriptor};
402pub use optimized_scan::{
403    EntrySource, FileRange, LevelFiles, RangeScanner, ScanConfig, ScanStats, TournamentTree,
404    VersionedEntry,
405};
406pub use page_cache::{CacheStats, CachedPage, ClockProCache, PageId as CachePageId, PageState};
407pub use row_format::{Slot, SlotRow, SlotRowArena, SlotRowFlags, SlotRowHandle};
408pub use sstable::{
409    BlockBuilder, BlockCache, BlockHandle, BlockIterator, BlockType, BloomFilterPolicy,
410    FilterPolicy, FilterReader, Footer, Header, ReadOptions, RibbonFilterPolicy, SSTable,
411    SSTableBuilder, SSTableBuilderOptions, SSTableBuilderResult, SSTableFormat, Section,
412    SectionType, TableMetadata, XorFilterPolicy,
413};
414pub use tiered_memtable::{HotEntry, SortedBatch, TieredMemTable};
415pub use vectorized_scan::{
416    ColumnVector,
417    ComparisonOp,
418    DEFAULT_BATCH_SIZE,
419    Int64Comparison,
420    // SoA + Late Materialization (80/20 optimization)
421    SimdVisibilityFilter,
422    SoaBatch,
423    SoaScanIterator,
424    SoaScanStats,
425    SoaSource,
426    StreamingScanIterator,
427    ValueHandle,
428    VectorBatch,
429    VectorPredicate,
430    VectorizedScanConfig,
431    VectorizedScanStats,
432    VersionedSlice,
433};
434pub use version_set::{
435    FileMetadata as VersionFileMetadata, ImmutableMemTable, ImmutableMemTableRef, LevelMetadata,
436    SuperVersion, SuperVersionHandle, VersionSet as CowVersionSet,
437};
438pub use wal_segment::{
439    CheckpointRecord, RecoveryIterator, SegmentConfig, SegmentHeader, SegmentMetadata,
440    SegmentStats, WalEntry, WalSegmentManager,
441};
442pub use zero_copy_serde::{
443    FORMAT_VERSION as SERDE_FORMAT_VERSION, FieldDescriptor, HEADER_SIZE as SERDE_HEADER_SIZE,
444    MmapWalReader, SerdeStats, WalBatchReader, WalBatchWriter, WalEntryBuilder, WalEntryHeader,
445    WalEntryReader, WalEntryType, ZERO_COPY_MAGIC, ZeroCopyHeader,
446};
447
448// Re-exports for transaction arena and zero-copy plumbing
449pub use txn_arena::{ArenaWriteSet, BytesRef, KeyFingerprint, TxnArena, TxnWriteBuffer, WriteOp};
450
451// Re-exports for dirty tracking with batching
452pub use dirty_tracking::{BatchedDirtyTracker, DirtyEvent, DirtyTrackingStats, TxnDirtyBuffer};
453
454// Re-exports for per-table index policy
455pub use index_policy::{
456    BalancedTableIndex, IndexPolicy, SortedRun, TableIndexConfig, TableIndexRegistry,
457};
458
459// Re-exports for queue-optimized index structure
460pub use queue_index::{
461    CompositeQueueKey, QueueIndex, QueueIndexConfig, QueueIndexStats, QueueTableRegistry,
462};
463
464// Re-exports for CDC engine
465pub use cdc::{CdcConfig, CdcEmitter, CdcError, CdcEvent, CdcLog, CdcOperation, CdcSubscriber};
466
467// Re-exports for database kernel
468pub use database::{
469    ColumnDef as DbColumnDef,
470    ColumnType as DbColumnType,
471    ColumnarQueryResult, // SIMD-friendly columnar result format
472    Database,
473    DatabaseConfig,
474    GroupCommitSettings,
475    QueryBuilder,
476    QueryResult,
477    QueryRowIterator,
478    RecoveryStats as DbRecoveryStats,
479    Stats as DbStats,
480    SyncMode,
481    TableSchema as DbTableSchema,
482    TxnHandle as KernelTxnHandle,
483    VectorSearchResult,
484};