Skip to main content

sochdb_storage/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! SochDB Storage Layer
19//!
20//! Log-Structured Column Store (LSCS) with transaction-aware WAL for TOON-native data.
21//!
22//! ## Runtime Modes
23//!
24//! This crate supports two runtime modes:
25//!
26//! ### Embedded Sync Mode (like SQLite)
27//!
28//! For embedded deployments without async runtime:
29//!
30//! ```toml
31//! sochdb-storage = { version = "...", default-features = false, features = ["embedded-sync"] }
32//! ```
33//!
34//! Benefits:
35//! - ~500KB smaller binary
36//! - No async runtime overhead
37//! - Simpler embedded integration
38//!
39//! ### Async Mode (default, for servers)
40//!
41//! For server deployments with async I/O:
42//!
43//! ```toml
44//! sochdb-storage = { version = "..." }  # async enabled by default
45//! ```
46//!
47//! Benefits:
48//! - Better scalability for concurrent connections
49//! - Non-blocking I/O for server workloads
50//!
51//! ## Novel Components
52//!
53//! - **LSCS** (`lscs`): Log-Structured Column Store - columnar variant of LSM with
54//!   schema-aware compression and column-aware compaction for reduced write amplification.
55//!
56//! - **Transaction WAL** (`txn_wal`): ACID-compliant Write-Ahead Log with transaction
57//!   boundaries, commit/abort markers, and crash recovery.
58//!
59//! - **StorageEngine Trait** (`storage_engine`): Pluggable storage backend abstraction
60//!   enabling 80% I/O reduction for columnar projections (Task 1).
61//!
62//! - **Page Manager** (`page_manager`): TOON file format with magic header and O(1)
63//!   page allocation (Task 8).
64//!
65//! - **Columnar Compression** (`columnar_compression`): Type-aware encoding with
66//!   dictionary, RLE, and delta compression for 2-4× storage reduction (Task 9).
67//!
68//! ## Utility Components
69//!
70//! - **Bloom Filters** (`bloom`): Probabilistic existence checks
71//! - **Block Checksums** (`block_checksum`): Data integrity validation
72//! - **Compression** (`compression`): LZ4/Zstd compression
73//! - **Sketches** (`sketches`): Approximate algorithms (HyperLogLog, CountMin, DDSketch)
74
75// New TOON-native storage components
76pub mod actor; // Actor-based connection manager (mm.md Task 7.2)
77pub mod admission_control; // Admission control with cost model + tenant fairness (Task 6)
78#[cfg(feature = "experimental")]
79pub mod aries_recovery; // ARIES-style crash recovery (Task 1) [quarantined: unwired]
80pub mod cdc; // WAL-derived Change Data Capture (T1)
81#[cfg(feature = "experimental")]
82pub mod checkpoint; // ARIES-style checkpointing with WAL truncation (mm.md Task 1.4) [quarantined: unwired]
83pub mod columnar_compression;
84pub mod correctness_testing; // Property-based correctness testing (Task 13)
85pub mod database; // Database Kernel (shared by embedded + server)
86pub mod durability_contract; // Durability contract hardening (Task 4)
87pub mod durable_storage; // Fully wired durable storage with MVCC
88pub mod encryption; // Data-at-rest encryption (AES-256-GCM-SIV envelope) — now wired (Task 3B)
89pub mod ffi;
90pub mod group_commit; // Event-driven Group Commit (Task 4)
91pub mod hlc; // Hybrid Logical Clock for commit timestamps (mm.md Task 1.3)
92pub mod hybrid_store; // PAX hybrid row-column storage (mm.md Task 4.1)
93pub mod io_isolation; // I/O isolation policy with cache partitioning (Task 5)
94pub mod ipc; // IPC Protocol with multiplexing (mm.md Task 7.1)
95#[cfg(unix)]
96pub mod ipc_server; // Unix Socket IPC Server (Task 3)
97pub mod keyring; // KEK/DEK envelope: HKDF-derived DEK, wrapped + persisted, fail-closed (Task 3B)
98pub mod learned_index_integration;
99pub mod lock; // Advisory file locking for database exclusivity
100pub mod lscs;
101pub mod mvcc_concurrent; // Concurrent MVCC for multi-reader single-writer (Task: Concurrent Embedded)
102#[deprecated(
103    note = "Unused duplicate; live MVCC is mvcc_concurrent::ConcurrentMvcc + durable_storage::MvccMemTable. Scheduled for removal (Task 2 consolidation)."
104)]
105pub mod mvcc_new;
106pub mod mvcc_snapshot;
107pub mod page_manager;
108#[cfg(feature = "experimental")]
109pub mod pitr; // Point-in-Time Recovery with WAL archiving (Task 11) [quarantined: unwired]
110#[cfg(feature = "experimental")]
111pub mod production_wal; // Production WAL with ARIES recovery (mm.md Task 3) [quarantined: unwired]
112pub mod ssi; // Serializable Snapshot Isolation (Task 2)
113#[deprecated(
114    note = "Unused; SSI lives in ssi/MvccManager. Scheduled for removal (Task 2 consolidation)."
115)]
116pub mod ssi_scaling; // SSI scaling guardrails with range locks (Task 7)
117pub mod storage_engine;
118pub mod streaming_iterator; // Streaming Iterator Architecture (mm.md Task 4)
119pub mod supervisor; // Supervised background workers (panic-contained restart) (Task 4)
120pub mod transaction; // Unified Transaction Coordinator trait and types
121pub mod txn_arena; // Transaction-scoped arena with zero-copy key/value plumbing
122pub mod txn_wal;
123pub mod upgrade_contract; // Upgrade compatibility contract (Task 12)
124#[cfg(feature = "experimental")]
125pub mod wal_fencing; // Epoch-based WAL fencing for split-brain detection [quarantined: unwired]
126pub mod wal_integration;
127pub mod wal_manifest; // Durable PITR anchor (last-checkpoint LSN + DB identity), crash-safe (Task 3B PITR)
128pub mod zero_copy_safety; // Zero-Copy Validation Layer (Task 5) // FFI bindings for Python SDK
129
130// Performance optimization modules
131#[deprecated(
132    note = "Unused duplicate; live learned index is learned_index_integration. Scheduled for removal (Task 2 consolidation)."
133)]
134pub mod adaptive_learned_index;
135#[deprecated(
136    note = "Unused duplicate memtable; live memtables are lscs::ColumnarMemtable + durable_storage::MvccMemTable. Scheduled for removal (Task 2 consolidation)."
137)]
138pub mod adaptive_memtable; // Adaptive memtable sizing with memory pressure (Task 10)
139pub mod batch_wal; // Batched WAL with vectored I/O (Task 3)
140pub mod deferred_index; // Deferred sorted index with LSM-style compaction (Rec 2)
141pub mod dirty_tracking; // Batched dirty tracking with MPSC queue
142pub mod index_policy; // Per-table index policy
143pub mod key_buffer; // Cache-line aligned key buffer (Task 2)
144#[deprecated(
145    note = "Unused duplicate memtable; live memtables are lscs::ColumnarMemtable + durable_storage::MvccMemTable. Scheduled for removal (Task 2 consolidation)."
146)]
147pub mod lockfree_memtable; // Lock-free read path with hazard pointers (Task 4)
pub mod packed_row; // Unified row storage with delta encoding (Task 1)
pub mod queue_index; // Queue-optimized index structure (Task: Queue Index Policy)
150
151// PhD-Level Architectural Optimizations (December 2025)
152#[deprecated(
153    note = "Unused duplicate; live learned index is learned_index_integration. Scheduled for removal (Task 2 consolidation)."
154)]
155pub mod clr_learned_index; // CLR Learned Index for sorted runs (Task 3)
156#[cfg(feature = "experimental")]
157pub mod columnar_wal; // Columnar WAL Layout (Task 4) [quarantined: unwired]
158pub mod epoch_arena; // Epoch-Partitioned Key Arena (Task 1)
159pub mod generational_slab; // Generational Slab Allocator (Task 5)
160#[cfg(feature = "experimental")]
161pub mod hierarchical_ts; // Hierarchical Timestamp Oracle (Task 9) [quarantined: unwired]
162#[cfg(all(unix, feature = "experimental"))]
163pub mod io_uring_wal; // [quarantined: unwired]
164pub mod lockfree_epoch; // Lock-Free Epoch Tracking (Task 3)
165pub mod polymorphic_value; // Polymorphic Value Encoding (Task 12)
166#[cfg(feature = "experimental")]
167pub mod rl_workload; // RL Workload Classifier (Task 10) [quarantined: unwired]
168pub mod shard_coalesced; // Shard-Coalesced Batch DashMap (Task 6)
169#[deprecated(
170    note = "Unused duplicate memtable; live memtables are lscs::ColumnarMemtable + durable_storage::MvccMemTable. Scheduled for removal (Task 2 consolidation)."
171)]
172pub mod stratified_skiplist; // Stratified SkipList with Deferred Promotion (Task 2) // io_uring WAL Submission (Task 11)
173
174// New performance modules (Recommendations 1-9)
175pub mod cow_btree; // Copy-on-Write B-Tree for ordered access (Recommendation 5)
176pub mod epoch_mvcc; // Epoch-based MVCC for O(log E) version lookup (Recommendation 7)
177pub mod page_cache; // Application-level page cache with Clock-Pro (Recommendation 8)
178pub mod row_format; // Slot-based columnar row storage (Recommendation 1)
179pub mod tiered_memtable; // Tiered MemTable with deferred sorting (Recommendation 3)
180pub mod tournament_tree; // K-way merge with tournament tree (Task 2)
181pub mod vectorized_scan; // SIMD-accelerated vectorized scan engine (Recommendation 2)
182pub mod zero_copy_serde; // Zero-copy serialization for WAL (Recommendation 6)
183
184// Namespace and multi-tenancy support (Task 3)
185pub mod lazy_namespace; // Per-namespace lazy hydrate/evict
186pub mod namespace; // Namespace routing and on-disk layout
187pub mod object_store_tier; // Object-storage cold tier for immutable segments
188
189// Core utilities
190pub mod backend;
191pub mod backup;
192pub mod block_checksum;
193pub mod bloom;
194pub mod compression;
195pub mod dict_compression;
196pub mod direct_io;
197#[cfg(unix)]
198pub mod io_uring;
199pub mod manifest;
200pub mod memory;
201pub mod parallel_merge;
202pub mod payload;
203pub mod prefetch;
204pub mod sketches;
205pub mod two_level_index;
206pub mod validation;
207pub mod version_store;
208pub mod zero_copy;
209
210// Re-exports for new components
211pub use columnar_compression::{
212    ColumnEncoder, DeltaEncoder, DictionaryEncoder, EncodingStats, EncodingType, RleEncoder,
213};
214pub use learned_index_integration::{
215    HybridIndex, IndexManager, IndexType, KeyStats, PointLookupExecutor,
216};
217pub use lscs::{
218    ColumnDef, ColumnGroup, ColumnType, ColumnarMemtable, Lscs, LscsConfig, LscsRecoveryStats,
219    LscsStats, TableSchema,
220};
221#[allow(deprecated)]
222pub use mvcc_snapshot::{
223    MvccStore, Snapshot as MvccSnapshot, Timestamp, TransactionManager, TxnId, TxnStatus,
224    VersionChain, VersionInfo,
225};
226pub use page_manager::{
227    DEFAULT_PAGE_SIZE, DbHeader, FORMAT_VERSION, FreePageHeader, PageId, PageManager,
228    PageManagerStats, PageType, SOCHDB_MAGIC,
229};
230pub use storage_engine::{
231    ColumnId, ColumnIterator, Row, RowId, StorageEngine, StorageEngineType, StorageStats,
232    TxnHandle, open_storage_engine,
233};
234pub use transaction::{
235    DurabilityLevel, IsolationLevel, RecoveryStats as TxnRecoveryStats, TransactionCoordinator,
236    TransactionHandle,
237};
238pub use txn_wal::{
239    CrashRecoveryStats, RecoveryTarget, TxnWal, TxnWalBuffer, TxnWalEntry, TxnWalStats,
240};
241pub use wal_integration::{
242    GroupCommitBuffer, MvccTransactionManager, RecoveryStats, Transaction, TxnState,
243    WalStorageManager,
244};
245pub use wal_manifest::WalManifest;
246
247// Re-exports for performance optimization modules
248#[allow(deprecated)]
249pub use adaptive_learned_index::{AdaptiveLearnedIndex, LearnedIndexStats, PiecewiseLinearModel};
250#[allow(deprecated)]
251pub use adaptive_memtable::{
252    AdaptiveMemtableConfig, AdaptiveMemtableSizer, AdaptiveMemtableStats, DEFAULT_BASE_SIZE,
253    MAX_MEMTABLE_SIZE, MIN_MEMTABLE_SIZE,
254};
255pub use batch_wal::{
256    BatchAccumulator, BatchedWalReader, BatchedWalStats, BatchedWalWriter, ConcurrentBatchedWal,
257    DEFAULT_MAX_BATCH_BYTES, DEFAULT_MAX_BATCH_SIZE,
258};
259#[allow(deprecated)]
260pub use clr_learned_index::{ClrIndex, ClrLookupResult, ClrStats, IndexedSortedRun};
261pub use key_buffer::{
262    ArenaKey,
263    ArenaKeyHandle,
264    BatchKeyGenerator,
265    InternedTablePrefix,
266    // Arena allocation for high-throughput key operations
267    KeyArena,
268    KeyBuffer,
269    MAX_KEY_LENGTH,
270};
271#[allow(deprecated)]
272pub use lockfree_memtable::{
273    HazardDomain,
274    INLINE_VALUE_SIZE,
275    LockFreeMemTable,
276    LockFreeVersion,
277    LockFreeVersionChain,
278    // Inline value storage for reduced memory indirection
279    ValueStorage,
280};
281pub use packed_row::{
282    PackedColumnDef, PackedColumnType, PackedRow, PackedRowBuilder, PackedTableSchema,
283};
284
285// Re-exports for utilities
286pub use backend::{LocalFsBackend, ObjectMetadata, StorageBackend};
287pub use backup::{BackupManager, BackupMetadata};
288pub use block_checksum::{
289    BlockChecksumConfig, BlockChecksumStats, BlockType as BlockChecksumType, BlockWriter,
290    ChecksummedBlock,
291};
292pub use bloom::{BlockedBloomFilter, BloomFilter, LevelAdaptiveFPR, UnifiedBloomFilter};
293pub use compression::{CompressionEngine, CompressionStats, StorageTier};
294pub use manifest::{FileMetadata, LsmState, Manifest, VersionEdit};
295pub use memory::{MemoryBudget, MemoryTracker, WriteBufferManager, WriteBufferStats};
296#[allow(deprecated)]
297pub use mvcc_new::{
298    ColumnGroupRef, ReadVersion, Snapshot, SnapshotGuard, VersionGuard, VersionSet,
299    VersionSetStats, VersionSetStatsSnapshot,
300};
301pub use payload::{CompressionType, PayloadStats, PayloadStore};
302pub use sketches::{AdaptiveSketch, CountMinSketch, DDSketch, ExponentialHistogram, HyperLogLog};
303pub use two_level_index::{
304    BlockIndexEntry, BlockIndexReader, FencePointer, TemporalKey, TwoLevelIndex,
305};
306pub use validation::{SSTableValidator, validate_sstable_file};
307
308// Re-exports for durable storage
309pub use durable_storage::{
310    ArenaMvccMemTable, DurableStorage, EphemeralHandle, MvccMemTable, StorageEncryption,
311    TransactionMode,
312};
313// At-rest encryption public surface (Task 3B), reachable from the crate root
314// alongside DurableStorage::open_with_encryption / Database::open_with_config_and_encryption.
315pub use encryption::{EncryptionEngine, EncryptionError, EncryptionKey, generate_key};
316pub use keyring::EncryptionState;
317
318// ============================================================================
319// Truth-in-capabilities: durability feature matrix (Task 3A)
320// ============================================================================
321
/// Truthful matrix of the durability features wired into THIS build's live
/// storage path.
///
/// Wording such as "production-grade" must not be taken to cover features that
/// sit behind the empty, non-default `experimental` feature and that the live
/// write/recovery path never references. Consult these flags rather than
/// relying on documentation strings.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct DurabilityCapabilities {
    /// Crash-consistent recovery from the WAL (txn_wal / RecoveryStats /
    /// durability_contract). Live.
    pub crash_recovery: bool,
    /// Encryption at rest (AES-256-GCM-SIV envelope). Wired into the live WAL
    /// path (Task 3B): off by default, on per-database once a key is
    /// configured. The build-level `durability_capabilities()` reports the
    /// DEFAULT (false); ask `DurableStorage::durability_capabilities()` for the
    /// live per-instance state.
    pub at_rest_encryption: bool,
    /// Point-in-time recovery through WAL archiving. `pitr` module — substrate
    /// landing incrementally (Task 3B); reported true per-instance once
    /// archiving is active.
    pub point_in_time_recovery: bool,
    /// ARIES-style checkpointing. `aries_recovery` / `checkpoint` modules,
    /// quarantined/unwired.
    pub aries_checkpoint: bool,
    /// Epoch-based WAL fencing (split-brain detection). `wal_fencing` module,
    /// quarantined/unwired.
    pub wal_fencing: bool,
}
346
347/// The DEFAULT durability capabilities of the current build — a function of what
348/// is actually wired, not of documentation. At-rest encryption is now wired into
349/// the live WAL path (Task 3B) but is INACTIVE unless a key is configured, so the
350/// build default reports it `false`. For the live per-database state (which
351/// reflects whether encryption is actually active on that instance), call
352/// [`durable_storage::DurableStorage::durability_capabilities`].
353pub const fn durability_capabilities() -> DurabilityCapabilities {
354    DurabilityCapabilities {
355        crash_recovery: true,
356        at_rest_encryption: false,
357        point_in_time_recovery: false,
358        aries_checkpoint: false,
359        wal_fencing: false,
360    }
361}
362
#[cfg(test)]
mod durability_capabilities_tests {
    use super::{DurabilityCapabilities, durability_capabilities};

    /// Evaluated at compile time: the build breaks if `durability_capabilities`
    /// ever stops being callable in a const context.
    const BUILD_DEFAULT: DurabilityCapabilities = durability_capabilities();

    /// Pins the build-default capability matrix so that no feature is
    /// advertised unless the default build really provides it.
    #[test]
    fn live_build_durability_matrix_is_honest() {
        let caps = durability_capabilities();
        assert_eq!(
            caps, BUILD_DEFAULT,
            "runtime and compile-time evaluation must agree"
        );

        // Provided unconditionally by the live path.
        assert!(
            caps.crash_recovery,
            "live path must provide crash-consistent WAL recovery"
        );
        // Wired into the live WAL path, but inactive until a key is configured,
        // so the build-level default must still report it as off.
        assert!(
            !caps.at_rest_encryption,
            "build default must not advertise at-rest encryption (inactive without a key)"
        );
        // Only reported per-instance once archiving is active; never by the
        // build default.
        assert!(
            !caps.point_in_time_recovery,
            "build default must not advertise point-in-time recovery"
        );
        // Quarantined/unwired — must NOT be advertised as present on the live build.
        assert!(
            !caps.aries_checkpoint,
            "ARIES checkpointing is quarantined and must not be advertised"
        );
        assert!(
            !caps.wal_fencing,
            "WAL fencing is quarantined and must not be advertised"
        );

        // Exhaustive literal (deliberately no `..`): adding a capability field
        // stops this test from compiling until its default is decided here.
        let expected = DurabilityCapabilities {
            crash_recovery: true,
            at_rest_encryption: false,
            point_in_time_recovery: false,
            aries_checkpoint: false,
            wal_fencing: false,
        };
        assert_eq!(caps, expected);
    }
}
382
383// Re-exports for concurrent MVCC (Task: Concurrent Embedded)
384pub use mvcc_concurrent::{
385    ConcurrentMvcc, ConcurrentVersionChain, ConcurrentVersionEntry, HlcTimestamp, ReaderSlot,
386    VersionStore, VersionStoreStats, WriterGuard,
387};
388
389// Super Version and Copy-on-Write Version Set (mm.md Task 1)
390pub mod compaction_policy;
391pub mod concurrent_art;
392pub mod optimized_scan;
393pub mod sstable;
394pub mod version_set;
395pub mod wal_segment;
396
397// Re-exports for new performance modules (Recommendations 1-9)
398pub use compaction_policy::{
399    CompactionConfig, CompactionFile, CompactionJob, CompactionPicker, CompactionPriority,
400    CompactionReason, CompactionState, CompactionStats, CompactionStrategy,
401    LeveledCompactionPicker, RetentionConfig, UniversalCompactionPicker, VersionPruner,
402};
403pub use concurrent_art::ConcurrentART;
404pub use cow_btree::{BTreeEntry, BTreeSnapshot, CowBTree, Node, SearchResult};
405pub use epoch_mvcc::{
406    CommitResult, EpochManager, EpochMvccStore, EpochSnapshot, EpochTransaction, EpochVersionChain,
407    GcStats, StoreStats, VersionEntry,
408};
409pub use lazy_namespace::{LazyNamespaceConfig, LazyNamespaceTable};
410pub use object_store_tier::{ObjectStoreTier, ObjectStoreTierConfig, SegmentDescriptor};
411pub use optimized_scan::{
412    EntrySource, FileRange, LevelFiles, RangeScanner, ScanConfig, ScanStats, TournamentTree,
413    VersionedEntry,
414};
415pub use page_cache::{CacheStats, CachedPage, ClockProCache, PageId as CachePageId, PageState};
416pub use row_format::{Slot, SlotRow, SlotRowArena, SlotRowFlags, SlotRowHandle};
417pub use sstable::{
418    BlockBuilder, BlockCache, BlockHandle, BlockIterator, BlockType, BloomFilterPolicy,
419    FilterPolicy, FilterReader, Footer, Header, ReadOptions, RibbonFilterPolicy, SSTable,
420    SSTableBuilder, SSTableBuilderOptions, SSTableBuilderResult, SSTableFormat, Section,
421    SectionType, TableMetadata, XorFilterPolicy,
422};
423pub use tiered_memtable::{HotEntry, SortedBatch, TieredMemTable};
424pub use vectorized_scan::{
425    ColumnVector,
426    ComparisonOp,
427    DEFAULT_BATCH_SIZE,
428    Int64Comparison,
429    // SoA + Late Materialization (80/20 optimization)
430    SimdVisibilityFilter,
431    SoaBatch,
432    SoaScanIterator,
433    SoaScanStats,
434    SoaSource,
435    StreamingScanIterator,
436    ValueHandle,
437    VectorBatch,
438    VectorPredicate,
439    VectorizedScanConfig,
440    VectorizedScanStats,
441    VersionedSlice,
442};
443pub use version_set::{
444    FileMetadata as VersionFileMetadata, ImmutableMemTable, ImmutableMemTableRef, LevelMetadata,
445    SuperVersion, SuperVersionHandle, VersionSet as CowVersionSet,
446};
447pub use wal_segment::{
448    CheckpointRecord, RecoveryIterator, SegmentConfig, SegmentHeader, SegmentMetadata,
449    SegmentStats, WalEntry, WalSegmentManager,
450};
451pub use zero_copy_serde::{
452    FORMAT_VERSION as SERDE_FORMAT_VERSION, FieldDescriptor, HEADER_SIZE as SERDE_HEADER_SIZE,
453    MmapWalReader, SerdeStats, WalBatchReader, WalBatchWriter, WalEntryBuilder, WalEntryHeader,
454    WalEntryReader, WalEntryType, ZERO_COPY_MAGIC, ZeroCopyHeader,
455};
456
457// Re-exports for transaction arena and zero-copy plumbing
458pub use txn_arena::{ArenaWriteSet, BytesRef, KeyFingerprint, TxnArena, TxnWriteBuffer, WriteOp};
459
460// Re-exports for dirty tracking with batching
461pub use dirty_tracking::{BatchedDirtyTracker, DirtyEvent, DirtyTrackingStats, TxnDirtyBuffer};
462
463// Re-exports for per-table index policy
464pub use index_policy::{
465    BalancedTableIndex, IndexPolicy, SortedRun, TableIndexConfig, TableIndexRegistry,
466};
467
468// Re-exports for queue-optimized index structure
469pub use queue_index::{
470    CompositeQueueKey, QueueIndex, QueueIndexConfig, QueueIndexStats, QueueTableRegistry,
471};
472
473// Re-exports for CDC engine
474pub use cdc::{CdcConfig, CdcEmitter, CdcError, CdcEvent, CdcLog, CdcOperation, CdcSubscriber};
475
476// Re-exports for database kernel
477pub use database::{
478    ColumnDef as DbColumnDef,
479    ColumnType as DbColumnType,
480    ColumnarQueryResult, // SIMD-friendly columnar result format
481    Database,
482    DatabaseConfig,
483    GroupCommitSettings,
484    QueryBuilder,
485    QueryResult,
486    QueryRowIterator,
487    RecoveryStats as DbRecoveryStats,
488    Stats as DbStats,
489    SyncMode,
490    TableSchema as DbTableSchema,
491    TxnHandle as KernelTxnHandle,
492    VectorSearchResult,
493};