Skip to main content

sochdb_storage/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! SochDB Storage Layer
19//!
20//! Log-Structured Column Store (LSCS) with transaction-aware WAL for TOON-native data.
21//!
22//! ## Runtime Modes
23//!
24//! This crate supports two runtime modes:
25//!
26//! ### Embedded Sync Mode (like SQLite)
27//!
28//! For embedded deployments without async runtime:
29//!
30//! ```toml
31//! sochdb-storage = { version = "...", default-features = false, features = ["embedded-sync"] }
32//! ```
33//!
34//! Benefits:
35//! - ~500KB smaller binary
36//! - No async runtime overhead
37//! - Simpler embedded integration
38//!
39//! ### Async Mode (default, for servers)
40//!
41//! For server deployments with async I/O:
42//!
43//! ```toml
44//! sochdb-storage = { version = "..." }  # async enabled by default
45//! ```
46//!
47//! Benefits:
48//! - Better scalability for concurrent connections
49//! - Non-blocking I/O for server workloads
50//!
51//! ## Novel Components
52//!
53//! - **LSCS** (`lscs`): Log-Structured Column Store - columnar variant of LSM with
54//!   schema-aware compression and column-aware compaction for reduced write amplification.
55//!
56//! - **Transaction WAL** (`txn_wal`): ACID-compliant Write-Ahead Log with transaction
57//!   boundaries, commit/abort markers, and crash recovery.
58//!
59//! - **StorageEngine Trait** (`storage_engine`): Pluggable storage backend abstraction
60//!   enabling 80% I/O reduction for columnar projections (Task 1).
61//!
62//! - **Page Manager** (`page_manager`): TOON file format with magic header and O(1)
63//!   page allocation (Task 8).
64//!
65//! - **Columnar Compression** (`columnar_compression`): Type-aware encoding with
66//!   dictionary, RLE, and delta compression for 2-4× storage reduction (Task 9).
67//!
68//! ## Utility Components
69//!
70//! - **Bloom Filters** (`bloom`): Probabilistic existence checks
71//! - **Block Checksums** (`block_checksum`): Data integrity validation
72//! - **Compression** (`compression`): LZ4/Zstd compression
73//! - **Sketches** (`sketches`): Approximate algorithms (HyperLogLog, CountMin, DDSketch)
74
75// New TOON-native storage components
76pub mod actor; // Actor-based connection manager (mm.md Task 7.2)
77pub mod admission_control; // Admission control with cost model + tenant fairness (Task 6)
78pub mod aries_recovery; // ARIES-style crash recovery (Task 1)
79pub mod checkpoint; // ARIES-style checkpointing with WAL truncation (mm.md Task 1.4)
80pub mod columnar_compression; // Type-aware columnar encoding: dictionary, RLE, delta (Task 9)
81pub mod correctness_testing; // Property-based correctness testing (Task 13)
82pub mod database; // Database Kernel (shared by embedded + server)
83pub mod durable_storage; // Fully wired durable storage with MVCC
84pub mod durability_contract; // Durability contract hardening (Task 4)
85pub mod ffi; // FFI bindings for Python SDK
86pub mod group_commit; // Event-driven Group Commit (Task 4)
87pub mod hlc; // Hybrid Logical Clock for commit timestamps (mm.md Task 1.3)
88pub mod hybrid_store; // PAX hybrid row-column storage (mm.md Task 4.1)
89pub mod io_isolation; // I/O isolation policy with cache partitioning (Task 5)
90pub mod ipc; // IPC Protocol with multiplexing (mm.md Task 7.1)
91#[cfg(unix)]
92pub mod ipc_server; // Unix Socket IPC Server (Task 3)
93pub mod learned_index_integration;
94pub mod lock; // Advisory file locking for database exclusivity
95pub mod lscs; // Log-Structured Column Store (LSCS)
96pub mod mvcc_concurrent; // Concurrent MVCC for multi-reader single-writer (Task: Concurrent Embedded)
97pub mod mvcc_new;
98pub mod mvcc_snapshot;
99pub mod page_manager; // TOON file format with magic header and page allocation (Task 8)
100pub mod pitr; // Point-in-Time Recovery with WAL archiving (Task 11)
101pub mod production_wal; // Production WAL with ARIES recovery (mm.md Task 3)
102pub mod ssi; // Serializable Snapshot Isolation (Task 2)
103pub mod ssi_scaling; // SSI scaling guardrails with range locks (Task 7)
104pub mod storage_engine; // Pluggable StorageEngine backend abstraction (Task 1)
105pub mod streaming_iterator; // Streaming Iterator Architecture (mm.md Task 4)
106pub mod transaction; // Unified Transaction Coordinator trait and types
107pub mod txn_arena; // Transaction-scoped arena with zero-copy key/value plumbing
108pub mod txn_wal; // Transaction WAL with commit/abort markers and crash recovery
109pub mod upgrade_contract; // Upgrade compatibility contract (Task 12)
110pub mod wal_fencing; // Epoch-based WAL fencing for split-brain detection
111pub mod wal_integration;
112pub mod zero_copy_safety; // Zero-Copy Validation Layer (Task 5)
113
114// Performance optimization modules
115pub mod adaptive_learned_index;
116pub mod adaptive_memtable; // Adaptive memtable sizing with memory pressure (Task 10)
117pub mod deferred_index; // Deferred sorted index with LSM-style compaction (Rec 2)
118pub mod dirty_tracking; // Batched dirty tracking with MPSC queue
119pub mod index_policy; // Per-table index policy
120pub mod queue_index; // Queue-optimized index structure (Task: Queue Index Policy)
121pub mod batch_wal; // Batched WAL with vectored I/O (Task 3)
122pub mod key_buffer; // Cache-line aligned key buffer (Task 2)
123pub mod lockfree_memtable; // Lock-free read path with hazard pointers (Task 4)
124pub mod packed_row; // Unified row storage with delta encoding (Task 1)
125
126// PhD-Level Architectural Optimizations (December 2025)
127pub mod clr_learned_index; // CLR Learned Index for sorted runs (Task 3)
128pub mod lockfree_epoch; // Lock-Free Epoch Tracking (Task 3)
129pub mod hierarchical_ts; // Hierarchical Timestamp Oracle (Task 9)
130pub mod shard_coalesced; // Shard-Coalesced Batch DashMap (Task 6)
131pub mod polymorphic_value; // Polymorphic Value Encoding (Task 12)
132pub mod epoch_arena; // Epoch-Partitioned Key Arena (Task 1)
133pub mod stratified_skiplist; // Stratified SkipList with Deferred Promotion (Task 2)
134pub mod columnar_wal; // Columnar WAL Layout (Task 4)
135pub mod generational_slab; // Generational Slab Allocator (Task 5)
136pub mod rl_workload; // RL Workload Classifier (Task 10)
137#[cfg(unix)]
138pub mod io_uring_wal; // io_uring WAL Submission (Task 11)
139
140// New performance modules (Recommendations 1-9)
141pub mod cow_btree; // Copy-on-Write B-Tree for ordered access (Recommendation 5)
142pub mod epoch_mvcc; // Epoch-based MVCC for O(log E) version lookup (Recommendation 7)
143pub mod page_cache; // Application-level page cache with Clock-Pro (Recommendation 8)
144pub mod row_format; // Slot-based columnar row storage (Recommendation 1)
145pub mod tiered_memtable; // Tiered MemTable with deferred sorting (Recommendation 3)
146pub mod tournament_tree; // K-way merge with tournament tree (Task 2)
147pub mod vectorized_scan; // SIMD-accelerated vectorized scan engine (Recommendation 2)
148pub mod zero_copy_serde; // Zero-copy serialization for WAL (Recommendation 6)
149
150// Namespace and multi-tenancy support (Task 3)
151pub mod namespace; // Namespace routing and on-disk layout
152
153// Core utilities
154pub mod backend;
155pub mod backup;
156pub mod block_checksum; // Block checksums: data integrity validation
157pub mod bloom; // Bloom filters: probabilistic existence checks
158pub mod compression; // LZ4/Zstd compression
159pub mod dict_compression;
160pub mod direct_io;
161#[cfg(unix)]
162pub mod io_uring;
163pub mod manifest;
164pub mod memory;
165pub mod parallel_merge;
166pub mod payload;
167pub mod prefetch;
168pub mod sketches; // Approximate algorithms (HyperLogLog, CountMin, DDSketch)
169pub mod two_level_index;
170pub mod validation;
171pub mod version_store;
172pub mod zero_copy;
173
174// Re-exports for new components
175pub use columnar_compression::{
176    ColumnEncoder, DeltaEncoder, DictionaryEncoder, EncodingStats, EncodingType, RleEncoder,
177};
178pub use learned_index_integration::{
179    HybridIndex, IndexManager, IndexType, KeyStats, PointLookupExecutor,
180};
181pub use lscs::{
182    ColumnDef, ColumnGroup, ColumnType, ColumnarMemtable, Lscs, LscsConfig, LscsRecoveryStats,
183    LscsStats, TableSchema,
184};
185#[allow(deprecated)]
186pub use mvcc_snapshot::{
187    MvccStore, Snapshot as MvccSnapshot, Timestamp, TransactionManager, TxnId, TxnStatus,
188    VersionChain, VersionInfo,
189};
190pub use page_manager::{
191    DEFAULT_PAGE_SIZE, DbHeader, FORMAT_VERSION, FreePageHeader, PageId, PageManager,
192    PageManagerStats, PageType, SOCHDB_MAGIC,
193};
194pub use storage_engine::{
195    ColumnId, ColumnIterator, Row, RowId, StorageEngine, StorageEngineType, StorageStats,
196    TxnHandle, open_storage_engine,
197};
198pub use transaction::{
199    DurabilityLevel, IsolationLevel, RecoveryStats as TxnRecoveryStats, TransactionCoordinator,
200    TransactionHandle,
201};
202pub use txn_wal::{CrashRecoveryStats, TxnWal, TxnWalBuffer, TxnWalEntry, TxnWalStats};
203pub use wal_integration::{
204    GroupCommitBuffer, MvccTransactionManager, RecoveryStats, Transaction, TxnState, 
205    WalStorageManager,
206};
207
208// Re-exports for performance optimization modules
209pub use adaptive_learned_index::{AdaptiveLearnedIndex, LearnedIndexStats, PiecewiseLinearModel};
210pub use adaptive_memtable::{
211    AdaptiveMemtableConfig, AdaptiveMemtableSizer, AdaptiveMemtableStats,
212    DEFAULT_BASE_SIZE, MAX_MEMTABLE_SIZE, MIN_MEMTABLE_SIZE,
213};
214pub use batch_wal::{
215    BatchAccumulator, BatchedWalReader, BatchedWalStats, BatchedWalWriter, ConcurrentBatchedWal,
216    DEFAULT_MAX_BATCH_BYTES, DEFAULT_MAX_BATCH_SIZE,
217};
218pub use clr_learned_index::{ClrIndex, ClrLookupResult, ClrStats, IndexedSortedRun};
219pub use key_buffer::{
220    ArenaKey,
221    ArenaKeyHandle,
222    BatchKeyGenerator,
223    InternedTablePrefix,
224    // Arena allocation for high-throughput key operations
225    KeyArena,
226    KeyBuffer,
227    MAX_KEY_LENGTH,
228};
229pub use lockfree_memtable::{
230    HazardDomain,
231    INLINE_VALUE_SIZE,
232    LockFreeMemTable,
233    LockFreeVersion,
234    LockFreeVersionChain,
235    // Inline value storage for reduced memory indirection
236    ValueStorage,
237};
238pub use packed_row::{
239    PackedColumnDef, PackedColumnType, PackedRow, PackedRowBuilder, PackedTableSchema,
240};
241
242// Re-exports for utilities
243pub use backend::{LocalFsBackend, ObjectMetadata, StorageBackend};
244pub use backup::{BackupManager, BackupMetadata};
245pub use block_checksum::{
246    BlockChecksumConfig, BlockChecksumStats, BlockType as BlockChecksumType, BlockWriter, ChecksummedBlock,
247};
248pub use bloom::{BlockedBloomFilter, BloomFilter, LevelAdaptiveFPR, UnifiedBloomFilter};
249pub use compression::{CompressionEngine, CompressionStats, StorageTier};
250pub use manifest::{FileMetadata, LsmState, Manifest, VersionEdit};
251pub use memory::{MemoryBudget, MemoryTracker, WriteBufferManager, WriteBufferStats};
252pub use mvcc_new::{
253    ColumnGroupRef, ReadVersion, Snapshot, SnapshotGuard, VersionGuard, VersionSet,
254    VersionSetStats, VersionSetStatsSnapshot,
255};
256pub use payload::{CompressionType, PayloadStats, PayloadStore};
257pub use sketches::{AdaptiveSketch, CountMinSketch, DDSketch, ExponentialHistogram, HyperLogLog};
258pub use two_level_index::{
259    BlockIndexEntry, BlockIndexReader, FencePointer, TemporalKey, TwoLevelIndex,
260};
261pub use validation::{SSTableValidator, validate_sstable_file};
262
263// Re-exports for durable storage
264pub use durable_storage::{ArenaMvccMemTable, DurableStorage, MvccMemTable, TransactionMode};
265
266// Re-exports for concurrent MVCC (Task: Concurrent Embedded)
267pub use mvcc_concurrent::{
268    ConcurrentMvcc, HlcTimestamp, ReaderSlot, 
269    ConcurrentVersionChain, ConcurrentVersionEntry,
270    VersionStore, VersionStoreStats, WriterGuard,
271};
272
273// Super Version and Copy-on-Write Version Set (mm.md Task 1)
274pub mod version_set;
275pub mod concurrent_art;
276pub mod sstable;
277pub mod wal_segment;
278pub mod compaction_policy;
279pub mod optimized_scan;
280
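281// Re-exports for the version-set, ART, SSTable, WAL-segment, compaction and scan modules declared above, followed by the new performance modules (Recommendations 1-9)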
282pub use version_set::{
283    FileMetadata as VersionFileMetadata, ImmutableMemTable, ImmutableMemTableRef,
284    LevelMetadata, SuperVersion, SuperVersionHandle, VersionSet as CowVersionSet,
285};
286pub use concurrent_art::ConcurrentART;
287pub use sstable::{
288    BlockBuilder, BlockIterator, BlockHandle, BlockType,
289    FilterPolicy, BloomFilterPolicy, RibbonFilterPolicy, XorFilterPolicy, FilterReader,
290    SSTableFormat, Header, Footer, Section, SectionType,
291    SSTableBuilder, SSTableBuilderOptions, SSTableBuilderResult,
292    SSTable, TableMetadata, ReadOptions, BlockCache,
293};
294pub use wal_segment::{
295    WalSegmentManager, SegmentConfig, SegmentHeader, SegmentMetadata,
296    CheckpointRecord, SegmentStats, RecoveryIterator, WalEntry,
297};
298pub use compaction_policy::{
299    CompactionConfig, CompactionFile, CompactionJob, CompactionPicker,
300    CompactionPriority, CompactionReason, CompactionState, CompactionStats,
301    CompactionStrategy, LeveledCompactionPicker, RetentionConfig,
302    UniversalCompactionPicker, VersionPruner,
303};
304pub use optimized_scan::{
305    EntrySource, FileRange, LevelFiles, RangeScanner, ScanConfig, ScanStats,
306    TournamentTree, VersionedEntry,
307};
308pub use cow_btree::{BTreeEntry, BTreeSnapshot, CowBTree, Node, SearchResult};
309pub use epoch_mvcc::{
310    CommitResult, EpochManager, EpochMvccStore, EpochSnapshot, EpochTransaction,
311    EpochVersionChain, GcStats, StoreStats, VersionEntry,
312};
313pub use page_cache::{CacheStats, ClockProCache, CachedPage, PageId as CachePageId, PageState};
314pub use row_format::{Slot, SlotRow, SlotRowArena, SlotRowHandle, SlotRowFlags};
315pub use tiered_memtable::{HotEntry, SortedBatch, TieredMemTable};
316pub use vectorized_scan::{
317    ColumnVector, ComparisonOp, Int64Comparison, VectorBatch, VectorPredicate,
318    VectorizedScanConfig, VectorizedScanStats, DEFAULT_BATCH_SIZE,
319    // SoA + Late Materialization (80/20 optimization)
320    SimdVisibilityFilter, SoaBatch, SoaScanIterator, SoaScanStats, SoaSource,
321    StreamingScanIterator, ValueHandle, VersionedSlice,
322};
323pub use zero_copy_serde::{
324    FieldDescriptor, MmapWalReader, SerdeStats, WalBatchReader, WalBatchWriter,
325    WalEntryBuilder, WalEntryHeader, WalEntryReader, WalEntryType, ZeroCopyHeader,
326    FORMAT_VERSION as SERDE_FORMAT_VERSION, HEADER_SIZE as SERDE_HEADER_SIZE, ZERO_COPY_MAGIC,
327};
328
329// Re-exports for transaction arena and zero-copy plumbing
330pub use txn_arena::{
331    ArenaWriteSet, BytesRef, KeyFingerprint, TxnArena, TxnWriteBuffer, WriteOp,
332};
333
334// Re-exports for dirty tracking with batching
335pub use dirty_tracking::{
336    BatchedDirtyTracker, DirtyEvent, DirtyTrackingStats, TxnDirtyBuffer,
337};
338
339// Re-exports for per-table index policy
340pub use index_policy::{
341    BalancedTableIndex, IndexPolicy, SortedRun, TableIndexConfig, TableIndexRegistry,
342};
343
344// Re-exports for queue-optimized index structure
345pub use queue_index::{
346    CompositeQueueKey, QueueIndex, QueueIndexConfig, QueueIndexStats, QueueTableRegistry,
347};
348
349// Re-exports for database kernel
350pub use database::{
351    ColumnDef as DbColumnDef,
352    ColumnType as DbColumnType,
353    ColumnarQueryResult, // SIMD-friendly columnar result format
354    Database,
355    DatabaseConfig,
356    GroupCommitSettings,
357    QueryBuilder,
358    QueryResult,
359    QueryRowIterator,
360    RecoveryStats as DbRecoveryStats,
361    Stats as DbStats,
362    SyncMode,
363    TableSchema as DbTableSchema,
364    TxnHandle as KernelTxnHandle,
365    VectorSearchResult,
366};