1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
//! Comprehensive index persistence layer for AletheiaDB.
//!
//! This module provides persistence for all index types:
//! - Vector indexes (HNSW via usearch)
//! - Graph indexes (CSR adjacency)
//! - Temporal indexes (version chains)
//! - String interner
//!
//! # Architecture
//!
//! ```text
//! indexes/
//! ├── manifest.idx # Index registry
//! ├── strings/interner.idx # String interning table
//! ├── graph/adjacency.idx # CSR adjacency data
//! ├── temporal/versions.idx # Version chains
//! └── vector/{prop}/ # Per-property vector indexes
//! ```
//!
//! # Load Order
//!
//! 1. String interner (others depend on string indices)
//! 2. Manifest (tells us what indexes exist)
//! 3. Graph, Temporal, Vector (parallel)
//!
//! # Usage
//!
//! ```rust,no_run
//! use aletheiadb::storage::index_persistence::{
//!     IndexPersistenceManager, PersistenceConfig, IndexManifest
//! };
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! // Use a valid path
//! let manager = IndexPersistenceManager::new("data");
//! manager.ensure_directories()?;
//!
//! // Save
//! manager.save_string_interner()?;
//! let manifest = IndexManifest::new(100);
//! manager.save_manifest(&manifest)?;
//!
//! // Load (respects load order)
//! if manager.indexes_exist() {
//!     let manifest = manager.load_manifest_and_strings()?;
//!     // String interner is now restored
//!     // Ready to load other indexes
//! }
//! # Ok(())
//! # }
//! ```
//!
//! # Format Details
//!
//! All index files (except the native HNSW index) use [bitcode](https://github.com/SoftbearStudios/bitcode)
//! serialization with a consistent header format: `[MAGIC:4][VERSION:1][DATA...]`.
//!
//! | File Type | Magic Bytes | Rust Struct | Description |
//! |-----------|-------------|-------------|-------------|
//! | **Manifest** | `GIDX` | [`formats::IndexManifest`] | Registry of all indexes, LSN tracking, and timestamps. |
//! | **String Interner** | `GSTR` | [`formats::StringInternerData`] | Ordered list of interned strings for ID restoration. |
//! | **Graph Index** | `GGRP` | [`formats::GraphIndexData`] | Nodes, edges, CSR adjacency, and current properties. |
//! | **Graph Delta** | `GDLT` | [`formats::GraphIndexDelta`] | Incremental changes (added/modified/deleted) since base snapshot. |
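//! | **Temporal Index** | `GTMP` | [`formats::TemporalIndexData`] | Version chains, anchors, and deltas for time-travel. |
//! | **Temporal Adjacency** | `GTAJ` | — | Temporal adjacency index (magic defined by `TEMPORAL_ADJACENCY_MAGIC`). |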
//! | **Vector Meta** | `GVEC` | [`formats::VectorIndexMeta`] | Metadata for a vector index (dimensions, metric, etc.). |
//!
//! ## Vector Index Hybrid Format
//!
//! Vector indexes use a hybrid format for performance:
//! 1. **Metadata** (`meta.idx`): Bitcode-serialized [`formats::VectorIndexMeta`].
//! 2. **Mappings** (`mappings.idx`): Bitcode-serialized [`formats::VectorMappingsData`] mapping NodeIDs to usearch keys.
//! 3. **Index** (`current.usearch`): Native binary format produced by the `usearch` C++ library (HNSW graph).
//!
//! # Safety and Integrity
//!
//! - **Atomic Writes**: All files are written using a write-temp-then-rename strategy (`atomic_write`) to prevent corruption during crashes.
//! - **Checksums**: Many formats (Graph, Vector Meta/Mappings) include CRC32 checksums for integrity verification.
//! - **Magic Bytes**: All files start with 4 magic bytes to prevent parsing invalid file types.
//! - **Versioning**: All files include a version byte to support future schema evolution.
pub
/// Persistence operations implementation.
/// Persistence mutation tracking.
/// Background persistence worker thread.
pub use ;
pub use ;
pub use *;
pub use IndexPersistenceManager;
use *;
/// Current manifest format version.
// NOTE(review): the module docs describe a one-byte version field in the file
// header (`[MAGIC:4][VERSION:1][DATA...]`) while this constant is a `u16` —
// confirm how the value is encoded on disk.
pub const MANIFEST_VERSION: u16 = 1;

// Every magic below is a four-byte ASCII tag, matching the `[MAGIC:4]` slot of
// the file header described in the module docs.

/// Magic bytes for manifest files (`GIDX`).
pub const MANIFEST_MAGIC: [u8; 4] = *b"GIDX";

/// Magic bytes for string interner files (`GSTR`).
pub const INTERNER_MAGIC: [u8; 4] = *b"GSTR";

/// Magic bytes for graph index files (`GGRP`).
pub const GRAPH_MAGIC: [u8; 4] = *b"GGRP";

/// Magic bytes for graph delta files (`GDLT`).
pub const DELTA_MAGIC: [u8; 4] = *b"GDLT";

/// Magic bytes for temporal index files (`GTMP`).
pub const TEMPORAL_MAGIC: [u8; 4] = *b"GTMP";

/// Magic bytes for temporal adjacency index files (`GTAJ`).
pub const TEMPORAL_ADJACENCY_MAGIC: [u8; 4] = *b"GTAJ";

/// Magic bytes for vector metadata files (`GVEC`).
pub const VECTOR_META_MAGIC: [u8; 4] = *b"GVEC";
/// Upper bound on the number of entries in the string interner (DoS protection).
///
/// Roughly 100K strings is expected to cover most databases while still
/// guarding against memory exhaustion attacks.
pub const MAX_STRING_COUNT: u64 = 100 * 1_000;

/// Upper bound, in bytes, on the length of any single string (DoS protection).
///
/// Raised from 1MB to 10MB to accommodate business scenarios:
/// - Document storage: full articles and papers
/// - Base64 encoded data: medium-sized images and files
/// - Large JSON objects: complex configuration and metadata
///
/// The cap still provides DoS protection while enabling practical use cases.
pub const MAX_STRING_LENGTH: usize = 10 * 1024 * 1024; // 10MB

/// Upper bound on the dimensionality of a vector (DoS protection).
///
/// 100K dimensions aligns with the documented maximum; at 4 bytes per `f32`
/// that is 400KB per vector.
pub const MAX_VECTOR_DIMENSIONS: usize = 100 * 1_000;
/// Maximum size of a graph index file (DoS protection).
///
/// Limits the amount of memory allocated when loading graph indexes.
/// Increased from 4GB to 100GB to support enterprise-scale graphs:
/// - 4GB ≈ 100M-500M nodes (depending on density)
/// - Enterprise graphs can have billions of nodes
/// - Enables large-scale knowledge graphs and social networks
///
/// Default: 100GB in production, 10MB in tests.
// NOTE(review): the limits follow the values documented above, using binary
// units (1MB = 1024 * 1024 bytes) — confirm against the intended thresholds.
pub const MAX_GRAPH_INDEX_FILE_SIZE: u64 = if cfg!(test) {
    10 * 1024 * 1024
} else {
    100 * 1024 * 1024 * 1024
};
/// Maximum allowed decompressed size for graph index files (DoS protection).
///
/// Prevents "zip bomb" attacks where a small compressed file expands to fill memory.
/// The compressed file size is already checked by `MAX_GRAPH_INDEX_FILE_SIZE`, but a
/// crafted file with extreme compression ratios could still expand to gigabytes.
///
/// Default: 16GB in production (64-bit), 2GB (32-bit), 100MB in tests.
// NOTE(review): the two variants are gated on pointer width so that only one
// definition is compiled; limits follow the values documented above in binary
// units — confirm against the intended thresholds.
#[cfg(target_pointer_width = "64")]
pub const MAX_GRAPH_DECOMPRESSED_SIZE: usize = if cfg!(test) {
    100 * 1024 * 1024
} else {
    16 * 1024 * 1024 * 1024
};

/// See 64-bit variant for documentation.
// 16GB does not fit in a 32-bit `usize`, hence the smaller production limit.
#[cfg(target_pointer_width = "32")]
pub const MAX_GRAPH_DECOMPRESSED_SIZE: usize = if cfg!(test) {
    100 * 1024 * 1024
} else {
    2 * 1024 * 1024 * 1024
};
/// Maximum size of a vector index metadata/mappings file (DoS protection).
///
/// Limits the amount of memory allocated when loading vector index metadata.
/// Default: 1GB in production, 5MB in tests.
// NOTE(review): limits follow the values documented above in binary units —
// confirm against the intended thresholds.
pub const MAX_VECTOR_INDEX_FILE_SIZE: u64 = if cfg!(test) {
    5 * 1024 * 1024
} else {
    1024 * 1024 * 1024
};
/// Maximum size of a temporal index file (DoS protection).
///
/// Limits the amount of memory allocated when loading temporal indexes.
/// Default: 2GB in production, 10MB in tests.
// NOTE(review): limits follow the values documented above in binary units —
// confirm against the intended thresholds.
pub const MAX_TEMPORAL_INDEX_FILE_SIZE: u64 = if cfg!(test) {
    10 * 1024 * 1024
} else {
    2 * 1024 * 1024 * 1024
};
/// Maximum size of a string interner file (DoS protection).
///
/// Limits the amount of memory allocated when loading the string interner.
/// Test limit increased to 20MB to allow testing string length validation
/// (MAX_STRING_LENGTH is 10MB, need buffer for encoding overhead).
/// Default: 256MB in production, 20MB in tests.
// NOTE(review): limits follow the values documented above in binary units —
// confirm against the intended thresholds.
pub const MAX_STRING_INTERNER_FILE_SIZE: u64 = if cfg!(test) {
    20 * 1024 * 1024
} else {
    256 * 1024 * 1024
};
/// Maximum size of a manifest file (DoS protection).
///
/// Limits the amount of memory allocated when loading the manifest.
/// Default: 1MB in production, 100KB in tests.
// NOTE(review): limits follow the values documented above in binary units
// (1KB = 1024 bytes) — confirm against the intended thresholds.
pub const MAX_MANIFEST_FILE_SIZE: u64 = if cfg!(test) {
    100 * 1024
} else {
    1024 * 1024
};
/// Maximum allowed file size for memory-mapped files (sanity check).
///
/// Prevents attempting to map ridiculously large or sparse files that could cause issues.
/// Default: 100GB in production, 100MB in tests.
// NOTE(review): limits follow the values documented above in binary units —
// confirm against the intended thresholds.
pub const MAX_MMAP_FILE_SIZE: u64 = if cfg!(test) {
    100 * 1024 * 1024
} else {
    100 * 1024 * 1024 * 1024
};
/// Atomically write data to a file using write-temp-then-rename pattern.
///
/// This prevents corruption if the process crashes mid-write:
/// 1. Write to `{path}.tmp.{random_suffix}`
/// 2. Sync to disk
/// 3. Rename temp → target (atomic on POSIX, nearly-atomic on Windows)
///
/// # Thread Safety
///
/// Uses a random suffix for the temporary file to allow multiple threads to attempt
/// atomic writes to the same target concurrently (though last writer wins).
/// This prevents race conditions where one thread truncates another thread's
/// temporary file.
///
/// # Errors
///
/// Returns an error if:
/// - Writing the temp file fails
/// - Syncing to disk fails
/// - Renaming the temp file to the target fails
pub
/// Load graph, temporal, and vector indexes in parallel for faster startup.
///
/// This function spawns threads to load all three index types concurrently,
/// reducing startup time for databases with large indexes.
///
/// # Arguments
///
/// * `graph_path` - Path to the graph index file
/// * `temporal_path` - Optional path to the temporal index file
/// * `vector_paths` - Vector index paths to load (meta, mappings, snapshots); may be empty
///
/// # Returns
///
/// A tuple of (graph_data, temporal_data_option, vector_data_vec)
///
/// # Errors
///
/// Returns an error if any of the index files fail to load.
///
/// # Examples
///
/// ```ignore
/// use aletheiadb::storage::index_persistence::load_indexes_parallel;
///
/// let (graph, temporal, vector) = load_indexes_parallel(
///     &graph_path,
///     Some(&temporal_path),
///     vec![],
/// )?;
/// ```