sqlitegraph 2.0.7

Embedded graph database with full ACID transactions, HNSW vector search, dual backend support, and comprehensive graph algorithms library
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
//! Per-traversal cache for graph operations.
//!
//! This module provides a simple, efficient cache for node adjacency data during
//! graph traversal operations (BFS, k-hop, shortest path, etc.).
//!
//! # Design: Per-Traversal Cache
//!
//! The cache is scoped to each traversal operation and evaporates when the
//! traversal function returns. This design:
//!
//! - **Preserves MVCC isolation**: No cross-transaction cache staleness
//! - **Avoids invalidation complexity**: Cache dies with the function
//! - **Eliminates redundant I/O**: Nodes visited multiple times during traversal
//!   are cached in memory
//! - **Zero thread-safety overhead**: Single-threaded traversal, no atomics needed
//!
//! # Cache Key Design
//!
//! The cache key combines `(node_id, direction)` to handle cases where a node
//! has different outgoing vs incoming neighbors. This allows the same traversal
//! to efficiently query both directions without cache pollution.
//!
//! # Usage Pattern
//!
//! ```rust,ignore
//! use crate::backend::native::graph_ops::cache::{TraversalCache, TraversalCacheStats, get_neighbors_cached};
//! use crate::backend::native::adjacency::Direction;
//!
//! fn bfs_with_cache(graph_file: &mut GraphFile, start: NativeNodeId, depth: u32) -> NativeResult<Vec<NativeNodeId>> {
//!     let mut cache = TraversalCache::new();
//!     let mut stats = TraversalCacheStats::default();
//!
//!     // ... traversal loop ...
//!     let neighbors = get_neighbors_cached(
//!         graph_file,
//!         current_node,
//!         Direction::Outgoing,
//!         &mut cache,
//!         &mut stats,
//!     )?;
//!     // ... use neighbors ...
//!
//!     // Cache evaporates here when function returns
//!     Ok(result)
//! }
//! ```

use crate::backend::native::adjacency::{AdjacencyHelpers, Direction};
use crate::backend::native::graph_file::GraphFile;
use crate::backend::native::types::{NativeNodeId, NativeResult};
use crate::backend::native::v2::edge_cluster::EdgeCluster;
use ahash::AHashMap;

// Phase 34: Sequential cluster reader
use crate::backend::native::adjacency::SequentialClusterReader;

// Forward declaration for TraversalContext
// TraversalContext is defined in traversal_context.rs (sibling module)
// The module will be declared in mod.rs in Task 3 of this plan.
// Using full crate path to avoid circular dependency issues.
use crate::backend::native::graph_ops::traversal_context::TraversalContext;

/// Per-traversal cache key combining node ID and direction.
///
/// Using both node_id and direction as the key ensures that:
/// - Outgoing and incoming neighbors are cached separately
/// - No cache pollution when querying both directions in the same traversal
/// - Simple hash-based lookup with ahash for performance
type CacheKey = (NativeNodeId, Direction);

/// Per-traversal cache for node adjacency data.
///
/// Key: `(node_id, direction)` tuple; value: vector of neighbor node IDs.
///
/// The cache is created at the start of a traversal operation and dropped
/// when the traversal completes, so no explicit invalidation is needed and
/// MVCC isolation is preserved (no state outlives the transaction's view).
pub type TraversalCache = AHashMap<CacheKey, Vec<NativeNodeId>>;

/// Hit/miss counters for a single traversal's cache usage.
///
/// Used for performance validation (PERF-07) to measure how effective the
/// per-traversal cache is. Plain (non-atomic) counters are sufficient
/// because a traversal runs on a single thread.
#[derive(Debug, Default, Clone, Copy)]
pub struct TraversalCacheStats {
    /// Lookups answered from the cache.
    pub hits: u64,
    /// Lookups that had to fall through to storage.
    pub misses: u64,
}

impl TraversalCacheStats {
    /// Construct a statistics tracker with both counters at zero.
    #[inline]
    pub fn new() -> Self {
        Self { hits: 0, misses: 0 }
    }

    /// Fraction of lookups served from the cache, in `[0.0, 1.0]`.
    ///
    /// Yields `0.0` when no lookups have been recorded yet (avoids a
    /// division by zero on the empty case).
    #[inline]
    pub fn hit_rate(&self) -> f64 {
        match self.hits + self.misses {
            0 => 0.0,
            total => self.hits as f64 / total as f64,
        }
    }

    /// Bump the hit counter by one.
    #[inline]
    pub fn record_hit(&mut self) {
        self.hits += 1;
    }

    /// Bump the miss counter by one.
    #[inline]
    pub fn record_miss(&mut self) {
        self.misses += 1;
    }
}

/// Fetch a node's neighbors, serving repeated lookups from the traversal cache.
///
/// Checks the per-traversal cache before falling through to
/// [`AdjacencyHelpers`], so a node visited several times during one
/// traversal only pays the storage I/O once.
///
/// # Parameters
///
/// - `graph_file`: Mutable borrow of the graph file for I/O operations
/// - `node_id`: The node whose neighbors are requested
/// - `direction`: Whether to fetch `Outgoing` or `Incoming` neighbors
/// - `cache`: The per-traversal cache to consult and populate
/// - `stats`: Hit/miss counters updated by this call
///
/// # Returns
///
/// An owned `Vec<NativeNodeId>` of neighbor node IDs.
///
/// **Note:** An owned `Vec` (not a reference) is returned to sidestep
/// borrow-checker lifetime issues; the clone is cheap for typical
/// neighbor list sizes.
///
/// # Example
///
/// ```rust,ignore
/// let mut cache = TraversalCache::new();
/// let mut stats = TraversalCacheStats::default();
///
/// let neighbors = get_neighbors_cached(
///     graph_file,
///     node_id,
///     Direction::Outgoing,
///     &mut cache,
///     &mut stats,
/// )?;
/// ```
pub fn get_neighbors_cached(
    graph_file: &mut GraphFile,
    node_id: NativeNodeId,
    direction: Direction,
    cache: &mut TraversalCache,
    stats: &mut TraversalCacheStats,
) -> NativeResult<Vec<NativeNodeId>> {
    let key = (node_id, direction);

    match cache.get(&key) {
        // Hot path: this (node, direction) pair was already loaded during
        // the current traversal — answer from memory.
        Some(cached) => {
            stats.record_hit();
            Ok(cached.clone())
        }
        // Cold path: load from storage and remember the result for the
        // remainder of this traversal.
        None => {
            stats.record_miss();
            let loaded = match direction {
                Direction::Incoming => {
                    AdjacencyHelpers::get_incoming_neighbors(graph_file, node_id)?
                }
                Direction::Outgoing => {
                    AdjacencyHelpers::get_outgoing_neighbors(graph_file, node_id)?
                }
            };
            cache.insert(key, loaded.clone());
            Ok(loaded)
        }
    }
}

/// Get neighbors with 3-tier lookup hierarchy for sequential I/O optimization.
///
/// This function implements a tiered lookup strategy that prioritizes faster caches:
///
/// # Lookup Hierarchy
///
/// 1. **L1: SequentialReadBuffer** - Decoded node slots from batched reads (fastest)
///    - Only checked after LinearDetector confirms linear pattern
///    - Extracts neighbors directly from buffered NodeRecordV2
///    - Checks cluster cache before doing file I/O (Phase 32-05)
///    - Records buffer hit/miss for statistics
///
/// 2. **Phase 34: Sequential cluster read** - Single I/O for all chain clusters (lazy trigger)
///    - Triggered only once when `should_use_sequential_read()` returns true
///    - Reads all contiguous clusters for the chain in a single I/O operation
///    - Stores raw buffer and offsets in TraversalContext for subsequent nodes
///    - Full neighbor extraction deferred to Phase 35 (requires node_id -> cluster_index mapping)
///
/// 3. **L2: TraversalCache** - Cached neighbor lists (from v1.3)
///    - Stores Vec<NativeNodeId> for (node_id, direction) keys
///    - Records cache hit/miss for statistics
///
/// 4. **L3: AdjacencyHelpers** - Storage I/O (slowest)
///    - Falls through to disk when L1 and L2 miss
///    - Results are inserted into L2 cache for future lookups
///
/// # Parameters
///
/// - `graph_file`: Mutable borrow of the graph file for I/O operations
/// - `node_id`: The node whose neighbors we want to fetch
/// - `direction`: Whether to get Outgoing or Incoming neighbors
/// - `ctx`: Mutable borrow of the unified traversal context
///
/// # Returns
///
/// An owned `Vec<NativeNodeId>` containing the neighbor node IDs.
///
/// # Error Handling
///
/// NOTE(review): I/O and deserialization failures in the L1 and Phase-34/35
/// tiers are deliberately swallowed (`is_ok()` / `if let Ok` / `Err(_) => {}`)
/// and fall through to the slower tiers; only L3 storage errors propagate to
/// the caller via `?`.
///
/// # L1 Buffer Extraction with Cluster Cache (Phase 32-04, 32-05)
///
/// When a node is found in the SequentialReadBuffer and has valid cluster metadata,
/// this function first checks if the cluster data is cached. If cached, it deserializes
/// from the cached bytes (no I/O). If not cached, it reads from file.
///
/// The extraction process:
/// 1. Check if node is in buffer (only after linear pattern confirmed)
/// 2. Extract cluster_offset and cluster_size based on direction
/// 3. Check cluster cache: if cached, deserialize from cached bytes (no I/O)
/// 4. If not cached, read cluster data from file
/// 5. Deserialize EdgeCluster and extract neighbors via iter_neighbors()
/// 6. Return neighbors immediately (early return, no L2/L3 fallback)
///
/// If buffer miss or cluster metadata is invalid, fall through to L2/L3.
///
/// # Example
///
/// ```rust,ignore
/// use crate::backend::native::graph_ops::{TraversalContext, get_neighbors_optimized};
/// use crate::backend::native::adjacency::Direction;
///
/// let mut ctx = TraversalContext::new();
///
/// // During traversal with pattern detection
/// let degree = AdjacencyHelpers::outgoing_degree(graph_file, node_id)?;
/// ctx.detector.observe(node_id, degree);
///
/// // Trigger prefetch with cluster caching if linear confirmed
/// if ctx.detector.is_linear_confirmed() && !ctx.buffer.contains(node_id) {
///     ctx.buffer.prefetch_clusters_from(graph_file, node_id)?;
/// }
///
/// // Get neighbors via 3-tier lookup
/// let neighbors = get_neighbors_optimized(
///     graph_file,
///     node_id,
///     Direction::Outgoing,
///     &mut ctx,
/// )?;
/// ```
pub fn get_neighbors_optimized(
    graph_file: &mut GraphFile,
    node_id: NativeNodeId,
    direction: Direction,
    ctx: &mut TraversalContext,
) -> NativeResult<Vec<NativeNodeId>> {
    let cache_key = (node_id, direction);

    // L1: Check SequentialReadBuffer first (fastest path)
    // Only check after linear pattern is confirmed
    if ctx.detector.is_linear_confirmed() {
        // Extract cluster metadata from buffer if node exists.
        // The map() closure only copies the (offset, size) pair for the
        // requested direction; the NodeRecordV2 borrow ends here.
        let l1_result = ctx.buffer.get(node_id).map(|node_record| {
            let (cluster_offset, cluster_size) = match direction {
                Direction::Outgoing => (
                    node_record.outgoing_cluster_offset,
                    node_record.outgoing_cluster_size,
                ),
                Direction::Incoming => (
                    node_record.incoming_cluster_offset,
                    node_record.incoming_cluster_size,
                ),
            };
            (cluster_offset, cluster_size)
        });

        if let Some((cluster_offset, cluster_size)) = l1_result {
            ctx.record_buffer_hit();

            // If cluster metadata is valid, extract neighbors from buffer
            if cluster_offset > 0 && cluster_size > 0 {
                // First, check if cluster data is already cached in buffer
                if let Some(cached_cluster_bytes) = ctx.buffer.get_cluster(cluster_offset) {
                    // Cluster cache hit - deserialize from cached bytes (no I/O)
                    if let Ok(cluster) = EdgeCluster::deserialize(cached_cluster_bytes) {
                        let neighbors: Vec<NativeNodeId> = cluster
                            .iter_neighbors()
                            .map(|id| id as NativeNodeId)
                            .collect();

                        // Insert into L2 cache for future lookups in this traversal
                        ctx.cache.insert(cache_key, neighbors.clone());

                        return Ok(neighbors);
                    }
                    // If deserialization fails, fall through to file I/O below
                }

                // Cluster cache miss - read cluster data from file
                let mut cluster_data = vec![0u8; cluster_size as usize];
                if graph_file
                    .read_bytes(cluster_offset, &mut cluster_data)
                    .is_ok()
                {
                    // Deserialize edge cluster and extract neighbors
                    if let Ok(cluster) = EdgeCluster::deserialize(&cluster_data) {
                        let neighbors: Vec<NativeNodeId> = cluster
                            .iter_neighbors()
                            .map(|id| id as NativeNodeId)
                            .collect();

                        // Insert into L2 cache for future lookups in this traversal
                        ctx.cache.insert(cache_key, neighbors.clone());

                        return Ok(neighbors);
                    }
                    // If deserialization fails, fall through to L2/L3
                }
                // If read fails, fall through to L2/L3
            }
            // If cluster_offset == 0 or cluster_size == 0, node has no edges in this direction
            // Return empty neighbors immediately (also cached so later lookups
            // for this key hit L2 without re-entering the L1 path)
            else {
                ctx.cache.insert(cache_key, Vec::new());
                return Ok(Vec::new());
            }
        } else {
            ctx.record_buffer_miss();
        }
    }

    // Phase 34: Sequential cluster read (lazy trigger)
    // Trigger only once when linear pattern confirmed, clusters contiguous, and buffer not yet populated
    if ctx.cluster_buffer.is_none() && ctx.detector.should_use_sequential_read() {
        let mut reader = SequentialClusterReader::new();
        match reader.read_chain_clusters(graph_file, ctx.detector.cluster_offsets()) {
            Ok(buffer) => {
                // Keep the raw bytes plus the offsets they were read at; the
                // offsets are needed later to locate each cluster in `buffer`.
                ctx.cluster_buffer = Some(buffer);
                ctx.cluster_buffer_offsets = ctx.detector.cluster_offsets().to_vec();
            }
            Err(_) => {
                // Sequential read failed - fall back to standard path
                // Buffer remains None, subsequent checks will use L2/L3
            }
        }
    }

    // Phase 35: Extract neighbors from cluster_buffer using node_id -> cluster_index mapping
    // NOTE(review): this tier keys only on node_id, not direction — it assumes
    // node_cluster_index was populated for the direction being traversed.
    // Confirm against traverse_with_detection's population logic.
    if let Some(buffer) = &ctx.cluster_buffer {
        // Look up cluster index for this node_id
        if let Some(&cluster_index) = ctx.node_cluster_index.get(&node_id) {
            // Extract neighbors from buffered cluster
            let mut reader = SequentialClusterReader::new();
            match reader.extract_neighbors(buffer, cluster_index, &ctx.cluster_buffer_offsets) {
                Ok(neighbors) => {
                    // Insert into L2 cache for future lookups
                    ctx.cache.insert(cache_key, neighbors.clone());
                    return Ok(neighbors);
                }
                Err(_) => {
                    // Extraction failed - fall through to L2/L3
                    // This shouldn't happen if clusters are valid, but handle gracefully
                }
            }
        }
    }

    // L2: Check TraversalCache (v1.3 cache)
    if let Some(cached) = ctx.cache.get(&cache_key) {
        ctx.stats.record_hit();
        return Ok(cached.clone());
    }
    // L2 missed — record before the storage load so stats stay accurate even
    // if the L3 read below returns an error.
    ctx.stats.record_miss();

    // L3: Load from storage via AdjacencyHelpers
    let neighbors = match direction {
        Direction::Outgoing => AdjacencyHelpers::get_outgoing_neighbors(graph_file, node_id)?,
        Direction::Incoming => AdjacencyHelpers::get_incoming_neighbors(graph_file, node_id)?,
    };

    // Insert into cache for future lookups
    ctx.cache.insert(cache_key, neighbors.clone());

    Ok(neighbors)
}

/// Traversal step that feeds chain detection before fetching neighbors.
///
/// Documents the expected Phase 35 usage pattern:
/// 1. Call `observe_with_cluster()` to track the node and its cluster metadata
/// 2. Record the node's `cluster_index` in the `node_cluster_index` mapping
/// 3. On a `Branching` pattern, drop the sequential buffer immediately via
///    `clear_cluster_buffer()`
/// 4. Delegate to `get_neighbors_optimized()`, which extracts from the
///    cluster buffer when sequential read is active
///
/// # Parameters
///
/// - **graph_file**: Mutable borrow for I/O operations
/// - **node_id**: Current node being traversed
/// - **direction**: Outgoing or Incoming
/// - **cluster_offset**: Edge cluster offset for this node
/// - **cluster_size**: Edge cluster size for this node
/// - **ctx**: Mutable traversal context
///
/// # Returns
///
/// Neighbors for the node, extracted from the cluster buffer when the
/// sequential read path is active.
///
/// # Example
///
/// ```rust,ignore
/// use crate::backend::native::graph_ops::{TraversalContext, traverse_with_detection};
/// use crate::backend::native::adjacency::Direction;
///
/// let mut ctx = TraversalContext::new();
/// let start_node = 1;
///
/// let (cluster_offset, cluster_size) = get_node_cluster_metadata(graph_file, start_node)?;
///
/// let neighbors = traverse_with_detection(
///     graph_file,
///     start_node,
///     Direction::Outgoing,
///     cluster_offset,
///     cluster_size,
///     &mut ctx,
/// )?;
/// ```
pub fn traverse_with_detection(
    graph_file: &mut GraphFile,
    node_id: NativeNodeId,
    direction: Direction,
    cluster_offset: u64,
    cluster_size: u32,
    ctx: &mut TraversalContext,
) -> NativeResult<Vec<NativeNodeId>> {
    use crate::backend::native::adjacency::TraversalPattern;

    // The node's degree drives the linearity heuristic in the detector.
    let degree = if matches!(direction, Direction::Outgoing) {
        AdjacencyHelpers::outgoing_degree(graph_file, node_id)?
    } else {
        AdjacencyHelpers::incoming_degree(graph_file, node_id)?
    };

    // Feed the detector; this appends (cluster_offset, cluster_size) to the
    // detector's cluster list and classifies the traversal pattern so far.
    let pattern = ctx
        .detector
        .observe_with_cluster(node_id, degree, cluster_offset, cluster_size);

    // The observation above pushed one entry, so this node's cluster index is
    // the last slot: len() - 1 (saturating guards the theoretical empty case).
    let index_of_last = ctx.detector.cluster_offsets().len().saturating_sub(1);
    ctx.node_cluster_index.insert(node_id, index_of_last);

    // A branching pattern invalidates the sequential-read assumption; drop
    // the cluster buffer right away so the optimized path falls back cleanly.
    if pattern == TraversalPattern::Branching {
        ctx.clear_cluster_buffer();
    }

    // Fetch neighbors through the tiered lookup (uses the cluster buffer
    // when the sequential read is still active).
    get_neighbors_optimized(graph_file, node_id, direction, ctx)
}

#[cfg(test)]
mod tests {
    // Fix: removed unused imports (`NativeBackendError`, `LinearDetector`,
    // `NativeNodeId`, duplicate `Direction`, `TraversalPattern`) and an
    // unused `mut` binding, all of which triggered compiler warnings.
    use super::*;

    #[test]
    fn test_cache_stats_new() {
        let stats = TraversalCacheStats::new();
        assert_eq!(stats.hits, 0);
        assert_eq!(stats.misses, 0);
        assert_eq!(stats.hit_rate(), 0.0);
    }

    #[test]
    fn test_cache_stats_default() {
        let stats = TraversalCacheStats::default();
        assert_eq!(stats.hits, 0);
        assert_eq!(stats.misses, 0);
    }

    #[test]
    fn test_cache_stats_hit_rate() {
        let mut stats = TraversalCacheStats::new();

        // No operations yet
        assert_eq!(stats.hit_rate(), 0.0);

        // Record some hits and misses
        stats.record_hit();
        stats.record_hit();
        stats.record_miss();
        stats.record_miss();

        // 2 hits out of 4 total = 0.5
        assert!((stats.hit_rate() - 0.5).abs() < f64::EPSILON);

        // All hits
        stats.hits = 10;
        stats.misses = 0;
        assert_eq!(stats.hit_rate(), 1.0);

        // All misses
        stats.hits = 0;
        stats.misses = 10;
        assert_eq!(stats.hit_rate(), 0.0);
    }

    #[test]
    fn test_cache_key_combines_node_and_direction() {
        // Cache keys with same node but different direction are distinct
        let key1: CacheKey = (42, Direction::Outgoing);
        let key2: CacheKey = (42, Direction::Incoming);

        assert_ne!(key1, key2, "Keys should differ by direction");

        // Cache keys with same direction but different node are distinct
        let key3: CacheKey = (42, Direction::Outgoing);
        let key4: CacheKey = (99, Direction::Outgoing);

        assert_ne!(key3, key4, "Keys should differ by node_id");

        // Cache keys with same node and direction are equal
        let key5: CacheKey = (42, Direction::Outgoing);
        let key6: CacheKey = (42, Direction::Outgoing);

        assert_eq!(key5, key6, "Keys should be identical");
    }

    #[test]
    fn test_traversal_cache_type_alias() {
        // TraversalCache is just AHashMap with our key type
        let cache: TraversalCache = TraversalCache::new();
        assert_eq!(cache.len(), 0);

        // Can insert and retrieve
        let mut cache: TraversalCache = AHashMap::new();
        cache.insert((1, Direction::Outgoing), vec![2, 3, 4]);
        cache.insert((1, Direction::Incoming), vec![0]);

        assert_eq!(cache.len(), 2);
        assert_eq!(cache.get(&(1, Direction::Outgoing)), Some(&vec![2, 3, 4]));
        assert_eq!(cache.get(&(1, Direction::Incoming)), Some(&vec![0]));
    }

    // Phase 34: Sequential cluster read integration tests

    #[test]
    fn test_get_neighbors_optimized_has_sequential_cluster_reader_import() {
        // Compile-time test: verify SequentialClusterReader is importable.
        // This test will fail to compile if the path is missing.
        use crate::backend::native::adjacency::SequentialClusterReader;
        // If we got here, the import exists
        let _ = SequentialClusterReader::new();
    }

    // Phase 35: Traversal helper and mapping population tests

    #[test]
    fn test_traverse_with_detection_populates_mapping() {
        let ctx = TraversalContext::new();
        assert!(ctx.node_cluster_index.is_empty());

        // After observing a node, mapping should be populated
        // (This test documents the pattern; full integration test in Phase 35-04)
        assert_eq!(ctx.node_cluster_index.len(), 0);
    }

    #[test]
    fn test_traverse_with_detection_clears_on_branching() {
        let ctx = TraversalContext::new();

        // Simulate branching detection
        // When detector returns Branching, clear_cluster_buffer() should be called
        // This test documents the fallback behavior
        assert!(ctx.cluster_buffer.is_none());
    }
}