nodedb_graph/csr/index/types.rs
1// SPDX-License-Identifier: Apache-2.0
2
3//! `CsrIndex` struct definition and constructor.
4//!
5//! Memory layout at scale (1B edges):
6//! - Old: `Vec<Vec<(String, u32)>>` ≈ 60 GB (heap String per edge)
7//! - New: contiguous `Vec<u32>` offsets + targets + `Vec<u32>` labels ≈ 12 GB
8//!
9//! Writes accumulate in a mutable buffer (`buffer_out`/`buffer_in`).
10//! Reads check both the dense CSR arrays and the mutable buffer.
11//! `compact()` merges the buffer into the dense arrays (double-buffered swap).
12//!
13//! ## Edge Weights
14//!
15//! Optional `f64` weight per edge stored in parallel arrays. `None` when the
16//! graph is entirely unweighted (zero memory overhead). Populated from the
17//! `"weight"` edge property at insertion time. Unweighted edges default to 1.0.
18
19use std::collections::{HashMap, HashSet};
20use std::sync::Arc;
21
22use nodedb_mem::MemoryGovernor;
23
24use crate::csr::dense_array::DenseArray;
25
26// Re-export shared Direction from nodedb-types.
27pub use nodedb_types::graph::Direction;
28
29/// Dense integer CSR adjacency index with interned node IDs and labels.
30pub struct CsrIndex {
31 // ── Node interning ──
32 pub(crate) node_to_id: HashMap<String, u32>,
33 pub(crate) id_to_node: Vec<String>,
34
35 // ── Label interning ──
36 pub(crate) label_to_id: HashMap<String, u32>,
37 pub(crate) id_to_label: Vec<String>,
38
39 // ── Dense CSR (read-only between compactions) ──
40 //
41 // Offsets are `Vec<u32>` (mutable — extended on node creation).
42 // Targets/labels/weights use `DenseArray<T>` for zero-copy mmap support:
43 // after cold start from rkyv checkpoint, these point directly into the
44 // archived buffer with no deserialization. Compaction replaces them with
45 // owned Vecs.
46 /// `out_offsets[i]..out_offsets[i+1]` = range in `out_targets`/`out_labels`.
47 /// Length: `num_nodes + 1`.
48 pub(crate) out_offsets: Vec<u32>,
49 pub(crate) out_targets: DenseArray<u32>,
50 pub(crate) out_labels: DenseArray<u32>,
51 /// Parallel edge weight array. `None` if graph has no weighted edges.
52 pub(crate) out_weights: Option<DenseArray<f64>>,
53
54 pub(crate) in_offsets: Vec<u32>,
55 pub(crate) in_targets: DenseArray<u32>,
56 pub(crate) in_labels: DenseArray<u32>,
57 /// Parallel inbound edge weight array. `None` if graph has no weighted edges.
58 pub(crate) in_weights: Option<DenseArray<f64>>,
59
60 // ── Mutable write buffer ──
61 /// Per-node outbound buffer: `buffer_out[node_id]` = `[(label_id, dst_id)]`.
62 pub(crate) buffer_out: Vec<Vec<(u32, u32)>>,
63 pub(crate) buffer_in: Vec<Vec<(u32, u32)>>,
64 /// Per-node outbound weight buffer (parallel to `buffer_out`).
65 /// Only populated when `has_weights` is true.
66 pub(crate) buffer_out_weights: Vec<Vec<f64>>,
67 /// Per-node inbound weight buffer (parallel to `buffer_in`).
68 pub(crate) buffer_in_weights: Vec<Vec<f64>>,
69
70 /// Edges deleted since last compaction: `(src, label, dst)`.
71 pub(crate) deleted_edges: HashSet<(u32, u32, u32)>,
72
73 /// Whether any edge has a non-default weight. When false, weight arrays
74 /// are `None` and weight buffers are empty — zero overhead for unweighted graphs.
75 pub(crate) has_weights: bool,
76
77 // ── Node labels (bitset) ──
78 //
79 // Each node has a `u64` bitset where bit `i` corresponds to label ID `i`.
80 // Supports up to 64 distinct node labels. Labels are interned in
81 // `node_label_to_id` / `node_label_names` (separate from edge labels).
82 // Used by MATCH pattern `(a:Person)` — filters nodes by label membership.
83 pub(crate) node_label_bits: Vec<u64>,
84 pub(crate) node_label_to_id: HashMap<String, u8>,
85 pub(crate) node_label_names: Vec<String>,
86
87 // ── Surrogate storage ──
88 /// Per-node surrogate: `node_surrogates[local_id]` = global `Surrogate.as_u32()`.
89 ///
90 /// `0` (Surrogate::ZERO) is the unset sentinel — populated at `EdgePut` time
91 /// from the surrogates resolved by the Control Plane. A node whose surrogate
92 /// is zero was inserted without surrogate plumbing (e.g. legacy paths or tests)
93 /// and is treated as "not in any prefilter bitmap" when a bitmap is active.
94 pub(crate) node_surrogates: Vec<u32>,
95 /// Reverse map: `Surrogate.as_u32()` → CSR-local node id. Maintained
96 /// in step with `node_surrogates` by `set_node_surrogate`. Excludes the
97 /// zero sentinel. Used by cross-engine fusion (graph RAG) to resolve a
98 /// vector-side surrogate to the corresponding graph node name.
99 pub(crate) surrogate_to_local: HashMap<u32, u32>,
100
101 // ── Hot/cold access tracking ──
102 /// Per-node access counter: incremented on each neighbor/BFS/path query.
103 /// Uses `Cell<u32>` so access can be tracked through `&self` references
104 /// (traversal methods are `&self` for shared read access).
105 pub(crate) access_counts: Vec<std::cell::Cell<u32>>,
106 /// Total queries served since last access counter reset.
107 pub(crate) query_epoch: u64,
108
109 /// Unique partition tag assigned at construction. Embedded into
110 /// every `LocalNodeId` this index produces; cross-partition use is
111 /// caught by comparing tags at API boundaries.
112 pub(crate) partition_tag: u32,
113
114 /// Optional memory governor for budget tracking.
115 ///
116 /// When `None`, all memory operations proceed without budget enforcement
117 /// (the behavior for NodeDB-Lite / WASM deployments that have no governor).
118 /// When `Some`, `compact()`, `checkpoint_to_bytes()`, and `compute_statistics()`
119 /// reserve bytes against `EngineId::Graph` before allocating and release them
120 /// on drop via `BudgetGuard`.
121 pub(crate) governor: Option<Arc<MemoryGovernor>>,
122}
123
124impl Default for CsrIndex {
125 fn default() -> Self {
126 Self::new()
127 }
128}
129
130impl CsrIndex {
131 pub fn new() -> Self {
132 Self {
133 node_to_id: HashMap::new(),
134 id_to_node: Vec::new(),
135 label_to_id: HashMap::new(),
136 id_to_label: Vec::new(),
137 out_offsets: vec![0],
138 out_targets: DenseArray::default(),
139 out_labels: DenseArray::default(),
140 out_weights: None,
141 in_offsets: vec![0],
142 in_targets: DenseArray::default(),
143 in_labels: DenseArray::default(),
144 in_weights: None,
145 buffer_out: Vec::new(),
146 buffer_in: Vec::new(),
147 buffer_out_weights: Vec::new(),
148 buffer_in_weights: Vec::new(),
149 deleted_edges: HashSet::new(),
150 has_weights: false,
151 node_label_bits: Vec::new(),
152 node_label_to_id: HashMap::new(),
153 node_label_names: Vec::new(),
154 node_surrogates: Vec::new(),
155 surrogate_to_local: HashMap::new(),
156 access_counts: Vec::new(),
157 query_epoch: 0,
158 partition_tag: crate::csr::local_node_id::next_partition_tag(),
159 governor: None,
160 }
161 }
162
163 /// Create a new `CsrIndex` wired to a memory governor.
164 ///
165 /// Subsequent calls to `compact()`, `checkpoint_to_bytes()`, and
166 /// `compute_statistics()` will reserve bytes against `EngineId::Graph`
167 /// before allocating and return `Err(GraphError::MemoryBudget(_))` if
168 /// the budget is exhausted.
169 ///
170 /// Use `CsrIndex::new()` when deploying without a governor (NodeDB-Lite,
171 /// WASM, or tests that do not need budget enforcement).
172 pub fn with_governor(governor: Arc<MemoryGovernor>) -> Self {
173 Self {
174 governor: Some(governor),
175 ..Self::new()
176 }
177 }
178
179 /// Attach a memory governor to an existing `CsrIndex`.
180 ///
181 /// Used by the REINDEX path: a partition is rebuilt on a background thread
182 /// (without a governor since `MemoryGovernor` is `Arc<...>` but the thread
183 /// is independent), and on cutover the Data Plane installs the governor.
184 pub fn with_governor_attached(mut self, governor: Arc<MemoryGovernor>) -> Self {
185 self.governor = Some(governor);
186 self
187 }
188
189 /// Whether any edge in this index has a non-default (1.0) weight.
190 pub fn has_weighted_edges(&self) -> bool {
191 self.has_weights
192 }
193}