Skip to main content

reddb_server/storage/engine/graph_store/
label_registry.rs

1//! Dynamic label registry for graph nodes and edges.
2//!
3//! Replaces the hardcoded `GraphNodeType` / `GraphEdgeType` enums with a
4//! per-database catalog mapping arbitrary label strings to dense `u32`
5//! identifiers. This unhooks the storage engine from any specific domain
6//! (the prior enums were pentest-flavoured: `Host`, `Vulnerability`, …).
7//!
8//! # Design
9//!
10//! - Two independent namespaces: [`Namespace::Node`] and [`Namespace::Edge`].
11//!   A label `"host"` registered as a node label is distinct from `"host"` as
12//!   an edge label. Mirrors Neo4j semantics where node labels and relationship
13//!   types live in different ID spaces.
14//! - IDs are dense `u32`, allocated monotonically starting at `1`.
15//!   `LabelId(0)` is reserved as the sentinel "unset/unknown" value so callers
16//!   can use it as a default without clashing with a real label.
17//! - Reserved ID range `1..=63` carries the legacy `GraphNodeType` /
18//!   `GraphEdgeType` variants so on-disk format v1 records can be decoded
19//!   into the new format without losing semantics. New labels start at
20//!   [`FIRST_USER_LABEL_ID`] = 64.
21//!
22//! # Persistence
23//!
24//! Encode: `[entry_count: u32 LE]` followed by `entry_count` records of:
25//! `[id: u32 LE][namespace: u8][label_len: u16 LE][label_bytes…]`.
26//! Decode tolerates trailing garbage (returns the prefix it understood).
27//!
28//! Persisting the registry is the caller's job — `GraphStore` writes it as
29//! a sidecar page; embedded users can call [`LabelRegistry::encode`] /
30//! [`LabelRegistry::decode`] directly.
31
32use std::collections::HashMap;
33use std::sync::RwLock;
34
35/// Sentinel value for "no label assigned". Never returned by `intern`.
36pub const UNSET_LABEL_ID: LabelId = LabelId(0);
37
38/// First ID handed out to labels registered at runtime. IDs below this are
39/// reserved for legacy `GraphNodeType` / `GraphEdgeType` variants so v1
40/// page-format files can be migrated in place.
41pub const FIRST_USER_LABEL_ID: u32 = 64;
42
43/// Densely-numbered identifier for a label string within a [`LabelRegistry`].
44///
45/// Cheap to copy/hash/compare. Persisted as a 4-byte little-endian value in
46/// the v2 graph page format (replaces the v1 `u8` enum discriminant).
47#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
48pub struct LabelId(pub u32);
49
50impl LabelId {
51    /// Construct from raw u32.
52    #[inline]
53    pub const fn new(id: u32) -> Self {
54        Self(id)
55    }
56
57    /// Raw u32 value.
58    #[inline]
59    pub const fn as_u32(self) -> u32 {
60        self.0
61    }
62
63    /// True for `UNSET_LABEL_ID`.
64    #[inline]
65    pub const fn is_unset(self) -> bool {
66        self.0 == 0
67    }
68}
69
70/// Which catalog a label belongs to.
71///
72/// Node labels and edge labels live in separate ID spaces so the same
73/// string (`"host"`) can be reused with different semantics.
74#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
75#[repr(u8)]
76pub enum Namespace {
77    Node = 0,
78    Edge = 1,
79}
80
81impl Namespace {
82    fn from_u8(v: u8) -> Option<Self> {
83        match v {
84            0 => Some(Self::Node),
85            1 => Some(Self::Edge),
86            _ => None,
87        }
88    }
89}
90
91/// Errors returned by [`LabelRegistry`] operations.
92#[derive(Debug, Clone, PartialEq, Eq)]
93pub enum LabelRegistryError {
94    /// Label exceeds [`MAX_LABEL_LEN`] bytes.
95    LabelTooLong { len: usize, max: usize },
96    /// Decoding hit malformed bytes at the given offset.
97    Malformed { offset: usize, reason: &'static str },
98    /// Internal lock was poisoned (process should treat this as fatal).
99    LockPoisoned,
100}
101
102impl std::fmt::Display for LabelRegistryError {
103    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
104        match self {
105            Self::LabelTooLong { len, max } => {
106                write!(f, "label too long: {} bytes (max {})", len, max)
107            }
108            Self::Malformed { offset, reason } => {
109                write!(f, "malformed registry at offset {}: {}", offset, reason)
110            }
111            Self::LockPoisoned => write!(f, "label registry lock poisoned"),
112        }
113    }
114}
115
116impl std::error::Error for LabelRegistryError {}
117
118/// Hard cap on label string length. Matches `MAX_LABEL_SIZE` in `graph_store.rs`.
119pub const MAX_LABEL_LEN: usize = 512;
120
121/// Bidirectional `label ↔ LabelId` catalog with concurrent access.
122///
123/// Cloned cheaply via `Arc<LabelRegistry>` from `GraphStore`. Internally
124/// uses a single `RwLock` covering both directions of the map plus the
125/// next-id counter so that allocation + insertion is atomic.
126#[derive(Debug)]
127pub struct LabelRegistry {
128    inner: RwLock<RegistryInner>,
129}
130
131#[derive(Debug)]
132struct RegistryInner {
133    /// `(namespace, label) → id`
134    by_label: HashMap<(Namespace, String), LabelId>,
135    /// `id → (namespace, label)`. `id` is the index, with `Vec[0]` reserved
136    /// as the sentinel for `UNSET_LABEL_ID`.
137    by_id: Vec<Option<(Namespace, String)>>,
138    /// Next unused ID. Starts at [`FIRST_USER_LABEL_ID`] after legacy seed.
139    next_id: u32,
140}
141
142impl LabelRegistry {
143    /// New empty registry pre-seeded with legacy `GraphNodeType` and
144    /// `GraphEdgeType` variant names so v1-format records decode into stable
145    /// IDs without a separate migration step.
146    pub fn with_legacy_seed() -> Self {
147        let reg = Self::empty();
148        // SAFETY of unwraps: labels are short literals well under MAX_LABEL_LEN
149        // and the empty registry has no allocation pressure.
150        for (raw, name) in LEGACY_NODE_LABELS {
151            reg.intern_with_id(Namespace::Node, name, LabelId(legacy_node_id(*raw)))
152                .expect("legacy node label seed");
153        }
154        for (raw, name) in LEGACY_EDGE_LABELS {
155            reg.intern_with_id(Namespace::Edge, name, LabelId(legacy_edge_id(*raw)))
156                .expect("legacy edge label seed");
157        }
158        // Bump next_id past the reserved range.
159        if let Ok(mut g) = reg.inner.write() {
160            g.next_id = FIRST_USER_LABEL_ID;
161        }
162        reg
163    }
164
165    /// New empty registry with no legacy seeding. Use only when callers do
166    /// not need to decode v1 graph pages.
167    pub fn empty() -> Self {
168        Self {
169            inner: RwLock::new(RegistryInner {
170                by_label: HashMap::new(),
171                // Index 0 is the sentinel — leave it None.
172                by_id: vec![None],
173                next_id: 1,
174            }),
175        }
176    }
177
178    /// Look up an existing label or allocate a new ID. Idempotent.
179    pub fn intern(&self, ns: Namespace, label: &str) -> Result<LabelId, LabelRegistryError> {
180        if label.len() > MAX_LABEL_LEN {
181            return Err(LabelRegistryError::LabelTooLong {
182                len: label.len(),
183                max: MAX_LABEL_LEN,
184            });
185        }
186        let mut g = self
187            .inner
188            .write()
189            .map_err(|_| LabelRegistryError::LockPoisoned)?;
190        if let Some(&id) = g.by_label.get(&(ns, label.to_string())) {
191            return Ok(id);
192        }
193        let id = LabelId(g.next_id);
194        g.next_id = g
195            .next_id
196            .checked_add(1)
197            .expect("LabelId u32 space exhausted (>4B labels)");
198        let key = (ns, label.to_string());
199        g.by_label.insert(key.clone(), id);
200        let idx = id.0 as usize;
201        if g.by_id.len() <= idx {
202            g.by_id.resize(idx + 1, None);
203        }
204        g.by_id[idx] = Some(key);
205        Ok(id)
206    }
207
208    /// Look up the ID for an existing label. `None` if not interned.
209    pub fn lookup(&self, ns: Namespace, label: &str) -> Option<LabelId> {
210        let g = self.inner.read().ok()?;
211        g.by_label.get(&(ns, label.to_string())).copied()
212    }
213
214    /// Resolve an ID back to its `(namespace, label)`. `None` for unknown
215    /// or sentinel IDs.
216    pub fn resolve(&self, id: LabelId) -> Option<(Namespace, String)> {
217        if id.is_unset() {
218            return None;
219        }
220        let g = self.inner.read().ok()?;
221        g.by_id.get(id.0 as usize).cloned().flatten()
222    }
223
224    /// Resolve an ID to just its label string, scoped to a namespace.
225    /// Returns `None` if the ID belongs to a different namespace.
226    pub fn label_of(&self, ns: Namespace, id: LabelId) -> Option<String> {
227        self.resolve(id)
228            .filter(|(found_ns, _)| *found_ns == ns)
229            .map(|(_, l)| l)
230    }
231
232    /// Total interned labels (across both namespaces, excludes sentinel).
233    pub fn len(&self) -> usize {
234        self.inner.read().map(|g| g.by_label.len()).unwrap_or(0)
235    }
236
237    /// True if no labels have been interned.
238    pub fn is_empty(&self) -> bool {
239        self.len() == 0
240    }
241
242    /// Translate a v1 `GraphNodeType` discriminant (`0..=8`) into the
243    /// reserved `LabelId` it was seeded with. Returns `UNSET_LABEL_ID` for
244    /// unknown discriminants (forward-compat).
245    pub fn legacy_node_label_id(disc: u8) -> LabelId {
246        if (disc as usize) < LEGACY_NODE_LABELS.len() {
247            LabelId(legacy_node_id(disc))
248        } else {
249            UNSET_LABEL_ID
250        }
251    }
252
253    /// Translate a v1 `GraphEdgeType` discriminant (`0..=9`) into the
254    /// reserved `LabelId` it was seeded with.
255    pub fn legacy_edge_label_id(disc: u8) -> LabelId {
256        if (disc as usize) < LEGACY_EDGE_LABELS.len() {
257            LabelId(legacy_edge_id(disc))
258        } else {
259            UNSET_LABEL_ID
260        }
261    }
262
263    /// Serialize the catalog to a self-describing byte buffer. Format:
264    /// `[count: u32 LE]([id: u32 LE][ns: u8][len: u16 LE][label_bytes])*`
265    pub fn encode(&self) -> Result<Vec<u8>, LabelRegistryError> {
266        let g = self
267            .inner
268            .read()
269            .map_err(|_| LabelRegistryError::LockPoisoned)?;
270        // Count only populated slots.
271        let entries: Vec<(LabelId, Namespace, &str)> = g
272            .by_id
273            .iter()
274            .enumerate()
275            .filter_map(|(i, slot)| {
276                slot.as_ref()
277                    .map(|(ns, label)| (LabelId(i as u32), *ns, label.as_str()))
278            })
279            .collect();
280        let mut buf = Vec::with_capacity(4 + entries.len() * 16);
281        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
282        for (id, ns, label) in entries {
283            buf.extend_from_slice(&id.0.to_le_bytes());
284            buf.push(ns as u8);
285            let bytes = label.as_bytes();
286            buf.extend_from_slice(&(bytes.len() as u16).to_le_bytes());
287            buf.extend_from_slice(bytes);
288        }
289        Ok(buf)
290    }
291
292    /// Inverse of [`encode`]. Returns a fresh registry; the legacy seed is
293    /// *not* re-applied (caller decides whether incoming bytes already
294    /// contain the legacy entries).
295    pub fn decode(data: &[u8]) -> Result<Self, LabelRegistryError> {
296        let reg = Self::empty();
297        if data.len() < 4 {
298            return Err(LabelRegistryError::Malformed {
299                offset: 0,
300                reason: "header truncated",
301            });
302        }
303        let count = u32::from_le_bytes([data[0], data[1], data[2], data[3]]) as usize;
304        let mut off = 4;
305        for _ in 0..count {
306            if data.len() < off + 7 {
307                return Err(LabelRegistryError::Malformed {
308                    offset: off,
309                    reason: "entry header truncated",
310                });
311            }
312            let id = u32::from_le_bytes([data[off], data[off + 1], data[off + 2], data[off + 3]]);
313            let ns = Namespace::from_u8(data[off + 4]).ok_or(LabelRegistryError::Malformed {
314                offset: off + 4,
315                reason: "unknown namespace",
316            })?;
317            let len = u16::from_le_bytes([data[off + 5], data[off + 6]]) as usize;
318            off += 7;
319            if data.len() < off + len {
320                return Err(LabelRegistryError::Malformed {
321                    offset: off,
322                    reason: "label bytes truncated",
323                });
324            }
325            let label = std::str::from_utf8(&data[off..off + len]).map_err(|_| {
326                LabelRegistryError::Malformed {
327                    offset: off,
328                    reason: "label not utf8",
329                }
330            })?;
331            reg.intern_with_id(ns, label, LabelId(id))?;
332            off += len;
333        }
334        // Bump next_id to one past the highest ID seen so subsequent
335        // intern() calls do not collide with restored entries.
336        if let Ok(mut g) = reg.inner.write() {
337            let max_id = g
338                .by_id
339                .iter()
340                .enumerate()
341                .filter_map(|(i, slot)| slot.as_ref().map(|_| i as u32))
342                .max()
343                .unwrap_or(0);
344            g.next_id = max_id.saturating_add(1).max(FIRST_USER_LABEL_ID);
345        }
346        Ok(reg)
347    }
348
349    /// Insert at a specific ID. Used by [`with_legacy_seed`] and [`decode`].
350    /// Errors if the ID is already taken with a different `(ns, label)`.
351    fn intern_with_id(
352        &self,
353        ns: Namespace,
354        label: &str,
355        id: LabelId,
356    ) -> Result<(), LabelRegistryError> {
357        if label.len() > MAX_LABEL_LEN {
358            return Err(LabelRegistryError::LabelTooLong {
359                len: label.len(),
360                max: MAX_LABEL_LEN,
361            });
362        }
363        let mut g = self
364            .inner
365            .write()
366            .map_err(|_| LabelRegistryError::LockPoisoned)?;
367        let idx = id.0 as usize;
368        if g.by_id.len() <= idx {
369            g.by_id.resize(idx + 1, None);
370        }
371        let key = (ns, label.to_string());
372        if let Some(existing) = &g.by_id[idx] {
373            if existing != &key {
374                return Err(LabelRegistryError::Malformed {
375                    offset: 0,
376                    reason: "id collision with different label",
377                });
378            }
379            return Ok(());
380        }
381        g.by_id[idx] = Some(key.clone());
382        g.by_label.insert(key, id);
383        Ok(())
384    }
385}
386
387impl Default for LabelRegistry {
388    fn default() -> Self {
389        Self::with_legacy_seed()
390    }
391}
392
393// ===== Legacy seed tables ===================================================
394//
395// These mirror the v1 `GraphNodeType` and `GraphEdgeType` enums *only* so
396// that v1-format graph pages on disk can be decoded into the new label-id
397// world without a separate migration tool. They are intentionally NOT
398// referenced anywhere else — new code should call `intern()` with whatever
399// label string the user picks.
400
401const LEGACY_NODE_LABELS: &[(u8, &str)] = &[
402    (0, "host"),
403    (1, "service"),
404    (2, "credential"),
405    (3, "vulnerability"),
406    (4, "endpoint"),
407    (5, "technology"),
408    (6, "user"),
409    (7, "domain"),
410    (8, "certificate"),
411];
412
413const LEGACY_EDGE_LABELS: &[(u8, &str)] = &[
414    (0, "has_service"),
415    (1, "has_endpoint"),
416    (2, "uses_tech"),
417    (3, "auth_access"),
418    (4, "affected_by"),
419    (5, "contains"),
420    (6, "connects_to"),
421    (7, "related_to"),
422    (8, "has_user"),
423    (9, "has_cert"),
424];
425
426// Legacy IDs are packed 1..=9 (nodes) and 10..=19 (edges), leaving 20..=63
427// as headroom in the reserved range. `FIRST_USER_LABEL_ID = 64` is where
428// runtime allocations begin.
429fn legacy_node_id(disc: u8) -> u32 {
430    1 + disc as u32
431}
432
433fn legacy_edge_id(disc: u8) -> u32 {
434    10 + disc as u32
435}
436
437#[cfg(test)]
438mod tests {
439    use super::*;
440
441    #[test]
442    fn empty_registry_has_no_entries_but_sentinel_resolves_to_none() {
443        let r = LabelRegistry::empty();
444        assert!(r.is_empty());
445        assert_eq!(r.resolve(UNSET_LABEL_ID), None);
446        assert_eq!(r.lookup(Namespace::Node, "anything"), None);
447    }
448
449    #[test]
450    fn intern_is_idempotent() {
451        let r = LabelRegistry::empty();
452        let a = r.intern(Namespace::Node, "order").unwrap();
453        let b = r.intern(Namespace::Node, "order").unwrap();
454        assert_eq!(a, b);
455        assert_eq!(r.len(), 1);
456    }
457
458    #[test]
459    fn namespaces_are_independent() {
460        let r = LabelRegistry::empty();
461        let n = r.intern(Namespace::Node, "host").unwrap();
462        let e = r.intern(Namespace::Edge, "host").unwrap();
463        assert_ne!(
464            n, e,
465            "same label in different namespaces must get distinct ids"
466        );
467        assert_eq!(r.label_of(Namespace::Node, n).as_deref(), Some("host"));
468        assert_eq!(r.label_of(Namespace::Edge, e).as_deref(), Some("host"));
469        assert_eq!(r.label_of(Namespace::Node, e), None);
470    }
471
472    #[test]
473    fn legacy_seed_populates_reserved_range() {
474        let r = LabelRegistry::with_legacy_seed();
475        // Old GraphNodeType::Host (disc=0) → "host" at LabelId(1).
476        let host_id = r.lookup(Namespace::Node, "host").unwrap();
477        assert_eq!(host_id, LabelId(1));
478        assert_eq!(LabelRegistry::legacy_node_label_id(0), host_id);
479        // Old GraphEdgeType::HasService (disc=0) → "has_service" at LabelId(10).
480        let edge_id = r.lookup(Namespace::Edge, "has_service").unwrap();
481        assert_eq!(edge_id, LabelId(10));
482        assert_eq!(LabelRegistry::legacy_edge_label_id(0), edge_id);
483    }
484
485    #[test]
486    fn user_labels_start_at_first_user_id() {
487        let r = LabelRegistry::with_legacy_seed();
488        let id = r.intern(Namespace::Node, "order").unwrap();
489        assert_eq!(id, LabelId(FIRST_USER_LABEL_ID));
490        let id2 = r.intern(Namespace::Node, "product").unwrap();
491        assert_eq!(id2, LabelId(FIRST_USER_LABEL_ID + 1));
492    }
493
494    #[test]
495    fn round_trip_encode_decode() {
496        let r = LabelRegistry::with_legacy_seed();
497        r.intern(Namespace::Node, "order").unwrap();
498        r.intern(Namespace::Node, "product").unwrap();
499        r.intern(Namespace::Edge, "purchased").unwrap();
500        let bytes = r.encode().unwrap();
501        let restored = LabelRegistry::decode(&bytes).unwrap();
502        assert_eq!(restored.len(), r.len());
503        assert_eq!(
504            restored.lookup(Namespace::Node, "order"),
505            r.lookup(Namespace::Node, "order")
506        );
507        assert_eq!(
508            restored.lookup(Namespace::Edge, "purchased"),
509            r.lookup(Namespace::Edge, "purchased")
510        );
511        // After decode, next intern must NOT reuse a restored ID.
512        let new_id = restored.intern(Namespace::Node, "shipment").unwrap();
513        let prior_max = r.lookup(Namespace::Node, "product").unwrap();
514        assert!(new_id.0 > prior_max.0);
515    }
516
517    #[test]
518    fn decode_rejects_truncated_input() {
519        let bad = vec![0xff, 0xff, 0xff];
520        assert!(matches!(
521            LabelRegistry::decode(&bad),
522            Err(LabelRegistryError::Malformed { .. })
523        ));
524    }
525
526    #[test]
527    fn decode_rejects_invalid_namespace() {
528        // count=1, id=64, ns=99 (invalid), len=4, "test"
529        let mut bad = Vec::new();
530        bad.extend_from_slice(&1u32.to_le_bytes());
531        bad.extend_from_slice(&64u32.to_le_bytes());
532        bad.push(99);
533        bad.extend_from_slice(&4u16.to_le_bytes());
534        bad.extend_from_slice(b"test");
535        let err = LabelRegistry::decode(&bad).unwrap_err();
536        assert!(matches!(err, LabelRegistryError::Malformed { .. }));
537    }
538
539    #[test]
540    fn label_too_long_is_rejected() {
541        let r = LabelRegistry::empty();
542        let big = "x".repeat(MAX_LABEL_LEN + 1);
543        assert!(matches!(
544            r.intern(Namespace::Node, &big),
545            Err(LabelRegistryError::LabelTooLong { .. })
546        ));
547    }
548
549    #[test]
550    fn concurrent_intern_yields_consistent_ids() {
551        use std::sync::Arc;
552        use std::thread;
553
554        let r = Arc::new(LabelRegistry::empty());
555        let handles: Vec<_> = (0..16)
556            .map(|i| {
557                let r = Arc::clone(&r);
558                thread::spawn(move || {
559                    let mut ids = Vec::new();
560                    for j in 0..50 {
561                        let label = format!("label_{}_{}", i % 4, j);
562                        ids.push(r.intern(Namespace::Node, &label).unwrap());
563                    }
564                    ids
565                })
566            })
567            .collect();
568        for h in handles {
569            h.join().unwrap();
570        }
571        // Every distinct label should map to a unique LabelId. With i % 4
572        // we have 4 distinct prefixes × 50 suffixes = 200 distinct labels.
573        let mut seen_ids = std::collections::HashSet::new();
574        for i in 0..4 {
575            for j in 0..50 {
576                let label = format!("label_{}_{}", i, j);
577                let id = r.lookup(Namespace::Node, &label).unwrap();
578                assert!(seen_ids.insert(id), "duplicate id {:?} for {}", id, label);
579            }
580        }
581        assert_eq!(seen_ids.len(), 200);
582    }
583
584    #[test]
585    fn unset_id_never_resolves() {
586        let r = LabelRegistry::with_legacy_seed();
587        assert!(UNSET_LABEL_ID.is_unset());
588        assert_eq!(r.resolve(UNSET_LABEL_ID), None);
589        assert_eq!(r.label_of(Namespace::Node, UNSET_LABEL_ID), None);
590    }
591}