plotnik_core/
interner.rs

1//! String interning for efficient string deduplication and comparison.
2//!
3//! Converts heap-allocated strings into cheap integer handles (`Symbol`).
4//! Comparing two symbols is O(1) integer comparison.
5//!
6//! The interner can be serialized to a binary blob format for the compiled query.
7
8use std::collections::HashMap;
9
10/// A lightweight handle to an interned string.
11///
12/// Comparing two symbols is O(1). Symbols are ordered by insertion order,
13/// not lexicographically — use `Interner::resolve` if you need string ordering.
14#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
15pub struct Symbol(u32);
16
17impl Symbol {
18    /// Raw index for serialization/debugging.
19    #[inline]
20    pub fn as_u32(self) -> u32 {
21        self.0
22    }
23
24    /// Create a Symbol from a raw index. Use only for deserialization.
25    #[inline]
26    pub fn from_raw(index: u32) -> Self {
27        Self(index)
28    }
29}
30
31impl PartialOrd for Symbol {
32    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
33        Some(self.cmp(other))
34    }
35}
36
37impl Ord for Symbol {
38    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
39        self.0.cmp(&other.0)
40    }
41}
42
43/// String interner. Deduplicates strings and returns cheap Symbol handles.
44#[derive(Debug, Clone, Default)]
45pub struct Interner {
46    /// Map from string to symbol for deduplication.
47    map: HashMap<String, Symbol>,
48    /// Storage for interned strings, indexed by Symbol.
49    strings: Vec<String>,
50}
51
52impl Interner {
53    pub fn new() -> Self {
54        Self::default()
55    }
56
57    /// Intern a string, returning its Symbol.
58    /// If the string was already interned, returns the existing Symbol.
59    pub fn intern(&mut self, s: &str) -> Symbol {
60        if let Some(&sym) = self.map.get(s) {
61            return sym;
62        }
63
64        let sym = Symbol(self.strings.len() as u32);
65        self.strings.push(s.to_owned());
66        self.map.insert(s.to_owned(), sym);
67        sym
68    }
69
70    /// Intern an owned string, avoiding clone if not already present.
71    pub fn intern_owned(&mut self, s: String) -> Symbol {
72        if let Some(&sym) = self.map.get(&s) {
73            return sym;
74        }
75
76        let sym = Symbol(self.strings.len() as u32);
77        self.strings.push(s.clone());
78        self.map.insert(s, sym);
79        sym
80    }
81
82    /// Resolve a Symbol back to its string.
83    ///
84    /// # Panics
85    /// Panics if the symbol was not created by this interner.
86    #[inline]
87    pub fn resolve(&self, sym: Symbol) -> &str {
88        &self.strings[sym.0 as usize]
89    }
90
91    /// Try to resolve a Symbol, returning None if invalid.
92    #[inline]
93    pub fn try_resolve(&self, sym: Symbol) -> Option<&str> {
94        self.strings.get(sym.0 as usize).map(|s| s.as_str())
95    }
96
97    /// Number of interned strings.
98    #[inline]
99    pub fn len(&self) -> usize {
100        self.strings.len()
101    }
102
103    /// Whether the interner is empty.
104    #[inline]
105    pub fn is_empty(&self) -> bool {
106        self.strings.is_empty()
107    }
108
109    /// Iterate over all interned strings with their symbols.
110    #[inline]
111    pub fn iter(&self) -> impl Iterator<Item = (Symbol, &str)> {
112        self.strings
113            .iter()
114            .enumerate()
115            .map(|(i, s)| (Symbol(i as u32), s.as_str()))
116    }
117
118    /// Emit as binary format blob and offset table.
119    ///
120    /// Returns (concatenated UTF-8 bytes, offset for each string + sentinel).
121    /// The offsets array has `len() + 1` entries; the last is the total blob size.
122    pub fn to_blob(&self) -> (Vec<u8>, Vec<u32>) {
123        let mut blob = Vec::new();
124        let mut offsets = Vec::with_capacity(self.strings.len() + 1);
125
126        for s in &self.strings {
127            offsets.push(blob.len() as u32);
128            blob.extend_from_slice(s.as_bytes());
129        }
130        offsets.push(blob.len() as u32); // sentinel for length calculation
131
132        (blob, offsets)
133    }
134}