Skip to main content

xsd_schema/document/
qname.rs

1//! QName atomization for the document buffer.
2//!
3//! [`QNameAtom`] is a 16-byte `Copy` struct holding interned name parts.
4//! [`QNameTable`] deduplicates QNameAtoms via a chained hash table
5//! (same pattern as [`NameTable`](crate::namespace::NameTable)).
6
7use crate::ids::NameId;
8use std::collections::hash_map::DefaultHasher;
9use std::hash::{Hash, Hasher};
10
11/// Atomized qualified name — 20 bytes, `Copy`.
12///
13/// **Equality** compares `local_name`, `namespace_uri`, `prefix`, and
14/// `local_name_hash`.  The `qualified_name_idx` field is **excluded** from
15/// equality because it is a per-document `StringStore` index whose value
16/// may differ across occurrences even though the string content is identical
17/// (StringStore does not deduplicate).  Since the qualified name is fully
18/// determined by `prefix` + `local_name`, comparing those is sufficient.
19///
20/// The [`Hash`] trait implementation hashes only `local_name` and
21/// `namespace_uri` because XML namespace identity ignores the prefix.
22/// This means two atoms that differ only in prefix will hash to the same
23/// bucket but will **not** compare as equal, so `QNameTable::atomize` will
24/// store them as separate entries — which is the desired semantics (the
25/// navigator needs to report the original prefix).
26#[derive(Clone, Copy, Debug)]
27pub struct QNameAtom {
28    pub local_name: NameId,
29    pub namespace_uri: NameId,
30    pub prefix: NameId,
31    pub local_name_hash: u32,
32    /// Index into the per-document `StringStore` (not `NameTable`).
33    /// Excluded from `PartialEq` / `hash_atom` — see struct doc.
34    pub qualified_name_idx: u32,
35}
36
37// Equality compares all fields *except* qualified_name_idx.
38// See struct doc-comment for rationale.
39impl PartialEq for QNameAtom {
40    fn eq(&self, other: &Self) -> bool {
41        self.local_name == other.local_name
42            && self.namespace_uri == other.namespace_uri
43            && self.prefix == other.prefix
44            && self.local_name_hash == other.local_name_hash
45    }
46}
47
48impl Eq for QNameAtom {}
49
50// Hash only local_name + namespace_uri (prefix is irrelevant per XML namespace identity).
51// See doc-comment on the struct for rationale on the Hash/Eq asymmetry.
52impl Hash for QNameAtom {
53    fn hash<H: Hasher>(&self, state: &mut H) {
54        self.local_name.hash(state);
55        self.namespace_uri.hash(state);
56    }
57}
58
59/// Sentinel at index 0 — represents "no name".
60pub const EMPTY_QNAME: QNameAtom = QNameAtom {
61    local_name: NameId(0),
62    namespace_uri: NameId(0),
63    prefix: NameId(0),
64    local_name_hash: 0,
65    qualified_name_idx: 0,
66};
67
68/// Chained hash table that deduplicates [`QNameAtom`] values.
69///
70/// Index 0 is always [`EMPTY_QNAME`] and is never placed into any bucket.
71/// The internal hash used for bucket placement hashes the four identity
72/// fields (`local_name`, `namespace_uri`, `prefix`, `local_name_hash`)
73/// so that atoms differing only in prefix land in different buckets when
74/// possible, avoiding long chains.
75pub struct QNameTable {
76    /// All atoms (index 0 = EMPTY_QNAME sentinel).
77    atoms: Vec<QNameAtom>,
78    /// Parallel chain links (-1 = end of chain).
79    nexts: Vec<i32>,
80    /// Bucket heads (-1 = empty bucket).
81    buckets: Vec<i32>,
82}
83
84impl QNameTable {
85    const INITIAL_BUCKETS: usize = 64;
86
87    /// Creates a new table with [`EMPTY_QNAME`] at index 0.
88    pub fn new() -> Self {
89        let mut atoms = Vec::with_capacity(Self::INITIAL_BUCKETS);
90        let mut nexts = Vec::with_capacity(Self::INITIAL_BUCKETS);
91
92        // Sentinel at index 0 — not inserted into any bucket.
93        atoms.push(EMPTY_QNAME);
94        nexts.push(-1);
95
96        Self {
97            atoms,
98            nexts,
99            buckets: vec![-1; Self::INITIAL_BUCKETS],
100        }
101    }
102
103    /// Inserts `qname` into the table if not already present, returning its index.
104    ///
105    /// Deduplication compares the four identity fields (including prefix,
106    /// but excluding `qualified_name_idx`).
107    pub fn atomize(&mut self, qname: QNameAtom) -> u32 {
108        let hash = Self::hash_atom(&qname);
109
110        // Probe the chain for a match on the identity fields (via PartialEq).
111        let bucket_idx = (hash as usize) % self.buckets.len();
112        let mut entry_idx = self.buckets[bucket_idx];
113        while entry_idx >= 0 {
114            if self.atoms[entry_idx as usize] == qname {
115                return entry_idx as u32;
116            }
117            entry_idx = self.nexts[entry_idx as usize];
118        }
119
120        // Rehash if load factor exceeded.
121        if self.atoms.len() >= self.buckets.len() {
122            self.rehash();
123        }
124
125        // Insert new entry.
126        let new_idx = self.atoms.len() as u32;
127        let bucket_idx = (hash as usize) % self.buckets.len();
128        let head = self.buckets[bucket_idx];
129        self.atoms.push(qname);
130        self.nexts.push(head);
131        self.buckets[bucket_idx] = new_idx as i32;
132
133        new_idx
134    }
135
136    /// Returns the [`QNameAtom`] at the given index.
137    ///
138    /// # Panics
139    ///
140    /// Panics if `idx` is out of range.
141    #[inline]
142    pub fn get(&self, idx: u32) -> QNameAtom {
143        self.atoms[idx as usize]
144    }
145
146    // ── Internal helpers ──────────────────────────────────────────────
147
148    /// Hash the four identity fields for bucket placement
149    /// (`qualified_name_idx` excluded — see [`QNameAtom`] doc).
150    fn hash_atom(qname: &QNameAtom) -> u64 {
151        let mut hasher = DefaultHasher::new();
152        qname.local_name.hash(&mut hasher);
153        qname.namespace_uri.hash(&mut hasher);
154        qname.prefix.hash(&mut hasher);
155        qname.local_name_hash.hash(&mut hasher);
156        hasher.finish()
157    }
158
159    /// Double the bucket count and re-insert all entries (skipping index 0).
160    fn rehash(&mut self) {
161        let new_size = self.buckets.len() * 2;
162        self.buckets = vec![-1; new_size];
163
164        // Reset all chain links.
165        for n in self.nexts.iter_mut() {
166            *n = -1;
167        }
168
169        // Re-insert entries 1..len (skip sentinel at 0).
170        for idx in 1..self.atoms.len() {
171            let hash = Self::hash_atom(&self.atoms[idx]);
172            let bucket_idx = (hash as usize) % new_size;
173            self.nexts[idx] = self.buckets[bucket_idx];
174            self.buckets[bucket_idx] = idx as i32;
175        }
176    }
177}
178
179impl Default for QNameTable {
180    fn default() -> Self {
181        Self::new()
182    }
183}
184
185#[cfg(test)]
186mod tests {
187    use super::*;
188
189    fn make_qname(local: u32, ns: u32, prefix: u32, hash: u32) -> QNameAtom {
190        QNameAtom {
191            local_name: NameId(local),
192            namespace_uri: NameId(ns),
193            prefix: NameId(prefix),
194            local_name_hash: hash,
195            qualified_name_idx: 0,
196        }
197    }
198
199    #[test]
200    fn empty_qname_at_index_zero() {
201        let table = QNameTable::new();
202        assert_eq!(table.get(0), EMPTY_QNAME);
203    }
204
205    #[test]
206    fn dedup_identical_qnames() {
207        let mut table = QNameTable::new();
208        let q = make_qname(1, 2, 3, 100);
209        let idx1 = table.atomize(q);
210        let idx2 = table.atomize(q);
211        assert_eq!(idx1, idx2);
212        assert_eq!(idx1, 1); // first real entry after sentinel
213    }
214
215    #[test]
216    fn different_qnames_different_indices() {
217        let mut table = QNameTable::new();
218        let q1 = make_qname(1, 2, 3, 100);
219        let q2 = make_qname(4, 5, 6, 200);
220        let idx1 = table.atomize(q1);
221        let idx2 = table.atomize(q2);
222        assert_ne!(idx1, idx2);
223    }
224
225    #[test]
226    fn different_prefix_different_entry() {
227        let mut table = QNameTable::new();
228        let q1 = make_qname(1, 2, 3, 100);
229        let q2 = make_qname(1, 2, 99, 100); // same local/ns/hash, different prefix
230        let idx1 = table.atomize(q1);
231        let idx2 = table.atomize(q2);
232        assert_ne!(idx1, idx2, "Different prefix must produce a distinct entry");
233    }
234
235    #[test]
236    fn get_round_trip() {
237        let mut table = QNameTable::new();
238        let q = make_qname(10, 20, 30, 42);
239        let idx = table.atomize(q);
240        assert_eq!(table.get(idx), q);
241    }
242
243    #[test]
244    fn many_entries_trigger_rehash() {
245        let mut table = QNameTable::new();
246        let count = 1024;
247        let mut indices = Vec::with_capacity(count);
248
249        for i in 0..count as u32 {
250            let q = make_qname(i, i + 1000, i % 5, i.wrapping_mul(2654435761));
251            indices.push(table.atomize(q));
252        }
253
254        // Verify round-trip for every entry.
255        for i in 0..count as u32 {
256            let q = make_qname(i, i + 1000, i % 5, i.wrapping_mul(2654435761));
257            assert_eq!(table.get(indices[i as usize]), q);
258        }
259
260        // Re-atomize should return the same index.
261        for i in 0..count as u32 {
262            let q = make_qname(i, i + 1000, i % 5, i.wrapping_mul(2654435761));
263            assert_eq!(table.atomize(q), indices[i as usize]);
264        }
265    }
266
267    #[test]
268    fn dedup_ignores_qualified_name_idx() {
269        let mut table = QNameTable::new();
270        let q1 = QNameAtom {
271            local_name: NameId(1),
272            namespace_uri: NameId(2),
273            prefix: NameId(3),
274            local_name_hash: 100,
275            qualified_name_idx: 10,
276        };
277        let q2 = QNameAtom {
278            local_name: NameId(1),
279            namespace_uri: NameId(2),
280            prefix: NameId(3),
281            local_name_hash: 100,
282            qualified_name_idx: 99, // different StringStore index, same logical name
283        };
284        let idx1 = table.atomize(q1);
285        let idx2 = table.atomize(q2);
286        assert_eq!(
287            idx1, idx2,
288            "Same identity fields must dedup despite different qualified_name_idx"
289        );
290        // The stored atom keeps the first occurrence's qualified_name_idx
291        assert_eq!(table.get(idx1).qualified_name_idx, 10);
292    }
293
294    #[test]
295    fn hash_trait_excludes_prefix() {
296        let q1 = make_qname(1, 2, 3, 100);
297        let q2 = make_qname(1, 2, 99, 100); // only prefix differs
298
299        let hash1 = {
300            let mut h = DefaultHasher::new();
301            q1.hash(&mut h);
302            h.finish()
303        };
304        let hash2 = {
305            let mut h = DefaultHasher::new();
306            q2.hash(&mut h);
307            h.finish()
308        };
309
310        assert_eq!(
311            hash1, hash2,
312            "Hash impl must ignore prefix (XML namespace identity)"
313        );
314    }
315
316    #[test]
317    fn hash_trait_differs_for_different_names() {
318        let q1 = make_qname(1, 2, 3, 100);
319        let q2 = make_qname(1, 99, 3, 100); // different namespace_uri
320
321        let hash1 = {
322            let mut h = DefaultHasher::new();
323            q1.hash(&mut h);
324            h.finish()
325        };
326        let hash2 = {
327            let mut h = DefaultHasher::new();
328            q2.hash(&mut h);
329            h.finish()
330        };
331
332        assert_ne!(
333            hash1, hash2,
334            "Hash should differ when namespace_uri differs"
335        );
336    }
337}