scrape_core/dom/
index.rs

1//! Document indexing for fast element lookup.
2
3use std::collections::HashMap;
4
5use super::NodeId;
6
7/// Index for fast element lookup by ID and class.
8#[derive(Debug, Default, Clone)]
9pub struct DocumentIndex {
10    by_id: HashMap<String, NodeId>,
11    by_class: HashMap<String, Vec<NodeId>>,
12}
13
14impl DocumentIndex {
15    /// Creates a new empty index.
16    #[must_use]
17    pub fn new() -> Self {
18        Self::default()
19    }
20
21    /// Registers an element's ID.
22    ///
23    /// Per HTML spec, first occurrence wins if duplicate IDs exist.
24    pub fn register_id(&mut self, id: String, node_id: NodeId) {
25        self.by_id.entry(id).or_insert(node_id);
26    }
27
28    /// Registers an element's classes.
29    pub fn register_classes(&mut self, classes: &str, node_id: NodeId) {
30        for class in classes.split_whitespace() {
31            self.by_class.entry(class.to_string()).or_default().push(node_id);
32        }
33    }
34
35    /// Looks up an element by ID.
36    #[must_use]
37    pub fn get_by_id(&self, id: &str) -> Option<NodeId> {
38        self.by_id.get(id).copied()
39    }
40
41    /// Looks up elements by class.
42    #[must_use]
43    pub fn get_by_class(&self, class: &str) -> &[NodeId] {
44        self.by_class.get(class).map_or(&[], Vec::as_slice)
45    }
46
47    /// Returns whether the index is empty.
48    #[must_use]
49    pub fn is_empty(&self) -> bool {
50        self.by_id.is_empty() && self.by_class.is_empty()
51    }
52
53    /// Returns the number of indexed IDs.
54    #[must_use]
55    pub fn id_count(&self) -> usize {
56        self.by_id.len()
57    }
58
59    /// Returns the number of indexed classes.
60    #[must_use]
61    pub fn class_count(&self) -> usize {
62        self.by_class.len()
63    }
64}
65
#[cfg(test)]
mod tests {
    use super::*;

    // A second registration under the same ID must not replace the first.
    #[test]
    fn test_index_register_id() {
        let mut index = DocumentIndex::new();
        let node1 = NodeId::new(1);
        let node2 = NodeId::new(2);

        assert_eq!(index.get_by_id("main"), None);

        index.register_id("main".to_string(), node1);
        assert_eq!(index.get_by_id("main"), Some(node1));

        index.register_id("main".to_string(), node2);
        assert_eq!(index.get_by_id("main"), Some(node1));
    }

    // Each whitespace-separated token becomes its own class entry, and nodes
    // sharing a class are returned in registration order.
    #[test]
    fn test_index_register_classes() {
        let mut index = DocumentIndex::new();
        let node1 = NodeId::new(1);
        let node2 = NodeId::new(2);

        index.register_classes("foo bar", node1);
        index.register_classes("bar baz", node2);

        assert_eq!(index.get_by_class("foo"), &[node1]);
        assert_eq!(index.get_by_class("bar"), &[node1, node2]);
        assert_eq!(index.get_by_class("baz"), &[node2]);
        assert_eq!(index.get_by_class("qux"), &[]);
    }

    // The index is empty only while it holds neither IDs nor classes.
    #[test]
    fn test_index_empty() {
        let index = DocumentIndex::new();
        assert!(index.is_empty());

        let mut index = DocumentIndex::new();
        index.register_id("test".to_string(), NodeId::new(1));
        assert!(!index.is_empty());

        let mut index = DocumentIndex::new();
        index.register_classes("only-class", NodeId::new(1));
        assert!(!index.is_empty());
    }

    #[test]
    fn test_index_counts() {
        let mut index = DocumentIndex::new();
        assert_eq!(index.id_count(), 0);
        assert_eq!(index.class_count(), 0);

        index.register_id("id1".to_string(), NodeId::new(1));
        index.register_id("id2".to_string(), NodeId::new(2));
        index.register_classes("class1 class2", NodeId::new(3));

        assert_eq!(index.id_count(), 2);
        assert_eq!(index.class_count(), 2);
    }

    #[test]
    fn test_index_empty_class_string() {
        let mut index = DocumentIndex::new();
        let node = NodeId::new(1);

        index.register_classes("", node);
        assert_eq!(index.class_count(), 0);
        assert!(index.is_empty());
    }

    #[test]
    fn test_index_whitespace_only_class() {
        let mut index = DocumentIndex::new();
        let node = NodeId::new(1);

        index.register_classes("   ", node);
        index.register_classes("\t\n", node);
        assert_eq!(index.class_count(), 0);
        assert!(index.is_empty());
    }

    // 10_000 unique classes plus the single class every node shares.
    #[test]
    fn test_index_large_scale() {
        let mut index = DocumentIndex::new();

        for i in 0..10_000 {
            index.register_id(format!("id-{i}"), NodeId::new(i));
            index.register_classes(&format!("class-{i} shared"), NodeId::new(i));
        }

        assert_eq!(index.id_count(), 10_000);
        assert_eq!(index.class_count(), 10_001);

        assert_eq!(index.get_by_id("id-5000"), Some(NodeId::new(5000)));
        assert_eq!(index.get_by_class("class-5000"), &[NodeId::new(5000)]);
        assert_eq!(index.get_by_class("shared").len(), 10_000);
    }

    // Non-ASCII identifiers (CJK text, emoji) must round-trip unchanged.
    #[test]
    fn test_index_unicode_ids_and_classes() {
        let mut index = DocumentIndex::new();
        let node1 = NodeId::new(1);
        let node2 = NodeId::new(2);

        index.register_id("日本語".to_string(), node1);
        index.register_classes("emoji-😀 中文", node2);

        assert_eq!(index.get_by_id("日本語"), Some(node1));
        assert_eq!(index.get_by_class("emoji-😀"), &[node2]);
        assert_eq!(index.get_by_class("中文"), &[node2]);
    }

    // Punctuation inside a token is part of the name; only whitespace splits.
    #[test]
    fn test_index_special_characters() {
        let mut index = DocumentIndex::new();
        let node = NodeId::new(1);

        index.register_id("id-with-dash_and_underscore123".to_string(), node);
        index.register_classes("class:with:colons foo.bar", node);

        assert_eq!(index.get_by_id("id-with-dash_and_underscore123"), Some(node));
        assert_eq!(index.get_by_class("class:with:colons"), &[node]);
        assert_eq!(index.get_by_class("foo.bar"), &[node]);
    }
}