accessibility_scraper/
node.rs

1//! HTML nodes.
2
3#[cfg(not(feature = "deterministic"))]
4use ahash::AHashMap as HashMap;
5#[cfg(not(feature = "deterministic"))]
6use std::collections::hash_map;
7use std::fmt;
8use std::ops::Deref;
9use std::slice::Iter as SliceIter;
10
11use crate::{selector::CssLocalName, selectors::attr::CaseSensitivity, StrTendril};
12use fast_html5ever::{Attribute, LocalName, QualName};
13use once_cell::unsync::OnceCell;
14
15/// An HTML node.
16// `Element` is usally the most common variant and hence boxing it
17// will most likely not improve performance overall.
18#[allow(variant_size_differences)]
19#[derive(Clone, PartialEq, Eq)]
20pub enum Node {
21    /// The document root.
22    Document,
23
24    /// The fragment root.
25    Fragment,
26
27    /// A doctype.
28    Doctype(Doctype),
29
30    /// A comment.
31    Comment(Comment),
32
33    /// Text.
34    Text(Text),
35
36    /// An element.
37    Element(Element),
38
39    /// A processing instruction.
40    ProcessingInstruction(ProcessingInstruction),
41}
42
43impl Node {
44    /// Returns true if node is the document root.
45    pub fn is_document(&self) -> bool {
46        matches!(*self, Node::Document)
47    }
48
49    /// Returns true if node is the fragment root.
50    pub fn is_fragment(&self) -> bool {
51        matches!(*self, Node::Fragment)
52    }
53
54    /// Returns true if node is a doctype.
55    pub fn is_doctype(&self) -> bool {
56        matches!(*self, Node::Doctype(_))
57    }
58
59    /// Returns true if node is a comment.
60    pub fn is_comment(&self) -> bool {
61        matches!(*self, Node::Comment(_))
62    }
63
64    /// Returns true if node is text.
65    pub fn is_text(&self) -> bool {
66        matches!(*self, Node::Text(_))
67    }
68
69    /// Returns true if node is an element.
70    pub fn is_element(&self) -> bool {
71        matches!(*self, Node::Element(_))
72    }
73
74    /// Returns self as a doctype.
75    pub fn as_doctype(&self) -> Option<&Doctype> {
76        match *self {
77            Node::Doctype(ref d) => Some(d),
78            _ => None,
79        }
80    }
81
82    /// Returns self as a comment.
83    pub fn as_comment(&self) -> Option<&Comment> {
84        match *self {
85            Node::Comment(ref c) => Some(c),
86            _ => None,
87        }
88    }
89
90    /// Returns self as text.
91    pub fn as_text(&self) -> Option<&Text> {
92        match *self {
93            Node::Text(ref t) => Some(t),
94            _ => None,
95        }
96    }
97
98    /// Returns self as an element.
99    pub fn as_element(&self) -> Option<&Element> {
100        match *self {
101            Node::Element(ref e) => Some(e),
102            _ => None,
103        }
104    }
105
106    /// Returns self as an element.
107    pub fn as_processing_instruction(&self) -> Option<&ProcessingInstruction> {
108        match *self {
109            Node::ProcessingInstruction(ref pi) => Some(pi),
110            _ => None,
111        }
112    }
113}
114
115// Always use one line.
116impl fmt::Debug for Node {
117    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
118        match *self {
119            Node::Document => write!(f, "Document"),
120            Node::Fragment => write!(f, "Fragment"),
121            Node::Doctype(ref d) => write!(f, "Doctype({:?})", d),
122            Node::Comment(ref c) => write!(f, "Comment({:?})", c),
123            Node::Text(ref t) => write!(f, "Text({:?})", t),
124            Node::Element(ref e) => write!(f, "Element({:?})", e),
125            Node::ProcessingInstruction(ref pi) => write!(f, "ProcessingInstruction({:?})", pi),
126        }
127    }
128}
129
130/// A doctype.
131#[derive(Clone, PartialEq, Eq)]
132pub struct Doctype {
133    /// The doctype name.
134    pub name: StrTendril,
135
136    /// The doctype public ID.
137    pub public_id: StrTendril,
138
139    /// The doctype system ID.
140    pub system_id: StrTendril,
141}
142
143impl Doctype {
144    /// Returns the doctype name.
145    pub fn name(&self) -> &str {
146        self.name.deref()
147    }
148
149    /// Returns the doctype public ID.
150    pub fn public_id(&self) -> &str {
151        self.public_id.deref()
152    }
153
154    /// Returns the doctype system ID.
155    pub fn system_id(&self) -> &str {
156        self.system_id.deref()
157    }
158}
159
160impl fmt::Debug for Doctype {
161    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
162        write!(
163            f,
164            "<!DOCTYPE {} PUBLIC {:?} {:?}>",
165            self.name(),
166            self.public_id(),
167            self.system_id()
168        )
169    }
170}
171
172/// An HTML comment.
173#[derive(Clone, PartialEq, Eq)]
174pub struct Comment {
175    /// The comment text.
176    pub comment: StrTendril,
177}
178
179impl Deref for Comment {
180    type Target = str;
181
182    fn deref(&self) -> &str {
183        self.comment.deref()
184    }
185}
186
187impl fmt::Debug for Comment {
188    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
189        write!(f, "<!-- {:?} -->", self.deref())
190    }
191}
192
193/// HTML text.
194#[derive(Clone, PartialEq, Eq)]
195pub struct Text {
196    /// The text.
197    pub text: StrTendril,
198}
199
200impl Deref for Text {
201    type Target = str;
202
203    fn deref(&self) -> &str {
204        self.text.deref()
205    }
206}
207
208impl fmt::Debug for Text {
209    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
210        write!(f, "{:?}", self.deref())
211    }
212}
213
/// A map of attributes that preserves the order of the attributes.
///
/// This definition is selected when the `deterministic` feature is enabled.
#[cfg(feature = "deterministic")]
pub type Attributes = indexmap::IndexMap<QualName, StrTendril>;

/// A map of attributes that doesn't preserve the order of the attributes.
/// Please enable the `deterministic` feature for order-preserving
/// (de)serialization.
///
/// This definition is selected when the `deterministic` feature is disabled.
#[cfg(not(feature = "deterministic"))]
pub type Attributes = HashMap<QualName, StrTendril>;
223
224/// An HTML element.
225#[derive(Clone, PartialEq, Eq)]
226pub struct Element {
227    /// The element name.
228    pub name: QualName,
229
230    /// The element attributes.
231    pub attrs: Attributes,
232
233    id: OnceCell<Option<StrTendril>>,
234
235    classes: OnceCell<Vec<LocalName>>,
236
237    /// locale name
238    pub local_name: CssLocalName,
239}
240
241impl Element {
242    #[doc(hidden)]
243    pub fn new(name: QualName, attributes: Vec<Attribute>) -> Self {
244        let local_name = CssLocalName(name.clone().local);
245        let attrs = attributes
246            .into_iter()
247            .map(|a| (a.name, crate::tendril_util::make(a.value)))
248            .collect();
249
250        Element {
251            local_name,
252            attrs,
253            name,
254            id: OnceCell::new(),
255            classes: OnceCell::new(),
256        }
257    }
258
259    /// Returns the element name.
260    pub fn name(&self) -> &str {
261        self.name.local.deref()
262    }
263
264    /// Returns the element ID.
265    pub fn id(&self) -> Option<&str> {
266        self.id
267            .get_or_init(|| {
268                self.attrs
269                    .iter()
270                    .find(|(name, _)| name.local.as_ref() == "id")
271                    .map(|(_, value)| value.clone())
272            })
273            .as_deref()
274    }
275
276    /// Returns true if element has the class.
277    pub fn has_class(&self, class: &str, case_sensitive: CaseSensitivity) -> bool {
278        self.classes()
279            .any(|c| case_sensitive.eq(c.as_bytes(), class.as_bytes()))
280    }
281
282    /// Returns an iterator over the element's classes.
283    pub fn classes(&self) -> Classes {
284        let classes = self.classes.get_or_init(|| {
285            let mut classes: Vec<LocalName> = self
286                .attrs
287                .iter()
288                .filter(|(name, _)| name.local.as_ref() == "class")
289                .flat_map(|(_, value)| value.split_whitespace().map(LocalName::from))
290                .collect();
291
292            classes.sort_unstable();
293            classes.dedup();
294
295            classes
296        });
297
298        Classes {
299            inner: classes.iter(),
300        }
301    }
302
303    /// Returns the value of an attribute.
304    pub fn attr(&self, attr: &str) -> Option<&str> {
305        let qualname = QualName::new(None, ns!(), LocalName::from(attr));
306        self.attrs.get(&qualname).map(Deref::deref)
307    }
308
309    /// Returns an iterator over the element's attributes.
310    pub fn attrs(&self) -> Attrs {
311        Attrs {
312            inner: self.attrs.iter(),
313        }
314    }
315}
316
317/// Iterator over classes.
318#[allow(missing_debug_implementations)]
319#[derive(Clone)]
320pub struct Classes<'a> {
321    inner: SliceIter<'a, LocalName>,
322}
323
324impl<'a> Iterator for Classes<'a> {
325    type Item = &'a str;
326
327    fn next(&mut self) -> Option<&'a str> {
328        self.inner.next().map(Deref::deref)
329    }
330}
331
/// An iterator over a node's attributes.
///
/// This definition is selected when the `deterministic` feature is enabled.
#[cfg(feature = "deterministic")]
pub type AttributesIter<'a> = indexmap::map::Iter<'a, QualName, StrTendril>;

/// An iterator over a node's attributes.
///
/// This definition is selected when the `deterministic` feature is disabled.
#[cfg(not(feature = "deterministic"))]
pub type AttributesIter<'a> = hash_map::Iter<'a, QualName, StrTendril>;
339
340/// Iterator over attributes.
341#[allow(missing_debug_implementations)]
342#[derive(Clone)]
343pub struct Attrs<'a> {
344    inner: AttributesIter<'a>,
345}
346
347impl<'a> Iterator for Attrs<'a> {
348    type Item = (&'a str, &'a str);
349
350    fn next(&mut self) -> Option<(&'a str, &'a str)> {
351        self.inner.next().map(|(k, v)| (k.local.deref(), v.deref()))
352    }
353}
354
355impl fmt::Debug for Element {
356    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
357        write!(f, "<{}", self.name())?;
358        for (key, value) in self.attrs() {
359            write!(f, " {}={:?}", key, value)?;
360        }
361        write!(f, ">")
362    }
363}
364
365/// HTML Processing Instruction.
366#[derive(Debug, Clone, PartialEq, Eq)]
367pub struct ProcessingInstruction {
368    /// The PI target.
369    pub target: StrTendril,
370    /// The PI data.
371    pub data: StrTendril,
372}
373
374impl Deref for ProcessingInstruction {
375    type Target = str;
376
377    fn deref(&self) -> &str {
378        self.data.deref()
379    }
380}
381
382pub(crate) mod serializable;