scraper/
node.rs

1//! HTML nodes.
2
3use std::fmt;
4use std::ops::Deref;
5use std::slice::Iter as SliceIter;
6
7use crate::{CaseSensitivity, StrTendril};
8use html5ever::{Attribute, LocalName, QualName};
9use std::cell::OnceCell;
10
/// An HTML node.
///
/// `Document` and `Fragment` carry no data; every other variant wraps the
/// struct of the same name that holds the node's payload.
// `Element` is usually the most common variant and hence boxing it
// will most likely not improve performance overall.
#[allow(variant_size_differences)]
#[derive(Clone, PartialEq, Eq)]
pub enum Node {
    /// The document root.
    Document,

    /// The fragment root.
    Fragment,

    /// A doctype.
    Doctype(Doctype),

    /// A comment.
    Comment(Comment),

    /// Text.
    Text(Text),

    /// An element.
    Element(Element),

    /// A processing instruction.
    ProcessingInstruction(ProcessingInstruction),
}
38
39impl Node {
40    /// Returns true if node is the document root.
41    pub fn is_document(&self) -> bool {
42        matches!(*self, Node::Document)
43    }
44
45    /// Returns true if node is the fragment root.
46    pub fn is_fragment(&self) -> bool {
47        matches!(*self, Node::Fragment)
48    }
49
50    /// Returns true if node is a doctype.
51    pub fn is_doctype(&self) -> bool {
52        matches!(*self, Node::Doctype(_))
53    }
54
55    /// Returns true if node is a comment.
56    pub fn is_comment(&self) -> bool {
57        matches!(*self, Node::Comment(_))
58    }
59
60    /// Returns true if node is text.
61    pub fn is_text(&self) -> bool {
62        matches!(*self, Node::Text(_))
63    }
64
65    /// Returns true if node is an element.
66    pub fn is_element(&self) -> bool {
67        matches!(*self, Node::Element(_))
68    }
69
70    /// Returns self as a doctype.
71    pub fn as_doctype(&self) -> Option<&Doctype> {
72        match *self {
73            Node::Doctype(ref d) => Some(d),
74            _ => None,
75        }
76    }
77
78    /// Returns self as a comment.
79    pub fn as_comment(&self) -> Option<&Comment> {
80        match *self {
81            Node::Comment(ref c) => Some(c),
82            _ => None,
83        }
84    }
85
86    /// Returns self as text.
87    pub fn as_text(&self) -> Option<&Text> {
88        match *self {
89            Node::Text(ref t) => Some(t),
90            _ => None,
91        }
92    }
93
94    /// Returns self as an element.
95    pub fn as_element(&self) -> Option<&Element> {
96        match *self {
97            Node::Element(ref e) => Some(e),
98            _ => None,
99        }
100    }
101
102    /// Returns self as an element.
103    pub fn as_processing_instruction(&self) -> Option<&ProcessingInstruction> {
104        match *self {
105            Node::ProcessingInstruction(ref pi) => Some(pi),
106            _ => None,
107        }
108    }
109}
110
111// Always use one line.
112impl fmt::Debug for Node {
113    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
114        match *self {
115            Node::Document => write!(f, "Document"),
116            Node::Fragment => write!(f, "Fragment"),
117            Node::Doctype(ref d) => write!(f, "Doctype({d:?})"),
118            Node::Comment(ref c) => write!(f, "Comment({c:?})"),
119            Node::Text(ref t) => write!(f, "Text({t:?})"),
120            Node::Element(ref e) => write!(f, "Element({e:?})"),
121            Node::ProcessingInstruction(ref pi) => write!(f, "ProcessingInstruction({pi:?})"),
122        }
123    }
124}
125
126/// A doctype.
127#[derive(Clone, PartialEq, Eq)]
128pub struct Doctype {
129    /// The doctype name.
130    pub name: StrTendril,
131
132    /// The doctype public ID.
133    pub public_id: StrTendril,
134
135    /// The doctype system ID.
136    pub system_id: StrTendril,
137}
138
139impl Doctype {
140    /// Returns the doctype name.
141    pub fn name(&self) -> &str {
142        self.name.deref()
143    }
144
145    /// Returns the doctype public ID.
146    pub fn public_id(&self) -> &str {
147        self.public_id.deref()
148    }
149
150    /// Returns the doctype system ID.
151    pub fn system_id(&self) -> &str {
152        self.system_id.deref()
153    }
154}
155
156impl fmt::Debug for Doctype {
157    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
158        write!(
159            f,
160            "<!DOCTYPE {} PUBLIC {:?} {:?}>",
161            self.name(),
162            self.public_id(),
163            self.system_id()
164        )
165    }
166}
167
168/// An HTML comment.
169#[derive(Clone, PartialEq, Eq)]
170pub struct Comment {
171    /// The comment text.
172    pub comment: StrTendril,
173}
174
175impl Deref for Comment {
176    type Target = str;
177
178    fn deref(&self) -> &str {
179        self.comment.deref()
180    }
181}
182
183impl fmt::Debug for Comment {
184    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
185        write!(f, "<!-- {:?} -->", self.deref())
186    }
187}
188
189/// HTML text.
190#[derive(Clone, PartialEq, Eq)]
191pub struct Text {
192    /// The text.
193    pub text: StrTendril,
194}
195
196impl Deref for Text {
197    type Target = str;
198
199    fn deref(&self) -> &str {
200        self.text.deref()
201    }
202}
203
204impl fmt::Debug for Text {
205    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
206        write!(f, "{:?}", self.deref())
207    }
208}
209
/// A map of attributes that preserves the order of the attributes.
#[cfg(feature = "deterministic")]
pub type Attributes = indexmap::IndexMap<QualName, StrTendril>;

/// A list of attribute name/value pairs that doesn't preserve the original
/// order of the attributes: `Element::new` sorts the pairs by name so that
/// `Element::attr` can binary-search them.
/// Please enable the `deterministic` feature for order-preserving
/// (de)serialization.
#[cfg(not(feature = "deterministic"))]
pub type Attributes = Vec<(QualName, StrTendril)>;
219
220/// An HTML element.
221#[derive(Clone, PartialEq, Eq)]
222pub struct Element {
223    /// The element name.
224    pub name: QualName,
225
226    /// The element attributes.
227    pub attrs: Attributes,
228
229    id: OnceCell<Option<StrTendril>>,
230
231    classes: OnceCell<Box<[LocalName]>>,
232}
233
234impl Element {
235    #[doc(hidden)]
236    pub fn new(name: QualName, attributes: Vec<Attribute>) -> Self {
237        #[allow(unused_mut)]
238        let mut attrs = attributes
239            .into_iter()
240            .map(|attr| (attr.name, crate::tendril_util::make(attr.value)))
241            .collect::<Attributes>();
242
243        #[cfg(not(feature = "deterministic"))]
244        attrs.sort_unstable_by(|lhs, rhs| lhs.0.cmp(&rhs.0));
245
246        Element {
247            attrs,
248            name,
249            id: OnceCell::new(),
250            classes: OnceCell::new(),
251        }
252    }
253
254    /// Returns the element name.
255    pub fn name(&self) -> &str {
256        self.name.local.deref()
257    }
258
259    /// Returns the element ID.
260    pub fn id(&self) -> Option<&str> {
261        self.id
262            .get_or_init(|| {
263                self.attrs
264                    .iter()
265                    .find(|(name, _)| name.local.as_ref() == "id")
266                    .map(|(_, value)| value.clone())
267            })
268            .as_deref()
269    }
270
271    /// Returns true if element has the class.
272    pub fn has_class(&self, class: &str, case_sensitive: CaseSensitivity) -> bool {
273        self.classes()
274            .any(|c| case_sensitive.eq(c.as_bytes(), class.as_bytes()))
275    }
276
277    /// Returns an iterator over the element's classes.
278    pub fn classes(&self) -> Classes<'_> {
279        let classes = self.classes.get_or_init(|| {
280            let mut classes = self
281                .attrs
282                .iter()
283                .filter(|(name, _)| name.local.as_ref() == "class")
284                .flat_map(|(_, value)| value.split_ascii_whitespace().map(LocalName::from))
285                .collect::<Vec<_>>();
286
287            classes.sort_unstable();
288            classes.dedup();
289
290            classes.into_boxed_slice()
291        });
292
293        Classes {
294            inner: classes.iter(),
295        }
296    }
297
298    /// Returns the value of an attribute.
299    pub fn attr(&self, attr: &str) -> Option<&str> {
300        let qualname = QualName::new(None, ns!(), LocalName::from(attr));
301
302        #[cfg(not(feature = "deterministic"))]
303        let value = self
304            .attrs
305            .binary_search_by(|attr| attr.0.cmp(&qualname))
306            .ok()
307            .map(|idx| &*self.attrs[idx].1);
308
309        #[cfg(feature = "deterministic")]
310        let value = self.attrs.get(&qualname).map(Deref::deref);
311
312        value
313    }
314
315    /// Returns an iterator over the element's attributes.
316    pub fn attrs(&self) -> Attrs<'_> {
317        Attrs {
318            inner: self.attrs.iter(),
319        }
320    }
321}
322
323/// Iterator over classes.
324#[allow(missing_debug_implementations)]
325#[derive(Clone)]
326pub struct Classes<'a> {
327    inner: SliceIter<'a, LocalName>,
328}
329
330impl<'a> Iterator for Classes<'a> {
331    type Item = &'a str;
332
333    fn next(&mut self) -> Option<&'a str> {
334        self.inner.next().map(Deref::deref)
335    }
336}
337
/// An iterator over a node's attributes (entries of the order-preserving
/// `IndexMap` used when the `deterministic` feature is enabled).
#[cfg(feature = "deterministic")]
pub type AttributesIter<'a> = indexmap::map::Iter<'a, QualName, StrTendril>;

/// An iterator over a node's attributes (entries of the `Vec` of
/// name/value pairs used when the `deterministic` feature is disabled).
#[cfg(not(feature = "deterministic"))]
pub type AttributesIter<'a> = SliceIter<'a, (QualName, StrTendril)>;
345
346/// Iterator over attributes.
347#[allow(missing_debug_implementations)]
348#[derive(Clone)]
349pub struct Attrs<'a> {
350    inner: AttributesIter<'a>,
351}
352
353impl<'a> Iterator for Attrs<'a> {
354    type Item = (&'a str, &'a str);
355
356    fn next(&mut self) -> Option<(&'a str, &'a str)> {
357        self.inner.next().map(|(k, v)| (k.local.deref(), v.deref()))
358    }
359}
360
361impl fmt::Debug for Element {
362    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
363        write!(f, "<{}", self.name())?;
364        for (key, value) in self.attrs() {
365            write!(f, " {key}={value:?}")?;
366        }
367        write!(f, ">")
368    }
369}
370
371/// HTML Processing Instruction.
372#[derive(Debug, Clone, PartialEq, Eq)]
373pub struct ProcessingInstruction {
374    /// The PI target.
375    pub target: StrTendril,
376    /// The PI data.
377    pub data: StrTendril,
378}
379
380impl Deref for ProcessingInstruction {
381    type Target = str;
382
383    fn deref(&self) -> &str {
384        self.data.deref()
385    }
386}
387
388pub(crate) mod serializable;