scraper/
node.rs

//! HTML nodes.
2
3use hashbrown::{hash_map::Iter, hash_set, HashMap, HashSet};
4
5use fast_html5ever::{Attribute, LocalName, QualName};
6use selectors::attr::CaseSensitivity;
7use std::fmt;
8use std::ops::Deref;
9
10use fast_html5ever::tendril::{fmt::UTF8, Atomic, Tendril};
11
12/// Atomic StrTendril type
13pub type AtomicStrTendril = Tendril<UTF8, Atomic>;
14
15/// An HTML node.
16// `Element` is usally the most common variant and hence boxing it
17// will most likely not improve performance overall.
18#[allow(variant_size_differences)]
19#[derive(Clone)]
20pub enum Node {
21    /// The document root.
22    Document,
23
24    /// The fragment root.
25    Fragment,
26
27    /// A doctype.
28    Doctype(Doctype),
29
30    /// A comment.
31    Comment(Comment),
32
33    /// Text.
34    Text(Text),
35
36    /// An element.
37    Element(Element),
38
39    /// A processing instruction.
40    ProcessingInstruction(ProcessingInstruction),
41}
42
43impl Node {
44    /// Returns true if node is the document root.
45    pub fn is_document(&self) -> bool {
46        matches!(*self, Node::Document)
47    }
48
49    /// Returns true if node is the fragment root.
50    pub fn is_fragment(&self) -> bool {
51        matches!(*self, Node::Fragment)
52    }
53
54    /// Returns true if node is a doctype.
55    pub fn is_doctype(&self) -> bool {
56        matches!(*self, Node::Doctype(_))
57    }
58
59    /// Returns true if node is a comment.
60    pub fn is_comment(&self) -> bool {
61        matches!(*self, Node::Comment(_))
62    }
63
64    /// Returns true if node is text.
65    pub fn is_text(&self) -> bool {
66        matches!(*self, Node::Text(_))
67    }
68
69    /// Returns true if node is an element.
70    pub fn is_element(&self) -> bool {
71        matches!(*self, Node::Element(_))
72    }
73
74    /// Returns self as a doctype.
75    pub fn as_doctype(&self) -> Option<&Doctype> {
76        match *self {
77            Node::Doctype(ref d) => Some(d),
78            _ => None,
79        }
80    }
81
82    /// Returns self as a comment.
83    pub fn as_comment(&self) -> Option<&Comment> {
84        match *self {
85            Node::Comment(ref c) => Some(c),
86            _ => None,
87        }
88    }
89
90    /// Returns self as text.
91    pub fn as_text(&self) -> Option<&Text> {
92        match *self {
93            Node::Text(ref t) => Some(t),
94            _ => None,
95        }
96    }
97
98    /// Returns self as an element.
99    pub fn as_element(&self) -> Option<&Element> {
100        match *self {
101            Node::Element(ref e) => Some(e),
102            _ => None,
103        }
104    }
105
106    /// Returns self as an element.
107    pub fn as_processing_instruction(&self) -> Option<&ProcessingInstruction> {
108        match *self {
109            Node::ProcessingInstruction(ref pi) => Some(pi),
110            _ => None,
111        }
112    }
113}
114
115// Always use one line.
116impl fmt::Debug for Node {
117    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
118        match *self {
119            Node::Document => write!(f, "Document"),
120            Node::Fragment => write!(f, "Fragment"),
121            Node::Doctype(ref d) => write!(f, "Doctype({:?})", d),
122            Node::Comment(ref c) => write!(f, "Comment({:?})", c),
123            Node::Text(ref t) => write!(f, "Text({:?})", t),
124            Node::Element(ref e) => write!(f, "Element({:?})", e),
125            Node::ProcessingInstruction(ref pi) => write!(f, "ProcessingInstruction({:?})", pi),
126        }
127    }
128}
129
130/// A doctype.
131#[derive(Clone)]
132pub struct Doctype {
133    /// The doctype name.
134    pub name: AtomicStrTendril,
135
136    /// The doctype public ID.
137    pub public_id: AtomicStrTendril,
138
139    /// The doctype system ID.
140    pub system_id: AtomicStrTendril,
141}
142
143impl Doctype {
144    /// Returns the doctype name.
145    pub fn name(&self) -> &str {
146        self.name.deref()
147    }
148
149    /// Returns the doctype public ID.
150    pub fn public_id(&self) -> &str {
151        self.public_id.deref()
152    }
153
154    /// Returns the doctype system ID.
155    pub fn system_id(&self) -> &str {
156        self.system_id.deref()
157    }
158}
159
160impl fmt::Debug for Doctype {
161    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
162        write!(
163            f,
164            "<!DOCTYPE {} PUBLIC {:?} {:?}>",
165            self.name(),
166            self.public_id(),
167            self.system_id()
168        )
169    }
170}
171
172/// An HTML comment.
173#[derive(Clone)]
174pub struct Comment {
175    /// The comment text.
176    pub comment: AtomicStrTendril,
177}
178
179impl Deref for Comment {
180    type Target = str;
181
182    fn deref(&self) -> &str {
183        self.comment.deref()
184    }
185}
186
187impl fmt::Debug for Comment {
188    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
189        write!(f, "<!-- {:?} -->", self.deref())
190    }
191}
192
193/// HTML text.
194#[derive(Clone, PartialEq, Eq)]
195pub struct Text {
196    /// The text.
197    pub text: AtomicStrTendril,
198}
199
200impl Deref for Text {
201    type Target = str;
202
203    fn deref(&self) -> &str {
204        self.text.deref()
205    }
206}
207
208impl fmt::Debug for Text {
209    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
210        write!(f, "{:?}", self.deref())
211    }
212}
213
214/// A Map of attributes that doesn't preserve the order of the attributes.
215/// Please enable the `deterministic` feature for order-preserving
216/// (de)serialization.
217pub type Attributes = HashMap<QualName, AtomicStrTendril>;
218
219/// An HTML element.
220#[derive(Clone, PartialEq, Eq)]
221pub struct Element {
222    /// The element name.
223    pub name: QualName,
224
225    /// The element ID.
226    pub id: Option<LocalName>,
227
228    /// The element classes.
229    pub classes: HashSet<LocalName>,
230
231    /// The element attributes.
232    pub attrs: Attributes,
233}
234
235impl Element {
236    #[doc(hidden)]
237    pub fn new(name: QualName, attributes: Vec<Attribute>) -> Self {
238        let mut classes: HashSet<LocalName> = HashSet::new();
239        let mut attrs: HashMap<QualName, AtomicStrTendril> =
240            HashMap::with_capacity(attributes.len());
241        let mut id: Option<LocalName> = None;
242
243        for a in attributes {
244            match a.name.local.deref() {
245                "id" => {
246                    id = Some(LocalName::from(a.value.deref()));
247                }
248                "class" => {
249                    classes.extend(a.value.deref().split_whitespace().map(LocalName::from));
250                }
251                _ => (),
252            };
253            attrs.insert(a.name, a.value.into_send().into());
254        }
255
256        Element {
257            attrs,
258            name,
259            id,
260            classes,
261        }
262    }
263
264    /// Returns the element name.
265    pub fn name(&self) -> &str {
266        self.name.local.deref()
267    }
268
269    /// Returns the element ID.
270    pub fn id(&self) -> Option<&str> {
271        self.id.as_deref()
272    }
273
274    /// Returns true if element has the class.
275    pub fn has_class(&self, class: &str, case_sensitive: CaseSensitivity) -> bool {
276        self.classes()
277            .any(|c| case_sensitive.eq(c.as_bytes(), class.as_bytes()))
278    }
279
280    /// Returns an iterator over the element's classes.
281    pub fn classes(&self) -> Classes {
282        Classes {
283            inner: self.classes.iter(),
284        }
285    }
286
287    /// Returns the value of an attribute.
288    pub fn attr(&self, attr: &str) -> Option<&str> {
289        let qualname = QualName::new(None, ns!(), LocalName::from(attr));
290        self.attrs.get(&qualname).map(Deref::deref)
291    }
292
293    /// Returns an iterator over the element's attributes.
294    pub fn attrs(&self) -> Attrs {
295        Attrs {
296            inner: self.attrs.iter(),
297        }
298    }
299}
300
301/// Iterator over classes.
302#[allow(missing_debug_implementations)]
303#[derive(Clone)]
304pub struct Classes<'a> {
305    inner: hash_set::Iter<'a, LocalName>,
306}
307
308impl<'a> Iterator for Classes<'a> {
309    type Item = &'a str;
310
311    fn next(&mut self) -> Option<&'a str> {
312        self.inner.next().map(Deref::deref)
313    }
314}
315
316/// An iterator over a node's attributes.
317pub type AttributesIter<'a> = Iter<'a, QualName, AtomicStrTendril>;
318
319/// Iterator over attributes.
320#[allow(missing_debug_implementations)]
321#[derive(Clone)]
322pub struct Attrs<'a> {
323    inner: AttributesIter<'a>,
324}
325
326impl<'a> Iterator for Attrs<'a> {
327    type Item = (&'a str, &'a str);
328
329    fn next(&mut self) -> Option<(&'a str, &'a str)> {
330        self.inner.next().map(|(k, v)| (k.local.deref(), v.deref()))
331    }
332}
333
334impl fmt::Debug for Element {
335    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
336        write!(f, "<{}", self.name())?;
337        for (key, value) in self.attrs() {
338            write!(f, " {}={:?}", key, value)?;
339        }
340        write!(f, ">")
341    }
342}
343
344/// HTML Processing Instruction.
345#[derive(Debug, Clone, PartialEq, Eq)]
346pub struct ProcessingInstruction {
347    /// The PI target.
348    pub target: AtomicStrTendril,
349    /// The PI data.
350    pub data: AtomicStrTendril,
351}
352
353impl Deref for ProcessingInstruction {
354    type Target = str;
355
356    fn deref(&self) -> &str {
357        self.data.deref()
358    }
359}
360
361pub(crate) mod serializable;