sws_scraper/
node.rs

1//! HTML nodes.
2
3use std::collections::{hash_map, hash_set};
4use std::collections::{HashMap, HashSet};
5use std::fmt;
6use std::ops::Deref;
7
8use html5ever::tendril::StrTendril;
9use html5ever::{Attribute, LocalName, QualName};
10use selectors::attr::CaseSensitivity;
11
/// An HTML node.
///
/// Each variant represents one kind of node. `Document` and `Fragment` carry
/// no data; every other variant wraps a dedicated payload type.
#[derive(Clone, PartialEq, Eq)]
pub enum Node {
    /// The document root.
    Document,

    /// The fragment root.
    Fragment,

    /// A doctype, carrying its name, public ID and system ID.
    Doctype(Doctype),

    /// A comment, carrying the comment text.
    Comment(Comment),

    /// Text, carrying the text content.
    Text(Text),

    /// An element, carrying its name, ID, classes and attributes.
    Element(Element),

    /// A processing instruction, carrying its target and data.
    ProcessingInstruction(ProcessingInstruction),
}
36
37impl Node {
38    /// Returns true if node is the document root.
39    pub fn is_document(&self) -> bool {
40        matches!(*self, Node::Document)
41    }
42
43    /// Returns true if node is the fragment root.
44    pub fn is_fragment(&self) -> bool {
45        matches!(*self, Node::Fragment)
46    }
47
48    /// Returns true if node is a doctype.
49    pub fn is_doctype(&self) -> bool {
50        matches!(*self, Node::Doctype(_))
51    }
52
53    /// Returns true if node is a comment.
54    pub fn is_comment(&self) -> bool {
55        matches!(*self, Node::Comment(_))
56    }
57
58    /// Returns true if node is text.
59    pub fn is_text(&self) -> bool {
60        matches!(*self, Node::Text(_))
61    }
62
63    /// Returns true if node is an element.
64    pub fn is_element(&self) -> bool {
65        matches!(*self, Node::Element(_))
66    }
67
68    /// Returns self as a doctype.
69    pub fn as_doctype(&self) -> Option<&Doctype> {
70        match *self {
71            Node::Doctype(ref d) => Some(d),
72            _ => None,
73        }
74    }
75
76    /// Returns self as a comment.
77    pub fn as_comment(&self) -> Option<&Comment> {
78        match *self {
79            Node::Comment(ref c) => Some(c),
80            _ => None,
81        }
82    }
83
84    /// Returns self as text.
85    pub fn as_text(&self) -> Option<&Text> {
86        match *self {
87            Node::Text(ref t) => Some(t),
88            _ => None,
89        }
90    }
91
92    /// Returns self as an element.
93    pub fn as_element(&self) -> Option<&Element> {
94        match *self {
95            Node::Element(ref e) => Some(e),
96            _ => None,
97        }
98    }
99
100    /// Returns self as an element.
101    pub fn as_processing_instruction(&self) -> Option<&ProcessingInstruction> {
102        match *self {
103            Node::ProcessingInstruction(ref pi) => Some(pi),
104            _ => None,
105        }
106    }
107}
108
109// Always use one line.
110impl fmt::Debug for Node {
111    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
112        match *self {
113            Node::Document => write!(f, "Document"),
114            Node::Fragment => write!(f, "Fragment"),
115            Node::Doctype(ref d) => write!(f, "Doctype({:?})", d),
116            Node::Comment(ref c) => write!(f, "Comment({:?})", c),
117            Node::Text(ref t) => write!(f, "Text({:?})", t),
118            Node::Element(ref e) => write!(f, "Element({:?})", e),
119            Node::ProcessingInstruction(ref pi) => write!(f, "ProcessingInstruction({:?})", pi),
120        }
121    }
122}
123
/// A doctype.
///
/// Holds the three string components of a doctype declaration. The fields are
/// public; the same values are also available as `&str` through the accessor
/// methods of the same names.
#[derive(Clone, PartialEq, Eq)]
pub struct Doctype {
    /// The doctype name.
    pub name: StrTendril,

    /// The doctype public ID.
    pub public_id: StrTendril,

    /// The doctype system ID.
    pub system_id: StrTendril,
}
136
137impl Doctype {
138    /// Returns the doctype name.
139    pub fn name(&self) -> &str {
140        self.name.deref()
141    }
142
143    /// Returns the doctype public ID.
144    pub fn public_id(&self) -> &str {
145        self.public_id.deref()
146    }
147
148    /// Returns the doctype system ID.
149    pub fn system_id(&self) -> &str {
150        self.system_id.deref()
151    }
152}
153
154impl fmt::Debug for Doctype {
155    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
156        write!(
157            f,
158            "<!DOCTYPE {} PUBLIC {:?} {:?}>",
159            self.name(),
160            self.public_id(),
161            self.system_id()
162        )
163    }
164}
165
/// An HTML comment.
///
/// Wraps the comment's text; the type also dereferences to `str`, yielding
/// that same text.
#[derive(Clone, PartialEq, Eq)]
pub struct Comment {
    /// The comment text.
    pub comment: StrTendril,
}
172
173impl Deref for Comment {
174    type Target = str;
175
176    fn deref(&self) -> &str {
177        self.comment.deref()
178    }
179}
180
181impl fmt::Debug for Comment {
182    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
183        write!(f, "<!-- {:?} -->", self.deref())
184    }
185}
186
/// HTML text.
///
/// Wraps the text content; the type also dereferences to `str`, yielding
/// that same content.
#[derive(Clone, PartialEq, Eq)]
pub struct Text {
    /// The text.
    pub text: StrTendril,
}
193
194impl Deref for Text {
195    type Target = str;
196
197    fn deref(&self) -> &str {
198        self.text.deref()
199    }
200}
201
202impl fmt::Debug for Text {
203    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
204        write!(f, "{:?}", self.deref())
205    }
206}
207
/// A map of attributes that doesn't preserve the order of the attributes.
///
/// Keys are the attributes' qualified names; values are the attribute values.
pub type Attributes = HashMap<QualName, StrTendril>;
210
/// An HTML element.
///
/// When built through `Element::new`, `id` and `classes` are derived from the
/// `id` and `class` attributes, and all attributes are also stored in `attrs`.
#[derive(Clone, PartialEq, Eq)]
pub struct Element {
    /// The element name.
    pub name: QualName,

    /// The element ID, if any.
    pub id: Option<LocalName>,

    /// The element classes (unordered, without duplicates).
    pub classes: HashSet<LocalName>,

    /// The element attributes.
    pub attrs: Attributes,
}
226
227impl Element {
228    #[doc(hidden)]
229    pub fn new(name: QualName, attrs: Vec<Attribute>) -> Self {
230        let id = attrs
231            .iter()
232            .find(|a| a.name.local.deref() == "id")
233            .map(|a| LocalName::from(a.value.deref()));
234
235        let classes: HashSet<LocalName> = attrs
236            .iter()
237            .find(|a| a.name.local.deref() == "class")
238            .map_or(HashSet::new(), |a| {
239                a.value
240                    .deref()
241                    .split_whitespace()
242                    .map(LocalName::from)
243                    .collect()
244            });
245
246        Element {
247            attrs: attrs.into_iter().map(|a| (a.name, a.value)).collect(),
248            name,
249            id,
250            classes,
251        }
252    }
253
254    /// Returns the element name.
255    pub fn name(&self) -> &str {
256        self.name.local.deref()
257    }
258
259    /// Returns the element ID.
260    pub fn id(&self) -> Option<&str> {
261        self.id.as_deref()
262    }
263
264    /// Returns true if element has the class.
265    pub fn has_class(&self, class: &str, case_sensitive: CaseSensitivity) -> bool {
266        self.classes()
267            .any(|c| case_sensitive.eq(c.as_bytes(), class.as_bytes()))
268    }
269
270    /// Returns an iterator over the element's classes.
271    pub fn classes(&self) -> Classes {
272        Classes {
273            inner: self.classes.iter(),
274        }
275    }
276
277    /// Returns the value of an attribute.
278    pub fn attr(&self, attr: &str) -> Option<&str> {
279        let qualname = QualName::new(None, ns!(), LocalName::from(attr));
280        self.attrs.get(&qualname).map(Deref::deref)
281    }
282
283    /// Returns an iterator over the element's attributes.
284    pub fn attrs(&self) -> Attrs {
285        Attrs {
286            inner: self.attrs.iter(),
287        }
288    }
289}
290
/// Iterator over classes.
///
/// Wraps a borrowing iterator over a `HashSet<LocalName>`.
// NOTE(review): the missing-Debug lint is silenced here rather than deriving
// `Debug` — confirm whether a derive would be acceptable instead.
#[allow(missing_debug_implementations)]
#[derive(Clone)]
pub struct Classes<'a> {
    // Underlying hash-set iterator over the class names.
    inner: hash_set::Iter<'a, LocalName>,
}
297
298impl<'a> Iterator for Classes<'a> {
299    type Item = &'a str;
300
301    fn next(&mut self) -> Option<&'a str> {
302        self.inner.next().map(Deref::deref)
303    }
304}
305
/// An iterator over a node's attributes.
///
/// Borrowing iterator over the attribute map, yielding
/// `(&QualName, &StrTendril)` pairs.
pub type AttributesIter<'a> = hash_map::Iter<'a, QualName, StrTendril>;
308
/// Iterator over attributes.
///
/// Wraps an [`AttributesIter`] over the element's attribute map.
// NOTE(review): the missing-Debug lint is silenced here rather than deriving
// `Debug` — confirm whether a derive would be acceptable instead.
#[allow(missing_debug_implementations)]
#[derive(Clone)]
pub struct Attrs<'a> {
    // Underlying hash-map iterator over (qualified name, value) entries.
    inner: AttributesIter<'a>,
}
315
316impl<'a> Iterator for Attrs<'a> {
317    type Item = (&'a str, &'a str);
318
319    fn next(&mut self) -> Option<(&'a str, &'a str)> {
320        self.inner.next().map(|(k, v)| (k.local.deref(), v.deref()))
321    }
322}
323
324impl fmt::Debug for Element {
325    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
326        write!(f, "<{}", self.name())?;
327        for (key, value) in self.attrs() {
328            write!(f, " {}={:?}", key, value)?;
329        }
330        write!(f, ">")
331    }
332}
333
/// HTML Processing Instruction.
///
/// Holds the instruction's target and data. The type also dereferences to
/// `str`, yielding the `data` field (not the target).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ProcessingInstruction {
    /// The PI target.
    pub target: StrTendril,
    /// The PI data.
    pub data: StrTendril,
}
342
343impl Deref for ProcessingInstruction {
344    type Target = str;
345
346    fn deref(&self) -> &str {
347        self.data.deref()
348    }
349}