Skip to main content

scah/sax/element/
builder.rs

1use super::tokenizer::ElementAttributeToken;
2use crate::utils::Reader;
3
/// A single HTML element attribute, stored as a key/value pair.
///
/// Both `key` and `value` are zero-copy `&str` references into the
/// original HTML source.
///
/// Equality and hashing are derived field by field, so two attributes are
/// equal exactly when their keys and their (optional) values are equal.
///
/// # Example
///
/// ```rust
/// use scah::{Query, Save, parse};
///
/// let html = r#"<a href="https://example.com" target="_blank">Link</a>"#;
/// let queries = &[Query::all("a", Save::all()).build()];
/// let store = parse(html, queries);
///
/// let a = store.get("a").unwrap().next().unwrap();
/// let attrs = a.attributes(&store).unwrap();
/// assert_eq!(attrs[0].key, "href");
/// assert_eq!(attrs[0].value, Some("https://example.com"));
/// assert_eq!(attrs[1].key, "target");
/// assert_eq!(attrs[1].value, Some("_blank"));
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Attribute<'html> {
    /// The attribute name (e.g. `"href"`, `"class"`, `"data-id"`).
    pub key: &'html str,
    /// The attribute value, or `None` for boolean attributes (e.g. `disabled`).
    pub value: Option<&'html str>,
}
32
33//pub type Attributes<'html> = SmallVec<[Attribute<'html>, 3]>;
34
35/// An HTML element as parsed from the token stream.
36///
37/// This is the *parser-level* representation used during streaming.
38/// Once an element is matched by a query, its data is copied into an
39/// [`Element`](crate::Element) inside the [`Store`](crate::Store).
40#[derive(Debug, PartialEq, Clone, Default)]
41pub struct XHtmlElement<'html> {
42    /// The tag name (e.g. `"div"`, `"a"`, `"section"`).
43    pub name: &'html str,
44    /// The value of the `id` attribute, if present.
45    pub id: Option<&'html str>,
46    /// The value of the `class` attribute, if present.
47    pub class: Option<&'html str>,
48    /// Slice of additional attributes (excludes `id` and `class`).
49    pub attributes: &'html [Attribute<'html>],
50}
51
/// The kind of tag found at the current position of the HTML source.
///
/// The type only holds a borrowed `&str`, so it is cheap to copy and can be
/// compared for full equality.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum XHtmlTag<'html> {
    /// An opening tag; its name and attributes are parsed separately.
    Open,
    /// A closing tag, carrying the tag name borrowed from the source
    /// (e.g. `"p"` for `</p>`).
    Close(&'html str),
}
57
58impl<'html> XHtmlElement<'html> {
59    fn add_to_element(
60        &mut self,
61        attribute: Attribute<'html>,
62        attribute_tape: &mut Vec<Attribute<'html>>,
63    ) {
64        if self.name.is_empty() && attribute.value.is_none() {
65            self.name = attribute.key;
66        } else if self.class.is_none() && attribute.key == "class" && attribute.value.is_some() {
67            self.class = attribute.value;
68        } else if self.id.is_none() && attribute.key == "id" && attribute.value.is_some() {
69            self.id = attribute.value;
70        } else {
71            attribute_tape.push(attribute);
72        }
73    }
74
75    pub fn is_self_closing(&self) -> bool {
76        if matches!(
77            self.name,
78            "area"
79                | "base"
80                | "br"
81                | "col"
82                | "embed"
83                | "hr"
84                | "img"
85                | "input"
86                | "link"
87                | "meta"
88                | "param"
89                | "source"
90                | "track"
91                | "wbr"
92        ) {
93            return true;
94        }
95        if let Some(last_attribute) = self.attributes.last() {
96            return last_attribute.key == "\\";
97        }
98
99        false
100    }
101
102    pub fn clear(&mut self) {
103        self.name = "";
104        self.id = None;
105        self.class = None;
106        self.attributes = &[];
107    }
108
109    /*
110     * When a Element is parsed all the Attributes are added to a Tape
111     * If the Element was not saved, then we need to delete these Attributes
112     */
113    pub fn remove_attributes(&self, attribute_tape: &mut Vec<Attribute<'html>>) {
114        if self.attributes.is_empty() {
115            return;
116        }
117        let tape_ptr = attribute_tape.as_ptr();
118        let attr_range_ptr = self.attributes.as_ptr();
119        let idx = unsafe { attr_range_ptr.offset_from_unsigned(tape_ptr) };
120
121        attribute_tape.truncate(idx);
122    }
123
124    pub fn from(&mut self, reader: &mut Reader<'html>, attribute_tape: &mut Vec<Attribute<'html>>) {
125        let mut assign = false;
126        let mut key = None;
127        let start_len = attribute_tape.len();
128
129        while let Some(token) = ElementAttributeToken::next(reader) {
130            match token {
131                ElementAttributeToken::String(string_value) => match key {
132                    None => {
133                        debug_assert!(!assign);
134                        key = Some(string_value);
135                    }
136                    Some(k) => {
137                        if assign {
138                            self.add_to_element(
139                                Attribute {
140                                    key: k,
141                                    value: Some(string_value),
142                                },
143                                attribute_tape,
144                            );
145                            key = None;
146                        } else {
147                            self.add_to_element(
148                                Attribute {
149                                    key: k,
150                                    value: None,
151                                },
152                                attribute_tape,
153                            );
154                            key = Some(string_value)
155                        }
156                        assign = false;
157                    }
158                },
159
160                ElementAttributeToken::Equal => {
161                    assign = true;
162                }
163            }
164        }
165
166        if let Some(attribute) = key {
167            self.add_to_element(
168                Attribute {
169                    key: attribute,
170                    value: None,
171                },
172                attribute_tape,
173            );
174        }
175
176        // Since we are
177        //  1) assigning after adding the Attributes
178        //  and 2) either transforming it into a Range in Store or removing them
179        //  their is no risk when doing this unsafely
180        self.attributes = unsafe {
181            std::slice::from_raw_parts(
182                attribute_tape.as_ptr().add(start_len),
183                attribute_tape.len() - start_len,
184            )
185        };
186    }
187}
188
189// TODO: Parse the closing tag for the XHtmlTag
190impl<'a> XHtmlTag<'a> {
191    pub fn from(reader: &mut Reader<'a>) -> Option<Self> {
192        reader.next_while_list(&[b' ', b'\n', b'\r', b'\t', b'<']);
193        if let Some(character) = reader.peek() {
194            if character == b'/' {
195                let start = reader.get_position() + 1;
196                reader.next_until(b'>');
197
198                let end = reader.get_position();
199                reader.skip();
200
201                // BUG: Handle start and end not conforming to the rules of slices.
202
203                // BUG: The Formating of the string breaks this code
204
205                return Some(Self::Close(reader.slice(start..end).trim()));
206            } else if character == b'!' {
207                // This is a comment
208                reader.next_until(b'>');
209                reader.skip();
210                return None;
211            }
212        }
213        Some(Self::Open)
214    }
215}
216
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `XHtmlElement::from` over `source`, collecting attributes in `tape`.
    fn parse_open_tag<'html>(
        source: &'html str,
        tape: &mut Vec<Attribute<'html>>,
    ) -> XHtmlElement<'html> {
        let mut reader = Reader::new(source);
        let mut element = XHtmlElement::default();
        element.from(&mut reader, tape);
        element
    }

    /// Shorthand for an attribute that carries a value.
    fn with_value<'html>(key: &'html str, value: &'html str) -> Attribute<'html> {
        Attribute {
            key,
            value: Some(value),
        }
    }

    /// Shorthand for an attribute without a value.
    fn boolean(key: &str) -> Attribute<'_> {
        Attribute { key, value: None }
    }

    #[test]
    fn test_key_no_quote_and_value_with_quote() {
        let mut tape = Vec::new();
        let element = parse_open_tag("p key=\"value\"", &mut tape);

        assert_eq!(element.name, "p");
        assert_eq!(element.attributes[0], with_value("key", "value"));
    }

    #[test]
    fn test_key_no_quote_and_value_no_quote() {
        let mut tape = Vec::new();
        let element = parse_open_tag("p key=value", &mut tape);

        assert_eq!(element.name, "p");
        assert_eq!(element.attributes.len(), 1);
        assert_eq!(element.attributes[0], with_value("key", "value"));
    }

    #[test]
    fn test_key_with_quote_and_value_with_quote() {
        let mut tape = Vec::new();
        let element = parse_open_tag("p \"key\"=\"value\"", &mut tape);

        assert_eq!(element.name, "p");
        assert_eq!(element.attributes[0], with_value("key", "value"));
    }

    #[test]
    fn test_multiple_key_value_pairs() {
        let mut tape = Vec::new();
        let element = parse_open_tag(
            "p key=\"value\" \"key1\"=value1 \"key2\"=\"value2\" keey",
            &mut tape,
        );

        assert_eq!(element.name, "p");
        assert_eq!(element.attributes[0], with_value("key", "value"));
        assert_eq!(element.attributes[1], with_value("key1", "value1"));
        assert_eq!(element.attributes[2], with_value("key2", "value2"));
        assert_eq!(element.attributes[3], boolean("keey"));
    }

    #[test]
    fn test_key_with_quote_and_no_value() {
        let mut tape = Vec::new();
        let element = parse_open_tag("p \"key\"", &mut tape);

        assert_eq!(element.name, "p");
        assert_eq!(element.attributes[0], boolean("key"));
    }

    #[test]
    fn test_key_no_quote_and_no_value() {
        let mut tape = Vec::new();
        let element = parse_open_tag("p key", &mut tape);

        assert_eq!(element.name, "p");
        assert_eq!(element.attributes[0], boolean("key"));
    }

    #[test]
    #[ignore = "Known issue: Escapes are not handled"]
    fn test_key_no_quote_and_escaped_space_value() {
        let mut tape = Vec::new();
        let element = parse_open_tag("p key = hello\\ world", &mut tape);

        assert_eq!(element.name, "p");
        assert_eq!(element.attributes[0], with_value("key", "hello\\ world"));
    }

    #[test]
    fn test_long_key_with_spaces() {
        let mut tape = Vec::new();
        let element = parse_open_tag("p \"long key with spaces\"=\"value\"", &mut tape);

        assert_eq!(element.name, "p");
        assert_eq!(
            element.attributes[0],
            with_value("long key with spaces", "value")
        );
    }

    #[test]
    fn test_long_key_with_spaces_and_different_quote_inside() {
        let mut tape = Vec::new();
        let element = parse_open_tag("p \"long key's with spaces\"=\"value\"", &mut tape);

        assert_eq!(element.name, "p");
        assert_eq!(
            element.attributes[0],
            with_value("long key's with spaces", "value")
        );
    }

    #[test]
    #[ignore = "Known issue: Escapes are not handled"]
    fn test_long_key_with_spaces_and_real_same_quote_inside() {
        let mut tape = Vec::new();
        let element = parse_open_tag(r#"p "long key\"s with spaces"="value""#, &mut tape);

        assert_eq!(element.name, "p");
        assert_eq!(
            element.attributes[0],
            with_value(r#"long key\"s with spaces"#, "value")
        );
    }

    #[test]
    #[ignore = "Known issue: Escapes are not handled"]
    fn test_long_key_and_value_with_spaces_and_real_same_quote_inside() {
        let mut tape = Vec::new();
        let element = parse_open_tag(
            r#"p "long key\"s with spaces"="value\"s of an other person \\\\\\ \\\\\ \ \  \"""#,
            &mut tape,
        );

        assert_eq!(element.name, "p");
        assert_eq!(
            element.attributes[0],
            with_value(
                r#"long key\"s with spaces"#,
                r#"value\"s of an other person \\\\\\ \\\\\ \ \  \""#
            )
        );
    }

    #[test]
    fn test_valid_anchor_tag_attributes() {
        let mut tape = Vec::new();
        let element = parse_open_tag(
            "a target=\"_blank\" href=\"/my_cv.pdf\" class=\"px-7 py-3\" hello-world=hello-world",
            &mut tape,
        );

        assert_eq!(element.name, "a");
        assert_eq!(element.attributes[0], with_value("target", "_blank"));
        assert_eq!(element.attributes[1], with_value("href", "/my_cv.pdf"));
        // `class` is stored in its own field, not among the attributes.
        assert_eq!(element.class, Some("px-7 py-3"));
        assert_eq!(
            element.attributes[2],
            with_value("hello-world", "hello-world")
        );
    }

    #[test]
    fn test_complex_open_tag() {
        let mut reader = Reader::new(
            r#"a href="https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/crossorigin" title="The crossorigin attribute, valid on the <audio>, <img>, <link>, <script>, and <video> elements, provides support for CORS, defining how the element handles cross-origin requests, thereby enabling the configuration of the CORS requests for the element's fetched data. Depending on the element, the attribute can be a CORS settings attribute.""#,
        );

        // Classify the tag first, then parse the element from the same reader.
        let tag = XHtmlTag::from(&mut reader);
        let mut tape = Vec::new();
        let mut element = XHtmlElement::default();
        element.from(&mut reader, &mut tape);

        assert_eq!(tag, Some(XHtmlTag::Open));

        let expected = XHtmlElement {
            name: "a",
            id: None,
            class: None,
            attributes: &[
                Attribute {
                    key: "href",
                    value: Some(
                        "https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/crossorigin",
                    ),
                },
                Attribute {
                    key: "title",
                    value: Some(
                        "The crossorigin attribute, valid on the <audio>, <img>, <link>, <script>, and <video> elements, provides support for CORS, defining how the element handles cross-origin requests, thereby enabling the configuration of the CORS requests for the element's fetched data. Depending on the element, the attribute can be a CORS settings attribute.",
                    ),
                },
            ],
        };
        assert_eq!(element, expected);
    }

    #[test]
    fn test_xhtml_tag_open() {
        let mut reader = Reader::new("p key=\"value\"");

        let tag = XHtmlTag::from(&mut reader);
        let mut tape = Vec::new();
        let mut element = XHtmlElement::default();
        element.from(&mut reader, &mut tape);

        assert_eq!(tag, Some(XHtmlTag::Open));

        let expected = XHtmlElement {
            name: "p",
            id: None,
            class: None,
            attributes: &[Attribute {
                key: "key",
                value: Some("value"),
            }],
        };
        assert_eq!(element, expected);
    }

    #[test]
    fn test_xhtml_tag_close() {
        let mut reader = Reader::new("/p>");

        assert_eq!(XHtmlTag::from(&mut reader), Some(XHtmlTag::Close("p")));
    }

    #[test]
    fn test_xhtml_tag_close_weird_formatting() {
        let mut reader = Reader::new("  /   p   >");

        assert_eq!(XHtmlTag::from(&mut reader), Some(XHtmlTag::Close("p")));
    }

    #[test]
    fn test_parsing_comment() {
        let mut reader = Reader::new("<!-- These 3 links will be selected by the selector -->");

        assert!(XHtmlTag::from(&mut reader).is_none())
    }

    #[test]
    fn test_parsing_mutiline_comment() {
        let mut reader = Reader::new(
            r#"
            <!-- These 3 links will be selected by the selector -->
        "#,
        );

        assert!(XHtmlTag::from(&mut reader).is_none())
    }
}