html_streaming_editor/html/
mod.rs

1use rctree::{Children, Node};
2use snafu::{Backtrace, Snafu};
3use std::collections::BTreeMap;
4
5use crate::CssSelector;
6use tl::{HTMLTag, HTMLVersion, NodeHandle, Parser, VDom};
7
8#[cfg(test)]
9mod tests;
10
/// Errors that can occur while converting a DOM parsed by `tl` into this
/// module's own node tree.
#[derive(Debug, Snafu)]
#[snafu(visibility(pub(crate)))]
pub enum HtmlDomError {
    /// None of the top-level nodes of the parsed DOM is a tag, so there is
    /// nothing to use as the root of the converted tree.
    #[snafu(display("Nothing Imported from tl"))]
    NothingImported { backtrace: Backtrace },
    /// A node handle could not be resolved to a node through the parser.
    #[snafu(display("Node not resolved by Parser"))]
    InvalidParserState { backtrace: Backtrace },
    /// The document structure is not acceptable; `message` describes why.
    // NOTE(review): this variant is not constructed in the visible part of
    // this module; presumably it is raised elsewhere in the crate.
    #[snafu(display("HTML Document has invalid structure: {}", message))]
    InvalidHtmlDocument {
        message: &'static str,
        backtrace: Backtrace,
    },
}
24
/// Names of the elements that are rendered without an end tag.
///
/// `HtmlTag::build_end_tag` emits nothing for a tag whose name is listed here.
/// The comparison is an exact (case-sensitive) string match.
const HTML_VOID_ELEMENTS: [&str; 16] = [
    "area", "base", "br", "col", "command", "embed", "hr", "img", "input", "keygen", "link",
    "meta", "param", "source", "track", "wbr",
];
29
/// Payload of the synthetic document node that wraps an `<html>` root tag.
#[derive(Debug, PartialEq, Clone)]
pub(crate) struct HtmlDocument {
    /// Doctype reported by the parsed DOM, if any. When present it is rendered
    /// on its own line in front of the document's children.
    pub doctype: Option<HTMLVersion>,
}
34
/// An HTML element: its tag name and its attributes.
#[derive(Debug, PartialEq, Clone)]
pub(crate) struct HtmlTag {
    /// Tag name as it is written into the start and end tag.
    pub name: String,
    /// Attribute name -> attribute value. Being a `BTreeMap`, attributes are
    /// iterated (and therefore rendered) in key order. Attributes without a
    /// value are stored with an empty string.
    pub attributes: BTreeMap<String, String>,
}
40
41impl HtmlTag {
42    pub(crate) fn of_name(name: impl Into<String>) -> Self {
43        HtmlTag {
44            name: name.into(),
45            attributes: BTreeMap::<String, String>::new(),
46        }
47    }
48
49    pub(crate) fn build_start_tag(&self, mut add_string: impl FnMut(String)) {
50        add_string(format!("<{}", self.name));
51        self.attributes
52            .iter()
53            .for_each(|(key, value)| add_string(format!(r#" {}="{}""#, key, value)));
54        add_string(String::from(">"));
55    }
56
57    pub(crate) fn build_end_tag(&self, mut add_string: impl FnMut(String)) {
58        if HTML_VOID_ELEMENTS.contains(&self.name.as_ref()) {
59            return;
60        }
61
62        add_string(format!("</{}>", self.name));
63    }
64
65    fn matches_selector(&self, selector: &CssSelector) -> bool {
66        if let Some(element) = selector.element {
67            if element.as_bytes() != self.name.as_bytes() {
68                return false;
69            }
70        }
71
72        if let Some(id) = selector.id {
73            if let Some(tag_id) = self.attributes.get(&String::from("id")) {
74                if id.as_bytes() != tag_id.as_bytes() {
75                    return false;
76                }
77            } else {
78                return false;
79            }
80        }
81
82        for class in &selector.classes {
83            if !self.is_class_member(class) {
84                return false;
85            }
86        }
87
88        for _pseudo_class in &selector.pseudo_classes {
89            todo!("Implement pseudo-class support")
90        }
91
92        for attribute in &selector.attributes {
93            if let Some(attribute_value) = self.attributes.get(&String::from(attribute.attribute)) {
94                if !attribute.matches(attribute_value) {
95                    return false;
96                }
97            } else {
98                return false;
99            }
100        }
101
102        true
103    }
104
105    fn is_class_member(&self, class: &str) -> bool {
106        if let Some(classes) = self.attributes.get(&String::from("class")) {
107            classes.split(' ').any(|c| c == class)
108        } else {
109            false
110        }
111    }
112}
113
/// The payload stored in each node of the converted HTML tree.
#[derive(Debug, PartialEq, Clone)]
pub(crate) enum HtmlContent {
    /// Synthetic root that wraps an `<html>` tag and carries the doctype.
    Document(HtmlDocument),
    /// An element; its children are the child nodes in the tree.
    Tag(HtmlTag),
    /// Raw text, kept exactly as delivered by the parser.
    Text(String),
    /// Comment text without the `<!--` / `-->` delimiters and without
    /// surrounding whitespace.
    Comment(String),
}
121
122impl HtmlContent {
123    pub(crate) fn is_tag(&self) -> bool {
124        matches!(self, HtmlContent::Tag(_))
125    }
126
127    pub(crate) fn import(dom: VDom) -> Result<Node<HtmlContent>, HtmlDomError> {
128        let (root_tag, root_tag_name) = Self::find_root_tag(&dom)?;
129
130        if root_tag_name == *"html" {
131            let document = Node::new(HtmlContent::Document(HtmlDocument {
132                doctype: dom.version(),
133            }));
134            document.append(root_tag);
135
136            Ok(document)
137        } else {
138            Ok(root_tag)
139        }
140    }
141
142    fn find_root_tag(dom: &VDom) -> Result<(Node<HtmlContent>, String), HtmlDomError> {
143        let parser = dom.parser();
144
145        for child in dom.children() {
146            if let Some(node) = child.get(parser) {
147                if let Some(tag) = node.as_tag() {
148                    let name = String::from(tag.name().as_utf8_str());
149                    let converted = Self::convert_tag(tag, parser)?;
150                    return Ok((converted, name));
151                }
152            }
153        }
154
155        NothingImportedSnafu {}.fail()
156    }
157
158    fn convert_tag(tag: &HTMLTag, parser: &Parser) -> Result<Node<HtmlContent>, HtmlDomError> {
159        let name = String::from(tag.name().as_utf8_str());
160        let mut attributes = BTreeMap::new();
161
162        for (key, value) in tag.attributes().iter() {
163            let value_string = if let Some(value_content) = value {
164                String::from(value_content)
165            } else {
166                String::new()
167            };
168
169            attributes.insert(String::from(key), value_string);
170        }
171
172        let converted = Node::<HtmlContent>::new(HtmlContent::Tag(HtmlTag { name, attributes }));
173
174        for child in tag.children().top().iter() {
175            converted.append(Self::convert_node(child, parser)?)
176        }
177
178        Ok(converted)
179    }
180
181    fn convert_node(
182        node_handle: &NodeHandle,
183        parser: &Parser,
184    ) -> Result<Node<HtmlContent>, HtmlDomError> {
185        if let Some(node) = node_handle.get(parser) {
186            return match node {
187                tl::Node::Tag(tag) => Self::convert_tag(tag, parser),
188                tl::Node::Raw(text) => Self::convert_text(text.as_utf8_str()),
189                tl::Node::Comment(comment) => Self::convert_comment(comment.as_utf8_str()),
190            };
191        }
192
193        InvalidParserStateSnafu {}.fail()
194    }
195
196    fn convert_text(text: impl Into<String>) -> Result<Node<HtmlContent>, HtmlDomError> {
197        Ok(Node::new(HtmlContent::Text(text.into())))
198    }
199
200    fn convert_comment(comment: impl Into<String>) -> Result<Node<HtmlContent>, HtmlDomError> {
201        let comment = comment.into();
202        let comment = comment.trim_start_matches("<!--");
203        let comment = comment.trim_end_matches("-->");
204        let comment = comment.trim();
205        Ok(Node::new(HtmlContent::Comment(comment.into())))
206    }
207
208    fn inner_html(&self, children: Children<HtmlContent>) -> String {
209        match self {
210            HtmlContent::Comment(_) => String::new(),
211            HtmlContent::Text(s) => s.clone(),
212            HtmlContent::Document(d) => {
213                let mut inner_content = children
214                    .into_iter()
215                    .map(|c| c.outer_html())
216                    .collect::<Vec<_>>();
217                if let Some(doctype) = &d.doctype {
218                    inner_content.insert(0, doctype.outer_html());
219                    inner_content.insert(1, String::from('\n'));
220                }
221
222                inner_content.join("")
223            }
224            HtmlContent::Tag(_t) => children
225                .into_iter()
226                .map(|c| c.outer_html())
227                .collect::<Vec<_>>()
228                .join(""),
229        }
230    }
231
232    fn outer_html(&self, children: Children<HtmlContent>) -> String {
233        match self {
234            HtmlContent::Comment(s) => format!("<!-- {} -->", s),
235            HtmlContent::Text(s) => s.clone(),
236            HtmlContent::Document(_) => self.inner_html(children),
237            HtmlContent::Tag(t) => {
238                let mut parts = Vec::<String>::new();
239                t.build_start_tag(|content| parts.push(content));
240
241                for child in children {
242                    parts.push(child.outer_html());
243                }
244
245                t.build_end_tag(|content| parts.push(content));
246                parts.join("")
247            }
248        }
249    }
250
251    fn text_content(&self, children: Children<HtmlContent>) -> String {
252        match self {
253            HtmlContent::Comment(_) => String::new(),
254            HtmlContent::Text(s) => s.clone(),
255            HtmlContent::Tag(_) | HtmlContent::Document(_) => children
256                .into_iter()
257                .filter_map(|c| {
258                    let child_render = c.text_content();
259
260                    if child_render.is_empty() {
261                        None
262                    } else {
263                        Some(child_render)
264                    }
265                })
266                .collect::<Vec<_>>()
267                .join(" "),
268        }
269    }
270
271    fn matches_selector(&self, selector: &CssSelector) -> bool {
272        match self {
273            HtmlContent::Comment(_) | HtmlContent::Text(_) | HtmlContent::Document(_) => false,
274            HtmlContent::Tag(t) => t.matches_selector(selector),
275        }
276    }
277
278    pub(crate) fn clear_attribute(&mut self, attribute: &String) {
279        match self {
280            HtmlContent::Comment(_) | HtmlContent::Text(_) | HtmlContent::Document(_) => (),
281            HtmlContent::Tag(tag) => {
282                tag.attributes.remove(attribute);
283            }
284        }
285    }
286
287    pub(crate) fn set_attribute(&mut self, attribute: impl Into<String>, value: impl Into<String>) {
288        match self {
289            HtmlContent::Comment(_) | HtmlContent::Text(_) | HtmlContent::Document(_) => (),
290            HtmlContent::Tag(tag) => {
291                tag.attributes.insert(attribute.into(), value.into());
292            }
293        }
294    }
295
296    pub(crate) fn get_attribute(&self, attribute: &String) -> Option<String> {
297        match self {
298            HtmlContent::Comment(_) | HtmlContent::Text(_) | HtmlContent::Document(_) => None,
299            HtmlContent::Tag(tag) => tag.attributes.get(attribute).cloned(),
300        }
301    }
302}
303
/// Rendering of a node back into markup or plain text.
pub trait HtmlRenderable {
    /// Returns the markup of all child elements
    ///
    /// ## Limitations
    /// - The tag attributes are written in alphabetical order
    /// - Spaces within the tag are not preserved (i.e. `<img      src="">` may become `<img src="">`)
    /// - Does no escaping to change as little as possible of the input HTML. Input has to be correctly escaped
    ///
    /// Equivalent to [Element#innerHTML](https://developer.mozilla.org/en-US/docs/Web/API/Element/innerHTML) in browsers
    fn inner_html(&self) -> String;
    /// Returns the markup building up this element and all children
    ///
    /// ## Limitations
    /// - The tag attributes are written in alphabetical order
    /// - Spaces within the tag are not preserved (i.e. `<img      src="">` may become `<img src="">`)
    /// - Does no escaping to change as little as possible of the input HTML. Input has to be correctly escaped
    ///
    /// Equivalent to [Element#outerHTML](https://developer.mozilla.org/en-US/docs/Web/API/Element/outerHTML) in browsers
    fn outer_html(&self) -> String;
    /// Returns the contained (non-comment) text of this element, excluding any markup.
    ///
    /// ## Limitations
    /// - Does no escaping to change as little as possible of the input HTML. Input has to be correctly escaped
    ///
    /// Comparable to [Node#textContent](https://developer.mozilla.org/en-US/docs/Web/API/Node/textContent) in browsers
    fn text_content(&self) -> String;
}
329
330impl HtmlRenderable for Node<HtmlContent> {
331    fn inner_html(&self) -> String {
332        let children = self.children();
333        let inner = self.borrow();
334
335        inner.inner_html(children)
336    }
337
338    fn outer_html(&self) -> String {
339        let children = self.children();
340        let inner = self.borrow();
341
342        inner.outer_html(children)
343    }
344
345    fn text_content(&self) -> String {
346        let children = self.children();
347        let inner = self.borrow();
348
349        inner.text_content(children)
350    }
351}
352
353impl HtmlRenderable for HTMLVersion {
354    fn inner_html(&self) -> String {
355        String::new()
356    }
357
358    fn outer_html(&self) -> String {
359        match self {
360            HTMLVersion::HTML5 => String::from("<!DOCTYPE html>"),
361            HTMLVersion::StrictHTML401 => String::from(
362                r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">"#,
363            ),
364            HTMLVersion::TransitionalHTML401 => String::from(
365                r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd">"#,
366            ),
367            HTMLVersion::FramesetHTML401 => String::from(
368                r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN" "http://www.w3.org/TR/1999/REC-html401-19991224/frameset.dtd">"#,
369            ),
370        }
371    }
372
373    fn text_content(&self) -> String {
374        String::new()
375    }
376}
377
/// Testing a node against a CSS selector.
pub(crate) trait HtmlQueryable {
    /// Returns `true` when the implementor satisfies `selector`.
    fn matches_selector(&self, selector: &CssSelector) -> bool;
}
381
382impl HtmlQueryable for Node<HtmlContent> {
383    fn matches_selector(&self, selector: &CssSelector) -> bool {
384        let inner = self.borrow();
385        inner.matches_selector(selector)
386    }
387}