rusthtml/
lib.rs

1#[cfg(test)]
2mod tests;
3
4/// A tag "completer" that fill in some missing tags allowed by html specification
5///
6/// Some html tags are allowed to be "inferred" in specific location; however, our parser does not
7/// allow this. Thus, this function will "fix" those tags.
8///
9/// **This function is currently significantly incomplete compared to html specification.**
10///
11/// ```
12/// # extern crate rusthtml;
13/// # use rusthtml::*;
14/// let parsed = ElementContent::parse(tag_optimize(HtmlTag::parse(
15///        r#"<img abc>"#,
16///    )));
17/// assert_eq!(parsed,
18///     Ok(vec![
19///         ElementContent::HtmlElement(Box::new(HtmlElement {
20///         name: "img",
21///         attributes: vec![("abc", None)],
22///         tag_state: ElementTagState::BothTag,
23///         content: Vec::new(),
24///     }))])
25/// );
26/// ```
27pub fn tag_optimize<'a>(mut content: Vec<HtmlTag<'a>>) -> Vec<HtmlTag<'a>> {
28    let mut offset = 0;
29    // There should be a better way to do this...
30    // Despreated.
31    let _ = |x| match x {
32        HtmlTag::OpeningTag(i, j) => {
33            let mut a = j
34                .iter()
35                .map(|x| {
36                    if let Some(i) = x.1 {
37                        format!(" {}={}", x.0, i)
38                    } else {
39                        format!(" {}", x.0)
40                    }
41                })
42                .fold(format!("<{}", i), |a, b| {
43                    let mut a = a;
44                    a.push_str(&b);
45                    a
46                });
47            a.push('>');
48            a
49        }
50        HtmlTag::ClosingTag(i) => format!("</{}>", i),
51        HtmlTag::Unparsable(i) => i.to_string(),
52    };
53    // TODO: implement `template`
54    // TODO: implement `head`, `body` omition
55    // This is only a subset of the full rule. More rules should be added to make it complete.
56    for i in 0..content.len() {
57        if let HtmlTag::OpeningTag(name, _) = content[i + offset] {
58            match name {
59                "area" | "base" | "br" | "col" | "embed" | "hr" | "img" | "input" | "link"
60                | "meta" | "param" | "source" | "track" | "wbr" => {
61                    content.insert(i + offset + 1, HtmlTag::ClosingTag(name));
62                    offset += 1;
63                }
64                "li" | "dd" | "dt" | "rt" | "rp" | "optgroup" | "tr" | "td" | "th" => {
65                    if let HtmlTag::OpeningTag(name_c, _) = content[i + offset + 1] {
66                        if name_c == name {
67                            content.insert(i + offset + 1, HtmlTag::ClosingTag(name));
68                            offset += 1;
69                        }
70                    }
71                }
72                "p" => {
73                    // TODO: "if there is no more content in the parent element and the parent
74                    // element is an HTML element that is not an a, audio, del, ins, map, noscript,
75                    // or video element, or an autonomous custom element."
76                    if let HtmlTag::OpeningTag(name_c, _) = content[i + offset + 1] {
77                        match name_c {
78                            "address" | "article" | "aside" | "blockquote" | "details" | "div"
79                            | "dl" | "fieldset" | "figcaption" | "figure" | "footer" | "form"
80                            | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "header" | "hgroup"
81                            | "hr" | "main" | "menu" | "nav" | "ol" | "p" | "pre" | "section"
82                            | "table" | "ul" => {
83                                content.insert(i + offset + 1, HtmlTag::ClosingTag("p"));
84                                offset += 1;
85                            }
86                            _ => {}
87                        }
88                    }
89                }
90                _ => {}
91            }
92        }
93    }
94
95    content
96}
97
/// Possible tag state of an element
///
/// The type is a plain fieldless enum, so it is `Copy` and can be compared and passed around
/// by value.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum ElementTagState {
    /// Only the opening tag of the element has been seen
    OnlyStartTag,
    /// Only the closing tag of the element has been seen
    OnlyEndTag,
    /// Both the opening and the closing tag of the element have been seen
    BothTag,
}
105/// Possible content of an element
106#[derive(PartialEq, Debug)]
107pub enum ElementContent<'a> {
108    HtmlElement(Box<HtmlElement<'a>>),
109    LiteralContent(&'a str),
110}
111/// A html element
112#[derive(PartialEq, Debug)]
113pub struct HtmlElement<'a> {
114    /// Name of the element
115    pub name: &'a str,
116    /// Attributes of the element
117    pub attributes: Vec<(&'a str, Option<&'a str>)>,
118    /// Possible tag state of an element
119    pub tag_state: ElementTagState,
120    /// Contents of the element
121    pub content: Vec<ElementContent<'a>>,
122}
123impl<'a> ElementContent<'a> {
124    /// Parse a vector of html tag to elements
125    ///
126    /// # Errors
127    ///
128    /// If the input content contains ending tag of a non-existing element, the function will
129    /// yield an error.
130    pub fn parse(content: Vec<HtmlTag<'a>>) -> Result<Vec<Self>, ()> {
131        let mut constructed = Vec::new();
132        for i in content {
133            match i {
134                HtmlTag::OpeningTag(i, j) => {
135                    constructed.push(Self::HtmlElement(Box::new(HtmlElement {
136                        name: i,
137                        attributes: j,
138                        tag_state: ElementTagState::OnlyStartTag,
139                        content: Vec::new(),
140                    })))
141                }
142                HtmlTag::ClosingTag(i) => {
143                    let mut tag_content = Vec::new();
144                    while constructed.len() != 0 {
145                        if let Self::HtmlElement(k) = &constructed[constructed.len() - 1] {
146                            if k.name == i {
147                                break;
148                            }
149                        }
150                        tag_content.push(constructed.remove(constructed.len() - 1));
151                    }
152                    if constructed.len() == 0 {
153                        return Err(());
154                    }
155                    let mut last_ref = if let Some(i) = constructed.last_mut() {
156                        if let Self::HtmlElement(i) = i {
157                            i
158                        } else {
159                            unsafe { core::hint::unreachable_unchecked() }
160                        }
161                    } else {
162                        unsafe { core::hint::unreachable_unchecked() }
163                    };
164                    tag_content.reverse();
165                    last_ref.content.append(&mut tag_content);
166                    last_ref.tag_state = ElementTagState::BothTag;
167                }
168                HtmlTag::Unparsable(i) => constructed.push(Self::LiteralContent(i)),
169            }
170        }
171        Ok(constructed)
172    }
173}
174
/// Raw html tag
#[derive(Clone, PartialEq, Eq, Debug)]
pub enum HtmlTag<'a> {
    /// An opening tag: the element name followed by its `(name, value)` attribute pairs; the
    /// value is `None` for an attribute written without `=`
    OpeningTag(&'a str, Vec<(&'a str, Option<&'a str>)>),
    /// A closing tag, carrying the element name
    ClosingTag(&'a str),
    /// Anything kept verbatim: text between tags, the inside of a `<!...>` tag, or the raw
    /// content of a `script`, `style`, `textarea` or `title` element
    Unparsable(&'a str),
}
impl<'a> HtmlTag<'a> {
    /// Parse a html to tags
    ///
    /// The input is scanned for `<` and `>`:
    ///
    /// - `<name attr=value ...>` becomes an `OpeningTag`;
    /// - `</name>` becomes a `ClosingTag` (whitespace around the name is ignored);
    /// - `<!...>` becomes an `Unparsable` holding the text between `<` and `>`;
    /// - text between two tags becomes an `Unparsable`, unless it is whitespace only.
    ///
    /// After an opening `script`, `style`, `textarea` or `title` tag nothing is interpreted
    /// until the matching closing tag; everything in between is emitted as one `Unparsable`.
    ///
    /// Text in front of the first tag and text after the last `>` is not emitted. This function
    /// never fails: malformed input produces a best-effort tag list.
    pub fn parse(content: &'a str) -> Vec<Self> {
        // Byte offset of the most recent `<`, or of the `>` that ended the most recent tag.
        let mut last_splitn = 0;
        let mut constructed = Vec::new();
        // While inside a raw text element: its name and the offset where its content starts.
        let mut ignore_parsing: Option<(&'a str, usize)> = None;
        for (index, i) in content.char_indices() {
            if i == '<' {
                if ignore_parsing.is_none() && last_splitn != 0 {
                    let text = &content[last_splitn + 1..index];
                    if !text.trim().is_empty() {
                        constructed.push(Self::Unparsable(text));
                    }
                }
                last_splitn = index;
            } else if i == '>' {
                let tag = &content[last_splitn..index];
                // A `>` that does not terminate a `<...` is ordinary text. `starts_with` also
                // covers the empty slice produced by a `>` at the very start of the input.
                if !tag.starts_with('<') {
                    continue;
                }
                let tag = tag[1..].trim_start();
                let constru = if tag.starts_with('/') {
                    let name = tag[1..].trim();
                    if let Some((raw_name, raw_start)) = ignore_parsing {
                        if raw_name != name {
                            continue;
                        }
                        ignore_parsing = None;
                        constructed.push(Self::Unparsable(&content[raw_start..last_splitn]));
                    }
                    Self::ClosingTag(name)
                } else {
                    // Inside a raw text element everything but the matching closing tag is
                    // content, including `<!...>`; it is emitted once, when the element ends.
                    if ignore_parsing.is_some() {
                        continue;
                    }
                    if tag.starts_with('!') {
                        Self::Unparsable(tag)
                    } else {
                        let (name, attributes) = Self::parse_opening_tag_content(tag);
                        match name {
                            "script" | "style" | "textarea" | "title" => {
                                ignore_parsing = Some((name, index + 1));
                            }
                            _ => {}
                        }
                        Self::OpeningTag(name, attributes)
                    }
                };
                constructed.push(constru);
                last_splitn = index;
            }
        }
        constructed
    }

    /// Split the inside of an opening tag (without `<` and `>`) into the element name and its
    /// attributes.
    ///
    /// Tokens are separated by whitespace that is not inside a quotation. A quotation is opened
    /// by `"`, `'` or `!` and closed only by the same character. The first token is the element
    /// name; every other token is split at its first `=` into a name and a value, and quote
    /// characters at both ends of the value are removed. Empty input yields `("", vec![])`.
    fn parse_opening_tag_content(content: &'a str) -> (&'a str, Vec<(&'a str, Option<&'a str>)>) {
        #[derive(Clone, Copy, PartialEq)]
        enum QuoteStatus {
            NoQuote,
            SingleQuote,
            DoubleQuote,
            BangQuote,
        }
        let content = content.trim();
        let mut current_quotation = QuoteStatus::NoQuote;
        let mut splitted_content: Vec<&'a str> = Vec::new();
        // Byte offset at which the token being read started, if one is being read. All
        // offsets come from `char_indices`, so slicing works for non-ASCII content too.
        let mut token_start: Option<usize> = None;
        for (index, i) in content.char_indices() {
            if i.is_whitespace() {
                if current_quotation == QuoteStatus::NoQuote {
                    if let Some(start) = token_start.take() {
                        splitted_content.push(&content[start..index]);
                    }
                }
                continue;
            }
            if token_start.is_none() {
                token_start = Some(index);
            }
            current_quotation = match (current_quotation, i) {
                (QuoteStatus::NoQuote, '"') => QuoteStatus::DoubleQuote,
                (QuoteStatus::NoQuote, '\'') => QuoteStatus::SingleQuote,
                (QuoteStatus::NoQuote, '!') => QuoteStatus::BangQuote,
                (QuoteStatus::DoubleQuote, '"')
                | (QuoteStatus::SingleQuote, '\'')
                | (QuoteStatus::BangQuote, '!') => QuoteStatus::NoQuote,
                (unchanged, _) => unchanged,
            };
        }
        // The last token runs to the end of the input, even if a quotation was left open.
        if let Some(start) = token_start {
            splitted_content.push(&content[start..]);
        }

        let mut tokens = splitted_content.into_iter();
        let name = match tokens.next() {
            Some(name) => name,
            None => return ("", Vec::new()),
        };
        let attributes = tokens
            .map(|token| match token.find('=') {
                // Split at the first `=`: attribute values may contain `=` themselves.
                Some(equal_sign) => (
                    &token[..equal_sign],
                    Some(token[equal_sign + 1..].trim_matches(|c| (c == '"') | (c == '\''))),
                ),
                None => (token, None),
            })
            .collect();
        (name, attributes)
    }
}
301}