lssg_lib/html/
lexer.rs

1use std::{collections::HashMap, io::Read};
2
3use crate::{char_reader::CharReader, parse_error::ParseError};
4
/// Parses a format-string literal as html and evaluates to its first top-level node.
///
/// The argument is passed through `format!` before parsing, so `{name}` placeholders are
/// interpolated and literal braces have to be escaped as `{{` / `}}`.
///
/// Only the first parsed node is returned; any nodes after it are dropped.
///
/// # Panics
///
/// Panics when parsing returns an error or when the parsed input yields no nodes at all.
#[macro_export]
macro_rules! html {
    ($x:tt) => {{
        let parsed = $crate::html::parse_html(format!($x).as_bytes())
            .expect("should contain valid html");
        match parsed.into_iter().next() {
            Some(first) => first,
            None => panic!("has to contain valid html"),
        }
    }};
}
16
17pub fn parse_html(input: impl Read) -> Result<Vec<Html>, ParseError> {
18    let mut reader = CharReader::new(input);
19
20    let mut tokens = vec![];
21
22    loop {
23        match read_token(&mut reader)? {
24            None => break,
25            Some(t) => tokens.push(t),
26        }
27    }
28
29    // add texts together
30    let mut reduced_tokens = vec![];
31    for token in tokens.into_iter() {
32        if let Some(Html::Text { text: a }) = reduced_tokens.last_mut() {
33            if let Html::Text { text: b } = &token {
34                *a += b;
35                continue;
36            }
37        }
38        reduced_tokens.push(token)
39    }
40
41    Ok(reduced_tokens)
42}
43
44/// parse html from start to end and return (tag, attributes, innerHtml)
45///
46/// seperated to make logic more reusable
47pub fn parse_html_block(
48    reader: &mut CharReader<impl Read>,
49) -> Result<Option<(String, HashMap<String, String>, String)>, ParseError> {
50    if let Some('<') = reader.peek_char(0)? {
51        if let Some(start_tag) = reader.peek_until(|c| c == '>')? {
52            // get start_tag
53            let mut tag = String::new();
54            for c in start_tag[1..start_tag.len() - 1].chars() {
55                match c {
56                    ' ' => break,
57                    '\n' => break,
58                    _ => tag.push(c),
59                }
60            }
61
62            // get attributes
63            let mut attributes = HashMap::new();
64            let chars: Vec<char> = start_tag[1 + tag.len()..start_tag.len() - 1]
65                .chars()
66                .collect();
67            let mut key = String::new();
68            let mut value = String::new();
69            let mut in_value = false;
70            let mut i = 0;
71            while i < chars.len() {
72                match chars[i] {
73                    ' ' if in_value == false => {
74                        if key.len() > 0 {
75                            attributes.insert(key, value);
76                            key = String::new();
77                            value = String::new();
78                            in_value = false;
79                        }
80                    }
81                    '=' => match chars.get(i + 1) {
82                        Some('"') | Some('\'') => {
83                            i += 1;
84                            in_value = true
85                        }
86                        _ => {}
87                    },
88                    '\'' | '"' if in_value => in_value = false,
89                    c => {
90                        if in_value {
91                            value.push(c)
92                        } else {
93                            key.push(c)
94                        }
95                    }
96                }
97                i += 1;
98            }
99            if key.len() > 0 {
100                attributes.insert(key, value);
101            }
102
103            let end_tag = format!("</{tag}>");
104            if let Some(html_block) = reader.peek_until_match_inclusive(&end_tag)? {
105                reader.consume(start_tag.len())?;
106                let mut content = reader.consume_string(html_block.len() - start_tag.len())?;
107                content.truncate(content.len() - end_tag.len());
108
109                let mut children = vec![];
110                let mut reader = CharReader::new(content.as_bytes());
111                while let Some(html) = read_token(&mut reader)? {
112                    children.push(html);
113                }
114                return Ok(Some((tag, attributes, content)));
115            }
116        }
117    }
118    return Ok(None);
119}
120
121/// A "simple" streaming html parser function. This is a fairly simplified way of parsing html
122/// ignoring a lot of edge cases and validation normally seen when parsing html.
123///
124/// **NOTE: Might return multiple Text tokens one after another.**
125pub fn read_token(reader: &mut CharReader<impl Read>) -> Result<Option<Html>, ParseError> {
126    match reader.peek_char(0)? {
127        None => return Ok(None),
128        Some(c) => {
129            if c == '<' {
130                if "<!--" == reader.peek_string(4)? {
131                    if let Some(text) = reader.peek_until_match_inclusive("-->")? {
132                        reader.consume(4)?; // skip start
133                        let text = reader.consume_string(text.len() - 4 - 3)?;
134                        reader.consume(3)?; // skip end
135                        return Ok(Some(Html::Comment { text }));
136                    }
137                }
138
139                if let Some((tag, attributes, content)) = parse_html_block(reader)? {
140                    let mut children = vec![];
141                    let mut reader = CharReader::new(content.as_bytes());
142                    while let Some(html) = read_token(&mut reader)? {
143                        children.push(html);
144                    }
145                    return Ok(Some(Html::Element {
146                        tag,
147                        attributes,
148                        children,
149                    }));
150                }
151
152                // non html opening
153                reader.consume(1)?;
154                let mut text = "<".to_string();
155                text.push_str(&reader.consume_until_exclusive(|c| c == '<')?);
156                return Ok(Some(Html::Text { text }));
157            }
158
159            let text = reader.consume_until_exclusive(|c| c == '<')?;
160            return Ok(Some(Html::Text { text }));
161        }
162    }
163}
164
/// A node of parsed html.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Html {
    /// An html comment.
    Comment {
        /// The text of the comment.
        text: String,
    },
    /// A run of plain text.
    Text {
        /// The text content.
        text: String,
    },
    /// An element with a tag name, attributes and child nodes.
    Element {
        /// The tag name.
        tag: String,
        /// Attribute names mapped to their values.
        attributes: HashMap<String, String>,
        /// The nodes nested inside this element, in order.
        children: Vec<Html>,
    },
}
179
#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use crate::html::to_attributes;

    use super::*;

    /// Runs `parse_html` over an in-memory string and unwraps the result.
    fn parse(input: &'static str) -> Vec<Html> {
        let reader: Box<dyn Read> = Box::new(Cursor::new(input));
        parse_html(reader).unwrap()
    }

    /// Shorthand for a text node.
    fn text(text: &str) -> Html {
        Html::Text { text: text.into() }
    }

    /// Shorthand for an element node.
    fn element(tag: &str, attributes: HashMap<String, String>, children: Vec<Html>) -> Html {
        Html::Element {
            tag: tag.into(),
            attributes,
            children,
        }
    }

    #[test]
    fn test_html() {
        // Sibling elements separated by a newline, one with a nested child.
        let input = concat!(
            r#"<a href="test.com"><i class="fa-solid fa-rss"></i>Test</a>"#,
            "\n",
            r#"<button disabled></button>"#,
        );
        let expected = vec![
            element(
                "a",
                to_attributes([("href", "test.com")]),
                vec![
                    element("i", to_attributes([("class", "fa-solid fa-rss")]), vec![]),
                    text("Test"),
                ],
            ),
            text("\n"),
            element("button", to_attributes([("disabled", "")]), vec![]),
        ];
        assert_eq!(expected, parse(input));

        // Markdown-looking text inside a nested element stays plain text.
        let input = concat!(
            "<div>\n",
            r#"<a href="link.com">[other](other.com)</a>"#,
            "\n",
            "</div>",
        );
        let expected = vec![element(
            "div",
            HashMap::new(),
            vec![
                text("\n"),
                element(
                    "a",
                    to_attributes([("href", "link.com")]),
                    vec![text("[other](other.com)")],
                ),
                text("\n"),
            ],
        )];
        assert_eq!(expected, parse(input));
    }

    #[test]
    fn test_text_looks_like_html() {
        // Stray `<` / `>` characters and an unclosed tag must come back as one text node.
        let input = concat!(
            "<Lots of people say Rust > c++. even though it might be\n",
            "< then c++. Who knows? \n",
            "<nonclosing>\n",
            "This should be text\n",
        );
        let expected = vec![text(concat!(
            "<Lots of people say Rust > c++. even though it might be\n",
            "< then c++. Who knows? \n",
            "<nonclosing>\n",
            "This should be text\n",
        ))];
        assert_eq!(expected, parse(input));
    }
}