html_parser_rs/
lib.rs

/// A single token emitted by the lexer while scanning an HTML document.
///
/// Events are produced in document order. The attributes of an element are
/// emitted as individual [`Event::Attribute`] events right after the
/// [`Event::StartElement`] they belong to.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Event {
    /// An opening tag such as `<div>`; carries the tag name.
    StartElement(String),
    /// A closing tag such as `</div>`, or the implicit close of a
    /// self-closing tag such as `<br/>`; carries the tag name.
    EndElement(String),
    /// Text found between tags, with newlines and tabs removed and the
    /// surrounding whitespace trimmed.
    TextContent(String),
    /// An attribute as `(key, value)`; the value is empty for attributes
    /// written without `=`, e.g. `disabled`.
    Attribute(String, String),
}
8
9pub struct Lexer {
10    content: Vec<char>,
11    tag_stack: Vec<String>,
12    events: Vec<Event>,
13}
14
15impl Lexer {
16    pub fn new(content: String) -> Self {
17        Self {
18            content: content.chars().collect::<Vec<char>>(),
19            tag_stack: Vec::new(),
20            events: Vec::new(),
21        }
22    }
23
24    pub fn parse(&mut self) -> &[Event] {
25        while !self.content.is_empty() {
26            self.take_whitespaces();
27
28            if self.content.is_empty() {
29                break;
30            }
31
32            // TODO: Implement html declaration eg, `<!Doctype html>`.
33            // TODO: Implement html comment.
34            // TODO: Implement not closing tag eg, `<meta ...>`
35
36            // End element
37            if self.content.len() > 1 && self.content[0] == '<' && self.content[1] == '/' {
38                self.take_end_element();
39            }
40
41            // Start element
42            if !self.content.is_empty() && self.content[0] == '<' {
43                self.take_start_element();
44            }
45
46            // Text content
47            if !self.content.is_empty() {
48                self.take_text_content();
49            }
50        }
51
52        &self.events
53    }
54
55    // `</TAG_NAME>``
56    fn take_end_element(&mut self) {
57        let tag_name = self.take_tag_name(2);
58        self.events.push(Event::EndElement(tag_name.clone()));
59
60        // End element validation
61        if self.tag_stack.last() == Some(&tag_name) {
62            self.tag_stack.pop();
63        } else {
64            eprintln!("ERROR: Invalid closing tag `{tag_name}`.")
65        }
66
67        self.take_whitespaces();
68
69        if self.content[0] == '>' {
70            self.get_slice(0, 1);
71        } else {
72            eprintln!("ERROR: Invalid closing tag with extra args.");
73        }
74    }
75
76    // `<TAG_NAME ATTRIBUTES>` | `<TAG_NAME ATTRIBUTES/>`
77    fn take_start_element(&mut self) {
78        let tag_name = self.take_tag_name(1);
79        self.events.push(Event::StartElement(tag_name.clone()));
80        self.tag_stack.push(tag_name);
81
82        self.take_attributes();
83        self.take_whitespaces();
84
85        // SELF_END_ELEMENT `/>`
86        if self.content.len() > 1 && self.content[0] == '/' && self.content[1] == '>' {
87            self.get_slice(0, 2);
88
89            if let Some(last_tag) = self.tag_stack.pop() {
90                self.events.push(Event::EndElement(last_tag));
91            } else {
92                eprintln!("ERROR: there is no tag.");
93            }
94        }
95        // EXPECTED `>`
96        else if self.content[0] == '>' {
97            self.get_slice(0, 1);
98        } else {
99            eprintln!("ERROR: expected `>` on start element.");
100        }
101    }
102
103    fn take_tag_name(&mut self, start: usize) -> String {
104        self.take_while_from(start, |x| x.is_alphabetic() || x.is_alphanumeric())
105    }
106
107    fn take_text_content(&mut self) {
108        let value = self.take_while(|x| x != '<');
109        let value = value.replace("\n", "");
110        let value = value.replace("\t", "");
111        let value = value.trim().to_string();
112
113        if value.is_empty() {
114            return;
115        }
116
117        self.events.push(Event::TextContent(value));
118    }
119
120    fn take_attributes(&mut self) {
121        // after start a element, until we go to '>' | '/', means we are collecting attributes
122        // eg, `<tag key=value key=value>` || `<tag key=value />`;
123        while (self.content[0] != '>') && (self.content[0] != '/') {
124            self.take_attribute()
125        }
126    }
127
128    // `KEY=VALUE` || `KEY`
129    fn take_attribute(&mut self) {
130        self.take_whitespaces();
131
132        let key =
133            self.take_while(|x| x.is_alphabetic() || x.is_alphanumeric() || x == '-' || x == '_');
134
135        let value = self.take_attribute_value();
136
137        self.events.push(Event::Attribute(key, value));
138
139        self.take_whitespaces();
140    }
141
142    fn take_attribute_value(&mut self) -> String {
143        if self.content.is_empty() || self.content[0] != '=' {
144            return String::from("");
145        }
146
147        self.get_slice(0, 1);
148
149        let mut qoute_count = 0;
150
151        // TODO: Implement string logic.
152        let value = self.take_while(|x| {
153            // String identifire
154            if x == '"' {
155                qoute_count += 1;
156                return if qoute_count == 2 { false } else { true };
157            }
158
159            // If value is a string
160            if qoute_count == 1 {
161                return true;
162            }
163
164            x != ' ' || x != '>' || x != '/'
165        });
166
167        if qoute_count == 0 {
168            return value;
169        }
170
171        // when qoute == 2 THEN we break, means we don't count ending '"'
172        // we need to clean up that '"'
173        self.get_slice(0, 1);
174
175        value[1..].to_string()
176    }
177
178    fn take_whitespaces(&mut self) {
179        self.take_while(|x| x.is_whitespace());
180    }
181
182    fn take_while<F>(&mut self, predict: F) -> String
183    where
184        F: FnMut(char) -> bool,
185    {
186        self.take_while_from(0, predict)
187    }
188
189    fn take_while_from<F>(&mut self, start: usize, mut predict: F) -> String
190    where
191        F: FnMut(char) -> bool,
192    {
193        let mut i = start;
194
195        while self.content.len() > i && predict(self.content[i]) {
196            i += 1;
197        }
198
199        self.get_slice(start, i)
200    }
201
202    fn get_slice(&mut self, from: usize, to: usize) -> String {
203        let value = self.content[from..to].iter().collect::<String>();
204        self.content = self.content[to..].to_vec();
205
206        value
207    }
208}