//! parse_html/lexer/tokenizer.rs — character-based HTML tokenizer.

1use crate::token::Token;
2
3use super::lexer_trait::LexerTrait;
4
/// Character-based HTML lexer: walks a buffered character vector with a
/// movable cursor that helper methods may also rewind (un-read).
pub struct Lexer {
    // Entire input buffered as chars for O(1) indexed access.
    input: Vec<char>,
    // Index of the next character to read; decremented to push a char back.
    position: usize,
}
9
10impl Lexer {
11    fn next_char(&mut self) -> Option<char> {
12        if self.position < self.input.len() {
13            let ch = self.input[self.position];
14            self.position += 1;
15            Some(ch)
16        } else {
17            None
18        }
19    }
20
21    fn read_tag_name(&mut self) -> String {
22        let mut name = String::new();
23        while let Some(ch) = self.next_char() {
24            if ch.is_alphanumeric() {
25                name.push(ch);
26            } else {
27                self.position -= 1;
28                break;
29            }
30        }
31        name
32    }
33
34    fn read_attribute_value(&mut self) -> String {
35        let mut value = String::new();
36        let mut inside_quotes = false;
37
38        while let Some(ch) = self.next_char() {
39            if ch == '"' {
40                if inside_quotes {
41                    break;
42                } else {
43                    inside_quotes = true;
44                    continue;
45                }
46            }
47            if inside_quotes {
48                value.push(ch);
49            }
50        }
51        value
52    }
53
54    fn read_attribute_name(&mut self) -> Option<Token> {
55        let mut name = String::new();
56        while let Some(ch) = self.next_char() {
57            if ch.is_alphanumeric() || ch == '-' {
58                name.push(ch);
59            } else if ch == '=' || ch == ' ' {
60                self.position -= 1;
61                return Some(Token::AttributeName(name));
62            } else {
63                self.position -= 1;
64                break;
65            }
66        }
67        None
68    }
69}
70
71impl LexerTrait for Lexer {
72    fn new(input: &str) -> Self {
73        Self {
74            input: input.chars().collect(),
75            position: 0,
76        }
77    }
78
79    fn tokenize(&mut self) -> Vec<Token> {
80        let mut tokens = Vec::new();
81
82        while let Some(ch) = self.next_char() {
83            match ch {
84                '<' => {
85                    if let Some(next) = self.next_char() {
86                        if next == '!' {
87                            if self.next_char() == Some('-') && self.next_char() == Some('-') {
88                                // Ignorer le commentaire
89                                while self.next_char().is_some() {
90                                    if self.next_char() == Some('-')
91                                        && self.next_char() == Some('-')
92                                        && self.next_char() == Some('>')
93                                    {
94                                        break;
95                                    }
96                                }
97                                continue;
98                            }
99                        } else if next == '/' {
100                            tokens.push(Token::TagClose(self.read_tag_name()));
101                        } else {
102                            let mut tmp_tokens = Vec::new();
103                            self.position -= 1;
104                            let start_position = self.position;
105                            let mut is_self_closing = false;
106                            let mut self_tag_name = String::new();
107                            while let Some(ch1) = self.next_char() {
108                                match ch1 {
109                                    '>' => break,
110                                    '/' => {
111                                        is_self_closing = true;
112                                        break;
113                                    }
114                                    ' ' => {
115                                        if let Some(attr) = self.read_attribute_name() {
116                                            tmp_tokens.push(attr);
117                                        }
118                                    }
119                                    '=' => {
120                                        let value = self.read_attribute_value();
121                                        tmp_tokens.push(Token::AttributeValue(value));
122                                    }
123                                    _ => {
124                                        self_tag_name.push(ch1);
125                                    }
126                                }
127                            }
128                            let end_position = self.position;
129                            self.position = start_position;
130                            if is_self_closing {
131                                tokens.push(Token::SelfClosingTag(self_tag_name));
132                            } else {
133                                tokens.push(Token::TagOpen(self.read_tag_name()));
134                            }
135                            tokens.extend(tmp_tokens.clone());
136                            self.position = end_position;
137                        }
138                    }
139                }
140                ' ' | '\n' | '\t' | '>' | '/' => continue,
141                _ => {
142                    let mut text = String::new();
143                    text.push(ch);
144                    while let Some(next) = self.next_char() {
145                        if next == '<' {
146                            self.position -= 1;
147                            break;
148                        }
149                        text.push(next);
150                    }
151                    tokens.push(Token::Text(text));
152                }
153            }
154        }
155
156        tokens.push(Token::Eof);
157        tokens
158    }
159}
160
#[cfg(test)]
mod tests {
    use crate::token::Token;

    use super::*;

    /// Tokenizes `input` and asserts the produced stream matches
    /// `expected_tokens` exactly (including the trailing `Eof`).
    fn run_lexer_test(input: &str, expected_tokens: Vec<Token>) {
        let produced = Lexer::new(input).tokenize();
        assert_eq!(produced, expected_tokens);
    }

    /// Shorthand for the owned strings used in token expectations.
    fn s(text: &str) -> String {
        String::from(text)
    }

    #[test]
    fn test_simple_text() {
        run_lexer_test(
            "<p>Hello World</p>",
            vec![
                Token::TagOpen(s("p")),
                Token::Text(s("Hello World")),
                Token::TagClose(s("p")),
                Token::Eof,
            ],
        );
    }

    #[test]
    fn test_nested_tags() {
        run_lexer_test(
            "<div><h1>Title</h1><p>Paragraph</p></div>",
            vec![
                Token::TagOpen(s("div")),
                Token::TagOpen(s("h1")),
                Token::Text(s("Title")),
                Token::TagClose(s("h1")),
                Token::TagOpen(s("p")),
                Token::Text(s("Paragraph")),
                Token::TagClose(s("p")),
                Token::TagClose(s("div")),
                Token::Eof,
            ],
        );
    }

    #[test]
    fn test_attributes() {
        run_lexer_test(
            r#"<a href="https://example.com">Click here</a>"#,
            vec![
                Token::TagOpen(s("a")),
                Token::AttributeName(s("href")),
                Token::AttributeValue(s("https://example.com")),
                Token::Text(s("Click here")),
                Token::TagClose(s("a")),
                Token::Eof,
            ],
        );
    }

    #[test]
    fn test_self_closing_tag() {
        run_lexer_test(
            r#"<img src="image.png" />"#,
            vec![
                Token::SelfClosingTag(s("img")),
                Token::AttributeName(s("src")),
                Token::AttributeValue(s("image.png")),
                Token::Eof,
            ],
        );
    }

    #[test]
    fn test_mixed_text_and_tags() {
        run_lexer_test(
            "<p>Hello <strong>World</strong>!</p>",
            vec![
                Token::TagOpen(s("p")),
                Token::Text(s("Hello ")),
                Token::TagOpen(s("strong")),
                Token::Text(s("World")),
                Token::TagClose(s("strong")),
                Token::Text(s("!")),
                Token::TagClose(s("p")),
                Token::Eof,
            ],
        );
    }

    #[test]
    fn test_multiple_attributes() {
        run_lexer_test(
            r#"<input type="text" value="Hello" disabled />"#,
            vec![
                Token::SelfClosingTag(s("input")),
                Token::AttributeName(s("type")),
                Token::AttributeValue(s("text")),
                Token::AttributeName(s("value")),
                Token::AttributeValue(s("Hello")),
                Token::AttributeName(s("disabled")),
                Token::Eof,
            ],
        );
    }

    #[test]
    fn test_text_inside_nested_tags() {
        run_lexer_test(
            "<div><p>Hello <span>beautiful</span> world!</p></div>",
            vec![
                Token::TagOpen(s("div")),
                Token::TagOpen(s("p")),
                Token::Text(s("Hello ")),
                Token::TagOpen(s("span")),
                Token::Text(s("beautiful")),
                Token::TagClose(s("span")),
                Token::Text(s("world!")),
                Token::TagClose(s("p")),
                Token::TagClose(s("div")),
                Token::Eof,
            ],
        );
    }

    #[test]
    fn test_malformed_html() {
        // Unclosed tags still produce their open tokens.
        run_lexer_test(
            "<div><p>Unclosed div",
            vec![
                Token::TagOpen(s("div")),
                Token::TagOpen(s("p")),
                Token::Text(s("Unclosed div")),
                Token::Eof,
            ],
        );
    }

    #[test]
    fn test_html_with_comments() {
        // Comments are dropped entirely; the surrounding text splits in two.
        run_lexer_test(
            "<p>Hello<!-- This is a comment -->World</p>",
            vec![
                Token::TagOpen(s("p")),
                Token::Text(s("Hello")),
                Token::Text(s("World")),
                Token::TagClose(s("p")),
                Token::Eof,
            ],
        );
    }

    #[test]
    fn test_script_tag_content() {
        run_lexer_test(
            r#"<script>console.log("Hello World");</script>"#,
            vec![
                Token::TagOpen(s("script")),
                Token::Text(s("console.log(\"Hello World\");")),
                Token::TagClose(s("script")),
                Token::Eof,
            ],
        );
    }

    #[test]
    fn test_style_tag_content() {
        run_lexer_test(
            r#"<style>body { color: red; }</style>"#,
            vec![
                Token::TagOpen(s("style")),
                Token::Text(s("body { color: red; }")),
                Token::TagClose(s("style")),
                Token::Eof,
            ],
        );
    }

    #[test]
    fn test_complex_html() {
        let input = r#"
        <html>
            <head>
                <title>Test Page</title>
                <meta charset="UTF-8" />
            </head>
            <body>
                <h1>Welcome</h1>
                <p>This is a <strong>test</strong>.</p>
                <br />
                <img src="logo.png" alt="Logo" />
            </body>
        </html>
    "#;
        run_lexer_test(
            input,
            vec![
                Token::TagOpen(s("html")),
                Token::TagOpen(s("head")),
                Token::TagOpen(s("title")),
                Token::Text(s("Test Page")),
                Token::TagClose(s("title")),
                Token::SelfClosingTag(s("meta")),
                Token::AttributeName(s("charset")),
                Token::AttributeValue(s("UTF-8")),
                Token::TagClose(s("head")),
                Token::TagOpen(s("body")),
                Token::TagOpen(s("h1")),
                Token::Text(s("Welcome")),
                Token::TagClose(s("h1")),
                Token::TagOpen(s("p")),
                Token::Text(s("This is a ")),
                Token::TagOpen(s("strong")),
                Token::Text(s("test")),
                Token::TagClose(s("strong")),
                Token::Text(s(".")),
                Token::TagClose(s("p")),
                Token::SelfClosingTag(s("br")),
                Token::SelfClosingTag(s("img")),
                Token::AttributeName(s("src")),
                Token::AttributeValue(s("logo.png")),
                Token::AttributeName(s("alt")),
                Token::AttributeValue(s("Logo")),
                Token::TagClose(s("body")),
                Token::TagClose(s("html")),
                Token::Eof,
            ],
        );
    }
}