use crate::token::Token;

use super::lexer_trait::LexerTrait;

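/// A simple character-based HTML lexer.
///
/// The input is stored as a `Vec<char>` so the `position` cursor can be moved
/// backwards as well as forwards while scanning tags and attributes.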
pub struct Lexer {
    input: Vec<char>,
    position: usize,
}

impl Lexer {
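    /// Returns the character at the current cursor position and advances the
    /// cursor, or `None` once the end of the input is reached.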
    fn next_char(&mut self) -> Option<char> {
        if self.position < self.input.len() {
            let ch = self.input[self.position];
            self.position += 1;
            Some(ch)
        } else {
            None
        }
    }

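    /// Reads an alphanumeric tag name, stepping the cursor back one position
    /// at the first non-alphanumeric character so the caller sees it again.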
    fn read_tag_name(&mut self) -> String {
        let mut name = String::new();
        while let Some(ch) = self.next_char() {
            if ch.is_alphanumeric() {
                name.push(ch);
            } else {
                self.position -= 1;
                break;
            }
        }
        name
    }

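    /// Reads a double-quoted attribute value: characters before the opening
    /// `"` are ignored, and everything up to the closing `"` is collected.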
    fn read_attribute_value(&mut self) -> String {
        let mut value = String::new();
        let mut inside_quotes = false;

        while let Some(ch) = self.next_char() {
            if ch == '"' {
                if inside_quotes {
                    break;
                } else {
                    inside_quotes = true;
                    continue;
                }
            }
            if inside_quotes {
                value.push(ch);
            }
        }
        value
    }

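    /// Reads an attribute name made of alphanumeric characters and `-`. A name
    /// terminated by `=` or a space is returned as `Token::AttributeName`; any
    /// other terminator (e.g. `/` or `>`) discards the name and yields `None`.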
    fn read_attribute_name(&mut self) -> Option<Token> {
        let mut name = String::new();
        while let Some(ch) = self.next_char() {
            if ch.is_alphanumeric() || ch == '-' {
                name.push(ch);
            } else if ch == '=' || ch == ' ' {
                self.position -= 1;
                return Some(Token::AttributeName(name));
            } else {
                self.position -= 1;
                break;
            }
        }
        None
    }
}

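// Example usage (a minimal sketch mirroring `test_simple_text` below; `new` and
// `tokenize` are the `LexerTrait` methods implemented here):
//
//     let mut lexer = Lexer::new("<p>Hi</p>");
//     let tokens = lexer.tokenize();
//     // tokens: [TagOpen("p"), Text("Hi"), TagClose("p"), Eof]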
impl LexerTrait for Lexer {
    fn new(input: &str) -> Self {
        Self {
            input: input.chars().collect(),
            position: 0,
        }
    }

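    /// Scans the entire input and returns a flat token stream, always
    /// terminated by `Token::Eof`. HTML comments are skipped and produce
    /// no tokens.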
    fn tokenize(&mut self) -> Vec<Token> {
        let mut tokens = Vec::new();

        while let Some(ch) = self.next_char() {
            match ch {
                '<' => {
                    if let Some(next) = self.next_char() {
                        if next == '!' {
                            // Skip HTML comments of the form "<!-- ... -->".
                            if self.next_char() == Some('-') && self.next_char() == Some('-') {
                                // Consume characters until the closing "-->"
                                // (or the end of input) is reached.
                                let mut trailing_dashes = 0;
                                while let Some(c) = self.next_char() {
                                    match c {
                                        '-' => trailing_dashes += 1,
                                        '>' if trailing_dashes >= 2 => break,
                                        _ => trailing_dashes = 0,
                                    }
                                }
                                continue;
                            }
                        } else if next == '/' {
                            tokens.push(Token::TagClose(self.read_tag_name()));
                        } else {
                            // Opening or self-closing tag: scan ahead to the end of
                            // the tag, collecting attribute tokens, then rewind so the
                            // tag token can be emitted before its attributes.
                            let mut tmp_tokens = Vec::new();
                            self.position -= 1;
                            let start_position = self.position;
                            let mut is_self_closing = false;
                            let mut self_tag_name = String::new();
                            while let Some(ch1) = self.next_char() {
                                match ch1 {
                                    '>' => break,
                                    '/' => {
                                        is_self_closing = true;
                                        break;
                                    }
                                    ' ' => {
                                        if let Some(attr) = self.read_attribute_name() {
                                            tmp_tokens.push(attr);
                                        }
                                    }
                                    '=' => {
                                        let value = self.read_attribute_value();
                                        tmp_tokens.push(Token::AttributeValue(value));
                                    }
                                    _ => {
                                        self_tag_name.push(ch1);
                                    }
                                }
                            }
                            let end_position = self.position;
                            self.position = start_position;
                            if is_self_closing {
                                tokens.push(Token::SelfClosingTag(self_tag_name));
                            } else {
                                tokens.push(Token::TagOpen(self.read_tag_name()));
                            }
                            tokens.extend(tmp_tokens);
                            self.position = end_position;
                        }
                    }
                }
                // Stray whitespace and tag-delimiter characters are skipped.
                ' ' | '\n' | '\t' | '>' | '/' => continue,
                _ => {
                    // Plain text: collect characters until the next '<' or end of input.
                    let mut text = String::new();
                    text.push(ch);
                    while let Some(next) = self.next_char() {
                        if next == '<' {
                            self.position -= 1;
                            break;
                        }
                        text.push(next);
                    }
                    tokens.push(Token::Text(text));
                }
            }
        }

        tokens.push(Token::Eof);
        tokens
    }
}

#[cfg(test)]
mod tests {
    use crate::token::Token;

    use super::*;

    fn run_lexer_test(input: &str, expected_tokens: Vec<Token>) {
        let mut lexer = Lexer::new(input);
        let tokens = lexer.tokenize();
        assert_eq!(tokens, expected_tokens);
    }

    #[test]
    fn test_simple_text() {
        let input = "<p>Hello World</p>";
        let expected_tokens = vec![
            Token::TagOpen("p".to_string()),
            Token::Text("Hello World".to_string()),
            Token::TagClose("p".to_string()),
            Token::Eof,
        ];
        run_lexer_test(input, expected_tokens);
    }

    #[test]
    fn test_nested_tags() {
        let input = "<div><h1>Title</h1><p>Paragraph</p></div>";
        let expected_tokens = vec![
            Token::TagOpen("div".to_string()),
            Token::TagOpen("h1".to_string()),
            Token::Text("Title".to_string()),
            Token::TagClose("h1".to_string()),
            Token::TagOpen("p".to_string()),
            Token::Text("Paragraph".to_string()),
            Token::TagClose("p".to_string()),
            Token::TagClose("div".to_string()),
            Token::Eof,
        ];
        run_lexer_test(input, expected_tokens);
    }

    #[test]
    fn test_attributes() {
        let input = r#"<a href="https://example.com">Click here</a>"#;
        let expected_tokens = vec![
            Token::TagOpen("a".to_string()),
            Token::AttributeName("href".to_string()),
            Token::AttributeValue("https://example.com".to_string()),
            Token::Text("Click here".to_string()),
            Token::TagClose("a".to_string()),
            Token::Eof,
        ];
        run_lexer_test(input, expected_tokens);
    }

    #[test]
    fn test_self_closing_tag() {
        let input = r#"<img src="image.png" />"#;
        let expected_tokens = vec![
            Token::SelfClosingTag("img".to_string()),
            Token::AttributeName("src".to_string()),
            Token::AttributeValue("image.png".to_string()),
            Token::Eof,
        ];
        run_lexer_test(input, expected_tokens);
    }

    #[test]
    fn test_mixed_text_and_tags() {
        let input = "<p>Hello <strong>World</strong>!</p>";
        let expected_tokens = vec![
            Token::TagOpen("p".to_string()),
            Token::Text("Hello ".to_string()),
            Token::TagOpen("strong".to_string()),
            Token::Text("World".to_string()),
            Token::TagClose("strong".to_string()),
            Token::Text("!".to_string()),
            Token::TagClose("p".to_string()),
            Token::Eof,
        ];
        run_lexer_test(input, expected_tokens);
    }

    #[test]
    fn test_multiple_attributes() {
        let input = r#"<input type="text" value="Hello" disabled />"#;
        let expected_tokens = vec![
            Token::SelfClosingTag("input".to_string()),
            Token::AttributeName("type".to_string()),
            Token::AttributeValue("text".to_string()),
            Token::AttributeName("value".to_string()),
            Token::AttributeValue("Hello".to_string()),
            Token::AttributeName("disabled".to_string()),
            Token::Eof,
        ];
        run_lexer_test(input, expected_tokens);
    }

    #[test]
    fn test_text_inside_nested_tags() {
        let input = "<div><p>Hello <span>beautiful</span> world!</p></div>";
        let expected_tokens = vec![
            Token::TagOpen("div".to_string()),
            Token::TagOpen("p".to_string()),
            Token::Text("Hello ".to_string()),
            Token::TagOpen("span".to_string()),
            Token::Text("beautiful".to_string()),
            Token::TagClose("span".to_string()),
            Token::Text("world!".to_string()),
            Token::TagClose("p".to_string()),
            Token::TagClose("div".to_string()),
            Token::Eof,
        ];
        run_lexer_test(input, expected_tokens);
    }

    #[test]
    fn test_malformed_html() {
        let input = "<div><p>Unclosed div";
        let expected_tokens = vec![
            Token::TagOpen("div".to_string()),
            Token::TagOpen("p".to_string()),
            Token::Text("Unclosed div".to_string()),
            Token::Eof,
        ];
        run_lexer_test(input, expected_tokens);
    }

    #[test]
    fn test_html_with_comments() {
        let input = "<p>Hello<!-- This is a comment -->World</p>";
        let expected_tokens = vec![
            Token::TagOpen("p".to_string()),
            Token::Text("Hello".to_string()),
            Token::Text("World".to_string()),
            Token::TagClose("p".to_string()),
            Token::Eof,
        ];
        run_lexer_test(input, expected_tokens);
    }

    #[test]
    fn test_script_tag_content() {
        let input = r#"<script>console.log("Hello World");</script>"#;
        let expected_tokens = vec![
            Token::TagOpen("script".to_string()),
            Token::Text("console.log(\"Hello World\");".to_string()),
            Token::TagClose("script".to_string()),
            Token::Eof,
        ];
        run_lexer_test(input, expected_tokens);
    }

    #[test]
    fn test_style_tag_content() {
        let input = r#"<style>body { color: red; }</style>"#;
        let expected_tokens = vec![
            Token::TagOpen("style".to_string()),
            Token::Text("body { color: red; }".to_string()),
            Token::TagClose("style".to_string()),
            Token::Eof,
        ];
        run_lexer_test(input, expected_tokens);
    }

    #[test]
    fn test_complex_html() {
        let input = r#"
            <html>
                <head>
                    <title>Test Page</title>
                    <meta charset="UTF-8" />
                </head>
                <body>
                    <h1>Welcome</h1>
                    <p>This is a <strong>test</strong>.</p>
                    <br />
                    <img src="logo.png" alt="Logo" />
                </body>
            </html>
        "#;
        let expected_tokens = vec![
            Token::TagOpen("html".to_string()),
            Token::TagOpen("head".to_string()),
            Token::TagOpen("title".to_string()),
            Token::Text("Test Page".to_string()),
            Token::TagClose("title".to_string()),
            Token::SelfClosingTag("meta".to_string()),
            Token::AttributeName("charset".to_string()),
            Token::AttributeValue("UTF-8".to_string()),
            Token::TagClose("head".to_string()),
            Token::TagOpen("body".to_string()),
            Token::TagOpen("h1".to_string()),
            Token::Text("Welcome".to_string()),
            Token::TagClose("h1".to_string()),
            Token::TagOpen("p".to_string()),
            Token::Text("This is a ".to_string()),
            Token::TagOpen("strong".to_string()),
            Token::Text("test".to_string()),
            Token::TagClose("strong".to_string()),
            Token::Text(".".to_string()),
            Token::TagClose("p".to_string()),
            Token::SelfClosingTag("br".to_string()),
            Token::SelfClosingTag("img".to_string()),
            Token::AttributeName("src".to_string()),
            Token::AttributeValue("logo.png".to_string()),
            Token::AttributeName("alt".to_string()),
            Token::AttributeValue("Logo".to_string()),
            Token::TagClose("body".to_string()),
            Token::TagClose("html".to_string()),
            Token::Eof,
        ];
        run_lexer_test(input, expected_tokens);
    }
}