use crate::token::Token;
use super::lexer_trait::LexerTrait;
/// Character-level lexer that turns markup text into a flat list of [`Token`]s.
///
/// The input is stored as a vector of `char`s so the cursor can be moved
/// forwards and backwards by whole characters instead of by UTF-8 bytes.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The complete input, one element per character.
    input: Vec<char>,
    /// Index into `input` of the next character to be consumed.
    position: usize,
}
impl Lexer {
    /// Returns the character under the cursor without consuming it, or `None`
    /// when the cursor is at the end of the input.
    fn peek_char(&self) -> Option<char> {
        self.input.get(self.position).copied()
    }

    /// Returns `true` for characters accepted inside tag and attribute names:
    /// alphanumerics plus `-`, `_`, `:` and `.` (e.g. `my-element`,
    /// `data-id`, `xlink:href`).
    fn is_name_char(ch: char) -> bool {
        ch.is_alphanumeric() || matches!(ch, '-' | '_' | ':' | '.')
    }

    /// Consumes and returns the character under the cursor, or returns `None`
    /// (leaving the cursor untouched) at the end of the input.
    fn next_char(&mut self) -> Option<char> {
        let ch = self.peek_char()?;
        self.position += 1;
        Some(ch)
    }

    /// Reads the run of name characters starting at the cursor and returns it.
    ///
    /// The cursor is left on the first character that is not part of the name
    /// (or at the end of the input). The result is empty when the cursor does
    /// not start on a name character.
    fn read_tag_name(&mut self) -> String {
        let start = self.position;
        while matches!(self.peek_char(), Some(ch) if Self::is_name_char(ch)) {
            self.position += 1;
        }
        self.input[start..self.position].iter().collect()
    }

    /// Reads an attribute value; the cursor is expected to be just past `=`.
    ///
    /// Whitespace between `=` and the value is skipped. Three forms are
    /// accepted:
    ///
    /// * `"..."` or `'...'` — everything up to the matching quote is returned
    ///   and the closing quote is consumed; an unterminated value runs to the
    ///   end of the input.
    /// * unquoted — everything up to the next whitespace or `>` is returned
    ///   and that terminator is left unconsumed.
    ///
    /// Returns an empty string when no value is present.
    fn read_attribute_value(&mut self) -> String {
        while matches!(self.peek_char(), Some(ch) if ch.is_whitespace()) {
            self.position += 1;
        }

        let mut value = String::new();
        match self.peek_char() {
            Some(quote) if quote == '"' || quote == '\'' => {
                // Step over the opening quote, then copy up to the matching one.
                self.position += 1;
                while let Some(ch) = self.next_char() {
                    if ch == quote {
                        break;
                    }
                    value.push(ch);
                }
            }
            _ => {
                // Unquoted value: stop in front of the terminator so the caller
                // still sees the whitespace or `>` that ended it.
                while let Some(ch) = self.peek_char() {
                    if ch.is_whitespace() || ch == '>' {
                        break;
                    }
                    value.push(ch);
                    self.position += 1;
                }
            }
        }
        value
    }

    /// Reads an attribute name starting at the cursor.
    ///
    /// Returns `Some(Token::AttributeName(..))` when at least one name
    /// character was read, whatever follows it (`=`, whitespace, `>`, `/` or
    /// the end of the input), so value-less attributes such as `disabled` in
    /// `<input disabled>` are kept. Returns `None` when the cursor does not
    /// start on a name character. In every case the cursor is left on the
    /// first character that is not part of the name.
    fn read_attribute_name(&mut self) -> Option<Token> {
        let mut name = String::new();
        while let Some(ch) = self.peek_char() {
            if !Self::is_name_char(ch) {
                break;
            }
            name.push(ch);
            self.position += 1;
        }

        if name.is_empty() {
            None
        } else {
            Some(Token::AttributeName(name))
        }
    }
}
impl LexerTrait for Lexer {
    /// Creates a lexer over `input` with the cursor on its first character.
    fn new(input: &str) -> Self {
        Self {
            input: input.chars().collect(),
            position: 0,
        }
    }

    /// Consumes the remaining input and returns its tokens, always ending
    /// with [`Token::Eof`].
    ///
    /// * `<name ...>` produces `TagOpen(name)` and `<name ... />` produces
    ///   `SelfClosingTag(name)`, each followed by the tag's `AttributeName` /
    ///   `AttributeValue` tokens in source order.
    /// * `</name>` produces `TagClose(name)`.
    /// * `<!-- ... -->` comments and other `<! ... >` declarations (such as a
    ///   doctype) produce no tokens.
    /// * Any other run of characters up to the next `<` produces `Text`.
    ///   Whitespace, `>` and `/` found between tokens are skipped, so text
    ///   never starts with them; trailing whitespace in text is preserved.
    ///
    /// A `<` that is the last character of the input is dropped.
    fn tokenize(&mut self) -> Vec<Token> {
        let mut tokens = Vec::new();
        while let Some(ch) = self.next_char() {
            match ch {
                '<' => match self.next_char() {
                    Some('!') => self.skip_markup_declaration(),
                    Some('/') => tokens.push(Token::TagClose(self.read_tag_name())),
                    Some(_) => {
                        // Hand the first character of the tag name back.
                        self.position -= 1;
                        self.lex_start_tag(&mut tokens);
                    }
                    None => {}
                },
                // Leftovers of a tag that was just lexed (`</p>` leaves `>`).
                '>' | '/' => {}
                // Covers '\r' and friends too, so CRLF input does not yield
                // whitespace-only text tokens.
                c if c.is_whitespace() => {}
                _ => {
                    let mut text = ch.to_string();
                    while let Some(next) = self.next_char() {
                        if next == '<' {
                            // Leave the `<` for the next loop iteration.
                            self.position -= 1;
                            break;
                        }
                        text.push(next);
                    }
                    tokens.push(Token::Text(text));
                }
            }
        }
        tokens.push(Token::Eof);
        tokens
    }
}

impl Lexer {
    /// Skips a `<!...` construct; the cursor must be just past `<!`.
    ///
    /// A comment (`<!-- ... -->`) is skipped through its closing `-->`; any
    /// other declaration is skipped through the next `>`. When the terminator
    /// is missing, the rest of the input is skipped.
    fn skip_markup_declaration(&mut self) {
        let rest = &self.input[self.position..];
        let consumed = if rest.starts_with(&['-', '-']) {
            // Search one character at a time so the terminator is found
            // regardless of how long the comment text is.
            rest.windows(3)
                .position(|w| w == ['-', '-', '>'])
                .map(|i| i + 3)
        } else {
            rest.iter().position(|&c| c == '>').map(|i| i + 1)
        };
        self.position = match consumed {
            Some(len) => self.position + len,
            None => self.input.len(),
        };
    }

    /// Lexes an opening or self-closing tag and appends its tokens to
    /// `tokens`; the cursor must be on the first character after `<`.
    ///
    /// The tag name is everything up to the first whitespace, `/`, `>` or
    /// `=`. The tag token is pushed first, followed by its attribute tokens.
    /// The terminating `>` (or the `/` of `/>`) is consumed; a tag cut off by
    /// the end of the input is still emitted.
    fn lex_start_tag(&mut self, tokens: &mut Vec<Token>) {
        let name_start = self.position;
        while let Some(&c) = self.input.get(self.position) {
            if c.is_whitespace() || matches!(c, '/' | '>' | '=') {
                break;
            }
            self.position += 1;
        }
        let name: String = self.input[name_start..self.position].iter().collect();

        let mut attributes = Vec::new();
        let mut is_self_closing = false;
        while let Some(c) = self.next_char() {
            match c {
                '>' => break,
                '/' => {
                    is_self_closing = true;
                    break;
                }
                '=' => attributes.push(Token::AttributeValue(self.read_attribute_value())),
                c if c.is_whitespace() => {
                    // Collapse a run of whitespace so the name reader starts
                    // on the attribute itself.
                    while matches!(self.input.get(self.position), Some(w) if w.is_whitespace()) {
                        self.position += 1;
                    }
                    match self.read_attribute_name() {
                        // Never emit a nameless attribute.
                        Some(Token::AttributeName(ref attr)) if attr.is_empty() => {}
                        Some(attr) => attributes.push(attr),
                        None => {}
                    }
                }
                // Stray characters inside the tag are ignored.
                _ => {}
            }
        }

        tokens.push(if is_self_closing {
            Token::SelfClosingTag(name)
        } else {
            Token::TagOpen(name)
        });
        tokens.extend(attributes);
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::token::Token;

    // Shorthand constructors so the expected token lists read like the markup
    // they describe.

    fn open(name: &str) -> Token {
        Token::TagOpen(name.to_string())
    }

    fn close(name: &str) -> Token {
        Token::TagClose(name.to_string())
    }

    fn self_closing(name: &str) -> Token {
        Token::SelfClosingTag(name.to_string())
    }

    fn attr(name: &str) -> Token {
        Token::AttributeName(name.to_string())
    }

    fn value(content: &str) -> Token {
        Token::AttributeValue(content.to_string())
    }

    fn text(content: &str) -> Token {
        Token::Text(content.to_string())
    }

    /// Lexes `input` and asserts that the produced tokens equal
    /// `expected_tokens`.
    fn run_lexer_test(input: &str, expected_tokens: Vec<Token>) {
        let actual = Lexer::new(input).tokenize();
        assert_eq!(actual, expected_tokens);
    }

    /// A single element wrapping plain text.
    #[test]
    fn test_simple_text() {
        run_lexer_test(
            "<p>Hello World</p>",
            vec![open("p"), text("Hello World"), close("p"), Token::Eof],
        );
    }

    /// Sibling elements nested inside a parent element.
    #[test]
    fn test_nested_tags() {
        run_lexer_test(
            "<div><h1>Title</h1><p>Paragraph</p></div>",
            vec![
                open("div"),
                open("h1"),
                text("Title"),
                close("h1"),
                open("p"),
                text("Paragraph"),
                close("p"),
                close("div"),
                Token::Eof,
            ],
        );
    }

    /// One quoted attribute on an opening tag.
    #[test]
    fn test_attributes() {
        run_lexer_test(
            r#"<a href="https://example.com">Click here</a>"#,
            vec![
                open("a"),
                attr("href"),
                value("https://example.com"),
                text("Click here"),
                close("a"),
                Token::Eof,
            ],
        );
    }

    /// A self-closing tag carrying one attribute.
    #[test]
    fn test_self_closing_tag() {
        run_lexer_test(
            r#"<img src="image.png" />"#,
            vec![
                self_closing("img"),
                attr("src"),
                value("image.png"),
                Token::Eof,
            ],
        );
    }

    /// Text on both sides of an inline element.
    #[test]
    fn test_mixed_text_and_tags() {
        run_lexer_test(
            "<p>Hello <strong>World</strong>!</p>",
            vec![
                open("p"),
                text("Hello "),
                open("strong"),
                text("World"),
                close("strong"),
                text("!"),
                close("p"),
                Token::Eof,
            ],
        );
    }

    /// Several attributes, the last one without a value.
    #[test]
    fn test_multiple_attributes() {
        run_lexer_test(
            r#"<input type="text" value="Hello" disabled />"#,
            vec![
                self_closing("input"),
                attr("type"),
                value("text"),
                attr("value"),
                value("Hello"),
                attr("disabled"),
                Token::Eof,
            ],
        );
    }

    /// Text after a closing tag loses its leading whitespace.
    #[test]
    fn test_text_inside_nested_tags() {
        run_lexer_test(
            "<div><p>Hello <span>beautiful</span> world!</p></div>",
            vec![
                open("div"),
                open("p"),
                text("Hello "),
                open("span"),
                text("beautiful"),
                close("span"),
                text("world!"),
                close("p"),
                close("div"),
                Token::Eof,
            ],
        );
    }

    /// Missing closing tags are not an error for the lexer.
    #[test]
    fn test_malformed_html() {
        run_lexer_test(
            "<div><p>Unclosed div",
            vec![open("div"), open("p"), text("Unclosed div"), Token::Eof],
        );
    }

    /// Comments produce no tokens and split the surrounding text.
    #[test]
    fn test_html_with_comments() {
        run_lexer_test(
            "<p>Hello<!-- This is a comment -->World</p>",
            vec![
                open("p"),
                text("Hello"),
                text("World"),
                close("p"),
                Token::Eof,
            ],
        );
    }

    /// Script content is lexed as one text token.
    #[test]
    fn test_script_tag_content() {
        run_lexer_test(
            r#"<script>console.log("Hello World");</script>"#,
            vec![
                open("script"),
                text("console.log(\"Hello World\");"),
                close("script"),
                Token::Eof,
            ],
        );
    }

    /// Style content is lexed as one text token.
    #[test]
    fn test_style_tag_content() {
        run_lexer_test(
            r#"<style>body { color: red; }</style>"#,
            vec![
                open("style"),
                text("body { color: red; }"),
                close("style"),
                Token::Eof,
            ],
        );
    }

    /// A small multi-line document combining all of the above.
    #[test]
    fn test_complex_html() {
        let input = r#"
<html>
<head>
<title>Test Page</title>
<meta charset="UTF-8" />
</head>
<body>
<h1>Welcome</h1>
<p>This is a <strong>test</strong>.</p>
<br />
<img src="logo.png" alt="Logo" />
</body>
</html>
"#;
        run_lexer_test(
            input,
            vec![
                open("html"),
                open("head"),
                open("title"),
                text("Test Page"),
                close("title"),
                self_closing("meta"),
                attr("charset"),
                value("UTF-8"),
                close("head"),
                open("body"),
                open("h1"),
                text("Welcome"),
                close("h1"),
                open("p"),
                text("This is a "),
                open("strong"),
                text("test"),
                close("strong"),
                text("."),
                close("p"),
                self_closing("br"),
                self_closing("img"),
                attr("src"),
                value("logo.png"),
                attr("alt"),
                value("Logo"),
                close("body"),
                close("html"),
                Token::Eof,
            ],
        );
    }
}