// parse-html 0.4.1
//
// A simple Rust project to parse HTML.
// Documentation
use crate::token::Token;

use super::lexer_trait::LexerTrait;

/// Character-level cursor over an HTML source string.
///
/// The source is stored as individual `char`s so it can be indexed directly, and
/// `position` is the index of the next character that will be read.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The source text, one entry per Unicode scalar value.
    input: Vec<char>,
    /// Index into `input` of the next unread character.
    position: usize,
}

impl Lexer {
    /// Consumes and returns the character under the cursor, or `None` once the
    /// input is exhausted.
    fn next_char(&mut self) -> Option<char> {
        let ch = self.peek_char()?;
        self.position += 1;
        Some(ch)
    }

    /// Returns the character under the cursor without consuming it.
    fn peek_char(&self) -> Option<char> {
        self.input.get(self.position).copied()
    }

    /// Returns `true` for characters accepted inside tag and attribute names.
    ///
    /// Besides alphanumerics this admits `-`, `_`, `:` and `.`, so names such as
    /// `my-widget`, `data_id` or `xlink:href` are read as a single unit.
    fn is_name_char(ch: char) -> bool {
        ch.is_alphanumeric() || matches!(ch, '-' | '_' | ':' | '.')
    }

    /// Consumes the longest run of name characters starting at the cursor.
    ///
    /// The first character that is not part of a name is left unread. The result
    /// is empty when the cursor is not on a name character (or is at end of input).
    fn read_name(&mut self) -> String {
        let mut name = String::new();
        while let Some(ch) = self.peek_char() {
            if !Self::is_name_char(ch) {
                break;
            }
            name.push(ch);
            self.position += 1;
        }
        name
    }

    /// Reads a tag name starting at the cursor.
    ///
    /// Hyphenated names (`my-widget`) are returned whole rather than being cut at
    /// the first `-`. The terminating character (for example `>` or a space) is
    /// not consumed.
    fn read_tag_name(&mut self) -> String {
        self.read_name()
    }

    /// Reads an attribute value; the cursor is expected to be just past the `=`.
    ///
    /// Three forms are understood:
    /// * `"double quoted"` and `'single quoted'` — everything up to the matching
    ///   quote is returned and the closing quote is consumed. An unterminated
    ///   quoted value runs to the end of the input.
    /// * unquoted — characters up to (not including) the next ASCII whitespace or
    ///   `>` are returned, so the caller still sees the tag terminator.
    ///
    /// ASCII whitespace between `=` and the value is skipped.
    fn read_attribute_value(&mut self) -> String {
        while let Some(ch) = self.peek_char() {
            if !ch.is_ascii_whitespace() {
                break;
            }
            self.position += 1;
        }

        let quote = match self.peek_char() {
            Some(q) if q == '"' || q == '\'' => Some(q),
            _ => None,
        };

        let mut value = String::new();
        match quote {
            Some(q) => {
                // Step over the opening quote, then read through the closing one.
                self.position += 1;
                while let Some(ch) = self.next_char() {
                    if ch == q {
                        break;
                    }
                    value.push(ch);
                }
            }
            None => {
                // Unquoted value: stop before whitespace or `>` instead of hunting
                // for a quote that may belong to a completely different attribute.
                while let Some(ch) = self.peek_char() {
                    if ch.is_ascii_whitespace() || ch == '>' {
                        break;
                    }
                    value.push(ch);
                    self.position += 1;
                }
            }
        }
        value
    }

    /// Reads an attribute name starting at the cursor.
    ///
    /// Returns `Some(Token::AttributeName(..))` whenever at least one name
    /// character was read, whatever follows it (`=`, whitespace, `>`, `/` or end
    /// of input), so boolean attributes such as `<input disabled>` are kept.
    /// Returns `None` when no name starts at the cursor, so no empty attribute
    /// name is ever produced. The terminating character is not consumed.
    fn read_attribute_name(&mut self) -> Option<Token> {
        let name = self.read_name();
        if name.is_empty() {
            None
        } else {
            Some(Token::AttributeName(name))
        }
    }
}

impl LexerTrait for Lexer {
    /// Creates a lexer positioned at the first character of `input`.
    fn new(input: &str) -> Self {
        Self {
            input: input.chars().collect(),
            position: 0,
        }
    }

    /// Converts the remaining input into a flat token stream terminated by
    /// `Token::Eof`.
    ///
    /// * `<name …>` yields `Token::TagOpen(name)` and `<name … />` yields
    ///   `Token::SelfClosingTag(name)`; either is followed by that tag's
    ///   `AttributeName` / `AttributeValue` tokens in source order.
    /// * `</name>` yields `Token::TagClose(name)`.
    /// * `<!-- … -->` comments and other `<!…>` declarations (such as a doctype)
    ///   produce no tokens.
    /// * Anything else is text: leading ASCII whitespace is skipped, then every
    ///   character up to the next `<` (or end of input) forms one `Token::Text`.
    fn tokenize(&mut self) -> Vec<Token> {
        let mut tokens = Vec::new();

        while let Some(ch) = self.next_char() {
            match ch {
                '<' => match self.next_char() {
                    // A lone `<` at the very end of the input produces nothing.
                    None => {}
                    Some('!') => self.skip_markup_declaration(),
                    Some('/') => {
                        tokens.push(Token::TagClose(self.read_tag_name()));
                        // Consume the rest of the closing tag here so that text
                        // beginning with `>` or `/` is not mistaken for it later.
                        self.skip_past_tag_end();
                    }
                    Some(_) => {
                        // Put the first character of the tag name back.
                        self.position -= 1;
                        let name_start = self.position;
                        let mut attributes = Vec::new();
                        let mut is_self_closing = false;
                        let mut self_tag_name = String::new();

                        while let Some(ch1) = self.next_char() {
                            match ch1 {
                                '>' => break,
                                '/' => {
                                    is_self_closing = true;
                                    self.skip_past_tag_end();
                                    break;
                                }
                                '=' => {
                                    let value = self.read_attribute_value();
                                    attributes.push(Token::AttributeValue(value));
                                }
                                // Any ASCII whitespace (not only a space) separates
                                // attributes, so tags may span several lines.
                                c if c.is_ascii_whitespace() => {
                                    match self.read_attribute_name() {
                                        // Runs of whitespace must not produce
                                        // attributes with an empty name.
                                        Some(Token::AttributeName(ref name))
                                            if name.is_empty() => {}
                                        Some(attr) => attributes.push(attr),
                                        None => {}
                                    }
                                }
                                other => self_tag_name.push(other),
                            }
                        }

                        let tag_end = self.position;
                        if is_self_closing {
                            tokens.push(Token::SelfClosingTag(self_tag_name));
                        } else {
                            // Re-read the name from the start of the tag so it is
                            // extracted by the same routine used for closing tags.
                            self.position = name_start;
                            tokens.push(Token::TagOpen(self.read_tag_name()));
                        }
                        tokens.extend(attributes);
                        self.position = tag_end;
                    }
                },
                // Whitespace between tokens (including `\r` from CRLF input) is
                // not significant on its own.
                c if c.is_ascii_whitespace() => {}
                first => {
                    let rest = &self.input[self.position..];
                    let len = rest
                        .iter()
                        .position(|&c| c == '<')
                        .unwrap_or(rest.len());
                    let mut text = String::with_capacity(len + 1);
                    text.push(first);
                    text.extend(rest[..len].iter());
                    self.position += len;
                    tokens.push(Token::Text(text));
                }
            }
        }

        tokens.push(Token::Eof);
        tokens
    }
}

impl Lexer {
    /// Skips the remainder of a `<!…` construct; the cursor is just past `<!`.
    ///
    /// If the construct is a comment (`<!--`), everything through the first `-->`
    /// is consumed, wherever it falls. Any other declaration is consumed through
    /// its closing `>`. An unterminated construct consumes the rest of the input.
    fn skip_markup_declaration(&mut self) {
        let rest = &self.input[self.position..];
        let terminator_end = if rest.starts_with(&['-', '-']) {
            rest.windows(3)
                .position(|w| w == ['-', '-', '>'])
                .map(|offset| offset + 3)
        } else {
            rest.iter().position(|&c| c == '>').map(|offset| offset + 1)
        };
        let consumed = terminator_end.unwrap_or(rest.len());
        self.position += consumed;
    }

    /// Consumes the rest of the current tag, through its closing `>`.
    ///
    /// Stops early, without consuming it, at a `<` so that a tag missing its `>`
    /// cannot swallow the markup that follows.
    fn skip_past_tag_end(&mut self) {
        while let Some(ch) = self.next_char() {
            match ch {
                '>' => break,
                '<' => {
                    self.position -= 1;
                    break;
                }
                _ => {}
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use crate::token::Token;

    use super::*;

    /// Lexes `input` and asserts that the resulting token stream equals
    /// `expected_tokens`.
    fn run_lexer_test(input: &str, expected_tokens: Vec<Token>) {
        assert_eq!(Lexer::new(input).tokenize(), expected_tokens);
    }

    // Small constructors that keep the expected token lists readable.

    /// Builds a `Token::TagOpen` for `name`.
    fn open(name: &str) -> Token {
        Token::TagOpen(name.to_string())
    }

    /// Builds a `Token::TagClose` for `name`.
    fn close(name: &str) -> Token {
        Token::TagClose(name.to_string())
    }

    /// Builds a `Token::SelfClosingTag` for `name`.
    fn self_closing(name: &str) -> Token {
        Token::SelfClosingTag(name.to_string())
    }

    /// Builds a `Token::AttributeName` for `name`.
    fn attr(name: &str) -> Token {
        Token::AttributeName(name.to_string())
    }

    /// Builds a `Token::AttributeValue` for `value`.
    fn value(value: &str) -> Token {
        Token::AttributeValue(value.to_string())
    }

    /// Builds a `Token::Text` for `content`.
    fn text(content: &str) -> Token {
        Token::Text(content.to_string())
    }

    /// A single element wrapping plain text.
    #[test]
    fn test_simple_text() {
        run_lexer_test(
            "<p>Hello World</p>",
            vec![open("p"), text("Hello World"), close("p"), Token::Eof],
        );
    }

    /// Sibling elements nested inside a parent.
    #[test]
    fn test_nested_tags() {
        run_lexer_test(
            "<div><h1>Title</h1><p>Paragraph</p></div>",
            vec![
                open("div"),
                open("h1"),
                text("Title"),
                close("h1"),
                open("p"),
                text("Paragraph"),
                close("p"),
                close("div"),
                Token::Eof,
            ],
        );
    }

    /// An opening tag carrying one quoted attribute.
    #[test]
    fn test_attributes() {
        run_lexer_test(
            r#"<a href="https://example.com">Click here</a>"#,
            vec![
                open("a"),
                attr("href"),
                value("https://example.com"),
                text("Click here"),
                close("a"),
                Token::Eof,
            ],
        );
    }

    /// A self-closing tag with one attribute.
    #[test]
    fn test_self_closing_tag() {
        run_lexer_test(
            r#"<img src="image.png" />"#,
            vec![
                self_closing("img"),
                attr("src"),
                value("image.png"),
                Token::Eof,
            ],
        );
    }

    /// Text on both sides of an inline element.
    #[test]
    fn test_mixed_text_and_tags() {
        run_lexer_test(
            "<p>Hello <strong>World</strong>!</p>",
            vec![
                open("p"),
                text("Hello "),
                open("strong"),
                text("World"),
                close("strong"),
                text("!"),
                close("p"),
                Token::Eof,
            ],
        );
    }

    /// Several attributes, the last one without a value.
    #[test]
    fn test_multiple_attributes() {
        run_lexer_test(
            r#"<input type="text" value="Hello" disabled />"#,
            vec![
                self_closing("input"),
                attr("type"),
                value("text"),
                attr("value"),
                value("Hello"),
                attr("disabled"),
                Token::Eof,
            ],
        );
    }

    /// Text following a nested closing tag loses its leading space.
    #[test]
    fn test_text_inside_nested_tags() {
        run_lexer_test(
            "<div><p>Hello <span>beautiful</span> world!</p></div>",
            vec![
                open("div"),
                open("p"),
                text("Hello "),
                open("span"),
                text("beautiful"),
                close("span"),
                text("world!"),
                close("p"),
                close("div"),
                Token::Eof,
            ],
        );
    }

    /// Input that ends without closing its elements still lexes to `Eof`.
    #[test]
    fn test_malformed_html() {
        run_lexer_test(
            "<div><p>Unclosed div",
            vec![open("div"), open("p"), text("Unclosed div"), Token::Eof],
        );
    }

    /// A comment splits the surrounding text but emits no token of its own.
    #[test]
    fn test_html_with_comments() {
        run_lexer_test(
            "<p>Hello<!-- This is a comment -->World</p>",
            vec![
                open("p"),
                text("Hello"),
                text("World"),
                close("p"),
                Token::Eof,
            ],
        );
    }

    /// Script content is reported as a single text token.
    #[test]
    fn test_script_tag_content() {
        run_lexer_test(
            r#"<script>console.log("Hello World");</script>"#,
            vec![
                open("script"),
                text("console.log(\"Hello World\");"),
                close("script"),
                Token::Eof,
            ],
        );
    }

    /// Style content is reported as a single text token.
    #[test]
    fn test_style_tag_content() {
        run_lexer_test(
            r#"<style>body { color: red; }</style>"#,
            vec![
                open("style"),
                text("body { color: red; }"),
                close("style"),
                Token::Eof,
            ],
        );
    }

    /// A small multi-line document mixing every construct covered above.
    #[test]
    fn test_complex_html() {
        let document = r#"
        <html>
            <head>
                <title>Test Page</title>
                <meta charset="UTF-8" />
            </head>
            <body>
                <h1>Welcome</h1>
                <p>This is a <strong>test</strong>.</p>
                <br />
                <img src="logo.png" alt="Logo" />
            </body>
        </html>
    "#;

        let head = vec![
            open("head"),
            open("title"),
            text("Test Page"),
            close("title"),
            self_closing("meta"),
            attr("charset"),
            value("UTF-8"),
            close("head"),
        ];
        let body = vec![
            open("body"),
            open("h1"),
            text("Welcome"),
            close("h1"),
            open("p"),
            text("This is a "),
            open("strong"),
            text("test"),
            close("strong"),
            text("."),
            close("p"),
            self_closing("br"),
            self_closing("img"),
            attr("src"),
            value("logo.png"),
            attr("alt"),
            value("Logo"),
            close("body"),
        ];

        let mut expected = vec![open("html")];
        expected.extend(head);
        expected.extend(body);
        expected.push(close("html"));
        expected.push(Token::Eof);

        run_lexer_test(document, expected);
    }
}