//! html_forge 0.1.0-a
//!
//! A robust and efficient HTML parsing library for Rust.
//! Documentation
use crate::{
    dom::{Element, Node},
    errors::ParseError,
};

/// A cursor-based parser over an in-memory HTML string.
///
/// The parser owns its input and tracks how far it has read; each parsing
/// method consumes input starting at the current position.
#[derive(Debug, Clone)]
pub struct Parser {
    /// Complete source text being parsed.
    input: String,
    /// Byte offset into `input` of the next unread character.
    position: usize,
}

impl Parser {
    /// Creates a parser positioned at the start of `input`.
    pub fn new(input: String) -> Self {
        Parser { input, position: 0 }
    }

    /// Parses a single node starting at the current position.
    ///
    /// Leading whitespace is skipped. Input that follows the parsed node is
    /// left unconsumed.
    ///
    /// # Errors
    ///
    /// Returns a [`ParseError`] when the input is empty, truncated, or
    /// malformed. Malformed input never panics.
    pub fn parse(&mut self) -> Result<Node, ParseError> {
        self.parse_node()
    }

    /// Dispatches to the comment, element, or text parser based on the
    /// upcoming characters, after skipping leading whitespace.
    fn parse_node(&mut self) -> Result<Node, ParseError> {
        self.consume_whitespace();

        if self.eof() {
            return Err(ParseError::UnexpectedEOF);
        }

        // The comment check must come first: `<!--` also starts with `<`.
        if self.starts_with("<!--") {
            self.parse_comment()
        } else if self.starts_with("<") {
            self.parse_element()
        } else {
            self.parse_text()
        }
    }

    /// Parses `<!-- ... -->` and returns the text between the delimiters.
    ///
    /// # Errors
    ///
    /// Returns `ParseError::InvalidTag` when the opening `<!--` is missing or
    /// the comment is never closed with `-->`.
    fn parse_comment(&mut self) -> Result<Node, ParseError> {
        self.consume_string("<!--")?;
        let content = self.consume_while_sp(|parser| !parser.starts_with("-->"));
        self.consume_string("-->")?;
        Ok(Node::Comment(content))
    }

    /// Parses an element: either a self-closing tag (`<name attrs />`) or an
    /// opening tag, its children, and the matching closing tag.
    ///
    /// # Errors
    ///
    /// * `ParseError::UnexpectedEOF` if the input ends inside the element.
    /// * `ParseError::InvalidTag` if `<` or `>` is not where it is required.
    /// * `ParseError::MismatchedClosingTag` if the closing tag name differs
    ///   from the opening tag name.
    /// * Any error produced while parsing attributes or children.
    fn parse_element(&mut self) -> Result<Node, ParseError> {
        self.expect_char('<')?;

        let tag_name = self.parse_tag_name();
        let attributes = self.parse_attributes()?;

        // `parse_attributes` only stops in front of `>` or `/`.
        if self.starts_with("/>") {
            self.consume_string("/>")?;
            return Ok(Node::Element(Element::with_attributes(tag_name, attributes)));
        }
        // A lone `/` that is not part of `/>` is rejected here instead of
        // being silently swallowed as if it were `>`.
        self.expect_char('>')?;

        let mut element = Element::new(tag_name.clone());
        element.attributes = attributes;

        loop {
            self.consume_whitespace();
            if self.starts_with("</") {
                break;
            }
            if self.eof() {
                return Err(ParseError::UnexpectedEOF);
            }
            let child = self.parse_node()?;
            element.children.push(child);
        }

        self.consume_string("</")?;
        let closing_tag_name = self.parse_tag_name();
        if closing_tag_name != tag_name {
            return Err(ParseError::MismatchedClosingTag);
        }
        // Whitespace is permitted between the name and `>` in an end tag.
        self.consume_whitespace();
        self.expect_char('>')?;

        Ok(Node::Element(element))
    }

    /// Parses a text node: everything up to the next `<` or the end of input.
    fn parse_text(&mut self) -> Result<Node, ParseError> {
        let text = self.consume_while(|c| c != '<');
        Ok(Node::Text(text))
    }

    /// Consumes a run of alphanumeric characters and returns it.
    ///
    /// The result is empty when the next character is not alphanumeric.
    fn parse_tag_name(&mut self) -> String {
        self.consume_while(|c| c.is_alphanumeric())
    }

    /// Parses zero or more `name=value` pairs, stopping in front of `>` or `/`.
    ///
    /// # Errors
    ///
    /// * `ParseError::UnexpectedEOF` if the input ends before the tag does.
    /// * `ParseError::InvalidAttributeValue` if a name is not followed by `=`
    ///   and a quoted value.
    fn parse_attributes(&mut self) -> Result<Vec<(String, String)>, ParseError> {
        let mut attributes = Vec::new();
        loop {
            self.consume_whitespace();
            match self.next_char() {
                None => return Err(ParseError::UnexpectedEOF),
                Some('>') | Some('/') => break,
                Some(_) => {}
            }

            let name = self.parse_tag_name();
            // Exactly one character is consumed here on every path, so the
            // loop always makes progress even when `name` is empty.
            match self.consume_char() {
                Some('=') => {}
                Some(_) => return Err(ParseError::InvalidAttributeValue),
                None => return Err(ParseError::UnexpectedEOF),
            }
            let value = self.parse_attribute_value()?;
            attributes.push((name, value));
        }
        Ok(attributes)
    }

    /// Parses a single- or double-quoted attribute value and returns its
    /// content without the quotes.
    ///
    /// # Errors
    ///
    /// * `ParseError::InvalidAttributeValue` if the value does not start with
    ///   `"` or `'`.
    /// * `ParseError::UnexpectedEOF` if the input ends before the opening or
    ///   the closing quote.
    fn parse_attribute_value(&mut self) -> Result<String, ParseError> {
        let quote = self.consume_char().ok_or(ParseError::UnexpectedEOF)?;
        if quote != '"' && quote != '\'' {
            return Err(ParseError::InvalidAttributeValue);
        }
        let value = self.consume_while(|c| c != quote);
        // `consume_while` stopped either at the closing quote or at the end
        // of input; only the former is a complete value.
        self.consume_char().ok_or(ParseError::UnexpectedEOF)?;
        Ok(value)
    }

    /// Advances past any leading whitespace without allocating.
    fn consume_whitespace(&mut self) {
        let rest = &self.input[self.position..];
        self.position += rest.len() - rest.trim_start().len();
    }

    /// Consumes characters while `test` accepts them and returns the
    /// consumed text. Stops at the first rejected character or end of input.
    fn consume_while<F>(&mut self, test: F) -> String
    where
        F: Fn(char) -> bool,
    {
        let rest = &self.input[self.position..];
        let end = rest.find(|c: char| !test(c)).unwrap_or(rest.len());
        let result = rest[..end].to_string();
        self.position += end;
        result
    }

    /// Like [`Parser::consume_while`], but `test` inspects the whole parser
    /// state (for multi-character look-ahead) instead of a single character.
    fn consume_while_sp<F>(&mut self, mut test: F) -> String
    where
        F: FnMut(&Self) -> bool,
    {
        let start = self.position;
        while !self.eof() && test(&*self) {
            self.consume_char();
        }
        self.input[start..self.position].to_string()
    }

    /// Consumes and returns the next character, or `None` at end of input.
    ///
    /// The position advances by the character's full UTF-8 width so it
    /// always stays on a character boundary.
    fn consume_char(&mut self) -> Option<char> {
        let current = self.next_char()?;
        self.position += current.len_utf8();
        Some(current)
    }

    /// Consumes `expected` if it is the next character.
    ///
    /// # Errors
    ///
    /// * `ParseError::UnexpectedEOF` at end of input.
    /// * `ParseError::InvalidTag` if a different character is next; nothing
    ///   is consumed in that case.
    fn expect_char(&mut self, expected: char) -> Result<(), ParseError> {
        match self.next_char() {
            Some(found) if found == expected => {
                self.position += found.len_utf8();
                Ok(())
            }
            Some(_) => Err(ParseError::InvalidTag),
            None => Err(ParseError::UnexpectedEOF),
        }
    }

    /// Consumes the literal `s` if the remaining input starts with it.
    ///
    /// # Errors
    ///
    /// Returns `ParseError::InvalidTag` (consuming nothing) otherwise.
    fn consume_string(&mut self, s: &str) -> Result<(), ParseError> {
        if self.starts_with(s) {
            self.position += s.len();
            Ok(())
        } else {
            Err(ParseError::InvalidTag)
        }
    }

    /// Returns the next character without consuming it, or `None` at end of
    /// input.
    fn next_char(&self) -> Option<char> {
        self.input[self.position..].chars().next()
    }

    /// Reports whether the unread input begins with `s`.
    fn starts_with(&self, s: &str) -> bool {
        self.input[self.position..].starts_with(s)
    }

    /// Reports whether all input has been consumed.
    fn eof(&self) -> bool {
        self.position >= self.input.len()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dom::Node;
    use crate::errors::ParseError;

    /// Builds a parser over an owned copy of `source`.
    fn parser_for(source: &str) -> Parser {
        Parser::new(source.to_string())
    }

    /// An empty element parses to an element node with no children.
    #[test]
    fn test_parse_valid_single_element() {
        if let Ok(Node::Element(div)) = parser_for("<div></div>").parse() {
            assert_eq!(div.tag_name, "div");
            assert!(div.children.is_empty());
        } else {
            panic!("Expected a valid div element");
        }
    }

    /// A child element ends up in the parent's `children`.
    #[test]
    fn test_parse_valid_nested_elements() {
        if let Ok(Node::Element(div)) = parser_for("<div><p></p></div>").parse() {
            assert_eq!(div.tag_name, "div");
            assert_eq!(div.children.len(), 1);
            match &div.children[0] {
                Node::Element(paragraph) => {
                    assert_eq!(paragraph.tag_name, "p");
                    assert!(paragraph.children.is_empty());
                }
                _ => panic!("Expected a p element inside div"),
            }
        } else {
            panic!("Expected a valid nested element structure");
        }
    }

    /// Input without any markup parses to a single text node.
    #[test]
    fn test_parse_text_node() {
        if let Ok(Node::Text(text)) = parser_for("Hello, world!").parse() {
            assert_eq!(text, "Hello, world!");
        } else {
            panic!("Expected a text node");
        }
    }

    /// Closing tags in the wrong order are reported as a mismatch.
    #[test]
    fn test_mismatched_closing_tag() {
        let outcome = parser_for("<div><p></div></p>").parse();
        assert!(
            matches!(outcome, Err(ParseError::MismatchedClosingTag)),
            "Expected MismatchedClosingTag error"
        );
    }

    /// Parsing nothing at all is an unexpected end of input.
    #[test]
    fn test_empty_input() {
        let outcome = parser_for("").parse();
        assert!(
            matches!(outcome, Err(ParseError::UnexpectedEOF)),
            "Expected UnexpectedEOF error"
        );
    }

    /// Both quote styles are accepted and attribute order is preserved.
    #[test]
    fn test_parse_element_with_attributes() {
        let source = r#"<div class="container" id='main'></div>"#;
        if let Ok(Node::Element(div)) = parser_for(source).parse() {
            assert_eq!(div.tag_name, "div");
            assert_eq!(div.attributes.len(), 2);

            let class_attr = ("class".to_string(), "container".to_string());
            assert_eq!(div.attributes[0], class_attr);

            let id_attr = ("id".to_string(), "main".to_string());
            assert_eq!(div.attributes[1], id_attr);
        } else {
            panic!("Expected a div element with attributes");
        }
    }

    /// A `/>` terminator yields an element with attributes and no children.
    #[test]
    fn test_parse_self_closing_element() {
        if let Ok(Node::Element(img)) = parser_for("<img src='image.jpg' />").parse() {
            assert_eq!(img.tag_name, "img");
            assert_eq!(img.attributes.len(), 1);

            let src_attr = ("src".to_string(), "image.jpg".to_string());
            assert_eq!(img.attributes[0], src_attr);
            assert!(img.children.is_empty());
        } else {
            panic!("Expected a self-closing img element");
        }
    }

    /// The comment body excludes the `<!--` and `-->` delimiters.
    #[test]
    fn test_parse_comment_simple() {
        let mut parser = parser_for("<!-- This is a comment -->");
        if let Ok(Node::Comment(body)) = parser.parse_comment() {
            assert_eq!(body, " This is a comment ");
        } else {
            panic!("Expected a comment node");
        }
    }

    /// Input after the comment is left for the next parse call.
    #[test]
    fn test_parse_comment_with_extra_text() {
        let mut parser = parser_for("<!-- This is a comment --> Extra text");
        if let Ok(Node::Comment(body)) = parser.parse_comment() {
            assert_eq!(body, " This is a comment ");
        } else {
            panic!("Expected a comment node followed by extra text");
        }

        assert!(parser.starts_with(" Extra text"));
    }

    /// Attribute values must be quoted.
    #[test]
    fn test_invalid_attribute_value() {
        let outcome = parser_for("<div class=container></div>").parse();
        assert!(
            matches!(outcome, Err(ParseError::InvalidAttributeValue)),
            "Expected InvalidAttributeValue error"
        );
    }
}