html_simple_parser 0.1.1

A simple parser for HTML files that extracts tags, child tags, attributes, etc.
Documentation
use pest::iterators::Pair;
use pest::Parser;
use pest_derive::Parser;
use thiserror::Error;

/// Parser type whose implementation is generated by `pest_derive` from the
/// grammar file `./grammar.pest`.
///
/// The derive supplies `Grammar::parse` and the accompanying `Rule` enum that
/// the functions in this file match on.
#[derive(Parser)]
#[grammar = "./grammar.pest"]
pub struct Grammar;

/// Errors reported while turning HTML text into [`HtmlElem`] values.
#[derive(Error, Debug)]
pub enum HtmlParseError {
    /// The input did not match the grammar, or a pair with an unexpected rule
    /// was encountered while walking the parse result.
    #[error("Invalid HTML structure detected!")]
    ErrorHtmlStructure,
    /// An element's closing tag name differs from its opening tag name.
    ///
    /// The first field is the expected name (taken from the start tag), the
    /// second is the name actually found in the end tag.
    #[error("The tag is closed with a different tag than expected. Expected: {0}, found: {1}")]
    MismatchedClosingTag(String, String),
}
/// A node of the tree produced by [`parse_html`] / [`parse_elem`].
///
/// Derives `Clone`, `PartialEq` and `Eq` in addition to `Debug` so that parsed
/// trees can be duplicated and compared (e.g. in tests) by callers.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HtmlElem {
    /// An element built from an `elem` or `self_closed_tag` pair.
    Tag {
        /// For an `elem`, the text of the first inner pair of its start tag.
        /// For a `self_closed_tag`, the full matched text of that pair.
        tag_name: String,
        /// Child nodes in the order they were encountered; always empty for a
        /// `self_closed_tag`.
        children: Vec<HtmlElem>,
    },
    /// The matched text of a `text` pair, copied verbatim.
    Text(String),
    /// The matched text of a top-level `declaration` pair, copied verbatim.
    Documentation(String),
}

/// Parses `input` with the `document` rule and returns its top-level nodes.
///
/// Inside each `document` pair, `elem` and `self_closed_tag` pairs are
/// converted through [`parse_elem`], `declaration` pairs are stored verbatim
/// as [`HtmlElem::Documentation`], and pairs of any other rule are skipped.
///
/// # Errors
///
/// Returns [`HtmlParseError::ErrorHtmlStructure`] when the grammar rejects the
/// input or when a top-level pair is not a `document`. Any error produced by
/// [`parse_elem`] is propagated unchanged.
pub fn parse_html(input: &str) -> Result<Vec<HtmlElem>, HtmlParseError> {
    // The underlying pest error is intentionally collapsed into one variant.
    let top_level = match Grammar::parse(Rule::document, input) {
        Ok(pairs) => pairs,
        Err(_) => return Err(HtmlParseError::ErrorHtmlStructure),
    };

    let mut nodes = Vec::new();

    for root in top_level {
        if !matches!(root.as_rule(), Rule::document) {
            return Err(HtmlParseError::ErrorHtmlStructure);
        }

        for item in root.into_inner() {
            let node = match item.as_rule() {
                Rule::elem | Rule::self_closed_tag => parse_elem(&item)?,
                Rule::declaration => HtmlElem::Documentation(item.as_str().to_string()),
                // Every other rule at document level is ignored.
                _ => continue,
            };
            nodes.push(node);
        }
    }

    Ok(nodes)
}

/// Converts a single parsed pair into an [`HtmlElem`].
///
/// Supported rules:
///
/// * `elem` — the first inner pair is treated as the start tag and *its*
///   first inner pair as the tag name. The remaining inner pairs are walked in
///   order: `elem`, `self_closed_tag` and `text` pairs become children
///   (recursively), and an `end_tag` pair is checked against the tag name.
/// * `text` — becomes [`HtmlElem::Text`] holding the matched text.
/// * `self_closed_tag` — becomes a childless [`HtmlElem::Tag`].
///
/// # Errors
///
/// * [`HtmlParseError::MismatchedClosingTag`] when an `end_tag` name differs
///   from the start tag name.
/// * [`HtmlParseError::ErrorHtmlStructure`] when `pair` (or one of an
///   element's inner pairs) has an unsupported rule, or when an expected inner
///   pair (start tag, tag name) is missing. The missing-pair case is reported
///   as an error instead of panicking.
pub fn parse_elem(pair: &Pair<Rule>) -> Result<HtmlElem, HtmlParseError> {
    match pair.as_rule() {
        Rule::elem => {
            // `into_inner` takes the pair by value, so work on a clone of the
            // borrowed handle.
            let mut inner_pairs = pair.clone().into_inner();

            // Start tag -> first inner pair -> its text is the tag name.
            let tag_name = inner_pairs
                .next()
                .and_then(|start_tag| start_tag.into_inner().next())
                .ok_or(HtmlParseError::ErrorHtmlStructure)?
                .as_str()
                .to_string();

            let mut children = Vec::new();

            for child in inner_pairs {
                match child.as_rule() {
                    // All three child kinds are handled by the recursive call.
                    Rule::elem | Rule::self_closed_tag | Rule::text => {
                        children.push(parse_elem(&child)?);
                    }
                    Rule::end_tag => {
                        let end_tag_name = child
                            .into_inner()
                            .next()
                            .ok_or(HtmlParseError::ErrorHtmlStructure)?
                            .as_str();
                        if tag_name != end_tag_name {
                            return Err(HtmlParseError::MismatchedClosingTag(
                                tag_name,
                                end_tag_name.to_string(),
                            ));
                        }
                    }
                    _ => return Err(HtmlParseError::ErrorHtmlStructure),
                }
            }

            Ok(HtmlElem::Tag { tag_name, children })
        }
        Rule::text => Ok(HtmlElem::Text(pair.as_str().to_string())),
        Rule::self_closed_tag => {
            // NOTE(review): unlike `elem`, the name stored here is the full
            // matched text of the pair, not an inner name pair — confirm
            // against the grammar whether that is intended.
            Ok(HtmlElem::Tag {
                tag_name: pair.as_str().to_string(),
                children: Vec::new(),
            })
        }
        _ => Err(HtmlParseError::ErrorHtmlStructure),
    }
}