html_simple_parser 0.1.1

A simple parser for HTML files that extracts tags, child tags, attributes, etc.
Documentation
// Blank-space rules. All four are silent (`_`), so they never appear in the
// parse result.
//
// `WHITESPACE` is special to pest: when a rule with this name exists, it is
// implicitly allowed between the parts of every `~` sequence and between
// repetitions inside non-atomic rules. Only the space character is skipped this
// way; tabs and line breaks must be matched explicitly via the rules below.
WHITESPACE = _{ " " }

// Line break, Unix or Windows style. The two alternatives cannot shadow each
// other because neither is a prefix of the other.
NEWLINE = _{ "\r\n" | "\n" }

// A single horizontal tab.
TAB = _{ "\t" }

// Any one blank character: space, tab or line break.
WHITESPACE_ALL = _{ NEWLINE | TAB | WHITESPACE }

/// Entry rule: matches the whole input, from start (`SOI`) to end (`EOI`).
///
/// The input may begin with blank space and an optional `declaration`, followed
/// by any number of top-level elements or self-closing tags. Spaces, tabs and
/// line breaks are accepted after every top-level node, so nodes may sit on
/// separate lines and a file ending with a newline still parses.
///
/// # Examples
/// ```text
/// <!DOCTYPE html><html><br/></html>
/// ```
document = {
    SOI
    ~ WHITESPACE_ALL* ~ declaration?
    ~ WHITESPACE_ALL* ~ ((elem | self_closed_tag) ~ WHITESPACE_ALL*)*
    ~ EOI
}

/// Document type declaration that may open the input.
///
/// The literal is matched case-insensitively (`^"..."`), because HTML treats the
/// doctype as ASCII case-insensitive; `<!doctype html>` is as valid as
/// `<!DOCTYPE html>`. Exactly one space is required between the two words, since
/// the declaration is a single string literal.
///
/// # Examples
/// ```text
/// <!DOCTYPE html>
/// <!doctype html>
/// ```
declaration = { ^"<!DOCTYPE html>" }

/// A paired element: a start tag, optional children, and an end tag.
///
/// Children may be nested elements, text, or self-closing tags, each optionally
/// indented with tabs and followed by line breaks. The end tag may be indented
/// with tabs as well.
///
/// The start and end tag names are not compared, so `<a></b>` is accepted by
/// the grammar.
///
/// # Examples
/// ```text
/// <html> </html>
/// <html> <br/> <p>Text</p> </html>
/// ```
elem = {
    start_tag ~ NEWLINE*
    ~ (TAB* ~ (elem | text | self_closed_tag) ~ NEWLINE*)*
    // Tabs consumed inside the repetition above are rolled back when no child
    // follows them, so indentation in front of the closing tag is matched here.
    ~ TAB* ~ end_tag
}


/// Opening tag of a paired element: `<`, a tag name, `>`.
///
/// The rule is non-atomic, so spaces around the tag name are tolerated.
/// Attributes are not supported: a tag that carries any is rejected.
///
/// # Examples
/// ```text
/// <html>
/// <p>
/// ```
start_tag = {
    "<" ~ tag_name ~ ">"
}
// Planned attribute support, not enabled yet:
//(WHITESPACE+ ~ attribute)* 


/// Closing tag of a paired element: `</`, a tag name, `>`.
///
/// The rule is non-atomic, so spaces around the tag name are tolerated.
///
/// # Examples
/// ```text
/// </html>
/// </p>
/// ```
end_tag = {
    "</" ~ tag_name ~ ">"
}

/// Self-closing tag: optional leading tabs, `<`, a tag name, `/>`.
///
/// Leading tabs are part of this rule, so they are included in the matched
/// span. Spaces between the name and `/>` are tolerated because the rule is
/// non-atomic.
///
/// # Examples
/// ```text
/// <br/>
/// <img/>
/// ```
self_closed_tag = {
    TAB* ~ "<" ~ tag_name ~ "/>"
}

/// Name of a tag: an ASCII letter followed by any number of ASCII letters or
/// digits.
///
/// Digits are allowed after the first character so that standard names such as
/// `h1` to `h6` are recognised. The rule is atomic (`@`), so no blank space is
/// permitted inside a name.
///
/// # Examples
/// ```text
/// html
/// div
/// br
/// h1
/// ```
tag_name = @{ ASCII_ALPHA ~ ASCII_ALPHANUMERIC* }

/// Text content: one or more characters up to, but not including, the next `<`.
///
/// `ANY` also matches tabs and line breaks, so they can end up inside a text
/// node.
///
/// # Examples
/// ```text
/// Just some text between the two components or tags.
/// ```
text = {
    (!"<" ~ ANY)+
}