html_parser_rscx 0.7.1

A simple, general-purpose HTML/XHTML parser
Documentation
//
// HTML
//
// Entry rule: spans the whole input from start to end. Any number of leading
// comments and an optional doctype may precede the node list. The rule is
// silent, so it produces no pair of its own.
html = _{ SOI ~ node_comment* ~ doctype? ~ node* ~ EOI }

//
// DOCTYPE
//
// Document type declaration; the keyword is matched case-insensitively.
// Ex: <!DOCTYPE html>
// Ex: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
//
// Bare words (`html`, `PUBLIC`, `SYSTEM`) are matched as `attr`. The quoted
// public/system identifiers of legacy doctypes do not start with a letter, so
// `attr` cannot match them; they are consumed here as plain quoted literals.
// Only silent rules are used for that, so no additional pairs are emitted and
// inputs that already parsed keep producing exactly the same pairs.
doctype = {
    WSP*
    ~ chevron_left_bang
    ~ ^"doctype"
    ~ WSP*
    ~ (
        attr
        | quote_dubble ~ (!quote_dubble ~ ANY)* ~ quote_dubble ~ WSP*
        | quote_single ~ (!quote_single ~ ANY)* ~ quote_single ~ WSP*
    )*
    ~ WSP*
    ~ chevron_right_normal
}

//
// NODES
//
// A node is a comment, an element or a run of text, tried in that order.
// Silent: only the matched alternative shows up in the output.
node = _{
    node_comment
    | node_element
    | node_text
}

// A comment (conditional or plain) with the whitespace around it absorbed.
node_comment = {
    WSP*
    ~ (comment_if | comment_normal)
    ~ WSP*
}

// Text: one or more characters, stopping before any position where an
// element parses, a comment opens, or the input continues with "<!".
node_text = {
    (
        !(node_element | comment_tag_start | chevron_left_bang)
        ~ ANY
    )+
}

// Every element flavour. Ordered choice: the first alternative that matches
// wins, so the order of this list is significant.
node_element = {
    el_void
    | el_void_xml
    | el_process_instruct
    | el_raw_text
    | el_normal
    | el_dangling
}

//
// COMMENTS
//
// Plain comment: opening tag, body, closing tag. Only `comment_body` is
// non-silent, so it is the only pair produced by these four rules.
comment_normal = _{
    comment_tag_start
    ~ comment_body
    ~ comment_tag_end
}

// Everything up to (not including) the first position where the closing tag,
// with its optional leading whitespace, matches.
comment_body = {
    (!comment_tag_end ~ ANY)*
}

// "<!--" followed by optional whitespace.
comment_tag_start = _{
    chevron_left_bang
    ~ "--"
    ~ WSP*
}

// Optional whitespace followed by "-->".
comment_tag_end = _{
    WSP*
    ~ "--"
    ~ chevron_right_normal
}

// Compatibility with old IE browsers (conditional comments). Newer browsers
// do not need this.
// Ex: <!--[if IE]> ... <![endif]-->
comment_if = _{
    comment_if_start
    ~ comment_if_body
    ~ comment_if_end
}

// Everything after the opening "[if" up to the closing "<![endif]" marker.
comment_if_body = {
    (!comment_if_end ~ ANY)*
}

// Comment opener immediately followed by "[" and a case-insensitive "if".
comment_if_start = _{
    comment_tag_start
    ~ "["
    ~ ^"if"
}

// "<![endif]" (case-insensitive keyword) followed by the comment closer.
comment_if_end = _{
    chevron_left_bang
    ~ "["
    ~ ^"endif"
    ~ "]"
    ~ comment_tag_end
}

//
// ATTRIBUTES
//
// One attribute: a key, optionally followed by "=" and a value. The unquoted
// form is tried first; it refuses to start on a quote, which hands quoted
// values over to `attr_quoted`.
attr = {
    attr_key
    ~ (
        equal
        ~ WSP*
        ~ (attr_non_quoted | attr_quoted)
    )?
}

// Quoted value. The opening quote is pushed on the stack and the closing
// quote must be the same character (POP), so ' and " cannot be mixed.
attr_quoted = ${
    PUSH(quote)
    ~ attr_value
    ~ POP
}

// Unquoted value: runs until whitespace or any closing chevron. May be empty.
attr_non_quoted = @{
    !quote
    ~ (
        !(WSP | chevron_right)
        ~ ANY
    )*
}

// Attribute name: starts with an ASCII letter. Note that whitespace on either
// side is part of the matched span.
attr_key = {
    WSP*
    ~ ASCII_ALPHA
    ~ text_chars*
    ~ WSP*
}

// Body of a quoted value: every character up to the quote currently on top
// of the stack (PEEK).
attr_value = {
    WSP*
    ~ (!PEEK ~ ANY)*
    ~ WSP*
}

//
// ELEMENTS
//
// Tag name: an ASCII letter followed by any number of `text_chars`.
// Atomic, so the name is matched as one uninterrupted token.
el_name = @{
    ASCII_ALPHA
    ~ text_chars*
}

// Void element aka self-closing element
// Ex: <hr>
// Names of HTML void elements, matched case-insensitively.
// Ordered choice: no entry is a prefix of an entry listed after it, so the
// order here does not affect which name is matched.
el_void_name_html = @{
    ^"area"
    | ^"base"
    | ^"br"
    | ^"col"
    | ^"command"
    | ^"embed"
    | ^"hr"
    | ^"img"
    | ^"input"
    | ^"keygen"
    | ^"link"
    | ^"meta"
    | ^"param"
    | ^"source"
    | ^"track"
    | ^"wbr"
}
// NOTE: This should not have to be a rule, but these SVG elements are often
// written without a closing tag, as if they were void elements.
el_void_name_svg = @{ ^"path" | ^"polygon" | ^"rect" | ^"circle" }
// Any void element name (HTML or SVG).
// The trailing `!text_chars` enforces a name boundary. Without it a void name
// matches as a prefix of a longer tag name: `<colgroup>` would be read as the
// void element `col` carrying an attribute `group`. The lookahead consumes
// nothing, so the span of the matched name is unchanged.
el_void_name = @{
    (el_void_name_html | el_void_name_svg)
    ~ !text_chars
}
// Void element with a known void name; may be closed with ">" or "/>".
el_void = _{
    chevron_left_normal
    ~ WSP*
    ~ el_void_name
    ~ WSP*
    ~ attr*
    ~ WSP*
    ~ (chevron_right_normal | chevron_right_closed)
}

// XML-style self-closing element: any tag name, but it must end with "/>".
el_void_xml = _{
    chevron_left_normal
    ~ WSP*
    ~ el_name
    ~ WSP*
    ~ attr*
    ~ WSP*
    ~ chevron_right_closed
}

// Open elements are default element that can take children 
// and have both a start tag and an end tag
// Ex: <html lang="en"></html>
// Start tag, child nodes, end tag. Children are consumed until the matching
// end tag is next in the input.
el_normal = _{
    el_normal_start
    ~ (!el_normal_end ~ node)*
    ~ el_normal_end
}

// Start tag. The tag name is pushed on the stack so that the end tag can be
// required to carry the same name.
el_normal_start = _{
    chevron_left_normal
    ~ WSP*
    ~ PUSH(el_name)
    ~ WSP*
    ~ attr*
    ~ WSP*
    ~ chevron_right_normal
}

// End tag. POP must match the exact text pushed by the start tag.
el_normal_end = {
    chevron_left_closed
    ~ WSP*
    ~ POP
    ~ WSP*
    ~ chevron_right_normal
}

// Raw text elements are elements with text/script content that
// might interfere with the normal html syntax
// Names of the raw text elements, matched case-insensitively.
// The trailing `!text_chars` enforces a name boundary. Without it a tag such
// as `<stylesheet>` would open a raw text `style` element (with `sheet` read
// as an attribute) and swallow everything up to a later `</style>`. The
// lookahead consumes nothing, so the text pushed by PUSH(el_raw_text_name)
// is still exactly the element name.
el_raw_text_name = {
    (
        ^"style"
        | ^"script"
        | ^"title"
        | ^"textarea"
    )
    ~ !text_chars
}
// Raw content: every character up to the matching end tag. Nothing inside is
// parsed as markup.
el_raw_text_content = {
    (!el_raw_text_end ~ ANY)*
}

// Start tag, uninterpreted content, end tag.
el_raw_text = _{
    el_raw_text_start
    ~ el_raw_text_content
    ~ el_raw_text_end
}

// Start tag. The element name is pushed for the end tag to match; whitespace
// directly after the tag is absorbed here rather than into the content.
el_raw_text_start = _{
    chevron_left_normal
    ~ WSP*
    ~ PUSH(el_raw_text_name)
    ~ WSP*
    ~ attr*
    ~ WSP*
    ~ chevron_right_normal
    ~ WSP*
}

// End tag, including any whitespace directly before it. POP must match the
// exact text pushed by the start tag.
el_raw_text_end = {
    WSP*
    ~ chevron_left_closed
    ~ WSP*
    ~ POP
    ~ WSP*
    ~ chevron_right_normal
}

// XML processing instruction
// Ex: <?xml version="1.0" ?>
// "<?" ... "?>" with an optional target name and any number of attributes.
el_process_instruct = {
    chevron_left_question
    ~ WSP*
    ~ el_name?
    ~ WSP*
    ~ attr*
    ~ WSP*
    ~ chevron_right_question
}

// Catch dangling elements
// Ex: <div/></div>
// A lone end tag. It is the last alternative in `node_element`, so it only
// matches when no other element form does.
el_dangling = {
    chevron_left_closed
    ~ WSP*
    ~ el_name
    ~ WSP*
    ~ chevron_right_normal
}

//
// SYMBOLS / CHARACTERS
//
// Characters allowed after the first letter of a tag or attribute name:
// ASCII letters and digits, plus "_", "-" and ":".
text_chars = _{ ASCII_ALPHANUMERIC | "_" | "-" | ":" }

// Opening delimiters. All silent: they never appear as pairs.
chevron_left_normal   = _{ "<" }
chevron_left_closed   = _{ "</" }
chevron_left_bang     = _{ "<!" }
chevron_left_question = _{ "<?" }

// Closing delimiters.
chevron_right_normal   = _{ ">" }
chevron_right_closed   = _{ "/>" }
chevron_right_question = _{ "?>" }

// Any closing delimiter.
chevron_right = _{ chevron_right_normal | chevron_right_closed | chevron_right_question }

// Attribute key/value separator.
equal = _{
    "="
}

// Quote characters; `quote` accepts either kind.
quote_dubble = _{
    "\""
}
quote_single = _{
    "'"
}
quote = _{
    quote_dubble
    | quote_single
}

// A single whitespace character: space, tab, carriage return or line feed.
WSP = _{
    " "
    | "\t"
    | "\r"
    | "\n"
}