email-address-parser 2.0.2

An RFC 5322 and RFC 6532 compliant email address parser.
Documentation
// Top-level entry rule: the whole input, from SOI to EOI, must be exactly
// one address_spec with nothing before or after it.
address_single = {
  SOI ~ address_spec ~ EOI
}
// A local_part and a domain joined by a literal "@".
address_spec = {
  local_part ~ "@" ~ domain
}

// Both rules are atomic (`@`): each yields a single token pair and the rules
// they call do not produce inner pairs.
// Choice is ordered, so dot_atom is always attempted first.
local_part = @{
  dot_atom |
  quoted_string
}
domain = @{
  dot_atom |
  domain_literal
}

// Stand-alone entry rules: the entire input must be just a local_part
// (respectively just a domain), anchored at both SOI and EOI.
local_part_complete = {
  SOI ~ local_part ~ EOI
}
domain_complete = {
  SOI ~ domain ~ EOI
}

/*------------ lower level rules -------------*/
// VCHAR - visible (printing) characters: 33-126
// '!' is U+0021 and '~' is U+007E; the range is inclusive at both ends.
printable_us_ascii = { '!'..'~' }

// space (SP, ASCII value 32) and horizontal tab (HTAB, ASCII value 9) characters (together known as the white space characters, WSP). 
WSP = {
  " " |
  "\t"
}
// Carriage return (U+000D) and line feed (U+000A), written with short escapes.
CR = { "\r" }
LF = { "\n" }
CRLF = { CR ~ LF }
// Folding white space. Silent rule (`_`): it matches input but emits no token
// pair of its own.
// First alternative: an optional "white space then CRLF" prefix followed by at
// least one WSP. Second alternative: the obsolete form obs_FWS.
FWS = _{
  ((WSP* ~ CRLF)? ~ WSP+) |
  obs_FWS
}
DQUOTE = { "\"" }
// A backslash followed by one printable US-ASCII or white space character,
// or, failing that, the obsolete form obs_qp.
quoted_pair = {
  ("\\" ~ (printable_us_ascii | WSP)) |
  obs_qp
}
// Comment text: one printable US-ASCII, non-ASCII, or obs_ctext character.
// The negative lookahead rejects "(", ")" and "\" before any branch is tried.
ctext = {
  !("(" | ")" | "\\") ~
  (printable_us_ascii | UTF8_non_ascii | obs_ctext)
}
// comment is one of the alternatives, so comments may nest.
ccontent = {
  ctext |
  quoted_pair |
  comment
}
// A parenthesised comment; FWS may appear before each ccontent item and
// before the closing ")".
comment = {
  "(" ~
  (FWS? ~ ccontent)* ~
  FWS? ~
  ")"
}
// Comments and/or folding white space. Silent rule (`_`): emits no token pair
// of its own.
CFWS = _{
  ((FWS? ~ comment)+ ~ FWS?) |
  FWS
}

// One atom character: an ASCII letter or digit, one of the punctuation
// characters listed below, or any non-ASCII code point.
atext = {
  ASCII_ALPHANUMERIC |
  "!" | "#" | "$" | "%" | "&" | "'" | "*" | "+" | "-" | "/" |
  "=" | "?" | "^" | "_" | "`" | "{" | "|" | "}" | "~" |
  UTF8_non_ascii
}

/*------------ UTF8 START -------------*/
// Non-ASCII code points, split into three consecutive ranges.
// NOTE(review): the 2/3/4 suffixes presumably name the UTF-8 encoded length
// in bytes of each range - confirm against RFC 6532 / RFC 3629.
// U+0080 through U+07FF
UTF8_2 = { '\u{80}'..'\u{7FF}' }
// U+0800 through U+FFFF
UTF8_3 = { '\u{800}'..'\u{FFFF}' }
// U+10000 through U+10FFFF
UTF8_4 = { '\u{10000}'..'\u{10FFFF}' }
// Any code point at or above U+0080.
UTF8_non_ascii = {
  UTF8_2 |
  UTF8_3 |
  UTF8_4
}
/*------------ UTF8 END -------------*/

/*
 * The construction of the dot_atom_text rule intentionally deviates from
 * RFC 5322. The RFC form would be `{ atext+ ~ ("." ~ atext+)* }`, but that
 * form does not reject invalid domains such as `-example.com` or
 * `example-.com`.
 * Here a run of "-" is only accepted directly after a non-dash atext
 * character and directly before another dot_atom_text.
 */
// Any atext character except "-".
atext_wo_dash = {
  !"-" ~ atext
}
// Starts with one or more non-dash characters, then any number of
// continuations. A continuation is either "." (optionally followed by CFWS)
// or a run of one or more "-", and in both cases must be followed by a
// further dot_atom_text. `"-"+` covers the single-dash and the multi-dash
// case alike, because dot_atom_text itself can never begin with "-".
dot_atom_text = {
  atext_wo_dash+ ~
  (
    ("." ~ CFWS* ~ dot_atom_text+) |
    ("-"+ ~ dot_atom_text+)
  )*
}
// dot_atom_text with at most one WSP character allowed on each side.
dot_atom = {
  WSP? ~ dot_atom_text ~ WSP?
}

// dtext: one character of domain-literal text.
// Printable US-ASCII and non-ASCII characters are accepted unless they are
// "[", "]" or "\"; the negative lookahead guards only that branch.
// obs_dtext is deliberately placed outside the guard: its quoted_pair
// alternative always begins with "\", so under the guard it could never
// match. obs_NO_WS_CTL, the other obs_dtext alternative, contains none of the
// three excluded characters, so moving it out of the guard changes nothing
// for it.
dtext = {
  (!("[" | "]" | "\\") ~ (printable_us_ascii | UTF8_non_ascii)) |
  obs_dtext
}
// A bracketed domain literal: optional CFWS, "[", any number of dtext
// characters each optionally preceded by FWS, optional FWS, "]", optional CFWS.
domain_literal = {
  CFWS? ~
  "[" ~
  (FWS? ~ dtext)* ~
  FWS? ~
  "]" ~
  CFWS?
}

// Quoted-string text: one printable US-ASCII, non-ASCII, or obs_qtext
// character. The negative lookahead rejects the double quote and "\" before
// any branch is tried.
qtext = {
  !(DQUOTE | "\\") ~
  (printable_us_ascii | UTF8_non_ascii | obs_qtext)
}
qcontent = {
  qtext |
  quoted_pair
}
// A double-quoted string: optional CFWS, opening quote, any number of qcontent
// items each optionally preceded by FWS, optional FWS, closing quote,
// optional CFWS.
quoted_string = {
  CFWS? ~
  DQUOTE ~
  (FWS? ~ qcontent)* ~
  FWS? ~
  DQUOTE ~
  CFWS?
}

/*------------ obsolete support -------------*/
// Entry rule for the variant that also accepts obsolete syntax: the whole
// input, from SOI to EOI, must be exactly one address_spec_obs.
address_single_obs = {
  SOI ~ address_spec_obs ~ EOI
}
// A local_part_obs and a domain_obs joined by a literal "@".
address_spec_obs = {
  local_part_obs ~ "@" ~ domain_obs
}

// Atomic (`@`) rules: each yields a single token pair and the rules they call
// do not produce inner pairs.
// Choice is ordered, so the obsolete form is attempted before the others.
local_part_obs = @{
  obs_local_part |
  dot_atom |
  quoted_string
}
domain_obs = @{
  obs_domain |
  dot_atom |
  domain_literal
}

// Stand-alone entry rules: the entire input must be just an obs_local_part
// (respectively just an obs_domain), anchored at both SOI and EOI.
obs_local_part_complete = {
  SOI ~ obs_local_part ~ EOI
}
obs_domain_complete = {
  SOI ~ obs_domain ~ EOI
}

// Optional leading FWS, one word, then any number of further words, each
// introduced by a "." that may have CFWS on either side.
obs_local_part = {
  FWS* ~
  word ~
  (CFWS* ~ "." ~ CFWS* ~ word)*
}
// Obsolete-syntax domain. Optional CFWS, one or more non-dash atext
// characters, then any number of continuations, then optional FWS.
// A continuation is optional CFWS followed by either "." or a run of one or
// more "-", and in both cases must be followed by a further obs_domain.
// `"-"+` covers the single-dash and the multi-dash case alike, because after
// its optional CFWS an obs_domain can never begin with "-".
obs_domain = {
  CFWS* ~ atext_wo_dash+ ~
  (CFWS* ~ (
    ("." ~ obs_domain+) |
    ("-"+ ~ obs_domain+)
  ))* ~
  FWS*
}

// A word is an atom or, failing that, a quoted_string.
word = {
  atom |
  quoted_string
}
// One or more atext characters with optional CFWS on either side.
atom = {
  CFWS? ~ atext+ ~ CFWS?
}

// Obsolete folding white space: one or more WSP, then any number of
// "CRLF followed by one or more WSP" groups.
obs_FWS = {
  WSP+ ~
  (CRLF ~ WSP+)*
}
// US-ASCII control characters other than carriage return, line feed, and the
// white space characters: U+0001-U+0008, U+000B-U+000C, U+000E-U+001F, U+007F.
obs_NO_WS_CTL = {
  '\u{01}'..'\u{08}' |
  '\u{0b}'..'\u{0c}' |
  '\u{0e}'..'\u{1f}' |
  "\u{7f}"
}
// Obsolete quoted pair: a backslash followed by NUL, a control character from
// obs_NO_WS_CTL, LF, or CR.
obs_qp = {
  "\\" ~
  ("\u{00}" | obs_NO_WS_CTL | LF | CR)
}

// Obsolete additions to ctext, dtext and qtext. All three allow the control
// characters in obs_NO_WS_CTL; obs_dtext additionally allows a quoted_pair.
obs_ctext = {
  obs_NO_WS_CTL
}
obs_dtext = {
  obs_NO_WS_CTL |
  quoted_pair
}
obs_qtext = {
  obs_NO_WS_CTL
}