pegmd 0.2.3

A PEG-based Markdown parser for creating an Abstract Syntax Tree
Documentation
// Entry rule: the complete input, from start to end, as a run of sections.
document = {
    SOI ~ section* ~ EOI
}

// One block-level element together with the blank lines around it.
// Silent, so no `section` token is emitted.
section = _{
    blank_line* ~ section_contents ~ blank_line*
}

// The block-level alternatives, tried in the order written; `paragraph` comes last.
section_contents = _{
    verbatim
  | reference
  | header
  | bullet_list
  | ordered_list
  | code_block
  | thematic_break
  | paragraph
}

// Text that is not document content. pest gives a rule named COMMENT special treatment:
// it is skipped implicitly between the elements of non-atomic rules.
// Two forms are accepted:
//   - the link-label idiom `[//]: # (...)`, where `\)` may appear inside the parentheses;
//   - an HTML comment `<!-- ... -->`.
// The HTML body is repeated with `*` (zero or more) so the empty comment `<!---->` is
// recognised as well.
COMMENT = _{
    ("[//]: # (" ~ ("\\)" | (!")" ~ ANY))* ~ ")")
  | "<!--" ~ (!"-->" ~ ANY)* ~ "-->"
}

// One horizontal whitespace character: a space or a tab. Emits a token.
space = {
    " "
  | "\t"
}

// Exactly one space character, matched without emitting a token.
silent_space = _{
    " "
}

// A line that holds only optional spaces/tabs, including its newline.
blank_line = _{
    space* ~ NEWLINE
}

// One unit of block indentation: four spaces or a single tab.
indent = _{
    " "{4}
  | "\t"
}

// One to three spaces, i.e. fewer than the four that `indent` needs.
non_indent_space = _{
    " "{1,3}
}

// The `"` character, matched silently.
double_quote = _{
    "\""
}

// The `'` character, matched silently.
single_quote = _{
    "'"
}

// Characters excluded from `normal_char` (and therefore from `str`).
special_char = _{
    "~" | "*" | "_" | "`" | "&"
  | "[" | "]" | "(" | ")" | "<"
  | "!" | "#" | "\\" | "\""
}

// Any single character that is not special, not a newline and not a space/tab.
normal_char = _{
    !special_char ~ !NEWLINE ~ !space ~ ANY
}

// Any single character other than a newline, space or tab. Emits a token.
non_space = {
    !NEWLINE ~ !space ~ ANY
}

// The characters that may follow a backslash in `escaped_special_char`.
control_character = {
    "-" | "\\" | "`" | "|" | "*"
  | "#" | "+" | "." | "!" | "_"
  | "{" | "}" | "[" | "]" | "("
  | ")" | "<" | ">" | "\"" | "'"
}

// A backslash followed by one `control_character`; only the latter emits a token.
escaped_special_char = _{
    "\\" ~ control_character
}

// Inline constructs that have their own delimiters, tried in this order.
markup = _{
    strong
  | emphasis
  | code
  | image
  | link
}

// A run of plain text: starts with normal characters and may contain spaces/tabs,
// but only when another normal character follows them.
str = {
    normal_char+ ~
    (
        normal_char
      | space+ ~ &normal_char
    )*
}

// A single special character taken literally.
symbol = {
    special_char
}

// A line ending inside running text: at most one space/tab, a newline, and then a next
// line that does NOT begin with a blank line, an indent, a thematic break, a block quote
// marker, an ATX hash, the markers `1. ` / `1) `, or a bullet. A `*` that opens
// star_strong or star_emphasis is not counted as a bullet here.
normal_endline = {
    space? ~ NEWLINE ~
    !(
        blank_line
      | indent
      | thematic_break
      | block_quote_open
      | non_indent_space? ~ (
            atx_hash
          | "1. "
          | "1) "
          | !star_strong ~ !star_emphasis ~ bullet
        )
    )
}

// The final newline of the input, with any spaces/tabs before it.
trailing_endline = _{
    space* ~ NEWLINE ~ EOI
}

// Two spaces followed by a normal line ending.
linebreak = {
    "  " ~ normal_endline
}

// Any of the three line-ending forms, tried in this order.
endline = _{
    normal_endline
  | trailing_endline
  | linebreak
}

// A single inline element. Order matters: plain text first, a bare symbol last.
inline = _{
    str
  | endline
  | space
  | escaped_special_char
  | markup
  | symbol
}

// One or more inline elements. A line ending in the middle is accepted only when
// another inline follows it; one optional line ending may close the run.
inlines = _{
    (
        (!endline ~ inline)
      | (endline ~ &inline)
    )+ ~
    endline?
}

// Bracket contents: any inlines up to (not including) a `]` or a newline. May be empty.
label = {
    (!"]" ~ !NEWLINE ~ inline)*
}

// A link destination: one or more non-space characters, none of which is `)`, `>` or `]`.
source = {
    (!")" ~ !">" ~ !"]" ~ non_space)+
}

// Three or more `*`, each optionally followed by spaces/tabs, filling a line.
star_line = _{
    non_indent_space? ~ ("*" ~ space*){3,} ~ NEWLINE
}

// Same shape as `star_line`, using `-`.
dash_line = _{
    non_indent_space? ~ ("-" ~ space*){3,} ~ NEWLINE
}

// Same shape as `star_line`, using `_`.
underscore_line = _{
    non_indent_space? ~ ("_" ~ space*){3,} ~ NEWLINE
}

// A horizontal rule written with stars, dashes or underscores.
thematic_break = {
    star_line
  | dash_line
  | underscore_line
}

// Running text, optionally led by 1-3 spaces, closed by blank line(s) or end of input.
paragraph = {
    non_indent_space? ~
    inlines ~
    (blank_line+ | EOI)
}

// The `>` marker that starts a quoted line, optionally led by 1-3 spaces.
block_quote_open = _{
    non_indent_space? ~ ">"
}

// One or more `>`-prefixed lines. A newline between lines is consumed only when the
// next line also opens with the quote marker; one optional newline may close the block.
verbatim = {
    (
        block_quote_open ~
        silent_space* ~
        inline* ~
        (linebreak_literal ~ &block_quote_open)?
    )+ ~
    NEWLINE?
}

// One to six `#` characters, not followed by another `#`.
atx_hash = {
    "#"{1,6} ~ !"#"
}

// An ATX heading: optional 1-3 leading spaces, the hashes, the heading text, an optional
// closing run of spaces and `#`, an optional line ending, and any blank lines after it.
// The heading text stops before a tail made only of spaces/tabs/`#` and a newline.
header = {
    non_indent_space? ~
    atx_hash ~
    silent_space* ~
    (!((space | "#")* ~ NEWLINE) ~ inline)+ ~
    (silent_space | "#")* ~
    endline? ~
    blank_line*
}

// Marks a hard stop for a list: either a comment (after optional spaces/tabs), or a
// thematic break ahead (checked by lookahead only, nothing is consumed).
end_list = {
    space* ~ COMMENT
  | &thematic_break
}

// A `*` marker followed by at least one space.
star_bullet = {
    "*" ~ silent_space+
}

// A `-` marker followed by at least one space.
dash_bullet = {
    "-" ~ silent_space+
}

// A `+` marker followed by at least one space.
plus_bullet = {
    "+" ~ silent_space+
}

// Any of the three bullet markers.
bullet = {
    star_bullet
  | dash_bullet
  | plus_bullet
}

// The number of an ordered list item: one to three ASCII digits.
list_index = {
    ASCII_DIGIT{1,3}
}

// An ordered list marker: the index, `)` or `.`, and at least one space.
enumerator = _{
    list_index ~ (")" | ".") ~ silent_space+
}

// The leading inline part of a list item. While the cursor is not at a blank line it
// repeatedly takes either an inline, or a NEWLINE followed by the indentation held on the
// stack (PEEK_ALL), at most one extra space, and an inline that does not start a new
// bullet/enumerator (strong/emphasis openers are explicitly allowed). One optional line
// ending closes the block.
// NOTE(review): `blank_line` is `space* ~ NEWLINE`, so `!blank_line` fails wherever a NEWLINE
// is next; the `NEWLINE ~ PEEK_ALL ...` branch therefore looks unreachable -- confirm intent.
inline_list_block = _{ 
  (!blank_line ~ (
    inline | 
    (NEWLINE ~ PEEK_ALL ~ silent_space{0,1} ~ (&strong | &emphasis | !(bullet | enumerator)) ~ inline)
  ))* 
  ~ endline? }
// A further block that belongs to the current list item: optional blank lines, the
// indentation held on the stack, and then exactly one `section_contents`.
continued_list_block = _{
  blank_line* ~ // Allow blank lines before the continuation block
  PEEK_ALL ~ !end_list ~ // Match the indent level AND confirm there's no hard break for the list (only needed for a root, non-indented list)
  (&silent_space{2,} | &strong | &emphasis | !(bullet | enumerator)) ~ // We've either indented more OR the row doesn't start with a bullet. Checking for emph and strong first since they start with a bullet but should be allowed
  (section_contents) // Match a single section
}

// Items of tight bullet lists: the marker, tight contents, and an optional newline.
star_bullet_item_tight = {
    star_bullet ~ list_item_contents_tight ~ NEWLINE?
}
dash_bullet_item_tight = {
    dash_bullet ~ list_item_contents_tight ~ NEWLINE?
}
plus_bullet_item_tight = {
    plus_bullet ~ list_item_contents_tight ~ NEWLINE?
}

// Items of tight ordered lists, one rule per delimiter (`.` or `)`).
period_ordered_list_item_tight = {
    list_index ~ "." ~ silent_space+ ~ list_item_contents_tight ~ NEWLINE?
}
parenthesis_ordered_list_item_tight = {
    list_index ~ ")" ~ silent_space+ ~ list_item_contents_tight ~ NEWLINE?
}

// Items of loose bullet lists: the marker followed by loose contents.
star_bullet_item = {
    star_bullet ~ list_item_contents
}
dash_bullet_item = {
    dash_bullet ~ list_item_contents
}
plus_bullet_item = {
    plus_bullet ~ list_item_contents
}

// Items of loose ordered lists, one rule per delimiter (`.` or `)`).
period_ordered_list_item = {
    list_index ~ "." ~ silent_space+ ~ list_item_contents
}
parenthesis_ordered_list_item = {
    list_index ~ ")" ~ silent_space+ ~ list_item_contents
}

// Tight bullet lists, one rule per marker so that a list never mixes markers.
// Shared shape:
//   1. PUSH the list's leading spaces onto the stack;
//   2. match the first item;
//   3. match further items that are not preceded by a blank line and start with the
//      indentation currently on the stack (PEEK_ALL);
//   4. reject the match if blank line(s) are followed by another item with the same
//      marker at this indentation (such input is not a tight list);
//   5. DROP the pushed indentation again.

// Tight list using `*` bullets.
star_bullet_list_tight = _{
  PUSH(silent_space*) ~ 
  star_bullet_item_tight ~
  (!blank_line ~ PEEK_ALL ~ star_bullet_item_tight)* ~
  !(blank_line+ ~ PEEK_ALL ~ "* ") ~
  DROP
}

// Tight list using `-` bullets.
dash_bullet_list_tight = _{
  PUSH(silent_space*) ~ 
  dash_bullet_item_tight ~
  (!blank_line ~ PEEK_ALL ~ dash_bullet_item_tight)* ~
  !(blank_line+ ~ PEEK_ALL ~ "- ") ~
  DROP
}

// Tight list using `+` bullets.
plus_bullet_list_tight = _{
  PUSH(silent_space*) ~ 
  plus_bullet_item_tight ~
  (!blank_line ~ PEEK_ALL ~ plus_bullet_item_tight)* ~
  !(blank_line+ ~ PEEK_ALL ~ "+ ") ~
  DROP
}

// Loose bullet lists, one rule per marker. Each pushes the list's leading spaces, matches
// the first item, then any number of further items that may be preceded by blank lines
// and must start with the indentation on the stack (PEEK_ALL), and finally drops the
// pushed indentation.

// Loose list using `*` bullets.
star_bullet_list = _{
  PUSH(silent_space*) ~ 
  star_bullet_item ~
  (blank_line* ~ PEEK_ALL ~ star_bullet_item)* ~
  DROP
}

// Loose list using `-` bullets.
dash_bullet_list = _{
  PUSH(silent_space*) ~ 
  dash_bullet_item ~
  (blank_line* ~ PEEK_ALL ~ dash_bullet_item)* ~
  DROP
}

// Loose list using `+` bullets.
plus_bullet_list = _{
  PUSH(silent_space*) ~ 
  plus_bullet_item ~
  (blank_line* ~ PEEK_ALL ~ plus_bullet_item)* ~
  DROP
}

// Tight ordered lists, one rule per delimiter. Each pushes the list's leading spaces,
// matches items that are not separated by blank lines and start with the indentation on
// the stack (PEEK_ALL), rejects the match when blank line(s) are followed by another
// marker with the same delimiter at this indentation, and drops the pushed indentation.

// Tight ordered list whose markers look like `1)`.
parenthesis_ordered_list_tight = _{
  PUSH(silent_space*) ~ 
  parenthesis_ordered_list_item_tight ~
  (!blank_line ~ PEEK_ALL ~ parenthesis_ordered_list_item_tight)* ~
  !(blank_line+ ~ PEEK_ALL ~ list_index ~ ")" ~ space+) ~
  DROP
}

// Tight ordered list whose markers look like `1.`.
period_ordered_list_tight = _{
  PUSH(silent_space*) ~ 
  period_ordered_list_item_tight ~
  (!blank_line ~ PEEK_ALL ~ period_ordered_list_item_tight)* ~
  !(blank_line+ ~ PEEK_ALL ~ list_index ~ "." ~ space+) ~
  DROP
}

// Loose ordered lists, one rule per delimiter. Each pushes the list's leading spaces,
// matches the first item, then further items that may be preceded by blank lines and must
// start with the indentation on the stack (PEEK_ALL), and drops the pushed indentation.

// Loose ordered list whose markers look like `1)`.
parenthesis_ordered_list = _{
  PUSH(silent_space*) ~ 
  parenthesis_ordered_list_item ~
  (blank_line* ~ PEEK_ALL ~ parenthesis_ordered_list_item)* ~
  DROP
}

// Loose ordered list whose markers look like `1.`.
period_ordered_list = _{
  PUSH(silent_space*) ~ 
  period_ordered_list_item ~
  (blank_line* ~ PEEK_ALL ~ period_ordered_list_item)* ~
  DROP
}

// Body of a loose list item: the inline block, then any number of continuation blocks,
// each of which may be preceded by blank lines. Compound-atomic (`$`).
list_item_contents = ${
    inline_list_block ~ (blank_line* ~ continued_list_block)*
}

// Body of a tight list item: the inline block followed directly by continuation blocks.
// Compound-atomic (`$`).
list_item_contents_tight = ${
    inline_list_block ~ continued_list_block*
}

// Any tight list, whichever marker or delimiter it uses.
list_tight = {
    dash_bullet_list_tight
  | plus_bullet_list_tight
  | star_bullet_list_tight
  | period_ordered_list_tight
  | parenthesis_ordered_list_tight
}

// Any loose list, whichever marker or delimiter it uses.
list_loose = {
    dash_bullet_list
  | plus_bullet_list
  | star_bullet_list
  | period_ordered_list
  | parenthesis_ordered_list
}

// A list whose first line, after optional spaces, starts with a bullet.
// The tight form is attempted before the loose form.
bullet_list = {
    &(silent_space* ~ bullet) ~
    (list_tight | list_loose)
}

// A list whose first line, after optional spaces, starts with an enumerator.
// The tight form is attempted before the loose form.
ordered_list = {
    &(silent_space* ~ enumerator) ~
    (list_tight | list_loose)
}

// Strong text delimited by `__`; the opener must not be followed by a space/tab.
underline_strong = _{
    "__" ~ !space ~ (!"__" ~ inline)+ ~ "__"
}

// Strong text delimited by `**`; the opener must not be followed by a space/tab.
star_strong = _{
    "**" ~ !space ~ (!"**" ~ inline)+ ~ "**"
}

// Strong text in either delimiter style.
strong = {
    star_strong
  | underline_strong
}

// Emphasis delimited by `*`. Its body is made of inlines that do not begin with `*`,
// or of nested strong text.
star_emphasis = _{
    "*" ~ !space ~
    (
        (!"*" ~ inline)
      | strong
    )+ ~
    "*"
}

// Emphasis delimited by `_`. Its body is made of inlines that do not begin with `_`,
// or of nested strong text.
underline_emphasis = _{
    "_" ~ !space ~
    (
        (!"_" ~ inline)
      | strong
    )+ ~
    "_"
}

// Emphasised text in either delimiter style.
emphasis = {
    star_emphasis
  | underline_emphasis
}

// Backtick runs of an exact length. Each rule matches N backticks and then asserts that
// no further backtick follows, so a longer run is never mistaken for a shorter one.
// Spelling them out one by one is verbose but keeps the "exactly N" check in one place.
single_tick = _{
    "`" ~ !"`"
}
two_ticks = _{
    "`"{2} ~ !"`"
}
three_ticks = _{
    "`"{3} ~ !"`"
}
four_ticks = _{
    "`"{4} ~ !"`"
}
five_ticks = _{
    "`"{5} ~ !"`"
}

// The first word after an opening fence: non-space characters other than a backtick.
info_string_language = {
    (!"`" ~ non_space)+
}

// The text after an opening fence: the language word, then any further backtick-free
// words separated by spaces.
info_string = _{
    silent_space* ~
    info_string_language ~
    (silent_space* ~ (!"`" ~ non_space)+ ~ silent_space*)*
}

// A newline that is kept as content (emits a token).
linebreak_literal = { NEWLINE }
// An inline code span, delimited by one backtick (first alternative) or two backticks
// (second alternative). After an optional leading space the body is a mix of:
//   - runs of non-space characters other than a backtick;
//   - (two-tick form only) a single backtick not followed by another one;
//   - a space/tab, or a newline that is not followed by a blank line, as long as the text
//     ahead is not "space + closing delimiter".
// The span is closed by the matching delimiter or by end of input.
code = {
    single_tick ~ silent_space? ~ ((!"`" ~ non_space)+ |              !(space ~ single_tick) ~ (space | linebreak_literal ~ !blank_line))+ ~ silent_space? ~ (single_tick | EOI)
  | two_ticks   ~ silent_space? ~ ((!"`" ~ non_space)+ | "`" ~ !"`" | !(space ~ two_ticks)   ~ (space | linebreak_literal ~ !blank_line))+ ~ silent_space? ~ (two_ticks | EOI)
}

// A code block fenced with backticks. There is one alternative per opening fence length
// (exactly 3, 4 or 5 backticks). Every alternative has the same shape:
//   opening fence, optional info string, optional newline,
//   body: non-backtick text, backtick runs shorter than the fence, spaces/tabs, and
//         newlines that are not followed by a closing-length backtick run,
//   a newline, optional 1-3 spaces, and then either a closing run of at least as many
//   backticks (plus optional spaces and a newline) or end of input.
backtick_fenced_codeblock = _{
    three_ticks ~ info_string? ~ NEWLINE? ~ ((!"`" ~ non_space)+ | "`"{1,2} ~ !"`" | !(space+ ~ "`"{3,}) ~ (space | linebreak_literal ~ !"`"{3,}))* ~ NEWLINE ~ non_indent_space? ~ ("`"{3,} ~ silent_space* ~ NEWLINE | EOI)
  | four_ticks  ~ info_string? ~ NEWLINE? ~ ((!"`" ~ non_space)+ | "`"{1,3} ~ !"`" | !(space+ ~ "`"{4,}) ~ (space | linebreak_literal ~ !"`"{4,}))* ~ NEWLINE ~ non_indent_space? ~ ("`"{4,} ~ silent_space* ~ NEWLINE | EOI) 
  | five_ticks  ~ info_string? ~ NEWLINE? ~ ((!"`" ~ non_space)+ | "`"{1,4} ~ !"`" | !(space+ ~ "`"{5,}) ~ (space | linebreak_literal ~ !"`"{5,}))* ~ NEWLINE ~ non_indent_space? ~ ("`"{5,} ~ silent_space* ~ NEWLINE | EOI)
}

// NOTE(review): placeholder. This rule matches only the literal text below, so `~~~` fences
// are not recognised by it, while that exact sentence in a document would be. Replace with
// a real tilde-fence rule once consumers of `fenced_codeblock` are ready for it.
tilde_fenced_codeblock = { "TODO: Implement tilde_fenced_codeblock rule" }

// A fenced code block; the backtick form is tried before the tilde form.
fenced_codeblock = { backtick_fenced_codeblock | tilde_fenced_codeblock }
// The text of one indented code line: at least one non-space character, with optional
// spaces/tabs before each; it never starts at a newline.
indented_codeblock_line = _{ !NEWLINE ~ (space* ~ non_space)+ }
// A code block introduced by indentation. The indentation of the first line is pushed on
// the stack; every later code line must start with what is on the stack (PEEK_ALL).
// Newlines (with optional spaces/tabs before them) are consumed between lines only when
// another suitably indented code line follows. The pushed indentation is dropped at the end.
indented_codeblock = { 
  PUSH(indent+) ~ indented_codeblock_line ~ linebreak_literal? ~
  ((space* ~ linebreak_literal)+ ~ &(PEEK_ALL ~ indented_codeblock_line) | PEEK_ALL ~ indented_codeblock_line)* ~ 
  NEWLINE? ~
  DROP 
}

// A block of code: fenced form first, indented form second.
code_block = _{
    fenced_codeblock
  | indented_codeblock
}

// The destination of a reference definition: one or more non-space characters.
reference_source = {
    non_space+
}

// A link title in one of three delimiter styles: '...', "..." or (...).
// The body may contain backslash escapes, spaces/tabs and other non-space characters, but
// not the closing delimiter and not a blank line.
// The parenthesised form used to end with a `&")"` lookahead. Inside `directed_link` that
// check is redundant (the rule demands its own `")"` right after the title), while inside
// `reference` it made `[label]: url (title)` unable to match a title at all. The lookahead
// is therefore dropped so all three styles behave alike in both contexts.
link_title = {
    single_quote ~ (!single_quote ~ !blank_line ~ (escaped_special_char | space | non_space))* ~ single_quote
  | double_quote ~ (!double_quote ~ !blank_line ~ (escaped_special_char | space | non_space))* ~ double_quote
  | "(" ~ (!")" ~ !blank_line ~ (escaped_special_char | space | non_space))* ~ ")"
}
// A link reference definition: `[label]: destination`, optionally followed by a title.
// An empty label (`[]`) is rejected. The destination may follow on the same line (after
// spaces) or on the next line; the same holds for the title.
reference = {
    non_indent_space? ~
    !"[]" ~ "[" ~ label ~ "]:" ~
    (
        silent_space+
      | NEWLINE ~ silent_space*
    ) ~
    reference_source ~
    (
        (
            silent_space* ~ NEWLINE ~ silent_space*
          | silent_space+
        ) ~
        link_title
    )?
}

// `[label]`, optionally followed by an empty `[]`.
shortcut_reference_link = {
    "[" ~ label ~ "]" ~ "[]"?
}

// `[label][label]`; the second bracket pair must not be the empty `[]`.
full_reference_link = {
    "[" ~ label ~ "]" ~
    !"[]" ~
    "[" ~ label ~ "]"
}

// A reference-style link; the full form is tried before the shortcut form.
reference_link = _{
    full_reference_link
  | shortcut_reference_link
}

// An inline link: `[label](source)` with an optional title before the closing `)`.
// The title may follow the source on the same line (after spaces) or on the next line.
directed_link = {
    "[" ~ label ~ "](" ~ source ~
    (
        (
            silent_space* ~ NEWLINE ~ silent_space*
          | silent_space+
        ) ~
        link_title
    )? ~
    ")"
}

// A destination wrapped in angle brackets: `<source>`.
autolink = {
    "<" ~ source ~ ">"
}

// Any link form, tried in this order.
link = {
    directed_link
  | reference_link
  | autolink
}

// An image: `!` directly followed by an inline link or a reference-style link.
// The rule first asserts that the text at this position is not a backslash.
image = {
    !"\\" ~ "!" ~
    (
        directed_link
      | reference_link
    )
}