arinamcnulty-markdown-parser 0.1.1

//
// Markdown parser grammar
// Author: Zudilova Oryna
//
// This grammar defines rules for parsing a subset of CommonMark Markdown syntax,
// plus strikethrough (~~text~~) and underline (__text__) extensions.
// Each rule corresponds to a specific Markdown element and is documented below.
//

// Character-level primitives used by the rules below
/// One character of any kind, provided it is not a line terminator
character = {
    !NEWLINE ~ ANY
}
/// A single tab or space (silent rule: it produces no token of its own)
whitespace = _{ "\t" | " " }

// Inline links and images
// Link:  [Click here](https://example.com)
// Image: ![Alt text](image.jpg)
/// Inline link: bracketed text immediately followed by a parenthesised URL
link = {
    "[" ~ link_content ~ "](" ~ link_url ~ ")"
}
/// Inline image: same shape as a link, but opened with "!["
image = {
    "![" ~ image_alt ~ "](" ~ image_url ~ ")"
}

// Text and URL parts of links and images; each needs at least one character
/// Visible text of a link
link_content = { link_char ~ link_char* }
/// Alternative text of an image
image_alt    = { image_char ~ image_char* }
/// Target of a link
link_url     = { url_char ~ url_char* }
/// Source of an image
image_url    = { url_char ~ url_char* }

// Per-context character classes.
// Each is either an escape sequence or any character other than a backslash
// and the delimiter that closes the surrounding construct.
/// Character inside link text ("]" closes the text)
link_char  = { escape_sequence | !("\\" | "]") ~ ANY }
/// Character inside image alt text ("]" closes the text)
image_char = { escape_sequence | !("\\" | "]") ~ ANY }
/// Character inside a link or image URL (")" closes the URL)
url_char   = { escape_sequence | !("\\" | ")") ~ ANY }

// Delimited inline formatting spans
// Example: **bold text**, *italic text*, _italic text_, ~~strikethrough~~, __underline__
/// Bold: content wrapped in double asterisks
bold_formatting = {
    "**" ~ bold_content ~ "**"
}
/// Italic: content wrapped in a pair of single asterisks or a pair of single underscores
italic_formatting = {
    "*" ~ italic_content ~ "*"
  | "_" ~ italic_content ~ "_"
}
/// Strikethrough: content wrapped in double tildes
strikethrough_formatting = {
    "~~" ~ strikethrough_content ~ "~~"
}
/// Underline: content wrapped in double underscores
underline_formatting = {
    "__" ~ underline_content ~ "__"
}

// Span bodies: zero or more characters, stopping just before the closing delimiter
/// Everything up to the next "**"
bold_content          = { (!"**" ~ ANY)* }
/// Everything up to the next "*" or "_" (either one ends the content)
italic_content        = { (!("*" | "_") ~ ANY)* }
/// Everything up to the next "~~"
strikethrough_content = { (!"~~" ~ ANY)* }
/// Everything up to the next "__"
underline_content     = { (!"__" ~ ANY)* }

// Backslash escapes
// Example: \* yields a literal asterisk
/// A backslash followed by one character that is neither a space, a tab nor a newline
escape_sequence = {
    "\\" ~ !whitespace ~ character
}

// Code spans
/// A backtick, any run of non-backtick characters (possibly empty), and a closing backtick
inline_code = {
    "`"
  ~ (!"`" ~ ANY)*
  ~ "`"
}

// List items
// Bulleted:  - item   or   * item
// Numbered:  1. item, 2. item, ...
/// Bullet marker, one space or tab, the rest of the line, and an optional line ending
unordered_list_item = {
    ("*" | "-") ~ whitespace
  ~ (!NEWLINE ~ ANY)*
  ~ NEWLINE?
}
/// One or more ASCII digits, a dot, one space or tab, the rest of the line, and an optional line ending
ordered_list_item = {
    ASCII_DIGIT+ ~ "." ~ whitespace
  ~ (!NEWLINE ~ ANY)*
  ~ NEWLINE?
}

// Whole lists: a run of one or more consecutive items of the same kind
/// Bulleted list
document_unordered_list = { unordered_list_item ~ unordered_list_item* }
/// Numbered list
document_ordered_list   = { ordered_list_item ~ ordered_list_item* }

// Horizontal rule
// Example: ---, ***, ___
/// Exactly three "-", "*" or "_", optional trailing spaces/tabs, then a line ending or end of input
thematic_break = {
    ("___" | "***" | "---")
  ~ whitespace*
  ~ (NEWLINE | EOI)
}

// Ordinary text characters
/// Any character other than a newline, an ASCII digit, or one of: * _ ~ [ ! \ # ` -
plain_char = {
    !( NEWLINE
     | ASCII_DIGIT
     | "*" | "_" | "~"
     | "[" | "!"
     | "\\" | "#" | "`" | "-"
     )
  ~ ANY
}

// Headings, levels 1 to 3
// Example: # Heading 1, ## Heading 2, ### Heading 3
// Every level requires a space or tab directly after its run of "#", so a
// longer run of "#" is never accepted as a shorter level.
document_heading = _{ h1_heading | h2_heading | h3_heading }

/// Level 1: "#", one space or tab, the rest of the line, optional line ending
h1_heading = {
    "#" ~ whitespace
  ~ (!NEWLINE ~ ANY)*
  ~ NEWLINE?
}
/// Level 2: "##", one space or tab, the rest of the line, optional line ending
h2_heading = {
    "##" ~ whitespace
  ~ (!NEWLINE ~ ANY)*
  ~ NEWLINE?
}
/// Level 3: "###", one space or tab, the rest of the line, optional line ending
h3_heading = {
    "###" ~ whitespace
  ~ (!NEWLINE ~ ANY)*
  ~ NEWLINE?
}

// Blockquotes
// Example:
// > This is a quote
// > Second line
/// One or more consecutive quoted lines
document_quote = { quote_line ~ quote_line* }
/// ">", at most one space or tab, then either a line of inline content or a bare newline
quote_line = {
    ">" ~ whitespace?
  ~ (paragraph_text | blank_line)
}

// Fenced code blocks with an optional language tag
// Example:
// ```rust
// println!("Hello");
// ```
//
// Layout: opening fence, optional info line (language tag, optional spaces/tabs,
// newline), the body, an optional newline, the closing fence, and an optional
// trailing newline.
// The info line is matched as a unit: if anything other than spaces/tabs follows
// the tag before the newline, the optional group is skipped and that text is
// parsed as part of code_body instead.
code_fence = { "```" ~ (language_spec ~ whitespace* ~ NEWLINE)? ~ code_body ~ NEWLINE? ~ "```" ~ NEWLINE? }
// Language tags are not restricted to letters: digits and the punctuation found
// in common language names are accepted, so tags such as python3, c++, c#,
// objective-c or x86_64 are recognised as the tag rather than falling through
// into code_body. Leading spaces/tabs are part of the match.
language_spec = { whitespace* ~ (ASCII_ALPHANUMERIC | "+" | "-" | "#" | "." | "_")+ }
// At least one character, ending just before the closing fence or before the
// single newline that directly precedes it.
code_body = { (!(NEWLINE? ~ "```") ~ ANY)+ }

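// Horizontal rules (thematic breaks) are handled by the `thematic_break` rule,
// which is defined earlier in this file, after the list rules.
// Example: ---, ***, ___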

// Paragraphs
/// A paragraph: one or more consecutive lines of inline content
document_paragraph = {
    paragraph_text+
}
/// One line of a paragraph: at least one inline element, then an optional line ending
paragraph_text = {
    inline_content+
  ~ line_break?
}
/// Line ending that closes a paragraph line (silent rule)
line_break = _{
    NEWLINE
}

// Empty lines
/// A bare newline
blank_line = {
    NEWLINE
}

// All possible inline elements in text
// Silent rule: it produces no token of its own, only the token of whichever
// alternative matched. Alternatives are tried top to bottom and the first one
// that matches wins, so the order below is significant:
// - image is tried before link, text_formatting and inline_code
// - plain_text is tried last, after every alternative that starts with a marker
inline_content = _{
    image
  | link
  | text_formatting
  | inline_code
  | escape_sequence
  | plain_text
}

// Inline formatting alternatives (silent rule), tried top to bottom; the first
// alternative that matches wins.
// underline_formatting ("__...__") must be tried before italic_formatting: the
// "_" branch of italic_formatting accepts empty content, so it would otherwise
// consume the opening "__" as an empty italic span and an underline span could
// never be produced.
// bold_formatting ("**...**") stays ahead of italic_formatting for the same
// reason with "*".
text_formatting = _{
    bold_formatting
  | underline_formatting
  | italic_formatting
  | strikethrough_formatting
  | image
  | link
}

// Runs of ordinary text
/// One or more plain characters, emitted as a single token (atomic rule)
plain_text = @{ plain_char ~ plain_char* }

// Main document structure
// Entry rule: a sequence of blocks, each followed by any number of newlines.
// - Newlines at the very start of the input are skipped. No block rule can
//   start on a newline, so without this a document beginning with a blank
//   line would produce no blocks at all.
// - No separate trailing `document_block?` is needed: the repetition already
//   consumes every block that can match, so such an alternative could never
//   match anything.
// - EOI is optional: input that no block rule can match is left unconsumed
//   instead of being reported as a parse error.
document_structure = { SOI ~ NEWLINE* ~ (document_block ~ NEWLINE*)* ~ EOI? }

// All possible document blocks
// Alternatives are tried top to bottom and the first one that matches wins,
// so the order below is significant:
// - document_heading is silent, so a heading appears inside document_block
//   directly as h1_heading, h2_heading or h3_heading
// - the list rules are tried before thematic_break; a list item needs a space
//   or tab right after its marker, so "---" and "***" do not match as list items
// - document_paragraph is tried last and acts as the fallback
document_block = {
    document_heading
  | document_quote
  | code_fence
  | document_unordered_list
  | document_ordered_list
  | thematic_break
  | document_paragraph
}