//
// parse/lexer.pest
//
// ftml - Library to parse Wikidot text
// Copyright (C) 2019-2026 Wikijump Team
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// Although pest is a parser generator, we use it here only as a lexer,
// i.e. only to split the input into tokens.
//
// The original pest grammar was exhaustive, but because a formal grammar
// rejects any input it cannot match, it would error as opposed to
// performing fallback logic.
// Meta {{{
// Complete input from start to finish: zero or more tokens between
// start-of-input (SOI) and end-of-input (EOI).
// The rule is silent (`_`), so it adds no pair of its own to the output.
document = _{ SOI ~ token* ~ EOI }
// An individual token.
//
// This is a silent rule: only the alternative that matched shows up in the output.
//
// Alternatives are tried top to bottom and the first match wins, so
// order determines priority (earlier = higher). In particular, a rule whose
// text begins with another rule's text must be listed before that rule.
token = _{
// Raw should have the highest priority
raw |
left_raw |
right_raw |
// Comments
left_comment |
right_comment |
// Text-like
// url and email come before identifier, which would otherwise
// consume their leading alphanumeric characters.
url |
email |
identifier |
variable |
double_quote |
escaped_double_quote |
escaped_backslash |
// Special case to handle those pesky "[[[[" and "]]]]"s:
// a "[[[" ... "]]]" triple link wrapped in literal "[" ... "]" brackets.
// Each sequence yields its two component tokens.
(left_bracket ~ left_link) |
(right_link ~ right_bracket) |
// Brackets
// Longest first: "[[[*" before "[[[", the "[[$" / "[[#" / "[[*" / "[[/"
// forms before "[[", and "[#" / "[*" before "[".
// Likewise "]]]" before "]]" before "]".
left_link_star |
left_link |
left_math |
left_block_anchor |
left_block_star |
left_block_end |
left_block |
left_bracket_anchor |
left_bracket_star |
left_bracket |
left_parens |
right_link |
right_math |
right_block |
right_bracket |
right_parens |
// Formatting
// These two-character markers come before the single-character
// symbols below (e.g. "__" before "_").
bold |
italics |
underline |
superscript |
subscript |
color |
left_monospace |
right_monospace |
// Tables
// The three-character column markers come before "||",
// which in turn comes before the single pipe below.
table_column_title |
table_column_right |
table_column_center |
table_column |
// Singular symbols
// The "<" / ">" clear-float variants come before plain clear_float,
// clear_float before double_tilde, and triple_dash before double_dash.
clear_float_left |
clear_float_right |
clear_float |
triple_dash |
double_dash |
double_tilde |
left_double_angle |
pipe |
equals |
colon |
underscore |
quote |
heading |
bullet_item |
numbered_item |
// Whitespace
// paragraph_break (two or more newlines) comes before line_break (one newline).
paragraph_break |
line_break |
space |
// Generic fallback after all other rules have been tried
other
}
// }}}
// Text {{{
// A run of one or more ASCII letters and digits.
// ASCII_ALPHANUMERIC already covers 0-9, so no separate digit alternative is needed.
identifier = @{ ASCII_ALPHANUMERIC+ }
// Loose e-mail matcher of the shape <local> "@" <host> "." <rest>.
// Every part is one or more characters, none of which may be a space,
// tab or newline. The local part additionally stops at the first "@",
// and the host part at the first ".".
email = @{
    (!("@" | " " | "\t" | NEWLINE) ~ ANY)+ ~ "@" ~
    (!("." | " " | "\t" | NEWLINE) ~ ANY)+ ~ "." ~
    (!(" " | "\t" | NEWLINE) ~ ANY)+
}
// An http, https or ftp URL: the scheme, "://", then one or more characters
// up to (not including) the first newline, space, tab, double quote, pipe
// or square bracket.
//
// Tab is excluded along with space so that, as in the `email` and `space`
// rules, it always ends the URL instead of being absorbed into it.
url = @{
    (("http" ~ "s"?) | "ftp") ~ "://" ~
    (!(NEWLINE | " " | "\t" | "\"" | "|" | "[" | "]") ~ ANY)+
}
// }}}
// Symbols {{{
// Raw text delimiters
raw = @{ "@@" }
left_raw = @{ "@<" }
right_raw = @{ ">@" }

// Comment delimiters
left_comment = @{ "[!--" }
right_comment = @{ "--]" }

// Opening brackets: single "[", double "[[" (block) and triple "[[[" (link),
// each with its suffixed variants
left_bracket = @{ "[" }
left_bracket_anchor = @{ "[#" }
left_bracket_star = @{ "[*" }
left_block = @{ "[[" }
left_block_end = @{ "[[/" }
left_block_anchor = @{ "[[#" }
left_block_star = @{ "[[*" }
left_math = @{ "[[$" }
left_link = @{ "[[[" }
left_link_star = @{ "[[[*" }

// Closing brackets
right_bracket = @{ "]" }
right_block = @{ "]]" }
right_math = @{ "$]]" }
right_link = @{ "]]]" }

// Double parentheses
left_parens = @{ "((" }
right_parens = @{ "))" }

// Clear float: a run of three or more tildes, optionally followed by "<" or ">"
// NOTE(review): Wikidot appears to document this syntax with four tildes
// ("~~~~"); confirm that a minimum of three is intended.
clear_float = @{ "~"{3,} }
clear_float_left = @{ "~"{3,} ~ "<" }
clear_float_right = @{ "~"{3,} ~ ">" }

// Dashes and tildes: three or more dashes, exactly two dashes, exactly two tildes
triple_dash = @{ "-"{3,} }
double_dash = @{ "--" }
double_tilde = @{ "~~" }

// Remaining fixed symbols
left_double_angle = @{ "<<" }
pipe = @{ "|" }
equals = @{ "=" }
colon = @{ ":" }
underscore = @{ "_" }

// One or more ">" characters
quote = @{ ">"+ }

// One to six "+" characters, optionally followed by a single "*"
// (the "*" is only taken when it is not itself followed by another "*")
heading = @{ "+"{1,6} ~ ("*" ~ !"*")? }
// }}}
// Formatting {{{
// Inline formatting markers. Each is a fixed two-character sequence.
// The same marker is used on both sides of the formatted text ...
bold = @{ "**" }
italics = @{ "//" }
underline = @{ "__" }
superscript = @{ "^^" }
subscript = @{ ",," }
color = @{ "##" }
// ... except monospace, which has distinct left and right markers.
left_monospace = @{ "{{" }
right_monospace = @{ "}}" }
// }}}
// Lists {{{
// A single "*" that is not immediately followed by another "*".
// The lookahead does not consume the following character.
bullet_item = @{ "*" ~ !"*" }
// A single "#" that is not immediately followed by another "#".
numbered_item = @{ "#" ~ !"#" }
// }}}
// Tables {{{
// Table cell delimiters: a plain "||", or "||" followed by a one-character
// modifier (">" right, "=" center, "~" title).
//
// NOTE: "table_column_left" ("||<") is not a real token,
// it's not supported in Wikidot and adding it can
// cause parsing issues. See WJ-1095.
table_column = @{ "||" }
table_column_right = @{ "||>" }
table_column_center = @{ "||=" }
table_column_title = @{ "||~" }
// }}}
// Variable {{{
// A variable reference: "{$", then an identifier, then "}" (e.g. {$name}).
// The whole reference is matched as one atomic token.
variable = @{ "{$" ~ identifier ~ "}" }
// }}}
// String {{{
// A literal double quote character: "
double_quote = @{ "\"" }
// A backslash immediately followed by a double quote: \"
escaped_double_quote = @{ "\\" ~ double_quote }
// Two consecutive backslashes: \\
escaped_backslash = @{ "\\\\" }
// }}}
// Misc {{{
// A single newline
line_break = @{ NEWLINE }
// Two or more consecutive newlines
paragraph_break = @{ NEWLINE{2,} }
// One or more spaces and/or tabs
space = @{ (" " | "\t")+ }
// Fallback: exactly one character of any kind.
// Runs of these are to be consolidated in code.
//
// Matching a single character at a time means pest matches ANY lazily,
// permitting other rules to be tried again at the next position,
// as opposed to greedily, where the first unusual token would
// turn the rest of the input into a big "other".
other = @{ ANY }
// }}}
// vim: set fdm=marker foldlevel=0: