// BEGIN of GENERIC RULES
// Insignificant inline whitespace: exactly one space or one tab. The rule is
// silent and does not repeat; callers add their own `*` or `+`.
ws = _{ "\t" | " " }
// Keys that are accepted inside a metadata comment.
meta_attr_key = ${
    "name" | "version" | "overlay-set"
}
// A metadata value: either a quoted `string` or a run of `char`.
meta_attr_value = ${
    string | char+
}
// One `key = value` pair; argument whitespace is optional around "=".
// NOTE(review): the rule is atomic (`@`), so pest emits no separate pairs for
// the key and the value inside it -- confirm the consumer reads the raw text.
meta_key_pair = @{
    meta_attr_key ~ arg_ws? ~ "=" ~ arg_ws? ~ meta_attr_value
}
// The `--` marker, optional blanks, then zero or more key/value pairs.
// Nothing is consumed between two consecutive pairs, so a second pair has to
// begin exactly where the previous value ended.
meta_comment = @{
    "--" ~ ws* ~ (!NEWLINE ~ meta_key_pair)*
}
// A metadata comment on its own, optionally indented, line. The trailing
// line break is optional.
meta_comment_line = _{
    ws* ~ meta_comment ~ NEWLINE?
}
// A `#` followed by everything up to, but not including, the line break.
comment = @{
    "#" ~ (!NEWLINE ~ ANY)*
}
// A comment that may be indented; the line break after it is optional.
comment_line = _{
    ws* ~ comment ~ NEWLINE?
}
// A line that holds only blanks. Unlike `comment_line`, the line break is
// mandatory here.
empty_line = @{
    ws* ~ NEWLINE
}
/// Double quoted string
double_quoted_string = _{
    "\"" ~ inner ~ "\""
}
// Body of a double quoted string: characters other than `"`, `\`, U+0000 and
// U+001F, optionally followed by an escape and more body (recursively).
// NOTE(review): the two code points are listed individually, not as a range;
// if all control characters were meant to be excluded, confirm and use a range.
inner = @{
    (!("\"" | "\\" | "\u{0000}" | "\u{001F}") ~ ANY)*
    ~ (escape ~ inner)?
}
// A backslash, optionally followed by one of the listed escape letters, a
// quote, a backslash, a unicode escape or a line break. The part after the
// backslash is optional, so a lone backslash also matches.
escape = @{
    "\\"
    ~ ("b" | "t" | "n" | "f" | "r" | "\"" | "\\" | "'" | unicode | NEWLINE)?
}
// `u` plus four hex digits, or `U` plus eight hex digits.
unicode = @{
    "u" ~ ASCII_HEX_DIGIT{4}
  | "U" ~ ASCII_HEX_DIGIT{8}
}
// Single quoted string; same structure as the double quoted variant.
single_quoted_string = _{
    "'" ~ single_quoted_inner ~ "'"
}
// Body of a single quoted string: like `inner`, but the excluded quote
// character is `'` instead of `"`.
single_quoted_inner = @{
    (!("'" | "\\" | "\u{0000}" | "\u{001F}") ~ ANY)*
    ~ (escape ~ single_quoted_inner)?
}
// Any quoted string; the single quoted form is tried first.
string = {
    single_quoted_string | double_quoted_string
}
// Line continuation: a backslash, optional blanks, then the line break.
// It allows an instruction to carry on onto the next line.
line_continuation = _{
    "\\" ~ ws* ~ NEWLINE
}
// Separator between instruction arguments: at least one blank or line
// continuation. After a continuation any number of comment lines and empty
// lines are skipped too, which lets an escaped instruction span lines.
arg_ws = _{
    (
        ws
      | line_continuation ~ (comment_line | empty_line)*
    )+
}
// Same separator as arg_ws, except that matching nothing is allowed.
arg_ws_maybe = _{ arg_ws? }
// continues consuming input beyond a newline, if the newline is preceded by an
// escape (\)
// these tokens need to be preserved in the final tree so they can be handled
// appropriately; pest's ignore rules aren't sufficient for our needs
// One or more characters of ordinary content. Matching stops in front of a
// line break and in front of a line continuation.
any_content = @{
    (!(NEWLINE | line_continuation) ~ ANY)+
}
// Content that may stretch over several lines. Two shapes are accepted:
//   1. a comment line, optionally followed by more breakable content; no
//      line continuation is needed after a comment;
//   2. a piece of content; more breakable content may follow only after a
//      line continuation, so the final line needs none.
any_breakable = ${
    comment_line ~ any_breakable?
  | any_content ~ (line_continuation ~ any_breakable)?
}
// Everything up to the end of the current line; may match nothing.
any_eol = _{
    (!NEWLINE ~ ANY)*
}
// One or more characters, stopping in front of a line break, the end of
// input, or argument whitespace.
until_whitespace = _{
    (!NEWLINE ~ !EOI ~ !arg_ws ~ ANY)+
}
// One or more identifier characters: ASCII letters and digits, "_" and "-".
// The `!ws` guard never rejects anything the character class would accept.
identifier_whitespace = _{
    (!ws ~ (ASCII_ALPHANUMERIC | "_" | "-"))+
}
// One or more characters, stopping in front of a line break, a blank or "="
// (used for the key of a key=value pair).
any_equals = _{
    (!(NEWLINE | ws | "=") ~ ANY)+
}
// END of GENERIC RULES
// One instruction (from, add, modify or remove), optionally terminated by a
// line break. Silent: only the matched instruction shows up in the tree.
commands = _{
    (from | add | modify | remove) ~ NEWLINE?
}
// `from <reference>`. The keyword is matched case-insensitively; only plain
// blanks (no line continuation) may precede the reference.
from = {
    ^"from" ~ ws* ~ from_ref
}
// `add <object>`. The keyword is matched case-insensitively and may be
// followed by any amount of argument whitespace, including none.
add = {
    ^"add" ~ arg_ws* ~ oca_object
}
// `modify <target>`: case-insensitive keyword followed by a run of `char`.
// Argument whitespace is accepted between the keyword and its argument, the
// same way `add` and `remove` accept it. No implicit WHITESPACE rule is
// visible in this grammar, so without the explicit `arg_ws*` the argument had
// to be glued to the keyword ("modifyfoo") for the rule to match at all.
modify = { ^"modify" ~ arg_ws* ~ char+ }
// `remove <object>`. The keyword is matched case-insensitively and may be
// followed by any amount of argument whitespace, including none.
remove = {
    ^"remove" ~ arg_ws* ~ remove_oca_object
}
// One character belonging to any of the listed Unicode scripts (the
// upper-case names are pest's built-in script properties). Alternatives are
// grouped by initial letter; their relative order is unchanged.
// NOTE(review): this rule is not silent, so it can show up as a pair wherever
// `char` is used outside an atomic rule -- confirm that this is intended.
SCRIPTS = {
    ADLAM | AHOM | ANATOLIAN_HIEROGLYPHS | ARABIC | ARMENIAN | AVESTAN
  | BALINESE | BAMUM | BASSA_VAH | BATAK | BENGALI | BHAIKSUKI | BOPOMOFO
  | BRAHMI | BRAILLE | BUGINESE | BUHID
  | CANADIAN_ABORIGINAL | CARIAN | CAUCASIAN_ALBANIAN | CHAKMA | CHAM
  | CHEROKEE | CHORASMIAN | COPTIC | CUNEIFORM | CYPRIOT | CYPRO_MINOAN
  | CYRILLIC
  | DESERET | DEVANAGARI | DIVES_AKURU | DOGRA | DUPLOYAN
  | EGYPTIAN_HIEROGLYPHS | ELBASAN | ELYMAIC | ETHIOPIC
  | GEORGIAN | GLAGOLITIC | GOTHIC | GRANTHA | GREEK | GUJARATI
  | GUNJALA_GONDI | GURMUKHI
  | HAN | HANGUL | HANIFI_ROHINGYA | HANUNOO | HATRAN | HEBREW | HIRAGANA
  | IMPERIAL_ARAMAIC | INHERITED | INSCRIPTIONAL_PAHLAVI
  | INSCRIPTIONAL_PARTHIAN
  | JAVANESE
  | KAITHI | KANNADA | KATAKANA | KAWI | KAYAH_LI | KHAROSHTHI
  | KHITAN_SMALL_SCRIPT | KHMER | KHOJKI | KHUDAWADI
  | LAO | LATIN | LEPCHA | LIMBU | LINEAR_A | LINEAR_B | LISU | LYCIAN
  | LYDIAN
  | MAHAJANI | MAKASAR | MALAYALAM | MANDAIC | MANICHAEAN | MARCHEN
  | MASARAM_GONDI | MEDEFAIDRIN | MEETEI_MAYEK | MENDE_KIKAKUI
  | MEROITIC_CURSIVE | MEROITIC_HIEROGLYPHS | MIAO | MODI | MONGOLIAN | MRO
  | MULTANI | MYANMAR
  | NABATAEAN | NAG_MUNDARI | NANDINAGARI | NEW_TAI_LUE | NEWA | NKO | NUSHU
  | NYIAKENG_PUACHUE_HMONG
  | OGHAM | OL_CHIKI | OLD_HUNGARIAN | OLD_ITALIC | OLD_NORTH_ARABIAN
  | OLD_PERMIC | OLD_PERSIAN | OLD_SOGDIAN | OLD_SOUTH_ARABIAN | OLD_TURKIC
  | OLD_UYGHUR | ORIYA | OSAGE | OSMANYA
  | PAHAWH_HMONG | PALMYRENE | PAU_CIN_HAU | PHAGS_PA | PHOENICIAN
  | PSALTER_PAHLAVI
  | REJANG | RUNIC
  | SAMARITAN | SAURASHTRA | SHARADA | SHAVIAN | SIDDHAM | SIGNWRITING
  | SINHALA | SOGDIAN | SORA_SOMPENG | SOYOMBO | SUNDANESE | SYLOTI_NAGRI
  | SYRIAC
  | TAGALOG | TAGBANWA | TAI_LE | TAI_THAM | TAI_VIET | TAKRI | TAMIL
  | TANGSA | TANGUT | TELUGU | THAANA | THAI | TIBETAN | TIFINAGH | TIRHUTA
  | TOTO
  | UGARITIC
  | VAI | VITHKUQI
  | WANCHO | WARANG_CITI
  | YEZIDI | YI
  | ZANABAZAR_SQUARE
}
// A single character allowed in unquoted names and values: any letter, any
// number, one of `. - _ / :`, or a character matched by SCRIPTS.
char = _{
    LETTER | NUMBER
  | "." | "-" | "_" | "/" | ":"
  | SCRIPTS
}
// Indentation: one or more spaces or tabs, kept as a single atomic token.
indent = @{ ws+ }
// Target of a `from` instruction: a SAID reference or an alias reference.
from_ref = _{ reference_type }
// What an `add` instruction can introduce: a capture base or an overlay.
oca_object = _{
    capture_base | overlay
}
// What a `remove` instruction can take away: attributes or an overlay.
remove_oca_object = _{
    remove_attribute | remove_overlay
}
// The case-insensitive keyword `attrs` together with the mandatory argument
// whitespace that follows it.
attrs_key = _{
    ^"attrs" ~ arg_ws
}
// A URL, written as a quoted string.
url = ${
    string
}
// Key of a JSON pair: always a quoted string.
json_key = ${
    string
}
// Value of a JSON pair. `string` is tried first, and `url` is itself just a
// `string`, so the `url` alternative can never be the one that matches.
json_value = ${
    string | url | json_object
}
// `key : value` with an optional trailing comma. One run of argument
// whitespace or a single line break is allowed after ":" and after ",".
json_pair = ${
    json_key ~ arg_ws* ~ ":" ~ (arg_ws | NEWLINE)?
    ~ json_value
    ~ (arg_ws? ~ "," ~ (arg_ws | NEWLINE)?)?
}
// `{ ... }` holding at least one pair; an empty object does not match.
json_object = ${
    "{"
    ~ (
        (arg_ws | NEWLINE)? ~ arg_ws*
        ~ json_pair
        ~ (arg_ws | NEWLINE)?
    )+
    ~ arg_ws* ~ "}"
}
// Capture base: the case-insensitive keyword `attribute` followed by one or
// more groups of attribute pairs.
capture_base = {
    ^"attribute" ~ attr_pairs+
}
// An overlay: a header line followed by its body.
overlay = {
    overlay_header ~ overlay_body
}
// `overlay <name>` terminated by at least one line break.
overlay_header = {
    ^"overlay" ~ arg_ws+ ~ overlay_name ~ NEWLINE+
}
// `attribute` followed by zero or more attribute keys, each one preceded by
// argument whitespace.
remove_attribute = {
    ^"attribute" ~ (arg_ws ~ attr_key)*
}
// Body of an overlay: one or more overlay lines.
overlay_body = {
    overlay_line+
}
// One line of an overlay body: optional indentation, then the line content.
overlay_line = { indent? ~ line_content }
// Content of an overlay line. `kv_pair` is tried first; a bare key followed
// by a line break is taken as a block header.
line_content = _{ kv_pair | block_header }
// A key alone on its line, terminated by at least one line break.
block_header = {
    attr_key ~ NEWLINE+
}
// `key = value`, with optional argument whitespace around "=" and an
// optional line break at the end.
kv_pair = {
    attr_key ~ arg_ws? ~ "=" ~ arg_ws?
    ~ key_value ~ NEWLINE?
}
// `label <lang>`, optionally followed by `attrs` and a list of attribute keys.
// Keys in the list may be separated by argument whitespace. An unquoted key is
// a greedy run of `char`, so with a bare `attr_key+` a second key could never
// start after a blank and `attrs name age` was rejected. The separator is
// optional so that keys written back to back (for example two adjacent quoted
// strings), which were accepted before, still match.
remove_overlay = {
    ^"label" ~ arg_ws ~ lang
    ~ (arg_ws ~ attrs_key ~ attr_key ~ (arg_ws? ~ attr_key)*)?
}
// Name of an overlay: a quoted string or a run of `char`.
overlay_name = ${
    string | char+
}
// An attribute key: a quoted string or a run of `char`.
attr_key = ${
    string | char+
}
// Right-hand side of a key/value pair. Alternatives are tried in this order:
// bare reference, quoted reference, quoted string, array, run of `char`.
key_value = {
    reference_type
  | quoted_reference
  | string
  | array_type
  | char+
}
// A reference wrapped in matching double or single quotes.
quoted_reference = _{
    "\"" ~ reference_type ~ "\""
  | "'" ~ reference_type ~ "'"
}
// `[ ... ]` with optional argument whitespace inside; the element list itself
// is optional, so `[]` matches.
array_type = _{
    "[" ~ arg_ws? ~ array? ~ arg_ws? ~ "]"
}
// Comma separated list of one or more values.
array = {
    key_value ~ (arg_ws? ~ "," ~ arg_ws? ~ key_value)*
}
// `key = value` without the trailing line break handling of `kv_pair`.
key_pair = ${
    attr_key ~ arg_ws? ~ "=" ~ arg_ws? ~ key_value
}
// The base attribute types; each keyword is matched case-insensitively.
base_attr_type = @{
    ^"Text" | ^"Numeric" | ^"Boolean" | ^"Binary" | ^"DateTime"
}
// An array type: `[` element `]`, where the element is a base type, a
// reference or another array type (so arrays can nest).
array_attr_type = ${
    "[" ~ arg_ws?
    ~ (base_attr_type | reference_type | array_attr_type)
    ~ arg_ws? ~ "]"
}
// A reference to another object, either by SAID or by alias.
reference_type = _{
    ref_said | ref_alias
}
// Alias part of a `refn:` reference: a run of `char`.
alias = @{
    char+
}
// SAID part of a `refs:` reference: a run of `char`.
said = @{
    char+
}
// Case-insensitive prefix introducing a SAID reference.
refs = _{ ^"refs:" }
// Case-insensitive prefix introducing an alias reference.
refn = _{ ^"refn:" }
// `refs:` immediately followed by the SAID.
ref_said = _{
    refs ~ said
}
// `refn:` immediately followed by the alias.
ref_alias = _{
    refn ~ alias
}
/// Type of the attribute, can be a base type, an array of base types or a reference to another object
attr_type = ${
    base_attr_type | array_attr_type | reference_type
}
// NOTE(review): attr_pair is atomic (`@`), so pest emits no separate pairs for
// the key and the type inside it -- confirm the consumer reads the raw text.
/// Capture base attribute key=type pair
attr_pair = @{
    attr_key ~ arg_ws? ~ "=" ~ arg_ws? ~ attr_type
}
// Every pair must be preceded by argument whitespace; any number of line
// breaks may follow the last pair of the group.
/// List of capture base attributes
attr_pairs = ${
    (arg_ws ~ attr_pair)+ ~ NEWLINE*
}
// Language tag: two ASCII letters, optionally followed by "-" and two more
// ASCII letters (for example `en` or `en-US`).
// TODO add support for lang ISO
lang = ${
    ASCII_ALPHA{2} ~ ("-" ~ ASCII_ALPHA{2})?
}
// Entry rule: the whole input, from start to end, is a sequence of empty
// lines, metadata comments, ordinary comments and instructions.
file = {
    SOI
    ~ (
        empty_line
      | meta_comment_line
      | comment_line
      | commands
    )*
    ~ EOI
}