; sile 0.15.2
;
; Simon’s Improved Layout Engine
; Formal grammar specification for SIL (SILE Input Language) files
;
; Uses RFC 5234 (Augmented BNF for Syntax Specifications: ABNF)
; Uses RFC 7405 (Case-Sensitive String Support in ABNF)

; IMPORTANT CAVEAT:
; Backus-Naur Form grammars (like ABNF and EBNF) do not have a way to
; express matching opening and closing tags. The grammar below does
; not express SILE's ability to skip over passthrough content until
; it hits the matching closing tag for environments.

; A master document can only have one top-level content item, but fragments
; may be loaded as well, and those can have any number of top-level content
; items. Hence a valid input is any number of content items (including none).
document = *content

; A single content item is exactly one of the following constructs.
content = environment
        / comment
        / text
        / braced-content
        / command

; Environments come in two flavors, passthrough (raw) and regular. They differ
; in what is allowed to terminate them and in which escapes are needed for the
; content in the middle.
environment =  %s"\begin" [ options ] "{" passthrough-command-id "}"
               env-passthrough-text
               %s"\end{" passthrough-command-id "}"
               ;         ^^^^^^^^^^^^^^^^^^^^^^
               ; End command must match id used in begin, see caveat at top
; A regular environment wraps any number of content items (possibly none),
; e.g. text followed by a command followed by more text.
environment =/ %s"\begin" [ options ] "{" command-id "}"
               *content
               %s"\end{" command-id "}"
               ;         ^^^^^^^^^^
               ; End command must match id used in begin, see caveat at top

; Passthrough (raw) environments can have any valid UTF-8 except the closing
; delimiter matching the opening, per the environment rule.
; NOTE: as written, the rule below accepts every UTF-8 character; the
; "except the closing delimiter" restriction is not encoded in it (see the
; caveat at the top of this file).
env-passthrough-text = *utf8-char

; A comment runs from "%" to the end of the line. It contributes nothing to
; the output, but it is potentially important because it eats the newline.
; NOTE(review): CRLF is the RFC 5234 core rule (CR followed by LF), and
; utf8-char as defined in this file also covers CR and LF. Confirm whether a
; bare LF terminator, or an unterminated comment at end of input, should also
; be accepted here.
comment = "%" *utf8-char CRLF

; A run of input characters that are not special syntax, including
; backslash-escaped special characters (see text-char).
; Note that the repetition also permits an empty match.
text = *text-char

; Input content wrapped in braces can be attached to a command or used to
; manually isolate chunks of content (e.g. to hinder ligatures). The braces
; may enclose any number of content items, including none.
braced-content = "{" *content "}"

; A command is a backslash, an id, optional options and an optional braced
; argument. As with environments, the argument is either passthrough (raw)
; text or more SIL content, depending on the command id.
command = "\" ( passthrough-command-id [ options ] [ braced-passthrough-text ]
              / command-id [ options ] [ braced-content ] )

; The raw argument of a passthrough command: any valid UTF-8 in which braces
; are balanced. Nested brace pairs recurse into this same rule.
braced-passthrough-text = "{" *( braced-passthrough-text / braced-passthrough-char ) "}"

; Any single UTF-8 character other than the two braces.
braced-passthrough-char = %x00-7A ; stops short of "{" (%x7B)
                        / %x7C    ; sits between the braces; skips "}" (%x7D)
                        / %x7E-7F ; rest of utf8-1
                        / utf8-2
                        / utf8-3
                        / utf8-4

; Options are a bracketed, comma-separated list of one or more parameters.
options = "[" parameter *( "," parameter ) "]"
; Each parameter is a key=value pair. Whitespace is allowed around both the
; key and the value, and the value may be double-quoted or bare.
parameter = *WSP identifier *WSP "=" *WSP ( quoted-value / value ) *WSP

; A double-quoted option value. Inside the quotes a literal double quote is
; written as \" ; any other UTF-8 character is accepted as-is.
quoted-value = DQUOTE *quoted-value-char DQUOTE
quoted-value-char = "\" %x22 ; escaped double quote
                  / %x00-21  ; stops short of DQUOTE (%x22)
                  / %x23-7F  ; rest of utf8-1
                  / utf8-2
                  / utf8-3
                  / utf8-4

; An unquoted option value: any UTF-8 except the three characters that are
; significant inside an options list, namely the double quote (%x22), the
; comma separating parameters (%x2C) and the closing bracket (%x5D).
; Each range below resumes at the code point right after the omitted one, so
; digits, "-", "." and similar characters remain valid (e.g. width=10.5pt).
value = *value-char
value-char =  %x00-21 ; omit "
value-char =/ %x23-2B ; omit ,
value-char =/ %x2D-5C ; omit ]
value-char =/ %x5E-7F ; end of utf8-1
value-char =/ utf8-2
value-char =/ utf8-3
value-char =/ utf8-4

; One character of ordinary text: either a backslash escape of one of the four
; special characters, or any UTF-8 character that is not itself special.
text-char = "\" ( %x5C / %x25 / %x7B / %x7D ) ; escaped \ % { }
          / %x00-24 ; stops short of "%" (%x25)
          / %x26-5B ; stops short of "\" (%x5C)
          / %x5D-7A ; stops short of "{" (%x7B)
          / %x7C    ; sits between the braces; skips "}" (%x7D)
          / %x7E-7F ; rest of utf8-1
          / utf8-2
          / utf8-3
          / utf8-4

; Characters allowed at the start of an identifier (and anywhere after it).
letter = ALPHA / "_" / ":"
; An identifier starts with a letter; digits, "-" and "." may only follow.
identifier = letter *( letter / DIGIT / "-" / "." )
; The fixed, case-sensitive set of ids whose content is passthrough (raw).
passthrough-command-id = %s"ftl"
                       / %s"lua"
                       / %s"math"
                       / %s"raw"
                       / %s"script"
                       / %s"sil"
                       / %s"use"
                       / %s"xml"
; Any identifier can serve as a command id.
; NOTE(review): every passthrough id above also matches this rule, so the
; grammar by itself does not disambiguate the two flavors.
command-id = identifier

; ASCII isn't good enough for us: a character is one UTF-8 sequence of one to
; four bytes, with the lead byte constraining the continuation bytes.
utf8-char = utf8-1 / utf8-2 / utf8-3 / utf8-4
utf8-1    = %x00-7F
utf8-2    = %xC2-DF utf8-tail
utf8-3    = %xE0    %xA0-BF utf8-tail
          / %xE1-EC 2( utf8-tail )
          / %xED    %x80-9F utf8-tail
          / %xEE-EF 2( utf8-tail )
utf8-4    = %xF0    %x90-BF 2( utf8-tail )
          / %xF1-F3 3( utf8-tail )
          / %xF4    %x80-8F 2( utf8-tail )
utf8-tail = %x80-BF