// CDDL Pest Grammar
// Based on RFC 8610 Appendix B: ABNF Grammar
// This grammar defines the Concise Data Definition Language (CDDL)
//
// CDDL is a notation for expressing CBOR and JSON data structures.
// This Pest grammar file provides a complete implementation that can parse
// all valid CDDL constructs as defined in RFC 8610 and subsequent extensions.
//
// Grammar Organization:
// 1. Entry point and whitespace handling
// 2. Rules (type rules and group rules)
// 3. Type expressions and type choices
// 4. Operators (control, range, etc.)
// 5. Groups and group entries
// 6. Identifiers and socket/plug syntax
// 7. Values (numbers, strings, byte strings)
// 8. Standard prelude types
// =============================================================================
// ENTRY POINT AND WHITESPACE
// =============================================================================
// Entry point: a CDDL specification is zero or more rules, with optional
// whitespace and comments before, between, and after them
// (RFC 9682 Section 3.1)
cddl = {
    SOI ~ S
    ~ (rule ~ S)*
    ~ EOI
}
// Trivia rules. WHITESPACE, NEWLINE and S are silent (`_`) and leave no pairs
// in the parse tree; COMMENT is a normal rule, so matched comments do.
WHITESPACE = _{ "\n" | "\r" | "\t" | " " }
NEWLINE = _{ "\r\n" | "\n" }
// A comment starts at ";" and runs up to, but not including, the next NEWLINE
COMMENT = { ";" ~ (!NEWLINE ~ ANY)* }
// Explicit "optional trivia" helper used between tokens throughout the grammar
S = _{ (COMMENT | WHITESPACE)* }
// =============================================================================
// RULES
// =============================================================================
// Rules define types or groups
// Type rule: typename [<params>] = type
// Group rule: groupname [<params>] = grpent
// The type-rule alternative is tried first; the group-rule alternative is
// only attempted when the type-rule alternative fails to match (ordered
// choice), so the order of the two alternatives must not be swapped.
rule = { typename ~ generic_params? ~ S ~ assign_t ~ S ~ type_expr
| groupname ~ generic_params? ~ S ~ assign_g ~ S ~ group_entry }
// Assignment operator tokens
//   "="   defines a new rule
//   "/="  appends a type choice alternative
//   "//=" appends a group choice alternative
assign = { "=" }
assign_t_choice = { "/=" }
assign_g_choice = { "//=" }
// Operators accepted in a type rule and in a group rule, respectively.
// The alternatives of each choice start with different characters, so at
// most one of them can match at any position.
assign_t = { assign_t_choice | assign }
assign_g = { assign_g_choice | assign }
// =============================================================================
// GENERIC PARAMETERS AND ARGUMENTS
// =============================================================================
// Generic parameters: the angle-bracketed, comma-separated identifiers that a
// parameterized rule declares
// Example: map<K, V> = { * K => V }
generic_param = { id }
generic_params = {
    "<" ~ S ~ generic_param
    ~ (S ~ "," ~ S ~ generic_param)*
    ~ S ~ ">"
}
// Generic arguments: the angle-bracketed, comma-separated types supplied
// when a parameterized rule is referenced
// Example: my_map = map<text, int>
generic_arg = { type1 }
generic_args = {
    "<" ~ S ~ generic_arg
    ~ (S ~ "," ~ S ~ generic_arg)*
    ~ S ~ ">"
}
// =============================================================================
// TYPE EXPRESSIONS
// =============================================================================
// A type expression is one or more type choices joined by "/"
// Example: value = int / text / bool
type_choice_op = { "/" }
type_choice = { type1 }
type_expr = {
    type_choice
    ~ (S ~ type_choice_op ~ S ~ type_choice)*
}
// Type1: a type2, optionally followed by either a range operator and a second
// type2, or a control operator and its controller
// Example: port = 0..65535
// Example: email = tstr .regexp "[^@]+@[^@]+"
type1 = {
    type2 ~ S
    ~ (
        range_op ~ S ~ type2
      | control_op ~ S ~ controller
    )?
}
// =============================================================================
// OPERATORS
// =============================================================================
// Range operators define numeric or value ranges
//   ".."  inclusive range (e.g., 0..100)
//   "..." exclusive range (e.g., 0...100)
range_op_exclusive = { "..." }
range_op_inclusive = { ".." }
// The three-dot form has to be tried first: ".." is a prefix of "...", so the
// opposite order would consume only two of the three dots.
range_op = {
    range_op_exclusive
  | range_op_inclusive
}
// Control operators provide additional constraints on types
// Standard operators from RFC 8610:
// .size - constrains size of strings, arrays, or maps
// .bits - constrains bit strings
// .regexp/.pcre - pattern matching
// .cbor/.cborseq - embedded CBOR
// .within - subset constraint
// .and - conjunction constraint
// .lt/.le/.gt/.ge/.eq/.ne - numeric comparisons
// .default - default value
// Additional operators from RFC 9165 and RFC 9741:
// .cat/.det/.plus - string operations
// .abnf/.abnfb - ABNF grammar constraints
// .feature - feature-based selection
// .b64u/.b64c/.hex/.b32/.b45 - encoding constraints
// .printf/.json/.join - formatting operations
control_op = { "." ~ control_name }
// Ordered choice: the first alternative that matches wins, so any name that
// begins with another name in the list must be listed before it
// ("cborseq" before "cbor", "abnfb" before "abnf", "b64u-sloppy" before
// "b64u", "hexuc"/"hexlc" before "hex"). Otherwise the shorter name matches
// and the remaining characters are left over for the controller.
control_name = { "size" | "bits" | "regexp" | "pcre" | "cborseq" | "cbor"
| "within" | "and" | "lt" | "le" | "gt" | "ge" | "eq" | "ne"
| "default" | "cat" | "det" | "plus" | "abnfb" | "abnf"
| "feature" | "b64u-sloppy" | "b64c-sloppy" | "b64u" | "b64c"
| "hexuc" | "hexlc" | "hex" | "base10" | "printf" | "json" | "join"
| "b32" | "h32" | "b45" }
// The right-hand operand of a control operator
controller = { type2 }
// =============================================================================
// TYPE2: PRIMARY TYPE EXPRESSIONS
// =============================================================================
// Type2 represents the core type constructs:
// - Literal values (numbers, strings)
// - Named types (with optional generic arguments)
// - Parenthesized type expressions
// - Map types: { group }
// - Array types: [ group ]
// - Unwrapped types: ~typename
// - Group-to-choice enumeration: &(group) or &groupname
// - Tagged types: #6.32(type) or #6.<type>
// Alternatives are tried top to bottom and the first match wins. `value` is
// listed ahead of `typename` so that a hex byte string such as h'01' is read
// as a literal rather than as the identifier "h".
type2 = { value
| typename ~ generic_args?
| "(" ~ S ~ type_expr ~ S ~ ")"
| "{" ~ S ~ group ~ S ~ "}"
| "[" ~ S ~ group ~ S ~ "]"
| "~" ~ S ~ typename ~ generic_args?
| "&" ~ S ~ "(" ~ S ~ group ~ S ~ ")"
| "&" ~ S ~ groupname ~ generic_args?
| tag_expr }
// Tag expressions define CBOR tags
// Examples:
// #6.32(tstr) - tag 32 with text string
// #6.<typename> - tag with type constraint
// #1.5 - major type 1, additional info 5
// The value after the "." is either a literal unsigned integer or a type
// expression in angle brackets
tag_value = { uint_value | "<" ~ S ~ type_expr ~ S ~ ">" }
// Shape: "#", then an optional single-digit major type with an optional
// ".value" suffix, then an optional parenthesized content type. A bare "#"
// therefore matches as well.
tag_expr = {
    "#"
    ~ (DIGIT ~ ("." ~ tag_value)?)?
    ~ ("(" ~ S ~ type_expr ~ S ~ ")")?
}
// =============================================================================
// GROUPS
// =============================================================================
// Groups define collections of entries (for maps and arrays)
// A group is one or more group choices separated by "//"
// Example: ( name: text // id: int )
group_choice_op = { "//" }
// A group choice may be empty. Otherwise it is a list of entries in which the
// separating commas are optional and a single trailing comma is allowed.
group_choice = {
    (
        group_entry
        ~ (S ~ ","? ~ S ~ group_entry)*
        ~ (S ~ ",")?
    )?
}
group = {
    group_choice
    ~ (S ~ group_choice_op ~ S ~ group_choice)*
}
// =============================================================================
// GROUP ENTRIES
// =============================================================================
// Group entries define the contents of maps and arrays
// Supported forms:
// - Member with key: key: type (for maps with bareword keys)
// - Member with arrow: key [^] => type (for computed keys, optional cut)
// - Occurrence with type: ? type, * type, + type, n*m type
// - Named group reference: groupname
// - Nested group: ( group )
// - Cut entries: key ^ => type (prevents backtracking)
// Alternatives are tried in the order listed and the first match wins.
// The keyed forms come first so that "key => type" and "key: type" are not
// consumed as a bare type. The final `groupname` alternative is only reached
// when `type_expr` cannot match (e.g. a "$$"-prefixed group socket), because
// a plain identifier is already accepted by `type_expr`.
group_entry = { occur? ~ S ~ member_key ~ S ~ cut? ~ S ~ "=>" ~ S ~ type_expr
| occur? ~ S ~ member_key ~ S ~ ":" ~ S ~ type_expr
| occur? ~ S ~ "(" ~ S ~ group ~ S ~ ")"
| occur? ~ S ~ type_expr
| occur? ~ S ~ groupname ~ generic_args? }
// Cut marker (^) prevents backtracking past this point in validation
cut = { "^" }
// Member keys can be barewords, type names, or literal values
// `bareword` matches any plain identifier, so the `typename` alternative is
// only reached for "$"-prefixed names.
member_key = { bareword | typename ~ generic_args? | value }
// =============================================================================
// OCCURRENCE INDICATORS
// =============================================================================
// Occurrence indicators specify how many times an entry can appear
//   ?    optional (0 or 1)
//   *    zero or more
//   +    one or more
//   n*   n or more (lower bound only; the rule name `occur_exact` is
//        historical and does not mean "exactly n")
//   *m   at most m
//   n*m  between n and m times
// Every sub-rule is atomic so that no implicit whitespace is skipped inside
// an indicator, in particular before the negative lookahead (!DIGIT).
occur_optional = @{ "?" }
occur_one_or_more = @{ "+" }
occur_zero_or_more = @{ "*" ~ !DIGIT }
occur_exact = @{ uint_value ~ "*" ~ !DIGIT }
// Catch-all: both bounds are optional, so this also covers "n*m" and "*m"
occur_range = @{ uint_value? ~ "*" ~ uint_value? }
// The specific forms are tried before the catch-all `occur_range`
occur = {
    occur_exact
  | occur_zero_or_more
  | occur_one_or_more
  | occur_optional
  | occur_range
}
// =============================================================================
// IDENTIFIERS
// =============================================================================
// Character classes used by identifiers
ALPHA = { 'A'..'Z' | 'a'..'z' }
DIGIT = { '0'..'9' }
// EALPHA_START: first character of an identifier (a letter, "@" or "_";
// "$" is excluded here)
EALPHA_START = { ALPHA | "@" | "_" }
// EALPHA: identifier characters after the first; additionally admits "$"
EALPHA = { ALPHA | "@" | "_" | "$" }
// Identifiers for types, groups, and barewords
// After the first character an identifier continues with letters, digits,
// "@", "_" or "$", each optionally preceded by a single "-" or ".". It can
// therefore neither end in "-" or "." nor contain two of them in a row.
id = @{
    EALPHA_START
    ~ (("-" | ".")? ~ (EALPHA | DIGIT))*
}
bareword = @{ id }
// Socket/plug syntax for extensibility
//   $typename   - type socket (single $)
//   $$groupname - group socket (double $$)
socket_type = { "$" }
socket_group = { "$$" }
typename = { socket_type? ~ id }
groupname = { socket_group? ~ id }
// =============================================================================
// VALUES
// =============================================================================
// A literal value is a number, a text string, or a byte string
value = {
    number
  | text_value
  | bytes_value
}
// =============================================================================
// NUMBERS
// =============================================================================
// Numeric values: integers, floats, and hexfloats
// Examples: 42, -10, 3.14, 1.5e10, 1e10, 0x1.5p10
// Ordered choice: the more specific forms are tried before the plain
// integer forms so that a leading integer part is not matched on its own.
number = { hexfloat | float_value | int_value | uint_value }
uint_value = @{ DIGIT+ }
int_value = @{ "-" ~ DIGIT+ }
// A float is an integer part followed by a fraction, an exponent, or both.
// The exponent-only form (e.g. 1e10) is accepted so that such a literal is
// not split into the integer 1 followed by the identifier "e10".
float_value = @{
    "-"? ~ DIGIT+ ~ (
        "." ~ DIGIT+ ~ (^"e" ~ ("+" | "-")? ~ DIGIT+)?
      | ^"e" ~ ("+" | "-")? ~ DIGIT+
    )
}
// NOTE(review): the "p" exponent is optional here, so a plain hex integer
// such as 0x1F is matched by this rule; RFC 8610 requires the exponent on a
// hexfloat and treats 0x... without it as an unsigned integer - confirm that
// this classification is intended.
hexfloat = @{ "-"? ~ "0x" ~ ASCII_HEX_DIGIT+ ~ ("." ~ ASCII_HEX_DIGIT+)? ~ ("p" ~ ("+" | "-")? ~ DIGIT+)? }
// =============================================================================
// TEXT VALUES (STRINGS)
// =============================================================================
// Text strings: double-quoted, with backslash escape sequences
// Accepted escapes (JSON escapes plus the RFC 9682 \u{hex} form):
// \", \\, \/, \b, \f, \n, \r, \t, \uXXXX, \uHHHH\uLLLL, \u{hex}
escape_sequence = @{
    "\\" ~ (
        "\"" | "\\" | "/" | "b" | "f" | "n" | "r" | "t"
      | ("u" ~ "{" ~ ASCII_HEX_DIGIT+ ~ "}")
      | ("u" ~ ASCII_HEX_DIGIT{4})
    )
}
// One character of string content: an escape sequence, or any single
// character other than a double quote or a backslash
text_char = { escape_sequence | (!("\"" | "\\") ~ ANY) }
// Everything between the quotes, matched atomically
text_inner = @{ text_char* }
// Compound-atomic: no implicit whitespace is skipped around the quotes
text_value = ${ "\"" ~ text_inner ~ "\"" }
// =============================================================================
// BYTE STRINGS
// =============================================================================
// Byte string literals
//   '...'  - single-quoted content, matched by bytes_b64
//   h'...' - base16 (hex) digits, whitespace allowed between them
//   h"..." - hex-quoted form
// NOTE(review): RFC 8610 writes base64 literals as b64'...' and treats an
// unprefixed '...' as the UTF-8 bytes of the quoted text; the b64 prefix is
// not accepted by this grammar - confirm whether that is intended.
QUOTE = _{ "'" }
// Content helpers: anything up to the next single quote / hex digits and
// whitespace
BASE64_INNER = { (!QUOTE ~ ANY)* }
HEX_INNER = { (ASCII_HEX_DIGIT | WHITESPACE)* }
bytes_b64 = @{ "'" ~ BASE64_INNER ~ "'" }
bytes_b16 = @{ "h'" ~ HEX_INNER ~ "'" }
bytes_h_quoted = @{ "h" ~ "\"" ~ (!"\"" ~ ANY)* ~ "\"" }
bytes_value = {
    bytes_b64
  | bytes_b16
  | bytes_h_quoted
}
// =============================================================================
// STANDARD PRELUDE TYPES
// =============================================================================
// Standard prelude types from RFC 8610 Appendix D
// These are predefined type names with special semantics
// Note: In practice, these are treated as identifiers that match
// specific type patterns. This rule documents them for reference.
// Ordered choice: a name that begins with another name in the list must be
// listed first ("integer" before "int", "float16-32" before "float16",
// "float32-64" before "float32", and the bare "float" last among the float
// names); otherwise the shorter name matches and the longer alternative can
// never be selected.
prelude_type = { "any" | "uint" | "nint" | "integer" | "int" | "bstr" | "bytes"
| "tstr" | "text" | "tdate" | "time" | "number"
| "biguint" | "bignint" | "bigint" | "unsigned"
| "decfrac" | "bigfloat" | "eb64url" | "eb64legacy" | "eb16"
| "encoded-cbor" | "uri" | "b64url" | "b64legacy"
| "regexp" | "mime-message" | "cbor-any" | "undefined"
| "float16-32" | "float32-64" | "float16" | "float32" | "float64"
| "float" | "false" | "true" | "bool"
| "nil" | "null" }