oxillama_runtime/sampling/grammar/
parser.rs

1//! GBNF grammar parser.
2//!
3//! Parses a subset of GBNF (Grammar-Based Natural Form) syntax, which is the
4//! grammar format used by llama.cpp for constrained generation.
5//!
6//! Supported syntax:
7//! - Rules: `rule_name ::= body`
8//! - Sequences: `item1 item2`
9//! - Alternations: `body1 | body2`
10//! - Character classes: `[abc]`, `[a-z]`, `[0-9]`, `[^...]` (negated)
11//! - Repetitions: `body*`, `body+`, `body?`
12//! - Quoted strings: `"text"` (ASCII + UTF-8)
13//! - Rule references: bare identifiers
14//! - Grouping: `(body)`
15
16use std::collections::HashMap;
17
18use super::error::{GrammarError, GrammarResult};
19
20/// An inclusive byte-range in a character class.
21#[derive(Debug, Clone, PartialEq, Eq)]
22pub struct CharRange {
23    /// Inclusive lower bound.
24    pub lo: u8,
25    /// Inclusive upper bound.
26    pub hi: u8,
27}
28
29impl CharRange {
30    /// Construct a single-byte range.
31    pub fn single(b: u8) -> Self {
32        Self { lo: b, hi: b }
33    }
34
35    /// Construct a range from `lo` to `hi` inclusive.
36    pub fn range(lo: u8, hi: u8) -> Self {
37        Self { lo, hi }
38    }
39
40    /// Returns true if `b` is within this range.
41    pub fn contains(&self, b: u8) -> bool {
42        b >= self.lo && b <= self.hi
43    }
44}
45
46/// A single node in the grammar tree.
47#[derive(Debug, Clone)]
48pub enum GrammarNode {
49    /// Matches an exact sequence of bytes.
50    Literal(Vec<u8>),
51    /// Matches a single byte that falls within any of the listed ranges.
52    /// If `negated` is true, matches bytes that fall outside all ranges.
53    CharClass {
54        ranges: Vec<CharRange>,
55        negated: bool,
56    },
57    /// Reference to another named rule.
58    RuleRef(String),
59    /// Matches all items in the sequence, in order.
60    Sequence(Vec<GrammarNode>),
61    /// Matches one of the alternatives.
62    Alternation(Vec<GrammarNode>),
63    /// Repeat a node `min..=max` times (max=None means unbounded).
64    Repeat {
65        node: Box<GrammarNode>,
66        min: usize,
67        max: Option<usize>,
68    },
69}
70
71/// A fully parsed GBNF grammar.
72#[derive(Debug, Clone)]
73pub struct Grammar {
74    /// Map from rule name to its grammar node.
75    pub rules: HashMap<String, GrammarNode>,
76    /// The root rule name (first rule defined, or "root" if present).
77    pub root: String,
78    /// The original GBNF source string (retained for snapshot/resume).
79    pub source: String,
80}
81
82impl Grammar {
83    /// Parse a GBNF grammar string into a `Grammar` structure.
84    pub fn parse(input: &str) -> GrammarResult<Self> {
85        let mut parser = GbnfParser::new(input);
86        let mut grammar = parser.parse_grammar()?;
87        grammar.source = input.to_string();
88        Ok(grammar)
89    }
90
91    /// Get the grammar node for the root rule.
92    pub fn root_node(&self) -> GrammarResult<&GrammarNode> {
93        self.rules
94            .get(&self.root)
95            .ok_or_else(|| GrammarError::UnknownRule {
96                rule: self.root.clone(),
97            })
98    }
99}
100
101// ─── Parser internals ────────────────────────────────────────────────────────
102
103/// Recursive descent parser for GBNF grammars.
104struct GbnfParser<'a> {
105    input: &'a [u8],
106    pos: usize,
107}
108
109impl<'a> GbnfParser<'a> {
110    fn new(input: &'a str) -> Self {
111        Self {
112            input: input.as_bytes(),
113            pos: 0,
114        }
115    }
116
117    // ── Error helpers ────────────────────────────────────────────────────────
118
119    fn parse_error(&self, msg: impl Into<String>) -> GrammarError {
120        GrammarError::ParseError {
121            pos: self.pos,
122            msg: msg.into(),
123        }
124    }
125
126    // ── Character tests ──────────────────────────────────────────────────────
127
128    fn peek(&self) -> Option<u8> {
129        self.input.get(self.pos).copied()
130    }
131
132    fn peek2(&self) -> Option<u8> {
133        self.input.get(self.pos + 1).copied()
134    }
135
136    fn advance(&mut self) {
137        self.pos += 1;
138    }
139
140    fn consume(&mut self) -> Option<u8> {
141        let b = self.peek()?;
142        self.advance();
143        Some(b)
144    }
145
146    fn expect(&mut self, expected: u8) -> GrammarResult<()> {
147        match self.consume() {
148            Some(b) if b == expected => Ok(()),
149            Some(b) => Err(self.parse_error(format!(
150                "expected '{}' but got '{}'",
151                expected as char, b as char
152            ))),
153            None => Err(self.parse_error(format!(
154                "expected '{}' but reached end of input",
155                expected as char
156            ))),
157        }
158    }
159
160    fn is_ident_start(b: u8) -> bool {
161        b.is_ascii_alphabetic() || b == b'_'
162    }
163
164    fn is_ident_cont(b: u8) -> bool {
165        b.is_ascii_alphanumeric() || b == b'_' || b == b'-'
166    }
167
168    // ── Whitespace / comment skipping ────────────────────────────────────────
169
170    /// Skip spaces and tabs (but NOT newlines — those separate rules).
171    fn skip_inline_ws(&mut self) {
172        while matches!(self.peek(), Some(b' ') | Some(b'\t')) {
173            self.advance();
174        }
175    }
176
177    /// Skip all whitespace including newlines and `#`-comments.
178    fn skip_ws_and_comments(&mut self) {
179        loop {
180            match self.peek() {
181                Some(b' ') | Some(b'\t') | Some(b'\r') | Some(b'\n') => {
182                    self.advance();
183                }
184                Some(b'#') => {
185                    while !matches!(self.peek(), None | Some(b'\n')) {
186                        self.advance();
187                    }
188                }
189                _ => break,
190            }
191        }
192    }
193
194    // ── Top-level grammar parsing ────────────────────────────────────────────
195
196    fn parse_grammar(&mut self) -> GrammarResult<Grammar> {
197        let mut rules: HashMap<String, GrammarNode> = HashMap::new();
198        let mut first_rule: Option<String> = None;
199
200        self.skip_ws_and_comments();
201
202        while self.pos < self.input.len() {
203            let name = self.parse_ident()?;
204            self.skip_inline_ws();
205
206            // Expect ::=
207            if self.pos + 3 > self.input.len() || &self.input[self.pos..self.pos + 3] != b"::=" {
208                return Err(self.parse_error("expected '::=' after rule name"));
209            }
210            self.pos += 3;
211            self.skip_inline_ws();
212
213            let node = self.parse_alternation()?;
214
215            if first_rule.is_none() {
216                first_rule = Some(name.clone());
217            }
218            rules.insert(name, node);
219
220            self.skip_ws_and_comments();
221        }
222
223        if rules.is_empty() {
224            return Err(GrammarError::ParseError {
225                pos: 0,
226                msg: "grammar has no rules".to_string(),
227            });
228        }
229
230        // The root rule is "root" if it exists, otherwise the first rule.
231        let root = if rules.contains_key("root") {
232            "root".to_string()
233        } else {
234            first_rule.unwrap_or_default()
235        };
236
237        Ok(Grammar {
238            rules,
239            root,
240            source: String::new(), // filled in by Grammar::parse()
241        })
242    }
243
244    // ── Identifier parsing ───────────────────────────────────────────────────
245
246    fn parse_ident(&mut self) -> GrammarResult<String> {
247        let start = self.pos;
248        match self.peek() {
249            Some(b) if Self::is_ident_start(b) => {
250                self.advance();
251            }
252            _ => return Err(self.parse_error("expected identifier")),
253        }
254        while matches!(self.peek(), Some(b) if Self::is_ident_cont(b)) {
255            self.advance();
256        }
257        let slice = &self.input[start..self.pos];
258        Ok(String::from_utf8_lossy(slice).into_owned())
259    }
260
261    // ── Alternation (lowest precedence) ─────────────────────────────────────
262
263    fn parse_alternation(&mut self) -> GrammarResult<GrammarNode> {
264        let first = self.parse_sequence()?;
265        self.skip_inline_ws();
266
267        if !matches!(self.peek(), Some(b'|')) {
268            return Ok(first);
269        }
270
271        let mut alternatives = vec![first];
272        while matches!(self.peek(), Some(b'|')) {
273            self.advance(); // consume '|'
274            self.skip_inline_ws();
275            alternatives.push(self.parse_sequence()?);
276            self.skip_inline_ws();
277        }
278
279        Ok(GrammarNode::Alternation(alternatives))
280    }
281
282    // ── Sequence ─────────────────────────────────────────────────────────────
283
284    fn parse_sequence(&mut self) -> GrammarResult<GrammarNode> {
285        let mut items: Vec<GrammarNode> = Vec::new();
286
287        loop {
288            self.skip_inline_ws();
289            // Stop at end of input, newline (rule boundary), '|', ')', or ']'
290            match self.peek() {
291                None | Some(b'\n') | Some(b'\r') | Some(b'|') | Some(b')') => break,
292                // '#' starts a comment — stop the sequence
293                Some(b'#') => break,
294                _ => {}
295            }
296            let item = self.parse_item()?;
297            items.push(item);
298        }
299
300        match items.len() {
301            0 => Ok(GrammarNode::Literal(vec![])),
302            1 => Ok(items.remove(0)),
303            _ => Ok(GrammarNode::Sequence(items)),
304        }
305    }
306
307    // ── Single item (possibly with suffix quantifier) ────────────────────────
308
309    fn parse_item(&mut self) -> GrammarResult<GrammarNode> {
310        let base = self.parse_atom()?;
311        self.parse_quantifier(base)
312    }
313
314    fn parse_quantifier(&mut self, base: GrammarNode) -> GrammarResult<GrammarNode> {
315        match self.peek() {
316            Some(b'*') => {
317                self.advance();
318                Ok(GrammarNode::Repeat {
319                    node: Box::new(base),
320                    min: 0,
321                    max: None,
322                })
323            }
324            Some(b'+') => {
325                self.advance();
326                Ok(GrammarNode::Repeat {
327                    node: Box::new(base),
328                    min: 1,
329                    max: None,
330                })
331            }
332            Some(b'?') => {
333                self.advance();
334                Ok(GrammarNode::Repeat {
335                    node: Box::new(base),
336                    min: 0,
337                    max: Some(1),
338                })
339            }
340            _ => Ok(base),
341        }
342    }
343
344    // ── Atoms ────────────────────────────────────────────────────────────────
345
346    fn parse_atom(&mut self) -> GrammarResult<GrammarNode> {
347        match self.peek() {
348            Some(b'"') => self.parse_string_literal(),
349            Some(b'[') => self.parse_char_class(),
350            Some(b'(') => {
351                self.advance(); // consume '('
352                self.skip_inline_ws();
353                let inner = self.parse_alternation()?;
354                self.skip_inline_ws();
355                self.expect(b')')?;
356                Ok(inner)
357            }
358            Some(b) if Self::is_ident_start(b) => {
359                let name = self.parse_ident()?;
360                Ok(GrammarNode::RuleRef(name))
361            }
362            Some(b) => {
363                Err(self.parse_error(format!("unexpected character '{}' in grammar", b as char)))
364            }
365            None => Err(self.parse_error("unexpected end of input")),
366        }
367    }
368
369    // ── String literal parsing ───────────────────────────────────────────────
370
371    fn parse_string_literal(&mut self) -> GrammarResult<GrammarNode> {
372        self.expect(b'"')?;
373        let mut bytes: Vec<u8> = Vec::new();
374
375        loop {
376            match self.consume() {
377                None => return Err(self.parse_error("unterminated string literal")),
378                Some(b'"') => break,
379                Some(b'\\') => {
380                    let escaped = self.parse_escape_sequence()?;
381                    bytes.extend_from_slice(&escaped);
382                }
383                Some(b) => bytes.push(b),
384            }
385        }
386
387        Ok(GrammarNode::Literal(bytes))
388    }
389
390    /// Parse a backslash escape sequence, returning the byte(s) it represents.
391    fn parse_escape_sequence(&mut self) -> GrammarResult<Vec<u8>> {
392        match self.consume() {
393            Some(b'n') => Ok(vec![b'\n']),
394            Some(b'r') => Ok(vec![b'\r']),
395            Some(b't') => Ok(vec![b'\t']),
396            Some(b'\\') => Ok(vec![b'\\']),
397            Some(b'"') => Ok(vec![b'"']),
398            Some(b'\'') => Ok(vec![b'\'']),
399            Some(b'0') => Ok(vec![0u8]),
400            Some(b'x') => {
401                // \xHH — two hex digits
402                let hi = self.consume_hex_digit()?;
403                let lo = self.consume_hex_digit()?;
404                Ok(vec![(hi << 4) | lo])
405            }
406            Some(b'u') => {
407                // \uHHHH — four hex digits, encode as UTF-8
408                self.expect(b'{')?;
409                let mut codepoint: u32 = 0;
410                let mut digits = 0usize;
411                while !matches!(self.peek(), Some(b'}')) {
412                    let d = self.consume_hex_digit()?;
413                    codepoint = (codepoint << 4) | (d as u32);
414                    digits += 1;
415                    if digits > 6 {
416                        return Err(self.parse_error("\\u{...} codepoint too large"));
417                    }
418                }
419                self.expect(b'}')?;
420                let ch = char::from_u32(codepoint)
421                    .ok_or_else(|| self.parse_error("invalid Unicode codepoint"))?;
422                let mut buf = [0u8; 4];
423                Ok(ch.encode_utf8(&mut buf).as_bytes().to_vec())
424            }
425            Some(b) => Err(self.parse_error(format!("unknown escape sequence '\\{}'", b as char))),
426            None => Err(self.parse_error("truncated escape sequence")),
427        }
428    }
429
430    fn consume_hex_digit(&mut self) -> GrammarResult<u8> {
431        match self.consume() {
432            Some(b @ b'0'..=b'9') => Ok(b - b'0'),
433            Some(b @ b'a'..=b'f') => Ok(b - b'a' + 10),
434            Some(b @ b'A'..=b'F') => Ok(b - b'A' + 10),
435            Some(b) => Err(self.parse_error(format!("expected hex digit, got '{}'", b as char))),
436            None => Err(self.parse_error("expected hex digit, reached end of input")),
437        }
438    }
439
440    // ── Character class parsing ──────────────────────────────────────────────
441
442    fn parse_char_class(&mut self) -> GrammarResult<GrammarNode> {
443        self.expect(b'[')?;
444        let negated = if matches!(self.peek(), Some(b'^')) {
445            self.advance();
446            true
447        } else {
448            false
449        };
450
451        let mut ranges: Vec<CharRange> = Vec::new();
452
453        loop {
454            match self.peek() {
455                None => return Err(self.parse_error("unterminated character class")),
456                Some(b']') => {
457                    self.advance();
458                    break;
459                }
460                Some(b'\\') => {
461                    self.advance(); // consume '\'
462                    let b = self.parse_class_escape()?;
463                    // Check for range: \x-y
464                    if matches!(self.peek(), Some(b'-')) && !matches!(self.peek2(), Some(b']')) {
465                        self.advance(); // consume '-'
466                        let hi = self.parse_class_byte()?;
467                        ranges.push(CharRange::range(b, hi));
468                    } else {
469                        ranges.push(CharRange::single(b));
470                    }
471                }
472                Some(_) => {
473                    let lo = self.parse_class_byte()?;
474                    if matches!(self.peek(), Some(b'-')) && !matches!(self.peek2(), Some(b']')) {
475                        self.advance(); // consume '-'
476                        let hi = self.parse_class_byte()?;
477                        ranges.push(CharRange::range(lo, hi));
478                    } else {
479                        ranges.push(CharRange::single(lo));
480                    }
481                }
482            }
483        }
484
485        Ok(GrammarNode::CharClass { ranges, negated })
486    }
487
488    /// Parse a single literal byte inside a character class.
489    fn parse_class_byte(&mut self) -> GrammarResult<u8> {
490        match self.peek() {
491            Some(b'\\') => {
492                self.advance();
493                self.parse_class_escape()
494            }
495            Some(b) => {
496                self.advance();
497                Ok(b)
498            }
499            None => Err(self.parse_error("unexpected end inside character class")),
500        }
501    }
502
503    /// Parse a backslash escape inside `[...]`.
504    fn parse_class_escape(&mut self) -> GrammarResult<u8> {
505        match self.consume() {
506            Some(b'n') => Ok(b'\n'),
507            Some(b'r') => Ok(b'\r'),
508            Some(b't') => Ok(b'\t'),
509            Some(b'\\') => Ok(b'\\'),
510            Some(b']') => Ok(b']'),
511            Some(b'-') => Ok(b'-'),
512            Some(b'^') => Ok(b'^'),
513            Some(b'x') => {
514                let hi = self.consume_hex_digit()?;
515                let lo = self.consume_hex_digit()?;
516                Ok((hi << 4) | lo)
517            }
518            Some(b) => Err(self.parse_error(format!("unknown class escape '\\{}'", b as char))),
519            None => Err(self.parse_error("truncated class escape")),
520        }
521    }
522}
523
524// ─── Unit tests ───────────────────────────────────────────────────────────────
525
526#[cfg(test)]
527mod tests {
528    use super::*;
529
530    #[test]
531    fn test_parse_literal_alternation() {
532        let g = Grammar::parse(r#"root ::= "yes" | "no""#).unwrap();
533        assert_eq!(g.root, "root");
534        assert!(g.rules.contains_key("root"));
535    }
536
537    #[test]
538    fn test_parse_char_class_range() {
539        let g = Grammar::parse("root ::= [a-z]+").unwrap();
540        assert!(g.rules.contains_key("root"));
541    }
542
543    #[test]
544    fn test_parse_sequence() {
545        let g = Grammar::parse(r#"root ::= [a-z]+ ":" [0-9]+"#).unwrap();
546        assert!(!g.rules.is_empty());
547    }
548
549    #[test]
550    fn test_parse_multiple_rules() {
551        let grammar_str = "root ::= greeting\ngreeting ::= \"hello\"";
552        let g = Grammar::parse(grammar_str).unwrap();
553        assert_eq!(g.root, "root");
554        assert!(g.rules.contains_key("greeting"));
555    }
556
557    #[test]
558    fn test_parse_empty_grammar_fails() {
559        let result = Grammar::parse("");
560        assert!(result.is_err());
561    }
562
563    #[test]
564    fn test_parse_negated_char_class() {
565        let g = Grammar::parse("root ::= [^0-9]+").unwrap();
566        assert!(g.rules.contains_key("root"));
567        match g.rules.get("root").unwrap() {
568            GrammarNode::Repeat { node, .. } => match node.as_ref() {
569                GrammarNode::CharClass { negated, .. } => assert!(*negated),
570                other => panic!("expected CharClass, got {:?}", other),
571            },
572            other => panic!("expected Repeat, got {:?}", other),
573        }
574    }
575
576    #[test]
577    fn test_parse_grouping() {
578        let g = Grammar::parse(r#"root ::= ("a" | "b")+"#).unwrap();
579        assert!(g.rules.contains_key("root"));
580    }
581
582    #[test]
583    fn test_parse_json_simple() {
584        // A small JSON-like grammar
585        let grammar_str = r#"root ::= "{" ws "}"
586ws ::= [ \t\n]*"#;
587        let g = Grammar::parse(grammar_str).unwrap();
588        assert_eq!(g.root, "root");
589    }
590
591    // ── String literal escape sequences ──────────────────────────────────────
592
593    #[test]
594    fn test_parse_escape_newline() {
595        // \n in a string literal
596        let g = Grammar::parse(r#"root ::= "\n""#).expect("test: \\n escape should parse");
597        match g.rules.get("root").expect("test: root rule") {
598            GrammarNode::Literal(bytes) => assert_eq!(bytes, b"\n"),
599            other => panic!("expected Literal, got {:?}", other),
600        }
601    }
602
603    #[test]
604    fn test_parse_escape_tab() {
605        let g = Grammar::parse(r#"root ::= "\t""#).expect("test: \\t escape should parse");
606        match g.rules.get("root").expect("test: root rule") {
607            GrammarNode::Literal(bytes) => assert_eq!(bytes, b"\t"),
608            other => panic!("expected Literal, got {:?}", other),
609        }
610    }
611
612    #[test]
613    fn test_parse_escape_carriage_return() {
614        let g = Grammar::parse(r#"root ::= "\r""#).expect("test: \\r escape should parse");
615        match g.rules.get("root").expect("test: root rule") {
616            GrammarNode::Literal(bytes) => assert_eq!(bytes, b"\r"),
617            other => panic!("expected Literal, got {:?}", other),
618        }
619    }
620
621    #[test]
622    fn test_parse_escape_backslash() {
623        let g = Grammar::parse(r#"root ::= "\\""#).expect("test: \\\\ escape should parse");
624        match g.rules.get("root").expect("test: root rule") {
625            GrammarNode::Literal(bytes) => assert_eq!(bytes, b"\\"),
626            other => panic!("expected Literal, got {:?}", other),
627        }
628    }
629
630    #[test]
631    fn test_parse_escape_quote() {
632        let g = Grammar::parse(r#"root ::= "\"hi\"""#).expect("test: escaped quote should parse");
633        match g.rules.get("root").expect("test: root rule") {
634            GrammarNode::Literal(bytes) => {
635                assert_eq!(bytes[0], b'"');
636                assert_eq!(*bytes.last().expect("test: last byte"), b'"');
637            }
638            other => panic!("expected Literal, got {:?}", other),
639        }
640    }
641
642    #[test]
643    fn test_parse_escape_null() {
644        let g = Grammar::parse(r#"root ::= "\0""#).expect("test: \\0 escape should parse");
645        match g.rules.get("root").expect("test: root rule") {
646            GrammarNode::Literal(bytes) => assert_eq!(bytes, &[0u8]),
647            other => panic!("expected Literal, got {:?}", other),
648        }
649    }
650
651    #[test]
652    fn test_parse_escape_hex() {
653        // \x41 = 'A'
654        let g = Grammar::parse(r#"root ::= "\x41""#).expect("test: \\x41 should parse");
655        match g.rules.get("root").expect("test: root rule") {
656            GrammarNode::Literal(bytes) => assert_eq!(bytes, &[0x41u8]),
657            other => panic!("expected Literal, got {:?}", other),
658        }
659    }
660
661    #[test]
662    fn test_parse_escape_unicode() {
663        // \u{0041} = 'A' in UTF-8
664        let g = Grammar::parse(r#"root ::= "\u{0041}""#).expect("test: \\u{0041} should parse");
665        match g.rules.get("root").expect("test: root rule") {
666            GrammarNode::Literal(bytes) => assert_eq!(bytes, b"A"),
667            other => panic!("expected Literal, got {:?}", other),
668        }
669    }
670
671    #[test]
672    fn test_parse_escape_unicode_multibyte() {
673        // \u{263A} = ☺, U+263A → UTF-8: 0xE2 0x98 0xBA
674        let g = Grammar::parse(r#"root ::= "\u{263A}""#).expect("test: \\u{263A} should parse");
675        match g.rules.get("root").expect("test: root rule") {
676            GrammarNode::Literal(bytes) => {
677                assert_eq!(bytes, "☺".as_bytes());
678            }
679            other => panic!("expected Literal, got {:?}", other),
680        }
681    }
682
683    // ── String literal error paths ────────────────────────────────────────────
684
685    #[test]
686    fn test_parse_unterminated_string_errors() {
687        let result = Grammar::parse(r#"root ::= "abc"#);
688        assert!(result.is_err(), "unterminated string should be an error");
689    }
690
691    #[test]
692    fn test_parse_unknown_escape_errors() {
693        let result = Grammar::parse(r#"root ::= "\q""#);
694        assert!(result.is_err(), "unknown escape \\q should be an error");
695    }
696
697    #[test]
698    fn test_parse_truncated_escape_errors() {
699        // String ends with bare backslash
700        let result = Grammar::parse("root ::= \"\\\"");
701        // The parser will either see a \"  (escaped quote, then unterminated)
702        // or fail on truncated. Either error is acceptable.
703        assert!(
704            result.is_err(),
705            "truncated escape at end of input should error"
706        );
707    }
708
709    #[test]
710    fn test_parse_hex_escape_invalid_digit_errors() {
711        // \xGG — 'G' is not a valid hex digit
712        let result = Grammar::parse(r#"root ::= "\xGG""#);
713        assert!(result.is_err(), "invalid hex digit in \\x should error");
714    }
715
716    #[test]
717    fn test_parse_unicode_escape_invalid_codepoint_errors() {
718        // \u{D800} — surrogate, not a valid Unicode scalar
719        // Build the grammar string without a raw string literal to avoid Rust
720        // interpreting the { } as a format argument.
721        let grammar_str = "root ::= \"\\u{D800}\"";
722        let result = Grammar::parse(grammar_str);
723        assert!(
724            result.is_err(),
725            "surrogate codepoint \\u{{D800}} should error"
726        );
727    }
728
729    #[test]
730    fn test_parse_unicode_escape_too_many_digits_errors() {
731        // \u{1234567} — 7 hex digits is too many
732        let grammar_str = "root ::= \"\\u{1234567}\"";
733        let result = Grammar::parse(grammar_str);
734        assert!(result.is_err(), "\\u{{...}} with >6 digits should error");
735    }
736
737    // ── Character class escape sequences ─────────────────────────────────────
738
739    #[test]
740    fn test_parse_char_class_escape_newline() {
741        let g = Grammar::parse(r"root ::= [\n]").expect("test: [\\n] should parse");
742        assert!(g.rules.contains_key("root"));
743    }
744
745    #[test]
746    fn test_parse_char_class_escape_tab() {
747        let g = Grammar::parse(r"root ::= [\t]").expect("test: [\\t] should parse");
748        assert!(g.rules.contains_key("root"));
749    }
750
751    #[test]
752    fn test_parse_char_class_escape_dash() {
753        // \- inside a char class should represent a literal '-'
754        let g = Grammar::parse(r"root ::= [\-]").expect("test: [\\-] should parse");
755        assert!(g.rules.contains_key("root"));
756    }
757
758    #[test]
759    fn test_parse_char_class_escape_caret() {
760        // \^ inside a char class is a literal '^'
761        let g = Grammar::parse(r"root ::= [\^]").expect("test: [\\^] should parse");
762        assert!(g.rules.contains_key("root"));
763    }
764
765    #[test]
766    fn test_parse_char_class_escape_bracket() {
767        // \] inside a char class is a literal ']'
768        let g = Grammar::parse(r"root ::= [\]]").expect("test: [\\]] should parse");
769        assert!(g.rules.contains_key("root"));
770    }
771
772    #[test]
773    fn test_parse_char_class_hex_escape() {
774        // [\x41-\x5A] = [A-Z]
775        let g = Grammar::parse(r"root ::= [\x41-\x5A]+").expect("test: [\\x41-\\x5A] should parse");
776        assert!(g.rules.contains_key("root"));
777    }
778
779    #[test]
780    fn test_parse_char_class_unknown_escape_errors() {
781        let result = Grammar::parse(r"root ::= [\q]");
782        assert!(
783            result.is_err(),
784            "unknown char class escape \\q should error"
785        );
786    }
787
788    #[test]
789    fn test_parse_unterminated_char_class_errors() {
790        let result = Grammar::parse("root ::= [a-z");
791        assert!(result.is_err(), "unterminated char class should error");
792    }
793
794    // ── Grammar structure error paths ─────────────────────────────────────────
795
796    #[test]
797    fn test_parse_missing_assign_errors() {
798        let result = Grammar::parse("root = \"hello\"");
799        assert!(result.is_err(), "missing ::= should error");
800    }
801
802    #[test]
803    fn test_parse_unexpected_char_errors() {
804        // '?' at start of an atom (not after something) is unexpected
805        let result = Grammar::parse("root ::= @");
806        assert!(
807            result.is_err(),
808            "unexpected char '@' in grammar atom should error"
809        );
810    }
811
812    #[test]
813    fn test_parse_unclosed_group_errors() {
814        let result = Grammar::parse(r#"root ::= ("abc""#);
815        assert!(result.is_err(), "unclosed group should error");
816    }
817
818    #[test]
819    fn test_parse_first_rule_becomes_root_when_no_root() {
820        // When there's no rule named "root", the first defined rule is the root.
821        let g = Grammar::parse("entry ::= \"hello\"").expect("test: single rule without root");
822        assert_eq!(g.root, "entry");
823    }
824
825    #[test]
826    fn test_parse_optional_quantifier() {
827        // body? → Repeat{min:0, max:Some(1)}
828        let g = Grammar::parse(r#"root ::= "a"?"#).expect("test: ? quantifier should parse");
829        match g.rules.get("root").expect("test: root rule") {
830            GrammarNode::Repeat { min, max, .. } => {
831                assert_eq!(*min, 0);
832                assert_eq!(*max, Some(1));
833            }
834            other => panic!("expected Repeat, got {:?}", other),
835        }
836    }
837
838    #[test]
839    fn test_parse_star_quantifier() {
840        let g = Grammar::parse(r#"root ::= "a"*"#).expect("test: * quantifier should parse");
841        match g.rules.get("root").expect("test: root rule") {
842            GrammarNode::Repeat { min, max, .. } => {
843                assert_eq!(*min, 0);
844                assert_eq!(*max, None);
845            }
846            other => panic!("expected Repeat, got {:?}", other),
847        }
848    }
849
850    #[test]
851    fn test_parse_plus_quantifier() {
852        let g = Grammar::parse(r#"root ::= "a"+"#).expect("test: + quantifier should parse");
853        match g.rules.get("root").expect("test: root rule") {
854            GrammarNode::Repeat { min, max, .. } => {
855                assert_eq!(*min, 1);
856                assert_eq!(*max, None);
857            }
858            other => panic!("expected Repeat, got {:?}", other),
859        }
860    }
861
862    #[test]
863    fn test_parse_comment_skipped() {
864        // Lines starting with '#' are comments and should be ignored
865        let grammar_str = "# This is a comment\nroot ::= \"hello\"";
866        let g = Grammar::parse(grammar_str).expect("test: comment should be ignored");
867        assert_eq!(g.root, "root");
868    }
869
870    #[test]
871    fn test_root_node_accessor_ok() {
872        let g = Grammar::parse(r#"root ::= "hi""#).expect("test: should parse");
873        g.root_node()
874            .expect("test: root_node() should succeed when root rule exists");
875    }
876
877    #[test]
878    fn test_root_node_accessor_missing_rule_errors() {
879        // Manually construct a Grammar with a root pointing to a non-existent rule.
880        let g = Grammar {
881            rules: std::collections::HashMap::new(),
882            root: "missing".to_string(),
883            source: String::new(),
884        };
885        let result = g.root_node();
886        assert!(result.is_err(), "root_node() on missing rule should error");
887        match result {
888            Err(super::super::error::GrammarError::UnknownRule { rule }) => {
889                assert_eq!(rule, "missing");
890            }
891            other => panic!("expected UnknownRule, got {:?}", other),
892        }
893    }
894
895    #[test]
896    fn test_char_range_contains() {
897        let r = CharRange::range(b'a', b'z');
898        assert!(r.contains(b'a'));
899        assert!(r.contains(b'z'));
900        assert!(r.contains(b'm'));
901        assert!(!r.contains(b'A'));
902        assert!(!r.contains(b'0'));
903    }
904
905    #[test]
906    fn test_char_range_single() {
907        let r = CharRange::single(b'X');
908        assert!(r.contains(b'X'));
909        assert!(!r.contains(b'Y'));
910        assert!(!r.contains(b'W'));
911    }
912}
oxillama_runtime/sampling/grammar/parser.rs

oxillama_runtime/sampling/grammar/
parser.rs