oxibonsai-runtime 0.1.4

Inference runtime, sampling, tokenizer, and server for OxiBonsai
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
//! Integration tests for the GBNF grammar parser.
//!
//! Tests cover:
//! * Error cases (empty, missing root, unknown rule)
//! * Literals, alternation, sequences
//! * Quantifiers: `*`, `+`, `?`
//! * Char classes: simple, range, negated, mixed, hex escapes
//! * Groups with quantifiers
//! * Rule references, recursive rules
//! * Comments, escape sequences
//! * Complex grammars
//! * Integration with GrammarConstraint

use oxibonsai_runtime::grammar::{parse_gbnf, GbnfParseError, GrammarConstraint};

// ─────────────────────────────────────────────────────────────────────────────
// Helper
// ─────────────────────────────────────────────────────────────────────────────

/// Parse GBNF or panic with a descriptive message.
fn parse_ok(src: &str) -> oxibonsai_runtime::grammar::Grammar {
    parse_gbnf(src).unwrap_or_else(|e| panic!("parse_gbnf failed on:\n{src}\nError: {e}"))
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 1: empty grammar → EmptyGrammar
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_empty_grammar_error() {
    assert!(matches!(parse_gbnf(""), Err(GbnfParseError::EmptyGrammar)));
    assert!(matches!(
        parse_gbnf("   \n\n  \t\n"),
        Err(GbnfParseError::EmptyGrammar)
    ));
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 2: no `root` rule → MissingRootRule
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_missing_root_rule_error() {
    let err = parse_gbnf(r#"word ::= "hello""#);
    assert!(matches!(err, Err(GbnfParseError::MissingRootRule)));
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 3: simple string literal
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_simple_literal() {
    let g = parse_ok(r#"root ::= "hello""#);
    assert!(g.start() < g.nt_count, "start symbol must be a valid NT id");
    // Root rule should have a non-empty RHS (each byte is a terminal).
    let root_rules: Vec<_> = g.rules_for(g.start()).collect();
    assert!(!root_rules.is_empty(), "root must have at least one rule");
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 4: alternation
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_alternation() {
    let g = parse_ok(r#"root ::= "cat" | "dog""#);
    let root_rules: Vec<_> = g.rules_for(g.start()).collect();
    // Two alternatives → two rules for root.
    assert_eq!(root_rules.len(), 2, "expected 2 root rules for alternation");
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 5: Kleene star
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_kleene_star() {
    let g = parse_ok(r#"root ::= "a"*"#);
    // Grammar must have rules (synthetic star NT adds ε and recursive rules).
    assert!(
        g.rules.len() >= 2,
        "star should create synthetic NT with at least 2 rules, got {}",
        g.rules.len()
    );
    // Verify we can find the synthetic star NT (ε rule).
    let has_epsilon = g.rules.iter().any(|r| r.is_epsilon());
    assert!(has_epsilon, "star must introduce an ε rule");
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 6: plus quantifier
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_plus() {
    let g = parse_ok(r#"root ::= "a"+"#);
    // Plus creates star NT (2 rules) and plus NT (1 rule).
    assert!(
        g.rules.len() >= 3,
        "plus should create 3+ rules, got {}",
        g.rules.len()
    );
    // The `+` case must NOT have an ε rule for the plus NT itself.
    // (The star helper does, but not the plus NT.)
    let root_start = g.start();
    let root_rules: Vec<_> = g.rules_for(root_start).collect();
    // Root delegates to the plus_nt, so root has exactly 1 rule.
    assert_eq!(root_rules.len(), 1);
    // That rule's RHS is non-empty (not ε).
    assert!(!root_rules[0].1.is_epsilon());
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 7: optional (`?`)
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_optional() {
    let g = parse_ok(r#"root ::= "colo" "u"? "r""#);
    // The optional `u` expands to a synthetic opt NT with an ε rule.
    let has_epsilon = g.rules.iter().any(|r| r.is_epsilon());
    assert!(has_epsilon, "optional must introduce an ε rule");
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 8: char class (simple)
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_char_class_simple() {
    let g = parse_ok(r#"root ::= [abc]+"#);
    // The char class `[abc]` matches 3 bytes → 3 rules for the class NT.
    // Plus 2 rules for the star, 1 for plus, 1 for root = 7 rules minimum.
    assert!(
        g.rules.len() >= 6,
        "char class plus should produce 6+ rules, got {}",
        g.rules.len()
    );
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 9: char class with range
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_char_class_range() {
    let g = parse_ok(r#"root ::= [a-z]+"#);
    // `[a-z]` → 26 bytes. 26 rules for class_nt + 2 star + 1 plus + 1 root = 30+.
    assert!(
        g.rules.len() >= 29,
        "a-z class + plus expected >=29 rules, got {}",
        g.rules.len()
    );
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 10: negated char class
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_char_class_negated() {
    let g = parse_ok(r#"root ::= [^abc]+"#);
    // `[^abc]` matches 256 - 3 = 253 bytes.
    // 253 class rules + 2 star + 1 plus + 1 root = 257+.
    assert!(
        g.rules.len() >= 256,
        "negated class [^abc] + plus expected >=256 rules, got {}",
        g.rules.len()
    );
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 11: mixed char class
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_char_class_mixed() {
    let g = parse_ok(r#"root ::= [a-zA-Z0-9_]+"#);
    // a-z = 26, A-Z = 26, 0-9 = 10, _ = 1 → 63 distinct bytes.
    let class_nt_rules: Vec<_> = g
        .rules
        .iter()
        .filter(|r| {
            // Char class NT rules have a single 1-byte terminal RHS.
            r.rhs.len() == 1
                && r.rhs[0]
                    .terminal_bytes()
                    .map(|b| b.len() == 1)
                    .unwrap_or(false)
                && r.lhs != g.start()
        })
        .collect();
    // Should have exactly 63 such rules (one per matching byte in the class).
    assert_eq!(
        class_nt_rules.len(),
        63,
        "mixed class should have 63 terminal rules, got {}",
        class_nt_rules.len()
    );
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 12: rule reference
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_rule_reference() {
    let g = parse_ok("root ::= word\nword ::= [a-z]+\n");
    // `root` rule should reference `word` NT.
    let root_rules: Vec<_> = g.rules_for(g.start()).collect();
    assert_eq!(root_rules.len(), 1);
    // The single root rule RHS should be a NonTerminal (reference to `word`).
    assert!(
        root_rules[0].1.rhs.iter().any(|s| s.is_non_terminal()),
        "root should reference word via NonTerminal"
    );
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 13: unknown rule reference → UnknownRule error
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_unknown_rule_reference_error() {
    // `ghost` is referenced but never defined.
    let err = parse_gbnf("root ::= ghost");
    assert!(
        matches!(err, Err(GbnfParseError::UnknownRule(ref n)) if n == "ghost"),
        "expected UnknownRule(\"ghost\"), got {err:?}"
    );
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 14: comment is ignored
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_comment_ignored() {
    let g = parse_ok("# this is a comment\nroot ::= \"x\"");
    let root_rules: Vec<_> = g.rules_for(g.start()).collect();
    assert_eq!(root_rules.len(), 1);
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 15: string escape sequences
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_string_escape_sequences() {
    let g = parse_ok(r#"root ::= "\n\t\r\\""#);
    // Should parse without error; root rule has 4 byte terminals.
    let root_rules: Vec<_> = g.rules_for(g.start()).collect();
    assert!(!root_rules.is_empty());
    // RHS symbols should include the escaped bytes.
    let total_terminals: usize = root_rules
        .iter()
        .map(|(_, r)| r.rhs.iter().filter(|s| s.is_terminal()).count())
        .sum();
    assert_eq!(
        total_terminals, 4,
        "expected 4 terminal bytes (\\n, \\t, \\r, \\\\)"
    );
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 16: hex escape in char class
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_hex_escape_in_class() {
    // `[\x41-\x5A]` = A (0x41) to Z (0x5A) = 26 bytes.
    let g = parse_ok(r#"root ::= [\x41-\x5A]+"#);
    // Count single-byte terminal rules not in root or synthetic stars/plus.
    let class_rules: Vec<_> = g
        .rules
        .iter()
        .filter(|r| {
            r.rhs.len() == 1
                && r.rhs[0]
                    .terminal_bytes()
                    .map(|b| b.len() == 1)
                    .unwrap_or(false)
        })
        .collect();
    assert_eq!(
        class_rules.len(),
        26,
        "\\x41-\\x5A should produce 26 terminal rules, got {}",
        class_rules.len()
    );
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 17: group with quantifier
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_group_with_quantifier() {
    let g = parse_ok(r#"root ::= ("a" "b")+"#);
    // Group is a synthetic NT; plus wraps it with star.
    assert!(
        g.rules.len() >= 4,
        "group+ should produce 4+ rules, got {}",
        g.rules.len()
    );
    // ε rule must exist from the star.
    assert!(g.rules.iter().any(|r| r.is_epsilon()));
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 18: nested alternation
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_nested_alternation() {
    let g = parse_ok(r#"root ::= "a" | ("b" | "c")"#);
    // Root has 2 alternatives: "a" and the group.
    let root_rules: Vec<_> = g.rules_for(g.start()).collect();
    assert_eq!(
        root_rules.len(),
        2,
        "expected 2 root rules for nested alternation"
    );
    // The group NT should have 2 alternatives.
    let group_nt_id = match &root_rules[1].1.rhs[..] {
        [oxibonsai_runtime::grammar::Symbol::NonTerminal(id)] => *id,
        _ => panic!("second alternative should be a NonTerminal group reference"),
    };
    let group_rules: Vec<_> = g.rules_for(group_nt_id).collect();
    assert_eq!(
        group_rules.len(),
        2,
        "group should have 2 alternatives (b | c)"
    );
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 19: recursive rule (self-referential)
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_recursive_rule() {
    // root ::= "a" root?  — self-referential via optional
    let g = parse_ok(r#"root ::= "a" root?"#);
    // Should parse without infinite loop; root has 1 rule.
    let root_rules: Vec<_> = g.rules_for(g.start()).collect();
    assert_eq!(root_rules.len(), 1);
    // The RHS should have 2 items: terminal 'a' and a NonTerminal (opt wrapping root).
    assert_eq!(root_rules[0].1.rhs.len(), 2);
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 20: multiword sequence
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_multiword_sequence() {
    let g = parse_ok(r#"root ::= "hello" " " "world""#);
    let root_rules: Vec<_> = g.rules_for(g.start()).collect();
    assert_eq!(root_rules.len(), 1);
    // "hello" + " " + "world" = 11 bytes → 11 terminal symbols in the sequence.
    let terminal_count = root_rules[0]
        .1
        .rhs
        .iter()
        .filter(|s| s.is_terminal())
        .count();
    assert_eq!(
        terminal_count, 11,
        "expected 11 terminal bytes for \"hello\" \" \" \"world\""
    );
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 21: complex JSON-like grammar
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_complex_json_like() {
    // A simplified JSON object grammar.
    let src = r#"
root    ::= "{" ws "}"
          | "{" ws members ws "}"
members ::= pair ("," ws pair)*
pair    ::= string ws ":" ws value
value   ::= string | number
string  ::= "\"" [^"]* "\""
number  ::= [0-9]+
ws      ::= [ \t\n]*
"#;
    // Must parse without error.
    let g = parse_gbnf(src).unwrap_or_else(|e| panic!("JSON-like parse failed: {e}"));
    assert!(!g.rules.is_empty());
    // `root` must be the start symbol.
    let root_name = g.nt_name(g.start()).to_string();
    assert_eq!(root_name, "root");
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 22: grammar has correct start symbol
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_grammar_has_correct_start_symbol() {
    let g = parse_ok("root ::= \"x\"\nother ::= \"y\"");
    let start_name = g.nt_name(g.start()).to_string();
    assert_eq!(start_name, "root", "start symbol must be `root`");
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 23: integration with GrammarConstraint
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_constraint_integration() {
    let g = parse_ok(r#"root ::= [0-9]+"#);
    // GrammarConstraint::new should not panic.
    let _constraint = GrammarConstraint::new(
        g,
        |id| {
            if id < 128 {
                vec![id as u8]
            } else {
                vec![]
            }
        },
        128,
    );
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 24: multiple rules all parse correctly
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_multiple_rules() {
    let src =
        "root ::= noun verb noun\nnoun ::= \"cat\" | \"dog\"\nverb ::= \"chases\" | \"sees\"\n";
    let g = parse_ok(src);
    // Should have 3 named NTs (root, noun, verb) + synthetic ones.
    assert!(
        g.nt_count >= 3,
        "expected at least 3 NTs, got {}",
        g.nt_count
    );
    // Root has 1 rule (the sequence noun verb noun).
    let root_rules: Vec<_> = g.rules_for(g.start()).collect();
    assert_eq!(root_rules.len(), 1);
    // noun and verb each have 2 rules.
    let noun_id = *g.nt_names.iter().find(|(_, n)| *n == "noun").unwrap().0;
    let verb_id = *g.nt_names.iter().find(|(_, n)| *n == "verb").unwrap().0;
    assert_eq!(g.rules_for(noun_id).count(), 2);
    assert_eq!(g.rules_for(verb_id).count(), 2);
}

// ─────────────────────────────────────────────────────────────────────────────
// Test 25: decimal number pattern
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_digit_word_pattern() {
    // root ::= [0-9]+ "." [0-9]+  — decimal number like "3.14"
    let g = parse_ok(r#"root ::= [0-9]+ "." [0-9]+"#);
    assert!(!g.rules.is_empty());
    // Grammar must have the root rule and digit class rules.
    let root_name = g.nt_name(g.start()).to_string();
    assert_eq!(root_name, "root");
    // Should contain ε rules (from the star helper inside plus).
    assert!(
        g.rules.iter().any(|r| r.is_epsilon()),
        "plus quantifier must introduce ε rule"
    );
}

// ─────────────────────────────────────────────────────────────────────────────
// Additional: GbnfParseError Display impl coverage
// ─────────────────────────────────────────────────────────────────────────────

#[test]
fn test_error_display_coverage() {
    let cases: &[GbnfParseError] = &[
        GbnfParseError::EmptyGrammar,
        GbnfParseError::MissingRootRule,
        GbnfParseError::UnknownRule("foo".to_string()),
        GbnfParseError::UnexpectedChar {
            line: 1,
            col: 2,
            ch: '!',
        },
        GbnfParseError::UnterminatedString,
        GbnfParseError::UnterminatedCharClass,
        GbnfParseError::InvalidEscape('z'),
        GbnfParseError::UnsupportedFeature("test".to_string()),
        GbnfParseError::RecursionLimit,
    ];
    for e in cases {
        assert!(
            !e.to_string().is_empty(),
            "Display for {e:?} must not be empty"
        );
    }
    // std::error::Error is implemented.
    let e: &dyn std::error::Error = &GbnfParseError::EmptyGrammar;
    assert!(e.source().is_none());
}