use std::{
    fmt::{Debug, Display},
    hash::Hash,
    ops::Range,
    sync::{Arc, Mutex},
};

use crate::{
    earley::{grammar::ParamValue, lexer::MatchingLexemesIdx, ParamCond},
    HashMap, HashSet, Instant,
};
use anyhow::{bail, ensure, Result};
use derivre::{NextByte, RegexAst, StateID};
use serde::{Deserialize, Serialize};
use toktrie::{
    parse_numeric_token, Recognizer, SimpleVob, TokEnv, TokTrie, TokenId, INVALID_TOKEN,
};

use crate::{
    api::{ParserLimits, StopReason},
    earley::{lexer::Lexer, lexerspec::LexemeClass},
    id32_type,
};

use super::{
    grammar::{CGrammar, CSymIdx, CSymbol, RhsPtr},
    lexer::{LexerResult, PreLexeme},
    lexerspec::{Lexeme, LexemeIdx, LexemeSpec, LexerSpec},
    perf::ParserPerfCounters,
    regexvec::{LexemeSet, LexerStats},
};

const TRACE: bool = false;
const DEBUG: bool = true;
pub(crate) const ITEM_TRACE: bool = false;

macro_rules! trace {
    ($($arg:tt)*) => {
        if cfg!(feature = "logging") && TRACE {
            eprintln!($($arg)*);
        }
    }
}

macro_rules! debug {
    ($($arg:tt)*) => {
        if cfg!(feature = "logging") && DEBUG {
            eprintln!($($arg)*);
        }
    }
}

macro_rules! debug_def {
    ($s:expr, $($arg:tt)*) => {
        if cfg!(feature = "logging") && DEBUG && $s.scratch.log_enabled() {
            eprintln!($($arg)*);
        }
    }
}

macro_rules! item_trace {
    ($($arg:tt)*) => {
        if ITEM_TRACE {
            eprint!(" ");
            eprintln!($($arg)*);
        }
    }
}

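// Added comment: an Earley item packed into a single u64 -- the low 32 bits
// hold the RhsPtr (rule + dot position) and the high 32 bits hold the index
// of the row where the item started (see `Item::new` below).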
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
struct Item {
    data: u64,
}

#[derive(Clone)]
struct SavedParserState {
    lexer_stack_length: usize,
}

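// Added comment: cumulative parser counters; `delta()` and `max()` below
// combine two snapshots so these can be reported per step or per sequence.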
#[derive(Debug, Default, Serialize, Deserialize, Clone)]
pub struct ParserStats {
    pub compute_time_us: u64,
    pub rows: usize,
    pub cached_rows: usize,
    pub all_items: usize,
    pub lexer_cost: u64,
    pub slices_applied: usize,
    pub trie_nodes_walked: usize,

    pub definitive_bytes: usize,
    pub lexer_ops: usize,
    pub num_lex_errors: usize,
    pub num_lexemes: usize,
}

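// Added comment: small xorshift32 pseudo-random generator, used for cheap
// sampling (e.g. `sample_from_vob`) without pulling in an RNG dependency.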
#[derive(Debug, Clone)]
pub struct XorShift {
    seed: u32,
}

impl XorShift {
    pub fn new(seed: u32) -> Self {
        XorShift { seed }
    }

    pub fn new_str(s: &str) -> Self {
        XorShift {
            seed: XorShift::fnv1a_32(s.as_bytes()),
        }
    }

    #[allow(clippy::should_implement_trait)]
    pub fn next(&mut self) -> u32 {
        let mut x = self.seed;
        x ^= x << 13;
        x ^= x >> 17;
        x ^= x << 5;
        self.seed = x;
        x
    }

    pub fn from_range(&mut self, r: Range<usize>) -> usize {
        assert!(r.start < r.end);
        assert!(r.end < u32::MAX as usize);
        r.start + (self.next() as usize) % (r.end - r.start)
    }

    pub fn one_in(&mut self, n: u32) -> bool {
        self.next() % n == 0
    }

    pub fn next_alt(&mut self) -> u32 {
        let mut x = self.seed;
        x ^= x << 15;
        x ^= x >> 4;
        x ^= x << 23;
        self.seed = x;
        x
    }

    pub fn fnv1a_32(s: &[u8]) -> u32 {
        let mut hash: u32 = 0x811c9dc5;
        for byte in s {
            hash ^= *byte as u32;
            hash = hash.wrapping_mul(0x01000193);
        }
        hash
    }

    pub fn sample_from_vob(&mut self, vob: &SimpleVob) -> u32 {
        let nset = vob.num_set();
        assert!(nset > 0);
        if nset > vob.len() / 10 {
            loop {
                let idx = self.from_range(0..vob.len());
                if vob[idx] {
                    return idx as u32;
                }
            }
        } else {
            let choices = vob.to_list();
            choices[self.from_range(0..choices.len())]
        }
    }
}

impl Default for XorShift {
    fn default() -> Self {
        XorShift { seed: 0xdeadf00d }
    }
}

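// Added comment: miscellaneous per-parser metrics -- the parser's random
// source, a free-form message, and (presumably) time in microseconds not
// accounted to any slicer slice.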
#[derive(Debug, Default, Clone)]
pub struct ParserMetrics {
    pub rand: XorShift,
    pub message: String,
    pub slicer_leftover_us: usize,
}

impl ParserStats {
    pub fn delta(&self, previous: &ParserStats) -> ParserStats {
        ParserStats {
            rows: self.rows.saturating_sub(previous.rows),
            cached_rows: self.cached_rows.saturating_sub(previous.cached_rows),
            definitive_bytes: self
                .definitive_bytes
                .saturating_sub(previous.definitive_bytes),
            lexer_ops: self.lexer_ops.saturating_sub(previous.lexer_ops),
            num_lexemes: self.num_lexemes.saturating_sub(previous.num_lexemes),
            num_lex_errors: self.num_lex_errors.saturating_sub(previous.num_lex_errors),
            all_items: self.all_items.saturating_sub(previous.all_items),
            lexer_cost: self.lexer_cost.saturating_sub(previous.lexer_cost),
            compute_time_us: self
                .compute_time_us
                .saturating_sub(previous.compute_time_us),
            slices_applied: self.slices_applied.saturating_sub(previous.slices_applied),
            trie_nodes_walked: self
                .trie_nodes_walked
                .saturating_sub(previous.trie_nodes_walked),
        }
    }

    pub fn max(&self, other: &ParserStats) -> ParserStats {
        ParserStats {
            rows: self.rows.max(other.rows),
            cached_rows: self.cached_rows.max(other.cached_rows),
            definitive_bytes: self.definitive_bytes.max(other.definitive_bytes),
            lexer_ops: self.lexer_ops.max(other.lexer_ops),
            num_lexemes: self.num_lexemes.max(other.num_lexemes),
            num_lex_errors: self.num_lex_errors.max(other.num_lex_errors),
            all_items: self.all_items.max(other.all_items),
            lexer_cost: self.lexer_cost.max(other.lexer_cost),
            compute_time_us: self.compute_time_us.max(other.compute_time_us),
            slices_applied: self.slices_applied.max(other.slices_applied),
            trie_nodes_walked: self.trie_nodes_walked.max(other.trie_nodes_walked),
        }
    }
}

impl Display for ParserStats {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", serde_json::to_string_pretty(self).unwrap())
    }
}

id32_type!(GrammarStackPtr);

#[derive(Clone, Debug)]
struct GrammarStackNode {
    back_ptr: GrammarStackPtr,
    token_horizon: u32,
    grammar_id: LexemeClass,
    start_item: Item,
    start_item_idx: usize,
}

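// Added comment: one Earley row (chart column) -- a [first_item, last_item)
// slice of `Scratch.items`, the lexer start state for the lexeme that follows
// the row, the grammar-stack top at this row, and the lexeme that produced it.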
#[derive(Clone)]
struct Row {
    first_item: u32,
    last_item: u32,

    grammar_stack_ptr: GrammarStackPtr,

    lexer_start_state: StateID,

    lexeme_idx: MatchingLexemesIdx,
}

impl Row {
    fn item_indices(&self) -> Range<usize> {
        self.first_item as usize..self.last_item as usize
    }
}

impl Item {
    #[allow(dead_code)]
    const NULL: Self = Item { data: 0 };

    fn new(rule: RhsPtr, start: usize) -> Self {
        Item {
            data: rule.as_index() as u64 | ((start as u64) << 32),
        }
    }

    fn rhs_ptr(&self) -> RhsPtr {
        RhsPtr::from_index(self.data as u32)
    }

    fn start_pos(&self) -> usize {
        (self.data >> 32) as usize
    }

    fn advance_dot(&self) -> Self {
        Item {
            data: self.data + 1,
        }
    }

    fn rewind_dot(&self) -> Self {
        Item {
            data: self.data - 1,
        }
    }
}

impl Debug for Item {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let rule = self.rhs_ptr();
        write!(f, "Item(rhs={} @{})", rule.as_index(), self.start_pos())
    }
}

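// Added comment: working storage shared by all rows -- the flat `items` array
// that every `Row` indexes into, the grammar stack, and the `push_*` fields
// accumulated while building the row about to be pushed. `definitive` is true
// when consuming real input rather than speculating over the token trie.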
#[derive(Clone)]
struct Scratch {
    grammar: Arc<CGrammar>,

    row_start: usize,
    row_end: usize,

    items: Vec<Item>,
    item_args: Vec<ParamValue>,
    grammar_stack: Vec<GrammarStackNode>,

    push_allowed_grammar_ids: SimpleVob,
    push_allowed_lexemes: LexemeSet,
    push_grm_top: GrammarStackPtr,
    push_lexeme_idx: MatchingLexemesIdx,

    definitive: bool,

    log_override: bool,

    parametric: bool,
}

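// Added comment: definitive-mode metadata kept per row -- the lexeme that
// closed the row and the byte/token positions it covers in the input.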
#[derive(Clone)]
struct RowInfo {
    start_byte_idx: usize,
    lexeme: Lexeme,
    token_idx_start: usize,
    token_idx_stop: usize,
}

impl RowInfo {
    fn apply_token_idx(&mut self, idx: usize) {
        self.token_idx_start = self.token_idx_start.min(idx);
        self.token_idx_stop = self.token_idx_stop.max(idx);
    }

    fn set_token_idx(&mut self, idx: usize) {
        self.token_idx_start = idx;
        self.token_idx_stop = idx;
    }

    fn dbg(&self, lexer: &Lexer) -> String {
        format!(
            "token_idx: {}-{}; b:{}; {}",
            self.token_idx_start,
            self.token_idx_stop,
            self.start_byte_idx,
            lexer.dbg_lexeme(&self.lexeme),
        )
    }
}

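// Added comment: one entry of the lexer stack; the parser pushes one entry per
// byte, recording which row the byte belongs to and the lexer DFA state
// reached after consuming it.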
#[derive(Clone, Copy, Debug)]
struct LexerState {
    row_idx: u32,
    lexer_state: StateID,
    byte: Option<u8>,
}

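// Added comment: captured variables -- `capture_list` preserves insertion
// order (and allows repeats for "__LIST_APPEND:" names), while `capture_map`
// keeps the latest value per name for de-duplication.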
#[derive(Clone)]
struct Captures {
    capture_list: Vec<(String, Vec<u8>)>,
    capture_map: HashMap<String, Vec<u8>>,
}

impl Captures {
    fn new() -> Self {
        Captures {
            capture_list: vec![],
            capture_map: HashMap::default(),
        }
    }

    fn push(&mut self, cap: (String, Vec<u8>)) {
        let (name, bytes) = cap;
        if !name.starts_with("__LIST_APPEND:") {
            if let Some(old) = self.capture_map.get(&name) {
                if old == &bytes {
                    return;
                }
            }
        }
        self.capture_list.push((name.clone(), bytes.clone()));
        self.capture_map.insert(name, bytes);
    }
}

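// Added comment: the complete Earley parser state -- grammar, scratch rows and
// items, the lexer stack (one entry per byte), captures, stats and limits.
// The `Lexer` itself lives in `shared_box`.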
#[derive(Clone)]
struct ParserState {
    grammar: Arc<CGrammar>,
    tok_env: TokEnv,
    scratch: Scratch,
    trie_lexer_stack: usize,
    trie_grammar_stack: usize,
    captures: Captures,
    special_token_marker_token: TokenId,

    lexer_stack: Vec<LexerState>,
    lexer_stack_top_eos: bool,
    lexer_stack_flush_position: usize,
    rows: Vec<Row>,
    rows_valid_end: usize,

    trace_byte_stack: Vec<u8>,
    trace_stats0: ParserStats,
    trace_start: Instant,

    row_infos: Vec<RowInfo>,
    token_idx: usize,
    bytes: Vec<u8>,
    byte_to_token_idx: Vec<u32>,

    last_force_bytes_len: usize,

    stats: ParserStats,
    perf_counters: Arc<ParserPerfCounters>,
    limits: ParserLimits,
    metrics: ParserMetrics,
    max_all_items: usize,
    parser_error: Option<String>,
    backtrack_byte_count: usize,

    shared_box: Box<SharedState>,
}

#[derive(Clone, Default)]
struct SharedState {
    lexer_opt: Option<Lexer>,
}

impl SharedState {
    #[inline(always)]
    fn lexer_mut(&mut self) -> &mut Lexer {
        self.lexer_opt.as_mut().unwrap()
    }

    #[inline(always)]
    fn lexer(&self) -> &Lexer {
        self.lexer_opt.as_ref().unwrap()
    }
}

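// Added comment: public handle around `ParserState`; the shared lexer sits
// behind an `Arc<Mutex<...>>` so that clones of the `Parser` appear to share it.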
#[derive(Clone)]
pub struct Parser {
    shared: Arc<Mutex<Box<SharedState>>>,
    state: ParserState,
}

impl Scratch {
    fn new(grammar: Arc<CGrammar>) -> Self {
        Scratch {
            push_allowed_lexemes: grammar.lexer_spec().alloc_lexeme_set(),
            push_allowed_grammar_ids: grammar.lexer_spec().alloc_grammar_set(),
            push_grm_top: GrammarStackPtr::new(0),
            push_lexeme_idx: MatchingLexemesIdx::Single(LexemeIdx::new(0)),
            parametric: grammar.parametric(),
            grammar,
            row_start: 0,
            row_end: 0,
            items: vec![],
            item_args: vec![],
            grammar_stack: vec![],
            definitive: true,
            log_override: false,
        }
    }

    fn log_enabled(&self) -> bool {
        self.definitive || self.log_override
    }

    fn new_row(&mut self, pos: usize) {
        self.row_start = pos;
        self.row_end = pos;
    }

    fn row_len(&self) -> usize {
        self.row_end - self.row_start
    }

    fn work_row(&self, lexer_start_state: StateID) -> Row {
        Row {
            first_item: self.row_start as u32,
            last_item: self.row_end as u32,
            grammar_stack_ptr: self.push_grm_top,
            lexer_start_state,
            lexeme_idx: self.push_lexeme_idx,
        }
    }

    #[inline(always)]
    fn ensure_items(&mut self, n: usize) {
        let k = n.saturating_sub(self.items.len());
        self.items.reserve(k);
        if self.parametric {
            self.item_args.reserve(k);
        }
    }

    fn push_grammar_stack(&mut self, node: GrammarStackNode) {
        if self.log_enabled() {
            debug!("push_grammar_stack: {:?}", node);
        }
        let ptr = GrammarStackPtr::new(self.grammar_stack.len());
        self.grammar_stack.push(node);
        self.push_grm_top = ptr;
    }

    fn just_add_idx(&mut self, item: Item, src_item_idx: usize, info: &str) {
        let arg = if self.parametric {
            self.item_args[src_item_idx]
        } else {
            ParamValue::default()
        };
        self.just_add(item, arg, info)
    }

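    // Added comment: append the item at `row_end`, either growing `items` or
    // overwriting a slot left over from an earlier (speculative) row.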
    #[inline(always)]
    fn just_add(&mut self, item: Item, param: ParamValue, info: &str) {
        if self.items.len() == self.row_end {
            self.items.push(item);
        } else {
            self.items[self.row_end] = item;
        }
        if self.parametric {
            if self.item_args.len() == self.row_end {
                self.item_args.push(param);
            } else {
                self.item_args[self.row_end] = param;
            }
        }
        if self.log_enabled() {
            debug!(
                " addu: {} ({}) ::{}",
                self.item_to_string(self.row_end),
                info,
                param
            );
        }
        self.row_end += 1;
    }

    #[inline(always)]
    fn find_item(&self, item: Item) -> Option<usize> {
        self.items[self.row_start..self.row_end]
            .iter()
            .position(|&x| x == item)
            .map(|x| x + self.row_start)
    }

    #[inline(always)]
    fn add_unique(&mut self, item: Item, origin_item_idx: usize, info: &str) {
        if self.parametric {
            self.add_unique_arg(item, info, self.item_args[origin_item_idx])
        } else {
            self.add_unique_arg(item, info, ParamValue::default())
        }
    }

    #[inline(always)]
    fn add_unique_arg(&mut self, item: Item, info: &str, param: ParamValue) {
        if self.parametric {
            for idx in self.row_start..self.row_end {
                if self.items[idx] == item && self.item_args[idx] == param {
                    return;
                }
            }
            self.just_add(item, param, info);
        } else {
            if self.find_item(item).is_none() {
                self.just_add(item, param, info);
            }
        }
    }

    fn item_to_string(&self, idx: usize) -> String {
        item_to_string(
            &self.grammar,
            &self.items[idx],
            self.item_args.get(idx).copied().unwrap_or_default(),
        )
    }
}

macro_rules! ensure_internal {
    ($cond:expr, $msg:expr) => {
        ensure!($cond, "Internal error: {}", $msg)
    };
}

impl ParserState {
    fn new(
        tok_env: TokEnv,
        grammar: Arc<CGrammar>,
        mut limits: ParserLimits,
        perf_counters: Arc<ParserPerfCounters>,
    ) -> Result<(Self, Lexer)> {
        let start = grammar.start();
        let mut lexer = Lexer::from(grammar.lexer_spec(), &mut limits, true)?;
        if limits.precompute_large_lexemes {
            let t0 = crate::Instant::now();
            lexer.dfa.set_fuel(limits.initial_lexer_fuel);
            for spec in &grammar.lexer_spec().lexemes {
                let w = lexer.dfa.lexeme_weight(spec.idx);
                if w > 1000 {
                    let mut allowed = grammar.lexer_spec().alloc_lexeme_set();
                    allowed.add(spec.idx);
                    lexer.precompute_for(tok_env.tok_trie(), &allowed);
                }
            }
            perf_counters.precompute.record(t0.elapsed());
            if lexer.dfa.has_error() {
                bail!("lexer precomputation failed; either increase limits.initial_lexer_fuel or disable limits.precompute_large_lexemes");
            }
        }
        let scratch = Scratch::new(Arc::clone(&grammar));
        let lexer_state = lexer.a_dead_state();
        let spec_tok = tok_env
            .tok_trie()
            .greedy_tokenize(&[TokTrie::SPECIAL_TOKEN_MARKER]);
        let special_marker_token = if spec_tok.len() == 1 {
            spec_tok[0]
        } else {
            INVALID_TOKEN
        };
        let mut r = ParserState {
            grammar,
            tok_env,
            special_token_marker_token: special_marker_token,
            trie_lexer_stack: usize::MAX,
            rows: vec![],
            rows_valid_end: 0,
            row_infos: vec![],
            captures: Captures::new(),
            scratch,
            stats: ParserStats::default(),
            metrics: ParserMetrics::default(),
            trace_stats0: ParserStats::default(),
            trace_byte_stack: vec![],
            trace_start: Instant::now(),
            token_idx: 0,
            byte_to_token_idx: vec![],
            bytes: vec![],
            last_force_bytes_len: usize::MAX,
            max_all_items: usize::MAX,
            limits,
            backtrack_byte_count: 0,
            lexer_stack_top_eos: false,
            lexer_stack_flush_position: 0,
            lexer_stack: vec![LexerState {
                row_idx: 0,
                lexer_state,
                byte: None,
            }],
            trie_grammar_stack: 0,
            parser_error: None,
            shared_box: Box::new(SharedState {
                lexer_opt: Some(lexer),
            }),
            perf_counters,
        };

        r.scratch.grammar_stack.push(GrammarStackNode {
            back_ptr: GrammarStackPtr::new(0),
            token_horizon: u32::MAX,
            grammar_id: LexemeClass::ROOT,
            start_item: Item::new(RhsPtr::from_index(0), 0),
            start_item_idx: 0,
        });

        for rule in r.grammar.rules_of(start) {
            r.scratch
                .add_unique_arg(Item::new(*rule, 0), "init", ParamValue::default());
        }
        debug!("initial push");
        let _ = r.push_row(0, &Lexeme::bogus());
        ensure_internal!(
            r.num_rows() == 1 && r.rows.len() == 1,
            "initial push failed"
        );
        assert!(r.lexer_stack.len() == 1);

        if !r.lexer_spec().allow_initial_skip {
            let skip_id = r.lexer_spec().skip_id(LexemeClass::ROOT);
            let mut possible = r
                .lexer()
                .possible_lexemes(r.rows[0].lexer_start_state)
                .clone();
            possible.remove(skip_id);
            let new_state = r.lexer_mut().start_state(&possible);
            r.rows[0].lexer_start_state = new_state;
            debug!(
                "disallowing initial SKIP; {}",
                r.allowed_lexemes_dbg(new_state)
            );
        }

        let state = r.rows[0].lexer_start_state;
        r.lexer_stack[0].lexer_state = state;
        r.assert_definitive();

        let lexer = std::mem::take(&mut r.shared_box).lexer_opt.unwrap();

        r.stats.lexer_cost = lexer.dfa.total_fuel_spent();

        Ok((r, lexer))
    }

    #[inline(always)]
    fn lexer(&self) -> &Lexer {
        self.shared_box.lexer()
    }

    #[inline(always)]
    fn lexer_mut(&mut self) -> &mut Lexer {
        self.shared_box.lexer_mut()
    }

    fn with_items_limit<T>(
        &mut self,
        limit: usize,
        lbl: &str,
        f: impl FnOnce(&mut Self) -> T,
    ) -> T {
        self.max_all_items = self.stats.all_items + limit;

        let r = f(self);

        if self.stats.all_items > self.max_all_items && self.parser_error.is_none() {
            self.parser_error = Some(format!(
                "Too many items (limit {limit}; {lbl}); try avoiding single-byte/short lexemes"
            ));
        }

        self.max_all_items = usize::MAX;

        r
    }

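    // Added comment: compute the token mask for the current state by walking
    // the token trie speculatively (via `BiasComputer`), then allowing any
    // token-range lexemes and EOS when the grammar permits stopping here.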
    fn compute_bias(&mut self, computer: &dyn BiasComputer, start: &[u8]) -> SimpleVob {
        let t0 = Instant::now();
        let limits = self.limits.clone();
        let dfa = &mut self.lexer_mut().dfa;
        dfa.set_fuel(limits.step_lexer_fuel);
        dfa.set_max_states(limits.max_lexer_states);

        let mut set = self.with_items_limit(limits.step_max_items, "mask", |state| {
            let mut r = ParserRecognizer { state };
            computer.compute_bias(&mut r, start)
        });

        self.stats.lexer_cost = self.lexer().dfa.total_fuel_spent();

        if self.special_token_marker_token != INVALID_TOKEN {
            set.disallow_token(self.special_token_marker_token);
        }

        if start.is_empty() {
            self.run_speculative("token_ranges", |state| {
                if state.flush_lexer() {
                    for spec in state.token_range_lexemes() {
                        for range in &spec.token_ranges {
                            set.allow_range(range.clone());
                        }
                    }
                }
            });
        }

        let eos = computer.trie().eos_token();
        if eos != INVALID_TOKEN && start.is_empty() && self.lexer_allows_eos() {
            set.allow_token(eos);
        }

        let d = t0.elapsed();
        self.stats.compute_time_us += d.as_micros() as u64;
        self.perf_counters.compute_bias.record(d);

        set
    }

    fn after_dots(&self) -> impl Iterator<Item = RhsPtr> + '_ {
        self.curr_row()
            .item_indices()
            .map(|i| self.scratch.items[i].rhs_ptr())
    }

    fn after_dots_symdata(&self) -> impl Iterator<Item = &CSymbol> + '_ {
        self.after_dots().map(|pos| self.grammar.sym_data_dot(pos))
    }

    fn can_advance_inner(&self) -> bool {
        for data in self.after_dots_symdata() {
            if data.idx == CSymIdx::NULL {
                continue;
            }
            if data.is_terminal || data.gen_grammar.is_some() {
                return true;
            }
        }
        false
    }

    pub fn can_advance(&self) -> bool {
        self.has_pending_lexeme_bytes() || self.can_advance_inner()
    }

    pub fn has_pending_lexeme_bytes(&self) -> bool {
        let row_idx = self.num_rows() - 1;
        for back in self.lexer_stack.iter().rev() {
            if back.row_idx as usize != row_idx {
                break;
            }
            if back.byte.is_some() {
                return true;
            }
        }
        false
    }

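    // Added comment: a row is accepting if it contains a completed item (dot at
    // the end) whose left-hand side is the grammar's start symbol.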
    fn row_is_accepting(&self) -> bool {
        for pos in self.after_dots() {
            let after_dot = self.grammar.sym_idx_dot(pos);
            if after_dot == CSymIdx::NULL {
                let lhs = self.grammar.sym_idx_lhs(pos);
                if lhs == self.grammar.start() {
                    return true;
                }
            }
        }
        false
    }

    pub fn lexer_allows_eos(&mut self) -> bool {
        if self.has_pending_lexeme_bytes() {
            let lexer_state = self.lexer_state().lexer_state;
            self.lexer_mut().allows_eos(lexer_state)
        } else {
            false
        }
    }

    fn item_to_string(&self, idx: usize) -> String {
        self.scratch.item_to_string(idx)
    }

    fn print_row(&self, row_idx: usize) {
        let row = &self.rows[row_idx];
        println!(
            "row {}; lexer_stack={} top_state={:?}",
            row_idx,
            self.lexer_stack.len(),
            self.lexer_stack.last().unwrap().lexer_state
        );

        println!(
            " allowed: {}",
            self.allowed_lexemes_dbg(row.lexer_start_state)
        );

        if row_idx < self.row_infos.len() {
            let info = &self.row_infos[row_idx];
            if info.lexeme.is_bogus() {
                println!(" lexeme: placeholder");
            } else {
                println!(" lexeme: {}", self.lexer().dbg_lexeme(&info.lexeme));
            }
        } else {
            println!(" speculative");
        }
        for i in row.item_indices() {
            println!(" {}", self.item_to_string(i));
        }
    }

    #[inline(always)]
    fn lexer_state(&self) -> LexerState {
        self.lexer_stack[self.lexer_stack.len() - 1]
    }

    #[inline(always)]
    pub fn num_rows(&self) -> usize {
        self.lexer_state().row_idx as usize + 1
    }

    #[inline(always)]
    fn pop_lexer_states(&mut self, n: usize) {
        self.lexer_stack
            .truncate(self.lexer_stack.len().saturating_sub(n));
    }

    #[allow(dead_code)]
    pub fn print_stats(&mut self) {
        println!("stats: {:?}", self.stats);
        self.stats = ParserStats::default();
    }

    fn assert_definitive_inner(&self) {
        assert!(self.scratch.definitive);
        assert!(self.backtrack_byte_count == 0);
        if self.num_rows() != self.row_infos.len() {
            panic!(
                "num_rows={} row_infos={}",
                self.num_rows(),
                self.row_infos.len()
            );
        }
    }

    fn assert_definitive(&self) {
        self.assert_definitive_inner();

        if self.lexer_spec().can_rollback() {
            self.check_lexer_bytes_invariant();
        }
    }

    pub fn get_bytes(&self) -> &[u8] {
        &self.bytes
    }

    fn item_lhs(&self, item: &Item) -> CSymIdx {
        self.grammar.sym_idx_lhs(item.rhs_ptr())
    }

    #[allow(dead_code)]
    fn item_sym_data(&self, item: &Item) -> &CSymbol {
        self.grammar.sym_data(self.item_lhs(item))
    }

    fn hidden_start(&self, lexer: &mut Lexer) -> usize {
        let lexer_state = self.lexer_state().lexer_state;
        let hidden_len = lexer.possible_hidden_len(lexer_state);
        if hidden_len == 0 {
            return usize::MAX;
        }
        let last_lexeme_visible_len = self.curr_row_bytes().len() - hidden_len;
        let prefix_len = self.row_infos[self.num_rows() - 1].start_byte_idx;
        prefix_len + last_lexeme_visible_len
    }

    pub fn temperature(&self) -> Option<f32> {
        let mut temp = -1000.0f32;
        for data in self.after_dots_symdata() {
            if data.is_terminal {
                temp = temp.max(data.props.temperature);
            }
        }
        if temp < 0.00000001 {
            None
        } else {
            Some(temp)
        }
    }

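    // Added comment: roll back the last `n_bytes` definitive bytes, truncating
    // the byte, token and lexer-stack bookkeeping accordingly.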
    pub fn rollback(&mut self, n_bytes: usize) -> Result<()> {
        debug!("rollback: {} bytes", n_bytes);
        ensure!(self.parser_error.is_none(), "rollback: parser error");
        self.assert_definitive();
        ensure!(
            n_bytes <= self.byte_to_token_idx.len(),
            "rollback: too many bytes {} > {}",
            n_bytes,
            self.byte_to_token_idx.len()
        );

        let new_len = self.byte_to_token_idx.len() - n_bytes;

        self.byte_to_token_idx.truncate(new_len);
        self.bytes.truncate(new_len);
        self.lexer_stack.truncate(new_len + 1);

        self.row_infos.truncate(self.num_rows());
        self.token_idx = *self.byte_to_token_idx.last().unwrap_or(&0) as usize;
        self.last_force_bytes_len = usize::MAX;
        self.lexer_stack_top_eos = false;
        self.rows_valid_end = self.num_rows();

        self.assert_definitive();

        Ok(())
    }

    pub fn validate_tokens(&mut self, tokens: &[TokenId]) -> usize {
        self.assert_definitive();
        self.run_speculative("validate_tokens", |state| {
            state.scratch.log_override = true;
            let mut applied_idx = state.byte_to_token_idx.len();
            let tok_env = state.tok_env.clone();
            let trie = tok_env.tok_trie();
            let eos = trie.eos_token();
            let mut recog = ParserRecognizer { state };
            for (tidx, &tok) in tokens.iter().enumerate() {
                let state = &mut recog.state;
                if tok == eos {
                    if applied_idx == state.bytes.len() && state.is_accepting_inner() {
                        return tidx + 1;
                    } else {
                        return tidx;
                    }
                }

                if applied_idx >= state.bytes.len() {
                    let saved_parser_state = state.save_state();

                    if let Some(idx) = state.flush_and_check_numeric(tok) {
                        let numeric_bytes = trie.decode_as_special(tok);
                        let ok = state.add_numeric_token(idx, &numeric_bytes);
                        assert!(ok.is_ok());
                        continue;
                    }

                    state.restore_state(saved_parser_state);
                }

                let token_bytes = trie.decode_raw(&[tok]);

                let token_bytes = if applied_idx < state.bytes.len()
                    && state.bytes[applied_idx] == TokTrie::SPECIAL_TOKEN_MARKER
                {
                    trie.decode_as_special(tok)
                } else {
                    token_bytes
                };

                for &b in &token_bytes {
                    if applied_idx < recog.state.bytes.len() {
                        if recog.state.bytes[applied_idx] == b {
                            applied_idx += 1;
                        } else {
                            return tidx;
                        }
                    } else {
                        if b != TokTrie::SPECIAL_TOKEN_MARKER && recog.try_push_byte(b) {
                            continue;
                        } else {
                            return tidx;
                        }
                    }
                }
            }

            tokens.len()
        })
    }

    fn add_numeric_token(&mut self, idx: LexemeIdx, tok_bytes: &[u8]) -> Result<()> {
        let lexer_state = self.lexer_state();
        for &b in &tok_bytes[0..tok_bytes.len() - 1] {
            self.lexer_stack.push(LexerState {
                byte: Some(b),
                ..lexer_state
            });
        }

        if self.scratch.definitive {
            self.bytes.extend_from_slice(tok_bytes);
            for _ in 0..tok_bytes.len() {
                self.byte_to_token_idx
                    .push(self.token_idx.try_into().unwrap());
            }
        }
        debug_def!(
            self,
            "add_numeric_token: idx={:?} bytes={:?}",
            idx,
            tok_bytes
        );
        let ok = self.advance_parser(PreLexeme::just_idx(MatchingLexemesIdx::Single(idx)));
        ensure!(
            ok,
            "failed to advance parser after adding bytes ignoring lexer"
        );
        if self.scratch.definitive {
            let row_idx = self.num_rows() - 1;
            self.row_infos[row_idx].apply_token_idx(self.token_idx);
        }
        Ok(())
    }

    fn flush_and_check_numeric(&mut self, tok_id: TokenId) -> Option<LexemeIdx> {
        if self.flush_lexer() {
            for spec in self.token_range_lexemes() {
                if spec.contains_token(tok_id) {
                    return Some(spec.idx);
                }
            }
        }
        None
    }

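    // Added comment: apply the bytes of one sampled token definitively. Returns
    // the number of bytes to backtrack (normally 0); errors if the token does
    // not satisfy the grammar.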
    pub fn apply_token(&mut self, tok_bytes: &[u8], tok_id: TokenId) -> Result<usize> {
        self.assert_definitive();

        item_trace!("apply_token: {:?}", String::from_utf8_lossy(tok_bytes));

        let mut check_lexer_max_tokens = false;

        let mut row_to_apply = self.num_rows() - 1;

        let applied_idx0 = self.byte_to_token_idx.len();
        while row_to_apply > 0 {
            if self.row_infos[row_to_apply].start_byte_idx <= applied_idx0 {
                break;
            }
            row_to_apply -= 1;
        }

        if self.tok_env.tok_trie().token(tok_id) == tok_bytes
            && self.byte_to_token_idx.len() == self.bytes.len()
        {
            let applies = self
                .run_speculative("numeric_apply_token", |state| {
                    state.flush_and_check_numeric(tok_id)
                })
                .is_some();
            if applies {
                let row_idx = self.num_rows() - 1;
                self.row_infos[row_idx].apply_token_idx(self.token_idx);

                self.lexer_stack_flush_position = 0;
                let idx = self.flush_and_check_numeric(tok_id).unwrap();
                self.add_numeric_token(idx, tok_bytes)?;

                if self.lexer_stack_flush_position > 0 {
                    assert!(self.lexer_stack_flush_position + 1 < self.lexer_stack.len());
                    self.lexer_stack.remove(self.lexer_stack_flush_position);
                }

                self.assert_definitive();

                return Ok(0);
            }
        }

        for (bidx, &b) in tok_bytes.iter().enumerate() {
            check_lexer_max_tokens = false;
            let applied_idx = self.byte_to_token_idx.len();
            if applied_idx >= self.bytes.len() {
                assert!(applied_idx == self.bytes.len());

                let row_idx = self.num_rows() - 1;

                self.row_infos[row_idx].apply_token_idx(self.token_idx);

                let (ok, bt) = self.try_push_byte_definitive(Some(b));
                if !ok {
                    bail!(
                        "token {:?} doesn't satisfy the grammar; byte {:?} fails parse",
                        String::from_utf8_lossy(tok_bytes),
                        b as char,
                    );
                }
                if bt > 0 {
                    self.byte_to_token_idx.truncate(self.bytes.len());
                    let bt = bt + (tok_bytes.len() - bidx - 1);
                    return Ok(bt);
                }
                if row_idx == self.num_rows() - 1 {
                    check_lexer_max_tokens = true;
                }
            } else {
                if bidx == 0 && self.bytes[applied_idx] == TokTrie::SPECIAL_TOKEN_MARKER {
                    if let Some(tid) = self.tok_env.tok_trie().token_id_at_bytes(tok_bytes) {
                        if let Some((len, tid2)) =
                            parse_numeric_token(&self.bytes[applied_idx + 1..])
                        {
                            if tid == tid2 {
                                let tokidx = self.token_idx.try_into().unwrap();
                                for _ in 0..len + 1 {
                                    self.byte_to_token_idx.push(tokidx);
                                }
                                break;
                            }
                        }
                    }
                }
                if self.bytes[applied_idx] != b {
                    bail!(
                        "token {:?} doesn't satisfy the grammar; forced bytes: got {:?}; applying {:?}",
                        String::from_utf8_lossy(tok_bytes),
                        self.bytes[applied_idx] as char,
                        b as char,
                    );
                }
            }

            self.byte_to_token_idx
                .push(self.token_idx.try_into().unwrap());
        }

        item_trace!(
            "apply_token: ok, {}/{}",
            self.byte_to_token_idx.len(),
            self.bytes.len()
        );

        for idx in row_to_apply..self.num_rows() {
            if self.row_infos[idx].start_byte_idx >= applied_idx0 {
                self.row_infos[idx].set_token_idx(self.token_idx);
            } else {
                self.row_infos[idx].apply_token_idx(self.token_idx);
            }
        }

        if check_lexer_max_tokens {
            let row_idx = self.num_rows() - 1;

            let mut pop_classes = HashSet::default();
            let mut stack_ptr = self.rows[row_idx].grammar_stack_ptr;
            while stack_ptr.as_usize() > 0 {
                let grm_top = &self.scratch.grammar_stack[stack_ptr.as_usize()];
                if grm_top.token_horizon <= self.token_idx as u32 + 1 {
                    pop_classes.insert(grm_top.grammar_id);
                    stack_ptr = grm_top.back_ptr;
                } else {
                    break;
                }
            }

            let info = &self.row_infos[row_idx];
            let info_tokens = std::cmp::max(
                0,
                self.token_idx as isize + 1 - info.token_idx_start as isize,
            ) as usize;
            let lex_state = self.lexer_state().lexer_state;
            let mut limit = self.lexer_spec().alloc_lexeme_set();
            let mut num_limit = 0;
            {
                let possible_lexemes = self.lexer().possible_lexemes(lex_state);
                for lex in possible_lexemes.iter() {
                    let lex_spec = self.lexer_spec().lexeme_spec(lex);
                    let max_tokens = lex_spec.max_tokens();
                    let class_ok = !pop_classes.contains(&lex_spec.class());
                    debug!(
                        " max_tokens: {} max={} info={} class_ok={}",
                        self.lexer().dbg_lexeme(&Lexeme::single_idx(lex)),
                        max_tokens,
                        info_tokens,
                        class_ok
                    );
                    if info_tokens < max_tokens && class_ok {
                        limit.add(lex);
                    } else {
                        num_limit += 1;
                    }
                }
            }
            if num_limit > 0 {
                debug!(
                    " max_tokens limiting to: {}",
                    self.lexer_spec().dbg_lexeme_set(&limit)
                );
                let new_state = self.lexer_mut().limit_state_to(lex_state, &limit);
                if new_state.is_dead() {
                    debug!(" limited everything; forcing EOI");
                    let (ok, bt) = self.try_push_byte_definitive(None);
                    assert!(bt == 0);
                    if !ok {
                        debug!("parse reject on max_tokens");
                        return Ok(0);
                    }
                } else {
                    self.lexer_stack.last_mut().unwrap().lexer_state = new_state;
                }
            }
        }

        let item_count = self.curr_row().item_indices().count();
        if item_count > self.limits.max_items_in_row {
            bail!(
                "Current row has {} items; max is {}; consider making your grammar left-recursive if it's right-recursive",
                item_count,
                self.limits.max_items_in_row,
            );
        }

        if false {
            self.print_row(self.num_rows() - 1);
        }

        self.assert_definitive();

        Ok(0)
    }

    fn token_range_lexemes(&self) -> Vec<&LexemeSpec> {
        let state = self.lexer_state().lexer_state;
        let possible = self.lexer().possible_lexemes(state);
        self.lexer_spec().token_range_lexemes(possible)
    }

    pub fn needs_force_bytes(&self) -> bool {
        self.bytes.len() != self.last_force_bytes_len
    }

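    // Added comment: push every byte that is unambiguously forced by the
    // grammar from the current state ("fast-forward" bytes), including fully
    // determined special tokens encoded as SPECIAL_TOKEN_MARKER sequences.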
    pub fn force_bytes(&mut self) {
        self.assert_definitive();
        if !self.needs_force_bytes() {
            return;
        }
        trace!("force_bytes lexer_stack {}", self.lexer_stack.len());
        self.with_items_limit(self.limits.step_max_items, "ff_tokens", |s| {
            while let Some(b) = s.forced_byte() {
                debug!(" forced: {:?} 0x{:x}", b as char, b);
                if b == TokTrie::SPECIAL_TOKEN_MARKER {
                    assert!(!s.has_pending_lexeme_bytes());
                    let specs = s.token_range_lexemes();
                    let mut unique_token_id = None;
                    'spec: for s in specs {
                        for range in &s.token_ranges {
                            if range.start() == range.end() {
                                let t = *range.start();
                                if unique_token_id.is_none() || unique_token_id == Some(t) {
                                    unique_token_id = Some(t);
                                } else {
                                    unique_token_id = None;
                                    break 'spec;
                                }
                            } else {
                                unique_token_id = None;
                                break 'spec;
                            }
                        }
                    }
                    if let Some(token_id) = unique_token_id {
                        let mut bytes = format!("X[{token_id}]").into_bytes();
                        bytes[0] = TokTrie::SPECIAL_TOKEN_MARKER;
                        let mut all_ok = true;
                        for b in bytes {
                            let (ok, bt) = s.try_push_byte_definitive(Some(b));
                            assert!(bt == 0);
                            if !ok {
                                all_ok = false;
                                break;
                            }
                        }

                        if !all_ok {
                            debug!(
                                " force_bytes reject, special token {}, byte = {}",
                                token_id, b as char
                            );
                            break;
                        } else {
                            continue;
                        }
                    } else {
                        debug!(" non-determined special token");
                        break;
                    }
                }

                let (ok, bt) = s.try_push_byte_definitive(Some(b));
                assert!(bt == 0);
                if !ok {
                    debug!(" force_bytes reject {}", b as char);
                    break;
                }
            }
        });
        self.assert_definitive();
        self.last_force_bytes_len = self.bytes.len();
        let bytes = &self.bytes[self.byte_to_token_idx.len()..];
        trace!(
            "force_bytes exit {} lexer_stack={}",
            bytes.len(),
            self.lexer_stack.len()
        );
    }

    fn special_pre_lexeme(&mut self, state: StateID) -> bool {
        let possible = self.lexer().possible_lexemes(state);
        let specs = self.lexer_spec().token_range_lexemes(possible);
        let bytes = self.curr_row_bytes();
        debug!("special_pre_lexeme: {:?}", String::from_utf8_lossy(&bytes));
        let bytes = &bytes[2..bytes.len()];
        if let Ok(tok_id) = std::str::from_utf8(bytes).unwrap().parse::<u32>() {
            let idx = specs.iter().position(|spec| {
                spec.token_ranges
                    .iter()
                    .any(|range| range.contains(&tok_id))
            });
            debug!(" >> tok_id={} idx={:?}", tok_id, idx);
            if let Some(idx) = idx {
                let pre = PreLexeme {
                    idx: MatchingLexemesIdx::Single(specs[idx].idx),
                    byte: Some(b']'),
                    byte_next_row: false,
                };
                self.advance_parser(pre)
            } else {
                false
            }
        } else {
            debug!(" >> not a number; should never happen?");
            false
        }
    }

    #[inline(always)]
    fn advance_lexer_or_parser(&mut self, lex_result: LexerResult, curr: LexerState) -> bool {
        match lex_result {
            LexerResult::State(next_state, byte) => {
                self.lexer_stack.push(LexerState {
                    row_idx: curr.row_idx,
                    lexer_state: next_state,
                    byte: Some(byte),
                });
                true
            }
            LexerResult::Error => false,
            LexerResult::Lexeme(pre_lexeme) => self.advance_parser(pre_lexeme),
            LexerResult::SpecialToken(state) => self.special_pre_lexeme(state),
        }
    }

    fn check_lexer_bytes_invariant(&self) {
        let off = if self.lexer_stack_top_eos { 2 } else { 1 };
        if self.lexer_stack.len() != self.bytes.len() + off {
            panic!(
                "lexer_stack={:?} bytes={:?} {}!={}+{off}",
                self.lexer_stack,
                String::from_utf8_lossy(&self.bytes),
                self.lexer_stack.len(),
                self.bytes.len()
            );
        }
    }

    fn trie_started_inner(&mut self, lbl: &str) {
        self.assert_definitive();

        self.trie_lexer_stack = self.lexer_stack.len();
        self.trie_grammar_stack = self.scratch.grammar_stack.len();
        self.scratch.definitive = false;
        if ITEM_TRACE {
            self.trace_stats0 = self.stats.clone();
            self.trace_start = Instant::now();
            self.trace_byte_stack.clear();
            item_trace!("trie started; {}", lbl);
        }
        self.rows_valid_end = self.num_rows();
    }

    fn trie_finished_inner(&mut self) {
        assert!(!self.scratch.definitive);
        assert!(self.row_infos.len() <= self.num_rows());

        assert!(self.scratch.grammar_stack.len() >= self.trie_grammar_stack);
        self.scratch.grammar_stack.truncate(self.trie_grammar_stack);

        if ITEM_TRACE {
            let mut st = self.stats.clone();
            st.lexer_cost = self.lexer().dfa.total_fuel_spent();
            st = st.delta(&self.trace_stats0);
            st.compute_time_us = self.trace_start.elapsed().as_micros() as u64;
            item_trace!("trie finished: {}", serde_json::to_string(&st).unwrap());
            self.trace_byte_stack.clear();
        }

        self.pop_lexer_states(self.lexer_stack.len() - self.trie_lexer_stack);
        self.scratch.definitive = true;
        self.assert_definitive();
        self.rows_valid_end = self.num_rows();
        self.scratch.log_override = false;
        self.lexer_stack_flush_position = 0;
    }

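    // Added comment: run `f` in speculative (non-definitive) mode; lexer-stack
    // and grammar-stack growth is rolled back when `f` returns.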
    fn run_speculative<T>(&mut self, lbl: &str, f: impl FnOnce(&mut Self) -> T) -> T {
        self.trie_started_inner(lbl);
        let r = f(self);
        self.trie_finished_inner();
        r
    }

    fn is_accepting_inner(&mut self) -> bool {
        self.flush_lexer() && self.row_is_accepting()
    }

    pub fn is_accepting(&mut self) -> bool {
        self.run_speculative("is_accepting", |s| s.is_accepting_inner())
    }

    fn try_push_byte_definitive(&mut self, byte: Option<u8>) -> (bool, usize) {
        assert!(self.scratch.definitive);

        let curr = self.lexer_state();

        let res = if byte.is_none() {
            let lexeme = self.lexer_mut().force_lexeme_end(curr.lexer_state);
            if lexeme.is_error() {
                debug!(
                    " lexer fail on forced end; allowed: {}",
                    self.allowed_lexemes_dbg(self.rows[curr.row_idx as usize].lexer_start_state)
                );
            }
            lexeme
        } else {
            self.stats.definitive_bytes += 1;
            self.lexer_mut()
                .advance(curr.lexer_state, byte.unwrap(), true)
        };

        if res.is_error() {
            debug!(
                " lexer fail; allowed: {}",
                self.allowed_lexemes_dbg(self.rows[curr.row_idx as usize].lexer_start_state)
            );
        }

        assert!(self.backtrack_byte_count == 0);
        if self.advance_lexer_or_parser(res, curr) {
            if let Some(b) = byte {
                self.bytes.push(b);
            }
            let bt = std::mem::take(&mut self.backtrack_byte_count);
            if bt > 0 {
                assert!(self.lexer_spec().has_stop);
                self.last_force_bytes_len = usize::MAX;
                self.bytes.truncate(self.bytes.len() - bt);
            }
            (true, bt)
        } else {
            (false, 0)
        }
    }

    fn curr_row(&self) -> &Row {
        &self.rows[self.lexer_state().row_idx as usize]
    }

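    // Added comment: if exactly one byte can be accepted next, return it; first
    // ask the lexer DFA (fast path), otherwise try every byte speculatively.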
    fn forced_byte(&mut self) -> Option<u8> {
        if self.is_accepting() {
            debug!(" in accept state, not forcing");
            return None;
        }

        let lex_state = self.lexer_state().lexer_state;
        let quick_res = self.lexer_mut().next_byte(lex_state);
        if let NextByte::ForcedByte(b) = quick_res {
            return Some(b);
        }

        let slow_res = self.run_speculative("forced_byte", |state| {
            let mut r = ParserRecognizer { state };

            if let NextByte::SomeBytes2([a, b]) = quick_res {
                if r.try_push_byte(a) {
                    r.pop_bytes(1);
                    if r.try_push_byte(b) {
                        r.pop_bytes(1);
                        return None;
                    }
                }
            }

            let b0 = quick_res.some_bytes().first().cloned().unwrap_or(b' ');
            let mut b = b0;
            let mut byte_sym = None;
            loop {
                if r.try_push_byte(b) {
                    r.pop_bytes(1);
                    if byte_sym.is_some() {
                        return None;
                    } else {
                        byte_sym = Some(b);
                    }
                }
                b = b.wrapping_add(1);
                if b == b0 {
                    break;
                }
            }
            byte_sym
        });

        slow_res
    }

    fn save_state(&self) -> SavedParserState {
        SavedParserState {
            lexer_stack_length: self.lexer_stack.len(),
        }
    }

    fn restore_state(&mut self, state: SavedParserState) {
        self.lexer_stack.truncate(state.lexer_stack_length);
    }

    fn flush_lexer(&mut self) -> bool {
        if !self.has_pending_lexeme_bytes() {
            return true;
        }
        let curr = self.lexer_state();
        let lex_result = self.lexer_mut().try_lexeme_end(curr.lexer_state);
        let prev_len = self.lexer_stack.len();
        let r = self.advance_lexer_or_parser(lex_result, curr);
        if self.lexer_stack.len() != prev_len {
            assert!(self.lexer_stack.len() == prev_len + 1);
            assert!(prev_len > 0);
            self.lexer_stack_flush_position = prev_len;
        }
        assert!(self.backtrack_byte_count == 0);
        r
    }

    pub fn scan_eos(&mut self) -> bool {
        self.assert_definitive();
        let lexer_eos = self.lexer_allows_eos();

        debug!(" scan eos: lexer_eos={}", lexer_eos);

        let prev_stack = self.lexer_stack.len();
        if !self.flush_lexer() {
            assert_eq!(self.lexer_stack.len(), prev_stack);
            debug!(" flush_lexer() failed");
            return false;
        }

        debug!(" flush_lexer() OK");

        if lexer_eos {
            return true;
        }
        if self.lexer_stack.len() != prev_stack {
            assert_eq!(self.lexer_stack.len(), prev_stack + 1);
            self.lexer_stack_top_eos = true;
        }

        self.assert_definitive();
        false
    }

    fn scan_skip_lexeme(&mut self, lexeme: &Lexeme) -> bool {
        let src = self.curr_row().item_indices();
        let n = src.len();
        if n == 0 {
            return false;
        }
        self.scratch.ensure_items(src.end + n + 10);
        self.scratch.new_row(src.end);
        self.scratch.push_lexeme_idx = lexeme.idx;

        let mut lex_start = Some(self.rows[self.num_rows() - 1].lexer_start_state);

        for i in src {
            self.scratch
                .just_add_idx(self.scratch.items[i], i, "skip_lexeme");
        }

        let (grammar_id, max_token_ptr) = self.maybe_pop_grammar_stack(lexeme.idx);

        if let Some(ptr) = max_token_ptr {
            self.process_max_tokens(ptr, lexeme);
            lex_start = None;
        }

        let push_res = self.just_push_row(grammar_id, lex_start);
        assert!(push_res);

        true
    }

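    // Added comment: Earley scan step -- start a new row containing every item
    // whose next symbol is a lexeme matched by the just-recognized `lexeme`
    // (skip lexemes are handled separately and do not advance any dots).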
    fn scan(&mut self, lexeme: &Lexeme) -> bool {
        let set = self.shared_box.lexer().lexemes_from_idx(lexeme.idx);

        let lex_spec = self.lexer_spec();
        for lx in set.as_slice() {
            if lex_spec.lexeme_spec(*lx).is_skip {
                return self.scan_skip_lexeme(lexeme);
            }
        }

        let row_idx = self.num_rows() - 1;
        let items = self.rows[row_idx].item_indices();
        self.scratch.ensure_items(items.end + items.len() + 10);
        self.scratch.new_row(items.end);
        self.scratch.push_lexeme_idx = lexeme.idx;

        debug_def!(
            self,
            " scan: {} at row={} token={}",
            self.lexer().dbg_lexeme(lexeme),
            row_idx,
            self.token_idx,
        );

        for i in items {
            let item = self.scratch.items[i];
            let sym = self.grammar.sym_data_dot(item.rhs_ptr());
            if let Some(idx) = sym.lexeme {
                if set.contains(idx) {
                    self.scratch.just_add_idx(item.advance_dot(), i, "scan");
                }
            }
        }

        self.push_row(self.num_rows(), lexeme)
    }

    fn mk_capture(&self, var_name: &str, bytes: &[u8]) -> (String, Vec<u8>) {
        debug!(
            " capture: {} {:?}",
            var_name,
            String::from_utf8_lossy(bytes)
        );

        let bytes = self.tok_env.tok_trie().decode_raw_to_decode(bytes);
        (var_name.to_string(), bytes)
    }

    fn process_one_capture(
        &mut self,
        lhs: CSymIdx,
        curr_idx: usize,
        lexeme: &Lexeme,
        is_lexeme: bool,
        capture_start: usize,
    ) {
        let sym_data = self.grammar.sym_data(lhs);

        debug!(
            " process_one_capture: {} {}-{} {}",
            self.grammar.sym_name(lhs),
            capture_start,
            curr_idx,
            if is_lexeme { "lexeme" } else { "full" }
        );

        if let Some(var_name) = sym_data.props.stop_capture_name.as_ref() {
            let bytes = lexeme.hidden_bytes();
            self.captures.push(self.mk_capture(var_name, bytes));
        }

        if let Some(var_name) = sym_data.props.capture_name.as_ref() {
            let mut bytes = Vec::new();
            if capture_start < curr_idx {
                bytes = self.row_infos[capture_start..curr_idx]
                    .iter()
                    .map(|ri| ri.lexeme.upper_visible_bytes(is_lexeme))
                    .collect::<Vec<_>>()
                    .concat();
            }
            if is_lexeme || capture_start < curr_idx {
                bytes.extend_from_slice(lexeme.upper_visible_bytes(is_lexeme));
            }
            self.captures.push(self.mk_capture(var_name, &bytes));
        }
    }

    fn process_captures(&mut self, item: Item, curr_idx: usize, lexeme: &Lexeme, for_lexeme: bool) {
        let rule = item.rhs_ptr();
        let for_full_rule = self.grammar.sym_idx_dot(rule) == CSymIdx::NULL;

        debug!(
            " process_captures: for_full_rule={} for_lexeme={}; {:?}",
            for_full_rule, for_lexeme, lexeme
        );

        if for_full_rule {
            let lhs = self.grammar.sym_idx_lhs(rule);
            self.process_one_capture(lhs, curr_idx, lexeme, false, item.start_pos());
        }

        if for_lexeme {
            let prev_item = item.rewind_dot();
            let lex_idx = self.grammar.sym_idx_dot(prev_item.rhs_ptr());
            assert!(lex_idx != CSymIdx::NULL);
            self.process_one_capture(lex_idx, curr_idx, lexeme, true, curr_idx);
        }
    }

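    // Added comment: Earley prediction/completion loop -- drain the current
    // row's agenda, completing finished items into their origin rows and
    // predicting rules (with parameter evaluation for parametric grammars),
    // while collecting the set of lexemes allowed in the next row.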
    #[inline(always)]
    fn process_agenda(&mut self, curr_idx: usize, lexeme: &Lexeme) {
        let mut agenda_ptr = self.scratch.row_start;

        self.scratch.push_allowed_lexemes.clear();
        self.scratch.push_allowed_grammar_ids.set_all(false);

        let lexemes_end = if self.scratch.row_start == 0 {
            0
        } else {
            self.scratch.row_end
        };

        while agenda_ptr < self.scratch.row_end {
            let item_idx = agenda_ptr;
            let item = self.scratch.items[agenda_ptr];
            agenda_ptr += 1;
            debug_def!(self, " agenda: {}", self.item_to_string(item_idx));

            let dot_ptr = item.rhs_ptr();
            let after_dot = self.grammar.sym_idx_dot(dot_ptr);

            if self.scratch.definitive {
                let is_lexeme = agenda_ptr <= lexemes_end;
                if is_lexeme || after_dot == CSymIdx::NULL {
                    self.process_captures(item, curr_idx, lexeme, is_lexeme);
                }
            }

            if after_dot == CSymIdx::NULL {
                let lhs = self.grammar.sym_idx_lhs(dot_ptr);
                if item.start_pos() < curr_idx {
                    for i in self.rows[item.start_pos()].item_indices() {
                        let item = self.scratch.items[i];
                        if self.grammar.sym_idx_dot(item.rhs_ptr()) == lhs {
                            self.scratch.add_unique(item.advance_dot(), i, "complete");
                        }
                    }
                }
            } else {
                let sym_data = self.grammar.sym_data(after_dot);
                if let Some(lx) = sym_data.lexeme {
                    self.scratch
                        .push_allowed_grammar_ids
                        .set(sym_data.props.grammar_id.as_usize(), true);
                    self.scratch.push_allowed_lexemes.add(lx);
                }

                let mut cond_nullable = false;

                if sym_data.gen_grammar.is_some() {
                    let mut node = self.mk_grammar_stack_node(sym_data, curr_idx);
                    self.scratch
                        .add_unique(node.start_item, item_idx, "gen_grammar");
                    node.start_item_idx = self.scratch.find_item(node.start_item).unwrap();
                    self.scratch.push_grammar_stack(node);
                } else {
                    if self.scratch.parametric {
                        let param_here = self.scratch.item_args[item_idx];
                        let param_dot = self.grammar.param_value_dot(dot_ptr).eval(param_here);

                        if !sym_data.cond_nullable.is_empty()
                            && sym_data.cond_nullable.iter().any(|c| c.eval(param_dot))
                        {
                            cond_nullable = true;
                        }

                        for ri in 0..sym_data.rules.len() {
                            if !sym_data.rules_cond[ri].eval(param_dot) {
                                continue;
                            }
                            let inner_rule = sym_data.rules[ri];
                            let new_item = Item::new(inner_rule, curr_idx);
                            self.scratch
                                .add_unique_arg(new_item, "predict_parametric", param_dot);
                        }
                    } else {
                        for rule in &sym_data.rules {
                            let new_item = Item::new(*rule, curr_idx);
                            self.scratch.add_unique(new_item, item_idx, "predict");
                        }
                    }
                }

                if sym_data.is_nullable || cond_nullable {
                    self.scratch
                        .add_unique(item.advance_dot(), item_idx, "null");
                    if self.scratch.definitive && sym_data.props.capture_name.is_some() {
                        let var_name = sym_data.props.capture_name.as_ref().unwrap();
                        debug!(" capture: {} NULL", var_name);
                        self.captures.push((var_name.clone(), vec![]));
                    }
                }
            }
        }
    }

    #[inline(always)]
    fn just_push_row(&mut self, grammar_id: LexemeClass, lex_start: Option<StateID>) -> bool {
        let row_len = self.scratch.row_len();

        self.stats.rows += 1;

        if row_len == 0 {
            false
        } else {
            self.stats.all_items += row_len;

            let lex_start = if let Some(l) = lex_start {
                l
            } else {
                if self
                    .scratch
                    .push_allowed_grammar_ids
                    .get(grammar_id.as_usize())
                {
                    let skip = self.lexer_spec().skip_id(grammar_id);
                    self.scratch.push_allowed_lexemes.add(skip);
                }

                self.shared_box
                    .lexer_mut()
                    .start_state(&self.scratch.push_allowed_lexemes)
            };

            debug_def!(
                self,
                " push row: {} {:?}",
                self.allowed_lexemes_dbg(lex_start),
                grammar_id
            );

            let idx = self.num_rows();

            let row = self.scratch.work_row(lex_start);
            if self.rows.is_empty() || self.rows.len() == idx {
                self.rows.push(row);
            } else {
                self.rows[idx] = row;
            }
            self.rows_valid_end = idx + 1;

            if self.scratch.definitive {
                if self.row_infos.len() > idx {
                    self.row_infos.drain(idx..);
                }

                let mut start_byte_idx = self.bytes.len();
                if start_byte_idx > 0 {
                    start_byte_idx += 1;
                }

                self.row_infos.push(RowInfo {
                    lexeme: Lexeme::bogus(),
                    token_idx_start: self.token_idx,
                    token_idx_stop: self.token_idx,
                    start_byte_idx,
                });
            }

            true
        }
    }

    fn process_max_tokens(&mut self, ptr: GrammarStackPtr, lexeme: &Lexeme) {
        debug_def!(self, " process_max_tokens");
        let curr_idx = self.num_rows();
        let top = &self.scratch.grammar_stack[ptr.as_usize()];
        self.scratch.push_grm_top = top.back_ptr;
        let item = top.start_item.advance_dot();
        self.scratch.row_end = self.scratch.row_start;
        self.scratch
            .just_add_idx(item, top.start_item_idx, "max_tokens");
        self.process_agenda(curr_idx, lexeme);
    }

    #[inline(always)]
    fn push_row(&mut self, curr_idx: usize, lexeme: &Lexeme) -> bool {
        let (grammar_id, max_token_ptr) = self.maybe_pop_grammar_stack(lexeme.idx);

        self.process_agenda(curr_idx, lexeme);

        if let Some(ptr) = max_token_ptr {
            assert!(curr_idx == self.num_rows(), "max_tokens on first row");
            self.process_max_tokens(ptr, lexeme);
        }

        self.just_push_row(grammar_id, None)
    }

    fn mk_grammar_stack_node(&self, sym_data: &CSymbol, curr_idx: usize) -> GrammarStackNode {
        assert!(sym_data.rules.len() == 1);
        let start_item = Item::new(sym_data.rules[0], curr_idx);
        assert!(self.grammar.sym_idx_dot(start_item.advance_dot().rhs_ptr()) == CSymIdx::NULL);
        let nested_sym = self.grammar.sym_data_dot(start_item.rhs_ptr());
        let token_horizon = sym_data.props.max_tokens.saturating_add(self.token_idx);
        GrammarStackNode {
            back_ptr: self.scratch.push_grm_top,
            token_horizon: token_horizon as u32,
            grammar_id: nested_sym.props.grammar_id,
            start_item,
            start_item_idx: usize::MAX,
        }
    }

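    // Added comment: walk the grammar stack from the current row's top, popping
    // nested grammars whose class does not match the scanned lexeme; returns the
    // grammar class for the new row and, if the matching node's max_tokens
    // horizon has been reached, a pointer to the node to force-close.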
2206 #[inline(always)]
2208 fn maybe_pop_grammar_stack(
2209 &mut self,
2210 lx: MatchingLexemesIdx,
2211 ) -> (LexemeClass, Option<GrammarStackPtr>) {
2212 let set = self.lexer().lexemes_from_idx(lx);
2213 let lex_spec = &self.lexer_spec();
2214 let grammar_ids = HashSet::from_iter(
2215 set.as_slice()
2216 .iter()
2217 .map(|&e| lex_spec.lexeme_spec(e).class()),
2218 );
2219 let mut max_token_ptr = None;
2220
2221 let mut grm_stack_top = if !self.rows.is_empty() {
2222 self.rows[self.num_rows() - 1].grammar_stack_ptr
2223 } else {
2224 GrammarStackPtr::new(0)
2225 };
2226
2227 while grm_stack_top.as_usize() > 0 {
2228 let grm_top = &self.scratch.grammar_stack[grm_stack_top.as_usize()];
2229 debug_def!(
2230 self,
2231 " pop grammar_stack: top={:?}, curr={:?}, #{}",
2232 grm_top.grammar_id,
2233 grammar_ids,
2234 self.token_idx
2235 );
2236 if grammar_ids.contains(&grm_top.grammar_id) {
2237 if grm_top.token_horizon <= self.token_idx as u32 {
2239 debug_def!(
2245 self,
2246 " hit token limit horizon={} token_idx={}",
2247 grm_top.token_horizon,
2248 self.token_idx
2249 );
2250 max_token_ptr = Some(grm_stack_top);
2251 }
2252 break;
2253 }
2254 grm_stack_top = grm_top.back_ptr;
2255 }
2256
2257 if grm_stack_top.as_usize() == 0 {
2258 assert!(
2259 grammar_ids.contains(&LexemeClass::ROOT),
2260 "grammar stack empty for non-root grammar: {grammar_ids:?}"
2261 );
2262 }
2263
2264 self.scratch.push_grm_top = grm_stack_top;
2265
2266 let top_id = self.scratch.grammar_stack[grm_stack_top.as_usize()].grammar_id;
2267 (top_id, max_token_ptr)
2268 }
2269
2270 #[inline(always)]
2273 fn curr_row_bytes(&self) -> Vec<u8> {
2274 let mut bytes = vec![];
2275 let row_idx = self.num_rows() - 1;
2276 for back in self.lexer_stack.iter().rev() {
2277 if back.row_idx as usize != row_idx {
2278 break;
2279 }
2280 if let Some(b) = back.byte {
2281 bytes.push(b);
2282 }
2283 }
2284 bytes.reverse();
2285 bytes
2286 }
2287
2288 fn lexer_spec(&self) -> &LexerSpec {
2289 self.grammar.lexer_spec()
2290 }
2291
2292 fn allowed_lexemes_dbg(&self, lex_state: StateID) -> String {
2293 self.lexer_spec()
2294 .dbg_lexeme_set(self.lexer().possible_lexemes(lex_state))
2295 }
2296
2297 #[inline(always)]
2300 fn mk_lexeme(&self, byte: Option<u8>, pre_lexeme: PreLexeme) -> Lexeme {
2301 let mut bytes = self.curr_row_bytes();
2302 if let Some(byte) = byte {
2303 bytes.push(byte);
2304 }
2305
2306 let (hidden, is_suffix) = self.lexer().lexeme_props(pre_lexeme.idx);
2307 Lexeme::new(pre_lexeme.idx, bytes, hidden, is_suffix)
2308 }
2309
2310 fn has_forced_bytes(&self, allowed_lexemes: &LexemeSet, bytes: &[u8]) -> bool {
2311 if allowed_lexemes.is_empty() {
2313 return false;
2314 }
2315 let mut matched_something = false;
2316 for lexeme_idx in allowed_lexemes.iter() {
2317 let lex_spec = &self.lexer_spec().lexemes[lexeme_idx.as_usize()];
2318 if lex_spec.is_skip && matches!(lex_spec.rx, RegexAst::NoMatch) {
2319 continue;
2320 }
2321
2322 if !self.lexer_spec().has_forced_bytes(lex_spec, bytes) {
2323 return false;
2324 }
2325 matched_something = true;
2326 }
2327 matched_something
2329 }
2330
2331 #[inline(always)]
2332 fn lexer_state_for_added_row(
2333 &mut self,
2334 lexeme: Lexeme,
2335 transition_byte: Option<u8>,
2336 ) -> LexerState {
2337 let added_row = self.num_rows();
2340 let added_row_start_state = self.rows[added_row].lexer_start_state;
2341
2342 let no_hidden = LexerState {
2343 row_idx: added_row as u32,
2344 lexer_state: self
2345 .shared_box
2346 .lexer_mut()
2347 .transition_start_state(added_row_start_state, transition_byte),
2348 byte: transition_byte,
2349 };
2350
2351 if self.scratch.definitive {
2352 self.row_infos[added_row - 1].lexeme = lexeme;
            if transition_byte.is_some() {
                // the transition byte is attributed to the newly added row,
                // so that row effectively starts one byte earlier
                let new_start = self.row_infos[added_row]
                    .start_byte_idx
                    .saturating_sub(1);
                self.row_infos[added_row].start_byte_idx = new_start;
            }
        }
        debug_def!(
            self,
            "lex: re-start {:?} (via {:?}); allowed: {}",
            no_hidden.lexer_state,
            transition_byte.map(|b| b as char),
            self.allowed_lexemes_dbg(added_row_start_state)
        );

        no_hidden
    }

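    // Handle a lexeme that carries trailing "hidden" bytes. If the next row's
    // allowed lexemes force exactly those bytes, they are pushed through the
    // lexer right away; otherwise they are scheduled for backtracking (in
    // definitive mode) or the speculative branch is dead-ended. Returns false
    // if the parser could not advance.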
    #[inline(always)]
    fn handle_hidden_bytes(
        &mut self,
        no_hidden: LexerState,
        lexeme_byte: Option<u8>,
        pre_lexeme: PreLexeme,
    ) -> bool {
        let added_row_start_state = self.rows[self.num_rows()].lexer_start_state;

        let lexeme = self.mk_lexeme(lexeme_byte, pre_lexeme);

        let hidden_bytes = lexeme.hidden_bytes();

        let trace_here = self.scratch.log_enabled();

        if trace_here {
            trace!(
                " hidden_bytes: {} {:?}",
                self.allowed_lexemes_dbg(added_row_start_state),
                String::from_utf8_lossy(hidden_bytes)
            );
        }

        if self.has_forced_bytes(
            self.lexer().possible_lexemes(added_row_start_state),
            hidden_bytes,
        ) {
            if trace_here {
                trace!(" hidden forced");
            }
            let mut lexer_state = added_row_start_state;
            self.pop_lexer_states(hidden_bytes.len() - 1);
            for idx in 0..hidden_bytes.len() {
                let b = hidden_bytes[idx];
                match self
                    .shared_box
                    .lexer_mut()
                    .advance(lexer_state, b, trace_here)
                {
                    LexerResult::State(next_state, _) => {
                        lexer_state = next_state;
                    }
                    LexerResult::SpecialToken(_) => panic!("hidden byte resulted in special token"),
                    LexerResult::Error => panic!("hidden byte failed; {hidden_bytes:?}"),
                    LexerResult::Lexeme(second_lexeme) => {
                        if trace_here {
                            debug!("hidden bytes lexeme: {:?}", second_lexeme);
                        }
                        assert!(
                            idx == hidden_bytes.len() - 1,
                            "lexeme in the middle of hidden bytes"
                        );

                        self.lexer_stack.push(LexerState {
                            lexer_state,
                            byte: None,
                            ..no_hidden
                        });
                        let r = self.advance_parser(second_lexeme);
                        if r {
                            let new_top = self.lexer_stack.pop().unwrap();
                            *self.lexer_stack.last_mut().unwrap() = new_top;
                            return true;
                        } else {
                            self.lexer_stack.pop();
                            return false;
                        }
                    }
                }
                self.lexer_stack.push(LexerState {
                    lexer_state,
                    byte: Some(b),
                    ..no_hidden
                });
            }
            if self.scratch.definitive {
                self.assert_definitive_inner();
            }
        } else {
            if trace_here {
                debug!(" hidden not forced");
            }
            if self.scratch.definitive {
                self.lexer_stack.push(LexerState {
                    lexer_state: added_row_start_state,
                    byte: None,
                    ..no_hidden
                });
                self.assert_definitive_inner();
                self.backtrack_byte_count = hidden_bytes.len();
            } else {
                self.lexer_stack.push(LexerState {
                    lexer_state: self.shared_box.lexer_mut().a_dead_state(),
                    byte: None,
                    ..no_hidden
                });
            }
        }

        true
    }

    fn lexer_stack_top(&self) -> String {
        String::from_utf8_lossy(&self.trace_byte_stack).to_string()
    }

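    // Advance the Earley parser with a just-matched pre-lexeme. Returns true
    // if the parser accepted it, in which case the lexer stack has been
    // extended with the state for the newly added row.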
    #[inline(never)]
    fn advance_parser(&mut self, pre_lexeme: PreLexeme) -> bool {
        if self.stats.all_items > self.max_all_items {
            return false;
        }

        let transition_byte = if pre_lexeme.byte_next_row {
            pre_lexeme.byte
        } else {
            None
        };
        let lexeme_byte = if pre_lexeme.byte_next_row {
            None
        } else {
            pre_lexeme.byte
        };
        let lexeme_idx = pre_lexeme.idx;

        let lexeme = if self.scratch.definitive {
            self.mk_lexeme(lexeme_byte, pre_lexeme)
        } else {
            Lexeme::just_idx(lexeme_idx)
        };

        let scan_res = if !self.scratch.definitive
            && self.num_rows() < self.rows_valid_end
            && self.rows[self.num_rows()].lexeme_idx == lexeme_idx
        {
            self.stats.cached_rows += 1;
            true
        } else {
            let scan_res = self.scan(&lexeme);

            if scan_res && ITEM_TRACE {
                let added_row = self.num_rows();
                let row = &self.rows[added_row];
                item_trace!(
                    " row: {:?} -> {}",
                    self.lexer_stack_top(),
                    row.item_indices().len()
                );

                if self.stats.all_items > self.max_all_items {
                    panic!("max items exceeded");
                }
            }

            scan_res
        };

        if scan_res {
            let mut no_hidden = self.lexer_state_for_added_row(lexeme, transition_byte);

            let (hidden, is_suffix) = self.lexer().lexeme_props(lexeme_idx);
            if hidden > 0 && !is_suffix {
                return self.handle_hidden_bytes(no_hidden, lexeme_byte, pre_lexeme);
            } else {
                if pre_lexeme.byte_next_row && no_hidden.lexer_state.is_dead() {
                    if self.scratch.definitive {
                        self.row_infos.drain(no_hidden.row_idx as usize..);
                    }
                    return false;
                }
                if let Some(b) = transition_byte {
                    let single = self
                        .lexer_mut()
                        .check_for_single_byte_lexeme(no_hidden.lexer_state, b);
                    if let Some(second_lexeme) = single {
                        debug_def!(self, "single byte lexeme: {:?}", second_lexeme);
                        no_hidden.byte = None;
                        self.lexer_stack.push(no_hidden);

                        assert!(pre_lexeme.byte_next_row);
                        assert!(!second_lexeme.byte_next_row);

                        let r = self.advance_parser(second_lexeme);
                        if r {
                            let new_top = self.lexer_stack.pop().unwrap();
                            *self.lexer_stack.last_mut().unwrap() = new_top;
                            return true;
                        } else {
                            self.lexer_stack.pop();
                            return false;
                        }
                    }
                }
                debug_def!(self, " push normal: {no_hidden:?}");
                self.lexer_stack.push(no_hidden);
            }
            if self.scratch.definitive {
                self.assert_definitive_inner();
            }
            true
        } else {
            debug_def!(self, " scan failed");
            false
        }
    }
}

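// A thin, mutable view over ParserState that implements toktrie's Recognizer,
// used while walking the token trie to compute the token mask.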
pub struct ParserRecognizer<'a> {
    state: &'a mut ParserState,
}

impl ParserRecognizer<'_> {
    pub fn lexer_mut(&mut self) -> &mut Lexer {
        self.state.lexer_mut()
    }
    pub fn lexer(&self) -> &Lexer {
        self.state.lexer()
    }
    pub fn lexer_state(&self) -> StateID {
        self.state.lexer_state().lexer_state
    }
    pub fn stats_mut(&mut self) -> &mut ParserStats {
        &mut self.state.stats
    }
    pub fn metrics_mut(&mut self) -> &mut ParserMetrics {
        &mut self.state.metrics
    }
}

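// Strategy for computing the token mask (a SimpleVob over the token trie)
// from the current parser state.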
pub trait BiasComputer: Send + Sync {
    fn compute_bias(&self, rec: &mut ParserRecognizer<'_>, start: &[u8]) -> SimpleVob;
    fn trie(&self) -> &TokTrie;
}

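// Byte-level interface used by the token-trie walk: try_push_byte advances the
// shared lexer and, when a lexeme completes, the Earley parser itself.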
impl Recognizer for ParserRecognizer<'_> {
    #[inline(always)]
    fn pop_bytes(&mut self, num: usize) {
        if ITEM_TRACE {
            self.state
                .trace_byte_stack
                .truncate(self.state.trace_byte_stack.len() - num);
        }
        self.state.pop_lexer_states(num);
    }

    fn collapse(&mut self) {}

    fn trie_started(&mut self, lbl: &str) {
        self.state.trie_started_inner(lbl);
    }

    fn trie_finished(&mut self) {
        self.state.trie_finished_inner();
    }

    #[inline(always)]
    fn try_push_byte(&mut self, byte: u8) -> bool {
        let stats = false;

        let lexer_logging = false;
        let curr = self.state.lexer_state();
        let res = self
            .state
            .lexer_mut()
            .advance(curr.lexer_state, byte, lexer_logging);

        if ITEM_TRACE {
            self.state.trace_byte_stack.push(byte);
        }

        if stats {
            assert!(!self.state.scratch.definitive);

            self.state.stats.lexer_ops += 1;
            match res {
                LexerResult::State(_, _) => {}
                LexerResult::Error => self.state.stats.num_lex_errors += 1,
                LexerResult::Lexeme(_) | LexerResult::SpecialToken(_) => {
                    self.state.stats.num_lexemes += 1
                }
            }
        }

        let r = self.state.advance_lexer_or_parser(res, curr);

        if ITEM_TRACE && !r {
            self.state.trace_byte_stack.pop();
        }

        r
    }

    fn save_stats(&mut self, nodes_walked: usize) {
        self.state.stats.trie_nodes_walked += nodes_walked;
    }
}

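// Debug rendering of an Earley item: the dotted rule, its start position, and
// the parameter value when it is not the default.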
fn item_to_string(g: &CGrammar, item: &Item, param: ParamValue) -> String {
    let mut r = format!(
        "{} @{}",
        g.rule_to_string(item.rhs_ptr(), &ParamCond::True),
        item.start_pos()
    );
    if !param.is_default() {
        r.push_str(&format!(" ::{param}"));
    }
    r
}

pub enum ParserError {
    LexerError(String),
    ParserError(String),
}

impl ParserError {
    pub fn stop_reason(&self) -> StopReason {
        match self {
            ParserError::LexerError(_) => StopReason::LexerTooComplex,
            ParserError::ParserError(_) => StopReason::ParserTooComplex,
        }
    }

    pub fn message(&self) -> String {
        match self {
            ParserError::LexerError(s) => format!("lexer error: {s}"),
            ParserError::ParserError(s) => format!("parser error: {s}"),
        }
    }
}

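// Public wrapper around ParserState. The lexer lives behind an Arc<Mutex<..>>
// (SharedState) and is temporarily moved into the state for the duration of
// each with_shared() call.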
impl Parser {
    pub fn new(
        tok_env: TokEnv,
        grammar: Arc<CGrammar>,
        limits: ParserLimits,
        perf_counters: Arc<ParserPerfCounters>,
    ) -> Result<Self> {
        let (state, lexer) = ParserState::new(tok_env, grammar, limits, perf_counters)?;
        let shared = Arc::new(Mutex::new(Box::new(SharedState {
            lexer_opt: Some(lexer),
        })));
        Ok(Parser { shared, state })
    }

    pub fn compute_bias(&mut self, computer: &dyn BiasComputer, start: &[u8]) -> SimpleVob {
        self.with_shared(|state| state.compute_bias(computer, start))
    }

    pub fn captures(&self) -> &[(String, Vec<u8>)] {
        &self.state.captures.capture_list
    }

    pub fn get_capture(&self, name: &str) -> Option<&[u8]> {
        self.state.captures.capture_map.get(name).map(|v| &v[..])
    }

    pub fn stats(&self) -> &ParserStats {
        &self.state.stats
    }

    #[inline(always)]
    pub fn perf_counters(&self) -> &ParserPerfCounters {
        &self.state.perf_counters
    }

    pub fn metrics_mut(&mut self) -> &mut ParserMetrics {
        &mut self.state.metrics
    }

    pub fn hidden_start(&self) -> usize {
        let mut shared = self.shared.lock().unwrap();
        self.state.hidden_start(shared.lexer_mut())
    }

    pub fn lexer_stats(&self) -> LexerStats {
        self.shared.lock().unwrap().lexer().dfa.stats()
    }

    pub fn get_error(&self) -> Option<ParserError> {
        let shared = self.shared.lock().unwrap();
        if let Some(e) = shared.lexer().dfa.get_error() {
            return Some(ParserError::LexerError(e));
        }
        if let Some(e) = &self.state.parser_error {
            return Some(ParserError::ParserError(e.clone()));
        }
        None
    }

    pub fn with_recognizer<T>(&mut self, f: impl FnOnce(&mut ParserRecognizer) -> T) -> T {
        self.with_shared(|state| {
            let mut rec = ParserRecognizer { state };
            f(&mut rec)
        })
    }

    pub fn get_bytes(&self) -> &[u8] {
        self.state.get_bytes()
    }

    pub fn force_bytes(&mut self) -> &[u8] {
        if !self.state.needs_force_bytes() {
            self.currently_forced_bytes()
        } else {
            let t0 = Instant::now();
            let prev_len = self.currently_forced_bytes().len();
            self.with_shared(|state| state.force_bytes());
            let r = self.currently_forced_bytes();
            if r.len() > prev_len {
                self.state.perf_counters.force_bytes.record(t0.elapsed());
            } else {
                self.state
                    .perf_counters
                    .force_bytes_empty
                    .record(t0.elapsed());
            }
            r
        }
    }

    pub fn scan_eos(&mut self) -> bool {
        self.with_shared(|state| state.scan_eos())
    }

    pub fn grammar_warnings(&mut self) -> Vec<String> {
        self.with_shared(|state| state.lexer_spec().render_warnings())
    }

    pub(crate) fn apply_forced(&mut self, byte_idx: usize) {
        self.state.byte_to_token_idx.resize(byte_idx, 0);
    }

    pub(crate) fn additional_backtrack(&mut self, n_bytes: usize) {
        let new_len = self.state.byte_to_token_idx.len().saturating_sub(n_bytes);
        self.state.byte_to_token_idx.truncate(new_len);
    }

    pub fn apply_token(&mut self, tok_bytes: &[u8], tok_id: TokenId) -> Result<usize> {
        let r = self.with_shared(|state| state.apply_token(tok_bytes, tok_id));
        self.state.token_idx += 1;
        r
    }

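    // Temporarily move the shared lexer box into self.state so that
    // ParserState methods can use it directly, then move it back and make sure
    // the lexer is still present.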
    fn with_shared<T>(&mut self, f: impl FnOnce(&mut ParserState) -> T) -> T {
        let mut shared = self.shared.lock().unwrap();
        self.state.shared_box = std::mem::take(&mut *shared);
        let r = f(&mut self.state);
        *shared = std::mem::take(&mut self.state.shared_box);
        assert!(shared.lexer_opt.is_some());
        r
    }

    pub fn rollback(&mut self, n_bytes: usize) -> Result<()> {
        self.state.lexer_spec().check_rollback()?;
        self.with_shared(|state| state.rollback(n_bytes))
    }

    pub fn validate_tokens(&mut self, tokens: &[TokenId]) -> usize {
        self.with_shared(|state| {
            let r = state.validate_tokens(tokens);
            debug!(
                "validate_tokens: {} -> {}/{}",
                state.tok_env.tok_trie().tokens_dbg(tokens),
                r,
                tokens.len()
            );
            r
        })
    }

    pub fn log_row_infos(&mut self, label: &str) {
        if cfg!(feature = "logging") && DEBUG {
            self.with_shared(|state| {
                debug!(
                    "row infos {}: token_idx: {}; applied bytes: {}/{}",
                    label,
                    state.token_idx,
                    state.byte_to_token_idx.len(),
                    state.bytes.len()
                );
                for infos in state.row_infos.iter() {
                    debug!(" {}", infos.dbg(state.lexer()));
                }
            })
        }
    }

    pub fn is_accepting(&mut self) -> bool {
        self.with_shared(|state| state.is_accepting())
    }

    pub fn currently_forced_bytes(&self) -> &[u8] {
        &self.state.bytes[self.state.byte_to_token_idx.len()..]
    }

    pub fn has_pending_lexeme_bytes(&self) -> bool {
        self.state.has_pending_lexeme_bytes()
    }

    pub fn grammar(&self) -> &CGrammar {
        &self.state.grammar
    }

    pub fn can_advance(&self) -> bool {
        self.state.can_advance()
    }

    pub fn temperature(&self) -> Option<f32> {
        self.state.temperature()
    }

    pub fn deep_clone(&self) -> Self {
        let mut copy = self.clone();
        let shared = self.shared.lock().unwrap();
        copy.shared = Arc::new(Mutex::new(shared.clone()));
        copy
    }

    pub fn test_trigger_lexer_error(&mut self) -> Result<()> {
        self.with_shared(|_state| {
            panic!("synthetic error");
        })
    }
}