1use anyhow::{bail, Result};
8use derivre::raw::{DerivCache, ExprSet, NextByteCache, RelevanceCache, VecHashCons};
9use serde::{Deserialize, Serialize};
10use std::fmt::{Debug, Display};
11use toktrie::SimpleVob;
12
13pub use derivre::{AlphabetInfo, ExprRef, NextByte, StateID};
14
15use crate::api::ParserLimits;
16
17use super::lexerspec::LexemeIdx;
18
/// Aggregate statistics about a lexer's regex set and its lazily-built DFA.
///
/// Produced by `RegexVec::stats()`; rendered in one line via `Display`.
#[derive(Clone, Serialize, Deserialize, Default)]
pub struct LexerStats {
    /// Number of lexeme regexes in the lexer.
    pub num_regexps: usize,
    /// AST nodes present right after construction (before any derivatives).
    pub num_ast_nodes: usize,
    /// Expression nodes created later (total minus initial AST nodes).
    pub num_derived: usize,
    /// Number of derivative computations performed.
    pub num_derivatives: usize,
    /// Total expression-construction cost ("fuel") spent so far.
    pub total_fuel_spent: usize,
    /// Number of DFA states materialized.
    pub num_states: usize,
    /// Number of DFA transitions computed (cache misses).
    pub num_transitions: usize,
    /// Approximate memory usage of the lexer, in bytes.
    pub num_bytes: usize,
    /// Size of the compressed byte-class alphabet.
    pub alphabet_size: usize,
    /// True if the lexer entered an error state (fuel/state limits exceeded).
    pub error: bool,
}
32
/// Compact set of lexeme indices that match (accept) in a given DFA state.
///
/// Inline-stores the common cases of zero, one, or two matches and only
/// heap-allocates (`Many`) for three or more.
#[derive(Clone)]
pub enum MatchingLexemes {
    None,
    One(LexemeIdx),
    Two([LexemeIdx; 2]),
    Many(Vec<LexemeIdx>),
}
40
41impl MatchingLexemes {
42 pub fn is_some(&self) -> bool {
43 !matches!(self, MatchingLexemes::None)
44 }
45
46 pub fn is_none(&self) -> bool {
47 !self.is_some()
48 }
49
50 pub fn first(&self) -> Option<LexemeIdx> {
51 match self {
52 MatchingLexemes::None => None,
53 MatchingLexemes::One(idx) => Some(*idx),
54 MatchingLexemes::Two([idx, _]) => Some(*idx),
55 MatchingLexemes::Many(v) => v.first().copied(),
56 }
57 }
58
59 pub fn contains(&self, idx: LexemeIdx) -> bool {
60 match self {
61 MatchingLexemes::None => false,
62 MatchingLexemes::One(idx2) => *idx2 == idx,
63 MatchingLexemes::Two([idx1, idx2]) => *idx1 == idx || *idx2 == idx,
64 MatchingLexemes::Many(v) => v.contains(&idx),
65 }
66 }
67
68 pub fn add(&mut self, idx: LexemeIdx) {
69 match self {
70 MatchingLexemes::None => *self = MatchingLexemes::One(idx),
71 MatchingLexemes::One(idx2) => {
72 *self = MatchingLexemes::Two([*idx2, idx]);
73 }
74 MatchingLexemes::Two([idx1, idx2]) => {
75 *self = MatchingLexemes::Many(vec![*idx1, *idx2, idx]);
76 }
77 MatchingLexemes::Many(v) => {
78 v.push(idx);
79 }
80 }
81 }
82
83 pub fn len(&self) -> usize {
84 match self {
85 MatchingLexemes::None => 0,
86 MatchingLexemes::One(_) => 1,
87 MatchingLexemes::Two(_) => 2,
88 MatchingLexemes::Many(v) => v.len(),
89 }
90 }
91
92 pub fn is_empty(&self) -> bool {
93 self.len() == 0
94 }
95
96 pub fn as_slice(&self) -> &[LexemeIdx] {
97 match self {
98 MatchingLexemes::None => &[],
99 MatchingLexemes::One(idx) => std::slice::from_ref(idx),
100 MatchingLexemes::Two(v) => v,
101 MatchingLexemes::Many(v) => v.as_slice(),
102 }
103 }
104}
105
106impl Debug for MatchingLexemes {
107 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
108 match self {
109 MatchingLexemes::None => write!(f, "Lex:[]"),
110 MatchingLexemes::One(idx) => write!(f, "Lex:[{}]", idx.as_usize()),
111 MatchingLexemes::Two([idx1, idx2]) => {
112 write!(f, "Lex:[{},{}]", idx1.as_usize(), idx2.as_usize())
113 }
114 MatchingLexemes::Many(v) => write!(
115 f,
116 "Lex:[{}]",
117 v.iter()
118 .map(|idx| idx.as_usize().to_string())
119 .collect::<Vec<_>>()
120 .join(",")
121 ),
122 }
123 }
124}
125
126impl Display for LexerStats {
127 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
128 write!(
129 f,
130 "regexps: {} with {} nodes (+ {} derived via {} derivatives with total fuel {}), states: {}; transitions: {}; bytes: {}; alphabet size: {} {}",
131 self.num_regexps,
132 self.num_ast_nodes,
133 self.num_derived,
134 self.num_derivatives,
135 self.total_fuel_spent,
136 self.num_states,
137 self.num_transitions,
138 self.num_bytes,
139 self.alphabet_size,
140 if self.error { "ERROR" } else { "" }
141 )
142 }
143}
144
145impl Debug for LexerStats {
146 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
147 Display::fmt(self, f)
148 }
149}
150
/// A set of lexeme indices, backed by a fixed-capacity bit-vector.
#[derive(Clone, Debug)]
pub struct LexemeSet {
    // One bit per lexeme index; capacity fixed at construction time.
    vob: SimpleVob,
}
155
156impl LexemeSet {
157 pub fn new(size: usize) -> Self {
158 LexemeSet {
159 vob: SimpleVob::alloc(size),
160 }
161 }
162
163 pub fn len(&self) -> usize {
164 self.vob.len()
165 }
166
167 pub fn from_vob(vob: &SimpleVob) -> Self {
168 LexemeSet { vob: vob.clone() }
169 }
170
171 pub fn is_empty(&self) -> bool {
172 self.vob.is_zero()
173 }
174
175 #[inline(always)]
176 pub fn iter(&self) -> impl Iterator<Item = LexemeIdx> + '_ {
177 self.vob.iter().map(|e| LexemeIdx::new(e as usize))
178 }
179
180 pub fn add(&mut self, idx: LexemeIdx) {
181 self.vob.set(idx.as_usize(), true);
182 }
183
184 pub fn remove(&mut self, idx: LexemeIdx) {
185 self.vob.set(idx.as_usize(), false);
186 }
187
188 pub fn first(&self) -> Option<LexemeIdx> {
189 self.vob.first_bit_set().map(LexemeIdx::new)
190 }
191
192 pub fn contains(&self, idx: LexemeIdx) -> bool {
193 self.vob.get(idx.as_usize())
194 }
195
196 pub fn clear(&mut self) {
197 self.vob.set_all(false);
198 }
199}
200
/// A lazily-constructed DFA over a vector of lexeme regexes.
///
/// States are hash-consed lists of (lexeme index, derivative expression)
/// pairs; transitions are computed on demand via regex derivatives and
/// cached in a flat `state_table` indexed by (state, byte-class).
#[derive(Clone)]
pub struct RegexVec {
    // Shared expression arena for all regexes and their derivatives.
    exprs: ExprSet,
    // Cache of byte-derivative computations.
    deriv: DerivCache,
    // Cache for "which byte(s) can come next" queries.
    next_byte: NextByteCache,
    // Cache for relevance (non-emptiness) and containment queries.
    relevance: RelevanceCache,
    // Byte-class alphabet compression computed from all regexes.
    alpha: AlphabetInfo,
    #[allow(dead_code)]
    rx_lexemes: Vec<RxLexeme>,
    // Lexemes with lazy (shortest-match) semantics.
    lazy: LexemeSet,
    // Non-lazy lexemes eligible for subsumption checks (have a repeat).
    subsumable: LexemeSet,
    // Top-level expression for each lexeme, indexed by LexemeIdx;
    // ExprRef::NO_MATCH for lexemes found irrelevant at construction.
    rx_list: Vec<ExprRef>,
    // Regex of the special-token lexeme, if any (after alphabet remapping).
    special_token_rx: Option<ExprRef>,
    // Hash-cons store of per-state (lexeme, expr) pair lists; StateID keys.
    rx_sets: VecHashCons,
    // Flat transition table: states * alphabet size; MISSING = not computed.
    state_table: Vec<StateID>,
    // Per-state cached metadata, indexed by StateID::as_usize().
    state_descs: Vec<StateDesc>,
    // Number of transitions actually computed (cache misses).
    num_transitions: usize,
    // Expression count right after construction (for stats).
    num_ast_nodes: usize,
    // Error limit: entering more states than this triggers the error state.
    max_states: usize,
    // Remaining expression-construction budget; 0 triggers the error state.
    fuel: u64,
}
222
/// Cached metadata about a single DFA state.
#[derive(Clone, Debug)]
pub struct StateDesc {
    /// The state this descriptor belongs to.
    pub state: StateID,
    /// Lexemes whose expression is nullable in this state (greedy matches).
    pub greedy_accepting: MatchingLexemes,
    /// All lexemes still alive (possibly matching) in this state.
    pub possible: LexemeSet,

    /// Lexemes that should terminate the match right now (lazy lexemes, or
    /// forced-EOI cases) — see `lowest_match_inner` for how this is filled.
    pub lazy_accepting: MatchingLexemes,
    /// Lookahead length of the first lazy match, used as hidden suffix length.
    pub lazy_hidden_len: u32,

    /// True when this state accepts via the special-token regex.
    pub has_special_token: bool,

    // Memoized max lookahead over the state's expressions (None = not computed).
    possible_lookahead_len: Option<usize>,
    // Memoized lookahead of first nullable expr; outer None = not computed.
    lookahead_len: Option<Option<usize>>,
    // Memoized next-byte summary for this state (None = not computed).
    next_byte: Option<NextByte>,
}
241
impl RegexVec {
    /// The compressed byte-class alphabet shared by all regexes.
    pub fn alpha(&self) -> &AlphabetInfo {
        &self.alpha
    }

    /// Set of lexemes with lazy (shortest-match) semantics.
    pub fn lazy_regexes(&self) -> &LexemeSet {
        &self.lazy
    }

    /// Build (or look up) the DFA start state for the given subset of
    /// lexemes. Lexemes whose regex was reduced to `NO_MATCH` (found
    /// irrelevant at construction) are skipped.
    pub fn initial_state(&mut self, selected: &LexemeSet) -> StateID {
        let mut vec_desc = vec![];
        for idx in selected.iter() {
            let rx = self.get_rx(idx);
            if rx != ExprRef::NO_MATCH {
                Self::push_rx(&mut vec_desc, idx, rx);
            }
        }
        self.insert_state(vec_desc)
    }

    /// Cached descriptor for `state`; the state must already exist.
    #[inline(always)]
    pub fn state_desc(&self, state: StateID) -> &StateDesc {
        &self.state_descs[state.as_usize()]
    }

    /// Maximum possible lookahead length over all expressions alive in
    /// `state`. Computed on first use and memoized in the descriptor.
    pub fn possible_lookahead_len(&mut self, state: StateID) -> usize {
        let desc = &mut self.state_descs[state.as_usize()];
        if let Some(len) = desc.possible_lookahead_len {
            return len;
        }
        let mut max_len = 0;
        for (_, e) in iter_state(&self.rx_sets, state) {
            max_len = max_len.max(self.exprs.possible_lookahead_len(e));
        }
        desc.possible_lookahead_len = Some(max_len);
        max_len
    }

    /// Lookahead length of the first nullable (accepting) expression in
    /// `state`, or `None` when the state is not greedy-accepting.
    /// Memoized in the descriptor (outer Option = "computed yet?").
    pub fn lookahead_len_for_state(&mut self, state: StateID) -> Option<usize> {
        let desc = &mut self.state_descs[state.as_usize()];
        if desc.greedy_accepting.is_none() {
            return None;
        }
        if let Some(len) = desc.lookahead_len {
            return len;
        }
        let mut res = None;
        let exprs = &self.exprs;
        for (idx2, e) in iter_state(&self.rx_sets, state) {
            // Only the first nullable expression contributes the lookahead.
            if res.is_none() && exprs.is_nullable(e) {
                assert!(desc.greedy_accepting.contains(idx2));
                res = Some(exprs.lookahead_len(e).unwrap_or(0));
            }
        }
        desc.lookahead_len = Some(res);
        res
    }

    /// Advance the DFA by one byte. Fast path hits the flat transition
    /// table; a MISSING entry falls through to derivative computation.
    #[inline(always)]
    pub fn transition(&mut self, state: StateID, b: u8) -> StateID {
        let idx = self.alpha.map_state(state, b);
        let new_state = self.state_table[idx];
        if new_state != StateID::MISSING {
            new_state
        } else {
            self.transition_inner(state, b, idx)
        }
    }

    /// Whether subsumption checking makes sense in `state`: it must be
    /// live, error-free, and contain no lazy lexemes.
    pub fn subsume_possible(&mut self, state: StateID) -> bool {
        if state.is_dead() || self.has_error() {
            return false;
        }
        for (idx, _) in iter_state(&self.rx_sets, state) {
            if self.lazy.contains(idx) {
                return false;
            }
        }
        true
    }

    /// Check whether the regex of `lexeme_idx` is contained in the prefix
    /// language of any subsumable expression alive in `state`, spending at
    /// most `budget` units of expression-construction cost overall.
    pub fn check_subsume(
        &mut self,
        state: StateID,
        lexeme_idx: LexemeIdx,
        mut budget: u64,
    ) -> Result<bool> {
        let budget0 = budget;
        assert!(self.subsume_possible(state));
        let small = self.get_rx(lexeme_idx);
        let mut res = false;
        for (idx, e) in iter_state(&self.rx_sets, state) {
            if !self.subsumable.contains(idx) {
                continue;
            }
            let c0 = self.exprs.cost();
            // Only cache negative results while more than half the budget
            // remains — presumably to avoid caching answers cut short by
            // budget exhaustion; TODO confirm against RelevanceCache docs.
            let cache_failures = budget > budget0 / 2;
            let is_contained = self
                .relevance
                .is_contained_in_prefixes(
                    &mut self.exprs,
                    &mut self.deriv,
                    small,
                    e,
                    budget,
                    cache_failures,
                )
                .unwrap_or(false);
            if is_contained {
                res = true;
                break;
            }
            // Charge this check's expression-construction cost to the budget.
            let cost = self.exprs.cost() - c0;
            budget = budget.saturating_sub(cost);
        }
        Ok(res)
    }

    /// Rough estimate of the lexer's memory footprint in bytes.
    pub fn num_bytes(&self) -> usize {
        self.exprs.num_bytes()
            + self.deriv.num_bytes()
            + self.next_byte.num_bytes()
            + self.relevance.num_bytes()
            // ~100 bytes is a flat per-descriptor estimate.
            + self.state_descs.len() * 100
            + self.state_table.len() * std::mem::size_of::<StateID>()
            + self.rx_sets.num_bytes()
    }

    /// Fill in `desc.lazy_accepting` / `lazy_hidden_len` / `has_special_token`
    /// for a freshly computed state:
    /// - a nullable special-token regex wins immediately;
    /// - otherwise nullable lazy lexemes accept right away (shortest match),
    ///   recording the first one's lookahead as the hidden suffix length;
    /// - otherwise, if every expression in the state is nullable and forces
    ///   EOI next, those lexemes accept (nothing more can be consumed).
    fn lowest_match_inner(&mut self, desc: &mut StateDesc) {
        // True while every expression seen so far is nullable and forced-EOI.
        let mut all_eoi = true;

        let mut eois = MatchingLexemes::None;

        let mut lazies = MatchingLexemes::None;

        let mut hidden_len = 0;

        for (idx, e) in iter_state(&self.rx_sets, desc.state) {
            if !self.exprs.is_nullable(e) {
                // Not accepting here; it also breaks the all-EOI property.
                all_eoi = false;
                continue;
            } else if Some(self.get_rx(idx)) == self.special_token_rx {
                // Special token match takes precedence over everything else.
                desc.lazy_accepting = MatchingLexemes::One(idx);
                desc.has_special_token = true;
                return;
            }

            if self.lazy.contains(idx) {
                if lazies.is_none() {
                    // First lazy match: record its lookahead as hidden length.
                    all_eoi = false;
                    hidden_len = self.exprs.possible_lookahead_len(e) as u32;
                }
                lazies.add(idx);
                continue;
            }

            if all_eoi {
                if self.next_byte.next_byte(&self.exprs, e) == NextByte::ForcedEOI {
                    eois.add(idx);
                } else {
                    all_eoi = false;
                }
            }
        }

        if lazies.is_some() {
            desc.lazy_accepting = lazies;
            desc.lazy_hidden_len = hidden_len;
        } else if all_eoi {
            desc.lazy_accepting = eois;
        }
    }

    /// Union of next-byte summaries over all expressions in `state`,
    /// memoized in the descriptor. Stops early once the summary can no
    /// longer be narrowed to specific bytes.
    pub fn next_byte(&mut self, state: StateID) -> NextByte {
        let desc = &mut self.state_descs[state.as_usize()];
        if let Some(next_byte) = desc.next_byte {
            return next_byte;
        }

        let mut next_byte = NextByte::Dead;
        for (_, e) in iter_state(&self.rx_sets, state) {
            next_byte = next_byte | self.next_byte.next_byte(&self.exprs, e);
            if next_byte.is_some_bytes() {
                break;
            }
        }

        desc.next_byte = Some(next_byte);
        next_byte
    }

    /// Project `state` onto `allowed_lexemes`, returning the (possibly new)
    /// state containing only the allowed subset.
    pub fn limit_state_to(&mut self, state: StateID, allowed_lexemes: &LexemeSet) -> StateID {
        let mut vec_desc = vec![];
        for (idx, e) in iter_state(&self.rx_sets, state) {
            if allowed_lexemes.contains(idx) {
                Self::push_rx(&mut vec_desc, idx, e);
            }
        }
        self.insert_state(vec_desc)
    }

    /// Total expression-construction cost spent since creation.
    pub fn total_fuel_spent(&self) -> u64 {
        self.exprs.cost()
    }

    /// Weight attribute of the given lexeme's top-level expression.
    pub fn lexeme_weight(&mut self, lexeme_idx: LexemeIdx) -> u32 {
        let e = self.rx_list[lexeme_idx.as_usize()];
        self.exprs.get_weight(e)
    }

    /// Set the state-count limit; ignored once an error occurred (so the
    /// error condition in get_error() stays diagnosable).
    pub fn set_max_states(&mut self, max_states: usize) {
        if !self.has_error() {
            self.max_states = max_states;
        }
    }

    /// Set the remaining fuel; ignored once an error occurred.
    pub fn set_fuel(&mut self, fuel: u64) {
        if !self.has_error() {
            self.fuel = fuel;
        }
    }

    /// Remaining expression-construction budget.
    pub fn get_fuel(&self) -> u64 {
        self.fuel
    }

    /// True once the lexer entered the error state (fuel or state limit).
    pub fn has_error(&self) -> bool {
        self.alpha.has_error()
    }

    /// Human-readable reason for the error state, if any.
    pub fn get_error(&self) -> Option<String> {
        if self.has_error() {
            if self.fuel == 0 {
                Some("too many expressions constructed".to_string())
            } else if self.state_descs.len() >= self.max_states {
                Some(format!(
                    "too many states: {} >= {}",
                    self.state_descs.len(),
                    self.max_states
                ))
            } else {
                Some("unknown error".to_string())
            }
        } else {
            None
        }
    }

    /// Snapshot of current statistics (see `LexerStats`).
    pub fn stats(&self) -> LexerStats {
        LexerStats {
            num_regexps: self.rx_list.len(),
            num_ast_nodes: self.num_ast_nodes,
            num_derived: self.exprs.len() - self.num_ast_nodes,
            num_derivatives: self.deriv.num_deriv,
            total_fuel_spent: self.total_fuel_spent() as usize,
            num_states: self.state_descs.len(),
            num_transitions: self.num_transitions,
            num_bytes: self.num_bytes(),
            alphabet_size: self.alpha.len(),
            error: self.has_error(),
        }
    }

    /// Debug helper: dump the whole transition table to stdout.
    pub fn print_state_table(&self) {
        for (state, row) in self.state_table.chunks(self.alpha.len()).enumerate() {
            println!("state: {}", state);
            for (b, &new_state) in row.iter().enumerate() {
                println!(" s{:?} -> {:?}", b, new_state);
            }
        }
    }
}
563
/// Per-lexeme specification handed to `RegexVec::new_with_exprset`.
#[derive(Clone, Debug)]
pub(crate) struct RxLexeme {
    /// Top-level regex; remapped to the compressed alphabet at construction.
    pub rx: ExprRef,
    /// Use lazy (shortest-match) semantics for this lexeme.
    pub lazy: bool,
    #[allow(dead_code)]
    pub priority: i32,
}
571
impl RegexVec {
    /// Build a `RegexVec` from a pre-built expression set and lexeme list.
    ///
    /// Remaps all regexes onto the compressed byte-class alphabet, drops
    /// provably-empty (irrelevant) lexemes, classifies lexemes as lazy or
    /// subsumable, and bootstraps the initial DFA state tables. Consumes
    /// `limits.initial_lexer_fuel` while checking relevance; fails if that
    /// fuel runs out.
    pub(crate) fn new_with_exprset(
        exprset: ExprSet,
        mut rx_lexemes: Vec<RxLexeme>,
        special_token_rx: Option<ExprRef>,
        limits: &mut ParserLimits,
    ) -> Result<Self> {
        // Remember the position of the special-token regex: alphabet
        // compression below rewrites every ExprRef, so the old reference
        // would be stale.
        let spec_pos = if let Some(rx) = special_token_rx {
            rx_lexemes.iter().position(|r| r.rx == rx)
        } else {
            None
        };
        let (alpha, mut exprset, mut rx_list) = AlphabetInfo::from_exprset(
            exprset,
            &rx_lexemes.iter().map(|r| r.rx).collect::<Vec<_>>(),
        );
        let num_ast_nodes = exprset.len();
        let special_token_rx = spec_pos.map(|pos| rx_list[pos]);

        // Keep rx_lexemes in sync with the remapped references.
        for idx in 0..rx_lexemes.len() {
            rx_lexemes[idx].rx = rx_list[idx];
        }

        let fuel0 = limits.initial_lexer_fuel;
        let mut relevance = RelevanceCache::new();
        for elt in rx_list.iter_mut() {
            let c0 = exprset.cost();
            match relevance.is_non_empty_limited(&mut exprset, *elt, limits.initial_lexer_fuel) {
                Ok(true) => {}
                Ok(false) => {
                    // Provably matches nothing — mark as irrelevant.
                    *elt = ExprRef::NO_MATCH;
                }
                Err(_) => {
                    bail!(
                        "fuel exhausted when checking relevance of lexemes ({})",
                        fuel0
                    );
                }
            }
            // Charge the cost of this relevance check to the initial fuel.
            limits.initial_lexer_fuel = limits
                .initial_lexer_fuel
                .saturating_sub(exprset.cost() - c0);
        }

        // Classify each lexeme: lazy ones match shortest; non-lazy ones
        // with a repeat construct are eligible for subsumption checks.
        let mut lazy = LexemeSet::new(rx_lexemes.len());
        let mut subsumable = LexemeSet::new(rx_lexemes.len());
        for (idx, r) in rx_lexemes.iter().enumerate() {
            if r.lazy {
                lazy.add(LexemeIdx::new(idx));
            } else if exprset.attr_has_repeat(r.rx) {
                subsumable.add(LexemeIdx::new(idx));
            }
        }

        let rx_sets = StateID::new_hash_cons();
        let mut r = RegexVec {
            deriv: DerivCache::new(),
            next_byte: NextByteCache::new(),
            special_token_rx,
            relevance,
            lazy,
            subsumable,
            rx_lexemes,
            exprs: exprset,
            alpha,
            rx_list,
            rx_sets,
            state_table: vec![],
            state_descs: vec![],
            num_transitions: 0,
            num_ast_nodes,
            fuel: u64::MAX,
            max_states: usize::MAX,
        };

        assert!(r.lazy.len() == r.rx_list.len());

        // Bootstrap: the empty state becomes state 0 (DEAD); a second
        // descriptor row is appended as well — presumably reserving a slot
        // for the MISSING sentinel id; TODO confirm against StateID docs.
        r.insert_state(vec![]);
        r.append_state(r.state_descs[0].clone());
        // All transitions of the bootstrap rows lead to the dead state.
        r.state_table.fill(StateID::DEAD);
        assert!(!r.alpha.is_empty());
        Ok(r)
    }

    /// Top-level (possibly NO_MATCH) expression for a lexeme.
    fn get_rx(&self, idx: LexemeIdx) -> ExprRef {
        self.rx_list[idx.as_usize()]
    }

    /// Append a new state: one row of MISSING transitions plus its
    /// descriptor. Hitting `max_states` flips the lexer into error state.
    fn append_state(&mut self, state_desc: StateDesc) {
        let mut new_states = vec![StateID::MISSING; self.alpha.len()];
        self.state_table.append(&mut new_states);
        self.state_descs.push(state_desc);
        if self.state_descs.len() >= self.max_states {
            self.alpha.enter_error_state();
        }
    }

    /// Intern a (lexeme, expr) pair list as a state via hash-consing,
    /// materializing the descriptor if the state is new. Returns the id,
    /// flagged with the lowest-match bit when the state is lazy-accepting.
    fn insert_state(&mut self, lst: Vec<u32>) -> StateID {
        // Flat pair encoding: [idx0, expr0, idx1, expr1, ...].
        assert!(lst.len() % 2 == 0);
        let id = StateID::new(self.rx_sets.insert(&lst));
        if id.as_usize() >= self.state_descs.len() {
            let state_desc = self.compute_state_desc(id);
            self.append_state(state_desc);
        }
        if self.state_desc(id).lazy_accepting.is_some() {
            id._set_lowest_match()
        } else {
            id
        }
    }

    /// Compute the descriptor for a freshly interned state: possible and
    /// greedy-accepting lexeme sets, then the lazy-accept analysis.
    fn compute_state_desc(&mut self, state: StateID) -> StateDesc {
        let mut res = StateDesc {
            state,
            greedy_accepting: MatchingLexemes::None,
            possible: LexemeSet::new(self.rx_list.len()),
            possible_lookahead_len: None,
            lookahead_len: None,
            next_byte: None,
            lazy_accepting: MatchingLexemes::None,
            lazy_hidden_len: 0,
            has_special_token: false,
        };
        for (idx, e) in iter_state(&self.rx_sets, state) {
            res.possible.add(idx);
            if self.exprs.is_nullable(e) {
                res.greedy_accepting.add(idx);
            }
        }

        // Only the empty pair list may produce no possible lexemes.
        if res.possible.is_empty() {
            assert!(state == StateID::DEAD);
        }

        self.lowest_match_inner(&mut res);

        res
    }

    /// Append one (lexeme, expr) pair to a flat state description.
    fn push_rx(vec_desc: &mut Vec<u32>, idx: LexemeIdx, e: ExprRef) {
        vec_desc.push(idx.as_usize() as u32);
        vec_desc.push(e.as_u32());
    }

    /// Slow path of `transition`: take the byte-derivative of every live
    /// expression, drop the ones that become empty, intern the result, and
    /// record it in the transition table at `idx`. All expression
    /// construction is charged against `self.fuel`; running out flips the
    /// lexer into the error state.
    fn transition_inner(&mut self, state: StateID, b: u8, idx: usize) -> StateID {
        assert!(state.is_valid());

        let mut vec_desc = vec![];

        let c0 = self.exprs.cost();
        for (idx, e) in iter_state(&self.rx_sets, state) {
            let d = self.deriv.derivative(&mut self.exprs, e, b);

            // Relevance check runs on whatever fuel remains after the
            // derivatives computed so far in this transition.
            let fuel = self.fuel.saturating_sub(self.exprs.cost() - c0);
            let d = match self
                .relevance
                .is_non_empty_limited(&mut self.exprs, d, fuel)
            {
                Ok(true) => d,
                Ok(false) => ExprRef::NO_MATCH,
                Err(_) => {
                    // Fuel exhausted mid-check: give up on this transition.
                    self.fuel = 0; break;
                }
            };

            if d != ExprRef::NO_MATCH {
                Self::push_rx(&mut vec_desc, idx, d);
            }
        }

        let cost = self.exprs.cost() - c0;
        self.fuel = self.fuel.saturating_sub(cost);
        if self.fuel == 0 {
            self.alpha.enter_error_state();
        }
        let new_state = self.insert_state(vec_desc);
        self.num_transitions += 1;
        self.state_table[idx] = new_state;
        new_state
    }
}
784
785impl Debug for RegexVec {
786 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
787 write!(f, "RegexVec({})", self.stats())
788 }
789}
790
791fn iter_state(
792 rx_sets: &VecHashCons,
793 state: StateID,
794) -> impl Iterator<Item = (LexemeIdx, ExprRef)> + '_ {
795 let lst = rx_sets.get(state.as_u32());
796 (0..lst.len()).step_by(2).map(move |idx| {
797 (
798 LexemeIdx::new(lst[idx] as usize),
799 ExprRef::new(lst[idx + 1]),
800 )
801 })
802}
803
804