1use anyhow::{bail, Result};
8use derivre::raw::{DerivCache, ExprSet, NextByteCache, RelevanceCache, VecHashCons};
9use serde::{Deserialize, Serialize};
10use std::fmt::{Debug, Display};
11use toktrie::SimpleVob;
12
13pub use derivre::{AlphabetInfo, ExprRef, NextByte, StateID};
14
15use crate::api::ParserLimits;
16
17use super::lexerspec::LexemeIdx;
18
/// Aggregate statistics about a lexer's regex set and its lazily-built DFA.
///
/// Produced by `RegexVec::stats()`; rendered in one line via `Display`.
#[derive(Clone, Serialize, Deserialize, Default)]
pub struct LexerStats {
    /// Number of lexeme regexes in the lexer.
    pub num_regexps: usize,
    /// AST nodes present right after construction (before any derivatives).
    pub num_ast_nodes: usize,
    /// Expression nodes created later (total minus initial AST nodes).
    pub num_derived: usize,
    /// Number of derivative computations performed.
    pub num_derivatives: usize,
    /// Total expression-construction cost ("fuel") spent so far.
    pub total_fuel_spent: usize,
    /// Number of DFA states materialized.
    pub num_states: usize,
    /// Number of DFA transitions computed (cache misses).
    pub num_transitions: usize,
    /// Approximate memory usage of the lexer, in bytes.
    pub num_bytes: usize,
    /// Size of the compressed byte-class alphabet.
    pub alphabet_size: usize,
    /// True if the lexer entered an error state (fuel/state limits exceeded).
    pub error: bool,
}
32
/// Compact set of lexeme indices that match (accept) in a given DFA state.
///
/// Inline-stores the common cases of zero, one, or two matches and only
/// heap-allocates (`Many`) for three or more.
#[derive(Clone)]
pub enum MatchingLexemes {
    None,
    One(LexemeIdx),
    Two([LexemeIdx; 2]),
    Many(Vec<LexemeIdx>),
}
40
41impl MatchingLexemes {
42 pub fn is_some(&self) -> bool {
43 !matches!(self, MatchingLexemes::None)
44 }
45
46 pub fn is_none(&self) -> bool {
47 !self.is_some()
48 }
49
50 pub fn first(&self) -> Option<LexemeIdx> {
51 match self {
52 MatchingLexemes::None => None,
53 MatchingLexemes::One(idx) => Some(*idx),
54 MatchingLexemes::Two([idx, _]) => Some(*idx),
55 MatchingLexemes::Many(v) => v.first().copied(),
56 }
57 }
58
59 pub fn contains(&self, idx: LexemeIdx) -> bool {
60 match self {
61 MatchingLexemes::None => false,
62 MatchingLexemes::One(idx2) => *idx2 == idx,
63 MatchingLexemes::Two([idx1, idx2]) => *idx1 == idx || *idx2 == idx,
64 MatchingLexemes::Many(v) => v.contains(&idx),
65 }
66 }
67
68 pub fn add(&mut self, idx: LexemeIdx) {
69 match self {
70 MatchingLexemes::None => *self = MatchingLexemes::One(idx),
71 MatchingLexemes::One(idx2) => {
72 *self = MatchingLexemes::Two([*idx2, idx]);
73 }
74 MatchingLexemes::Two([idx1, idx2]) => {
75 *self = MatchingLexemes::Many(vec![*idx1, *idx2, idx]);
76 }
77 MatchingLexemes::Many(v) => {
78 v.push(idx);
79 }
80 }
81 }
82
83 pub fn len(&self) -> usize {
84 match self {
85 MatchingLexemes::None => 0,
86 MatchingLexemes::One(_) => 1,
87 MatchingLexemes::Two(_) => 2,
88 MatchingLexemes::Many(v) => v.len(),
89 }
90 }
91
92 pub fn is_empty(&self) -> bool {
93 self.len() == 0
94 }
95
96 pub fn as_slice(&self) -> &[LexemeIdx] {
97 match self {
98 MatchingLexemes::None => &[],
99 MatchingLexemes::One(idx) => std::slice::from_ref(idx),
100 MatchingLexemes::Two(v) => v,
101 MatchingLexemes::Many(v) => v.as_slice(),
102 }
103 }
104}
105
106impl Debug for MatchingLexemes {
107 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
108 match self {
109 MatchingLexemes::None => write!(f, "Lex:[]"),
110 MatchingLexemes::One(idx) => write!(f, "Lex:[{}]", idx.as_usize()),
111 MatchingLexemes::Two([idx1, idx2]) => {
112 write!(f, "Lex:[{},{}]", idx1.as_usize(), idx2.as_usize())
113 }
114 MatchingLexemes::Many(v) => write!(
115 f,
116 "Lex:[{}]",
117 v.iter()
118 .map(|idx| idx.as_usize().to_string())
119 .collect::<Vec<_>>()
120 .join(",")
121 ),
122 }
123 }
124}
125
126impl Display for LexerStats {
127 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
128 write!(
129 f,
130 "regexps: {} with {} nodes (+ {} derived via {} derivatives with total fuel {}), states: {}; transitions: {}; bytes: {}; alphabet size: {} {}",
131 self.num_regexps,
132 self.num_ast_nodes,
133 self.num_derived,
134 self.num_derivatives,
135 self.total_fuel_spent,
136 self.num_states,
137 self.num_transitions,
138 self.num_bytes,
139 self.alphabet_size,
140 if self.error { "ERROR" } else { "" }
141 )
142 }
143}
144
145impl Debug for LexerStats {
146 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
147 Display::fmt(self, f)
148 }
149}
150
/// A set of lexeme indices, backed by a fixed-capacity bit-vector.
#[derive(Clone, Debug)]
pub struct LexemeSet {
    // One bit per lexeme index; capacity fixed at construction time.
    vob: SimpleVob,
}
155
156impl LexemeSet {
157 pub fn new(size: usize) -> Self {
158 LexemeSet {
159 vob: SimpleVob::alloc(size),
160 }
161 }
162
163 pub fn len(&self) -> usize {
164 self.vob.len()
165 }
166
167 pub fn from_vob(vob: &SimpleVob) -> Self {
168 LexemeSet { vob: vob.clone() }
169 }
170
171 pub fn is_empty(&self) -> bool {
172 self.vob.is_zero()
173 }
174
175 #[inline(always)]
176 pub fn iter(&self) -> impl Iterator<Item = LexemeIdx> + '_ {
177 self.vob.iter().map(|e| LexemeIdx::new(e as usize))
178 }
179
180 pub fn add(&mut self, idx: LexemeIdx) {
181 self.vob.set(idx.as_usize(), true);
182 }
183
184 pub fn remove(&mut self, idx: LexemeIdx) {
185 self.vob.set(idx.as_usize(), false);
186 }
187
188 pub fn first(&self) -> Option<LexemeIdx> {
189 self.vob.first_bit_set().map(LexemeIdx::new)
190 }
191
192 pub fn contains(&self, idx: LexemeIdx) -> bool {
193 self.vob.get(idx.as_usize())
194 }
195
196 pub fn clear(&mut self) {
197 self.vob.set_all(false);
198 }
199}
200
/// A lazily-constructed DFA over a vector of lexeme regexes.
///
/// States are hash-consed lists of (lexeme index, derivative expression)
/// pairs; transitions are computed on demand via regex derivatives and
/// cached in a flat `state_table` indexed by (state, byte-class).
#[derive(Clone)]
pub struct RegexVec {
    // Shared expression arena for all regexes and their derivatives.
    exprs: ExprSet,
    // Cache of byte-derivative computations.
    deriv: DerivCache,
    // Cache for "which byte(s) can come next" queries.
    next_byte: NextByteCache,
    // Cache for relevance (non-emptiness) and containment queries.
    relevance: RelevanceCache,
    // Byte-class alphabet compression computed from all regexes.
    alpha: AlphabetInfo,
    #[allow(dead_code)]
    rx_lexemes: Vec<RxLexeme>,
    // Lexemes with lazy (shortest-match) semantics.
    lazy: LexemeSet,
    // Non-lazy lexemes eligible for subsumption checks (have a repeat).
    subsumable: LexemeSet,
    // Top-level expression for each lexeme, indexed by LexemeIdx;
    // ExprRef::NO_MATCH for lexemes found irrelevant at construction.
    rx_list: Vec<ExprRef>,
    // Regex of the special-token lexeme, if any (after alphabet remapping).
    special_token_rx: Option<ExprRef>,
    // Hash-cons store of per-state (lexeme, expr) pair lists; StateID keys.
    rx_sets: VecHashCons,
    // Flat transition table: states * alphabet size; MISSING = not computed.
    state_table: Vec<StateID>,
    // Per-state cached metadata, indexed by StateID::as_usize().
    state_descs: Vec<StateDesc>,
    // Number of transitions actually computed (cache misses).
    num_transitions: usize,
    // Expression count right after construction (for stats).
    num_ast_nodes: usize,
    // Error limit: entering more states than this triggers the error state.
    max_states: usize,
    // Remaining expression-construction budget; 0 triggers the error state.
    fuel: u64,
}
222
/// Cached metadata about a single DFA state.
#[derive(Clone, Debug)]
pub struct StateDesc {
    /// The state this descriptor belongs to.
    pub state: StateID,
    /// Lexemes whose expression is nullable in this state (greedy matches).
    pub greedy_accepting: MatchingLexemes,
    /// All lexemes still alive (possibly matching) in this state.
    pub possible: LexemeSet,

    /// Lexemes that should terminate the match right now (lazy lexemes, or
    /// forced-EOI cases) — see `lowest_match_inner` for how this is filled.
    pub lazy_accepting: MatchingLexemes,
    /// Lookahead length of the first lazy match, used as hidden suffix length.
    pub lazy_hidden_len: u32,

    /// True when this state accepts via the special-token regex.
    pub has_special_token: bool,

    // Memoized max lookahead over the state's expressions (None = not computed).
    possible_lookahead_len: Option<usize>,
    // Memoized lookahead of first nullable expr; outer None = not computed.
    lookahead_len: Option<Option<usize>>,
    // Memoized next-byte summary for this state (None = not computed).
    next_byte: Option<NextByte>,
}
241
impl RegexVec {
    /// The compressed byte-class alphabet shared by all regexes.
    pub fn alpha(&self) -> &AlphabetInfo {
        &self.alpha
    }

    /// Set of lexemes with lazy (shortest-match) semantics.
    pub fn lazy_regexes(&self) -> &LexemeSet {
        &self.lazy
    }

    /// Build (or look up) the DFA start state for the given subset of
    /// lexemes. Lexemes whose regex was reduced to `NO_MATCH` (found
    /// irrelevant at construction) are skipped.
    pub fn initial_state(&mut self, selected: &LexemeSet) -> StateID {
        let mut vec_desc = vec![];
        for idx in selected.iter() {
            let rx = self.get_rx(idx);
            if rx != ExprRef::NO_MATCH {
                Self::push_rx(&mut vec_desc, idx, rx);
            }
        }
        self.insert_state(vec_desc)
    }

    /// Cached descriptor for `state`; the state must already exist.
    #[inline(always)]
    pub fn state_desc(&self, state: StateID) -> &StateDesc {
        &self.state_descs[state.as_usize()]
    }

    /// Maximum possible lookahead length over all expressions alive in
    /// `state`. Computed on first use and memoized in the descriptor.
    pub fn possible_lookahead_len(&mut self, state: StateID) -> usize {
        let desc = &mut self.state_descs[state.as_usize()];
        if let Some(len) = desc.possible_lookahead_len {
            return len;
        }
        let mut max_len = 0;
        for (_, e) in iter_state(&self.rx_sets, state) {
            max_len = max_len.max(self.exprs.possible_lookahead_len(e));
        }
        desc.possible_lookahead_len = Some(max_len);
        max_len
    }

    /// Lookahead length of the first nullable (accepting) expression in
    /// `state`, or `None` when the state is not greedy-accepting.
    /// Memoized in the descriptor (outer Option = "computed yet?").
    pub fn lookahead_len_for_state(&mut self, state: StateID) -> Option<usize> {
        let desc = &mut self.state_descs[state.as_usize()];
        if desc.greedy_accepting.is_none() {
            return None;
        }
        if let Some(len) = desc.lookahead_len {
            return len;
        }
        let mut res = None;
        let exprs = &self.exprs;
        for (idx2, e) in iter_state(&self.rx_sets, state) {
            // Only the first nullable expression contributes the lookahead.
            if res.is_none() && exprs.is_nullable(e) {
                assert!(desc.greedy_accepting.contains(idx2));
                res = Some(exprs.lookahead_len(e).unwrap_or(0));
            }
        }
        desc.lookahead_len = Some(res);
        res
    }

    /// Advance the DFA by one byte. Fast path hits the flat transition
    /// table; a MISSING entry falls through to derivative computation.
    #[inline(always)]
    pub fn transition(&mut self, state: StateID, b: u8) -> StateID {
        let idx = self.alpha.map_state(state, b);
        let new_state = self.state_table[idx];
        if new_state != StateID::MISSING {
            new_state
        } else {
            self.transition_inner(state, b, idx)
        }
    }

    /// Whether subsumption checking makes sense in `state`: it must be
    /// live, error-free, and contain no lazy lexemes.
    pub fn subsume_possible(&mut self, state: StateID) -> bool {
        if state.is_dead() || self.has_error() {
            return false;
        }
        for (idx, _) in iter_state(&self.rx_sets, state) {
            if self.lazy.contains(idx) {
                return false;
            }
        }
        true
    }

    /// Check whether the regex of `lexeme_idx` is contained in the prefix
    /// language of any subsumable expression alive in `state`, spending at
    /// most `budget` units of expression-construction cost overall.
    pub fn check_subsume(
        &mut self,
        state: StateID,
        lexeme_idx: LexemeIdx,
        mut budget: u64,
    ) -> Result<bool> {
        let budget0 = budget;
        assert!(self.subsume_possible(state));
        let small = self.get_rx(lexeme_idx);
        let mut res = false;
        for (idx, e) in iter_state(&self.rx_sets, state) {
            if !self.subsumable.contains(idx) {
                continue;
            }
            let c0 = self.exprs.cost();
            // Only cache negative results while more than half the budget
            // remains — presumably to avoid caching answers cut short by
            // budget exhaustion; TODO confirm against RelevanceCache docs.
            let cache_failures = budget > budget0 / 2;
            let is_contained = self
                .relevance
                .is_contained_in_prefixes(
                    &mut self.exprs,
                    &mut self.deriv,
                    small,
                    e,
                    budget,
                    cache_failures,
                )
                .unwrap_or(false);
            if is_contained {
                res = true;
                break;
            }
            // Charge this check's expression-construction cost to the budget.
            let cost = self.exprs.cost() - c0;
            budget = budget.saturating_sub(cost);
        }
        Ok(res)
    }

    /// Rough estimate of the lexer's memory footprint in bytes.
    pub fn num_bytes(&self) -> usize {
        self.exprs.num_bytes()
            + self.deriv.num_bytes()
            + self.next_byte.num_bytes()
            + self.relevance.num_bytes()
            // ~100 bytes is a flat per-descriptor estimate.
            + self.state_descs.len() * 100
            + self.state_table.len() * std::mem::size_of::<StateID>()
            + self.rx_sets.num_bytes()
    }

    /// Fill in `desc.lazy_accepting` / `lazy_hidden_len` / `has_special_token`
    /// for a freshly computed state:
    /// - a nullable special-token regex wins immediately;
    /// - otherwise nullable lazy lexemes accept right away (shortest match),
    ///   recording the first one's lookahead as the hidden suffix length;
    /// - otherwise, if every expression in the state is nullable and forces
    ///   EOI next, those lexemes accept (nothing more can be consumed).
    fn lowest_match_inner(&mut self, desc: &mut StateDesc) {
        // True while every expression seen so far is nullable and forced-EOI.
        let mut all_eoi = true;

        let mut eois = MatchingLexemes::None;

        let mut lazies = MatchingLexemes::None;

        let mut hidden_len = 0;

        for (idx, e) in iter_state(&self.rx_sets, desc.state) {
            if !self.exprs.is_nullable(e) {
                // Not accepting here; it also breaks the all-EOI property.
                all_eoi = false;
                continue;
            } else if Some(self.get_rx(idx)) == self.special_token_rx {
                // Special token match takes precedence over everything else.
                desc.lazy_accepting = MatchingLexemes::One(idx);
                desc.has_special_token = true;
                return;
            }

            if self.lazy.contains(idx) {
                if lazies.is_none() {
                    // First lazy match: record its lookahead as hidden length.
                    all_eoi = false;
                    hidden_len = self.exprs.possible_lookahead_len(e) as u32;
                }
                lazies.add(idx);
                continue;
            }

            if all_eoi {
                if self.next_byte.next_byte(&self.exprs, e) == NextByte::ForcedEOI {
                    eois.add(idx);
                } else {
                    all_eoi = false;
                }
            }
        }

        if lazies.is_some() {
            desc.lazy_accepting = lazies;
            desc.lazy_hidden_len = hidden_len;
        } else if all_eoi {
            desc.lazy_accepting = eois;
        }
    }

    /// Union of next-byte summaries over all expressions in `state`,
    /// memoized in the descriptor. Stops early once the summary can no
    /// longer be narrowed to specific bytes.
    pub fn next_byte(&mut self, state: StateID) -> NextByte {
        let desc = &mut self.state_descs[state.as_usize()];
        if let Some(next_byte) = desc.next_byte {
            return next_byte;
        }

        let mut next_byte = NextByte::Dead;
        for (_, e) in iter_state(&self.rx_sets, state) {
            next_byte = next_byte | self.next_byte.next_byte(&self.exprs, e);
            if next_byte.is_some_bytes() {
                break;
            }
        }

        desc.next_byte = Some(next_byte);
        next_byte
    }

    /// Project `state` onto `allowed_lexemes`, returning the (possibly new)
    /// state containing only the allowed subset.
    pub fn limit_state_to(&mut self, state: StateID, allowed_lexemes: &LexemeSet) -> StateID {
        let mut vec_desc = vec![];
        for (idx, e) in iter_state(&self.rx_sets, state) {
            if allowed_lexemes.contains(idx) {
                Self::push_rx(&mut vec_desc, idx, e);
            }
        }
        self.insert_state(vec_desc)
    }

    /// Total expression-construction cost spent since creation.
    pub fn total_fuel_spent(&self) -> u64 {
        self.exprs.cost()
    }

    /// Weight attribute of the given lexeme's top-level expression.
    pub fn lexeme_weight(&mut self, lexeme_idx: LexemeIdx) -> u32 {
        let e = self.rx_list[lexeme_idx.as_usize()];
        self.exprs.get_weight(e)
    }

    /// Set the state-count limit; ignored once an error occurred (so the
    /// error condition in get_error() stays diagnosable).
    pub fn set_max_states(&mut self, max_states: usize) {
        if !self.has_error() {
            self.max_states = max_states;
        }
    }

    /// Set the remaining fuel; ignored once an error occurred.
    pub fn set_fuel(&mut self, fuel: u64) {
        if !self.has_error() {
            self.fuel = fuel;
        }
    }

    /// Remaining expression-construction budget.
    pub fn get_fuel(&self) -> u64 {
        self.fuel
    }

    /// True once the lexer entered the error state (fuel or state limit).
    pub fn has_error(&self) -> bool {
        self.alpha.has_error()
    }

    /// Human-readable reason for the error state, if any.
    pub fn get_error(&self) -> Option<String> {
        if self.has_error() {
            if self.fuel == 0 {
                Some("too many expressions constructed".to_string())
            } else if self.state_descs.len() >= self.max_states {
                Some(format!(
                    "too many states: {} >= {}",
                    self.state_descs.len(),
                    self.max_states
                ))
            } else {
                Some("unknown error".to_string())
            }
        } else {
            None
        }
    }

    /// Snapshot of current statistics (see `LexerStats`).
    pub fn stats(&self) -> LexerStats {
        LexerStats {
            num_regexps: self.rx_list.len(),
            num_ast_nodes: self.num_ast_nodes,
            num_derived: self.exprs.len() - self.num_ast_nodes,
            num_derivatives: self.deriv.num_deriv,
            total_fuel_spent: self.total_fuel_spent() as usize,
            num_states: self.state_descs.len(),
            num_transitions: self.num_transitions,
            num_bytes: self.num_bytes(),
            alphabet_size: self.alpha.len(),
            error: self.has_error(),
        }
    }

    /// Debug helper: dump the whole transition table to stdout.
    pub fn print_state_table(&self) {
        for (state, row) in self.state_table.chunks(self.alpha.len()).enumerate() {
            println!("state: {}", state);
            for (b, &new_state) in row.iter().enumerate() {
                println!(" s{:?} -> {:?}", b, new_state);
            }
        }
    }
}
563
/// Per-lexeme specification handed to `RegexVec::new_with_exprset`.
#[derive(Clone, Debug)]
pub(crate) struct RxLexeme {
    /// Top-level regex; remapped to the compressed alphabet at construction.
    pub rx: ExprRef,
    /// Use lazy (shortest-match) semantics for this lexeme.
    pub lazy: bool,
    #[allow(dead_code)]
    pub priority: i32,
}
571
impl RegexVec {
    /// Build a `RegexVec` from a pre-built expression set and lexeme list.
    ///
    /// Remaps all regexes onto the compressed byte-class alphabet, drops
    /// provably-empty (irrelevant) lexemes, classifies lexemes as lazy or
    /// subsumable, and bootstraps the initial DFA state tables. Consumes
    /// `limits.initial_lexer_fuel` while checking relevance; fails if that
    /// fuel runs out.
    pub(crate) fn new_with_exprset(
        exprset: ExprSet,
        mut rx_lexemes: Vec<RxLexeme>,
        special_token_rx: Option<ExprRef>,
        limits: &mut ParserLimits,
    ) -> Result<Self> {
        // Remember the position of the special-token regex: alphabet
        // compression below rewrites every ExprRef, so the old reference
        // would be stale.
        let spec_pos = if let Some(rx) = special_token_rx {
            rx_lexemes.iter().position(|r| r.rx == rx)
        } else {
            None
        };
        let (alpha, mut exprset, mut rx_list) = AlphabetInfo::from_exprset(
            exprset,
            &rx_lexemes.iter().map(|r| r.rx).collect::<Vec<_>>(),
        );
        let num_ast_nodes = exprset.len();
        let special_token_rx = spec_pos.map(|pos| rx_list[pos]);

        // Keep rx_lexemes in sync with the remapped references.
        for idx in 0..rx_lexemes.len() {
            rx_lexemes[idx].rx = rx_list[idx];
        }

        let fuel0 = limits.initial_lexer_fuel;
        let mut relevance = RelevanceCache::new();
        for elt in rx_list.iter_mut() {
            let c0 = exprset.cost();
            match relevance.is_non_empty_limited(&mut exprset, *elt, limits.initial_lexer_fuel) {
                Ok(true) => {}
                Ok(false) => {
                    // Provably matches nothing — mark as irrelevant.
                    *elt = ExprRef::NO_MATCH;
                }
                Err(_) => {
                    bail!(
                        "fuel exhausted when checking relevance of lexemes ({})",
                        fuel0
                    );
                }
            }
            // Charge the cost of this relevance check to the initial fuel.
            limits.initial_lexer_fuel = limits
                .initial_lexer_fuel
                .saturating_sub(exprset.cost() - c0);
        }

        // Classify each lexeme: lazy ones match shortest; non-lazy ones
        // with a repeat construct are eligible for subsumption checks.
        let mut lazy = LexemeSet::new(rx_lexemes.len());
        let mut subsumable = LexemeSet::new(rx_lexemes.len());
        for (idx, r) in rx_lexemes.iter().enumerate() {
            if r.lazy {
                lazy.add(LexemeIdx::new(idx));
            } else if exprset.attr_has_repeat(r.rx) {
                subsumable.add(LexemeIdx::new(idx));
            }
        }

        let rx_sets = StateID::new_hash_cons();
        let mut r = RegexVec {
            deriv: DerivCache::new(),
            next_byte: NextByteCache::new(),
            special_token_rx,
            relevance,
            lazy,
            subsumable,
            rx_lexemes,
            exprs: exprset,
            alpha,
            rx_list,
            rx_sets,
            state_table: vec![],
            state_descs: vec![],
            num_transitions: 0,
            num_ast_nodes,
            fuel: u64::MAX,
            max_states: usize::MAX,
        };

        assert!(r.lazy.len() == r.rx_list.len());

        // Bootstrap: the empty state becomes state 0 (DEAD); a second
        // descriptor row is appended as well — presumably reserving a slot
        // for the MISSING sentinel id; TODO confirm against StateID docs.
        r.insert_state(vec![]);
        r.append_state(r.state_descs[0].clone());
        // All transitions of the bootstrap rows lead to the dead state.
        r.state_table.fill(StateID::DEAD);
        assert!(!r.alpha.is_empty());
        Ok(r)
    }

    /// Top-level (possibly NO_MATCH) expression for a lexeme.
    fn get_rx(&self, idx: LexemeIdx) -> ExprRef {
        self.rx_list[idx.as_usize()]
    }

    /// Append a new state: one row of MISSING transitions plus its
    /// descriptor. Hitting `max_states` flips the lexer into error state.
    fn append_state(&mut self, state_desc: StateDesc) {
        let mut new_states = vec![StateID::MISSING; self.alpha.len()];
        self.state_table.append(&mut new_states);
        self.state_descs.push(state_desc);
        if self.state_descs.len() >= self.max_states {
            self.alpha.enter_error_state();
        }
    }

    /// Intern a (lexeme, expr) pair list as a state via hash-consing,
    /// materializing the descriptor if the state is new. Returns the id,
    /// flagged with the lowest-match bit when the state is lazy-accepting.
    fn insert_state(&mut self, lst: Vec<u32>) -> StateID {
        // Flat pair encoding: [idx0, expr0, idx1, expr1, ...].
        assert!(lst.len() % 2 == 0);
        let id = StateID::new(self.rx_sets.insert(&lst));
        if id.as_usize() >= self.state_descs.len() {
            let state_desc = self.compute_state_desc(id);
            self.append_state(state_desc);
        }
        if self.state_desc(id).lazy_accepting.is_some() {
            id._set_lowest_match()
        } else {
            id
        }
    }

    /// Compute the descriptor for a freshly interned state: possible and
    /// greedy-accepting lexeme sets, then the lazy-accept analysis.
    fn compute_state_desc(&mut self, state: StateID) -> StateDesc {
        let mut res = StateDesc {
            state,
            greedy_accepting: MatchingLexemes::None,
            possible: LexemeSet::new(self.rx_list.len()),
            possible_lookahead_len: None,
            lookahead_len: None,
            next_byte: None,
            lazy_accepting: MatchingLexemes::None,
            lazy_hidden_len: 0,
            has_special_token: false,
        };
        for (idx, e) in iter_state(&self.rx_sets, state) {
            res.possible.add(idx);
            if self.exprs.is_nullable(e) {
                res.greedy_accepting.add(idx);
            }
        }

        // Only the empty pair list may produce no possible lexemes.
        if res.possible.is_empty() {
            assert!(state == StateID::DEAD);
        }

        self.lowest_match_inner(&mut res);

        res
    }

    /// Append one (lexeme, expr) pair to a flat state description.
    fn push_rx(vec_desc: &mut Vec<u32>, idx: LexemeIdx, e: ExprRef) {
        vec_desc.push(idx.as_usize() as u32);
        vec_desc.push(e.as_u32());
    }

    /// Slow path of `transition`: take the byte-derivative of every live
    /// expression, drop the ones that become empty, intern the result, and
    /// record it in the transition table at `idx`. All expression
    /// construction is charged against `self.fuel`; running out flips the
    /// lexer into the error state.
    fn transition_inner(&mut self, state: StateID, b: u8, idx: usize) -> StateID {
        assert!(state.is_valid());

        let mut vec_desc = vec![];

        let c0 = self.exprs.cost();
        for (idx, e) in iter_state(&self.rx_sets, state) {
            let d = self.deriv.derivative(&mut self.exprs, e, b);

            // Relevance check runs on whatever fuel remains after the
            // derivatives computed so far in this transition.
            let fuel = self.fuel.saturating_sub(self.exprs.cost() - c0);
            let d = match self
                .relevance
                .is_non_empty_limited(&mut self.exprs, d, fuel)
            {
                Ok(true) => d,
                Ok(false) => ExprRef::NO_MATCH,
                Err(_) => {
                    // Fuel exhausted mid-check: give up on this transition.
                    self.fuel = 0; break;
                }
            };

            if d != ExprRef::NO_MATCH {
                Self::push_rx(&mut vec_desc, idx, d);
            }
        }

        let cost = self.exprs.cost() - c0;
        self.fuel = self.fuel.saturating_sub(cost);
        if self.fuel == 0 {
            self.alpha.enter_error_state();
        }
        let new_state = self.insert_state(vec_desc);
        self.num_transitions += 1;
        self.state_table[idx] = new_state;
        new_state
    }
}
784
785impl Debug for RegexVec {
786 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
787 write!(f, "RegexVec({})", self.stats())
788 }
789}
790
791fn iter_state(
792 rx_sets: &VecHashCons,
793 state: StateID,
794) -> impl Iterator<Item = (LexemeIdx, ExprRef)> + '_ {
795 let lst = rx_sets.get(state.as_u32());
796 (0..lst.len()).step_by(2).map(move |idx| {
797 (
798 LexemeIdx::new(lst[idx] as usize),
799 ExprRef::new(lst[idx + 1]),
800 )
801 })
802}
803
804