// lalrpop_util/lexer.rs

1#![doc(hidden)]
2//! The built-in lalrpop lexer
3//!
4//! This is the code for the built in lexer, and is linked by lalrpop generated parsers to provide
5//! lexer support when you don't write a custom lexer.
6//!
7//! Typically you don't want to use APIs from this module directly, they are public to be accessed
8//! by the generated parser.
9use alloc::{fmt, vec::Vec};
10use core::marker::PhantomData;
11
12use crate::ParseError;
13
14use regex_automata::hybrid::dfa::{Cache, DFA};
15use regex_automata::hybrid::{BuildError, LazyStateID};
16use regex_automata::nfa::thompson::Config as NfaConfig;
17use regex_automata::util::syntax::Config as SyntaxConfig;
18use regex_automata::{Anchored, Input, MatchKind};
19
/// A lexed token: the index of the pattern that matched, paired with the
/// matched slice of the original input.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct Token<'input>(pub usize, pub &'input str);

impl fmt::Display for Token<'_> {
    /// Renders the token as its underlying text, delegating to `str`'s
    /// `Display` so that width/alignment flags on the formatter still apply.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
        let Token(_, text) = self;
        fmt::Display::fmt(text, f)
    }
}
27
/// Pre-compiled lexer tables, built once and shared by every [`Matcher`]
/// created from it.
pub struct MatcherBuilder {
    // Lazy DFA compiled from all token regexes, one pattern per token kind.
    dfa: DFA,
    // skip_vec[i] is true when pattern i's matches are consumed without
    // producing a token (see the `Iterator` impl on `Matcher`); indexed by
    // the DFA's pattern id.
    skip_vec: Vec<bool>,
}
32
33impl MatcherBuilder {
34    #[allow(clippy::result_large_err)]
35    pub fn new<S>(exprs: impl IntoIterator<Item = (S, bool)>) -> Result<MatcherBuilder, BuildError>
36    where
37        S: AsRef<str>,
38    {
39        let exprs = exprs.into_iter();
40        let mut regex_vec = Vec::with_capacity(exprs.size_hint().0);
41        let mut skip_vec = Vec::with_capacity(exprs.size_hint().0);
42        for (regex, skip) in exprs {
43            regex_vec.push(regex);
44            skip_vec.push(skip);
45        }
46
47        let enable_unicode = cfg!(feature = "unicode");
48        let dfa = DFA::builder()
49            .configure(DFA::config().match_kind(MatchKind::All))
50            .syntax(
51                SyntaxConfig::new()
52                    .unicode(enable_unicode)
53                    .utf8(enable_unicode),
54            )
55            .thompson(NfaConfig::new().utf8(enable_unicode).shrink(true))
56            .build_many(&regex_vec)?;
57
58        Ok(MatcherBuilder { dfa, skip_vec })
59    }
60
61    pub fn matcher<'input, 'builder, E>(
62        &'builder self,
63        text: &'input str,
64    ) -> Matcher<'input, 'builder, E> {
65        let input = Input::new(text).anchored(Anchored::Yes);
66        let mut cache = self.dfa.create_cache();
67        let start = self.dfa.start_state_forward(&mut cache, &input).unwrap();
68        Matcher {
69            text,
70            consumed: 0,
71            cache,
72            start,
73            dfa: &self.dfa,
74            skip_vec: &self.skip_vec,
75            _marker: PhantomData,
76        }
77    }
78}
79
/// Token iterator over an input string; yields `(start, Token, end)` triples
/// with byte offsets into the original input.
pub struct Matcher<'input, 'builder, E> {
    // The not-yet-consumed tail of the input.
    text: &'input str,
    // Byte offset of `text` within the original input string.
    consumed: usize,
    // Per-matcher scratch cache for the lazy DFA.
    cache: Cache,
    // Anchored forward start state, computed once in `MatcherBuilder::matcher`.
    start: LazyStateID,
    dfa: &'builder DFA,
    skip_vec: &'builder [bool],
    // `E` only threads the user error type into `ParseError`; no `E` values
    // are stored. `fn() -> E` keeps the type covariant without owning an `E`.
    _marker: PhantomData<fn() -> E>,
}
89
impl<'input, E> Iterator for Matcher<'input, '_, E> {
    /// `(start_offset, token, end_offset)` on success; `InvalidToken` when no
    /// pattern matches at the current position.
    type Item = Result<(usize, Token<'input>, usize), ParseError<usize, Token<'input>, E>>;

    fn next(&mut self) -> Option<Self::Item> {
        // Loop so that "skip" matches (see `skip_vec`) are consumed without
        // being yielded to the caller.
        loop {
            let text = self.text;
            let start_offset = self.consumed;
            if text.is_empty() {
                // End of input. (The assignment is a no-op: `start_offset`
                // was just read from `self.consumed`.)
                self.consumed = start_offset;
                return None;
            }

            // Anchored forward scan from the precomputed start state,
            // remembering the longest match seen so far.
            let mut match_ = None;
            'search: {
                let mut state = self.start;
                for (i, byte) in text.bytes().enumerate() {
                    // NOTE(review): the unwraps assume state ids and cache are
                    // used consistently, so stepping the lazy DFA cannot fail.
                    state = self.dfa.next_state(&mut self.cache, state, byte).unwrap();
                    if state.is_match() {
                        // The lazy DFA reports matches one transition late, so
                        // a match signalled after feeding byte `i` ends at
                        // offset `i` (exclusive).
                        match_ = Some((state, i));
                    } else if state.is_dead() {
                        // No pattern can match any extension of this prefix;
                        // stop scanning early.
                        break 'search;
                    }
                }
                // Flush the delayed match, if any, that ends exactly at the
                // end of the remaining input.
                state = self.dfa.next_eoi_state(&mut self.cache, state).unwrap();
                if state.is_match() {
                    match_ = Some((state, text.len()));
                }
            }

            let (match_state, longest_match) = match match_ {
                Some(match_) => match_,
                None => {
                    // Nothing matched at this position at all.
                    return Some(Err(ParseError::InvalidToken {
                        location: start_offset,
                    }))
                }
            };
            // With `MatchKind::All`, several patterns may match the same
            // longest prefix; take the highest pattern index. (Presumably the
            // generator orders patterns so that a larger index means higher
            // precedence — verify against the lalrpop codegen.)
            let index = (0..self.dfa.match_len(&self.cache, match_state))
                .map(|n| {
                    self.dfa
                        .match_pattern(&self.cache, match_state, n)
                        .as_usize()
                })
                .max()
                .unwrap();

            // Advance past the matched prefix.
            let result = &text[..longest_match];
            let remaining = &text[longest_match..];
            let end_offset = start_offset + longest_match;
            self.text = remaining;
            self.consumed = end_offset;

            if self.skip_vec[index] {
                if longest_match == 0 {
                    // A zero-length skip match would consume nothing and spin
                    // forever; report it as an invalid token instead.
                    return Some(Err(ParseError::InvalidToken {
                        location: start_offset,
                    }));
                }
                // Skipped token: try again from the new position.
                continue;
            }

            return Some(Ok((start_offset, Token(index, result), end_offset)));
        }
    }
}