// lexgen_util/lib.rs

1#![allow(clippy::should_implement_trait, clippy::type_complexity)]
2
3use std::iter::Peekable;
4use std::str::Chars;
5
6use unicode_width::UnicodeWidthChar;
7
/// An error produced during lexing: the location it occurred at, plus the
/// error kind.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexerError<E> {
    /// Location in the input at which the error occurred.
    pub location: Loc,

    /// What went wrong: lexgen's own `InvalidToken`, or a custom error `E`
    /// returned by a semantic action.
    pub kind: LexerErrorKind<E>,
}
13
/// Kind of a [`LexerError`]: either a "no rule matched" error generated by
/// lexgen itself, or an error value produced by user code in a semantic
/// action.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerErrorKind<E> {
    /// A lexer error, returned by lexgen.
    InvalidToken,

    /// A custom error, returned by a semantic action.
    Custom(E),
}
22
/// A location in an input.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct Loc {
    /// Zero-based line number in the input.
    pub line: u32,

    /// Zero-based column of this location in its line. Columns advance by
    /// character display width (a tab counts as 4), so this is a display
    /// column rather than a character index.
    pub col: u32,

    /// Zero-based UTF-8 byte index of this location in the input.
    pub byte_idx: usize,
}

impl Loc {
    /// The starting location: line 0, column 0, byte 0.
    const ZERO: Loc = Loc {
        byte_idx: 0,
        line: 0,
        col: 0,
    };
}
43
/// **Do not use**
// Possible outcomes of a semantic action
pub enum SemanticActionResult<T> {
    // Semantic action did not return a token, continue with lexing
    Continue,
    // Semantic action returned a token, return it
    Return(T),
}

impl<T> SemanticActionResult<T> {
    /// Map the token of a `Return` value with `f`; `Continue` passes through
    /// unchanged.
    ///
    /// `f` is called at most once, so `FnOnce` is the right bound (a
    /// backward-compatible generalization over `Fn`: it additionally accepts
    /// closures that consume captured values).
    pub fn map_token<F, T1>(self, f: F) -> SemanticActionResult<T1>
    where
        F: FnOnce(T) -> T1,
    {
        match self {
            SemanticActionResult::Continue => SemanticActionResult::Continue,
            SemanticActionResult::Return(t) => SemanticActionResult::Return(f(t)),
        }
    }
}
64
/// Common parts in lexers generated by lexgen.
///
/// **Fields are used by lexgen-generated code and should not be used directly.**
#[derive(Debug, Clone)]
pub struct Lexer<'input, Iter: Iterator<Item = char> + Clone, Token, State, Error, Wrapper> {
    // Current lexer state
    pub __state: usize,

    // Set after end-of-input is handled by a rule, or by default in `Init` rule
    pub __done: bool,

    // Which lexer state to switch to on successful match
    pub __initial_state: usize,

    // User-provided state, accessible to semantic actions via `state()`.
    user_state: State,

    // User-provided input string. Does not change after initialization. Empty
    // when the lexer is constructed from a char iterator instead of a `&str`.
    input: &'input str,

    // Start location of `iter`. We update this as we backtrack and update `iter`.
    iter_loc: Loc,

    // Character iterator. `Peekable` is used in the handler's `peek` method. Note that we can't
    // use byte indices derived from this iterator directly, as the field is re-initialized (from
    // a saved clone) when backtracking; locations are instead tracked in `current_match_end`.
    pub __iter: Peekable<Iter>,

    // Start of the current match
    current_match_start: Loc,

    // End of the current match
    current_match_end: Loc,

    // If we skipped an accepting state, this holds the 4-tuple:
    //
    // - Location of the skipped match's start
    // - Clone of `__iter` taken at the accepting state, used to restore the
    //   input position when backtracking
    // - Semantic action (a function name) for the skipped match
    // - Location of the skipped match's end (exclusive)
    last_match: Option<(
        Loc,
        Peekable<Iter>,
        for<'lexer> fn(&'lexer mut Wrapper) -> SemanticActionResult<Result<Token, Error>>,
        Loc,
    )>,
}
110
111impl<I: Iterator<Item = char> + Clone, T, S: Default, E, W> Lexer<'static, I, T, S, E, W> {
112    pub fn new_from_iter(iter: I) -> Self {
113        Self::new_from_iter_with_state(iter, Default::default())
114    }
115}
116
117impl<I: Iterator<Item = char> + Clone, T, S, E, W> Lexer<'static, I, T, S, E, W> {
118    pub fn new_from_iter_with_state(iter: I, state: S) -> Self {
119        Self {
120            __state: 0,
121            __done: false,
122            __initial_state: 0,
123            user_state: state,
124            input: "",
125            iter_loc: Loc::ZERO,
126            __iter: iter.peekable(),
127            current_match_start: Loc::ZERO,
128            current_match_end: Loc::ZERO,
129            last_match: None,
130        }
131    }
132}
133
134impl<'input, T, S: Default, E, W> Lexer<'input, Chars<'input>, T, S, E, W> {
135    pub fn new(input: &'input str) -> Self {
136        Self::new_with_state(input, Default::default())
137    }
138}
139
140impl<'input, T, S, E, W> Lexer<'input, Chars<'input>, T, S, E, W> {
141    pub fn new_with_state(input: &'input str, state: S) -> Self {
142        Self {
143            __state: 0,
144            __done: false,
145            __initial_state: 0,
146            user_state: state,
147            input,
148            iter_loc: Loc::ZERO,
149            __iter: input.chars().peekable(),
150            current_match_start: Loc::ZERO,
151            current_match_end: Loc::ZERO,
152            last_match: None,
153        }
154    }
155}
156
157impl<'input, I: Iterator<Item = char> + Clone, T, S, E, W> Lexer<'input, I, T, S, E, W> {
158    // Read the next chracter
159    pub fn next(&mut self) -> Option<char> {
160        match self.__iter.next() {
161            None => None,
162            Some(char) => {
163                self.current_match_end.byte_idx += char.len_utf8();
164                if char == '\n' {
165                    self.current_match_end.line += 1;
166                    self.current_match_end.col = 0;
167                } else if char == '\t' {
168                    self.current_match_end.col += 4; // TODO: Make this configurable?
169                } else {
170                    self.current_match_end.col += UnicodeWidthChar::width(char).unwrap_or(1) as u32;
171                }
172                Some(char)
173            }
174        }
175    }
176
177    pub fn peek(&mut self) -> Option<char> {
178        self.__iter.peek().copied()
179    }
180
181    // On success returns semantic action function for the last match
182    pub fn backtrack(
183        &mut self,
184    ) -> Result<for<'lexer> fn(&'lexer mut W) -> SemanticActionResult<Result<T, E>>, LexerError<E>>
185    {
186        match self.last_match.take() {
187            None => {
188                self.__state = 0;
189                Err(LexerError {
190                    location: self.current_match_start,
191                    kind: LexerErrorKind::InvalidToken,
192                })
193            }
194            Some((match_start, iter, semantic_action, match_end)) => {
195                self.__done = false;
196                self.current_match_start = match_start;
197                self.current_match_end = match_end;
198                self.__iter = iter;
199                self.iter_loc = match_end;
200                Ok(semantic_action)
201            }
202        }
203    }
204
205    pub fn reset_accepting_state(&mut self) {
206        self.last_match = None;
207    }
208
209    pub fn set_accepting_state(
210        &mut self,
211        semantic_action_fn: for<'lexer> fn(&'lexer mut W) -> SemanticActionResult<Result<T, E>>,
212    ) {
213        self.last_match = Some((
214            self.current_match_start,
215            self.__iter.clone(),
216            semantic_action_fn,
217            self.current_match_end,
218        ));
219    }
220
221    pub fn reset_match(&mut self) {
222        self.current_match_start = self.current_match_end;
223    }
224
225    pub fn match_(&self) -> &'input str {
226        &self.input[self.current_match_start.byte_idx..self.current_match_end.byte_idx]
227    }
228
229    pub fn match_loc(&self) -> (Loc, Loc) {
230        (self.current_match_start, self.current_match_end)
231    }
232
233    pub fn state(&mut self) -> &mut S {
234        &mut self.user_state
235    }
236}