rustemo/
lexer.rs

use crate::{context::Context, input::Input, location::Location, log, parser::State};
#[cfg(debug_assertions)]
use colored::*;
use core::fmt::Debug;
use std::marker::PhantomData;

/// The trait implemented by all Rustemo lexers.
///
/// A lexer is stateless; its job is to produce the next token given the
/// current context.
///
/// # Generic types
///
/// - `C` - the parsing context
/// - `S` - the parser state type
/// - `TK` - the token kind type, generated by the parser generator from the
///   grammar. It describes the kinds of tokens the lexer can produce.
///
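/// # Example
///
/// A sketch of driving a lexer by hand; `lexer`, `context`, and `input` are
/// assumed to come from a generated parser, and `TokenKind::Id` is a
/// placeholder for a generated token kind:
///
/// ```rust,ignore
/// // The `bool` in each pair is the finish flag: once a token of such a
/// // kind is recognized, no further expected tokens are tried.
/// let mut tokens = lexer.next_tokens(&mut context, input, vec![(TokenKind::Id, false)]);
/// if let Some(token) = tokens.next() {
///     println!("Recognized {token:?}");
/// }
/// ```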
pub trait Lexer<'i, C, S, TK>
where
    C: Context<'i, Self::Input, S, TK>,
    S: State,
{
    type Input: Input + ?Sized;

    // TODO: Optimization. Find an ergonomic way to return an iterator from
    // next_tokens without boxing and indirection through a trait object.
    //type TokenIterator: Iterator<Item = Token<'i, Self::Input, TK>>;

    /// Given the current context, yields an iterator over possible tokens
    /// found at the current location. The order and kinds of tokens to look
    /// for, together with their finish flags, are given by the
    /// `expected_tokens` parameter.
    ///
    /// The context is mutable to support lexers that implement whitespace
    /// skipping.
    fn next_tokens(
        &self,
        context: &mut C,
        input: &'i Self::Input,
        expected_tokens: Vec<(TK, bool)>,
    ) -> Box<dyn Iterator<Item = Token<'i, Self::Input, TK>> + 'i>;
}

/// The trait implemented by types used to recognize tokens in string inputs.
/// Used by [`StringLexer`].
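///
/// # Example
///
/// A minimal sketch of a recognizer matching one fixed keyword (hypothetical
/// `KwRecognizer`; generated parsers provide their own recognizers):
///
/// ```rust,ignore
/// struct KwRecognizer(&'static str);
///
/// impl<'i> TokenRecognizer<'i> for KwRecognizer {
///     fn recognize(&self, input: &'i str) -> Option<&'i str> {
///         // Match the keyword at the start of the remaining input.
///         input.starts_with(self.0).then(|| &input[..self.0.len()])
///     }
/// }
/// ```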
pub trait TokenRecognizer<'i> {
    fn recognize(&self, _input: &'i str) -> Option<&'i str> {
        panic!("Recognize is not defined.")
    }
}

/// A lexer that operates over string inputs and uses generated string and
/// regex recognizers provided by the parser table.
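///
/// # Example
///
/// A sketch of constructing the lexer from a static recognizer table
/// (`TOKEN_RECOGNIZERS` stands in for the table emitted by the parser
/// generator):
///
/// ```rust,ignore
/// // `true` enables whitespace skipping between tokens.
/// let lexer = StringLexer::new(true, &TOKEN_RECOGNIZERS);
/// ```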
pub struct StringLexer<C, S, TK, TR: 'static, const TERMINAL_COUNT: usize> {
    skip_ws: bool,
    token_recognizers: &'static [TR; TERMINAL_COUNT],
    phantom: PhantomData<(C, S, TK)>,
}

impl<
        'i,
        C: Context<'i, str, S, TK>,
        S: State,
        TK,
        TR: TokenRecognizer<'i>,
        const TERMINAL_COUNT: usize,
    > StringLexer<C, S, TK, TR, TERMINAL_COUNT>
{
    pub fn new(skip_ws: bool, token_recognizers: &'static [TR; TERMINAL_COUNT]) -> Self {
        Self {
            skip_ws,
            token_recognizers,
            phantom: PhantomData,
        }
    }

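    /// Skips whitespace at the current position, advancing the context
    /// position/location and recording the skipped text as layout.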
    fn skip(input: &'i str, context: &mut C) {
        let skipped_len: usize = input[context.position()..]
            .chars()
            .take_while(|x| x.is_whitespace())
            .map(|c| c.len_utf8())
            .sum();
        if skipped_len > 0 {
            let skipped = &input[context.position()..context.position() + skipped_len];
            log!("\t{} {}", "Skipped ws:".bold().green(), skipped_len);
            context.set_layout_ahead(Some(skipped));
            context.set_position(context.position() + skipped_len);
            context.set_location(skipped.location_after(context.location()));
        } else {
            context.set_layout_ahead(None);
        }
    }
}

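/// An iterator over tokens recognizable at a fixed input position.
/// Recognizers are tried in the given order; after yielding a token whose
/// finish flag is set, iteration stops.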
struct TokenIterator<'i, TR: 'static, TK> {
    input: &'i str,
    position: usize,
    location: Location,
    token_recognizers: Vec<(&'static TR, TK, bool)>,
    index: usize,
    finish: bool,
}

impl<'i, TR, TK> TokenIterator<'i, TR, TK> {
    fn new(
        input: &'i str,
        position: usize,
        location: Location,
        token_recognizers: Vec<(&'static TR, TK, bool)>,
    ) -> Self {
        Self {
            input,
            position,
            location,
            token_recognizers,
            index: 0,
            finish: false,
        }
    }
}

impl<'i, TK, TR> Iterator for TokenIterator<'i, TR, TK>
where
    TR: TokenRecognizer<'i>,
    TK: Copy,
{
    type Item = Token<'i, str, TK>;

    fn next(&mut self) -> Option<Self::Item> {
        // Try the remaining recognizers in order until one matches, unless a
        // previously yielded token had its finish flag set.
        while !self.finish && self.index < self.token_recognizers.len() {
            let (recognizer, token_kind, finish) = &self.token_recognizers[self.index];
            self.index += 1;
            if let Some(recognized) = recognizer.recognize(&self.input[self.position..]) {
                self.finish = *finish;
                return Some(Token {
                    kind: *token_kind,
                    value: recognized,
                    location: recognized.location_span(self.location),
                });
            }
        }
        None
    }
}

impl<'i, C, S, TK, TR, const TERMINAL_COUNT: usize> Lexer<'i, C, S, TK>
    for StringLexer<C, S, TK, TR, TERMINAL_COUNT>
where
    C: Context<'i, str, S, TK>,
    S: State + Into<usize>,
    TK: Debug + Into<usize> + Copy + 'i,
    TR: TokenRecognizer<'i>,
{
    type Input = str;

    fn next_tokens(
        &self,
        context: &mut C,
        input: &'i Self::Input,
        expected_tokens: Vec<(TK, bool)>,
    ) -> Box<dyn Iterator<Item = Token<'i, Self::Input, TK>> + 'i> {
        if self.skip_ws {
            Self::skip(input, context);
        }
        log!("  {} {:?}", "Trying recognizers:".green(), expected_tokens);
        Box::new(TokenIterator::new(
            input,
            context.position(),
            context.location(),
            expected_tokens
                .iter()
                .map(|&tok| (&self.token_recognizers[tok.0.into()], tok.0, tok.1))
                .collect::<Vec<_>>(),
        ))
    }
}

/// Represents a single token from the input stream.
pub struct Token<'i, I: Input + ?Sized, TK> {
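    /// The kind of this token, as defined by the grammar terminals.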
    pub kind: TK,

    /// The part of the input stream that this token represents.
    pub value: &'i I,

    /// Location (with span) in the input file where this token is found.
    pub location: Location,
}

impl<I: Input + ?Sized, TK: Copy> Clone for Token<'_, I, TK> {
    fn clone(&self) -> Self {
        Self {
            kind: self.kind,
            value: self.value,
            location: self.location,
        }
    }
}

impl<I, TK> Debug for Token<'_, I, TK>
where
    I: Input + ?Sized,
    I::Output: Debug,
    TK: Debug,
{
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{:?}({:?} {:?})",
            self.kind,
            // Abbreviate long token values: keep only the leading and
            // trailing 20 input elements and elide the middle.
            if self.value.len() > 50 {
                format!(
                    "{:?}{}{:?}",
                    &self.value.slice(0..20),
                    "..<snip>..",
                    &self.value.slice(self.value.len() - 20..self.value.len())
                )
            } else {
                format!("{:?}", self.value)
            },
            self.location
        )
    }
}