do_not_use_antlr_rust/
lexer.rs

1//! Lexer implementation
2use std::borrow::Cow::Borrowed;
3use std::borrow::{Borrow, Cow};
4use std::cell::{Cell, RefCell};
5
6use std::rc::Rc;
7
8use crate::char_stream::{CharStream, InputData};
9use crate::error_listener::{ConsoleErrorListener, ErrorListener};
10use crate::errors::ANTLRError;
11use crate::int_stream::IntStream;
12use crate::lexer_atn_simulator::{ILexerATNSimulator, LexerATNSimulator};
13use crate::parser::ParserNodeType;
14
15use crate::recognizer::{Actions, Recognizer};
16use crate::rule_context::EmptyContextType;
17use crate::token::TOKEN_INVALID_TYPE;
18use crate::token_factory::{CommonTokenFactory, TokenAware, TokenFactory};
19use crate::token_source::TokenSource;
20use std::ops::{Deref, DerefMut};
21
///  Lexer functionality required by `LexerATNSimulator` to work properly
pub trait Lexer<'input>:
    TokenSource<'input>
    + Recognizer<'input, Node = EmptyContextType<'input, <Self as TokenAware<'input>>::TF>>
{
    /// Concrete input stream used by this parser
    type Input: IntStream;
    /// Same as `TokenStream::get_input_stream` but returns concrete type instance,
    /// important for proper inlining in hot code of `LexerATNSimulator`
    fn input(&mut self) -> &mut Self::Input;
    /// Sets channel where current token will be pushed
    ///
    /// By default two channels are available:
    ///  - `LEXER_DEFAULT_TOKEN_CHANNEL`
    ///  - `LEXER_HIDDEN`
    fn set_channel(&mut self, v: isize);

    /// Pushes current mode to internal mode stack and sets `m` as current lexer mode
    /// `pop_mode` should be used to recover previous mode
    fn push_mode(&mut self, m: usize);

    /// Pops mode from internal mode stack and makes it the current mode;
    /// returns `None` when the mode stack is empty
    fn pop_mode(&mut self) -> Option<usize>;

    /// Sets type of the current token
    /// Called from action to override token that will be emitted by lexer
    fn set_type(&mut self, t: isize);

    /// Sets lexer mode discarding current one
    fn set_mode(&mut self, m: usize);

    /// Tells lexer that it should consider next token as a continuation of the current one
    fn more(&mut self);

    /// Tells lexer to completely ignore and not emit current token.
    fn skip(&mut self);

    #[doc(hidden)]
    fn reset(&mut self);

    #[doc(hidden)]
    fn get_interpreter(&self) -> Option<&LexerATNSimulator>;
}
65
/// **! Usually generated by ANTLR !**
///
/// This trait combines everything that can be used to extend Lexer behavior
pub trait LexerRecog<'a, T: Recognizer<'a>>: Actions<'a, T> + Sized + 'static {
    /// Callback to extend emit behavior; invoked by `BaseLexer` right before
    /// it creates the token in `emit`, so state overrides (type, channel, text)
    /// set here are picked up by the token factory
    fn before_emit(_lexer: &mut T) {}
}
73
/// Default implementation of Lexer
///
/// Public fields in this struct are intended to be used by embedded actions
#[allow(missing_docs)]
pub struct BaseLexer<
    'input,
    T: LexerRecog<'input, Self> + 'static,
    Input: CharStream<TF::From>,
    TF: TokenFactory<'input> = CommonTokenFactory,
> {
    /// `LexerATNSimulator` instance of this lexer
    pub interpreter: Option<Box<LexerATNSimulator>>,
    /// `CharStream` used by this lexer
    pub input: Option<Input>,
    // User-provided actions/predicates; exposed through `Deref`/`DerefMut`.
    recog: T,

    // Factory used to create tokens in `emit`/`emit_eof`.
    factory: &'input TF,

    // Listeners notified from `notify_listeners` on token recognition errors.
    error_listeners: RefCell<Vec<Box<dyn ErrorListener<'input, Self>>>>,

    /// Input index at which the token currently being matched starts
    pub token_start_char_index: isize,
    /// Line on which the token currently being matched starts
    pub token_start_line: isize,
    /// Column at which the token currently being matched starts
    pub token_start_column: isize,
    // Current line/column; shared via `Rc` with the `LexerATNSimulator`.
    current_pos: Rc<LexerPosition>,
    /// Overrides token type emitted by lexer for current token
    pub token_type: isize,
    /// Make it `Some` to override token that is currently being generated by lexer
    pub token: Option<TF::Tok>,
    // Set once lookahead hits EOF; the next `next_token` iteration emits the EOF token.
    hit_eof: bool,
    /// Channel lexer is currently assigning tokens to
    pub channel: isize,
    // Modes saved by `push_mode`, restored by `pop_mode`.
    mode_stack: Vec<usize>,
    /// Mode lexer is currently in
    pub mode: usize,
    /// Make it `Some` to override text for token that is currently being generated by lexer
    pub text: Option<<TF::Data as ToOwned>::Owned>,
}
111
// Current line/column tracked by the lexer. Shared (via `Rc` in
// `BaseLexer::current_pos` and `LexerATNSimulator::current_pos`) so the
// simulator's position updates are visible to the lexer without borrowing.
#[derive(Debug)]
pub(crate) struct LexerPosition {
    pub(crate) line: Cell<isize>,
    pub(crate) char_position_in_line: Cell<isize>,
}
117
118impl<'input, T, Input, TF> Deref for BaseLexer<'input, T, Input, TF>
119where
120    T: LexerRecog<'input, Self> + 'static,
121    Input: CharStream<TF::From>,
122    TF: TokenFactory<'input>,
123{
124    type Target = T;
125
126    fn deref(&self) -> &Self::Target { &self.recog }
127}
128
129impl<'input, T, Input, TF> DerefMut for BaseLexer<'input, T, Input, TF>
130where
131    T: LexerRecog<'input, Self> + 'static,
132    Input: CharStream<TF::From>,
133    TF: TokenFactory<'input>,
134{
135    fn deref_mut(&mut self) -> &mut Self::Target { &mut self.recog }
136}
137
impl<'input, T, Input, TF> Recognizer<'input> for BaseLexer<'input, T, Input, TF>
where
    T: LexerRecog<'input, Self> + 'static,
    Input: CharStream<TF::From>,
    TF: TokenFactory<'input>,
{
    type Node = EmptyContextType<'input, TF>;

    /// Delegates semantic-predicate evaluation to the user/generated `Actions` impl
    fn sempred(
        &mut self,
        _localctx: Option<&<Self::Node as ParserNodeType<'input>>::Type>,
        rule_index: isize,
        action_index: isize,
    ) -> bool {
        <T as Actions<'input, Self>>::sempred(_localctx, rule_index, action_index, self)
    }

    /// Delegates embedded lexer actions to the user/generated `Actions` impl
    fn action(
        &mut self,
        _localctx: Option<&<Self::Node as ParserNodeType<'input>>::Type>,
        rule_index: isize,
        action_index: isize,
    ) {
        <T as Actions<'input, Self>>::action(_localctx, rule_index, action_index, self)
    }
}
164
/// Default lexer mode id
pub const LEXER_DEFAULT_MODE: usize = 0;
/// Special token type to indicate that lexer should continue current token on next iteration
/// see `Lexer::more()`
pub const LEXER_MORE: isize = -2;
/// Special token type to indicate that lexer should not return current token
/// usually used to skip whitespaces and comments
/// see `Lexer::skip()`
pub const LEXER_SKIP: isize = -3;

#[doc(inline)]
pub use super::token::TOKEN_DEFAULT_CHANNEL as LEXER_DEFAULT_TOKEN_CHANNEL;

#[doc(inline)]
pub use super::token::TOKEN_HIDDEN_CHANNEL as LEXER_HIDDEN;

// Bounds of the code point range the lexer accepts (0 ..= U+10FFFF).
pub(crate) const LEXER_MIN_CHAR_VALUE: isize = 0x0000;
pub(crate) const LEXER_MAX_CHAR_VALUE: isize = 0x10FFFF;
183
impl<'input, T, Input, TF> BaseLexer<'input, T, Input, TF>
where
    T: LexerRecog<'input, Self> + 'static,
    Input: CharStream<TF::From>,
    TF: TokenFactory<'input>,
{
    // Stores `token` so the current `next_token` iteration can return it.
    fn emit_token(&mut self, token: TF::Tok) { self.token = Some(token); }

    // Builds the current token from the recorded start position and the
    // type/channel/text overrides, then stores it via `emit_token`.
    // `LexerRecog::before_emit` runs first so actions can tweak state.
    fn emit(&mut self) {
        <T as LexerRecog<Self>>::before_emit(self);
        // token spans [token_start_char_index, current index - 1]
        let stop = self.get_char_index() - 1;
        let token = self.factory.create(
            Some(self.input.as_mut().unwrap()),
            self.token_type,
            self.text.take(),
            self.channel,
            self.token_start_char_index,
            stop,
            self.token_start_line,
            self.token_start_column,
        );
        self.emit_token(token);
    }

    // Emits the special EOF token on the default channel; start > stop,
    // so the token covers no input text.
    fn emit_eof(&mut self) {
        let token = self.factory.create(
            None::<&mut Input>,
            super::int_stream::EOF,
            None,
            LEXER_DEFAULT_TOKEN_CHANNEL,
            self.get_char_index(),
            self.get_char_index() - 1,
            self.get_line(),
            self.get_char_position_in_line(),
        );
        self.emit_token(token)
    }

    /// Current position in input stream
    pub fn get_char_index(&self) -> isize { self.input.as_ref().unwrap().index() }

    /// Current token text
    ///
    /// Returns the override set via `set_text` if present, otherwise the
    /// input slice from the token start up to the current position.
    pub fn get_text<'a>(&'a self) -> Cow<'a, TF::Data>
    where
        'input: 'a,
    {
        self.text
            .as_ref()
            .map(|it| Borrowed(it.borrow()))
            .unwrap_or_else(|| {
                let text = self
                    .input
                    .as_ref()
                    .unwrap()
                    .get_text(self.token_start_char_index, self.get_char_index() - 1);
                TF::get_data(text)
            })
    }

    /// Used from lexer actions to override text of the token that will be emitted next
    pub fn set_text(&mut self, _text: <TF::Data as ToOwned>::Owned) { self.text = Some(_text); }

    /// Add error listener
    pub fn add_error_listener(&mut self, listener: Box<dyn ErrorListener<'input, Self>>) {
        self.error_listeners.borrow_mut().push(listener);
    }

    /// Remove and drop all error listeners
    pub fn remove_error_listeners(&mut self) { self.error_listeners.borrow_mut().clear(); }

    /// Creates new lexer instance
    ///
    /// Starts in `LEXER_DEFAULT_MODE` at line 1, column 0, with a
    /// `ConsoleErrorListener` preinstalled.
    pub fn new_base_lexer(
        input: Input,
        interpreter: LexerATNSimulator,
        recog: T,
        factory: &'input TF,
    ) -> Self {
        let mut lexer = Self {
            interpreter: Some(Box::new(interpreter)),
            input: Some(input),
            recog,
            factory,
            error_listeners: RefCell::new(vec![Box::new(ConsoleErrorListener {})]),
            token_start_char_index: 0,
            token_start_line: 0,
            token_start_column: 0,
            current_pos: Rc::new(LexerPosition {
                line: Cell::new(1),
                char_position_in_line: Cell::new(0),
            }),
            token_type: super::token::TOKEN_INVALID_TYPE,
            text: None,
            token: None,
            hit_eof: false,
            channel: super::token::TOKEN_DEFAULT_CHANNEL,
            mode_stack: Vec::new(),
            mode: self::LEXER_DEFAULT_MODE,
        };
        // Share position tracking with the ATN simulator so both stay in sync.
        let pos = lexer.current_pos.clone();
        lexer.interpreter.as_mut().unwrap().current_pos = pos;
        lexer
    }
}
293
impl<'input, T, Input, TF> TokenAware<'input> for BaseLexer<'input, T, Input, TF>
where
    T: LexerRecog<'input, Self> + 'static,
    Input: CharStream<TF::From>,
    TF: TokenFactory<'input>,
{
    // The token factory type is dictated by the `TF` type parameter.
    type TF = TF;
}
302
303impl<'input, T, Input, TF> TokenSource<'input> for BaseLexer<'input, T, Input, TF>
304where
305    T: LexerRecog<'input, Self> + 'static,
306    Input: CharStream<TF::From>,
307    TF: TokenFactory<'input>,
308{
309    type TF = TF;
310    #[inline]
311    #[allow(unused_labels)]
312    fn next_token(&mut self) -> <Self::TF as TokenFactory<'input>>::Tok {
313        assert!(self.input.is_some());
314
315        let _marker = self.input().mark();
316        'outer: loop {
317            if self.hit_eof {
318                self.emit_eof();
319                break;
320            }
321            self.token = None;
322            self.channel = LEXER_DEFAULT_TOKEN_CHANNEL;
323            self.token_start_column = self
324                .interpreter
325                .as_ref()
326                .unwrap()
327                .get_char_position_in_line();
328            self.token_start_line = self.interpreter.as_ref().unwrap().get_line();
329            self.text = None;
330            let index = self.input().index();
331            self.token_start_char_index = index;
332
333            'inner: loop {
334                self.token_type = TOKEN_INVALID_TYPE;
335                // detach from self, to allow self to be passed deeper
336                let mut interpreter = self.interpreter.take().unwrap();
337                //                    let mut input = self.input.take().unwrap();
338                let result = interpreter.match_token(self.mode, self);
339                self.interpreter = Some(interpreter);
340
341                let ttype = result.unwrap_or_else(|err| {
342                    //                            println!("error, recovering");
343                    notify_listeners(&mut self.error_listeners.borrow_mut(), &err, self);
344                    self.interpreter
345                        .as_mut()
346                        .unwrap()
347                        .recover(err, self.input.as_mut().unwrap());
348                    LEXER_SKIP
349                });
350                //                    self.input = Some(input)
351
352                if self.input().la(1) == super::int_stream::EOF {
353                    self.hit_eof = true;
354                }
355
356                if self.token_type == TOKEN_INVALID_TYPE {
357                    self.token_type = ttype;
358                }
359
360                if self.token_type == LEXER_SKIP {
361                    continue 'outer;
362                }
363
364                if self.token_type != LEXER_MORE {
365                    break;
366                }
367            }
368
369            if self.token.is_none() {
370                self.emit();
371                break;
372            }
373        }
374        self.input().release(_marker);
375        self.token.take().unwrap()
376    }
377
378    fn get_line(&self) -> isize { self.current_pos.line.get() }
379
380    fn get_char_position_in_line(&self) -> isize { self.current_pos.char_position_in_line.get() }
381
382    fn get_input_stream(&mut self) -> Option<&mut dyn IntStream> {
383        match &mut self.input {
384            None => None,
385            Some(x) => Some(x as _),
386        }
387    }
388
389    fn get_source_name(&self) -> String {
390        self.input
391            .as_ref()
392            .map(|it| it.get_source_name())
393            .unwrap_or("<none>".to_string())
394    }
395
396    //    fn set_token_factory<'c: 'b>(&mut self, f: &'c TokenFactory) {
397    //        self.factory = f;
398    //    }
399
400    fn get_token_factory(&self) -> &'input TF { self.factory }
401}
402
403#[cold]
404#[inline(never)]
405fn notify_listeners<'input, T, Input, TF>(
406    liseners: &mut Vec<Box<dyn ErrorListener<'input, BaseLexer<'input, T, Input, TF>>>>,
407    e: &ANTLRError,
408    lexer: &BaseLexer<'input, T, Input, TF>,
409) where
410    T: LexerRecog<'input, BaseLexer<'input, T, Input, TF>> + 'static,
411    Input: CharStream<TF::From>,
412    TF: TokenFactory<'input>,
413{
414    let inner = lexer
415        .input
416        .as_ref()
417        .unwrap()
418        .get_text(lexer.token_start_char_index, lexer.get_char_index());
419    let text = format!(
420        "token recognition error at: '{}'",
421        TF::get_data(inner).to_display()
422    );
423    for listener in liseners.iter_mut() {
424        listener.syntax_error(
425            lexer,
426            None,
427            lexer.token_start_line,
428            lexer.token_start_column,
429            &text,
430            Some(e),
431        )
432    }
433}
434
435impl<'input, T, Input, TF> Lexer<'input> for BaseLexer<'input, T, Input, TF>
436where
437    T: LexerRecog<'input, Self> + 'static,
438    Input: CharStream<TF::From>,
439    TF: TokenFactory<'input>,
440{
441    type Input = Input;
442
443    fn input(&mut self) -> &mut Self::Input { self.input.as_mut().unwrap() }
444
445    fn set_channel(&mut self, v: isize) { self.channel = v; }
446
447    fn push_mode(&mut self, m: usize) {
448        self.mode_stack.push(self.mode);
449        self.mode = m;
450    }
451
452    fn pop_mode(&mut self) -> Option<usize> {
453        self.mode_stack.pop().map(|mode| {
454            self.mode = mode;
455            mode
456        })
457    }
458
459    fn set_type(&mut self, t: isize) { self.token_type = t; }
460
461    fn set_mode(&mut self, m: usize) { self.mode = m; }
462
463    fn more(&mut self) { self.set_type(LEXER_MORE) }
464
465    fn skip(&mut self) { self.set_type(LEXER_SKIP) }
466
467    fn reset(&mut self) { unimplemented!() }
468
469    fn get_interpreter(&self) -> Option<&LexerATNSimulator> { self.interpreter.as_deref() }
470}