// antlr_rust_sleagon/lexer.rs

1//! Lexer implementation
2use std::borrow::Cow::Borrowed;
3use std::borrow::{Borrow, Cow};
4use std::cell::{Cell, RefCell};
5
6use std::rc::Rc;
7
8use crate::char_stream::{CharStream, InputData};
9use crate::error_listener::{ConsoleErrorListener, ErrorListener};
10use crate::errors::ANTLRError;
11use crate::int_stream::IntStream;
12use crate::lexer_atn_simulator::{ILexerATNSimulator, LexerATNSimulator};
13use crate::parser::ParserNodeType;
14
15use crate::recognizer::{Actions, Recognizer};
16use crate::rule_context::EmptyContextType;
17use crate::token::TOKEN_INVALID_TYPE;
18use crate::token_factory::{CommonTokenFactory, TokenAware, TokenFactory};
19use crate::token_source::TokenSource;
20use std::ops::{Deref, DerefMut};
21
///  Lexer functionality required by `LexerATNSimulator` to work properly
pub trait Lexer<'input>:
    TokenSource<'input>
    + Recognizer<'input, Node = EmptyContextType<'input, <Self as TokenAware<'input>>::TF>>
{
    /// Concrete input stream used by this parser
    type Input: IntStream;
    /// Same as `TokenStream::get_input_stream` but returns concrete type instance
    /// important for proper inlining in hot code of `LexerATNSimulator`
    fn input(&mut self) -> &mut Self::Input;
    /// Sets channel where current token will be pushed
    ///
    /// By default two channels are available:
    ///  - `LEXER_DEFAULT_TOKEN_CHANNEL`
    ///  - `LEXER_HIDDEN`
    fn set_channel(&mut self, v: isize);

    /// Pushes current mode to internal mode stack and sets `m` as current lexer mode.
    /// `pop_mode` should be used to recover previous mode
    fn push_mode(&mut self, m: usize);

    /// Pops mode from internal mode stack, making it the current mode again.
    /// Returns the restored mode, or `None` if the mode stack was empty.
    fn pop_mode(&mut self) -> Option<usize>;

    /// Sets type of the current token.
    /// Called from action to override token that will be emitted by lexer
    fn set_type(&mut self, t: isize);

    /// Sets lexer mode discarding current one
    fn set_mode(&mut self, m: usize);

    /// Tells the lexer to consider the next token as a continuation of the current one
    fn more(&mut self);

    /// Tells lexer to completely ignore and not emit current token.
    fn skip(&mut self);

    #[doc(hidden)]
    fn reset(&mut self);

    #[doc(hidden)]
    fn get_interpreter(&self) -> Option<&LexerATNSimulator>;
}
65
/// **! Usually generated by ANTLR !**
///
/// This trait combines everything that can be used to extend Lexer behavior
pub trait LexerRecog<'a, T: Recognizer<'a>>: Actions<'a, T> + Sized + 'static {
    /// Callback to extend emit behavior; invoked right before a token is built and emitted
    fn before_emit(_lexer: &mut T) {}
}
73
/// Default implementation of Lexer
///
/// Public fields in this struct are intended to be used by embedded actions
#[allow(missing_docs)]
pub struct BaseLexer<
    'input,
    T: LexerRecog<'input, Self> + 'static,
    Input: CharStream<TF::From>,
    TF: TokenFactory<'input> = CommonTokenFactory,
> {
    /// `LexerATNSimulator` instance of this lexer
    pub interpreter: Option<Box<LexerATNSimulator>>,
    /// `CharStream` used by this lexer
    pub input: Option<Input>,
    // Generated recognizer with embedded actions; exposed through `Deref`/`DerefMut`
    recog: T,

    // Factory used by `emit`/`emit_eof` to build tokens
    factory: &'input TF,

    // Listeners notified about token recognition errors;
    // a `ConsoleErrorListener` is registered by default
    error_listeners: RefCell<Vec<Box<dyn ErrorListener<'input, Self>>>>,

    /// Input index at which the token currently being generated starts
    pub token_start_char_index: isize,
    /// Line on which the token currently being generated starts
    pub token_start_line: isize,
    /// Column at which the token currently being generated starts
    pub token_start_column: isize,
    // Current line/column cursor, shared with `LexerATNSimulator` via `Rc`
    current_pos: Rc<LexerPosition>,
    /// Overrides token type emitted by lexer for current token
    pub token_type: isize,
    /// Make it `Some` to override token that is currently being generated by lexer
    pub token: Option<TF::Tok>,
    // Set once lookahead reaches EOF; the next iteration emits the EOF token
    hit_eof: bool,
    /// Channel lexer is currently assigning tokens to
    pub channel: isize,
    /// stack of modes, which is used for pushMode,popMode lexer actions
    pub mode_stack: Vec<usize>,
    /// Mode lexer is currently in
    pub mode: usize,
    /// Make it `Some` to override text for token that is currently being generated by lexer
    pub text: Option<<TF::Data as ToOwned>::Owned>,
}
112
/// Current position of the lexer in the input, shared (via `Rc`) with
/// `LexerATNSimulator` so both observe the same cursor.
#[derive(Debug)]
pub(crate) struct LexerPosition {
    /// Line number, starts at 1 (see `new_base_lexer` initialization)
    pub(crate) line: Cell<isize>,
    /// Char position within the current line, starts at 0
    pub(crate) char_position_in_line: Cell<isize>,
}
118
// Derefs to the embedded generated recognizer `T`, so members of the
// actions struct are directly accessible on the lexer from embedded actions.
impl<'input, T, Input, TF> Deref for BaseLexer<'input, T, Input, TF>
where
    T: LexerRecog<'input, Self> + 'static,
    Input: CharStream<TF::From>,
    TF: TokenFactory<'input>,
{
    type Target = T;

    fn deref(&self) -> &Self::Target {
        &self.recog
    }
}
131
// Mutable counterpart of the `Deref` impl above: grants embedded actions
// mutable access to the generated recognizer state.
impl<'input, T, Input, TF> DerefMut for BaseLexer<'input, T, Input, TF>
where
    T: LexerRecog<'input, Self> + 'static,
    Input: CharStream<TF::From>,
    TF: TokenFactory<'input>,
{
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.recog
    }
}
142
// Both methods purely delegate to the generated `Actions` implementation of `T`,
// passing `self` so actions can mutate lexer state.
impl<'input, T, Input, TF> Recognizer<'input> for BaseLexer<'input, T, Input, TF>
where
    T: LexerRecog<'input, Self> + 'static,
    Input: CharStream<TF::From>,
    TF: TokenFactory<'input>,
{
    type Node = EmptyContextType<'input, TF>;

    /// Evaluates the semantic predicate `action_index` of rule `rule_index`
    fn sempred(
        &mut self,
        _localctx: Option<&<Self::Node as ParserNodeType<'input>>::Type>,
        rule_index: isize,
        action_index: isize,
    ) -> bool {
        <T as Actions<'input, Self>>::sempred(_localctx, rule_index, action_index, self)
    }

    /// Executes the embedded action `action_index` of rule `rule_index`
    fn action(
        &mut self,
        _localctx: Option<&<Self::Node as ParserNodeType<'input>>::Type>,
        rule_index: isize,
        action_index: isize,
    ) {
        <T as Actions<'input, Self>>::action(_localctx, rule_index, action_index, self)
    }
}
169
/// Default lexer mode id
pub const LEXER_DEFAULT_MODE: usize = 0;
/// Special token type to indicate that lexer should continue current token on next iteration
/// see `Lexer::more()`
pub const LEXER_MORE: isize = -2;
/// Special token type to indicate that lexer should not return current token
/// usually used to skip whitespaces and comments
/// see `Lexer::skip()`
pub const LEXER_SKIP: isize = -3;

#[doc(inline)]
pub use super::token::TOKEN_DEFAULT_CHANNEL as LEXER_DEFAULT_TOKEN_CHANNEL;

#[doc(inline)]
pub use super::token::TOKEN_HIDDEN_CHANNEL as LEXER_HIDDEN;

// Full Unicode code point range (U+0000 ..= U+10FFFF)
pub(crate) const LEXER_MIN_CHAR_VALUE: isize = 0x0000;
pub(crate) const LEXER_MAX_CHAR_VALUE: isize = 0x10FFFF;
188
impl<'input, T, Input, TF> BaseLexer<'input, T, Input, TF>
where
    T: LexerRecog<'input, Self> + 'static,
    Input: CharStream<TF::From>,
    TF: TokenFactory<'input>,
{
    /// Stores `token` as the pending token that `next_token` will return.
    fn emit_token(&mut self, token: TF::Tok) {
        self.token = Some(token);
    }

    /// Builds a token from the currently accumulated lexer state
    /// (`token_type`, `channel`, start position, optional `text` override)
    /// and stores it as the pending token.
    fn emit(&mut self) {
        // Give generated code a chance to adjust lexer state first.
        <T as LexerRecog<Self>>::before_emit(self);
        // Stop index is inclusive: the last char consumed for this token.
        let stop = self.get_char_index() - 1;
        let token = self.factory.create(
            Some(self.input.as_mut().unwrap()),
            self.token_type,
            self.text.take(), // take() clears the override for the next token
            self.channel,
            self.token_start_char_index,
            stop,
            self.token_start_line,
            self.token_start_column,
        );
        self.emit_token(token);
    }

    /// Emits the EOF token at the current position (zero-length span:
    /// stop = start - 1) on the default channel.
    fn emit_eof(&mut self) {
        let token = self.factory.create(
            None::<&mut Input>,
            super::int_stream::EOF,
            None,
            LEXER_DEFAULT_TOKEN_CHANNEL,
            self.get_char_index(),
            self.get_char_index() - 1,
            self.get_line(),
            self.get_char_position_in_line(),
        );
        self.emit_token(token)
    }

    /// Current position in input stream
    pub fn get_char_index(&self) -> isize {
        self.input.as_ref().unwrap().index()
    }

    /// Current token text
    ///
    /// Returns the `text` override if one was set by an action; otherwise
    /// borrows the span from token start to the current position from the
    /// input stream.
    pub fn get_text<'a>(&'a self) -> Cow<'a, TF::Data>
    where
        'input: 'a,
    {
        self.text
            .as_ref()
            .map(|it| Borrowed(it.borrow()))
            // .unwrap_or("")
            .unwrap_or_else(|| {
                let text = self
                    .input
                    .as_ref()
                    .unwrap()
                    .get_text(self.token_start_char_index, self.get_char_index() - 1);
                TF::get_data(text)
            })
    }

    /// Used from lexer actions to override text of the token that will be emitted next
    pub fn set_text(&mut self, _text: <TF::Data as ToOwned>::Owned) {
        self.text = Some(_text);
    }

    // fn get_all_tokens(&mut self) -> Vec<TF::Tok> { unimplemented!() }

    // fn get_char_error_display(&self, _c: char) -> String { unimplemented!() }

    /// Add error listener
    pub fn add_error_listener(&mut self, listener: Box<dyn ErrorListener<'input, Self>>) {
        self.error_listeners.borrow_mut().push(listener);
    }

    /// Remove and drop all error listeners
    pub fn remove_error_listeners(&mut self) {
        self.error_listeners.borrow_mut().clear();
    }

    /// Creates new lexer instance
    ///
    /// Registers a `ConsoleErrorListener` by default and wires the shared
    /// position cursor into the provided `interpreter`.
    pub fn new_base_lexer(
        input: Input,
        interpreter: LexerATNSimulator,
        recog: T,
        factory: &'input TF,
    ) -> Self {
        let mut lexer = Self {
            interpreter: Some(Box::new(interpreter)),
            input: Some(input),
            recog,
            factory,
            error_listeners: RefCell::new(vec![Box::new(ConsoleErrorListener {})]),
            token_start_char_index: 0,
            token_start_line: 0,
            token_start_column: 0,
            current_pos: Rc::new(LexerPosition {
                line: Cell::new(1),
                char_position_in_line: Cell::new(0),
            }),
            token_type: super::token::TOKEN_INVALID_TYPE,
            text: None,
            token: None,
            hit_eof: false,
            channel: super::token::TOKEN_DEFAULT_CHANNEL,
            //            token_factory_source_pair: None,
            mode_stack: Vec::new(),
            mode: self::LEXER_DEFAULT_MODE,
        };
        // Share the position cursor so lexer and simulator observe the same line/column.
        let pos = lexer.current_pos.clone();
        lexer.interpreter.as_mut().unwrap().current_pos = pos;
        lexer
    }
}
306
// Ties the lexer to its token factory type so traits bounded on
// `TokenAware` can name `TF::Tok`/`TF::Data`.
impl<'input, T, Input, TF> TokenAware<'input> for BaseLexer<'input, T, Input, TF>
where
    T: LexerRecog<'input, Self> + 'static,
    Input: CharStream<TF::From>,
    TF: TokenFactory<'input>,
{
    type TF = TF;
}
315
316impl<'input, T, Input, TF> TokenSource<'input> for BaseLexer<'input, T, Input, TF>
317where
318    T: LexerRecog<'input, Self> + 'static,
319    Input: CharStream<TF::From>,
320    TF: TokenFactory<'input>,
321{
322    type TF = TF;
323    #[inline]
324    #[allow(unused_labels)]
325    fn next_token(&mut self) -> <Self::TF as TokenFactory<'input>>::Tok {
326        assert!(self.input.is_some());
327
328        let _marker = self.input().mark();
329        'outer: loop {
330            if self.hit_eof {
331                self.emit_eof();
332                break;
333            }
334            self.token = None;
335            self.channel = LEXER_DEFAULT_TOKEN_CHANNEL;
336            self.token_start_column = self
337                .interpreter
338                .as_ref()
339                .unwrap()
340                .get_char_position_in_line();
341            self.token_start_line = self.interpreter.as_ref().unwrap().get_line();
342            self.text = None;
343            let index = self.input().index();
344            self.token_start_char_index = index;
345
346            'inner: loop {
347                self.token_type = TOKEN_INVALID_TYPE;
348                // detach from self, to allow self to be passed deeper
349                let mut interpreter = self.interpreter.take().unwrap();
350                //                    let mut input = self.input.take().unwrap();
351                let result = interpreter.match_token(self.mode, self);
352                self.interpreter = Some(interpreter);
353
354                let ttype = result.unwrap_or_else(|err| {
355                    //                            println!("error, recovering");
356                    notify_listeners(&mut self.error_listeners.borrow_mut(), &err, self);
357                    self.interpreter
358                        .as_mut()
359                        .unwrap()
360                        .recover(err, self.input.as_mut().unwrap());
361                    LEXER_SKIP
362                });
363                //                    self.input = Some(input)
364
365                if self.input().la(1) == super::int_stream::EOF {
366                    self.hit_eof = true;
367                }
368
369                if self.token_type == TOKEN_INVALID_TYPE {
370                    self.token_type = ttype;
371                }
372
373                if self.token_type == LEXER_SKIP {
374                    continue 'outer;
375                }
376
377                if self.token_type != LEXER_MORE {
378                    break;
379                }
380            }
381
382            if self.token.is_none() {
383                self.emit();
384                break;
385            }
386        }
387        self.input().release(_marker);
388        self.token.take().unwrap()
389    }
390
391    fn get_line(&self) -> isize {
392        self.current_pos.line.get()
393    }
394
395    fn get_char_position_in_line(&self) -> isize {
396        self.current_pos.char_position_in_line.get()
397    }
398
399    fn get_input_stream(&mut self) -> Option<&mut dyn IntStream> {
400        match &mut self.input {
401            None => None,
402            Some(x) => Some(x as _),
403        }
404    }
405
406    fn get_source_name(&self) -> String {
407        self.input
408            .as_ref()
409            .map(|it| it.get_source_name())
410            .unwrap_or("<none>".to_string())
411    }
412
413    //    fn set_token_factory<'c: 'b>(&mut self, f: &'c TokenFactory) {
414    //        self.factory = f;
415    //    }
416
417    fn get_token_factory(&self) -> &'input TF {
418        self.factory
419    }
420}
421
422#[cold]
423#[inline(never)]
424fn notify_listeners<'input, T, Input, TF>(
425    liseners: &mut Vec<Box<dyn ErrorListener<'input, BaseLexer<'input, T, Input, TF>>>>,
426    e: &ANTLRError,
427    lexer: &BaseLexer<'input, T, Input, TF>,
428) where
429    T: LexerRecog<'input, BaseLexer<'input, T, Input, TF>> + 'static,
430    Input: CharStream<TF::From>,
431    TF: TokenFactory<'input>,
432{
433    let inner = lexer
434        .input
435        .as_ref()
436        .unwrap()
437        .get_text(lexer.token_start_char_index, lexer.get_char_index());
438    let text = format!(
439        "token recognition error at: '{}'",
440        TF::get_data(inner).to_display()
441    );
442    for listener in liseners.iter_mut() {
443        listener.syntax_error(
444            lexer,
445            None,
446            lexer.token_start_line,
447            lexer.token_start_column,
448            &text,
449            Some(e),
450        )
451    }
452}
453
454impl<'input, T, Input, TF> Lexer<'input> for BaseLexer<'input, T, Input, TF>
455where
456    T: LexerRecog<'input, Self> + 'static,
457    Input: CharStream<TF::From>,
458    TF: TokenFactory<'input>,
459{
460    type Input = Input;
461
462    fn input(&mut self) -> &mut Self::Input {
463        self.input.as_mut().unwrap()
464    }
465
466    fn set_channel(&mut self, v: isize) {
467        self.channel = v;
468    }
469
470    fn push_mode(&mut self, m: usize) {
471        self.mode_stack.push(self.mode);
472        self.mode = m;
473    }
474
475    fn pop_mode(&mut self) -> Option<usize> {
476        self.mode_stack.pop().map(|mode| {
477            self.mode = mode;
478            mode
479        })
480    }
481
482    fn set_type(&mut self, t: isize) {
483        self.token_type = t;
484    }
485
486    fn set_mode(&mut self, m: usize) {
487        self.mode = m;
488    }
489
490    fn more(&mut self) {
491        self.set_type(LEXER_MORE)
492    }
493
494    fn skip(&mut self) {
495        self.set_type(LEXER_SKIP)
496    }
497
498    fn reset(&mut self) {
499        unimplemented!()
500    }
501
502    fn get_interpreter(&self) -> Option<&LexerATNSimulator> {
503        self.interpreter.as_deref()
504    }
505}