// dbt_antlr4/lexer.rs
1//! Lexer implementation
2use std::borrow::Cow::Borrowed;
3use std::borrow::{Borrow, Cow};
4use std::cell::Cell;
5
6use std::rc::Rc;
7
8use crate::char_stream::CharStream;
9use crate::error_listener::{ConsoleErrorListener, ErrorListener};
10use crate::errors::ANTLRError;
11use crate::int_stream::IntStream;
12use crate::lexer_atn_simulator::{ILexerATNSimulator, LexerATNSimulator};
13
14use crate::recognizer::{Actions, Recognizer};
15use crate::rule_context::EmptyRuleNode;
16use crate::token::TOKEN_INVALID_TYPE;
17use crate::token_factory::TokenFactory;
18use crate::token_source::TokenSource;
19use std::ops::{Deref, DerefMut};
20
///  Lexer functionality required by `LexerATNSimulator` to work properly
pub trait Lexer<'input, 'arena, Input, TF>:
    TokenSource<'input, 'arena, TF> + Recognizer<'input, 'arena>
where
    'input: 'arena,
    Input: CharStream<'input>,
    TF: TokenFactory<'input, 'arena> + 'arena,
{
    /// Same as `TokenStream::get_input_stream` but returns concrete type instance
    /// important for proper inlining in hot code of `LexerATNSimulator`
    fn input(&mut self) -> &mut Input;
    /// Sets channel where current token will be pushed
    ///
    /// By default two channels are available:
    ///  - `LEXER_DEFAULT_TOKEN_CHANNEL`
    ///  - `LEXER_HIDDEN`
    fn set_channel(&mut self, v: i32);

    /// Pushes current mode to internal mode stack and sets `m` as current lexer mode
    /// `pop_mode` should be used to recover previous mode
    fn push_mode(&mut self, m: usize);

    /// Pops mode from internal mode stack, restoring it as the current mode.
    /// Returns `None` if the mode stack is empty.
    fn pop_mode(&mut self) -> Option<usize>;

    /// Sets type of the current token
    /// Called from action to override token that will be emitted by lexer
    fn set_type(&mut self, t: i32);

    /// Sets lexer mode discarding current one
    fn set_mode(&mut self, m: usize);

    /// Used to inform lexer that it should consider next token as a continuation of the current one
    fn more(&mut self);

    /// Tells lexer to completely ignore and not emit current token.
    fn skip(&mut self);

    #[doc(hidden)]
    fn reset(&mut self);

    #[doc(hidden)]
    fn get_interpreter(&self) -> Option<&LexerATNSimulator>;
}
65
/// **! Usually generated by ANTLR !**
///
/// This trait combines everything that can be used to extend Lexer behavior
pub trait LexerRecog<'input, 'arena, TF, R>: Actions<'input, 'arena, R> + Sized + 'static
where
    'input: 'arena,
    TF: TokenFactory<'input, 'arena> + 'arena,
    R: Recognizer<'input, 'arena>,
{
    /// Callback to extend emit behavior
    fn before_emit(_lexer: &mut R) {}

    /// Rule names from the grammar this lexer was generated from
    fn get_rule_names(&self) -> &'static [&'static str];

    /// Literal token names as produced by the ANTLR code generator
    fn get_literal_names(&self) -> &[Option<&str>];

    /// Symbolic token names as produced by the ANTLR code generator
    fn get_symbolic_names(&self) -> &[Option<&str>];

    /// Name of the grammar file this lexer was generated from
    fn get_grammar_file_name(&self) -> &'static str;

    /// Builds the `LexerATNSimulator` instance used to drive this lexer
    fn get_atn_simulator(&self) -> LexerATNSimulator;
}
88
/// Default implementation of Lexer
///
/// Public fields in this struct are intended to be used by embedded actions
#[allow(missing_docs)]
pub struct BaseLexer<'input, 'arena, Ext, Input, TF>
where
    'input: 'arena,
    Ext: LexerRecog<'input, 'arena, TF, Self> + 'static,
    Input: CharStream<'input>,
    TF: TokenFactory<'input, 'arena> + 'arena,
{
    /// `LexerATNSimulator` instance of this lexer
    pub interpreter: Option<Box<LexerATNSimulator>>,
    /// `CharStream` used by this lexer
    pub input: Option<Input>,
    // Generated recognizer extension; exposed to callers through `Deref`/`DerefMut`.
    recog: Ext,

    // Factory used by `emit`/`emit_eof` to create tokens.
    factory: TF,

    // Listeners notified on token recognition errors (see `notify_listeners`).
    error_listeners: Vec<Box<dyn ErrorListener<'input, 'arena, Self>>>,

    /// Input index at which the token currently being matched started
    pub token_start_char_index: isize,
    /// Line on which the current token started
    pub token_start_line: u32,
    /// Char position in line at which the current token started
    pub token_start_column: i32,
    // Current line/column; shared with the `LexerATNSimulator` via `Rc`
    // (wired up in `new_base_lexer`).
    current_pos: Rc<LexerPosition>,
    /// Overrides token type emitted by lexer for current token
    pub token_type: i32,
    /// Make it `Some` to override token that is currently being generated by lexer
    pub token: Option<&'arena mut TF::Tok>,
    // Set when lookahead reaches EOF; the next `next_token` iteration emits EOF.
    hit_eof: bool,
    /// Channel lexer is currently assigning tokens to
    pub channel: i32,
    /// stack of modes, which is used for pushMode,popMode lexer actions
    pub mode_stack: Vec<usize>,
    /// Mode lexer is currently in
    pub mode: usize,
    /// Make it `Some` to override text for token that is currently being generated by lexer
    pub text: Option<String>,
}
128
/// Current position of the lexer in the input, held in `Cell`s so it can be
/// shared (via `Rc`) and updated by both the lexer and its ATN simulator.
#[derive(Debug)]
pub(crate) struct LexerPosition {
    // Current line number.
    pub(crate) line: Cell<u32>,
    // Current character position within the line.
    pub(crate) char_position_in_line: Cell<i32>,
}
134
// `Deref` to the generated extension so embedded actions and user code can
// call `Ext` methods directly on the lexer instance.
impl<'input, 'arena, Ext, Input, TF> Deref for BaseLexer<'input, 'arena, Ext, Input, TF>
where
    Ext: LexerRecog<'input, 'arena, TF, Self> + 'static,
    Input: CharStream<'input>,
    TF: TokenFactory<'input, 'arena> + 'arena,
{
    type Target = Ext;
    fn deref(&self) -> &Self::Target {
        &self.recog
    }
}
146
// Mutable counterpart of the `Deref` impl above.
impl<'input, 'arena, Ext, Input, TF> DerefMut for BaseLexer<'input, 'arena, Ext, Input, TF>
where
    Ext: LexerRecog<'input, 'arena, TF, Self> + 'static,
    Input: CharStream<'input>,
    TF: TokenFactory<'input, 'arena> + 'arena,
{
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.recog
    }
}
157
// Both `sempred` and `action` are pure delegation to the generated
// `LexerRecog` extension, passing the lexer itself as the recognizer.
impl<'input, 'arena, Ext, Input, TF> Recognizer<'input, 'arena>
    for BaseLexer<'input, 'arena, Ext, Input, TF>
where
    'input: 'arena,
    Ext: LexerRecog<'input, 'arena, TF, Self> + 'static,
    Input: CharStream<'input>,
    TF: TokenFactory<'input, 'arena> + 'arena,
{
    type Node = EmptyRuleNode<'input, 'arena>;

    // Evaluates semantic predicate `action_index` of rule `rule_index`.
    fn sempred(
        &mut self,
        localctx: Option<&'arena EmptyRuleNode<'input, 'arena>>,
        rule_index: i32,
        action_index: i32,
    ) -> bool {
        Ext::sempred(localctx, rule_index, action_index, self)
    }

    // Executes embedded action `action_index` of rule `rule_index`.
    fn action(
        &mut self,
        localctx: Option<&'arena EmptyRuleNode<'input, 'arena>>,
        rule_index: i32,
        action_index: i32,
    ) {
        Ext::action(localctx, rule_index, action_index, self)
    }
}
186
/// Default lexer mode id
pub const LEXER_DEFAULT_MODE: usize = 0;
/// Special token type to indicate that lexer should continue current token on next iteration
/// see `Lexer::more()`
pub const LEXER_MORE: i32 = -2;
/// Special token type to indicate that lexer should not return current token
/// usually used to skip whitespaces and comments
/// see `Lexer::skip()`
pub const LEXER_SKIP: i32 = -3;

#[doc(inline)]
pub use super::token::TOKEN_DEFAULT_CHANNEL as LEXER_DEFAULT_TOKEN_CHANNEL;

#[doc(inline)]
pub use super::token::TOKEN_HIDDEN_CHANNEL as LEXER_HIDDEN;

// Smallest code point value the lexer can match.
pub(crate) const LEXER_MIN_CHAR_VALUE: i32 = 0x0000;
// Largest code point value the lexer can match (`char::MAX`, U+10FFFF).
pub(crate) const LEXER_MAX_CHAR_VALUE: i32 = 0x10FFFF;
205
206impl<'input, 'arena, Ext, Input, TF> BaseLexer<'input, 'arena, Ext, Input, TF>
207where
208    'input: 'arena,
209    Ext: LexerRecog<'input, 'arena, TF, Self> + 'static,
210    Input: CharStream<'input>,
211    TF: TokenFactory<'input, 'arena> + 'arena,
212{
213    fn emit_token(&mut self, token: &'arena mut TF::Tok) {
214        self.token = Some(token);
215    }
216
217    fn emit(&mut self) {
218        Ext::before_emit(self);
219        let stop = self.get_char_index() - 1;
220        let token = self.factory.create(
221            Some(self.input.as_mut().unwrap()),
222            self.token_type,
223            self.text.take(),
224            self.channel,
225            self.token_start_char_index,
226            stop,
227            self.token_start_line,
228            self.token_start_column,
229        );
230        self.emit_token(token);
231    }
232
233    fn emit_eof(&mut self) {
234        let token = self.factory.create(
235            None::<&mut Input>,
236            super::int_stream::EOF,
237            None,
238            LEXER_DEFAULT_TOKEN_CHANNEL,
239            self.get_char_index(),
240            self.get_char_index() - 1,
241            self.get_line(),
242            self.get_char_position_in_line(),
243        );
244        self.emit_token(token)
245    }
246
247    /// Current position in input stream
248    pub fn get_char_index(&self) -> isize {
249        self.input.as_ref().unwrap().index()
250    }
251
252    /// Current token text
253    pub fn get_text<'a>(&'a self) -> Cow<'a, str>
254    where
255        'input: 'a,
256    {
257        self.text
258            .as_ref()
259            .map(|it| Borrowed(it.borrow()))
260            // .unwrap_or("")
261            .unwrap_or_else(|| {
262                self.input
263                    .as_ref()
264                    .unwrap()
265                    .get_text(self.token_start_char_index, self.get_char_index() - 1)
266            })
267    }
268
269    /// Used from lexer actions to override text of the token that will be emitted next
270    pub fn set_text(&mut self, _text: impl Into<String>) {
271        self.text = Some(_text.into());
272    }
273
274    // fn get_all_tokens(&mut self) -> Vec<TF::Tok> { unimplemented!() }
275
276    // fn get_char_error_display(&self, _c: char) -> String { unimplemented!() }
277
278    /// Add error listener
279    pub fn add_error_listener(&mut self, listener: Box<dyn ErrorListener<'input, 'arena, Self>>) {
280        self.error_listeners.push(listener);
281    }
282
283    /// Remove and drop all error listeners
284    pub fn remove_error_listeners(&mut self) {
285        self.error_listeners.clear();
286    }
287
288    /// Creates new lexer instance
289    pub fn new_base_lexer(input: Input, recog: Ext, factory: TF) -> Self {
290        let mut lexer = Self {
291            interpreter: Some(Box::new(recog.get_atn_simulator())),
292            input: Some(input),
293            recog,
294            factory,
295            error_listeners: vec![Box::new(ConsoleErrorListener {})],
296            token_start_char_index: 0,
297            token_start_line: 0,
298            token_start_column: 0,
299            current_pos: Rc::new(LexerPosition {
300                line: Cell::new(1),
301                char_position_in_line: Cell::new(0),
302            }),
303            token_type: super::token::TOKEN_INVALID_TYPE,
304            text: None,
305            token: None,
306            hit_eof: false,
307            channel: super::token::TOKEN_DEFAULT_CHANNEL,
308            //            token_factory_source_pair: None,
309            mode_stack: Vec::new(),
310            mode: self::LEXER_DEFAULT_MODE,
311        };
312        let pos = lexer.current_pos.clone();
313        lexer.interpreter.as_mut().unwrap().current_pos = pos;
314        lexer
315    }
316}
317
318impl<'input, 'arena, L, Input, TF> TokenSource<'input, 'arena, TF>
319    for BaseLexer<'input, 'arena, L, Input, TF>
320where
321    'input: 'arena,
322    L: LexerRecog<'input, 'arena, TF, Self> + 'static,
323    Input: CharStream<'input>,
324    TF: TokenFactory<'input, 'arena> + 'arena,
325{
326    #[inline]
327    #[allow(unused_labels)]
328    fn next_token(&mut self) -> &'arena mut TF::Tok {
329        assert!(self.input.is_some());
330
331        let _marker = self.input().mark();
332        'outer: loop {
333            if self.hit_eof {
334                self.emit_eof();
335                break;
336            }
337            self.token = None;
338            self.channel = LEXER_DEFAULT_TOKEN_CHANNEL;
339            self.token_start_column = self
340                .interpreter
341                .as_ref()
342                .unwrap()
343                .get_char_position_in_line();
344            self.token_start_line = self.interpreter.as_ref().unwrap().get_line();
345            self.text = None;
346            let index = self.input().index();
347            self.token_start_char_index = index;
348
349            'inner: loop {
350                self.token_type = TOKEN_INVALID_TYPE;
351                // detach from self, to allow self to be passed deeper
352                let mut interpreter = self.interpreter.take().unwrap();
353                //                    let mut input = self.input.take().unwrap();
354                let result = interpreter.match_token(self.mode, self);
355                self.interpreter = Some(interpreter);
356
357                let ttype = result.unwrap_or_else(|err| {
358                    //                            println!("error, recovering");
359                    notify_listeners(self, &err);
360                    self.interpreter
361                        .as_mut()
362                        .unwrap()
363                        .recover(err, self.input.as_mut().unwrap());
364                    LEXER_SKIP
365                });
366                //                    self.input = Some(input)
367
368                if self.input().la(1) == super::int_stream::EOF {
369                    self.hit_eof = true;
370                }
371
372                if self.token_type == TOKEN_INVALID_TYPE {
373                    self.token_type = ttype;
374                }
375
376                if self.token_type == LEXER_SKIP {
377                    continue 'outer;
378                }
379
380                if self.token_type != LEXER_MORE {
381                    break;
382                }
383            }
384
385            if self.token.is_none() {
386                self.emit();
387                break;
388            }
389        }
390        self.input().release(_marker);
391        self.token.take().unwrap()
392    }
393
394    fn get_line(&self) -> u32 {
395        self.current_pos.line.get()
396    }
397
398    fn get_char_position_in_line(&self) -> i32 {
399        self.current_pos.char_position_in_line.get()
400    }
401
402    fn get_input_stream(&mut self) -> Option<&mut dyn IntStream> {
403        self.input.as_mut().map(|x| x as _)
404    }
405
406    fn get_source_name(&self) -> String {
407        self.input
408            .as_ref()
409            .map(|it| it.get_source_name())
410            .unwrap_or("<none>".to_string())
411    }
412
413    //    fn set_token_factory<'c: 'b>(&mut self, f: &'c TokenFactory) {
414    //        self.factory = f;
415    //    }
416
417    fn get_token_factory(&self) -> &TF {
418        &self.factory
419    }
420
421    fn get_dfa_string(&self) -> String {
422        self.get_interpreter()
423            .unwrap()
424            .get_dfa_for_mode(LEXER_DEFAULT_MODE)
425            .to_lexer_string()
426    }
427}
428
/// Reports a token recognition error to every registered error listener.
///
/// Marked cold and never-inlined so this error path stays out of the hot
/// `next_token` loop.
#[cold]
#[inline(never)]
fn notify_listeners<'input, 'arena, L, Input, TF>(
    lexer: &mut BaseLexer<'input, 'arena, L, Input, TF>,
    e: &ANTLRError,
) where
    'input: 'arena,
    L: LexerRecog<'input, 'arena, TF, BaseLexer<'input, 'arena, L, Input, TF>> + 'static,
    Input: CharStream<'input>,
    TF: TokenFactory<'input, 'arena> + 'arena,
{
    // Offending input span: from the token's start up to the current index.
    let inner = lexer
        .input
        .as_ref()
        .unwrap()
        .get_text(lexer.token_start_char_index, lexer.get_char_index());
    let text = format!("token recognition error at: '{}'", inner);
    for listener in lexer.error_listeners.iter() {
        listener.syntax_error(
            lexer,
            None,
            lexer.token_start_line,
            lexer.token_start_column,
            &text,
            Some(e),
        )
    }
}
457
458impl<'input, 'arena, L, Input, TF> Lexer<'input, 'arena, Input, TF>
459    for BaseLexer<'input, 'arena, L, Input, TF>
460where
461    'input: 'arena,
462    L: LexerRecog<'input, 'arena, TF, Self> + 'static,
463    Input: CharStream<'input>,
464    TF: TokenFactory<'input, 'arena> + 'arena,
465{
466    fn input(&mut self) -> &mut Input {
467        self.input.as_mut().unwrap()
468    }
469
470    fn set_channel(&mut self, v: i32) {
471        self.channel = v;
472    }
473
474    fn push_mode(&mut self, m: usize) {
475        self.mode_stack.push(self.mode);
476        self.mode = m;
477    }
478
479    fn pop_mode(&mut self) -> Option<usize> {
480        self.mode_stack.pop().inspect(|&mode| {
481            self.mode = mode;
482        })
483    }
484
485    fn set_type(&mut self, t: i32) {
486        self.token_type = t;
487    }
488
489    fn set_mode(&mut self, m: usize) {
490        self.mode = m;
491    }
492
493    fn more(&mut self) {
494        self.set_type(LEXER_MORE)
495    }
496
497    fn skip(&mut self) {
498        self.set_type(LEXER_SKIP)
499    }
500
501    fn reset(&mut self) {
502        unimplemented!()
503    }
504
505    fn get_interpreter(&self) -> Option<&LexerATNSimulator> {
506        self.interpreter.as_deref()
507    }
508}