// lexigram_core/lexer/mod.rs
1// Copyright (c) 2025 Redglyph (@gmail.com). All Rights Reserved.
2
3pub(crate) mod tests;
4
use std::collections::HashMap;
use std::fmt::{Display, Formatter};
use std::io::Read;
use std::ops::{Add, AddAssign};

use crate::char_reader::{escape_char, CharReader};
use crate::segmap::{char_to_group, GroupId, SegMap};
use crate::TokenId;
// ---------------------------------------------------------------------------------------------
// Types used in lexer

/// Index of a state in the lexer's transition table.
pub type StateId = usize;
/// Output channel identifier; channel 0 is the default channel fed to the parser.
pub type ChannelId = u16;
/// Lexer mode identifier, used by the `mode`/`push`/`pop` terminal actions.
pub type ModeId = u16;
/// Terminal instructions for the lexer logic.
///
/// Possible actions:
/// * skip           => doesn't return token, drops current string
/// * more           => doesn't return token, keeps current string for next rule
/// * push(n)        => pushes mode and switches to mode `n`
/// * pop            => pops next mode from the stack
/// * channel #      => defines output channel
///
/// By default, `push`, `pop`, `channel` or no specified action outputs a token (`token = Some(..)`).
/// If a `skip` or `more` action is specified, no token is returned (`token = None`).
#[derive(Clone, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
pub struct Terminal {
    /// What happens to the scanned text: emit a token, `skip`, or `more`.
    pub action: ActionOption,
    /// Output channel of the emitted token (0 is the default channel).
    pub channel: ChannelId,
    /// Optional mode change (`mode` or `push`) performed when this terminal is reached.
    pub mode: ModeOption,
    /// Start state of the target mode, when a mode change is requested.
    pub mode_state: Option<StateId>,
    /// When `true`, restores the previous start state from the mode stack.
    pub pop: bool
}
38
39impl Terminal {
40    #[inline]
41    pub fn is_only_skip(&self) -> bool {
42        self.action.is_skip() && self.mode.is_none() && self.mode_state.is_none() && !self.pop
43    }
44
45    #[inline]
46    pub fn is_token(&self) -> bool {
47        self.action.is_token()
48    }
49
50    #[inline]
51    pub fn get_token(&self) -> Option<TokenId> {
52        self.action.get_token()
53    }
54
55    pub fn to_macro(&self) -> String {
56        let mut str = Vec::<String>::new();
57        match self.action {
58            ActionOption::Skip => str.push("term!(skip)".to_string()),
59            ActionOption::Token(t) => str.push(format!("term!(={t})")),
60            ActionOption::More => str.push("term!(more)".to_string())
61        }
62        if self.channel != 0 {
63            str.push(format!("term!(#{})", self.channel));
64        }
65        match self.mode {
66            ModeOption::None => {}
67            ModeOption::Mode(m) => str.push(format!("term!(mode {m})")),
68            ModeOption::Push(m) => str.push(format!("term!(push {m})")),
69        }
70        if let Some(id) = self.mode_state {
71            str.push(format!("term!(pushst {})", id));
72        }
73        if self.pop {
74            str.push("term!(pop)".to_string());
75        }
76        str.join(" + ")
77    }
78}
79
80impl Display for Terminal {
81    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
82        write!(f, "<{}", self.action)?;
83        if self.channel != 0 { write!(f, ",ch {}", self.channel)?; }
84        if !self.mode.is_none() || self.mode_state.is_some() {
85            match self.mode {
86                ModeOption::None => {}
87                ModeOption::Mode(m) => write!(f, ",mode({m}")?,
88                ModeOption::Push(m) => write!(f, ",push({m}")?,
89            }
90            if let Some(s) = self.mode_state { write!(f, ",state {s}")?; }
91            write!(f, ")")?;
92        }
93        if self.pop { write!(f, ",pop")?; }
94        write!(f, ">")
95    }
96}
97
98impl Add for Terminal {
99    type Output = Terminal;
100
101    fn add(self, rhs: Self) -> Self::Output {
102        Terminal {
103            // token: if self.token.is_some() { self.token } else { rhs.token },
104            action: self.action + rhs.action,
105            channel: self.channel + rhs.channel,
106            mode: if !self.mode.is_none() { self.mode } else { rhs.mode },
107            mode_state: if self.mode_state.is_some() { self.mode_state } else { rhs.mode_state },
108            pop: self.pop || rhs.pop
109        }
110    }
111}
112
/// What happens to the scanned text when an accepting state is reached.
#[derive(Clone, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
pub enum ActionOption {
    /// Drop the scanned text and return no token (the default).
    #[default] Skip,
    /// Emit the given token.
    Token(TokenId),
    /// Return no token but keep the scanned text for the next rule.
    More
}
119
120impl ActionOption {
121    pub fn is_skip(&self) -> bool { self == &ActionOption::Skip }
122    pub fn is_token(&self) -> bool { matches!(self, ActionOption::Token(_) ) }
123    pub fn is_more(&self) -> bool { self == &ActionOption::More }
124
125    pub fn get_token(&self) -> Option<TokenId> {
126        if let ActionOption::Token(token) = self {
127            Some(*token)
128        } else {
129            None
130        }
131    }
132}
133
134impl Add for ActionOption {
135    type Output = Self;
136
137    fn add(self, rhs: Self) -> Self::Output {
138        match self {
139            ActionOption::Skip => rhs,
140            _ => if rhs.is_skip() { self } else { panic!("can't add {self:?} and {rhs:?}") }
141        }
142    }
143}
144
145impl Display for ActionOption {
146    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
147        match self {
148            ActionOption::Skip => write!(f, "skip"),
149            ActionOption::Token(t) => write!(f, "end:{t}"),
150            ActionOption::More => write!(f, "more")
151        }
152    }
153}
154
/// Mode change performed when an accepting state is reached.
#[derive(Clone, Copy, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
pub enum ModeOption {
    /// No mode change (the default).
    #[default]
    None,
    /// Switch to the given mode without saving the current one.
    Mode(ModeId),
    /// Save the current mode on the stack, then switch to the given mode.
    Push(ModeId)
}
162
163impl ModeOption {
164    pub fn is_none(&self) -> bool {
165        self == &ModeOption::None
166    }
167
168    pub fn is_mode(&self) -> bool {
169        matches!(self, &ModeOption::Mode(_))
170    }
171
172    pub fn is_push(&self) -> bool {
173        matches!(self, &ModeOption::Push(_))
174    }
175}
176
// ---------------------------------------------------------------------------------------------
// Locations

/// 1-based column number.
pub type CaretCol = u64;
/// 1-based line number.
pub type CaretLine = u64;

/// `Pos(line, col)`: caret position in the source text (both 1-based).
#[derive(Clone, Copy, PartialEq, PartialOrd, Debug)]
pub struct Pos(pub CaretLine, pub CaretCol);
186
187impl Pos {
188    pub fn line(&self) -> CaretLine {
189        self.0
190    }
191
192    pub fn col(&self) -> CaretCol {
193        self.1
194    }
195
196    pub fn update_pos(&mut self, c: char, tab_width: CaretCol) {
197        match c {
198            '\t' => {
199                //            ↓       ↓    (if self.tab_width = 8)
200                //    1234567890123456789
201                // 1) ..↑                  col = 3
202                //    ..→→→→→→↑            col = 3 - 2%8 + 8 = 3 - 2 + 8 = 9
203                // 2) .............↑       col = 14
204                //    .............→→→↑    col = 14 - 13%8 + 8 = 14 - 5 + 8 = 17
205                self.1 = self.1 - (self.1 - 1) % tab_width + tab_width;
206            }
207            '\n' => {
208                self.0 += 1;
209                self.1 = 1;
210            }
211            '\r' => {}
212            _ => self.1 += 1,
213        }
214    }
215}
216
217impl Display for Pos {
218    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
219        write!(f, "{}:{}", self.0, self.1)
220    }
221}
222
/// `PosSpan` defines a text selection where `first` and `last` are the [position](Pos) of the first and last character.
/// When `first` > `last`, no text is selected.
#[derive(Clone, PartialEq, Debug)]
pub struct PosSpan {
    /// Position of the first selected character.
    pub first: Pos,
    /// Position of the last selected character (inclusive).
    pub last: Pos,
}
230
231impl PosSpan {
232    #[inline(always)]
233    pub fn new(first: Pos, last: Pos) -> Self {
234        PosSpan { first, last }
235    }
236
237    #[inline(always)]
238    pub fn empty() -> Self {
239        PosSpan { first: Pos(1, 1), last: Pos(0, 0) }
240    }
241
242    pub fn take(&mut self) -> PosSpan {
243        std::mem::take(self)
244    }
245
246    #[inline(always)]
247    pub fn is_empty(&self) -> bool {
248        self.first > self.last
249    }
250
251    #[inline(always)]
252    pub fn is_not_empty(&self) -> bool {
253        self.first <= self.last
254    }
255
256    pub fn first(&self) -> Option<Pos> {
257        if self.is_not_empty() { Some(self.first) } else { None }
258    }
259
260    pub fn first_forced(&self) -> Pos {
261        if self.is_not_empty() { self.first } else { panic!("span is empty") }
262    }
263
264    pub fn last(&self) -> Option<Pos> {
265        if self.is_not_empty() { Some(self.last) } else { None }
266    }
267
268    pub fn last_forced(&self) -> Pos {
269        if self.is_not_empty() { self.last } else { panic!("span is empty") }
270    }
271}
272
273impl AddAssign<&PosSpan> for PosSpan {
274    fn add_assign(&mut self, rhs: &Self) {
275        match (self.is_empty(), rhs.is_empty()) {
276            (true, false) => (self.first, self.last) = (rhs.first, rhs.last),
277            (false, false) => self.last = rhs.last,
278            _ => {}
279        }
280    }
281}
282
283impl Add<&PosSpan> for &PosSpan {
284    type Output = PosSpan;
285
286    fn add(self, rhs: &PosSpan) -> Self::Output {
287        let mut sum = self.clone();
288        sum += rhs;
289        sum
290    }
291}
292
293impl Add<&PosSpan> for PosSpan {
294    type Output = PosSpan;
295
296    fn add(self, rhs: &PosSpan) -> Self::Output {
297        let mut sum = self.clone();
298        sum += rhs;
299        sum
300    }
301}
302
303impl Add<PosSpan> for PosSpan {
304    type Output = PosSpan;
305
306    fn add(mut self, rhs: PosSpan) -> Self::Output {
307        self += &rhs;
308        self
309    }
310}
311
312impl Default for PosSpan {
313    fn default() -> Self {
314        PosSpan::empty()
315    }
316}
317
318impl Display for PosSpan {
319    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
320        if self.is_not_empty() {
321            let (first, last) = (&self.first, &self.last);
322            if first == last {
323                write!(f, "{first}")
324            } else if first.0 == last.0 {
325                write!(f, "{first}-{}", last.1)
326            } else {
327                write!(f, "{first}-{last}")
328            }
329        } else {
330            write!(f, "<empty>")
331        }
332    }
333}
334
// ---------------------------------------------------------------------------------------------
// Table-based lexer interpreter

/// Diagnostic context attached to most [`LexerError`] variants.
#[derive(Clone, PartialEq, Debug)]
pub struct LexerErrorInfo {
    /// Absolute character position in the stream (0-based).
    pub pos: u64,
    /// Line of the cursor when the error occurred (1-based).
    pub line: CaretLine,
    /// Column of the cursor when the error occurred (1-based).
    pub col: CaretCol,
    /// Character being processed, if any (`None` at end of stream).
    pub curr_char: Option<char>,
    /// Character group of `curr_char` in the lexer tables.
    pub group: GroupId,
    /// Lexer state when the error occurred.
    pub state: StateId,
    /// Text scanned so far for the current token.
    pub text: String,
}
348
/// Errors reported by the lexer.
#[derive(Clone, PartialEq, Debug)]
pub enum LexerError {
    /// No error.
    None,
    /// A token was requested but no stream is attached.
    NoStreamAttached,
    /// The stream ended before a token could be completed.
    EndOfStream { info: LexerErrorInfo },
    /// The character belongs to a known group but no transition accepts it here.
    InvalidChar { info: LexerErrorInfo },
    /// The character does not belong to any known character group.
    UnrecognizedChar { info: LexerErrorInfo },
    /// The lexer stopped progressing (detected in debug builds only).
    InfiniteLoop { pos: u64 },
    /// A `pop` action was executed with an empty mode stack.
    EmptyStateStack { info: LexerErrorInfo }
}
359
360impl Display for LexerError {
361    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
362        match self {
363            LexerError::None => write!(f, "no error"),
364            LexerError::NoStreamAttached => write!(f, "no stream attached"),
365            LexerError::EndOfStream { info: LexerErrorInfo { pos, line, col, ..} } =>
366                write!(f, "end of stream, line {line}, col {col} (stream pos = {pos})"),
367            LexerError::InvalidChar { info: LexerErrorInfo { pos, line, col, curr_char, .. } } =>
368                write!(f, "invalid character '{}', line {line}, col {col} (stream pos = {pos})", curr_char.unwrap()),
369            LexerError::UnrecognizedChar  { info: LexerErrorInfo { pos, line, col, curr_char, .. } } =>
370                write!(f, "unrecognized character '{}', line {line}, col {col} (stream pos = {pos})", curr_char.unwrap()),
371            LexerError::InfiniteLoop { pos } =>
372                write!(f, "infinite loop (stream pos = {pos})"),
373            LexerError::EmptyStateStack { info: LexerErrorInfo { pos, line, col, curr_char, .. } } =>
374                write!(f, "pop from empty stack, line {line}, col {col}{} (stream pos = {pos})",
375                       if let Some(c) = curr_char { format!(", chr = '{c}'") } else { String::new() })
376        }
377    }
378}
379
impl LexerError {
    /// Returns the stream position attached to the error, if any.
    pub fn get_pos(&self) -> Option<u64> {
        match &self {
            LexerError::EndOfStream { info: LexerErrorInfo { pos, .. } }
            | LexerError::InvalidChar { info: LexerErrorInfo { pos, .. } }
            | LexerError::UnrecognizedChar { info: LexerErrorInfo { pos, .. } }
            | LexerError::InfiniteLoop { pos }
            | LexerError::EmptyStateStack { info: LexerErrorInfo { pos, .. } } => Some(*pos),
            _ => None
        }
    }

    /// Returns the `(line, column)` attached to the error, if any.
    /// `InfiniteLoop` carries a stream position but no line/column, so it yields `None`.
    pub fn get_line_col(&self) -> Option<(CaretLine, CaretCol)> {
        match &self {
            LexerError::EndOfStream { info: LexerErrorInfo { line, col, .. } }
            | LexerError::InvalidChar { info: LexerErrorInfo { line, col, .. } }
            | LexerError::UnrecognizedChar { info: LexerErrorInfo { line, col, .. } }
            | LexerError::EmptyStateStack { info: LexerErrorInfo { line, col, .. } } => Some((*line, *col)),
            _ => None
        }
    }
}
402
/// Token produced by the lexer: `(token id, channel, matched text, span)`.
pub type LexerToken = (TokenId, ChannelId, String, PosSpan);
404
/// Lexical analyzer (lexer) based on tables, which scans a `Read` source and produces tokens.
///
/// The tokens can be extracted one by one with [`get_token()`](Lexer::get_token) or from an
/// iterator created by [`tokens()`](Lexer::tokens).
pub struct Lexer<'a, R> {
    // operating variables
    /// Attached character stream, if any.
    pub(crate) input: Option<CharReader<R>>,
    /// Last error reported (`LexerError::None` when none).
    pub(crate) error: LexerError,
    /// `true` once the end of the stream has been reached.
    pub(crate) is_eos: bool,
    /// Absolute character position in the stream (0-based).
    pub(crate) pos: u64,
    /// Current line/column of the scanner.
    pub(crate) cursor: Pos,
    /// Tabulation width used to compute columns (defaults to 4).
    pub(crate) tab_width: CaretCol,
    /// Stack of start states used by the `push`/`pop` mode actions.
    pub(crate) state_stack: Vec<StateId>,
    /// Start state of the current mode.
    pub(crate) start_state: StateId,
    // parameters
    /// Number of character groups; a group index >= `nbr_groups` marks an unknown character.
    pub nbr_groups: u32,
    /// State the lexer (re)starts in when a stream is attached.
    pub initial_state: StateId,
    pub first_end_state: StateId,   // accepting when state >= first_end_state
    pub nbr_states: StateId,        // error if state >= nbr_states
    // tables
    /// Group of each ASCII character, indexed by code point.
    pub ascii_to_group: &'a [GroupId],
    /// Group of individual non-ASCII characters, by exact value.
    pub utf8_to_group: HashMap<char, GroupId>,
    /// Group of remaining characters — presumably by character segment/range (see `SegMap`).
    pub seg_to_group: SegMap<GroupId>,
    /// Transition table, indexed by `nbr_groups * state + group`.
    pub state_table: &'a [StateId],
    pub terminal_table: &'a [Terminal],  // token(state) = token_table[state - first_end_state]
}
431
impl<'a, R: Read> Lexer<'a, R> {
    /// Creates a lexer from its parameters and tables; no stream is attached yet
    /// (see [`attach_stream`](Lexer::attach_stream)).
    pub fn new(
        // parameters
        nbr_groups: u32,
        initial_state: StateId,
        first_end_state: StateId,   // accepting when state >= first_end_state
        nbr_states: StateId,        // error if state >= nbr_states
        // tables
        ascii_to_group: &'a [GroupId],
        utf8_to_group: HashMap<char, GroupId>,
        seg_to_group: SegMap<GroupId>,
        state_table: &'a [StateId],
        terminal_table: &'a [Terminal],  // token(state) = token_table[state - first_end_state]
    ) -> Self {
        Lexer {
            input: None,
            error: LexerError::None,
            is_eos: false,
            pos: 0,
            cursor: Pos(1, 1),
            tab_width: 4,
            state_stack: Vec::new(),
            start_state: 0,
            nbr_groups,
            initial_state,
            first_end_state,
            nbr_states,
            ascii_to_group,
            utf8_to_group,
            seg_to_group,
            state_table,
            terminal_table,
        }
    }

    /// Attaches a character stream and resets the operating state
    /// (position, cursor, mode stack, start state).
    pub fn attach_stream(&mut self, input: CharReader<R>) {
        self.input = Some(input);
        self.is_eos = false;
        self.pos = 0;
        self.cursor = Pos(1, 1);
        self.state_stack.clear();
        self.start_state = self.initial_state;
    }

    /// Detaches and returns the current stream, if any.
    pub fn detach_stream(&mut self) -> Option<CharReader<R>> {
        self.input.take()
    }

    /// Sets the tabulation width used to compute column positions.
    pub fn set_tab_width(&mut self, width: CaretCol) {
        self.tab_width = width;
    }

    /// Returns the tabulation width used to compute column positions.
    pub fn get_tab_width(&self) -> CaretCol {
        self.tab_width
    }

    /// Returns a reference to the attached stream, if any.
    pub fn stream(&self) -> Option<&CharReader<R>> {
        self.input.as_ref()
    }

    /// `true` when a stream is attached and still being read.
    pub fn is_open(&self) -> bool {
        self.input.as_ref().map(|input| input.is_reading()).unwrap_or(false)
    }

    /// Consumes characters until the cursor reaches `new_pos`; fails when no stream
    /// is attached or when the stream ends before the position is reached.
    pub fn skip_to_pos(&mut self, new_pos: Pos) -> Result<(), String> {
        if self.input.is_none() {
            return Err("no current input".to_string());
        }
        while self.cursor != new_pos {
            if let Some(c) = self.input.as_mut().unwrap().get_char() {
                self.cursor.update_pos(c, self.tab_width);
            } else {
                return Err("cannot find the position of the grammar in the lexicon".to_string());
            };
        }
        Ok(())
    }

    /// Returns an iterator over the tokens of the attached stream.
    pub fn tokens(&mut self) -> LexInterpretIter<'_, 'a, R> {
        LexInterpretIter { lexer: self, error_info: None, mode: LexInterpretIterMode::Normal }
    }

    // get_token flow:
    //
    //      if input.is_none
    //          return error
    //      state = start
    //      startpos = endpos = self.(line, col)
    //      loop
    //          next char
    //          group       -> group == nbr_groups => unrecognized
    //          next_state  -> [normal] < first_end_state <= [accepting] <= nbr_states <= [invalid char]
    //          if next_state >= nbr_states || group >= nbr_groups (invalid char)
    //              if !EOS
    //                  rewind char
    //              if first_end_state <= state < nbr_states (accepting)
    //                  // process skip/push/pop:
    //                  curr_start = start
    //                  if pop
    //                      start = stack.pop()
    //                  if push(n)
    //                      stack.push(curr_start)
    //                      start = n
    //                      state = n
    //                  if !skip
    //                      return (token, channel, span(startpos, endpos))
    //                  startpos = self.(line, col)
    //                  if !EOS
    //                      state = start
    //                      continue // skip
    //              return error/EOS
    //          else
    //              endpos = self.(line, col)
    //              update self.(line, col)
    //              state = next_state
    //              pos++
    //
    /// Scans the stream and returns the next token as `Ok(Some((token, channel,
    /// text, span)))`, `Ok(None)` at a clean end of stream, or an error
    /// (no stream attached, invalid/unrecognized character, end of stream inside
    /// a token, pop from an empty mode stack, or — in debug builds — a detected
    /// infinite loop).
    pub fn get_token(&mut self) -> Result<Option<LexerToken>, LexerError> {
        const VERBOSE: bool = false;
        if VERBOSE { println!("lexer state_table: {}, last: {}", self.state_table.len(), self.state_table.iter().last().unwrap()); }
        self.error = LexerError::None;
        let mut text = String::new();
        let mut more_text = String::new();  // keeps previously scanned text if `more` action
        if self.input.is_some() {
            let mut state = self.start_state;
            let mut first_pos = self.cursor;
            let mut last_pos = first_pos;
            // infinite-loop detection (debug builds only): trips when the same state is
            // seen at the same stream offset several times in a row
            #[cfg(debug_assertions)] let mut last_state: Option<StateId> = None;
            #[cfg(debug_assertions)] let mut last_offset: Option<u64> = None;
            #[cfg(debug_assertions)] let mut infinite_loop_cnt = 0_u32;
            loop {
                if VERBOSE { print!("- state = {state}"); }
                #[allow(clippy::unnecessary_unwrap)] // borrow checker disagreed with Clippy and won the argument
                let input = self.input.as_mut().unwrap();
                #[cfg(debug_assertions)] {
                    if last_state.map(|st| st == state).unwrap_or(false) && last_offset.map(|offset| offset == input.get_offset()).unwrap_or(false) {
                        if infinite_loop_cnt > 3 {
                            self.error = LexerError::InfiniteLoop { pos: self.pos };
                            if VERBOSE { println!(" => Err({})", self.error); }
                            return Err(self.error.clone());
                        }
                        infinite_loop_cnt += 1;
                    } else {
                        infinite_loop_cnt = 0;
                    }
                    last_state = Some(state);
                    last_offset = Some(input.get_offset());
                }
                // read the next character and map it to its group (nbr_groups = unknown)
                let c_opt = input.get_char();
                let is_eos = c_opt.is_none();
                self.is_eos = is_eos;
                let group = c_opt.and_then(|c| char_to_group(self.ascii_to_group, &self.utf8_to_group, &self.seg_to_group, c))
                    .unwrap_or(self.nbr_groups);
                if VERBOSE { print!(", char '{}' group {}", if let Some(c) = c_opt { escape_char(c) } else { "<EOF>".to_string() }, group); }
                // we can use the state_table even if group = error = nrb_group (but we must
                // ignore new_state and detect that the group is illegal):
                let new_state = self.state_table[self.nbr_groups as usize * state + group as usize];
                if new_state >= self.nbr_states || group >= self.nbr_groups { // we can't do anything with the current character
                    // the character (if any) is pushed back: it belongs to the next token
                    if let Some(c) = c_opt {
                        input.rewind(c).unwrap_or_else(|_| panic!("Can't rewind character '{}'", escape_char(c)));
                    }
                    let is_accepting = self.first_end_state <= state && state < self.nbr_states;
                    if is_accepting { // accepting
                        let terminal = &self.terminal_table[state - self.first_end_state];
                        // 1) process a `pop` action: restore the previous mode's start state
                        if terminal.pop {
                            if self.state_stack.is_empty() {
                                self.error = LexerError::EmptyStateStack {
                                    info: LexerErrorInfo {
                                        pos: self.pos,
                                        line: self.cursor.line(),
                                        col: self.cursor.col(),
                                        curr_char: c_opt,
                                        group,
                                        state,
                                        text: more_text + &text,
                                    }
                                };
                                if VERBOSE { println!(" => Err({})", self.error); }
                                return Err(self.error.clone());
                            }
                            self.start_state = self.state_stack.pop().unwrap();
                            if VERBOSE { print!(", pop to {}", self.start_state); }
                        }
                        // 2) process a `mode`/`push` action: switch to the new mode's start state
                        if let Some(goto_state) = terminal.mode_state {
                            if terminal.mode.is_push() {
                                self.state_stack.push(self.start_state);
                            }
                            self.start_state = goto_state;
                            if VERBOSE { print!(", {}({})", if terminal.mode.is_push() { "push" } else { "mode" }, goto_state); }
                        }
                        // 3) emit the token, if the terminal produces one
                        if let Some(token) = &terminal.get_token() {
                            if VERBOSE { println!(" => OK: token {}", token); }
                            return Ok(Some((*token, terminal.channel, more_text + &text, PosSpan::new(first_pos, last_pos))));
                        }
                        // `skip`: the next token starts at the current cursor;
                        // `more`: keep the start position of the accumulated text
                        if !terminal.action.is_more() {
                            first_pos = self.cursor;
                        }
                        if !is_eos { // we can't skip if <EOF> or we'll loop indefinitely
                            if VERBOSE { println!(" => {}, state {}", terminal.action, self.start_state); }
                            state = self.start_state;
                            if terminal.action.is_more() {
                                more_text.push_str(&text);
                            }
                            text.clear();
                            continue;
                        }
                    }
                    // EOF or invalid character
                    if is_eos && is_accepting {
                        return Ok(None);
                    }
                    let info = LexerErrorInfo {
                        pos: self.pos,
                        line: self.cursor.line(),
                        col: self.cursor.col(),
                        curr_char: c_opt,
                        group,
                        state,
                        text: more_text + &text,
                    };
                    self.error = if is_eos {
                        LexerError::EndOfStream { info }
                    } else if group >= self.nbr_groups {
                        let c = input.get_char().unwrap();   // removing the bad character (not accepting state)
                        self.update_pos(c);
                        LexerError::UnrecognizedChar { info }
                    } else {
                        let c = input.get_char().unwrap();   // removing the bad character (not accepting state)
                        self.update_pos(c);
                        LexerError::InvalidChar { info }
                    };
                    if VERBOSE { println!(" => Err({})", self.error); }
                    return Err(self.error.clone());
                } else {
                    // normal transition: accumulate the character and advance
                    last_pos = self.cursor;
                    if let Some(c) = c_opt {
                        text.push(c);
                        self.update_pos(c);
                    }
                    if VERBOSE { println!(" => state {new_state}"); }
                    state = new_state;
                }
            }
        }
        self.error = LexerError::NoStreamAttached;
        if VERBOSE { println!(" => Err({})", self.error); }
        Err(self.error.clone())
    }

    /// Advances the cursor and the stream position past character `c`.
    pub fn update_pos(&mut self, c: char) {
        self.cursor.update_pos(c, self.tab_width);
        self.pos += 1;
    }

    /// Returns the last reported error (`LexerError::None` when none).
    pub fn get_error(&self) -> &LexerError {
        &self.error
    }

    /// `true` when the last operation set an error.
    pub fn has_error(&self) -> bool {
        self.error != LexerError::None
    }

    /// `true` when the end of the stream has been reached.
    pub fn is_eos(&self) -> bool {
        self.is_eos
    }
}
700
/// State of the token iterator: `Error` means the next call must emit the
/// special error token built from the stored `LexerErrorInfo`.
#[derive(Debug)]
enum LexInterpretIterMode { Normal, Error }
703
/// Iterator over the tokens of a [`Lexer`], created by [`Lexer::tokens`].
pub struct LexInterpretIter<'a, 'b, R> {
    // borrowed lexer driving the iteration
    lexer: &'a mut Lexer<'b, R>,
    // details of the last character error, consumed by the `Error` mode
    error_info: Option<LexerErrorInfo>,
    mode: LexInterpretIterMode
}
709
710impl<'a, 'b, R: Read> Iterator for LexInterpretIter<'a, 'b, R> {
711    type Item = LexerToken; // (TokenId, ChannelId, String, CaretLine, CaretCol);
712
713    fn next(&mut self) -> Option<Self::Item> {
714        if self.lexer.is_eos {
715            None
716        } else {
717            match self.mode {
718                LexInterpretIterMode::Normal => {
719                    let t = self.lexer.get_token();
720                    match t {
721                        Ok(Some(token)) => Some(token),
722                        Err(LexerError::InvalidChar { info } | LexerError::UnrecognizedChar { info }) => {
723                            // in case of invalid or unrecognized character, the stream issues None then a special lexer tokens
724                            // that have a TokenId::MAX value and the error message in the text field
725                            self.error_info = Some(info);
726                            self.mode = LexInterpretIterMode::Error;
727                            None
728                        }
729                        _ => {
730                            None
731                        }
732                    }
733                }
734                LexInterpretIterMode::Error => {
735                    let info = self.error_info.as_ref().unwrap();
736                    self.mode = LexInterpretIterMode::Normal;
737                    let msg = format!("{}, scanned before = '{}'", self.lexer.get_error(), self.error_info.as_ref().unwrap().text);
738                    let pos = Pos(info.line, info.col);
739                    Some((TokenId::MAX, 0, msg, PosSpan::new(pos, pos)))
740                }
741            }
742        }
743    }
744}
745
// ---------------------------------------------------------------------------------------------

/// Iterator adaptor that keeps tokens of one channel and hands the others to a
/// closure; created by the [`TokenSpliterator`] methods.
pub struct TokenSplit<I, F> {
    // underlying token iterator
    iter: I,
    // channel whose tokens are passed through
    ch: ChannelId,
    // consumer of the tokens from all other channels
    f: F
}
753
754pub trait TokenSpliterator: Iterator<Item=LexerToken> {
755    /// Splits the token iterator out of the lexer (Item: `(TokenId, ChannelId, String, PosSpan)`) based on the channel ID:
756    /// * the default channel 0 is output as another iterator on `(token, string, pos_span)`, suitable for the parser
757    /// * other channels are consummed by the closure `f`, which takes the parameters `(token, channel, string, pos_span)`
758    ///
759    /// ## Example
760    /// ```ignore
761    /// let tokens = lexer.tokens().split_channel0(|(tok, ch, text, pos_span)|
762    ///     println!("TOKEN: channel {ch}, discarded, pos {pos_span}, Id {tok:?}, \"{text}\"")
763    /// );
764    /// let result = parser.parse_stream(&mut listener, tokens);
765    /// ```
766    fn split_channel0<F>(self, f: F) -> TokenSplit<Self, F>
767    where Self: Sized,
768          F: FnMut((TokenId, ChannelId, String, PosSpan))
769    {
770        TokenSplit { iter: self, ch: 0, f }
771    }
772
773    /// Splits the token iterator out of the lexer (Item: `(TokenId, ChannelId, String, PosSpan)`) based on the channel ID:
774    /// * the channel `channel` is output as another iterator on `(token, string, pos_span)`, suitable for the parser
775    /// * other channels are consummed by the closure `f`, which takes the parameters `(token, channel, string, pos_span)`
776    ///
777    /// ## Example
778    /// ```ignore
779    /// let tokens = lexer.tokens().split_channels(2, |(tok, ch, text, pos_span)|
780    ///     println!("TOKEN: channel {ch}, discarded, pos {pos_span}, Id {tok:?}, \"{text}\"")
781    /// );
782    /// let result = parser.parse_stream(&mut listener, tokens);
783    /// ```
784    fn split_channels<F>(self, channel: ChannelId, f: F) -> TokenSplit<Self, F>
785    where Self: Sized,
786          F: FnMut(LexerToken)
787    {
788        TokenSplit { iter: self, ch: channel, f }
789    }
790
791    /// Filters the token iterator out of the lexer (Item: `(TokenId, ChannelId, String, PosSpan)`) based on the channel ID:
792    /// * the default channel 0 is output as another iterator on `(token, string, pos_span)`, suitable for the parser
793    /// * other channels are discarded.
794    ///
795    /// ## Example
796    /// ```ignore
797    /// let tokens = lexer.tokens().keep_channel0();
798    /// let result = parser.parse_stream(&mut listener, tokens);
799    /// ```
800    fn keep_channel0(self) -> impl Iterator<Item=(TokenId, String, PosSpan)>
801    where Self: Sized
802    {
803        self.filter_map(|(token, ch, str, pos_span)| {
804            if ch == 0 {
805                Some((token, str, pos_span))
806            } else {
807                None
808            }
809        })
810    }
811
812    /// Filters the token iterator out of the lexer (Item: `(TokenId, ChannelId, String, PosSpan)`) based on the channel ID:
813    /// * channel `channel` is output as another iterator on `(token, string, pos_span)`, suitable for the parser
814    /// * other channels are discarded.
815    ///
816    /// ## Example
817    /// ```ignore
818    /// let tokens = lexer.tokens().keep_channel(2);
819    /// let result = parser.parse_stream(&mut listener, tokens);
820    /// ```
821    fn keep_channel(self, channel: ChannelId) -> TokenSplit<Self, fn(LexerToken)>
822    where Self: Sized
823    {
824        TokenSplit { iter: self, ch: channel, f: |_| {} }
825    }
826
827    // or:
828    //
829    // fn keep_channel(self, channel: ChannelId) -> impl Iterator<Item=(TokenId, String, PosSpan)>
830    // where Self: Sized
831    // {
832    //     self.filter_map(move |(token, ch, str, pos_span)| {
833    //         if ch == channel {
834    //             Some((token, str, pos_span))
835    //         } else {
836    //             None
837    //         }
838    //     })
839    // }
840}
841
impl<I, F> Iterator for TokenSplit<I, F>
    where I: Iterator<Item=LexerToken>,
          F: FnMut((TokenId, ChannelId, String, PosSpan))
{
    type Item = (TokenId, String, PosSpan);

    /// Yields the next token of the selected channel, stripped of its channel ID;
    /// tokens from other channels are handed to the closure.
    ///
    /// NOTE(review): when a token belongs to another channel, it is consumed by the
    /// closure and this call returns `None` without advancing to the next matching
    /// token — the iterator is therefore not fused and consumers must keep polling
    /// after a `None` (the same None-then-resume protocol `LexInterpretIter` uses
    /// for errors); confirm this is the intended contract.
    fn next(&mut self) -> Option<Self::Item> {
        if let Some((token, ch, str, pos_span)) = self.iter.next() {
            if ch == self.ch {
                Some((token, str, pos_span))
            } else {
                // off-channel token: hand it to the consumer, yield nothing this call
                (self.f)((token, ch, str, pos_span));
                None
            }
        } else {
            None
        }
    }
}
861
// Blanket implementation: every iterator over `LexerToken` gets the splitting adaptors.
impl<I: Iterator<Item=LexerToken>> TokenSpliterator for I {}