// lexigram_core/lexer/mod.rs

1// Copyright (c) 2025 Redglyph (@gmail.com). All Rights Reserved.
2
3pub(crate) mod tests;
4
5use std::collections::HashMap;
6use std::fmt::{Display, Formatter};
7use std::io::Read;
8use std::ops::{Add, AddAssign};
9use crate::segmap::{char_to_group, GroupId, SegMap};
10use crate::char_reader::{escape_char, CharReader};
11use crate::TokenId;
12// ---------------------------------------------------------------------------------------------
13// Types used in lexer
14
/// Index of a DFA state in the lexer's state table.
pub type StateId = usize;
/// Output channel identifier; 0 is the default channel.
pub type ChannelId = u16;
/// Lexer mode identifier.
pub type ModeId = u16;
18
/// Terminal instructions for the lexer logic.
///
/// Possible actions:
/// * skip           => doesn't return token, drops current string
/// * more           => doesn't return token, keeps current string for next rule
/// * push(n)        => pushes mode and switches to mode `n`
/// * pop            => pops next mode from the stack
/// * channel #      => defines output channel
///
/// By default, `push`, `pop`, `channel` or no specified action outputs a token (`token = Some(..)`).
/// If a `skip` or `more` action is specified, no token is returned (`token = None`).
#[derive(Clone, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
pub struct Terminal {
    // what to do on match: skip, emit a token, or accumulate text (`more`)
    pub action: ActionOption,
    // output channel of the emitted token (0 = default channel)
    pub channel: ChannelId,
    // optional mode switch (`mode`/`push`) performed on match
    pub mode: ModeOption,
    // start state of the mode switched to, when a switch occurs (see Lexer::get_token)
    pub mode_state: Option<StateId>,
    // restores the previous mode's start state from the stack on match
    pub pop: bool
}
38
39impl Terminal {
40    #[inline]
41    pub fn is_only_skip(&self) -> bool {
42        self.action.is_skip() && self.mode.is_none() && self.mode_state.is_none() && !self.pop
43    }
44
45    #[inline]
46    pub fn is_token(&self) -> bool {
47        self.action.is_token()
48    }
49
50    #[inline]
51    pub fn get_token(&self) -> Option<TokenId> {
52        self.action.get_token()
53    }
54
55    pub fn to_macro(&self) -> String {
56        let mut str = Vec::<String>::new();
57        match self.action {
58            ActionOption::Skip => str.push("term!(skip)".to_string()),
59            ActionOption::Token(t) => str.push(format!("term!(={t})")),
60            ActionOption::More => str.push("term!(more)".to_string())
61        }
62        if self.channel != 0 {
63            str.push(format!("term!(#{})", self.channel));
64        }
65        match self.mode {
66            ModeOption::None => {}
67            ModeOption::Mode(m) => str.push(format!("term!(mode {m})")),
68            ModeOption::Push(m) => str.push(format!("term!(push {m})")),
69        }
70        if let Some(id) = self.mode_state {
71            str.push(format!("term!(pushst {})", id));
72        }
73        if self.pop {
74            str.push("term!(pop)".to_string());
75        }
76        str.join(" + ")
77    }
78}
79
80impl Display for Terminal {
81    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
82        write!(f, "<{}", self.action)?;
83        if self.channel != 0 { write!(f, ",ch {}", self.channel)?; }
84        if !self.mode.is_none() || self.mode_state.is_some() {
85            match self.mode {
86                ModeOption::None => {}
87                ModeOption::Mode(m) => write!(f, ",mode({m}")?,
88                ModeOption::Push(m) => write!(f, ",push({m}")?,
89            }
90            if let Some(s) = self.mode_state { write!(f, ",state {s}")?; }
91            write!(f, ")")?;
92        }
93        if self.pop { write!(f, ",pop")?; }
94        write!(f, ">")
95    }
96}
97
98impl Add for Terminal {
99    type Output = Terminal;
100
101    fn add(self, rhs: Self) -> Self::Output {
102        Terminal {
103            // token: if self.token.is_some() { self.token } else { rhs.token },
104            action: self.action + rhs.action,
105            channel: self.channel + rhs.channel,
106            mode: if !self.mode.is_none() { self.mode } else { rhs.mode },
107            mode_state: if self.mode_state.is_some() { self.mode_state } else { rhs.mode_state },
108            pop: self.pop || rhs.pop
109        }
110    }
111}
112
/// Action performed when an accepting state is reached.
#[derive(Clone, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
pub enum ActionOption {
    // drop the scanned text, emit nothing (default)
    #[default] Skip,
    // emit a token with this id
    Token(TokenId),
    // keep the scanned text for the next rule, emit nothing
    More
}
119
120impl ActionOption {
121    pub fn is_skip(&self) -> bool { self == &ActionOption::Skip }
122    pub fn is_token(&self) -> bool { matches!(self, ActionOption::Token(_) ) }
123    pub fn is_more(&self) -> bool { self == &ActionOption::More }
124
125    pub fn get_token(&self) -> Option<TokenId> {
126        if let ActionOption::Token(token) = self {
127            Some(*token)
128        } else {
129            None
130        }
131    }
132}
133
134impl Add for ActionOption {
135    type Output = Self;
136
137    fn add(self, rhs: Self) -> Self::Output {
138        match self {
139            ActionOption::Skip => rhs,
140            _ => if rhs.is_skip() { self } else { panic!("can't add {self:?} and {rhs:?}") }
141        }
142    }
143}
144
145impl Display for ActionOption {
146    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
147        match self {
148            ActionOption::Skip => write!(f, "skip"),
149            ActionOption::Token(t) => write!(f, "end:{t}"),
150            ActionOption::More => write!(f, "more")
151        }
152    }
153}
154
/// Mode switch performed when an accepting state is reached.
#[derive(Clone, Copy, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
pub enum ModeOption {
    // no mode change (default)
    #[default]
    None,
    // switch to mode, without saving the current one
    Mode(ModeId),
    // push the current mode on the stack, then switch
    Push(ModeId)
}
162
163impl ModeOption {
164    pub fn is_none(&self) -> bool {
165        self == &ModeOption::None
166    }
167
168    pub fn is_mode(&self) -> bool {
169        matches!(self, &ModeOption::Mode(_))
170    }
171
172    pub fn is_push(&self) -> bool {
173        matches!(self, &ModeOption::Push(_))
174    }
175}
176
177// ---------------------------------------------------------------------------------------------
178// Locations
179
pub type CaretCol = u64;
pub type CaretLine = u64;

/// `Pos(line, col)`
#[derive(Clone, Copy, PartialEq, PartialOrd, Debug)]
pub struct Pos(pub CaretLine, pub CaretCol);

impl Pos {
    /// Line component.
    pub fn line(&self) -> CaretLine {
        let Pos(line, _) = *self;
        line
    }

    /// Column component.
    pub fn col(&self) -> CaretCol {
        let Pos(_, col) = *self;
        col
    }
}

/// `PosSpan` defines a text selection where `first` and `last` are the [position](Pos) of the first and last character.
/// When `first` > `last`, no text is selected.
#[derive(Clone, PartialEq, Debug)]
pub struct PosSpan {
    pub first: Pos,
    pub last: Pos,
}

impl PosSpan {
    /// Builds a span from its first and last character positions.
    #[inline(always)]
    pub fn new(first: Pos, last: Pos) -> Self {
        PosSpan { first, last }
    }

    /// Builds an empty span (`first` > `last`).
    #[inline(always)]
    pub fn empty() -> Self {
        PosSpan { first: Pos(1, 1), last: Pos(0, 0) }
    }

    /// Moves the span out, leaving an empty span behind.
    pub fn take(&mut self) -> PosSpan {
        std::mem::take(self)
    }

    /// `true` when the span selects no text.
    #[inline(always)]
    pub fn is_empty(&self) -> bool {
        self.last < self.first
    }

    /// `true` when the span selects at least one character.
    #[inline(always)]
    pub fn is_not_empty(&self) -> bool {
        !self.is_empty()
    }

    /// First position, or `None` when the span is empty.
    pub fn first(&self) -> Option<Pos> {
        if self.is_empty() { None } else { Some(self.first) }
    }

    /// First position; panics when the span is empty.
    pub fn first_forced(&self) -> Pos {
        self.first().expect("span is empty")
    }

    /// Last position, or `None` when the span is empty.
    pub fn last(&self) -> Option<Pos> {
        if self.is_empty() { None } else { Some(self.last) }
    }

    /// Last position; panics when the span is empty.
    pub fn last_forced(&self) -> Pos {
        self.last().expect("span is empty")
    }
}

impl AddAssign<&PosSpan> for PosSpan {
    /// Extends the span with `rhs`: an empty span is replaced entirely,
    /// otherwise only the end is advanced. An empty `rhs` is a no-op.
    fn add_assign(&mut self, rhs: &Self) {
        if rhs.is_empty() {
            return;
        }
        if self.is_empty() {
            self.first = rhs.first;
        }
        self.last = rhs.last;
    }
}

impl Default for PosSpan {
    fn default() -> Self {
        PosSpan::empty()
    }
}

impl Display for PosSpan {
    /// Formats as `line:col`, `line:col-col` (same line), `line:col-line:col`,
    /// or `<empty>` for an empty span.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        if self.is_empty() {
            return write!(f, "<empty>");
        }
        let (Pos(first_line, first_col), Pos(last_line, last_col)) = (self.first, self.last);
        if (first_line, first_col) == (last_line, last_col) {
            write!(f, "{first_line}:{first_col}")
        } else if first_line == last_line {
            write!(f, "{first_line}:{first_col}-{last_col}")
        } else {
            write!(f, "{first_line}:{first_col}-{last_line}:{last_col}")
        }
    }
}
279
280// ---------------------------------------------------------------------------------------------
281// Table-based lexer interpreter
282
/// Snapshot of the lexer state when an error occurred (see [Lexer::get_token]).
#[derive(Clone, PartialEq, Debug)]
pub struct LexerErrorInfo {
    // stream position, in characters
    pub pos: u64,
    // caret line at the error
    pub line: CaretLine,
    // caret column at the error
    pub col: CaretCol,
    // offending character; None at end of stream
    pub curr_char: Option<char>,
    // character group of the offending character
    pub group: GroupId,
    // DFA state when the error occurred
    pub state: StateId,
    // text scanned so far for the current token
    pub text: String,
}
293
/// Errors reported by [Lexer::get_token].
#[derive(Clone, PartialEq, Debug)]
pub enum LexerError {
    // no error (initial / cleared state)
    None,
    // get_token() was called without an attached stream
    NoStreamAttached,
    // end of stream reached in a non-accepting state
    EndOfStream { info: LexerErrorInfo },
    // character belongs to a known group but no transition accepts it
    InvalidChar { info: LexerErrorInfo },
    // character doesn't belong to any group
    UnrecognizedChar { info: LexerErrorInfo },
    // no progress detected (debug builds only, see get_token)
    InfiniteLoop { pos: u64 },
    // pop action with an empty mode stack
    EmptyStateStack { info: LexerErrorInfo }
}
304
305impl Display for LexerError {
306    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
307        match self {
308            LexerError::None => write!(f, "no error"),
309            LexerError::NoStreamAttached => write!(f, "no stream attached"),
310            LexerError::EndOfStream { info: LexerErrorInfo { pos, line, col, ..} } =>
311                write!(f, "end of stream, line {line}, col {col} (stream pos = {pos})"),
312            LexerError::InvalidChar { info: LexerErrorInfo { pos, line, col, curr_char, .. } } =>
313                write!(f, "invalid character '{}', line {line}, col {col} (stream pos = {pos})", curr_char.unwrap()),
314            LexerError::UnrecognizedChar  { info: LexerErrorInfo { pos, line, col, curr_char, .. } } =>
315                write!(f, "unrecognized character '{}', line {line}, col {col} (stream pos = {pos})", curr_char.unwrap()),
316            LexerError::InfiniteLoop { pos } =>
317                write!(f, "infinite loop (stream pos = {pos})"),
318            LexerError::EmptyStateStack { info: LexerErrorInfo { pos, line, col, curr_char, .. } } =>
319                write!(f, "pop from empty stack, line {line}, col {col}{} (stream pos = {pos})",
320                       if let Some(c) = curr_char { format!(", chr = '{c}'") } else { String::new() })
321        }
322    }
323}
324
325impl LexerError {
326    pub fn get_pos(&self) -> Option<u64> {
327        match &self {
328            LexerError::EndOfStream { info: LexerErrorInfo { pos, .. } }
329            | LexerError::InvalidChar { info: LexerErrorInfo { pos, .. } }
330            | LexerError::UnrecognizedChar { info: LexerErrorInfo { pos, .. } }
331            | LexerError::InfiniteLoop { pos }
332            | LexerError::EmptyStateStack { info: LexerErrorInfo { pos, .. } } => Some(*pos),
333            _ => None
334        }
335    }
336
337    pub fn get_line_col(&self) -> Option<(CaretLine, CaretCol)> {
338        match &self {
339            LexerError::EndOfStream { info: LexerErrorInfo { line, col, .. } }
340            | LexerError::InvalidChar { info: LexerErrorInfo { line, col, .. } }
341            | LexerError::UnrecognizedChar { info: LexerErrorInfo { line, col, .. } }
342            | LexerError::EmptyStateStack { info: LexerErrorInfo { line, col, .. } } => Some((*line, *col)),
343            _ => None
344        }
345    }
346}
347
/// Token produced by the lexer: `(token id, channel, scanned text, position span)`.
pub type LexerToken = (TokenId, ChannelId, String, PosSpan);

/// Lexical analyzer (lexer) based on tables, which scans a `Read` source and produces tokens.
///
/// The tokens can be extracted one by one with [`get_token()`](Lexer::get_token) or from an
/// iterator created by [`tokens()`](Lexer::tokens).
pub struct Lexer<'a, R> {
    // operating variables
    pub(crate) input: Option<CharReader<R>>,    // attached stream, None when detached
    pub(crate) error: LexerError,               // last error (LexerError::None when no error)
    pub(crate) is_eos: bool,                    // true once the end of stream was reached
    pub(crate) pos: u64,                        // stream position, in characters
    pub(crate) line: CaretLine,                 // current caret line (1-based)
    pub(crate) col: CaretCol,                   // current caret column (1-based)
    pub(crate) tab_width: u8,                   // tab width used by update_pos()
    pub(crate) state_stack: Vec<StateId>,       // stack of start states for mode push/pop
    pub(crate) start_state: StateId,            // start state of the current mode
    // parameters
    pub nbr_groups: u32,
    pub initial_state: StateId,
    pub first_end_state: StateId,   // accepting when state >= first_end_state
    pub nbr_states: StateId,        // error if state >= nbr_states
    // tables
    pub ascii_to_group: &'a [GroupId],          // group lookup for ASCII chars (via char_to_group)
    pub utf8_to_group: HashMap<char, GroupId>,  // group lookup for individual non-ASCII chars
    pub seg_to_group: SegMap<GroupId>,          // group lookup for char ranges
    pub state_table: &'a [StateId],             // transitions: state_table[state * nbr_groups + group]
    pub terminal_table: &'a [Terminal],  // token(state) = token_table[state - first_end_state]
}
377
impl<'a, R: Read> Lexer<'a, R> {
    /// Creates a lexer from its generated tables.
    ///
    /// No input is attached yet: call [`attach_stream`](Lexer::attach_stream) first,
    /// otherwise [`get_token`](Lexer::get_token) returns [`LexerError::NoStreamAttached`].
    pub fn new(
        // parameters
        nbr_groups: u32,
        initial_state: StateId,
        first_end_state: StateId,   // accepting when state >= first_end_state
        nbr_states: StateId,        // error if state >= nbr_states
        // tables
        ascii_to_group: &'a [GroupId],
        utf8_to_group: HashMap<char, GroupId>,
        seg_to_group: SegMap<GroupId>,
        state_table: &'a [StateId],
        terminal_table: &'a [Terminal],  // token(state) = token_table[state - first_end_state]
    ) -> Self {
        Lexer {
            input: None,
            error: LexerError::None,
            is_eos: false,
            pos: 0,
            line: 1,
            col: 1,
            tab_width: 4,   // default tab width, see set_tab_width()
            state_stack: Vec::new(),
            start_state: 0,
            nbr_groups,
            initial_state,
            first_end_state,
            nbr_states,
            ascii_to_group,
            utf8_to_group,
            seg_to_group,
            state_table,
            terminal_table,
        }
    }

    /// Attaches an input stream and resets the scanning state
    /// (position, caret, mode stack and start state).
    pub fn attach_stream(&mut self, input: CharReader<R>) {
        self.input = Some(input);
        self.is_eos = false;
        self.pos = 0;
        self.line = 1;
        self.col = 1;
        self.state_stack.clear();
        self.start_state = self.initial_state;
    }

    /// Detaches and returns the current input stream, if any.
    pub fn detach_stream(&mut self) -> Option<CharReader<R>> {
        // self.pos = None;
        self.input.take()
    }

    /// Sets the tab width used by [`update_pos`](Lexer::update_pos) to advance the column.
    pub fn set_tab_width(&mut self, width: u8) {
        self.tab_width = width;
    }

    /// Returns the current tab width.
    pub fn get_tab_width(&self) -> u8 {
        self.tab_width
    }

    /// Returns a reference to the attached stream, if any.
    pub fn stream(&self) -> Option<&CharReader<R>> {
        self.input.as_ref()
    }

    /// `true` when a stream is attached and still being read.
    pub fn is_open(&self) -> bool {
        self.input.as_ref().map(|input| input.is_reading()).unwrap_or(false)
    }

    /// Returns an iterator over the tokens of the attached stream
    /// (see [`LexInterpretIter`] for the error-reporting protocol).
    pub fn tokens(&mut self) -> LexInterpretIter<'_, 'a, R> {
        LexInterpretIter { lexer: self, error_info: None, mode: LexInterpretIterMode::Normal }
    }

    // get_token flow:
    //
    //      if input.is_none
    //          return error
    //      state = start
    //      startpos = endpos = self.(line, col)
    //      loop
    //          next char
    //          group       -> group == nbr_groups => unrecognized
    //          next_state  -> [normal] < first_end_state <= [accepting] <= nbr_states <= [invalid char]
    //          if next_state >= nbr_states || group >= nbr_groups (invalid char)
    //              if !EOS
    //                  rewind char
    //              if first_end_state <= state < nbr_states (accepting)
    //                  // process skip/push/pop:
    //                  curr_start = start
    //                  if pop
    //                      start = stack.pop()
    //                  if push(n)
    //                      stack.push(curr_start)
    //                      start = n
    //                      state = n
    //                  if !skip
    //                      return (token, channel, span(startpos, endpos))
    //                  startpos = self.(line, col)
    //                  if !EOS
    //                      state = start
    //                      continue // skip
    //              return error/EOS
    //          else
    //              endpos = self.(line, col)
    //              update self.(line, col)
    //              state = next_state
    //              pos++
    //
    /// Scans the next token from the attached stream.
    ///
    /// Returns:
    /// * `Ok(Some((token, channel, text, span)))` for the next recognized token,
    /// * `Ok(None)` at a clean end of stream (EOS reached in an accepting state),
    /// * `Err(..)` on a lexical error; the error is also stored and can be
    ///   retrieved later with [`get_error`](Lexer::get_error).
    pub fn get_token(&mut self) -> Result<Option<LexerToken>, LexerError> {
        const VERBOSE: bool = false;
        if VERBOSE { println!("lexer state_table: {}, last: {}", self.state_table.len(), self.state_table.iter().last().unwrap()); }
        self.error = LexerError::None;
        let mut text = String::new();
        let mut more_text = String::new();  // keeps previously scanned text if `more` action
        // if let Some(input) = self.input.as_mut() {
        if self.input.is_some() {
            let mut state = self.start_state;
            let mut first_pos = Pos(self.line, self.col);
            let mut last_pos = first_pos;
            // debug builds track (state, offset) pairs to detect a lexer that makes no progress
            #[cfg(debug_assertions)] let mut last_state: Option<StateId> = None;
            #[cfg(debug_assertions)] let mut last_offset: Option<u64> = None;
            #[cfg(debug_assertions)] let mut infinite_loop_cnt = 0_u32;
            loop {
                if VERBOSE { print!("- state = {state}"); }
                let input = self.input.as_mut().unwrap();
                #[cfg(debug_assertions)] {
                    if last_state.map(|st| st == state).unwrap_or(false) && last_offset.map(|offset| offset == input.get_offset()).unwrap_or(false) {
                        if infinite_loop_cnt > 3 {
                            self.error = LexerError::InfiniteLoop { pos: self.pos };
                            if VERBOSE { println!(" => Err({})", self.error); }
                            return Err(self.error.clone());
                        }
                        infinite_loop_cnt += 1;
                    } else {
                        infinite_loop_cnt = 0;
                    }
                    last_state = Some(state);
                    last_offset = Some(input.get_offset());
                }
                let c_opt = input.get_char();
                let is_eos = c_opt.is_none();
                self.is_eos = is_eos;
                // unrecognized characters (and EOS) map to the out-of-range group `nbr_groups`
                let group = c_opt.and_then(|c| char_to_group(&self.ascii_to_group, &self.utf8_to_group, &self.seg_to_group, c))
                    .unwrap_or(self.nbr_groups);
                if VERBOSE { print!(", char '{}' group {}", if let Some(c) = c_opt { escape_char(c) } else { "<EOF>".to_string() }, group); }
                // we can use the state_table even if group = error = nbr_groups (but we must
                // ignore new_state and detect that the group is illegal):
                let new_state = self.state_table[self.nbr_groups as usize * state + group as usize];
                if new_state >= self.nbr_states || group >= self.nbr_groups { // we can't do anything with the current character
                    if let Some(c) = c_opt {
                        // NOTE(review): expect(&format!(..)) builds the message eagerly on every
                        // rewind (clippy::expect_fun_call); unwrap_or_else would avoid it
                        input.rewind(c).expect(&format!("Can't rewind character '{}'", escape_char(c)));
                    }
                    let is_accepting = self.first_end_state <= state && state < self.nbr_states;
                    if is_accepting { // accepting
                        let terminal = &self.terminal_table[state - self.first_end_state];
                        if terminal.pop {
                            if self.state_stack.is_empty() {
                                self.error = LexerError::EmptyStateStack {
                                    info: LexerErrorInfo {
                                        pos: self.pos,
                                        line: self.line,
                                        col: self.col,
                                        curr_char: c_opt,
                                        group,
                                        state,
                                        text: more_text + &text,
                                    }
                                };
                                if VERBOSE { println!(" => Err({})", self.error); }
                                return Err(self.error.clone());
                            }
                            self.start_state = self.state_stack.pop().unwrap();
                            if VERBOSE { print!(", pop to {}", self.start_state); }
                        }
                        if let Some(goto_state) = terminal.mode_state {
                            if terminal.mode.is_push() {
                                self.state_stack.push(self.start_state);
                            }
                            self.start_state = goto_state;
                            if VERBOSE { print!(", {}({})", if terminal.mode.is_push() { "push" } else { "mode" }, goto_state); }
                        }
                        if let Some(token) = &terminal.get_token() {
                            if VERBOSE { println!(" => OK: token {}", token); }
                            return Ok(Some((token.clone(), terminal.channel, more_text + &text, PosSpan::new(first_pos, last_pos))));
                        }
                        // skip: the next token starts at the current caret position
                        if !terminal.action.is_more() {
                            first_pos = Pos(self.line, self.col);
                        }
                        if !is_eos { // we can't skip if <EOF> or we'll loop indefinitely
                            if VERBOSE { println!(" => {}, state {}", terminal.action, self.start_state); }
                            state = self.start_state;
                            if terminal.action.is_more() {
                                more_text.push_str(&text);
                            }
                            text.clear();
                            continue;
                        }
                    }
                    // EOF or invalid character
                    if is_eos && is_accepting {
                        return Ok(None);
                    }
                    let info = LexerErrorInfo {
                        pos: self.pos,
                        line: self.line,
                        col: self.col,
                        curr_char: c_opt,
                        group,
                        state,
                        text: more_text + &text,
                    };
                    self.error = if is_eos {
                        LexerError::EndOfStream { info }
                    } else if group >= self.nbr_groups {
                        let c = input.get_char().unwrap();   // removing the bad character (not accepting state)
                        self.update_pos(c);
                        LexerError::UnrecognizedChar { info }
                    } else {
                        let c = input.get_char().unwrap();   // removing the bad character (not accepting state)
                        self.update_pos(c);
                        LexerError::InvalidChar { info }
                    };
                    if VERBOSE { println!(" => Err({})", self.error); }
                    return Err(self.error.clone());
                } else {
                    // normal transition: consume the character and advance
                    last_pos = Pos(self.line, self.col);
                    if let Some(c) = c_opt {
                        text.push(c);
                        self.update_pos(c);
                    }
                    if VERBOSE { println!(" => state {new_state}"); }
                    state = new_state;
                }
            }
        }
        self.error = LexerError::NoStreamAttached;
        if VERBOSE { println!(" => Err({})", self.error); }
        Err(self.error.clone())
    }

    /// Updates the stream position and the caret line/column after consuming `c`:
    /// tabs advance the column to the next tab stop, newlines start a new line,
    /// carriage returns leave the caret unchanged, any other character advances
    /// the column by one.
    pub fn update_pos(&mut self, c: char) {
        match c {
            '\t' => {
                //            ↓       ↓    (if self.tab_width = 8)
                //    1234567890123456789
                // 1) ..↑                  col = 3
                //    ..→→→→→→↑            col = 3 - 2%8 + 8 = 3 - 2 + 8 = 9
                // 2) .............↑       col = 14
                //    .............→→→↑    col = 14 - 13%8 + 8 = 14 - 5 + 8 = 17
                self.col = self.col - (self.col - 1) % self.tab_width as CaretCol + self.tab_width as CaretCol;
            }
            '\n' => {
                self.line += 1;
                self.col = 1;
            }
            '\r' => {}
            _ => self.col += 1,
        }
        self.pos += 1;
    }

    /// Returns the last error (or `LexerError::None`).
    pub fn get_error(&self) -> &LexerError {
        &self.error
    }

    /// `true` when the last [`get_token`](Lexer::get_token) call stored an error.
    pub fn has_error(&self) -> bool {
        self.error != LexerError::None
    }

    /// `true` once the end of the stream has been reached.
    pub fn is_eos(&self) -> bool {
        self.is_eos
        // matches!(self.error, LexerError::EndOfStream { .. })
    }
}
650
/// Internal iterator state: `Error` means an error token is pending emission.
#[derive(Debug)]
enum LexInterpretIterMode { Normal, Error }

/// Token iterator over a [`Lexer`], created by [`Lexer::tokens`].
pub struct LexInterpretIter<'a, 'b, R> {
    lexer: &'a mut Lexer<'b, R>,
    // info of the last invalid/unrecognized-character error, pending emission
    error_info: Option<LexerErrorInfo>,
    mode: LexInterpretIterMode
}
659
660impl<'a, 'b, R: Read> Iterator for LexInterpretIter<'a, 'b, R> {
661    type Item = LexerToken; // (TokenId, ChannelId, String, CaretLine, CaretCol);
662
663    fn next(&mut self) -> Option<Self::Item> {
664        if self.lexer.is_eos {
665            None
666        } else {
667            match self.mode {
668                LexInterpretIterMode::Normal => {
669                    let t = self.lexer.get_token();
670                    match t {
671                        Ok(Some(token)) => Some(token),
672                        Err(LexerError::InvalidChar { info } | LexerError::UnrecognizedChar { info }) => {
673                            // in case of invalid or unrecognized character, the stream issues None then a special lexer tokens
674                            // that have a TokenId::MAX value and the error message in the text field
675                            self.error_info = Some(info);
676                            self.mode = LexInterpretIterMode::Error;
677                            None
678                        }
679                        _ => {
680                            None
681                        }
682                    }
683                }
684                LexInterpretIterMode::Error => {
685                    let info = self.error_info.as_ref().unwrap();
686                    self.mode = LexInterpretIterMode::Normal;
687                    let msg = format!("{}, scanned before = '{}'", self.lexer.get_error().to_string(), self.error_info.as_ref().unwrap().text);
688                    let pos = Pos(info.line, info.col);
689                    Some((TokenId::MAX, 0, msg, PosSpan::new(pos, pos)))
690                }
691            }
692        }
693    }
694}
695
696// ---------------------------------------------------------------------------------------------
697
/// Iterator adaptor that keeps tokens of one channel and hands the others to a
/// closure (see [TokenSpliterator]).
pub struct TokenSplit<I, F> {
    // underlying token iterator
    iter: I,
    // channel whose tokens are passed through
    ch: ChannelId,
    // consumer for tokens of the other channels
    f: F
}
703
pub trait TokenSpliterator: Iterator<Item=(TokenId, ChannelId, String, PosSpan)> {
    /// Splits the token iterator out of the lexer (Item: `(TokenId, ChannelId, String, PosSpan)`) based on the channel ID:
    /// * the default channel 0 is output as another iterator on `(token, string, pos_span)`, suitable for the parser
    /// * other channels are consumed by the closure `f`, which takes the parameters `(token, channel, string, pos_span)`
    ///
    /// ## Example
    /// ```ignore
    /// let tokens = lexer.tokens().split_channel0(|(tok, ch, text, pos_span)|
    ///     println!("TOKEN: channel {ch}, discarded, pos {pos_span}, Id {tok:?}, \"{text}\"")
    /// );
    /// let result = parser.parse_stream(&mut listener, tokens);
    /// ```
    fn split_channel0<F>(self, f: F) -> TokenSplit<Self, F>
    where Self: Sized,
          F: FnMut((TokenId, ChannelId, String, PosSpan))
    {
        TokenSplit { iter: self, ch: 0, f }
    }

    /// Splits the token iterator out of the lexer (Item: `(TokenId, ChannelId, String, PosSpan)`) based on the channel ID:
    /// * the channel `channel` is output as another iterator on `(token, string, pos_span)`, suitable for the parser
    /// * other channels are consumed by the closure `f`, which takes the parameters `(token, channel, string, pos_span)`
    ///
    /// ## Example
    /// ```ignore
    /// let tokens = lexer.tokens().split_channels(2, |(tok, ch, text, pos_span)|
    ///     println!("TOKEN: channel {ch}, discarded, pos {pos_span}, Id {tok:?}, \"{text}\"")
    /// );
    /// let result = parser.parse_stream(&mut listener, tokens);
    /// ```
    fn split_channels<F>(self, channel: ChannelId, f: F) -> TokenSplit<Self, F>
    where Self: Sized,
          F: FnMut((TokenId, ChannelId, String, PosSpan))
    {
        TokenSplit { iter: self, ch: channel, f }
    }

    /// Filters the token iterator out of the lexer (Item: `(TokenId, ChannelId, String, PosSpan)`) based on the channel ID:
    /// * the default channel 0 is output as another iterator on `(token, string, pos_span)`, suitable for the parser
    /// * other channels are discarded.
    ///
    /// ## Example
    /// ```ignore
    /// let tokens = lexer.tokens().keep_channel0();
    /// let result = parser.parse_stream(&mut listener, tokens);
    /// ```
    fn keep_channel0(self) -> impl Iterator<Item=(TokenId, String, PosSpan)>
    where Self: Sized
    {
        self.filter_map(|(token, ch, str, pos_span)| {
            if ch == 0 {
                Some((token, str, pos_span))
            } else {
                None
            }
        })
    }

    /// Filters the token iterator out of the lexer (Item: `(TokenId, ChannelId, String, PosSpan)`) based on the channel ID:
    /// * channel `channel` is output as another iterator on `(token, string, pos_span)`, suitable for the parser
    /// * other channels are discarded.
    ///
    /// ## Example
    /// ```ignore
    /// let tokens = lexer.tokens().keep_channel(2);
    /// let result = parser.parse_stream(&mut listener, tokens);
    /// ```
    // NOTE(review): unlike keep_channel0 (filter_map, which silently skips other
    // channels), this adaptor goes through TokenSplit, whose next() yields a `None`
    // "hiccup" for each filtered-out token — confirm both behaviors are intended
    fn keep_channel(self, channel: ChannelId) -> TokenSplit<Self, fn((TokenId, ChannelId, String, PosSpan))>
    where Self: Sized
    {
        TokenSplit { iter: self, ch: channel, f: |_| {} }
    }

    // or:
    //
    // fn keep_channel(self, channel: ChannelId) -> impl Iterator<Item=(TokenId, String, PosSpan)>
    // where Self: Sized
    // {
    //     self.filter_map(move |(token, ch, str, pos_span)| {
    //         if ch == channel {
    //             Some((token, str, pos_span))
    //         } else {
    //             None
    //         }
    //     })
    // }
}
791
792impl<I, F> Iterator for TokenSplit<I, F>
793    where I: Iterator<Item=(TokenId, ChannelId, String, PosSpan)>,
794          F: FnMut((TokenId, ChannelId, String, PosSpan))
795{
796    type Item = (TokenId, String, PosSpan);
797
798    fn next(&mut self) -> Option<Self::Item> {
799        if let Some((token, ch, str, pos_span)) = self.iter.next() {
800            if ch == self.ch {
801                Some((token, str, pos_span))
802            } else {
803                (self.f)((token, ch, str, pos_span));
804                None
805            }
806        } else {
807            None
808        }
809    }
810}
811
// blanket implementation: every iterator over lexer tokens gets the splitting/filtering adaptors
impl<I: Iterator<Item=(TokenId, ChannelId, String, PosSpan)>> TokenSpliterator for I {}