1pub(crate) mod tests;
4
5use std::collections::HashMap;
6use std::fmt::{Display, Formatter};
7use std::io::Read;
8use std::ops::{Add, AddAssign};
9use crate::segmap::{char_to_group, GroupId, SegMap};
10use crate::char_reader::{escape_char, CharReader};
11use crate::TokenId;
/// Identifier of a lexer state; used to index the state table and the state stack.
pub type StateId = usize;
/// Identifier of the channel reported with a token (the default value is 0).
pub type ChannelId = u16;
/// Identifier of a lexer mode, carried by [`ModeOption`].
pub type ModeId = u16;
18
/// Actions attached to an accepting state of the lexer.
///
/// The lexer reads one `Terminal` per accepting state from its terminal table and
/// applies, in that order: `pop`, `mode_state` (with `mode`), then `action`.
#[derive(Clone, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
pub struct Terminal {
    /// What to do with the scanned text: skip it, emit a token, or keep it and scan more.
    pub action: ActionOption,
    /// Channel reported with the emitted token; 0 is the default value.
    pub channel: ChannelId,
    /// Requested mode change, if any. The lexer only checks whether it is a `Push`.
    pub mode: ModeOption,
    /// State the lexer restarts from after this terminal, if any.
    pub mode_state: Option<StateId>,
    /// If `true`, the lexer pops its next start state from the state stack.
    pub pop: bool
}
38
39impl Terminal {
40 #[inline]
41 pub fn is_only_skip(&self) -> bool {
42 self.action.is_skip() && self.mode.is_none() && self.mode_state.is_none() && !self.pop
43 }
44
45 #[inline]
46 pub fn is_token(&self) -> bool {
47 self.action.is_token()
48 }
49
50 #[inline]
51 pub fn get_token(&self) -> Option<TokenId> {
52 self.action.get_token()
53 }
54
55 pub fn to_macro(&self) -> String {
56 let mut str = Vec::<String>::new();
57 match self.action {
58 ActionOption::Skip => str.push("term!(skip)".to_string()),
59 ActionOption::Token(t) => str.push(format!("term!(={t})")),
60 ActionOption::More => str.push("term!(more)".to_string())
61 }
62 if self.channel != 0 {
63 str.push(format!("term!(#{})", self.channel));
64 }
65 match self.mode {
66 ModeOption::None => {}
67 ModeOption::Mode(m) => str.push(format!("term!(mode {m})")),
68 ModeOption::Push(m) => str.push(format!("term!(push {m})")),
69 }
70 if let Some(id) = self.mode_state {
71 str.push(format!("term!(pushst {})", id));
72 }
73 if self.pop {
74 str.push("term!(pop)".to_string());
75 }
76 str.join(" + ")
77 }
78}
79
80impl Display for Terminal {
81 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
82 write!(f, "<{}", self.action)?;
83 if self.channel != 0 { write!(f, ",ch {}", self.channel)?; }
84 if !self.mode.is_none() || self.mode_state.is_some() {
85 match self.mode {
86 ModeOption::None => {}
87 ModeOption::Mode(m) => write!(f, ",mode({m}")?,
88 ModeOption::Push(m) => write!(f, ",push({m}")?,
89 }
90 if let Some(s) = self.mode_state { write!(f, ",state {s}")?; }
91 write!(f, ")")?;
92 }
93 if self.pop { write!(f, ",pop")?; }
94 write!(f, ">")
95 }
96}
97
98impl Add for Terminal {
99 type Output = Terminal;
100
101 fn add(self, rhs: Self) -> Self::Output {
102 Terminal {
103 action: self.action + rhs.action,
105 channel: self.channel + rhs.channel,
106 mode: if !self.mode.is_none() { self.mode } else { rhs.mode },
107 mode_state: if self.mode_state.is_some() { self.mode_state } else { rhs.mode_state },
108 pop: self.pop || rhs.pop
109 }
110 }
111}
112
/// Action performed by the lexer when it stops on an accepting state.
#[derive(Clone, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
pub enum ActionOption {
    /// The scanned text is discarded and no token is emitted (default).
    #[default] Skip,
    /// A token with the given ID is emitted.
    Token(TokenId),
    /// No token is emitted; the scanned text is kept and prepended to what is scanned next.
    More
}
119
120impl ActionOption {
121 pub fn is_skip(&self) -> bool { self == &ActionOption::Skip }
122 pub fn is_token(&self) -> bool { matches!(self, ActionOption::Token(_) ) }
123 pub fn is_more(&self) -> bool { self == &ActionOption::More }
124
125 pub fn get_token(&self) -> Option<TokenId> {
126 if let ActionOption::Token(token) = self {
127 Some(*token)
128 } else {
129 None
130 }
131 }
132}
133
134impl Add for ActionOption {
135 type Output = Self;
136
137 fn add(self, rhs: Self) -> Self::Output {
138 match self {
139 ActionOption::Skip => rhs,
140 _ => if rhs.is_skip() { self } else { panic!("can't add {self:?} and {rhs:?}") }
141 }
142 }
143}
144
145impl Display for ActionOption {
146 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
147 match self {
148 ActionOption::Skip => write!(f, "skip"),
149 ActionOption::Token(t) => write!(f, "end:{t}"),
150 ActionOption::More => write!(f, "more")
151 }
152 }
153}
154
/// Mode change requested by a terminal.
#[derive(Clone, Copy, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
pub enum ModeOption {
    /// No mode change (default).
    #[default]
    None,
    /// Change to the given mode.
    Mode(ModeId),
    /// Change to the given mode; the lexer saves its current start state on the state stack first.
    Push(ModeId)
}
162
163impl ModeOption {
164 pub fn is_none(&self) -> bool {
165 self == &ModeOption::None
166 }
167
168 pub fn is_mode(&self) -> bool {
169 matches!(self, &ModeOption::Mode(_))
170 }
171
172 pub fn is_push(&self) -> bool {
173 matches!(self, &ModeOption::Push(_))
174 }
175}
176
/// Column number of a position in the text (the lexer starts at column 1).
pub type CaretCol = u64;
/// Line number of a position in the text (the lexer starts at line 1).
pub type CaretLine = u64;
182
/// Position in the text as `Pos(line, column)`.
///
/// The derived ordering compares the line first, then the column.
#[derive(Clone, Copy, PartialEq, PartialOrd, Debug)]
pub struct Pos(pub CaretLine, pub CaretCol);
186
187impl Pos {
188 pub fn line(&self) -> CaretLine {
189 self.0
190 }
191
192 pub fn col(&self) -> CaretCol {
193 self.1
194 }
195
196 pub fn update_pos(&mut self, c: char, tab_width: CaretCol) {
197 match c {
198 '\t' => {
199 self.1 = self.1 - (self.1 - 1) % tab_width + tab_width;
206 }
207 '\n' => {
208 self.0 += 1;
209 self.1 = 1;
210 }
211 '\r' => {}
212 _ => self.1 += 1,
213 }
214 }
215}
216
217impl Display for Pos {
218 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
219 write!(f, "{}:{}", self.0, self.1)
220 }
221}
222
/// Span of text between two positions, both included.
///
/// A span whose `first` is greater than its `last` is empty.
#[derive(Clone, PartialEq, Debug)]
pub struct PosSpan {
    /// Position of the first character of the span.
    pub first: Pos,
    /// Position of the last character of the span.
    pub last: Pos,
}
230
231impl PosSpan {
232 #[inline(always)]
233 pub fn new(first: Pos, last: Pos) -> Self {
234 PosSpan { first, last }
235 }
236
237 #[inline(always)]
238 pub fn empty() -> Self {
239 PosSpan { first: Pos(1, 1), last: Pos(0, 0) }
240 }
241
242 pub fn take(&mut self) -> PosSpan {
243 std::mem::take(self)
244 }
245
246 #[inline(always)]
247 pub fn is_empty(&self) -> bool {
248 self.first > self.last
249 }
250
251 #[inline(always)]
252 pub fn is_not_empty(&self) -> bool {
253 self.first <= self.last
254 }
255
256 pub fn first(&self) -> Option<Pos> {
257 if self.is_not_empty() { Some(self.first) } else { None }
258 }
259
260 pub fn first_forced(&self) -> Pos {
261 if self.is_not_empty() { self.first } else { panic!("span is empty") }
262 }
263
264 pub fn last(&self) -> Option<Pos> {
265 if self.is_not_empty() { Some(self.last) } else { None }
266 }
267
268 pub fn last_forced(&self) -> Pos {
269 if self.is_not_empty() { self.last } else { panic!("span is empty") }
270 }
271}
272
273impl AddAssign<&PosSpan> for PosSpan {
274 fn add_assign(&mut self, rhs: &Self) {
275 match (self.is_empty(), rhs.is_empty()) {
276 (true, false) => (self.first, self.last) = (rhs.first, rhs.last),
277 (false, false) => self.last = rhs.last,
278 _ => {}
279 }
280 }
281}
282
283impl Add<&PosSpan> for &PosSpan {
284 type Output = PosSpan;
285
286 fn add(self, rhs: &PosSpan) -> Self::Output {
287 let mut sum = self.clone();
288 sum += rhs;
289 sum
290 }
291}
292
293impl Add<&PosSpan> for PosSpan {
294 type Output = PosSpan;
295
296 fn add(self, rhs: &PosSpan) -> Self::Output {
297 let mut sum = self.clone();
298 sum += rhs;
299 sum
300 }
301}
302
303impl Add<PosSpan> for PosSpan {
304 type Output = PosSpan;
305
306 fn add(mut self, rhs: PosSpan) -> Self::Output {
307 self += &rhs;
308 self
309 }
310}
311
312impl Default for PosSpan {
313 fn default() -> Self {
314 PosSpan::empty()
315 }
316}
317
318impl Display for PosSpan {
319 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
320 if self.is_not_empty() {
321 let (first, last) = (&self.first, &self.last);
322 if first == last {
323 write!(f, "{first}")
324 } else if first.0 == last.0 {
325 write!(f, "{first}-{}", last.1)
326 } else {
327 write!(f, "{first}-{last}")
328 }
329 } else {
330 write!(f, "<empty>")
331 }
332 }
333}
334
/// Context captured by the lexer when it reports an error.
#[derive(Clone, PartialEq, Debug)]
pub struct LexerErrorInfo {
    /// Value of the lexer's stream position counter when the error was raised.
    pub pos: u64,
    /// Line of the lexer's cursor when the error was raised.
    pub line: CaretLine,
    /// Column of the lexer's cursor when the error was raised.
    pub col: CaretCol,
    /// Character read last, or `None` if the end of the stream had been reached.
    pub curr_char: Option<char>,
    /// Group of `curr_char`.
    pub group: GroupId,
    /// Lexer state when the error was raised.
    pub state: StateId,
    /// Text scanned before the error.
    pub text: String,
}
348
/// Errors reported by the lexer.
#[derive(Clone, PartialEq, Debug)]
pub enum LexerError {
    /// No error.
    None,
    /// A token was requested while no stream was attached to the lexer.
    NoStreamAttached,
    /// The end of the stream was reached while the lexer wasn't in an accepting state.
    EndOfStream { info: LexerErrorInfo },
    /// A character that belongs to a group has no valid transition from the current state.
    InvalidChar { info: LexerErrorInfo },
    /// A character that doesn't belong to any group was read.
    UnrecognizedChar { info: LexerErrorInfo },
    /// The lexer stayed on the same state and stream offset too many times (debug builds only).
    InfiniteLoop { pos: u64 },
    /// A terminal requested a pop while the state stack was empty.
    EmptyStateStack { info: LexerErrorInfo }
}
359
360impl Display for LexerError {
361 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
362 match self {
363 LexerError::None => write!(f, "no error"),
364 LexerError::NoStreamAttached => write!(f, "no stream attached"),
365 LexerError::EndOfStream { info: LexerErrorInfo { pos, line, col, ..} } =>
366 write!(f, "end of stream, line {line}, col {col} (stream pos = {pos})"),
367 LexerError::InvalidChar { info: LexerErrorInfo { pos, line, col, curr_char, .. } } =>
368 write!(f, "invalid character '{}', line {line}, col {col} (stream pos = {pos})", curr_char.unwrap()),
369 LexerError::UnrecognizedChar { info: LexerErrorInfo { pos, line, col, curr_char, .. } } =>
370 write!(f, "unrecognized character '{}', line {line}, col {col} (stream pos = {pos})", curr_char.unwrap()),
371 LexerError::InfiniteLoop { pos } =>
372 write!(f, "infinite loop (stream pos = {pos})"),
373 LexerError::EmptyStateStack { info: LexerErrorInfo { pos, line, col, curr_char, .. } } =>
374 write!(f, "pop from empty stack, line {line}, col {col}{} (stream pos = {pos})",
375 if let Some(c) = curr_char { format!(", chr = '{c}'") } else { String::new() })
376 }
377 }
378}
379
380impl LexerError {
381 pub fn get_pos(&self) -> Option<u64> {
382 match &self {
383 LexerError::EndOfStream { info: LexerErrorInfo { pos, .. } }
384 | LexerError::InvalidChar { info: LexerErrorInfo { pos, .. } }
385 | LexerError::UnrecognizedChar { info: LexerErrorInfo { pos, .. } }
386 | LexerError::InfiniteLoop { pos }
387 | LexerError::EmptyStateStack { info: LexerErrorInfo { pos, .. } } => Some(*pos),
388 _ => None
389 }
390 }
391
392 pub fn get_line_col(&self) -> Option<(CaretLine, CaretCol)> {
393 match &self {
394 LexerError::EndOfStream { info: LexerErrorInfo { line, col, .. } }
395 | LexerError::InvalidChar { info: LexerErrorInfo { line, col, .. } }
396 | LexerError::UnrecognizedChar { info: LexerErrorInfo { line, col, .. } }
397 | LexerError::EmptyStateStack { info: LexerErrorInfo { line, col, .. } } => Some((*line, *col)),
398 _ => None
399 }
400 }
401}
402
/// Token produced by the lexer: `(token ID, channel, text, span of the text)`.
pub type LexerToken = (TokenId, ChannelId, String, PosSpan);
404
/// Table-driven lexer reading characters from a `CharReader<R>`.
///
/// The first group of fields is the running state of the lexer; the public fields
/// are the tables and dimensions it is created with.
pub struct Lexer<'a, R> {
    /// Attached input stream, if any.
    pub(crate) input: Option<CharReader<R>>,
    /// Error of the last call to `get_token` (`LexerError::None` if it succeeded).
    pub(crate) error: LexerError,
    /// `true` if the last character read by `get_token` was the end of the stream.
    pub(crate) is_eos: bool,
    /// Number of characters consumed through `update_pos` since the stream was attached.
    pub(crate) pos: u64,
    /// Line and column of the next character.
    pub(crate) cursor: Pos,
    /// Distance between tab stops, used to update the column on `'\t'`.
    pub(crate) tab_width: CaretCol,
    /// Start states saved by the terminals that push a mode.
    pub(crate) state_stack: Vec<StateId>,
    /// State from which the scan of the next token starts.
    pub(crate) start_state: StateId,
    /// Number of character groups, i.e. number of columns used per state in `state_table`.
    pub nbr_groups: u32,
    /// Start state set when a stream is attached.
    pub initial_state: StateId,
    /// First accepting state: states in `first_end_state..nbr_states` are accepting.
    pub first_end_state: StateId,
    /// Number of states; a transition to a value that isn't below this number is invalid.
    pub nbr_states: StateId,
    /// Character-to-group table passed to `char_to_group` (presumably for ASCII characters).
    pub ascii_to_group: &'a [GroupId],
    /// Character-to-group map passed to `char_to_group` (presumably for individual non-ASCII characters).
    pub utf8_to_group: HashMap<char, GroupId>,
    /// Segment-to-group map passed to `char_to_group` (presumably for ranges of characters).
    pub seg_to_group: SegMap<GroupId>,
    /// Transition table, indexed by `nbr_groups * state + group`.
    pub state_table: &'a [StateId],
    /// Terminals of the accepting states, indexed by `state - first_end_state`.
    pub terminal_table: &'a [Terminal],
}
431
impl<'a, R: Read> Lexer<'a, R> {
    /// Creates a lexer from its tables, with no stream attached.
    ///
    /// The cursor starts at line 1, column 1, and the tab width is 4. The start state
    /// is 0 until `attach_stream` sets it to `initial_state`.
    pub fn new(
        nbr_groups: u32,
        initial_state: StateId,
        first_end_state: StateId,
        nbr_states: StateId,
        ascii_to_group: &'a [GroupId],
        utf8_to_group: HashMap<char, GroupId>,
        seg_to_group: SegMap<GroupId>,
        state_table: &'a [StateId],
        terminal_table: &'a [Terminal],
    ) -> Self {
        Lexer {
            input: None,
            error: LexerError::None,
            is_eos: false,
            pos: 0,
            cursor: Pos(1, 1),
            tab_width: 4,
            state_stack: Vec::new(),
            start_state: 0,
            nbr_groups,
            initial_state,
            first_end_state,
            nbr_states,
            ascii_to_group,
            utf8_to_group,
            seg_to_group,
            state_table,
            terminal_table,
        }
    }

    /// Attaches `input` to the lexer and resets the scanning state: end-of-stream flag,
    /// stream position, cursor, state stack and start state.
    ///
    /// The tab width and the last error are left unchanged.
    pub fn attach_stream(&mut self, input: CharReader<R>) {
        self.input = Some(input);
        self.is_eos = false;
        self.pos = 0;
        self.cursor = Pos(1, 1);
        self.state_stack.clear();
        self.start_state = self.initial_state;
    }

    /// Detaches and returns the attached stream, or `None` if there was none.
    pub fn detach_stream(&mut self) -> Option<CharReader<R>> {
        self.input.take()
    }

    /// Sets the distance between tab stops used to compute the column after a `'\t'`.
    pub fn set_tab_width(&mut self, width: CaretCol) {
        self.tab_width = width;
    }

    /// Returns the distance between tab stops.
    pub fn get_tab_width(&self) -> CaretCol {
        self.tab_width
    }

    /// Returns a reference to the attached stream, or `None` if there is none.
    pub fn stream(&self) -> Option<&CharReader<R>> {
        self.input.as_ref()
    }

    /// Returns `true` if a stream is attached and reports that it is reading.
    pub fn is_open(&self) -> bool {
        self.input.as_ref().map(|input| input.is_reading()).unwrap_or(false)
    }

    /// Reads and discards characters until the cursor is exactly at `new_pos`.
    ///
    /// Returns an error if no stream is attached, or if the stream ends before the
    /// cursor reaches `new_pos` (which also happens when `new_pos` is before the
    /// cursor or can't be reached exactly).
    ///
    /// NOTE(review): only the cursor is updated here; `self.pos` isn't advanced as it
    /// is in `update_pos` — confirm that it's intended.
    pub fn skip_to_pos(&mut self, new_pos: Pos) -> Result<(), String> {
        if self.input.is_none() {
            return Err("no current input".to_string());
        }
        while self.cursor != new_pos {
            if let Some(c) = self.input.as_mut().unwrap().get_char() {
                self.cursor.update_pos(c, self.tab_width);
            } else {
                return Err("cannot find the position of the grammar in the lexicon".to_string());
            };
        }
        Ok(())
    }

    /// Returns an iterator that calls `get_token` to produce the tokens of the attached stream.
    pub fn tokens(&mut self) -> LexInterpretIter<'_, 'a, R> {
        LexInterpretIter { lexer: self, error_info: None, mode: LexInterpretIterMode::Normal }
    }

    /// Scans the next token of the attached stream.
    ///
    /// Returns
    /// * `Ok(Some((token, channel, text, span)))` when an accepting state with a token is reached
    /// * `Ok(None)` when the end of the stream is reached on an accepting state that has no token
    /// * `Err(LexerError)` otherwise: `NoStreamAttached`, `EndOfStream` (end of the stream on
    ///   a state that isn't accepting), `UnrecognizedChar`, `InvalidChar`, `EmptyStateStack`
    ///   and, in debug builds, `InfiniteLoop`
    ///
    /// The error is also stored in the lexer and can be read with `get_error`. After an
    /// `UnrecognizedChar` or `InvalidChar` error, the faulty character has been consumed.
    pub fn get_token(&mut self) -> Result<Option<LexerToken>, LexerError> {
        // compile-time switch for the traces printed by this method
        const VERBOSE: bool = false;
        if VERBOSE { println!("lexer state_table: {}, last: {}", self.state_table.len(), self.state_table.iter().last().unwrap()); }
        self.error = LexerError::None;
        // `text` is the text scanned since the last (re)start; `more_text` collects the text
        // of the `more` terminals crossed before
        let mut text = String::new();
        let mut more_text = String::new();
        if self.input.is_some() {
            let mut state = self.start_state;
            let mut first_pos = self.cursor;
            let mut last_pos = first_pos;
            // debug builds only: previous state and stream offset, to detect an absence of progress
            #[cfg(debug_assertions)] let mut last_state: Option<StateId> = None;
            #[cfg(debug_assertions)] let mut last_offset: Option<u64> = None;
            #[cfg(debug_assertions)] let mut infinite_loop_cnt = 0_u32;
            loop {
                if VERBOSE { print!("- state = {state}"); }
                // can't fail: `self.input.is_some()` was checked above
                #[allow(clippy::unnecessary_unwrap)]
                let input = self.input.as_mut().unwrap();
                #[cfg(debug_assertions)] {
                    // gives up once the same state has been seen at the same offset too many times in a row
                    if last_state.map(|st| st == state).unwrap_or(false) && last_offset.map(|offset| offset == input.get_offset()).unwrap_or(false) {
                        if infinite_loop_cnt > 3 {
                            self.error = LexerError::InfiniteLoop { pos: self.pos };
                            if VERBOSE { println!(" => Err({})", self.error); }
                            return Err(self.error.clone());
                        }
                        infinite_loop_cnt += 1;
                    } else {
                        infinite_loop_cnt = 0;
                    }
                    last_state = Some(state);
                    last_offset = Some(input.get_offset());
                }
                let c_opt = input.get_char();
                let is_eos = c_opt.is_none();
                self.is_eos = is_eos;
                // the end of the stream and the characters without group get `nbr_groups`, which isn't a valid group
                let group = c_opt.and_then(|c| char_to_group(self.ascii_to_group, &self.utf8_to_group, &self.seg_to_group, c))
                    .unwrap_or(self.nbr_groups);
                if VERBOSE { print!(", char '{}' group {}", if let Some(c) = c_opt { escape_char(c) } else { "<EOF>".to_string() }, group); }
                // NOTE(review): when `group == nbr_groups`, this reads the first entry of the next row
                // (one past the table for the last state unless the table is padded) — the value is
                // ignored by the test below, but confirm that the table is always long enough
                let new_state = self.state_table[self.nbr_groups as usize * state + group as usize];
                if new_state >= self.nbr_states || group >= self.nbr_groups {
                    // no valid transition: the character, if any, is given back to the stream
                    if let Some(c) = c_opt {
                        input.rewind(c).unwrap_or_else(|_| panic!("Can't rewind character '{}'", escape_char(c)));
                    }
                    let is_accepting = self.first_end_state <= state && state < self.nbr_states;
                    if is_accepting {
                        let terminal = &self.terminal_table[state - self.first_end_state];
                        if terminal.pop {
                            if self.state_stack.is_empty() {
                                self.error = LexerError::EmptyStateStack {
                                    info: LexerErrorInfo {
                                        pos: self.pos,
                                        line: self.cursor.line(),
                                        col: self.cursor.col(),
                                        curr_char: c_opt,
                                        group,
                                        state,
                                        text: more_text + &text,
                                    }
                                };
                                if VERBOSE { println!(" => Err({})", self.error); }
                                return Err(self.error.clone());
                            }
                            self.start_state = self.state_stack.pop().unwrap();
                            if VERBOSE { print!(", pop to {}", self.start_state); }
                        }
                        if let Some(goto_state) = terminal.mode_state {
                            // a push saves the current start state before changing it
                            if terminal.mode.is_push() {
                                self.state_stack.push(self.start_state);
                            }
                            self.start_state = goto_state;
                            if VERBOSE { print!(", {}({})", if terminal.mode.is_push() { "push" } else { "mode" }, goto_state); }
                        }
                        if let Some(token) = &terminal.get_token() {
                            if VERBOSE { println!(" => OK: token {}", token); }
                            return Ok(Some((*token, terminal.channel, more_text + &text, PosSpan::new(first_pos, last_pos))));
                        }
                        // skipped text isn't part of the next token, so its span starts at the cursor;
                        // `more` keeps the current start of the span
                        if !terminal.action.is_more() {
                            first_pos = self.cursor;
                        }
                        if !is_eos {
                            // no token: restarts the scan from the (possibly new) start state
                            if VERBOSE { println!(" => {}, state {}", terminal.action, self.start_state); }
                            state = self.start_state;
                            if terminal.action.is_more() {
                                more_text.push_str(&text);
                            }
                            text.clear();
                            continue;
                        }
                    }
                    // end of the stream on an accepting state without token: nothing left to return
                    if is_eos && is_accepting {
                        return Ok(None);
                    }
                    let info = LexerErrorInfo {
                        pos: self.pos,
                        line: self.cursor.line(),
                        col: self.cursor.col(),
                        curr_char: c_opt,
                        group,
                        state,
                        text: more_text + &text,
                    };
                    self.error = if is_eos {
                        LexerError::EndOfStream { info }
                    } else if group >= self.nbr_groups {
                        // reads the rewound character again to consume it, so that the next call goes past it
                        let c = input.get_char().unwrap();
                        self.update_pos(c);
                        LexerError::UnrecognizedChar { info }
                    } else {
                        // reads the rewound character again to consume it, so that the next call goes past it
                        let c = input.get_char().unwrap();
                        self.update_pos(c);
                        LexerError::InvalidChar { info }
                    };
                    if VERBOSE { println!(" => Err({})", self.error); }
                    return Err(self.error.clone());
                } else {
                    // valid transition: the character is added to the text and the scan moves to the new state
                    last_pos = self.cursor;
                    if let Some(c) = c_opt {
                        text.push(c);
                        self.update_pos(c);
                    }
                    if VERBOSE { println!(" => state {new_state}"); }
                    state = new_state;
                }
            }
        }
        self.error = LexerError::NoStreamAttached;
        if VERBOSE { println!(" => Err({})", self.error); }
        Err(self.error.clone())
    }

    /// Moves the cursor past the character `c` and increments the stream position.
    pub fn update_pos(&mut self, c: char) {
        self.cursor.update_pos(c, self.tab_width);
        self.pos += 1;
    }

    /// Returns the error of the last call to `get_token` (`LexerError::None` if it succeeded).
    pub fn get_error(&self) -> &LexerError {
        &self.error
    }

    /// Returns `true` if the last call to `get_token` reported an error.
    pub fn has_error(&self) -> bool {
        self.error != LexerError::None
    }

    /// Returns `true` if the last character read by `get_token` was the end of the stream.
    pub fn is_eos(&self) -> bool {
        self.is_eos
    }
}
700
/// State of a [`LexInterpretIter`]: `Normal` fetches the next token from the lexer,
/// `Error` returns the token describing the error that was stored on the previous call.
#[derive(Debug)]
enum LexInterpretIterMode { Normal, Error }
703
/// Iterator over the tokens of a lexer, created by `Lexer::tokens`.
pub struct LexInterpretIter<'a, 'b, R> {
    /// Lexer the tokens are fetched from.
    lexer: &'a mut Lexer<'b, R>,
    /// Information about the last invalid or unrecognized character, if any.
    error_info: Option<LexerErrorInfo>,
    /// What the next call to `next` does.
    mode: LexInterpretIterMode
}
709
710impl<'a, 'b, R: Read> Iterator for LexInterpretIter<'a, 'b, R> {
711 type Item = LexerToken; fn next(&mut self) -> Option<Self::Item> {
714 if self.lexer.is_eos {
715 None
716 } else {
717 match self.mode {
718 LexInterpretIterMode::Normal => {
719 let t = self.lexer.get_token();
720 match t {
721 Ok(Some(token)) => Some(token),
722 Err(LexerError::InvalidChar { info } | LexerError::UnrecognizedChar { info }) => {
723 self.error_info = Some(info);
726 self.mode = LexInterpretIterMode::Error;
727 None
728 }
729 _ => {
730 None
731 }
732 }
733 }
734 LexInterpretIterMode::Error => {
735 let info = self.error_info.as_ref().unwrap();
736 self.mode = LexInterpretIterMode::Normal;
737 let msg = format!("{}, scanned before = '{}'", self.lexer.get_error(), self.error_info.as_ref().unwrap().text);
738 let pos = Pos(info.line, info.col);
739 Some((TokenId::MAX, 0, msg, PosSpan::new(pos, pos)))
740 }
741 }
742 }
743 }
744}
745
/// Iterator adapter created by the methods of [`TokenSpliterator`]: it outputs the
/// tokens of one channel and hands the tokens of the other channels to a closure.
pub struct TokenSplit<I, F> {
    /// Source iterator of lexer tokens.
    iter: I,
    /// Channel of the tokens that are output.
    ch: ChannelId,
    /// Closure called with each token that belongs to another channel.
    f: F
}
753
754pub trait TokenSpliterator: Iterator<Item=LexerToken> {
755 fn split_channel0<F>(self, f: F) -> TokenSplit<Self, F>
767 where Self: Sized,
768 F: FnMut((TokenId, ChannelId, String, PosSpan))
769 {
770 TokenSplit { iter: self, ch: 0, f }
771 }
772
773 fn split_channels<F>(self, channel: ChannelId, f: F) -> TokenSplit<Self, F>
785 where Self: Sized,
786 F: FnMut(LexerToken)
787 {
788 TokenSplit { iter: self, ch: channel, f }
789 }
790
791 fn keep_channel0(self) -> impl Iterator<Item=(TokenId, String, PosSpan)>
801 where Self: Sized
802 {
803 self.filter_map(|(token, ch, str, pos_span)| {
804 if ch == 0 {
805 Some((token, str, pos_span))
806 } else {
807 None
808 }
809 })
810 }
811
812 fn keep_channel(self, channel: ChannelId) -> TokenSplit<Self, fn(LexerToken)>
822 where Self: Sized
823 {
824 TokenSplit { iter: self, ch: channel, f: |_| {} }
825 }
826
827 }
841
842impl<I, F> Iterator for TokenSplit<I, F>
843 where I: Iterator<Item=LexerToken>,
844 F: FnMut((TokenId, ChannelId, String, PosSpan))
845{
846 type Item = (TokenId, String, PosSpan);
847
848 fn next(&mut self) -> Option<Self::Item> {
849 if let Some((token, ch, str, pos_span)) = self.iter.next() {
850 if ch == self.ch {
851 Some((token, str, pos_span))
852 } else {
853 (self.f)((token, ch, str, pos_span));
854 None
855 }
856 } else {
857 None
858 }
859 }
860}
861
/// Blanket implementation: every iterator of [`LexerToken`] gets the [`TokenSpliterator`] methods.
impl<I: Iterator<Item=LexerToken>> TokenSpliterator for I {}