1pub(crate) mod tests;
4
5use std::collections::HashMap;
6use std::fmt::{Display, Formatter};
7use std::io::Read;
8use std::ops::{Add, AddAssign};
9use crate::segmap::{char_to_group, GroupId, SegMap};
10use crate::char_reader::{escape_char, CharReader};
11use crate::TokenId;
/// Identifier of a lexer state.
pub type StateId = usize;
/// Identifier of the channel a token is emitted on.
pub type ChannelId = u16;
/// Identifier of a lexer mode.
pub type ModeId = u16;
18
/// Data attached to an accepting state of the lexer: what to emit and how to change the
/// lexer's start state once that state has been matched.
#[derive(Clone, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
pub struct Terminal {
    /// What to do with the scanned text (skip it, emit a token, or keep it for the next match).
    pub action: ActionOption,
    /// Channel reported with the token; the default value is 0.
    pub channel: ChannelId,
    /// Mode change requested by the terminal, if any.
    pub mode: ModeOption,
    /// When set, state from which the lexer starts scanning the next match.
    pub mode_state: Option<StateId>,
    /// Whether the start state must be restored from the lexer's state stack.
    pub pop: bool
}
38
39impl Terminal {
40 #[inline]
41 pub fn is_only_skip(&self) -> bool {
42 self.action.is_skip() && self.mode.is_none() && self.mode_state.is_none() && !self.pop
43 }
44
45 #[inline]
46 pub fn is_token(&self) -> bool {
47 self.action.is_token()
48 }
49
50 #[inline]
51 pub fn get_token(&self) -> Option<TokenId> {
52 self.action.get_token()
53 }
54
55 pub fn to_macro(&self) -> String {
56 let mut str = Vec::<String>::new();
57 match self.action {
58 ActionOption::Skip => str.push("term!(skip)".to_string()),
59 ActionOption::Token(t) => str.push(format!("term!(={t})")),
60 ActionOption::More => str.push("term!(more)".to_string())
61 }
62 if self.channel != 0 {
63 str.push(format!("term!(#{})", self.channel));
64 }
65 match self.mode {
66 ModeOption::None => {}
67 ModeOption::Mode(m) => str.push(format!("term!(mode {m})")),
68 ModeOption::Push(m) => str.push(format!("term!(push {m})")),
69 }
70 if let Some(id) = self.mode_state {
71 str.push(format!("term!(pushst {})", id));
72 }
73 if self.pop {
74 str.push("term!(pop)".to_string());
75 }
76 str.join(" + ")
77 }
78}
79
80impl Display for Terminal {
81 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
82 write!(f, "<{}", self.action)?;
83 if self.channel != 0 { write!(f, ",ch {}", self.channel)?; }
84 if !self.mode.is_none() || self.mode_state.is_some() {
85 match self.mode {
86 ModeOption::None => {}
87 ModeOption::Mode(m) => write!(f, ",mode({m}")?,
88 ModeOption::Push(m) => write!(f, ",push({m}")?,
89 }
90 if let Some(s) = self.mode_state { write!(f, ",state {s}")?; }
91 write!(f, ")")?;
92 }
93 if self.pop { write!(f, ",pop")?; }
94 write!(f, ">")
95 }
96}
97
98impl Add for Terminal {
99 type Output = Terminal;
100
101 fn add(self, rhs: Self) -> Self::Output {
102 Terminal {
103 action: self.action + rhs.action,
105 channel: self.channel + rhs.channel,
106 mode: if !self.mode.is_none() { self.mode } else { rhs.mode },
107 mode_state: if self.mode_state.is_some() { self.mode_state } else { rhs.mode_state },
108 pop: self.pop || rhs.pop
109 }
110 }
111}
112
/// Action attached to a terminal of the lexer.
#[derive(Clone, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
pub enum ActionOption {
    /// No token is emitted (default action).
    #[default] Skip,
    /// A token with the given ID is emitted.
    Token(TokenId),
    /// No token is emitted yet; the lexer keeps the scanned text and prepends it to the
    /// text of the following match.
    More
}
119
120impl ActionOption {
121 pub fn is_skip(&self) -> bool { self == &ActionOption::Skip }
122 pub fn is_token(&self) -> bool { matches!(self, ActionOption::Token(_) ) }
123 pub fn is_more(&self) -> bool { self == &ActionOption::More }
124
125 pub fn get_token(&self) -> Option<TokenId> {
126 if let ActionOption::Token(token) = self {
127 Some(*token)
128 } else {
129 None
130 }
131 }
132}
133
134impl Add for ActionOption {
135 type Output = Self;
136
137 fn add(self, rhs: Self) -> Self::Output {
138 match self {
139 ActionOption::Skip => rhs,
140 _ => if rhs.is_skip() { self } else { panic!("can't add {self:?} and {rhs:?}") }
141 }
142 }
143}
144
145impl Display for ActionOption {
146 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
147 match self {
148 ActionOption::Skip => write!(f, "skip"),
149 ActionOption::Token(t) => write!(f, "end:{t}"),
150 ActionOption::More => write!(f, "more")
151 }
152 }
153}
154
/// Mode change attached to a terminal of the lexer.
#[derive(Clone, Copy, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
pub enum ModeOption {
    /// No mode change (default).
    #[default]
    None,
    /// Switch to the given mode.
    Mode(ModeId),
    /// Switch to the given mode; the lexer saves its current start state on its state stack first.
    Push(ModeId)
}
162
163impl ModeOption {
164 pub fn is_none(&self) -> bool {
165 self == &ModeOption::None
166 }
167
168 pub fn is_mode(&self) -> bool {
169 matches!(self, &ModeOption::Mode(_))
170 }
171
172 pub fn is_push(&self) -> bool {
173 matches!(self, &ModeOption::Push(_))
174 }
175}
176
/// Column number of the caret in the input text.
pub type CaretCol = u64;
/// Line number of the caret in the input text.
pub type CaretLine = u64;
182
/// Position in the input text, as a `(line, column)` pair.
///
/// The derived ordering compares the line first, then the column.
#[derive(Clone, Copy, PartialEq, PartialOrd, Debug)]
pub struct Pos(pub CaretLine, pub CaretCol);
186
187impl Pos {
188 pub fn line(&self) -> CaretLine {
189 self.0
190 }
191
192 pub fn col(&self) -> CaretCol {
193 self.1
194 }
195}
196
/// Span of text delimited by the positions of its first and last characters (both inclusive).
///
/// A span whose `first` position is greater than its `last` position is considered empty.
#[derive(Clone, PartialEq, Debug)]
pub struct PosSpan {
    /// Position of the first character of the span.
    pub first: Pos,
    /// Position of the last character of the span.
    pub last: Pos,
}
204
205impl PosSpan {
206 #[inline(always)]
207 pub fn new(first: Pos, last: Pos) -> Self {
208 PosSpan { first, last }
209 }
210
211 #[inline(always)]
212 pub fn empty() -> Self {
213 PosSpan { first: Pos(1, 1), last: Pos(0, 0) }
214 }
215
216 pub fn take(&mut self) -> PosSpan {
217 std::mem::take(self)
218 }
219
220 #[inline(always)]
221 pub fn is_empty(&self) -> bool {
222 self.first > self.last
223 }
224
225 #[inline(always)]
226 pub fn is_not_empty(&self) -> bool {
227 self.first <= self.last
228 }
229
230 pub fn first(&self) -> Option<Pos> {
231 if self.is_not_empty() { Some(self.first) } else { None }
232 }
233
234 pub fn first_forced(&self) -> Pos {
235 if self.is_not_empty() { self.first } else { panic!("span is empty") }
236 }
237
238 pub fn last(&self) -> Option<Pos> {
239 if self.is_not_empty() { Some(self.last) } else { None }
240 }
241
242 pub fn last_forced(&self) -> Pos {
243 if self.is_not_empty() { self.last } else { panic!("span is empty") }
244 }
245}
246
247impl AddAssign<&PosSpan> for PosSpan {
248 fn add_assign(&mut self, rhs: &Self) {
249 match (self.is_empty(), rhs.is_empty()) {
250 (true, false) => (self.first, self.last) = (rhs.first, rhs.last),
251 (false, false) => self.last = rhs.last,
252 _ => {}
253 }
254 }
255}
256
impl Default for PosSpan {
    /// The default span is the empty span.
    fn default() -> Self {
        PosSpan::empty()
    }
}
262
263impl Display for PosSpan {
264 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
265 if self.is_not_empty() {
266 let (first, last) = (&self.first, &self.last);
267 if first == last {
268 write!(f, "{}:{}", first.0, first.1)
269 } else if first.0 == last.0 {
270 write!(f, "{}:{}-{}", first.0, first.1, last.1)
271 } else {
272 write!(f, "{}:{}-{}:{}", first.0, first.1, last.0, last.1)
273 }
274 } else {
275 write!(f, "<empty>")
276 }
277 }
278}
279
/// Snapshot of the lexer's situation when an error was detected.
#[derive(Clone, PartialEq, Debug)]
pub struct LexerErrorInfo {
    /// Lexer's stream position counter (number of characters consumed so far).
    pub pos: u64,
    /// Line of the caret.
    pub line: CaretLine,
    /// Column of the caret.
    pub col: CaretCol,
    /// Character that was read when the error occurred; `None` at the end of the stream.
    pub curr_char: Option<char>,
    /// Group ID computed for `curr_char`.
    pub group: GroupId,
    /// State the lexer was in.
    pub state: StateId,
    /// Text scanned before the error.
    pub text: String,
}
293
/// Errors reported by the lexer.
#[derive(Clone, PartialEq, Debug)]
pub enum LexerError {
    /// No error.
    None,
    /// A token was requested while no input stream was attached.
    NoStreamAttached,
    /// The end of the stream was reached while the lexer was not in an accepting state.
    EndOfStream { info: LexerErrorInfo },
    /// The character belongs to a known group but has no transition from the current state.
    InvalidChar { info: LexerErrorInfo },
    /// The character doesn't belong to any group known to the lexer.
    UnrecognizedChar { info: LexerErrorInfo },
    /// The lexer kept looping on the same state without consuming input
    /// (only detected when debug assertions are enabled).
    InfiniteLoop { pos: u64 },
    /// A terminal requested a `pop` while the state stack was empty.
    EmptyStateStack { info: LexerErrorInfo }
}
304
305impl Display for LexerError {
306 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
307 match self {
308 LexerError::None => write!(f, "no error"),
309 LexerError::NoStreamAttached => write!(f, "no stream attached"),
310 LexerError::EndOfStream { info: LexerErrorInfo { pos, line, col, ..} } =>
311 write!(f, "end of stream, line {line}, col {col} (stream pos = {pos})"),
312 LexerError::InvalidChar { info: LexerErrorInfo { pos, line, col, curr_char, .. } } =>
313 write!(f, "invalid character '{}', line {line}, col {col} (stream pos = {pos})", curr_char.unwrap()),
314 LexerError::UnrecognizedChar { info: LexerErrorInfo { pos, line, col, curr_char, .. } } =>
315 write!(f, "unrecognized character '{}', line {line}, col {col} (stream pos = {pos})", curr_char.unwrap()),
316 LexerError::InfiniteLoop { pos } =>
317 write!(f, "infinite loop (stream pos = {pos})"),
318 LexerError::EmptyStateStack { info: LexerErrorInfo { pos, line, col, curr_char, .. } } =>
319 write!(f, "pop from empty stack, line {line}, col {col}{} (stream pos = {pos})",
320 if let Some(c) = curr_char { format!(", chr = '{c}'") } else { String::new() })
321 }
322 }
323}
324
325impl LexerError {
326 pub fn get_pos(&self) -> Option<u64> {
327 match &self {
328 LexerError::EndOfStream { info: LexerErrorInfo { pos, .. } }
329 | LexerError::InvalidChar { info: LexerErrorInfo { pos, .. } }
330 | LexerError::UnrecognizedChar { info: LexerErrorInfo { pos, .. } }
331 | LexerError::InfiniteLoop { pos }
332 | LexerError::EmptyStateStack { info: LexerErrorInfo { pos, .. } } => Some(*pos),
333 _ => None
334 }
335 }
336
337 pub fn get_line_col(&self) -> Option<(CaretLine, CaretCol)> {
338 match &self {
339 LexerError::EndOfStream { info: LexerErrorInfo { line, col, .. } }
340 | LexerError::InvalidChar { info: LexerErrorInfo { line, col, .. } }
341 | LexerError::UnrecognizedChar { info: LexerErrorInfo { line, col, .. } }
342 | LexerError::EmptyStateStack { info: LexerErrorInfo { line, col, .. } } => Some((*line, *col)),
343 _ => None
344 }
345 }
346}
347
/// Token produced by the lexer: token ID, channel, scanned text, and span of that text.
pub type LexerToken = (TokenId, ChannelId, String, PosSpan);
349
/// Table-driven lexer reading characters from a [`CharReader`] and producing tokens.
pub struct Lexer<'a, R> {
    /// Attached input stream, if any.
    pub(crate) input: Option<CharReader<R>>,
    /// Error status of the last call to `get_token`.
    pub(crate) error: LexerError,
    /// Whether the last read reached the end of the stream.
    pub(crate) is_eos: bool,
    /// Position counter, incremented for each character consumed.
    pub(crate) pos: u64,
    /// Line of the caret (starts at 1).
    pub(crate) line: CaretLine,
    /// Column of the caret (starts at 1).
    pub(crate) col: CaretCol,
    /// Distance between tab stops, used to update the column on a tab character.
    pub(crate) tab_width: u8,
    /// Saved start states, pushed and popped by the terminals.
    pub(crate) state_stack: Vec<StateId>,
    /// State from which the scan of the next match starts.
    pub(crate) start_state: StateId,
    /// Number of character groups; also used as the group of end-of-stream and of
    /// characters that don't belong to any group.
    pub nbr_groups: u32,
    /// Start state set when a stream is attached.
    pub initial_state: StateId,
    /// First accepting state: states in `first_end_state..nbr_states` are accepting.
    pub first_end_state: StateId,
    /// Number of states; a transition to a value at or above it means "no transition".
    pub nbr_states: StateId,
    /// Character-to-group lookup data, passed to `char_to_group`.
    pub ascii_to_group: &'a [GroupId],
    /// Character-to-group lookup data, passed to `char_to_group`.
    pub utf8_to_group: HashMap<char, GroupId>,
    /// Character-to-group lookup data, passed to `char_to_group`.
    pub seg_to_group: SegMap<GroupId>,
    /// Transition table, indexed by `nbr_groups * state + group`.
    pub state_table: &'a [StateId],
    /// Terminals of the accepting states, indexed by `state - first_end_state`.
    pub terminal_table: &'a [Terminal],
}
377
378impl<'a, R: Read> Lexer<'a, R> {
    /// Creates a lexer from its tables. No stream is attached yet: the caret is set to
    /// line 1, column 1, the tab width to 4, and the state stack is empty.
    pub fn new(
        nbr_groups: u32,
        initial_state: StateId,
        first_end_state: StateId,
        nbr_states: StateId,
        ascii_to_group: &'a [GroupId],
        utf8_to_group: HashMap<char, GroupId>,
        seg_to_group: SegMap<GroupId>,
        state_table: &'a [StateId],
        terminal_table: &'a [Terminal],
    ) -> Self {
        Lexer {
            input: None,
            error: LexerError::None,
            is_eos: false,
            pos: 0,
            line: 1,
            col: 1,
            tab_width: 4,
            state_stack: Vec::new(),
            // `attach_stream` sets the start state to `initial_state` before any scanning.
            start_state: 0,
            nbr_groups,
            initial_state,
            first_end_state,
            nbr_states,
            ascii_to_group,
            utf8_to_group,
            seg_to_group,
            state_table,
            terminal_table,
        }
    }
413
414 pub fn attach_stream(&mut self, input: CharReader<R>) {
415 self.input = Some(input);
416 self.is_eos = false;
417 self.pos = 0;
418 self.line = 1;
419 self.col = 1;
420 self.state_stack.clear();
421 self.start_state = self.initial_state;
422 }
423
    /// Detaches and returns the input stream, if one was attached.
    pub fn detach_stream(&mut self) -> Option<CharReader<R>> {
        self.input.take()
    }
428
    /// Sets the distance between tab stops, used to compute the column after a tab character.
    pub fn set_tab_width(&mut self, width: u8) {
        self.tab_width = width;
    }
432
    /// Returns the distance between tab stops.
    pub fn get_tab_width(&self) -> u8 {
        self.tab_width
    }
436
    /// Returns a reference to the attached input stream, if any.
    pub fn stream(&self) -> Option<&CharReader<R>> {
        self.input.as_ref()
    }
440
441 pub fn is_open(&self) -> bool {
442 self.input.as_ref().map(|input| input.is_reading()).unwrap_or(false)
443 }
444
    /// Returns an iterator that pulls tokens from this lexer.
    pub fn tokens(&mut self) -> LexInterpretIter<'_, 'a, R> {
        LexInterpretIter { lexer: self, error_info: None, mode: LexInterpretIterMode::Normal }
    }
448
    /// Scans the input for the next token.
    ///
    /// Returns
    /// * `Ok(Some((token, channel, text, span)))` when a token was recognized,
    /// * `Ok(None)` when the end of the stream was reached in an accepting state whose
    ///   terminal doesn't emit a token,
    /// * `Err(error)` otherwise; the same error is also stored and can be read back with
    ///   `get_error`.
    ///
    /// The error status is reset to `LexerError::None` at the beginning of each call.
    pub fn get_token(&mut self) -> Result<Option<LexerToken>, LexerError> {
        const VERBOSE: bool = false;
        if VERBOSE { println!("lexer state_table: {}, last: {}", self.state_table.len(), self.state_table.iter().last().unwrap()); }
        self.error = LexerError::None;
        // `text` holds the characters of the current match, `more_text` what was kept
        // from previous matches whose action was `more`.
        let mut text = String::new();
        let mut more_text = String::new();
        if self.input.is_some() {
            let mut state = self.start_state;
            let mut first_pos = Pos(self.line, self.col);
            let mut last_pos = first_pos;
            // Infinite-loop detection, only compiled when debug assertions are enabled.
            #[cfg(debug_assertions)] let mut last_state: Option<StateId> = None;
            #[cfg(debug_assertions)] let mut last_offset: Option<u64> = None;
            #[cfg(debug_assertions)] let mut infinite_loop_cnt = 0_u32;
            loop {
                if VERBOSE { print!("- state = {state}"); }
                let input = self.input.as_mut().unwrap();
                #[cfg(debug_assertions)] {
                    // Same state and same stream offset as in the previous iteration:
                    // nothing was consumed. Give up after a few repetitions.
                    if last_state.map(|st| st == state).unwrap_or(false) && last_offset.map(|offset| offset == input.get_offset()).unwrap_or(false) {
                        if infinite_loop_cnt > 3 {
                            self.error = LexerError::InfiniteLoop { pos: self.pos };
                            if VERBOSE { println!(" => Err({})", self.error); }
                            return Err(self.error.clone());
                        }
                        infinite_loop_cnt += 1;
                    } else {
                        infinite_loop_cnt = 0;
                    }
                    last_state = Some(state);
                    last_offset = Some(input.get_offset());
                }
                let c_opt = input.get_char();
                let is_eos = c_opt.is_none();
                self.is_eos = is_eos;
                // End of stream and characters without a group both get the group `nbr_groups`.
                let group = c_opt.and_then(|c| char_to_group(&self.ascii_to_group, &self.utf8_to_group, &self.seg_to_group, c))
                    .unwrap_or(self.nbr_groups);
                if VERBOSE { print!(", char '{}' group {}", if let Some(c) = c_opt { escape_char(c) } else { "<EOF>".to_string() }, group); }
                let new_state = self.state_table[self.nbr_groups as usize * state + group as usize];
                if new_state >= self.nbr_states || group >= self.nbr_groups {
                    // No transition: the character isn't part of the current match, so it's
                    // given back to the stream.
                    if let Some(c) = c_opt {
                        input.rewind(c).expect(&format!("Can't rewind character '{}'", escape_char(c)));
                    }
                    let is_accepting = self.first_end_state <= state && state < self.nbr_states;
                    if is_accepting {
                        let terminal = &self.terminal_table[state - self.first_end_state];
                        if terminal.pop {
                            if self.state_stack.is_empty() {
                                self.error = LexerError::EmptyStateStack {
                                    info: LexerErrorInfo {
                                        pos: self.pos,
                                        line: self.line,
                                        col: self.col,
                                        curr_char: c_opt,
                                        group,
                                        state,
                                        text: more_text + &text,
                                    }
                                };
                                if VERBOSE { println!(" => Err({})", self.error); }
                                return Err(self.error.clone());
                            }
                            self.start_state = self.state_stack.pop().unwrap();
                            if VERBOSE { print!(", pop to {}", self.start_state); }
                        }
                        if let Some(goto_state) = terminal.mode_state {
                            // A `push` saves the current start state before switching.
                            if terminal.mode.is_push() {
                                self.state_stack.push(self.start_state);
                            }
                            self.start_state = goto_state;
                            if VERBOSE { print!(", {}({})", if terminal.mode.is_push() { "push" } else { "mode" }, goto_state); }
                        }
                        if let Some(token) = &terminal.get_token() {
                            if VERBOSE { println!(" => OK: token {}", token); }
                            return Ok(Some((token.clone(), terminal.channel, more_text + &text, PosSpan::new(first_pos, last_pos))));
                        }
                        // After a skip, the span of the next match starts at the current caret;
                        // after a `more`, the start of the span is kept.
                        if !terminal.action.is_more() {
                            first_pos = Pos(self.line, self.col);
                        }
                        if !is_eos {
                            if VERBOSE { println!(" => {}, state {}", terminal.action, self.start_state); }
                            state = self.start_state;
                            if terminal.action.is_more() {
                                more_text.push_str(&text);
                            }
                            text.clear();
                            continue;
                        }
                    }
                    if is_eos && is_accepting {
                        return Ok(None);
                    }
                    let info = LexerErrorInfo {
                        pos: self.pos,
                        line: self.line,
                        col: self.col,
                        curr_char: c_opt,
                        group,
                        state,
                        text: more_text + &text,
                    };
                    self.error = if is_eos {
                        LexerError::EndOfStream { info }
                    } else if group >= self.nbr_groups {
                        // Reads the offending character again (it was rewound above) and
                        // moves the caret past it.
                        let c = input.get_char().unwrap();
                        self.update_pos(c);
                        LexerError::UnrecognizedChar { info }
                    } else {
                        // Reads the offending character again (it was rewound above) and
                        // moves the caret past it.
                        let c = input.get_char().unwrap();
                        self.update_pos(c);
                        LexerError::InvalidChar { info }
                    };
                    if VERBOSE { println!(" => Err({})", self.error); }
                    return Err(self.error.clone());
                } else {
                    // The character is accepted: it's added to the current match, and the
                    // caret position before the update becomes the last position of the span.
                    last_pos = Pos(self.line, self.col);
                    if let Some(c) = c_opt {
                        text.push(c);
                        self.update_pos(c);
                    }
                    if VERBOSE { println!(" => state {new_state}"); }
                    state = new_state;
                }
            }
        }
        self.error = LexerError::NoStreamAttached;
        if VERBOSE { println!(" => Err({})", self.error); }
        Err(self.error.clone())
    }
615
616 pub fn update_pos(&mut self, c: char) {
617 match c {
618 '\t' => {
619 self.col = self.col - (self.col - 1) % self.tab_width as CaretCol + self.tab_width as CaretCol;
626 }
627 '\n' => {
628 self.line += 1;
629 self.col = 1;
630 }
631 '\r' => {}
632 _ => self.col += 1,
633 }
634 self.pos += 1;
635 }
636
    /// Returns the error status of the last call to `get_token`.
    pub fn get_error(&self) -> &LexerError {
        &self.error
    }
640
641 pub fn has_error(&self) -> bool {
642 self.error != LexerError::None
643 }
644
    /// Returns `true` when the last read reached the end of the stream.
    pub fn is_eos(&self) -> bool {
        self.is_eos
    }
649}
650
/// Mode of [`LexInterpretIter`]: `Normal` pulls the next token from the lexer, `Error`
/// makes the next call report the error that was recorded.
#[derive(Debug)]
enum LexInterpretIterMode { Normal, Error }
653
/// Iterator over the tokens of a [`Lexer`], created by `Lexer::tokens`.
pub struct LexInterpretIter<'a, 'b, R> {
    /// Lexer the tokens are pulled from.
    lexer: &'a mut Lexer<'b, R>,
    /// Information about the last invalid or unrecognized character error, if any.
    error_info: Option<LexerErrorInfo>,
    /// What the next call to `next` does.
    mode: LexInterpretIterMode
}
659
660impl<'a, 'b, R: Read> Iterator for LexInterpretIter<'a, 'b, R> {
661 type Item = LexerToken; fn next(&mut self) -> Option<Self::Item> {
664 if self.lexer.is_eos {
665 None
666 } else {
667 match self.mode {
668 LexInterpretIterMode::Normal => {
669 let t = self.lexer.get_token();
670 match t {
671 Ok(Some(token)) => Some(token),
672 Err(LexerError::InvalidChar { info } | LexerError::UnrecognizedChar { info }) => {
673 self.error_info = Some(info);
676 self.mode = LexInterpretIterMode::Error;
677 None
678 }
679 _ => {
680 None
681 }
682 }
683 }
684 LexInterpretIterMode::Error => {
685 let info = self.error_info.as_ref().unwrap();
686 self.mode = LexInterpretIterMode::Normal;
687 let msg = format!("{}, scanned before = '{}'", self.lexer.get_error().to_string(), self.error_info.as_ref().unwrap().text);
688 let pos = Pos(info.line, info.col);
689 Some((TokenId::MAX, 0, msg, PosSpan::new(pos, pos)))
690 }
691 }
692 }
693 }
694}
695
/// Iterator adapter that yields the tokens of one channel (without the channel ID) and
/// hands the tokens of the other channels to a closure.
pub struct TokenSplit<I, F> {
    /// Source of tokens.
    iter: I,
    /// Channel whose tokens are yielded.
    ch: ChannelId,
    /// Closure receiving the tokens of the other channels.
    f: F
}
703
704pub trait TokenSpliterator: Iterator<Item=(TokenId, ChannelId, String, PosSpan)> {
705 fn split_channel0<F>(self, f: F) -> TokenSplit<Self, F>
717 where Self: Sized,
718 F: FnMut((TokenId, ChannelId, String, PosSpan))
719 {
720 TokenSplit { iter: self, ch: 0, f }
721 }
722
723 fn split_channels<F>(self, channel: ChannelId, f: F) -> TokenSplit<Self, F>
735 where Self: Sized,
736 F: FnMut((TokenId, ChannelId, String, PosSpan))
737 {
738 TokenSplit { iter: self, ch: channel, f }
739 }
740
741 fn keep_channel0(self) -> impl Iterator<Item=(TokenId, String, PosSpan)>
751 where Self: Sized
752 {
753 self.filter_map(|(token, ch, str, pos_span)| {
754 if ch == 0 {
755 Some((token, str, pos_span))
756 } else {
757 None
758 }
759 })
760 }
761
762 fn keep_channel(self, channel: ChannelId) -> TokenSplit<Self, fn((TokenId, ChannelId, String, PosSpan))>
772 where Self: Sized
773 {
774 TokenSplit { iter: self, ch: channel, f: |_| {} }
775 }
776
777 }
791
792impl<I, F> Iterator for TokenSplit<I, F>
793 where I: Iterator<Item=(TokenId, ChannelId, String, PosSpan)>,
794 F: FnMut((TokenId, ChannelId, String, PosSpan))
795{
796 type Item = (TokenId, String, PosSpan);
797
798 fn next(&mut self) -> Option<Self::Item> {
799 if let Some((token, ch, str, pos_span)) = self.iter.next() {
800 if ch == self.ch {
801 Some((token, str, pos_span))
802 } else {
803 (self.f)((token, ch, str, pos_span));
804 None
805 }
806 } else {
807 None
808 }
809 }
810}
811
/// Makes the [`TokenSpliterator`] methods available on every iterator of
/// `(token, channel, text, span)` items.
impl<I: Iterator<Item=(TokenId, ChannelId, String, PosSpan)>> TokenSpliterator for I {}