// ebnf_parser/lexer.rs — streaming lexer for EBNF grammar source.

1use std::{mem, str::Chars};
2
3use crate::{
4    error::SyntaxError,
5    span::Span,
6    token::{Token, TokenKind},
7};
8
/// Emits a single-character token: records the current byte offset,
/// consumes one character, and wraps `$token` in a `Span` covering
/// exactly that character.
macro_rules! simple_token {
    ($self:ident, $token:expr) => {{
        let span_start = $self.index;
        $self.next();
        Ok(Token::new($token, Span::new(span_start, $self.index)))
    }};
}
16
/// Streaming lexer over EBNF source text with one character of lookahead.
pub struct Lexer<'src> {
    /// The complete input; token contents are zero-copy slices of it.
    text: &'src str,
    /// Iterator feeding characters into the lookahead window.
    src: Chars<'src>,
    /// Character currently under examination (`None` once input is exhausted).
    curr_char: Option<char>,
    /// One-character lookahead (`None` when fewer than two characters remain).
    next_char: Option<char>,
    /// Byte offset of `curr_char` within `text`; used to build `Span`s.
    pub(crate) index: usize,
}
24
25impl<'src> Lexer<'src> {
26    pub fn new(text: &'src str) -> Self {
27        let mut lexer = Lexer {
28            text,
29            src: text.chars(),
30            curr_char: None,
31            next_char: None,
32            index: 0,
33        };
34        lexer.next();
35        lexer.next();
36        lexer
37    }
38
39    fn next(&mut self) {
40        if let Some(curr_char) = self.curr_char {
41            self.index += curr_char.len_utf8();
42        }
43        mem::swap(&mut self.curr_char, &mut self.next_char);
44        self.next_char = self.src.next();
45    }
46
47    pub fn next_token(&mut self) -> Result<Option<Token<'src>>, SyntaxError> {
48        while let Some(' ' | '\n' | '\t' | '\r') = self.curr_char {
49            self.next();
50        }
51        if let Some(curr_char) = self.curr_char {
52            let token_result = match curr_char {
53                '{' => simple_token!(self, TokenKind::LBrace),
54                '}' => simple_token!(self, TokenKind::RBrace),
55                '[' => simple_token!(self, TokenKind::LBracket),
56                ']' => simple_token!(self, TokenKind::RBracket),
57                '(' if self.next_char != Some('*') => simple_token!(self, TokenKind::LParen),
58                ')' => simple_token!(self, TokenKind::RParen),
59                '|' => simple_token!(self, TokenKind::Pipe),
60                ',' => simple_token!(self, TokenKind::Comma),
61                ';' => simple_token!(self, TokenKind::Semicolon),
62                '=' => simple_token!(self, TokenKind::Equal),
63                '*' => simple_token!(self, TokenKind::Star),
64                '-' => simple_token!(self, TokenKind::Dash),
65                '(' => self.parse_comment(),
66                '\'' | '"' => self.parse_terminal(),
67                '?' => self.parse_special_seq(),
68                c if c.is_ascii_alphabetic() => self.parse_identifier(),
69                c if c.is_ascii_digit() => self.parse_integer(),
70                c => {
71                    let span_start = self.index;
72                    self.next();
73                    Err(SyntaxError::new(
74                        Span::new(span_start, self.index),
75                        format!("Illegal character '{}'", c).into(),
76                    ))
77                }
78            };
79            match token_result {
80                Ok(token) => Ok(Some(token)),
81                Err(err) => Err(err),
82            }
83        } else {
84            Ok(None)
85        }
86    }
87
88    fn delimeted_str(&mut self, delimeter: Option<char>) -> &'src str {
89        self.next(); // opening delimeter
90        let content_start = self.index;
91        while self.curr_char.is_some() && self.curr_char != delimeter {
92            self.next();
93        }
94        let content_end = self.index;
95        self.next(); // closing delimeter
96        &self.text[content_start..content_end]
97    }
98
99    fn parse_comment(&mut self) -> Result<Token<'src>, SyntaxError> {
100        debug_assert!(
101            self.curr_char == Some('(') && self.next_char == Some('*'),
102            "Expected '(' and '*', was {:?} and {:?}",
103            self.curr_char,
104            self.next_char,
105        );
106
107        let span_start = self.index;
108
109        self.next();
110        self.next();
111        let content_start = self.index;
112        while self.curr_char.is_some()
113            && !(self.curr_char == Some('*') && self.next_char == Some(')'))
114        {
115            self.next();
116        }
117        let content_end = self.index;
118        self.next();
119        self.next();
120
121        let content = &self.text[content_start..content_end];
122
123        Ok(Token::new(
124            TokenKind::Comment(content),
125            Span::new(span_start, self.index),
126        ))
127    }
128
129    fn parse_terminal(&mut self) -> Result<Token<'src>, SyntaxError> {
130        debug_assert!(
131            self.curr_char == Some('\'') || self.curr_char == Some('"'),
132            "Expected quote, was {:?}",
133            self.curr_char,
134        );
135
136        let quote = self.curr_char;
137        let span_start = self.index;
138        let content = self.delimeted_str(quote).trim();
139
140        Ok(Token::new(
141            TokenKind::Terminal(content),
142            Span::new(span_start, self.index),
143        ))
144    }
145
146    fn parse_special_seq(&mut self) -> Result<Token<'src>, SyntaxError> {
147        debug_assert!(
148            self.curr_char == Some('?'),
149            "Expected '?', was {:?}",
150            self.curr_char,
151        );
152
153        let span_start = self.index;
154        let content = self.delimeted_str(Some('?')).trim();
155
156        Ok(Token::new(
157            TokenKind::SpecialSeq(content),
158            Span::new(span_start, self.index),
159        ))
160    }
161
162    fn parse_identifier(&mut self) -> Result<Token<'src>, SyntaxError> {
163        debug_assert!(
164            self.curr_char.map_or(false, |c| c.is_ascii_alphabetic()),
165            "Expected letter, was {:?}",
166            self.curr_char,
167        );
168
169        let span_start = self.index;
170        let content_start = self.index;
171        self.next(); // first letter
172        while self
173            .curr_char
174            .map_or(false, |c| c.is_ascii_alphanumeric() || c == '_')
175        {
176            self.next();
177        }
178        let content_end = self.index;
179        let content = &self.text[content_start..content_end];
180
181        Ok(Token::new(
182            TokenKind::Identifier(content),
183            Span::new(span_start, self.index),
184        ))
185    }
186
187    fn parse_integer(&mut self) -> Result<Token<'src>, SyntaxError> {
188        debug_assert!(
189            self.curr_char.map_or(false, |c| c.is_ascii_digit()),
190            "Expected digit, was {:?}",
191            self.curr_char,
192        );
193
194        let span_start = self.index;
195        let content_start = self.index;
196        self.next(); // first digit
197        while self.curr_char.map_or(false, |c| c.is_ascii_digit()) {
198            self.next();
199        }
200        let content_end = self.index;
201        let slice = &self.text[content_start..content_end];
202        let num = match slice.parse() {
203            Ok(num) => num,
204            Err(_) => {
205                return Err(SyntaxError::new(
206                    Span::new(span_start, self.index),
207                    "Number does not fit into `usize` type".into(),
208                ))
209            }
210        };
211
212        Ok(Token::new(
213            TokenKind::Integer(num),
214            Span::new(span_start, self.index),
215        ))
216    }
217}