use std::io;
use core::iter::{Iterator, Peekable};
use crate::language;
use crate::debug::{DebugSymbol, TokenIndex, TokenLength};


mod token;
mod errors;
mod tests;

pub mod rules;
pub use rules::MatchResult;
use rules::LexerRule;
use rules::comments::{LineCommentRule, BlockCommentRule};

pub use token::*;
pub use errors::*;

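/// Options controlling lexer behaviour, configured through `LexerBuilder`.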
#[derive(Clone)]
pub struct LexerOptions {
    skip_comments: bool,
}

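/// Builder for assembling a rule set and constructing `Lexer`s.
///
/// A minimal usage sketch (any `LexerRule` implementation can be added;
/// `LineCommentRule` from `rules::comments` is used purely for illustration,
/// and the source only needs to yield `io::Result<char>`):
///
/// ```ignore
/// let mut lexer = LexerBuilder::new()
///     .set_skip_comments(true)
///     .add_rule(LineCommentRule::new(language::COMMENT_CHAR))
///     .build_once("1 + 2".chars().map(Ok));
///
/// // the lexer is an iterator of Result<TokenMeta, LexerError>;
/// // note that it keeps yielding EOF tokens once input is exhausted
/// let first = lexer.next();
/// ```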
pub struct LexerBuilder {
    rules: Vec<Box<dyn LexerRule>>,
    options: LexerOptions,
}

impl Default for LexerBuilder {
    fn default() -> Self { Self::new() }
}

impl LexerBuilder {
    pub fn new() -> Self {
        LexerBuilder {
            rules: Vec::new(),
            options: LexerOptions {
                skip_comments: true,
            }
        }
    }

    fn set_options(mut self, options: LexerOptions) -> Self {
        self.options = options;
        self
    }

    pub fn set_skip_comments(mut self, skip_comments: bool) -> Self {
        self.options.skip_comments = skip_comments;
        self
    }

    pub fn add_rule<R>(mut self, rule: R) -> Self
        where R: LexerRule + 'static {
        self.rules.push(Box::new(rule));
        self
    }

    pub fn insert_rule(mut self, index: usize, rule: impl LexerRule + 'static) -> Self {
        self.rules.insert(index, Box::new(rule));
        self
    }

    pub fn extend_rules(mut self, rules: impl Iterator<Item=impl LexerRule + 'static>) -> Self {
        for rule in rules {
            self.rules.push(Box::new(rule));
        }
        self
    }

    pub fn build_once<S>(self, source: S) -> Lexer<S> where S: Iterator<Item=io::Result<char>> {
        Lexer::new(source, self.options, self.rules.into_iter())
    }

    pub fn build<S>(&self, source: S) -> Lexer<S> where S: Iterator<Item=io::Result<char>> {
        Lexer::new(source, self.options.clone(), self.rules.clone().into_iter())
    }
}

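/// Splits a mutable borrow of a two-element array into disjoint mutable
/// borrows of both elements, so the current and next rule-state buffers
/// can be updated in the same pass.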
fn split_array_pair_mut<T>(pair: &mut [T; 2]) -> (&mut T, &mut T) {
    let (first, rest) = pair.split_first_mut().unwrap();
    let second = &mut rest[0];
    (first, second)
}

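/// Index of a rule in the lexer's rule vector. When several rules complete
/// on the same span, the lowest `RuleID` wins, so insertion order doubles
/// as match priority.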
type RuleID = usize;

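/// A rule-driven lexer over a fallible character source. The `active` and
/// `complete` buffers each hold two generations of rule IDs, indexed by
/// `THIS_CYCLE` and `NEXT_CYCLE` below.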
pub struct Lexer<S> where S: Iterator<Item=io::Result<char>> {
    source: Peekable<S>,
    options: LexerOptions,
    rules: Vec<Box<dyn LexerRule>>,

    current: TokenIndex,
    last: Option<char>,
    newline: bool,

    active: [Vec<RuleID>; 2],
    complete: [Vec<RuleID>; 2],
}

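// Indices into the `active`/`complete` buffer pairs: slot 0 holds the rule
// state as of the current character, slot 1 the state once the next
// character is consumed. `Lexer::scan_token` swaps the slots as it advances.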
const THIS_CYCLE: usize = 0;
const NEXT_CYCLE: usize = 1;


impl<S> Iterator for Lexer<S> where S: Iterator<Item=io::Result<char>> {
    type Item = Result<TokenMeta, LexerError>;

    fn next(&mut self) -> Option<Self::Item> { Some(self.next_token()) }
}

type PrevNextChars = (Option<char>, Option<char>);

impl<S> Lexer<S> where S: Iterator<Item=io::Result<char>> {

    pub fn new(source: S, options: LexerOptions, rules: impl Iterator<Item=Box<dyn LexerRule>>) -> Self {
        Lexer {
            options,
            source: source.peekable(),
            rules: rules.collect(),

            current: 0,
            last: None,
            newline: true,
            active: [Vec::new(), Vec::new()],
            complete: [Vec::new(), Vec::new()],
        }
    }

    fn get_next(&mut self) -> Result<Option<char>, LexerError> {
        match self.source.next() {
            None => Ok(None),
            Some(result) => match result {
                Ok(c) => Ok(Some(c)),
                Err(error) => Err(self.error(ErrorKind::IOError, self.current).caused_by(Box::new(error))),
            },
        }
    }

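    // Peeks at the next character without consuming it. `Peekable::peek`
    // only hands out a reference, so when the source is sitting on an I/O
    // error the erroring item is pulled out with `next()` in order to take
    // ownership of the error and wrap it.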
    fn peek_next(&mut self) -> Result<Option<char>, LexerError> {
        let result = match self.source.peek() {
            None => Ok(None),
            Some(Ok(c)) => Ok(Some(*c)),
            Some(Err(..)) => Err(()),
        };

        result.map_err(|_| {
            let ioerror = self.source.next().unwrap().unwrap_err();
            self.error(ErrorKind::IOError, self.current).caused_by(Box::new(ioerror))
        })
    }

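    // Consumes one character, updating `last` and the current index. Both
    // tuple slots end up holding the consumed character (callers only
    // inspect the second); overflowing `TokenIndex` is reported as
    // `SourceTooLong`.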
    fn advance(&mut self) -> Result<PrevNextChars, LexerError> {
        self.last = self.peek_next()?;
        let next = self.get_next()?;

        if next.is_some() {
            if self.current == TokenIndex::MAX {
                return Err(self.error(ErrorKind::SourceTooLong, self.current));
            }
            self.current += 1;
        }

        Ok((self.last, next))
    }

    fn peek(&mut self) -> Result<PrevNextChars, LexerError> {
        Ok((self.last, self.peek_next()?))
    }

    pub fn at_eof(&mut self) -> bool {
        self.source.peek().is_none()
    }

    fn skip_whitespace(&mut self) -> Result<(), LexerError> {
        let mut next = self.peek_next()?;
        while matches!(next, Some(c) if c.is_whitespace()) {
            if let (_, Some('\n')) = self.advance()? {
                self.newline = true;
            }
            next = self.peek_next()?;
        }
        Ok(())
    }

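    // Speculatively runs the line- and block-comment rules over the upcoming
    // input, consuming characters for as long as at least one of them still
    // matches. Returns true when characters were consumed and more input
    // remains, so the caller knows to loop again.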
    fn skip_comments(&mut self) -> Result<bool, LexerError> {
        let line_rule = LineCommentRule::new(language::COMMENT_CHAR);
        let block_rule = BlockCommentRule::new(language::NESTED_COMMENT_START, language::NESTED_COMMENT_END);

        let mut line = Some(line_rule);
        let mut block = Some(block_rule);

        let start_pos = self.current;
        loop {
            let (prev, next) = self.peek()?;
            let next = match next {
                Some(ch) => ch,
                None => break,
            };

            if let Some(rule) = line.as_mut() {
                if !rule.try_match(prev, next).is_match() {
                    line = None;
                }
            }

            if let Some(rule) = block.as_mut() {
                if !rule.try_match(prev, next).is_match() {
                    block = None;
                }
            }

            if line.is_none() && block.is_none() {
                break;
            }

            if let (_, Some('\n')) = self.advance()? {
                self.newline = true;
            }
        }

        let should_continue = !self.at_eof() && self.current > start_pos;
        Ok(should_continue)
    }

    fn reset_rules(&mut self) {
        for rule in self.rules.iter_mut() {
            rule.reset();
        }

        for idx in 0..2 {
            self.active[idx].clear();
            self.complete[idx].clear();
        }
    }

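    /// Scans the next token, first skipping whitespace and (when enabled)
    /// comments. The returned `TokenMeta::newline` flag is true when a line
    /// break was consumed since the previous token.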
    pub fn next_token(&mut self) -> Result<TokenMeta, LexerError> {
        self.skip_whitespace()?;

        if self.options.skip_comments {
            while self.skip_comments()? {
                self.skip_whitespace()?;
            }
        }

        let result = self.scan_token();
        self.newline = matches!(self.last, Some('\n'));

        result
    }

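    // Core matching loop. Every rule starts active; each consumed character
    // narrows the active set by feeding the (prev, next) pair to every rule
    // still in play. When the active set empties, the lowest-numbered rule
    // that reached a complete match wins (maximal munch with rule order as
    // the tiebreaker). A single surviving rule is handed off to
    // `exhaust_rule` so the full set doesn't have to be re-run.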
    fn scan_token(&mut self) -> Result<TokenMeta, LexerError> {
        let token_start = self.current;
        self.reset_rules();

        let (mut prev, next) = self.peek()?;
        let mut next = match next {
            Some(ch) => ch,
            None => {
                return self.token_data(Token::EOF, token_start);
            },
        };

        self.active[THIS_CYCLE].extend(0..self.rules.len());

        loop {
            {
                let (active, next_active) = split_array_pair_mut(&mut self.active);
                let (complete, next_complete) = split_array_pair_mut(&mut self.complete);

                next_active.clear();
                next_complete.clear();

                for &rule_id in active.iter() {
                    let rule = &mut self.rules[rule_id];
                    let match_result = rule.try_match(prev, next);

                    if match_result.is_match() {
                        next_active.push(rule_id);

                        if match_result.is_complete_match() {
                            next_complete.push(rule_id);
                        }
                    }
                }

                if next_active.is_empty() && !complete.is_empty() {
                    let rule_id = *complete.iter().min().unwrap();
                    let matching_rule = &mut self.rules[rule_id];
                    let token = matching_rule.get_token()
                        .map_err(|err| self.error(ErrorKind::CouldNotReadToken, token_start).caused_by(err))?;

                    return self.token_data(token, token_start);
                }
            }

            self.advance()?;

            {
                let next_active = &self.active[NEXT_CYCLE];

                if next_active.is_empty() {
                    return Err(self.error(ErrorKind::NoMatchingRule, token_start));
                }
                if next_active.len() == 1 {
                    let rule_id = next_active[0];
                    return self.exhaust_rule(rule_id, token_start);
                }

                prev = Some(next);
                next = match self.peek_next()? {
                    Some(ch) => ch,
                    None => break,
                };

                self.active.swap(0, 1);
                self.complete.swap(0, 1);
            }
        }

        let next_complete = &self.complete[NEXT_CYCLE];
        if !next_complete.is_empty() {
            let rule_id = *next_complete.iter().min().unwrap();
            let matching_rule = &mut self.rules[rule_id];
            let token = matching_rule.get_token()
                .map_err(|err| self.error(ErrorKind::CouldNotReadToken, token_start).caused_by(err))?;

            return self.token_data(token, token_start);
        }

        Err(self.error(ErrorKind::UnexpectedEOF, token_start))
    }

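    // Fast path for when only one rule is still matching: feed it characters
    // until it rejects one or input ends, then extract the token if it
    // finished in a complete match.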
    fn exhaust_rule(&mut self, rule_id: RuleID, token_start: TokenIndex) -> Result<TokenMeta, LexerError> {
        {
            let rule = &mut self.rules[rule_id];
            debug_assert!(!matches!(rule.current_state(), MatchResult::NoMatch));
        }

        loop {
            let (prev, next) = self.peek()?;
            let next = match next {
                Some(ch) => ch,
                None => break,
            };

            {
                let rule = &mut self.rules[rule_id];
                match rule.try_match(prev, next) {
                    MatchResult::NoMatch => break,
                    _ => { self.advance()?; },
                }
            }
        }

        let rule = &mut self.rules[rule_id];
        if matches!(rule.current_state(), MatchResult::CompleteMatch) {
            let token = rule.get_token()
                .map_err(|err| self.error(ErrorKind::CouldNotReadToken, token_start).caused_by(err))?;

            return self.token_data(token, token_start);
        }

        if self.at_eof() {
            Err(self.error(ErrorKind::UnexpectedEOF, token_start))
        } else {
            Err(self.error(ErrorKind::NoMatchingRule, token_start))
        }
    }

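    // Builds the debug symbol for a completed token. `TokenLength` is
    // narrower than `TokenIndex`, so an overlong span is reported as
    // `MaxTokenLengthExceeded`, with a zero-length symbol attached so the
    // error still points at the token's start.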
    fn get_symbol(start_idx: TokenIndex, end_idx: TokenIndex) -> Result<DebugSymbol, LexerError> {
        let length = TokenLength::try_from(end_idx.saturating_sub(start_idx));
        let symbol = DebugSymbol::new(start_idx, length.unwrap_or(0));

        if length.is_err() {
            Err(LexerError::new(ErrorKind::MaxTokenLengthExceeded, symbol))
        } else {
            Ok(symbol)
        }
    }

    fn token_data(&self, token: Token, token_start: TokenIndex) -> Result<TokenMeta, LexerError> {
        let symbol = Self::get_symbol(token_start, self.current)?;
        Ok(TokenMeta { token, symbol, newline: self.newline })
    }

    fn error(&self, kind: ErrorKind, token_start: TokenIndex) -> LexerError {
        let length = TokenLength::try_from(self.current.saturating_sub(token_start));
        let symbol = DebugSymbol::new(token_start, length.unwrap_or(0));
        LexerError::new(kind, symbol)
    }
}