// sphinx/lexer.rs

use std::io;
use core::iter::{Iterator, Peekable};
use crate::language;
use crate::debug::{DebugSymbol, TokenIndex, TokenLength};


mod token;
mod errors;
mod tests;

pub mod rules;
pub use rules::MatchResult;
use rules::LexerRule;
use rules::comments::{LineCommentRule, BlockCommentRule};

pub use token::*;
pub use errors::*;


// Lexer Builder

#[derive(Clone)]
pub struct LexerOptions {
    skip_comments: bool,
}

pub struct LexerBuilder {
    rules: Vec<Box<dyn LexerRule>>,
    options: LexerOptions,
}

impl Default for LexerBuilder {
    fn default() -> Self { Self::new() }
}

impl LexerBuilder {
    pub fn new() -> Self {
        LexerBuilder {
            rules: Vec::new(),
            options: LexerOptions {
                skip_comments: true,
            }
        }
    }
    
    fn set_options(mut self, options: LexerOptions) -> Self {
        self.options = options;
        self
    }
    
    pub fn set_skip_comments(mut self, skip_comments: bool) -> Self {
        self.options.skip_comments = skip_comments;
        self
    }
    
    // Note: the order in which rules are added determines priority.
    // When more than one rule produces a complete match of the same length,
    // the rule added earliest wins (see the usage sketch below).
    
    pub fn add_rule<R>(mut self, rule: R) -> Self
    where R: LexerRule + 'static {
        self.rules.push(Box::new(rule));
        self
    }
    
    pub fn insert_rule<R>(mut self, index: usize, rule: R) -> Self
    where R: LexerRule + 'static {
        self.rules.insert(index, Box::new(rule));
        self
    }
    
    pub fn extend_rules(mut self, rules: impl Iterator<Item=impl LexerRule + 'static>) -> Self {
        for rule in rules {
            self.rules.push(Box::new(rule));
        }
        self
    }
    
    // less expensive than build(), but consumes the builder
    pub fn build_once<S>(self, source: S) -> Lexer<S> where S: Iterator<Item=io::Result<char>> {
        Lexer::new(source, self.options, self.rules.into_iter())
    }
    
    pub fn build<S>(&self, source: S) -> Lexer<S> where S: Iterator<Item=io::Result<char>> {
        Lexer::new(source, self.options.clone(), self.rules.clone().into_iter())
    }
}
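
// Illustrative usage sketch (an assumption for exposition; not part of the
// original module): assembling a lexer from the comment rules imported above.
// A real grammar would register its own token rules as well; earlier rules
// win priority ties. The `.map(Ok::<char, io::Error>)` adapter turns a plain
// char iterator into the `io::Result<char>` stream the builder expects.
#[cfg(test)]
#[allow(dead_code)]
fn builder_usage_sketch() {
    let source = "# arbitrary source text".chars().map(Ok::<char, io::Error>);
    
    let mut lexer = LexerBuilder::new()
        .add_rule(LineCommentRule::new(language::COMMENT_CHAR))
        .set_skip_comments(false)  // emit comment tokens instead of discarding them
        .build_once(source);
    
    // note: the Iterator impl further down never yields None (at EOF it keeps
    // returning Token::EOF), so pull tokens explicitly instead of draining it
    let _first: Result<TokenMeta, LexerError> = lexer.next_token();
}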

// Lexer

// mutably borrow both elements of a two-element array at the same time
// (the borrow checker will not allow two direct `&mut pair[i]` borrows)
fn split_array_pair_mut<T>(pair: &mut [T; 2]) -> (&mut T, &mut T) {
    let (first, rest) = pair.split_first_mut().unwrap();
    let second = &mut rest[0];
    (first, second)
}
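
// A minimal sanity check (illustrative; not part of the original test suite)
// showing that split_array_pair_mut() hands out two disjoint mutable borrows:
#[cfg(test)]
mod split_pair_sketch {
    use super::split_array_pair_mut;
    
    #[test]
    fn splits_into_disjoint_borrows() {
        let mut pair = [vec![1], vec![2]];
        let (first, second) = split_array_pair_mut(&mut pair);
        // both elements can be mutated while the other borrow is live
        first.push(10);
        second.push(20);
        assert_eq!(pair, [vec![1, 10], vec![2, 20]]);
    }
}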

// to avoid self-referential fields inside Lexer (not permitted in safe Rust),
// we pass indices into the rules Vec around instead of references
type RuleID = usize;

pub struct Lexer<S> where S: Iterator<Item=io::Result<char>> {
    source: Peekable<S>,
    options: LexerOptions,
    rules: Vec<Box<dyn LexerRule>>,
    
    current: TokenIndex, // one ahead of the current char
    last: Option<char>,
    newline: bool,
    
    // internal state used by next_token(), kept here rather than in locals
    // so the Vecs are allocated once and reused across calls
    active:   [Vec<RuleID>; 2],
    complete: [Vec<RuleID>; 2],
}

// indices for the active/complete arrays
const THIS_CYCLE: usize = 0;
const NEXT_CYCLE: usize = 1;


impl<S> Iterator for Lexer<S> where S: Iterator<Item=io::Result<char>> {
    type Item = Result<TokenMeta, LexerError>;
    
    fn next(&mut self) -> Option<Self::Item> { Some(self.next_token()) }
}

type PrevNextChars = (Option<char>, Option<char>);

impl<S> Lexer<S> where S: Iterator<Item=io::Result<char>> {
    
    pub fn new(source: S, options: LexerOptions, rules: impl Iterator<Item=Box<dyn LexerRule>>) -> Self {
        Lexer {
            options,
            source: source.peekable(),
            rules: rules.collect(),
            
            current: 0,
            last: None,
            newline: true,
            active:   [Vec::new(), Vec::new()],
            complete: [Vec::new(), Vec::new()],
        }
    }
    
    // grab the next character from source, transposing any io::Error and mapping it to LexerError
    fn get_next(&mut self) -> Result<Option<char>, LexerError> {
        match self.source.next() {
            None => Ok(None),
            Some(result) => match result {
                Ok(c) => Ok(Some(c)),
                Err(error) => Err(self.error(ErrorKind::IOError, self.current).caused_by(Box::new(error))),
            },
        }
    }
    
    fn peek_next(&mut self) -> Result<Option<char>, LexerError> {
        let result = match self.source.peek() {
            None => Ok(None),
            Some(Ok(c)) => Ok(Some(*c)),
            
            // if there is an IO error we cannot process it yet, since peek() merely borrows the error
            Some(Err(..)) => Err(()),
        };
        
        result.map_err(|_| {
            // consume the error by value so it can be boxed into a LexerError
            let ioerror = self.source.next().unwrap().unwrap_err();
            self.error(ErrorKind::IOError, self.current).caused_by(Box::new(ioerror))
        })
    }
    
    fn advance(&mut self) -> Result<PrevNextChars, LexerError> {
        self.last = self.peek_next()?;
        let next = self.get_next()?;
        
        if next.is_some() {
            if self.current == TokenIndex::MAX {
                return Err(self.error(ErrorKind::SourceTooLong, self.current));
            }
            self.current += 1;
        }
        
        Ok((self.last, next))
    }
    
    // these have to be &mut self because they can mutate the source iterator
    fn peek(&mut self) -> Result<PrevNextChars, LexerError> {
        Ok((self.last, self.peek_next()?))
    }
    
    pub fn at_eof(&mut self) -> bool {
        self.source.peek().is_none()
    }
    
    fn skip_whitespace(&mut self) -> Result<(), LexerError> {
        while matches!(self.peek_next()?, Some(ch) if ch.is_whitespace()) {
            // consume whitespace and update self.newline
            if let (_, Some('\n')) = self.advance()? {
                self.newline = true;
            }
        }
        Ok(())
    }
    
    fn skip_comments(&mut self) -> Result<bool, LexerError> {
        let line_rule = LineCommentRule::new(language::COMMENT_CHAR);
        let block_rule = BlockCommentRule::new(language::NESTED_COMMENT_START, language::NESTED_COMMENT_END);
        
        let mut line = Some(line_rule);
        let mut block = Some(block_rule);
        
        let start_pos = self.current;
        loop {
            let (prev, next) = self.peek()?;
            let next = match next {
                Some(ch) => ch,
                None => break,
            };
            
            if let Some(rule) = line.as_mut() {
                if !rule.try_match(prev, next).is_match() {
                    line = None;
                }
            }
            
            if let Some(rule) = block.as_mut() {
                if !rule.try_match(prev, next).is_match() {
                    block = None;
                }
            }
            
            if line.is_none() && block.is_none() {
                break;
            }
            
            // consume comment char and update self.newline
            if let (_, Some('\n')) = self.advance()? {
                self.newline = true;
            }
        }
        
        // continue skipping if we are not at EOF and we advanced past a comment
        let continue_ = !self.at_eof() && self.current > start_pos;
        
        Ok(continue_)
    }
    
    fn reset_rules(&mut self) {
        for rule in self.rules.iter_mut() {
            rule.reset();
        }
        
        for idx in 0..2 {
            self.active[idx].clear();
            self.complete[idx].clear();
        }
    }
    
    pub fn next_token(&mut self) -> Result<TokenMeta, LexerError> {
        self.skip_whitespace()?;
        
        if self.options.skip_comments {
            while self.skip_comments()? {
                self.skip_whitespace()?;
            }
        }
        
        let result = self.scan_token();
        self.newline = matches!(self.last, Some('\n'));
        
        result
    }
    
    fn scan_token(&mut self) -> Result<TokenMeta, LexerError> {
        
        // starting a new token
        let token_start = self.current;
        self.reset_rules();
        
        // grab the next char and feed it to all the rules;
        // any rules that no longer match are discarded
        //
        // if exactly one rule is left, stop iterating and run that rule to exhaustion
        // if nothing is left, consider rules that completed on the last iteration...
        //    if there are none, error (could not parse symbol)
        //    otherwise, stop iterating and emit a token
        //    (if more than one rule completed, the one added earliest takes priority)
        //
        // otherwise...
        //    any rules that match completely are moved to a separate Vec for the next iteration
        //    advance current to the next char
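        //
        // worked example (illustrative, assuming rules for "<" and "<=" were
        // registered, "<" first): lexing the text "<=3" proceeds as
        //    '<' : both rules match; "<" is already a complete match
        //    '=' : "<" no longer matches and is discarded; "<=" is now the
        //          only active rule, so it is run to exhaustion
        //    '3' : "<=" matches no further, but its state is a complete
        //          match, so the "<=" token is emitted and '3' is left to
        //          start the next token (maximal munch)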
        
        // check if we are already at EOF
        let (mut prev, next) = self.peek()?;
        let mut next = match next {
            Some(ch) => ch,
            None => {
                return self.token_data(Token::EOF, token_start);
            },
        };
        
        // seed the active set with the ids of all rules
        self.active[THIS_CYCLE].extend(0..self.rules.len());
        
        loop {
            
            // need to split the body of this loop into two blocks in order to keep the borrow checker happy...
            
            {
                let (active, next_active) = split_array_pair_mut(&mut self.active);
                let (complete, next_complete) = split_array_pair_mut(&mut self.complete);
                
                // println!("({}) next: {:?}", self.current, next);
                // println!("({}) active: {:?}", self.current, active);
                // println!("({}) complete: {:?}", self.current, complete);
                
                next_active.clear();
                next_complete.clear();
                
                for &rule_id in active.iter() {
                    let rule = &mut self.rules[rule_id];
                    let match_result = rule.try_match(prev, next);
                    
                    if match_result.is_match() {
                        next_active.push(rule_id);
                        
                        if match_result.is_complete_match() {
                            next_complete.push(rule_id);
                        }
                    }
                }
                
                // println!("({}) next_active: {:?}", self.current, next_active);
                // println!("({}) next_complete: {:?}", self.current, next_complete);
                
                
                // Only care about complete rules if next_active is empty ("rule of maximal munch")
                if next_active.is_empty() && !complete.is_empty() {
                    // fall back to the rules which matched completely on the previous char;
                    // do not advance the lexer, as we will revisit the current char on the next pass
                    
                    // if there is more than one complete rule, the lowest index takes priority!
                    let rule_id = *complete.iter().min().unwrap();
                    let matching_rule = &mut self.rules[rule_id];
                    let token = matching_rule.get_token()
                        .map_err(|err| self.error(ErrorKind::CouldNotReadToken, token_start).caused_by(err))?;
                    
                    return self.token_data(token, token_start);
                }
            }
            
            // commit to accepting this char (and therefore consuming it)
            self.advance()?;
            
            {
                let next_active = &self.active[NEXT_CYCLE];
                
                if next_active.is_empty() {
                    return Err(self.error(ErrorKind::NoMatchingRule, token_start));
                }
                if next_active.len() == 1 {
                    let rule_id = next_active[0];
                    return self.exhaust_rule(rule_id, token_start);
                }
                
                prev = Some(next);
                next = match self.peek_next()? {
                    Some(ch) => ch,
                    None => break,
                };
                
                // swap cycles
                self.active.swap(0, 1);
                self.complete.swap(0, 1);
            }
        }
        
        let next_complete = &self.complete[NEXT_CYCLE];
        if !next_complete.is_empty() {
            
            // if there is more than one complete rule, the lowest index takes priority!
            let rule_id = *next_complete.iter().min().unwrap();
            let matching_rule = &mut self.rules[rule_id];
            let token = matching_rule.get_token()
                .map_err(|err| self.error(ErrorKind::CouldNotReadToken, token_start).caused_by(err))?;
            
            return self.token_data(token, token_start);
        }
        
        Err(self.error(ErrorKind::UnexpectedEOF, token_start))
    }
    
    fn exhaust_rule(&mut self, rule_id: RuleID, token_start: TokenIndex) -> Result<TokenMeta, LexerError> {
        {
            let rule = &mut self.rules[rule_id];
            debug_assert!(!matches!(rule.current_state(), MatchResult::NoMatch));
        }
        
        loop {
            let (prev, next) = self.peek()?;
            let next = match next {
                Some(ch) => ch,
                None => break,
            };
            
            {
                // println!("({}) next: {:?}", self.current, next);
                let rule = &mut self.rules[rule_id];
                match rule.try_match(prev, next) {
                    MatchResult::NoMatch => break,
                    _ => { self.advance()?; },
                }
            }
        }
        
        let rule = &mut self.rules[rule_id];
        if matches!(rule.current_state(), MatchResult::CompleteMatch) {
            let token = rule.get_token()
                .map_err(|err| self.error(ErrorKind::CouldNotReadToken, token_start).caused_by(err))?;
            
            return self.token_data(token, token_start);
        }
        
        if self.at_eof() {
            Err(self.error(ErrorKind::UnexpectedEOF, token_start))
        } else {
            Err(self.error(ErrorKind::NoMatchingRule, token_start))
        }
    }
    
    fn get_symbol(start_idx: TokenIndex, end_idx: TokenIndex) -> Result<DebugSymbol, LexerError> {
        let length = TokenLength::try_from(end_idx.saturating_sub(start_idx));
        let symbol = DebugSymbol::new(start_idx, length.unwrap_or(0));
        
        if length.is_err() {
            Err(LexerError::new(ErrorKind::MaxTokenLengthExceeded, symbol))
        } else {
            Ok(symbol)
        }
    }
    
    fn token_data(&self, token: Token, token_start: TokenIndex) -> Result<TokenMeta, LexerError> {
        let symbol = Self::get_symbol(token_start, self.current)?;
        Ok(TokenMeta { token, symbol, newline: self.newline })
    }
    
    fn error(&self, kind: ErrorKind, token_start: TokenIndex) -> LexerError {
        let length = TokenLength::try_from(self.current.saturating_sub(token_start));
        let symbol = DebugSymbol::new(token_start, length.unwrap_or(0));
        LexerError::new(kind, symbol)
    }
}
456}