aplang_lib/lexer/
lexer.rs

1use crate::lexer::token::Token;
2use crate::lexer::token::TokenType::*;
3use crate::lexer::token::{LiteralValue, TokenType};
4use miette::{miette, LabeledSpan, Report, SourceSpan};
5use owo_colors::OwoColorize;
6use std::collections::HashMap;
7use std::convert::From;
8use std::fmt::Display;
9use std::sync::Arc;
10
/// Hand-written scanner that turns source text into a flat list of [`Token`]s.
pub struct Lexer {
    /// Name of the file being scanned; used for the location prefix of diagnostics.
    file_name: String,
    /// Full source text; a handle to it is stored in every emitted token.
    source: Arc<str>,

    /// Tokens produced so far, in source order.
    pub(super) tokens: Vec<Token>,

    /// Offset at which the token currently being scanned begins.
    start: usize,
    /// Offset of the next character to be consumed.
    current: usize,
    /// Current line number; starts at 1.
    line: usize,

    /// Reserved words mapped to the token type they produce.
    keywords: HashMap<&'static str, TokenType>,
}
23
24impl Lexer {
25    pub fn new(input: impl Into<Arc<str>>, file_name: String) -> Self {
26        Self {
27            file_name,
28            source: input.into(),
29            tokens: Vec::new(),
30            start: 0,
31            current: 0,
32            line: 1,
33            keywords: crate::lexer::token::get_keywords_hashmap(),
34        }
35    }
36
37    pub fn scan(
38        input: impl Into<Arc<str>>,
39        file_name: String,
40    ) -> miette::Result<Vec<Token>, Vec<Report>> {
41        let mut lexer = Self::new(input, file_name);
42        let tokens = lexer.scan_tokens()?;
43
44        Ok(tokens)
45    }
46
47    pub fn scan_tokens(&mut self) -> miette::Result<Vec<Token>, Vec<Report>> {
48        let mut errors: Vec<Report> = vec![];
49        while !self.is_at_end() {
50            // println!("({}..{})", self.start, self.current);
51            self.start = self.current;
52            match self.scan_token() {
53                Ok(_) => (),
54                Err(msg) => errors.push(msg),
55            }
56        }
57
58        // push eof token onto token stack
59        self.tokens.push(Token {
60            token_type: Eof,
61            lexeme: "<EOF>".to_string(),
62            literal: None,
63            span: SourceSpan::new(self.start.into(), 0usize),
64            line_number: self.line,
65            source: self.source.clone(), // pass a source ptr to each token
66        });
67
68        if !errors.is_empty() {
69            return Err(errors);
70        }
71
72        Ok(self.tokens.clone())
73    }
74
75    fn is_at_end(&self) -> bool {
76        self.current >= self.source.len()
77    }
78
79    fn scan_token(&mut self) -> miette::Result<()> {
80        let c = self.advance();
81
82        match c {
83            '(' => self.add_token(LeftParen),
84            ')' => self.add_token(RightParen),
85            '[' => self.add_token(LeftBracket),
86            ']' => self.add_token(RightBracket),
87            '{' => self.add_token(LeftBrace),
88            '}' => self.add_token(RightBrace),
89            ',' => self.add_token(Comma),
90            '.' => self.add_token(Dot),
91            '-' => self.add_token(Minus),
92            '+' => self.add_token(Plus),
93            '*' => self.add_token(Star),
94            ';' => self.add_token(SoftSemi),
95            '!' => {
96                if self.char_match('=') {
97                    self.add_token(BangEqual)
98                } else {
99                    let labels = vec![LabeledSpan::at(
100                        self.current_span(),
101                        "operator `!` (bang) not allowed in syntax",
102                    )];
103                    let error = miette!(
104                        labels = labels,
105                        code = "lexer::unknown_symbol::bang",
106                        help = "for logical not write `NOT` instead of `!`",
107                        "{} unknown symbol `!`",
108                        self.location_string()
109                    )
110                    .with_source_code(self.source.clone());
111
112                    return Err(error);
113                }
114            }
115            '=' => {
116                if self.char_match('=') {
117                    self.add_token(EqualEqual)
118                } else {
119                    let labels = vec![LabeledSpan::at(
120                        self.current_span(),
121                        "operator `=` (equals) not allowed in syntax",
122                    )];
123                    let error = miette!(
124                        labels = labels,
125                        code = "lexer::unknown_symbol::equals",
126                        help = "for logical equals write `==` instead of `=`\n\
127                        to assign to a variable write `<-` instead of `=`",
128                        "{} unknown symbol `=`",
129                        self.location_string()
130                    )
131                    .with_source_code(self.source.clone());
132
133                    return Err(error);
134                }
135            }
136            '<' => {
137                let token = if self.char_match('=') {
138                    LessEqual
139                } else if self.char_match('-') {
140                    Arrow
141                } else {
142                    Less
143                };
144
145                self.add_token(token)
146            }
147            '>' => {
148                let token = if self.char_match('=') {
149                    GreaterEqual
150                } else {
151                    Greater
152                };
153
154                self.add_token(token)
155            }
156            '/' => {
157                if self.char_match('/') {
158                    // comment
159                    loop {
160                        if self.peek() == '\n' || self.is_at_end() {
161                            break;
162                        }
163                        self.advance();
164                    }
165                } else {
166                    self.add_token(Slash)
167                }
168            },
169            '\\' => {
170                if !self.char_match('\n') {
171                    let labels = vec![LabeledSpan::at(
172                        self.current_span(),
173                        "must be followed by newline (\\n)",
174                    )];
175                    
176                    let error = miette!(
177                        labels = labels,
178                        help = "use \\ to escape a newline",
179                        "expected newline (\\n) following \\ instead found {}",
180                        self.peek()
181                    ).with_source_code(self.source.clone());
182                    
183                    return Err(error);
184                }
185            },
186            ' ' | '\r' | '\t' => { /* nop */ }
187            '\n' => {
188                if let Some(prev) = self.tokens.last() {
189                    self.line += 1;
190                    // use go's method of implicit semicolons
191                    // see: https://go.dev/ref/spec#Semicolons
192                    match prev.token_type {
193                        Identifier | // ident
194                        Number | StringLiteral | Null | True | False | // literal
195                        Break | Continue | Return |
196                        RightParen | RightBracket | RightBrace
197                        => {
198                            self.add_token(SoftSemi)
199                        }
200                        // otherwise ignore
201                        _ => {}
202                    }
203                };
204            }
205            '"' => self.string()?,
206            ch if ch.is_ascii_digit() => self.number()?,
207            ch if ch.is_alphanumeric() => self.identifier(),
208            ch => {
209                let labels = vec![LabeledSpan::at(
210                    self.current_span(),
211                    format!("symbol `{ch}` is not allowed in syntax"),
212                )];
213
214                let error = miette!(
215                    labels = labels,
216                    code = "lexer::unknown_symbol",
217                    "{} unknown symbol `{ch}`",
218                    self.location_string()
219                )
220                .with_source_code(self.source.clone());
221
222                return Err(error);
223            }
224        }
225
226        Ok(())
227    }
228
229    fn string(&mut self) -> miette::Result<()> {
230        let mut result = String::new();
231
232        while self.peek() != '"' && !self.is_at_end() {
233            if self.peek() == '\n' {
234                self.line += 1;
235            }
236
237            // escape codes
238            if self.peek() == '\\' {
239                self.advance(); // consume the backslash
240
241                match self.peek() {
242                    'n' => {
243                        result.push('\n');
244                        self.advance(); // consume 'n'
245                    }
246                    'r' => {
247                        result.push('\r');
248                        self.advance(); // consume 'r'
249                    }
250                    't' => {
251                        result.push('\t');
252                        self.advance(); // consume 't'
253                    }
254                    '\\' => {
255                        result.push('\\');
256                        self.advance(); // consume another '\'
257                    }
258                    '"' => {
259                        result.push('"');
260                        self.advance(); // consume the double quote
261                    }
262                    _ => {
263                        // invalid escape sequence
264                        return Err(miette!("Invalid escape sequence: \\{}", self.peek()));
265                    }
266                }
267            } else {
268                result.push(self.advance()); // add normal characters to the result
269            }
270        }
271
272        // reaching the end without closing the string should throw an error
273        if self.is_at_end() {
274            let labels = vec![
275                LabeledSpan::at_offset(self.start, "unmatched quote"),
276                LabeledSpan::at(self.current_span(), "unmatched quote"),
277            ];
278
279            let error = miette!(
280                labels = labels,
281                code = "lexer::unterminated_string",
282                help = "A string literal must end with a matching quote",
283                "{} unterminated string",
284                self.location_string()
285            )
286            .with_source_code(self.source.clone());
287
288            return Err(error);
289        }
290
291        self.advance(); // consume the closing quote
292
293        // store the parsed string literal in the token list
294        self.add_token_lit(StringLiteral, Some(LiteralValue::String(result)));
295
296        Ok(())
297    }
298
299    fn number(&mut self) -> miette::Result<()> {
300        while self.peek().is_ascii_digit() {
301            self.advance();
302        }
303
304        if self.peek() == '.' && self.peek_advance().is_ascii_digit() {
305            self.advance();
306
307            while self.peek().is_ascii_digit() {
308                self.advance();
309            }
310        }
311        let substring = &self.source[self.start..self.current];
312        let value = substring.parse::<f64>();
313
314        match value {
315            Ok(value) => self.add_token_lit(Number, Some(LiteralValue::Number(value))),
316            Err(_) => {
317                let labels = vec![LabeledSpan::at(self.current_span(), "could not parse")];
318
319                let error = miette!(
320                    labels = labels,
321                    code = "lexer::unknown_token",
322                    help = "this token might not be a valid number",
323                    "{} failed to parse `{}` into number",
324                    self.location_string(),
325                    substring
326                )
327                .with_source_code(self.source.clone());
328
329                return Err(error);
330            }
331        }
332
333        Ok(())
334    }
335
336    fn identifier(&mut self) {
337        while self.peek().is_alphanumeric() || self.peek() == '_' {
338            self.advance();
339        }
340        let substring = &self.source[self.start..self.current];
341        if let Some(keyword_token_type) = self.keywords.get(substring) {
342            self.add_token(keyword_token_type.clone());
343        } else {
344            self.add_token(Identifier)
345        }
346    }
347
348    fn peek_advance(&self) -> char {
349        if self.current + 1 >= self.source.len() {
350            return '\0';
351        }
352
353        self.source.chars().nth(self.current + 1).unwrap()
354    }
355    fn peek(&self) -> char {
356        if self.is_at_end() {
357            return '\0';
358        }
359        self.source.chars().nth(self.current).unwrap()
360    }
361
362    fn advance(&mut self) -> char {
363        let c = self.source.chars().nth(self.current).unwrap();
364        self.current += 1;
365
366        c
367    }
368
369    fn check_next(&self, ch: char) -> bool {
370        if self.is_at_end() {
371            return false;
372        }
373
374        let mut i = 1;
375        loop {
376            let next_char = self.source.chars().nth(self.current + i);
377
378            match next_char {
379                // if we're at the end, then return false
380                None => {
381                    break false;
382                }
383                Some(next_char) => {
384                    if next_char.is_whitespace() {
385                        i += 1;
386                    } else {
387                        return next_char == ch;
388                    }
389                }
390            }
391        }
392    }
393
394    fn add_token(&mut self, token_type: TokenType) {
395        self.add_token_lit(token_type, None)
396    }
397
398    fn add_token_lit(&mut self, token_type: TokenType, literal: Option<LiteralValue>) {
399        let text = self
400            .source
401            .get(self.start..self.current)
402            .expect("Internal Compiler Error, This is a BUG")
403            .to_string();
404
405        let span_len = self.current - self.start;
406
407        self.tokens.push(Token {
408            token_type,
409            lexeme: text,
410            literal,
411            line_number: self.line,
412            span: SourceSpan::new(self.start.into(), span_len),
413            source: self.source.clone(), // pass a pointer to source
414        });
415    }
416
417    fn char_match(&mut self, ch: char) -> bool {
418        if self.is_at_end() {
419            return false;
420        }
421
422        if self.source.chars().nth(self.current).unwrap() != ch {
423            false
424        } else {
425            self.current += 1;
426            true
427        }
428    }
429
430    fn current_span(&self) -> SourceSpan {
431        SourceSpan::from(self.start..self.current)
432    }
433
434    /// generate the location string for errors
435    fn location_string(&self) -> impl Display {
436        let string = format!("{}:{}:{}", self.file_name, self.line, self.start);
437        let string = string.bold();
438        let string = string.red();
439        format!("{string}")
440    }
441}