pulsar_frontend/
lexer.rs

1// Copyright (C) 2024 Ethan Uppal. All rights reserved.
2use super::token::{Token, TokenType};
3use pulsar_utils::{
4    error::{ErrorBuilder, ErrorCode, ErrorManager, Level, Style},
5    loc::{Loc, Region, Source}
6};
7use std::{cell::RefCell, rc::Rc};
8
9/// Produces tokens from an input source.
10///
11/// # Example
12/// ```
13/// fn lex(source: Rc<Source>, error_manager: Rc<RefCell<ErrorManager>>) {
14///     let lexer = Lexer::new(source, error_manager);
15///     for token in lexer {
16///         println! {"{}", token};
17///     }
18/// }
19/// ```
20pub struct Lexer {
21    loc: Loc,
22    buffer: Vec<char>,
23    error_manager: Rc<RefCell<ErrorManager>>
24}
25
26/// Enables exploration of the lexer buffer, e.g., with [`Lexer::advance`],
27/// without side effects.
28///
29/// Note: this macro must only be invoked within the lexer.
30macro_rules! with_unwind {
31    ($self:ident in $($action:tt)*) => {
32        let old_loc = $self.loc.clone();
33        {
34            $($action)*
35        }
36        $self.loc = old_loc;
37    };
38}
39
40impl Lexer {
41    /// Constructs a lexer for the given `source`.
42    pub fn new(
43        source: Rc<Source>, error_manager: Rc<RefCell<ErrorManager>>
44    ) -> Self {
45        Lexer {
46            loc: Loc {
47                line: 1,
48                col: 1,
49                pos: 0,
50                source: source.clone()
51            },
52            buffer: source.contents().chars().collect(),
53            error_manager
54        }
55    }
56
57    /// The current character in the buffer.
58    fn current(&self) -> char {
59        self.buffer[self.loc.pos as usize]
60    }
61
62    /// Whether the lexer has no remaining characters in the
63    /// buffer.
64    fn is_eof(&self) -> bool {
65        (self.loc.pos as usize) == self.buffer.len()
66    }
67
68    /// Consumes a single character in the buffer.
69    fn advance(&mut self) {
70        if self.current() == '\n' {
71            self.loc.col = 0;
72            self.loc.line += 1;
73        }
74        self.loc.pos += 1;
75        self.loc.col += 1;
76    }
77
78    /// Consumes `n` characters in the buffer.
79    fn advance_n(&mut self, n: usize) {
80        for _ in 0..n {
81            self.advance();
82        }
83    }
84
85    /// Skips past all non-newline whitespace.
86    fn skip(&mut self) {
87        while !self.is_eof()
88            && self.current().is_whitespace()
89            && self.current() != '\n'
90        {
91            self.advance();
92        }
93    }
94
95    /// Consumes `length` characters and creates a token over those characters
96    /// with type `ty`.
97    fn make_token(&mut self, ty: TokenType, length: usize) -> Token {
98        let loc_copy = self.loc.clone();
99        self.advance_n(length);
100        let pos_copy = loc_copy.pos as usize;
101        let value: String =
102            self.buffer[pos_copy..pos_copy + length].iter().collect();
103        Token {
104            ty,
105            value,
106            loc: loc_copy
107        }
108    }
109
110    /// Requires: `current().is_numeric()`.
111    fn make_number_token(&mut self) -> Token {
112        let mut length = 0;
113        with_unwind! { self in
114            while !self.is_eof() && self.current().is_numeric() {
115                self.advance();
116                length += 1;
117            }
118        }
119        self.make_token(TokenType::Integer, length)
120    }
121
122    /// Requires: `current().is_alphabetic() || current() == '_'`.
123    fn make_identifier_token(&mut self) -> Token {
124        let mut length = 0;
125        with_unwind! { self in
126            while !self.is_eof()
127            && (self.current().is_alphanumeric() || self.current() == '_')
128            {
129                self.advance();
130                length += 1;
131            }
132        }
133        self.make_token(TokenType::Identifier, length)
134    }
135
136    /// Requires: ` current() == '@'`.
137    fn make_directive_token(&mut self) -> Option<Token> {
138        let mut length = 1;
139        with_unwind! { self in
140            self.advance();
141            if self.is_eof()
142                || !(self.current().is_alphanumeric() || self.current() == '_')
143            {
144                return None;
145            }
146            while !self.is_eof()
147                && (self.current().is_alphanumeric() || self.current() == '_')
148            {
149                    self.advance();
150                    length += 1;
151            }
152        }
153        Some(self.make_token(TokenType::Directive, length))
154    }
155}
156
/// Tries each `| "text" => { token_type }` arm in order and returns from the
/// enclosing function with `Some(token)` for the first arm whose text is
/// found at the lexer's current location. When no arm matches, the
/// `| _ { ... }` block is evaluated as the value of the macro.
///
/// Because arms are tried in order, a token must be listed before any token
/// that is a prefix of it (e.g., `"->"` before `"-"`).
///
/// Tokens made up solely of alphanumeric characters and underscores (i.e.,
/// keywords) only match at a word boundary, so that an identifier such as
/// `function` is not split into the keyword `func` followed by `tion`.
///
/// Note: this macro must only be invoked within the lexer, in a function
/// returning `Option<Token>`.
macro_rules! lex {
    ($self:ident in $(| $token:expr => {$token_type:expr})* | _ $finally:block) => {
        $(
            {
                let candidate: &str = $token;
                // The buffer holds `char`s, so the candidate is measured in
                // `char`s as well rather than in bytes.
                let candidate_length = candidate.chars().count();
                let start = $self.loc.pos as usize;
                let end = start + candidate_length;
                let is_match = $self
                    .buffer
                    .get(start..end)
                    .map_or(false, |window| {
                        window.iter().copied().eq(candidate.chars())
                    });
                let is_word = candidate
                    .chars()
                    .all(|c| c.is_alphanumeric() || c == '_');
                // A keyword immediately followed by another word character
                // is really the start of a longer identifier.
                let continues_word = is_word
                    && $self
                        .buffer
                        .get(end)
                        .map_or(false, |c| c.is_alphanumeric() || *c == '_');
                if is_match && !continues_word {
                    return Some($self.make_token($token_type, candidate_length));
                }
            }
        )*
        $finally
    };
}
172
173impl Iterator for Lexer {
174    type Item = Token;
175
176    fn next(&mut self) -> Option<Token> {
177        if self.is_eof() || self.error_manager.borrow().has_errors() {
178            return None;
179        }
180
181        self.skip();
182
183        lex! { self in
184            | "+" => { TokenType::Plus }
185            | "->" => { TokenType::Arrow }
186            | "-" => { TokenType::Minus }
187            | "*" => { TokenType::Times }
188            | "(" => { TokenType::LeftPar }
189            | ")" => { TokenType::RightPar }
190            | "{" => { TokenType::LeftBrace }
191            | "}" => { TokenType::RightBrace }
192            | "[" => { TokenType::LeftBracket }
193            | "]" => { TokenType::RightBracket }
194            | "<" => { TokenType::LeftAngle }
195            | ">" => { TokenType::RightAngle }
196            | "=" => { TokenType::Assign }
197            | ":" => { TokenType::Colon }
198            | "..." => { TokenType::Dots }
199            | "." => { TokenType::Dot }
200            | "," => { TokenType::Comma }
201            | "\n" => { TokenType::Newline }
202            | "func" => { TokenType::Func }
203            | "let" => { TokenType::Let }
204            | "return" => { TokenType::Return }
205            | "pure" => { TokenType::Pure }
206            | "map" => { TokenType::HardwareMap }
207            | _ {
208                if self.current().is_numeric() {
209                    Some(self.make_number_token())
210                } else if self.current().is_alphabetic() || self.current() == '_' {
211                    Some(self.make_identifier_token())
212                } else if self.current() == '@' {
213                    self.make_directive_token()
214                } else {
215                    let error = ErrorBuilder::new()
216                        .of_style(Style::Primary)
217                        .at_level(Level::Error)
218                        .with_code(ErrorCode::UnrecognizedCharacter)
219                        .at_region(&Region::unit(self.loc.clone()))
220                        .message("Encountered unrecognized character".into())
221                        .build();
222                    self.error_manager.borrow_mut().record(error);
223                    None
224                }
225            }
226        }
227    }
228}