Skip to main content

oak_apl/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// Token type definitions.
3pub mod token_type;
4
5pub use token_type::AplTokenType;
6
7use crate::language::AplLanguage;
8use oak_core::{
9    Lexer, LexerCache, LexerState, OakError,
10    lexer::{LexOutput, WhitespaceConfig},
11    source::Source,
12};
13use std::sync::LazyLock;
14
15pub(crate) type State<'a, S> = LexerState<'a, S, AplLanguage>;
16
17static APL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
18
19/// Lexer for the APL language.
20#[derive(Clone, Debug)]
21pub struct AplLexer<'config> {
22    /// The language configuration.
23    pub config: &'config AplLanguage,
24}
25
26impl<'config> Lexer<AplLanguage> for AplLexer<'config> {
27    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<AplLanguage>) -> LexOutput<AplLanguage> {
28        let mut state: State<'_, S> = LexerState::new_with_cache(source, 0, cache);
29        let result = self.run(&mut state);
30        if result.is_ok() {
31            state.add_eof();
32        }
33        state.finish_with_cache(result, cache)
34    }
35}
36
37impl<'config> AplLexer<'config> {
38    /// Creates a new `AplLexer`.
39    pub fn new(config: &'config AplLanguage) -> Self {
40        Self { config }
41    }
42
43    /// Main lexical analysis logic.
44    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
45        while state.not_at_end() {
46            let safe_point = state.get_position();
47
48            if self.skip_whitespace(state) {
49                continue;
50            }
51
52            if self.skip_comment(state) {
53                continue;
54            }
55
56            if self.lex_string_literal(state) {
57                continue;
58            }
59
60            if self.lex_number_literal(state) {
61                continue;
62            }
63
64            if self.lex_identifier(state) {
65                continue;
66            }
67
68            if self.lex_symbols(state) {
69                continue;
70            }
71
72            // If no pattern matches, skip current character and generate Error token
73            if let Some(ch) = state.peek() {
74                state.advance(ch.len_utf8());
75                state.add_token(AplTokenType::Error, safe_point, state.get_position());
76            }
77        }
78
79        Ok(())
80    }
81
82    /// Skips whitespace characters.
83    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
84        APL_WHITESPACE.scan(state, AplTokenType::Whitespace)
85    }
86
87    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
88        let start = state.get_position();
89        if state.peek() == Some('⍝') {
90            state.advance('⍝'.len_utf8());
91            while let Some(ch) = state.peek() {
92                if ch == '\n' || ch == '\r' {
93                    break;
94                }
95                state.advance(ch.len_utf8());
96            }
97            state.add_token(AplTokenType::Comment, start, state.get_position());
98            return true;
99        }
100        false
101    }
102
103    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
104        let start = state.get_position();
105        if let Some(quote) = state.peek() {
106            if quote == '\'' || quote == '"' {
107                state.advance(1);
108                while let Some(ch) = state.peek() {
109                    if ch == quote {
110                        state.advance(1);
111                        if state.peek() == Some(quote) {
112                            state.advance(1);
113                            continue;
114                        }
115                        break;
116                    }
117                    state.advance(ch.len_utf8());
118                    if ch == '\n' || ch == '\r' {
119                        break;
120                    }
121                }
122                state.add_token(AplTokenType::StringLiteral, start, state.get_position());
123                return true;
124            }
125        }
126        false
127    }
128
129    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
130        let start = state.get_position();
131        if let Some(ch) = state.peek() {
132            if ch.is_ascii_digit() || ch == '¯' || (ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit())) {
133                if ch == '¯' {
134                    state.advance('¯'.len_utf8());
135                }
136
137                let mut has_digits = false;
138                while let Some(c) = state.peek() {
139                    if c.is_ascii_digit() {
140                        state.advance(1);
141                        has_digits = true;
142                    }
143                    else {
144                        break;
145                    }
146                }
147
148                if state.peek() == Some('.') {
149                    state.advance(1);
150                    while let Some(c) = state.peek() {
151                        if c.is_ascii_digit() {
152                            state.advance(1);
153                            has_digits = true;
154                        }
155                        else {
156                            break;
157                        }
158                    }
159                }
160
161                if !has_digits && state.get_position() == start {
162                    return false;
163                }
164
165                if let Some(e) = state.peek() {
166                    if e == 'e' || e == 'E' {
167                        state.advance(1);
168                        if let Some(sign) = state.peek() {
169                            if sign == '+' || sign == '-' || sign == '¯' {
170                                state.advance(sign.len_utf8());
171                            }
172                        }
173                        while let Some(c) = state.peek() {
174                            if c.is_ascii_digit() {
175                                state.advance(1);
176                            }
177                            else {
178                                break;
179                            }
180                        }
181                    }
182                }
183
184                state.add_token(AplTokenType::NumberLiteral, start, state.get_position());
185                return true;
186            }
187        }
188        false
189    }
190
191    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
192        let start = state.get_position();
193        if let Some(ch) = state.peek() {
194            if ch.is_alphabetic() || ch == '∆' || ch == '⍙' {
195                state.advance(ch.len_utf8());
196                while let Some(c) = state.peek() {
197                    if c.is_alphanumeric() || c == '∆' || c == '⍙' || c == '_' {
198                        state.advance(c.len_utf8());
199                    }
200                    else {
201                        break;
202                    }
203                }
204                state.add_token(AplTokenType::Identifier, start, state.get_position());
205                return true;
206            }
207        }
208        false
209    }
210
211    fn lex_symbols<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
212        let start = state.get_position();
213        if let Some(ch) = state.peek() {
214            let token = match ch {
215                '←' => AplTokenType::LeftArrow,
216                '→' => AplTokenType::RightArrow,
217                '⋄' => AplTokenType::Diamond,
218                '⎕' => AplTokenType::Quad,
219                '⍞' => AplTokenType::QuoteQuad,
220                '⍴' => AplTokenType::Rho,
221                '⍳' => AplTokenType::Iota,
222                '∊' => AplTokenType::Epsilon,
223                '↑' => AplTokenType::UpArrow,
224                '↓' => AplTokenType::DownArrow,
225                '∇' => AplTokenType::Del,
226                '∆' => AplTokenType::Delta,
227                '⍺' => AplTokenType::Alpha,
228                '⍵' => AplTokenType::Omega,
229                '⍬' => AplTokenType::Zilde,
230                '+' => AplTokenType::Plus,
231                '-' => AplTokenType::Minus,
232                '×' => AplTokenType::Times,
233                '÷' => AplTokenType::Divide,
234                '*' => AplTokenType::Star,
235                '⍟' => AplTokenType::Log,
236                '○' => AplTokenType::Circle,
237                '∨' => AplTokenType::Or,
238                '∧' => AplTokenType::And,
239                '∼' => AplTokenType::Not,
240                '⍱' => AplTokenType::Nor,
241                '⍲' => AplTokenType::Nand,
242                '=' => AplTokenType::Equal,
243                '≠' => AplTokenType::NotEqual,
244                '<' => AplTokenType::LessThan,
245                '≤' => AplTokenType::LessEqual,
246                '≥' => AplTokenType::GreaterEqual,
247                '>' => AplTokenType::GreaterThan,
248                '⌈' => AplTokenType::UpStile,
249                '⌊' => AplTokenType::DownStile,
250                '|' => AplTokenType::Bar,
251                '~' => AplTokenType::Tilde,
252                '?' => AplTokenType::Question,
253                '!' => AplTokenType::Factorial,
254                '/' => AplTokenType::Slash,
255                '\\' => AplTokenType::Backslash,
256                '⌿' => AplTokenType::SlashBar,
257                '⍀' => AplTokenType::BackslashBar,
258                '.' => AplTokenType::Dot,
259                '∘' => AplTokenType::Jot,
260                '¨' => AplTokenType::Diaeresis,
261                '⍣' => AplTokenType::Power,
262                '⍤' => AplTokenType::Rank,
263                '≢' => AplTokenType::Tally,
264                '(' => AplTokenType::LeftParen,
265                ')' => AplTokenType::RightParen,
266                '[' => AplTokenType::LeftBracket,
267                ']' => AplTokenType::RightBracket,
268                '{' => AplTokenType::LeftBrace,
269                '}' => AplTokenType::RightBrace,
270                ';' => AplTokenType::Semicolon,
271                _ => return false,
272            };
273            state.advance(ch.len_utf8());
274            state.add_token(token, start, state.get_position());
275            return true;
276        }
277        false
278    }
279}