1#![doc = include_str!("readme.md")]
2pub mod token_type;
4
5pub use token_type::AplTokenType;
6
7use crate::language::AplLanguage;
8use oak_core::{
9 Lexer, LexerCache, LexerState, OakError,
10 lexer::{LexOutput, WhitespaceConfig},
11 source::Source,
12};
13use std::sync::LazyLock;
14
15pub(crate) type State<'a, S> = LexerState<'a, S, AplLanguage>;
16
17static APL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
18
19#[derive(Clone, Debug)]
21pub struct AplLexer<'config> {
22 pub config: &'config AplLanguage,
24}
25
26impl<'config> Lexer<AplLanguage> for AplLexer<'config> {
27 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<AplLanguage>) -> LexOutput<AplLanguage> {
28 let mut state: State<'_, S> = LexerState::new_with_cache(source, 0, cache);
29 let result = self.run(&mut state);
30 if result.is_ok() {
31 state.add_eof();
32 }
33 state.finish_with_cache(result, cache)
34 }
35}
36
37impl<'config> AplLexer<'config> {
38 pub fn new(config: &'config AplLanguage) -> Self {
40 Self { config }
41 }
42
43 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
45 while state.not_at_end() {
46 let safe_point = state.get_position();
47
48 if self.skip_whitespace(state) {
49 continue;
50 }
51
52 if self.skip_comment(state) {
53 continue;
54 }
55
56 if self.lex_string_literal(state) {
57 continue;
58 }
59
60 if self.lex_number_literal(state) {
61 continue;
62 }
63
64 if self.lex_identifier(state) {
65 continue;
66 }
67
68 if self.lex_symbols(state) {
69 continue;
70 }
71
72 if let Some(ch) = state.peek() {
74 state.advance(ch.len_utf8());
75 state.add_token(AplTokenType::Error, safe_point, state.get_position());
76 }
77 }
78
79 Ok(())
80 }
81
82 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
84 APL_WHITESPACE.scan(state, AplTokenType::Whitespace)
85 }
86
87 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
88 let start = state.get_position();
89 if state.peek() == Some('⍝') {
90 state.advance('⍝'.len_utf8());
91 while let Some(ch) = state.peek() {
92 if ch == '\n' || ch == '\r' {
93 break;
94 }
95 state.advance(ch.len_utf8());
96 }
97 state.add_token(AplTokenType::Comment, start, state.get_position());
98 return true;
99 }
100 false
101 }
102
103 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
104 let start = state.get_position();
105 if let Some(quote) = state.peek() {
106 if quote == '\'' || quote == '"' {
107 state.advance(1);
108 while let Some(ch) = state.peek() {
109 if ch == quote {
110 state.advance(1);
111 if state.peek() == Some(quote) {
112 state.advance(1);
113 continue;
114 }
115 break;
116 }
117 state.advance(ch.len_utf8());
118 if ch == '\n' || ch == '\r' {
119 break;
120 }
121 }
122 state.add_token(AplTokenType::StringLiteral, start, state.get_position());
123 return true;
124 }
125 }
126 false
127 }
128
129 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
130 let start = state.get_position();
131 if let Some(ch) = state.peek() {
132 if ch.is_ascii_digit() || ch == '¯' || (ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit())) {
133 if ch == '¯' {
134 state.advance('¯'.len_utf8());
135 }
136
137 let mut has_digits = false;
138 while let Some(c) = state.peek() {
139 if c.is_ascii_digit() {
140 state.advance(1);
141 has_digits = true;
142 }
143 else {
144 break;
145 }
146 }
147
148 if state.peek() == Some('.') {
149 state.advance(1);
150 while let Some(c) = state.peek() {
151 if c.is_ascii_digit() {
152 state.advance(1);
153 has_digits = true;
154 }
155 else {
156 break;
157 }
158 }
159 }
160
161 if !has_digits && state.get_position() == start {
162 return false;
163 }
164
165 if let Some(e) = state.peek() {
166 if e == 'e' || e == 'E' {
167 state.advance(1);
168 if let Some(sign) = state.peek() {
169 if sign == '+' || sign == '-' || sign == '¯' {
170 state.advance(sign.len_utf8());
171 }
172 }
173 while let Some(c) = state.peek() {
174 if c.is_ascii_digit() {
175 state.advance(1);
176 }
177 else {
178 break;
179 }
180 }
181 }
182 }
183
184 state.add_token(AplTokenType::NumberLiteral, start, state.get_position());
185 return true;
186 }
187 }
188 false
189 }
190
191 fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
192 let start = state.get_position();
193 if let Some(ch) = state.peek() {
194 if ch.is_alphabetic() || ch == '∆' || ch == '⍙' {
195 state.advance(ch.len_utf8());
196 while let Some(c) = state.peek() {
197 if c.is_alphanumeric() || c == '∆' || c == '⍙' || c == '_' {
198 state.advance(c.len_utf8());
199 }
200 else {
201 break;
202 }
203 }
204 state.add_token(AplTokenType::Identifier, start, state.get_position());
205 return true;
206 }
207 }
208 false
209 }
210
211 fn lex_symbols<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
212 let start = state.get_position();
213 if let Some(ch) = state.peek() {
214 let token = match ch {
215 '←' => AplTokenType::LeftArrow,
216 '→' => AplTokenType::RightArrow,
217 '⋄' => AplTokenType::Diamond,
218 '⎕' => AplTokenType::Quad,
219 '⍞' => AplTokenType::QuoteQuad,
220 '⍴' => AplTokenType::Rho,
221 '⍳' => AplTokenType::Iota,
222 '∊' => AplTokenType::Epsilon,
223 '↑' => AplTokenType::UpArrow,
224 '↓' => AplTokenType::DownArrow,
225 '∇' => AplTokenType::Del,
226 '∆' => AplTokenType::Delta,
227 '⍺' => AplTokenType::Alpha,
228 '⍵' => AplTokenType::Omega,
229 '⍬' => AplTokenType::Zilde,
230 '+' => AplTokenType::Plus,
231 '-' => AplTokenType::Minus,
232 '×' => AplTokenType::Times,
233 '÷' => AplTokenType::Divide,
234 '*' => AplTokenType::Star,
235 '⍟' => AplTokenType::Log,
236 '○' => AplTokenType::Circle,
237 '∨' => AplTokenType::Or,
238 '∧' => AplTokenType::And,
239 '∼' => AplTokenType::Not,
240 '⍱' => AplTokenType::Nor,
241 '⍲' => AplTokenType::Nand,
242 '=' => AplTokenType::Equal,
243 '≠' => AplTokenType::NotEqual,
244 '<' => AplTokenType::LessThan,
245 '≤' => AplTokenType::LessEqual,
246 '≥' => AplTokenType::GreaterEqual,
247 '>' => AplTokenType::GreaterThan,
248 '⌈' => AplTokenType::UpStile,
249 '⌊' => AplTokenType::DownStile,
250 '|' => AplTokenType::Bar,
251 '~' => AplTokenType::Tilde,
252 '?' => AplTokenType::Question,
253 '!' => AplTokenType::Factorial,
254 '/' => AplTokenType::Slash,
255 '\\' => AplTokenType::Backslash,
256 '⌿' => AplTokenType::SlashBar,
257 '⍀' => AplTokenType::BackslashBar,
258 '.' => AplTokenType::Dot,
259 '∘' => AplTokenType::Jot,
260 '¨' => AplTokenType::Diaeresis,
261 '⍣' => AplTokenType::Power,
262 '⍤' => AplTokenType::Rank,
263 '≢' => AplTokenType::Tally,
264 '(' => AplTokenType::LeftParen,
265 ')' => AplTokenType::RightParen,
266 '[' => AplTokenType::LeftBracket,
267 ']' => AplTokenType::RightBracket,
268 '{' => AplTokenType::LeftBrace,
269 '}' => AplTokenType::RightBrace,
270 ';' => AplTokenType::Semicolon,
271 _ => return false,
272 };
273 state.advance(ch.len_utf8());
274 state.add_token(token, start, state.get_position());
275 return true;
276 }
277 false
278 }
279}