1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4pub use token_type::AplTokenType;
5
6use crate::language::AplLanguage;
7use oak_core::{
8 Lexer, LexerCache, LexerState, OakError,
9 lexer::{LexOutput, WhitespaceConfig},
10 source::Source,
11};
12use std::sync::LazyLock;
13
/// Shorthand for the generic lexer state specialized to the APL language.
type State<'a, S> = LexerState<'a, S, AplLanguage>;

// Shared whitespace scanner configuration. Unicode-aware scanning is enabled
// because APL source text frequently contains non-ASCII characters and may
// use non-ASCII whitespace — TODO confirm against `WhitespaceConfig` semantics.
static APL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
17
/// Lexer for APL source text; borrows the language configuration it was
/// constructed with for the `'config` lifetime.
#[derive(Clone, Debug)]
pub struct AplLexer<'config> {
    // Language configuration. NOTE(review): not read by any method visible in
    // this file — presumably kept for future configuration-driven lexing.
    config: &'config AplLanguage,
}
22
impl<'config> Lexer<AplLanguage> for AplLexer<'config> {
    /// Tokenizes `source` from offset 0 into a `LexOutput`.
    ///
    /// The `_edits` slice is ignored: this lexer does not relex incrementally,
    /// it always scans the full source. The cache is threaded through both
    /// state construction and finalization.
    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<AplLanguage>) -> LexOutput<AplLanguage> {
        let mut state: State<'_, S> = LexerState::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        // Only append the EOF token on success; on error the failure is
        // reported through `finish_with_cache` instead.
        if result.is_ok() {
            state.add_eof();
        }
        state.finish_with_cache(result, cache)
    }
}
33
34impl<'config> AplLexer<'config> {
    /// Creates a new lexer borrowing the given language configuration.
    pub fn new(config: &'config AplLanguage) -> Self {
        Self { config }
    }
38
39 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
41 while state.not_at_end() {
42 let safe_point = state.get_position();
43
44 if self.skip_whitespace(state) {
45 continue;
46 }
47
48 if self.skip_comment(state) {
49 continue;
50 }
51
52 if self.lex_string_literal(state) {
53 continue;
54 }
55
56 if self.lex_number_literal(state) {
57 continue;
58 }
59
60 if self.lex_identifier(state) {
61 continue;
62 }
63
64 if self.lex_symbols(state) {
65 continue;
66 }
67
68 if let Some(ch) = state.peek() {
70 state.advance(ch.len_utf8());
71 state.add_token(AplTokenType::Error, safe_point, state.get_position());
72 }
73 }
74
75 Ok(())
76 }
77
    /// Delegates to the shared `APL_WHITESPACE` scanner, emitting a
    /// `Whitespace` token for any run it consumes. Returns true when input
    /// was consumed — exact semantics per `WhitespaceConfig::scan`.
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        APL_WHITESPACE.scan(state, AplTokenType::Whitespace)
    }
82
83 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
84 let start = state.get_position();
85 if state.peek() == Some('⍝') {
86 state.advance('⍝'.len_utf8());
87 while let Some(ch) = state.peek() {
88 if ch == '\n' || ch == '\r' {
89 break;
90 }
91 state.advance(ch.len_utf8());
92 }
93 state.add_token(AplTokenType::Comment, start, state.get_position());
94 return true;
95 }
96 false
97 }
98
99 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
100 let start = state.get_position();
101 if let Some(quote) = state.peek() {
102 if quote == '\'' || quote == '"' {
103 state.advance(1);
104 while let Some(ch) = state.peek() {
105 if ch == quote {
106 state.advance(1);
107 if state.peek() == Some(quote) {
108 state.advance(1);
109 continue;
110 }
111 break;
112 }
113 state.advance(ch.len_utf8());
114 if ch == '\n' || ch == '\r' {
115 break;
116 }
117 }
118 state.add_token(AplTokenType::StringLiteral, start, state.get_position());
119 return true;
120 }
121 }
122 false
123 }
124
125 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
126 let start = state.get_position();
127 if let Some(ch) = state.peek() {
128 if ch.is_ascii_digit() || ch == '¯' || (ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit())) {
129 if ch == '¯' {
130 state.advance('¯'.len_utf8());
131 }
132
133 let mut has_digits = false;
134 while let Some(c) = state.peek() {
135 if c.is_ascii_digit() {
136 state.advance(1);
137 has_digits = true;
138 }
139 else {
140 break;
141 }
142 }
143
144 if state.peek() == Some('.') {
145 state.advance(1);
146 while let Some(c) = state.peek() {
147 if c.is_ascii_digit() {
148 state.advance(1);
149 has_digits = true;
150 }
151 else {
152 break;
153 }
154 }
155 }
156
157 if !has_digits && state.get_position() == start {
158 return false;
159 }
160
161 if let Some(e) = state.peek() {
162 if e == 'e' || e == 'E' {
163 state.advance(1);
164 if let Some(sign) = state.peek() {
165 if sign == '+' || sign == '-' || sign == '¯' {
166 state.advance(sign.len_utf8());
167 }
168 }
169 while let Some(c) = state.peek() {
170 if c.is_ascii_digit() {
171 state.advance(1);
172 }
173 else {
174 break;
175 }
176 }
177 }
178 }
179
180 state.add_token(AplTokenType::NumberLiteral, start, state.get_position());
181 return true;
182 }
183 }
184 false
185 }
186
187 fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
188 let start = state.get_position();
189 if let Some(ch) = state.peek() {
190 if ch.is_alphabetic() || ch == '∆' || ch == '⍙' {
191 state.advance(ch.len_utf8());
192 while let Some(c) = state.peek() {
193 if c.is_alphanumeric() || c == '∆' || c == '⍙' || c == '_' {
194 state.advance(c.len_utf8());
195 }
196 else {
197 break;
198 }
199 }
200 state.add_token(AplTokenType::Identifier, start, state.get_position());
201 return true;
202 }
203 }
204 false
205 }
206
207 fn lex_symbols<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
208 let start = state.get_position();
209 if let Some(ch) = state.peek() {
210 let token = match ch {
211 '←' => AplTokenType::LeftArrow,
212 '→' => AplTokenType::RightArrow,
213 '⋄' => AplTokenType::Diamond,
214 '⎕' => AplTokenType::Quad,
215 '⍞' => AplTokenType::QuoteQuad,
216 '⍴' => AplTokenType::Rho,
217 '⍳' => AplTokenType::Iota,
218 '∊' => AplTokenType::Epsilon,
219 '↑' => AplTokenType::UpArrow,
220 '↓' => AplTokenType::DownArrow,
221 '∇' => AplTokenType::Del,
222 '∆' => AplTokenType::Delta,
223 '⍺' => AplTokenType::Alpha,
224 '⍵' => AplTokenType::Omega,
225 '⍬' => AplTokenType::Zilde,
226 '+' => AplTokenType::Plus,
227 '-' => AplTokenType::Minus,
228 '×' => AplTokenType::Times,
229 '÷' => AplTokenType::Divide,
230 '*' => AplTokenType::Star,
231 '⍟' => AplTokenType::Log,
232 '○' => AplTokenType::Circle,
233 '∨' => AplTokenType::Or,
234 '∧' => AplTokenType::And,
235 '∼' => AplTokenType::Not,
236 '⍱' => AplTokenType::Nor,
237 '⍲' => AplTokenType::Nand,
238 '=' => AplTokenType::Equal,
239 '≠' => AplTokenType::NotEqual,
240 '<' => AplTokenType::LessThan,
241 '≤' => AplTokenType::LessEqual,
242 '≥' => AplTokenType::GreaterEqual,
243 '>' => AplTokenType::GreaterThan,
244 '⌈' => AplTokenType::UpStile,
245 '⌊' => AplTokenType::DownStile,
246 '|' => AplTokenType::Bar,
247 '~' => AplTokenType::Tilde,
248 '?' => AplTokenType::Question,
249 '!' => AplTokenType::Factorial,
250 '/' => AplTokenType::Slash,
251 '\\' => AplTokenType::Backslash,
252 '⌿' => AplTokenType::SlashBar,
253 '⍀' => AplTokenType::BackslashBar,
254 '.' => AplTokenType::Dot,
255 '∘' => AplTokenType::Jot,
256 '¨' => AplTokenType::Diaeresis,
257 '⍣' => AplTokenType::Power,
258 '⍤' => AplTokenType::Rank,
259 '≢' => AplTokenType::Tally,
260 '(' => AplTokenType::LeftParen,
261 ')' => AplTokenType::RightParen,
262 '[' => AplTokenType::LeftBracket,
263 ']' => AplTokenType::RightBracket,
264 '{' => AplTokenType::LeftBrace,
265 '}' => AplTokenType::RightBrace,
266 ';' => AplTokenType::Semicolon,
267 _ => return false,
268 };
269 state.advance(ch.len_utf8());
270 state.add_token(token, start, state.get_position());
271 return true;
272 }
273 false
274 }
275}