1use crate::{kind::PascalSyntaxKind, language::PascalLanguage};
2use oak_core::{
3 IncrementalCache, Lexer, LexerState, OakError,
4 lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
/// Lexer state specialised to the Pascal language; `S` is the source type.
type State<S> = LexerState<S, PascalLanguage>;

/// Whitespace scanner configuration with `unicode_whitespace` enabled.
static PASCAL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Line-comment scanner configuration; `//` is the only line marker.
static PASCAL_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["//"] });
/// String scanner configuration: single-quote delimiter, no escape character.
/// NOTE(review): not referenced by the code visible here (`lex_string` scans by hand) — confirm whether it is still needed.
static PASCAL_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: None });
14
/// Lexer for Pascal source text.
///
/// Borrows a [`PascalLanguage`] configuration for the lifetime `'config`.
#[derive(Clone)]
pub struct PascalLexer<'config> {
    /// Language configuration handed to [`PascalLexer::new`].
    config: &'config PascalLanguage,
}
19
20impl<'config> PascalLexer<'config> {
21 pub fn new(config: &'config PascalLanguage) -> Self {
22 Self { config }
23 }
24
25 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
26 match PASCAL_WHITESPACE.scan(state.rest(), state.get_position(), PascalSyntaxKind::Whitespace) {
27 Some(token) => {
28 state.advance_with(token);
29 true
30 }
31 None => false,
32 }
33 }
34
35 fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
36 let start = state.get_position();
37 let rest = state.rest();
38
39 if rest.starts_with("//") {
41 match PASCAL_COMMENT.scan(rest, start, PascalSyntaxKind::Comment) {
42 Some(token) => {
43 state.advance_with(token);
44 return true;
45 }
46 None => return false,
47 }
48 }
49
50 if state.current() == Some('{') {
52 state.advance(1);
53 while let Some(ch) = state.peek() {
54 if ch == '}' {
55 state.advance(1);
56 break;
57 }
58 state.advance(ch.len_utf8());
59 }
60 state.add_token(PascalSyntaxKind::Comment, start, state.get_position());
61 return true;
62 }
63
64 if rest.starts_with("(*") {
66 state.advance(2);
67 while let Some(ch) = state.peek() {
68 if ch == '*' && state.peek_next_n(1) == Some(')') {
69 state.advance(2);
70 break;
71 }
72 state.advance(ch.len_utf8());
73 }
74 state.add_token(PascalSyntaxKind::Comment, start, state.get_position());
75 return true;
76 }
77
78 false
79 }
80
81 fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
82 let start = state.get_position();
83
84 if state.current() == Some('\'') {
86 state.advance(1);
87 while let Some(ch) = state.peek() {
88 if ch == '\'' {
89 if state.peek_next_n(1) == Some('\'') {
91 state.advance(2); continue;
93 }
94 else {
95 state.advance(1); break;
97 }
98 }
99 state.advance(ch.len_utf8());
100 }
101 state.add_token(PascalSyntaxKind::StringLiteral, start, state.get_position());
102 return true;
103 }
104 false
105 }
106
107 fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
108 if let Some(ch) = state.peek() {
109 if ch.is_alphabetic() || ch == '_' {
110 let start_pos = state.get_position();
111 let mut text = String::new();
112
113 while let Some(ch) = state.peek() {
115 if ch.is_alphanumeric() || ch == '_' {
116 text.push(ch);
117 state.advance(ch.len_utf8());
118 }
119 else {
120 break;
121 }
122 }
123
124 let kind = match text.to_lowercase().as_str() {
126 "program" => PascalSyntaxKind::Program,
127 "var" => PascalSyntaxKind::Var,
128 "const" => PascalSyntaxKind::Const,
129 "type" => PascalSyntaxKind::Type,
130 "procedure" => PascalSyntaxKind::Procedure,
131 "function" => PascalSyntaxKind::Function,
132 "begin" => PascalSyntaxKind::Begin,
133 "end" => PascalSyntaxKind::End,
134 "if" => PascalSyntaxKind::If,
135 "then" => PascalSyntaxKind::Then,
136 "else" => PascalSyntaxKind::Else,
137 "while" => PascalSyntaxKind::While,
138 "do" => PascalSyntaxKind::Do,
139 "for" => PascalSyntaxKind::For,
140 "to" => PascalSyntaxKind::To,
141 "downto" => PascalSyntaxKind::Downto,
142 "repeat" => PascalSyntaxKind::Repeat,
143 "until" => PascalSyntaxKind::Until,
144 "case" => PascalSyntaxKind::Case,
145 "of" => PascalSyntaxKind::Of,
146 "with" => PascalSyntaxKind::With,
147 "record" => PascalSyntaxKind::Record,
148 "array" => PascalSyntaxKind::Array,
149 "set" => PascalSyntaxKind::Set,
150 "file" => PascalSyntaxKind::File,
151 "packed" => PascalSyntaxKind::Packed,
152 "nil" => PascalSyntaxKind::Nil,
153 "true" => PascalSyntaxKind::True,
154 "false" => PascalSyntaxKind::False,
155 "and" => PascalSyntaxKind::And,
156 "or" => PascalSyntaxKind::Or,
157 "not" => PascalSyntaxKind::Not,
158 "div" => PascalSyntaxKind::Div,
159 "mod" => PascalSyntaxKind::Mod,
160 "in" => PascalSyntaxKind::In,
161
162 _ => PascalSyntaxKind::Identifier,
163 };
164
165 state.add_token(kind, start_pos, state.get_position());
166 true
167 }
168 else {
169 false
170 }
171 }
172 else {
173 false
174 }
175 }
176
177 fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
178 if let Some(ch) = state.peek() {
179 if ch.is_ascii_digit() {
180 let start_pos = state.get_position();
181 let mut has_dot = false;
182
183 while let Some(ch) = state.peek() {
185 if ch.is_ascii_digit() {
186 state.advance(1);
187 }
188 else if ch == '.' && !has_dot {
189 has_dot = true;
190 state.advance(1);
191 }
192 else {
193 break;
194 }
195 }
196
197 let kind = if has_dot { PascalSyntaxKind::RealLiteral } else { PascalSyntaxKind::IntegerLiteral };
198
199 state.add_token(kind, start_pos, state.get_position());
200 true
201 }
202 else {
203 false
204 }
205 }
206 else {
207 false
208 }
209 }
210
211 fn lex_operators_and_punctuation<S: Source>(&self, state: &mut State<S>) -> bool {
212 if let Some(ch) = state.peek() {
213 let start_pos = state.get_position();
214
215 let kind = match ch {
216 '+' => {
217 state.advance(1);
218 PascalSyntaxKind::Plus
219 }
220 '-' => {
221 state.advance(1);
222 PascalSyntaxKind::Minus
223 }
224 '*' => {
225 state.advance(1);
226 PascalSyntaxKind::Multiply
227 }
228 '/' => {
229 state.advance(1);
230 PascalSyntaxKind::Divide
231 }
232 '=' => {
233 state.advance(1);
234 PascalSyntaxKind::Equal
235 }
236 '<' => {
237 state.advance(1);
238 if let Some('=') = state.peek() {
239 state.advance(1);
240 PascalSyntaxKind::LessEqual
241 }
242 else if let Some('>') = state.peek() {
243 state.advance(1);
244 PascalSyntaxKind::NotEqual
245 }
246 else {
247 PascalSyntaxKind::Less
248 }
249 }
250 '>' => {
251 state.advance(1);
252 if let Some('=') = state.peek() {
253 state.advance(1);
254 PascalSyntaxKind::GreaterEqual
255 }
256 else {
257 PascalSyntaxKind::Greater
258 }
259 }
260 ':' => {
261 state.advance(1);
262 if let Some('=') = state.peek() {
263 state.advance(1);
264 PascalSyntaxKind::Assign
265 }
266 else {
267 PascalSyntaxKind::Colon
268 }
269 }
270 ';' => {
271 state.advance(1);
272 PascalSyntaxKind::Semicolon
273 }
274 ',' => {
275 state.advance(1);
276 PascalSyntaxKind::Comma
277 }
278 '.' => {
279 state.advance(1);
280 if let Some('.') = state.peek() {
281 state.advance(1);
282 PascalSyntaxKind::Range
283 }
284 else {
285 PascalSyntaxKind::Dot
286 }
287 }
288 '(' => {
289 state.advance(1);
290 PascalSyntaxKind::LeftParen
291 }
292 ')' => {
293 state.advance(1);
294 PascalSyntaxKind::RightParen
295 }
296 '[' => {
297 state.advance(1);
298 PascalSyntaxKind::LeftBracket
299 }
300 ']' => {
301 state.advance(1);
302 PascalSyntaxKind::RightBracket
303 }
304 '^' => {
305 state.advance(1);
306 PascalSyntaxKind::Caret
307 }
308 '\n' => {
309 state.advance(1);
310 PascalSyntaxKind::Newline
311 }
312 _ => {
313 state.advance(ch.len_utf8());
314 PascalSyntaxKind::Error
315 }
316 };
317
318 state.add_token(kind, start_pos, state.get_position());
319 true
320 }
321 else {
322 false
323 }
324 }
325}
326
327impl<'config> Lexer<PascalLanguage> for PascalLexer<'config> {
328 fn lex_incremental(
329 &self,
330 source: impl Source,
331 changed: usize,
332 cache: IncrementalCache<PascalLanguage>,
333 ) -> LexOutput<PascalLanguage> {
334 let mut state = LexerState::new_with_cache(source, changed, cache);
335 let result = self.run(&mut state);
336 state.finish(result)
337 }
338}
339
340impl<'config> PascalLexer<'config> {
341 fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
342 while state.not_at_end() {
343 if self.skip_whitespace(state) {
345 continue;
346 }
347
348 if self.skip_comment(state) {
350 continue;
351 }
352
353 if self.lex_string(state) {
355 continue;
356 }
357
358 if self.lex_identifier_or_keyword(state) {
360 continue;
361 }
362
363 if self.lex_number(state) {
365 continue;
366 }
367
368 if self.lex_operators_and_punctuation(state) {
370 continue;
371 }
372
373 let start_pos = state.get_position();
375 if let Some(ch) = state.peek() {
376 state.advance(ch.len_utf8());
377 state.add_token(PascalSyntaxKind::Error, start_pos, state.get_position());
378 }
379 else {
380 break;
381 }
382 }
383
384 let eof_pos = state.get_position();
386 state.add_token(PascalSyntaxKind::Eof, eof_pos, eof_pos);
387 Ok(())
388 }
389}