1#![doc = include_str!("readme.md")]
2pub mod token_type;
4
5use crate::{language::PascalLanguage, lexer::token_type::PascalTokenType};
6use oak_core::{
7 Lexer, LexerCache, LexerState, OakError,
8 lexer::{CommentConfig, LexOutput, WhitespaceConfig},
9 source::Source,
10};
11use std::sync::LazyLock;
12
13type State<'s, S> = LexerState<'s, S, PascalLanguage>;
14
15static PASCAL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
16static PASCAL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "{", block_end: "}", nested_blocks: false });
17
18#[derive(Clone, Debug)]
20pub struct PascalLexer<'config> {
21 config: &'config PascalLanguage,
22}
23
24impl<'config> PascalLexer<'config> {
25 pub fn new(config: &'config PascalLanguage) -> Self {
27 Self { config }
28 }
29
30 fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
31 PASCAL_WHITESPACE.scan(state, PascalTokenType::Whitespace)
32 }
33
34 fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
35 let start = state.get_position();
36
37 if state.rest().starts_with("//") {
39 return PASCAL_COMMENT.scan(state, PascalTokenType::Comment, PascalTokenType::Comment);
40 }
41
42 if state.current() == Some('{') {
44 state.advance(1);
45 while let Some(ch) = state.peek() {
46 if ch == '}' {
47 state.advance(1);
48 break;
49 }
50 state.advance(ch.len_utf8());
51 }
52 state.add_token(PascalTokenType::Comment, start, state.get_position());
53 return true;
54 }
55
56 if state.rest().starts_with("(*") {
58 state.advance(2);
59 while let Some(ch) = state.peek() {
60 if ch == '*' && state.peek_next_n(1) == Some(')') {
61 state.advance(2);
62 break;
63 }
64 state.advance(ch.len_utf8());
65 }
66 state.add_token(PascalTokenType::Comment, start, state.get_position());
67 return true;
68 }
69
70 false
71 }
72
73 fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
74 let start = state.get_position();
75
76 if state.current() == Some('\'') {
78 state.advance(1);
79 while let Some(ch) = state.peek() {
80 if ch == '\'' {
81 if state.peek_next_n(1) == Some('\'') {
83 state.advance(2); continue;
85 }
86 else {
87 state.advance(1); break;
89 }
90 }
91 state.advance(ch.len_utf8());
92 }
93 state.add_token(PascalTokenType::StringLiteral, start, state.get_position());
94 return true;
95 }
96 false
97 }
98
99 fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
100 if let Some(ch) = state.peek() {
101 if ch.is_alphabetic() || ch == '_' {
102 let start_pos = state.get_position();
103 let mut text = String::new();
104
105 while let Some(ch) = state.peek() {
107 if ch.is_alphanumeric() || ch == '_' {
108 text.push(ch);
109 state.advance(ch.len_utf8());
110 }
111 else {
112 break;
113 }
114 }
115
116 let kind = match text.to_lowercase().as_str() {
118 "program" => PascalTokenType::Program,
119 "var" => PascalTokenType::Var,
120 "const" => PascalTokenType::Const,
121 "type" => PascalTokenType::Type,
122 "procedure" => PascalTokenType::Procedure,
123 "function" => PascalTokenType::Function,
124 "begin" => PascalTokenType::Begin,
125 "end" => PascalTokenType::End,
126 "if" => PascalTokenType::If,
127 "then" => PascalTokenType::Then,
128 "else" => PascalTokenType::Else,
129 "while" => PascalTokenType::While,
130 "do" => PascalTokenType::Do,
131 "for" => PascalTokenType::For,
132 "to" => PascalTokenType::To,
133 "downto" => PascalTokenType::Downto,
134 "repeat" => PascalTokenType::Repeat,
135 "until" => PascalTokenType::Until,
136 "case" => PascalTokenType::Case,
137 "of" => PascalTokenType::Of,
138 "with" => PascalTokenType::With,
139 "record" => PascalTokenType::Record,
140 "array" => PascalTokenType::Array,
141 "set" => PascalTokenType::Set,
142 "file" => PascalTokenType::File,
143 "packed" => PascalTokenType::Packed,
144 "nil" => PascalTokenType::Nil,
145 "true" => PascalTokenType::True,
146 "false" => PascalTokenType::False,
147 "and" => PascalTokenType::And,
148 "or" => PascalTokenType::Or,
149 "not" => PascalTokenType::Not,
150 "div" => PascalTokenType::Div,
151 "mod" => PascalTokenType::Mod,
152 "in" => PascalTokenType::In,
153
154 _ => PascalTokenType::Identifier,
155 };
156
157 state.add_token(kind, start_pos, state.get_position());
158 true
159 }
160 else {
161 false
162 }
163 }
164 else {
165 false
166 }
167 }
168
169 fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
170 if let Some(ch) = state.peek() {
171 if ch.is_ascii_digit() {
172 let start_pos = state.get_position();
173 let mut has_dot = false;
174
175 while let Some(ch) = state.peek() {
177 if ch.is_ascii_digit() {
178 state.advance(1);
179 }
180 else if ch == '.' && !has_dot {
181 has_dot = true;
182 state.advance(1);
183 }
184 else {
185 break;
186 }
187 }
188
189 let kind = if has_dot { PascalTokenType::RealLiteral } else { PascalTokenType::IntegerLiteral };
190
191 state.add_token(kind, start_pos, state.get_position());
192 true
193 }
194 else {
195 false
196 }
197 }
198 else {
199 false
200 }
201 }
202
203 fn lex_operators_and_punctuation<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
204 if let Some(ch) = state.peek() {
205 let start_pos = state.get_position();
206
207 let kind = match ch {
208 '+' => {
209 state.advance(1);
210 PascalTokenType::Plus
211 }
212 '-' => {
213 state.advance(1);
214 PascalTokenType::Minus
215 }
216 '*' => {
217 state.advance(1);
218 PascalTokenType::Multiply
219 }
220 '/' => {
221 state.advance(1);
222 PascalTokenType::Divide
223 }
224 '=' => {
225 state.advance(1);
226 PascalTokenType::Equal
227 }
228 '<' => {
229 state.advance(1);
230 if let Some('=') = state.peek() {
231 state.advance(1);
232 PascalTokenType::LessEqual
233 }
234 else if let Some('>') = state.peek() {
235 state.advance(1);
236 PascalTokenType::NotEqual
237 }
238 else {
239 PascalTokenType::Less
240 }
241 }
242 '>' => {
243 state.advance(1);
244 if let Some('=') = state.peek() {
245 state.advance(1);
246 PascalTokenType::GreaterEqual
247 }
248 else {
249 PascalTokenType::Greater
250 }
251 }
252 ':' => {
253 state.advance(1);
254 if let Some('=') = state.peek() {
255 state.advance(1);
256 PascalTokenType::Assign
257 }
258 else {
259 PascalTokenType::Colon
260 }
261 }
262 ';' => {
263 state.advance(1);
264 PascalTokenType::Semicolon
265 }
266 ',' => {
267 state.advance(1);
268 PascalTokenType::Comma
269 }
270 '.' => {
271 state.advance(1);
272 if let Some('.') = state.peek() {
273 state.advance(1);
274 PascalTokenType::Range
275 }
276 else {
277 PascalTokenType::Dot
278 }
279 }
280 '(' => {
281 state.advance(1);
282 PascalTokenType::LeftParen
283 }
284 ')' => {
285 state.advance(1);
286 PascalTokenType::RightParen
287 }
288 '[' => {
289 state.advance(1);
290 PascalTokenType::LeftBracket
291 }
292 ']' => {
293 state.advance(1);
294 PascalTokenType::RightBracket
295 }
296 '^' => {
297 state.advance(1);
298 PascalTokenType::Caret
299 }
300 '\n' => {
301 state.advance(1);
302 PascalTokenType::Newline
303 }
304 _ => {
305 state.advance(ch.len_utf8());
306 PascalTokenType::Error
307 }
308 };
309
310 state.add_token(kind, start_pos, state.get_position());
311 true
312 }
313 else {
314 false
315 }
316 }
317}
318
319impl Lexer<PascalLanguage> for PascalLexer<'_> {
320 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PascalLanguage>) -> LexOutput<PascalLanguage> {
321 let mut state = State::new(source);
322 let result = self.run(&mut state);
323 if result.is_ok() {
324 state.add_eof();
325 }
326 state.finish_with_cache(result, cache)
327 }
328}
329
330impl PascalLexer<'_> {
331 fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
332 let safe_point = state.get_position();
333 while state.not_at_end() {
334 if self.skip_whitespace(state) {
336 continue;
337 }
338
339 if self.skip_comment(state) {
341 continue;
342 }
343
344 if self.lex_string(state) {
346 continue;
347 }
348
349 if self.lex_identifier_or_keyword(state) {
351 continue;
352 }
353
354 if self.lex_number(state) {
356 continue;
357 }
358
359 if self.lex_operators_and_punctuation(state) {
361 continue;
362 }
363
364 let start_pos = state.get_position();
366 if let Some(ch) = state.peek() {
367 state.advance(ch.len_utf8());
368 state.add_token(PascalTokenType::Error, start_pos, state.get_position());
369 }
370
371 state.advance_if_dead_lock(safe_point);
372 }
373
374 Ok(())
376 }
377}