1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::PascalLanguage, lexer::token_type::PascalTokenType};
5use oak_core::{
6 Lexer, LexerCache, LexerState, OakError,
7 lexer::{CommentConfig, LexOutput, WhitespaceConfig},
8 source::Source,
9};
10use std::sync::LazyLock;
11
12type State<'s, S> = LexerState<'s, S, PascalLanguage>;
13
14static PASCAL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
15static PASCAL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "{", block_end: "}", nested_blocks: false });
16
17#[derive(Clone, Debug)]
18pub struct PascalLexer<'config> {
19 _config: &'config PascalLanguage,
20}
21
22impl<'config> PascalLexer<'config> {
23 pub fn new(config: &'config PascalLanguage) -> Self {
24 Self { _config: config }
25 }
26
27 fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
28 PASCAL_WHITESPACE.scan(state, PascalTokenType::Whitespace)
29 }
30
31 fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
32 let start = state.get_position();
33
34 if state.rest().starts_with("//") {
36 return PASCAL_COMMENT.scan(state, PascalTokenType::Comment, PascalTokenType::Comment);
37 }
38
39 if state.current() == Some('{') {
41 state.advance(1);
42 while let Some(ch) = state.peek() {
43 if ch == '}' {
44 state.advance(1);
45 break;
46 }
47 state.advance(ch.len_utf8());
48 }
49 state.add_token(PascalTokenType::Comment, start, state.get_position());
50 return true;
51 }
52
53 if state.rest().starts_with("(*") {
55 state.advance(2);
56 while let Some(ch) = state.peek() {
57 if ch == '*' && state.peek_next_n(1) == Some(')') {
58 state.advance(2);
59 break;
60 }
61 state.advance(ch.len_utf8());
62 }
63 state.add_token(PascalTokenType::Comment, start, state.get_position());
64 return true;
65 }
66
67 false
68 }
69
70 fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
71 let start = state.get_position();
72
73 if state.current() == Some('\'') {
75 state.advance(1);
76 while let Some(ch) = state.peek() {
77 if ch == '\'' {
78 if state.peek_next_n(1) == Some('\'') {
80 state.advance(2); continue;
82 }
83 else {
84 state.advance(1); break;
86 }
87 }
88 state.advance(ch.len_utf8());
89 }
90 state.add_token(PascalTokenType::StringLiteral, start, state.get_position());
91 return true;
92 }
93 false
94 }
95
96 fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
97 if let Some(ch) = state.peek() {
98 if ch.is_alphabetic() || ch == '_' {
99 let start_pos = state.get_position();
100 let mut text = String::new();
101
102 while let Some(ch) = state.peek() {
104 if ch.is_alphanumeric() || ch == '_' {
105 text.push(ch);
106 state.advance(ch.len_utf8());
107 }
108 else {
109 break;
110 }
111 }
112
113 let kind = match text.to_lowercase().as_str() {
115 "program" => PascalTokenType::Program,
116 "var" => PascalTokenType::Var,
117 "const" => PascalTokenType::Const,
118 "type" => PascalTokenType::Type,
119 "procedure" => PascalTokenType::Procedure,
120 "function" => PascalTokenType::Function,
121 "begin" => PascalTokenType::Begin,
122 "end" => PascalTokenType::End,
123 "if" => PascalTokenType::If,
124 "then" => PascalTokenType::Then,
125 "else" => PascalTokenType::Else,
126 "while" => PascalTokenType::While,
127 "do" => PascalTokenType::Do,
128 "for" => PascalTokenType::For,
129 "to" => PascalTokenType::To,
130 "downto" => PascalTokenType::Downto,
131 "repeat" => PascalTokenType::Repeat,
132 "until" => PascalTokenType::Until,
133 "case" => PascalTokenType::Case,
134 "of" => PascalTokenType::Of,
135 "with" => PascalTokenType::With,
136 "record" => PascalTokenType::Record,
137 "array" => PascalTokenType::Array,
138 "set" => PascalTokenType::Set,
139 "file" => PascalTokenType::File,
140 "packed" => PascalTokenType::Packed,
141 "nil" => PascalTokenType::Nil,
142 "true" => PascalTokenType::True,
143 "false" => PascalTokenType::False,
144 "and" => PascalTokenType::And,
145 "or" => PascalTokenType::Or,
146 "not" => PascalTokenType::Not,
147 "div" => PascalTokenType::Div,
148 "mod" => PascalTokenType::Mod,
149 "in" => PascalTokenType::In,
150
151 _ => PascalTokenType::Identifier,
152 };
153
154 state.add_token(kind, start_pos, state.get_position());
155 true
156 }
157 else {
158 false
159 }
160 }
161 else {
162 false
163 }
164 }
165
166 fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
167 if let Some(ch) = state.peek() {
168 if ch.is_ascii_digit() {
169 let start_pos = state.get_position();
170 let mut has_dot = false;
171
172 while let Some(ch) = state.peek() {
174 if ch.is_ascii_digit() {
175 state.advance(1);
176 }
177 else if ch == '.' && !has_dot {
178 has_dot = true;
179 state.advance(1);
180 }
181 else {
182 break;
183 }
184 }
185
186 let kind = if has_dot { PascalTokenType::RealLiteral } else { PascalTokenType::IntegerLiteral };
187
188 state.add_token(kind, start_pos, state.get_position());
189 true
190 }
191 else {
192 false
193 }
194 }
195 else {
196 false
197 }
198 }
199
200 fn lex_operators_and_punctuation<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
201 if let Some(ch) = state.peek() {
202 let start_pos = state.get_position();
203
204 let kind = match ch {
205 '+' => {
206 state.advance(1);
207 PascalTokenType::Plus
208 }
209 '-' => {
210 state.advance(1);
211 PascalTokenType::Minus
212 }
213 '*' => {
214 state.advance(1);
215 PascalTokenType::Multiply
216 }
217 '/' => {
218 state.advance(1);
219 PascalTokenType::Divide
220 }
221 '=' => {
222 state.advance(1);
223 PascalTokenType::Equal
224 }
225 '<' => {
226 state.advance(1);
227 if let Some('=') = state.peek() {
228 state.advance(1);
229 PascalTokenType::LessEqual
230 }
231 else if let Some('>') = state.peek() {
232 state.advance(1);
233 PascalTokenType::NotEqual
234 }
235 else {
236 PascalTokenType::Less
237 }
238 }
239 '>' => {
240 state.advance(1);
241 if let Some('=') = state.peek() {
242 state.advance(1);
243 PascalTokenType::GreaterEqual
244 }
245 else {
246 PascalTokenType::Greater
247 }
248 }
249 ':' => {
250 state.advance(1);
251 if let Some('=') = state.peek() {
252 state.advance(1);
253 PascalTokenType::Assign
254 }
255 else {
256 PascalTokenType::Colon
257 }
258 }
259 ';' => {
260 state.advance(1);
261 PascalTokenType::Semicolon
262 }
263 ',' => {
264 state.advance(1);
265 PascalTokenType::Comma
266 }
267 '.' => {
268 state.advance(1);
269 if let Some('.') = state.peek() {
270 state.advance(1);
271 PascalTokenType::Range
272 }
273 else {
274 PascalTokenType::Dot
275 }
276 }
277 '(' => {
278 state.advance(1);
279 PascalTokenType::LeftParen
280 }
281 ')' => {
282 state.advance(1);
283 PascalTokenType::RightParen
284 }
285 '[' => {
286 state.advance(1);
287 PascalTokenType::LeftBracket
288 }
289 ']' => {
290 state.advance(1);
291 PascalTokenType::RightBracket
292 }
293 '^' => {
294 state.advance(1);
295 PascalTokenType::Caret
296 }
297 '\n' => {
298 state.advance(1);
299 PascalTokenType::Newline
300 }
301 _ => {
302 state.advance(ch.len_utf8());
303 PascalTokenType::Error
304 }
305 };
306
307 state.add_token(kind, start_pos, state.get_position());
308 true
309 }
310 else {
311 false
312 }
313 }
314}
315
316impl Lexer<PascalLanguage> for PascalLexer<'_> {
317 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PascalLanguage>) -> LexOutput<PascalLanguage> {
318 let mut state = State::new(source);
319 let result = self.run(&mut state);
320 if result.is_ok() {
321 state.add_eof();
322 }
323 state.finish_with_cache(result, cache)
324 }
325}
326
327impl PascalLexer<'_> {
328 fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
329 let safe_point = state.get_position();
330 while state.not_at_end() {
331 if self.skip_whitespace(state) {
333 continue;
334 }
335
336 if self.skip_comment(state) {
338 continue;
339 }
340
341 if self.lex_string(state) {
343 continue;
344 }
345
346 if self.lex_identifier_or_keyword(state) {
348 continue;
349 }
350
351 if self.lex_number(state) {
353 continue;
354 }
355
356 if self.lex_operators_and_punctuation(state) {
358 continue;
359 }
360
361 let start_pos = state.get_position();
363 if let Some(ch) = state.peek() {
364 state.advance(ch.len_utf8());
365 state.add_token(PascalTokenType::Error, start_pos, state.get_position());
366 }
367
368 state.advance_if_dead_lock(safe_point);
369 }
370
371 Ok(())
373 }
374}