1use crate::{kind::OCamlSyntaxKind, language::OCamlLanguage};
2use oak_core::{
3 Lexer, LexerCache, LexerState, OakError,
4 lexer::{CommentConfig, LexOutput, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
9type State<'a, S> = LexerState<'a, S, OCamlLanguage>;
10
11static OCAML_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static OCAML_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "(*", block_end: "*)", nested_blocks: true });
13
14#[derive(Clone, Debug)]
15pub struct OCamlLexer<'config> {
16 _config: &'config OCamlLanguage,
17}
18
19impl<'config> Lexer<OCamlLanguage> for OCamlLexer<'config> {
20 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<OCamlLanguage>) -> LexOutput<OCamlLanguage> {
21 let mut state = State::new_with_cache(source, 0, cache);
22 let result = self.run(&mut state);
23 if result.is_ok() {
24 state.add_eof();
25 }
26 state.finish_with_cache(result, cache)
27 }
28}
29
30impl<'config> OCamlLexer<'config> {
31 pub fn new(config: &'config OCamlLanguage) -> Self {
32 Self { _config: config }
33 }
34
35 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
37 while state.not_at_end() {
38 let safe_point = state.get_position();
39
40 if self.skip_whitespace(state) {
41 continue;
42 }
43
44 if self.skip_comment(state) {
45 continue;
46 }
47
48 if self.lex_string_literal(state) {
49 continue;
50 }
51
52 if self.lex_char_literal(state) {
53 continue;
54 }
55
56 if self.lex_number_literal(state) {
57 continue;
58 }
59
60 if self.lex_identifier_or_keyword(state) {
61 continue;
62 }
63
64 if self.lex_operators(state) {
65 continue;
66 }
67
68 if self.lex_single_char_tokens(state) {
69 continue;
70 }
71
72 state.advance_if_dead_lock(safe_point);
73 }
74
75 Ok(())
76 }
77
78 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
80 OCAML_WHITESPACE.scan(state, OCamlSyntaxKind::Whitespace)
81 }
82
83 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
84 OCAML_COMMENT.scan(state, OCamlSyntaxKind::Comment, OCamlSyntaxKind::Comment)
85 }
86
87 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
88 let start = state.get_position();
89 if state.current() != Some('"') {
90 return false;
91 }
92
93 state.advance(1); let mut escaped = false;
95 while let Some(ch) = state.peek() {
96 if ch == '"' && !escaped {
97 state.advance(1); break;
99 }
100 state.advance(ch.len_utf8());
101 if escaped {
102 escaped = false;
103 continue;
104 }
105 if ch == '\\' {
106 escaped = true;
107 continue;
108 }
109 if ch == '\n' || ch == '\r' {
110 break;
111 }
112 }
113 state.add_token(OCamlSyntaxKind::StringLiteral, start, state.get_position());
114 true
115 }
116
117 fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
118 let start = state.get_position();
119 if state.current() != Some('\'') {
120 return false;
121 }
122
123 state.advance(1); if let Some('\\') = state.peek() {
125 state.advance(1);
126 if let Some(c) = state.peek() {
127 state.advance(c.len_utf8());
128 }
129 }
130 else if let Some(c) = state.peek() {
131 state.advance(c.len_utf8());
132 }
133 else {
134 state.set_position(start);
135 return false;
136 }
137
138 if state.peek() == Some('\'') {
139 state.advance(1);
140 state.add_token(OCamlSyntaxKind::CharLiteral, start, state.get_position());
141 return true;
142 }
143
144 state.set_position(start);
145 false
146 }
147
148 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
149 let start = state.get_position();
150 let first = match state.current() {
151 Some(c) => c,
152 None => return false,
153 };
154
155 if !first.is_ascii_digit() {
156 return false;
157 }
158
159 let mut is_float = false;
160
161 state.advance(1);
163 while let Some(c) = state.peek() {
164 if c.is_ascii_digit() {
165 state.advance(1);
166 }
167 else {
168 break;
169 }
170 }
171
172 if state.peek() == Some('.') {
174 let n1 = state.peek_next_n(1);
175 if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
176 is_float = true;
177 state.advance(1); while let Some(c) = state.peek() {
179 if c.is_ascii_digit() {
180 state.advance(1);
181 }
182 else {
183 break;
184 }
185 }
186 }
187 }
188
189 if let Some(c) = state.peek() {
191 if c == 'e' || c == 'E' {
192 let n1 = state.peek_next_n(1);
193 if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
194 is_float = true;
195 state.advance(1);
196 if let Some(sign) = state.peek() {
197 if sign == '+' || sign == '-' {
198 state.advance(1);
199 }
200 }
201 while let Some(d) = state.peek() {
202 if d.is_ascii_digit() {
203 state.advance(1);
204 }
205 else {
206 break;
207 }
208 }
209 }
210 }
211 }
212
213 let end = state.get_position();
214 state.add_token(if is_float { OCamlSyntaxKind::FloatLiteral } else { OCamlSyntaxKind::IntegerLiteral }, start, end);
215 true
216 }
217
218 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
219 let start = state.get_position();
220 let ch = match state.current() {
221 Some(c) => c,
222 None => return false,
223 };
224
225 if !(ch.is_ascii_alphabetic() || ch == '_') {
226 return false;
227 }
228
229 state.advance(1);
230 while let Some(c) = state.current() {
231 if c.is_ascii_alphanumeric() || c == '_' || c == '\'' {
232 state.advance(1);
233 }
234 else {
235 break;
236 }
237 }
238
239 let end = state.get_position();
240 let text = state.get_text_in((start..end).into());
241 let kind = match text.as_ref() {
242 "and" => OCamlSyntaxKind::And,
244 "as" => OCamlSyntaxKind::As,
245 "assert" => OCamlSyntaxKind::Assert,
246 "begin" => OCamlSyntaxKind::Begin,
247 "class" => OCamlSyntaxKind::Class,
248 "constraint" => OCamlSyntaxKind::Constraint,
249 "do" => OCamlSyntaxKind::Do,
250 "done" => OCamlSyntaxKind::Done,
251 "downto" => OCamlSyntaxKind::Downto,
252 "else" => OCamlSyntaxKind::Else,
253 "end" => OCamlSyntaxKind::End,
254 "exception" => OCamlSyntaxKind::Exception,
255 "external" => OCamlSyntaxKind::External,
256 "false" => OCamlSyntaxKind::False,
257 "for" => OCamlSyntaxKind::For,
258 "fun" => OCamlSyntaxKind::Fun,
259 "function" => OCamlSyntaxKind::Function,
260 "functor" => OCamlSyntaxKind::Functor,
261 "if" => OCamlSyntaxKind::If,
262 "in" => OCamlSyntaxKind::In,
263 "include" => OCamlSyntaxKind::Include,
264 "inherit" => OCamlSyntaxKind::Inherit,
265 "initializer" => OCamlSyntaxKind::Initializer,
266 "lazy" => OCamlSyntaxKind::Lazy,
267 "let" => OCamlSyntaxKind::Let,
268 "match" => OCamlSyntaxKind::Match,
269 "method" => OCamlSyntaxKind::Method,
270 "module" => OCamlSyntaxKind::Module,
271 "mutable" => OCamlSyntaxKind::Mutable,
272 "new" => OCamlSyntaxKind::New,
273 "object" => OCamlSyntaxKind::Object,
274 "of" => OCamlSyntaxKind::Of,
275 "open" => OCamlSyntaxKind::Open,
276 "or" => OCamlSyntaxKind::Or,
277 "private" => OCamlSyntaxKind::Private,
278 "rec" => OCamlSyntaxKind::Rec,
279 "sig" => OCamlSyntaxKind::Sig,
280 "struct" => OCamlSyntaxKind::Struct,
281 "then" => OCamlSyntaxKind::Then,
282 "to" => OCamlSyntaxKind::To,
283 "true" => OCamlSyntaxKind::True,
284 "try" => OCamlSyntaxKind::Try,
285 "type" => OCamlSyntaxKind::Type,
286 "val" => OCamlSyntaxKind::Val,
287 "virtual" => OCamlSyntaxKind::Virtual,
288 "when" => OCamlSyntaxKind::When,
289 "while" => OCamlSyntaxKind::While,
290 "with" => OCamlSyntaxKind::With,
291
292 _ => OCamlSyntaxKind::Identifier,
293 };
294
295 state.add_token(kind, start, state.get_position());
296 true
297 }
298
299 fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
300 let start = state.get_position();
301 let rest = state.rest();
302
303 let patterns: &[(&str, OCamlSyntaxKind)] = &[
305 ("==", OCamlSyntaxKind::EqualEqual),
306 ("!=", OCamlSyntaxKind::NotEqual),
307 (">=", OCamlSyntaxKind::GreaterEqual),
308 ("<=", OCamlSyntaxKind::LessEqual),
309 ("&&", OCamlSyntaxKind::AndAnd),
310 ("||", OCamlSyntaxKind::OrOr),
311 ("::", OCamlSyntaxKind::ColonColon),
312 ("->", OCamlSyntaxKind::RightArrow),
313 ("<-", OCamlSyntaxKind::LeftArrow),
314 ];
315
316 for (pat, kind) in patterns {
317 if rest.starts_with(pat) {
318 state.advance(pat.len());
319 state.add_token(*kind, start, state.get_position());
320 return true;
321 }
322 }
323
324 if let Some(ch) = state.current() {
325 let kind = match ch {
326 '+' => Some(OCamlSyntaxKind::Plus),
327 '-' => Some(OCamlSyntaxKind::Minus),
328 '*' => Some(OCamlSyntaxKind::Star),
329 '/' => Some(OCamlSyntaxKind::Slash),
330 '%' => Some(OCamlSyntaxKind::Percent),
331 '=' => Some(OCamlSyntaxKind::Equal),
332 '>' => Some(OCamlSyntaxKind::Greater),
333 '<' => Some(OCamlSyntaxKind::Less),
334 '!' => Some(OCamlSyntaxKind::Bang),
335 '?' => Some(OCamlSyntaxKind::Question),
336 ':' => Some(OCamlSyntaxKind::Colon),
337 ';' => Some(OCamlSyntaxKind::Semicolon),
338 ',' => Some(OCamlSyntaxKind::Comma),
339 '.' => Some(OCamlSyntaxKind::Dot),
340 '|' => Some(OCamlSyntaxKind::Pipe),
341 '&' => Some(OCamlSyntaxKind::Ampersand),
342 '^' => Some(OCamlSyntaxKind::Caret),
343 '~' => Some(OCamlSyntaxKind::Tilde),
344 '@' => Some(OCamlSyntaxKind::At),
345 '#' => Some(OCamlSyntaxKind::Hash),
346 '$' => Some(OCamlSyntaxKind::Dollar),
347 '`' => Some(OCamlSyntaxKind::Backtick),
348 _ => None,
349 };
350
351 if let Some(k) = kind {
352 state.advance(ch.len_utf8());
353 state.add_token(k, start, state.get_position());
354 return true;
355 }
356 }
357
358 false
359 }
360
361 fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
362 let start = state.get_position();
363 if let Some(ch) = state.current() {
364 let kind = match ch {
365 '(' => OCamlSyntaxKind::LeftParen,
366 ')' => OCamlSyntaxKind::RightParen,
367 '[' => OCamlSyntaxKind::LeftBracket,
368 ']' => OCamlSyntaxKind::RightBracket,
369 '{' => OCamlSyntaxKind::LeftBrace,
370 '}' => OCamlSyntaxKind::RightBrace,
371 _ => return false,
372 };
373
374 state.advance(ch.len_utf8());
375 state.add_token(kind, start, state.get_position());
376 true
377 }
378 else {
379 false
380 }
381 }
382}