1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::OCamlLanguage, lexer::token_type::OCamlTokenType};
5use oak_core::{
6 Lexer, LexerCache, LexerState, OakError,
7 lexer::{CommentConfig, LexOutput, WhitespaceConfig},
8 source::Source,
9};
10use std::sync::LazyLock;
11
12pub(crate) type State<'a, S> = LexerState<'a, S, OCamlLanguage>;
13
14static OCAML_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
15static OCAML_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "(*", block_end: "*)", nested_blocks: true });
16
/// Lexer that turns OCaml source text into `OCamlTokenType` tokens.
///
/// The lexer only borrows its language configuration, so it is cheap to clone.
#[derive(Clone, Debug)]
pub struct OCamlLexer<'config> {
    /// Borrowed language configuration; stored by `new`, and not read by any of the
    /// scanning methods in this file.
    config: &'config OCamlLanguage,
}
21
22impl<'config> Lexer<OCamlLanguage> for OCamlLexer<'config> {
23 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<OCamlLanguage>) -> LexOutput<OCamlLanguage> {
24 let mut state = State::new_with_cache(source, 0, cache);
25 let result = self.run(&mut state);
26 if result.is_ok() {
27 state.add_eof()
28 }
29 state.finish_with_cache(result, cache)
30 }
31}
32
33impl<'config> OCamlLexer<'config> {
34 pub fn new(config: &'config OCamlLanguage) -> Self {
35 Self { config }
36 }
37
38 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
40 while state.not_at_end() {
41 let safe_point = state.get_position();
42
43 if self.skip_whitespace(state) {
44 continue;
45 }
46
47 if self.skip_comment(state) {
48 continue;
49 }
50
51 if self.lex_string_literal(state) {
52 continue;
53 }
54
55 if self.lex_char_literal(state) {
56 continue;
57 }
58
59 if self.lex_number_literal(state) {
60 continue;
61 }
62
63 if self.lex_identifier_or_keyword(state) {
64 continue;
65 }
66
67 if self.lex_operators(state) {
68 continue;
69 }
70
71 if self.lex_single_char_tokens(state) {
72 continue;
73 }
74
75 state.advance_if_dead_lock(safe_point)
76 }
77
78 Ok(())
79 }
80
81 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
83 OCAML_WHITESPACE.scan(state, OCamlTokenType::Whitespace)
84 }
85
86 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
87 OCAML_COMMENT.scan(state, OCamlTokenType::Comment, OCamlTokenType::Comment)
88 }
89
90 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
91 let start = state.get_position();
92 if state.current() != Some('"') {
93 return false;
94 }
95
96 state.advance(1); let mut escaped = false;
98 while let Some(ch) = state.peek() {
99 if ch == '"' && !escaped {
100 state.advance(1); break;
102 }
103 state.advance(ch.len_utf8());
104 if escaped {
105 escaped = false;
106 continue;
107 }
108 if ch == '\\' {
109 escaped = true;
110 continue;
111 }
112 if ch == '\n' || ch == '\r' {
113 break;
114 }
115 }
116 state.add_token(OCamlTokenType::StringLiteral, start, state.get_position());
117 true
118 }
119
120 fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
121 let start = state.get_position();
122 if state.current() != Some('\'') {
123 return false;
124 }
125
126 state.advance(1); if let Some('\\') = state.peek() {
128 state.advance(1);
129 if let Some(c) = state.peek() {
130 state.advance(c.len_utf8())
131 }
132 }
133 else if let Some(c) = state.peek() {
134 state.advance(c.len_utf8())
135 }
136 else {
137 state.set_position(start);
138 return false;
139 }
140
141 if state.peek() == Some('\'') {
142 state.advance(1);
143 state.add_token(OCamlTokenType::CharLiteral, start, state.get_position());
144 return true;
145 }
146
147 state.set_position(start);
148 false
149 }
150
151 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
152 let start = state.get_position();
153 let first = match state.current() {
154 Some(c) => c,
155 None => return false,
156 };
157
158 if !first.is_ascii_digit() {
159 return false;
160 }
161
162 let mut is_float = false;
163
164 state.advance(1);
166 while let Some(c) = state.peek() {
167 if c.is_ascii_digit() {
168 state.advance(1);
169 }
170 else {
171 break;
172 }
173 }
174
175 if state.peek() == Some('.') {
177 let n1 = state.peek_next_n(1);
178 if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
179 is_float = true;
180 state.advance(1); while let Some(c) = state.peek() {
182 if c.is_ascii_digit() {
183 state.advance(1);
184 }
185 else {
186 break;
187 }
188 }
189 }
190 }
191
192 if let Some(c) = state.peek() {
194 if c == 'e' || c == 'E' {
195 let n1 = state.peek_next_n(1);
196 if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
197 is_float = true;
198 state.advance(1);
199 if let Some(sign) = state.peek() {
200 if sign == '+' || sign == '-' {
201 state.advance(1);
202 }
203 }
204 while let Some(d) = state.peek() {
205 if d.is_ascii_digit() {
206 state.advance(1);
207 }
208 else {
209 break;
210 }
211 }
212 }
213 }
214 }
215
216 let end = state.get_position();
217 state.add_token(if is_float { OCamlTokenType::FloatLiteral } else { OCamlTokenType::IntegerLiteral }, start, end);
218 true
219 }
220
221 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
222 let start = state.get_position();
223 let ch = match state.current() {
224 Some(c) => c,
225 None => return false,
226 };
227
228 if !(ch.is_ascii_alphabetic() || ch == '_') {
229 return false;
230 }
231
232 state.advance(1);
233 while let Some(c) = state.current() {
234 if c.is_ascii_alphanumeric() || c == '_' || c == '\'' { state.advance(1) } else { break }
235 }
236
237 let end = state.get_position();
238 let text = state.get_text_in((start..end).into());
239 let kind = match text.as_ref() {
240 "and" => OCamlTokenType::And,
242 "as" => OCamlTokenType::As,
243 "assert" => OCamlTokenType::Assert,
244 "begin" => OCamlTokenType::Begin,
245 "class" => OCamlTokenType::Class,
246 "constraint" => OCamlTokenType::Constraint,
247 "do" => OCamlTokenType::Do,
248 "done" => OCamlTokenType::Done,
249 "downto" => OCamlTokenType::Downto,
250 "else" => OCamlTokenType::Else,
251 "end" => OCamlTokenType::End,
252 "exception" => OCamlTokenType::Exception,
253 "external" => OCamlTokenType::External,
254 "false" => OCamlTokenType::False,
255 "for" => OCamlTokenType::For,
256 "fun" => OCamlTokenType::Fun,
257 "function" => OCamlTokenType::Function,
258 "functor" => OCamlTokenType::Functor,
259 "if" => OCamlTokenType::If,
260 "in" => OCamlTokenType::In,
261 "include" => OCamlTokenType::Include,
262 "inherit" => OCamlTokenType::Inherit,
263 "initializer" => OCamlTokenType::Initializer,
264 "lazy" => OCamlTokenType::Lazy,
265 "let" => OCamlTokenType::Let,
266 "match" => OCamlTokenType::Match,
267 "method" => OCamlTokenType::Method,
268 "module" => OCamlTokenType::Module,
269 "mutable" => OCamlTokenType::Mutable,
270 "new" => OCamlTokenType::New,
271 "object" => OCamlTokenType::Object,
272 "of" => OCamlTokenType::Of,
273 "open" => OCamlTokenType::Open,
274 "or" => OCamlTokenType::Or,
275 "private" => OCamlTokenType::Private,
276 "rec" => OCamlTokenType::Rec,
277 "sig" => OCamlTokenType::Sig,
278 "struct" => OCamlTokenType::Struct,
279 "then" => OCamlTokenType::Then,
280 "to" => OCamlTokenType::To,
281 "true" => OCamlTokenType::True,
282 "try" => OCamlTokenType::Try,
283 "type" => OCamlTokenType::Type,
284 "val" => OCamlTokenType::Val,
285 "virtual" => OCamlTokenType::Virtual,
286 "when" => OCamlTokenType::When,
287 "while" => OCamlTokenType::While,
288 "with" => OCamlTokenType::With,
289
290 _ => OCamlTokenType::Identifier,
291 };
292
293 state.add_token(kind, start, state.get_position());
294 true
295 }
296
297 fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
298 let start = state.get_position();
299 let rest = state.rest();
300
301 let patterns: &[(&str, OCamlTokenType)] = &[
303 ("==", OCamlTokenType::EqualEqual),
304 ("!=", OCamlTokenType::NotEqual),
305 (">=", OCamlTokenType::GreaterEqual),
306 ("<=", OCamlTokenType::LessEqual),
307 ("&&", OCamlTokenType::AndAnd),
308 ("||", OCamlTokenType::OrOr),
309 ("::", OCamlTokenType::ColonColon),
310 ("->", OCamlTokenType::RightArrow),
311 ("<-", OCamlTokenType::LeftArrow),
312 ("-.", OCamlTokenType::MinusDot),
313 ];
314
315 for (pat, kind) in patterns {
316 if rest.starts_with(pat) {
317 state.advance(pat.len());
318 state.add_token(*kind, start, state.get_position());
319 return true;
320 }
321 }
322
323 if let Some(ch) = state.current() {
324 let kind = match ch {
325 '+' => Some(OCamlTokenType::Plus),
326 '-' => Some(OCamlTokenType::Minus),
327 '*' => Some(OCamlTokenType::Star),
328 '/' => Some(OCamlTokenType::Slash),
329 '%' => Some(OCamlTokenType::Percent),
330 '=' => Some(OCamlTokenType::Equal),
331 '>' => Some(OCamlTokenType::Greater),
332 '<' => Some(OCamlTokenType::Less),
333 '!' => Some(OCamlTokenType::Bang),
334 '?' => Some(OCamlTokenType::Question),
335 ':' => Some(OCamlTokenType::Colon),
336 ';' => Some(OCamlTokenType::Semicolon),
337 ',' => Some(OCamlTokenType::Comma),
338 '.' => Some(OCamlTokenType::Dot),
339 '|' => Some(OCamlTokenType::Pipe),
340 '&' => Some(OCamlTokenType::Ampersand),
341 '^' => Some(OCamlTokenType::Caret),
342 '~' => Some(OCamlTokenType::Tilde),
343 '@' => Some(OCamlTokenType::At),
344 '#' => Some(OCamlTokenType::Hash),
345 '$' => Some(OCamlTokenType::Dollar),
346 '`' => Some(OCamlTokenType::Backtick),
347 _ => None,
348 };
349
350 if let Some(k) = kind {
351 state.advance(ch.len_utf8());
352 state.add_token(k, start, state.get_position());
353 return true;
354 }
355 }
356
357 false
358 }
359
360 fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
361 let start = state.get_position();
362 if let Some(ch) = state.current() {
363 let kind = match ch {
364 '(' => OCamlTokenType::LeftParen,
365 ')' => OCamlTokenType::RightParen,
366 '[' => OCamlTokenType::LeftBracket,
367 ']' => OCamlTokenType::RightBracket,
368 '{' => OCamlTokenType::LeftBrace,
369 '}' => OCamlTokenType::RightBrace,
370 _ => return false,
371 };
372
373 state.advance(ch.len_utf8());
374 state.add_token(kind, start, state.get_position());
375 true
376 }
377 else {
378 false
379 }
380 }
381}