1#![doc = include_str!("readme.md")]
2pub mod token_type;
4
5use crate::{language::OCamlLanguage, lexer::token_type::OCamlTokenType};
6use oak_core::{
7 Lexer, LexerCache, LexerState, OakError,
8 lexer::{CommentConfig, LexOutput, WhitespaceConfig},
9 source::Source,
10};
11use std::sync::LazyLock;
12
/// Shorthand for the `oak_core` lexer state specialised to [`OCamlLanguage`]; every scanning
/// routine in this module takes a `&mut State`.
pub(crate) type State<'a, S> = LexerState<'a, S, OCamlLanguage>;
14
/// Whitespace scanning configuration, built lazily on first use; `unicode_whitespace` is enabled.
static OCAML_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scanning configuration, built lazily on first use: `(*` ... `*)` block comments with
/// `nested_blocks` enabled.
///
/// NOTE(review): `line_marker` is `//`, but OCaml has no line-comment syntax and `//` can be a
/// user-defined infix operator, so such code would be lexed as a comment. Confirm how
/// `CommentConfig` treats an empty marker before removing it.
static OCAML_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "(*", block_end: "*)", nested_blocks: true });
17
/// Lexer that turns OCaml source text into `OCamlTokenType` tokens.
///
/// It only borrows its language configuration, so cloning it is cheap.
#[derive(Clone, Debug)]
pub struct OCamlLexer<'config> {
    /// Language configuration supplied to [`OCamlLexer::new`].
    config: &'config OCamlLanguage,
}
23
24impl<'config> Lexer<OCamlLanguage> for OCamlLexer<'config> {
25 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<OCamlLanguage>) -> LexOutput<OCamlLanguage> {
26 let mut state = State::new_with_cache(source, 0, cache);
27 let result = self.run(&mut state);
28 if result.is_ok() {
29 state.add_eof()
30 }
31 state.finish_with_cache(result, cache)
32 }
33}
34
35impl<'config> OCamlLexer<'config> {
36 pub fn new(config: &'config OCamlLanguage) -> Self {
38 Self { config }
39 }
40
41 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
43 while state.not_at_end() {
44 let safe_point = state.get_position();
45
46 if self.skip_whitespace(state) {
47 continue;
48 }
49
50 if self.skip_comment(state) {
51 continue;
52 }
53
54 if self.lex_string_literal(state) {
55 continue;
56 }
57
58 if self.lex_char_literal(state) {
59 continue;
60 }
61
62 if self.lex_number_literal(state) {
63 continue;
64 }
65
66 if self.lex_identifier_or_keyword(state) {
67 continue;
68 }
69
70 if self.lex_operators(state) {
71 continue;
72 }
73
74 if self.lex_single_char_tokens(state) {
75 continue;
76 }
77
78 state.advance_if_dead_lock(safe_point)
79 }
80
81 Ok(())
82 }
83
84 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
86 OCAML_WHITESPACE.scan(state, OCamlTokenType::Whitespace)
87 }
88
89 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
90 OCAML_COMMENT.scan(state, OCamlTokenType::Comment, OCamlTokenType::Comment)
91 }
92
93 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
94 let start = state.get_position();
95 if state.current() != Some('"') {
96 return false;
97 }
98
99 state.advance(1); let mut escaped = false;
101 while let Some(ch) = state.peek() {
102 if ch == '"' && !escaped {
103 state.advance(1); break;
105 }
106 state.advance(ch.len_utf8());
107 if escaped {
108 escaped = false;
109 continue;
110 }
111 if ch == '\\' {
112 escaped = true;
113 continue;
114 }
115 if ch == '\n' || ch == '\r' {
116 break;
117 }
118 }
119 state.add_token(OCamlTokenType::StringLiteral, start, state.get_position());
120 true
121 }
122
123 fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
124 let start = state.get_position();
125 if state.current() != Some('\'') {
126 return false;
127 }
128
129 state.advance(1); if let Some('\\') = state.peek() {
131 state.advance(1);
132 if let Some(c) = state.peek() {
133 state.advance(c.len_utf8())
134 }
135 }
136 else if let Some(c) = state.peek() {
137 state.advance(c.len_utf8())
138 }
139 else {
140 state.set_position(start);
141 return false;
142 }
143
144 if state.peek() == Some('\'') {
145 state.advance(1);
146 state.add_token(OCamlTokenType::CharLiteral, start, state.get_position());
147 return true;
148 }
149
150 state.set_position(start);
151 false
152 }
153
154 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
155 let start = state.get_position();
156 let first = match state.current() {
157 Some(c) => c,
158 None => return false,
159 };
160
161 if !first.is_ascii_digit() {
162 return false;
163 }
164
165 let mut is_float = false;
166
167 state.advance(1);
169 while let Some(c) = state.peek() {
170 if c.is_ascii_digit() {
171 state.advance(1);
172 }
173 else {
174 break;
175 }
176 }
177
178 if state.peek() == Some('.') {
180 let n1 = state.peek_next_n(1);
181 if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
182 is_float = true;
183 state.advance(1); while let Some(c) = state.peek() {
185 if c.is_ascii_digit() {
186 state.advance(1);
187 }
188 else {
189 break;
190 }
191 }
192 }
193 }
194
195 if let Some(c) = state.peek() {
197 if c == 'e' || c == 'E' {
198 let n1 = state.peek_next_n(1);
199 if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
200 is_float = true;
201 state.advance(1);
202 if let Some(sign) = state.peek() {
203 if sign == '+' || sign == '-' {
204 state.advance(1);
205 }
206 }
207 while let Some(d) = state.peek() {
208 if d.is_ascii_digit() {
209 state.advance(1);
210 }
211 else {
212 break;
213 }
214 }
215 }
216 }
217 }
218
219 let end = state.get_position();
220 state.add_token(if is_float { OCamlTokenType::FloatLiteral } else { OCamlTokenType::IntegerLiteral }, start, end);
221 true
222 }
223
224 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
225 let start = state.get_position();
226 let ch = match state.current() {
227 Some(c) => c,
228 None => return false,
229 };
230
231 if !(ch.is_ascii_alphabetic() || ch == '_') {
232 return false;
233 }
234
235 state.advance(1);
236 while let Some(c) = state.current() {
237 if c.is_ascii_alphanumeric() || c == '_' || c == '\'' { state.advance(1) } else { break }
238 }
239
240 let end = state.get_position();
241 let text = state.get_text_in((start..end).into());
242 let kind = match text.as_ref() {
243 "and" => OCamlTokenType::And,
245 "as" => OCamlTokenType::As,
246 "assert" => OCamlTokenType::Assert,
247 "begin" => OCamlTokenType::Begin,
248 "class" => OCamlTokenType::Class,
249 "constraint" => OCamlTokenType::Constraint,
250 "do" => OCamlTokenType::Do,
251 "done" => OCamlTokenType::Done,
252 "downto" => OCamlTokenType::Downto,
253 "else" => OCamlTokenType::Else,
254 "end" => OCamlTokenType::End,
255 "exception" => OCamlTokenType::Exception,
256 "external" => OCamlTokenType::External,
257 "false" => OCamlTokenType::False,
258 "for" => OCamlTokenType::For,
259 "fun" => OCamlTokenType::Fun,
260 "function" => OCamlTokenType::Function,
261 "functor" => OCamlTokenType::Functor,
262 "if" => OCamlTokenType::If,
263 "in" => OCamlTokenType::In,
264 "include" => OCamlTokenType::Include,
265 "inherit" => OCamlTokenType::Inherit,
266 "initializer" => OCamlTokenType::Initializer,
267 "lazy" => OCamlTokenType::Lazy,
268 "let" => OCamlTokenType::Let,
269 "match" => OCamlTokenType::Match,
270 "method" => OCamlTokenType::Method,
271 "module" => OCamlTokenType::Module,
272 "mutable" => OCamlTokenType::Mutable,
273 "new" => OCamlTokenType::New,
274 "object" => OCamlTokenType::Object,
275 "of" => OCamlTokenType::Of,
276 "open" => OCamlTokenType::Open,
277 "or" => OCamlTokenType::Or,
278 "private" => OCamlTokenType::Private,
279 "rec" => OCamlTokenType::Rec,
280 "sig" => OCamlTokenType::Sig,
281 "struct" => OCamlTokenType::Struct,
282 "then" => OCamlTokenType::Then,
283 "to" => OCamlTokenType::To,
284 "true" => OCamlTokenType::True,
285 "try" => OCamlTokenType::Try,
286 "type" => OCamlTokenType::Type,
287 "val" => OCamlTokenType::Val,
288 "virtual" => OCamlTokenType::Virtual,
289 "when" => OCamlTokenType::When,
290 "while" => OCamlTokenType::While,
291 "with" => OCamlTokenType::With,
292
293 _ => OCamlTokenType::Identifier,
294 };
295
296 state.add_token(kind, start, state.get_position());
297 true
298 }
299
300 fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
301 let start = state.get_position();
302 let rest = state.rest();
303
304 let patterns: &[(&str, OCamlTokenType)] = &[
306 ("==", OCamlTokenType::EqualEqual),
307 ("!=", OCamlTokenType::NotEqual),
308 (">=", OCamlTokenType::GreaterEqual),
309 ("<=", OCamlTokenType::LessEqual),
310 ("&&", OCamlTokenType::AndAnd),
311 ("||", OCamlTokenType::OrOr),
312 ("::", OCamlTokenType::ColonColon),
313 ("->", OCamlTokenType::RightArrow),
314 ("<-", OCamlTokenType::LeftArrow),
315 ("-.", OCamlTokenType::MinusDot),
316 ];
317
318 for (pat, kind) in patterns {
319 if rest.starts_with(pat) {
320 state.advance(pat.len());
321 state.add_token(*kind, start, state.get_position());
322 return true;
323 }
324 }
325
326 if let Some(ch) = state.current() {
327 let kind = match ch {
328 '+' => Some(OCamlTokenType::Plus),
329 '-' => Some(OCamlTokenType::Minus),
330 '*' => Some(OCamlTokenType::Star),
331 '/' => Some(OCamlTokenType::Slash),
332 '%' => Some(OCamlTokenType::Percent),
333 '=' => Some(OCamlTokenType::Equal),
334 '>' => Some(OCamlTokenType::Greater),
335 '<' => Some(OCamlTokenType::Less),
336 '!' => Some(OCamlTokenType::Bang),
337 '?' => Some(OCamlTokenType::Question),
338 ':' => Some(OCamlTokenType::Colon),
339 ';' => Some(OCamlTokenType::Semicolon),
340 ',' => Some(OCamlTokenType::Comma),
341 '.' => Some(OCamlTokenType::Dot),
342 '|' => Some(OCamlTokenType::Pipe),
343 '&' => Some(OCamlTokenType::Ampersand),
344 '^' => Some(OCamlTokenType::Caret),
345 '~' => Some(OCamlTokenType::Tilde),
346 '@' => Some(OCamlTokenType::At),
347 '#' => Some(OCamlTokenType::Hash),
348 '$' => Some(OCamlTokenType::Dollar),
349 '`' => Some(OCamlTokenType::Backtick),
350 _ => None,
351 };
352
353 if let Some(k) = kind {
354 state.advance(ch.len_utf8());
355 state.add_token(k, start, state.get_position());
356 return true;
357 }
358 }
359
360 false
361 }
362
363 fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
364 let start = state.get_position();
365 if let Some(ch) = state.current() {
366 let kind = match ch {
367 '(' => OCamlTokenType::LeftParen,
368 ')' => OCamlTokenType::RightParen,
369 '[' => OCamlTokenType::LeftBracket,
370 ']' => OCamlTokenType::RightBracket,
371 '{' => OCamlTokenType::LeftBrace,
372 '}' => OCamlTokenType::RightBrace,
373 _ => return false,
374 };
375
376 state.advance(ch.len_utf8());
377 state.add_token(kind, start, state.get_position());
378 true
379 }
380 else {
381 false
382 }
383 }
384}