Skip to main content

oak_ocaml/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// Token types for the OCaml language.
3pub mod token_type;
4
5use crate::{language::OCamlLanguage, lexer::token_type::OCamlTokenType};
6use oak_core::{
7    Lexer, LexerCache, LexerState, OakError,
8    lexer::{CommentConfig, LexOutput, WhitespaceConfig},
9    source::Source,
10};
11use std::sync::LazyLock;
12
/// Lexer state specialised to [`OCamlLanguage`]; every scanning routine in this module takes it by `&mut`.
pub(crate) type State<'a, S> = LexerState<'a, S, OCamlLanguage>;

/// Whitespace scanner configuration, built lazily on first use with `unicode_whitespace` enabled.
static OCAML_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scanner configuration, built lazily on first use: `(*` ... `*)` block comments with `nested_blocks` enabled.
// NOTE(review): `line_marker` is "//", but OCaml has no line-comment syntax; this presumably makes
// `//` start a comment. Confirm whether `CommentConfig` allows disabling line comments.
static OCAML_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "(*", block_end: "*)", nested_blocks: true });
17
/// OCaml lexer implementation.
///
/// Borrows an [`OCamlLanguage`] configuration for the `'config` lifetime and implements
/// [`Lexer`] for that language.
#[derive(Clone, Debug)]
pub struct OCamlLexer<'config> {
    /// Language configuration passed to [`OCamlLexer::new`]; no method in this file reads it.
    config: &'config OCamlLanguage,
}
23
24impl<'config> Lexer<OCamlLanguage> for OCamlLexer<'config> {
25    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<OCamlLanguage>) -> LexOutput<OCamlLanguage> {
26        let mut state = State::new_with_cache(source, 0, cache);
27        let result = self.run(&mut state);
28        if result.is_ok() {
29            state.add_eof()
30        }
31        state.finish_with_cache(result, cache)
32    }
33}
34
35impl<'config> OCamlLexer<'config> {
    /// Creates a new OCaml lexer that borrows `config` for the lifetime `'config`.
    ///
    /// The configuration is only stored; no method in this file reads it.
    pub fn new(config: &'config OCamlLanguage) -> Self {
        Self { config }
    }
40
41    /// Main lexical analysis loop
42    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
43        while state.not_at_end() {
44            let safe_point = state.get_position();
45
46            if self.skip_whitespace(state) {
47                continue;
48            }
49
50            if self.skip_comment(state) {
51                continue;
52            }
53
54            if self.lex_string_literal(state) {
55                continue;
56            }
57
58            if self.lex_char_literal(state) {
59                continue;
60            }
61
62            if self.lex_number_literal(state) {
63                continue;
64            }
65
66            if self.lex_identifier_or_keyword(state) {
67                continue;
68            }
69
70            if self.lex_operators(state) {
71                continue;
72            }
73
74            if self.lex_single_char_tokens(state) {
75                continue;
76            }
77
78            state.advance_if_dead_lock(safe_point)
79        }
80
81        Ok(())
82    }
83
    /// Scans whitespace at the current position using the shared `OCAML_WHITESPACE`
    /// configuration, tagging it as `OCamlTokenType::Whitespace`.
    ///
    /// Returns the scanner's result; the main loop treats `true` as "input was consumed".
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        OCAML_WHITESPACE.scan(state, OCamlTokenType::Whitespace)
    }
88
    /// Scans a comment at the current position using the shared `OCAML_COMMENT` configuration.
    ///
    /// Both token-kind arguments are `OCamlTokenType::Comment`, so every comment the scanner
    /// recognises is tagged the same way (presumably one argument is for line comments and
    /// the other for block comments; confirm against `CommentConfig::scan`).
    /// Returns the scanner's result; the main loop treats `true` as "input was consumed".
    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        OCAML_COMMENT.scan(state, OCamlTokenType::Comment, OCamlTokenType::Comment)
    }
92
93    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
94        let start = state.get_position();
95        if state.current() != Some('"') {
96            return false;
97        }
98
99        state.advance(1); // opening "
100        let mut escaped = false;
101        while let Some(ch) = state.peek() {
102            if ch == '"' && !escaped {
103                state.advance(1); // consume closing quote
104                break;
105            }
106            state.advance(ch.len_utf8());
107            if escaped {
108                escaped = false;
109                continue;
110            }
111            if ch == '\\' {
112                escaped = true;
113                continue;
114            }
115            if ch == '\n' || ch == '\r' {
116                break;
117            }
118        }
119        state.add_token(OCamlTokenType::StringLiteral, start, state.get_position());
120        true
121    }
122
123    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
124        let start = state.get_position();
125        if state.current() != Some('\'') {
126            return false;
127        }
128
129        state.advance(1); // opening '
130        if let Some('\\') = state.peek() {
131            state.advance(1);
132            if let Some(c) = state.peek() {
133                state.advance(c.len_utf8())
134            }
135        }
136        else if let Some(c) = state.peek() {
137            state.advance(c.len_utf8())
138        }
139        else {
140            state.set_position(start);
141            return false;
142        }
143
144        if state.peek() == Some('\'') {
145            state.advance(1);
146            state.add_token(OCamlTokenType::CharLiteral, start, state.get_position());
147            return true;
148        }
149
150        state.set_position(start);
151        false
152    }
153
154    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
155        let start = state.get_position();
156        let first = match state.current() {
157            Some(c) => c,
158            None => return false,
159        };
160
161        if !first.is_ascii_digit() {
162            return false;
163        }
164
165        let mut is_float = false;
166
167        // consume digits
168        state.advance(1);
169        while let Some(c) = state.peek() {
170            if c.is_ascii_digit() {
171                state.advance(1);
172            }
173            else {
174                break;
175            }
176        }
177
178        // fractional part
179        if state.peek() == Some('.') {
180            let n1 = state.peek_next_n(1);
181            if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
182                is_float = true;
183                state.advance(1); // consume '.'
184                while let Some(c) = state.peek() {
185                    if c.is_ascii_digit() {
186                        state.advance(1);
187                    }
188                    else {
189                        break;
190                    }
191                }
192            }
193        }
194
195        // exponent
196        if let Some(c) = state.peek() {
197            if c == 'e' || c == 'E' {
198                let n1 = state.peek_next_n(1);
199                if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
200                    is_float = true;
201                    state.advance(1);
202                    if let Some(sign) = state.peek() {
203                        if sign == '+' || sign == '-' {
204                            state.advance(1);
205                        }
206                    }
207                    while let Some(d) = state.peek() {
208                        if d.is_ascii_digit() {
209                            state.advance(1);
210                        }
211                        else {
212                            break;
213                        }
214                    }
215                }
216            }
217        }
218
219        let end = state.get_position();
220        state.add_token(if is_float { OCamlTokenType::FloatLiteral } else { OCamlTokenType::IntegerLiteral }, start, end);
221        true
222    }
223
224    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
225        let start = state.get_position();
226        let ch = match state.current() {
227            Some(c) => c,
228            None => return false,
229        };
230
231        if !(ch.is_ascii_alphabetic() || ch == '_') {
232            return false;
233        }
234
235        state.advance(1);
236        while let Some(c) = state.current() {
237            if c.is_ascii_alphanumeric() || c == '_' || c == '\'' { state.advance(1) } else { break }
238        }
239
240        let end = state.get_position();
241        let text = state.get_text_in((start..end).into());
242        let kind = match text.as_ref() {
243            // OCaml keywords
244            "and" => OCamlTokenType::And,
245            "as" => OCamlTokenType::As,
246            "assert" => OCamlTokenType::Assert,
247            "begin" => OCamlTokenType::Begin,
248            "class" => OCamlTokenType::Class,
249            "constraint" => OCamlTokenType::Constraint,
250            "do" => OCamlTokenType::Do,
251            "done" => OCamlTokenType::Done,
252            "downto" => OCamlTokenType::Downto,
253            "else" => OCamlTokenType::Else,
254            "end" => OCamlTokenType::End,
255            "exception" => OCamlTokenType::Exception,
256            "external" => OCamlTokenType::External,
257            "false" => OCamlTokenType::False,
258            "for" => OCamlTokenType::For,
259            "fun" => OCamlTokenType::Fun,
260            "function" => OCamlTokenType::Function,
261            "functor" => OCamlTokenType::Functor,
262            "if" => OCamlTokenType::If,
263            "in" => OCamlTokenType::In,
264            "include" => OCamlTokenType::Include,
265            "inherit" => OCamlTokenType::Inherit,
266            "initializer" => OCamlTokenType::Initializer,
267            "lazy" => OCamlTokenType::Lazy,
268            "let" => OCamlTokenType::Let,
269            "match" => OCamlTokenType::Match,
270            "method" => OCamlTokenType::Method,
271            "module" => OCamlTokenType::Module,
272            "mutable" => OCamlTokenType::Mutable,
273            "new" => OCamlTokenType::New,
274            "object" => OCamlTokenType::Object,
275            "of" => OCamlTokenType::Of,
276            "open" => OCamlTokenType::Open,
277            "or" => OCamlTokenType::Or,
278            "private" => OCamlTokenType::Private,
279            "rec" => OCamlTokenType::Rec,
280            "sig" => OCamlTokenType::Sig,
281            "struct" => OCamlTokenType::Struct,
282            "then" => OCamlTokenType::Then,
283            "to" => OCamlTokenType::To,
284            "true" => OCamlTokenType::True,
285            "try" => OCamlTokenType::Try,
286            "type" => OCamlTokenType::Type,
287            "val" => OCamlTokenType::Val,
288            "virtual" => OCamlTokenType::Virtual,
289            "when" => OCamlTokenType::When,
290            "while" => OCamlTokenType::While,
291            "with" => OCamlTokenType::With,
292
293            _ => OCamlTokenType::Identifier,
294        };
295
296        state.add_token(kind, start, state.get_position());
297        true
298    }
299
300    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
301        let start = state.get_position();
302        let rest = state.rest();
303
304        // prefer longest matches first
305        let patterns: &[(&str, OCamlTokenType)] = &[
306            ("==", OCamlTokenType::EqualEqual),
307            ("!=", OCamlTokenType::NotEqual),
308            (">=", OCamlTokenType::GreaterEqual),
309            ("<=", OCamlTokenType::LessEqual),
310            ("&&", OCamlTokenType::AndAnd),
311            ("||", OCamlTokenType::OrOr),
312            ("::", OCamlTokenType::ColonColon),
313            ("->", OCamlTokenType::RightArrow),
314            ("<-", OCamlTokenType::LeftArrow),
315            ("-.", OCamlTokenType::MinusDot),
316        ];
317
318        for (pat, kind) in patterns {
319            if rest.starts_with(pat) {
320                state.advance(pat.len());
321                state.add_token(*kind, start, state.get_position());
322                return true;
323            }
324        }
325
326        if let Some(ch) = state.current() {
327            let kind = match ch {
328                '+' => Some(OCamlTokenType::Plus),
329                '-' => Some(OCamlTokenType::Minus),
330                '*' => Some(OCamlTokenType::Star),
331                '/' => Some(OCamlTokenType::Slash),
332                '%' => Some(OCamlTokenType::Percent),
333                '=' => Some(OCamlTokenType::Equal),
334                '>' => Some(OCamlTokenType::Greater),
335                '<' => Some(OCamlTokenType::Less),
336                '!' => Some(OCamlTokenType::Bang),
337                '?' => Some(OCamlTokenType::Question),
338                ':' => Some(OCamlTokenType::Colon),
339                ';' => Some(OCamlTokenType::Semicolon),
340                ',' => Some(OCamlTokenType::Comma),
341                '.' => Some(OCamlTokenType::Dot),
342                '|' => Some(OCamlTokenType::Pipe),
343                '&' => Some(OCamlTokenType::Ampersand),
344                '^' => Some(OCamlTokenType::Caret),
345                '~' => Some(OCamlTokenType::Tilde),
346                '@' => Some(OCamlTokenType::At),
347                '#' => Some(OCamlTokenType::Hash),
348                '$' => Some(OCamlTokenType::Dollar),
349                '`' => Some(OCamlTokenType::Backtick),
350                _ => None,
351            };
352
353            if let Some(k) = kind {
354                state.advance(ch.len_utf8());
355                state.add_token(k, start, state.get_position());
356                return true;
357            }
358        }
359
360        false
361    }
362
363    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
364        let start = state.get_position();
365        if let Some(ch) = state.current() {
366            let kind = match ch {
367                '(' => OCamlTokenType::LeftParen,
368                ')' => OCamlTokenType::RightParen,
369                '[' => OCamlTokenType::LeftBracket,
370                ']' => OCamlTokenType::RightBracket,
371                '{' => OCamlTokenType::LeftBrace,
372                '}' => OCamlTokenType::RightBrace,
373                _ => return false,
374            };
375
376            state.advance(ch.len_utf8());
377            state.add_token(kind, start, state.get_position());
378            true
379        }
380        else {
381            false
382        }
383    }
384}