Skip to main content

oak_ocaml/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::OCamlLanguage, lexer::token_type::OCamlTokenType};
5use oak_core::{
6    Lexer, LexerCache, LexerState, OakError,
7    lexer::{CommentConfig, LexOutput, WhitespaceConfig},
8    source::Source,
9};
10use std::sync::LazyLock;
11
/// Lexer state specialised to [`OCamlLanguage`]; shared by every scanning routine in this module.
pub(crate) type State<'a, S> = LexerState<'a, S, OCamlLanguage>;

/// Whitespace scanner configuration, built lazily on first use with `unicode_whitespace` enabled.
static OCAML_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scanner configuration: `(*` ... `*)` block comments with nesting enabled.
// NOTE(review): OCaml has no `//` line comment; with `line_marker: "//"` a `//` in the input is
// presumably scanned as a comment. Confirm how `CommentConfig` disables line comments before changing it.
static OCAML_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "(*", block_end: "*)", nested_blocks: true });
16
/// Lexer for OCaml source text.
///
/// Holds a borrowed [`OCamlLanguage`] configuration for the lifetime `'config`.
#[derive(Clone, Debug)]
pub struct OCamlLexer<'config> {
    /// Language configuration supplied to [`OCamlLexer::new`]; none of the scanning routines in this file read it.
    config: &'config OCamlLanguage,
}
21
22impl<'config> Lexer<OCamlLanguage> for OCamlLexer<'config> {
23    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<OCamlLanguage>) -> LexOutput<OCamlLanguage> {
24        let mut state = State::new_with_cache(source, 0, cache);
25        let result = self.run(&mut state);
26        if result.is_ok() {
27            state.add_eof()
28        }
29        state.finish_with_cache(result, cache)
30    }
31}
32
33impl<'config> OCamlLexer<'config> {
34    pub fn new(config: &'config OCamlLanguage) -> Self {
35        Self { config }
36    }
37
38    /// Main lexical analysis loop
39    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
40        while state.not_at_end() {
41            let safe_point = state.get_position();
42
43            if self.skip_whitespace(state) {
44                continue;
45            }
46
47            if self.skip_comment(state) {
48                continue;
49            }
50
51            if self.lex_string_literal(state) {
52                continue;
53            }
54
55            if self.lex_char_literal(state) {
56                continue;
57            }
58
59            if self.lex_number_literal(state) {
60                continue;
61            }
62
63            if self.lex_identifier_or_keyword(state) {
64                continue;
65            }
66
67            if self.lex_operators(state) {
68                continue;
69            }
70
71            if self.lex_single_char_tokens(state) {
72                continue;
73            }
74
75            state.advance_if_dead_lock(safe_point)
76        }
77
78        Ok(())
79    }
80
81    /// Skips whitespace
82    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
83        OCAML_WHITESPACE.scan(state, OCamlTokenType::Whitespace)
84    }
85
86    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
87        OCAML_COMMENT.scan(state, OCamlTokenType::Comment, OCamlTokenType::Comment)
88    }
89
90    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
91        let start = state.get_position();
92        if state.current() != Some('"') {
93            return false;
94        }
95
96        state.advance(1); // opening "
97        let mut escaped = false;
98        while let Some(ch) = state.peek() {
99            if ch == '"' && !escaped {
100                state.advance(1); // consume closing quote
101                break;
102            }
103            state.advance(ch.len_utf8());
104            if escaped {
105                escaped = false;
106                continue;
107            }
108            if ch == '\\' {
109                escaped = true;
110                continue;
111            }
112            if ch == '\n' || ch == '\r' {
113                break;
114            }
115        }
116        state.add_token(OCamlTokenType::StringLiteral, start, state.get_position());
117        true
118    }
119
120    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
121        let start = state.get_position();
122        if state.current() != Some('\'') {
123            return false;
124        }
125
126        state.advance(1); // opening '
127        if let Some('\\') = state.peek() {
128            state.advance(1);
129            if let Some(c) = state.peek() {
130                state.advance(c.len_utf8())
131            }
132        }
133        else if let Some(c) = state.peek() {
134            state.advance(c.len_utf8())
135        }
136        else {
137            state.set_position(start);
138            return false;
139        }
140
141        if state.peek() == Some('\'') {
142            state.advance(1);
143            state.add_token(OCamlTokenType::CharLiteral, start, state.get_position());
144            return true;
145        }
146
147        state.set_position(start);
148        false
149    }
150
151    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
152        let start = state.get_position();
153        let first = match state.current() {
154            Some(c) => c,
155            None => return false,
156        };
157
158        if !first.is_ascii_digit() {
159            return false;
160        }
161
162        let mut is_float = false;
163
164        // consume digits
165        state.advance(1);
166        while let Some(c) = state.peek() {
167            if c.is_ascii_digit() {
168                state.advance(1);
169            }
170            else {
171                break;
172            }
173        }
174
175        // fractional part
176        if state.peek() == Some('.') {
177            let n1 = state.peek_next_n(1);
178            if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
179                is_float = true;
180                state.advance(1); // consume '.'
181                while let Some(c) = state.peek() {
182                    if c.is_ascii_digit() {
183                        state.advance(1);
184                    }
185                    else {
186                        break;
187                    }
188                }
189            }
190        }
191
192        // exponent
193        if let Some(c) = state.peek() {
194            if c == 'e' || c == 'E' {
195                let n1 = state.peek_next_n(1);
196                if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
197                    is_float = true;
198                    state.advance(1);
199                    if let Some(sign) = state.peek() {
200                        if sign == '+' || sign == '-' {
201                            state.advance(1);
202                        }
203                    }
204                    while let Some(d) = state.peek() {
205                        if d.is_ascii_digit() {
206                            state.advance(1);
207                        }
208                        else {
209                            break;
210                        }
211                    }
212                }
213            }
214        }
215
216        let end = state.get_position();
217        state.add_token(if is_float { OCamlTokenType::FloatLiteral } else { OCamlTokenType::IntegerLiteral }, start, end);
218        true
219    }
220
221    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
222        let start = state.get_position();
223        let ch = match state.current() {
224            Some(c) => c,
225            None => return false,
226        };
227
228        if !(ch.is_ascii_alphabetic() || ch == '_') {
229            return false;
230        }
231
232        state.advance(1);
233        while let Some(c) = state.current() {
234            if c.is_ascii_alphanumeric() || c == '_' || c == '\'' { state.advance(1) } else { break }
235        }
236
237        let end = state.get_position();
238        let text = state.get_text_in((start..end).into());
239        let kind = match text.as_ref() {
240            // OCaml keywords
241            "and" => OCamlTokenType::And,
242            "as" => OCamlTokenType::As,
243            "assert" => OCamlTokenType::Assert,
244            "begin" => OCamlTokenType::Begin,
245            "class" => OCamlTokenType::Class,
246            "constraint" => OCamlTokenType::Constraint,
247            "do" => OCamlTokenType::Do,
248            "done" => OCamlTokenType::Done,
249            "downto" => OCamlTokenType::Downto,
250            "else" => OCamlTokenType::Else,
251            "end" => OCamlTokenType::End,
252            "exception" => OCamlTokenType::Exception,
253            "external" => OCamlTokenType::External,
254            "false" => OCamlTokenType::False,
255            "for" => OCamlTokenType::For,
256            "fun" => OCamlTokenType::Fun,
257            "function" => OCamlTokenType::Function,
258            "functor" => OCamlTokenType::Functor,
259            "if" => OCamlTokenType::If,
260            "in" => OCamlTokenType::In,
261            "include" => OCamlTokenType::Include,
262            "inherit" => OCamlTokenType::Inherit,
263            "initializer" => OCamlTokenType::Initializer,
264            "lazy" => OCamlTokenType::Lazy,
265            "let" => OCamlTokenType::Let,
266            "match" => OCamlTokenType::Match,
267            "method" => OCamlTokenType::Method,
268            "module" => OCamlTokenType::Module,
269            "mutable" => OCamlTokenType::Mutable,
270            "new" => OCamlTokenType::New,
271            "object" => OCamlTokenType::Object,
272            "of" => OCamlTokenType::Of,
273            "open" => OCamlTokenType::Open,
274            "or" => OCamlTokenType::Or,
275            "private" => OCamlTokenType::Private,
276            "rec" => OCamlTokenType::Rec,
277            "sig" => OCamlTokenType::Sig,
278            "struct" => OCamlTokenType::Struct,
279            "then" => OCamlTokenType::Then,
280            "to" => OCamlTokenType::To,
281            "true" => OCamlTokenType::True,
282            "try" => OCamlTokenType::Try,
283            "type" => OCamlTokenType::Type,
284            "val" => OCamlTokenType::Val,
285            "virtual" => OCamlTokenType::Virtual,
286            "when" => OCamlTokenType::When,
287            "while" => OCamlTokenType::While,
288            "with" => OCamlTokenType::With,
289
290            _ => OCamlTokenType::Identifier,
291        };
292
293        state.add_token(kind, start, state.get_position());
294        true
295    }
296
297    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
298        let start = state.get_position();
299        let rest = state.rest();
300
301        // prefer longest matches first
302        let patterns: &[(&str, OCamlTokenType)] = &[
303            ("==", OCamlTokenType::EqualEqual),
304            ("!=", OCamlTokenType::NotEqual),
305            (">=", OCamlTokenType::GreaterEqual),
306            ("<=", OCamlTokenType::LessEqual),
307            ("&&", OCamlTokenType::AndAnd),
308            ("||", OCamlTokenType::OrOr),
309            ("::", OCamlTokenType::ColonColon),
310            ("->", OCamlTokenType::RightArrow),
311            ("<-", OCamlTokenType::LeftArrow),
312            ("-.", OCamlTokenType::MinusDot),
313        ];
314
315        for (pat, kind) in patterns {
316            if rest.starts_with(pat) {
317                state.advance(pat.len());
318                state.add_token(*kind, start, state.get_position());
319                return true;
320            }
321        }
322
323        if let Some(ch) = state.current() {
324            let kind = match ch {
325                '+' => Some(OCamlTokenType::Plus),
326                '-' => Some(OCamlTokenType::Minus),
327                '*' => Some(OCamlTokenType::Star),
328                '/' => Some(OCamlTokenType::Slash),
329                '%' => Some(OCamlTokenType::Percent),
330                '=' => Some(OCamlTokenType::Equal),
331                '>' => Some(OCamlTokenType::Greater),
332                '<' => Some(OCamlTokenType::Less),
333                '!' => Some(OCamlTokenType::Bang),
334                '?' => Some(OCamlTokenType::Question),
335                ':' => Some(OCamlTokenType::Colon),
336                ';' => Some(OCamlTokenType::Semicolon),
337                ',' => Some(OCamlTokenType::Comma),
338                '.' => Some(OCamlTokenType::Dot),
339                '|' => Some(OCamlTokenType::Pipe),
340                '&' => Some(OCamlTokenType::Ampersand),
341                '^' => Some(OCamlTokenType::Caret),
342                '~' => Some(OCamlTokenType::Tilde),
343                '@' => Some(OCamlTokenType::At),
344                '#' => Some(OCamlTokenType::Hash),
345                '$' => Some(OCamlTokenType::Dollar),
346                '`' => Some(OCamlTokenType::Backtick),
347                _ => None,
348            };
349
350            if let Some(k) = kind {
351                state.advance(ch.len_utf8());
352                state.add_token(k, start, state.get_position());
353                return true;
354            }
355        }
356
357        false
358    }
359
360    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
361        let start = state.get_position();
362        if let Some(ch) = state.current() {
363            let kind = match ch {
364                '(' => OCamlTokenType::LeftParen,
365                ')' => OCamlTokenType::RightParen,
366                '[' => OCamlTokenType::LeftBracket,
367                ']' => OCamlTokenType::RightBracket,
368                '{' => OCamlTokenType::LeftBrace,
369                '}' => OCamlTokenType::RightBrace,
370                _ => return false,
371            };
372
373            state.advance(ch.len_utf8());
374            state.add_token(kind, start, state.get_position());
375            true
376        }
377        else {
378            false
379        }
380    }
381}