Skip to main content

oak_ocaml/lexer/
mod.rs

1use crate::{kind::OCamlSyntaxKind, language::OCamlLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::{CommentConfig, LexOutput, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
/// Lexer state specialised to [`OCamlLanguage`], so the methods below only have to name the source type.
type State<'a, S> = LexerState<'a, S, OCamlLanguage>;

/// Lazily-initialised whitespace scanner configuration, with `unicode_whitespace` enabled.
static OCAML_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Lazily-initialised comment scanner configuration: `(*` ... `*)` block comments with `nested_blocks` enabled.
// NOTE(review): OCaml has no `//` line comments, so this `line_marker` presumably makes the scanner treat `//` as the
// start of a comment — likely unintended. Confirm how `CommentConfig` handles the marker before changing it.
static OCAML_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "(*", block_end: "*)", nested_blocks: true });
13
/// Lexer for OCaml source text, borrowing an [`OCamlLanguage`] configuration for the lifetime `'config`.
#[derive(Clone, Debug)]
pub struct OCamlLexer<'config> {
    /// Language configuration supplied at construction. It is stored but not read by any lexer code visible in this
    /// file, which matches the leading underscore in its name.
    _config: &'config OCamlLanguage,
}
18
19impl<'config> Lexer<OCamlLanguage> for OCamlLexer<'config> {
20    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<OCamlLanguage>) -> LexOutput<OCamlLanguage> {
21        let mut state = State::new_with_cache(source, 0, cache);
22        let result = self.run(&mut state);
23        if result.is_ok() {
24            state.add_eof();
25        }
26        state.finish_with_cache(result, cache)
27    }
28}
29
30impl<'config> OCamlLexer<'config> {
    /// Creates a lexer that borrows the given language configuration.
    pub fn new(config: &'config OCamlLanguage) -> Self {
        Self { _config: config }
    }
34
35    /// 主词法分析循环
36    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
37        while state.not_at_end() {
38            let safe_point = state.get_position();
39
40            if self.skip_whitespace(state) {
41                continue;
42            }
43
44            if self.skip_comment(state) {
45                continue;
46            }
47
48            if self.lex_string_literal(state) {
49                continue;
50            }
51
52            if self.lex_char_literal(state) {
53                continue;
54            }
55
56            if self.lex_number_literal(state) {
57                continue;
58            }
59
60            if self.lex_identifier_or_keyword(state) {
61                continue;
62            }
63
64            if self.lex_operators(state) {
65                continue;
66            }
67
68            if self.lex_single_char_tokens(state) {
69                continue;
70            }
71
72            state.advance_if_dead_lock(safe_point);
73        }
74
75        Ok(())
76    }
77
    /// Skips whitespace.
    ///
    /// Delegates to the shared `OCAML_WHITESPACE` scanner, passing `OCamlSyntaxKind::Whitespace` as the token kind,
    /// and returns the scanner's result unchanged (`run` treats `true` as "input was handled").
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        OCAML_WHITESPACE.scan(state, OCamlSyntaxKind::Whitespace)
    }
82
    /// Skips a comment.
    ///
    /// Delegates to the shared `OCAML_COMMENT` scanner and returns its result unchanged. The same
    /// `OCamlSyntaxKind::Comment` kind is passed for both kind arguments (presumably one for line comments and one
    /// for block comments — confirm against `CommentConfig::scan`).
    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        OCAML_COMMENT.scan(state, OCamlSyntaxKind::Comment, OCamlSyntaxKind::Comment)
    }
86
87    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
88        let start = state.get_position();
89        if state.current() != Some('"') {
90            return false;
91        }
92
93        state.advance(1); // opening "
94        let mut escaped = false;
95        while let Some(ch) = state.peek() {
96            if ch == '"' && !escaped {
97                state.advance(1); // consume closing quote
98                break;
99            }
100            state.advance(ch.len_utf8());
101            if escaped {
102                escaped = false;
103                continue;
104            }
105            if ch == '\\' {
106                escaped = true;
107                continue;
108            }
109            if ch == '\n' || ch == '\r' {
110                break;
111            }
112        }
113        state.add_token(OCamlSyntaxKind::StringLiteral, start, state.get_position());
114        true
115    }
116
117    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
118        let start = state.get_position();
119        if state.current() != Some('\'') {
120            return false;
121        }
122
123        state.advance(1); // opening '
124        if let Some('\\') = state.peek() {
125            state.advance(1);
126            if let Some(c) = state.peek() {
127                state.advance(c.len_utf8());
128            }
129        }
130        else if let Some(c) = state.peek() {
131            state.advance(c.len_utf8());
132        }
133        else {
134            state.set_position(start);
135            return false;
136        }
137
138        if state.peek() == Some('\'') {
139            state.advance(1);
140            state.add_token(OCamlSyntaxKind::CharLiteral, start, state.get_position());
141            return true;
142        }
143
144        state.set_position(start);
145        false
146    }
147
148    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
149        let start = state.get_position();
150        let first = match state.current() {
151            Some(c) => c,
152            None => return false,
153        };
154
155        if !first.is_ascii_digit() {
156            return false;
157        }
158
159        let mut is_float = false;
160
161        // consume digits
162        state.advance(1);
163        while let Some(c) = state.peek() {
164            if c.is_ascii_digit() {
165                state.advance(1);
166            }
167            else {
168                break;
169            }
170        }
171
172        // fractional part
173        if state.peek() == Some('.') {
174            let n1 = state.peek_next_n(1);
175            if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
176                is_float = true;
177                state.advance(1); // consume '.'
178                while let Some(c) = state.peek() {
179                    if c.is_ascii_digit() {
180                        state.advance(1);
181                    }
182                    else {
183                        break;
184                    }
185                }
186            }
187        }
188
189        // exponent
190        if let Some(c) = state.peek() {
191            if c == 'e' || c == 'E' {
192                let n1 = state.peek_next_n(1);
193                if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
194                    is_float = true;
195                    state.advance(1);
196                    if let Some(sign) = state.peek() {
197                        if sign == '+' || sign == '-' {
198                            state.advance(1);
199                        }
200                    }
201                    while let Some(d) = state.peek() {
202                        if d.is_ascii_digit() {
203                            state.advance(1);
204                        }
205                        else {
206                            break;
207                        }
208                    }
209                }
210            }
211        }
212
213        let end = state.get_position();
214        state.add_token(if is_float { OCamlSyntaxKind::FloatLiteral } else { OCamlSyntaxKind::IntegerLiteral }, start, end);
215        true
216    }
217
218    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
219        let start = state.get_position();
220        let ch = match state.current() {
221            Some(c) => c,
222            None => return false,
223        };
224
225        if !(ch.is_ascii_alphabetic() || ch == '_') {
226            return false;
227        }
228
229        state.advance(1);
230        while let Some(c) = state.current() {
231            if c.is_ascii_alphanumeric() || c == '_' || c == '\'' {
232                state.advance(1);
233            }
234            else {
235                break;
236            }
237        }
238
239        let end = state.get_position();
240        let text = state.get_text_in((start..end).into());
241        let kind = match text.as_ref() {
242            // OCaml keywords
243            "and" => OCamlSyntaxKind::And,
244            "as" => OCamlSyntaxKind::As,
245            "assert" => OCamlSyntaxKind::Assert,
246            "begin" => OCamlSyntaxKind::Begin,
247            "class" => OCamlSyntaxKind::Class,
248            "constraint" => OCamlSyntaxKind::Constraint,
249            "do" => OCamlSyntaxKind::Do,
250            "done" => OCamlSyntaxKind::Done,
251            "downto" => OCamlSyntaxKind::Downto,
252            "else" => OCamlSyntaxKind::Else,
253            "end" => OCamlSyntaxKind::End,
254            "exception" => OCamlSyntaxKind::Exception,
255            "external" => OCamlSyntaxKind::External,
256            "false" => OCamlSyntaxKind::False,
257            "for" => OCamlSyntaxKind::For,
258            "fun" => OCamlSyntaxKind::Fun,
259            "function" => OCamlSyntaxKind::Function,
260            "functor" => OCamlSyntaxKind::Functor,
261            "if" => OCamlSyntaxKind::If,
262            "in" => OCamlSyntaxKind::In,
263            "include" => OCamlSyntaxKind::Include,
264            "inherit" => OCamlSyntaxKind::Inherit,
265            "initializer" => OCamlSyntaxKind::Initializer,
266            "lazy" => OCamlSyntaxKind::Lazy,
267            "let" => OCamlSyntaxKind::Let,
268            "match" => OCamlSyntaxKind::Match,
269            "method" => OCamlSyntaxKind::Method,
270            "module" => OCamlSyntaxKind::Module,
271            "mutable" => OCamlSyntaxKind::Mutable,
272            "new" => OCamlSyntaxKind::New,
273            "object" => OCamlSyntaxKind::Object,
274            "of" => OCamlSyntaxKind::Of,
275            "open" => OCamlSyntaxKind::Open,
276            "or" => OCamlSyntaxKind::Or,
277            "private" => OCamlSyntaxKind::Private,
278            "rec" => OCamlSyntaxKind::Rec,
279            "sig" => OCamlSyntaxKind::Sig,
280            "struct" => OCamlSyntaxKind::Struct,
281            "then" => OCamlSyntaxKind::Then,
282            "to" => OCamlSyntaxKind::To,
283            "true" => OCamlSyntaxKind::True,
284            "try" => OCamlSyntaxKind::Try,
285            "type" => OCamlSyntaxKind::Type,
286            "val" => OCamlSyntaxKind::Val,
287            "virtual" => OCamlSyntaxKind::Virtual,
288            "when" => OCamlSyntaxKind::When,
289            "while" => OCamlSyntaxKind::While,
290            "with" => OCamlSyntaxKind::With,
291
292            _ => OCamlSyntaxKind::Identifier,
293        };
294
295        state.add_token(kind, start, state.get_position());
296        true
297    }
298
299    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
300        let start = state.get_position();
301        let rest = state.rest();
302
303        // prefer longest matches first
304        let patterns: &[(&str, OCamlSyntaxKind)] = &[
305            ("==", OCamlSyntaxKind::EqualEqual),
306            ("!=", OCamlSyntaxKind::NotEqual),
307            (">=", OCamlSyntaxKind::GreaterEqual),
308            ("<=", OCamlSyntaxKind::LessEqual),
309            ("&&", OCamlSyntaxKind::AndAnd),
310            ("||", OCamlSyntaxKind::OrOr),
311            ("::", OCamlSyntaxKind::ColonColon),
312            ("->", OCamlSyntaxKind::RightArrow),
313            ("<-", OCamlSyntaxKind::LeftArrow),
314        ];
315
316        for (pat, kind) in patterns {
317            if rest.starts_with(pat) {
318                state.advance(pat.len());
319                state.add_token(*kind, start, state.get_position());
320                return true;
321            }
322        }
323
324        if let Some(ch) = state.current() {
325            let kind = match ch {
326                '+' => Some(OCamlSyntaxKind::Plus),
327                '-' => Some(OCamlSyntaxKind::Minus),
328                '*' => Some(OCamlSyntaxKind::Star),
329                '/' => Some(OCamlSyntaxKind::Slash),
330                '%' => Some(OCamlSyntaxKind::Percent),
331                '=' => Some(OCamlSyntaxKind::Equal),
332                '>' => Some(OCamlSyntaxKind::Greater),
333                '<' => Some(OCamlSyntaxKind::Less),
334                '!' => Some(OCamlSyntaxKind::Bang),
335                '?' => Some(OCamlSyntaxKind::Question),
336                ':' => Some(OCamlSyntaxKind::Colon),
337                ';' => Some(OCamlSyntaxKind::Semicolon),
338                ',' => Some(OCamlSyntaxKind::Comma),
339                '.' => Some(OCamlSyntaxKind::Dot),
340                '|' => Some(OCamlSyntaxKind::Pipe),
341                '&' => Some(OCamlSyntaxKind::Ampersand),
342                '^' => Some(OCamlSyntaxKind::Caret),
343                '~' => Some(OCamlSyntaxKind::Tilde),
344                '@' => Some(OCamlSyntaxKind::At),
345                '#' => Some(OCamlSyntaxKind::Hash),
346                '$' => Some(OCamlSyntaxKind::Dollar),
347                '`' => Some(OCamlSyntaxKind::Backtick),
348                _ => None,
349            };
350
351            if let Some(k) = kind {
352                state.advance(ch.len_utf8());
353                state.add_token(k, start, state.get_position());
354                return true;
355            }
356        }
357
358        false
359    }
360
361    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
362        let start = state.get_position();
363        if let Some(ch) = state.current() {
364            let kind = match ch {
365                '(' => OCamlSyntaxKind::LeftParen,
366                ')' => OCamlSyntaxKind::RightParen,
367                '[' => OCamlSyntaxKind::LeftBracket,
368                ']' => OCamlSyntaxKind::RightBracket,
369                '{' => OCamlSyntaxKind::LeftBrace,
370                '}' => OCamlSyntaxKind::RightBrace,
371                _ => return false,
372            };
373
374            state.advance(ch.len_utf8());
375            state.add_token(kind, start, state.get_position());
376            true
377        }
378        else {
379            false
380        }
381    }
382}