oak_fsharp/lexer/
mod.rs

1use crate::{kind::FSharpSyntaxKind, language::FSharpLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{LexOutput, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
/// Lexer state specialised to the F# language definition.
type State<S> = LexerState<S, FSharpLanguage>;

/// Whitespace scanner configuration shared by all lexer instances.
///
/// Built lazily on first use with `unicode_whitespace` enabled.
static FS_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12
/// Lexer for F# source text.
///
/// Holds a borrowed [`FSharpLanguage`] configuration for the lifetime `'config`.
#[derive(Clone)]
pub struct FSharpLexer<'config> {
    /// Language configuration this lexer was created with.
    config: &'config FSharpLanguage,
}
18
19impl<'config> FSharpLexer<'config> {
20    pub fn new(config: &'config FSharpLanguage) -> Self {
21        Self { config }
22    }
23
24    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
25        while state.not_at_end() {
26            // 跳过空白字符
27            if self.skip_whitespace(state) {
28                continue;
29            }
30
31            // 处理注释
32            if self.skip_comment(state) {
33                continue;
34            }
35
36            // 处理字符串字面量
37            if self.lex_string_literal(state) {
38                continue;
39            }
40
41            // 处理字符字面量
42            if self.lex_char_literal(state) {
43                continue;
44            }
45
46            // 处理数字字面量
47            if self.lex_number(state) {
48                continue;
49            }
50
51            // 处理标识符和关键字
52            if self.lex_identifier_or_keyword(state) {
53                continue;
54            }
55
56            // 处理操作符和标点符号
57            if self.lex_operator_or_punctuation(state) {
58                continue;
59            }
60
61            // 如果没有匹配任何模式,跳过当前字符
62            let start = state.get_position();
63            if let Some(ch) = state.peek() {
64                state.advance(ch.len_utf8());
65                state.add_token(FSharpSyntaxKind::Error, start, state.get_position());
66            }
67        }
68
69        // 添加 EOF token
70        let eof_pos = state.get_position();
71        state.add_token(FSharpSyntaxKind::Eof, eof_pos, eof_pos);
72        Ok(())
73    }
74
75    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
76        match FS_WHITESPACE.scan(state.rest(), state.get_position(), FSharpSyntaxKind::Whitespace) {
77            Some(token) => {
78                state.advance_with(token);
79                return true;
80            }
81            None => {}
82        }
83        false
84    }
85
86    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
87        let start = state.get_position();
88        let rest = state.rest();
89
90        // 行注释: // ... 直到换行
91        if rest.starts_with("//") {
92            state.advance(2);
93            while let Some(ch) = state.peek() {
94                if ch == '\n' || ch == '\r' {
95                    break;
96                }
97                state.advance(ch.len_utf8());
98            }
99            state.add_token(FSharpSyntaxKind::LineComment, start, state.get_position());
100            return true;
101        }
102
103        // 块注释: (* ... *) 支持嵌套
104        if rest.starts_with("(*") {
105            state.advance(2);
106            let mut depth = 1usize;
107            while let Some(ch) = state.peek() {
108                if ch == '(' && state.peek_next_n(1) == Some('*') {
109                    state.advance(2);
110                    depth += 1;
111                    continue;
112                }
113                if ch == '*' && state.peek_next_n(1) == Some(')') {
114                    state.advance(2);
115                    depth -= 1;
116                    if depth == 0 {
117                        break;
118                    }
119                    continue;
120                }
121                state.advance(ch.len_utf8());
122            }
123            state.add_token(FSharpSyntaxKind::BlockComment, start, state.get_position());
124            return true;
125        }
126        false
127    }
128
129    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
130        let start = state.get_position();
131
132        // 原始字符串: @"..."
133        if state.peek() == Some('@') && state.peek_next_n(1) == Some('"') {
134            state.advance(2); // 跳过 @"
135            while let Some(ch) = state.peek() {
136                if ch == '"' {
137                    state.advance(1);
138                    break;
139                }
140                state.advance(ch.len_utf8());
141            }
142            state.add_token(FSharpSyntaxKind::StringLiteral, start, state.get_position());
143            return true;
144        }
145
146        // 普通字符串: "..."
147        if state.peek() == Some('"') {
148            state.advance(1); // 跳过 "
149            while let Some(ch) = state.peek() {
150                if ch == '"' {
151                    state.advance(1);
152                    break;
153                }
154                if ch == '\\' {
155                    state.advance(1); // 跳过转义字符
156                    if let Some(escaped) = state.peek() {
157                        state.advance(escaped.len_utf8());
158                    }
159                }
160                else {
161                    state.advance(ch.len_utf8());
162                }
163            }
164            state.add_token(FSharpSyntaxKind::StringLiteral, start, state.get_position());
165            return true;
166        }
167        false
168    }
169
170    fn lex_char_literal<S: Source>(&self, state: &mut State<S>) -> bool {
171        let start = state.get_position();
172
173        if state.peek() == Some('\'') {
174            state.advance(1); // 跳过 '
175            if let Some(ch) = state.peek() {
176                if ch == '\\' {
177                    state.advance(1); // 跳过转义字符
178                    if let Some(escaped) = state.peek() {
179                        state.advance(escaped.len_utf8());
180                    }
181                }
182                else {
183                    state.advance(ch.len_utf8());
184                }
185            }
186            if state.peek() == Some('\'') {
187                state.advance(1); // 跳过结束的 '
188            }
189            state.add_token(FSharpSyntaxKind::CharLiteral, start, state.get_position());
190            return true;
191        }
192        false
193    }
194
195    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
196        if !state.current().map_or(false, |c| c.is_ascii_digit()) {
197            return false;
198        }
199
200        let start = state.get_position();
201
202        // 处理整数部分
203        while state.current().map_or(false, |c| c.is_ascii_digit()) {
204            state.advance(1);
205        }
206
207        // 处理小数点
208        if state.current() == Some('.') && state.peek().map_or(false, |c| c.is_ascii_digit()) {
209            state.advance(1); // 跳过 '.'
210            while state.current().map_or(false, |c| c.is_ascii_digit()) {
211                state.advance(1);
212            }
213            state.add_token(FSharpSyntaxKind::FloatLiteral, start, state.get_position());
214        }
215        else {
216            // 处理科学计数法
217            if matches!(state.current(), Some('e') | Some('E')) {
218                state.advance(1);
219                if matches!(state.current(), Some('+') | Some('-')) {
220                    state.advance(1);
221                }
222                while state.current().map_or(false, |c| c.is_ascii_digit()) {
223                    state.advance(1);
224                }
225                state.add_token(FSharpSyntaxKind::FloatLiteral, start, state.get_position());
226            }
227            else {
228                // 处理数字后缀
229                if state.current().map_or(false, |c| c.is_ascii_alphabetic()) {
230                    while state.current().map_or(false, |c| c.is_ascii_alphanumeric()) {
231                        state.advance(1);
232                    }
233                }
234                state.add_token(FSharpSyntaxKind::IntegerLiteral, start, state.get_position());
235            }
236        }
237
238        true
239    }
240
241    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
242        if !state.current().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
243            return false;
244        }
245
246        let start = state.get_position();
247
248        // 读取标识符
249        while state.current().map_or(false, |c| c.is_ascii_alphanumeric() || c == '_') {
250            state.advance(1);
251        }
252
253        let text = state.get_text_from(start);
254        let kind = self.classify_identifier(&text);
255        state.add_token(kind, start, state.get_position());
256        true
257    }
258
259    fn classify_identifier(&self, text: &str) -> FSharpSyntaxKind {
260        match text {
261            // F# 关键字
262            "abstract" => FSharpSyntaxKind::Abstract,
263            "and" => FSharpSyntaxKind::And,
264            "as" => FSharpSyntaxKind::As,
265            "assert" => FSharpSyntaxKind::Assert,
266            "base" => FSharpSyntaxKind::Base,
267            "begin" => FSharpSyntaxKind::Begin,
268            "class" => FSharpSyntaxKind::Class,
269            "default" => FSharpSyntaxKind::Default,
270            "do" => FSharpSyntaxKind::Do,
271            "done" => FSharpSyntaxKind::Done,
272            "downcast" => FSharpSyntaxKind::Downcast,
273            "downto" => FSharpSyntaxKind::Downto,
274            "elif" => FSharpSyntaxKind::Elif,
275            "else" => FSharpSyntaxKind::Else,
276            "end" => FSharpSyntaxKind::End,
277            "exception" => FSharpSyntaxKind::Exception,
278            "extern" => FSharpSyntaxKind::Extern,
279            "false" => FSharpSyntaxKind::False,
280            "finally" => FSharpSyntaxKind::Finally,
281            "for" => FSharpSyntaxKind::For,
282            "fun" => FSharpSyntaxKind::Fun,
283            "function" => FSharpSyntaxKind::Function,
284            "global" => FSharpSyntaxKind::Global,
285            "if" => FSharpSyntaxKind::If,
286            "in" => FSharpSyntaxKind::In,
287            "inherit" => FSharpSyntaxKind::Inherit,
288            "inline" => FSharpSyntaxKind::Inline,
289            "interface" => FSharpSyntaxKind::Interface,
290            "internal" => FSharpSyntaxKind::Internal,
291            "lazy" => FSharpSyntaxKind::Lazy,
292            "let" => FSharpSyntaxKind::Let,
293            "match" => FSharpSyntaxKind::Match,
294            "member" => FSharpSyntaxKind::Member,
295            "module" => FSharpSyntaxKind::Module,
296            "mutable" => FSharpSyntaxKind::Mutable,
297            "namespace" => FSharpSyntaxKind::Namespace,
298            "new" => FSharpSyntaxKind::New,
299            "not" => FSharpSyntaxKind::Not,
300            "null" => FSharpSyntaxKind::Null,
301            "of" => FSharpSyntaxKind::Of,
302            "open" => FSharpSyntaxKind::Open,
303            "or" => FSharpSyntaxKind::Or,
304            "override" => FSharpSyntaxKind::Override,
305            "private" => FSharpSyntaxKind::Private,
306            "public" => FSharpSyntaxKind::Public,
307            "rec" => FSharpSyntaxKind::Rec,
308            "return" => FSharpSyntaxKind::Return,
309            "sig" => FSharpSyntaxKind::Sig,
310            "static" => FSharpSyntaxKind::Static,
311            "struct" => FSharpSyntaxKind::Struct,
312            "then" => FSharpSyntaxKind::Then,
313            "to" => FSharpSyntaxKind::To,
314            "true" => FSharpSyntaxKind::True,
315            "try" => FSharpSyntaxKind::Try,
316            "type" => FSharpSyntaxKind::Type,
317            "upcast" => FSharpSyntaxKind::Upcast,
318            "use" => FSharpSyntaxKind::Use,
319            "val" => FSharpSyntaxKind::Val,
320            "void" => FSharpSyntaxKind::Void,
321            "when" => FSharpSyntaxKind::When,
322            "while" => FSharpSyntaxKind::While,
323            "with" => FSharpSyntaxKind::With,
324            "yield" => FSharpSyntaxKind::Yield,
325            "async" => FSharpSyntaxKind::Async,
326            "seq" => FSharpSyntaxKind::Seq,
327            "raise" => FSharpSyntaxKind::Raise,
328            "failwith" => FSharpSyntaxKind::Failwith,
329            _ => FSharpSyntaxKind::Identifier,
330        }
331    }
332
333    fn lex_operator_or_punctuation<S: Source>(&self, state: &mut State<S>) -> bool {
334        let current = state.current();
335        if current.is_none() {
336            return false;
337        }
338
339        let start = state.get_position();
340        let c = current.unwrap();
341        let next = state.peek();
342
343        // 双字符操作符
344        match (c, next) {
345            ('-', Some('>')) => {
346                state.advance(2);
347                state.add_token(FSharpSyntaxKind::Arrow, start, state.get_position());
348                return true;
349            }
350            (':', Some(':')) => {
351                state.advance(2);
352                state.add_token(FSharpSyntaxKind::Cons, start, state.get_position());
353                return true;
354            }
355            ('=', Some('=')) => {
356                state.advance(2);
357                state.add_token(FSharpSyntaxKind::Equal, start, state.get_position());
358                return true;
359            }
360            ('<', Some('=')) => {
361                state.advance(2);
362                state.add_token(FSharpSyntaxKind::LessEqual, start, state.get_position());
363                return true;
364            }
365            ('>', Some('=')) => {
366                state.advance(2);
367                state.add_token(FSharpSyntaxKind::GreaterEqual, start, state.get_position());
368                return true;
369            }
370            ('<', Some('>')) => {
371                state.advance(2);
372                state.add_token(FSharpSyntaxKind::NotEqual, start, state.get_position());
373                return true;
374            }
375            ('|', Some('>')) => {
376                state.advance(2);
377                state.add_token(FSharpSyntaxKind::Pipe, start, state.get_position());
378                return true;
379            }
380            _ => {}
381        }
382
383        // 单字符操作符和标点符号
384        let kind = match c {
385            '+' => FSharpSyntaxKind::Plus,
386            '-' => FSharpSyntaxKind::Minus,
387            '*' => FSharpSyntaxKind::Star,
388            '/' => FSharpSyntaxKind::Slash,
389            '%' => FSharpSyntaxKind::Percent,
390            '=' => FSharpSyntaxKind::Equal,
391            '<' => FSharpSyntaxKind::LessThan,
392            '>' => FSharpSyntaxKind::GreaterThan,
393            '&' => FSharpSyntaxKind::Ampersand,
394            '|' => FSharpSyntaxKind::Pipe,
395            '^' => FSharpSyntaxKind::Caret,
396            '!' => FSharpSyntaxKind::Not,
397            '?' => FSharpSyntaxKind::Question,
398            ':' => FSharpSyntaxKind::Colon,
399            ';' => FSharpSyntaxKind::Semicolon,
400            ',' => FSharpSyntaxKind::Comma,
401            '.' => FSharpSyntaxKind::Dot,
402            '(' => FSharpSyntaxKind::LeftParen,
403            ')' => FSharpSyntaxKind::RightParen,
404            '[' => FSharpSyntaxKind::LeftBracket,
405            ']' => FSharpSyntaxKind::RightBracket,
406            '{' => FSharpSyntaxKind::LeftBrace,
407            '}' => FSharpSyntaxKind::RightBrace,
408            _ => return false,
409        };
410
411        state.advance(1);
412        state.add_token(kind, start, state.get_position());
413        true
414    }
415}
416
417impl<'config> Lexer<FSharpLanguage> for FSharpLexer<'config> {
418    fn lex_incremental(
419        &self,
420        source: impl Source,
421        changed: usize,
422        cache: IncrementalCache<FSharpLanguage>,
423    ) -> LexOutput<FSharpLanguage> {
424        let mut state = LexerState::new_with_cache(source, changed, cache);
425        let result = self.run(&mut state);
426        state.finish(result)
427    }
428}