Skip to main content

oak_fsharp/lexer/
mod.rs

1use crate::{kind::FSharpSyntaxKind, language::FSharpLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::{LexOutput, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
/// Shorthand for the generic lexer state specialised to the F# language.
type State<'a, S> = LexerState<'a, S, FSharpLanguage>;

/// Whitespace-scanning configuration shared by all lexer instances; built lazily on first use
/// with `unicode_whitespace` enabled.
static FS_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12
/// F# lexer.
///
/// Borrows the language configuration for the lifetime `'config`.
#[derive(Clone)]
pub struct FSharpLexer<'config> {
    /// Language configuration supplied at construction; not read by any routine visible in this
    /// file (hence the leading underscore).
    _config: &'config FSharpLanguage,
}
18
19impl<'config> Lexer<FSharpLanguage> for FSharpLexer<'config> {
20    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<FSharpLanguage>) -> LexOutput<FSharpLanguage> {
21        let mut state = LexerState::new(source);
22        let result = self.run(&mut state);
23        if result.is_ok() {
24            state.add_eof();
25        }
26        state.finish_with_cache(result, cache)
27    }
28}
29
30impl<'config> FSharpLexer<'config> {
    /// Creates a lexer that borrows the given language configuration.
    pub fn new(config: &'config FSharpLanguage) -> Self {
        Self { _config: config }
    }
34
35    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
36        while state.not_at_end() {
37            // 跳过空白字符
38            if self.skip_whitespace(state) {
39                continue;
40            }
41
42            // 处理注释
43            if self.skip_comment(state) {
44                continue;
45            }
46
47            // 处理字符串字面量
48            if self.lex_string_literal(state) {
49                continue;
50            }
51
52            // 处理字符字面量
53            if self.lex_char_literal(state) {
54                continue;
55            }
56
57            // 处理数字字面量
58            if self.lex_number(state) {
59                continue;
60            }
61
62            // 处理标识符和关键字
63            if self.lex_identifier_or_keyword(state) {
64                continue;
65            }
66
67            // 处理操作符和标点符号
68            if self.lex_operator_or_punctuation(state) {
69                continue;
70            }
71
72            // 如果没有匹配任何模式,跳过当前字符
73            let start = state.get_position();
74            if let Some(ch) = state.peek() {
75                state.advance(ch.len_utf8());
76                state.add_token(FSharpSyntaxKind::Error, start, state.get_position());
77            }
78        }
79
80        Ok(())
81    }
82
    /// Scans whitespace at the current position using the shared `FS_WHITESPACE` configuration,
    /// tagging it as `Whitespace`; returns whatever `scan` reports.
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        FS_WHITESPACE.scan(state, FSharpSyntaxKind::Whitespace)
    }
87
88    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
89        let start = state.get_position();
90        let rest = state.rest();
91
92        // 行注释: // ... 直到换行
93        if rest.starts_with("//") {
94            state.advance(2);
95            while let Some(ch) = state.peek() {
96                if ch == '\n' || ch == '\r' {
97                    break;
98                }
99                state.advance(ch.len_utf8());
100            }
101            state.add_token(FSharpSyntaxKind::LineComment, start, state.get_position());
102            return true;
103        }
104
105        // 块注释: (* ... *) 支持嵌套
106        if rest.starts_with("(*") {
107            state.advance(2);
108            let mut depth = 1usize;
109            while let Some(ch) = state.peek() {
110                if ch == '(' && state.peek_next_n(1) == Some('*') {
111                    state.advance(2);
112                    depth += 1;
113                    continue;
114                }
115                if ch == '*' && state.peek_next_n(1) == Some(')') {
116                    state.advance(2);
117                    depth -= 1;
118                    if depth == 0 {
119                        break;
120                    }
121                    continue;
122                }
123                state.advance(ch.len_utf8());
124            }
125            state.add_token(FSharpSyntaxKind::BlockComment, start, state.get_position());
126            return true;
127        }
128        false
129    }
130
131    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
132        let start = state.get_position();
133
134        // 原始字符串: @"..."
135        if state.peek() == Some('@') && state.peek_next_n(1) == Some('"') {
136            state.advance(2); // 跳过 @"
137            while let Some(ch) = state.peek() {
138                if ch == '"' {
139                    state.advance(1);
140                    break;
141                }
142                state.advance(ch.len_utf8());
143            }
144            state.add_token(FSharpSyntaxKind::StringLiteral, start, state.get_position());
145            return true;
146        }
147
148        // 普通字符串: "..."
149        if state.peek() == Some('"') {
150            state.advance(1); // 跳过 "
151            while let Some(ch) = state.peek() {
152                if ch == '"' {
153                    state.advance(1);
154                    break;
155                }
156                if ch == '\\' {
157                    state.advance(1); // 跳过转义字符
158                    if let Some(escaped) = state.peek() {
159                        state.advance(escaped.len_utf8());
160                    }
161                }
162                else {
163                    state.advance(ch.len_utf8());
164                }
165            }
166            state.add_token(FSharpSyntaxKind::StringLiteral, start, state.get_position());
167            return true;
168        }
169        false
170    }
171
172    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
173        let start = state.get_position();
174
175        if state.peek() == Some('\'') {
176            state.advance(1); // 跳过 '
177            if let Some(ch) = state.peek() {
178                if ch == '\\' {
179                    state.advance(1); // 跳过转义字符
180                    if let Some(escaped) = state.peek() {
181                        state.advance(escaped.len_utf8());
182                    }
183                }
184                else {
185                    state.advance(ch.len_utf8());
186                }
187            }
188            if state.peek() == Some('\'') {
189                state.advance(1); // 跳过结束的 '
190            }
191            state.add_token(FSharpSyntaxKind::CharLiteral, start, state.get_position());
192            return true;
193        }
194        false
195    }
196
197    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
198        if !state.current().map_or(false, |c| c.is_ascii_digit()) {
199            return false;
200        }
201
202        let start = state.get_position();
203
204        // 处理整数部分
205        while state.current().map_or(false, |c| c.is_ascii_digit()) {
206            state.advance(1);
207        }
208
209        // 处理小数点
210        if state.current() == Some('.') && state.peek().map_or(false, |c| c.is_ascii_digit()) {
211            state.advance(1); // 跳过 '.'
212            while state.current().map_or(false, |c| c.is_ascii_digit()) {
213                state.advance(1);
214            }
215            state.add_token(FSharpSyntaxKind::FloatLiteral, start, state.get_position());
216        }
217        else {
218            // 处理科学计数法
219            if matches!(state.current(), Some('e') | Some('E')) {
220                state.advance(1);
221                if matches!(state.current(), Some('+') | Some('-')) {
222                    state.advance(1);
223                }
224                while state.current().map_or(false, |c| c.is_ascii_digit()) {
225                    state.advance(1);
226                }
227                state.add_token(FSharpSyntaxKind::FloatLiteral, start, state.get_position());
228            }
229            else {
230                // 处理数字后缀
231                if state.current().map_or(false, |c| c.is_ascii_alphabetic()) {
232                    while state.current().map_or(false, |c| c.is_ascii_alphanumeric()) {
233                        state.advance(1);
234                    }
235                }
236                state.add_token(FSharpSyntaxKind::IntegerLiteral, start, state.get_position());
237            }
238        }
239
240        true
241    }
242
243    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
244        if !state.current().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
245            return false;
246        }
247
248        let start = state.get_position();
249
250        // 读取标识符
251        while state.current().map_or(false, |c| c.is_ascii_alphanumeric() || c == '_') {
252            state.advance(1);
253        }
254
255        let text = state.get_text_from(start);
256        let kind = self.classify_identifier(&text);
257        state.add_token(kind, start, state.get_position());
258        true
259    }
260
261    fn classify_identifier(&self, text: &str) -> FSharpSyntaxKind {
262        match text {
263            // F# 关键字
264            "abstract" => FSharpSyntaxKind::Abstract,
265            "and" => FSharpSyntaxKind::And,
266            "as" => FSharpSyntaxKind::As,
267            "assert" => FSharpSyntaxKind::Assert,
268            "base" => FSharpSyntaxKind::Base,
269            "begin" => FSharpSyntaxKind::Begin,
270            "class" => FSharpSyntaxKind::Class,
271            "default" => FSharpSyntaxKind::Default,
272            "do" => FSharpSyntaxKind::Do,
273            "done" => FSharpSyntaxKind::Done,
274            "downcast" => FSharpSyntaxKind::Downcast,
275            "downto" => FSharpSyntaxKind::Downto,
276            "elif" => FSharpSyntaxKind::Elif,
277            "else" => FSharpSyntaxKind::Else,
278            "end" => FSharpSyntaxKind::End,
279            "exception" => FSharpSyntaxKind::Exception,
280            "extern" => FSharpSyntaxKind::Extern,
281            "false" => FSharpSyntaxKind::False,
282            "finally" => FSharpSyntaxKind::Finally,
283            "for" => FSharpSyntaxKind::For,
284            "fun" => FSharpSyntaxKind::Fun,
285            "function" => FSharpSyntaxKind::Function,
286            "global" => FSharpSyntaxKind::Global,
287            "if" => FSharpSyntaxKind::If,
288            "in" => FSharpSyntaxKind::In,
289            "inherit" => FSharpSyntaxKind::Inherit,
290            "inline" => FSharpSyntaxKind::Inline,
291            "interface" => FSharpSyntaxKind::Interface,
292            "internal" => FSharpSyntaxKind::Internal,
293            "lazy" => FSharpSyntaxKind::Lazy,
294            "let" => FSharpSyntaxKind::Let,
295            "match" => FSharpSyntaxKind::Match,
296            "member" => FSharpSyntaxKind::Member,
297            "module" => FSharpSyntaxKind::Module,
298            "mutable" => FSharpSyntaxKind::Mutable,
299            "namespace" => FSharpSyntaxKind::Namespace,
300            "new" => FSharpSyntaxKind::New,
301            "not" => FSharpSyntaxKind::Not,
302            "null" => FSharpSyntaxKind::Null,
303            "of" => FSharpSyntaxKind::Of,
304            "open" => FSharpSyntaxKind::Open,
305            "or" => FSharpSyntaxKind::Or,
306            "override" => FSharpSyntaxKind::Override,
307            "private" => FSharpSyntaxKind::Private,
308            "public" => FSharpSyntaxKind::Public,
309            "rec" => FSharpSyntaxKind::Rec,
310            "return" => FSharpSyntaxKind::Return,
311            "sig" => FSharpSyntaxKind::Sig,
312            "static" => FSharpSyntaxKind::Static,
313            "struct" => FSharpSyntaxKind::Struct,
314            "then" => FSharpSyntaxKind::Then,
315            "to" => FSharpSyntaxKind::To,
316            "true" => FSharpSyntaxKind::True,
317            "try" => FSharpSyntaxKind::Try,
318            "type" => FSharpSyntaxKind::Type,
319            "upcast" => FSharpSyntaxKind::Upcast,
320            "use" => FSharpSyntaxKind::Use,
321            "val" => FSharpSyntaxKind::Val,
322            "void" => FSharpSyntaxKind::Void,
323            "when" => FSharpSyntaxKind::When,
324            "while" => FSharpSyntaxKind::While,
325            "with" => FSharpSyntaxKind::With,
326            "yield" => FSharpSyntaxKind::Yield,
327            "async" => FSharpSyntaxKind::Async,
328            "seq" => FSharpSyntaxKind::Seq,
329            "raise" => FSharpSyntaxKind::Raise,
330            "failwith" => FSharpSyntaxKind::Failwith,
331            _ => FSharpSyntaxKind::Identifier,
332        }
333    }
334
335    fn lex_operator_or_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
336        let current = state.current();
337        if current.is_none() {
338            return false;
339        }
340
341        let start = state.get_position();
342        let c = current.unwrap();
343        let next = state.peek();
344
345        // 双字符操作符
346        match (c, next) {
347            ('-', Some('>')) => {
348                state.advance(2);
349                state.add_token(FSharpSyntaxKind::Arrow, start, state.get_position());
350                return true;
351            }
352            (':', Some(':')) => {
353                state.advance(2);
354                state.add_token(FSharpSyntaxKind::Cons, start, state.get_position());
355                return true;
356            }
357            ('=', Some('=')) => {
358                state.advance(2);
359                state.add_token(FSharpSyntaxKind::Equal, start, state.get_position());
360                return true;
361            }
362            ('<', Some('=')) => {
363                state.advance(2);
364                state.add_token(FSharpSyntaxKind::LessEqual, start, state.get_position());
365                return true;
366            }
367            ('>', Some('=')) => {
368                state.advance(2);
369                state.add_token(FSharpSyntaxKind::GreaterEqual, start, state.get_position());
370                return true;
371            }
372            ('<', Some('>')) => {
373                state.advance(2);
374                state.add_token(FSharpSyntaxKind::NotEqual, start, state.get_position());
375                return true;
376            }
377            ('|', Some('>')) => {
378                state.advance(2);
379                state.add_token(FSharpSyntaxKind::Pipe, start, state.get_position());
380                return true;
381            }
382            _ => {}
383        }
384
385        // 单字符操作符和标点符号
386        let kind = match c {
387            '+' => FSharpSyntaxKind::Plus,
388            '-' => FSharpSyntaxKind::Minus,
389            '*' => FSharpSyntaxKind::Star,
390            '/' => FSharpSyntaxKind::Slash,
391            '%' => FSharpSyntaxKind::Percent,
392            '=' => FSharpSyntaxKind::Equal,
393            '<' => FSharpSyntaxKind::LessThan,
394            '>' => FSharpSyntaxKind::GreaterThan,
395            '&' => FSharpSyntaxKind::Ampersand,
396            '|' => FSharpSyntaxKind::Pipe,
397            '^' => FSharpSyntaxKind::Caret,
398            '!' => FSharpSyntaxKind::Not,
399            '?' => FSharpSyntaxKind::Question,
400            ':' => FSharpSyntaxKind::Colon,
401            ';' => FSharpSyntaxKind::Semicolon,
402            ',' => FSharpSyntaxKind::Comma,
403            '.' => FSharpSyntaxKind::Dot,
404            '(' => FSharpSyntaxKind::LeftParen,
405            ')' => FSharpSyntaxKind::RightParen,
406            '[' => FSharpSyntaxKind::LeftBracket,
407            ']' => FSharpSyntaxKind::RightBracket,
408            '{' => FSharpSyntaxKind::LeftBrace,
409            '}' => FSharpSyntaxKind::RightBrace,
410            _ => return false,
411        };
412
413        state.advance(1);
414        state.add_token(kind, start, state.get_position());
415        true
416    }
417}