// Source file: oak_fsharp/lexer/mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::FSharpLanguage, lexer::token_type::FSharpTokenType};
5use oak_core::{
6    Lexer, LexerCache, LexerState, OakError, Range, Source, TextEdit,
7    lexer::{LexOutput, WhitespaceConfig},
8};
9use std::sync::LazyLock;
10
/// Lexer state specialised to [`FSharpLanguage`]; every lexing routine in this
/// file receives it by mutable reference.
pub(crate) type State<'a, S> = LexerState<'a, S, FSharpLanguage>;

/// Lazily initialised whitespace configuration with `unicode_whitespace` enabled.
///
/// NOTE(review): no code visible in this file reads this static — confirm whether
/// it is still needed.
static FS_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
14
/// Lexer for F# source text that produces [`FSharpTokenType`] tokens.
///
/// The lexer only borrows its language configuration, so it is cheap to clone.
#[derive(Clone)]
pub struct FSharpLexer<'config> {
    /// Language configuration handed to [`FSharpLexer::new`].
    config: &'config FSharpLanguage,
}
20
21impl<'config> Lexer<FSharpLanguage> for FSharpLexer<'config> {
22    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<FSharpLanguage>) -> LexOutput<FSharpLanguage> {
23        let mut state = LexerState::new(source);
24        let result = self.run(&mut state);
25        if result.is_ok() {
26            state.add_eof();
27        }
28        state.finish_with_cache(result, cache)
29    }
30}
31
32impl<'config> FSharpLexer<'config> {
33    /// Creates a new `FSharpLexer`
34    pub fn new(config: &'config FSharpLanguage) -> Self {
35        Self { config }
36    }
37
38    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
39        while state.not_at_end() {
40            // Skip whitespace characters
41            if self.skip_whitespace(state) {
42                continue;
43            }
44
45            // Handle comments
46            if self.skip_comment(state) {
47                continue;
48            }
49
50            // Handle string literals
51            if self.lex_string_literal(state) {
52                continue;
53            }
54
55            // Handle character literals
56            if self.lex_char_literal(state) {
57                continue;
58            }
59
60            // Handle number literals
61            if self.lex_number(state) {
62                continue;
63            }
64
65            // Handle identifiers and keywords
66            if self.lex_identifier_or_keyword(state) {
67                continue;
68            }
69
70            // Handle operators and punctuation
71            if self.lex_operator_or_punctuation(state) {
72                continue;
73            }
74
75            // If no match, skip current character
76            let start = state.get_position();
77            if let Some(ch) = state.peek() {
78                state.advance(ch.len_utf8());
79                state.add_token(FSharpTokenType::Error, start, state.get_position())
80            }
81        }
82
83        Ok(())
84    }
85
86    /// Skips whitespace characters
87    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
88        let start = state.get_position();
89        if let Some(ch) = state.peek() {
90            if ch == '\n' || ch == '\r' {
91                state.advance(ch.len_utf8());
92                state.add_token(FSharpTokenType::Newline, start, state.get_position());
93                return true;
94            }
95            if ch.is_whitespace() {
96                state.advance(ch.len_utf8());
97                while let Some(next) = state.peek() {
98                    if next == '\n' || next == '\r' || !next.is_whitespace() {
99                        break;
100                    }
101                    state.advance(next.len_utf8());
102                }
103                state.add_token(FSharpTokenType::Whitespace, start, state.get_position());
104                return true;
105            }
106        }
107        false
108    }
109
110    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
111        let start = state.get_position();
112        let rest = state.rest();
113
114        // Line comment: // ... until newline
115        if rest.starts_with("//") {
116            state.advance(2);
117            while let Some(ch) = state.peek() {
118                if ch == '\n' || ch == '\r' {
119                    break;
120                }
121                state.advance(ch.len_utf8());
122            }
123            state.add_token(FSharpTokenType::LineComment, start, state.get_position());
124            return true;
125        }
126
127        // Block comment: (* ... *) supporting nesting
128        if rest.starts_with("(*") {
129            state.advance(2);
130            let mut depth = 1usize;
131            while let Some(ch) = state.peek() {
132                if ch == '(' && state.peek_next_n(1) == Some('*') {
133                    state.advance(2);
134                    depth += 1;
135                    continue;
136                }
137                if ch == '*' && state.peek_next_n(1) == Some(')') {
138                    state.advance(2);
139                    depth -= 1;
140                    if depth == 0 {
141                        break;
142                    }
143                    continue;
144                }
145                state.advance(ch.len_utf8());
146            }
147            state.add_token(FSharpTokenType::BlockComment, start, state.get_position());
148            return true;
149        }
150        false
151    }
152
153    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
154        let start = state.get_position();
155
156        // Verbatim string: @"..."
157        if state.peek() == Some('@') && state.peek_next_n(1) == Some('"') {
158            state.advance(2); // Skip @"
159            while let Some(ch) = state.peek() {
160                if ch == '"' {
161                    state.advance(1);
162                    break;
163                }
164                state.advance(ch.len_utf8());
165            }
166            state.add_token(FSharpTokenType::StringLiteral, start, state.get_position());
167            return true;
168        }
169
170        // Normal string: "..."
171        if state.peek() == Some('"') {
172            state.advance(1); // Skip "
173            while let Some(ch) = state.peek() {
174                if ch == '"' {
175                    state.advance(1);
176                    break;
177                }
178                if ch == '\\' {
179                    state.advance(1); // Skip escape character
180                    if let Some(escaped) = state.peek() {
181                        state.advance(escaped.len_utf8());
182                    }
183                }
184                else {
185                    state.advance(ch.len_utf8());
186                }
187            }
188            state.add_token(FSharpTokenType::StringLiteral, start, state.get_position());
189            return true;
190        }
191        false
192    }
193
194    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
195        let start = state.get_position();
196
197        if state.peek() == Some('\'') {
198            state.advance(1); // Skip '
199            if let Some(ch) = state.peek() {
200                if ch == '\\' {
201                    state.advance(1); // Skip escape character
202                    if let Some(escaped) = state.peek() {
203                        state.advance(escaped.len_utf8());
204                    }
205                }
206                else {
207                    state.advance(ch.len_utf8());
208                }
209            }
210            if state.peek() == Some('\'') {
211                state.advance(1); // Skip closing '
212            }
213            state.add_token(FSharpTokenType::CharLiteral, start, state.get_position());
214            return true;
215        }
216        false
217    }
218
219    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
220        if !state.current().map_or(false, |c| c.is_ascii_digit()) {
221            return false;
222        }
223
224        let start = state.get_position();
225
226        // Handle integer part
227        while state.current().map_or(false, |c| c.is_ascii_digit()) {
228            state.advance(1);
229        }
230
231        // Handle decimal point
232        if state.current() == Some('.') && state.peek().map_or(false, |c| c.is_ascii_digit()) {
233            state.advance(1); // Skip '.'
234            while state.current().map_or(false, |c| c.is_ascii_digit()) {
235                state.advance(1);
236            }
237            state.add_token(FSharpTokenType::FloatLiteral, start, state.get_position());
238        }
239        else {
240            // Handle scientific notation
241            if matches!(state.current(), Some('e') | Some('E')) {
242                state.advance(1);
243                if matches!(state.current(), Some('+') | Some('-')) {
244                    state.advance(1);
245                }
246                while state.current().map_or(false, |c| c.is_ascii_digit()) {
247                    state.advance(1);
248                }
249                state.add_token(FSharpTokenType::FloatLiteral, start, state.get_position());
250            }
251            else {
252                // Handle numeric suffixes
253                if state.current().map_or(false, |c| c.is_ascii_alphabetic()) {
254                    while state.current().map_or(false, |c| c.is_ascii_alphanumeric()) {
255                        state.advance(1);
256                    }
257                }
258                state.add_token(FSharpTokenType::IntegerLiteral, start, state.get_position());
259            }
260        }
261
262        true
263    }
264
265    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
266        if !state.current().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
267            return false;
268        }
269
270        let start = state.get_position();
271        while state.current().map_or(false, |c| c.is_ascii_alphanumeric() || c == '_') {
272            state.advance(1);
273        }
274
275        let text = state.get_text_in((start..state.get_position()).into());
276        let kind = match text.as_ref() {
277            "abstract" => FSharpTokenType::Abstract,
278            "and" => FSharpTokenType::And,
279            "as" => FSharpTokenType::As,
280            "assert" => FSharpTokenType::Assert,
281            "base" => FSharpTokenType::Base,
282            "begin" => FSharpTokenType::Begin,
283            "class" => FSharpTokenType::Class,
284            "default" => FSharpTokenType::Default,
285            "delegate" => FSharpTokenType::Delegate,
286            "do" => FSharpTokenType::Do,
287            "done" => FSharpTokenType::Done,
288            "downcast" => FSharpTokenType::Downcast,
289            "downto" => FSharpTokenType::Downto,
290            "elif" => FSharpTokenType::Elif,
291            "else" => FSharpTokenType::Else,
292            "end" => FSharpTokenType::End,
293            "exception" => FSharpTokenType::Exception,
294            "extern" => FSharpTokenType::Extern,
295            "false" => FSharpTokenType::False,
296            "finally" => FSharpTokenType::Finally,
297            "for" => FSharpTokenType::For,
298            "fun" => FSharpTokenType::Fun,
299            "function" => FSharpTokenType::Function,
300            "if" => FSharpTokenType::If,
301            "in" => FSharpTokenType::In,
302            "inherit" => FSharpTokenType::Inherit,
303            "inline" => FSharpTokenType::Inline,
304            "interface" => FSharpTokenType::Interface,
305            "internal" => FSharpTokenType::Internal,
306            "lazy" => FSharpTokenType::Lazy,
307            "let" => FSharpTokenType::Let,
308            "match" => FSharpTokenType::Match,
309            "member" => FSharpTokenType::Member,
310            "module" => FSharpTokenType::Module,
311            "mutable" => FSharpTokenType::Mutable,
312            "namespace" => FSharpTokenType::Namespace,
313            "new" => FSharpTokenType::New,
314            "not" => FSharpTokenType::Not,
315            "null" => FSharpTokenType::Null,
316            "of" => FSharpTokenType::Of,
317            "open" => FSharpTokenType::Open,
318            "or" => FSharpTokenType::Or,
319            "override" => FSharpTokenType::Override,
320            "private" => FSharpTokenType::Private,
321            "public" => FSharpTokenType::Public,
322            "rec" => FSharpTokenType::Rec,
323            "return" => FSharpTokenType::Return,
324            "select" => FSharpTokenType::Select,
325            "static" => FSharpTokenType::Static,
326            "struct" => FSharpTokenType::Struct,
327            "then" => FSharpTokenType::Then,
328            "to" => FSharpTokenType::To,
329            "true" => FSharpTokenType::True,
330            "try" => FSharpTokenType::Try,
331            "type" => FSharpTokenType::Type,
332            "upcast" => FSharpTokenType::Upcast,
333            "use" => FSharpTokenType::Use,
334            "val" => FSharpTokenType::Val,
335            "void" => FSharpTokenType::Void,
336            "when" => FSharpTokenType::When,
337            "while" => FSharpTokenType::While,
338            "with" => FSharpTokenType::With,
339            "yield" => FSharpTokenType::Yield,
340            _ => FSharpTokenType::Identifier,
341        };
342
343        state.add_token(kind, start, state.get_position());
344        true
345    }
346
347    fn lex_operator_or_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
348        let current = state.current();
349        if current.is_none() {
350            return false;
351        }
352
353        let start = state.get_position();
354        let c = current.unwrap();
355        let next = state.peek();
356
357        // Two-character operators
358        match (c, next) {
359            ('-', Some('>')) => {
360                state.advance(2);
361                state.add_token(FSharpTokenType::Arrow, start, state.get_position());
362                return true;
363            }
364            (':', Some(':')) => {
365                state.advance(2);
366                state.add_token(FSharpTokenType::Cons, start, state.get_position());
367                return true;
368            }
369            ('=', Some('=')) => {
370                state.advance(2);
371                state.add_token(FSharpTokenType::Equal, start, state.get_position());
372                return true;
373            }
374            ('<', Some('=')) => {
375                state.advance(2);
376                state.add_token(FSharpTokenType::LessEqual, start, state.get_position());
377                return true;
378            }
379            ('>', Some('=')) => {
380                state.advance(2);
381                state.add_token(FSharpTokenType::GreaterEqual, start, state.get_position());
382                return true;
383            }
384            ('<', Some('>')) => {
385                state.advance(2);
386                state.add_token(FSharpTokenType::NotEqual, start, state.get_position());
387                return true;
388            }
389            ('|', Some('>')) => {
390                state.advance(2);
391                state.add_token(FSharpTokenType::Pipe, start, state.get_position());
392                return true;
393            }
394            _ => {}
395        }
396
397        // Single-character operators and punctuation
398        let kind = match c {
399            '+' => FSharpTokenType::Plus,
400            '-' => FSharpTokenType::Minus,
401            '*' => FSharpTokenType::Star,
402            '/' => FSharpTokenType::Slash,
403            '%' => FSharpTokenType::Percent,
404            '=' => FSharpTokenType::Equal,
405            '<' => FSharpTokenType::LessThan,
406            '>' => FSharpTokenType::GreaterThan,
407            '&' => FSharpTokenType::Ampersand,
408            '|' => FSharpTokenType::Pipe,
409            '^' => FSharpTokenType::Caret,
410            '!' => FSharpTokenType::Not,
411            '?' => FSharpTokenType::Question,
412            ':' => FSharpTokenType::Colon,
413            ';' => FSharpTokenType::Semicolon,
414            ',' => FSharpTokenType::Comma,
415            '.' => FSharpTokenType::Dot,
416            '(' => FSharpTokenType::LeftParen,
417            ')' => FSharpTokenType::RightParen,
418            '[' => FSharpTokenType::LeftBracket,
419            ']' => FSharpTokenType::RightBracket,
420            '{' => FSharpTokenType::LeftBrace,
421            '}' => FSharpTokenType::RightBrace,
422            _ => return false,
423        };
424
425        state.advance(1);
426        state.add_token(kind, start, state.get_position());
427        true
428    }
429}