Skip to main content

oak_fsharp/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::FSharpLanguage, lexer::token_type::FSharpTokenType};
5use oak_core::{
6    Lexer, LexerCache, LexerState, OakError, Range, Source, TextEdit,
7    lexer::{LexOutput, WhitespaceConfig},
8};
9use std::sync::LazyLock;
10
/// Shorthand for the core [`LexerState`] specialised to [`FSharpLanguage`]; `S` is the source type being lexed.
pub(crate) type State<'a, S> = LexerState<'a, S, FSharpLanguage>;
12
/// Lazily initialised whitespace configuration with `unicode_whitespace` enabled.
///
/// NOTE(review): nothing in the visible part of this module reads this static — the
/// whitespace routine below is hand-written; confirm whether it is still needed.
static FS_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
14
/// Lexer for F# source text.
///
/// The lexer only borrows its [`FSharpLanguage`] configuration for the `'config`
/// lifetime, so cloning it is cheap.
#[derive(Clone)]
pub struct FSharpLexer<'config> {
    /// Language configuration supplied to [`FSharpLexer::new`].
    /// NOTE(review): none of the lexing routines visible in this file read it — confirm.
    config: &'config FSharpLanguage,
}
20
21impl<'config> Lexer<FSharpLanguage> for FSharpLexer<'config> {
22    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<FSharpLanguage>) -> LexOutput<FSharpLanguage> {
23        let mut state = LexerState::new(source);
24        let result = self.run(&mut state);
25        if result.is_ok() {
26            state.add_eof();
27        }
28        state.finish_with_cache(result, cache)
29    }
30}
31
32impl<'config> FSharpLexer<'config> {
    /// Creates a new `FSharpLexer` that borrows the given language configuration.
    pub fn new(config: &'config FSharpLanguage) -> Self {
        Self { config }
    }
38
39    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
40        while state.not_at_end() {
41            // Skip whitespace characters
42            if self.skip_whitespace(state) {
43                continue;
44            }
45
46            // Handle comments
47            if self.skip_comment(state) {
48                continue;
49            }
50
51            // Handle string literals
52            if self.lex_string_literal(state) {
53                continue;
54            }
55
56            // Handle character literals
57            if self.lex_char_literal(state) {
58                continue;
59            }
60
61            // Handle number literals
62            if self.lex_number(state) {
63                continue;
64            }
65
66            // Handle identifiers and keywords
67            if self.lex_identifier_or_keyword(state) {
68                continue;
69            }
70
71            // Handle operators and punctuation
72            if self.lex_operator_or_punctuation(state) {
73                continue;
74            }
75
76            // If no match, skip current character
77            let start = state.get_position();
78            if let Some(ch) = state.peek() {
79                state.advance(ch.len_utf8());
80                state.add_token(FSharpTokenType::Error, start, state.get_position())
81            }
82        }
83
84        Ok(())
85    }
86
87    /// Skips whitespace characters
88    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
89        let start = state.get_position();
90        if let Some(ch) = state.peek() {
91            if ch == '\n' || ch == '\r' {
92                state.advance(ch.len_utf8());
93                state.add_token(FSharpTokenType::Newline, start, state.get_position());
94                return true;
95            }
96            if ch.is_whitespace() {
97                state.advance(ch.len_utf8());
98                while let Some(next) = state.peek() {
99                    if next == '\n' || next == '\r' || !next.is_whitespace() {
100                        break;
101                    }
102                    state.advance(next.len_utf8());
103                }
104                state.add_token(FSharpTokenType::Whitespace, start, state.get_position());
105                return true;
106            }
107        }
108        false
109    }
110
111    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
112        let start = state.get_position();
113        let rest = state.rest();
114
115        // Line comment: // ... until newline
116        if rest.starts_with("//") {
117            state.advance(2);
118            while let Some(ch) = state.peek() {
119                if ch == '\n' || ch == '\r' {
120                    break;
121                }
122                state.advance(ch.len_utf8());
123            }
124            state.add_token(FSharpTokenType::LineComment, start, state.get_position());
125            return true;
126        }
127
128        // Block comment: (* ... *) supporting nesting
129        if rest.starts_with("(*") {
130            state.advance(2);
131            let mut depth = 1usize;
132            while let Some(ch) = state.peek() {
133                if ch == '(' && state.peek_next_n(1) == Some('*') {
134                    state.advance(2);
135                    depth += 1;
136                    continue;
137                }
138                if ch == '*' && state.peek_next_n(1) == Some(')') {
139                    state.advance(2);
140                    depth -= 1;
141                    if depth == 0 {
142                        break;
143                    }
144                    continue;
145                }
146                state.advance(ch.len_utf8());
147            }
148            state.add_token(FSharpTokenType::BlockComment, start, state.get_position());
149            return true;
150        }
151        false
152    }
153
154    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
155        let start = state.get_position();
156
157        // Verbatim string: @"..."
158        if state.peek() == Some('@') && state.peek_next_n(1) == Some('"') {
159            state.advance(2); // Skip @"
160            while let Some(ch) = state.peek() {
161                if ch == '"' {
162                    state.advance(1);
163                    break;
164                }
165                state.advance(ch.len_utf8());
166            }
167            state.add_token(FSharpTokenType::StringLiteral, start, state.get_position());
168            return true;
169        }
170
171        // Normal string: "..."
172        if state.peek() == Some('"') {
173            state.advance(1); // Skip "
174            while let Some(ch) = state.peek() {
175                if ch == '"' {
176                    state.advance(1);
177                    break;
178                }
179                if ch == '\\' {
180                    state.advance(1); // Skip escape character
181                    if let Some(escaped) = state.peek() {
182                        state.advance(escaped.len_utf8());
183                    }
184                }
185                else {
186                    state.advance(ch.len_utf8());
187                }
188            }
189            state.add_token(FSharpTokenType::StringLiteral, start, state.get_position());
190            return true;
191        }
192        false
193    }
194
195    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
196        let start = state.get_position();
197
198        if state.peek() == Some('\'') {
199            state.advance(1); // Skip '
200            if let Some(ch) = state.peek() {
201                if ch == '\\' {
202                    state.advance(1); // Skip escape character
203                    if let Some(escaped) = state.peek() {
204                        state.advance(escaped.len_utf8());
205                    }
206                }
207                else {
208                    state.advance(ch.len_utf8());
209                }
210            }
211            if state.peek() == Some('\'') {
212                state.advance(1); // Skip closing '
213            }
214            state.add_token(FSharpTokenType::CharLiteral, start, state.get_position());
215            return true;
216        }
217        false
218    }
219
220    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
221        if !state.current().map_or(false, |c| c.is_ascii_digit()) {
222            return false;
223        }
224
225        let start = state.get_position();
226
227        // Handle integer part
228        while state.current().map_or(false, |c| c.is_ascii_digit()) {
229            state.advance(1);
230        }
231
232        // Handle decimal point
233        if state.current() == Some('.') && state.peek().map_or(false, |c| c.is_ascii_digit()) {
234            state.advance(1); // Skip '.'
235            while state.current().map_or(false, |c| c.is_ascii_digit()) {
236                state.advance(1);
237            }
238            state.add_token(FSharpTokenType::FloatLiteral, start, state.get_position());
239        }
240        else {
241            // Handle scientific notation
242            if matches!(state.current(), Some('e') | Some('E')) {
243                state.advance(1);
244                if matches!(state.current(), Some('+') | Some('-')) {
245                    state.advance(1);
246                }
247                while state.current().map_or(false, |c| c.is_ascii_digit()) {
248                    state.advance(1);
249                }
250                state.add_token(FSharpTokenType::FloatLiteral, start, state.get_position());
251            }
252            else {
253                // Handle numeric suffixes
254                if state.current().map_or(false, |c| c.is_ascii_alphabetic()) {
255                    while state.current().map_or(false, |c| c.is_ascii_alphanumeric()) {
256                        state.advance(1);
257                    }
258                }
259                state.add_token(FSharpTokenType::IntegerLiteral, start, state.get_position());
260            }
261        }
262
263        true
264    }
265
266    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
267        if !state.current().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
268            return false;
269        }
270
271        let start = state.get_position();
272        while state.current().map_or(false, |c| c.is_ascii_alphanumeric() || c == '_') {
273            state.advance(1);
274        }
275
276        let text = state.get_text_in((start..state.get_position()).into());
277        let kind = match text.as_ref() {
278            "abstract" => FSharpTokenType::Abstract,
279            "and" => FSharpTokenType::And,
280            "as" => FSharpTokenType::As,
281            "assert" => FSharpTokenType::Assert,
282            "base" => FSharpTokenType::Base,
283            "begin" => FSharpTokenType::Begin,
284            "class" => FSharpTokenType::Class,
285            "default" => FSharpTokenType::Default,
286            "delegate" => FSharpTokenType::Delegate,
287            "do" => FSharpTokenType::Do,
288            "done" => FSharpTokenType::Done,
289            "downcast" => FSharpTokenType::Downcast,
290            "downto" => FSharpTokenType::Downto,
291            "elif" => FSharpTokenType::Elif,
292            "else" => FSharpTokenType::Else,
293            "end" => FSharpTokenType::End,
294            "exception" => FSharpTokenType::Exception,
295            "extern" => FSharpTokenType::Extern,
296            "false" => FSharpTokenType::False,
297            "finally" => FSharpTokenType::Finally,
298            "for" => FSharpTokenType::For,
299            "fun" => FSharpTokenType::Fun,
300            "function" => FSharpTokenType::Function,
301            "if" => FSharpTokenType::If,
302            "in" => FSharpTokenType::In,
303            "inherit" => FSharpTokenType::Inherit,
304            "inline" => FSharpTokenType::Inline,
305            "interface" => FSharpTokenType::Interface,
306            "internal" => FSharpTokenType::Internal,
307            "lazy" => FSharpTokenType::Lazy,
308            "let" => FSharpTokenType::Let,
309            "match" => FSharpTokenType::Match,
310            "member" => FSharpTokenType::Member,
311            "module" => FSharpTokenType::Module,
312            "mutable" => FSharpTokenType::Mutable,
313            "namespace" => FSharpTokenType::Namespace,
314            "new" => FSharpTokenType::New,
315            "not" => FSharpTokenType::Not,
316            "null" => FSharpTokenType::Null,
317            "of" => FSharpTokenType::Of,
318            "open" => FSharpTokenType::Open,
319            "or" => FSharpTokenType::Or,
320            "override" => FSharpTokenType::Override,
321            "private" => FSharpTokenType::Private,
322            "public" => FSharpTokenType::Public,
323            "rec" => FSharpTokenType::Rec,
324            "return" => FSharpTokenType::Return,
325            "select" => FSharpTokenType::Select,
326            "static" => FSharpTokenType::Static,
327            "struct" => FSharpTokenType::Struct,
328            "then" => FSharpTokenType::Then,
329            "to" => FSharpTokenType::To,
330            "true" => FSharpTokenType::True,
331            "try" => FSharpTokenType::Try,
332            "type" => FSharpTokenType::Type,
333            "upcast" => FSharpTokenType::Upcast,
334            "use" => FSharpTokenType::Use,
335            "val" => FSharpTokenType::Val,
336            "void" => FSharpTokenType::Void,
337            "when" => FSharpTokenType::When,
338            "while" => FSharpTokenType::While,
339            "with" => FSharpTokenType::With,
340            "yield" => FSharpTokenType::Yield,
341            _ => FSharpTokenType::Identifier,
342        };
343
344        state.add_token(kind, start, state.get_position());
345        true
346    }
347
348    fn lex_operator_or_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
349        let current = state.current();
350        if current.is_none() {
351            return false;
352        }
353
354        let start = state.get_position();
355        let c = current.unwrap();
356        let next = state.peek();
357
358        // Two-character operators
359        match (c, next) {
360            ('-', Some('>')) => {
361                state.advance(2);
362                state.add_token(FSharpTokenType::Arrow, start, state.get_position());
363                return true;
364            }
365            (':', Some(':')) => {
366                state.advance(2);
367                state.add_token(FSharpTokenType::Cons, start, state.get_position());
368                return true;
369            }
370            ('=', Some('=')) => {
371                state.advance(2);
372                state.add_token(FSharpTokenType::Equal, start, state.get_position());
373                return true;
374            }
375            ('<', Some('=')) => {
376                state.advance(2);
377                state.add_token(FSharpTokenType::LessEqual, start, state.get_position());
378                return true;
379            }
380            ('>', Some('=')) => {
381                state.advance(2);
382                state.add_token(FSharpTokenType::GreaterEqual, start, state.get_position());
383                return true;
384            }
385            ('<', Some('>')) => {
386                state.advance(2);
387                state.add_token(FSharpTokenType::NotEqual, start, state.get_position());
388                return true;
389            }
390            ('|', Some('>')) => {
391                state.advance(2);
392                state.add_token(FSharpTokenType::Pipe, start, state.get_position());
393                return true;
394            }
395            _ => {}
396        }
397
398        // Single-character operators and punctuation
399        let kind = match c {
400            '+' => FSharpTokenType::Plus,
401            '-' => FSharpTokenType::Minus,
402            '*' => FSharpTokenType::Star,
403            '/' => FSharpTokenType::Slash,
404            '%' => FSharpTokenType::Percent,
405            '=' => FSharpTokenType::Equal,
406            '<' => FSharpTokenType::LessThan,
407            '>' => FSharpTokenType::GreaterThan,
408            '&' => FSharpTokenType::Ampersand,
409            '|' => FSharpTokenType::Pipe,
410            '^' => FSharpTokenType::Caret,
411            '!' => FSharpTokenType::Not,
412            '?' => FSharpTokenType::Question,
413            ':' => FSharpTokenType::Colon,
414            ';' => FSharpTokenType::Semicolon,
415            ',' => FSharpTokenType::Comma,
416            '.' => FSharpTokenType::Dot,
417            '(' => FSharpTokenType::LeftParen,
418            ')' => FSharpTokenType::RightParen,
419            '[' => FSharpTokenType::LeftBracket,
420            ']' => FSharpTokenType::RightBracket,
421            '{' => FSharpTokenType::LeftBrace,
422            '}' => FSharpTokenType::RightBrace,
423            _ => return false,
424        };
425
426        state.advance(1);
427        state.add_token(kind, start, state.get_position());
428        true
429    }
430}