// oak_scheme/lexer/mod.rs
#![doc = include_str!("readme.md")]
use crate::{language::SchemeLanguage, lexer::token_type::SchemeTokenType};
pub mod token_type;
use oak_core::{
    Lexer, LexerCache, LexerState, OakError, Source, TextEdit,
    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
};
use std::sync::LazyLock;

/// Shorthand for the lexer state used by every rule in this module:
/// tracks the current position in `S` and collects Scheme tokens.
pub(crate) type State<'a, S> = LexerState<'a, S, SchemeLanguage>;
11
/// Whitespace scan config; `unicode_whitespace: true` accepts all Unicode whitespace.
/// NOTE(review): `run` tries whitespace before newlines — this assumes the config does
/// not also consume `\n`/`\r` (otherwise `lex_newline` would never fire); confirm in oak_core.
static SCHEME_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scan config: `;` line comments plus nestable `#| ... |#` block comments.
static SCHEME_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: ";", block_start: "#|", block_end: "|#", nested_blocks: true });
/// String scan config: double-quoted literals with backslash (`\`) escapes.
static SCHEME_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
15
#[derive(Clone, Debug)]
/// Lexer for Scheme source code.
pub struct SchemeLexer<'config> {
    // Language configuration supplied at construction. Held for the lexer's
    // lifetime but not read by any lexing rule in this file.
    config: &'config SchemeLanguage,
}
21
impl<'config> Lexer<SchemeLanguage> for SchemeLexer<'config> {
    /// Lexes `source` in full starting from offset 0, seeding the state from
    /// `cache` and writing results back through `finish_with_cache`.
    ///
    /// `_edits` is accepted to satisfy the trait signature but is ignored —
    /// this implementation does not perform incremental relexing.
    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<SchemeLanguage>) -> LexOutput<SchemeLanguage> {
        let mut state: State<'_, S> = LexerState::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        state.finish_with_cache(result, cache)
    }
}
29
30impl<'config> SchemeLexer<'config> {
31    /// Creates a new SchemeLexer with the given language configuration.
32    pub fn new(config: &'config SchemeLanguage) -> Self {
33        Self { config }
34    }
35
36    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
37        while state.not_at_end() {
38            let safe_point = state.get_position();
39
40            if self.skip_whitespace(state) {
41                continue;
42            }
43
44            if self.lex_newline(state) {
45                continue;
46            }
47
48            if self.skip_comment(state) {
49                continue;
50            }
51
52            if self.lex_string_literal(state) {
53                continue;
54            }
55
56            if self.lex_number_literal(state) {
57                continue;
58            }
59
60            if self.lex_identifier_or_keyword(state) {
61                continue;
62            }
63
64            if self.lex_single_char_tokens(state) {
65                continue;
66            }
67
68            // Error handling: If no rules match, skip current character and mark as error
69            let start_pos = state.get_position();
70            if let Some(ch) = state.peek() {
71                state.advance(ch.len_utf8());
72                state.add_token(SchemeTokenType::Error, start_pos, state.get_position());
73            }
74
75            state.advance_if_dead_lock(safe_point)
76        }
77
78        // Add EOF token
79        state.add_eof();
80        Ok(())
81    }
82
83    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
84        SCHEME_WHITESPACE.scan(state, SchemeTokenType::Whitespace)
85    }
86
87    /// Handle newline
88    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
89        let start_pos = state.get_position();
90
91        if let Some('\n') = state.peek() {
92            state.advance(1);
93            state.add_token(SchemeTokenType::Newline, start_pos, state.get_position());
94            true
95        }
96        else if let Some('\r') = state.peek() {
97            state.advance(1);
98            if let Some('\n') = state.peek() {
99                state.advance(1);
100            }
101            state.add_token(SchemeTokenType::Newline, start_pos, state.get_position());
102            true
103        }
104        else {
105            false
106        }
107    }
108
109    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
110        SCHEME_COMMENT.scan(state, SchemeTokenType::LineComment, SchemeTokenType::Comment)
111    }
112
113    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
114        SCHEME_STRING.scan(state, SchemeTokenType::StringLiteral)
115    }
116
117    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
118        let start = state.get_position();
119        let mut len = 0;
120        let mut has_digits = false;
121
122        {
123            let rest = state.rest();
124            if rest.is_empty() {
125                return false;
126            }
127
128            let first_char = rest.chars().next().unwrap();
129            if !first_char.is_ascii_digit() && first_char != '-' && first_char != '+' {
130                return false;
131            }
132
133            // Handle sign
134            if first_char == '-' || first_char == '+' {
135                len += first_char.len_utf8();
136            }
137
138            // Skip digits
139            let mut chars = rest.chars().skip(if first_char == '-' || first_char == '+' { 1 } else { 0 });
140
141            while let Some(ch) = chars.next() {
142                if ch.is_ascii_digit() {
143                    len += ch.len_utf8();
144                    has_digits = true;
145                }
146                else if ch == '.' {
147                    // Floating point number
148                    len += ch.len_utf8();
149                    while let Some(ch) = chars.next() {
150                        if ch.is_ascii_digit() {
151                            len += ch.len_utf8();
152                            has_digits = true;
153                        }
154                        else {
155                            break;
156                        }
157                    }
158                    break;
159                }
160                else {
161                    break;
162                }
163            }
164        }
165
166        if has_digits {
167            state.advance(len);
168            let end = state.get_position();
169            state.add_token(SchemeTokenType::NumberLiteral, start, end);
170            true
171        }
172        else {
173            false
174        }
175    }
176
177    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
178        let start = state.get_position();
179        let mut len;
180
181        {
182            let rest = state.rest();
183            if rest.is_empty() {
184                return false;
185            }
186
187            let first_char = rest.chars().next().unwrap();
188            if !self.is_identifier_start(first_char) {
189                return false;
190            }
191
192            len = first_char.len_utf8();
193            let mut chars = rest.chars().skip(1);
194
195            while let Some(ch) = chars.next() {
196                if self.is_identifier_continue(ch) {
197                    len += ch.len_utf8();
198                }
199                else {
200                    break;
201                }
202            }
203        }
204
205        let text = state.get_text_in(oak_core::Range { start, end: start + len }).to_string();
206        state.advance(len);
207        let end = state.get_position();
208
209        let kind = match text.as_str() {
210            "define" | "lambda" | "if" | "cond" | "case" | "and" | "or" | "not" | "let" | "let*" | "letrec" | "begin" | "do" | "quote" | "quasiquote" | "unquote" | "unquote-splicing" | "set!" | "delay" | "force" | "#t" | "#f" | "null" | "car" | "cdr"
211            | "cons" | "list" | "append" | "length" | "reverse" | "map" | "for-each" | "apply" => SchemeTokenType::Keyword,
212            _ => SchemeTokenType::Identifier,
213        };
214
215        state.add_token(kind, start, end);
216        true
217    }
218
219    fn is_identifier_start(&self, ch: char) -> bool {
220        ch.is_alphabetic() || "!$%&*+-./:<=>?@^_~".contains(ch)
221    }
222
223    fn is_identifier_continue(&self, ch: char) -> bool {
224        self.is_identifier_start(ch) || ch.is_ascii_digit()
225    }
226
227    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
228        let start = state.get_position();
229        let ch = match state.peek() {
230            Some(ch) => ch,
231            None => return false,
232        };
233
234        let kind = match ch {
235            '(' => Some(SchemeTokenType::LeftParen),
236            ')' => Some(SchemeTokenType::RightParen),
237            '[' => Some(SchemeTokenType::LeftBracket),
238            ']' => Some(SchemeTokenType::RightBracket),
239            '{' => Some(SchemeTokenType::LeftBrace),
240            '}' => Some(SchemeTokenType::RightBrace),
241            '\'' => Some(SchemeTokenType::Quote),
242            '`' => Some(SchemeTokenType::Quasiquote),
243            ',' => Some(SchemeTokenType::Unquote),
244            '.' => Some(SchemeTokenType::Dot),
245            '#' => Some(SchemeTokenType::Hash),
246            _ => None,
247        };
248
249        if let Some(kind) = kind {
250            state.advance(ch.len_utf8());
251            state.add_token(kind, start, state.get_position());
252            true
253        }
254        else {
255            false
256        }
257    }
258}