oak_scheme/lexer/
mod.rs

1use crate::{kind::SchemeSyntaxKind, language::SchemeLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
/// Lexer state specialised to [`SchemeLanguage`]; `S` is the source type being lexed.
type State<'a, S> = LexerState<'a, S, SchemeLanguage>;

/// Whitespace scanning configuration, built lazily on first use, with the
/// `unicode_whitespace` option switched on.
static SCHEME_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scanning configuration: `;` is the line-comment marker, `#|` / `|#`
/// delimit block comments, and `nested_blocks` is enabled.
static SCHEME_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: ";", block_start: "#|", block_end: "|#", nested_blocks: true });
/// String scanning configuration: `"` is the only quote character and `\` is
/// the escape character.
static SCHEME_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
/// Lexer for Scheme source text.
///
/// Holds a borrowed [`SchemeLanguage`] configuration for the lifetime
/// `'config`.
#[derive(Clone)]
pub struct SchemeLexer<'config> {
    /// Language configuration supplied to [`SchemeLexer::new`]. The leading
    /// underscore marks it as unused; none of the lexing rules in this file
    /// read it.
    _config: &'config SchemeLanguage,
}
19
20impl<'config> Lexer<SchemeLanguage> for SchemeLexer<'config> {
21    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<SchemeLanguage>) -> LexOutput<SchemeLanguage> {
22        let mut state: State<'_, S> = LexerState::new(source);
23        let result = self.run(&mut state);
24        state.finish_with_cache(result, cache)
25    }
26}
27
28impl<'config> SchemeLexer<'config> {
29    pub fn new(config: &'config SchemeLanguage) -> Self {
30        Self { _config: config }
31    }
32
33    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
34        while state.not_at_end() {
35            let safe_point = state.get_position();
36
37            if self.skip_whitespace(state) {
38                continue;
39            }
40
41            if self.lex_newline(state) {
42                continue;
43            }
44
45            if self.skip_comment(state) {
46                continue;
47            }
48
49            if self.lex_string_literal(state) {
50                continue;
51            }
52
53            if self.lex_number_literal(state) {
54                continue;
55            }
56
57            if self.lex_identifier_or_keyword(state) {
58                continue;
59            }
60
61            if self.lex_single_char_tokens(state) {
62                continue;
63            }
64
65            // 错误处理:如果没有匹配任何规则,跳过当前字符并标记为错误
66            let start_pos = state.get_position();
67            if let Some(ch) = state.peek() {
68                state.advance(ch.len_utf8());
69                state.add_token(SchemeSyntaxKind::Error, start_pos, state.get_position());
70            }
71
72            state.advance_if_dead_lock(safe_point);
73        }
74
75        // 添加 EOF token
76        let eof_pos = state.get_position();
77        state.add_token(SchemeSyntaxKind::Eof, eof_pos, eof_pos);
78        Ok(())
79    }
80
81    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
82        SCHEME_WHITESPACE.scan(state, SchemeSyntaxKind::Whitespace)
83    }
84
85    /// 处理换行
86    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
87        let start_pos = state.get_position();
88
89        if let Some('\n') = state.peek() {
90            state.advance(1);
91            state.add_token(SchemeSyntaxKind::Newline, start_pos, state.get_position());
92            true
93        }
94        else if let Some('\r') = state.peek() {
95            state.advance(1);
96            if let Some('\n') = state.peek() {
97                state.advance(1);
98            }
99            state.add_token(SchemeSyntaxKind::Newline, start_pos, state.get_position());
100            true
101        }
102        else {
103            false
104        }
105    }
106
107    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
108        SCHEME_COMMENT.scan(state, SchemeSyntaxKind::LineComment, SchemeSyntaxKind::Comment)
109    }
110
111    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
112        SCHEME_STRING.scan(state, SchemeSyntaxKind::StringLiteral)
113    }
114
115    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
116        let start = state.get_position();
117        let mut len = 0;
118        let mut has_digits = false;
119
120        {
121            let rest = state.rest();
122            if rest.is_empty() {
123                return false;
124            }
125
126            let first_char = rest.chars().next().unwrap();
127            if !first_char.is_ascii_digit() && first_char != '-' && first_char != '+' {
128                return false;
129            }
130
131            // 处理符号
132            if first_char == '-' || first_char == '+' {
133                len += first_char.len_utf8();
134            }
135
136            // 跳过数字
137            let mut chars = rest.chars().skip(if first_char == '-' || first_char == '+' { 1 } else { 0 });
138
139            while let Some(ch) = chars.next() {
140                if ch.is_ascii_digit() {
141                    len += ch.len_utf8();
142                    has_digits = true;
143                }
144                else if ch == '.' {
145                    // 浮点数
146                    len += ch.len_utf8();
147                    while let Some(ch) = chars.next() {
148                        if ch.is_ascii_digit() {
149                            len += ch.len_utf8();
150                            has_digits = true;
151                        }
152                        else {
153                            break;
154                        }
155                    }
156                    break;
157                }
158                else {
159                    break;
160                }
161            }
162        }
163
164        if has_digits {
165            state.advance(len);
166            let end = state.get_position();
167            state.add_token(SchemeSyntaxKind::NumberLiteral, start, end);
168            true
169        }
170        else {
171            false
172        }
173    }
174
175    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
176        let start = state.get_position();
177        let mut len;
178
179        {
180            let rest = state.rest();
181            if rest.is_empty() {
182                return false;
183            }
184
185            let first_char = rest.chars().next().unwrap();
186            if !self.is_identifier_start(first_char) {
187                return false;
188            }
189
190            len = first_char.len_utf8();
191            let mut chars = rest.chars().skip(1);
192
193            while let Some(ch) = chars.next() {
194                if self.is_identifier_continue(ch) {
195                    len += ch.len_utf8();
196                }
197                else {
198                    break;
199                }
200            }
201        }
202
203        let text = state.get_text_in(oak_core::Range { start, end: start + len }).to_string();
204        state.advance(len);
205        let end = state.get_position();
206
207        let kind = match text.as_str() {
208            "define" | "lambda" | "if" | "cond" | "case" | "and" | "or" | "not" | "let" | "let*" | "letrec" | "begin" | "do" | "quote" | "quasiquote" | "unquote" | "unquote-splicing" | "set!" | "delay" | "force" | "#t" | "#f" | "null" | "car" | "cdr"
209            | "cons" | "list" | "append" | "length" | "reverse" | "map" | "for-each" | "apply" => SchemeSyntaxKind::Keyword,
210            _ => SchemeSyntaxKind::Identifier,
211        };
212
213        state.add_token(kind, start, end);
214        true
215    }
216
217    fn is_identifier_start(&self, ch: char) -> bool {
218        ch.is_alphabetic() || "!$%&*+-./:<=>?@^_~".contains(ch)
219    }
220
221    fn is_identifier_continue(&self, ch: char) -> bool {
222        self.is_identifier_start(ch) || ch.is_ascii_digit()
223    }
224
225    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
226        let start = state.get_position();
227        let ch = match state.peek() {
228            Some(ch) => ch,
229            None => return false,
230        };
231
232        let kind = match ch {
233            '(' => Some(SchemeSyntaxKind::LeftParen),
234            ')' => Some(SchemeSyntaxKind::RightParen),
235            '[' => Some(SchemeSyntaxKind::LeftBracket),
236            ']' => Some(SchemeSyntaxKind::RightBracket),
237            '{' => Some(SchemeSyntaxKind::LeftBrace),
238            '}' => Some(SchemeSyntaxKind::RightBrace),
239            '\'' => Some(SchemeSyntaxKind::Quote),
240            '`' => Some(SchemeSyntaxKind::Quasiquote),
241            ',' => Some(SchemeSyntaxKind::Unquote),
242            '.' => Some(SchemeSyntaxKind::Dot),
243            '#' => Some(SchemeSyntaxKind::Hash),
244            _ => None,
245        };
246
247        if let Some(kind) = kind {
248            state.advance(ch.len_utf8());
249            state.add_token(kind, start, state.get_position());
250            true
251        }
252        else {
253            false
254        }
255    }
256}