oak_scheme/lexer/mod.rs

use crate::{kind::SchemeSyntaxKind, language::SchemeLanguage};
use oak_core::{
    Lexer, LexerCache, LexerState, OakError,
    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
    source::Source,
};
use std::sync::LazyLock;

type State<'a, S> = LexerState<'a, S, SchemeLanguage>;

static SCHEME_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
static SCHEME_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: ";", block_start: "#|", block_end: "|#", nested_blocks: true });
static SCHEME_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });

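/// A Scheme lexer built on the shared `oak_core` lexing infrastructure.
/// It borrows the `SchemeLanguage` configuration it was created from,
/// which is currently unused (hence the `_config` field name).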
#[derive(Clone, Debug)]
pub struct SchemeLexer<'config> {
    _config: &'config SchemeLanguage,
}

impl<'config> Lexer<SchemeLanguage> for SchemeLexer<'config> {
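    /// Trait entry point: scan `source` from offset 0 and return the token
    /// stream as a `LexOutput`. The `cache` is handed to the lexer state via
    /// `new_with_cache`/`finish_with_cache`; the `_edits` parameter is
    /// currently unused.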
    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<SchemeLanguage>) -> LexOutput<SchemeLanguage> {
        let mut state: State<'_, S> = LexerState::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        state.finish_with_cache(result, cache)
    }
}

impl<'config> SchemeLexer<'config> {
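    /// Create a lexer bound to the given language configuration.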
    pub fn new(config: &'config SchemeLanguage) -> Self {
        Self { _config: config }
    }

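    /// Main scanning loop: try each token rule in priority order; if none
    /// matches, consume one character as an `Error` token. Emits an EOF
    /// token once the end of the input is reached.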
    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
        while state.not_at_end() {
            let safe_point = state.get_position();

            if self.skip_whitespace(state) {
                continue;
            }

            if self.lex_newline(state) {
                continue;
            }

            if self.skip_comment(state) {
                continue;
            }

            if self.lex_string_literal(state) {
                continue;
            }

            if self.lex_number_literal(state) {
                continue;
            }

            if self.lex_identifier_or_keyword(state) {
                continue;
            }

            if self.lex_single_char_tokens(state) {
                continue;
            }

            // Error handling: if no rule matched, consume the current character and mark it as an Error token.
            let start_pos = state.get_position();
            if let Some(ch) = state.peek() {
                state.advance(ch.len_utf8());
                state.add_token(SchemeSyntaxKind::Error, start_pos, state.get_position());
            }

            state.advance_if_dead_lock(safe_point);
        }

        // Add the EOF token.
        state.add_eof();
        Ok(())
    }

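    /// Delegate to `SCHEME_WHITESPACE` to scan a run of (Unicode) whitespace
    /// into a `Whitespace` token.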
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        SCHEME_WHITESPACE.scan(state, SchemeSyntaxKind::Whitespace)
    }

    /// Handle newlines: `\n`, `\r`, and `\r\n` each produce a single `Newline` token.
    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('\n') = state.peek() {
            state.advance(1);
            state.add_token(SchemeSyntaxKind::Newline, start_pos, state.get_position());
            true
        }
        else if let Some('\r') = state.peek() {
            state.advance(1);
            if let Some('\n') = state.peek() {
                state.advance(1);
            }
            state.add_token(SchemeSyntaxKind::Newline, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

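    /// Delegate to `SCHEME_COMMENT` to scan a `;` line comment or a nestable
    /// `#| ... |#` block comment.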
    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        SCHEME_COMMENT.scan(state, SchemeSyntaxKind::LineComment, SchemeSyntaxKind::Comment)
    }

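    /// Delegate to `SCHEME_STRING` to scan a double-quoted string literal
    /// with `\` escapes.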
    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        SCHEME_STRING.scan(state, SchemeSyntaxKind::StringLiteral)
    }

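    /// Lex an optionally signed integer or decimal literal such as `42`, `-7`, or `3.14`.
    /// Returns `false` without consuming anything when no digits are present, so a
    /// bare `+` or `-` falls through to the identifier rule.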
    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();
        let mut len = 0;
        let mut has_digits = false;

        {
            let rest = state.rest();
            if rest.is_empty() {
                return false;
            }

            let first_char = rest.chars().next().unwrap();
            if !first_char.is_ascii_digit() && first_char != '-' && first_char != '+' {
                return false;
            }

            // Account for a leading `+` or `-` sign.
            if first_char == '-' || first_char == '+' {
                len += first_char.len_utf8();
            }

            // Scan the digits after the optional sign.
            let mut chars = rest.chars().skip(if first_char == '-' || first_char == '+' { 1 } else { 0 });

            while let Some(ch) = chars.next() {
                if ch.is_ascii_digit() {
                    len += ch.len_utf8();
                    has_digits = true;
                }
                else if ch == '.' {
                    // Decimal point: continue scanning the fractional digits.
                    len += ch.len_utf8();
                    while let Some(ch) = chars.next() {
                        if ch.is_ascii_digit() {
                            len += ch.len_utf8();
                            has_digits = true;
                        }
                        else {
                            break;
                        }
                    }
                    break;
                }
                else {
                    break;
                }
            }
        }

        if has_digits {
            state.advance(len);
            let end = state.get_position();
            state.add_token(SchemeSyntaxKind::NumberLiteral, start, end);
            true
        }
        else {
            false
        }
    }

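    /// Lex an identifier and reclassify it as a `Keyword` when it matches one of
    /// the recognised special forms or primitives (`define`, `lambda`, `if`, ...).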
    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();
        let mut len;

        {
            let rest = state.rest();
            if rest.is_empty() {
                return false;
            }

            let first_char = rest.chars().next().unwrap();
            if !self.is_identifier_start(first_char) {
                return false;
            }

            len = first_char.len_utf8();
            let mut chars = rest.chars().skip(1);

            while let Some(ch) = chars.next() {
                if self.is_identifier_continue(ch) {
                    len += ch.len_utf8();
                }
                else {
                    break;
                }
            }
        }

        let text = state.get_text_in(oak_core::Range { start, end: start + len }).to_string();
        state.advance(len);
        let end = state.get_position();

        let kind = match text.as_str() {
            "define" | "lambda" | "if" | "cond" | "case" | "and" | "or" | "not"
            | "let" | "let*" | "letrec" | "begin" | "do" | "quote" | "quasiquote"
            | "unquote" | "unquote-splicing" | "set!" | "delay" | "force"
            | "#t" | "#f" | "null" | "car" | "cdr" | "cons" | "list" | "append"
            | "length" | "reverse" | "map" | "for-each" | "apply" => SchemeSyntaxKind::Keyword,
            _ => SchemeSyntaxKind::Identifier,
        };

        state.add_token(kind, start, end);
        true
    }

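    /// A character may start an identifier if it is alphabetic or one of the
    /// extended characters `!$%&*+-./:<=>?@^_~`.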
    fn is_identifier_start(&self, ch: char) -> bool {
        ch.is_alphabetic() || "!$%&*+-./:<=>?@^_~".contains(ch)
    }

    fn is_identifier_continue(&self, ch: char) -> bool {
        self.is_identifier_start(ch) || ch.is_ascii_digit()
    }

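    /// Lex single-character tokens: parentheses, brackets, braces, `'` (quote),
    /// `` ` `` (quasiquote), `,` (unquote), `.` (dot) and `#` (hash).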
    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();
        let ch = match state.peek() {
            Some(ch) => ch,
            None => return false,
        };

        let kind = match ch {
            '(' => Some(SchemeSyntaxKind::LeftParen),
            ')' => Some(SchemeSyntaxKind::RightParen),
            '[' => Some(SchemeSyntaxKind::LeftBracket),
            ']' => Some(SchemeSyntaxKind::RightBracket),
            '{' => Some(SchemeSyntaxKind::LeftBrace),
            '}' => Some(SchemeSyntaxKind::RightBrace),
            '\'' => Some(SchemeSyntaxKind::Quote),
            '`' => Some(SchemeSyntaxKind::Quasiquote),
            ',' => Some(SchemeSyntaxKind::Unquote),
            '.' => Some(SchemeSyntaxKind::Dot),
            '#' => Some(SchemeSyntaxKind::Hash),
            _ => None,
        };

        if let Some(kind) = kind {
            state.advance(ch.len_utf8());
            state.add_token(kind, start, state.get_position());
            true
        }
        else {
            false
        }
    }
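    // ------------------------------------------------------------------
    // Usage sketch (not part of the original file): one way a caller might
    // drive this lexer. It assumes `SchemeLanguage: Default`, that `&str`
    // implements `Source`, and that oak_core provides a concrete
    // `LexerCache` implementation; none of this is confirmed by this file,
    // so treat it as illustrative pseudocode rather than working code.
    //
    //     let language = SchemeLanguage::default();
    //     let lexer = SchemeLexer::new(&language);
    //     let mut cache = /* a LexerCache<SchemeLanguage> implementation */;
    //     let output = lexer.lex("(define (square x) (* x x))", &[], &mut cache);
    //     // `output` is a LexOutput<SchemeLanguage> containing the token stream.
    // ------------------------------------------------------------------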
255}