oak_scheme/lexer/
mod.rs

1use crate::{kind::SchemeSyntaxKind, language::SchemeLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
/// Shorthand for the `oak_core` lexer state specialised to [`SchemeLanguage`];
/// `S` is the source type the state reads from.
type State<S> = LexerState<S, SchemeLanguage>;
10
/// Whitespace scanner configuration with Unicode whitespace enabled.
static SCHEME_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Line-comment scanner configuration; `;` is the only line marker.
static SCHEME_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &[";"] });
/// String scanner configuration: `"` is the only quote character and `\` is the escape character.
static SCHEME_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
/// Lexer for Scheme source text.
///
/// Holds a borrowed [`SchemeLanguage`] configuration for the lifetime `'config`;
/// cloning the lexer only copies that reference.
#[derive(Clone)]
pub struct SchemeLexer<'config> {
    /// Language configuration this lexer was created with.
    config: &'config SchemeLanguage,
}
19
20impl<'config> Lexer<SchemeLanguage> for SchemeLexer<'config> {
21    fn lex_incremental(
22        &self,
23        source: impl Source,
24        changed: usize,
25        cache: IncrementalCache<SchemeLanguage>,
26    ) -> LexOutput<SchemeLanguage> {
27        let mut state = LexerState::new_with_cache(source, changed, cache);
28        let result = self.run(&mut state);
29        state.finish(result)
30    }
31}
32
33impl<'config> SchemeLexer<'config> {
34    pub fn new(config: &'config SchemeLanguage) -> Self {
35        Self { config }
36    }
37
38    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
39        while state.not_at_end() {
40            let safe_point = state.get_position();
41
42            if self.skip_whitespace(state) {
43                continue;
44            }
45
46            if self.lex_newline(state) {
47                continue;
48            }
49
50            if self.skip_comment(state) {
51                continue;
52            }
53
54            if self.lex_string_literal(state) {
55                continue;
56            }
57
58            if self.lex_number_literal(state) {
59                continue;
60            }
61
62            if self.lex_identifier_or_keyword(state) {
63                continue;
64            }
65
66            if self.lex_single_char_tokens(state) {
67                continue;
68            }
69
70            // 错误处理:如果没有匹配任何规则,跳过当前字符并标记为错误
71            let start_pos = state.get_position();
72            if let Some(ch) = state.peek() {
73                state.advance(ch.len_utf8());
74                state.add_token(SchemeSyntaxKind::Error, start_pos, state.get_position());
75            }
76
77            state.safe_check(safe_point);
78        }
79
80        // 添加 EOF token
81        let eof_pos = state.get_position();
82        state.add_token(SchemeSyntaxKind::Eof, eof_pos, eof_pos);
83        Ok(())
84    }
85
86    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
87        match SCHEME_WHITESPACE.scan(state.rest(), state.get_position(), SchemeSyntaxKind::Whitespace) {
88            Some(token) => {
89                state.advance_with(token);
90                return true;
91            }
92            None => {}
93        }
94        false
95    }
96
97    /// 处理换行
98    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
99        let start_pos = state.get_position();
100
101        if let Some('\n') = state.peek() {
102            state.advance(1);
103            state.add_token(SchemeSyntaxKind::Newline, start_pos, state.get_position());
104            true
105        }
106        else if let Some('\r') = state.peek() {
107            state.advance(1);
108            if let Some('\n') = state.peek() {
109                state.advance(1);
110            }
111            state.add_token(SchemeSyntaxKind::Newline, start_pos, state.get_position());
112            true
113        }
114        else {
115            false
116        }
117    }
118
119    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
120        match SCHEME_COMMENT.scan(state.rest(), state.get_position(), SchemeSyntaxKind::Comment) {
121            Some(token) => {
122                state.advance_with(token);
123                return true;
124            }
125            None => {}
126        }
127        false
128    }
129
130    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
131        match SCHEME_STRING.scan(state.rest(), state.get_position(), SchemeSyntaxKind::StringLiteral) {
132            Some(token) => {
133                state.advance_with(token);
134                return true;
135            }
136            None => {}
137        }
138        false
139    }
140
141    fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
142        let rest = state.rest();
143        if rest.is_empty() {
144            return false;
145        }
146
147        let first_char = rest.chars().next().unwrap();
148        if !first_char.is_ascii_digit() && first_char != '-' && first_char != '+' {
149            return false;
150        }
151
152        let start = state.get_position();
153        let mut len = 0;
154
155        // 处理符号
156        if first_char == '-' || first_char == '+' {
157            len += first_char.len_utf8();
158        }
159
160        // 跳过数字
161        let mut has_digits = false;
162        let mut chars = rest.chars().skip(if first_char == '-' || first_char == '+' { 1 } else { 0 });
163
164        while let Some(ch) = chars.next() {
165            if ch.is_ascii_digit() {
166                len += ch.len_utf8();
167                has_digits = true;
168            }
169            else if ch == '.' {
170                // 浮点数
171                len += ch.len_utf8();
172                while let Some(ch) = chars.next() {
173                    if ch.is_ascii_digit() {
174                        len += ch.len_utf8();
175                        has_digits = true;
176                    }
177                    else {
178                        break;
179                    }
180                }
181                break;
182            }
183            else {
184                break;
185            }
186        }
187
188        if has_digits {
189            state.advance(len);
190        }
191
192        if !has_digits {
193            // 重置位置,这不是一个数字
194            return false;
195        }
196
197        let end = state.get_position();
198        state.add_token(SchemeSyntaxKind::NumberLiteral, start, end);
199        true
200    }
201
202    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
203        let rest = state.rest();
204        if rest.is_empty() {
205            return false;
206        }
207
208        let first_char = rest.chars().next().unwrap();
209        if !self.is_identifier_start(first_char) {
210            return false;
211        }
212
213        let start = state.get_position();
214        let mut len = first_char.len_utf8();
215        let mut chars = rest.chars().skip(1);
216
217        while let Some(ch) = chars.next() {
218            if self.is_identifier_continue(ch) {
219                len += ch.len_utf8();
220            }
221            else {
222                break;
223            }
224        }
225
226        let text = rest[..len].to_string();
227        state.advance(len);
228        let end = state.get_position();
229
230        let kind = match text.as_str() {
231            "define" | "lambda" | "if" | "cond" | "case" | "and" | "or" | "not" | "let" | "let*" | "letrec" | "begin"
232            | "do" | "quote" | "quasiquote" | "unquote" | "unquote-splicing" | "set!" | "delay" | "force" | "#t" | "#f"
233            | "null" | "car" | "cdr" | "cons" | "list" | "append" | "length" | "reverse" | "map" | "for-each" | "apply" => {
234                SchemeSyntaxKind::Keyword
235            }
236            _ => SchemeSyntaxKind::Identifier,
237        };
238
239        state.add_token(kind, start, end);
240        true
241    }
242
243    fn is_identifier_start(&self, ch: char) -> bool {
244        ch.is_alphabetic() || "!$%&*+-./:<=>?@^_~".contains(ch)
245    }
246
247    fn is_identifier_continue(&self, ch: char) -> bool {
248        self.is_identifier_start(ch) || ch.is_ascii_digit()
249    }
250
251    fn lex_single_char_tokens<S: Source>(&self, state: &mut State<S>) -> bool {
252        let rest = state.rest();
253        if rest.is_empty() {
254            return false;
255        }
256
257        let ch = rest.chars().next().unwrap();
258        let start = state.get_position();
259        state.advance(ch.len_utf8());
260        let end = state.get_position();
261
262        let kind = match ch {
263            '(' => SchemeSyntaxKind::LeftParen,
264            ')' => SchemeSyntaxKind::RightParen,
265            '[' => SchemeSyntaxKind::LeftBracket,
266            ']' => SchemeSyntaxKind::RightBracket,
267            '{' => SchemeSyntaxKind::LeftBrace,
268            '}' => SchemeSyntaxKind::RightBrace,
269            '\'' => SchemeSyntaxKind::Quote,
270            '`' => SchemeSyntaxKind::Quasiquote,
271            ',' => SchemeSyntaxKind::Unquote,
272            '.' => SchemeSyntaxKind::Dot,
273            '#' => SchemeSyntaxKind::Hash,
274            _ => {
275                return false;
276            }
277        };
278
279        state.add_token(kind, start, end);
280        true
281    }
282}