1use crate::{kind::SchemeSyntaxKind, language::SchemeLanguage};
2use oak_core::{
3 IncrementalCache, Lexer, LexerState, OakError,
4 lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
/// Shorthand for the framework lexer state specialised to Scheme.
type State<S> = LexerState<S, SchemeLanguage>;

/// Whitespace scanner; `unicode_whitespace: true` also accepts non-ASCII space characters.
static SCHEME_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Line comments are introduced by `;` (block `#|…|#` and datum `#;` comments are not configured here).
static SCHEME_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &[";"] });
/// String literals are double-quoted with backslash escapes.
static SCHEME_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
/// Lexer for Scheme source text, borrowing its language configuration
/// for the lifetime `'config`.
#[derive(Clone)]
pub struct SchemeLexer<'config> {
    // Language configuration this lexer was constructed with.
    // NOTE(review): not read by any method visible in this file — confirm
    // whether it is consumed elsewhere or is dead state.
    config: &'config SchemeLanguage,
}
19
20impl<'config> Lexer<SchemeLanguage> for SchemeLexer<'config> {
21 fn lex_incremental(
22 &self,
23 source: impl Source,
24 changed: usize,
25 cache: IncrementalCache<SchemeLanguage>,
26 ) -> LexOutput<SchemeLanguage> {
27 let mut state = LexerState::new_with_cache(source, changed, cache);
28 let result = self.run(&mut state);
29 state.finish(result)
30 }
31}
32
33impl<'config> SchemeLexer<'config> {
34 pub fn new(config: &'config SchemeLanguage) -> Self {
35 Self { config }
36 }
37
38 fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
39 while state.not_at_end() {
40 let safe_point = state.get_position();
41
42 if self.skip_whitespace(state) {
43 continue;
44 }
45
46 if self.lex_newline(state) {
47 continue;
48 }
49
50 if self.skip_comment(state) {
51 continue;
52 }
53
54 if self.lex_string_literal(state) {
55 continue;
56 }
57
58 if self.lex_number_literal(state) {
59 continue;
60 }
61
62 if self.lex_identifier_or_keyword(state) {
63 continue;
64 }
65
66 if self.lex_single_char_tokens(state) {
67 continue;
68 }
69
70 let start_pos = state.get_position();
72 if let Some(ch) = state.peek() {
73 state.advance(ch.len_utf8());
74 state.add_token(SchemeSyntaxKind::Error, start_pos, state.get_position());
75 }
76
77 state.safe_check(safe_point);
78 }
79
80 let eof_pos = state.get_position();
82 state.add_token(SchemeSyntaxKind::Eof, eof_pos, eof_pos);
83 Ok(())
84 }
85
86 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
87 match SCHEME_WHITESPACE.scan(state.rest(), state.get_position(), SchemeSyntaxKind::Whitespace) {
88 Some(token) => {
89 state.advance_with(token);
90 return true;
91 }
92 None => {}
93 }
94 false
95 }
96
97 fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
99 let start_pos = state.get_position();
100
101 if let Some('\n') = state.peek() {
102 state.advance(1);
103 state.add_token(SchemeSyntaxKind::Newline, start_pos, state.get_position());
104 true
105 }
106 else if let Some('\r') = state.peek() {
107 state.advance(1);
108 if let Some('\n') = state.peek() {
109 state.advance(1);
110 }
111 state.add_token(SchemeSyntaxKind::Newline, start_pos, state.get_position());
112 true
113 }
114 else {
115 false
116 }
117 }
118
119 fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
120 match SCHEME_COMMENT.scan(state.rest(), state.get_position(), SchemeSyntaxKind::Comment) {
121 Some(token) => {
122 state.advance_with(token);
123 return true;
124 }
125 None => {}
126 }
127 false
128 }
129
130 fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
131 match SCHEME_STRING.scan(state.rest(), state.get_position(), SchemeSyntaxKind::StringLiteral) {
132 Some(token) => {
133 state.advance_with(token);
134 return true;
135 }
136 None => {}
137 }
138 false
139 }
140
141 fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
142 let rest = state.rest();
143 if rest.is_empty() {
144 return false;
145 }
146
147 let first_char = rest.chars().next().unwrap();
148 if !first_char.is_ascii_digit() && first_char != '-' && first_char != '+' {
149 return false;
150 }
151
152 let start = state.get_position();
153 let mut len = 0;
154
155 if first_char == '-' || first_char == '+' {
157 len += first_char.len_utf8();
158 }
159
160 let mut has_digits = false;
162 let mut chars = rest.chars().skip(if first_char == '-' || first_char == '+' { 1 } else { 0 });
163
164 while let Some(ch) = chars.next() {
165 if ch.is_ascii_digit() {
166 len += ch.len_utf8();
167 has_digits = true;
168 }
169 else if ch == '.' {
170 len += ch.len_utf8();
172 while let Some(ch) = chars.next() {
173 if ch.is_ascii_digit() {
174 len += ch.len_utf8();
175 has_digits = true;
176 }
177 else {
178 break;
179 }
180 }
181 break;
182 }
183 else {
184 break;
185 }
186 }
187
188 if has_digits {
189 state.advance(len);
190 }
191
192 if !has_digits {
193 return false;
195 }
196
197 let end = state.get_position();
198 state.add_token(SchemeSyntaxKind::NumberLiteral, start, end);
199 true
200 }
201
202 fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
203 let rest = state.rest();
204 if rest.is_empty() {
205 return false;
206 }
207
208 let first_char = rest.chars().next().unwrap();
209 if !self.is_identifier_start(first_char) {
210 return false;
211 }
212
213 let start = state.get_position();
214 let mut len = first_char.len_utf8();
215 let mut chars = rest.chars().skip(1);
216
217 while let Some(ch) = chars.next() {
218 if self.is_identifier_continue(ch) {
219 len += ch.len_utf8();
220 }
221 else {
222 break;
223 }
224 }
225
226 let text = rest[..len].to_string();
227 state.advance(len);
228 let end = state.get_position();
229
230 let kind = match text.as_str() {
231 "define" | "lambda" | "if" | "cond" | "case" | "and" | "or" | "not" | "let" | "let*" | "letrec" | "begin"
232 | "do" | "quote" | "quasiquote" | "unquote" | "unquote-splicing" | "set!" | "delay" | "force" | "#t" | "#f"
233 | "null" | "car" | "cdr" | "cons" | "list" | "append" | "length" | "reverse" | "map" | "for-each" | "apply" => {
234 SchemeSyntaxKind::Keyword
235 }
236 _ => SchemeSyntaxKind::Identifier,
237 };
238
239 state.add_token(kind, start, end);
240 true
241 }
242
243 fn is_identifier_start(&self, ch: char) -> bool {
244 ch.is_alphabetic() || "!$%&*+-./:<=>?@^_~".contains(ch)
245 }
246
247 fn is_identifier_continue(&self, ch: char) -> bool {
248 self.is_identifier_start(ch) || ch.is_ascii_digit()
249 }
250
251 fn lex_single_char_tokens<S: Source>(&self, state: &mut State<S>) -> bool {
252 let rest = state.rest();
253 if rest.is_empty() {
254 return false;
255 }
256
257 let ch = rest.chars().next().unwrap();
258 let start = state.get_position();
259 state.advance(ch.len_utf8());
260 let end = state.get_position();
261
262 let kind = match ch {
263 '(' => SchemeSyntaxKind::LeftParen,
264 ')' => SchemeSyntaxKind::RightParen,
265 '[' => SchemeSyntaxKind::LeftBracket,
266 ']' => SchemeSyntaxKind::RightBracket,
267 '{' => SchemeSyntaxKind::LeftBrace,
268 '}' => SchemeSyntaxKind::RightBrace,
269 '\'' => SchemeSyntaxKind::Quote,
270 '`' => SchemeSyntaxKind::Quasiquote,
271 ',' => SchemeSyntaxKind::Unquote,
272 '.' => SchemeSyntaxKind::Dot,
273 '#' => SchemeSyntaxKind::Hash,
274 _ => {
275 return false;
276 }
277 };
278
279 state.add_token(kind, start, end);
280 true
281 }
282}