use crate::{kind::SchemeSyntaxKind, language::SchemeLanguage};
use oak_core::{
    Lexer, LexerCache, LexerState, OakError,
    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
    source::Source,
};
use std::sync::LazyLock;

type State<'a, S> = LexerState<'a, S, SchemeLanguage>;

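// Shared scanner configurations: Unicode-aware whitespace, `;` line comments,
// nested `#| ... |#` block comments, and double-quoted strings with `\` escapes.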
static SCHEME_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
static SCHEME_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: ";", block_start: "#|", block_end: "|#", nested_blocks: true });
static SCHEME_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });

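/// Lexer for Scheme source text; holds a reference to the `SchemeLanguage` configuration.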
#[derive(Clone, Debug)]
pub struct SchemeLexer<'config> {
    _config: &'config SchemeLanguage,
}

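// `Lexer` trait entry point: build a `LexerState` over the source, run the
// scanner, and finalize the token output through the cache.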
impl<'config> Lexer<SchemeLanguage> for SchemeLexer<'config> {
    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<SchemeLanguage>) -> LexOutput<SchemeLanguage> {
        let mut state: State<'_, S> = LexerState::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        state.finish_with_cache(result, cache)
    }
}

impl<'config> SchemeLexer<'config> {
    pub fn new(config: &'config SchemeLanguage) -> Self {
        Self { _config: config }
    }

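    /// Main scanning loop: try each token class in priority order (whitespace,
    /// newline, comment, string, number, identifier/keyword, single-character
    /// token) and fall back to an `Error` token for anything unrecognized.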
    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
        while state.not_at_end() {
            let safe_point = state.get_position();

            if self.skip_whitespace(state) {
                continue;
            }

            if self.lex_newline(state) {
                continue;
            }

            if self.skip_comment(state) {
                continue;
            }

            if self.lex_string_literal(state) {
                continue;
            }

            if self.lex_number_literal(state) {
                continue;
            }

            if self.lex_identifier_or_keyword(state) {
                continue;
            }

            if self.lex_single_char_tokens(state) {
                continue;
            }

            // Nothing matched: consume one character as an `Error` token so the
            // lexer always makes progress.
            let start_pos = state.get_position();
            if let Some(ch) = state.peek() {
                state.advance(ch.len_utf8());
                state.add_token(SchemeSyntaxKind::Error, start_pos, state.get_position());
            }

            state.advance_if_dead_lock(safe_point);
        }

        state.add_eof();
        Ok(())
    }

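    /// Scan a run of whitespace with the shared config, tagged `SchemeSyntaxKind::Whitespace`.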
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        SCHEME_WHITESPACE.scan(state, SchemeSyntaxKind::Whitespace)
    }

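    /// Lex a line ending, treating `\n`, `\r\n`, and a bare `\r` each as one `Newline` token.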
    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('\n') = state.peek() {
            state.advance(1);
            state.add_token(SchemeSyntaxKind::Newline, start_pos, state.get_position());
            true
        }
        else if let Some('\r') = state.peek() {
            state.advance(1);
            if let Some('\n') = state.peek() {
                state.advance(1);
            }
            state.add_token(SchemeSyntaxKind::Newline, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

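    /// Scan `;` line comments and nested `#| ... |#` block comments.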
    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        SCHEME_COMMENT.scan(state, SchemeSyntaxKind::LineComment, SchemeSyntaxKind::Comment)
    }

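    /// Scan a double-quoted string literal with `\` escapes.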
    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        SCHEME_STRING.scan(state, SchemeSyntaxKind::StringLiteral)
    }

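    /// Lex a decimal number literal: an optional `+`/`-` sign, digits, and an
    /// optional fractional part. Consumes nothing and returns `false` when no
    /// digit is found, so `+`, `-`, and other symbols fall through to the
    /// identifier rule.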
    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();
        let mut len = 0;
        let mut has_digits = false;

        {
            let rest = state.rest();
            if rest.is_empty() {
                return false;
            }

            let first_char = rest.chars().next().unwrap();
            if !first_char.is_ascii_digit() && first_char != '-' && first_char != '+' {
                return false;
            }

            if first_char == '-' || first_char == '+' {
                len += first_char.len_utf8();
            }

            let mut chars = rest.chars().skip(if first_char == '-' || first_char == '+' { 1 } else { 0 });

            while let Some(ch) = chars.next() {
                if ch.is_ascii_digit() {
                    len += ch.len_utf8();
                    has_digits = true;
                }
                else if ch == '.' {
                    // Fractional part: consume the dot and any digits after it.
                    len += ch.len_utf8();
                    while let Some(ch) = chars.next() {
                        if ch.is_ascii_digit() {
                            len += ch.len_utf8();
                            has_digits = true;
                        }
                        else {
                            break;
                        }
                    }
                    break;
                }
                else {
                    break;
                }
            }
        }

        // Only commit to a number token if at least one digit was seen; a bare
        // sign stays available for the identifier rule.
        if has_digits {
            state.advance(len);
            let end = state.get_position();
            state.add_token(SchemeSyntaxKind::NumberLiteral, start, end);
            true
        }
        else {
            false
        }
    }

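    /// Lex an identifier and classify a fixed set of well-known Scheme forms
    /// (`define`, `lambda`, `if`, ...) as `Keyword`; everything else becomes
    /// `Identifier`.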
    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();
        let mut len;

        {
            let rest = state.rest();
            if rest.is_empty() {
                return false;
            }

            let first_char = rest.chars().next().unwrap();
            if !self.is_identifier_start(first_char) {
                return false;
            }

            len = first_char.len_utf8();
            let mut chars = rest.chars().skip(1);

            while let Some(ch) = chars.next() {
                if self.is_identifier_continue(ch) {
                    len += ch.len_utf8();
                }
                else {
                    break;
                }
            }
        }

        let text = state.get_text_in(oak_core::Range { start, end: start + len }).to_string();

        // A lone `.` is dotted-pair punctuation, not an identifier; leave it for
        // `lex_single_char_tokens` so it is emitted as `Dot`.
        if text == "." {
            return false;
        }

        state.advance(len);
        let end = state.get_position();

        let kind = match text.as_str() {
            "define" | "lambda" | "if" | "cond" | "case" | "and" | "or" | "not"
            | "let" | "let*" | "letrec" | "begin" | "do" | "quote" | "quasiquote"
            | "unquote" | "unquote-splicing" | "set!" | "delay" | "force"
            | "#t" | "#f" | "null" | "car" | "cdr" | "cons" | "list" | "append"
            | "length" | "reverse" | "map" | "for-each" | "apply" => SchemeSyntaxKind::Keyword,
            _ => SchemeSyntaxKind::Identifier,
        };

        state.add_token(kind, start, end);
        true
    }

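    /// True if `ch` may begin an identifier: any alphabetic character or one of
    /// Scheme's special symbol characters.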
    fn is_identifier_start(&self, ch: char) -> bool {
        ch.is_alphabetic() || "!$%&*+-./:<=>?@^_~".contains(ch)
    }

    /// True if `ch` may continue an identifier: any start character or an ASCII digit.
    fn is_identifier_continue(&self, ch: char) -> bool {
        self.is_identifier_start(ch) || ch.is_ascii_digit()
    }

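    /// Lex single-character delimiters and reader punctuation: parentheses,
    /// brackets, braces, `'`, backtick, `,`, `.`, and `#`.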
    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();
        let ch = match state.peek() {
            Some(ch) => ch,
            None => return false,
        };

        let kind = match ch {
            '(' => Some(SchemeSyntaxKind::LeftParen),
            ')' => Some(SchemeSyntaxKind::RightParen),
            '[' => Some(SchemeSyntaxKind::LeftBracket),
            ']' => Some(SchemeSyntaxKind::RightBracket),
            '{' => Some(SchemeSyntaxKind::LeftBrace),
            '}' => Some(SchemeSyntaxKind::RightBrace),
            '\'' => Some(SchemeSyntaxKind::Quote),
            '`' => Some(SchemeSyntaxKind::Quasiquote),
            ',' => Some(SchemeSyntaxKind::Unquote),
            '.' => Some(SchemeSyntaxKind::Dot),
            '#' => Some(SchemeSyntaxKind::Hash),
            _ => None,
        };

        if let Some(kind) = kind {
            state.advance(ch.len_utf8());
            state.add_token(kind, start, state.get_position());
            true
        }
        else {
            false
        }
    }
}