1use crate::{kind::SchemeSyntaxKind, language::SchemeLanguage};
2use oak_core::{
3 Lexer, LexerCache, LexerState, OakError,
4 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
9type State<'a, S> = LexerState<'a, S, SchemeLanguage>;
10
11static SCHEME_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static SCHEME_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: ";", block_start: "#|", block_end: "|#", nested_blocks: true });
13static SCHEME_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
15#[derive(Clone)]
16pub struct SchemeLexer<'config> {
17 _config: &'config SchemeLanguage,
18}
19
20impl<'config> Lexer<SchemeLanguage> for SchemeLexer<'config> {
21 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<SchemeLanguage>) -> LexOutput<SchemeLanguage> {
22 let mut state: State<'_, S> = LexerState::new(source);
23 let result = self.run(&mut state);
24 state.finish_with_cache(result, cache)
25 }
26}
27
28impl<'config> SchemeLexer<'config> {
29 pub fn new(config: &'config SchemeLanguage) -> Self {
30 Self { _config: config }
31 }
32
33 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
34 while state.not_at_end() {
35 let safe_point = state.get_position();
36
37 if self.skip_whitespace(state) {
38 continue;
39 }
40
41 if self.lex_newline(state) {
42 continue;
43 }
44
45 if self.skip_comment(state) {
46 continue;
47 }
48
49 if self.lex_string_literal(state) {
50 continue;
51 }
52
53 if self.lex_number_literal(state) {
54 continue;
55 }
56
57 if self.lex_identifier_or_keyword(state) {
58 continue;
59 }
60
61 if self.lex_single_char_tokens(state) {
62 continue;
63 }
64
65 let start_pos = state.get_position();
67 if let Some(ch) = state.peek() {
68 state.advance(ch.len_utf8());
69 state.add_token(SchemeSyntaxKind::Error, start_pos, state.get_position());
70 }
71
72 state.advance_if_dead_lock(safe_point);
73 }
74
75 let eof_pos = state.get_position();
77 state.add_token(SchemeSyntaxKind::Eof, eof_pos, eof_pos);
78 Ok(())
79 }
80
81 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
82 SCHEME_WHITESPACE.scan(state, SchemeSyntaxKind::Whitespace)
83 }
84
85 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
87 let start_pos = state.get_position();
88
89 if let Some('\n') = state.peek() {
90 state.advance(1);
91 state.add_token(SchemeSyntaxKind::Newline, start_pos, state.get_position());
92 true
93 }
94 else if let Some('\r') = state.peek() {
95 state.advance(1);
96 if let Some('\n') = state.peek() {
97 state.advance(1);
98 }
99 state.add_token(SchemeSyntaxKind::Newline, start_pos, state.get_position());
100 true
101 }
102 else {
103 false
104 }
105 }
106
107 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
108 SCHEME_COMMENT.scan(state, SchemeSyntaxKind::LineComment, SchemeSyntaxKind::Comment)
109 }
110
111 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
112 SCHEME_STRING.scan(state, SchemeSyntaxKind::StringLiteral)
113 }
114
115 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
116 let start = state.get_position();
117 let mut len = 0;
118 let mut has_digits = false;
119
120 {
121 let rest = state.rest();
122 if rest.is_empty() {
123 return false;
124 }
125
126 let first_char = rest.chars().next().unwrap();
127 if !first_char.is_ascii_digit() && first_char != '-' && first_char != '+' {
128 return false;
129 }
130
131 if first_char == '-' || first_char == '+' {
133 len += first_char.len_utf8();
134 }
135
136 let mut chars = rest.chars().skip(if first_char == '-' || first_char == '+' { 1 } else { 0 });
138
139 while let Some(ch) = chars.next() {
140 if ch.is_ascii_digit() {
141 len += ch.len_utf8();
142 has_digits = true;
143 }
144 else if ch == '.' {
145 len += ch.len_utf8();
147 while let Some(ch) = chars.next() {
148 if ch.is_ascii_digit() {
149 len += ch.len_utf8();
150 has_digits = true;
151 }
152 else {
153 break;
154 }
155 }
156 break;
157 }
158 else {
159 break;
160 }
161 }
162 }
163
164 if has_digits {
165 state.advance(len);
166 let end = state.get_position();
167 state.add_token(SchemeSyntaxKind::NumberLiteral, start, end);
168 true
169 }
170 else {
171 false
172 }
173 }
174
175 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
176 let start = state.get_position();
177 let mut len;
178
179 {
180 let rest = state.rest();
181 if rest.is_empty() {
182 return false;
183 }
184
185 let first_char = rest.chars().next().unwrap();
186 if !self.is_identifier_start(first_char) {
187 return false;
188 }
189
190 len = first_char.len_utf8();
191 let mut chars = rest.chars().skip(1);
192
193 while let Some(ch) = chars.next() {
194 if self.is_identifier_continue(ch) {
195 len += ch.len_utf8();
196 }
197 else {
198 break;
199 }
200 }
201 }
202
203 let text = state.get_text_in(oak_core::Range { start, end: start + len }).to_string();
204 state.advance(len);
205 let end = state.get_position();
206
207 let kind = match text.as_str() {
208 "define" | "lambda" | "if" | "cond" | "case" | "and" | "or" | "not" | "let" | "let*" | "letrec" | "begin" | "do" | "quote" | "quasiquote" | "unquote" | "unquote-splicing" | "set!" | "delay" | "force" | "#t" | "#f" | "null" | "car" | "cdr"
209 | "cons" | "list" | "append" | "length" | "reverse" | "map" | "for-each" | "apply" => SchemeSyntaxKind::Keyword,
210 _ => SchemeSyntaxKind::Identifier,
211 };
212
213 state.add_token(kind, start, end);
214 true
215 }
216
217 fn is_identifier_start(&self, ch: char) -> bool {
218 ch.is_alphabetic() || "!$%&*+-./:<=>?@^_~".contains(ch)
219 }
220
221 fn is_identifier_continue(&self, ch: char) -> bool {
222 self.is_identifier_start(ch) || ch.is_ascii_digit()
223 }
224
225 fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
226 let start = state.get_position();
227 let ch = match state.peek() {
228 Some(ch) => ch,
229 None => return false,
230 };
231
232 let kind = match ch {
233 '(' => Some(SchemeSyntaxKind::LeftParen),
234 ')' => Some(SchemeSyntaxKind::RightParen),
235 '[' => Some(SchemeSyntaxKind::LeftBracket),
236 ']' => Some(SchemeSyntaxKind::RightBracket),
237 '{' => Some(SchemeSyntaxKind::LeftBrace),
238 '}' => Some(SchemeSyntaxKind::RightBrace),
239 '\'' => Some(SchemeSyntaxKind::Quote),
240 '`' => Some(SchemeSyntaxKind::Quasiquote),
241 ',' => Some(SchemeSyntaxKind::Unquote),
242 '.' => Some(SchemeSyntaxKind::Dot),
243 '#' => Some(SchemeSyntaxKind::Hash),
244 _ => None,
245 };
246
247 if let Some(kind) = kind {
248 state.advance(ch.len_utf8());
249 state.add_token(kind, start, state.get_position());
250 true
251 }
252 else {
253 false
254 }
255 }
256}