1#![doc = include_str!("readme.md")]
2use crate::{language::SchemeLanguage, lexer::token_type::SchemeTokenType};
3pub mod token_type;
4use oak_core::{
5 Lexer, LexerCache, LexerState, OakError, Source, TextEdit,
6 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
7};
8use std::sync::LazyLock;
9
/// Lexer state specialized to Scheme; `S` is the source-text provider.
pub(crate) type State<'a, S> = LexerState<'a, S, SchemeLanguage>;

// Shared scanner configurations, built lazily once on first use.
// `unicode_whitespace: true` accepts all Unicode whitespace, not just ASCII.
static SCHEME_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
// Scheme comments: `;` to end of line, plus `#| ... |#` blocks that may nest.
static SCHEME_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: ";", block_start: "#|", block_end: "|#", nested_blocks: true });
// Strings are double-quoted with backslash escapes.
static SCHEME_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
15
/// Lexer for Scheme source text, borrowing its language configuration
/// for the `'config` lifetime.
#[derive(Clone, Debug)]
pub struct SchemeLexer<'config> {
    // Language configuration. NOTE(review): stored by `new` but not read by
    // any scanning method visible in this file — presumably reserved for
    // future dialect switches; confirm before removing.
    config: &'config SchemeLanguage,
}
21
22impl<'config> Lexer<SchemeLanguage> for SchemeLexer<'config> {
23 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<SchemeLanguage>) -> LexOutput<SchemeLanguage> {
24 let mut state: State<'_, S> = LexerState::new_with_cache(source, 0, cache);
25 let result = self.run(&mut state);
26 state.finish_with_cache(result, cache)
27 }
28}
29
30impl<'config> SchemeLexer<'config> {
31 pub fn new(config: &'config SchemeLanguage) -> Self {
33 Self { config }
34 }
35
36 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
37 while state.not_at_end() {
38 let safe_point = state.get_position();
39
40 if self.skip_whitespace(state) {
41 continue;
42 }
43
44 if self.lex_newline(state) {
45 continue;
46 }
47
48 if self.skip_comment(state) {
49 continue;
50 }
51
52 if self.lex_string_literal(state) {
53 continue;
54 }
55
56 if self.lex_number_literal(state) {
57 continue;
58 }
59
60 if self.lex_identifier_or_keyword(state) {
61 continue;
62 }
63
64 if self.lex_single_char_tokens(state) {
65 continue;
66 }
67
68 let start_pos = state.get_position();
70 if let Some(ch) = state.peek() {
71 state.advance(ch.len_utf8());
72 state.add_token(SchemeTokenType::Error, start_pos, state.get_position());
73 }
74
75 state.advance_if_dead_lock(safe_point)
76 }
77
78 state.add_eof();
80 Ok(())
81 }
82
83 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
84 SCHEME_WHITESPACE.scan(state, SchemeTokenType::Whitespace)
85 }
86
87 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
89 let start_pos = state.get_position();
90
91 if let Some('\n') = state.peek() {
92 state.advance(1);
93 state.add_token(SchemeTokenType::Newline, start_pos, state.get_position());
94 true
95 }
96 else if let Some('\r') = state.peek() {
97 state.advance(1);
98 if let Some('\n') = state.peek() {
99 state.advance(1);
100 }
101 state.add_token(SchemeTokenType::Newline, start_pos, state.get_position());
102 true
103 }
104 else {
105 false
106 }
107 }
108
109 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
110 SCHEME_COMMENT.scan(state, SchemeTokenType::LineComment, SchemeTokenType::Comment)
111 }
112
113 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
114 SCHEME_STRING.scan(state, SchemeTokenType::StringLiteral)
115 }
116
117 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
118 let start = state.get_position();
119 let mut len = 0;
120 let mut has_digits = false;
121
122 {
123 let rest = state.rest();
124 if rest.is_empty() {
125 return false;
126 }
127
128 let first_char = rest.chars().next().unwrap();
129 if !first_char.is_ascii_digit() && first_char != '-' && first_char != '+' {
130 return false;
131 }
132
133 if first_char == '-' || first_char == '+' {
135 len += first_char.len_utf8();
136 }
137
138 let mut chars = rest.chars().skip(if first_char == '-' || first_char == '+' { 1 } else { 0 });
140
141 while let Some(ch) = chars.next() {
142 if ch.is_ascii_digit() {
143 len += ch.len_utf8();
144 has_digits = true;
145 }
146 else if ch == '.' {
147 len += ch.len_utf8();
149 while let Some(ch) = chars.next() {
150 if ch.is_ascii_digit() {
151 len += ch.len_utf8();
152 has_digits = true;
153 }
154 else {
155 break;
156 }
157 }
158 break;
159 }
160 else {
161 break;
162 }
163 }
164 }
165
166 if has_digits {
167 state.advance(len);
168 let end = state.get_position();
169 state.add_token(SchemeTokenType::NumberLiteral, start, end);
170 true
171 }
172 else {
173 false
174 }
175 }
176
177 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
178 let start = state.get_position();
179 let mut len;
180
181 {
182 let rest = state.rest();
183 if rest.is_empty() {
184 return false;
185 }
186
187 let first_char = rest.chars().next().unwrap();
188 if !self.is_identifier_start(first_char) {
189 return false;
190 }
191
192 len = first_char.len_utf8();
193 let mut chars = rest.chars().skip(1);
194
195 while let Some(ch) = chars.next() {
196 if self.is_identifier_continue(ch) {
197 len += ch.len_utf8();
198 }
199 else {
200 break;
201 }
202 }
203 }
204
205 let text = state.get_text_in(oak_core::Range { start, end: start + len }).to_string();
206 state.advance(len);
207 let end = state.get_position();
208
209 let kind = match text.as_str() {
210 "define" | "lambda" | "if" | "cond" | "case" | "and" | "or" | "not" | "let" | "let*" | "letrec" | "begin" | "do" | "quote" | "quasiquote" | "unquote" | "unquote-splicing" | "set!" | "delay" | "force" | "#t" | "#f" | "null" | "car" | "cdr"
211 | "cons" | "list" | "append" | "length" | "reverse" | "map" | "for-each" | "apply" => SchemeTokenType::Keyword,
212 _ => SchemeTokenType::Identifier,
213 };
214
215 state.add_token(kind, start, end);
216 true
217 }
218
219 fn is_identifier_start(&self, ch: char) -> bool {
220 ch.is_alphabetic() || "!$%&*+-./:<=>?@^_~".contains(ch)
221 }
222
223 fn is_identifier_continue(&self, ch: char) -> bool {
224 self.is_identifier_start(ch) || ch.is_ascii_digit()
225 }
226
227 fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
228 let start = state.get_position();
229 let ch = match state.peek() {
230 Some(ch) => ch,
231 None => return false,
232 };
233
234 let kind = match ch {
235 '(' => Some(SchemeTokenType::LeftParen),
236 ')' => Some(SchemeTokenType::RightParen),
237 '[' => Some(SchemeTokenType::LeftBracket),
238 ']' => Some(SchemeTokenType::RightBracket),
239 '{' => Some(SchemeTokenType::LeftBrace),
240 '}' => Some(SchemeTokenType::RightBrace),
241 '\'' => Some(SchemeTokenType::Quote),
242 '`' => Some(SchemeTokenType::Quasiquote),
243 ',' => Some(SchemeTokenType::Unquote),
244 '.' => Some(SchemeTokenType::Dot),
245 '#' => Some(SchemeTokenType::Hash),
246 _ => None,
247 };
248
249 if let Some(kind) = kind {
250 state.advance(ch.len_utf8());
251 state.add_token(kind, start, state.get_position());
252 true
253 }
254 else {
255 false
256 }
257 }
258}