1#![doc = include_str!("readme.md")]
2use crate::{language::SchemeLanguage, lexer::token_type::SchemeTokenType};
3pub mod token_type;
4use oak_core::{
5 Lexer, LexerCache, LexerState, OakError, Source, TextEdit,
6 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
7};
8use std::sync::LazyLock;
9
/// Shorthand for the `oak_core` lexer state specialised to Scheme.
type State<'a, S> = LexerState<'a, S, SchemeLanguage>;
11
// Whitespace scanner config; `unicode_whitespace` also accepts non-ASCII whitespace.
static SCHEME_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
// Comment scanner config: `;` line comments and `#| ... |#` block comments, which may nest.
static SCHEME_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: ";", block_start: "#|", block_end: "|#", nested_blocks: true });
// String scanner config: double-quoted strings with `\` as the escape character.
static SCHEME_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
15
/// Lexer for Scheme source text, producing `SchemeTokenType` tokens.
#[derive(Clone, Debug)]
pub struct SchemeLexer<'config> {
    // Borrowed language configuration. NOTE(review): it is stored but never
    // read by any method visible in this file — confirm whether it is still
    // needed or consumed elsewhere.
    config: &'config SchemeLanguage,
}
20
impl<'config> Lexer<SchemeLanguage> for SchemeLexer<'config> {
    /// Tokenizes `source` from position 0 using `run`, wiring the lexer state
    /// up to `cache` on both entry and exit.
    ///
    /// NOTE(review): `_edits` is ignored, so every call performs a full relex
    /// rather than an incremental one — confirm that is the intended behavior
    /// for this language.
    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<SchemeLanguage>) -> LexOutput<SchemeLanguage> {
        let mut state: State<'_, S> = LexerState::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        state.finish_with_cache(result, cache)
    }
}
28
29impl<'config> SchemeLexer<'config> {
30 pub fn new(config: &'config SchemeLanguage) -> Self {
31 Self { config }
32 }
33
34 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
35 while state.not_at_end() {
36 let safe_point = state.get_position();
37
38 if self.skip_whitespace(state) {
39 continue;
40 }
41
42 if self.lex_newline(state) {
43 continue;
44 }
45
46 if self.skip_comment(state) {
47 continue;
48 }
49
50 if self.lex_string_literal(state) {
51 continue;
52 }
53
54 if self.lex_number_literal(state) {
55 continue;
56 }
57
58 if self.lex_identifier_or_keyword(state) {
59 continue;
60 }
61
62 if self.lex_single_char_tokens(state) {
63 continue;
64 }
65
66 let start_pos = state.get_position();
68 if let Some(ch) = state.peek() {
69 state.advance(ch.len_utf8());
70 state.add_token(SchemeTokenType::Error, start_pos, state.get_position());
71 }
72
73 state.advance_if_dead_lock(safe_point)
74 }
75
76 state.add_eof();
78 Ok(())
79 }
80
81 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
82 SCHEME_WHITESPACE.scan(state, SchemeTokenType::Whitespace)
83 }
84
85 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
87 let start_pos = state.get_position();
88
89 if let Some('\n') = state.peek() {
90 state.advance(1);
91 state.add_token(SchemeTokenType::Newline, start_pos, state.get_position());
92 true
93 }
94 else if let Some('\r') = state.peek() {
95 state.advance(1);
96 if let Some('\n') = state.peek() {
97 state.advance(1);
98 }
99 state.add_token(SchemeTokenType::Newline, start_pos, state.get_position());
100 true
101 }
102 else {
103 false
104 }
105 }
106
107 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
108 SCHEME_COMMENT.scan(state, SchemeTokenType::LineComment, SchemeTokenType::Comment)
109 }
110
111 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
112 SCHEME_STRING.scan(state, SchemeTokenType::StringLiteral)
113 }
114
115 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
116 let start = state.get_position();
117 let mut len = 0;
118 let mut has_digits = false;
119
120 {
121 let rest = state.rest();
122 if rest.is_empty() {
123 return false;
124 }
125
126 let first_char = rest.chars().next().unwrap();
127 if !first_char.is_ascii_digit() && first_char != '-' && first_char != '+' {
128 return false;
129 }
130
131 if first_char == '-' || first_char == '+' {
133 len += first_char.len_utf8();
134 }
135
136 let mut chars = rest.chars().skip(if first_char == '-' || first_char == '+' { 1 } else { 0 });
138
139 while let Some(ch) = chars.next() {
140 if ch.is_ascii_digit() {
141 len += ch.len_utf8();
142 has_digits = true;
143 }
144 else if ch == '.' {
145 len += ch.len_utf8();
147 while let Some(ch) = chars.next() {
148 if ch.is_ascii_digit() {
149 len += ch.len_utf8();
150 has_digits = true;
151 }
152 else {
153 break;
154 }
155 }
156 break;
157 }
158 else {
159 break;
160 }
161 }
162 }
163
164 if has_digits {
165 state.advance(len);
166 let end = state.get_position();
167 state.add_token(SchemeTokenType::NumberLiteral, start, end);
168 true
169 }
170 else {
171 false
172 }
173 }
174
175 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
176 let start = state.get_position();
177 let mut len;
178
179 {
180 let rest = state.rest();
181 if rest.is_empty() {
182 return false;
183 }
184
185 let first_char = rest.chars().next().unwrap();
186 if !self.is_identifier_start(first_char) {
187 return false;
188 }
189
190 len = first_char.len_utf8();
191 let mut chars = rest.chars().skip(1);
192
193 while let Some(ch) = chars.next() {
194 if self.is_identifier_continue(ch) {
195 len += ch.len_utf8();
196 }
197 else {
198 break;
199 }
200 }
201 }
202
203 let text = state.get_text_in(oak_core::Range { start, end: start + len }).to_string();
204 state.advance(len);
205 let end = state.get_position();
206
207 let kind = match text.as_str() {
208 "define" | "lambda" | "if" | "cond" | "case" | "and" | "or" | "not" | "let" | "let*" | "letrec" | "begin" | "do" | "quote" | "quasiquote" | "unquote" | "unquote-splicing" | "set!" | "delay" | "force" | "#t" | "#f" | "null" | "car" | "cdr"
209 | "cons" | "list" | "append" | "length" | "reverse" | "map" | "for-each" | "apply" => SchemeTokenType::Keyword,
210 _ => SchemeTokenType::Identifier,
211 };
212
213 state.add_token(kind, start, end);
214 true
215 }
216
217 fn is_identifier_start(&self, ch: char) -> bool {
218 ch.is_alphabetic() || "!$%&*+-./:<=>?@^_~".contains(ch)
219 }
220
221 fn is_identifier_continue(&self, ch: char) -> bool {
222 self.is_identifier_start(ch) || ch.is_ascii_digit()
223 }
224
225 fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
226 let start = state.get_position();
227 let ch = match state.peek() {
228 Some(ch) => ch,
229 None => return false,
230 };
231
232 let kind = match ch {
233 '(' => Some(SchemeTokenType::LeftParen),
234 ')' => Some(SchemeTokenType::RightParen),
235 '[' => Some(SchemeTokenType::LeftBracket),
236 ']' => Some(SchemeTokenType::RightBracket),
237 '{' => Some(SchemeTokenType::LeftBrace),
238 '}' => Some(SchemeTokenType::RightBrace),
239 '\'' => Some(SchemeTokenType::Quote),
240 '`' => Some(SchemeTokenType::Quasiquote),
241 ',' => Some(SchemeTokenType::Unquote),
242 '.' => Some(SchemeTokenType::Dot),
243 '#' => Some(SchemeTokenType::Hash),
244 _ => None,
245 };
246
247 if let Some(kind) = kind {
248 state.advance(ch.len_utf8());
249 state.add_token(kind, start, state.get_position());
250 true
251 }
252 else {
253 false
254 }
255 }
256}