1use super::data::*;
2use super::loc::{Position, Span, Spanned};
3use super::utf8::{next_char, MovementInBytes, NextCharError};
4
5#[cfg(feature = "unicode")]
6use unicode_xid::UnicodeXID;
7
/// Options that control which token kinds the [`Tokenizer`] recognises
/// and whether comment tokens are handed back to the caller.
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// When `true`, comment tokens are dropped instead of being returned.
    filter_comment: bool,
    /// When `true`, a leading `#` starts a `#<hex>#` bytes literal.
    support_bytes: bool,
    /// When `true`, `{` and `}` are tokenized as group delimiters.
    support_brace: bool,
    /// When `true`, `[` and `]` are tokenized as group delimiters.
    support_bracket: bool,
}
20
21impl Default for TokenizerConfig {
22 fn default() -> Self {
23 TokenizerConfig {
24 filter_comment: false,
25 support_bytes: true,
26 support_bracket: true,
27 support_brace: true,
28 }
29 }
30}
31
32impl TokenizerConfig {
33 pub fn comment(mut self, enabled: bool) -> Self {
35 self.filter_comment = !enabled;
36 self
37 }
38
39 pub fn braces(mut self, enabled: bool) -> Self {
41 self.support_brace = enabled;
42 self
43 }
44
45 pub fn bracket(mut self, enabled: bool) -> Self {
47 self.support_bracket = enabled;
48 self
49 }
50
51 pub fn support_bytes(mut self, supported: bool) -> Self {
53 self.support_bytes = supported;
54 self
55 }
56}
57
58pub struct Tokenizer<'a> {
60 data: &'a [u8],
61 index: TokDataPos,
62 position: Position,
63 cfg: TokenizerConfig,
64}
65
/// Byte offset into the tokenizer's input data.
#[derive(Copy, Clone)]
pub struct TokDataPos(usize);
68
69#[derive(Clone, Debug)]
71pub enum Token<'a> {
72 Left(GroupKind),
74 Right(GroupKind),
76 Comment(&'a str),
78 Atom(Atom<'a>),
80}
81
82impl<'a> Token<'a> {
83 pub fn is_comment(&self) -> bool {
84 match self {
85 Token::Comment(_) => true,
86 _ => false,
87 }
88 }
89}
90
91pub type SpannedToken<'a> = Spanned<Token<'a>>;
93#[derive(Clone, Debug)]
102pub enum TokenError {
103 DataError(NextCharError, usize),
104 UnterminatedString(Position),
105 UnterminatedBytes(Position),
106 UnprocessedChar(char),
107 UnterminatedBytesChar(Position, char),
108}
109
110impl<'a> Tokenizer<'a> {
111 pub fn new(data: &'a str) -> Self {
113 Tokenizer {
114 data: data.as_bytes(),
115 index: TokDataPos(0),
116 position: Position::default(),
117 cfg: TokenizerConfig::default(),
118 }
119 }
120
121 pub fn new_with_config(data: &'a str, cfg: TokenizerConfig) -> Self {
123 Tokenizer {
124 data: data.as_bytes(),
125 index: TokDataPos(0),
126 position: Position::default(),
127 cfg,
128 }
129 }
130
131 pub fn next(&mut self) -> Result<Option<SpannedToken<'a>>, TokenError> {
133 loop {
137 self.skip_whitespace().expect("Valid string");
138 match self.peek_char().expect("Valid string") {
139 None => return Ok(None),
140 Some((leading_char, advance)) => {
141 let token_start = self.position;
142 let position_start = self.index;
143 self.position.advance(leading_char);
144 self.move_index(advance);
145 let tok = self.next_cont(token_start, position_start, leading_char)?;
146 if !tok.inner.is_comment() {
148 return Ok(Some(tok));
149 } else {
150 if !self.cfg.filter_comment {
151 return Ok(Some(tok));
152 }
153 }
154 }
155 }
156 }
157 }
158
159 fn move_index(&mut self, bytes: MovementInBytes) {
160 self.index.0 += bytes.0
161 }
162
163 fn slice_from(&self, start: TokDataPos) -> &'a str {
164 let slice = &self.data[start.0..self.index.0];
165 core::str::from_utf8(slice).expect("valid utf8")
166 }
167
168 fn peek_char(&self) -> Result<Option<(char, MovementInBytes)>, TokenError> {
169 match next_char(self.data, self.index.0) {
170 Err(e) => Err(TokenError::DataError(e, self.index.0)),
171 Ok(ok) => Ok(ok),
172 }
173 }
174
175 fn skip_whitespace(&mut self) -> Result<(), TokenError> {
176 loop {
177 match self.peek_char()? {
178 None => return Ok(()),
179 Some((ch, advance)) => {
180 if !"\n\t ".contains(ch) {
181 return Ok(());
182 }
183 self.position.advance(ch);
184 self.move_index(advance);
185 }
186 }
187 }
188 }
189
190 fn skip_until<F>(&mut self, f: F) -> Result<(), TokenError>
192 where
193 F: Fn(char) -> bool,
194 {
195 loop {
196 match self.peek_char()? {
197 None => return Ok(()),
198 Some((ch, advance)) => {
199 if f(ch) {
200 return Ok(());
201 }
202 self.position.advance(ch);
203 self.move_index(advance);
204 }
205 }
206 }
207 }
208
209 fn skip_while<F>(&mut self, f: F) -> Result<(), TokenError>
211 where
212 F: Fn(char) -> bool,
213 {
214 loop {
215 match self.peek_char()? {
216 None => return Ok(()),
217 Some((ch, advance)) => {
218 if !f(ch) {
219 return Ok(());
220 }
221 self.position.advance(ch);
222 self.move_index(advance);
223 }
224 }
225 }
226 }
227
228 fn bytes(&mut self) -> Result<ABytes<'a>, TokenError> {
229 let position_start = self.index;
230 self.skip_while(|c| c.is_ascii_hexdigit())?;
231 match self.peek_char()? {
232 None => Err(TokenError::UnterminatedBytes(self.position)),
233 Some((ch, advance)) => {
234 if ch == '#' {
235 let dat = self.slice_from(position_start);
236
237 self.position.advance(ch);
239 self.move_index(advance);
240
241 return Ok(ABytes(dat));
242 } else {
243 return Err(TokenError::UnterminatedBytesChar(self.position, ch));
244 }
245 }
246 }
247 }
248
249 fn number(
250 &mut self,
251 leading_char: char,
252 position_start: TokDataPos,
253 ) -> Result<ANum<'a>, TokenError> {
254 match self.peek_char()? {
255 None => {
256 let dat = self.slice_from(position_start);
258 Ok(ANum {
259 base: ANumBase::Decimal,
260 dat: dat,
261 })
262 }
263 Some((ch, advance)) => {
264 let zero_start = leading_char == '0';
265
266 if zero_start {
267 if ch == 'b' {
268 self.position.advance(ch);
270 self.move_index(advance);
271
272 let position_start = self.index;
273
274 self.skip_while(|c| c == '0' || c == '1' || c == '_')?;
275 Ok(ANum {
276 base: ANumBase::Binary,
277 dat: self.slice_from(position_start),
278 })
279 } else if ch == 'x' {
280 self.position.advance(ch);
282 self.move_index(advance);
283
284 let position_start = self.index;
285
286 self.skip_while(|c| c.is_ascii_hexdigit() || c == '_')?;
287 Ok(ANum {
288 base: ANumBase::Hexadecimal,
289 dat: self.slice_from(position_start),
290 })
291 } else if ch.is_ascii_digit() {
292 self.position.advance(ch);
293 self.move_index(advance);
294
295 self.skip_while(|c| c.is_numeric() || c == '_')?;
296 Ok(ANum {
297 base: ANumBase::Decimal,
298 dat: self.slice_from(position_start),
299 })
300 } else {
301 let dat = self.slice_from(position_start);
302 Ok(ANum {
303 base: ANumBase::Decimal,
304 dat: dat,
305 })
306 }
307 } else {
308 if ch.is_ascii_digit() {
309 self.position.advance(ch);
310 self.move_index(advance);
311
312 self.skip_while(|c| c.is_numeric() || c == '_')?;
313 Ok(ANum {
314 base: ANumBase::Decimal,
315 dat: self.slice_from(position_start),
316 })
317 } else {
318 let dat = self.slice_from(position_start);
319 Ok(ANum {
320 base: ANumBase::Decimal,
321 dat: dat,
322 })
323 }
324 }
325 }
326 }
327 }
328
329 fn string(&mut self) -> Result<AStr<'a>, TokenError> {
331 let mut has_escape = false; let position_start = self.index;
333
334 let mut escape = false;
335 loop {
336 match self.peek_char()? {
337 None => return Err(TokenError::UnterminatedString(self.position)),
338 Some((ch, advance)) => {
339 if escape {
340 escape = false;
341 } else {
342 if ch == '\\' {
343 has_escape = true;
344 escape = true;
345 } else if ch == '"' {
346 let dat = self.slice_from(position_start);
347
348 self.position.advance(ch);
350 self.move_index(advance);
351
352 return Ok(AStr {
353 has_escape,
354 raw_data: dat,
355 });
356 }
357 }
358 self.position.advance(ch);
359 self.move_index(advance);
360 }
361 }
362 }
363 }
364
365 fn next_cont(
367 &mut self,
368 token_start: Position,
369 position_start: TokDataPos,
370 leading_char: char,
371 ) -> Result<SpannedToken<'a>, TokenError> {
372 let stok = |cur, token| {
373 let span = Span {
374 start: token_start,
375 end: cur,
376 };
377 Ok(Spanned { span, inner: token })
378 };
379
380 if leading_char == '(' {
389 stok(self.position, Token::Left(GroupKind::Paren))
390 } else if leading_char == ')' {
391 stok(self.position, Token::Right(GroupKind::Paren))
392 } else if self.cfg.support_bracket && leading_char == '[' {
393 stok(self.position, Token::Left(GroupKind::Bracket))
394 } else if self.cfg.support_bracket && leading_char == ']' {
395 stok(self.position, Token::Right(GroupKind::Bracket))
396 } else if self.cfg.support_brace && leading_char == '{' {
397 stok(self.position, Token::Left(GroupKind::Brace))
398 } else if self.cfg.support_brace && leading_char == '}' {
399 stok(self.position, Token::Right(GroupKind::Brace))
400 } else if leading_char == ';' {
401 self.skip_until(|c| c == '\n')?;
403 let comment = self.slice_from(position_start);
404 stok(self.position, Token::Comment(comment))
405 } else if leading_char == '"' {
406 let astr = self.string()?;
408 stok(self.position, Token::Atom(Atom::String(astr)))
409 } else if self.cfg.support_bytes && leading_char == '#' {
410 let bstr = self.bytes()?;
412 stok(self.position, Token::Atom(Atom::Bytes(bstr)))
413 } else if leading_char.is_ascii_digit() {
414 let anum = self.number(leading_char, position_start)?;
416 let is_decimal = anum.base == ANumBase::Decimal;
417 if is_decimal {
419 match self.peek_char() {
420 Ok(Some((ch @ '.', dot_advance))) => {
421 self.position.advance(ch);
422 self.move_index(dot_advance);
423
424 let fractional_start = self.index;
426 self.skip_while(|c| c.is_ascii_digit())?;
427 let raw_fractional = self.slice_from(fractional_start);
428
429 let adec = ADecimal {
430 raw_integral: anum.dat,
431 raw_fractional,
432 };
433 stok(self.position, Token::Atom(Atom::Decimal(adec)))
434 }
435 _ => stok(self.position, Token::Atom(Atom::Integral(anum))),
436 }
437 } else {
438 stok(self.position, Token::Atom(Atom::Integral(anum)))
439 }
440 } else if is_id_start(leading_char) {
441 self.skip_while(|c| is_id_continue(c))?;
442 let ident = self.slice_from(position_start);
443 stok(self.position, Token::Atom(Atom::Ident(ident)))
444 } else {
445 Err(TokenError::UnprocessedChar(leading_char))
446 }
447 }
448}
449
450fn is_id_start(ch: char) -> bool {
451 #[cfg(feature = "unicode")]
452 {
453 ch.is_xid_start()
454 || ch == '_'
455 || is_ascii_operator(ch)
456 || crate::utf8::extended_math_operator(ch)
457 }
458 #[cfg(not(feature = "unicode"))]
459 {
460 ch.is_ascii_alphabetic() || ch == '_' || is_ascii_operator(ch)
461 }
462}
463
464fn is_id_continue(ch: char) -> bool {
465 #[cfg(feature = "unicode")]
466 {
467 ch.is_xid_continue()
468 || ch == '_'
469 || ch.is_ascii_digit()
470 || is_ascii_operator(ch)
471 || crate::utf8::extended_math_operator(ch)
472 }
473 #[cfg(not(feature = "unicode"))]
474 {
475 ch.is_ascii_alphabetic() || ch == '_' || ch.is_ascii_digit() || is_ascii_operator(ch)
476 }
477}
478
/// Tells whether `ch` is one of the ASCII punctuation characters that are
/// allowed inside identifiers.
fn is_ascii_operator(ch: char) -> bool {
    const ASCII_OPERATORS: &str = "?!#@$+-*/=<>,.:|%^&~'`";
    ASCII_OPERATORS.chars().any(|op| op == ch)
}