1use std::collections::VecDeque;
2use std::ops::Range;
3
4use crate::atom::{AtomData, PlausibleSerializedAtom};
5use crate::error::{Result, TokenizationError};
6use crate::input::{Input, InputChunk};
7use crate::Ref;
8
/// A fully-processed S-expression token: comments have been stripped and
/// atom contents unescaped into [`AtomData`].
#[derive(Debug)]
pub enum Token<'de, 't> {
    /// `(` — start of a list.
    LeftParen,
    /// An atom whose data may borrow from the input (`'de`) or from
    /// tokenizer-owned scratch space (`'t`).
    Atom(Ref<'de, 't, AtomData>),
    /// `)` — end of a list.
    RightParen,
}
15
16impl<'de, 't> Token<'de, 't> {
17 pub fn kind(&self) -> TokenKind {
18 match self {
19 Token::LeftParen => TokenKind::LeftParen,
20 Token::Atom(_) => TokenKind::Atom,
21 Token::RightParen => TokenKind::RightParen,
22 }
23 }
24}
25
/// Data-free discriminant of [`Token`], used for peeking without borrowing
/// token contents.
#[derive(Copy, Clone, Debug)]
pub enum TokenKind {
    LeftParen,
    Atom,
    RightParen,
}
32
/// Streaming iterator over fully-processed tokens.
///
/// This is not a std `Iterator`: each returned [`Token`] may borrow (`'t`)
/// from the iterator itself, so only one token can be alive at a time.
pub trait TokenIterator<'de> {
    /// Returns the next token, or `None` at end of input.
    fn next<'t>(&'t mut self) -> Result<Option<Token<'de, 't>>>;

    /// Returns the kind of the next token without consuming it.
    fn peek_kind(&mut self) -> Result<Option<TokenKind>>;
}
38
/// Kind of a variable-length token — one whose bytes must be tracked, as
/// opposed to the single-character parens and the fixed `#;` marker.
#[derive(Copy, Clone, Debug)]
pub enum VarTokenKind {
    Atom,
    LineComment,
    BlockComment,
}
45
/// Raw bytes of a single comment token, exactly as they appeared in the
/// input.
///
/// `#[repr(transparent)]` guarantees this type has the same layout as
/// `[u8]`; the pointer cast in [`RawTokenBytes::new`] relies on that.
#[derive(Debug)]
#[repr(transparent)]
pub struct RawTokenBytes([u8]);
60
impl RawTokenBytes {
    /// Wraps a byte slice without copying.
    pub fn new(bytes: &[u8]) -> &RawTokenBytes {
        // SAFETY: `RawTokenBytes` is `#[repr(transparent)]` over `[u8]`, so
        // the reference cast is layout-compatible and preserves the
        // borrow's lifetime.
        unsafe { &*(bytes as *const [u8] as *const RawTokenBytes) }
    }

    /// Returns the underlying bytes.
    pub fn bytes(&self) -> &[u8] {
        &self.0
    }

    /// Checks that every quoted string appearing inside these block-comment
    /// bytes is terminated and uses valid escape sequences.
    pub fn validate_block_comment(&self) -> std::result::Result<(), TokenizationError> {
        let mut bytes = &self.0;

        // Scan for each opening quote in turn.
        while let Some(open_quote_index) = bytes.iter().position(|b| *b == b'"') {
            // `bytes` now starts just after the opening quote.
            bytes = &bytes[(open_quote_index + 1)..];

            // Index (relative to `bytes`) of the closing quote, accumulated
            // as we hop over backslash escapes.
            let mut close_quote_index = 0;
            let mut remaining_bytes = bytes;

            'find_close_quote: loop {
                let Some(quote_or_backslash_index) = remaining_bytes
                    .iter()
                    .position(|b| *b == b'"' || *b == b'\\')
                else {
                    return Err(TokenizationError::UnterminatedQuote);
                };

                close_quote_index += quote_or_backslash_index;

                if remaining_bytes[quote_or_backslash_index] == b'"' {
                    break 'find_close_quote;
                }

                // A backslash must be followed by at least one byte.
                if quote_or_backslash_index + 1 >= remaining_bytes.len() {
                    return Err(TokenizationError::UnterminatedBackslashEscape);
                }

                // Skip the backslash and the escaped byte.
                close_quote_index += 2;
                remaining_bytes = &remaining_bytes[(quote_or_backslash_index + 2)..];
            }

            // Validate the escapes within the quoted span (quotes excluded).
            PlausibleSerializedAtom::validate_quote_escaping(&bytes[..close_quote_index])?;

            // Continue scanning after the closing quote.
            bytes = &bytes[(close_quote_index + 1)..];
        }

        Ok(())
    }
}
115
/// A raw token: the lexical shape of the input, before comments are dropped
/// and atoms are unescaped.
#[derive(Debug)]
pub enum RawToken<'de, 't> {
    LeftParen,
    RightParen,
    /// An atom that passed superficial lexical checks; full validation and
    /// unescaping happen later.
    Atom(Ref<'de, 't, PlausibleSerializedAtom>),
    /// A `;`-to-end-of-line comment, including the leading `;` but not the
    /// terminating newline.
    LineComment(Ref<'de, 't, RawTokenBytes>),
    /// A `#| ... |#` comment, delimiters included.
    BlockComment(Ref<'de, 't, RawTokenBytes>),
    /// The `#;` marker: the next complete sexp is commented out.
    SexpComment,
}
125
/// Data-free discriminant of [`RawToken`], used for peeking without
/// borrowing token contents.
///
/// Derives `Copy`/`Clone`/`Debug` for consistency with the sibling kind
/// enums `TokenKind` and `VarTokenKind`.
#[derive(Copy, Clone, Debug)]
pub enum RawTokenKind {
    LeftParen,
    RightParen,
    Atom,
    LineComment,
    BlockComment,
    SexpComment,
}
135
136impl<'de, 't> RawToken<'de, 't> {
137 fn from_token_bytes_and_kind(
138 token_bytes: Ref<'de, 't, [u8]>,
139 kind: VarTokenKind,
140 ) -> RawToken<'de, 't> {
141 if matches!(kind, VarTokenKind::Atom) {
142 let plausible_atom = match token_bytes {
143 Ref::Borrowed(bytes) => Ref::Borrowed(PlausibleSerializedAtom::new(bytes).unwrap()),
144 Ref::Transient(bytes) => {
145 Ref::Transient(PlausibleSerializedAtom::new(bytes).unwrap())
146 }
147 };
148
149 return RawToken::Atom(plausible_atom);
150 }
151
152 let raw_token_bytes = match token_bytes {
153 Ref::Borrowed(bytes) => Ref::Borrowed(RawTokenBytes::new(bytes)),
154 Ref::Transient(bytes) => Ref::Transient(RawTokenBytes::new(bytes)),
155 };
156
157 match kind {
158 VarTokenKind::LineComment => RawToken::LineComment(raw_token_bytes),
159 VarTokenKind::BlockComment => RawToken::BlockComment(raw_token_bytes),
160 VarTokenKind::Atom => unreachable!(),
161 }
162 }
163}
164
/// Where a queued token's bytes live.
#[derive(Debug)]
enum RawTokenRefData {
    /// A byte range within the buffer most recently fed to the tokenizer.
    Range(Range<usize>),
    /// The token spanned buffers and was accumulated in the tokenizer's
    /// previous-token scratch buffer.
    Scratch,
}
170
/// A recognized-but-unresolved token queued by the tape tokenizer; resolved
/// against the current input buffer (or scratch space) when handed out.
#[derive(Debug)]
enum RawTokenRef {
    LeftParen,
    RightParen,
    SexpComment,
    VarToken(RawTokenRefData, VarTokenKind),
}
178
179impl RawTokenRef {
180 fn to_raw_token_kind(&self) -> RawTokenKind {
181 match self {
182 RawTokenRef::LeftParen => RawTokenKind::LeftParen,
183 RawTokenRef::RightParen => RawTokenKind::RightParen,
184 RawTokenRef::VarToken(_, VarTokenKind::Atom) => RawTokenKind::Atom,
185 RawTokenRef::VarToken(_, VarTokenKind::LineComment) => RawTokenKind::LineComment,
186 RawTokenRef::VarToken(_, VarTokenKind::BlockComment) => RawTokenKind::BlockComment,
187 RawTokenRef::SexpComment => RawTokenKind::SexpComment,
188 }
189 }
190}
191
/// Witness that `has_enough_data_to_produce_tokens` returned `Some`; the
/// token-producing methods require it, so they cannot be called before that
/// check. The private `()` field prevents construction outside this module.
pub struct HasEnoughData(());
193
/// A push-based source of raw tokens.
///
/// Data flows in via `feed_more_data` / `eof`; tokens flow out via
/// `next_raw_token` / `peek_raw_token_kind` / `advance`, all of which are
/// gated on a [`HasEnoughData`] witness obtained from
/// `has_enough_data_to_produce_tokens`.
pub trait RawTokenTape {
    /// Feeds the next buffer of input bytes.
    fn feed_more_data(&mut self, data: &[u8]);
    /// Signals that no further data will arrive.
    fn eof(&mut self);
    /// Returns a witness iff a token, error, or end-of-stream can be
    /// reported without more input.
    fn has_enough_data_to_produce_tokens(&self) -> Option<HasEnoughData>;

    /// Pops the next raw token. `current_data` must be the buffer most
    /// recently passed to `feed_more_data`; it is used to resolve tokens
    /// stored by range.
    fn next_raw_token<'de, 't>(
        &'t mut self,
        witness: HasEnoughData,
        current_data: Option<Ref<'de, 't, [u8]>>,
    ) -> Result<Option<RawToken<'de, 't>>>;

    /// Reports the kind of the next raw token without consuming it.
    fn peek_raw_token_kind(&self, witness: &HasEnoughData) -> Result<Option<RawTokenKind>>;

    /// Discards the next raw token.
    fn advance(&mut self, witness: HasEnoughData) -> Result<()>;
}
209
/// State of the byte-at-a-time tokenizer state machine.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum TokenizationState {
    /// Between tokens.
    Start,
    /// Saw `\r`; only `\n` may legally follow.
    CarriageReturn,
    /// Inside an unquoted atom.
    InUnquotedAtom,
    /// Inside an unquoted atom whose previous byte was `#`.
    InUnquotedAtomPoundSign,
    /// Inside an unquoted atom whose previous byte was `|`.
    InUnquotedAtomBar,
    /// Inside a `"..."` atom.
    InQuotedAtom,
    /// Inside a quoted atom, immediately after a `\`.
    InQuotedAtomEscape,
    /// Inside a `;` line comment.
    LineComment,
    /// Saw `#` as the first byte of a token (may become `#;`, `#|`, or an
    /// atom starting with `#`).
    PoundSign,
    /// Saw `|` as the first byte of a token (a following `#` is an error).
    Bar,
    /// Inside a `#| ... |#` block comment.
    BlockComment,
    /// Inside a block comment, immediately after a `#`.
    BlockCommentPoundSign,
    /// Inside a block comment, immediately after a `|`.
    BlockCommentBar,
    /// Inside a quoted string within a block comment.
    BlockCommentInQuotedString,
    /// Inside a quoted string within a block comment, after a `\`.
    BlockCommentInQuotedStringEscape,
}
270
/// Push-based tokenizer state machine: callers feed byte buffers in and
/// drain queued raw tokens out between feeds.
pub struct BasicTapeTokenizer {
    /// Current machine state; `None` once EOF was consumed or a fatal
    /// tokenization error was queued.
    state: Option<TokenizationState>,
    /// Bytes of the most recently *completed* cross-buffer token; `Scratch`
    /// token refs resolve against this buffer.
    scratch_buffer_for_a_previous_token: Vec<u8>,
    /// Accumulates the in-progress token when it spans buffer boundaries.
    scratch_buffer_for_current_token: Vec<u8>,
    /// True when the current token has spilled into the scratch buffer.
    using_scratch_buffer_for_current_token: bool,
    /// Tokens (or a trailing error) recognized but not yet handed out.
    raw_token_refs: VecDeque<Result<RawTokenRef>>,
    /// Nesting depth inside `#| ... |#`; block comments nest.
    block_comment_depth: i64,
    /// Offset of the current token's first byte within the current buffer.
    start_of_current_token: usize,
}
286
/// Pattern matching inter-token whitespace: space, newline, tab, form feed.
/// `\r` is intentionally absent — it is only valid as part of `\r\n` and is
/// tracked via `TokenizationState::CarriageReturn`.
macro_rules! whitespace {
    () => {
        b' ' | b'\n' | b'\t' | b'\x0c'
    };
}
292
impl BasicTapeTokenizer {
    /// Creates a tokenizer in the initial (between-tokens) state.
    pub fn new() -> Self {
        BasicTapeTokenizer {
            state: Some(TokenizationState::Start),
            scratch_buffer_for_a_previous_token: vec![],
            scratch_buffer_for_current_token: vec![],
            using_scratch_buffer_for_current_token: false,
            raw_token_refs: VecDeque::new(),
            block_comment_depth: 0,
            start_of_current_token: 0,
        }
    }

    /// Records that a new token starts at `pos` in the current buffer and
    /// enters `state`.
    fn start_new_token(&mut self, pos: usize, state: TokenizationState) {
        self.using_scratch_buffer_for_current_token = false;
        self.start_of_current_token = pos;
        self.state = Some(state);
    }

    /// Spills the current token's bytes seen so far (the tail of `buffer`)
    /// into the scratch buffer; called when a buffer ends mid-token.
    fn copy_partial_token_to_scratch_buffer(&mut self, buffer: &[u8]) {
        let partial_token = &buffer[self.start_of_current_token..];
        if !self.using_scratch_buffer_for_current_token {
            // First spill for this token: drop any stale scratch contents.
            self.scratch_buffer_for_current_token.clear();
            self.using_scratch_buffer_for_current_token = true;
        }
        self.scratch_buffer_for_current_token
            .extend_from_slice(partial_token);
    }

    /// Queues a finished var-length token that ends just before
    /// `ends_before`. Single-buffer tokens are queued by `Range`; tokens
    /// that spilled to scratch are completed there and queued as `Scratch`.
    fn finish_token(&mut self, kind: VarTokenKind, ends_before: usize, buffer: &[u8]) {
        let range = self.start_of_current_token..ends_before;
        let raw_token_ref_data = if self.using_scratch_buffer_for_current_token {
            let partial_token = &buffer[range];
            self.scratch_buffer_for_current_token
                .extend_from_slice(partial_token);
            self.complete_token_in_scratch_buffer();
            RawTokenRefData::Scratch
        } else {
            RawTokenRefData::Range(range)
        };

        self.raw_token_refs
            .push_back(Ok(RawTokenRef::VarToken(raw_token_ref_data, kind)));
    }

    /// Promotes the completed token to the "previous token" scratch buffer
    /// (where `Scratch` refs resolve) and readies the current-token buffer
    /// for reuse. The swap avoids copying the bytes.
    fn complete_token_in_scratch_buffer(&mut self) {
        std::mem::swap(
            &mut self.scratch_buffer_for_a_previous_token,
            &mut self.scratch_buffer_for_current_token,
        );
        self.scratch_buffer_for_current_token.clear();
    }
}
351
352impl Default for BasicTapeTokenizer {
353 fn default() -> Self {
354 Self::new()
355 }
356}
357
impl RawTokenTape for BasicTapeTokenizer {
    /// A witness is available once at least one token (or queued error) is
    /// ready, or once the machine has terminated (`state == None`).
    fn has_enough_data_to_produce_tokens(&self) -> Option<HasEnoughData> {
        if self.raw_token_refs.is_empty() && self.state.is_some() {
            None
        } else {
            Some(HasEnoughData(()))
        }
    }

    fn peek_raw_token_kind(&self, _: &HasEnoughData) -> Result<Option<RawTokenKind>> {
        match self.raw_token_refs.front() {
            None => Ok(None),
            // Peeking must not consume the queued error, so clone it.
            Some(Err(err)) => Err(err.clone()),
            Some(Ok(raw_token_ref)) => Ok(Some(raw_token_ref.to_raw_token_kind())),
        }
    }

    fn next_raw_token<'de, 't>(
        &'t mut self,
        _: HasEnoughData,
        current_data: Option<Ref<'de, 't, [u8]>>,
    ) -> Result<Option<RawToken<'de, 't>>> {
        match self.raw_token_refs.pop_front() {
            None => Ok(None),
            Some(Err(error)) => Err(error),
            Some(Ok(raw_token_ref)) => {
                let raw_token = match raw_token_ref {
                    RawTokenRef::LeftParen => RawToken::LeftParen,
                    RawTokenRef::RightParen => RawToken::RightParen,
                    RawTokenRef::SexpComment => RawToken::SexpComment,
                    RawTokenRef::VarToken(raw_token_ref_data, token_kind) => {
                        // Resolve the token's bytes: either the cross-buffer
                        // scratch space, or a range into the caller-supplied
                        // current buffer.
                        let raw_token_bytes = match raw_token_ref_data {
                            RawTokenRefData::Scratch => {
                                Ref::Transient(self.scratch_buffer_for_a_previous_token.as_slice())
                            }
                            RawTokenRefData::Range(range) => match current_data {
                                Some(data) => data.index(range.clone()),
                                None => panic!("TapeTokenizer has stale `Range` ref."),
                            },
                        };

                        RawToken::from_token_bytes_and_kind(raw_token_bytes, token_kind)
                    }
                };

                Ok(Some(raw_token))
            }
        }
    }

    /// Like `next_raw_token`, but drops the token; errors still surface.
    fn advance(&mut self, _: HasEnoughData) -> Result<()> {
        match self.raw_token_refs.pop_front() {
            None => Ok(()),
            Some(Err(error)) => Err(error),
            Some(Ok(_raw_token_ref)) => Ok(()),
        }
    }

    fn feed_more_data(&mut self, buffer: &[u8]) {
        // Callers must drain queued tokens before feeding more data;
        // otherwise by-range refs into the previous buffer would go stale.
        assert!(self.raw_token_refs.is_empty());

        if buffer.is_empty() {
            return;
        }

        if self.state.is_none() {
            self.raw_token_refs
                .push_back(Err(TokenizationError::TriedToProcessMoreDataAfterEof.into()));
            return;
        };

        // Token ranges are relative to this buffer; any partial token from
        // the previous buffer already lives in the scratch buffer.
        self.start_of_current_token = 0;

        for (pos, ch) in buffer.iter().enumerate() {
            // `unwrap` is safe: every branch that sets `state` to `None`
            // returns immediately afterwards.
            match self.state.unwrap() {
                TokenizationState::Start => match *ch {
                    whitespace!() => (),
                    b'(' => self.raw_token_refs.push_back(Ok(RawTokenRef::LeftParen)),
                    b')' => self.raw_token_refs.push_back(Ok(RawTokenRef::RightParen)),
                    b'\r' => self.state = Some(TokenizationState::CarriageReturn),
                    b'"' => self.start_new_token(pos, TokenizationState::InQuotedAtom),
                    b';' => self.start_new_token(pos, TokenizationState::LineComment),
                    b'#' => self.start_new_token(pos, TokenizationState::PoundSign),
                    b'|' => self.start_new_token(pos, TokenizationState::Bar),
                    _ => self.start_new_token(pos, TokenizationState::InUnquotedAtom),
                },
                TokenizationState::CarriageReturn => {
                    // A bare `\r` (not part of `\r\n`) is a fatal error.
                    if *ch != b'\n' {
                        self.raw_token_refs
                            .push_back(Err(TokenizationError::NakedCarriageReturn.into()));
                        self.state = None;
                        return;
                    }
                    self.state = Some(TokenizationState::Start);
                }
                // All "inside/starting an unquoted atom" states share the
                // same delimiter handling; `#`/`|`/`;` behave specially
                // depending on the previous byte.
                TokenizationState::InUnquotedAtom
                | TokenizationState::InUnquotedAtomPoundSign
                | TokenizationState::InUnquotedAtomBar
                | TokenizationState::PoundSign
                | TokenizationState::Bar => match *ch {
                    whitespace!() => {
                        self.finish_token(VarTokenKind::Atom, pos, buffer);
                        self.state = Some(TokenizationState::Start);
                    }
                    b'(' => {
                        self.finish_token(VarTokenKind::Atom, pos, buffer);
                        self.raw_token_refs.push_back(Ok(RawTokenRef::LeftParen));
                        self.state = Some(TokenizationState::Start);
                    }
                    b')' => {
                        self.finish_token(VarTokenKind::Atom, pos, buffer);
                        self.raw_token_refs.push_back(Ok(RawTokenRef::RightParen));
                        self.state = Some(TokenizationState::Start);
                    }
                    b'\r' => {
                        self.finish_token(VarTokenKind::Atom, pos, buffer);
                        self.state = Some(TokenizationState::CarriageReturn);
                    }
                    b'"' => {
                        self.finish_token(VarTokenKind::Atom, pos, buffer);
                        self.start_new_token(pos, TokenizationState::InQuotedAtom);
                    }
                    b';' => match self.state.unwrap() {
                        // `#;` is the sexp-comment marker (no atom emitted).
                        TokenizationState::PoundSign => {
                            self.raw_token_refs.push_back(Ok(RawTokenRef::SexpComment));
                            self.state = Some(TokenizationState::Start);
                        }
                        _ => {
                            self.finish_token(VarTokenKind::Atom, pos, buffer);
                            self.start_new_token(pos, TokenizationState::LineComment);
                        }
                    },
                    b'#' => match self.state.unwrap() {
                        // `|#` inside an atom would close a nonexistent
                        // block comment.
                        TokenizationState::InUnquotedAtomBar => {
                            self.raw_token_refs.push_back(Err(
                                TokenizationError::BlockCommentEndTokenInUnquotedAtom.into(),
                            ));
                            self.state = None;
                            return;
                        }
                        // A token beginning `|#` is a stray comment closer.
                        TokenizationState::Bar => {
                            self.raw_token_refs.push_back(Err(
                                TokenizationError::UnexpectedEndOfBlockComment.into(),
                            ));
                            self.state = None;
                            return;
                        }
                        _ => self.state = Some(TokenizationState::InUnquotedAtomPoundSign),
                    },
                    b'|' => match self.state.unwrap() {
                        // `#|` inside an atom would open a block comment
                        // mid-atom.
                        TokenizationState::InUnquotedAtomPoundSign => {
                            self.raw_token_refs.push_back(Err(
                                TokenizationError::BlockCommentStartTokenInUnquotedAtom.into(),
                            ));
                            self.state = None;
                            return;
                        }
                        // A token beginning `#|` opens a block comment.
                        TokenizationState::PoundSign => {
                            self.state = Some(TokenizationState::BlockComment);
                            self.block_comment_depth = 1;
                        }
                        _ => self.state = Some(TokenizationState::InUnquotedAtomBar),
                    },
                    _ => self.state = Some(TokenizationState::InUnquotedAtom),
                },
                // Quoted strings behave identically whether they are an
                // atom or appear inside a block comment; only the state to
                // return to differs.
                TokenizationState::InQuotedAtom | TokenizationState::BlockCommentInQuotedString => {
                    match *ch {
                        b'"' => {
                            if self.state.unwrap() == TokenizationState::InQuotedAtom {
                                // `pos + 1`: the closing quote belongs to
                                // the atom.
                                self.finish_token(VarTokenKind::Atom, pos + 1, buffer);
                                self.state = Some(TokenizationState::Start);
                            } else {
                                self.state = Some(TokenizationState::BlockComment);
                            }
                        }
                        b'\\' => {
                            if self.state.unwrap() == TokenizationState::InQuotedAtom {
                                self.state = Some(TokenizationState::InQuotedAtomEscape);
                            } else {
                                self.state =
                                    Some(TokenizationState::BlockCommentInQuotedStringEscape);
                            }
                        }
                        _ => (),
                    }
                }
                // An escaped byte is consumed blindly; its validity is
                // checked later during atom validation/unescaping.
                TokenizationState::InQuotedAtomEscape => {
                    self.state = Some(TokenizationState::InQuotedAtom);
                }
                TokenizationState::BlockCommentInQuotedStringEscape => {
                    self.state = Some(TokenizationState::BlockCommentInQuotedString);
                }
                TokenizationState::LineComment => match *ch {
                    // The newline terminates but is not part of the comment.
                    b'\n' => {
                        self.finish_token(VarTokenKind::LineComment, pos, buffer);
                        self.state = Some(TokenizationState::Start);
                    }
                    b'\r' => {
                        self.finish_token(VarTokenKind::LineComment, pos, buffer);
                        self.state = Some(TokenizationState::CarriageReturn);
                    }
                    _ => (),
                },
                TokenizationState::BlockComment => match *ch {
                    b'"' => self.state = Some(TokenizationState::BlockCommentInQuotedString),
                    b'#' => self.state = Some(TokenizationState::BlockCommentPoundSign),
                    b'|' => self.state = Some(TokenizationState::BlockCommentBar),
                    _ => (),
                },
                TokenizationState::BlockCommentPoundSign => match *ch {
                    b'"' => self.state = Some(TokenizationState::BlockCommentInQuotedString),
                    // `##|` still opens a nested comment: stay here on `#`.
                    b'#' => self.state = Some(TokenizationState::BlockCommentPoundSign),
                    b'|' => {
                        // `#|` inside a block comment: comments nest.
                        self.block_comment_depth += 1;
                        self.state = Some(TokenizationState::BlockComment);
                    }
                    _ => self.state = Some(TokenizationState::BlockComment),
                },
                TokenizationState::BlockCommentBar => match *ch {
                    b'"' => self.state = Some(TokenizationState::BlockCommentInQuotedString),
                    // `||#` still closes: stay here on `|`.
                    b'|' => self.state = Some(TokenizationState::BlockCommentBar),
                    b'#' => {
                        self.block_comment_depth -= 1;
                        if self.block_comment_depth == 0 {
                            // Outermost `|#`: the comment token includes it.
                            self.finish_token(VarTokenKind::BlockComment, pos + 1, buffer);
                            self.state = Some(TokenizationState::Start);
                        } else {
                            self.state = Some(TokenizationState::BlockComment);
                        }
                    }
                    _ => self.state = Some(TokenizationState::BlockComment),
                },
            }
        }

        // Buffer exhausted mid-token: spill the partial token to scratch so
        // the next buffer (or EOF) can complete it.
        match self.state.unwrap() {
            TokenizationState::Start | TokenizationState::CarriageReturn => (),
            TokenizationState::PoundSign
            | TokenizationState::InUnquotedAtom
            | TokenizationState::InUnquotedAtomPoundSign
            | TokenizationState::InUnquotedAtomBar
            | TokenizationState::Bar
            | TokenizationState::InQuotedAtom
            | TokenizationState::InQuotedAtomEscape
            | TokenizationState::LineComment
            | TokenizationState::BlockComment
            | TokenizationState::BlockCommentPoundSign
            | TokenizationState::BlockCommentBar
            | TokenizationState::BlockCommentInQuotedString
            | TokenizationState::BlockCommentInQuotedStringEscape => {
                self.copy_partial_token_to_scratch_buffer(buffer);
            }
        }
    }

    fn eof(&mut self) {
        // `take` terminates the machine; a second `eof` is an error.
        let Some(final_state) = self.state.take() else {
            self.raw_token_refs
                .push_back(Err(TokenizationError::EofCalledMultipleTimes.into()));
            return;
        };

        // Any token completed at EOF necessarily lives in scratch space:
        // there is no current buffer for a `Range` to point into.
        let raw_token_ref_data = RawTokenRefData::Scratch;

        let final_token_ref = match final_state {
            TokenizationState::Start => return,
            // Unquoted atoms (including lone `#` / `|`) are self-terminating
            // at EOF.
            TokenizationState::InUnquotedAtom
            | TokenizationState::InUnquotedAtomBar
            | TokenizationState::InUnquotedAtomPoundSign
            | TokenizationState::Bar => {
                assert!(self.using_scratch_buffer_for_current_token);
                self.complete_token_in_scratch_buffer();
                Ok(RawTokenRef::VarToken(
                    raw_token_ref_data,
                    VarTokenKind::Atom,
                ))
            }
            // Line comments are terminated by EOF just like by newline.
            TokenizationState::LineComment => {
                assert!(self.using_scratch_buffer_for_current_token);
                self.complete_token_in_scratch_buffer();
                Ok(RawTokenRef::VarToken(
                    raw_token_ref_data,
                    VarTokenKind::LineComment,
                ))
            }
            TokenizationState::PoundSign => {
                assert!(self.using_scratch_buffer_for_current_token);
                self.complete_token_in_scratch_buffer();
                Ok(RawTokenRef::VarToken(
                    raw_token_ref_data,
                    VarTokenKind::Atom,
                ))
            }
            TokenizationState::CarriageReturn => Err(TokenizationError::NakedCarriageReturn.into()),
            TokenizationState::InQuotedAtom | TokenizationState::InQuotedAtomEscape => {
                Err(TokenizationError::UnexpectedEofWhileInInQuotedAtom.into())
            }
            TokenizationState::BlockComment
            | TokenizationState::BlockCommentPoundSign
            | TokenizationState::BlockCommentBar
            | TokenizationState::BlockCommentInQuotedString
            | TokenizationState::BlockCommentInQuotedStringEscape => {
                Err(TokenizationError::UnexpectedEofWhileInBlockComment.into())
            }
        };

        self.raw_token_refs.push_back(final_token_ref);
    }
}
680
/// Pairs a [`BasicTapeTokenizer`] with the [`Input`] source that feeds it.
pub struct RawTokenizer<I> {
    input: I,
    tape_tokenizer: BasicTapeTokenizer,
}
685
686impl<I> RawTokenizer<I> {
687 pub fn new(input: I) -> RawTokenizer<I> {
688 RawTokenizer {
689 input,
690 tape_tokenizer: BasicTapeTokenizer::new(),
691 }
692 }
693}
694
695impl<'de, I> RawTokenizer<I>
696where
697 I: Input<'de>,
698{
699 fn process_more_input_if_needed(&mut self) -> Result<HasEnoughData> {
700 loop {
701 if let Some(witness) = self.tape_tokenizer.has_enough_data_to_produce_tokens() {
702 return Ok(witness);
703 }
704
705 match self.input.next_chunk()? {
706 InputChunk::Data(chunk) => self.tape_tokenizer.feed_more_data(chunk),
707 InputChunk::Eof => self.tape_tokenizer.eof(),
708 }
709 }
710 }
711
712 pub fn next_raw_token<'t>(&'t mut self) -> Result<Option<RawToken<'de, 't>>> {
713 let witness = self.process_more_input_if_needed()?;
714 let current_chunk = self.input.current_chunk();
715 self.tape_tokenizer.next_raw_token(witness, current_chunk)
716 }
717
718 pub fn peek_raw_token_kind(&mut self) -> Result<Option<RawTokenKind>> {
719 let witness = self.process_more_input_if_needed()?;
720 self.tape_tokenizer.peek_raw_token_kind(&witness)
721 }
722
723 pub fn advance(&mut self) -> Result<()> {
724 let witness = self.process_more_input_if_needed()?;
725 self.tape_tokenizer.advance(witness)
726 }
727}
728
/// The full tokenizer: skips comments, expands `#;` sexp-comments, and
/// unescapes atoms produced by a [`RawTokenizer`].
pub struct Tokenizer<I> {
    /// Stack of pending `#;` comments being consumed (see
    /// `consume_commented_out_sexp`).
    sexp_comment_nesting_depths: Vec<usize>,
    /// Reusable buffer for atoms whose unescaped form differs from their
    /// serialized bytes.
    scratch_space_for_unescaped_atom: Vec<u8>,
    raw_tokenizer: RawTokenizer<I>,
    /// Error discovered while `peek_kind` skipped a `#;` sexp; replayed (and
    /// cleared) by the next `next` call so peek and next agree.
    peeked_input_error: Option<crate::error::Error>,
}
736
737impl<I> Tokenizer<I> {
738 pub fn new(input: I) -> Tokenizer<I> {
739 Tokenizer {
740 sexp_comment_nesting_depths: vec![],
741 scratch_space_for_unescaped_atom: vec![],
742 raw_tokenizer: RawTokenizer::new(input),
743 peeked_input_error: None,
744 }
745 }
746}
747
impl<'de, I> Tokenizer<I>
where
    I: Input<'de>,
{
    /// Consumes the complete sexp following a `#;` marker.
    ///
    /// Each stack entry tracks the open-paren depth of one pending `#;`;
    /// nested `#;` markers push additional entries. An entry is discharged
    /// when its sexp completes: an atom at depth 0, or a `)` that brings
    /// the depth back to 0.
    fn consume_commented_out_sexp(&mut self) -> Result<()> {
        self.sexp_comment_nesting_depths.push(0);

        while !self.sexp_comment_nesting_depths.is_empty() {
            match self.raw_tokenizer.next_raw_token()? {
                None => return Err(TokenizationError::UnterminatedSexpCommentAtEof.into()),
                Some(RawToken::LeftParen) => {
                    *self.sexp_comment_nesting_depths.last_mut().unwrap() += 1
                }
                Some(RawToken::RightParen) => {
                    let last = self.sexp_comment_nesting_depths.last_mut().unwrap();
                    if *last == 0 {
                        // `#;` immediately followed by `)` — there is no
                        // sexp to comment out.
                        return Err(TokenizationError::UnterminatedSexpCommentAtEndOfList.into());
                    }

                    *last -= 1;
                    if *last == 0 {
                        self.sexp_comment_nesting_depths.pop();
                    }
                }
                Some(RawToken::Atom(atom)) => {
                    // Skipped atoms must still be well-formed.
                    atom.validate()?;
                    if *self.sexp_comment_nesting_depths.last().unwrap() == 0 {
                        self.sexp_comment_nesting_depths.pop();
                    }
                }
                Some(RawToken::SexpComment) => {
                    // Nested `#;`: it must consume a complete sexp of its
                    // own before the outer one can complete.
                    self.sexp_comment_nesting_depths.push(0);
                }
                Some(RawToken::BlockComment(block_comment)) => {
                    // Quoted strings inside skipped block comments must
                    // still be well-formed.
                    block_comment.validate_block_comment()?;
                }
                Some(RawToken::LineComment(_)) => (),
            }
        }

        Ok(())
    }
}
791
impl<'de, I> TokenIterator<'de> for Tokenizer<I>
where
    I: Input<'de>,
{
    fn peek_kind(&mut self) -> Result<Option<TokenKind>> {
        // Replay (without clearing) an error recorded by a previous peek;
        // only `next` clears it.
        if let Some(error) = &self.peeked_input_error {
            return Err(error.clone());
        };

        // Skip comment tokens until a real token (or end of input) is next.
        loop {
            let Some(raw_token_kind) = self.raw_tokenizer.peek_raw_token_kind()? else {
                return Ok(None);
            };

            match raw_token_kind {
                RawTokenKind::LeftParen => return Ok(Some(TokenKind::LeftParen)),
                RawTokenKind::RightParen => return Ok(Some(TokenKind::RightParen)),
                RawTokenKind::Atom => return Ok(Some(TokenKind::Atom)),
                RawTokenKind::LineComment => self.raw_tokenizer.advance()?,
                RawTokenKind::SexpComment => {
                    self.raw_tokenizer.advance()?;
                    if let Some(error) = self.consume_commented_out_sexp().err() {
                        // Remember the error so the following `next` call
                        // reports it too — peeking must not swallow it.
                        self.peeked_input_error = Some(error.clone());
                        return Err(error);
                    };
                }
                RawTokenKind::BlockComment => {
                    // Consume the comment so its bytes can be validated.
                    let Ok(Some(RawToken::BlockComment(comment_bytes))) =
                        self.raw_tokenizer.next_raw_token()
                    else {
                        panic!("peek_raw_token_kind just returned BlockComment");
                    };
                    comment_bytes.validate_block_comment()?;
                }
            }
        }
    }

    fn next<'t>(&'t mut self) -> Result<Option<Token<'de, 't>>> {
        // Surface (and clear) an error recorded by an earlier `peek_kind`.
        if let Some(error) = self.peeked_input_error.take() {
            return Err(error);
        }

        loop {
            let raw_token_kind = match self.raw_tokenizer.peek_raw_token_kind() {
                Ok(None) => return Ok(None),
                Ok(Some(raw_token_kind)) => raw_token_kind,
                Err(err) => {
                    // Consume the erroring entry so repeated calls make
                    // progress. NOTE(review): if the error was queued on the
                    // tape, `advance` returns that same error and `?`
                    // propagates it before the `return` below.
                    self.raw_tokenizer.advance()?;
                    return Err(err.clone());
                }
            };

            match raw_token_kind {
                RawTokenKind::LeftParen => {
                    self.raw_tokenizer.advance()?;
                    return Ok(Some(Token::LeftParen));
                }
                RawTokenKind::RightParen => {
                    self.raw_tokenizer.advance()?;
                    return Ok(Some(Token::RightParen));
                }
                RawTokenKind::Atom => {
                    let Ok(Some(RawToken::Atom(serialized_atom))) =
                        self.raw_tokenizer.next_raw_token()
                    else {
                        panic!("peek_raw_token_kind just returned Atom");
                    };

                    // Unescape the atom. A `Borrowed` source may yield a
                    // `Borrowed` result, but anything derived from a
                    // `Transient` source must stay `Transient`.
                    let atom = match serialized_atom {
                        Ref::Borrowed(serialized_atom) => {
                            match serialized_atom
                                .unescape(&mut self.scratch_space_for_unescaped_atom)?
                            {
                                Ref::Borrowed(atom) => Ref::Borrowed(atom),
                                Ref::Transient(atom) => Ref::Transient(atom),
                            }
                        }
                        Ref::Transient(serialized_atom) => {
                            match serialized_atom
                                .unescape(&mut self.scratch_space_for_unescaped_atom)?
                            {
                                Ref::Borrowed(atom) | Ref::Transient(atom) => {
                                    Ref::Transient(atom)
                                }
                            }
                        }
                    };

                    return Ok(Some(Token::Atom(atom)));
                }
                RawTokenKind::LineComment => self.raw_tokenizer.advance()?,
                RawTokenKind::SexpComment => {
                    self.raw_tokenizer.advance()?;
                    self.consume_commented_out_sexp()?;
                }
                RawTokenKind::BlockComment => {
                    // Consume the comment so its bytes can be validated.
                    let Ok(Some(RawToken::BlockComment(comment_bytes))) =
                        self.raw_tokenizer.next_raw_token()
                    else {
                        panic!("peek_raw_token_kind just returned BlockComment");
                    };

                    comment_bytes.validate_block_comment()?;
                }
            }
        }
    }
}
929
930#[cfg(test)]
931mod tests {
932 use super::*;
933 use crate::atom::AtomData;
934 use crate::error;
935 use crate::input::tests::ExplicitChunksInput;
936 use crate::input::SliceInput;
937 use crate::Ref;
938
939 use bstr::ByteSlice;
940 use insta::assert_snapshot;
941
942 use std::fmt::Write;
943
944 fn raw_tokenize_fragments(buffers: &[&'static [u8]]) -> String {
945 let input = ExplicitChunksInput::new(buffers);
946 let mut raw_tokenizer = RawTokenizer::new(input);
947
948 let mut output = String::new();
949 let o = &mut output;
950
951 loop {
952 let _ = match raw_tokenizer.next_raw_token() {
953 Ok(None) => break,
954 Ok(Some(raw_token)) => writeln!(o, "{}", format_raw_token(raw_token)),
955 Err(err) => writeln!(o, "{}", format_error(err)),
956 };
957 }
958
959 output
960 }
961
962 fn raw_tokenize_str(buffer: &[u8]) -> String {
963 let input = SliceInput::new(buffer);
964 let mut raw_tokenizer = RawTokenizer::new(input);
965
966 let mut output = String::new();
967 let o = &mut output;
968
969 loop {
970 let _ = match raw_tokenizer.next_raw_token() {
971 Ok(None) => break,
972 Ok(Some(raw_token)) => writeln!(o, "{}", format_raw_token(raw_token)),
973 Err(err) => writeln!(o, "{}", format_error(err)),
974 };
975 }
976
977 output
978 }
979
980 fn format_error(err: error::Error) -> String {
981 format!("ERROR: {:?}", err)
982 }
983
984 fn format_raw_token(raw_token: RawToken<'_, '_>) -> String {
985 fn borrowed_or_owned<T: ?Sized>(token_bytes: &Ref<'_, '_, T>) -> &'static str {
986 match token_bytes {
987 Ref::Borrowed(_) => "borrowed",
988 Ref::Transient(_) => "transient",
989 }
990 }
991
992 match raw_token {
993 RawToken::LeftParen => "LeftParen: (".to_owned(),
994 RawToken::RightParen => "RightParen: )".to_owned(),
995 RawToken::SexpComment => "SexpComment: #;".to_owned(),
996 RawToken::Atom(raw_token_bytes) => {
997 let ref_kind = borrowed_or_owned(&raw_token_bytes);
998 let bytes = raw_token_bytes.bytes().as_bstr();
999 format!("Atom: {:?} ({})", bytes, ref_kind)
1000 }
1001 RawToken::LineComment(raw_token_bytes) => {
1002 let ref_kind = borrowed_or_owned(&raw_token_bytes);
1003 let bytes = raw_token_bytes.bytes().as_bstr();
1004 format!("LineComment: {:?} ({})", bytes, ref_kind)
1005 }
1006 RawToken::BlockComment(raw_token_bytes) => {
1007 let ref_kind = borrowed_or_owned(&raw_token_bytes);
1008 let bytes = raw_token_bytes.bytes().as_bstr();
1009 format!("BlockComment: {:?} ({})", bytes, ref_kind)
1010 }
1011 }
1012 }
1013
    // Happy-path lexing: plain atoms, quoted atoms, and the context-
    // sensitive handling of `#` and `|` at token start vs mid-atom.
    #[test]
    fn test_basics() {
        assert_snapshot!(raw_tokenize_str(b"a bc 123 "), @r#"
        Atom: "a" (borrowed)
        Atom: "bc" (borrowed)
        Atom: "123" (borrowed)
        "#);

        assert_snapshot!(raw_tokenize_str(b"a\"123\"b \"\""), @r#"
        Atom: "a" (borrowed)
        Atom: "\"123\"" (borrowed)
        Atom: "b" (borrowed)
        Atom: "\"\"" (borrowed)
        "#);

        assert_snapshot!(raw_tokenize_str(b"## #a #( #) #\"#\" #\r\n#\n| #;|\n# "), @r###"
        Atom: "##" (borrowed)
        Atom: "#a" (borrowed)
        Atom: "#" (borrowed)
        LeftParen: (
        Atom: "#" (borrowed)
        RightParen: )
        Atom: "#" (borrowed)
        Atom: "\"#\"" (borrowed)
        Atom: "#" (borrowed)
        Atom: "#" (borrowed)
        Atom: "|" (borrowed)
        SexpComment: #;
        Atom: "|" (borrowed)
        Atom: "#" (borrowed)
        "###);

        assert_snapshot!(raw_tokenize_str(b"z#a z#( z#) z#\"#\" z#\r\nz#\n| z#;|\n"), @r##"
        Atom: "z#a" (borrowed)
        Atom: "z#" (borrowed)
        LeftParen: (
        Atom: "z#" (borrowed)
        RightParen: )
        Atom: "z#" (borrowed)
        Atom: "\"#\"" (borrowed)
        Atom: "z#" (borrowed)
        Atom: "z#" (borrowed)
        Atom: "|" (borrowed)
        Atom: "z#" (borrowed)
        LineComment: ";|" (borrowed)
        "##);

        assert_snapshot!(raw_tokenize_str(b"|| |a |( |) |\"|\" |\r\n|\n# |;|\n| "), @r##"
        Atom: "||" (borrowed)
        Atom: "|a" (borrowed)
        Atom: "|" (borrowed)
        LeftParen: (
        Atom: "|" (borrowed)
        RightParen: )
        Atom: "|" (borrowed)
        Atom: "\"|\"" (borrowed)
        Atom: "|" (borrowed)
        Atom: "|" (borrowed)
        Atom: "#" (borrowed)
        Atom: "|" (borrowed)
        LineComment: ";|" (borrowed)
        Atom: "|" (borrowed)
        "##);

        assert_snapshot!(raw_tokenize_str(b"z|a z|( z|) z|\"|\" z|\r\nz|\n# z|;|\n"), @r##"
        Atom: "z|a" (borrowed)
        Atom: "z|" (borrowed)
        LeftParen: (
        Atom: "z|" (borrowed)
        RightParen: )
        Atom: "z|" (borrowed)
        Atom: "\"|\"" (borrowed)
        Atom: "z|" (borrowed)
        Atom: "z|" (borrowed)
        Atom: "#" (borrowed)
        Atom: "z|" (borrowed)
        LineComment: ";|" (borrowed)
        "##);
    }
1093
    // Escape sequences inside a quoted atom are passed through verbatim at
    // the raw-token level (validation/unescaping happens later).
    #[test]
    fn test_quoted_string_escapes() {
        assert_snapshot!(
            raw_tokenize_str(b"\"\\\n \\n \\123 \\\\ \\x01 \\x0\""),
            @r#"Atom: "\"\\\n \\n \\123 \\\\ \\x01 \\x0\"" (borrowed)"#);
    }
1100
    // Line comments run to (but do not include) the `\n` or `\r\n`.
    #[test]
    fn test_line_comments() {
        assert_snapshot!(
            raw_tokenize_str(b";\"\"\n;abc\r\n;\n "),
            @r#"
        LineComment: ";\"\"" (borrowed)
        LineComment: ";abc" (borrowed)
        LineComment: ";" (borrowed)
        "#);
    }
1111
    // Block comments include their delimiters, may nest, and may contain
    // quoted strings whose contents don't terminate the comment.
    #[test]
    fn test_block_comments() {
        assert_snapshot!(
            raw_tokenize_str(b"#|a|# _ #|# |# _ #|\"|#\\\"\"|# _ #| #| a |#| |#"),
            @r##"
        BlockComment: "#|a|#" (borrowed)
        Atom: "_" (borrowed)
        BlockComment: "#|# |#" (borrowed)
        Atom: "_" (borrowed)
        BlockComment: "#|\"|#\\\"\"|#" (borrowed)
        Atom: "_" (borrowed)
        BlockComment: "#| #| a |#| |#" (borrowed)
        "##,
        );
    }
1127
    // `#|` / `|#` appearing inside an unquoted atom, or `|#` with no open
    // comment, are fatal tokenization errors.
    #[test]
    fn test_block_comment_errors() {
        assert_snapshot!(raw_tokenize_str(b"a#|b"), @"ERROR: TokenizationError(BlockCommentStartTokenInUnquotedAtom)");
        assert_snapshot!(raw_tokenize_str(b"a##|b"), @"ERROR: TokenizationError(BlockCommentStartTokenInUnquotedAtom)");
        assert_snapshot!(raw_tokenize_str(b"a|#b"), @"ERROR: TokenizationError(BlockCommentEndTokenInUnquotedAtom)");
        assert_snapshot!(raw_tokenize_str(b"a||#b"), @"ERROR: TokenizationError(BlockCommentEndTokenInUnquotedAtom)");
        assert_snapshot!(raw_tokenize_str(b"|#"), @"ERROR: TokenizationError(UnexpectedEndOfBlockComment)");
    }
1136
    // `#;` is only a sexp-comment at token start; a `#` ending an atom plus
    // `;` starts a line comment instead.
    #[test]
    fn test_sexp_comments() {
        assert_snapshot!(raw_tokenize_str(b"#; a#;x\n##;y\n"), @r###"
        SexpComment: #;
        Atom: "a#" (borrowed)
        LineComment: ";x" (borrowed)
        Atom: "##" (borrowed)
        LineComment: ";y" (borrowed)
        "###);
    }
1147
    // Tokens split across input chunks are stitched together in scratch
    // space and therefore come back "transient" rather than "borrowed".
    #[test]
    fn test_partial_tokens() {
        assert_snapshot!(
            raw_tokenize_fragments(&[b"abc", b"", b"def", b"ghi "]),
            @r#"Atom: "abcdefghi" (transient)"#,
        );

        assert_snapshot!(
            raw_tokenize_fragments(&[b";abc", b"def", b"ghi\n"]),
            @r#"LineComment: ";abcdefghi" (transient)"#,
        );

        assert_snapshot!(
            raw_tokenize_fragments(&[b"#| abc", b"def", b"ghi |# "]),
            @r##"BlockComment: "#| abcdefghi |#" (transient)"##,
        );
    }
1165
    // A `#` at a chunk boundary must keep its context-sensitive meaning
    // (`#;`, `#|`, or atom prefix) when the next chunk arrives.
    #[test]
    fn test_handling_of_pounds_across_buffers() {
        assert_snapshot!(
            raw_tokenize_fragments(&[b"#", b"; #", b"| |# #", b"a #", b"# "]),
            @r###"
        SexpComment: #;
        BlockComment: "#| |#" (transient)
        Atom: "#a" (transient)
        Atom: "##" (transient)
        "###,
        );
    }
1182
    // EOF finishes in-progress atoms and line comments (from scratch, hence
    // "transient"); a dangling `\r` at EOF is an error.
    #[test]
    fn test_eof() {
        assert_snapshot!(raw_tokenize_fragments(&[b"a\r"]), @r#"
        Atom: "a" (transient)
        ERROR: TokenizationError(NakedCarriageReturn)
        "#);

        assert_snapshot!(raw_tokenize_fragments(&[b"a"]), @r#"Atom: "a" (transient)"#);

        assert_snapshot!(raw_tokenize_fragments(&[b"a|"]), @r#"Atom: "a|" (transient)"#);

        assert_snapshot!(raw_tokenize_fragments(&[b"a#"]), @r#"Atom: "a#" (transient)"#);

        assert_snapshot!(raw_tokenize_fragments(&[b"|"]), @r#"Atom: "|" (transient)"#);

        assert_snapshot!(raw_tokenize_fragments(&[b";"]), @r#"LineComment: ";" (transient)"#);

        assert_snapshot!(raw_tokenize_fragments(&[b";"]), @r#"LineComment: ";" (transient)"#);

        assert_snapshot!(raw_tokenize_fragments(&[b"#"]), @r##"Atom: "#" (transient)"##);
    }
1204
    /// EOF inside an unterminated `#| ... |#` block comment is always
    /// `UnexpectedEofWhileInBlockComment`, even when the input ends on a
    /// partial closer (`#`, `|`) or inside a quoted string / backslash escape
    /// within the comment.
    #[test]
    fn test_eof_errors() {
        assert_snapshot!(raw_tokenize_fragments(&[b"#|"]), @"ERROR: TokenizationError(UnexpectedEofWhileInBlockComment)");
        assert_snapshot!(raw_tokenize_fragments(&[b"#| #"]), @"ERROR: TokenizationError(UnexpectedEofWhileInBlockComment)");
        assert_snapshot!(raw_tokenize_fragments(&[b"#| |"]), @"ERROR: TokenizationError(UnexpectedEofWhileInBlockComment)");
        assert_snapshot!(raw_tokenize_fragments(&[b"#| \""]), @"ERROR: TokenizationError(UnexpectedEofWhileInBlockComment)");
        assert_snapshot!(raw_tokenize_fragments(&[b"#| \"\\"]), @"ERROR: TokenizationError(UnexpectedEofWhileInBlockComment)");
    }
1213
    /// Basic raw-tokenizer coverage: whitespace-separated atoms, line
    /// comments, and block comments, each exercised both within a single
    /// fragment and split across fragment boundaries.
    #[test]
    fn test_raw_tokenizer() {
        // Atoms split by whitespace, with a fragment boundary mid-stream.
        assert_snapshot!(
            raw_tokenize_fragments(&[b"a1 a2", b" a3"]),
            @r#"
        Atom: "a1" (transient)
        Atom: "a2" (transient)
        Atom: "a3" (transient)
        "#,
        );

        // One atom spanning three fragments.
        assert_snapshot!(
            raw_tokenize_fragments(&[b"abc", b"def", b"ghi"]),
            @r#"Atom: "abcdefghi" (transient)"#,
        );

        // Newline-terminated line comments; the newline itself is not part
        // of the comment token.
        assert_snapshot!(
            raw_tokenize_fragments(&[b"; lc1\n ; lc2", b"\n ; lc3"]),
            @r#"
        LineComment: "; lc1" (transient)
        LineComment: "; lc2" (transient)
        LineComment: "; lc3" (transient)
        "#,
        );

        // A line comment spanning fragments, unterminated at EOF.
        assert_snapshot!(
            raw_tokenize_fragments(&[b"; abc", b"def", b"ghi"]),
            @r#"LineComment: "; abcdefghi" (transient)"#,
        );

        // Block comments, including one whose `|#` closer arrives in the
        // next fragment.
        assert_snapshot!(
            raw_tokenize_fragments(&[b"#| bc1 |# #| bc2 ", b"|#"]),
            @r##"
        BlockComment: "#| bc1 |#" (transient)
        BlockComment: "#| bc2 |#" (transient)
        "##,
        );

        // A block comment body split across fragments.
        assert_snapshot!(
            raw_tokenize_fragments(&[b"#| abc", b"def", b"ghi |#"]),
            @r##"BlockComment: "#| abcdefghi |#" (transient)"##,
        );
    }
1257
    /// `RawTokenBytes::validate_block_comment` checks string escapes only
    /// *inside* quoted strings within the comment; the same escape sequences
    /// outside quotes are left alone. Nesting markers (`#|` / `|#`) do not
    /// affect validation.
    #[test]
    fn test_block_comment_validation() {
        // Render Ok/Err as a string for snapshotting.
        fn b(bytes: &[u8]) -> String {
            match RawTokenBytes::new(bytes).validate_block_comment() {
                Ok(()) => "Ok".to_owned(),
                Err(err) => format!("{:?}", err),
            }
        }

        // Valid quoted strings and escapes inside the comment.
        assert_snapshot!(b(b"#| |#"), @"Ok");
        assert_snapshot!(b(br#"#| "abc" |#"#), @"Ok");
        assert_snapshot!(b(br#"#| "\\ \' \" \ ""abc" |#"#), @"Ok");
        assert_snapshot!(b(br#"#| "\n \t \b \r""abc" |#"#), @"Ok");
        assert_snapshot!(b(br#"#| "\x00 \xff""abc" |#"#), @"Ok");
        assert_snapshot!(b(br#"#| "\x000 \x255""abc" |#"#), @"Ok");

        // Bogus escape sequences are ignored when not inside quotes.
        assert_snapshot!(b(br#"#| \xgg \x0 \256 \999 |#"#), @"Ok");

        // Inside quotes, malformed escapes are rejected with a specific error.
        assert_snapshot!(b(br#"#| "\xgg" |#"#), @"InvalidHexadecimalEscape");
        assert_snapshot!(b(br#"#| "\xf " |#"#), @"InvalidHexadecimalEscape");
        assert_snapshot!(b(br#"#| "\xf" |#"#), @"UnterminatedHexadecimalEscape");
        assert_snapshot!(b(br#"#| "\256" |#"#), @"OutOfRangeDecimalEscape");
        assert_snapshot!(b(br#"#| "\25 " |#"#), @"InvalidDecimalEscape");
        assert_snapshot!(b(br#"#| "\25" |#"#), @"UnterminatedDecimalEscape");
        assert_snapshot!(b(br#"#| "\"""\xgg" |#"#), @"InvalidHexadecimalEscape");
        assert_snapshot!(b(br#"#| " \" \""#), @"UnterminatedQuote");

        // Validation does not require well-formed comment delimiters.
        assert_snapshot!(b(br#""#), @"Ok");
        assert_snapshot!(b(br#"#| "#), @"Ok");
        assert_snapshot!(b(br#" |#"#), @"Ok");
        assert_snapshot!(b(br#"#| #| |# |# |#"#), @"Ok");
    }
1294
1295 fn format_token(token: Token<'_, '_>) -> String {
1296 fn borrowed_or_owned(token_bytes: &Ref<'_, '_, AtomData>) -> &'static str {
1297 match token_bytes {
1298 Ref::Borrowed(_) => "borrowed",
1299 Ref::Transient(_) => "transient",
1300 }
1301 }
1302
1303 match token {
1304 Token::LeftParen => "LeftParen: (".to_owned(),
1305 Token::RightParen => "RightParen: )".to_owned(),
1306 Token::Atom(data) => {
1307 let ref_kind = borrowed_or_owned(&data);
1308 format!("Atom: {:?} ({})", data.bytes().as_bstr(), ref_kind)
1309 }
1310 }
1311 }
1312
1313 fn tokenize_str(buffer: &[u8]) -> String {
1314 let input = SliceInput::new(buffer);
1315 let mut tokenizer = Tokenizer::new(input);
1316
1317 let mut output = String::new();
1318 let o = &mut output;
1319
1320 loop {
1321 let _ = tokenizer.peek_kind();
1323 let _ = match tokenizer.next() {
1324 Ok(None) => break,
1325 Ok(Some(token)) => writeln!(o, "{}", format_token(token)),
1326 Err(err) => writeln!(o, "{}", format_error(err)),
1327 };
1328 }
1329
1330 output
1331 }
1332
    /// The higher-level `Tokenizer` (vs. the raw tokenizer above): comments
    /// are dropped entirely, quoted atoms are unquoted, and parens become
    /// `LeftParen`/`RightParen` tokens. Paren balance is not checked here.
    #[test]
    fn test_tokenizer() {
        // Quoted atoms lose their quotes; an escape (`\e`) forces the atom
        // into the transient (buffered) representation.
        assert_snapshot!(tokenize_str(br#"a "b c" "d\e" f"#), @r#"
        Atom: "a" (borrowed)
        Atom: "b c" (borrowed)
        Atom: "d\\e" (transient)
        Atom: "f" (transient)
        "#);

        // Block and line comments are filtered out of the token stream.
        assert_snapshot!(tokenize_str(b"(a #| xyz |# b c ; abc \n)"), @r#"
        LeftParen: (
        Atom: "a" (borrowed)
        Atom: "b" (borrowed)
        Atom: "c" (borrowed)
        RightParen: )
        "#);

        // Unbalanced parens are tokenized as-is — balancing is the parser's
        // job, not the tokenizer's.
        assert_snapshot!(tokenize_str(b") ) ( ("), @r"
        RightParen: )
        RightParen: )
        LeftParen: (
        LeftParen: (
        ");
    }
1358
    /// `#;` at the `Tokenizer` level skips the following s-expression
    /// entirely (atom or whole parenthesized list), and stacked `#;` markers
    /// skip one following sexp each. A `#;` with nothing after it is an
    /// error, distinguished by whether EOF or `)` cut it short.
    #[test]
    fn test_tokenizer_handles_sexp_comments() {
        // `#; b` skips the atom; `#; (x y z)` skips the whole list.
        assert_snapshot!(tokenize_str(b"a #; b #; (x y z)"), @r#"Atom: "a" (borrowed)"#);

        // Stacked `#;` markers compose, including `#;` nested inside the
        // skipped list.
        assert_snapshot!(tokenize_str(b"a #; #; #; w (x #; 0 y) z b c"), @r#"
        Atom: "a" (borrowed)
        Atom: "b" (borrowed)
        Atom: "c" (transient)
        "#);

        // `#;` with no following sexp before EOF.
        assert_snapshot!(tokenize_str(b"a #;"), @r#"
        Atom: "a" (borrowed)
        ERROR: TokenizationError(UnterminatedSexpCommentAtEof)
        "#);

        // `#;` with no following sexp before the closing paren.
        assert_snapshot!(tokenize_str(b"(#;)"), @r"
        LeftParen: (
        ERROR: TokenizationError(UnterminatedSexpCommentAtEndOfList)
        ");
    }
1379}