// ocaml_sexplib — tokenizer.rs
1use std::collections::VecDeque;
2use std::ops::Range;
3
4use crate::atom::{AtomData, PlausibleSerializedAtom};
5use crate::error::{Result, TokenizationError};
6use crate::input::{Input, InputChunk};
7use crate::Ref;
8
/// A structural token produced by a [`TokenIterator`].
///
/// `'de` is the lifetime of the deserializer input; `'t` is the lifetime of
/// the iterator borrow that produced this token.
#[derive(Debug)]
pub enum Token<'de, 't> {
    LeftParen,
    /// An atom together with its data, borrowed from the input or transiently.
    Atom(Ref<'de, 't, AtomData>),
    RightParen,
}
15
16impl<'de, 't> Token<'de, 't> {
17    pub fn kind(&self) -> TokenKind {
18        match self {
19            Token::LeftParen => TokenKind::LeftParen,
20            Token::Atom(_) => TokenKind::Atom,
21            Token::RightParen => TokenKind::RightParen,
22        }
23    }
24}
25
/// A [`Token`] stripped of its payload; what `peek_kind` returns.
#[derive(Copy, Clone, Debug)]
pub enum TokenKind {
    LeftParen,
    Atom,
    RightParen,
}
32
/// A stream of structural tokens ('(' / atom / ')') with one-token lookahead.
pub trait TokenIterator<'de> {
    /// Returns the next token, or `None` at end of input.
    fn next<'t>(&'t mut self) -> Result<Option<Token<'de, 't>>>;

    /// Returns the kind of the next token without consuming it.
    fn peek_kind(&mut self) -> Result<Option<TokenKind>>;
}
38
/// Kinds of tokens whose content varies (and therefore carry bytes),
/// as opposed to the fixed punctuation tokens.
#[derive(Copy, Clone, Debug)]
pub enum VarTokenKind {
    Atom,
    LineComment,
    BlockComment,
}
45
/// Raw bytes that have already been designated by the tokenizer as representing an
/// input token, and thus already have certain guarantees. Will not be empty.
///
/// Atoms: If it starts with a double quote, it will end with a double quote. It may
/// contain escape sequences in between. If it doesn't start with a double quote, it
/// represents a valid unescaped atom.
///
/// Line comments: Starts with ';' and doesn't contain any newlines. Always valid.
///
/// Block comments: Starts with "#|" and ends with "|#". Double quotes will be balanced
/// and may contain escaped values in between.
#[derive(Debug)]
#[repr(transparent)]
pub struct RawTokenBytes([u8]);
60
impl RawTokenBytes {
    /// Wraps a byte slice the tokenizer has already classified as one token.
    pub fn new(bytes: &[u8]) -> &RawTokenBytes {
        // SAFETY: RawTokenBytes is just a wrapper around [u8], enforced by #[repr(transparent)],
        // therefore converting &[u8] to &RawTokenBytes is safe.
        unsafe { &*(bytes as *const [u8] as *const RawTokenBytes) }
    }

    /// Returns the underlying token bytes.
    pub fn bytes(&self) -> &[u8] {
        &self.0
    }

    /// Checks that every double-quoted string inside a block comment token is
    /// terminated and uses only escape sequences that are valid for atoms.
    pub fn validate_block_comment(&self) -> std::result::Result<(), TokenizationError> {
        let mut bytes = &self.0;

        // Someday: Use memchr
        while let Some(open_quote_index) = bytes.iter().position(|b| *b == b'"') {
            // `bytes` now starts just past the opening quote.
            bytes = &bytes[(open_quote_index + 1)..];

            // Index of the closing quote relative to `bytes`, accumulated as
            // we hop over backslash escapes below.
            let mut close_quote_index = 0;
            let mut remaining_bytes = bytes;

            'find_close_quote: loop {
                // Someday: Use memchr2
                let Some(quote_or_backslash_index) = remaining_bytes
                    .iter()
                    .position(|b| *b == b'"' || *b == b'\\')
                else {
                    return Err(TokenizationError::UnterminatedQuote);
                };

                close_quote_index += quote_or_backslash_index;

                if remaining_bytes[quote_or_backslash_index] == b'"' {
                    break 'find_close_quote;
                }

                // Found a backslash; it must be followed by at least one byte.
                if quote_or_backslash_index + 1 >= remaining_bytes.len() {
                    return Err(TokenizationError::UnterminatedBackslashEscape);
                }

                // Skip past the escaped character (we'll validate the actual escape below).
                close_quote_index += 2;
                remaining_bytes = &remaining_bytes[(quote_or_backslash_index + 2)..];
            }

            // Quoted sections in block comments must follow the same rules as regular atoms.
            PlausibleSerializedAtom::validate_quote_escaping(&bytes[..close_quote_index])?;

            // Resume scanning just past the closing quote.
            bytes = &bytes[(close_quote_index + 1)..];
        }

        Ok(())
    }
}
115
/// A token from the raw token stream, including the comment tokens that the
/// higher-level [`Token`] stream filters out.
#[derive(Debug)]
pub enum RawToken<'de, 't> {
    LeftParen,
    RightParen,
    Atom(Ref<'de, 't, PlausibleSerializedAtom>),
    /// Bytes of a line comment, starting with ';', without the newline.
    LineComment(Ref<'de, 't, RawTokenBytes>),
    /// Bytes of a block comment, "#|" and "|#" delimiters included.
    BlockComment(Ref<'de, 't, RawTokenBytes>),
    /// The "#;" marker: the next complete sexp is commented out.
    SexpComment,
}
125
// A payload-free mirror of `RawToken`.
// We only use this type to get around a limitation in the borrow checker.
pub enum RawTokenKind {
    LeftParen,
    RightParen,
    Atom,
    LineComment,
    BlockComment,
    SexpComment,
}
135
136impl<'de, 't> RawToken<'de, 't> {
137    fn from_token_bytes_and_kind(
138        token_bytes: Ref<'de, 't, [u8]>,
139        kind: VarTokenKind,
140    ) -> RawToken<'de, 't> {
141        if matches!(kind, VarTokenKind::Atom) {
142            let plausible_atom = match token_bytes {
143                Ref::Borrowed(bytes) => Ref::Borrowed(PlausibleSerializedAtom::new(bytes).unwrap()),
144                Ref::Transient(bytes) => {
145                    Ref::Transient(PlausibleSerializedAtom::new(bytes).unwrap())
146                }
147            };
148
149            return RawToken::Atom(plausible_atom);
150        }
151
152        let raw_token_bytes = match token_bytes {
153            Ref::Borrowed(bytes) => Ref::Borrowed(RawTokenBytes::new(bytes)),
154            Ref::Transient(bytes) => Ref::Transient(RawTokenBytes::new(bytes)),
155        };
156
157        match kind {
158            VarTokenKind::LineComment => RawToken::LineComment(raw_token_bytes),
159            VarTokenKind::BlockComment => RawToken::BlockComment(raw_token_bytes),
160            VarTokenKind::Atom => unreachable!(),
161        }
162    }
163}
164
/// Where a queued token's bytes live: a `Range` into the chunk currently
/// being processed, or the tokenizer's scratch buffer (for tokens that span
/// chunk boundaries or are finished at EOF).
#[derive(Debug)]
enum RawTokenRefData {
    Range(Range<usize>),
    Scratch,
}
170
/// A recognized token, stored without borrowing the input so it can sit in
/// the tokenizer's queue until the caller asks for it.
#[derive(Debug)]
enum RawTokenRef {
    LeftParen,
    RightParen,
    SexpComment,
    VarToken(RawTokenRefData, VarTokenKind),
}
178
179impl RawTokenRef {
180    fn to_raw_token_kind(&self) -> RawTokenKind {
181        match self {
182            RawTokenRef::LeftParen => RawTokenKind::LeftParen,
183            RawTokenRef::RightParen => RawTokenKind::RightParen,
184            RawTokenRef::VarToken(_, VarTokenKind::Atom) => RawTokenKind::Atom,
185            RawTokenRef::VarToken(_, VarTokenKind::LineComment) => RawTokenKind::LineComment,
186            RawTokenRef::VarToken(_, VarTokenKind::BlockComment) => RawTokenKind::BlockComment,
187            RawTokenRef::SexpComment => RawTokenKind::SexpComment,
188        }
189    }
190}
191
/// Zero-sized witness that the tape can currently answer token queries;
/// obtained from `has_enough_data_to_produce_tokens`. The private `(())`
/// field prevents construction outside this module.
pub struct HasEnoughData(());
193
/// A push-based tokenizer: callers feed raw bytes in with `feed_more_data`
/// (then `eof`), and pull tokens back out once
/// `has_enough_data_to_produce_tokens` yields a witness.
pub trait RawTokenTape {
    /// Feeds the next chunk of input bytes to the tokenizer.
    fn feed_more_data(&mut self, data: &[u8]);
    /// Signals that the input is exhausted.
    fn eof(&mut self);
    /// Returns a witness when at least one token (or a terminal condition)
    /// is available; `None` means more data must be fed first.
    fn has_enough_data_to_produce_tokens(&self) -> Option<HasEnoughData>;

    /// Pops the next raw token. `current_data` must be the most recently fed
    /// chunk so that range-based token refs can be resolved against it.
    fn next_raw_token<'de, 't>(
        &'t mut self,
        witness: HasEnoughData,
        current_data: Option<Ref<'de, 't, [u8]>>,
    ) -> Result<Option<RawToken<'de, 't>>>;

    /// Returns the kind of the next token without consuming it.
    fn peek_raw_token_kind(&self, witness: &HasEnoughData) -> Result<Option<RawTokenKind>>;

    /// Discards the next token (still surfacing a queued error).
    fn advance(&mut self, witness: HasEnoughData) -> Result<()>;
}
209
210// Sexplib lexer: https://github.com/janestreet/sexplib/blob/master/src/lexer.mll
211//
212// Rules:
213// - Special characters:
214//   - Parentheses: '(' and ')'
215//   - Start of line comment: ';'
216//   - Start of quoted atom: '"'
217//   - Whitespace:
218//     - Space: ' '
219//     - Tab: '\t' (hex: 0x09)
220//     - Newline: '\n' (hex: 0x0a)
221//     - CRLF: "\r\n" (hex: 0x0d 0x0a)
222//     - Naked carriage returns (0x0d) are a lexer ERROR
223// - Unquoted atoms:
224//   - Backslashes ('\\') do nothing (just another character)
225//   - Cannot contain '#|' or '|#' (start or end of block comment)
226//   - A '#;' in an unquoted atom is treated as end of atom and then a line comment
227// - Quoted atoms:
228//   - Backslash escapes:
//     - character escapes: \ ' " <space> (backslash, single/double quote, space)
230//     - control escapes: n t b r (newline, tab, backspace, carriage return)
231//     - decimal escape: \ddd -> ddd (in decimal) as a byte
232//       - it is a _lexer_ error if ddd > 255 (this seems silly; we won't do this)
233//     - hexadecimal escape: \xhh -> hh (in hexadecimal) as a byte
234//     - line wrapping escape: \ (Newline or CRLF) (Space or tab)
235//       - these bytes are totally ignored by the parser
236//     - a backslash followed by any other character is just treated as
237//       a literal backslash and then that character
238//   - Newlines (0x0a) do not get special treatment
239//     - CRLF is also not special
240//     - Naked carriage returns (0x0d) are permitted
241// - Line comments:
242//   - Start with ';', go until newline or CRLF
243// - Sexp comment: "#;"
244//   - Immediately goes back to start state
245//   - Within an unquoted atom, '#' is treated as part of the atom + line comment
246// - Block comments:
247//   - Start with "#|", end with "|#"
248//   - Can be nested
249//   - Strings within quoted atoms follow exact same escaping rules as normal quoted atoms
250//   - Naked carriage returns _are_ allowed
251
/// Lexer states for [`BasicTapeTokenizer`]'s byte-at-a-time state machine.
/// The `...PoundSign` / `...Bar` states remember that the previous byte was
/// '#' or '|' so the two-byte delimiters "#|", "|#", and "#;" can be
/// recognized even across chunk boundaries.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum TokenizationState {
    Start,
    // Saw '\r'; the next byte must be '\n' or it's an error.
    CarriageReturn,
    InUnquotedAtom,
    InUnquotedAtomPoundSign,
    InUnquotedAtomBar,
    InQuotedAtom,
    // Saw '\\' inside a quoted atom; the next byte is consumed verbatim.
    InQuotedAtomEscape,
    LineComment,
    // Saw '#' at token start: may become an atom, "#|", or "#;".
    PoundSign,
    // Saw '|' at token start: may become an atom; "|#" here is an error.
    Bar,
    BlockComment,
    BlockCommentPoundSign,
    BlockCommentBar,
    BlockCommentInQuotedString,
    BlockCommentInQuotedStringEscape,
}
270
/// A [`RawTokenTape`] implementation: a byte-at-a-time state machine that
/// queues token references as it consumes input chunks.
pub struct BasicTapeTokenizer {
    // None when done tokenizing.
    // Someday: indicate EOF vs error states separately?
    state: Option<TokenizationState>,
    // Bytes of the most recently *finished* scratch-buffered token;
    // `Scratch` token refs resolve against this buffer.
    scratch_buffer_for_a_previous_token: Vec<u8>,
    // Accumulates a token that spans chunk boundaries.
    scratch_buffer_for_current_token: Vec<u8>,
    // True when the current token began in an earlier chunk and therefore
    // (partially) lives in the scratch buffer, not the current chunk.
    using_scratch_buffer_for_current_token: bool,
    // Tokens recognized so far but not yet handed to the caller.
    raw_token_refs: VecDeque<Result<RawTokenRef>>,
    // byte_offset: u64,
    // line_num: u64,
    // col_num: u64,
    // Current "#| ... |#" nesting depth; block comments can nest.
    block_comment_depth: i64,
    // Only valid during one iteration
    start_of_current_token: usize,
}
286
// Sexp whitespace bytes: space, newline, tab, and form feed (0x0c).
// Carriage returns are handled separately ("\r\n" counts as a newline;
// a naked '\r' is a tokenization error).
macro_rules! whitespace {
    () => {
        b' ' | b'\n' | b'\t' | b'\x0c'
    };
}
292
impl BasicTapeTokenizer {
    /// Creates a tokenizer in the initial (start-of-input) state.
    pub fn new() -> Self {
        BasicTapeTokenizer {
            state: Some(TokenizationState::Start),
            scratch_buffer_for_a_previous_token: vec![],
            scratch_buffer_for_current_token: vec![],
            using_scratch_buffer_for_current_token: false,
            raw_token_refs: VecDeque::new(),
            // byte_offset: 0,
            // line_num: 1,
            // col_num: 0,
            block_comment_depth: 0,
            start_of_current_token: 0,
        }
    }

    /// Records that a new token begins at `pos` in the current chunk and
    /// switches to `state`.
    fn start_new_token(&mut self, pos: usize, state: TokenizationState) {
        self.using_scratch_buffer_for_current_token = false;
        self.start_of_current_token = pos;
        self.state = Some(state);
    }

    /// Copies the in-progress token's bytes (from its start to the end of
    /// `buffer`) into the scratch buffer; called when a chunk ends mid-token.
    fn copy_partial_token_to_scratch_buffer(&mut self, buffer: &[u8]) {
        let partial_token = &buffer[self.start_of_current_token..];
        if !self.using_scratch_buffer_for_current_token {
            self.scratch_buffer_for_current_token.clear();
            self.using_scratch_buffer_for_current_token = true;
        }
        self.scratch_buffer_for_current_token
            .extend_from_slice(partial_token);
    }

    /// Queues a finished token ending just before `buffer[ends_before]`:
    /// either as a `Range` into the current chunk, or — if the token spans
    /// chunks — as `Scratch`, after appending its final bytes.
    fn finish_token(&mut self, kind: VarTokenKind, ends_before: usize, buffer: &[u8]) {
        let range = self.start_of_current_token..ends_before;
        let raw_token_ref_data = if self.using_scratch_buffer_for_current_token {
            let partial_token = &buffer[range];
            self.scratch_buffer_for_current_token
                .extend_from_slice(partial_token);
            self.complete_token_in_scratch_buffer();
            RawTokenRefData::Scratch
        } else {
            RawTokenRefData::Range(range)
        };

        self.raw_token_refs
            .push_back(Ok(RawTokenRef::VarToken(raw_token_ref_data, kind)));
    }

    fn complete_token_in_scratch_buffer(&mut self) {
        // Make the scratch buffer for the current token into the scratch
        // buffer for a previous token.
        std::mem::swap(
            &mut self.scratch_buffer_for_a_previous_token,
            &mut self.scratch_buffer_for_current_token,
        );
        self.scratch_buffer_for_current_token.clear();
    }
}
351
352impl Default for BasicTapeTokenizer {
353    fn default() -> Self {
354        Self::new()
355    }
356}
357
impl RawTokenTape for BasicTapeTokenizer {
    // Tokens can be produced once at least one token ref is queued, or once
    // tokenization has terminated (`state` is `None`).
    fn has_enough_data_to_produce_tokens(&self) -> Option<HasEnoughData> {
        if self.raw_token_refs.is_empty() && self.state.is_some() {
            None
        } else {
            Some(HasEnoughData(()))
        }
    }

    fn peek_raw_token_kind(&self, _: &HasEnoughData) -> Result<Option<RawTokenKind>> {
        match self.raw_token_refs.front() {
            None => Ok(None),
            Some(Err(err)) => Err(err.clone()),
            Some(Ok(raw_token_ref)) => Ok(Some(raw_token_ref.to_raw_token_kind())),
        }
    }

    // Pops the next queued token ref and resolves it to actual bytes:
    // `Range` refs index into `current_data` (the most recently fed chunk);
    // `Scratch` refs read the previous-token scratch buffer.
    fn next_raw_token<'de, 't>(
        &'t mut self,
        _: HasEnoughData,
        current_data: Option<Ref<'de, 't, [u8]>>,
    ) -> Result<Option<RawToken<'de, 't>>> {
        match self.raw_token_refs.pop_front() {
            None => Ok(None),
            Some(Err(error)) => Err(error),
            Some(Ok(raw_token_ref)) => {
                let raw_token = match raw_token_ref {
                    RawTokenRef::LeftParen => RawToken::LeftParen,
                    RawTokenRef::RightParen => RawToken::RightParen,
                    RawTokenRef::SexpComment => RawToken::SexpComment,
                    RawTokenRef::VarToken(raw_token_ref_data, token_kind) => {
                        let raw_token_bytes = match raw_token_ref_data {
                            RawTokenRefData::Scratch => {
                                Ref::Transient(self.scratch_buffer_for_a_previous_token.as_slice())
                            }
                            RawTokenRefData::Range(range) => match current_data {
                                Some(data) => data.index(range.clone()),
                                None => panic!("TapeTokenizer has stale `Range` ref."),
                            },
                        };

                        RawToken::from_token_bytes_and_kind(raw_token_bytes, token_kind)
                    }
                };

                Ok(Some(raw_token))
            }
        }
    }

    // Discards the next queued token ref, still surfacing a queued error.
    fn advance(&mut self, _: HasEnoughData) -> Result<()> {
        match self.raw_token_refs.pop_front() {
            None => Ok(()),
            Some(Err(error)) => Err(error),
            Some(Ok(_raw_token_ref)) => Ok(()),
        }
    }

    // Runs the state machine over `buffer`, queueing token refs. Tokens that
    // end inside `buffer` become `Range` refs (or `Scratch` refs if they
    // began in an earlier chunk); a token still open at the end of `buffer`
    // has its bytes copied to the scratch buffer for the next call.
    fn feed_more_data(&mut self, buffer: &[u8]) {
        assert!(self.raw_token_refs.is_empty());

        // Immediately return if no data to process
        if buffer.is_empty() {
            return;
        }

        if self.state.is_none() {
            self.raw_token_refs
                .push_back(Err(TokenizationError::TriedToProcessMoreDataAfterEof.into()));
            return;
        };

        self.start_of_current_token = 0;

        for (pos, ch) in buffer.iter().enumerate() {
            // `state` is `Some` here: it was checked above, and the error
            // arms below that set it to `None` return immediately.
            match self.state.unwrap() {
                TokenizationState::Start => match *ch {
                    whitespace!() => (),
                    b'(' => self.raw_token_refs.push_back(Ok(RawTokenRef::LeftParen)),
                    b')' => self.raw_token_refs.push_back(Ok(RawTokenRef::RightParen)),
                    b'\r' => self.state = Some(TokenizationState::CarriageReturn),
                    b'"' => self.start_new_token(pos, TokenizationState::InQuotedAtom),
                    b';' => self.start_new_token(pos, TokenizationState::LineComment),
                    b'#' => self.start_new_token(pos, TokenizationState::PoundSign),
                    b'|' => self.start_new_token(pos, TokenizationState::Bar),
                    _ => self.start_new_token(pos, TokenizationState::InUnquotedAtom),
                },
                TokenizationState::CarriageReturn => {
                    // A '\r' is only legal as part of a "\r\n" pair.
                    if *ch != b'\n' {
                        self.raw_token_refs
                            .push_back(Err(TokenizationError::NakedCarriageReturn.into()));
                        // Someday: Make `state` be `Eof`, `Error` or `Some`.
                        self.state = None;
                        return;
                    }
                    self.state = Some(TokenizationState::Start);
                }
                // All of these states are inside (or possibly starting) an
                // unquoted atom; the *PoundSign/*Bar variants additionally
                // remember the previous byte for "#|" / "|#" / "#;" handling.
                TokenizationState::InUnquotedAtom
                | TokenizationState::InUnquotedAtomPoundSign
                | TokenizationState::InUnquotedAtomBar
                | TokenizationState::PoundSign
                | TokenizationState::Bar => match *ch {
                    whitespace!() => {
                        self.finish_token(VarTokenKind::Atom, pos, buffer);
                        self.state = Some(TokenizationState::Start);
                    }
                    b'(' => {
                        self.finish_token(VarTokenKind::Atom, pos, buffer);
                        self.raw_token_refs.push_back(Ok(RawTokenRef::LeftParen));
                        self.state = Some(TokenizationState::Start);
                    }
                    b')' => {
                        self.finish_token(VarTokenKind::Atom, pos, buffer);
                        self.raw_token_refs.push_back(Ok(RawTokenRef::RightParen));
                        self.state = Some(TokenizationState::Start);
                    }
                    b'\r' => {
                        self.finish_token(VarTokenKind::Atom, pos, buffer);
                        self.state = Some(TokenizationState::CarriageReturn);
                    }
                    b'"' => {
                        self.finish_token(VarTokenKind::Atom, pos, buffer);
                        self.start_new_token(pos, TokenizationState::InQuotedAtom);
                    }
                    b';' => match self.state.unwrap() {
                        // A lone '#' followed by ';' is the "#;" sexp comment.
                        TokenizationState::PoundSign => {
                            self.raw_token_refs.push_back(Ok(RawTokenRef::SexpComment));
                            self.state = Some(TokenizationState::Start);
                        }
                        // Otherwise ';' ends the atom and starts a line
                        // comment (any trailing '#' stays part of the atom).
                        _ => {
                            self.finish_token(VarTokenKind::Atom, pos, buffer);
                            self.start_new_token(pos, TokenizationState::LineComment);
                        }
                    },
                    b'#' => match self.state.unwrap() {
                        // "|#" inside an unquoted atom is forbidden.
                        TokenizationState::InUnquotedAtomBar => {
                            self.raw_token_refs.push_back(Err(
                                TokenizationError::BlockCommentEndTokenInUnquotedAtom.into(),
                            ));
                            self.state = None;
                            return;
                        }
                        // "|#" with no open block comment.
                        TokenizationState::Bar => {
                            self.raw_token_refs.push_back(Err(
                                TokenizationError::UnexpectedEndOfBlockComment.into(),
                            ));
                            self.state = None;
                            return;
                        }
                        _ => self.state = Some(TokenizationState::InUnquotedAtomPoundSign),
                    },
                    b'|' => match self.state.unwrap() {
                        // "#|" inside an unquoted atom is forbidden.
                        TokenizationState::InUnquotedAtomPoundSign => {
                            self.raw_token_refs.push_back(Err(
                                TokenizationError::BlockCommentStartTokenInUnquotedAtom.into(),
                            ));
                            self.state = None;
                            return;
                        }
                        // "#|" at token start opens a block comment.
                        TokenizationState::PoundSign => {
                            self.state = Some(TokenizationState::BlockComment);
                            self.block_comment_depth = 1;
                        }
                        _ => self.state = Some(TokenizationState::InUnquotedAtomBar),
                    },
                    _ => self.state = Some(TokenizationState::InUnquotedAtom),
                },
                // Processing quoted atoms and quoted strings in block comments is the same
                // (other than what state we return to after the '"' or escaped character).
                TokenizationState::InQuotedAtom | TokenizationState::BlockCommentInQuotedString => {
                    match *ch {
                        b'"' => {
                            if self.state.unwrap() == TokenizationState::InQuotedAtom {
                                // Include the closing quote in the atom token.
                                self.finish_token(VarTokenKind::Atom, pos + 1, buffer);
                                self.state = Some(TokenizationState::Start);
                            } else {
                                self.state = Some(TokenizationState::BlockComment);
                            }
                        }
                        b'\\' => {
                            if self.state.unwrap() == TokenizationState::InQuotedAtom {
                                self.state = Some(TokenizationState::InQuotedAtomEscape);
                            } else {
                                self.state =
                                    Some(TokenizationState::BlockCommentInQuotedStringEscape);
                            }
                        }
                        _ => (),
                    }
                }
                // The escaped byte itself is consumed without interpretation;
                // escape sequences are validated later, not by the tokenizer.
                TokenizationState::InQuotedAtomEscape => {
                    self.state = Some(TokenizationState::InQuotedAtom);
                }
                TokenizationState::BlockCommentInQuotedStringEscape => {
                    self.state = Some(TokenizationState::BlockCommentInQuotedString);
                }
                TokenizationState::LineComment => match *ch {
                    // The newline (or "\r\n") is not part of the comment token.
                    b'\n' => {
                        self.finish_token(VarTokenKind::LineComment, pos, buffer);
                        self.state = Some(TokenizationState::Start);
                    }
                    b'\r' => {
                        self.finish_token(VarTokenKind::LineComment, pos, buffer);
                        self.state = Some(TokenizationState::CarriageReturn);
                    }
                    _ => (),
                },
                TokenizationState::BlockComment => match *ch {
                    b'"' => self.state = Some(TokenizationState::BlockCommentInQuotedString),
                    b'#' => self.state = Some(TokenizationState::BlockCommentPoundSign),
                    b'|' => self.state = Some(TokenizationState::BlockCommentBar),
                    _ => (),
                },
                TokenizationState::BlockCommentPoundSign => match *ch {
                    b'"' => self.state = Some(TokenizationState::BlockCommentInQuotedString),
                    // "##|" etc.: only the byte right before '|' matters.
                    b'#' => self.state = Some(TokenizationState::BlockCommentPoundSign),
                    b'|' => {
                        // "#|" inside a block comment nests.
                        self.block_comment_depth += 1;
                        self.state = Some(TokenizationState::BlockComment);
                    }
                    _ => self.state = Some(TokenizationState::BlockComment),
                },
                TokenizationState::BlockCommentBar => match *ch {
                    b'"' => self.state = Some(TokenizationState::BlockCommentInQuotedString),
                    b'|' => self.state = Some(TokenizationState::BlockCommentBar),
                    b'#' => {
                        // "|#" closes one nesting level.
                        self.block_comment_depth -= 1;
                        if self.block_comment_depth == 0 {
                            // Include the trailing '#' in the comment token.
                            self.finish_token(VarTokenKind::BlockComment, pos + 1, buffer);
                            self.state = Some(TokenizationState::Start);
                        } else {
                            self.state = Some(TokenizationState::BlockComment);
                        }
                    }
                    _ => self.state = Some(TokenizationState::BlockComment),
                },
            }
        }

        // If we're mid-token at the end of this chunk, stash the partial
        // token bytes so the next chunk (or `eof`) can complete the token.
        match self.state.unwrap() {
            TokenizationState::Start | TokenizationState::CarriageReturn => (),
            // Maybe starting an atom; it's fine if we don't end up using this.
            TokenizationState::PoundSign
            // Starting an atom
            | TokenizationState::InUnquotedAtom
            | TokenizationState::InUnquotedAtomPoundSign
            | TokenizationState::InUnquotedAtomBar
            | TokenizationState::Bar
            | TokenizationState::InQuotedAtom
            | TokenizationState::InQuotedAtomEscape
            // Started a line comment
            | TokenizationState::LineComment
            // Started a block comment
            | TokenizationState::BlockComment
            | TokenizationState::BlockCommentPoundSign
            | TokenizationState::BlockCommentBar
            | TokenizationState::BlockCommentInQuotedString
            | TokenizationState::BlockCommentInQuotedStringEscape => {
                self.copy_partial_token_to_scratch_buffer(buffer);
            }
        }
    }

    // Flushes any token that is legal to end at EOF, or queues an error for
    // states that may not end there (open quotes, open block comments, '\r').
    fn eof(&mut self) {
        // Set `self.state` to `None`, indicating that we've seen EOF.
        let Some(final_state) = self.state.take() else {
            self.raw_token_refs
                .push_back(Err(TokenizationError::EofCalledMultipleTimes.into()));
            return;
        };

        // If we push a new token when we see EOF, that will always be contained
        // in the scratch buffer.
        let raw_token_ref_data = RawTokenRefData::Scratch;

        let final_token_ref = match final_state {
            TokenizationState::Start => return,
            // A trailing "#" or "|" is just a one-character atom.
            TokenizationState::InUnquotedAtom
            | TokenizationState::InUnquotedAtomBar
            | TokenizationState::InUnquotedAtomPoundSign
            | TokenizationState::Bar => {
                assert!(self.using_scratch_buffer_for_current_token);
                self.complete_token_in_scratch_buffer();
                Ok(RawTokenRef::VarToken(
                    raw_token_ref_data,
                    VarTokenKind::Atom,
                ))
            }
            TokenizationState::LineComment => {
                assert!(self.using_scratch_buffer_for_current_token);
                self.complete_token_in_scratch_buffer();
                Ok(RawTokenRef::VarToken(
                    raw_token_ref_data,
                    VarTokenKind::LineComment,
                ))
            }
            TokenizationState::PoundSign => {
                assert!(self.using_scratch_buffer_for_current_token);
                self.complete_token_in_scratch_buffer();
                Ok(RawTokenRef::VarToken(
                    raw_token_ref_data,
                    VarTokenKind::Atom,
                ))
            }
            TokenizationState::CarriageReturn => Err(TokenizationError::NakedCarriageReturn.into()),
            TokenizationState::InQuotedAtom | TokenizationState::InQuotedAtomEscape => {
                Err(TokenizationError::UnexpectedEofWhileInInQuotedAtom.into())
            }
            TokenizationState::BlockComment
            | TokenizationState::BlockCommentPoundSign
            | TokenizationState::BlockCommentBar
            | TokenizationState::BlockCommentInQuotedString
            | TokenizationState::BlockCommentInQuotedStringEscape => {
                Err(TokenizationError::UnexpectedEofWhileInBlockComment.into())
            }
        };

        self.raw_token_refs.push_back(final_token_ref);
    }
}
680
/// Pairs an [`Input`] source with a [`BasicTapeTokenizer`], pumping input
/// chunks into the tape on demand.
pub struct RawTokenizer<I> {
    input: I,
    tape_tokenizer: BasicTapeTokenizer,
}
685
686impl<I> RawTokenizer<I> {
687    pub fn new(input: I) -> RawTokenizer<I> {
688        RawTokenizer {
689            input,
690            tape_tokenizer: BasicTapeTokenizer::new(),
691        }
692    }
693}
694
695impl<'de, I> RawTokenizer<I>
696where
697    I: Input<'de>,
698{
699    fn process_more_input_if_needed(&mut self) -> Result<HasEnoughData> {
700        loop {
701            if let Some(witness) = self.tape_tokenizer.has_enough_data_to_produce_tokens() {
702                return Ok(witness);
703            }
704
705            match self.input.next_chunk()? {
706                InputChunk::Data(chunk) => self.tape_tokenizer.feed_more_data(chunk),
707                InputChunk::Eof => self.tape_tokenizer.eof(),
708            }
709        }
710    }
711
712    pub fn next_raw_token<'t>(&'t mut self) -> Result<Option<RawToken<'de, 't>>> {
713        let witness = self.process_more_input_if_needed()?;
714        let current_chunk = self.input.current_chunk();
715        self.tape_tokenizer.next_raw_token(witness, current_chunk)
716    }
717
718    pub fn peek_raw_token_kind(&mut self) -> Result<Option<RawTokenKind>> {
719        let witness = self.process_more_input_if_needed()?;
720        self.tape_tokenizer.peek_raw_token_kind(&witness)
721    }
722
723    pub fn advance(&mut self) -> Result<()> {
724        let witness = self.process_more_input_if_needed()?;
725        self.tape_tokenizer.advance(witness)
726    }
727}
728
/// The full tokenizer: wraps a [`RawTokenizer`] and filters out comment
/// tokens (line, block, and "#;" sexp comments).
pub struct Tokenizer<I> {
    // Someday: Add `validate_atoms_and_block_comments: bool` flag?
    // One entry per pending "#;" comment; the value is its paren depth.
    sexp_comment_nesting_depths: Vec<usize>,
    // NOTE(review): presumably a reused allocation for unescaping quoted
    // atoms; it is not used in the visible portion of this file — confirm.
    scratch_space_for_unescaped_atom: Vec<u8>,
    raw_tokenizer: RawTokenizer<I>,
    // An error hit while peeking past comments; replayed on later peeks.
    peeked_input_error: Option<crate::error::Error>,
}
736
737impl<I> Tokenizer<I> {
738    pub fn new(input: I) -> Tokenizer<I> {
739        Tokenizer {
740            sexp_comment_nesting_depths: vec![],
741            scratch_space_for_unescaped_atom: vec![],
742            raw_tokenizer: RawTokenizer::new(input),
743            peeked_input_error: None,
744        }
745    }
746}
747
impl<'de, I> Tokenizer<I>
where
    I: Input<'de>,
{
    /// Consumes one "#;"-commented-out sexp (which may itself contain nested
    /// "#;" comments), validating atoms and block comments along the way.
    ///
    /// Each entry on `sexp_comment_nesting_depths` is one pending sexp
    /// comment; its value is the unmatched-paren depth reached inside it.
    fn consume_commented_out_sexp(&mut self) -> Result<()> {
        self.sexp_comment_nesting_depths.push(0);

        while !self.sexp_comment_nesting_depths.is_empty() {
            match self.raw_tokenizer.next_raw_token()? {
                None => return Err(TokenizationError::UnterminatedSexpCommentAtEof.into()),
                Some(RawToken::LeftParen) => {
                    *self.sexp_comment_nesting_depths.last_mut().unwrap() += 1
                }
                Some(RawToken::RightParen) => {
                    let last = self.sexp_comment_nesting_depths.last_mut().unwrap();
                    if *last == 0 {
                        // A ')' arrived before the commented-out sexp began.
                        return Err(TokenizationError::UnterminatedSexpCommentAtEndOfList.into());
                    }

                    *last -= 1;
                    if *last == 0 {
                        // The commented-out list is complete.
                        self.sexp_comment_nesting_depths.pop();
                    }
                }
                Some(RawToken::Atom(atom)) => {
                    atom.validate()?;
                    // A bare atom at depth 0 is itself the commented-out sexp.
                    if *self.sexp_comment_nesting_depths.last().unwrap() == 0 {
                        self.sexp_comment_nesting_depths.pop();
                    }
                }
                Some(RawToken::SexpComment) => {
                    // e.g. "#; #; a b" — the inner comment must finish first.
                    self.sexp_comment_nesting_depths.push(0);
                }
                Some(RawToken::BlockComment(block_comment)) => {
                    block_comment.validate_block_comment()?;
                }
                Some(RawToken::LineComment(_)) => (),
            }
        }

        Ok(())
    }
}
791
792impl<'de, I> TokenIterator<'de> for Tokenizer<I>
793where
794    I: Input<'de>,
795{
796    fn peek_kind(&mut self) -> Result<Option<TokenKind>> {
797        if let Some(error) = &self.peeked_input_error {
798            return Err(error.clone());
799        };
800
801        // We might have to advance over comment tokens
802        loop {
803            let Some(raw_token_kind) = self.raw_tokenizer.peek_raw_token_kind()? else {
804                return Ok(None);
805            };
806
807            match raw_token_kind {
808                RawTokenKind::LeftParen => return Ok(Some(TokenKind::LeftParen)),
809                RawTokenKind::RightParen => return Ok(Some(TokenKind::RightParen)),
810                RawTokenKind::Atom => return Ok(Some(TokenKind::Atom)),
811                RawTokenKind::LineComment => self.raw_tokenizer.advance()?,
812                RawTokenKind::SexpComment => {
813                    self.raw_tokenizer.advance()?;
814                    if let Some(error) = self.consume_commented_out_sexp().err() {
815                        self.peeked_input_error = Some(error.clone());
816                        return Err(error);
817                    };
818                }
819                RawTokenKind::BlockComment => {
820                    let Ok(Some(RawToken::BlockComment(comment_bytes))) =
821                        self.raw_tokenizer.next_raw_token()
822                    else {
823                        panic!("peek_raw_token_kind just returned BlockComment");
824                    };
825                    comment_bytes.validate_block_comment()?;
826                }
827            }
828        }
829    }
830
831    fn next<'t>(&'t mut self) -> Result<Option<Token<'de, 't>>> {
832        if let Some(error) = self.peeked_input_error.take() {
833            return Err(error);
834        }
835
836        loop {
837            // The obvious way to write the body of this loop is: ```
838            //     match self.raw_tokenizer.next_raw_token()? {
839            //         None => return Ok(None),
840            //         Some(RawToken::LeftParen) => return Ok(Some(Token::LeftParen)),
841            //         Some(RawToken::RightParen) => return Ok(Some(Token::RightParen)),
842            //         Some(RawToken::Atom(_)) => return Ok(Some(Token::Atom(atom))),
843            //         Some(RawToken::LineComment(_)) => (),
844            //         Some(RawToken::SexpComment) => self.consume_commented_out_sexp()?,
845            //         Some(RawToken::BlockComment(_)) => ...,
846            //     }
847            // ```
848            //
849            // But because we return the `atom` (which has a lifetime tied to `next_raw_token`),
850            // the borrow checker thinks the borrow when we're calling `next_raw_token` has
851            // lifetime 't, and so we can't call `consume_commented_out_sexp`, or proceed
852            // to the next iteration of the loop.
853            //
854            // Instead, we have to call `peek_raw_token_kind`, have that be a temporary reference,
855            // then only call `next_raw_token` in the branches where we return.
856            //
857            // This is the "Problem Case #3" mentioned here:
858            // https://blog.rust-lang.org/inside-rust/2023/10/06/polonius-update/
859            let raw_token_kind = match self.raw_tokenizer.peek_raw_token_kind() {
860                Ok(None) => return Ok(None),
861                Ok(Some(raw_token_kind)) => raw_token_kind,
862                Err(err) => {
863                    self.raw_tokenizer.advance()?;
864                    // `advance` should return the peeked error, so I don't think
865                    // we'll ever actually get here.
866                    return Err(err.clone());
867                }
868            };
869
870            match raw_token_kind {
871                RawTokenKind::LeftParen => {
872                    self.raw_tokenizer.advance()?;
873                    return Ok(Some(Token::LeftParen));
874                }
875                RawTokenKind::RightParen => {
876                    self.raw_tokenizer.advance()?;
877                    return Ok(Some(Token::RightParen));
878                }
879                RawTokenKind::Atom => {
880                    let Ok(Some(RawToken::Atom(serialized_atom))) =
881                        self.raw_tokenizer.next_raw_token()
882                    else {
883                        panic!("peek_raw_token_kind just returned Atom");
884                    };
885
886                    let atom = match serialized_atom {
887                        Ref::Borrowed(serialized_atom) => {
888                            match serialized_atom
889                                .unescape(&mut self.scratch_space_for_unescaped_atom)?
890                            {
891                                Ref::Borrowed(atom) => Ref::Borrowed(atom),
892                                Ref::Transient(atom) => Ref::Transient(atom),
893                            }
894                        }
895                        Ref::Transient(serialized_atom) => {
896                            match serialized_atom
897                                .unescape(&mut self.scratch_space_for_unescaped_atom)?
898                            {
899                                Ref::Borrowed(atom) | Ref::Transient(atom) => {
900                                    // Even if got back a Ref::Borrowed because we didn't have to
901                                    // do any unescaping, it's coming from a Transient ref, so we
902                                    // always have to return Trasient.
903                                    Ref::Transient(atom)
904                                }
905                            }
906                        }
907                    };
908
909                    return Ok(Some(Token::Atom(atom)));
910                }
911                RawTokenKind::LineComment => self.raw_tokenizer.advance()?,
912                RawTokenKind::SexpComment => {
913                    self.raw_tokenizer.advance()?;
914                    self.consume_commented_out_sexp()?;
915                }
916                RawTokenKind::BlockComment => {
917                    let Ok(Some(RawToken::BlockComment(comment_bytes))) =
918                        self.raw_tokenizer.next_raw_token()
919                    else {
920                        panic!("peek_raw_token_kind just returned BlockComment");
921                    };
922
923                    comment_bytes.validate_block_comment()?;
924                }
925            }
926        }
927    }
928}
929
#[cfg(test)]
mod tests {
    use super::*;
    use crate::atom::AtomData;
    use crate::error;
    use crate::input::tests::ExplicitChunksInput;
    use crate::input::SliceInput;
    use crate::Ref;

    use bstr::ByteSlice;
    use insta::assert_snapshot;

    use std::fmt::Write;

    /// Runs the raw tokenizer over `buffers` presented as separate input
    /// chunks (exercising token splits across chunk boundaries), rendering
    /// each raw token or error on its own line.
    fn raw_tokenize_fragments(buffers: &[&'static [u8]]) -> String {
        let input = ExplicitChunksInput::new(buffers);
        let mut raw_tokenizer = RawTokenizer::new(input);

        let mut output = String::new();
        let o = &mut output;

        loop {
            let _ = match raw_tokenizer.next_raw_token() {
                Ok(None) => break,
                Ok(Some(raw_token)) => writeln!(o, "{}", format_raw_token(raw_token)),
                Err(err) => writeln!(o, "{}", format_error(err)),
            };
        }

        output
    }

    /// Like `raw_tokenize_fragments`, but over a single contiguous buffer
    /// (so tokens can borrow directly from the input).
    fn raw_tokenize_str(buffer: &[u8]) -> String {
        let input = SliceInput::new(buffer);
        let mut raw_tokenizer = RawTokenizer::new(input);

        let mut output = String::new();
        let o = &mut output;

        loop {
            let _ = match raw_tokenizer.next_raw_token() {
                Ok(None) => break,
                Ok(Some(raw_token)) => writeln!(o, "{}", format_raw_token(raw_token)),
                Err(err) => writeln!(o, "{}", format_error(err)),
            };
        }

        output
    }

    /// Renders an error for snapshot comparison.
    fn format_error(err: error::Error) -> String {
        format!("ERROR: {:?}", err)
    }

    /// Renders a raw token as `Kind: bytes (borrowed|transient)`.
    fn format_raw_token(raw_token: RawToken<'_, '_>) -> String {
        // NOTE(review): despite the name, the second case prints "transient",
        // not "owned".
        fn borrowed_or_owned<T: ?Sized>(token_bytes: &Ref<'_, '_, T>) -> &'static str {
            match token_bytes {
                Ref::Borrowed(_) => "borrowed",
                Ref::Transient(_) => "transient",
            }
        }

        match raw_token {
            RawToken::LeftParen => "LeftParen: (".to_owned(),
            RawToken::RightParen => "RightParen: )".to_owned(),
            RawToken::SexpComment => "SexpComment: #;".to_owned(),
            RawToken::Atom(raw_token_bytes) => {
                let ref_kind = borrowed_or_owned(&raw_token_bytes);
                let bytes = raw_token_bytes.bytes().as_bstr();
                format!("Atom: {:?} ({})", bytes, ref_kind)
            }
            RawToken::LineComment(raw_token_bytes) => {
                let ref_kind = borrowed_or_owned(&raw_token_bytes);
                let bytes = raw_token_bytes.bytes().as_bstr();
                format!("LineComment: {:?} ({})", bytes, ref_kind)
            }
            RawToken::BlockComment(raw_token_bytes) => {
                let ref_kind = borrowed_or_owned(&raw_token_bytes);
                let bytes = raw_token_bytes.bytes().as_bstr();
                format!("BlockComment: {:?} ({})", bytes, ref_kind)
            }
        }
    }

    // Unquoted atoms, quoted atoms, and the `#`/`|` edge cases that border on
    // block-comment delimiters.
    #[test]
    fn test_basics() {
        assert_snapshot!(raw_tokenize_str(b"a bc 123 "), @r#"
        Atom: "a" (borrowed)
        Atom: "bc" (borrowed)
        Atom: "123" (borrowed)
        "#);

        assert_snapshot!(raw_tokenize_str(b"a\"123\"b \"\""), @r#"
        Atom: "a" (borrowed)
        Atom: "\"123\"" (borrowed)
        Atom: "b" (borrowed)
        Atom: "\"\"" (borrowed)
        "#);

        assert_snapshot!(raw_tokenize_str(b"## #a #( #) #\"#\" #\r\n#\n| #;|\n# "), @r###"
        Atom: "##" (borrowed)
        Atom: "#a" (borrowed)
        Atom: "#" (borrowed)
        LeftParen: (
        Atom: "#" (borrowed)
        RightParen: )
        Atom: "#" (borrowed)
        Atom: "\"#\"" (borrowed)
        Atom: "#" (borrowed)
        Atom: "#" (borrowed)
        Atom: "|" (borrowed)
        SexpComment: #;
        Atom: "|" (borrowed)
        Atom: "#" (borrowed)
        "###);

        assert_snapshot!(raw_tokenize_str(b"z#a z#( z#) z#\"#\" z#\r\nz#\n| z#;|\n"), @r##"
        Atom: "z#a" (borrowed)
        Atom: "z#" (borrowed)
        LeftParen: (
        Atom: "z#" (borrowed)
        RightParen: )
        Atom: "z#" (borrowed)
        Atom: "\"#\"" (borrowed)
        Atom: "z#" (borrowed)
        Atom: "z#" (borrowed)
        Atom: "|" (borrowed)
        Atom: "z#" (borrowed)
        LineComment: ";|" (borrowed)
        "##);

        assert_snapshot!(raw_tokenize_str(b"|| |a |( |) |\"|\" |\r\n|\n# |;|\n| "), @r##"
        Atom: "||" (borrowed)
        Atom: "|a" (borrowed)
        Atom: "|" (borrowed)
        LeftParen: (
        Atom: "|" (borrowed)
        RightParen: )
        Atom: "|" (borrowed)
        Atom: "\"|\"" (borrowed)
        Atom: "|" (borrowed)
        Atom: "|" (borrowed)
        Atom: "#" (borrowed)
        Atom: "|" (borrowed)
        LineComment: ";|" (borrowed)
        Atom: "|" (borrowed)
        "##);

        assert_snapshot!(raw_tokenize_str(b"z|a z|( z|) z|\"|\" z|\r\nz|\n# z|;|\n"), @r##"
        Atom: "z|a" (borrowed)
        Atom: "z|" (borrowed)
        LeftParen: (
        Atom: "z|" (borrowed)
        RightParen: )
        Atom: "z|" (borrowed)
        Atom: "\"|\"" (borrowed)
        Atom: "z|" (borrowed)
        Atom: "z|" (borrowed)
        Atom: "#" (borrowed)
        Atom: "z|" (borrowed)
        LineComment: ";|" (borrowed)
        "##);
    }

    // The raw tokenizer passes escape sequences through unmodified; unescaping
    // happens later in `Tokenizer`.
    #[test]
    fn test_quoted_string_escapes() {
        assert_snapshot!(
            raw_tokenize_str(b"\"\\\n \\n \\123 \\\\ \\x01 \\x0\""),
            @r#"Atom: "\"\\\n \\n \\123 \\\\ \\x01 \\x0\"" (borrowed)"#);
    }

    #[test]
    fn test_line_comments() {
        assert_snapshot!(
            raw_tokenize_str(b";\"\"\n;abc\r\n;\n "),
            @r#"
        LineComment: ";\"\"" (borrowed)
        LineComment: ";abc" (borrowed)
        LineComment: ";" (borrowed)
        "#);
    }

    #[test]
    fn test_block_comments() {
        assert_snapshot!(
            raw_tokenize_str(b"#|a|# _ #|# |# _ #|\"|#\\\"\"|# _ #| #| a |#| |#"),
            @r##"
        BlockComment: "#|a|#" (borrowed)
        Atom: "_" (borrowed)
        BlockComment: "#|# |#" (borrowed)
        Atom: "_" (borrowed)
        BlockComment: "#|\"|#\\\"\"|#" (borrowed)
        Atom: "_" (borrowed)
        BlockComment: "#| #| a |#| |#" (borrowed)
        "##,
        );
    }

    #[test]
    fn test_block_comment_errors() {
        assert_snapshot!(raw_tokenize_str(b"a#|b"), @"ERROR: TokenizationError(BlockCommentStartTokenInUnquotedAtom)");
        assert_snapshot!(raw_tokenize_str(b"a##|b"), @"ERROR: TokenizationError(BlockCommentStartTokenInUnquotedAtom)");
        assert_snapshot!(raw_tokenize_str(b"a|#b"), @"ERROR: TokenizationError(BlockCommentEndTokenInUnquotedAtom)");
        assert_snapshot!(raw_tokenize_str(b"a||#b"), @"ERROR: TokenizationError(BlockCommentEndTokenInUnquotedAtom)");
        assert_snapshot!(raw_tokenize_str(b"|#"), @"ERROR: TokenizationError(UnexpectedEndOfBlockComment)");
    }

    // At the raw layer `#;` is just emitted as a token; the higher-level
    // `Tokenizer` is what actually skips the following sexp.
    #[test]
    fn test_sexp_comments() {
        assert_snapshot!(raw_tokenize_str(b"#; a#;x\n##;y\n"), @r###"
        SexpComment: #;
        Atom: "a#" (borrowed)
        LineComment: ";x" (borrowed)
        Atom: "##" (borrowed)
        LineComment: ";y" (borrowed)
        "###);
    }

    // Tokens split across chunk boundaries come back as transient refs.
    #[test]
    fn test_partial_tokens() {
        assert_snapshot!(
            raw_tokenize_fragments(&[b"abc", b"", b"def", b"ghi "]),
            @r#"Atom: "abcdefghi" (transient)"#,
        );

        assert_snapshot!(
            raw_tokenize_fragments(&[b";abc", b"def", b"ghi\n"]),
            @r#"LineComment: ";abcdefghi" (transient)"#,
        );

        assert_snapshot!(
            raw_tokenize_fragments(&[b"#| abc", b"def", b"ghi |# "]),
            @r##"BlockComment: "#| abcdefghi |#" (transient)"##,
        );
    }

    #[test]
    fn test_handling_of_pounds_across_buffers() {
        // #;
        // #| |#
        // #a
        // ##
        assert_snapshot!(
            raw_tokenize_fragments(&[b"#", b"; #", b"| |# #", b"a #", b"# "]),
            @r###"
        SexpComment: #;
        BlockComment: "#| |#" (transient)
        Atom: "#a" (transient)
        Atom: "##" (transient)
        "###,
        );
    }

    #[test]
    fn test_eof() {
        assert_snapshot!(raw_tokenize_fragments(&[b"a\r"]), @r#"
        Atom: "a" (transient)
        ERROR: TokenizationError(NakedCarriageReturn)
        "#);

        assert_snapshot!(raw_tokenize_fragments(&[b"a"]), @r#"Atom: "a" (transient)"#);

        assert_snapshot!(raw_tokenize_fragments(&[b"a|"]), @r#"Atom: "a|" (transient)"#);

        assert_snapshot!(raw_tokenize_fragments(&[b"a#"]), @r#"Atom: "a#" (transient)"#);

        assert_snapshot!(raw_tokenize_fragments(&[b"|"]), @r#"Atom: "|" (transient)"#);

        assert_snapshot!(raw_tokenize_fragments(&[b";"]), @r#"LineComment: ";" (transient)"#);

        // NOTE(review): duplicate of the previous assertion — possibly a
        // different input was intended here.
        assert_snapshot!(raw_tokenize_fragments(&[b";"]), @r#"LineComment: ";" (transient)"#);

        assert_snapshot!(raw_tokenize_fragments(&[b"#"]), @r##"Atom: "#" (transient)"##);
    }

    #[test]
    fn test_eof_errors() {
        assert_snapshot!(raw_tokenize_fragments(&[b"#|"]), @"ERROR: TokenizationError(UnexpectedEofWhileInBlockComment)");
        assert_snapshot!(raw_tokenize_fragments(&[b"#| #"]), @"ERROR: TokenizationError(UnexpectedEofWhileInBlockComment)");
        assert_snapshot!(raw_tokenize_fragments(&[b"#| |"]), @"ERROR: TokenizationError(UnexpectedEofWhileInBlockComment)");
        assert_snapshot!(raw_tokenize_fragments(&[b"#| \""]), @"ERROR: TokenizationError(UnexpectedEofWhileInBlockComment)");
        assert_snapshot!(raw_tokenize_fragments(&[b"#| \"\\"]), @"ERROR: TokenizationError(UnexpectedEofWhileInBlockComment)");
    }

    #[test]
    fn test_raw_tokenizer() {
        assert_snapshot!(
            raw_tokenize_fragments(&[b"a1 a2", b" a3"]),
            @r#"
        Atom: "a1" (transient)
        Atom: "a2" (transient)
        Atom: "a3" (transient)
        "#,
        );

        assert_snapshot!(
            raw_tokenize_fragments(&[b"abc", b"def", b"ghi"]),
            @r#"Atom: "abcdefghi" (transient)"#,
        );

        assert_snapshot!(
            raw_tokenize_fragments(&[b"; lc1\n ; lc2", b"\n ; lc3"]),
            @r#"
        LineComment: "; lc1" (transient)
        LineComment: "; lc2" (transient)
        LineComment: "; lc3" (transient)
        "#,
        );

        assert_snapshot!(
            raw_tokenize_fragments(&[b"; abc", b"def", b"ghi"]),
            @r#"LineComment: "; abcdefghi" (transient)"#,
        );

        assert_snapshot!(
            raw_tokenize_fragments(&[b"#| bc1 |# #| bc2 ", b"|#"]),
            @r##"
        BlockComment: "#| bc1 |#" (transient)
        BlockComment: "#| bc2 |#" (transient)
        "##,
        );

        assert_snapshot!(
            raw_tokenize_fragments(&[b"#| abc", b"def", b"ghi |#"]),
            @r##"BlockComment: "#| abcdefghi |#" (transient)"##,
        );
    }

    #[test]
    fn test_block_comment_validation() {
        /// Validates `bytes` as a block comment, rendering the outcome.
        fn b(bytes: &[u8]) -> String {
            match RawTokenBytes::new(bytes).validate_block_comment() {
                Ok(()) => "Ok".to_owned(),
                Err(err) => format!("{:?}", err),
            }
        }

        // Basic
        assert_snapshot!(b(b"#| |#"),                        @"Ok");
        assert_snapshot!(b(br#"#| "abc" |#"#),               @"Ok");
        assert_snapshot!(b(br#"#| "\\ \' \" \ ""abc" |#"#),  @"Ok");
        assert_snapshot!(b(br#"#| "\n \t \b \r""abc" |#"#),  @"Ok");
        assert_snapshot!(b(br#"#| "\x00 \xff""abc" |#"#),    @"Ok");
        assert_snapshot!(b(br#"#| "\x000 \x255""abc" |#"#),  @"Ok");

        // Not in quotes, so fine
        assert_snapshot!(b(br#"#| \xgg \x0 \256 \999 |#"#),  @"Ok");

        // Invalid escapes
        assert_snapshot!(b(br#"#| "\xgg" |#"#),     @"InvalidHexadecimalEscape");
        assert_snapshot!(b(br#"#| "\xf " |#"#),     @"InvalidHexadecimalEscape");
        assert_snapshot!(b(br#"#| "\xf" |#"#),      @"UnterminatedHexadecimalEscape");
        assert_snapshot!(b(br#"#| "\256" |#"#),     @"OutOfRangeDecimalEscape");
        assert_snapshot!(b(br#"#| "\25 " |#"#),     @"InvalidDecimalEscape");
        assert_snapshot!(b(br#"#| "\25" |#"#),      @"UnterminatedDecimalEscape");
        assert_snapshot!(b(br#"#| "\"""\xgg" |#"#), @"InvalidHexadecimalEscape");
        assert_snapshot!(b(br#"#| " \" \""#),       @"UnterminatedQuote");

        // Block comment formatting isn't checked
        assert_snapshot!(b(br#""#),               @"Ok");
        assert_snapshot!(b(br#"#| "#),            @"Ok");
        assert_snapshot!(b(br#" |#"#),            @"Ok");
        assert_snapshot!(b(br#"#| #| |# |# |#"#), @"Ok");
    }

    /// Renders a high-level token as `Kind: bytes (borrowed|transient)`.
    fn format_token(token: Token<'_, '_>) -> String {
        fn borrowed_or_owned(token_bytes: &Ref<'_, '_, AtomData>) -> &'static str {
            match token_bytes {
                Ref::Borrowed(_) => "borrowed",
                Ref::Transient(_) => "transient",
            }
        }

        match token {
            Token::LeftParen => "LeftParen: (".to_owned(),
            Token::RightParen => "RightParen: )".to_owned(),
            Token::Atom(data) => {
                let ref_kind = borrowed_or_owned(&data);
                format!("Atom: {:?} ({})", data.bytes().as_bstr(), ref_kind)
            }
        }
    }

    /// Runs the full `Tokenizer` (comment skipping, atom unescaping) over a
    /// single buffer, rendering each token or error on its own line.
    fn tokenize_str(buffer: &[u8]) -> String {
        let input = SliceInput::new(buffer);
        let mut tokenizer = Tokenizer::new(input);

        let mut output = String::new();
        let o = &mut output;

        loop {
            // Someday: Do a better job of testing this with and without calling peek?
            let _ = tokenizer.peek_kind();
            let _ = match tokenizer.next() {
                Ok(None) => break,
                Ok(Some(token)) => writeln!(o, "{}", format_token(token)),
                Err(err) => writeln!(o, "{}", format_error(err)),
            };
        }

        output
    }

    #[test]
    fn test_tokenizer() {
        assert_snapshot!(tokenize_str(br#"a "b c" "d\e" f"#), @r#"
        Atom: "a" (borrowed)
        Atom: "b c" (borrowed)
        Atom: "d\\e" (transient)
        Atom: "f" (transient)
        "#);

        assert_snapshot!(tokenize_str(b"(a #| xyz |# b c ; abc \n)"), @r#"
        LeftParen: (
        Atom: "a" (borrowed)
        Atom: "b" (borrowed)
        Atom: "c" (borrowed)
        RightParen: )
        "#);

        // Tokenizer does not enforce a valid sexp during regular parsing.
        assert_snapshot!(tokenize_str(b") ) ( ("), @r"
        RightParen: )
        RightParen: )
        LeftParen: (
        LeftParen: (
        ");
    }

    #[test]
    fn test_tokenizer_handles_sexp_comments() {
        assert_snapshot!(tokenize_str(b"a #; b #; (x y z)"), @r#"Atom: "a" (borrowed)"#);

        assert_snapshot!(tokenize_str(b"a #; #; #; w (x #; 0 y) z b c"), @r#"
        Atom: "a" (borrowed)
        Atom: "b" (borrowed)
        Atom: "c" (transient)
        "#);

        assert_snapshot!(tokenize_str(b"a #;"), @r#"
        Atom: "a" (borrowed)
        ERROR: TokenizationError(UnterminatedSexpCommentAtEof)
        "#);

        assert_snapshot!(tokenize_str(b"(#;)"), @r"
        LeftParen: (
        ERROR: TokenizationError(UnterminatedSexpCommentAtEndOfList)
        ");
    }
}