castle_tokenizer/
tokenizer.rs

1use std::{collections::VecDeque, io::Read};
2
3use castle_input_cursor::Cursor;
4use castle_types::CastleError;
5
6use crate::{
7    token_parsers::{
8        parse_ident_or_keyword::parse_ident_or_keyword, parse_newline::parse_newline,
9        parse_numbers::parse_number, parse_operator::parse_operator, parse_string::parse_string,
10        skip_comment::skip_comment,
11    },
12    Token, TokenKind, Tokenizable,
13};
14
15pub struct Tokenizer<R> {
16    pub cursor: Cursor<R>,
17    pub peeked: VecDeque<Token>,
18}
19
20impl<R: Read> Tokenizable for Tokenizer<R> {
21    fn next(&mut self, skip_line_terminators: bool) -> Result<Option<Token>, CastleError> {
22        loop {
23            let token = match self.peeked.pop_front() {
24                Some(token) => Some(token),
25                None => self.advance()?,
26            };
27
28            return match token {
29                Some(Token {
30                    kind: TokenKind::LineTerminator,
31                    ..
32                }) if skip_line_terminators => continue,
33                Some(token) => Ok(Some(token)),
34                None => Ok(None),
35            };
36        }
37    }
38
39    fn peek_n(
40        &mut self,
41        skip_n: usize,
42        skip_line_terminators: bool,
43    ) -> Result<Option<&Token>, CastleError> {
44        // if the number of tokens to skip is greater than or equal to the number of peeked tokens,
45        // we need to read more tokens
46        // firstly filter out self.peeked tokens that are line terminators
47        if skip_line_terminators {
48            self.peeked.retain(|token| match token.kind {
49                TokenKind::LineTerminator => false,
50                _ => true,
51            });
52        }
53        while skip_n >= self.peeked.len() {
54            match self.advance()? {
55                Some(Token {
56                    kind: TokenKind::LineTerminator,
57                    ..
58                }) if skip_line_terminators => continue,
59                Some(token) => self.peeked.push_back(token),
60                None => break, // EOF
61            }
62        }
63
64        Ok(self.peeked.get(skip_n))
65    }
66}
67
68impl<R: Read> Tokenizer<R> {
69    pub fn new(reader: R) -> Self {
70        Self {
71            cursor: Cursor::new(reader),
72            peeked: VecDeque::new(),
73        }
74    }
75
76    /// Advances the cursor and returns the next token
77    /// Skips comments and whitespace (not including line terminators)
78    /// Coalesces consecutive line terminators (\n and \r)
79    pub fn advance(&mut self) -> Result<Option<Token>, CastleError> {
80        loop {
81            // skip whitespaces
82            let (start, next_ch) = loop {
83                let start = self.cursor.pos();
84                if let Some(next_ch) = self.cursor.peek_char()? {
85                    // Ignore whitespace
86                    if !is_whitespace(next_ch) {
87                        break (start, next_ch);
88                    }
89                    self.cursor.next_char()?;
90                } else {
91                    return Ok(None);
92                }
93            };
94
95            if let Ok(c) = char::try_from(next_ch) {
96                let token = match c {
97                    '#' => {
98                        skip_comment(&mut self.cursor)?;
99                        continue;
100                    }
101                    '\r' | '\n' => parse_newline(&mut self.cursor, start)?,
102                    '"' => parse_string(&mut self.cursor, start)?,
103                    // Operator & Punctuator
104                    '=' | '<' | '>' | '*' | '/' | '%' | '&' | '|' | '^' | ':' | '{' | '}' | '['
105                    | ']' | ',' | ';' | '@' | '(' | ')' => parse_operator(&mut self.cursor, start)?,
106                    '-' => parse_number(&mut self.cursor, start)?,
107                    _ if c.is_digit(10) => parse_number(&mut self.cursor, start)?,
108                    _ if c.is_ascii_alphabetic() || c == '_' => parse_ident_or_keyword(&mut self.cursor, start)?,
109                    _ => Err(CastleError::syntax(
110                        format!(
111                            "Unexpected '{}' at line {}, column {}",
112                            c,
113                            start.line_number(),
114                            start.column_number()
115                        ),
116                        start,
117                    ))?,
118                };
119
120                return Ok(Some(token));
121            } else {
122                return Ok(None); // EOF
123            }
124        }
125    }
126}
127
/// Returns `true` when the code point `ch` is whitespace the tokenizer skips.
///
/// Line terminators (`\n`, `\r`) are deliberately not part of this set.
fn is_whitespace(ch: u32) -> bool {
    // Individual code points: tab, vertical tab, form feed, space,
    // no-break space, byte-order mark, followed by the members of the Unicode
    // Space_Separator category that do not fall in the contiguous range below
    // (U+0020 and U+00A0 belong to that category too and are already listed).
    const SINGLE_CODE_POINTS: [u32; 10] = [
        0x0009, 0x000B, 0x000C, 0x0020, 0x00A0, 0xFEFF, 0x1680, 0x202F, 0x205F, 0x3000,
    ];

    SINGLE_CODE_POINTS.contains(&ch) || (0x2000..=0x200A).contains(&ch)
}
136
137
/// Input made only of line terminators yields no token when they are skipped.
#[test]
fn skips_newlines_with_eof() {
    let input: &[u8] = b"\n\n";
    let mut tokenizer = Tokenizer::new(input);

    let token = tokenizer.next(true).unwrap();

    assert_eq!(token, None);
}