castle_tokenizer/
tokenizer.rs1use std::{collections::VecDeque, io::Read};
2
3use castle_input_cursor::Cursor;
4use castle_types::CastleError;
5
6use crate::{
7 token_parsers::{
8 parse_ident_or_keyword::parse_ident_or_keyword, parse_newline::parse_newline,
9 parse_numbers::parse_number, parse_operator::parse_operator, parse_string::parse_string,
10 skip_comment::skip_comment,
11 },
12 Token, TokenKind, Tokenizable,
13};
14
15pub struct Tokenizer<R> {
16 pub cursor: Cursor<R>,
17 pub peeked: VecDeque<Token>,
18}
19
20impl<R: Read> Tokenizable for Tokenizer<R> {
21 fn next(&mut self, skip_line_terminators: bool) -> Result<Option<Token>, CastleError> {
22 loop {
23 let token = match self.peeked.pop_front() {
24 Some(token) => Some(token),
25 None => self.advance()?,
26 };
27
28 return match token {
29 Some(Token {
30 kind: TokenKind::LineTerminator,
31 ..
32 }) if skip_line_terminators => continue,
33 Some(token) => Ok(Some(token)),
34 None => Ok(None),
35 };
36 }
37 }
38
39 fn peek_n(
40 &mut self,
41 skip_n: usize,
42 skip_line_terminators: bool,
43 ) -> Result<Option<&Token>, CastleError> {
44 if skip_line_terminators {
48 self.peeked.retain(|token| match token.kind {
49 TokenKind::LineTerminator => false,
50 _ => true,
51 });
52 }
53 while skip_n >= self.peeked.len() {
54 match self.advance()? {
55 Some(Token {
56 kind: TokenKind::LineTerminator,
57 ..
58 }) if skip_line_terminators => continue,
59 Some(token) => self.peeked.push_back(token),
60 None => break, }
62 }
63
64 Ok(self.peeked.get(skip_n))
65 }
66}
67
68impl<R: Read> Tokenizer<R> {
69 pub fn new(reader: R) -> Self {
70 Self {
71 cursor: Cursor::new(reader),
72 peeked: VecDeque::new(),
73 }
74 }
75
76 pub fn advance(&mut self) -> Result<Option<Token>, CastleError> {
80 loop {
81 let (start, next_ch) = loop {
83 let start = self.cursor.pos();
84 if let Some(next_ch) = self.cursor.peek_char()? {
85 if !is_whitespace(next_ch) {
87 break (start, next_ch);
88 }
89 self.cursor.next_char()?;
90 } else {
91 return Ok(None);
92 }
93 };
94
95 if let Ok(c) = char::try_from(next_ch) {
96 let token = match c {
97 '#' => {
98 skip_comment(&mut self.cursor)?;
99 continue;
100 }
101 '\r' | '\n' => parse_newline(&mut self.cursor, start)?,
102 '"' => parse_string(&mut self.cursor, start)?,
103 '=' | '<' | '>' | '*' | '/' | '%' | '&' | '|' | '^' | ':' | '{' | '}' | '['
105 | ']' | ',' | ';' | '@' | '(' | ')' => parse_operator(&mut self.cursor, start)?,
106 '-' => parse_number(&mut self.cursor, start)?,
107 _ if c.is_digit(10) => parse_number(&mut self.cursor, start)?,
108 _ if c.is_ascii_alphabetic() || c == '_' => parse_ident_or_keyword(&mut self.cursor, start)?,
109 _ => Err(CastleError::syntax(
110 format!(
111 "Unexpected '{}' at line {}, column {}",
112 c,
113 start.line_number(),
114 start.column_number()
115 ),
116 start,
117 ))?,
118 };
119
120 return Ok(Some(token));
121 } else {
122 return Ok(None); }
124 }
125 }
126}
127
/// Reports whether the code point `ch` is whitespace that the tokenizer
/// skips between tokens.
///
/// Carriage return and line feed are intentionally absent from this set.
fn is_whitespace(ch: u32) -> bool {
    // Stand-alone whitespace code points: tab, vertical tab, form feed,
    // space, no-break space, Ogham space mark, narrow no-break space,
    // medium mathematical space, ideographic space, and U+FEFF.
    const SINGLE_CODE_POINTS: [u32; 10] = [
        0x0009, 0x000B, 0x000C, 0x0020, 0x00A0, 0x1680, 0x202F, 0x205F, 0x3000, 0xFEFF,
    ];

    // U+2000..=U+200A is the contiguous run of fixed-width spaces.
    SINGLE_CODE_POINTS.contains(&ch) || (0x2000..=0x200A).contains(&ch)
}
136
137
/// Input made up solely of newlines yields no token when line terminators
/// are skipped: the tokenizer runs straight through to end of input.
#[test]
fn skips_newlines_with_eof() {
    let source = "\n\n";
    let mut tokenizer = Tokenizer::new(source.as_bytes());

    let first = tokenizer.next(true).unwrap();

    assert_eq!(first, None);
}