use std::{error::Error, fmt::Display};

use keyword::{Keyword, KeywordRandomizer};
use rand::RngCore;

pub mod keyword;
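/// A single token produced by the tokenizer, together with its position in the source.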
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Token<'src> {
    pub tag: TokenTag<'src>,
    pub line: usize,
    pub col: usize,
    pub len: usize,
}
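/// The kinds of tokens the tokenizer can produce.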
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum TokenTag<'src> {
    Identifier(&'src str),
    Number(f64),
    String(&'src str),
    Keyword(Keyword),
    OpenParen,
    CloseParen,
    OpenBracket,
    CloseBracket,
    Semicolon,
    Plus,
    PlusPlus,
    PlusEq,
    Minus,
    Star,
    Comma,
    Dot,
    Slash,

    EOF,
}
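/// An error raised during tokenization, pointing at the offending line and column.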
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenizeError {
    pub message: String,
    pub line: usize,
    pub col: usize,
}

impl TokenizeError {
    pub fn new(msg: impl Into<String>, line: usize, col: usize) -> Self {
        Self {
            message: msg.into(),
            line,
            col,
        }
    }
}

impl Display for TokenizeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{} at line {}, col {}",
            self.message, self.line, self.col
        )
    }
}

impl Error for TokenizeError {}
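/// Convenience alias for results that fail with a [`TokenizeError`].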
pub type Result<T> = std::result::Result<T, TokenizeError>;
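/// Types that can be tokenized into a stream of [`Token`]s.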
pub trait Tokenizable {
    /// Tokenizes the source, using `rng` to seed the keyword randomizer.
    fn tokenize<RNG: RngCore>(&self, rng: &mut RNG) -> Result<Vec<Token<'_>>>;

    /// Convenience wrapper that tokenizes with the thread-local RNG.
    fn tokenize_no_rng(&self) -> Result<Vec<Token<'_>>> {
        let mut rng = rand::rng();

        self.tokenize(&mut rng)
    }
}
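// Blanket implementation: anything that can be viewed as a `&str` is tokenizable.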
impl<STR> Tokenizable for STR
where
    STR: AsRef<str>,
{
    fn tokenize<RNG: RngCore>(&self, rng: &mut RNG) -> Result<Vec<Token<'_>>> {
        let keyword_gen = KeywordRandomizer::seeded_start(rng);
        let mut peek = self.as_ref().chars().enumerate().peekable();
        let stream = &self.as_ref();
        let mut tokens = vec![];
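        // Track 1-based line/column positions for diagnostics.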
        let mut line = 1;
        let mut col = 0;

        while let Some((idx, ch)) = peek.next() {
            let mut len = 1;
            col += 1;
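            // Try to match a keyword at this position first; `try_parse` reports the
            // keyword's length back through `len` so the iterator can be advanced past it.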
            let tag = match keyword_gen.try_parse(stream, idx, &mut len) {
                Ok(kwrd) => {
                    for _ in 0..len - 1 {
                        peek.next();
                        col += 1;
                    }

                    TokenTag::Keyword(kwrd)
                }
                Err(Some(was)) => {
                    let word = &self.as_ref()[idx..idx + len];
                    return Err(TokenizeError::new(
                        format!("Invalid keyword `{word}`, did you mean `{was}`?"),
                        line,
                        col,
                    ));
                }
                _ => {
                    match ch {
                        '[' => TokenTag::OpenBracket,
                        ']' => TokenTag::CloseBracket,

                        '(' => TokenTag::OpenParen,
                        ')' => TokenTag::CloseParen,

                        ';' => TokenTag::Semicolon,
                        '.' => TokenTag::Dot,
                        '+' => match peek.peek() {
                            Some((_, '+')) => {
                                // Consume the second `+` and account for it.
                                peek.next();
                                col += 1;
                                len += 1;
                                TokenTag::PlusPlus
                            }
                            Some((_, '=')) => {
                                // Consume the `=` and account for it.
                                peek.next();
                                col += 1;
                                len += 1;
                                TokenTag::PlusEq
                            }
                            _ => TokenTag::Plus,
                        },
                        '-' => TokenTag::Minus,
                        '*' => TokenTag::Star,
                        '/' => match peek.peek() {
                            Some((_, '/')) => {
                                // Line comment: skip to the end of the line.
                                for (_, ch) in peek.by_ref() {
                                    if ch == '\n' {
                                        line += 1;
                                        col = 0;
                                        break;
                                    }
                                }
                                continue;
                            }
                            Some((_, '*')) => {
                                // Block comment: skip until the closing `*/`,
                                // keeping the line counter in sync.
                                peek.next();
                                while let Some((_, ch)) = peek.next() {
                                    if ch == '\n' {
                                        line += 1;
                                        col = 0;
                                    } else if ch == '*' {
                                        if let Some((_, '/')) = peek.peek() {
                                            peek.next();
                                            break;
                                        }
                                    }
                                }

                                continue;
                            }
                            _ => TokenTag::Slash,
                        },

                        '\n' => {
                            col = 0;
                            line += 1;
                            continue;
                        }

                        ',' => TokenTag::Comma,

                        ws if ws.is_whitespace() => continue,
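                        // Number literal: consume digits and at most one decimal point.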
                        num if num.is_numeric() => {
                            let mut curr = String::new();
                            curr.push(num);

                            let mut dot = false;
                            while let Some((_, next)) = peek.peek() {
                                if next.is_numeric() {
                                    col += 1;
                                    len += 1;
                                    curr.push(peek.next().unwrap().1);
                                } else if *next == '.' && !dot {
                                    col += 1;
                                    len += 1;
                                    curr.push(peek.next().unwrap().1);
                                    dot = true;
                                } else {
                                    break;
                                }
                            }

                            TokenTag::Number(curr.parse().unwrap())
                        }
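                        // String literal: everything up to the closing `"` (quotes excluded).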
233 '"' => {
234 let mut idx2 = idx;
235 let mut ended = false;
236
237 for (_, c) in peek.by_ref() {
238 if c != '"' {
239 idx2 += 1;
240 col += 1;
241 len += 1;
242 } else {
243 ended = true;
244 break;
245 }
246 }
247
248 if ended {
249 TokenTag::String(&self.as_ref()[idx + 1..=idx2])
250 } else {
251 return Err(TokenizeError::new(
252 r#"Expected End of String: `"`"#,
253 line,
254 col,
255 ));
256 }
257 }
258
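                        // Identifier: a run of alphanumeric characters and underscores.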
                        ch if ch.is_alphanumeric() || ch == '_' => {
                            let mut end = idx;

                            while let Some((idx2, next)) = peek.peek() {
                                if !(next.is_alphanumeric() || *next == '_') {
                                    break;
                                }

                                end = *idx2;
                                col += 1;
                                len += 1;
                                peek.next();
                            }

                            let word = &self.as_ref()[idx..=end];
                            TokenTag::Identifier(word)
                        }
                        bad => {
                            return Err(TokenizeError::new(
                                format!("Invalid token {bad}"),
                                line,
                                col,
                            ));
                        }
                    }
                }
            };
            let next = Token {
                tag,
                col,
                len,
                line,
            };

            tokens.push(next);
        }
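        // Always terminate the stream with an EOF token.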
        tokens.push(Token {
            line,
            col,
            len: 0,
            tag: TokenTag::EOF,
        });

        Ok(tokens)
    }
}
#[cfg(test)]
mod tests {
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;

    use crate::tokenizer::keyword::Keyword;

    use super::{Token, TokenTag, Tokenizable};
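    /// Drops position information, keeping only the token tags for comparison.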
    fn toks<'a>(tokens: Vec<Token<'a>>) -> Vec<TokenTag<'a>> {
        tokens.into_iter().map(|token| token.tag).collect()
    }

    #[test]
    fn basic_tokenizer_test() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let stream = r#"$ i = 0;
$ foo = 10;
println ("this is a little test");"#
            .tokenize(&mut rng)
            .expect("Valid tokenization");

        let toks = toks(stream);
        let expected = [
            TokenTag::Keyword(Keyword::VariableDeclaration),
            TokenTag::Identifier("i"),
            TokenTag::Keyword(Keyword::Equal),
            TokenTag::Number(0.0),
            TokenTag::Semicolon,
            TokenTag::Keyword(Keyword::VariableDeclaration),
            TokenTag::Identifier("foo"),
            TokenTag::Keyword(Keyword::Equal),
            TokenTag::Number(10.0),
            TokenTag::Semicolon,
            TokenTag::Keyword(Keyword::Print),
            TokenTag::OpenParen,
            TokenTag::String("this is a little test"),
            TokenTag::CloseParen,
            TokenTag::Semicolon,
            TokenTag::EOF,
        ];

        assert_eq!(toks, expected)
    }
}