use std::{error::Error, fmt::Display};

use keyword::{Keyword, KeywordRandomizer};
use rand::RngCore;

pub mod keyword;

/// A single token lexed from source text, with its position attached.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Token<'src> {
    /// The kind of token, including any payload it carries.
    pub tag: TokenTag<'src>,
    /// 1-based source line of the token.
    pub line: usize,
    /// Source column of the token.
    pub col: usize,
    /// Length of the token in characters.
    pub len: usize,
}

/// The kind of a [`Token`], with payloads borrowed from the source text.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum TokenTag<'src> {
    /// A name, such as a variable or function reference.
    Identifier(&'src str),
    /// A numeric literal; all numbers are lexed as `f64`.
    Number(f64),
    /// A string literal, without the surrounding quotes.
    String(&'src str),
    /// A keyword recognized by the seeded [`KeywordRandomizer`].
    Keyword(Keyword),

    // Delimiters and operators.
    OpenParen,
    CloseParen,
    OpenBracket,
    CloseBracket,
    Semicolon,
    Plus,
    PlusPlus,
    PlusEq,
    Minus,
    Star,
    Comma,
    Dot,
    Slash,

    /// End of input; always the final token in the stream.
    EOF,
}

/// An error produced during tokenization, carrying the source position at
/// which lexing failed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenizeError {
    /// Human-readable description of the failure.
    pub message: String,
    /// 1-based line of the failure.
    pub line: usize,
    /// Column of the failure within its line.
    pub col: usize,
}

impl TokenizeError {
    /// Builds an error from a message and a source position.
    pub fn new(msg: impl Into<String>, line: usize, col: usize) -> Self {
        Self {
            message: msg.into(),
            line,
            col,
        }
    }
}

impl Display for TokenizeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{} at line {}, col {}",
            self.message, self.line, self.col
        )
    }
}

impl Error for TokenizeError {}

/// Convenience alias for results produced by the tokenizer.
pub type Result<T> = std::result::Result<T, TokenizeError>;

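/// Source text that can be lexed into a stream of [`Token`]s.
///
/// A minimal usage sketch; the crate path in the example is an assumption,
/// not part of this file, so the doc test is marked `ignore`:
///
/// ```ignore
/// use tokenizer::Tokenizable;
///
/// // Uses the thread-local RNG; pass your own RNG to `tokenize` instead
/// // for reproducible keyword assignment.
/// let tokens = "$ i = 0;".tokenize_no_rng().expect("tokenizes");
/// assert!(!tokens.is_empty());
/// ```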
pub trait Tokenizable {
    /// Tokenizes `self`, using `rng` to seed the keyword randomizer, so
    /// equal seeds produce equal keyword assignments.
    fn tokenize<RNG: RngCore>(&self, rng: &mut RNG) -> Result<Vec<Token<'_>>>;

    /// Tokenizes `self` with the thread-local RNG; keyword assignment will
    /// differ from run to run.
    fn tokenize_no_rng(&self) -> Result<Vec<Token<'_>>> {
        let mut rng = rand::rng();

        self.tokenize(&mut rng)
    }
}

impl<STR> Tokenizable for STR
where
    STR: AsRef<str>,
{
    fn tokenize<RNG: RngCore>(&self, rng: &mut RNG) -> Result<Vec<Token<'_>>> {
        let keyword_gen = KeywordRandomizer::seeded_start(rng);
        let stream = self.as_ref();
        // `char_indices` yields byte offsets, which the slicing below
        // requires; `chars().enumerate()` would yield char counts and
        // mis-slice any non-ASCII input.
        let mut peek = stream.char_indices().peekable();
        let mut tokens = vec![];

        let mut line = 1;
        let mut col = 0;

        while let Some((idx, ch)) = peek.next() {
            let mut len = 1;
            col += 1;

            // Randomized keywords get first claim on the input; on a match,
            // `len` holds the keyword's length, so skip its remaining chars.
            let tag = if let Some(kwrd) = keyword_gen.try_parse(stream, idx, &mut len) {
                for _ in 0..len - 1 {
                    peek.next();
                    col += 1;
                }

                TokenTag::Keyword(kwrd)
            } else {
                match ch {
                    '[' => TokenTag::OpenBracket,
                    ']' => TokenTag::CloseBracket,

                    '(' => TokenTag::OpenParen,
                    ')' => TokenTag::CloseParen,

                    ';' => TokenTag::Semicolon,
                    '.' => TokenTag::Dot,

                    // Two-character operators need one char of lookahead.
                    '+' => match peek.peek() {
                        Some((_, '+')) => {
                            peek.next();
                            TokenTag::PlusPlus
                        }
                        Some((_, '=')) => {
                            peek.next();
                            TokenTag::PlusEq
                        }
                        _ => TokenTag::Plus,
                    },
                    '-' => TokenTag::Minus,
                    '*' => TokenTag::Star,
                    '/' => match peek.peek() {
                        Some((_, '/')) => {
                            // Line comment: skip to end of line, keeping the
                            // line/col counters in sync with the newline.
                            for (_, c) in peek.by_ref() {
                                if c == '\n' {
                                    line += 1;
                                    col = 0;
                                    break;
                                }
                            }
                            continue;
                        }
                        Some((_, '*')) => {
                            // Block comment: skip until `*/`, still counting
                            // any newlines passed over.
                            peek.next();
                            while let Some((_, c)) = peek.next() {
                                if c == '\n' {
                                    line += 1;
                                    col = 0;
                                } else if c == '*' {
                                    if let Some((_, '/')) = peek.peek() {
                                        peek.next();
                                        break;
                                    }
                                }
                            }

                            continue;
                        }
                        _ => TokenTag::Slash,
                    },

                    '\n' => {
                        col = 0;
                        line += 1;
                        continue;
                    }

                    ',' => TokenTag::Comma,

                    ws if ws.is_whitespace() => continue,

                    // ASCII digits with at most one decimal point; using
                    // `is_ascii_digit` (not `is_numeric`) guarantees the
                    // final `parse` below cannot fail.
                    num if num.is_ascii_digit() => {
                        let mut curr = String::new();
                        curr.push(num);

                        let mut dot = false;
                        while let Some((_, next)) = peek.peek() {
                            if next.is_ascii_digit() {
                                col += 1;
                                len += 1;
                                curr.push(peek.next().unwrap().1);
                            } else if *next == '.' && !dot {
                                col += 1;
                                len += 1;
                                curr.push(peek.next().unwrap().1);
                                dot = true;
                            } else {
                                break;
                            }
                        }

                        TokenTag::Number(curr.parse().unwrap())
                    }

                    '"' => {
                        // Track the byte offset just past the last consumed
                        // char so the slice stays on UTF-8 boundaries.
                        let mut end = idx + 1;
                        let mut ended = false;

                        for (idx2, c) in peek.by_ref() {
                            if c == '"' {
                                ended = true;
                                break;
                            }

                            end = idx2 + c.len_utf8();
                            len += 1;
                            if c == '\n' {
                                line += 1;
                                col = 0;
                            } else {
                                col += 1;
                            }
                        }

                        if ended {
                            TokenTag::String(&stream[idx + 1..end])
                        } else {
                            return Err(TokenizeError::new(
                                r#"Expected End of String: `"`"#,
                                line,
                                col,
                            ));
                        }
                    }

                    ch if ch.is_alphanumeric() || ch == '_' || ch == '.' => {
                        // Identifiers may contain dots (e.g. `fmt.Println`);
                        // `end` is the byte offset just past the last char.
                        let mut end = idx + ch.len_utf8();

                        while let Some((idx2, next)) = peek.peek() {
                            if !(next.is_alphanumeric() || *next == '_' || *next == '.') {
                                break;
                            }

                            end = *idx2 + next.len_utf8();
                            col += 1;
                            len += 1;
                            peek.next();
                        }

                        let word = &stream[idx..end];
                        if let Err(Some(was)) = keyword_gen.try_from_str(word) {
                            return Err(TokenizeError::new(
                                format!("Invalid keyword `{word}`, did you mean `{was}`?"),
                                line,
                                col,
                            ));
                        } else {
                            TokenTag::Identifier(word)
                        }
                    }
                    bad => {
                        return Err(TokenizeError::new(
                            format!("Invalid token `{bad}`"),
                            line,
                            col,
                        ));
                    }
                }
            };

            let next = Token {
                tag,
                col,
                len,
                line,
            };

            tokens.push(next);
        }

        // Callers can rely on a trailing EOF token.
        tokens.push(Token {
            line,
            col,
            len: 0,
            tag: TokenTag::EOF,
        });

        Ok(tokens)
    }
}

#[cfg(test)]
mod tests {
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;

    use crate::tokenizer::keyword::Keyword;

    use super::{Token, TokenTag, Tokenizable};

    /// Strips positions, keeping only the tags, for easy comparison.
    fn toks<'a>(tokens: Vec<Token<'a>>) -> Vec<TokenTag<'a>> {
        tokens.into_iter().map(|token| token.tag).collect()
    }

    #[test]
    fn basic_tokenizer_test() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let stream = r#"$ i = 0;
$ foo = 10;
fmt.Println("this is a little test");"#
            .tokenize(&mut rng)
            .expect("Valid tokenization");

        let toks = toks(stream);
        let expected = [
            TokenTag::Keyword(Keyword::VariableDeclaration),
            TokenTag::Identifier("i"),
            TokenTag::Keyword(Keyword::Equal),
            TokenTag::Number(0.0),
            TokenTag::Semicolon,
            TokenTag::Keyword(Keyword::VariableDeclaration),
            TokenTag::Identifier("foo"),
            TokenTag::Keyword(Keyword::Equal),
            TokenTag::Number(10.0),
            TokenTag::Semicolon,
            TokenTag::Keyword(Keyword::Print),
            TokenTag::OpenParen,
            TokenTag::String("this is a little test"),
            TokenTag::CloseParen,
            TokenTag::Semicolon,
            TokenTag::EOF,
        ];

        assert_eq!(toks, expected);
    }
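
    // A sketch of an error-path test (not in the original suite): an
    // unterminated string literal should surface a `TokenizeError` rather
    // than a panic. Like the test above, this assumes the seed-42 keyword
    // assignment treats `$` and `=` as keywords.
    #[test]
    fn unterminated_string_test() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let err = r#"$ s = "oops;"#
            .tokenize(&mut rng)
            .expect_err("unterminated strings must not tokenize");

        assert_eq!(err.line, 1);
        assert!(err.message.contains("Expected End of String"));
    }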
}