1use std::{error::Error, fmt::Display};
4
5use keyword::{Keyword, KeywordRandomizer};
6use rand::RngCore;
7
8pub mod keyword;
9
/// A single lexical token plus the position bookkeeping recorded by the tokenizer.
///
/// The `'src` lifetime ties any borrowed text inside [`TokenTag`] to the source
/// string that was tokenized.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Token<'src> {
    /// What kind of token this is, including any payload it carries.
    pub tag: TokenTag<'src>,
    /// Value of the tokenizer's line counter when the token was emitted
    /// (the counter starts at 1).
    pub line: usize,
    /// Value of the tokenizer's column counter when the token was emitted. The
    /// counter is advanced while the token is being consumed, so this points
    /// towards the token's end rather than its start.
    pub col: usize,
    /// Number of characters the tokenizer counted for this token (`0` for `EOF`).
    pub len: usize,
}
22
/// The kind of a [`Token`], carrying a payload where the kind needs one.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum TokenTag<'src> {
    /// A word borrowed from the source text.
    Identifier(&'src str),
    /// A numeric literal, stored as an `f64`.
    Number(f64),
    /// The contents of a double-quoted string literal, without the surrounding quotes.
    String(&'src str),
    /// A keyword recognised by the `keyword` module.
    Keyword(Keyword),
    /// `(`
    OpenParen,
    /// `)`
    CloseParen,
    /// `[`
    OpenBracket,
    /// `]`
    CloseBracket,
    /// `;`
    Semicolon,
    /// `+`
    Plus,
    /// `++`
    PlusPlus,
    /// `+=`
    PlusEq,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `/`
    Slash,

    /// End-of-input marker; the tokenizer appends it after the last real token.
    EOF,
}
64
/// Error produced when the source text cannot be tokenized.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenizeError {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// Value of the tokenizer's line counter when the error was raised.
    pub line: usize,
    /// Value of the tokenizer's column counter when the error was raised.
    pub col: usize,
}
75
76impl TokenizeError {
77    pub fn new(msg: impl Into<String>, line: usize, col: usize) -> Self {
79        Self {
80            message: msg.into(),
81            line,
82            col,
83        }
84    }
85}
86
87impl Display for TokenizeError {
88    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
89        write!(
90            f,
91            "{} at line {}, col {}",
92            self.message, self.line, self.col
93        )
94    }
95}
96
// Marker impl: the default `Error` methods are sufficient, and this lets a
// `TokenizeError` be used wherever a `std::error::Error` is expected.
impl Error for TokenizeError {}
98
/// Result alias for the tokenizer API: the error type is always [`TokenizeError`].
pub type Result<T> = std::result::Result<T, TokenizeError>;
101
102pub trait Tokenizable {
104    fn tokenize<RNG: RngCore>(&self, rng: &mut RNG) -> Result<Vec<Token<'_>>>;
106    fn tokenze_no_rng(&self) -> Result<Vec<Token<'_>>> {
109        let mut rng = rand::rng();
110
111        self.tokenize(&mut rng)
112    }
113}
114
115impl<STR> Tokenizable for STR
116where
117    STR: AsRef<str>,
118{
119    fn tokenize<RNG: RngCore>(&self, rng: &mut RNG) -> Result<Vec<Token<'_>>> {
120        let keyword_gen = KeywordRandomizer::seeded_start(rng);
121        let mut peek = self.as_ref().chars().enumerate().peekable();
122        let stream = &self.as_ref();
123        let mut tokens = vec![];
124
125        let mut line = 1;
126        let mut col = 0;
127
128        while let Some((idx, ch)) = peek.next() {
129            let mut len = 1;
130            col += 1;
131
132            let tag = if let Some(kwrd) = keyword_gen.try_parse(stream, idx, &mut len) {
133                for _ in 0..len - 1 {
134                    peek.next();
135                    col += 1;
136                }
137
138                TokenTag::Keyword(kwrd)
139            } else {
140                match ch {
141                    '[' => TokenTag::OpenBracket,
142                    ']' => TokenTag::CloseBracket,
143
144                    '(' => TokenTag::OpenParen,
145                    ')' => TokenTag::CloseParen,
146
147                    ';' => TokenTag::Semicolon,
148                    '.' => TokenTag::Dot,
149
150                    '+' => match peek.peek() {
151                        Some((_, '+')) => {
152                            peek.next();
153                            TokenTag::PlusPlus
154                        }
155                        Some((_, '=')) => {
156                            peek.next();
157                            TokenTag::PlusEq
158                        }
159                        _ => TokenTag::Plus,
160                    },
161                    '-' => TokenTag::Minus,
162                    '*' => TokenTag::Star,
163                    '/' => match peek.peek() {
164                        Some((_, '/')) => {
165                            for (_, ch) in peek.by_ref() {
166                                if ch == '\n' {
167                                    break;
168                                }
169                            }
170                            continue;
171                        }
172                        Some((_, '*')) => {
173                            peek.next();
174                            while let Some((_, ch)) = peek.next() {
175                                if ch == '*' {
176                                    if let Some((_, '/')) = peek.peek() {
177                                        peek.next();
178                                        break;
179                                    }
180                                }
181                            }
182
183                            continue;
184                        }
185                        _ => TokenTag::Slash,
186                    },
187
188                    '\n' => {
189                        col = 0;
190                        line += 1;
191                        continue;
192                    }
193
194                    ',' => TokenTag::Comma,
195
196                    ws if ws.is_whitespace() => continue,
197
198                    num if num.is_numeric() => {
199                        let mut curr = String::new();
200                        curr.push(num);
201
202                        let mut dot = false;
203                        while let Some((_, next)) = peek.peek() {
204                            if next.is_numeric() {
205                                col += 1;
206                                len += 1;
207                                curr.push(peek.next().unwrap().1);
208                            } else if *next == '.' && !dot {
209                                col += 1;
210                                len += 1;
211                                curr.push(peek.next().unwrap().1);
212                                dot = true;
213                            } else {
214                                break;
215                            }
216                        }
217
218                        TokenTag::Number(curr.parse().unwrap())
221                    }
222
223                    '"' => {
224                        let mut idx2 = idx;
225                        let mut ended = false;
226
227                        for (_, c) in peek.by_ref() {
228                            if c != '"' {
229                                idx2 += 1;
230                                col += 1;
231                                len += 1;
232                            } else {
233                                ended = true;
234                                break;
235                            }
236                        }
237
238                        if ended {
239                            TokenTag::String(&self.as_ref()[idx + 1..=idx2])
240                        } else {
241                            return Err(TokenizeError::new(
242                                r#"Expected End of String: `"`"#,
243                                line,
244                                col,
245                            ));
246                        }
247                    }
248
249                    ch if ch.is_alphanumeric() || ch == '_' || ch == '.' => {
250                        let mut end = idx;
251
252                        while let Some((idx2, next)) = peek.peek() {
253                            if !(next.is_alphanumeric() || *next == '_' || *next == '.') {
254                                break;
255                            }
256
257                            end = *idx2;
258                            col += 1;
259                            len += 1;
260                            peek.next();
261                        }
262
263                        let word = &self.as_ref()[idx..=end];
264                        if let Err(Some(was)) = keyword_gen.try_from_str(word) {
265                            return Err(TokenizeError::new(
266                                format!("Invalid keyword `{word}`, did you mean `{was}`?"),
267                                line,
268                                col,
269                            ));
270                        } else {
271                            TokenTag::Identifier(word)
272                        }
273                    }
274                    bad => {
275                        return Err(TokenizeError::new(
276                            format!("Invalid token {bad}"),
277                            line,
278                            col,
279                        ));
280                    }
281                }
282            };
283
284            let next = Token {
285                tag,
286                col,
287                len,
288                line,
289            };
290
291            tokens.push(next);
292        }
293
294        tokens.push(Token {
295            line,
296            col,
297            len: 0,
298            tag: TokenTag::EOF,
299        });
300
301        Ok(tokens)
302    }
303}
304
#[cfg(test)]
mod tests {
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;

    use crate::tokenizer::keyword::Keyword;

    use super::{Token, TokenTag, Tokenizable};

    /// Reduces a token list to just its tags so tests can ignore positions.
    fn toks<'a>(tokens: Vec<Token<'a>>) -> Vec<TokenTag<'a>> {
        let mut tags = Vec::with_capacity(tokens.len());
        for Token { tag, .. } in tokens {
            tags.push(tag);
        }
        tags
    }

    /// Tokenizes a three-statement program with a fixed seed and checks the tag sequence.
    #[test]
    fn basic_tokenizer_test() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let source = r#"$ i = 0;
$ foo = 10;
fmt.Println("this is a little test");"#;

        let tokens = source.tokenize(&mut rng).expect("Valid tokenization");
        let actual = toks(tokens);

        let declaration = TokenTag::Keyword(Keyword::VariableDeclaration);
        let equal = TokenTag::Keyword(Keyword::Equal);
        let expected = [
            // $ i = 0;
            declaration,
            TokenTag::Identifier("i"),
            equal,
            TokenTag::Number(0.0),
            TokenTag::Semicolon,
            // $ foo = 10;
            declaration,
            TokenTag::Identifier("foo"),
            equal,
            TokenTag::Number(10.0),
            TokenTag::Semicolon,
            // fmt.Println("this is a little test");
            TokenTag::Keyword(Keyword::Print),
            TokenTag::OpenParen,
            TokenTag::String("this is a little test"),
            TokenTag::CloseParen,
            TokenTag::Semicolon,
            TokenTag::EOF,
        ];

        assert_eq!(actual, expected)
    }
}