maolang_core/
tokenizer.rs

//! Core tokenizer for the language; defines all variants a token may have

use std::{error::Error, fmt::Display};

use keyword::{Keyword, KeywordRandomizer};
use rand::RngCore;

pub mod keyword;

/// A token together with its location in the token stream
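///
/// # Example
///
/// A sketch (the crate name `maolang_core` is assumed, and the randomized
/// keyword spellings are assumed not to collide with numeric input); `col`
/// records the column of the token's last character:
///
/// ```
/// use maolang_core::tokenizer::Tokenizable;
///
/// let tokens = "7.5".tokenize_no_rng().expect("valid source");
/// assert_eq!((tokens[0].line, tokens[0].col, tokens[0].len), (1, 3, 3));
/// ```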
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Token<'src> {
    /// The token's tag
    pub tag: TokenTag<'src>,
    /// 1-based line number
    pub line: usize,
    /// Column within the line
    pub col: usize,
    /// Length of the token in characters
    pub len: usize,
}

/// All tags for a token
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum TokenTag<'src> {
    /// An identifier
    Identifier(&'src str),
    /// A numeric literal
    Number(f64),
    /// A string literal
    String(&'src str),
    /// A keyword
    Keyword(Keyword),
    /// (
    OpenParen,
    /// )
    CloseParen,
    /// [
    OpenBracket,
    /// ]
    CloseBracket,
    /// ;
    Semicolon,
    /// +
    Plus,
    /// ++
    PlusPlus,
    /// +=
    PlusEq,
    /// -
    Minus,
    /// *
    Star,
    /// ,
    Comma,
    /// .
    Dot,
    /// /
    Slash,

    /// End of file
    EOF,
}

/// An error during tokenization
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenizeError {
    /// The error message
    pub message: String,
    /// The line where the error occurred
    pub line: usize,
    /// The column within that line
    pub col: usize,
}

impl TokenizeError {
    /// Creates a new tokenization error with location context
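    ///
    /// # Example
    ///
    /// A minimal sketch (the crate name `maolang_core` is assumed):
    ///
    /// ```
    /// use maolang_core::tokenizer::TokenizeError;
    ///
    /// let err = TokenizeError::new("Unexpected character", 3, 7);
    /// assert_eq!(err.to_string(), "Unexpected character at line 3, col 7");
    /// ```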
    pub fn new(msg: impl Into<String>, line: usize, col: usize) -> Self {
        Self {
            message: msg.into(),
            line,
            col,
        }
    }
}

impl Display for TokenizeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{} at line {}, col {}",
            self.message, self.line, self.col
        )
    }
}

impl Error for TokenizeError {}

/// A result type for tokenization
pub type Result<T> = std::result::Result<T, TokenizeError>;

/// Any object that can be transformed into a token stream
pub trait Tokenizable {
    /// Creates a token stream from `self`; the returned tokens borrow from `self`
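    ///
    /// # Example
    ///
    /// A sketch mirroring the tests below (the crate name `maolang_core` is
    /// assumed and `rand_chacha` must be available); the input avoids
    /// keywords, whose spellings depend on the RNG seed:
    ///
    /// ```
    /// use maolang_core::tokenizer::{Tokenizable, TokenTag};
    /// use rand::SeedableRng;
    /// use rand_chacha::ChaCha8Rng;
    ///
    /// let mut rng = ChaCha8Rng::seed_from_u64(42);
    /// let tokens = "1 2.5;".tokenize(&mut rng).expect("valid source");
    /// let tags: Vec<_> = tokens.iter().map(|t| t.tag).collect();
    /// assert_eq!(
    ///     tags,
    ///     [
    ///         TokenTag::Number(1.0),
    ///         TokenTag::Number(2.5),
    ///         TokenTag::Semicolon,
    ///         TokenTag::EOF,
    ///     ]
    /// );
    /// ```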
    fn tokenize<RNG: RngCore>(&self, rng: &mut RNG) -> Result<Vec<Token<'_>>>;
    /// Creates a token stream from `self` without a caller-supplied RNG,
    /// falling back to the thread-local RNG
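    ///
    /// # Example
    ///
    /// A sketch (the crate name `maolang_core` is assumed); with a random
    /// seed the keyword spellings are unpredictable, so only numeric input
    /// is asserted on:
    ///
    /// ```
    /// use maolang_core::tokenizer::{Tokenizable, TokenTag};
    ///
    /// let tokens = "3.14".tokenize_no_rng().expect("valid source");
    /// assert_eq!(tokens[0].tag, TokenTag::Number(3.14));
    /// ```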
    fn tokenize_no_rng(&self) -> Result<Vec<Token<'_>>> {
        let mut rng = rand::rng();

        self.tokenize(&mut rng)
    }
}

impl<STR> Tokenizable for STR
where
    STR: AsRef<str>,
{
    fn tokenize<RNG: RngCore>(&self, rng: &mut RNG) -> Result<Vec<Token<'_>>> {
        let keyword_gen = KeywordRandomizer::seeded_start(rng);
        // NOTE: char positions from `enumerate` are used as byte offsets when
        // slicing below, which is only valid for ASCII source text.
        let mut peek = self.as_ref().chars().enumerate().peekable();
        let stream = self.as_ref();
        let mut tokens = vec![];

        let mut line = 1;
        let mut col = 0;

        while let Some((idx, ch)) = peek.next() {
            let mut len = 1;
            col += 1;

            let tag = match keyword_gen.try_parse(stream, idx, &mut len) {
                Ok(kwrd) => {
                    for _ in 0..len - 1 {
                        peek.next();
                        col += 1;
                    }

                    TokenTag::Keyword(kwrd)
                }
                Err(Some(was)) => {
                    let word = &self.as_ref()[idx..idx + len];
                    return Err(TokenizeError::new(
                        format!("Invalid keyword `{word}`, did you mean `{was}`?"),
                        line,
                        col,
                    ));
                }
                _ => {
                    match ch {
                        '[' => TokenTag::OpenBracket,
                        ']' => TokenTag::CloseBracket,

                        '(' => TokenTag::OpenParen,
                        ')' => TokenTag::CloseParen,

                        ';' => TokenTag::Semicolon,
                        '.' => TokenTag::Dot,

                        '+' => match peek.peek() {
                            Some((_, '+')) => {
                                peek.next();
                                col += 1;
                                len += 1;
                                TokenTag::PlusPlus
                            }
                            Some((_, '=')) => {
                                peek.next();
                                col += 1;
                                len += 1;
                                TokenTag::PlusEq
                            }
                            _ => TokenTag::Plus,
                        },
                        '-' => TokenTag::Minus,
                        '*' => TokenTag::Star,
                        '/' => match peek.peek() {
                            // Line comment: consume everything up to and
                            // including the newline, then track the new line.
                            Some((_, '/')) => {
                                for (_, ch) in peek.by_ref() {
                                    if ch == '\n' {
                                        line += 1;
                                        col = 0;
                                        break;
                                    }
                                }
                                continue;
                            }
                            // Block comment: consume until the closing `*/`,
                            // keeping line and column positions up to date.
                            Some((_, '*')) => {
                                peek.next();
                                col += 1;
                                while let Some((_, ch)) = peek.next() {
                                    col += 1;
                                    if ch == '\n' {
                                        line += 1;
                                        col = 0;
                                    } else if ch == '*' {
                                        if let Some((_, '/')) = peek.peek() {
                                            peek.next();
                                            col += 1;
                                            break;
                                        }
                                    }
                                }

                                continue;
                            }
                            _ => TokenTag::Slash,
                        },

                        '\n' => {
                            col = 0;
                            line += 1;
                            continue;
                        }

                        ',' => TokenTag::Comma,

                        ws if ws.is_whitespace() => continue,

                        num if num.is_ascii_digit() => {
                            let mut curr = String::new();
                            curr.push(num);

                            let mut dot = false;
                            while let Some((_, next)) = peek.peek() {
                                if next.is_ascii_digit() {
                                    col += 1;
                                    len += 1;
                                    curr.push(peek.next().unwrap().1);
                                } else if *next == '.' && !dot {
                                    col += 1;
                                    len += 1;
                                    curr.push(peek.next().unwrap().1);
                                    dot = true;
                                } else {
                                    break;
                                }
                            }

                            // Unwrap safety: only ASCII digits and at most one
                            // `.` are pushed while building the number, so
                            // parsing as `f64` cannot fail.
                            TokenTag::Number(curr.parse().unwrap())
                        }

                        '"' => {
                            let mut idx2 = idx;
                            let mut ended = false;

                            // Note: escape sequences are not handled, so a
                            // `"` always terminates the literal.
                            for (_, c) in peek.by_ref() {
                                if c != '"' {
                                    idx2 += 1;
                                    col += 1;
                                    len += 1;
                                } else {
                                    ended = true;
                                    break;
                                }
                            }

                            if ended {
                                TokenTag::String(&self.as_ref()[idx + 1..=idx2])
                            } else {
                                return Err(TokenizeError::new(
                                    r#"Unterminated string: expected closing `"`"#,
                                    line,
                                    col,
                                ));
                            }
                        }

                        // ASCII-only identifiers, so that the char positions
                        // used for slicing stay valid byte offsets.
                        ch if ch.is_ascii_alphanumeric() || ch == '_' => {
                            let mut end = idx;

                            while let Some((idx2, next)) = peek.peek() {
                                if !(next.is_ascii_alphanumeric() || *next == '_') {
                                    break;
                                }

                                end = *idx2;
                                col += 1;
                                len += 1;
                                peek.next();
                            }

                            let word = &self.as_ref()[idx..=end];
                            TokenTag::Identifier(word)
                        }
                        bad => {
                            return Err(TokenizeError::new(
                                format!("Invalid token `{bad}`"),
                                line,
                                col,
                            ));
                        }
                    }
                }
            };

            let next = Token {
                tag,
                col,
                len,
                line,
            };

            tokens.push(next);
        }

        tokens.push(Token {
            line,
            col,
            len: 0,
            tag: TokenTag::EOF,
        });

        Ok(tokens)
    }
}

#[cfg(test)]
mod tests {
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;

    use crate::tokenizer::keyword::Keyword;

    use super::{Token, TokenTag, Tokenizable};

    fn toks<'a>(tokens: Vec<Token<'a>>) -> Vec<TokenTag<'a>> {
        tokens.into_iter().map(|token| token.tag).collect()
    }

    #[test]
    fn basic_tokenizer_test() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let stream = r#"$ i = 0;
$ foo = 10;
println ("this is a little test");"#
            .tokenize(&mut rng)
            .expect("Valid tokenization");

        let toks = toks(stream);
        let expected = [
            TokenTag::Keyword(Keyword::VariableDeclaration),
            TokenTag::Identifier("i"),
            TokenTag::Keyword(Keyword::Equal),
            TokenTag::Number(0.0),
            TokenTag::Semicolon,
            TokenTag::Keyword(Keyword::VariableDeclaration),
            TokenTag::Identifier("foo"),
            TokenTag::Keyword(Keyword::Equal),
            TokenTag::Number(10.0),
            TokenTag::Semicolon,
            TokenTag::Keyword(Keyword::Print),
            TokenTag::OpenParen,
            TokenTag::String("this is a little test"),
            TokenTag::CloseParen,
            TokenTag::Semicolon,
            TokenTag::EOF,
        ];

        assert_eq!(toks, expected)
    }
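
    // A supplementary sketch (assumes the seeded keyword spellings do not
    // collide with `/`, digits, or `;`): comments produce no tokens, and
    // decimal literals keep their fractional part.
    #[test]
    fn comments_and_numbers_test() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let stream = r#"// a line comment
1 /* a block comment */ 2.5;"#
            .tokenize(&mut rng)
            .expect("Valid tokenization");

        let toks = toks(stream);
        let expected = [
            TokenTag::Number(1.0),
            TokenTag::Number(2.5),
            TokenTag::Semicolon,
            TokenTag::EOF,
        ];

        assert_eq!(toks, expected)
    }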
}