maolang_core/
tokenizer.rs

//! Core tokenizer for the language; defines every variant a token may have.

use std::{error::Error, fmt::Display};

use keyword::{Keyword, KeywordRandomizer};
use rand::RngCore;

pub mod keyword;

/// A token together with its location in the token stream
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Token<'src> {
    /// What kind of token this is
    pub tag: TokenTag<'src>,
    /// Line number (1-based)
    pub line: usize,
    /// Column within the line (1-based)
    pub col: usize,
    /// Length of the token in characters
    pub len: usize,
}

/// All tags for a token
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum TokenTag<'src> {
    /// An identifier
    Identifier(&'src str),
    /// A numeric literal
    Number(f64),
    /// A string literal
    String(&'src str),
    /// A keyword
    Keyword(Keyword),
    /// (
    OpenParen,
    /// )
    CloseParen,
    /// [
    OpenBracket,
    /// ]
    CloseBracket,
    /// ;
    Semicolon,
    /// +
    Plus,
    /// ++
    PlusPlus,
    /// +=
    PlusEq,
    /// -
    Minus,
    /// *
    Star,
    /// ,
    Comma,
    /// .
    Dot,
    /// /
    Slash,

    /// End of file
    EOF,
}

/// An error during tokenization
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenizeError {
    /// The error message
    pub message: String,
    /// The line where the error occurred
    pub line: usize,
    /// The column within that line
    pub col: usize,
}

impl TokenizeError {
    /// Creates a new tokenization error with location context
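    ///
    /// # Examples
    ///
    /// A small sketch; the `maolang_core::tokenizer` path is assumed from
    /// this file's location in the crate.
    ///
    /// ```
    /// use maolang_core::tokenizer::TokenizeError;
    ///
    /// let err = TokenizeError::new("Invalid token `@`", 3, 7);
    /// assert_eq!(err.to_string(), "Invalid token `@` at line 3, col 7");
    /// ```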
    pub fn new(msg: impl Into<String>, line: usize, col: usize) -> Self {
        Self {
            message: msg.into(),
            line,
            col,
        }
    }
}

impl Display for TokenizeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{} at line {}, col {}",
            self.message, self.line, self.col
        )
    }
}

impl Error for TokenizeError {}

/// A result type for tokenization
pub type Result<T> = std::result::Result<T, TokenizeError>;

/// Any object that can be transformed into a token stream
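///
/// # Examples
///
/// A minimal usage sketch, seeded the same way as the tests at the bottom of
/// this file (assumes `rand_chacha` is available; the exact token stream
/// depends on the randomized keyword set, so this example is not run):
///
/// ```ignore
/// use rand::SeedableRng;
/// use rand_chacha::ChaCha8Rng;
///
/// let mut rng = ChaCha8Rng::seed_from_u64(42);
/// let tokens = "$ i = 0;".tokenize(&mut rng).expect("valid source");
/// assert!(!tokens.is_empty());
/// ```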
pub trait Tokenizable {
    /// Creates a token stream from `self`; the returned tokens borrow from
    /// `self` for its lifetime
    fn tokenize<RNG: RngCore>(&self, rng: &mut RNG) -> Result<Vec<Token<'_>>>;
    /// Creates a token stream from `self` using a freshly created RNG
    fn tokenize_no_rng(&self) -> Result<Vec<Token<'_>>> {
        let mut rng = rand::rng();

        self.tokenize(&mut rng)
    }
}

impl<STR> Tokenizable for STR
where
    STR: AsRef<str>,
{
    fn tokenize<RNG: RngCore>(&self, rng: &mut RNG) -> Result<Vec<Token<'_>>> {
        let keyword_gen = KeywordRandomizer::seeded_start(rng);
        // `char_indices` yields byte offsets (not char counts), so the string
        // and identifier slices below stay in bounds for multi-byte input;
        // `try_parse` is handed the same byte offset.
        let mut peek = self.as_ref().char_indices().peekable();
        let stream = self.as_ref();
        let mut tokens = vec![];

        let mut line = 1;
        let mut col = 0;

        while let Some((idx, ch)) = peek.next() {
            let mut len = 1;
            col += 1;

            let tag = if let Some(kwrd) = keyword_gen.try_parse(stream, idx, &mut len) {
                // `try_parse` reports the keyword's length; skip the rest of
                // its characters so they aren't tokenized twice.
                for _ in 0..len - 1 {
                    peek.next();
                    col += 1;
                }

                TokenTag::Keyword(kwrd)
            } else {
                match ch {
                    '[' => TokenTag::OpenBracket,
                    ']' => TokenTag::CloseBracket,

                    '(' => TokenTag::OpenParen,
                    ')' => TokenTag::CloseParen,

                    ';' => TokenTag::Semicolon,
                    '.' => TokenTag::Dot,

                    '+' => match peek.peek() {
                        Some((_, '+')) => {
                            peek.next();
                            col += 1;
                            len += 1;
                            TokenTag::PlusPlus
                        }
                        Some((_, '=')) => {
                            peek.next();
                            col += 1;
                            len += 1;
                            TokenTag::PlusEq
                        }
                        _ => TokenTag::Plus,
                    },
                    '-' => TokenTag::Minus,
                    '*' => TokenTag::Star,
                    '/' => match peek.peek() {
                        Some((_, '/')) => {
                            // Line comment: consume through the end of the
                            // line, keeping the position counters in sync.
                            for (_, ch) in peek.by_ref() {
                                if ch == '\n' {
                                    line += 1;
                                    col = 0;
                                    break;
                                }
                            }
                            continue;
                        }
                        Some((_, '*')) => {
                            // Block comment: consume until `*/`, tracking any
                            // newlines passed over along the way.
                            peek.next();
                            while let Some((_, ch)) = peek.next() {
                                match ch {
                                    '\n' => {
                                        line += 1;
                                        col = 0;
                                    }
                                    '*' => {
                                        if let Some((_, '/')) = peek.peek() {
                                            peek.next();
                                            break;
                                        }
                                    }
                                    _ => {}
                                }
                            }

                            continue;
                        }
                        _ => TokenTag::Slash,
                    },

                    '\n' => {
                        col = 0;
                        line += 1;
                        continue;
                    }

                    ',' => TokenTag::Comma,

                    ws if ws.is_whitespace() => continue,

                    num if num.is_ascii_digit() => {
                        let mut curr = String::new();
                        curr.push(num);

                        let mut dot = false;
                        while let Some((_, next)) = peek.peek() {
                            if next.is_ascii_digit() {
                                col += 1;
                                len += 1;
                                curr.push(peek.next().unwrap().1);
                            } else if *next == '.' && !dot {
                                col += 1;
                                len += 1;
                                curr.push(peek.next().unwrap().1);
                                dot = true;
                            } else {
                                break;
                            }
                        }

                        // Unwrap safety: the number is built from ASCII digits
                        // and at most one `.`, which always parses as `f64`
                        TokenTag::Number(curr.parse().unwrap())
                    }

                    '"' => {
                        // Byte offset of the closing quote, if we find one;
                        // the literal's contents sit just inside the quotes
                        let mut end = None;

                        for (idx2, c) in peek.by_ref() {
                            if c == '"' {
                                col += 1;
                                len += 1;
                                end = Some(idx2);
                                break;
                            }

                            if c == '\n' {
                                line += 1;
                                col = 0;
                            } else {
                                col += 1;
                            }
                            len += 1;
                        }

                        match end {
                            Some(end) => TokenTag::String(&stream[idx + 1..end]),
                            None => {
                                return Err(TokenizeError::new(
                                    r#"Expected end of string: `"`"#,
                                    line,
                                    col,
                                ));
                            }
                        }
                    }

                    ch if ch.is_alphanumeric() || ch == '_' || ch == '.' => {
                        // The word ends at the byte offset of the first
                        // non-identifier character (or at the end of input)
                        let mut end = stream.len();

                        while let Some((idx2, next)) = peek.peek() {
                            if !(next.is_alphanumeric() || *next == '_' || *next == '.') {
                                end = *idx2;
                                break;
                            }

                            col += 1;
                            len += 1;
                            peek.next();
                        }

                        let word = &stream[idx..end];
                        if let Err(Some(was)) = keyword_gen.try_from_str(word) {
                            return Err(TokenizeError::new(
                                format!("Invalid keyword `{word}`, did you mean `{was}`?"),
                                line,
                                col,
                            ));
                        } else {
                            TokenTag::Identifier(word)
                        }
                    }
                    bad => {
                        return Err(TokenizeError::new(
                            format!("Invalid token `{bad}`"),
                            line,
                            col,
                        ));
                    }
                }
            };

            let next = Token {
                tag,
                col,
                len,
                line,
            };

            tokens.push(next);
        }

        tokens.push(Token {
            line,
            col,
            len: 0,
            tag: TokenTag::EOF,
        });

        Ok(tokens)
    }
}

#[cfg(test)]
mod tests {
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;

    use crate::tokenizer::keyword::Keyword;

    use super::{Token, TokenTag, Tokenizable};

    fn toks<'a>(tokens: Vec<Token<'a>>) -> Vec<TokenTag<'a>> {
        tokens.into_iter().map(|token| token.tag).collect()
    }

    #[test]
    fn basic_tokenizer_test() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let stream = r#"$ i = 0;
$ foo = 10;
fmt.Println("this is a little test");"#
            .tokenize(&mut rng)
            .expect("Valid tokenization");

        let toks = toks(stream);
        let expected = [
            TokenTag::Keyword(Keyword::VariableDeclaration),
            TokenTag::Identifier("i"),
            TokenTag::Keyword(Keyword::Equal),
            TokenTag::Number(0.0),
            TokenTag::Semicolon,
            TokenTag::Keyword(Keyword::VariableDeclaration),
            TokenTag::Identifier("foo"),
            TokenTag::Keyword(Keyword::Equal),
            TokenTag::Number(10.0),
            TokenTag::Semicolon,
            TokenTag::Keyword(Keyword::Print),
            TokenTag::OpenParen,
            TokenTag::String("this is a little test"),
            TokenTag::CloseParen,
            TokenTag::Semicolon,
            TokenTag::EOF,
        ];

        assert_eq!(toks, expected);
    }
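
    // A hedged extra test: it assumes the keyword set generated for seed 42
    // does not claim `/`, the digits, or `;` (the seeded test above already
    // shows digits and `;` tokenize normally under this seed).
    #[test]
    fn comments_are_skipped() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let stream = "1 // a line comment\n2 /* a block\ncomment */ 3;"
            .tokenize(&mut rng)
            .expect("Valid tokenization");

        let toks = toks(stream);
        let expected = [
            TokenTag::Number(1.0),
            TokenTag::Number(2.0),
            TokenTag::Number(3.0),
            TokenTag::Semicolon,
            TokenTag::EOF,
        ];

        assert_eq!(toks, expected);
    }

    // Same seed assumption: an unterminated string literal should surface as
    // a `TokenizeError` rather than a token.
    #[test]
    fn unterminated_string_errors() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let err = r#""never closed"#
            .tokenize(&mut rng)
            .expect_err("tokenization should fail");

        assert_eq!(err.line, 1);
    }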
}