// elemental/tokenizer/mod.rs

//! Provides a tokenizer for the Elemental interpreter.

use crate::error::*;

/// Outlines the types of tokens that Elemental can process.
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
pub enum TokenClass {
    /// A name beginning with a letter or underscore.
    Identifier,
    /// An integer literal.
    Int,
    /// A floating-point literal.
    Float,
    /// The assignment operator `=`.
    Assignment,
    /// The addition operator `+`.
    Plus,
    /// The subtraction/negation operator `-`.
    Minus,
    /// The multiplication operator `*`.
    Multiply,
    /// The division operator `/`.
    Divide,
    /// The equality operator `==`.
    Eq,
    /// The statement terminator `;`.
    Semicolon,
    /// The separator `,`.
    Comma,
    /// A newline character.
    Newline,
    /// The prime mark `'`.
    Prime,
    /// An opening parenthesis `(`.
    OpenParen,
    /// A closing parenthesis `)`.
    CloseParen,
    /// An opening bracket `[`.
    OpenBracket,
    /// A closing bracket `]`.
    CloseBracket,
}
26
27/// Holds a token's class and its value.
28#[derive(Clone, Debug)]
29pub struct Token {
30    class: TokenClass,
31    value: String,
32}
33
34impl Token {
35    /// Constructs a new `Token` from a value and a `TokenClass`.
36    pub fn new(class: TokenClass, value: String) -> Self {
37        Self {
38            class,
39            value,
40        }
41    }
42
43    /// Gets the class of the token.
44    pub fn get_class(&self) -> TokenClass {
45        self.class
46    }
47
48    /// Gets the value of the token.
49    pub fn get_value(&self) -> String {
50        self.value.to_owned()
51    }
52
53    /// Checks if the token is in the given class.
54    pub fn check(&self, class: TokenClass) -> bool {
55        self.class == class
56    }
57}
58
59
/// Holds a stream of characters, with a cursor supporting peek/lookahead.
pub struct CharStream {
    characters: Vec<char>,
    index: usize,
}

impl CharStream {
    /// Constructs a new character stream from a `String`.
    pub fn from(input: String) -> Self {
        Self {
            characters: input.chars().collect(),
            index: 0,
        }
    }

    /// Consumes and returns the next character, or `None` at end of input.
    pub fn next(&mut self) -> Option<char> {
        let character = self.peek();
        // Only advance while there is something left to consume.
        if character.is_some() {
            self.index += 1;
        }
        character
    }

    /// Peeks at the next character in the stream without consuming it.
    pub fn peek(&self) -> Option<char> {
        self.characters.get(self.index).copied()
    }

    /// Looks ahead `n` characters without consuming anything.
    ///
    /// `Self::lookahead(0)` is equivalent to `Self::peek()`.
    pub fn lookahead(&self, n: usize) -> Option<char> {
        // Bounds-checked access: the original indexed `self.index + n`
        // directly, which could panic near the end of the input.
        self.characters.get(self.index + n).copied()
    }

    /// Iterates through a stream of characters, pushing characters to a `String`
    /// so long as they are in a given superstring.  Once a character is found that
    /// is not in the given superstring, stops and returns the `String`.
    pub fn get(&mut self, superstring: &str) -> String {
        let mut current = String::new();
        while let Some(c) = self.peek() {
            if !superstring.contains(c) {
                break;
            }
            self.next();
            current.push(c);
        }
        current
    }

    /// Skips `//` line comments, consuming the trailing newline if present.
    pub fn skip_comments(&mut self) {
        while self.peek() == Some('/') && self.lookahead(1) == Some('/') {
            // Consume up to and including the newline, but also stop at
            // end of input: the original looped forever on a comment with
            // no trailing newline.
            while let Some(c) = self.peek() {
                self.next();
                if c == '\n' {
                    break;
                }
            }
        }
    }
}
136
137
/// Characters that can compose an identifier.
///
/// Please note that, though numbers are included here, identifiers cannot start
/// with a numeric digit (`'0'..='9'`).
const IDENTIFIER: &str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_";

/// Numeric values.  These can compose a numeric literal.
///
/// Please note that numeric literals cannot start with `'.'`.
// Fixed typo: was "01235456789." (digits misordered and '5' duplicated);
// the set of characters is unchanged.
const NUMERIC: &str = "0123456789.";

/// Separators & whitespace.  To be ignored between tokens.
const SEPARATORS: &str = " \t\n";
153
154
/// Holds a stream of tokens produced from a source string, together with
/// a cursor used by `peek`/`next` to walk the stream.
pub struct Tokenizer {
    tokens: Vec<Token>, // tokens in source order
    index: usize,       // cursor into `tokens`
}
160
161impl Tokenizer {
162    /// Constructs a new token stream from a `String`.
163    pub fn from(input: String) -> Self {
164        let index = 0;
165        let mut charstream = CharStream::from(input);
166        let mut tokens = Vec::new();
167
168        // Skip any comments
169        charstream.skip_comments();
170
171        while let Some(c) = charstream.next() {
172            if SEPARATORS.contains(c) {
173                continue;
174            }
175
176            let token = match c {
177                'a'..='z' | 'A'..='Z' | '_' => {
178                    let name = format!(
179                        "{}{}",
180                        c,
181                        charstream.get(IDENTIFIER),
182                    );
183                    Token::new(TokenClass::Identifier, name)
184                },
185                '0'..='9' => {
186                    let raw = format!(
187                        "{}{}",
188                        c,
189                        charstream.get(NUMERIC),
190                    );
191                    
192                    let token = match str::parse::<i64>(&raw) {
193                        Ok(_) => Token::new(TokenClass::Int, raw),
194                        Err(_) => match str::parse::<f64>(&raw) {
195                            Ok(_) => Token::new(TokenClass::Float, raw),
196                            Err(_) => {
197                                throw(CouldNotParseNumeric);
198                                Token::new(TokenClass::Float, "0.0".to_string())
199                            },
200                        },
201                    };
202                    token
203                },
204                '=' => if charstream.peek() == Some('=') {
205                    Token::new(TokenClass::Eq, "==".to_string())
206                } else if let Some(_) = charstream.peek() {
207                    Token::new(TokenClass::Assignment, "=".to_string())
208                } else {
209                    throw(UnexpectedEof);
210                    Token::new(TokenClass::Newline, '\n'.to_string())
211                },
212                '\n' => Token::new(TokenClass::Newline, '\n'.to_string()),
213                '+' => Token::new(TokenClass::Plus, '+'.to_string()),
214                '-' => {
215                    let chr = match charstream.peek() {
216                        Some(p) => p,
217                        None => {
218                            throw(UnexpectedEof);
219                            '\n'
220                        },
221                    };
222                    if NUMERIC.contains(chr) {
223                        let raw = format!(
224                            "{}{}",
225                            c,
226                            charstream.get(NUMERIC),
227                        );
228                        
229                        let token = match str::parse::<i64>(&raw) {
230                            Ok(_) => Token::new(TokenClass::Int, raw),
231                            Err(_) => match str::parse::<f64>(&raw) {
232                                Ok(_) => Token::new(TokenClass::Float, raw),
233                                Err(_) => {
234                                    throw(CouldNotParseNumeric);
235                                    Token::new(TokenClass::Float, "0.0".to_string())
236                                },
237                            },
238                        };
239                        token
240                    } else {
241                        Token::new(TokenClass::Minus, '-'.to_string())
242                    }
243                }
244                '*' => Token::new(TokenClass::Multiply, '*'.to_string()),
245                '/' => Token::new(TokenClass::Divide, '/'.to_string()),
246                ';' => Token::new(TokenClass::Semicolon, ';'.to_string()),
247                '(' => Token::new(TokenClass::OpenParen, '('.to_string()),
248                ')' => Token::new(TokenClass::CloseParen, ')'.to_string()),
249                '[' => Token::new(TokenClass::OpenBracket, '['.to_string()),
250                ']' => Token::new(TokenClass::CloseBracket, ']'.to_string()),
251                ',' => Token::new(TokenClass::Comma, ';'.to_string()),
252                '\'' => Token::new(TokenClass::Prime, '\''.to_string()),
253                _ => {
254                    throw(UnexpectedEof);
255                    Token::new(TokenClass::Newline, '\n'.to_string())
256                },
257            };
258            tokens.push(token);
259
260            // Skip comments
261            charstream.skip_comments();
262        }
263
264        Self {
265            tokens,
266            index,
267        }
268    }
269
270    /// Peeks at the next character in the stream.
271    pub fn peek(&self) -> Option<Token> {
272        if self.index >= self.tokens.len() {
273            None
274        } else {
275            Some (self.tokens[self.index].to_owned())
276        }
277    }
278
279    /// Advances the character stream.
280    pub fn next(&mut self) -> Option<Token> {
281        let token = self.peek();
282        self.index += 1;
283        token
284    }
285
286    /// Returns all tokens without consuming the tokenizer.
287    pub fn get_tokens(&mut self) -> Vec<Token> {
288        self.tokens.to_owned()
289    }
290
291    /// Checks whether or not the last token is a semicolon.
292    /// 
293    /// Lines that end with semicolons are not displayed.
294    pub fn chk_silent(&self) -> bool {
295        if self.tokens.len() != 0 {
296            self.tokens.len() != 0 && self.tokens[self.tokens.len() - 1].get_class() == TokenClass::Semicolon
297        } else {
298            true
299        }
300    }
301
302    /// Get the precedence of the next token.
303    pub fn get_next_precedence(&self) -> u8 {
304        if let Some(t) = self.peek() {
305            t.get_class().into()
306        } else {
307            0
308        }
309    }
310}
311
/// Sanity-checks tokenization of two float assignments.
#[test]
fn tokenize_00() {
    let input: String = "x = 1.3\ny = 2.6".to_string();
    let mut tokenizer = Tokenizer::from(input);
    let tokens = tokenizer.get_tokens();
    println!("Tokens: {:#?}", tokens);

    // Expected: x, =, 1.3, y, =, 2.6 — the newline is a separator, not a token.
    assert_eq!(tokens.len(), 6);
    assert_eq!(tokens[0].get_class(), TokenClass::Identifier);
    assert_eq!(tokens[1].get_class(), TokenClass::Assignment);
    assert_eq!(tokens[2].get_class(), TokenClass::Float);
    // No trailing semicolon, so the result should be displayed.
    assert!(!tokenizer.chk_silent());
}
317}