spore_vm/parser/
tokenizer.rs

1use super::span::Span;
2
3/// Describes the type of token.
/// Describes the type of token.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum TokenType {
    /// An open parenthesis: `(` or `[`.
    OpenParen,
    /// A close parenthesis: `)` or `]`.
    CloseParen,
    /// A string literal, including its surrounding quotes.
    String,
    /// A string literal, but missing the closing quote.
    UnterminatedString,
    /// A line comment, starting with `;`.
    Comment,
    /// Something else. Usually an atom (int or float literal) or an identifier.
    Other,
}
19
20/// Contains a token type and the portion of the text defining the token.
/// Contains a token type and the portion of the text defining the token.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct Token {
    /// The type held within the token.
    pub token_type: TokenType,
    /// The byte location of the token within the source string.
    pub span: Span,
}
28
29impl Token {
30    /// Get the current token's backing [str].
31    pub fn as_str<'a>(&self, src: &'a str) -> &'a str {
32        self.span.with_src(src).as_str()
33    }
34
35    /// Parse an input source into a stream of tokens.
36    pub fn parse_tokens(input_source: &str) -> impl '_ + Iterator<Item = Token> {
37        let mut start = 0;
38        std::iter::from_fn(move || {
39            Token::parse_next(input_source, start).inspect(|t| start = t.span.end as usize)
40        })
41    }
42
43    /// Parse an input source into a vector of tokens. Used for convenience in unit tests, prefer
44    /// using `Self::parse_tokens`.
45    #[cfg(test)]
46    pub fn parse_tokens_to_vec<'a>(input_src: &'a str) -> Vec<(TokenType, &'a str)> {
47        let tokens =
48            Token::parse_tokens(input_src).map(|token| (token.token_type, token.as_str(input_src)));
49        tokens.collect()
50    }
51
52    /// Parse the next token within an input source string or return `None`. Returns a tuple
53    /// containing the parsed token and the rest of the string.
54    fn parse_next(src: &str, start: usize) -> Option<Token> {
55        let input_src = &src[start..].trim_start();
56        let start = src.len() - input_src.len();
57        match input_src.chars().next() {
58            None => return None,
59            Some(';') => {
60                return Some(Token {
61                    token_type: TokenType::Comment,
62                    span: Token::parse_comment(src, start),
63                })
64            }
65            Some('"') => return Some(Token::parse_next_string(src, start)),
66            Some('(') | Some('[') => {
67                return Some(Token {
68                    token_type: TokenType::OpenParen,
69                    span: Span::new(start as u32, start as u32 + 1),
70                })
71            }
72            Some(')') | Some(']') => {
73                return Some(Token {
74                    token_type: TokenType::CloseParen,
75                    span: Span::new(start as u32, start as u32 + 1),
76                })
77            }
78            _ => {}
79        }
80        for (idx, ch) in input_src.char_indices() {
81            let is_end = match ch {
82                '(' | ')' | '[' | ']' => true,
83                _ => ch.is_whitespace(),
84            };
85            if is_end {
86                return Some(Token {
87                    token_type: TokenType::Other,
88                    span: Span::new(start as u32, start as u32 + idx as u32),
89                });
90            }
91        }
92        Some(Token {
93            token_type: TokenType::Other,
94            span: Span::new(start as u32, src.len() as u32),
95        })
96    }
97
98    /// Parse the next string in input source. `input_source` must start with a '"'
99    /// character. Returns a tuple of the parsed token and the rest of the string.
100    fn parse_comment(src: &str, start: usize) -> Span {
101        for (idx, ch) in (start + 1..src.len()).zip(src[start..].chars()) {
102            if ch == '\n' {
103                return Span::new(start as u32, idx as u32);
104            }
105        }
106        Span::new(start as u32, src.len() as u32)
107    }
108
109    /// Parse the next string in input source. `input_source` must start with a '"'
110    /// character. Returns a tuple of the parsed token and the rest of the string.
111    fn parse_next_string(src: &str, start: usize) -> Token {
112        let input_src = &src[start..];
113        let mut is_escaped = false;
114        for (idx, ch) in input_src.char_indices() {
115            if idx == 0 {
116                debug_assert_eq!(ch, '"');
117                continue;
118            };
119            match ch {
120                '\\' => {
121                    is_escaped = !is_escaped;
122                }
123                '"' => {
124                    if !is_escaped {
125                        return Token {
126                            token_type: TokenType::String,
127                            span: Span::new(start as u32, start as u32 + idx as u32 + 1),
128                        };
129                    }
130                    is_escaped = false;
131                }
132                _ => {
133                    is_escaped = false;
134                }
135            };
136        }
137        Token {
138            token_type: TokenType::UnterminatedString,
139            span: Span::new(start as u32, src.len() as u32),
140        }
141    }
142}
143
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_str_produces_empty_stream() {
        let actual = Token::parse_tokens_to_vec("");
        assert_eq!(actual, Vec::new());
    }

    #[test]
    fn whitespace_only_produces_empty_stream() {
        let actual = Token::parse_tokens_to_vec(" \n\t");
        assert_eq!(actual, Vec::new());
    }

    #[test]
    fn whitespace_separated_values_produce_item_for_each() {
        let src = "\t1  two\n3.0\n";
        let actual = Token::parse_tokens_to_vec(src);
        assert_eq!(
            actual,
            vec![
                (TokenType::Other, "1"),
                (TokenType::Other, "two"),
                (TokenType::Other, "3.0"),
            ]
        );
    }

    #[test]
    fn phrase_in_quotes_is_string() {
        let actual = Token::parse_tokens_to_vec("\"hello world!\"not-text");
        assert_eq!(
            actual,
            vec![
                (TokenType::String, "\"hello world!\""),
                (TokenType::Other, "not-text")
            ]
        );
    }

    #[test]
    fn backslash_quote_in_quote_escapes_quote_as_part_of_string() {
        let actual = Token::parse_tokens_to_vec(r#" \" "\"quotes\""   "#);
        assert_eq!(
            actual,
            vec![
                (TokenType::Other, "\\\""),
                (TokenType::String, "\"\\\"quotes\\\"\"")
            ]
        );
    }

    #[test]
    fn unclosed_string_is_unterminated_string() {
        let actual = Token::parse_tokens_to_vec("\"I am not closed");
        assert_eq!(
            actual,
            vec![(TokenType::UnterminatedString, "\"I am not closed")]
        );
    }

    #[test]
    fn parenthesis_are_parsed_into_own_tokens() {
        let actual = Token::parse_tokens_to_vec("(left right)");
        assert_eq!(
            actual,
            vec![
                (TokenType::OpenParen, "("),
                (TokenType::Other, "left"),
                (TokenType::Other, "right"),
                (TokenType::CloseParen, ")")
            ]
        );
    }

    // Renamed from `colon_denotes_start_of_line_comment`: the comment delimiter under test
    // is a semicolon (';'), not a colon.
    #[test]
    fn semicolon_denotes_start_of_line_comment() {
        let actual = Token::parse_tokens_to_vec("(code) ; comment\n;other comment");
        assert_eq!(
            actual,
            vec![
                (TokenType::OpenParen, "("),
                (TokenType::Other, "code"),
                (TokenType::CloseParen, ")"),
                (TokenType::Comment, "; comment\n"),
                (TokenType::Comment, ";other comment"),
            ]
        );
    }
}