// asexp/token.rs

#[derive(Debug, PartialEq, Copy, Clone)]
pub enum TokenError {
    // The token is not a valid number.
    // For example, "+0a" starts like a number but contains invalid characters.
    InvalidNumber,

    // Missing terminating quote (").
    MissingQuoteEnd,

    // An unquoted string contains characters that would require quoting (\ or ").
    InvalidUnquotedString,

    // Invalid escape sequence inside a quoted string.
    InvalidEscape,
}

#[derive(Debug, PartialEq, Clone)]
pub enum Token<'a> {
    Error((&'a str, TokenError)),
    Whitespace(&'a str),
    Comment(&'a str),

    Str(&'a str),
    QStr(String),

    OpenBracket,  // '['
    CloseBracket, // ']'
    OpenParens,   // '('
    CloseParens,  // ')'
    OpenCurly,    // '{'
    CloseCurly,   // '}'

    UInt(u64),
    SInt(i64),
    Float(f64),
}

#[inline]
fn scan<F: Fn(char) -> bool>(s: &str, cond: F) -> (&str, &str) {
    // Split at the first character that does not satisfy `cond`.
    match s.find(|c: char| !cond(c)) {
        None => (s, ""),
        Some(pos) => s.split_at(pos),
    }
}
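
// Illustrative sketch of `scan`: it returns the longest prefix whose characters
// satisfy `cond`, together with the remaining input. The sample inputs below are
// assumed examples chosen to show the three possible outcomes.
#[test]
fn test_scan() {
    // Splits at the first character failing the predicate.
    assert_eq!(("abc", " def"), scan("abc def", |c| !c.is_whitespace()));
    // If every character satisfies the predicate, the remainder is empty.
    assert_eq!(("abc", ""), scan("abc", |c| c.is_alphabetic()));
    // If the first character already fails, the prefix is empty.
    assert_eq!(("", " abc"), scan(" abc", |c| c.is_alphabetic()));
}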

#[inline]
pub fn is_token_delim(c: char) -> bool {
    c.is_whitespace()
        || c == '('
        || c == ')'
        || c == '['
        || c == ']'
        || c == '{'
        || c == '}'
        || c == '#'
}

fn is_valid_unquoted_string(s: &str) -> bool {
    !(s.contains('\\') || s.contains('"'))
}
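
// Illustrative sketch of the two predicates above; the sample characters and
// strings are assumed examples that mirror the rules in the code.
#[test]
fn test_delim_and_unquoted_string_predicates() {
    assert!(is_token_delim(' '));
    assert!(is_token_delim('('));
    assert!(is_token_delim('#'));
    assert!(!is_token_delim('a'));

    assert!(is_valid_unquoted_string("abc-123"));
    assert!(!is_valid_unquoted_string("ab\\c"));
    assert!(!is_valid_unquoted_string("ab\"c"));
}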

fn next_token<'a>(s: &'a str) -> Option<(Token<'a>, &'a str)> {
    let mut chars = s.chars();
    match chars.next() {
        None => None,
        Some(c) => {
            match c {
                '(' => Some((Token::OpenParens, chars.as_str())),
                ')' => Some((Token::CloseParens, chars.as_str())),

                '[' => Some((Token::OpenBracket, chars.as_str())),
                ']' => Some((Token::CloseBracket, chars.as_str())),

                '{' => Some((Token::OpenCurly, chars.as_str())),
                '}' => Some((Token::CloseCurly, chars.as_str())),

                '#' => {
                    // comment: runs until the end of the line
                    let (comment, rest) = scan(chars.as_str(), |ch| ch != '\n');
                    Some((Token::Comment(comment), rest))
                }

                '"' => {
                    // content of the quoted string, with escapes resolved
                    let mut unquoted_string = String::new();

                    loop {
                        match chars.next() {
                            None => {
                                // Error: no terminating quote character found.
                                return Some((
                                    Token::Error((chars.as_str(), TokenError::MissingQuoteEnd)),
                                    s,
                                ));
                            }
                            // Found the terminating quote.
                            Some('"') => break,
                            // The next character is escaped; only \\ and \" are valid.
                            Some('\\') => match chars.next() {
                                Some('\\') => unquoted_string.push('\\'),
                                Some('"') => unquoted_string.push('"'),
                                _ => {
                                    return Some((
                                        Token::Error((
                                            chars.as_str(),
                                            TokenError::InvalidEscape,
                                        )),
                                        s,
                                    ));
                                }
                            },
                            Some(ch) => unquoted_string.push(ch),
                        }
                    }

                    Some((Token::QStr(unquoted_string), chars.as_str()))
                }

                c if c.is_whitespace() => {
                    let (ws, rest) = scan(s, char::is_whitespace);
                    Some((Token::Whitespace(ws), rest))
                }

                '+' | '-' => {
                    let (string, rest) = scan(s, |ch| !is_token_delim(ch));
                    assert!(!string.is_empty());

                    // A leading '+' or '-' is always parsed as a signed integer first.
                    // This is how a positive signed integer is distinguished from an
                    // unsigned one (+1 == SInt, while 1 == UInt).

                    // If the sign is followed by a digit, this should be a number.
                    match chars.next() {
                        Some(ch) if ch.is_ascii_digit() => {
                            // '+' or '-' followed by [0-9]: this must be a number.

                            if let Ok(i) = string.parse::<i64>() {
                                Some((Token::SInt(i), rest))
                            } else if let Ok(f) = string.parse::<f64>() {
                                Some((Token::Float(f), rest))
                            } else {
                                Some((Token::Error((string, TokenError::InvalidNumber)), rest))
                            }
                        }
                        _ => {
                            // If it is followed by any other character (or none), it is a
                            // valid string token.
                            if is_valid_unquoted_string(string) {
                                Some((Token::Str(string), rest))
                            } else {
                                Some((
                                    Token::Error((string, TokenError::InvalidUnquotedString)),
                                    rest,
                                ))
                            }
                        }
                    }
                }

                '0'..='9' => {
                    // This should be either an unsigned integer or a floating-point
                    // number; anything else is invalid.
                    let (string, rest) = scan(s, |ch| !is_token_delim(ch));
                    assert!(!string.is_empty());

                    if let Ok(i) = string.parse::<u64>() {
                        Some((Token::UInt(i), rest))
                    } else if let Ok(f) = string.parse::<f64>() {
                        Some((Token::Float(f), rest))
                    } else {
                        Some((Token::Error((string, TokenError::InvalidNumber)), rest))
                    }
                }

                _ => {
                    // This starts with neither '+', '-', nor a digit, so it is
                    // definitely a string.

                    let (string, rest) = scan(s, |ch| !is_token_delim(ch));
                    assert!(!string.is_empty());

                    if is_valid_unquoted_string(string) {
                        Some((Token::Str(string), rest))
                    } else {
                        Some((
                            Token::Error((string, TokenError::InvalidUnquotedString)),
                            rest,
                        ))
                    }
                }
            }
        }
    }
}

pub struct Tokenizer<'a> {
    // remaining, not yet tokenized input
    current: &'a str,
    // if set, whitespace and comment tokens are skipped
    ignore_ws: bool,
}

impl<'a> Tokenizer<'a> {
    pub fn new(s: &'a str, ignore_ws: bool) -> Tokenizer<'a> {
        Tokenizer {
            current: s,
            ignore_ws,
        }
    }

    pub fn with_curly_around(self) -> CurlyAroundTokenizer<'a> {
        CurlyAroundTokenizer::new(self)
    }
}

impl<'a> Iterator for Tokenizer<'a> {
    type Item = Token<'a>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            let (tok, rest) = next_token(self.current)?;
            self.current = rest;
            if self.ignore_ws && matches!(tok, Token::Whitespace(_) | Token::Comment(_)) {
                continue;
            }
            return Some(tok);
        }
    }
}

// State of the CurlyAroundTokenizer: emit '{', then the inner tokens, then '}'.
enum State {
    Begin,
    Inner,
    End,
}

// Wraps a Tokenizer and surrounds its token stream with an implicit pair of
// curly braces.
pub struct CurlyAroundTokenizer<'a> {
    inner: Tokenizer<'a>,
    state: State,
}

impl<'a> CurlyAroundTokenizer<'a> {
    pub fn new(inner: Tokenizer<'a>) -> CurlyAroundTokenizer<'a> {
        CurlyAroundTokenizer {
            inner,
            state: State::Begin,
        }
    }
}

impl<'a> Iterator for CurlyAroundTokenizer<'a> {
    type Item = Token<'a>;

    fn next(&mut self) -> Option<Self::Item> {
        match self.state {
            State::Begin => {
                self.state = State::Inner;
                Some(Token::OpenCurly)
            }
            State::Inner => {
                if let Some(tok) = self.inner.next() {
                    return Some(tok);
                }
                self.state = State::End;
                Some(Token::CloseCurly)
            }
            State::End => None,
        }
    }
}

#[test]
fn test_tokenizer_whitespace() {
    let t = Tokenizer::new(" (abc 123)", false);
    let tokens: Vec<_> = t.into_iter().collect();
    assert_eq!(
        vec![
            Token::Whitespace(" "),
            Token::OpenParens,
            Token::Str("abc"),
            Token::Whitespace(" "),
            Token::UInt(123),
            Token::CloseParens
        ],
        tokens
    );
}

#[test]
fn test_tokenizer_comment() {
    let t = Tokenizer::new(" (abc#comment\n 123)", false);
    let tokens: Vec<_> = t.into_iter().collect();
    assert_eq!(
        vec![
            Token::Whitespace(" "),
            Token::OpenParens,
            Token::Str("abc"),
            Token::Comment("comment"),
            Token::Whitespace("\n "),
            Token::UInt(123),
            Token::CloseParens
        ],
        tokens
    );
}

#[test]
fn test_tokenizer_curly_around() {
    let t = CurlyAroundTokenizer::new(Tokenizer::new(" (abc 123)", true));
    let tokens: Vec<_> = t.into_iter().collect();
    assert_eq!(
        vec![
            Token::OpenCurly,
            Token::OpenParens,
            Token::Str("abc"),
            Token::UInt(123),
            Token::CloseParens,
            Token::CloseCurly
        ],
        tokens
    );
}
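
// Illustrative sketch of the CurlyAroundTokenizer edge case: even for empty
// input it still yields the surrounding pair of braces. The empty input is an
// assumed example.
#[test]
fn test_tokenizer_curly_around_empty() {
    let t = CurlyAroundTokenizer::new(Tokenizer::new("", true));
    let tokens: Vec<_> = t.into_iter().collect();
    assert_eq!(vec![Token::OpenCurly, Token::CloseCurly], tokens);
}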

#[test]
fn test_tokenizer_no_whitespace() {
    let t = Tokenizer::new(" (abc 123)", true);
    let tokens: Vec<_> = t.into_iter().collect();
    assert_eq!(
        vec![
            Token::OpenParens,
            Token::Str("abc"),
            Token::UInt(123),
            Token::CloseParens
        ],
        tokens
    );
}
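
// Illustrative sketch showing that `ignore_ws` also filters Comment tokens, as
// done in Tokenizer's Iterator impl. The input string is an assumed example.
#[test]
fn test_tokenizer_ignore_ws_skips_comments() {
    let t = Tokenizer::new(" (abc#comment\n 123)", true);
    let tokens: Vec<_> = t.into_iter().collect();
    assert_eq!(
        vec![
            Token::OpenParens,
            Token::Str("abc"),
            Token::UInt(123),
            Token::CloseParens
        ],
        tokens
    );
}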

#[test]
fn test_token() {
    assert_eq!(Some((Token::Whitespace("  "), "abc")), next_token("  abc"));
    assert_eq!(Some((Token::Str("abc"), "")), next_token("abc"));
    assert_eq!(Some((Token::Str("abc"), "(")), next_token("abc("));

    assert_eq!(Some((Token::OpenParens, ")")), next_token("()"));

    assert_eq!(Some((Token::UInt(12345), "")), next_token("12345"));
    assert_eq!(Some((Token::UInt(12345), " ")), next_token("12345 "));
    assert_eq!(Some((Token::SInt(12345), " ")), next_token("+12345 "));
    assert_eq!(Some((Token::SInt(-12345), " ")), next_token("-12345 "));
    assert_eq!(Some((Token::Str("-a"), " ")), next_token("-a "));
    assert_eq!(Some((Token::Str("+a"), " ")), next_token("+a "));
    assert_eq!(Some((Token::Str("+a"), "(")), next_token("+a("));

    assert_eq!(
        Some((Token::Error(("12345+", TokenError::InvalidNumber)), "")),
        next_token("12345+")
    );
    assert_eq!(Some((Token::UInt(12345), " +")), next_token("12345 +"));
    assert_eq!(Some((Token::Float(12345.123), "")), next_token("12345.123"));
    assert_eq!(
        Some((Token::Float(12345.123), "(")),
        next_token("12345.123(")
    );

    assert_eq!(
        Some((
            Token::Error(("abc\\", TokenError::InvalidUnquotedString)),
            " test"
        )),
        next_token("abc\\ test")
    );
    assert_eq!(
        Some((
            Token::Error(("abc\"", TokenError::InvalidUnquotedString)),
            " test"
        )),
        next_token("abc\" test")
    );

    assert_eq!(
        Some((Token::QStr("".to_string()), "(")),
        next_token("\"\"(")
    );
    assert_eq!(
        Some((Token::QStr("abc".to_string()), "(")),
        next_token("\"abc\"(")
    );
    assert_eq!(
        Some((Token::QStr("a\"b".to_string()), "(")),
        next_token("\"a\\\"b\"(")
    );
    assert_eq!(
        Some((Token::QStr("a\\b".to_string()), "(")),
        next_token("\"a\\\\b\"(")
    );

    assert_eq!(
        Some((Token::Error(("", TokenError::MissingQuoteEnd)), "\"abc ")),
        next_token("\"abc ")
    );

    assert_eq!(
        Some((Token::Error((" ", TokenError::InvalidEscape)), "\"abc\\n ")),
        next_token("\"abc\\n ")
    );
}