kb/
lexer.rs

1use super::BasicError;
2use super::Mark;
3use super::RcStr;
4use super::Source;
5use std::rc::Rc;
6
7#[derive(Debug, Clone, PartialEq)]
8pub enum Token<'a> {
9    Name(&'a str),
10    Number(f64),
11    RawString(&'a str),
12    String(RcStr),
13    EOF,
14
15    // Single character symbols
16    Newline,
17    LParen,
18    RParen,
19    LBracket,
20    RBracket,
21    LBrace,
22    RBrace,
23    Dollar,
24    Dot,
25    Dot2,
26    Colon,
27    Comma,
28    Semicolon,
29    Percent,
30    Plus,
31    Minus,
32    Star,
33    Slash,
34    Slash2,
35    Eq,
36    Bar,
37    Excalamation,
38
39    Eq2,
40    Ne,
41    LessThan,
42    GreaterThan,
43    LessThanOrEqual,
44    GreaterThanOrEqual,
45}
46
47impl<'a> Token<'a> {
48    pub fn name_or_keyword(&self) -> Option<&str> {
49        if let Token::Name(s) = self {
50            Some(s)
51        } else {
52            None
53        }
54    }
55    #[allow(dead_code)]
56    pub fn number(&self) -> Option<f64> {
57        if let Token::Number(x) = self {
58            Some(*x)
59        } else {
60            None
61        }
62    }
63    #[allow(dead_code)]
64    pub fn raw_string(&self) -> Option<&str> {
65        if let Token::RawString(x) = self {
66            Some(x)
67        } else {
68            None
69        }
70    }
71    #[allow(dead_code)]
72    pub fn string(self) -> Option<RcStr> {
73        if let Token::String(x) = self {
74            Some(x)
75        } else {
76            None
77        }
78    }
79}
80
81pub fn lex(source: &Rc<Source>) -> Result<Vec<(Token, Mark)>, BasicError> {
82    let s = &source.data;
83    let mut ret = Vec::<(Token, Mark)>::new();
84    let mut state = State::Neutral;
85    let mut last_ig_ws = 0;
86    let mut pstack = ParenStack::new();
87    let mut chars = Chars::new(s);
88    while let Some(c) = chars.next() {
89        let i = chars.index - c.len_utf8();
90        let mark = Mark {
91            source: source.clone(),
92            pos: i,
93        };
94        match state {
95            State::Neutral => {
96                if c.is_whitespace() && (c != '\n' || pstack.ignore_newline()) {
97                    // skip whitespace
98                    // We also keep track of the last ignored whitespace
99                    // to figure out when tokens should be combined
100                    last_ig_ws = i;
101                    state = State::Neutral;
102                } else if c.is_ascii_digit() {
103                    state = State::Digits(i);
104                } else if c == '_' || c.is_alphanumeric() {
105                    state = State::Name(i);
106                } else if c == '"' || c == '\'' {
107                    if let Some((Token::Name("r"), _)) = ret.last() {
108                        ret.pop().unwrap();
109                        state = State::RawString(c, i + c.len_utf8());
110                    } else {
111                        state = State::String(c, String::new());
112                    }
113                } else if c == '#' {
114                    if let Some((Token::Name("r"), _)) = ret.last() {
115                        ret.pop().unwrap();
116                        state = State::DeepRawStringStart(i, 1);
117                    } else {
118                        state = State::LineComment;
119                    }
120                } else {
121                    let tok = match c {
122                        '\0' => Some(Token::EOF),
123                        '\n' => Some(Token::Newline),
124                        '(' => Some(Token::LParen),
125                        ')' => Some(Token::RParen),
126                        '[' => Some(Token::LBracket),
127                        ']' => Some(Token::RBracket),
128                        '{' => Some(Token::LBrace),
129                        '}' => Some(Token::RBrace),
130                        '$' => Some(Token::Dollar),
131                        '.' => Some(
132                            if ret.last().map(|p| &p.0) == Some(&Token::Dot) && last_ig_ws < i - 1 {
133                                ret.pop().unwrap();
134                                Token::Dot2
135                            } else {
136                                Token::Dot
137                            },
138                        ),
139                        ':' => Some(Token::Colon),
140                        ',' => Some(Token::Comma),
141                        ';' => Some(Token::Semicolon),
142                        '+' => Some(Token::Plus),
143                        '-' => Some(Token::Minus),
144                        '*' => Some(Token::Star),
145                        '/' => Some(
146                            if ret.last().map(|p| &p.0) == Some(&Token::Slash) && last_ig_ws < i - 1
147                            {
148                                ret.pop().unwrap();
149                                Token::Slash2
150                            } else {
151                                Token::Slash
152                            },
153                        ),
154                        '%' => Some(Token::Percent),
155                        '|' => Some(Token::Bar),
156                        '!' => Some(Token::Excalamation),
157                        '<' => Some(Token::LessThan),
158                        '>' => Some(Token::GreaterThan),
159                        '=' => Some({
160                            if last_ig_ws < i - 1 {
161                                match ret.last() {
162                                    Some((Token::LessThan, _)) => {
163                                        ret.pop().unwrap();
164                                        Token::LessThanOrEqual
165                                    }
166                                    Some((Token::GreaterThan, _)) => {
167                                        ret.pop().unwrap();
168                                        Token::GreaterThanOrEqual
169                                    }
170                                    Some((Token::Eq, _)) => {
171                                        ret.pop().unwrap();
172                                        Token::Eq2
173                                    }
174                                    Some((Token::Excalamation, _)) => {
175                                        ret.pop().unwrap();
176                                        Token::Ne
177                                    }
178                                    _ => Token::Eq,
179                                }
180                            } else {
181                                Token::Eq
182                            }
183                        }),
184                        _ => None,
185                    };
186                    if let Some(tok) = tok {
187                        match tok {
188                            Token::LParen | Token::LBracket => pstack.push(true),
189                            // for KB at least, curly braces will also consume newlines
190                            Token::LBrace => pstack.push(true),
191                            Token::RParen | Token::RBracket | Token::RBrace => match pstack.pop() {
192                                Ok(()) => {}
193                                Err(message) => {
194                                    return Err(BasicError {
195                                        marks: vec![mark],
196                                        message,
197                                        help: None,
198                                    })
199                                }
200                            },
201                            _ => (),
202                        }
203                        ret.push((tok, mark));
204                        state = State::Neutral;
205                    } else {
206                        return Err(BasicError {
207                            marks: vec![mark],
208                            message: format!("Unrecognized token: {}", c),
209                            help: None,
210                        });
211                    }
212                }
213            }
214            State::Digits(start) => {
215                if c.is_ascii_digit() {
216                    state = State::Digits(start);
217                } else if c == '.' {
218                    state = State::Number(start);
219                } else {
220                    chars.put_back(c);
221                    state = State::Number(start);
222                }
223            }
224            State::Number(start) => {
225                if c.is_ascii_digit() {
226                    state = State::Number(start);
227                } else {
228                    let n: f64 = s[start..i].parse().unwrap();
229                    ret.push((
230                        Token::Number(n),
231                        Mark {
232                            source: source.clone(),
233                            pos: start,
234                        },
235                    ));
236                    chars.put_back(c);
237                    state = State::Neutral;
238                }
239            }
240            State::Name(start) => {
241                if c == '_' || c.is_alphanumeric() {
242                    state = State::Name(start);
243                } else {
244                    ret.push((
245                        Token::Name(&s[start..i]),
246                        Mark {
247                            source: source.clone(),
248                            pos: start,
249                        },
250                    ));
251                    chars.put_back(c);
252                    state = State::Neutral;
253                }
254            }
255            State::DeepRawStringStart(start, hlen) => match c {
256                '#' => state = State::DeepRawStringStart(start, hlen + 1),
257                '"' | '\'' => state = State::DeepRawStringBody(i + 1, c, hlen),
258                _ => {
259                    return Err(BasicError::new(
260                        vec![Mark {
261                            source: source.clone(),
262                            pos: i,
263                        }],
264                        "Expected quote for raw string".into(),
265                    ))
266                }
267            },
268            State::DeepRawStringBody(start, quote, hlen) => {
269                if c == quote {
270                    state = State::DeepRawStringEnd(start, i, quote, hlen, hlen);
271                }
272            }
273            State::DeepRawStringEnd(start, end, quote, shlen, hlen) => {
274                assert!(hlen > 0);
275                if c == '#' {
276                    if hlen == 1 {
277                        ret.push((
278                            Token::RawString(&s[start..end]),
279                            Mark {
280                                source: source.clone(),
281                                pos: start,
282                            },
283                        ));
284                        state = State::Neutral;
285                    } else {
286                        state = State::DeepRawStringEnd(start, end, quote, shlen, hlen - 1);
287                    }
288                } else {
289                    state = State::DeepRawStringBody(start, quote, shlen)
290                }
291            }
292            State::RawString(q, start) => {
293                if c == q {
294                    ret.push((
295                        Token::RawString(&s[start..i]),
296                        Mark {
297                            source: source.clone(),
298                            pos: start,
299                        },
300                    ));
301                    state = State::Neutral;
302                } else {
303                    state = State::RawString(q, start);
304                }
305            }
306            State::String(q, mut string) => {
307                if c == q {
308                    ret.push((
309                        Token::String(string.into()),
310                        Mark {
311                            source: source.clone(),
312                            pos: i,
313                        },
314                    ));
315                    state = State::Neutral;
316                } else if c == '\\' {
317                    state = State::StringEscaped(q, string);
318                } else {
319                    string.push(c);
320                    state = State::String(q, string);
321                }
322            }
323            State::StringEscaped(q, mut string) => {
324                let s = match c {
325                    '\\' => "\\",
326                    '\'' => "\'",
327                    '\"' => "\"",
328                    't' => "\t",
329                    'n' => "\n",
330                    'r' => "\r",
331                    _ => {
332                        return Err(BasicError {
333                            marks: vec![Mark {
334                                source: source.clone(),
335                                pos: i,
336                            }],
337                            message: format!("Invalid string escape ({})", c),
338                            help: None,
339                        })
340                    }
341                };
342                string.push_str(s);
343                state = State::String(q, string);
344            }
345            State::LineComment => {
346                if c == '\n' {
347                    state = State::Neutral;
348                }
349            }
350        }
351    }
352    if let State::Neutral = &state {
353        Ok(ret)
354    } else {
355        Err(BasicError {
356            marks: vec![Mark {
357                source: source.clone(),
358                pos: s.len(),
359            }],
360            message: format!("Expected more input: {:?}", state),
361            help: None,
362        })
363    }
364}
365
/// State of the lexer's character-by-character state machine.
///
/// All `usize` payloads are byte offsets into the source text unless noted
/// otherwise.
#[derive(Debug)]
enum State {
    /// Between tokens.
    Neutral,
    /// Inside a `#` comment, skipping to the end of the line.
    LineComment,

    /// Inside the integer part of a number; holds the offset of its first digit.
    Digits(usize),
    /// Past the integer part of a number (fractional digits, if any); holds
    /// the offset of the number's first digit.
    Number(usize),
    /// Inside a name; holds the offset of its first character.
    Name(usize),

    /// Inside `r"..."`: the quote character and the offset where the body starts.
    RawString(char, usize),
    /// Counting the `#`s of an `r#"..."#` prefix: offset of the first `#` and
    /// the number of `#`s seen so far.
    DeepRawStringStart(usize, usize),
    /// Inside the body of an `r#"..."#` literal: offset where the body
    /// starts, the quote character, and the number of `#`s in the prefix.
    DeepRawStringBody(usize, char, usize),
    /// Just after a possible closing quote of an `r#"..."#` literal: body
    /// start, offset of that quote, the quote character, the number of `#`s
    /// in the prefix, and the number of `#`s still needed to close.
    DeepRawStringEnd(usize, usize, char, usize, usize),

    /// Inside a quoted string: the quote character and the decoded contents so far.
    String(char, String),
    /// Right after a backslash inside a quoted string: the quote character
    /// and the decoded contents so far.
    StringEscaped(char, String),
}
380
/// Stack of the currently open grouping symbols.
///
/// Each entry only records whether newlines are to be ignored inside that
/// group; the kind of bracket is not stored, so closers are not checked
/// against the kind of their opener.
struct ParenStack {
    stack: Vec<bool>,
}

impl ParenStack {
    /// Creates a stack with no open groups.
    pub fn new() -> ParenStack {
        ParenStack { stack: vec![] }
    }

    /// Opens a new group; `ignore_newline` tells whether newlines inside it
    /// are to be ignored.
    pub fn push(&mut self, ignore_newline: bool) {
        self.stack.push(ignore_newline);
    }

    /// Closes the innermost group, or returns an error message when no group
    /// is open.
    pub fn pop(&mut self) -> Result<(), String> {
        if self.stack.pop().is_some() {
            Ok(())
        } else {
            Err(String::from("Mismatched grouping symbols"))
        }
    }

    /// Whether newlines are ignored at the current nesting level (`false`
    /// when no group is open).
    pub fn ignore_newline(&self) -> bool {
        match self.stack.last() {
            Some(&flag) => flag,
            None => false,
        }
    }
}
402
/// Character cursor over a string with a single slot of push-back.
///
/// After the characters of the string themselves, one extra `'\0'` is
/// yielded before the cursor is exhausted.
struct Chars<'a> {
    /// Byte offset just past the most recently returned character.
    index: usize,
    /// Character handed back through `put_back`, if any.
    peek: Option<char>,
    /// The underlying characters followed by the trailing `'\0'`.
    chars: std::iter::Chain<std::str::Chars<'a>, std::vec::IntoIter<char>>,
}

impl<'a> Chars<'a> {
    /// Creates a cursor positioned at the start of `s`.
    fn new(s: &str) -> Chars {
        let sentinel = vec!['\0'];
        Chars {
            index: 0,
            peek: None,
            chars: s.chars().chain(sentinel),
        }
    }

    /// Returns the next character (a pushed-back one first) and advances
    /// `index` by its UTF-8 length; `None` once everything is consumed.
    fn next(&mut self) -> Option<char> {
        let ch = match self.peek.take() {
            Some(pending) => pending,
            None => self.chars.next()?,
        };
        self.index += ch.len_utf8();
        Some(ch)
    }

    /// Hands `c` back so that the following `next` returns it again, and
    /// rewinds `index` by its UTF-8 length.
    ///
    /// Panics if a character is already waiting in the push-back slot.
    fn put_back(&mut self, c: char) {
        assert!(self.peek.is_none());
        self.peek = Some(c);
        self.index -= c.len_utf8();
    }
}
434
#[cfg(test)]
mod tests {
    use super::*;
    use Token::*;

    /// Wraps `data` in a throwaway `Source` for the lexer to consume.
    fn mksrc(data: &str) -> Rc<Source> {
        Rc::new(Source {
            name: "[for-test]".into(),
            data: data.into(),
        })
    }

    /// Lexes `src`, panicking on a lex error, and keeps only the tokens
    /// (the marks are dropped).
    fn lex(src: &Rc<Source>) -> Vec<Token> {
        let marked = super::lex(src).unwrap();
        marked.into_iter().map(|(tok, _)| tok).collect()
    }

    #[test]
    fn raw_string_literals() {
        // (source text, expected raw string body)
        let cases: [(&str, &str); 4] = [
            (r####" r"hi" "####, "hi"),
            (r####" r#"hello " "# "####, "hello \" "),
            (r####" r##"world"## "####, "world"),
            (r####" r##"hello "# "## "####, "hello \"# "),
        ];
        for &(input, body) in cases.iter() {
            let src = mksrc(input);
            assert_eq!(lex(&src), vec![RawString(body), EOF]);
        }
    }

    #[test]
    fn misc() {
        let src = mksrc(r##" x = r"hi" "##);
        let expected = vec![Name("x"), Eq, RawString("hi"), EOF];
        assert_eq!(lex(&src), expected);
    }
}