ungrammar/
lexer.rs

//! Simple hand-written ungrammar lexer.
2use crate::error::{bail, Result};
3
/// The kinds of token produced by the lexer.
#[derive(Debug, Eq, PartialEq)]
pub(crate) enum TokenKind {
    /// An identifier: a maximal run of ASCII letters and `_`.
    Node(String),
    /// The contents of a single-quoted literal, with escapes already resolved
    /// and the surrounding quotes removed.
    Token(String),
    /// `=`
    Eq,
    /// `*`
    Star,
    /// `|`
    Pipe,
    /// `?`
    QMark,
    /// `:`
    Colon,
    /// `(`
    LParen,
    /// `)`
    RParen,
}
16
17#[derive(Debug)]
18pub(crate) struct Token {
19    pub(crate) kind: TokenKind,
20    pub(crate) loc: Location,
21}
22
/// A position in the input text.
///
/// Both fields are zero-based (`Location::default()` is the very start of the
/// input) and columns are counted in `char`s, not bytes.
#[derive(Copy, Clone, Default, Debug)]
pub(crate) struct Location {
    /// Number of `\n` characters seen before this position.
    pub(crate) line: usize,
    /// Number of `char`s between the start of the current line and this position.
    pub(crate) column: usize,
}

impl Location {
    /// Moves this location forward past `text`.
    ///
    /// If `text` contains a newline, the line counter grows by the number of
    /// newlines and the column restarts at the length of whatever follows the
    /// last newline; otherwise only the column grows.
    fn advance(&mut self, text: &str) {
        match text.rfind('\n') {
            Some(last_newline) => {
                self.line += text.matches('\n').count();
                // `'\n'` is one byte long, so `+ 1` lands on a char boundary.
                self.column = text[last_newline + 1..].chars().count();
            }
            None => self.column += text.chars().count(),
        }
    }
}
40
41pub(crate) fn tokenize(mut input: &str) -> Result<Vec<Token>> {
42    let mut res = Vec::new();
43    let mut loc = Location::default();
44    while !input.is_empty() {
45        let old_input = input;
46        skip_ws(&mut input);
47        skip_comment(&mut input);
48        if old_input.len() == input.len() {
49            match advance(&mut input) {
50                Ok(kind) => {
51                    res.push(Token { kind, loc });
52                }
53                Err(err) => return Err(err.with_location(loc)),
54            }
55        }
56        let consumed = old_input.len() - input.len();
57        loc.advance(&old_input[..consumed]);
58    }
59
60    Ok(res)
61}
62
63fn skip_ws(input: &mut &str) {
64    *input = input.trim_start_matches(is_whitespace)
65}
/// Drops one leading `//` line comment from `input`, if there is one.
///
/// The comment extends through the terminating `\n` (which is consumed too),
/// or to the end of the input when no newline follows. Input that does not
/// start with `//` is left untouched.
fn skip_comment(input: &mut &str) {
    if !input.starts_with("//") {
        return;
    }
    let end = match input.find('\n') {
        // Consume the newline together with the comment.
        Some(newline) => newline + 1,
        None => input.len(),
    };
    *input = &input[end..];
}
72
73fn advance(input: &mut &str) -> Result<TokenKind> {
74    let mut chars = input.chars();
75    let c = chars.next().unwrap();
76    let res = match c {
77        '=' => TokenKind::Eq,
78        '*' => TokenKind::Star,
79        '?' => TokenKind::QMark,
80        '(' => TokenKind::LParen,
81        ')' => TokenKind::RParen,
82        '|' => TokenKind::Pipe,
83        ':' => TokenKind::Colon,
84        '\'' => {
85            let mut buf = String::new();
86            loop {
87                match chars.next() {
88                    None => bail!("unclosed token literal"),
89                    Some('\\') => match chars.next() {
90                        Some(c) if is_escapable(c) => buf.push(c),
91                        _ => bail!("invalid escape in token literal"),
92                    },
93                    Some('\'') => break,
94                    Some(c) => buf.push(c),
95                }
96            }
97            TokenKind::Token(buf)
98        }
99        c if is_ident_char(c) => {
100            let mut buf = String::new();
101            buf.push(c);
102            loop {
103                match chars.clone().next() {
104                    Some(c) if is_ident_char(c) => {
105                        chars.next();
106                        buf.push(c);
107                    }
108                    _ => break,
109                }
110            }
111            TokenKind::Node(buf)
112        }
113        '\r' => bail!("unexpected `\\r`, only Unix-style line endings allowed"),
114        c => bail!("unexpected character: `{}`", c),
115    };
116
117    *input = chars.as_str();
118    Ok(res)
119}
120
/// Returns `true` for characters that may follow a backslash inside a quoted
/// token literal: the backslash itself and the single quote.
fn is_escapable(c: char) -> bool {
    matches!(c, '\\' | '\'')
}
/// Returns `true` for the whitespace the lexer skips: space, tab and `\n`.
///
/// `\r` is deliberately excluded; the lexer rejects it with a dedicated error.
fn is_whitespace(c: char) -> bool {
    matches!(c, ' ' | '\t' | '\n')
}
/// Returns `true` for characters allowed in an identifier: ASCII letters and
/// `_`. Digits and non-ASCII letters are not identifier characters.
fn is_ident_char(c: char) -> bool {
    c == '_' || c.is_ascii_alphabetic()
}