1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
//! Simple hand-written ungrammar lexer
use crate::error::{bail, Result};

/// The kinds of token the lexer can produce.
#[derive(Debug, Eq, PartialEq)]
pub(crate) enum TokenKind {
    /// A bare name made of ASCII letters and underscores (`[A-Za-z_]+`).
    /// Presumably a reference to a grammar node -- confirm against the parser.
    Node(String),
    /// The contents of a single-quoted literal, with the surrounding quotes
    /// removed and the `\\` / `\'` escapes already resolved.
    Token(String),
    /// `=`
    Eq,
    /// `*`
    Star,
    /// `|`
    Pipe,
    /// `?`
    QMark,
    /// `:`
    Colon,
    /// `(`
    LParen,
    /// `)`
    RParen,
}

/// A single lexed token together with the position where it starts.
#[derive(Debug)]
pub(crate) struct Token {
    /// What was lexed.
    pub(crate) kind: TokenKind,
    /// Position of the token's first character; `tokenize` records it before
    /// the token's text is consumed.
    pub(crate) loc: Location,
}

/// A position in the input text.
///
/// Both coordinates start at zero (see the derived `Default`), and the column
/// is measured in `char`s rather than bytes.
#[derive(Copy, Clone, Default, Debug)]
pub(crate) struct Location {
    /// Zero-based line index.
    pub(crate) line: usize,
    /// Zero-based column index, counted in `char`s since the last `\n`.
    pub(crate) column: usize,
}

impl Location {
    /// Moves this location past `text`, which must be the text that
    /// immediately follows the current position.
    ///
    /// Every `\n` starts a new line and resets the column to zero; any other
    /// character moves one column to the right.
    fn advance(&mut self, text: &str) {
        for ch in text.chars() {
            if ch == '\n' {
                self.line += 1;
                self.column = 0;
            } else {
                self.column += 1;
            }
        }
    }
}

/// Splits `input` into a flat list of tokens.
///
/// Whitespace and `//` line comments are discarded. Each returned token
/// carries the location of its first character.
///
/// # Errors
///
/// Returns the lexing error from `advance`, annotated with the location at
/// which the offending token starts.
pub(crate) fn tokenize(mut input: &str) -> Result<Vec<Token>> {
    let mut tokens = Vec::new();
    let mut loc = Location::default();

    while !input.is_empty() {
        let before = input;
        skip_ws(&mut input);
        skip_comment(&mut input);

        // Nothing was skipped, so a token has to start right here.
        let nothing_skipped = before.len() == input.len();
        if nothing_skipped {
            let kind = match advance(&mut input) {
                Ok(kind) => kind,
                Err(err) => return Err(err.with_location(loc)),
            };
            tokens.push(Token { kind, loc });
        }

        // Whatever this iteration consumed (trivia or a token) moves the cursor.
        let eaten = &before[..before.len() - input.len()];
        loc.advance(eaten);
    }

    Ok(tokens)
}

fn skip_ws(input: &mut &str) {
    *input = input.trim_start_matches(is_whitespace)
}
/// If `input` starts with `//`, drops that line comment including its
/// terminating `\n` (or everything, when the comment runs to end of input).
/// Otherwise leaves `input` untouched.
fn skip_comment(input: &mut &str) {
    if !input.starts_with("//") {
        return;
    }
    let rest = match input.find('\n') {
        Some(newline) => &input[newline + 1..],
        None => &input[input.len()..],
    };
    *input = rest;
}

/// Lexes exactly one token from the front of `input` and moves `input` past it.
///
/// `input` must be non-empty and must not start with whitespace or a comment;
/// `tokenize` guarantees this before calling.
///
/// # Errors
///
/// Fails on an unterminated or badly escaped token literal, on `\r`, and on
/// any character that cannot start a token. On failure `input` is left
/// unchanged.
///
/// # Panics
///
/// Panics if `input` is empty.
fn advance(input: &mut &str) -> Result<TokenKind> {
    let mut rest = input.chars();
    let first = rest.next().unwrap();

    let kind = match first {
        '=' => TokenKind::Eq,
        '*' => TokenKind::Star,
        '?' => TokenKind::QMark,
        '(' => TokenKind::LParen,
        ')' => TokenKind::RParen,
        '|' => TokenKind::Pipe,
        ':' => TokenKind::Colon,
        '\'' => {
            // Quoted literal: collect characters up to the closing quote,
            // resolving backslash escapes along the way.
            let mut literal = String::new();
            loop {
                let ch = match rest.next() {
                    Some(ch) => ch,
                    None => bail!("unclosed token literal"),
                };
                match ch {
                    '\'' => break,
                    '\\' => match rest.next() {
                        Some(escaped) if is_escapable(escaped) => literal.push(escaped),
                        _ => bail!("invalid escape in token literal"),
                    },
                    plain => literal.push(plain),
                }
            }
            TokenKind::Token(literal)
        }
        first if is_ident_char(first) => {
            // Identifier: `first` plus the longest run of identifier
            // characters that follows it.
            let tail = rest.as_str();
            let run = tail
                .find(|ch: char| !is_ident_char(ch))
                .unwrap_or(tail.len());
            let mut name = String::new();
            name.push(first);
            name.push_str(&tail[..run]);
            rest = tail[run..].chars();
            TokenKind::Node(name)
        }
        '\r' => bail!("unexpected `\\r`, only Unix-style line endings allowed"),
        other => bail!("unexpected character: `{}`", other),
    };

    // Commit only on success, so errors leave the caller's cursor in place.
    *input = rest.as_str();
    Ok(kind)
}

/// Returns `true` for the characters that may follow a backslash inside a
/// token literal: the backslash itself and the single quote.
fn is_escapable(c: char) -> bool {
    c == '\\' || c == '\''
}
/// Returns `true` for the whitespace the lexer skips: space, tab and line
/// feed. Carriage return is deliberately not included.
fn is_whitespace(c: char) -> bool {
    const SKIPPED: [char; 3] = [' ', '\t', '\n'];
    SKIPPED.contains(&c)
}
/// Returns `true` for characters allowed in a node name: ASCII letters and
/// the underscore. Digits and non-ASCII letters are rejected.
fn is_ident_char(c: char) -> bool {
    c == '_' || c.is_ascii_alphabetic()
}