m_lexer 0.0.4

A simple, extensible lexer based on regular expressions
Documentation
extern crate regex;

use regex::Regex;

/// Identifier of a token category, wrapping a caller-chosen `u16`.
///
/// The lexer attaches the kind of the matching rule (or the configured error
/// kind) to every [`Token`] it produces.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
pub struct TokenKind(pub u16);

/// A lexed token: its kind and how much of the input it covers.
///
/// The token does not store its text; callers recover it by slicing the
/// input with the accumulated lengths.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
pub struct Token {
    /// Kind of the rule that produced this token, or the lexer's error kind.
    pub kind: TokenKind,
    /// Length of the token in bytes (it is used as a `str` slice offset).
    pub len: usize,
}

/// Boxed callback used by rules whose token length cannot be described by a
/// regular expression alone.
///
/// The callback receives the remaining input (starting at the token) and
/// returns the token length in bytes, or `None` if no token can be produced.
/// Written with explicit `dyn`: the bare `Box<Fn(..)>` form is deprecated and
/// rejected by newer editions; the type itself is unchanged.
pub type ExternRule = Box<dyn Fn(&str) -> Option<usize> + Send + Sync>;

/// A lexer: an ordered list of rules plus the kind used for unrecognized input.
pub struct Lexer {
    /// Kind assigned to stretches of input that no rule recognizes.
    error_token: TokenKind,
    /// Rules in the order they were registered on the builder.
    rules: Vec<Rule>,
}

/// A single lexing rule.
pub struct Rule {
    /// Kind given to tokens produced by this rule.
    kind: TokenKind,
    /// Pattern for the rule; compiled by `parse_re`, which anchors it at the
    /// start of the input.
    re: Regex,
    /// Optional external function; when present it, rather than the regex
    /// match, decides the length of the token.
    f: Option<ExternRule>,
}

/// Builder that collects rules and the error token kind for a [`Lexer`].
pub struct LexerBuilder {
    /// Error token kind; must be set before `build` is called.
    error_token: Option<TokenKind>,
    /// Rules registered so far, in registration order.
    rules: Vec<Rule>,
}

impl Lexer {
    pub fn next_token(&self, input: &str) -> Token {
        assert!(!input.is_empty(), "next_token should not be called with empty input");
        self.valid_token(input).unwrap_or_else(|| {
            self.invalid_token(input)
        })
    }

    pub fn tokenize(&self, input: &str) -> Vec<Token> {
        let mut ret = Vec::new();
        let mut rest = input;
        while !rest.is_empty() {
            let tok = self.next_token(rest);
            ret.push(tok);
            assert!(tok.len > 0);
            rest = &rest[tok.len..];
        }
        ret
    }

    pub fn test(&self, input: &str, expected: &str) {
        let mut len = 0;
        let mut actual = String::new();

        for t in self.tokenize(input) {
            let end = len + t.len;
            let text = &input[len..end];
            actual += &format!("{:?} {}\n", text, t.kind.0);
            len = end;
        }
        let expected = expected.trim();
        let actual = actual.trim();

        assert_eq!(
            expected, actual,
            "\nExpected:\n\n\
            {}\n\n\
            Actual:\n\n\
            {}\n\n",
            expected, actual,
        );
    }

    fn valid_token(&self, input: &str) -> Option<(Token)> {
        let longest_match = self.rules.iter().rev()
            .filter_map(|rule| {
                let m = rule.re.find(input)?;
                Some((m.end(), rule))
            })
            .max_by_key(|&(len, _)| len)?;

        let (len, rule) = longest_match;
        let len = match &rule.f {
            Some(f) => f(input)?,
            None => len,
        };
        assert!(len > 0, "empty token\n\
                          kind: {:?}\n\
                          re: {:?}\n\
                          input {:?}", rule.kind, rule.re, input);
        Some(Token { kind: rule.kind, len })
    }

    fn invalid_token(&self, input: &str) -> Token {
        let mut len = 0;
        for c in input.chars() {
            len += c.len_utf8();
            if self.valid_token(&input[len..]).is_some() {
                break;
            }
        }
        assert!(0 < len && len <= input.len());
        Token { kind: self.error_token, len }
    }
}

impl LexerBuilder {
    /// Creates a builder with no rules and no error token kind.
    pub fn new() -> LexerBuilder {
        LexerBuilder {
            error_token: None,
            rules: Vec::new(),
        }
    }

    /// Sets the kind used for input that no rule recognizes.
    pub fn error_token(self, kind: TokenKind) -> Self {
        LexerBuilder { error_token: Some(kind), ..self }
    }

    /// Registers a plain regex rule producing tokens of `kind`.
    pub fn token(self, kind: TokenKind, re: &str) -> Self {
        self.rule(kind, re, None)
    }

    /// Registers several plain regex rules, in slice order.
    pub fn tokens(self, rules: &[(TokenKind, &str)]) -> Self {
        rules
            .iter()
            .fold(self, |builder, &(kind, re)| builder.token(kind, re))
    }

    /// Registers a rule whose token length is computed by `f` once `re`
    /// has been selected.
    pub fn external_token<F>(self, kind: TokenKind, re: &str, f: F) -> Self
        where F: Fn(&str) -> Option<usize> + 'static + Send + Sync
    {
        let boxed: ExternRule = Box::new(f);
        self.rule(kind, re, Some(boxed))
    }

    /// Registers a rule from its parts; `f` is the optional external
    /// length function.
    ///
    /// # Panics
    ///
    /// Panics if `re` is not a valid regular expression.
    pub fn rule(mut self, kind: TokenKind, re: &str, f: Option<ExternRule>) -> Self {
        let re = parse_re(re);
        self.rules.push(Rule { kind, re, f });
        self
    }

    /// Finishes the builder and returns the lexer.
    ///
    /// # Panics
    ///
    /// Panics if no error token kind was set.
    pub fn build(self) -> Lexer {
        let LexerBuilder { error_token, rules } = self;
        let error_token = match error_token {
            Some(kind) => kind,
            None => panic!("`error_token` is not set"),
        };
        Lexer { error_token, rules }
    }
}

/// Compiles `re` wrapped as `^(re)`, so the pattern can only match at the
/// very start of the input and alternations stay grouped under the anchor.
///
/// Panics if the resulting pattern is not a valid regular expression.
fn parse_re(re: &str) -> Regex {
    let anchored = format!("^({})", re);
    Regex::new(&anchored).unwrap()
}


/// The longest match wins; among equally long matches the rule registered
/// first wins (so "foobar" is a WORD, not a FOOBAR).
#[test]
fn tokenize_longest_first_wins() {
    const ERROR: TokenKind = TokenKind(0);
    const WS: TokenKind = TokenKind(1);
    const FOO: TokenKind = TokenKind(10);
    const WORD: TokenKind = TokenKind(11);
    const FOOBAR: TokenKind = TokenKind(12);

    let rules = [
        (WS, r"\s+"),
        (FOO, "foo"),
        (WORD, r"\w+"),
        (FOOBAR, "foobar"),
    ];
    let lexer = LexerBuilder::new()
        .error_token(ERROR)
        .tokens(&rules)
        .build();

    let expected = r#"
"foo" 10
" " 1
"foob" 11
" " 1
"foobar" 11"#;
    lexer.test("foo foob foobar", expected);
}

/// An external rule can lex nested `(* ... *)` comments, which a regular
/// expression alone cannot describe.
#[test]
fn extern_rule() {
    const ERROR: TokenKind = TokenKind(0);
    const WS: TokenKind = TokenKind(1);
    const WORD: TokenKind = TokenKind(2);
    const COMMENT: TokenKind = TokenKind(3);

    /// Returns the byte length of the (possibly nested) comment at the start
    /// of `input`; an unterminated comment extends to the end of the input.
    fn nested_comment(input: &str) -> Option<usize> {
        assert!(input.starts_with("(*"));
        let mut depth = 0;
        let mut consumed = 0;

        while let Some(c) = input[consumed..].chars().next() {
            let tail = &input[consumed..];
            let step = if tail.starts_with("(*") {
                depth += 1;
                2
            } else if tail.starts_with("*)") {
                depth -= 1;
                2
            } else {
                c.len_utf8()
            };
            consumed += step;
            // The outermost comment has just been closed.
            if depth == 0 {
                return Some(consumed);
            }
        }
        // Ran out of input before the comment was closed.
        Some(consumed)
    }

    let comment_rule: ExternRule = Box::new(|input: &str| nested_comment(input));
    let plain_rules = [
        (WS, r"\s+"),
        (WORD, r"\w+"),
    ];
    let lexer = LexerBuilder::new()
        .error_token(ERROR)
        .tokens(&plain_rules)
        .rule(COMMENT, r"\(\*", Some(comment_rule))
        .build();

    let expected = r#"
"foo" 2
" " 1
"(* (* bar *) *)" 3
" " 1
"baz" 2
" " 1
"*)" 0
" " 1
"(*" 3"#;
    lexer.test("foo (* (* bar *) *) baz *) (*", expected);
}