pvm 0.1.0

A parsing virtual machine with a focus on expressibility of grammars.
Documentation
use std::collections::{HashMap};
use ast;

#[derive(Debug, Eq, PartialEq)]
pub enum Token {
    OpenBrace,
    CloseBrace,
    OpenParen,
    CloseParen,
    OpenBracket,
    CloseBracket,
    SingleQuote,
    DoubleQuote,
    Dot,
    Plus,
    Asterik,
    Question,
    Exclamation,
    Ambersand,
    Slash,
    Dash,
    Colon,
    Number(i32),
    Name(i32),
    Letter(u8)
}

pub fn tokenize(grammar : &str) -> (Vec<Token>, i32, HashMap<Vec<u8>, i32>) {
    let mut iterator = grammar.chars();
    let mut tokens = vec![];
    let mut name = vec![];
    let mut i = 1;
    let mut in_quote = false;
    let mut in_bracket = false;
    let mut escaped = false;
    let main = vec![b'm', b'a', b'i', b'n'];
    let mut map = HashMap::new();
    map.insert(main, 0);

    while let Some(item) = iterator.next() {
        if in_quote || in_bracket {
            if escaped {
                match item {
                    't' => tokens.push(Token::Letter(b'\t')),
                    'r' => tokens.push(Token::Letter(b'\r')),
                    'n' => tokens.push(Token::Letter(b'\n')),
                    _ => tokens.push(Token::Letter(item as u8))
                }
                escaped = false;
            } else {
                match item {
                    '-' => {
                        if in_bracket {
                            tokens.push(Token::Dash);
                        } else {
                            tokens.push(Token::Letter('-' as u8));
                        }
                    },
                    '\'' if !in_bracket => {
                        tokens.push(Token::SingleQuote); 
                        in_quote = false;
                    },
                    '\"' if !in_bracket => {
                        tokens.push(Token::DoubleQuote); 
                        in_quote = false;
                    },
                    ']' if !in_quote => {
                        tokens.push(Token::CloseBracket);
                        in_bracket = false;
                    }
                    '\\' => { escaped = true; },
                    _ => tokens.push(Token::Letter(item as u8)),
                }
            }
        } else {
            if !item.is_alphanumeric() && item != '_' && name.len() != 0 {
                if name.iter().all(|&x| x >= b'0' && x <= b'9') {
                    let mut i = 1;
                    let mut result = 0;
                    while let Some(x) = name.pop() {
                        let n = (x - b'0') as i32;
                        result += n*i;
                        i *= 10;
                    }
                    tokens.push(Token::Number(result));
                } else {
                    let mut to_insert = false;
                    let id = match map.get(&name) {
                        Some(j) => *j,
                        None => {
                            let j = i;
                            i += 1;
                            to_insert = true;
                            j
                        }
                    };
                    if to_insert { map.insert(name.clone(), id); }
                    tokens.push(Token::Name(id));
                    name.clear();
                }
            }

            match item {  
                '{' => tokens.push(Token::OpenBrace),
                '}' => tokens.push(Token::CloseBrace),
                '(' => tokens.push(Token::OpenParen),
                ')' => tokens.push(Token::CloseParen),
                '.' => tokens.push(Token::Dot),
                '+' => tokens.push(Token::Plus),
                '*' => tokens.push(Token::Asterik),
                '?' => tokens.push(Token::Question),
                '!' => tokens.push(Token::Exclamation),
                '&' => tokens.push(Token::Ambersand),
                '/' => tokens.push(Token::Slash),
                '-' => tokens.push(Token::Dash),
                ':' => tokens.push(Token::Colon),
                '[' => {
                    tokens.push(Token::OpenBracket);
                    in_bracket = true;
                },
                ']' => {
                    tokens.push(Token::CloseBracket);
                    in_bracket = false;
                },
                '\'' => {
                    tokens.push(Token::SingleQuote);
                    in_quote = true;
                },
                '\"' => {
                    tokens.push(Token::DoubleQuote);
                    in_quote = true;
                },
                _ if item.is_alphanumeric() || item == '_' => name.push(item as u8),
                _ => { }
            }
        }
    }
    (tokens, i, map)
}

pub fn parse(tokens : Vec<Token>, rule_count : i32) -> Result<ast::Grammar, usize> {
    let mut grammar_object = ast::Grammar { rules: vec![], main: 0 };
    let mut insert_order = vec![];
    let mut i = 0;

    while let Some(token) = tokens.get(i) {
        if let &Token::Name(id) = token {
            i += 1;
            if let Some(brace_token) = tokens.get(i) {
                i += 1;
                if brace_token == &Token::OpenBrace {
                    let pattern = parse_expression(&mut i, &tokens);
                    match tokens.get(i) {
                        Some(&Token::CloseBrace) => i += 1,
                        _ => return Err(i)
                    }
                    match pattern {
                        Ok(p) => insert_order.push((id, p)),
                        Err(x) => return Err(x)
                    }
                } else {
                    return Err(i);
                }
            } else {
                return Err(i);
            } 
        } else {
            return Err(i);
        }
    }

    insert_order.sort_by(|a, b| a.0.cmp(&b.0));

    if insert_order.iter().map(|x| x.0).eq(0..rule_count) {
        grammar_object.rules = insert_order.drain(..).map(|x| x.1).collect();
        Ok(grammar_object)
    } else {
        Err(i)
    }
}

fn parse_expression(i : &mut usize, tokens : &Vec<Token>) -> Result<ast::Pattern, usize> {
    let first_sequence = match parse_sequence(i, tokens) {
        Ok(p) => p,
        Err(x) => return Err(x)
    };
    let mut patterns = vec![first_sequence];

    while tokens.get(*i) == Some(&Token::Slash) {
        *i += 1;
        let sequence = match parse_sequence(i, tokens) {
            Ok(p) => p,
            Err(x) => return Err(x)
        };
        patterns.push(sequence);
    }

    if patterns.len() >= 2 {
        let left = patterns.pop().unwrap();
        let right = patterns.pop().unwrap();
        let mut result = ast::Pattern::Choice(Box::new(right), Box::new(left));

        while let Some(remaining) = patterns.pop() {
            let temp = result;
            result = ast::Pattern::Choice(Box::new(remaining), Box::new(temp));
        }
        Ok(result)
    } else {
        Ok(patterns.pop().unwrap())
    }
}

fn parse_sequence(i : &mut usize, tokens : &Vec<Token>) -> Result<ast::Pattern, usize> {
    let first_prefix = match parse_prefix(i, tokens) {
        Ok(p) => p,
        Err(x) => return Err(x)
    };
    let mut patterns = vec![first_prefix];

    let mut backtrack = *i;
    loop {
        match parse_prefix(i, tokens) {
            Ok(p) => {
                patterns.push(p);
                backtrack = *i;
            },
            Err(_) => {
                *i = backtrack;
                break;
            }
        }
    }

    if patterns.len() > 1 {
        let boxed_patterns = patterns.drain(..).map(|x| Box::new(x)).collect();
        Ok(ast::Pattern::Sequence(boxed_patterns))
    } else {
        Ok(patterns.pop().unwrap())
    }
}

fn parse_prefix(i : &mut usize, tokens : &Vec<Token>) -> Result<ast::Pattern, usize> {
    let mut lookahead = None;

    if tokens.get(*i) == Some(&Token::Ambersand) {
        *i += 1;
        lookahead = Some(true);
    } else if tokens.get(*i) == Some(&Token::Exclamation) {
        *i += 1;
        lookahead = Some(false);
    }

    match parse_suffix(i, tokens) {
        Ok(p) => {
            match lookahead {
                Some(b) => Ok(ast::Pattern::Lookahead(b, Box::new(p))),
                None => Ok(p)
            }
        },
        Err(x) => Err(x)
    }
}

fn parse_suffix(i : &mut usize, tokens : &Vec<Token>) -> Result<ast::Pattern, usize> {
    match parse_primary(i, tokens) {
        Ok(p) => {
            if let Some(token) = tokens.get(*i) {
                match token {
                    &Token::Plus => {
                        *i += 1;
                        Ok(ast::Pattern::OneOrMore(Box::new(p)))
                    },
                    &Token::Asterik => {
                        *i += 1;
                        Ok(ast::Pattern::ZeroOrMore(Box::new(p)))
                    },
                    &Token::Question => {
                        *i += 1;
                        Ok(ast::Pattern::Optional(Box::new(p)))
                    },
                    _ => Ok(p)
                }
            } else { Ok(p) }
        },
        Err(x) => Err(x)
    }
}

fn parse_primary(i : &mut usize, tokens : &Vec<Token>) -> Result<ast::Pattern, usize> {
    let backtrack = *i;
    if let Some(token) = tokens.get(*i) {
        match token {
            &Token::Name(id) => {
                *i += 1;
                if tokens.get(*i) != Some(&Token::OpenBrace) {
                    if tokens.get(*i) == Some(&Token::Colon) {
                        *i += 1;
                        match tokens.get(*i) {
                            Some(&Token::Number(num)) => {
                                *i += 1;
                                Ok(ast::Pattern::Variable(id, num, 0, false))
                            },
                            _ => Err(*i)
                        }
                    } else {
                        Ok(ast::Pattern::Variable(id, -1, 0, false))
                    }
                } else {
                    *i = backtrack;
                    Err(*i)
                }
            },
            &Token::OpenParen => {
                *i += 1;
                match parse_expression(i, tokens) {
                    Ok(p) => {
                        if tokens.get(*i) == Some(&Token::CloseParen) {
                            *i += 1;
                            Ok(p)
                        } else {
                            Err(*i)
                        }
                    },
                    Err(x) => {
                        *i = backtrack;
                        Err(x)
                    }
                }
            },
            &Token::SingleQuote | &Token::DoubleQuote => {
                match parse_literal(i, tokens) {
                    Ok(p) => Ok(p),
                    Err(x) => {
                        *i = backtrack;
                        Err(x)
                    }
                }
            },
            &Token::OpenBracket => {
                match parse_class(i, tokens) {
                    Ok(p) => Ok(p),
                    Err(x) => {
                        *i = backtrack;
                        Err(x)
                    }
                }
            },
            &Token::Dot => {
                *i += 1;
                Ok(ast::Pattern::CharAny)
            },
            _ => Err(*i)
        }
    } else {
        Err(*i)
    }
}

fn parse_literal(i : &mut usize, tokens : &Vec<Token>) -> Result<ast::Pattern, usize> {
    let left_quote = tokens.get(*i);
    *i += 1;
    let mut letters = vec![];
    while let Some(token) = tokens.get(*i) {
        match token {
            &Token::Letter(x) => {
                *i += 1;
                letters.push(x);
            },
            _ => break
        }
    }
    let right_quote = tokens.get(*i);
    *i += 1;
    let quote_test = left_quote.is_some()
        && right_quote.is_some()
        && left_quote == right_quote
        && (*left_quote.unwrap() == Token::SingleQuote
            || *left_quote.unwrap() == Token::DoubleQuote);

    if quote_test {
        Ok(ast::Pattern::CharSequence(letters))
    } else {
        Err(*i)
    }
}

fn parse_class(i : &mut usize, tokens : &Vec<Token>) -> Result<ast::Pattern, usize> {
    match tokens.get(*i) {
        Some(&Token::OpenBracket) => *i += 1,
        _ => return Err(*i)
    }

    let mut ranges = vec![];
    loop {
        let left_letter = match tokens.get(*i) {
            Some(&Token::Letter(x)) => x,
            _ => break
        };
        *i += 1;
        let mut right_letter = None;
        
        if tokens.get(*i) == Some(&Token::Dash) {
            *i += 1;
            right_letter = match tokens.get(*i) {
                Some(&Token::Letter(x)) => Some(x),
                _ => return Err(*i)
            }
        }

        ranges.push((left_letter, right_letter));
    }

    match tokens.get(*i) {
        Some(&Token::CloseBracket) => *i += 1,
        _ => return Err(*i)
    }

    if ranges.len() > 0 {
        Ok(ast::Pattern::CharClass(ranges))
    } else {
        Err(*i)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use machine;

    fn execute_test(grammar : &str, subjects : &Vec<&str>, expected : &Vec<bool>) {
        let machine_result = machine::Machine::<String>::new(grammar);
        assert!(machine_result.is_ok());
        assert!(subjects.len() == expected.len());
        let mut machine = machine_result.ok().unwrap();
        for i in 0..expected.len() {
            let result = machine.execute(subjects[i].to_string().into_bytes());
            let fail = result.is_err();
            println!("machine result: {:?}", result);
            println!("{}", subjects[i]);
            assert!(!fail == expected[i]);
        }
    }

    #[test]
    fn tokenizer() {
        let grammar = "
            main { a b c / b c a / c b a }
            a { apple+ }
            b { \"bu\"* }
            c { [a-\\\"c]? }
            apple { &.!(\" \") }
        ";

        let tokens = tokenize(&grammar).0;
        println!("{:?}", tokens);
        let expected = vec![
            Token::Name(0), Token::OpenBrace, Token::Name(1), Token::Name(2), Token::Name(3),
                Token::Slash, Token::Name(2), Token::Name(3), Token::Name(1), Token::Slash,
                Token::Name(3), Token::Name(2), Token::Name(1), Token::CloseBrace,
            Token::Name(1), Token::OpenBrace, Token::Name(4), Token::Plus, Token::CloseBrace,
            Token::Name(2), Token::OpenBrace, Token::DoubleQuote, Token::Letter(b'b'),
                Token::Letter(b'u'), Token::DoubleQuote, Token::Asterik, Token::CloseBrace,
            Token::Name(3), Token::OpenBrace, Token::OpenBracket, Token::Letter(b'a'),
                Token::Dash, Token::Letter(b'\"'), Token::Letter(b'c'), Token::CloseBracket,
                Token::Question, Token::CloseBrace,
            Token::Name(4), Token::OpenBrace, Token::Ambersand, Token::Dot, Token::Exclamation,
                Token::OpenParen, Token::DoubleQuote, Token::Letter(b' '), Token::DoubleQuote,
                Token::CloseParen, Token::CloseBrace
        ];
        assert!(tokens.iter().eq(expected.iter()));
    }

    #[test]
    fn simple_char_grammar() {
        let grammar = "
            main { .char_class char_seq }
            char_class { [a-zA] }
            char_seq { \"abc\" }
        ";

        let subjects = vec!["azabc", "Bkabc", "AAabc", "aqd", "xyz"];
        let expected = vec![true, true, true, false, false];
        execute_test(&grammar, &subjects, &expected);
    }

    #[test]
    fn simple_suffix_grammar() {
        let grammar = "main { 'a'+ 'b'* 'c'? }";
        let subjects = vec!["ac", "a", "abb", "aaabbbc", "aaabbb", "bb", "c", "z"];
        let expected = vec![true, true, true, true, true, false, false, false];
        execute_test(&grammar, &subjects, &expected);
    }

    #[test]
    fn simple_choice_grammar() {
        let grammar = "main { 'a' / 'b' / 'c' }";
        let subjects = vec!["a", "b", "c", "abc", "z"];
        let expected = vec![true, true, true, false, false];
        execute_test(&grammar, &subjects, &expected);
    }

    #[test]
    fn simple_prefix_grammar() {
        let grammar = "main { 'a' &'b' 'b' 'c' !'d' }";
        let subjects = vec!["abc", "abc", "ac", "abcd"];
        let expected = vec![true, true, false, false];
        execute_test(&grammar, &subjects, &expected);
    }

    #[test]
    fn simple_parentheticals() {
        let grammar = "main { ('a' / 'b') / 'c' ('d'.)+ }";
        let subjects = vec!["a", "b", "cdx", "cdxdcdy", "x", "c"];
        let expected = vec![true, true, true, true, false, false];
        execute_test(&grammar, &subjects, &expected);
    }

    #[test]
    fn parenthetical_and_lookahead() {
        let grammar = "main { &('b' / 'a') .* }";
        let subjects = vec!["a", "b", "aa", "ab", "azzd", "c", "zab"];
        let expected = vec![true, true, true, true, true, false, false];
        execute_test(&grammar, &subjects, &expected);
    }

    #[test]
    fn direct_left_recursion() {
        let grammar = "main { (main:1 \"+n\" / 'n') ';' }";
        let subjects = vec!["n;", "n+n;", "n+n+n+n+n+n;", "n", "n+;", "+n;", "n+n", ";"];
        let expected = vec![true, true, true, false, false, false, false, false];
        execute_test(&grammar, &subjects, &expected);
    }

    #[test]
    fn indirect_left_recursion() {
        let grammar = "main { L } L { P:1 '.x' / 'x' } P { P:1 '(n)' / L:1 }";
        let subjects = vec!["x", "x.x", "x(n).x", "x(n)(n).x(n).x", "x.", "x(n)x", "(n)"];
        let expected = vec![true, true, true, true, false, false, false];
        execute_test(&grammar, &subjects, &expected);
    }

    #[test]
    fn dogfood() {
        let grammar = "
            main { grammar }
            grammar { s rule+ }
            rule { name '{' s expression '}' s }
            expression { sequence (slash sequence)* }
            sequence { prefix+ }
            prefix { (and / not)? suffix }
            suffix { primary (question / star / plus)? }
            primary { name (colon num)? !'{' / open expression close / literal / class / dot }

            name { [a-zA-Z][a-zA-Z0-9_]* s }
            num { [1-9][0-9]* s }

            literal { 
                    '\\'' (!'\\'' char)* '\\'' s
                /   '\\\"' (!'\\\"' char)* '\\\"' s
            }
            class { '[' (!']' range)* ']' s }
            range { char '-' char / char }
            char {
                    '\\\\' [trn'\"\\\\]
                /   !'\\\\' .
            }

            slash { '/' s }
            and { '&' s }
            not { '!' s }
            question { '?' s }
            star { '*' s }
            plus { '+' s }
            open { '(' s }
            close { ')' s }
            dot { '.' s }
            colon { ':' s }

            s { ws* }
            ws { [ \\t\\r\\n] }
        ";
        let subjects = vec![
            "main { ('a' / 'b') / 'c' ('d'.)+ }",
            "main { 'a' &'b' 'b' 'c' !'d' }",
            "main { 'a' / 'b' / 'c' }",
            "main { 'a'+ 'b'* 'c'? }",
            "main { .char_class char_seq }
                char_class { [a-zA] }
                char_seq { \"abc\" }",
            &grammar
        ];
        let expected = vec![true, true, true, true, true, true];
        execute_test(&grammar, &subjects, &expected);
    }

}