peg 0.3.9

A parser generator built on the Parsing Expression Grammar (PEG) formalism.
use translate::*;
use std::char;

// Entry point: a whole grammar file, i.e. optional leading whitespace or
// comments, any number of `use` declarations, then any number of rules.
#[export]
grammar -> Grammar
  = __ uses:rust_use* definitions:rule*
  { Grammar{ imports: uses, rules: definitions } }

// A single rule definition: optional export and cache flags, the rule name,
// an optional `-> type`, `=`, the body expression and an optional `;`.
rule -> Rule
  = is_exported:exportflag is_cached:cacheflag rule_name:identifier result_type:returntype equals body:expression semicolon? {
      Rule{
        name: rule_name,
        expr: Box::new(body),
        ret_type: result_type,
        exported: is_exported,
        cached: is_cached
      }
    }

    // True when the rule is preceded by `#[export]` or `#[pub]`.
    exportflag -> bool
      = ("#[export]" / "#[pub]") __ { true }
      / { false }
    // True when the rule is preceded by `#[cache]`.
    cacheflag -> bool
      = "#[cache]" __ { true }
      / { false }

// Optional `-> type` annotation of a rule; yields "()" when it is absent.
// The type is returned as the trimmed source text matched by `rust_type`.
// NOTE(review): the `rust_type` alternatives end in `__`, so a comment placed
// between the type and `=` would presumably be captured into the type
// string -- confirm before putting comments there.
returntype -> String
  = returns tp:(rust_type {match_str.trim().to_string()}) { tp }
  / { "()".to_string() }

// A Rust `use` declaration in one of three forms:
//   use path::*;            -> RustUseGlob(path)
//   use path::{a, b, ...};  -> RustUseList(path, names)
//   use path;               -> RustUseSimple(path)
// NOTE(review): `p` is cloned in every alternative, presumably because the
// generated code cannot move it out of a branch that may not be taken --
// confirm before removing the clones.
rust_use -> RustUse
  = "use" __ p:rust_path __ v:(
    "::" __ "*" __
      { RustUseGlob(p.clone()) }
    / "::" __ "{" __ names:(identifier ++ ("," __)) "}" __
      { RustUseList(p.clone(), names) }
    / ""
      { RustUseSimple(p.clone()) }
  ) ";" __ { v }

// One or more identifiers separated by `::`, returned as the matched source
// text (not re-assembled from the individual identifiers).
rust_path -> String
  = ( identifier ++ ("::" __)) { match_str.to_string() }

// Recognizes (without building a value) a subset of Rust type syntax:
// unit, slices, references with an optional lifetime, generic types,
// `::`-qualified paths, tuples and plain identifiers.
// The alternatives are ordered: the longer `identifier ...` forms must be
// tried before the bare identifier at the end.
// NOTE(review): the purpose of the trailing `""` after some `identifier`
// references is not visible here -- presumably it affects the inferred
// result type; confirm before removing.
rust_type
  = "()" __
  / "[" rust_type "]" __
  / "&" ("'" identifier)? rust_type
  / identifier "<" (("'" identifier "") / rust_type) ** (__ "," __) ">" __
  / identifier "::" rust_type
  / "(" rust_type ++ (__ "," __) ")" __
  / identifier ""

// Top-level expression; currently just an alias for `choice`.
expression -> Expr
  = choice

// One or more sequences separated by `/`. A lone sequence is returned as-is;
// two or more are wrapped, in source order, in a ChoiceExpr.
choice -> Expr
  = first:sequence rest:(slash alt:sequence {alt})* {
      if rest.is_empty() {
        first
      } else {
        let mut alternatives = Vec::with_capacity(rest.len() + 1);
        alternatives.push(first);
        alternatives.extend(rest);
        ChoiceExpr(alternatives)
      }
    }

// Either a run of (optionally labeled) elements followed by an action block,
// or a plain run of elements. In the plain form a single element is returned
// unwrapped; zero or several elements become a SequenceExpr.
sequence -> Expr
  = items:labeled* block:action {
      let (source, is_conditional) = block;
      ActionExpr(items, source, is_conditional)
    }
  / items:prefixed* {
      if items.len() == 1 {
          items.into_iter().next().unwrap()
      } else {
          SequenceExpr(items)
      }
    }

// A sequence element with an optional `label:` prefix. Unlabeled elements
// get `name: None`.
labeled -> TaggedExpr
  = tag:identifier colon inner:prefixed {
      TaggedExpr{ name: Some(tag), expr: Box::new(inner) }
    }
  / inner:prefixed {
      TaggedExpr{ name: None, expr: Box::new(inner) }
    }

// An expression with an optional prefix operator:
//   `$e` -- accepted, but the action passes `e` through unchanged,
//   `&e` -- wrapped in PosAssertExpr,
//   `!e` -- wrapped in NegAssertExpr.
// The commented-out alternatives are placeholders for `&` / `!` followed by
// an action block, which this grammar does not accept.
prefixed -> Expr
  = dollar expression:suffixed {
      expression
    }
//  / and code:action {
//      fail!("/*Semantic and unsupported*/");
//    }
  / and expression:suffixed {
      PosAssertExpr(Box::new(expression))
    }
//  / not code:action {
//      fail!("/*Semantic not unsupported*/");
//    }
  / not expression:suffixed {
      NegAssertExpr(Box::new(expression))
    }
  / suffixed

// A primary expression with an optional suffix operator. The Repeat
// arguments are (expression, minimum, optional maximum, optional separator):
//   e?         -> OptionalExpr
//   e ** sep   -> zero or more, separated by `sep`
//   e ++ sep   -> one or more, separated by `sep`
//   e*  / e+   -> zero / one or more
//   e{n}       -> exactly n
//   e{min,max} -> at least min, at most max when max is given
//   e{,max}    -> at least zero, at most max when max is given
// Alternative order matters: `**` and `++` are tried before `*` and `+`,
// since the single-character operators are prefixes of the doubled ones.
suffixed -> Expr
  = expression:primary question {
       OptionalExpr(Box::new(expression))
    }
  / expression:primary starstar sep:primary {
      Repeat(Box::new(expression), 0, None, Some(Box::new(sep)))
    }
  / expression:primary plusplus sep:primary {
      Repeat(Box::new(expression), 1, None, Some(Box::new(sep)))
    }
  / expression:primary star {
      Repeat(Box::new(expression), 0, None, None)
    }
  / expression:primary plus {
      Repeat(Box::new(expression), 1, None, None)
    }
  / expression:primary lbrace n:integer rbrace {
      Repeat(Box::new(expression), n, Some(n), None)
    }
  / expression:primary lbrace min:integer comma max:integer? rbrace {
      Repeat(Box::new(expression), min, max, None)
    }
  / expression:primary lbrace comma max:integer? rbrace {
      Repeat(Box::new(expression), 0, max, None)
    }
  / primary

// The atoms of an expression: a rule reference, a string literal, a
// character class, `.` (any character) or a parenthesized expression.
// The negative lookahead after `identifier` rejects an identifier that is
// followed by an optional string, an optional `-> type` and `=`: that is the
// start of the next rule definition, not a reference to a rule.
// `suffixed` retries `primary` at the same position for each of its
// alternatives; the rule is marked `#[cache]`.
#[cache]
primary -> Expr
  = name:identifier !(string? returntype equals) {
      RuleExpr(name)
    }
  / literal
  / class
  / dot { AnyCharExpr }
  / lparen expression:expression rparen { expression }

/* "Lexical" elements */

// A brace-delimited block of Rust code, returned as (source text, flag).
// The flag is true for the conditional form `{? ... }`; in that case the
// `?` is dropped by rebuilding the text as "{" plus everything after the
// first two characters of the match.
// Braces are balanced purely lexically (see `braced`), which is why the
// `push_str("{")` line below carries a comment containing a closing brace:
// it keeps this action's own braces balanced. Do not remove it.
action -> (String, /*is conditional match?*/ bool)
  = "{" cond:"?"? (((braced "") / nonBraceCharacters)*) "}" __ {
    match cond {
        Some(_) => {
            let mut cond = String::with_capacity(match_str.len()-1);
            cond.push_str("{"); // }
            cond.push_str(&match_str[2..]);
            (cond, true)
        }
        None => (match_str.to_string(), false)
    }
  }

// A nested, balanced `{ ... }` group inside an action, returned as its
// matched source text. Recurses so that arbitrarily deep nesting is accepted.
braced -> String
  = "{" (((braced "") / nonBraceCharacters)*) "}" {match_str.to_string()}

// A non-empty run of characters that are neither `{` nor `}`.
nonBraceCharacters
  = nonBraceCharacter+

// Any single character other than `{` and `}`.
nonBraceCharacter
  = [^{}]

// Punctuation tokens. Each one also consumes any whitespace, line breaks and
// comments that follow it (`__`).
equals    = "="  __
colon     = ":"  __
semicolon = ";"  __
slash     = "/"  __
and       = "&"  __
not       = "!"  __
dollar    = "$"  __
question  = "?"  __
star      = "*"  __
starstar  = "**" __
plus      = "+"  __
plusplus  = "++" __
lparen    = "("  __
rparen    = ")"  __
dot       = "."  __
returns   = "->" __
lbrace    = "{"  __
rbrace    = "}"  __
comma     = ","  __

/*
 * A non-negative decimal integer followed by optional whitespace.
 *
 * The digits are converted in a conditional action, so a digit run that does
 * not fit in `usize` is reported as a parse failure instead of panicking.
 */
integer -> usize
  = i:([0-9]+ {? match_str.parse::<usize>().or(Err("integer that fits in usize")) }) __ { i }

/*
 * Modeled after ECMA-262, 5th ed., 7.6, but much simplified:
 *
 * * no Unicode escape sequences
 *
 * * "Unicode combining marks" and "Unicode connection punctuation" can't be
 *   part of the identifier
 *
 * * only [a-zA-Z] is considered a "Unicode letter"
 *
 * * only [0-9] is considered a "Unicode digit"
 *
 * The simplifications were made just to make the implementation a little bit
 * easier; there is no "philosophical" reason behind them.
 *
 * Contrary to ECMA 262, the "$" character is not valid because it serves another
 * purpose in the grammar.
 */
// A letter or `_` followed by letters, digits or `_`. The returned string is
// the identifier text only; the trailing `__` is consumed but not included.
identifier -> String
  = chars:((letter / "_") (letter / digit / "_")* {match_str.to_string()}) __ { chars }

/*
 * Modeled after ECMA-262, 5th ed., 7.8.4. (syntax & semantics, rules only
 * vaguely).
 */
// A quoted string used as an expression, with an optional `i` suffix that
// marks it case-insensitive.
literal -> Expr
  = text:(doubleQuotedString / singleQuotedString) ci_flag:"i"? __ {
      LiteralExpr(text, ci_flag.is_some())
    }

// A quoted string (either quote style) followed by optional whitespace,
// returned as its unescaped contents.
string -> String
  = contents:(doubleQuotedString / singleQuotedString) __ { contents }

// A "..." string; the decoded characters are collected into a String.
doubleQuotedString -> String
  = '"' decoded:doubleQuotedCharacter* '"' { decoded.into_iter().collect() }

// One character of a double-quoted string: either a plain character or one
// of the escape sequences, tried in the order listed.
doubleQuotedCharacter -> char
  = simpleDoubleQuotedCharacter
  / simpleEscapeSequence
  / zeroEscapeSequence
  / hex2EscapeSequence
  / unicodeEscapeSequence
  / hex4EscapeSequence
  / hex8EscapeSequence
  / eolEscapeSequence

// Any character except `"`, backslash and line terminators.
simpleDoubleQuotedCharacter -> char
  = !('"' / "\\" / eolChar) . { match_str.chars().next().unwrap() }

// A '...' string; the decoded characters are collected into a String.
singleQuotedString -> String
  = "'" decoded:singleQuotedCharacter* "'" { decoded.into_iter().collect() }

// One character of a single-quoted string: either a plain character or one
// of the escape sequences, tried in the order listed.
singleQuotedCharacter -> char
  = simpleSingleQuotedCharacter
  / simpleEscapeSequence
  / zeroEscapeSequence
  / hex2EscapeSequence
  / unicodeEscapeSequence
  / hex4EscapeSequence
  / hex8EscapeSequence
  / eolEscapeSequence

// Any character except `'`, backslash and line terminators.
simpleSingleQuotedCharacter -> char
  = !("'" / "\\" / eolChar) . { match_str.chars().next().unwrap() }

// A character class such as `[a-z_]` or `[^0-9]`; a leading `^` inverts it.
// NOTE(review): a trailing `i` is accepted and bound to `flags`, but the
// action never reads it, so the flag currently has no effect.
class -> Expr
  = "[" inverted:"^"? parts:(classCharacterRange / classCharacter)* "]" flags:"i"? __ {
      CharSetExpr(inverted.is_some(), parts)
    }

// A `begin-end` range inside a character class. The bounds are stored as
// written; nothing here checks that begin does not exceed end.
classCharacterRange -> CharSetCase
  = begin:bracketDelimitedCharacter "-" end:bracketDelimitedCharacter {
      //TODO: reject ranges where begin > end (currently accepted as-is)
      CharSetCase{start:begin, end:end}
    }

// A single character inside a character class, stored as a one-element range.
classCharacter -> CharSetCase
  = single:bracketDelimitedCharacter {
      CharSetCase{ start: single, end: single }
    }

// One character of a character class: either a plain character or one of
// the escape sequences, tried in the order listed.
bracketDelimitedCharacter -> char
  = simpleBracketDelimitedCharacter
  / simpleEscapeSequence
  / zeroEscapeSequence
  / hex2EscapeSequence
  / unicodeEscapeSequence
  / hex4EscapeSequence
  / hex8EscapeSequence
  / eolEscapeSequence

// Any character except `]`, backslash and line terminators.
simpleBracketDelimitedCharacter -> char
  = !("]" / "\\" / eolChar) . { match_str.chars().next().unwrap() }

// A backslash followed by one character that is not a digit, `x`, `u`, `U`
// or a line terminator (those introduce the other escape forms).
// `\n`, `\r` and `\t` map to the corresponding control characters; any other
// escaped character stands for itself (e.g. an escaped quote or backslash).
// The commented-out arms are escapes that are not translated.
simpleEscapeSequence -> char
  = "\\" !(digit / "x" / "u" / "U" / eolChar) . {
      match match_str[1..].chars().next().unwrap() {
        //'b' => '\b',
        //'f' => '\f',
        'n' => '\n',
        'r' => '\r',
        't' => '\t',
        //'v' => '\v',
         x  => x
      }
    }

// `\0` not followed by another digit: the NUL character.
zeroEscapeSequence -> char
  = "\\0" !digit { '\0' }

// `\xHH`: a character given by exactly two hexadecimal digits.
// Both unwraps are safe here: two hex digits always parse and give a value
// of at most 0xFF, which is always a valid `char`.
hex2EscapeSequence -> char
  = "\\x" value:(hexDigit hexDigit { u32::from_str_radix(match_str, 16) }) {
      char::from_u32(value.unwrap()).unwrap()
    }

/*
 * `\u` followed by one or more hexadecimal digits in braces: a character
 * given by its Unicode code point.
 *
 * The digit run is unbounded, so the value may overflow `u32` or may not be
 * a valid scalar value (a surrogate, or anything above 0x10FFFF). A
 * conditional action turns both cases into a parse failure instead of a
 * panic.
 */
unicodeEscapeSequence -> char
  = "\\u{" value:(hexDigit+ { u32::from_str_radix(match_str, 16) }) "}" {?
      value.ok().and_then(char::from_u32).ok_or("valid unicode code point")
    }

// `\uHHHH`: a character given by exactly four hexadecimal digits.
// Deprecated in Rust, will be removed.
// Values in the surrogate range (D800-DFFF) are not valid characters; a
// conditional action reports them as a parse failure instead of panicking.
hex4EscapeSequence -> char
  = "\\u" value:(hexDigit hexDigit hexDigit hexDigit { u32::from_str_radix(match_str, 16) }) {?
      value.ok().and_then(char::from_u32).ok_or("valid unicode code point")
    }

// `\UHHHHHHHH`: a character given by exactly eight hexadecimal digits.
// Deprecated in Rust, will be removed.
// Eight digits can encode surrogates and values above 0x10FFFF, which are
// not valid characters; a conditional action reports them as a parse failure
// instead of panicking.
hex8EscapeSequence -> char
  = "\\U" value:(hexDigit hexDigit hexDigit hexDigit hexDigit hexDigit hexDigit hexDigit { u32::from_str_radix(match_str, 16) }) {?
      value.ok().and_then(char::from_u32).ok_or("valid unicode code point")
    }

// A backslash followed by a line terminator; always yields '\n', whichever
// terminator was matched.
eolEscapeSequence -> char
  = "\\" eol { '\n' }

// ASCII decimal digit.
digit
  = [0-9]

// ASCII hexadecimal digit, either case.
hexDigit
  = [0-9a-fA-F]

// ASCII letter, either case.
letter
  = lowerCaseLetter
  / upperCaseLetter

// ASCII lower-case letter.
lowerCaseLetter
  = [a-z]

// ASCII upper-case letter.
upperCaseLetter
  = [A-Z]

// Skippable input: any mix of whitespace, line terminators and comments
// (possibly empty).
__ = (whitespace / eol / comment)*

/* Modeled after ECMA-262, 5th ed., 7.4. */
// A comment in either supported style.
comment
  = singleLineComment
  / multiLineComment

// `//` up to, but not including, the next line terminator.
singleLineComment
  = "//" (!eolChar .)*

// A block comment; it ends at the first closing delimiter, so block
// comments do not nest.
multiLineComment
  = "/*" (!"*/" .)* "*/"

/* Modeled after ECMA-262, 5th ed., 7.3. */
// A single line terminator. "\r\n" is listed before "\r" so that a CRLF
// pair is consumed as one terminator.
eol
  = "\n"
  / "\r\n"
  / "\r"
  / "\u{2028}"
  / "\u{2029}"

// Any single character that can start a line terminator. Uses the braced
// unicode escape form, as `eol` and `whitespace` do, instead of the
// deprecated four-digit form.
eolChar
  = [\n\r\u{2028}\u{2029}]

/* Modeled after ECMA-262, 5th ed., 7.2. */
// A single whitespace character other than a line terminator: space, tab and
// the Unicode space characters listed in the class below.
whitespace
  = [ \t\u{00A0}\u{FEFF}\u{1680}\u{180E}\u{2000}-\u{200A}\u{202F}\u{205F}\u{3000}] // \v\f removed