[−][src]Macro pidgin::grammar

macro_rules! grammar {
    ( @initialized $mflags:expr, $($rest:tt)+ ) => { ... };
    (@start_rule $l:expr, $m:expr, $name:ident, $mf:expr, $($rest:tt)+) => { ... };
    (@rules $l:expr, $m:expr, $name:ident => (?$on:ident-$off:ident) $($rest:tt)+) => { ... };
    (@rules $l:expr, $m:expr, $name:ident => (?$on:ident) $($rest:tt)+) => { ... };
    (@rules $l:expr, $m:expr, $name:ident => (?-$off:ident) $($rest:tt)+) => { ... };
    (@rules $l:expr, $m:expr, $name:ident => $($rest:tt)+) => { ... };
    (@rules $l:expr, $m:expr, $name:ident -> (?$on:ident-$off:ident) $($rest:tt)+) => { ... };
    (@rules $l:expr, $m:expr, $name:ident -> (?$on:ident) $($rest:tt)+) => { ... };
    (@rules $l:expr, $m:expr, $name:ident -> (?-$off:ident) $($rest:tt)+) => { ... };
    (@rules $l:expr, $m:expr, $name:ident -> $($rest:tt)+) => { ... };
    ( @add_part $l:expr, $m:expr, $p:expr, $($parts:tt)* ) => { ... };
    ( @add_grammar $l:expr, $m:expr, $e:ident, $low:expr, $high:expr, $stingy:expr, $($parts:tt)* ) => { ... };
    ( @rules $l:expr, $m:expr, <$e:ident>?? $($parts:tt)* ) => { ... };
    ( @rules $l:expr, $m:expr, <$e:ident>? $($parts:tt)* ) => { ... };
    ( @rules $l:expr, $m:expr, <$e:ident>*? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, <$e:ident>* $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, <$e:ident>+? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, <$e:ident>+ $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, <$e:ident>{$low:expr,$high:expr}? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, <$e:ident>{$low:expr,$high:expr} $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, <$e:ident>{$low:expr,}? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, <$e:ident>{$low:expr,} $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, <$e:ident>{$n:expr} $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, <$e:ident> $($parts:tt)*) => { ... };
    ( @add_foreign_grammar $l:expr, $m:expr, $e:expr, $low:expr, $high:expr, $stingy:expr, $($parts:tt)* ) => { ... };
    ( @rules $l:expr, $m:expr, g($e:expr)?? $($parts:tt)* ) => { ... };
    ( @rules $l:expr, $m:expr, g($e:expr)? $($parts:tt)* ) => { ... };
    ( @rules $l:expr, $m:expr, g($e:expr)*? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, g($e:expr)* $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, g($e:expr)+? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, g($e:expr)+ $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, g($e:expr){$low:expr,$high:expr}? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, g($e:expr){$low:expr,$high:expr} $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, g($e:expr){$low:expr,}? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, g($e:expr){$low:expr,} $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, g($e:expr){$n:expr} $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, g($e:expr) $($parts:tt)*) => { ... };
    ( @add_string $l:expr, $m:expr, $e:expr, $low:expr, $high:expr, $stingy:expr, $($parts:tt)* ) => { ... };
    ( @rules $l:expr, $m:expr, ($e:expr)?? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, ($e:expr)? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, ($e:expr)*? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, ($e:expr)* $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, ($e:expr)+? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, ($e:expr)+ $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, ($e:expr){$low:expr,$high:expr}? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, ($e:expr){$low:expr,$high:expr} $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, ($e:expr){$low:expr,}? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, ($e:expr){$low:expr,} $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, ($e:expr){$n:expr} $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, ($e:expr) $($parts:tt)*) => { ... };
    ( @add_vec $l:expr, $m:expr, $e:expr, $low:expr, $high:expr, $stingy:expr, $($parts:tt)* ) => { ... };
    ( @rules $l:expr, $m:expr, [$e:expr]?? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, [$e:expr]? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, [$e:expr]*? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, [$e:expr]* $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, [$e:expr]+? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, [$e:expr]+ $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, [$e:expr]{$low:expr,$high:expr}? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, [$e:expr]{$low:expr,$high:expr} $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, [$e:expr]{$low:expr,}? $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, [$e:expr]{$low:expr,} $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, [$e:expr]{$n:expr} $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, [$e:expr] $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, r($e:expr) $($parts:tt)*) => { ... };
    ( @rules $l:expr, $m:expr, | $($parts:tt)* ) => { ... };
    ( @rules $l:expr, $m:expr, $($parts:tt)*) => { ... };
    ( (?$on:ident) $($rest:tt)+ ) => { ... };
    ( (?-$off:ident) $($rest:tt)+ ) => { ... };
    ( (?$on:ident-$off:ident) $($rest:tt)+ ) => { ... };
    ( $($rest:tt)+ ) => { ... };
}

Compiles a Grammar.

Examples

#![recursion_limit = "256"] // if your grammar is big, you may need to bump this limit
#[macro_use] extern crate pidgin;
let mdays = &(1..=31)
    .into_iter()
    .map(|i| i.to_string())
    .collect::<Vec<_>>();
let g = grammar!{
    (?i)
    // top rule -- each thing matched against is expected only to be a time expression
    time -> r(r"\A") <type> r(r"\z")

    // sub-rules
    type           => <relative> | <absolute>
    relative       -> <modifier> <unit> | <modifier> <period>
    period         => <weekday> | <month>
    absolute       => <month_day> | <day_month_year> | <month_year>
    absolute       => <month_day_year> | <year>
    month_day      -> <month> <mday>
    day_month_year -> <mday> <month> <year>
    month_year     -> <month> <year>
    month_day_year -> <month> <mday> (",") <year>

    // leaves
    mday     => (?bB) [mdays]
    modifier => (?bB) [["this", "last", "next"]]
    unit     => (?bB) [["day", "week", "month", "year"]]
    year     => r(r"\b\d{4}\b")
    weekday  => (?bB) [[
                        "sunday",
                        "monday",
                        "tuesday",
                        "wednesday",
                        "thursday",
                        "friday",
                        "saturday"
                      ]]
    month    => (?bB) [[
                        "january",
                        "february",
                        "march",
                        "april",
                        "may",
                        "june",
                        "july",
                        "august",
                        "september",
                        "october",
                        "november",
                        "december",
                      ]]
};
let matcher = g.matcher().unwrap();
assert!(matcher.is_match("May 6, 1969"));
assert!(matcher.is_match("May 6"));
assert!(matcher.is_match("1969"));
assert!(matcher.is_match("last Saturday"));
let p = matcher.parse("May 6, 1969").unwrap();
assert!(p.name("absolute").is_some());
assert!(p.name("month").is_some());

Conventions

Structure

grammar!{
  // optional default flags for all rules
  (?ims)

  // a master rule that must match
  TOP => <cat> | <dog>

  // optional sub-rules used by the master rule
  cat =>
         (?-i) // optional rule-specific flags
         [["calico", "tabby"]]
  dog => [["dachshund", "malamute"]]
};

The macro requires only that you provide a master rule and that the remaining rules all be sortable such that every rule that is referenced is defined and that there is no recursion. Recursion will cause a panic.

Conventions

Flags

Flags take the form (?<on>-<off>) just as in regular expressions. They may appear before all the rules in the grammar as defaults and after the arrow of a particular rule, in which case they apply to that rule alone. Rules do not inherit the flags of rules they are contained in. In

grammar!{
  foo -> (?i) ("the") <cat>
  cat => ("cat")
};

the <cat> rule is not case-insensitive.

The flags understood are the standard regex flags minus x plus a few peculiar to grammars.

`b` and `B`

The b and B flags correspond to the \b regex anchor, the b flag being for the left word boundary anchor and the B flag for the right. They only have an effect on ("literal") and [vec] elements of a rule, and only when these elements are on the left or right margin of their rule.

grammar!{
  foo -> (?bB) ("cat") ("dog") ("donkey")
};

will produce a regex equivalent to \bcat\s*dog\s*donkey\b.

`w` and `W`

The w and W flags, which are mutually incompatible, control the normalization of whitespace in [vec] elements. w means "some whitespace" and W means "maybe some whitespace".

grammar!{
  foo => (?w) [["cat a log"]]
};

is equivalent to cat\s+a\s+log.

grammar!{
  foo => (?W) [["cat a log"]]
};

is equivalent to cat\s*a\s*log.

As in regular expressions, the order of flags, or repetition of flags, in "on" and "off" parts is irrelevant. (?bB) means the same thing as (?Bb), which means the same thing as (?bbbbbBBBB), which means the same thing as (?bBbB).

Identifiers

Rule names in grammars must be legal rust identifiers: rule is good; 7ule and r*le are bad.

Arrows

Rule names are separated from their definition by one of two arrows.

fat arrow `=>`

grammar!{
  foo => ("cat") ("dog") // equivalent to the regex catdog
};

This is the usual separator.

skinny arrow `->`

The skinny arrow separator indicates that the elements of the rule may optionally be separated by whitespace.

grammar!{
  foo -> ("cat") ("dog") // equivalent to the regex cat\s*dog
};

Repeated elements may also be separated by whitespace.

grammar!{
  foo -> ("cat") ("dog")+ // equivalent to the regex cat(?:\s*dog)+
};

Normally the first element of a -> rule is not preceded by any optional whitespace, but this is not so for repeated elements.

grammar!{
  foo -> ("dog")+ // equivalent to the regex (?:\s*dog)+
};

If you combine the skinny arrow with word boundaries, the optional space may become obligatory.

grammar!{
  foo  -> <word>+
  word => (?bB) [["cat", "dog", "tortoise"]]
};

In order for there to be a word boundary between the repeated words in this case, there must be some space.

Elements

A rule definition, after the optional flags, consists of a sequence of rule elements.

let qux = grammar!{
  something => ("or other")
};
let illustration = grammar!{
  foo => <bar> r("baz") [["plugh"]] g(qux)
  bar => ("baz")
};

`(literal)`

grammar!{
  foo => ("bar")
};

An element delimited by bare parentheses provides a literal. The parentheses must contain an expression that can be converted to a String with to_string. The value of this string will not be further manipulated. Whitespace characters are preserved as is. This literal will be interpolated into the constructed regular expression with appropriate escaping, so, for instance, . will become \.. Flags such as (?i) may still apply.

`r(regex)`

grammar!{
  foo => r(r"\d+")
};

An element delimited by a pair of parentheses preceded by an r provides a regular expression literal. The parentheses again must contain an expression that can be converted to a String via to_string. Unlike the (literal) expression, the characters in the r(regex) literal will not be escaped.

Take care to avoid named captures in r(regex) elements, as Rust's regex does not allow the repetition of group names. Also, it is possible the name you choose will conflict with those inserted by the macro. These all consist of an m followed by an integer.

`<rule>`

grammar!{
  foo => <bar>
  bar => ("baz")
};

An element delimited by angle brackets refers to another rule. The angle brackets must contain the name of that rule, written as a bare identifier.

`[vec]`

grammar!{
  foo => [["cat", "dog", "tortoise"]]
};

An element delimited by square brackets introduces a list of elements to be condensed into a regular expression. Meta-characters in the items in the list will be escaped, but white space may be normalized. The square brackets must contain a Rust expression to which .iter().map(|i| i.to_string()).collect() may be applied to get a Vec of Strings.

`g(grammar)`

let sub_grammar = grammar!{
  foo => ("bar")
};
grammar!{
  foo => g(sub_grammar)
};

The expression inside a g(grammar) element must be a Grammar or a reference to the same. It will be converted into an owned Grammar via clone.

The g(grammar) element provides a means to reuse grammars in other grammars. Note that Grammar::rule provides a mechanism to extract a useful piece of one grammar for re-use in another.

let names = grammar!{
  name       => <east_asian> | <western>
  east_asian -> <surname> <given_name>
  western    -> <given_name> <surname>
  given_name => (?bB) [["Sally", "Wataru", "Jo"]]
  surname    => (?bB) [["Ng", "Smith", "Salasie"]]
};
grammar!{
  foo => g(names.rule("surname").unwrap())
};

Nothing ensures a name used in both grammars is used the same way in each:

let g1 = grammar!{
    foo => ("bar")   // here foo is bar
};
let g2 = grammar!{
    words -> <word>+
    word  => <foo> | <bar>
    foo   => ("baz") // here foo is baz
    bar   => g(g1)   // but the other foo gets in here
};
let matcher = g2.matcher().unwrap();
let p = matcher.parse("bar baz").unwrap();
assert_eq!(
    vec!["bar", "baz"],
    p.all_names("foo")
        .iter()
        .map(|m| m.as_str())
        .collect::<Vec<_>>()
);

Repetition

All element types except r(regex) can be followed by a repetition suffix. These are identical to the repetition suffixes allowed by regexes.

let g1 = grammar!{
  foo => ("bar")
};
grammar!{
  foo => <r> | <s>? | <v>* | <g>+
  g   => g(g1){2}   g(g1){3,}   g(g1){4,5}
  s   => ("foo")??   ("bar")*?   ("baz")+?
  v   => [["plugh"]]{6,}?   [["quux"]]{7,8}?
  r   => r(r"no suffix for me*")
};

There is no need to add repetition suffixes to r(regex) rules, since you can put these in the regex itself. If you absolutely insist, however, you can add a repetition to a reference to the rule.

grammar!{
  foo => <r>+?
  r   => r(r"no suffix for me*")
};

Alternation

Alternation is offering a choice of rules. There are two ways to represent alternation in a grammar!:

grammar!{
  foo => ("a") | ("b")
  foo => ("c")
};

Each definition of a rule is an alternate. Also, within one definition one may separate alternates with |.

Unlike in regexes there is no grouping construction in a grammar! aside from the definition of rules.

Recursion

There are two points that bear mentioning regarding recursion and grammars.

You cannot write a recursive grammar. There is no mechanism in regex which would allow it and the macro could not compile it.
The grammar! macro works by recursion, nibbling elements off the beginning of the grammar definition and then recursing until all tokens are consumed. This means that if your grammar is large, and it need not be terribly large, it may require a larger stack during compilation than the compiler provides by default. In this case one need only request more space.

#![recursion_limit = "256"] // bump this up until your grammar compiles
#[macro_use] extern crate pidgin;
// ...

`lazy_static!`

As with regexes, the definition of grammars is not something you want to do repeatedly at runtime. The best practice is to compile them once and then reuse them with the lazy_static macro.