// English text grammar, defined with the `peg` parser-generator macro.
peg::parser! {
    /// Grammar for a simple English parser.
    ///
    /// The `english_parser` module contains rules to parse words, numbers,
    /// punctuation, sentences and whole text in English. Every token rule
    /// returns the matched slice of the input (`&'input str`).
    pub grammar english_parser() for str {
        /// `word` matches a word: one or more ASCII letters, optionally followed by
        /// further letter groups joined by `-` or `'` (e.g. `well-known`, `don't`),
        /// and an optional trailing `'` (e.g. `dogs'`).
        pub rule word() -> &'input str
            = $(['a'..='z' | 'A'..='Z']+ (['-' | '\'']['a'..='z' | 'A'..='Z']+)* (['\''])?)

        /// `capital_word` matches a word that starts with an uppercase ASCII letter.
        ///
        /// The remainder follows the same shape as `word`. The letters after the
        /// initial capital are optional so that one-letter words such as `I` and
        /// `A` are accepted.
        pub rule capital_word() -> &'input str
            = $(['A'..='Z']['a'..='z' | 'A'..='Z']* (['-' | '\'']['a'..='z' | 'A'..='Z']+)* (['\''])?)

        /// `number` matches one or more ASCII digits, optionally followed by a
        /// `.` and one or more digits (e.g. `42`, `3.14`).
        pub rule number() -> &'input str
            = $(['0'..='9']+ ("." ['0'..='9']+)?)

        /// `end_punctuation` matches sentence-ending punctuation: `...`, `.`, `?` or `!`.
        ///
        /// The ellipsis alternative is tried first so that `...` is returned as a
        /// single token instead of stopping after its first `.`.
        pub rule end_punctuation() -> &'input str
            = $("..." / ['.' | '?' | '!'])

        /// `other_punctuation` matches non sentence-ending punctuation: `,`, `;`, `:` or `-`.
        pub rule other_punctuation() -> &'input str
            = $([',' | ';' | ':' | '-'])

        /// `whitespace` matches a single whitespace character: space, tab,
        /// line feed or carriage return.
        pub rule whitespace() -> &'input str
            = $([' ' | '\t' | '\n' | '\r'])

        /// `sentence` matches a capitalized word followed by any sequence of words,
        /// numbers, whitespace and other punctuation, ending with sentence-ending
        /// punctuation.
        ///
        /// Returns every matched token in input order: the leading capital word,
        /// the inner tokens (whitespace characters included), then the closing
        /// punctuation.
        pub rule sentence() -> Vec<&'input str>
            = capital_w:capital_word()
              sequence:((word() / number() / whitespace() / other_punctuation())*)
              end_punct:end_punctuation()
            {
                // Build the result in order in a pre-sized vector; this avoids
                // copying `sequence` and shifting it to insert at the front.
                let mut tokens = Vec::with_capacity(sequence.len() + 2);
                tokens.push(capital_w);
                tokens.extend(sequence);
                tokens.push(end_punct);
                tokens
            }

        /// `text` matches a series of sentences, separated by whitespace.
        ///
        /// This rule can be used to parse entire paragraphs or documents.
        /// Sentences may be separated by one or more whitespace characters
        /// (so double spaces, `\r\n` and blank lines are accepted), and leading
        /// or trailing whitespace around the text is allowed. The separating
        /// whitespace is not part of the result; each inner vector holds the
        /// tokens of one sentence. Empty input yields an empty vector.
        pub rule text() -> Vec<Vec<&'input str>>
            = whitespace()* sentences:(sentence() ** (whitespace()+)) whitespace()* {
                sentences
            }
    }
}