english_language_parser/
lib.rs

1peg::parser! {
2    /// Grammar for a simple English parser
3    ///
4    /// `english_parser` module contains rules to parse words, numbers,
5    /// punctuation, sentences and whole tetx in English.
6    pub grammar english_parser() for str {
7        /// `word` matches a word, which is a sequence of alphabetic characters with optinal symbols - and '
8        pub rule word() -> &'input str
9            = $(['a'..='z' | 'A'..='Z']+ (['-' | '\'']['a'..='z' | 'A'..='Z']+)* (['\''])?)
10
11        /// `capital_word` matches a word that starts with a capital letter.
12        pub rule capital_word() -> &'input str
13            = $(['A'..='Z']['a'..='z' | 'A'..='Z']+ (['-' | '\'']['a'..='z' | 'A'..='Z']+)* (['\''])?)
14
15        /// `number` matches a sequence of numeric characters.
16        pub rule number() -> &'input str
17            = $(['0'..='9']+ ("." ['0'..='9']+)?)
18
19        /// `date` matches dates in the format dd/mm/yyyy.
20        pub rule date() -> &'input str
21            = $(day:['0'..='3']? ['0'..='9'] "/" month:['0'..='1']? ['0'..='9'] "/" year:['1'..='9']['0'..='9']['0'..='9']['0'..='9'])
22
23        /// `hour` matches times in the format hh:mm (am|pm).
24        pub rule hour() -> &'input str
25            = $(hour:['0'..='1']? ['0'..='9'] ":" minute:['0'..='5']['0'..='9'] " " period:("am" / "pm"))
26
27        /// `end_punctuation` matches sentence-ending punctuation.
28        pub rule end_punctuation() -> &'input str
29            = $("..." / ['.' | '?' | '!'])
30
31        // `other_punctuation` matches non sentence-ending punctuation.
32        pub rule other_punctuation() -> &'input str
33            = $([',' | ';' | ':' | '-'])
34
35        /// `whitespace` matches any whitespace character.
36        pub rule whitespace() -> &'input str
37            = $([' ' | '\t' | '\n' | '\r'])
38
39        /// `sentence` matches a sequence of words, numbers and other punctuation ending with sentence-ending punctuation.
40        pub rule sentence() -> Vec<&'input str>
41            = capital_w:capital_word() sequence:((word() / date() / hour() / number() / whitespace() / other_punctuation())*) end_punct:end_punctuation() {
42                let mut sequence_vec = sequence.to_vec();
43                sequence_vec.insert(0, capital_w);
44                sequence_vec.push(end_punct);
45                sequence_vec
46            }
47
48        /// `text` matches a series of sentences, separated by whitespace.
49        ///
50        /// This rule can be used to parse entire paragraphs or documents.
51        pub rule text() -> Vec<Vec<&'input str>>
52            = sentences:(sentence() ** whitespace()) {
53                sentences
54            }
55    }
56}