scirs2_text/pos_tagging/
patterns.rs1#![allow(missing_docs)]
7
8use lazy_static::lazy_static;
9use regex::Regex;
10
11lazy_static! {
12 pub static ref VERB_PATTERNS: Regex = Regex::new(r"(?i)(ing|ed|s)$").expect("Operation failed");
14 pub static ref NOUN_PATTERNS: Regex = Regex::new(r"(?i)(tion|sion|ness|ment|ship|hood|ity|cy|th|ing|er|or|ar|ist|ism|age|al|ance|ence|dom|tude|ure|ery|ary|ory|ly)$").expect("Operation failed");
15 pub static ref ADJ_PATTERNS: Regex = Regex::new(r"(?i)(ful|less|ous|ious|eous|ary|ory|ic|ical|al|able|ible|ive|ative|itive|ent|ant|ed|ing|er|est|ward)$").expect("Operation failed");
16 pub static ref ADV_PATTERNS: Regex = Regex::new(r"(?i)(ly|ward|wise|like)$").expect("Operation failed");
17
18 pub static ref PROPER_NOUN_PATTERN: Regex = Regex::new(r"^[A-Z][a-z]+$").expect("Operation failed");
20 pub static ref ALL_CAPS_PATTERN: Regex = Regex::new(r"^[A-Z]{2,}$").expect("Operation failed");
21}
22
23pub struct PatternMatcher;
25
26impl PatternMatcher {
27 pub fn predict_from_morphology(word: &str) -> Option<crate::stemming::PosTag> {
29 use crate::stemming::PosTag;
30
31 if ADJ_PATTERNS.is_match(word) {
33 return Some(PosTag::Adjective);
34 }
35
36 if ADV_PATTERNS.is_match(word) {
38 return Some(PosTag::Adverb);
39 }
40
41 if NOUN_PATTERNS.is_match(word) {
43 return Some(PosTag::Noun);
44 }
45
46 if VERB_PATTERNS.is_match(word) {
48 return Some(PosTag::Verb);
49 }
50
51 None
52 }
53
54 pub fn predict_from_capitalization(word: &str) -> Option<crate::stemming::PosTag> {
56 use crate::stemming::PosTag;
57
58 if PROPER_NOUN_PATTERN.is_match(word) {
59 Some(PosTag::Noun) } else if ALL_CAPS_PATTERN.is_match(word) {
61 Some(PosTag::Noun) } else {
63 None
64 }
65 }
66
67 pub fn matches_pattern(word: &str, pattern_type: &str) -> bool {
69 match pattern_type {
70 "verb" => VERB_PATTERNS.is_match(word),
71 "noun" => NOUN_PATTERNS.is_match(word),
72 "adjective" => ADJ_PATTERNS.is_match(word),
73 "adverb" => ADV_PATTERNS.is_match(word),
74 "proper_noun" => PROPER_NOUN_PATTERN.is_match(word),
75 "all_caps" => ALL_CAPS_PATTERN.is_match(word),
76 _ => false,
77 }
78 }
79}