Skip to main content

scirs2_text/pos_tagging/
patterns.rs

1//! Pattern matching for POS tagging
2//!
3//! This module contains regex patterns and morphological rules
4//! used for POS tag disambiguation.
5
6#![allow(missing_docs)]
7
8use lazy_static::lazy_static;
9use regex::Regex;
10
11lazy_static! {
12    // Common word patterns for POS disambiguation
13    pub static ref VERB_PATTERNS: Regex = Regex::new(r"(?i)(ing|ed|s)$").expect("Operation failed");
14    pub static ref NOUN_PATTERNS: Regex = Regex::new(r"(?i)(tion|sion|ness|ment|ship|hood|ity|cy|th|ing|er|or|ar|ist|ism|age|al|ance|ence|dom|tude|ure|ery|ary|ory|ly)$").expect("Operation failed");
15    pub static ref ADJ_PATTERNS: Regex = Regex::new(r"(?i)(ful|less|ous|ious|eous|ary|ory|ic|ical|al|able|ible|ive|ative|itive|ent|ant|ed|ing|er|est|ward)$").expect("Operation failed");
16    pub static ref ADV_PATTERNS: Regex = Regex::new(r"(?i)(ly|ward|wise|like)$").expect("Operation failed");
17
18    // Capitalization patterns
19    pub static ref PROPER_NOUN_PATTERN: Regex = Regex::new(r"^[A-Z][a-z]+$").expect("Operation failed");
20    pub static ref ALL_CAPS_PATTERN: Regex = Regex::new(r"^[A-Z]{2,}$").expect("Operation failed");
21}
22
/// Pattern-based POS tag predictor.
///
/// This is a stateless unit type: all of its functionality is exposed as
/// associated functions, so no instance needs to be constructed to use it.
/// The common traits are derived so the type can still be stored, copied
/// and printed like any other value.
#[derive(Debug, Clone, Copy, Default)]
pub struct PatternMatcher;
25
26impl PatternMatcher {
27    /// Predict POS tag based on morphological patterns
28    pub fn predict_from_morphology(word: &str) -> Option<crate::stemming::PosTag> {
29        use crate::stemming::PosTag;
30
31        // Check for adjective patterns first (more specific)
32        if ADJ_PATTERNS.is_match(word) {
33            return Some(PosTag::Adjective);
34        }
35
36        // Check for adverb patterns
37        if ADV_PATTERNS.is_match(word) {
38            return Some(PosTag::Adverb);
39        }
40
41        // Check for noun patterns
42        if NOUN_PATTERNS.is_match(word) {
43            return Some(PosTag::Noun);
44        }
45
46        // Check for verb patterns (less specific, check last)
47        if VERB_PATTERNS.is_match(word) {
48            return Some(PosTag::Verb);
49        }
50
51        None
52    }
53
54    /// Predict POS tag based on capitalization patterns
55    pub fn predict_from_capitalization(word: &str) -> Option<crate::stemming::PosTag> {
56        use crate::stemming::PosTag;
57
58        if PROPER_NOUN_PATTERN.is_match(word) {
59            Some(PosTag::Noun) // Proper nouns
60        } else if ALL_CAPS_PATTERN.is_match(word) {
61            Some(PosTag::Noun) // Acronyms are typically nouns
62        } else {
63            None
64        }
65    }
66
67    /// Check if word matches a specific pattern type
68    pub fn matches_pattern(word: &str, pattern_type: &str) -> bool {
69        match pattern_type {
70            "verb" => VERB_PATTERNS.is_match(word),
71            "noun" => NOUN_PATTERNS.is_match(word),
72            "adjective" => ADJ_PATTERNS.is_match(word),
73            "adverb" => ADV_PATTERNS.is_match(word),
74            "proper_noun" => PROPER_NOUN_PATTERN.is_match(word),
75            "all_caps" => ALL_CAPS_PATTERN.is_match(word),
76            _ => false,
77        }
78    }
79}