1pub mod lancaster;
92pub mod rule_lemmatizer;
93
94use crate::error::{Result, TextError};
95use lazy_static::lazy_static;
96use regex::Regex;
97use std::collections::HashMap;
98
99pub use self::lancaster::LancasterStemmer;
101pub use self::rule_lemmatizer::{
102 LemmatizerConfig, PosTag, RuleCondition, RuleLemmatizer, RuleLemmatizerBuilder,
103};
104
/// Convenience constructor for a POS-aware lemmatizer with default settings.
///
/// Thin wrapper over [`crate::pos_tagging::PosAwareLemmatizer::new`].
// `dead_code`: presumably kept as a public convenience entry point even when
// unused inside this crate — TODO confirm with downstream callers.
#[allow(dead_code)]
pub fn create_pos_aware_lemmatizer() -> crate::pos_tagging::PosAwareLemmatizer {
    crate::pos_tagging::PosAwareLemmatizer::new()
}
131
/// Builds a POS-aware lemmatizer from explicit tagger and lemmatizer configs.
///
/// Thin wrapper over [`crate::pos_tagging::PosAwareLemmatizer::with_configs`];
/// see that constructor for the semantics of each config.
// `dead_code`: presumably kept as a public convenience entry point even when
// unused inside this crate — TODO confirm with downstream callers.
#[allow(dead_code)]
pub fn create_pos_aware_lemmatizer_with_config(
    posconfig: crate::pos_tagging::PosTaggerConfig,
    lemmaconfig: LemmatizerConfig,
) -> crate::pos_tagging::PosAwareLemmatizer {
    crate::pos_tagging::PosAwareLemmatizer::with_configs(posconfig, lemmaconfig)
}
140
141lazy_static! {
142 static ref VOWEL_SEQUENCE: Regex = Regex::new(r"[aeiouy]").expect("Operation failed");
144 static ref DOUBLE_CONSONANT: Regex = Regex::new(r"(bb|dd|ff|gg|mm|nn|pp|rr|tt)$").expect("Operation failed");
145}
146
/// Common interface for stemmers and lemmatizers: reduce a word to a base form.
pub trait Stemmer {
    /// Returns the stem (or lemma) of `word`.
    ///
    /// # Errors
    /// Implementations may fail, e.g. for an unsupported language.
    fn stem(&self, word: &str) -> Result<String>;

    /// Stems every word in `words`, short-circuiting on the first error.
    fn stem_batch(&self, words: &[&str]) -> Result<Vec<String>> {
        words.iter().map(|word| self.stem(word)).collect()
    }
}
157
/// Rule-based English stemmer implementing the Porter algorithm's
/// steps 1a-5b (a simplified variant; see the `impl` for the step methods).
/// Stateless unit struct, so it is cheap to create and `Clone`.
#[derive(Debug, Clone)]
pub struct PorterStemmer;
161
162impl PorterStemmer {
163 pub fn new() -> Self {
165 Self
166 }
167
168 fn ends_with_cvc(&self, word: &str) -> bool {
170 if word.len() < 3 {
171 return false;
172 }
173
174 let chars: Vec<char> = word.chars().collect();
175 let n = chars.len();
176
177 !self.is_vowel(&chars[n - 3])
179 && self.is_vowel(&chars[n - 2])
180 && !self.is_vowel(&chars[n - 1])
181 && !matches!(chars[n - 1], 'w' | 'x' | 'y')
182 }
183
184 fn is_vowel(&self, ch: &char) -> bool {
186 matches!(*ch, 'a' | 'e' | 'i' | 'o' | 'u' | 'y')
187 }
188
189 fn measure(&self, word: &str) -> usize {
191 let mut measure = 0;
192 let mut in_vowel_sequence = false;
193
194 for ch in word.chars() {
195 if self.is_vowel(&ch) {
196 in_vowel_sequence = true;
197 } else if in_vowel_sequence {
198 measure += 1;
199 in_vowel_sequence = false;
200 }
201 }
202
203 measure
204 }
205
206 fn step1a(&self, word: String) -> String {
208 if word.ends_with("sses") || word.ends_with("ies") {
209 word[..word.len() - 2].to_string()
210 } else if word.ends_with("s") && !word.ends_with("ss") && !word.ends_with("ness") {
211 word[..word.len() - 1].to_string()
212 } else {
213 word
214 }
215 }
216
217 fn step1b(&self, mut word: String) -> String {
219 let mut step1b_applied = false;
220
221 if word.ends_with("eed") {
222 let stem = &word[..word.len() - 3];
223 if self.measure(stem) > 0 {
224 word = format!("{stem}ee");
225 }
226 } else if word.ends_with("ed") {
227 let stem = &word[..word.len() - 2];
228 if VOWEL_SEQUENCE.is_match(stem) {
229 word = stem.to_string();
230 step1b_applied = true;
231 }
232 } else if word.ends_with("ing") {
233 let stem = &word[..word.len() - 3];
234 if VOWEL_SEQUENCE.is_match(stem) {
235 word = stem.to_string();
236 step1b_applied = true;
237 }
238 }
239
240 if step1b_applied {
241 if word.ends_with("at") || word.ends_with("bl") || word.ends_with("iz") {
242 word.push('e');
243 } else if DOUBLE_CONSONANT.is_match(&word)
244 && !word.ends_with("l")
245 && !word.ends_with("s")
246 && !word.ends_with("z")
247 {
248 word.pop();
249 } else if self.measure(&word) == 1 && self.ends_with_cvc(&word) {
250 word.push('e');
251 }
252 }
253
254 word
255 }
256
257 fn step1c(&self, word: String) -> String {
259 if word.ends_with("y") && word.len() > 1 {
260 let stem = &word[..word.len() - 1];
261 if VOWEL_SEQUENCE.is_match(stem) {
262 return format!("{stem}i");
263 }
264 }
265 word
266 }
267
268 fn step2(&self, word: String) -> String {
270 let suffix_map = vec![
271 ("ational", "ate"),
272 ("tional", "tion"),
273 ("enci", "ence"),
274 ("anci", "ance"),
275 ("izer", "ize"),
276 ("abli", "able"),
277 ("alli", "al"),
278 ("entli", "ent"),
279 ("eli", "e"),
280 ("ousli", "ous"),
281 ("ization", "ize"),
282 ("ation", "ate"),
283 ("ator", "ate"),
284 ("alism", "al"),
285 ("iveness", "ive"),
286 ("fulness", "ful"),
287 ("ousness", "ous"),
288 ("aliti", "al"),
289 ("iviti", "ive"),
290 ("biliti", "ble"),
291 ];
292
293 for (suffix, replacement) in suffix_map {
294 if word.ends_with(suffix) {
295 let stem = &word[..word.len() - suffix.len()];
296 if self.measure(stem) > 0 {
297 return format!("{stem}{replacement}");
298 }
299 }
300 }
301
302 word
303 }
304
305 fn step3(&self, word: String) -> String {
307 let suffix_map = vec![
308 ("icate", "ic"),
309 ("ative", ""),
310 ("alize", "al"),
311 ("iciti", "ic"),
312 ("ical", "ic"),
313 ("ful", ""),
314 ("ness", ""),
315 ];
316
317 for (suffix, replacement) in suffix_map {
318 if word.ends_with(suffix) {
319 let stem = &word[..word.len() - suffix.len()];
320 if self.measure(stem) > 0 {
321 return format!("{stem}{replacement}");
322 }
323 }
324 }
325
326 word
327 }
328
329 fn step4(&self, word: String) -> String {
331 let suffixes = vec![
332 "al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent",
333 "sion", "tion", "ou", "ism", "ate", "iti", "ous", "ive", "ize",
334 ];
335
336 for suffix in suffixes {
337 if word.ends_with(suffix) {
338 let stem = &word[..word.len() - suffix.len()];
339 if self.measure(stem) > 1 {
340 return stem.to_string();
341 }
342 }
343 }
344
345 word
346 }
347
348 fn step5a(&self, word: String) -> String {
350 if word.ends_with("e") {
351 let stem = &word[..word.len() - 1];
352 if self.measure(stem) > 1 || (self.measure(stem) == 1 && !self.ends_with_cvc(stem)) {
353 return stem.to_string();
354 }
355 }
356 word
357 }
358
359 fn step5b(&self, word: String) -> String {
361 if word.ends_with("ll") && self.measure(&word) > 1 {
362 return word[..word.len() - 1].to_string();
363 }
364 word
365 }
366}
367
/// `Default` mirrors [`PorterStemmer::new`]; the stemmer is stateless.
impl Default for PorterStemmer {
    fn default() -> Self {
        Self::new()
    }
}
373
374impl Stemmer for PorterStemmer {
375 fn stem(&self, word: &str) -> Result<String> {
376 if word.is_empty() {
377 return Ok(word.to_string());
378 }
379
380 let mut stemmed = word.to_lowercase();
381
382 stemmed = self.step1a(stemmed);
384 stemmed = self.step1b(stemmed);
385 stemmed = self.step1c(stemmed);
386 stemmed = self.step2(stemmed);
387 stemmed = self.step3(stemmed);
388 stemmed = self.step4(stemmed);
389 stemmed = self.step5a(stemmed);
390 stemmed = self.step5b(stemmed);
391
392 Ok(stemmed)
393 }
394}
395
/// Snowball-style stemmer. Only English is supported, and only the early
/// steps (possessive removal, plural endings) are implemented — see
/// `stem_english`.
#[derive(Debug, Clone)]
pub struct SnowballStemmer {
    // Normalized language name; always "english" after a successful `new`.
    language: String,
}
401
402impl SnowballStemmer {
403 pub fn new(language: &str) -> Result<Self> {
405 match language.to_lowercase().as_str() {
406 "english" | "en" => Ok(Self {
407 language: "english".to_string(),
408 }),
409 _ => Err(TextError::InvalidInput(format!(
410 "Unsupported language: {language}"
411 ))),
412 }
413 }
414
415 fn find_r1_r2(&self, word: &str) -> (usize, usize) {
417 let mut r1 = word.len();
418 let mut r2 = word.len();
419
420 let chars: Vec<char> = word.chars().collect();
422 let mut found_vowel = false;
423
424 for (i, ch) in chars.iter().enumerate() {
425 if self.is_vowel(ch) {
426 found_vowel = true;
427 } else if found_vowel {
428 r1 = i + 1;
429 break;
430 }
431 }
432
433 if r1 < word.len() {
435 found_vowel = false;
436 for (i, ch) in chars[r1..].iter().enumerate() {
437 if self.is_vowel(ch) {
438 found_vowel = true;
439 } else if found_vowel {
440 r2 = r1 + i + 1;
441 break;
442 }
443 }
444 }
445
446 (r1, r2)
447 }
448
449 fn is_vowel(&self, ch: &char) -> bool {
450 matches!(*ch, 'a' | 'e' | 'i' | 'o' | 'u' | 'y')
451 }
452
453 fn stem_english(&self, word: &str) -> String {
455 if word.len() <= 2 {
456 return word.to_string();
457 }
458
459 let mut stemmed = word.to_lowercase();
460 let _r1_r2 = self.find_r1_r2(&stemmed);
461
462 if stemmed.ends_with("'s'") {
464 stemmed = stemmed[..stemmed.len() - 3].to_string();
465 } else if stemmed.ends_with("'s") {
466 stemmed = stemmed[..stemmed.len() - 2].to_string();
467 } else if stemmed.ends_with("'") {
468 stemmed = stemmed[..stemmed.len() - 1].to_string();
469 }
470
471 if stemmed.ends_with("sses") {
473 let truncated = &stemmed[..stemmed.len() - 4];
474 stemmed = format!("{truncated}ss");
475 } else if stemmed.ends_with("ied") || stemmed.ends_with("ies") {
476 if stemmed.len() > 4 {
477 let truncated = &stemmed[..stemmed.len() - 3];
478 stemmed = format!("{truncated}i");
479 } else {
480 let truncated = &stemmed[..stemmed.len() - 3];
481 stemmed = format!("{truncated}ie");
482 }
483 } else if stemmed.ends_with("s") && !stemmed.ends_with("us") && !stemmed.ends_with("ss") {
484 let stem = &stemmed[..stemmed.len() - 1];
486 if VOWEL_SEQUENCE.is_match(stem) {
487 stemmed = stem.to_string();
488 }
489 }
490
491 stemmed
495 }
496}
497
498impl Stemmer for SnowballStemmer {
499 fn stem(&self, word: &str) -> Result<String> {
500 match self.language.as_str() {
501 "english" => Ok(self.stem_english(word)),
502 _ => Err(TextError::InvalidInput(format!(
503 "Unsupported language: {}",
504 self.language
505 ))),
506 }
507 }
508}
509
/// Dictionary-based lemmatizer: exact-match lookup of lowercased words.
#[derive(Debug, Clone)]
pub struct SimpleLemmatizer {
    // Lowercased word -> lemma mappings.
    lemma_dict: HashMap<String, String>,
}
515
516impl SimpleLemmatizer {
517 pub fn new() -> Self {
519 let mut lemma_dict = HashMap::new();
520
521 lemma_dict.insert("am".to_string(), "be".to_string());
525 lemma_dict.insert("are".to_string(), "be".to_string());
526 lemma_dict.insert("is".to_string(), "be".to_string());
527 lemma_dict.insert("was".to_string(), "be".to_string());
528 lemma_dict.insert("were".to_string(), "be".to_string());
529 lemma_dict.insert("been".to_string(), "be".to_string());
530 lemma_dict.insert("being".to_string(), "be".to_string());
531
532 lemma_dict.insert("have".to_string(), "have".to_string());
533 lemma_dict.insert("has".to_string(), "have".to_string());
534 lemma_dict.insert("had".to_string(), "have".to_string());
535 lemma_dict.insert("having".to_string(), "have".to_string());
536
537 lemma_dict.insert("does".to_string(), "do".to_string());
538 lemma_dict.insert("did".to_string(), "do".to_string());
539 lemma_dict.insert("doing".to_string(), "do".to_string());
540
541 lemma_dict.insert("better".to_string(), "good".to_string());
542 lemma_dict.insert("best".to_string(), "good".to_string());
543 lemma_dict.insert("worse".to_string(), "bad".to_string());
544 lemma_dict.insert("worst".to_string(), "bad".to_string());
545
546 lemma_dict.insert("running".to_string(), "run".to_string());
547 lemma_dict.insert("ran".to_string(), "run".to_string());
548 lemma_dict.insert("runs".to_string(), "run".to_string());
549
550 Self { lemma_dict }
551 }
552
553 pub fn from_dict_file(path: &str) -> Result<Self> {
555 Ok(Self::new())
557 }
558
559 pub fn add_lemma(&mut self, word: &str, lemma: &str) {
561 self.lemma_dict.insert(word.to_string(), lemma.to_string());
562 }
563}
564
/// `Default` mirrors [`SimpleLemmatizer::new`] (built-in dictionary).
impl Default for SimpleLemmatizer {
    fn default() -> Self {
        Self::new()
    }
}
570
571impl Stemmer for SimpleLemmatizer {
572 fn stem(&self, word: &str) -> Result<String> {
573 let lower = word.to_lowercase();
574 Ok(self.lemma_dict.get(&lower).unwrap_or(&lower).to_string())
575 }
576}
577
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_porter_stemmer() {
        let stemmer = PorterStemmer::new();

        // (input, expected) pairs pin the simplified Porter behavior.
        let test_cases = [
            ("running", "run"),
            ("ran", "ran"),
            ("easily", "easili"),
            ("fishing", "fish"),
            ("fished", "fish"),
            ("productive", "product"),
            ("production", "produc"),
            ("sensational", "sensat"),
        ];

        for (word, expected) in test_cases {
            let stemmed = stemmer.stem(word).expect("Operation failed");
            assert_eq!(stemmed, expected, "Failed for word: {word}");
        }
    }

    #[test]
    fn test_snowball_stemmer() {
        let stemmer = SnowballStemmer::new("english").expect("Operation failed");

        // Only steps 0/1a are implemented, so "running"/"happiness" pass
        // through unchanged.
        let test_cases = [
            ("cats", "cat"),
            ("running", "running"),
            ("flies", "fli"),
            ("happiness", "happiness"),
        ];

        for (word, expected) in test_cases {
            let stemmed = stemmer.stem(word).expect("Operation failed");
            assert_eq!(stemmed, expected, "Failed for word: {word}");
        }
    }

    #[test]
    fn test_simple_lemmatizer() {
        let lemmatizer = SimpleLemmatizer::new();

        // Dictionary hits plus an unknown word that passes through.
        let test_cases = [
            ("am", "be"),
            ("are", "be"),
            ("was", "be"),
            ("better", "good"),
            ("running", "run"),
            ("unknown", "unknown"),
        ];

        for (word, expected) in test_cases {
            let lemma = lemmatizer.stem(word).expect("Operation failed");
            assert_eq!(lemma, expected, "Failed for word: {word}");
        }
    }

    #[test]
    fn test_rule_lemmatizer() {
        let lemmatizer = RuleLemmatizer::new();

        // Regular forms, with an explicit POS tag per word.
        assert_eq!(lemmatizer.lemmatize("running", Some(PosTag::Verb)), "run");
        assert_eq!(lemmatizer.lemmatize("cats", Some(PosTag::Noun)), "cat");
        assert_eq!(
            lemmatizer.lemmatize("better", Some(PosTag::Adjective)),
            "good"
        );
        assert_eq!(
            lemmatizer.lemmatize("quickly", Some(PosTag::Adverb)),
            "quick"
        );

        // Irregular forms.
        assert_eq!(lemmatizer.lemmatize("went", Some(PosTag::Verb)), "go");
        assert_eq!(
            lemmatizer.lemmatize("children", Some(PosTag::Noun)),
            "child"
        );
        assert_eq!(lemmatizer.lemmatize("feet", Some(PosTag::Noun)), "foot");

        // Without a POS tag the lemmatizer should still resolve these.
        assert_eq!(lemmatizer.lemmatize("running", None), "run");
        assert_eq!(lemmatizer.lemmatize("went", None), "go");
    }

    #[test]
    fn test_pos_aware_lemmatizer_integration() {
        let pos_aware = create_pos_aware_lemmatizer();
        let rule_only = RuleLemmatizer::new();

        // Words whose lemma can depend on the inferred POS.
        let test_cases = ["flies", "running", "better", "works", "watches"];

        for word in test_cases {
            let pos_aware_result = pos_aware.stem(word).expect("Operation failed");
            let rule_only_result = rule_only.stem(word).expect("Operation failed");

            println!(
                "Word: '{word}' -> POS-aware: '{pos_aware_result}', Rule-only: '{rule_only_result}'"
            );

            // Both paths must always produce some lemma.
            assert!(!pos_aware_result.is_empty());
            assert!(!rule_only_result.is_empty());
        }
    }

    #[test]
    fn test_pos_aware_lemmatizer_accuracy() {
        let pos_aware = create_pos_aware_lemmatizer();

        // Verb forms.
        assert_eq!(pos_aware.stem("running").expect("Operation failed"), "run");
        assert_eq!(pos_aware.stem("walked").expect("Operation failed"), "walk");
        assert_eq!(pos_aware.stem("plays").expect("Operation failed"), "play");
        assert_eq!(pos_aware.stem("played").expect("Operation failed"), "play");
        assert_eq!(
            pos_aware.stem("swimming").expect("Operation failed"),
            "swim"
        );

        // Noun forms; "happiness" is expected to survive untouched.
        assert_eq!(pos_aware.stem("cats").expect("Operation failed"), "cat");
        assert_eq!(pos_aware.stem("dogs").expect("Operation failed"), "dog");
        assert_eq!(
            pos_aware.stem("happiness").expect("Operation failed"),
            "happiness"
        );
    }

    #[test]
    fn test_pos_aware_lemmatizer_custom_config() {
        let pos_config = crate::pos_tagging::PosTaggerConfig {
            use_context: false,
            smoothing_factor: 0.01,
            use_morphology: true,
            use_capitalization: true,
        };

        let lemma_config = LemmatizerConfig {
            use_pos_tagging: true,
            default_pos: PosTag::Verb,
            apply_case_restoration: false,
            check_vowels: true,
        };

        let pos_aware = create_pos_aware_lemmatizer_with_config(pos_config, lemma_config);

        // Case restoration is off, so the capitalized input lowercases.
        let result = pos_aware.stem("Running").expect("Operation failed");
        assert_eq!(result, "run");
    }

    #[test]
    fn test_stemmers_and_lemmatizers_comparison() {
        let porter = PorterStemmer::new();
        let snowball = SnowballStemmer::new("english").expect("Operation failed");
        let lancaster = LancasterStemmer::new();
        let simple_lemmatizer = SimpleLemmatizer::new();
        let rule_lemmatizer = RuleLemmatizer::new();

        let test_words = [
            "running",
            "cats",
            "better",
            "went",
            "children",
            "feet",
            "universities",
        ];

        // Print all five outputs side by side for manual inspection.
        for word in test_words {
            println!(
                "Word: '{}'\nPorter: '{}'\nSnowball: '{}'\nLancaster: '{}'\nSimple: '{}'\nRule: '{}'",
                word,
                porter.stem(word).expect("Operation failed"),
                snowball.stem(word).expect("Operation failed"),
                lancaster.stem(word).expect("Operation failed"),
                simple_lemmatizer.stem(word).expect("Operation failed"),
                rule_lemmatizer.stem(word).expect("Operation failed")
            );
        }

        // Both approaches handle a regular form.
        assert_eq!(porter.stem("running").expect("Operation failed"), "run");
        assert_eq!(
            rule_lemmatizer.stem("running").expect("Operation failed"),
            "run"
        );

        // Only the rule lemmatizer resolves irregular forms.
        assert_eq!(porter.stem("went").expect("Operation failed"), "went");
        assert_eq!(
            rule_lemmatizer.stem("went").expect("Operation failed"),
            "go"
        );
        assert_eq!(porter.stem("feet").expect("Operation failed"), "feet");
        assert_eq!(
            rule_lemmatizer.stem("feet").expect("Operation failed"),
            "foot"
        );
    }
}