1pub mod lancaster;
92pub mod rule_lemmatizer;
93
94use crate::error::{Result, TextError};
95use lazy_static::lazy_static;
96use regex::Regex;
97use std::collections::HashMap;
98
99pub use self::lancaster::LancasterStemmer;
101pub use self::rule_lemmatizer::{
102 LemmatizerConfig, PosTag, RuleCondition, RuleLemmatizer, RuleLemmatizerBuilder,
103};
104
/// Convenience constructor for a POS-aware lemmatizer with default settings.
///
/// Thin wrapper over [`crate::pos_tagging::PosAwareLemmatizer::new`].
// `dead_code`: presumably kept as a public convenience entry point even when
// unused inside this crate — TODO confirm with downstream callers.
#[allow(dead_code)]
pub fn create_pos_aware_lemmatizer() -> crate::pos_tagging::PosAwareLemmatizer {
    crate::pos_tagging::PosAwareLemmatizer::new()
}
131
/// Builds a POS-aware lemmatizer from explicit tagger and lemmatizer configs.
///
/// Thin wrapper over [`crate::pos_tagging::PosAwareLemmatizer::with_configs`];
/// see that constructor for the semantics of each config.
// `dead_code`: presumably kept as a public convenience entry point even when
// unused inside this crate — TODO confirm with downstream callers.
#[allow(dead_code)]
pub fn create_pos_aware_lemmatizer_with_config(
    posconfig: crate::pos_tagging::PosTaggerConfig,
    lemmaconfig: LemmatizerConfig,
) -> crate::pos_tagging::PosAwareLemmatizer {
    crate::pos_tagging::PosAwareLemmatizer::with_configs(posconfig, lemmaconfig)
}
140
141lazy_static! {
142 static ref VOWEL_SEQUENCE: Regex = Regex::new(r"[aeiouy]").expect("Operation failed");
144 static ref DOUBLE_CONSONANT: Regex = Regex::new(r"(bb|dd|ff|gg|mm|nn|pp|rr|tt)$").expect("Operation failed");
145}
146
/// Common interface for stemmers and lemmatizers: reduce a word to a base form.
pub trait Stemmer {
    /// Returns the stem (or lemma) of `word`.
    ///
    /// # Errors
    /// Implementations may fail, e.g. for an unsupported language.
    fn stem(&self, word: &str) -> Result<String>;

    /// Stems every word in `words`, short-circuiting on the first error.
    fn stem_batch(&self, words: &[&str]) -> Result<Vec<String>> {
        words.iter().map(|word| self.stem(word)).collect()
    }
}
157
/// Rule-based English stemmer implementing the Porter algorithm's
/// steps 1a-5b (a simplified variant; see the `impl` for the step methods).
/// Stateless unit struct, so it is cheap to create and `Clone`.
#[derive(Debug, Clone)]
pub struct PorterStemmer;
161
162impl PorterStemmer {
163 pub fn new() -> Self {
165 Self
166 }
167
168 fn ends_with_cvc(&self, word: &str) -> bool {
170 if word.len() < 3 {
171 return false;
172 }
173
174 let chars: Vec<char> = word.chars().collect();
175 let n = chars.len();
176
177 !self.is_vowel(&chars[n - 3])
179 && self.is_vowel(&chars[n - 2])
180 && !self.is_vowel(&chars[n - 1])
181 && !matches!(chars[n - 1], 'w' | 'x' | 'y')
182 }
183
184 fn is_vowel(&self, ch: &char) -> bool {
186 matches!(*ch, 'a' | 'e' | 'i' | 'o' | 'u' | 'y')
187 }
188
189 fn measure(&self, word: &str) -> usize {
191 let mut measure = 0;
192 let mut in_vowel_sequence = false;
193
194 for ch in word.chars() {
195 if self.is_vowel(&ch) {
196 in_vowel_sequence = true;
197 } else if in_vowel_sequence {
198 measure += 1;
199 in_vowel_sequence = false;
200 }
201 }
202
203 measure
204 }
205
206 fn step1a(&self, word: String) -> String {
208 if word.ends_with("sses") || word.ends_with("ies") {
209 word[..word.len() - 2].to_string()
210 } else if word.ends_with("s") && !word.ends_with("ss") && !word.ends_with("ness") {
211 word[..word.len() - 1].to_string()
212 } else {
213 word
214 }
215 }
216
217 fn step1b(&self, mut word: String) -> String {
219 let mut step1b_applied = false;
220
221 if word.ends_with("eed") {
222 let stem = &word[..word.len() - 3];
223 if self.measure(stem) > 0 {
224 word = format!("{stem}ee");
225 }
226 } else if word.ends_with("ed") {
227 let stem = &word[..word.len() - 2];
228 if VOWEL_SEQUENCE.is_match(stem) {
229 word = stem.to_string();
230 step1b_applied = true;
231 }
232 } else if word.ends_with("ing") {
233 let stem = &word[..word.len() - 3];
234 if VOWEL_SEQUENCE.is_match(stem) {
235 word = stem.to_string();
236 step1b_applied = true;
237 }
238 }
239
240 if step1b_applied {
241 if word.ends_with("at") || word.ends_with("bl") || word.ends_with("iz") {
242 word.push('e');
243 } else if DOUBLE_CONSONANT.is_match(&word)
244 && !word.ends_with("l")
245 && !word.ends_with("s")
246 && !word.ends_with("z")
247 {
248 word.pop();
249 } else if self.measure(&word) == 1 && self.ends_with_cvc(&word) {
250 word.push('e');
251 }
252 }
253
254 word
255 }
256
257 fn step1c(&self, word: String) -> String {
259 if word.ends_with("y") && word.len() > 1 {
260 let stem = &word[..word.len() - 1];
261 if VOWEL_SEQUENCE.is_match(stem) {
262 return format!("{stem}i");
263 }
264 }
265 word
266 }
267
268 fn step2(&self, word: String) -> String {
270 let suffix_map = vec![
271 ("ational", "ate"),
272 ("tional", "tion"),
273 ("enci", "ence"),
274 ("anci", "ance"),
275 ("izer", "ize"),
276 ("abli", "able"),
277 ("alli", "al"),
278 ("entli", "ent"),
279 ("eli", "e"),
280 ("ousli", "ous"),
281 ("ization", "ize"),
282 ("ation", "ate"),
283 ("ator", "ate"),
284 ("alism", "al"),
285 ("iveness", "ive"),
286 ("fulness", "ful"),
287 ("ousness", "ous"),
288 ("aliti", "al"),
289 ("iviti", "ive"),
290 ("biliti", "ble"),
291 ];
292
293 for (suffix, replacement) in suffix_map {
294 if word.ends_with(suffix) {
295 let stem = &word[..word.len() - suffix.len()];
296 if self.measure(stem) > 0 {
297 return format!("{stem}{replacement}");
298 }
299 }
300 }
301
302 word
303 }
304
305 fn step3(&self, word: String) -> String {
307 let suffix_map = vec![
308 ("icate", "ic"),
309 ("ative", ""),
310 ("alize", "al"),
311 ("iciti", "ic"),
312 ("ical", "ic"),
313 ("ful", ""),
314 ("ness", ""),
315 ];
316
317 for (suffix, replacement) in suffix_map {
318 if word.ends_with(suffix) {
319 let stem = &word[..word.len() - suffix.len()];
320 if self.measure(stem) > 0 {
321 return format!("{stem}{replacement}");
322 }
323 }
324 }
325
326 word
327 }
328
329 fn step4(&self, word: String) -> String {
331 let suffixes = vec![
332 "al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent",
333 "sion", "tion", "ou", "ism", "ate", "iti", "ous", "ive", "ize",
334 ];
335
336 for suffix in suffixes {
337 if word.ends_with(suffix) {
338 let stem = &word[..word.len() - suffix.len()];
339 if self.measure(stem) > 1 {
340 return stem.to_string();
341 }
342 }
343 }
344
345 word
346 }
347
348 fn step5a(&self, word: String) -> String {
350 if word.ends_with("e") {
351 let stem = &word[..word.len() - 1];
352 if self.measure(stem) > 1 || (self.measure(stem) == 1 && !self.ends_with_cvc(stem)) {
353 return stem.to_string();
354 }
355 }
356 word
357 }
358
359 fn step5b(&self, word: String) -> String {
361 if word.ends_with("ll") && self.measure(&word) > 1 {
362 return word[..word.len() - 1].to_string();
363 }
364 word
365 }
366}
367
/// `Default` mirrors [`PorterStemmer::new`]; the stemmer is stateless.
impl Default for PorterStemmer {
    fn default() -> Self {
        Self::new()
    }
}
373
374impl Stemmer for PorterStemmer {
375 fn stem(&self, word: &str) -> Result<String> {
376 if word.is_empty() {
377 return Ok(word.to_string());
378 }
379
380 let mut stemmed = word.to_lowercase();
381
382 stemmed = self.step1a(stemmed);
384 stemmed = self.step1b(stemmed);
385 stemmed = self.step1c(stemmed);
386 stemmed = self.step2(stemmed);
387 stemmed = self.step3(stemmed);
388 stemmed = self.step4(stemmed);
389 stemmed = self.step5a(stemmed);
390 stemmed = self.step5b(stemmed);
391
392 Ok(stemmed)
393 }
394}
395
/// Snowball-style stemmer. Only English is supported, and only the early
/// steps (possessive removal, plural endings) are implemented — see
/// `stem_english`.
#[derive(Debug, Clone)]
pub struct SnowballStemmer {
    // Normalized language name; always "english" after a successful `new`.
    language: String,
}
401
402impl SnowballStemmer {
403 pub fn new(language: &str) -> Result<Self> {
405 match language.to_lowercase().as_str() {
406 "english" | "en" => Ok(Self {
407 language: "english".to_string(),
408 }),
409 _ => Err(TextError::InvalidInput(format!(
410 "Unsupported language: {language}"
411 ))),
412 }
413 }
414
415 fn find_r1_r2(&self, word: &str) -> (usize, usize) {
417 let mut r1 = word.len();
418 let mut r2 = word.len();
419
420 let chars: Vec<char> = word.chars().collect();
422 let mut found_vowel = false;
423
424 for (i, ch) in chars.iter().enumerate() {
425 if self.is_vowel(ch) {
426 found_vowel = true;
427 } else if found_vowel {
428 r1 = i + 1;
429 break;
430 }
431 }
432
433 if r1 < word.len() {
435 found_vowel = false;
436 for (i, ch) in chars[r1..].iter().enumerate() {
437 if self.is_vowel(ch) {
438 found_vowel = true;
439 } else if found_vowel {
440 r2 = r1 + i + 1;
441 break;
442 }
443 }
444 }
445
446 (r1, r2)
447 }
448
449 fn is_vowel(&self, ch: &char) -> bool {
450 matches!(*ch, 'a' | 'e' | 'i' | 'o' | 'u' | 'y')
451 }
452
453 fn stem_english(&self, word: &str) -> String {
455 if word.len() <= 2 {
456 return word.to_string();
457 }
458
459 let mut stemmed = word.to_lowercase();
460 let _r1_r2 = self.find_r1_r2(&stemmed);
461
462 if stemmed.ends_with("'s'") {
464 stemmed = stemmed[..stemmed.len() - 3].to_string();
465 } else if stemmed.ends_with("'s") {
466 stemmed = stemmed[..stemmed.len() - 2].to_string();
467 } else if stemmed.ends_with("'") {
468 stemmed = stemmed[..stemmed.len() - 1].to_string();
469 }
470
471 if stemmed.ends_with("sses") {
473 let truncated = &stemmed[..stemmed.len() - 4];
474 stemmed = format!("{truncated}ss");
475 } else if stemmed.ends_with("ied") || stemmed.ends_with("ies") {
476 if stemmed.len() > 4 {
477 let truncated = &stemmed[..stemmed.len() - 3];
478 stemmed = format!("{truncated}i");
479 } else {
480 let truncated = &stemmed[..stemmed.len() - 3];
481 stemmed = format!("{truncated}ie");
482 }
483 } else if stemmed.ends_with("s") && !stemmed.ends_with("us") && !stemmed.ends_with("ss") {
484 let stem = &stemmed[..stemmed.len() - 1];
486 if VOWEL_SEQUENCE.is_match(stem) {
487 stemmed = stem.to_string();
488 }
489 }
490
491 stemmed
495 }
496}
497
498impl Stemmer for SnowballStemmer {
499 fn stem(&self, word: &str) -> Result<String> {
500 match self.language.as_str() {
501 "english" => Ok(self.stem_english(word)),
502 _ => Err(TextError::InvalidInput(format!(
503 "Unsupported language: {}",
504 self.language
505 ))),
506 }
507 }
508}
509
/// Dictionary-based lemmatizer: exact-match lookup of lowercased words.
#[derive(Debug, Clone)]
pub struct SimpleLemmatizer {
    // Lowercased word -> lemma mappings.
    lemma_dict: HashMap<String, String>,
}
515
516impl SimpleLemmatizer {
517 pub fn new() -> Self {
519 let mut lemma_dict = HashMap::new();
520
521 lemma_dict.insert("am".to_string(), "be".to_string());
525 lemma_dict.insert("are".to_string(), "be".to_string());
526 lemma_dict.insert("is".to_string(), "be".to_string());
527 lemma_dict.insert("was".to_string(), "be".to_string());
528 lemma_dict.insert("were".to_string(), "be".to_string());
529 lemma_dict.insert("been".to_string(), "be".to_string());
530 lemma_dict.insert("being".to_string(), "be".to_string());
531
532 lemma_dict.insert("have".to_string(), "have".to_string());
533 lemma_dict.insert("has".to_string(), "have".to_string());
534 lemma_dict.insert("had".to_string(), "have".to_string());
535 lemma_dict.insert("having".to_string(), "have".to_string());
536
537 lemma_dict.insert("does".to_string(), "do".to_string());
538 lemma_dict.insert("did".to_string(), "do".to_string());
539 lemma_dict.insert("doing".to_string(), "do".to_string());
540
541 lemma_dict.insert("better".to_string(), "good".to_string());
542 lemma_dict.insert("best".to_string(), "good".to_string());
543 lemma_dict.insert("worse".to_string(), "bad".to_string());
544 lemma_dict.insert("worst".to_string(), "bad".to_string());
545
546 lemma_dict.insert("running".to_string(), "run".to_string());
547 lemma_dict.insert("ran".to_string(), "run".to_string());
548 lemma_dict.insert("runs".to_string(), "run".to_string());
549
550 Self { lemma_dict }
551 }
552
553 pub fn from_dict_file(path: &str) -> Result<Self> {
555 Ok(Self::new())
557 }
558
559 pub fn add_lemma(&mut self, word: &str, lemma: &str) {
561 self.lemma_dict.insert(word.to_string(), lemma.to_string());
562 }
563}
564
/// `Default` mirrors [`SimpleLemmatizer::new`] (built-in dictionary).
impl Default for SimpleLemmatizer {
    fn default() -> Self {
        Self::new()
    }
}
570
571impl Stemmer for SimpleLemmatizer {
572 fn stem(&self, word: &str) -> Result<String> {
573 let lower = word.to_lowercase();
574 Ok(self.lemma_dict.get(&lower).unwrap_or(&lower).to_string())
575 }
576}
577
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_porter_stemmer() {
        let stemmer = PorterStemmer::new();

        // (input, expected) pairs pin the simplified Porter behavior.
        let test_cases = [
            ("running", "run"),
            ("ran", "ran"),
            ("easily", "easili"),
            ("fishing", "fish"),
            ("fished", "fish"),
            ("productive", "product"),
            ("production", "produc"),
            ("sensational", "sensat"),
        ];

        for (word, expected) in test_cases {
            let stemmed = stemmer.stem(word).expect("Operation failed");
            assert_eq!(stemmed, expected, "Failed for word: {word}");
        }
    }

    #[test]
    fn test_snowball_stemmer() {
        let stemmer = SnowballStemmer::new("english").expect("Operation failed");

        // Only steps 0/1a are implemented, so "running"/"happiness" pass
        // through unchanged.
        let test_cases = [
            ("cats", "cat"),
            ("running", "running"),
            ("flies", "fli"),
            ("happiness", "happiness"),
        ];

        for (word, expected) in test_cases {
            let stemmed = stemmer.stem(word).expect("Operation failed");
            assert_eq!(stemmed, expected, "Failed for word: {word}");
        }
    }

    #[test]
    fn test_simple_lemmatizer() {
        let lemmatizer = SimpleLemmatizer::new();

        // Dictionary hits plus an unknown word that passes through.
        let test_cases = [
            ("am", "be"),
            ("are", "be"),
            ("was", "be"),
            ("better", "good"),
            ("running", "run"),
            ("unknown", "unknown"),
        ];

        for (word, expected) in test_cases {
            let lemma = lemmatizer.stem(word).expect("Operation failed");
            assert_eq!(lemma, expected, "Failed for word: {word}");
        }
    }

    #[test]
    fn test_rule_lemmatizer() {
        let lemmatizer = RuleLemmatizer::new();

        // Regular forms, with an explicit POS tag per word.
        assert_eq!(lemmatizer.lemmatize("running", Some(PosTag::Verb)), "run");
        assert_eq!(lemmatizer.lemmatize("cats", Some(PosTag::Noun)), "cat");
        assert_eq!(
            lemmatizer.lemmatize("better", Some(PosTag::Adjective)),
            "good"
        );
        assert_eq!(
            lemmatizer.lemmatize("quickly", Some(PosTag::Adverb)),
            "quick"
        );

        // Irregular forms.
        assert_eq!(lemmatizer.lemmatize("went", Some(PosTag::Verb)), "go");
        assert_eq!(
            lemmatizer.lemmatize("children", Some(PosTag::Noun)),
            "child"
        );
        assert_eq!(lemmatizer.lemmatize("feet", Some(PosTag::Noun)), "foot");

        // Without a POS tag the lemmatizer should still resolve these.
        assert_eq!(lemmatizer.lemmatize("running", None), "run");
        assert_eq!(lemmatizer.lemmatize("went", None), "go");
    }

    #[test]
    fn test_pos_aware_lemmatizer_integration() {
        let pos_aware = create_pos_aware_lemmatizer();
        let rule_only = RuleLemmatizer::new();

        // Words whose lemma can depend on the inferred POS.
        let test_cases = ["flies", "running", "better", "works", "watches"];

        for word in test_cases {
            let pos_aware_result = pos_aware.stem(word).expect("Operation failed");
            let rule_only_result = rule_only.stem(word).expect("Operation failed");

            println!(
                "Word: '{word}' -> POS-aware: '{pos_aware_result}', Rule-only: '{rule_only_result}'"
            );

            // Both paths must always produce some lemma.
            assert!(!pos_aware_result.is_empty());
            assert!(!rule_only_result.is_empty());
        }
    }

    #[test]
    fn test_pos_aware_lemmatizer_accuracy() {
        let pos_aware = create_pos_aware_lemmatizer();

        // Verb forms.
        assert_eq!(pos_aware.stem("running").expect("Operation failed"), "run");
        assert_eq!(pos_aware.stem("walked").expect("Operation failed"), "walk");
        assert_eq!(pos_aware.stem("plays").expect("Operation failed"), "play");
        assert_eq!(pos_aware.stem("played").expect("Operation failed"), "play");
        assert_eq!(
            pos_aware.stem("swimming").expect("Operation failed"),
            "swim"
        );

        // Noun forms; "happiness" is expected to survive untouched.
        assert_eq!(pos_aware.stem("cats").expect("Operation failed"), "cat");
        assert_eq!(pos_aware.stem("dogs").expect("Operation failed"), "dog");
        assert_eq!(
            pos_aware.stem("happiness").expect("Operation failed"),
            "happiness"
        );
    }

    #[test]
    fn test_pos_aware_lemmatizer_custom_config() {
        let pos_config = crate::pos_tagging::PosTaggerConfig {
            use_context: false,
            smoothing_factor: 0.01,
            use_morphology: true,
            use_capitalization: true,
        };

        let lemma_config = LemmatizerConfig {
            use_pos_tagging: true,
            default_pos: PosTag::Verb,
            apply_case_restoration: false,
            check_vowels: true,
        };

        let pos_aware = create_pos_aware_lemmatizer_with_config(pos_config, lemma_config);

        // Case restoration is off, so the capitalized input lowercases.
        let result = pos_aware.stem("Running").expect("Operation failed");
        assert_eq!(result, "run");
    }

    #[test]
    fn test_stemmers_and_lemmatizers_comparison() {
        let porter = PorterStemmer::new();
        let snowball = SnowballStemmer::new("english").expect("Operation failed");
        let lancaster = LancasterStemmer::new();
        let simple_lemmatizer = SimpleLemmatizer::new();
        let rule_lemmatizer = RuleLemmatizer::new();

        let test_words = [
            "running",
            "cats",
            "better",
            "went",
            "children",
            "feet",
            "universities",
        ];

        // Print all five outputs side by side for manual inspection.
        for word in test_words {
            println!(
                "Word: '{}'\nPorter: '{}'\nSnowball: '{}'\nLancaster: '{}'\nSimple: '{}'\nRule: '{}'",
                word,
                porter.stem(word).expect("Operation failed"),
                snowball.stem(word).expect("Operation failed"),
                lancaster.stem(word).expect("Operation failed"),
                simple_lemmatizer.stem(word).expect("Operation failed"),
                rule_lemmatizer.stem(word).expect("Operation failed")
            );
        }

        // Both approaches handle a regular form.
        assert_eq!(porter.stem("running").expect("Operation failed"), "run");
        assert_eq!(
            rule_lemmatizer.stem("running").expect("Operation failed"),
            "run"
        );

        // Only the rule lemmatizer resolves irregular forms.
        assert_eq!(porter.stem("went").expect("Operation failed"), "went");
        assert_eq!(
            rule_lemmatizer.stem("went").expect("Operation failed"),
            "go"
        );
        assert_eq!(porter.stem("feet").expect("Operation failed"), "feet");
        assert_eq!(
            rule_lemmatizer.stem("feet").expect("Operation failed"),
            "foot"
        );
    }
}