Struct SimpleLemmatizer

pub struct SimpleLemmatizer { /* private fields */ }

Simple lemmatizer using a dictionary-based approach
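
A minimal usage sketch, with some assumptions: the `scirs2_text` import path is inferred from the demos below, and lemmatization is invoked through the shared `Stemmer` trait, as the repository examples do:

use scirs2_text::{SimpleLemmatizer, Stemmer};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let lemmatizer = SimpleLemmatizer::new();
    // Dictionary lookup through the `Stemmer` interface.
    let lemma = lemmatizer.stem("running")?;
    println!("running -> {lemma}");
    Ok(())
}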

Implementations

impl SimpleLemmatizer

pub fn new() -> Self

Create a new lemmatizer

Examples found in repository
examples/stemming_comparison_demo.rs (line 13)
5 fn main() -> Result<(), Box<dyn Error>> {
6    println!("Stemming Algorithms Comparison Demo");
7    println!("-----------------------------------");
8
9    // Create instances of different stemmers
10    let porter_stemmer = PorterStemmer::new();
11    let snowball_stemmer = SnowballStemmer::new("english")?;
12    let lancaster_stemmer = LancasterStemmer::new();
13    let lemmatizer = SimpleLemmatizer::new();
14
15    // Test words to compare stemming results
16    let test_words = vec![
17        "running",
18        "ran",
19        "runs",
20        "easily",
21        "fishing",
22        "fished",
23        "troubled",
24        "troubling",
25        "troubles",
26        "production",
27        "productive",
28        "argument",
29        "arguing",
30        "university",
31        "universities",
32        "maximizing",
33        "maximum",
34        "presumably",
35        "multiply",
36        "opposition",
37        "computational",
38    ];
39
40    // Print results in a table format
41    println!(
42        "{:<15} {:<15} {:<15} {:<15} {:<15}",
43        "Original", "Porter", "Snowball", "Lancaster", "Lemmatizer"
44    );
45    println!("{}", "-".repeat(75));
46
47    for word in test_words {
48        let porter_result = porter_stemmer.stem(word)?;
49        let snowball_result = snowball_stemmer.stem(word)?;
50        let lancaster_result = lancaster_stemmer.stem(word)?;
51        let lemma_result = lemmatizer.stem(word)?;
52
53        println!(
54            "{word:<15} {porter_result:<15} {snowball_result:<15} {lancaster_result:<15} {lemma_result:<15}"
55        );
56    }
57
58    // Demonstrate configurability of the Lancaster stemmer
59    println!("\nLancaster Stemmer Configuration Options");
60    println!("------------------------------------");
61
62    let default_lancaster = LancasterStemmer::new();
63    let custom_lancaster = LancasterStemmer::new()
64        .with_min_stemmed_length(3)
65        .with_acceptable_check(false);
66
67    println!(
68        "{:<15} {:<20} {:<20}",
69        "Original", "Default Lancaster", "Custom Lancaster"
70    );
71    println!("{}", "-".repeat(55));
72
73    let custom_test_words = vec!["provision", "ear", "me", "fishing", "multiply"];
74
75    for word in custom_test_words {
76        let default_result = default_lancaster.stem(word)?;
77        let custom_result = custom_lancaster.stem(word)?;
78
79        println!("{word:<15} {default_result:<20} {custom_result:<20}");
80    }
81
82    println!("\nNotes:");
83    println!("- Porter stemmer: Established algorithm, medium aggressiveness");
84    println!("- Snowball stemmer: Improved Porter algorithm with language-specific rules");
85    println!("- Lancaster stemmer: Most aggressive stemming, can be configured");
86    println!("- Lemmatizer: Dictionary-based approach, produces actual words");
87
88    Ok(())
89 }
More examples
examples/text_processing_demo.rs (line 64)
14 fn main() -> Result<(), Box<dyn std::error::Error>> {
15    println!("=== SciRS2 Text Processing Demo ===\n");
16
17    let documents = [
18        "The quick brown fox jumps over the lazy dog.",
19        "A fast red fox leaped over the sleeping canine.",
20        "Machine learning algorithms process textual data efficiently.",
21        "Text processing and natural language understanding are important.",
22    ];
23
24    // 1. Text Normalization
25    println!("1. Text Normalization");
26    let normalizer = BasicNormalizer::new(true, true);
27    for (i, doc) in documents.iter().enumerate() {
28        let normalized = normalizer.normalize(doc)?;
29        println!("Doc {}: {}", i + 1, normalized);
30    }
31    println!();
32
33    // 2. Text Cleaning
34    println!("2. Text Cleaning");
35    let cleaner = BasicTextCleaner::new(true, true, true);
36    for (i, doc) in documents.iter().enumerate() {
37        let cleaned = cleaner.clean(doc)?;
38        println!("Doc {}: {}", i + 1, cleaned);
39    }
40    println!();
41
42    // 3. Tokenization Examples
43    println!("3. Tokenization Examples");
44
45    // Word tokenization
46    let word_tokenizer = WordTokenizer::new(true);
47    let tokens = word_tokenizer.tokenize(documents[0])?;
48    println!("Word tokens: {tokens:?}");
49
50    // N-gram tokenization
51    let ngram_tokenizer = NgramTokenizer::new(2)?;
52    let ngrams = ngram_tokenizer.tokenize(documents[0])?;
53    println!("2-grams: {ngrams:?}");
54
55    // Regex tokenization
56    let regex_tokenizer = RegexTokenizer::new(r"\b\w+\b", false)?;
57    let regex_tokens = regex_tokenizer.tokenize(documents[0])?;
58    println!("Regex tokens: {regex_tokens:?}");
59    println!();
60
61    // 4. Stemming and Lemmatization
62    println!("4. Stemming and Lemmatization");
63    let porter_stemmer = PorterStemmer::new();
64    let lemmatizer = SimpleLemmatizer::new();
65
66    let test_words = vec!["running", "jumped", "better", "processing"];
67    for word in test_words {
68        let stemmed = porter_stemmer.stem(word)?;
69        let lemmatized = lemmatizer.stem(word)?;
70        println!("{word}: stemmed={stemmed}, lemmatized={lemmatized}");
71    }
72    println!();
73
74    // 5. Count Vectorization
75    println!("5. Count Vectorization");
76    let mut count_vectorizer = CountVectorizer::new(false);
77
78    let doc_refs = documents.to_vec();
79    count_vectorizer.fit(&doc_refs)?;
80
81    // Transform individual documents
82    let count_matrix = count_vectorizer.transform_batch(&doc_refs)?;
83    println!("Count vector shape: {:?}", count_matrix.shape());
84    println!("Vocabulary size: {}", count_vectorizer.vocabulary().len());
85
86    println!();
87
88    // 6. TF-IDF Vectorization
89    println!("6. TF-IDF Vectorization");
90    let mut tfidf_vectorizer = TfidfVectorizer::new(false, true, Some("l2".to_string()));
91
92    tfidf_vectorizer.fit(&doc_refs)?;
93    let tfidf_matrix = tfidf_vectorizer.transform_batch(&doc_refs)?;
94
95    println!("TF-IDF vector shape: {:?}", tfidf_matrix.shape());
96    println!("Sample TF-IDF values:");
97    for i in 0..3.min(tfidf_matrix.nrows()) {
98        for j in 0..5.min(tfidf_matrix.ncols()) {
99            print!("{:.3} ", tfidf_matrix[[i, j]]);
100        }
101        println!();
102    }
103    println!();
104
105    // 7. Complete Pipeline Example
106    println!("7. Complete Text Processing Pipeline");
107    let testtext = "The cats were running quickly through the gardens.";
108
109    // Normalize
110    let normalized = normalizer.normalize(testtext)?;
111    println!("Normalized: {normalized}");
112
113    // Clean
114    let cleaned = cleaner.clean(&normalized)?;
115    println!("Cleaned: {cleaned}");
116
117    // Tokenize
118    let tokens = word_tokenizer.tokenize(&cleaned)?;
119    println!("Tokens: {tokens:?}");
120
121    // Stem
122    let stemmed_tokens: Result<Vec<_>, _> = tokens
123        .iter()
124        .map(|token| porter_stemmer.stem(token))
125        .collect();
126    let stemmed_tokens = stemmed_tokens?;
127    println!("Stemmed: {stemmed_tokens:?}");
128
129    Ok(())
130 }
examples/rule_lemmatizer_demo.rs (line 30)
26 fn main() -> Result<(), Box<dyn std::error::Error>> {
27    println!("Rule-based Lemmatization Demo\n");
28
29    // Create lemmatizers and stemmers
30    let simple_lemmatizer = SimpleLemmatizer::new();
31    let rule_lemmatizer = RuleLemmatizer::new();
32    let porter_stemmer = PorterStemmer::new();
33
34    // Create a POS-aware lemmatizer using the builder pattern
35    let pos_aware_lemmatizer = RuleLemmatizerBuilder::new()
36        .use_pos_tagging(true)
37        .apply_case_restoration(true)
38        .check_vowels(true)
39        .build();
40
41    // Simple demo comparing lemmatization results
42    println!("\n=== Lemmatization Comparison ===\n");
43    let test_words = vec![
44        ("running", Some(PosTag::Verb)),
45        ("ran", Some(PosTag::Verb)),
46        ("better", Some(PosTag::Adjective)),
47        ("best", Some(PosTag::Adjective)),
48        ("feet", Some(PosTag::Noun)),
49        ("children", Some(PosTag::Noun)),
50        ("went", Some(PosTag::Verb)),
51        ("mice", Some(PosTag::Noun)),
52        ("quickly", Some(PosTag::Adverb)),
53        ("universities", Some(PosTag::Noun)),
54        ("studying", Some(PosTag::Verb)),
55        ("studied", Some(PosTag::Verb)),
56        ("studies", Some(PosTag::Verb)),
57    ];
58
59    println!(
60        "{:<15} {:<15} {:<15} {:<15}",
61        "Word", "Simple", "Rule-based", "Porter"
62    );
63    println!("{:-<60}", "");
64
65    for (word, pos) in &test_words {
66        let simple_result = simple_lemmatizer.stem(word)?;
67        let rule_result = if let Some(pos_tag) = pos {
68            rule_lemmatizer.lemmatize(word, Some(pos_tag.clone()))
69        } else {
70            rule_lemmatizer.stem(word)?
71        };
72        let porter_result = porter_stemmer.stem(word)?;
73
74        println!("{word:<15} {simple_result:<15} {rule_result:<15} {porter_result:<15}");
75    }
76
77    // POS tagging demonstration
78    println!("\n=== Part-of-Speech Aware Lemmatization ===\n");
79    println!("Demonstrating how the same word can lemmatize differently based on POS tag:\n");
80
81    let ambiguous_words = vec![
82        ("left", vec![PosTag::Verb, PosTag::Adjective, PosTag::Noun]),
83        ("close", vec![PosTag::Verb, PosTag::Adjective, PosTag::Noun]),
84        ("flies", vec![PosTag::Verb, PosTag::Noun]),
85        ("saw", vec![PosTag::Verb, PosTag::Noun]),
86        ("light", vec![PosTag::Noun, PosTag::Verb, PosTag::Adjective]),
87    ];
88
89    for (word, pos_tags) in &ambiguous_words {
90        println!("Word: \"{word}\"");
91
92        for pos in pos_tags {
93            let pos_name = match pos {
94                PosTag::Verb => "Verb",
95                PosTag::Noun => "Noun",
96                PosTag::Adjective => "Adjective",
97                PosTag::Adverb => "Adverb",
98                PosTag::Other => "Other",
99            };
100            println!(
101                "  as {:<10}: {}",
102                pos_name,
103                pos_aware_lemmatizer.lemmatize(word, Some(pos.clone()))
104            );
105        }
106        println!();
107    }
108
109    // Performance comparison
110    println!("\n=== Performance Comparison ===\n");
111
112    // Preprocess text into tokens for benchmarking
113    let mut all_tokens = Vec::new();
114    for text in TEXTS {
115        all_tokens.extend(
116            text.split_whitespace()
117                .map(|s| s.trim_matches(|c: char| !c.is_alphabetic()))
118                .filter(|s| !s.is_empty())
119                .collect::<Vec<_>>(),
120        );
121    }
122
123    // Assign random POS tags for the benchmark
124    let mut tokens_with_pos = Vec::new();
125    let pos_tags = [
126        PosTag::Verb,
127        PosTag::Noun,
128        PosTag::Adjective,
129        PosTag::Adverb,
130        PosTag::Other,
131    ];
132    for (i, token) in all_tokens.iter().enumerate() {
133        tokens_with_pos.push((token.to_string(), pos_tags[i % pos_tags.len()].clone()));
134    }
135
136    // Benchmark simple lemmatizer
137    let start = Instant::now();
138    for token in &all_tokens {
139        let _ = simple_lemmatizer.stem(token)?;
140    }
141    let simple_time = start.elapsed();
142
143    // Benchmark rule-based lemmatizer without POS
144    let start = Instant::now();
145    for token in &all_tokens {
146        let _ = rule_lemmatizer.stem(token)?;
147    }
148    let rule_time = start.elapsed();
149
150    // Benchmark rule-based lemmatizer with POS
151    let start = Instant::now();
152    for (token, pos) in &tokens_with_pos {
153        let _ = pos_aware_lemmatizer.lemmatize(token, Some(pos.clone()));
154    }
155    let pos_rule_time = start.elapsed();
156
157    // Benchmark Porter stemmer
158    let start = Instant::now();
159    for token in &all_tokens {
160        let _ = porter_stemmer.stem(token)?;
161    }
162    let porter_time = start.elapsed();
163
164    println!("Processing {} tokens:\n", all_tokens.len());
165    println!("- SimpleLemmatizer: {simple_time:.2?}");
166    println!("- RuleLemmatizer (without POS): {rule_time:.2?}");
167    println!("- RuleLemmatizer (with POS): {pos_rule_time:.2?}");
168    println!("- PorterStemmer: {porter_time:.2?}");
169
170    // Example using RuleLemmatizer on real text
171    println!("\n=== Text Processing Example ===\n");
172    let text = "The scientists were running experiments to test their hypotheses. \
173                The children went to the museum, where they saw the fossils of prehistoric animals. \
174                Universities are studying better methods to address these issues quickly.";
175
176    println!("Original text:\n{text}\n");
177
178    // Simple tokenization and lemmatization with POS tags
179    let tokens: Vec<&str> = text
180        .split_whitespace()
181        .map(|s| s.trim_matches(|c: char| !c.is_alphabetic()))
182        .filter(|s| !s.is_empty())
183        .collect();
184
185    // Simulate a very basic POS tagger (in a real application, you would use a proper tagger)
186    let mut pos_map: HashMap<&str, PosTag> = HashMap::new();
187    pos_map.insert("scientists", PosTag::Noun);
188    pos_map.insert("were", PosTag::Verb);
189    pos_map.insert("running", PosTag::Verb);
190    pos_map.insert("experiments", PosTag::Noun);
191    pos_map.insert("test", PosTag::Verb);
192    pos_map.insert("their", PosTag::Other);
193    pos_map.insert("hypotheses", PosTag::Noun);
194    pos_map.insert("children", PosTag::Noun);
195    pos_map.insert("went", PosTag::Verb);
196    pos_map.insert("museum", PosTag::Noun);
197    pos_map.insert("saw", PosTag::Verb);
198    pos_map.insert("fossils", PosTag::Noun);
199    pos_map.insert("prehistoric", PosTag::Adjective);
200    pos_map.insert("animals", PosTag::Noun);
201    pos_map.insert("universities", PosTag::Noun);
202    pos_map.insert("studying", PosTag::Verb);
203    pos_map.insert("better", PosTag::Adjective);
204    pos_map.insert("methods", PosTag::Noun);
205    pos_map.insert("address", PosTag::Verb);
206    pos_map.insert("issues", PosTag::Noun);
207    pos_map.insert("quickly", PosTag::Adverb);
208
209    // Process text
210    println!("Word-by-word lemmatization results:");
211    println!(
212        "{:<15} {:<15} {:<15} {:<15}",
213        "Word", "RuleLemmatizer", "With POS", "Porter"
214    );
215    println!("{:-<60}", "");
216
217    for token in tokens {
218        let pos = pos_map.get(token.to_lowercase().as_str()).cloned();
219
220        let rule_result = rule_lemmatizer.stem(token)?;
221        let pos_result = if let Some(pos_tag) = &pos {
222            pos_aware_lemmatizer.lemmatize(token, Some(pos_tag.clone()))
223        } else {
224            pos_aware_lemmatizer.stem(token)?
225        };
226        let porter_result = porter_stemmer.stem(token)?;
227
228        println!("{token:<15} {rule_result:<15} {pos_result:<15} {porter_result:<15}");
229    }
230
231    // Custom rules example
232    println!("\n=== Custom Rules and Exceptions ===\n");
233
234    // Create a custom lemmatizer with additional rules and exceptions
235    let custom_lemmatizer = RuleLemmatizerBuilder::new()
236        .add_exception("dataset", "data")
237        .add_exception("corpora", "corpus")
238        .add_dict_entry("nlp", "natural language processing")
239        .build();
240
241    let custom_words = vec!["dataset", "corpora", "nlp", "datasets"];
242
243    println!("Custom lemmatizer results:");
244    for word in custom_words {
245        println!(
246            "{:<15} -> {}",
247            word,
248            custom_lemmatizer.lemmatize(word, None)
249        );
250    }
251
252    Ok(())
253 }

pub fn from_dict_file(path: &str) -> Result<Self>

Load a lemmatization dictionary from a file
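
A hedged sketch (inside a Result-returning main with `Stemmer` in scope, as in the examples above; `lemmas.txt` is a hypothetical path, and the on-disk dictionary format is not documented on this page):

    // Hypothetical file; the expected dictionary format is an assumption.
    let lemmatizer = SimpleLemmatizer::from_dict_file("lemmas.txt")?;
    println!("{}", lemmatizer.stem("geese")?);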

pub fn add_lemma(&mut self, word: &str, lemma: &str)

Add a lemma mapping
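
A minimal sketch of extending the dictionary at runtime (same assumed context as above; that the lookup returns the mapped lemma follows from the dictionary-based design):

    let mut lemmatizer = SimpleLemmatizer::new();
    lemmatizer.add_lemma("mice", "mouse");
    // Lookup goes through the `Stemmer` trait's `stem` method.
    assert_eq!(lemmatizer.stem("mice")?, "mouse");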

Trait Implementations

impl Clone for SimpleLemmatizer

fn clone(&self) -> SimpleLemmatizer

Returns a duplicate of the value. Read more

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more

impl Debug for SimpleLemmatizer

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

impl Default for SimpleLemmatizer

fn default() -> Self

Returns the “default value” for a type. Read more

impl Stemmer for SimpleLemmatizer

fn stem(&self, word: &str) -> Result<String>

Stem a single word

fn stem_batch(&self, words: &[&str]) -> Result<Vec<String>>

Stem multiple words
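
A short sketch of batch lemmatization (same assumed context as above; based on the signature, outputs are one String per input word, in order):

    let lemmatizer = SimpleLemmatizer::new();
    let lemmas = lemmatizer.stem_batch(&["running", "mice", "feet"])?;
    println!("{lemmas:?}");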

Auto Trait Implementations

Blanket Implementations

impl<T> Any for T
where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

impl<T> Borrow<T> for T
where T: ?Sized,

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

impl<T> BorrowMut<T> for T
where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

impl<T> CloneToUninit for T
where T: Clone,

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬 This is a nightly-only experimental API. (clone_to_uninit)
Performs copy-assignment from self to dest. Read more

impl<T> From<T> for T

fn from(t: T) -> T

Returns the argument unchanged.

impl<T, U> Into<U> for T
where U: From<T>,

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

impl<T> Pointable for T

const ALIGN: usize

The alignment of the pointer.

type Init = T

The type for initializers.

unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes a value with the given initializer. Read more

unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more

unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more

impl<T> Same for T

type Output = T

Should always be Self

impl<SS, SP> SupersetOf<SS> for SP
where SS: SubsetOf<SP>,

fn to_subset(&self) -> Option<SS>

The inverse inclusion map: attempts to construct self from the equivalent element of its superset. Read more

fn is_in_subset(&self) -> bool

Checks if self is actually part of its subset T (and can be converted to it).

fn to_subset_unchecked(&self) -> SS

Use with care! Same as self.to_subset but without any property checks. Always succeeds.

fn from_subset(element: &SS) -> SP

The inclusion map: converts self to the equivalent element of its superset.

impl<T> ToOwned for T
where T: Clone,

type Owned = T

The resulting type after obtaining ownership.

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more

impl<T, U> TryFrom<U> for T
where U: Into<T>,

type Error = Infallible

The type returned in the event of a conversion error.

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn vzip(self) -> V