pub struct SimpleLemmatizer { /* private fields */ }
Simple lemmatizer that maps inflected word forms to their lemmas using a dictionary-based lookup.
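Conceptually, a dictionary-based lemmatizer is a table lookup. The sketch below illustrates the general technique with a plain HashMap; it is not the crate's internal representation, and the pass-through fallback for unknown words is an assumption for illustration.

use std::collections::HashMap;

// Illustrative stand-in for the dictionary lookup at the core of a simple
// lemmatizer. The pass-through for unknown words is an assumed fallback,
// not a documented guarantee of SimpleLemmatizer.
fn lemmatize(dict: &HashMap<&str, &str>, word: &str) -> String {
    dict.get(word)
        .map_or_else(|| word.to_string(), |lemma| lemma.to_string())
}

fn main() {
    let dict = HashMap::from([("running", "run"), ("ran", "run"), ("mice", "mouse")]);
    assert_eq!(lemmatize(&dict, "running"), "run");
    assert_eq!(lemmatize(&dict, "table"), "table"); // unknown word passes through
}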
Implementations
impl SimpleLemmatizer
pub fn new() -> Self
Create a new lemmatizer
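A minimal usage sketch. The import path is an assumption (the scraped repository examples below omit their use statements); the stem calls mirror those examples.

use scirs2_text::SimpleLemmatizer; // import path assumed
// A stemming trait import may also be needed for stem(); see the crate root.

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let lemmatizer = SimpleLemmatizer::new();
    let lemma = lemmatizer.stem("running")?;
    println!("running -> {lemma}");
    Ok(())
}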
Examples found in repository
examples/stemming_comparison_demo.rs (line 13)
5 fn main() -> Result<(), Box<dyn Error>> {
6 println!("Stemming Algorithms Comparison Demo");
7 println!("-----------------------------------");
8
9 // Create instances of different stemmers
10 let porter_stemmer = PorterStemmer::new();
11 let snowball_stemmer = SnowballStemmer::new("english")?;
12 let lancaster_stemmer = LancasterStemmer::new();
13 let lemmatizer = SimpleLemmatizer::new();
14
15 // Test words to compare stemming results
16 let test_words = vec![
17 "running",
18 "ran",
19 "runs",
20 "easily",
21 "fishing",
22 "fished",
23 "troubled",
24 "troubling",
25 "troubles",
26 "production",
27 "productive",
28 "argument",
29 "arguing",
30 "university",
31 "universities",
32 "maximizing",
33 "maximum",
34 "presumably",
35 "multiply",
36 "opposition",
37 "computational",
38 ];
39
40 // Print results in a table format
41 println!(
42 "{:<15} {:<15} {:<15} {:<15} {:<15}",
43 "Original", "Porter", "Snowball", "Lancaster", "Lemmatizer"
44 );
45 println!("{}", "-".repeat(75));
46
47 for word in test_words {
48 let porter_result = porter_stemmer.stem(word)?;
49 let snowball_result = snowball_stemmer.stem(word)?;
50 let lancaster_result = lancaster_stemmer.stem(word)?;
51 let lemma_result = lemmatizer.stem(word)?;
52
53 println!(
54 "{word:<15} {porter_result:<15} {snowball_result:<15} {lancaster_result:<15} {lemma_result:<15}"
55 );
56 }
57
58 // Demonstrate configurability of the Lancaster stemmer
59 println!("\nLancaster Stemmer Configuration Options");
60 println!("------------------------------------");
61
62 let default_lancaster = LancasterStemmer::new();
63 let custom_lancaster = LancasterStemmer::new()
64 .with_min_stemmed_length(3)
65 .with_acceptable_check(false);
66
67 println!(
68 "{:<15} {:<20} {:<20}",
69 "Original", "Default Lancaster", "Custom Lancaster"
70 );
71 println!("{}", "-".repeat(55));
72
73 let custom_test_words = vec!["provision", "ear", "me", "fishing", "multiply"];
74
75 for word in custom_test_words {
76 let default_result = default_lancaster.stem(word)?;
77 let custom_result = custom_lancaster.stem(word)?;
78
79 println!("{word:<15} {default_result:<20} {custom_result:<20}");
80 }
81
82 println!("\nNotes:");
83 println!("- Porter stemmer: Established algorithm, medium aggressiveness");
84 println!("- Snowball stemmer: Improved Porter algorithm with language-specific rules");
85 println!("- Lancaster stemmer: Most aggressive stemming, can be configured");
86 println!("- Lemmatizer: Dictionary-based approach, produces actual words");
87
88 Ok(())
89 }
More examples
examples/text_processing_demo.rs (line 64)
14 fn main() -> Result<(), Box<dyn std::error::Error>> {
15 println!("=== SciRS2 Text Processing Demo ===\n");
16
17 let documents = [
18 "The quick brown fox jumps over the lazy dog.",
19 "A fast red fox leaped over the sleeping canine.",
20 "Machine learning algorithms process textual data efficiently.",
21 "Text processing and natural language understanding are important.",
22 ];
23
24 // 1. Text Normalization
25 println!("1. Text Normalization");
26 let normalizer = BasicNormalizer::new(true, true);
27 for (i, doc) in documents.iter().enumerate() {
28 let normalized = normalizer.normalize(doc)?;
29 println!("Doc {}: {}", i + 1, normalized);
30 }
31 println!();
32
33 // 2. Text Cleaning
34 println!("2. Text Cleaning");
35 let cleaner = BasicTextCleaner::new(true, true, true);
36 for (i, doc) in documents.iter().enumerate() {
37 let cleaned = cleaner.clean(doc)?;
38 println!("Doc {}: {}", i + 1, cleaned);
39 }
40 println!();
41
42 // 3. Tokenization Examples
43 println!("3. Tokenization Examples");
44
45 // Word tokenization
46 let word_tokenizer = WordTokenizer::new(true);
47 let tokens = word_tokenizer.tokenize(documents[0])?;
48 println!("Word tokens: {tokens:?}");
49
50 // N-gram tokenization
51 let ngram_tokenizer = NgramTokenizer::new(2)?;
52 let ngrams = ngram_tokenizer.tokenize(documents[0])?;
53 println!("2-grams: {ngrams:?}");
54
55 // Regex tokenization
56 let regex_tokenizer = RegexTokenizer::new(r"\b\w+\b", false)?;
57 let regex_tokens = regex_tokenizer.tokenize(documents[0])?;
58 println!("Regex tokens: {regex_tokens:?}");
59 println!();
60
61 // 4. Stemming and Lemmatization
62 println!("4. Stemming and Lemmatization");
63 let porter_stemmer = PorterStemmer::new();
64 let lemmatizer = SimpleLemmatizer::new();
65
66 let test_words = vec!["running", "jumped", "better", "processing"];
67 for word in test_words {
68 let stemmed = porter_stemmer.stem(word)?;
69 let lemmatized = lemmatizer.stem(word)?;
70 println!("{word}: stemmed={stemmed}, lemmatized={lemmatized}");
71 }
72 println!();
73
74 // 5. Count Vectorization
75 println!("5. Count Vectorization");
76 let mut count_vectorizer = CountVectorizer::new(false);
77
78 let doc_refs = documents.to_vec();
79 count_vectorizer.fit(&doc_refs)?;
80
81 // Transform individual documents
82 let count_matrix = count_vectorizer.transform_batch(&doc_refs)?;
83 println!("Count vector shape: {:?}", count_matrix.shape());
84 println!("Vocabulary size: {}", count_vectorizer.vocabulary().len());
85
86 println!();
87
88 // 6. TF-IDF Vectorization
89 println!("6. TF-IDF Vectorization");
90 let mut tfidf_vectorizer = TfidfVectorizer::new(false, true, Some("l2".to_string()));
91
92 tfidf_vectorizer.fit(&doc_refs)?;
93 let tfidf_matrix = tfidf_vectorizer.transform_batch(&doc_refs)?;
94
95 println!("TF-IDF vector shape: {:?}", tfidf_matrix.shape());
96 println!("Sample TF-IDF values:");
97 for i in 0..3.min(tfidf_matrix.nrows()) {
98 for j in 0..5.min(tfidf_matrix.ncols()) {
99 print!("{:.3} ", tfidf_matrix[[i, j]]);
100 }
101 println!();
102 }
103 println!();
104
105 // 7. Complete Pipeline Example
106 println!("7. Complete Text Processing Pipeline");
107 let testtext = "The cats were running quickly through the gardens.";
108
109 // Normalize
110 let normalized = normalizer.normalize(testtext)?;
111 println!("Normalized: {normalized}");
112
113 // Clean
114 let cleaned = cleaner.clean(&normalized)?;
115 println!("Cleaned: {cleaned}");
116
117 // Tokenize
118 let tokens = word_tokenizer.tokenize(&cleaned)?;
119 println!("Tokens: {tokens:?}");
120
121 // Stem
122 let stemmed_tokens: Result<Vec<_>, _> = tokens
123 .iter()
124 .map(|token| porter_stemmer.stem(token))
125 .collect();
126 let stemmed_tokens = stemmed_tokens?;
127 println!("Stemmed: {stemmed_tokens:?}");
128
129 Ok(())
130 }
examples/rule_lemmatizer_demo.rs (line 30)
26 fn main() -> Result<(), Box<dyn std::error::Error>> {
27 println!("Rule-based Lemmatization Demo\n");
28
29 // Create lemmatizers and stemmers
30 let simple_lemmatizer = SimpleLemmatizer::new();
31 let rule_lemmatizer = RuleLemmatizer::new();
32 let porter_stemmer = PorterStemmer::new();
33
34 // Create a POS-aware lemmatizer using the builder pattern
35 let pos_aware_lemmatizer = RuleLemmatizerBuilder::new()
36 .use_pos_tagging(true)
37 .apply_case_restoration(true)
38 .check_vowels(true)
39 .build();
40
41 // Simple demo comparing lemmatization results
42 println!("\n=== Lemmatization Comparison ===\n");
43 let test_words = vec![
44 ("running", Some(PosTag::Verb)),
45 ("ran", Some(PosTag::Verb)),
46 ("better", Some(PosTag::Adjective)),
47 ("best", Some(PosTag::Adjective)),
48 ("feet", Some(PosTag::Noun)),
49 ("children", Some(PosTag::Noun)),
50 ("went", Some(PosTag::Verb)),
51 ("mice", Some(PosTag::Noun)),
52 ("quickly", Some(PosTag::Adverb)),
53 ("universities", Some(PosTag::Noun)),
54 ("studying", Some(PosTag::Verb)),
55 ("studied", Some(PosTag::Verb)),
56 ("studies", Some(PosTag::Verb)),
57 ];
58
59 println!(
60 "{:<15} {:<15} {:<15} {:<15}",
61 "Word", "Simple", "Rule-based", "Porter"
62 );
63 println!("{:-<60}", "");
64
65 for (word, pos) in &test_words {
66 let simple_result = simple_lemmatizer.stem(word)?;
67 let rule_result = if let Some(pos_tag) = pos {
68 rule_lemmatizer.lemmatize(word, Some(pos_tag.clone()))
69 } else {
70 rule_lemmatizer.stem(word)?
71 };
72 let porter_result = porter_stemmer.stem(word)?;
73
74 println!("{word:<15} {simple_result:<15} {rule_result:<15} {porter_result:<15}");
75 }
76
77 // POS tagging demonstration
78 println!("\n=== Part-of-Speech Aware Lemmatization ===\n");
79 println!("Demonstrating how the same word can lemmatize differently based on POS tag:\n");
80
81 let ambiguous_words = vec![
82 ("left", vec![PosTag::Verb, PosTag::Adjective, PosTag::Noun]),
83 ("close", vec![PosTag::Verb, PosTag::Adjective, PosTag::Noun]),
84 ("flies", vec![PosTag::Verb, PosTag::Noun]),
85 ("saw", vec![PosTag::Verb, PosTag::Noun]),
86 ("light", vec![PosTag::Noun, PosTag::Verb, PosTag::Adjective]),
87 ];
88
89 for (word, pos_tags) in &ambiguous_words {
90 println!("Word: \"{word}\"");
91
92 for pos in pos_tags {
93 let pos_name = match pos {
94 PosTag::Verb => "Verb",
95 PosTag::Noun => "Noun",
96 PosTag::Adjective => "Adjective",
97 PosTag::Adverb => "Adverb",
98 PosTag::Other => "Other",
99 };
100 println!(
101 " as {:<10}: {}",
102 pos_name,
103 pos_aware_lemmatizer.lemmatize(word, Some(pos.clone()))
104 );
105 }
106 println!();
107 }
108
109 // Performance comparison
110 println!("\n=== Performance Comparison ===\n");
111
112 // Preprocess text into tokens for benchmarking
113 let mut all_tokens = Vec::new();
114 for text in TEXTS {
115 all_tokens.extend(
116 text.split_whitespace()
117 .map(|s| s.trim_matches(|c: char| !c.is_alphabetic()))
118 .filter(|s| !s.is_empty())
119 .collect::<Vec<_>>(),
120 );
121 }
122
123 // Assign random POS tags for the benchmark
124 let mut tokens_with_pos = Vec::new();
125 let pos_tags = [
126 PosTag::Verb,
127 PosTag::Noun,
128 PosTag::Adjective,
129 PosTag::Adverb,
130 PosTag::Other,
131 ];
132 for (i, token) in all_tokens.iter().enumerate() {
133 tokens_with_pos.push((token.to_string(), pos_tags[i % pos_tags.len()].clone()));
134 }
135
136 // Benchmark simple lemmatizer
137 let start = Instant::now();
138 for token in &all_tokens {
139 let _ = simple_lemmatizer.stem(token)?;
140 }
141 let simple_time = start.elapsed();
142
143 // Benchmark rule-based lemmatizer without POS
144 let start = Instant::now();
145 for token in &all_tokens {
146 let _ = rule_lemmatizer.stem(token)?;
147 }
148 let rule_time = start.elapsed();
149
150 // Benchmark rule-based lemmatizer with POS
151 let start = Instant::now();
152 for (token, pos) in &tokens_with_pos {
153 let _ = pos_aware_lemmatizer.lemmatize(token, Some(pos.clone()));
154 }
155 let pos_rule_time = start.elapsed();
156
157 // Benchmark Porter stemmer
158 let start = Instant::now();
159 for token in &all_tokens {
160 let _ = porter_stemmer.stem(token)?;
161 }
162 let porter_time = start.elapsed();
163
164 println!("Processing {} tokens:\n", all_tokens.len());
165 println!("- SimpleLemmatizer: {simple_time:.2?}");
166 println!("- RuleLemmatizer (without POS): {rule_time:.2?}");
167 println!("- RuleLemmatizer (with POS): {pos_rule_time:.2?}");
168 println!("- PorterStemmer: {porter_time:.2?}");
169
170 // Example using RuleLemmatizer on real text
171 println!("\n=== Text Processing Example ===\n");
172 let text = "The scientists were running experiments to test their hypotheses. \
173 The children went to the museum, where they saw the fossils of prehistoric animals. \
174 Universities are studying better methods to address these issues quickly.";
175
176 println!("Original text:\n{text}\n");
177
178 // Simple tokenization and lemmatization with POS tags
179 let tokens: Vec<&str> = text
180 .split_whitespace()
181 .map(|s| s.trim_matches(|c: char| !c.is_alphabetic()))
182 .filter(|s| !s.is_empty())
183 .collect();
184
185 // Simulate a very basic POS tagger (in a real application, you would use a proper tagger)
186 let mut pos_map: HashMap<&str, PosTag> = HashMap::new();
187 pos_map.insert("scientists", PosTag::Noun);
188 pos_map.insert("were", PosTag::Verb);
189 pos_map.insert("running", PosTag::Verb);
190 pos_map.insert("experiments", PosTag::Noun);
191 pos_map.insert("test", PosTag::Verb);
192 pos_map.insert("their", PosTag::Other);
193 pos_map.insert("hypotheses", PosTag::Noun);
194 pos_map.insert("children", PosTag::Noun);
195 pos_map.insert("went", PosTag::Verb);
196 pos_map.insert("museum", PosTag::Noun);
197 pos_map.insert("saw", PosTag::Verb);
198 pos_map.insert("fossils", PosTag::Noun);
199 pos_map.insert("prehistoric", PosTag::Adjective);
200 pos_map.insert("animals", PosTag::Noun);
201 pos_map.insert("universities", PosTag::Noun);
202 pos_map.insert("studying", PosTag::Verb);
203 pos_map.insert("better", PosTag::Adjective);
204 pos_map.insert("methods", PosTag::Noun);
205 pos_map.insert("address", PosTag::Verb);
206 pos_map.insert("issues", PosTag::Noun);
207 pos_map.insert("quickly", PosTag::Adverb);
208
209 // Process text
210 println!("Word-by-word lemmatization results:");
211 println!(
212 "{:<15} {:<15} {:<15} {:<15}",
213 "Word", "RuleLemmatizer", "With POS", "Porter"
214 );
215 println!("{:-<60}", "");
216
217 for token in tokens {
218 let pos = pos_map.get(token.to_lowercase().as_str()).cloned();
219
220 let rule_result = rule_lemmatizer.stem(token)?;
221 let pos_result = if let Some(pos_tag) = &pos {
222 pos_aware_lemmatizer.lemmatize(token, Some(pos_tag.clone()))
223 } else {
224 pos_aware_lemmatizer.stem(token)?
225 };
226 let porter_result = porter_stemmer.stem(token)?;
227
228 println!("{token:<15} {rule_result:<15} {pos_result:<15} {porter_result:<15}");
229 }
230
231 // Custom rules example
232 println!("\n=== Custom Rules and Exceptions ===\n");
233
234 // Create a custom lemmatizer with additional rules and exceptions
235 let custom_lemmatizer = RuleLemmatizerBuilder::new()
236 .add_exception("dataset", "data")
237 .add_exception("corpora", "corpus")
238 .add_dict_entry("nlp", "natural language processing")
239 .build();
240
241 let custom_words = vec!["dataset", "corpora", "nlp", "datasets"];
242
243 println!("Custom lemmatizer results:");
244 for word in custom_words {
245 println!(
246 "{:<15} -> {}",
247 word,
248 custom_lemmatizer.lemmatize(word, None)
249 );
250 }
251
252 Ok(())
253 }
pub fn from_dict_file(path: &str) -> Result<Self>
Load a lemmatization dictionary from a file
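A hedged sketch of loading a custom dictionary. The path is a hypothetical placeholder, and the on-disk format is not specified here; consult the crate documentation for what from_dict_file actually parses.

use scirs2_text::SimpleLemmatizer; // import path assumed

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // "data/lemmas.txt" is a placeholder path for illustration only.
    let lemmatizer = SimpleLemmatizer::from_dict_file("data/lemmas.txt")?;
    println!("{}", lemmatizer.stem("running")?);
    Ok(())
}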
Trait Implementations
impl Clone for SimpleLemmatizer
fn clone(&self) -> SimpleLemmatizer
Returns a duplicate of the value.
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source.
impl Debug for SimpleLemmatizer
impl Default for SimpleLemmatizer
Auto Trait Implementations
impl Freeze for SimpleLemmatizer
impl RefUnwindSafe for SimpleLemmatizer
impl Send for SimpleLemmatizer
impl Sync for SimpleLemmatizer
impl Unpin for SimpleLemmatizer
impl UnwindSafe for SimpleLemmatizer
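Because SimpleLemmatizer is Send and Sync, a single instance can be shared across threads, for example behind an Arc. A sketch under the same assumed import path:

use std::sync::Arc;
use std::thread;

use scirs2_text::SimpleLemmatizer; // import path assumed

fn main() {
    let lemmatizer = Arc::new(SimpleLemmatizer::new());
    let handles: Vec<_> = (0..4)
        .map(|_| {
            let lemmatizer = Arc::clone(&lemmatizer);
            // Send lets the Arc move into the thread; Sync allows shared use.
            thread::spawn(move || lemmatizer.stem("running").ok())
        })
        .collect();
    for handle in handles {
        handle.join().unwrap();
    }
}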
Blanket Implementations
impl<T> BorrowMut<T> for T where T: ?Sized
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value.
impl<T> CloneToUninit for T where T: Clone
impl<T> IntoEither for T
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true; otherwise converts self into a Right variant of Either<Self, Self>.
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true; otherwise converts self into a Right variant of Either<Self, Self>.
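These two methods come from the either crate's blanket IntoEither implementation; a quick illustration using either directly:

use either::{Either, IntoEither};

fn main() {
    let n = 5i32;
    let left = n.into_either(n > 0); // flag is true, so Left(5)
    assert_eq!(left, Either::<i32, i32>::Left(5));

    let right = n.into_either_with(|v| *v % 2 == 0); // 5 is odd, so Right(5)
    assert_eq!(right, Either::<i32, i32>::Right(5));
}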
impl<T> Pointable for T
impl<SS, SP> SupersetOf<SS> for SP where SS: SubsetOf<SP>
fn to_subset(&self) -> Option<SS>
The inverse inclusion map: attempts to construct self from the equivalent element of its superset.
fn is_in_subset(&self) -> bool
Checks if self is actually part of its subset T (and can be converted to it).
fn to_subset_unchecked(&self) -> SS
Use with care! Same as self.to_subset but without any property checks. Always succeeds.
fn from_subset(element: &SS) -> SP
The inclusion map: converts self to the equivalent element of its superset.