pub struct StatisticalCorrector { /* private fields */ }
Statistical spelling corrector
Implementations§
impl StatisticalCorrector
pub fn new(config: StatisticalCorrectorConfig) -> Self
Create a new statistical spelling corrector with the given configuration
Examples found in repository?
examples/statistical_spelling_demo.rs (line 272)
248fn performance_test(
249 dict_corrector: &DictionaryCorrector,
250 stat_corrector: &StatisticalCorrector,
251) -> Result<(), Box<dyn std::error::Error>> {
252 println!("\n=== Performance Test ===\n");
253
254 // Create test text with a mix of correct and incorrect words
255 let testtext = TEXT_WITH_CONTEXT_MISSPELLINGS.repeat(10);
256
257 // Measure dictionary corrector performance
258 let start = Instant::now();
259 let _ = dict_corrector.correcttext(&testtext)?;
260 let dict_time = start.elapsed();
261
262 // Measure statistical corrector performance
263 let start = Instant::now();
264 let _ = stat_corrector.correcttext(&testtext)?;
265 let stat_time = start.elapsed();
266
267 // Create a non-contextual statistical corrector for comparison
268 let non_context_config = StatisticalCorrectorConfig {
269 use_context: false,
270 ..Default::default()
271 };
272 let mut non_context_corrector = StatisticalCorrector::new(non_context_config);
273
274 // Add training data to ensure fair comparison
275 train_language_model(&mut non_context_corrector);
276 add_example_words(&mut non_context_corrector);
277
278 // Measure non-contextual statistical corrector performance
279 let start = Instant::now();
280 let _ = non_context_corrector.correcttext(&testtext)?;
281 let non_context_time = start.elapsed();
282
283 println!(
284 "Performance comparison on text with {} characters:",
285 testtext.len()
286 );
287 println!(" - Dictionary corrector: {dict_time:?}");
288 println!(" - Statistical _corrector (without context): {non_context_time:?}");
289 println!(" - Statistical _corrector (with context): {stat_time:?}");
290
291 Ok(())
292}
293
294// Demonstrate different configurations for statistical correction
295#[allow(dead_code)]
296fn configuration_demo() -> Result<(), Box<dyn std::error::Error>> {
297 println!("\n=== Configuration Options Demo ===\n");
298
299 // Create configurations with different settings
300 let configs = [
301 ("Default", StatisticalCorrectorConfig::default()),
302 (
303 "Conservative (max_edit_distance=1)",
304 StatisticalCorrectorConfig {
305 max_edit_distance: 1,
306 ..Default::default()
307 },
308 ),
309 (
310 "Aggressive (max_edit_distance=3)",
311 StatisticalCorrectorConfig {
312 max_edit_distance: 3,
313 ..Default::default()
314 },
315 ),
316 (
317 "Language model focused (weight=0.9)",
318 StatisticalCorrectorConfig {
319 language_model_weight: 0.9,
320 edit_distance_weight: 0.1,
321 ..Default::default()
322 },
323 ),
324 (
325 "Edit distance focused (weight=0.9)",
326 StatisticalCorrectorConfig {
327 language_model_weight: 0.1,
328 edit_distance_weight: 0.9,
329 ..Default::default()
330 },
331 ),
332 (
333 "No context",
334 StatisticalCorrectorConfig {
335 use_context: false,
336 ..Default::default()
337 },
338 ),
339 ];
340
341 // Sample misspelled words with varied edit distances
342 let test_cases = [
343 "recieve", // Should be "receive"
344 "accidant", // Could be "accident" or "accidental" depending on max_edit_distance
345 "programing", // Single-m, should be "programming"
346 "thier", // Common misspelling of "their"
347 "complements", // Could be "compliments" depending on context
348 ];
349
350 // Test each configuration
351 for (name, config) in &configs {
352 let mut corrector = StatisticalCorrector::new(config.clone());
353
354 // Train the model to ensure consistent behavior
355 train_language_model(&mut corrector);
356 add_example_words(&mut corrector);
357
358 println!("{name} configuration:");
359 println!(" max_editdistance: {}", config.max_edit_distance);
360 println!(" language_modelweight: {}", config.language_model_weight);
361 println!(" edit_distanceweight: {}", config.edit_distance_weight);
362 println!(" usecontext: {}", config.use_context);
363
364 println!("\n Correction examples:");
365 for word in &test_cases {
366 let corrected = corrector.correct(word)?;
367 println!(" {word} -> {corrected}");
368 }
369
370 // Show a context example if context is enabled
371 if config.use_context {
372 let context_example = "Going to the bnk to deposit money. The river bnk was muddy.";
373 let corrected = corrector.correcttext(context_example)?;
374 println!("\n Context example:");
375 println!(" Before: {context_example}");
376 println!(" After: {corrected}");
377 }
378
379 println!("\n{:-<60}", "");
380 }
381
382 Ok(())
383}
384
385// Demonstrate the error model (noisy channel model)
386#[allow(dead_code)]
387fn noise_model_demo() -> Result<(), Box<dyn std::error::Error>> {
388 println!("\n=== Error Model Demo ===\n");
389
390 // Create different error models with varying error type probabilities
391 let models = [
392 ("Default", ErrorModel::default()),
393 ("Deletion-heavy", ErrorModel::new(0.7, 0.1, 0.1, 0.1)),
394 ("Insertion-heavy", ErrorModel::new(0.1, 0.7, 0.1, 0.1)),
395 ("Substitution-heavy", ErrorModel::new(0.1, 0.1, 0.7, 0.1)),
396 ("Transposition-heavy", ErrorModel::new(0.1, 0.1, 0.1, 0.7)),
397 ];
398
399 // Test cases for different error types
400 let test_pairs = [
401 ("recieve", "receive"), // Transposition (i and e)
402 ("acheive", "achieve"), // Transposition (i and e)
403 ("languge", "language"), // Deletion (missing 'a')
404 ("programing", "programming"), // Deletion (missing 'm')
405 ("probblem", "problem"), // Insertion (extra 'b')
406 ("committe", "committee"), // Deletion (missing 'e')
407 ("definately", "definitely"), // Substitution ('a' instead of 'i')
408 ("seperate", "separate"), // Substitution ('e' instead of 'a')
409 ];
410
411 // Test each error model
412 println!(
413 "{:<20} {:<12} {:<12} {:<12} {:<12} {:<12}",
414 "Model", "Delete Prob", "Insert Prob", "Subst Prob", "Transp Prob", "Example"
415 );
416 println!("{:-<80}", "");
417
418 for (name, model) in &models {
419 // Pick one example to show
420 let (typo, correct) = test_pairs[0];
421 let probability = model.error_probability(typo, correct);
422
423 println!(
424 "{:<20} {:<12.2} {:<12.2} {:<12.2} {:<12.2} {:<12.4}",
425 name,
426 model.p_deletion,
427 model.p_insertion,
428 model.p_substitution,
429 model.p_transposition,
430 probability
431 );
432 }
433
434 println!("\nError probabilities for different error types (using default model):");
435
436 let default_model = ErrorModel::default();
437
438 for (typo, correct) in &test_pairs {
439 let prob = default_model.error_probability(typo, correct);
440 println!("{typo:<12} -> {correct:<12}: {prob:.6}");
441 }
442
443 println!("\nImpact on correction with custom error model:");
444
445 // Create a statistical corrector with a custom error model
446 let custom_config = StatisticalCorrectorConfig {
447 language_model_weight: 0.3,
448 edit_distance_weight: 0.7,
449 ..Default::default()
450 };
451
452 let mut custom_corrector = StatisticalCorrector::new(custom_config);
453 train_language_model(&mut custom_corrector);
454 add_example_words(&mut custom_corrector);
455
456 // Create a transposition-heavy error model (good for common spelling errors)
457 let transposition_model = ErrorModel::new(0.1, 0.1, 0.1, 0.7);
458 custom_corrector.set_error_model(transposition_model);
459
460 // Test some examples
461 println!("\nCorrecting text with transposition-heavy error model:");
462 let testtext = "I recieved a mesage about thier acheivements.";
463 let corrected = custom_corrector.correcttext(testtext)?;
464
465 println!("Before: {testtext}");
466 println!("After: {corrected}");
467
468 Ok(())
469}
pub fn from_dictionary_corrector(dictcorrector: &DictionaryCorrector) -> Self
Create a statistical corrector from a base dictionary corrector
pub fn add_corpus_file<P: AsRef<Path>>(&mut self, path: P) -> Result<()>
Add a corpus file to train the language model
pub fn add_trainingtext(&mut self, text: &str)
Add text to train the language model
Examples found in repository?
examples/statistical_spelling_demo.rs (line 74)
70fn train_language_model(corrector: &mut StatisticalCorrector) {
71 println!("Training language model with sample text...");
72
73 // Add sample training text
74 corrector.add_trainingtext(SAMPLE_TRAINING_TEXT);
75
76 // Add more specialized training examples for context disambiguation
77 let additional_examples = [
78 // Bank context examples
79 "I went to the bank to deposit money.",
80 "The bank is open until 5pm.",
81 "She works at the bank downtown.",
82 "I need to check my bank account.",
83 // River bank context examples
84 "We sat on the bank of the river.",
85 "The river bank was covered with flowers.",
86 "They fished from the bank of the lake.",
87 "The boat was tied to the bank.",
88 // Homophone examples for there/their/they're
89 "There is a book on the table.",
90 "Their house is very beautiful.",
91 "They're going to the movies tonight.",
92 "There was a problem with the system.",
93 "Their car broke down yesterday.",
94 "They're planning a vacation next month.",
95 // Complement/compliment examples
96 "He received many compliments on his presentation.",
97 "She gave him a compliment about his new haircut.",
98 "Red and green are complementary colors.",
99 "This wine complements the meal perfectly.",
100 // Message examples
101 "I received your message yesterday.",
102 "Please send me a message when you arrive.",
103 "The message was unclear and confusing.",
104 "She left a message on my voicemail.",
105 ];
106
107 for example in &additional_examples {
108 corrector.add_trainingtext(example);
109 }
110
111 println!(
112 "Language model trained with {} words vocabulary\n",
113 corrector.vocabulary_size()
114 );
115}
pub fn set_language_model(&mut self, model: NGramModel)
Set the language model
pub fn set_error_model(&mut self, model: ErrorModel)
Set the error model
Examples found in repository?
examples/statistical_spelling_demo.rs (line 458)
387fn noise_model_demo() -> Result<(), Box<dyn std::error::Error>> {
388 println!("\n=== Error Model Demo ===\n");
389
390 // Create different error models with varying error type probabilities
391 let models = [
392 ("Default", ErrorModel::default()),
393 ("Deletion-heavy", ErrorModel::new(0.7, 0.1, 0.1, 0.1)),
394 ("Insertion-heavy", ErrorModel::new(0.1, 0.7, 0.1, 0.1)),
395 ("Substitution-heavy", ErrorModel::new(0.1, 0.1, 0.7, 0.1)),
396 ("Transposition-heavy", ErrorModel::new(0.1, 0.1, 0.1, 0.7)),
397 ];
398
399 // Test cases for different error types
400 let test_pairs = [
401 ("recieve", "receive"), // Transposition (i and e)
402 ("acheive", "achieve"), // Transposition (i and e)
403 ("languge", "language"), // Deletion (missing 'a')
404 ("programing", "programming"), // Deletion (missing 'm')
405 ("probblem", "problem"), // Insertion (extra 'b')
406 ("committe", "committee"), // Deletion (missing 'e')
407 ("definately", "definitely"), // Substitution ('a' instead of 'i')
408 ("seperate", "separate"), // Substitution ('e' instead of 'a')
409 ];
410
411 // Test each error model
412 println!(
413 "{:<20} {:<12} {:<12} {:<12} {:<12} {:<12}",
414 "Model", "Delete Prob", "Insert Prob", "Subst Prob", "Transp Prob", "Example"
415 );
416 println!("{:-<80}", "");
417
418 for (name, model) in &models {
419 // Pick one example to show
420 let (typo, correct) = test_pairs[0];
421 let probability = model.error_probability(typo, correct);
422
423 println!(
424 "{:<20} {:<12.2} {:<12.2} {:<12.2} {:<12.2} {:<12.4}",
425 name,
426 model.p_deletion,
427 model.p_insertion,
428 model.p_substitution,
429 model.p_transposition,
430 probability
431 );
432 }
433
434 println!("\nError probabilities for different error types (using default model):");
435
436 let default_model = ErrorModel::default();
437
438 for (typo, correct) in &test_pairs {
439 let prob = default_model.error_probability(typo, correct);
440 println!("{typo:<12} -> {correct:<12}: {prob:.6}");
441 }
442
443 println!("\nImpact on correction with custom error model:");
444
445 // Create a statistical corrector with a custom error model
446 let custom_config = StatisticalCorrectorConfig {
447 language_model_weight: 0.3,
448 edit_distance_weight: 0.7,
449 ..Default::default()
450 };
451
452 let mut custom_corrector = StatisticalCorrector::new(custom_config);
453 train_language_model(&mut custom_corrector);
454 add_example_words(&mut custom_corrector);
455
456 // Create a transposition-heavy error model (good for common spelling errors)
457 let transposition_model = ErrorModel::new(0.1, 0.1, 0.1, 0.7);
458 custom_corrector.set_error_model(transposition_model);
459
460 // Test some examples
461 println!("\nCorrecting text with transposition-heavy error model:");
462 let testtext = "I recieved a mesage about thier acheivements.";
463 let corrected = custom_corrector.correcttext(testtext)?;
464
465 println!("Before: {testtext}");
466 println!("After: {corrected}");
467
468 Ok(())
469}
pub fn set_metric<M: StringMetric + Send + Sync + 'static>(&mut self, metric: M)
Set the string metric to use for similarity calculations
pub fn set_config(&mut self, config: StatisticalCorrectorConfig)
Set the configuration
pub fn correct_sentence(&self, sentence: &str) -> Result<String>
Correct a sentence using a context-aware approach
pub fn add_word(&mut self, word: &str, frequency: usize)
Add a word to the dictionary
Examples found in repository?
examples/statistical_spelling_demo.rs (line 146)
119fn add_example_words(corrector: &mut StatisticalCorrector) {
120 // Add specific words to the dictionary
121 let word_frequencies = [
122 // Common misspelled words
123 ("bank", 100),
124 ("river", 100),
125 ("deposit", 100),
126 ("money", 100),
127 ("received", 100),
128 ("message", 100),
129 ("meeting", 100),
130 ("compliments", 100),
131 ("complements", 100),
132 ("work", 100),
133 ("there", 100),
134 ("their", 100),
135 ("they're", 100),
136 ("was", 100),
137 ("problem", 100),
138 ("computer", 100),
139 ("museum", 100),
140 ("display", 100),
141 ("historical", 100),
142 ("artifacts", 100),
143 ];
144
145 for (word, freq) in &word_frequencies {
146 corrector.add_word(word, *freq);
147 }
148}
pub fn remove_word(&mut self, word: &str)
Remove a word from the dictionary
pub fn dictionary_size(&self) -> usize
Get the total number of words in the dictionary
Examples found in repository?
examples/statistical_spelling_demo.rs (line 190)
152fn compare_correctors(
153 dict_corrector: &DictionaryCorrector,
154 stat_corrector: &StatisticalCorrector,
155) -> Result<(), Box<dyn std::error::Error>> {
156 println!("=== Dictionary vs. Statistical Correction ===\n");
157
158 // Define test cases with known misspellings
159 let test_cases = [
160 ("recieve", "receive"),
161 ("mesage", "message"),
162 ("bnk", "bank"),
163 ("thier", "their"),
164 ("complements", "compliments"), // Can be correct in some contexts
165 ("artefacts", "artifacts"),
166 ("disply", "display"),
167 ("definately", "definitely"),
168 ];
169
170 println!(
171 "{:<15} {:<15} {:<15}",
172 "Misspelled", "Dictionary", "Statistical"
173 );
174 println!("{:-<45}", "");
175
176 for (misspelled, _expected) in &test_cases {
177 let dict_correction = dict_corrector.correct(misspelled)?;
178 let stat_correction = stat_corrector.correct(misspelled)?;
179
180 println!("{misspelled:<15} {dict_correction:<15} {stat_correction:<15}");
181 }
182
183 println!("\nDictionary sizes:");
184 println!(
185 " - Dictionary _corrector: {} words",
186 dict_corrector.dictionary_size()
187 );
188 println!(
189 " - Statistical _corrector: {} words (+ {} in language model)",
190 stat_corrector.dictionary_size(),
191 stat_corrector.vocabulary_size()
192 );
193
194 Ok(())
195}
pub fn vocabulary_size(&self) -> usize
Get the vocabulary size of the language model
Examples found in repository?
examples/statistical_spelling_demo.rs (line 113)
70fn train_language_model(corrector: &mut StatisticalCorrector) {
71 println!("Training language model with sample text...");
72
73 // Add sample training text
74 corrector.add_trainingtext(SAMPLE_TRAINING_TEXT);
75
76 // Add more specialized training examples for context disambiguation
77 let additional_examples = [
78 // Bank context examples
79 "I went to the bank to deposit money.",
80 "The bank is open until 5pm.",
81 "She works at the bank downtown.",
82 "I need to check my bank account.",
83 // River bank context examples
84 "We sat on the bank of the river.",
85 "The river bank was covered with flowers.",
86 "They fished from the bank of the lake.",
87 "The boat was tied to the bank.",
88 // Homophone examples for there/their/they're
89 "There is a book on the table.",
90 "Their house is very beautiful.",
91 "They're going to the movies tonight.",
92 "There was a problem with the system.",
93 "Their car broke down yesterday.",
94 "They're planning a vacation next month.",
95 // Complement/compliment examples
96 "He received many compliments on his presentation.",
97 "She gave him a compliment about his new haircut.",
98 "Red and green are complementary colors.",
99 "This wine complements the meal perfectly.",
100 // Message examples
101 "I received your message yesterday.",
102 "Please send me a message when you arrive.",
103 "The message was unclear and confusing.",
104 "She left a message on my voicemail.",
105 ];
106
107 for example in &additional_examples {
108 corrector.add_trainingtext(example);
109 }
110
111 println!(
112 "Language model trained with {} words vocabulary\n",
113 corrector.vocabulary_size()
114 );
115}
116
117// Function to add specific words for consistent example behavior
118#[allow(dead_code)]
119fn add_example_words(corrector: &mut StatisticalCorrector) {
120 // Add specific words to the dictionary
121 let word_frequencies = [
122 // Common misspelled words
123 ("bank", 100),
124 ("river", 100),
125 ("deposit", 100),
126 ("money", 100),
127 ("received", 100),
128 ("message", 100),
129 ("meeting", 100),
130 ("compliments", 100),
131 ("complements", 100),
132 ("work", 100),
133 ("there", 100),
134 ("their", 100),
135 ("they're", 100),
136 ("was", 100),
137 ("problem", 100),
138 ("computer", 100),
139 ("museum", 100),
140 ("display", 100),
141 ("historical", 100),
142 ("artifacts", 100),
143 ];
144
145 for (word, freq) in &word_frequencies {
146 corrector.add_word(word, *freq);
147 }
148}
149
150// Compare dictionary-based and statistical spelling correction
151#[allow(dead_code)]
152fn compare_correctors(
153 dict_corrector: &DictionaryCorrector,
154 stat_corrector: &StatisticalCorrector,
155) -> Result<(), Box<dyn std::error::Error>> {
156 println!("=== Dictionary vs. Statistical Correction ===\n");
157
158 // Define test cases with known misspellings
159 let test_cases = [
160 ("recieve", "receive"),
161 ("mesage", "message"),
162 ("bnk", "bank"),
163 ("thier", "their"),
164 ("complements", "compliments"), // Can be correct in some contexts
165 ("artefacts", "artifacts"),
166 ("disply", "display"),
167 ("definately", "definitely"),
168 ];
169
170 println!(
171 "{:<15} {:<15} {:<15}",
172 "Misspelled", "Dictionary", "Statistical"
173 );
174 println!("{:-<45}", "");
175
176 for (misspelled, _expected) in &test_cases {
177 let dict_correction = dict_corrector.correct(misspelled)?;
178 let stat_correction = stat_corrector.correct(misspelled)?;
179
180 println!("{misspelled:<15} {dict_correction:<15} {stat_correction:<15}");
181 }
182
183 println!("\nDictionary sizes:");
184 println!(
185 " - Dictionary _corrector: {} words",
186 dict_corrector.dictionary_size()
187 );
188 println!(
189 " - Statistical _corrector: {} words (+ {} in language model)",
190 stat_corrector.dictionary_size(),
191 stat_corrector.vocabulary_size()
192 );
193
194 Ok(())
195}
Trait Implementations§
impl Clone for StatisticalCorrector
impl Debug for StatisticalCorrector
impl Default for StatisticalCorrector
impl SpellingCorrector for StatisticalCorrector
Auto Trait Implementations§
impl Freeze for StatisticalCorrector
impl !RefUnwindSafe for StatisticalCorrector
impl Send for StatisticalCorrector
impl Sync for StatisticalCorrector
impl Unpin for StatisticalCorrector
impl !UnwindSafe for StatisticalCorrector
Blanket Implementations§
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value.
impl<T> CloneToUninit for T
where
    T: Clone,
impl<T> IntoEither for T
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise.
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise.
impl<T> Pointable for T
impl<SS, SP> SupersetOf<SS> for SP
where
    SS: SubsetOf<SP>,
fn to_subset(&self) -> Option<SS>
The inverse inclusion map: attempts to construct self from the equivalent element of its superset.
fn is_in_subset(&self) -> bool
Checks if self is actually part of its subset T (and can be converted to it).
fn to_subset_unchecked(&self) -> SS
Use with care! Same as self.to_subset but without any property checks. Always succeeds.
fn from_subset(element: &SS) -> SP
The inclusion map: converts self to the equivalent element of its superset.