Struct StatisticalCorrector

Source

pub struct StatisticalCorrector { /* private fields */ }

Expand description

Statistical spelling corrector

Implementations§

Source §

impl StatisticalCorrector

Source

pub fn new(config: StatisticalCorrectorConfig) -> Self

Create a new statistical spelling corrector with the given configuration

Examples found in repository ?

examples/statistical_spelling_demo.rs (line 272)

248fn performance_test(
249    dict_corrector: &DictionaryCorrector,
250    stat_corrector: &StatisticalCorrector,
251) -> Result<(), Box<dyn std::error::Error>> {
252    println!("\n=== Performance Test ===\n");
253
254    // Create test text with a mix of correct and incorrect words
255    let testtext = TEXT_WITH_CONTEXT_MISSPELLINGS.repeat(10);
256
257    // Measure dictionary corrector performance
258    let start = Instant::now();
259    let _ = dict_corrector.correcttext(&testtext)?;
260    let dict_time = start.elapsed();
261
262    // Measure statistical corrector performance
263    let start = Instant::now();
264    let _ = stat_corrector.correcttext(&testtext)?;
265    let stat_time = start.elapsed();
266
267    // Create a non-contextual statistical corrector for comparison
268    let non_context_config = StatisticalCorrectorConfig {
269        use_context: false,
270        ..Default::default()
271    };
272    let mut non_context_corrector = StatisticalCorrector::new(non_context_config);
273
274    // Add training data to ensure fair comparison
275    train_language_model(&mut non_context_corrector);
276    add_example_words(&mut non_context_corrector);
277
278    // Measure non-contextual statistical corrector performance
279    let start = Instant::now();
280    let _ = non_context_corrector.correcttext(&testtext)?;
281    let non_context_time = start.elapsed();
282
283    println!(
284        "Performance comparison on text with {} characters:",
285        testtext.len()
286    );
287    println!("  - Dictionary corrector: {dict_time:?}");
288    println!("  - Statistical _corrector (without context): {non_context_time:?}");
289    println!("  - Statistical _corrector (with context): {stat_time:?}");
290
291    Ok(())
292}
293
294// Demonstrate different configurations for statistical correction
295#[allow(dead_code)]
296fn configuration_demo() -> Result<(), Box<dyn std::error::Error>> {
297    println!("\n=== Configuration Options Demo ===\n");
298
299    // Create configurations with different settings
300    let configs = [
301        ("Default", StatisticalCorrectorConfig::default()),
302        (
303            "Conservative (max_edit_distance=1)",
304            StatisticalCorrectorConfig {
305                max_edit_distance: 1,
306                ..Default::default()
307            },
308        ),
309        (
310            "Aggressive (max_edit_distance=3)",
311            StatisticalCorrectorConfig {
312                max_edit_distance: 3,
313                ..Default::default()
314            },
315        ),
316        (
317            "Language model focused (weight=0.9)",
318            StatisticalCorrectorConfig {
319                language_model_weight: 0.9,
320                edit_distance_weight: 0.1,
321                ..Default::default()
322            },
323        ),
324        (
325            "Edit distance focused (weight=0.9)",
326            StatisticalCorrectorConfig {
327                language_model_weight: 0.1,
328                edit_distance_weight: 0.9,
329                ..Default::default()
330            },
331        ),
332        (
333            "No context",
334            StatisticalCorrectorConfig {
335                use_context: false,
336                ..Default::default()
337            },
338        ),
339    ];
340
341    // Sample misspelled words with varied edit distances
342    let test_cases = [
343        "recieve",     // Should be "receive"
344        "accidant",    // Could be "accident" or "accidental" depending on max_edit_distance
345        "programing",  // Single-m, should be "programming"
346        "thier",       // Common misspelling of "their"
347        "complements", // Could be "compliments" depending on context
348    ];
349
350    // Test each configuration
351    for (name, config) in &configs {
352        let mut corrector = StatisticalCorrector::new(config.clone());
353
354        // Train the model to ensure consistent behavior
355        train_language_model(&mut corrector);
356        add_example_words(&mut corrector);
357
358        println!("{name} configuration:");
359        println!("  max_editdistance: {}", config.max_edit_distance);
360        println!("  language_modelweight: {}", config.language_model_weight);
361        println!("  edit_distanceweight: {}", config.edit_distance_weight);
362        println!("  usecontext: {}", config.use_context);
363
364        println!("\n  Correction examples:");
365        for word in &test_cases {
366            let corrected = corrector.correct(word)?;
367            println!("    {word} -> {corrected}");
368        }
369
370        // Show a context example if context is enabled
371        if config.use_context {
372            let context_example = "Going to the bnk to deposit money. The river bnk was muddy.";
373            let corrected = corrector.correcttext(context_example)?;
374            println!("\n  Context example:");
375            println!("    Before: {context_example}");
376            println!("    After:  {corrected}");
377        }
378
379        println!("\n{:-<60}", "");
380    }
381
382    Ok(())
383}
384
385// Demonstrate the error model (noisy channel model)
386#[allow(dead_code)]
387fn noise_model_demo() -> Result<(), Box<dyn std::error::Error>> {
388    println!("\n=== Error Model Demo ===\n");
389
390    // Create different error models with varying error type probabilities
391    let models = [
392        ("Default", ErrorModel::default()),
393        ("Deletion-heavy", ErrorModel::new(0.7, 0.1, 0.1, 0.1)),
394        ("Insertion-heavy", ErrorModel::new(0.1, 0.7, 0.1, 0.1)),
395        ("Substitution-heavy", ErrorModel::new(0.1, 0.1, 0.7, 0.1)),
396        ("Transposition-heavy", ErrorModel::new(0.1, 0.1, 0.1, 0.7)),
397    ];
398
399    // Test cases for different error types
400    let test_pairs = [
401        ("recieve", "receive"),        // Transposition (i and e)
402        ("acheive", "achieve"),        // Transposition (i and e)
403        ("languge", "language"),       // Deletion (missing 'a')
404        ("programing", "programming"), // Deletion (missing 'm')
405        ("probblem", "problem"),       // Insertion (extra 'b')
406        ("committe", "committee"),     // Deletion (missing 'e')
407        ("definately", "definitely"),  // Substitution ('a' instead of 'i')
408        ("seperate", "separate"),      // Substitution ('e' instead of 'a')
409    ];
410
411    // Test each error model
412    println!(
413        "{:<20} {:<12} {:<12} {:<12} {:<12} {:<12}",
414        "Model", "Delete Prob", "Insert Prob", "Subst Prob", "Transp Prob", "Example"
415    );
416    println!("{:-<80}", "");
417
418    for (name, model) in &models {
419        // Pick one example to show
420        let (typo, correct) = test_pairs[0];
421        let probability = model.error_probability(typo, correct);
422
423        println!(
424            "{:<20} {:<12.2} {:<12.2} {:<12.2} {:<12.2} {:<12.4}",
425            name,
426            model.p_deletion,
427            model.p_insertion,
428            model.p_substitution,
429            model.p_transposition,
430            probability
431        );
432    }
433
434    println!("\nError probabilities for different error types (using default model):");
435
436    let default_model = ErrorModel::default();
437
438    for (typo, correct) in &test_pairs {
439        let prob = default_model.error_probability(typo, correct);
440        println!("{typo:<12} -> {correct:<12}: {prob:.6}");
441    }
442
443    println!("\nImpact on correction with custom error model:");
444
445    // Create a statistical corrector with a custom error model
446    let custom_config = StatisticalCorrectorConfig {
447        language_model_weight: 0.3,
448        edit_distance_weight: 0.7,
449        ..Default::default()
450    };
451
452    let mut custom_corrector = StatisticalCorrector::new(custom_config);
453    train_language_model(&mut custom_corrector);
454    add_example_words(&mut custom_corrector);
455
456    // Create a transposition-heavy error model (good for common spelling errors)
457    let transposition_model = ErrorModel::new(0.1, 0.1, 0.1, 0.7);
458    custom_corrector.set_error_model(transposition_model);
459
460    // Test some examples
461    println!("\nCorrecting text with transposition-heavy error model:");
462    let testtext = "I recieved a mesage about thier acheivements.";
463    let corrected = custom_corrector.correcttext(testtext)?;
464
465    println!("Before: {testtext}");
466    println!("After:  {corrected}");
467
468    Ok(())
469}

Source

pub fn from_dictionary_corrector(dictcorrector: &DictionaryCorrector) -> Self

Create a statistical corrector from a base dictionary corrector

Source

pub fn add_corpus_file<P: AsRef<Path>>(&mut self, path: P) -> Result<()>

Add a corpus file to train the language model

Source

pub fn add_trainingtext(&mut self, text: &str)

Add text to train the language model

Examples found in repository ?

examples/statistical_spelling_demo.rs (line 74)

70fn train_language_model(corrector: &mut StatisticalCorrector) {
71    println!("Training language model with sample text...");
72
73    // Add sample training text
74    corrector.add_trainingtext(SAMPLE_TRAINING_TEXT);
75
76    // Add more specialized training examples for context disambiguation
77    let additional_examples = [
78        // Bank context examples
79        "I went to the bank to deposit money.",
80        "The bank is open until 5pm.",
81        "She works at the bank downtown.",
82        "I need to check my bank account.",
83        // River bank context examples
84        "We sat on the bank of the river.",
85        "The river bank was covered with flowers.",
86        "They fished from the bank of the lake.",
87        "The boat was tied to the bank.",
88        // Homophone examples for there/their/they're
89        "There is a book on the table.",
90        "Their house is very beautiful.",
91        "They're going to the movies tonight.",
92        "There was a problem with the system.",
93        "Their car broke down yesterday.",
94        "They're planning a vacation next month.",
95        // Complement/compliment examples
96        "He received many compliments on his presentation.",
97        "She gave him a compliment about his new haircut.",
98        "Red and green are complementary colors.",
99        "This wine complements the meal perfectly.",
100        // Message examples
101        "I received your message yesterday.",
102        "Please send me a message when you arrive.",
103        "The message was unclear and confusing.",
104        "She left a message on my voicemail.",
105    ];
106
107    for example in &additional_examples {
108        corrector.add_trainingtext(example);
109    }
110
111    println!(
112        "Language model trained with {} words vocabulary\n",
113        corrector.vocabulary_size()
114    );
115}

Source

pub fn set_language_model(&mut self, model: NGramModel)

Set the language model

Source

pub fn set_error_model(&mut self, model: ErrorModel)

Set the error model

Examples found in repository ?

examples/statistical_spelling_demo.rs (line 458)

387fn noise_model_demo() -> Result<(), Box<dyn std::error::Error>> {
388    println!("\n=== Error Model Demo ===\n");
389
390    // Create different error models with varying error type probabilities
391    let models = [
392        ("Default", ErrorModel::default()),
393        ("Deletion-heavy", ErrorModel::new(0.7, 0.1, 0.1, 0.1)),
394        ("Insertion-heavy", ErrorModel::new(0.1, 0.7, 0.1, 0.1)),
395        ("Substitution-heavy", ErrorModel::new(0.1, 0.1, 0.7, 0.1)),
396        ("Transposition-heavy", ErrorModel::new(0.1, 0.1, 0.1, 0.7)),
397    ];
398
399    // Test cases for different error types
400    let test_pairs = [
401        ("recieve", "receive"),        // Transposition (i and e)
402        ("acheive", "achieve"),        // Transposition (i and e)
403        ("languge", "language"),       // Deletion (missing 'a')
404        ("programing", "programming"), // Deletion (missing 'm')
405        ("probblem", "problem"),       // Insertion (extra 'b')
406        ("committe", "committee"),     // Deletion (missing 'e')
407        ("definately", "definitely"),  // Substitution ('a' instead of 'i')
408        ("seperate", "separate"),      // Substitution ('e' instead of 'a')
409    ];
410
411    // Test each error model
412    println!(
413        "{:<20} {:<12} {:<12} {:<12} {:<12} {:<12}",
414        "Model", "Delete Prob", "Insert Prob", "Subst Prob", "Transp Prob", "Example"
415    );
416    println!("{:-<80}", "");
417
418    for (name, model) in &models {
419        // Pick one example to show
420        let (typo, correct) = test_pairs[0];
421        let probability = model.error_probability(typo, correct);
422
423        println!(
424            "{:<20} {:<12.2} {:<12.2} {:<12.2} {:<12.2} {:<12.4}",
425            name,
426            model.p_deletion,
427            model.p_insertion,
428            model.p_substitution,
429            model.p_transposition,
430            probability
431        );
432    }
433
434    println!("\nError probabilities for different error types (using default model):");
435
436    let default_model = ErrorModel::default();
437
438    for (typo, correct) in &test_pairs {
439        let prob = default_model.error_probability(typo, correct);
440        println!("{typo:<12} -> {correct:<12}: {prob:.6}");
441    }
442
443    println!("\nImpact on correction with custom error model:");
444
445    // Create a statistical corrector with a custom error model
446    let custom_config = StatisticalCorrectorConfig {
447        language_model_weight: 0.3,
448        edit_distance_weight: 0.7,
449        ..Default::default()
450    };
451
452    let mut custom_corrector = StatisticalCorrector::new(custom_config);
453    train_language_model(&mut custom_corrector);
454    add_example_words(&mut custom_corrector);
455
456    // Create a transposition-heavy error model (good for common spelling errors)
457    let transposition_model = ErrorModel::new(0.1, 0.1, 0.1, 0.7);
458    custom_corrector.set_error_model(transposition_model);
459
460    // Test some examples
461    println!("\nCorrecting text with transposition-heavy error model:");
462    let testtext = "I recieved a mesage about thier acheivements.";
463    let corrected = custom_corrector.correcttext(testtext)?;
464
465    println!("Before: {testtext}");
466    println!("After:  {corrected}");
467
468    Ok(())
469}

Source

pub fn set_metric<M: StringMetric + Send + Sync + 'static>(&mut self, metric: M)

Set the string metric to use for similarity calculations

Source

pub fn set_config(&mut self, config: StatisticalCorrectorConfig)

Set the configuration

Source

pub fn correct_sentence(&self, sentence: &str) -> Result<String>

Correct a sentence using a context-aware approach

Source

pub fn add_word(&mut self, word: &str, frequency: usize)

Add a word to the dictionary

Examples found in repository ?

examples/statistical_spelling_demo.rs (line 146)

119fn add_example_words(corrector: &mut StatisticalCorrector) {
120    // Add specific words to the dictionary
121    let word_frequencies = [
122        // Common misspelled words
123        ("bank", 100),
124        ("river", 100),
125        ("deposit", 100),
126        ("money", 100),
127        ("received", 100),
128        ("message", 100),
129        ("meeting", 100),
130        ("compliments", 100),
131        ("complements", 100),
132        ("work", 100),
133        ("there", 100),
134        ("their", 100),
135        ("they're", 100),
136        ("was", 100),
137        ("problem", 100),
138        ("computer", 100),
139        ("museum", 100),
140        ("display", 100),
141        ("historical", 100),
142        ("artifacts", 100),
143    ];
144
145    for (word, freq) in &word_frequencies {
146        corrector.add_word(word, *freq);
147    }
148}

Source

pub fn remove_word(&mut self, word: &str)

Remove a word from the dictionary

Source

pub fn dictionary_size(&self) -> usize

Get the total number of words in the dictionary

Examples found in repository ?

examples/statistical_spelling_demo.rs (line 190)

152fn compare_correctors(
153    dict_corrector: &DictionaryCorrector,
154    stat_corrector: &StatisticalCorrector,
155) -> Result<(), Box<dyn std::error::Error>> {
156    println!("=== Dictionary vs. Statistical Correction ===\n");
157
158    // Define test cases with known misspellings
159    let test_cases = [
160        ("recieve", "receive"),
161        ("mesage", "message"),
162        ("bnk", "bank"),
163        ("thier", "their"),
164        ("complements", "compliments"), // Can be correct in some contexts
165        ("artefacts", "artifacts"),
166        ("disply", "display"),
167        ("definately", "definitely"),
168    ];
169
170    println!(
171        "{:<15} {:<15} {:<15}",
172        "Misspelled", "Dictionary", "Statistical"
173    );
174    println!("{:-<45}", "");
175
176    for (misspelled, _expected) in &test_cases {
177        let dict_correction = dict_corrector.correct(misspelled)?;
178        let stat_correction = stat_corrector.correct(misspelled)?;
179
180        println!("{misspelled:<15} {dict_correction:<15} {stat_correction:<15}");
181    }
182
183    println!("\nDictionary sizes:");
184    println!(
185        "  - Dictionary _corrector: {} words",
186        dict_corrector.dictionary_size()
187    );
188    println!(
189        "  - Statistical _corrector: {} words (+ {} in language model)",
190        stat_corrector.dictionary_size(),
191        stat_corrector.vocabulary_size()
192    );
193
194    Ok(())
195}

Source

pub fn vocabulary_size(&self) -> usize

Get the vocabulary size of the language model

Examples found in repository ?

examples/statistical_spelling_demo.rs (line 113)

70fn train_language_model(corrector: &mut StatisticalCorrector) {
71    println!("Training language model with sample text...");
72
73    // Add sample training text
74    corrector.add_trainingtext(SAMPLE_TRAINING_TEXT);
75
76    // Add more specialized training examples for context disambiguation
77    let additional_examples = [
78        // Bank context examples
79        "I went to the bank to deposit money.",
80        "The bank is open until 5pm.",
81        "She works at the bank downtown.",
82        "I need to check my bank account.",
83        // River bank context examples
84        "We sat on the bank of the river.",
85        "The river bank was covered with flowers.",
86        "They fished from the bank of the lake.",
87        "The boat was tied to the bank.",
88        // Homophone examples for there/their/they're
89        "There is a book on the table.",
90        "Their house is very beautiful.",
91        "They're going to the movies tonight.",
92        "There was a problem with the system.",
93        "Their car broke down yesterday.",
94        "They're planning a vacation next month.",
95        // Complement/compliment examples
96        "He received many compliments on his presentation.",
97        "She gave him a compliment about his new haircut.",
98        "Red and green are complementary colors.",
99        "This wine complements the meal perfectly.",
100        // Message examples
101        "I received your message yesterday.",
102        "Please send me a message when you arrive.",
103        "The message was unclear and confusing.",
104        "She left a message on my voicemail.",
105    ];
106
107    for example in &additional_examples {
108        corrector.add_trainingtext(example);
109    }
110
111    println!(
112        "Language model trained with {} words vocabulary\n",
113        corrector.vocabulary_size()
114    );
115}
116
117// Function to add specific words for consistent example behavior
118#[allow(dead_code)]
119fn add_example_words(corrector: &mut StatisticalCorrector) {
120    // Add specific words to the dictionary
121    let word_frequencies = [
122        // Common misspelled words
123        ("bank", 100),
124        ("river", 100),
125        ("deposit", 100),
126        ("money", 100),
127        ("received", 100),
128        ("message", 100),
129        ("meeting", 100),
130        ("compliments", 100),
131        ("complements", 100),
132        ("work", 100),
133        ("there", 100),
134        ("their", 100),
135        ("they're", 100),
136        ("was", 100),
137        ("problem", 100),
138        ("computer", 100),
139        ("museum", 100),
140        ("display", 100),
141        ("historical", 100),
142        ("artifacts", 100),
143    ];
144
145    for (word, freq) in &word_frequencies {
146        corrector.add_word(word, *freq);
147    }
148}
149
150// Compare dictionary-based and statistical spelling correction
151#[allow(dead_code)]
152fn compare_correctors(
153    dict_corrector: &DictionaryCorrector,
154    stat_corrector: &StatisticalCorrector,
155) -> Result<(), Box<dyn std::error::Error>> {
156    println!("=== Dictionary vs. Statistical Correction ===\n");
157
158    // Define test cases with known misspellings
159    let test_cases = [
160        ("recieve", "receive"),
161        ("mesage", "message"),
162        ("bnk", "bank"),
163        ("thier", "their"),
164        ("complements", "compliments"), // Can be correct in some contexts
165        ("artefacts", "artifacts"),
166        ("disply", "display"),
167        ("definately", "definitely"),
168    ];
169
170    println!(
171        "{:<15} {:<15} {:<15}",
172        "Misspelled", "Dictionary", "Statistical"
173    );
174    println!("{:-<45}", "");
175
176    for (misspelled, _expected) in &test_cases {
177        let dict_correction = dict_corrector.correct(misspelled)?;
178        let stat_correction = stat_corrector.correct(misspelled)?;
179
180        println!("{misspelled:<15} {dict_correction:<15} {stat_correction:<15}");
181    }
182
183    println!("\nDictionary sizes:");
184    println!(
185        "  - Dictionary _corrector: {} words",
186        dict_corrector.dictionary_size()
187    );
188    println!(
189        "  - Statistical _corrector: {} words (+ {} in language model)",
190        stat_corrector.dictionary_size(),
191        stat_corrector.vocabulary_size()
192    );
193
194    Ok(())
195}

Trait Implementations§

Source §

impl Clone for StatisticalCorrector

Source §

fn clone(&self) -> Self

Returns a duplicate of the value. Read more

1.0.0 · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more

Source §

impl Debug for StatisticalCorrector

Source §

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

Source §

impl Default for StatisticalCorrector

Source §

fn default() -> Self

Returns the “default value” for a type. Read more

Source §

impl SpellingCorrector for StatisticalCorrector

Source §

fn correct(&self, word: &str) -> Result<String>

Correct a potentially misspelled word

Source §

fn get_suggestions(&self, word: &str, limit: usize) -> Result<Vec<String>>

Get a list of suggestions for a potentially misspelled word

Source §

fn is_correct(&self, word: &str) -> bool

Check if a word is spelled correctly

Source §

fn correcttext(&self, text: &str) -> Result<String>

Correct all words in a text

Auto Trait Implementations§

§

impl !UnwindSafe for StatisticalCorrector

Blanket Implementations§

Source §

impl<T> Any for T
where T: 'static + ?Sized,

Source §

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

Source §

impl<T> Borrow<T> for T
where T: ?Sized,

Source §

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

Source §

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source §

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

Source §

impl<T> CloneToUninit for T
where T: Clone,

Source §

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬This is a nightly-only experimental API. (clone_to_uninit)

Performs copy-assignment from self to dest. Read more

Source §

impl<T> From<T> for T

Source §

fn from(t: T) -> T

Returns the argument unchanged.

Source §

impl<T, U> Into<U> for T
where U: From<T>,

Source §

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source §

impl<T> IntoEither for T

Source §

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §