statistical_spelling_demo/
statistical_spelling_demo.rs

1// Statistical spelling correction demo
2//
3// This example demonstrates the enhanced statistical spelling correction
4// functionality in the scirs2-text crate, including context-aware correction.
5
6use scirs2_text::{
7    DictionaryCorrector, ErrorModel, SpellingCorrector, StatisticalCorrector,
8    StatisticalCorrectorConfig,
9};
10use std::time::Instant;
11
/// Demo input with deliberate errors: misspellings (`bnk`, `recieved`, `mesage`,
/// `disply`, `artefcts`) and word-choice mistakes (`complements`, swapped
/// `Their`/`there`). The typo `bnk` appears once in a financial sentence and once
/// in a riverside sentence.
///
/// Used by `context_aware_correction_demo` and, repeated, by `performance_test`.
const TEXT_WITH_CONTEXT_MISSPELLINGS: &str =
    "I went to the bnk to deposit some money. The river bnk was muddy after the rain. \
    I recieved your mesage about the meeting. He recieved many complements on his work. \
    Their was a problem with there computer. The museum disply had many historical artefcts.";

/// The intended correction of [`TEXT_WITH_CONTEXT_MISSPELLINGS`].
///
/// `context_aware_correction_demo` prints it next to the corrector's actual output
/// for visual comparison; nothing in this file asserts that the two are equal.
const EXPECTED_CORRECTED_TEXT: &str =
    "I went to the bank to deposit some money. The river bank was muddy after the rain. \
    I received your message about the meeting. He received many compliments on his work. \
    There was a problem with their computer. The museum display had many historical artifacts.";

/// Base corpus that `train_language_model` passes to `add_trainingtext`.
///
/// It contains the correctly spelled forms of the demo words in sentences that
/// mirror the ones in [`TEXT_WITH_CONTEXT_MISSPELLINGS`].
const SAMPLE_TRAINING_TEXT: &str =
    "I went to the bank to deposit some money yesterday. The bank offers good interest rates. \
    The river bank was muddy after the rain. We sat on the bank of the river and watched the sunset. \
    I received your message about the meeting. Thank you for the message you sent. \
    He received many compliments on his work. The teacher gave compliments to the students. \
    There was a problem with their computer. Their car broke down on the highway. \
    The museum display had many historical artifacts. The ancient artifacts were well preserved. \
    The display was impressive and educational.";
33
34#[allow(dead_code)]
35fn main() -> Result<(), Box<dyn std::error::Error>> {
36    println!("Statistical Spelling Correction Demo\n");
37
38    // Create a dictionary corrector as baseline
39    let dict_corrector = DictionaryCorrector::default();
40
41    // Create a statistical corrector
42    let mut stat_corrector = StatisticalCorrector::default();
43
44    // Train the language model
45    train_language_model(&mut stat_corrector);
46
47    // Add specific words to ensure consistent behavior in examples
48    add_example_words(&mut stat_corrector);
49
50    // Compare dictionary and statistical correctors
51    compare_correctors(&dict_corrector, &stat_corrector)?;
52
53    // Demonstrate context-aware correction
54    context_aware_correction_demo(&stat_corrector)?;
55
56    // Performance test
57    performance_test(&dict_corrector, &stat_corrector)?;
58
59    // Configuration demo
60    configuration_demo()?;
61
62    // Noise model demo
63    noise_model_demo()?;
64
65    Ok(())
66}
67
68// Function to train the language model with sample text
69#[allow(dead_code)]
70fn train_language_model(corrector: &mut StatisticalCorrector) {
71    println!("Training language model with sample text...");
72
73    // Add sample training text
74    corrector.add_trainingtext(SAMPLE_TRAINING_TEXT);
75
76    // Add more specialized training examples for context disambiguation
77    let additional_examples = [
78        // Bank context examples
79        "I went to the bank to deposit money.",
80        "The bank is open until 5pm.",
81        "She works at the bank downtown.",
82        "I need to check my bank account.",
83        // River bank context examples
84        "We sat on the bank of the river.",
85        "The river bank was covered with flowers.",
86        "They fished from the bank of the lake.",
87        "The boat was tied to the bank.",
88        // Homophone examples for there/their/they're
89        "There is a book on the table.",
90        "Their house is very beautiful.",
91        "They're going to the movies tonight.",
92        "There was a problem with the system.",
93        "Their car broke down yesterday.",
94        "They're planning a vacation next month.",
95        // Complement/compliment examples
96        "He received many compliments on his presentation.",
97        "She gave him a compliment about his new haircut.",
98        "Red and green are complementary colors.",
99        "This wine complements the meal perfectly.",
100        // Message examples
101        "I received your message yesterday.",
102        "Please send me a message when you arrive.",
103        "The message was unclear and confusing.",
104        "She left a message on my voicemail.",
105    ];
106
107    for example in &additional_examples {
108        corrector.add_trainingtext(example);
109    }
110
111    println!(
112        "Language model trained with {} words vocabulary\n",
113        corrector.vocabulary_size()
114    );
115}
116
117// Function to add specific words for consistent example behavior
118#[allow(dead_code)]
119fn add_example_words(corrector: &mut StatisticalCorrector) {
120    // Add specific words to the dictionary
121    let word_frequencies = [
122        // Common misspelled words
123        ("bank", 100),
124        ("river", 100),
125        ("deposit", 100),
126        ("money", 100),
127        ("received", 100),
128        ("message", 100),
129        ("meeting", 100),
130        ("compliments", 100),
131        ("complements", 100),
132        ("work", 100),
133        ("there", 100),
134        ("their", 100),
135        ("they're", 100),
136        ("was", 100),
137        ("problem", 100),
138        ("computer", 100),
139        ("museum", 100),
140        ("display", 100),
141        ("historical", 100),
142        ("artifacts", 100),
143    ];
144
145    for (word, freq) in &word_frequencies {
146        corrector.add_word(word, *freq);
147    }
148}
149
150// Compare dictionary-based and statistical spelling correction
151#[allow(dead_code)]
152fn compare_correctors(
153    dict_corrector: &DictionaryCorrector,
154    stat_corrector: &StatisticalCorrector,
155) -> Result<(), Box<dyn std::error::Error>> {
156    println!("=== Dictionary vs. Statistical Correction ===\n");
157
158    // Define test cases with known misspellings
159    let test_cases = [
160        ("recieve", "receive"),
161        ("mesage", "message"),
162        ("bnk", "bank"),
163        ("thier", "their"),
164        ("complements", "compliments"), // Can be correct in some contexts
165        ("artefacts", "artifacts"),
166        ("disply", "display"),
167        ("definately", "definitely"),
168    ];
169
170    println!(
171        "{:<15} {:<15} {:<15}",
172        "Misspelled", "Dictionary", "Statistical"
173    );
174    println!("{:-<45}", "");
175
176    for (misspelled, _expected) in &test_cases {
177        let dict_correction = dict_corrector.correct(misspelled)?;
178        let stat_correction = stat_corrector.correct(misspelled)?;
179
180        println!("{misspelled:<15} {dict_correction:<15} {stat_correction:<15}");
181    }
182
183    println!("\nDictionary sizes:");
184    println!(
185        "  - Dictionary _corrector: {} words",
186        dict_corrector.dictionary_size()
187    );
188    println!(
189        "  - Statistical _corrector: {} words (+ {} in language model)",
190        stat_corrector.dictionary_size(),
191        stat_corrector.vocabulary_size()
192    );
193
194    Ok(())
195}
196
197// Demonstrate context-aware correction
198#[allow(dead_code)]
199fn context_aware_correction_demo(
200    corrector: &StatisticalCorrector,
201) -> Result<(), Box<dyn std::error::Error>> {
202    println!("\n=== Context-Aware Correction Demo ===\n");
203
204    println!("Original text with misspellings:");
205    println!("{TEXT_WITH_CONTEXT_MISSPELLINGS}\n");
206
207    // Correct the text
208    let correctedtext = corrector.correcttext(TEXT_WITH_CONTEXT_MISSPELLINGS)?;
209
210    println!("Corrected text:");
211    println!("{correctedtext}\n");
212
213    println!("Expected text:");
214    println!("{EXPECTED_CORRECTED_TEXT}\n");
215
216    // Compare specific correction examples
217    println!("Specific context examples:\n");
218
219    // Example 1: bnk -> bank in different contexts
220    let text1 = "I went to the bnk to deposit some money.";
221    let text2 = "The river bnk was muddy after the rain.";
222
223    println!("Example 1: 'bnk' in financial context");
224    println!("Before: {text1}");
225    println!("After:  {}\n", corrector.correcttext(text1)?);
226
227    println!("Example 2: 'bnk' in geographical context");
228    println!("Before: {text2}");
229    println!("After:  {}\n", corrector.correcttext(text2)?);
230
231    // Example 2: there/their homophone confusion
232    let text3 = "Their was a problem with the computer.";
233    let text4 = "There car broke down on the highway.";
234
235    println!("Example 3: 'their' used incorrectly");
236    println!("Before: {text3}");
237    println!("After:  {}\n", corrector.correcttext(text3)?);
238
239    println!("Example 4: 'there' used incorrectly");
240    println!("Before: {text4}");
241    println!("After:  {}\n", corrector.correcttext(text4)?);
242
243    Ok(())
244}
245
246// Test performance of different correctors and configurations
247#[allow(dead_code)]
248fn performance_test(
249    dict_corrector: &DictionaryCorrector,
250    stat_corrector: &StatisticalCorrector,
251) -> Result<(), Box<dyn std::error::Error>> {
252    println!("\n=== Performance Test ===\n");
253
254    // Create test text with a mix of correct and incorrect words
255    let testtext = TEXT_WITH_CONTEXT_MISSPELLINGS.repeat(10);
256
257    // Measure dictionary _corrector performance
258    let start = Instant::now();
259    let _ = dict_corrector.correcttext(&testtext)?;
260    let dict_time = start.elapsed();
261
262    // Measure statistical _corrector performance
263    let start = Instant::now();
264    let _ = stat_corrector.correcttext(&testtext)?;
265    let stat_time = start.elapsed();
266
267    // Create a non-contextual statistical _corrector for comparison
268    let non_context_config = StatisticalCorrectorConfig {
269        use_context: false,
270        ..Default::default()
271    };
272    let mut non_context_corrector = StatisticalCorrector::new(non_context_config);
273
274    // Add training data to ensure fair comparison
275    train_language_model(&mut non_context_corrector);
276    add_example_words(&mut non_context_corrector);
277
278    // Measure non-contextual statistical _corrector performance
279    let start = Instant::now();
280    let _ = non_context_corrector.correcttext(&testtext)?;
281    let non_context_time = start.elapsed();
282
283    println!(
284        "Performance comparison on text with {} characters:",
285        testtext.len()
286    );
287    println!("  - Dictionary corrector: {dict_time:?}");
288    println!("  - Statistical _corrector (without context): {non_context_time:?}");
289    println!("  - Statistical _corrector (with context): {stat_time:?}");
290
291    Ok(())
292}
293
294// Demonstrate different configurations for statistical correction
295#[allow(dead_code)]
296fn configuration_demo() -> Result<(), Box<dyn std::error::Error>> {
297    println!("\n=== Configuration Options Demo ===\n");
298
299    // Create configurations with different settings
300    let configs = [
301        ("Default", StatisticalCorrectorConfig::default()),
302        (
303            "Conservative (max_edit_distance=1)",
304            StatisticalCorrectorConfig {
305                max_edit_distance: 1,
306                ..Default::default()
307            },
308        ),
309        (
310            "Aggressive (max_edit_distance=3)",
311            StatisticalCorrectorConfig {
312                max_edit_distance: 3,
313                ..Default::default()
314            },
315        ),
316        (
317            "Language model focused (weight=0.9)",
318            StatisticalCorrectorConfig {
319                language_model_weight: 0.9,
320                edit_distance_weight: 0.1,
321                ..Default::default()
322            },
323        ),
324        (
325            "Edit distance focused (weight=0.9)",
326            StatisticalCorrectorConfig {
327                language_model_weight: 0.1,
328                edit_distance_weight: 0.9,
329                ..Default::default()
330            },
331        ),
332        (
333            "No context",
334            StatisticalCorrectorConfig {
335                use_context: false,
336                ..Default::default()
337            },
338        ),
339    ];
340
341    // Sample misspelled words with varied edit distances
342    let test_cases = [
343        "recieve",     // Should be "receive"
344        "accidant",    // Could be "accident" or "accidental" depending on max_edit_distance
345        "programing",  // Single-m, should be "programming"
346        "thier",       // Common misspelling of "their"
347        "complements", // Could be "compliments" depending on context
348    ];
349
350    // Test each configuration
351    for (name, config) in &configs {
352        let mut corrector = StatisticalCorrector::new(config.clone());
353
354        // Train the model to ensure consistent behavior
355        train_language_model(&mut corrector);
356        add_example_words(&mut corrector);
357
358        println!("{name} configuration:");
359        println!("  max_editdistance: {}", config.max_edit_distance);
360        println!("  language_modelweight: {}", config.language_model_weight);
361        println!("  edit_distanceweight: {}", config.edit_distance_weight);
362        println!("  usecontext: {}", config.use_context);
363
364        println!("\n  Correction examples:");
365        for word in &test_cases {
366            let corrected = corrector.correct(word)?;
367            println!("    {word} -> {corrected}");
368        }
369
370        // Show a context example if context is enabled
371        if config.use_context {
372            let context_example = "Going to the bnk to deposit money. The river bnk was muddy.";
373            let corrected = corrector.correcttext(context_example)?;
374            println!("\n  Context example:");
375            println!("    Before: {context_example}");
376            println!("    After:  {corrected}");
377        }
378
379        println!("\n{:-<60}", "");
380    }
381
382    Ok(())
383}
384
385// Demonstrate the error model (noisy channel model)
386#[allow(dead_code)]
387fn noise_model_demo() -> Result<(), Box<dyn std::error::Error>> {
388    println!("\n=== Error Model Demo ===\n");
389
390    // Create different error models with varying error type probabilities
391    let models = [
392        ("Default", ErrorModel::default()),
393        ("Deletion-heavy", ErrorModel::new(0.7, 0.1, 0.1, 0.1)),
394        ("Insertion-heavy", ErrorModel::new(0.1, 0.7, 0.1, 0.1)),
395        ("Substitution-heavy", ErrorModel::new(0.1, 0.1, 0.7, 0.1)),
396        ("Transposition-heavy", ErrorModel::new(0.1, 0.1, 0.1, 0.7)),
397    ];
398
399    // Test cases for different error types
400    let test_pairs = [
401        ("recieve", "receive"),        // Transposition (i and e)
402        ("acheive", "achieve"),        // Transposition (i and e)
403        ("languge", "language"),       // Deletion (missing 'a')
404        ("programing", "programming"), // Deletion (missing 'm')
405        ("probblem", "problem"),       // Insertion (extra 'b')
406        ("committe", "committee"),     // Insertion (missing 'e')
407        ("definately", "definitely"),  // Substitution ('a' instead of 'i')
408        ("seperate", "separate"),      // Substitution ('e' instead of 'a')
409    ];
410
411    // Test each error model
412    println!(
413        "{:<20} {:<12} {:<12} {:<12} {:<12} {:<12}",
414        "Model", "Delete Prob", "Insert Prob", "Subst Prob", "Transp Prob", "Example"
415    );
416    println!("{:-<80}", "");
417
418    for (name, model) in &models {
419        // Pick one example to show
420        let (typo, correct) = test_pairs[0];
421        let probability = model.error_probability(typo, correct);
422
423        println!(
424            "{:<20} {:<12.2} {:<12.2} {:<12.2} {:<12.2} {:<12.4}",
425            name,
426            model.p_deletion,
427            model.p_insertion,
428            model.p_substitution,
429            model.p_transposition,
430            probability
431        );
432    }
433
434    println!("\nError probabilities for different error types (using default model):");
435
436    let default_model = ErrorModel::default();
437
438    for (typo, correct) in &test_pairs {
439        let prob = default_model.error_probability(typo, correct);
440        println!("{typo:<12} -> {correct:<12}: {prob:.6}");
441    }
442
443    println!("\nImpact on correction with custom error model:");
444
445    // Create a statistical corrector with a custom error model
446    let custom_config = StatisticalCorrectorConfig {
447        language_model_weight: 0.3,
448        edit_distance_weight: 0.7,
449        ..Default::default()
450    };
451
452    let mut custom_corrector = StatisticalCorrector::new(custom_config);
453    train_language_model(&mut custom_corrector);
454    add_example_words(&mut custom_corrector);
455
456    // Create a transposition-heavy error model (good for common spelling errors)
457    let transposition_model = ErrorModel::new(0.1, 0.1, 0.1, 0.7);
458    custom_corrector.set_error_model(transposition_model);
459
460    // Test some examples
461    println!("\nCorrecting text with transposition-heavy error model:");
462    let testtext = "I recieved a mesage about thier acheivements.";
463    let corrected = custom_corrector.correcttext(testtext)?;
464
465    println!("Before: {testtext}");
466    println!("After:  {corrected}");
467
468    Ok(())
469}