use super::*;
use std::collections::HashSet;
/// A single confusable digit (`1`) in "He1lo" should produce the suggestion
/// "Hello", tagged as a character substitution with confidence 0.8.
#[test]
fn test_character_substitution_single_character() {
    let post_processor = OcrPostProcessor::new();
    let candidates = post_processor.generate_suggestions("He1lo");

    assert!(
        !candidates.is_empty(),
        "Should generate suggestions for 'He1lo'"
    );

    // Look the expected correction up once and reuse it for every check below.
    let hello = candidates
        .iter()
        .find(|candidate| candidate.corrected_word == "Hello");
    assert!(
        hello.is_some(),
        "Should suggest 'Hello' from 'He1lo'. Got: {:?}",
        candidates
            .iter()
            .map(|candidate| &candidate.corrected_word)
            .collect::<Vec<_>>()
    );

    let hello = hello.expect("Hello should exist");
    assert_eq!(
        hello.correction_type,
        CorrectionType::CharacterSubstitution
    );
    assert_eq!(hello.correction_confidence, 0.8);
}
/// With two confusable `1`s in "He11o", a suggestion that fixes either one of
/// them individually is enough for the test to pass.
#[test]
fn test_character_substitution_multiple_instances() {
    let post_processor = OcrPostProcessor::new();
    let candidates = post_processor.generate_suggestions("He11o");

    // Either single-position fix satisfies the expectation.
    let fixes_one_digit = candidates.iter().any(|candidate| {
        candidate.corrected_word == "Hel1o" || candidate.corrected_word == "He1lo"
    });

    assert!(
        fixes_one_digit,
        "Should suggest corrections for '1' -> 'l'. Got: {:?}",
        candidates
            .iter()
            .map(|candidate| &candidate.corrected_word)
            .collect::<Vec<_>>()
    );
}
/// "abc" must not produce any suggestion of type `CharacterSubstitution`.
#[test]
fn test_character_substitution_no_confusables() {
    let post_processor = OcrPostProcessor::new();
    let candidates = post_processor.generate_suggestions("abc");

    // Gather only the character-substitution suggestions; other kinds are ignored.
    let mut char_subs = Vec::new();
    for candidate in candidates.iter() {
        if candidate.correction_type == CorrectionType::CharacterSubstitution {
            char_subs.push(candidate);
        }
    }

    assert!(
        char_subs.is_empty(),
        "Should not generate character substitutions for 'abc'. Got: {:?}",
        char_subs
    );
}
/// "infornation" should yield "infomation" (the `rn` pair read as `m`), tagged
/// as a pattern correction with confidence 0.85.
#[test]
fn test_pattern_correction_rn_to_m() {
    let post_processor = OcrPostProcessor::new();
    let candidates = post_processor.generate_suggestions("infornation");

    // Single lookup, reused for the presence check and the field checks.
    let merged = candidates
        .iter()
        .find(|candidate| candidate.corrected_word == "infomation");
    assert!(
        merged.is_some(),
        "Should suggest 'infomation' from 'infornation'. Got: {:?}",
        candidates
            .iter()
            .map(|candidate| &candidate.corrected_word)
            .collect::<Vec<_>>()
    );

    let merged = merged.expect("infomation should exist");
    assert_eq!(merged.correction_type, CorrectionType::PatternCorrection);
    assert_eq!(merged.correction_confidence, 0.85);
}
/// "olcl" should yield "old" (the `cl` pair read as `d`).
#[test]
fn test_pattern_correction_cl_to_d() {
    let post_processor = OcrPostProcessor::new();
    let candidates = post_processor.generate_suggestions("olcl");

    // Every corrected word on offer, used for both the check and the message.
    let offered: Vec<_> = candidates
        .iter()
        .map(|candidate| &candidate.corrected_word)
        .collect();

    assert!(
        offered.iter().any(|word| *word == "old"),
        "Should suggest 'old' from 'olcl'. Got: {:?}",
        offered
    );
}
/// When the pattern occurs twice ("infornrnation"), one pattern correction
/// should replace both occurrences, giving "infommation".
#[test]
fn test_pattern_correction_multiple_occurrences() {
    let post_processor = OcrPostProcessor::new();
    let candidates = post_processor.generate_suggestions("infornrnation");

    // Only pattern corrections are relevant here.
    let pattern_words: Vec<_> = candidates
        .iter()
        .filter(|candidate| candidate.correction_type == CorrectionType::PatternCorrection)
        .map(|candidate| &candidate.corrected_word)
        .collect();

    let replaced_everywhere = pattern_words
        .iter()
        .any(|word| word.as_str() == "infommation");
    assert!(
        replaced_everywhere,
        "Should replace all 'rn' with 'm', producing 'infommation'. Got: {:?}",
        pattern_words
    );
}
/// A word that is already in the dictionary must not receive any
/// `DictionaryCorrection` suggestions.
#[test]
fn test_dictionary_correction_exact_match() {
    let dictionary = HashSet::from([String::from("hello")]);
    let post_processor = OcrPostProcessor::new().with_dictionary(dictionary);

    let candidates = post_processor.generate_suggestions("hello");

    // Keep only dictionary-based corrections; other kinds are not under test.
    let mut dict_corrections = Vec::new();
    for candidate in candidates.iter() {
        if candidate.correction_type == CorrectionType::DictionaryCorrection {
            dict_corrections.push(candidate);
        }
    }

    assert!(
        dict_corrections.is_empty(),
        "Valid dictionary word should have no dictionary corrections. Got: {:?}",
        dict_corrections
    );
}
/// "helo" is one edit away from the dictionary word "hello"; the suggestion
/// must be present with a confidence within 0.01 of 0.72.
#[test]
fn test_dictionary_correction_edit_distance_1() {
    let dictionary = HashSet::from([String::from("hello"), String::from("world")]);
    let post_processor = OcrPostProcessor::new().with_dictionary(dictionary);

    let candidates = post_processor.generate_suggestions("helo");

    // Single lookup, reused for the presence check and the confidence check.
    let hello = candidates
        .iter()
        .find(|candidate| candidate.corrected_word == "hello");
    assert!(
        hello.is_some(),
        "Should suggest 'hello' for 'helo' (edit distance 1). Got: {:?}",
        candidates
            .iter()
            .map(|candidate| &candidate.corrected_word)
            .collect::<Vec<_>>()
    );

    let confidence = hello.expect("hello should exist").correction_confidence;
    // Compared with a tolerance rather than exact float equality.
    let deviation = (confidence - 0.72).abs();
    assert!(
        deviation < 0.01,
        "Confidence should be 0.72, got: {}",
        confidence
    );
}
/// "heo" is two edits away from "hello" and should still be suggested.
#[test]
fn test_dictionary_correction_edit_distance_2() {
    let dictionary = HashSet::from([String::from("hello")]);
    let post_processor = OcrPostProcessor::new().with_dictionary(dictionary);

    let candidates = post_processor.generate_suggestions("heo");

    // Every corrected word on offer, used for both the check and the message.
    let offered: Vec<_> = candidates
        .iter()
        .map(|candidate| &candidate.corrected_word)
        .collect();

    assert!(
        offered.iter().any(|word| *word == "hello"),
        "Should suggest 'hello' for 'heo' (edit distance 2). Got: {:?}",
        offered
    );
}
/// "hi" is too far from "hello" (edit distance greater than 2), so "hello"
/// must not appear among the suggestions.
#[test]
fn test_dictionary_correction_exceeds_max_distance() {
    let dictionary = HashSet::from([String::from("hello")]);
    let post_processor = OcrPostProcessor::new().with_dictionary(dictionary);

    let candidates = post_processor.generate_suggestions("hi");

    // Every corrected word on offer, used for both the check and the message.
    let offered: Vec<_> = candidates
        .iter()
        .map(|candidate| &candidate.corrected_word)
        .collect();

    assert!(
        offered.iter().all(|word| *word != "hello"),
        "Should NOT suggest 'hello' for 'hi' (edit distance > 2). Got: {:?}",
        offered
    );
}
/// The edit distance between a string and itself is zero, including for the
/// empty string and a single character.
#[test]
fn test_edit_distance_identical() {
    let processor = OcrPostProcessor::new();

    for word in ["hello", "", "a"] {
        assert_eq!(processor.edit_distance(word, word), 0);
    }
}
/// Replacing one character costs exactly one edit.
#[test]
fn test_edit_distance_substitution() {
    let processor = OcrPostProcessor::new();
    let distance = |from: &str, to: &str| processor.edit_distance(from, to);

    assert_eq!(distance("hello", "hallo"), 1);
    assert_eq!(distance("cat", "bat"), 1);
}
/// Adding one character (at the end or in the middle) costs exactly one edit.
#[test]
fn test_edit_distance_insertion() {
    let processor = OcrPostProcessor::new();
    let distance = |from: &str, to: &str| processor.edit_distance(from, to);

    assert_eq!(distance("hello", "helloo"), 1);
    assert_eq!(distance("cat", "cart"), 1);
}
/// Dropping one character costs exactly one edit.
#[test]
fn test_edit_distance_deletion() {
    let processor = OcrPostProcessor::new();
    let distance = |from: &str, to: &str| processor.edit_distance(from, to);

    assert_eq!(distance("hello", "helo"), 1);
    assert_eq!(distance("cart", "cat"), 1);
}
/// Against an empty string, the distance equals the length of the other
/// string; two empty strings are at distance zero.
#[test]
fn test_edit_distance_empty_strings() {
    let processor = OcrPostProcessor::new();
    let empty = "";

    assert_eq!(processor.edit_distance(empty, empty), 0);
    assert_eq!(processor.edit_distance("hello", empty), 5);
    assert_eq!(processor.edit_distance(empty, "world"), 5);
}
/// The test expects "kitten" -> "sitting" to have distance 3.
#[test]
fn test_edit_distance_complex() {
    let processor = OcrPostProcessor::new();

    let distance = processor.edit_distance("kitten", "sitting");

    assert_eq!(distance, 3);
}
/// Boundary behaviour of `contains_point` for `OcrRegion::new(100, 100, 50, 50)`:
/// the test expects coordinates 100..=149 to be inside and 99 / 150 to be outside
/// on each axis.
#[test]
fn test_region_contains_point_boundaries() {
    let region = OcrRegion::new(100, 100, 50, 50);

    // Expected inside: the near corner, an interior point, and the last
    // coordinate before the far edge.
    assert!(region.contains_point(100, 100));
    assert!(region.contains_point(101, 101));
    assert!(region.contains_point(149, 149));

    // Expected outside: one step before the near edge on either axis...
    assert!(!region.contains_point(99, 100));
    assert!(!region.contains_point(100, 99));

    // ...and exactly on the far edge on either axis, or on both at once.
    assert!(!region.contains_point(150, 100));
    assert!(!region.contains_point(100, 150));
    assert!(!region.contains_point(150, 150));
}
/// A region built with 0 for its last two arguments is expected to contain no
/// point, not even the point matching its first two arguments.
#[test]
fn test_region_zero_dimensions() {
    let empty_region = OcrRegion::new(100, 100, 0, 0);

    let contains_own_corner = empty_region.contains_point(100, 100);
    let contains_neighbour = empty_region.contains_point(99, 99);

    assert!(!contains_own_corner);
    assert!(!contains_neighbour);
}
/// Two regions that merely touch at (100, 100) must not be reported as
/// overlapping, whichever of the two the check is called on.
#[test]
fn test_region_overlaps_edge_touching() {
    // Assuming the arguments are (x, y, width, height), the first region ends
    // exactly where the second one begins.
    let region1 = OcrRegion::new(0, 0, 100, 100);
    let region2 = OcrRegion::new(100, 100, 100, 100);

    assert!(!region1.overlaps_with(&region2));
    assert!(!region2.overlaps_with(&region1));
}
/// Growing the first region by one unit (101 instead of 100) makes it share
/// the point (100, 100) with the second region, so the two must now be
/// reported as overlapping, whichever of the two the check is called on.
#[test]
fn test_region_overlaps_single_pixel() {
    let region1 = OcrRegion::new(0, 0, 101, 101);
    let region2 = OcrRegion::new(100, 100, 100, 100);

    assert!(region1.overlaps_with(&region2));
    assert!(region2.overlaps_with(&region1));
}
/// A word with confidence 0.7 is expected to count as low-confidence only for
/// thresholds strictly above 0.7.
#[test]
fn test_word_confidence_boundary_threshold() {
    let word = WordConfidence::new(String::from("test"), 0.7, 0.0, 40.0);

    let low_at_threshold = word.is_low_confidence(0.7);
    let low_just_below = word.is_low_confidence(0.6999);
    let low_just_above = word.is_low_confidence(0.7001);

    assert!(!low_at_threshold);
    assert!(!low_just_below);
    assert!(low_just_above);
}
/// With an empty word-confidence list, the average is expected to be
/// `Some(0.0)`.
#[test]
fn test_fragment_average_confidence_empty_words() {
    let no_words = Vec::new();
    let fragment = OcrTextFragment::with_word_confidences(
        String::from("text"),
        0.0,
        0.0,
        100.0,
        20.0,
        0.8,
        12.0,
        FragmentType::Word,
        no_words,
    );

    let average = fragment.average_word_confidence();

    assert_eq!(average, Some(0.0));
}
/// With exactly one word, the average equals that word's confidence.
#[test]
fn test_fragment_average_confidence_single_word() {
    let only_word = WordConfidence::new(String::from("only"), 0.95, 0.0, 40.0);
    let fragment = OcrTextFragment::with_word_confidences(
        String::from("only"),
        0.0,
        0.0,
        40.0,
        20.0,
        0.95,
        12.0,
        FragmentType::Word,
        vec![only_word],
    );

    let average = fragment.average_word_confidence();

    assert_eq!(average, Some(0.95));
}
/// Even when the dictionary holds many near matches, at most five suggestions
/// may be returned.
#[test]
fn test_suggestion_limit_enforced() {
    // Twenty dictionary words "wora" through "wort", built from a char range
    // instead of casting a char literal and a loop index to `u8`.
    let dict: HashSet<String> = ('a'..='t')
        .map(|letter| format!("wor{}", letter))
        .collect();
    let processor = OcrPostProcessor::new().with_dictionary(dict);

    let suggestions = processor.generate_suggestions("word");

    assert!(
        suggestions.len() <= 5,
        "Should limit to 5 suggestions, got: {}",
        suggestions.len()
    );
}
/// Suggestions must be ordered by confidence, highest first (ties allowed).
#[test]
fn test_suggestion_sorting_by_confidence() {
    let mut dict = HashSet::new();
    dict.insert("hello".to_string());
    dict.insert("halo".to_string());
    let processor = OcrPostProcessor::new().with_dictionary(dict);

    let suggestions = processor.generate_suggestions("helo");

    // Fail with a clear message on an empty result, instead of the `len() - 1`
    // underflow a hand-written index range would hit.
    assert!(
        !suggestions.is_empty(),
        "Expected at least one suggestion for 'helo'"
    );

    // Compare each suggestion with its successor; `windows(2)` yields nothing
    // when there is only one suggestion.
    for pair in suggestions.windows(2) {
        assert!(
            pair[0].correction_confidence >= pair[1].correction_confidence,
            "Suggestions should be sorted by confidence. Got: {:?}",
            suggestions
                .iter()
                .map(|s| (s.corrected_word.clone(), s.correction_confidence))
                .collect::<Vec<_>>()
        );
    }
}
/// The report of a fragment built without word data must show the confidence
/// as a percentage, the fragment text, and a notice that word-level data is
/// missing.
#[test]
fn test_confidence_report_format_without_words() {
    let fragment = OcrTextFragment::new(
        String::from("test text"),
        0.0,
        0.0,
        100.0,
        20.0,
        0.85,
        12.0,
        FragmentType::Line,
    );
    let report = fragment.confidence_report();

    // Each entry pairs a required substring with the start of its failure
    // message; the report itself is appended to every message.
    let expectations = [
        ("85.0%", "Should show 85.0% confidence."),
        ("test text", "Should show fragment text."),
        (
            "(No word-level data available)",
            "Should indicate no word data.",
        ),
    ];
    for (needle, explanation) in expectations {
        assert!(report.contains(needle), "{} Got: {}", explanation, report);
    }
}
/// With word data present, the report must show the fragment confidence, the
/// word count, and each word with its own confidence percentage.
#[test]
fn test_confidence_report_format_with_words() {
    let words = vec![
        WordConfidence::new(String::from("first"), 0.9, 0.0, 40.0),
        WordConfidence::new(String::from("second"), 0.8, 45.0, 50.0),
    ];
    let fragment = OcrTextFragment::with_word_confidences(
        String::from("first second"),
        0.0,
        0.0,
        95.0,
        20.0,
        0.85,
        12.0,
        FragmentType::Line,
        words,
    );
    let report = fragment.confidence_report();

    // Required substrings, each with the message shown when it is missing.
    let expectations = [
        ("85.0%", "Should show fragment confidence"),
        ("2 words", "Should show word count"),
        ("\"first\"", "Should show first word"),
        ("90.0%", "Should show first word confidence"),
        ("\"second\"", "Should show second word"),
        ("80.0%", "Should show second word confidence"),
    ];
    for (needle, explanation) in expectations {
        assert!(report.contains(needle), "{}", explanation);
    }
}
/// When a word carries per-character confidences, the report must list each
/// character along with its confidence as a whole-number percentage.
#[test]
fn test_confidence_report_with_character_level() {
    let characters = vec![
        CharacterConfidence::new('h', 0.95, 0.0, 10.0),
        CharacterConfidence::new('i', 0.92, 10.0, 8.0),
    ];
    let word = WordConfidence::with_characters(String::from("hi"), 0.935, 0.0, 18.0, characters);
    let fragment = OcrTextFragment::with_word_confidences(
        String::from("hi"),
        0.0,
        0.0,
        18.0,
        20.0,
        0.935,
        12.0,
        FragmentType::Word,
        vec![word],
    );
    let report = fragment.confidence_report();

    // Required substrings, each with the message shown when it is missing.
    let expectations = [
        ("'h'", "Should show character 'h'"),
        ("'i'", "Should show character 'i'"),
        ("95%", "Should show 'h' confidence (rounded)"),
        ("92%", "Should show 'i' confidence (rounded)"),
    ];
    for (needle, explanation) in expectations {
        assert!(report.contains(needle), "{}", explanation);
    }
}
/// `words_by_confidence` must return the words ordered from lowest to highest
/// confidence.
#[test]
fn test_words_by_confidence_sorted_ascending() {
    // Deliberately supplied out of order: high, low, medium.
    let words = vec![
        WordConfidence::new(String::from("high"), 0.95, 0.0, 40.0),
        WordConfidence::new(String::from("low"), 0.45, 45.0, 30.0),
        WordConfidence::new(String::from("medium"), 0.75, 80.0, 60.0),
    ];
    let fragment = OcrTextFragment::with_word_confidences(
        String::from("high low medium"),
        0.0,
        0.0,
        140.0,
        20.0,
        0.72,
        12.0,
        FragmentType::Line,
        words,
    );

    let sorted = fragment.words_by_confidence();
    assert_eq!(sorted.len(), 3);

    // Flatten to (word, confidence) pairs so the whole ordering is checked at once.
    let ranking: Vec<_> = sorted
        .iter()
        .map(|entry| (entry.word.as_str(), entry.confidence))
        .collect();
    assert_eq!(ranking, [("low", 0.45), ("medium", 0.75), ("high", 0.95)]);
}
/// When every word has the same confidence, all of them must still be
/// returned, each with that confidence; their relative order is not checked.
#[test]
fn test_words_by_confidence_equal_confidences() {
    let words = vec![
        WordConfidence::new(String::from("first"), 0.8, 0.0, 40.0),
        WordConfidence::new(String::from("second"), 0.8, 45.0, 50.0),
        WordConfidence::new(String::from("third"), 0.8, 100.0, 50.0),
    ];
    let fragment = OcrTextFragment::with_word_confidences(
        String::from("first second third"),
        0.0,
        0.0,
        150.0,
        20.0,
        0.8,
        12.0,
        FragmentType::Line,
        words,
    );

    let sorted = fragment.words_by_confidence();

    assert_eq!(sorted.len(), 3);
    let any_mismatch = sorted.iter().any(|entry| entry.confidence != 0.8);
    assert!(!any_mismatch);
}
/// Only words below the 0.7 threshold become correction candidates, and each
/// candidate keeps its word, its confidence and its index within the fragment.
#[test]
fn test_correction_candidates_respects_threshold() {
    let words = vec![
        WordConfidence::new(String::from("perfect"), 0.99, 0.0, 70.0),
        WordConfidence::new(String::from("good"), 0.85, 75.0, 40.0),
        WordConfidence::new(String::from("okay"), 0.69, 120.0, 40.0),
        WordConfidence::new(String::from("bad"), 0.50, 165.0, 30.0),
    ];
    let fragment = OcrTextFragment::with_word_confidences(
        String::from("perfect good okay bad"),
        0.0,
        0.0,
        200.0,
        20.0,
        0.76,
        12.0,
        FragmentType::Line,
        words,
    );

    let candidates = fragment.get_correction_candidates(0.7);
    assert_eq!(candidates.len(), 2, "Should have 2 candidates below 0.7");

    // The length check above makes both indices safe.
    let (okay, bad) = (&candidates[0], &candidates[1]);

    assert_eq!(okay.word, "okay");
    assert_eq!(okay.confidence, 0.69);
    assert_eq!(okay.position_in_fragment, 2);

    assert_eq!(bad.word, "bad");
    assert_eq!(bad.confidence, 0.50);
    assert_eq!(bad.position_in_fragment, 3);
}
/// Candidates must report the index of their word within the fragment's full
/// word list, not their index within the candidate list.
#[test]
fn test_correction_candidates_position_tracking() {
    // The words at indices 1 and 3 are the ones below the 0.7 threshold.
    let words = vec![
        WordConfidence::new(String::from("word0"), 0.9, 0.0, 50.0),
        WordConfidence::new(String::from("word1"), 0.6, 55.0, 50.0),
        WordConfidence::new(String::from("word2"), 0.9, 110.0, 50.0),
        WordConfidence::new(String::from("word3"), 0.5, 165.0, 50.0),
    ];
    let fragment = OcrTextFragment::with_word_confidences(
        String::from("word0 word1 word2 word3"),
        0.0,
        0.0,
        220.0,
        20.0,
        0.73,
        12.0,
        FragmentType::Line,
        words,
    );

    let candidates = fragment.get_correction_candidates(0.7);
    let position_of = |index: usize| candidates[index].position_in_fragment;

    assert_eq!(position_of(0), 1, "word1 is at position 1");
    assert_eq!(position_of(1), 3, "word3 is at position 3");
}