use crate::layout::TextSpan;
/// Confidence attached to a decoded character: a numeric score plus the
/// reason that score was assigned.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct CharacterConfidence {
/// Numeric confidence. The constructors in this file produce values in
/// `0.0..=1.0`, with larger values meaning more trust (`mapped()` yields
/// 0.95, `unmapped()` yields 0.3).
pub score: f32,
/// Category explaining why `score` was assigned.
pub reason: ConfidenceReason,
}
/// Category explaining how a [`CharacterConfidence`] score was derived.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ConfidenceReason {
/// Reason used by `CharacterConfidence::mapped()`.
/// NOTE(review): presumably the character was resolved through a font's
/// ToUnicode mapping — confirm against the producer of these values.
MappedByToUnicode,
/// Reason used by `CharacterConfidence::standard_encoding()`.
StandardEncoding,
/// Not produced anywhere in the visible code.
/// NOTE(review): presumably a font-specific encoding table — confirm.
FontEncoding,
/// Not produced anywhere in the visible code.
/// NOTE(review): presumably a guess based on font hints — confirm.
FontHintFallback,
/// Reason used by `CharacterConfidence::unmapped()`; entries carrying it
/// are excluded from the mapped count in `NonTextDetector::analyze_sequence`.
Unmapped,
/// Reason used by `CharacterConfidence::suspicious()` and by
/// `compute_sequence_confidence` when the font name looks like a
/// symbol/dingbat font.
SuspiciousContext,
}
impl CharacterConfidence {
    /// Builds the confidence tagged [`ConfidenceReason::MappedByToUnicode`]
    /// (fixed score of 0.95).
    pub fn mapped() -> Self {
        let reason = ConfidenceReason::MappedByToUnicode;
        let score = 0.95;
        Self { score, reason }
    }

    /// Builds the confidence tagged [`ConfidenceReason::StandardEncoding`]
    /// (fixed score of 0.9).
    pub fn standard_encoding() -> Self {
        let reason = ConfidenceReason::StandardEncoding;
        let score = 0.9;
        Self { score, reason }
    }

    /// Builds the confidence tagged [`ConfidenceReason::Unmapped`]
    /// (fixed score of 0.3).
    pub fn unmapped() -> Self {
        let reason = ConfidenceReason::Unmapped;
        let score = 0.3;
        Self { score, reason }
    }

    /// Builds a confidence tagged [`ConfidenceReason::SuspiciousContext`].
    ///
    /// The caller-supplied `score` is clamped into `0.0..=1.0` before being
    /// stored.
    pub fn suspicious(score: f32) -> Self {
        let reason = ConfidenceReason::SuspiciousContext;
        let score = score.clamp(0.0, 1.0);
        Self { score, reason }
    }
}
/// Summary produced by `NonTextDetector::analyze_sequence`.
///
/// The `Default` value (all counters zero, `likely_non_text == false`) is what
/// `analyze_sequence` returns when the text is too short to analyse.
#[derive(Debug, Clone, Default)]
pub struct NonTextStats {
/// Length of the analysed text as measured by `analyze_sequence`.
pub total_chars: usize,
/// Number of confidence entries whose reason is not `Unmapped`.
pub mapped_chars: usize,
/// Portion of `total_chars` not accounted for by `mapped_chars`.
pub unmapped_chars: usize,
/// Mean of the supplied confidence scores; `0.0` when none were supplied.
pub avg_confidence: f32,
/// `unmapped_chars` divided by `total_chars`.
pub unmapped_ratio: f32,
/// Final verdict: `true` when the sequence was judged to be non-text.
pub likely_non_text: bool,
}
/// Heuristic detector for character sequences that are probably not real text.
#[derive(Debug, Clone)]
pub struct NonTextDetector {
/// A sequence is flagged when its unmapped ratio is strictly greater than
/// this value.
pub unmapped_threshold: f32,
/// A sequence is flagged when its average confidence is strictly less than
/// this value.
pub confidence_threshold: f32,
/// Sequences shorter than this are not analysed by `analyze_sequence`;
/// default statistics are returned for them instead.
pub min_sequence_length: usize,
}
impl Default for NonTextDetector {
    /// Returns a detector with the stock settings: flag when more than half of
    /// the sequence is unmapped or the mean confidence drops below 0.4, and
    /// skip sequences shorter than 10.
    fn default() -> Self {
        let unmapped_threshold = 0.5;
        let confidence_threshold = 0.4;
        let min_sequence_length = 10;
        Self {
            unmapped_threshold,
            confidence_threshold,
            min_sequence_length,
        }
    }
}
impl NonTextDetector {
    /// Creates a detector configured with the `Default` thresholds.
    pub fn new() -> Self {
        Self::default()
    }

    /// Computes mapping statistics for `text` and decides whether it is
    /// probably not real text.
    ///
    /// * `text` - the decoded character sequence.
    /// * `confidences` - confidence entries for the sequence; entries whose
    ///   reason is [`ConfidenceReason::Unmapped`] are treated as unmapped.
    /// * `font_name` - name of the font the sequence was set in.
    ///
    /// Returns `NonTextStats::default()` when `text` is empty or has fewer
    /// than `min_sequence_length` characters. Otherwise the sequence is
    /// flagged (`likely_non_text`) when any of the following holds: the
    /// unmapped ratio exceeds `unmapped_threshold`, the mean confidence is
    /// below `confidence_threshold`, or the font name matches a known
    /// symbol/dingbat font.
    pub fn analyze_sequence(
        &self,
        text: &str,
        confidences: &[CharacterConfidence],
        font_name: &str,
    ) -> NonTextStats {
        // Count Unicode scalar values rather than UTF-8 bytes: `str::len`
        // would inflate the total for multi-byte text and make fully mapped
        // non-ASCII sequences look mostly unmapped.
        let total_chars = text.chars().count();

        // The explicit zero check keeps the ratio below from becoming 0/0 when
        // `min_sequence_length` is configured as 0.
        if total_chars == 0 || total_chars < self.min_sequence_length {
            return NonTextStats::default();
        }

        let mapped_chars = confidences
            .iter()
            .filter(|c| c.reason != ConfidenceReason::Unmapped)
            .count();

        // Characters without a mapped confidence entry count as unmapped.
        // Saturate so that a caller passing more entries than characters
        // cannot trigger an integer underflow.
        let unmapped_chars = total_chars.saturating_sub(mapped_chars);
        let unmapped_ratio = unmapped_chars as f32 / total_chars as f32;

        // With no confidence entries the mean is defined as 0.0, which falls
        // below any positive `confidence_threshold`.
        let avg_confidence = if confidences.is_empty() {
            0.0
        } else {
            confidences.iter().map(|c| c.score).sum::<f32>() / confidences.len() as f32
        };

        let likely_non_text = unmapped_ratio > self.unmapped_threshold
            || avg_confidence < self.confidence_threshold
            || self.is_diagram_font(font_name);

        NonTextStats {
            total_chars,
            mapped_chars,
            unmapped_chars,
            avg_confidence,
            unmapped_ratio,
            likely_non_text,
        }
    }

    /// Returns `true` when the lower-cased `font_name` contains one of the
    /// known symbol/dingbat font name fragments.
    fn is_diagram_font(&self, font_name: &str) -> bool {
        const DIAGRAM_FONT_PATTERNS: [&str; 6] = [
            "symbol",
            "wingdings",
            "webdings",
            "zapf dingbats",
            "dingbats",
            "mathematical alphanumeric",
        ];

        let name_lower = font_name.to_lowercase();
        DIAGRAM_FONT_PATTERNS
            .iter()
            .any(|&pattern| name_lower.contains(pattern))
    }

    /// Classifies every span in `spans`, returning one [`SpanClassification`]
    /// per input span in the same order.
    ///
    /// A span is flagged as non-text when more than 30% of its characters are
    /// non-ASCII or when `has_suspicious_patterns` reports its text as
    /// suspicious. Flagged spans get confidence 0.6, all others 0.9.
    pub fn mark_non_text_spans(&self, spans: &[TextSpan]) -> Vec<SpanClassification> {
        const NON_ASCII_LIMIT: f32 = 0.3;
        const FLAGGED_CONFIDENCE: f32 = 0.6;
        const PLAIN_CONFIDENCE: f32 = 0.9;

        spans
            .iter()
            .enumerate()
            .map(|(idx, span)| {
                // Both numerator and denominator are character counts; using
                // the byte length as the denominator would make the ratio
                // depend on how many bytes each character happens to encode to.
                let char_count = span.text.chars().count();
                let non_ascii_count = span.text.chars().filter(|c| !c.is_ascii()).count();
                // `max(1)` avoids dividing by zero for empty spans.
                let non_ascii_ratio = non_ascii_count as f32 / char_count.max(1) as f32;

                let is_likely_non_text =
                    non_ascii_ratio > NON_ASCII_LIMIT || has_suspicious_patterns(&span.text);

                SpanClassification {
                    span_index: idx,
                    span: span.clone(),
                    is_non_text: is_likely_non_text,
                    confidence: if is_likely_non_text {
                        FLAGGED_CONFIDENCE
                    } else {
                        PLAIN_CONFIDENCE
                    },
                }
            })
            .collect()
    }
}
/// Result of classifying one span in `NonTextDetector::mark_non_text_spans`.
#[derive(Debug, Clone)]
pub struct SpanClassification {
/// Position of the span within the slice that was classified.
pub span_index: usize,
/// Clone of the classified span.
pub span: TextSpan,
/// `true` when the span was judged to be non-text.
pub is_non_text: bool,
/// Confidence in the verdict; `mark_non_text_spans` sets 0.6 for flagged
/// spans and 0.9 otherwise.
pub confidence: f32,
}
/// Returns `true` when more than 40% of the characters in `text` fall into
/// code-point ranges that rarely occur in running text.
///
/// The ranges checked are:
/// * U+2600..=U+27BF (Miscellaneous Symbols and Dingbats),
/// * U+1F300..=U+1F9FF (pictograph and emoji blocks),
/// * U+2200..=U+22FF (Mathematical Operators),
/// * U+2A00..=U+2AFF (Supplemental Mathematical Operators),
/// * U+0080..=U+009F (C1 control characters).
///
/// Empty input is never suspicious.
fn has_suspicious_patterns(text: &str) -> bool {
    let is_special = |c: char| {
        matches!(
            c,
            '\u{2600}'..='\u{27BF}'
                | '\u{1F300}'..='\u{1F9FF}'
                | '\u{2200}'..='\u{22FF}'
                | '\u{2A00}'..='\u{2AFF}'
                | '\u{0080}'..='\u{009F}'
        )
    };

    // Count characters, not bytes: every code point in the ranges above takes
    // 2-4 bytes in UTF-8, so dividing by `text.len()` would cap the ratio
    // well below the threshold even for text made up entirely of symbols.
    let (total, special) = text.chars().fold((0usize, 0usize), |(total, special), c| {
        (total + 1, special + usize::from(is_special(c)))
    });

    // `max(1)` keeps the division defined for empty input.
    let special_ratio = special as f32 / total.max(1) as f32;
    special_ratio > 0.4
}
/// Derives a single confidence value for a whole character sequence.
///
/// * `text` - the decoded sequence.
/// * `mapped_count` - how many of its characters the caller considers mapped.
/// * `font_name` - name of the font the sequence was set in.
///
/// Empty text yields `CharacterConfidence::unmapped()`. Otherwise the score is
/// stepped by the mapped ratio (`mapped_count` / number of characters):
/// above 0.9 → 0.85, above 0.75 → 0.7, above 0.5 → 0.5, else 0.2. The reason is
/// [`ConfidenceReason::SuspiciousContext`] when the font name looks like a
/// symbol/dingbat font and [`ConfidenceReason::Unmapped`] otherwise.
pub fn compute_sequence_confidence(
    text: &str,
    mapped_count: usize,
    font_name: &str,
) -> CharacterConfidence {
    if text.is_empty() {
        return CharacterConfidence::unmapped();
    }

    // `mapped_count` is a number of characters, so the denominator must be a
    // character count too; the UTF-8 byte length would under-rate any text
    // containing multi-byte characters.
    let total = text.chars().count();
    let mapped_ratio = mapped_count as f32 / total as f32;

    let score: f32 = if mapped_ratio > 0.9 {
        0.85
    } else if mapped_ratio > 0.75 {
        0.7
    } else if mapped_ratio > 0.5 {
        0.5
    } else {
        0.2
    };

    // NOTE(review): well-mapped sequences in ordinary fonts are still tagged
    // `Unmapped`; kept as-is because callers may rely on it — confirm intent.
    let reason = if is_likely_diagram_font(font_name) {
        ConfidenceReason::SuspiciousContext
    } else {
        ConfidenceReason::Unmapped
    };

    CharacterConfidence { score, reason }
}
/// Reports whether `font_name` contains, case-insensitively, one of the
/// symbol/dingbat font name fragments listed below.
fn is_likely_diagram_font(font_name: &str) -> bool {
    const NEEDLES: [&str; 4] = ["symbol", "wingdings", "webdings", "dingbats"];

    let lowered = font_name.to_lowercase();
    NEEDLES.iter().any(|&needle| lowered.contains(needle))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `mapped()` carries the ToUnicode reason and a score above 0.9.
    #[test]
    fn test_character_confidence_mapped() {
        let CharacterConfidence { score, reason } = CharacterConfidence::mapped();
        assert_eq!(reason, ConfidenceReason::MappedByToUnicode);
        assert!(score > 0.9);
    }

    /// `unmapped()` carries the unmapped reason and a score below 0.5.
    #[test]
    fn test_character_confidence_unmapped() {
        let CharacterConfidence { score, reason } = CharacterConfidence::unmapped();
        assert_eq!(reason, ConfidenceReason::Unmapped);
        assert!(score < 0.5);
    }

    /// Eight unmapped entries out of ten must flag the sequence.
    #[test]
    fn test_non_text_detector_high_unmapped_ratio() {
        // Start fully unmapped, then mark positions 3 and 4 as mapped.
        let mut entries = vec![CharacterConfidence::unmapped(); 10];
        entries[3] = CharacterConfidence::mapped();
        entries[4] = CharacterConfidence::mapped();

        let result = NonTextDetector::default().analyze_sequence(
            "äöüäöüäöüX",
            &entries,
            "Helvetica",
        );
        assert!(result.likely_non_text);
    }

    /// A symbol font flags the sequence even when every entry is mapped.
    #[test]
    fn test_non_text_detector_symbol_font() {
        let entries = [CharacterConfidence::mapped(); 10];
        let result =
            NonTextDetector::default().analyze_sequence("test content 123", &entries, "Symbol");
        assert!(result.likely_non_text);
    }

    /// Ordinary mapped text in an ordinary font is not flagged.
    #[test]
    fn test_non_text_detector_normal_text() {
        let entries = [CharacterConfidence::mapped(); 10];
        let result =
            NonTextDetector::default().analyze_sequence("hello world test", &entries, "Arial");
        assert!(!result.likely_non_text);
    }

    /// Plain and accented Latin text contains no suspicious patterns.
    #[test]
    fn test_suspicious_patterns() {
        for sample in ["The quick brown fox", "Café résumé naïve"].iter() {
            assert!(!has_suspicious_patterns(sample));
        }
    }

    /// A fully mapped sequence scores above 0.7.
    #[test]
    fn test_sequence_confidence_high_mapped() {
        let score = compute_sequence_confidence("Hello World", 11, "Arial").score;
        assert!(score > 0.7);
    }

    /// A mostly unmapped sequence scores below 0.5.
    #[test]
    fn test_sequence_confidence_low_mapped() {
        let score = compute_sequence_confidence("☺♦♠♥♣", 1, "Arial").score;
        assert!(score < 0.5);
    }
}