pdf_oxide 0.3.59

The fastest Rust PDF library with text extraction: 0.8ms mean, 100% pass rate on 3,830 PDFs. 5× faster than pdf_extract, 17× faster than oxidize_pdf. Extract, create, and edit PDFs.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
//! Non-text content detection for PDF extraction.
//!
//! This module detects whether character sequences represent non-text content
//! (such as embedded figures, diagrams, or other visual elements) rather than
//! actual text. This helps avoid extracting garbled characters from figures
//! that have high percentages of unmapped glyphs.
//!
//! # Phase 3: Enhanced ToUnicode Fallback
//!
//! Phase 3 improves extraction quality by:
//! 1. Detecting non-text content sequences
//! 2. Computing character mapping confidence scores
//! 3. Marking or skipping figures/diagrams in output
//! 4. Preserving text extraction accuracy

use crate::layout::TextSpan;

/// How strongly a character code is believed to map to real Unicode text.
///
/// Couples a numeric score (intended range 0.0 to 1.0) with the
/// [`ConfidenceReason`] explaining where that score came from, so callers can
/// tell genuine text apart from garbage/diagram content.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct CharacterConfidence {
    /// Confidence value: 0.0 means certain garbage, 1.0 means certain text.
    pub score: f32,
    /// Why this particular score was assigned.
    pub reason: ConfidenceReason,
}

/// Origin of the score carried by a [`CharacterConfidence`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ConfidenceReason {
    /// An explicit ToUnicode mapping exists for the character.
    MappedByToUnicode,
    /// The character belongs to a standard encoding (ASCII, Latin-1, etc.).
    StandardEncoding,
    /// The character was resolved through the font's built-in encoding.
    FontEncoding,
    /// The mapping was guessed from font name hints (Symbol, Wingdings, etc.).
    FontHintFallback,
    /// No mapping is available for the character.
    Unmapped,
    /// The character sits in a suspicious context (likely diagram/figure).
    SuspiciousContext,
}

impl CharacterConfidence {
    /// Confidence for a character resolved via ToUnicode (score 0.95).
    pub fn mapped() -> Self {
        let reason = ConfidenceReason::MappedByToUnicode;
        Self { score: 0.95, reason }
    }

    /// Confidence for a character from a standard encoding (score 0.9).
    pub fn standard_encoding() -> Self {
        let reason = ConfidenceReason::StandardEncoding;
        Self { score: 0.9, reason }
    }

    /// Confidence for a character with no mapping at all (score 0.3).
    pub fn unmapped() -> Self {
        let reason = ConfidenceReason::Unmapped;
        Self { score: 0.3, reason }
    }

    /// Confidence for a character found in a suspicious context.
    ///
    /// The caller-supplied `score` is clamped into `0.0..=1.0` before being
    /// stored.
    pub fn suspicious(score: f32) -> Self {
        let bounded = score.clamp(0.0, 1.0);
        Self {
            score: bounded,
            reason: ConfidenceReason::SuspiciousContext,
        }
    }
}

/// Statistics for non-text content detection.
///
/// Produced by `NonTextDetector::analyze_sequence`. The derived `Default`
/// (all counters zero, `likely_non_text: false`) is what that method returns
/// for sequences too short to evaluate.
#[derive(Debug, Clone, Default)]
pub struct NonTextStats {
    /// Total characters analyzed
    pub total_chars: usize,
    /// Number of mapped characters, i.e. confidence entries whose reason is
    /// anything other than `ConfidenceReason::Unmapped`
    pub mapped_chars: usize,
    /// Number of unmapped characters (`total_chars` minus `mapped_chars`)
    pub unmapped_chars: usize,
    /// Average confidence score across the supplied confidence entries
    /// (0.0 when none were supplied)
    pub avg_confidence: f32,
    /// Fraction of unmapped characters (0.0 to 1.0)
    pub unmapped_ratio: f32,
    /// Likely non-text content flag
    pub likely_non_text: bool,
}

/// Heuristic detector that decides whether character sequences or spans are
/// non-text content (figures, diagrams) rather than real text.
#[derive(Debug, Clone)]
pub struct NonTextDetector {
    /// Unmapped ratio above which a sequence is classified as non-text
    /// (default: 0.5).
    pub unmapped_threshold: f32,
    /// Average confidence below which a sequence is classified as non-text
    /// (default: 0.4).
    pub confidence_threshold: f32,
    /// Sequences shorter than this are not evaluated (default: 10).
    pub min_sequence_length: usize,
    /// Non-ASCII ratio of a span above which `mark_non_text_spans` treats it
    /// as non-text content and drops it (default: 0.3). A value `>= 1.0`
    /// switches the non-ASCII drop off entirely, which is the right choice
    /// for CJK, accented-Latin, or currency/math-heavy documents where a
    /// high non-ASCII ratio is ordinary content rather than noise
    /// (PDX-7, liteparse report).
    pub non_ascii_drop_threshold: f32,
    /// Controls whether `mark_non_text_spans` drops spans dominated by
    /// characters from the "suspicious" Unicode blocks (misc symbols,
    /// dingbats, emoji, math operators). The default `true` keeps the
    /// historical behaviour; `false` retains symbol/math glyphs that the
    /// text path keeps (PDX-7).
    pub drop_suspicious_unicode: bool,
}

impl Default for NonTextDetector {
    /// Build a detector with the historical thresholds.
    fn default() -> Self {
        // Sequence-level gates: more than half unmapped, or an average
        // confidence under 0.4, indicates a likely figure.
        let unmapped_threshold = 0.5;
        let confidence_threshold = 0.4;

        // Span-level gates keep the historical drop behaviour; callers that
        // need symbol/CJK/accented content can relax them.
        let non_ascii_drop_threshold = 0.3;
        let drop_suspicious_unicode = true;

        NonTextDetector {
            unmapped_threshold,
            confidence_threshold,
            min_sequence_length: 10,
            non_ascii_drop_threshold,
            drop_suspicious_unicode,
        }
    }
}

impl NonTextDetector {
    /// Create a new non-text detector with default settings.
    pub fn new() -> Self {
        Self::default()
    }

    /// Analyze a character sequence for non-text content indicators.
    ///
    /// Lengths are measured in Unicode scalar values (`char`s), not UTF-8
    /// bytes, so that fully mapped multi-byte text (CJK, accented Latin) is
    /// not miscounted as mostly unmapped.
    ///
    /// # Arguments
    ///
    /// * `text` - The extracted text (may contain unmapped characters)
    /// * `confidences` - Per-character confidence scores
    /// * `font_name` - Name of the font (for heuristics)
    ///
    /// # Returns
    ///
    /// Statistics about the sequence and whether it's likely non-text content.
    /// Empty text, or text shorter than `min_sequence_length` characters, is
    /// not evaluated and yields `NonTextStats::default()`.
    pub fn analyze_sequence(
        &self,
        text: &str,
        confidences: &[CharacterConfidence],
        font_name: &str,
    ) -> NonTextStats {
        // `str::len()` is a byte count; the confidences are per character, so
        // the two must be compared in the same unit.
        let total_chars = text.chars().count();
        if total_chars == 0 || total_chars < self.min_sequence_length {
            return NonTextStats::default();
        }

        let mapped_chars = confidences
            .iter()
            .filter(|c| c.reason != ConfidenceReason::Unmapped)
            .count();
        // Every character without a non-`Unmapped` confidence entry counts as
        // unmapped. Saturate so a `confidences` slice longer than the text
        // cannot underflow (panic in debug, wrap-around in release).
        let unmapped_chars = total_chars.saturating_sub(mapped_chars);
        let unmapped_ratio = unmapped_chars as f32 / total_chars as f32;

        // With no confidence entries the average is 0.0, which falls below
        // any positive `confidence_threshold`.
        let avg_confidence = if confidences.is_empty() {
            0.0
        } else {
            confidences.iter().map(|c| c.score).sum::<f32>() / confidences.len() as f32
        };

        // Classify as likely non-text if any of these holds:
        // 1. Unmapped ratio above `unmapped_threshold`
        // 2. Average confidence below `confidence_threshold`
        // 3. Font name suggests symbol/diagram font (Symbol, Wingdings, etc.)
        let likely_non_text = unmapped_ratio > self.unmapped_threshold
            || avg_confidence < self.confidence_threshold
            || self.is_diagram_font(font_name);

        NonTextStats {
            total_chars,
            mapped_chars,
            unmapped_chars,
            avg_confidence,
            unmapped_ratio,
            likely_non_text,
        }
    }

    /// Check if a font name suggests diagram/symbol content.
    ///
    /// The comparison is case-insensitive and matches on substrings.
    fn is_diagram_font(&self, font_name: &str) -> bool {
        let name_lower = font_name.to_lowercase();
        [
            "symbol",
            "wingdings",
            "webdings",
            "zapf dingbats",
            "dingbats",
            "mathematical alphanumeric",
        ]
        .iter()
        .any(|&pattern| name_lower.contains(pattern))
    }

    /// Detect and mark sequences as non-text content.
    ///
    /// Returns one [`SpanClassification`] per input span, in input order. A
    /// span is flagged as non-text when either gate fires:
    ///
    /// * its non-ASCII ratio exceeds `non_ascii_drop_threshold`, or
    /// * `drop_suspicious_unicode` is set and the span text is dominated by
    ///   characters from the suspicious Unicode blocks.
    ///
    /// Flagged spans get a classification confidence of 0.6, all others 0.9.
    pub fn mark_non_text_spans(&self, spans: &[TextSpan]) -> Vec<SpanClassification> {
        spans
            .iter()
            .enumerate()
            .map(|(idx, span)| {
                // Simple heuristic for now: a high share of non-ASCII
                // characters suggests non-text content.
                //
                // NOTE(review): the denominator is the UTF-8 byte length, not
                // the character count, so pure 3-byte text (e.g. CJK) yields
                // ~0.33 rather than 1.0. The 0.3 default and the existing
                // tests are calibrated against this, so it is kept as-is.
                let non_ascii_ratio = span.text.chars().filter(|c| !c.is_ascii()).count() as f32
                    / span.text.len().max(1) as f32;

                let non_ascii_drop = non_ascii_ratio > self.non_ascii_drop_threshold;
                let suspicious_drop =
                    self.drop_suspicious_unicode && has_suspicious_patterns(&span.text);
                let is_likely_non_text = non_ascii_drop || suspicious_drop;

                SpanClassification {
                    span_index: idx,
                    span: span.clone(),
                    is_non_text: is_likely_non_text,
                    confidence: if is_likely_non_text { 0.6 } else { 0.9 },
                }
            })
            .collect()
    }
}

/// Classification of a text span.
///
/// One of these is produced per input span by
/// `NonTextDetector::mark_non_text_spans`.
#[derive(Debug, Clone)]
pub struct SpanClassification {
    /// Index of the span in original array
    pub span_index: usize,
    /// The text span itself (a clone of the input span)
    pub span: TextSpan,
    /// Whether this span likely contains non-text content
    pub is_non_text: bool,
    /// Confidence in the classification (0.0 to 1.0); `mark_non_text_spans`
    /// assigns 0.6 to flagged spans and 0.9 to the rest
    pub confidence: f32,
}

/// Check if text contains suspicious patterns indicating non-text content.
///
/// Returns `true` when more than 40% of the characters in `text` come from
/// Unicode ranges known to hold diagram/symbol glyphs. Empty text is never
/// suspicious.
///
/// The ratio is computed over characters (Unicode scalar values). Dividing by
/// the UTF-8 byte length instead would cap a string made purely of 3-byte
/// symbols at 0.33 and 4-byte pictographs at 0.25, so those ranges could
/// never cross the 0.4 cutoff.
fn has_suspicious_patterns(text: &str) -> bool {
    // Single pass: count all characters and those in the special ranges.
    let (total, special) = text.chars().fold((0usize, 0usize), |(total, special), c| {
        // Ranges known to contain diagram/symbol glyphs
        let is_special = matches!(
            c as u32,
            0x2600..=0x27BF |   // Miscellaneous Symbols and Dingbats
            0x1F300..=0x1F9FF | // Emoticons and pictographs
            0x2200..=0x22FF |   // Mathematical Operators
            0x2A00..=0x2AFF |   // Supplemental Mathematical Operators
            0x0080..=0x009F     // C1 Control Codes (often unmapped)
        );
        (total + 1, special + usize::from(is_special))
    });

    if total == 0 {
        return false;
    }

    // If >40% of characters are from special Unicode blocks, likely diagram
    special as f32 / total as f32 > 0.4
}

/// Compute mapping confidence for a character sequence.
///
/// Analyzes how many characters in a sequence have valid Unicode mappings
/// versus how many are unmapped or garbled.
///
/// # Arguments
///
/// * `text` - The extracted text of the sequence
/// * `mapped_count` - Number of characters in `text` that had a valid mapping
/// * `font_name` - Name of the font (for the symbol/diagram font heuristic)
///
/// # Returns
///
/// A [`CharacterConfidence`] whose score is one of four fixed tiers chosen
/// from the mapped ratio (0.85, 0.7, 0.5, 0.2). Empty text yields
/// `CharacterConfidence::unmapped()`. The reason is `SuspiciousContext` for
/// symbol/diagram fonts and `Unmapped` otherwise.
///
/// NOTE(review): the reason is `Unmapped` even when the mapped ratio is high;
/// confirm whether callers rely on that before changing it.
pub fn compute_sequence_confidence(
    text: &str,
    mapped_count: usize,
    font_name: &str,
) -> CharacterConfidence {
    // `mapped_count` is a character count, so the total must be one too.
    // `str::len()` is the UTF-8 byte length and would make fully mapped
    // multi-byte text (e.g. CJK at 3 bytes per character) look ~33% mapped.
    let total = text.chars().count();
    if total == 0 {
        return CharacterConfidence::unmapped();
    }

    let mapped_ratio = mapped_count as f32 / total as f32;

    // Adjust score based on mapping quality
    let score: f32 = if mapped_ratio > 0.9 {
        // >90% mapped: likely good text
        0.85
    } else if mapped_ratio > 0.75 {
        // 75-90% mapped: probably text with some foreign chars
        0.7
    } else if mapped_ratio > 0.5 {
        // 50-75% mapped: mixed quality
        0.5
    } else {
        // <=50% mapped: likely diagram/garbage
        0.2
    };

    let reason = if is_likely_diagram_font(font_name) {
        ConfidenceReason::SuspiciousContext
    } else {
        ConfidenceReason::Unmapped
    };

    CharacterConfidence { score, reason }
}

/// Report whether `font_name` looks like a symbol/diagram font.
///
/// Matching is case-insensitive and substring-based, so subset-prefixed or
/// suffixed names such as `ABCDEF+ZapfDingbats` are recognised too.
fn is_likely_diagram_font(font_name: &str) -> bool {
    // Lower-case fragments that identify symbol/diagram font families.
    const MARKERS: [&str; 4] = ["symbol", "wingdings", "webdings", "dingbats"];

    let lowered = font_name.to_lowercase();
    MARKERS.iter().any(|&marker| lowered.contains(marker))
}

// Unit tests for the confidence constructors, the sequence-level detector,
// the suspicious-pattern check, and the configurable span-drop gates.
#[cfg(test)]
mod tests {
    use super::*;

    // `mapped()` must report the ToUnicode reason with a score above 0.9.
    #[test]
    fn test_character_confidence_mapped() {
        let conf = CharacterConfidence::mapped();
        assert_eq!(conf.reason, ConfidenceReason::MappedByToUnicode);
        assert!(conf.score > 0.9);
    }

    // `unmapped()` must report the Unmapped reason with a score below 0.5.
    #[test]
    fn test_character_confidence_unmapped() {
        let conf = CharacterConfidence::unmapped();
        assert_eq!(conf.reason, ConfidenceReason::Unmapped);
        assert!(conf.score < 0.5);
    }

    // A sequence where most confidences are Unmapped is flagged as non-text.
    #[test]
    fn test_non_text_detector_high_unmapped_ratio() {
        let detector = NonTextDetector::default();

        // Create mock confidences with high unmapped ratio
        // (8 unmapped entries, 2 mapped entries)
        let confidences = vec![
            CharacterConfidence::unmapped(),
            CharacterConfidence::unmapped(),
            CharacterConfidence::unmapped(),
            CharacterConfidence::mapped(),
            CharacterConfidence::mapped(),
            CharacterConfidence::unmapped(),
            CharacterConfidence::unmapped(),
            CharacterConfidence::unmapped(),
            CharacterConfidence::unmapped(),
            CharacterConfidence::unmapped(),
        ];

        let stats = detector.analyze_sequence("äöüäöüäöüX", &confidences, "Helvetica");
        assert!(stats.likely_non_text); // >50% unmapped
    }

    // The font-name heuristic alone is enough to flag a sequence.
    #[test]
    fn test_non_text_detector_symbol_font() {
        let detector = NonTextDetector::default();
        let confidences = vec![CharacterConfidence::mapped(); 10];

        // Symbol fonts should be flagged even with good confidence
        let stats = detector.analyze_sequence("test content 123", &confidences, "Symbol");
        assert!(stats.likely_non_text);
    }

    // Well-mapped ASCII text in an ordinary font must not be flagged.
    #[test]
    fn test_non_text_detector_normal_text() {
        let detector = NonTextDetector::default();
        let confidences = vec![CharacterConfidence::mapped(); 10];

        let stats = detector.analyze_sequence("hello world test", &confidences, "Arial");
        assert!(!stats.likely_non_text);
    }

    // Negative cases only: plain and accented text are not suspicious.
    #[test]
    fn test_suspicious_patterns() {
        // Normal text
        assert!(!has_suspicious_patterns("The quick brown fox"));

        // Text with some accents is OK
        assert!(!has_suspicious_patterns("Café résumé naïve"));
    }

    // All 11 characters mapped: score lands in the top tier.
    #[test]
    fn test_sequence_confidence_high_mapped() {
        let conf = compute_sequence_confidence("Hello World", 11, "Arial");
        assert!(conf.score > 0.7);
    }

    // Only 1 of 5 symbols mapped: score lands in the lowest tier.
    #[test]
    fn test_sequence_confidence_low_mapped() {
        let conf = compute_sequence_confidence("☺♦♠♥♣", 1, "Arial");
        assert!(conf.score < 0.5);
    }

    // PDX-7 (liteparse report): the span-drop heuristics in mark_non_text_spans
    // must be configurable so symbol/CJK/accented content can be preserved.
    // Defaults keep historical behaviour; relaxing the knobs keeps the content.
    //
    // Helper: build a `TextSpan` carrying `text`, with every other field set
    // to a fixed placeholder value.
    fn span_with_text(text: &str) -> crate::layout::TextSpan {
        use crate::geometry::Rect;
        use crate::layout::{Color, FontWeight, TextSpan};
        TextSpan {
            artifact_type: None,
            text: text.to_string(),
            bbox: Rect::new(0.0, 0.0, 10.0, 12.0),
            font_name: "Helvetica".to_string(),
            font_size: 12.0,
            font_weight: FontWeight::Normal,
            color: Color::black(),
            mcid: None,
            sequence: 0,
            split_boundary_before: false,
            offset_semantic: false,
            is_italic: false,
            is_monospace: false,
            char_spacing: 0.0,
            word_spacing: 0.0,
            horizontal_scaling: 100.0,
            primary_detected: false,
            char_widths: vec![],
            heading_level: None,
        }
    }

    // Exercises both span-drop knobs independently: each gate must fire with
    // the default/enabled setting and stay quiet once it is relaxed.
    #[test]
    fn test_non_text_drop_is_configurable() {
        // --- non-ASCII-ratio knob ---
        // Pure CJK is ~33% non-ASCII (1 char / 3 UTF-8 bytes), over the 0.3
        // default — real text the heuristic wrongly drops. CJK is not in the
        // "suspicious" Unicode blocks, so this isolates the non-ASCII gate.
        let cjk = span_with_text("日本語のテキスト処理");
        assert!(
            NonTextDetector::default().mark_non_text_spans(std::slice::from_ref(&cjk))[0]
                .is_non_text,
            "default: CJK dropped by the non-ASCII ratio gate"
        );
        let na_off = NonTextDetector {
            non_ascii_drop_threshold: 1.0,
            ..NonTextDetector::default()
        };
        assert!(
            !na_off.mark_non_text_spans(&[cjk])[0].is_non_text,
            "PDX-7: CJK preserved when the non-ASCII drop is disabled"
        );

        // --- suspicious-Unicode knob ---
        // C1 control codes (0x0080-0x009F, 2 bytes each) push special_ratio
        // over has_suspicious_patterns' 0.4 cutoff. Disable the non-ASCII gate
        // so we isolate the suspicious-Unicode gate.
        let ctrl = span_with_text("\u{0080}\u{0081}\u{0082}");
        let susp_on = NonTextDetector {
            non_ascii_drop_threshold: 1.0,
            drop_suspicious_unicode: true,
            ..NonTextDetector::default()
        };
        assert!(
            susp_on.mark_non_text_spans(std::slice::from_ref(&ctrl))[0].is_non_text,
            "suspicious-Unicode gate drops the span when enabled"
        );
        let susp_off = NonTextDetector {
            non_ascii_drop_threshold: 1.0,
            drop_suspicious_unicode: false,
            ..NonTextDetector::default()
        };
        assert!(
            !susp_off.mark_non_text_spans(&[ctrl])[0].is_non_text,
            "PDX-7: content preserved when the suspicious-Unicode drop is disabled"
        );
    }
}