papyrus_core/parser/
mod.rs

1use std::collections::HashMap;
2
3use crate::ast::{DocumentMetadata, Warning};
4
/// Font attributes gathered from a PDF font dictionary and its descriptor.
#[derive(Clone, Debug, PartialEq)]
pub struct FontInfo {
    /// Font name with any subset-tag prefix already removed.
    pub name: String,
    /// Size hint from the font descriptor, if any. Diagnostic only: the size
    /// set through the `Tf` text state is the authoritative one.
    pub size: Option<f32>,
    /// `FontWeight` descriptor entry (e.g. 700 means bold); `None` when absent.
    pub font_weight: Option<f32>,
    /// `ItalicAngle` descriptor entry; a non-zero value indicates an italic or
    /// oblique face. `None` when absent.
    pub italic_angle: Option<f32>,
}
17
/// One run of text pulled out of a PDF content stream.
#[derive(Clone, Debug, PartialEq)]
pub struct RawTextSegment {
    /// The segment's text, decoded to UTF-8.
    pub text: String,
    /// Name of the font resource exactly as the content stream spells it
    /// (e.g., b"F1").
    pub font_resource_name: Vec<u8>,
    /// Font size taken from the `Tf` text state in effect for this segment.
    pub font_size: f32,
    /// Page the segment was found on, counted from 1.
    pub page_number: usize,
}
30
31/// Load PDF bytes into a lopdf Document, mapping all failures to warnings.
32///
33/// Returns `(None, warnings)` on failure, `(Some(doc), warnings)` on success.
34pub fn load_pdf(bytes: &[u8]) -> (Option<lopdf::Document>, Vec<Warning>) {
35    if bytes.is_empty() {
36        return (
37            None,
38            vec![Warning::MalformedPdfObject {
39                detail: "empty PDF bytes".to_string(),
40            }],
41        );
42    }
43
44    match lopdf::Document::load_mem(bytes) {
45        Ok(doc) => (Some(doc), Vec::new()),
46        Err(e) => (
47            None,
48            vec![Warning::MalformedPdfObject {
49                detail: format!("failed to load PDF: {}", e),
50            }],
51        ),
52    }
53}
54
55/// Extract FontWeight and ItalicAngle from a font's FontDescriptor dictionary.
56///
57/// Returns `(font_weight, italic_angle)` — either may be None if the descriptor
58/// is absent or the key is missing.
59fn extract_font_descriptor_metrics(
60    doc: &lopdf::Document,
61    font_dict: &lopdf::Dictionary,
62) -> (Option<f32>, Option<f32>) {
63    let descriptor = font_dict
64        .get(b"FontDescriptor")
65        .ok()
66        .and_then(|obj| match obj {
67            lopdf::Object::Reference(id) => doc.get_object(*id).ok(),
68            other => Some(other),
69        })
70        .and_then(|obj| obj.as_dict().ok());
71
72    let Some(desc) = descriptor else {
73        return (None, None);
74    };
75
76    let font_weight = desc.get(b"FontWeight").ok().and_then(extract_number);
77    let italic_angle = desc.get(b"ItalicAngle").ok().and_then(extract_number);
78
79    (font_weight, italic_angle)
80}
81
/// Remove a font-subset tag from a font name
/// (e.g., "ABCDEF+Helvetica-Bold" -> "Helvetica-Bold").
///
/// A subset tag is exactly six ASCII uppercase letters followed by `+`. When
/// the name starts with such a tag, the remainder after the `+` is returned;
/// otherwise the name is returned unchanged.
pub(crate) fn strip_subset_prefix(name: &str) -> &str {
    let raw = name.as_bytes();
    // `get(6)` doubles as the length check: it is `None` for names shorter than 7 bytes.
    let has_subset_tag =
        raw.get(6) == Some(&b'+') && raw[..6].iter().all(u8::is_ascii_uppercase);

    if has_subset_tag {
        // Byte 6 is the ASCII '+', so index 7 is always a char boundary.
        &name[7..]
    } else {
        name
    }
}
97
98/// Resolve font dictionaries for a given page.
99///
100/// Returns a map of font resource name (e.g., b"F1") to FontInfo, plus any warnings.
101/// `page_number` is 1-based.
102pub fn resolve_fonts_for_page(
103    doc: &lopdf::Document,
104    page_number: usize,
105) -> (HashMap<Vec<u8>, FontInfo>, Vec<Warning>) {
106    let mut fonts = HashMap::new();
107    let mut warnings = Vec::new();
108
109    // Get the page ObjectId from the 1-based page number
110    let pages = doc.get_pages();
111    let page_num_u32 = match u32::try_from(page_number) {
112        Ok(n) => n,
113        Err(_) => {
114            warnings.push(Warning::MalformedPdfObject {
115                detail: format!("page number {} exceeds u32 range", page_number),
116            });
117            return (fonts, warnings);
118        }
119    };
120    let page_id = match pages.get(&page_num_u32) {
121        Some(id) => *id,
122        None => {
123            warnings.push(Warning::MalformedPdfObject {
124                detail: format!(
125                    "page {} not found (document has {} pages)",
126                    page_number,
127                    pages.len()
128                ),
129            });
130            return (fonts, warnings);
131        }
132    };
133
134    // Use lopdf's built-in font resolution
135    let page_fonts = match doc.get_page_fonts(page_id) {
136        Ok(f) => f,
137        Err(e) => {
138            warnings.push(Warning::MalformedPdfObject {
139                detail: format!(
140                    "failed to read font resources for page {}: {}",
141                    page_number, e
142                ),
143            });
144            return (fonts, warnings);
145        }
146    };
147
148    for (resource_name, font_dict) in page_fonts {
149        // Extract BaseFont name
150        let base_font_name = match font_dict.get(b"BaseFont") {
151            Ok(obj) => match obj.as_name() {
152                Ok(name_bytes) => {
153                    let raw_name = String::from_utf8_lossy(name_bytes).to_string();
154                    strip_subset_prefix(&raw_name).to_string()
155                }
156                Err(_) => {
157                    warnings.push(Warning::MissingFontMetrics {
158                        font_name: "<unknown>".to_string(),
159                        page: page_number,
160                    });
161                    continue;
162                }
163            },
164            Err(_) => {
165                warnings.push(Warning::MissingFontMetrics {
166                    font_name: "<unknown>".to_string(),
167                    page: page_number,
168                });
169                continue;
170            }
171        };
172
173        // Note: FontInfo.size is diagnostic only — Tf state is authoritative for
174        // RawTextSegment.font_size. Standard PDF font descriptors don't carry a
175        // /FontSize key, so we leave this as None. It may be populated by future
176        // heuristics if needed.
177
178        let (font_weight, italic_angle) = extract_font_descriptor_metrics(doc, &font_dict);
179
180        fonts.insert(
181            resource_name,
182            FontInfo {
183                name: base_font_name,
184                size: None,
185                font_weight,
186                italic_angle,
187            },
188        );
189    }
190
191    (fonts, warnings)
192}
193
/// WinAnsiEncoding code points for bytes 0x80–0x9F.
///
/// This is the range where WinAnsi departs from ISO-8859-1: WinAnsi assigns
/// printable characters (smart quotes, dashes, the euro sign, ...) where
/// ISO-8859-1 has C1 control characters. The mapping is the one given in PDF
/// spec §D.1. Bytes with no assignment map to U+FFFD (replacement character).
///
/// Entry `i` holds the character for byte `0x80 + i`
/// (index 0 = byte 0x80, index 31 = byte 0x9F).
#[rustfmt::skip]
const WINANSI_0X80_TO_0X9F: [char; 32] = [
    // 0x80–0x83: euro sign, (undefined), single low-9 quote, f with hook
    '\u{20AC}', '\u{FFFD}', '\u{201A}', '\u{0192}',
    // 0x84–0x87: double low-9 quote, horizontal ellipsis, dagger, double dagger
    '\u{201E}', '\u{2026}', '\u{2020}', '\u{2021}',
    // 0x88–0x8B: modifier circumflex, per mille, S with caron, single left angle quote
    '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}',
    // 0x8C–0x8F: capital OE ligature, (undefined), Z with caron, (undefined)
    '\u{0152}', '\u{FFFD}', '\u{017D}', '\u{FFFD}',
    // 0x90–0x93: (undefined), left single quote, right single quote, left double quote
    '\u{FFFD}', '\u{2018}', '\u{2019}', '\u{201C}',
    // 0x94–0x97: right double quote, bullet, en dash, em dash
    '\u{201D}', '\u{2022}', '\u{2013}', '\u{2014}',
    // 0x98–0x9B: small tilde, trade mark, s with caron, single right angle quote
    '\u{02DC}', '\u{2122}', '\u{0161}', '\u{203A}',
    // 0x9C–0x9F: small oe ligature, (undefined), z with caron, Y with diaeresis
    '\u{0153}', '\u{FFFD}', '\u{017E}', '\u{0178}',
];
235
236/// Decode a single byte using WinAnsiEncoding.
237fn winansi_byte_to_char(b: u8) -> char {
238    if b < 0x80 {
239        b as char
240    } else if b <= 0x9F {
241        WINANSI_0X80_TO_0X9F[(b - 0x80) as usize]
242    } else {
243        // 0xA0–0xFF: same as ISO-8859-1 (direct Unicode codepoint)
244        b as char
245    }
246}
247
248/// Decode raw PDF string bytes to UTF-8.
249///
250/// Handles UTF-16BE (with or without BOM) and falls back to WinAnsiEncoding.
251fn decode_pdf_string(bytes: &[u8]) -> String {
252    // Check for UTF-16BE BOM (0xFE 0xFF)
253    if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
254        // UTF-16BE with BOM — skip BOM bytes
255        return decode_utf16be(&bytes[2..]);
256    }
257
258    // Heuristic for UTF-16BE without BOM: if first byte is 0x00 and length is even,
259    // it's likely UTF-16BE (common in CIDFont text). Note: per PDF spec §7.9.2.2,
260    // UTF-16BE is formally identified only by the BOM. This heuristic is a pragmatic
261    // best-effort for spec-violating PDFs.
262    if bytes.len() >= 2 && bytes.len().is_multiple_of(2) && bytes[0] == 0x00 {
263        return decode_utf16be(bytes);
264    }
265
266    // Default: WinAnsiEncoding (PDF spec §D.1)
267    bytes.iter().map(|&b| winansi_byte_to_char(b)).collect()
268}
269
/// Lossily decode UTF-16BE bytes into a `String`.
///
/// Bytes are consumed two at a time as big-endian code units. Invalid data
/// never fails the decode; it is replaced with U+FFFD:
/// - an unpaired surrogate becomes one replacement character;
/// - a dangling final byte (odd-length input) becomes one replacement
///   character, so truncated input is visible in the output instead of being
///   dropped silently.
fn decode_utf16be(bytes: &[u8]) -> String {
    let pairs = bytes.chunks_exact(2);
    // `chunks_exact` skips a trailing odd byte; record it before `pairs` is consumed.
    let has_dangling_byte = !pairs.remainder().is_empty();

    let code_units = pairs.map(|pair| u16::from_be_bytes([pair[0], pair[1]]));
    let mut decoded: String = char::decode_utf16(code_units)
        .map(|unit| unit.unwrap_or(char::REPLACEMENT_CHARACTER))
        .collect();

    if has_dangling_byte {
        decoded.push(char::REPLACEMENT_CHARACTER);
    }
    decoded
}
279
280/// Extract raw text segments from a page's content stream.
281///
282/// Processes Tf, Tj, TJ, BT, and ET operators.
283/// `page_number` is 1-based.
284pub fn extract_text_segments_for_page(
285    doc: &lopdf::Document,
286    page_number: usize,
287    _fonts: &HashMap<Vec<u8>, FontInfo>,
288) -> (Vec<RawTextSegment>, Vec<Warning>) {
289    let mut segments = Vec::new();
290    let mut warnings = Vec::new();
291
292    // Get page ObjectId
293    let pages = doc.get_pages();
294    let page_num_u32 = match u32::try_from(page_number) {
295        Ok(n) => n,
296        Err(_) => return (segments, warnings),
297    };
298    let page_id = match pages.get(&page_num_u32) {
299        Some(id) => *id,
300        None => return (segments, warnings),
301    };
302
303    // Decode content stream
304    let content = match doc.get_and_decode_page_content(page_id) {
305        Ok(c) => c,
306        Err(e) => {
307            warnings.push(Warning::UnreadableTextStream {
308                page: page_number,
309                detail: format!("failed to decode content stream: {}", e),
310            });
311            return (segments, warnings);
312        }
313    };
314
315    // Text state machine
316    let mut current_font_resource: Option<Vec<u8>> = None;
317    let mut current_font_size: Option<f32> = None;
318    let mut tf_set_in_text_object = false;
319    let mut warned_no_tf = false;
320
321    for op in content.operations.iter() {
322        match op.operator.as_str() {
323            "BT" => {
324                // Begin text object — reset state
325                current_font_resource = None;
326                current_font_size = None;
327                tf_set_in_text_object = false;
328                warned_no_tf = false;
329            }
330            "ET" => {
331                // End text object — reset state
332                current_font_resource = None;
333                current_font_size = None;
334                tf_set_in_text_object = false;
335                warned_no_tf = false;
336            }
337            "Tf" => {
338                // Set font: operands are [Name, Number]
339                if op.operands.len() >= 2 {
340                    if let Some(name_bytes) = extract_name(&op.operands[0]) {
341                        current_font_resource = Some(name_bytes);
342                    }
343                    if let Some(size) = extract_number(&op.operands[1]) {
344                        current_font_size = Some(size);
345                    }
346                    tf_set_in_text_object = true;
347                }
348            }
349            "Tj" => {
350                // Show string: operand is [String]
351                if let Some(text_bytes) = op.operands.first().and_then(extract_string_bytes) {
352                    let text = decode_pdf_string(&text_bytes);
353                    if !text.is_empty() {
354                        let (font_res, font_sz) = get_text_state_or_default(
355                            &current_font_resource,
356                            current_font_size,
357                            tf_set_in_text_object,
358                            &mut warned_no_tf,
359                            page_number,
360                            &mut warnings,
361                        );
362                        segments.push(RawTextSegment {
363                            text,
364                            font_resource_name: font_res,
365                            font_size: font_sz,
366                            page_number,
367                        });
368                    }
369                }
370            }
371            "TJ" => {
372                // Show array of strings/numbers: operand is [Array]
373                // In PDF, TJ arrays contain strings (text) and numbers (positioning adjustments).
374                // Large negative numbers typically represent word spacing.
375                if let Some(lopdf::Object::Array(arr)) = op.operands.first() {
376                    let mut combined = String::new();
377                    let mut prev_was_string = false;
378                    let mut needs_space = false;
379
380                    for item in arr {
381                        if let Some(bytes) = extract_string_bytes(item) {
382                            let text = decode_pdf_string(&bytes);
383                            // Add space if flagged by a previous large negative adjustment
384                            if needs_space
385                                && !combined.is_empty()
386                                && !combined.ends_with(char::is_whitespace)
387                                && !text.starts_with(char::is_whitespace)
388                            {
389                                combined.push(' ');
390                            }
391                            combined.push_str(&text);
392                            prev_was_string = true;
393                            needs_space = false;
394                        } else if let Some(num) = extract_number(item) {
395                            // Negative numbers move the text position backward (creating space).
396                            // A threshold of -100 is a heuristic: values more negative than this
397                            // typically represent inter-word spacing rather than kerning.
398                            if prev_was_string && num < -100.0 {
399                                needs_space = true;
400                            }
401                        }
402                    }
403                    if !combined.is_empty() {
404                        let (font_res, font_sz) = get_text_state_or_default(
405                            &current_font_resource,
406                            current_font_size,
407                            tf_set_in_text_object,
408                            &mut warned_no_tf,
409                            page_number,
410                            &mut warnings,
411                        );
412                        segments.push(RawTextSegment {
413                            text: combined,
414                            font_resource_name: font_res,
415                            font_size: font_sz,
416                            page_number,
417                        });
418                    }
419                }
420            }
421            _ => {
422                // Ignore all other operators
423            }
424        }
425    }
426
427    (segments, warnings)
428}
429
430/// Extract font resource name or defaults if Tf not yet set.
431/// Emits at most one warning per text object when Tf is missing (per plan §5).
432fn get_text_state_or_default(
433    current_font_resource: &Option<Vec<u8>>,
434    current_font_size: Option<f32>,
435    tf_set: bool,
436    warned_no_tf: &mut bool,
437    page_number: usize,
438    warnings: &mut Vec<Warning>,
439) -> (Vec<u8>, f32) {
440    if tf_set {
441        (
442            current_font_resource
443                .clone()
444                .unwrap_or_else(|| b"<unknown>".to_vec()),
445            current_font_size.unwrap_or(0.0),
446        )
447    } else {
448        if !*warned_no_tf {
449            warnings.push(Warning::MalformedPdfObject {
450                detail: format!("text state not set before Tj/TJ on page {}", page_number),
451            });
452            *warned_no_tf = true;
453        }
454        (b"<unknown>".to_vec(), 0.0)
455    }
456}
457
458/// Extract a Name value (bytes) from a lopdf Object.
459fn extract_name(obj: &lopdf::Object) -> Option<Vec<u8>> {
460    match obj {
461        lopdf::Object::Name(n) => Some(n.clone()),
462        _ => None,
463    }
464}
465
466/// Extract a numeric value (f32) from a lopdf Object.
467fn extract_number(obj: &lopdf::Object) -> Option<f32> {
468    match obj {
469        lopdf::Object::Real(f) => Some(*f),
470        lopdf::Object::Integer(i) => Some(*i as f32),
471        _ => None,
472    }
473}
474
475/// Extract raw bytes from a String object.
476fn extract_string_bytes(obj: &lopdf::Object) -> Option<Vec<u8>> {
477    match obj {
478        lopdf::Object::String(bytes, _) => Some(bytes.clone()),
479        _ => None,
480    }
481}
482
483/// End-to-end PDF parsing: load, extract metadata, resolve fonts, extract text segments.
484///
485/// Never panics. Returns empty results with warnings on failure.
486pub fn parse_pdf(bytes: &[u8]) -> (Vec<RawTextSegment>, DocumentMetadata, Vec<Warning>) {
487    let mut all_segments = Vec::new();
488    let mut all_warnings = Vec::new();
489
490    // Step 1: Load PDF
491    let (doc_opt, load_warnings) = load_pdf(bytes);
492    all_warnings.extend(load_warnings);
493
494    let doc = match doc_opt {
495        Some(d) => d,
496        None => {
497            return (
498                all_segments,
499                DocumentMetadata {
500                    title: None,
501                    author: None,
502                    page_count: 0,
503                },
504                all_warnings,
505            );
506        }
507    };
508
509    // Step 2: Extract metadata
510    let pages = doc.get_pages();
511    let page_count = pages.len();
512
513    // Try to extract title and author from the document info dictionary
514    let (title, author) = extract_doc_info(&doc);
515
516    let metadata = DocumentMetadata {
517        title,
518        author,
519        page_count,
520    };
521
522    // Step 3: Per-page extraction — iterate in page order (1-based)
523    let mut page_numbers: Vec<u32> = pages.keys().copied().collect();
524    page_numbers.sort();
525
526    for &page_num in &page_numbers {
527        // Safe: u32 -> usize is always widening on 32-bit+ platforms
528        let page_number = page_num as usize;
529
530        // Resolve fonts for this page
531        let (fonts, font_warnings) = resolve_fonts_for_page(&doc, page_number);
532        all_warnings.extend(font_warnings);
533
534        // Extract text segments for this page
535        let (segments, extract_warnings) =
536            extract_text_segments_for_page(&doc, page_number, &fonts);
537        all_warnings.extend(extract_warnings);
538
539        all_segments.extend(segments);
540    }
541
542    (all_segments, metadata, all_warnings)
543}
544
545/// Public crate accessor for doc info extraction, used by `lib.rs` single-pass pipeline.
546pub(crate) fn extract_doc_info_pub(doc: &lopdf::Document) -> (Option<String>, Option<String>) {
547    extract_doc_info(doc)
548}
549
550/// Extract title and author from PDF info dictionary.
551fn extract_doc_info(doc: &lopdf::Document) -> (Option<String>, Option<String>) {
552    // Try the trailer's /Info reference
553    let info_dict = doc
554        .trailer
555        .get(b"Info")
556        .ok()
557        .and_then(|obj| match obj {
558            lopdf::Object::Reference(id) => doc.get_object(*id).ok(),
559            _ => Some(obj),
560        })
561        .and_then(|obj| obj.as_dict().ok());
562
563    let info = match info_dict {
564        Some(d) => d,
565        None => return (None, None),
566    };
567
568    let title = get_info_string(info, b"Title");
569    let author = get_info_string(info, b"Author");
570
571    (title, author)
572}
573
574/// Extract a non-empty string value from a PDF info dictionary.
575fn get_info_string(info: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
576    info.get(key).ok().and_then(|obj| match obj {
577        lopdf::Object::String(bytes, _) => {
578            let s = decode_pdf_string(bytes);
579            if s.is_empty() {
580                None
581            } else {
582                Some(s)
583            }
584        }
585        _ => None,
586    })
587}
588
589#[cfg(test)]
590mod tests {
591    use super::*;
592    use std::path::PathBuf;
593
594    /// Helper to resolve fixture paths relative to the workspace root.
595    fn fixture_path(name: &str) -> PathBuf {
596        // CARGO_MANIFEST_DIR points to papyrus-core/
597        // Fixtures are at ../tests/fixtures/
598        let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
599        manifest_dir
600            .parent()
601            .expect("workspace root")
602            .join("tests")
603            .join("fixtures")
604            .join(name)
605    }
606
607    // ── load_pdf tests ──
608
609    #[test]
610    fn load_pdf_empty_bytes_returns_none_with_warning() {
611        let (doc, warnings) = load_pdf(b"");
612        assert!(doc.is_none(), "empty bytes should not produce a document");
613        assert!(!warnings.is_empty(), "should emit at least one warning");
614        match &warnings[0] {
615            Warning::MalformedPdfObject { detail } => {
616                assert!(!detail.is_empty(), "detail should be non-empty");
617            }
618            other => panic!("expected MalformedPdfObject, got {:?}", other),
619        }
620    }
621
622    #[test]
623    fn load_pdf_invalid_header_returns_none_with_warning() {
624        let (doc, warnings) = load_pdf(b"this is not a PDF");
625        assert!(
626            doc.is_none(),
627            "invalid header should not produce a document"
628        );
629        assert!(!warnings.is_empty(), "should emit at least one warning");
630        match &warnings[0] {
631            Warning::MalformedPdfObject { detail } => {
632                assert!(!detail.is_empty(), "detail should be non-empty");
633            }
634            other => panic!("expected MalformedPdfObject, got {:?}", other),
635        }
636    }
637
638    #[test]
639    fn load_pdf_corrupted_fixture_returns_none_with_warning() {
640        let path = fixture_path("corrupted.pdf");
641        let bytes = std::fs::read(&path)
642            .unwrap_or_else(|e| panic!("corrupted.pdf fixture must exist at {:?}: {}", path, e));
643        let (doc, warnings) = load_pdf(&bytes);
644        assert!(doc.is_none(), "corrupted PDF should not produce a document");
645        assert!(!warnings.is_empty(), "should emit at least one warning");
646        match &warnings[0] {
647            Warning::MalformedPdfObject { detail } => {
648                assert!(!detail.is_empty(), "detail should be non-empty");
649            }
650            other => panic!("expected MalformedPdfObject, got {:?}", other),
651        }
652    }
653
654    #[test]
655    fn load_pdf_valid_simple_fixture_returns_some() {
656        let path = fixture_path("simple.pdf");
657        let bytes = std::fs::read(&path)
658            .unwrap_or_else(|e| panic!("simple.pdf fixture must exist at {:?}: {}", path, e));
659        let (doc, warnings) = load_pdf(&bytes);
660        assert!(doc.is_some(), "valid PDF should produce a document");
661        // A valid, well-formed PDF should not produce MalformedPdfObject warnings
662        for w in &warnings {
663            if let Warning::MalformedPdfObject { .. } = w {
664                panic!("valid PDF should not produce MalformedPdfObject warning");
665            }
666        }
667    }
668
669    // ── resolve_fonts_for_page tests ──
670
671    /// Helper to load a fixture PDF for font resolution tests.
672    fn load_fixture(name: &str) -> lopdf::Document {
673        let path = fixture_path(name);
674        let bytes = std::fs::read(&path)
675            .unwrap_or_else(|e| panic!("fixture {} must exist at {:?}: {}", name, path, e));
676        let (doc, _) = load_pdf(&bytes);
677        doc.expect("fixture should be a valid PDF")
678    }
679
680    #[test]
681    fn resolve_fonts_for_page_simple_returns_font_entries() {
682        let doc = load_fixture("simple.pdf");
683        let (fonts, warnings) = resolve_fonts_for_page(&doc, 1);
684        // simple.pdf uses Helvetica — there should be at least one font
685        assert!(
686            !fonts.is_empty(),
687            "simple.pdf page 1 should have font entries"
688        );
689        // Check that at least one font has "Helvetica" in its name
690        let has_helvetica = fonts.values().any(|f| f.name.contains("Helvetica"));
691        assert!(
692            has_helvetica,
693            "simple.pdf should have a Helvetica font, got: {:?}",
694            fonts
695        );
696        // No warnings expected for a well-formed page
697        let malformed_warnings: Vec<_> = warnings
698            .iter()
699            .filter(|w| matches!(w, Warning::MissingFontMetrics { .. }))
700            .collect();
701        assert!(
702            malformed_warnings.is_empty(),
703            "well-formed page should not produce MissingFontMetrics warnings"
704        );
705    }
706
707    #[test]
708    fn resolve_fonts_for_page_bold_italic_returns_bold_and_italic_fonts() {
709        let doc = load_fixture("bold-italic.pdf");
710        let (fonts, _) = resolve_fonts_for_page(&doc, 1);
711        assert!(
712            !fonts.is_empty(),
713            "bold-italic.pdf page 1 should have font entries"
714        );
715        // Should have a Bold font and an Oblique/Italic font
716        let names: Vec<&str> = fonts.values().map(|f| f.name.as_str()).collect();
717        let has_bold = names.iter().any(|n| n.to_lowercase().contains("bold"));
718        let has_italic = names
719            .iter()
720            .any(|n| n.to_lowercase().contains("oblique") || n.to_lowercase().contains("italic"));
721        assert!(
722            has_bold,
723            "bold-italic.pdf should have a Bold font, got: {:?}",
724            names
725        );
726        assert!(
727            has_italic,
728            "bold-italic.pdf should have an Oblique/Italic font, got: {:?}",
729            names
730        );
731    }
732
733    #[test]
734    fn resolve_fonts_for_page_preserves_resource_names_as_keys() {
735        let doc = load_fixture("simple.pdf");
736        let (fonts, _) = resolve_fonts_for_page(&doc, 1);
737        // Keys should be font resource names like b"F1", b"Helv", etc.
738        for key in fonts.keys() {
739            assert!(
740                !key.is_empty(),
741                "font resource name key should not be empty"
742            );
743        }
744    }
745
746    // ── extract_text_segments_for_page tests ──
747
748    #[test]
749    fn extract_text_segments_for_page_simple_returns_segments() {
750        let doc = load_fixture("simple.pdf");
751        let (fonts, _) = resolve_fonts_for_page(&doc, 1);
752        let (segments, warnings) = extract_text_segments_for_page(&doc, 1, &fonts);
753        assert!(
754            !segments.is_empty(),
755            "simple.pdf page 1 should produce text segments"
756        );
757        // All segments should be page 1
758        for seg in &segments {
759            assert_eq!(seg.page_number, 1, "all segments should be page 1");
760        }
761        // Combined text should contain "Chapter 1" (from oracle)
762        let combined: String = segments.iter().map(|s| s.text.as_str()).collect();
763        assert!(
764            combined.contains("Chapter 1"),
765            "simple.pdf should contain 'Chapter 1', got: {:?}",
766            combined
767        );
768        // No UnreadableTextStream warnings for well-formed page
769        for w in &warnings {
770            if let Warning::UnreadableTextStream { .. } = w {
771                panic!("well-formed page should not produce UnreadableTextStream");
772            }
773        }
774    }
775
776    #[test]
777    fn extract_text_segments_for_page_bold_italic_returns_different_fonts() {
778        let doc = load_fixture("bold-italic.pdf");
779        let (fonts, _) = resolve_fonts_for_page(&doc, 1);
780        let (segments, _) = extract_text_segments_for_page(&doc, 1, &fonts);
781        assert!(
782            !segments.is_empty(),
783            "bold-italic.pdf page 1 should produce text segments"
784        );
785        // Should have segments with different font resource names (bold vs italic vs regular)
786        let unique_fonts: std::collections::HashSet<&Vec<u8>> =
787            segments.iter().map(|s| &s.font_resource_name).collect();
788        assert!(
789            unique_fonts.len() >= 2,
790            "bold-italic.pdf should use at least 2 different fonts, got: {:?}",
791            unique_fonts
792        );
793    }
794
795    #[test]
796    fn extract_text_segments_for_page_font_size_comes_from_tf_state() {
797        let doc = load_fixture("simple.pdf");
798        let (fonts, _) = resolve_fonts_for_page(&doc, 1);
799        let (segments, _) = extract_text_segments_for_page(&doc, 1, &fonts);
800        // All segments should have a positive font size from Tf state
801        for seg in &segments {
802            assert!(
803                seg.font_size > 0.0,
804                "font_size should be positive (from Tf), got: {}",
805                seg.font_size
806            );
807        }
808    }
809
810    #[test]
811    fn extract_text_segments_for_page_preserves_operator_encounter_order() {
812        let doc = load_fixture("simple.pdf");
813        let (fonts, _) = resolve_fonts_for_page(&doc, 1);
814        let (segments, _) = extract_text_segments_for_page(&doc, 1, &fonts);
815        // Segments should appear in content stream order
816        // For simple.pdf, "Chapter 1" should come before "Body text." (per oracle)
817        let combined: String = segments.iter().map(|s| s.text.as_str()).collect();
818        if combined.contains("Chapter 1") && combined.contains("Body text.") {
819            let chapter_pos = combined.find("Chapter 1").unwrap();
820            let body_pos = combined.find("Body text.").unwrap();
821            assert!(
822                chapter_pos < body_pos,
823                "Chapter 1 should come before Body text. in operator order"
824            );
825        }
826    }
827
828    // ── get_text_state_or_default tests ──
829
830    #[test]
831    fn get_text_state_or_default_with_tf_set_returns_current_state() {
832        let font_res = Some(b"F1".to_vec());
833        let mut warned = false;
834        let mut warnings = Vec::new();
835        let (res, size) =
836            get_text_state_or_default(&font_res, Some(12.0), true, &mut warned, 1, &mut warnings);
837        assert_eq!(res, b"F1");
838        assert_eq!(size, 12.0);
839        assert!(
840            warnings.is_empty(),
841            "should not emit warning when Tf is set"
842        );
843        assert!(!warned, "warned flag should remain false");
844    }
845
846    #[test]
847    fn get_text_state_or_default_without_tf_returns_defaults_and_warns_once() {
848        let mut warned = false;
849        let mut warnings = Vec::new();
850
851        // First call: should emit warning
852        let (res1, size1) =
853            get_text_state_or_default(&None, None, false, &mut warned, 1, &mut warnings);
854        assert_eq!(res1, b"<unknown>");
855        assert_eq!(size1, 0.0);
856        assert_eq!(warnings.len(), 1, "should emit exactly one warning");
857        match &warnings[0] {
858            Warning::MalformedPdfObject { detail } => {
859                assert!(detail.contains("text state not set before Tj/TJ"));
860            }
861            other => panic!("expected MalformedPdfObject, got {:?}", other),
862        }
863        assert!(warned, "warned flag should be set after first call");
864
865        // Second call: should NOT emit another warning (plan §5: "emit one")
866        let (res2, size2) =
867            get_text_state_or_default(&None, None, false, &mut warned, 1, &mut warnings);
868        assert_eq!(res2, b"<unknown>");
869        assert_eq!(size2, 0.0);
870        assert_eq!(
871            warnings.len(),
872            1,
873            "should still have exactly one warning after second call"
874        );
875    }
876
877    #[test]
878    fn get_text_state_or_default_warned_resets_across_text_objects() {
879        let mut warned = false;
880        let mut warnings = Vec::new();
881
882        // First text object: emit one warning
883        get_text_state_or_default(&None, None, false, &mut warned, 1, &mut warnings);
884        assert_eq!(warnings.len(), 1);
885
886        // Simulate BT: reset warned flag (caller is responsible for this)
887        warned = false;
888
889        // Second text object: should emit another warning
890        get_text_state_or_default(&None, None, false, &mut warned, 1, &mut warnings);
891        assert_eq!(
892            warnings.len(),
893            2,
894            "should have two warnings for two separate text objects"
895        );
896    }
897
898    // ── parse_pdf tests ──
899
900    #[test]
901    fn parse_pdf_simple_returns_metadata_and_segments() {
902        let path = fixture_path("simple.pdf");
903        let bytes = std::fs::read(&path).unwrap();
904        let (segments, metadata, warnings) = parse_pdf(&bytes);
905
906        // Metadata
907        assert_eq!(metadata.page_count, 1, "simple.pdf has 1 page");
908
909        // Segments should not be empty
910        assert!(!segments.is_empty(), "should produce segments");
911
912        // All segments should be page 1 (1-based)
913        for seg in &segments {
914            assert_eq!(seg.page_number, 1, "all segments should be page 1");
915        }
916
917        // Combined text should contain oracle-expected content
918        let combined: String = segments.iter().map(|s| s.text.as_str()).collect();
919        assert!(combined.contains("Chapter 1"), "should contain 'Chapter 1'");
920        assert!(
921            combined.contains("Body text."),
922            "should contain 'Body text.'"
923        );
924
925        // No critical warnings for valid PDF
926        for w in &warnings {
927            if let Warning::UnreadableTextStream { .. } = w {
928                panic!("valid PDF should not produce UnreadableTextStream");
929            }
930        }
931    }
932
933    #[test]
934    fn parse_pdf_failed_load_returns_empty_with_warning() {
935        let (segments, metadata, warnings) = parse_pdf(b"not a pdf");
936        assert!(
937            segments.is_empty(),
938            "failed load should produce no segments"
939        );
940        assert_eq!(
941            metadata.page_count, 0,
942            "failed load should have page_count=0"
943        );
944        assert!(metadata.title.is_none(), "failed load should have no title");
945        assert!(
946            metadata.author.is_none(),
947            "failed load should have no author"
948        );
949        assert!(!warnings.is_empty(), "failed load should produce warnings");
950        match &warnings[0] {
951            Warning::MalformedPdfObject { detail } => {
952                assert!(!detail.is_empty());
953            }
954            other => panic!("expected MalformedPdfObject, got {:?}", other),
955        }
956    }
957
958    #[test]
959    fn parse_pdf_empty_bytes_returns_empty_with_warning() {
960        let (segments, metadata, warnings) = parse_pdf(b"");
961        assert!(segments.is_empty());
962        assert_eq!(metadata.page_count, 0);
963        assert!(!warnings.is_empty());
964        match &warnings[0] {
965            Warning::MalformedPdfObject { .. } => {}
966            other => panic!("expected MalformedPdfObject, got {:?}", other),
967        }
968    }
969
970    #[test]
971    fn parse_pdf_corrupted_fixture_returns_empty_with_warning() {
972        let path = fixture_path("corrupted.pdf");
973        let bytes = std::fs::read(&path).unwrap();
974        let (segments, metadata, warnings) = parse_pdf(&bytes);
975        assert!(
976            segments.is_empty(),
977            "corrupted PDF should produce no segments"
978        );
979        assert_eq!(metadata.page_count, 0);
980        assert!(!warnings.is_empty());
981        match &warnings[0] {
982            Warning::MalformedPdfObject { detail } => {
983                assert!(!detail.is_empty());
984            }
985            other => panic!("expected MalformedPdfObject, got {:?}", other),
986        }
987    }
988
989    #[test]
990    fn parse_pdf_multi_page_has_1_based_page_numbers() {
991        let path = fixture_path("multi-page.pdf");
992        let bytes = std::fs::read(&path).unwrap();
993        let (segments, metadata, _) = parse_pdf(&bytes);
994
995        assert!(
996            metadata.page_count >= 2,
997            "multi-page.pdf should have 2+ pages"
998        );
999
1000        // Check that segments reference pages starting from 1, not 0
1001        let min_page = segments.iter().map(|s| s.page_number).min().unwrap_or(0);
1002        assert_eq!(min_page, 1, "minimum page number should be 1 (1-based)");
1003
1004        // Check that segments span multiple pages
1005        let max_page = segments.iter().map(|s| s.page_number).max().unwrap_or(0);
1006        assert!(
1007            max_page >= 2,
1008            "multi-page.pdf should have segments from page 2+, got max={}",
1009            max_page
1010        );
1011    }
1012
1013    #[test]
1014    fn parse_pdf_aggregates_warnings_in_stable_order() {
1015        // Calling parse_pdf twice on the same input should produce identical warnings
1016        let path = fixture_path("simple.pdf");
1017        let bytes = std::fs::read(&path).unwrap();
1018        let (_, _, warnings1) = parse_pdf(&bytes);
1019        let (_, _, warnings2) = parse_pdf(&bytes);
1020        assert_eq!(
1021            warnings1.len(),
1022            warnings2.len(),
1023            "warnings should be deterministic"
1024        );
1025        assert_eq!(
1026            warnings1, warnings2,
1027            "warnings should be stable across runs"
1028        );
1029    }
1030
1031    // ── decode_pdf_string tests ──
1032
1033    #[test]
1034    fn decode_pdf_string_winansi_ascii() {
1035        // Pure ASCII bytes should decode to the same string
1036        let result = decode_pdf_string(b"Hello World");
1037        assert_eq!(result, "Hello World");
1038    }
1039
1040    #[test]
1041    fn decode_pdf_string_winansi_high_latin_range() {
1042        // Bytes 0xA0+: same as Latin-1 — é (0xE9), ü (0xFC), ñ (0xF1)
1043        let result = decode_pdf_string(&[0xE9, 0xFC, 0xF1]);
1044        assert_eq!(result, "\u{00E9}\u{00FC}\u{00F1}");
1045        assert!(
1046            !result.contains(char::REPLACEMENT_CHARACTER),
1047            "valid WinAnsi high-latin should not produce replacement chars"
1048        );
1049    }
1050
1051    #[test]
1052    fn decode_pdf_string_winansi_0x80_to_0x9f_range() {
1053        // 0x80 = € (U+20AC), 0x93 = " (U+201C), 0x96 = – (U+2013), 0x97 = — (U+2014)
1054        let result = decode_pdf_string(&[0x80, 0x93, 0x96, 0x97]);
1055        assert_eq!(result, "\u{20AC}\u{201C}\u{2013}\u{2014}");
1056    }
1057
1058    #[test]
1059    fn decode_pdf_string_winansi_undefined_bytes_use_replacement() {
1060        // 0x81 and 0x8D are undefined in WinAnsi — should produce replacement chars
1061        let result = decode_pdf_string(&[0x81, 0x8D]);
1062        assert_eq!(result, "\u{FFFD}\u{FFFD}");
1063    }
1064
1065    #[test]
1066    fn decode_pdf_string_utf16be_with_bom() {
1067        // UTF-16BE BOM (FE FF) followed by "Hi" (0x0048 0x0069)
1068        let bytes = [0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69];
1069        let result = decode_pdf_string(&bytes);
1070        assert_eq!(result, "Hi");
1071        assert!(
1072            !result.contains(char::REPLACEMENT_CHARACTER),
1073            "valid UTF-16BE should not produce replacement chars"
1074        );
1075    }
1076
1077    #[test]
1078    fn decode_pdf_string_utf16be_without_bom() {
1079        // UTF-16BE without BOM: starts with 0x00 and even length
1080        // "AB" = 0x0041 0x0042
1081        let bytes = [0x00, 0x41, 0x00, 0x42];
1082        let result = decode_pdf_string(&bytes);
1083        assert_eq!(result, "AB");
1084    }
1085
1086    #[test]
1087    fn decode_pdf_string_utf16be_with_non_ascii() {
1088        // UTF-16BE BOM + "café" = c(0x0063) a(0x0061) f(0x0066) é(0x00E9)
1089        let bytes = [0xFE, 0xFF, 0x00, 0x63, 0x00, 0x61, 0x00, 0x66, 0x00, 0xE9];
1090        let result = decode_pdf_string(&bytes);
1091        assert_eq!(result, "caf\u{00E9}");
1092    }
1093
1094    #[test]
1095    fn decode_pdf_string_empty_bytes() {
1096        let result = decode_pdf_string(b"");
1097        assert_eq!(result, "");
1098    }
1099
1100    #[test]
1101    fn decode_pdf_string_single_byte_not_utf16() {
1102        // Single byte can't be UTF-16BE — should fallback to WinAnsi
1103        let result = decode_pdf_string(&[0x41]);
1104        assert_eq!(result, "A");
1105    }
1106
1107    #[test]
1108    fn decode_utf16be_invalid_surrogate_uses_replacement() {
1109        // Unpaired high surrogate: 0xD800
1110        let bytes = [0xD8, 0x00];
1111        let result = decode_utf16be(&bytes);
1112        assert!(
1113            result.contains(char::REPLACEMENT_CHARACTER),
1114            "invalid surrogate should produce replacement char, got: {:?}",
1115            result
1116        );
1117    }
1118
1119    // ── winansi_byte_to_char tests ──
1120
1121    #[test]
1122    fn winansi_byte_to_char_ascii_range() {
1123        assert_eq!(winansi_byte_to_char(0x41), 'A');
1124        assert_eq!(winansi_byte_to_char(0x20), ' ');
1125        assert_eq!(winansi_byte_to_char(0x7F), '\x7F');
1126    }
1127
1128    #[test]
1129    fn winansi_byte_to_char_special_range() {
1130        assert_eq!(winansi_byte_to_char(0x80), '\u{20AC}'); // Euro
1131        assert_eq!(winansi_byte_to_char(0x91), '\u{2018}'); // Left single quote
1132        assert_eq!(winansi_byte_to_char(0x92), '\u{2019}'); // Right single quote
1133        assert_eq!(winansi_byte_to_char(0x93), '\u{201C}'); // Left double quote
1134        assert_eq!(winansi_byte_to_char(0x94), '\u{201D}'); // Right double quote
1135        assert_eq!(winansi_byte_to_char(0x96), '\u{2013}'); // En dash
1136        assert_eq!(winansi_byte_to_char(0x97), '\u{2014}'); // Em dash
1137        assert_eq!(winansi_byte_to_char(0x99), '\u{2122}'); // TM
1138    }
1139
1140    #[test]
1141    fn winansi_byte_to_char_high_latin_range() {
1142        assert_eq!(winansi_byte_to_char(0xA0), '\u{00A0}'); // NBSP
1143        assert_eq!(winansi_byte_to_char(0xE9), '\u{00E9}'); // é
1144        assert_eq!(winansi_byte_to_char(0xFF), '\u{00FF}'); // ÿ
1145    }
1146
1147    // ── strip_subset_prefix direct tests ──
1148
1149    #[test]
1150    fn strip_subset_prefix_strips_valid_prefix() {
1151        assert_eq!(
1152            strip_subset_prefix("ABCDEF+Helvetica-Bold"),
1153            "Helvetica-Bold"
1154        );
1155    }
1156
1157    #[test]
1158    fn strip_subset_prefix_strips_any_six_uppercase_letters() {
1159        assert_eq!(strip_subset_prefix("ZZZZZZ+TimesNewRoman"), "TimesNewRoman");
1160    }
1161
1162    #[test]
1163    fn strip_subset_prefix_leaves_non_prefixed_name() {
1164        assert_eq!(strip_subset_prefix("Helvetica"), "Helvetica");
1165    }
1166
1167    #[test]
1168    fn strip_subset_prefix_leaves_short_names() {
1169        assert_eq!(strip_subset_prefix("AB+X"), "AB+X");
1170    }
1171
1172    #[test]
1173    fn strip_subset_prefix_leaves_lowercase_prefix() {
1174        // Lowercase letters before + don't match the subset pattern
1175        assert_eq!(strip_subset_prefix("abcdef+Font"), "abcdef+Font");
1176    }
1177
1178    #[test]
1179    fn strip_subset_prefix_leaves_mixed_case_prefix() {
1180        assert_eq!(strip_subset_prefix("ABCDEf+Font"), "ABCDEf+Font");
1181    }
1182
1183    #[test]
1184    fn strip_subset_prefix_leaves_empty_string() {
1185        assert_eq!(strip_subset_prefix(""), "");
1186    }
1187}
papyrus_core/parser/mod.rs

papyrus_core/parser/
mod.rs