1use std::collections::HashMap;
2
3use crate::ast::{DocumentMetadata, Warning};
4
/// Font attributes resolved for one entry of a page's font resources.
#[derive(Clone, Debug, PartialEq)]
pub struct FontInfo {
    /// Font name taken from the `BaseFont` entry.
    pub name: String,
    /// Font size, when known.
    pub size: Option<f32>,
    /// Value of the font descriptor's `FontWeight` entry, when present and numeric.
    pub font_weight: Option<f32>,
    /// Value of the font descriptor's `ItalicAngle` entry, when present and numeric.
    pub italic_angle: Option<f32>,
}
17
/// One run of text produced by a single text-showing operator.
#[derive(Clone, Debug, PartialEq)]
pub struct RawTextSegment {
    /// Decoded text of the run.
    pub text: String,
    /// Resource name of the font that was selected when the text was shown.
    pub font_resource_name: Vec<u8>,
    /// Font size that was selected when the text was shown.
    pub font_size: f32,
    /// Page the run was found on.
    pub page_number: usize,
}
30
31pub fn load_pdf(bytes: &[u8]) -> (Option<lopdf::Document>, Vec<Warning>) {
35 if bytes.is_empty() {
36 return (
37 None,
38 vec![Warning::MalformedPdfObject {
39 detail: "empty PDF bytes".to_string(),
40 }],
41 );
42 }
43
44 match lopdf::Document::load_mem(bytes) {
45 Ok(doc) => (Some(doc), Vec::new()),
46 Err(e) => (
47 None,
48 vec![Warning::MalformedPdfObject {
49 detail: format!("failed to load PDF: {}", e),
50 }],
51 ),
52 }
53}
54
55fn extract_font_descriptor_metrics(
60 doc: &lopdf::Document,
61 font_dict: &lopdf::Dictionary,
62) -> (Option<f32>, Option<f32>) {
63 let descriptor = font_dict
64 .get(b"FontDescriptor")
65 .ok()
66 .and_then(|obj| match obj {
67 lopdf::Object::Reference(id) => doc.get_object(*id).ok(),
68 other => Some(other),
69 })
70 .and_then(|obj| obj.as_dict().ok());
71
72 let Some(desc) = descriptor else {
73 return (None, None);
74 };
75
76 let font_weight = desc.get(b"FontWeight").ok().and_then(extract_number);
77 let italic_angle = desc.get(b"ItalicAngle").ok().and_then(extract_number);
78
79 (font_weight, italic_angle)
80}
81
/// Removes a leading subset tag of the form `ABCDEF+` from a font name.
///
/// The tag is stripped only when the name starts with exactly six ASCII
/// uppercase letters immediately followed by `+`; any other name is returned
/// unchanged.
pub(crate) fn strip_subset_prefix(name: &str) -> &str {
    // Splitting at the first '+' and requiring a 6-byte all-uppercase tag is
    // the same condition as "bytes 0..6 are uppercase and byte 6 is '+'".
    match name.split_once('+') {
        Some((tag, rest)) if tag.len() == 6 && tag.bytes().all(|b| b.is_ascii_uppercase()) => {
            rest
        }
        _ => name,
    }
}
97
98pub fn resolve_fonts_for_page(
103 doc: &lopdf::Document,
104 page_number: usize,
105) -> (HashMap<Vec<u8>, FontInfo>, Vec<Warning>) {
106 let mut fonts = HashMap::new();
107 let mut warnings = Vec::new();
108
109 let pages = doc.get_pages();
111 let page_num_u32 = match u32::try_from(page_number) {
112 Ok(n) => n,
113 Err(_) => {
114 warnings.push(Warning::MalformedPdfObject {
115 detail: format!("page number {} exceeds u32 range", page_number),
116 });
117 return (fonts, warnings);
118 }
119 };
120 let page_id = match pages.get(&page_num_u32) {
121 Some(id) => *id,
122 None => {
123 warnings.push(Warning::MalformedPdfObject {
124 detail: format!(
125 "page {} not found (document has {} pages)",
126 page_number,
127 pages.len()
128 ),
129 });
130 return (fonts, warnings);
131 }
132 };
133
134 let page_fonts = match doc.get_page_fonts(page_id) {
136 Ok(f) => f,
137 Err(e) => {
138 warnings.push(Warning::MalformedPdfObject {
139 detail: format!(
140 "failed to read font resources for page {}: {}",
141 page_number, e
142 ),
143 });
144 return (fonts, warnings);
145 }
146 };
147
148 for (resource_name, font_dict) in page_fonts {
149 let base_font_name = match font_dict.get(b"BaseFont") {
151 Ok(obj) => match obj.as_name() {
152 Ok(name_bytes) => {
153 let raw_name = String::from_utf8_lossy(name_bytes).to_string();
154 strip_subset_prefix(&raw_name).to_string()
155 }
156 Err(_) => {
157 warnings.push(Warning::MissingFontMetrics {
158 font_name: "<unknown>".to_string(),
159 page: page_number,
160 });
161 continue;
162 }
163 },
164 Err(_) => {
165 warnings.push(Warning::MissingFontMetrics {
166 font_name: "<unknown>".to_string(),
167 page: page_number,
168 });
169 continue;
170 }
171 };
172
173 let (font_weight, italic_angle) = extract_font_descriptor_metrics(doc, &font_dict);
179
180 fonts.insert(
181 resource_name,
182 FontInfo {
183 name: base_font_name,
184 size: None,
185 font_weight,
186 italic_angle,
187 },
188 );
189 }
190
191 (fonts, warnings)
192}
193
/// Characters for bytes `0x80..=0x9F`, indexed by `byte - 0x80`.
///
/// Slots holding `U+FFFD` are the bytes this table treats as undefined.
const WINANSI_0X80_TO_0X9F: [char; 32] = [
    // 0x80 - 0x83
    '\u{20AC}', '\u{FFFD}', '\u{201A}', '\u{0192}',
    // 0x84 - 0x87
    '\u{201E}', '\u{2026}', '\u{2020}', '\u{2021}',
    // 0x88 - 0x8B
    '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}',
    // 0x8C - 0x8F
    '\u{0152}', '\u{FFFD}', '\u{017D}', '\u{FFFD}',
    // 0x90 - 0x93
    '\u{FFFD}', '\u{2018}', '\u{2019}', '\u{201C}',
    // 0x94 - 0x97
    '\u{201D}', '\u{2022}', '\u{2013}', '\u{2014}',
    // 0x98 - 0x9B
    '\u{02DC}', '\u{2122}', '\u{0161}', '\u{203A}',
    // 0x9C - 0x9F
    '\u{0153}', '\u{FFFD}', '\u{017E}', '\u{0178}',
];
235
236fn winansi_byte_to_char(b: u8) -> char {
238 if b < 0x80 {
239 b as char
240 } else if b <= 0x9F {
241 WINANSI_0X80_TO_0X9F[(b - 0x80) as usize]
242 } else {
243 b as char
245 }
246}
247
248fn decode_pdf_string(bytes: &[u8]) -> String {
252 if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
254 return decode_utf16be(&bytes[2..]);
256 }
257
258 if bytes.len() >= 2 && bytes.len().is_multiple_of(2) && bytes[0] == 0x00 {
263 return decode_utf16be(bytes);
264 }
265
266 bytes.iter().map(|&b| winansi_byte_to_char(b)).collect()
268}
269
/// Decodes big-endian UTF-16 bytes, substituting `U+FFFD` for invalid data.
///
/// Bytes are consumed in pairs; a trailing odd byte is ignored. Unpaired
/// surrogates become `char::REPLACEMENT_CHARACTER`.
fn decode_utf16be(bytes: &[u8]) -> String {
    let mut units = Vec::with_capacity(bytes.len() / 2);
    for pair in bytes.chunks_exact(2) {
        units.push(u16::from_be_bytes([pair[0], pair[1]]));
    }
    String::from_utf16_lossy(&units)
}
279
280pub fn extract_text_segments_for_page(
285 doc: &lopdf::Document,
286 page_number: usize,
287 _fonts: &HashMap<Vec<u8>, FontInfo>,
288) -> (Vec<RawTextSegment>, Vec<Warning>) {
289 let mut segments = Vec::new();
290 let mut warnings = Vec::new();
291
292 let pages = doc.get_pages();
294 let page_num_u32 = match u32::try_from(page_number) {
295 Ok(n) => n,
296 Err(_) => return (segments, warnings),
297 };
298 let page_id = match pages.get(&page_num_u32) {
299 Some(id) => *id,
300 None => return (segments, warnings),
301 };
302
303 let content = match doc.get_and_decode_page_content(page_id) {
305 Ok(c) => c,
306 Err(e) => {
307 warnings.push(Warning::UnreadableTextStream {
308 page: page_number,
309 detail: format!("failed to decode content stream: {}", e),
310 });
311 return (segments, warnings);
312 }
313 };
314
315 let mut current_font_resource: Option<Vec<u8>> = None;
317 let mut current_font_size: Option<f32> = None;
318 let mut tf_set_in_text_object = false;
319 let mut warned_no_tf = false;
320
321 for op in content.operations.iter() {
322 match op.operator.as_str() {
323 "BT" => {
324 current_font_resource = None;
326 current_font_size = None;
327 tf_set_in_text_object = false;
328 warned_no_tf = false;
329 }
330 "ET" => {
331 current_font_resource = None;
333 current_font_size = None;
334 tf_set_in_text_object = false;
335 warned_no_tf = false;
336 }
337 "Tf" => {
338 if op.operands.len() >= 2 {
340 if let Some(name_bytes) = extract_name(&op.operands[0]) {
341 current_font_resource = Some(name_bytes);
342 }
343 if let Some(size) = extract_number(&op.operands[1]) {
344 current_font_size = Some(size);
345 }
346 tf_set_in_text_object = true;
347 }
348 }
349 "Tj" => {
350 if let Some(text_bytes) = op.operands.first().and_then(extract_string_bytes) {
352 let text = decode_pdf_string(&text_bytes);
353 if !text.is_empty() {
354 let (font_res, font_sz) = get_text_state_or_default(
355 ¤t_font_resource,
356 current_font_size,
357 tf_set_in_text_object,
358 &mut warned_no_tf,
359 page_number,
360 &mut warnings,
361 );
362 segments.push(RawTextSegment {
363 text,
364 font_resource_name: font_res,
365 font_size: font_sz,
366 page_number,
367 });
368 }
369 }
370 }
371 "TJ" => {
372 if let Some(lopdf::Object::Array(arr)) = op.operands.first() {
376 let mut combined = String::new();
377 let mut prev_was_string = false;
378 let mut needs_space = false;
379
380 for item in arr {
381 if let Some(bytes) = extract_string_bytes(item) {
382 let text = decode_pdf_string(&bytes);
383 if needs_space
385 && !combined.is_empty()
386 && !combined.ends_with(char::is_whitespace)
387 && !text.starts_with(char::is_whitespace)
388 {
389 combined.push(' ');
390 }
391 combined.push_str(&text);
392 prev_was_string = true;
393 needs_space = false;
394 } else if let Some(num) = extract_number(item) {
395 if prev_was_string && num < -100.0 {
399 needs_space = true;
400 }
401 }
402 }
403 if !combined.is_empty() {
404 let (font_res, font_sz) = get_text_state_or_default(
405 ¤t_font_resource,
406 current_font_size,
407 tf_set_in_text_object,
408 &mut warned_no_tf,
409 page_number,
410 &mut warnings,
411 );
412 segments.push(RawTextSegment {
413 text: combined,
414 font_resource_name: font_res,
415 font_size: font_sz,
416 page_number,
417 });
418 }
419 }
420 }
421 _ => {
422 }
424 }
425 }
426
427 (segments, warnings)
428}
429
430fn get_text_state_or_default(
433 current_font_resource: &Option<Vec<u8>>,
434 current_font_size: Option<f32>,
435 tf_set: bool,
436 warned_no_tf: &mut bool,
437 page_number: usize,
438 warnings: &mut Vec<Warning>,
439) -> (Vec<u8>, f32) {
440 if tf_set {
441 (
442 current_font_resource
443 .clone()
444 .unwrap_or_else(|| b"<unknown>".to_vec()),
445 current_font_size.unwrap_or(0.0),
446 )
447 } else {
448 if !*warned_no_tf {
449 warnings.push(Warning::MalformedPdfObject {
450 detail: format!("text state not set before Tj/TJ on page {}", page_number),
451 });
452 *warned_no_tf = true;
453 }
454 (b"<unknown>".to_vec(), 0.0)
455 }
456}
457
458fn extract_name(obj: &lopdf::Object) -> Option<Vec<u8>> {
460 match obj {
461 lopdf::Object::Name(n) => Some(n.clone()),
462 _ => None,
463 }
464}
465
466fn extract_number(obj: &lopdf::Object) -> Option<f32> {
468 match obj {
469 lopdf::Object::Real(f) => Some(*f),
470 lopdf::Object::Integer(i) => Some(*i as f32),
471 _ => None,
472 }
473}
474
475fn extract_string_bytes(obj: &lopdf::Object) -> Option<Vec<u8>> {
477 match obj {
478 lopdf::Object::String(bytes, _) => Some(bytes.clone()),
479 _ => None,
480 }
481}
482
483pub fn parse_pdf(bytes: &[u8]) -> (Vec<RawTextSegment>, DocumentMetadata, Vec<Warning>) {
487 let mut all_segments = Vec::new();
488 let mut all_warnings = Vec::new();
489
490 let (doc_opt, load_warnings) = load_pdf(bytes);
492 all_warnings.extend(load_warnings);
493
494 let doc = match doc_opt {
495 Some(d) => d,
496 None => {
497 return (
498 all_segments,
499 DocumentMetadata {
500 title: None,
501 author: None,
502 page_count: 0,
503 },
504 all_warnings,
505 );
506 }
507 };
508
509 let pages = doc.get_pages();
511 let page_count = pages.len();
512
513 let (title, author) = extract_doc_info(&doc);
515
516 let metadata = DocumentMetadata {
517 title,
518 author,
519 page_count,
520 };
521
522 let mut page_numbers: Vec<u32> = pages.keys().copied().collect();
524 page_numbers.sort();
525
526 for &page_num in &page_numbers {
527 let page_number = page_num as usize;
529
530 let (fonts, font_warnings) = resolve_fonts_for_page(&doc, page_number);
532 all_warnings.extend(font_warnings);
533
534 let (segments, extract_warnings) =
536 extract_text_segments_for_page(&doc, page_number, &fonts);
537 all_warnings.extend(extract_warnings);
538
539 all_segments.extend(segments);
540 }
541
542 (all_segments, metadata, all_warnings)
543}
544
545pub(crate) fn extract_doc_info_pub(doc: &lopdf::Document) -> (Option<String>, Option<String>) {
547 extract_doc_info(doc)
548}
549
550fn extract_doc_info(doc: &lopdf::Document) -> (Option<String>, Option<String>) {
552 let info_dict = doc
554 .trailer
555 .get(b"Info")
556 .ok()
557 .and_then(|obj| match obj {
558 lopdf::Object::Reference(id) => doc.get_object(*id).ok(),
559 _ => Some(obj),
560 })
561 .and_then(|obj| obj.as_dict().ok());
562
563 let info = match info_dict {
564 Some(d) => d,
565 None => return (None, None),
566 };
567
568 let title = get_info_string(info, b"Title");
569 let author = get_info_string(info, b"Author");
570
571 (title, author)
572}
573
574fn get_info_string(info: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
576 info.get(key).ok().and_then(|obj| match obj {
577 lopdf::Object::String(bytes, _) => {
578 let s = decode_pdf_string(bytes);
579 if s.is_empty() {
580 None
581 } else {
582 Some(s)
583 }
584 }
585 _ => None,
586 })
587}
588
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    // ---- shared helpers -------------------------------------------------

    /// Path of `name` under `tests/fixtures` in the parent directory of the
    /// crate manifest directory.
    fn fixture_path(name: &str) -> PathBuf {
        let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
        let workspace_root = crate_dir.parent().expect("workspace root");
        workspace_root.join("tests").join("fixtures").join(name)
    }

    /// Loads a fixture that is expected to be a valid PDF, panicking otherwise.
    fn load_fixture(name: &str) -> lopdf::Document {
        let path = fixture_path(name);
        let bytes = match std::fs::read(&path) {
            Ok(bytes) => bytes,
            Err(e) => panic!("fixture {} must exist at {:?}: {}", name, path, e),
        };
        load_pdf(&bytes).0.expect("fixture should be a valid PDF")
    }

    /// Returns the detail text of a `MalformedPdfObject` warning and panics
    /// for any other warning kind.
    fn malformed_detail(warning: &Warning) -> &str {
        match warning {
            Warning::MalformedPdfObject { detail } => detail.as_str(),
            other => panic!("expected MalformedPdfObject, got {:?}", other),
        }
    }

    // ---- load_pdf -------------------------------------------------------

    #[test]
    fn load_pdf_empty_bytes_returns_none_with_warning() {
        let (document, warnings) = load_pdf(b"");
        assert!(
            document.is_none(),
            "empty bytes should not produce a document"
        );
        assert!(!warnings.is_empty(), "should emit at least one warning");
        assert!(
            !malformed_detail(&warnings[0]).is_empty(),
            "detail should be non-empty"
        );
    }

    #[test]
    fn load_pdf_invalid_header_returns_none_with_warning() {
        let (document, warnings) = load_pdf(b"this is not a PDF");
        assert!(
            document.is_none(),
            "invalid header should not produce a document"
        );
        assert!(!warnings.is_empty(), "should emit at least one warning");
        assert!(
            !malformed_detail(&warnings[0]).is_empty(),
            "detail should be non-empty"
        );
    }

    #[test]
    fn load_pdf_corrupted_fixture_returns_none_with_warning() {
        let path = fixture_path("corrupted.pdf");
        let bytes = match std::fs::read(&path) {
            Ok(bytes) => bytes,
            Err(e) => panic!("corrupted.pdf fixture must exist at {:?}: {}", path, e),
        };
        let (document, warnings) = load_pdf(&bytes);
        assert!(
            document.is_none(),
            "corrupted PDF should not produce a document"
        );
        assert!(!warnings.is_empty(), "should emit at least one warning");
        assert!(
            !malformed_detail(&warnings[0]).is_empty(),
            "detail should be non-empty"
        );
    }

    #[test]
    fn load_pdf_valid_simple_fixture_returns_some() {
        let path = fixture_path("simple.pdf");
        let bytes = match std::fs::read(&path) {
            Ok(bytes) => bytes,
            Err(e) => panic!("simple.pdf fixture must exist at {:?}: {}", path, e),
        };
        let (document, warnings) = load_pdf(&bytes);
        assert!(document.is_some(), "valid PDF should produce a document");
        assert!(
            !warnings
                .iter()
                .any(|w| matches!(w, Warning::MalformedPdfObject { .. })),
            "valid PDF should not produce MalformedPdfObject warning"
        );
    }

    // ---- resolve_fonts_for_page -----------------------------------------

    #[test]
    fn resolve_fonts_for_page_simple_returns_font_entries() {
        let doc = load_fixture("simple.pdf");
        let (fonts, warnings) = resolve_fonts_for_page(&doc, 1);
        assert!(
            !fonts.is_empty(),
            "simple.pdf page 1 should have font entries"
        );
        assert!(
            fonts.values().any(|f| f.name.contains("Helvetica")),
            "simple.pdf should have a Helvetica font, got: {:?}",
            fonts
        );
        let missing_metrics = warnings
            .iter()
            .filter(|w| matches!(w, Warning::MissingFontMetrics { .. }))
            .count();
        assert!(
            missing_metrics == 0,
            "well-formed page should not produce MissingFontMetrics warnings"
        );
    }

    #[test]
    fn resolve_fonts_for_page_bold_italic_returns_bold_and_italic_fonts() {
        let doc = load_fixture("bold-italic.pdf");
        let (fonts, _) = resolve_fonts_for_page(&doc, 1);
        assert!(
            !fonts.is_empty(),
            "bold-italic.pdf page 1 should have font entries"
        );
        let names: Vec<&str> = fonts.values().map(|f| f.name.as_str()).collect();
        let has_bold = names.iter().any(|n| n.to_lowercase().contains("bold"));
        let has_italic = names.iter().any(|n| {
            let lowered = n.to_lowercase();
            lowered.contains("oblique") || lowered.contains("italic")
        });
        assert!(
            has_bold,
            "bold-italic.pdf should have a Bold font, got: {:?}",
            names
        );
        assert!(
            has_italic,
            "bold-italic.pdf should have an Oblique/Italic font, got: {:?}",
            names
        );
    }

    #[test]
    fn resolve_fonts_for_page_preserves_resource_names_as_keys() {
        let doc = load_fixture("simple.pdf");
        let (fonts, _) = resolve_fonts_for_page(&doc, 1);
        for resource_name in fonts.keys() {
            assert!(
                !resource_name.is_empty(),
                "font resource name key should not be empty"
            );
        }
    }

    // ---- extract_text_segments_for_page ---------------------------------

    #[test]
    fn extract_text_segments_for_page_simple_returns_segments() {
        let doc = load_fixture("simple.pdf");
        let (fonts, _) = resolve_fonts_for_page(&doc, 1);
        let (segments, warnings) = extract_text_segments_for_page(&doc, 1, &fonts);
        assert!(
            !segments.is_empty(),
            "simple.pdf page 1 should produce text segments"
        );
        for segment in &segments {
            assert_eq!(segment.page_number, 1, "all segments should be page 1");
        }
        let combined: String = segments.iter().map(|s| s.text.as_str()).collect();
        assert!(
            combined.contains("Chapter 1"),
            "simple.pdf should contain 'Chapter 1', got: {:?}",
            combined
        );
        assert!(
            !warnings
                .iter()
                .any(|w| matches!(w, Warning::UnreadableTextStream { .. })),
            "well-formed page should not produce UnreadableTextStream"
        );
    }

    #[test]
    fn extract_text_segments_for_page_bold_italic_returns_different_fonts() {
        let doc = load_fixture("bold-italic.pdf");
        let (fonts, _) = resolve_fonts_for_page(&doc, 1);
        let (segments, _) = extract_text_segments_for_page(&doc, 1, &fonts);
        assert!(
            !segments.is_empty(),
            "bold-italic.pdf page 1 should produce text segments"
        );
        let unique_fonts: std::collections::HashSet<&Vec<u8>> =
            segments.iter().map(|s| &s.font_resource_name).collect();
        assert!(
            unique_fonts.len() >= 2,
            "bold-italic.pdf should use at least 2 different fonts, got: {:?}",
            unique_fonts
        );
    }

    #[test]
    fn extract_text_segments_for_page_font_size_comes_from_tf_state() {
        let doc = load_fixture("simple.pdf");
        let (fonts, _) = resolve_fonts_for_page(&doc, 1);
        let (segments, _) = extract_text_segments_for_page(&doc, 1, &fonts);
        for segment in &segments {
            assert!(
                segment.font_size > 0.0,
                "font_size should be positive (from Tf), got: {}",
                segment.font_size
            );
        }
    }

    #[test]
    fn extract_text_segments_for_page_preserves_operator_encounter_order() {
        let doc = load_fixture("simple.pdf");
        let (fonts, _) = resolve_fonts_for_page(&doc, 1);
        let (segments, _) = extract_text_segments_for_page(&doc, 1, &fonts);
        let combined: String = segments.iter().map(|s| s.text.as_str()).collect();
        // The ordering is only checked when both markers are present.
        if let (Some(chapter_pos), Some(body_pos)) =
            (combined.find("Chapter 1"), combined.find("Body text."))
        {
            assert!(
                chapter_pos < body_pos,
                "Chapter 1 should come before Body text. in operator order"
            );
        }
    }

    // ---- get_text_state_or_default --------------------------------------

    #[test]
    fn get_text_state_or_default_with_tf_set_returns_current_state() {
        let current_font = Some(b"F1".to_vec());
        let mut warned = false;
        let mut warnings = Vec::new();
        let (resource, size) = get_text_state_or_default(
            &current_font,
            Some(12.0),
            true,
            &mut warned,
            1,
            &mut warnings,
        );
        assert_eq!(resource, b"F1");
        assert_eq!(size, 12.0);
        assert!(
            warnings.is_empty(),
            "should not emit warning when Tf is set"
        );
        assert!(!warned, "warned flag should remain false");
    }

    #[test]
    fn get_text_state_or_default_without_tf_returns_defaults_and_warns_once() {
        let mut warned = false;
        let mut warnings = Vec::new();

        let (first_resource, first_size) =
            get_text_state_or_default(&None, None, false, &mut warned, 1, &mut warnings);
        assert_eq!(first_resource, b"<unknown>");
        assert_eq!(first_size, 0.0);
        assert_eq!(warnings.len(), 1, "should emit exactly one warning");
        assert!(malformed_detail(&warnings[0]).contains("text state not set before Tj/TJ"));
        assert!(warned, "warned flag should be set after first call");

        let (second_resource, second_size) =
            get_text_state_or_default(&None, None, false, &mut warned, 1, &mut warnings);
        assert_eq!(second_resource, b"<unknown>");
        assert_eq!(second_size, 0.0);
        assert_eq!(
            warnings.len(),
            1,
            "should still have exactly one warning after second call"
        );
    }

    #[test]
    fn get_text_state_or_default_warned_resets_across_text_objects() {
        let mut warned = false;
        let mut warnings = Vec::new();

        get_text_state_or_default(&None, None, false, &mut warned, 1, &mut warnings);
        assert_eq!(warnings.len(), 1);

        // Clearing the flag by hand stands in for the start of a new text object.
        warned = false;

        get_text_state_or_default(&None, None, false, &mut warned, 1, &mut warnings);
        assert_eq!(
            warnings.len(),
            2,
            "should have two warnings for two separate text objects"
        );
    }

    // ---- parse_pdf ------------------------------------------------------

    #[test]
    fn parse_pdf_simple_returns_metadata_and_segments() {
        let bytes = std::fs::read(fixture_path("simple.pdf")).unwrap();
        let (segments, metadata, warnings) = parse_pdf(&bytes);

        assert_eq!(metadata.page_count, 1, "simple.pdf has 1 page");
        assert!(!segments.is_empty(), "should produce segments");
        for segment in &segments {
            assert_eq!(segment.page_number, 1, "all segments should be page 1");
        }

        let combined: String = segments.iter().map(|s| s.text.as_str()).collect();
        assert!(combined.contains("Chapter 1"), "should contain 'Chapter 1'");
        assert!(
            combined.contains("Body text."),
            "should contain 'Body text.'"
        );

        assert!(
            !warnings
                .iter()
                .any(|w| matches!(w, Warning::UnreadableTextStream { .. })),
            "valid PDF should not produce UnreadableTextStream"
        );
    }

    #[test]
    fn parse_pdf_failed_load_returns_empty_with_warning() {
        let (segments, metadata, warnings) = parse_pdf(b"not a pdf");
        assert!(
            segments.is_empty(),
            "failed load should produce no segments"
        );
        assert_eq!(
            metadata.page_count, 0,
            "failed load should have page_count=0"
        );
        assert!(metadata.title.is_none(), "failed load should have no title");
        assert!(
            metadata.author.is_none(),
            "failed load should have no author"
        );
        assert!(!warnings.is_empty(), "failed load should produce warnings");
        assert!(!malformed_detail(&warnings[0]).is_empty());
    }

    #[test]
    fn parse_pdf_empty_bytes_returns_empty_with_warning() {
        let (segments, metadata, warnings) = parse_pdf(b"");
        assert!(segments.is_empty());
        assert_eq!(metadata.page_count, 0);
        assert!(!warnings.is_empty());
        // Only the warning kind matters here; the helper panics on any other kind.
        malformed_detail(&warnings[0]);
    }

    #[test]
    fn parse_pdf_corrupted_fixture_returns_empty_with_warning() {
        let bytes = std::fs::read(fixture_path("corrupted.pdf")).unwrap();
        let (segments, metadata, warnings) = parse_pdf(&bytes);
        assert!(
            segments.is_empty(),
            "corrupted PDF should produce no segments"
        );
        assert_eq!(metadata.page_count, 0);
        assert!(!warnings.is_empty());
        assert!(!malformed_detail(&warnings[0]).is_empty());
    }

    #[test]
    fn parse_pdf_multi_page_has_1_based_page_numbers() {
        let bytes = std::fs::read(fixture_path("multi-page.pdf")).unwrap();
        let (segments, metadata, _) = parse_pdf(&bytes);

        assert!(
            metadata.page_count >= 2,
            "multi-page.pdf should have 2+ pages"
        );

        let lowest = segments.iter().map(|s| s.page_number).min().unwrap_or(0);
        assert_eq!(lowest, 1, "minimum page number should be 1 (1-based)");

        let highest = segments.iter().map(|s| s.page_number).max().unwrap_or(0);
        assert!(
            highest >= 2,
            "multi-page.pdf should have segments from page 2+, got max={}",
            highest
        );
    }

    #[test]
    fn parse_pdf_aggregates_warnings_in_stable_order() {
        let bytes = std::fs::read(fixture_path("simple.pdf")).unwrap();
        let (_, _, first_run) = parse_pdf(&bytes);
        let (_, _, second_run) = parse_pdf(&bytes);
        assert_eq!(
            first_run.len(),
            second_run.len(),
            "warnings should be deterministic"
        );
        assert_eq!(
            first_run, second_run,
            "warnings should be stable across runs"
        );
    }

    // ---- decode_pdf_string / decode_utf16be -----------------------------

    #[test]
    fn decode_pdf_string_winansi_ascii() {
        assert_eq!(decode_pdf_string(b"Hello World"), "Hello World");
    }

    #[test]
    fn decode_pdf_string_winansi_high_latin_range() {
        let decoded = decode_pdf_string(&[0xE9, 0xFC, 0xF1]);
        assert_eq!(decoded, "\u{00E9}\u{00FC}\u{00F1}");
        assert!(
            !decoded.contains(char::REPLACEMENT_CHARACTER),
            "valid WinAnsi high-latin should not produce replacement chars"
        );
    }

    #[test]
    fn decode_pdf_string_winansi_0x80_to_0x9f_range() {
        let decoded = decode_pdf_string(&[0x80, 0x93, 0x96, 0x97]);
        assert_eq!(decoded, "\u{20AC}\u{201C}\u{2013}\u{2014}");
    }

    #[test]
    fn decode_pdf_string_winansi_undefined_bytes_use_replacement() {
        let decoded = decode_pdf_string(&[0x81, 0x8D]);
        assert_eq!(decoded, "\u{FFFD}\u{FFFD}");
    }

    #[test]
    fn decode_pdf_string_utf16be_with_bom() {
        let decoded = decode_pdf_string(&[0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69]);
        assert_eq!(decoded, "Hi");
        assert!(
            !decoded.contains(char::REPLACEMENT_CHARACTER),
            "valid UTF-16BE should not produce replacement chars"
        );
    }

    #[test]
    fn decode_pdf_string_utf16be_without_bom() {
        let decoded = decode_pdf_string(&[0x00, 0x41, 0x00, 0x42]);
        assert_eq!(decoded, "AB");
    }

    #[test]
    fn decode_pdf_string_utf16be_with_non_ascii() {
        let input = [0xFE, 0xFF, 0x00, 0x63, 0x00, 0x61, 0x00, 0x66, 0x00, 0xE9];
        assert_eq!(decode_pdf_string(&input), "caf\u{00E9}");
    }

    #[test]
    fn decode_pdf_string_empty_bytes() {
        assert_eq!(decode_pdf_string(b""), "");
    }

    #[test]
    fn decode_pdf_string_single_byte_not_utf16() {
        assert_eq!(decode_pdf_string(&[0x41]), "A");
    }

    #[test]
    fn decode_utf16be_invalid_surrogate_uses_replacement() {
        let decoded = decode_utf16be(&[0xD8, 0x00]);
        assert!(
            decoded.contains(char::REPLACEMENT_CHARACTER),
            "invalid surrogate should produce replacement char, got: {:?}",
            decoded
        );
    }

    // ---- winansi_byte_to_char -------------------------------------------

    #[test]
    fn winansi_byte_to_char_ascii_range() {
        for (byte, expected) in [(0x41u8, 'A'), (0x20, ' '), (0x7F, '\x7F')] {
            assert_eq!(winansi_byte_to_char(byte), expected);
        }
    }

    #[test]
    fn winansi_byte_to_char_special_range() {
        let cases = [
            (0x80u8, '\u{20AC}'),
            (0x91, '\u{2018}'),
            (0x92, '\u{2019}'),
            (0x93, '\u{201C}'),
            (0x94, '\u{201D}'),
            (0x96, '\u{2013}'),
            (0x97, '\u{2014}'),
            (0x99, '\u{2122}'),
        ];
        for (byte, expected) in cases {
            assert_eq!(winansi_byte_to_char(byte), expected);
        }
    }

    #[test]
    fn winansi_byte_to_char_high_latin_range() {
        for (byte, expected) in [(0xA0u8, '\u{00A0}'), (0xE9, '\u{00E9}'), (0xFF, '\u{00FF}')] {
            assert_eq!(winansi_byte_to_char(byte), expected);
        }
    }

    // ---- strip_subset_prefix --------------------------------------------

    #[test]
    fn strip_subset_prefix_strips_valid_prefix() {
        let stripped = strip_subset_prefix("ABCDEF+Helvetica-Bold");
        assert_eq!(stripped, "Helvetica-Bold");
    }

    #[test]
    fn strip_subset_prefix_strips_any_six_uppercase_letters() {
        let stripped = strip_subset_prefix("ZZZZZZ+TimesNewRoman");
        assert_eq!(stripped, "TimesNewRoman");
    }

    #[test]
    fn strip_subset_prefix_leaves_non_prefixed_name() {
        let unchanged = strip_subset_prefix("Helvetica");
        assert_eq!(unchanged, "Helvetica");
    }

    #[test]
    fn strip_subset_prefix_leaves_short_names() {
        let unchanged = strip_subset_prefix("AB+X");
        assert_eq!(unchanged, "AB+X");
    }

    #[test]
    fn strip_subset_prefix_leaves_lowercase_prefix() {
        let unchanged = strip_subset_prefix("abcdef+Font");
        assert_eq!(unchanged, "abcdef+Font");
    }

    #[test]
    fn strip_subset_prefix_leaves_mixed_case_prefix() {
        let unchanged = strip_subset_prefix("ABCDEf+Font");
        assert_eq!(unchanged, "ABCDEf+Font");
    }

    #[test]
    fn strip_subset_prefix_leaves_empty_string() {
        let unchanged = strip_subset_prefix("");
        assert_eq!(unchanged, "");
    }
}