1use super::{OperationError, OperationResult};
39use crate::parser::{PdfDocument, PdfReader};
40use crate::text::{ExtractionOptions, OcrOptions, OcrProcessingResult, OcrProvider, TextExtractor};
41use std::fs::File;
43use std::path::Path;
44
/// Coarse classification of a page's dominant content, produced by
/// `PageContentAnalyzer` from the text/image area ratios.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PageType {
    /// Image-dominated page, typically a scan that needs OCR.
    Scanned,
    /// Text-dominated page whose text can be extracted directly.
    Text,
    /// Page with a significant mix of text and image content.
    Mixed,
}
55
56impl PageType {
57 pub fn is_scanned(&self) -> bool {
59 matches!(self, PageType::Scanned)
60 }
61
62 pub fn is_text(&self) -> bool {
64 matches!(self, PageType::Text)
65 }
66
67 pub fn is_mixed(&self) -> bool {
69 matches!(self, PageType::Mixed)
70 }
71}
72
/// Per-page result of content analysis.
#[derive(Debug, Clone)]
pub struct ContentAnalysis {
    /// Zero-based page index this analysis refers to (matches the index
    /// passed to `PageContentAnalyzer::analyze_page`).
    pub page_number: usize,
    /// Classification derived from the coverage ratios below.
    pub page_type: PageType,
    /// Fraction of the page area covered by text (0.0–1.0).
    pub text_ratio: f64,
    /// Fraction of the page area covered by images (0.0–1.0).
    pub image_ratio: f64,
    /// Remaining fraction covered by neither text nor images, clamped to
    /// be non-negative by `analyze_page`.
    pub blank_space_ratio: f64,
    /// Number of text fragments counted on the page.
    pub text_fragment_count: usize,
    /// Number of images counted on the page.
    pub image_count: usize,
    /// Total number of text characters counted on the page.
    pub character_count: usize,
}
93
94impl ContentAnalysis {
95 pub fn is_scanned(&self) -> bool {
115 self.page_type.is_scanned()
116 }
117
118 pub fn is_text_heavy(&self) -> bool {
120 self.page_type.is_text()
121 }
122
123 pub fn is_mixed_content(&self) -> bool {
125 self.page_type.is_mixed()
126 }
127
128 pub fn dominant_content_ratio(&self) -> f64 {
130 self.text_ratio.max(self.image_ratio)
131 }
132}
133
/// Tuning knobs for page content analysis.
#[derive(Debug, Clone)]
pub struct AnalysisOptions {
    /// Minimum size for a text fragment to be counted — presumably in
    /// characters; TODO confirm against `analyze_text_content` (not visible here).
    pub min_text_fragment_size: usize,
    /// Minimum image size to be counted — presumably in pixels per side;
    /// TODO confirm against `analyze_image_content` (not visible here).
    pub min_image_size: u32,
    /// Coverage ratio above which a page is treated as scanned — presumably
    /// compared against `image_ratio` in `determine_page_type` (not visible here).
    pub scanned_threshold: f64,
    /// Coverage ratio above which a page is treated as text — presumably
    /// compared against `text_ratio` in `determine_page_type` (not visible here).
    pub text_threshold: f64,
    /// OCR settings used when extracting text from scanned pages; `None`
    /// falls back to `OcrOptions::default()` at extraction time.
    pub ocr_options: Option<OcrOptions>,
}
148
impl Default for AnalysisOptions {
    /// Defaults: 3-unit text fragments, 50-unit images, 80% image coverage
    /// to call a page scanned, 70% text coverage to call it text, and no
    /// explicit OCR options (the provider default is used instead).
    fn default() -> Self {
        Self {
            min_text_fragment_size: 3,
            min_image_size: 50,
            scanned_threshold: 0.8,
            text_threshold: 0.7,
            ocr_options: None,
        }
    }
}
160
/// Analyzes PDF pages to classify them as scanned, text, or mixed content,
/// and can feed image-dominated pages to an OCR provider.
pub struct PageContentAnalyzer {
    // Parsed document the analyzer reads pages and objects from.
    document: PdfDocument<File>,
    // Thresholds and OCR settings governing the analysis.
    options: AnalysisOptions,
}
169
170impl PageContentAnalyzer {
171 pub fn new(document: PdfDocument<File>) -> Self {
190 Self {
191 document,
192 options: AnalysisOptions::default(),
193 }
194 }
195
    /// Creates an analyzer over `document` with caller-supplied analysis
    /// options.
    pub fn with_options(document: PdfDocument<File>, options: AnalysisOptions) -> Self {
        Self { document, options }
    }
205
206 pub fn from_file<P: AsRef<Path>>(path: P) -> OperationResult<Self> {
216 let document = PdfReader::open_document(path)
217 .map_err(|e| OperationError::ParseError(e.to_string()))?;
218 Ok(Self::new(document))
219 }
220
221 pub fn analyze_page(&self, page_number: usize) -> OperationResult<ContentAnalysis> {
255 let page = self
257 .document
258 .get_page(page_number as u32)
259 .map_err(|e| OperationError::ParseError(e.to_string()))?;
260
261 let page_area = self.calculate_page_area(&page)?;
262
263 let text_analysis = self.analyze_text_content(page_number)?;
265 let text_area = text_analysis.total_area;
266 let text_fragment_count = text_analysis.fragment_count;
267 let character_count = text_analysis.character_count;
268
269 let image_analysis = self.analyze_image_content(page_number)?;
271 let image_area = image_analysis.total_area;
272 let image_count = image_analysis.image_count;
273
274 let text_ratio = if page_area > 0.0 {
276 text_area / page_area
277 } else {
278 0.0
279 };
280 let image_ratio = if page_area > 0.0 {
281 image_area / page_area
282 } else {
283 0.0
284 };
285 let blank_space_ratio = 1.0 - text_ratio - image_ratio;
286
287 let page_type = self.determine_page_type(text_ratio, image_ratio);
289
290 Ok(ContentAnalysis {
291 page_number,
292 page_type,
293 text_ratio,
294 image_ratio,
295 blank_space_ratio: blank_space_ratio.max(0.0),
296 text_fragment_count,
297 image_count,
298 character_count,
299 })
300 }
301
302 pub fn analyze_document(&self) -> OperationResult<Vec<ContentAnalysis>> {
325 let page_count = self
326 .document
327 .page_count()
328 .map_err(|e| OperationError::ParseError(e.to_string()))?;
329
330 let mut analyses = Vec::new();
331 for page_idx in 0..page_count {
332 let analysis = self.analyze_page(page_idx as usize)?;
333 analyses.push(analysis);
334 }
335
336 Ok(analyses)
337 }
338
339 pub fn analyze_pages(&self, page_numbers: &[usize]) -> OperationResult<Vec<ContentAnalysis>> {
349 let mut analyses = Vec::new();
350 for &page_number in page_numbers {
351 let analysis = self.analyze_page(page_number)?;
352 analyses.push(analysis);
353 }
354 Ok(analyses)
355 }
356
357 pub fn is_scanned_page(&self, page_number: usize) -> OperationResult<bool> {
386 let analysis = self.analyze_page(page_number)?;
387 Ok(analysis.is_scanned())
388 }
389
390 pub fn find_scanned_pages(&self) -> OperationResult<Vec<usize>> {
396 let analyses = self.analyze_document()?;
397 Ok(analyses
398 .into_iter()
399 .filter(|analysis| analysis.is_scanned())
400 .map(|analysis| analysis.page_number)
401 .collect())
402 }
403
404 pub fn extract_text_from_scanned_page<P: OcrProvider>(
445 &self,
446 page_number: usize,
447 ocr_provider: &P,
448 ) -> OperationResult<OcrProcessingResult> {
449 let analysis = self.analyze_page(page_number)?;
451 if !analysis.is_scanned() {
452 return Err(OperationError::ParseError(format!(
453 "Page {} is not a scanned page (image ratio: {:.2}%, text ratio: {:.2}%)",
454 page_number,
455 analysis.image_ratio * 100.0,
456 analysis.text_ratio * 100.0
457 )));
458 }
459
460 let ocr_options = self.options.ocr_options.clone().unwrap_or_default();
462
463 let page_image_data = self.extract_page_image_data(page_number)?;
465
466 let ocr_result = ocr_provider
468 .process_page(&analysis, &page_image_data, &ocr_options)
469 .map_err(|e| OperationError::ParseError(format!("OCR processing failed: {e}")))?;
470
471 Ok(ocr_result)
472 }
473
474 pub fn process_scanned_pages_with_ocr<P: OcrProvider>(
508 &self,
509 ocr_provider: &P,
510 ) -> OperationResult<Vec<(usize, OcrProcessingResult)>> {
511 let scanned_pages = self.find_scanned_pages()?;
512 let mut results = Vec::new();
513
514 for page_number in scanned_pages {
515 match self.extract_text_from_scanned_page(page_number, ocr_provider) {
516 Ok(ocr_result) => {
517 results.push((page_number, ocr_result));
518 }
519 Err(e) => {
520 tracing::error!("Failed to process page {page_number}: {e}");
521 continue;
522 }
523 }
524 }
525
526 Ok(results)
527 }
528
    /// OCRs all scanned pages using a simple fixed-chunk thread pool.
    ///
    /// `max_threads` caps the worker count; when `None`, the count is
    /// `min(page count, available cores)` with a fallback of 4 cores.
    /// A worker count of 1 degenerates to the sequential path. Results are
    /// merged from per-thread buffers, so their order is nondeterministic.
    pub fn process_scanned_pages_parallel<P: OcrProvider + Clone + Send + Sync + 'static>(
        &self,
        ocr_provider: &P,
        max_threads: Option<usize>,
    ) -> OperationResult<Vec<(usize, OcrProcessingResult)>> {
        use std::sync::{Arc, Mutex};
        use std::thread;

        let scanned_pages = self.find_scanned_pages()?;
        if scanned_pages.is_empty() {
            return Ok(Vec::new());
        }

        let thread_count = max_threads.unwrap_or_else(|| {
            std::cmp::min(
                scanned_pages.len(),
                std::thread::available_parallelism()
                    .map(|p| p.get())
                    .unwrap_or(4),
            )
        });

        // Not worth spawning threads for a single worker.
        if thread_count <= 1 {
            return self.process_scanned_pages_with_ocr(ocr_provider);
        }

        let results = Arc::new(Mutex::new(Vec::new()));
        let provider = Arc::new(ocr_provider.clone());

        // One contiguous chunk of pages per worker (last chunk may be short).
        let chunk_size = scanned_pages.len().div_ceil(thread_count);
        let mut handles = Vec::new();

        for chunk in scanned_pages.chunks(chunk_size) {
            let chunk_pages = chunk.to_vec();
            let results_clone = Arc::clone(&results);
            let provider_clone = Arc::clone(&provider);

            let handle = thread::spawn(move || {
                let mut thread_results = Vec::new();

                for page_num in chunk_pages {
                    // NOTE(review): this calls `simulate_page_ocr_processing`
                    // (defined elsewhere) instead of
                    // `extract_text_from_scanned_page`, presumably because
                    // `self`/the document cannot cross the thread boundary —
                    // confirm whether real OCR is intended here.
                    match simulate_page_ocr_processing(page_num, &*provider_clone) {
                        Ok(ocr_result) => {
                            thread_results.push((page_num, ocr_result));
                        }
                        Err(e) => {
                            tracing::error!("OCR failed for page {page_num}: {e}");
                        }
                    }
                }

                // Merge this worker's batch; if the mutex is poisoned the
                // batch is silently dropped.
                if let Ok(mut shared_results) = results_clone.lock() {
                    shared_results.extend(thread_results);
                }
            });

            handles.push(handle);
        }

        // Join all workers; a panicking worker is logged, not propagated.
        for handle in handles {
            if let Err(e) = handle.join() {
                tracing::error!("Thread panicked: {e:?}");
            }
        }

        let final_results = results
            .lock()
            .map_err(|e| OperationError::ProcessingError(format!("Failed to get results: {e}")))?
            .clone();

        Ok(final_results)
    }
646
647 pub fn process_scanned_pages_batch<P: OcrProvider>(
661 &self,
662 ocr_provider: &P,
663 batch_size: usize,
664 ) -> OperationResult<Vec<(usize, OcrProcessingResult)>> {
665 let scanned_pages = self.find_scanned_pages()?;
666 let mut results = Vec::new();
667
668 if batch_size == 0 {
670 return Ok(results);
671 }
672
673 for batch in scanned_pages.chunks(batch_size) {
674 tracing::info!("Processing batch of {} pages", batch.len());
675
676 for &page_num in batch {
677 match self.extract_text_from_scanned_page(page_num, ocr_provider) {
678 Ok(ocr_result) => {
679 results.push((page_num, ocr_result));
680 }
681 Err(e) => {
682 tracing::error!("OCR failed for page {page_num}: {e}");
683 }
684 }
685 }
686
687 std::thread::sleep(std::time::Duration::from_millis(100));
689 }
690
691 Ok(results)
692 }
693
    /// Extracts raw image bytes suitable for OCR from a page, trying three
    /// strategies in order:
    ///
    /// 1. The page's /Resources → /XObject dictionary: return the first
    ///    image stream found there.
    /// 2. `Do` operators found in the page's content streams: resolve the
    ///    named XObject (page-specific first, then document-wide).
    /// 3. A document-wide scan for any sufficiently large image object.
    ///
    /// Fails with `ParseError` when none of the strategies yields image data.
    pub fn extract_page_image_data(&self, page_number: usize) -> OperationResult<Vec<u8>> {
        tracing::debug!(
            "🔍 [DEBUG] extract_page_image_data called for page {}",
            page_number
        );

        let page = self
            .document
            .get_page(page_number as u32)
            .map_err(|e| OperationError::ParseError(e.to_string()))?;

        tracing::debug!("🔍 [DEBUG] Trying Method 1: Check page resources for XObjects");
        let resources = self
            .document
            .get_page_resources(&page)
            .map_err(|e| OperationError::ParseError(e.to_string()))?;

        // Holds resources resolved manually from the page's /Resources
        // reference when the standard lookup above returned None.
        let mut resolved_resources_dict: Option<crate::parser::objects::PdfDictionary> = None;

        if let Some(_resources) = &resources {
            tracing::debug!(
                "🔍 [DEBUG] Page {} has resources via standard method",
                page_number
            );
        } else {
            // Fallback: resolve the /Resources entry ourselves in case the
            // standard lookup does not follow indirect references.
            tracing::debug!(
                "🔍 [DEBUG] Page {} resources None, trying direct resolution",
                page_number
            );
            if let Some(resources_ref) = page.dict.get("Resources") {
                tracing::debug!(
                    "🔍 [DEBUG] Page {} has Resources entry, resolving reference",
                    page_number
                );
                match self.document.resolve(resources_ref) {
                    Ok(resolved_obj) => {
                        if let Some(resolved_dict) = resolved_obj.as_dict() {
                            tracing::debug!("🔍 [DEBUG] Page {} resolved Resources to dictionary with {} entries",
                                page_number, resolved_dict.0.len());
                            resolved_resources_dict = Some(resolved_dict.clone());
                        } else {
                            tracing::debug!(
                                "🔍 [DEBUG] Page {} Resources resolved but not a dictionary",
                                page_number
                            );
                        }
                    }
                    Err(e) => {
                        tracing::debug!(
                            "🔍 [DEBUG] Page {} failed to resolve Resources: {}",
                            page_number,
                            e
                        );
                    }
                }
            } else {
                tracing::debug!(
                    "🔍 [DEBUG] Page {} has no Resources entry in dict",
                    page_number
                );
            }
        }

        // Prefer the standard lookup; otherwise use the manually resolved dict.
        let active_resources = resources.or(resolved_resources_dict.as_ref());

        if let Some(resources) = &active_resources {
            tracing::debug!("🔍 [DEBUG] Page {} has resources", page_number);
            if let Some(crate::parser::objects::PdfObject::Dictionary(xobjects)) = resources
                .0
                .get(&crate::parser::objects::PdfName("XObject".to_string()))
            {
                tracing::debug!(
                    "🔍 [DEBUG] Page {} has XObject dictionary with {} entries",
                    page_number,
                    xobjects.0.len()
                );
                // Return the first image XObject found; iteration order of the
                // dictionary is presumably unspecified — TODO confirm whether
                // multi-image pages need a smarter pick.
                for (xobject_name, obj_ref) in xobjects.0.iter() {
                    if let crate::parser::objects::PdfObject::Reference(obj_num, gen_num) = obj_ref
                    {
                        if let Ok(crate::parser::objects::PdfObject::Stream(stream)) =
                            self.document.get_object(*obj_num, *gen_num)
                        {
                            if let Some(crate::parser::objects::PdfObject::Name(subtype)) = stream
                                .dict
                                .0
                                .get(&crate::parser::objects::PdfName("Subtype".to_string()))
                            {
                                if subtype.0 == "Image" {
                                    // Width/Height read here purely for the log line.
                                    let width = stream
                                        .dict
                                        .0
                                        .get(&crate::parser::objects::PdfName("Width".to_string()))
                                        .and_then(|w| {
                                            if let crate::parser::objects::PdfObject::Integer(w) = w
                                            {
                                                Some(*w)
                                            } else {
                                                None
                                            }
                                        })
                                        .unwrap_or(0);

                                    let height = stream
                                        .dict
                                        .0
                                        .get(&crate::parser::objects::PdfName("Height".to_string()))
                                        .and_then(|h| {
                                            if let crate::parser::objects::PdfObject::Integer(h) = h
                                            {
                                                Some(*h)
                                            } else {
                                                None
                                            }
                                        })
                                        .unwrap_or(0);

                                    tracing::debug!(
                                        "🔍 [DEBUG] Page {} Method1 XObject {} -> Object {} ({}x{})",
                                        page_number, xobject_name.0, obj_num, width, height
                                    );
                                    return self.extract_image_stream_for_ocr(&stream);
                                }
                            }
                        }
                    }
                }
            } else {
                tracing::debug!("🔍 [DEBUG] Page {} has no XObject dictionary", page_number);
            }
        } else {
            tracing::debug!("🔍 [DEBUG] Page {} has no resources", page_number);
        }

        tracing::debug!("🔍 [DEBUG] Trying Method 2: Parse content streams for Do operators");
        if let Ok(content_streams) = self.document.get_page_content_streams(&page) {
            tracing::debug!(
                "🔍 [DEBUG] Page {} has {} content streams",
                page_number,
                content_streams.len()
            );
            for (i, content_stream) in content_streams.iter().enumerate() {
                // Lossy UTF-8 view is enough for spotting operator tokens.
                let content_str = String::from_utf8_lossy(content_stream);
                tracing::debug!(
                    "🔍 [DEBUG] Content stream {} has {} bytes",
                    i,
                    content_stream.len()
                );

                // Scan line-by-line for `/Name Do` (paint-XObject) operators.
                for line in content_str.lines() {
                    if line.trim().ends_with(" Do") {
                        let parts: Vec<&str> = line.split_whitespace().collect();
                        if parts.len() >= 2 && parts[parts.len() - 1] == "Do" {
                            let xobject_name = parts[parts.len() - 2];
                            tracing::debug!(
                                "🔍 [DEBUG] Found Do operator with XObject: {}",
                                xobject_name
                            );
                            if let Some(name) = xobject_name.strip_prefix('/') {
                                tracing::debug!("🔍 [DEBUG] Looking for XObject: {}", name);

                                // Page-local lookup first; fall back to a
                                // document-wide search if that fails.
                                if let Ok(image_data) =
                                    self.find_specific_xobject_image_from_page(name, &page)
                                {
                                    return Ok(image_data);
                                } else {
                                    tracing::debug!("🔍 [DEBUG] Page-specific XObject lookup failed for: {}, trying document-wide search", name);
                                    if let Ok(image_data) = self.find_specific_xobject_image(name) {
                                        return Ok(image_data);
                                    } else {
                                        tracing::debug!("🔍 [DEBUG] Document-wide XObject lookup also failed for: {}", name);
                                    }
                                }
                            }
                        }
                    }
                }

                // NOTE(review): inline images (BI…EI) are detected but not
                // extracted — this branch is intentionally (?) empty. Confirm
                // whether inline-image support was planned here.
                if content_str.contains("BI") && content_str.contains("EI") {
                }
            }
        }

        tracing::debug!("🔍 [DEBUG] Trying Method 3: Fallback scan for large images");
        match self.find_image_xobjects_in_document() {
            Ok(image_data) if !image_data.is_empty() => {
                return Ok(image_data);
            }
            _ => {}
        }

        Err(OperationError::ParseError(
            "No image data found on scanned page (checked XObjects and inline images)".to_string(),
        ))
    }
911
912 fn find_specific_xobject_image_from_page(
914 &self,
915 xobject_name: &str,
916 page: &crate::parser::page_tree::ParsedPage,
917 ) -> OperationResult<Vec<u8>> {
918 let resources = self
920 .document
921 .get_page_resources(page)
922 .map_err(|e| OperationError::ParseError(e.to_string()))?;
923
924 if let Some(resources) = resources {
926 if let Some(crate::parser::objects::PdfObject::Dictionary(xobjects)) = resources
927 .0
928 .get(&crate::parser::objects::PdfName("XObject".to_string()))
929 {
930 #[allow(clippy::collapsible_match)]
931 if let Some(xobject_ref) = xobjects
932 .0
933 .get(&crate::parser::objects::PdfName(xobject_name.to_string()))
934 {
935 if let crate::parser::objects::PdfObject::Reference(obj_num, gen_num) =
936 xobject_ref
937 {
938 if let Ok(crate::parser::objects::PdfObject::Stream(stream)) =
939 self.document.get_object(*obj_num, *gen_num)
940 {
941 if let Some(crate::parser::objects::PdfObject::Name(subtype)) = stream
942 .dict
943 .0
944 .get(&crate::parser::objects::PdfName("Subtype".to_string()))
945 {
946 if subtype.0 == "Image" {
947 let width = stream
948 .dict
949 .0
950 .get(&crate::parser::objects::PdfName("Width".to_string()))
951 .and_then(|w| {
952 if let crate::parser::objects::PdfObject::Integer(w) = w
953 {
954 Some(*w)
955 } else {
956 None
957 }
958 })
959 .unwrap_or(0);
960 let height = stream
961 .dict
962 .0
963 .get(&crate::parser::objects::PdfName("Height".to_string()))
964 .and_then(|h| {
965 if let crate::parser::objects::PdfObject::Integer(h) = h
966 {
967 Some(*h)
968 } else {
969 None
970 }
971 })
972 .unwrap_or(0);
973 tracing::debug!(
974 "🔍 [DEBUG] Page-specific XObject {} -> Object {} ({}x{})",
975 xobject_name,
976 obj_num,
977 width,
978 height
979 );
980 return self.extract_image_stream_for_ocr(&stream);
981 }
982 }
983 }
984 }
985 }
986 }
987 }
988
989 if let Some(crate::parser::objects::PdfObject::Reference(res_obj, res_gen)) = page
991 .dict
992 .0
993 .get(&crate::parser::objects::PdfName("Resources".to_string()))
994 {
995 match self.document.get_object(*res_obj, *res_gen) {
996 Ok(crate::parser::objects::PdfObject::Dictionary(resolved_dict)) => {
997 tracing::debug!(
998 "🔍 [DEBUG] Page-specific fallback: resolved Resources {} {} R",
999 res_obj,
1000 res_gen
1001 );
1002 if let Some(crate::parser::objects::PdfObject::Dictionary(xobjects)) =
1003 resolved_dict
1004 .0
1005 .get(&crate::parser::objects::PdfName("XObject".to_string()))
1006 {
1007 tracing::debug!("🔍 [DEBUG] Page-specific fallback found XObject dictionary with {} entries", xobjects.0.len());
1008 for (name, obj) in &xobjects.0 {
1009 tracing::debug!(
1010 "🔍 [DEBUG] Page-specific fallback XObject: {} -> {:?}",
1011 name.0,
1012 obj
1013 );
1014 }
1015 if let Some(xobject_ref) = xobjects
1016 .0
1017 .get(&crate::parser::objects::PdfName(xobject_name.to_string()))
1018 {
1019 if let crate::parser::objects::PdfObject::Reference(obj_num, gen_num) =
1020 xobject_ref
1021 {
1022 tracing::debug!("🔍 [DEBUG] Page-specific fallback: trying to get object {} {} R", obj_num, gen_num);
1023 match self.document.get_object(*obj_num, *gen_num) {
1024 Ok(crate::parser::objects::PdfObject::Stream(stream)) => {
1025 tracing::debug!(
1026 "🔍 [DEBUG] Page-specific fallback: got stream object"
1027 );
1028 match stream.dict.0.get(&crate::parser::objects::PdfName(
1029 "Subtype".to_string(),
1030 )) {
1031 Some(crate::parser::objects::PdfObject::Name(
1032 subtype,
1033 )) => {
1034 tracing::debug!("🔍 [DEBUG] Page-specific fallback: stream subtype = {}", subtype.0);
1035 if subtype.0 == "Image" {
1036 let width = stream
1037 .dict
1038 .0
1039 .get(&crate::parser::objects::PdfName("Width".to_string()))
1040 .and_then(|w| {
1041 if let crate::parser::objects::PdfObject::Integer(w) = w
1042 {
1043 Some(*w)
1044 } else {
1045 None
1046 }
1047 })
1048 .unwrap_or(0);
1049 let height = stream
1050 .dict
1051 .0
1052 .get(&crate::parser::objects::PdfName("Height".to_string()))
1053 .and_then(|h| {
1054 if let crate::parser::objects::PdfObject::Integer(h) = h
1055 {
1056 Some(*h)
1057 } else {
1058 None
1059 }
1060 })
1061 .unwrap_or(0);
1062 tracing::debug!(
1063 "🔍 [DEBUG] Page-specific fallback XObject {} -> Object {} ({}x{})",
1064 xobject_name, obj_num, width, height
1065 );
1066 return self
1067 .extract_image_stream_for_ocr(&stream);
1068 } else {
1069 tracing::debug!("🔍 [DEBUG] Page-specific fallback: stream is not an image (subtype: {})", subtype.0);
1070 }
1071 }
1072 None => {
1073 tracing::debug!("🔍 [DEBUG] Page-specific fallback: stream has no Subtype");
1074 }
1075 _ => {
1076 tracing::debug!("🔍 [DEBUG] Page-specific fallback: stream Subtype is not a name");
1077 }
1078 }
1079 }
1080 Ok(obj) => {
1081 tracing::debug!("🔍 [DEBUG] Page-specific fallback: object {} {} R is not a stream, got: {:?}", obj_num, gen_num, std::any::type_name_of_val(&obj));
1082 }
1083 Err(e) => {
1084 tracing::debug!("🔍 [DEBUG] Page-specific fallback: failed to get object {} {} R: {}", obj_num, gen_num, e);
1085 }
1086 }
1087 } else {
1088 tracing::debug!("🔍 [DEBUG] Page-specific fallback: XObject reference is not a Reference");
1089 }
1090 } else {
1091 tracing::debug!("🔍 [DEBUG] Page-specific fallback: XObject '{}' not found in resolved resources", xobject_name);
1092 }
1093 } else {
1094 tracing::debug!("🔍 [DEBUG] Page-specific fallback: no XObject dictionary in resolved resources");
1095 }
1096 }
1097 Ok(_) => {
1098 tracing::debug!("🔍 [DEBUG] Page-specific fallback: Resources reference resolved to non-dictionary");
1099 }
1100 Err(e) => {
1101 tracing::debug!(
1102 "🔍 [DEBUG] Page-specific fallback: failed to resolve Resources: {}",
1103 e
1104 );
1105 }
1106 }
1107 }
1108
1109 if let Some(resources) = resources {
1111 if let Some(crate::parser::objects::PdfObject::Dictionary(xobjects)) = resources
1112 .0
1113 .get(&crate::parser::objects::PdfName("XObject".to_string()))
1114 {
1115 #[allow(clippy::collapsible_match)]
1117 if let Some(xobject_ref) = xobjects
1118 .0
1119 .get(&crate::parser::objects::PdfName(xobject_name.to_string()))
1120 {
1121 if let crate::parser::objects::PdfObject::Reference(obj_num, gen_num) =
1122 xobject_ref
1123 {
1124 if let Ok(crate::parser::objects::PdfObject::Stream(stream)) =
1125 self.document.get_object(*obj_num, *gen_num)
1126 {
1127 if let Some(crate::parser::objects::PdfObject::Name(subtype)) = stream
1129 .dict
1130 .0
1131 .get(&crate::parser::objects::PdfName("Subtype".to_string()))
1132 {
1133 if subtype.0 == "Image" {
1134 let width = stream
1135 .dict
1136 .0
1137 .get(&crate::parser::objects::PdfName("Width".to_string()))
1138 .and_then(|w| {
1139 if let crate::parser::objects::PdfObject::Integer(w) = w
1140 {
1141 Some(*w)
1142 } else {
1143 None
1144 }
1145 })
1146 .unwrap_or(0);
1147
1148 let height = stream
1149 .dict
1150 .0
1151 .get(&crate::parser::objects::PdfName("Height".to_string()))
1152 .and_then(|h| {
1153 if let crate::parser::objects::PdfObject::Integer(h) = h
1154 {
1155 Some(*h)
1156 } else {
1157 None
1158 }
1159 })
1160 .unwrap_or(0);
1161
1162 tracing::debug!(
1163 "🔍 [DEBUG] Page-specific XObject {} -> Object {} ({}x{})",
1164 xobject_name,
1165 obj_num,
1166 width,
1167 height
1168 );
1169 return self.extract_image_stream_for_ocr(&stream);
1170 }
1171 }
1172 }
1173 }
1174 }
1175 }
1176 }
1177
1178 Err(OperationError::ParseError(format!(
1179 "No page-specific XObject found for name: {}",
1180 xobject_name
1181 )))
1182 }
1183
1184 fn find_specific_xobject_image(&self, xobject_name: &str) -> OperationResult<Vec<u8>> {
1186 for obj_num in 1..=1000 {
1190 if let Ok(crate::parser::objects::PdfObject::Stream(stream)) =
1192 self.document.get_object(obj_num, 0)
1193 {
1194 if let Some(crate::parser::objects::PdfObject::Name(subtype)) = stream
1196 .dict
1197 .0
1198 .get(&crate::parser::objects::PdfName("Subtype".to_string()))
1199 {
1200 if subtype.0 == "Image" {
1201 let width = stream
1204 .dict
1205 .0
1206 .get(&crate::parser::objects::PdfName("Width".to_string()))
1207 .and_then(|w| {
1208 if let crate::parser::objects::PdfObject::Integer(w) = w {
1209 Some(*w)
1210 } else {
1211 None
1212 }
1213 })
1214 .unwrap_or(0);
1215 let height = stream
1216 .dict
1217 .0
1218 .get(&crate::parser::objects::PdfName("Height".to_string()))
1219 .and_then(|h| {
1220 if let crate::parser::objects::PdfObject::Integer(h) = h {
1221 Some(*h)
1222 } else {
1223 None
1224 }
1225 })
1226 .unwrap_or(0);
1227
1228 if width > 100 && height > 100 {
1230 tracing::debug!(
1231 "🔍 [DEBUG] Using XObject {} -> Object {} ({}x{})",
1232 xobject_name,
1233 obj_num,
1234 width,
1235 height
1236 );
1237 return self.extract_image_stream_for_ocr(&stream);
1238 }
1239 }
1240 }
1241 }
1242 }
1243
1244 Err(OperationError::ParseError(format!(
1245 "No image XObject found for name: {}",
1246 xobject_name
1247 )))
1248 }
1249
1250 fn find_image_xobjects_in_document(&self) -> OperationResult<Vec<u8>> {
1252 for obj_num in 1..=1000 {
1255 if let Ok(crate::parser::objects::PdfObject::Stream(stream)) =
1257 self.document.get_object(obj_num, 0)
1258 {
1259 if let Some(crate::parser::objects::PdfObject::Name(subtype)) = stream
1261 .dict
1262 .0
1263 .get(&crate::parser::objects::PdfName("Subtype".to_string()))
1264 {
1265 if subtype.0 == "Image" {
1266 let width = stream
1268 .dict
1269 .0
1270 .get(&crate::parser::objects::PdfName("Width".to_string()))
1271 .and_then(|w| {
1272 if let crate::parser::objects::PdfObject::Integer(w) = w {
1273 Some(*w)
1274 } else {
1275 None
1276 }
1277 })
1278 .unwrap_or(0);
1279 let height = stream
1280 .dict
1281 .0
1282 .get(&crate::parser::objects::PdfName("Height".to_string()))
1283 .and_then(|h| {
1284 if let crate::parser::objects::PdfObject::Integer(h) = h {
1285 Some(*h)
1286 } else {
1287 None
1288 }
1289 })
1290 .unwrap_or(0);
1291
1292 if width > 100 && height > 100 {
1294 return self.extract_image_stream_for_ocr(&stream);
1295 }
1296 }
1297 }
1298 }
1299 }
1300
1301 Err(OperationError::ParseError(
1302 "No suitable image objects found in document".to_string(),
1303 ))
1304 }
1305
    /// Turns a PDF image stream into bytes an OCR provider can consume.
    ///
    /// DCTDecode streams are passed through as JPEG (after validating the SOI
    /// marker); FlateDecode/LZWDecode streams are decoded and converted from
    /// raw samples to PNG; CCITTFaxDecode/JBIG2Decode streams are decoded and
    /// converted via the CCITT path; unfiltered streams are decoded and
    /// converted as raw samples. Unsupported filters are an error.
    fn extract_image_stream_for_ocr(
        &self,
        stream: &crate::parser::objects::PdfStream,
    ) -> OperationResult<Vec<u8>> {
        tracing::debug!(
            "🔍 [DEBUG] extract_image_stream_for_ocr called with stream size: {}",
            stream.data.len()
        );

        // /Width and /Height are mandatory for image XObjects — bail if absent.
        let width = match stream
            .dict
            .0
            .get(&crate::parser::objects::PdfName("Width".to_string()))
        {
            Some(crate::parser::objects::PdfObject::Integer(w)) => *w as u32,
            _ => {
                return Err(OperationError::ParseError(
                    "Missing image width".to_string(),
                ))
            }
        };

        let height = match stream
            .dict
            .0
            .get(&crate::parser::objects::PdfName("Height".to_string()))
        {
            Some(crate::parser::objects::PdfObject::Integer(h)) => *h as u32,
            _ => {
                return Err(OperationError::ParseError(
                    "Missing image height".to_string(),
                ))
            }
        };

        let color_space = stream
            .dict
            .0
            .get(&crate::parser::objects::PdfName("ColorSpace".to_string()));
        // /BitsPerComponent defaults to 8 when absent or non-integer.
        let bits_per_component = match stream.dict.0.get(&crate::parser::objects::PdfName(
            "BitsPerComponent".to_string(),
        )) {
            Some(crate::parser::objects::PdfObject::Integer(bits)) => *bits as u8,
            _ => 8,
        };

        let filter = stream
            .dict
            .0
            .get(&crate::parser::objects::PdfName("Filter".to_string()));
        tracing::debug!(
            "🔍 [DEBUG] Image properties: {}x{}, {} bits, filter: {:?}",
            width,
            height,
            bits_per_component,
            filter
                .as_ref()
                .map(|f| match f {
                    crate::parser::objects::PdfObject::Name(n) => n.0.as_str(),
                    _ => "Array/Other",
                })
                .unwrap_or("None")
        );

        // Dispatch on the /Filter entry: single name, array of names, or none.
        let data = match filter {
            Some(crate::parser::objects::PdfObject::Name(filter_name)) => match filter_name
                .0
                .as_str()
            {
                "DCTDecode" => {
                    // DCTDecode data IS a JPEG — pass it through unchanged.
                    let jpeg_data = &stream.data;

                    tracing::debug!(
                        "🔍 [DEBUG] Processing DCTDecode stream: {} bytes",
                        jpeg_data.len()
                    );

                    if jpeg_data.len() < 4 {
                        return Err(OperationError::ParseError(
                            "DCTDecode stream too short to be valid JPEG".to_string(),
                        ));
                    }

                    // Sanity-check the JPEG Start-Of-Image marker (FF D8).
                    if jpeg_data[0] != 0xFF || jpeg_data[1] != 0xD8 {
                        return Err(OperationError::ParseError(format!(
                            "Invalid JPEG stream: missing SOI marker. Found: {:02X}{:02X}, expected FFD8",
                            jpeg_data[0], jpeg_data[1]
                        )));
                    }

                    tracing::debug!("✅ [DEBUG] JPEG SOI marker found");

                    let final_jpeg_data = jpeg_data.to_vec();

                    tracing::debug!(
                        "🔍 [DEBUG] Final JPEG size: {} bytes",
                        final_jpeg_data.len()
                    );

                    final_jpeg_data
                }
                filter_name => {
                    tracing::debug!("🔍 [DEBUG] Decoding stream with filter: {}", filter_name);
                    let parse_options = self.document.options();
                    let decoded_data = stream.decode(&parse_options).map_err(|e| {
                        OperationError::ParseError(format!("Failed to decode image stream: {e}"))
                    })?;

                    tracing::debug!(
                        "🔍 [DEBUG] Decoded stream data: {} bytes",
                        decoded_data.len()
                    );

                    match filter_name {
                        "FlateDecode" => {
                            self.convert_raw_to_png_for_ocr(
                                &decoded_data,
                                width,
                                height,
                                color_space,
                                bits_per_component,
                            )?
                        }
                        "CCITTFaxDecode" => {
                            self.convert_ccitt_to_png_for_ocr(&decoded_data, width, height)?
                        }
                        "LZWDecode" => {
                            self.convert_raw_to_png_for_ocr(
                                &decoded_data,
                                width,
                                height,
                                color_space,
                                bits_per_component,
                            )?
                        }
                        // JBIG2 is routed through the CCITT conversion path —
                        // presumably an approximation; TODO confirm fidelity.
                        "JBIG2Decode" => {
                            self.convert_ccitt_to_png_for_ocr(&decoded_data, width, height)?
                        }
                        _ => {
                            return Err(OperationError::ParseError(format!(
                                "Unsupported image filter: {}",
                                filter_name
                            )))
                        }
                    }
                }
            },
            Some(crate::parser::objects::PdfObject::Array(filters)) => {
                // Only the FIRST filter of the array is honored here.
                if let Some(crate::parser::objects::PdfObject::Name(filter)) = filters.0.first() {
                    match filter.0.as_str() {
                        "DCTDecode" => {
                            // NOTE(review): unlike the single-filter branch,
                            // this path skips the SOI validation — confirm
                            // whether that asymmetry is intentional.
                            tracing::debug!("🔍 [DEBUG] Array filter: Using raw JPEG stream data");
                            stream.data.clone()
                        }
                        filter_name => {
                            tracing::debug!(
                                "🔍 [DEBUG] Array filter: Decoding stream with filter: {}",
                                filter_name
                            );
                            let parse_options = self.document.options();
                            let decoded_data = stream.decode(&parse_options).map_err(|e| {
                                OperationError::ParseError(format!(
                                    "Failed to decode image stream: {e}"
                                ))
                            })?;

                            match filter_name {
                                "FlateDecode" => self.convert_raw_to_png_for_ocr(
                                    &decoded_data,
                                    width,
                                    height,
                                    color_space,
                                    bits_per_component,
                                )?,
                                "CCITTFaxDecode" => {
                                    self.convert_ccitt_to_png_for_ocr(&decoded_data, width, height)?
                                }
                                "LZWDecode" => self.convert_raw_to_png_for_ocr(
                                    &decoded_data,
                                    width,
                                    height,
                                    color_space,
                                    bits_per_component,
                                )?,
                                "JBIG2Decode" => {
                                    self.convert_ccitt_to_png_for_ocr(&decoded_data, width, height)?
                                }
                                _ => {
                                    return Err(OperationError::ParseError(format!(
                                        "Unsupported image filter in array: {}",
                                        filter_name
                                    )))
                                }
                            }
                        }
                    }
                } else {
                    return Err(OperationError::ParseError("Empty filter array".to_string()));
                }
            }
            _ => {
                // No /Filter entry: decode (no-op or implicit) and convert the
                // raw samples to PNG.
                tracing::debug!("🔍 [DEBUG] No filter: Converting raw image data to PNG");
                let parse_options = self.document.options();
                let decoded_data = stream.decode(&parse_options).map_err(|e| {
                    OperationError::ParseError(format!("Failed to decode raw image stream: {e}"))
                })?;

                self.convert_raw_to_png_for_ocr(
                    &decoded_data,
                    width,
                    height,
                    color_space,
                    bits_per_component,
                )?
            }
        };

        tracing::debug!("🔍 [DEBUG] Final image data for OCR: {} bytes", data.len());
        Ok(data)
    }
1547
1548 #[allow(dead_code)]
1551 fn clean_jpeg_data(&self, raw_data: &[u8]) -> Vec<u8> {
1552 tracing::debug!(
1553 "🔍 [DEBUG] Using raw DCTDecode stream as-is: {} bytes",
1554 raw_data.len()
1555 );
1556
1557 raw_data.to_vec()
1560 }
1561
    /// Feature-gated rotation fix: detects whether the extracted image needs
    /// rotating before OCR and, if so, rotates it via an external tool.
    #[cfg(feature = "external-images")]
    #[allow(dead_code)]
    fn fix_image_rotation_for_ocr(
        &self,
        image_data: &[u8],
        pdf_width: u32,
        pdf_height: u32,
    ) -> OperationResult<Vec<u8>> {
        tracing::debug!("🔍 [DEBUG] Image rotation correction with external-images feature");

        // Actual raster dimensions are unknown here (passed as 0, 0), so
        // detection falls back to the PDF page dimensions alone.
        let rotation_needed = self.detect_rotation_needed(pdf_width, pdf_height, 0, 0);

        if rotation_needed > 0 {
            self.rotate_image_externally(image_data, rotation_needed)
        } else {
            tracing::debug!("🔍 [DEBUG] No rotation correction needed based on dimensions");
            Ok(image_data.to_vec())
        }
    }
1585
1586 #[cfg(not(feature = "external-images"))]
1587 #[allow(dead_code)]
1588 fn fix_image_rotation_for_ocr(
1589 &self,
1590 image_data: &[u8],
1591 _pdf_width: u32,
1592 _pdf_height: u32,
1593 ) -> OperationResult<Vec<u8>> {
1594 tracing::debug!(
1595 "🔍 [DEBUG] Image rotation correction disabled (external-images feature not enabled)"
1596 );
1597 Ok(image_data.to_vec())
1598 }
1599
1600 #[allow(dead_code)]
1601 fn detect_rotation_needed(
1602 &self,
1603 pdf_width: u32,
1604 pdf_height: u32,
1605 img_width: u32,
1606 img_height: u32,
1607 ) -> u8 {
1608 let (actual_img_width, actual_img_height) = if img_width == 0 || img_height == 0 {
1614 (pdf_width, pdf_height)
1615 } else {
1616 (img_width, img_height)
1617 };
1618
1619 tracing::debug!(
1620 "🔍 [DEBUG] Rotation analysis - PDF: {}x{}, Image: {}x{}",
1621 pdf_width,
1622 pdf_height,
1623 actual_img_width,
1624 actual_img_height
1625 );
1626
1627 if pdf_height > pdf_width {
1629 tracing::debug!("🔍 [DEBUG] Portrait PDF detected - applying 270° rotation to correct typical scan rotation");
1632 return 3; }
1634
1635 if pdf_width == actual_img_height && pdf_height == actual_img_width {
1637 tracing::debug!("🔍 [DEBUG] Dimensions swapped - applying 90° rotation");
1638 return 1; }
1640
1641 tracing::debug!("🔍 [DEBUG] No rotation correction needed");
1642 0
1643 }
1644
1645 #[allow(dead_code)]
1646 fn rotate_image_externally(&self, image_data: &[u8], rotation: u8) -> OperationResult<Vec<u8>> {
1647 use std::fs;
1648 use std::process::Command;
1649
1650 let input_path = format!("examples/results/temp_input_{}.jpg", std::process::id());
1652 let output_path = format!("examples/results/temp_output_{}.jpg", std::process::id());
1653
1654 if let Err(e) = fs::write(&input_path, image_data) {
1656 tracing::debug!("🔍 [DEBUG] Failed to write temp input file: {}", e);
1657 return Ok(image_data.to_vec());
1658 }
1659
1660 let angle = match rotation {
1662 1 => "90", 2 => "180", 3 => "270", _ => {
1666 let _ = fs::remove_file(&input_path);
1667 return Ok(image_data.to_vec());
1668 }
1669 };
1670
1671 tracing::debug!(
1672 "🔍 [DEBUG] Attempting to rotate image {} degrees using external tool",
1673 angle
1674 );
1675
1676 let sips_result = Command::new("sips")
1678 .arg(&input_path)
1679 .arg("-r")
1680 .arg(angle)
1681 .arg("--out")
1682 .arg(&output_path)
1683 .output();
1684
1685 let rotated_data = match sips_result {
1686 Ok(sips_output) if sips_output.status.success() => match fs::read(&output_path) {
1687 Ok(data) => {
1688 tracing::debug!("🔍 [DEBUG] Successfully rotated image using sips");
1689 data
1690 }
1691 Err(e) => {
1692 tracing::debug!("🔍 [DEBUG] Failed to read sips-rotated image: {}", e);
1693 image_data.to_vec()
1694 }
1695 },
1696 Ok(sips_output) => {
1697 tracing::debug!(
1698 "🔍 [DEBUG] sips failed: {}",
1699 String::from_utf8_lossy(&sips_output.stderr)
1700 );
1701
1702 let result = Command::new("convert")
1704 .arg(&input_path)
1705 .arg("-rotate")
1706 .arg(angle)
1707 .arg(&output_path)
1708 .output();
1709
1710 match result {
1711 Ok(output) if output.status.success() => match fs::read(&output_path) {
1712 Ok(data) => {
1713 tracing::debug!(
1714 "🔍 [DEBUG] Successfully rotated image using ImageMagick"
1715 );
1716 data
1717 }
1718 Err(e) => {
1719 tracing::debug!("🔍 [DEBUG] Failed to read rotated image: {}", e);
1720 image_data.to_vec()
1721 }
1722 },
1723 _ => {
1724 tracing::debug!(
1725 "🔍 [DEBUG] Both sips and ImageMagick failed, using original image"
1726 );
1727 image_data.to_vec()
1728 }
1729 }
1730 }
1731 Err(e) => {
1732 tracing::debug!("🔍 [DEBUG] sips not available: {}", e);
1733 tracing::debug!("🔍 [DEBUG] Trying ImageMagick as fallback...");
1734
1735 let result = Command::new("convert")
1736 .arg(&input_path)
1737 .arg("-rotate")
1738 .arg(angle)
1739 .arg(&output_path)
1740 .output();
1741
1742 match result {
1743 Ok(output) if output.status.success() => match fs::read(&output_path) {
1744 Ok(data) => {
1745 tracing::debug!(
1746 "🔍 [DEBUG] Successfully rotated image using ImageMagick"
1747 );
1748 data
1749 }
1750 Err(e) => {
1751 tracing::debug!("🔍 [DEBUG] Failed to read rotated image: {}", e);
1752 image_data.to_vec()
1753 }
1754 },
1755 _ => {
1756 tracing::debug!(
1757 "🔍 [DEBUG] No external rotation tools available, using original image"
1758 );
1759 image_data.to_vec()
1760 }
1761 }
1762 }
1763 };
1764
1765 let _ = fs::remove_file(&input_path);
1767 let _ = fs::remove_file(&output_path);
1768
1769 Ok(rotated_data)
1770 }
1771
1772 #[allow(dead_code)]
1775 fn clean_corrupted_jpeg(
1776 &self,
1777 corrupted_jpeg_data: &[u8],
1778 width: u32,
1779 _height: u32,
1780 ) -> OperationResult<Vec<u8>> {
1781 use std::fs;
1782 use std::process::Command;
1783
1784 tracing::debug!("🔧 [DEBUG] Cleaning corrupted JPEG using sips");
1785
1786 let temp_id = std::process::id();
1788 let input_path = format!("/tmp/ocr_corrupted_{}_{}.jpg", temp_id, width);
1789 let output_path = format!("/tmp/ocr_clean_{}_{}.jpg", temp_id, width);
1790
1791 fs::write(&input_path, corrupted_jpeg_data).map_err(|e| {
1793 OperationError::ProcessingError(format!("Failed to write temp JPEG: {e}"))
1794 })?;
1795
1796 tracing::debug!("🔧 [DEBUG] Saved corrupted JPEG to: {}", input_path);
1797
1798 let output = Command::new("sips")
1800 .args([
1801 "-s",
1802 "format",
1803 "jpeg",
1804 "-s",
1805 "formatOptions",
1806 "100", &input_path,
1808 "--out",
1809 &output_path,
1810 ])
1811 .output()
1812 .map_err(|e| OperationError::ProcessingError(format!("Failed to run sips: {e}")))?;
1813
1814 if !output.status.success() {
1815 let stderr = String::from_utf8_lossy(&output.stderr);
1816 tracing::debug!("❌ [DEBUG] sips failed: {}", stderr);
1817
1818 let _ = fs::remove_file(&input_path);
1820 let _ = fs::remove_file(&output_path);
1821
1822 tracing::debug!("🔧 [DEBUG] Falling back to original JPEG data");
1824 return Ok(corrupted_jpeg_data.to_vec());
1825 }
1826
1827 let cleaned_data = fs::read(&output_path).map_err(|e| {
1829 OperationError::ProcessingError(format!("Failed to read cleaned JPEG: {e}"))
1830 })?;
1831
1832 tracing::debug!(
1833 "🔧 [DEBUG] Successfully cleaned JPEG: {} -> {} bytes",
1834 corrupted_jpeg_data.len(),
1835 cleaned_data.len()
1836 );
1837
1838 let _ = fs::remove_file(&input_path);
1842 let _ = fs::remove_file(&output_path);
1843
1844 Ok(cleaned_data)
1845 }
1846
1847 fn calculate_page_area(&self, page: &crate::parser::ParsedPage) -> OperationResult<f64> {
1851 let width = page.width();
1853 let height = page.height();
1854
1855 Ok(width * height)
1856 }
1857
1858 fn analyze_text_content(&self, page_number: usize) -> OperationResult<TextAnalysisResult> {
1860 let mut extractor = TextExtractor::with_options(ExtractionOptions {
1861 preserve_layout: true,
1862 space_threshold: 0.3,
1863 newline_threshold: 10.0,
1864 ..Default::default()
1865 });
1866
1867 let extracted_text = extractor
1868 .extract_from_page(&self.document, page_number as u32)
1869 .map_err(|e| OperationError::ParseError(e.to_string()))?;
1870
1871 let mut total_area = 0.0;
1872 let mut fragment_count = 0;
1873 let character_count = extracted_text.text.len();
1874
1875 for fragment in &extracted_text.fragments {
1877 if fragment.text.trim().len() >= self.options.min_text_fragment_size {
1878 total_area += fragment.width * fragment.height;
1879 fragment_count += 1;
1880 }
1881 }
1882
1883 Ok(TextAnalysisResult {
1884 total_area,
1885 fragment_count,
1886 character_count,
1887 })
1888 }
1889
    /// Estimate how much of the page is covered by images.
    ///
    /// Combines three signals:
    /// 1. image XObjects referenced from the page's resources (area taken
    ///    from their /Width × /Height entries; images smaller than
    ///    `options.min_image_size` on either axis add no area but are
    ///    still counted);
    /// 2. inline images ("BI" … "EI" pairs) in the content streams, each
    ///    treated as covering the whole page;
    /// 3. as a fallback, " Do" operators when no image was counted yet —
    ///    NOTE(review): Do also paints form XObjects, so this can
    ///    over-count; heuristic only.
    fn analyze_image_content(&self, page_number: usize) -> OperationResult<ImageAnalysisResult> {
        let page = self
            .document
            .get_page(page_number as u32)
            .map_err(|e| OperationError::ParseError(e.to_string()))?;

        let resources = self
            .document
            .get_page_resources(&page)
            .map_err(|e| OperationError::ParseError(e.to_string()))?;

        let mut total_area = 0.0;
        let mut image_count = 0;

        // Pass 1: image XObjects declared in the /Resources dictionary.
        if let Some(resources) = &resources {
            if let Some(crate::parser::objects::PdfObject::Dictionary(xobjects)) = resources
                .0
                .get(&crate::parser::objects::PdfName("XObject".to_string()))
            {
                for obj_ref in xobjects.0.values() {
                    // Only indirect references are followed; inline
                    // dictionary entries are ignored.
                    if let crate::parser::objects::PdfObject::Reference(obj_num, gen_num) = obj_ref
                    {
                        if let Ok(crate::parser::objects::PdfObject::Stream(stream)) =
                            self.document.get_object(*obj_num, *gen_num)
                        {
                            // Only /Subtype /Image counts; form XObjects are skipped here.
                            if let Some(crate::parser::objects::PdfObject::Name(subtype)) = stream
                                .dict
                                .0
                                .get(&crate::parser::objects::PdfName("Subtype".to_string()))
                            {
                                if subtype.0 == "Image" {
                                    image_count += 1;

                                    // Missing or non-integer dimensions fall back to 0,
                                    // which also fails the min-size filter below.
                                    let width =
                                        match stream.dict.0.get(&crate::parser::objects::PdfName(
                                            "Width".to_string(),
                                        )) {
                                            Some(crate::parser::objects::PdfObject::Integer(w)) => {
                                                *w as f64
                                            }
                                            _ => 0.0,
                                        };

                                    let height =
                                        match stream.dict.0.get(&crate::parser::objects::PdfName(
                                            "Height".to_string(),
                                        )) {
                                            Some(crate::parser::objects::PdfObject::Integer(h)) => {
                                                *h as f64
                                            }
                                            _ => 0.0,
                                        };

                                    // Tiny images (icons, rules) are counted
                                    // but contribute no area.
                                    if width >= self.options.min_image_size as f64
                                        && height >= self.options.min_image_size as f64
                                    {
                                        total_area += width * height;
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }

        // Pass 2: scan the raw content streams for inline images and Do calls.
        if let Ok(content_streams) = self.document.get_page_content_streams(&page) {
            for content_stream in content_streams.iter() {
                let content_str = String::from_utf8_lossy(content_stream);

                // NOTE(review): plain substring matching — "BI"/"EI" may also
                // occur inside binary stream data or string literals.
                let bi_count = content_str.matches("BI").count();
                let ei_count = content_str.matches("EI").count();

                if bi_count > 0 && ei_count > 0 {
                    // Each inline image is assumed to cover the full page.
                    image_count += bi_count.min(ei_count);
                    let page_area = page.width() * page.height();
                    total_area += page_area * (bi_count.min(ei_count) as f64);
                }

                // Fallback only when nothing has been counted so far
                // (across all streams processed up to this point).
                let do_count = content_str.matches(" Do").count();
                if do_count > 0 && image_count == 0 {
                    image_count += do_count;
                    let page_area = page.width() * page.height();
                    total_area += page_area * (do_count as f64);
                }
            }
        }

        Ok(ImageAnalysisResult {
            total_area,
            image_count,
        })
    }
1998
1999 fn determine_page_type(&self, text_ratio: f64, image_ratio: f64) -> PageType {
2013 if image_ratio > self.options.scanned_threshold && text_ratio < 0.1 {
2014 PageType::Scanned
2015 } else if text_ratio > self.options.text_threshold && image_ratio < 0.2 {
2016 PageType::Text
2017 } else {
2018 PageType::Mixed
2019 }
2020 }
2021
2022 fn convert_raw_to_png_for_ocr(
2024 &self,
2025 data: &[u8],
2026 width: u32,
2027 height: u32,
2028 color_space: Option<&crate::parser::objects::PdfObject>,
2029 bits_per_component: u8,
2030 ) -> OperationResult<Vec<u8>> {
2031 let components = match color_space {
2035 Some(crate::parser::objects::PdfObject::Name(cs)) => match cs.0.as_str() {
2036 "DeviceGray" => 1,
2037 "DeviceRGB" => 3,
2038 "DeviceCMYK" => 4,
2039 _ => 3, },
2041 _ => 3, };
2043
2044 let mut png_data = Vec::new();
2046
2047 png_data.extend_from_slice(&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]);
2049
2050 let mut ihdr = Vec::new();
2052 ihdr.extend_from_slice(&width.to_be_bytes());
2053 ihdr.extend_from_slice(&height.to_be_bytes());
2054 ihdr.push(bits_per_component);
2055
2056 let color_type = match components {
2058 1 => 0, 3 => 2, 4 => 6, _ => 2, };
2063 ihdr.push(color_type);
2064 ihdr.push(0); ihdr.push(0); ihdr.push(0); self.write_png_chunk(&mut png_data, b"IHDR", &ihdr);
2069
2070 let compressed_data = self.compress_png_data(data, width, height, components)?;
2072 self.write_png_chunk(&mut png_data, b"IDAT", &compressed_data);
2073
2074 self.write_png_chunk(&mut png_data, b"IEND", &[]);
2076
2077 Ok(png_data)
2078 }
2079
2080 fn convert_ccitt_to_png_for_ocr(
2082 &self,
2083 data: &[u8],
2084 width: u32,
2085 height: u32,
2086 ) -> OperationResult<Vec<u8>> {
2087 let mut grayscale_data = Vec::new();
2089
2090 let bits_per_row = width as usize;
2091 let bytes_per_row = bits_per_row.div_ceil(8);
2092
2093 for row in 0..height {
2094 let row_start = row as usize * bytes_per_row;
2095
2096 for col in 0..width {
2097 let byte_idx = row_start + (col as usize / 8);
2098 let bit_idx = 7 - (col as usize % 8);
2099
2100 if byte_idx < data.len() {
2101 let bit = (data[byte_idx] >> bit_idx) & 1;
2102 let gray_value = if bit == 0 { 0 } else { 255 };
2104 grayscale_data.push(gray_value);
2105 } else {
2106 grayscale_data.push(255); }
2108 }
2109 }
2110
2111 self.convert_raw_to_png_for_ocr(
2113 &grayscale_data,
2114 width,
2115 height,
2116 Some(&crate::parser::objects::PdfObject::Name(
2117 crate::parser::objects::PdfName("DeviceGray".to_string()),
2118 )),
2119 8,
2120 )
2121 }
2122
2123 fn write_png_chunk(&self, output: &mut Vec<u8>, chunk_type: &[u8; 4], data: &[u8]) {
2125 output.extend_from_slice(&(data.len() as u32).to_be_bytes());
2127
2128 output.extend_from_slice(chunk_type);
2130
2131 output.extend_from_slice(data);
2133
2134 let crc = self.calculate_png_crc32(chunk_type, data);
2136 output.extend_from_slice(&crc.to_be_bytes());
2137 }
2138
2139 fn calculate_png_crc32(&self, chunk_type: &[u8; 4], data: &[u8]) -> u32 {
2141 let mut crc: u32 = 0xFFFFFFFF;
2142
2143 for &byte in chunk_type {
2145 crc ^= byte as u32;
2146 for _ in 0..8 {
2147 if crc & 1 != 0 {
2148 crc = (crc >> 1) ^ 0xEDB88320;
2149 } else {
2150 crc >>= 1;
2151 }
2152 }
2153 }
2154
2155 for &byte in data {
2157 crc ^= byte as u32;
2158 for _ in 0..8 {
2159 if crc & 1 != 0 {
2160 crc = (crc >> 1) ^ 0xEDB88320;
2161 } else {
2162 crc >>= 1;
2163 }
2164 }
2165 }
2166
2167 crc ^ 0xFFFFFFFF
2168 }
2169
2170 fn compress_png_data(
2172 &self,
2173 data: &[u8],
2174 width: u32,
2175 height: u32,
2176 components: u8,
2177 ) -> OperationResult<Vec<u8>> {
2178 use flate2::write::ZlibEncoder;
2179 use flate2::Compression;
2180 use std::io::Write;
2181
2182 let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
2183
2184 let bytes_per_pixel = components as usize;
2186 let bytes_per_row = width as usize * bytes_per_pixel;
2187
2188 for row in 0..height {
2189 encoder.write_all(&[0])?;
2191
2192 let start = row as usize * bytes_per_row;
2194 let end = start + bytes_per_row;
2195 if end <= data.len() {
2196 encoder.write_all(&data[start..end])?;
2197 } else {
2198 let available = data.len().saturating_sub(start);
2200 if available > 0 {
2201 encoder.write_all(&data[start..start + available])?;
2202 }
2203 let padding = bytes_per_row.saturating_sub(available);
2204 for _ in 0..padding {
2205 encoder.write_all(&[0])?;
2206 }
2207 }
2208 }
2209
2210 encoder
2211 .finish()
2212 .map_err(|e| OperationError::ParseError(format!("Failed to compress PNG data: {e}")))
2213 }
2214}
2215
/// Per-page text metrics produced by `analyze_text_content`.
struct TextAnalysisResult {
    // Total bounding-box area (width × height, user-space units) of
    // fragments that passed the minimum-size filter.
    total_area: f64,
    // Number of fragments at or above `min_text_fragment_size`.
    fragment_count: usize,
    // Byte length of the page's extracted text (String::len, not chars).
    character_count: usize,
}
2222
/// Per-page image metrics produced by `analyze_image_content`.
struct ImageAnalysisResult {
    // Combined estimated area attributed to images (XObject pixel area
    // plus full-page area per inline image / Do fallback).
    total_area: f64,
    // Number of images detected across all three detection passes.
    image_count: usize,
}
2228
2229fn simulate_page_ocr_processing<P: OcrProvider>(
2231 page_num: usize,
2232 ocr_provider: &P,
2233) -> Result<OcrProcessingResult, crate::text::ocr::OcrError> {
2234 let mock_image_data = vec![
2236 0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x01, 0x00,
2237 0x48, 0x00, 0x48, 0x00, 0x00, 0xFF, 0xD9,
2238 ];
2239
2240 let options = crate::text::ocr::OcrOptions {
2241 language: "eng".to_string(),
2242 min_confidence: 0.6,
2243 preserve_layout: true,
2244 preprocessing: crate::text::ocr::ImagePreprocessing::default(),
2245 engine_options: std::collections::HashMap::new(),
2246 timeout_seconds: 30,
2247 regions: None,
2248 debug_output: false,
2249 };
2250
2251 let mut result = ocr_provider.process_image(&mock_image_data, &options)?;
2253
2254 result.text = format!("Page {page_num} text extracted via OCR");
2256
2257 Ok(result)
2258}
2259
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_page_type_classification() {
        assert!(PageType::Scanned.is_scanned());
        assert!(!PageType::Text.is_scanned());
        assert!(!PageType::Mixed.is_scanned());

        assert!(PageType::Text.is_text());
        assert!(!PageType::Scanned.is_text());
        assert!(!PageType::Mixed.is_text());

        assert!(PageType::Mixed.is_mixed());
        assert!(!PageType::Scanned.is_mixed());
        assert!(!PageType::Text.is_mixed());
    }

    #[test]
    fn test_content_analysis_methods() {
        let analysis = ContentAnalysis {
            page_number: 0,
            page_type: PageType::Scanned,
            text_ratio: 0.05,
            image_ratio: 0.90,
            blank_space_ratio: 0.05,
            text_fragment_count: 2,
            image_count: 1,
            character_count: 15,
        };

        assert!(analysis.is_scanned());
        assert!(!analysis.is_text_heavy());
        assert!(!analysis.is_mixed_content());
        assert_eq!(analysis.dominant_content_ratio(), 0.90);
    }

    #[test]
    fn test_analysis_options_default() {
        let options = AnalysisOptions::default();
        assert_eq!(options.min_text_fragment_size, 3);
        assert_eq!(options.min_image_size, 50);
        assert_eq!(options.scanned_threshold, 0.8);
        assert_eq!(options.text_threshold, 0.7);
        assert!(options.ocr_options.is_none());
    }

    /// Exercise `determine_page_type` on a real analyzer.
    ///
    /// Fix: the previous version re-implemented the threshold logic inline
    /// with constants and never called `determine_page_type`, so it could
    /// not catch a regression in the method itself.
    #[test]
    fn test_determine_page_type() {
        use crate::{Document, Page};
        use tempfile::TempDir;

        let temp_dir = TempDir::new().unwrap();
        let pdf_path = temp_dir.path().join("test.pdf");
        let mut doc = Document::new();
        doc.add_page(Page::a4());
        doc.save(&pdf_path).unwrap();

        let analyzer = PageContentAnalyzer::from_file(&pdf_path).unwrap();

        // Dominant image coverage with almost no text → Scanned.
        assert_eq!(analyzer.determine_page_type(0.05, 0.90), PageType::Scanned);
        // Dominant text coverage with little image coverage → Text.
        assert_eq!(analyzer.determine_page_type(0.80, 0.10), PageType::Text);
        // Neither threshold satisfied → Mixed.
        assert_eq!(analyzer.determine_page_type(0.40, 0.50), PageType::Mixed);
    }

    /// JBIG2Decode streams must no longer hit the generic "Unsupported
    /// image filter" error in the OCR extraction path.
    #[test]
    fn test_jbig2decode_filter_no_longer_errors_in_ocr_path() {
        use crate::parser::objects::{PdfDictionary, PdfName, PdfObject, PdfStream};
        use crate::{Document, Page};
        use std::collections::HashMap;
        use tempfile::TempDir;

        let temp_dir = TempDir::new().unwrap();
        let pdf_path = temp_dir.path().join("test.pdf");
        let mut doc = Document::new();
        doc.add_page(Page::a4());
        doc.save(&pdf_path).unwrap();

        let analyzer = PageContentAnalyzer::from_file(&pdf_path).unwrap();

        // Minimal JBIG2-encoded image stream dictionary.
        let mut dict_map: HashMap<PdfName, PdfObject> = HashMap::new();
        dict_map.insert(PdfName("Width".to_string()), PdfObject::Integer(4));
        dict_map.insert(PdfName("Height".to_string()), PdfObject::Integer(4));
        dict_map.insert(
            PdfName("BitsPerComponent".to_string()),
            PdfObject::Integer(1),
        );
        dict_map.insert(
            PdfName("Filter".to_string()),
            PdfObject::Name(PdfName("JBIG2Decode".to_string())),
        );
        let stream_data = vec![0u8; 16];

        let stream = PdfStream {
            dict: PdfDictionary(dict_map),
            data: stream_data,
        };

        let result = analyzer.extract_image_stream_for_ocr(&stream);

        // Decoding stub data may still fail, but not with the old
        // "Unsupported image filter" message.
        if let Err(err) = &result {
            let msg = err.to_string();
            assert!(
                !msg.contains("Unsupported image filter: JBIG2Decode"),
                "JBIG2Decode should no longer produce 'Unsupported image filter' error, got: {msg}"
            );
        }
    }
}
2398
2399#[cfg(test)]
2400#[path = "page_analysis_tests.rs"]
2401mod page_analysis_tests;
2402
2403#[cfg(test)]
2404#[path = "page_analysis_ocr_tests.rs"]
2405mod page_analysis_ocr_tests;
2406
2407#[cfg(test)]
2408mod comprehensive_tests {
2409 use super::*;
2410 use crate::parser::{PdfDocument, PdfReader};
2411 use crate::text::{MockOcrProvider, OcrError, OcrOptions, OcrProvider};
2412 use std::fs::File;
2413 use std::io::Write;
2414 use std::sync::Mutex;
2415 use std::time::Duration;
2416 use tempfile::NamedTempFile;
2417
2418 fn create_mock_document() -> crate::parser::document::PdfDocument<std::fs::File> {
2420 use crate::{Document, Page};
2422
2423 let mut doc = Document::new();
2424 doc.add_page(Page::a4());
2425
2426 let temp_file = NamedTempFile::new().expect("Failed to create temp file");
2428 doc.save(temp_file.path()).expect("Failed to save PDF");
2429
2430 let file = std::fs::File::open(temp_file.path()).expect("Failed to open PDF file");
2432 let reader =
2433 crate::parser::reader::PdfReader::new(file).expect("Failed to create PDF reader");
2434 crate::parser::document::PdfDocument::new(reader)
2435 }
2436
2437 #[test]
2439 fn test_text_analysis_result_struct() {
2440 let result = TextAnalysisResult {
2441 total_area: 1000.0,
2442 fragment_count: 10,
2443 character_count: 500,
2444 };
2445
2446 assert_eq!(result.total_area, 1000.0);
2447 assert_eq!(result.fragment_count, 10);
2448 assert_eq!(result.character_count, 500);
2449 }
2450
2451 #[test]
2453 fn test_image_analysis_result_struct() {
2454 let result = ImageAnalysisResult {
2455 total_area: 5000.0,
2456 image_count: 3,
2457 };
2458
2459 assert_eq!(result.total_area, 5000.0);
2460 assert_eq!(result.image_count, 3);
2461 }
2462
2463 #[test]
2465 fn test_analyzer_with_custom_options() {
2466 let doc = create_mock_document();
2467 let custom_options = AnalysisOptions {
2468 min_text_fragment_size: 10,
2469 min_image_size: 200,
2470 scanned_threshold: 0.9,
2471 text_threshold: 0.6,
2472 ocr_options: Some(OcrOptions {
2473 language: "de".to_string(),
2474 min_confidence: 0.85,
2475 ..Default::default()
2476 }),
2477 };
2478
2479 let analyzer = PageContentAnalyzer::with_options(doc, custom_options);
2480
2481 let page_count_result = analyzer.document.page_count();
2483 assert!(page_count_result.is_ok());
2484 assert_eq!(page_count_result.unwrap(), 1);
2485 }
2486
2487 #[test]
2489 fn test_multiple_analyzers() {
2490 let analyzers: Vec<_> = (0..3)
2492 .map(|_| {
2493 let doc = create_mock_document();
2494 PageContentAnalyzer::new(doc)
2495 })
2496 .collect();
2497
2498 for (i, analyzer) in analyzers.iter().enumerate() {
2500 let result = analyzer.document.page_count();
2501 assert!(result.is_ok());
2502 assert_eq!(result.unwrap(), 1);
2503 tracing::debug!("Analyzer {i} works correctly");
2504 }
2505 }
2506
2507 #[test]
2509 fn test_custom_options_propagation() {
2510 let doc = create_mock_document();
2511 let custom_options = AnalysisOptions {
2512 min_text_fragment_size: 15,
2513 min_image_size: 300,
2514 scanned_threshold: 0.85,
2515 text_threshold: 0.65,
2516 ocr_options: None,
2517 };
2518
2519 let analyzer = PageContentAnalyzer::with_options(doc, custom_options);
2520
2521 let result = analyzer.analyze_page(0);
2523 assert!(result.is_ok());
2524 }
2525
2526 #[test]
2528 fn test_empty_document_analysis() {
2529 let pdf_data = b"%PDF-1.4
25311 0 obj
2532<<
2533/Type /Catalog
2534/Pages 2 0 R
2535>>
2536endobj
25372 0 obj
2538<<
2539/Type /Pages
2540/Kids []
2541/Count 0
2542>>
2543endobj
2544xref
25450 3
25460000000000 65535 f
25470000000009 00000 n
25480000000058 00000 n
2549trailer
2550<<
2551/Size 3
2552/Root 1 0 R
2553>>
2554startxref
2555107
2556%%EOF";
2557
2558 let mut temp_file = NamedTempFile::new().expect("Failed to create temp file");
2560 temp_file
2561 .write_all(pdf_data)
2562 .expect("Failed to write PDF data");
2563 temp_file.flush().expect("Failed to flush");
2564
2565 let path = temp_file.path().to_owned();
2567 let file = File::open(&path).expect("Failed to open temp file");
2568
2569 std::mem::forget(temp_file);
2571
2572 let result = PdfReader::new(file);
2574 if result.is_err() {
2575 return;
2578 }
2579
2580 let reader = result.unwrap();
2581 let doc = PdfDocument::new(reader);
2582 let analyzer = PageContentAnalyzer::new(doc);
2583
2584 let analysis_result = analyzer.analyze_document();
2585 assert!(analysis_result.is_ok());
2586 assert_eq!(analysis_result.unwrap().len(), 0);
2587
2588 let scanned_pages = analyzer.find_scanned_pages();
2589 assert!(scanned_pages.is_ok());
2590 assert_eq!(scanned_pages.unwrap().len(), 0);
2591 }
2592
2593 #[test]
2595 fn test_invalid_page_number_handling() {
2596 let doc = create_mock_document();
2597 let analyzer = PageContentAnalyzer::new(doc);
2598
2599 let result = analyzer.analyze_page(999);
2601 if result.is_err() {
2605 assert!(result.unwrap_err().to_string().contains("Page"));
2606 } else {
2607 let analysis = result.unwrap();
2609 assert_eq!(analysis.page_number, 999);
2610 }
2611
2612 let result = analyzer.is_scanned_page(100);
2614 if result.is_err() {
2616 assert!(result.unwrap_err().to_string().contains("Page"));
2617 } else {
2618 let _is_scanned = result.unwrap();
2620 }
2621 }
2622
2623 #[test]
2625 fn test_ocr_extraction_non_scanned_page() {
2626 let doc = create_mock_document();
2627 let analyzer = PageContentAnalyzer::new(doc);
2628 let ocr_provider = MockOcrProvider::new();
2629
2630 let result = analyzer.extract_text_from_scanned_page(0, &ocr_provider);
2632 assert!(result.is_err());
2633 assert!(result
2634 .unwrap_err()
2635 .to_string()
2636 .contains("not a scanned page"));
2637 }
2638
2639 #[test]
2641 fn test_ocr_processing_fallback() {
2642 let doc = create_mock_document();
2643 let analyzer = PageContentAnalyzer::new(doc);
2644 let ocr_provider = MockOcrProvider::new();
2645
2646 let result = analyzer.process_scanned_pages_with_ocr(&ocr_provider);
2648 assert!(result.is_ok());
2649
2650 let result = analyzer.process_scanned_pages_batch(&ocr_provider, 1);
2652 assert!(result.is_ok());
2653 }
2654
2655 #[test]
2657 fn test_ocr_processing_edge_cases() {
2658 let doc = create_mock_document();
2659 let analyzer = PageContentAnalyzer::new(doc);
2660 let ocr_provider = MockOcrProvider::new();
2661
2662 let result = analyzer.find_scanned_pages();
2664 assert!(result.is_ok());
2665
2666 let result = analyzer.process_scanned_pages_batch(&ocr_provider, 0);
2668 assert!(result.is_ok());
2669 }
2670
2671 #[test]
2673 fn test_batch_ocr_processing() {
2674 let doc = create_mock_document();
2675 let analyzer = PageContentAnalyzer::new(doc);
2676 let ocr_provider = MockOcrProvider::new();
2677
2678 let result = analyzer.process_scanned_pages_batch(&ocr_provider, 1);
2680 assert!(result.is_ok());
2681
2682 let result = analyzer.process_scanned_pages_batch(&ocr_provider, 5);
2684 assert!(result.is_ok());
2685
2686 let result = analyzer.process_scanned_pages_batch(&ocr_provider, 100);
2688 assert!(result.is_ok());
2689 }
2690
2691 #[test]
2693 fn test_analyze_specific_pages() {
2694 let doc = create_mock_document();
2695 let analyzer = PageContentAnalyzer::new(doc);
2696
2697 let result = analyzer.analyze_pages(&[0]);
2699 assert!(result.is_ok());
2700 assert_eq!(result.unwrap().len(), 1);
2701
2702 let result = analyzer.analyze_pages(&[0, 99]);
2704 assert!(
2705 result.is_err(),
2706 "analyze_pages with out-of-range page index should return error"
2707 );
2708 }
2709
2710 #[test]
2712 fn test_content_analysis_edge_cases() {
2713 let analysis = ContentAnalysis {
2715 page_number: 0,
2716 page_type: PageType::Mixed,
2717 text_ratio: 0.0,
2718 image_ratio: 0.0,
2719 blank_space_ratio: 1.0,
2720 text_fragment_count: 0,
2721 image_count: 0,
2722 character_count: 0,
2723 };
2724
2725 assert!(!analysis.is_scanned());
2726 assert!(!analysis.is_text_heavy());
2727 assert!(analysis.is_mixed_content());
2728 assert_eq!(analysis.dominant_content_ratio(), 0.0);
2731
2732 let analysis2 = ContentAnalysis {
2734 page_number: 1,
2735 page_type: PageType::Mixed,
2736 text_ratio: 0.33,
2737 image_ratio: 0.33,
2738 blank_space_ratio: 0.34,
2739 text_fragment_count: 10,
2740 image_count: 5,
2741 character_count: 100,
2742 };
2743
2744 assert!(analysis2.is_mixed_content());
2745 assert_eq!(analysis2.dominant_content_ratio(), 0.33); }
2747
2748 #[test]
2750 fn test_ocr_provider_mock_customization() {
2751 let mut provider = MockOcrProvider::new();
2752
2753 provider.set_mock_text("Custom OCR result for testing".to_string());
2755 provider.set_confidence(0.99);
2756 provider.set_processing_delay(10);
2757
2758 let options = OcrOptions::default();
2759 let mock_image = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46]; let start = std::time::Instant::now();
2762 let result = provider.process_image(&mock_image, &options);
2763 let elapsed = start.elapsed();
2764
2765 assert!(result.is_ok());
2766 let ocr_result = result.unwrap();
2767 assert!(ocr_result.text.contains("Custom OCR result"));
2768 assert_eq!(ocr_result.confidence, 0.99);
2769 assert!(elapsed >= Duration::from_millis(10));
2770 }
2771
2772 #[test]
2774 fn test_simulate_page_ocr_processing() {
2775 let provider = MockOcrProvider::new();
2776 let result = simulate_page_ocr_processing(5, &provider);
2777
2778 assert!(result.is_ok());
2779 let ocr_result = result.unwrap();
2780 assert!(ocr_result.text.contains("Page 5"));
2781 assert_eq!(ocr_result.language, "eng");
2782 }
2783
2784 #[test]
2786 fn test_process_scanned_pages_error_handling() {
2787 struct FailingOcrProvider;
2789
2790 impl OcrProvider for FailingOcrProvider {
2791 fn process_image(
2792 &self,
2793 _: &[u8],
2794 _: &OcrOptions,
2795 ) -> Result<OcrProcessingResult, OcrError> {
2796 Err(OcrError::ProcessingFailed("Simulated failure".to_string()))
2797 }
2798
2799 fn process_page(
2800 &self,
2801 _: &ContentAnalysis,
2802 _: &[u8],
2803 _: &OcrOptions,
2804 ) -> Result<OcrProcessingResult, OcrError> {
2805 Err(OcrError::ProcessingFailed("Simulated failure".to_string()))
2806 }
2807
2808 fn supported_formats(&self) -> Vec<crate::graphics::ImageFormat> {
2809 vec![]
2810 }
2811
2812 fn engine_name(&self) -> &str {
2813 "Failing"
2814 }
2815
2816 fn engine_type(&self) -> crate::text::OcrEngine {
2817 crate::text::OcrEngine::Mock
2818 }
2819 }
2820
2821 let doc = create_mock_document();
2822 let analyzer = PageContentAnalyzer::new(doc);
2823 let failing_provider = FailingOcrProvider;
2824
2825 let result = analyzer.process_scanned_pages_with_ocr(&failing_provider);
2827 assert!(result.is_ok());
2828 assert_eq!(result.unwrap().len(), 0); }
2830
2831 #[test]
2833 fn test_page_area_calculation() {
2834 let doc = create_mock_document();
2835 let analyzer = PageContentAnalyzer::new(doc);
2836
2837 let page = analyzer.document.get_page(0).unwrap();
2839 let area = analyzer.calculate_page_area(&page);
2840
2841 assert!(area.is_ok());
2842 let area_value = area.unwrap();
2843 assert!(area_value > 0.0);
2844 assert_eq!(area_value, 500990.0);
2846 }
2847
2848 #[test]
2850 fn test_determine_page_type_exact_thresholds() {
2851 let analyzer = PageContentAnalyzer::new(create_mock_document());
2852
2853 let page_type = analyzer.determine_page_type(0.09, 0.81);
2855 assert_eq!(page_type, PageType::Scanned);
2856
2857 let page_type = analyzer.determine_page_type(0.71, 0.19);
2859 assert_eq!(page_type, PageType::Text);
2860
2861 let page_type = analyzer.determine_page_type(0.7, 0.8);
2863 assert_eq!(page_type, PageType::Mixed);
2864 }
2865
2866 #[test]
2868 fn test_analysis_options_with_ocr_configuration() {
2869 let mut engine_options = std::collections::HashMap::new();
2870 engine_options.insert("tesseract_psm".to_string(), "3".to_string());
2871 engine_options.insert("custom_param".to_string(), "value".to_string());
2872
2873 let ocr_options = OcrOptions {
2874 language: "ja".to_string(),
2875 min_confidence: 0.9,
2876 preserve_layout: false,
2877 timeout_seconds: 60,
2878 engine_options,
2879 ..Default::default()
2880 };
2881
2882 let analysis_options = AnalysisOptions {
2883 min_text_fragment_size: 1,
2884 min_image_size: 10,
2885 scanned_threshold: 0.95,
2886 text_threshold: 0.5,
2887 ocr_options: Some(ocr_options),
2888 };
2889
2890 assert!(analysis_options.ocr_options.is_some());
2891 let ocr_opts = analysis_options.ocr_options.unwrap();
2892 assert_eq!(ocr_opts.language, "ja");
2893 assert_eq!(ocr_opts.timeout_seconds, 60);
2894 assert_eq!(ocr_opts.engine_options.len(), 2);
2895 }
2896
2897 #[test]
2899 fn test_content_ratios_sum_to_one() {
2900 let analysis = ContentAnalysis {
2901 page_number: 0,
2902 page_type: PageType::Mixed,
2903 text_ratio: 0.25,
2904 image_ratio: 0.45,
2905 blank_space_ratio: 0.30,
2906 text_fragment_count: 20,
2907 image_count: 3,
2908 character_count: 500,
2909 };
2910
2911 let total = analysis.text_ratio + analysis.image_ratio + analysis.blank_space_ratio;
2912 assert!((total - 1.0).abs() < 0.001);
2913 }
2914
2915 #[test]
2917 fn test_multiple_sequential_analyzers() {
2918 for i in 0..5 {
2920 let doc = create_mock_document();
2921 let analyzer = PageContentAnalyzer::new(doc);
2922 let result = analyzer.analyze_page(0);
2923 assert!(result.is_ok());
2924 tracing::debug!("Analyzer {i} completed analysis");
2925 }
2926 }
2927
2928 #[test]
2930 fn test_extract_page_image_data_no_xobjects() {
2931 let doc = create_mock_document();
2932 let analyzer = PageContentAnalyzer::new(doc);
2933
2934 let result = analyzer.extract_page_image_data(0);
2936 assert!(result.is_err());
2937 assert!(result
2938 .unwrap_err()
2939 .to_string()
2940 .contains("No image data found"));
2941 }
2942
2943 #[test]
2945 fn test_analyze_text_content_fragment_filtering() {
2946 let doc = create_mock_document();
2947 let custom_options = AnalysisOptions {
2948 min_text_fragment_size: 20, ..Default::default()
2950 };
2951 let analyzer = PageContentAnalyzer::with_options(doc, custom_options);
2952
2953 let result = analyzer.analyze_text_content(0);
2954 assert!(result.is_ok());
2955 }
2957
2958 #[test]
2960 fn test_ocr_automatic_configuration() {
2961 let doc = create_mock_document();
2962 let analyzer = PageContentAnalyzer::new(doc);
2963 let provider = MockOcrProvider::new();
2964
2965 let result = analyzer.process_scanned_pages_with_ocr(&provider);
2967 assert!(result.is_ok());
2968
2969 let scanned = analyzer.find_scanned_pages();
2971 assert!(scanned.is_ok());
2972 }
2973
2974 #[test]
2976 fn test_ocr_preprocessing_in_analysis() {
2977 let preprocessing = crate::text::ImagePreprocessing {
2978 denoise: false,
2979 deskew: false,
2980 enhance_contrast: true,
2981 sharpen: true,
2982 scale_factor: 1.5,
2983 };
2984
2985 let ocr_options = OcrOptions {
2986 preprocessing,
2987 ..Default::default()
2988 };
2989
2990 let analysis_options = AnalysisOptions {
2991 ocr_options: Some(ocr_options),
2992 ..Default::default()
2993 };
2994
2995 let doc = create_mock_document();
2996 let analyzer = PageContentAnalyzer::with_options(doc, analysis_options);
2997
2998 assert!(analyzer.options.ocr_options.is_some());
3000 }
3001
3002 #[test]
3004 fn test_batch_processing_timing() {
3005 let doc = create_mock_document();
3006 let analyzer = PageContentAnalyzer::new(doc);
3007 let provider = MockOcrProvider::new();
3008
3009 let start = std::time::Instant::now();
3010 let result = analyzer.process_scanned_pages_batch(&provider, 1);
3011 let _elapsed = start.elapsed();
3012
3013 assert!(result.is_ok());
3014 }
3017
3018 #[test]
3020 fn test_page_type_all_combinations() {
3021 let analyzer = PageContentAnalyzer::new(create_mock_document());
3022
3023 assert_eq!(analyzer.determine_page_type(0.05, 0.85), PageType::Scanned);
3025 assert_eq!(analyzer.determine_page_type(0.0, 0.95), PageType::Scanned);
3026
3027 assert_eq!(analyzer.determine_page_type(0.75, 0.15), PageType::Text);
3029 assert_eq!(analyzer.determine_page_type(0.85, 0.0), PageType::Text);
3030
3031 assert_eq!(analyzer.determine_page_type(0.4, 0.4), PageType::Mixed);
3033 assert_eq!(analyzer.determine_page_type(0.3, 0.3), PageType::Mixed);
3034
3035 assert_eq!(analyzer.determine_page_type(0.5, 0.5), PageType::Mixed);
3037 assert_eq!(analyzer.determine_page_type(0.15, 0.75), PageType::Mixed);
3038 }
3039
3040 #[test]
3042 fn test_multiple_analyzers_shared_results() {
3043 let mut all_results = Vec::new();
3044
3045 for i in 0..3 {
3047 let doc = create_mock_document();
3048 let analyzer = PageContentAnalyzer::new(doc);
3049
3050 if let Ok(analysis) = analyzer.analyze_page(0) {
3051 all_results.push((i, analysis.page_type));
3052 }
3053 }
3054
3055 assert_eq!(all_results.len(), 3);
3056
3057 for (i, page_type) in &all_results {
3059 tracing::debug!("Analyzer {i} detected page type: {page_type:?}");
3060 }
3061 }
3062
    // Batch OCR must keep going when the provider fails on some pages: a
    // provider that errors on every second call should not abort the run.
    #[test]
    fn test_batch_processing_error_recovery() {
        let doc = create_mock_document();
        let analyzer = PageContentAnalyzer::new(doc);

        // OCR provider simulating intermittent processing failures.
        struct IntermittentOcrProvider {
            // Call counter behind a Mutex so it can be updated through &self.
            fail_count: Mutex<usize>,
        }

        impl OcrProvider for IntermittentOcrProvider {
            fn process_image(
                &self,
                data: &[u8],
                opts: &OcrOptions,
            ) -> Result<OcrProcessingResult, OcrError> {
                let mut count = self.fail_count.lock().unwrap();
                *count += 1;

                // Every even-numbered call fails; odd calls defer to the mock.
                if *count % 2 == 0 {
                    Err(OcrError::ProcessingFailed(
                        "Intermittent failure".to_string(),
                    ))
                } else {
                    MockOcrProvider::new().process_image(data, opts)
                }
            }

            fn process_page(
                &self,
                _analysis: &ContentAnalysis,
                data: &[u8],
                opts: &OcrOptions,
            ) -> Result<OcrProcessingResult, OcrError> {
                // Page-level processing simply reuses the image path.
                self.process_image(data, opts)
            }

            fn supported_formats(&self) -> Vec<crate::graphics::ImageFormat> {
                MockOcrProvider::new().supported_formats()
            }

            fn engine_name(&self) -> &str {
                "Intermittent"
            }

            fn engine_type(&self) -> crate::text::OcrEngine {
                crate::text::OcrEngine::Mock
            }
        }

        let provider = IntermittentOcrProvider {
            fail_count: Mutex::new(0),
        };

        // Despite the intermittent per-page failures, the batch as a whole
        // must still report success.
        let result = analyzer.process_scanned_pages_batch(&provider, 2);
        assert!(result.is_ok());
    }
3123
3124 #[test]
3126 fn test_memory_stress_multiple_analyses() {
3127 let doc = create_mock_document();
3128 let analyzer = PageContentAnalyzer::new(doc);
3129
3130 for _ in 0..100 {
3132 let result = analyzer.analyze_page(0);
3133 assert!(result.is_ok());
3134 }
3135
3136 for _ in 0..10 {
3138 let result = analyzer.analyze_document();
3139 assert!(result.is_ok());
3140 }
3141 }
3142
3143 #[test]
3145 fn test_ocr_language_fallback() {
3146 let ocr_options = OcrOptions {
3147 language: "unknown_lang".to_string(),
3148 ..Default::default()
3149 };
3150
3151 let analysis_options = AnalysisOptions {
3152 ocr_options: Some(ocr_options),
3153 ..Default::default()
3154 };
3155
3156 let doc = create_mock_document();
3157 let analyzer = PageContentAnalyzer::with_options(doc, analysis_options);
3158 let provider = MockOcrProvider::new();
3159
3160 let result = analyzer.process_scanned_pages_with_ocr(&provider);
3162 assert!(result.is_ok());
3163 }
3164
3165 #[test]
3167 fn test_ocr_timeout_simulation() {
3168 let mut provider = MockOcrProvider::new();
3169 provider.set_processing_delay(100); let ocr_options = OcrOptions {
3172 timeout_seconds: 1, ..Default::default()
3174 };
3175
3176 let analysis_options = AnalysisOptions {
3177 ocr_options: Some(ocr_options),
3178 ..Default::default()
3179 };
3180
3181 let doc = create_mock_document();
3182 let analyzer = PageContentAnalyzer::with_options(doc, analysis_options);
3183
3184 let result = analyzer.process_scanned_pages_with_ocr(&provider);
3186 assert!(result.is_ok());
3187 }
3188
3189 #[test]
3191 fn test_zero_sized_image_filtering() {
3192 let doc = create_mock_document();
3193 let analyzer = PageContentAnalyzer::new(doc);
3194
3195 let result = analyzer.analyze_image_content(0);
3197 assert!(result.is_ok());
3198 let image_analysis = result.unwrap();
3199 assert_eq!(image_analysis.image_count, 0);
3200 assert_eq!(image_analysis.total_area, 0.0);
3201 }
3202
3203 #[test]
3205 fn test_page_numbers_boundary() {
3206 let doc = create_mock_document();
3207 let analyzer = PageContentAnalyzer::new(doc);
3208
3209 let page_numbers = vec![0, usize::MAX];
3211 let result = analyzer.analyze_pages(&page_numbers);
3212 if result.is_ok() {
3215 let analyses = result.unwrap();
3216 assert!(analyses.len() >= 1);
3218 assert_eq!(analyses[0].page_number, 0);
3219 } else {
3220 assert!(result.unwrap_err().to_string().contains("Page"));
3222 }
3223 }
3224
3225 #[test]
3227 fn test_ocr_confidence_boundaries() {
3228 let mut provider = MockOcrProvider::new();
3229
3230 let jpeg_data = [
3232 0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01,
3233 ];
3234
3235 provider.set_confidence(0.0);
3237 let result = provider.process_image(&jpeg_data, &OcrOptions::default());
3238 assert!(result.is_ok());
3239
3240 provider.set_confidence(1.0);
3242 let result = provider.process_image(&jpeg_data, &OcrOptions::default());
3243 assert!(result.is_ok());
3244
3245 let options = OcrOptions {
3247 min_confidence: 0.9,
3248 ..Default::default()
3249 };
3250 provider.set_confidence(0.5);
3251 let result = provider.process_image(&jpeg_data, &options);
3252 assert!(result.is_ok());
3254 }
3255
3256 #[test]
3258 fn test_ocr_processing_configurations() {
3259 let doc = create_mock_document();
3260 let analyzer = PageContentAnalyzer::new(doc);
3261 let provider = MockOcrProvider::new();
3262
3263 let result = analyzer.process_scanned_pages_with_ocr(&provider);
3265 assert!(result.is_ok());
3266
3267 for batch_size in [1, 3, 5, 10] {
3269 let result = analyzer.process_scanned_pages_batch(&provider, batch_size);
3270 assert!(result.is_ok());
3271 }
3272 }
3273
3274 #[test]
3276 fn test_custom_min_image_size() {
3277 let doc = create_mock_document();
3278 let custom_options = AnalysisOptions {
3279 min_image_size: 1000, ..Default::default()
3281 };
3282 let analyzer = PageContentAnalyzer::with_options(doc, custom_options);
3283
3284 let result = analyzer.analyze_image_content(0);
3285 assert!(result.is_ok());
3286 }
3288
3289 #[test]
3291 fn test_comprehensive_page_analysis() {
3292 let doc = create_mock_document();
3293 let analyzer = PageContentAnalyzer::new(doc);
3294
3295 let analysis = analyzer.analyze_page(0);
3296 assert!(analysis.is_ok());
3297
3298 let analysis = analysis.unwrap();
3299
3300 assert!(analysis.page_number == 0);
3302 assert!(analysis.text_ratio >= 0.0 && analysis.text_ratio <= 1.0);
3303 assert!(analysis.image_ratio >= 0.0 && analysis.image_ratio <= 1.0);
3304 assert!(analysis.blank_space_ratio >= 0.0 && analysis.blank_space_ratio <= 1.0);
3305
3306 let total = analysis.text_ratio + analysis.image_ratio + analysis.blank_space_ratio;
3308 assert!((total - 1.0).abs() < 0.01);
3309 }
3310
3311 #[test]
3313 fn test_error_message_formatting() {
3314 let doc = create_mock_document();
3315 let analyzer = PageContentAnalyzer::new(doc);
3316 let provider = MockOcrProvider::new();
3317
3318 let result = analyzer.extract_text_from_scanned_page(0, &provider);
3320 assert!(result.is_err());
3321 let error_msg = result.unwrap_err().to_string();
3322 assert!(error_msg.contains("not a scanned page"));
3323 assert!(error_msg.contains("image ratio"));
3324 assert!(error_msg.contains("text ratio"));
3325 }
3326
3327 #[test]
3329 fn test_batch_size_edge_cases() {
3330 let doc = create_mock_document();
3331 let analyzer = PageContentAnalyzer::new(doc);
3332 let provider = MockOcrProvider::new();
3333
3334 let result = analyzer.process_scanned_pages_batch(&provider, 0);
3336 assert!(result.is_ok());
3337
3338 let result = analyzer.process_scanned_pages_batch(&provider, usize::MAX);
3340 assert!(result.is_ok());
3341 }
3342
    // OCR processing must survive a transient provider error: a provider
    // whose very first call fails (and then recovers) should not sink either
    // the sequential or the batched entry point.
    #[test]
    fn test_ocr_provider_robustness() {
        // Provider that fails exactly once, on its first process_image call.
        struct UnreliableOcrProvider {
            // Call counter behind a Mutex so it can be updated through &self.
            call_count: Mutex<usize>,
        }

        impl UnreliableOcrProvider {
            fn new() -> Self {
                UnreliableOcrProvider {
                    call_count: Mutex::new(0),
                }
            }
        }

        impl Clone for UnreliableOcrProvider {
            fn clone(&self) -> Self {
                // Mutex is not Clone, so a clone starts with a fresh counter.
                UnreliableOcrProvider {
                    call_count: Mutex::new(0),
                }
            }
        }

        impl OcrProvider for UnreliableOcrProvider {
            fn process_image(
                &self,
                _: &[u8],
                _: &OcrOptions,
            ) -> Result<OcrProcessingResult, OcrError> {
                let mut count = self.call_count.lock().unwrap();
                *count += 1;

                // Only the first call fails; subsequent calls delegate to the
                // mock provider, fed a minimal JPEG marker.
                if *count == 1 {
                    Err(OcrError::ProcessingFailed("Temporary failure".to_string()))
                } else {
                    MockOcrProvider::new().process_image(&[0xFF, 0xD8], &OcrOptions::default())
                }
            }

            fn process_page(
                &self,
                _: &ContentAnalysis,
                data: &[u8],
                opts: &OcrOptions,
            ) -> Result<OcrProcessingResult, OcrError> {
                // Page-level processing simply reuses the image path.
                self.process_image(data, opts)
            }

            fn supported_formats(&self) -> Vec<crate::graphics::ImageFormat> {
                MockOcrProvider::new().supported_formats()
            }

            fn engine_name(&self) -> &str {
                "Unreliable"
            }

            fn engine_type(&self) -> crate::text::OcrEngine {
                crate::text::OcrEngine::Mock
            }
        }

        let doc = create_mock_document();
        let analyzer = PageContentAnalyzer::new(doc);
        let provider = UnreliableOcrProvider::new();

        // Sequential OCR must succeed despite the initial failure.
        let result = analyzer.process_scanned_pages_with_ocr(&provider);
        assert!(result.is_ok());

        // Batched OCR must succeed as well.
        let result = analyzer.process_scanned_pages_batch(&provider, 2);
        assert!(result.is_ok());
    }
3418
3419 #[test]
3421 fn test_analysis_options_validation() {
3422 let options = AnalysisOptions {
3424 min_text_fragment_size: 0,
3425 min_image_size: 0,
3426 scanned_threshold: 1.5, text_threshold: -0.5, ocr_options: None,
3429 };
3430
3431 let doc = create_mock_document();
3432 let analyzer = PageContentAnalyzer::with_options(doc, options);
3433
3434 let result = analyzer.analyze_page(0);
3436 assert!(result.is_ok());
3437 }
3438
3439 #[test]
3441 fn test_ocr_result_aggregation() {
3442 let doc = create_mock_document();
3443 let analyzer = PageContentAnalyzer::new(doc);
3444 let mut provider = MockOcrProvider::new();
3445
3446 provider.set_mock_text("Page content from OCR".to_string());
3448 provider.set_confidence(0.85);
3449
3450 let results = analyzer.process_scanned_pages_with_ocr(&provider);
3451 assert!(results.is_ok());
3452
3453 let ocr_results = results.unwrap();
3454
3455 let total_chars: usize = ocr_results
3457 .iter()
3458 .map(|(_, result)| result.text.len())
3459 .sum();
3460 let avg_confidence: f64 = if !ocr_results.is_empty() {
3461 ocr_results
3462 .iter()
3463 .map(|(_, result)| result.confidence)
3464 .sum::<f64>()
3465 / ocr_results.len() as f64
3466 } else {
3467 0.0
3468 };
3469
3470 assert!(total_chars == total_chars); assert!((0.0..=1.0).contains(&avg_confidence));
3473 }
3474
3475 #[test]
3477 fn test_resource_cleanup() {
3478 for _ in 0..10 {
3480 let doc = create_mock_document();
3481 let analyzer = PageContentAnalyzer::new(doc);
3482 let _result = analyzer.analyze_document();
3483 }
3485
3486 }
3489
3490 #[test]
3492 fn test_complete_analysis_workflow() {
3493 let doc = create_mock_document();
3495 let analyzer = PageContentAnalyzer::new(doc);
3496
3497 let analyses = analyzer.analyze_document().unwrap();
3499 assert!(!analyses.is_empty());
3500
3501 let _scanned_pages = analyzer.find_scanned_pages().unwrap();
3503
3504 let _is_scanned = analyzer.is_scanned_page(0).unwrap();
3506
3507 let provider = MockOcrProvider::new();
3509 let ocr_results = analyzer.process_scanned_pages_with_ocr(&provider).unwrap();
3510
3511 let sequential_results = analyzer.process_scanned_pages_with_ocr(&provider).unwrap();
3513
3514 let batch_results = analyzer.process_scanned_pages_batch(&provider, 5).unwrap();
3516
3517 assert_eq!(ocr_results.len(), sequential_results.len());
3519 assert_eq!(ocr_results.len(), batch_results.len());
3520
3521 tracing::debug!(
3522 "Complete workflow test passed with {} pages analyzed",
3523 analyses.len()
3524 );
3525 }
3526}