use super::{OperationError, OperationResult};
use crate::parser::{PdfDocument, PdfReader};
use crate::text::{ExtractionOptions, OcrOptions, OcrProcessingResult, OcrProvider, TextExtractor};
use std::fs::File;
use std::path::Path;

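/// Coarse classification of a page's content, derived from how much of the
/// page area is covered by text versus images.
///
/// A short sketch of the helper predicates (doc-test ignored because the
/// import path depends on how this module is re-exported):
///
/// ```ignore
/// assert!(PageType::Scanned.is_scanned());
/// assert!(PageType::Text.is_text());
/// assert!(PageType::Mixed.is_mixed());
/// ```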
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PageType {
    Scanned,
    Text,
    Mixed,
}

impl PageType {
    pub fn is_scanned(&self) -> bool {
        matches!(self, PageType::Scanned)
    }

    pub fn is_text(&self) -> bool {
        matches!(self, PageType::Text)
    }

    pub fn is_mixed(&self) -> bool {
        matches!(self, PageType::Mixed)
    }
}

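/// Result of analyzing a single page. The `*_ratio` fields are fractions of
/// the total page area (0.0..=1.0): `text_ratio` and `image_ratio` are the
/// areas covered by text fragments and images, and `blank_space_ratio` is the
/// remainder, clamped to be non-negative.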
#[derive(Debug, Clone)]
pub struct ContentAnalysis {
    pub page_number: usize,
    pub page_type: PageType,
    pub text_ratio: f64,
    pub image_ratio: f64,
    pub blank_space_ratio: f64,
    pub text_fragment_count: usize,
    pub image_count: usize,
    pub character_count: usize,
}

impl ContentAnalysis {
    pub fn is_scanned(&self) -> bool {
        self.page_type.is_scanned()
    }

    pub fn is_text_heavy(&self) -> bool {
        self.page_type.is_text()
    }

    pub fn is_mixed_content(&self) -> bool {
        self.page_type.is_mixed()
    }

    pub fn dominant_content_ratio(&self) -> f64 {
        self.text_ratio.max(self.image_ratio)
    }
}

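/// Tuning knobs for page analysis. `scanned_threshold` and `text_threshold`
/// are compared against the image and text area ratios when classifying a
/// page (see `determine_page_type`), while the `min_*` fields filter out tiny
/// text fragments and images before areas are accumulated.
///
/// Sketch of overriding a single field (doc-test ignored; the import path is
/// assumed):
///
/// ```ignore
/// let options = AnalysisOptions {
///     scanned_threshold: 0.9,
///     ..Default::default()
/// };
/// ```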
#[derive(Debug, Clone)]
pub struct AnalysisOptions {
    pub min_text_fragment_size: usize,
    pub min_image_size: u32,
    pub scanned_threshold: f64,
    pub text_threshold: f64,
    pub ocr_options: Option<OcrOptions>,
}

impl Default for AnalysisOptions {
    fn default() -> Self {
        Self {
            min_text_fragment_size: 3,
            min_image_size: 50,
            scanned_threshold: 0.8,
            text_threshold: 0.7,
            ocr_options: None,
        }
    }
}

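/// Analyzes the content of individual PDF pages to distinguish scanned
/// (image-based) pages from text-based and mixed pages, and can hand scanned
/// pages to an [`OcrProvider`] for text recovery.
///
/// Minimal usage sketch (doc-test ignored; `scan.pdf` is a placeholder path):
///
/// ```ignore
/// let analyzer = PageContentAnalyzer::from_file("scan.pdf")?;
/// for page in analyzer.find_scanned_pages()? {
///     println!("page {page} looks like a scan");
/// }
/// ```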
pub struct PageContentAnalyzer {
    document: PdfDocument<File>,
    options: AnalysisOptions,
}

impl PageContentAnalyzer {
    pub fn new(document: PdfDocument<File>) -> Self {
        Self {
            document,
            options: AnalysisOptions::default(),
        }
    }

    pub fn with_options(document: PdfDocument<File>, options: AnalysisOptions) -> Self {
        Self { document, options }
    }

    pub fn from_file<P: AsRef<Path>>(path: P) -> OperationResult<Self> {
        let document = PdfReader::open_document(path)
            .map_err(|e| OperationError::ParseError(e.to_string()))?;
        Ok(Self::new(document))
    }

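    /// Analyzes a single page (0-based index) and classifies it.
    ///
    /// The text and image areas are each divided by the page area to obtain
    /// `text_ratio` and `image_ratio`; whatever is left over becomes
    /// `blank_space_ratio` (clamped at 0.0). Usage sketch (doc-test ignored):
    ///
    /// ```ignore
    /// let analysis = analyzer.analyze_page(0)?;
    /// if analysis.is_scanned() {
    ///     // image_ratio above `scanned_threshold` and almost no text
    /// }
    /// ```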
    pub fn analyze_page(&self, page_number: usize) -> OperationResult<ContentAnalysis> {
        let page = self
            .document
            .get_page(page_number as u32)
            .map_err(|e| OperationError::ParseError(e.to_string()))?;

        let page_area = self.calculate_page_area(&page)?;

        let text_analysis = self.analyze_text_content(page_number)?;
        let text_area = text_analysis.total_area;
        let text_fragment_count = text_analysis.fragment_count;
        let character_count = text_analysis.character_count;

        let image_analysis = self.analyze_image_content(page_number)?;
        let image_area = image_analysis.total_area;
        let image_count = image_analysis.image_count;

        let text_ratio = if page_area > 0.0 {
            text_area / page_area
        } else {
            0.0
        };
        let image_ratio = if page_area > 0.0 {
            image_area / page_area
        } else {
            0.0
        };
        let blank_space_ratio = 1.0 - text_ratio - image_ratio;

        let page_type = self.determine_page_type(text_ratio, image_ratio);

        Ok(ContentAnalysis {
            page_number,
            page_type,
            text_ratio,
            image_ratio,
            blank_space_ratio: blank_space_ratio.max(0.0),
            text_fragment_count,
            image_count,
            character_count,
        })
    }

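    /// Analyzes every page in the document in order. Page indices in the
    /// returned `ContentAnalysis` values are 0-based, matching
    /// [`Self::analyze_page`].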
    pub fn analyze_document(&self) -> OperationResult<Vec<ContentAnalysis>> {
        let page_count = self
            .document
            .page_count()
            .map_err(|e| OperationError::ParseError(e.to_string()))?;

        let mut analyses = Vec::new();
        for page_idx in 0..page_count {
            let analysis = self.analyze_page(page_idx as usize)?;
            analyses.push(analysis);
        }

        Ok(analyses)
    }

    pub fn analyze_pages(&self, page_numbers: &[usize]) -> OperationResult<Vec<ContentAnalysis>> {
        let mut analyses = Vec::new();
        for &page_number in page_numbers {
            let analysis = self.analyze_page(page_number)?;
            analyses.push(analysis);
        }
        Ok(analyses)
    }

    pub fn is_scanned_page(&self, page_number: usize) -> OperationResult<bool> {
        let analysis = self.analyze_page(page_number)?;
        Ok(analysis.is_scanned())
    }

    pub fn find_scanned_pages(&self) -> OperationResult<Vec<usize>> {
        let analyses = self.analyze_document()?;
        Ok(analyses
            .into_iter()
            .filter(|analysis| analysis.is_scanned())
            .map(|analysis| analysis.page_number)
            .collect())
    }

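    /// Runs OCR on a single page that has been classified as scanned and
    /// returns the recovered text. Returns an error if the page is not
    /// scanned, so callers can gate on [`Self::is_scanned_page`] first.
    ///
    /// Sketch using the crate's mock provider (doc-test ignored):
    ///
    /// ```ignore
    /// use crate::text::MockOcrProvider;
    ///
    /// let provider = MockOcrProvider::new();
    /// if analyzer.is_scanned_page(0)? {
    ///     let ocr = analyzer.extract_text_from_scanned_page(0, &provider)?;
    ///     println!("{}", ocr.text);
    /// }
    /// ```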
    pub fn extract_text_from_scanned_page<P: OcrProvider>(
        &self,
        page_number: usize,
        ocr_provider: &P,
    ) -> OperationResult<OcrProcessingResult> {
        let analysis = self.analyze_page(page_number)?;
        if !analysis.is_scanned() {
            return Err(OperationError::ParseError(format!(
                "Page {} is not a scanned page (image ratio: {:.2}%, text ratio: {:.2}%)",
                page_number,
                analysis.image_ratio * 100.0,
                analysis.text_ratio * 100.0
            )));
        }

        let ocr_options = self.options.ocr_options.clone().unwrap_or_default();

        let page_image_data = self.extract_page_image_data(page_number)?;

        let ocr_result = ocr_provider
            .process_page(&analysis, &page_image_data, &ocr_options)
            .map_err(|e| OperationError::ParseError(format!("OCR processing failed: {e}")))?;

        Ok(ocr_result)
    }

    pub fn process_scanned_pages_with_ocr<P: OcrProvider>(
        &self,
        ocr_provider: &P,
    ) -> OperationResult<Vec<(usize, OcrProcessingResult)>> {
        let scanned_pages = self.find_scanned_pages()?;
        let mut results = Vec::new();

        for page_number in scanned_pages {
            match self.extract_text_from_scanned_page(page_number, ocr_provider) {
                Ok(ocr_result) => {
                    results.push((page_number, ocr_result));
                }
                Err(e) => {
                    tracing::error!("Failed to process page {page_number}: {e}");
                    continue;
                }
            }
        }

        Ok(results)
    }

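    /// Processes all scanned pages using a simple thread pool. Pages are split
    /// into `thread_count` chunks, where `thread_count` defaults to
    /// `min(scanned_pages.len(), available_parallelism())`; with one thread it
    /// falls back to [`Self::process_scanned_pages_with_ocr`]. Note that the
    /// per-page work inside the spawned threads currently goes through
    /// `simulate_page_ocr_processing`, which feeds a mock image to the
    /// provider rather than extracting the page's real image data.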
    pub fn process_scanned_pages_parallel<P: OcrProvider + Clone + Send + Sync + 'static>(
        &self,
        ocr_provider: &P,
        max_threads: Option<usize>,
    ) -> OperationResult<Vec<(usize, OcrProcessingResult)>> {
        use std::sync::{Arc, Mutex};
        use std::thread;

        let scanned_pages = self.find_scanned_pages()?;
        if scanned_pages.is_empty() {
            return Ok(Vec::new());
        }

        let thread_count = max_threads.unwrap_or_else(|| {
            std::cmp::min(
                scanned_pages.len(),
                std::thread::available_parallelism()
                    .map(|p| p.get())
                    .unwrap_or(4),
            )
        });

        if thread_count <= 1 {
            return self.process_scanned_pages_with_ocr(ocr_provider);
        }

        let results = Arc::new(Mutex::new(Vec::new()));
        let provider = Arc::new(ocr_provider.clone());

        let chunk_size = scanned_pages.len().div_ceil(thread_count);
        let mut handles = Vec::new();

        for chunk in scanned_pages.chunks(chunk_size) {
            let chunk_pages = chunk.to_vec();
            let results_clone = Arc::clone(&results);
            let provider_clone = Arc::clone(&provider);

            let handle = thread::spawn(move || {
                let mut thread_results = Vec::new();

                for page_num in chunk_pages {
                    match simulate_page_ocr_processing(page_num, &*provider_clone) {
                        Ok(ocr_result) => {
                            thread_results.push((page_num, ocr_result));
                        }
                        Err(e) => {
                            tracing::error!("OCR failed for page {page_num}: {e}");
                        }
                    }
                }

                if let Ok(mut shared_results) = results_clone.lock() {
                    shared_results.extend(thread_results);
                }
            });

            handles.push(handle);
        }

        for handle in handles {
            if let Err(e) = handle.join() {
                tracing::error!("Thread panicked: {e:?}");
            }
        }

        let final_results = results
            .lock()
            .map_err(|e| OperationError::ProcessingError(format!("Failed to get results: {e}")))?
            .clone();

        Ok(final_results)
    }

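    /// Processes scanned pages sequentially in batches of `batch_size`,
    /// pausing briefly (100 ms) between batches. A `batch_size` of 0 returns
    /// an empty result set immediately.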
    pub fn process_scanned_pages_batch<P: OcrProvider>(
        &self,
        ocr_provider: &P,
        batch_size: usize,
    ) -> OperationResult<Vec<(usize, OcrProcessingResult)>> {
        let scanned_pages = self.find_scanned_pages()?;
        let mut results = Vec::new();

        if batch_size == 0 {
            return Ok(results);
        }

        for batch in scanned_pages.chunks(batch_size) {
            tracing::info!("Processing batch of {} pages", batch.len());

            for &page_num in batch {
                match self.extract_text_from_scanned_page(page_num, ocr_provider) {
                    Ok(ocr_result) => {
                        results.push((page_num, ocr_result));
                    }
                    Err(e) => {
                        tracing::error!("OCR failed for page {page_num}: {e}");
                    }
                }
            }

            std::thread::sleep(std::time::Duration::from_millis(100));
        }

        Ok(results)
    }

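    /// Extracts the raw image data that backs a scanned page, trying three
    /// strategies in order: (1) image XObjects referenced from the page's
    /// resources, (2) XObject names found via `Do` operators in the page's
    /// content streams, and (3) a document-wide fallback scan for large image
    /// objects. Returns an error if none of them yields image data.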
694 pub fn extract_page_image_data(&self, page_number: usize) -> OperationResult<Vec<u8>> {
699 tracing::debug!(
700 "🔍 [DEBUG] extract_page_image_data called for page {}",
701 page_number
702 );
703
704 let page = self
705 .document
706 .get_page(page_number as u32)
707 .map_err(|e| OperationError::ParseError(e.to_string()))?;
708
709 tracing::debug!("🔍 [DEBUG] Trying Method 1: Check page resources for XObjects");
711 let resources = self
712 .document
713 .get_page_resources(&page)
714 .map_err(|e| OperationError::ParseError(e.to_string()))?;
715
716 let mut resolved_resources_dict: Option<crate::parser::objects::PdfDictionary> = None;
718
719 if let Some(_resources) = &resources {
720 tracing::debug!(
722 "🔍 [DEBUG] Page {} has resources via standard method",
723 page_number
724 );
725 } else {
726 tracing::debug!(
728 "🔍 [DEBUG] Page {} resources None, trying direct resolution",
729 page_number
730 );
731 if let Some(resources_ref) = page.dict.get("Resources") {
732 tracing::debug!(
733 "🔍 [DEBUG] Page {} has Resources entry, resolving reference",
734 page_number
735 );
736 match self.document.resolve(resources_ref) {
737 Ok(resolved_obj) => {
738 if let Some(resolved_dict) = resolved_obj.as_dict() {
739 tracing::debug!("🔍 [DEBUG] Page {} resolved Resources to dictionary with {} entries",
740 page_number, resolved_dict.0.len());
741 resolved_resources_dict = Some(resolved_dict.clone());
742 } else {
743 tracing::debug!(
744 "🔍 [DEBUG] Page {} Resources resolved but not a dictionary",
745 page_number
746 );
747 }
748 }
749 Err(e) => {
750 tracing::debug!(
751 "🔍 [DEBUG] Page {} failed to resolve Resources: {}",
752 page_number,
753 e
754 );
755 }
756 }
757 } else {
758 tracing::debug!(
759 "🔍 [DEBUG] Page {} has no Resources entry in dict",
760 page_number
761 );
762 }
763 }
764
765 let active_resources = resources.or(resolved_resources_dict.as_ref());
767
768 if let Some(resources) = &active_resources {
769 tracing::debug!("🔍 [DEBUG] Page {} has resources", page_number);
770 if let Some(crate::parser::objects::PdfObject::Dictionary(xobjects)) = resources
771 .0
772 .get(&crate::parser::objects::PdfName("XObject".to_string()))
773 {
774 tracing::debug!(
775 "🔍 [DEBUG] Page {} has XObject dictionary with {} entries",
776 page_number,
777 xobjects.0.len()
778 );
779 for (xobject_name, obj_ref) in xobjects.0.iter() {
781 if let crate::parser::objects::PdfObject::Reference(obj_num, gen_num) = obj_ref
782 {
783 if let Ok(crate::parser::objects::PdfObject::Stream(stream)) =
784 self.document.get_object(*obj_num, *gen_num)
785 {
786 if let Some(crate::parser::objects::PdfObject::Name(subtype)) = stream
788 .dict
789 .0
790 .get(&crate::parser::objects::PdfName("Subtype".to_string()))
791 {
792 if subtype.0 == "Image" {
793 let width = stream
794 .dict
795 .0
796 .get(&crate::parser::objects::PdfName("Width".to_string()))
797 .and_then(|w| {
798 if let crate::parser::objects::PdfObject::Integer(w) = w
799 {
800 Some(*w)
801 } else {
802 None
803 }
804 })
805 .unwrap_or(0);
806
807 let height = stream
808 .dict
809 .0
810 .get(&crate::parser::objects::PdfName("Height".to_string()))
811 .and_then(|h| {
812 if let crate::parser::objects::PdfObject::Integer(h) = h
813 {
814 Some(*h)
815 } else {
816 None
817 }
818 })
819 .unwrap_or(0);
820
821 tracing::debug!(
822 "🔍 [DEBUG] Page {} Method1 XObject {} -> Object {} ({}x{})",
823 page_number, xobject_name.0, obj_num, width, height
824 );
825 return self.extract_image_stream_for_ocr(&stream);
827 }
828 }
829 }
830 }
831 }
832 } else {
833 tracing::debug!("🔍 [DEBUG] Page {} has no XObject dictionary", page_number);
834 }
835 } else {
836 tracing::debug!("🔍 [DEBUG] Page {} has no resources", page_number);
837 }
838
839 tracing::debug!("🔍 [DEBUG] Trying Method 2: Parse content streams for Do operators");
841 if let Ok(content_streams) = self.document.get_page_content_streams(&page) {
842 tracing::debug!(
843 "🔍 [DEBUG] Page {} has {} content streams",
844 page_number,
845 content_streams.len()
846 );
847 for (i, content_stream) in content_streams.iter().enumerate() {
848 let content_str = String::from_utf8_lossy(content_stream);
849 tracing::debug!(
850 "🔍 [DEBUG] Content stream {} has {} bytes",
851 i,
852 content_stream.len()
853 );
854
855 for line in content_str.lines() {
858 if line.trim().ends_with(" Do") {
859 let parts: Vec<&str> = line.split_whitespace().collect();
861 if parts.len() >= 2 && parts[parts.len() - 1] == "Do" {
862 let xobject_name = parts[parts.len() - 2];
863 tracing::debug!(
864 "🔍 [DEBUG] Found Do operator with XObject: {}",
865 xobject_name
866 );
867 if let Some(name) = xobject_name.strip_prefix('/') {
868 tracing::debug!("🔍 [DEBUG] Looking for XObject: {}", name);
870
871 if let Ok(image_data) =
873 self.find_specific_xobject_image_from_page(name, &page)
874 {
875 return Ok(image_data);
876 } else {
877 tracing::debug!("🔍 [DEBUG] Page-specific XObject lookup failed for: {}, trying document-wide search", name);
878 if let Ok(image_data) = self.find_specific_xobject_image(name) {
880 return Ok(image_data);
881 } else {
882 tracing::debug!("🔍 [DEBUG] Document-wide XObject lookup also failed for: {}", name);
883 }
884 }
885 }
886 }
887 }
888 }
889
                // Inline images (BI ... EI operators) are detected here but not
                // extracted; pages that only contain inline images fall through to
                // the document-wide fallback scan below.
                if content_str.contains("BI") && content_str.contains("EI") {}
            }
        }
897
898 tracing::debug!("🔍 [DEBUG] Trying Method 3: Fallback scan for large images");
900 match self.find_image_xobjects_in_document() {
901 Ok(image_data) if !image_data.is_empty() => {
902 return Ok(image_data);
903 }
904 _ => {}
905 }
906
907 Err(OperationError::ParseError(
908 "No image data found on scanned page (checked XObjects and inline images)".to_string(),
909 ))
910 }
911
912 fn find_specific_xobject_image_from_page(
914 &self,
915 xobject_name: &str,
916 page: &crate::parser::page_tree::ParsedPage,
917 ) -> OperationResult<Vec<u8>> {
918 let resources = self
920 .document
921 .get_page_resources(page)
922 .map_err(|e| OperationError::ParseError(e.to_string()))?;
923
924 if let Some(resources) = resources {
926 if let Some(crate::parser::objects::PdfObject::Dictionary(xobjects)) = resources
927 .0
928 .get(&crate::parser::objects::PdfName("XObject".to_string()))
929 {
930 #[allow(clippy::collapsible_match)]
931 if let Some(xobject_ref) = xobjects
932 .0
933 .get(&crate::parser::objects::PdfName(xobject_name.to_string()))
934 {
935 if let crate::parser::objects::PdfObject::Reference(obj_num, gen_num) =
936 xobject_ref
937 {
938 if let Ok(crate::parser::objects::PdfObject::Stream(stream)) =
939 self.document.get_object(*obj_num, *gen_num)
940 {
941 if let Some(crate::parser::objects::PdfObject::Name(subtype)) = stream
942 .dict
943 .0
944 .get(&crate::parser::objects::PdfName("Subtype".to_string()))
945 {
946 if subtype.0 == "Image" {
947 let width = stream
948 .dict
949 .0
950 .get(&crate::parser::objects::PdfName("Width".to_string()))
951 .and_then(|w| {
952 if let crate::parser::objects::PdfObject::Integer(w) = w
953 {
954 Some(*w)
955 } else {
956 None
957 }
958 })
959 .unwrap_or(0);
960 let height = stream
961 .dict
962 .0
963 .get(&crate::parser::objects::PdfName("Height".to_string()))
964 .and_then(|h| {
965 if let crate::parser::objects::PdfObject::Integer(h) = h
966 {
967 Some(*h)
968 } else {
969 None
970 }
971 })
972 .unwrap_or(0);
973 tracing::debug!(
974 "🔍 [DEBUG] Page-specific XObject {} -> Object {} ({}x{})",
975 xobject_name,
976 obj_num,
977 width,
978 height
979 );
980 return self.extract_image_stream_for_ocr(&stream);
981 }
982 }
983 }
984 }
985 }
986 }
987 }
988
989 if let Some(crate::parser::objects::PdfObject::Reference(res_obj, res_gen)) = page
991 .dict
992 .0
993 .get(&crate::parser::objects::PdfName("Resources".to_string()))
994 {
995 match self.document.get_object(*res_obj, *res_gen) {
996 Ok(crate::parser::objects::PdfObject::Dictionary(resolved_dict)) => {
997 tracing::debug!(
998 "🔍 [DEBUG] Page-specific fallback: resolved Resources {} {} R",
999 res_obj,
1000 res_gen
1001 );
1002 if let Some(crate::parser::objects::PdfObject::Dictionary(xobjects)) =
1003 resolved_dict
1004 .0
1005 .get(&crate::parser::objects::PdfName("XObject".to_string()))
1006 {
1007 tracing::debug!("🔍 [DEBUG] Page-specific fallback found XObject dictionary with {} entries", xobjects.0.len());
1008 for (name, obj) in &xobjects.0 {
1009 tracing::debug!(
1010 "🔍 [DEBUG] Page-specific fallback XObject: {} -> {:?}",
1011 name.0,
1012 obj
1013 );
1014 }
1015 if let Some(xobject_ref) = xobjects
1016 .0
1017 .get(&crate::parser::objects::PdfName(xobject_name.to_string()))
1018 {
1019 if let crate::parser::objects::PdfObject::Reference(obj_num, gen_num) =
1020 xobject_ref
1021 {
1022 tracing::debug!("🔍 [DEBUG] Page-specific fallback: trying to get object {} {} R", obj_num, gen_num);
1023 match self.document.get_object(*obj_num, *gen_num) {
1024 Ok(crate::parser::objects::PdfObject::Stream(stream)) => {
1025 tracing::debug!(
1026 "🔍 [DEBUG] Page-specific fallback: got stream object"
1027 );
1028 match stream.dict.0.get(&crate::parser::objects::PdfName(
1029 "Subtype".to_string(),
1030 )) {
1031 Some(crate::parser::objects::PdfObject::Name(
1032 subtype,
1033 )) => {
1034 tracing::debug!("🔍 [DEBUG] Page-specific fallback: stream subtype = {}", subtype.0);
1035 if subtype.0 == "Image" {
1036 let width = stream
1037 .dict
1038 .0
1039 .get(&crate::parser::objects::PdfName("Width".to_string()))
1040 .and_then(|w| {
1041 if let crate::parser::objects::PdfObject::Integer(w) = w
1042 {
1043 Some(*w)
1044 } else {
1045 None
1046 }
1047 })
1048 .unwrap_or(0);
1049 let height = stream
1050 .dict
1051 .0
1052 .get(&crate::parser::objects::PdfName("Height".to_string()))
1053 .and_then(|h| {
1054 if let crate::parser::objects::PdfObject::Integer(h) = h
1055 {
1056 Some(*h)
1057 } else {
1058 None
1059 }
1060 })
1061 .unwrap_or(0);
1062 tracing::debug!(
1063 "🔍 [DEBUG] Page-specific fallback XObject {} -> Object {} ({}x{})",
1064 xobject_name, obj_num, width, height
1065 );
1066 return self
1067 .extract_image_stream_for_ocr(&stream);
1068 } else {
1069 tracing::debug!("🔍 [DEBUG] Page-specific fallback: stream is not an image (subtype: {})", subtype.0);
1070 }
1071 }
1072 None => {
1073 tracing::debug!("🔍 [DEBUG] Page-specific fallback: stream has no Subtype");
1074 }
1075 _ => {
1076 tracing::debug!("🔍 [DEBUG] Page-specific fallback: stream Subtype is not a name");
1077 }
1078 }
1079 }
1080 Ok(obj) => {
1081 tracing::debug!("🔍 [DEBUG] Page-specific fallback: object {} {} R is not a stream, got: {:?}", obj_num, gen_num, std::any::type_name_of_val(&obj));
1082 }
1083 Err(e) => {
1084 tracing::debug!("🔍 [DEBUG] Page-specific fallback: failed to get object {} {} R: {}", obj_num, gen_num, e);
1085 }
1086 }
1087 } else {
1088 tracing::debug!("🔍 [DEBUG] Page-specific fallback: XObject reference is not a Reference");
1089 }
1090 } else {
1091 tracing::debug!("🔍 [DEBUG] Page-specific fallback: XObject '{}' not found in resolved resources", xobject_name);
1092 }
1093 } else {
1094 tracing::debug!("🔍 [DEBUG] Page-specific fallback: no XObject dictionary in resolved resources");
1095 }
1096 }
1097 Ok(_) => {
1098 tracing::debug!("🔍 [DEBUG] Page-specific fallback: Resources reference resolved to non-dictionary");
1099 }
1100 Err(e) => {
1101 tracing::debug!(
1102 "🔍 [DEBUG] Page-specific fallback: failed to resolve Resources: {}",
1103 e
1104 );
1105 }
1106 }
1107 }
1108
1109 if let Some(resources) = resources {
1111 if let Some(crate::parser::objects::PdfObject::Dictionary(xobjects)) = resources
1112 .0
1113 .get(&crate::parser::objects::PdfName("XObject".to_string()))
1114 {
1115 #[allow(clippy::collapsible_match)]
1117 if let Some(xobject_ref) = xobjects
1118 .0
1119 .get(&crate::parser::objects::PdfName(xobject_name.to_string()))
1120 {
1121 if let crate::parser::objects::PdfObject::Reference(obj_num, gen_num) =
1122 xobject_ref
1123 {
1124 if let Ok(crate::parser::objects::PdfObject::Stream(stream)) =
1125 self.document.get_object(*obj_num, *gen_num)
1126 {
1127 if let Some(crate::parser::objects::PdfObject::Name(subtype)) = stream
1129 .dict
1130 .0
1131 .get(&crate::parser::objects::PdfName("Subtype".to_string()))
1132 {
1133 if subtype.0 == "Image" {
1134 let width = stream
1135 .dict
1136 .0
1137 .get(&crate::parser::objects::PdfName("Width".to_string()))
1138 .and_then(|w| {
1139 if let crate::parser::objects::PdfObject::Integer(w) = w
1140 {
1141 Some(*w)
1142 } else {
1143 None
1144 }
1145 })
1146 .unwrap_or(0);
1147
1148 let height = stream
1149 .dict
1150 .0
1151 .get(&crate::parser::objects::PdfName("Height".to_string()))
1152 .and_then(|h| {
1153 if let crate::parser::objects::PdfObject::Integer(h) = h
1154 {
1155 Some(*h)
1156 } else {
1157 None
1158 }
1159 })
1160 .unwrap_or(0);
1161
1162 tracing::debug!(
1163 "🔍 [DEBUG] Page-specific XObject {} -> Object {} ({}x{})",
1164 xobject_name,
1165 obj_num,
1166 width,
1167 height
1168 );
1169 return self.extract_image_stream_for_ocr(&stream);
1170 }
1171 }
1172 }
1173 }
1174 }
1175 }
1176 }
1177
1178 Err(OperationError::ParseError(format!(
1179 "No page-specific XObject found for name: {}",
1180 xobject_name
1181 )))
1182 }
1183
1184 fn find_specific_xobject_image(&self, xobject_name: &str) -> OperationResult<Vec<u8>> {
1186 for obj_num in 1..=1000 {
1190 if let Ok(crate::parser::objects::PdfObject::Stream(stream)) =
1192 self.document.get_object(obj_num, 0)
1193 {
1194 if let Some(crate::parser::objects::PdfObject::Name(subtype)) = stream
1196 .dict
1197 .0
1198 .get(&crate::parser::objects::PdfName("Subtype".to_string()))
1199 {
1200 if subtype.0 == "Image" {
1201 let width = stream
1204 .dict
1205 .0
1206 .get(&crate::parser::objects::PdfName("Width".to_string()))
1207 .and_then(|w| {
1208 if let crate::parser::objects::PdfObject::Integer(w) = w {
1209 Some(*w)
1210 } else {
1211 None
1212 }
1213 })
1214 .unwrap_or(0);
1215 let height = stream
1216 .dict
1217 .0
1218 .get(&crate::parser::objects::PdfName("Height".to_string()))
1219 .and_then(|h| {
1220 if let crate::parser::objects::PdfObject::Integer(h) = h {
1221 Some(*h)
1222 } else {
1223 None
1224 }
1225 })
1226 .unwrap_or(0);
1227
1228 if width > 100 && height > 100 {
1230 tracing::debug!(
1231 "🔍 [DEBUG] Using XObject {} -> Object {} ({}x{})",
1232 xobject_name,
1233 obj_num,
1234 width,
1235 height
1236 );
1237 return self.extract_image_stream_for_ocr(&stream);
1238 }
1239 }
1240 }
1241 }
1242 }
1243
1244 Err(OperationError::ParseError(format!(
1245 "No image XObject found for name: {}",
1246 xobject_name
1247 )))
1248 }
1249
1250 fn find_image_xobjects_in_document(&self) -> OperationResult<Vec<u8>> {
1252 for obj_num in 1..=1000 {
1255 if let Ok(crate::parser::objects::PdfObject::Stream(stream)) =
1257 self.document.get_object(obj_num, 0)
1258 {
1259 if let Some(crate::parser::objects::PdfObject::Name(subtype)) = stream
1261 .dict
1262 .0
1263 .get(&crate::parser::objects::PdfName("Subtype".to_string()))
1264 {
1265 if subtype.0 == "Image" {
1266 let width = stream
1268 .dict
1269 .0
1270 .get(&crate::parser::objects::PdfName("Width".to_string()))
1271 .and_then(|w| {
1272 if let crate::parser::objects::PdfObject::Integer(w) = w {
1273 Some(*w)
1274 } else {
1275 None
1276 }
1277 })
1278 .unwrap_or(0);
1279 let height = stream
1280 .dict
1281 .0
1282 .get(&crate::parser::objects::PdfName("Height".to_string()))
1283 .and_then(|h| {
1284 if let crate::parser::objects::PdfObject::Integer(h) = h {
1285 Some(*h)
1286 } else {
1287 None
1288 }
1289 })
1290 .unwrap_or(0);
1291
1292 if width > 100 && height > 100 {
1294 return self.extract_image_stream_for_ocr(&stream);
1295 }
1296 }
1297 }
1298 }
1299 }
1300
1301 Err(OperationError::ParseError(
1302 "No suitable image objects found in document".to_string(),
1303 ))
1304 }
1305
1306 fn extract_image_stream_for_ocr(
1308 &self,
1309 stream: &crate::parser::objects::PdfStream,
1310 ) -> OperationResult<Vec<u8>> {
1311 tracing::debug!(
1312 "🔍 [DEBUG] extract_image_stream_for_ocr called with stream size: {}",
1313 stream.data.len()
1314 );
1315
1316 let width = match stream
1318 .dict
1319 .0
1320 .get(&crate::parser::objects::PdfName("Width".to_string()))
1321 {
1322 Some(crate::parser::objects::PdfObject::Integer(w)) => *w as u32,
1323 _ => {
1324 return Err(OperationError::ParseError(
1325 "Missing image width".to_string(),
1326 ))
1327 }
1328 };
1329
1330 let height = match stream
1331 .dict
1332 .0
1333 .get(&crate::parser::objects::PdfName("Height".to_string()))
1334 {
1335 Some(crate::parser::objects::PdfObject::Integer(h)) => *h as u32,
1336 _ => {
1337 return Err(OperationError::ParseError(
1338 "Missing image height".to_string(),
1339 ))
1340 }
1341 };
1342
1343 let color_space = stream
1345 .dict
1346 .0
1347 .get(&crate::parser::objects::PdfName("ColorSpace".to_string()));
1348 let bits_per_component = match stream.dict.0.get(&crate::parser::objects::PdfName(
1349 "BitsPerComponent".to_string(),
1350 )) {
1351 Some(crate::parser::objects::PdfObject::Integer(bits)) => *bits as u8,
1352 _ => 8,
1353 };
1354
1355 let filter = stream
1357 .dict
1358 .0
1359 .get(&crate::parser::objects::PdfName("Filter".to_string()));
1360 tracing::debug!(
1361 "🔍 [DEBUG] Image properties: {}x{}, {} bits, filter: {:?}",
1362 width,
1363 height,
1364 bits_per_component,
1365 filter
1366 .as_ref()
1367 .map(|f| match f {
1368 crate::parser::objects::PdfObject::Name(n) => n.0.as_str(),
1369 _ => "Array/Other",
1370 })
1371 .unwrap_or("None")
1372 );
1373
1374 let data = match filter {
1376 Some(crate::parser::objects::PdfObject::Name(filter_name)) => match filter_name
1377 .0
1378 .as_str()
1379 {
1380 "DCTDecode" => {
1381 let jpeg_data = &stream.data;
1384
1385 tracing::debug!(
1386 "🔍 [DEBUG] Processing DCTDecode stream: {} bytes",
1387 jpeg_data.len()
1388 );
1389
1390 if jpeg_data.len() < 4 {
1392 return Err(OperationError::ParseError(
1393 "DCTDecode stream too short to be valid JPEG".to_string(),
1394 ));
1395 }
1396
1397 if jpeg_data[0] != 0xFF || jpeg_data[1] != 0xD8 {
1399 return Err(OperationError::ParseError(format!(
1400 "Invalid JPEG stream: missing SOI marker. Found: {:02X}{:02X}, expected FFD8",
1401 jpeg_data[0], jpeg_data[1]
1402 )));
1403 }
1404
1405 tracing::debug!("✅ [DEBUG] JPEG SOI marker found");
1406
1407 let final_jpeg_data = jpeg_data.to_vec();
1409
1410 tracing::debug!(
1411 "🔍 [DEBUG] Final JPEG size: {} bytes",
1412 final_jpeg_data.len()
1413 );
1414
1415 final_jpeg_data
1418 }
1419 filter_name => {
1420 tracing::debug!("🔍 [DEBUG] Decoding stream with filter: {}", filter_name);
1422 let parse_options = self.document.options();
1423 let decoded_data = stream.decode(&parse_options).map_err(|e| {
1424 OperationError::ParseError(format!("Failed to decode image stream: {e}"))
1425 })?;
1426
1427 tracing::debug!(
1428 "🔍 [DEBUG] Decoded stream data: {} bytes",
1429 decoded_data.len()
1430 );
1431
1432 match filter_name {
1433 "FlateDecode" => {
1434 self.convert_raw_to_png_for_ocr(
1436 &decoded_data,
1437 width,
1438 height,
1439 color_space,
1440 bits_per_component,
1441 )?
1442 }
1443 "CCITTFaxDecode" => {
1444 self.convert_ccitt_to_png_for_ocr(&decoded_data, width, height)?
1446 }
1447 "LZWDecode" => {
1448 self.convert_raw_to_png_for_ocr(
1450 &decoded_data,
1451 width,
1452 height,
1453 color_space,
1454 bits_per_component,
1455 )?
1456 }
1457 _ => {
1458 return Err(OperationError::ParseError(format!(
1459 "Unsupported image filter: {}",
1460 filter_name
1461 )))
1462 }
1463 }
1464 }
1465 },
1466 Some(crate::parser::objects::PdfObject::Array(filters)) => {
1467 if let Some(crate::parser::objects::PdfObject::Name(filter)) = filters.0.first() {
1469 match filter.0.as_str() {
1470 "DCTDecode" => {
1471 tracing::debug!("🔍 [DEBUG] Array filter: Using raw JPEG stream data");
1472 stream.data.clone()
1473 }
1474 filter_name => {
1475 tracing::debug!(
1477 "🔍 [DEBUG] Array filter: Decoding stream with filter: {}",
1478 filter_name
1479 );
1480 let parse_options = self.document.options();
1481 let decoded_data = stream.decode(&parse_options).map_err(|e| {
1482 OperationError::ParseError(format!(
1483 "Failed to decode image stream: {e}"
1484 ))
1485 })?;
1486
1487 match filter_name {
1488 "FlateDecode" => self.convert_raw_to_png_for_ocr(
1489 &decoded_data,
1490 width,
1491 height,
1492 color_space,
1493 bits_per_component,
1494 )?,
1495 "CCITTFaxDecode" => {
1496 self.convert_ccitt_to_png_for_ocr(&decoded_data, width, height)?
1497 }
1498 "LZWDecode" => self.convert_raw_to_png_for_ocr(
1499 &decoded_data,
1500 width,
1501 height,
1502 color_space,
1503 bits_per_component,
1504 )?,
1505 _ => {
1506 return Err(OperationError::ParseError(format!(
1507 "Unsupported image filter in array: {}",
1508 filter_name
1509 )))
1510 }
1511 }
1512 }
1513 }
1514 } else {
1515 return Err(OperationError::ParseError("Empty filter array".to_string()));
1516 }
1517 }
1518 _ => {
1519 tracing::debug!("🔍 [DEBUG] No filter: Converting raw image data to PNG");
1521 let parse_options = self.document.options();
1522 let decoded_data = stream.decode(&parse_options).map_err(|e| {
1523 OperationError::ParseError(format!("Failed to decode raw image stream: {e}"))
1524 })?;
1525
1526 self.convert_raw_to_png_for_ocr(
1527 &decoded_data,
1528 width,
1529 height,
1530 color_space,
1531 bits_per_component,
1532 )?
1533 }
1534 };
1535
1536 tracing::debug!("🔍 [DEBUG] Final image data for OCR: {} bytes", data.len());
1537 Ok(data)
1538 }
1539
1540 #[allow(dead_code)]
1543 fn clean_jpeg_data(&self, raw_data: &[u8]) -> Vec<u8> {
1544 tracing::debug!(
1545 "🔍 [DEBUG] Using raw DCTDecode stream as-is: {} bytes",
1546 raw_data.len()
1547 );
1548
1549 raw_data.to_vec()
1552 }
1553
1554 #[cfg(feature = "external-images")]
1555 #[allow(dead_code)]
1556 fn fix_image_rotation_for_ocr(
1557 &self,
1558 image_data: &[u8],
1559 pdf_width: u32,
1560 pdf_height: u32,
1561 ) -> OperationResult<Vec<u8>> {
1562 tracing::debug!("🔍 [DEBUG] Image rotation correction with external-images feature");
1563
1564 let rotation_needed = self.detect_rotation_needed(pdf_width, pdf_height, 0, 0);
1567
1568 if rotation_needed > 0 {
1569 self.rotate_image_externally(image_data, rotation_needed)
1572 } else {
1573 tracing::debug!("🔍 [DEBUG] No rotation correction needed based on dimensions");
1574 Ok(image_data.to_vec())
1575 }
1576 }
1577
1578 #[cfg(not(feature = "external-images"))]
1579 #[allow(dead_code)]
1580 fn fix_image_rotation_for_ocr(
1581 &self,
1582 image_data: &[u8],
1583 _pdf_width: u32,
1584 _pdf_height: u32,
1585 ) -> OperationResult<Vec<u8>> {
1586 tracing::debug!(
1587 "🔍 [DEBUG] Image rotation correction disabled (external-images feature not enabled)"
1588 );
1589 Ok(image_data.to_vec())
1590 }
1591
1592 #[allow(dead_code)]
1593 fn detect_rotation_needed(
1594 &self,
1595 pdf_width: u32,
1596 pdf_height: u32,
1597 img_width: u32,
1598 img_height: u32,
1599 ) -> u8 {
1600 let (actual_img_width, actual_img_height) = if img_width == 0 || img_height == 0 {
1606 (pdf_width, pdf_height)
1607 } else {
1608 (img_width, img_height)
1609 };
1610
1611 tracing::debug!(
1612 "🔍 [DEBUG] Rotation analysis - PDF: {}x{}, Image: {}x{}",
1613 pdf_width,
1614 pdf_height,
1615 actual_img_width,
1616 actual_img_height
1617 );
1618
        if pdf_height > pdf_width {
            tracing::debug!("🔍 [DEBUG] Portrait PDF detected - applying 270° rotation to correct typical scan rotation");
            // 3 => 270 degrees (see `rotate_image_externally`)
            return 3;
        }

        if pdf_width == actual_img_height && pdf_height == actual_img_width {
            tracing::debug!("🔍 [DEBUG] Dimensions swapped - applying 90° rotation");
            // 1 => 90 degrees
            return 1;
        }

        tracing::debug!("🔍 [DEBUG] No rotation correction needed");
        0
    }
1636
1637 #[allow(dead_code)]
1638 fn rotate_image_externally(&self, image_data: &[u8], rotation: u8) -> OperationResult<Vec<u8>> {
1639 use std::fs;
1640 use std::process::Command;
1641
1642 let input_path = format!("examples/results/temp_input_{}.jpg", std::process::id());
1644 let output_path = format!("examples/results/temp_output_{}.jpg", std::process::id());
1645
1646 if let Err(e) = fs::write(&input_path, image_data) {
1648 tracing::debug!("🔍 [DEBUG] Failed to write temp input file: {}", e);
1649 return Ok(image_data.to_vec());
1650 }
1651
        let angle = match rotation {
            1 => "90",
            2 => "180",
            3 => "270",
            _ => {
                let _ = fs::remove_file(&input_path);
                return Ok(image_data.to_vec());
            }
        };
1662
1663 tracing::debug!(
1664 "🔍 [DEBUG] Attempting to rotate image {} degrees using external tool",
1665 angle
1666 );
1667
1668 let sips_result = Command::new("sips")
1670 .arg(&input_path)
1671 .arg("-r")
1672 .arg(angle)
1673 .arg("--out")
1674 .arg(&output_path)
1675 .output();
1676
1677 let rotated_data = match sips_result {
1678 Ok(sips_output) if sips_output.status.success() => match fs::read(&output_path) {
1679 Ok(data) => {
1680 tracing::debug!("🔍 [DEBUG] Successfully rotated image using sips");
1681 data
1682 }
1683 Err(e) => {
1684 tracing::debug!("🔍 [DEBUG] Failed to read sips-rotated image: {}", e);
1685 image_data.to_vec()
1686 }
1687 },
1688 Ok(sips_output) => {
1689 tracing::debug!(
1690 "🔍 [DEBUG] sips failed: {}",
1691 String::from_utf8_lossy(&sips_output.stderr)
1692 );
1693
1694 let result = Command::new("convert")
1696 .arg(&input_path)
1697 .arg("-rotate")
1698 .arg(angle)
1699 .arg(&output_path)
1700 .output();
1701
1702 match result {
1703 Ok(output) if output.status.success() => match fs::read(&output_path) {
1704 Ok(data) => {
1705 tracing::debug!(
1706 "🔍 [DEBUG] Successfully rotated image using ImageMagick"
1707 );
1708 data
1709 }
1710 Err(e) => {
1711 tracing::debug!("🔍 [DEBUG] Failed to read rotated image: {}", e);
1712 image_data.to_vec()
1713 }
1714 },
1715 _ => {
1716 tracing::debug!(
1717 "🔍 [DEBUG] Both sips and ImageMagick failed, using original image"
1718 );
1719 image_data.to_vec()
1720 }
1721 }
1722 }
1723 Err(e) => {
1724 tracing::debug!("🔍 [DEBUG] sips not available: {}", e);
1725 tracing::debug!("🔍 [DEBUG] Trying ImageMagick as fallback...");
1726
1727 let result = Command::new("convert")
1728 .arg(&input_path)
1729 .arg("-rotate")
1730 .arg(angle)
1731 .arg(&output_path)
1732 .output();
1733
1734 match result {
1735 Ok(output) if output.status.success() => match fs::read(&output_path) {
1736 Ok(data) => {
1737 tracing::debug!(
1738 "🔍 [DEBUG] Successfully rotated image using ImageMagick"
1739 );
1740 data
1741 }
1742 Err(e) => {
1743 tracing::debug!("🔍 [DEBUG] Failed to read rotated image: {}", e);
1744 image_data.to_vec()
1745 }
1746 },
1747 _ => {
1748 tracing::debug!(
1749 "🔍 [DEBUG] No external rotation tools available, using original image"
1750 );
1751 image_data.to_vec()
1752 }
1753 }
1754 }
1755 };
1756
1757 let _ = fs::remove_file(&input_path);
1759 let _ = fs::remove_file(&output_path);
1760
1761 Ok(rotated_data)
1762 }
1763
1764 #[allow(dead_code)]
1767 fn clean_corrupted_jpeg(
1768 &self,
1769 corrupted_jpeg_data: &[u8],
1770 width: u32,
1771 _height: u32,
1772 ) -> OperationResult<Vec<u8>> {
1773 use std::fs;
1774 use std::process::Command;
1775
1776 tracing::debug!("🔧 [DEBUG] Cleaning corrupted JPEG using sips");
1777
1778 let temp_id = std::process::id();
1780 let input_path = format!("/tmp/ocr_corrupted_{}_{}.jpg", temp_id, width);
1781 let output_path = format!("/tmp/ocr_clean_{}_{}.jpg", temp_id, width);
1782
1783 fs::write(&input_path, corrupted_jpeg_data).map_err(|e| {
1785 OperationError::ProcessingError(format!("Failed to write temp JPEG: {e}"))
1786 })?;
1787
1788 tracing::debug!("🔧 [DEBUG] Saved corrupted JPEG to: {}", input_path);
1789
        let output = Command::new("sips")
            .args([
                "-s",
                "format",
                "jpeg",
                "-s",
                "formatOptions",
                "100",
                &input_path,
                "--out",
                &output_path,
            ])
            .output()
            .map_err(|e| OperationError::ProcessingError(format!("Failed to run sips: {e}")))?;
1805
1806 if !output.status.success() {
1807 let stderr = String::from_utf8_lossy(&output.stderr);
1808 tracing::debug!("❌ [DEBUG] sips failed: {}", stderr);
1809
1810 let _ = fs::remove_file(&input_path);
1812 let _ = fs::remove_file(&output_path);
1813
1814 tracing::debug!("🔧 [DEBUG] Falling back to original JPEG data");
1816 return Ok(corrupted_jpeg_data.to_vec());
1817 }
1818
1819 let cleaned_data = fs::read(&output_path).map_err(|e| {
1821 OperationError::ProcessingError(format!("Failed to read cleaned JPEG: {e}"))
1822 })?;
1823
1824 tracing::debug!(
1825 "🔧 [DEBUG] Successfully cleaned JPEG: {} -> {} bytes",
1826 corrupted_jpeg_data.len(),
1827 cleaned_data.len()
1828 );
1829
1830 let _ = fs::remove_file(&input_path);
1834 let _ = fs::remove_file(&output_path);
1835
1836 Ok(cleaned_data)
1837 }
1838
1839 fn calculate_page_area(&self, page: &crate::parser::ParsedPage) -> OperationResult<f64> {
1843 let width = page.width();
1845 let height = page.height();
1846
1847 Ok(width * height)
1848 }
1849
1850 fn analyze_text_content(&self, page_number: usize) -> OperationResult<TextAnalysisResult> {
1852 let mut extractor = TextExtractor::with_options(ExtractionOptions {
1853 preserve_layout: true,
1854 space_threshold: 0.3,
1855 newline_threshold: 10.0,
1856 ..Default::default()
1857 });
1858
1859 let extracted_text = extractor
1860 .extract_from_page(&self.document, page_number as u32)
1861 .map_err(|e| OperationError::ParseError(e.to_string()))?;
1862
1863 let mut total_area = 0.0;
1864 let mut fragment_count = 0;
1865 let character_count = extracted_text.text.len();
1866
1867 for fragment in &extracted_text.fragments {
1869 if fragment.text.trim().len() >= self.options.min_text_fragment_size {
1870 total_area += fragment.width * fragment.height;
1871 fragment_count += 1;
1872 }
1873 }
1874
1875 Ok(TextAnalysisResult {
1876 total_area,
1877 fragment_count,
1878 character_count,
1879 })
1880 }
1881
1882 fn analyze_image_content(&self, page_number: usize) -> OperationResult<ImageAnalysisResult> {
1884 let page = self
1887 .document
1888 .get_page(page_number as u32)
1889 .map_err(|e| OperationError::ParseError(e.to_string()))?;
1890
1891 let resources = self
1895 .document
1896 .get_page_resources(&page)
1897 .map_err(|e| OperationError::ParseError(e.to_string()))?;
1898
1899 let mut total_area = 0.0;
1900 let mut image_count = 0;
1901
1902 if let Some(resources) = &resources {
1904 if let Some(crate::parser::objects::PdfObject::Dictionary(xobjects)) = resources
1905 .0
1906 .get(&crate::parser::objects::PdfName("XObject".to_string()))
1907 {
1908 for obj_ref in xobjects.0.values() {
1909 if let crate::parser::objects::PdfObject::Reference(obj_num, gen_num) = obj_ref
1910 {
1911 if let Ok(crate::parser::objects::PdfObject::Stream(stream)) =
1912 self.document.get_object(*obj_num, *gen_num)
1913 {
1914 if let Some(crate::parser::objects::PdfObject::Name(subtype)) = stream
1916 .dict
1917 .0
1918 .get(&crate::parser::objects::PdfName("Subtype".to_string()))
1919 {
1920 if subtype.0 == "Image" {
1921 image_count += 1;
1922
1923 let width =
1925 match stream.dict.0.get(&crate::parser::objects::PdfName(
1926 "Width".to_string(),
1927 )) {
1928 Some(crate::parser::objects::PdfObject::Integer(w)) => {
1929 *w as f64
1930 }
1931 _ => 0.0,
1932 };
1933
1934 let height =
1935 match stream.dict.0.get(&crate::parser::objects::PdfName(
1936 "Height".to_string(),
1937 )) {
1938 Some(crate::parser::objects::PdfObject::Integer(h)) => {
1939 *h as f64
1940 }
1941 _ => 0.0,
1942 };
1943
1944 if width >= self.options.min_image_size as f64
1946 && height >= self.options.min_image_size as f64
1947 {
1948 total_area += width * height;
1949 }
1950 }
1951 }
1952 }
1953 }
1954 }
1955 }
1956 }
1957
1958 if let Ok(content_streams) = self.document.get_page_content_streams(&page) {
1960 for content_stream in content_streams.iter() {
1961 let content_str = String::from_utf8_lossy(content_stream);
1962
1963 let bi_count = content_str.matches("BI").count();
1965 let ei_count = content_str.matches("EI").count();
1966
1967 if bi_count > 0 && ei_count > 0 {
1968 image_count += bi_count.min(ei_count);
1969 let page_area = page.width() * page.height();
1971 total_area += page_area * (bi_count.min(ei_count) as f64);
1972 }
1973
1974 let do_count = content_str.matches(" Do").count();
1976 if do_count > 0 && image_count == 0 {
1977 image_count += do_count;
1979 let page_area = page.width() * page.height();
1980 total_area += page_area * (do_count as f64);
1981 }
1982 }
1983 }
1984
1985 Ok(ImageAnalysisResult {
1986 total_area,
1987 image_count,
1988 })
    }

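    /// Classifies a page from its area ratios:
    ///
    /// * `image_ratio > scanned_threshold` and `text_ratio < 0.1` -> `Scanned`
    /// * `text_ratio > text_threshold` and `image_ratio < 0.2` -> `Text`
    /// * everything else -> `Mixed`
    ///
    /// With the default options (0.8 / 0.7), a page that is 90% image and 5%
    /// text is classified as `Scanned`.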
    fn determine_page_type(&self, text_ratio: f64, image_ratio: f64) -> PageType {
        if image_ratio > self.options.scanned_threshold && text_ratio < 0.1 {
            PageType::Scanned
        } else if text_ratio > self.options.text_threshold && image_ratio < 0.2 {
            PageType::Text
        } else {
            PageType::Mixed
        }
    }

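    /// Wraps raw (already decoded) pixel data in a minimal PNG container so an
    /// OCR engine can consume it: PNG signature, an IHDR chunk built from the
    /// stream's Width/Height/BitsPerComponent and color space, a single IDAT
    /// chunk holding zlib-compressed scanlines (filter byte 0 per row), and an
    /// IEND chunk. Unknown color spaces are assumed to be RGB.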
    fn convert_raw_to_png_for_ocr(
        &self,
        data: &[u8],
        width: u32,
        height: u32,
        color_space: Option<&crate::parser::objects::PdfObject>,
        bits_per_component: u8,
    ) -> OperationResult<Vec<u8>> {
        let components = match color_space {
            Some(crate::parser::objects::PdfObject::Name(cs)) => match cs.0.as_str() {
                "DeviceGray" => 1,
                "DeviceRGB" => 3,
                "DeviceCMYK" => 4,
                _ => 3,
            },
            _ => 3,
        };

        let mut png_data = Vec::new();

        // PNG signature
        png_data.extend_from_slice(&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]);

        // IHDR chunk
        let mut ihdr = Vec::new();
        ihdr.extend_from_slice(&width.to_be_bytes());
        ihdr.extend_from_slice(&height.to_be_bytes());
        ihdr.push(bits_per_component);

        let color_type = match components {
            1 => 0, // grayscale
            3 => 2, // truecolor (RGB)
            4 => 6, // truecolor with alpha
            _ => 2,
        };
        ihdr.push(color_type);
        ihdr.push(0); // compression method
        ihdr.push(0); // filter method
        ihdr.push(0); // interlace method
        self.write_png_chunk(&mut png_data, b"IHDR", &ihdr);

        // IDAT chunk with the compressed scanlines
        let compressed_data = self.compress_png_data(data, width, height, components)?;
        self.write_png_chunk(&mut png_data, b"IDAT", &compressed_data);

        // IEND chunk
        self.write_png_chunk(&mut png_data, b"IEND", &[]);

        Ok(png_data)
    }
2071
2072 fn convert_ccitt_to_png_for_ocr(
2074 &self,
2075 data: &[u8],
2076 width: u32,
2077 height: u32,
2078 ) -> OperationResult<Vec<u8>> {
2079 let mut grayscale_data = Vec::new();
2081
2082 let bits_per_row = width as usize;
2083 let bytes_per_row = bits_per_row.div_ceil(8);
2084
2085 for row in 0..height {
2086 let row_start = row as usize * bytes_per_row;
2087
2088 for col in 0..width {
2089 let byte_idx = row_start + (col as usize / 8);
2090 let bit_idx = 7 - (col as usize % 8);
2091
2092 if byte_idx < data.len() {
2093 let bit = (data[byte_idx] >> bit_idx) & 1;
2094 let gray_value = if bit == 0 { 0 } else { 255 };
                    grayscale_data.push(gray_value);
                } else {
                    // Past the end of the available data: pad with white.
                    grayscale_data.push(255);
                }
            }
        }
2102
2103 self.convert_raw_to_png_for_ocr(
2105 &grayscale_data,
2106 width,
2107 height,
2108 Some(&crate::parser::objects::PdfObject::Name(
2109 crate::parser::objects::PdfName("DeviceGray".to_string()),
2110 )),
2111 8,
2112 )
2113 }
2114
2115 fn write_png_chunk(&self, output: &mut Vec<u8>, chunk_type: &[u8; 4], data: &[u8]) {
2117 output.extend_from_slice(&(data.len() as u32).to_be_bytes());
2119
2120 output.extend_from_slice(chunk_type);
2122
2123 output.extend_from_slice(data);
2125
2126 let crc = self.calculate_png_crc32(chunk_type, data);
2128 output.extend_from_slice(&crc.to_be_bytes());
    }

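    /// Computes the CRC-32 that terminates a PNG chunk. The checksum covers
    /// the chunk type and chunk data (not the length field), using the
    /// standard reflected polynomial 0xEDB88320 with initial value 0xFFFFFFFF
    /// and a final bit inversion, as required by the PNG specification.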
    fn calculate_png_crc32(&self, chunk_type: &[u8; 4], data: &[u8]) -> u32 {
        let mut crc: u32 = 0xFFFFFFFF;

        for &byte in chunk_type {
            crc ^= byte as u32;
            for _ in 0..8 {
                if crc & 1 != 0 {
                    crc = (crc >> 1) ^ 0xEDB88320;
                } else {
                    crc >>= 1;
                }
            }
        }

        for &byte in data {
            crc ^= byte as u32;
            for _ in 0..8 {
                if crc & 1 != 0 {
                    crc = (crc >> 1) ^ 0xEDB88320;
                } else {
                    crc >>= 1;
                }
            }
        }

        crc ^ 0xFFFFFFFF
    }
2161
2162 fn compress_png_data(
2164 &self,
2165 data: &[u8],
2166 width: u32,
2167 height: u32,
2168 components: u8,
2169 ) -> OperationResult<Vec<u8>> {
2170 use flate2::write::ZlibEncoder;
2171 use flate2::Compression;
2172 use std::io::Write;
2173
2174 let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
2175
2176 let bytes_per_pixel = components as usize;
2178 let bytes_per_row = width as usize * bytes_per_pixel;
2179
2180 for row in 0..height {
2181 encoder.write_all(&[0])?;
2183
2184 let start = row as usize * bytes_per_row;
2186 let end = start + bytes_per_row;
2187 if end <= data.len() {
2188 encoder.write_all(&data[start..end])?;
2189 } else {
2190 let available = data.len().saturating_sub(start);
2192 if available > 0 {
2193 encoder.write_all(&data[start..start + available])?;
2194 }
2195 let padding = bytes_per_row.saturating_sub(available);
2196 for _ in 0..padding {
2197 encoder.write_all(&[0])?;
2198 }
2199 }
2200 }
2201
2202 encoder
2203 .finish()
2204 .map_err(|e| OperationError::ParseError(format!("Failed to compress PNG data: {e}")))
2205 }
2206}
2207
2208struct TextAnalysisResult {
2210 total_area: f64,
2211 fragment_count: usize,
2212 character_count: usize,
2213}
2214
2215struct ImageAnalysisResult {
2217 total_area: f64,
2218 image_count: usize,
2219}
2220
2221fn simulate_page_ocr_processing<P: OcrProvider>(
2223 page_num: usize,
2224 ocr_provider: &P,
2225) -> Result<OcrProcessingResult, crate::text::ocr::OcrError> {
2226 let mock_image_data = vec![
2228 0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x01, 0x00,
2229 0x48, 0x00, 0x48, 0x00, 0x00, 0xFF, 0xD9,
2230 ];
2231
2232 let options = crate::text::ocr::OcrOptions {
2233 language: "eng".to_string(),
2234 min_confidence: 0.6,
2235 preserve_layout: true,
2236 preprocessing: crate::text::ocr::ImagePreprocessing::default(),
2237 engine_options: std::collections::HashMap::new(),
2238 timeout_seconds: 30,
2239 regions: None,
2240 debug_output: false,
2241 };
2242
2243 let mut result = ocr_provider.process_image(&mock_image_data, &options)?;
2245
2246 result.text = format!("Page {page_num} text extracted via OCR");
2248
2249 Ok(result)
2250}
2251
2252#[cfg(test)]
2253mod tests {
2254 use super::*;
2255
2256 #[test]
2257 fn test_page_type_classification() {
2258 assert!(PageType::Scanned.is_scanned());
2259 assert!(!PageType::Text.is_scanned());
2260 assert!(!PageType::Mixed.is_scanned());
2261
2262 assert!(PageType::Text.is_text());
2263 assert!(!PageType::Scanned.is_text());
2264 assert!(!PageType::Mixed.is_text());
2265
2266 assert!(PageType::Mixed.is_mixed());
2267 assert!(!PageType::Scanned.is_mixed());
2268 assert!(!PageType::Text.is_mixed());
2269 }
2270
2271 #[test]
2272 fn test_content_analysis_methods() {
2273 let analysis = ContentAnalysis {
2274 page_number: 0,
2275 page_type: PageType::Scanned,
2276 text_ratio: 0.05,
2277 image_ratio: 0.90,
2278 blank_space_ratio: 0.05,
2279 text_fragment_count: 2,
2280 image_count: 1,
2281 character_count: 15,
2282 };
2283
2284 assert!(analysis.is_scanned());
2285 assert!(!analysis.is_text_heavy());
2286 assert!(!analysis.is_mixed_content());
2287 assert_eq!(analysis.dominant_content_ratio(), 0.90);
2288 }
2289
2290 #[test]
2291 fn test_analysis_options_default() {
2292 let options = AnalysisOptions::default();
2293 assert_eq!(options.min_text_fragment_size, 3);
2294 assert_eq!(options.min_image_size, 50);
2295 assert_eq!(options.scanned_threshold, 0.8);
2296 assert_eq!(options.text_threshold, 0.7);
2297 assert!(options.ocr_options.is_none());
2298 }
2299
2300 #[test]
2301 fn test_determine_page_type() {
2302 let options = AnalysisOptions::default();
2304
2305 let page_type = if 0.90 > options.scanned_threshold && 0.05 < 0.1 {
2307 PageType::Scanned
2308 } else if 0.05 > options.text_threshold && 0.90 < 0.2 {
2309 PageType::Text
2310 } else {
2311 PageType::Mixed
2312 };
2313 assert_eq!(page_type, PageType::Scanned);
2314
2315 let page_type = if 0.10 > options.scanned_threshold && 0.80 < 0.1 {
2317 PageType::Scanned
2318 } else if 0.80 > options.text_threshold && 0.10 < 0.2 {
2319 PageType::Text
2320 } else {
2321 PageType::Mixed
2322 };
2323 assert_eq!(page_type, PageType::Text);
2324
2325 let page_type = if 0.40 > options.scanned_threshold && 0.50 < 0.1 {
2327 PageType::Scanned
2328 } else if 0.50 > options.text_threshold && 0.40 < 0.2 {
2329 PageType::Text
2330 } else {
2331 PageType::Mixed
2332 };
2333 assert_eq!(page_type, PageType::Mixed);
2334 }
2335}
2336
2337#[cfg(test)]
2338#[path = "page_analysis_tests.rs"]
2339mod page_analysis_tests;
2340
2341#[cfg(test)]
2342#[path = "page_analysis_ocr_tests.rs"]
2343mod page_analysis_ocr_tests;
2344
2345#[cfg(test)]
2346mod comprehensive_tests {
2347 use super::*;
2348 use crate::parser::{PdfDocument, PdfReader};
2349 use crate::text::{MockOcrProvider, OcrError, OcrOptions, OcrProvider};
2350 use std::fs::File;
2351 use std::io::Write;
2352 use std::sync::Mutex;
2353 use std::time::Duration;
2354 use tempfile::NamedTempFile;
2355
2356 fn create_mock_document() -> crate::parser::document::PdfDocument<std::fs::File> {
2358 use crate::{Document, Page};
2360
2361 let mut doc = Document::new();
2362 doc.add_page(Page::a4());
2363
2364 let temp_file = NamedTempFile::new().expect("Failed to create temp file");
2366 doc.save(temp_file.path()).expect("Failed to save PDF");
2367
2368 let file = std::fs::File::open(temp_file.path()).expect("Failed to open PDF file");
2370 let reader =
2371 crate::parser::reader::PdfReader::new(file).expect("Failed to create PDF reader");
2372 crate::parser::document::PdfDocument::new(reader)
2373 }
2374
2375 #[test]
2377 fn test_text_analysis_result_struct() {
2378 let result = TextAnalysisResult {
2379 total_area: 1000.0,
2380 fragment_count: 10,
2381 character_count: 500,
2382 };
2383
2384 assert_eq!(result.total_area, 1000.0);
2385 assert_eq!(result.fragment_count, 10);
2386 assert_eq!(result.character_count, 500);
2387 }
2388
2389 #[test]
2391 fn test_image_analysis_result_struct() {
2392 let result = ImageAnalysisResult {
2393 total_area: 5000.0,
2394 image_count: 3,
2395 };
2396
2397 assert_eq!(result.total_area, 5000.0);
2398 assert_eq!(result.image_count, 3);
2399 }
2400
2401 #[test]
2403 fn test_analyzer_with_custom_options() {
2404 let doc = create_mock_document();
2405 let custom_options = AnalysisOptions {
2406 min_text_fragment_size: 10,
2407 min_image_size: 200,
2408 scanned_threshold: 0.9,
2409 text_threshold: 0.6,
2410 ocr_options: Some(OcrOptions {
2411 language: "de".to_string(),
2412 min_confidence: 0.85,
2413 ..Default::default()
2414 }),
2415 };
2416
2417 let analyzer = PageContentAnalyzer::with_options(doc, custom_options);
2418
2419 let page_count_result = analyzer.document.page_count();
2421 assert!(page_count_result.is_ok());
2422 assert_eq!(page_count_result.unwrap(), 1);
2423 }
2424
2425 #[test]
2427 fn test_multiple_analyzers() {
2428 let analyzers: Vec<_> = (0..3)
2430 .map(|_| {
2431 let doc = create_mock_document();
2432 PageContentAnalyzer::new(doc)
2433 })
2434 .collect();
2435
2436 for (i, analyzer) in analyzers.iter().enumerate() {
2438 let result = analyzer.document.page_count();
2439 assert!(result.is_ok());
2440 assert_eq!(result.unwrap(), 1);
2441 tracing::debug!("Analyzer {i} works correctly");
2442 }
2443 }
2444
2445 #[test]
2447 fn test_custom_options_propagation() {
2448 let doc = create_mock_document();
2449 let custom_options = AnalysisOptions {
2450 min_text_fragment_size: 15,
2451 min_image_size: 300,
2452 scanned_threshold: 0.85,
2453 text_threshold: 0.65,
2454 ocr_options: None,
2455 };
2456
2457 let analyzer = PageContentAnalyzer::with_options(doc, custom_options);
2458
2459 let result = analyzer.analyze_page(0);
2461 assert!(result.is_ok());
2462 }
2463
2464 #[test]
2466 fn test_empty_document_analysis() {
        let pdf_data = b"%PDF-1.4
1 0 obj
<<
/Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj
<<
/Type /Pages
/Kids []
/Count 0
>>
endobj
xref
0 3
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
trailer
<<
/Size 3
/Root 1 0 R
>>
startxref
107
%%EOF";
2495
2496 let mut temp_file = NamedTempFile::new().expect("Failed to create temp file");
2498 temp_file
2499 .write_all(pdf_data)
2500 .expect("Failed to write PDF data");
2501 temp_file.flush().expect("Failed to flush");
2502
2503 let path = temp_file.path().to_owned();
2505 let file = File::open(&path).expect("Failed to open temp file");
2506
2507 std::mem::forget(temp_file);
2509
2510 let result = PdfReader::new(file);
2512 if result.is_err() {
2513 return;
2516 }
2517
2518 let reader = result.unwrap();
2519 let doc = PdfDocument::new(reader);
2520 let analyzer = PageContentAnalyzer::new(doc);
2521
2522 let analysis_result = analyzer.analyze_document();
2523 assert!(analysis_result.is_ok());
2524 assert_eq!(analysis_result.unwrap().len(), 0);
2525
2526 let scanned_pages = analyzer.find_scanned_pages();
2527 assert!(scanned_pages.is_ok());
2528 assert_eq!(scanned_pages.unwrap().len(), 0);
2529 }
2530
2531 #[test]
2533 fn test_invalid_page_number_handling() {
2534 let doc = create_mock_document();
2535 let analyzer = PageContentAnalyzer::new(doc);
2536
2537 let result = analyzer.analyze_page(999);
2539 if result.is_err() {
2543 assert!(result.unwrap_err().to_string().contains("Page"));
2544 } else {
2545 let analysis = result.unwrap();
2547 assert_eq!(analysis.page_number, 999);
2548 }
2549
2550 let result = analyzer.is_scanned_page(100);
2552 if result.is_err() {
2554 assert!(result.unwrap_err().to_string().contains("Page"));
2555 } else {
2556 let _is_scanned = result.unwrap();
2558 }
2559 }
2560
2561 #[test]
2563 fn test_ocr_extraction_non_scanned_page() {
2564 let doc = create_mock_document();
2565 let analyzer = PageContentAnalyzer::new(doc);
2566 let ocr_provider = MockOcrProvider::new();
2567
2568 let result = analyzer.extract_text_from_scanned_page(0, &ocr_provider);
2570 assert!(result.is_err());
2571 assert!(result
2572 .unwrap_err()
2573 .to_string()
2574 .contains("not a scanned page"));
2575 }
2576
2577 #[test]
2579 fn test_ocr_processing_fallback() {
2580 let doc = create_mock_document();
2581 let analyzer = PageContentAnalyzer::new(doc);
2582 let ocr_provider = MockOcrProvider::new();
2583
2584 let result = analyzer.process_scanned_pages_with_ocr(&ocr_provider);
2586 assert!(result.is_ok());
2587
2588 let result = analyzer.process_scanned_pages_batch(&ocr_provider, 1);
2590 assert!(result.is_ok());
2591 }
2592
2593 #[test]
2595 fn test_ocr_processing_edge_cases() {
2596 let doc = create_mock_document();
2597 let analyzer = PageContentAnalyzer::new(doc);
2598 let ocr_provider = MockOcrProvider::new();
2599
2600 let result = analyzer.find_scanned_pages();
2602 assert!(result.is_ok());
2603
2604 let result = analyzer.process_scanned_pages_batch(&ocr_provider, 0);
2606 assert!(result.is_ok());
2607 }
2608
2609 #[test]
2611 fn test_batch_ocr_processing() {
2612 let doc = create_mock_document();
2613 let analyzer = PageContentAnalyzer::new(doc);
2614 let ocr_provider = MockOcrProvider::new();
2615
2616 let result = analyzer.process_scanned_pages_batch(&ocr_provider, 1);
2618 assert!(result.is_ok());
2619
2620 let result = analyzer.process_scanned_pages_batch(&ocr_provider, 5);
2622 assert!(result.is_ok());
2623
2624 let result = analyzer.process_scanned_pages_batch(&ocr_provider, 100);
2626 assert!(result.is_ok());
2627 }
2628
2629 #[test]
2631 fn test_analyze_specific_pages() {
2632 let doc = create_mock_document();
2633 let analyzer = PageContentAnalyzer::new(doc);
2634
2635 let result = analyzer.analyze_pages(&[0]);
2637 assert!(result.is_ok());
2638 assert_eq!(result.unwrap().len(), 1);
2639
2640 let result = analyzer.analyze_pages(&[0, 99]);
2642 assert!(result.is_ok());
2643 let analyses = result.unwrap();
2644 assert!(analyses.len() >= 1);
2646 assert_eq!(analyses[0].page_number, 0);
2647 }
2648
2649 #[test]
2651 fn test_content_analysis_edge_cases() {
2652 let analysis = ContentAnalysis {
2654 page_number: 0,
2655 page_type: PageType::Mixed,
2656 text_ratio: 0.0,
2657 image_ratio: 0.0,
2658 blank_space_ratio: 1.0,
2659 text_fragment_count: 0,
2660 image_count: 0,
2661 character_count: 0,
2662 };
2663
2664 assert!(!analysis.is_scanned());
2665 assert!(!analysis.is_text_heavy());
2666 assert!(analysis.is_mixed_content());
2667 assert_eq!(analysis.dominant_content_ratio(), 0.0);
2670
2671 let analysis2 = ContentAnalysis {
2673 page_number: 1,
2674 page_type: PageType::Mixed,
2675 text_ratio: 0.33,
2676 image_ratio: 0.33,
2677 blank_space_ratio: 0.34,
2678 text_fragment_count: 10,
2679 image_count: 5,
2680 character_count: 100,
2681 };
2682
2683 assert!(analysis2.is_mixed_content());
        assert_eq!(analysis2.dominant_content_ratio(), 0.33);
    }
2686
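    // MockOcrProvider exposes knobs for the returned text, the confidence score, and an
    // artificial processing delay; the delay should be observable as elapsed wall-clock time.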
2687 #[test]
2689 fn test_ocr_provider_mock_customization() {
2690 let mut provider = MockOcrProvider::new();
2691
2692 provider.set_mock_text("Custom OCR result for testing".to_string());
2694 provider.set_confidence(0.99);
2695 provider.set_processing_delay(10);
2696
2697 let options = OcrOptions::default();
        // Minimal JPEG/JFIF header bytes stand in for real image data.
        let mock_image = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46];
        let start = std::time::Instant::now();
2701 let result = provider.process_image(&mock_image, &options);
2702 let elapsed = start.elapsed();
2703
2704 assert!(result.is_ok());
2705 let ocr_result = result.unwrap();
2706 assert!(ocr_result.text.contains("Custom OCR result"));
2707 assert_eq!(ocr_result.confidence, 0.99);
2708 assert!(elapsed >= Duration::from_millis(10));
2709 }
2710
2711 #[test]
2713 fn test_simulate_page_ocr_processing() {
2714 let provider = MockOcrProvider::new();
2715 let result = simulate_page_ocr_processing(5, &provider);
2716
2717 assert!(result.is_ok());
2718 let ocr_result = result.unwrap();
2719 assert!(ocr_result.text.contains("Page 5"));
2720 assert_eq!(ocr_result.language, "eng");
2721 }
2722
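    // An OCR provider that fails on every call must not abort the whole run:
    // the call still returns Ok, just with no results.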
2723 #[test]
2725 fn test_process_scanned_pages_error_handling() {
2726 struct FailingOcrProvider;
2728
2729 impl OcrProvider for FailingOcrProvider {
2730 fn process_image(
2731 &self,
2732 _: &[u8],
2733 _: &OcrOptions,
2734 ) -> Result<OcrProcessingResult, OcrError> {
2735 Err(OcrError::ProcessingFailed("Simulated failure".to_string()))
2736 }
2737
2738 fn process_page(
2739 &self,
2740 _: &ContentAnalysis,
2741 _: &[u8],
2742 _: &OcrOptions,
2743 ) -> Result<OcrProcessingResult, OcrError> {
2744 Err(OcrError::ProcessingFailed("Simulated failure".to_string()))
2745 }
2746
2747 fn supported_formats(&self) -> Vec<crate::graphics::ImageFormat> {
2748 vec![]
2749 }
2750
2751 fn engine_name(&self) -> &str {
2752 "Failing"
2753 }
2754
2755 fn engine_type(&self) -> crate::text::OcrEngine {
2756 crate::text::OcrEngine::Mock
2757 }
2758 }
2759
2760 let doc = create_mock_document();
2761 let analyzer = PageContentAnalyzer::new(doc);
2762 let failing_provider = FailingOcrProvider;
2763
2764 let result = analyzer.process_scanned_pages_with_ocr(&failing_provider);
2766 assert!(result.is_ok());
        // The run as a whole succeeds and yields no OCR results.
        assert_eq!(result.unwrap().len(), 0);
    }
2769
2770 #[test]
2772 fn test_page_area_calculation() {
2773 let doc = create_mock_document();
2774 let analyzer = PageContentAnalyzer::new(doc);
2775
2776 let page = analyzer.document.get_page(0).unwrap();
2778 let area = analyzer.calculate_page_area(&page);
2779
2780 assert!(area.is_ok());
2781 let area_value = area.unwrap();
2782 assert!(area_value > 0.0);
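        // 500,990 pt² corresponds to a 595 x 842 pt (A4) media box.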
2783 assert_eq!(area_value, 500990.0);
2785 }
2786
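    // Exercise determine_page_type with ratios just above and below the default
    // thresholds (scanned_threshold 0.8, text_threshold 0.7).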
2787 #[test]
2789 fn test_determine_page_type_exact_thresholds() {
2790 let analyzer = PageContentAnalyzer::new(create_mock_document());
2791
2792 let page_type = analyzer.determine_page_type(0.09, 0.81);
2794 assert_eq!(page_type, PageType::Scanned);
2795
2796 let page_type = analyzer.determine_page_type(0.71, 0.19);
2798 assert_eq!(page_type, PageType::Text);
2799
2800 let page_type = analyzer.determine_page_type(0.7, 0.8);
2802 assert_eq!(page_type, PageType::Mixed);
2803 }
2804
2805 #[test]
2807 fn test_analysis_options_with_ocr_configuration() {
2808 let mut engine_options = std::collections::HashMap::new();
2809 engine_options.insert("tesseract_psm".to_string(), "3".to_string());
2810 engine_options.insert("custom_param".to_string(), "value".to_string());
2811
2812 let ocr_options = OcrOptions {
2813 language: "ja".to_string(),
2814 min_confidence: 0.9,
2815 preserve_layout: false,
2816 timeout_seconds: 60,
2817 engine_options,
2818 ..Default::default()
2819 };
2820
2821 let analysis_options = AnalysisOptions {
2822 min_text_fragment_size: 1,
2823 min_image_size: 10,
2824 scanned_threshold: 0.95,
2825 text_threshold: 0.5,
2826 ocr_options: Some(ocr_options),
2827 };
2828
2829 assert!(analysis_options.ocr_options.is_some());
2830 let ocr_opts = analysis_options.ocr_options.unwrap();
2831 assert_eq!(ocr_opts.language, "ja");
2832 assert_eq!(ocr_opts.timeout_seconds, 60);
2833 assert_eq!(ocr_opts.engine_options.len(), 2);
2834 }
2835
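    // AnalysisOptions can carry a fully customised OcrOptions, including engine-specific
    // key/value parameters; the values must survive being stored and read back.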
2836 #[test]
2838 fn test_content_ratios_sum_to_one() {
2839 let analysis = ContentAnalysis {
2840 page_number: 0,
2841 page_type: PageType::Mixed,
2842 text_ratio: 0.25,
2843 image_ratio: 0.45,
2844 blank_space_ratio: 0.30,
2845 text_fragment_count: 20,
2846 image_count: 3,
2847 character_count: 500,
2848 };
2849
2850 let total = analysis.text_ratio + analysis.image_ratio + analysis.blank_space_ratio;
2851 assert!((total - 1.0).abs() < 0.001);
2852 }
2853
2854 #[test]
2856 fn test_multiple_sequential_analyzers() {
2857 for i in 0..5 {
2859 let doc = create_mock_document();
2860 let analyzer = PageContentAnalyzer::new(doc);
2861 let result = analyzer.analyze_page(0);
2862 assert!(result.is_ok());
2863 tracing::debug!("Analyzer {i} completed analysis");
2864 }
2865 }
2866
2867 #[test]
2869 fn test_extract_page_image_data_no_xobjects() {
2870 let doc = create_mock_document();
2871 let analyzer = PageContentAnalyzer::new(doc);
2872
2873 let result = analyzer.extract_page_image_data(0);
2875 assert!(result.is_err());
2876 assert!(result
2877 .unwrap_err()
2878 .to_string()
2879 .contains("No image data found"));
2880 }
2881
2882 #[test]
2884 fn test_analyze_text_content_fragment_filtering() {
2885 let doc = create_mock_document();
2886 let custom_options = AnalysisOptions {
            // Raise the fragment-size floor well above the default of 3.
            min_text_fragment_size: 20,
            ..Default::default()
        };
2890 let analyzer = PageContentAnalyzer::with_options(doc, custom_options);
2891
2892 let result = analyzer.analyze_text_content(0);
2893 assert!(result.is_ok());
2894 }
2896
2897 #[test]
2899 fn test_ocr_automatic_configuration() {
2900 let doc = create_mock_document();
2901 let analyzer = PageContentAnalyzer::new(doc);
2902 let provider = MockOcrProvider::new();
2903
2904 let result = analyzer.process_scanned_pages_with_ocr(&provider);
2906 assert!(result.is_ok());
2907
2908 let scanned = analyzer.find_scanned_pages();
2910 assert!(scanned.is_ok());
2911 }
2912
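    // Image preprocessing settings are carried inside the OCR options attached to AnalysisOptions.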
2913 #[test]
2915 fn test_ocr_preprocessing_in_analysis() {
2916 let preprocessing = crate::text::ImagePreprocessing {
2917 denoise: false,
2918 deskew: false,
2919 enhance_contrast: true,
2920 sharpen: true,
2921 scale_factor: 1.5,
2922 };
2923
2924 let ocr_options = OcrOptions {
2925 preprocessing,
2926 ..Default::default()
2927 };
2928
2929 let analysis_options = AnalysisOptions {
2930 ocr_options: Some(ocr_options),
2931 ..Default::default()
2932 };
2933
2934 let doc = create_mock_document();
2935 let analyzer = PageContentAnalyzer::with_options(doc, analysis_options);
2936
2937 assert!(analyzer.options.ocr_options.is_some());
2939 }
2940
2941 #[test]
2943 fn test_batch_processing_timing() {
2944 let doc = create_mock_document();
2945 let analyzer = PageContentAnalyzer::new(doc);
2946 let provider = MockOcrProvider::new();
2947
2948 let start = std::time::Instant::now();
2949 let result = analyzer.process_scanned_pages_batch(&provider, 1);
2950 let _elapsed = start.elapsed();
2951
2952 assert!(result.is_ok());
2953 }
2956
2957 #[test]
2959 fn test_page_type_all_combinations() {
2960 let analyzer = PageContentAnalyzer::new(create_mock_document());
2961
2962 assert_eq!(analyzer.determine_page_type(0.05, 0.85), PageType::Scanned);
2964 assert_eq!(analyzer.determine_page_type(0.0, 0.95), PageType::Scanned);
2965
2966 assert_eq!(analyzer.determine_page_type(0.75, 0.15), PageType::Text);
2968 assert_eq!(analyzer.determine_page_type(0.85, 0.0), PageType::Text);
2969
2970 assert_eq!(analyzer.determine_page_type(0.4, 0.4), PageType::Mixed);
2972 assert_eq!(analyzer.determine_page_type(0.3, 0.3), PageType::Mixed);
2973
2974 assert_eq!(analyzer.determine_page_type(0.5, 0.5), PageType::Mixed);
2976 assert_eq!(analyzer.determine_page_type(0.15, 0.75), PageType::Mixed);
2977 }
2978
2979 #[test]
2981 fn test_multiple_analyzers_shared_results() {
2982 let mut all_results = Vec::new();
2983
2984 for i in 0..3 {
2986 let doc = create_mock_document();
2987 let analyzer = PageContentAnalyzer::new(doc);
2988
2989 if let Ok(analysis) = analyzer.analyze_page(0) {
2990 all_results.push((i, analysis.page_type));
2991 }
2992 }
2993
2994 assert_eq!(all_results.len(), 3);
2995
2996 for (i, page_type) in &all_results {
2998 tracing::debug!("Analyzer {i} detected page type: {page_type:?}");
2999 }
3000 }
3001
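    // A provider that fails on every second call: batch processing is expected to
    // recover from the intermittent failures and still return Ok.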
3002 #[test]
3004 fn test_batch_processing_error_recovery() {
3005 let doc = create_mock_document();
3007 let analyzer = PageContentAnalyzer::new(doc);
3008
3009 struct IntermittentOcrProvider {
3011 fail_count: Mutex<usize>,
3012 }
3013
3014 impl OcrProvider for IntermittentOcrProvider {
3015 fn process_image(
3016 &self,
3017 data: &[u8],
3018 opts: &OcrOptions,
3019 ) -> Result<OcrProcessingResult, OcrError> {
3020 let mut count = self.fail_count.lock().unwrap();
3021 *count += 1;
3022
3023 if *count % 2 == 0 {
3024 Err(OcrError::ProcessingFailed(
3025 "Intermittent failure".to_string(),
3026 ))
3027 } else {
3028 MockOcrProvider::new().process_image(data, opts)
3029 }
3030 }
3031
3032 fn process_page(
3033 &self,
3034 _analysis: &ContentAnalysis,
3035 data: &[u8],
3036 opts: &OcrOptions,
3037 ) -> Result<OcrProcessingResult, OcrError> {
3038 self.process_image(data, opts)
3039 }
3040
3041 fn supported_formats(&self) -> Vec<crate::graphics::ImageFormat> {
3042 MockOcrProvider::new().supported_formats()
3043 }
3044
3045 fn engine_name(&self) -> &str {
3046 "Intermittent"
3047 }
3048
3049 fn engine_type(&self) -> crate::text::OcrEngine {
3050 crate::text::OcrEngine::Mock
3051 }
3052 }
3053
3054 let provider = IntermittentOcrProvider {
3055 fail_count: Mutex::new(0),
3056 };
3057
3058 let result = analyzer.process_scanned_pages_batch(&provider, 2);
3059 assert!(result.is_ok());
3060 }
3062
3063 #[test]
3065 fn test_memory_stress_multiple_analyses() {
3066 let doc = create_mock_document();
3067 let analyzer = PageContentAnalyzer::new(doc);
3068
3069 for _ in 0..100 {
3071 let result = analyzer.analyze_page(0);
3072 assert!(result.is_ok());
3073 }
3074
3075 for _ in 0..10 {
3077 let result = analyzer.analyze_document();
3078 assert!(result.is_ok());
3079 }
3080 }
3081
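    // An unrecognised OCR language code should not prevent processing from completing.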
3082 #[test]
3084 fn test_ocr_language_fallback() {
3085 let ocr_options = OcrOptions {
3086 language: "unknown_lang".to_string(),
3087 ..Default::default()
3088 };
3089
3090 let analysis_options = AnalysisOptions {
3091 ocr_options: Some(ocr_options),
3092 ..Default::default()
3093 };
3094
3095 let doc = create_mock_document();
3096 let analyzer = PageContentAnalyzer::with_options(doc, analysis_options);
3097 let provider = MockOcrProvider::new();
3098
3099 let result = analyzer.process_scanned_pages_with_ocr(&provider);
3101 assert!(result.is_ok());
3102 }
3103
3104 #[test]
3106 fn test_ocr_timeout_simulation() {
3107 let mut provider = MockOcrProvider::new();
        // Simulate a slow provider (100 ms per call) against a 1-second timeout.
        provider.set_processing_delay(100);
        let ocr_options = OcrOptions {
            timeout_seconds: 1,
            ..Default::default()
        };
3114
3115 let analysis_options = AnalysisOptions {
3116 ocr_options: Some(ocr_options),
3117 ..Default::default()
3118 };
3119
3120 let doc = create_mock_document();
3121 let analyzer = PageContentAnalyzer::with_options(doc, analysis_options);
3122
3123 let result = analyzer.process_scanned_pages_with_ocr(&provider);
3125 assert!(result.is_ok());
3126 }
3127
3128 #[test]
3130 fn test_zero_sized_image_filtering() {
3131 let doc = create_mock_document();
3132 let analyzer = PageContentAnalyzer::new(doc);
3133
3134 let result = analyzer.analyze_image_content(0);
3136 assert!(result.is_ok());
3137 let image_analysis = result.unwrap();
3138 assert_eq!(image_analysis.image_count, 0);
3139 assert_eq!(image_analysis.total_area, 0.0);
3140 }
3141
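    // Mixing a valid page index with usize::MAX: either the valid page is analysed and the
    // out-of-range one skipped, or the whole call fails with a page-related error.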
3142 #[test]
3144 fn test_page_numbers_boundary() {
3145 let doc = create_mock_document();
3146 let analyzer = PageContentAnalyzer::new(doc);
3147
3148 let page_numbers = vec![0, usize::MAX];
3150 let result = analyzer.analyze_pages(&page_numbers);
3151 if result.is_ok() {
3154 let analyses = result.unwrap();
            assert!(!analyses.is_empty());
3157 assert_eq!(analyses[0].page_number, 0);
3158 } else {
3159 assert!(result.unwrap_err().to_string().contains("Page"));
3161 }
3162 }
3163
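    // The mock provider should accept confidence values at both extremes (0.0 and 1.0).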
3164 #[test]
3166 fn test_ocr_confidence_boundaries() {
3167 let mut provider = MockOcrProvider::new();
3168
3169 let jpeg_data = [
3171 0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01,
3172 ];
3173
3174 provider.set_confidence(0.0);
3176 let result = provider.process_image(&jpeg_data, &OcrOptions::default());
3177 assert!(result.is_ok());
3178
3179 provider.set_confidence(1.0);
3181 let result = provider.process_image(&jpeg_data, &OcrOptions::default());
3182 assert!(result.is_ok());
3183
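        // Confidence below the requested min_confidence: the mock is still expected to return Ok.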
3184 let options = OcrOptions {
3186 min_confidence: 0.9,
3187 ..Default::default()
3188 };
3189 provider.set_confidence(0.5);
3190 let result = provider.process_image(&jpeg_data, &options);
3191 assert!(result.is_ok());
3193 }
3194
3195 #[test]
3197 fn test_ocr_processing_configurations() {
3198 let doc = create_mock_document();
3199 let analyzer = PageContentAnalyzer::new(doc);
3200 let provider = MockOcrProvider::new();
3201
3202 let result = analyzer.process_scanned_pages_with_ocr(&provider);
3204 assert!(result.is_ok());
3205
3206 for batch_size in [1, 3, 5, 10] {
3208 let result = analyzer.process_scanned_pages_batch(&provider, batch_size);
3209 assert!(result.is_ok());
3210 }
3211 }
3212
3213 #[test]
3215 fn test_custom_min_image_size() {
3216 let doc = create_mock_document();
3217 let custom_options = AnalysisOptions {
            // Raise min_image_size far above the default of 50 so small images are filtered out.
            min_image_size: 1000,
            ..Default::default()
        };
3221 let analyzer = PageContentAnalyzer::with_options(doc, custom_options);
3222
3223 let result = analyzer.analyze_image_content(0);
3224 assert!(result.is_ok());
3225 }
3227
3228 #[test]
3230 fn test_comprehensive_page_analysis() {
3231 let doc = create_mock_document();
3232 let analyzer = PageContentAnalyzer::new(doc);
3233
3234 let analysis = analyzer.analyze_page(0);
3235 assert!(analysis.is_ok());
3236
3237 let analysis = analysis.unwrap();
3238
        assert_eq!(analysis.page_number, 0);
3241 assert!(analysis.text_ratio >= 0.0 && analysis.text_ratio <= 1.0);
3242 assert!(analysis.image_ratio >= 0.0 && analysis.image_ratio <= 1.0);
3243 assert!(analysis.blank_space_ratio >= 0.0 && analysis.blank_space_ratio <= 1.0);
3244
3245 let total = analysis.text_ratio + analysis.image_ratio + analysis.blank_space_ratio;
3247 assert!((total - 1.0).abs() < 0.01);
3248 }
3249
3250 #[test]
3252 fn test_error_message_formatting() {
3253 let doc = create_mock_document();
3254 let analyzer = PageContentAnalyzer::new(doc);
3255 let provider = MockOcrProvider::new();
3256
3257 let result = analyzer.extract_text_from_scanned_page(0, &provider);
3259 assert!(result.is_err());
3260 let error_msg = result.unwrap_err().to_string();
3261 assert!(error_msg.contains("not a scanned page"));
3262 assert!(error_msg.contains("image ratio"));
3263 assert!(error_msg.contains("text ratio"));
3264 }
3265
3266 #[test]
3268 fn test_batch_size_edge_cases() {
3269 let doc = create_mock_document();
3270 let analyzer = PageContentAnalyzer::new(doc);
3271 let provider = MockOcrProvider::new();
3272
3273 let result = analyzer.process_scanned_pages_batch(&provider, 0);
3275 assert!(result.is_ok());
3276
3277 let result = analyzer.process_scanned_pages_batch(&provider, usize::MAX);
3279 assert!(result.is_ok());
3280 }
3281
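    // A provider that fails only on its first call and then delegates to the mock;
    // both the sequential and the batch entry points should cope with it.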
3282 #[test]
3284 fn test_ocr_provider_robustness() {
3285 struct UnreliableOcrProvider {
3287 call_count: Mutex<usize>,
3288 }
3289
3290 impl UnreliableOcrProvider {
3291 fn new() -> Self {
3292 UnreliableOcrProvider {
3293 call_count: Mutex::new(0),
3294 }
3295 }
3296 }
3297
3298 impl Clone for UnreliableOcrProvider {
3299 fn clone(&self) -> Self {
3300 UnreliableOcrProvider {
3301 call_count: Mutex::new(0),
3302 }
3303 }
3304 }
3305
3306 impl OcrProvider for UnreliableOcrProvider {
3307 fn process_image(
3308 &self,
3309 _: &[u8],
3310 _: &OcrOptions,
3311 ) -> Result<OcrProcessingResult, OcrError> {
3312 let mut count = self.call_count.lock().unwrap();
3313 *count += 1;
3314
3315 if *count == 1 {
3317 Err(OcrError::ProcessingFailed("Temporary failure".to_string()))
3318 } else {
3319 MockOcrProvider::new().process_image(&[0xFF, 0xD8], &OcrOptions::default())
3320 }
3321 }
3322
3323 fn process_page(
3324 &self,
3325 _: &ContentAnalysis,
3326 data: &[u8],
3327 opts: &OcrOptions,
3328 ) -> Result<OcrProcessingResult, OcrError> {
3329 self.process_image(data, opts)
3330 }
3331
3332 fn supported_formats(&self) -> Vec<crate::graphics::ImageFormat> {
3333 MockOcrProvider::new().supported_formats()
3334 }
3335
3336 fn engine_name(&self) -> &str {
3337 "Unreliable"
3338 }
3339
3340 fn engine_type(&self) -> crate::text::OcrEngine {
3341 crate::text::OcrEngine::Mock
3342 }
3343 }
3344
3345 let doc = create_mock_document();
3346 let analyzer = PageContentAnalyzer::new(doc);
3347 let provider = UnreliableOcrProvider::new();
3348
3349 let result = analyzer.process_scanned_pages_with_ocr(&provider);
3351 assert!(result.is_ok());
3352
3353 let result = analyzer.process_scanned_pages_batch(&provider, 2);
3355 assert!(result.is_ok());
3356 }
3357
3358 #[test]
3360 fn test_analysis_options_validation() {
3361 let options = AnalysisOptions {
            min_text_fragment_size: 0,
            min_image_size: 0,
            // Deliberately out-of-range thresholds; analysis should still succeed rather than panic.
            scanned_threshold: 1.5,
            text_threshold: -0.5,
            ocr_options: None,
        };
3369
3370 let doc = create_mock_document();
3371 let analyzer = PageContentAnalyzer::with_options(doc, options);
3372
3373 let result = analyzer.analyze_page(0);
3375 assert!(result.is_ok());
3376 }
3377
3378 #[test]
3380 fn test_ocr_result_aggregation() {
3381 let doc = create_mock_document();
3382 let analyzer = PageContentAnalyzer::new(doc);
3383 let mut provider = MockOcrProvider::new();
3384
3385 provider.set_mock_text("Page content from OCR".to_string());
3387 provider.set_confidence(0.85);
3388
3389 let results = analyzer.process_scanned_pages_with_ocr(&provider);
3390 assert!(results.is_ok());
3391
3392 let ocr_results = results.unwrap();
3393
3394 let total_chars: usize = ocr_results
3396 .iter()
3397 .map(|(_, result)| result.text.len())
3398 .sum();
3399 let avg_confidence: f64 = if !ocr_results.is_empty() {
3400 ocr_results
3401 .iter()
3402 .map(|(_, result)| result.confidence)
3403 .sum::<f64>()
3404 / ocr_results.len() as f64
3405 } else {
3406 0.0
3407 };
3408
        let _ = total_chars; // aggregated character count is informational only
        assert!((0.0..=1.0).contains(&avg_confidence));
3412 }
3413
3414 #[test]
3416 fn test_resource_cleanup() {
3417 for _ in 0..10 {
3419 let doc = create_mock_document();
3420 let analyzer = PageContentAnalyzer::new(doc);
3421 let _result = analyzer.analyze_document();
3422 }
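        // No assertions needed: completing every iteration without panicking is the success criterion.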
3424
3425 }
3428
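    // End-to-end smoke test: analyse the document, locate scanned pages, and run OCR through
    // both the sequential and the batch APIs, which should agree on the number of results.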
3429 #[test]
3431 fn test_complete_analysis_workflow() {
3432 let doc = create_mock_document();
3434 let analyzer = PageContentAnalyzer::new(doc);
3435
3436 let analyses = analyzer.analyze_document().unwrap();
3438 assert!(!analyses.is_empty());
3439
3440 let _scanned_pages = analyzer.find_scanned_pages().unwrap();
3442
3443 let _is_scanned = analyzer.is_scanned_page(0).unwrap();
3445
3446 let provider = MockOcrProvider::new();
3448 let ocr_results = analyzer.process_scanned_pages_with_ocr(&provider).unwrap();
3449
3450 let sequential_results = analyzer.process_scanned_pages_with_ocr(&provider).unwrap();
3452
3453 let batch_results = analyzer.process_scanned_pages_batch(&provider, 5).unwrap();
3455
3456 assert_eq!(ocr_results.len(), sequential_results.len());
3458 assert_eq!(ocr_results.len(), batch_results.len());
3459
3460 tracing::debug!(
3461 "Complete workflow test passed with {} pages analyzed",
3462 analyses.len()
3463 );
3464 }
3465}