1use super::{OperationError, OperationResult};
7use crate::graphics::ImageFormat;
8use crate::parser::objects::{PdfName, PdfObject, PdfStream};
9use crate::parser::{PdfDocument, PdfReader};
10use std::collections::HashMap;
11use std::fs::{self, File};
12use std::io::{Read, Seek, Write};
13use std::path::{Path, PathBuf};
14
15#[cfg(feature = "external-images")]
16use image::{DynamicImage, GenericImageView, ImageBuffer, ImageFormat as ImageLibFormat, Luma};
17
/// Six-component transformation matrix as read from a content-stream `cm`
/// operator (`a b c d e f cm`).
///
/// The fields keep the operand order of that operator.
#[derive(Debug, Clone)]
pub struct TransformMatrix {
    /// First operand; treated as a scale term by `is_simple_scale`.
    pub a: f64,
    /// Second operand; treated as a rotation term by `is_90_degree_rotation`.
    pub b: f64,
    /// Third operand; treated as a rotation term by `is_90_degree_rotation`.
    pub c: f64,
    /// Fourth operand; treated as a scale term by `is_simple_scale`.
    pub d: f64,
    /// Fifth operand (presumably the horizontal translation — confirm against the PDF spec).
    pub e: f64,
    /// Sixth operand (presumably the vertical translation — confirm against the PDF spec).
    pub f: f64,
}
30
31impl TransformMatrix {
32 fn new(a: f64, b: f64, c: f64, d: f64, e: f64, f: f64) -> Self {
33 Self { a, b, c, d, e, f }
34 }
35
36 #[allow(dead_code)]
38 fn is_90_degree_rotation(&self) -> bool {
39 self.a.abs() < 0.001 && self.d.abs() < 0.001 && self.b.abs() > 0.001 && self.c.abs() > 0.001
41 }
42
43 #[allow(dead_code)]
45 fn is_simple_scale(&self) -> bool {
46 self.b.abs() < 0.001 && self.c.abs() < 0.001 && self.a.abs() > 0.001 && self.d.abs() > 0.001
48 }
49
50 #[allow(dead_code)]
52 fn is_fis2_like_matrix(&self) -> bool {
53 (self.a - 841.68).abs() < 1.0
56 && (self.d - 595.08).abs() < 1.0
57 && self.b.abs() < 0.001
58 && self.c.abs() < 0.001
59 }
60}
61
/// Switches for the optional post-processing applied to extracted images.
///
/// The processing steps themselves are only compiled with the
/// `external-images` feature.
#[derive(Debug, Clone)]
pub struct ImagePreprocessingOptions {
    /// Rotate images by 90 degrees when they are more than twice as wide as tall.
    pub auto_correct_rotation: bool,
    /// Apply a contrast adjustment followed by a brightness increase.
    pub enhance_contrast: bool,
    /// Apply a light blur to reduce noise.
    pub denoise: bool,
    /// Upscale images whose smaller dimension is below `upscale_threshold`.
    pub upscale_small_images: bool,
    /// Dimension (in pixels) below which an image counts as small.
    pub upscale_threshold: u32,
    /// Multiplier applied to both dimensions when upscaling.
    pub upscale_factor: u32,
    /// Convert the processed image to 8-bit grayscale before encoding.
    pub force_grayscale: bool,
}
80
81impl Default for ImagePreprocessingOptions {
82 fn default() -> Self {
83 Self {
84 auto_correct_rotation: true,
85 enhance_contrast: true,
86 denoise: true,
87 upscale_small_images: true,
88 upscale_threshold: 300,
89 upscale_factor: 2,
90 force_grayscale: false,
91 }
92 }
93}
94
/// Options controlling where and how images are extracted.
#[derive(Debug, Clone)]
pub struct ExtractImagesOptions {
    /// Directory that receives the extracted image files.
    pub output_dir: PathBuf,
    /// File name template for XObject images; `{page}`, `{index}` and
    /// `{format}` are replaced with the 1-based page number, the 1-based
    /// image index and the file extension.
    pub name_pattern: String,
    /// Also scan page content streams for inline images.
    pub extract_inline: bool,
    /// When set, XObject images narrower or shorter than this many pixels are skipped.
    pub min_size: Option<u32>,
    /// Create `output_dir` (including parents) if it does not exist.
    pub create_dir: bool,
    /// Post-processing applied to extracted images.
    pub preprocessing: ImagePreprocessingOptions,
}
112
113impl Default for ExtractImagesOptions {
114 fn default() -> Self {
115 Self {
116 output_dir: PathBuf::from("."),
117 name_pattern: "page_{page}_image_{index}.{format}".to_string(),
118 extract_inline: true,
119 min_size: Some(10),
120 create_dir: true,
121 preprocessing: ImagePreprocessingOptions::default(),
122 }
123 }
124}
125
/// Description of one image written to disk by the extractor.
#[derive(Debug)]
pub struct ExtractedImage {
    /// Page index the image was found on, as passed to `extract_from_page`
    /// (`extract_all` supplies indices starting at 0).
    pub page_number: usize,
    /// Running index of the image within its page, starting at 0.
    pub image_index: usize,
    /// Location of the written file.
    pub file_path: PathBuf,
    /// Image width in pixels.
    pub width: u32,
    /// Image height in pixels.
    pub height: u32,
    /// Format of the written file.
    pub format: ImageFormat,
}
141
/// Extracts images from the pages of a parsed PDF document.
pub struct ImageExtractor<R: Read + Seek> {
    /// Document the images are read from.
    document: PdfDocument<R>,
    /// Extraction settings.
    options: ExtractImagesOptions,
    /// Maps the hex MD5 digest of already written image data to its file
    /// path; used for de-duplication.
    processed_images: HashMap<String, PathBuf>,
}
149
150impl<R: Read + Seek> ImageExtractor<R> {
151 pub fn new(document: PdfDocument<R>, options: ExtractImagesOptions) -> Self {
153 Self {
154 document,
155 options,
156 processed_images: HashMap::new(),
157 }
158 }
159
160 pub fn extract_all(&mut self) -> OperationResult<Vec<ExtractedImage>> {
162 if self.options.create_dir && !self.options.output_dir.exists() {
164 fs::create_dir_all(&self.options.output_dir)?;
165 }
166
167 let mut extracted_images = Vec::new();
168 let page_count = self
169 .document
170 .page_count()
171 .map_err(|e| OperationError::ParseError(e.to_string()))?;
172
173 for page_idx in 0..page_count {
174 let page_images = self.extract_from_page(page_idx as usize)?;
175 extracted_images.extend(page_images);
176 }
177
178 Ok(extracted_images)
179 }
180
181 pub fn extract_from_page(
183 &mut self,
184 page_number: usize,
185 ) -> OperationResult<Vec<ExtractedImage>> {
186 let mut extracted = Vec::new();
187
188 let page = self
190 .document
191 .get_page(page_number as u32)
192 .map_err(|e| OperationError::ParseError(e.to_string()))?;
193
194 let xobject_refs: Vec<(String, u32, u16)> = {
196 let resources = self
197 .document
198 .get_page_resources(&page)
199 .map_err(|e| OperationError::ParseError(e.to_string()))?;
200
201 let mut refs = Vec::new();
202
203 if let Some(resources) = resources {
204 if let Some(PdfObject::Dictionary(xobjects)) =
205 resources.0.get(&PdfName("XObject".to_string()))
206 {
207 for (name, obj_ref) in &xobjects.0 {
208 if let PdfObject::Reference(obj_num, gen_num) = obj_ref {
209 refs.push((name.0.clone(), *obj_num, *gen_num));
210 }
211 }
212 }
213 }
214
215 refs
216 };
217
218 let mut image_index = 0;
220 for (name, obj_num, gen_num) in xobject_refs {
221 if let Ok(xobject) = self.document.get_object(obj_num, gen_num) {
222 if let Some(extracted_image) =
223 self.process_xobject(&xobject, page_number, image_index, &name)?
224 {
225 extracted.push(extracted_image);
226 image_index += 1;
227 }
228 }
229 }
230
231 if extracted.is_empty() {
233 if let Ok(content_streams) = self.document.get_page_content_streams(&page) {
235 for stream_data in &content_streams {
236 let referenced_images = self.extract_referenced_images_from_content(
237 stream_data,
238 page_number,
239 &mut image_index,
240 )?;
241 extracted.extend(referenced_images);
242 }
243 }
244 }
245
246 if self.options.extract_inline {
248 if let Ok(parsed_page) = self.document.get_page(page_number as u32) {
249 if let Ok(content_streams) = self.document.get_page_content_streams(&parsed_page) {
250 for stream_data in &content_streams {
251 let inline_images = self.extract_inline_images_from_stream(
252 stream_data,
253 page_number,
254 &mut image_index,
255 )?;
256 extracted.extend(inline_images);
257 }
258 }
259 }
260 }
261
262 Ok(extracted)
263 }
264
265 fn process_xobject(
267 &mut self,
268 xobject: &PdfObject,
269 page_number: usize,
270 image_index: usize,
271 _name: &str,
272 ) -> OperationResult<Option<ExtractedImage>> {
273 if let PdfObject::Stream(stream) = xobject {
274 if let Some(PdfObject::Name(subtype)) =
276 stream.dict.0.get(&PdfName("Subtype".to_string()))
277 {
278 if subtype.0 == "Image" {
279 return self.extract_image_xobject(stream, page_number, image_index);
280 }
281 }
282 }
283 Ok(None)
284 }
285
286 fn extract_image_xobject(
288 &mut self,
289 stream: &PdfStream,
290 page_number: usize,
291 image_index: usize,
292 ) -> OperationResult<Option<ExtractedImage>> {
293 let width = match stream.dict.0.get(&PdfName("Width".to_string())) {
295 Some(PdfObject::Integer(w)) => *w as u32,
296 _ => return Ok(None),
297 };
298
299 let height = match stream.dict.0.get(&PdfName("Height".to_string())) {
300 Some(PdfObject::Integer(h)) => *h as u32,
301 _ => return Ok(None),
302 };
303
304 if let Some(min_size) = self.options.min_size {
306 if width < min_size || height < min_size {
307 return Ok(None);
308 }
309 }
310
311 let color_space = stream.dict.0.get(&PdfName("ColorSpace".to_string()));
313 let bits_per_component = match stream.dict.0.get(&PdfName("BitsPerComponent".to_string())) {
314 Some(PdfObject::Integer(bits)) => *bits as u8,
315 _ => 8, };
317
318 let parse_options = self.document.options();
320 let mut data = stream.decode(&parse_options).map_err(|e| {
321 OperationError::ParseError(format!("Failed to decode image stream: {e}"))
322 })?;
323
324 let format = match stream.dict.0.get(&PdfName("Filter".to_string())) {
326 Some(PdfObject::Name(filter)) => match filter.0.as_str() {
327 "DCTDecode" => {
328 data = stream.data.clone();
331 ImageFormat::Jpeg
332 }
333 "FlateDecode" => {
334 data = self.convert_raw_image_data_to_png(
336 &data,
337 width,
338 height,
339 color_space,
340 bits_per_component,
341 )?;
342 ImageFormat::Png
343 }
344 "CCITTFaxDecode" => {
345 data = self.convert_ccitt_to_png(&data, width, height)?;
347 ImageFormat::Png
348 }
349 "LZWDecode" => {
350 data = self.convert_raw_image_data_to_png(
352 &data,
353 width,
354 height,
355 color_space,
356 bits_per_component,
357 )?;
358 ImageFormat::Png
359 }
360 _ => {
361 tracing::debug!("Unsupported image filter: {}", filter.0);
362 return Ok(None);
363 }
364 },
365 Some(PdfObject::Array(filters)) => {
366 if let Some(PdfObject::Name(filter)) = filters.0.first() {
368 match filter.0.as_str() {
369 "DCTDecode" => {
370 data = stream.data.clone();
372 ImageFormat::Jpeg
373 }
374 "FlateDecode" => {
375 data = self.convert_raw_image_data_to_png(
376 &data,
377 width,
378 height,
379 color_space,
380 bits_per_component,
381 )?;
382 ImageFormat::Png
383 }
384 "CCITTFaxDecode" => {
385 data = self.convert_ccitt_to_png(&data, width, height)?;
386 ImageFormat::Png
387 }
388 "LZWDecode" => {
389 data = self.convert_raw_image_data_to_png(
390 &data,
391 width,
392 height,
393 color_space,
394 bits_per_component,
395 )?;
396 ImageFormat::Png
397 }
398 _ => {
399 tracing::debug!("Unsupported image filter: {}", filter.0);
400 return Ok(None);
401 }
402 }
403 } else {
404 return Ok(None);
405 }
406 }
407 _ => {
408 data = self.convert_raw_image_data_to_png(
410 &data,
411 width,
412 height,
413 color_space,
414 bits_per_component,
415 )?;
416 ImageFormat::Png
417 }
418 };
419
420 let image_key = format!("{:x}", md5::compute(&data));
422
423 let allow_deduplication = !self.options.name_pattern.contains("{page}");
427
428 if allow_deduplication {
430 if let Some(existing_path) = self.processed_images.get(&image_key) {
431 return Ok(Some(ExtractedImage {
433 page_number,
434 image_index,
435 file_path: existing_path.clone(),
436 width,
437 height,
438 format,
439 }));
440 }
441 }
442
443 let extension = match format {
445 ImageFormat::Jpeg => "jpg",
446 ImageFormat::Png => "png",
447 ImageFormat::Tiff => "tiff",
448 ImageFormat::Raw => "rgb",
449 };
450
451 let filename = self
452 .options
453 .name_pattern
454 .replace("{page}", &(page_number + 1).to_string())
455 .replace("{index}", &(image_index + 1).to_string())
456 .replace("{format}", extension);
457
458 let output_path = self.options.output_dir.join(filename);
459
460 #[cfg(feature = "external-images")]
462 let processed_data = if self.should_preprocess() {
463 self.preprocess_image_data(&data, width, height, format)?
464 } else {
465 data
466 };
467
468 #[cfg(not(feature = "external-images"))]
469 let processed_data = data;
470
471 let mut file = File::create(&output_path)?;
473 file.write_all(&processed_data)?;
474
475 self.processed_images.insert(image_key, output_path.clone());
477
478 Ok(Some(ExtractedImage {
479 page_number,
480 image_index,
481 file_path: output_path,
482 width,
483 height,
484 format,
485 }))
486 }
487
488 fn detect_image_format_from_data(&self, data: &[u8]) -> OperationResult<ImageFormat> {
490 if data.is_empty() {
491 return Err(OperationError::ParseError(
492 "Image data too short to detect format".to_string(),
493 ));
494 }
495
496 if data.len() >= 8 && &data[0..8] == b"\x89PNG\r\n\x1a\n" {
498 return Ok(ImageFormat::Png);
499 }
500
501 if data.len() >= 4 {
503 if &data[0..2] == b"II" && &data[2..4] == b"\x2A\x00" {
504 return Ok(ImageFormat::Tiff); }
506 if &data[0..2] == b"MM" && &data[2..4] == b"\x00\x2A" {
507 return Ok(ImageFormat::Tiff); }
509 }
510
511 if data.len() >= 2 && data[0] == 0xFF && data[1] == 0xD8 {
513 return Ok(ImageFormat::Jpeg);
514 }
515
516 if data.len() < 2 {
518 return Err(OperationError::ParseError(
519 "Image data too short to detect format".to_string(),
520 ));
521 }
522
523 Ok(ImageFormat::Png)
526 }
527
528 fn extract_inline_images_from_stream(
530 &mut self,
531 stream_data: &[u8],
532 page_number: usize,
533 image_index: &mut usize,
534 ) -> OperationResult<Vec<ExtractedImage>> {
535 let mut inline_images = Vec::new();
536
537 let stream_str = String::from_utf8_lossy(stream_data);
539
540 let mut pos = 0;
542 while let Some(bi_pos) = stream_str[pos..].find("BI") {
543 let absolute_bi_pos = pos + bi_pos;
544
545 if let Some(relative_id_pos) = stream_str[absolute_bi_pos..].find("ID") {
547 let absolute_id_pos = absolute_bi_pos + relative_id_pos;
548
549 if let Some(relative_ei_pos) = stream_str[absolute_id_pos..].find("EI") {
551 let absolute_ei_pos = absolute_id_pos + relative_ei_pos;
552
553 let dict_section = &stream_str[absolute_bi_pos + 2..absolute_id_pos].trim();
555
556 let data_start = absolute_id_pos + 2;
558 let data_end = absolute_ei_pos;
559
560 if data_start < data_end && data_end <= stream_data.len() {
561 let image_data = &stream_data[data_start..data_end];
562
563 let (width, height) = self.parse_inline_image_dict(dict_section);
565
566 if let Ok(extracted_image) = self.save_inline_image(
568 image_data,
569 page_number,
570 *image_index,
571 width,
572 height,
573 ) {
574 inline_images.push(extracted_image);
575 *image_index += 1;
576 }
577 }
578
579 pos = absolute_ei_pos + 2;
581 } else {
582 break; }
584 } else {
585 break; }
587 }
588
589 Ok(inline_images)
590 }
591
592 fn extract_referenced_images_from_content(
594 &mut self,
595 stream_data: &[u8],
596 page_number: usize,
597 image_index: &mut usize,
598 ) -> OperationResult<Vec<ExtractedImage>> {
599 let mut extracted = Vec::new();
600
601 let content = String::from_utf8_lossy(stream_data);
603
604 tracing::debug!(" Content: {}", content);
605
606 let image_with_transform = self.parse_images_with_transformations(&content)?;
609
610 for (image_name, transform_matrix) in image_with_transform {
611 if let Some(mut extracted_image) =
613 self.find_and_extract_xobject_by_name(&image_name, page_number, *image_index)?
614 {
615 if let Some(matrix) = transform_matrix {
617 extracted_image =
618 self.apply_transformation_to_image(extracted_image, &matrix)?;
619 }
620
621 extracted.push(extracted_image);
622 *image_index += 1;
623 }
624 }
625
626 Ok(extracted)
627 }
628
629 fn find_and_extract_xobject_by_name(
631 &mut self,
632 name: &str,
633 page_number: usize,
634 image_index: usize,
635 ) -> OperationResult<Option<ExtractedImage>> {
636 for obj_num in 1..1000 {
643 if let Ok(obj) = self.document.get_object(obj_num, 0) {
644 if let Some(extracted) =
645 self.try_extract_image_from_object(&obj, page_number, image_index, name)?
646 {
647 return Ok(Some(extracted));
648 }
649 }
650 }
651
652 Ok(None)
653 }
654
655 fn try_extract_image_from_object(
657 &mut self,
658 obj: &PdfObject,
659 page_number: usize,
660 image_index: usize,
661 _expected_name: &str,
662 ) -> OperationResult<Option<ExtractedImage>> {
663 if let PdfObject::Stream(stream) = obj {
664 if let Some(PdfObject::Name(subtype)) =
666 stream.dict.0.get(&PdfName("Subtype".to_string()))
667 {
668 if subtype.0 == "Image" {
669 return self.extract_image_xobject(stream, page_number, image_index);
670 }
671 }
672
673 if let Some(PdfObject::Integer(_width)) =
675 stream.dict.0.get(&PdfName("Width".to_string()))
676 {
677 if let Some(PdfObject::Integer(_height)) =
678 stream.dict.0.get(&PdfName("Height".to_string()))
679 {
680 return self.extract_image_xobject(stream, page_number, image_index);
681 }
682 }
683 }
684
685 Ok(None)
686 }
687
688 fn parse_images_with_transformations(
690 &self,
691 content: &str,
692 ) -> OperationResult<Vec<(String, Option<TransformMatrix>)>> {
693 let mut results = Vec::new();
694 let lines: Vec<&str> = content.lines().collect();
695
696 let mut current_matrix: Option<TransformMatrix> = None;
697
698 for line in lines {
699 let line = line.trim();
700
701 if line.ends_with(" cm") {
703 let parts: Vec<&str> = line.split_whitespace().collect();
704 if parts.len() == 7 && parts[6] == "cm" {
705 if let (Ok(a), Ok(b), Ok(c), Ok(d), Ok(e), Ok(f)) = (
707 parts[0].parse::<f64>(),
708 parts[1].parse::<f64>(),
709 parts[2].parse::<f64>(),
710 parts[3].parse::<f64>(),
711 parts[4].parse::<f64>(),
712 parts[5].parse::<f64>(),
713 ) {
714 current_matrix = Some(TransformMatrix::new(a, b, c, d, e, f));
715 }
716 }
717 }
718
719 if line.contains(" Do") {
721 let parts: Vec<&str> = line.split_whitespace().collect();
722 for part in parts {
723 if part.starts_with('/') && !part.contains("Do") {
724 let image_name = part[1..].to_string(); results.push((image_name, current_matrix.clone()));
726 }
727 }
728 }
729
730 if line.trim() == "Q" {
732 current_matrix = None;
733 }
734 }
735
736 Ok(results)
737 }
738
739 #[allow(unused_mut)]
741 fn apply_transformation_to_image(
742 &self,
743 mut extracted_image: ExtractedImage,
744 _matrix: &TransformMatrix,
745 ) -> OperationResult<ExtractedImage> {
746 #[cfg(feature = "external-images")]
747 {
748 let image_data = std::fs::read(&extracted_image.file_path)?;
750
751 let img = image::load_from_memory(&image_data).map_err(|e| {
753 OperationError::ParseError(format!("Failed to load image for transformation: {e}"))
754 })?;
755
756 let transformed_img =
758 self.fix_stride_problem(img, extracted_image.width, extracted_image.height)?;
759
760 let output_filename = extracted_image
762 .file_path
763 .file_stem()
764 .and_then(|s| s.to_str())
765 .ok_or_else(|| OperationError::InvalidPath {
766 reason: format!(
767 "Image path has no valid filename: {:?}",
768 extracted_image.file_path
769 ),
770 })?;
771 let output_extension = extracted_image
772 .file_path
773 .extension()
774 .and_then(|s| s.to_str())
775 .ok_or_else(|| OperationError::InvalidPath {
776 reason: format!(
777 "Image path has no valid extension: {:?}",
778 extracted_image.file_path
779 ),
780 })?;
781
782 let parent_dir =
783 extracted_image
784 .file_path
785 .parent()
786 .ok_or_else(|| OperationError::InvalidPath {
787 reason: format!(
788 "Image path has no parent directory: {:?}",
789 extracted_image.file_path
790 ),
791 })?;
792 let transformed_path = parent_dir.join(format!(
793 "{}_transformed.{}",
794 output_filename, output_extension
795 ));
796
797 transformed_img.save(&transformed_path).map_err(|e| {
798 OperationError::ParseError(format!("Failed to save transformed image: {e}"))
799 })?;
800
801 let (new_width, new_height) = transformed_img.dimensions();
803 extracted_image.file_path = transformed_path;
804 extracted_image.width = new_width;
805 extracted_image.height = new_height;
806 }
807
808 #[cfg(not(feature = "external-images"))]
809 {}
810
811 Ok(extracted_image)
812 }
813
814 #[cfg(feature = "external-images")]
816 #[allow(dead_code)]
817 fn apply_rotation_transformation(
818 &self,
819 img: DynamicImage,
820 matrix: &TransformMatrix,
821 ) -> OperationResult<DynamicImage> {
822 if matrix.b > 0.0 && matrix.c < 0.0 {
827 Ok(img.rotate90()) } else if matrix.b < 0.0 && matrix.c > 0.0 {
829 Ok(img.rotate270()) } else {
831 Ok(img.rotate90())
833 }
834 }
835
836 #[cfg(feature = "external-images")]
838 #[allow(dead_code)]
839 fn apply_scale_transformation(
840 &self,
841 img: DynamicImage,
842 matrix: &TransformMatrix,
843 ) -> OperationResult<DynamicImage> {
844 let (current_width, current_height) = img.dimensions();
845
846 let new_width = (current_width as f64 * matrix.a.abs()) as u32;
848 let new_height = (current_height as f64 * matrix.d.abs()) as u32;
849
850 if new_width > 0 && new_height > 0 {
851 Ok(img.resize(new_width, new_height, image::imageops::FilterType::Lanczos3))
852 } else {
853 Ok(img)
855 }
856 }
857
858 #[cfg(feature = "external-images")]
860 fn fix_stride_problem(
861 &self,
862 img: DynamicImage,
863 original_width: u32,
864 original_height: u32,
865 ) -> OperationResult<DynamicImage> {
866 let gray_img = img.to_luma8();
868 let pixel_data = gray_img.as_raw();
869
870 let bytes_per_row = original_width as usize;
872 let min_bytes_per_row = bytes_per_row;
873
874 let possible_strides = [
876 min_bytes_per_row, (min_bytes_per_row + 1) & !1, (min_bytes_per_row + 3) & !3, (min_bytes_per_row + 7) & !7, (min_bytes_per_row + 15) & !15, min_bytes_per_row + 1, min_bytes_per_row + 2, min_bytes_per_row + 4, ];
885
886 for (_i, &stride) in possible_strides.iter().enumerate() {
887 let expected_total = stride * original_height as usize;
888
889 if expected_total <= pixel_data.len() {
890 let mut corrected_data = Vec::new();
892 for row in 0..original_height {
893 let row_start = row as usize * stride;
894 let row_end = row_start + bytes_per_row;
895
896 if row_end <= pixel_data.len() {
897 corrected_data.extend_from_slice(&pixel_data[row_start..row_end]);
898 } else {
899 corrected_data.resize(corrected_data.len() + bytes_per_row, 255);
901 }
902 }
903
904 if corrected_data.len() == (original_width * original_height) as usize {
906 if let Some(corrected_img) = ImageBuffer::<Luma<u8>, Vec<u8>>::from_raw(
907 original_width,
908 original_height,
909 corrected_data,
910 ) {
911 return Ok(DynamicImage::ImageLuma8(corrected_img));
912 }
913 }
914 } else {
915 }
916 }
917
918 Ok(img)
919 }
920
921 fn parse_inline_image_dict(&self, dict_str: &str) -> (u32, u32) {
923 let mut width = 100; let mut height = 100; for line in dict_str.lines() {
928 let line = line.trim();
929
930 if line.starts_with("/W ") || line.starts_with("/Width ") {
932 if let Some(value_str) = line.split_whitespace().nth(1) {
933 if let Ok(w) = value_str.parse::<u32>() {
934 width = w;
935 }
936 }
937 }
938
939 if line.starts_with("/H ") || line.starts_with("/Height ") {
941 if let Some(value_str) = line.split_whitespace().nth(1) {
942 if let Ok(h) = value_str.parse::<u32>() {
943 height = h;
944 }
945 }
946 }
947 }
948
949 (width, height)
950 }
951
952 fn save_inline_image(
954 &mut self,
955 data: &[u8],
956 page_number: usize,
957 image_index: usize,
958 width: u32,
959 height: u32,
960 ) -> OperationResult<ExtractedImage> {
961 let image_key = format!("{:x}", md5::compute(data));
963
964 let allow_deduplication = !self.options.name_pattern.contains("{page}");
966
967 if allow_deduplication {
969 if let Some(existing_path) = self.processed_images.get(&image_key) {
970 return Ok(ExtractedImage {
971 page_number,
972 image_index,
973 file_path: existing_path.clone(),
974 width,
975 height,
976 format: ImageFormat::Raw, });
978 }
979 }
980
981 let format = self
983 .detect_image_format_from_data(data)
984 .unwrap_or(ImageFormat::Raw);
985 let extension = match format {
986 ImageFormat::Jpeg => "jpg",
987 ImageFormat::Png => "png",
988 ImageFormat::Tiff => "tif",
989 ImageFormat::Raw => "raw",
990 };
991
992 let filename = format!(
994 "inline_page_{}_{:03}.{}",
995 page_number + 1,
996 image_index + 1,
997 extension
998 );
999 let file_path = self.options.output_dir.join(filename);
1000
1001 fs::write(&file_path, data)?;
1003
1004 self.processed_images.insert(image_key, file_path.clone());
1006
1007 Ok(ExtractedImage {
1008 page_number,
1009 image_index,
1010 file_path,
1011 width,
1012 height,
1013 format,
1014 })
1015 }
1016
1017 fn convert_raw_image_data_to_png(
1019 &self,
1020 data: &[u8],
1021 width: u32,
1022 height: u32,
1023 color_space: Option<&PdfObject>,
1024 bits_per_component: u8,
1025 ) -> OperationResult<Vec<u8>> {
1026 let (components, _channels) = match color_space {
1028 Some(PdfObject::Name(cs)) => match cs.0.as_str() {
1029 "DeviceGray" => (1, 1),
1030 "DeviceRGB" => (3, 3),
1031 "DeviceCMYK" => (4, 4),
1032 _ => (3, 3), },
1034 Some(PdfObject::Array(cs_array)) if !cs_array.0.is_empty() => {
1035 if let Some(PdfObject::Name(cs_name)) = cs_array.0.first() {
1036 match cs_name.0.as_str() {
1037 "ICCBased" | "CalRGB" => (3, 3),
1038 "CalGray" => (1, 1),
1039 _ => (3, 3),
1040 }
1041 } else {
1042 (3, 3)
1043 }
1044 }
1045 _ => (3, 3), };
1047
1048 let bytes_per_sample = if bits_per_component <= 8 { 1 } else { 2 };
1050 let expected_size = (width * height * components as u32 * bytes_per_sample as u32) as usize;
1051
1052 if data.len() < expected_size {
1054 return Err(OperationError::ParseError(format!(
1055 "Image data too small: expected {}, got {}",
1056 expected_size,
1057 data.len()
1058 )));
1059 }
1060
1061 self.create_png_from_raw_data(data, width, height, components, bits_per_component)
1063 }
1064
1065 fn create_png_from_raw_data(
1067 &self,
1068 data: &[u8],
1069 width: u32,
1070 height: u32,
1071 components: u8,
1072 bits_per_component: u8,
1073 ) -> OperationResult<Vec<u8>> {
1074 let mut png_data = Vec::new();
1076
1077 png_data.extend_from_slice(&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]);
1079
1080 let mut ihdr = Vec::new();
1082 ihdr.extend_from_slice(&width.to_be_bytes());
1083 ihdr.extend_from_slice(&height.to_be_bytes());
1084 ihdr.push(bits_per_component);
1085
1086 let color_type = match components {
1088 1 => 0, 3 => 2, 4 => 6, _ => 2, };
1093 ihdr.push(color_type);
1094 ihdr.push(0); ihdr.push(0); ihdr.push(0); self.write_png_chunk(&mut png_data, b"IHDR", &ihdr);
1099
1100 let compressed_data = self.compress_image_data(data, width, height, components)?;
1102 self.write_png_chunk(&mut png_data, b"IDAT", &compressed_data);
1103
1104 self.write_png_chunk(&mut png_data, b"IEND", &[]);
1106
1107 Ok(png_data)
1108 }
1109
1110 fn write_png_chunk(&self, output: &mut Vec<u8>, chunk_type: &[u8; 4], data: &[u8]) {
1112 output.extend_from_slice(&(data.len() as u32).to_be_bytes());
1114
1115 output.extend_from_slice(chunk_type);
1117
1118 output.extend_from_slice(data);
1120
1121 let crc = self.calculate_crc32(chunk_type, data);
1123 output.extend_from_slice(&crc.to_be_bytes());
1124 }
1125
1126 fn calculate_crc32(&self, chunk_type: &[u8; 4], data: &[u8]) -> u32 {
1128 let mut crc: u32 = 0xFFFFFFFF;
1130
1131 for &byte in chunk_type {
1133 crc ^= byte as u32;
1134 for _ in 0..8 {
1135 if crc & 1 != 0 {
1136 crc = (crc >> 1) ^ 0xEDB88320;
1137 } else {
1138 crc >>= 1;
1139 }
1140 }
1141 }
1142
1143 for &byte in data {
1145 crc ^= byte as u32;
1146 for _ in 0..8 {
1147 if crc & 1 != 0 {
1148 crc = (crc >> 1) ^ 0xEDB88320;
1149 } else {
1150 crc >>= 1;
1151 }
1152 }
1153 }
1154
1155 crc ^ 0xFFFFFFFF
1156 }
1157
1158 fn compress_image_data(
1160 &self,
1161 data: &[u8],
1162 width: u32,
1163 height: u32,
1164 components: u8,
1165 ) -> OperationResult<Vec<u8>> {
1166 use flate2::write::ZlibEncoder;
1167 use flate2::Compression;
1168 use std::io::Write;
1169
1170 let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
1171
1172 let bytes_per_pixel = components as usize;
1174 let bytes_per_row = width as usize * bytes_per_pixel;
1175
1176 for row in 0..height {
1177 encoder.write_all(&[0])?;
1179
1180 let start = row as usize * bytes_per_row;
1182 let end = start + bytes_per_row;
1183 if end <= data.len() {
1184 encoder.write_all(&data[start..end])?;
1185 }
1186 }
1187
1188 encoder
1189 .finish()
1190 .map_err(|e| OperationError::ParseError(format!("Failed to compress PNG data: {e}")))
1191 }
1192
1193 fn convert_ccitt_to_png(
1195 &self,
1196 data: &[u8],
1197 width: u32,
1198 height: u32,
1199 ) -> OperationResult<Vec<u8>> {
1200 let mut rgb_data = Vec::new();
1203
1204 let bits_per_row = width as usize;
1206 let min_bytes_per_row = bits_per_row.div_ceil(8);
1207
1208 let possible_strides = [
1210 min_bytes_per_row, (min_bytes_per_row + 1) & !1, (min_bytes_per_row + 3) & !3, (min_bytes_per_row + 7) & !7, (min_bytes_per_row + 15) & !15, ];
1216
1217 let correct_stride =
1219 self.detect_correct_row_stride(data, width, height, &possible_strides)?;
1220
1221 for row in 0..height {
1222 let row_start = row as usize * correct_stride;
1223
1224 for col in 0..width {
1225 let byte_idx = row_start + (col as usize / 8);
1226 let bit_idx = 7 - (col as usize % 8);
1227
1228 if byte_idx < data.len() {
1229 let bit = (data[byte_idx] >> bit_idx) & 1;
1230 let gray_value = if bit == 0 { 0 } else { 255 };
1232 rgb_data.push(gray_value);
1233 } else {
1234 rgb_data.push(255); }
1236 }
1237 }
1238
1239 self.create_png_from_raw_data(&rgb_data, width, height, 1, 8)
1241 }
1242
1243 fn detect_correct_row_stride(
1245 &self,
1246 data: &[u8],
1247 width: u32,
1248 height: u32,
1249 possible_strides: &[usize],
1250 ) -> OperationResult<usize> {
1251 let bits_per_row = width as usize;
1252 let min_bytes_per_row = bits_per_row.div_ceil(8);
1253
1254 if data.len() < min_bytes_per_row * 3 {
1256 return Ok(min_bytes_per_row);
1257 }
1258
1259 for &stride in possible_strides {
1261 let expected_size = stride * height as usize;
1262
1263 if expected_size <= data.len() && (data.len() - expected_size) < stride * 2 {
1265 return Ok(stride);
1268 }
1269 }
1270
1271 let calculated_stride = data.len() / height as usize;
1273 if calculated_stride >= min_bytes_per_row {
1274 return Ok(calculated_stride);
1275 }
1276
1277 Ok(min_bytes_per_row)
1279 }
1280
1281 #[allow(dead_code)]
1283 fn should_preprocess(&self) -> bool {
1284 self.options.preprocessing.auto_correct_rotation
1285 || self.options.preprocessing.enhance_contrast
1286 || self.options.preprocessing.denoise
1287 || self.options.preprocessing.upscale_small_images
1288 || self.options.preprocessing.force_grayscale
1289 }
1290
1291 #[cfg(feature = "external-images")]
1293 fn preprocess_image_data(
1294 &self,
1295 data: &[u8],
1296 width: u32,
1297 height: u32,
1298 format: ImageFormat,
1299 ) -> OperationResult<Vec<u8>> {
1300 let img_format = match format {
1302 ImageFormat::Jpeg => ImageLibFormat::Jpeg,
1303 ImageFormat::Png => ImageLibFormat::Png,
1304 ImageFormat::Tiff => ImageLibFormat::Tiff,
1305 ImageFormat::Raw => {
1306 return self.preprocess_raw_image_data(data, width, height);
1308 }
1309 };
1310
1311 let img = image::load_from_memory_with_format(data, img_format)
1312 .map_err(|e| OperationError::ParseError(format!("Failed to load image: {e}")))?;
1313
1314 let mut processed_img = img;
1315
1316 processed_img = self.apply_rotation_correction(processed_img)?;
1318 processed_img = self.apply_contrast_enhancement(processed_img)?;
1319 processed_img = self.apply_noise_reduction(processed_img)?;
1320 processed_img = self.apply_upscaling(processed_img, width, height)?;
1321
1322 if self.options.preprocessing.force_grayscale {
1323 processed_img = DynamicImage::ImageLuma8(processed_img.to_luma8());
1324 }
1325
1326 let mut output = Vec::new();
1328 processed_img
1329 .write_to(&mut std::io::Cursor::new(&mut output), img_format)
1330 .map_err(|e| OperationError::ParseError(format!("Failed to encode image: {e}")))?;
1331
1332 Ok(output)
1333 }
1334
1335 #[cfg(feature = "external-images")]
1337 fn preprocess_raw_image_data(
1338 &self,
1339 data: &[u8],
1340 width: u32,
1341 height: u32,
1342 ) -> OperationResult<Vec<u8>> {
1343 if data.len() < (width * height) as usize {
1345 return Err(OperationError::ParseError(
1346 "Raw image data too small".to_string(),
1347 ));
1348 }
1349
1350 let img_buffer = ImageBuffer::<Luma<u8>, Vec<u8>>::from_raw(
1351 width,
1352 height,
1353 data[..(width * height) as usize].to_vec(),
1354 )
1355 .ok_or_else(|| OperationError::ParseError("Failed to create image buffer".to_string()))?;
1356
1357 let img = DynamicImage::ImageLuma8(img_buffer);
1358 let mut processed_img = img;
1359
1360 processed_img = self.apply_rotation_correction(processed_img)?;
1362 processed_img = self.apply_contrast_enhancement(processed_img)?;
1363 processed_img = self.apply_noise_reduction(processed_img)?;
1364 processed_img = self.apply_upscaling(processed_img, width, height)?;
1365
1366 let mut output = Vec::new();
1368 processed_img
1369 .write_to(&mut std::io::Cursor::new(&mut output), ImageLibFormat::Png)
1370 .map_err(|e| OperationError::ParseError(format!("Failed to encode image: {e}")))?;
1371
1372 Ok(output)
1373 }
1374
1375 #[cfg(feature = "external-images")]
1377 fn apply_rotation_correction(&self, img: DynamicImage) -> OperationResult<DynamicImage> {
1378 if !self.options.preprocessing.auto_correct_rotation {
1379 return Ok(img);
1380 }
1381
1382 let (width, height) = img.dimensions();
1384
1385 if width > height * 2 {
1388 return Ok(img.rotate90());
1390 }
1391
1392 Ok(img)
1395 }
1396
1397 #[cfg(feature = "external-images")]
1399 fn apply_contrast_enhancement(&self, img: DynamicImage) -> OperationResult<DynamicImage> {
1400 if !self.options.preprocessing.enhance_contrast {
1401 return Ok(img);
1402 }
1403
1404 let enhanced = img.adjust_contrast(20.0); Ok(enhanced.brighten(10)) }
1408
1409 #[cfg(feature = "external-images")]
1411 fn apply_noise_reduction(&self, img: DynamicImage) -> OperationResult<DynamicImage> {
1412 if !self.options.preprocessing.denoise {
1413 return Ok(img);
1414 }
1415
1416 Ok(img.blur(0.5))
1418 }
1419
/// Enlarges small images by `upscale_factor` using Catmull-Rom resampling.
///
/// Upscaling happens only when `upscale_small_images` is enabled, the factor is greater
/// than 1, and the smaller of `original_width` / `original_height` is below
/// `upscale_threshold`. In every other case the image is returned unchanged.
///
/// The target size is derived from the *current* dimensions of `img` rather than from the
/// `original_*` arguments. The caller runs rotation correction before this step, and
/// `DynamicImage::resize` fits its result inside the requested box while keeping the
/// aspect ratio, so a box built from the pre-rotation dimensions would shrink a rotated
/// image instead of enlarging it. For an image that was not rotated both computations
/// give the same box.
///
/// # Errors
///
/// Returns `OperationError::ParseError` when a scaled dimension does not fit in a `u32`.
#[cfg(feature = "external-images")]
fn apply_upscaling(
    &self,
    img: DynamicImage,
    original_width: u32,
    original_height: u32,
) -> OperationResult<DynamicImage> {
    let settings = &self.options.preprocessing;
    if !settings.upscale_small_images {
        return Ok(img);
    }

    // A factor of 0 would collapse the image and a factor of 1 is a no-op resample,
    // so neither counts as upscaling.
    let factor = settings.upscale_factor;
    if factor <= 1 {
        return Ok(img);
    }

    // The smaller side is the same before and after a 90-degree rotation, so the
    // caller-supplied dimensions are still valid for the threshold test.
    if original_width.min(original_height) >= settings.upscale_threshold {
        return Ok(img);
    }

    let overflow = || {
        OperationError::ParseError("Upscaled image dimensions exceed u32 range".to_string())
    };
    let (current_width, current_height) = img.dimensions();
    let new_width = current_width.checked_mul(factor).ok_or_else(overflow)?;
    let new_height = current_height.checked_mul(factor).ok_or_else(overflow)?;

    Ok(img.resize(
        new_width,
        new_height,
        image::imageops::FilterType::CatmullRom,
    ))
}
1446}
1447
1448pub fn extract_images_from_pdf<P: AsRef<Path>>(
1450 input_path: P,
1451 options: ExtractImagesOptions,
1452) -> OperationResult<Vec<ExtractedImage>> {
1453 let document = PdfReader::open_document(input_path)
1454 .map_err(|e| OperationError::ParseError(e.to_string()))?;
1455
1456 let mut extractor = ImageExtractor::new(document, options);
1457 extractor.extract_all()
1458}
1459
1460pub fn extract_images_from_pages<P: AsRef<Path>>(
1462 input_path: P,
1463 pages: &[usize],
1464 options: ExtractImagesOptions,
1465) -> OperationResult<Vec<ExtractedImage>> {
1466 let document = PdfReader::open_document(input_path)
1467 .map_err(|e| OperationError::ParseError(e.to_string()))?;
1468
1469 let mut extractor = ImageExtractor::new(document, options);
1470 let mut all_images = Vec::new();
1471
1472 for &page_num in pages {
1473 let page_images = extractor.extract_from_page(page_num)?;
1474 all_images.extend(page_images);
1475 }
1476
1477 Ok(all_images)
1478}
1479
#[cfg(test)]
mod tests {
    //! Unit tests for the extraction options, the `ExtractedImage` record, file-name
    //! pattern expansion and signature-based image format detection.

    use super::*;
    use tempfile::TempDir;

    /// Expands `{page}`, `{index}` and `{format}` in `options.name_pattern`, in that order.
    fn expand_pattern(
        options: &ExtractImagesOptions,
        page: &str,
        index: &str,
        format: &str,
    ) -> String {
        options
            .name_pattern
            .replace("{page}", page)
            .replace("{index}", index)
            .replace("{format}", format)
    }

    /// Writes a tiny, page-less PDF (catalog plus empty page tree) to `temp_file`.
    fn create_minimal_pdf(temp_file: &std::path::Path) {
        // NOTE(review): by byte count object 2 appears to start at offset 58 and `xref` at
        // 110, while the table below says 55 and 105. Presumably the reader tolerates
        // this; confirm before relying on the fixture elsewhere.
        let minimal_pdf = concat!(
            "%PDF-1.7\n",
            "1 0 obj\n",
            "<< /Type /Catalog /Pages 2 0 R >>\n",
            "endobj\n",
            "2 0 obj\n",
            "<< /Type /Pages /Kids [] /Count 0 >>\n",
            "endobj\n",
            "xref\n",
            "0 3\n",
            "0000000000 65535 f \n",
            "0000000009 00000 n \n",
            "0000000055 00000 n \n",
            "trailer\n",
            "<< /Size 3 /Root 1 0 R >>\n",
            "startxref\n",
            "105\n",
            "%%EOF",
        );
        std::fs::write(temp_file, minimal_pdf).unwrap();
    }

    /// Runs `detect_image_format_from_data` on `bytes` with an extractor opened on a freshly
    /// written minimal PDF and default options.
    fn detect(bytes: &[u8]) -> Result<ImageFormat, OperationError> {
        let scratch = TempDir::new().unwrap();
        let pdf_path = scratch.path().join("test.pdf");
        create_minimal_pdf(&pdf_path);

        let document = PdfReader::open_document(&pdf_path).unwrap();
        let extractor = ImageExtractor::new(document, ExtractImagesOptions::default());
        extractor.detect_image_format_from_data(bytes)
    }

    // ----- ExtractImagesOptions -------------------------------------------------------

    /// The defaults: current directory, inline extraction on, 10 px minimum, create dir.
    #[test]
    fn test_extract_options_default() {
        let ExtractImagesOptions {
            output_dir,
            extract_inline,
            min_size,
            create_dir,
            ..
        } = ExtractImagesOptions::default();

        assert_eq!(output_dir, PathBuf::from("."));
        assert!(extract_inline);
        assert_eq!(min_size, Some(10));
        assert!(create_dir);
    }

    /// Every field can be set explicitly and is read back unchanged.
    #[test]
    fn test_extract_options_custom() {
        let scratch = TempDir::new().unwrap();
        let dir = scratch.path();

        let options = ExtractImagesOptions {
            output_dir: dir.to_path_buf(),
            name_pattern: String::from("custom_{page}_{index}.{format}"),
            extract_inline: false,
            min_size: Some(50),
            create_dir: false,
            preprocessing: ImagePreprocessingOptions::default(),
        };

        assert_eq!(options.output_dir, dir);
        assert_eq!(options.name_pattern, "custom_{page}_{index}.{format}");
        assert!(!options.extract_inline);
        assert_eq!(options.min_size, Some(50));
        assert!(!options.create_dir);
    }

    /// `Debug` output names the type and path; `Clone` copies every checked field.
    #[test]
    fn test_extract_options_debug_clone() {
        let original = ExtractImagesOptions {
            output_dir: PathBuf::from("/test/path"),
            name_pattern: String::from("test.{format}"),
            extract_inline: true,
            min_size: None,
            create_dir: true,
            preprocessing: ImagePreprocessingOptions::default(),
        };

        let rendered = format!("{original:?}");
        for needle in ["ExtractImagesOptions", "/test/path"].iter() {
            assert!(rendered.contains(needle));
        }

        let copy = original.clone();
        assert_eq!(copy.output_dir, original.output_dir);
        assert_eq!(copy.name_pattern, original.name_pattern);
        assert_eq!(copy.extract_inline, original.extract_inline);
        assert_eq!(copy.min_size, original.min_size);
        assert_eq!(copy.create_dir, original.create_dir);
    }

    /// `min_size` can be cleared while the remaining fields keep their defaults.
    #[test]
    fn test_extract_options_no_min_size() {
        let options = ExtractImagesOptions {
            min_size: None,
            ..Default::default()
        };

        assert_eq!(options.min_size, None);
    }

    /// Building options that point at a missing directory does not create it.
    #[test]
    fn test_create_output_directory() {
        let scratch = TempDir::new().unwrap();
        let target = scratch.path().join("new_dir");

        let options = ExtractImagesOptions {
            output_dir: target.clone(),
            create_dir: true,
            ..Default::default()
        };

        assert!(!target.exists());
        assert_eq!(options.output_dir, target);
        assert!(options.create_dir);
    }

    /// `extract_inline` starts out enabled and can be switched off.
    #[test]
    fn test_extract_inline_option() {
        let mut options = ExtractImagesOptions::default();
        assert!(options.extract_inline);

        options.extract_inline = false;
        assert!(!options.extract_inline);
    }

    /// `min_size` holds either an explicit limit or `None`.
    #[test]
    fn test_min_size_filtering() {
        let limited = ExtractImagesOptions {
            min_size: Some(100),
            ..Default::default()
        };
        let unlimited = ExtractImagesOptions {
            min_size: None,
            ..Default::default()
        };

        assert_eq!(limited.min_size, Some(100));
        assert_eq!(unlimited.min_size, None);
    }

    // ----- File-name patterns ---------------------------------------------------------

    /// Each placeholder is replaced once in a simple pattern.
    #[test]
    fn test_filename_pattern() {
        let options = ExtractImagesOptions {
            name_pattern: String::from("img_{page}_{index}.{format}"),
            ..Default::default()
        };

        assert_eq!(expand_pattern(&options, "1", "2", "jpg"), "img_1_2.jpg");
    }

    /// A placeholder that occurs twice is replaced at both positions.
    #[test]
    fn test_filename_pattern_replacements() {
        let options = ExtractImagesOptions {
            name_pattern: String::from("page_{page}_img_{index}_{format}.{format}"),
            ..Default::default()
        };

        assert_eq!(
            expand_pattern(&options, "10", "5", "png"),
            "page_10_img_5_png.png"
        );
    }

    /// Literal punctuation around the placeholders is left alone.
    #[test]
    fn test_pattern_with_special_chars() {
        let options = ExtractImagesOptions {
            name_pattern: String::from("img-{page}_{index}.{format}"),
            ..Default::default()
        };

        assert_eq!(expand_pattern(&options, "1", "1", "jpg"), "img-1_1.jpg");
    }

    /// The expanded name joins onto the output directory as a regular path component.
    #[test]
    fn test_output_path_combinations() {
        let options = ExtractImagesOptions {
            output_dir: PathBuf::from("/output"),
            name_pattern: String::from("img_{page}_{index}.{format}"),
            ..Default::default()
        };

        let file_name = expand_pattern(&options, "1", "2", "png");
        let joined = options.output_dir.join(file_name);

        assert_eq!(joined, PathBuf::from("/output/img_1_2.png"));
    }

    /// A pattern without placeholders comes back verbatim.
    #[test]
    fn test_pattern_without_placeholders() {
        let options = ExtractImagesOptions {
            name_pattern: String::from("static_name.jpg"),
            ..Default::default()
        };

        assert_eq!(expand_pattern(&options, "1", "2", "png"), "static_name.jpg");
    }

    /// Placeholders may sit in directory components and repeat in any order.
    #[test]
    fn test_complex_filename_pattern() {
        let options = ExtractImagesOptions {
            name_pattern: String::from("{format}/page{page}/image_{index}_{page}.{format}"),
            ..Default::default()
        };

        assert_eq!(
            expand_pattern(&options, "5", "3", "jpeg"),
            "jpeg/page5/image_3_5.jpeg"
        );
    }

    /// Pins the format-to-extension table used by this test for JPEG, PNG and TIFF.
    #[test]
    fn test_multiple_format_extensions() {
        let cases = [
            (ImageFormat::Jpeg, "jpg"),
            (ImageFormat::Png, "png"),
            (ImageFormat::Tiff, "tiff"),
        ];

        for (format, expected_ext) in &cases {
            let extension = match format {
                ImageFormat::Jpeg => "jpg",
                ImageFormat::Png => "png",
                ImageFormat::Tiff => "tiff",
                ImageFormat::Raw => "raw",
            };
            assert_eq!(extension, *expected_ext);
        }
    }

    // ----- ExtractedImage -------------------------------------------------------------

    /// All fields of `ExtractedImage` are stored and read back as given.
    #[test]
    fn test_extracted_image_struct() {
        let image = ExtractedImage {
            page_number: 0,
            image_index: 1,
            file_path: PathBuf::from("/test/image.jpg"),
            width: 100,
            height: 200,
            format: ImageFormat::Jpeg,
        };

        assert_eq!((image.page_number, image.image_index), (0, 1));
        assert_eq!(image.file_path, PathBuf::from("/test/image.jpg"));
        assert_eq!((image.width, image.height), (100, 200));
        assert_eq!(image.format, ImageFormat::Jpeg);
    }

    /// `Debug` output mentions the type name and each field value.
    #[test]
    fn test_extracted_image_debug() {
        let image = ExtractedImage {
            page_number: 5,
            image_index: 3,
            file_path: PathBuf::from("output.png"),
            width: 512,
            height: 768,
            format: ImageFormat::Png,
        };

        let rendered = format!("{image:?}");
        let needles = ["ExtractedImage", "5", "3", "output.png", "512", "768"];
        for needle in needles.iter() {
            assert!(rendered.contains(needle));
        }
    }

    /// Very small and very large dimensions are both representable.
    #[test]
    fn test_image_dimensions() {
        let tiny = ExtractedImage {
            page_number: 0,
            image_index: 0,
            file_path: PathBuf::from("small.jpg"),
            width: 5,
            height: 5,
            format: ImageFormat::Jpeg,
        };
        let huge = ExtractedImage {
            page_number: 0,
            image_index: 1,
            file_path: PathBuf::from("large.jpg"),
            width: 2000,
            height: 3000,
            format: ImageFormat::Jpeg,
        };

        assert_eq!((tiny.width, tiny.height), (5, 5));
        assert_eq!((huge.width, huge.height), (2000, 3000));
    }

    /// Page and index numbers start at zero and can hold larger values.
    #[test]
    fn test_page_and_index_numbering() {
        let first = ExtractedImage {
            page_number: 0,
            image_index: 0,
            file_path: PathBuf::from("first.jpg"),
            width: 100,
            height: 100,
            format: ImageFormat::Jpeg,
        };
        let last = ExtractedImage {
            page_number: 99,
            image_index: 255,
            file_path: PathBuf::from("last.jpg"),
            width: 100,
            height: 100,
            format: ImageFormat::Jpeg,
        };

        assert_eq!((first.page_number, first.image_index), (0, 0));
        assert_eq!((last.page_number, last.image_index), (99, 255));
    }

    // ----- Format detection -----------------------------------------------------------

    /// The PNG signature followed by the start of an IHDR chunk is detected as PNG.
    #[test]
    fn test_detect_image_format_png() {
        let detected = detect(b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0DIHDR").unwrap();
        assert_eq!(detected, ImageFormat::Png);
    }

    /// A JFIF header is detected as JPEG.
    #[test]
    fn test_detect_image_format_jpeg() {
        let detected = detect(b"\xFF\xD8\xFF\xE0\x00\x10JFIF").unwrap();
        assert_eq!(detected, ImageFormat::Jpeg);
    }

    /// The `II*\0` byte-order mark is detected as TIFF.
    #[test]
    fn test_detect_image_format_tiff_little_endian() {
        let detected = detect(b"II\x2A\x00\x08\x00\x00\x00").unwrap();
        assert_eq!(detected, ImageFormat::Tiff);
    }

    /// The `MM\0*` byte-order mark is detected as TIFF.
    #[test]
    fn test_detect_image_format_tiff_big_endian() {
        let detected = detect(b"MM\x00\x2A\x00\x00\x00\x08").unwrap();
        assert_eq!(detected, ImageFormat::Tiff);
    }

    /// Bytes with no known signature are expected to be reported as PNG.
    #[test]
    fn test_detect_image_format_unknown() {
        let detected = detect(b"\x00\x01\x02\x03\x04\x05\x06\x07\x08").unwrap();
        assert_eq!(detected, ImageFormat::Png);
    }

    /// A single byte is rejected with a `ParseError` whose message mentions "too short".
    #[test]
    fn test_detect_image_format_short_data() {
        let outcome = detect(b"\xFF");
        assert!(outcome.is_err());

        match outcome {
            Err(OperationError::ParseError(msg)) => assert!(msg.contains("too short")),
            _ => panic!("Expected ParseError"),
        }
    }

    /// Empty input fails; inputs consisting of exactly one signature still match.
    #[test]
    fn test_detect_format_edge_cases() {
        assert!(detect(b"").is_err());

        // Exactly the 8-byte PNG signature.
        assert_eq!(detect(b"\x89PNG\r\n\x1a\n").unwrap(), ImageFormat::Png);

        // Exactly the 4-byte little-endian TIFF signature.
        assert_eq!(detect(b"II\x2A\x00").unwrap(), ImageFormat::Tiff);

        // Exactly the 2-byte JPEG start-of-image marker.
        assert_eq!(detect(b"\xFF\xD8").unwrap(), ImageFormat::Jpeg);
    }
}
1948
1949#[cfg(test)]
1950#[path = "extract_images_tests.rs"]
1951mod extract_images_tests;