oxidize_pdf/operations/
extract_images.rs

1//! PDF image extraction functionality
2//!
3//! This module provides functionality to extract images from PDF documents with
4//! advanced preprocessing for scanned documents.
5
6use super::{OperationError, OperationResult};
7use crate::graphics::ImageFormat;
8use crate::parser::objects::{PdfName, PdfObject, PdfStream};
9use crate::parser::{PdfDocument, PdfReader};
10use std::collections::HashMap;
11use std::fs::{self, File};
12use std::io::{Read, Seek, Write};
13use std::path::{Path, PathBuf};
14
15#[cfg(feature = "external-images")]
16use image::{DynamicImage, GenericImageView, ImageBuffer, ImageFormat as ImageLibFormat, Luma};
17
/// PDF transformation matrix `(a, b, c, d, e, f)`.
///
/// Represents the 3x3 matrix `[a c e; b d f; 0 0 1]`, which maps a point `(x, y)` to
/// `(a*x + c*y + e, b*x + d*y + f)`.
#[derive(Debug, Clone, PartialEq)]
pub struct TransformMatrix {
    pub a: f64, // x scaling
    pub b: f64, // y skewing
    pub c: f64, // x skewing
    pub d: f64, // y scaling
    pub e: f64, // x translation
    pub f: f64, // y translation
}

impl TransformMatrix {
    /// Coefficients whose magnitude is below this value are treated as zero; coefficients whose
    /// magnitude is above it are treated as non-zero.
    const EPSILON: f64 = 0.001;

    /// Horizontal extent checked by [`Self::is_fis2_like_matrix`].
    const LANDSCAPE_PAGE_WIDTH: f64 = 841.68;

    /// Vertical extent checked by [`Self::is_fis2_like_matrix`].
    const LANDSCAPE_PAGE_HEIGHT: f64 = 595.08;

    /// Maximum deviation from the landscape page extents that still counts as a match.
    const PAGE_SIZE_TOLERANCE: f64 = 1.0;

    /// Build a matrix from its six coefficients, in PDF operand order.
    const fn new(a: f64, b: f64, c: f64, d: f64, e: f64, f: f64) -> Self {
        Self { a, b, c, d, e, f }
    }

    /// `true` when `value` is close enough to zero to be ignored.
    fn is_negligible(value: f64) -> bool {
        value.abs() < Self::EPSILON
    }

    /// `true` when `value` is clearly different from zero.
    fn is_significant(value: f64) -> bool {
        value.abs() > Self::EPSILON
    }

    /// Check if this matrix represents a 90-degree rotation
    ///
    /// That is the case when `a` and `d` are negligible while `b` and `c` are both significant.
    // Not called yet; kept for the rotation handling that is currently disabled.
    #[allow(dead_code)]
    fn is_90_degree_rotation(&self) -> bool {
        Self::is_negligible(self.a)
            && Self::is_negligible(self.d)
            && Self::is_significant(self.b)
            && Self::is_significant(self.c)
    }

    /// Check if this matrix represents a simple scaling
    ///
    /// That is the case when `b` and `c` are negligible while `a` and `d` are both significant.
    // Not called yet; kept for the scaling handling that is currently disabled.
    #[allow(dead_code)]
    fn is_simple_scale(&self) -> bool {
        Self::is_negligible(self.b)
            && Self::is_negligible(self.c)
            && Self::is_significant(self.a)
            && Self::is_significant(self.d)
    }

    /// Check if this is a matrix that needs rotation for proper OCR
    ///
    /// Matches an unskewed matrix whose `a`/`d` are within one unit of 841.68 x 595.08, i.e. a
    /// landscape image that is being fit onto a portrait page.
    // Not called yet; kept for the rotation handling that is currently disabled.
    #[allow(dead_code)]
    fn is_fis2_like_matrix(&self) -> bool {
        (self.a - Self::LANDSCAPE_PAGE_WIDTH).abs() < Self::PAGE_SIZE_TOLERANCE
            && (self.d - Self::LANDSCAPE_PAGE_HEIGHT).abs() < Self::PAGE_SIZE_TOLERANCE
            && Self::is_negligible(self.b)
            && Self::is_negligible(self.c)
    }
}
61
/// Preprocessing options for extracted images
///
/// All fields are plain flags and sizes, so values can be cloned and compared freely.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ImagePreprocessingOptions {
    /// Auto-detect and correct rotation
    pub auto_correct_rotation: bool,
    /// Enhance contrast for better OCR
    pub enhance_contrast: bool,
    /// Apply noise reduction
    pub denoise: bool,
    /// Upscale small images using bicubic interpolation
    pub upscale_small_images: bool,
    /// Minimum size to trigger upscaling
    pub upscale_threshold: u32,
    /// Upscale factor (2x, 3x, etc.)
    pub upscale_factor: u32,
    /// Convert to grayscale for better OCR on text documents
    pub force_grayscale: bool,
}

impl Default for ImagePreprocessingOptions {
    /// Enables rotation correction, contrast enhancement, denoising and 2x upscaling
    /// (threshold 300); grayscale conversion stays off.
    fn default() -> Self {
        Self {
            auto_correct_rotation: true,
            enhance_contrast: true,
            denoise: true,
            upscale_small_images: true,
            upscale_threshold: 300,
            upscale_factor: 2,
            force_grayscale: false,
        }
    }
}
94
95/// Options for image extraction
96#[derive(Debug, Clone)]
97pub struct ExtractImagesOptions {
98    /// Output directory for extracted images
99    pub output_dir: PathBuf,
100    /// File name pattern for extracted images
101    /// Supports placeholders: {page}, {index}, {format}
102    pub name_pattern: String,
103    /// Whether to extract inline images
104    pub extract_inline: bool,
105    /// Minimum size (width or height) to extract
106    pub min_size: Option<u32>,
107    /// Whether to create output directory if it doesn't exist
108    pub create_dir: bool,
109    /// Preprocessing options for extracted images
110    pub preprocessing: ImagePreprocessingOptions,
111}
112
113impl Default for ExtractImagesOptions {
114    fn default() -> Self {
115        Self {
116            output_dir: PathBuf::from("."),
117            name_pattern: "page_{page}_image_{index}.{format}".to_string(),
118            extract_inline: true,
119            min_size: Some(10),
120            create_dir: true,
121            preprocessing: ImagePreprocessingOptions::default(),
122        }
123    }
124}
125
126/// Result of image extraction
127#[derive(Debug)]
128pub struct ExtractedImage {
129    /// Page number (0-indexed)
130    pub page_number: usize,
131    /// Image index on the page
132    pub image_index: usize,
133    /// Output file path
134    pub file_path: PathBuf,
135    /// Image dimensions
136    pub width: u32,
137    pub height: u32,
138    /// Image format
139    pub format: ImageFormat,
140}
141
142/// Image extractor
143pub struct ImageExtractor<R: Read + Seek> {
144    document: PdfDocument<R>,
145    options: ExtractImagesOptions,
146    /// Cache for already processed images
147    processed_images: HashMap<String, PathBuf>,
148}
149
150impl<R: Read + Seek> ImageExtractor<R> {
151    /// Create a new image extractor
152    pub fn new(document: PdfDocument<R>, options: ExtractImagesOptions) -> Self {
153        Self {
154            document,
155            options,
156            processed_images: HashMap::new(),
157        }
158    }
159
160    /// Extract all images from the document
161    pub fn extract_all(&mut self) -> OperationResult<Vec<ExtractedImage>> {
162        // Create output directory if needed
163        if self.options.create_dir && !self.options.output_dir.exists() {
164            fs::create_dir_all(&self.options.output_dir)?;
165        }
166
167        let mut extracted_images = Vec::new();
168        let page_count = self
169            .document
170            .page_count()
171            .map_err(|e| OperationError::ParseError(e.to_string()))?;
172
173        for page_idx in 0..page_count {
174            let page_images = self.extract_from_page(page_idx as usize)?;
175            extracted_images.extend(page_images);
176        }
177
178        Ok(extracted_images)
179    }
180
181    /// Extract images from a specific page
182    pub fn extract_from_page(
183        &mut self,
184        page_number: usize,
185    ) -> OperationResult<Vec<ExtractedImage>> {
186        let mut extracted = Vec::new();
187
188        // Get the page
189        let page = self
190            .document
191            .get_page(page_number as u32)
192            .map_err(|e| OperationError::ParseError(e.to_string()))?;
193
194        // Get page resources and collect XObject references
195        let xobject_refs: Vec<(String, u32, u16)> = {
196            let resources = self
197                .document
198                .get_page_resources(&page)
199                .map_err(|e| OperationError::ParseError(e.to_string()))?;
200
201            let mut refs = Vec::new();
202
203            if let Some(resources) = resources {
204                if let Some(PdfObject::Dictionary(xobjects)) =
205                    resources.0.get(&PdfName("XObject".to_string()))
206                {
207                    for (name, obj_ref) in &xobjects.0 {
208                        if let PdfObject::Reference(obj_num, gen_num) = obj_ref {
209                            refs.push((name.0.clone(), *obj_num, *gen_num));
210                        }
211                    }
212                }
213            }
214
215            refs
216        };
217
218        // Process each XObject reference
219        let mut image_index = 0;
220        for (name, obj_num, gen_num) in xobject_refs {
221            if let Ok(xobject) = self.document.get_object(obj_num, gen_num) {
222                if let Some(extracted_image) =
223                    self.process_xobject(&xobject, page_number, image_index, &name)?
224                {
225                    extracted.push(extracted_image);
226                    image_index += 1;
227                }
228            }
229        }
230
231        // If no XObjects found via resources, try alternative method
232        if extracted.is_empty() {
233            // Analyze content streams for image references
234            if let Ok(content_streams) = self.document.get_page_content_streams(&page) {
235                for stream_data in &content_streams {
236                    let referenced_images = self.extract_referenced_images_from_content(
237                        stream_data,
238                        page_number,
239                        &mut image_index,
240                    )?;
241                    extracted.extend(referenced_images);
242                }
243            }
244        }
245
246        // Extract inline images from content stream if requested
247        if self.options.extract_inline {
248            if let Ok(parsed_page) = self.document.get_page(page_number as u32) {
249                if let Ok(content_streams) = self.document.get_page_content_streams(&parsed_page) {
250                    for stream_data in &content_streams {
251                        let inline_images = self.extract_inline_images_from_stream(
252                            stream_data,
253                            page_number,
254                            &mut image_index,
255                        )?;
256                        extracted.extend(inline_images);
257                    }
258                }
259            }
260        }
261
262        Ok(extracted)
263    }
264
265    /// Process an XObject to see if it's an image
266    fn process_xobject(
267        &mut self,
268        xobject: &PdfObject,
269        page_number: usize,
270        image_index: usize,
271        _name: &str,
272    ) -> OperationResult<Option<ExtractedImage>> {
273        if let PdfObject::Stream(stream) = xobject {
274            // Check if it's an image XObject
275            if let Some(PdfObject::Name(subtype)) =
276                stream.dict.0.get(&PdfName("Subtype".to_string()))
277            {
278                if subtype.0 == "Image" {
279                    return self.extract_image_xobject(stream, page_number, image_index);
280                }
281            }
282        }
283        Ok(None)
284    }
285
286    /// Extract an image XObject
287    fn extract_image_xobject(
288        &mut self,
289        stream: &PdfStream,
290        page_number: usize,
291        image_index: usize,
292    ) -> OperationResult<Option<ExtractedImage>> {
293        // Get image properties
294        let width = match stream.dict.0.get(&PdfName("Width".to_string())) {
295            Some(PdfObject::Integer(w)) => *w as u32,
296            _ => return Ok(None),
297        };
298
299        let height = match stream.dict.0.get(&PdfName("Height".to_string())) {
300            Some(PdfObject::Integer(h)) => *h as u32,
301            _ => return Ok(None),
302        };
303
304        // Check minimum size
305        if let Some(min_size) = self.options.min_size {
306            if width < min_size || height < min_size {
307                return Ok(None);
308            }
309        }
310
311        // Get color space information
312        let color_space = stream.dict.0.get(&PdfName("ColorSpace".to_string()));
313        let bits_per_component = match stream.dict.0.get(&PdfName("BitsPerComponent".to_string())) {
314            Some(PdfObject::Integer(bits)) => *bits as u8,
315            _ => 8, // Default to 8 bits per component
316        };
317
318        // Get the decoded image data
319        let parse_options = self.document.options();
320        let mut data = stream.decode(&parse_options).map_err(|e| {
321            OperationError::ParseError(format!("Failed to decode image stream: {e}"))
322        })?;
323
324        // Determine format from filter and process data accordingly
325        let format = match stream.dict.0.get(&PdfName("Filter".to_string())) {
326            Some(PdfObject::Name(filter)) => match filter.0.as_str() {
327                "DCTDecode" => {
328                    // JPEG data is already in correct format - use raw stream data
329                    // DCTDecode streams contain complete JPEG data, don't decode
330                    data = stream.data.clone();
331                    ImageFormat::Jpeg
332                }
333                "FlateDecode" => {
334                    // FlateDecode contains raw pixel data - need to convert to image format
335                    data = self.convert_raw_image_data_to_png(
336                        &data,
337                        width,
338                        height,
339                        color_space,
340                        bits_per_component,
341                    )?;
342                    ImageFormat::Png
343                }
344                "CCITTFaxDecode" => {
345                    // CCITT data for scanned documents - convert to PNG
346                    data = self.convert_ccitt_to_png(&data, width, height)?;
347                    ImageFormat::Png
348                }
349                "LZWDecode" => {
350                    // LZW compressed raw data - convert to PNG
351                    data = self.convert_raw_image_data_to_png(
352                        &data,
353                        width,
354                        height,
355                        color_space,
356                        bits_per_component,
357                    )?;
358                    ImageFormat::Png
359                }
360                _ => {
361                    tracing::debug!("Unsupported image filter: {}", filter.0);
362                    return Ok(None);
363                }
364            },
365            Some(PdfObject::Array(filters)) => {
366                // Handle filter arrays - use the first filter
367                if let Some(PdfObject::Name(filter)) = filters.0.first() {
368                    match filter.0.as_str() {
369                        "DCTDecode" => {
370                            // JPEG data is already in correct format - use raw stream data
371                            data = stream.data.clone();
372                            ImageFormat::Jpeg
373                        }
374                        "FlateDecode" => {
375                            data = self.convert_raw_image_data_to_png(
376                                &data,
377                                width,
378                                height,
379                                color_space,
380                                bits_per_component,
381                            )?;
382                            ImageFormat::Png
383                        }
384                        "CCITTFaxDecode" => {
385                            data = self.convert_ccitt_to_png(&data, width, height)?;
386                            ImageFormat::Png
387                        }
388                        "LZWDecode" => {
389                            data = self.convert_raw_image_data_to_png(
390                                &data,
391                                width,
392                                height,
393                                color_space,
394                                bits_per_component,
395                            )?;
396                            ImageFormat::Png
397                        }
398                        _ => {
399                            tracing::debug!("Unsupported image filter: {}", filter.0);
400                            return Ok(None);
401                        }
402                    }
403                } else {
404                    return Ok(None);
405                }
406            }
407            _ => {
408                // No filter - raw image data
409                data = self.convert_raw_image_data_to_png(
410                    &data,
411                    width,
412                    height,
413                    color_space,
414                    bits_per_component,
415                )?;
416                ImageFormat::Png
417            }
418        };
419
420        // Generate unique key for this image data
421        let image_key = format!("{:x}", md5::compute(&data));
422
423        // For scanned PDFs where all pages reference the same image object,
424        // we need to create separate files per page for OCR processing
425        // Don't deduplicate if we're extracting for OCR purposes
426        let allow_deduplication = !self.options.name_pattern.contains("{page}");
427
428        // Check if we've already extracted this image (only if deduplication is allowed)
429        if allow_deduplication {
430            if let Some(existing_path) = self.processed_images.get(&image_key) {
431                // Return reference to already extracted image
432                return Ok(Some(ExtractedImage {
433                    page_number,
434                    image_index,
435                    file_path: existing_path.clone(),
436                    width,
437                    height,
438                    format,
439                }));
440            }
441        }
442
443        // Generate output filename
444        let extension = match format {
445            ImageFormat::Jpeg => "jpg",
446            ImageFormat::Png => "png",
447            ImageFormat::Tiff => "tiff",
448            ImageFormat::Raw => "rgb",
449        };
450
451        let filename = self
452            .options
453            .name_pattern
454            .replace("{page}", &(page_number + 1).to_string())
455            .replace("{index}", &(image_index + 1).to_string())
456            .replace("{format}", extension);
457
458        let output_path = self.options.output_dir.join(filename);
459
460        // Apply preprocessing if enabled
461        #[cfg(feature = "external-images")]
462        let processed_data = if self.should_preprocess() {
463            self.preprocess_image_data(&data, width, height, format)?
464        } else {
465            data
466        };
467
468        #[cfg(not(feature = "external-images"))]
469        let processed_data = data;
470
471        // Write image data
472        let mut file = File::create(&output_path)?;
473        file.write_all(&processed_data)?;
474
475        // Cache the path
476        self.processed_images.insert(image_key, output_path.clone());
477
478        Ok(Some(ExtractedImage {
479            page_number,
480            image_index,
481            file_path: output_path,
482            width,
483            height,
484            format,
485        }))
486    }
487
488    /// Detect image format from raw data by examining magic bytes
489    fn detect_image_format_from_data(&self, data: &[u8]) -> OperationResult<ImageFormat> {
490        if data.is_empty() {
491            return Err(OperationError::ParseError(
492                "Image data too short to detect format".to_string(),
493            ));
494        }
495
496        // Check for PNG signature (needs 8 bytes)
497        if data.len() >= 8 && &data[0..8] == b"\x89PNG\r\n\x1a\n" {
498            return Ok(ImageFormat::Png);
499        }
500
501        // Check for TIFF signatures (needs 4 bytes)
502        if data.len() >= 4 {
503            if &data[0..2] == b"II" && &data[2..4] == b"\x2A\x00" {
504                return Ok(ImageFormat::Tiff); // Little endian TIFF
505            }
506            if &data[0..2] == b"MM" && &data[2..4] == b"\x00\x2A" {
507                return Ok(ImageFormat::Tiff); // Big endian TIFF
508            }
509        }
510
511        // Check for JPEG signature (needs 2 bytes)
512        if data.len() >= 2 && data[0] == 0xFF && data[1] == 0xD8 {
513            return Ok(ImageFormat::Jpeg);
514        }
515
516        // If data is too short for any meaningful detection
517        if data.len() < 2 {
518            return Err(OperationError::ParseError(
519                "Image data too short to detect format".to_string(),
520            ));
521        }
522
523        // Default to PNG for FlateDecode if no other format detected
524        // This is a fallback since FlateDecode is commonly used for PNG in PDFs
525        Ok(ImageFormat::Png)
526    }
527
528    /// Extract inline images from a content stream
529    fn extract_inline_images_from_stream(
530        &mut self,
531        stream_data: &[u8],
532        page_number: usize,
533        image_index: &mut usize,
534    ) -> OperationResult<Vec<ExtractedImage>> {
535        let mut inline_images = Vec::new();
536
537        // Convert bytes to string for parsing
538        let stream_str = String::from_utf8_lossy(stream_data);
539
540        // Find inline image operators: BI (Begin Image), ID (Image Data), EI (End Image)
541        let mut pos = 0;
542        while let Some(bi_pos) = stream_str[pos..].find("BI") {
543            let absolute_bi_pos = pos + bi_pos;
544
545            // Find the ID operator after BI
546            if let Some(relative_id_pos) = stream_str[absolute_bi_pos..].find("ID") {
547                let absolute_id_pos = absolute_bi_pos + relative_id_pos;
548
549                // Find the EI operator after ID
550                if let Some(relative_ei_pos) = stream_str[absolute_id_pos..].find("EI") {
551                    let absolute_ei_pos = absolute_id_pos + relative_ei_pos;
552
553                    // Extract image dictionary (between BI and ID)
554                    let dict_section = &stream_str[absolute_bi_pos + 2..absolute_id_pos].trim();
555
556                    // Extract image data (between ID and EI)
557                    let data_start = absolute_id_pos + 2;
558                    let data_end = absolute_ei_pos;
559
560                    if data_start < data_end && data_end <= stream_data.len() {
561                        let image_data = &stream_data[data_start..data_end];
562
563                        // Parse basic image properties from dictionary
564                        let (width, height) = self.parse_inline_image_dict(dict_section);
565
566                        // Create extracted image
567                        if let Ok(extracted_image) = self.save_inline_image(
568                            image_data,
569                            page_number,
570                            *image_index,
571                            width,
572                            height,
573                        ) {
574                            inline_images.push(extracted_image);
575                            *image_index += 1;
576                        }
577                    }
578
579                    // Continue searching after this EI
580                    pos = absolute_ei_pos + 2;
581                } else {
582                    break; // No matching EI found
583                }
584            } else {
585                break; // No matching ID found
586            }
587        }
588
589        Ok(inline_images)
590    }
591
592    /// Extract images referenced in content streams when resources are not available
593    fn extract_referenced_images_from_content(
594        &mut self,
595        stream_data: &[u8],
596        page_number: usize,
597        image_index: &mut usize,
598    ) -> OperationResult<Vec<ExtractedImage>> {
599        let mut extracted = Vec::new();
600
601        // Convert to string for parsing
602        let content = String::from_utf8_lossy(stream_data);
603
604        tracing::debug!("       Content: {}", content);
605
606        // Parse transformation matrices and image references together
607        // Pattern: look for cm matrices followed by Do operators
608        let image_with_transform = self.parse_images_with_transformations(&content)?;
609
610        for (image_name, transform_matrix) in image_with_transform {
611            // Try to find this object by scanning all objects in the document
612            if let Some(mut extracted_image) =
613                self.find_and_extract_xobject_by_name(&image_name, page_number, *image_index)?
614            {
615                // Apply transformation if one was found
616                if let Some(matrix) = transform_matrix {
617                    extracted_image =
618                        self.apply_transformation_to_image(extracted_image, &matrix)?;
619                }
620
621                extracted.push(extracted_image);
622                *image_index += 1;
623            }
624        }
625
626        Ok(extracted)
627    }
628
629    /// Find an XObject by name by scanning through the document
630    fn find_and_extract_xobject_by_name(
631        &mut self,
632        name: &str,
633        page_number: usize,
634        image_index: usize,
635    ) -> OperationResult<Option<ExtractedImage>> {
636        // This is a brute force approach - scan through objects looking for image streams
637        // In a real implementation, we would have better object mapping, but for now
638        // this should work for common landscape-in-portrait cases
639
640        // Try some common object numbers that might contain images
641        // We'll scan a range and look for stream objects that look like images
642        for obj_num in 1..1000 {
643            if let Ok(obj) = self.document.get_object(obj_num, 0) {
644                if let Some(extracted) =
645                    self.try_extract_image_from_object(&obj, page_number, image_index, name)?
646                {
647                    return Ok(Some(extracted));
648                }
649            }
650        }
651
652        Ok(None)
653    }
654
655    /// Try to extract an image from any PDF object
656    fn try_extract_image_from_object(
657        &mut self,
658        obj: &PdfObject,
659        page_number: usize,
660        image_index: usize,
661        _expected_name: &str,
662    ) -> OperationResult<Option<ExtractedImage>> {
663        if let PdfObject::Stream(stream) = obj {
664            // Check if this stream looks like an image
665            if let Some(PdfObject::Name(subtype)) =
666                stream.dict.0.get(&PdfName("Subtype".to_string()))
667            {
668                if subtype.0 == "Image" {
669                    return self.extract_image_xobject(stream, page_number, image_index);
670                }
671            }
672
673            // Also check for streams that might be images but don't have proper Subtype
674            if let Some(PdfObject::Integer(_width)) =
675                stream.dict.0.get(&PdfName("Width".to_string()))
676            {
677                if let Some(PdfObject::Integer(_height)) =
678                    stream.dict.0.get(&PdfName("Height".to_string()))
679                {
680                    return self.extract_image_xobject(stream, page_number, image_index);
681                }
682            }
683        }
684
685        Ok(None)
686    }
687
688    /// Parse content stream to find images with their transformation matrices
689    fn parse_images_with_transformations(
690        &self,
691        content: &str,
692    ) -> OperationResult<Vec<(String, Option<TransformMatrix>)>> {
693        let mut results = Vec::new();
694        let lines: Vec<&str> = content.lines().collect();
695
696        let mut current_matrix: Option<TransformMatrix> = None;
697
698        for line in lines {
699            let line = line.trim();
700
701            // Look for transformation matrices: "a b c d e f cm"
702            if line.ends_with(" cm") {
703                let parts: Vec<&str> = line.split_whitespace().collect();
704                if parts.len() == 7 && parts[6] == "cm" {
705                    // Parse the 6 matrix values
706                    if let (Ok(a), Ok(b), Ok(c), Ok(d), Ok(e), Ok(f)) = (
707                        parts[0].parse::<f64>(),
708                        parts[1].parse::<f64>(),
709                        parts[2].parse::<f64>(),
710                        parts[3].parse::<f64>(),
711                        parts[4].parse::<f64>(),
712                        parts[5].parse::<f64>(),
713                    ) {
714                        current_matrix = Some(TransformMatrix::new(a, b, c, d, e, f));
715                    }
716                }
717            }
718
719            // Look for image draw commands: "/ImageName Do"
720            if line.contains(" Do") {
721                let parts: Vec<&str> = line.split_whitespace().collect();
722                for part in parts {
723                    if part.starts_with('/') && !part.contains("Do") {
724                        let image_name = part[1..].to_string(); // Remove the '/'
725                        results.push((image_name, current_matrix.clone()));
726                    }
727                }
728            }
729
730            // Reset matrix on graphics state restore
731            if line.trim() == "Q" {
732                current_matrix = None;
733            }
734        }
735
736        Ok(results)
737    }
738
739    /// Apply transformation matrix to an extracted image
740    #[allow(unused_mut)]
741    fn apply_transformation_to_image(
742        &self,
743        mut extracted_image: ExtractedImage,
744        _matrix: &TransformMatrix,
745    ) -> OperationResult<ExtractedImage> {
746        #[cfg(feature = "external-images")]
747        {
748            // Read the extracted image file
749            let image_data = std::fs::read(&extracted_image.file_path)?;
750
751            // Load with image crate
752            let img = image::load_from_memory(&image_data).map_err(|e| {
753                OperationError::ParseError(format!("Failed to load image for transformation: {e}"))
754            })?;
755
756            // IGNORE TRANSFORMATION FOR NOW - FOCUS ON STRIDE PROBLEM
757            let transformed_img =
758                self.fix_stride_problem(img, extracted_image.width, extracted_image.height)?;
759
760            // Save the transformed image
761            let output_filename = extracted_image
762                .file_path
763                .file_stem()
764                .and_then(|s| s.to_str())
765                .ok_or_else(|| OperationError::InvalidPath {
766                    reason: format!(
767                        "Image path has no valid filename: {:?}",
768                        extracted_image.file_path
769                    ),
770                })?;
771            let output_extension = extracted_image
772                .file_path
773                .extension()
774                .and_then(|s| s.to_str())
775                .ok_or_else(|| OperationError::InvalidPath {
776                    reason: format!(
777                        "Image path has no valid extension: {:?}",
778                        extracted_image.file_path
779                    ),
780                })?;
781
782            let parent_dir =
783                extracted_image
784                    .file_path
785                    .parent()
786                    .ok_or_else(|| OperationError::InvalidPath {
787                        reason: format!(
788                            "Image path has no parent directory: {:?}",
789                            extracted_image.file_path
790                        ),
791                    })?;
792            let transformed_path = parent_dir.join(format!(
793                "{}_transformed.{}",
794                output_filename, output_extension
795            ));
796
797            transformed_img.save(&transformed_path).map_err(|e| {
798                OperationError::ParseError(format!("Failed to save transformed image: {e}"))
799            })?;
800
801            // Update the extracted image info
802            let (new_width, new_height) = transformed_img.dimensions();
803            extracted_image.file_path = transformed_path;
804            extracted_image.width = new_width;
805            extracted_image.height = new_height;
806        }
807
808        #[cfg(not(feature = "external-images"))]
809        {}
810
811        Ok(extracted_image)
812    }
813
814    /// Apply rotation transformation
815    #[cfg(feature = "external-images")]
816    #[allow(dead_code)]
817    fn apply_rotation_transformation(
818        &self,
819        img: DynamicImage,
820        matrix: &TransformMatrix,
821    ) -> OperationResult<DynamicImage> {
822        // Determine rotation direction based on matrix values
823        // For 90-degree clockwise: a=0, b=1, c=-1, d=0
824        // For 90-degree counter-clockwise: a=0, b=-1, c=1, d=0
825
826        if matrix.b > 0.0 && matrix.c < 0.0 {
827            Ok(img.rotate90()) // 90 degrees clockwise
828        } else if matrix.b < 0.0 && matrix.c > 0.0 {
829            Ok(img.rotate270()) // 90 degrees counter-clockwise (270 clockwise)
830        } else {
831            // Default to 90-degree rotation for landscape-in-portrait cases
832            Ok(img.rotate90())
833        }
834    }
835
836    /// Apply scaling transformation
837    #[cfg(feature = "external-images")]
838    #[allow(dead_code)]
839    fn apply_scale_transformation(
840        &self,
841        img: DynamicImage,
842        matrix: &TransformMatrix,
843    ) -> OperationResult<DynamicImage> {
844        let (current_width, current_height) = img.dimensions();
845
846        // Calculate new dimensions based on scaling factors
847        let new_width = (current_width as f64 * matrix.a.abs()) as u32;
848        let new_height = (current_height as f64 * matrix.d.abs()) as u32;
849
850        if new_width > 0 && new_height > 0 {
851            Ok(img.resize(new_width, new_height, image::imageops::FilterType::Lanczos3))
852        } else {
853            // If scaling results in invalid dimensions, return original
854            Ok(img)
855        }
856    }
857
858    /// Fix stride/row alignment problems in image data
859    #[cfg(feature = "external-images")]
860    fn fix_stride_problem(
861        &self,
862        img: DynamicImage,
863        original_width: u32,
864        original_height: u32,
865    ) -> OperationResult<DynamicImage> {
866        // Convert to raw grayscale data
867        let gray_img = img.to_luma8();
868        let pixel_data = gray_img.as_raw();
869
870        // Try different row strides to fix misalignment
871        let bytes_per_row = original_width as usize;
872        let min_bytes_per_row = bytes_per_row;
873
874        // Possible stride alignments
875        let possible_strides = [
876            min_bytes_per_row,              // No padding
877            (min_bytes_per_row + 1) & !1,   // 2-byte aligned
878            (min_bytes_per_row + 3) & !3,   // 4-byte aligned
879            (min_bytes_per_row + 7) & !7,   // 8-byte aligned
880            (min_bytes_per_row + 15) & !15, // 16-byte aligned
881            min_bytes_per_row + 1,          // +1 padding
882            min_bytes_per_row + 2,          // +2 padding
883            min_bytes_per_row + 4,          // +4 padding
884        ];
885
886        for (_i, &stride) in possible_strides.iter().enumerate() {
887            let expected_total = stride * original_height as usize;
888
889            if expected_total <= pixel_data.len() {
890                // Extract using this stride
891                let mut corrected_data = Vec::new();
892                for row in 0..original_height {
893                    let row_start = row as usize * stride;
894                    let row_end = row_start + bytes_per_row;
895
896                    if row_end <= pixel_data.len() {
897                        corrected_data.extend_from_slice(&pixel_data[row_start..row_end]);
898                    } else {
899                        // Fill with white if we run out of data
900                        corrected_data.resize(corrected_data.len() + bytes_per_row, 255);
901                    }
902                }
903
904                // Create corrected image
905                if corrected_data.len() == (original_width * original_height) as usize {
906                    if let Some(corrected_img) = ImageBuffer::<Luma<u8>, Vec<u8>>::from_raw(
907                        original_width,
908                        original_height,
909                        corrected_data,
910                    ) {
911                        return Ok(DynamicImage::ImageLuma8(corrected_img));
912                    }
913                }
914            } else {
915            }
916        }
917
918        Ok(img)
919    }
920
921    /// Parse inline image dictionary to extract width and height
922    fn parse_inline_image_dict(&self, dict_str: &str) -> (u32, u32) {
923        let mut width = 100; // Default width
924        let mut height = 100; // Default height
925
926        // Simple parsing - look for /W and /H parameters
927        for line in dict_str.lines() {
928            let line = line.trim();
929
930            // Parse width: /W 123 or /Width 123
931            if line.starts_with("/W ") || line.starts_with("/Width ") {
932                if let Some(value_str) = line.split_whitespace().nth(1) {
933                    if let Ok(w) = value_str.parse::<u32>() {
934                        width = w;
935                    }
936                }
937            }
938
939            // Parse height: /H 123 or /Height 123
940            if line.starts_with("/H ") || line.starts_with("/Height ") {
941                if let Some(value_str) = line.split_whitespace().nth(1) {
942                    if let Ok(h) = value_str.parse::<u32>() {
943                        height = h;
944                    }
945                }
946            }
947        }
948
949        (width, height)
950    }
951
952    /// Save an inline image to disk
953    fn save_inline_image(
954        &mut self,
955        data: &[u8],
956        page_number: usize,
957        image_index: usize,
958        width: u32,
959        height: u32,
960    ) -> OperationResult<ExtractedImage> {
961        // Generate unique key for deduplication
962        let image_key = format!("{:x}", md5::compute(data));
963
964        // Don't deduplicate if we're extracting for OCR purposes (pattern contains {page})
965        let allow_deduplication = !self.options.name_pattern.contains("{page}");
966
967        // Check if we've already extracted this image (only if deduplication is allowed)
968        if allow_deduplication {
969            if let Some(existing_path) = self.processed_images.get(&image_key) {
970                return Ok(ExtractedImage {
971                    page_number,
972                    image_index,
973                    file_path: existing_path.clone(),
974                    width,
975                    height,
976                    format: ImageFormat::Raw, // Inline images are often raw
977                });
978            }
979        }
980
981        // Determine format and extension
982        let format = self
983            .detect_image_format_from_data(data)
984            .unwrap_or(ImageFormat::Raw);
985        let extension = match format {
986            ImageFormat::Jpeg => "jpg",
987            ImageFormat::Png => "png",
988            ImageFormat::Tiff => "tif",
989            ImageFormat::Raw => "raw",
990        };
991
992        // Generate filename
993        let filename = format!(
994            "inline_page_{}_{:03}.{}",
995            page_number + 1,
996            image_index + 1,
997            extension
998        );
999        let file_path = self.options.output_dir.join(filename);
1000
1001        // Write image data to file
1002        fs::write(&file_path, data)?;
1003
1004        // Cache the extracted image
1005        self.processed_images.insert(image_key, file_path.clone());
1006
1007        Ok(ExtractedImage {
1008            page_number,
1009            image_index,
1010            file_path,
1011            width,
1012            height,
1013            format,
1014        })
1015    }
1016
1017    /// Convert raw image data to PNG format
1018    fn convert_raw_image_data_to_png(
1019        &self,
1020        data: &[u8],
1021        width: u32,
1022        height: u32,
1023        color_space: Option<&PdfObject>,
1024        bits_per_component: u8,
1025    ) -> OperationResult<Vec<u8>> {
1026        // Determine color components and channels
1027        let (components, _channels) = match color_space {
1028            Some(PdfObject::Name(cs)) => match cs.0.as_str() {
1029                "DeviceGray" => (1, 1),
1030                "DeviceRGB" => (3, 3),
1031                "DeviceCMYK" => (4, 4),
1032                _ => (3, 3), // Default to RGB
1033            },
1034            Some(PdfObject::Array(cs_array)) if !cs_array.0.is_empty() => {
1035                if let Some(PdfObject::Name(cs_name)) = cs_array.0.first() {
1036                    match cs_name.0.as_str() {
1037                        "ICCBased" | "CalRGB" => (3, 3),
1038                        "CalGray" => (1, 1),
1039                        _ => (3, 3),
1040                    }
1041                } else {
1042                    (3, 3)
1043                }
1044            }
1045            _ => (3, 3), // Default to RGB
1046        };
1047
1048        // Calculate expected data size
1049        let bytes_per_sample = if bits_per_component <= 8 { 1 } else { 2 };
1050        let expected_size = (width * height * components as u32 * bytes_per_sample as u32) as usize;
1051
1052        // Validate data size
1053        if data.len() < expected_size {
1054            return Err(OperationError::ParseError(format!(
1055                "Image data too small: expected {}, got {}",
1056                expected_size,
1057                data.len()
1058            )));
1059        }
1060
1061        // Convert to PNG format using simple PNG encoding
1062        self.create_png_from_raw_data(data, width, height, components, bits_per_component)
1063    }
1064
1065    /// Create PNG from raw pixel data
1066    fn create_png_from_raw_data(
1067        &self,
1068        data: &[u8],
1069        width: u32,
1070        height: u32,
1071        components: u8,
1072        bits_per_component: u8,
1073    ) -> OperationResult<Vec<u8>> {
1074        // Simple PNG creation - create a basic PNG structure
1075        let mut png_data = Vec::new();
1076
1077        // PNG signature
1078        png_data.extend_from_slice(&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]);
1079
1080        // IHDR chunk
1081        let mut ihdr = Vec::new();
1082        ihdr.extend_from_slice(&width.to_be_bytes());
1083        ihdr.extend_from_slice(&height.to_be_bytes());
1084        ihdr.push(bits_per_component);
1085
1086        // Color type: 0 = grayscale, 2 = RGB, 6 = RGBA
1087        let color_type = match components {
1088            1 => 0, // Grayscale
1089            3 => 2, // RGB
1090            4 => 6, // RGBA
1091            _ => 2, // Default to RGB
1092        };
1093        ihdr.push(color_type);
1094        ihdr.push(0); // Compression method
1095        ihdr.push(0); // Filter method
1096        ihdr.push(0); // Interlace method
1097
1098        self.write_png_chunk(&mut png_data, b"IHDR", &ihdr);
1099
1100        // IDAT chunk - compress the image data
1101        let compressed_data = self.compress_image_data(data, width, height, components)?;
1102        self.write_png_chunk(&mut png_data, b"IDAT", &compressed_data);
1103
1104        // IEND chunk
1105        self.write_png_chunk(&mut png_data, b"IEND", &[]);
1106
1107        Ok(png_data)
1108    }
1109
1110    /// Write a PNG chunk with proper CRC
1111    fn write_png_chunk(&self, output: &mut Vec<u8>, chunk_type: &[u8; 4], data: &[u8]) {
1112        // Length (4 bytes, big endian)
1113        output.extend_from_slice(&(data.len() as u32).to_be_bytes());
1114
1115        // Chunk type (4 bytes)
1116        output.extend_from_slice(chunk_type);
1117
1118        // Data
1119        output.extend_from_slice(data);
1120
1121        // CRC (4 bytes, big endian)
1122        let crc = self.calculate_crc32(chunk_type, data);
1123        output.extend_from_slice(&crc.to_be_bytes());
1124    }
1125
1126    /// Simple CRC32 calculation for PNG
1127    fn calculate_crc32(&self, chunk_type: &[u8; 4], data: &[u8]) -> u32 {
1128        // Simple CRC32 - in a real implementation we'd use a proper CRC library
1129        let mut crc: u32 = 0xFFFFFFFF;
1130
1131        // Process chunk type
1132        for &byte in chunk_type {
1133            crc ^= byte as u32;
1134            for _ in 0..8 {
1135                if crc & 1 != 0 {
1136                    crc = (crc >> 1) ^ 0xEDB88320;
1137                } else {
1138                    crc >>= 1;
1139                }
1140            }
1141        }
1142
1143        // Process data
1144        for &byte in data {
1145            crc ^= byte as u32;
1146            for _ in 0..8 {
1147                if crc & 1 != 0 {
1148                    crc = (crc >> 1) ^ 0xEDB88320;
1149                } else {
1150                    crc >>= 1;
1151                }
1152            }
1153        }
1154
1155        crc ^ 0xFFFFFFFF
1156    }
1157
1158    /// Compress image data for PNG IDAT chunk
1159    fn compress_image_data(
1160        &self,
1161        data: &[u8],
1162        width: u32,
1163        height: u32,
1164        components: u8,
1165    ) -> OperationResult<Vec<u8>> {
1166        use flate2::write::ZlibEncoder;
1167        use flate2::Compression;
1168        use std::io::Write;
1169
1170        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
1171
1172        // PNG requires scanline filtering - add filter byte (0 = None) to each row
1173        let bytes_per_pixel = components as usize;
1174        let bytes_per_row = width as usize * bytes_per_pixel;
1175
1176        for row in 0..height {
1177            // Filter byte (0 = no filter)
1178            encoder.write_all(&[0])?;
1179
1180            // Row data
1181            let start = row as usize * bytes_per_row;
1182            let end = start + bytes_per_row;
1183            if end <= data.len() {
1184                encoder.write_all(&data[start..end])?;
1185            }
1186        }
1187
1188        encoder
1189            .finish()
1190            .map_err(|e| OperationError::ParseError(format!("Failed to compress PNG data: {e}")))
1191    }
1192
1193    /// Convert CCITT Fax decoded data to PNG (for scanned documents)
1194    fn convert_ccitt_to_png(
1195        &self,
1196        data: &[u8],
1197        width: u32,
1198        height: u32,
1199    ) -> OperationResult<Vec<u8>> {
1200        // CCITT is typically 1-bit monochrome
1201        // Convert 1-bit to 8-bit grayscale
1202        let mut rgb_data = Vec::new();
1203
1204        // Calculate potential row strides - try multiple alignments
1205        let bits_per_row = width as usize;
1206        let min_bytes_per_row = bits_per_row.div_ceil(8);
1207
1208        // Try different row stride alignments (1, 2, 4, 8, 16 byte alignment)
1209        let possible_strides = [
1210            min_bytes_per_row,              // No padding
1211            (min_bytes_per_row + 1) & !1,   // 2-byte aligned
1212            (min_bytes_per_row + 3) & !3,   // 4-byte aligned
1213            (min_bytes_per_row + 7) & !7,   // 8-byte aligned
1214            (min_bytes_per_row + 15) & !15, // 16-byte aligned
1215        ];
1216
1217        // Try to detect the correct stride by checking data patterns
1218        let correct_stride =
1219            self.detect_correct_row_stride(data, width, height, &possible_strides)?;
1220
1221        for row in 0..height {
1222            let row_start = row as usize * correct_stride;
1223
1224            for col in 0..width {
1225                let byte_idx = row_start + (col as usize / 8);
1226                let bit_idx = 7 - (col as usize % 8);
1227
1228                if byte_idx < data.len() {
1229                    let bit = (data[byte_idx] >> bit_idx) & 1;
1230                    // CCITT: 0 = black, 1 = white
1231                    let gray_value = if bit == 0 { 0 } else { 255 };
1232                    rgb_data.push(gray_value);
1233                } else {
1234                    rgb_data.push(255); // White for missing data
1235                }
1236            }
1237        }
1238
1239        // Create PNG from grayscale data
1240        self.create_png_from_raw_data(&rgb_data, width, height, 1, 8)
1241    }
1242
1243    /// Detect the correct row stride by analyzing data patterns
1244    fn detect_correct_row_stride(
1245        &self,
1246        data: &[u8],
1247        width: u32,
1248        height: u32,
1249        possible_strides: &[usize],
1250    ) -> OperationResult<usize> {
1251        let bits_per_row = width as usize;
1252        let min_bytes_per_row = bits_per_row.div_ceil(8);
1253
1254        // If we don't have enough data for analysis, use minimum stride
1255        if data.len() < min_bytes_per_row * 3 {
1256            return Ok(min_bytes_per_row);
1257        }
1258
1259        // Calculate expected total size for each stride
1260        for &stride in possible_strides {
1261            let expected_size = stride * height as usize;
1262
1263            // If this stride gives us a size close to actual data length, use it
1264            if expected_size <= data.len() && (data.len() - expected_size) < stride * 2 {
1265                // Allow some tolerance
1266
1267                return Ok(stride);
1268            }
1269        }
1270
1271        // If no stride fits perfectly, calculate from data length
1272        let calculated_stride = data.len() / height as usize;
1273        if calculated_stride >= min_bytes_per_row {
1274            return Ok(calculated_stride);
1275        }
1276
1277        // Fallback to minimum
1278        Ok(min_bytes_per_row)
1279    }
1280
1281    /// Check if preprocessing should be applied
1282    #[allow(dead_code)]
1283    fn should_preprocess(&self) -> bool {
1284        self.options.preprocessing.auto_correct_rotation
1285            || self.options.preprocessing.enhance_contrast
1286            || self.options.preprocessing.denoise
1287            || self.options.preprocessing.upscale_small_images
1288            || self.options.preprocessing.force_grayscale
1289    }
1290
1291    /// Apply image preprocessing
1292    #[cfg(feature = "external-images")]
1293    fn preprocess_image_data(
1294        &self,
1295        data: &[u8],
1296        width: u32,
1297        height: u32,
1298        format: ImageFormat,
1299    ) -> OperationResult<Vec<u8>> {
1300        // Load image using the image crate
1301        let img_format = match format {
1302            ImageFormat::Jpeg => ImageLibFormat::Jpeg,
1303            ImageFormat::Png => ImageLibFormat::Png,
1304            ImageFormat::Tiff => ImageLibFormat::Tiff,
1305            ImageFormat::Raw => {
1306                // For raw data, create a simple RGB image
1307                return self.preprocess_raw_image_data(data, width, height);
1308            }
1309        };
1310
1311        let img = image::load_from_memory_with_format(data, img_format)
1312            .map_err(|e| OperationError::ParseError(format!("Failed to load image: {e}")))?;
1313
1314        let mut processed_img = img;
1315
1316        // Apply preprocessing steps
1317        processed_img = self.apply_rotation_correction(processed_img)?;
1318        processed_img = self.apply_contrast_enhancement(processed_img)?;
1319        processed_img = self.apply_noise_reduction(processed_img)?;
1320        processed_img = self.apply_upscaling(processed_img, width, height)?;
1321
1322        if self.options.preprocessing.force_grayscale {
1323            processed_img = DynamicImage::ImageLuma8(processed_img.to_luma8());
1324        }
1325
1326        // Encode back to bytes
1327        let mut output = Vec::new();
1328        processed_img
1329            .write_to(&mut std::io::Cursor::new(&mut output), img_format)
1330            .map_err(|e| OperationError::ParseError(format!("Failed to encode image: {e}")))?;
1331
1332        Ok(output)
1333    }
1334
1335    /// Preprocess raw image data
1336    #[cfg(feature = "external-images")]
1337    fn preprocess_raw_image_data(
1338        &self,
1339        data: &[u8],
1340        width: u32,
1341        height: u32,
1342    ) -> OperationResult<Vec<u8>> {
1343        // Create a simple grayscale image from raw data
1344        if data.len() < (width * height) as usize {
1345            return Err(OperationError::ParseError(
1346                "Raw image data too small".to_string(),
1347            ));
1348        }
1349
1350        let img_buffer = ImageBuffer::<Luma<u8>, Vec<u8>>::from_raw(
1351            width,
1352            height,
1353            data[..(width * height) as usize].to_vec(),
1354        )
1355        .ok_or_else(|| OperationError::ParseError("Failed to create image buffer".to_string()))?;
1356
1357        let img = DynamicImage::ImageLuma8(img_buffer);
1358        let mut processed_img = img;
1359
1360        // Apply preprocessing
1361        processed_img = self.apply_rotation_correction(processed_img)?;
1362        processed_img = self.apply_contrast_enhancement(processed_img)?;
1363        processed_img = self.apply_noise_reduction(processed_img)?;
1364        processed_img = self.apply_upscaling(processed_img, width, height)?;
1365
1366        // Encode to PNG
1367        let mut output = Vec::new();
1368        processed_img
1369            .write_to(&mut std::io::Cursor::new(&mut output), ImageLibFormat::Png)
1370            .map_err(|e| OperationError::ParseError(format!("Failed to encode image: {e}")))?;
1371
1372        Ok(output)
1373    }
1374
1375    /// Auto-detect and correct rotation
1376    #[cfg(feature = "external-images")]
1377    fn apply_rotation_correction(&self, img: DynamicImage) -> OperationResult<DynamicImage> {
1378        if !self.options.preprocessing.auto_correct_rotation {
1379            return Ok(img);
1380        }
1381
1382        // Simple rotation detection based on aspect ratio and content analysis
1383        let (width, height) = img.dimensions();
1384
1385        // If image is wider than it is tall but contains mostly vertical text,
1386        // it might need rotation. This is a simplified heuristic.
1387        if width > height * 2 {
1388            // Likely rotated 90 degrees - try rotating
1389            return Ok(img.rotate90());
1390        }
1391
1392        // For now, return as-is. In a more sophisticated implementation,
1393        // we could use OCR or edge detection to determine optimal rotation.
1394        Ok(img)
1395    }
1396
1397    /// Enhance contrast for better OCR
1398    #[cfg(feature = "external-images")]
1399    fn apply_contrast_enhancement(&self, img: DynamicImage) -> OperationResult<DynamicImage> {
1400        if !self.options.preprocessing.enhance_contrast {
1401            return Ok(img);
1402        }
1403
1404        // Apply histogram equalization by adjusting brightness and contrast
1405        let enhanced = img.adjust_contrast(20.0); // Increase contrast by 20%
1406        Ok(enhanced.brighten(10)) // Slightly brighten
1407    }
1408
1409    /// Apply noise reduction
1410    #[cfg(feature = "external-images")]
1411    fn apply_noise_reduction(&self, img: DynamicImage) -> OperationResult<DynamicImage> {
1412        if !self.options.preprocessing.denoise {
1413            return Ok(img);
1414        }
1415
1416        // Simple blur to reduce noise
1417        Ok(img.blur(0.5))
1418    }
1419
1420    /// Upscale small images for better OCR
1421    #[cfg(feature = "external-images")]
1422    fn apply_upscaling(
1423        &self,
1424        img: DynamicImage,
1425        original_width: u32,
1426        original_height: u32,
1427    ) -> OperationResult<DynamicImage> {
1428        if !self.options.preprocessing.upscale_small_images {
1429            return Ok(img);
1430        }
1431
1432        let min_dimension = original_width.min(original_height);
1433        if min_dimension < self.options.preprocessing.upscale_threshold {
1434            let new_width = original_width * self.options.preprocessing.upscale_factor;
1435            let new_height = original_height * self.options.preprocessing.upscale_factor;
1436
1437            return Ok(img.resize(
1438                new_width,
1439                new_height,
1440                image::imageops::FilterType::CatmullRom,
1441            ));
1442        }
1443
1444        Ok(img)
1445    }
1446}
1447
1448/// Extract all images from a PDF file
1449pub fn extract_images_from_pdf<P: AsRef<Path>>(
1450    input_path: P,
1451    options: ExtractImagesOptions,
1452) -> OperationResult<Vec<ExtractedImage>> {
1453    let document = PdfReader::open_document(input_path)
1454        .map_err(|e| OperationError::ParseError(e.to_string()))?;
1455
1456    let mut extractor = ImageExtractor::new(document, options);
1457    extractor.extract_all()
1458}
1459
1460/// Extract images from specific pages
1461pub fn extract_images_from_pages<P: AsRef<Path>>(
1462    input_path: P,
1463    pages: &[usize],
1464    options: ExtractImagesOptions,
1465) -> OperationResult<Vec<ExtractedImage>> {
1466    let document = PdfReader::open_document(input_path)
1467        .map_err(|e| OperationError::ParseError(e.to_string()))?;
1468
1469    let mut extractor = ImageExtractor::new(document, options);
1470    let mut all_images = Vec::new();
1471
1472    for &page_num in pages {
1473        let page_images = extractor.extract_from_page(page_num)?;
1474        all_images.extend(page_images);
1475    }
1476
1477    Ok(all_images)
1478}
1479
1480#[cfg(test)]
1481mod tests {
1482    use super::*;
1483    use tempfile::TempDir;
1484
1485    #[test]
1486    fn test_extract_options_default() {
1487        let options = ExtractImagesOptions::default();
1488        assert_eq!(options.output_dir, PathBuf::from("."));
1489        assert!(options.extract_inline);
1490        assert_eq!(options.min_size, Some(10));
1491        assert!(options.create_dir);
1492    }
1493
1494    #[test]
1495    fn test_filename_pattern() {
1496        let options = ExtractImagesOptions {
1497            name_pattern: "img_{page}_{index}.{format}".to_string(),
1498            ..Default::default()
1499        };
1500
1501        let pattern = options
1502            .name_pattern
1503            .replace("{page}", "1")
1504            .replace("{index}", "2")
1505            .replace("{format}", "jpg");
1506
1507        assert_eq!(pattern, "img_1_2.jpg");
1508    }
1509
1510    #[test]
1511    fn test_extract_options_custom() {
1512        let temp_dir = TempDir::new().unwrap();
1513        let options = ExtractImagesOptions {
1514            output_dir: temp_dir.path().to_path_buf(),
1515            name_pattern: "custom_{page}_{index}.{format}".to_string(),
1516            extract_inline: false,
1517            min_size: Some(50),
1518            create_dir: false,
1519            preprocessing: ImagePreprocessingOptions::default(),
1520        };
1521
1522        assert_eq!(options.output_dir, temp_dir.path());
1523        assert_eq!(options.name_pattern, "custom_{page}_{index}.{format}");
1524        assert!(!options.extract_inline);
1525        assert_eq!(options.min_size, Some(50));
1526        assert!(!options.create_dir);
1527    }
1528
1529    #[test]
1530    fn test_extract_options_debug_clone() {
1531        let options = ExtractImagesOptions {
1532            output_dir: PathBuf::from("/test/path"),
1533            name_pattern: "test.{format}".to_string(),
1534            extract_inline: true,
1535            min_size: None,
1536            create_dir: true,
1537            preprocessing: ImagePreprocessingOptions::default(),
1538        };
1539
1540        let debug_str = format!("{options:?}");
1541        assert!(debug_str.contains("ExtractImagesOptions"));
1542        assert!(debug_str.contains("/test/path"));
1543
1544        let cloned = options.clone();
1545        assert_eq!(cloned.output_dir, options.output_dir);
1546        assert_eq!(cloned.name_pattern, options.name_pattern);
1547        assert_eq!(cloned.extract_inline, options.extract_inline);
1548        assert_eq!(cloned.min_size, options.min_size);
1549        assert_eq!(cloned.create_dir, options.create_dir);
1550    }
1551
1552    #[test]
1553    fn test_extracted_image_struct() {
1554        let image = ExtractedImage {
1555            page_number: 0,
1556            image_index: 1,
1557            file_path: PathBuf::from("/test/image.jpg"),
1558            width: 100,
1559            height: 200,
1560            format: ImageFormat::Jpeg,
1561        };
1562
1563        assert_eq!(image.page_number, 0);
1564        assert_eq!(image.image_index, 1);
1565        assert_eq!(image.file_path, PathBuf::from("/test/image.jpg"));
1566        assert_eq!(image.width, 100);
1567        assert_eq!(image.height, 200);
1568        assert_eq!(image.format, ImageFormat::Jpeg);
1569    }
1570
1571    #[test]
1572    fn test_extracted_image_debug() {
1573        let image = ExtractedImage {
1574            page_number: 5,
1575            image_index: 3,
1576            file_path: PathBuf::from("output.png"),
1577            width: 512,
1578            height: 768,
1579            format: ImageFormat::Png,
1580        };
1581
1582        let debug_str = format!("{image:?}");
1583        assert!(debug_str.contains("ExtractedImage"));
1584        assert!(debug_str.contains("5"));
1585        assert!(debug_str.contains("3"));
1586        assert!(debug_str.contains("output.png"));
1587        assert!(debug_str.contains("512"));
1588        assert!(debug_str.contains("768"));
1589    }
1590
1591    // Helper function to create minimal valid PDF for testing
1592    fn create_minimal_pdf(temp_file: &std::path::Path) {
1593        let minimal_pdf = b"%PDF-1.7\n\
15941 0 obj\n\
1595<< /Type /Catalog /Pages 2 0 R >>\n\
1596endobj\n\
15972 0 obj\n\
1598<< /Type /Pages /Kids [] /Count 0 >>\n\
1599endobj\n\
1600xref\n\
16010 3\n\
16020000000000 65535 f \n\
16030000000009 00000 n \n\
16040000000055 00000 n \n\
1605trailer\n\
1606<< /Size 3 /Root 1 0 R >>\n\
1607startxref\n\
1608105\n\
1609%%EOF";
1610        std::fs::write(temp_file, minimal_pdf).unwrap();
1611    }
1612
1613    #[test]
1614    fn test_detect_image_format_png() {
1615        // Create a minimal valid PDF document for testing
1616        let temp_dir = TempDir::new().unwrap();
1617        let temp_file = temp_dir.path().join("test.pdf");
1618        create_minimal_pdf(&temp_file);
1619
1620        let document = PdfReader::open_document(&temp_file).unwrap();
1621        let extractor = ImageExtractor::new(document, ExtractImagesOptions::default());
1622
1623        // PNG magic bytes
1624        let png_data = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0DIHDR";
1625        let format = extractor.detect_image_format_from_data(png_data).unwrap();
1626        assert_eq!(format, ImageFormat::Png);
1627    }
1628
1629    #[test]
1630    fn test_detect_image_format_jpeg() {
1631        let temp_dir = TempDir::new().unwrap();
1632        let temp_file = temp_dir.path().join("test.pdf");
1633        create_minimal_pdf(&temp_file);
1634
1635        let document = PdfReader::open_document(&temp_file).unwrap();
1636        let extractor = ImageExtractor::new(document, ExtractImagesOptions::default());
1637
1638        // JPEG magic bytes
1639        let jpeg_data = b"\xFF\xD8\xFF\xE0\x00\x10JFIF";
1640        let format = extractor.detect_image_format_from_data(jpeg_data).unwrap();
1641        assert_eq!(format, ImageFormat::Jpeg);
1642    }
1643
/// A little-endian TIFF header (`II` byte-order mark) must be reported as TIFF.
#[test]
fn test_detect_image_format_tiff_little_endian() {
    let scratch = TempDir::new().unwrap();
    let pdf_path = scratch.path().join("test.pdf");
    create_minimal_pdf(&pdf_path);

    let extractor = ImageExtractor::new(
        PdfReader::open_document(&pdf_path).unwrap(),
        ExtractImagesOptions::default(),
    );

    // TIFF little endian magic bytes
    let header = b"II\x2A\x00\x08\x00\x00\x00";
    let detected = extractor.detect_image_format_from_data(header).unwrap();
    assert_eq!(detected, ImageFormat::Tiff);
}
1658
/// A big-endian TIFF header (`MM` byte-order mark) must be reported as TIFF.
#[test]
fn test_detect_image_format_tiff_big_endian() {
    let workspace = TempDir::new().unwrap();
    let pdf_file = workspace.path().join("test.pdf");
    create_minimal_pdf(&pdf_file);
    let parsed = PdfReader::open_document(&pdf_file).unwrap();

    let extractor = ImageExtractor::new(parsed, ExtractImagesOptions::default());

    // TIFF big endian magic bytes
    let detected = extractor
        .detect_image_format_from_data(b"MM\x00\x2A\x00\x00\x00\x08")
        .unwrap();
    assert_eq!(detected, ImageFormat::Tiff);
}
1673
/// Bytes that match none of the known signatures are expected to fall back to PNG.
#[test]
fn test_detect_image_format_unknown() {
    let scratch = TempDir::new().unwrap();
    let pdf_path = scratch.path().join("test.pdf");
    create_minimal_pdf(&pdf_path);

    let extractor = ImageExtractor::new(
        PdfReader::open_document(&pdf_path).unwrap(),
        ExtractImagesOptions::default(),
    );

    // Nine bytes that do not start with a PNG, JPEG or TIFF signature
    let unrecognized = b"\x00\x01\x02\x03\x04\x05\x06\x07\x08";
    let detected = extractor.detect_image_format_from_data(unrecognized).unwrap();

    // Default fallback
    assert_eq!(detected, ImageFormat::Png);
}
1690
/// A single byte is too little to detect a format: the call must fail with a
/// `ParseError` whose message mentions "too short".
#[test]
fn test_detect_image_format_short_data() {
    let scratch = TempDir::new().unwrap();
    let pdf_path = scratch.path().join("test.pdf");
    create_minimal_pdf(&pdf_path);

    let parsed = PdfReader::open_document(&pdf_path).unwrap();
    let extractor = ImageExtractor::new(parsed, ExtractImagesOptions::default());

    // Only one byte (less than 2 bytes)
    let result = extractor.detect_image_format_from_data(b"\xFF");
    assert!(result.is_err());

    // The error must be the ParseError variant, not any other failure.
    if let Err(OperationError::ParseError(msg)) = result {
        assert!(msg.contains("too short"));
    } else {
        panic!("Expected ParseError");
    }
}
1711
/// Expands `{page}`, `{index}` and `{format}` in a name pattern with `str::replace`,
/// including a placeholder that appears twice.
#[test]
fn test_filename_pattern_replacements() {
    let options = ExtractImagesOptions {
        name_pattern: String::from("page_{page}_img_{index}_{format}.{format}"),
        ..Default::default()
    };

    // Substitutions are applied in order: page, index, format.
    let substitutions = [("{page}", "10"), ("{index}", "5"), ("{format}", "png")];
    let pattern = substitutions.iter().fold(
        options.name_pattern.clone(),
        |name, &(placeholder, value)| name.replace(placeholder, value),
    );

    assert_eq!(pattern, "page_10_img_5_png.png");
}
1727
/// Options built with `min_size: None` must keep that field unset.
#[test]
fn test_extract_options_no_min_size() {
    let unfiltered = ExtractImagesOptions {
        min_size: None,
        ..Default::default()
    };

    assert!(unfiltered.min_size.is_none());
}
1737
/// Checks that `output_dir` and `create_dir` are stored as given, and that
/// merely constructing the options leaves the target directory absent.
#[test]
fn test_create_output_directory() {
    let scratch = TempDir::new().unwrap();
    let target_dir = scratch.path().join("new_dir");

    let options = ExtractImagesOptions {
        create_dir: true,
        output_dir: target_dir.clone(),
        ..Default::default()
    };

    // No extractor is run here, so nothing has created the directory yet.
    assert!(!target_dir.exists());
    assert_eq!(options.output_dir, target_dir);
    assert!(options.create_dir);
}
1754
/// Placeholder expansion must leave literal `-`, `_` and `.` characters in the pattern intact.
#[test]
fn test_pattern_with_special_chars() {
    let options = ExtractImagesOptions {
        name_pattern: String::from("img-{page}_{index}.{format}"),
        ..Default::default()
    };

    // Expand one placeholder per step, shadowing the intermediate result.
    let pattern = options.name_pattern.replace("{page}", "1");
    let pattern = pattern.replace("{index}", "1");
    let pattern = pattern.replace("{format}", "jpg");

    assert_eq!(pattern, "img-1_1.jpg");
}
1770
/// Pairs each image format with the file extension expected for it and checks
/// a local format-to-extension mapping against those pairs.
#[test]
fn test_multiple_format_extensions() {
    let cases = [
        (ImageFormat::Jpeg, "jpg"),
        (ImageFormat::Png, "png"),
        (ImageFormat::Tiff, "tiff"),
    ];

    for (format, expected_ext) in cases.iter() {
        // Exhaustive on purpose: `Raw` is mapped even though no case exercises it.
        let extension = match format {
            ImageFormat::Jpeg => "jpg",
            ImageFormat::Png => "png",
            ImageFormat::Tiff => "tiff",
            ImageFormat::Raw => "raw",
        };
        assert_eq!(extension, *expected_ext);
    }
}
1789
/// `extract_inline` is enabled by default and can be switched off.
#[test]
fn test_extract_inline_option() {
    let defaults = ExtractImagesOptions::default();
    assert!(defaults.extract_inline);

    // Derive a copy of the defaults with only the inline flag turned off.
    let disabled = ExtractImagesOptions {
        extract_inline: false,
        ..defaults
    };
    assert!(!disabled.extract_inline);
}
1798
/// `min_size` must round-trip through the options both when set and when unset.
#[test]
fn test_min_size_filtering() {
    // With a threshold configured
    let options_with_min = ExtractImagesOptions {
        min_size: Some(100),
        ..Default::default()
    };
    assert_eq!(options_with_min.min_size, Some(100));

    // With the threshold explicitly disabled
    let options_no_min = ExtractImagesOptions {
        min_size: None,
        ..Default::default()
    };
    assert!(options_no_min.min_size.is_none());
}
1814
/// Joining the expanded file name onto `output_dir` must yield the full output path.
#[test]
fn test_output_path_combinations() {
    let options = ExtractImagesOptions {
        output_dir: PathBuf::from("/output"),
        name_pattern: String::from("img_{page}_{index}.{format}"),
        ..Default::default()
    };

    // Expand the placeholders one at a time: page, index, format.
    let filename = options.name_pattern.replace("{page}", "1");
    let filename = filename.replace("{index}", "2");
    let filename = filename.replace("{format}", "png");

    let full_path = options.output_dir.join(&filename);
    assert_eq!(full_path, PathBuf::from("/output/img_1_2.png"));
}
1833
/// A pattern that contains no placeholders must come through the replacement chain unchanged.
#[test]
fn test_pattern_without_placeholders() {
    let options = ExtractImagesOptions {
        name_pattern: String::from("static_name.jpg"),
        ..Default::default()
    };

    let substitutions = [("{page}", "1"), ("{index}", "2"), ("{format}", "png")];
    let pattern = substitutions.iter().fold(
        options.name_pattern.clone(),
        |name, &(placeholder, value)| name.replace(placeholder, value),
    );

    // No placeholders replaced
    assert_eq!(pattern, "static_name.jpg");
}
1849
/// Probes format detection with an empty buffer and with buffers that hold
/// exactly the 8, 4 and 2 signature bytes of PNG, TIFF and JPEG.
#[test]
fn test_detect_format_edge_cases() {
    let scratch = TempDir::new().unwrap();
    let pdf_path = scratch.path().join("test.pdf");
    create_minimal_pdf(&pdf_path);

    let parsed = PdfReader::open_document(&pdf_path).unwrap();
    let extractor = ImageExtractor::new(parsed, ExtractImagesOptions::default());

    // Empty data
    assert!(extractor.detect_image_format_from_data(b"").is_err());

    let cases: [(&[u8], ImageFormat); 3] = [
        // Data exactly 8 bytes (minimum for PNG check)
        (&b"\x89PNG\r\n\x1a\n"[..], ImageFormat::Png),
        // Data exactly 4 bytes (minimum for TIFF check)
        (&b"II\x2A\x00"[..], ImageFormat::Tiff),
        // Data exactly 2 bytes (minimum for JPEG check) - JPEG only needs 2 bytes
        (&b"\xFF\xD8"[..], ImageFormat::Jpeg),
    ];

    for (data, expected) in cases.iter() {
        let format = extractor.detect_image_format_from_data(data).unwrap();
        assert_eq!(format, *expected);
    }
}
1878
/// A pattern with directory separators and repeated placeholders must expand every occurrence.
#[test]
fn test_complex_filename_pattern() {
    let options = ExtractImagesOptions {
        name_pattern: String::from("{format}/page{page}/image_{index}_{page}.{format}"),
        ..Default::default()
    };

    // Expand one placeholder per step: page, index, format.
    let pattern = options.name_pattern.replace("{page}", "5");
    let pattern = pattern.replace("{index}", "3");
    let pattern = pattern.replace("{format}", "jpeg");

    assert_eq!(pattern, "jpeg/page5/image_3_5.jpeg");
}
1894
/// `ExtractedImage` must hold the width and height it was built with, for small and large values.
#[test]
fn test_image_dimensions() {
    let small_image = ExtractedImage {
        file_path: PathBuf::from("small.jpg"),
        format: ImageFormat::Jpeg,
        page_number: 0,
        image_index: 0,
        width: 5,
        height: 5,
    };
    assert_eq!((small_image.width, small_image.height), (5, 5));

    let large_image = ExtractedImage {
        file_path: PathBuf::from("large.jpg"),
        format: ImageFormat::Jpeg,
        page_number: 0,
        image_index: 1,
        width: 2000,
        height: 3000,
    };
    assert_eq!((large_image.width, large_image.height), (2000, 3000));
}
1920
/// `ExtractedImage` must hold the page number and image index it was built with,
/// both at zero and at larger values.
#[test]
fn test_page_and_index_numbering() {
    // First page, first image (both 0-indexed)
    let image1 = ExtractedImage {
        file_path: PathBuf::from("first.jpg"),
        format: ImageFormat::Jpeg,
        page_number: 0,
        image_index: 0,
        width: 100,
        height: 100,
    };
    assert_eq!((image1.page_number, image1.image_index), (0, 0));

    // Large page number and large index
    let image2 = ExtractedImage {
        file_path: PathBuf::from("last.jpg"),
        format: ImageFormat::Jpeg,
        page_number: 99,
        image_index: 255,
        width: 100,
        height: 100,
    };
    assert_eq!((image2.page_number, image2.image_index), (99, 255));
}
1947}
1948
1949#[cfg(test)]
1950#[path = "extract_images_tests.rs"]
1951mod extract_images_tests;
oxidize_pdf/operations/extract_images.rs

oxidize_pdf/operations/
extract_images.rs