Skip to main content

pdfluent_extract/
images.rs

1//! Image extraction from PDF documents.
2//!
3//! Iterates over all XObject Image streams and decodes them based on their filter.
4
5use crate::error::{ExtractError, Result};
6use lopdf::{Document, Object, ObjectId};
7use std::collections::BTreeMap;
8use std::io::Read;
9
/// Compression scheme declared by an image stream's `/Filter` entry.
///
/// Each variant corresponds to one PDF filter name; anything that is not
/// recognised is preserved verbatim in [`ImageFilter::Unknown`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ImageFilter {
    /// `DCTDecode` — JPEG data.
    Jpeg,
    /// `FlateDecode` — zlib/deflate data.
    Flate,
    /// `JBIG2Decode` — JBIG2 data.
    Jbig2,
    /// `JPXDecode` — JPEG 2000 data.
    Jpx,
    /// `CCITTFaxDecode` — CCITT fax data.
    CcittFax,
    /// No filter: the stream holds uncompressed samples.
    Raw,
    /// A filter name this module does not recognise (lossily decoded as UTF-8).
    Unknown(String),
}
28
29/// An image extracted from a PDF document.
30#[derive(Debug, Clone)]
31pub struct ExtractedImage {
32    /// The PDF object ID of the image stream.
33    pub object_id: ObjectId,
34    /// The page number (1-based) where this image appears.
35    pub page: u32,
36    /// Image width in pixels.
37    pub width: u32,
38    /// Image height in pixels.
39    pub height: u32,
40    /// Bits per color component.
41    pub bits_per_component: u32,
42    /// Color space name (e.g., "DeviceRGB", "DeviceGray").
43    pub color_space: String,
44    /// The compression filter used.
45    pub filter: ImageFilter,
46    /// The decoded (or raw) image data.
47    pub data: Vec<u8>,
48}
49
50/// Extract all images from all pages of a PDF document.
51pub fn extract_all_images(doc: &Document) -> Result<Vec<ExtractedImage>> {
52    let page_map = build_page_image_map(doc);
53    let mut images = Vec::new();
54
55    for (page_num, obj_ids) in &page_map {
56        for &obj_id in obj_ids {
57            if let Ok(img) = decode_image(doc, obj_id, *page_num) {
58                images.push(img);
59            }
60        }
61    }
62
63    Ok(images)
64}
65
66/// Extract images from a specific page (1-based page number).
67pub fn extract_page_images(doc: &Document, page_num: u32) -> Result<Vec<ExtractedImage>> {
68    let pages = doc.get_pages();
69    let total = pages.len() as u32;
70
71    if page_num == 0 || page_num > total {
72        return Err(ExtractError::PageOutOfRange(page_num, total));
73    }
74
75    let page_id = *pages
76        .get(&page_num)
77        .ok_or(ExtractError::PageOutOfRange(page_num, total))?;
78
79    extract_images_from_page_id(doc, page_id, page_num)
80}
81
82/// Extract images from a page identified by its object ID.
83///
84/// Unlike [`extract_page_images`], this function does not call `get_pages()`
85/// internally. Use this when the caller already holds a page map from a
86/// prior `doc.get_pages()` call to avoid redundant page-tree traversals.
87/// Fixes #447: eliminates the double `get_pages()` call in the images test.
88pub fn extract_images_from_page_id(
89    doc: &Document,
90    page_id: lopdf::ObjectId,
91    page_num: u32,
92) -> Result<Vec<ExtractedImage>> {
93    let obj_ids = collect_page_xobject_ids(doc, page_id);
94    let mut images = Vec::new();
95
96    for obj_id in obj_ids {
97        if let Ok(img) = decode_image(doc, obj_id, page_num) {
98            images.push(img);
99        }
100    }
101
102    Ok(images)
103}
104
105/// Check if a stream object is an image XObject.
106fn is_image_stream(dict: &lopdf::Dictionary) -> bool {
107    if let Ok(subtype) = dict.get(b"Subtype") {
108        if let Ok(name) = subtype.as_name() {
109            return name == b"Image";
110        }
111    }
112    false
113}
114
115/// Get the filter from a stream dictionary.
116fn get_filter(dict: &lopdf::Dictionary) -> ImageFilter {
117    if let Ok(filter_obj) = dict.get(b"Filter") {
118        match filter_obj {
119            Object::Name(name) => filter_from_name(name),
120            Object::Array(arr) => {
121                // Use the first filter in the array.
122                if let Some(Object::Name(name)) = arr.first() {
123                    filter_from_name(name)
124                } else {
125                    ImageFilter::Raw
126                }
127            }
128            _ => ImageFilter::Raw,
129        }
130    } else {
131        ImageFilter::Raw
132    }
133}
134
135/// Convert a filter name to an `ImageFilter` variant.
136fn filter_from_name(name: &[u8]) -> ImageFilter {
137    match name {
138        b"DCTDecode" => ImageFilter::Jpeg,
139        b"FlateDecode" => ImageFilter::Flate,
140        b"JBIG2Decode" => ImageFilter::Jbig2,
141        b"JPXDecode" => ImageFilter::Jpx,
142        b"CCITTFaxDecode" => ImageFilter::CcittFax,
143        _ => ImageFilter::Unknown(String::from_utf8_lossy(name).to_string()),
144    }
145}
146
147/// Get color space from an image stream dictionary.
148fn get_color_space(dict: &lopdf::Dictionary) -> String {
149    if let Ok(cs) = dict.get(b"ColorSpace") {
150        match cs {
151            Object::Name(name) => String::from_utf8_lossy(name).to_string(),
152            Object::Array(arr) => {
153                if let Some(Object::Name(name)) = arr.first() {
154                    String::from_utf8_lossy(name).to_string()
155                } else {
156                    "Unknown".to_string()
157                }
158            }
159            _ => "Unknown".to_string(),
160        }
161    } else {
162        "Unknown".to_string()
163    }
164}
165
166/// Get an integer value from a dictionary key.
167fn get_int(dict: &lopdf::Dictionary, key: &[u8]) -> u32 {
168    dict.get(key)
169        .ok()
170        .and_then(|v| match v {
171            Object::Integer(i) => Some(*i as u32),
172            _ => None,
173        })
174        .unwrap_or(0)
175}
176
177/// Decode an image from a PDF object.
178fn decode_image(doc: &Document, obj_id: ObjectId, page: u32) -> Result<ExtractedImage> {
179    let obj = doc
180        .get_object(obj_id)
181        .map_err(|e| ExtractError::Other(format!("object not found: {e}")))?;
182
183    let stream = match obj {
184        Object::Stream(ref s) => s,
185        _ => return Err(ExtractError::Other("not a stream object".into())),
186    };
187
188    let dict = &stream.dict;
189    if !is_image_stream(dict) {
190        return Err(ExtractError::Other("not an image stream".into()));
191    }
192
193    let width = get_int(dict, b"Width");
194    let height = get_int(dict, b"Height");
195    let bits_per_component = get_int(dict, b"BitsPerComponent");
196    let color_space = get_color_space(dict);
197    let filter = get_filter(dict);
198
199    let data = match filter {
200        ImageFilter::Jpeg | ImageFilter::Jbig2 | ImageFilter::Jpx | ImageFilter::CcittFax => {
201            // For these formats, return the raw compressed bytes.
202            get_raw_stream_bytes(stream)
203        }
204        ImageFilter::Flate => {
205            let raw = get_raw_stream_bytes(stream);
206            decompress_flate(&raw).unwrap_or(raw)
207        }
208        ImageFilter::Raw => get_raw_stream_bytes(stream),
209        ImageFilter::Unknown(_) => get_raw_stream_bytes(stream),
210    };
211
212    Ok(ExtractedImage {
213        object_id: obj_id,
214        page,
215        width,
216        height,
217        bits_per_component,
218        color_space,
219        filter,
220        data,
221    })
222}
223
224/// Get raw bytes from a stream (without decompression).
225fn get_raw_stream_bytes(stream: &lopdf::Stream) -> Vec<u8> {
226    stream.content.clone()
227}
228
229/// Get decompressed stream bytes using lopdf's built-in decompression.
230#[allow(dead_code)]
231fn get_stream_bytes(stream: &lopdf::Stream, _doc: &Document) -> Vec<u8> {
232    let mut s = stream.clone();
233    if s.decompress().is_ok() {
234        s.content.clone()
235    } else {
236        stream.content.clone()
237    }
238}
239
240/// Maximum bytes to decompress from a single image stream (64 MB).
241/// Prevents zip-bomb hangs on pathological PDFs.
242const FLATE_MAX_DECOMPRESS_BYTES: u64 = 64 * 1024 * 1024;
243
244/// Decompress flate-encoded data, capped at `FLATE_MAX_DECOMPRESS_BYTES`.
245fn decompress_flate(data: &[u8]) -> std::result::Result<Vec<u8>, std::io::Error> {
246    let decoder = flate2::read::ZlibDecoder::new(data);
247    let mut decoded = Vec::new();
248    decoder
249        .take(FLATE_MAX_DECOMPRESS_BYTES)
250        .read_to_end(&mut decoded)?;
251    Ok(decoded)
252}
253
254/// Build a map of page number -> list of image object IDs.
255fn build_page_image_map(doc: &Document) -> BTreeMap<u32, Vec<ObjectId>> {
256    let mut map = BTreeMap::new();
257    let pages = doc.get_pages();
258
259    for (&page_num, &page_id) in &pages {
260        let ids = collect_page_xobject_ids(doc, page_id);
261        if !ids.is_empty() {
262            map.insert(page_num, ids);
263        }
264    }
265
266    map
267}
268
269/// Collect all XObject IDs referenced by a page that are image streams.
270fn collect_page_xobject_ids(doc: &Document, page_id: ObjectId) -> Vec<ObjectId> {
271    let mut ids = Vec::new();
272
273    let page_obj = match doc.get_object(page_id) {
274        Ok(obj) => obj,
275        Err(_) => return ids,
276    };
277
278    let page_dict = match page_obj {
279        Object::Dictionary(ref d) => d,
280        _ => return ids,
281    };
282
283    // Get Resources dict.
284    let resources = match page_dict.get(b"Resources") {
285        Ok(res_obj) => match res_obj {
286            Object::Dictionary(ref d) => d.clone(),
287            Object::Reference(r) => match doc.get_object(*r) {
288                Ok(Object::Dictionary(ref d)) => d.clone(),
289                _ => return ids,
290            },
291            _ => return ids,
292        },
293        Err(_) => return ids,
294    };
295
296    // Get XObject dict from Resources.
297    let xobjects = match resources.get(b"XObject") {
298        Ok(xo) => match xo {
299            Object::Dictionary(ref d) => d.clone(),
300            Object::Reference(r) => match doc.get_object(*r) {
301                Ok(Object::Dictionary(ref d)) => d.clone(),
302                _ => return ids,
303            },
304            _ => return ids,
305        },
306        Err(_) => return ids,
307    };
308
309    for (_name, obj) in xobjects.iter() {
310        if let Object::Reference(obj_id) = obj {
311            if let Ok(Object::Stream(ref stream)) = doc.get_object(*obj_id) {
312                if is_image_stream(&stream.dict) {
313                    ids.push(*obj_id);
314                }
315            }
316        }
317    }
318
319    ids
320}
321
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{dictionary, Document, Object, Stream};

    /// Helper: build an 8-bit `DeviceRGB` image XObject stream.
    ///
    /// Passing `None` for `filter` leaves `/Filter` out of the dictionary.
    fn image_stream(width: i64, height: i64, filter: Option<&str>, data: Vec<u8>) -> Stream {
        let mut dict = dictionary! {
            "Type" => "XObject",
            "Subtype" => "Image",
            "Width" => width,
            "Height" => height,
            "BitsPerComponent" => 8_i64,
            "ColorSpace" => "DeviceRGB",
        };
        if let Some(name) = filter {
            dict.set("Filter", Object::Name(name.as_bytes().to_vec()));
        }
        Stream::new(dict, data)
    }

    /// Helper: assemble a minimal one-page document.
    ///
    /// When `image` is given it is registered as `/Im0` in the page's own
    /// `/Resources`; otherwise the page has no `/Resources` entry at all.
    fn single_page_doc(image: Option<Stream>, content: &[u8]) -> Document {
        let mut doc = Document::with_version("1.7");

        let mut page_dict = dictionary! {
            "Type" => "Page",
            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        };

        if let Some(img_stream) = image {
            let img_id = doc.add_object(Object::Stream(img_stream));
            let xobject_dict = dictionary! {
                "Im0" => Object::Reference(img_id),
            };
            let resources_dict = dictionary! {
                "XObject" => Object::Dictionary(xobject_dict),
            };
            page_dict.set("Resources", Object::Dictionary(resources_dict));
        }

        let content_stream = Stream::new(dictionary! {}, content.to_vec());
        let content_id = doc.add_object(Object::Stream(content_stream));
        page_dict.set("Contents", Object::Reference(content_id));
        let page_id = doc.add_object(Object::Dictionary(page_dict));

        let pages_dict = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![Object::Reference(page_id)],
            "Count" => 1_i64,
        };
        let pages_id = doc.add_object(Object::Dictionary(pages_dict));

        // The page needs a back-link to its parent for page-tree traversal.
        if let Ok(Object::Dictionary(d)) = doc.get_object_mut(page_id) {
            d.set("Parent", Object::Reference(pages_id));
        }

        let catalog = dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(pages_id),
        };
        let catalog_id = doc.add_object(Object::Dictionary(catalog));
        doc.trailer.set("Root", Object::Reference(catalog_id));

        doc
    }

    /// Helper: create a minimal PDF document with a JPEG image on page 1.
    fn make_doc_with_jpeg_image() -> Document {
        let img = image_stream(100, 50, Some("DCTDecode"), vec![0xFF, 0xD8, 0xFF, 0xE0]);
        single_page_doc(Some(img), b"q 100 0 0 50 0 0 cm /Im0 Do Q")
    }

    /// Helper: create a doc with a raw (uncompressed) image.
    fn make_doc_with_raw_image() -> Document {
        // 2x2 RGB image = 12 bytes.
        let img = image_stream(2, 2, None, vec![255; 12]);
        single_page_doc(Some(img), b"q 2 0 0 2 0 0 cm /Im0 Do Q")
    }

    /// Helper: create a doc with a flate-compressed image.
    fn make_doc_with_flate_image() -> Document {
        // Compress raw data with flate.
        let raw_data = vec![128u8; 12]; // 2x2 RGB
        let mut encoder =
            flate2::write::ZlibEncoder::new(Vec::new(), flate2::Compression::default());
        std::io::Write::write_all(&mut encoder, &raw_data).unwrap();
        let compressed = encoder.finish().unwrap();

        let img = image_stream(2, 2, Some("FlateDecode"), compressed);
        single_page_doc(Some(img), b"q 2 0 0 2 0 0 cm /Im0 Do Q")
    }

    #[test]
    fn extract_jpeg_image() {
        let doc = make_doc_with_jpeg_image();
        let images = extract_all_images(&doc).unwrap();
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].width, 100);
        assert_eq!(images[0].height, 50);
        assert_eq!(images[0].bits_per_component, 8);
        assert_eq!(images[0].color_space, "DeviceRGB");
        assert_eq!(images[0].filter, ImageFilter::Jpeg);
        assert_eq!(images[0].page, 1);
        // JPEG data is passed through still encoded.
        assert_eq!(images[0].data, vec![0xFF_u8, 0xD8, 0xFF, 0xE0]);
    }

    #[test]
    fn extract_from_specific_page() {
        let doc = make_doc_with_jpeg_image();
        let images = extract_page_images(&doc, 1).unwrap();
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].filter, ImageFilter::Jpeg);
    }

    #[test]
    fn extract_page_out_of_range() {
        let doc = make_doc_with_jpeg_image();
        let result = extract_page_images(&doc, 5);
        assert!(result.is_err());
    }

    #[test]
    fn extract_page_zero_is_out_of_range() {
        // Page numbers are 1-based, so 0 is never valid.
        let doc = make_doc_with_jpeg_image();
        assert!(extract_page_images(&doc, 0).is_err());
    }

    #[test]
    fn extract_raw_image() {
        let doc = make_doc_with_raw_image();
        let images = extract_all_images(&doc).unwrap();
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].filter, ImageFilter::Raw);
        assert_eq!(images[0].data.len(), 12);
    }

    #[test]
    fn extract_flate_compressed_image() {
        let doc = make_doc_with_flate_image();
        let images = extract_all_images(&doc).unwrap();
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].filter, ImageFilter::Flate);
        // After decompression, we should have the original 12 bytes (2x2 RGB).
        assert_eq!(images[0].data, vec![128u8; 12]);
    }

    #[test]
    fn no_images_returns_empty() {
        let doc = single_page_doc(None, b"BT /F1 12 Tf (Hello) Tj ET");
        let images = extract_all_images(&doc).unwrap();
        assert!(images.is_empty());
    }
}