1use crate::error::{ExtractError, Result};
6use lopdf::{Document, Object, ObjectId};
7use std::collections::BTreeMap;
8use std::io::Read;
9
/// Encoding of an image stream, derived from the stream dictionary's `Filter` entry.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ImageFilter {
    /// `DCTDecode` filter.
    Jpeg,
    /// `FlateDecode` filter.
    Flate,
    /// `JBIG2Decode` filter.
    Jbig2,
    /// `JPXDecode` filter.
    Jpx,
    /// `CCITTFaxDecode` filter.
    CcittFax,
    /// No usable filter name was found on the stream dictionary.
    Raw,
    /// Any other filter name, kept as lossily decoded UTF-8 text.
    Unknown(String),
}
28
29#[derive(Debug, Clone)]
31pub struct ExtractedImage {
32 pub object_id: ObjectId,
34 pub page: u32,
36 pub width: u32,
38 pub height: u32,
40 pub bits_per_component: u32,
42 pub color_space: String,
44 pub filter: ImageFilter,
46 pub data: Vec<u8>,
48}
49
50pub fn extract_all_images(doc: &Document) -> Result<Vec<ExtractedImage>> {
52 let page_map = build_page_image_map(doc);
53 let mut images = Vec::new();
54
55 for (page_num, obj_ids) in &page_map {
56 for &obj_id in obj_ids {
57 if let Ok(img) = decode_image(doc, obj_id, *page_num) {
58 images.push(img);
59 }
60 }
61 }
62
63 Ok(images)
64}
65
66pub fn extract_page_images(doc: &Document, page_num: u32) -> Result<Vec<ExtractedImage>> {
68 let pages = doc.get_pages();
69 let total = pages.len() as u32;
70
71 if page_num == 0 || page_num > total {
72 return Err(ExtractError::PageOutOfRange(page_num, total));
73 }
74
75 let page_id = *pages
76 .get(&page_num)
77 .ok_or(ExtractError::PageOutOfRange(page_num, total))?;
78
79 extract_images_from_page_id(doc, page_id, page_num)
80}
81
82pub fn extract_images_from_page_id(
89 doc: &Document,
90 page_id: lopdf::ObjectId,
91 page_num: u32,
92) -> Result<Vec<ExtractedImage>> {
93 let obj_ids = collect_page_xobject_ids(doc, page_id);
94 let mut images = Vec::new();
95
96 for obj_id in obj_ids {
97 if let Ok(img) = decode_image(doc, obj_id, page_num) {
98 images.push(img);
99 }
100 }
101
102 Ok(images)
103}
104
105fn is_image_stream(dict: &lopdf::Dictionary) -> bool {
107 if let Ok(subtype) = dict.get(b"Subtype") {
108 if let Ok(name) = subtype.as_name() {
109 return name == b"Image";
110 }
111 }
112 false
113}
114
115fn get_filter(dict: &lopdf::Dictionary) -> ImageFilter {
117 if let Ok(filter_obj) = dict.get(b"Filter") {
118 match filter_obj {
119 Object::Name(name) => filter_from_name(name),
120 Object::Array(arr) => {
121 if let Some(Object::Name(name)) = arr.first() {
123 filter_from_name(name)
124 } else {
125 ImageFilter::Raw
126 }
127 }
128 _ => ImageFilter::Raw,
129 }
130 } else {
131 ImageFilter::Raw
132 }
133}
134
135fn filter_from_name(name: &[u8]) -> ImageFilter {
137 match name {
138 b"DCTDecode" => ImageFilter::Jpeg,
139 b"FlateDecode" => ImageFilter::Flate,
140 b"JBIG2Decode" => ImageFilter::Jbig2,
141 b"JPXDecode" => ImageFilter::Jpx,
142 b"CCITTFaxDecode" => ImageFilter::CcittFax,
143 _ => ImageFilter::Unknown(String::from_utf8_lossy(name).to_string()),
144 }
145}
146
147fn get_color_space(dict: &lopdf::Dictionary) -> String {
149 if let Ok(cs) = dict.get(b"ColorSpace") {
150 match cs {
151 Object::Name(name) => String::from_utf8_lossy(name).to_string(),
152 Object::Array(arr) => {
153 if let Some(Object::Name(name)) = arr.first() {
154 String::from_utf8_lossy(name).to_string()
155 } else {
156 "Unknown".to_string()
157 }
158 }
159 _ => "Unknown".to_string(),
160 }
161 } else {
162 "Unknown".to_string()
163 }
164}
165
166fn get_int(dict: &lopdf::Dictionary, key: &[u8]) -> u32 {
168 dict.get(key)
169 .ok()
170 .and_then(|v| match v {
171 Object::Integer(i) => Some(*i as u32),
172 _ => None,
173 })
174 .unwrap_or(0)
175}
176
177fn decode_image(doc: &Document, obj_id: ObjectId, page: u32) -> Result<ExtractedImage> {
179 let obj = doc
180 .get_object(obj_id)
181 .map_err(|e| ExtractError::Other(format!("object not found: {e}")))?;
182
183 let stream = match obj {
184 Object::Stream(ref s) => s,
185 _ => return Err(ExtractError::Other("not a stream object".into())),
186 };
187
188 let dict = &stream.dict;
189 if !is_image_stream(dict) {
190 return Err(ExtractError::Other("not an image stream".into()));
191 }
192
193 let width = get_int(dict, b"Width");
194 let height = get_int(dict, b"Height");
195 let bits_per_component = get_int(dict, b"BitsPerComponent");
196 let color_space = get_color_space(dict);
197 let filter = get_filter(dict);
198
199 let data = match filter {
200 ImageFilter::Jpeg | ImageFilter::Jbig2 | ImageFilter::Jpx | ImageFilter::CcittFax => {
201 get_raw_stream_bytes(stream)
203 }
204 ImageFilter::Flate => {
205 let raw = get_raw_stream_bytes(stream);
206 decompress_flate(&raw).unwrap_or(raw)
207 }
208 ImageFilter::Raw => get_raw_stream_bytes(stream),
209 ImageFilter::Unknown(_) => get_raw_stream_bytes(stream),
210 };
211
212 Ok(ExtractedImage {
213 object_id: obj_id,
214 page,
215 width,
216 height,
217 bits_per_component,
218 color_space,
219 filter,
220 data,
221 })
222}
223
224fn get_raw_stream_bytes(stream: &lopdf::Stream) -> Vec<u8> {
226 stream.content.clone()
227}
228
229#[allow(dead_code)]
231fn get_stream_bytes(stream: &lopdf::Stream, _doc: &Document) -> Vec<u8> {
232 let mut s = stream.clone();
233 if s.decompress().is_ok() {
234 s.content.clone()
235 } else {
236 stream.content.clone()
237 }
238}
239
240const FLATE_MAX_DECOMPRESS_BYTES: u64 = 64 * 1024 * 1024;
243
244fn decompress_flate(data: &[u8]) -> std::result::Result<Vec<u8>, std::io::Error> {
246 let decoder = flate2::read::ZlibDecoder::new(data);
247 let mut decoded = Vec::new();
248 decoder
249 .take(FLATE_MAX_DECOMPRESS_BYTES)
250 .read_to_end(&mut decoded)?;
251 Ok(decoded)
252}
253
254fn build_page_image_map(doc: &Document) -> BTreeMap<u32, Vec<ObjectId>> {
256 let mut map = BTreeMap::new();
257 let pages = doc.get_pages();
258
259 for (&page_num, &page_id) in &pages {
260 let ids = collect_page_xobject_ids(doc, page_id);
261 if !ids.is_empty() {
262 map.insert(page_num, ids);
263 }
264 }
265
266 map
267}
268
269fn collect_page_xobject_ids(doc: &Document, page_id: ObjectId) -> Vec<ObjectId> {
271 let mut ids = Vec::new();
272
273 let page_obj = match doc.get_object(page_id) {
274 Ok(obj) => obj,
275 Err(_) => return ids,
276 };
277
278 let page_dict = match page_obj {
279 Object::Dictionary(ref d) => d,
280 _ => return ids,
281 };
282
283 let resources = match page_dict.get(b"Resources") {
285 Ok(res_obj) => match res_obj {
286 Object::Dictionary(ref d) => d.clone(),
287 Object::Reference(r) => match doc.get_object(*r) {
288 Ok(Object::Dictionary(ref d)) => d.clone(),
289 _ => return ids,
290 },
291 _ => return ids,
292 },
293 Err(_) => return ids,
294 };
295
296 let xobjects = match resources.get(b"XObject") {
298 Ok(xo) => match xo {
299 Object::Dictionary(ref d) => d.clone(),
300 Object::Reference(r) => match doc.get_object(*r) {
301 Ok(Object::Dictionary(ref d)) => d.clone(),
302 _ => return ids,
303 },
304 _ => return ids,
305 },
306 Err(_) => return ids,
307 };
308
309 for (_name, obj) in xobjects.iter() {
310 if let Object::Reference(obj_id) = obj {
311 if let Ok(Object::Stream(ref stream)) = doc.get_object(*obj_id) {
312 if is_image_stream(&stream.dict) {
313 ids.push(*obj_id);
314 }
315 }
316 }
317 }
318
319 ids
320}
321
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{dictionary, Document, Object, Stream};

    /// Builds an 8-bit DeviceRGB image XObject stream with the given size,
    /// optional `Filter` name and stored bytes.
    fn rgb_image_stream(width: i64, height: i64, filter: Option<&str>, data: Vec<u8>) -> Stream {
        let mut img_dict = dictionary! {
            "Type" => "XObject",
            "Subtype" => "Image",
            "Width" => width,
            "Height" => height,
            "BitsPerComponent" => 8_i64,
            "ColorSpace" => "DeviceRGB",
        };
        if let Some(name) = filter {
            img_dict.set("Filter", name);
        }
        Stream::new(img_dict, data)
    }

    /// Builds a one-page document whose content stream is `content`.
    ///
    /// When `image` is given it is registered as `/Im0` in the page's own
    /// `Resources`; otherwise the page has no `Resources` entry at all.
    fn make_single_page_doc(image: Option<Stream>, content: &[u8]) -> Document {
        let mut doc = Document::with_version("1.7");

        let resources = image.map(|img_stream| {
            let img_id = doc.add_object(Object::Stream(img_stream));
            let xobject_dict = dictionary! {
                "Im0" => Object::Reference(img_id),
            };
            dictionary! {
                "XObject" => Object::Dictionary(xobject_dict),
            }
        });

        let content_stream = Stream::new(dictionary! {}, content.to_vec());
        let content_id = doc.add_object(Object::Stream(content_stream));

        let mut page_dict = dictionary! {
            "Type" => "Page",
            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
            "Contents" => Object::Reference(content_id),
        };
        if let Some(resources_dict) = resources {
            page_dict.set("Resources", Object::Dictionary(resources_dict));
        }
        let page_id = doc.add_object(Object::Dictionary(page_dict));

        let pages_dict = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![Object::Reference(page_id)],
            "Count" => 1_i64,
        };
        let pages_id = doc.add_object(Object::Dictionary(pages_dict));

        // The page can only point back at its parent once the parent has an id.
        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
            d.set("Parent", Object::Reference(pages_id));
        }

        let catalog = dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(pages_id),
        };
        let catalog_id = doc.add_object(Object::Dictionary(catalog));
        doc.trailer.set("Root", Object::Reference(catalog_id));

        doc
    }

    fn make_doc_with_jpeg_image() -> Document {
        let image = rgb_image_stream(100, 50, Some("DCTDecode"), vec![0xFF, 0xD8, 0xFF, 0xE0]);
        make_single_page_doc(Some(image), b"q 100 0 0 50 0 0 cm /Im0 Do Q")
    }

    fn make_doc_with_raw_image() -> Document {
        // 2 x 2 pixels, 3 bytes each.
        let image = rgb_image_stream(2, 2, None, vec![255; 12]);
        make_single_page_doc(Some(image), b"q 2 0 0 2 0 0 cm /Im0 Do Q")
    }

    fn make_doc_with_flate_image() -> Document {
        let raw_data = vec![128u8; 12];
        let mut encoder =
            flate2::write::ZlibEncoder::new(Vec::new(), flate2::Compression::default());
        std::io::Write::write_all(&mut encoder, &raw_data).unwrap();
        let compressed = encoder.finish().unwrap();

        let image = rgb_image_stream(2, 2, Some("FlateDecode"), compressed);
        make_single_page_doc(Some(image), b"q 2 0 0 2 0 0 cm /Im0 Do Q")
    }

    #[test]
    fn extract_jpeg_image() {
        let doc = make_doc_with_jpeg_image();
        let images = extract_all_images(&doc).unwrap();
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].width, 100);
        assert_eq!(images[0].height, 50);
        assert_eq!(images[0].bits_per_component, 8);
        assert_eq!(images[0].color_space, "DeviceRGB");
        assert_eq!(images[0].filter, ImageFilter::Jpeg);
        assert_eq!(images[0].page, 1);
    }

    #[test]
    fn extract_from_specific_page() {
        let doc = make_doc_with_jpeg_image();
        let images = extract_page_images(&doc, 1).unwrap();
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].filter, ImageFilter::Jpeg);
    }

    #[test]
    fn extract_page_out_of_range() {
        let doc = make_doc_with_jpeg_image();
        let result = extract_page_images(&doc, 5);
        assert!(result.is_err());
    }

    #[test]
    fn extract_page_zero_is_out_of_range() {
        let doc = make_doc_with_jpeg_image();
        assert!(extract_page_images(&doc, 0).is_err());
    }

    #[test]
    fn extract_raw_image() {
        let doc = make_doc_with_raw_image();
        let images = extract_all_images(&doc).unwrap();
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].filter, ImageFilter::Raw);
        assert_eq!(images[0].data.len(), 12);
    }

    #[test]
    fn extract_flate_compressed_image() {
        let doc = make_doc_with_flate_image();
        let images = extract_all_images(&doc).unwrap();
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].filter, ImageFilter::Flate);
        // Checking the bytes, not just the length, proves the data was inflated.
        assert_eq!(images[0].data, vec![128u8; 12]);
    }

    #[test]
    fn no_images_returns_empty() {
        let doc = make_single_page_doc(None, b"BT /F1 12 Tf (Hello) Tj ET");
        let images = extract_all_images(&doc).unwrap();
        assert!(images.is_empty());
    }
}