1use lopdf::{Document, Object};
4
5use crate::models::bbox::BoundingBox;
6use crate::models::chunks::ImageChunk;
7use crate::EdgePdfError;
8
/// A single image pulled out of a PDF page: its raw bytes plus the
/// stream metadata needed to interpret them.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
 /// Chunk-level metadata (bounding box, 1-based image index).
 pub chunk: ImageChunk,
 /// Image bytes: stream content as stored for DCTDecode (i.e. a JPEG),
 /// otherwise the decompressed stream content (raw content on failure).
 pub data: Vec<u8>,
 /// Pixel width from the stream's /Width entry (0 if absent).
 pub width: u32,
 /// Pixel height from the stream's /Height entry (0 if absent).
 pub height: u32,
 /// Color space name from /ColorSpace; defaults to "DeviceRGB".
 pub color_space: String,
 /// Bits per component from /BitsPerComponent; defaults to 8.
 pub bits_per_component: u8,
 /// Filter name from /Filter (e.g. "DCTDecode"); empty when absent.
 pub filter: String,
}
27
28pub fn extract_image_chunks(
33 doc: &Document,
34 page_number: u32,
35 page_id: lopdf::ObjectId,
36) -> Result<Vec<ImageChunk>, EdgePdfError> {
37 let page_dict = doc
38 .get_object(page_id)
39 .map_err(|e| EdgePdfError::PipelineError {
40 stage: 1,
41 message: format!("Failed to get page {}: {}", page_number, e),
42 })?
43 .as_dict()
44 .map_err(|e| EdgePdfError::PipelineError {
45 stage: 1,
46 message: format!("Page {} is not a dictionary: {}", page_number, e),
47 })?;
48
49 let resources = match page_dict.get(b"Resources") {
51 Ok(r) => resolve_obj(doc, r),
52 Err(_) => return Ok(Vec::new()),
53 };
54
55 let resources_dict = match resources.as_dict() {
56 Ok(d) => d,
57 Err(_) => return Ok(Vec::new()),
58 };
59
60 let xobjects = match resources_dict.get(b"XObject") {
61 Ok(x) => resolve_obj(doc, x),
62 Err(_) => return Ok(Vec::new()),
63 };
64
65 let xobject_dict = match xobjects.as_dict() {
66 Ok(d) => d,
67 Err(_) => return Ok(Vec::new()),
68 };
69
70 let mut chunks = Vec::new();
71 let mut index = 0u32;
72
73 for (_name, xobj_ref) in xobject_dict.iter() {
74 let xobj = resolve_obj(doc, xobj_ref);
75 if let Ok(stream) = xobj.as_stream() {
76 let dict = &stream.dict;
77
78 let subtype = dict.get(b"Subtype").ok().and_then(|o| {
80 if let Object::Name(ref n) = o {
81 Some(String::from_utf8_lossy(n).to_string())
82 } else {
83 None
84 }
85 });
86
87 if subtype.as_deref() != Some("Image") {
88 continue;
89 }
90
91 let width = get_int(dict, b"Width").unwrap_or(0) as f64;
92 let height = get_int(dict, b"Height").unwrap_or(0) as f64;
93
94 if width <= 0.0 || height <= 0.0 {
95 continue;
96 }
97
98 index += 1;
99
100 let bbox = BoundingBox::new(Some(page_number), 0.0, 0.0, width, height);
103
104 chunks.push(ImageChunk {
105 bbox,
106 index: Some(index),
107 level: None,
108 });
109 }
110 }
111
112 Ok(chunks)
113}
114
115pub fn extract_image_data(
117 doc: &Document,
118 page_id: lopdf::ObjectId,
119 image_index: u32,
120) -> Result<Option<ExtractedImage>, EdgePdfError> {
121 let page_dict = doc
122 .get_object(page_id)
123 .map_err(|e| EdgePdfError::PipelineError {
124 stage: 1,
125 message: format!("Failed to get page: {}", e),
126 })?
127 .as_dict()
128 .map_err(|e| EdgePdfError::PipelineError {
129 stage: 1,
130 message: format!("Page is not a dictionary: {}", e),
131 })?;
132
133 let resources = match page_dict.get(b"Resources") {
134 Ok(r) => resolve_obj(doc, r),
135 Err(_) => return Ok(None),
136 };
137
138 let resources_dict = match resources.as_dict() {
139 Ok(d) => d,
140 Err(_) => return Ok(None),
141 };
142
143 let xobjects = match resources_dict.get(b"XObject") {
144 Ok(x) => resolve_obj(doc, x),
145 Err(_) => return Ok(None),
146 };
147
148 let xobject_dict = match xobjects.as_dict() {
149 Ok(d) => d,
150 Err(_) => return Ok(None),
151 };
152
153 let mut current_index = 0u32;
154
155 for (_name, xobj_ref) in xobject_dict.iter() {
156 let xobj = resolve_obj(doc, xobj_ref);
157 if let Ok(stream) = xobj.as_stream() {
158 let dict = &stream.dict;
159
160 let subtype = dict.get(b"Subtype").ok().and_then(|o| {
161 if let Object::Name(ref n) = o {
162 Some(String::from_utf8_lossy(n).to_string())
163 } else {
164 None
165 }
166 });
167
168 if subtype.as_deref() != Some("Image") {
169 continue;
170 }
171
172 current_index += 1;
173 if current_index != image_index {
174 continue;
175 }
176
177 let width = get_int(dict, b"Width").unwrap_or(0) as u32;
178 let height = get_int(dict, b"Height").unwrap_or(0) as u32;
179 let bpc = get_int(dict, b"BitsPerComponent").unwrap_or(8) as u8;
180
181 let color_space = dict
182 .get(b"ColorSpace")
183 .ok()
184 .and_then(|o| match o {
185 Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
186 _ => None,
187 })
188 .unwrap_or_else(|| "DeviceRGB".to_string());
189
190 let filter = dict
191 .get(b"Filter")
192 .ok()
193 .and_then(|o| match o {
194 Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
195 _ => None,
196 })
197 .unwrap_or_default();
198
199 let data = if filter == "DCTDecode" {
200 stream.content.clone()
202 } else {
203 stream
205 .decompressed_content()
206 .unwrap_or_else(|_| stream.content.clone())
207 };
208
209 let bbox = BoundingBox::new(Some(0), 0.0, 0.0, width as f64, height as f64);
210
211 return Ok(Some(ExtractedImage {
212 chunk: ImageChunk {
213 bbox,
214 index: Some(image_index),
215 level: None,
216 },
217 data,
218 width,
219 height,
220 color_space,
221 bits_per_component: bpc,
222 filter,
223 }));
224 }
225 }
226
227 Ok(None)
228}
229
230fn resolve_obj(doc: &Document, obj: &Object) -> Object {
231 match obj {
232 Object::Reference(id) => doc.get_object(*id).cloned().unwrap_or(Object::Null),
233 other => other.clone(),
234 }
235}
236
237fn get_int(dict: &lopdf::Dictionary, key: &[u8]) -> Option<i64> {
238 dict.get(key).ok().and_then(|o| match o {
239 Object::Integer(i) => Some(*i),
240 Object::Real(f) => Some(*f as i64),
241 _ => None,
242 })
243}
244
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{dictionary, Stream};

    // A page with no /Resources entry must yield an empty chunk list,
    // not an error.
    #[test]
    fn test_extract_no_images() {
        let mut doc = Document::with_version("1.5");
        // Reserve the Pages id up front so the page can name its parent
        // before the Pages dictionary itself is inserted.
        let pages_id = doc.new_object_id();

        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
        };
        doc.objects.insert(pages_id, Object::Dictionary(pages));

        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);

        // Resolve the page through the document's own page table so the
        // test exercises the same (page_number, id) pair a caller would use.
        let pages = doc.get_pages();
        let (&page_num, &pid) = pages.iter().next().unwrap();
        let chunks = extract_image_chunks(&doc, page_num, pid).unwrap();
        assert!(chunks.is_empty());
    }

    // A page whose Resources contain one Image XObject must yield exactly
    // one chunk.
    #[test]
    fn test_extract_image_chunk() {
        let mut doc = Document::with_version("1.5");
        let pages_id = doc.new_object_id();

        // Uncompressed 100x200 RGB image stream (3 bytes per pixel).
        let img_stream = Stream::new(
            dictionary! {
                "Type" => "XObject",
                "Subtype" => "Image",
                "Width" => 100,
                "Height" => 200,
                "ColorSpace" => "DeviceRGB",
                "BitsPerComponent" => 8,
            },
            vec![0u8; 100 * 200 * 3], );
        let img_id = doc.add_object(img_stream);

        // Register the image as an indirect XObject resource, as a real
        // PDF writer would.
        let resources_id = doc.add_object(dictionary! {
            "XObject" => dictionary! {
                "Im1" => img_id,
            },
        });

        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Resources" => resources_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
        };
        doc.objects.insert(pages_id, Object::Dictionary(pages));

        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);

        let pages = doc.get_pages();
        let (&page_num, &pid) = pages.iter().next().unwrap();
        let chunks = extract_image_chunks(&doc, page_num, pid).unwrap();
        assert_eq!(chunks.len(), 1);
    }
}