1use crate::engine::ExtractionEngine;
2use crate::error::{DonglerError, Result};
3use crate::ir::{
4 Asset, BBox, Block, Confidence, Document, FigureBlock, ImageObject, Metadata, Page,
5 SourceAnchor, Warning, SCHEMA_VERSION,
6};
7use crate::source::Source;
8
/// Label stored in `SourceAnchor::extraction_method` for the anchors this engine emits.
const EXTRACTION_METHOD: &str = "image_native";
10
/// Extraction engine that turns an image source into a one-page document.
///
/// The type carries no state; all behaviour lives in its `ExtractionEngine`
/// implementation.
#[derive(Debug, Default, Clone, Copy)]
pub struct ImageEngine;
13
/// Image dimensions recovered by the header parsers in this file.
#[derive(Debug, Clone, Copy)]
struct ImageInfo {
    /// Width value read from the image header.
    width: u32,
    /// Height value read from the image header.
    height: u32,
}
19
20impl ExtractionEngine for ImageEngine {
21 fn name(&self) -> &'static str {
22 "image-native"
23 }
24
25 fn extract(&self, source: &Source) -> Result<Document> {
26 let bytes = source.bytes.as_deref().unwrap_or(source.content.as_bytes());
27 let info = image_info(bytes)
28 .ok_or_else(|| DonglerError::image("unsupported or malformed image header"))?;
29 let bbox = BBox {
30 x: 0.0,
31 y: 0.0,
32 width: info.width as f32,
33 height: info.height as f32,
34 };
35 let image = ImageObject {
36 id: "image-1".to_owned(),
37 object_id: None,
38 bbox: Some(bbox),
39 width: Some(info.width),
40 height: Some(info.height),
41 };
42 let asset = Asset {
43 id: image.id.clone(),
44 kind: "image".to_owned(),
45 object_id: None,
46 bbox: Some(bbox),
47 width: Some(info.width),
48 height: Some(info.height),
49 };
50 let figure = Block::Figure(FigureBlock {
51 alt_text: source
52 .path
53 .as_deref()
54 .and_then(|path| std::path::Path::new(path).file_name())
55 .and_then(|name| name.to_str())
56 .map(str::to_owned),
57 caption: None,
58 bbox: Some(bbox),
59 image_ref: Some(image.id.clone()),
60 source_anchors: vec![SourceAnchor {
61 page_number: 1,
62 pdf_object_ids: Vec::new(),
63 bbox: Some(bbox),
64 extraction_method: EXTRACTION_METHOD.to_owned(),
65 }],
66 confidence: Some(Confidence {
67 score: 0.9,
68 calibrated: false,
69 }), ..Default::default()
70 });
71
72 Ok(Document {
73 schema_version: SCHEMA_VERSION.to_owned(),
74 metadata: Metadata {
75 format: source.format.clone(),
76 engine: self.name().to_owned(),
77 source: source.path.clone(),
78 title: None,
79 character_count: 0,
80 word_count: 0,
81 block_count: 1,
82 file_size_bytes: Some(bytes.len() as u64),
83 pdf_version: None,
84 encrypted: false,
85 },
86 pages: vec![Page {
87 number: 1,
88 width: Some(info.width as f32),
89 height: Some(info.height as f32),
90 rotation: None,
91 bbox: Some(bbox),
92 blocks: vec![figure],
93 images: vec![image],
94 assets: vec![asset.clone()],
95 warnings: Vec::new(), ..Default::default()
96 }],
97 assets: vec![asset],
98 warnings: Vec::<Warning>::new(),
99 })
100 }
101}
102
103fn image_info(bytes: &[u8]) -> Option<ImageInfo> {
104 parse_png(bytes)
105 .or_else(|| parse_jpeg(bytes))
106 .or_else(|| parse_gif(bytes))
107 .or_else(|| parse_bmp(bytes))
108 .or_else(|| parse_tiff(bytes))
109 .or_else(|| parse_webp(bytes))
110}
111
112fn parse_png(bytes: &[u8]) -> Option<ImageInfo> {
113 if bytes.len() < 24 || !bytes.starts_with(b"\x89PNG\r\n\x1a\n") || &bytes[12..16] != b"IHDR" {
114 return None;
115 }
116 Some(ImageInfo {
117 width: u32::from_be_bytes(bytes[16..20].try_into().ok()?),
118 height: u32::from_be_bytes(bytes[20..24].try_into().ok()?),
119 })
120}
121
122fn parse_jpeg(bytes: &[u8]) -> Option<ImageInfo> {
123 if bytes.len() < 4 || !bytes.starts_with(&[0xff, 0xd8]) {
124 return None;
125 }
126
127 let mut pos = 2;
128 while pos + 4 <= bytes.len() {
129 while pos < bytes.len() && bytes[pos] == 0xff {
130 pos += 1;
131 }
132 if pos >= bytes.len() {
133 return None;
134 }
135
136 let marker = bytes[pos];
137 pos += 1;
138 if marker == 0xd9 || marker == 0xda {
139 return None;
140 }
141 if pos + 2 > bytes.len() {
142 return None;
143 }
144 let segment_len = u16::from_be_bytes(bytes[pos..pos + 2].try_into().ok()?) as usize;
145 if segment_len < 2 || pos + segment_len > bytes.len() {
146 return None;
147 }
148 let data_start = pos + 2;
149 if is_jpeg_sof(marker) && data_start + 5 <= bytes.len() {
150 return Some(ImageInfo {
151 height: u16::from_be_bytes(bytes[data_start + 1..data_start + 3].try_into().ok()?)
152 as u32,
153 width: u16::from_be_bytes(bytes[data_start + 3..data_start + 5].try_into().ok()?)
154 as u32,
155 });
156 }
157 pos += segment_len;
158 }
159
160 None
161}
162
/// Tells whether `marker` is one of the JPEG start-of-frame codes.
///
/// Every code in `0xC0..=0xCF` qualifies except `0xC4`, `0xC8` and `0xCC`.
fn is_jpeg_sof(marker: u8) -> bool {
    let in_sof_range = (0xc0..=0xcf).contains(&marker);
    let is_excluded = matches!(marker, 0xc4 | 0xc8 | 0xcc);
    in_sof_range && !is_excluded
}
169
170fn parse_gif(bytes: &[u8]) -> Option<ImageInfo> {
171 if bytes.len() < 10 || !(bytes.starts_with(b"GIF87a") || bytes.starts_with(b"GIF89a")) {
172 return None;
173 }
174 Some(ImageInfo {
175 width: u16::from_le_bytes(bytes[6..8].try_into().ok()?) as u32,
176 height: u16::from_le_bytes(bytes[8..10].try_into().ok()?) as u32,
177 })
178}
179
180fn parse_bmp(bytes: &[u8]) -> Option<ImageInfo> {
181 if bytes.len() < 26 || !bytes.starts_with(b"BM") {
182 return None;
183 }
184 Some(ImageInfo {
185 width: i32::from_le_bytes(bytes[18..22].try_into().ok()?).unsigned_abs(),
186 height: i32::from_le_bytes(bytes[22..26].try_into().ok()?).unsigned_abs(),
187 })
188}
189
190fn parse_tiff(bytes: &[u8]) -> Option<ImageInfo> {
191 if bytes.len() < 8 {
192 return None;
193 }
194 let endian = TiffEndian::from_header(bytes)?;
195 if endian.read_u16(&bytes[2..4])? != 42 {
196 return None;
197 }
198 let ifd_offset = endian.read_u32(&bytes[4..8])? as usize;
199 if ifd_offset + 2 > bytes.len() {
200 return None;
201 }
202
203 let entry_count = endian.read_u16(&bytes[ifd_offset..ifd_offset + 2])? as usize;
204 let mut width = None;
205 let mut height = None;
206 let mut entry_pos = ifd_offset + 2;
207 for _ in 0..entry_count {
208 if entry_pos + 12 > bytes.len() {
209 return None;
210 }
211 let tag = endian.read_u16(&bytes[entry_pos..entry_pos + 2])?;
212 let field_type = endian.read_u16(&bytes[entry_pos + 2..entry_pos + 4])?;
213 let count = endian.read_u32(&bytes[entry_pos + 4..entry_pos + 8])?;
214 let value = tiff_inline_value(
215 endian,
216 field_type,
217 count,
218 &bytes[entry_pos + 8..entry_pos + 12],
219 )?;
220 match tag {
221 256 => width = Some(value),
222 257 => height = Some(value),
223 _ => {}
224 }
225 entry_pos += 12;
226 }
227
228 Some(ImageInfo {
229 width: width?,
230 height: height?,
231 })
232}
233
234fn tiff_inline_value(endian: TiffEndian, field_type: u16, count: u32, bytes: &[u8]) -> Option<u32> {
235 if count != 1 {
236 return None;
237 }
238 match field_type {
239 3 => endian.read_u16(&bytes[..2]).map(u32::from),
240 4 => endian.read_u32(bytes),
241 _ => None,
242 }
243}
244
245fn parse_webp(bytes: &[u8]) -> Option<ImageInfo> {
246 if bytes.len() < 30 || !bytes.starts_with(b"RIFF") || &bytes[8..12] != b"WEBP" {
247 return None;
248 }
249 if &bytes[12..16] != b"VP8X" {
250 return None;
251 }
252
253 Some(ImageInfo {
254 width: 1 + read_u24_le(&bytes[24..27])?,
255 height: 1 + read_u24_le(&bytes[27..30])?,
256 })
257}
258
/// Byte order declared by the first two bytes of a TIFF header.
#[derive(Debug, Clone, Copy)]
enum TiffEndian {
    /// Header starts with `II`; integers are read least significant byte first.
    Little,
    /// Header starts with `MM`; integers are read most significant byte first.
    Big,
}

impl TiffEndian {
    /// Determines the byte order from the first two bytes of `bytes`.
    ///
    /// Returns `None` when fewer than two bytes are given or they are neither
    /// `II` nor `MM`.
    fn from_header(bytes: &[u8]) -> Option<Self> {
        match bytes {
            [b'I', b'I', ..] => Some(Self::Little),
            [b'M', b'M', ..] => Some(Self::Big),
            _ => None,
        }
    }

    /// Reads a `u16` from the first two bytes of `bytes` in this byte order,
    /// or returns `None` when fewer than two bytes are available.
    fn read_u16(self, bytes: &[u8]) -> Option<u16> {
        let raw = match bytes {
            [first, second, ..] => [*first, *second],
            _ => return None,
        };
        Some(match self {
            Self::Little => u16::from_le_bytes(raw),
            Self::Big => u16::from_be_bytes(raw),
        })
    }

    /// Reads a `u32` from the first four bytes of `bytes` in this byte order,
    /// or returns `None` when fewer than four bytes are available.
    fn read_u32(self, bytes: &[u8]) -> Option<u32> {
        let raw = match bytes {
            [first, second, third, fourth, ..] => [*first, *second, *third, *fourth],
            _ => return None,
        };
        Some(match self {
            Self::Little => u32::from_le_bytes(raw),
            Self::Big => u32::from_be_bytes(raw),
        })
    }
}
290
/// Assembles a little-endian 24-bit integer from the first three bytes of
/// `bytes`, or returns `None` when fewer than three bytes are available.
fn read_u24_le(bytes: &[u8]) -> Option<u32> {
    match bytes {
        [low, mid, high, ..] => {
            Some(u32::from(*low) | (u32::from(*mid) << 8) | (u32::from(*high) << 16))
        }
        _ => None,
    }
}