Skip to main content

rpdfium_codec/jpeg/
jpegmodule.rs

1// Derived from PDFium's codec/ccodec_jpegmodule.cpp
2// Original: Copyright 2014 The PDFium Authors
3// Licensed under BSD-3-Clause / Apache-2.0
4// See pdfium-upstream/LICENSE for the original license.
5
6//! DCTDecode filter — JPEG decompression using zune-jpeg.
7
8use crate::error::DecodeError;
9
/// Maximum number of leading input bytes searched for the SOI marker
/// (`FF D8`) when the input does not start with one.
const SOI_SCAN_LIMIT: usize = 1024;

/// Byte offset of the dimension fields relative to the SOF marker start.
///
/// SOF marker layout: FF Cx (2 bytes marker) + length (2 bytes) + bpc (1 byte)
/// → dimensions (height, then width) start at offset 5 from the marker.
const SOF_MARKER_BYTE_OFFSET: usize = 5;

/// Known byte offsets (from SOI) of the SOF *height field* in malformed JPEG
/// headers that set height to 0xFFFF.
///
/// These are dimension offsets, not marker offsets: the SOF marker itself
/// starts `SOF_MARKER_BYTE_OFFSET` bytes earlier.
///
/// Ported from PDFium's `kKnownBadHeaderWithInvalidHeightByteOffsetStarts`.
const KNOWN_BAD_HEADER_OFFSETS: &[usize] = &[94, 163];
24
/// Information extracted from JPEG markers before decoding.
///
/// Every field defaults to zero / `false` (see the `Default` derive). The
/// type is a plain value type, so it also supports equality comparison.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct JpegInfo {
    /// Image width in pixels, as stored in the SOF header.
    pub width: u32,
    /// Image height in pixels, as stored in the SOF header.
    pub height: u32,
    /// Number of color components (e.g. 1 for grayscale, 3 for RGB/YCbCr, 4 for CMYK).
    pub num_components: u8,
    /// Bits per component (typically 8).
    pub bits_per_component: u8,
    /// Whether an Adobe APP14 marker was found.
    pub has_adobe_marker: bool,
    /// Adobe color transform flag (0=Unknown, 1=YCbCr, 2=YCCK).
    pub adobe_color_transform: u8,
}
41
42/// Probe JPEG data to extract image information without fully decoding.
43///
44/// Scans for SOF0/SOF1/SOF2 markers to extract dimensions and component info,
45/// and for Adobe APP14 markers to detect color transform flags.
46/// Returns `None` if no SOF marker is found.
47pub fn probe_jpeg_info(input: &[u8]) -> Option<JpegInfo> {
48    let data = find_soi(input)?;
49    let mut info = JpegInfo::default();
50    let mut found_sof = false;
51    let mut pos = 2; // Skip SOI
52
53    while pos + 1 < data.len() {
54        if data[pos] != 0xFF {
55            pos += 1;
56            continue;
57        }
58
59        // Skip padding 0xFF bytes
60        while pos + 1 < data.len() && data[pos + 1] == 0xFF {
61            pos += 1;
62        }
63        if pos + 1 >= data.len() {
64            break;
65        }
66
67        let marker = data[pos + 1];
68        pos += 2;
69
70        // SOS (Start of Scan) — stop scanning markers
71        if marker == 0xDA {
72            break;
73        }
74
75        // Markers without length
76        if marker == 0x00 || marker == 0x01 || (0xD0..=0xD7).contains(&marker) {
77            continue;
78        }
79
80        // Read marker length
81        if pos + 2 > data.len() {
82            break;
83        }
84        let length = u16::from_be_bytes([data[pos], data[pos + 1]]) as usize;
85        if length < 2 || pos + length > data.len() {
86            break;
87        }
88
89        match marker {
90            // SOF0, SOF1, SOF2 (Baseline, Extended, Progressive)
91            0xC0..=0xC2 => {
92                if length >= 8 {
93                    info.bits_per_component = data[pos + 2];
94                    info.height = u16::from_be_bytes([data[pos + 3], data[pos + 4]]) as u32;
95                    info.width = u16::from_be_bytes([data[pos + 5], data[pos + 6]]) as u32;
96                    info.num_components = data[pos + 7];
97                    found_sof = true;
98                }
99            }
100            // APP14 (Adobe marker)
101            0xEE => {
102                // Adobe marker: length >= 14 (2 length + 5 "Adobe" + 7 data)
103                if length >= 14 && pos + 2 + 5 <= data.len() && &data[pos + 2..pos + 7] == b"Adobe"
104                {
105                    info.has_adobe_marker = true;
106                    // Color transform byte is at offset 11 from marker data start
107                    // (2 length bytes + 5 "Adobe" + 2 version + 2 flags0)
108                    if pos + 13 < data.len() {
109                        info.adobe_color_transform = data[pos + 13];
110                    }
111                }
112            }
113            _ => {}
114        }
115
116        pos += length;
117    }
118
119    if found_sof { Some(info) } else { None }
120}
121
/// Upstream-aligned alias for [`probe_jpeg_info()`](probe_jpeg_info). Corresponds to `JpegModule::LoadInfo()`.
///
/// Forwards `input` unchanged and returns exactly what `probe_jpeg_info`
/// returns, including `None` when no SOF marker is found.
#[inline]
pub fn load_info(input: &[u8]) -> Option<JpegInfo> {
    probe_jpeg_info(input)
}
127
128/// Find the SOI (Start of Image) marker in the input.
129///
130/// If input starts with 0xFF 0xD8, returns it directly.
131/// Otherwise scans forward up to `SOI_SCAN_LIMIT` bytes.
132fn find_soi(input: &[u8]) -> Option<&[u8]> {
133    if input.len() < 2 {
134        return None;
135    }
136
137    if input[0] == 0xFF && input[1] == 0xD8 {
138        return Some(input);
139    }
140
141    // Scan forward for SOI marker
142    let limit = input.len().min(SOI_SCAN_LIMIT);
143    for i in 0..limit.saturating_sub(1) {
144        if input[i] == 0xFF && input[i + 1] == 0xD8 {
145            return Some(&input[i..]);
146        }
147    }
148
149    None
150}
151
152/// Check whether a SOF marker exists at the given byte offset within JPEG data.
153///
154/// SOF markers are 0xFF followed by 0xC0–0xC2 (baseline/extended/progressive).
155fn is_sof_at_offset(data: &[u8], dimension_offset: usize) -> bool {
156    if dimension_offset < SOF_MARKER_BYTE_OFFSET {
157        return false;
158    }
159    let marker_offset = dimension_offset - SOF_MARKER_BYTE_OFFSET;
160    if marker_offset + 1 >= data.len() {
161        return false;
162    }
163    data[marker_offset] == 0xFF && (0xC0..=0xC2).contains(&data[marker_offset + 1])
164}
165
166/// Check if JPEG data has a known bad header with height set to 0xFFFF
167/// at a specific dimension offset.
168///
169/// Matches upstream PDFium's `HasKnownBadHeaderWithInvalidHeight()`.
170fn has_invalid_height_at_offset(data: &[u8], dimension_offset: usize, expected_width: u32) -> bool {
171    if dimension_offset + 3 >= data.len() {
172        return false;
173    }
174    if !is_sof_at_offset(data, dimension_offset) {
175        return false;
176    }
177
178    let width_hi = ((expected_width >> 8) & 0xFF) as u8;
179    let width_lo = (expected_width & 0xFF) as u8;
180
181    // Height high byte, height low byte, width high byte, width low byte
182    data[dimension_offset] == 0xFF
183        && data[dimension_offset + 1] == 0xFF
184        && data[dimension_offset + 2] == width_hi
185        && data[dimension_offset + 3] == width_lo
186}
187
188/// Detect and attempt to fix JPEG data with a known bad header where height is 0xFFFF.
189///
190/// Some malformed JPEG encoders set the image height to 0xFFFF in the SOF marker.
191/// This function checks known byte offsets for this pattern and, if found, patches
192/// the height to 0 (which per JPEG spec means "determine from data/DNL marker").
193///
194/// Returns `Some(patched_data)` if the known bad pattern was found and patched,
195/// `None` if the data doesn't match the known bad pattern.
196pub fn try_patch_invalid_height(input: &[u8]) -> Option<Vec<u8>> {
197    let data = find_soi(input)?;
198    let soi_offset = input.len() - data.len();
199
200    let info = probe_jpeg_info(data)?;
201    if info.height != 0xFFFF {
202        return None;
203    }
204
205    for &offset in KNOWN_BAD_HEADER_OFFSETS {
206        if has_invalid_height_at_offset(data, offset, info.width as u32) {
207            let mut patched = input.to_vec();
208            let abs_offset = soi_offset + offset;
209            if abs_offset + 1 < patched.len() {
210                // Patch height to 0 (JPEG spec: height determined from data)
211                patched[abs_offset] = 0x00;
212                patched[abs_offset + 1] = 0x00;
213                return Some(patched);
214            }
215        }
216    }
217
218    None
219}
220
221/// Decode JPEG data with automatic recovery for known bad headers.
222///
223/// If normal decoding fails and the JPEG has height=0xFFFF at a known offset,
224/// this patches the height and retries. Ported from upstream PDFium's
225/// malformed JPEG handling logic.
226pub fn decode_with_recovery(input: &[u8]) -> Result<Vec<u8>, DecodeError> {
227    match decode(input) {
228        Ok(data) => Ok(data),
229        Err(original_err) => {
230            if let Some(patched) = try_patch_invalid_height(input) {
231                decode(&patched)
232            } else {
233                Err(original_err)
234            }
235        }
236    }
237}
238
239/// Decode JPEG (DCT) compressed data.
240///
241/// Returns raw pixel data (RGB or grayscale depending on the JPEG).
242/// If the input doesn't start with an SOI marker, scans forward up to
243/// 1024 bytes to find one.
244pub fn decode(input: &[u8]) -> Result<Vec<u8>, DecodeError> {
245    let data = find_soi(input).unwrap_or(input);
246    let cursor = std::io::Cursor::new(data);
247    let mut decoder = zune_jpeg::JpegDecoder::new(cursor);
248    decoder
249        .decode()
250        .map_err(|e| DecodeError::Jpeg(format!("{e}")))
251}
252
#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // decode() error-path tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_decode_invalid_jpeg() {
        let result = decode(b"not a jpeg");
        assert!(result.is_err());
    }

    #[test]
    fn test_decode_empty_input() {
        let result = decode(b"");
        assert!(result.is_err());
    }

    #[test]
    fn test_decode_truncated_header() {
        // Start of JPEG marker but truncated
        let result = decode(&[0xFF, 0xD8, 0xFF]);
        assert!(result.is_err());
    }

    #[test]
    fn test_decode_minimal_valid_jpeg() {
        // Despite the test name, this is NOT a complete JPEG: building a real
        // 1x1 image by hand is complex, so this only checks that a
        // valid-looking header (SOI + JFIF APP0) with no image data fails
        // gracefully instead of panicking.
        let partial_jpeg = [
            0xFF, 0xD8, // SOI
            0xFF, 0xE0, // APP0
            0x00, 0x10, // length
            b'J', b'F', b'I', b'F', 0x00, // JFIF identifier
            0x01, 0x01, // version
            0x00, // units
            0x00, 0x01, // X density
            0x00, 0x01, // Y density
            0x00, 0x00, // thumbnail
        ];
        let result = decode(&partial_jpeg);
        // This is incomplete so it should error
        assert!(result.is_err());
    }

    // -----------------------------------------------------------------------
    // probe_jpeg_info() tests
    // -----------------------------------------------------------------------

    /// Build a minimal JPEG-like byte sequence with SOF0 marker for probe tests.
    ///
    /// The result is SOI immediately followed by one SOF0 segment; it contains
    /// no scan data and is not decodable.
    fn build_jpeg_with_sof(width: u16, height: u16, components: u8, bpc: u8) -> Vec<u8> {
        let mut data = Vec::new();
        // SOI
        data.extend_from_slice(&[0xFF, 0xD8]);
        // SOF0
        data.extend_from_slice(&[0xFF, 0xC0]);
        // Length = 8 + 3 * components
        let sof_len = 8 + 3 * components as u16;
        data.extend_from_slice(&sof_len.to_be_bytes());
        data.push(bpc);
        data.extend_from_slice(&height.to_be_bytes());
        data.extend_from_slice(&width.to_be_bytes());
        data.push(components);
        // Component specs (3 bytes each): id, sampling, qtable
        for i in 0..components {
            data.extend_from_slice(&[i + 1, 0x11, 0]);
        }
        data
    }

    #[test]
    fn test_probe_jpeg_info_basic() {
        let jpeg = build_jpeg_with_sof(640, 480, 3, 8);
        let info = probe_jpeg_info(&jpeg).unwrap();
        assert_eq!(info.width, 640);
        assert_eq!(info.height, 480);
        assert_eq!(info.num_components, 3);
        assert_eq!(info.bits_per_component, 8);
        assert!(!info.has_adobe_marker);
    }

    #[test]
    fn test_probe_jpeg_info_grayscale() {
        let jpeg = build_jpeg_with_sof(100, 200, 1, 8);
        let info = probe_jpeg_info(&jpeg).unwrap();
        assert_eq!(info.width, 100);
        assert_eq!(info.height, 200);
        assert_eq!(info.num_components, 1);
    }

    #[test]
    fn test_probe_jpeg_info_with_adobe_marker() {
        let mut jpeg = build_jpeg_with_sof(320, 240, 4, 8);
        // Append APP14 Adobe marker
        jpeg.extend_from_slice(&[0xFF, 0xEE]);
        // Length = 14 (2 length + 5 Adobe + 2 version + 2 flags0 + 2 flags1 + 1 transform)
        jpeg.extend_from_slice(&[0x00, 0x0E]);
        // "Adobe"
        jpeg.extend_from_slice(b"Adobe");
        // Version (2 bytes)
        jpeg.extend_from_slice(&[0x00, 0x64]);
        // Flags0 (2 bytes)
        jpeg.extend_from_slice(&[0x00, 0x00]);
        // Flags1 (2 bytes)
        jpeg.extend_from_slice(&[0x00, 0x00]);
        // Color transform: 2 (YCCK)
        jpeg.push(0x02);

        let info = probe_jpeg_info(&jpeg).unwrap();
        assert_eq!(info.width, 320);
        assert_eq!(info.num_components, 4);
        assert!(info.has_adobe_marker);
        assert_eq!(info.adobe_color_transform, 2);
    }

    #[test]
    fn test_probe_jpeg_info_no_sof_returns_none() {
        // Just an SOI with no SOF
        let data = [0xFF, 0xD8];
        assert!(probe_jpeg_info(&data).is_none());
    }

    #[test]
    fn test_probe_jpeg_info_empty_returns_none() {
        assert!(probe_jpeg_info(&[]).is_none());
    }

    // -----------------------------------------------------------------------
    // find_soi() tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_find_soi_at_start() {
        let data = [0xFF, 0xD8, 0xFF, 0xC0];
        let result = find_soi(&data);
        assert!(result.is_some());
        assert_eq!(result.unwrap().len(), 4);
    }

    #[test]
    fn test_find_soi_with_offset() {
        // Garbage bytes followed by SOI
        let mut data = vec![0x00, 0x00, 0x00, 0x42];
        data.extend_from_slice(&[0xFF, 0xD8, 0xFF, 0xC0]);
        let result = find_soi(&data);
        assert!(result.is_some());
        assert_eq!(result.unwrap().len(), 4); // From SOI to end
    }

    #[test]
    fn test_find_soi_not_found() {
        let data = [0x00; 100];
        assert!(find_soi(&data).is_none());
    }

    #[test]
    fn test_find_soi_beyond_limit() {
        // SOI marker at position > SOI_SCAN_LIMIT should not be found
        let mut data = vec![0x00; SOI_SCAN_LIMIT + 10];
        data[SOI_SCAN_LIMIT + 5] = 0xFF;
        data[SOI_SCAN_LIMIT + 6] = 0xD8;
        assert!(find_soi(&data).is_none());
    }

    #[test]
    fn test_decode_with_leading_garbage_still_fails_gracefully() {
        // SOI is there but JPEG is incomplete
        let mut data = vec![0x00, 0x00, 0x00];
        data.extend_from_slice(&[0xFF, 0xD8, 0xFF]);
        let result = decode(&data);
        assert!(result.is_err());
    }

    // -----------------------------------------------------------------------
    // Known-bad-header tests
    // -----------------------------------------------------------------------

    /// Build JPEG data with SOF0 at a specific byte offset from SOI, with given height.
    ///
    /// `offset` is the position of the SOF *height field* (the convention used
    /// by `KNOWN_BAD_HEADER_OFFSETS`), so it must be at least
    /// `SOF_MARKER_BYTE_OFFSET`; the SOF marker itself is placed
    /// `SOF_MARKER_BYTE_OFFSET` bytes earlier.
    fn build_jpeg_with_sof_at_offset(
        offset: usize,
        width: u16,
        height: u16,
        components: u8,
    ) -> Vec<u8> {
        let mut data = Vec::new();
        // SOI
        data.extend_from_slice(&[0xFF, 0xD8]);

        // Pad with a single APP0 segment to reach the desired offset for SOF
        // SOF marker position = offset - SOF_MARKER_BYTE_OFFSET (which is 5)
        let sof_marker_pos = offset - SOF_MARKER_BYTE_OFFSET;
        let current_pos = 2; // after SOI
        if sof_marker_pos > current_pos {
            // Fill with an APP0 segment of appropriate size
            let fill_len = sof_marker_pos - current_pos;
            if fill_len >= 4 {
                data.extend_from_slice(&[0xFF, 0xE0]); // APP0 marker
                // Segment length counts its own two bytes but not the marker.
                let seg_len = (fill_len - 2) as u16;
                data.extend_from_slice(&seg_len.to_be_bytes());
                data.extend(vec![0x00; fill_len - 4]);
            } else {
                // Gap too small for a marker + length: pad with raw zero bytes.
                data.extend(vec![0x00; fill_len]);
            }
        }

        // SOF0 marker
        data.extend_from_slice(&[0xFF, 0xC0]);
        let sof_len = 8 + 3 * components as u16;
        data.extend_from_slice(&sof_len.to_be_bytes());
        data.push(8); // bpc
        data.extend_from_slice(&height.to_be_bytes());
        data.extend_from_slice(&width.to_be_bytes());
        data.push(components);
        for i in 0..components {
            data.extend_from_slice(&[i + 1, 0x11, 0]);
        }
        data
    }

    #[test]
    fn test_detect_known_bad_header_at_offset_94() {
        let data = build_jpeg_with_sof_at_offset(94, 640, 0xFFFF, 3);
        let info = probe_jpeg_info(&data).unwrap();
        assert_eq!(info.height, 0xFFFF);
        assert_eq!(info.width, 640);

        let patched = try_patch_invalid_height(&data);
        assert!(patched.is_some());

        // Verify height was patched to 0
        let patched = patched.unwrap();
        let patched_info = probe_jpeg_info(&patched).unwrap();
        assert_eq!(patched_info.height, 0);
        assert_eq!(patched_info.width, 640);
    }

    #[test]
    fn test_detect_known_bad_header_at_offset_163() {
        let data = build_jpeg_with_sof_at_offset(163, 320, 0xFFFF, 3);
        let info = probe_jpeg_info(&data).unwrap();
        assert_eq!(info.height, 0xFFFF);

        let patched = try_patch_invalid_height(&data);
        assert!(patched.is_some());
        let patched_info = probe_jpeg_info(&patched.unwrap()).unwrap();
        assert_eq!(patched_info.height, 0);
    }

    #[test]
    fn test_no_patch_for_normal_height() {
        let data = build_jpeg_with_sof(640, 480, 3, 8);
        assert!(try_patch_invalid_height(&data).is_none());
    }

    #[test]
    fn test_no_patch_for_unknown_offset() {
        // Height field at offset 50 (not a known bad offset)
        let data = build_jpeg_with_sof_at_offset(50, 640, 0xFFFF, 3);
        let info = probe_jpeg_info(&data).unwrap();
        assert_eq!(info.height, 0xFFFF);
        // Offset 50 is not in KNOWN_BAD_HEADER_OFFSETS
        assert!(try_patch_invalid_height(&data).is_none());
    }

    #[test]
    fn test_decode_with_recovery_on_invalid_input() {
        // Recovery still fails if data is not valid JPEG
        let result = decode_with_recovery(b"not a jpeg");
        assert!(result.is_err());
    }

    #[test]
    fn test_is_sof_at_offset_basic() {
        // SOF0 at position 0: FF C0
        let data = [0xFF, 0xC0, 0x00, 0x08, 0x08, 0x00, 0x01, 0x00, 0x01, 0x01];
        assert!(is_sof_at_offset(&data, SOF_MARKER_BYTE_OFFSET));
    }

    #[test]
    fn test_is_sof_at_offset_sof2() {
        // SOF2 (progressive)
        let data = [0xFF, 0xC2, 0x00, 0x08, 0x08, 0x00, 0x01, 0x00, 0x01, 0x01];
        assert!(is_sof_at_offset(&data, SOF_MARKER_BYTE_OFFSET));
    }

    #[test]
    fn test_is_sof_at_offset_not_sof() {
        // Not a SOF marker
        let data = [0xFF, 0xE0, 0x00, 0x08, 0x08, 0x00, 0x01, 0x00, 0x01, 0x01];
        assert!(!is_sof_at_offset(&data, SOF_MARKER_BYTE_OFFSET));
    }
}