zpdf_parser/
filters.rs

1use zpdf_core::{Error, PdfDict, PdfName, PdfObject, Result};
2
/// Upper bound on the number of bytes a single decompression filter (Flate,
/// LZW, RunLength) may produce. `ParseLimits` is not threaded into this layer
/// (filter functions see only bytes + dict), so this constant is what stops
/// decompression bombs here.
const MAX_DECODED_OUTPUT: usize = 1024 * 1024 * 1024; // 1 GiB
7
8pub fn decode_stream(data: &[u8], dict: &PdfDict) -> Result<Vec<u8>> {
9    let filters = match dict.get("Filter") {
10        Some(PdfObject::Name(n)) => vec![n.clone()],
11        Some(PdfObject::Array(arr)) => arr
12            .iter()
13            .map(|obj| match obj {
14                PdfObject::Name(n) => Ok(n.clone()),
15                _ => Err(Error::TypeMismatch {
16                    expected: "Name",
17                    actual: obj.type_name(),
18                }),
19            })
20            .collect::<Result<Vec<_>>>()?,
21        Some(_) => {
22            return Err(Error::TypeMismatch {
23                expected: "Name or Array",
24                actual: "other",
25            })
26        }
27        None => return Ok(data.to_vec()),
28    };
29
30    let decode_parms = extract_decode_parms(dict, filters.len());
31
32    let mut result = data.to_vec();
33    for (i, filter) in filters.iter().enumerate() {
34        let params = decode_parms[i].as_ref();
35        result = apply_filter(filter, &result, params)?;
36        if let Some(p) = params {
37            result = apply_predictor(&result, p)?;
38        }
39    }
40    Ok(result)
41}
42
43fn extract_decode_parms(dict: &PdfDict, filter_count: usize) -> Vec<Option<PdfDict>> {
44    match dict.get("DecodeParms").or_else(|| dict.get("DP")) {
45        Some(PdfObject::Dict(d)) => {
46            let mut v = vec![None; filter_count];
47            if !v.is_empty() {
48                v[0] = Some(d.clone());
49            }
50            v
51        }
52        Some(PdfObject::Array(arr)) => arr
53            .iter()
54            .map(|obj| match obj {
55                PdfObject::Dict(d) => Some(d.clone()),
56                _ => None,
57            })
58            .chain(std::iter::repeat(None))
59            .take(filter_count)
60            .collect(),
61        _ => vec![None; filter_count],
62    }
63}
64
65fn apply_predictor(data: &[u8], params: &PdfDict) -> Result<Vec<u8>> {
66    let predictor = params.get_i64("Predictor").unwrap_or(1) as u32;
67    if predictor == 1 {
68        return Ok(data.to_vec());
69    }
70
71    let colors = params.get_i64("Colors").unwrap_or(1).max(1) as usize;
72    let bpc = params.get_i64("BitsPerComponent").unwrap_or(8).max(1) as usize;
73    let columns = params.get_i64("Columns").unwrap_or(1).max(1) as usize;
74
75    if predictor == 2 {
76        decode_tiff_predictor(data, colors, bpc, columns)
77    } else if predictor >= 10 {
78        decode_png_predictor(data, colors, bpc, columns)
79    } else {
80        Ok(data.to_vec())
81    }
82}
83
84fn decode_tiff_predictor(
85    data: &[u8],
86    colors: usize,
87    bpc: usize,
88    columns: usize,
89) -> Result<Vec<u8>> {
90    if bpc != 8 {
91        return Ok(data.to_vec());
92    }
93    let row_bytes = columns * colors;
94    let mut output = data.to_vec();
95    for row_start in (0..output.len()).step_by(row_bytes) {
96        let row_end = (row_start + row_bytes).min(output.len());
97        for i in (row_start + colors)..row_end {
98            output[i] = output[i].wrapping_add(output[i - colors]);
99        }
100    }
101    Ok(output)
102}
103
104fn decode_png_predictor(data: &[u8], colors: usize, bpc: usize, columns: usize) -> Result<Vec<u8>> {
105    let row_bytes = (colors * bpc * columns).div_ceil(8);
106    let bpp = (colors * bpc).div_ceil(8); // bytes per pixel for Sub/Paeth
107    let stride = 1 + row_bytes; // filter byte + row data
108
109    if !data.len().is_multiple_of(stride) && !data.is_empty() {
110        // Try to process what we can
111        tracing::debug!(
112            "PNG predictor: data length {} not multiple of stride {stride}",
113            data.len()
114        );
115    }
116
117    let num_rows = data.len().div_ceil(stride);
118    let mut output = Vec::with_capacity(num_rows * row_bytes);
119    let mut prev_row = vec![0u8; row_bytes];
120
121    let mut pos = 0;
122    while pos < data.len() {
123        if pos >= data.len() {
124            break;
125        }
126        let filter_type = data[pos];
127        pos += 1;
128
129        let available = (data.len() - pos).min(row_bytes);
130        let cur = &data[pos..pos + available];
131
132        let mut row = vec![0u8; row_bytes];
133        row[..available].copy_from_slice(cur);
134
135        match filter_type {
136            0 => {} // None
137            1 => {
138                // Sub
139                for i in bpp..row_bytes {
140                    row[i] = row[i].wrapping_add(row[i - bpp]);
141                }
142            }
143            2 => {
144                // Up
145                for i in 0..row_bytes {
146                    row[i] = row[i].wrapping_add(prev_row[i]);
147                }
148            }
149            3 => {
150                // Average
151                for i in 0..row_bytes {
152                    let left = if i >= bpp { row[i - bpp] as u16 } else { 0 };
153                    let above = prev_row[i] as u16;
154                    row[i] = row[i].wrapping_add(((left + above) / 2) as u8);
155                }
156            }
157            4 => {
158                // Paeth
159                for i in 0..row_bytes {
160                    let left = if i >= bpp { row[i - bpp] as i32 } else { 0 };
161                    let above = prev_row[i] as i32;
162                    let upper_left = if i >= bpp {
163                        prev_row[i - bpp] as i32
164                    } else {
165                        0
166                    };
167                    row[i] = row[i].wrapping_add(paeth(left, above, upper_left));
168                }
169            }
170            _ => {
171                tracing::debug!("PNG predictor: unknown filter type {filter_type}");
172            }
173        }
174
175        output.extend_from_slice(&row);
176        prev_row.copy_from_slice(&row);
177        pos += available;
178    }
179
180    Ok(output)
181}
182
/// PNG Paeth predictor: of `a` (left), `b` (above) and `c` (upper-left), return
/// the one closest to the linear estimate `a + b - c`. Ties are resolved in the
/// order `a`, then `b`, then `c`. The chosen value is narrowed to `u8`.
fn paeth(a: i32, b: i32, c: i32) -> u8 {
    let estimate = a + b - c;
    let dist_a = (estimate - a).abs();
    let dist_b = (estimate - b).abs();
    let dist_c = (estimate - c).abs();

    let a_is_nearest = dist_a <= dist_b && dist_a <= dist_c;
    let nearest = if a_is_nearest {
        a
    } else if dist_b <= dist_c {
        b
    } else {
        c
    };
    nearest as u8
}
196
197fn apply_filter(filter: &PdfName, data: &[u8], params: Option<&PdfDict>) -> Result<Vec<u8>> {
198    match filter.as_str() {
199        "FlateDecode" | "Fl" => decode_flate(data),
200        "LZWDecode" | "LZW" => {
201            // EarlyChange lives in DecodeParms; default 1 per ISO 32000.
202            let early_change = params
203                .and_then(|p| p.get_i64("EarlyChange").ok())
204                .unwrap_or(1);
205            lzw_decode(data, early_change)
206        }
207        "ASCIIHexDecode" | "AHx" => decode_ascii_hex(data),
208        "ASCII85Decode" | "A85" => decode_ascii85(data),
209        "RunLengthDecode" | "RL" => decode_run_length(data),
210        "DCTDecode" | "DCT" => decode_dct(data),
211        "CCITTFaxDecode" | "CCF" => {
212            let ccitt_params = crate::ccitt::CcittParams::from_dict(params);
213            crate::ccitt::decode(data, &ccitt_params)
214        }
215        "JBIG2Decode" => {
216            let jbig2_params = crate::jbig2::Jbig2Params::from_dict(params);
217            crate::jbig2::decode(data, &jbig2_params)
218        }
219        // JPXDecode output is decoded *pixels*, not raw samples, and JPEG 2000
220        // carries its own colour-space/alpha metadata that a bytes-only filter
221        // cannot return. Pass the codestream through unchanged; zpdf-image
222        // sniffs it (filter name + JP2/SOC magic) and runs the real decode.
223        "JPXDecode" => Ok(data.to_vec()),
224        other => Err(Error::UnsupportedFilter(other.to_string())),
225    }
226}
227
228/// PDF/TIFF variable-width LZW decoder (ISO 32000-1, 7.4.4.2).
229///
230/// 8-bit input symbols; code width starts at 9 and grows to a max of 12.
231/// Code 256 = ClearTable (reset dictionary, width back to 9),
232/// code 257 = EOD. Codes 258+ are dictionary strings. `early_change` is the
233/// DecodeParms EarlyChange value (default 1); when 1 the code width is increased
234/// one code earlier than the natural boundary.
235fn lzw_decode(data: &[u8], early_change: i64) -> Result<Vec<u8>> {
236    const CLEAR: u32 = 256;
237    const EOD: u32 = 257;
238
239    // EarlyChange is effectively a flag: any nonzero -> 1, explicit 0 -> 0.
240    let early: u32 = if early_change == 0 { 0 } else { 1 };
241
242    // Dictionary: index = code, value = decoded byte string. Slots 0..=255 are
243    // single bytes; 256/257 are placeholders so the first dynamic code is 258.
244    let mut table: Vec<Vec<u8>> = Vec::with_capacity(4096);
245    let reset = |t: &mut Vec<Vec<u8>>| {
246        t.clear();
247        for i in 0..256u32 {
248            t.push(vec![i as u8]);
249        }
250        t.push(Vec::new()); // 256 CLEAR (unused as a string)
251        t.push(Vec::new()); // 257 EOD   (unused as a string)
252    };
253    reset(&mut table);
254
255    let mut width: u32 = 9;
256    let mut bit_pos: usize = 0;
257    let total_bits = data.len() * 8;
258
259    // MSB-first reader; returns None when fewer than `width` bits remain.
260    let read_code = |bit_pos: &mut usize, width: u32| -> Option<u32> {
261        if *bit_pos + width as usize > total_bits {
262            return None;
263        }
264        let mut code: u32 = 0;
265        for _ in 0..width {
266            let byte = data[*bit_pos / 8];
267            let bit = (byte >> (7 - (*bit_pos % 8))) & 1;
268            code = (code << 1) | bit as u32;
269            *bit_pos += 1;
270        }
271        Some(code)
272    };
273
274    let mut out: Vec<u8> = Vec::new();
275    let mut prev: Option<u32> = None;
276
277    // Stop when input is exhausted (some streams omit the EOD marker).
278    while let Some(code) = read_code(&mut bit_pos, width) {
279        if code == EOD {
280            break;
281        }
282        if code == CLEAR {
283            reset(&mut table);
284            width = 9;
285            prev = None;
286            continue;
287        }
288
289        // Resolve the output string for this code.
290        let entry: Vec<u8> = if (code as usize) < table.len() {
291            table[code as usize].clone()
292        } else if code as usize == table.len() {
293            // KwKwK: code refers to the entry we are about to define.
294            match prev {
295                Some(p) => {
296                    let mut e = table[p as usize].clone();
297                    e.push(table[p as usize][0]);
298                    e
299                }
300                None => {
301                    return Err(Error::StreamDecode(format!(
302                        "LZWDecode: code {code} before any literal"
303                    )))
304                }
305            }
306        } else {
307            return Err(Error::StreamDecode(format!(
308                "LZWDecode: invalid code {code} (table size {})",
309                table.len()
310            )));
311        };
312
313        out.extend_from_slice(&entry);
314        if out.len() > MAX_DECODED_OUTPUT {
315            return Err(Error::StreamDecode(
316                "LZWDecode: output exceeds decompression limit".into(),
317            ));
318        }
319
320        // Add new dictionary entry = previous string + first byte of this entry.
321        // (Skipped for the first code after a clear, when prev is None.)
322        if let Some(p) = prev {
323            let mut new_entry = table[p as usize].clone();
324            new_entry.push(entry[0]);
325            table.push(new_entry);
326        }
327        prev = Some(code);
328
329        // Width growth. After the push above, `table.len()` is the index that
330        // will be assigned to the NEXT dictionary entry, which is exactly the
331        // value to test against the current width's capacity. EarlyChange (=1)
332        // bumps the width one code earlier. Grow when `table.len() + early >=
333        // 2^width`. (Validated against weezl/TIFF LZW across the 9->10->11->12
334        // and 4096 boundaries; an earlier `+ 1` here desynced real streams.)
335        let next_code = table.len() as u32;
336        if width < 12 && next_code + early >= (1u32 << width) {
337            width += 1;
338        }
339    }
340
341    Ok(out)
342}
343
344/// Outcome of one chunked inflate attempt: either the reader ran to a clean
345/// EOF, or it failed partway with whatever bytes were recovered first.
346enum InflateOutcome {
347    Complete(Vec<u8>),
348    Failed(Vec<u8>, String),
349}
350
351/// Drive `reader` to completion in fixed-size chunks so that a mid-stream
352/// error still yields the bytes decoded before it. Output is capped at
353/// [`MAX_DECODED_OUTPUT`]; hitting the cap is a hard error (a decompression
354/// bomb is not salvageable data).
355fn inflate_chunked(mut reader: impl std::io::Read) -> Result<InflateOutcome> {
356    let mut out = Vec::new();
357    let mut buf = [0u8; 16 * 1024];
358    loop {
359        match reader.read(&mut buf) {
360            Ok(0) => return Ok(InflateOutcome::Complete(out)),
361            Ok(n) => {
362                if out.len() + n > MAX_DECODED_OUTPUT {
363                    return Err(Error::StreamDecode(
364                        "FlateDecode: output exceeds decompression limit".into(),
365                    ));
366                }
367                out.extend_from_slice(&buf[..n]);
368            }
369            Err(e) => return Ok(InflateOutcome::Failed(out, e.to_string())),
370        }
371    }
372}
373
374/// FlateDecode with real-world tolerance: salvages partial output from
375/// truncated/corrupt zlib streams, retries headerless data as raw deflate,
376/// and skips a bounded run of leading garbage before a plausible zlib header.
377fn decode_flate(data: &[u8]) -> Result<Vec<u8>> {
378    use flate2::read::{DeflateDecoder, ZlibDecoder};
379
380    // Lenient: an empty stream decodes to nothing.
381    if data.is_empty() {
382        return Ok(Vec::new());
383    }
384
385    // Plausible zlib header at `i`: CM (low nibble of CMF) is 8 (deflate) and
386    // the FCHECK property holds (CMF<<8 | FLG divisible by 31).
387    let plausible_zlib = |i: usize| {
388        data.len() >= i + 2
389            && data[i] & 0x0f == 8
390            && ((data[i] as u32) << 8 | data[i + 1] as u32).is_multiple_of(31)
391    };
392
393    let mut zlib_err: Option<String> = None;
394    if plausible_zlib(0) {
395        match inflate_chunked(ZlibDecoder::new(data))? {
396            InflateOutcome::Complete(out) => return Ok(out),
397            InflateOutcome::Failed(partial, err) if !partial.is_empty() => {
398                tracing::warn!(
399                    "FlateDecode: zlib stream failed after {} bytes ({err}); keeping partial output",
400                    partial.len()
401                );
402                return Ok(partial);
403            }
404            InflateOutcome::Failed(_, err) => zlib_err = Some(err),
405        }
406    }
407
408    // The header was implausible (or decoded to nothing): look for a plausible
409    // CMF/FLG pair after a bounded garbage/whitespace prefix.
410    const MAX_HEADER_SCAN: usize = 64;
411    if let Some(k) = (1..data.len().min(MAX_HEADER_SCAN)).find(|&k| plausible_zlib(k)) {
412        match inflate_chunked(ZlibDecoder::new(&data[k..]))? {
413            InflateOutcome::Complete(out) => {
414                tracing::warn!("FlateDecode: skipped {k} bytes of leading garbage");
415                return Ok(out);
416            }
417            InflateOutcome::Failed(partial, err) if !partial.is_empty() => {
418                tracing::warn!(
419                    "FlateDecode: zlib stream at offset {k} failed ({err}); keeping {} partial bytes",
420                    partial.len()
421                );
422                return Ok(partial);
423            }
424            InflateOutcome::Failed(..) => {}
425        }
426    }
427
428    // Last resort: some writers emit raw deflate with no zlib wrapper.
429    match inflate_chunked(DeflateDecoder::new(data))? {
430        InflateOutcome::Complete(out) => {
431            tracing::warn!("FlateDecode: decoded as raw deflate (missing zlib header)");
432            Ok(out)
433        }
434        InflateOutcome::Failed(partial, err) if !partial.is_empty() => {
435            tracing::warn!(
436                "FlateDecode: raw deflate failed ({err}); keeping {} partial bytes",
437                partial.len()
438            );
439            Ok(partial)
440        }
441        InflateOutcome::Failed(_, err) => Err(Error::StreamDecode(format!(
442            "FlateDecode: {}",
443            zlib_err.unwrap_or(err)
444        ))),
445    }
446}
447
448/// Lenient ASCIIHexDecode: whitespace is ignored anywhere, stray non-hex bytes
449/// are skipped (warned, not fatal), and anything after the `>` EOD marker is
450/// ignored, so partial/dirty streams still decode.
451fn decode_ascii_hex(data: &[u8]) -> Result<Vec<u8>> {
452    let mut output = Vec::with_capacity(data.len() / 2);
453    let mut high: Option<u8> = None;
454    let mut stray = 0usize;
455
456    for &b in data {
457        if b == b'>' {
458            break; // EOD; bytes after it are ignored
459        }
460        if b.is_ascii_whitespace() || b == 0 {
461            continue;
462        }
463        let nibble = match b {
464            b'0'..=b'9' => b - b'0',
465            b'a'..=b'f' => b - b'a' + 10,
466            b'A'..=b'F' => b - b'A' + 10,
467            _ => {
468                stray += 1;
469                continue;
470            }
471        };
472
473        match high {
474            None => high = Some(nibble),
475            Some(h) => {
476                output.push((h << 4) | nibble);
477                high = None;
478            }
479        }
480    }
481
482    if let Some(h) = high {
483        output.push(h << 4);
484    }
485    if stray > 0 {
486        tracing::warn!("ASCIIHexDecode: ignored {stray} invalid byte(s)");
487    }
488
489    Ok(output)
490}
491
492/// Lenient ASCII85Decode: whitespace is ignored anywhere, stray bytes outside
493/// the alphabet are skipped (warned, not fatal), and everything from the `~`
494/// of the `~>` EOD marker on is ignored, salvaging partial output.
495fn decode_ascii85(data: &[u8]) -> Result<Vec<u8>> {
496    let mut output = Vec::new();
497    // u64 accumulator: a 5-char group of bytes near 'u' encodes a value just
498    // above u32::MAX; the spec calls it invalid, but it must not overflow.
499    let mut tuple: u64 = 0;
500    let mut count = 0usize;
501    let mut stray = 0usize;
502
503    for &b in data {
504        if b == b'~' {
505            break; // start of the "~>" EOD marker; ignore it and the rest
506        }
507        if b.is_ascii_whitespace() || b == 0 {
508            continue;
509        }
510
511        if b == b'z' && count == 0 {
512            output.extend_from_slice(&[0, 0, 0, 0]);
513            continue;
514        }
515
516        if !(b'!'..=b'u').contains(&b) {
517            stray += 1;
518            continue;
519        }
520
521        tuple = tuple * 85 + (b - b'!') as u64;
522        count += 1;
523
524        if count == 5 {
525            let t = (tuple & 0xFFFF_FFFF) as u32;
526            output.extend_from_slice(&t.to_be_bytes());
527            tuple = 0;
528            count = 0;
529        }
530    }
531
532    // Handle remaining bytes
533    if count > 1 {
534        for _ in count..5 {
535            tuple = tuple * 85 + 84; // pad with 'u'
536        }
537        let t = (tuple & 0xFFFF_FFFF) as u32;
538        for i in 0..(count - 1) {
539            output.push((t >> (24 - i * 8)) as u8);
540        }
541    }
542    if stray > 0 {
543        tracing::warn!("ASCII85Decode: ignored {stray} invalid byte(s)");
544    }
545
546    Ok(output)
547}
548
549fn decode_dct(data: &[u8]) -> Result<Vec<u8>> {
550    use zune_jpeg::JpegDecoder;
551
552    // Adobe YCCK JPEGs (APP14 transform == 2, 4 components) are mis-handled by
553    // zune-jpeg's built-in YCCK->RGB: it applies a spurious `255 - x`, producing
554    // a colour-negative image (a white CMYK page reads back as black). zune has
555    // no YCCK->CMYK arm either, so we take the raw YCCK channels and convert them
556    // ourselves. Plain Adobe CMYK (transform 0) decodes correctly via zune's
557    // CMYK->RGB, so it stays on the default path.
558    if jpeg_is_adobe_ycck(data) {
559        use zune_jpeg::zune_core::colorspace::ColorSpace;
560        use zune_jpeg::zune_core::options::DecoderOptions;
561        let opts = DecoderOptions::default().jpeg_set_out_colorspace(ColorSpace::YCCK);
562        let mut decoder = JpegDecoder::new_with_options(std::io::Cursor::new(data), opts);
563        match decoder.decode() {
564            Ok(ycck) if decoder.output_colorspace() == Some(ColorSpace::YCCK) => {
565                return Ok(ycck_to_rgb(&ycck));
566            }
567            // Unexpected (e.g. not actually 4-component): fall through to the
568            // default decode rather than mangle the data.
569            _ => {}
570        }
571    }
572
573    let mut decoder = JpegDecoder::new(std::io::Cursor::new(data));
574    decoder
575        .decode()
576        .map_err(|e| Error::StreamDecode(format!("DCTDecode: {e}")))
577}
578
579/// Convert raw upsampled Adobe YCCK samples (`Y, Cb, Cr, K` per pixel) to RGB.
580///
581/// In Adobe YCCK the chroma channels encode the *complement* of C/M/Y, so the
582/// JFIF YCbCr->RGB output is the transmitted (inverted) ink: `C = 1 − R'`,
583/// `M = 1 − G'`, `Y = 1 − B'`. The 4th channel is the black-ink amount
584/// (`K_raw = 255` ⇒ full black). The recovered DeviceCMYK is converted through
585/// the shared Adobe polynomial ([`zpdf_color::cmyk_to_rgb`]) so YCCK JPEGs match
586/// every other DeviceCMYK path — e.g. 100 % K is a dark near-black, not pure
587/// black. (The previous `channel * (255 − K_raw)` shortcut was the naïve
588/// `(1−c)(1−k)`, which over-saturated like a non-fidelity viewer.)
589fn ycck_to_rgb(ycck: &[u8]) -> Vec<u8> {
590    let mut out = Vec::with_capacity(ycck.len() / 4 * 3);
591    for px in ycck.chunks_exact(4) {
592        let (y, cb, cr) = (px[0] as f64, px[1] as f64, px[2] as f64);
593        // JFIF YCbCr -> R'G'B' (transmitted light = complement of C/M/Y ink).
594        let rp = (y + 1.402 * (cr - 128.0)).clamp(0.0, 255.0);
595        let gp = (y - 0.344_136 * (cb - 128.0) - 0.714_136 * (cr - 128.0)).clamp(0.0, 255.0);
596        let bp = (y + 1.772 * (cb - 128.0)).clamp(0.0, 255.0);
597        let (r, g, b) = zpdf_color::cmyk_to_rgb(
598            1.0 - rp / 255.0,
599            1.0 - gp / 255.0,
600            1.0 - bp / 255.0,
601            px[3] as f64 / 255.0,
602        );
603        out.push((r * 255.0).round() as u8);
604        out.push((g * 255.0).round() as u8);
605        out.push((b * 255.0).round() as u8);
606    }
607    out
608}
609
/// Report whether a JPEG is Adobe YCCK: an Adobe APP14 segment with transform 2
/// together with a frame header (SOF) declaring 4 components.
///
/// This is a cheap byte walk over the marker segments only. It starts after
/// the 2-byte SOI, stops at the first SOS (the header is complete by then) and
/// also stops at any segment whose length is invalid or runs past the data.
/// If a segment kind occurs more than once, the last occurrence decides.
fn jpeg_is_adobe_ycck(data: &[u8]) -> bool {
    let mut adobe_ycck = false;
    let mut four_components = false;
    let mut i = 2; // skip SOI (FFD8)
    while i + 3 < data.len() {
        if data[i] != 0xFF {
            i += 1;
            continue;
        }
        let marker = data[i + 1];
        // 0xFF after 0xFF is a fill byte: any marker may be preceded by a run
        // of them. Advance by ONE byte only, so the second 0xFF is re-examined
        // as the possible start of the real marker. (Stepping over both would
        // land on the marker code itself and lose the segment.)
        if marker == 0xFF {
            i += 1;
            continue;
        }
        // Standalone markers (no length field): TEM, RSTn, SOI/EOI.
        if marker == 0x01 || (0xD0..=0xD9).contains(&marker) {
            i += 2;
            continue;
        }
        let seg_len = ((data[i + 2] as usize) << 8) | data[i + 3] as usize;
        if seg_len < 2 {
            // The length counts its own two bytes, so anything smaller is corrupt.
            break;
        }
        let payload_start = i + 4;
        let payload_end = i + 2 + seg_len;
        if payload_end > data.len() {
            break;
        }
        let payload = &data[payload_start..payload_end];
        match marker {
            // APP14: "Adobe" + version(2) + flags0(2) + flags1(2) + transform(1).
            0xEE => {
                if payload.len() >= 12 && &payload[0..5] == b"Adobe" {
                    adobe_ycck = payload[11] == 2;
                }
            }
            // SOFn (baseline/progressive/etc.), excluding DHT(C4)/JPG(C8)/DAC(CC).
            0xC0..=0xCF if marker != 0xC4 && marker != 0xC8 && marker != 0xCC => {
                // precision(1) + height(2) + width(2) + Nf(1).
                if payload.len() >= 6 {
                    four_components = payload[5] == 4;
                }
            }
            // Start of scan: header is done.
            0xDA => break,
            _ => {}
        }
        i = payload_end;
    }
    adobe_ycck && four_components
}
659
660fn decode_run_length(data: &[u8]) -> Result<Vec<u8>> {
661    let mut output = Vec::new();
662    let mut i = 0;
663
664    while i < data.len() {
665        let length_byte = data[i];
666        i += 1;
667
668        if length_byte == 128 {
669            break; // EOD
670        } else if length_byte < 128 {
671            // Copy next (length_byte + 1) bytes literally
672            let count = length_byte as usize + 1;
673            if i + count > data.len() {
674                return Err(Error::StreamDecode("RunLengthDecode: truncated".into()));
675            }
676            if output.len() + count > MAX_DECODED_OUTPUT {
677                return Err(Error::StreamDecode(
678                    "RunLengthDecode: output exceeds decompression limit".into(),
679                ));
680            }
681            output.extend_from_slice(&data[i..i + count]);
682            i += count;
683        } else {
684            // Repeat next byte (257 - length_byte) times
685            let count = 257 - length_byte as usize;
686            if i >= data.len() {
687                return Err(Error::StreamDecode("RunLengthDecode: truncated".into()));
688            }
689            if output.len() + count > MAX_DECODED_OUTPUT {
690                return Err(Error::StreamDecode(
691                    "RunLengthDecode: output exceeds decompression limit".into(),
692                ));
693            }
694            let byte = data[i];
695            i += 1;
696            output.resize(output.len() + count, byte);
697        }
698    }
699
700    Ok(output)
701}
702
703#[cfg(test)]
704mod tests {
705    use super::*;
706
707    #[test]
708    fn ycck_white_decodes_white_not_black() {
709        // Adobe white (no ink): Y=255, Cb=Cr=128 (neutral), K_raw=0 → CMYK all 0.
710        let rgb = ycck_to_rgb(&[255, 128, 128, 0]);
711        assert_eq!(rgb, vec![255, 255, 255], "Adobe YCCK white must stay white");
712    }
713
714    #[test]
715    fn ycck_full_black_ink_decodes_near_black() {
716        // K_raw=255 ⇒ CMYK (0,0,0,1). The Adobe DeviceCMYK polynomial renders
717        // 100% K as a dark near-black, not pure black (matches every other path).
718        let rgb = ycck_to_rgb(&[255, 128, 128, 255]);
719        assert_eq!(rgb, vec![44, 46, 53]);
720    }
721
722    #[test]
723    fn ycck_neutral_gray_via_polynomial() {
724        // No CMY (chroma neutral, luma full) with half black ink ⇒ CMYK
725        // (0,0,0,0.5); the polynomial maps it lighter than the naïve 127.
726        let rgb = ycck_to_rgb(&[255, 128, 128, 128]);
727        assert_eq!(rgb, vec![154, 156, 159]);
728    }
729
730    #[test]
731    fn ycck_colored_pixel_via_polynomial() {
732        // Non-neutral chroma exercises the C/M/Y recovery + the full polynomial.
733        let rgb = ycck_to_rgb(&[200, 100, 150, 50]);
734        assert_eq!(rgb, vec![198, 165, 131]);
735    }
736
737    #[test]
738    fn adobe_ycck_detection() {
739        // Minimal marker stream: SOI, APP14(Adobe, transform=2), SOF0(4 comp), SOS.
740        let mut j = vec![0xFF, 0xD8];
741        // APP14, len=16: "Adobe"(5)+ver(2)+f0(2)+f1(2)+transform(1) = 12 payload, +2 len = 14... use 16 with pad.
742        j.extend_from_slice(&[0xFF, 0xEE, 0x00, 0x0E]);
743        j.extend_from_slice(b"Adobe");
744        j.extend_from_slice(&[0x00, 0x64, 0x00, 0x00, 0x00, 0x00, 0x02]); // version, flags, transform=2
745                                                                          // SOF0, len=17 (1 prec + 2 h + 2 w + 1 Nf=4 + 4*3 comp specs) -> payload 6+ needed.
746        j.extend_from_slice(&[0xFF, 0xC0, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x10, 0x04]);
747        j.extend_from_slice(&[1, 0x11, 0, 2, 0x11, 0, 3, 0x11, 0, 4, 0x11, 0]);
748        j.extend_from_slice(&[0xFF, 0xDA, 0x00, 0x02]); // SOS
749        assert!(jpeg_is_adobe_ycck(&j));
750
751        // transform=0 (plain CMYK) must NOT take the YCCK path.
752        let mut j0 = j.clone();
753        // transform byte is at: 2 (SOI) + 4 (app14 hdr) + 11 = index 17.
754        j0[17] = 0;
755        assert!(!jpeg_is_adobe_ycck(&j0));
756    }
757
758    #[test]
759    fn flate_roundtrip() {
760        use flate2::write::ZlibEncoder;
761        use flate2::Compression;
762        use std::io::Write;
763
764        let original = b"Hello, zpdf! This is a test of FlateDecode.";
765        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
766        encoder.write_all(original).unwrap();
767        let compressed = encoder.finish().unwrap();
768
769        let decoded = decode_flate(&compressed).unwrap();
770        assert_eq!(decoded, original);
771    }
772
773    #[test]
774    fn flate_partial_salvage_on_truncation() {
775        use flate2::write::ZlibEncoder;
776        use flate2::Compression;
777        use std::io::Write;
778
779        // Deterministic, mostly-incompressible data so the compressed stream
780        // is long and a truncation still leaves plenty of decodable input.
781        let mut state = 0x2545F491u64;
782        let original: Vec<u8> = (0..64 * 1024)
783            .map(|_| {
784                state = state
785                    .wrapping_mul(6364136223846793005)
786                    .wrapping_add(1442695040888963407);
787                (state >> 33) as u8
788            })
789            .collect();
790        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
791        encoder.write_all(&original).unwrap();
792        let compressed = encoder.finish().unwrap();
793
794        let truncated = &compressed[..compressed.len() / 2];
795        let decoded = decode_flate(truncated).unwrap();
796        assert!(!decoded.is_empty(), "partial output must be salvaged");
797        assert!(decoded.len() < original.len());
798        assert_eq!(
799            &original[..decoded.len()],
800            &decoded[..],
801            "salvaged bytes are a prefix"
802        );
803    }
804
805    #[test]
806    fn flate_raw_deflate_fallback() {
807        use flate2::write::DeflateEncoder;
808        use flate2::Compression;
809        use std::io::Write;
810
811        let original = b"raw deflate stream without a zlib wrapper".to_vec();
812        let mut encoder = DeflateEncoder::new(Vec::new(), Compression::default());
813        encoder.write_all(&original).unwrap();
814        let compressed = encoder.finish().unwrap();
815
816        let decoded = decode_flate(&compressed).unwrap();
817        assert_eq!(decoded, original);
818    }
819
820    #[test]
821    fn flate_skips_leading_garbage() {
822        use flate2::write::ZlibEncoder;
823        use flate2::Compression;
824        use std::io::Write;
825
826        let original = b"zlib data behind a garbage prefix".to_vec();
827        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
828        encoder.write_all(&original).unwrap();
829        let compressed = encoder.finish().unwrap();
830
831        // \r\n\xff: no byte pair in the prefix forms a plausible zlib header.
832        let mut data = b"\r\n\xff".to_vec();
833        data.extend_from_slice(&compressed);
834        let decoded = decode_flate(&data).unwrap();
835        assert_eq!(decoded, original);
836    }
837
838    #[test]
839    fn flate_empty_input_is_empty_output() {
840        assert_eq!(decode_flate(&[]).unwrap(), Vec::<u8>::new());
841    }
842
843    #[test]
844    fn flate_garbage_still_errors() {
845        // Pure ASCII text: implausible zlib header, invalid deflate.
846        assert!(decode_flate(b"this is not compressed data at all....").is_err());
847    }
848
849    #[test]
850    fn ascii_hex() {
851        let decoded = decode_ascii_hex(b"48 65 6C 6C 6F>").unwrap();
852        assert_eq!(decoded, b"Hello");
853    }
854
855    #[test]
856    fn ascii_hex_tolerates_stray_bytes_and_data_after_eod() {
857        // 'x'/'!' are not hex digits (skipped); '>' is EOD (rest ignored).
858        let decoded = decode_ascii_hex(b"48 65 x6C!6C 6F> trailing garbage \xff").unwrap();
859        assert_eq!(decoded, b"Hello");
860    }
861
862    #[test]
863    fn ascii85_basic() {
864        // "Man " encodes to "9jqo^" in ASCII85
865        let decoded = decode_ascii85(b"9jqo^~>").unwrap();
866        assert_eq!(decoded, b"Man ");
867    }
868
869    #[test]
870    fn ascii85_ignores_bytes_after_eod() {
871        let decoded = decode_ascii85(b"9jqo^~> stray bytes \xff\xfe after EOD").unwrap();
872        assert_eq!(decoded, b"Man ");
873    }
874
875    #[test]
876    fn ascii85_skips_stray_bytes_and_whitespace() {
877        // NUL and 0xFF are outside the alphabet: skipped, not fatal.
878        // Whitespace inside a group is ignored.
879        let decoded = decode_ascii85(b"9j\x00qo\xff ^~>").unwrap();
880        assert_eq!(decoded, b"Man ");
881    }
882
883    #[test]
884    fn ascii85_overflowing_group_does_not_panic() {
885        // "uuuuu" encodes a value above u32::MAX — invalid per spec, but must
886        // decode leniently (truncated) instead of overflowing.
887        assert!(decode_ascii85(b"uuuuu~>").is_ok());
888    }
889
890    #[test]
891    fn run_length_literal_and_repeat() {
892        // 2 literal bytes [0x41, 0x42], then repeat 0x43 three times, then EOD
893        let data = [1, 0x41, 0x42, 254, 0x43, 128];
894        let decoded = decode_run_length(&data).unwrap();
895        assert_eq!(decoded, vec![0x41, 0x42, 0x43, 0x43, 0x43]);
896    }
897
898    #[test]
899    fn png_predictor_none() {
900        // 2 columns, 1 color, 8 bpc → row_bytes = 2, stride = 3
901        // filter=0 (None): [0, 0x41, 0x42]
902        let data = [0, 0x41, 0x42];
903        let result = decode_png_predictor(&data, 1, 8, 2).unwrap();
904        assert_eq!(result, vec![0x41, 0x42]);
905    }
906
907    #[test]
908    fn png_predictor_sub() {
909        // filter=1 (Sub), bpp=1: each byte += left
910        // row: [1, 10, 5, 3] → decoded: [10, 15, 18]
911        let data = [1, 10, 5, 3];
912        let result = decode_png_predictor(&data, 1, 8, 3).unwrap();
913        assert_eq!(result, vec![10, 15, 18]);
914    }
915
916    #[test]
917    fn png_predictor_up() {
918        // filter=2 (Up): each byte += above
919        // row1: [0, 10, 20] → [10, 20]
920        // row2: [2, 5, 3]   → [15, 23]
921        let data = [0, 10, 20, 2, 5, 3];
922        let result = decode_png_predictor(&data, 1, 8, 2).unwrap();
923        assert_eq!(result, vec![10, 20, 15, 23]);
924    }
925
926    #[test]
927    fn png_predictor_paeth() {
928        // filter=4 (Paeth), 1 color 8bpc 3 columns, bpp=1
929        // row1: [0, 10, 20, 30]  → None: [10, 20, 30]
930        // row2: [4, 5, 7, 3]     → Paeth reconstruction
931        //   i=0: paeth(0, 10, 0)=10, 5+10=15
932        //   i=1: paeth(15, 20, 10)=20, 7+20=27
933        //   i=2: paeth(27, 30, 20)=30, 3+30=33
934        let data = [0, 10, 20, 30, 4, 5, 7, 3];
935        let result = decode_png_predictor(&data, 1, 8, 3).unwrap();
936        assert_eq!(result, vec![10, 20, 30, 15, 27, 33]);
937    }
938
939    #[test]
940    fn tiff_predictor_basic() {
941        // 3 colors (RGB), 8bpc, 2 columns → row = 6 bytes
942        // [R0,G0,B0, dR1,dG1,dB1] → [R0,G0,B0, R0+dR1, G0+dG1, B0+dB1]
943        let data = [100, 150, 200, 10, 20, 30];
944        let result = decode_tiff_predictor(&data, 3, 8, 2).unwrap();
945        assert_eq!(result, vec![100, 150, 200, 110, 170, 230]);
946    }
947
948    // --- LZWDecode ---
949
950    #[test]
951    fn lzw_canonical_vector() {
952        // Classic ISO 32000 / Adobe LZW example. 9-bit codes, MSB-first:
953        //   256 (Clear), 45 ('-'), 258 (KwKwK -> "--"), 259 ("---"),
954        //   65 ('A'), 259 ("---"), 66 ('B'), 257 (EOD)
955        let data = [0x80, 0x0B, 0x60, 0x50, 0x22, 0x0C, 0x0C, 0x85, 0x01];
956        let decoded = lzw_decode(&data, 1).unwrap();
957        assert_eq!(decoded, b"-----A---B");
958    }
959
960    #[test]
961    fn lzw_via_apply_filter_default_early_change() {
962        let data = [0x80, 0x0B, 0x60, 0x50, 0x22, 0x0C, 0x0C, 0x85, 0x01];
963        let name = PdfName::new("LZWDecode");
964        let out = apply_filter(&name, &data, None).unwrap();
965        assert_eq!(out, b"-----A---B");
966    }
967
968    #[test]
969    fn lzw_stops_at_end_without_eod() {
970        // Truncated before EOD; should decode the leading symbols and stop cleanly.
971        let data = [0x80, 0x0B, 0x60, 0x50];
972        let out = lzw_decode(&data, 1).unwrap();
973        assert!(out.starts_with(b"-"));
974    }
975
976    #[test]
977    fn lzw_empty_input() {
978        assert_eq!(lzw_decode(&[], 1).unwrap(), Vec::<u8>::new());
979    }
980
981    /// Encode with weezl (an independent, spec-conformant LZW producer) so the
982    /// decoder is validated against an EXTERNAL reference rather than its own
983    /// paired encoder. weezl's TIFF size-switch == PDF EarlyChange=1; its plain
984    /// MSB encoder == EarlyChange=0. Verified: weezl(tiff) output of the canonical
985    /// vector decodes to "-----A---B" here.
986    fn weezl_encode(data: &[u8], early_change: i64) -> Vec<u8> {
987        use weezl::{encode::Encoder, BitOrder};
988        let mut enc = if early_change == 0 {
989            Encoder::new(BitOrder::Msb, 8)
990        } else {
991            Encoder::with_tiff_size_switch(BitOrder::Msb, 8)
992        };
993        enc.encode(data).expect("weezl encode")
994    }
995
996    #[test]
997    fn lzw_roundtrip_against_weezl() {
998        // Cross every width boundary (9->10->11->12) and the 4096 auto-clear,
999        // for both EarlyChange settings, against an external reference encoder.
1000        for ec in [1i64, 0] {
1001            for &len in &[0usize, 1, 300, 600, 1200, 3000, 5000, 9000] {
1002                // Mix of low- and high-entropy bytes to grow the dictionary.
1003                let input: Vec<u8> = (0..len).map(|i| ((i * 7 + i / 11) % 251) as u8).collect();
1004                let encoded = weezl_encode(&input, ec);
1005                let decoded = lzw_decode(&encoded, ec).unwrap();
1006                assert_eq!(decoded, input, "ec={ec} len={len}");
1007            }
1008        }
1009    }
1010
1011    #[test]
1012    fn lzw_single_byte_run_roundtrip_against_weezl() {
1013        // A long single-symbol run exercises the KwKwK path heavily.
1014        let input = vec![b'A'; 5000];
1015        for ec in [1i64, 0] {
1016            let encoded = weezl_encode(&input, ec);
1017            assert_eq!(lzw_decode(&encoded, ec).unwrap(), input, "ec={ec}");
1018        }
1019    }
1020}
zpdf_parser/filters.rs

zpdf_parser/
filters.rs