pdf_oxide 0.3.35

The fastest Rust PDF library with text extraction: 0.8ms mean, 100% pass rate on 3,830 PDFs. 5× faster than pdf_extract, 17× faster than oxidize_pdf. Extract, create, and edit PDFs.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
//! FlateDecode (zlib/deflate) implementation.
//!
//! This is the most common PDF compression filter, used in ~90% of PDFs.
//! Uses the flate2 crate for zlib decompression.

use crate::decoders::StreamDecoder;
use crate::error::{Error, Result};
use flate2::read::{DeflateDecoder, ZlibDecoder};
use std::io::Read;

/// Default cap for [`FlateDecoder`]: 256 MB per stream.
///
/// Prevents zip-bomb / flate-bomb attacks where a tiny compressed stream
/// expands to an arbitrarily large output, exhausting virtual memory and
/// triggering an allocator abort (SIGABRT / exit 134).
///
/// 256 MB accommodates A4 @ 600 DPI RGB (~99 MB) with headroom.
///
/// Override via:
/// - `PDF_OXIDE_MAX_DECOMPRESS_MB` environment variable (e.g. `64` for 64 MB)
/// - [`FlateDecoder::with_limit`] for programmatic control
pub const DEFAULT_MAX_DECOMPRESSED_BYTES: u64 = 256 * 1024 * 1024;

/// Read the decompression limit from the environment, falling back to the
/// compile-time default.
///
/// `PDF_OXIDE_MAX_DECOMPRESS_MB` is parsed as an unsigned integer count of
/// megabytes (1 MB = 1024 * 1024 bytes). If the variable is unset, is not
/// valid Unicode, or does not parse as a `u64`, the result is
/// [`DEFAULT_MAX_DECOMPRESSED_BYTES`].
///
/// A value so large that the byte count would not fit in a `u64` saturates
/// to `u64::MAX` instead of overflowing.
fn effective_limit() -> u64 {
    std::env::var("PDF_OXIDE_MAX_DECOMPRESS_MB")
        .ok()
        .and_then(|v| v.parse::<u64>().ok())
        // A plain `mb * 1024 * 1024` panics in debug builds and silently wraps
        // to an unrelated (possibly tiny) limit in release builds when the
        // variable holds a huge number.
        .map(|mb| mb.saturating_mul(1024 * 1024))
        .unwrap_or(DEFAULT_MAX_DECOMPRESSED_BYTES)
}

/// Plausibility check for the bytes salvaged from a failed decompression.
///
/// The output is accepted (`true`) when either of these holds:
///
/// 1. It contains, at any position, one of the byte sequences `BT`, `ET`,
///    `Tj`, `TJ`, `Tm`, `Td`, `stream`, `endobj` or `%PDF-`.
/// 2. At least 80% of its bytes are printable ASCII (`0x20..=0x7E`) or one of
///    tab, line feed, carriage return. This lets through marker-less but
///    legitimate ASCII payloads such as hex-encoded data or small object
///    streams.
///
/// Empty output is always rejected.
///
/// The point of the check is to tell genuinely partially-decoded data apart
/// from what `deflate` produces on a misaligned input: short runs of
/// high-bit-heavy pseudo-random bytes (#364 symptom: 128 bytes of `P\xffj!}`
/// repeating) that match neither rule.
fn looks_like_real_stream(output: &[u8]) -> bool {
    // Content-stream operators followed by common PDF structural keywords.
    const MARKERS: [&[u8]; 9] = [
        b"BT", b"ET", b"Tj", b"TJ", b"Tm", b"Td", b"stream", b"endobj", b"%PDF-",
    ];

    if output.is_empty() {
        return false;
    }

    let has_marker = MARKERS
        .iter()
        .any(|marker| output.windows(marker.len()).any(|window| window == *marker));
    if has_marker {
        return true;
    }

    // No marker found: fall back to the "mostly ASCII text" rule.
    let text_like = output
        .iter()
        .filter(|byte| matches!(**byte, b'\t' | b'\n' | b'\r' | 0x20..=0x7E))
        .count();

    // text_like / len >= 4 / 5, kept in integer arithmetic.
    text_like * 5 >= output.len() * 4
}

/// Returns `Err` if `output` reached the decompression cap, indicating that the
/// stream was truncated rather than fully decoded.
///
/// An output whose length is exactly `limit` counts as having reached the cap
/// and is rejected as well.
///
/// # Errors
///
/// Returns [`Error::Decode`] when `output.len() >= limit`. The message states
/// the limit in MB when it is a whole number of megabytes (1 MB = 1024 * 1024
/// bytes) and in bytes otherwise, so that small custom limits are not reported
/// as "0 MB".
#[inline]
fn check_limit(output: &[u8], limit: u64) -> Result<()> {
    const BYTES_PER_MB: u64 = 1024 * 1024;

    if output.len() as u64 >= limit {
        // Integer division alone would render e.g. a 10-byte limit as "0 MB".
        let shown_limit = if limit >= BYTES_PER_MB && limit % BYTES_PER_MB == 0 {
            format!("{} MB", limit / BYTES_PER_MB)
        } else {
            format!("{}-byte", limit)
        };
        return Err(Error::Decode(format!(
            "FlateDecode output reached the {} safety limit; \
             stream may be a flate bomb or an unusually large image",
            shown_limit
        )));
    }
    Ok(())
}

/// FlateDecode filter implementation.
///
/// Decompresses data using the zlib/deflate algorithm. The decompression cap
/// defaults to [`DEFAULT_MAX_DECOMPRESSED_BYTES`] and can be overridden with
/// [`FlateDecoder::with_limit`].
///
/// The type only carries its configuration, so it is cheap to clone and can
/// be printed with `{:?}` for diagnostics.
#[derive(Debug, Clone)]
pub struct FlateDecoder {
    /// Decompression cap in bytes. A stream whose decoded output reaches this
    /// size is rejected.
    pub max_decompressed_bytes: u64,
}

impl Default for FlateDecoder {
    /// Builds a decoder whose cap comes from `effective_limit()`: the
    /// `PDF_OXIDE_MAX_DECOMPRESS_MB` environment variable when it parses,
    /// otherwise the compile-time default.
    fn default() -> Self {
        Self::with_limit(effective_limit())
    }
}

impl FlateDecoder {
    /// Builds a decoder with an explicit decompression cap of `limit` bytes,
    /// tightening or relaxing the default 256 MB cap.
    ///
    /// Streams whose decoded output reaches `limit` bytes are rejected.
    pub fn with_limit(limit: u64) -> Self {
        let max_decompressed_bytes = limit;
        Self {
            max_decompressed_bytes,
        }
    }
}

impl StreamDecoder for FlateDecoder {
    fn decode(&self, input: &[u8]) -> Result<Vec<u8>> {
        let mut decoder = ZlibDecoder::new(input).take(self.max_decompressed_bytes);
        let mut output = Vec::new();

        // Try to read all data with standard zlib
        match decoder.read_to_end(&mut output) {
            Ok(_) => {
                check_limit(&output, self.max_decompressed_bytes)?;
                Ok(output)
            },
            Err(e) => {
                // Partial recovery: return only if output *looks like* a
                // plausible stream (#364). The pre-fix behaviour accepted
                // any non-empty buffer, which let strategies 2 and 3 return
                // misaligned-deflate garbage (`P\xffj!}` × 16 on
                // nougat_026.pdf pages 1/2/5) that the text extractor then
                // emitted as zero bytes of output.
                if !output.is_empty() && looks_like_real_stream(&output) {
                    check_limit(&output, self.max_decompressed_bytes)?;
                    log::warn!(
                        "FlateDecode partial recovery: extracted {} bytes before corruption: {}",
                        output.len(),
                        e
                    );
                    return Ok(output);
                }

                // Strategy 2: Try raw deflate (no zlib wrapper)
                // Some PDFs have corrupt zlib headers but valid deflate data
                log::info!("Zlib decode failed, trying raw deflate");
                output.clear();
                let mut deflate_decoder =
                    DeflateDecoder::new(input).take(self.max_decompressed_bytes);

                match deflate_decoder.read_to_end(&mut output) {
                    Ok(_) => {
                        check_limit(&output, self.max_decompressed_bytes)?;
                        log::info!("Raw deflate recovery succeeded: {} bytes", output.len());
                        Ok(output)
                    },
                    Err(deflate_err) => {
                        if !output.is_empty() && looks_like_real_stream(&output) {
                            check_limit(&output, self.max_decompressed_bytes)?;
                            log::warn!(
                                "Raw deflate partial recovery: extracted {} bytes before error",
                                output.len()
                            );
                            return Ok(output);
                        }

                        // Strategy 3: Try skipping zlib header (2 bytes) and reading deflate
                        if input.len() > 2 {
                            log::info!(
                                "Trying deflate after skipping potential corrupt zlib header"
                            );
                            output.clear();
                            let mut deflate_decoder =
                                DeflateDecoder::new(&input[2..]).take(self.max_decompressed_bytes);

                            match deflate_decoder.read_to_end(&mut output) {
                                Ok(_) => {
                                    check_limit(&output, self.max_decompressed_bytes)?;
                                    log::info!(
                                        "Deflate with header skip succeeded: {} bytes",
                                        output.len()
                                    );
                                    return Ok(output);
                                },
                                Err(_) => {
                                    if !output.is_empty() && looks_like_real_stream(&output) {
                                        check_limit(&output, self.max_decompressed_bytes)?;
                                        log::warn!(
                                            "Deflate with header skip partial recovery: {} bytes",
                                            output.len()
                                        );
                                        return Ok(output);
                                    }
                                },
                            }
                        }

                        // Strategy 4: Try fixing corrupt zlib header byte
                        // If first byte has invalid compression method, replace with 0x78 (standard deflate)
                        if input.len() >= 2 {
                            let first_byte = input[0];
                            let compression_method = first_byte & 0x0F;
                            if compression_method != 8 {
                                log::info!(
                                    "Detected invalid compression method {} in header byte 0x{:02x}, trying with corrected header",
                                    compression_method,
                                    first_byte
                                );
                                // Create new buffer with corrected header
                                let mut corrected = input.to_vec();
                                // Replace CM bits (0-3) with 8 (deflate), keep CINFO bits (4-7)
                                corrected[0] = (first_byte & 0xF0) | 0x08;

                                output.clear();
                                let mut decoder = ZlibDecoder::new(&corrected[..])
                                    .take(self.max_decompressed_bytes);
                                match decoder.read_to_end(&mut output) {
                                    Ok(_) if !output.is_empty() => {
                                        check_limit(&output, self.max_decompressed_bytes)?;
                                        log::info!(
                                            "Header correction recovery succeeded: {} bytes",
                                            output.len()
                                        );
                                        return Ok(output);
                                    },
                                    Err(_)
                                        if !output.is_empty()
                                            && looks_like_real_stream(&output) =>
                                    {
                                        check_limit(&output, self.max_decompressed_bytes)?;
                                        log::warn!(
                                            "Header correction partial recovery: {} bytes",
                                            output.len()
                                        );
                                        return Ok(output);
                                    },
                                    _ => {
                                        log::info!("Header correction failed");
                                    },
                                }
                            }
                        }

                        // Strategy 5: Brute-force scan for valid deflate data
                        // Try starting deflate decompression from offsets 0-20
                        // BUT validate the output contains valid PDF operators
                        log::info!("Trying brute-force scan for valid deflate data");
                        let max_offset = std::cmp::min(20, input.len());
                        for offset in 0..max_offset {
                            if offset == 0 || offset == 2 {
                                continue; // Already tried these
                            }

                            output.clear();
                            let mut deflate_decoder = DeflateDecoder::new(&input[offset..])
                                .take(self.max_decompressed_bytes);

                            match deflate_decoder.read_to_end(&mut output) {
                                Ok(_) if !output.is_empty() => {
                                    check_limit(&output, self.max_decompressed_bytes)?;
                                    // Validate output quality - check for PDF operators
                                    let decoded_str = String::from_utf8_lossy(&output);
                                    let has_pdf_operators = decoded_str.contains("BT")
                                        || decoded_str.contains("ET")
                                        || decoded_str.contains("Tj")
                                        || decoded_str.contains("TJ")
                                        || decoded_str.contains("Tm")
                                        || decoded_str.contains("Td");

                                    if has_pdf_operators {
                                        log::info!(
                                            "Brute-force deflate recovery succeeded at offset {}: {} bytes (validated PDF content)",
                                            offset,
                                            output.len()
                                        );
                                        return Ok(output);
                                    } else {
                                        log::info!(
                                            "Brute-force at offset {} produced {} bytes but no valid PDF operators - trying next offset",
                                            offset,
                                            output.len()
                                        );
                                        continue;
                                    }
                                },
                                Err(_) if !output.is_empty() => {
                                    check_limit(&output, self.max_decompressed_bytes)?;
                                    // Validate partial recovery too
                                    let decoded_str = String::from_utf8_lossy(&output);
                                    let has_pdf_operators = decoded_str.contains("BT")
                                        || decoded_str.contains("ET")
                                        || decoded_str.contains("Tj")
                                        || decoded_str.contains("TJ")
                                        || decoded_str.contains("Tm")
                                        || decoded_str.contains("Td");

                                    if has_pdf_operators {
                                        log::warn!(
                                            "Brute-force partial recovery at offset {}: {} bytes (validated PDF content)",
                                            offset,
                                            output.len()
                                        );
                                        return Ok(output);
                                    } else {
                                        log::info!(
                                            "Partial recovery at offset {} but no valid PDF operators - trying next offset",
                                            offset
                                        );
                                        continue;
                                    }
                                },
                                _ => continue,
                            }
                        }

                        // SPEC COMPLIANCE FIX: Removed strategies 8-9 that violated PDF spec
                        //
                        // Previous strategies 8-9 would return raw uncompressed data for streams
                        // labeled as /FlateDecode. This violates PDF Spec ISO 32000-1:2008,
                        // Section 7.3.8.2 which states that if a stream has /Filter /FlateDecode,
                        // it MUST be compressed with the FlateDecode algorithm.
                        //
                        // Returning raw data creates security risks:
                        // 1. Malicious PDFs could bypass compression validation
                        // 2. Type confusion attacks (treating compressed data as raw)
                        // 3. Inconsistent behavior across PDF processors
                        //
                        // Correct behavior: If all decompression strategies fail, return an error.
                        // The stream is either corrupted or malicious, and should not be processed.

                        log::error!(
                            "All FlateDecode recovery strategies failed. Zlib: {}, Deflate: {}",
                            e,
                            deflate_err
                        );
                        log::error!(
                            "Stream labeled as FlateDecode but cannot be decompressed - this violates PDF spec"
                        );

                        Err(Error::Decode(format!(
                            "FlateDecode decompression failed: stream is labeled as compressed but all decompression attempts failed. \
                            This violates PDF Spec ISO 32000-1:2008, Section 7.3.8.2. \
                            Zlib error: {}, Deflate error: {}. Compressed size: {} bytes.",
                            e,
                            deflate_err,
                            input.len()
                        )))
                    },
                }
            },
        }
    }

    fn name(&self) -> &str {
        "FlateDecode"
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use flate2::write::ZlibEncoder;
    use flate2::Compression;
    use std::io::Write;

    /// Small cap used by the `check_limit` tests. `check_limit` only compares
    /// the output length with the limit, so a small buffer exercises the same
    /// path as a 256 MB one without the allocation.
    const SMALL_LIMIT: u64 = 4096;

    /// Zlib-compresses `data` at the default compression level.
    fn zlib_compress(data: &[u8]) -> Vec<u8> {
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(data).unwrap();
        encoder.finish().unwrap()
    }

    // #364 — when a strategy's partial recovery emits garbage (valid-looking
    // deflate bits that decode to pseudo-random bytes on a misaligned input),
    // the decoder must not accept it. Strategy 5 already validates via PDF
    // content-stream operators; strategies 1–4 now validate via
    // `looks_like_real_stream`. This test pins that guard.
    #[test]
    fn looks_like_real_stream_rejects_repeating_garbage() {
        // Actual symptom from nougat_026.pdf page 1 before the fix: 128 bytes
        // of `P\xffj!}\xef\xbd\xbd\xef\xbd\xbd...` high-bit-heavy repetition.
        let garbage = b"P\xffj!}\xef\xbd\xbd\xef\xbd\xbd\xef\xbd\xbd\xef\xbd\xbd\xef\xbd\xbd\xef\xbd\xbd\xef\xbd\xbd\xef\xbd\xbd".repeat(4);
        assert!(
            !looks_like_real_stream(&garbage),
            "misaligned-deflate garbage must be rejected as a partial recovery"
        );
    }

    #[test]
    fn looks_like_real_stream_accepts_content_stream_operators() {
        let real = b"BT /F1 12 Tf 100 700 Td (hello) Tj ET";
        assert!(looks_like_real_stream(real));
    }

    #[test]
    fn looks_like_real_stream_accepts_ascii_only_object_stream() {
        // Object-stream-like payload: ASCII, no content-stream operators.
        let object_stream = b"1 0 obj\n<< /Length 42 >>\nstream\nhello world\nendstream\nendobj\n";
        assert!(looks_like_real_stream(object_stream));
    }

    #[test]
    fn looks_like_real_stream_rejects_empty() {
        assert!(!looks_like_real_stream(&[]));
    }

    #[test]
    fn test_flate_decode_simple() {
        let decoder = FlateDecoder::default();

        let original = b"Hello, FlateDecode!";
        let compressed = zlib_compress(original);

        let decoded = decoder.decode(&compressed).unwrap();
        assert_eq!(decoded, original);
    }

    #[test]
    fn test_flate_decode_empty() {
        let decoder = FlateDecoder::default();

        // A valid zlib stream that encodes zero bytes.
        let original = b"";
        let compressed = zlib_compress(original);

        let decoded = decoder.decode(&compressed).unwrap();
        assert_eq!(decoded, original);
    }

    #[test]
    fn test_flate_decode_large_data() {
        let decoder = FlateDecoder::default();

        // Large, highly repetitive payload.
        let original = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ".repeat(1000);
        let compressed = zlib_compress(&original);

        let decoded = decoder.decode(&compressed).unwrap();
        assert_eq!(decoded, original);
    }

    #[test]
    fn test_flate_decode_invalid_data() {
        let decoder = FlateDecoder::default();

        // Invalid zlib data - should fail decompression
        // SPEC COMPLIANCE: We now correctly reject invalid compressed data
        // instead of returning it as raw data (which violated PDF spec)
        let invalid = b"This is not zlib compressed data";
        let err = decoder
            .decode(invalid)
            .expect_err("non-zlib input must be rejected");

        // The error must come from the "all strategies failed" path.
        let error_msg = format!("{}", err);
        assert!(error_msg.contains("FlateDecode decompression failed"));
    }

    #[test]
    fn test_flate_decoder_name() {
        let decoder = FlateDecoder::default();
        assert_eq!(decoder.name(), "FlateDecode");
    }

    #[test]
    fn test_flate_bomb_rejected() {
        // Verify that check_limit rejects output at or above the cap.
        let at_cap = vec![0u8; SMALL_LIMIT as usize];
        let result = check_limit(&at_cap, SMALL_LIMIT);
        assert!(result.is_err());
        let msg = format!("{}", result.unwrap_err());
        assert!(msg.contains("safety limit"));

        let above_cap = vec![0u8; SMALL_LIMIT as usize + 1];
        assert!(check_limit(&above_cap, SMALL_LIMIT).is_err());
    }

    #[test]
    fn test_flate_bomb_rejected_end_to_end() {
        // 1 MiB of zeros compresses to a tiny stream; decoding it under a
        // 64 KiB cap must fail with the safety-limit error instead of
        // returning truncated data.
        let compressed = zlib_compress(&vec![0u8; 1024 * 1024]);
        let decoder = FlateDecoder::with_limit(64 * 1024);
        let err = decoder
            .decode(&compressed)
            .expect_err("output beyond the cap must be rejected");
        assert!(format!("{}", err).contains("safety limit"));
    }

    #[test]
    fn test_check_limit_below_threshold() {
        let small = vec![0u8; 1024];
        assert!(check_limit(&small, DEFAULT_MAX_DECOMPRESSED_BYTES).is_ok());
    }

    #[test]
    fn test_custom_limit_accepts_data_within_limit() {
        // A decoder with a small cap should accept data below that cap.
        let original = b"x".repeat(512);
        let compressed = zlib_compress(&original);

        let decoder = FlateDecoder::with_limit(1024);
        let decoded = decoder.decode(&compressed).unwrap();
        assert_eq!(decoded, original);
    }

    #[test]
    fn test_custom_limit_rejects_data_over_limit() {
        // A decoder with a tiny cap should reject data that exceeds it.
        let original = b"x".repeat(100);
        let compressed = zlib_compress(&original);

        let decoder = FlateDecoder::with_limit(10);
        let result = decoder.decode(&compressed);
        assert!(result.is_err(), "expected rejection when output exceeds custom limit");
    }

    #[test]
    fn test_bomb_error_does_not_expose_internal_symbol_name() {
        // The user-facing error message must not reference internal symbol names.
        let at_cap = vec![0u8; SMALL_LIMIT as usize];
        let result = check_limit(&at_cap, SMALL_LIMIT);
        assert!(result.is_err());
        let msg = format!("{}", result.unwrap_err());
        assert!(
            !msg.contains("MAX_DECOMPRESSED_BYTES"),
            "error message must not reference internal symbol names: {msg}"
        );
    }

    // Single test for env var to avoid parallel race conditions.
    // Tests all three cases sequentially in one function.
    #[test]
    fn test_effective_limit_env_variable() {
        const ENV_VAR: &str = "PDF_OXIDE_MAX_DECOMPRESS_MB";

        // SAFETY (applies to every block below): this is the only test in
        // this module that mutates the environment, and the module touches
        // the environment only through `std::env`.

        // Default (no env var)
        unsafe { std::env::remove_var(ENV_VAR) };
        assert_eq!(effective_limit(), DEFAULT_MAX_DECOMPRESSED_BYTES);

        // Valid override
        unsafe { std::env::set_var(ENV_VAR, "64") };
        assert_eq!(effective_limit(), 64 * 1024 * 1024);

        // Invalid value falls back to default
        unsafe { std::env::set_var(ENV_VAR, "not_a_number") };
        assert_eq!(effective_limit(), DEFAULT_MAX_DECOMPRESSED_BYTES);

        // Cleanup
        unsafe { std::env::remove_var(ENV_VAR) };
    }
}