pdf_oxide 0.3.32

The fastest Rust PDF library with text extraction: 0.8ms mean, 100% pass rate on 3,830 PDFs. 5× faster than pdf_extract, 17× faster than oxidize_pdf. Extract, create, and edit PDFs.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
//! FlateDecode (zlib/deflate) implementation.
//!
//! This is the most common PDF compression filter, used in ~90% of PDFs.
//! Uses the flate2 crate for zlib decompression.

use crate::decoders::StreamDecoder;
use crate::error::{Error, Result};
use flate2::read::{DeflateDecoder, ZlibDecoder};
use std::io::Read;

/// Default cap for [`FlateDecoder`]: 256 MB per stream.
///
/// Prevents zip-bomb / flate-bomb attacks where a tiny compressed stream
/// expands to an arbitrarily large output, exhausting virtual memory and
/// triggering an allocator abort (SIGABRT / exit 134).
///
/// 256 MB accommodates A4 @ 600 DPI RGB (~99 MB) with headroom.
///
/// Override via:
/// - `PDF_OXIDE_MAX_DECOMPRESS_MB` environment variable (e.g. `64` for 64 MB)
/// - [`FlateDecoder::with_limit`] for programmatic control
pub const DEFAULT_MAX_DECOMPRESSED_BYTES: u64 = 256 * 1024 * 1024;

/// Read the decompression limit from the environment, falling back to the
/// compile-time default.
///
/// `PDF_OXIDE_MAX_DECOMPRESS_MB` is interpreted as a whole number of megabytes
/// (1 MB = 1024 * 1024 bytes). [`DEFAULT_MAX_DECOMPRESSED_BYTES`] is returned
/// when the variable is unset, is not valid Unicode, does not parse as a
/// `u64`, or is so large that converting it to bytes would overflow `u64`.
fn effective_limit() -> u64 {
    const BYTES_PER_MB: u64 = 1024 * 1024;

    std::env::var("PDF_OXIDE_MAX_DECOMPRESS_MB")
        .ok()
        .and_then(|raw| raw.parse::<u64>().ok())
        // A plain `*` would panic in debug builds and silently wrap in release
        // builds (possibly down to a 0-byte cap); treat overflow like any
        // other unusable value instead.
        .and_then(|mb| mb.checked_mul(BYTES_PER_MB))
        .unwrap_or(DEFAULT_MAX_DECOMPRESSED_BYTES)
}

/// Returns `Err` if `output` reached the decompression cap, indicating that the
/// stream was truncated rather than fully decoded.
#[inline]
fn check_limit(output: &[u8], limit: u64) -> Result<()> {
    if output.len() as u64 >= limit {
        return Err(Error::Decode(format!(
            "FlateDecode output reached the {} MB safety limit; \
             stream may be a flate bomb or an unusually large image",
            limit / (1024 * 1024)
        )));
    }
    Ok(())
}

/// FlateDecode filter implementation.
///
/// Decompresses data using the zlib/deflate algorithm. The decompression cap
/// defaults to [`DEFAULT_MAX_DECOMPRESSED_BYTES`] and can be overridden with
/// [`FlateDecoder::with_limit`].
///
/// The decoder's only state is its byte cap, so it derives `Debug` and
/// `Clone` for diagnostics and cheap duplication.
#[derive(Debug, Clone)]
pub struct FlateDecoder {
    /// Maximum number of decompressed bytes accepted per stream.
    pub max_decompressed_bytes: u64,
}

impl Default for FlateDecoder {
    /// Builds a decoder whose cap is whatever `effective_limit()` returns at
    /// the moment of the call: the `PDF_OXIDE_MAX_DECOMPRESS_MB` override when
    /// it is usable, otherwise [`DEFAULT_MAX_DECOMPRESSED_BYTES`].
    fn default() -> Self {
        let max_decompressed_bytes = effective_limit();
        Self {
            max_decompressed_bytes,
        }
    }
}

impl FlateDecoder {
    /// Creates a decoder that rejects any stream decompressing to more than
    /// `limit` bytes. Use this to tighten or relax the default 256 MB cap.
    pub fn with_limit(limit: u64) -> Self {
        Self {
            max_decompressed_bytes: limit,
        }
    }
}

/// Returns `true` if `data` contains any of the two-byte sequences `BT`, `ET`,
/// `Tj`, `TJ`, `Tm` or `Td`.
///
/// Used as a loose plausibility check on brute-force recovery output. The
/// scan runs directly on the bytes, so no string copy of the (potentially
/// very large) buffer is created.
fn contains_pdf_text_operator(data: &[u8]) -> bool {
    data.windows(2)
        .any(|pair| matches!(pair, b"BT" | b"ET" | b"Tj" | b"TJ" | b"Tm" | b"Td"))
}

/// Empties `output`, then reads from `reader` into it until end of stream, a
/// read error, or `limit` bytes, whichever comes first.
///
/// Bytes read before an error stay in `output`, which is what makes partial
/// recovery possible for the caller.
fn read_capped<R: Read>(reader: R, limit: u64, output: &mut Vec<u8>) -> std::io::Result<usize> {
    output.clear();
    reader.take(limit).read_to_end(output)
}

impl StreamDecoder for FlateDecoder {
    /// Decompresses a FlateDecode stream, falling back to a series of recovery
    /// strategies when the data is not a clean zlib stream.
    ///
    /// Strategies, in order (the first one that yields data wins):
    /// 1. standard zlib, accepting partial output if the stream breaks midway;
    /// 2. raw deflate over the whole input;
    /// 3. raw deflate after skipping the first two bytes;
    /// 4. zlib with the compression-method nibble of the first byte forced to 8;
    /// 5. raw deflate from offsets 1 and 3..20, accepted only if the output
    ///    contains a PDF text operator.
    ///
    /// Every read is capped at `max_decompressed_bytes`.
    ///
    /// # Errors
    ///
    /// Returns an error from `check_limit` as soon as any non-empty result
    /// reaches the cap, and `Error::Decode` when no strategy produces data.
    fn decode(&self, input: &[u8]) -> Result<Vec<u8>> {
        let limit = self.max_decompressed_bytes;
        let mut output = Vec::new();

        // Strategy 1: try to read all data with standard zlib.
        let zlib_err = match read_capped(ZlibDecoder::new(input), limit, &mut output) {
            Ok(_) => {
                check_limit(&output, limit)?;
                return Ok(output);
            },
            Err(e) => e,
        };

        // Partial recovery: if we got ANY data before the error, use it.
        if !output.is_empty() {
            check_limit(&output, limit)?;
            log::warn!(
                "FlateDecode partial recovery: extracted {} bytes before corruption: {}",
                output.len(),
                zlib_err
            );
            return Ok(output);
        }

        // Strategy 2: try raw deflate (no zlib wrapper).
        // Some PDFs have corrupt zlib headers but valid deflate data.
        log::info!("Zlib decode failed, trying raw deflate");
        let deflate_err = match read_capped(DeflateDecoder::new(input), limit, &mut output) {
            Ok(_) => {
                check_limit(&output, limit)?;
                log::info!("Raw deflate recovery succeeded: {} bytes", output.len());
                return Ok(output);
            },
            Err(e) => e,
        };

        if !output.is_empty() {
            check_limit(&output, limit)?;
            log::warn!(
                "Raw deflate partial recovery: extracted {} bytes before error",
                output.len()
            );
            return Ok(output);
        }

        // Strategy 3: try skipping the zlib header (2 bytes) and reading deflate.
        if input.len() > 2 {
            log::info!("Trying deflate after skipping potential corrupt zlib header");
            match read_capped(DeflateDecoder::new(&input[2..]), limit, &mut output) {
                Ok(_) => {
                    check_limit(&output, limit)?;
                    log::info!("Deflate with header skip succeeded: {} bytes", output.len());
                    return Ok(output);
                },
                Err(_) if !output.is_empty() => {
                    check_limit(&output, limit)?;
                    log::warn!(
                        "Deflate with header skip partial recovery: {} bytes",
                        output.len()
                    );
                    return Ok(output);
                },
                Err(_) => {},
            }
        }

        // Strategy 4: try fixing a corrupt zlib header byte.
        // If the first byte has an invalid compression method, force it to 8 (deflate).
        if input.len() >= 2 {
            let first_byte = input[0];
            let compression_method = first_byte & 0x0F;
            if compression_method != 8 {
                log::info!(
                    "Detected invalid compression method {} in header byte 0x{:02x}, trying with corrected header",
                    compression_method,
                    first_byte
                );
                // Copy the input and replace the CM bits (0-3) with 8, keeping
                // the CINFO bits (4-7).
                let mut corrected = input.to_vec();
                corrected[0] = (first_byte & 0xF0) | 0x08;

                let outcome = read_capped(ZlibDecoder::new(&corrected[..]), limit, &mut output);
                if output.is_empty() {
                    log::info!("Header correction failed");
                } else {
                    check_limit(&output, limit)?;
                    match outcome {
                        Ok(_) => {
                            log::info!(
                                "Header correction recovery succeeded: {} bytes",
                                output.len()
                            );
                        },
                        Err(_) => {
                            log::warn!(
                                "Header correction partial recovery: {} bytes",
                                output.len()
                            );
                        },
                    }
                    return Ok(output);
                }
            }
        }

        // Strategy 5: brute-force scan for valid deflate data.
        // Try starting deflate decompression from offsets 0-20, BUT only accept
        // output that contains valid PDF operators.
        log::info!("Trying brute-force scan for valid deflate data");
        let max_offset = input.len().min(20);
        for offset in 0..max_offset {
            if offset == 0 || offset == 2 {
                continue; // Already tried these
            }

            let outcome = read_capped(DeflateDecoder::new(&input[offset..]), limit, &mut output);
            if output.is_empty() {
                continue;
            }
            check_limit(&output, limit)?;

            // Validate output quality (complete and partial results alike).
            let validated = contains_pdf_text_operator(&output);
            match (outcome, validated) {
                (Ok(_), true) => {
                    log::info!(
                        "Brute-force deflate recovery succeeded at offset {}: {} bytes (validated PDF content)",
                        offset,
                        output.len()
                    );
                    return Ok(output);
                },
                (Ok(_), false) => {
                    log::info!(
                        "Brute-force at offset {} produced {} bytes but no valid PDF operators - trying next offset",
                        offset,
                        output.len()
                    );
                },
                (Err(_), true) => {
                    log::warn!(
                        "Brute-force partial recovery at offset {}: {} bytes (validated PDF content)",
                        offset,
                        output.len()
                    );
                    return Ok(output);
                },
                (Err(_), false) => {
                    log::info!(
                        "Partial recovery at offset {} but no valid PDF operators - trying next offset",
                        offset
                    );
                },
            }
        }

        // SPEC COMPLIANCE: earlier strategies that returned the raw,
        // uncompressed bytes for streams labeled /FlateDecode were removed.
        // PDF Spec ISO 32000-1:2008, Section 7.3.8.2 requires such a stream to
        // be compressed with the FlateDecode algorithm, and handing back raw
        // data creates security risks:
        // 1. Malicious PDFs could bypass compression validation
        // 2. Type confusion attacks (treating compressed data as raw)
        // 3. Inconsistent behavior across PDF processors
        //
        // If all decompression strategies fail, return an error: the stream is
        // either corrupted or malicious and should not be processed.
        log::error!(
            "All FlateDecode recovery strategies failed. Zlib: {}, Deflate: {}",
            zlib_err,
            deflate_err
        );
        log::error!(
            "Stream labeled as FlateDecode but cannot be decompressed - this violates PDF spec"
        );

        Err(Error::Decode(format!(
            "FlateDecode decompression failed: stream is labeled as compressed but all decompression attempts failed. \
            This violates PDF Spec ISO 32000-1:2008, Section 7.3.8.2. \
            Zlib error: {}, Deflate error: {}. Compressed size: {} bytes.",
            zlib_err,
            deflate_err,
            input.len()
        )))
    }

    /// Returns the PDF filter name handled by this decoder.
    fn name(&self) -> &str {
        "FlateDecode"
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use flate2::write::ZlibEncoder;
    use flate2::Compression;
    use std::io::Write;

    /// Zlib-compresses `data` at the default compression level.
    fn zlib_compress(data: &[u8]) -> Vec<u8> {
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(data).unwrap();
        encoder.finish().unwrap()
    }

    #[test]
    fn test_flate_decode_simple() {
        let decoder = FlateDecoder::default();

        // Round-trip a short message.
        let original = b"Hello, FlateDecode!";
        let compressed = zlib_compress(original);

        let decoded = decoder.decode(&compressed).unwrap();
        assert_eq!(decoded, original);
    }

    #[test]
    fn test_flate_decode_empty() {
        let decoder = FlateDecoder::default();

        // Round-trip empty data.
        let original = b"";
        let compressed = zlib_compress(original);

        let decoded = decoder.decode(&compressed).unwrap();
        assert_eq!(decoded, original);
    }

    #[test]
    fn test_flate_decode_large_data() {
        let decoder = FlateDecoder::default();

        // Round-trip large repeated data.
        let original = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ".repeat(1000);
        let compressed = zlib_compress(&original);

        let decoded = decoder.decode(&compressed).unwrap();
        assert_eq!(decoded, original);
    }

    #[test]
    fn test_flate_decode_invalid_data() {
        let decoder = FlateDecoder::default();

        // Invalid zlib data - should fail decompression
        // SPEC COMPLIANCE: We now correctly reject invalid compressed data
        // instead of returning it as raw data (which violated PDF spec)
        let invalid = b"This is not zlib compressed data";
        let result = decoder.decode(invalid);
        assert!(result.is_err());

        // Verify the error message identifies the failed decompression
        if let Err(e) = result {
            let error_msg = format!("{}", e);
            assert!(error_msg.contains("FlateDecode decompression failed"));
        }
    }

    #[test]
    fn test_flate_decoder_name() {
        let decoder = FlateDecoder::default();
        assert_eq!(decoder.name(), "FlateDecode");
    }

    #[test]
    fn test_flate_bomb_rejected() {
        // Verify that check_limit rejects output at or above the cap.
        let large = vec![0u8; DEFAULT_MAX_DECOMPRESSED_BYTES as usize];
        let result = check_limit(&large, DEFAULT_MAX_DECOMPRESSED_BYTES);
        assert!(result.is_err());
        let msg = format!("{}", result.unwrap_err());
        assert!(msg.contains("safety limit"));
    }

    #[test]
    fn test_check_limit_below_threshold() {
        let small = vec![0u8; 1024];
        assert!(check_limit(&small, DEFAULT_MAX_DECOMPRESSED_BYTES).is_ok());
    }

    #[test]
    fn test_custom_limit_accepts_data_within_limit() {
        // A decoder with a small cap should accept data below that cap.
        let original = b"x".repeat(512);
        let compressed = zlib_compress(&original);

        let decoder = FlateDecoder::with_limit(1024);
        let decoded = decoder.decode(&compressed).unwrap();
        assert_eq!(decoded, original);
    }

    #[test]
    fn test_custom_limit_rejects_data_over_limit() {
        // A decoder with a tiny cap should reject data that exceeds it.
        let original = b"x".repeat(100);
        let compressed = zlib_compress(&original);

        let decoder = FlateDecoder::with_limit(10);
        let result = decoder.decode(&compressed);
        assert!(result.is_err(), "expected rejection when output exceeds custom limit");
    }

    #[test]
    fn test_bomb_error_does_not_expose_internal_symbol_name() {
        // The user-facing error message must not reference internal symbol names.
        let large = vec![0u8; DEFAULT_MAX_DECOMPRESSED_BYTES as usize];
        let result = check_limit(&large, DEFAULT_MAX_DECOMPRESSED_BYTES);
        assert!(result.is_err());
        let msg = format!("{}", result.unwrap_err());
        assert!(
            !msg.contains("MAX_DECOMPRESSED_BYTES"),
            "error message must not reference internal symbol names: {msg}"
        );
    }

    // Single test for env var to avoid parallel race conditions.
    // Tests all three cases sequentially in one function.
    #[test]
    fn test_effective_limit_env_variable() {
        const VAR: &str = "PDF_OXIDE_MAX_DECOMPRESS_MB";

        // Remember any value set by the developer/CI so it can be put back.
        let previous = std::env::var_os(VAR);

        // SAFETY (applies to every environment mutation in this test): this is
        // the only test in this module that writes to the environment; the
        // others only read it through `std::env::var` (via
        // `FlateDecoder::default()`).
        // NOTE(review): confirm no other test in the crate reads or writes the
        // process environment outside of `std::env` while this test runs.

        // Default (no env var)
        unsafe { std::env::remove_var(VAR) };
        assert_eq!(effective_limit(), DEFAULT_MAX_DECOMPRESSED_BYTES);

        // Valid override
        unsafe { std::env::set_var(VAR, "64") };
        assert_eq!(effective_limit(), 64 * 1024 * 1024);

        // Invalid value falls back to default
        unsafe { std::env::set_var(VAR, "not_a_number") };
        assert_eq!(effective_limit(), DEFAULT_MAX_DECOMPRESSED_BYTES);

        // Cleanup: restore whatever was there before the test started.
        match previous {
            Some(value) => unsafe { std::env::set_var(VAR, value) },
            None => unsafe { std::env::remove_var(VAR) },
        }
    }
}