Skip to main content

justpdf_core/
repair.rs

1//! PDF repair module — rebuilds the cross-reference table by scanning
2//! for object definitions when the normal xref/trailer structure is
3//! damaged or missing.
4
5use std::collections::HashMap;
6
7use crate::error::{JustPdfError, Result};
8use crate::object::{self, IndirectRef, PdfDict, PdfObject};
9use crate::parser::PdfDocument;
10use crate::tokenizer::Tokenizer;
11use crate::xref::{Xref, XrefEntry};
12
13/// Rebuild the cross-reference table by scanning the raw bytes for
14/// `N M obj` patterns and locating the trailer dictionary.
15///
16/// This is the core recovery routine: it walks every byte looking for
17/// object headers, records each object's offset, then tries to find
18/// (or synthesise) a trailer dictionary so that a usable [`Xref`] can
19/// be returned.
20pub fn rebuild_xref(data: &[u8]) -> Result<Xref> {
21    let entries = scan_object_headers(data);
22
23    if entries.is_empty() {
24        return Err(JustPdfError::InvalidXref {
25            offset: 0,
26            detail: "no objects found during repair scan".into(),
27        });
28    }
29
30    // Try to find a trailer dictionary the traditional way.
31    let trailer = find_trailer_dict(data).or_else(|_| synthesise_trailer(data, &entries))?;
32
33    let max_obj = entries.keys().copied().max().unwrap_or(0);
34
35    let mut xref = Xref::new();
36    for (&obj_num, &(offset, gen_num)) in &entries {
37        xref.entries.insert(
38            obj_num,
39            XrefEntry::InUse {
40                offset: offset as u64,
41                gen_num,
42            },
43        );
44    }
45
46    // Ensure /Size is present in the trailer.
47    let mut trailer = trailer;
48    if trailer.get_i64(b"Size").is_none() {
49        trailer.insert(
50            b"Size".to_vec(),
51            PdfObject::Integer((max_obj + 1) as i64),
52        );
53    }
54
55    xref.trailer = trailer;
56    Ok(xref)
57}
58
59/// Try normal parsing first; if it fails, fall back to [`rebuild_xref`].
60pub fn repair_document(data: Vec<u8>) -> Result<PdfDocument> {
61    // Happy path — normal parsing.
62    match PdfDocument::from_bytes(data.clone()) {
63        Ok(doc) => return Ok(doc),
64        Err(_) => {}
65    }
66
67    // Fallback — repair.
68    PdfDocument::from_bytes_with_repair(data)
69}
70
71// ---------------------------------------------------------------------------
72// Internal helpers
73// ---------------------------------------------------------------------------
74
75/// Scan the entire file for lines matching `\d+ \d+ obj`.
76///
77/// Returns a map from object number to `(byte_offset, generation)`.
78/// If the same object number appears more than once, the *last*
79/// occurrence wins (mirrors incremental-update semantics).
80fn scan_object_headers(data: &[u8]) -> HashMap<u32, (usize, u16)> {
81    let mut map: HashMap<u32, (usize, u16)> = HashMap::new();
82    let len = data.len();
83    let mut i = 0;
84
85    while i < len {
86        // Fast-skip: we only care about positions that are at the start of
87        // a line (pos 0 or preceded by \n or \r).
88        if i != 0 && data[i - 1] != b'\n' && data[i - 1] != b'\r' {
89            // Advance to next line boundary.
90            while i < len && data[i] != b'\n' && data[i] != b'\r' {
91                i += 1;
92            }
93            // Skip the newline character(s).
94            while i < len && (data[i] == b'\n' || data[i] == b'\r') {
95                i += 1;
96            }
97            continue;
98        }
99
100        // Try to match `<digits> <digits> obj` at position i.
101        if let Some((obj_num, gen_num, after)) = match_obj_header(data, i) {
102            map.insert(obj_num, (i, gen_num));
103            i = after;
104        } else {
105            // Advance to next line.
106            while i < len && data[i] != b'\n' && data[i] != b'\r' {
107                i += 1;
108            }
109            while i < len && (data[i] == b'\n' || data[i] == b'\r') {
110                i += 1;
111            }
112        }
113    }
114
115    map
116}
117
/// Try to match `<obj_num> <gen_num> obj` starting at `pos`.
///
/// Leading blanks are skipped, then two unsigned decimal numbers and the
/// keyword `obj` must follow, each separated by at least one in-line
/// white-space byte (space, tab, NUL or form feed).  Line terminators
/// are never accepted as separators, so a header cannot span lines.
///
/// The keyword must be followed by end of input, white-space, or a byte
/// that can start the object's value or a comment (`<`, `[`, `/`, `(`,
/// `%`); this rejects look-alikes such as `object`.
///
/// Returns `(obj_num, gen_num, byte_after_keyword)` on success, and
/// `None` when the pattern does not match, when a number does not fit
/// its type (`u32` / `u16`), or when `pos` lies outside `data`.
fn match_obj_header(data: &[u8], pos: usize) -> Option<(u32, u16, usize)> {
    /// PDF white-space bytes that do not end a line.
    fn is_inline_ws(byte: u8) -> bool {
        matches!(byte, b' ' | b'\t' | 0x00 | 0x0C)
    }

    fn is_digit(byte: u8) -> bool {
        byte.is_ascii_digit()
    }

    /// First index at or after `at` whose byte fails `pred` (or the end
    /// of `data`).  Returns `at` unchanged when `at` is out of range.
    fn skip_while(data: &[u8], mut at: usize, pred: fn(u8) -> bool) -> usize {
        while at < data.len() && pred(data[at]) {
            at += 1;
        }
        at
    }

    // First number: object number.
    let num_start = skip_while(data, pos, is_inline_ws);
    let num_end = skip_while(data, num_start, is_digit);
    if num_end == num_start {
        return None;
    }
    let obj_num: u32 = std::str::from_utf8(data.get(num_start..num_end)?)
        .ok()?
        .parse()
        .ok()?;

    // At least one separator, then the generation number.  Reaching end
    // of input here leaves the positions equal and is rejected too.
    let gen_start = skip_while(data, num_end, is_inline_ws);
    if gen_start == num_end {
        return None;
    }
    let gen_end = skip_while(data, gen_start, is_digit);
    if gen_end == gen_start {
        return None;
    }
    let gen_num: u16 = std::str::from_utf8(data.get(gen_start..gen_end)?)
        .ok()?
        .parse()
        .ok()?;

    // At least one separator, then the keyword itself.
    let kw_start = skip_while(data, gen_end, is_inline_ws);
    if kw_start == gen_end || !data[kw_start..].starts_with(b"obj") {
        return None;
    }
    let after = kw_start + 3;

    // `obj` must end the token: accept EOF, white-space, or a delimiter
    // that may legally begin what follows.
    match data.get(after) {
        None => {}
        Some(&next)
            if is_inline_ws(next)
                || matches!(next, b'\n' | b'\r' | b'<' | b'[' | b'/' | b'(' | b'%') => {}
        Some(_) => return None, // e.g. "object" — not the keyword we want.
    }

    Some((obj_num, gen_num, after))
}
191
192/// Scan backward for the `trailer` keyword and parse the dictionary
193/// that follows it.
194fn find_trailer_dict(data: &[u8]) -> Result<PdfDict> {
195    let needle = b"trailer";
196    // Search the last 4 KiB (covers most files, even with multiple
197    // incremental updates we only need the *last* trailer).
198    let search_len = data.len().min(4096);
199    let search_start = data.len() - search_len;
200
201    for i in (search_start..data.len().saturating_sub(needle.len())).rev() {
202        if &data[i..i + needle.len()] == needle {
203            // Skip "trailer" + whitespace, then parse the dict.
204            let after = i + needle.len();
205            let mut tokenizer = Tokenizer::new_at(data, after);
206            if let Ok(obj) = object::parse_object(&mut tokenizer) {
207                if let PdfObject::Dict(d) = obj {
208                    return Ok(d);
209                }
210            }
211        }
212    }
213
214    Err(JustPdfError::TrailerNotFound)
215}
216
217/// When no explicit trailer can be found, build a minimal one by
218/// locating the catalog object (an object whose dictionary contains
219/// `/Type /Catalog`).
220fn synthesise_trailer(
221    data: &[u8],
222    entries: &HashMap<u32, (usize, u16)>,
223) -> Result<PdfDict> {
224    let mut root_ref: Option<IndirectRef> = None;
225
226    for (&obj_num, &(offset, gen_num)) in entries {
227        if let Some(dict) = try_parse_dict_at(data, offset) {
228            if dict.get_name(b"Type") == Some(b"Catalog") {
229                root_ref = Some(IndirectRef { obj_num, gen_num });
230                break;
231            }
232        }
233    }
234
235    let root = root_ref.ok_or(JustPdfError::TrailerNotFound)?;
236
237    let max_obj = entries.keys().copied().max().unwrap_or(0);
238
239    let mut trailer = PdfDict::new();
240    trailer.insert(
241        b"Root".to_vec(),
242        PdfObject::Reference(root),
243    );
244    trailer.insert(
245        b"Size".to_vec(),
246        PdfObject::Integer((max_obj + 1) as i64),
247    );
248
249    Ok(trailer)
250}
251
252/// Attempt to parse the indirect object at `offset` and return its
253/// dictionary if the top-level value is a `Dict` (ignoring streams
254/// for simplicity).
255fn try_parse_dict_at(data: &[u8], offset: usize) -> Option<PdfDict> {
256    let mut tokenizer = Tokenizer::new_at(data, offset);
257    let (_iref, obj) = object::parse_indirect_object(&mut tokenizer).ok()?;
258    match obj {
259        PdfObject::Dict(d) => Some(d),
260        PdfObject::Stream { dict, .. } => Some(dict),
261        _ => None,
262    }
263}
264
265// ---------------------------------------------------------------------------
266// Integration: PdfDocument::from_bytes_with_repair
267// ---------------------------------------------------------------------------
268
269impl PdfDocument {
270    /// Parse a PDF from bytes, falling back to xref repair if the
271    /// normal cross-reference table or trailer is damaged.
272    ///
273    /// This tries [`PdfDocument::from_bytes`] first.  If that fails
274    /// the file is scanned for object definitions and a synthetic xref
275    /// is built via [`rebuild_xref`].
276    pub fn from_bytes_with_repair(data: Vec<u8>) -> Result<Self> {
277        // Try the normal path first.
278        match Self::from_bytes(data.clone()) {
279            Ok(doc) => return Ok(doc),
280            Err(_normal_err) => {}
281        }
282
283        // Repair path: rebuild xref by scanning.
284        let xref = rebuild_xref(&data)?;
285        let version = parse_version_tolerant(&data);
286
287        Ok(Self::from_raw_parts(data, xref, version))
288    }
289}
290
/// Parse PDF version, returning (1, 4) as a safe default when the
/// header is missing or corrupt.
///
/// The first 1024 bytes are searched for `%PDF-<digit>.<digit>`; the
/// first complete match wins and its two digits are returned as
/// `(major, minor)`.  Every start position at which the full
/// eight-byte header fits inside the searched prefix is checked,
/// including the last one, so a buffer consisting of nothing but the
/// header is recognised.
fn parse_version_tolerant(data: &[u8]) -> (u8, u8) {
    const MARKER: &[u8] = b"%PDF-";
    // "%PDF-" + major digit + '.' + minor digit.
    const HEADER_LEN: usize = 8;
    const SCAN_LIMIT: usize = 1024;

    let head = &data[..data.len().min(SCAN_LIMIT)];

    // `windows` yields nothing for inputs shorter than the header, and
    // otherwise covers every valid start position.
    head.windows(HEADER_LEN)
        .find_map(|window| {
            let (major, dot, minor) = (window[5], window[6], window[7]);
            if window.starts_with(MARKER)
                && major.is_ascii_digit()
                && dot == b'.'
                && minor.is_ascii_digit()
            {
                Some((major - b'0', minor - b'0'))
            } else {
                None
            }
        })
        .unwrap_or((1, 4)) // safe default
}
308
309// ---------------------------------------------------------------------------
310// Tests
311// ---------------------------------------------------------------------------
312
#[cfg(test)]
mod tests {
    use super::*;

    /// Assemble a three-object PDF (catalog, page tree, one page) with a
    /// classic xref table, trailer and `startxref` footer.
    fn build_minimal_pdf() -> Vec<u8> {
        const BODIES: [&[u8]; 3] = [
            b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
            b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n",
            b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>\nendobj\n",
        ];

        let mut pdf = b"%PDF-1.4\n".to_vec();

        // Remember where each object starts so the xref table can point
        // at it.
        let mut offsets = Vec::with_capacity(BODIES.len());
        for body in BODIES.iter() {
            offsets.push(pdf.len());
            pdf.extend_from_slice(body);
        }

        let xref_offset = pdf.len();
        pdf.extend_from_slice(b"xref\n");
        pdf.extend_from_slice(b"0 4\n");
        pdf.extend_from_slice(b"0000000000 65535 f \r\n");
        for offset in &offsets {
            pdf.extend_from_slice(format!("{:010} 00000 n \r\n", offset).as_bytes());
        }

        pdf.extend_from_slice(b"trailer\n<< /Size 4 /Root 1 0 R >>\n");
        pdf.extend_from_slice(format!("startxref\n{xref_offset}\n%%EOF\n").as_bytes());

        pdf
    }

    /// Drop everything from the first `xref` keyword onward, leaving a
    /// body with neither cross-reference table nor trailer.  The buffer
    /// is returned untouched when the keyword is absent.
    fn without_xref_section(mut pdf: Vec<u8>) -> Vec<u8> {
        if let Some(cut) = pdf.windows(4).position(|w| w == b"xref") {
            pdf.truncate(cut);
        }
        pdf
    }

    // ------------------------------------------------------------------
    // rebuild_xref on an intact file must agree with the regular loader
    // ------------------------------------------------------------------

    #[test]
    fn test_rebuild_xref_matches_normal() {
        let data = build_minimal_pdf();

        let normal_xref = crate::xref::load_xref(&data).unwrap();
        let repaired_xref = rebuild_xref(&data).unwrap();

        // Objects 1, 2 and 3 must be in use at identical offsets and
        // generations in both tables.
        for obj_num in 1u32..=3 {
            let from_loader = normal_xref.get(obj_num).unwrap();
            let from_repair = repaired_xref.get(obj_num).unwrap();
            match (from_loader, from_repair) {
                (
                    XrefEntry::InUse {
                        offset: loader_offset,
                        gen_num: loader_gen,
                    },
                    XrefEntry::InUse {
                        offset: repair_offset,
                        gen_num: repair_gen,
                    },
                ) => {
                    assert_eq!(
                        loader_offset, repair_offset,
                        "offset mismatch for obj {obj_num}"
                    );
                    assert_eq!(loader_gen, repair_gen, "gen mismatch for obj {obj_num}");
                }
                _ => panic!("unexpected entry type for obj {obj_num}"),
            }
        }

        // The trailer must carry /Root.
        assert!(repaired_xref.trailer.get_ref(b"Root").is_some());
    }

    // ------------------------------------------------------------------
    // No trailer left in the file: it has to be rebuilt from the catalog
    // ------------------------------------------------------------------

    #[test]
    fn test_rebuild_xref_truncated_trailer() {
        let data = without_xref_section(build_minimal_pdf());

        // The regular loader has nothing to work with.
        assert!(PdfDocument::from_bytes(data.clone()).is_err());

        // The repair scan still sees all three objects …
        let repaired = rebuild_xref(&data).unwrap();
        for obj_num in 1u32..=3 {
            assert!(repaired.get(obj_num).is_some());
        }

        // … and the synthetic trailer points at the catalog.
        let root = repaired.trailer.get_ref(b"Root").expect("/Root missing");
        assert_eq!(root.obj_num, 1);
    }

    // ------------------------------------------------------------------
    // Catalog detection among the scanned objects
    // ------------------------------------------------------------------

    #[test]
    fn test_detect_catalog_object() {
        // A bare body: header plus two objects, no xref, no trailer.
        let mut data = b"%PDF-1.7\n".to_vec();
        data.extend_from_slice(b"5 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n");
        data.extend_from_slice(b"10 0 obj\n<< /Type /Catalog /Pages 5 0 R >>\nendobj\n");

        let repaired = rebuild_xref(&data).unwrap();

        // Both objects are discovered.
        assert!(repaired.get(5).is_some());
        assert!(repaired.get(10).is_some());

        // /Root refers to object 10, the catalog.
        let root = repaired.trailer.get_ref(b"Root").unwrap();
        assert_eq!(root.obj_num, 10);
    }

    // ------------------------------------------------------------------
    // scan_object_headers edge cases
    // ------------------------------------------------------------------

    #[test]
    fn test_scan_ignores_non_obj_keywords() {
        // `object` is not the `obj` keyword and must be skipped.
        let data = b"%PDF-1.4\n1 0 object\n2 0 obj\n<< >>\nendobj\n";
        let entries = scan_object_headers(data);
        assert!(!entries.contains_key(&1));
        assert!(entries.contains_key(&2));
    }

    #[test]
    fn test_scan_generation_number() {
        let data = b"%PDF-1.4\n7 3 obj\n<< /Foo /Bar >>\nendobj\n";
        let entries = scan_object_headers(data);
        let (_, gen_val) = entries.get(&7).expect("object 7 not found");
        assert_eq!(*gen_val, 3);
    }

    // ------------------------------------------------------------------
    // repair_document (convenience wrapper)
    // ------------------------------------------------------------------

    #[test]
    fn test_repair_document_valid_pdf() {
        let doc = repair_document(build_minimal_pdf()).unwrap();
        assert_eq!(doc.version, (1, 4));
        assert!(doc.object_count() > 0);
    }

    #[test]
    fn test_repair_document_damaged_pdf() {
        // Cutting the file at `xref` removes table and trailer alike.
        let data = without_xref_section(build_minimal_pdf());

        let doc = repair_document(data).unwrap();
        assert!(doc.object_count() > 0);
    }

    // ------------------------------------------------------------------
    // from_bytes_with_repair
    // ------------------------------------------------------------------

    #[test]
    fn test_from_bytes_with_repair_valid() {
        let doc = PdfDocument::from_bytes_with_repair(build_minimal_pdf()).unwrap();
        assert_eq!(doc.version, (1, 4));
    }

    #[test]
    fn test_from_bytes_with_repair_damaged() {
        let data = without_xref_section(build_minimal_pdf());
        let doc = PdfDocument::from_bytes_with_repair(data).unwrap();
        assert!(doc.object_count() >= 3);
    }
}