1use std::collections::HashMap;
6
7use crate::error::{JustPdfError, Result};
8use crate::object::{self, IndirectRef, PdfDict, PdfObject};
9use crate::parser::PdfDocument;
10use crate::tokenizer::Tokenizer;
11use crate::xref::{Xref, XrefEntry};
12
13pub fn rebuild_xref(data: &[u8]) -> Result<Xref> {
21 let entries = scan_object_headers(data);
22
23 if entries.is_empty() {
24 return Err(JustPdfError::InvalidXref {
25 offset: 0,
26 detail: "no objects found during repair scan".into(),
27 });
28 }
29
30 let trailer = find_trailer_dict(data).or_else(|_| synthesise_trailer(data, &entries))?;
32
33 let max_obj = entries.keys().copied().max().unwrap_or(0);
34
35 let mut xref = Xref::new();
36 for (&obj_num, &(offset, gen_num)) in &entries {
37 xref.entries.insert(
38 obj_num,
39 XrefEntry::InUse {
40 offset: offset as u64,
41 gen_num,
42 },
43 );
44 }
45
46 let mut trailer = trailer;
48 if trailer.get_i64(b"Size").is_none() {
49 trailer.insert(
50 b"Size".to_vec(),
51 PdfObject::Integer((max_obj + 1) as i64),
52 );
53 }
54
55 xref.trailer = trailer;
56 Ok(xref)
57}
58
59pub fn repair_document(data: Vec<u8>) -> Result<PdfDocument> {
61 match PdfDocument::from_bytes(data.clone()) {
63 Ok(doc) => return Ok(doc),
64 Err(_) => {}
65 }
66
67 PdfDocument::from_bytes_with_repair(data)
69}
70
71fn scan_object_headers(data: &[u8]) -> HashMap<u32, (usize, u16)> {
81 let mut map: HashMap<u32, (usize, u16)> = HashMap::new();
82 let len = data.len();
83 let mut i = 0;
84
85 while i < len {
86 if i != 0 && data[i - 1] != b'\n' && data[i - 1] != b'\r' {
89 while i < len && data[i] != b'\n' && data[i] != b'\r' {
91 i += 1;
92 }
93 while i < len && (data[i] == b'\n' || data[i] == b'\r') {
95 i += 1;
96 }
97 continue;
98 }
99
100 if let Some((obj_num, gen_num, after)) = match_obj_header(data, i) {
102 map.insert(obj_num, (i, gen_num));
103 i = after;
104 } else {
105 while i < len && data[i] != b'\n' && data[i] != b'\r' {
107 i += 1;
108 }
109 while i < len && (data[i] == b'\n' || data[i] == b'\r') {
110 i += 1;
111 }
112 }
113 }
114
115 map
116}
117
/// Tries to recognise an indirect-object header `<num> <gen> obj` at `pos`.
///
/// Leading spaces and tabs are skipped. The object number must fit in a `u32`
/// and the generation number in a `u16`; the three tokens must be separated by
/// at least one space or tab. The `obj` keyword has to be a complete token: it
/// must be followed by end of data, a PDF white-space byte, or a delimiter
/// that can open the object's value or a comment (`<`, `[`, `/`, `(`, `%`).
/// This keeps look-alikes such as `1 0 object` from matching while still
/// accepting `5 0 obj(text)`.
///
/// Returns `Some((obj_num, gen_num, after))`, where `after` is the index of
/// the byte just past `obj`, or `None` when no header starts at `pos`.
fn match_obj_header(data: &[u8], pos: usize) -> Option<(u32, u16, usize)> {
    // Index of the first byte at or after `from` that is not a space or tab.
    fn skip_blanks(data: &[u8], from: usize) -> usize {
        let mut i = from;
        while matches!(data.get(i), Some(b' ' | b'\t')) {
            i += 1;
        }
        i
    }

    // Like `skip_blanks`, but at least one blank must be present.
    fn skip_separator(data: &[u8], from: usize) -> Option<usize> {
        let end = skip_blanks(data, from);
        if end > from {
            Some(end)
        } else {
            None
        }
    }

    // Parses the run of ASCII digits starting at `from`; yields the value and
    // the index just past the run. Fails on an empty run or on overflow.
    fn take_number<T: std::str::FromStr>(data: &[u8], from: usize) -> Option<(T, usize)> {
        let run = data
            .get(from..)?
            .iter()
            .take_while(|byte| byte.is_ascii_digit())
            .count();
        if run == 0 {
            return None;
        }
        let end = from + run;
        let text = std::str::from_utf8(&data[from..end]).ok()?;
        Some((text.parse().ok()?, end))
    }

    let start = skip_blanks(data, pos);
    let (obj_num, i) = take_number::<u32>(data, start)?;
    let i = skip_separator(data, i)?;
    let (gen_num, i) = take_number::<u16>(data, i)?;
    let i = skip_separator(data, i)?;

    if !data.get(i..)?.starts_with(b"obj") {
        return None;
    }
    let after = i + 3;

    match data.get(after) {
        // `obj` is the last thing in the buffer.
        None => Some((obj_num, gen_num, after)),
        // PDF white-space, or a delimiter that starts a value or a comment.
        Some(
            b'\0' | b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'<' | b'[' | b'/' | b'(' | b'%',
        ) => Some((obj_num, gen_num, after)),
        // Anything else means `obj` is only the prefix of a longer token.
        Some(_) => None,
    }
}
191
192fn find_trailer_dict(data: &[u8]) -> Result<PdfDict> {
195 let needle = b"trailer";
196 let search_len = data.len().min(4096);
199 let search_start = data.len() - search_len;
200
201 for i in (search_start..data.len().saturating_sub(needle.len())).rev() {
202 if &data[i..i + needle.len()] == needle {
203 let after = i + needle.len();
205 let mut tokenizer = Tokenizer::new_at(data, after);
206 if let Ok(obj) = object::parse_object(&mut tokenizer) {
207 if let PdfObject::Dict(d) = obj {
208 return Ok(d);
209 }
210 }
211 }
212 }
213
214 Err(JustPdfError::TrailerNotFound)
215}
216
217fn synthesise_trailer(
221 data: &[u8],
222 entries: &HashMap<u32, (usize, u16)>,
223) -> Result<PdfDict> {
224 let mut root_ref: Option<IndirectRef> = None;
225
226 for (&obj_num, &(offset, gen_num)) in entries {
227 if let Some(dict) = try_parse_dict_at(data, offset) {
228 if dict.get_name(b"Type") == Some(b"Catalog") {
229 root_ref = Some(IndirectRef { obj_num, gen_num });
230 break;
231 }
232 }
233 }
234
235 let root = root_ref.ok_or(JustPdfError::TrailerNotFound)?;
236
237 let max_obj = entries.keys().copied().max().unwrap_or(0);
238
239 let mut trailer = PdfDict::new();
240 trailer.insert(
241 b"Root".to_vec(),
242 PdfObject::Reference(root),
243 );
244 trailer.insert(
245 b"Size".to_vec(),
246 PdfObject::Integer((max_obj + 1) as i64),
247 );
248
249 Ok(trailer)
250}
251
252fn try_parse_dict_at(data: &[u8], offset: usize) -> Option<PdfDict> {
256 let mut tokenizer = Tokenizer::new_at(data, offset);
257 let (_iref, obj) = object::parse_indirect_object(&mut tokenizer).ok()?;
258 match obj {
259 PdfObject::Dict(d) => Some(d),
260 PdfObject::Stream { dict, .. } => Some(dict),
261 _ => None,
262 }
263}
264
265impl PdfDocument {
270 pub fn from_bytes_with_repair(data: Vec<u8>) -> Result<Self> {
277 match Self::from_bytes(data.clone()) {
279 Ok(doc) => return Ok(doc),
280 Err(_normal_err) => {}
281 }
282
283 let xref = rebuild_xref(&data)?;
285 let version = parse_version_tolerant(&data);
286
287 Ok(Self::from_raw_parts(data, xref, version))
288 }
289}
290
/// Extracts the `(major, minor)` version from a `%PDF-M.m` header.
///
/// The header may start anywhere as long as it lies entirely within the first
/// 1024 bytes of `data`; the first occurrence with a single-digit major and
/// minor version separated by `.` wins. Falls back to `(1, 4)` when no such
/// header is found.
fn parse_version_tolerant(data: &[u8]) -> (u8, u8) {
    const MARKER: &[u8] = b"%PDF-";
    // Marker plus "M.m".
    const HEADER_LEN: usize = MARKER.len() + 3;
    const SEARCH_LIMIT: usize = 1024;
    const DEFAULT_VERSION: (u8, u8) = (1, 4);

    let head = &data[..data.len().min(SEARCH_LIMIT)];

    // `windows` visits every start position for which a full header fits,
    // including one that ends exactly at the end of the searched region.
    head.windows(HEADER_LEN)
        .find_map(|window| {
            let rest = window.strip_prefix(MARKER)?;
            match *rest {
                [major @ b'0'..=b'9', b'.', minor @ b'0'..=b'9'] => {
                    Some((major - b'0', minor - b'0'))
                }
                _ => None,
            }
        })
        .unwrap_or(DEFAULT_VERSION)
}
308
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a small well-formed PDF: a catalog, a page tree and one page,
    /// followed by a classic xref table, trailer and `startxref` footer.
    fn build_minimal_pdf() -> Vec<u8> {
        let objects: [&[u8]; 3] = [
            b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
            b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n",
            b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>\nendobj\n",
        ];

        let mut pdf = b"%PDF-1.4\n".to_vec();

        // Remember where each object starts so the xref table can point at it.
        let mut offsets = Vec::with_capacity(objects.len());
        for body in objects.iter() {
            offsets.push(pdf.len());
            pdf.extend_from_slice(body);
        }

        let xref_offset = pdf.len();
        pdf.extend_from_slice(b"xref\n");
        pdf.extend_from_slice(b"0 4\n");
        pdf.extend_from_slice(b"0000000000 65535 f \r\n");
        for offset in &offsets {
            pdf.extend_from_slice(format!("{:010} 00000 n \r\n", offset).as_bytes());
        }

        pdf.extend_from_slice(b"trailer\n<< /Size 4 /Root 1 0 R >>\n");
        pdf.extend_from_slice(format!("startxref\n{xref_offset}\n%%EOF\n").as_bytes());

        pdf
    }

    /// Returns the minimal PDF cut off right before its `xref` keyword, i.e.
    /// without xref table, trailer and footer.
    ///
    /// Panics if the fixture has no `xref` keyword, so a "damaged" test can
    /// never silently run against an undamaged file.
    fn build_damaged_pdf() -> Vec<u8> {
        let mut pdf = build_minimal_pdf();
        let cut = pdf
            .windows(4)
            .position(|w| w == b"xref")
            .expect("fixture must contain an xref table");
        pdf.truncate(cut);
        pdf
    }

    /// The rebuilt xref must agree with the regularly loaded one on a valid file.
    #[test]
    fn test_rebuild_xref_matches_normal() {
        let data = build_minimal_pdf();

        let normal_xref = crate::xref::load_xref(&data).unwrap();
        let repaired_xref = rebuild_xref(&data).unwrap();

        for obj_num in 1u32..=3 {
            let normal_entry = normal_xref.get(obj_num).unwrap();
            let repair_entry = repaired_xref.get(obj_num).unwrap();
            match (normal_entry, repair_entry) {
                (
                    XrefEntry::InUse {
                        offset: o1,
                        gen_num: g1,
                    },
                    XrefEntry::InUse {
                        offset: o2,
                        gen_num: g2,
                    },
                ) => {
                    assert_eq!(o1, o2, "offset mismatch for obj {obj_num}");
                    assert_eq!(g1, g2, "gen mismatch for obj {obj_num}");
                }
                _ => panic!("unexpected entry type for obj {obj_num}"),
            }
        }

        assert!(repaired_xref.trailer.get_ref(b"Root").is_some());
    }

    /// Without xref and trailer, the objects are still found and `/Root` is
    /// synthesised from the catalog.
    #[test]
    fn test_rebuild_xref_truncated_trailer() {
        let data = build_damaged_pdf();

        assert!(PdfDocument::from_bytes(data.clone()).is_err());

        let repaired = rebuild_xref(&data).unwrap();
        assert!(repaired.get(1).is_some());
        assert!(repaired.get(2).is_some());
        assert!(repaired.get(3).is_some());

        let root = repaired.trailer.get_ref(b"Root").expect("/Root missing");
        assert_eq!(root.obj_num, 1);
    }

    /// The catalog is detected regardless of its object number or position.
    #[test]
    fn test_detect_catalog_object() {
        let mut data = Vec::new();
        data.extend_from_slice(b"%PDF-1.7\n");
        data.extend_from_slice(b"5 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n");
        data.extend_from_slice(b"10 0 obj\n<< /Type /Catalog /Pages 5 0 R >>\nendobj\n");

        let repaired = rebuild_xref(&data).unwrap();

        assert!(repaired.get(5).is_some());
        assert!(repaired.get(10).is_some());

        let root = repaired.trailer.get_ref(b"Root").unwrap();
        assert_eq!(root.obj_num, 10);
    }

    /// A keyword that merely starts with `obj` is not an object header.
    #[test]
    fn test_scan_ignores_non_obj_keywords() {
        let data = b"%PDF-1.4\n1 0 object\n2 0 obj\n<< >>\nendobj\n";
        let entries = scan_object_headers(data);
        assert!(!entries.contains_key(&1));
        assert!(entries.contains_key(&2));
    }

    /// The generation number from the header is recorded.
    #[test]
    fn test_scan_generation_number() {
        let data = b"%PDF-1.4\n7 3 obj\n<< /Foo /Bar >>\nendobj\n";
        let entries = scan_object_headers(data);
        let (_, gen_val) = entries.get(&7).expect("object 7 not found");
        assert_eq!(*gen_val, 3);
    }

    /// A valid file opens through `repair_document` unchanged.
    #[test]
    fn test_repair_document_valid_pdf() {
        let data = build_minimal_pdf();
        let doc = repair_document(data).unwrap();
        assert_eq!(doc.version, (1, 4));
        assert!(doc.object_count() > 0);
    }

    /// A file without xref and trailer opens through `repair_document`.
    #[test]
    fn test_repair_document_damaged_pdf() {
        let doc = repair_document(build_damaged_pdf()).unwrap();
        assert!(doc.object_count() > 0);
    }

    /// A valid file opens through `from_bytes_with_repair`.
    #[test]
    fn test_from_bytes_with_repair_valid() {
        let data = build_minimal_pdf();
        let doc = PdfDocument::from_bytes_with_repair(data).unwrap();
        assert_eq!(doc.version, (1, 4));
    }

    /// A file without xref and trailer opens through `from_bytes_with_repair`
    /// with all three objects available.
    #[test]
    fn test_from_bytes_with_repair_damaged() {
        let doc = PdfDocument::from_bytes_with_repair(build_damaged_pdf()).unwrap();
        assert!(doc.object_count() >= 3);
    }
}