Skip to main content

rpdfium_parser/
xref.rs

1// Derived from PDFium's cpdf_cross_ref_table.cpp / cpdf_parser.cpp
2// Original: Copyright 2014 The PDFium Authors
3// Licensed under BSD-3-Clause / Apache-2.0
4// See pdfium-upstream/LICENSE for the original license.
5
6//! Cross-reference table parsing (traditional `xref` format).
7//!
8//! Parses the standard PDF cross-reference table format with subsection headers
9//! (e.g., `xref\n0 6\n...`). Each entry is exactly 20 bytes.
10
11use rpdfium_core::error::{ParseError, PdfError};
12use rpdfium_core::fx_system::MAX_OBJECT_NUMBER;
13
14use crate::object::ObjectId;
15use crate::tokenizer::is_whitespace;
16
/// The type of a cross-reference entry.
///
/// The traditional-table parser in this file (`parse_xref_entry`) only ever
/// produces [`XrefEntryType::InUse`] (marker `n`) and [`XrefEntryType::Free`]
/// (marker `f`); `InStream` is not constructed anywhere in this file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum XrefEntryType {
    /// In-use object at a direct byte offset in the file.
    InUse { offset: u64 },
    /// Object stored inside an object stream (ObjStm).
    InStream { stream_id: ObjectId, index: u32 },
    /// A free (deleted) object.
    ///
    /// The first 10-digit field of a free table entry is parsed but not
    /// retained, so no information from it is available here.
    Free,
}
27
/// A single cross-reference entry.
#[derive(Debug, Clone)]
pub struct XrefEntry {
    /// Object number and generation this entry describes.
    pub id: ObjectId,
    /// Where (or whether) the object can be found.
    pub entry_type: XrefEntryType,
}
34
/// A section of cross-reference entries from a single xref table or stream.
///
/// `parse_xref_table` flattens every subsection of one `xref` table into a
/// single `XrefSection`, and `rebuild_xref` produces one section holding every
/// object marker it found.
#[derive(Debug, Clone)]
pub struct XrefSection {
    /// Entries in the order they were parsed or discovered. The same object
    /// number may appear more than once; nothing here de-duplicates.
    pub entries: Vec<XrefEntry>,
}
40
/// The complete cross-reference table, potentially spanning multiple sections.
#[derive(Debug, Clone)]
pub struct XrefTable {
    /// Sections in the order they were pushed.
    pub sections: Vec<XrefSection>,
    /// The byte offset of the last (newest) xref start in the file.
    /// Used by incremental saves to set the `/Prev` trailer key.
    ///
    /// `XrefTable::new` initialises this to 0; callers are expected to
    /// assign the real offset themselves.
    pub start_offset: u64,
}
49
50impl XrefTable {
51    pub fn new() -> Self {
52        Self {
53            sections: Vec::new(),
54            start_offset: 0,
55        }
56    }
57
58    /// Add a section (newest sections should be added first).
59    pub fn push(&mut self, section: XrefSection) {
60        self.sections.push(section);
61    }
62}
63
64impl Default for XrefTable {
65    fn default() -> Self {
66        Self::new()
67    }
68}
69
70/// Parse a traditional cross-reference table starting at `offset`.
71///
72/// The table begins with the `xref` keyword followed by one or more subsections.
73/// Each subsection has a header `start_id count` and then `count` entries of
74/// exactly 20 bytes each (`nnnnnnnnnn ggggg n \r\n` or `nnnnnnnnnn ggggg f \r\n`).
75///
76/// Returns the parsed `XrefSection` and the byte offset after the table
77/// (positioned just before the `trailer` keyword).
78pub fn parse_xref_table(source: &[u8], offset: u64) -> Result<(XrefSection, u64), PdfError> {
79    let mut pos = offset as usize;
80
81    // Skip whitespace before "xref"
82    while pos < source.len() && is_whitespace(source[pos]) {
83        pos += 1;
84    }
85
86    // Expect "xref" keyword
87    if pos + 4 > source.len() || &source[pos..pos + 4] != b"xref" {
88        return Err(PdfError::InvalidXref);
89    }
90    pos += 4;
91
92    // Skip whitespace after "xref"
93    skip_eol(source, &mut pos);
94
95    let mut entries = Vec::new();
96
97    // Parse subsections until we hit "trailer" or EOF
98    loop {
99        // Skip whitespace
100        while pos < source.len() && is_whitespace(source[pos]) {
101            pos += 1;
102        }
103
104        if pos >= source.len() {
105            break;
106        }
107
108        // Check for "trailer" keyword
109        if pos + 7 <= source.len() && &source[pos..pos + 7] == b"trailer" {
110            break;
111        }
112
113        // Parse subsection header: start_id count
114        let (start_id, count) = parse_subsection_header(source, &mut pos)?;
115
116        // Validate object numbers
117        if start_id.saturating_add(count) > MAX_OBJECT_NUMBER as u64 {
118            return Err(PdfError::InvalidXref);
119        }
120
121        // Parse entries
122        for i in 0..count {
123            skip_eol_minimal(source, &mut pos);
124
125            let entry_bytes = if pos + 20 <= source.len() {
126                &source[pos..pos + 20]
127            } else {
128                return Err(PdfError::Parse(ParseError::InvalidXrefEntry {
129                    offset: pos as u64,
130                }));
131            };
132
133            let entry = parse_xref_entry(entry_bytes, start_id + i, pos as u64)?;
134            pos += 20;
135
136            entries.push(entry);
137        }
138    }
139
140    Ok((XrefSection { entries }, pos as u64))
141}
142
143/// Parse a subsection header: `start_id count\n`.
144fn parse_subsection_header(source: &[u8], pos: &mut usize) -> Result<(u64, u64), PdfError> {
145    // Read start_id
146    let start_id = read_u64(source, pos)?;
147
148    // Skip whitespace (not newlines)
149    while *pos < source.len() && (source[*pos] == b' ' || source[*pos] == b'\t') {
150        *pos += 1;
151    }
152
153    // Read count
154    let count = read_u64(source, pos)?;
155
156    // Skip to end of line
157    skip_eol(source, pos);
158
159    Ok((start_id, count))
160}
161
162/// Read an unsigned integer from the source at the given position.
163fn read_u64(source: &[u8], pos: &mut usize) -> Result<u64, PdfError> {
164    let start = *pos;
165
166    while *pos < source.len() && source[*pos] >= b'0' && source[*pos] <= b'9' {
167        *pos += 1;
168    }
169
170    if *pos == start {
171        return Err(PdfError::InvalidXref);
172    }
173
174    let s = std::str::from_utf8(&source[start..*pos]).map_err(|_| PdfError::InvalidXref)?;
175    s.parse::<u64>().map_err(|_| PdfError::InvalidXref)
176}
177
178/// Parse a single 20-byte xref entry.
179///
180/// Format: `nnnnnnnnnn ggggg n \r\n` or `nnnnnnnnnn ggggg f \r\n`
181/// (10 digits offset, space, 5 digits generation, space, 'n' or 'f', space, EOL)
182fn parse_xref_entry(entry: &[u8], object_number: u64, offset: u64) -> Result<XrefEntry, PdfError> {
183    // Entry must be at least 18 bytes (10 + 1 + 5 + 1 + 1)
184    if entry.len() < 18 {
185        return Err(PdfError::Parse(ParseError::InvalidXrefEntry { offset }));
186    }
187
188    // Parse offset (10 digits)
189    let offset_str = std::str::from_utf8(&entry[0..10])
190        .map_err(|_| PdfError::Parse(ParseError::InvalidXrefEntry { offset }))?;
191    let entry_offset: u64 = offset_str
192        .trim()
193        .parse()
194        .map_err(|_| PdfError::Parse(ParseError::InvalidXrefEntry { offset }))?;
195
196    // Parse generation number (5 digits after a space)
197    let gen_str = std::str::from_utf8(&entry[11..16])
198        .map_err(|_| PdfError::Parse(ParseError::InvalidXrefEntry { offset }))?;
199    let generation: u16 = gen_str
200        .trim()
201        .parse()
202        .map_err(|_| PdfError::Parse(ParseError::InvalidXrefEntry { offset }))?;
203
204    // Parse type marker ('n' for in-use, 'f' for free)
205    let type_marker = entry[17];
206
207    let id = ObjectId::new(object_number as u32, generation);
208
209    let entry_type = match type_marker {
210        b'n' => XrefEntryType::InUse {
211            offset: entry_offset,
212        },
213        b'f' => XrefEntryType::Free,
214        _ => {
215            return Err(PdfError::Parse(ParseError::InvalidXrefEntry { offset }));
216        }
217    };
218
219    Ok(XrefEntry { id, entry_type })
220}
221
222/// Skip end-of-line characters (CR, LF, CR+LF).
223fn skip_eol(source: &[u8], pos: &mut usize) {
224    while *pos < source.len() && is_whitespace(source[*pos]) {
225        *pos += 1;
226    }
227}
228
/// Advance `pos` past any run of CR / LF bytes, and nothing else.
///
/// Used between xref entries so that spaces, which are part of the fixed-width
/// entry layout, are never consumed by accident.
fn skip_eol_minimal(source: &[u8], pos: &mut usize) {
    while let Some(&byte) = source.get(*pos) {
        if byte != b'\r' && byte != b'\n' {
            break;
        }
        *pos += 1;
    }
}
236
237/// Rebuild a cross-reference table by scanning the file for `N G obj` markers.
238///
239/// This is a fallback for Lenient mode when standard XRef parsing fails
240/// (e.g., corrupt or missing `startxref`). It iterates through the source
241/// bytes looking for patterns like `\d+ \d+ obj` and records the offset of
242/// each indirect object definition.
243///
244/// For the trailer, it scans backwards for a `trailer` keyword; if none is
245/// found, it looks for the object with `/Type /Catalog` to reconstruct a
246/// minimal [`crate::trailer::TrailerInfo`].
247pub fn rebuild_xref(source: &[u8]) -> Result<(XrefTable, crate::trailer::TrailerInfo), PdfError> {
248    let mut entries = Vec::new();
249    let mut pos = 0;
250
251    // Scan for `N G obj` patterns iteratively
252    while pos < source.len() {
253        // Skip to potential digit start
254        if !source[pos].is_ascii_digit() {
255            pos += 1;
256            continue;
257        }
258
259        // Try to match: <digits> <space> <digits> <space> "obj"
260        let start = pos;
261        if let Some((number, generation, obj_keyword_end)) = try_parse_obj_marker(source, pos) {
262            if number <= MAX_OBJECT_NUMBER {
263                // Verify the byte before the object number is whitespace or start of file
264                let valid_boundary = start == 0 || is_whitespace(source[start - 1]);
265                if valid_boundary {
266                    entries.push(XrefEntry {
267                        id: ObjectId::new(number, generation),
268                        entry_type: XrefEntryType::InUse {
269                            offset: start as u64,
270                        },
271                    });
272                }
273            }
274            // Advance past "obj" to avoid re-matching
275            pos = obj_keyword_end;
276        } else {
277            pos += 1;
278        }
279
280        // Safety limit: prevent building an enormous table
281        if entries.len() > MAX_OBJECT_NUMBER as usize {
282            break;
283        }
284    }
285
286    if entries.is_empty() {
287        return Err(PdfError::InvalidXref);
288    }
289
290    let section = XrefSection { entries };
291
292    // Try to find trailer info
293    let trailer = rebuild_trailer_info(source, &section)?;
294
295    let mut table = XrefTable::new();
296    table.push(section);
297
298    Ok((table, trailer))
299}
300
/// Try to match an indirect-object header `N G obj` beginning exactly at `pos`.
///
/// The object number must fit in a `u32`, the generation in a `u16`, and the
/// three tokens must be separated by exactly one space each. The keyword must
/// not be followed by an ASCII letter (so `object` does not match).
///
/// Returns `Some((number, generation, end_pos))` on a match, where `end_pos`
/// is the byte position just after `obj`; returns `None` otherwise.
fn try_parse_obj_marker(source: &[u8], pos: usize) -> Option<(u32, u16, usize)> {
    // Length of the ASCII digit run that starts at `from` (0 if out of range).
    let digit_run = |from: usize| -> usize {
        source.get(from..).map_or(0, |tail| {
            tail.iter().take_while(|byte| byte.is_ascii_digit()).count()
        })
    };

    // Object number; something must follow it.
    let number_end = pos + digit_run(pos);
    if number_end == pos || number_end >= source.len() {
        return None;
    }
    let number: u32 = std::str::from_utf8(&source[pos..number_end])
        .ok()?
        .parse()
        .ok()?;

    // Exactly one space, then the generation number.
    if source[number_end] != b' ' {
        return None;
    }
    let generation_start = number_end + 1;
    let generation_end = generation_start + digit_run(generation_start);
    if generation_end == generation_start || generation_end >= source.len() {
        return None;
    }
    let generation: u16 = std::str::from_utf8(&source[generation_start..generation_end])
        .ok()?
        .parse()
        .ok()?;

    // Exactly one space, then the keyword itself.
    if source[generation_end] != b' ' {
        return None;
    }
    let keyword_start = generation_end + 1;
    let has_keyword = source
        .get(keyword_start..)
        .map_or(false, |tail| tail.starts_with(b"obj"));
    if !has_keyword {
        return None;
    }
    let keyword_end = keyword_start + 3;

    // Reject longer words such as "object".
    match source.get(keyword_end) {
        Some(next) if next.is_ascii_alphabetic() => None,
        _ => Some((number, generation, keyword_end)),
    }
}
355
356/// Rebuild a minimal TrailerInfo by scanning for the trailer dictionary or
357/// by finding the Catalog object.
358fn rebuild_trailer_info(
359    source: &[u8],
360    section: &XrefSection,
361) -> Result<crate::trailer::TrailerInfo, PdfError> {
362    // First, try to find a `trailer` keyword and parse the dictionary after it
363    if let Some(info) = try_find_trailer_dict(source) {
364        return Ok(info);
365    }
366
367    // Fallback: scan objects to find the one with /Type /Catalog
368    for entry in &section.entries {
369        if let XrefEntryType::InUse { offset } = &entry.entry_type {
370            let off = *offset as usize;
371            // Look ahead from the object start for /Type /Catalog
372            let search_end = (off + 512).min(source.len());
373            let window = &source[off..search_end];
374            if contains_catalog_marker(window) {
375                let size = section
376                    .entries
377                    .iter()
378                    .map(|e| e.id.number)
379                    .max()
380                    .unwrap_or(0)
381                    + 1;
382                return Ok(crate::trailer::TrailerInfo {
383                    root: entry.id,
384                    info: None,
385                    encrypt: None,
386                    id: None,
387                    size,
388                    prev: None,
389                });
390            }
391        }
392    }
393
394    Err(PdfError::InvalidTrailer)
395}
396
397/// Scan backwards through the source for a `trailer` keyword and try to
398/// parse a trailer dictionary from it.
399fn try_find_trailer_dict(source: &[u8]) -> Option<crate::trailer::TrailerInfo> {
400    let marker = b"trailer";
401    // Search backwards from end
402    let start = source.len().saturating_sub(4096);
403    for i in (start..source.len().saturating_sub(marker.len())).rev() {
404        if &source[i..i + marker.len()] == marker {
405            // Try to parse the trailer dict after "trailer"
406            let mut pos = i + marker.len();
407            while pos < source.len() && is_whitespace(source[pos]) {
408                pos += 1;
409            }
410            if let Ok(crate::object::Object::Dictionary(dict)) = crate::object_parser::parse_object(
411                source,
412                pos as u64,
413                rpdfium_core::ParsingMode::Lenient,
414            ) {
415                return extract_trailer_from_dict(&dict);
416            }
417        }
418    }
419    None
420}
421
422/// Extract TrailerInfo from a dictionary, returning None if required fields are missing.
423fn extract_trailer_from_dict(
424    dict: &std::collections::HashMap<rpdfium_core::Name, crate::object::Object>,
425) -> Option<crate::trailer::TrailerInfo> {
426    let root = match dict.get(&rpdfium_core::Name::root()) {
427        Some(crate::object::Object::Reference(id)) => *id,
428        _ => return None,
429    };
430    let size = match dict.get(&rpdfium_core::Name::size()) {
431        Some(crate::object::Object::Integer(n)) if *n > 0 => *n as u32,
432        _ => return None,
433    };
434    let info = match dict.get(&rpdfium_core::Name::info()) {
435        Some(crate::object::Object::Reference(id)) => Some(*id),
436        _ => None,
437    };
438    let encrypt = match dict.get(&rpdfium_core::Name::encrypt()) {
439        Some(crate::object::Object::Reference(id)) => Some(*id),
440        _ => None,
441    };
442    Some(crate::trailer::TrailerInfo {
443        root,
444        info,
445        encrypt,
446        id: None,
447        size,
448        prev: None,
449    })
450}
451
/// Check if a byte window contains the dictionary entry `/Type /Catalog`.
///
/// `/Type` may be followed by any amount (including none) of PDF whitespace
/// (NUL, TAB, LF, FF, CR or space) before `/Catalog`, so `/Type/Catalog` and
/// a key and value on separate lines both match. `/Catalog` must be a complete
/// name: it has to be followed by whitespace, a PDF delimiter, or the end of
/// the window, so that longer names such as `/CatalogExtras` are not mistaken
/// for it.
fn contains_catalog_marker(window: &[u8]) -> bool {
    const TYPE_KEY: &[u8] = b"/Type";
    const CATALOG_NAME: &[u8] = b"/Catalog";

    /// PDF whitespace characters.
    fn is_pdf_whitespace(byte: u8) -> bool {
        matches!(byte, b'\0' | b'\t' | b'\n' | 0x0C | b'\r' | b' ')
    }

    /// Bytes that terminate a PDF name token.
    fn ends_name(byte: u8) -> bool {
        is_pdf_whitespace(byte)
            || matches!(
                byte,
                b'(' | b')' | b'<' | b'>' | b'[' | b']' | b'{' | b'}' | b'/' | b'%'
            )
    }

    window
        .windows(TYPE_KEY.len())
        .enumerate()
        .filter(|(_, candidate)| *candidate == TYPE_KEY)
        .any(|(index, _)| {
            let after_key = &window[index + TYPE_KEY.len()..];
            let gap = after_key
                .iter()
                .take_while(|&&byte| is_pdf_whitespace(byte))
                .count();
            match after_key[gap..].strip_prefix(CATALOG_NAME) {
                // End of window counts as a boundary: the caller may have
                // truncated the object body.
                Some(rest) => rest.first().map_or(true, |&byte| ends_name(byte)),
                None => false,
            }
        })
}
473
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE: the table fixtures below terminate each entry with ` \r\n`, which
    // makes every entry 21 bytes long (18 bytes of fields + space + CR + LF).
    // The parser reads 20 bytes per entry and drops the left-over LF before
    // reading the next one, so these fixtures exercise that tolerance path.

    #[test]
    fn parse_simple_xref_table() {
        let source = b"xref\n\
                        0 3\n\
                        0000000000 65535 f \r\n\
                        0000000009 00000 n \r\n\
                        0000000074 00000 n \r\n\
                        trailer";
        let (section, end_pos) = parse_xref_table(source, 0).unwrap();
        assert_eq!(section.entries.len(), 3);

        // First entry: free
        assert_eq!(section.entries[0].id.number, 0);
        assert_eq!(section.entries[0].entry_type, XrefEntryType::Free);

        // Second entry: in-use at offset 9
        assert_eq!(section.entries[1].id.number, 1);
        assert_eq!(
            section.entries[1].entry_type,
            XrefEntryType::InUse { offset: 9 }
        );

        // Third entry: in-use at offset 74
        assert_eq!(section.entries[2].id.number, 2);
        assert_eq!(
            section.entries[2].entry_type,
            XrefEntryType::InUse { offset: 74 }
        );

        // Should be positioned at "trailer"
        assert!(source[end_pos as usize..].starts_with(b"trailer"));
    }

    /// Two subsections are flattened into one section, keeping their own
    /// object numbers.
    #[test]
    fn parse_xref_with_multiple_subsections() {
        let source = b"xref\n\
                        0 1\n\
                        0000000000 65535 f \r\n\
                        3 1\n\
                        0000025325 00000 n \r\n\
                        trailer";
        let (section, _) = parse_xref_table(source, 0).unwrap();
        assert_eq!(section.entries.len(), 2);
        assert_eq!(section.entries[0].id.number, 0);
        assert_eq!(section.entries[1].id.number, 3);
    }

    #[test]
    fn reject_object_number_exceeding_limit() {
        // Object number that exceeds MAX_OBJECT_NUMBER
        let source = format!(
            "xref\n{} 1\n0000000000 00000 n \r\ntrailer",
            MAX_OBJECT_NUMBER + 1
        );
        let result = parse_xref_table(source.as_bytes(), 0);
        assert!(result.is_err());
    }

    #[test]
    fn parse_xref_entry_in_use() {
        let entry = b"0000000009 00000 n \r\n";
        let result = parse_xref_entry(entry, 1, 0).unwrap();
        assert_eq!(result.id.number, 1);
        assert_eq!(result.id.generation, 0);
        assert_eq!(result.entry_type, XrefEntryType::InUse { offset: 9 });
    }

    #[test]
    fn parse_xref_entry_free() {
        let entry = b"0000000000 65535 f \r\n";
        let result = parse_xref_entry(entry, 0, 0).unwrap();
        assert_eq!(result.id.number, 0);
        assert_eq!(result.id.generation, 65535);
        assert_eq!(result.entry_type, XrefEntryType::Free);
    }

    #[test]
    fn xref_table_default() {
        let table = XrefTable::default();
        assert!(table.sections.is_empty());
    }

    /// The table may start at a non-zero offset inside the buffer.
    #[test]
    fn parse_xref_at_offset() {
        let prefix = b"some garbage before ";
        let xref = b"xref\n0 1\n0000000000 65535 f \r\ntrailer";
        let mut source = prefix.to_vec();
        source.extend_from_slice(xref);
        let (section, _) = parse_xref_table(&source, prefix.len() as u64).unwrap();
        assert_eq!(section.entries.len(), 1);
    }

    // -----------------------------------------------------------------------
    // Upstream-derived xref table tests (cpdf_parser_unittest.cpp)
    // -----------------------------------------------------------------------

    /// Upstream: LoadCrossRefV4 — multiple non-contiguous subsections.
    /// Ported from cpdf_parser_unittest.cpp: subsections 0 1, 3 1, 8 2, 12 1.
    #[test]
    fn parse_xref_multiple_subsections_non_contiguous() {
        let source = b"xref\n\
                        0 1\n\
                        0000000000 65535 f \r\n\
                        3 1\n\
                        0000025325 00000 n \r\n\
                        8 2\n\
                        0000025518 00002 n \r\n\
                        0000025635 00000 n \r\n\
                        12 1\n\
                        0000025777 00000 n \r\n\
                        trailer";
        let (section, end_pos) = parse_xref_table(source, 0).unwrap();
        assert_eq!(section.entries.len(), 5);

        // Object 0: free
        assert_eq!(section.entries[0].id.number, 0);
        assert_eq!(section.entries[0].id.generation, 65535);
        assert_eq!(section.entries[0].entry_type, XrefEntryType::Free);

        // Object 3: in-use at offset 25325
        assert_eq!(section.entries[1].id.number, 3);
        assert_eq!(section.entries[1].id.generation, 0);
        assert_eq!(
            section.entries[1].entry_type,
            XrefEntryType::InUse { offset: 25325 }
        );

        // Object 8: in-use at offset 25518, generation 2
        assert_eq!(section.entries[2].id.number, 8);
        assert_eq!(section.entries[2].id.generation, 2);
        assert_eq!(
            section.entries[2].entry_type,
            XrefEntryType::InUse { offset: 25518 }
        );

        // Object 9: in-use at offset 25635
        assert_eq!(section.entries[3].id.number, 9);
        assert_eq!(section.entries[3].id.generation, 0);
        assert_eq!(
            section.entries[3].entry_type,
            XrefEntryType::InUse { offset: 25635 }
        );

        // Object 12: in-use at offset 25777
        assert_eq!(section.entries[4].id.number, 12);
        assert_eq!(section.entries[4].id.generation, 0);
        assert_eq!(
            section.entries[4].entry_type,
            XrefEntryType::InUse { offset: 25777 }
        );

        // End position should be at "trailer"
        assert!(source[end_pos as usize..].starts_with(b"trailer"));
    }

    /// Upstream: LoadCrossRefV4 — large table (regression for crbug.com/945624).
    /// 2048-entry xref table should not cause issues.
    #[test]
    fn parse_xref_large_table() {
        let count = 2048u64;
        let mut source = Vec::new();
        source.extend_from_slice(format!("xref\n0 {count}\n").as_bytes());

        // First entry is free
        source.extend_from_slice(b"0000000000 65535 f \r\n");

        // Remaining entries are in-use with incrementing offsets
        for i in 1..count {
            source.extend_from_slice(format!("{:010} 00000 n \r\n", i * 100).as_bytes());
        }
        source.extend_from_slice(b"trailer");

        let (section, end_pos) = parse_xref_table(&source, 0).unwrap();
        assert_eq!(section.entries.len(), count as usize);

        // Verify first and last entries
        assert_eq!(section.entries[0].id.number, 0);
        assert_eq!(section.entries[0].entry_type, XrefEntryType::Free);

        let last = &section.entries[count as usize - 1];
        assert_eq!(last.id.number, (count - 1) as u32);
        assert_eq!(
            last.entry_type,
            XrefEntryType::InUse {
                offset: (count - 1) * 100
            }
        );

        assert!(source[end_pos as usize..].starts_with(b"trailer"));
    }

    /// Upstream: free entries form a chain. We should parse them correctly.
    #[test]
    fn parse_xref_free_entry_chain() {
        let source = b"xref\n\
                        0 4\n\
                        0000000003 65535 f \r\n\
                        0000000100 00000 n \r\n\
                        0000000000 65535 f \r\n\
                        0000000000 65535 f \r\n\
                        trailer";
        let (section, _) = parse_xref_table(source, 0).unwrap();
        assert_eq!(section.entries.len(), 4);

        // Object 0: free (points to next free: 3)
        assert_eq!(section.entries[0].entry_type, XrefEntryType::Free);
        // Object 1: in-use
        assert_eq!(
            section.entries[1].entry_type,
            XrefEntryType::InUse { offset: 100 }
        );
        // Object 2: free
        assert_eq!(section.entries[2].entry_type, XrefEntryType::Free);
        // Object 3: free
        assert_eq!(section.entries[3].entry_type, XrefEntryType::Free);
    }

    /// Single-entry subsection.
    #[test]
    fn parse_xref_single_entry() {
        let source = b"xref\n\
                        5 1\n\
                        0000012345 00003 n \r\n\
                        trailer";
        let (section, _) = parse_xref_table(source, 0).unwrap();
        assert_eq!(section.entries.len(), 1);
        assert_eq!(section.entries[0].id.number, 5);
        assert_eq!(section.entries[0].id.generation, 3);
        assert_eq!(
            section.entries[0].entry_type,
            XrefEntryType::InUse { offset: 12345 }
        );
    }

    /// Empty xref table (0 entries).
    #[test]
    fn parse_xref_zero_entries() {
        let source = b"xref\n\
                        0 0\n\
                        trailer";
        let (section, _) = parse_xref_table(source, 0).unwrap();
        assert!(section.entries.is_empty());
    }

    /// Truncated xref entry (< 20 bytes) should fail.
    #[test]
    fn parse_xref_truncated_entry() {
        let source = b"xref\n\
                        0 1\n\
                        0000000000 65535";
        let result = parse_xref_table(source, 0);
        assert!(result.is_err());
    }

    /// Missing "xref" keyword should fail.
    #[test]
    fn parse_xref_missing_keyword() {
        let source = b"0 1\n0000000000 65535 f \r\ntrailer";
        let result = parse_xref_table(source, 0);
        assert!(result.is_err());
    }

    /// Invalid type marker (not 'n' or 'f') should fail.
    #[test]
    fn parse_xref_entry_invalid_marker() {
        let entry = b"0000000009 00000 x \r\n";
        let result = parse_xref_entry(entry, 1, 0);
        assert!(result.is_err());
    }

    /// Xref entry with high generation number.
    #[test]
    fn parse_xref_entry_high_generation() {
        let entry = b"0000000009 12345 n \r\n";
        let result = parse_xref_entry(entry, 1, 0).unwrap();
        assert_eq!(result.id.generation, 12345);
    }

    /// Xref entry with large offset.
    #[test]
    fn parse_xref_entry_large_offset() {
        let entry = b"9999999999 00000 n \r\n";
        let result = parse_xref_entry(entry, 1, 0).unwrap();
        assert_eq!(
            result.entry_type,
            XrefEntryType::InUse { offset: 9999999999 }
        );
    }

    // -----------------------------------------------------------------------
    // rebuild_xref tests
    // -----------------------------------------------------------------------

    /// rebuild_xref finds objects and reconstructs the trailer from a
    /// PDF with a valid trailer dict but no valid startxref/xref table.
    #[test]
    fn rebuild_xref_with_trailer() {
        let mut pdf = Vec::new();
        pdf.extend_from_slice(b"%PDF-1.4\n");

        let obj1_offset = pdf.len();
        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

        let obj2_offset = pdf.len();
        pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n");

        // Add a trailer dict but with a corrupt xref
        pdf.extend_from_slice(b"trailer\n<< /Size 3 /Root 1 0 R >>\n");
        // No valid startxref

        let (table, trailer) = rebuild_xref(&pdf).unwrap();
        assert_eq!(table.sections.len(), 1);

        let entries = &table.sections[0].entries;
        assert_eq!(entries.len(), 2);

        // Verify object 1 found at correct offset
        let entry1 = entries.iter().find(|e| e.id.number == 1).unwrap();
        assert_eq!(
            entry1.entry_type,
            XrefEntryType::InUse {
                offset: obj1_offset as u64
            }
        );

        // Verify object 2 found at correct offset
        let entry2 = entries.iter().find(|e| e.id.number == 2).unwrap();
        assert_eq!(
            entry2.entry_type,
            XrefEntryType::InUse {
                offset: obj2_offset as u64
            }
        );

        // Trailer extracted from the trailer dict
        assert_eq!(trailer.root, ObjectId::new(1, 0));
        assert_eq!(trailer.size, 3);
    }

    /// rebuild_xref finds the catalog object when there is no trailer dict.
    #[test]
    fn rebuild_xref_catalog_fallback() {
        let mut pdf = Vec::new();
        pdf.extend_from_slice(b"%PDF-1.4\n");

        let obj1_offset = pdf.len();
        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

        let obj2_offset = pdf.len();
        pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n");

        // No trailer at all

        let (table, trailer) = rebuild_xref(&pdf).unwrap();
        assert_eq!(table.sections.len(), 1);

        let entries = &table.sections[0].entries;
        assert_eq!(entries.len(), 2);

        let entry1 = entries.iter().find(|e| e.id.number == 1).unwrap();
        assert_eq!(
            entry1.entry_type,
            XrefEntryType::InUse {
                offset: obj1_offset as u64
            }
        );

        let entry2 = entries.iter().find(|e| e.id.number == 2).unwrap();
        assert_eq!(
            entry2.entry_type,
            XrefEntryType::InUse {
                offset: obj2_offset as u64
            }
        );

        // Root should be object 1 (the Catalog)
        assert_eq!(trailer.root, ObjectId::new(1, 0));
        // Size = max object number + 1
        assert_eq!(trailer.size, 3);
    }

    /// rebuild_xref on source with no objects returns an error.
    #[test]
    fn rebuild_xref_no_objects() {
        let source = b"%PDF-1.4\nno objects here at all\n%%EOF";
        let result = rebuild_xref(source);
        assert!(result.is_err());
    }

    /// rebuild_xref does not match partial keywords like "object".
    #[test]
    fn rebuild_xref_ignores_non_obj_keywords() {
        let source = b"%PDF-1.4\nThis is an object keyword test\n";
        let result = rebuild_xref(source);
        assert!(result.is_err());
    }

    /// try_parse_obj_marker correctly parses "N G obj".
    #[test]
    fn try_parse_obj_marker_basic() {
        let source = b"1 0 obj\n";
        let result = try_parse_obj_marker(source, 0);
        assert!(result.is_some());
        let (number, generation, end) = result.unwrap();
        assert_eq!(number, 1);
        assert_eq!(generation, 0);
        assert_eq!(end, 7); // past "obj"
    }

    /// try_parse_obj_marker rejects "1 0 object" (alphabetic continuation).
    #[test]
    fn try_parse_obj_marker_rejects_object_word() {
        let source = b"1 0 object\n";
        let result = try_parse_obj_marker(source, 0);
        assert!(result.is_none());
    }

    /// contains_catalog_marker detects /Type /Catalog with various spacing.
    #[test]
    fn contains_catalog_marker_test() {
        assert!(contains_catalog_marker(b"/Type /Catalog"));
        assert!(contains_catalog_marker(b"/Type/Catalog"));
        assert!(contains_catalog_marker(
            b"<< /Type  /Catalog /Pages 2 0 R >>"
        ));
        assert!(!contains_catalog_marker(b"/Type /Pages"));
        assert!(!contains_catalog_marker(b"no catalog here"));
    }
}