Skip to main content

rpdfium_parser/
trailer.rs

1// Derived from PDFium's cpdf_parser.cpp (trailer & startxref parsing)
2// Original: Copyright 2014 The PDFium Authors
3// Licensed under BSD-3-Clause / Apache-2.0
4// See pdfium-upstream/LICENSE for the original license.
5
6//! Trailer parsing — locates `startxref`, parses trailer dictionary,
7//! and follows the `/Prev` chain to build the full cross-reference table.
8
9use std::collections::HashMap;
10
11use rpdfium_core::error::PdfError;
12use rpdfium_core::{Name, ParsingMode};
13
14use crate::object::{Object, ObjectId};
15use crate::object_parser::parse_object;
16use crate::tokenizer::is_whitespace;
17use crate::xref::parse_xref_table;
18use crate::xref::{XrefSection, XrefTable};
19use crate::xref_stream::parse_xref_stream;
20
21/// Extracted information from the trailer dictionary.
22#[derive(Debug, Clone)]
23pub struct TrailerInfo {
24    /// The indirect reference to the document catalog (`/Root`).
25    pub root: ObjectId,
26    /// Optional indirect reference to the document information dictionary (`/Info`).
27    pub info: Option<ObjectId>,
28    /// Optional indirect reference to the encryption dictionary (`/Encrypt`).
29    pub encrypt: Option<ObjectId>,
30    /// Optional file identifier array (`/ID`).
31    pub id: Option<[Vec<u8>; 2]>,
32    /// Total number of cross-reference entries (`/Size`).
33    pub size: u32,
34    /// Optional offset to the previous cross-reference section (`/Prev`).
35    pub prev: Option<u64>,
36}
37
38impl TrailerInfo {
39    /// Returns the object number of the document catalog (`/Root`).
40    ///
41    /// Corresponds to `CPDF_Parser::GetRootObjNum()` in PDFium.
42    pub fn root_obj_num(&self) -> u32 {
43        self.root.number
44    }
45
46    /// ADR-019 alias for [`root_obj_num()`](Self::root_obj_num).
47    ///
48    /// Corresponds to `CPDF_Parser::GetRootObjNum()` in PDFium.
49    #[inline]
50    pub fn get_root_obj_num(&self) -> u32 {
51        self.root_obj_num()
52    }
53
54    /// Returns the object number of the document information dictionary
55    /// (`/Info`), or `None` if not present.
56    ///
57    /// Corresponds to `CPDF_Parser::GetInfoObjNum()` in PDFium.
58    pub fn info_obj_num(&self) -> Option<u32> {
59        self.info.map(|id| id.number)
60    }
61
62    /// ADR-019 alias for [`info_obj_num()`](Self::info_obj_num).
63    ///
64    /// Corresponds to `CPDF_Parser::GetInfoObjNum()` in PDFium.
65    #[inline]
66    pub fn get_info_obj_num(&self) -> Option<u32> {
67        self.info_obj_num()
68    }
69
70    /// Returns the object ID reference to the encryption dictionary
71    /// (`/Encrypt`), or `None` if the document is not encrypted.
72    ///
73    /// Corresponds to `CPDF_Parser::GetEncryptDict()` in PDFium (returns the
74    /// reference rather than resolving the dictionary itself).
75    pub fn encrypt_dict(&self) -> Option<ObjectId> {
76        self.encrypt
77    }
78
79    /// ADR-019 alias for [`encrypt_dict()`](Self::encrypt_dict).
80    ///
81    /// Corresponds to `CPDF_Parser::GetEncryptDict()` in PDFium.
82    #[inline]
83    pub fn get_encrypt_dict(&self) -> Option<ObjectId> {
84        self.encrypt_dict()
85    }
86
87    /// Deprecated non-upstream alias for [`encrypt_dict()`](Self::encrypt_dict).
88    ///
89    /// There is no `CPDF_Trailer::GetEncryptId()` in PDFium upstream; the
90    /// correct upstream method is `CPDF_Parser::GetEncryptDict()`, already
91    /// covered by [`get_encrypt_dict()`](Self::get_encrypt_dict).
92    /// Use [`get_encrypt_dict()`](Self::get_encrypt_dict) instead.
93    #[deprecated(
94        note = "use `get_encrypt_dict()` — matches upstream CPDF_Parser::GetEncryptDict()"
95    )]
96    #[inline]
97    pub fn get_encrypt_id(&self) -> Option<ObjectId> {
98        self.encrypt_dict()
99    }
100
101    /// Rust-idiomatic alias for [`encrypt_dict()`](Self::encrypt_dict).
102    #[inline]
103    #[deprecated(since = "0.0.0", note = "use `encrypt_dict()` or `get_encrypt_dict()`")]
104    pub fn encrypt_id(&self) -> Option<ObjectId> {
105        self.encrypt_dict()
106    }
107
108    /// Returns a reference to the file identifier byte-string pair (`/ID`),
109    /// or `None` if not present.
110    ///
111    /// Corresponds to `CPDF_Parser::GetIDArray()` in PDFium.
112    pub fn id_array(&self) -> Option<&[Vec<u8>; 2]> {
113        self.id.as_ref()
114    }
115
116    /// ADR-019 alias for [`id_array()`](Self::id_array).
117    ///
118    /// Corresponds to `CPDF_Parser::GetIDArray()` in PDFium.
119    #[inline]
120    pub fn get_id_array(&self) -> Option<&[Vec<u8>; 2]> {
121        self.id_array()
122    }
123}
124
/// Maximum number of /Prev links to follow (prevents infinite loops).
///
/// `parse_all_xrefs` compares this against the number of offsets it has
/// already visited and stops following the chain (logging a warning) once
/// the limit is reached.
const MAX_PREV_CHAIN: usize = 512;

/// Number of bytes from the end of file to search for `startxref`.
///
/// `find_startxref` inspects only this many trailing bytes; inputs shorter
/// than this are searched in full.
const STARTXREF_SEARCH_SIZE: usize = 1024;
130
131/// Parse all cross-reference sections and trailer info from the source.
132///
133/// 1. Locate `startxref` near the end of the file.
134/// 2. Parse the xref section (table or stream) at that offset.
135/// 3. Follow `/Prev` chain to discover all xref sections.
136/// 4. Return the merged `XrefTable` (newest-first) and `TrailerInfo` from the newest trailer.
137pub fn parse_all_xrefs(
138    source: &[u8],
139    mode: ParsingMode,
140) -> Result<(XrefTable, TrailerInfo), PdfError> {
141    let startxref_offset = find_startxref(source)?;
142
143    let mut xref_table = XrefTable::new();
144    xref_table.start_offset = startxref_offset;
145    let mut trailer_info: Option<TrailerInfo> = None;
146    let mut current_offset = Some(startxref_offset);
147    let mut visited_offsets = Vec::new();
148
149    // Follow /Prev chain iteratively (no recursion)
150    while let Some(offset) = current_offset {
151        // Circular reference detection
152        if visited_offsets.contains(&offset) {
153            tracing::warn!(offset = offset, "circular /Prev reference in xref chain");
154            break;
155        }
156
157        if visited_offsets.len() >= MAX_PREV_CHAIN {
158            tracing::warn!("exceeded maximum /Prev chain length");
159            break;
160        }
161
162        visited_offsets.push(offset);
163
164        // Determine if this is a traditional xref table or an xref stream
165        let (section, trailer_dict, prev) = parse_xref_at_offset(source, offset, mode)?;
166
167        xref_table.push(section);
168
169        // Extract trailer info from the first (newest) trailer
170        if trailer_info.is_none() {
171            trailer_info = Some(extract_trailer_info(&trailer_dict)?);
172        }
173
174        // Follow /Prev
175        current_offset = prev;
176    }
177
178    let info = trailer_info.ok_or(PdfError::InvalidTrailer)?;
179    Ok((xref_table, info))
180}
181
182/// Parse a single xref section at the given offset.
183/// Handles both traditional xref tables and xref streams.
184///
185/// Returns (xref section, trailer dict, optional /Prev offset).
186type XrefAtOffsetResult = Result<(XrefSection, HashMap<Name, Object>, Option<u64>), PdfError>;
187
188fn parse_xref_at_offset(source: &[u8], offset: u64, mode: ParsingMode) -> XrefAtOffsetResult {
189    // Peek at the data to determine format
190    let start = offset as usize;
191    if start >= source.len() {
192        return Err(PdfError::InvalidXref);
193    }
194
195    // Skip whitespace
196    let mut peek_pos = start;
197    while peek_pos < source.len() && is_whitespace(source[peek_pos]) {
198        peek_pos += 1;
199    }
200
201    if peek_pos + 4 <= source.len() && &source[peek_pos..peek_pos + 4] == b"xref" {
202        // Traditional xref table + trailer
203        let (section, trailer_offset) = parse_xref_table(source, offset)?;
204        let (trailer_dict, prev) = parse_trailer_dict(source, trailer_offset, mode)?;
205        Ok((section, trailer_dict, prev))
206    } else {
207        // Xref stream (the object at this offset is the stream)
208        let (section, dict) = parse_xref_stream(source, offset, mode)?;
209        let prev = extract_prev(&dict);
210
211        // If there's a /XRefStm in the dict, we need to also parse that xref stream
212        // For now, the dict serves as both the xref stream dict and the trailer dict
213        Ok((section, dict, prev))
214    }
215}
216
217/// Parse the trailer dictionary that follows a traditional xref table.
218///
219/// Expected format: `trailer\n<< ... >>`
220fn parse_trailer_dict(
221    source: &[u8],
222    offset: u64,
223    mode: ParsingMode,
224) -> Result<(HashMap<Name, Object>, Option<u64>), PdfError> {
225    let mut pos = offset as usize;
226
227    // Skip whitespace
228    while pos < source.len() && is_whitespace(source[pos]) {
229        pos += 1;
230    }
231
232    // Expect "trailer" keyword
233    if pos + 7 > source.len() || &source[pos..pos + 7] != b"trailer" {
234        return Err(PdfError::InvalidTrailer);
235    }
236    pos += 7;
237
238    // Skip whitespace
239    while pos < source.len() && is_whitespace(source[pos]) {
240        pos += 1;
241    }
242
243    // Parse the trailer dictionary
244    let dict_obj = parse_object(source, pos as u64, mode)?;
245    let dict = match dict_obj {
246        Object::Dictionary(d) => d,
247        _ => return Err(PdfError::InvalidTrailer),
248    };
249
250    let prev = extract_prev(&dict);
251    Ok((dict, prev))
252}
253
254/// Extract the `/Prev` offset from a trailer dictionary.
255fn extract_prev(dict: &HashMap<Name, Object>) -> Option<u64> {
256    match dict.get(&Name::prev()) {
257        Some(Object::Integer(n)) if *n >= 0 => Some(*n as u64),
258        _ => None,
259    }
260}
261
262/// Extract `TrailerInfo` from a trailer dictionary.
263fn extract_trailer_info(dict: &HashMap<Name, Object>) -> Result<TrailerInfo, PdfError> {
264    // /Root is required
265    let root = match dict.get(&Name::root()) {
266        Some(Object::Reference(id)) => *id,
267        _ => return Err(PdfError::InvalidTrailer),
268    };
269
270    // /Size is required
271    let size = match dict.get(&Name::size()) {
272        Some(Object::Integer(n)) if *n > 0 => *n as u32,
273        _ => return Err(PdfError::InvalidTrailer),
274    };
275
276    // /Info is optional
277    let info = match dict.get(&Name::info()) {
278        Some(Object::Reference(id)) => Some(*id),
279        _ => None,
280    };
281
282    // /Encrypt is optional
283    let encrypt = match dict.get(&Name::encrypt()) {
284        Some(Object::Reference(id)) => Some(*id),
285        _ => None,
286    };
287
288    // /ID is optional (array of two strings)
289    let id = extract_id_array(dict);
290
291    let prev = extract_prev(dict);
292
293    Ok(TrailerInfo {
294        root,
295        info,
296        encrypt,
297        id,
298        size,
299        prev,
300    })
301}
302
303/// Extract the /ID array (two byte strings) from the trailer dictionary.
304fn extract_id_array(dict: &HashMap<Name, Object>) -> Option<[Vec<u8>; 2]> {
305    let arr = match dict.get(&Name::id()) {
306        Some(Object::Array(a)) if a.len() >= 2 => a,
307        _ => return None,
308    };
309
310    let id0 = match &arr[0] {
311        Object::String(s) => s.as_bytes().to_vec(),
312        _ => return None,
313    };
314    let id1 = match &arr[1] {
315        Object::String(s) => s.as_bytes().to_vec(),
316        _ => return None,
317    };
318
319    Some([id0, id1])
320}
321
322/// Find the `startxref` offset from near the end of the file.
323///
324/// PDF files end with:
325/// ```text
326/// startxref
327/// <offset>
328/// %%EOF
329/// ```
330pub fn find_startxref(source: &[u8]) -> Result<u64, PdfError> {
331    let search_start = source.len().saturating_sub(STARTXREF_SEARCH_SIZE);
332    let tail = &source[search_start..];
333
334    // Search backwards for "startxref"
335    let marker = b"startxref";
336
337    let mut found_pos = None;
338    for i in (0..tail.len().saturating_sub(marker.len())).rev() {
339        if &tail[i..i + marker.len()] == marker {
340            found_pos = Some(i);
341            break;
342        }
343    }
344
345    let pos = found_pos.ok_or(PdfError::InvalidTrailer)?;
346
347    // Skip past "startxref" and whitespace to read the offset
348    let mut offset_start = pos + marker.len();
349    while offset_start < tail.len() && is_whitespace(tail[offset_start]) {
350        offset_start += 1;
351    }
352
353    // Read digits
354    let mut offset_end = offset_start;
355    while offset_end < tail.len() && tail[offset_end] >= b'0' && tail[offset_end] <= b'9' {
356        offset_end += 1;
357    }
358
359    if offset_end == offset_start {
360        return Err(PdfError::InvalidTrailer);
361    }
362
363    let offset_str = std::str::from_utf8(&tail[offset_start..offset_end])
364        .map_err(|_| PdfError::InvalidTrailer)?;
365    let offset: u64 = offset_str.parse().map_err(|_| PdfError::InvalidTrailer)?;
366
367    Ok(offset)
368}
369
/// Unit tests for `startxref` discovery and trailer-dictionary extraction,
/// including malformed-input edge cases.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_find_startxref_simple() {
        let source = b"some content\nstartxref\n1234\n%%EOF";
        let offset = find_startxref(source).unwrap();
        assert_eq!(offset, 1234);
    }

    #[test]
    fn test_find_startxref_with_extra_whitespace() {
        let source = b"content\nstartxref\n  5678  \n%%EOF\n";
        let offset = find_startxref(source).unwrap();
        assert_eq!(offset, 5678);
    }

    #[test]
    fn test_find_startxref_missing() {
        let source = b"no start xref here %%EOF";
        let result = find_startxref(source);
        assert!(result.is_err());
    }

    #[test]
    fn test_find_startxref_without_digits() {
        // The keyword is present but no offset follows it.
        let source = b"startxref\n%%EOF";
        assert!(find_startxref(source).is_err());
    }

    #[test]
    fn test_find_startxref_uses_last_marker() {
        // Incrementally updated files contain several `startxref` keywords;
        // the one closest to the end of the file must win.
        let source = b"startxref\n1\n%%EOF\nstartxref\n22\n%%EOF";
        assert_eq!(find_startxref(source).unwrap(), 22);
    }

    #[test]
    fn test_extract_trailer_info_valid() {
        let mut dict = HashMap::new();
        dict.insert(Name::root(), Object::Reference(ObjectId::new(1, 0)));
        dict.insert(Name::size(), Object::Integer(10));
        dict.insert(Name::info(), Object::Reference(ObjectId::new(2, 0)));

        let info = extract_trailer_info(&dict).unwrap();
        assert_eq!(info.root, ObjectId::new(1, 0));
        assert_eq!(info.size, 10);
        assert_eq!(info.info, Some(ObjectId::new(2, 0)));
        assert!(info.encrypt.is_none());
        assert!(info.id.is_none());
    }

    #[test]
    fn test_extract_trailer_info_missing_root() {
        let mut dict = HashMap::new();
        dict.insert(Name::size(), Object::Integer(10));
        let result = extract_trailer_info(&dict);
        assert!(result.is_err());
    }

    #[test]
    fn test_extract_trailer_info_root_not_a_reference() {
        let mut dict = HashMap::new();
        dict.insert(Name::root(), Object::Integer(1));
        dict.insert(Name::size(), Object::Integer(10));
        assert!(extract_trailer_info(&dict).is_err());
    }

    #[test]
    fn test_extract_trailer_info_missing_size() {
        let mut dict = HashMap::new();
        dict.insert(Name::root(), Object::Reference(ObjectId::new(1, 0)));
        let result = extract_trailer_info(&dict);
        assert!(result.is_err());
    }

    #[test]
    fn test_extract_trailer_info_non_positive_size() {
        let mut dict = HashMap::new();
        dict.insert(Name::root(), Object::Reference(ObjectId::new(1, 0)));
        dict.insert(Name::size(), Object::Integer(0));
        assert!(extract_trailer_info(&dict).is_err());

        dict.insert(Name::size(), Object::Integer(-3));
        assert!(extract_trailer_info(&dict).is_err());
    }

    #[test]
    fn test_extract_prev_present() {
        let mut dict = HashMap::new();
        dict.insert(Name::prev(), Object::Integer(500));
        assert_eq!(extract_prev(&dict), Some(500));
    }

    #[test]
    fn test_extract_prev_absent() {
        let dict = HashMap::new();
        assert_eq!(extract_prev(&dict), None);
    }

    #[test]
    fn test_extract_prev_negative_is_ignored() {
        let mut dict = HashMap::new();
        dict.insert(Name::prev(), Object::Integer(-1));
        assert_eq!(extract_prev(&dict), None);
    }

    #[test]
    fn test_extract_id_array_valid() {
        use rpdfium_core::PdfString;
        let mut dict = HashMap::new();
        dict.insert(
            Name::id(),
            Object::Array(vec![
                Object::String(PdfString::from_bytes(b"abc".to_vec())),
                Object::String(PdfString::from_bytes(b"def".to_vec())),
            ]),
        );
        let id = extract_id_array(&dict).unwrap();
        assert_eq!(id[0], b"abc");
        assert_eq!(id[1], b"def");
    }

    #[test]
    fn test_extract_id_array_missing() {
        let dict = HashMap::new();
        assert!(extract_id_array(&dict).is_none());
    }

    #[test]
    fn test_extract_id_array_too_short() {
        use rpdfium_core::PdfString;
        let mut dict = HashMap::new();
        dict.insert(
            Name::id(),
            Object::Array(vec![Object::String(PdfString::from_bytes(
                b"abc".to_vec(),
            ))]),
        );
        assert!(extract_id_array(&dict).is_none());
    }

    #[test]
    fn test_extract_id_array_non_string_elements() {
        let mut dict = HashMap::new();
        dict.insert(
            Name::id(),
            Object::Array(vec![Object::Integer(1), Object::Integer(2)]),
        );
        assert!(extract_id_array(&dict).is_none());
    }

    #[test]
    fn test_parse_trailer_dict() {
        let source = b"trailer\n<< /Size 10 /Root 1 0 R >>\nstartxref\n0\n%%EOF";
        let (dict, prev) = parse_trailer_dict(source, 0, ParsingMode::Strict).unwrap();
        assert!(dict.contains_key(&Name::size()));
        assert!(dict.contains_key(&Name::root()));
        assert!(prev.is_none());
    }

    #[test]
    fn test_parse_trailer_dict_with_prev() {
        let source = b"trailer\n<< /Size 10 /Root 1 0 R /Prev 500 >>";
        let (dict, prev) = parse_trailer_dict(source, 0, ParsingMode::Strict).unwrap();
        assert!(dict.contains_key(&Name::size()));
        assert_eq!(prev, Some(500));
    }

    #[test]
    fn test_parse_trailer_dict_leading_whitespace() {
        let source = b"  \ntrailer\n<< /Size 3 /Root 1 0 R >>";
        let (dict, prev) = parse_trailer_dict(source, 0, ParsingMode::Strict).unwrap();
        assert!(dict.contains_key(&Name::root()));
        assert!(prev.is_none());
    }

    #[test]
    fn test_parse_trailer_dict_missing_keyword() {
        let source = b"<< /Size 10 /Root 1 0 R >>";
        assert!(parse_trailer_dict(source, 0, ParsingMode::Strict).is_err());
    }

    #[test]
    fn test_parse_trailer_dict_offset_past_end() {
        let source = b"trailer\n<< /Size 10 /Root 1 0 R >>";
        let past_end = source.len() as u64 + 10;
        assert!(parse_trailer_dict(source, past_end, ParsingMode::Strict).is_err());
    }
}