Skip to main content

rpdfium_parser/
linearized_header.rs

1// Derived from PDFium's cpdf_linearized_header.cpp
2// Original: Copyright 2014 The PDFium Authors
3// Licensed under BSD-3-Clause / Apache-2.0
4// See pdfium-upstream/LICENSE for the original license.
5
6//! Linearization detection.
7//!
8//! A linearized PDF has its first indirect object contain a `/Linearized` key
9//! in its dictionary. This module detects this marker and extracts the
10//! linearization parameters.
11
12use rpdfium_core::{Name, ParsingMode};
13
14use crate::header::parse_header;
15use crate::object::Object;
16use crate::object_parser::parse_indirect_object;
17
/// Parameters read from a PDF linearization dictionary.
///
/// Every field is a plain integer (or an optional integer), so values can be
/// cloned and compared cheaply.
///
/// Corresponds to `CPDF_LinearizedHeader` in PDFium.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LinearizedInfo {
    /// Total file length in bytes (`/L`).
    pub file_length: u64,
    /// Primary (first) page number (`/P`), typically 0.
    pub primary_page: u32,
    /// Byte offset of the first entry in the main cross-reference table (`/T`).
    pub main_xref_table_first_entry_offset: u64,
    /// Total page count (`/N`).
    pub page_count: u32,
    /// Byte offset of the end of the first page's data (`/E`).
    pub first_page_end_offset: u64,
    /// Object number of the first page object (`/O`).
    pub first_page_obj_num: u32,
    /// Byte offset of the last cross-reference entry.
    ///
    /// Upstream PDFium passes the parser position just after the `endobj` of
    /// the linearization object here. `detect_linearized` in this module does
    /// not track that position and stores the `/T` value instead.
    pub last_xref_offset: u64,
    /// Byte offset of the hint stream (`/H` array, first element).
    pub hint_stream_offset: Option<u64>,
    /// Length of the hint stream in bytes (`/H` array, second element).
    pub hint_stream_length: Option<u32>,
}

impl LinearizedInfo {
    /// Returns the total file size in bytes (`/L`).
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetFileSize()` in PDFium.
    pub fn file_size(&self) -> u64 {
        self.file_length
    }

    /// ADR-019 alias for [`file_size()`](Self::file_size).
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetFileSize()` in PDFium.
    #[inline]
    pub fn get_file_size(&self) -> u64 {
        self.file_size()
    }

    /// Returns the primary (first) page number (`/P`).
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetFirstPageNo()` in PDFium.
    pub fn first_page_no(&self) -> u32 {
        self.primary_page
    }

    /// ADR-019 alias for [`first_page_no()`](Self::first_page_no).
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetFirstPageNo()` in PDFium.
    #[inline]
    pub fn get_first_page_no(&self) -> u32 {
        self.first_page_no()
    }

    /// Returns the byte offset of the first entry in the main xref table (`/T`).
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetMainXRefTableFirstEntryOffset()`
    /// in PDFium.
    pub fn main_xref_table_first_entry_offset(&self) -> u64 {
        self.main_xref_table_first_entry_offset
    }

    /// ADR-019 alias for
    /// [`main_xref_table_first_entry_offset()`](Self::main_xref_table_first_entry_offset).
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetMainXRefTableFirstEntryOffset()`
    /// in PDFium.
    #[inline]
    pub fn get_main_xref_table_first_entry_offset(&self) -> u64 {
        self.main_xref_table_first_entry_offset()
    }

    /// Returns the total page count (`/N`).
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetPageCount()` in PDFium.
    pub fn page_count(&self) -> u32 {
        self.page_count
    }

    /// ADR-019 alias for [`page_count()`](Self::page_count).
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetPageCount()` in PDFium.
    #[inline]
    pub fn get_page_count(&self) -> u32 {
        self.page_count()
    }

    /// Returns the byte offset of the end of the first page's data (`/E`).
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetFirstPageEndOffset()` in PDFium.
    pub fn first_page_end_offset(&self) -> u64 {
        self.first_page_end_offset
    }

    /// ADR-019 alias for [`first_page_end_offset()`](Self::first_page_end_offset).
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetFirstPageEndOffset()` in PDFium.
    #[inline]
    pub fn get_first_page_end_offset(&self) -> u64 {
        self.first_page_end_offset()
    }

    /// Returns the object number of the first page object (`/O`).
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetFirstPageObjNum()` in PDFium.
    pub fn first_page_obj_num(&self) -> u32 {
        self.first_page_obj_num
    }

    /// ADR-019 alias for [`first_page_obj_num()`](Self::first_page_obj_num).
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetFirstPageObjNum()` in PDFium.
    #[inline]
    pub fn get_first_page_obj_num(&self) -> u32 {
        self.first_page_obj_num()
    }

    /// Returns the stored `last_xref_offset` value unchanged.
    ///
    /// In PDFium this is the `szLastXRefOffset` constructor argument, i.e. the
    /// parser position right after the `endobj` keyword of the linearization
    /// object. Values produced by `detect_linearized` in this module carry the
    /// `/T` offset instead, because that function does not track the parser
    /// position.
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetLastXRefOffset()` in PDFium.
    pub fn last_xref_offset(&self) -> u64 {
        self.last_xref_offset
    }

    /// ADR-019 alias for [`last_xref_offset()`](Self::last_xref_offset).
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetLastXRefOffset()` in PDFium.
    #[inline]
    pub fn get_last_xref_offset(&self) -> u64 {
        self.last_xref_offset()
    }

    /// Returns the byte offset of the hint stream (`/H` first element), if any.
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetHintStart()` in PDFium.
    pub fn hint_start(&self) -> Option<u64> {
        self.hint_stream_offset
    }

    /// ADR-019 alias for [`hint_start()`](Self::hint_start).
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetHintStart()` in PDFium.
    #[inline]
    pub fn get_hint_start(&self) -> Option<u64> {
        self.hint_start()
    }

    /// Returns the length of the hint stream in bytes (`/H` second element),
    /// if any.
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetHintLength()` in PDFium.
    pub fn hint_length(&self) -> Option<u32> {
        self.hint_stream_length
    }

    /// ADR-019 alias for [`hint_length()`](Self::hint_length).
    ///
    /// Corresponds to `CPDF_LinearizedHeader::GetHintLength()` in PDFium.
    #[inline]
    pub fn get_hint_length(&self) -> Option<u32> {
        self.hint_length()
    }

    /// Returns `true` if a hint table is present and valid.
    ///
    /// All three conditions must hold: page count > 1, hint start present and
    /// > 0, hint length present and > 0. A missing (`None`) hint start or
    /// length therefore yields `false`.
    ///
    /// Corresponds to `CPDF_LinearizedHeader::HasHintTable()` in PDFium.
    pub fn has_hint_table(&self) -> bool {
        self.page_count() > 1
            && self.hint_start().is_some_and(|s| s > 0)
            && self.hint_length().is_some_and(|l| l > 0)
    }
}
200
201/// Detect whether a PDF is linearized by checking the first indirect object.
202///
203/// Returns `Some(LinearizedInfo)` if the file is linearized, `None` otherwise.
204pub fn detect_linearized(source: &[u8], mode: ParsingMode) -> Option<LinearizedInfo> {
205    // Parse the header to find where objects start
206    let (_version, header_end) = parse_header(source, mode).ok()?;
207
208    // Skip any binary comment line (common: % followed by 4+ high bytes)
209    let mut pos = header_end as usize;
210    while pos < source.len()
211        && (source[pos] == b'%'
212            || source[pos] == b'\r'
213            || source[pos] == b'\n'
214            || source[pos] > 127)
215    {
216        // Skip the comment line
217        if source[pos] == b'%' {
218            while pos < source.len() && source[pos] != b'\r' && source[pos] != b'\n' {
219                pos += 1;
220            }
221        }
222        while pos < source.len() && (source[pos] == b'\r' || source[pos] == b'\n') {
223            pos += 1;
224        }
225    }
226
227    // Try to parse the first indirect object
228    let (_id, obj) = parse_indirect_object(source, pos as u64, mode).ok()?;
229
230    // Check if the dictionary contains /Linearized
231    let dict = obj.as_dict()?;
232    let _linearized = dict.get(&Name::from("Linearized"))?;
233
234    // Extract linearization parameters per the PDF spec and
235    // CPDF_LinearizedHeader constructor:
236    //   /L = file_size
237    //   /P = first_page_no (optional, default 0)
238    //   /T = main_xref_table_first_entry_offset
239    //   /N = page_count
240    //   /E = first_page_end_offset
241    //   /O = first_page_obj_num
242
243    let file_length = match dict.get(&Name::from_bytes(b"L".to_vec())) {
244        Some(Object::Integer(n)) if *n > 0 => *n as u64,
245        _ => return None,
246    };
247
248    let primary_page = match dict.get(&Name::from_bytes(b"P".to_vec())) {
249        Some(Object::Integer(n)) if *n >= 0 => *n as u32,
250        _ => 0, // Default to 0 if not present
251    };
252
253    let main_xref_table_first_entry_offset = match dict.get(&Name::from_bytes(b"T".to_vec())) {
254        Some(Object::Integer(n)) if *n > 0 => *n as u64,
255        _ => return None,
256    };
257
258    let page_count = match dict.get(&Name::n()) {
259        Some(Object::Integer(n)) if *n > 0 => *n as u32,
260        _ => return None,
261    };
262
263    let first_page_end_offset = match dict.get(&Name::from_bytes(b"E".to_vec())) {
264        Some(Object::Integer(n)) if *n > 0 => *n as u64,
265        _ => return None,
266    };
267
268    let first_page_obj_num = match dict.get(&Name::from_bytes(b"O".to_vec())) {
269        Some(Object::Integer(n)) if *n > 0 => *n as u32,
270        _ => return None,
271    };
272
273    // Parse /H array: [offset, length] or [offset, length, shared_offset, shared_length]
274    let (hint_stream_offset, hint_stream_length) = match dict.get(&Name::h()) {
275        Some(Object::Array(arr)) if arr.len() >= 2 => {
276            let offset = arr[0].as_i64().filter(|&n| n > 0).map(|n| n as u64);
277            let length = arr[1].as_i64().filter(|&n| n > 0).map(|n| n as u32);
278            (offset, length)
279        }
280        _ => (None, None),
281    };
282
283    // The last_xref_offset in the upstream corresponds to the parser position
284    // after `endobj` on the linearization object. We approximate it here as
285    // the end of the parsed object region (the offset after the first object).
286    // This is the "szLastXRefOffset" passed to the CPDF_LinearizedHeader
287    // constructor. For our purposes we use the same value as
288    // main_xref_table_first_entry_offset since we don't track the parser
289    // position through indirect object parsing.
290    let last_xref_offset = main_xref_table_first_entry_offset;
291
292    Some(LinearizedInfo {
293        file_length,
294        primary_page,
295        main_xref_table_first_entry_offset,
296        page_count,
297        first_page_end_offset,
298        first_page_obj_num,
299        last_xref_offset,
300        hint_stream_offset,
301        hint_stream_length,
302    })
303}
304
#[cfg(test)]
mod tests {
    use super::*;

    /// Header line shared by every synthetic PDF built in this module.
    const HEADER: &[u8] = b"%PDF-1.4\n";

    /// Empty second object placed after the linearization dictionary so the
    /// buffer contains more than a single object.
    const SECOND_OBJECT: &[u8] = b"2 0 obj\n<< >>\nendobj\n";

    /// Builds a small non-linearized PDF: header, one catalog object, a
    /// classic xref table, trailer and `startxref`.
    fn classic_pdf() -> Vec<u8> {
        let mut bytes = HEADER.to_vec();
        let catalog_at = bytes.len();
        bytes.extend_from_slice(b"1 0 obj\n<< /Type /Catalog >>\nendobj\n");
        let xref_at = bytes.len();
        bytes.extend_from_slice(b"xref\n0 2\n");
        bytes.extend_from_slice(b"0000000000 65535 f \r\n");
        bytes.extend_from_slice(format!("{:010} 00000 n \r\n", catalog_at).as_bytes());
        bytes.extend_from_slice(b"trailer\n<< /Size 2 /Root 1 0 R >>\n");
        bytes.extend_from_slice(format!("startxref\n{}\n%%EOF", xref_at).as_bytes());
        bytes
    }

    /// Builds `header + first_object + trailing` as one byte buffer.
    fn pdf_starting_with(first_object: &[u8], trailing: &[u8]) -> Vec<u8> {
        [HEADER, first_object, trailing].concat()
    }

    /// Creates a `LinearizedInfo` with fixed offsets, varying only the page
    /// count and the optional `(hint offset, hint length)` pair.
    fn info_with(page_count: u32, hint: Option<(u64, u32)>) -> LinearizedInfo {
        LinearizedInfo {
            file_length: 50000,
            primary_page: 0,
            main_xref_table_first_entry_offset: 2000,
            page_count,
            first_page_end_offset: 1500,
            first_page_obj_num: 2,
            last_xref_offset: 2000,
            hint_stream_offset: hint.map(|(offset, _)| offset),
            hint_stream_length: hint.map(|(_, length)| length),
        }
    }

    #[test]
    fn test_detect_non_linearized() {
        assert!(detect_linearized(&classic_pdf(), ParsingMode::Lenient).is_none());
    }

    #[test]
    fn test_detect_linearized_pdf() {
        // Only the first object matters for detection; the second one merely
        // pads the buffer.
        let pdf = pdf_starting_with(
            b"1 0 obj\n<< /Linearized 1.0 /L 12345 /T 1000 /N 5 /E 800 /O 3 /P 0 >>\nendobj\n",
            SECOND_OBJECT,
        );

        let info = detect_linearized(&pdf, ParsingMode::Lenient)
            .expect("linearization dictionary should be detected");

        assert_eq!(info.file_size(), 12345);
        assert_eq!(info.page_count(), 5);
        assert_eq!(info.first_page_no(), 0);
        assert_eq!(info.first_page_end_offset(), 800);
        assert_eq!(info.first_page_obj_num(), 3);
        assert_eq!(info.main_xref_table_first_entry_offset(), 1000);
        assert!(info.hint_start().is_none());
        assert!(info.hint_length().is_none());
    }

    #[test]
    fn test_detect_linearized_with_h_array_2_values() {
        let pdf = pdf_starting_with(
            b"1 0 obj\n<< /Linearized 1.0 /L 50000 /T 2000 /N 10 /E 1500 /O 2 /H [500 120] >>\nendobj\n",
            SECOND_OBJECT,
        );

        let info = detect_linearized(&pdf, ParsingMode::Lenient).unwrap();
        assert_eq!(info.hint_start(), Some(500));
        assert_eq!(info.hint_length(), Some(120));
    }

    #[test]
    fn test_detect_linearized_with_h_array_4_values() {
        let pdf = pdf_starting_with(
            b"1 0 obj\n<< /Linearized 1.0 /L 50000 /T 2000 /N 10 /E 1500 /O 2 /H [500 120 700 80] >>\nendobj\n",
            SECOND_OBJECT,
        );

        let info = detect_linearized(&pdf, ParsingMode::Lenient).unwrap();
        assert_eq!(info.hint_start(), Some(500));
        assert_eq!(info.hint_length(), Some(120));
    }

    #[test]
    fn test_non_linearized_has_no_hint_fields() {
        assert!(detect_linearized(&classic_pdf(), ParsingMode::Lenient).is_none());
    }

    #[test]
    fn test_detect_linearized_missing_keys() {
        // The dictionary has /Linearized but none of /L, /N, /E, /O, /T.
        let pdf = pdf_starting_with(b"1 0 obj\n<< /Linearized 1.0 >>\nendobj\n", b"");

        assert!(detect_linearized(&pdf, ParsingMode::Lenient).is_none());
    }

    #[test]
    fn test_has_hint_table_true() {
        assert!(info_with(5, Some((500, 120))).has_hint_table());
    }

    #[test]
    fn test_has_hint_table_false_single_page() {
        assert!(!info_with(1, Some((500, 120))).has_hint_table());
    }

    #[test]
    fn test_has_hint_table_false_no_hint() {
        assert!(!info_with(5, None).has_hint_table());
    }
}