Skip to main content

rpdfium_parser/
linearized_header.rs

1// Derived from PDFium's cpdf_linearized_header.cpp
2// Original: Copyright 2014 The PDFium Authors
3// Licensed under BSD-3-Clause / Apache-2.0
4// See pdfium-upstream/LICENSE for the original license.
5
6//! Linearization detection.
7//!
8//! A linearized PDF has its first indirect object contain a `/Linearized` key
9//! in its dictionary. This module detects this marker and extracts the
10//! linearization parameters.
11
12use rpdfium_core::{Name, ParsingMode};
13
14use crate::header::parse_header;
15use crate::object::Object;
16use crate::object_parser::parse_indirect_object;
17
/// Parameters read from a PDF's linearization dictionary.
///
/// Values are stored as found in the dictionary of the file's first indirect
/// object; they are not cross-checked against the actual file contents.
///
/// The type is plain data, so it derives equality in addition to `Debug` and
/// `Clone`, which lets callers and tests compare two values directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LinearizedInfo {
    /// Total file length in bytes (`/L`).
    pub file_length: u64,
    /// Primary page number (`/P`), typically 0.
    pub primary_page: u32,
    /// Total page count (`/N`).
    pub page_count: u32,
    /// Byte offset of the first page's data (`/O`).
    pub first_page_offset: u64,
    /// Byte offset of the hint stream (`/H` array, first element).
    ///
    /// `None` when `/H` is absent, too short, or the element is not a
    /// positive integer.
    pub hint_stream_offset: Option<u64>,
    /// Length of the hint stream in bytes (`/H` array, second element).
    ///
    /// `None` when `/H` is absent, too short, or the element is not a
    /// positive integer.
    pub hint_stream_length: Option<u32>,
}
34
35/// Detect whether a PDF is linearized by checking the first indirect object.
36///
37/// Returns `Some(LinearizedInfo)` if the file is linearized, `None` otherwise.
38pub fn detect_linearized(source: &[u8], mode: ParsingMode) -> Option<LinearizedInfo> {
39    // Parse the header to find where objects start
40    let (_version, header_end) = parse_header(source, mode).ok()?;
41
42    // Skip any binary comment line (common: % followed by 4+ high bytes)
43    let mut pos = header_end as usize;
44    while pos < source.len()
45        && (source[pos] == b'%'
46            || source[pos] == b'\r'
47            || source[pos] == b'\n'
48            || source[pos] > 127)
49    {
50        // Skip the comment line
51        if source[pos] == b'%' {
52            while pos < source.len() && source[pos] != b'\r' && source[pos] != b'\n' {
53                pos += 1;
54            }
55        }
56        while pos < source.len() && (source[pos] == b'\r' || source[pos] == b'\n') {
57            pos += 1;
58        }
59    }
60
61    // Try to parse the first indirect object
62    let (_id, obj) = parse_indirect_object(source, pos as u64, mode).ok()?;
63
64    // Check if the dictionary contains /Linearized
65    let dict = obj.as_dict()?;
66    let _linearized = dict.get(&Name::from("Linearized"))?;
67
68    // Extract linearization parameters
69    let file_length = match dict.get(&Name::from_bytes(b"L".to_vec())) {
70        Some(Object::Integer(n)) if *n > 0 => *n as u64,
71        _ => return None,
72    };
73
74    let page_count = match dict.get(&Name::n()) {
75        Some(Object::Integer(n)) if *n > 0 => *n as u32,
76        _ => return None,
77    };
78
79    let primary_page = match dict.get(&Name::from_bytes(b"P".to_vec())) {
80        Some(Object::Integer(n)) if *n >= 0 => *n as u32,
81        _ => 0, // Default to 0 if not present
82    };
83
84    let first_page_offset = match dict.get(&Name::from_bytes(b"O".to_vec())) {
85        Some(Object::Integer(n)) if *n > 0 => *n as u64,
86        _ => return None,
87    };
88
89    // Parse /H array: [offset, length] or [offset, length, shared_offset, shared_length]
90    let (hint_stream_offset, hint_stream_length) = match dict.get(&Name::h()) {
91        Some(Object::Array(arr)) if arr.len() >= 2 => {
92            let offset = arr[0].as_i64().filter(|&n| n > 0).map(|n| n as u64);
93            let length = arr[1].as_i64().filter(|&n| n > 0).map(|n| n as u32);
94            (offset, length)
95        }
96        _ => (None, None),
97    };
98
99    Some(LinearizedInfo {
100        file_length,
101        primary_page,
102        page_count,
103        first_page_offset,
104        hint_stream_offset,
105        hint_stream_length,
106    })
107}
108
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a small classic PDF whose only object is a `/Catalog`
    /// dictionary, followed by an xref table, trailer, and `%%EOF`.
    fn catalog_only_pdf() -> Vec<u8> {
        let mut bytes = b"%PDF-1.4\n".to_vec();
        let obj_offset = bytes.len();
        bytes.extend_from_slice(b"1 0 obj\n<< /Type /Catalog >>\nendobj\n");
        let xref_offset = bytes.len();
        bytes.extend_from_slice(b"xref\n0 2\n");
        bytes.extend_from_slice(b"0000000000 65535 f \r\n");
        bytes.extend_from_slice(format!("{:010} 00000 n \r\n", obj_offset).as_bytes());
        bytes.extend_from_slice(b"trailer\n<< /Size 2 /Root 1 0 R >>\n");
        bytes.extend_from_slice(format!("startxref\n{}\n%%EOF", xref_offset).as_bytes());
        bytes
    }

    /// Build a `%PDF-1.4` header immediately followed by `body`.
    ///
    /// Only the first object matters to these tests, so no xref table or
    /// trailer is appended.
    fn pdf_with_body(body: &[u8]) -> Vec<u8> {
        let mut bytes = b"%PDF-1.4\n".to_vec();
        bytes.extend_from_slice(body);
        bytes
    }

    #[test]
    fn detect_non_linearized() {
        let pdf = catalog_only_pdf();
        assert!(detect_linearized(&pdf, ParsingMode::Lenient).is_none());
    }

    #[test]
    fn detect_linearized_pdf() {
        let pdf = pdf_with_body(
            b"1 0 obj\n<< /Linearized 1.0 /L 12345 /N 5 /O 100 /P 0 >>\nendobj\n\
              2 0 obj\n<< >>\nendobj\n",
        );

        let result = detect_linearized(&pdf, ParsingMode::Lenient);
        assert!(result.is_some());

        let info = result.unwrap();
        assert_eq!(info.file_length, 12345);
        assert_eq!(info.page_count, 5);
        assert_eq!(info.primary_page, 0);
        assert_eq!(info.first_page_offset, 100);
        assert_eq!(info.hint_stream_offset, None);
        assert_eq!(info.hint_stream_length, None);
    }

    #[test]
    fn detect_linearized_with_h_array_2_values() {
        let pdf = pdf_with_body(
            b"1 0 obj\n<< /Linearized 1.0 /L 50000 /N 10 /O 200 /H [500 120] >>\nendobj\n\
              2 0 obj\n<< >>\nendobj\n",
        );

        let info = detect_linearized(&pdf, ParsingMode::Lenient).unwrap();
        assert_eq!(info.hint_stream_offset, Some(500));
        assert_eq!(info.hint_stream_length, Some(120));
    }

    #[test]
    fn detect_linearized_with_h_array_4_values() {
        // The shared-object pair (third and fourth values) is not extracted.
        let pdf = pdf_with_body(
            b"1 0 obj\n<< /Linearized 1.0 /L 50000 /N 10 /O 200 /H [500 120 700 80] >>\nendobj\n\
              2 0 obj\n<< >>\nendobj\n",
        );

        let info = detect_linearized(&pdf, ParsingMode::Lenient).unwrap();
        assert_eq!(info.hint_stream_offset, Some(500));
        assert_eq!(info.hint_stream_length, Some(120));
    }

    #[test]
    fn non_linearized_has_no_hint_fields() {
        // Without a linearization dictionary there is no info value at all,
        // and therefore no hint fields.
        let pdf = catalog_only_pdf();
        assert!(detect_linearized(&pdf, ParsingMode::Lenient).is_none());
    }

    #[test]
    fn detect_linearized_missing_keys() {
        // /Linearized alone is not enough: /L, /N and /O are missing.
        let pdf = pdf_with_body(b"1 0 obj\n<< /Linearized 1.0 >>\nendobj\n");

        assert!(detect_linearized(&pdf, ParsingMode::Lenient).is_none());
    }
}