Skip to main content

rpdfium_parser/
hint_tables.rs

1// Derived from PDFium's cpdf_hint_tables.cpp
2// Original: Copyright 2014 The PDFium Authors
3// Licensed under BSD-3-Clause / Apache-2.0
4// See pdfium-upstream/LICENSE for the original license.
5
6//! Linearization hint tables -- page offset and shared object hint table parsing.
7//!
8//! Linearized PDFs contain hint tables that allow page-at-a-time retrieval
9//! without downloading the entire file. The hint stream (referenced by the `/H`
10//! array in the linearization dictionary) is a bit-packed binary structure
11//! containing per-page and per-shared-object metadata.
12//!
13//! Reference: PDF 32000-1:2008, Annex F (Linearized PDF).
14
15use rpdfium_core::cfx_bitstream::BitReader;
16
/// Per-page entry in the page offset hint table.
///
/// Each field holds the raw value read from the bit-packed hint stream; the
/// two `*_delta` fields are relative to the minimums stored in
/// [`PageOffsetHintTable`].
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct PageHintEntry {
    /// Number of objects in this page, stored as a delta from
    /// [`PageOffsetHintTable::min_objects`].
    pub objects_delta: u32,
    /// Length of this page in bytes, stored as a delta from
    /// [`PageOffsetHintTable::min_page_length`].
    pub length_delta: u32,
    /// Number of shared objects referenced by this page.
    pub shared_objects_count: u32,
}

/// The page offset hint table (PDF spec Annex F.3).
///
/// Holds the header fields the parser keeps plus one [`PageHintEntry`] per
/// page.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PageOffsetHintTable {
    /// Minimum number of objects in a page.
    pub min_objects: u32,
    /// Location of first page's object (offset from start of file).
    ///
    /// The header stores this as a 32-bit value; it is widened to `u64` here.
    pub first_page_offset: u64,
    /// Minimum page length in bytes.
    pub min_page_length: u32,
    /// Number of bits used in the stream for each [`PageHintEntry::objects_delta`].
    pub bits_objects_delta: u8,
    /// Number of bits used in the stream for each [`PageHintEntry::length_delta`].
    pub bits_length_delta: u8,
    /// Per-page entries, indexed by zero-based page number.
    pub entries: Vec<PageHintEntry>,
}

/// Per-shared-object entry in the shared object hint table.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct SharedHintEntry {
    /// Length of the shared object in bytes, as read from the hint stream.
    pub length: u32,
}

/// The shared object hint table (PDF spec Annex F.4).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SharedObjectHintTable {
    /// First shared object number.
    pub first_object: u32,
    /// Location of first shared object.
    ///
    /// The header stores this as a 32-bit value; it is widened to `u64` here.
    pub first_offset: u64,
    /// Number of shared object entries declared by the table header.
    pub count: u32,
    /// Per-object entries; a successfully parsed table has `count` of them.
    pub entries: Vec<SharedHintEntry>,
}

/// Parsed hint tables from a linearized PDF.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HintTables {
    /// Page offset hint table.
    pub page_offset: PageOffsetHintTable,
    /// Shared object hint table; `None` when it is absent or could not be
    /// parsed.
    pub shared_objects: Option<SharedObjectHintTable>,
}
73
74impl HintTables {
75    /// Parse hint tables from the decoded hint stream data.
76    ///
77    /// The `page_count` must be provided from the linearization dictionary (`/N`).
78    ///
79    /// Returns `None` if the data is too short or malformed.
80    pub fn parse(data: &[u8], page_count: u32) -> Option<Self> {
81        if data.is_empty() || page_count == 0 {
82            return None;
83        }
84
85        let mut reader = BitReader::new(data);
86
87        // --- Page Offset Hint Table Header (13 fields per PDF spec F.3) ---
88        // 1. Minimum number of objects in a page (32 bits)
89        let min_objects = reader.read_u32()?;
90        // 2. Location of first page's page object (32 bits)
91        let first_page_offset_low = reader.read_u32()?;
92        // 3. Number of bits needed to represent diff from min objects (16 bits)
93        let bits_objects_delta = reader.read_u16()? as u8;
94        // 4. Minimum page length (32 bits)
95        let min_page_length = reader.read_u32()?;
96        // 5. Number of bits needed to represent diff from min length (16 bits)
97        let bits_length_delta = reader.read_u16()? as u8;
98        // 6. Minimum offset to start of content stream (32 bits)
99        let _min_content_offset = reader.read_u32()?;
100        // 7. Number of bits for content stream offset delta (16 bits)
101        let _bits_content_offset_delta = reader.read_u16()?;
102        // 8. Minimum content stream length (32 bits)
103        let _min_content_length = reader.read_u32()?;
104        // 9. Number of bits for content stream length delta (16 bits)
105        let _bits_content_length_delta = reader.read_u16()?;
106        // 10. Number of bits for number of shared object references (16 bits)
107        let bits_shared_count = reader.read_u16()? as u8;
108        // 11. Number of bits for shared object identifier (16 bits)
109        let _bits_shared_id = reader.read_u16()?;
110        // 12. Number of bits for numerator of fractional page position (16 bits)
111        let _bits_numerator = reader.read_u16()?;
112        // 13. Denominator of fractional page position (16 bits)
113        let _denominator = reader.read_u16()?;
114
115        // Validate bit widths won't cause issues
116        if bits_objects_delta > 32 || bits_length_delta > 32 || bits_shared_count > 32 {
117            return None;
118        }
119
120        // --- Per-page entries ---
121        let n = page_count as usize;
122        let mut entries = Vec::with_capacity(n);
123
124        // Item 1: objects delta for each page
125        let mut objects_deltas = Vec::with_capacity(n);
126        for _ in 0..n {
127            objects_deltas.push(reader.read_bits(bits_objects_delta)?);
128        }
129
130        // Item 2: page length delta for each page
131        let mut length_deltas = Vec::with_capacity(n);
132        for _ in 0..n {
133            length_deltas.push(reader.read_bits(bits_length_delta)?);
134        }
135
136        // Item 3: shared object count for each page
137        let mut shared_counts = Vec::with_capacity(n);
138        for _ in 0..n {
139            shared_counts.push(reader.read_bits(bits_shared_count)?);
140        }
141
142        // Build per-page entries
143        for i in 0..n {
144            entries.push(PageHintEntry {
145                objects_delta: objects_deltas[i],
146                length_delta: length_deltas[i],
147                shared_objects_count: shared_counts[i],
148            });
149        }
150
151        let page_offset = PageOffsetHintTable {
152            min_objects,
153            first_page_offset: u64::from(first_page_offset_low),
154            min_page_length,
155            bits_objects_delta,
156            bits_length_delta,
157            entries,
158        };
159
160        // Align to byte boundary before shared object table
161        reader.byte_align();
162
163        // --- Shared Object Hint Table (optional, PDF spec F.4) ---
164        let shared_objects = Self::parse_shared_objects(&mut reader);
165
166        Some(HintTables {
167            page_offset,
168            shared_objects,
169        })
170    }
171
172    /// Parse the shared object hint table from the current reader position.
173    fn parse_shared_objects(reader: &mut BitReader<'_>) -> Option<SharedObjectHintTable> {
174        // Check if there's enough data remaining for the header (3×u32 + 1×u16 = 112 bits)
175        if reader.bits_remaining() < 32 * 3 + 16 {
176            return None;
177        }
178
179        // 1. First shared object number (32 bits)
180        let first_object = reader.read_u32()?;
181        // 2. Location of first shared object (32 bits)
182        let first_offset_low = reader.read_u32()?;
183        // 3. Number of shared object entries (32 bits)
184        let count = reader.read_u32()?;
185        // 4. Number of bits for object length (16 bits)
186        let bits_length = reader.read_u16()? as u8;
187
188        if bits_length > 32 {
189            return None;
190        }
191
192        // Safety limit on count
193        if count > rpdfium_core::fx_system::MAX_OBJECT_NUMBER {
194            return None;
195        }
196
197        let mut entries = Vec::with_capacity(count as usize);
198        for _ in 0..count {
199            let length = reader.read_bits(bits_length)?;
200            entries.push(SharedHintEntry { length });
201        }
202
203        Some(SharedObjectHintTable {
204            first_object,
205            first_offset: u64::from(first_offset_low),
206            count,
207            entries,
208        })
209    }
210
211    /// Calculate the byte range (offset, length) for a given page index.
212    ///
213    /// Returns `None` if the page index is out of range.
214    pub fn page_byte_range(&self, page: usize) -> Option<(u64, u32)> {
215        let entry = self.page_offset.entries.get(page)?;
216        let page_length = self.page_offset.min_page_length + entry.length_delta;
217
218        // Calculate offset: first page offset + sum of all preceding page lengths
219        let mut offset = self.page_offset.first_page_offset;
220        for i in 0..page {
221            let prev = &self.page_offset.entries[i];
222            offset += u64::from(self.page_offset.min_page_length + prev.length_delta);
223        }
224
225        Some((offset, page_length))
226    }
227}
228
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses a hand-built two-page hint stream (no shared object table) and
    /// checks every retained header field and per-page value.
    #[test]
    fn test_hint_tables_parse_with_constructed_data() {
        let mut stream = Vec::new();

        // Page offset header, fields 1-13 in stream order.
        push_u32(&mut stream, 3); // 1. min_objects
        push_u32(&mut stream, 1000); // 2. first_page_offset
        push_u16(&mut stream, 2); // 3. bits_objects_delta
        push_u32(&mut stream, 500); // 4. min_page_length
        push_u16(&mut stream, 4); // 5. bits_length_delta
        push_u32(&mut stream, 100); // 6. min_content_offset
        push_u16(&mut stream, 0); // 7. bits_content_offset_delta
        push_u32(&mut stream, 200); // 8. min_content_length
        push_u16(&mut stream, 0); // 9. bits_content_length_delta
        push_u16(&mut stream, 1); // 10. bits_shared_count
        push_u16(&mut stream, 0); // 11. bits_shared_id
        push_u16(&mut stream, 0); // 12. bits_numerator
        push_u16(&mut stream, 1); // 13. denominator

        // Per-page items, one column at a time (page 0 first, then page 1).
        for &objects_delta in &[1, 2] {
            push_bits(&mut stream, objects_delta, 2);
        }
        for &length_delta in &[5, 10] {
            push_bits(&mut stream, length_delta, 4);
        }
        for &shared_count in &[0, 1] {
            push_bits(&mut stream, shared_count, 1);
        }

        let packed = bits_to_bytes(&stream);
        let parsed = HintTables::parse(&packed, 2).unwrap();
        let table = &parsed.page_offset;

        assert_eq!(table.min_objects, 3);
        assert_eq!(table.first_page_offset, 1000);
        assert_eq!(table.min_page_length, 500);
        assert_eq!(table.bits_objects_delta, 2);
        assert_eq!(table.bits_length_delta, 4);
        assert_eq!(table.entries.len(), 2);

        // (objects_delta, length_delta, shared_objects_count) per page.
        let rows: Vec<(u32, u32, u32)> = table
            .entries
            .iter()
            .map(|e| (e.objects_delta, e.length_delta, e.shared_objects_count))
            .collect();
        assert_eq!(rows, vec![(1, 5, 0), (2, 10, 1)]);
    }

    /// Offsets accumulate the lengths of all preceding pages.
    #[test]
    fn test_page_byte_range_calculation() {
        // Length deltas over a 500-byte minimum: pages of 600, 700 and 500 bytes.
        let entries = [100, 200, 0]
            .iter()
            .map(|&length_delta| PageHintEntry {
                length_delta,
                ..Default::default()
            })
            .collect();

        let tables = HintTables {
            page_offset: PageOffsetHintTable {
                min_objects: 3,
                first_page_offset: 1000,
                min_page_length: 500,
                bits_objects_delta: 0,
                bits_length_delta: 0,
                entries,
            },
            shared_objects: None,
        };

        // (page, expected offset, expected length)
        let expected: [(usize, u64, u32); 3] = [
            (0, 1000, 600), // starts at first_page_offset
            (1, 1600, 700), // 1000 + 600
            (2, 2300, 500), // 1600 + 700
        ];
        for &(page, offset, len) in &expected {
            assert_eq!(tables.page_byte_range(page), Some((offset, len)));
        }

        // Out of range
        assert!(tables.page_byte_range(3).is_none());
    }

    #[test]
    fn test_empty_hint_stream_returns_none() {
        let no_data: [u8; 0] = [];
        assert!(HintTables::parse(&no_data, 1).is_none());
    }

    #[test]
    fn test_zero_page_count_returns_none() {
        let zeros = [0u8; 100];
        assert!(HintTables::parse(&zeros, 0).is_none());
    }

    #[test]
    fn test_truncated_hint_stream_returns_none() {
        // Ten bytes cannot hold the 13-field header.
        let short = [0u8; 10];
        assert!(HintTables::parse(&short, 1).is_none());
    }

    // -----------------------------------------------------------------------
    // Test helpers for constructing bit-packed data
    // -----------------------------------------------------------------------

    /// Append `value` as 32 bits, most significant bit first.
    fn push_u32(bits: &mut Vec<u8>, value: u32) {
        push_bits(bits, value, 32);
    }

    /// Append `value` as 16 bits, most significant bit first.
    fn push_u16(bits: &mut Vec<u8>, value: u16) {
        push_bits(bits, u32::from(value), 16);
    }

    /// Append the low `count` bits of `value`, most significant bit first.
    /// Each pushed element is a single bit (0 or 1).
    fn push_bits(bits: &mut Vec<u8>, value: u32, count: u8) {
        bits.extend((0..count).rev().map(|shift| ((value >> shift) & 1) as u8));
    }

    /// Convert a vector of individual bits (0 or 1) into packed bytes.
    ///
    /// Bits fill each byte from the most significant position; a trailing
    /// partial byte is padded with zero bits.
    fn bits_to_bytes(bits: &[u8]) -> Vec<u8> {
        bits.chunks(8)
            .map(|chunk| {
                chunk
                    .iter()
                    .enumerate()
                    .fold(0u8, |byte, (i, &bit)| byte | (bit << (7 - i)))
            })
            .collect()
    }
}