Skip to main content

justpdf_core/
linearized.rs

//! Linearized PDF detection, parameter parsing, and hint table support (PDF spec Annex F).
//!
//! This module detects whether a PDF is linearized, extracts the linearization
//! parameters from the linearization parameter dictionary, and parses page offset
//! hint tables. Linearized PDF generation is in [`crate::writer::linearize`].
6
use std::convert::TryFrom;

use crate::object::{parse_indirect_object, PdfObject};
use crate::parser::PdfDocument;
use crate::tokenizer::Tokenizer;
10
/// Linearization parameters read from the linearization dictionary.
///
/// Each field mirrors one key of the dictionary; the key is given in
/// parentheses. `PartialEq` is derived so two parameter sets can be compared
/// as a whole rather than field by field.
#[derive(Debug, Clone, PartialEq)]
pub struct LinearizationParams {
    /// File length (/L).
    pub file_length: i64,
    /// Hint stream offset (/H first element).
    pub hint_offset: i64,
    /// Hint stream length (/H second element).
    pub hint_length: i64,
    /// First page object number (/O).
    pub first_page_obj_num: u32,
    /// Offset of end of first page (/E).
    pub end_of_first_page: i64,
    /// Number of pages (/N).
    pub page_count: i64,
    /// Offset of main xref table (/T).
    pub main_xref_offset: i64,
    /// Linearization version (the value of /Linearized), typically 1.0.
    pub version: f64,
}
31
/// Find the byte offset just past the PDF header line (`%PDF-x.y` plus its line ending).
///
/// The `%PDF-` marker must lie entirely within the first 1024 bytes of `data`;
/// it does not have to start at offset 0. After the marker, the rest of the
/// line is skipped together with one line ending (`\r`, `\n`, or `\r\n`). If
/// the data ends before a line ending is seen, the returned offset equals
/// `data.len()`.
///
/// Returns `None` if no marker is found in that window.
fn skip_header(data: &[u8]) -> Option<usize> {
    const MARKER: &[u8] = b"%PDF-";
    const SEARCH_WINDOW: usize = 1024;

    // `windows` visits every start position whose marker fits in the window,
    // including the last one (a hand-written `0..len - marker_len` range
    // would drop it).
    let window = &data[..data.len().min(SEARCH_WINDOW)];
    let marker_start = window
        .windows(MARKER.len())
        .position(|candidate| candidate == MARKER)?;

    // Skip the rest of the header line (version digits etc.).
    let after_marker = marker_start + MARKER.len();
    let mut pos = data[after_marker..]
        .iter()
        .position(|&b| b == b'\n' || b == b'\r')
        .map_or(data.len(), |line_len| after_marker + line_len);

    // Consume the line ending itself: an optional CR followed by an optional LF.
    if data.get(pos) == Some(&b'\r') {
        pos += 1;
    }
    if data.get(pos) == Some(&b'\n') {
        pos += 1;
    }
    Some(pos)
}
59
/// Advance past whitespace and `%` comment lines starting at `pos`.
///
/// Whitespace here means space, tab, CR and LF. The returned offset is the
/// first byte at or after `pos` that is neither whitespace nor part of a
/// comment line, or the end of `data` if nothing else follows. A `pos` beyond
/// the end of `data` is returned unchanged.
fn skip_comments(data: &[u8], mut pos: usize) -> usize {
    /// Length of the leading run of bytes in `bytes` that satisfy `pred`.
    fn run_len(bytes: &[u8], pred: impl Fn(u8) -> bool) -> usize {
        bytes.iter().take_while(|&&b| pred(b)).count()
    }

    loop {
        let rest = match data.get(pos..) {
            Some(rest) => rest,
            None => return pos,
        };

        // Blank bytes between lines.
        pos += run_len(rest, |b| matches!(b, b' ' | b'\t' | b'\r' | b'\n'));

        if data.get(pos) != Some(&b'%') {
            return pos;
        }

        // Comment body, up to (not including) its line ending; the ending is
        // swallowed as whitespace on the next pass.
        pos += run_len(&data[pos..], |b| b != b'\n' && b != b'\r');
    }
}
80
81/// Detect whether the given PDF data represents a linearized PDF, and if so,
82/// parse and return the linearization parameters.
83///
84/// This checks if the first indirect object in the file contains a `/Linearized` key.
85pub fn detect_linearization(data: &[u8]) -> Option<LinearizationParams> {
86    let header_end = skip_header(data)?;
87    let obj_start = skip_comments(data, header_end);
88
89    if obj_start >= data.len() {
90        return None;
91    }
92
93    let mut tokenizer = Tokenizer::new_at(data, obj_start);
94    let (_iref, obj) = parse_indirect_object(&mut tokenizer).ok()?;
95
96    let dict = match &obj {
97        PdfObject::Dict(d) => d,
98        _ => return None,
99    };
100
101    // Check for the /Linearized key.
102    if !dict.contains_key(b"Linearized") {
103        return None;
104    }
105
106    let version = dict.get_f64(b"Linearized")?;
107    let file_length = dict.get_i64(b"L")?;
108    let first_page_obj_num = dict.get_i64(b"O")? as u32;
109    let end_of_first_page = dict.get_i64(b"E")?;
110    let page_count = dict.get_i64(b"N")?;
111    let main_xref_offset = dict.get_i64(b"T")?;
112
113    // /H is an array of 2 integers: [offset length].
114    let h_array = dict.get_array(b"H")?;
115    if h_array.len() < 2 {
116        return None;
117    }
118    let hint_offset = h_array[0].as_i64()?;
119    let hint_length = h_array[1].as_i64()?;
120
121    Some(LinearizationParams {
122        file_length,
123        hint_offset,
124        hint_length,
125        first_page_obj_num,
126        end_of_first_page,
127        page_count,
128        main_xref_offset,
129        version,
130    })
131}
132
133/// Simple check: returns `true` if the PDF data appears to be linearized.
134pub fn is_linearized(data: &[u8]) -> bool {
135    detect_linearization(data).is_some()
136}
137
138/// Alternative API that works from a parsed `PdfDocument`.
139///
140/// Finds the first object (usually obj 1) and checks for `/Linearized`.
141/// This re-parses from the raw data since the linearization dict is always the
142/// first indirect object in the file.
143pub fn read_linearization(doc: &PdfDocument) -> Option<LinearizationParams> {
144    detect_linearization(doc.raw_data())
145}
146
// ---------------------------------------------------------------------------
// Hint table types and parsing
// ---------------------------------------------------------------------------
150
/// Page offset hint table entry.
///
/// Each entry describes the location and size of a page's objects within the
/// linearized PDF body. Used by viewers to seek directly to a specific page.
///
/// `PartialEq`/`Eq` are derived so entries (and vectors of entries) can be
/// compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PageOffsetHint {
    /// Byte offset of the page's objects within the file.
    pub offset: u64,
    /// Total byte length of the page's objects.
    pub length: u64,
    /// Number of objects belonging to this page.
    pub num_objects: u32,
}
164
/// A bit reader for extracting variable-width fields from a hint stream.
///
/// Bits are consumed most-significant-bit first within each byte.
struct BitReader<'a> {
    /// Bytes being read.
    data: &'a [u8],
    /// Index of the byte currently being consumed.
    byte_pos: usize,
    /// Bits already consumed in the current byte (0..8, MSB first).
    bit_pos: u8,
}

impl<'a> BitReader<'a> {
    /// Create a reader positioned at the first bit of `data`.
    fn new(data: &'a [u8]) -> Self {
        Self {
            data,
            byte_pos: 0,
            bit_pos: 0,
        }
    }

    /// Read `n_bits` (up to 64) from the stream in MSB-first order.
    ///
    /// Returns `Some(0)` without consuming anything when `n_bits` is 0.
    ///
    /// Returns `None` when `n_bits` exceeds 64 (the value could not be
    /// represented in the `u64` result; nothing is consumed in that case), or
    /// when the data runs out before `n_bits` bits have been read (the bits
    /// read up to that point stay consumed).
    fn read_bits(&mut self, n_bits: u32) -> Option<u64> {
        if n_bits > 64 {
            return None;
        }

        let mut result: u64 = 0;
        let mut remaining = n_bits;
        while remaining > 0 {
            let byte = *self.data.get(self.byte_pos)?;
            let avail = 8 - u32::from(self.bit_pos);
            let take = remaining.min(avail);
            // Move the wanted bits to the bottom of the byte, then mask off
            // whatever was above them. `take` is at most 8, so the mask is
            // computed in u16 to avoid overflowing `1 << 8`.
            let shift = avail - take;
            let mask = ((1u16 << take) - 1) as u8;
            let bits = (byte >> shift) & mask;
            result = (result << take) | u64::from(bits);
            remaining -= take;
            self.bit_pos += take as u8;
            if self.bit_pos >= 8 {
                self.bit_pos = 0;
                self.byte_pos += 1;
            }
        }
        Some(result)
    }

    /// Advance to the next byte boundary.
    ///
    /// Does nothing if the reader is already on a byte boundary.
    fn align(&mut self) {
        if self.bit_pos > 0 {
            self.bit_pos = 0;
            self.byte_pos += 1;
        }
    }
}
217
/// Read a 32-bit big-endian unsigned integer from a byte slice at the given offset.
///
/// Returns `None` if fewer than four bytes are available at `offset`,
/// including when `offset + 4` would overflow `usize`.
fn read_u32_be(data: &[u8], offset: usize) -> Option<u32> {
    // `checked_add` keeps an absurd offset from overflowing before the bounds
    // check; `get` then performs that check without a panic path.
    let end = offset.checked_add(4)?;
    let bytes = data.get(offset..end)?;
    Some(u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]))
}
230
231/// Parse the page offset hint table from a linearized PDF hint stream.
232///
233/// `data` is the raw (decoded) hint stream bytes. `params` provides the
234/// linearization parameters (especially `page_count`).
235///
236/// The page offset hint table header (PDF spec F.3) begins at byte 0 of
237/// the hint stream with a series of 32-bit big-endian fields describing
238/// minimums and bit widths, followed by per-page variable-bit entries.
239///
240/// Returns `None` if the hint stream is too short or malformed.
241pub fn parse_hint_tables(data: &[u8], params: &LinearizationParams) -> Option<Vec<PageOffsetHint>> {
242    let n_pages = params.page_count as usize;
243    if n_pages == 0 {
244        return Some(Vec::new());
245    }
246
247    // The page offset hint table header consists of several 32-bit fields.
248    // Per the spec (Table F.1), we need at least 9 x 4 = 36 bytes for the
249    // header items we use.
250    if data.len() < 36 {
251        return None;
252    }
253
254    // Header fields (Table F.1):
255    // Item 1: min obj per page (4 bytes)
256    let min_objects = read_u32_be(data, 0)?;
257    // Item 2: offset of first page's objects (4 bytes) - location field
258    let first_page_offset = read_u32_be(data, 4)? as u64;
259    // Item 3: bits needed to represent delta-objects (4 bytes)
260    let bits_delta_objects = read_u32_be(data, 8)?;
261    // Item 4: min page length (4 bytes)
262    let min_page_length = read_u32_be(data, 12)? as u64;
263    // Item 5: bits needed for delta-page-length (4 bytes)
264    let bits_delta_length = read_u32_be(data, 16)?;
265    // Item 6: min offset to content stream (4 bytes) - skip for basic parsing
266    let _min_content_offset = read_u32_be(data, 20)?;
267    // Item 7: bits for delta-content-offset (4 bytes) - skip
268    let _bits_delta_content = read_u32_be(data, 24)?;
269    // Item 8: min content stream length (4 bytes) - skip
270    let _min_content_length = read_u32_be(data, 28)?;
271    // Item 9: bits for delta-content-length (4 bytes) - skip
272    let _bits_delta_content_len = read_u32_be(data, 32)?;
273
274    // Per-page data starts at byte 36
275    let per_page_data = &data[36..];
276    let mut reader = BitReader::new(per_page_data);
277
278    // Section 1: number-of-objects deltas
279    let mut num_objects: Vec<u32> = Vec::with_capacity(n_pages);
280    for _ in 0..n_pages {
281        let delta = reader.read_bits(bits_delta_objects)? as u32;
282        num_objects.push(min_objects + delta);
283    }
284    reader.align();
285
286    // Section 2: page-length deltas
287    let mut lengths: Vec<u64> = Vec::with_capacity(n_pages);
288    for _ in 0..n_pages {
289        let delta = reader.read_bits(bits_delta_length)? as u64;
290        lengths.push(min_page_length + delta);
291    }
292
293    // Compute offsets: pages are laid out sequentially starting from
294    // first_page_offset. The first page in the hint table is page 0;
295    // the linearized first-page (at a known offset) is entry 0.
296    let mut offsets: Vec<u64> = Vec::with_capacity(n_pages);
297    let mut running = first_page_offset;
298    for i in 0..n_pages {
299        offsets.push(running);
300        running += lengths[i];
301    }
302
303    let mut entries: Vec<PageOffsetHint> = Vec::with_capacity(n_pages);
304    for i in 0..n_pages {
305        entries.push(PageOffsetHint {
306            offset: offsets[i],
307            length: lengths[i],
308            num_objects: num_objects[i],
309        });
310    }
311
312    Some(entries)
313}
314
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the bytes of a minimal linearized PDF: a `%PDF-1.7` header
    /// followed by object 1, a linearization dictionary filled in from the
    /// arguments.
    fn make_linearized_pdf(
        file_length: i64,
        hint_offset: i64,
        hint_length: i64,
        first_page_obj: u32,
        end_first_page: i64,
        page_count: i64,
        main_xref: i64,
    ) -> Vec<u8> {
        let text = format!(
            "%PDF-1.7\n\
             1 0 obj\n\
             << /Linearized 1.0 /L {file_length} /H [{hint_offset} {hint_length}] \
             /O {first_page_obj} /E {end_first_page} /N {page_count} /T {main_xref} >>\n\
             endobj\n"
        );
        text.into_bytes()
    }

    /// Small parameter set for hint-table tests where only the page count varies.
    fn small_params(page_count: i64) -> LinearizationParams {
        LinearizationParams {
            file_length: 100,
            hint_offset: 10,
            hint_length: 5,
            first_page_obj_num: 1,
            end_of_first_page: 50,
            page_count,
            main_xref_offset: 80,
            version: 1.0,
        }
    }

    #[test]
    fn detect_linearized_pdf() {
        let pdf = make_linearized_pdf(12345, 200, 50, 5, 1000, 10, 9000);
        let found = detect_linearization(&pdf).expect("should detect linearization");
        assert_eq!(
            (found.file_length, found.hint_offset, found.hint_length),
            (12345, 200, 50)
        );
        assert_eq!(found.first_page_obj_num, 5);
        assert_eq!(
            (
                found.end_of_first_page,
                found.page_count,
                found.main_xref_offset
            ),
            (1000, 10, 9000)
        );
        assert!((found.version - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn is_linearized_true() {
        let pdf = make_linearized_pdf(500, 100, 30, 2, 400, 3, 450);
        assert!(is_linearized(&pdf));
    }

    #[test]
    fn non_linearized_pdf_returns_none() {
        let pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n";
        assert!(detect_linearization(pdf).is_none());
        assert!(!is_linearized(pdf));
    }

    #[test]
    fn non_dict_first_object_returns_none() {
        let pdf = b"%PDF-1.4\n1 0 obj\n42\nendobj\n";
        assert!(detect_linearization(pdf).is_none());
    }

    #[test]
    fn header_with_comment_line() {
        // Some PDFs have a binary comment after the header.
        let pdf = b"%PDF-1.5\n%\xE2\xE3\xCF\xD3\n\
            1 0 obj\n\
            << /Linearized 1.0 /L 5000 /H [100 20] /O 3 /E 800 /N 5 /T 4500 >>\n\
            endobj\n";
        let found = detect_linearization(pdf).expect("should detect through comment");
        assert_eq!(found.file_length, 5000);
        assert_eq!(found.page_count, 5);
    }

    #[test]
    fn short_input_does_not_panic() {
        let inputs: [&[u8]; 4] = [b"", b"%PDF", b"%PDF-1.4\n", b"%PDF-1.4\n1"];
        for input in inputs.iter() {
            assert!(detect_linearization(input).is_none());
        }
    }

    #[test]
    fn truncated_dict_does_not_panic() {
        let pdf = b"%PDF-1.4\n1 0 obj\n<< /Linearized 1.0 /L";
        assert!(detect_linearization(pdf).is_none());
    }

    #[test]
    fn missing_required_key_returns_none() {
        // Has /Linearized but missing /L.
        let pdf = b"%PDF-1.4\n1 0 obj\n\
            << /Linearized 1.0 /H [100 20] /O 3 /E 800 /N 5 /T 4500 >>\n\
            endobj\n";
        assert!(detect_linearization(pdf).is_none());
    }

    #[test]
    fn h_array_too_short_returns_none() {
        let pdf = b"%PDF-1.4\n1 0 obj\n\
            << /Linearized 1.0 /L 5000 /H [100] /O 3 /E 800 /N 5 /T 4500 >>\n\
            endobj\n";
        assert!(detect_linearization(pdf).is_none());
    }

    #[test]
    fn parse_all_params_correctly() {
        let pdf = make_linearized_pdf(999999, 512, 128, 7, 2048, 42, 88888);
        let found = detect_linearization(&pdf).unwrap();
        assert_eq!(
            (found.file_length, found.hint_offset, found.hint_length),
            (999999, 512, 128)
        );
        assert_eq!(found.first_page_obj_num, 7);
        assert_eq!(
            (
                found.end_of_first_page,
                found.page_count,
                found.main_xref_offset
            ),
            (2048, 42, 88888)
        );
    }

    #[test]
    fn version_as_real() {
        let pdf = b"%PDF-1.7\n1 0 obj\n\
            << /Linearized 1.0 /L 100 /H [10 5] /O 1 /E 50 /N 1 /T 80 >>\n\
            endobj\n";
        let found = detect_linearization(pdf).unwrap();
        assert!((found.version - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn version_as_integer() {
        // Some generators write /Linearized 1 (integer) instead of 1.0.
        let pdf = b"%PDF-1.7\n1 0 obj\n\
            << /Linearized 1 /L 100 /H [10 5] /O 1 /E 50 /N 1 /T 80 >>\n\
            endobj\n";
        let found = detect_linearization(pdf).unwrap();
        assert!((found.version - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn read_linearization_from_document() {
        let pdf = make_linearized_pdf(5000, 100, 20, 3, 800, 5, 4500);
        // PdfDocument::from_bytes requires valid xref/trailer, so we test
        // detect_linearization directly for the raw-data path.
        let found = detect_linearization(&pdf).unwrap();
        assert_eq!(found.page_count, 5);
    }

    // --- Hint table tests ---

    /// Builds a page offset hint stream in which every page is identical.
    ///
    /// The stream is just the nine-field header: both delta bit widths are 0,
    /// so the per-page sections occupy no bytes. `n_pages` is accepted for
    /// readability at call sites; the page count reaches the parser through
    /// `LinearizationParams::page_count`.
    fn make_hint_stream(
        n_pages: usize,
        objs_per_page: u32,
        first_page_offset: u32,
        page_len: u32,
    ) -> Vec<u8> {
        let _ = n_pages; // used implicitly through params.page_count
        let header: [u32; 9] = [
            objs_per_page,     // item 1: min objects per page
            first_page_offset, // item 2: first page offset
            0,                 // item 3: bits for delta-objects (0 = all same)
            page_len,          // item 4: min page length
            0,                 // item 5: bits for delta-page-length (0 = all same)
            0,                 // items 6-9: content stream fields (unused)
            0,
            0,
            0,
        ];
        let mut stream = Vec::with_capacity(header.len() * 4);
        for field in header.iter() {
            stream.extend_from_slice(&field.to_be_bytes());
        }
        stream
    }

    #[test]
    fn parse_hint_table_uniform_pages() {
        let params = LinearizationParams {
            file_length: 5000,
            hint_offset: 100,
            hint_length: 36,
            first_page_obj_num: 3,
            end_of_first_page: 800,
            main_xref_offset: 4500,
            ..small_params(3)
        };
        let stream = make_hint_stream(3, 5, 200, 400);
        let hints = parse_hint_tables(&stream, &params).unwrap();

        let seen: Vec<(u64, u64, u32)> = hints
            .iter()
            .map(|h| (h.offset, h.length, h.num_objects))
            .collect();
        assert_eq!(seen, vec![(200, 400, 5), (600, 400, 5), (1000, 400, 5)]);
    }

    #[test]
    fn parse_hint_table_zero_pages() {
        let hints = parse_hint_tables(b"", &small_params(0)).unwrap();
        assert!(hints.is_empty());
    }

    #[test]
    fn parse_hint_table_too_short() {
        // Only 20 bytes, need at least 36
        assert!(parse_hint_tables(&[0u8; 20], &small_params(2)).is_none());
    }

    #[test]
    fn bit_reader_basics() {
        // 0xA5 = 1010_0101
        let mut reader = BitReader::new(&[0xA5]);
        assert_eq!(reader.read_bits(4), Some(0b1010)); // 0xA
        assert_eq!(reader.read_bits(4), Some(0b0101)); // 0x5
    }

    #[test]
    fn bit_reader_cross_byte() {
        // 0xFF 0x00 = 1111_1111 0000_0000
        let mut reader = BitReader::new(&[0xFF, 0x00]);
        assert_eq!(reader.read_bits(4), Some(0xF));
        assert_eq!(reader.read_bits(8), Some(0xF0)); // crosses byte boundary
        assert_eq!(reader.read_bits(4), Some(0x0));
    }

    #[test]
    fn bit_reader_zero_bits() {
        let mut reader = BitReader::new(&[0xFF]);
        assert_eq!(reader.read_bits(0), Some(0));
    }

    #[test]
    fn parse_hint_table_with_deltas() {
        // Two pages; delta-objects uses 2 bits and delta-length uses 3 bits.
        //   page0: 3+1 = 4 objects, 100+5 = 105 bytes
        //   page1: 3+2 = 5 objects, 100+3 = 103 bytes
        let header: [u32; 9] = [
            3,   // min objects
            500, // first page offset
            2,   // bits for delta-objects
            100, // min page length
            3,   // bits for delta-length
            0, 0, 0, 0, // items 6-9
        ];
        let mut stream = Vec::new();
        for field in header.iter() {
            stream.extend_from_slice(&field.to_be_bytes());
        }
        // Section 1 (2 bits x 2 pages): 01 10, padded to a byte -> 0110_0000.
        // Section 2 starts on the next byte boundary (3 bits x 2 pages):
        // 101 011, padded -> 1010_1100.
        stream.extend_from_slice(&[0b0110_0000, 0b1010_1100]);

        let params = LinearizationParams {
            file_length: 5000,
            hint_offset: 100,
            hint_length: stream.len() as i64,
            first_page_obj_num: 3,
            end_of_first_page: 605,
            main_xref_offset: 4500,
            ..small_params(2)
        };

        let hints = parse_hint_tables(&stream, &params).unwrap();
        assert_eq!(hints.len(), 2);

        let (first, second) = (&hints[0], &hints[1]);
        assert_eq!(first.num_objects, 4); // 3 + 1
        assert_eq!(second.num_objects, 5); // 3 + 2
        assert_eq!(first.length, 105); // 100 + 5
        assert_eq!(second.length, 103); // 100 + 3
        assert_eq!(first.offset, 500);
        assert_eq!(second.offset, 605); // 500 + 105
    }
}
615}