1use crate::object::{parse_indirect_object, PdfObject};
8use crate::parser::PdfDocument;
9use crate::tokenizer::Tokenizer;
10
/// Values extracted from a PDF linearization parameter dictionary.
///
/// Populated by `detect_linearization` from the first indirect object of the
/// file when that object is a dictionary containing a `/Linearized` key.
#[derive(Debug, Clone)]
pub struct LinearizationParams {
    /// Value of the `/L` entry (per the field name, the total file length).
    pub file_length: i64,
    /// First element of the `/H` array (per the field name, the offset of the hint stream).
    pub hint_offset: i64,
    /// Second element of the `/H` array (per the field name, the length of the hint stream).
    pub hint_length: i64,
    /// Value of the `/O` entry, stored as a `u32` (per the field name, the
    /// object number of the first page).
    pub first_page_obj_num: u32,
    /// Value of the `/E` entry (per the field name, the offset of the end of the first page).
    pub end_of_first_page: i64,
    /// Value of the `/N` entry (per the field name, the number of pages).
    pub page_count: i64,
    /// Value of the `/T` entry (per the field name, an offset into the main cross-reference table).
    pub main_xref_offset: i64,
    /// Value of the `/Linearized` entry read as a floating-point number.
    pub version: f64,
}
31
/// Locates the `%PDF-` header and returns the offset of the byte just past
/// the header line.
///
/// Only the first 1024 bytes of `data` are searched for the marker, and the
/// marker must lie entirely inside that window. The header line ends at the
/// first `\r` or `\n`; a single `\r`, `\n` or `\r\n` terminator is consumed.
/// If the buffer ends before a terminator, the returned offset is
/// `data.len()`.
///
/// Returns `None` when no `%PDF-` marker is found in the search window.
fn skip_header(data: &[u8]) -> Option<usize> {
    const NEEDLE: &[u8] = b"%PDF-";
    const SEARCH_WINDOW: usize = 1024;

    // `windows` yields every start position whose full marker fits in the
    // window, including the last one (the previous index range stopped one
    // position short and missed a marker ending exactly at the window edge).
    let window = &data[..data.len().min(SEARCH_WINDOW)];
    let start = window.windows(NEEDLE.len()).position(|w| w == NEEDLE)?;

    // Skip the remainder of the header line (e.g. the version digits).
    let mut pos = start + NEEDLE.len();
    pos += data[pos..]
        .iter()
        .take_while(|&&b| b != b'\n' && b != b'\r')
        .count();

    // Consume exactly one line terminator: "\r", "\n" or "\r\n".
    if data.get(pos) == Some(&b'\r') {
        pos += 1;
    }
    if data.get(pos) == Some(&b'\n') {
        pos += 1;
    }
    Some(pos)
}
59
/// Advances `pos` past any mix of whitespace and `%` comment lines.
///
/// Whitespace here means space, tab, `\r` and `\n`. A comment runs from `%`
/// up to (not including) the next `\r` or `\n`. Returns the offset of the
/// first byte that is neither whitespace nor part of a comment, or the
/// position where the data ran out.
fn skip_comments(data: &[u8], mut pos: usize) -> usize {
    // Length of the run of bytes starting at `from` for which `keep` holds;
    // zero when `from` is at or past the end of `bytes`.
    fn run_len<P: Fn(u8) -> bool>(bytes: &[u8], from: usize, keep: P) -> usize {
        bytes
            .get(from..)
            .map_or(0, |tail| tail.iter().take_while(|&&b| keep(b)).count())
    }

    loop {
        pos += run_len(data, pos, |b| matches!(b, b' ' | b'\t' | b'\r' | b'\n'));
        if data.get(pos) != Some(&b'%') {
            return pos;
        }
        // Consume the comment body; its line terminator is handled as
        // whitespace on the next iteration.
        pos += run_len(data, pos, |b| b != b'\n' && b != b'\r');
    }
}
80
81pub fn detect_linearization(data: &[u8]) -> Option<LinearizationParams> {
86 let header_end = skip_header(data)?;
87 let obj_start = skip_comments(data, header_end);
88
89 if obj_start >= data.len() {
90 return None;
91 }
92
93 let mut tokenizer = Tokenizer::new_at(data, obj_start);
94 let (_iref, obj) = parse_indirect_object(&mut tokenizer).ok()?;
95
96 let dict = match &obj {
97 PdfObject::Dict(d) => d,
98 _ => return None,
99 };
100
101 if !dict.contains_key(b"Linearized") {
103 return None;
104 }
105
106 let version = dict.get_f64(b"Linearized")?;
107 let file_length = dict.get_i64(b"L")?;
108 let first_page_obj_num = dict.get_i64(b"O")? as u32;
109 let end_of_first_page = dict.get_i64(b"E")?;
110 let page_count = dict.get_i64(b"N")?;
111 let main_xref_offset = dict.get_i64(b"T")?;
112
113 let h_array = dict.get_array(b"H")?;
115 if h_array.len() < 2 {
116 return None;
117 }
118 let hint_offset = h_array[0].as_i64()?;
119 let hint_length = h_array[1].as_i64()?;
120
121 Some(LinearizationParams {
122 file_length,
123 hint_offset,
124 hint_length,
125 first_page_obj_num,
126 end_of_first_page,
127 page_count,
128 main_xref_offset,
129 version,
130 })
131}
132
133pub fn is_linearized(data: &[u8]) -> bool {
135 detect_linearization(data).is_some()
136}
137
138pub fn read_linearization(doc: &PdfDocument) -> Option<LinearizationParams> {
144 detect_linearization(doc.raw_data())
145}
146
/// One per-page entry produced by `parse_hint_tables`.
#[derive(Debug, Clone)]
pub struct PageOffsetHint {
    /// Offset of the page: the first-page offset from the table header plus
    /// the lengths of all preceding pages.
    pub offset: u64,
    /// Length of the page: the header's minimum page length plus this page's delta.
    pub length: u64,
    /// Object count of the page: the header's minimum object count plus this page's delta.
    pub num_objects: u32,
}
164
/// Cursor that reads unsigned integers of arbitrary bit width from a byte
/// slice, most-significant bit first.
struct BitReader<'a> {
    /// Bytes being read.
    data: &'a [u8],
    /// Index of the byte the next bit is taken from.
    byte_pos: usize,
    /// Number of bits already consumed from `data[byte_pos]` (0..=7).
    bit_pos: u8,
}

impl<'a> BitReader<'a> {
    /// Creates a reader positioned at the first bit of `data`.
    fn new(data: &'a [u8]) -> Self {
        Self {
            data,
            byte_pos: 0,
            bit_pos: 0,
        }
    }

    /// Reads the next `n_bits` bits as a big-endian unsigned value.
    ///
    /// Returns `Some(0)` for `n_bits == 0` without consuming anything.
    /// Returns `None` when `n_bits` exceeds 64 (the value could not be
    /// represented in the `u64` result; nothing is consumed in that case) or
    /// when the data runs out before `n_bits` bits were read. In the latter
    /// case the bits that were available have already been consumed.
    fn read_bits(&mut self, n_bits: u32) -> Option<u64> {
        // Wider requests would shift earlier bits out of the accumulator and
        // return a silently truncated value.
        if n_bits > 64 {
            return None;
        }

        let mut result: u64 = 0;
        let mut remaining = n_bits;
        while remaining > 0 {
            let byte = *self.data.get(self.byte_pos)?;
            let avail = 8 - u32::from(self.bit_pos);
            let take = remaining.min(avail);
            // Bits are taken from the top of what is left in the current byte.
            let shift = avail - take;
            // Computed in u16 so that `take == 8` does not overflow the shift.
            let mask = ((1u16 << take) - 1) as u8;
            let bits = (byte >> shift) & mask;
            result = (result << take) | u64::from(bits);
            remaining -= take;
            self.bit_pos += take as u8;
            if self.bit_pos >= 8 {
                self.bit_pos = 0;
                self.byte_pos += 1;
            }
        }
        Some(result)
    }

    /// Advances to the start of the next byte if the reader is mid-byte;
    /// does nothing when already on a byte boundary.
    fn align(&mut self) {
        if self.bit_pos > 0 {
            self.bit_pos = 0;
            self.byte_pos += 1;
        }
    }
}
217
/// Reads a big-endian `u32` from `data` starting at byte `offset`.
///
/// Returns `None` when fewer than four bytes are available at `offset`,
/// including when `offset + 4` would overflow `usize`.
fn read_u32_be(data: &[u8], offset: usize) -> Option<u32> {
    let end = offset.checked_add(4)?;
    match data.get(offset..end)? {
        &[b0, b1, b2, b3] => Some(u32::from_be_bytes([b0, b1, b2, b3])),
        _ => None,
    }
}
230
231pub fn parse_hint_tables(data: &[u8], params: &LinearizationParams) -> Option<Vec<PageOffsetHint>> {
242 let n_pages = params.page_count as usize;
243 if n_pages == 0 {
244 return Some(Vec::new());
245 }
246
247 if data.len() < 36 {
251 return None;
252 }
253
254 let min_objects = read_u32_be(data, 0)?;
257 let first_page_offset = read_u32_be(data, 4)? as u64;
259 let bits_delta_objects = read_u32_be(data, 8)?;
261 let min_page_length = read_u32_be(data, 12)? as u64;
263 let bits_delta_length = read_u32_be(data, 16)?;
265 let _min_content_offset = read_u32_be(data, 20)?;
267 let _bits_delta_content = read_u32_be(data, 24)?;
269 let _min_content_length = read_u32_be(data, 28)?;
271 let _bits_delta_content_len = read_u32_be(data, 32)?;
273
274 let per_page_data = &data[36..];
276 let mut reader = BitReader::new(per_page_data);
277
278 let mut num_objects: Vec<u32> = Vec::with_capacity(n_pages);
280 for _ in 0..n_pages {
281 let delta = reader.read_bits(bits_delta_objects)? as u32;
282 num_objects.push(min_objects + delta);
283 }
284 reader.align();
285
286 let mut lengths: Vec<u64> = Vec::with_capacity(n_pages);
288 for _ in 0..n_pages {
289 let delta = reader.read_bits(bits_delta_length)? as u64;
290 lengths.push(min_page_length + delta);
291 }
292
293 let mut offsets: Vec<u64> = Vec::with_capacity(n_pages);
297 let mut running = first_page_offset;
298 for i in 0..n_pages {
299 offsets.push(running);
300 running += lengths[i];
301 }
302
303 let mut entries: Vec<PageOffsetHint> = Vec::with_capacity(n_pages);
304 for i in 0..n_pages {
305 entries.push(PageOffsetHint {
306 offset: offsets[i],
307 length: lengths[i],
308 num_objects: num_objects[i],
309 });
310 }
311
312 Some(entries)
313}
314
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a minimal PDF prefix: a header line followed by object `1 0`
    /// holding a linearization dictionary with the given values.
    fn make_linearized_pdf(
        file_length: i64,
        hint_offset: i64,
        hint_length: i64,
        first_page_obj: u32,
        end_first_page: i64,
        page_count: i64,
        main_xref: i64,
    ) -> Vec<u8> {
        format!(
            "%PDF-1.7\n\
             1 0 obj\n\
             << /Linearized 1.0 /L {file_length} /H [{hint_offset} {hint_length}] \
             /O {first_page_obj} /E {end_first_page} /N {page_count} /T {main_xref} >>\n\
             endobj\n"
        )
        .into_bytes()
    }

    #[test]
    fn detect_linearized_pdf() {
        let data = make_linearized_pdf(12345, 200, 50, 5, 1000, 10, 9000);
        let params = detect_linearization(&data).expect("should detect linearization");
        assert_eq!(params.file_length, 12345);
        assert_eq!(params.hint_offset, 200);
        assert_eq!(params.hint_length, 50);
        assert_eq!(params.first_page_obj_num, 5);
        assert_eq!(params.end_of_first_page, 1000);
        assert_eq!(params.page_count, 10);
        assert_eq!(params.main_xref_offset, 9000);
        assert!((params.version - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn is_linearized_true() {
        let data = make_linearized_pdf(500, 100, 30, 2, 400, 3, 450);
        assert!(is_linearized(&data));
    }

    #[test]
    fn non_linearized_pdf_returns_none() {
        let data = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n";
        assert!(detect_linearization(data).is_none());
        assert!(!is_linearized(data));
    }

    #[test]
    fn non_dict_first_object_returns_none() {
        let data = b"%PDF-1.4\n1 0 obj\n42\nendobj\n";
        assert!(detect_linearization(data).is_none());
    }

    /// A comment line between the header and the first object is skipped.
    #[test]
    fn header_with_comment_line() {
        let data = b"%PDF-1.5\n%\xE2\xE3\xCF\xD3\n\
            1 0 obj\n\
            << /Linearized 1.0 /L 5000 /H [100 20] /O 3 /E 800 /N 5 /T 4500 >>\n\
            endobj\n";
        let params = detect_linearization(data).expect("should detect through comment");
        assert_eq!(params.file_length, 5000);
        assert_eq!(params.page_count, 5);
    }

    #[test]
    fn short_input_does_not_panic() {
        assert!(detect_linearization(b"").is_none());
        assert!(detect_linearization(b"%PDF").is_none());
        assert!(detect_linearization(b"%PDF-1.4\n").is_none());
        assert!(detect_linearization(b"%PDF-1.4\n1").is_none());
    }

    #[test]
    fn truncated_dict_does_not_panic() {
        let data = b"%PDF-1.4\n1 0 obj\n<< /Linearized 1.0 /L";
        assert!(detect_linearization(data).is_none());
    }

    /// The dictionary below has no `/L` entry.
    #[test]
    fn missing_required_key_returns_none() {
        let data = b"%PDF-1.4\n1 0 obj\n\
            << /Linearized 1.0 /H [100 20] /O 3 /E 800 /N 5 /T 4500 >>\n\
            endobj\n";
        assert!(detect_linearization(data).is_none());
    }

    #[test]
    fn h_array_too_short_returns_none() {
        let data = b"%PDF-1.4\n1 0 obj\n\
            << /Linearized 1.0 /L 5000 /H [100] /O 3 /E 800 /N 5 /T 4500 >>\n\
            endobj\n";
        assert!(detect_linearization(data).is_none());
    }

    #[test]
    fn parse_all_params_correctly() {
        let data = make_linearized_pdf(999999, 512, 128, 7, 2048, 42, 88888);
        let params = detect_linearization(&data).unwrap();
        assert_eq!(params.file_length, 999999);
        assert_eq!(params.hint_offset, 512);
        assert_eq!(params.hint_length, 128);
        assert_eq!(params.first_page_obj_num, 7);
        assert_eq!(params.end_of_first_page, 2048);
        assert_eq!(params.page_count, 42);
        assert_eq!(params.main_xref_offset, 88888);
    }

    #[test]
    fn version_as_real() {
        let data = b"%PDF-1.7\n1 0 obj\n\
            << /Linearized 1.0 /L 100 /H [10 5] /O 1 /E 50 /N 1 /T 80 >>\n\
            endobj\n";
        let params = detect_linearization(data).unwrap();
        assert!((params.version - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn version_as_integer() {
        let data = b"%PDF-1.7\n1 0 obj\n\
            << /Linearized 1 /L 100 /H [10 5] /O 1 /E 50 /N 1 /T 80 >>\n\
            endobj\n";
        let params = detect_linearization(data).unwrap();
        assert!((params.version - 1.0).abs() < f64::EPSILON);
    }

    /// Exercises `detect_linearization` directly on the raw bytes;
    /// `read_linearization` itself is not called here.
    #[test]
    fn read_linearization_from_document() {
        let data = make_linearized_pdf(5000, 100, 20, 3, 800, 5, 4500);
        let params = detect_linearization(&data).unwrap();
        assert_eq!(params.page_count, 5);
    }

    /// Builds a 36-byte hint table header with zero-width deltas and no
    /// per-page data, so every page gets the minimum object count and length.
    /// `n_pages` is accepted but does not affect the bytes produced.
    fn make_hint_stream(
        n_pages: usize,
        objs_per_page: u32,
        first_page_offset: u32,
        page_len: u32,
    ) -> Vec<u8> {
        let mut buf = Vec::new();
        // Minimum objects per page.
        buf.extend_from_slice(&objs_per_page.to_be_bytes());
        // First-page offset.
        buf.extend_from_slice(&first_page_offset.to_be_bytes());
        // Bit width of the object-count delta.
        buf.extend_from_slice(&0u32.to_be_bytes());
        // Minimum page length.
        buf.extend_from_slice(&page_len.to_be_bytes());
        // Bit width of the page-length delta.
        buf.extend_from_slice(&0u32.to_be_bytes());
        // Four header fields the parser does not use.
        buf.extend_from_slice(&0u32.to_be_bytes());
        buf.extend_from_slice(&0u32.to_be_bytes());
        buf.extend_from_slice(&0u32.to_be_bytes());
        buf.extend_from_slice(&0u32.to_be_bytes());
        let _ = n_pages;
        buf
    }

    #[test]
    fn parse_hint_table_uniform_pages() {
        let params = LinearizationParams {
            file_length: 5000,
            hint_offset: 100,
            hint_length: 36,
            first_page_obj_num: 3,
            end_of_first_page: 800,
            page_count: 3,
            main_xref_offset: 4500,
            version: 1.0,
        };
        let stream = make_hint_stream(3, 5, 200, 400);
        let hints = parse_hint_tables(&stream, &params).unwrap();
        assert_eq!(hints.len(), 3);
        for (i, hint) in hints.iter().enumerate() {
            assert_eq!(hint.num_objects, 5);
            assert_eq!(hint.length, 400);
            assert_eq!(hint.offset, 200 + i as u64 * 400);
        }
    }

    #[test]
    fn parse_hint_table_zero_pages() {
        let params = LinearizationParams {
            file_length: 100,
            hint_offset: 10,
            hint_length: 5,
            first_page_obj_num: 1,
            end_of_first_page: 50,
            page_count: 0,
            main_xref_offset: 80,
            version: 1.0,
        };
        let hints = parse_hint_tables(b"", &params).unwrap();
        assert!(hints.is_empty());
    }

    #[test]
    fn parse_hint_table_too_short() {
        let params = LinearizationParams {
            file_length: 100,
            hint_offset: 10,
            hint_length: 5,
            first_page_obj_num: 1,
            end_of_first_page: 50,
            page_count: 2,
            main_xref_offset: 80,
            version: 1.0,
        };
        assert!(parse_hint_tables(&[0u8; 20], &params).is_none());
    }

    #[test]
    fn bit_reader_basics() {
        let mut r = BitReader::new(&[0xA5]);
        assert_eq!(r.read_bits(4), Some(0b1010));
        assert_eq!(r.read_bits(4), Some(0b0101));
    }

    #[test]
    fn bit_reader_cross_byte() {
        let mut r = BitReader::new(&[0xFF, 0x00]);
        assert_eq!(r.read_bits(4), Some(0xF));
        // Low nibble of the first byte followed by the high nibble of the second.
        assert_eq!(r.read_bits(8), Some(0xF0));
        assert_eq!(r.read_bits(4), Some(0x0));
    }

    #[test]
    fn bit_reader_zero_bits() {
        let mut r = BitReader::new(&[0xFF]);
        assert_eq!(r.read_bits(0), Some(0));
    }

    #[test]
    fn parse_hint_table_with_deltas() {
        let mut buf = Vec::new();
        // Minimum objects per page.
        buf.extend_from_slice(&3u32.to_be_bytes());
        // First-page offset.
        buf.extend_from_slice(&500u32.to_be_bytes());
        // Bit width of the object-count delta.
        buf.extend_from_slice(&2u32.to_be_bytes());
        // Minimum page length.
        buf.extend_from_slice(&100u32.to_be_bytes());
        // Bit width of the page-length delta.
        buf.extend_from_slice(&3u32.to_be_bytes());
        // Four header fields the parser does not use.
        for _ in 0..4 {
            buf.extend_from_slice(&0u32.to_be_bytes());
        }
        // Object-count deltas: 01 (1), 10 (2), then padding to the byte boundary.
        buf.push(0b0110_0000);
        // Page-length deltas: 101 (5), 011 (3), then padding.
        buf.push(0b1010_1100);

        let params = LinearizationParams {
            file_length: 5000,
            hint_offset: 100,
            hint_length: buf.len() as i64,
            first_page_obj_num: 3,
            end_of_first_page: 605,
            page_count: 2,
            main_xref_offset: 4500,
            version: 1.0,
        };

        let hints = parse_hint_tables(&buf, &params).unwrap();
        assert_eq!(hints.len(), 2);
        assert_eq!(hints[0].num_objects, 4);
        assert_eq!(hints[1].num_objects, 5);
        assert_eq!(hints[0].length, 105);
        assert_eq!(hints[1].length, 103);
        assert_eq!(hints[0].offset, 500);
        assert_eq!(hints[1].offset, 605);
    }
}