1use std::collections::HashMap;
13
14use rpdfium_core::error::PdfError;
15use rpdfium_core::fx_system::MAX_OBJECT_NUMBER;
16use rpdfium_core::{Name, ParsingMode};
17
18use crate::object::{Object, ObjectId, StreamData};
19use crate::object_parser::parse_indirect_object;
20use crate::xref::{XrefEntry, XrefEntryType, XrefSection};
21
22pub fn parse_xref_stream(
27 source: &[u8],
28 offset: u64,
29 mode: ParsingMode,
30) -> Result<(XrefSection, HashMap<Name, Object>), PdfError> {
31 let (_id, obj) = parse_indirect_object(source, offset, mode)?;
33
34 let (dict, data) = match obj {
35 Object::Stream { dict, data } => (dict, data),
36 _ => return Err(PdfError::InvalidXref),
37 };
38
39 let decoded_buf;
41 let raw_data = match data {
42 StreamData::Raw {
43 offset: raw_offset,
44 length,
45 } => {
46 let start = raw_offset as usize;
47 let end = start + length as usize;
48 if end > source.len() {
49 return Err(PdfError::InvalidXref);
50 }
51 let raw_bytes = &source[start..end];
52 let filters = crate::filter::resolve_filter_chain(&dict);
54 if !filters.is_empty() {
55 decoded_buf = rpdfium_codec::apply_filter_chain(raw_bytes, &filters)
56 .map_err(|_| PdfError::InvalidXref)?;
57 decoded_buf.as_slice()
58 } else {
59 raw_bytes
60 }
61 }
62 StreamData::Decoded { data: decoded } => {
63 decoded_buf = decoded;
64 decoded_buf.as_slice()
65 }
66 };
67
68 let w = extract_w_array(&dict)?;
70
71 let size = match dict.get(&Name::size()) {
73 Some(Object::Integer(n)) if *n > 0 => *n as u64,
74 _ => return Err(PdfError::InvalidXref),
75 };
76
77 let index_ranges = extract_index_ranges(&dict, size)?;
80
81 let entry_width = w[0] + w[1] + w[2];
83 if entry_width == 0 {
84 return Err(PdfError::InvalidXref);
85 }
86
87 let mut entries = Vec::new();
88
89 let mut data_pos = 0;
90 for range in &index_ranges {
91 let start_id = range.0;
92 let count = range.1;
93
94 for i in 0..count {
95 if data_pos + entry_width > raw_data.len() {
96 break;
97 }
98
99 let object_number = start_id + i as u64;
100 if object_number > MAX_OBJECT_NUMBER as u64 {
101 tracing::warn!(
102 object_number = object_number,
103 "xref stream entry exceeds MAX_OBJECT_NUMBER, skipping"
104 );
105 data_pos += entry_width;
106 continue;
107 }
108
109 let field1 = read_field(raw_data, data_pos, w[0]);
110 let field2 = read_field(raw_data, data_pos + w[0], w[1]);
111 let field3 = read_field(raw_data, data_pos + w[0] + w[1], w[2]);
112 data_pos += entry_width;
113
114 let entry_type_val = if w[0] == 0 { 1 } else { field1 };
116
117 let entry_type = match entry_type_val {
118 0 => XrefEntryType::Free,
119 1 => XrefEntryType::InUse { offset: field2 },
120 2 => XrefEntryType::InStream {
121 stream_id: ObjectId::new(field2 as u32, 0),
122 index: field3 as u32,
123 },
124 _ => {
125 tracing::warn!(
126 entry_type = entry_type_val,
127 "unknown xref stream entry type, treating as free"
128 );
129 XrefEntryType::Free
130 }
131 };
132
133 let generation = match entry_type_val {
134 0 => field3 as u16, 1 => field3 as u16, _ => 0,
137 };
138
139 entries.push(XrefEntry {
140 id: ObjectId::new(object_number as u32, generation),
141 entry_type,
142 });
143 }
144 }
145
146 Ok((XrefSection { entries }, dict))
147}
148
149fn extract_w_array(dict: &HashMap<Name, Object>) -> Result<[usize; 3], PdfError> {
152 let w_array = match dict.get(&Name::w()) {
153 Some(Object::Array(arr)) => arr,
154 _ => return Err(PdfError::InvalidXref),
155 };
156
157 if w_array.len() != 3 {
158 return Err(PdfError::InvalidXref);
159 }
160
161 let mut widths = [0usize; 3];
162 for (i, obj) in w_array.iter().enumerate() {
163 match obj {
164 Object::Integer(n) if *n >= 0 => widths[i] = *n as usize,
165 _ => return Err(PdfError::InvalidXref),
166 }
167 }
168
169 Ok(widths)
170}
171
172fn extract_index_ranges(
176 dict: &HashMap<Name, Object>,
177 size: u64,
178) -> Result<Vec<(u64, usize)>, PdfError> {
179 match dict.get(&Name::index()) {
180 Some(Object::Array(arr)) => {
181 if arr.len() % 2 != 0 {
182 return Err(PdfError::InvalidXref);
183 }
184 let mut ranges = Vec::new();
185 for pair in arr.chunks(2) {
186 let start = match &pair[0] {
187 Object::Integer(n) if *n >= 0 => *n as u64,
188 _ => return Err(PdfError::InvalidXref),
189 };
190 let count = match &pair[1] {
191 Object::Integer(n) if *n >= 0 => *n as usize,
192 _ => return Err(PdfError::InvalidXref),
193 };
194 ranges.push((start, count));
195 }
196 Ok(ranges)
197 }
198 None => Ok(vec![(0, size as usize)]),
199 _ => Err(PdfError::InvalidXref),
200 }
201}
202
/// Decodes a big-endian unsigned integer of `width` bytes starting at `offset` in `data`.
///
/// - A `width` of zero yields `0`.
/// - Bytes that would lie past the end of `data` are skipped rather than
///   zero-padded, so reading 4 bytes from `[0x01, 0x02]` yields `0x0102`.
/// - An `offset` at or beyond the end of `data` yields `0`; no index
///   arithmetic is performed, so huge offsets cannot overflow.
/// - For widths above 8 only the low 64 bits are kept, because earlier bytes
///   are shifted out of the accumulator.
fn read_field(data: &[u8], offset: usize, width: usize) -> u64 {
    data.iter()
        .skip(offset)
        .take(width)
        .fold(0u64, |value, &byte| (value << 8) | u64::from(byte))
}
218
219#[cfg(test)]
220mod tests {
221 use super::*;
222
/// A single `0xFF` byte read with width 1 decodes to 255.
#[test]
fn test_read_field_1_byte() {
    let decoded = read_field(&[0xFF], 0, 1);
    assert_eq!(decoded, 255);
}
228
/// Two bytes are combined most-significant byte first: `01 00` is 256.
#[test]
fn test_read_field_2_bytes() {
    let decoded = read_field(&[0x01, 0x00], 0, 2);
    assert_eq!(decoded, 256);
}
234
/// Leading zero bytes do not affect a 4-byte field: `00 00 01 00` is 256.
#[test]
fn test_read_field_4_bytes() {
    let decoded = read_field(&[0x00, 0x00, 0x01, 0x00], 0, 4);
    assert_eq!(decoded, 256);
}
240
/// A zero-width field decodes to 0 regardless of the bytes available.
#[test]
fn test_read_field_zero_width() {
    let decoded = read_field(&[0xFF], 0, 0);
    assert_eq!(decoded, 0);
}
246
/// A width array of three non-negative integers is returned unchanged.
#[test]
fn test_extract_w_array_valid() {
    let widths = Object::Array(vec![
        Object::Integer(1),
        Object::Integer(3),
        Object::Integer(1),
    ]);
    let dict = HashMap::from([(Name::w(), widths)]);
    assert_eq!(extract_w_array(&dict).unwrap(), [1, 3, 1]);
}
261
/// A dictionary without a width entry is rejected.
#[test]
fn test_extract_w_array_missing() {
    let outcome = extract_w_array(&HashMap::new());
    assert!(outcome.is_err());
}
267
/// A width array with only two elements is rejected.
#[test]
fn test_extract_w_array_wrong_length() {
    let widths = Object::Array(vec![Object::Integer(1), Object::Integer(2)]);
    let dict = HashMap::from([(Name::w(), widths)]);
    assert!(extract_w_array(&dict).is_err());
}
277
/// Without an index entry the single range `(0, size)` is produced.
#[test]
fn test_extract_index_default() {
    let ranges = extract_index_ranges(&HashMap::new(), 5).unwrap();
    assert_eq!(ranges, vec![(0, 5)]);
}
284
/// An explicit index array is read as consecutive `(start, count)` pairs.
#[test]
fn test_extract_index_explicit() {
    let index = Object::Array(vec![
        Object::Integer(0),
        Object::Integer(3),
        Object::Integer(10),
        Object::Integer(2),
    ]);
    let dict = HashMap::from([(Name::index(), index)]);
    let ranges = extract_index_ranges(&dict, 5).unwrap();
    assert_eq!(ranges, vec![(0, 3), (10, 2)]);
}
300
/// A negative width anywhere in the array is rejected.
#[test]
fn test_extract_w_array_negative_component() {
    let widths = Object::Array(vec![
        Object::Integer(1),
        Object::Integer(-1),
        Object::Integer(1),
    ]);
    let dict = HashMap::from([(Name::w(), widths)]);
    assert!(extract_w_array(&dict).is_err());
}
314
/// An all-zero width array is accepted by `extract_w_array` itself and
/// returned as `[0, 0, 0]`.
#[test]
fn test_extract_w_array_all_zero() {
    let widths = Object::Array(vec![
        Object::Integer(0),
        Object::Integer(0),
        Object::Integer(0),
    ]);
    let dict = HashMap::from([(Name::w(), widths)]);
    assert_eq!(extract_w_array(&dict).unwrap(), [0, 0, 0]);
}
329
/// A non-integer element (here a string) in the width array is rejected.
#[test]
fn test_extract_w_array_non_integer() {
    let widths = Object::Array(vec![
        Object::Integer(1),
        Object::String(rpdfium_core::PdfString::from_bytes(b"hello".to_vec())),
        Object::Integer(1),
    ]);
    let dict = HashMap::from([(Name::w(), widths)]);
    assert!(extract_w_array(&dict).is_err());
}
343
/// A size of zero does not satisfy the "positive integer" guard.
///
/// NOTE(review): the guard is re-stated inline here rather than exercised
/// through `parse_xref_stream`.
#[test]
fn test_size_zero_is_error() {
    let dict = HashMap::from([(Name::size(), Object::Integer(0))]);
    let accepted = matches!(dict.get(&Name::size()), Some(Object::Integer(n)) if *n > 0);
    assert!(!accepted);
}
360
/// A negative size does not satisfy the "positive integer" guard.
///
/// NOTE(review): the guard is re-stated inline here rather than exercised
/// through `parse_xref_stream`.
#[test]
fn test_size_negative_is_error() {
    let dict = HashMap::from([(Name::size(), Object::Integer(-5))]);
    let accepted = matches!(dict.get(&Name::size()), Some(Object::Integer(n)) if *n > 0);
    assert!(!accepted);
}
371
/// An index array with an odd number of elements cannot form pairs and is rejected.
#[test]
fn test_index_odd_length_is_error() {
    let index = Object::Array(vec![
        Object::Integer(0),
        Object::Integer(3),
        Object::Integer(10),
    ]);
    let dict = HashMap::from([(Name::index(), index)]);
    assert!(extract_index_ranges(&dict, 5).is_err());
}
385
/// A negative start object number in the index array is rejected.
#[test]
fn test_index_negative_start_is_error() {
    let index = Object::Array(vec![Object::Integer(-1), Object::Integer(3)]);
    let dict = HashMap::from([(Name::index(), index)]);
    assert!(extract_index_ranges(&dict, 5).is_err());
}
395
/// Requesting 4 bytes from a 2-byte buffer decodes only the bytes that exist:
/// the missing ones are skipped, not treated as zero padding.
#[test]
fn test_read_field_short_data() {
    let bytes = [0x01, 0x02];
    assert_eq!(read_field(&bytes, 0, 4), 0x0102);
}
405
406 use crate::store::ObjectStore;
413
414 fn open_xref_stream_pdf(
418 data: &[u8],
419 ) -> Result<ObjectStore<Vec<u8>>, rpdfium_core::error::PdfError> {
420 ObjectStore::open(data.to_vec(), rpdfium_core::ParsingMode::Lenient)
421 }
422
/// Smoke test: an xref stream whose index array is `[25165824 1]` with a
/// matching size of 25165825 and a single record.
///
/// The outcome of opening the document is discarded, so this only checks that
/// the call returns without panicking.
#[test]
fn test_parser_xref_object_highest_index() {
    let pdf: &[u8] = b"%PDF1-7\n%\xa0\xf2\xa4\xf4\n\
        7 0 obj <<\n\
        /Filter /ASCIIHexDecode\n\
        /Index [25165824 1]\n\
        /Root 1 0 R\n\
        /Size 25165825\n\
        /W [1 1 1]\n\
        >>\n\
        stream\n\
        01 00 00\n\
        endstream\n\
        endobj\n\
        startxref\n\
        14\n\
        %%EOF\n";
    let _outcome = open_xref_stream_pdf(pdf);
}
448
/// Smoke test: an xref stream whose index array is `[25165824 2]`, so the
/// second entry maps to object 25165825, while the stream carries three records.
///
/// The outcome of opening the document is discarded, so this only checks that
/// the call returns without panicking.
#[test]
fn test_parser_xref_object_indices_too_big() {
    let pdf: &[u8] = b"%PDF1-7\n%\xa0\xf2\xa4\xf4\n\
        7 0 obj <<\n\
        /Filter /ASCIIHexDecode\n\
        /Index [25165824 2]\n\
        /Root 1 0 R\n\
        /Size 25165826\n\
        /W [1 1 1]\n\
        >>\n\
        stream\n\
        01 00 00\n\
        01 0F 00\n\
        01 12 00\n\
        endstream\n\
        endobj\n\
        startxref\n\
        14\n\
        %%EOF\n";
    let _outcome = open_xref_stream_pdf(pdf);
}
475
/// Smoke test: the first record is of type 2 and names object `0xFF` as its
/// containing stream although the declared size is only 3.
///
/// The outcome of opening the document is discarded, so this only checks that
/// the call returns without panicking.
#[test]
fn test_parser_xref_has_invalid_archive_object_number() {
    let pdf: &[u8] = b"%PDF1-7\n%\xa0\xf2\xa4\xf4\n\
        7 0 obj <<\n\
        /Filter /ASCIIHexDecode\n\
        /Root 1 0 R\n\
        /Size 3\n\
        /W [1 1 1]\n\
        >>\n\
        stream\n\
        02 FF 00\n\
        01 0F 00\n\
        01 12 00\n\
        endstream\n\
        endobj\n\
        startxref\n\
        14\n\
        %%EOF\n";
    let _outcome = open_xref_stream_pdf(pdf);
}
501
/// Smoke test: the object that `startxref` points at is a bare dictionary
/// with no `stream … endstream` section.
///
/// The outcome of opening the document is discarded, so this only checks that
/// the call returns without panicking.
#[test]
fn test_parser_xref_has_invalid_object_type() {
    let pdf: &[u8] = b"%PDF1-7\n%\xa0\xf2\xa4\xf4\n\
        7 0 obj <<\n\
        /Filter /ASCIIHexDecode\n\
        /Root 1 0 R\n\
        /Size 3\n\
        /W [1 1 1]\n\
        >>\n\
        endobj\n\
        startxref\n\
        14\n\
        %%EOF\n";
    let _outcome = open_xref_stream_pdf(pdf);
}
521
/// Smoke test: the xref stream dictionary carries `/Prev -1`.
///
/// The outcome of opening the document is discarded, so this only checks that
/// the call returns without panicking.
#[test]
fn test_parser_xref_has_invalid_prev_value() {
    let pdf: &[u8] = b"%PDF1-7\n%\xa0\xf2\xa4\xf4\n\
        7 0 obj <<\n\
        /Filter /ASCIIHexDecode\n\
        /Root 1 0 R\n\
        /Size 3\n\
        /W [1 1 1]\n\
        /Prev -1\n\
        >>\n\
        stream\n\
        02 FF 00\n\
        01 0F 00\n\
        01 12 00\n\
        endstream\n\
        endobj\n\
        startxref\n\
        14\n\
        %%EOF\n";
    let _outcome = open_xref_stream_pdf(pdf);
}
547
/// Smoke test: the xref stream dictionary lists `/Size` twice, first as 3 and
/// then as -1.
///
/// The outcome of opening the document is discarded, so this only checks that
/// the call returns without panicking.
#[test]
fn test_parser_xref_has_invalid_size_value() {
    let pdf: &[u8] = b"%PDF1-7\n%\xa0\xf2\xa4\xf4\n\
        7 0 obj <<\n\
        /Filter /ASCIIHexDecode\n\
        /Root 1 0 R\n\
        /Size 3\n\
        /W [1 1 1]\n\
        /Size -1\n\
        >>\n\
        stream\n\
        02 FF 00\n\
        01 0F 00\n\
        01 12 00\n\
        endstream\n\
        endobj\n\
        startxref\n\
        14\n\
        %%EOF\n";
    let _outcome = open_xref_stream_pdf(pdf);
}
573
/// Smoke test: the xref stream dictionary declares `/Size 0` (twice).
///
/// The outcome of opening the document is discarded, so this only checks that
/// the call returns without panicking.
#[test]
fn test_parser_xref_has_zero_size_value() {
    let pdf: &[u8] = b"%PDF1-7\n%\xa0\xf2\xa4\xf4\n\
        7 0 obj <<\n\
        /Filter /ASCIIHexDecode\n\
        /Root 1 0 R\n\
        /Size 0\n\
        /W [1 1 1]\n\
        /Size 0\n\
        >>\n\
        stream\n\
        02 FF 00\n\
        01 0F 00\n\
        01 12 00\n\
        endstream\n\
        endobj\n\
        startxref\n\
        14\n\
        %%EOF\n";
    let _outcome = open_xref_stream_pdf(pdf);
}
599
/// Smoke test: the width array `/W [1 1]` has two elements instead of three.
///
/// The outcome of opening the document is discarded, so this only checks that
/// the call returns without panicking.
#[test]
fn test_parser_xref_has_invalid_width() {
    let pdf: &[u8] = b"%PDF1-7\n%\xa0\xf2\xa4\xf4\n\
        7 0 obj <<\n\
        /Filter /ASCIIHexDecode\n\
        /Root 1 0 R\n\
        /Size 3\n\
        /W [1 1]\n\
        >>\n\
        stream\n\
        02 FF 00\n\
        01 0F 00\n\
        01 12 00\n\
        endstream\n\
        endobj\n\
        startxref\n\
        14\n\
        %%EOF\n";
    let _outcome = open_xref_stream_pdf(pdf);
}
625
/// With widths `[0, 1, 1]` every record has no type byte, so each entry must
/// default to type 1 (in use) with the offset taken from the first data byte.
///
/// The decoding steps are replayed by hand with `read_field` on two 2-byte
/// records and the resulting entries are checked for offsets 15 and 18.
#[test]
fn test_parser_xref_first_width_entry_is_zero() {
    // NOTE(review): this buffer and both offsets are assembled but never
    // parsed or read by the assertions below.
    let mut pdf = Vec::new();
    pdf.extend_from_slice(b"%PDF-1.7\n");
    let _obj1_offset = pdf.len();
    pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog >>\nendobj\n");
    let _xref_offset = pdf.len();

    let widths = Object::Array(vec![
        Object::Integer(0),
        Object::Integer(1),
        Object::Integer(1),
    ]);
    let dict = HashMap::from([(Name::w(), widths)]);
    let w = extract_w_array(&dict).unwrap();
    assert_eq!(w, [0, 1, 1]);

    let raw_data = vec![0x0F, 0x00, 0x12, 0x00];
    let entry_width = w[0] + w[1] + w[2];
    assert_eq!(entry_width, 2);

    let entries: Vec<_> = (0..2u64)
        .map(|i| {
            // Records are laid out back to back, one per object number.
            let base = i as usize * entry_width;
            let field1 = read_field(&raw_data, base, w[0]);
            let field2 = read_field(&raw_data, base + w[0], w[1]);
            let field3 = read_field(&raw_data, base + w[0] + w[1], w[2]);

            let entry_type_val = if w[0] == 0 { 1 } else { field1 };
            assert_eq!(entry_type_val, 1, "should default to type 1 when w[0]==0");

            crate::xref::XrefEntry {
                id: crate::object::ObjectId::new(i as u32, field3 as u16),
                entry_type: crate::xref::XrefEntryType::InUse { offset: field2 },
            }
        })
        .collect();

    assert_eq!(entries.len(), 2);
    assert_eq!(
        entries[0].entry_type,
        crate::xref::XrefEntryType::InUse { offset: 15 }
    );
    assert_eq!(
        entries[1].entry_type,
        crate::xref::XrefEntryType::InUse { offset: 18 }
    );
}
694
/// Three well-formed index pairs are returned in the order they appear.
#[test]
fn test_parser_xref_with_valid_index() {
    let index = Object::Array(vec![
        Object::Integer(2),
        Object::Integer(1),
        Object::Integer(4),
        Object::Integer(2),
        Object::Integer(80),
        Object::Integer(3),
    ]);
    let dict = HashMap::from([(Name::index(), index)]);
    let ranges = extract_index_ranges(&dict, 83).unwrap();
    assert_eq!(ranges, vec![(2, 1), (4, 2), (80, 3)]);
}
716
/// Overlapping ranges (object 3 is covered by both pairs) are passed through
/// as-is; `extract_index_ranges` performs no de-duplication.
#[test]
fn test_parser_xref_index_with_repeated_object() {
    let index = Object::Array(vec![
        Object::Integer(2),
        Object::Integer(2),
        Object::Integer(3),
        Object::Integer(1),
    ]);
    let dict = HashMap::from([(Name::index(), index)]);
    let ranges = extract_index_ranges(&dict, 4).unwrap();
    assert_eq!(ranges, vec![(2, 2), (3, 1)]);
}
738
/// Ranges listed in descending object order keep their original order; no
/// sorting is applied.
#[test]
fn test_parser_xref_index_with_out_of_order_objects() {
    let index = Object::Array(vec![
        Object::Integer(3),
        Object::Integer(2),
        Object::Integer(2),
        Object::Integer(1),
    ]);
    let dict = HashMap::from([(Name::index(), index)]);
    let ranges = extract_index_ranges(&dict, 5).unwrap();
    assert_eq!(ranges, vec![(3, 2), (2, 1)]);
}
758
/// The last range reaches object 81 although the size passed in is 81; the
/// ranges are still returned unchanged because the size is not checked when
/// an index array is present.
#[test]
fn test_parser_xref_with_index_and_wrong_size() {
    let index = Object::Array(vec![
        Object::Integer(2),
        Object::Integer(1),
        Object::Integer(80),
        Object::Integer(2),
    ]);
    let dict = HashMap::from([(Name::index(), index)]);
    let ranges = extract_index_ranges(&dict, 81).unwrap();
    assert_eq!(ranges, vec![(2, 1), (80, 2)]);
}
781}