1use crate::parser::filters::{apply_filter, apply_filter_with_params, Filter};
10use crate::parser::objects::{PdfArray, PdfDictionary, PdfName, PdfObject};
11use crate::parser::ParseOptions;
12use crate::parser::{ParseError, ParseResult};
13use std::io::{Read, Seek};
14
/// One decoded cross-reference entry.
///
/// The three variants correspond to the entry types `0`, `1` and `2` that
/// `XRefStream::to_xref_entries` reads from the first field of each record.
#[derive(Debug, Clone, PartialEq)]
pub enum XRefEntry {
    /// Entry type `0`.
    Free {
        /// Taken from the second field of the record.
        next_free_object: u32,
        /// Taken from the third field of the record.
        generation: u16,
    },
    /// Entry type `1`.
    InUse {
        /// Taken from the second field of the record.
        /// NOTE(review): presumably the byte offset of the object in the file — confirm against callers.
        offset: u64,
        /// Taken from the third field of the record.
        generation: u16,
    },
    /// Entry type `2`.
    Compressed {
        /// Taken from the second field of the record.
        stream_object_number: u32,
        /// Taken from the third field of the record.
        index_within_stream: u32,
    },
}
40
/// A cross-reference stream: its dictionary plus the decoded entry table.
pub struct XRefStream {
    /// The stream dictionary handed to `parse`; also returned by `trailer_dict`.
    pub dict: PdfDictionary,
    /// Stream bytes after the filters named in the dictionary have been applied.
    pub data: Vec<u8>,
    /// Byte widths of the fields of one record, read from the `W` array
    /// (`parse` only accepts arrays with exactly three elements).
    pub widths: Vec<usize>,
    /// `(first object number, count)` pairs read from `Index`, or a single
    /// `(0, Size)` pair when `Index` is absent.
    pub index: Vec<(u32, u32)>,
}
52
53impl XRefStream {
54 pub fn parse<R: Read + Seek>(
56 _reader: &mut R,
57 stream_dict: PdfDictionary,
58 stream_data: Vec<u8>,
59 _options: &ParseOptions,
60 ) -> ParseResult<Self> {
61 let widths = stream_dict
63 .get("W")
64 .and_then(|obj| obj.as_array())
65 .ok_or_else(|| ParseError::MissingKey("W array in xref stream".to_string()))?
66 .0
67 .iter()
68 .map(|obj| {
69 obj.as_integer()
70 .ok_or_else(|| ParseError::SyntaxError {
71 position: 0,
72 message: "Invalid width in W array".to_string(),
73 })
74 .map(|n| n as usize)
75 })
76 .collect::<ParseResult<Vec<_>>>()?;
77
78 if widths.len() != 3 {
79 return Err(ParseError::SyntaxError {
80 position: 0,
81 message: format!(
82 "W array must have 3 elements, found {len}",
83 len = widths.len()
84 ),
85 });
86 }
87
88 let index =
90 if let Some(index_array) = stream_dict.get("Index").and_then(|obj| obj.as_array()) {
91 let mut index_pairs = Vec::new();
92 let mut i = 0;
93 while i + 1 < index_array.len() {
94 let first =
95 index_array.0[i]
96 .as_integer()
97 .ok_or_else(|| ParseError::SyntaxError {
98 position: 0,
99 message: "Invalid first object number in Index".to_string(),
100 })? as u32;
101 let count = index_array.0[i + 1].as_integer().ok_or_else(|| {
102 ParseError::SyntaxError {
103 position: 0,
104 message: "Invalid count in Index".to_string(),
105 }
106 })? as u32;
107 index_pairs.push((first, count));
108 i += 2;
109 }
110 index_pairs
111 } else {
112 let size = stream_dict
114 .get("Size")
115 .and_then(|obj| obj.as_integer())
116 .ok_or_else(|| ParseError::MissingKey("Size in xref stream".to_string()))?
117 as u32;
118 vec![(0, size)]
119 };
120
121 let entry_size = widths.iter().sum::<usize>();
123
124 let decoded_data = if let Some(filter_obj) = stream_dict.get("Filter") {
126 match filter_obj {
128 PdfObject::Name(filter_name) => {
129 let filter = Filter::from_name(filter_name.as_str()).ok_or_else(|| {
131 ParseError::StreamDecodeError(format!("Unknown filter: {filter_name:?}"))
132 })?;
133
134 let decode_params = stream_dict.get("DecodeParms");
136
137 if let Some(params_obj) = decode_params {
138 if let Some(mut params_dict) = params_obj.as_dict().cloned() {
139 if params_dict
143 .get("Predictor")
144 .and_then(|p| p.as_integer())
145 .is_some()
146 {
147 params_dict.insert(
148 "Columns".to_string(),
149 PdfObject::Integer(entry_size as i64),
150 );
151 }
152 apply_filter_with_params(&stream_data, filter, Some(¶ms_dict))?
153 } else {
154 apply_filter(&stream_data, filter)?
155 }
156 } else {
157 apply_filter(&stream_data, filter)?
158 }
159 }
160 PdfObject::Array(filters) => {
161 let mut data = stream_data;
162 for filter in filters.0.iter() {
163 if let Some(filter_name) = filter.as_name() {
164 data = apply_filter(
165 &data,
166 Filter::from_name(filter_name.as_str()).ok_or_else(|| {
167 ParseError::StreamDecodeError(format!(
168 "Unknown filter: {filter_name:?}"
169 ))
170 })?,
171 )?;
172 }
173 }
174 data
175 }
176 _ => stream_data,
177 }
178 } else {
179 stream_data
180 };
181
182 Ok(XRefStream {
183 dict: stream_dict,
184 data: decoded_data,
185 widths,
186 index,
187 })
188 }
189
190 pub fn to_xref_entries(&self) -> ParseResult<Vec<(u32, XRefEntry)>> {
192 let mut entries = Vec::new();
193 let entry_size = self.widths.iter().sum::<usize>();
194
195 if entry_size == 0 {
196 return Err(ParseError::SyntaxError {
197 position: 0,
198 message: "Invalid entry size (0) in xref stream".to_string(),
199 });
200 }
201
202 let mut data_offset = 0;
203
204 for &(first_obj, count) in &self.index {
205 for i in 0..count {
206 if data_offset + entry_size > self.data.len() {
207 return Err(ParseError::SyntaxError {
208 position: data_offset,
209 message: format!("Xref stream data truncated at obj {}", first_obj + i),
210 });
211 }
212
213 let mut field_offset = data_offset;
215 let mut fields = Vec::new();
216
217 for &width in &self.widths {
218 let field_value = if width == 0 {
219 0 } else {
221 read_field(&self.data[field_offset..field_offset + width])
222 };
223 fields.push(field_value);
224 field_offset += width;
225 }
226
227 let entry_type = fields[0];
229 let obj_num = first_obj + i;
230
231 let entry = match entry_type {
232 0 => {
233 XRefEntry::Free {
235 next_free_object: fields[1] as u32,
236 generation: fields[2] as u16,
237 }
238 }
239 1 => {
240 XRefEntry::InUse {
242 offset: fields[1],
243 generation: fields[2] as u16,
244 }
245 }
246 2 => {
247 XRefEntry::Compressed {
249 stream_object_number: fields[1] as u32,
250 index_within_stream: fields[2] as u32,
251 }
252 }
253 _ => {
254 return Err(ParseError::SyntaxError {
255 position: data_offset,
256 message: format!("Invalid xref entry type: {entry_type}"),
257 });
258 }
259 };
260
261 entries.push((obj_num, entry));
262 data_offset += entry_size;
263 }
264 }
265
266 Ok(entries)
267 }
268
269 pub fn trailer_dict(&self) -> &PdfDictionary {
271 &self.dict
272 }
273
274 pub fn is_hybrid(&self) -> bool {
276 self.dict.get("XRefStm").is_some()
278 }
279
280 pub fn get_xref_stm_offset(&self) -> Option<u64> {
282 self.dict
283 .get("XRefStm")
284 .and_then(|obj| obj.as_integer())
285 .map(|n| n as u64)
286 }
287
288 pub fn get_prev_offset(&self) -> Option<u64> {
290 self.dict
291 .get("Prev")
292 .and_then(|obj| obj.as_integer())
293 .map(|n| n as u64)
294 }
295}
296
/// Interprets `bytes` as a big-endian unsigned integer.
///
/// An empty slice yields `0`. Each byte shifts the accumulator left by eight
/// bits, so with more than eight bytes only the low 64 bits are kept.
fn read_field(bytes: &[u8]) -> u64 {
    bytes
        .iter()
        .fold(0u64, |acc, &byte| (acc << 8) | u64::from(byte))
}
305
/// Collects cross-reference entries and extra dictionary entries, then
/// serialises them with `build`.
pub struct XRefStreamBuilder {
    /// `(object number, entry)` pairs in the order they were added; `build`
    /// sorts them by object number.
    entries: Vec<(u32, XRefEntry)>,
    /// Dictionary that `build` starts from before adding its own keys.
    trailer_entries: PdfDictionary,
}
313
/// `Default` delegates to [`XRefStreamBuilder::new`].
impl Default for XRefStreamBuilder {
    fn default() -> Self {
        Self::new()
    }
}
319
320impl XRefStreamBuilder {
321 pub fn new() -> Self {
323 Self {
324 entries: Vec::new(),
325 trailer_entries: PdfDictionary::new(),
326 }
327 }
328
329 pub fn add_entry(&mut self, obj_num: u32, entry: XRefEntry) {
331 self.entries.push((obj_num, entry));
332 }
333
334 pub fn add_trailer_entry(&mut self, key: &str, value: PdfObject) {
336 self.trailer_entries.insert(key.to_string(), value);
337 }
338
339 pub fn build(mut self) -> ParseResult<(PdfDictionary, Vec<u8>)> {
341 self.entries.sort_by_key(|(num, _)| *num);
343
344 let mut max_offset = 0u64;
346 let mut max_obj_num = 0u32;
347 let mut max_gen = 0u16;
348 let mut _has_compressed = false;
349
350 for (obj_num, entry) in &self.entries {
351 max_obj_num = max_obj_num.max(*obj_num);
352 match entry {
353 XRefEntry::InUse { offset, generation } => {
354 max_offset = max_offset.max(*offset);
355 max_gen = max_gen.max(*generation);
356 }
357 XRefEntry::Free { generation, .. } => {
358 max_gen = max_gen.max(*generation);
359 }
360 XRefEntry::Compressed {
361 stream_object_number,
362 index_within_stream,
363 } => {
364 _has_compressed = true;
365 max_obj_num = max_obj_num.max(*stream_object_number);
366 max_offset = max_offset.max(*index_within_stream as u64);
367 }
368 }
369 }
370
371 let w1 = 1; let w2 = bytes_needed(max_offset.max(max_obj_num as u64));
374 let w3 = bytes_needed(max_gen as u64);
375
376 let mut stream_data = Vec::new();
378
379 for (_obj_num, entry) in &self.entries {
380 match entry {
381 XRefEntry::Free {
382 next_free_object,
383 generation,
384 } => {
385 write_field(&mut stream_data, 0, w1); write_field(&mut stream_data, *next_free_object as u64, w2);
387 write_field(&mut stream_data, *generation as u64, w3);
388 }
389 XRefEntry::InUse { offset, generation } => {
390 write_field(&mut stream_data, 1, w1); write_field(&mut stream_data, *offset, w2);
392 write_field(&mut stream_data, *generation as u64, w3);
393 }
394 XRefEntry::Compressed {
395 stream_object_number,
396 index_within_stream,
397 } => {
398 write_field(&mut stream_data, 2, w1); write_field(&mut stream_data, *stream_object_number as u64, w2);
400 write_field(&mut stream_data, *index_within_stream as u64, w3);
401 }
402 }
403 }
404
405 let mut dict = self.trailer_entries;
407 dict.insert(
408 "Type".to_string(),
409 PdfObject::Name(PdfName("XRef".to_string())),
410 );
411 dict.insert(
412 "W".to_string(),
413 PdfObject::Array(PdfArray(vec![
414 PdfObject::Integer(w1 as i64),
415 PdfObject::Integer(w2 as i64),
416 PdfObject::Integer(w3 as i64),
417 ])),
418 );
419
420 let size = self.entries.iter().map(|(n, _)| n + 1).max().unwrap_or(0);
422 dict.insert("Size".to_string(), PdfObject::Integer(size as i64));
423
424 if !self.entries.is_empty() {
426 let first = self.entries[0].0;
427 let count = self.entries.len() as u32;
428 if first != 0 {
429 dict.insert(
430 "Index".to_string(),
431 PdfObject::Array(PdfArray(vec![
432 PdfObject::Integer(first as i64),
433 PdfObject::Integer(count as i64),
434 ])),
435 );
436 }
437 }
438
439 dict.insert(
441 "Length".to_string(),
442 PdfObject::Integer(stream_data.len() as i64),
443 );
444
445 let compressed = compress_data(&stream_data)?;
447 dict.insert(
448 "Filter".to_string(),
449 PdfObject::Name(PdfName("FlateDecode".to_string())),
450 );
451
452 Ok((dict, compressed))
453 }
454}
455
/// Number of bytes required to store `value`; zero still takes one byte.
fn bytes_needed(value: u64) -> usize {
    // Count of bits up to and including the highest set bit (0 for value 0).
    let significant_bits = u64::BITS - value.leading_zeros();
    match significant_bits {
        0 => 1,
        bits => bits.div_ceil(8) as usize,
    }
}
464
/// Appends `value` to `output` as a big-endian integer of exactly `width` bytes.
///
/// When `width` is smaller than the value needs, only the low `width` bytes
/// are written; a `width` of zero writes nothing. When `width` exceeds eight,
/// the field is left-padded with zero bytes instead of shifting a `u64` by 64
/// bits or more.
fn write_field(output: &mut Vec<u8>, value: u64, width: usize) {
    const U64_BYTES: usize = std::mem::size_of::<u64>();
    let big_endian = value.to_be_bytes();

    if width > U64_BYTES {
        let padded_len = output.len() + (width - U64_BYTES);
        output.resize(padded_len, 0);
        output.extend_from_slice(&big_endian);
    } else {
        output.extend_from_slice(&big_endian[U64_BYTES - width..]);
    }
}
471
472fn compress_data(data: &[u8]) -> ParseResult<Vec<u8>> {
474 use flate2::write::ZlibEncoder;
475 use flate2::Compression;
476 use std::io::Write;
477
478 let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
479 encoder
480 .write_all(data)
481 .map_err(|e| ParseError::StreamDecodeError(format!("Compression failed: {e}")))?;
482 encoder
483 .finish()
484 .map_err(|e| ParseError::StreamDecodeError(format!("Compression failed: {e}")))
485}
486
#[cfg(test)]
mod tests {
    use super::*;

    /// Big-endian decoding of one-, two- and three-byte fields.
    #[test]
    fn test_read_field() {
        let cases: [(&[u8], u64); 4] = [
            (&[0x00], 0),
            (&[0xFF], 255),
            (&[0x01, 0x23], 0x0123),
            (&[0x12, 0x34, 0x56], 0x123456),
        ];
        for (bytes, expected) in cases {
            assert_eq!(read_field(bytes), expected);
        }
    }

    /// Big-endian encoding into fields of the requested width.
    #[test]
    fn test_write_field() {
        let encode = |value: u64, width: usize| {
            let mut buf = Vec::new();
            write_field(&mut buf, value, width);
            buf
        };

        assert_eq!(encode(0x1234, 2), vec![0x12, 0x34]);
        assert_eq!(encode(0xFF, 1), vec![0xFF]);
        assert_eq!(encode(0x123456, 3), vec![0x12, 0x34, 0x56]);
    }

    /// Width selection at each byte boundary.
    #[test]
    fn test_bytes_needed() {
        let cases: [(u64, usize); 7] = [
            (0, 1),
            (0xFF, 1),
            (0x100, 2),
            (0xFFFF, 2),
            (0x10000, 3),
            (0xFFFFFF, 3),
            (0x1000000, 4),
        ];
        for (value, expected) in cases {
            assert_eq!(bytes_needed(value), expected);
        }
    }

    /// The builder emits the mandatory dictionary keys for a mixed entry set.
    #[test]
    fn test_xref_stream_builder() {
        let mut builder = XRefStreamBuilder::new();
        let sample_entries = [
            (
                0,
                XRefEntry::Free {
                    next_free_object: 0,
                    generation: 65535,
                },
            ),
            (
                1,
                XRefEntry::InUse {
                    offset: 15,
                    generation: 0,
                },
            ),
            (
                2,
                XRefEntry::Compressed {
                    stream_object_number: 5,
                    index_within_stream: 0,
                },
            ),
        ];
        for (obj_num, entry) in sample_entries {
            builder.add_entry(obj_num, entry);
        }

        let result = builder.build();
        assert!(result.is_ok());

        let (dict, _data) = result.unwrap();

        let type_name = dict
            .get("Type")
            .and_then(|o| o.as_name())
            .map(|n| n.0.as_str());
        assert_eq!(type_name, Some("XRef"));

        for key in ["W", "Size", "Filter"] {
            assert!(dict.get(key).is_some());
        }
    }

    /// A single in-use record with widths `[1, 2, 1]`.
    #[test]
    fn test_xref_entry_parsing() {
        let xref_stream = XRefStream {
            dict: PdfDictionary::new(),
            // type = 1, second field = 0x03E8 (1000), third field = 0
            data: vec![1, 0x03, 0xE8, 0],
            widths: vec![1, 2, 1],
            index: vec![(10, 1)],
        };

        let entries = xref_stream.to_xref_entries().unwrap();
        assert_eq!(entries.len(), 1);

        let (obj_num, entry) = &entries[0];
        assert_eq!(*obj_num, 10);

        let XRefEntry::InUse { offset, generation } = entry else {
            panic!("Expected InUse entry")
        };
        assert_eq!(*offset, 1000);
        assert_eq!(*generation, 0);
    }

    /// A single compressed record with widths `[1, 2, 2]`.
    #[test]
    fn test_compressed_entry_parsing() {
        let xref_stream = XRefStream {
            dict: PdfDictionary::new(),
            // type = 2, second field = 5, third field = 3
            data: vec![2, 0x00, 0x05, 0x00, 0x03],
            widths: vec![1, 2, 2],
            index: vec![(20, 1)],
        };

        let entries = xref_stream.to_xref_entries().unwrap();
        assert_eq!(entries.len(), 1);

        let (obj_num, entry) = &entries[0];
        assert_eq!(*obj_num, 20);

        let XRefEntry::Compressed {
            stream_object_number,
            index_within_stream,
        } = entry
        else {
            panic!("Expected Compressed entry")
        };
        assert_eq!(*stream_object_number, 5);
        assert_eq!(*index_within_stream, 3);
    }

    /// Two `Index` ranges map four consecutive records to objects 0, 1, 10, 11.
    #[test]
    fn test_multiple_index_ranges() {
        #[rustfmt::skip]
        let records = vec![
            0, 0, 0,    0xFF, 0xFF,
            1, 0, 0x0A, 0,    0,
            1, 0, 0x14, 0,    0,
            1, 0, 0x1E, 0,    0,
        ];

        let xref_stream = XRefStream {
            dict: PdfDictionary::new(),
            data: records,
            widths: vec![1, 2, 2],
            index: vec![(0, 2), (10, 2)],
        };

        let entries = xref_stream.to_xref_entries().unwrap();
        assert_eq!(entries.len(), 4);

        let expected_numbers = [0u32, 1, 10, 11];
        for (slot, expected) in expected_numbers.into_iter().enumerate() {
            assert_eq!(entries[slot].0, expected);
        }
    }
}