// oxidize_pdf/parser/xref_stream.rs

//! Cross-reference stream support for PDF 1.5+
//!
//! This module implements cross-reference streams according to
//! ISO 32000-1:2008 Section 7.5.8 (Cross-Reference Streams).
//!
//! Cross-reference streams are an alternative to traditional xref tables,
//! providing a more compact representation and supporting compressed object streams.
8
9use crate::parser::filters::{apply_filter, apply_filter_with_params, Filter};
10use crate::parser::objects::{PdfArray, PdfDictionary, PdfName, PdfObject};
11use crate::parser::ParseOptions;
12use crate::parser::{ParseError, ParseResult};
13use std::io::{Read, Seek};
14
/// One record of a cross-reference stream.
///
/// The three variants correspond to the entry types 0, 1 and 2 that
/// `XRefStream::to_xref_entries` decodes and `XRefStreamBuilder::build` encodes.
#[derive(Clone, Debug, PartialEq)]
pub enum XRefEntry {
    /// Type 0 record: the object is free.
    Free {
        /// Object number of the next free object.
        next_free_object: u32,
        /// Generation number stored with the record.
        generation: u16,
    },
    /// Type 1 record: the object is in use and stored uncompressed.
    InUse {
        /// Byte offset of the object in the file.
        offset: u64,
        /// Generation number stored with the record.
        generation: u16,
    },
    /// Type 2 record: the object lives inside an object stream (PDF 1.5+).
    Compressed {
        /// Object number of the object stream that contains this object.
        stream_object_number: u32,
        /// Position of this object within that object stream.
        index_within_stream: u32,
    },
}
40
41/// Cross-reference stream parser
42pub struct XRefStream {
43    /// Stream dictionary
44    pub dict: PdfDictionary,
45    /// Decoded stream data
46    pub data: Vec<u8>,
47    /// Field widths from W array
48    pub widths: Vec<usize>,
49    /// Index array (pairs of [first_object_number, count])
50    pub index: Vec<(u32, u32)>,
51}
52
53impl XRefStream {
54    /// Parse a cross-reference stream
55    pub fn parse<R: Read + Seek>(
56        _reader: &mut R,
57        stream_dict: PdfDictionary,
58        stream_data: Vec<u8>,
59        _options: &ParseOptions,
60    ) -> ParseResult<Self> {
61        // Get the W (widths) array
62        let widths = stream_dict
63            .get("W")
64            .and_then(|obj| obj.as_array())
65            .ok_or_else(|| ParseError::MissingKey("W array in xref stream".to_string()))?
66            .0
67            .iter()
68            .map(|obj| {
69                obj.as_integer()
70                    .ok_or_else(|| ParseError::SyntaxError {
71                        position: 0,
72                        message: "Invalid width in W array".to_string(),
73                    })
74                    .map(|n| n as usize)
75            })
76            .collect::<ParseResult<Vec<_>>>()?;
77
78        if widths.len() != 3 {
79            return Err(ParseError::SyntaxError {
80                position: 0,
81                message: format!(
82                    "W array must have 3 elements, found {len}",
83                    len = widths.len()
84                ),
85            });
86        }
87
88        // Get the Index array if present
89        let index =
90            if let Some(index_array) = stream_dict.get("Index").and_then(|obj| obj.as_array()) {
91                let mut index_pairs = Vec::new();
92                let mut i = 0;
93                while i + 1 < index_array.len() {
94                    let first =
95                        index_array.0[i]
96                            .as_integer()
97                            .ok_or_else(|| ParseError::SyntaxError {
98                                position: 0,
99                                message: "Invalid first object number in Index".to_string(),
100                            })? as u32;
101                    let count = index_array.0[i + 1].as_integer().ok_or_else(|| {
102                        ParseError::SyntaxError {
103                            position: 0,
104                            message: "Invalid count in Index".to_string(),
105                        }
106                    })? as u32;
107                    index_pairs.push((first, count));
108                    i += 2;
109                }
110                index_pairs
111            } else {
112                // Default: start at 0, count is Size
113                let size = stream_dict
114                    .get("Size")
115                    .and_then(|obj| obj.as_integer())
116                    .ok_or_else(|| ParseError::MissingKey("Size in xref stream".to_string()))?
117                    as u32;
118                vec![(0, size)]
119            };
120
121        // Calculate entry size from W array for XRef stream predictor handling
122        let entry_size = widths.iter().sum::<usize>();
123
124        // Decode the stream data
125        let decoded_data = if let Some(filter_obj) = stream_dict.get("Filter") {
126            // Apply filters
127            match filter_obj {
128                PdfObject::Name(filter_name) => {
129                    // Use apply_filter_with_params to handle DecodeParms (like Predictor)
130                    let filter = Filter::from_name(filter_name.as_str()).ok_or_else(|| {
131                        ParseError::StreamDecodeError(format!("Unknown filter: {filter_name:?}"))
132                    })?;
133
134                    // Get DecodeParms if available
135                    let decode_params = stream_dict.get("DecodeParms");
136
137                    if let Some(params_obj) = decode_params {
138                        if let Some(mut params_dict) = params_obj.as_dict().cloned() {
139                            // FIX for Issue #83: XRef streams with PNG predictor
140                            // Override /Columns with actual entry size from W array
141                            // This fixes predictor mismatch (e.g., Columns=4 but W=[1,3,2] requires 6)
142                            if params_dict
143                                .get("Predictor")
144                                .and_then(|p| p.as_integer())
145                                .is_some()
146                            {
147                                params_dict.insert(
148                                    "Columns".to_string(),
149                                    PdfObject::Integer(entry_size as i64),
150                                );
151                            }
152                            apply_filter_with_params(&stream_data, filter, Some(&params_dict))?
153                        } else {
154                            apply_filter(&stream_data, filter)?
155                        }
156                    } else {
157                        apply_filter(&stream_data, filter)?
158                    }
159                }
160                PdfObject::Array(filters) => {
161                    let mut data = stream_data;
162                    for filter in filters.0.iter() {
163                        if let Some(filter_name) = filter.as_name() {
164                            data = apply_filter(
165                                &data,
166                                Filter::from_name(filter_name.as_str()).ok_or_else(|| {
167                                    ParseError::StreamDecodeError(format!(
168                                        "Unknown filter: {filter_name:?}"
169                                    ))
170                                })?,
171                            )?;
172                        }
173                    }
174                    data
175                }
176                _ => stream_data,
177            }
178        } else {
179            stream_data
180        };
181
182        Ok(XRefStream {
183            dict: stream_dict,
184            data: decoded_data,
185            widths,
186            index,
187        })
188    }
189
190    /// Convert the cross-reference stream to XRefTable entries
191    pub fn to_xref_entries(&self) -> ParseResult<Vec<(u32, XRefEntry)>> {
192        let mut entries = Vec::new();
193        let entry_size = self.widths.iter().sum::<usize>();
194
195        if entry_size == 0 {
196            return Err(ParseError::SyntaxError {
197                position: 0,
198                message: "Invalid entry size (0) in xref stream".to_string(),
199            });
200        }
201
202        let mut data_offset = 0;
203
204        for &(first_obj, count) in &self.index {
205            for i in 0..count {
206                if data_offset + entry_size > self.data.len() {
207                    return Err(ParseError::SyntaxError {
208                        position: data_offset,
209                        message: format!("Xref stream data truncated at obj {}", first_obj + i),
210                    });
211                }
212
213                // Read fields according to widths
214                let mut field_offset = data_offset;
215                let mut fields = Vec::new();
216
217                for &width in &self.widths {
218                    let field_value = if width == 0 {
219                        0 // Default value when width is 0
220                    } else {
221                        read_field(&self.data[field_offset..field_offset + width])
222                    };
223                    fields.push(field_value);
224                    field_offset += width;
225                }
226
227                // Interpret fields based on type
228                let entry_type = fields[0];
229                let obj_num = first_obj + i;
230
231                let entry = match entry_type {
232                    0 => {
233                        // Type 0: Free object
234                        XRefEntry::Free {
235                            next_free_object: fields[1] as u32,
236                            generation: fields[2] as u16,
237                        }
238                    }
239                    1 => {
240                        // Type 1: Uncompressed object
241                        XRefEntry::InUse {
242                            offset: fields[1],
243                            generation: fields[2] as u16,
244                        }
245                    }
246                    2 => {
247                        // Type 2: Compressed object
248                        XRefEntry::Compressed {
249                            stream_object_number: fields[1] as u32,
250                            index_within_stream: fields[2] as u32,
251                        }
252                    }
253                    _ => {
254                        return Err(ParseError::SyntaxError {
255                            position: data_offset,
256                            message: format!("Invalid xref entry type: {entry_type}"),
257                        });
258                    }
259                };
260
261                entries.push((obj_num, entry));
262                data_offset += entry_size;
263            }
264        }
265
266        Ok(entries)
267    }
268
269    /// Get the trailer dictionary from the xref stream
270    pub fn trailer_dict(&self) -> &PdfDictionary {
271        &self.dict
272    }
273
274    /// Check if this is a hybrid reference file
275    pub fn is_hybrid(&self) -> bool {
276        // A hybrid file has both xref stream and traditional xref table
277        self.dict.get("XRefStm").is_some()
278    }
279
280    /// Get the offset to additional xref stream (for hybrid files)
281    pub fn get_xref_stm_offset(&self) -> Option<u64> {
282        self.dict
283            .get("XRefStm")
284            .and_then(|obj| obj.as_integer())
285            .map(|n| n as u64)
286    }
287
288    /// Get the previous xref offset
289    pub fn get_prev_offset(&self) -> Option<u64> {
290        self.dict
291            .get("Prev")
292            .and_then(|obj| obj.as_integer())
293            .map(|n| n as u64)
294    }
295}
296
/// Decode a big-endian unsigned integer from `bytes`.
///
/// An empty slice decodes to 0.
fn read_field(bytes: &[u8]) -> u64 {
    bytes
        .iter()
        .fold(0u64, |acc, &byte| (acc << 8) | u64::from(byte))
}
305
306/// XRef stream builder for creating new xref streams
307pub struct XRefStreamBuilder {
308    /// Entries to include in the stream
309    entries: Vec<(u32, XRefEntry)>,
310    /// Additional trailer dictionary entries
311    trailer_entries: PdfDictionary,
312}
313
314impl Default for XRefStreamBuilder {
315    fn default() -> Self {
316        Self::new()
317    }
318}
319
320impl XRefStreamBuilder {
321    /// Create a new XRef stream builder
322    pub fn new() -> Self {
323        Self {
324            entries: Vec::new(),
325            trailer_entries: PdfDictionary::new(),
326        }
327    }
328
329    /// Add an entry to the xref stream
330    pub fn add_entry(&mut self, obj_num: u32, entry: XRefEntry) {
331        self.entries.push((obj_num, entry));
332    }
333
334    /// Add a trailer dictionary entry
335    pub fn add_trailer_entry(&mut self, key: &str, value: PdfObject) {
336        self.trailer_entries.insert(key.to_string(), value);
337    }
338
339    /// Build the xref stream
340    pub fn build(mut self) -> ParseResult<(PdfDictionary, Vec<u8>)> {
341        // Sort entries by object number
342        self.entries.sort_by_key(|(num, _)| *num);
343
344        // Determine field widths
345        let mut max_offset = 0u64;
346        let mut max_obj_num = 0u32;
347        let mut max_gen = 0u16;
348        let mut _has_compressed = false;
349
350        for (obj_num, entry) in &self.entries {
351            max_obj_num = max_obj_num.max(*obj_num);
352            match entry {
353                XRefEntry::InUse { offset, generation } => {
354                    max_offset = max_offset.max(*offset);
355                    max_gen = max_gen.max(*generation);
356                }
357                XRefEntry::Free { generation, .. } => {
358                    max_gen = max_gen.max(*generation);
359                }
360                XRefEntry::Compressed {
361                    stream_object_number,
362                    index_within_stream,
363                } => {
364                    _has_compressed = true;
365                    max_obj_num = max_obj_num.max(*stream_object_number);
366                    max_offset = max_offset.max(*index_within_stream as u64);
367                }
368            }
369        }
370
371        // Calculate minimum bytes needed for each field
372        let w1 = 1; // Type field (0, 1, or 2)
373        let w2 = bytes_needed(max_offset.max(max_obj_num as u64));
374        let w3 = bytes_needed(max_gen as u64);
375
376        // Build the stream data
377        let mut stream_data = Vec::new();
378
379        for (_obj_num, entry) in &self.entries {
380            match entry {
381                XRefEntry::Free {
382                    next_free_object,
383                    generation,
384                } => {
385                    write_field(&mut stream_data, 0, w1); // Type 0
386                    write_field(&mut stream_data, *next_free_object as u64, w2);
387                    write_field(&mut stream_data, *generation as u64, w3);
388                }
389                XRefEntry::InUse { offset, generation } => {
390                    write_field(&mut stream_data, 1, w1); // Type 1
391                    write_field(&mut stream_data, *offset, w2);
392                    write_field(&mut stream_data, *generation as u64, w3);
393                }
394                XRefEntry::Compressed {
395                    stream_object_number,
396                    index_within_stream,
397                } => {
398                    write_field(&mut stream_data, 2, w1); // Type 2
399                    write_field(&mut stream_data, *stream_object_number as u64, w2);
400                    write_field(&mut stream_data, *index_within_stream as u64, w3);
401                }
402            }
403        }
404
405        // Build the stream dictionary
406        let mut dict = self.trailer_entries;
407        dict.insert(
408            "Type".to_string(),
409            PdfObject::Name(PdfName("XRef".to_string())),
410        );
411        dict.insert(
412            "W".to_string(),
413            PdfObject::Array(PdfArray(vec![
414                PdfObject::Integer(w1 as i64),
415                PdfObject::Integer(w2 as i64),
416                PdfObject::Integer(w3 as i64),
417            ])),
418        );
419
420        // Add Size
421        let size = self.entries.iter().map(|(n, _)| n + 1).max().unwrap_or(0);
422        dict.insert("Size".to_string(), PdfObject::Integer(size as i64));
423
424        // Add Index array if not starting from 0
425        if !self.entries.is_empty() {
426            let first = self.entries[0].0;
427            let count = self.entries.len() as u32;
428            if first != 0 {
429                dict.insert(
430                    "Index".to_string(),
431                    PdfObject::Array(PdfArray(vec![
432                        PdfObject::Integer(first as i64),
433                        PdfObject::Integer(count as i64),
434                    ])),
435                );
436            }
437        }
438
439        // Add Length
440        dict.insert(
441            "Length".to_string(),
442            PdfObject::Integer(stream_data.len() as i64),
443        );
444
445        // Apply compression (FlateDecode)
446        let compressed = compress_data(&stream_data)?;
447        dict.insert(
448            "Filter".to_string(),
449            PdfObject::Name(PdfName("FlateDecode".to_string())),
450        );
451
452        Ok((dict, compressed))
453    }
454}
455
/// Smallest number of bytes that can hold `value`; zero still takes one byte.
fn bytes_needed(value: u64) -> usize {
    let significant_bits = u64::BITS - value.leading_zeros();
    significant_bits.div_ceil(8).max(1) as usize
}
464
/// Append `value` to `output` as a big-endian integer exactly `width` bytes long.
///
/// If `width` is smaller than needed, only the low-order `width` bytes are written.
/// If `width` exceeds 8, the field is left-padded with zero bytes (previously this
/// overflowed the shift and panicked in debug builds).
fn write_field(output: &mut Vec<u8>, value: u64, width: usize) {
    let be_bytes = value.to_be_bytes();
    // Bytes beyond the 8 a u64 can supply are zero padding at the front.
    let padding = width.saturating_sub(be_bytes.len());
    let payload = width - padding;

    output.resize(output.len() + padding, 0);
    output.extend_from_slice(&be_bytes[be_bytes.len() - payload..]);
}
471
472/// Compress data using flate compression
473fn compress_data(data: &[u8]) -> ParseResult<Vec<u8>> {
474    use flate2::write::ZlibEncoder;
475    use flate2::Compression;
476    use std::io::Write;
477
478    let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
479    encoder
480        .write_all(data)
481        .map_err(|e| ParseError::StreamDecodeError(format!("Compression failed: {e}")))?;
482    encoder
483        .finish()
484        .map_err(|e| ParseError::StreamDecodeError(format!("Compression failed: {e}")))
485}
486
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an `XRefStream` directly from already-decoded record bytes.
    fn stream_from(data: Vec<u8>, widths: &[usize], index: &[(u32, u32)]) -> XRefStream {
        XRefStream {
            dict: PdfDictionary::new(),
            data,
            widths: widths.to_vec(),
            index: index.to_vec(),
        }
    }

    #[test]
    fn test_read_field() {
        let cases: &[(&[u8], u64)] = &[
            (&[0x00], 0),
            (&[0xFF], 255),
            (&[0x01, 0x23], 0x0123),
            (&[0x12, 0x34, 0x56], 0x123456),
        ];
        for &(bytes, expected) in cases {
            assert_eq!(read_field(bytes), expected);
        }
    }

    #[test]
    fn test_write_field() {
        let cases: &[(u64, usize, &[u8])] = &[
            (0x1234, 2, &[0x12, 0x34]),
            (0xFF, 1, &[0xFF]),
            (0x123456, 3, &[0x12, 0x34, 0x56]),
        ];
        for &(value, width, expected) in cases {
            let mut data = Vec::new();
            write_field(&mut data, value, width);
            assert_eq!(data, expected.to_vec());
        }
    }

    #[test]
    fn test_bytes_needed() {
        let cases: &[(u64, usize)] = &[
            (0, 1),
            (0xFF, 1),
            (0x100, 2),
            (0xFFFF, 2),
            (0x10000, 3),
            (0xFFFFFF, 3),
            (0x1000000, 4),
        ];
        for &(value, expected) in cases {
            assert_eq!(bytes_needed(value), expected);
        }
    }

    #[test]
    fn test_xref_stream_builder() {
        let mut builder = XRefStreamBuilder::new();

        // One entry of each kind
        let sample_entries = vec![
            (
                0,
                XRefEntry::Free {
                    next_free_object: 0,
                    generation: 65535,
                },
            ),
            (
                1,
                XRefEntry::InUse {
                    offset: 15,
                    generation: 0,
                },
            ),
            (
                2,
                XRefEntry::Compressed {
                    stream_object_number: 5,
                    index_within_stream: 0,
                },
            ),
        ];
        for (obj_num, entry) in sample_entries {
            builder.add_entry(obj_num, entry);
        }

        let result = builder.build();
        assert!(result.is_ok());

        let (dict, _data) = result.unwrap();

        // The generated dictionary must identify itself and carry the required keys
        let type_name = dict
            .get("Type")
            .and_then(|o| o.as_name())
            .map(|n| n.0.as_str());
        assert_eq!(type_name, Some("XRef"));
        for key in ["W", "Size", "Filter"] {
            assert!(dict.get(key).is_some());
        }
    }

    #[test]
    fn test_xref_entry_parsing() {
        // Single type 1 record: offset = 1000 (0x03E8), generation = 0
        let xref_stream = stream_from(vec![1, 0x03, 0xE8, 0], &[1, 2, 1], &[(10, 1)]);

        let entries = xref_stream.to_xref_entries().unwrap();
        assert_eq!(entries.len(), 1);

        let (obj_num, entry) = &entries[0];
        assert_eq!(*obj_num, 10);

        match entry {
            XRefEntry::InUse { offset, generation } => {
                assert_eq!(*offset, 1000);
                assert_eq!(*generation, 0);
            }
            _ => panic!("Expected InUse entry"),
        }
    }

    #[test]
    fn test_compressed_entry_parsing() {
        // Single type 2 record: stream object number = 5, index within stream = 3
        let xref_stream = stream_from(vec![2, 0x00, 0x05, 0x00, 0x03], &[1, 2, 2], &[(20, 1)]);

        let entries = xref_stream.to_xref_entries().unwrap();
        assert_eq!(entries.len(), 1);

        let (obj_num, entry) = &entries[0];
        assert_eq!(*obj_num, 20);

        match entry {
            XRefEntry::Compressed {
                stream_object_number,
                index_within_stream,
            } => {
                assert_eq!(*stream_object_number, 5);
                assert_eq!(*index_within_stream, 3);
            }
            _ => panic!("Expected Compressed entry"),
        }
    }

    #[test]
    fn test_multiple_index_ranges() {
        // Two ranges of two records each; every record is 1 + 2 + 2 = 5 bytes
        let entry_data = vec![
            // First range: objects 0-1
            0, 0, 0, 0xFF, 0xFF, // Free object 0
            1, 0, 0x0A, 0, 0, // InUse object 1 at offset 10
            // Second range: objects 10-11
            1, 0, 0x14, 0, 0, // InUse object 10 at offset 20
            1, 0, 0x1E, 0, 0, // InUse object 11 at offset 30
        ];
        let xref_stream = stream_from(entry_data, &[1, 2, 2], &[(0, 2), (10, 2)]);

        let entries = xref_stream.to_xref_entries().unwrap();
        assert_eq!(entries.len(), 4);

        // Object numbers follow the index ranges
        let expected_numbers = [0u32, 1, 10, 11];
        for (position, expected) in expected_numbers.iter().enumerate() {
            assert_eq!(entries[position].0, *expected);
        }
    }
}
665}