oxidize_pdf/parser/
object_stream.rs

1//! PDF Object Stream Parser
2//!
3//! Handles compressed objects stored in object streams (PDF 1.5+)
4
5use super::lexer::Lexer;
6use super::objects::{PdfObject, PdfStream};
7use super::xref::XRefEntry;
8use super::{ParseError, ParseOptions, ParseResult};
9use std::collections::HashMap;
10use std::io::Cursor;
11
12/// Represents a PDF object stream containing compressed objects
13#[derive(Debug)]
14pub struct ObjectStream {
15    /// Stream containing the objects
16    stream: PdfStream,
17    /// Number of objects in the stream
18    n: u32,
19    /// Offset of first object
20    first: u32,
21    /// Cached parsed objects
22    objects: HashMap<u32, PdfObject>,
23}
24
25impl ObjectStream {
26    /// Parse an object stream
27    pub fn parse(stream: PdfStream, options: &ParseOptions) -> ParseResult<Self> {
28        // Get required entries from stream dictionary
29        let dict = &stream.dict;
30
31        let n = dict
32            .get("N")
33            .and_then(|obj| obj.as_integer())
34            .ok_or_else(|| ParseError::MissingKey("N".to_string()))? as u32;
35
36        let first = dict
37            .get("First")
38            .and_then(|obj| obj.as_integer())
39            .ok_or_else(|| ParseError::MissingKey("First".to_string()))? as u32;
40
41        let mut obj_stream = ObjectStream {
42            stream,
43            n,
44            first,
45            objects: HashMap::new(),
46        };
47
48        // Parse all objects eagerly
49        obj_stream.parse_objects(options)?;
50
51        Ok(obj_stream)
52    }
53
54    /// Parse all objects in the stream
55    fn parse_objects(&mut self, options: &ParseOptions) -> ParseResult<()> {
56        // Decode the stream data
57        let data = self.stream.decode(options)?;
58
59        // Create a cursor for reading
60        let mut cursor = Cursor::new(&data);
61        // Pass options to lexer for more flexible parsing
62        let mut lexer = Lexer::new_with_options(&mut cursor, options.clone());
63
64        // Read object number/offset pairs
65        let mut offsets = Vec::new();
66        for _ in 0..self.n {
67            // Read object number
68            let obj_num = match lexer.next_token()? {
69                super::lexer::Token::Integer(n) => n as u32,
70                _ => {
71                    return Err(ParseError::SyntaxError {
72                        position: 0,
73                        message: "Expected object number in object stream".to_string(),
74                    })
75                }
76            };
77
78            // Read offset
79            let offset = match lexer.next_token()? {
80                super::lexer::Token::Integer(n) => n as u32,
81                _ => {
82                    return Err(ParseError::SyntaxError {
83                        position: 0,
84                        message: "Expected offset in object stream".to_string(),
85                    })
86                }
87            };
88
89            offsets.push((obj_num, offset));
90        }
91
92        // Parse each object
93        for (obj_num, offset) in offsets.iter() {
94            // Calculate absolute offset
95            let abs_offset = self.first + offset;
96
97            // Seek to object start
98            cursor.set_position(abs_offset as u64);
99            let mut obj_lexer = Lexer::new_with_options(&mut cursor, options.clone());
100
101            // Parse the object with options for more flexible parsing
102            let obj = PdfObject::parse_with_options(&mut obj_lexer, options)?;
103
104            // Store in cache
105            self.objects.insert(*obj_num, obj);
106        }
107
108        Ok(())
109    }
110
111    /// Get an object by its object number
112    pub fn get_object(&self, obj_num: u32) -> Option<&PdfObject> {
113        self.objects.get(&obj_num)
114    }
115
116    /// Get all objects
117    pub fn objects(&self) -> &HashMap<u32, PdfObject> {
118        &self.objects
119    }
120}
121
122/// Extended XRef entry to handle compressed objects
123#[derive(Debug, Clone, Copy, PartialEq)]
124pub enum XRefEntryType {
125    /// Free object
126    Free { next_free_obj: u32, generation: u16 },
127    /// Uncompressed object
128    InUse { offset: u64, generation: u16 },
129    /// Compressed object in object stream
130    Compressed {
131        stream_obj_num: u32,
132        index_in_stream: u32,
133    },
134}
135
136impl XRefEntryType {
137    /// Convert to simple XRefEntry for compatibility
138    pub fn to_simple_entry(&self) -> XRefEntry {
139        match self {
140            XRefEntryType::Free { generation, .. } => XRefEntry {
141                offset: 0,
142                generation: *generation,
143                in_use: false,
144            },
145            XRefEntryType::InUse { offset, generation } => XRefEntry {
146                offset: *offset,
147                generation: *generation,
148                in_use: true,
149            },
150            XRefEntryType::Compressed { .. } => XRefEntry {
151                offset: 0,
152                generation: 0,
153                in_use: true,
154            },
155        }
156    }
157}
158
#[cfg(test)]
mod tests {
    use super::super::objects::{PdfDictionary, PdfName};
    use super::*;
    use flate2::write::ZlibEncoder;
    use flate2::Compression;
    use std::collections::HashMap;
    use std::io::Write;

    /// Build decoded object stream data holding two objects.
    ///
    /// Layout:
    /// * header `"1 0 2 5"`: object 1 at relative offset 0, object 2 at
    ///   relative offset 5;
    /// * four spaces of padding, so the objects start at byte 11
    ///   (the value `First` would need to have);
    /// * the objects themselves: `true` and `false`.
    fn create_test_stream_data() -> Vec<u8> {
        let data = b"1 0 2 5    true false";
        data.to_vec()
    }

    /// Zlib-compress the fixture from [`create_test_stream_data`].
    // Not used by any test yet; kept for future tests of compressed streams.
    #[allow(dead_code)]
    fn create_compressed_stream_data() -> Vec<u8> {
        let data = create_test_stream_data();
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(&data).unwrap();
        encoder.finish().unwrap()
    }

    /// Build an `ObjStm` stream with no data whose dictionary holds `/Type`
    /// plus the single extra entry given.
    fn objstm_with_entry(key: &str, value: PdfObject) -> PdfStream {
        let mut dict = PdfDictionary(HashMap::new());
        dict.0.insert(
            PdfName("Type".to_string()),
            PdfObject::Name(PdfName("ObjStm".to_string())),
        );
        dict.0.insert(PdfName(key.to_string()), value);

        PdfStream { dict, data: vec![] }
    }

    #[test]
    fn test_fixture_offsets_match_data() {
        // Guards the fixture itself: the header offsets must point at the
        // objects they describe, relative to First = 11.
        let data = create_test_stream_data();
        let first = 11;

        assert_eq!(&data[first..first + 4], b"true");
        assert_eq!(&data[first + 5..], b"false");
    }

    #[test]
    fn test_xref_entry_type_free() {
        let entry = XRefEntryType::Free {
            next_free_obj: 5,
            generation: 65535,
        };

        let simple = entry.to_simple_entry();
        assert_eq!(simple.offset, 0);
        assert_eq!(simple.generation, 65535);
        assert!(!simple.in_use);
    }

    #[test]
    fn test_xref_entry_type_in_use() {
        let entry = XRefEntryType::InUse {
            offset: 1234,
            generation: 0,
        };

        let simple = entry.to_simple_entry();
        assert_eq!(simple.offset, 1234);
        assert_eq!(simple.generation, 0);
        assert!(simple.in_use);
    }

    #[test]
    fn test_xref_entry_type_compressed() {
        let entry = XRefEntryType::Compressed {
            stream_obj_num: 10,
            index_in_stream: 3,
        };

        let simple = entry.to_simple_entry();
        assert_eq!(simple.offset, 0); // Compressed entries have offset 0
        assert_eq!(simple.generation, 0);
        assert!(simple.in_use);
    }

    // Note: These tests are simplified because creating valid object stream data
    // that passes through the full parser is complex. The real testing happens
    // in integration tests with actual PDF files.

    #[test]
    fn test_object_stream_parse_missing_n() {
        // A dictionary with First but no N must be rejected.
        let stream = objstm_with_entry("First", PdfObject::Integer(10));

        let options = ParseOptions::default();
        let result = ObjectStream::parse(stream, &options);

        assert!(result.is_err());
        match result.unwrap_err() {
            ParseError::MissingKey(key) => assert_eq!(key, "N"),
            _ => panic!("Expected MissingKey error"),
        }
    }

    #[test]
    fn test_object_stream_parse_missing_first() {
        // A dictionary with N but no First must be rejected.
        let stream = objstm_with_entry("N", PdfObject::Integer(2));

        let options = ParseOptions::default();
        let result = ObjectStream::parse(stream, &options);

        assert!(result.is_err());
        match result.unwrap_err() {
            ParseError::MissingKey(key) => assert_eq!(key, "First"),
            _ => panic!("Expected MissingKey error"),
        }
    }
}
274}