oxidize_pdf/parser/
objects.rs

1//! PDF Object Parser
2//! 
3//! Parses PDF objects from tokens according to ISO 32000-1 Section 7.3
4
5use super::{ParseError, ParseResult};
6use super::lexer::{Lexer, Token};
7use std::collections::HashMap;
8use std::io::Read;
9
/// A PDF name object.
///
/// Wraps the name's text in an owned `String`. The type is `Eq + Hash`, which
/// is what allows it to serve as the key type of a `HashMap`.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct PdfName(pub String);
13
/// A PDF string object, held as raw bytes.
///
/// The content is kept as an uninterpreted `Vec<u8>`; nothing here assumes a
/// text encoding. Byte strings have a total equality and a natural hash, so
/// the type also derives `Eq` and `Hash` and can be used in sets and as a map
/// key.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct PdfString(pub Vec<u8>);
17
18/// PDF Array object
19#[derive(Debug, Clone, PartialEq)]
20pub struct PdfArray(pub Vec<PdfObject>);
21
22/// PDF Dictionary object
23#[derive(Debug, Clone, PartialEq)]
24pub struct PdfDictionary(pub HashMap<PdfName, PdfObject>);
25
26/// PDF Stream object
27#[derive(Debug, Clone, PartialEq)]
28pub struct PdfStream {
29    pub dict: PdfDictionary,
30    pub data: Vec<u8>,
31}
32
33impl PdfStream {
34    /// Get the decompressed stream data
35    pub fn decode(&self) -> ParseResult<Vec<u8>> {
36        super::filters::decode_stream(&self.data, &self.dict)
37    }
38    
39    /// Get the raw (possibly compressed) stream data
40    pub fn raw_data(&self) -> &[u8] {
41        &self.data
42    }
43}
44
45/// PDF Object types
46#[derive(Debug, Clone, PartialEq)]
47pub enum PdfObject {
48    Null,
49    Boolean(bool),
50    Integer(i64),
51    Real(f64),
52    String(PdfString),
53    Name(PdfName),
54    Array(PdfArray),
55    Dictionary(PdfDictionary),
56    Stream(PdfStream),
57    Reference(u32, u16), // object number, generation number
58}
59
60impl PdfObject {
61    /// Parse a PDF object from a lexer
62    pub fn parse<R: Read>(lexer: &mut Lexer<R>) -> ParseResult<Self> {
63        let token = lexer.next_token()?;
64        Self::parse_from_token(lexer, token)
65    }
66    
67    /// Parse a PDF object starting from a specific token
68    fn parse_from_token<R: Read>(lexer: &mut Lexer<R>, token: Token) -> ParseResult<Self> {
69        match token {
70            Token::Null => Ok(PdfObject::Null),
71            Token::Boolean(b) => Ok(PdfObject::Boolean(b)),
72            Token::Integer(i) => {
73                // For negative numbers or large values, don't check for references
74                if i < 0 || i > 9999999 {
75                    return Ok(PdfObject::Integer(i));
76                }
77                
78                // Check if this is part of a reference (e.g., "1 0 R")
79                match lexer.next_token()? {
80                    Token::Integer(gen) if gen >= 0 && gen <= 65535 => {
81                        // Might be a reference, check for 'R'
82                        match lexer.next_token()? {
83                            Token::Name(s) if s == "R" => {
84                                Ok(PdfObject::Reference(i as u32, gen as u16))
85                            }
86                            token => {
87                                // Not a reference, push back the tokens
88                                lexer.push_token(token);
89                                lexer.push_token(Token::Integer(gen));
90                                Ok(PdfObject::Integer(i))
91                            }
92                        }
93                    }
94                    token => {
95                        // Not a reference, just an integer
96                        lexer.push_token(token);
97                        Ok(PdfObject::Integer(i))
98                    }
99                }
100            }
101            Token::Real(r) => Ok(PdfObject::Real(r)),
102            Token::String(s) => Ok(PdfObject::String(PdfString(s))),
103            Token::Name(n) => Ok(PdfObject::Name(PdfName(n))),
104            Token::ArrayStart => Self::parse_array(lexer),
105            Token::DictStart => Self::parse_dictionary_or_stream(lexer),
106            Token::Comment(_) => {
107                // Skip comments and parse next object
108                Self::parse(lexer)
109            }
110            Token::StartXRef => {
111                // This is a PDF structure marker, not a parseable object
112                Err(ParseError::SyntaxError {
113                    position: 0,
114                    message: "StartXRef encountered - this is not a PDF object".to_string(),
115                })
116            }
117            Token::Eof => Err(ParseError::SyntaxError {
118                position: 0,
119                message: "Unexpected end of file".to_string(),
120            }),
121            _ => Err(ParseError::UnexpectedToken {
122                expected: "PDF object".to_string(),
123                found: format!("{:?}", token),
124            }),
125        }
126    }
127    
128    /// Parse a PDF array
129    fn parse_array<R: Read>(lexer: &mut Lexer<R>) -> ParseResult<Self> {
130        let mut elements = Vec::new();
131        
132        loop {
133            let token = lexer.next_token()?;
134            match token {
135                Token::ArrayEnd => break,
136                Token::Comment(_) => continue, // Skip comments
137                _ => {
138                    let obj = Self::parse_from_token(lexer, token)?;
139                    elements.push(obj);
140                }
141            }
142        }
143        
144        Ok(PdfObject::Array(PdfArray(elements)))
145    }
146    
147    /// Parse a PDF dictionary and check if it's followed by a stream
148    fn parse_dictionary_or_stream<R: Read>(lexer: &mut Lexer<R>) -> ParseResult<Self> {
149        let dict = Self::parse_dictionary_inner(lexer)?;
150        
151        // Check if this is followed by a stream
152        loop {
153            let token = lexer.next_token()?;
154            // Check for stream
155            match token {
156                Token::Stream => {
157                    // Parse stream data
158                    let stream_data = Self::parse_stream_data(lexer, &dict)?;
159                    return Ok(PdfObject::Stream(PdfStream {
160                        dict,
161                        data: stream_data,
162                    }));
163                }
164                Token::Comment(_) => {
165                    // Skip comment and continue checking
166                    continue;
167                }
168                Token::StartXRef => {
169                    // This is the end of the PDF structure, not a stream
170                    // Push the token back for later processing
171                    // Push back StartXRef token
172                    lexer.push_token(token);
173                    return Ok(PdfObject::Dictionary(dict));
174                }
175                _ => {
176                    // Not a stream, just a dictionary
177                    // Push the token back for later processing
178                    // Push back token
179                    lexer.push_token(token);
180                    return Ok(PdfObject::Dictionary(dict));
181                }
182            }
183        }
184    }
185    
186    /// Parse the inner dictionary
187    fn parse_dictionary_inner<R: Read>(lexer: &mut Lexer<R>) -> ParseResult<PdfDictionary> {
188        let mut dict = HashMap::new();
189        
190        loop {
191            let token = lexer.next_token()?;
192            match token {
193                Token::DictEnd => break,
194                Token::Comment(_) => continue, // Skip comments
195                Token::Name(key) => {
196                    let value = Self::parse(lexer)?;
197                    dict.insert(PdfName(key), value);
198                }
199                _ => {
200                    return Err(ParseError::UnexpectedToken {
201                        expected: "dictionary key (name) or >>".to_string(),
202                        found: format!("{:?}", token),
203                    });
204                }
205            }
206        }
207        
208        Ok(PdfDictionary(dict))
209    }
210    
211    /// Parse stream data
212    fn parse_stream_data<R: Read>(
213        lexer: &mut Lexer<R>,
214        dict: &PdfDictionary,
215    ) -> ParseResult<Vec<u8>> {
216        // Get the stream length from the dictionary
217        let length = dict.0.get(&PdfName("Length".to_string()))
218            .ok_or_else(|| ParseError::MissingKey("Length".to_string()))?;
219        
220        let length = match length {
221            PdfObject::Integer(len) => *len as usize,
222            PdfObject::Reference(_, _) => {
223                // In a full implementation, we'd need to resolve this reference
224                // For now, we'll return an error
225                return Err(ParseError::SyntaxError {
226                    position: lexer.position(),
227                    message: "Stream length references not yet supported".to_string(),
228                });
229            }
230            _ => {
231                return Err(ParseError::SyntaxError {
232                    position: lexer.position(),
233                    message: "Invalid stream length type".to_string(),
234                });
235            }
236        };
237        
238        // Skip the newline after 'stream' keyword
239        lexer.read_newline()?;
240        
241        // Read the actual stream data
242        let stream_data = lexer.read_bytes(length)?;
243        
244        // Skip optional whitespace before endstream
245        lexer.skip_whitespace()?;
246        
247        // Read 'endstream' keyword
248        let token = lexer.next_token()?;
249        match token {
250            Token::EndStream => Ok(stream_data),
251            _ => Err(ParseError::UnexpectedToken {
252                expected: "endstream".to_string(),
253                found: format!("{:?}", token),
254            }),
255        }
256    }
257    
258    /// Check if this object is null
259    pub fn is_null(&self) -> bool {
260        matches!(self, PdfObject::Null)
261    }
262    
263    /// Get as boolean
264    pub fn as_bool(&self) -> Option<bool> {
265        match self {
266            PdfObject::Boolean(b) => Some(*b),
267            _ => None,
268        }
269    }
270    
271    /// Get as integer
272    pub fn as_integer(&self) -> Option<i64> {
273        match self {
274            PdfObject::Integer(i) => Some(*i),
275            _ => None,
276        }
277    }
278    
279    /// Get as real number
280    pub fn as_real(&self) -> Option<f64> {
281        match self {
282            PdfObject::Real(r) => Some(*r),
283            PdfObject::Integer(i) => Some(*i as f64),
284            _ => None,
285        }
286    }
287    
288    /// Get as string
289    pub fn as_string(&self) -> Option<&PdfString> {
290        match self {
291            PdfObject::String(s) => Some(s),
292            _ => None,
293        }
294    }
295    
296    /// Get as name
297    pub fn as_name(&self) -> Option<&PdfName> {
298        match self {
299            PdfObject::Name(n) => Some(n),
300            _ => None,
301        }
302    }
303    
304    /// Get as array
305    pub fn as_array(&self) -> Option<&PdfArray> {
306        match self {
307            PdfObject::Array(a) => Some(a),
308            _ => None,
309        }
310    }
311    
312    /// Get as dictionary
313    pub fn as_dict(&self) -> Option<&PdfDictionary> {
314        match self {
315            PdfObject::Dictionary(d) => Some(d),
316            PdfObject::Stream(s) => Some(&s.dict),
317            _ => None,
318        }
319    }
320    
321    /// Get as stream
322    pub fn as_stream(&self) -> Option<&PdfStream> {
323        match self {
324            PdfObject::Stream(s) => Some(s),
325            _ => None,
326        }
327    }
328    
329    /// Get as reference
330    pub fn as_reference(&self) -> Option<(u32, u16)> {
331        match self {
332            PdfObject::Reference(obj, gen) => Some((*obj, *gen)),
333            _ => None,
334        }
335    }
336}
337
338impl PdfDictionary {
339    /// Create a new empty dictionary
340    pub fn new() -> Self {
341        PdfDictionary(HashMap::new())
342    }
343    
344    /// Get a value by key
345    pub fn get(&self, key: &str) -> Option<&PdfObject> {
346        self.0.get(&PdfName(key.to_string()))
347    }
348    
349    /// Insert a key-value pair
350    pub fn insert(&mut self, key: String, value: PdfObject) {
351        self.0.insert(PdfName(key), value);
352    }
353    
354    /// Check if dictionary contains a key
355    pub fn contains_key(&self, key: &str) -> bool {
356        self.0.contains_key(&PdfName(key.to_string()))
357    }
358    
359    /// Get the dictionary type (value of /Type key)
360    pub fn get_type(&self) -> Option<&str> {
361        self.get("Type").and_then(|obj| obj.as_name()).map(|n| n.0.as_str())
362    }
363}
364
365impl PdfArray {
366    /// Create a new empty array
367    pub fn new() -> Self {
368        PdfArray(Vec::new())
369    }
370    
371    /// Get array length
372    pub fn len(&self) -> usize {
373        self.0.len()
374    }
375    
376    /// Check if array is empty
377    pub fn is_empty(&self) -> bool {
378        self.0.is_empty()
379    }
380    
381    /// Get element at index
382    pub fn get(&self, index: usize) -> Option<&PdfObject> {
383        self.0.get(index)
384    }
385    
386    /// Push an element
387    pub fn push(&mut self, obj: PdfObject) {
388        self.0.push(obj);
389    }
390}
391
392impl PdfString {
393    /// Create a new PDF string
394    pub fn new(data: Vec<u8>) -> Self {
395        PdfString(data)
396    }
397    
398    /// Get as UTF-8 string if possible
399    pub fn as_str(&self) -> Result<&str, std::str::Utf8Error> {
400        std::str::from_utf8(&self.0)
401    }
402    
403    /// Get as bytes
404    pub fn as_bytes(&self) -> &[u8] {
405        &self.0
406    }
407}
408
409impl PdfName {
410    /// Create a new PDF name
411    pub fn new(name: String) -> Self {
412        PdfName(name)
413    }
414    
415    /// Get the name as a string
416    pub fn as_str(&self) -> &str {
417        &self.0
418    }
419}
420
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Scalar objects parse one after another from a single input.
    #[test]
    fn test_parse_simple_objects() {
        // 2.5 is exactly representable and, unlike 3.14, does not trip
        // clippy's `approx_constant` lint.
        let input = b"null true false 123 -456 2.5 /Name (Hello)";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(PdfObject::parse(&mut lexer).unwrap(), PdfObject::Null);
        assert_eq!(PdfObject::parse(&mut lexer).unwrap(), PdfObject::Boolean(true));
        assert_eq!(PdfObject::parse(&mut lexer).unwrap(), PdfObject::Boolean(false));
        assert_eq!(PdfObject::parse(&mut lexer).unwrap(), PdfObject::Integer(123));
        assert_eq!(PdfObject::parse(&mut lexer).unwrap(), PdfObject::Integer(-456));
        assert_eq!(PdfObject::parse(&mut lexer).unwrap(), PdfObject::Real(2.5));
        assert_eq!(
            PdfObject::parse(&mut lexer).unwrap(),
            PdfObject::Name(PdfName("Name".to_string()))
        );
        assert_eq!(
            PdfObject::parse(&mut lexer).unwrap(),
            PdfObject::String(PdfString(b"Hello".to_vec()))
        );
    }

    /// Consecutive integers that do not form a reference stay plain integers.
    #[test]
    fn test_parse_array() {
        let input = b"[100 200 300 /Name (test)]";
        let mut lexer = Lexer::new(Cursor::new(input));

        let obj = PdfObject::parse(&mut lexer).unwrap();
        let array = obj.as_array().unwrap();

        assert_eq!(array.len(), 5);
        assert_eq!(array.get(0).unwrap().as_integer(), Some(100));
        assert_eq!(array.get(1).unwrap().as_integer(), Some(200));
        assert_eq!(array.get(2).unwrap().as_integer(), Some(300));
        assert_eq!(array.get(3).unwrap().as_name().unwrap().as_str(), "Name");
        assert_eq!(array.get(4).unwrap().as_string().unwrap().as_bytes(), b"test");
    }

    /// `<object> <generation> R` triples become references with exact numbers.
    #[test]
    fn test_parse_array_with_references() {
        let input = b"[1 0 R 2 0 R]";
        let mut lexer = Lexer::new(Cursor::new(input));

        let obj = PdfObject::parse(&mut lexer).unwrap();
        let array = obj.as_array().unwrap();

        assert_eq!(array.len(), 2);
        assert_eq!(array.get(0).unwrap().as_reference(), Some((1, 0)));
        assert_eq!(array.get(1).unwrap().as_reference(), Some((2, 0)));
    }

    /// Dictionary values may be names, references and nested arrays.
    #[test]
    fn test_parse_dictionary() {
        let input = b"<< /Type /Page /Parent 1 0 R /MediaBox [0 0 612 792] >>";
        let mut lexer = Lexer::new(Cursor::new(input));

        let obj = PdfObject::parse(&mut lexer).unwrap();
        let dict = obj.as_dict().unwrap();

        assert_eq!(dict.get_type(), Some("Page"));
        assert_eq!(dict.get("Parent").unwrap().as_reference(), Some((1, 0)));

        // Every MediaBox number must survive the reference lookahead in order.
        let media_box = dict.get("MediaBox").unwrap().as_array().unwrap();
        let values: Vec<Option<i64>> = media_box.0.iter().map(PdfObject::as_integer).collect();
        assert_eq!(values, vec![Some(0), Some(0), Some(612), Some(792)]);
    }
}