oxidize_pdf/parser/
objects.rs

1//! PDF Object Parser - Core PDF data types and parsing
2//!
3//! This module implements parsing of all PDF object types according to ISO 32000-1 Section 7.3.
4//! PDF files are built from a small set of basic object types that can be combined to form
5//! complex data structures.
6//!
7//! # Object Types
8//!
9//! PDF supports the following basic object types:
10//! - **Null**: Represents an undefined value
11//! - **Boolean**: true or false
12//! - **Integer**: Whole numbers
13//! - **Real**: Floating-point numbers
14//! - **String**: Text data (literal or hexadecimal)
15//! - **Name**: Unique atomic symbols (e.g., /Type, /Pages)
16//! - **Array**: Ordered collections of objects
17//! - **Dictionary**: Key-value mappings where keys are names
18//! - **Stream**: Dictionary + binary data
19//! - **Reference**: Indirect reference to another object
20//!
21//! # Example
22//!
23//! ```rust
24//! use oxidize_pdf::parser::objects::{PdfObject, PdfDictionary, PdfName, PdfArray};
25//!
26//! // Create a simple page dictionary
27//! let mut dict = PdfDictionary::new();
28//! dict.insert("Type".to_string(), PdfObject::Name(PdfName::new("Page".to_string())));
29//! dict.insert("MediaBox".to_string(), PdfObject::Array(PdfArray::new()));
30//!
31//! // Check dictionary type
32//! assert_eq!(dict.get_type(), Some("Page"));
33//! ```
34
35use super::lexer::{Lexer, Token};
36use super::{ParseError, ParseOptions, ParseResult};
37use std::collections::HashMap;
38use std::io::Read;
39
40/// PDF Name object - Unique atomic symbols in PDF.
41///
42/// Names are used as keys in dictionaries and to identify various PDF constructs.
43/// They are written with a leading slash (/) in PDF syntax but stored without it.
44///
45/// # Examples
46///
47/// Common PDF names:
48/// - `/Type` - Object type identifier
49/// - `/Pages` - Page tree root
50/// - `/Font` - Font resource
51/// - `/MediaBox` - Page dimensions
52///
53/// ```rust
54/// use oxidize_pdf::parser::objects::PdfName;
55///
56/// let name = PdfName::new("Type".to_string());
57/// assert_eq!(name.as_str(), "Type");
58/// ```
59#[derive(Debug, Clone, PartialEq, Eq, Hash)]
60pub struct PdfName(pub String);
61
62/// PDF String object - Text data in PDF files.
63///
64/// PDF strings can contain arbitrary binary data and use various encodings.
65/// They can be written as literal strings `(text)` or hexadecimal strings `<48656C6C6F>`.
66///
67/// # Encoding
68///
69/// String encoding depends on context:
70/// - Text strings: Usually PDFDocEncoding or UTF-16BE
71/// - Font strings: Encoding specified by the font
72/// - Binary data: No encoding, raw bytes
73///
74/// # Example
75///
76/// ```rust
77/// use oxidize_pdf::parser::objects::PdfString;
78///
79/// // Create from UTF-8
80/// let string = PdfString::new(b"Hello World".to_vec());
81///
82/// // Try to decode as UTF-8
83/// if let Ok(text) = string.as_str() {
84///     println!("Text: {}", text);
85/// }
86/// ```
87#[derive(Debug, Clone, PartialEq)]
88pub struct PdfString(pub Vec<u8>);
89
90/// PDF Array object - Ordered collection of PDF objects.
91///
92/// Arrays can contain any PDF object type, including other arrays and dictionaries.
93/// They are written in PDF syntax as `[item1 item2 ... itemN]`.
94///
95/// # Common Uses
96///
97/// - Rectangle specifications: `[llx lly urx ury]`
98/// - Color values: `[r g b]`
99/// - Matrix transformations: `[a b c d e f]`
100/// - Resource lists
101///
102/// # Example
103///
104/// ```rust
105/// use oxidize_pdf::parser::objects::{PdfArray, PdfObject};
106///
107/// // Create a MediaBox array [0 0 612 792]
108/// let mut media_box = PdfArray::new();
109/// media_box.push(PdfObject::Integer(0));
110/// media_box.push(PdfObject::Integer(0));
111/// media_box.push(PdfObject::Integer(612));
112/// media_box.push(PdfObject::Integer(792));
113///
114/// assert_eq!(media_box.len(), 4);
115/// ```
116#[derive(Debug, Clone, PartialEq)]
117pub struct PdfArray(pub Vec<PdfObject>);
118
119/// PDF Dictionary object - Key-value mapping with name keys.
120///
121/// Dictionaries are the primary way to represent complex data structures in PDF.
122/// Keys must be PdfName objects, values can be any PDF object type.
123///
124/// # Common Dictionary Types
125///
126/// - **Catalog**: Document root (`/Type /Catalog`)
127/// - **Page**: Individual page (`/Type /Page`)
128/// - **Font**: Font definition (`/Type /Font`)
129/// - **Stream**: Binary data with metadata
130///
131/// # Example
132///
133/// ```rust
134/// use oxidize_pdf::parser::objects::{PdfDictionary, PdfObject, PdfName};
135///
136/// let mut page_dict = PdfDictionary::new();
137/// page_dict.insert("Type".to_string(),
138///     PdfObject::Name(PdfName::new("Page".to_string())));
139/// page_dict.insert("Parent".to_string(),
140///     PdfObject::Reference(2, 0)); // Reference to pages tree
141///
142/// // Access values
143/// assert_eq!(page_dict.get_type(), Some("Page"));
144/// assert!(page_dict.contains_key("Parent"));
145/// ```
146#[derive(Debug, Clone, PartialEq)]
147pub struct PdfDictionary(pub HashMap<PdfName, PdfObject>);
148
149/// PDF Stream object - Dictionary with associated binary data.
150///
151/// Streams are used for large data blocks like page content, images, fonts, etc.
152/// The dictionary describes the stream's properties (length, filters, etc.).
153///
154/// # Structure
155///
156/// - `dict`: Stream dictionary with metadata
157/// - `data`: Raw stream bytes (possibly compressed)
158///
159/// # Common Stream Types
160///
161/// - **Content streams**: Page drawing instructions
162/// - **Image XObjects**: Embedded images
163/// - **Font programs**: Embedded font data
164/// - **Form XObjects**: Reusable graphics
165///
166/// # Example
167///
168/// ```rust
169/// use oxidize_pdf::parser::objects::{PdfStream, PdfDictionary};
170/// use oxidize_pdf::parser::ParseOptions;
171///
172/// # fn example() -> Result<(), Box<dyn std::error::Error>> {
173/// # let stream = PdfStream { dict: PdfDictionary::new(), data: vec![] };
174/// // Get decompressed data
175/// let options = ParseOptions::default();
176/// let decoded = stream.decode(&options)?;
177/// println!("Decoded {} bytes", decoded.len());
178///
179/// // Access raw data
180/// let raw = stream.raw_data();
181/// println!("Raw {} bytes", raw.len());
182/// # Ok(())
183/// # }
184/// ```
185#[derive(Debug, Clone, PartialEq)]
186pub struct PdfStream {
187    /// Stream dictionary containing Length, Filter, and other properties
188    pub dict: PdfDictionary,
189    /// Raw stream data (may be compressed)
190    pub data: Vec<u8>,
191}
192
193/// Static empty array for use in lenient parsing
194pub static EMPTY_PDF_ARRAY: PdfArray = PdfArray(Vec::new());
195
196impl PdfStream {
197    /// Get the decompressed stream data.
198    ///
199    /// Automatically applies filters specified in the stream dictionary
200    /// (FlateDecode, ASCIIHexDecode, etc.) to decompress the data.
201    ///
202    /// # Arguments
203    ///
204    /// * `options` - Parse options controlling error recovery behavior
205    ///
206    /// # Returns
207    ///
208    /// The decoded/decompressed stream bytes.
209    ///
210    /// # Errors
211    ///
212    /// Returns an error if:
213    /// - Unknown filter is specified
214    /// - Decompression fails
215    /// - Filter parameters are invalid
216    ///
217    /// # Example
218    ///
219    /// ```rust,no_run
220    /// # use oxidize_pdf::parser::objects::PdfStream;
221    /// # use oxidize_pdf::parser::ParseOptions;
222    /// # fn example(stream: &PdfStream) -> Result<(), Box<dyn std::error::Error>> {
223    /// let options = ParseOptions::default();
224    /// match stream.decode(&options) {
225    ///     Ok(data) => println!("Decoded {} bytes", data.len()),
226    ///     Err(e) => println!("Decode error: {}", e),
227    /// }
228    /// # Ok(())
229    /// # }
230    /// ```
231    pub fn decode(&self, options: &ParseOptions) -> ParseResult<Vec<u8>> {
232        super::filters::decode_stream(&self.data, &self.dict, options)
233    }
234
235    /// Get the raw (possibly compressed) stream data.
236    ///
237    /// Returns the stream data exactly as stored in the PDF file,
238    /// without applying any filters or decompression.
239    ///
240    /// # Example
241    ///
242    /// ```rust
243    /// # use oxidize_pdf::parser::objects::PdfStream;
244    /// # let stream = PdfStream { dict: Default::default(), data: vec![1, 2, 3] };
245    /// let raw_data = stream.raw_data();
246    /// println!("Raw stream: {} bytes", raw_data.len());
247    /// ```
248    pub fn raw_data(&self) -> &[u8] {
249        &self.data
250    }
251}
252
253/// PDF Object types - The fundamental data types in PDF.
254///
255/// All data in a PDF file is represented using these basic types.
256/// Objects can be direct (embedded) or indirect (referenced).
257///
258/// # Object Types
259///
260/// - `Null` - Undefined/absent value
261/// - `Boolean` - true or false
262/// - `Integer` - Signed integers
263/// - `Real` - Floating-point numbers
264/// - `String` - Text or binary data
265/// - `Name` - Atomic symbols like /Type
266/// - `Array` - Ordered collections
267/// - `Dictionary` - Key-value maps
268/// - `Stream` - Dictionary + binary data
269/// - `Reference` - Indirect object reference (num gen R)
270///
271/// # Example
272///
273/// ```rust
274/// use oxidize_pdf::parser::objects::{PdfObject, PdfName, PdfString};
275///
276/// // Different object types
277/// let null = PdfObject::Null;
278/// let bool_val = PdfObject::Boolean(true);
279/// let int_val = PdfObject::Integer(42);
280/// let real_val = PdfObject::Real(3.14159);
281/// let name = PdfObject::Name(PdfName::new("Type".to_string()));
282/// let reference = PdfObject::Reference(10, 0); // 10 0 R
283///
284/// // Type checking
285/// assert!(int_val.as_integer().is_some());
286/// assert_eq!(int_val.as_integer(), Some(42));
287/// ```
288#[derive(Debug, Clone, PartialEq)]
289pub enum PdfObject {
290    /// Null object - represents undefined or absent values
291    Null,
292    /// Boolean value - true or false
293    Boolean(bool),
294    /// Integer number
295    Integer(i64),
296    /// Real (floating-point) number
297    Real(f64),
298    /// String data (literal or hexadecimal)
299    String(PdfString),
300    /// Name object - unique identifier
301    Name(PdfName),
302    /// Array - ordered collection of objects
303    Array(PdfArray),
304    /// Dictionary - unordered key-value pairs
305    Dictionary(PdfDictionary),
306    /// Stream - dictionary with binary data
307    Stream(PdfStream),
308    /// Indirect object reference (object_number, generation_number)
309    Reference(u32, u16),
310}
311
312impl PdfObject {
313    /// Parse a PDF object from a lexer.
314    ///
315    /// Reads tokens from the lexer and constructs the appropriate PDF object.
316    /// Handles all PDF object types including indirect references.
317    ///
318    /// # Arguments
319    ///
320    /// * `lexer` - Token source for parsing
321    ///
322    /// # Returns
323    ///
324    /// The parsed PDF object.
325    ///
326    /// # Errors
327    ///
328    /// Returns an error if:
329    /// - Invalid syntax is encountered
330    /// - Unexpected end of input
331    /// - Malformed object structure
332    ///
333    /// # Example
334    ///
335    /// ```rust,no_run
336    /// use oxidize_pdf::parser::lexer::Lexer;
337    /// use oxidize_pdf::parser::objects::PdfObject;
338    /// use std::io::Cursor;
339    ///
340    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
341    /// let input = b"42";
342    /// let mut lexer = Lexer::new(Cursor::new(input));
343    /// let obj = PdfObject::parse(&mut lexer)?;
344    /// assert_eq!(obj, PdfObject::Integer(42));
345    /// # Ok(())
346    /// # }
347    /// ```
348    pub fn parse<R: Read + std::io::Seek>(lexer: &mut Lexer<R>) -> ParseResult<Self> {
349        let token = lexer.next_token()?;
350        Self::parse_from_token(lexer, token)
351    }
352
353    /// Parse a PDF object with custom options
354    pub fn parse_with_options<R: Read + std::io::Seek>(
355        lexer: &mut Lexer<R>,
356        options: &super::ParseOptions,
357    ) -> ParseResult<Self> {
358        let token = lexer.next_token()?;
359        Self::parse_from_token_with_options(lexer, token, options)
360    }
361
362    /// Parse a PDF object starting from a specific token
363    fn parse_from_token<R: Read + std::io::Seek>(
364        lexer: &mut Lexer<R>,
365        token: Token,
366    ) -> ParseResult<Self> {
367        Self::parse_from_token_with_options(lexer, token, &super::ParseOptions::default())
368    }
369
370    /// Parse a PDF object starting from a specific token with custom options
371    fn parse_from_token_with_options<R: Read + std::io::Seek>(
372        lexer: &mut Lexer<R>,
373        token: Token,
374        options: &super::ParseOptions,
375    ) -> ParseResult<Self> {
376        match token {
377            Token::Null => Ok(PdfObject::Null),
378            Token::Boolean(b) => Ok(PdfObject::Boolean(b)),
379            Token::Integer(i) => {
380                // For negative numbers or large values, don't check for references
381                if !(0..=9999999).contains(&i) {
382                    return Ok(PdfObject::Integer(i));
383                }
384
385                // Check if this is part of a reference (e.g., "1 0 R")
386                match lexer.next_token()? {
387                    Token::Integer(gen) if (0..=65535).contains(&gen) => {
388                        // Might be a reference, check for 'R'
389                        match lexer.next_token()? {
390                            Token::Name(s) if s == "R" => {
391                                Ok(PdfObject::Reference(i as u32, gen as u16))
392                            }
393                            token => {
394                                // Not a reference, push back the tokens
395                                lexer.push_token(token);
396                                lexer.push_token(Token::Integer(gen));
397                                Ok(PdfObject::Integer(i))
398                            }
399                        }
400                    }
401                    token => {
402                        // Not a reference, just an integer
403                        lexer.push_token(token);
404                        Ok(PdfObject::Integer(i))
405                    }
406                }
407            }
408            Token::Real(r) => Ok(PdfObject::Real(r)),
409            Token::String(s) => Ok(PdfObject::String(PdfString(s))),
410            Token::Name(n) => Ok(PdfObject::Name(PdfName(n))),
411            Token::ArrayStart => Self::parse_array_with_options(lexer, options),
412            Token::DictStart => Self::parse_dictionary_or_stream_with_options(lexer, options),
413            Token::Comment(_) => {
414                // Skip comments and parse next object
415                Self::parse_with_options(lexer, options)
416            }
417            Token::StartXRef => {
418                // This is a PDF structure marker, not a parseable object
419                Err(ParseError::SyntaxError {
420                    position: 0,
421                    message: "StartXRef encountered - this is not a PDF object".to_string(),
422                })
423            }
424            Token::Eof => Err(ParseError::SyntaxError {
425                position: 0,
426                message: "Unexpected end of file".to_string(),
427            }),
428            _ => Err(ParseError::UnexpectedToken {
429                expected: "PDF object".to_string(),
430                found: format!("{token:?}"),
431            }),
432        }
433    }
434
435    /// Parse a PDF array with custom options
436    fn parse_array_with_options<R: Read + std::io::Seek>(
437        lexer: &mut Lexer<R>,
438        options: &super::ParseOptions,
439    ) -> ParseResult<Self> {
440        let mut elements = Vec::new();
441
442        loop {
443            let token = lexer.next_token()?;
444            match token {
445                Token::ArrayEnd => break,
446                Token::Comment(_) => continue, // Skip comments
447                _ => {
448                    let obj = Self::parse_from_token_with_options(lexer, token, options)?;
449                    elements.push(obj);
450                }
451            }
452        }
453
454        Ok(PdfObject::Array(PdfArray(elements)))
455    }
456
457    /// Parse a PDF dictionary and check if it's followed by a stream with custom options
458    fn parse_dictionary_or_stream_with_options<R: Read + std::io::Seek>(
459        lexer: &mut Lexer<R>,
460        options: &super::ParseOptions,
461    ) -> ParseResult<Self> {
462        let dict = Self::parse_dictionary_inner_with_options(lexer, options)?;
463
464        // Check if this is followed by a stream
465        loop {
466            let token = lexer.next_token()?;
467            // Check for stream
468            match token {
469                Token::Stream => {
470                    // Parse stream data
471                    let stream_data = Self::parse_stream_data_with_options(lexer, &dict, options)?;
472                    return Ok(PdfObject::Stream(PdfStream {
473                        dict,
474                        data: stream_data,
475                    }));
476                }
477                Token::Comment(_) => {
478                    // Skip comment and continue checking
479                    continue;
480                }
481                Token::StartXRef => {
482                    // This is the end of the PDF structure, not a stream
483                    // Push the token back for later processing
484                    // Push back StartXRef token
485                    lexer.push_token(token);
486                    return Ok(PdfObject::Dictionary(dict));
487                }
488                _ => {
489                    // Not a stream, just a dictionary
490                    // Push the token back for later processing
491                    // Push back token
492                    lexer.push_token(token);
493                    return Ok(PdfObject::Dictionary(dict));
494                }
495            }
496        }
497    }
498
499    /// Parse the inner dictionary with custom options
500    fn parse_dictionary_inner_with_options<R: Read + std::io::Seek>(
501        lexer: &mut Lexer<R>,
502        options: &super::ParseOptions,
503    ) -> ParseResult<PdfDictionary> {
504        let mut dict = HashMap::new();
505
506        loop {
507            let token = lexer.next_token()?;
508            match token {
509                Token::DictEnd => break,
510                Token::Comment(_) => continue, // Skip comments
511                Token::Name(key) => {
512                    let value = Self::parse_with_options(lexer, options)?;
513                    dict.insert(PdfName(key), value);
514                }
515                _ => {
516                    return Err(ParseError::UnexpectedToken {
517                        expected: "dictionary key (name) or >>".to_string(),
518                        found: format!("{token:?}"),
519                    });
520                }
521            }
522        }
523
524        Ok(PdfDictionary(dict))
525    }
526
527    /// Parse stream data with custom options
528    fn parse_stream_data_with_options<R: Read + std::io::Seek>(
529        lexer: &mut Lexer<R>,
530        dict: &PdfDictionary,
531        options: &super::ParseOptions,
532    ) -> ParseResult<Vec<u8>> {
533        // Get the stream length from the dictionary
534        let length = dict
535            .0
536            .get(&PdfName("Length".to_string()))
537            .or_else(|| {
538                // If Length is missing and we have lenient parsing, try to find endstream
539                if options.lenient_streams {
540                    if options.collect_warnings {
541                        eprintln!("Warning: Missing Length key in stream dictionary, will search for endstream marker");
542                    }
543                    // Return a special marker to indicate we need to search for endstream
544                    Some(&PdfObject::Integer(-1))
545                } else {
546                    None
547                }
548            })
549            .ok_or_else(|| ParseError::MissingKey("Length".to_string()))?;
550
551        let length = match length {
552            PdfObject::Integer(len) => {
553                if *len == -1 {
554                    // Special marker for missing length - we need to search for endstream
555                    usize::MAX // We'll handle this specially below
556                } else {
557                    *len as usize
558                }
559            }
560            PdfObject::Reference(obj_num, gen_num) => {
561                // Stream length is an indirect reference - we'll need to resolve it
562                // For now, we'll use a fallback approach: search for endstream marker
563                // This maintains compatibility while we implement proper reference resolution
564                if options.lenient_streams {
565                    if options.collect_warnings {
566                        eprintln!("Warning: Stream length is an indirect reference ({obj_num} {gen_num} R). Using endstream detection fallback.");
567                    }
568                    usize::MAX // This will trigger the endstream search below
569                } else {
570                    return Err(ParseError::SyntaxError {
571                        position: lexer.position(),
572                        message: format!("Stream length reference ({obj_num} {gen_num} R) requires lenient mode or reference resolution"),
573                    });
574                }
575            }
576            _ => {
577                return Err(ParseError::SyntaxError {
578                    position: lexer.position(),
579                    message: "Invalid stream length type".to_string(),
580                });
581            }
582        };
583
584        // Skip the newline after 'stream' keyword
585        lexer.read_newline()?;
586
587        // Read the actual stream data
588        let mut stream_data = if length == usize::MAX {
589            // Missing length - search for endstream marker
590            let mut data = Vec::new();
591            let max_search = 65536; // Search up to 64KB
592            let mut found_endstream = false;
593
594            for _ in 0..max_search {
595                match lexer.peek_byte() {
596                    Ok(b) => {
597                        // Check if we might be at "endstream"
598                        if b == b'e' {
599                            let pos = lexer.position();
600                            // Try to read "endstream"
601                            if let Ok(Token::EndStream) = lexer.peek_token() {
602                                found_endstream = true;
603                                break;
604                            }
605                            // Not endstream, continue
606                            lexer.seek(pos as u64)?;
607                        }
608                        data.push(lexer.read_byte()?);
609                    }
610                    Err(_) => break,
611                }
612            }
613
614            if !found_endstream && !options.lenient_streams {
615                return Err(ParseError::SyntaxError {
616                    position: lexer.position(),
617                    message: "Could not find endstream marker".to_string(),
618                });
619            }
620
621            data
622        } else {
623            lexer.read_bytes(length)?
624        };
625
626        // Skip optional whitespace before endstream
627        lexer.skip_whitespace()?;
628
629        // Check if we have the endstream keyword where expected
630        let peek_result = lexer.peek_token();
631
632        match peek_result {
633            Ok(Token::EndStream) => {
634                // Everything is fine, consume the token
635                lexer.next_token()?;
636                Ok(stream_data)
637            }
638            Ok(other_token) => {
639                if options.lenient_streams {
640                    // Try to find endstream within max_recovery_bytes
641                    eprintln!("Warning: Stream length mismatch. Expected 'endstream' after {length} bytes, got {other_token:?}");
642
643                    if let Some(additional_bytes) =
644                        lexer.find_keyword_ahead("endstream", options.max_recovery_bytes)?
645                    {
646                        // Read the additional bytes
647                        let extra_data = lexer.read_bytes(additional_bytes)?;
648                        stream_data.extend_from_slice(&extra_data);
649
650                        let actual_length = stream_data.len();
651                        eprintln!(
652                            "Stream length corrected: declared={length}, actual={actual_length}"
653                        );
654
655                        // Skip whitespace and consume endstream
656                        lexer.skip_whitespace()?;
657                        lexer.expect_keyword("endstream")?;
658
659                        Ok(stream_data)
660                    } else {
661                        // Couldn't find endstream within recovery distance
662                        Err(ParseError::SyntaxError {
663                            position: lexer.position(),
664                            message: format!(
665                                "Could not find 'endstream' within {} bytes",
666                                options.max_recovery_bytes
667                            ),
668                        })
669                    }
670                } else {
671                    // Strict mode - return error
672                    Err(ParseError::UnexpectedToken {
673                        expected: "endstream".to_string(),
674                        found: format!("{other_token:?}"),
675                    })
676                }
677            }
678            Err(e) => {
679                if options.lenient_streams {
680                    // Try to find endstream within max_recovery_bytes
681                    eprintln!(
682                        "Warning: Stream length mismatch. Could not peek next token after {length} bytes"
683                    );
684
685                    if let Some(additional_bytes) =
686                        lexer.find_keyword_ahead("endstream", options.max_recovery_bytes)?
687                    {
688                        // Read the additional bytes
689                        let extra_data = lexer.read_bytes(additional_bytes)?;
690                        stream_data.extend_from_slice(&extra_data);
691
692                        let actual_length = stream_data.len();
693                        eprintln!(
694                            "Stream length corrected: declared={length}, actual={actual_length}"
695                        );
696
697                        // Skip whitespace and consume endstream
698                        lexer.skip_whitespace()?;
699                        lexer.expect_keyword("endstream")?;
700
701                        Ok(stream_data)
702                    } else {
703                        // Couldn't find endstream within recovery distance
704                        Err(ParseError::SyntaxError {
705                            position: lexer.position(),
706                            message: format!(
707                                "Could not find 'endstream' within {} bytes",
708                                options.max_recovery_bytes
709                            ),
710                        })
711                    }
712                } else {
713                    // Strict mode - propagate the error
714                    Err(e)
715                }
716            }
717        }
718    }
719
720    /// Check if this object is null.
721    ///
722    /// # Example
723    ///
724    /// ```rust
725    /// use oxidize_pdf::parser::objects::PdfObject;
726    ///
727    /// assert!(PdfObject::Null.is_null());
728    /// assert!(!PdfObject::Integer(42).is_null());
729    /// ```
730    pub fn is_null(&self) -> bool {
731        matches!(self, PdfObject::Null)
732    }
733
734    /// Get the value as a boolean if this is a Boolean object.
735    ///
736    /// # Returns
737    ///
738    /// Some(bool) if this is a Boolean object, None otherwise.
739    ///
740    /// # Example
741    ///
742    /// ```rust
743    /// use oxidize_pdf::parser::objects::PdfObject;
744    ///
745    /// let obj = PdfObject::Boolean(true);
746    /// assert_eq!(obj.as_bool(), Some(true));
747    ///
748    /// let obj = PdfObject::Integer(1);
749    /// assert_eq!(obj.as_bool(), None);
750    /// ```
751    pub fn as_bool(&self) -> Option<bool> {
752        match self {
753            PdfObject::Boolean(b) => Some(*b),
754            _ => None,
755        }
756    }
757
758    /// Get as integer
759    pub fn as_integer(&self) -> Option<i64> {
760        match self {
761            PdfObject::Integer(i) => Some(*i),
762            _ => None,
763        }
764    }
765
766    /// Get the value as a real number.
767    ///
768    /// Returns the value for both Real and Integer objects,
769    /// converting integers to floating-point.
770    ///
771    /// # Returns
772    ///
773    /// Some(f64) if this is a numeric object, None otherwise.
774    ///
775    /// # Example
776    ///
777    /// ```rust
778    /// use oxidize_pdf::parser::objects::PdfObject;
779    ///
780    /// let real_obj = PdfObject::Real(3.14);
781    /// assert_eq!(real_obj.as_real(), Some(3.14));
782    ///
783    /// let int_obj = PdfObject::Integer(42);
784    /// assert_eq!(int_obj.as_real(), Some(42.0));
785    /// ```
786    pub fn as_real(&self) -> Option<f64> {
787        match self {
788            PdfObject::Real(r) => Some(*r),
789            PdfObject::Integer(i) => Some(*i as f64),
790            _ => None,
791        }
792    }
793
794    /// Get as string
795    pub fn as_string(&self) -> Option<&PdfString> {
796        match self {
797            PdfObject::String(s) => Some(s),
798            _ => None,
799        }
800    }
801
802    /// Get as name
803    pub fn as_name(&self) -> Option<&PdfName> {
804        match self {
805            PdfObject::Name(n) => Some(n),
806            _ => None,
807        }
808    }
809
810    /// Get as array
811    pub fn as_array(&self) -> Option<&PdfArray> {
812        match self {
813            PdfObject::Array(a) => Some(a),
814            _ => None,
815        }
816    }
817
818    /// Get as dictionary
819    pub fn as_dict(&self) -> Option<&PdfDictionary> {
820        match self {
821            PdfObject::Dictionary(d) => Some(d),
822            PdfObject::Stream(s) => Some(&s.dict),
823            _ => None,
824        }
825    }
826
827    /// Get as stream
828    pub fn as_stream(&self) -> Option<&PdfStream> {
829        match self {
830            PdfObject::Stream(s) => Some(s),
831            _ => None,
832        }
833    }
834
835    /// Get the object reference if this is a Reference object.
836    ///
837    /// # Returns
838    ///
839    /// Some((object_number, generation_number)) if this is a Reference, None otherwise.
840    ///
841    /// # Example
842    ///
843    /// ```rust
844    /// use oxidize_pdf::parser::objects::PdfObject;
845    ///
846    /// let obj = PdfObject::Reference(10, 0);
847    /// assert_eq!(obj.as_reference(), Some((10, 0)));
848    ///
849    /// // Use for resolving references
850    /// if let Some((obj_num, gen_num)) = obj.as_reference() {
851    ///     println!("Reference to {} {} R", obj_num, gen_num);
852    /// }
853    /// ```
854    pub fn as_reference(&self) -> Option<(u32, u16)> {
855        match self {
856            PdfObject::Reference(obj, gen) => Some((*obj, *gen)),
857            _ => None,
858        }
859    }
860}
861
862impl Default for PdfDictionary {
863    fn default() -> Self {
864        Self::new()
865    }
866}
867
868impl PdfDictionary {
869    /// Create a new empty dictionary.
870    ///
871    /// # Example
872    ///
873    /// ```rust
874    /// use oxidize_pdf::parser::objects::{PdfDictionary, PdfObject, PdfName};
875    ///
876    /// let mut dict = PdfDictionary::new();
877    /// dict.insert("Type".to_string(), PdfObject::Name(PdfName::new("Font".to_string())));
878    /// ```
879    pub fn new() -> Self {
880        PdfDictionary(HashMap::new())
881    }
882
883    /// Get a value by key name.
884    ///
885    /// # Arguments
886    ///
887    /// * `key` - The key name (without leading slash)
888    ///
889    /// # Returns
890    ///
891    /// Reference to the value if the key exists, None otherwise.
892    ///
893    /// # Example
894    ///
895    /// ```rust
896    /// use oxidize_pdf::parser::objects::{PdfDictionary, PdfObject};
897    ///
898    /// let mut dict = PdfDictionary::new();
899    /// dict.insert("Length".to_string(), PdfObject::Integer(1000));
900    ///
901    /// if let Some(length) = dict.get("Length").and_then(|o| o.as_integer()) {
902    ///     println!("Stream length: {}", length);
903    /// }
904    /// ```
905    pub fn get(&self, key: &str) -> Option<&PdfObject> {
906        self.0.get(&PdfName(key.to_string()))
907    }
908
909    /// Insert a key-value pair
910    pub fn insert(&mut self, key: String, value: PdfObject) {
911        self.0.insert(PdfName(key), value);
912    }
913
914    /// Check if dictionary contains a key
915    pub fn contains_key(&self, key: &str) -> bool {
916        self.0.contains_key(&PdfName(key.to_string()))
917    }
918
919    /// Get the dictionary type (value of /Type key).
920    ///
921    /// Many PDF dictionaries have a /Type entry that identifies their purpose.
922    ///
923    /// # Returns
924    ///
925    /// The type name if present, None otherwise.
926    ///
927    /// # Common Types
928    ///
929    /// - "Catalog" - Document catalog
930    /// - "Page" - Page object
931    /// - "Pages" - Page tree node
932    /// - "Font" - Font dictionary
933    /// - "XObject" - External object
934    ///
935    /// # Example
936    ///
937    /// ```rust
938    /// use oxidize_pdf::parser::objects::{PdfDictionary, PdfObject, PdfName};
939    ///
940    /// let mut dict = PdfDictionary::new();
941    /// dict.insert("Type".to_string(), PdfObject::Name(PdfName::new("Page".to_string())));
942    /// assert_eq!(dict.get_type(), Some("Page"));
943    /// ```
944    pub fn get_type(&self) -> Option<&str> {
945        self.get("Type")
946            .and_then(|obj| obj.as_name())
947            .map(|n| n.0.as_str())
948    }
949}
950
951impl Default for PdfArray {
952    fn default() -> Self {
953        Self::new()
954    }
955}
956
957impl PdfArray {
958    /// Create a new empty array
959    pub fn new() -> Self {
960        PdfArray(Vec::new())
961    }
962
963    /// Get array length
964    pub fn len(&self) -> usize {
965        self.0.len()
966    }
967
968    /// Check if array is empty
969    pub fn is_empty(&self) -> bool {
970        self.0.is_empty()
971    }
972
973    /// Get element at index.
974    ///
975    /// # Arguments
976    ///
977    /// * `index` - Zero-based index
978    ///
979    /// # Returns
980    ///
981    /// Reference to the element if index is valid, None otherwise.
982    ///
983    /// # Example
984    ///
985    /// ```rust
986    /// use oxidize_pdf::parser::objects::{PdfArray, PdfObject};
987    ///
988    /// let mut array = PdfArray::new();
989    /// array.push(PdfObject::Integer(10));
990    /// array.push(PdfObject::Integer(20));
991    ///
992    /// assert_eq!(array.get(0).and_then(|o| o.as_integer()), Some(10));
993    /// assert_eq!(array.get(1).and_then(|o| o.as_integer()), Some(20));
994    /// assert!(array.get(2).is_none());
995    /// ```
996    pub fn get(&self, index: usize) -> Option<&PdfObject> {
997        self.0.get(index)
998    }
999
1000    /// Push an element
1001    pub fn push(&mut self, obj: PdfObject) {
1002        self.0.push(obj);
1003    }
1004}
1005
1006impl PdfString {
1007    /// Create a new PDF string
1008    pub fn new(data: Vec<u8>) -> Self {
1009        PdfString(data)
1010    }
1011
1012    /// Get as UTF-8 string if possible.
1013    ///
1014    /// Attempts to decode the string bytes as UTF-8.
1015    /// Note that PDF strings may use other encodings.
1016    ///
1017    /// # Returns
1018    ///
1019    /// Ok(&str) if valid UTF-8, Err otherwise.
1020    ///
1021    /// # Example
1022    ///
1023    /// ```rust
1024    /// use oxidize_pdf::parser::objects::PdfString;
1025    ///
1026    /// let string = PdfString::new(b"Hello".to_vec());
1027    /// assert_eq!(string.as_str(), Ok("Hello"));
1028    ///
1029    /// let binary = PdfString::new(vec![0xFF, 0xFE]);
1030    /// assert!(binary.as_str().is_err());
1031    /// ```
1032    pub fn as_str(&self) -> Result<&str, std::str::Utf8Error> {
1033        std::str::from_utf8(&self.0)
1034    }
1035
1036    /// Get as bytes
1037    pub fn as_bytes(&self) -> &[u8] {
1038        &self.0
1039    }
1040}
1041
1042impl PdfName {
1043    /// Create a new PDF name
1044    pub fn new(name: String) -> Self {
1045        PdfName(name)
1046    }
1047
1048    /// Get the name as a string
1049    pub fn as_str(&self) -> &str {
1050        &self.0
1051    }
1052}
1053
1054#[cfg(test)]
1055mod tests {
1056    use super::*;
1057    use crate::parser::lexer::Lexer;
1058    use crate::parser::ParseOptions;
1059    use std::collections::HashMap;
1060    use std::io::Cursor;
1061
1062    #[test]
1063    fn test_parse_simple_objects() {
1064        let input = b"null true false 123 -456 3.14 /Name (Hello)";
1065        let mut lexer = Lexer::new(Cursor::new(input));
1066
1067        assert_eq!(PdfObject::parse(&mut lexer).unwrap(), PdfObject::Null);
1068        assert_eq!(
1069            PdfObject::parse(&mut lexer).unwrap(),
1070            PdfObject::Boolean(true)
1071        );
1072        assert_eq!(
1073            PdfObject::parse(&mut lexer).unwrap(),
1074            PdfObject::Boolean(false)
1075        );
1076        assert_eq!(
1077            PdfObject::parse(&mut lexer).unwrap(),
1078            PdfObject::Integer(123)
1079        );
1080        assert_eq!(
1081            PdfObject::parse(&mut lexer).unwrap(),
1082            PdfObject::Integer(-456)
1083        );
1084        assert_eq!(PdfObject::parse(&mut lexer).unwrap(), PdfObject::Real(3.14));
1085        assert_eq!(
1086            PdfObject::parse(&mut lexer).unwrap(),
1087            PdfObject::Name(PdfName("Name".to_string()))
1088        );
1089        assert_eq!(
1090            PdfObject::parse(&mut lexer).unwrap(),
1091            PdfObject::String(PdfString(b"Hello".to_vec()))
1092        );
1093    }
1094
1095    #[test]
1096    fn test_parse_array() {
1097        // Test simple array without potential references
1098        let input = b"[100 200 300 /Name (test)]";
1099        let mut lexer = Lexer::new(Cursor::new(input));
1100
1101        let obj = PdfObject::parse(&mut lexer).unwrap();
1102        let array = obj.as_array().unwrap();
1103
1104        assert_eq!(array.len(), 5);
1105        assert_eq!(array.get(0).unwrap().as_integer(), Some(100));
1106        assert_eq!(array.get(1).unwrap().as_integer(), Some(200));
1107        assert_eq!(array.get(2).unwrap().as_integer(), Some(300));
1108        assert_eq!(array.get(3).unwrap().as_name().unwrap().as_str(), "Name");
1109        assert_eq!(
1110            array.get(4).unwrap().as_string().unwrap().as_bytes(),
1111            b"test"
1112        );
1113    }
1114
1115    #[test]
1116    fn test_parse_array_with_references() {
1117        // Test array with references
1118        let input = b"[1 0 R 2 0 R]";
1119        let mut lexer = Lexer::new(Cursor::new(input));
1120
1121        let obj = PdfObject::parse(&mut lexer).unwrap();
1122        let array = obj.as_array().unwrap();
1123
1124        assert_eq!(array.len(), 2);
1125        assert!(array.get(0).unwrap().as_reference().is_some());
1126        assert!(array.get(1).unwrap().as_reference().is_some());
1127    }
1128
1129    #[test]
1130    fn test_parse_dictionary() {
1131        let input = b"<< /Type /Page /Parent 1 0 R /MediaBox [0 0 612 792] >>";
1132        let mut lexer = Lexer::new(Cursor::new(input));
1133
1134        let obj = PdfObject::parse(&mut lexer).unwrap();
1135        let dict = obj.as_dict().unwrap();
1136
1137        assert_eq!(dict.get_type(), Some("Page"));
1138        assert!(dict.get("Parent").unwrap().as_reference().is_some());
1139        assert!(dict.get("MediaBox").unwrap().as_array().is_some());
1140    }
1141
1142    // Comprehensive tests for all object types and their methods
1143    mod comprehensive_tests {
1144        use super::*;
1145
1146        #[test]
1147        fn test_pdf_object_null() {
1148            let obj = PdfObject::Null;
1149            assert!(obj.is_null());
1150            assert_eq!(obj.as_bool(), None);
1151            assert_eq!(obj.as_integer(), None);
1152            assert_eq!(obj.as_real(), None);
1153            assert_eq!(obj.as_string(), None);
1154            assert_eq!(obj.as_name(), None);
1155            assert_eq!(obj.as_array(), None);
1156            assert_eq!(obj.as_dict(), None);
1157            assert_eq!(obj.as_stream(), None);
1158            assert_eq!(obj.as_reference(), None);
1159        }
1160
1161        #[test]
1162        fn test_pdf_object_boolean() {
1163            let obj_true = PdfObject::Boolean(true);
1164            let obj_false = PdfObject::Boolean(false);
1165
1166            assert!(!obj_true.is_null());
1167            assert_eq!(obj_true.as_bool(), Some(true));
1168            assert_eq!(obj_false.as_bool(), Some(false));
1169
1170            assert_eq!(obj_true.as_integer(), None);
1171            assert_eq!(obj_true.as_real(), None);
1172            assert_eq!(obj_true.as_string(), None);
1173            assert_eq!(obj_true.as_name(), None);
1174            assert_eq!(obj_true.as_array(), None);
1175            assert_eq!(obj_true.as_dict(), None);
1176            assert_eq!(obj_true.as_stream(), None);
1177            assert_eq!(obj_true.as_reference(), None);
1178        }
1179
1180        #[test]
1181        fn test_pdf_object_integer() {
1182            let obj = PdfObject::Integer(42);
1183
1184            assert!(!obj.is_null());
1185            assert_eq!(obj.as_bool(), None);
1186            assert_eq!(obj.as_integer(), Some(42));
1187            assert_eq!(obj.as_real(), Some(42.0)); // Should convert to float
1188            assert_eq!(obj.as_string(), None);
1189            assert_eq!(obj.as_name(), None);
1190            assert_eq!(obj.as_array(), None);
1191            assert_eq!(obj.as_dict(), None);
1192            assert_eq!(obj.as_stream(), None);
1193            assert_eq!(obj.as_reference(), None);
1194
1195            // Test negative integers
1196            let obj_neg = PdfObject::Integer(-123);
1197            assert_eq!(obj_neg.as_integer(), Some(-123));
1198            assert_eq!(obj_neg.as_real(), Some(-123.0));
1199
1200            // Test large integers
1201            let obj_large = PdfObject::Integer(9999999999);
1202            assert_eq!(obj_large.as_integer(), Some(9999999999));
1203            assert_eq!(obj_large.as_real(), Some(9999999999.0));
1204        }
1205
1206        #[test]
1207        fn test_pdf_object_real() {
1208            let obj = PdfObject::Real(3.14159);
1209
1210            assert!(!obj.is_null());
1211            assert_eq!(obj.as_bool(), None);
1212            assert_eq!(obj.as_integer(), None);
1213            assert_eq!(obj.as_real(), Some(3.14159));
1214            assert_eq!(obj.as_string(), None);
1215            assert_eq!(obj.as_name(), None);
1216            assert_eq!(obj.as_array(), None);
1217            assert_eq!(obj.as_dict(), None);
1218            assert_eq!(obj.as_stream(), None);
1219            assert_eq!(obj.as_reference(), None);
1220
1221            // Test negative real numbers
1222            let obj_neg = PdfObject::Real(-2.71828);
1223            assert_eq!(obj_neg.as_real(), Some(-2.71828));
1224
1225            // Test zero
1226            let obj_zero = PdfObject::Real(0.0);
1227            assert_eq!(obj_zero.as_real(), Some(0.0));
1228
1229            // Test very small numbers
1230            let obj_small = PdfObject::Real(0.000001);
1231            assert_eq!(obj_small.as_real(), Some(0.000001));
1232
1233            // Test very large numbers
1234            let obj_large = PdfObject::Real(1e10);
1235            assert_eq!(obj_large.as_real(), Some(1e10));
1236        }
1237
1238        #[test]
1239        fn test_pdf_object_string() {
1240            let string_data = b"Hello World".to_vec();
1241            let pdf_string = PdfString(string_data.clone());
1242            let obj = PdfObject::String(pdf_string);
1243
1244            assert!(!obj.is_null());
1245            assert_eq!(obj.as_bool(), None);
1246            assert_eq!(obj.as_integer(), None);
1247            assert_eq!(obj.as_real(), None);
1248            assert!(obj.as_string().is_some());
1249            assert_eq!(obj.as_string().unwrap().as_bytes(), string_data);
1250            assert_eq!(obj.as_name(), None);
1251            assert_eq!(obj.as_array(), None);
1252            assert_eq!(obj.as_dict(), None);
1253            assert_eq!(obj.as_stream(), None);
1254            assert_eq!(obj.as_reference(), None);
1255        }
1256
1257        #[test]
1258        fn test_pdf_object_name() {
1259            let name_str = "Type".to_string();
1260            let pdf_name = PdfName(name_str.clone());
1261            let obj = PdfObject::Name(pdf_name);
1262
1263            assert!(!obj.is_null());
1264            assert_eq!(obj.as_bool(), None);
1265            assert_eq!(obj.as_integer(), None);
1266            assert_eq!(obj.as_real(), None);
1267            assert_eq!(obj.as_string(), None);
1268            assert!(obj.as_name().is_some());
1269            assert_eq!(obj.as_name().unwrap().as_str(), name_str);
1270            assert_eq!(obj.as_array(), None);
1271            assert_eq!(obj.as_dict(), None);
1272            assert_eq!(obj.as_stream(), None);
1273            assert_eq!(obj.as_reference(), None);
1274        }
1275
1276        #[test]
1277        fn test_pdf_object_array() {
1278            let mut array = PdfArray::new();
1279            array.push(PdfObject::Integer(1));
1280            array.push(PdfObject::Integer(2));
1281            array.push(PdfObject::Integer(3));
1282            let obj = PdfObject::Array(array);
1283
1284            assert!(!obj.is_null());
1285            assert_eq!(obj.as_bool(), None);
1286            assert_eq!(obj.as_integer(), None);
1287            assert_eq!(obj.as_real(), None);
1288            assert_eq!(obj.as_string(), None);
1289            assert_eq!(obj.as_name(), None);
1290            assert!(obj.as_array().is_some());
1291            assert_eq!(obj.as_array().unwrap().len(), 3);
1292            assert_eq!(obj.as_dict(), None);
1293            assert_eq!(obj.as_stream(), None);
1294            assert_eq!(obj.as_reference(), None);
1295        }
1296
1297        #[test]
1298        fn test_pdf_object_dictionary() {
1299            let mut dict = PdfDictionary::new();
1300            dict.insert(
1301                "Type".to_string(),
1302                PdfObject::Name(PdfName("Page".to_string())),
1303            );
1304            dict.insert("Count".to_string(), PdfObject::Integer(5));
1305            let obj = PdfObject::Dictionary(dict);
1306
1307            assert!(!obj.is_null());
1308            assert_eq!(obj.as_bool(), None);
1309            assert_eq!(obj.as_integer(), None);
1310            assert_eq!(obj.as_real(), None);
1311            assert_eq!(obj.as_string(), None);
1312            assert_eq!(obj.as_name(), None);
1313            assert_eq!(obj.as_array(), None);
1314            assert!(obj.as_dict().is_some());
1315            assert_eq!(obj.as_dict().unwrap().0.len(), 2);
1316            assert_eq!(obj.as_stream(), None);
1317            assert_eq!(obj.as_reference(), None);
1318        }
1319
1320        #[test]
1321        fn test_pdf_object_stream() {
1322            let mut dict = PdfDictionary::new();
1323            dict.insert("Length".to_string(), PdfObject::Integer(13));
1324            let data = b"Hello, World!".to_vec();
1325            let stream = PdfStream { dict, data };
1326            let obj = PdfObject::Stream(stream);
1327
1328            assert!(!obj.is_null());
1329            assert_eq!(obj.as_bool(), None);
1330            assert_eq!(obj.as_integer(), None);
1331            assert_eq!(obj.as_real(), None);
1332            assert_eq!(obj.as_string(), None);
1333            assert_eq!(obj.as_name(), None);
1334            assert_eq!(obj.as_array(), None);
1335            assert!(obj.as_dict().is_some()); // Stream dictionary should be accessible
1336            assert!(obj.as_stream().is_some());
1337            assert_eq!(obj.as_stream().unwrap().raw_data(), b"Hello, World!");
1338            assert_eq!(obj.as_reference(), None);
1339        }
1340
1341        #[test]
1342        fn test_pdf_object_reference() {
1343            let obj = PdfObject::Reference(42, 0);
1344
1345            assert!(!obj.is_null());
1346            assert_eq!(obj.as_bool(), None);
1347            assert_eq!(obj.as_integer(), None);
1348            assert_eq!(obj.as_real(), None);
1349            assert_eq!(obj.as_string(), None);
1350            assert_eq!(obj.as_name(), None);
1351            assert_eq!(obj.as_array(), None);
1352            assert_eq!(obj.as_dict(), None);
1353            assert_eq!(obj.as_stream(), None);
1354            assert_eq!(obj.as_reference(), Some((42, 0)));
1355
1356            // Test different generations
1357            let obj_gen = PdfObject::Reference(123, 5);
1358            assert_eq!(obj_gen.as_reference(), Some((123, 5)));
1359        }
1360
1361        #[test]
1362        fn test_pdf_string_methods() {
1363            let string_data = b"Hello, World!".to_vec();
1364            let pdf_string = PdfString(string_data.clone());
1365
1366            assert_eq!(pdf_string.as_bytes(), string_data);
1367            assert_eq!(pdf_string.as_str().unwrap(), "Hello, World!");
1368            assert_eq!(pdf_string.0.len(), 13);
1369            assert!(!pdf_string.0.is_empty());
1370
1371            // Test empty string
1372            let empty_string = PdfString(vec![]);
1373            assert!(empty_string.0.is_empty());
1374            assert_eq!(empty_string.0.len(), 0);
1375
1376            // Test non-UTF-8 data
1377            let binary_data = vec![0xFF, 0xFE, 0x00, 0x48, 0x00, 0x69]; // UTF-16 "Hi"
1378            let binary_string = PdfString(binary_data.clone());
1379            assert_eq!(binary_string.as_bytes(), binary_data);
1380            assert!(binary_string.as_str().is_err()); // Should fail UTF-8 conversion
1381        }
1382
1383        #[test]
1384        fn test_pdf_name_methods() {
1385            let name_str = "Type".to_string();
1386            let pdf_name = PdfName(name_str.clone());
1387
1388            assert_eq!(pdf_name.as_str(), name_str);
1389            assert_eq!(pdf_name.0.len(), 4);
1390            assert!(!pdf_name.0.is_empty());
1391
1392            // Test empty name
1393            let empty_name = PdfName("".to_string());
1394            assert!(empty_name.0.is_empty());
1395            assert_eq!(empty_name.0.len(), 0);
1396
1397            // Test name with special characters
1398            let special_name = PdfName("Font#20Name".to_string());
1399            assert_eq!(special_name.as_str(), "Font#20Name");
1400            assert_eq!(special_name.0.len(), 11);
1401        }
1402
1403        #[test]
1404        fn test_pdf_array_methods() {
1405            let mut array = PdfArray::new();
1406            assert_eq!(array.len(), 0);
1407            assert!(array.is_empty());
1408
1409            // Test push operations
1410            array.push(PdfObject::Integer(1));
1411            array.push(PdfObject::Integer(2));
1412            array.push(PdfObject::Integer(3));
1413
1414            assert_eq!(array.len(), 3);
1415            assert!(!array.is_empty());
1416
1417            // Test get operations
1418            assert_eq!(array.get(0).unwrap().as_integer(), Some(1));
1419            assert_eq!(array.get(1).unwrap().as_integer(), Some(2));
1420            assert_eq!(array.get(2).unwrap().as_integer(), Some(3));
1421            assert!(array.get(3).is_none());
1422
1423            // Test iteration
1424            let values: Vec<i64> = array.0.iter().filter_map(|obj| obj.as_integer()).collect();
1425            assert_eq!(values, vec![1, 2, 3]);
1426
1427            // Test mixed types
1428            let mut mixed_array = PdfArray::new();
1429            mixed_array.push(PdfObject::Integer(42));
1430            mixed_array.push(PdfObject::Real(3.14));
1431            mixed_array.push(PdfObject::String(PdfString(b"text".to_vec())));
1432            mixed_array.push(PdfObject::Name(PdfName("Name".to_string())));
1433            mixed_array.push(PdfObject::Boolean(true));
1434            mixed_array.push(PdfObject::Null);
1435
1436            assert_eq!(mixed_array.len(), 6);
1437            assert_eq!(mixed_array.get(0).unwrap().as_integer(), Some(42));
1438            assert_eq!(mixed_array.get(1).unwrap().as_real(), Some(3.14));
1439            assert_eq!(
1440                mixed_array.get(2).unwrap().as_string().unwrap().as_bytes(),
1441                b"text"
1442            );
1443            assert_eq!(
1444                mixed_array.get(3).unwrap().as_name().unwrap().as_str(),
1445                "Name"
1446            );
1447            assert_eq!(mixed_array.get(4).unwrap().as_bool(), Some(true));
1448            assert!(mixed_array.get(5).unwrap().is_null());
1449        }
1450
1451        #[test]
1452        fn test_pdf_dictionary_methods() {
1453            let mut dict = PdfDictionary::new();
1454            assert_eq!(dict.0.len(), 0);
1455            assert!(dict.0.is_empty());
1456
1457            // Test insertions
1458            dict.insert(
1459                "Type".to_string(),
1460                PdfObject::Name(PdfName("Page".to_string())),
1461            );
1462            dict.insert("Count".to_string(), PdfObject::Integer(5));
1463            dict.insert("Resources".to_string(), PdfObject::Reference(10, 0));
1464
1465            assert_eq!(dict.0.len(), 3);
1466            assert!(!dict.0.is_empty());
1467
1468            // Test get operations
1469            assert_eq!(
1470                dict.get("Type").unwrap().as_name().unwrap().as_str(),
1471                "Page"
1472            );
1473            assert_eq!(dict.get("Count").unwrap().as_integer(), Some(5));
1474            assert_eq!(dict.get("Resources").unwrap().as_reference(), Some((10, 0)));
1475            assert!(dict.get("NonExistent").is_none());
1476
1477            // Test contains_key
1478            assert!(dict.contains_key("Type"));
1479            assert!(dict.contains_key("Count"));
1480            assert!(dict.contains_key("Resources"));
1481            assert!(!dict.contains_key("NonExistent"));
1482
1483            // Test get_type helper
1484            assert_eq!(dict.get_type(), Some("Page"));
1485
1486            // Test iteration
1487            let mut keys: Vec<String> = dict.0.keys().map(|k| k.0.clone()).collect();
1488            keys.sort();
1489            assert_eq!(keys, vec!["Count", "Resources", "Type"]);
1490
1491            // Test values
1492            let values: Vec<&PdfObject> = dict.0.values().collect();
1493            assert_eq!(values.len(), 3);
1494        }
1495
1496        #[test]
1497        fn test_pdf_stream_methods() {
1498            let mut dict = PdfDictionary::new();
1499            dict.insert("Length".to_string(), PdfObject::Integer(13));
1500            dict.insert(
1501                "Filter".to_string(),
1502                PdfObject::Name(PdfName("FlateDecode".to_string())),
1503            );
1504
1505            let data = b"Hello, World!".to_vec();
1506            let stream = PdfStream {
1507                dict,
1508                data: data.clone(),
1509            };
1510
1511            // Test raw data access
1512            assert_eq!(stream.raw_data(), data);
1513
1514            // Test dictionary access
1515            assert_eq!(stream.dict.get("Length").unwrap().as_integer(), Some(13));
1516            assert_eq!(
1517                stream
1518                    .dict
1519                    .get("Filter")
1520                    .unwrap()
1521                    .as_name()
1522                    .unwrap()
1523                    .as_str(),
1524                "FlateDecode"
1525            );
1526
1527            // Test decode method (this might fail if filters aren't implemented)
1528            // but we'll test that it returns a result
1529            let options = ParseOptions::default();
1530            let decode_result = stream.decode(&options);
1531            assert!(decode_result.is_ok() || decode_result.is_err());
1532        }
1533
1534        #[test]
1535        fn test_parse_complex_nested_structures() {
1536            // Test nested array
1537            let input = b"[[1 2] [3 4] [5 6]]";
1538            let mut lexer = Lexer::new(Cursor::new(input));
1539            let obj = PdfObject::parse(&mut lexer).unwrap();
1540
1541            let outer_array = obj.as_array().unwrap();
1542            assert_eq!(outer_array.len(), 3);
1543
1544            for i in 0..3 {
1545                let inner_array = outer_array.get(i).unwrap().as_array().unwrap();
1546                assert_eq!(inner_array.len(), 2);
1547                assert_eq!(
1548                    inner_array.get(0).unwrap().as_integer(),
1549                    Some((i as i64) * 2 + 1)
1550                );
1551                assert_eq!(
1552                    inner_array.get(1).unwrap().as_integer(),
1553                    Some((i as i64) * 2 + 2)
1554                );
1555            }
1556        }
1557
1558        #[test]
1559        fn test_parse_complex_dictionary() {
1560            let input = b"<< /Type /Page /Parent 1 0 R /MediaBox [0 0 612 792] /Resources << /Font << /F1 2 0 R >> /ProcSet [/PDF /Text] >> /Contents 3 0 R >>";
1561            let mut lexer = Lexer::new(Cursor::new(input));
1562            let obj = PdfObject::parse(&mut lexer).unwrap();
1563
1564            let dict = obj.as_dict().unwrap();
1565            assert_eq!(dict.get_type(), Some("Page"));
1566            assert_eq!(dict.get("Parent").unwrap().as_reference(), Some((1, 0)));
1567            assert_eq!(dict.get("Contents").unwrap().as_reference(), Some((3, 0)));
1568
1569            // Test nested MediaBox array
1570            let media_box = dict.get("MediaBox").unwrap().as_array().unwrap();
1571            assert_eq!(media_box.len(), 4);
1572            assert_eq!(media_box.get(0).unwrap().as_integer(), Some(0));
1573            assert_eq!(media_box.get(1).unwrap().as_integer(), Some(0));
1574            assert_eq!(media_box.get(2).unwrap().as_integer(), Some(612));
1575            assert_eq!(media_box.get(3).unwrap().as_integer(), Some(792));
1576
1577            // Test nested Resources dictionary
1578            let resources = dict.get("Resources").unwrap().as_dict().unwrap();
1579            assert!(resources.contains_key("Font"));
1580            assert!(resources.contains_key("ProcSet"));
1581
1582            // Test nested Font dictionary
1583            let font_dict = resources.get("Font").unwrap().as_dict().unwrap();
1584            assert_eq!(font_dict.get("F1").unwrap().as_reference(), Some((2, 0)));
1585
1586            // Test ProcSet array
1587            let proc_set = resources.get("ProcSet").unwrap().as_array().unwrap();
1588            assert_eq!(proc_set.len(), 2);
1589            assert_eq!(proc_set.get(0).unwrap().as_name().unwrap().as_str(), "PDF");
1590            assert_eq!(proc_set.get(1).unwrap().as_name().unwrap().as_str(), "Text");
1591        }
1592
1593        #[test]
1594        fn test_parse_hex_strings() {
1595            let input = b"<48656C6C6F>"; // "Hello" in hex
1596            let mut lexer = Lexer::new(Cursor::new(input));
1597            let obj = PdfObject::parse(&mut lexer).unwrap();
1598
1599            let string = obj.as_string().unwrap();
1600            assert_eq!(string.as_str().unwrap(), "Hello");
1601        }
1602
1603        #[test]
1604        fn test_parse_literal_strings() {
1605            let input = b"(Hello World)";
1606            let mut lexer = Lexer::new(Cursor::new(input));
1607            let obj = PdfObject::parse(&mut lexer).unwrap();
1608
1609            let string = obj.as_string().unwrap();
1610            assert_eq!(string.as_str().unwrap(), "Hello World");
1611        }
1612
1613        #[test]
1614        fn test_parse_string_with_escapes() {
1615            let input = b"(Hello\\nWorld\\t!)";
1616            let mut lexer = Lexer::new(Cursor::new(input));
1617            let obj = PdfObject::parse(&mut lexer).unwrap();
1618
1619            let string = obj.as_string().unwrap();
1620            // The lexer should handle escape sequences
1621            assert!(!string.as_bytes().is_empty());
1622        }
1623
1624        #[test]
1625        fn test_parse_names_with_special_chars() {
1626            let input = b"/Name#20with#20spaces";
1627            let mut lexer = Lexer::new(Cursor::new(input));
1628            let obj = PdfObject::parse(&mut lexer).unwrap();
1629
1630            let name = obj.as_name().unwrap();
1631            // The lexer should handle hex escapes in names
1632            assert!(!name.as_str().is_empty());
1633        }
1634
1635        #[test]
1636        fn test_parse_references() {
1637            let input = b"1 0 R";
1638            let mut lexer = Lexer::new(Cursor::new(input));
1639            let obj = PdfObject::parse(&mut lexer).unwrap();
1640
1641            assert_eq!(obj.as_reference(), Some((1, 0)));
1642
1643            // Test reference with higher generation
1644            let input2 = b"42 5 R";
1645            let mut lexer2 = Lexer::new(Cursor::new(input2));
1646            let obj2 = PdfObject::parse(&mut lexer2).unwrap();
1647
1648            assert_eq!(obj2.as_reference(), Some((42, 5)));
1649        }
1650
1651        #[test]
1652        fn test_parse_edge_cases() {
1653            // Test very large numbers
1654            let input = b"9223372036854775807"; // i64::MAX
1655            let mut lexer = Lexer::new(Cursor::new(input));
1656            let obj = PdfObject::parse(&mut lexer).unwrap();
1657            assert_eq!(obj.as_integer(), Some(9223372036854775807));
1658
1659            // Test very small numbers
1660            let input2 = b"-9223372036854775808"; // i64::MIN
1661            let mut lexer2 = Lexer::new(Cursor::new(input2));
1662            let obj2 = PdfObject::parse(&mut lexer2).unwrap();
1663            assert_eq!(obj2.as_integer(), Some(-9223372036854775808));
1664
1665            // Test scientific notation in reals (if supported by lexer)
1666            let input3 = b"1.23e-10";
1667            let mut lexer3 = Lexer::new(Cursor::new(input3));
1668            let obj3 = PdfObject::parse(&mut lexer3).unwrap();
1669            // The lexer might not support scientific notation, so just check it's a real
1670            assert!(obj3.as_real().is_some());
1671        }
1672
1673        #[test]
1674        fn test_parse_empty_structures() {
1675            // Test empty array
1676            let input = b"[]";
1677            let mut lexer = Lexer::new(Cursor::new(input));
1678            let obj = PdfObject::parse(&mut lexer).unwrap();
1679
1680            let array = obj.as_array().unwrap();
1681            assert_eq!(array.len(), 0);
1682            assert!(array.is_empty());
1683
1684            // Test empty dictionary
1685            let input2 = b"<< >>";
1686            let mut lexer2 = Lexer::new(Cursor::new(input2));
1687            let obj2 = PdfObject::parse(&mut lexer2).unwrap();
1688
1689            let dict = obj2.as_dict().unwrap();
1690            assert_eq!(dict.0.len(), 0);
1691            assert!(dict.0.is_empty());
1692        }
1693
1694        #[test]
1695        fn test_error_handling() {
1696            // Test malformed array
1697            let input = b"[1 2 3"; // Missing closing bracket
1698            let mut lexer = Lexer::new(Cursor::new(input));
1699            let result = PdfObject::parse(&mut lexer);
1700            assert!(result.is_err());
1701
1702            // Test malformed dictionary
1703            let input2 = b"<< /Type /Page"; // Missing closing >>
1704            let mut lexer2 = Lexer::new(Cursor::new(input2));
1705            let result2 = PdfObject::parse(&mut lexer2);
1706            assert!(result2.is_err());
1707
1708            // Test malformed reference
1709            let input3 = b"1 0 X"; // Should be R, not X
1710            let mut lexer3 = Lexer::new(Cursor::new(input3));
1711            let result3 = PdfObject::parse(&mut lexer3);
1712            // This should parse as integer 1, but the exact behavior depends on lexer implementation
1713            // Could be an error or could parse as integer 1
1714            assert!(result3.is_ok() || result3.is_err());
1715        }
1716
1717        #[test]
1718        fn test_clone_and_equality() {
1719            let obj1 = PdfObject::Integer(42);
1720            let obj2 = obj1.clone();
1721            assert_eq!(obj1, obj2);
1722
1723            let obj3 = PdfObject::Integer(43);
1724            assert_ne!(obj1, obj3);
1725
1726            // Test complex structure cloning
1727            let mut array = PdfArray::new();
1728            array.push(PdfObject::Integer(1));
1729            array.push(PdfObject::String(PdfString(b"test".to_vec())));
1730            let obj4 = PdfObject::Array(array);
1731            let obj5 = obj4.clone();
1732            assert_eq!(obj4, obj5);
1733        }
1734
1735        #[test]
1736        fn test_debug_formatting() {
1737            let obj = PdfObject::Integer(42);
1738            let debug_str = format!("{:?}", obj);
1739            assert!(debug_str.contains("Integer"));
1740            assert!(debug_str.contains("42"));
1741
1742            let name = PdfName("Type".to_string());
1743            let debug_str2 = format!("{:?}", name);
1744            assert!(debug_str2.contains("PdfName"));
1745            assert!(debug_str2.contains("Type"));
1746        }
1747
1748        #[test]
1749        fn test_performance_large_array() {
1750            let mut array = PdfArray::new();
1751            for i in 0..1000 {
1752                array.push(PdfObject::Integer(i));
1753            }
1754
1755            assert_eq!(array.len(), 1000);
1756            assert_eq!(array.get(0).unwrap().as_integer(), Some(0));
1757            assert_eq!(array.get(999).unwrap().as_integer(), Some(999));
1758
1759            // Test iteration performance
1760            let sum: i64 = array.0.iter().filter_map(|obj| obj.as_integer()).sum();
1761            assert_eq!(sum, 499500); // sum of 0..1000
1762        }
1763
1764        #[test]
1765        fn test_performance_large_dictionary() {
1766            let mut dict = PdfDictionary::new();
1767            for i in 0..1000 {
1768                dict.insert(format!("Key{}", i), PdfObject::Integer(i));
1769            }
1770
1771            assert_eq!(dict.0.len(), 1000);
1772            assert_eq!(dict.get("Key0").unwrap().as_integer(), Some(0));
1773            assert_eq!(dict.get("Key999").unwrap().as_integer(), Some(999));
1774
1775            // Test lookup performance
1776            for i in 0..1000 {
1777                assert!(dict.contains_key(&format!("Key{}", i)));
1778            }
1779        }
1780    }
1781
1782    #[test]
1783    fn test_lenient_stream_parsing_too_short() {
1784        // Create a simpler test for stream parsing
1785        // Dictionary with stream
1786        let dict = PdfDictionary(
1787            vec![(PdfName("Length".to_string()), PdfObject::Integer(10))]
1788                .into_iter()
1789                .collect::<HashMap<_, _>>(),
1790        );
1791
1792        // Create test data where actual stream is longer than declared length
1793        // Note: avoid using "stream" in the content as it confuses the keyword search
1794        let stream_content = b"This is a much longer text content than just 10 bytes";
1795        let test_data = vec![
1796            b"\n".to_vec(), // Newline after stream keyword
1797            stream_content.to_vec(),
1798            b"\nendstream".to_vec(),
1799        ]
1800        .concat();
1801
1802        // Test lenient parsing
1803        let mut cursor = Cursor::new(test_data);
1804        let mut lexer = Lexer::new(&mut cursor);
1805        let mut options = ParseOptions::default();
1806        options.lenient_streams = true;
1807        options.max_recovery_bytes = 100;
1808        options.collect_warnings = false;
1809
1810        // parse_stream_data_with_options expects the 'stream' token to have been consumed already
1811        // and will read the newline after 'stream'
1812
1813        let result = PdfObject::parse_stream_data_with_options(&mut lexer, &dict, &options);
1814        if let Err(e) = &result {
1815            eprintln!("Error in test_lenient_stream_parsing_too_short: {:?}", e);
1816            eprintln!("Warning: Stream length mismatch expected, checking if lenient parsing is working correctly");
1817        }
1818        assert!(result.is_ok());
1819
1820        let stream_data = result.unwrap();
1821        let content = String::from_utf8_lossy(&stream_data);
1822
1823        // In lenient mode, should get content up to endstream
1824        // It seems to be finding "stream" within the content and stopping early
1825        assert!(content.contains("This is a"));
1826    }
1827
1828    #[test]
1829    fn test_lenient_stream_parsing_too_long() {
1830        // Test case where declared length is longer than actual stream
1831        let dict = PdfDictionary(
1832            vec![(PdfName("Length".to_string()), PdfObject::Integer(100))]
1833                .into_iter()
1834                .collect::<HashMap<_, _>>(),
1835        );
1836
1837        // Create test data where actual stream is shorter than declared length
1838        let stream_content = b"Short";
1839        let test_data = vec![
1840            b"\n".to_vec(), // Newline after stream keyword
1841            stream_content.to_vec(),
1842            b"\nendstream".to_vec(),
1843        ]
1844        .concat();
1845
1846        // Test lenient parsing
1847        let mut cursor = Cursor::new(test_data);
1848        let mut lexer = Lexer::new(&mut cursor);
1849        let mut options = ParseOptions::default();
1850        options.lenient_streams = true;
1851        options.max_recovery_bytes = 100;
1852        options.collect_warnings = false;
1853
1854        // parse_stream_data_with_options expects the 'stream' token to have been consumed already
1855
1856        let result = PdfObject::parse_stream_data_with_options(&mut lexer, &dict, &options);
1857
1858        // When declared length is too long, it will fail to read 100 bytes
1859        // This is expected behavior - lenient mode handles incorrect lengths when
1860        // endstream is not where expected, but can't fix EOF issues
1861        assert!(result.is_err());
1862    }
1863
1864    #[test]
1865    fn test_lenient_stream_no_endstream_found() {
1866        // Test case where endstream is missing or too far away
1867        let input = b"<< /Length 10 >>
1868stream
1869This text does not contain the magic word and continues for a very long time with no proper termination...";
1870
1871        let mut cursor = Cursor::new(input.to_vec());
1872        let mut lexer = Lexer::new(&mut cursor);
1873        let mut options = ParseOptions::default();
1874        options.lenient_streams = true;
1875        options.max_recovery_bytes = 50; // Limit search - endstream not within these bytes
1876        options.collect_warnings = false;
1877
1878        let dict_token = lexer.next_token().unwrap();
1879        let obj = PdfObject::parse_from_token_with_options(&mut lexer, dict_token, &options);
1880
1881        // Should fail because endstream not found within recovery distance
1882        assert!(obj.is_err());
1883    }
1884}
oxidize_pdf/parser/objects.rs

oxidize_pdf/parser/
objects.rs