Skip to main content

oxidize_pdf/parser/
content.rs

1//! PDF Content Stream Parser - Complete support for PDF graphics operators
2//!
3//! This module implements comprehensive parsing of PDF content streams according to the PDF specification.
4//! Content streams contain the actual drawing instructions (operators) that render text, graphics, and images
5//! on PDF pages.
6//!
7//! # Overview
8//!
9//! Content streams are sequences of PDF operators that describe:
10//! - Text positioning and rendering
11//! - Path construction and painting
12//! - Color and graphics state management
13//! - Image and XObject placement
14//! - Coordinate transformations
15//!
16//! # Architecture
17//!
18//! The parser is divided into two main components:
19//! - `ContentTokenizer`: Low-level tokenization of content stream bytes
20//! - `ContentParser`: High-level parsing of tokens into structured operations
21//!
22//! # Example
23//!
24//! ```rust,no_run
25//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
26//!
27//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
28//! // Parse a content stream
29//! let content_stream = b"BT /F1 12 Tf 100 200 Td (Hello World) Tj ET";
30//! let operations = ContentParser::parse_content(content_stream)?;
31//!
32//! // Process operations
33//! for op in operations {
34//!     match op {
35//!         ContentOperation::BeginText => println!("Start text object"),
36//!         ContentOperation::SetFont(name, size) => println!("Font: {} at {}", name, size),
37//!         ContentOperation::ShowText(text) => println!("Text: {:?}", text),
38//!         _ => {}
39//!     }
40//! }
41//! # Ok(())
42//! # }
43//! ```
44//!
45//! # Supported Operators
46//!
47//! This parser supports all standard PDF operators including:
48//! - Text operators (BT, ET, Tj, TJ, Tf, Td, etc.)
49//! - Graphics state operators (q, Q, cm, w, J, etc.)
50//! - Path construction operators (m, l, c, re, h)
51//! - Path painting operators (S, f, B, n, etc.)
52//! - Color operators (g, rg, k, cs, scn, etc.)
53//! - XObject operators (Do)
54//! - Marked content operators (BMC, BDC, EMC, etc.)
55
56use super::{ParseError, ParseResult};
57use crate::objects::Object;
58use std::collections::HashMap;
59
60/// A single value inside a marked-content properties dictionary or array.
61///
62/// PDF marked-content properties (BDC, DP) carry typed values: strings,
63/// integers, real numbers, names, arrays, and nested dictionaries. The
64/// previous `HashMap<String, String>` carrier was lossy for `/ActualText`
65/// (UTF-16BE bytes mangled by `String::from_utf8_lossy`) and for `/MCID`
66/// (integer values stored as their decimal string representation). This
67/// enum preserves the original token type and bytes; decoding happens
68/// lazily at the extractor level (e.g. UTF-16BE detection via BOM).
69///
70/// Hex strings (`<FEFF00660069>`) and literal strings (`(text)`) both
71/// land here as `MarkedContentValue::String(Vec<u8>)` because both are
72/// raw byte sequences at the PDF tokenizer level.
73#[derive(Debug, Clone, PartialEq)]
74pub enum MarkedContentValue {
75    /// Raw PDF string bytes (from either `Token::String` or `Token::HexString`).
76    /// Decoded lazily by consumers — UTF-16BE detection via BOM happens in the
77    /// extractor's `decode_pdf_string` helper.
78    String(Vec<u8>),
79    /// PDF integer (e.g. `/MCID 0`).
80    Integer(i64),
81    /// PDF real number.
82    Real(f64),
83    /// PDF name token (e.g. `/Pagination`).
84    Name(String),
85    /// PDF array; nested values are themselves `MarkedContentValue`.
86    Array(Vec<MarkedContentValue>),
87    /// Nested dictionary; keys are PDF name strings (the leading `/` is stripped).
88    Dict(HashMap<String, MarkedContentValue>),
89}
90
91/// Properties operand of a BDC/DP operator. Two shapes per ISO 32000-1
92/// §14.6.2:
93///
94/// - **Inline**: the second BDC operand is an inline dictionary literal
95///   (`<< /MCID 0 /ActualText (fi) >>`). Keys map to `MarkedContentValue`.
96/// - **ResourceRef**: the second BDC operand is a name (`/PropsName`) that
97///   references the page's `/Resources /Properties /<name>` dictionary.
98///   Resolution against the page's resource tree happens in the extractor
99///   (parser does not have access to the page object).
100#[derive(Debug, Clone, PartialEq)]
101pub enum MarkedContentProps {
102    Inline(HashMap<String, MarkedContentValue>),
103    ResourceRef(String),
104}
105
106/// Represents a single operator in a PDF content stream.
107///
108/// Each variant corresponds to a specific PDF operator and carries the associated
109/// operands. These operations form a complete instruction set for rendering PDF content.
110///
111/// # Categories
112///
113/// Operations are grouped into several categories:
114/// - **Text Object**: BeginText, EndText
115/// - **Text State**: Font, spacing, scaling, rendering mode
116/// - **Text Positioning**: Matrix transforms, moves, line advances
117/// - **Text Showing**: Display text with various formatting
118/// - **Graphics State**: Save/restore, transforms, line properties
119/// - **Path Construction**: Move, line, curve, rectangle operations
120/// - **Path Painting**: Stroke, fill, clipping operations
121/// - **Color**: RGB, CMYK, grayscale, and color space operations
122/// - **XObject**: External graphics and form placement
123/// - **Marked Content**: Semantic tagging for accessibility
124///
125/// # Example
126///
127/// ```rust
128/// use oxidize_pdf::parser::content::{ContentOperation};
129///
130/// // Text operation
131/// let op1 = ContentOperation::ShowText(b"Hello".to_vec());
132///
133/// // Graphics operation
134/// let op2 = ContentOperation::SetLineWidth(2.0);
135///
136/// // Path operation
137/// let op3 = ContentOperation::Rectangle(10.0, 10.0, 100.0, 50.0);
138/// ```
139#[derive(Debug, Clone, PartialEq)]
140pub enum ContentOperation {
141    // Text object operators
142    /// Begin a text object (BT operator).
143    /// All text showing operations must occur within a text object.
144    BeginText,
145
146    /// End a text object (ET operator).
147    /// Closes the current text object started with BeginText.
148    EndText,
149
150    // Text state operators
151    /// Set character spacing (Tc operator).
152    /// Additional space between characters in unscaled text units.
153    SetCharSpacing(f32),
154
155    /// Set word spacing (Tw operator).
156    /// Additional space for ASCII space character (0x20) in unscaled text units.
157    SetWordSpacing(f32),
158
159    /// Set horizontal text scaling (Tz operator).
160    /// Percentage of normal width (100 = normal).
161    SetHorizontalScaling(f32),
162
163    /// Set text leading (TL operator).
164    /// Vertical distance between baselines for T* operator.
165    SetLeading(f32),
166
167    /// Set font and size (Tf operator).
168    /// Font name must match a key in the Resources/Font dictionary.
169    SetFont(String, f32),
170
171    /// Set text rendering mode (Tr operator).
172    /// 0=fill, 1=stroke, 2=fill+stroke, 3=invisible, 4=fill+clip, 5=stroke+clip, 6=fill+stroke+clip, 7=clip
173    SetTextRenderMode(i32),
174
175    /// Set text rise (Ts operator).
176    /// Vertical displacement for superscripts/subscripts in text units.
177    SetTextRise(f32),
178
179    // Text positioning operators
180    /// Move text position (Td operator).
181    /// Translates the text matrix by (tx, ty).
182    MoveText(f32, f32),
183
184    /// Move text position and set leading (TD operator).
185    /// Equivalent to: -ty TL tx ty Td
186    MoveTextSetLeading(f32, f32),
187
188    /// Set text matrix directly (Tm operator).
189    /// Parameters: [a, b, c, d, e, f] for transformation matrix.
190    SetTextMatrix(f32, f32, f32, f32, f32, f32),
191
192    /// Move to start of next line (T* operator).
193    /// Uses the current leading value set with TL.
194    NextLine,
195
196    // Text showing operators
197    /// Show text string (Tj operator).
198    /// The bytes are encoded according to the current font's encoding.
199    ShowText(Vec<u8>),
200
201    /// Show text with individual positioning (TJ operator).
202    /// Array elements can be strings or position adjustments.
203    ShowTextArray(Vec<TextElement>),
204
205    /// Move to next line and show text (' operator).
206    /// Equivalent to: T* string Tj
207    NextLineShowText(Vec<u8>),
208
209    /// Set spacing, move to next line, and show text (" operator).
210    /// Equivalent to: word_spacing Tw char_spacing Tc string '
211    SetSpacingNextLineShowText(f32, f32, Vec<u8>),
212
213    // Graphics state operators
214    /// Save current graphics state (q operator).
215    /// Pushes the entire graphics state onto a stack.
216    SaveGraphicsState,
217
218    /// Restore graphics state (Q operator).
219    /// Pops the graphics state from the stack.
220    RestoreGraphicsState,
221
222    /// Concatenate matrix to current transformation matrix (cm operator).
223    /// Modifies the CTM: CTM' = CTM × [a b c d e f]
224    SetTransformMatrix(f32, f32, f32, f32, f32, f32),
225
226    /// Set line width (w operator) in user space units.
227    SetLineWidth(f32),
228
229    /// Set line cap style (J operator).
230    /// 0=butt cap, 1=round cap, 2=projecting square cap
231    SetLineCap(i32),
232
233    /// Set line join style (j operator).
234    /// 0=miter join, 1=round join, 2=bevel join
235    SetLineJoin(i32),
236
237    /// Set miter limit (M operator).
238    /// Maximum ratio of miter length to line width.
239    SetMiterLimit(f32),
240
241    /// Set dash pattern (d operator).
242    /// Array of dash/gap lengths and starting phase.
243    SetDashPattern(Vec<f32>, f32),
244
245    /// Set rendering intent (ri operator).
246    /// Color rendering intent: /AbsoluteColorimetric, /RelativeColorimetric, /Saturation, /Perceptual
247    SetIntent(String),
248
249    /// Set flatness tolerance (i operator).
250    /// Maximum error when rendering curves as line segments.
251    SetFlatness(f32),
252
253    /// Set graphics state from parameter dictionary (gs operator).
254    /// References ExtGState resource dictionary.
255    SetGraphicsStateParams(String),
256
257    // Path construction operators
258    /// Begin new subpath at point (m operator).
259    MoveTo(f32, f32),
260
261    /// Append straight line segment (l operator).
262    LineTo(f32, f32),
263
264    /// Append cubic Bézier curve (c operator).
265    /// Control points: (x1,y1), (x2,y2), endpoint: (x3,y3)
266    CurveTo(f32, f32, f32, f32, f32, f32),
267
268    /// Append cubic Bézier curve with first control point = current point (v operator).
269    CurveToV(f32, f32, f32, f32),
270
271    /// Append cubic Bézier curve with second control point = endpoint (y operator).
272    CurveToY(f32, f32, f32, f32),
273
274    /// Close current subpath (h operator).
275    /// Appends straight line to starting point.
276    ClosePath,
277
278    /// Append rectangle as complete subpath (re operator).
279    /// Parameters: x, y, width, height
280    Rectangle(f32, f32, f32, f32),
281
282    // Path painting operators
283    /// Stroke the path (S operator).
284    Stroke,
285
286    /// Close and stroke the path (s operator).
287    /// Equivalent to: h S
288    CloseStroke,
289
290    /// Fill the path using nonzero winding rule (f or F operator).
291    Fill,
292
293    /// Fill the path using even-odd rule (f* operator).
294    FillEvenOdd,
295
296    /// Fill then stroke the path (B operator).
297    /// Uses nonzero winding rule.
298    FillStroke,
299
300    /// Fill then stroke using even-odd rule (B* operator).
301    FillStrokeEvenOdd,
302
303    /// Close, fill, and stroke the path (b operator).
304    /// Equivalent to: h B
305    CloseFillStroke,
306
307    /// Close, fill, and stroke using even-odd rule (b* operator).
308    CloseFillStrokeEvenOdd,
309
310    /// End path without filling or stroking (n operator).
311    /// Used primarily before clipping.
312    EndPath,
313
314    // Clipping path operators
315    Clip,        // W
316    ClipEvenOdd, // W*
317
318    // Color operators
319    /// Set stroking color space (CS operator).
320    /// References ColorSpace resource dictionary.
321    SetStrokingColorSpace(String),
322
323    /// Set non-stroking color space (cs operator).
324    /// References ColorSpace resource dictionary.
325    SetNonStrokingColorSpace(String),
326
327    /// Set stroking color (SC, SCN operators).
328    /// Number of components depends on current color space.
329    SetStrokingColor(Vec<f32>),
330
331    /// Set non-stroking color (sc, scn operators).
332    /// Number of components depends on current color space.
333    SetNonStrokingColor(Vec<f32>),
334
335    /// Set stroking color to DeviceGray (G operator).
336    /// 0.0 = black, 1.0 = white
337    SetStrokingGray(f32),
338
339    /// Set non-stroking color to DeviceGray (g operator).
340    SetNonStrokingGray(f32),
341
342    /// Set stroking color to DeviceRGB (RG operator).
343    /// Components range from 0.0 to 1.0.
344    SetStrokingRGB(f32, f32, f32),
345
346    /// Set non-stroking color to DeviceRGB (rg operator).
347    SetNonStrokingRGB(f32, f32, f32),
348
349    /// Set stroking color to DeviceCMYK (K operator).
350    SetStrokingCMYK(f32, f32, f32, f32),
351
352    /// Set non-stroking color to DeviceCMYK (k operator).
353    SetNonStrokingCMYK(f32, f32, f32, f32),
354
355    // Shading operators
356    ShadingFill(String), // sh
357
358    // Inline image operators
359    /// Begin inline image (BI operator)
360    BeginInlineImage,
361    /// Inline image with parsed dictionary and data
362    InlineImage {
363        /// Image parameters (width, height, colorspace, etc.)
364        params: HashMap<String, Object>,
365        /// Raw image data
366        data: Vec<u8>,
367    },
368
369    // XObject operators
370    /// Paint external object (Do operator).
371    /// References XObject resource dictionary (images, forms).
372    PaintXObject(String),
373
374    // Marked content operators
375    BeginMarkedContent(String),                                    // BMC
376    BeginMarkedContentWithProps(String, MarkedContentProps),       // BDC
377    EndMarkedContent,                                              // EMC
378    DefineMarkedContentPoint(String),                              // MP
379    DefineMarkedContentPointWithProps(String, MarkedContentProps), // DP
380
381    // Compatibility operators
382    BeginCompatibility, // BX
383    EndCompatibility,   // EX
384}
385
386/// Represents a text element in a TJ array for ShowTextArray operations.
387///
388/// The TJ operator takes an array of strings and position adjustments,
389/// allowing fine control over character and word spacing.
390///
391/// # Example
392///
393/// ```rust
394/// use oxidize_pdf::parser::content::{TextElement, ContentOperation};
395///
396/// // TJ array: [(Hello) -50 (World)]
397/// let tj_array = vec![
398///     TextElement::Text(b"Hello".to_vec()),
399///     TextElement::Spacing(-50.0), // Move left 50 units
400///     TextElement::Text(b"World".to_vec()),
401/// ];
402/// let op = ContentOperation::ShowTextArray(tj_array);
403/// ```
404#[derive(Debug, Clone, PartialEq)]
405pub enum TextElement {
406    /// Text string to show
407    Text(Vec<u8>),
408    /// Position adjustment in thousandths of text space units
409    /// Negative values move to the right (decrease spacing)
410    Spacing(f32),
411}
412
413/// Token types in content streams
414#[derive(Debug, Clone, PartialEq)]
415pub(super) enum Token {
416    Number(f32),
417    Integer(i32),
418    String(Vec<u8>),
419    HexString(Vec<u8>),
420    Name(String),
421    Operator(String),
422    ArrayStart,
423    ArrayEnd,
424    DictStart,
425    DictEnd,
426    /// Raw binary data between ID and EI in an inline image.
427    /// The tokenizer captures this as opaque bytes to prevent
428    /// binary image data from being mis-parsed as operators.
429    InlineImageData(Vec<u8>),
430}
431
432/// Content stream tokenizer
433pub struct ContentTokenizer<'a> {
434    input: &'a [u8],
435    position: usize,
436    /// Set after returning an "ID" operator token.
437    /// The next call to next_token() will read raw inline image bytes.
438    in_inline_image: bool,
439}
440
441impl<'a> ContentTokenizer<'a> {
442    /// Create a new tokenizer for the given input
443    pub fn new(input: &'a [u8]) -> Self {
444        Self {
445            input,
446            position: 0,
447            in_inline_image: false,
448        }
449    }
450
451    /// Get the next token from the stream
452    pub(super) fn next_token(&mut self) -> ParseResult<Option<Token>> {
453        // If we just returned an "ID" token, read raw inline image binary data
454        if self.in_inline_image {
455            self.in_inline_image = false;
456            return self.read_inline_image_data();
457        }
458
459        self.skip_whitespace();
460
461        if self.position >= self.input.len() {
462            return Ok(None);
463        }
464
465        let ch = self.input[self.position];
466
467        match ch {
468            // Numbers
469            b'+' | b'-' | b'.' | b'0'..=b'9' => self.read_number(),
470
471            // Strings
472            b'(' => self.read_literal_string(),
473            b'<' => {
474                if self.peek_next() == Some(b'<') {
475                    self.position += 2;
476                    Ok(Some(Token::DictStart))
477                } else {
478                    self.read_hex_string()
479                }
480            }
481            b'>' => {
482                if self.peek_next() == Some(b'>') {
483                    self.position += 2;
484                    Ok(Some(Token::DictEnd))
485                } else {
486                    Err(ParseError::SyntaxError {
487                        position: self.position,
488                        message: "Unexpected '>'".to_string(),
489                    })
490                }
491            }
492
493            // Arrays
494            b'[' => {
495                self.position += 1;
496                Ok(Some(Token::ArrayStart))
497            }
498            b']' => {
499                self.position += 1;
500                Ok(Some(Token::ArrayEnd))
501            }
502
503            // Names
504            b'/' => self.read_name(),
505
506            // Skip unhandled delimiters (corrupted content / binary data recovery)
507            // These bytes are delimiters in read_operator() but have no valid meaning
508            // at the top level of a content stream. Skipping them prevents infinite loops
509            // where read_operator() would return an empty operator without advancing.
510            b';' | b')' | b'{' | b'}' => {
511                self.position += 1;
512                self.next_token() // Recursively get next valid token
513            }
514
515            // Operators or other tokens
516            _ => {
517                let token = self.read_operator()?;
518                // After "ID" operator, switch to raw binary mode for inline image data
519                if let Some(Token::Operator(ref op)) = token {
520                    if op == "ID" {
521                        self.in_inline_image = true;
522                    }
523                }
524                Ok(token)
525            }
526        }
527    }
528
529    fn skip_whitespace(&mut self) {
530        while self.position < self.input.len() {
531            match self.input[self.position] {
532                b' ' | b'\t' | b'\r' | b'\n' | b'\x0C' => self.position += 1,
533                b'%' => self.skip_comment(),
534                _ => break,
535            }
536        }
537    }
538
539    fn skip_comment(&mut self) {
540        while self.position < self.input.len() && self.input[self.position] != b'\n' {
541            self.position += 1;
542        }
543    }
544
545    fn peek_next(&self) -> Option<u8> {
546        if self.position + 1 < self.input.len() {
547            Some(self.input[self.position + 1])
548        } else {
549            None
550        }
551    }
552
553    fn read_number(&mut self) -> ParseResult<Option<Token>> {
554        let start = self.position;
555        let mut has_dot = false;
556
557        // Handle optional sign
558        if self.position < self.input.len()
559            && (self.input[self.position] == b'+' || self.input[self.position] == b'-')
560        {
561            self.position += 1;
562        }
563
564        // Read digits and optional decimal point
565        while self.position < self.input.len() {
566            match self.input[self.position] {
567                b'0'..=b'9' => self.position += 1,
568                b'.' if !has_dot => {
569                    has_dot = true;
570                    self.position += 1;
571                }
572                _ => break,
573            }
574        }
575
576        let num_str = std::str::from_utf8(&self.input[start..self.position]).map_err(|_| {
577            ParseError::SyntaxError {
578                position: start,
579                message: "Invalid number format".to_string(),
580            }
581        })?;
582
583        if has_dot {
584            let value = num_str
585                .parse::<f32>()
586                .map_err(|_| ParseError::SyntaxError {
587                    position: start,
588                    message: "Invalid float number".to_string(),
589                })?;
590            Ok(Some(Token::Number(value)))
591        } else {
592            let value = num_str
593                .parse::<i32>()
594                .map_err(|_| ParseError::SyntaxError {
595                    position: start,
596                    message: "Invalid integer number".to_string(),
597                })?;
598            Ok(Some(Token::Integer(value)))
599        }
600    }
601
602    fn read_literal_string(&mut self) -> ParseResult<Option<Token>> {
603        self.position += 1; // Skip opening '('
604        let mut result = Vec::new();
605        let mut paren_depth = 1;
606        let mut escape = false;
607
608        while self.position < self.input.len() && paren_depth > 0 {
609            let ch = self.input[self.position];
610            self.position += 1;
611
612            if escape {
613                match ch {
614                    b'n' => result.push(b'\n'),
615                    b'r' => result.push(b'\r'),
616                    b't' => result.push(b'\t'),
617                    b'b' => result.push(b'\x08'),
618                    b'f' => result.push(b'\x0C'),
619                    b'(' => result.push(b'('),
620                    b')' => result.push(b')'),
621                    b'\\' => result.push(b'\\'),
622                    b'0'..=b'7' => {
623                        // Octal escape sequence
624                        self.position -= 1;
625                        let octal_value = self.read_octal_escape()?;
626                        result.push(octal_value);
627                    }
628                    _ => result.push(ch), // Unknown escape, treat as literal
629                }
630                escape = false;
631            } else {
632                match ch {
633                    b'\\' => escape = true,
634                    b'(' => {
635                        paren_depth += 1;
636                        result.push(ch);
637                    }
638                    b')' => {
639                        paren_depth -= 1;
640                        if paren_depth > 0 {
641                            result.push(ch);
642                        }
643                    }
644                    _ => result.push(ch),
645                }
646            }
647        }
648
649        Ok(Some(Token::String(result)))
650    }
651
652    fn read_octal_escape(&mut self) -> ParseResult<u8> {
653        // Use u16 to avoid overflow panic on malformed octal sequences (e.g. \777).
654        // Per ISO 32000-1:2008 §7.3.4.2: "high-order overflow shall be ignored".
655        let mut value = 0u16;
656        let mut count = 0;
657
658        while count < 3 && self.position < self.input.len() {
659            match self.input[self.position] {
660                b'0'..=b'7' => {
661                    value = value * 8 + u16::from(self.input[self.position] - b'0');
662                    self.position += 1;
663                    count += 1;
664                }
665                _ => break,
666            }
667        }
668
669        Ok(value as u8)
670    }
671
672    fn read_hex_string(&mut self) -> ParseResult<Option<Token>> {
673        self.position += 1; // Skip opening '<'
674        let mut result = Vec::new();
675        let mut nibble = None;
676
677        while self.position < self.input.len() {
678            let ch = self.input[self.position];
679
680            match ch {
681                b'>' => {
682                    self.position += 1;
683                    // Handle odd number of hex digits
684                    if let Some(n) = nibble {
685                        result.push(n << 4);
686                    }
687                    return Ok(Some(Token::HexString(result)));
688                }
689                b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => {
690                    let digit = if ch <= b'9' {
691                        ch - b'0'
692                    } else if ch <= b'F' {
693                        ch - b'A' + 10
694                    } else {
695                        ch - b'a' + 10
696                    };
697
698                    if let Some(n) = nibble {
699                        result.push((n << 4) | digit);
700                        nibble = None;
701                    } else {
702                        nibble = Some(digit);
703                    }
704                    self.position += 1;
705                }
706                b' ' | b'\t' | b'\r' | b'\n' | b'\x0C' => {
707                    // Skip whitespace in hex strings
708                    self.position += 1;
709                }
710                _ => {
711                    return Err(ParseError::SyntaxError {
712                        position: self.position,
713                        message: format!("Invalid character in hex string: {:?}", ch as char),
714                    });
715                }
716            }
717        }
718
719        Err(ParseError::SyntaxError {
720            position: self.position,
721            message: "Unterminated hex string".to_string(),
722        })
723    }
724
725    fn read_name(&mut self) -> ParseResult<Option<Token>> {
726        self.position += 1; // Skip '/'
727        let start = self.position;
728
729        while self.position < self.input.len() {
730            let ch = self.input[self.position];
731            match ch {
732                b' ' | b'\t' | b'\r' | b'\n' | b'\x0C' | b'(' | b')' | b'<' | b'>' | b'['
733                | b']' | b'{' | b'}' | b'/' | b'%' => break,
734                b'#' => {
735                    // Handle hex escape in name
736                    self.position += 1;
737                    if self.position + 1 < self.input.len() {
738                        self.position += 2;
739                    }
740                }
741                _ => self.position += 1,
742            }
743        }
744
745        let name_bytes = &self.input[start..self.position];
746        let name = self.decode_name(name_bytes)?;
747        Ok(Some(Token::Name(name)))
748    }
749
750    fn decode_name(&self, bytes: &[u8]) -> ParseResult<String> {
751        let mut result = Vec::new();
752        let mut i = 0;
753
754        while i < bytes.len() {
755            if bytes[i] == b'#' && i + 2 < bytes.len() {
756                // Hex escape
757                let hex_str = std::str::from_utf8(&bytes[i + 1..i + 3]).map_err(|_| {
758                    ParseError::SyntaxError {
759                        position: self.position,
760                        message: "Invalid hex escape in name".to_string(),
761                    }
762                })?;
763                let value =
764                    u8::from_str_radix(hex_str, 16).map_err(|_| ParseError::SyntaxError {
765                        position: self.position,
766                        message: "Invalid hex escape in name".to_string(),
767                    })?;
768                result.push(value);
769                i += 3;
770            } else {
771                result.push(bytes[i]);
772                i += 1;
773            }
774        }
775
776        String::from_utf8(result).map_err(|_| ParseError::SyntaxError {
777            position: self.position,
778            message: "Invalid UTF-8 in name".to_string(),
779        })
780    }
781
782    fn read_operator(&mut self) -> ParseResult<Option<Token>> {
783        let start = self.position;
784
785        while self.position < self.input.len() {
786            let ch = self.input[self.position];
787            match ch {
788                b' ' | b'\t' | b'\r' | b'\n' | b'\x0C' | b'(' | b')' | b'<' | b'>' | b'['
789                | b']' | b'{' | b'}' | b'/' | b'%' | b';' => break,
790                _ => self.position += 1,
791            }
792        }
793
794        let op_bytes = &self.input[start..self.position];
795        let op = std::str::from_utf8(op_bytes).map_err(|_| ParseError::SyntaxError {
796            position: start,
797            message: "Invalid operator".to_string(),
798        })?;
799
800        Ok(Some(Token::Operator(op.to_string())))
801    }
802
803    /// Read raw binary data for an inline image (between ID and EI).
804    ///
805    /// Per PDF spec §4.8.6, after the ID operator and a single whitespace byte,
806    /// all subsequent bytes are raw image data until the EI marker is found.
807    /// The EI marker is: whitespace + 'E' + 'I' + (whitespace, delimiter, or EOF).
808    fn read_inline_image_data(&mut self) -> ParseResult<Option<Token>> {
809        // Skip single whitespace byte after ID (per PDF spec §4.8.6)
810        if self.position < self.input.len() {
811            let ch = self.input[self.position];
812            if ch == b' ' || ch == b'\n' || ch == b'\r' || ch == b'\t' {
813                self.position += 1;
814                // Handle \r\n as single whitespace
815                if ch == b'\r'
816                    && self.position < self.input.len()
817                    && self.input[self.position] == b'\n'
818                {
819                    self.position += 1;
820                }
821            }
822        }
823
824        let start = self.position;
825
826        // Scan for EI marker: preceded by whitespace + 'E' + 'I' + (whitespace/delimiter/EOF)
827        while self.position + 1 < self.input.len() {
828            let preceded_by_whitespace = self.position == start
829                || matches!(
830                    self.input[self.position - 1],
831                    b' ' | b'\t' | b'\r' | b'\n' | b'\x0C'
832                );
833
834            if preceded_by_whitespace
835                && self.input[self.position] == b'E'
836                && self.input[self.position + 1] == b'I'
837            {
838                let after_ei = self.position + 2;
839                let followed_by_boundary = after_ei >= self.input.len()
840                    || matches!(
841                        self.input[after_ei],
842                        b' ' | b'\t' | b'\r' | b'\n' | b'\x0C' | b'/' | b'<' | b'(' | b'[' | b'%'
843                    );
844
845                if followed_by_boundary {
846                    // Trim trailing whitespace that preceded EI from the data
847                    let mut end = self.position;
848                    if end > start
849                        && matches!(self.input[end - 1], b' ' | b'\t' | b'\r' | b'\n' | b'\x0C')
850                    {
851                        end -= 1;
852                    }
853                    let data = self.input[start..end].to_vec();
854                    self.position = after_ei; // Skip past "EI"
855                    return Ok(Some(Token::InlineImageData(data)));
856                }
857            }
858            self.position += 1;
859        }
860
861        // No EI found — return remaining bytes as best-effort recovery
862        let data = self.input[start..].to_vec();
863        self.position = self.input.len();
864        Ok(Some(Token::InlineImageData(data)))
865    }
866}
867
868/// High-level content stream parser.
869///
870/// Converts tokenized content streams into structured `ContentOperation` values.
871/// This parser handles the operand stack and operator parsing according to PDF specifications.
872///
873/// # Usage
874///
875/// The parser is typically used through its static methods:
876///
877/// ```rust
878/// use oxidize_pdf::parser::content::ContentParser;
879///
880/// # fn example() -> Result<(), Box<dyn std::error::Error>> {
881/// let content = b"q 1 0 0 1 50 50 cm 100 100 200 150 re S Q";
882/// let operations = ContentParser::parse(content)?;
883/// # Ok(())
884/// # }
885/// ```
886pub struct ContentParser {
887    tokens: Vec<Token>,
888    position: usize,
889}
890
891impl ContentParser {
892    /// Create a new content parser
893    pub fn new(_content: &[u8]) -> Self {
894        Self {
895            tokens: Vec::new(),
896            position: 0,
897        }
898    }
899
900    /// Parse a content stream into a vector of operators.
901    ///
902    /// This is a convenience method that creates a parser and processes the entire stream.
903    ///
904    /// # Arguments
905    ///
906    /// * `content` - Raw content stream bytes (may be compressed)
907    ///
908    /// # Returns
909    ///
910    /// A vector of parsed `ContentOperation` values in the order they appear.
911    ///
912    /// # Errors
913    ///
914    /// Returns an error if:
915    /// - Invalid operator syntax is encountered
916    /// - Operators have incorrect number/type of operands
917    /// - Unknown operators are found
918    ///
919    /// # Example
920    ///
921    /// ```rust
922    /// use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
923    ///
924    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
925    /// let content = b"BT /F1 12 Tf 100 200 Td (Hello) Tj ET";
926    /// let operations = ContentParser::parse(content)?;
927    ///
928    /// assert_eq!(operations.len(), 5);
929    /// assert!(matches!(operations[0], ContentOperation::BeginText));
930    /// # Ok(())
931    /// # }
932    /// ```
933    pub fn parse(content: &[u8]) -> ParseResult<Vec<ContentOperation>> {
934        Self::parse_content(content)
935    }
936
937    /// Parse a content stream into a vector of operators.
938    ///
939    /// This method tokenizes the input and converts it to operations.
940    /// It handles the PDF postfix notation where operands precede operators.
941    pub fn parse_content(content: &[u8]) -> ParseResult<Vec<ContentOperation>> {
942        let mut tokenizer = ContentTokenizer::new(content);
943        let mut tokens = Vec::new();
944
945        // Tokenize the entire stream. Best-effort recovery (issue #319):
946        // if the tokenizer hits an unrecoverable construct (e.g. an
947        // unterminated hex string), stop there but KEEP every token parsed
948        // so far instead of discarding the whole page. `next_token` already
949        // recovers internally from skippable garbage; a hard error here is
950        // the tail of the stream, and losing the tail beats losing the page.
951        loop {
952            match tokenizer.next_token() {
953                Ok(Some(token)) => tokens.push(token),
954                Ok(None) => break,
955                Err(_e) => {
956                    tracing::debug!("content tokenizer stopped early: {_e}");
957                    break;
958                }
959            }
960        }
961
962        let mut parser = Self {
963            tokens,
964            position: 0,
965        };
966
967        parser.parse_operators()
968    }
969
970    fn parse_operators(&mut self) -> ParseResult<Vec<ContentOperation>> {
971        let mut operators = Vec::new();
972        let mut operand_stack: Vec<Token> = Vec::new();
973
974        while self.position < self.tokens.len() {
975            let token = self.tokens[self.position].clone();
976            self.position += 1;
977
978            match &token {
979                Token::Operator(op) => {
980                    // Best-effort recovery (issue #319): a single malformed
981                    // operator must NOT discard the entire content stream.
982                    // Content streams are not safety-critical; an operand
983                    // mismatch on one operator (e.g. a `Td` missing its
984                    // numbers, common in some producers) previously aborted
985                    // the whole page via `?`, so the extractor dropped every
986                    // valid operator before it. Instead, skip the bad
987                    // operator, resync by clearing its pending operands, and
988                    // continue parsing the rest of the stream.
989                    match self.parse_operator(op, &mut operand_stack) {
990                        Ok(operator) => operators.push(operator),
991                        Err(_e) => {
992                            tracing::debug!("skipping malformed content operator '{op}': {_e}");
993                            operand_stack.clear();
994                        }
995                    }
996                }
997                _ => {
998                    // Not an operator, push to operand stack
999                    operand_stack.push(token);
1000                }
1001            }
1002        }
1003
1004        Ok(operators)
1005    }
1006
1007    fn parse_operator(
1008        &mut self,
1009        op: &str,
1010        operands: &mut Vec<Token>,
1011    ) -> ParseResult<ContentOperation> {
1012        let operator = match op {
1013            // Text object operators
1014            "BT" => ContentOperation::BeginText,
1015            "ET" => ContentOperation::EndText,
1016
1017            // Text state operators
1018            "Tc" => {
1019                let spacing = self.pop_number(operands)?;
1020                ContentOperation::SetCharSpacing(spacing)
1021            }
1022            "Tw" => {
1023                let spacing = self.pop_number(operands)?;
1024                ContentOperation::SetWordSpacing(spacing)
1025            }
1026            "Tz" => {
1027                let scale = self.pop_number(operands)?;
1028                ContentOperation::SetHorizontalScaling(scale)
1029            }
1030            "TL" => {
1031                let leading = self.pop_number(operands)?;
1032                ContentOperation::SetLeading(leading)
1033            }
1034            "Tf" => {
1035                let size = self.pop_number(operands)?;
1036                let font = self.pop_name(operands)?;
1037                ContentOperation::SetFont(font, size)
1038            }
1039            "Tr" => {
1040                let mode = self.pop_integer(operands)?;
1041                ContentOperation::SetTextRenderMode(mode)
1042            }
1043            "Ts" => {
1044                let rise = self.pop_number(operands)?;
1045                ContentOperation::SetTextRise(rise)
1046            }
1047
1048            // Text positioning operators
1049            "Td" => {
1050                let ty = self.pop_number(operands)?;
1051                let tx = self.pop_number(operands)?;
1052                ContentOperation::MoveText(tx, ty)
1053            }
1054            "TD" => {
1055                let ty = self.pop_number(operands)?;
1056                let tx = self.pop_number(operands)?;
1057                ContentOperation::MoveTextSetLeading(tx, ty)
1058            }
1059            "Tm" => {
1060                let f = self.pop_number(operands)?;
1061                let e = self.pop_number(operands)?;
1062                let d = self.pop_number(operands)?;
1063                let c = self.pop_number(operands)?;
1064                let b = self.pop_number(operands)?;
1065                let a = self.pop_number(operands)?;
1066                ContentOperation::SetTextMatrix(a, b, c, d, e, f)
1067            }
1068            "T*" => ContentOperation::NextLine,
1069
1070            // Text showing operators
1071            "Tj" => {
1072                let text = self.pop_string(operands)?;
1073                ContentOperation::ShowText(text)
1074            }
1075            "TJ" => {
1076                let array = self.pop_array(operands)?;
1077                let elements = self.parse_text_array(array)?;
1078                ContentOperation::ShowTextArray(elements)
1079            }
1080            "'" => {
1081                let text = self.pop_string(operands)?;
1082                ContentOperation::NextLineShowText(text)
1083            }
1084            "\"" => {
1085                // ISO 32000-1 §9.4.3: operand order is `aw ac string "`
1086                // (aw at the bottom of the operand stack). `pop_*` is LIFO,
1087                // so we pop string first, then `ac`, then `aw`. The enum
1088                // variant is `(word_spacing, char_spacing, text)` to match
1089                // the spec field names — pass aw first, then ac.
1090                let text = self.pop_string(operands)?;
1091                let ac = self.pop_number(operands)?;
1092                let aw = self.pop_number(operands)?;
1093                ContentOperation::SetSpacingNextLineShowText(aw, ac, text)
1094            }
1095
1096            // Graphics state operators
1097            "q" => ContentOperation::SaveGraphicsState,
1098            "Q" => ContentOperation::RestoreGraphicsState,
1099            "cm" => {
1100                let f = self.pop_number(operands)?;
1101                let e = self.pop_number(operands)?;
1102                let d = self.pop_number(operands)?;
1103                let c = self.pop_number(operands)?;
1104                let b = self.pop_number(operands)?;
1105                let a = self.pop_number(operands)?;
1106                ContentOperation::SetTransformMatrix(a, b, c, d, e, f)
1107            }
1108            "w" => {
1109                let width = self.pop_number(operands)?;
1110                ContentOperation::SetLineWidth(width)
1111            }
1112            "J" => {
1113                let cap = self.pop_integer(operands)?;
1114                ContentOperation::SetLineCap(cap)
1115            }
1116            "j" => {
1117                let join = self.pop_integer(operands)?;
1118                ContentOperation::SetLineJoin(join)
1119            }
1120            "M" => {
1121                let limit = self.pop_number(operands)?;
1122                ContentOperation::SetMiterLimit(limit)
1123            }
1124            "d" => {
1125                let phase = self.pop_number(operands)?;
1126                let array = self.pop_array(operands)?;
1127                let pattern = self.parse_dash_array(array)?;
1128                ContentOperation::SetDashPattern(pattern, phase)
1129            }
1130            "ri" => {
1131                let intent = self.pop_name(operands)?;
1132                ContentOperation::SetIntent(intent)
1133            }
1134            "i" => {
1135                let flatness = self.pop_number(operands)?;
1136                ContentOperation::SetFlatness(flatness)
1137            }
1138            "gs" => {
1139                let name = self.pop_name(operands)?;
1140                ContentOperation::SetGraphicsStateParams(name)
1141            }
1142
1143            // Path construction operators
1144            "m" => {
1145                let y = self.pop_number(operands)?;
1146                let x = self.pop_number(operands)?;
1147                ContentOperation::MoveTo(x, y)
1148            }
1149            "l" => {
1150                let y = self.pop_number(operands)?;
1151                let x = self.pop_number(operands)?;
1152                ContentOperation::LineTo(x, y)
1153            }
1154            "c" => {
1155                let y3 = self.pop_number(operands)?;
1156                let x3 = self.pop_number(operands)?;
1157                let y2 = self.pop_number(operands)?;
1158                let x2 = self.pop_number(operands)?;
1159                let y1 = self.pop_number(operands)?;
1160                let x1 = self.pop_number(operands)?;
1161                ContentOperation::CurveTo(x1, y1, x2, y2, x3, y3)
1162            }
1163            "v" => {
1164                let y3 = self.pop_number(operands)?;
1165                let x3 = self.pop_number(operands)?;
1166                let y2 = self.pop_number(operands)?;
1167                let x2 = self.pop_number(operands)?;
1168                ContentOperation::CurveToV(x2, y2, x3, y3)
1169            }
1170            "y" => {
1171                let y3 = self.pop_number(operands)?;
1172                let x3 = self.pop_number(operands)?;
1173                let y1 = self.pop_number(operands)?;
1174                let x1 = self.pop_number(operands)?;
1175                ContentOperation::CurveToY(x1, y1, x3, y3)
1176            }
1177            "h" => ContentOperation::ClosePath,
1178            "re" => {
1179                let height = self.pop_number(operands)?;
1180                let width = self.pop_number(operands)?;
1181                let y = self.pop_number(operands)?;
1182                let x = self.pop_number(operands)?;
1183                ContentOperation::Rectangle(x, y, width, height)
1184            }
1185
1186            // Path painting operators
1187            "S" => ContentOperation::Stroke,
1188            "s" => ContentOperation::CloseStroke,
1189            "f" | "F" => ContentOperation::Fill,
1190            "f*" => ContentOperation::FillEvenOdd,
1191            "B" => ContentOperation::FillStroke,
1192            "B*" => ContentOperation::FillStrokeEvenOdd,
1193            "b" => ContentOperation::CloseFillStroke,
1194            "b*" => ContentOperation::CloseFillStrokeEvenOdd,
1195            "n" => ContentOperation::EndPath,
1196
1197            // Clipping path operators
1198            "W" => ContentOperation::Clip,
1199            "W*" => ContentOperation::ClipEvenOdd,
1200
1201            // Color operators
1202            "CS" => {
1203                let name = self.pop_name(operands)?;
1204                ContentOperation::SetStrokingColorSpace(name)
1205            }
1206            "cs" => {
1207                let name = self.pop_name(operands)?;
1208                ContentOperation::SetNonStrokingColorSpace(name)
1209            }
1210            "SC" | "SCN" => {
1211                let components = self.pop_color_components(operands)?;
1212                ContentOperation::SetStrokingColor(components)
1213            }
1214            "sc" | "scn" => {
1215                let components = self.pop_color_components(operands)?;
1216                ContentOperation::SetNonStrokingColor(components)
1217            }
1218            "G" => {
1219                let gray = self.pop_number(operands)?;
1220                ContentOperation::SetStrokingGray(gray)
1221            }
1222            "g" => {
1223                let gray = self.pop_number(operands)?;
1224                ContentOperation::SetNonStrokingGray(gray)
1225            }
1226            "RG" => {
1227                let b = self.pop_number(operands)?;
1228                let g = self.pop_number(operands)?;
1229                let r = self.pop_number(operands)?;
1230                ContentOperation::SetStrokingRGB(r, g, b)
1231            }
1232            "rg" => {
1233                let b = self.pop_number(operands)?;
1234                let g = self.pop_number(operands)?;
1235                let r = self.pop_number(operands)?;
1236                ContentOperation::SetNonStrokingRGB(r, g, b)
1237            }
1238            "K" => {
1239                let k = self.pop_number(operands)?;
1240                let y = self.pop_number(operands)?;
1241                let m = self.pop_number(operands)?;
1242                let c = self.pop_number(operands)?;
1243                ContentOperation::SetStrokingCMYK(c, m, y, k)
1244            }
1245            "k" => {
1246                let k = self.pop_number(operands)?;
1247                let y = self.pop_number(operands)?;
1248                let m = self.pop_number(operands)?;
1249                let c = self.pop_number(operands)?;
1250                ContentOperation::SetNonStrokingCMYK(c, m, y, k)
1251            }
1252
1253            // Shading operators
1254            "sh" => {
1255                let name = self.pop_name(operands)?;
1256                ContentOperation::ShadingFill(name)
1257            }
1258
1259            // XObject operators
1260            "Do" => {
1261                let name = self.pop_name(operands)?;
1262                ContentOperation::PaintXObject(name)
1263            }
1264
1265            // Marked content operators
1266            "BMC" => {
1267                let tag = self.pop_name(operands)?;
1268                ContentOperation::BeginMarkedContent(tag)
1269            }
1270            "BDC" => {
1271                let props = self.pop_dict_or_name(operands)?;
1272                let tag = self.pop_name(operands)?;
1273                ContentOperation::BeginMarkedContentWithProps(tag, props)
1274            }
1275            "EMC" => ContentOperation::EndMarkedContent,
1276            "MP" => {
1277                let tag = self.pop_name(operands)?;
1278                ContentOperation::DefineMarkedContentPoint(tag)
1279            }
1280            "DP" => {
1281                let props = self.pop_dict_or_name(operands)?;
1282                let tag = self.pop_name(operands)?;
1283                ContentOperation::DefineMarkedContentPointWithProps(tag, props)
1284            }
1285
1286            // Compatibility operators
1287            "BX" => ContentOperation::BeginCompatibility,
1288            "EX" => ContentOperation::EndCompatibility,
1289
1290            // Inline images are handled specially
1291            "BI" => {
1292                operands.clear(); // Clear any remaining operands
1293                self.parse_inline_image()?
1294            }
1295
1296            _ => {
1297                return Err(ParseError::SyntaxError {
1298                    position: self.position,
1299                    message: format!("Unknown operator: {op}"),
1300                });
1301            }
1302        };
1303
1304        operands.clear(); // Clear operands after processing
1305        Ok(operator)
1306    }
1307
1308    // Helper methods for popping operands
1309    fn pop_number(&self, operands: &mut Vec<Token>) -> ParseResult<f32> {
1310        match operands.pop() {
1311            Some(Token::Number(n)) => Ok(n),
1312            Some(Token::Integer(i)) => Ok(i as f32),
1313            _ => Err(ParseError::SyntaxError {
1314                position: self.position,
1315                message: "Expected number operand".to_string(),
1316            }),
1317        }
1318    }
1319
1320    fn pop_integer(&self, operands: &mut Vec<Token>) -> ParseResult<i32> {
1321        match operands.pop() {
1322            Some(Token::Integer(i)) => Ok(i),
1323            _ => Err(ParseError::SyntaxError {
1324                position: self.position,
1325                message: "Expected integer operand".to_string(),
1326            }),
1327        }
1328    }
1329
1330    fn pop_name(&self, operands: &mut Vec<Token>) -> ParseResult<String> {
1331        match operands.pop() {
1332            Some(Token::Name(n)) => Ok(n),
1333            _ => Err(ParseError::SyntaxError {
1334                position: self.position,
1335                message: "Expected name operand".to_string(),
1336            }),
1337        }
1338    }
1339
1340    fn pop_string(&self, operands: &mut Vec<Token>) -> ParseResult<Vec<u8>> {
1341        match operands.pop() {
1342            Some(Token::String(s)) => Ok(s),
1343            Some(Token::HexString(s)) => Ok(s),
1344            _ => Err(ParseError::SyntaxError {
1345                position: self.position,
1346                message: "Expected string operand".to_string(),
1347            }),
1348        }
1349    }
1350
1351    fn pop_array(&self, operands: &mut Vec<Token>) -> ParseResult<Vec<Token>> {
1352        // First check if we have an ArrayEnd at the top (which we should for a complete array)
1353        let has_array_end = matches!(operands.last(), Some(Token::ArrayEnd));
1354        if has_array_end {
1355            operands.pop(); // Remove the ArrayEnd
1356        }
1357
1358        let mut array = Vec::new();
1359        let mut found_start = false;
1360
1361        // Pop tokens until we find ArrayStart
1362        while let Some(token) = operands.pop() {
1363            match token {
1364                Token::ArrayStart => {
1365                    found_start = true;
1366                    break;
1367                }
1368                Token::ArrayEnd => {
1369                    // Skip any additional ArrayEnd tokens (shouldn't happen in well-formed PDFs)
1370                    continue;
1371                }
1372                _ => array.push(token),
1373            }
1374        }
1375
1376        if !found_start {
1377            return Err(ParseError::SyntaxError {
1378                position: self.position,
1379                message: "Expected array".to_string(),
1380            });
1381        }
1382
1383        array.reverse(); // We collected in reverse order
1384        Ok(array)
1385    }
1386
1387    fn pop_dict_or_name(&self, operands: &mut Vec<Token>) -> ParseResult<MarkedContentProps> {
1388        let token = operands.pop().ok_or_else(|| ParseError::SyntaxError {
1389            position: self.position,
1390            message: "Expected dict or name operand for BDC/DP".to_string(),
1391        })?;
1392
1393        match token {
1394            Token::Name(name) => Ok(MarkedContentProps::ResourceRef(name)),
1395            Token::DictEnd => {
1396                // Inline dictionary. Stack layout (newest on top):
1397                //   ... DictStart Name(k1) Value(v1) Name(k2) Value(v2) DictEnd
1398                // We pop value-then-key pairs in reverse until we hit DictStart.
1399                let mut map: HashMap<String, MarkedContentValue> = HashMap::new();
1400                loop {
1401                    let next = operands.pop().ok_or_else(|| ParseError::SyntaxError {
1402                        position: self.position,
1403                        message: "Unterminated inline dict in BDC/DP".to_string(),
1404                    })?;
1405                    if matches!(next, Token::DictStart) {
1406                        break;
1407                    }
1408                    let value = Self::token_to_mc_value(next, operands)?;
1409                    let key = match operands.pop() {
1410                        Some(Token::Name(k)) => k,
1411                        Some(other) => {
1412                            return Err(ParseError::SyntaxError {
1413                                position: self.position,
1414                                message: format!(
1415                                    "Expected Name as inline dict key, got {:?}",
1416                                    other
1417                                ),
1418                            });
1419                        }
1420                        None => {
1421                            return Err(ParseError::SyntaxError {
1422                                position: self.position,
1423                                message: "Unterminated inline dict (missing key)".to_string(),
1424                            });
1425                        }
1426                    };
1427                    map.insert(key, value);
1428                }
1429                Ok(MarkedContentProps::Inline(map))
1430            }
1431            other => Err(ParseError::SyntaxError {
1432                position: self.position,
1433                message: format!("Expected name or inline dict for BDC/DP, got {:?}", other),
1434            }),
1435        }
1436    }
1437
1438    /// Convert a popped token to a `MarkedContentValue`. For `ArrayEnd` and
1439    /// `DictEnd` tokens we recursively collect the matching container; all
1440    /// other tokens map to leaf variants.
1441    fn token_to_mc_value(
1442        token: Token,
1443        operands: &mut Vec<Token>,
1444    ) -> ParseResult<MarkedContentValue> {
1445        match token {
1446            Token::String(b) | Token::HexString(b) => Ok(MarkedContentValue::String(b)),
1447            Token::Integer(i) => Ok(MarkedContentValue::Integer(i as i64)),
1448            Token::Number(f) => Ok(MarkedContentValue::Real(f as f64)),
1449            Token::Name(n) => Ok(MarkedContentValue::Name(n)),
1450            Token::ArrayEnd => {
1451                let mut items: Vec<MarkedContentValue> = Vec::new();
1452                loop {
1453                    let next = operands.pop().ok_or_else(|| ParseError::SyntaxError {
1454                        position: 0,
1455                        message: "Unterminated array in marked-content props".to_string(),
1456                    })?;
1457                    if matches!(next, Token::ArrayStart) {
1458                        break;
1459                    }
1460                    items.push(Self::token_to_mc_value(next, operands)?);
1461                }
1462                items.reverse();
1463                Ok(MarkedContentValue::Array(items))
1464            }
1465            Token::DictEnd => {
1466                let mut nested: HashMap<String, MarkedContentValue> = HashMap::new();
1467                loop {
1468                    let next = operands.pop().ok_or_else(|| ParseError::SyntaxError {
1469                        position: 0,
1470                        message: "Unterminated nested dict in marked-content props".to_string(),
1471                    })?;
1472                    if matches!(next, Token::DictStart) {
1473                        break;
1474                    }
1475                    let value = Self::token_to_mc_value(next, operands)?;
1476                    let key = match operands.pop() {
1477                        Some(Token::Name(k)) => k,
1478                        _ => {
1479                            return Err(ParseError::SyntaxError {
1480                                position: 0,
1481                                message: "Expected name key in nested dict".to_string(),
1482                            });
1483                        }
1484                    };
1485                    nested.insert(key, value);
1486                }
1487                Ok(MarkedContentValue::Dict(nested))
1488            }
1489            other => Err(ParseError::SyntaxError {
1490                position: 0,
1491                message: format!("Unexpected token type in marked-content value: {:?}", other),
1492            }),
1493        }
1494    }
1495
1496    fn pop_color_components(&self, operands: &mut Vec<Token>) -> ParseResult<Vec<f32>> {
1497        let mut components = Vec::new();
1498
1499        // Pop all numeric values from the stack
1500        while let Some(token) = operands.last() {
1501            match token {
1502                Token::Number(n) => {
1503                    components.push(*n);
1504                    operands.pop();
1505                }
1506                Token::Integer(i) => {
1507                    components.push(*i as f32);
1508                    operands.pop();
1509                }
1510                _ => break,
1511            }
1512        }
1513
1514        components.reverse();
1515        Ok(components)
1516    }
1517
1518    fn parse_text_array(&self, tokens: Vec<Token>) -> ParseResult<Vec<TextElement>> {
1519        let mut elements = Vec::new();
1520
1521        for token in tokens {
1522            match token {
1523                Token::String(s) | Token::HexString(s) => {
1524                    elements.push(TextElement::Text(s));
1525                }
1526                Token::Number(n) => {
1527                    elements.push(TextElement::Spacing(n));
1528                }
1529                Token::Integer(i) => {
1530                    elements.push(TextElement::Spacing(i as f32));
1531                }
1532                _ => {
1533                    return Err(ParseError::SyntaxError {
1534                        position: self.position,
1535                        message: "Invalid element in text array".to_string(),
1536                    });
1537                }
1538            }
1539        }
1540
1541        Ok(elements)
1542    }
1543
1544    fn parse_dash_array(&self, tokens: Vec<Token>) -> ParseResult<Vec<f32>> {
1545        let mut pattern = Vec::new();
1546
1547        for token in tokens {
1548            match token {
1549                Token::Number(n) => pattern.push(n),
1550                Token::Integer(i) => pattern.push(i as f32),
1551                _ => {
1552                    return Err(ParseError::SyntaxError {
1553                        position: self.position,
1554                        message: "Invalid element in dash array".to_string(),
1555                    });
1556                }
1557            }
1558        }
1559
1560        Ok(pattern)
1561    }
1562
1563    fn parse_inline_image(&mut self) -> ParseResult<ContentOperation> {
1564        // Parse inline image dictionary until we find ID
1565        let mut params = HashMap::new();
1566
1567        while self.position < self.tokens.len() {
1568            // Check if we've reached the ID operator
1569            if let Token::Operator(op) = &self.tokens[self.position] {
1570                if op == "ID" {
1571                    self.position += 1;
1572                    break;
1573                }
1574            }
1575
1576            // Parse key-value pairs for image parameters
1577            // Keys are abbreviated in inline images:
1578            // /W -> Width, /H -> Height, /CS -> ColorSpace, /BPC -> BitsPerComponent
1579            // /F -> Filter, /DP -> DecodeParms, /IM -> ImageMask, /I -> Interpolate
1580            if let Token::Name(key) = &self.tokens[self.position] {
1581                self.position += 1;
1582                if self.position >= self.tokens.len() {
1583                    break;
1584                }
1585
1586                // Parse the value
1587                let value = match &self.tokens[self.position] {
1588                    Token::Integer(n) => Object::Integer(*n as i64),
1589                    Token::Number(n) => Object::Real(*n as f64),
1590                    Token::Name(s) => Object::Name(expand_inline_name(s)),
1591                    Token::String(s) => Object::String(String::from_utf8_lossy(s).to_string()),
1592                    Token::HexString(s) => Object::String(String::from_utf8_lossy(s).to_string()),
1593                    _ => Object::Null,
1594                };
1595
1596                // Expand abbreviated keys to full names
1597                let full_key = expand_inline_key(key);
1598                params.insert(full_key, value);
1599                self.position += 1;
1600            } else {
1601                self.position += 1;
1602            }
1603        }
1604
1605        // Get inline image data from dedicated InlineImageData token
1606        // (the tokenizer reads raw bytes between ID whitespace and EI)
1607        let data = if self.position < self.tokens.len() {
1608            if let Token::InlineImageData(bytes) = &self.tokens[self.position] {
1609                let d = bytes.clone();
1610                self.position += 1;
1611                d
1612            } else {
1613                // Fallback: collect tokens until EI (for backwards compat with edge cases)
1614                self.collect_inline_image_data_from_tokens()?
1615            }
1616        } else {
1617            Vec::new()
1618        };
1619
1620        Ok(ContentOperation::InlineImage { params, data })
1621    }
1622
1623    /// Fallback data collection when InlineImageData token is not present.
1624    /// This handles edge cases where the tokenizer couldn't detect the ID/EI boundary.
1625    fn collect_inline_image_data_from_tokens(&mut self) -> ParseResult<Vec<u8>> {
1626        let mut data = Vec::new();
1627        while self.position < self.tokens.len() {
1628            if let Token::Operator(op) = &self.tokens[self.position] {
1629                if op == "EI" {
1630                    self.position += 1;
1631                    break;
1632                }
1633            }
1634            match &self.tokens[self.position] {
1635                Token::String(bytes) | Token::HexString(bytes) => {
1636                    data.extend_from_slice(bytes);
1637                }
1638                Token::Integer(n) => data.extend_from_slice(n.to_string().as_bytes()),
1639                Token::Number(n) => data.extend_from_slice(n.to_string().as_bytes()),
1640                Token::Name(s) | Token::Operator(s) => data.extend_from_slice(s.as_bytes()),
1641                _ => {}
1642            }
1643            self.position += 1;
1644        }
1645        Ok(data)
1646    }
1647}
1648
1649/// Expand abbreviated inline image key names to full names
1650fn expand_inline_key(key: &str) -> String {
1651    match key {
1652        "W" => "Width".to_string(),
1653        "H" => "Height".to_string(),
1654        "CS" | "ColorSpace" => "ColorSpace".to_string(),
1655        "BPC" | "BitsPerComponent" => "BitsPerComponent".to_string(),
1656        "F" => "Filter".to_string(),
1657        "DP" | "DecodeParms" => "DecodeParms".to_string(),
1658        "IM" => "ImageMask".to_string(),
1659        "I" => "Interpolate".to_string(),
1660        "Intent" => "Intent".to_string(),
1661        "D" => "Decode".to_string(),
1662        _ => key.to_string(),
1663    }
1664}
1665
1666/// Expand abbreviated inline image color space names
1667fn expand_inline_name(name: &str) -> String {
1668    match name {
1669        "G" => "DeviceGray".to_string(),
1670        "RGB" => "DeviceRGB".to_string(),
1671        "CMYK" => "DeviceCMYK".to_string(),
1672        "I" => "Indexed".to_string(),
1673        "AHx" => "ASCIIHexDecode".to_string(),
1674        "A85" => "ASCII85Decode".to_string(),
1675        "LZW" => "LZWDecode".to_string(),
1676        "Fl" => "FlateDecode".to_string(),
1677        "RL" => "RunLengthDecode".to_string(),
1678        "DCT" => "DCTDecode".to_string(),
1679        "CCF" => "CCITTFaxDecode".to_string(),
1680        _ => name.to_string(),
1681    }
1682}
1683
1684#[cfg(test)]
1685mod tests {
1686    use super::*;
1687
1688    #[test]
1689    fn test_tokenize_numbers() {
1690        let input = b"123 -45 3.14159 -0.5 .5";
1691        let mut tokenizer = ContentTokenizer::new(input);
1692
1693        assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Integer(123)));
1694        assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Integer(-45)));
1695        assert_eq!(
1696            tokenizer.next_token().unwrap(),
1697            Some(Token::Number(3.14159))
1698        );
1699        assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(-0.5)));
1700        assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(0.5)));
1701        assert_eq!(tokenizer.next_token().unwrap(), None);
1702    }
1703
1704    #[test]
1705    fn test_tokenize_strings() {
1706        let input = b"(Hello World) (Hello\\nWorld) (Nested (paren))";
1707        let mut tokenizer = ContentTokenizer::new(input);
1708
1709        assert_eq!(
1710            tokenizer.next_token().unwrap(),
1711            Some(Token::String(b"Hello World".to_vec()))
1712        );
1713        assert_eq!(
1714            tokenizer.next_token().unwrap(),
1715            Some(Token::String(b"Hello\nWorld".to_vec()))
1716        );
1717        assert_eq!(
1718            tokenizer.next_token().unwrap(),
1719            Some(Token::String(b"Nested (paren)".to_vec()))
1720        );
1721    }
1722
1723    #[test]
1724    fn test_tokenize_hex_strings() {
1725        let input = b"<48656C6C6F> <48 65 6C 6C 6F>";
1726        let mut tokenizer = ContentTokenizer::new(input);
1727
1728        assert_eq!(
1729            tokenizer.next_token().unwrap(),
1730            Some(Token::HexString(b"Hello".to_vec()))
1731        );
1732        assert_eq!(
1733            tokenizer.next_token().unwrap(),
1734            Some(Token::HexString(b"Hello".to_vec()))
1735        );
1736    }
1737
1738    #[test]
1739    fn test_tokenize_names() {
1740        let input = b"/Name /Name#20with#20spaces /A#42C";
1741        let mut tokenizer = ContentTokenizer::new(input);
1742
1743        assert_eq!(
1744            tokenizer.next_token().unwrap(),
1745            Some(Token::Name("Name".to_string()))
1746        );
1747        assert_eq!(
1748            tokenizer.next_token().unwrap(),
1749            Some(Token::Name("Name with spaces".to_string()))
1750        );
1751        assert_eq!(
1752            tokenizer.next_token().unwrap(),
1753            Some(Token::Name("ABC".to_string()))
1754        );
1755    }
1756
1757    #[test]
1758    fn test_tokenize_operators() {
1759        let input = b"BT Tj ET q Q";
1760        let mut tokenizer = ContentTokenizer::new(input);
1761
1762        assert_eq!(
1763            tokenizer.next_token().unwrap(),
1764            Some(Token::Operator("BT".to_string()))
1765        );
1766        assert_eq!(
1767            tokenizer.next_token().unwrap(),
1768            Some(Token::Operator("Tj".to_string()))
1769        );
1770        assert_eq!(
1771            tokenizer.next_token().unwrap(),
1772            Some(Token::Operator("ET".to_string()))
1773        );
1774        assert_eq!(
1775            tokenizer.next_token().unwrap(),
1776            Some(Token::Operator("q".to_string()))
1777        );
1778        assert_eq!(
1779            tokenizer.next_token().unwrap(),
1780            Some(Token::Operator("Q".to_string()))
1781        );
1782    }
1783
1784    #[test]
1785    fn test_parse_text_operators() {
1786        let content = b"BT /F1 12 Tf 100 200 Td (Hello World) Tj ET";
1787        let operators = ContentParser::parse(content).unwrap();
1788
1789        assert_eq!(operators.len(), 5);
1790        assert_eq!(operators[0], ContentOperation::BeginText);
1791        assert_eq!(
1792            operators[1],
1793            ContentOperation::SetFont("F1".to_string(), 12.0)
1794        );
1795        assert_eq!(operators[2], ContentOperation::MoveText(100.0, 200.0));
1796        assert_eq!(
1797            operators[3],
1798            ContentOperation::ShowText(b"Hello World".to_vec())
1799        );
1800        assert_eq!(operators[4], ContentOperation::EndText);
1801    }
1802
1803    #[test]
1804    fn test_parse_graphics_operators() {
1805        let content = b"q 1 0 0 1 50 50 cm 2 w 0 0 100 100 re S Q";
1806        let operators = ContentParser::parse(content).unwrap();
1807
1808        assert_eq!(operators.len(), 6);
1809        assert_eq!(operators[0], ContentOperation::SaveGraphicsState);
1810        assert_eq!(
1811            operators[1],
1812            ContentOperation::SetTransformMatrix(1.0, 0.0, 0.0, 1.0, 50.0, 50.0)
1813        );
1814        assert_eq!(operators[2], ContentOperation::SetLineWidth(2.0));
1815        assert_eq!(
1816            operators[3],
1817            ContentOperation::Rectangle(0.0, 0.0, 100.0, 100.0)
1818        );
1819        assert_eq!(operators[4], ContentOperation::Stroke);
1820        assert_eq!(operators[5], ContentOperation::RestoreGraphicsState);
1821    }
1822
1823    #[test]
1824    fn test_parse_color_operators() {
1825        let content = b"0.5 g 1 0 0 rg 0 0 0 1 k";
1826        let operators = ContentParser::parse(content).unwrap();
1827
1828        assert_eq!(operators.len(), 3);
1829        assert_eq!(operators[0], ContentOperation::SetNonStrokingGray(0.5));
1830        assert_eq!(
1831            operators[1],
1832            ContentOperation::SetNonStrokingRGB(1.0, 0.0, 0.0)
1833        );
1834        assert_eq!(
1835            operators[2],
1836            ContentOperation::SetNonStrokingCMYK(0.0, 0.0, 0.0, 1.0)
1837        );
1838    }
1839
1840    // Comprehensive tests for all ContentOperation variants
1841    mod comprehensive_tests {
1842        use super::*;
1843
1844        #[test]
1845        fn test_all_text_operators() {
1846            // Test basic text operators that work with current parser
1847            let content = b"BT 5 Tc 10 Tw 120 Tz 15 TL /F1 12 Tf 1 Tr 5 Ts 100 200 Td 50 150 TD T* (Hello) Tj ET";
1848            let operators = ContentParser::parse(content).unwrap();
1849
1850            assert_eq!(operators[0], ContentOperation::BeginText);
1851            assert_eq!(operators[1], ContentOperation::SetCharSpacing(5.0));
1852            assert_eq!(operators[2], ContentOperation::SetWordSpacing(10.0));
1853            assert_eq!(operators[3], ContentOperation::SetHorizontalScaling(120.0));
1854            assert_eq!(operators[4], ContentOperation::SetLeading(15.0));
1855            assert_eq!(
1856                operators[5],
1857                ContentOperation::SetFont("F1".to_string(), 12.0)
1858            );
1859            assert_eq!(operators[6], ContentOperation::SetTextRenderMode(1));
1860            assert_eq!(operators[7], ContentOperation::SetTextRise(5.0));
1861            assert_eq!(operators[8], ContentOperation::MoveText(100.0, 200.0));
1862            assert_eq!(
1863                operators[9],
1864                ContentOperation::MoveTextSetLeading(50.0, 150.0)
1865            );
1866            assert_eq!(operators[10], ContentOperation::NextLine);
1867            assert_eq!(operators[11], ContentOperation::ShowText(b"Hello".to_vec()));
1868            assert_eq!(operators[12], ContentOperation::EndText);
1869        }
1870
1871        #[test]
1872        fn test_all_graphics_state_operators() {
1873            // Test basic graphics state operators without arrays
1874            let content = b"q Q 1 0 0 1 50 50 cm 2 w 1 J 2 j 10 M /GS1 gs 0.5 i /Perceptual ri";
1875            let operators = ContentParser::parse(content).unwrap();
1876
1877            assert_eq!(operators[0], ContentOperation::SaveGraphicsState);
1878            assert_eq!(operators[1], ContentOperation::RestoreGraphicsState);
1879            assert_eq!(
1880                operators[2],
1881                ContentOperation::SetTransformMatrix(1.0, 0.0, 0.0, 1.0, 50.0, 50.0)
1882            );
1883            assert_eq!(operators[3], ContentOperation::SetLineWidth(2.0));
1884            assert_eq!(operators[4], ContentOperation::SetLineCap(1));
1885            assert_eq!(operators[5], ContentOperation::SetLineJoin(2));
1886            assert_eq!(operators[6], ContentOperation::SetMiterLimit(10.0));
1887            assert_eq!(
1888                operators[7],
1889                ContentOperation::SetGraphicsStateParams("GS1".to_string())
1890            );
1891            assert_eq!(operators[8], ContentOperation::SetFlatness(0.5));
1892            assert_eq!(
1893                operators[9],
1894                ContentOperation::SetIntent("Perceptual".to_string())
1895            );
1896        }
1897
1898        #[test]
1899        fn test_all_path_construction_operators() {
1900            let content = b"100 200 m 150 200 l 200 200 250 250 300 200 c 250 180 300 200 v 200 180 300 200 y h 50 50 100 100 re";
1901            let operators = ContentParser::parse(content).unwrap();
1902
1903            assert_eq!(operators[0], ContentOperation::MoveTo(100.0, 200.0));
1904            assert_eq!(operators[1], ContentOperation::LineTo(150.0, 200.0));
1905            assert_eq!(
1906                operators[2],
1907                ContentOperation::CurveTo(200.0, 200.0, 250.0, 250.0, 300.0, 200.0)
1908            );
1909            assert_eq!(
1910                operators[3],
1911                ContentOperation::CurveToV(250.0, 180.0, 300.0, 200.0)
1912            );
1913            assert_eq!(
1914                operators[4],
1915                ContentOperation::CurveToY(200.0, 180.0, 300.0, 200.0)
1916            );
1917            assert_eq!(operators[5], ContentOperation::ClosePath);
1918            assert_eq!(
1919                operators[6],
1920                ContentOperation::Rectangle(50.0, 50.0, 100.0, 100.0)
1921            );
1922        }
1923
1924        #[test]
1925        fn test_all_path_painting_operators() {
1926            let content = b"S s f F f* B B* b b* n W W*";
1927            let operators = ContentParser::parse(content).unwrap();
1928
1929            assert_eq!(operators[0], ContentOperation::Stroke);
1930            assert_eq!(operators[1], ContentOperation::CloseStroke);
1931            assert_eq!(operators[2], ContentOperation::Fill);
1932            assert_eq!(operators[3], ContentOperation::Fill); // F is alias for f
1933            assert_eq!(operators[4], ContentOperation::FillEvenOdd);
1934            assert_eq!(operators[5], ContentOperation::FillStroke);
1935            assert_eq!(operators[6], ContentOperation::FillStrokeEvenOdd);
1936            assert_eq!(operators[7], ContentOperation::CloseFillStroke);
1937            assert_eq!(operators[8], ContentOperation::CloseFillStrokeEvenOdd);
1938            assert_eq!(operators[9], ContentOperation::EndPath);
1939            assert_eq!(operators[10], ContentOperation::Clip);
1940            assert_eq!(operators[11], ContentOperation::ClipEvenOdd);
1941        }
1942
1943        #[test]
1944        fn test_all_color_operators() {
1945            // Test basic color operators that work with current parser
1946            let content = b"/DeviceRGB CS /DeviceGray cs 0.7 G 0.4 g 1 0 0 RG 0 1 0 rg 0 0 0 1 K 0.2 0.3 0.4 0.5 k /Shade1 sh";
1947            let operators = ContentParser::parse(content).unwrap();
1948
1949            assert_eq!(
1950                operators[0],
1951                ContentOperation::SetStrokingColorSpace("DeviceRGB".to_string())
1952            );
1953            assert_eq!(
1954                operators[1],
1955                ContentOperation::SetNonStrokingColorSpace("DeviceGray".to_string())
1956            );
1957            assert_eq!(operators[2], ContentOperation::SetStrokingGray(0.7));
1958            assert_eq!(operators[3], ContentOperation::SetNonStrokingGray(0.4));
1959            assert_eq!(
1960                operators[4],
1961                ContentOperation::SetStrokingRGB(1.0, 0.0, 0.0)
1962            );
1963            assert_eq!(
1964                operators[5],
1965                ContentOperation::SetNonStrokingRGB(0.0, 1.0, 0.0)
1966            );
1967            assert_eq!(
1968                operators[6],
1969                ContentOperation::SetStrokingCMYK(0.0, 0.0, 0.0, 1.0)
1970            );
1971            assert_eq!(
1972                operators[7],
1973                ContentOperation::SetNonStrokingCMYK(0.2, 0.3, 0.4, 0.5)
1974            );
1975            assert_eq!(
1976                operators[8],
1977                ContentOperation::ShadingFill("Shade1".to_string())
1978            );
1979        }
1980
1981        #[test]
1982        fn test_xobject_and_marked_content_operators() {
1983            // Test basic XObject and marked content operators
1984            let content = b"/Image1 Do /MC1 BMC EMC /MP1 MP BX EX";
1985            let operators = ContentParser::parse(content).unwrap();
1986
1987            assert_eq!(
1988                operators[0],
1989                ContentOperation::PaintXObject("Image1".to_string())
1990            );
1991            assert_eq!(
1992                operators[1],
1993                ContentOperation::BeginMarkedContent("MC1".to_string())
1994            );
1995            assert_eq!(operators[2], ContentOperation::EndMarkedContent);
1996            assert_eq!(
1997                operators[3],
1998                ContentOperation::DefineMarkedContentPoint("MP1".to_string())
1999            );
2000            assert_eq!(operators[4], ContentOperation::BeginCompatibility);
2001            assert_eq!(operators[5], ContentOperation::EndCompatibility);
2002        }
2003
2004        #[test]
2005        fn test_complex_content_stream() {
2006            let content = b"q 0.5 0 0 0.5 100 100 cm BT /F1 12 Tf 0 0 Td (Complex) Tj ET Q";
2007            let operators = ContentParser::parse(content).unwrap();
2008
2009            assert_eq!(operators.len(), 8);
2010            assert_eq!(operators[0], ContentOperation::SaveGraphicsState);
2011            assert_eq!(
2012                operators[1],
2013                ContentOperation::SetTransformMatrix(0.5, 0.0, 0.0, 0.5, 100.0, 100.0)
2014            );
2015            assert_eq!(operators[2], ContentOperation::BeginText);
2016            assert_eq!(
2017                operators[3],
2018                ContentOperation::SetFont("F1".to_string(), 12.0)
2019            );
2020            assert_eq!(operators[4], ContentOperation::MoveText(0.0, 0.0));
2021            assert_eq!(
2022                operators[5],
2023                ContentOperation::ShowText(b"Complex".to_vec())
2024            );
2025            assert_eq!(operators[6], ContentOperation::EndText);
2026            assert_eq!(operators[7], ContentOperation::RestoreGraphicsState);
2027        }
2028
2029        #[test]
2030        fn test_tokenizer_whitespace_handling() {
2031            let input = b"  \t\n\r  BT  \t\n  /F1   12.5  \t Tf  \n\r  ET  ";
2032            let mut tokenizer = ContentTokenizer::new(input);
2033
2034            assert_eq!(
2035                tokenizer.next_token().unwrap(),
2036                Some(Token::Operator("BT".to_string()))
2037            );
2038            assert_eq!(
2039                tokenizer.next_token().unwrap(),
2040                Some(Token::Name("F1".to_string()))
2041            );
2042            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(12.5)));
2043            assert_eq!(
2044                tokenizer.next_token().unwrap(),
2045                Some(Token::Operator("Tf".to_string()))
2046            );
2047            assert_eq!(
2048                tokenizer.next_token().unwrap(),
2049                Some(Token::Operator("ET".to_string()))
2050            );
2051            assert_eq!(tokenizer.next_token().unwrap(), None);
2052        }
2053
2054        #[test]
2055        fn test_tokenizer_edge_cases() {
2056            // Test basic number formats that are actually supported
2057            let input = b"0 .5 -.5 +.5 123. .123 1.23 -1.23";
2058            let mut tokenizer = ContentTokenizer::new(input);
2059
2060            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Integer(0)));
2061            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(0.5)));
2062            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(-0.5)));
2063            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(0.5)));
2064            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(123.0)));
2065            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(0.123)));
2066            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(1.23)));
2067            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(-1.23)));
2068        }
2069
2070        #[test]
2071        fn test_string_parsing_edge_cases() {
2072            let input = b"(Simple) (With\\\\backslash) (With\\)paren) (With\\newline) (With\\ttab) (With\\rcarriage) (With\\bbackspace) (With\\fformfeed) (With\\(leftparen) (With\\)rightparen) (With\\377octal) (With\\dddoctal)";
2073            let mut tokenizer = ContentTokenizer::new(input);
2074
2075            assert_eq!(
2076                tokenizer.next_token().unwrap(),
2077                Some(Token::String(b"Simple".to_vec()))
2078            );
2079            assert_eq!(
2080                tokenizer.next_token().unwrap(),
2081                Some(Token::String(b"With\\backslash".to_vec()))
2082            );
2083            assert_eq!(
2084                tokenizer.next_token().unwrap(),
2085                Some(Token::String(b"With)paren".to_vec()))
2086            );
2087            assert_eq!(
2088                tokenizer.next_token().unwrap(),
2089                Some(Token::String(b"With\newline".to_vec()))
2090            );
2091            assert_eq!(
2092                tokenizer.next_token().unwrap(),
2093                Some(Token::String(b"With\ttab".to_vec()))
2094            );
2095            assert_eq!(
2096                tokenizer.next_token().unwrap(),
2097                Some(Token::String(b"With\rcarriage".to_vec()))
2098            );
2099            assert_eq!(
2100                tokenizer.next_token().unwrap(),
2101                Some(Token::String(b"With\x08backspace".to_vec()))
2102            );
2103            assert_eq!(
2104                tokenizer.next_token().unwrap(),
2105                Some(Token::String(b"With\x0Cformfeed".to_vec()))
2106            );
2107            assert_eq!(
2108                tokenizer.next_token().unwrap(),
2109                Some(Token::String(b"With(leftparen".to_vec()))
2110            );
2111            assert_eq!(
2112                tokenizer.next_token().unwrap(),
2113                Some(Token::String(b"With)rightparen".to_vec()))
2114            );
2115        }
2116
2117        #[test]
2118        fn test_hex_string_parsing() {
2119            let input = b"<48656C6C6F> <48 65 6C 6C 6F> <48656C6C6F57> <48656C6C6F5>";
2120            let mut tokenizer = ContentTokenizer::new(input);
2121
2122            assert_eq!(
2123                tokenizer.next_token().unwrap(),
2124                Some(Token::HexString(b"Hello".to_vec()))
2125            );
2126            assert_eq!(
2127                tokenizer.next_token().unwrap(),
2128                Some(Token::HexString(b"Hello".to_vec()))
2129            );
2130            assert_eq!(
2131                tokenizer.next_token().unwrap(),
2132                Some(Token::HexString(b"HelloW".to_vec()))
2133            );
2134            assert_eq!(
2135                tokenizer.next_token().unwrap(),
2136                Some(Token::HexString(b"Hello\x50".to_vec()))
2137            );
2138        }
2139
2140        #[test]
2141        fn test_name_parsing_edge_cases() {
2142            let input = b"/Name /Name#20with#20spaces /Name#23with#23hash /Name#2Fwith#2Fslash /#45mptyName";
2143            let mut tokenizer = ContentTokenizer::new(input);
2144
2145            assert_eq!(
2146                tokenizer.next_token().unwrap(),
2147                Some(Token::Name("Name".to_string()))
2148            );
2149            assert_eq!(
2150                tokenizer.next_token().unwrap(),
2151                Some(Token::Name("Name with spaces".to_string()))
2152            );
2153            assert_eq!(
2154                tokenizer.next_token().unwrap(),
2155                Some(Token::Name("Name#with#hash".to_string()))
2156            );
2157            assert_eq!(
2158                tokenizer.next_token().unwrap(),
2159                Some(Token::Name("Name/with/slash".to_string()))
2160            );
2161            assert_eq!(
2162                tokenizer.next_token().unwrap(),
2163                Some(Token::Name("EmptyName".to_string()))
2164            );
2165        }
2166
2167        #[test]
2168        fn test_operator_parsing_edge_cases() {
2169            let content = b"q q q Q Q Q BT BT ET ET";
2170            let operators = ContentParser::parse(content).unwrap();
2171
2172            assert_eq!(operators.len(), 10);
2173            assert_eq!(operators[0], ContentOperation::SaveGraphicsState);
2174            assert_eq!(operators[1], ContentOperation::SaveGraphicsState);
2175            assert_eq!(operators[2], ContentOperation::SaveGraphicsState);
2176            assert_eq!(operators[3], ContentOperation::RestoreGraphicsState);
2177            assert_eq!(operators[4], ContentOperation::RestoreGraphicsState);
2178            assert_eq!(operators[5], ContentOperation::RestoreGraphicsState);
2179            assert_eq!(operators[6], ContentOperation::BeginText);
2180            assert_eq!(operators[7], ContentOperation::BeginText);
2181            assert_eq!(operators[8], ContentOperation::EndText);
2182            assert_eq!(operators[9], ContentOperation::EndText);
2183        }
2184
2185        #[test]
2186        fn test_error_handling_insufficient_operands() {
2187            // Best-effort recovery (issue #319): `Td` is missing its y
2188            // coordinate. The malformed operator is skipped, but a following
2189            // valid text operator must still be recovered — the page is NOT
2190            // discarded wholesale.
2191            let content = b"100 Td (kept) Tj";
2192            let ops = ContentParser::parse(content).expect("recovers from bad Td");
2193            assert!(
2194                ops.iter()
2195                    .any(|op| matches!(op, ContentOperation::ShowText(t) if t == b"kept")),
2196                "valid Tj after the malformed Td must survive: {ops:?}"
2197            );
2198        }
2199
2200        #[test]
2201        fn test_error_handling_invalid_operator() {
2202            // Unknown operator `INVALID` is skipped; the following valid
2203            // MoveTo survives (issue #319 recovery contract).
2204            let content = b"100 200 INVALID 10 20 m";
2205            let ops = ContentParser::parse(content).expect("recovers from unknown operator");
2206            assert!(
2207                ops.iter()
2208                    .any(|op| matches!(op, ContentOperation::MoveTo(_, _))),
2209                "valid MoveTo after the unknown operator must survive: {ops:?}"
2210            );
2211        }
2212
2213        #[test]
2214        fn test_error_handling_malformed_string() {
2215            // Test that the tokenizer handles malformed strings appropriately
2216            let input = b"(Unclosed string";
2217            let mut tokenizer = ContentTokenizer::new(input);
2218            let result = tokenizer.next_token();
2219            // The current implementation may not detect this as an error
2220            // so we'll just test that we get some result
2221            assert!(result.is_ok() || result.is_err());
2222        }
2223
2224        #[test]
2225        fn test_error_handling_malformed_hex_string() {
2226            let input = b"<48656C6C6G>";
2227            let mut tokenizer = ContentTokenizer::new(input);
2228            let result = tokenizer.next_token();
2229            assert!(result.is_err());
2230        }
2231
2232        #[test]
2233        fn test_error_handling_malformed_name() {
2234            let input = b"/Name#GG";
2235            let mut tokenizer = ContentTokenizer::new(input);
2236            let result = tokenizer.next_token();
2237            assert!(result.is_err());
2238        }
2239
2240        #[test]
2241        fn test_empty_content_stream() {
2242            let content = b"";
2243            let operators = ContentParser::parse(content).unwrap();
2244            assert_eq!(operators.len(), 0);
2245        }
2246
2247        #[test]
2248        fn test_whitespace_only_content_stream() {
2249            let content = b"   \t\n\r   ";
2250            let operators = ContentParser::parse(content).unwrap();
2251            assert_eq!(operators.len(), 0);
2252        }
2253
2254        #[test]
2255        fn test_mixed_integer_and_real_operands() {
2256            // Test with simple operands that work with current parser
2257            let content = b"100 200 m 150 200 l";
2258            let operators = ContentParser::parse(content).unwrap();
2259
2260            assert_eq!(operators.len(), 2);
2261            assert_eq!(operators[0], ContentOperation::MoveTo(100.0, 200.0));
2262            assert_eq!(operators[1], ContentOperation::LineTo(150.0, 200.0));
2263        }
2264
2265        #[test]
2266        fn test_negative_operands() {
2267            let content = b"-100 -200 Td -50.5 -75.2 TD";
2268            let operators = ContentParser::parse(content).unwrap();
2269
2270            assert_eq!(operators.len(), 2);
2271            assert_eq!(operators[0], ContentOperation::MoveText(-100.0, -200.0));
2272            assert_eq!(
2273                operators[1],
2274                ContentOperation::MoveTextSetLeading(-50.5, -75.2)
2275            );
2276        }
2277
2278        #[test]
2279        fn test_large_numbers() {
2280            let content = b"999999.999999 -999999.999999 m";
2281            let operators = ContentParser::parse(content).unwrap();
2282
2283            assert_eq!(operators.len(), 1);
2284            assert_eq!(
2285                operators[0],
2286                ContentOperation::MoveTo(999999.999999, -999999.999999)
2287            );
2288        }
2289
2290        #[test]
2291        fn test_scientific_notation() {
2292            // Test with simple decimal numbers since scientific notation isn't implemented
2293            let content = b"123.45 -456.78 m";
2294            let operators = ContentParser::parse(content).unwrap();
2295
2296            assert_eq!(operators.len(), 1);
2297            assert_eq!(operators[0], ContentOperation::MoveTo(123.45, -456.78));
2298        }
2299
2300        #[test]
2301        fn test_show_text_array_complex() {
2302            // `TJ` expects an array operand, not a plain string. The malformed
2303            // operator is skipped; a following valid Tj is recovered
2304            // (issue #319 recovery contract).
2305            let content = b"(Hello) TJ (kept) Tj";
2306            let ops = ContentParser::parse(content).expect("recovers from malformed TJ");
2307            assert!(
2308                ops.iter()
2309                    .any(|op| matches!(op, ContentOperation::ShowText(t) if t == b"kept")),
2310                "valid Tj after the malformed TJ must survive: {ops:?}"
2311            );
2312        }
2313
2314        #[test]
2315        fn test_dash_pattern_empty() {
2316            // `d` needs an array operand; the malformed operator is skipped and
2317            // a following valid MoveTo survives (issue #319 recovery contract).
2318            let content = b"0 d 10 20 m";
2319            let ops = ContentParser::parse(content).expect("recovers from malformed d");
2320            assert!(
2321                ops.iter()
2322                    .any(|op| matches!(op, ContentOperation::MoveTo(_, _))),
2323                "valid MoveTo after the malformed dash op must survive: {ops:?}"
2324            );
2325        }
2326
2327        #[test]
2328        fn test_dash_pattern_complex() {
2329            // Same recovery contract with a real-number operand before `d`.
2330            let content = b"2.5 d 10 20 m";
2331            let ops = ContentParser::parse(content).expect("recovers from malformed d");
2332            assert!(
2333                ops.iter()
2334                    .any(|op| matches!(op, ContentOperation::MoveTo(_, _))),
2335                "valid MoveTo after the malformed dash op must survive: {ops:?}"
2336            );
2337        }
2338
2339        #[test]
2340        fn test_pop_array_removes_array_end() {
2341            // Test that pop_array correctly handles ArrayEnd tokens
2342            let parser = ContentParser::new(b"");
2343
2344            // Test normal array: [1 2 3]
2345            let mut operands = vec![
2346                Token::ArrayStart,
2347                Token::Integer(1),
2348                Token::Integer(2),
2349                Token::Integer(3),
2350                Token::ArrayEnd,
2351            ];
2352            let result = parser.pop_array(&mut operands).unwrap();
2353            assert_eq!(result.len(), 3);
2354            assert!(operands.is_empty());
2355
2356            // Test array without ArrayEnd (backwards compatibility)
2357            let mut operands = vec![Token::ArrayStart, Token::Number(1.5), Token::Number(2.5)];
2358            let result = parser.pop_array(&mut operands).unwrap();
2359            assert_eq!(result.len(), 2);
2360            assert!(operands.is_empty());
2361        }
2362
2363        #[test]
2364        fn test_dash_array_parsing_valid() {
2365            // Test that parser correctly parses valid dash arrays
2366            let parser = ContentParser::new(b"");
2367
2368            // Test with valid numbers only
2369            let valid_tokens = vec![Token::Number(3.0), Token::Integer(2)];
2370            let result = parser.parse_dash_array(valid_tokens).unwrap();
2371            assert_eq!(result, vec![3.0, 2.0]);
2372
2373            // Test empty dash array
2374            let empty_tokens = vec![];
2375            let result = parser.parse_dash_array(empty_tokens).unwrap();
2376            let expected: Vec<f32> = vec![];
2377            assert_eq!(result, expected);
2378        }
2379
2380        #[test]
2381        fn test_text_array_parsing_valid() {
2382            // Test that parser correctly parses valid text arrays
2383            let parser = ContentParser::new(b"");
2384
2385            // Test with valid elements only
2386            let valid_tokens = vec![
2387                Token::String(b"Hello".to_vec()),
2388                Token::Number(-100.0),
2389                Token::String(b"World".to_vec()),
2390            ];
2391            let result = parser.parse_text_array(valid_tokens).unwrap();
2392            assert_eq!(result.len(), 3);
2393        }
2394
2395        #[test]
2396        fn test_inline_image_handling() {
2397            let content = b"BI /W 100 /H 100 /BPC 8 /CS /RGB ID some_image_data EI";
2398            let operators = ContentParser::parse(content).unwrap();
2399
2400            assert_eq!(operators.len(), 1);
2401            match &operators[0] {
2402                ContentOperation::InlineImage { params, data: _ } => {
2403                    // Check parsed parameters
2404                    assert_eq!(params.get("Width"), Some(&Object::Integer(100)));
2405                    assert_eq!(params.get("Height"), Some(&Object::Integer(100)));
2406                    assert_eq!(params.get("BitsPerComponent"), Some(&Object::Integer(8)));
2407                    assert_eq!(
2408                        params.get("ColorSpace"),
2409                        Some(&Object::Name("DeviceRGB".to_string()))
2410                    );
2411                    // Data field is not captured, just verify params
2412                }
2413                _ => panic!("Expected InlineImage operation"),
2414            }
2415        }
2416
2417        #[test]
2418        fn test_inline_image_with_filter() {
2419            let content = b"BI /W 50 /H 50 /CS /G /BPC 1 /F /AHx ID 00FF00FF EI";
2420            let operators = ContentParser::parse(content).unwrap();
2421
2422            assert_eq!(operators.len(), 1);
2423            match &operators[0] {
2424                ContentOperation::InlineImage { params, data: _ } => {
2425                    assert_eq!(params.get("Width"), Some(&Object::Integer(50)));
2426                    assert_eq!(params.get("Height"), Some(&Object::Integer(50)));
2427                    assert_eq!(
2428                        params.get("ColorSpace"),
2429                        Some(&Object::Name("DeviceGray".to_string()))
2430                    );
2431                    assert_eq!(params.get("BitsPerComponent"), Some(&Object::Integer(1)));
2432                    assert_eq!(
2433                        params.get("Filter"),
2434                        Some(&Object::Name("ASCIIHexDecode".to_string()))
2435                    );
2436                }
2437                _ => panic!("Expected InlineImage operation"),
2438            }
2439        }
2440
2441        #[test]
2442        fn test_content_parser_performance() {
2443            let mut content = Vec::new();
2444            for i in 0..1000 {
2445                content.extend_from_slice(format!("{} {} m ", i, i + 1).as_bytes());
2446            }
2447
2448            let start = std::time::Instant::now();
2449            let operators = ContentParser::parse(&content).unwrap();
2450            let duration = start.elapsed();
2451
2452            assert_eq!(operators.len(), 1000);
2453            assert!(duration.as_millis() < 100); // Should parse 1000 operators in under 100ms
2454        }
2455
2456        #[test]
2457        fn test_tokenizer_performance() {
2458            let mut input = Vec::new();
2459            for i in 0..1000 {
2460                input.extend_from_slice(format!("{} {} ", i, i + 1).as_bytes());
2461            }
2462
2463            let start = std::time::Instant::now();
2464            let mut tokenizer = ContentTokenizer::new(&input);
2465            let mut count = 0;
2466            while tokenizer.next_token().unwrap().is_some() {
2467                count += 1;
2468            }
2469            let duration = start.elapsed();
2470
2471            assert_eq!(count, 2000); // 1000 pairs of numbers
2472            assert!(duration.as_millis() < 50); // Should tokenize 2000 tokens in under 50ms
2473        }
2474
2475        #[test]
2476        fn test_memory_usage_large_content() {
2477            let mut content = Vec::new();
2478            for i in 0..10000 {
2479                content.extend_from_slice(
2480                    format!("{} {} {} {} {} {} c ", i, i + 1, i + 2, i + 3, i + 4, i + 5)
2481                        .as_bytes(),
2482                );
2483            }
2484
2485            let operators = ContentParser::parse(&content).unwrap();
2486            assert_eq!(operators.len(), 10000);
2487
2488            // Verify all operations are CurveTo
2489            for op in operators {
2490                matches!(op, ContentOperation::CurveTo(_, _, _, _, _, _));
2491            }
2492        }
2493
2494        #[test]
2495        fn test_concurrent_parsing() {
2496            use std::sync::Arc;
2497            use std::thread;
2498
2499            let content = Arc::new(b"BT /F1 12 Tf 100 200 Td (Hello) Tj ET".to_vec());
2500            let handles: Vec<_> = (0..10)
2501                .map(|_| {
2502                    let content_clone = content.clone();
2503                    thread::spawn(move || ContentParser::parse(&content_clone).unwrap())
2504                })
2505                .collect();
2506
2507            for handle in handles {
2508                let operators = handle.join().unwrap();
2509                assert_eq!(operators.len(), 5);
2510                assert_eq!(operators[0], ContentOperation::BeginText);
2511                assert_eq!(operators[4], ContentOperation::EndText);
2512            }
2513        }
2514
2515        // ========== NEW COMPREHENSIVE TESTS ==========
2516
2517        #[test]
2518        fn test_tokenizer_hex_string_edge_cases() {
2519            let mut tokenizer = ContentTokenizer::new(b"<>");
2520            let token = tokenizer.next_token().unwrap().unwrap();
2521            match token {
2522                Token::HexString(data) => assert!(data.is_empty()),
2523                _ => panic!("Expected empty hex string"),
2524            }
2525
2526            // Odd number of hex digits
2527            let mut tokenizer = ContentTokenizer::new(b"<123>");
2528            let token = tokenizer.next_token().unwrap().unwrap();
2529            match token {
2530                Token::HexString(data) => assert_eq!(data, vec![0x12, 0x30]),
2531                _ => panic!("Expected hex string with odd digits"),
2532            }
2533
2534            // Hex string with whitespace
2535            let mut tokenizer = ContentTokenizer::new(b"<12 34\t56\n78>");
2536            let token = tokenizer.next_token().unwrap().unwrap();
2537            match token {
2538                Token::HexString(data) => assert_eq!(data, vec![0x12, 0x34, 0x56, 0x78]),
2539                _ => panic!("Expected hex string with whitespace"),
2540            }
2541        }
2542
2543        #[test]
2544        fn test_tokenizer_literal_string_escape_sequences() {
2545            // Test all standard escape sequences
2546            let mut tokenizer = ContentTokenizer::new(b"(\\n\\r\\t\\b\\f\\(\\)\\\\)");
2547            let token = tokenizer.next_token().unwrap().unwrap();
2548            match token {
2549                Token::String(data) => {
2550                    assert_eq!(
2551                        data,
2552                        vec![b'\n', b'\r', b'\t', 0x08, 0x0C, b'(', b')', b'\\']
2553                    );
2554                }
2555                _ => panic!("Expected string with escapes"),
2556            }
2557
2558            // Test octal escape sequences
2559            let mut tokenizer = ContentTokenizer::new(b"(\\101\\040\\377)");
2560            let token = tokenizer.next_token().unwrap().unwrap();
2561            match token {
2562                Token::String(data) => assert_eq!(data, vec![b'A', b' ', 255]),
2563                _ => panic!("Expected string with octal escapes"),
2564            }
2565        }
2566
2567        #[test]
2568        fn test_tokenizer_nested_parentheses() {
2569            let mut tokenizer = ContentTokenizer::new(b"(outer (inner) text)");
2570            let token = tokenizer.next_token().unwrap().unwrap();
2571            match token {
2572                Token::String(data) => {
2573                    assert_eq!(data, b"outer (inner) text");
2574                }
2575                _ => panic!("Expected string with nested parentheses"),
2576            }
2577
2578            // Multiple levels of nesting
2579            let mut tokenizer = ContentTokenizer::new(b"(level1 (level2 (level3) back2) back1)");
2580            let token = tokenizer.next_token().unwrap().unwrap();
2581            match token {
2582                Token::String(data) => {
2583                    assert_eq!(data, b"level1 (level2 (level3) back2) back1");
2584                }
2585                _ => panic!("Expected string with deep nesting"),
2586            }
2587        }
2588
2589        #[test]
2590        fn test_tokenizer_name_hex_escapes() {
2591            let mut tokenizer = ContentTokenizer::new(b"/Name#20With#20Spaces");
2592            let token = tokenizer.next_token().unwrap().unwrap();
2593            match token {
2594                Token::Name(name) => assert_eq!(name, "Name With Spaces"),
2595                _ => panic!("Expected name with hex escapes"),
2596            }
2597
2598            // Test various special characters
2599            let mut tokenizer = ContentTokenizer::new(b"/Special#2F#28#29#3C#3E");
2600            let token = tokenizer.next_token().unwrap().unwrap();
2601            match token {
2602                Token::Name(name) => assert_eq!(name, "Special/()<>"),
2603                _ => panic!("Expected name with special character escapes"),
2604            }
2605        }
2606
2607        #[test]
2608        fn test_tokenizer_number_edge_cases() {
2609            // Very large integers
2610            let mut tokenizer = ContentTokenizer::new(b"2147483647");
2611            let token = tokenizer.next_token().unwrap().unwrap();
2612            match token {
2613                Token::Integer(n) => assert_eq!(n, 2147483647),
2614                _ => panic!("Expected large integer"),
2615            }
2616
2617            // Very small numbers
2618            let mut tokenizer = ContentTokenizer::new(b"0.00001");
2619            let token = tokenizer.next_token().unwrap().unwrap();
2620            match token {
2621                Token::Number(n) => assert!((n - 0.00001).abs() < f32::EPSILON),
2622                _ => panic!("Expected small float"),
2623            }
2624
2625            // Numbers starting with dot
2626            let mut tokenizer = ContentTokenizer::new(b".5");
2627            let token = tokenizer.next_token().unwrap().unwrap();
2628            match token {
2629                Token::Number(n) => assert!((n - 0.5).abs() < f32::EPSILON),
2630                _ => panic!("Expected float starting with dot"),
2631            }
2632        }
2633
2634        #[test]
2635        fn test_parser_complex_path_operations() {
2636            let content = b"100 200 m 150 200 l 150 250 l 100 250 l h f";
2637            let operators = ContentParser::parse(content).unwrap();
2638
2639            assert_eq!(operators.len(), 6);
2640            assert_eq!(operators[0], ContentOperation::MoveTo(100.0, 200.0));
2641            assert_eq!(operators[1], ContentOperation::LineTo(150.0, 200.0));
2642            assert_eq!(operators[2], ContentOperation::LineTo(150.0, 250.0));
2643            assert_eq!(operators[3], ContentOperation::LineTo(100.0, 250.0));
2644            assert_eq!(operators[4], ContentOperation::ClosePath);
2645            assert_eq!(operators[5], ContentOperation::Fill);
2646        }
2647
2648        #[test]
2649        fn test_parser_bezier_curves() {
2650            let content = b"100 100 150 50 200 150 c";
2651            let operators = ContentParser::parse(content).unwrap();
2652
2653            assert_eq!(operators.len(), 1);
2654            match &operators[0] {
2655                ContentOperation::CurveTo(x1, y1, x2, y2, x3, y3) => {
2656                    // Values are parsed in reverse order: last 6 values for c operator
2657                    // Stack order: 100 100 150 50 200 150
2658                    // Pop order: x1=100, y1=100, x2=150, y2=50, x3=200, y3=150
2659                    assert!(x1.is_finite() && y1.is_finite());
2660                    assert!(x2.is_finite() && y2.is_finite());
2661                    assert!(x3.is_finite() && y3.is_finite());
2662                    // Verify we have 6 coordinate values
2663                    assert!(*x1 >= 50.0 && *x1 <= 200.0);
2664                    assert!(*y1 >= 50.0 && *y1 <= 200.0);
2665                }
2666                _ => panic!("Expected CurveTo operation"),
2667            }
2668        }
2669
2670        #[test]
2671        fn test_parser_color_operations() {
2672            let content = b"0.5 g 1 0 0 rg 0 1 0 1 k /DeviceRGB cs 0.2 0.4 0.6 sc";
2673            let operators = ContentParser::parse(content).unwrap();
2674
2675            assert_eq!(operators.len(), 5);
2676            match &operators[0] {
2677                ContentOperation::SetNonStrokingGray(gray) => assert_eq!(*gray, 0.5),
2678                _ => panic!("Expected SetNonStrokingGray"),
2679            }
2680            match &operators[1] {
2681                ContentOperation::SetNonStrokingRGB(r, g, b) => {
2682                    assert_eq!((*r, *g, *b), (1.0, 0.0, 0.0));
2683                }
2684                _ => panic!("Expected SetNonStrokingRGB"),
2685            }
2686        }
2687
2688        #[test]
2689        fn test_parser_text_positioning_advanced() {
2690            let content = b"BT 1 0 0 1 100 200 Tm 0 TL 10 TL (Line 1) ' (Line 2) ' ET";
2691            let operators = ContentParser::parse(content).unwrap();
2692
2693            assert_eq!(operators.len(), 7);
2694            assert_eq!(operators[0], ContentOperation::BeginText);
2695            match &operators[1] {
2696                ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
2697                    assert_eq!((*a, *b, *c, *d, *e, *f), (1.0, 0.0, 0.0, 1.0, 100.0, 200.0));
2698                }
2699                _ => panic!("Expected SetTextMatrix"),
2700            }
2701            assert_eq!(operators[6], ContentOperation::EndText);
2702        }
2703
2704        #[test]
2705        fn test_parser_graphics_state_operations() {
2706            let content = b"q 2 0 0 2 100 100 cm 5 w 1 J 2 j 10 M Q";
2707            let operators = ContentParser::parse(content).unwrap();
2708
2709            assert_eq!(operators.len(), 7);
2710            assert_eq!(operators[0], ContentOperation::SaveGraphicsState);
2711            match &operators[1] {
2712                ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
2713                    assert_eq!((*a, *b, *c, *d, *e, *f), (2.0, 0.0, 0.0, 2.0, 100.0, 100.0));
2714                }
2715                _ => panic!("Expected SetTransformMatrix"),
2716            }
2717            assert_eq!(operators[6], ContentOperation::RestoreGraphicsState);
2718        }
2719
2720        #[test]
2721        fn test_parser_xobject_operations() {
2722            let content = b"/Image1 Do /Form2 Do /Pattern3 Do";
2723            let operators = ContentParser::parse(content).unwrap();
2724
2725            assert_eq!(operators.len(), 3);
2726            for (i, expected_name) in ["Image1", "Form2", "Pattern3"].iter().enumerate() {
2727                match &operators[i] {
2728                    ContentOperation::PaintXObject(name) => assert_eq!(name, expected_name),
2729                    _ => panic!("Expected PaintXObject"),
2730                }
2731            }
2732        }
2733
2734        #[test]
2735        fn test_parser_marked_content_operations() {
2736            let content = b"/P BMC (Tagged content) Tj EMC";
2737            let operators = ContentParser::parse(content).unwrap();
2738
2739            assert_eq!(operators.len(), 3);
2740            match &operators[0] {
2741                ContentOperation::BeginMarkedContent(tag) => assert_eq!(tag, "P"),
2742                _ => panic!("Expected BeginMarkedContent"),
2743            }
2744            assert_eq!(operators[2], ContentOperation::EndMarkedContent);
2745        }
2746
2747        #[test]
2748        fn test_parser_error_handling_invalid_operators() {
2749            // Best-effort recovery contract (issue #319).
2750
2751            // Missing operands for `m`: the malformed operator is skipped but
2752            // a following valid `l` is recovered.
2753            let content = b"m 10 20 l";
2754            let ops = ContentParser::parse(content).expect("recovers from operand-less m");
2755            assert!(
2756                ops.iter()
2757                    .any(|op| matches!(op, ContentOperation::LineTo(_, _))),
2758                "valid LineTo after the operand-less m must survive: {ops:?}"
2759            );
2760
2761            // Unterminated hex string: the tokenizer stops at the malformed
2762            // tail but keeps every token before it, so valid text ahead of the
2763            // bad hex is still extracted.
2764            let content = b"(kept) Tj <ABC DEF";
2765            let ops = ContentParser::parse(content).expect("recovers, keeping pre-error tokens");
2766            assert!(
2767                ops.iter()
2768                    .any(|op| matches!(op, ContentOperation::ShowText(t) if t == b"kept")),
2769                "text before the unterminated hex must survive: {ops:?}"
2770            );
2771
2772            // Numbers without an operator parse OK (no operator attempted).
2773            let content = b"100 200 300";
2774            assert!(ContentParser::parse(content).is_ok());
2775        }
2776
2777        #[test]
2778        fn test_parser_whitespace_tolerance() {
2779            let content = b"  \n\t  100   \r\n  200  \t m  \n";
2780            let operators = ContentParser::parse(content).unwrap();
2781
2782            assert_eq!(operators.len(), 1);
2783            assert_eq!(operators[0], ContentOperation::MoveTo(100.0, 200.0));
2784        }
2785
2786        #[test]
2787        fn test_tokenizer_comment_handling() {
2788            let content = b"100 % This is a comment\n200 m % Another comment";
2789            let operators = ContentParser::parse(content).unwrap();
2790
2791            assert_eq!(operators.len(), 1);
2792            assert_eq!(operators[0], ContentOperation::MoveTo(100.0, 200.0));
2793        }
2794
2795        #[test]
2796        fn test_parser_stream_with_binary_data() {
2797            // Test content stream with comment containing binary-like data
2798            let content = b"100 200 m % Comment with \xFF binary\n150 250 l";
2799
2800            let operators = ContentParser::parse(content).unwrap();
2801            assert_eq!(operators.len(), 2);
2802            assert_eq!(operators[0], ContentOperation::MoveTo(100.0, 200.0));
2803            assert_eq!(operators[1], ContentOperation::LineTo(150.0, 250.0));
2804        }
2805
2806        #[test]
2807        fn test_tokenizer_array_parsing() {
2808            // Test simple operations that don't require complex array parsing
2809            let content = b"100 200 m 150 250 l";
2810            let operators = ContentParser::parse(content).unwrap();
2811
2812            assert_eq!(operators.len(), 2);
2813            assert_eq!(operators[0], ContentOperation::MoveTo(100.0, 200.0));
2814            assert_eq!(operators[1], ContentOperation::LineTo(150.0, 250.0));
2815        }
2816
2817        #[test]
2818        fn test_parser_rectangle_operations() {
2819            let content = b"10 20 100 50 re 0 0 200 300 re";
2820            let operators = ContentParser::parse(content).unwrap();
2821
2822            assert_eq!(operators.len(), 2);
2823            match &operators[0] {
2824                ContentOperation::Rectangle(x, y, width, height) => {
2825                    assert_eq!((*x, *y, *width, *height), (10.0, 20.0, 100.0, 50.0));
2826                }
2827                _ => panic!("Expected Rectangle operation"),
2828            }
2829            match &operators[1] {
2830                ContentOperation::Rectangle(x, y, width, height) => {
2831                    assert_eq!((*x, *y, *width, *height), (0.0, 0.0, 200.0, 300.0));
2832                }
2833                _ => panic!("Expected Rectangle operation"),
2834            }
2835        }
2836
2837        #[test]
2838        fn test_parser_clipping_operations() {
2839            let content = b"100 100 50 50 re W n 200 200 75 75 re W* n";
2840            let operators = ContentParser::parse(content).unwrap();
2841
2842            assert_eq!(operators.len(), 6);
2843            assert_eq!(operators[1], ContentOperation::Clip);
2844            assert_eq!(operators[2], ContentOperation::EndPath);
2845            assert_eq!(operators[4], ContentOperation::ClipEvenOdd);
2846            assert_eq!(operators[5], ContentOperation::EndPath);
2847        }
2848
2849        #[test]
2850        fn test_parser_painting_operations() {
2851            let content = b"S s f f* B B* b b*";
2852            let operators = ContentParser::parse(content).unwrap();
2853
2854            assert_eq!(operators.len(), 8);
2855            assert_eq!(operators[0], ContentOperation::Stroke);
2856            assert_eq!(operators[1], ContentOperation::CloseStroke);
2857            assert_eq!(operators[2], ContentOperation::Fill);
2858            assert_eq!(operators[3], ContentOperation::FillEvenOdd);
2859            assert_eq!(operators[4], ContentOperation::FillStroke);
2860            assert_eq!(operators[5], ContentOperation::FillStrokeEvenOdd);
2861            assert_eq!(operators[6], ContentOperation::CloseFillStroke);
2862            assert_eq!(operators[7], ContentOperation::CloseFillStrokeEvenOdd);
2863        }
2864
2865        #[test]
2866        fn test_parser_line_style_operations() {
2867            let content = b"5 w 1 J 2 j 10 M [ 3 2 ] 0 d";
2868            let operators = ContentParser::parse(content).unwrap();
2869
2870            assert_eq!(operators.len(), 5);
2871            assert_eq!(operators[0], ContentOperation::SetLineWidth(5.0));
2872            assert_eq!(operators[1], ContentOperation::SetLineCap(1));
2873            assert_eq!(operators[2], ContentOperation::SetLineJoin(2));
2874            assert_eq!(operators[3], ContentOperation::SetMiterLimit(10.0));
2875            // Dash pattern test would need array support
2876        }
2877
2878        #[test]
2879        fn test_parser_text_state_operations() {
2880            let content = b"12 Tc 3 Tw 100 Tz 1 Tr 2 Ts";
2881            let operators = ContentParser::parse(content).unwrap();
2882
2883            assert_eq!(operators.len(), 5);
2884            assert_eq!(operators[0], ContentOperation::SetCharSpacing(12.0));
2885            assert_eq!(operators[1], ContentOperation::SetWordSpacing(3.0));
2886            assert_eq!(operators[2], ContentOperation::SetHorizontalScaling(100.0));
2887            assert_eq!(operators[3], ContentOperation::SetTextRenderMode(1));
2888            assert_eq!(operators[4], ContentOperation::SetTextRise(2.0));
2889        }
2890
2891        #[test]
2892        fn test_parser_unicode_text() {
2893            let content = b"BT (Hello \xC2\xA9 World \xE2\x9C\x93) Tj ET";
2894            let operators = ContentParser::parse(content).unwrap();
2895
2896            assert_eq!(operators.len(), 3);
2897            assert_eq!(operators[0], ContentOperation::BeginText);
2898            match &operators[1] {
2899                ContentOperation::ShowText(text) => {
2900                    assert!(text.len() > 5); // Should contain Unicode bytes
2901                }
2902                _ => panic!("Expected ShowText operation"),
2903            }
2904            assert_eq!(operators[2], ContentOperation::EndText);
2905        }
2906
2907        #[test]
2908        fn test_parser_stress_test_large_coordinates() {
2909            let content = b"999999.999 -999999.999 999999.999 -999999.999 999999.999 -999999.999 c";
2910            let operators = ContentParser::parse(content).unwrap();
2911
2912            assert_eq!(operators.len(), 1);
2913            match &operators[0] {
2914                ContentOperation::CurveTo(_x1, _y1, _x2, _y2, _x3, _y3) => {
2915                    assert!((*_x1 - 999999.999).abs() < 0.1);
2916                    assert!((*_y1 - (-999999.999)).abs() < 0.1);
2917                    assert!((*_x3 - 999999.999).abs() < 0.1);
2918                }
2919                _ => panic!("Expected CurveTo operation"),
2920            }
2921        }
2922
2923        #[test]
2924        fn test_parser_empty_content_stream() {
2925            let content = b"";
2926            let operators = ContentParser::parse(content).unwrap();
2927            assert!(operators.is_empty());
2928
2929            let content = b"   \n\t\r   ";
2930            let operators = ContentParser::parse(content).unwrap();
2931            assert!(operators.is_empty());
2932        }
2933
2934        #[test]
2935        fn test_tokenizer_error_recovery() {
2936            // A comment carrying a stray binary byte sits between two valid
2937            // path operators. The comment (and its binary) is skipped and
2938            // BOTH operators are recovered (issue #319 recovery contract).
2939            let content = b"100 200 m % Comment with\xFFbinary\n150 250 l";
2940            let ops = ContentParser::parse(content).expect("recovers around binary comment");
2941            assert!(
2942                ops.iter()
2943                    .any(|op| matches!(op, ContentOperation::MoveTo(_, _))),
2944                "MoveTo before the comment must survive: {ops:?}"
2945            );
2946            assert!(
2947                ops.iter()
2948                    .any(|op| matches!(op, ContentOperation::LineTo(_, _))),
2949                "LineTo after the comment must survive: {ops:?}"
2950            );
2951        }
2952
2953        #[test]
2954        fn malformed_operator_does_not_discard_surrounding_text() {
2955            // Issue #319: a single malformed operator must NOT drop the whole
2956            // page's content. Here a bare `Td` (missing its two operands)
2957            // sits between two valid text-show operators. Before the fix,
2958            // `parse_operators` propagated the operand error with `?`, so the
2959            // entire stream returned Err and BOTH show-text ops were lost
2960            // (the extractor then dropped the page). The parser must recover:
2961            // skip the bad operator, keep the valid ones.
2962            let content = b"BT /F1 12 Tf 72 700 Td (First line) Tj Td (Second line) Tj ET";
2963            let ops = ContentParser::parse_content(content)
2964                .expect("malformed operator must not fail the whole stream");
2965            let shown: Vec<&Vec<u8>> = ops
2966                .iter()
2967                .filter_map(|op| match op {
2968                    ContentOperation::ShowText(t) => Some(t),
2969                    _ => None,
2970                })
2971                .collect();
2972            assert_eq!(
2973                shown.len(),
2974                2,
2975                "both valid Tj operators must survive the malformed Td"
2976            );
2977            assert_eq!(shown[0], b"First line");
2978            assert_eq!(shown[1], b"Second line");
2979        }
2980
2981        #[test]
2982        fn test_parser_optimization_repeated_operations() {
2983            // Test performance with many repeated operations
2984            let mut content = Vec::new();
2985            for i in 0..1000 {
2986                content.extend_from_slice(format!("{} {} m ", i, i * 2).as_bytes());
2987            }
2988
2989            let start = std::time::Instant::now();
2990            let operators = ContentParser::parse(&content).unwrap();
2991            let duration = start.elapsed();
2992
2993            assert_eq!(operators.len(), 1000);
2994            assert!(duration.as_millis() < 200); // Should be fast
2995        }
2996
2997        #[test]
2998        fn test_parser_memory_efficiency_large_strings() {
2999            // Test with large text content
3000            let large_text = "A".repeat(10000);
3001            let content = format!("BT ({}) Tj ET", large_text);
3002            let operators = ContentParser::parse(content.as_bytes()).unwrap();
3003
3004            assert_eq!(operators.len(), 3);
3005            match &operators[1] {
3006                ContentOperation::ShowText(text) => {
3007                    assert_eq!(text.len(), 10000);
3008                }
3009                _ => panic!("Expected ShowText operation"),
3010            }
3011        }
3012    }
3013
3014    #[test]
3015    fn test_content_stream_too_large() {
3016        // Test handling of very large content streams (covering potential size limits)
3017        let mut large_content = Vec::new();
3018
3019        // Create a content stream with many operations
3020        for i in 0..10000 {
3021            large_content.extend_from_slice(format!("{} {} m ", i, i).as_bytes());
3022        }
3023        large_content.extend_from_slice(b"S");
3024
3025        // Should handle large content without panic
3026        let result = ContentParser::parse_content(&large_content);
3027        assert!(result.is_ok());
3028
3029        let operations = result.unwrap();
3030        // Should have many MoveTo operations plus one Stroke
3031        assert!(operations.len() > 10000);
3032    }
3033
3034    #[test]
3035    fn test_invalid_operator_handling() {
3036        // Test parsing with invalid operators
3037        let content = b"100 200 INVALID_OP 300 400 m";
3038        let result = ContentParser::parse_content(content);
3039
3040        // Should either handle gracefully or return error
3041        if let Ok(operations) = result {
3042            // If it succeeds, should have at least the valid MoveTo
3043            assert!(operations
3044                .iter()
3045                .any(|op| matches!(op, ContentOperation::MoveTo(_, _))));
3046        }
3047    }
3048
3049    #[test]
3050    fn test_nested_arrays_malformed() {
3051        // Test malformed nested arrays in TJ operator
3052        let content = b"[[(Hello] [World)]] TJ";
3053        let result = ContentParser::parse_content(content);
3054
3055        // Should handle malformed arrays gracefully
3056        assert!(result.is_ok() || result.is_err());
3057    }
3058
3059    #[test]
3060    fn test_escape_sequences_in_strings() {
3061        // Test various escape sequences in strings
3062        let test_cases = vec![
3063            (b"(\\n\\r\\t)".as_slice(), b"\n\r\t".as_slice()),
3064            (b"(\\\\)".as_slice(), b"\\".as_slice()),
3065            (b"(\\(\\))".as_slice(), b"()".as_slice()),
3066            (b"(\\123)".as_slice(), b"S".as_slice()), // Octal 123 = 83 = 'S'
3067            (b"(\\0)".as_slice(), b"\0".as_slice()),
3068        ];
3069
3070        for (input, expected) in test_cases {
3071            let mut content = Vec::new();
3072            content.extend_from_slice(input);
3073            content.extend_from_slice(b" Tj");
3074
3075            let result = ContentParser::parse_content(&content);
3076            assert!(result.is_ok());
3077
3078            let operations = result.unwrap();
3079            if let ContentOperation::ShowText(text) = &operations[0] {
3080                assert_eq!(text, expected, "Failed for input: {:?}", input);
3081            } else {
3082                panic!("Expected ShowText operation");
3083            }
3084        }
3085    }
3086
3087    #[test]
3088    fn test_content_with_inline_images() {
3089        // Test handling of inline images in content stream
3090        let content = b"BI /W 10 /H 10 /CS /RGB ID \x00\x01\x02\x03 EI";
3091        let result = ContentParser::parse_content(content);
3092
3093        // Should handle inline images (even if not fully implemented)
3094        assert!(result.is_ok() || result.is_err());
3095    }
3096
3097    #[test]
3098    fn test_operator_with_missing_operands() {
3099        // Test operators with insufficient operands
3100        let test_cases = vec![
3101            b"Tj" as &[u8], // ShowText without string
3102            b"m",           // MoveTo without coordinates
3103            b"rg",          // SetRGBColor without values
3104            b"Tf",          // SetFont without name and size
3105        ];
3106
3107        for content in test_cases {
3108            let result = ContentParser::parse_content(content);
3109            // Should handle gracefully (error or skip)
3110            assert!(result.is_ok() || result.is_err());
3111        }
3112    }
3113
3114    // --- Tests for infinite loop fix (curly braces, stray parens, inline images) ---
3115
3116    #[test]
3117    fn test_tokenizer_handles_curly_braces() {
3118        // Curly braces { } are not valid PDF content operators but appear in
3119        // binary inline image data. The tokenizer must skip them without hanging.
3120        let input = b"q { } Q";
3121        let mut tokenizer = ContentTokenizer::new(input);
3122
3123        let mut tokens = Vec::new();
3124        while let Some(token) = tokenizer.next_token().unwrap() {
3125            tokens.push(token);
3126        }
3127
3128        // Should produce tokens for q and Q, skipping { and }
3129        assert!(tokens.contains(&Token::Operator("q".to_string())));
3130        assert!(tokens.contains(&Token::Operator("Q".to_string())));
3131    }
3132
3133    #[test]
3134    fn test_tokenizer_handles_closing_paren() {
3135        // A stray ) outside a string literal should be skipped, not cause a hang
3136        let input = b"q ) Q";
3137        let mut tokenizer = ContentTokenizer::new(input);
3138
3139        let mut tokens = Vec::new();
3140        while let Some(token) = tokenizer.next_token().unwrap() {
3141            tokens.push(token);
3142        }
3143
3144        assert!(tokens.contains(&Token::Operator("q".to_string())));
3145        assert!(tokens.contains(&Token::Operator("Q".to_string())));
3146    }
3147
3148    #[test]
3149    fn test_inline_image_binary_with_curly_braces() {
3150        // Inline image binary data containing { and } bytes must be handled
3151        // correctly — the tokenizer should capture them as raw image data
3152        let content = b"BI /W 2 /H 2 /BPC 8 /CS /G ID \x7B\x7D\x00\xFF EI Q";
3153        let result = ContentParser::parse_content(content);
3154        assert!(
3155            result.is_ok(),
3156            "Parsing inline image with curly braces failed: {:?}",
3157            result.err()
3158        );
3159
3160        let ops = result.unwrap();
3161        // Should have InlineImage + RestoreGraphicsState
3162        let has_inline = ops
3163            .iter()
3164            .any(|op| matches!(op, ContentOperation::InlineImage { .. }));
3165        let has_q = ops
3166            .iter()
3167            .any(|op| matches!(op, ContentOperation::RestoreGraphicsState));
3168        assert!(has_inline, "Expected InlineImage operation");
3169        assert!(has_q, "Expected RestoreGraphicsState after EI");
3170    }
3171
3172    #[test]
3173    fn test_inline_image_binary_with_all_byte_values() {
3174        // Inline image with bytes 0x00-0xFF to ensure no byte causes a hang
3175        let mut content = Vec::new();
3176        content.extend_from_slice(b"BI /W 16 /H 16 /BPC 8 /CS /G ID ");
3177        // Add all 256 byte values as image data
3178        for b in 0u8..=255 {
3179            content.push(b);
3180        }
3181        content.extend_from_slice(b" EI Q");
3182
3183        let result = ContentParser::parse_content(&content);
3184        assert!(
3185            result.is_ok(),
3186            "Parsing inline image with all byte values failed: {:?}",
3187            result.err()
3188        );
3189    }
3190
3191    #[test]
3192    fn test_inline_image_ei_detection() {
3193        // EI must be preceded by whitespace to be recognized as end marker
3194        // "EI" within binary data (not preceded by whitespace) should NOT end the image
3195        let content = b"BI /W 2 /H 1 /BPC 8 /CS /G ID \x45\x49\x00\n EI Q";
3196        //                                               ^E  ^I  (within data)  ^real EI
3197        let result = ContentParser::parse_content(content);
3198        assert!(result.is_ok(), "EI detection failed: {:?}", result.err());
3199
3200        let ops = result.unwrap();
3201        let has_inline = ops
3202            .iter()
3203            .any(|op| matches!(op, ContentOperation::InlineImage { .. }));
3204        assert!(has_inline, "Expected InlineImage operation");
3205    }
3206
3207    #[test]
3208    fn test_tokenizer_no_infinite_loop_on_consecutive_delimiters() {
3209        // Multiple consecutive unhandled delimiters must not cause a hang
3210        let input = b"q {{{}}})))) Q";
3211        let mut tokenizer = ContentTokenizer::new(input);
3212
3213        let mut tokens = Vec::new();
3214        while let Some(token) = tokenizer.next_token().unwrap() {
3215            tokens.push(token);
3216            if tokens.len() > 100 {
3217                panic!("Tokenizer produced too many tokens — possible infinite loop");
3218            }
3219        }
3220
3221        assert!(tokens.contains(&Token::Operator("q".to_string())));
3222        assert!(tokens.contains(&Token::Operator("Q".to_string())));
3223    }
3224
3225    #[test]
3226    fn test_content_parser_inline_image_produces_correct_operation() {
3227        // Full parse of a simple inline image should produce correct params
3228        let content = b"BI /W 4 /H 4 /BPC 8 /CS /G ID \x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F EI";
3229        let result = ContentParser::parse_content(content);
3230        assert!(result.is_ok(), "Parse failed: {:?}", result.err());
3231
3232        let ops = result.unwrap();
3233        assert_eq!(
3234            ops.len(),
3235            1,
3236            "Expected exactly 1 operation, got {}",
3237            ops.len()
3238        );
3239
3240        if let ContentOperation::InlineImage { params, data } = &ops[0] {
3241            assert_eq!(params.get("Width"), Some(&Object::Integer(4)));
3242            assert_eq!(params.get("Height"), Some(&Object::Integer(4)));
3243            assert_eq!(params.get("BitsPerComponent"), Some(&Object::Integer(8)));
3244            assert!(!data.is_empty(), "Image data should not be empty");
3245        } else {
3246            panic!("Expected InlineImage operation, got {:?}", ops[0]);
3247        }
3248    }
3249
3250    #[test]
3251    fn test_octal_escape_overflow_777() {
3252        // \777 = octal 777 = 511 decimal, overflows u8.
3253        // Per ISO 32000-1:2008 §7.3.4.2: "high-order overflow shall be ignored"
3254        // 511 as u8 = 255 (0x1FF truncated to 0xFF)
3255        let mut tokenizer = ContentTokenizer::new(b"(\\777)");
3256        let token = tokenizer.next_token().unwrap().unwrap();
3257        match token {
3258            Token::String(data) => assert_eq!(data, vec![0xFF]),
3259            _ => panic!("Expected string token"),
3260        }
3261    }
3262
3263    #[test]
3264    fn test_octal_escape_overflow_400() {
3265        // \400 = octal 400 = 256 decimal, just overflows u8.
3266        // 256 as u8 = 0
3267        let mut tokenizer = ContentTokenizer::new(b"(\\400)");
3268        let token = tokenizer.next_token().unwrap().unwrap();
3269        match token {
3270            Token::String(data) => assert_eq!(data, vec![0x00]),
3271            _ => panic!("Expected string token"),
3272        }
3273    }
3274
3275    #[test]
3276    fn test_octal_escape_overflow_577() {
3277        // \577 = octal 577 = 383 decimal.
3278        // 383 as u8 = 127 (0x17F truncated to 0x7F)
3279        let mut tokenizer = ContentTokenizer::new(b"(\\577)");
3280        let token = tokenizer.next_token().unwrap().unwrap();
3281        match token {
3282            Token::String(data) => assert_eq!(data, vec![0x7F]),
3283            _ => panic!("Expected string token"),
3284        }
3285    }
3286
3287    #[test]
3288    fn test_octal_escape_max_valid_377() {
3289        // \377 = 255, max valid octal for u8 - should still work correctly
3290        let mut tokenizer = ContentTokenizer::new(b"(\\377)");
3291        let token = tokenizer.next_token().unwrap().unwrap();
3292        match token {
3293            Token::String(data) => assert_eq!(data, vec![0xFF]),
3294            _ => panic!("Expected string token"),
3295        }
3296    }
3297
3298    #[test]
3299    fn test_octal_escape_overflow_mixed_with_valid() {
3300        // Mix of overflow octal and normal text
3301        let mut tokenizer = ContentTokenizer::new(b"(A\\777B\\101C)");
3302        let token = tokenizer.next_token().unwrap().unwrap();
3303        match token {
3304            Token::String(data) => {
3305                assert_eq!(data, vec![b'A', 0xFF, b'B', b'A', b'C']);
3306            }
3307            _ => panic!("Expected string token"),
3308        }
3309    }
3310}