Skip to main content

rpdfium_parser/
content_stream.rs

// Derived from PDFium's cpdf_streamparser.cpp / cpdf_contentparser.cpp
// Original: Copyright 2014 The PDFium Authors
// Licensed under BSD-3-Clause / Apache-2.0
// See pdfium-upstream/LICENSE for the original license.

//! Content stream operator tokenization (Stage 1).
//!
//! Tokenizes content stream bytes into a sequence of PDF operators.
//! This module understands the PostScript-like operand stack + operator
//! keyword pattern but not the semantics of operators.
//!
//! ```text
//! rpdfium-parser:   &[u8] → Vec<Operator>          (tokenization — syntactic)
//! rpdfium-page:     Vec<Operator> → DisplayTree     (interpretation — semantic)
//! ```
16
17use std::collections::HashMap;
18
19use rpdfium_core::Name;
20use rpdfium_core::error::PdfError;
21
22use crate::tokenizer::{Token, Tokenizer, is_delimiter, is_whitespace};
23
24/// A content stream operand value.
25#[derive(Debug, Clone, PartialEq)]
26pub enum Operand {
27    Integer(i64),
28    Real(f64),
29    Name(Name),
30    String(Vec<u8>),
31    Array(Vec<Operand>),
32    Boolean(bool),
33    Null,
34}
35
36impl Operand {
37    /// Get as f32, converting from integer if needed.
38    pub fn as_f32(&self) -> Option<f32> {
39        match self {
40            Operand::Integer(n) => Some(*n as f32),
41            Operand::Real(f) => Some(*f as f32),
42            _ => None,
43        }
44    }
45
46    /// Get as i64.
47    pub fn as_i64(&self) -> Option<i64> {
48        match self {
49            Operand::Integer(n) => Some(*n),
50            Operand::Real(f) => Some(*f as i64),
51            _ => None,
52        }
53    }
54}
55
56/// A parsed content stream operator with its operands.
57#[derive(Debug, Clone, PartialEq)]
58pub enum Operator {
59    // --- Text state operators ---
60    /// BT — Begin text object.
61    BeginText,
62    /// ET — End text object.
63    EndText,
64    /// Tf name size — Set font and size.
65    SetFont { name: Name, size: f32 },
66    /// Td tx ty — Move text position.
67    MoveText { tx: f32, ty: f32 },
68    /// TD tx ty — Move text position and set leading.
69    MoveTextSetLeading { tx: f32, ty: f32 },
70    /// Tm a b c d e f — Set text matrix.
71    SetTextMatrix {
72        a: f32,
73        b: f32,
74        c: f32,
75        d: f32,
76        e: f32,
77        f: f32,
78    },
79    /// T* — Move to next line.
80    NextLine,
81    /// Tj string — Show text.
82    ShowText { bytes: Vec<u8> },
83    /// TJ array — Show text with positioning.
84    ShowTextArray { elements: Vec<TextArrayElement> },
85    /// ' string — Move to next line and show text.
86    NextLineShowText { bytes: Vec<u8> },
87    /// " aw ac string — Set word/char spacing, move to next line, show text.
88    SetSpacingShowText {
89        word_space: f32,
90        char_space: f32,
91        bytes: Vec<u8>,
92    },
93    /// Tc charSpace — Set character spacing.
94    SetCharSpacing { spacing: f32 },
95    /// Tw wordSpace — Set word spacing.
96    SetWordSpacing { spacing: f32 },
97    /// Tz scale — Set horizontal scaling.
98    SetHorizontalScaling { scale: f32 },
99    /// TL leading — Set text leading.
100    SetTextLeading { leading: f32 },
101    /// Ts rise — Set text rise.
102    SetTextRise { rise: f32 },
103    /// Tr render — Set text rendering mode.
104    SetTextRenderingMode { mode: i64 },
105
106    // --- Graphics state operators ---
107    /// q — Save graphics state.
108    SaveState,
109    /// Q — Restore graphics state.
110    RestoreState,
111    /// cm a b c d e f — Concatenate matrix.
112    ConcatMatrix {
113        a: f32,
114        b: f32,
115        c: f32,
116        d: f32,
117        e: f32,
118        f: f32,
119    },
120    /// w lineWidth — Set line width.
121    SetLineWidth { width: f32 },
122    /// J lineCap — Set line cap style.
123    SetLineCap { cap: i64 },
124    /// j lineJoin — Set line join style.
125    SetLineJoin { join: i64 },
126    /// M miterLimit — Set miter limit.
127    SetMiterLimit { limit: f32 },
128    /// d dashArray dashPhase — Set line dash pattern.
129    SetDashPattern { array: Vec<f32>, phase: f32 },
130    /// ri intent — Set rendering intent.
131    SetRenderingIntent { intent: Name },
132    /// i flatness — Set flatness tolerance.
133    SetFlatness { flatness: f32 },
134    /// gs name — Set graphics state from external dictionary.
135    SetGraphicsState { name: Name },
136
137    // --- Path construction operators ---
138    /// m x y — Begin new subpath.
139    MoveTo { x: f32, y: f32 },
140    /// l x y — Append straight line.
141    LineTo { x: f32, y: f32 },
142    /// c x1 y1 x2 y2 x3 y3 — Append cubic Bezier curve.
143    CurveTo {
144        x1: f32,
145        y1: f32,
146        x2: f32,
147        y2: f32,
148        x3: f32,
149        y3: f32,
150    },
151    /// v x2 y2 x3 y3 — Append cubic Bezier (initial point replicated).
152    CurveToInitial { x2: f32, y2: f32, x3: f32, y3: f32 },
153    /// y x1 y1 x3 y3 — Append cubic Bezier (final point replicated).
154    CurveToFinal { x1: f32, y1: f32, x3: f32, y3: f32 },
155    /// h — Close subpath.
156    ClosePath,
157    /// re x y w h — Append rectangle.
158    Rectangle { x: f32, y: f32, w: f32, h: f32 },
159
160    // --- Path painting operators ---
161    /// S — Stroke path.
162    Stroke,
163    /// s — Close and stroke path.
164    CloseAndStroke,
165    /// f — Fill path (non-zero winding rule).
166    Fill,
167    /// F — Fill path (non-zero winding rule, obsolete).
168    FillObsolete,
169    /// f* — Fill path (even-odd rule).
170    FillEvenOdd,
171    /// B — Fill and stroke (non-zero winding).
172    FillStroke,
173    /// B* — Fill and stroke (even-odd).
174    FillStrokeEvenOdd,
175    /// b — Close, fill, and stroke (non-zero winding).
176    CloseFillStroke,
177    /// b* — Close, fill, and stroke (even-odd).
178    CloseFillStrokeEvenOdd,
179    /// n — End path (no-op painting).
180    EndPath,
181
182    // --- Clipping operators ---
183    /// W — Set clipping path (non-zero winding).
184    Clip,
185    /// W* — Set clipping path (even-odd).
186    ClipEvenOdd,
187
188    // --- Color operators ---
189    /// CS name — Set color space (stroking).
190    SetColorSpaceStroke { name: Name },
191    /// cs name — Set color space (non-stroking).
192    SetColorSpaceFill { name: Name },
193    /// SC c1...cn — Set color (stroking).
194    SetColorStroke { components: Vec<f32> },
195    /// sc c1...cn — Set color (non-stroking).
196    SetColorFill { components: Vec<f32> },
197    /// SCN c1...cn \[name\] — Set color (stroking, with pattern).
198    SetColorStrokeN {
199        components: Vec<f32>,
200        name: Option<Name>,
201    },
202    /// scn c1...cn \[name\] — Set color (non-stroking, with pattern).
203    SetColorFillN {
204        components: Vec<f32>,
205        name: Option<Name>,
206    },
207    /// G gray — Set gray (stroking).
208    SetGrayStroke { gray: f32 },
209    /// g gray — Set gray (non-stroking).
210    SetGrayFill { gray: f32 },
211    /// RG r g b — Set RGB (stroking).
212    SetRgbStroke { r: f32, g: f32, b: f32 },
213    /// rg r g b — Set RGB (non-stroking).
214    SetRgbFill { r: f32, g: f32, b: f32 },
215    /// K c m y k — Set CMYK (stroking).
216    SetCmykStroke { c: f32, m: f32, y: f32, k: f32 },
217    /// k c m y k — Set CMYK (non-stroking).
218    SetCmykFill { c: f32, m: f32, y: f32, k: f32 },
219
220    // --- XObject, inline image, and shading operators ---
221    /// Do name — Paint XObject.
222    PaintXObject { name: Name },
223    /// sh name — Paint shading.
224    PaintShading { name: Name },
225    /// BI ... ID ... EI — Inline image with parsed properties and raw data.
226    InlineImage {
227        properties: HashMap<Name, Operand>,
228        data: Vec<u8>,
229    },
230
231    // --- Marked content operators ---
232    /// BMC tag — Begin marked content.
233    BeginMarkedContent { tag: Name },
234    /// BDC tag properties — Begin marked content with properties.
235    BeginMarkedContentDict { tag: Name, properties: Operand },
236    /// EMC — End marked content.
237    EndMarkedContent,
238    /// MP tag — Marked content point.
239    MarkedContentPoint { tag: Name },
240    /// DP tag properties — Marked content point with properties.
241    MarkedContentPointDict { tag: Name, properties: Operand },
242
243    // --- Type 3 character width operators ---
244    /// d0 wx wy — Set character width (Type 3).
245    SetCharWidth { wx: f32, wy: f32 },
246    /// d1 wx wy llx lly urx ury — Set cache device (Type 3).
247    SetCacheDevice {
248        wx: f32,
249        wy: f32,
250        llx: f32,
251        lly: f32,
252        urx: f32,
253        ury: f32,
254    },
255
256    // --- Compatibility operators ---
257    /// BX — Begin compatibility section.
258    BeginCompat,
259    /// EX — End compatibility section.
260    EndCompat,
261
262    /// An unknown/unrecognized operator keyword with its raw operands.
263    Unknown {
264        keyword: Vec<u8>,
265        operands: Vec<Operand>,
266    },
267}
268
/// One entry of the array operand consumed by the `TJ` operator.
#[derive(Debug, Clone, PartialEq)]
pub enum TextArrayElement {
    /// Raw bytes of a string to be shown.
    Text(Vec<u8>),
    /// A numeric position adjustment (a negative value moves right in the
    /// text direction).
    Adjustment(f32),
}
277
278/// Tokenize a content stream into a sequence of operators.
279///
280/// Content streams use a PostScript-like syntax: operands are pushed onto
281/// a stack, then an operator keyword consumes them.
282pub fn tokenize_content_stream(data: &[u8]) -> Result<Vec<Operator>, PdfError> {
283    let mut tok = Tokenizer::new(data);
284    let mut operands: Vec<Operand> = Vec::new();
285    let mut operators = Vec::new();
286
287    loop {
288        let token = match tok.next_token() {
289            Some(Ok(t)) => t,
290            Some(Err(_)) => {
291                // Skip invalid tokens in content streams (lenient)
292                continue;
293            }
294            None => break,
295        };
296
297        match token {
298            Token::Integer(n) => operands.push(Operand::Integer(n)),
299            Token::Real(f) => operands.push(Operand::Real(f)),
300            Token::Name(n) => operands.push(Operand::Name(n)),
301            Token::String(s) => operands.push(Operand::String(s.as_bytes().to_vec())),
302            Token::Boolean(b) => operands.push(Operand::Boolean(b)),
303            Token::Null => operands.push(Operand::Null),
304            Token::ArrayStart => {
305                // Read array operand
306                let arr = read_operand_array(&mut tok)?;
307                operands.push(Operand::Array(arr));
308            }
309            Token::Ref(_) => {
310                // References shouldn't appear in content streams; treat as unknown
311                tracing::warn!("indirect reference in content stream, ignoring");
312            }
313            Token::Keyword(kw) => {
314                let op = build_operator(&kw, &mut operands, &mut tok);
315                operators.push(op);
316            }
317            Token::DictStart | Token::DictEnd | Token::ArrayEnd | Token::Comment(_) => {
318                // These shouldn't normally appear at the top level, skip
319            }
320        }
321    }
322
323    Ok(operators)
324}
325
326/// Read an array of operands from the tokenizer (for TJ arrays, dash patterns, etc.).
327fn read_operand_array(tok: &mut Tokenizer<'_>) -> Result<Vec<Operand>, PdfError> {
328    let mut arr = Vec::new();
329    loop {
330        match tok.next_token() {
331            Some(Ok(Token::ArrayEnd)) => return Ok(arr),
332            Some(Ok(Token::Integer(n))) => arr.push(Operand::Integer(n)),
333            Some(Ok(Token::Real(f))) => arr.push(Operand::Real(f)),
334            Some(Ok(Token::String(s))) => {
335                arr.push(Operand::String(s.as_bytes().to_vec()));
336            }
337            Some(Ok(Token::Name(n))) => arr.push(Operand::Name(n)),
338            Some(Ok(Token::Boolean(b))) => arr.push(Operand::Boolean(b)),
339            Some(Ok(Token::Null)) => arr.push(Operand::Null),
340            None => return Ok(arr),
341            _ => continue,
342        }
343    }
344}
345
346/// Build an operator from the keyword and operand stack.
347fn build_operator(
348    keyword: &[u8],
349    operands: &mut Vec<Operand>,
350    tok: &mut Tokenizer<'_>,
351) -> Operator {
352    let op = match keyword {
353        // Text state
354        b"BT" => Operator::BeginText,
355        b"ET" => Operator::EndText,
356        b"Tf" => {
357            let size = pop_f32(operands);
358            let name = pop_name(operands);
359            Operator::SetFont { name, size }
360        }
361        b"Td" => {
362            let ty = pop_f32(operands);
363            let tx = pop_f32(operands);
364            Operator::MoveText { tx, ty }
365        }
366        b"TD" => {
367            let ty = pop_f32(operands);
368            let tx = pop_f32(operands);
369            Operator::MoveTextSetLeading { tx, ty }
370        }
371        b"Tm" => {
372            let f = pop_f32(operands);
373            let e = pop_f32(operands);
374            let d = pop_f32(operands);
375            let c = pop_f32(operands);
376            let b = pop_f32(operands);
377            let a = pop_f32(operands);
378            Operator::SetTextMatrix { a, b, c, d, e, f }
379        }
380        b"T*" => Operator::NextLine,
381        b"Tj" => {
382            let bytes = pop_bytes(operands);
383            Operator::ShowText { bytes }
384        }
385        b"TJ" => {
386            let elements = pop_text_array(operands);
387            Operator::ShowTextArray { elements }
388        }
389        b"'" => {
390            let bytes = pop_bytes(operands);
391            Operator::NextLineShowText { bytes }
392        }
393        b"\"" => {
394            let bytes = pop_bytes(operands);
395            let char_space = pop_f32(operands);
396            let word_space = pop_f32(operands);
397            Operator::SetSpacingShowText {
398                word_space,
399                char_space,
400                bytes,
401            }
402        }
403        b"Tc" => Operator::SetCharSpacing {
404            spacing: pop_f32(operands),
405        },
406        b"Tw" => Operator::SetWordSpacing {
407            spacing: pop_f32(operands),
408        },
409        b"Tz" => Operator::SetHorizontalScaling {
410            scale: pop_f32(operands),
411        },
412        b"TL" => Operator::SetTextLeading {
413            leading: pop_f32(operands),
414        },
415        b"Ts" => Operator::SetTextRise {
416            rise: pop_f32(operands),
417        },
418        b"Tr" => Operator::SetTextRenderingMode {
419            mode: pop_i64(operands),
420        },
421
422        // Graphics state
423        b"q" => Operator::SaveState,
424        b"Q" => Operator::RestoreState,
425        b"cm" => {
426            let f = pop_f32(operands);
427            let e = pop_f32(operands);
428            let d = pop_f32(operands);
429            let c = pop_f32(operands);
430            let b = pop_f32(operands);
431            let a = pop_f32(operands);
432            Operator::ConcatMatrix { a, b, c, d, e, f }
433        }
434        b"w" => Operator::SetLineWidth {
435            width: pop_f32(operands),
436        },
437        b"J" => Operator::SetLineCap {
438            cap: pop_i64(operands),
439        },
440        b"j" => Operator::SetLineJoin {
441            join: pop_i64(operands),
442        },
443        b"M" => Operator::SetMiterLimit {
444            limit: pop_f32(operands),
445        },
446        b"d" => {
447            let phase = pop_f32(operands);
448            let array = pop_f32_array(operands);
449            Operator::SetDashPattern { array, phase }
450        }
451        b"ri" => Operator::SetRenderingIntent {
452            intent: pop_name(operands),
453        },
454        b"i" => Operator::SetFlatness {
455            flatness: pop_f32(operands),
456        },
457        b"gs" => Operator::SetGraphicsState {
458            name: pop_name(operands),
459        },
460
461        // Path construction
462        b"m" => {
463            let y = pop_f32(operands);
464            let x = pop_f32(operands);
465            Operator::MoveTo { x, y }
466        }
467        b"l" => {
468            let y = pop_f32(operands);
469            let x = pop_f32(operands);
470            Operator::LineTo { x, y }
471        }
472        b"c" => {
473            let y3 = pop_f32(operands);
474            let x3 = pop_f32(operands);
475            let y2 = pop_f32(operands);
476            let x2 = pop_f32(operands);
477            let y1 = pop_f32(operands);
478            let x1 = pop_f32(operands);
479            Operator::CurveTo {
480                x1,
481                y1,
482                x2,
483                y2,
484                x3,
485                y3,
486            }
487        }
488        b"v" => {
489            let y3 = pop_f32(operands);
490            let x3 = pop_f32(operands);
491            let y2 = pop_f32(operands);
492            let x2 = pop_f32(operands);
493            Operator::CurveToInitial { x2, y2, x3, y3 }
494        }
495        b"y" => {
496            let y3 = pop_f32(operands);
497            let x3 = pop_f32(operands);
498            let y1 = pop_f32(operands);
499            let x1 = pop_f32(operands);
500            Operator::CurveToFinal { x1, y1, x3, y3 }
501        }
502        b"h" => Operator::ClosePath,
503        b"re" => {
504            let h = pop_f32(operands);
505            let w = pop_f32(operands);
506            let y = pop_f32(operands);
507            let x = pop_f32(operands);
508            Operator::Rectangle { x, y, w, h }
509        }
510
511        // Path painting
512        b"S" => Operator::Stroke,
513        b"s" => Operator::CloseAndStroke,
514        b"f" => Operator::Fill,
515        b"F" => Operator::FillObsolete,
516        b"f*" => Operator::FillEvenOdd,
517        b"B" => Operator::FillStroke,
518        b"B*" => Operator::FillStrokeEvenOdd,
519        b"b" => Operator::CloseFillStroke,
520        b"b*" => Operator::CloseFillStrokeEvenOdd,
521        b"n" => Operator::EndPath,
522
523        // Clipping
524        b"W" => Operator::Clip,
525        b"W*" => Operator::ClipEvenOdd,
526
527        // Color
528        b"CS" => Operator::SetColorSpaceStroke {
529            name: pop_name(operands),
530        },
531        b"cs" => Operator::SetColorSpaceFill {
532            name: pop_name(operands),
533        },
534        b"SC" => Operator::SetColorStroke {
535            components: drain_f32(operands),
536        },
537        b"sc" => Operator::SetColorFill {
538            components: drain_f32(operands),
539        },
540        b"SCN" => {
541            let (components, name) = drain_f32_with_optional_name(operands);
542            Operator::SetColorStrokeN { components, name }
543        }
544        b"scn" => {
545            let (components, name) = drain_f32_with_optional_name(operands);
546            Operator::SetColorFillN { components, name }
547        }
548        b"G" => Operator::SetGrayStroke {
549            gray: pop_f32(operands),
550        },
551        b"g" => Operator::SetGrayFill {
552            gray: pop_f32(operands),
553        },
554        b"RG" => {
555            let b = pop_f32(operands);
556            let g = pop_f32(operands);
557            let r = pop_f32(operands);
558            Operator::SetRgbStroke { r, g, b }
559        }
560        b"rg" => {
561            let b = pop_f32(operands);
562            let g = pop_f32(operands);
563            let r = pop_f32(operands);
564            Operator::SetRgbFill { r, g, b }
565        }
566        b"K" => {
567            let k = pop_f32(operands);
568            let y = pop_f32(operands);
569            let m = pop_f32(operands);
570            let c = pop_f32(operands);
571            Operator::SetCmykStroke { c, m, y, k }
572        }
573        b"k" => {
574            let k = pop_f32(operands);
575            let y = pop_f32(operands);
576            let m = pop_f32(operands);
577            let c = pop_f32(operands);
578            Operator::SetCmykFill { c, m, y, k }
579        }
580
581        // XObject
582        b"Do" => Operator::PaintXObject {
583            name: pop_name(operands),
584        },
585
586        // Shading
587        b"sh" => Operator::PaintShading {
588            name: pop_name(operands),
589        },
590
591        // Inline image: BI ... ID <data> EI
592        b"BI" => {
593            let (properties, data) = read_inline_image(tok);
594            Operator::InlineImage { properties, data }
595        }
596
597        // Marked content
598        b"BMC" => Operator::BeginMarkedContent {
599            tag: pop_name(operands),
600        },
601        b"BDC" => {
602            let properties = operands.pop().unwrap_or(Operand::Null);
603            let tag = pop_name(operands);
604            Operator::BeginMarkedContentDict { tag, properties }
605        }
606        b"EMC" => Operator::EndMarkedContent,
607        b"MP" => Operator::MarkedContentPoint {
608            tag: pop_name(operands),
609        },
610        b"DP" => {
611            let properties = operands.pop().unwrap_or(Operand::Null);
612            let tag = pop_name(operands);
613            Operator::MarkedContentPointDict { tag, properties }
614        }
615
616        // Type 3 character width
617        b"d0" => {
618            let wy = pop_f32(operands);
619            let wx = pop_f32(operands);
620            Operator::SetCharWidth { wx, wy }
621        }
622        b"d1" => {
623            let ury = pop_f32(operands);
624            let urx = pop_f32(operands);
625            let lly = pop_f32(operands);
626            let llx = pop_f32(operands);
627            let wy = pop_f32(operands);
628            let wx = pop_f32(operands);
629            Operator::SetCacheDevice {
630                wx,
631                wy,
632                llx,
633                lly,
634                urx,
635                ury,
636            }
637        }
638
639        // Compatibility
640        b"BX" => Operator::BeginCompat,
641        b"EX" => Operator::EndCompat,
642
643        _ => {
644            let all_operands = std::mem::take(operands);
645            return Operator::Unknown {
646                keyword: keyword.to_vec(),
647                operands: all_operands,
648            };
649        }
650    };
651
652    operands.clear();
653    op
654}
655
656/// Expand an abbreviated inline-image key to its full name.
657fn expand_inline_key(abbr: &[u8]) -> Name {
658    match abbr {
659        b"BPC" => Name::from("BitsPerComponent"),
660        b"CS" => Name::from("ColorSpace"),
661        b"D" => Name::from("Decode"),
662        b"DP" => Name::from("DecodeParms"),
663        b"F" => Name::from("Filter"),
664        b"H" => Name::from("Height"),
665        b"IM" => Name::from("ImageMask"),
666        b"I" => Name::from("Interpolate"),
667        b"W" => Name::from("Width"),
668        b"L" => Name::from("Length"),
669        _ => Name::from_bytes(abbr.to_vec()),
670    }
671}
672
673/// Expand abbreviated inline-image color space or filter name values.
674fn expand_inline_name_value(name: &Name) -> Name {
675    match name.as_bytes() {
676        // Color spaces
677        b"G" => Name::from("DeviceGray"),
678        b"RGB" => Name::from("DeviceRGB"),
679        b"CMYK" => Name::from("DeviceCMYK"),
680        b"I" => Name::from("Indexed"),
681        // Filters
682        b"AHx" => Name::from("ASCIIHexDecode"),
683        b"A85" => Name::from("ASCII85Decode"),
684        b"LZW" => Name::from("LZWDecode"),
685        b"Fl" => Name::from("FlateDecode"),
686        b"RL" => Name::from("RunLengthDecode"),
687        b"CCF" => Name::from("CCITTFaxDecode"),
688        b"DCT" => Name::from("DCTDecode"),
689        _ => name.clone(),
690    }
691}
692
693/// Expand abbreviated names within an operand value (for CS and F values).
694fn expand_inline_value(operand: Operand) -> Operand {
695    match operand {
696        Operand::Name(n) => Operand::Name(expand_inline_name_value(&n)),
697        Operand::Array(arr) => Operand::Array(arr.into_iter().map(expand_inline_value).collect()),
698        other => other,
699    }
700}
701
702/// Read inline image properties and data between BI...ID...EI.
703fn read_inline_image(tok: &mut Tokenizer<'_>) -> (HashMap<Name, Operand>, Vec<u8>) {
704    let empty = (HashMap::new(), Vec::new());
705
706    // Parse key-value pairs until we hit "ID"
707    let mut properties = HashMap::new();
708    loop {
709        match tok.next_token() {
710            Some(Ok(Token::Keyword(ref kw))) if kw == b"ID" => break,
711            Some(Ok(Token::Name(key))) => {
712                let expanded_key = expand_inline_key(key.as_bytes());
713                // Read the value
714                let value = match tok.next_token() {
715                    Some(Ok(Token::Integer(n))) => Operand::Integer(n),
716                    Some(Ok(Token::Real(f))) => Operand::Real(f),
717                    Some(Ok(Token::Name(n))) => Operand::Name(n),
718                    Some(Ok(Token::String(s))) => Operand::String(s.as_bytes().to_vec()),
719                    Some(Ok(Token::Boolean(b))) => Operand::Boolean(b),
720                    Some(Ok(Token::Null)) => Operand::Null,
721                    Some(Ok(Token::ArrayStart)) => match read_operand_array(tok) {
722                        Ok(arr) => Operand::Array(arr),
723                        Err(_) => return empty,
724                    },
725                    Some(Ok(Token::Keyword(ref kw))) if kw == b"ID" => {
726                        // Value-less key right before ID; store Null and break
727                        properties.insert(expanded_key, Operand::Null);
728                        break;
729                    }
730                    _ => return empty,
731                };
732                let value = expand_inline_value(value);
733                properties.insert(expanded_key, value);
734            }
735            None => return empty,
736            _ => continue,
737        }
738    }
739
740    // After "ID", skip exactly 1 whitespace byte, then binary data begins
741    let source = tok.source();
742    let pos = tok.position();
743    if pos >= source.len() {
744        return (properties, Vec::new());
745    }
746    // Skip the single whitespace byte after ID
747    let data_start = pos + 1;
748
749    // Binary scan: find whitespace + "EI" + (whitespace|EOF|delimiter)
750    let mut i = data_start;
751    let data;
752    loop {
753        if i + 2 >= source.len() {
754            // Reached end without finding EI — take everything remaining
755            data = source[data_start..source.len()].to_vec();
756            tok.set_position(source.len());
757            return (properties, data);
758        }
759        // Look for whitespace byte followed by 'E' 'I'
760        if is_whitespace(source[i]) && source[i + 1] == b'E' && source[i + 2] == b'I' {
761            // Check that EI is followed by whitespace, delimiter, or EOF
762            let after_ei = i + 3;
763            if after_ei >= source.len()
764                || is_whitespace(source[after_ei])
765                || is_delimiter(source[after_ei])
766            {
767                data = source[data_start..i].to_vec();
768                tok.set_position(after_ei);
769                return (properties, data);
770            }
771        }
772        i += 1;
773    }
774}
775
776// --- Operand stack helper functions ---
777
778fn pop_f32(operands: &mut Vec<Operand>) -> f32 {
779    operands.pop().and_then(|op| op.as_f32()).unwrap_or(0.0)
780}
781
782fn pop_i64(operands: &mut Vec<Operand>) -> i64 {
783    operands.pop().and_then(|op| op.as_i64()).unwrap_or(0)
784}
785
786fn pop_name(operands: &mut Vec<Operand>) -> Name {
787    match operands.pop() {
788        Some(Operand::Name(n)) => n,
789        _ => Name::from_bytes(Vec::new()),
790    }
791}
792
793fn pop_bytes(operands: &mut Vec<Operand>) -> Vec<u8> {
794    match operands.pop() {
795        Some(Operand::String(b)) => b,
796        _ => Vec::new(),
797    }
798}
799
800fn pop_text_array(operands: &mut Vec<Operand>) -> Vec<TextArrayElement> {
801    match operands.pop() {
802        Some(Operand::Array(arr)) => arr
803            .into_iter()
804            .map(|op| match op {
805                Operand::String(b) => TextArrayElement::Text(b),
806                Operand::Integer(n) => TextArrayElement::Adjustment(n as f32),
807                Operand::Real(f) => TextArrayElement::Adjustment(f as f32),
808                _ => TextArrayElement::Adjustment(0.0),
809            })
810            .collect(),
811        _ => Vec::new(),
812    }
813}
814
815fn pop_f32_array(operands: &mut Vec<Operand>) -> Vec<f32> {
816    match operands.pop() {
817        Some(Operand::Array(arr)) => arr.iter().filter_map(|op| op.as_f32()).collect(),
818        _ => Vec::new(),
819    }
820}
821
822fn drain_f32(operands: &mut Vec<Operand>) -> Vec<f32> {
823    let result: Vec<f32> = operands.iter().filter_map(|op| op.as_f32()).collect();
824    operands.clear();
825    result
826}
827
828fn drain_f32_with_optional_name(operands: &mut Vec<Operand>) -> (Vec<f32>, Option<Name>) {
829    let mut name = None;
830    let mut components = Vec::new();
831
832    for op in operands.drain(..) {
833        match op {
834            Operand::Name(n) => name = Some(n),
835            Operand::Integer(n) => components.push(n as f32),
836            Operand::Real(f) => components.push(f as f32),
837            _ => {}
838        }
839    }
840
841    (components, name)
842}
843
844#[cfg(test)]
845mod tests {
846    use super::*;
847
848    #[test]
849    fn test_tokenize_simple_text() {
850        let data = b"BT /F1 12 Tf 100 700 Td (Hello World) Tj ET";
851        let ops = tokenize_content_stream(data).unwrap();
852
853        assert!(matches!(ops[0], Operator::BeginText));
854        assert!(matches!(ops[1], Operator::SetFont { .. }));
855        assert!(matches!(ops[2], Operator::MoveText { .. }));
856        assert!(matches!(ops[3], Operator::ShowText { .. }));
857        assert!(matches!(ops[4], Operator::EndText));
858    }
859
860    #[test]
861    fn test_tokenize_path_operators() {
862        let data = b"100 200 300 400 re f S";
863        let ops = tokenize_content_stream(data).unwrap();
864
865        match &ops[0] {
866            Operator::Rectangle { x, y, w, h } => {
867                assert_eq!(*x, 100.0);
868                assert_eq!(*y, 200.0);
869                assert_eq!(*w, 300.0);
870                assert_eq!(*h, 400.0);
871            }
872            _ => panic!("expected Rectangle"),
873        }
874        assert!(matches!(ops[1], Operator::Fill));
875        assert!(matches!(ops[2], Operator::Stroke));
876    }
877
878    #[test]
879    fn test_tokenize_graphics_state() {
880        let data = b"q 1 0 0 1 100 200 cm Q";
881        let ops = tokenize_content_stream(data).unwrap();
882
883        assert!(matches!(ops[0], Operator::SaveState));
884        assert!(matches!(ops[1], Operator::ConcatMatrix { .. }));
885        assert!(matches!(ops[2], Operator::RestoreState));
886    }
887
888    #[test]
889    fn test_tokenize_color_operators() {
890        let data = b"1 0 0 rg 0.5 G";
891        let ops = tokenize_content_stream(data).unwrap();
892
893        match &ops[0] {
894            Operator::SetRgbFill { r, g, b } => {
895                assert_eq!(*r, 1.0);
896                assert_eq!(*g, 0.0);
897                assert_eq!(*b, 0.0);
898            }
899            _ => panic!("expected SetRgbFill"),
900        }
901        match &ops[1] {
902            Operator::SetGrayStroke { gray } => {
903                assert_eq!(*gray, 0.5);
904            }
905            _ => panic!("expected SetGrayStroke"),
906        }
907    }
908
909    #[test]
910    fn test_tokenize_text_array() {
911        let data = b"[(Hello) -50 (World)] TJ";
912        let ops = tokenize_content_stream(data).unwrap();
913
914        match &ops[0] {
915            Operator::ShowTextArray { elements } => {
916                assert_eq!(elements.len(), 3);
917                assert!(matches!(&elements[0], TextArrayElement::Text(b) if b == b"Hello"));
918                assert!(matches!(
919                    &elements[1],
920                    TextArrayElement::Adjustment(a) if *a == -50.0
921                ));
922                assert!(matches!(&elements[2], TextArrayElement::Text(b) if b == b"World"));
923            }
924            _ => panic!("expected ShowTextArray"),
925        }
926    }
927
928    #[test]
929    fn test_tokenize_marked_content() {
930        let data = b"/OC BMC (Hello) Tj EMC";
931        let ops = tokenize_content_stream(data).unwrap();
932
933        assert!(matches!(ops[0], Operator::BeginMarkedContent { .. }));
934        assert!(matches!(ops[1], Operator::ShowText { .. }));
935        assert!(matches!(ops[2], Operator::EndMarkedContent));
936    }
937
938    #[test]
939    fn test_tokenize_xobject() {
940        let data = b"/Im0 Do";
941        let ops = tokenize_content_stream(data).unwrap();
942
943        match &ops[0] {
944            Operator::PaintXObject { name } => {
945                assert_eq!(name.as_bytes(), b"Im0");
946            }
947            _ => panic!("expected PaintXObject"),
948        }
949    }
950
951    #[test]
952    fn test_tokenize_unknown_operator() {
953        let data = b"42 ZZ";
954        let ops = tokenize_content_stream(data).unwrap();
955
956        match &ops[0] {
957            Operator::Unknown { keyword, operands } => {
958                assert_eq!(keyword, b"ZZ");
959                assert_eq!(operands.len(), 1);
960            }
961            _ => panic!("expected Unknown operator"),
962        }
963    }
964
965    #[test]
966    fn test_tokenize_empty_stream() {
967        let ops = tokenize_content_stream(b"").unwrap();
968        assert!(ops.is_empty());
969    }
970
971    #[test]
972    #[allow(clippy::approx_constant)]
973    fn test_operand_as_f32() {
974        assert_eq!(Operand::Integer(42).as_f32(), Some(42.0));
975        assert_eq!(Operand::Real(3.14).as_f32(), Some(3.14));
976        assert_eq!(Operand::Null.as_f32(), None);
977    }
978
979    #[test]
980    fn test_operand_as_i64() {
981        assert_eq!(Operand::Integer(42).as_i64(), Some(42));
982        assert_eq!(Operand::Null.as_i64(), None);
983    }
984
985    #[test]
986    fn test_set_font_operator() {
987        let data = b"BT /F1 12 Tf ET";
988        let ops = tokenize_content_stream(data).unwrap();
989        match &ops[1] {
990            Operator::SetFont { name, size } => {
991                assert_eq!(name.as_bytes(), b"F1");
992                assert_eq!(*size, 12.0);
993            }
994            _ => panic!("expected SetFont"),
995        }
996    }
997
998    #[test]
999    fn test_set_dash_pattern() {
1000        let data = b"[3 5] 0 d";
1001        let ops = tokenize_content_stream(data).unwrap();
1002        match &ops[0] {
1003            Operator::SetDashPattern { array, phase } => {
1004                assert_eq!(array, &[3.0, 5.0]);
1005                assert_eq!(*phase, 0.0);
1006            }
1007            _ => panic!("expected SetDashPattern"),
1008        }
1009    }
1010
1011    #[test]
1012    fn test_inline_image_basic() {
1013        // BI /W 10 /H 10 /BPC 8 /CS /G ID <10 bytes of data> EI
1014        let mut data = Vec::new();
1015        data.extend_from_slice(b"BI /W 10 /H 10 /BPC 8 /CS /G ID ");
1016        // 10 bytes of image data
1017        data.extend_from_slice(&[0xFF; 10]);
1018        data.extend_from_slice(b" EI");
1019        let ops = tokenize_content_stream(&data).unwrap();
1020        assert_eq!(ops.len(), 1);
1021        match &ops[0] {
1022            Operator::InlineImage { properties, data } => {
1023                assert_eq!(
1024                    properties.get(&Name::from("Width")),
1025                    Some(&Operand::Integer(10))
1026                );
1027                assert_eq!(
1028                    properties.get(&Name::from("Height")),
1029                    Some(&Operand::Integer(10))
1030                );
1031                assert_eq!(
1032                    properties.get(&Name::from("BitsPerComponent")),
1033                    Some(&Operand::Integer(8))
1034                );
1035                assert_eq!(
1036                    properties.get(&Name::from("ColorSpace")),
1037                    Some(&Operand::Name(Name::from("DeviceGray")))
1038                );
1039                assert_eq!(data.len(), 10);
1040                assert!(data.iter().all(|&b| b == 0xFF));
1041            }
1042            _ => panic!("expected InlineImage"),
1043        }
1044    }
1045
1046    #[test]
1047    fn test_inline_image_abbreviation_expansion() {
1048        let mut data = Vec::new();
1049        data.extend_from_slice(b"BI /W 4 /H 4 /CS /RGB /F /Fl ID ");
1050        data.extend_from_slice(&[0xAA; 4]);
1051        data.extend_from_slice(b" EI");
1052        let ops = tokenize_content_stream(&data).unwrap();
1053        match &ops[0] {
1054            Operator::InlineImage { properties, .. } => {
1055                assert_eq!(
1056                    properties.get(&Name::from("ColorSpace")),
1057                    Some(&Operand::Name(Name::from("DeviceRGB")))
1058                );
1059                assert_eq!(
1060                    properties.get(&Name::from("Filter")),
1061                    Some(&Operand::Name(Name::from("FlateDecode")))
1062                );
1063            }
1064            _ => panic!("expected InlineImage"),
1065        }
1066    }
1067
1068    #[test]
1069    fn test_inline_image_binary_data_extraction() {
1070        // Data containing bytes that look like EI but aren't terminated properly
1071        let mut data = Vec::new();
1072        data.extend_from_slice(b"BI /W 2 /H 2 /BPC 8 /CS /G ID ");
1073        // Binary data includes "EI" in middle but not at a valid boundary
1074        data.extend_from_slice(&[0x45, 0x49, 0x45, 0x49]); // "EIEI"
1075        data.extend_from_slice(b" EI");
1076        let ops = tokenize_content_stream(&data).unwrap();
1077        match &ops[0] {
1078            Operator::InlineImage { data, .. } => {
1079                assert_eq!(data, &[0x45, 0x49, 0x45, 0x49]);
1080            }
1081            _ => panic!("expected InlineImage"),
1082        }
1083    }
1084
1085    #[test]
1086    fn test_inline_image_tokenizer_resumes_after_ei() {
1087        let mut data = Vec::new();
1088        data.extend_from_slice(b"BI /W 1 /H 1 /BPC 8 /CS /G ID ");
1089        data.extend_from_slice(&[0xAB]);
1090        data.extend_from_slice(b"\nEI\n");
1091        data.extend_from_slice(b"100 200 m");
1092        let ops = tokenize_content_stream(&data).unwrap();
1093        assert_eq!(ops.len(), 2);
1094        assert!(matches!(&ops[0], Operator::InlineImage { .. }));
1095        assert!(matches!(&ops[1], Operator::MoveTo { x, y } if *x == 100.0 && *y == 200.0));
1096    }
1097
1098    #[test]
1099    fn test_tokenize_paint_shading() {
1100        let data = b"/Sh0 sh";
1101        let ops = tokenize_content_stream(data).unwrap();
1102        assert_eq!(ops.len(), 1);
1103        match &ops[0] {
1104            Operator::PaintShading { name } => {
1105                assert_eq!(name.as_bytes(), b"Sh0");
1106            }
1107            _ => panic!("expected PaintShading"),
1108        }
1109    }
1110
1111    #[test]
1112    fn test_tokenize_d0_operator() {
1113        let data = b"500 0 d0";
1114        let ops = tokenize_content_stream(data).unwrap();
1115        assert_eq!(ops.len(), 1);
1116        match &ops[0] {
1117            Operator::SetCharWidth { wx, wy } => {
1118                assert_eq!(*wx, 500.0);
1119                assert_eq!(*wy, 0.0);
1120            }
1121            _ => panic!("expected SetCharWidth"),
1122        }
1123    }
1124
1125    #[test]
1126    fn test_tokenize_d1_operator() {
1127        let data = b"500 0 10 -20 400 700 d1";
1128        let ops = tokenize_content_stream(data).unwrap();
1129        assert_eq!(ops.len(), 1);
1130        match &ops[0] {
1131            Operator::SetCacheDevice {
1132                wx,
1133                wy,
1134                llx,
1135                lly,
1136                urx,
1137                ury,
1138            } => {
1139                assert_eq!(*wx, 500.0);
1140                assert_eq!(*wy, 0.0);
1141                assert_eq!(*llx, 10.0);
1142                assert_eq!(*lly, -20.0);
1143                assert_eq!(*urx, 400.0);
1144                assert_eq!(*ury, 700.0);
1145            }
1146            _ => panic!("expected SetCacheDevice"),
1147        }
1148    }
1149
1150    #[test]
1151    fn test_d0_in_char_proc_stream() {
1152        let data = b"500 0 d0 100 200 m 300 400 l S";
1153        let ops = tokenize_content_stream(data).unwrap();
1154        assert_eq!(ops.len(), 4);
1155        assert!(matches!(ops[0], Operator::SetCharWidth { .. }));
1156        assert!(matches!(ops[1], Operator::MoveTo { .. }));
1157        assert!(matches!(ops[2], Operator::LineTo { .. }));
1158        assert!(matches!(ops[3], Operator::Stroke));
1159    }
1160
1161    // ===================================================================
1162    // Upstream-ported tests
1163    // ===================================================================
1164
1165    /// Upstream: TEST(CPDFStreamContentParserTest, PDFFindKeyAbbreviation)
1166    ///
1167    /// Inline image key abbreviation lookup: BPC->BitsPerComponent, W->Width, etc.
1168    #[test]
1169    fn test_find_key_abbreviation() {
1170        assert_eq!(expand_inline_key(b"BPC"), Name::from("BitsPerComponent"));
1171        assert_eq!(expand_inline_key(b"W"), Name::from("Width"));
1172        assert_eq!(expand_inline_key(b"H"), Name::from("Height"));
1173        assert_eq!(expand_inline_key(b"CS"), Name::from("ColorSpace"));
1174        assert_eq!(expand_inline_key(b"F"), Name::from("Filter"));
1175        assert_eq!(expand_inline_key(b"D"), Name::from("Decode"));
1176        assert_eq!(expand_inline_key(b"DP"), Name::from("DecodeParms"));
1177        assert_eq!(expand_inline_key(b"IM"), Name::from("ImageMask"));
1178        assert_eq!(expand_inline_key(b"I"), Name::from("Interpolate"));
1179        assert_eq!(expand_inline_key(b"L"), Name::from("Length"));
1180
1181        // Empty and not-in-list should return the input unchanged
1182        assert_eq!(expand_inline_key(b""), Name::from(""));
1183        assert_eq!(expand_inline_key(b"NoInList"), Name::from("NoInList"));
1184
1185        // Prefix should not match
1186        assert_eq!(expand_inline_key(b"WW"), Name::from("WW"));
1187    }
1188
1189    /// Upstream: TEST(CPDFStreamContentParserTest, PDFFindValueAbbreviation)
1190    ///
1191    /// Inline image value abbreviation lookup: G->DeviceGray, DCT->DCTDecode, etc.
1192    #[test]
1193    fn test_find_value_abbreviation() {
1194        assert_eq!(
1195            expand_inline_name_value(&Name::from("G")),
1196            Name::from("DeviceGray")
1197        );
1198        assert_eq!(
1199            expand_inline_name_value(&Name::from("RGB")),
1200            Name::from("DeviceRGB")
1201        );
1202        assert_eq!(
1203            expand_inline_name_value(&Name::from("CMYK")),
1204            Name::from("DeviceCMYK")
1205        );
1206        assert_eq!(
1207            expand_inline_name_value(&Name::from("DCT")),
1208            Name::from("DCTDecode")
1209        );
1210        assert_eq!(
1211            expand_inline_name_value(&Name::from("Fl")),
1212            Name::from("FlateDecode")
1213        );
1214        assert_eq!(
1215            expand_inline_name_value(&Name::from("AHx")),
1216            Name::from("ASCIIHexDecode")
1217        );
1218        assert_eq!(
1219            expand_inline_name_value(&Name::from("A85")),
1220            Name::from("ASCII85Decode")
1221        );
1222        assert_eq!(
1223            expand_inline_name_value(&Name::from("LZW")),
1224            Name::from("LZWDecode")
1225        );
1226        assert_eq!(
1227            expand_inline_name_value(&Name::from("RL")),
1228            Name::from("RunLengthDecode")
1229        );
1230        assert_eq!(
1231            expand_inline_name_value(&Name::from("CCF")),
1232            Name::from("CCITTFaxDecode")
1233        );
1234        assert_eq!(
1235            expand_inline_name_value(&Name::from("I")),
1236            Name::from("Indexed")
1237        );
1238
1239        // Empty and not-in-list should return input unchanged
1240        assert_eq!(expand_inline_name_value(&Name::from("")), Name::from(""));
1241        assert_eq!(
1242            expand_inline_name_value(&Name::from("NoInList")),
1243            Name::from("NoInList")
1244        );
1245
1246        // Prefix should not match
1247        assert_eq!(
1248            expand_inline_name_value(&Name::from("II")),
1249            Name::from("II")
1250        );
1251    }
1252}