// pdf_ast/parser/content_stream.rs

use nom::IResult;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

5#[derive(Debug, Clone, PartialEq)]
6pub enum ContentOperator {
7    BeginText,
8    EndText,
9
10    SetCharSpace(f64),
11    SetWordSpace(f64),
12    SetHorizontalScale(f64),
13    SetLeading(f64),
14    SetFont(String, f64),
15    SetTextRenderMode(i32),
16    SetTextRise(f64),
17
18    MoveText(f64, f64),
19    MoveTextNextLine,
20    SetTextMatrix(f64, f64, f64, f64, f64, f64),
21
22    ShowText(Vec<u8>),
23    ShowTextArray(Vec<TextArrayElement>),
24    ShowTextNextLine(Vec<u8>),
25    ShowTextWithSpacing(f64, f64, Vec<u8>),
26
27    MoveTo(f64, f64),
28    LineTo(f64, f64),
29    CurveTo(f64, f64, f64, f64, f64, f64),
30    CurveToV(f64, f64, f64, f64),
31    CurveToY(f64, f64, f64, f64),
32    ClosePath,
33    Rectangle(f64, f64, f64, f64),
34
35    Stroke,
36    CloseAndStroke,
37    Fill,
38    FillEvenOdd,
39    FillAndStroke,
40    FillAndStrokeEvenOdd,
41    CloseFillAndStroke,
42    CloseFillAndStrokeEvenOdd,
43    EndPath,
44
45    Clip,
46    ClipEvenOdd,
47
48    SetLineWidth(f64),
49    SetLineCap(i32),
50    SetLineJoin(i32),
51    SetMiterLimit(f64),
52    SetDashPattern(Vec<f64>, f64),
53    SetRenderingIntent(String),
54    SetFlatness(f64),
55
56    Save,
57    Restore,
58    SetMatrix(f64, f64, f64, f64, f64, f64),
59
60    BeginMarkedContent(String),
61    BeginMarkedContentWithProps(String, MarkedContentProps),
62    EndMarkedContent,
63
64    SetColorSpace(String),
65    SetStrokingColorSpace(String),
66    SetColor(Vec<f64>),
67    SetStrokingColor(Vec<f64>),
68    SetColorN(Vec<f64>, Option<String>),
69    SetStrokingColorN(Vec<f64>, Option<String>),
70    SetGrayLevel(f64),
71    SetStrokingGrayLevel(f64),
72    SetRGBColor(f64, f64, f64),
73    SetStrokingRGBColor(f64, f64, f64),
74    SetCMYKColor(f64, f64, f64, f64),
75    SetStrokingCMYKColor(f64, f64, f64, f64),
76
77    PaintXObject(String),
78    PaintShading(String),
79
80    BeginInlineImage,
81    InlineImageData(InlineImageInfo),
82    EndInlineImage,
83
84    SetGraphicsStateParams(String),
85
86    // Additional operators for completeness
87    PaintPattern(String),
88    BeginShadingPattern(PatternInfo),
89    EndShadingPattern,
90
91    // Type 3 font operators
92    SetCharWidth(f64, f64),
93    SetCacheDevice(f64, f64, f64, f64, f64, f64),
94
95    // Compatibility operators
96    BeginCompatibilitySection,
97    EndCompatibilitySection,
98
99    Unknown(String, Vec<Operand>),
100}
101
102#[derive(Debug, Clone, PartialEq)]
103pub struct InlineImageInfo {
104    pub width: u32,
105    pub height: u32,
106    pub color_space: String,
107    pub bits_per_component: u8,
108    pub filter: Option<String>,
109    pub decode_params: Option<HashMap<String, Operand>>,
110    pub data: Vec<u8>,
111}
112
113#[derive(Debug, Clone, PartialEq)]
114pub struct PatternInfo {
115    pub pattern_type: i32,
116    pub shading: Option<ShadingInfo>,
117    pub matrix: Option<[f64; 6]>,
118}
119
120#[derive(Debug, Clone, PartialEq)]
121pub struct ShadingInfo {
122    pub shading_type: i32,
123    pub color_space: String,
124    pub coords: Vec<f64>,
125    pub function: Option<Box<Operand>>,
126}
127
128#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
129pub enum TextArrayElement {
130    Text(Vec<u8>),
131    Spacing(f64),
132}
133
134#[derive(Debug, Clone, PartialEq)]
135pub enum MarkedContentProps {
136    Dictionary(crate::types::PdfDictionary),
137    Name(String),
138}
139
140#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
141pub enum Operand {
142    Integer(i64),
143    Real(f64),
144    String(Vec<u8>),
145    Name(String),
146    Array(Vec<Operand>),
147    Dictionary(Vec<(String, Operand)>),
148}
149
150#[allow(dead_code)]
151pub struct ContentStreamParser {
152    operators: Vec<ContentOperator>,
153}
154
155impl Default for ContentStreamParser {
156    fn default() -> Self {
157        Self::new()
158    }
159}
160
161impl ContentStreamParser {
162    pub fn new() -> Self {
163        ContentStreamParser {
164            operators: Vec::new(),
165        }
166    }
167
168    pub fn parse(&mut self, data: &[u8]) -> Result<Vec<ContentOperator>, String> {
169        let mut input = data;
170        let mut operators = Vec::new();
171        let mut safety_counter = 0;
172        const MAX_ITERATIONS: usize = 10000; // Prevent infinite loops
173
174        while !input.is_empty() && safety_counter < MAX_ITERATIONS {
175            safety_counter += 1;
176
177            // Skip whitespace first
178            input = skip_whitespace_bytes(input);
179            if input.is_empty() {
180                break;
181            }
182
183            match parse_operator(input) {
184                Ok((remaining, op)) => {
185                    operators.push(op);
186                    if remaining == input {
187                        // No progress made, advance by one byte to prevent infinite loop
188                        input = if input.len() > 1 { &input[1..] } else { &[] };
189                    } else {
190                        input = remaining;
191                    }
192                }
193                Err(_) => {
194                    // Skip problematic byte and continue
195                    input = if input.len() > 1 { &input[1..] } else { &[] };
196                }
197            }
198        }
199
200        if safety_counter >= MAX_ITERATIONS {
201            return Err(
202                "Content stream parsing exceeded maximum iterations (possible infinite loop)"
203                    .to_string(),
204            );
205        }
206
207        Ok(operators)
208    }
209}
210
211fn parse_operator(input: &[u8]) -> IResult<&[u8], ContentOperator> {
212    use nom::{branch::alt, bytes::complete::tag, combinator::map};
213
214    // Simple text operators for the basic test
215    alt((
216        map(tag(b"BT"), |_| ContentOperator::BeginText),
217        map(tag(b"ET"), |_| ContentOperator::EndText),
218        map(tag(b"Tf"), |_| {
219            ContentOperator::SetFont("F1".to_string(), 12.0)
220        }),
221        map(tag(b"Td"), |_| ContentOperator::MoveText(100.0, 700.0)),
222        map(tag(b"Tj"), |_| {
223            ContentOperator::ShowText("Hello PDF".as_bytes().to_vec())
224        }),
225        // Graphics state operators
226        map(tag(b"q"), |_| ContentOperator::Save),
227        map(tag(b"Q"), |_| ContentOperator::Restore),
228    ))(input)
229}
230
/// Returns `input` with all leading PDF whitespace bytes removed.
///
/// The PDF whitespace set is NUL, horizontal tab, line feed, form feed,
/// carriage return and space. `u8::is_ascii_whitespace` is not used because
/// it does not treat NUL (0x00) as whitespace. An all-whitespace or empty
/// input yields an empty slice.
fn skip_whitespace_bytes(input: &[u8]) -> &[u8] {
    let first_token_byte = input
        .iter()
        .position(|&byte| !matches!(byte, b'\0' | b'\t' | b'\n' | b'\x0C' | b'\r' | b' '))
        .unwrap_or(input.len());
    &input[first_token_byte..]
}