Skip to main content

folio_content/
parser.rs

1//! Content stream parser — converts raw PDF content stream bytes into ContentOp sequence.
2
3use crate::ops::{ContentOp, TextOp};
4use folio_core::{Matrix2D, Result};
5use folio_cos::PdfObject;
6use folio_cos::parser::parse_object;
7use folio_cos::tokenizer::{Token, Tokenizer};
8
9/// Parse a content stream into a sequence of operations.
10pub fn parse_content_stream(data: &[u8]) -> Result<Vec<ContentOp>> {
11    let mut tokenizer = Tokenizer::new_at(data, 0);
12    let mut ops = Vec::new();
13    let mut operand_stack: Vec<PdfObject> = Vec::new();
14
15    loop {
16        tokenizer.skip_whitespace_and_comments();
17        if tokenizer.is_eof() {
18            break;
19        }
20
21        // Check for inline image (BI keyword)
22        let pos = tokenizer.pos();
23        if pos + 2 <= data.len() && &data[pos..pos + 2] == b"BI" {
24            // Check it's actually the keyword (followed by whitespace)
25            if pos + 2 >= data.len() || is_whitespace_or_delimiter(data[pos + 2]) {
26                tokenizer.set_pos(pos + 2);
27                let op = parse_inline_image(&mut tokenizer)?;
28                ops.push(op);
29                operand_stack.clear();
30                continue;
31            }
32        }
33
34        let token = match tokenizer.next_token()? {
35            Some(t) => t,
36            None => break,
37        };
38
39        match token {
40            Token::Integer(_)
41            | Token::Real(_)
42            | Token::LiteralString(_)
43            | Token::HexString(_)
44            | Token::Name(_)
45            | Token::ArrayBegin => {
46                // It's an operand — push onto stack
47                tokenizer.set_pos(pos);
48                match parse_object(&mut tokenizer)? {
49                    Some(obj) => operand_stack.push(obj),
50                    None => {}
51                }
52            }
53            Token::Keyword(ref kw) => {
54                let op = build_op(kw, &operand_stack);
55                ops.push(op);
56                operand_stack.clear();
57            }
58            Token::DictBegin => {
59                // Dict as operand (used in BDC, DP)
60                tokenizer.set_pos(pos);
61                if let Some(obj) = parse_object(&mut tokenizer)? {
62                    operand_stack.push(obj);
63                }
64            }
65            _ => {
66                // Unexpected token — skip
67            }
68        }
69    }
70
71    Ok(ops)
72}
73
74fn is_whitespace_or_delimiter(b: u8) -> bool {
75    folio_cos::tokenizer::is_whitespace(b) || folio_cos::tokenizer::is_delimiter(b)
76}
77
78/// Build a ContentOp from an operator keyword and its operands.
79fn build_op(operator: &[u8], operands: &[PdfObject]) -> ContentOp {
80    match operator {
81        // Graphics state
82        b"q" => ContentOp::SaveState,
83        b"Q" => ContentOp::RestoreState,
84        b"cm" if operands.len() >= 6 => ContentOp::ConcatMatrix(Matrix2D::new(
85            f(operands, 0),
86            f(operands, 1),
87            f(operands, 2),
88            f(operands, 3),
89            f(operands, 4),
90            f(operands, 5),
91        )),
92        b"w" => ContentOp::SetLineWidth(f(operands, 0)),
93        b"J" => ContentOp::SetLineCap(i(operands, 0)),
94        b"j" => ContentOp::SetLineJoin(i(operands, 0)),
95        b"M" => ContentOp::SetMiterLimit(f(operands, 0)),
96        b"d" => {
97            let arr = operands
98                .first()
99                .and_then(|o| o.as_array())
100                .map(|a| a.iter().filter_map(|v| v.as_f64()).collect())
101                .unwrap_or_default();
102            let phase = f(operands, 1);
103            ContentOp::SetDashPattern(arr, phase)
104        }
105        b"ri" => ContentOp::SetRenderingIntent(n(operands, 0)),
106        b"i" => ContentOp::SetFlatness(f(operands, 0)),
107        b"gs" => ContentOp::SetExtGState(n(operands, 0)),
108
109        // Path construction
110        b"m" => ContentOp::MoveTo(f(operands, 0), f(operands, 1)),
111        b"l" => ContentOp::LineTo(f(operands, 0), f(operands, 1)),
112        b"c" => ContentOp::CurveTo(
113            f(operands, 0),
114            f(operands, 1),
115            f(operands, 2),
116            f(operands, 3),
117            f(operands, 4),
118            f(operands, 5),
119        ),
120        b"v" => ContentOp::CurveToInitial(
121            f(operands, 0),
122            f(operands, 1),
123            f(operands, 2),
124            f(operands, 3),
125        ),
126        b"y" => ContentOp::CurveToFinal(
127            f(operands, 0),
128            f(operands, 1),
129            f(operands, 2),
130            f(operands, 3),
131        ),
132        b"h" => ContentOp::ClosePath,
133        b"re" => ContentOp::Rectangle(
134            f(operands, 0),
135            f(operands, 1),
136            f(operands, 2),
137            f(operands, 3),
138        ),
139
140        // Path painting
141        b"S" => ContentOp::Stroke,
142        b"s" => ContentOp::CloseAndStroke,
143        b"f" | b"F" => ContentOp::Fill,
144        b"f*" => ContentOp::FillEvenOdd,
145        b"B" => ContentOp::FillAndStroke,
146        b"B*" => ContentOp::FillAndStrokeEvenOdd,
147        b"b" => ContentOp::CloseFillAndStroke,
148        b"b*" => ContentOp::CloseFillAndStrokeEvenOdd,
149        b"n" => ContentOp::EndPath,
150
151        // Clipping
152        b"W" => ContentOp::Clip,
153        b"W*" => ContentOp::ClipEvenOdd,
154
155        // Text
156        b"BT" => ContentOp::BeginText,
157        b"ET" => ContentOp::EndText,
158        b"Tc" => ContentOp::SetCharSpacing(f(operands, 0)),
159        b"Tw" => ContentOp::SetWordSpacing(f(operands, 0)),
160        b"Tz" => ContentOp::SetHorizScaling(f(operands, 0)),
161        b"TL" => ContentOp::SetTextLeading(f(operands, 0)),
162        b"Tf" => ContentOp::SetFont(n(operands, 0), f(operands, 1)),
163        b"Tr" => ContentOp::SetTextRenderMode(i(operands, 0)),
164        b"Ts" => ContentOp::SetTextRise(f(operands, 0)),
165        b"Td" => ContentOp::MoveTextPos(f(operands, 0), f(operands, 1)),
166        b"TD" => ContentOp::MoveTextPosSetLeading(f(operands, 0), f(operands, 1)),
167        b"Tm" if operands.len() >= 6 => ContentOp::SetTextMatrix(Matrix2D::new(
168            f(operands, 0),
169            f(operands, 1),
170            f(operands, 2),
171            f(operands, 3),
172            f(operands, 4),
173            f(operands, 5),
174        )),
175        b"T*" => ContentOp::NextLine,
176        b"Tj" => ContentOp::ShowText(s(operands, 0)),
177        b"TJ" => {
178            let items = operands
179                .first()
180                .and_then(|o| o.as_array())
181                .map(|arr| {
182                    arr.iter()
183                        .map(|item| match item {
184                            PdfObject::Str(s) => TextOp::Text(s.clone()),
185                            PdfObject::Integer(n) => TextOp::Adjustment(*n as f64),
186                            PdfObject::Real(n) => TextOp::Adjustment(*n),
187                            _ => TextOp::Adjustment(0.0),
188                        })
189                        .collect()
190                })
191                .unwrap_or_default();
192            ContentOp::ShowTextAdjusted(items)
193        }
194        b"'" => ContentOp::NextLineShowText(s(operands, 0)),
195        b"\"" => {
196            ContentOp::SetSpacingNextLineShowText(f(operands, 0), f(operands, 1), s(operands, 2))
197        }
198
199        // Color
200        b"CS" => ContentOp::SetStrokeColorSpace(n(operands, 0)),
201        b"cs" => ContentOp::SetFillColorSpace(n(operands, 0)),
202        b"SC" | b"SCN" => {
203            ContentOp::SetStrokeColor(operands.iter().filter_map(|o| o.as_f64()).collect())
204        }
205        b"sc" | b"scn" => {
206            ContentOp::SetFillColor(operands.iter().filter_map(|o| o.as_f64()).collect())
207        }
208        b"G" => ContentOp::SetStrokeGray(f(operands, 0)),
209        b"g" => ContentOp::SetFillGray(f(operands, 0)),
210        b"RG" => ContentOp::SetStrokeRGB(f(operands, 0), f(operands, 1), f(operands, 2)),
211        b"rg" => ContentOp::SetFillRGB(f(operands, 0), f(operands, 1), f(operands, 2)),
212        b"K" => ContentOp::SetStrokeCMYK(
213            f(operands, 0),
214            f(operands, 1),
215            f(operands, 2),
216            f(operands, 3),
217        ),
218        b"k" => ContentOp::SetFillCMYK(
219            f(operands, 0),
220            f(operands, 1),
221            f(operands, 2),
222            f(operands, 3),
223        ),
224
225        // XObject / Shading
226        b"Do" => ContentOp::PaintXObject(n(operands, 0)),
227        b"sh" => ContentOp::PaintShading(n(operands, 0)),
228
229        // Marked content
230        b"MP" => ContentOp::MarkedContentPoint(n(operands, 0)),
231        b"DP" => ContentOp::MarkedContentPointProperties(
232            n(operands, 0),
233            operands.get(1).cloned().unwrap_or(PdfObject::Null),
234        ),
235        b"BMC" => ContentOp::BeginMarkedContent(n(operands, 0)),
236        b"BDC" => ContentOp::BeginMarkedContentProperties(
237            n(operands, 0),
238            operands.get(1).cloned().unwrap_or(PdfObject::Null),
239        ),
240        b"EMC" => ContentOp::EndMarkedContent,
241
242        // Compatibility
243        b"BX" => ContentOp::BeginCompat,
244        b"EX" => ContentOp::EndCompat,
245
246        // Unknown
247        _ => ContentOp::Unknown(operator.to_vec(), operands.to_vec()),
248    }
249}
250
251/// Parse an inline image (after BI keyword has been consumed).
252fn parse_inline_image(tokenizer: &mut Tokenizer) -> Result<ContentOp> {
253    tokenizer.skip_whitespace_and_comments();
254
255    // Parse key-value pairs until ID keyword
256    let mut dict = Vec::new();
257    loop {
258        tokenizer.skip_whitespace_and_comments();
259        if tokenizer.is_eof() {
260            break;
261        }
262
263        // Check for ID keyword
264        let pos = tokenizer.pos();
265        let data = tokenizer.data();
266        if pos + 2 <= data.len() && &data[pos..pos + 2] == b"ID" {
267            tokenizer.set_pos(pos + 2);
268            // Skip single whitespace byte after ID
269            if !tokenizer.is_eof() {
270                tokenizer.set_pos(tokenizer.pos() + 1);
271            }
272            break;
273        }
274
275        match tokenizer.next_token()? {
276            Some(Token::Name(key)) => {
277                // Expand abbreviated key names
278                let full_key = expand_inline_image_key(&key);
279                match parse_object(tokenizer)? {
280                    Some(val) => dict.push((full_key, val)),
281                    None => break,
282                }
283            }
284            _ => break,
285        }
286    }
287
288    // Read image data until EI
289    let start = tokenizer.pos();
290    let data = tokenizer.data();
291    let mut end = start;
292
293    // Search for EI preceded by whitespace
294    while end < data.len() {
295        if end + 2 < data.len()
296            && data[end] == b'E'
297            && data[end + 1] == b'I'
298            && (end == start || is_whitespace_byte(data[end - 1]))
299            && (end + 2 >= data.len() || is_whitespace_or_delimiter(data[end + 2]))
300        {
301            break;
302        }
303        end += 1;
304    }
305
306    // Trim trailing whitespace from image data
307    let mut img_end = end;
308    while img_end > start && is_whitespace_byte(data[img_end - 1]) {
309        img_end -= 1;
310    }
311
312    let image_data = data[start..img_end].to_vec();
313    tokenizer.set_pos(end + 2); // Skip past EI
314
315    Ok(ContentOp::InlineImage {
316        dict,
317        data: image_data,
318    })
319}
320
/// Returns `true` for the six bytes treated as whitespace when scanning
/// inline image data: NUL, tab, line feed, form feed, carriage return, space.
fn is_whitespace_byte(b: u8) -> bool {
    const WHITESPACE: [u8; 6] = [0x00, b'\t', b'\n', 0x0c, b'\r', b' '];
    WHITESPACE.contains(&b)
}
324
/// Expand an abbreviated inline image dictionary key to its full name.
///
/// Keys that are not one of the known abbreviations (including keys that are
/// already spelled out) are returned unchanged. Matching is case-sensitive.
fn expand_inline_image_key(key: &[u8]) -> Vec<u8> {
    let full_name: &[u8] = match key {
        b"BPC" => b"BitsPerComponent",
        b"CS" => b"ColorSpace",
        b"D" => b"Decode",
        b"DP" => b"DecodeParms",
        b"F" => b"Filter",
        b"H" => b"Height",
        b"IM" => b"ImageMask",
        b"I" => b"Interpolate",
        b"W" => b"Width",
        other => other,
    };
    full_name.to_vec()
}
340
341// --- Operand helpers ---
342fn f(ops: &[PdfObject], idx: usize) -> f64 {
343    ops.get(idx).and_then(|o| o.as_f64()).unwrap_or(0.0)
344}
345fn i(ops: &[PdfObject], idx: usize) -> i32 {
346    ops.get(idx).and_then(|o| o.as_i64()).unwrap_or(0) as i32
347}
348fn n(ops: &[PdfObject], idx: usize) -> Vec<u8> {
349    ops.get(idx)
350        .and_then(|o| o.as_name())
351        .unwrap_or(b"")
352        .to_vec()
353}
354fn s(ops: &[PdfObject], idx: usize) -> Vec<u8> {
355    ops.get(idx)
356        .and_then(|o| o.as_str())
357        .unwrap_or(b"")
358        .to_vec()
359}
360
#[cfg(test)]
mod tests {
    //! Unit tests for the content stream parser.
    //!
    //! Payload checks use `match … _ => panic!` rather than a bare `if let`,
    //! so a wrong variant fails the test instead of silently skipping the
    //! assertions.

    use super::*;

    /// Graphics-state save/restore around a matrix concatenation.
    #[test]
    fn test_basic_ops() {
        let data = b"q 1 0 0 1 100 200 cm Q";
        let ops = parse_content_stream(data).unwrap();
        assert_eq!(ops.len(), 3);
        assert!(matches!(ops[0], ContentOp::SaveState));
        assert!(matches!(ops[1], ContentOp::ConcatMatrix(_)));
        assert!(matches!(ops[2], ContentOp::RestoreState));
    }

    /// A minimal text object: font selection, positioning and a shown string.
    #[test]
    fn test_text_ops() {
        let data = b"BT /F1 12 Tf 100 700 Td (Hello World) Tj ET";
        let ops = parse_content_stream(data).unwrap();
        assert!(matches!(ops[0], ContentOp::BeginText));
        assert!(matches!(ops[2], ContentOp::MoveTextPos(..)));
        assert!(matches!(ops[4], ContentOp::EndText));

        match &ops[1] {
            ContentOp::SetFont(name, size) => {
                assert_eq!(name, b"F1");
                assert_eq!(*size, 12.0);
            }
            _ => panic!("Expected SetFont"),
        }
        match &ops[3] {
            ContentOp::ShowText(text) => assert_eq!(text, b"Hello World"),
            _ => panic!("Expected ShowText"),
        }
    }

    /// Path construction followed by close and stroke.
    #[test]
    fn test_path_ops() {
        let data = b"100 200 m 300 400 l 100 200 300 400 500 600 c h S";
        let ops = parse_content_stream(data).unwrap();
        assert!(matches!(ops[0], ContentOp::MoveTo(100.0, 200.0)));
        assert!(matches!(ops[1], ContentOp::LineTo(300.0, 400.0)));
        assert!(matches!(ops[2], ContentOp::CurveTo(..)));
        assert!(matches!(ops[3], ContentOp::ClosePath));
        assert!(matches!(ops[4], ContentOp::Stroke));
    }

    /// Device RGB stroke colour and device gray fill colour.
    #[test]
    fn test_color_ops() {
        let data = b"1 0 0 RG 0.5 g";
        let ops = parse_content_stream(data).unwrap();
        assert!(matches!(ops[0], ContentOp::SetStrokeRGB(1.0, 0.0, 0.0)));
        assert!(matches!(ops[1], ContentOp::SetFillGray(..)));
    }

    /// `TJ` arrays interleave text runs with numeric adjustments.
    #[test]
    fn test_tj_array() {
        let data = b"[(Hello ) -100 (World)] TJ";
        let ops = parse_content_stream(data).unwrap();
        assert_eq!(ops.len(), 1);
        match &ops[0] {
            ContentOp::ShowTextAdjusted(items) => {
                assert_eq!(items.len(), 3);
                assert!(matches!(items[0], TextOp::Text(ref t) if t == b"Hello "));
                assert!(matches!(items[1], TextOp::Adjustment(-100.0)));
                assert!(matches!(items[2], TextOp::Text(ref t) if t == b"World"));
            }
            _ => panic!("Expected ShowTextAdjusted"),
        }
    }

    /// A marked-content sequence wrapping a text-showing operator.
    #[test]
    fn test_marked_content() {
        let data = b"/Span BMC (text) Tj EMC";
        let ops = parse_content_stream(data).unwrap();
        assert!(matches!(ops[0], ContentOp::BeginMarkedContent(..)));
        assert!(matches!(ops[1], ContentOp::ShowText(..)));
        assert!(matches!(ops[2], ContentOp::EndMarkedContent));
    }

    /// `Do` carries the XObject resource name.
    #[test]
    fn test_xobject() {
        let data = b"/Im0 Do";
        let ops = parse_content_stream(data).unwrap();
        assert_eq!(ops.len(), 1);
        match &ops[0] {
            ContentOp::PaintXObject(name) => assert_eq!(name, b"Im0"),
            _ => panic!("Expected PaintXObject"),
        }
    }
}