1use crate::ops::{ContentOp, TextOp};
4use folio_core::{Matrix2D, Result};
5use folio_cos::PdfObject;
6use folio_cos::parser::parse_object;
7use folio_cos::tokenizer::{Token, Tokenizer};
8
9pub fn parse_content_stream(data: &[u8]) -> Result<Vec<ContentOp>> {
11 let mut tokenizer = Tokenizer::new_at(data, 0);
12 let mut ops = Vec::new();
13 let mut operand_stack: Vec<PdfObject> = Vec::new();
14
15 loop {
16 tokenizer.skip_whitespace_and_comments();
17 if tokenizer.is_eof() {
18 break;
19 }
20
21 let pos = tokenizer.pos();
23 if pos + 2 <= data.len() && &data[pos..pos + 2] == b"BI" {
24 if pos + 2 >= data.len() || is_whitespace_or_delimiter(data[pos + 2]) {
26 tokenizer.set_pos(pos + 2);
27 let op = parse_inline_image(&mut tokenizer)?;
28 ops.push(op);
29 operand_stack.clear();
30 continue;
31 }
32 }
33
34 let token = match tokenizer.next_token()? {
35 Some(t) => t,
36 None => break,
37 };
38
39 match token {
40 Token::Integer(_)
41 | Token::Real(_)
42 | Token::LiteralString(_)
43 | Token::HexString(_)
44 | Token::Name(_)
45 | Token::ArrayBegin => {
46 tokenizer.set_pos(pos);
48 match parse_object(&mut tokenizer)? {
49 Some(obj) => operand_stack.push(obj),
50 None => {}
51 }
52 }
53 Token::Keyword(ref kw) => {
54 let op = build_op(kw, &operand_stack);
55 ops.push(op);
56 operand_stack.clear();
57 }
58 Token::DictBegin => {
59 tokenizer.set_pos(pos);
61 if let Some(obj) = parse_object(&mut tokenizer)? {
62 operand_stack.push(obj);
63 }
64 }
65 _ => {
66 }
68 }
69 }
70
71 Ok(ops)
72}
73
74fn is_whitespace_or_delimiter(b: u8) -> bool {
75 folio_cos::tokenizer::is_whitespace(b) || folio_cos::tokenizer::is_delimiter(b)
76}
77
78fn build_op(operator: &[u8], operands: &[PdfObject]) -> ContentOp {
80 match operator {
81 b"q" => ContentOp::SaveState,
83 b"Q" => ContentOp::RestoreState,
84 b"cm" if operands.len() >= 6 => ContentOp::ConcatMatrix(Matrix2D::new(
85 f(operands, 0),
86 f(operands, 1),
87 f(operands, 2),
88 f(operands, 3),
89 f(operands, 4),
90 f(operands, 5),
91 )),
92 b"w" => ContentOp::SetLineWidth(f(operands, 0)),
93 b"J" => ContentOp::SetLineCap(i(operands, 0)),
94 b"j" => ContentOp::SetLineJoin(i(operands, 0)),
95 b"M" => ContentOp::SetMiterLimit(f(operands, 0)),
96 b"d" => {
97 let arr = operands
98 .first()
99 .and_then(|o| o.as_array())
100 .map(|a| a.iter().filter_map(|v| v.as_f64()).collect())
101 .unwrap_or_default();
102 let phase = f(operands, 1);
103 ContentOp::SetDashPattern(arr, phase)
104 }
105 b"ri" => ContentOp::SetRenderingIntent(n(operands, 0)),
106 b"i" => ContentOp::SetFlatness(f(operands, 0)),
107 b"gs" => ContentOp::SetExtGState(n(operands, 0)),
108
109 b"m" => ContentOp::MoveTo(f(operands, 0), f(operands, 1)),
111 b"l" => ContentOp::LineTo(f(operands, 0), f(operands, 1)),
112 b"c" => ContentOp::CurveTo(
113 f(operands, 0),
114 f(operands, 1),
115 f(operands, 2),
116 f(operands, 3),
117 f(operands, 4),
118 f(operands, 5),
119 ),
120 b"v" => ContentOp::CurveToInitial(
121 f(operands, 0),
122 f(operands, 1),
123 f(operands, 2),
124 f(operands, 3),
125 ),
126 b"y" => ContentOp::CurveToFinal(
127 f(operands, 0),
128 f(operands, 1),
129 f(operands, 2),
130 f(operands, 3),
131 ),
132 b"h" => ContentOp::ClosePath,
133 b"re" => ContentOp::Rectangle(
134 f(operands, 0),
135 f(operands, 1),
136 f(operands, 2),
137 f(operands, 3),
138 ),
139
140 b"S" => ContentOp::Stroke,
142 b"s" => ContentOp::CloseAndStroke,
143 b"f" | b"F" => ContentOp::Fill,
144 b"f*" => ContentOp::FillEvenOdd,
145 b"B" => ContentOp::FillAndStroke,
146 b"B*" => ContentOp::FillAndStrokeEvenOdd,
147 b"b" => ContentOp::CloseFillAndStroke,
148 b"b*" => ContentOp::CloseFillAndStrokeEvenOdd,
149 b"n" => ContentOp::EndPath,
150
151 b"W" => ContentOp::Clip,
153 b"W*" => ContentOp::ClipEvenOdd,
154
155 b"BT" => ContentOp::BeginText,
157 b"ET" => ContentOp::EndText,
158 b"Tc" => ContentOp::SetCharSpacing(f(operands, 0)),
159 b"Tw" => ContentOp::SetWordSpacing(f(operands, 0)),
160 b"Tz" => ContentOp::SetHorizScaling(f(operands, 0)),
161 b"TL" => ContentOp::SetTextLeading(f(operands, 0)),
162 b"Tf" => ContentOp::SetFont(n(operands, 0), f(operands, 1)),
163 b"Tr" => ContentOp::SetTextRenderMode(i(operands, 0)),
164 b"Ts" => ContentOp::SetTextRise(f(operands, 0)),
165 b"Td" => ContentOp::MoveTextPos(f(operands, 0), f(operands, 1)),
166 b"TD" => ContentOp::MoveTextPosSetLeading(f(operands, 0), f(operands, 1)),
167 b"Tm" if operands.len() >= 6 => ContentOp::SetTextMatrix(Matrix2D::new(
168 f(operands, 0),
169 f(operands, 1),
170 f(operands, 2),
171 f(operands, 3),
172 f(operands, 4),
173 f(operands, 5),
174 )),
175 b"T*" => ContentOp::NextLine,
176 b"Tj" => ContentOp::ShowText(s(operands, 0)),
177 b"TJ" => {
178 let items = operands
179 .first()
180 .and_then(|o| o.as_array())
181 .map(|arr| {
182 arr.iter()
183 .map(|item| match item {
184 PdfObject::Str(s) => TextOp::Text(s.clone()),
185 PdfObject::Integer(n) => TextOp::Adjustment(*n as f64),
186 PdfObject::Real(n) => TextOp::Adjustment(*n),
187 _ => TextOp::Adjustment(0.0),
188 })
189 .collect()
190 })
191 .unwrap_or_default();
192 ContentOp::ShowTextAdjusted(items)
193 }
194 b"'" => ContentOp::NextLineShowText(s(operands, 0)),
195 b"\"" => {
196 ContentOp::SetSpacingNextLineShowText(f(operands, 0), f(operands, 1), s(operands, 2))
197 }
198
199 b"CS" => ContentOp::SetStrokeColorSpace(n(operands, 0)),
201 b"cs" => ContentOp::SetFillColorSpace(n(operands, 0)),
202 b"SC" | b"SCN" => {
203 ContentOp::SetStrokeColor(operands.iter().filter_map(|o| o.as_f64()).collect())
204 }
205 b"sc" | b"scn" => {
206 ContentOp::SetFillColor(operands.iter().filter_map(|o| o.as_f64()).collect())
207 }
208 b"G" => ContentOp::SetStrokeGray(f(operands, 0)),
209 b"g" => ContentOp::SetFillGray(f(operands, 0)),
210 b"RG" => ContentOp::SetStrokeRGB(f(operands, 0), f(operands, 1), f(operands, 2)),
211 b"rg" => ContentOp::SetFillRGB(f(operands, 0), f(operands, 1), f(operands, 2)),
212 b"K" => ContentOp::SetStrokeCMYK(
213 f(operands, 0),
214 f(operands, 1),
215 f(operands, 2),
216 f(operands, 3),
217 ),
218 b"k" => ContentOp::SetFillCMYK(
219 f(operands, 0),
220 f(operands, 1),
221 f(operands, 2),
222 f(operands, 3),
223 ),
224
225 b"Do" => ContentOp::PaintXObject(n(operands, 0)),
227 b"sh" => ContentOp::PaintShading(n(operands, 0)),
228
229 b"MP" => ContentOp::MarkedContentPoint(n(operands, 0)),
231 b"DP" => ContentOp::MarkedContentPointProperties(
232 n(operands, 0),
233 operands.get(1).cloned().unwrap_or(PdfObject::Null),
234 ),
235 b"BMC" => ContentOp::BeginMarkedContent(n(operands, 0)),
236 b"BDC" => ContentOp::BeginMarkedContentProperties(
237 n(operands, 0),
238 operands.get(1).cloned().unwrap_or(PdfObject::Null),
239 ),
240 b"EMC" => ContentOp::EndMarkedContent,
241
242 b"BX" => ContentOp::BeginCompat,
244 b"EX" => ContentOp::EndCompat,
245
246 _ => ContentOp::Unknown(operator.to_vec(), operands.to_vec()),
248 }
249}
250
251fn parse_inline_image(tokenizer: &mut Tokenizer) -> Result<ContentOp> {
253 tokenizer.skip_whitespace_and_comments();
254
255 let mut dict = Vec::new();
257 loop {
258 tokenizer.skip_whitespace_and_comments();
259 if tokenizer.is_eof() {
260 break;
261 }
262
263 let pos = tokenizer.pos();
265 let data = tokenizer.data();
266 if pos + 2 <= data.len() && &data[pos..pos + 2] == b"ID" {
267 tokenizer.set_pos(pos + 2);
268 if !tokenizer.is_eof() {
270 tokenizer.set_pos(tokenizer.pos() + 1);
271 }
272 break;
273 }
274
275 match tokenizer.next_token()? {
276 Some(Token::Name(key)) => {
277 let full_key = expand_inline_image_key(&key);
279 match parse_object(tokenizer)? {
280 Some(val) => dict.push((full_key, val)),
281 None => break,
282 }
283 }
284 _ => break,
285 }
286 }
287
288 let start = tokenizer.pos();
290 let data = tokenizer.data();
291 let mut end = start;
292
293 while end < data.len() {
295 if end + 2 < data.len()
296 && data[end] == b'E'
297 && data[end + 1] == b'I'
298 && (end == start || is_whitespace_byte(data[end - 1]))
299 && (end + 2 >= data.len() || is_whitespace_or_delimiter(data[end + 2]))
300 {
301 break;
302 }
303 end += 1;
304 }
305
306 let mut img_end = end;
308 while img_end > start && is_whitespace_byte(data[img_end - 1]) {
309 img_end -= 1;
310 }
311
312 let image_data = data[start..img_end].to_vec();
313 tokenizer.set_pos(end + 2); Ok(ContentOp::InlineImage {
316 dict,
317 data: image_data,
318 })
319}
320
/// Returns `true` for the six whitespace bytes recognised here: NUL, tab,
/// line feed, form feed, carriage return and space.
fn is_whitespace_byte(b: u8) -> bool {
    const WHITESPACE: [u8; 6] = [0x00, b'\t', b'\n', 0x0c, b'\r', b' '];
    WHITESPACE.contains(&b)
}
324
/// Maps an abbreviated inline-image dictionary key to its full name.
///
/// Keys that are not in the abbreviation table are returned unchanged.
fn expand_inline_image_key(key: &[u8]) -> Vec<u8> {
    const ABBREVIATIONS: [(&[u8], &[u8]); 9] = [
        (b"BPC", b"BitsPerComponent"),
        (b"CS", b"ColorSpace"),
        (b"D", b"Decode"),
        (b"DP", b"DecodeParms"),
        (b"F", b"Filter"),
        (b"H", b"Height"),
        (b"IM", b"ImageMask"),
        (b"I", b"Interpolate"),
        (b"W", b"Width"),
    ];

    ABBREVIATIONS
        .iter()
        .find(|(short, _)| *short == key)
        .map_or(key, |(_, long)| *long)
        .to_vec()
}
340
341fn f(ops: &[PdfObject], idx: usize) -> f64 {
343 ops.get(idx).and_then(|o| o.as_f64()).unwrap_or(0.0)
344}
345fn i(ops: &[PdfObject], idx: usize) -> i32 {
346 ops.get(idx).and_then(|o| o.as_i64()).unwrap_or(0) as i32
347}
348fn n(ops: &[PdfObject], idx: usize) -> Vec<u8> {
349 ops.get(idx)
350 .and_then(|o| o.as_name())
351 .unwrap_or(b"")
352 .to_vec()
353}
354fn s(ops: &[PdfObject], idx: usize) -> Vec<u8> {
355 ops.get(idx)
356 .and_then(|o| o.as_str())
357 .unwrap_or(b"")
358 .to_vec()
359}
360
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the parser over `src`, panicking if it reports an error.
    fn parse(src: &[u8]) -> Vec<ContentOp> {
        parse_content_stream(src).unwrap()
    }

    /// Graphics-state save/restore around a matrix concatenation.
    #[test]
    fn test_basic_ops() {
        let ops = parse(b"q 1 0 0 1 100 200 cm Q");
        assert_eq!(ops.len(), 3);
        assert!(matches!(ops[0], ContentOp::SaveState));
        assert!(matches!(ops[1], ContentOp::ConcatMatrix(_)));
        assert!(matches!(ops[2], ContentOp::RestoreState));
    }

    /// A minimal text object: font selection, positioning and one string.
    #[test]
    fn test_text_ops() {
        let ops = parse(b"BT /F1 12 Tf 100 700 Td (Hello World) Tj ET");
        assert!(matches!(ops[0], ContentOp::BeginText));
        assert!(matches!(ops[1], ContentOp::SetFont(..)));
        assert!(matches!(ops[2], ContentOp::MoveTextPos(..)));
        assert!(matches!(ops[3], ContentOp::ShowText(..)));
        assert!(matches!(ops[4], ContentOp::EndText));

        if let ContentOp::SetFont(name, size) = &ops[1] {
            assert_eq!(name, b"F1");
            assert_eq!(*size, 12.0);
        }
        if let ContentOp::ShowText(text) = &ops[3] {
            assert_eq!(text, b"Hello World");
        }
    }

    /// Path construction followed by a stroke.
    #[test]
    fn test_path_ops() {
        let ops = parse(b"100 200 m 300 400 l 100 200 300 400 500 600 c h S");
        assert!(matches!(ops[0], ContentOp::MoveTo(100.0, 200.0)));
        assert!(matches!(ops[1], ContentOp::LineTo(300.0, 400.0)));
        assert!(matches!(ops[2], ContentOp::CurveTo(..)));
        assert!(matches!(ops[3], ContentOp::ClosePath));
        assert!(matches!(ops[4], ContentOp::Stroke));
    }

    /// Stroke RGB and fill gray.
    #[test]
    fn test_color_ops() {
        let ops = parse(b"1 0 0 RG 0.5 g");
        assert!(matches!(ops[0], ContentOp::SetStrokeRGB(1.0, 0.0, 0.0)));
        assert!(matches!(ops[1], ContentOp::SetFillGray(..)));
    }

    /// `TJ` with interleaved strings and a numeric adjustment.
    #[test]
    fn test_tj_array() {
        let ops = parse(b"[(Hello ) -100 (World)] TJ");
        assert_eq!(ops.len(), 1);
        match &ops[0] {
            ContentOp::ShowTextAdjusted(items) => {
                assert_eq!(items.len(), 3);
                assert!(matches!(items[0], TextOp::Text(ref t) if t == b"Hello "));
                assert!(matches!(items[1], TextOp::Adjustment(-100.0)));
                assert!(matches!(items[2], TextOp::Text(ref t) if t == b"World"));
            }
            _ => panic!("Expected ShowTextAdjusted"),
        }
    }

    /// A marked-content section wrapping a text-showing operator.
    #[test]
    fn test_marked_content() {
        let ops = parse(b"/Span BMC (text) Tj EMC");
        assert!(matches!(ops[0], ContentOp::BeginMarkedContent(..)));
        assert!(matches!(ops[1], ContentOp::ShowText(..)));
        assert!(matches!(ops[2], ContentOp::EndMarkedContent));
    }

    /// `Do` with a named XObject.
    #[test]
    fn test_xobject() {
        let ops = parse(b"/Im0 Do");
        assert_eq!(ops.len(), 1);
        if let ContentOp::PaintXObject(name) = &ops[0] {
            assert_eq!(name, b"Im0");
        }
    }
}