1use std::collections::HashMap;
8
9use crate::cid_font::{
10 CidFontMetrics, extract_cid_font_metrics, get_descendant_font, get_type0_encoding,
11 is_type0_font, parse_predefined_cmap_name, strip_subset_prefix,
12};
13use crate::cmap::CMap;
14use crate::color_space::resolve_color_space_name;
15use crate::error::BackendError;
16use crate::font_metrics::{FontMetrics, extract_font_metrics};
17use crate::handler::{CharEvent, ContentHandler, ImageEvent};
18use crate::interpreter_state::InterpreterState;
19use crate::lopdf_backend::object_to_f64;
20use crate::text_renderer::{
21 TjElement, show_string, show_string_cid, show_string_with_positioning_mode,
22};
23use crate::text_state::TextState;
24use crate::tokenizer::{Operand, tokenize};
25use pdfplumber_core::{ExtractOptions, ExtractWarning};
26
/// Everything the interpreter remembers about one font resource after the
/// first `Tf` that selects it within a content stream.
struct CachedFont {
    /// Width/ascent/descent metrics. For Type0 fonts these are synthesized
    /// from the descendant CID metrics, or are the defaults when those could
    /// not be extracted.
    metrics: FontMetrics,
    /// Parsed `/ToUnicode` CMap, if the font dictionary has one that decodes
    /// and parses successfully.
    cmap: Option<CMap>,
    /// `/BaseFont` with any subset prefix stripped; the resource name is used
    /// when `/BaseFont` is absent or the font could not be found.
    base_name: String,
    /// Descendant CID font metrics; only ever `Some` for Type0 fonts.
    cid_metrics: Option<CidFontMetrics>,
    /// `true` when the font dictionary was recognized as a Type0 font.
    is_cid_font: bool,
    /// Writing mode taken from the predefined CMap named by `/Encoding`
    /// (0 when unknown). Presumably 0 = horizontal, 1 = vertical, following
    /// the PDF `WMode` convention — TODO confirm against `cid_font`.
    #[allow(dead_code)]
    writing_mode: u8,
}
41
42#[allow(clippy::too_many_arguments)]
59pub(crate) fn interpret_content_stream(
60 doc: &lopdf::Document,
61 stream_bytes: &[u8],
62 resources: &lopdf::Dictionary,
63 handler: &mut dyn ContentHandler,
64 options: &ExtractOptions,
65 depth: usize,
66 gstate: &mut InterpreterState,
67 tstate: &mut TextState,
68) -> Result<(), BackendError> {
69 if depth > options.max_recursion_depth {
70 return Err(BackendError::Interpreter(format!(
71 "Form XObject recursion depth {} exceeds limit {}",
72 depth, options.max_recursion_depth
73 )));
74 }
75
76 let operators = tokenize(stream_bytes)?;
77 let mut font_cache: HashMap<String, CachedFont> = HashMap::new();
78
79 for (op_index, op) in operators.iter().enumerate() {
80 match op.name.as_str() {
81 "q" => gstate.save_state(),
83 "Q" => {
84 gstate.restore_state();
85 }
86 "cm" => {
87 if op.operands.len() >= 6 {
88 let a = get_f64(&op.operands, 0).unwrap_or(1.0);
89 let b = get_f64(&op.operands, 1).unwrap_or(0.0);
90 let c = get_f64(&op.operands, 2).unwrap_or(0.0);
91 let d = get_f64(&op.operands, 3).unwrap_or(1.0);
92 let e = get_f64(&op.operands, 4).unwrap_or(0.0);
93 let f = get_f64(&op.operands, 5).unwrap_or(0.0);
94 gstate.concat_matrix(a, b, c, d, e, f);
95 }
96 }
97 "w" => {
98 if let Some(v) = get_f64(&op.operands, 0) {
99 gstate.set_line_width(v);
100 }
101 }
102
103 "G" => {
105 if let Some(g) = get_f32(&op.operands, 0) {
106 gstate.set_stroking_gray(g);
107 }
108 }
109 "g" => {
110 if let Some(g) = get_f32(&op.operands, 0) {
111 gstate.set_non_stroking_gray(g);
112 }
113 }
114 "RG" => {
115 if op.operands.len() >= 3 {
116 let r = get_f32(&op.operands, 0).unwrap_or(0.0);
117 let g = get_f32(&op.operands, 1).unwrap_or(0.0);
118 let b = get_f32(&op.operands, 2).unwrap_or(0.0);
119 gstate.set_stroking_rgb(r, g, b);
120 }
121 }
122 "rg" => {
123 if op.operands.len() >= 3 {
124 let r = get_f32(&op.operands, 0).unwrap_or(0.0);
125 let g = get_f32(&op.operands, 1).unwrap_or(0.0);
126 let b = get_f32(&op.operands, 2).unwrap_or(0.0);
127 gstate.set_non_stroking_rgb(r, g, b);
128 }
129 }
130 "K" => {
131 if op.operands.len() >= 4 {
132 let c = get_f32(&op.operands, 0).unwrap_or(0.0);
133 let m = get_f32(&op.operands, 1).unwrap_or(0.0);
134 let y = get_f32(&op.operands, 2).unwrap_or(0.0);
135 let k = get_f32(&op.operands, 3).unwrap_or(0.0);
136 gstate.set_stroking_cmyk(c, m, y, k);
137 }
138 }
139 "k" => {
140 if op.operands.len() >= 4 {
141 let c = get_f32(&op.operands, 0).unwrap_or(0.0);
142 let m = get_f32(&op.operands, 1).unwrap_or(0.0);
143 let y = get_f32(&op.operands, 2).unwrap_or(0.0);
144 let k = get_f32(&op.operands, 3).unwrap_or(0.0);
145 gstate.set_non_stroking_cmyk(c, m, y, k);
146 }
147 }
148 "CS" => {
149 if let Some(Operand::Name(name)) = op.operands.first() {
150 if let Some(cs) = resolve_color_space_name(name, doc, resources) {
151 gstate.set_stroking_color_space(cs);
152 }
153 }
154 }
155 "cs" => {
156 if let Some(Operand::Name(name)) = op.operands.first() {
157 if let Some(cs) = resolve_color_space_name(name, doc, resources) {
158 gstate.set_non_stroking_color_space(cs);
159 }
160 }
161 }
162 "SC" | "SCN" => {
163 let components: Vec<f32> = op.operands.iter().filter_map(operand_to_f32).collect();
164 gstate.set_stroking_color(&components);
165 }
166 "sc" | "scn" => {
167 let components: Vec<f32> = op.operands.iter().filter_map(operand_to_f32).collect();
168 gstate.set_non_stroking_color(&components);
169 }
170
171 "BT" => tstate.begin_text(),
173 "ET" => tstate.end_text(),
174 "Tf" => {
175 if op.operands.len() >= 2 {
176 let font_name = operand_to_name(&op.operands[0]);
177 let size = get_f64(&op.operands, 1).unwrap_or(0.0);
178 tstate.set_font(font_name.clone(), size);
179 load_font_if_needed(
180 doc,
181 resources,
182 &font_name,
183 &mut font_cache,
184 handler,
185 options,
186 op_index,
187 );
188 }
189 }
190 "Tm" => {
191 if op.operands.len() >= 6 {
192 let a = get_f64(&op.operands, 0).unwrap_or(1.0);
193 let b = get_f64(&op.operands, 1).unwrap_or(0.0);
194 let c = get_f64(&op.operands, 2).unwrap_or(0.0);
195 let d = get_f64(&op.operands, 3).unwrap_or(1.0);
196 let e = get_f64(&op.operands, 4).unwrap_or(0.0);
197 let f = get_f64(&op.operands, 5).unwrap_or(0.0);
198 tstate.set_text_matrix(a, b, c, d, e, f);
199 }
200 }
201 "Td" => {
202 if op.operands.len() >= 2 {
203 let tx = get_f64(&op.operands, 0).unwrap_or(0.0);
204 let ty = get_f64(&op.operands, 1).unwrap_or(0.0);
205 tstate.move_text_position(tx, ty);
206 }
207 }
208 "TD" => {
209 if op.operands.len() >= 2 {
210 let tx = get_f64(&op.operands, 0).unwrap_or(0.0);
211 let ty = get_f64(&op.operands, 1).unwrap_or(0.0);
212 tstate.move_text_position_and_set_leading(tx, ty);
213 }
214 }
215 "T*" => tstate.move_to_next_line(),
216 "Tc" => {
217 if let Some(v) = get_f64(&op.operands, 0) {
218 tstate.set_char_spacing(v);
219 }
220 }
221 "Tw" => {
222 if let Some(v) = get_f64(&op.operands, 0) {
223 tstate.set_word_spacing(v);
224 }
225 }
226 "Tz" => {
227 if let Some(v) = get_f64(&op.operands, 0) {
228 tstate.set_h_scaling(v);
229 }
230 }
231 "TL" => {
232 if let Some(v) = get_f64(&op.operands, 0) {
233 tstate.set_leading(v);
234 }
235 }
236 "Tr" => {
237 if let Some(v) = get_i64(&op.operands, 0) {
238 if let Some(mode) = crate::text_state::TextRenderMode::from_i64(v) {
239 tstate.set_render_mode(mode);
240 }
241 }
242 }
243 "Ts" => {
244 if let Some(v) = get_f64(&op.operands, 0) {
245 tstate.set_rise(v);
246 }
247 }
248
249 "Tj" => {
251 handle_tj(tstate, gstate, handler, &op.operands, &font_cache);
252 }
253 "TJ" => {
254 handle_tj_array(tstate, gstate, handler, &op.operands, &font_cache);
255 }
256 "'" => {
257 tstate.move_to_next_line();
259 handle_tj(tstate, gstate, handler, &op.operands, &font_cache);
260 }
261 "\"" => {
262 if op.operands.len() >= 3 {
264 if let Some(aw) = get_f64(&op.operands, 0) {
265 tstate.set_word_spacing(aw);
266 }
267 if let Some(ac) = get_f64(&op.operands, 1) {
268 tstate.set_char_spacing(ac);
269 }
270 tstate.move_to_next_line();
271 let string_operands = vec![op.operands[2].clone()];
273 handle_tj(tstate, gstate, handler, &string_operands, &font_cache);
274 }
275 }
276
277 "Do" => {
279 if let Some(Operand::Name(name)) = op.operands.first() {
280 handle_do(
281 doc, resources, handler, options, depth, gstate, tstate, name,
282 )?;
283 }
284 }
285
286 _ => {}
288 }
289 }
290
291 Ok(())
292}
293
294fn get_f64(operands: &[Operand], index: usize) -> Option<f64> {
297 operands.get(index).and_then(|o| match o {
298 Operand::Integer(i) => Some(*i as f64),
299 Operand::Real(f) => Some(*f),
300 _ => None,
301 })
302}
303
304fn get_f32(operands: &[Operand], index: usize) -> Option<f32> {
305 get_f64(operands, index).map(|v| v as f32)
306}
307
308fn get_i64(operands: &[Operand], index: usize) -> Option<i64> {
309 operands.get(index).and_then(|o| match o {
310 Operand::Integer(i) => Some(*i),
311 Operand::Real(f) => Some(*f as i64),
312 _ => None,
313 })
314}
315
316fn operand_to_f32(o: &Operand) -> Option<f32> {
317 match o {
318 Operand::Integer(i) => Some(*i as f32),
319 Operand::Real(f) => Some(*f as f32),
320 _ => None,
321 }
322}
323
324fn operand_to_name(o: &Operand) -> String {
325 match o {
326 Operand::Name(n) => n.clone(),
327 _ => String::new(),
328 }
329}
330
331fn operand_to_string_bytes(o: &Operand) -> Option<&[u8]> {
332 match o {
333 Operand::LiteralString(s) | Operand::HexString(s) => Some(s),
334 _ => None,
335 }
336}
337
338#[allow(clippy::too_many_arguments)]
341fn load_font_if_needed(
342 doc: &lopdf::Document,
343 resources: &lopdf::Dictionary,
344 font_name: &str,
345 cache: &mut HashMap<String, CachedFont>,
346 handler: &mut dyn ContentHandler,
347 options: &ExtractOptions,
348 op_index: usize,
349) {
350 if cache.contains_key(font_name) {
351 return;
352 }
353
354 let font_dict = (|| -> Option<&lopdf::Dictionary> {
356 let fonts_obj = resources.get(b"Font").ok()?;
357 let fonts_obj = resolve_ref(doc, fonts_obj);
358 let fonts_dict = fonts_obj.as_dict().ok()?;
359 let font_obj = fonts_dict.get(font_name.as_bytes()).ok()?;
360 let font_obj = resolve_ref(doc, font_obj);
361 font_obj.as_dict().ok()
362 })();
363
364 let (metrics, cmap, base_name, cid_metrics, is_cid_font, writing_mode) =
365 if let Some(fd) = font_dict {
366 if is_type0_font(fd) {
367 let (cid_met, wm) = load_cid_font(doc, fd);
369 let metrics = if let Some(ref cm) = cid_met {
370 FontMetrics::new(
372 Vec::new(),
373 0,
374 0,
375 cm.default_width(),
376 cm.ascent(),
377 cm.descent(),
378 cm.font_bbox(),
379 )
380 } else {
381 if options.collect_warnings {
382 handler.on_warning(ExtractWarning::with_operator_context(
383 "CID font metrics not available, using defaults",
384 op_index,
385 font_name,
386 ));
387 }
388 FontMetrics::default_metrics()
389 };
390
391 let cmap = extract_tounicode_cmap(doc, fd);
393
394 let raw_base_name = fd
395 .get(b"BaseFont")
396 .ok()
397 .and_then(|o| o.as_name_str().ok())
398 .unwrap_or(font_name);
399 let base_name = strip_subset_prefix(raw_base_name).to_string();
400
401 (metrics, cmap, base_name, cid_met, true, wm)
402 } else {
403 let metrics = match extract_font_metrics(doc, fd) {
405 Ok(m) => m,
406 Err(_) => {
407 if options.collect_warnings {
408 handler.on_warning(ExtractWarning::with_operator_context(
409 "failed to extract font metrics, using defaults",
410 op_index,
411 font_name,
412 ));
413 }
414 FontMetrics::default_metrics()
415 }
416 };
417 let cmap = extract_tounicode_cmap(doc, fd);
418 let raw_base_name = fd
419 .get(b"BaseFont")
420 .ok()
421 .and_then(|o| o.as_name_str().ok())
422 .unwrap_or(font_name);
423 let base_name = strip_subset_prefix(raw_base_name).to_string();
424
425 (metrics, cmap, base_name, None, false, 0)
426 }
427 } else {
428 if options.collect_warnings {
430 handler.on_warning(ExtractWarning::with_operator_context(
431 "font not found in page resources, using defaults",
432 op_index,
433 font_name,
434 ));
435 }
436 (
437 FontMetrics::default_metrics(),
438 None,
439 font_name.to_string(),
440 None,
441 false,
442 0,
443 )
444 };
445
446 cache.insert(
447 font_name.to_string(),
448 CachedFont {
449 metrics,
450 cmap,
451 base_name,
452 cid_metrics,
453 is_cid_font,
454 writing_mode,
455 },
456 );
457}
458
459fn extract_tounicode_cmap(doc: &lopdf::Document, fd: &lopdf::Dictionary) -> Option<CMap> {
461 let tounicode_obj = fd.get(b"ToUnicode").ok()?;
462 let tounicode_obj = resolve_ref(doc, tounicode_obj);
463 let stream = tounicode_obj.as_stream().ok()?;
464 let data = decode_stream(stream).ok()?;
465 CMap::parse(&data).ok()
466}
467
468fn load_cid_font(
470 doc: &lopdf::Document,
471 type0_dict: &lopdf::Dictionary,
472) -> (Option<CidFontMetrics>, u8) {
473 let writing_mode = get_type0_encoding(type0_dict)
475 .and_then(|enc| parse_predefined_cmap_name(&enc))
476 .map(|info| info.writing_mode)
477 .unwrap_or(0);
478
479 let cid_metrics = get_descendant_font(doc, type0_dict)
481 .and_then(|desc| extract_cid_font_metrics(doc, desc).ok());
482
483 (cid_metrics, writing_mode)
484}
485
486fn get_width_fn(cached: Option<&CachedFont>) -> Box<dyn Fn(u32) -> f64 + '_> {
491 match cached {
492 Some(cf) if cf.is_cid_font => {
493 if let Some(ref cid_met) = cf.cid_metrics {
494 Box::new(move |code: u32| cid_met.get_width(code))
495 } else {
496 Box::new(move |code: u32| cf.metrics.get_width(code))
497 }
498 }
499 Some(cf) => Box::new(move |code: u32| cf.metrics.get_width(code)),
500 None => {
501 let default_metrics = FontMetrics::default_metrics();
502 Box::new(move |code: u32| default_metrics.get_width(code))
503 }
504 }
505}
506
507fn handle_tj(
508 tstate: &mut TextState,
509 gstate: &InterpreterState,
510 handler: &mut dyn ContentHandler,
511 operands: &[Operand],
512 font_cache: &HashMap<String, CachedFont>,
513) {
514 let string_bytes = match operands.first().and_then(operand_to_string_bytes) {
515 Some(bytes) => bytes,
516 None => return,
517 };
518
519 let cached = font_cache.get(&tstate.font_name);
520 let width_fn = get_width_fn(cached);
521 let is_cid = cached.is_some_and(|c| c.is_cid_font);
522 let raw_chars = if is_cid {
523 show_string_cid(tstate, string_bytes, &*width_fn)
524 } else {
525 show_string(tstate, string_bytes, &*width_fn)
526 };
527
528 emit_char_events(raw_chars, tstate, gstate, handler, cached);
529}
530
531fn handle_tj_array(
532 tstate: &mut TextState,
533 gstate: &InterpreterState,
534 handler: &mut dyn ContentHandler,
535 operands: &[Operand],
536 font_cache: &HashMap<String, CachedFont>,
537) {
538 let array = match operands.first() {
539 Some(Operand::Array(arr)) => arr,
540 _ => return,
541 };
542
543 let elements: Vec<TjElement> = array
545 .iter()
546 .filter_map(|o| match o {
547 Operand::LiteralString(s) | Operand::HexString(s) => Some(TjElement::String(s.clone())),
548 Operand::Integer(i) => Some(TjElement::Adjustment(*i as f64)),
549 Operand::Real(f) => Some(TjElement::Adjustment(*f)),
550 _ => None,
551 })
552 .collect();
553
554 let cached = font_cache.get(&tstate.font_name);
555 let width_fn = get_width_fn(cached);
556 let is_cid = cached.is_some_and(|c| c.is_cid_font);
557 let raw_chars = show_string_with_positioning_mode(tstate, &elements, &*width_fn, is_cid);
558
559 emit_char_events(raw_chars, tstate, gstate, handler, cached);
560}
561
562fn emit_char_events(
563 raw_chars: Vec<crate::text_renderer::RawChar>,
564 tstate: &TextState,
565 gstate: &InterpreterState,
566 handler: &mut dyn ContentHandler,
567 cached: Option<&CachedFont>,
568) {
569 let ctm = gstate.ctm_array();
570 let font_name = cached.map_or_else(|| tstate.font_name.clone(), |c| c.base_name.clone());
571
572 for rc in raw_chars {
573 let unicode = cached.and_then(|c| {
574 c.cmap
575 .as_ref()
576 .and_then(|cm| cm.lookup(rc.char_code).map(|s| s.to_string()))
577 });
578
579 let displacement = match cached {
581 Some(cf) if cf.is_cid_font => cf
582 .cid_metrics
583 .as_ref()
584 .map_or(600.0, |cm| cm.get_width(rc.char_code)),
585 Some(cf) => cf.metrics.get_width(rc.char_code),
586 None => 600.0,
587 };
588
589 handler.on_char(CharEvent {
590 char_code: rc.char_code,
591 unicode,
592 font_name: font_name.clone(),
593 font_size: tstate.font_size,
594 text_matrix: rc.text_matrix,
595 ctm,
596 displacement,
597 char_spacing: tstate.char_spacing,
598 word_spacing: tstate.word_spacing,
599 h_scaling: tstate.h_scaling_normalized(),
600 rise: tstate.rise,
601 });
602 }
603}
604
605#[allow(clippy::too_many_arguments)]
608fn handle_do(
609 doc: &lopdf::Document,
610 resources: &lopdf::Dictionary,
611 handler: &mut dyn ContentHandler,
612 options: &ExtractOptions,
613 depth: usize,
614 gstate: &mut InterpreterState,
615 tstate: &mut TextState,
616 name: &str,
617) -> Result<(), BackendError> {
618 let xobj_dict = resources.get(b"XObject").map_err(|_| {
620 BackendError::Interpreter(format!(
621 "no /XObject dictionary in resources for Do /{name}"
622 ))
623 })?;
624 let xobj_dict = resolve_ref(doc, xobj_dict);
625 let xobj_dict = xobj_dict.as_dict().map_err(|_| {
626 BackendError::Interpreter("/XObject resource is not a dictionary".to_string())
627 })?;
628
629 let xobj_entry = xobj_dict.get(name.as_bytes()).map_err(|_| {
630 BackendError::Interpreter(format!("XObject /{name} not found in resources"))
631 })?;
632
633 let xobj_id = xobj_entry.as_reference().map_err(|_| {
634 BackendError::Interpreter(format!("XObject /{name} is not an indirect reference"))
635 })?;
636
637 let xobj = doc.get_object(xobj_id).map_err(|e| {
638 BackendError::Interpreter(format!("failed to resolve XObject /{name}: {e}"))
639 })?;
640
641 let stream = xobj
642 .as_stream()
643 .map_err(|e| BackendError::Interpreter(format!("XObject /{name} is not a stream: {e}")))?;
644
645 let subtype = stream
646 .dict
647 .get(b"Subtype")
648 .ok()
649 .and_then(|o| o.as_name_str().ok())
650 .unwrap_or("");
651
652 match subtype {
653 "Form" => handle_form_xobject(
654 doc, stream, name, resources, handler, options, depth, gstate, tstate,
655 ),
656 "Image" => {
657 handle_image_xobject(stream, name, gstate, handler);
658 Ok(())
659 }
660 _ => {
661 Ok(())
663 }
664 }
665}
666
667#[allow(clippy::too_many_arguments)]
668fn handle_form_xobject(
669 doc: &lopdf::Document,
670 stream: &lopdf::Stream,
671 name: &str,
672 parent_resources: &lopdf::Dictionary,
673 handler: &mut dyn ContentHandler,
674 options: &ExtractOptions,
675 depth: usize,
676 gstate: &mut InterpreterState,
677 tstate: &mut TextState,
678) -> Result<(), BackendError> {
679 gstate.save_state();
681
682 if let Ok(matrix_obj) = stream.dict.get(b"Matrix") {
684 if let Ok(arr) = matrix_obj.as_array() {
685 if arr.len() == 6 {
686 let vals: Result<Vec<f64>, _> = arr.iter().map(object_to_f64).collect();
687 if let Ok(vals) = vals {
688 gstate.concat_matrix(vals[0], vals[1], vals[2], vals[3], vals[4], vals[5]);
689 }
690 }
691 }
692 }
693
694 let form_resources_dict;
696 let form_resources = if let Ok(res_obj) = stream.dict.get(b"Resources") {
697 let res_obj = resolve_ref(doc, res_obj);
698 match res_obj.as_dict() {
699 Ok(d) => d,
700 Err(_) => parent_resources,
701 }
702 } else {
703 if let Ok(res_ref) = stream.dict.get(b"Resources") {
707 if let Ok(id) = res_ref.as_reference() {
708 if let Ok(obj) = doc.get_object(id) {
709 if let Ok(d) = obj.as_dict() {
710 form_resources_dict = d.clone();
711 &form_resources_dict
712 } else {
713 parent_resources
714 }
715 } else {
716 parent_resources
717 }
718 } else {
719 parent_resources
720 }
721 } else {
722 parent_resources
723 }
724 };
725
726 let content_bytes = decode_stream(stream).map_err(|e| {
728 BackendError::Interpreter(format!("failed to decode Form XObject /{name} stream: {e}"))
729 })?;
730
731 interpret_content_stream(
733 doc,
734 &content_bytes,
735 form_resources,
736 handler,
737 options,
738 depth + 1,
739 gstate,
740 tstate,
741 )?;
742
743 gstate.restore_state();
745
746 Ok(())
747}
748
749fn handle_image_xobject(
750 stream: &lopdf::Stream,
751 name: &str,
752 gstate: &InterpreterState,
753 handler: &mut dyn ContentHandler,
754) {
755 let width = stream
756 .dict
757 .get(b"Width")
758 .ok()
759 .and_then(|o| o.as_i64().ok())
760 .unwrap_or(0) as u32;
761
762 let height = stream
763 .dict
764 .get(b"Height")
765 .ok()
766 .and_then(|o| o.as_i64().ok())
767 .unwrap_or(0) as u32;
768
769 let colorspace = stream
770 .dict
771 .get(b"ColorSpace")
772 .ok()
773 .and_then(|o| o.as_name_str().ok())
774 .map(|s| s.to_string());
775
776 let bits_per_component = stream
777 .dict
778 .get(b"BitsPerComponent")
779 .ok()
780 .and_then(|o| o.as_i64().ok())
781 .map(|v| v as u32);
782
783 handler.on_image(ImageEvent {
784 name: name.to_string(),
785 ctm: gstate.ctm_array(),
786 width,
787 height,
788 colorspace,
789 bits_per_component,
790 });
791}
792
793fn resolve_ref<'a>(doc: &'a lopdf::Document, obj: &'a lopdf::Object) -> &'a lopdf::Object {
798 match obj {
799 lopdf::Object::Reference(id) => doc.get_object(*id).unwrap_or(obj),
800 _ => obj,
801 }
802}
803
804fn decode_stream(stream: &lopdf::Stream) -> Result<Vec<u8>, BackendError> {
806 if stream.dict.get(b"Filter").is_ok() {
808 stream
809 .decompressed_content()
810 .map_err(|e| BackendError::Interpreter(format!("stream decompression failed: {e}")))
811 } else {
812 Ok(stream.content.clone())
813 }
814}
815
#[cfg(test)]
mod tests {
    use super::*;
    use crate::handler::{CharEvent, ContentHandler, ImageEvent};

    /// Handler that records every event it receives so tests can inspect them.
    struct CollectingHandler {
        chars: Vec<CharEvent>,
        images: Vec<ImageEvent>,
        warnings: Vec<ExtractWarning>,
    }

    impl CollectingHandler {
        fn new() -> Self {
            Self {
                chars: Vec::new(),
                images: Vec::new(),
                warnings: Vec::new(),
            }
        }
    }

    impl ContentHandler for CollectingHandler {
        fn on_char(&mut self, event: CharEvent) {
            self.chars.push(event);
        }
        fn on_image(&mut self, event: ImageEvent) {
            self.images.push(event);
        }
        fn on_warning(&mut self, warning: ExtractWarning) {
            self.warnings.push(warning);
        }
    }

    /// A resource dictionary with no fonts or XObjects.
    fn empty_resources() -> lopdf::Dictionary {
        lopdf::Dictionary::new()
    }

    fn default_options() -> ExtractOptions {
        ExtractOptions::default()
    }

    /// Interprets `stream` with a fresh handler, graphics state and text state
    /// at the given recursion `depth`, returning the outcome alongside the
    /// handler and the final graphics state.
    fn interpret_at_depth(
        doc: &lopdf::Document,
        resources: &lopdf::Dictionary,
        stream: &[u8],
        options: &ExtractOptions,
        depth: usize,
    ) -> (Result<(), BackendError>, CollectingHandler, InterpreterState) {
        let mut handler = CollectingHandler::new();
        let mut gstate = InterpreterState::new();
        let mut tstate = TextState::new();

        let result = interpret_content_stream(
            doc,
            stream,
            resources,
            &mut handler,
            options,
            depth,
            &mut gstate,
            &mut tstate,
        );

        (result, handler, gstate)
    }

    /// Interprets `stream` at depth 0 and panics if interpretation fails.
    fn interpret_ok(
        doc: &lopdf::Document,
        resources: &lopdf::Dictionary,
        stream: &[u8],
        options: &ExtractOptions,
    ) -> (CollectingHandler, InterpreterState) {
        let (result, handler, gstate) = interpret_at_depth(doc, resources, stream, options, 0);
        result.unwrap();
        (handler, gstate)
    }

    // --- Basic text interpretation ---

    #[test]
    fn interpret_simple_text() {
        let doc = lopdf::Document::with_version("1.5");
        let resources = empty_resources();
        let stream = b"BT /F1 12 Tf 72 700 Td (Hello) Tj ET";

        let (handler, _) = interpret_ok(&doc, &resources, stream, &default_options());

        assert_eq!(handler.chars.len(), 5);
        assert_eq!(handler.chars[0].char_code, b'H' as u32);
        assert_eq!(handler.chars[1].char_code, b'e' as u32);
        assert_eq!(handler.chars[4].char_code, b'o' as u32);
        assert_eq!(handler.chars[0].font_size, 12.0);
    }

    #[test]
    fn interpret_tj_array() {
        let doc = lopdf::Document::with_version("1.5");
        let resources = empty_resources();
        let stream = b"BT /F1 12 Tf [(H) -20 (i)] TJ ET";

        let (handler, _) = interpret_ok(&doc, &resources, stream, &default_options());

        assert_eq!(handler.chars.len(), 2);
        assert_eq!(handler.chars[0].char_code, b'H' as u32);
        assert_eq!(handler.chars[1].char_code, b'i' as u32);
    }

    #[test]
    fn interpret_ctm_passed_to_char_events() {
        let doc = lopdf::Document::with_version("1.5");
        let resources = empty_resources();
        let stream = b"1 0 0 1 10 20 cm BT /F1 12 Tf (A) Tj ET";

        let (handler, _) = interpret_ok(&doc, &resources, stream, &default_options());

        assert_eq!(handler.chars.len(), 1);
        assert_eq!(handler.chars[0].ctm, [1.0, 0.0, 0.0, 1.0, 10.0, 20.0]);
    }

    // --- Recursion limit ---

    #[test]
    fn recursion_depth_zero_allowed() {
        let doc = lopdf::Document::with_version("1.5");
        let resources = empty_resources();

        let (result, _, _) =
            interpret_at_depth(&doc, &resources, b"BT ET", &default_options(), 0);

        assert!(result.is_ok());
    }

    #[test]
    fn recursion_depth_exceeds_limit() {
        let doc = lopdf::Document::with_version("1.5");
        let resources = empty_resources();
        let opts = ExtractOptions {
            max_recursion_depth: 3,
            ..ExtractOptions::default()
        };

        // Depth 4 is one past the configured limit of 3.
        let (result, _, _) = interpret_at_depth(&doc, &resources, b"BT ET", &opts, 4);

        assert!(result.is_err());
        let err_msg = result.unwrap_err().to_string();
        assert!(err_msg.contains("recursion depth"));
    }

    // --- Graphics state ---

    #[test]
    fn interpret_q_q_state_save_restore() {
        let doc = lopdf::Document::with_version("1.5");
        let resources = empty_resources();
        // Gray fill is set, then overridden inside q/Q; Q must bring it back.
        let stream = b"0.5 g q 1 0 0 rg Q";

        let (_, gstate) = interpret_ok(&doc, &resources, stream, &default_options());

        assert_eq!(
            gstate.graphics_state().fill_color,
            pdfplumber_core::Color::Gray(0.5)
        );
    }

    // --- CID fonts ---

    /// Builds resources whose `/F1` is a Type0 font (`MSGothic`, `Identity-H`)
    /// with a full ToUnicode CMap and a `CIDSystemInfo` dictionary.
    fn make_cid_font_resources(doc: &mut lopdf::Document) -> lopdf::Dictionary {
        use lopdf::{Object, Stream, dictionary};

        let tounicode_data = b"\
            /CIDInit /ProcSet findresource begin\n\
            12 dict begin\n\
            begincmap\n\
            /CMapName /Adobe-Identity-UCS def\n\
            /CMapType 2 def\n\
            1 begincodespacerange\n\
            <0000> <FFFF>\n\
            endcodespacerange\n\
            2 beginbfchar\n\
            <4E2D> <4E2D>\n\
            <6587> <6587>\n\
            endbfchar\n\
            endcmap\n";
        let tounicode_stream = Stream::new(dictionary! {}, tounicode_data.to_vec());
        let tounicode_id = doc.add_object(Object::Stream(tounicode_stream));

        let cid_font_dict = dictionary! {
            "Type" => "Font",
            "Subtype" => "CIDFontType2",
            "BaseFont" => "MSGothic",
            "DW" => Object::Integer(1000),
            "CIDToGIDMap" => "Identity",
            "CIDSystemInfo" => Object::Dictionary(dictionary! {
                "Registry" => Object::String("Adobe".as_bytes().to_vec(), lopdf::StringFormat::Literal),
                "Ordering" => Object::String("Identity".as_bytes().to_vec(), lopdf::StringFormat::Literal),
                "Supplement" => Object::Integer(0),
            }),
        };
        let cid_font_id = doc.add_object(Object::Dictionary(cid_font_dict));

        let type0_dict = dictionary! {
            "Type" => "Font",
            "Subtype" => "Type0",
            "BaseFont" => "MSGothic",
            "Encoding" => "Identity-H",
            "DescendantFonts" => Object::Array(vec![Object::Reference(cid_font_id)]),
            "ToUnicode" => Object::Reference(tounicode_id),
        };
        let type0_id = doc.add_object(Object::Dictionary(type0_dict));

        dictionary! {
            "Font" => Object::Dictionary(dictionary! {
                "F1" => Object::Reference(type0_id),
            }),
        }
    }

    /// Builds resources whose `/F1` is a minimal Type0 font: a one-entry
    /// ToUnicode CMap, no `CIDSystemInfo`, and the given base font name and
    /// encoding.
    fn make_minimal_type0_resources(
        doc: &mut lopdf::Document,
        base_font: &str,
        encoding: &str,
    ) -> lopdf::Dictionary {
        use lopdf::{Object, Stream, dictionary};

        let tounicode_data = b"\
            beginbfchar\n\
            <4E2D> <4E2D>\n\
            endbfchar\n";
        let tounicode_stream = Stream::new(dictionary! {}, tounicode_data.to_vec());
        let tounicode_id = doc.add_object(Object::Stream(tounicode_stream));

        let cid_font_dict = dictionary! {
            "Type" => "Font",
            "Subtype" => "CIDFontType2",
            "BaseFont" => base_font,
            "DW" => Object::Integer(1000),
            "CIDToGIDMap" => "Identity",
        };
        let cid_font_id = doc.add_object(Object::Dictionary(cid_font_dict));

        let type0_dict = dictionary! {
            "Type" => "Font",
            "Subtype" => "Type0",
            "BaseFont" => base_font,
            "Encoding" => encoding,
            "DescendantFonts" => Object::Array(vec![Object::Reference(cid_font_id)]),
            "ToUnicode" => Object::Reference(tounicode_id),
        };
        let type0_id = doc.add_object(Object::Dictionary(type0_dict));

        dictionary! {
            "Font" => Object::Dictionary(dictionary! {
                "F1" => Object::Reference(type0_id),
            }),
        }
    }

    /// Minimal Type0 font resources using the vertical `Identity-V` encoding.
    fn make_cid_font_resources_identity_v(doc: &mut lopdf::Document) -> lopdf::Dictionary {
        make_minimal_type0_resources(doc, "MSGothic", "Identity-V")
    }

    #[test]
    fn interpret_cid_font_identity_h_two_byte_codes() {
        let mut doc = lopdf::Document::with_version("1.5");
        let resources = make_cid_font_resources(&mut doc);
        // Four bytes, i.e. two 2-byte character codes.
        let stream = b"BT /F1 12 Tf <4E2D6587> Tj ET";

        let (handler, _) = interpret_ok(&doc, &resources, stream, &default_options());

        assert_eq!(handler.chars.len(), 2);
        assert_eq!(handler.chars[0].char_code, 0x4E2D);
        assert_eq!(handler.chars[1].char_code, 0x6587);
        assert_eq!(handler.chars[0].unicode, Some("中".to_string()));
        assert_eq!(handler.chars[1].unicode, Some("文".to_string()));
        assert_eq!(handler.chars[0].font_name, "MSGothic");
    }

    #[test]
    fn interpret_cid_font_tj_array_two_byte_codes() {
        let mut doc = lopdf::Document::with_version("1.5");
        let resources = make_cid_font_resources(&mut doc);
        let stream = b"BT /F1 12 Tf [<4E2D> -100 <6587>] TJ ET";

        let (handler, _) = interpret_ok(&doc, &resources, stream, &default_options());

        assert_eq!(handler.chars.len(), 2);
        assert_eq!(handler.chars[0].char_code, 0x4E2D);
        assert_eq!(handler.chars[1].char_code, 0x6587);
    }

    #[test]
    fn interpret_subset_font_name_stripped() {
        let mut doc = lopdf::Document::with_version("1.5");
        let resources = make_minimal_type0_resources(&mut doc, "ABCDEF+MSGothic", "Identity-H");
        let stream = b"BT /F1 12 Tf <4E2D> Tj ET";

        let (handler, _) = interpret_ok(&doc, &resources, stream, &default_options());

        assert_eq!(handler.chars.len(), 1);
        // The "ABCDEF+" subset prefix must not appear in the reported name.
        assert_eq!(handler.chars[0].font_name, "MSGothic");
    }

    #[test]
    fn interpret_cid_font_identity_v_detected() {
        let mut doc = lopdf::Document::with_version("1.5");
        let resources = make_cid_font_resources_identity_v(&mut doc);
        let stream = b"BT /F1 12 Tf <4E2D> Tj ET";

        let (handler, _) = interpret_ok(&doc, &resources, stream, &default_options());

        assert_eq!(handler.chars.len(), 1);
        assert_eq!(handler.chars[0].char_code, 0x4E2D);
        assert_eq!(handler.chars[0].unicode, Some("中".to_string()));
    }

    // --- Warnings ---

    #[test]
    fn interpret_missing_font_emits_warning() {
        let doc = lopdf::Document::with_version("1.5");
        // No /Font dictionary at all, so /F1 cannot be resolved.
        let resources = empty_resources();
        let stream = b"BT /F1 12 Tf (Hi) Tj ET";

        let (handler, _) = interpret_ok(&doc, &resources, stream, &default_options());

        assert!(!handler.warnings.is_empty());
        assert!(
            handler.warnings[0]
                .description
                .contains("font not found in page resources"),
            "expected 'font not found' warning, got: {}",
            handler.warnings[0].description
        );
        assert_eq!(
            handler.warnings[0].font_name,
            Some("F1".to_string()),
            "warning should include font name"
        );
        assert!(
            handler.warnings[0].operator_index.is_some(),
            "warning should include operator index"
        );

        // Text is still extracted with default metrics.
        assert_eq!(handler.chars.len(), 2);
    }

    #[test]
    fn interpret_no_warnings_when_collection_disabled() {
        let doc = lopdf::Document::with_version("1.5");
        let resources = empty_resources();
        let stream = b"BT /F1 12 Tf (Hi) Tj ET";
        let opts = ExtractOptions {
            collect_warnings: false,
            ..ExtractOptions::default()
        };

        let (handler, _) = interpret_ok(&doc, &resources, stream, &opts);

        assert!(handler.warnings.is_empty());
        // Characters are still extracted.
        assert_eq!(handler.chars.len(), 2);
    }

    #[test]
    fn interpret_warnings_do_not_affect_output() {
        let doc = lopdf::Document::with_version("1.5");
        let resources = empty_resources();
        let stream = b"BT /F1 12 Tf (AB) Tj ET";

        let opts_on = ExtractOptions {
            collect_warnings: true,
            ..ExtractOptions::default()
        };
        let (handler_on, _) = interpret_ok(&doc, &resources, stream, &opts_on);

        let opts_off = ExtractOptions {
            collect_warnings: false,
            ..ExtractOptions::default()
        };
        let (handler_off, _) = interpret_ok(&doc, &resources, stream, &opts_off);

        assert_eq!(handler_on.chars.len(), handler_off.chars.len());
        for (a, b) in handler_on.chars.iter().zip(handler_off.chars.iter()) {
            assert_eq!(a.char_code, b.char_code);
        }
    }

    #[test]
    fn interpret_valid_font_no_warnings() {
        let mut doc = lopdf::Document::with_version("1.5");
        let resources = make_cid_font_resources(&mut doc);
        let stream = b"BT /F1 12 Tf <4E2D> Tj ET";

        let (handler, _) = interpret_ok(&doc, &resources, stream, &default_options());

        assert!(
            handler.warnings.is_empty(),
            "expected no warnings for valid font, got: {:?}",
            handler.warnings
        );
        assert_eq!(handler.chars.len(), 1);
    }
}