1use std::collections::HashMap;
8
9use crate::cid_font::{
10 CidFontMetrics, extract_cid_font_metrics, get_descendant_font, get_type0_encoding,
11 is_type0_font, parse_predefined_cmap_name, strip_subset_prefix,
12};
13use crate::cmap::CMap;
14use crate::error::BackendError;
15use crate::font_metrics::{FontMetrics, extract_font_metrics};
16use crate::handler::{CharEvent, ContentHandler, ImageEvent};
17use crate::interpreter_state::InterpreterState;
18use crate::lopdf_backend::object_to_f64;
19use crate::text_renderer::{
20 TjElement, show_string, show_string_cid, show_string_with_positioning_mode,
21};
22use crate::text_state::TextState;
23use crate::tokenizer::{Operand, tokenize};
24use pdfplumber_core::{ExtractOptions, ExtractWarning};
25
26struct CachedFont {
28 metrics: FontMetrics,
29 cmap: Option<CMap>,
30 base_name: String,
31 cid_metrics: Option<CidFontMetrics>,
33 is_cid_font: bool,
35 #[allow(dead_code)]
38 writing_mode: u8,
39}
40
41#[allow(clippy::too_many_arguments)]
58pub(crate) fn interpret_content_stream(
59 doc: &lopdf::Document,
60 stream_bytes: &[u8],
61 resources: &lopdf::Dictionary,
62 handler: &mut dyn ContentHandler,
63 options: &ExtractOptions,
64 depth: usize,
65 gstate: &mut InterpreterState,
66 tstate: &mut TextState,
67) -> Result<(), BackendError> {
68 if depth > options.max_recursion_depth {
69 return Err(BackendError::Interpreter(format!(
70 "Form XObject recursion depth {} exceeds limit {}",
71 depth, options.max_recursion_depth
72 )));
73 }
74
75 let operators = tokenize(stream_bytes)?;
76 let mut font_cache: HashMap<String, CachedFont> = HashMap::new();
77
78 for (op_index, op) in operators.iter().enumerate() {
79 match op.name.as_str() {
80 "q" => gstate.save_state(),
82 "Q" => {
83 gstate.restore_state();
84 }
85 "cm" => {
86 if op.operands.len() >= 6 {
87 let a = get_f64(&op.operands, 0).unwrap_or(1.0);
88 let b = get_f64(&op.operands, 1).unwrap_or(0.0);
89 let c = get_f64(&op.operands, 2).unwrap_or(0.0);
90 let d = get_f64(&op.operands, 3).unwrap_or(1.0);
91 let e = get_f64(&op.operands, 4).unwrap_or(0.0);
92 let f = get_f64(&op.operands, 5).unwrap_or(0.0);
93 gstate.concat_matrix(a, b, c, d, e, f);
94 }
95 }
96 "w" => {
97 if let Some(v) = get_f64(&op.operands, 0) {
98 gstate.set_line_width(v);
99 }
100 }
101
102 "G" => {
104 if let Some(g) = get_f32(&op.operands, 0) {
105 gstate.set_stroking_gray(g);
106 }
107 }
108 "g" => {
109 if let Some(g) = get_f32(&op.operands, 0) {
110 gstate.set_non_stroking_gray(g);
111 }
112 }
113 "RG" => {
114 if op.operands.len() >= 3 {
115 let r = get_f32(&op.operands, 0).unwrap_or(0.0);
116 let g = get_f32(&op.operands, 1).unwrap_or(0.0);
117 let b = get_f32(&op.operands, 2).unwrap_or(0.0);
118 gstate.set_stroking_rgb(r, g, b);
119 }
120 }
121 "rg" => {
122 if op.operands.len() >= 3 {
123 let r = get_f32(&op.operands, 0).unwrap_or(0.0);
124 let g = get_f32(&op.operands, 1).unwrap_or(0.0);
125 let b = get_f32(&op.operands, 2).unwrap_or(0.0);
126 gstate.set_non_stroking_rgb(r, g, b);
127 }
128 }
129 "K" => {
130 if op.operands.len() >= 4 {
131 let c = get_f32(&op.operands, 0).unwrap_or(0.0);
132 let m = get_f32(&op.operands, 1).unwrap_or(0.0);
133 let y = get_f32(&op.operands, 2).unwrap_or(0.0);
134 let k = get_f32(&op.operands, 3).unwrap_or(0.0);
135 gstate.set_stroking_cmyk(c, m, y, k);
136 }
137 }
138 "k" => {
139 if op.operands.len() >= 4 {
140 let c = get_f32(&op.operands, 0).unwrap_or(0.0);
141 let m = get_f32(&op.operands, 1).unwrap_or(0.0);
142 let y = get_f32(&op.operands, 2).unwrap_or(0.0);
143 let k = get_f32(&op.operands, 3).unwrap_or(0.0);
144 gstate.set_non_stroking_cmyk(c, m, y, k);
145 }
146 }
147 "SC" | "SCN" => {
148 let components: Vec<f32> = op.operands.iter().filter_map(operand_to_f32).collect();
149 gstate.set_stroking_color(&components);
150 }
151 "sc" | "scn" => {
152 let components: Vec<f32> = op.operands.iter().filter_map(operand_to_f32).collect();
153 gstate.set_non_stroking_color(&components);
154 }
155
156 "BT" => tstate.begin_text(),
158 "ET" => tstate.end_text(),
159 "Tf" => {
160 if op.operands.len() >= 2 {
161 let font_name = operand_to_name(&op.operands[0]);
162 let size = get_f64(&op.operands, 1).unwrap_or(0.0);
163 tstate.set_font(font_name.clone(), size);
164 load_font_if_needed(
165 doc,
166 resources,
167 &font_name,
168 &mut font_cache,
169 handler,
170 options,
171 op_index,
172 );
173 }
174 }
175 "Tm" => {
176 if op.operands.len() >= 6 {
177 let a = get_f64(&op.operands, 0).unwrap_or(1.0);
178 let b = get_f64(&op.operands, 1).unwrap_or(0.0);
179 let c = get_f64(&op.operands, 2).unwrap_or(0.0);
180 let d = get_f64(&op.operands, 3).unwrap_or(1.0);
181 let e = get_f64(&op.operands, 4).unwrap_or(0.0);
182 let f = get_f64(&op.operands, 5).unwrap_or(0.0);
183 tstate.set_text_matrix(a, b, c, d, e, f);
184 }
185 }
186 "Td" => {
187 if op.operands.len() >= 2 {
188 let tx = get_f64(&op.operands, 0).unwrap_or(0.0);
189 let ty = get_f64(&op.operands, 1).unwrap_or(0.0);
190 tstate.move_text_position(tx, ty);
191 }
192 }
193 "TD" => {
194 if op.operands.len() >= 2 {
195 let tx = get_f64(&op.operands, 0).unwrap_or(0.0);
196 let ty = get_f64(&op.operands, 1).unwrap_or(0.0);
197 tstate.move_text_position_and_set_leading(tx, ty);
198 }
199 }
200 "T*" => tstate.move_to_next_line(),
201 "Tc" => {
202 if let Some(v) = get_f64(&op.operands, 0) {
203 tstate.set_char_spacing(v);
204 }
205 }
206 "Tw" => {
207 if let Some(v) = get_f64(&op.operands, 0) {
208 tstate.set_word_spacing(v);
209 }
210 }
211 "Tz" => {
212 if let Some(v) = get_f64(&op.operands, 0) {
213 tstate.set_h_scaling(v);
214 }
215 }
216 "TL" => {
217 if let Some(v) = get_f64(&op.operands, 0) {
218 tstate.set_leading(v);
219 }
220 }
221 "Tr" => {
222 if let Some(v) = get_i64(&op.operands, 0) {
223 if let Some(mode) = crate::text_state::TextRenderMode::from_i64(v) {
224 tstate.set_render_mode(mode);
225 }
226 }
227 }
228 "Ts" => {
229 if let Some(v) = get_f64(&op.operands, 0) {
230 tstate.set_rise(v);
231 }
232 }
233
234 "Tj" => {
236 handle_tj(tstate, gstate, handler, &op.operands, &font_cache);
237 }
238 "TJ" => {
239 handle_tj_array(tstate, gstate, handler, &op.operands, &font_cache);
240 }
241 "'" => {
242 tstate.move_to_next_line();
244 handle_tj(tstate, gstate, handler, &op.operands, &font_cache);
245 }
246 "\"" => {
247 if op.operands.len() >= 3 {
249 if let Some(aw) = get_f64(&op.operands, 0) {
250 tstate.set_word_spacing(aw);
251 }
252 if let Some(ac) = get_f64(&op.operands, 1) {
253 tstate.set_char_spacing(ac);
254 }
255 tstate.move_to_next_line();
256 let string_operands = vec![op.operands[2].clone()];
258 handle_tj(tstate, gstate, handler, &string_operands, &font_cache);
259 }
260 }
261
262 "Do" => {
264 if let Some(Operand::Name(name)) = op.operands.first() {
265 handle_do(
266 doc, resources, handler, options, depth, gstate, tstate, name,
267 )?;
268 }
269 }
270
271 _ => {}
273 }
274 }
275
276 Ok(())
277}
278
279fn get_f64(operands: &[Operand], index: usize) -> Option<f64> {
282 operands.get(index).and_then(|o| match o {
283 Operand::Integer(i) => Some(*i as f64),
284 Operand::Real(f) => Some(*f),
285 _ => None,
286 })
287}
288
289fn get_f32(operands: &[Operand], index: usize) -> Option<f32> {
290 get_f64(operands, index).map(|v| v as f32)
291}
292
293fn get_i64(operands: &[Operand], index: usize) -> Option<i64> {
294 operands.get(index).and_then(|o| match o {
295 Operand::Integer(i) => Some(*i),
296 Operand::Real(f) => Some(*f as i64),
297 _ => None,
298 })
299}
300
301fn operand_to_f32(o: &Operand) -> Option<f32> {
302 match o {
303 Operand::Integer(i) => Some(*i as f32),
304 Operand::Real(f) => Some(*f as f32),
305 _ => None,
306 }
307}
308
309fn operand_to_name(o: &Operand) -> String {
310 match o {
311 Operand::Name(n) => n.clone(),
312 _ => String::new(),
313 }
314}
315
316fn operand_to_string_bytes(o: &Operand) -> Option<&[u8]> {
317 match o {
318 Operand::LiteralString(s) | Operand::HexString(s) => Some(s),
319 _ => None,
320 }
321}
322
323#[allow(clippy::too_many_arguments)]
326fn load_font_if_needed(
327 doc: &lopdf::Document,
328 resources: &lopdf::Dictionary,
329 font_name: &str,
330 cache: &mut HashMap<String, CachedFont>,
331 handler: &mut dyn ContentHandler,
332 options: &ExtractOptions,
333 op_index: usize,
334) {
335 if cache.contains_key(font_name) {
336 return;
337 }
338
339 let font_dict = (|| -> Option<&lopdf::Dictionary> {
341 let fonts_obj = resources.get(b"Font").ok()?;
342 let fonts_obj = resolve_ref(doc, fonts_obj);
343 let fonts_dict = fonts_obj.as_dict().ok()?;
344 let font_obj = fonts_dict.get(font_name.as_bytes()).ok()?;
345 let font_obj = resolve_ref(doc, font_obj);
346 font_obj.as_dict().ok()
347 })();
348
349 let (metrics, cmap, base_name, cid_metrics, is_cid_font, writing_mode) =
350 if let Some(fd) = font_dict {
351 if is_type0_font(fd) {
352 let (cid_met, wm) = load_cid_font(doc, fd);
354 let metrics = if let Some(ref cm) = cid_met {
355 FontMetrics::new(
357 Vec::new(),
358 0,
359 0,
360 cm.default_width(),
361 cm.ascent(),
362 cm.descent(),
363 cm.font_bbox(),
364 )
365 } else {
366 if options.collect_warnings {
367 handler.on_warning(ExtractWarning::with_operator_context(
368 "CID font metrics not available, using defaults",
369 op_index,
370 font_name,
371 ));
372 }
373 FontMetrics::default_metrics()
374 };
375
376 let cmap = extract_tounicode_cmap(doc, fd);
378
379 let raw_base_name = fd
380 .get(b"BaseFont")
381 .ok()
382 .and_then(|o| o.as_name_str().ok())
383 .unwrap_or(font_name);
384 let base_name = strip_subset_prefix(raw_base_name).to_string();
385
386 (metrics, cmap, base_name, cid_met, true, wm)
387 } else {
388 let metrics = match extract_font_metrics(doc, fd) {
390 Ok(m) => m,
391 Err(_) => {
392 if options.collect_warnings {
393 handler.on_warning(ExtractWarning::with_operator_context(
394 "failed to extract font metrics, using defaults",
395 op_index,
396 font_name,
397 ));
398 }
399 FontMetrics::default_metrics()
400 }
401 };
402 let cmap = extract_tounicode_cmap(doc, fd);
403 let raw_base_name = fd
404 .get(b"BaseFont")
405 .ok()
406 .and_then(|o| o.as_name_str().ok())
407 .unwrap_or(font_name);
408 let base_name = strip_subset_prefix(raw_base_name).to_string();
409
410 (metrics, cmap, base_name, None, false, 0)
411 }
412 } else {
413 if options.collect_warnings {
415 handler.on_warning(ExtractWarning::with_operator_context(
416 "font not found in page resources, using defaults",
417 op_index,
418 font_name,
419 ));
420 }
421 (
422 FontMetrics::default_metrics(),
423 None,
424 font_name.to_string(),
425 None,
426 false,
427 0,
428 )
429 };
430
431 cache.insert(
432 font_name.to_string(),
433 CachedFont {
434 metrics,
435 cmap,
436 base_name,
437 cid_metrics,
438 is_cid_font,
439 writing_mode,
440 },
441 );
442}
443
444fn extract_tounicode_cmap(doc: &lopdf::Document, fd: &lopdf::Dictionary) -> Option<CMap> {
446 let tounicode_obj = fd.get(b"ToUnicode").ok()?;
447 let tounicode_obj = resolve_ref(doc, tounicode_obj);
448 let stream = tounicode_obj.as_stream().ok()?;
449 let data = decode_stream(stream).ok()?;
450 CMap::parse(&data).ok()
451}
452
453fn load_cid_font(
455 doc: &lopdf::Document,
456 type0_dict: &lopdf::Dictionary,
457) -> (Option<CidFontMetrics>, u8) {
458 let writing_mode = get_type0_encoding(type0_dict)
460 .and_then(|enc| parse_predefined_cmap_name(&enc))
461 .map(|info| info.writing_mode)
462 .unwrap_or(0);
463
464 let cid_metrics = get_descendant_font(doc, type0_dict)
466 .and_then(|desc| extract_cid_font_metrics(doc, desc).ok());
467
468 (cid_metrics, writing_mode)
469}
470
471fn get_width_fn(cached: Option<&CachedFont>) -> Box<dyn Fn(u32) -> f64 + '_> {
476 match cached {
477 Some(cf) if cf.is_cid_font => {
478 if let Some(ref cid_met) = cf.cid_metrics {
479 Box::new(move |code: u32| cid_met.get_width(code))
480 } else {
481 Box::new(move |code: u32| cf.metrics.get_width(code))
482 }
483 }
484 Some(cf) => Box::new(move |code: u32| cf.metrics.get_width(code)),
485 None => {
486 let default_metrics = FontMetrics::default_metrics();
487 Box::new(move |code: u32| default_metrics.get_width(code))
488 }
489 }
490}
491
492fn handle_tj(
493 tstate: &mut TextState,
494 gstate: &InterpreterState,
495 handler: &mut dyn ContentHandler,
496 operands: &[Operand],
497 font_cache: &HashMap<String, CachedFont>,
498) {
499 let string_bytes = match operands.first().and_then(operand_to_string_bytes) {
500 Some(bytes) => bytes,
501 None => return,
502 };
503
504 let cached = font_cache.get(&tstate.font_name);
505 let width_fn = get_width_fn(cached);
506 let is_cid = cached.is_some_and(|c| c.is_cid_font);
507 let raw_chars = if is_cid {
508 show_string_cid(tstate, string_bytes, &*width_fn)
509 } else {
510 show_string(tstate, string_bytes, &*width_fn)
511 };
512
513 emit_char_events(raw_chars, tstate, gstate, handler, cached);
514}
515
516fn handle_tj_array(
517 tstate: &mut TextState,
518 gstate: &InterpreterState,
519 handler: &mut dyn ContentHandler,
520 operands: &[Operand],
521 font_cache: &HashMap<String, CachedFont>,
522) {
523 let array = match operands.first() {
524 Some(Operand::Array(arr)) => arr,
525 _ => return,
526 };
527
528 let elements: Vec<TjElement> = array
530 .iter()
531 .filter_map(|o| match o {
532 Operand::LiteralString(s) | Operand::HexString(s) => Some(TjElement::String(s.clone())),
533 Operand::Integer(i) => Some(TjElement::Adjustment(*i as f64)),
534 Operand::Real(f) => Some(TjElement::Adjustment(*f)),
535 _ => None,
536 })
537 .collect();
538
539 let cached = font_cache.get(&tstate.font_name);
540 let width_fn = get_width_fn(cached);
541 let is_cid = cached.is_some_and(|c| c.is_cid_font);
542 let raw_chars = show_string_with_positioning_mode(tstate, &elements, &*width_fn, is_cid);
543
544 emit_char_events(raw_chars, tstate, gstate, handler, cached);
545}
546
547fn emit_char_events(
548 raw_chars: Vec<crate::text_renderer::RawChar>,
549 tstate: &TextState,
550 gstate: &InterpreterState,
551 handler: &mut dyn ContentHandler,
552 cached: Option<&CachedFont>,
553) {
554 let ctm = gstate.ctm_array();
555 let font_name = cached.map_or_else(|| tstate.font_name.clone(), |c| c.base_name.clone());
556
557 for rc in raw_chars {
558 let unicode = cached.and_then(|c| {
559 c.cmap
560 .as_ref()
561 .and_then(|cm| cm.lookup(rc.char_code).map(|s| s.to_string()))
562 });
563
564 let displacement = match cached {
566 Some(cf) if cf.is_cid_font => cf
567 .cid_metrics
568 .as_ref()
569 .map_or(600.0, |cm| cm.get_width(rc.char_code)),
570 Some(cf) => cf.metrics.get_width(rc.char_code),
571 None => 600.0,
572 };
573
574 handler.on_char(CharEvent {
575 char_code: rc.char_code,
576 unicode,
577 font_name: font_name.clone(),
578 font_size: tstate.font_size,
579 text_matrix: rc.text_matrix,
580 ctm,
581 displacement,
582 char_spacing: tstate.char_spacing,
583 word_spacing: tstate.word_spacing,
584 h_scaling: tstate.h_scaling_normalized(),
585 rise: tstate.rise,
586 });
587 }
588}
589
590#[allow(clippy::too_many_arguments)]
593fn handle_do(
594 doc: &lopdf::Document,
595 resources: &lopdf::Dictionary,
596 handler: &mut dyn ContentHandler,
597 options: &ExtractOptions,
598 depth: usize,
599 gstate: &mut InterpreterState,
600 tstate: &mut TextState,
601 name: &str,
602) -> Result<(), BackendError> {
603 let xobj_dict = resources.get(b"XObject").map_err(|_| {
605 BackendError::Interpreter(format!(
606 "no /XObject dictionary in resources for Do /{name}"
607 ))
608 })?;
609 let xobj_dict = resolve_ref(doc, xobj_dict);
610 let xobj_dict = xobj_dict.as_dict().map_err(|_| {
611 BackendError::Interpreter("/XObject resource is not a dictionary".to_string())
612 })?;
613
614 let xobj_entry = xobj_dict.get(name.as_bytes()).map_err(|_| {
615 BackendError::Interpreter(format!("XObject /{name} not found in resources"))
616 })?;
617
618 let xobj_id = xobj_entry.as_reference().map_err(|_| {
619 BackendError::Interpreter(format!("XObject /{name} is not an indirect reference"))
620 })?;
621
622 let xobj = doc.get_object(xobj_id).map_err(|e| {
623 BackendError::Interpreter(format!("failed to resolve XObject /{name}: {e}"))
624 })?;
625
626 let stream = xobj
627 .as_stream()
628 .map_err(|e| BackendError::Interpreter(format!("XObject /{name} is not a stream: {e}")))?;
629
630 let subtype = stream
631 .dict
632 .get(b"Subtype")
633 .ok()
634 .and_then(|o| o.as_name_str().ok())
635 .unwrap_or("");
636
637 match subtype {
638 "Form" => handle_form_xobject(
639 doc, stream, name, resources, handler, options, depth, gstate, tstate,
640 ),
641 "Image" => {
642 handle_image_xobject(stream, name, gstate, handler);
643 Ok(())
644 }
645 _ => {
646 Ok(())
648 }
649 }
650}
651
652#[allow(clippy::too_many_arguments)]
653fn handle_form_xobject(
654 doc: &lopdf::Document,
655 stream: &lopdf::Stream,
656 name: &str,
657 parent_resources: &lopdf::Dictionary,
658 handler: &mut dyn ContentHandler,
659 options: &ExtractOptions,
660 depth: usize,
661 gstate: &mut InterpreterState,
662 tstate: &mut TextState,
663) -> Result<(), BackendError> {
664 gstate.save_state();
666
667 if let Ok(matrix_obj) = stream.dict.get(b"Matrix") {
669 if let Ok(arr) = matrix_obj.as_array() {
670 if arr.len() == 6 {
671 let vals: Result<Vec<f64>, _> = arr.iter().map(object_to_f64).collect();
672 if let Ok(vals) = vals {
673 gstate.concat_matrix(vals[0], vals[1], vals[2], vals[3], vals[4], vals[5]);
674 }
675 }
676 }
677 }
678
679 let form_resources_dict;
681 let form_resources = if let Ok(res_obj) = stream.dict.get(b"Resources") {
682 let res_obj = resolve_ref(doc, res_obj);
683 match res_obj.as_dict() {
684 Ok(d) => d,
685 Err(_) => parent_resources,
686 }
687 } else {
688 if let Ok(res_ref) = stream.dict.get(b"Resources") {
692 if let Ok(id) = res_ref.as_reference() {
693 if let Ok(obj) = doc.get_object(id) {
694 if let Ok(d) = obj.as_dict() {
695 form_resources_dict = d.clone();
696 &form_resources_dict
697 } else {
698 parent_resources
699 }
700 } else {
701 parent_resources
702 }
703 } else {
704 parent_resources
705 }
706 } else {
707 parent_resources
708 }
709 };
710
711 let content_bytes = decode_stream(stream).map_err(|e| {
713 BackendError::Interpreter(format!("failed to decode Form XObject /{name} stream: {e}"))
714 })?;
715
716 interpret_content_stream(
718 doc,
719 &content_bytes,
720 form_resources,
721 handler,
722 options,
723 depth + 1,
724 gstate,
725 tstate,
726 )?;
727
728 gstate.restore_state();
730
731 Ok(())
732}
733
734fn handle_image_xobject(
735 stream: &lopdf::Stream,
736 name: &str,
737 gstate: &InterpreterState,
738 handler: &mut dyn ContentHandler,
739) {
740 let width = stream
741 .dict
742 .get(b"Width")
743 .ok()
744 .and_then(|o| o.as_i64().ok())
745 .unwrap_or(0) as u32;
746
747 let height = stream
748 .dict
749 .get(b"Height")
750 .ok()
751 .and_then(|o| o.as_i64().ok())
752 .unwrap_or(0) as u32;
753
754 let colorspace = stream
755 .dict
756 .get(b"ColorSpace")
757 .ok()
758 .and_then(|o| o.as_name_str().ok())
759 .map(|s| s.to_string());
760
761 let bits_per_component = stream
762 .dict
763 .get(b"BitsPerComponent")
764 .ok()
765 .and_then(|o| o.as_i64().ok())
766 .map(|v| v as u32);
767
768 handler.on_image(ImageEvent {
769 name: name.to_string(),
770 ctm: gstate.ctm_array(),
771 width,
772 height,
773 colorspace,
774 bits_per_component,
775 });
776}
777
778fn resolve_ref<'a>(doc: &'a lopdf::Document, obj: &'a lopdf::Object) -> &'a lopdf::Object {
783 match obj {
784 lopdf::Object::Reference(id) => doc.get_object(*id).unwrap_or(obj),
785 _ => obj,
786 }
787}
788
789fn decode_stream(stream: &lopdf::Stream) -> Result<Vec<u8>, BackendError> {
791 if stream.dict.get(b"Filter").is_ok() {
793 stream
794 .decompressed_content()
795 .map_err(|e| BackendError::Interpreter(format!("stream decompression failed: {e}")))
796 } else {
797 Ok(stream.content.clone())
798 }
799}
800
#[cfg(test)]
mod tests {
    use super::*;
    use crate::handler::{CharEvent, ContentHandler, ImageEvent};

    /// Handler that records every event it receives so tests can inspect them.
    struct CollectingHandler {
        chars: Vec<CharEvent>,
        images: Vec<ImageEvent>,
        warnings: Vec<ExtractWarning>,
    }

    impl CollectingHandler {
        fn new() -> Self {
            Self {
                chars: Vec::new(),
                images: Vec::new(),
                warnings: Vec::new(),
            }
        }
    }

    impl ContentHandler for CollectingHandler {
        fn on_char(&mut self, event: CharEvent) {
            self.chars.push(event);
        }
        fn on_image(&mut self, event: ImageEvent) {
            self.images.push(event);
        }
        fn on_warning(&mut self, warning: ExtractWarning) {
            self.warnings.push(warning);
        }
    }

    fn empty_resources() -> lopdf::Dictionary {
        lopdf::Dictionary::new()
    }

    fn default_options() -> ExtractOptions {
        ExtractOptions::default()
    }

    /// Runs the interpreter on `stream` with fresh handler, graphics and text
    /// state, returning the raw result plus the handler and final graphics state.
    fn interpret_at_depth(
        doc: &lopdf::Document,
        resources: &lopdf::Dictionary,
        stream: &[u8],
        options: &ExtractOptions,
        depth: usize,
    ) -> (Result<(), BackendError>, CollectingHandler, InterpreterState) {
        let mut handler = CollectingHandler::new();
        let mut gstate = InterpreterState::new();
        let mut tstate = TextState::new();

        let result = interpret_content_stream(
            doc,
            stream,
            resources,
            &mut handler,
            options,
            depth,
            &mut gstate,
            &mut tstate,
        );

        (result, handler, gstate)
    }

    /// Runs the interpreter at depth 0 and panics if it reports an error.
    fn interpret(
        doc: &lopdf::Document,
        resources: &lopdf::Dictionary,
        stream: &[u8],
        options: &ExtractOptions,
    ) -> (CollectingHandler, InterpreterState) {
        let (result, handler, gstate) = interpret_at_depth(doc, resources, stream, options, 0);
        result.unwrap();
        (handler, gstate)
    }

    /// Complete ToUnicode CMap mapping <4E2D> and <6587> to themselves.
    const FULL_TOUNICODE: &[u8] = b"\
        /CIDInit /ProcSet findresource begin\n\
        12 dict begin\n\
        begincmap\n\
        /CMapName /Adobe-Identity-UCS def\n\
        /CMapType 2 def\n\
        1 begincodespacerange\n\
        <0000> <FFFF>\n\
        endcodespacerange\n\
        2 beginbfchar\n\
        <4E2D> <4E2D>\n\
        <6587> <6587>\n\
        endbfchar\n\
        endcmap\n";

    /// Bare `bfchar` section mapping only <4E2D> to itself.
    const MINIMAL_TOUNICODE: &[u8] = b"\
        beginbfchar\n\
        <4E2D> <4E2D>\n\
        endbfchar\n";

    /// Adds a Type0 font (with a CIDFontType2 descendant and a ToUnicode stream)
    /// to `doc` and returns a resource dictionary exposing it as `/F1`.
    fn type0_font_resources(
        doc: &mut lopdf::Document,
        base_font: &str,
        encoding: &str,
        tounicode_data: &[u8],
        with_cid_system_info: bool,
    ) -> lopdf::Dictionary {
        use lopdf::{Object, Stream, dictionary};

        let tounicode_stream = Stream::new(dictionary! {}, tounicode_data.to_vec());
        let tounicode_id = doc.add_object(Object::Stream(tounicode_stream));

        let mut cid_font_dict = dictionary! {
            "Type" => "Font",
            "Subtype" => "CIDFontType2",
            "BaseFont" => base_font,
            "DW" => Object::Integer(1000),
            "CIDToGIDMap" => "Identity",
        };
        if with_cid_system_info {
            cid_font_dict.set(
                "CIDSystemInfo",
                Object::Dictionary(dictionary! {
                    "Registry" => Object::String("Adobe".as_bytes().to_vec(), lopdf::StringFormat::Literal),
                    "Ordering" => Object::String("Identity".as_bytes().to_vec(), lopdf::StringFormat::Literal),
                    "Supplement" => Object::Integer(0),
                }),
            );
        }
        let cid_font_id = doc.add_object(Object::Dictionary(cid_font_dict));

        let type0_dict = dictionary! {
            "Type" => "Font",
            "Subtype" => "Type0",
            "BaseFont" => base_font,
            "Encoding" => encoding,
            "DescendantFonts" => Object::Array(vec![Object::Reference(cid_font_id)]),
            "ToUnicode" => Object::Reference(tounicode_id),
        };
        let type0_id = doc.add_object(Object::Dictionary(type0_dict));

        dictionary! {
            "Font" => Object::Dictionary(dictionary! {
                "F1" => Object::Reference(type0_id),
            }),
        }
    }

    fn make_cid_font_resources(doc: &mut lopdf::Document) -> lopdf::Dictionary {
        type0_font_resources(doc, "MSGothic", "Identity-H", FULL_TOUNICODE, true)
    }

    fn make_cid_font_resources_identity_v(doc: &mut lopdf::Document) -> lopdf::Dictionary {
        type0_font_resources(doc, "MSGothic", "Identity-V", MINIMAL_TOUNICODE, false)
    }

    #[test]
    fn interpret_simple_text() {
        let doc = lopdf::Document::with_version("1.5");
        let (handler, _) = interpret(
            &doc,
            &empty_resources(),
            b"BT /F1 12 Tf 72 700 Td (Hello) Tj ET",
            &default_options(),
        );

        assert_eq!(handler.chars.len(), 5);
        assert_eq!(handler.chars[0].char_code, b'H' as u32);
        assert_eq!(handler.chars[1].char_code, b'e' as u32);
        assert_eq!(handler.chars[4].char_code, b'o' as u32);
        assert_eq!(handler.chars[0].font_size, 12.0);
    }

    #[test]
    fn interpret_tj_array() {
        let doc = lopdf::Document::with_version("1.5");
        let (handler, _) = interpret(
            &doc,
            &empty_resources(),
            b"BT /F1 12 Tf [(H) -20 (i)] TJ ET",
            &default_options(),
        );

        assert_eq!(handler.chars.len(), 2);
        assert_eq!(handler.chars[0].char_code, b'H' as u32);
        assert_eq!(handler.chars[1].char_code, b'i' as u32);
    }

    #[test]
    fn interpret_ctm_passed_to_char_events() {
        let doc = lopdf::Document::with_version("1.5");
        let (handler, _) = interpret(
            &doc,
            &empty_resources(),
            b"1 0 0 1 10 20 cm BT /F1 12 Tf (A) Tj ET",
            &default_options(),
        );

        assert_eq!(handler.chars.len(), 1);
        assert_eq!(handler.chars[0].ctm, [1.0, 0.0, 0.0, 1.0, 10.0, 20.0]);
    }

    #[test]
    fn recursion_depth_zero_allowed() {
        let doc = lopdf::Document::with_version("1.5");
        let (result, _, _) =
            interpret_at_depth(&doc, &empty_resources(), b"BT ET", &default_options(), 0);
        assert!(result.is_ok());
    }

    #[test]
    fn recursion_depth_exceeds_limit() {
        let doc = lopdf::Document::with_version("1.5");
        let opts = ExtractOptions {
            max_recursion_depth: 3,
            ..ExtractOptions::default()
        };

        // Depth 4 is one past the configured limit of 3.
        let (result, _, _) = interpret_at_depth(&doc, &empty_resources(), b"BT ET", &opts, 4);

        assert!(result.is_err());
        let err_msg = result.unwrap_err().to_string();
        assert!(err_msg.contains("recursion depth"));
    }

    #[test]
    fn interpret_q_q_state_save_restore() {
        let doc = lopdf::Document::with_version("1.5");
        let (_, gstate) = interpret(
            &doc,
            &empty_resources(),
            b"0.5 g q 1 0 0 rg Q",
            &default_options(),
        );

        // The red fill set inside q/Q must be discarded by Q.
        assert_eq!(
            gstate.graphics_state().fill_color,
            pdfplumber_core::Color::Gray(0.5)
        );
    }

    #[test]
    fn interpret_cid_font_identity_h_two_byte_codes() {
        let mut doc = lopdf::Document::with_version("1.5");
        let resources = make_cid_font_resources(&mut doc);

        let (handler, _) = interpret(
            &doc,
            &resources,
            b"BT /F1 12 Tf <4E2D6587> Tj ET",
            &default_options(),
        );

        assert_eq!(handler.chars.len(), 2);
        assert_eq!(handler.chars[0].char_code, 0x4E2D);
        assert_eq!(handler.chars[1].char_code, 0x6587);
        assert_eq!(handler.chars[0].unicode, Some("中".to_string()));
        assert_eq!(handler.chars[1].unicode, Some("文".to_string()));
        assert_eq!(handler.chars[0].font_name, "MSGothic");
    }

    #[test]
    fn interpret_cid_font_tj_array_two_byte_codes() {
        let mut doc = lopdf::Document::with_version("1.5");
        let resources = make_cid_font_resources(&mut doc);

        let (handler, _) = interpret(
            &doc,
            &resources,
            b"BT /F1 12 Tf [<4E2D> -100 <6587>] TJ ET",
            &default_options(),
        );

        assert_eq!(handler.chars.len(), 2);
        assert_eq!(handler.chars[0].char_code, 0x4E2D);
        assert_eq!(handler.chars[1].char_code, 0x6587);
    }

    #[test]
    fn interpret_subset_font_name_stripped() {
        let mut doc = lopdf::Document::with_version("1.5");
        let resources = type0_font_resources(
            &mut doc,
            "ABCDEF+MSGothic",
            "Identity-H",
            MINIMAL_TOUNICODE,
            false,
        );

        let (handler, _) = interpret(
            &doc,
            &resources,
            b"BT /F1 12 Tf <4E2D> Tj ET",
            &default_options(),
        );

        assert_eq!(handler.chars.len(), 1);
        assert_eq!(handler.chars[0].font_name, "MSGothic");
    }

    #[test]
    fn interpret_cid_font_identity_v_detected() {
        let mut doc = lopdf::Document::with_version("1.5");
        let resources = make_cid_font_resources_identity_v(&mut doc);

        let (handler, _) = interpret(
            &doc,
            &resources,
            b"BT /F1 12 Tf <4E2D> Tj ET",
            &default_options(),
        );

        assert_eq!(handler.chars.len(), 1);
        assert_eq!(handler.chars[0].char_code, 0x4E2D);
        assert_eq!(handler.chars[0].unicode, Some("中".to_string()));
    }

    #[test]
    fn interpret_missing_font_emits_warning() {
        let doc = lopdf::Document::with_version("1.5");
        let (handler, _) = interpret(
            &doc,
            &empty_resources(),
            b"BT /F1 12 Tf (Hi) Tj ET",
            &default_options(),
        );

        assert!(!handler.warnings.is_empty());
        assert!(
            handler.warnings[0]
                .description
                .contains("font not found in page resources"),
            "expected 'font not found' warning, got: {}",
            handler.warnings[0].description
        );
        assert_eq!(
            handler.warnings[0].font_name,
            Some("F1".to_string()),
            "warning should include font name"
        );
        assert!(
            handler.warnings[0].operator_index.is_some(),
            "warning should include operator index"
        );

        // Characters are still produced with default metrics.
        assert_eq!(handler.chars.len(), 2);
    }

    #[test]
    fn interpret_no_warnings_when_collection_disabled() {
        let doc = lopdf::Document::with_version("1.5");
        let opts = ExtractOptions {
            collect_warnings: false,
            ..ExtractOptions::default()
        };

        let (handler, _) = interpret(&doc, &empty_resources(), b"BT /F1 12 Tf (Hi) Tj ET", &opts);

        assert!(handler.warnings.is_empty());
        assert_eq!(handler.chars.len(), 2);
    }

    #[test]
    fn interpret_warnings_do_not_affect_output() {
        let doc = lopdf::Document::with_version("1.5");
        let resources = empty_resources();
        let stream = b"BT /F1 12 Tf (AB) Tj ET";

        let opts_on = ExtractOptions {
            collect_warnings: true,
            ..ExtractOptions::default()
        };
        let (handler_on, _) = interpret(&doc, &resources, stream, &opts_on);

        let opts_off = ExtractOptions {
            collect_warnings: false,
            ..ExtractOptions::default()
        };
        let (handler_off, _) = interpret(&doc, &resources, stream, &opts_off);

        assert_eq!(handler_on.chars.len(), handler_off.chars.len());
        for (a, b) in handler_on.chars.iter().zip(handler_off.chars.iter()) {
            assert_eq!(a.char_code, b.char_code);
        }
    }

    #[test]
    fn interpret_valid_font_no_warnings() {
        let mut doc = lopdf::Document::with_version("1.5");
        let resources = make_cid_font_resources(&mut doc);

        let (handler, _) = interpret(
            &doc,
            &resources,
            b"BT /F1 12 Tf <4E2D> Tj ET",
            &default_options(),
        );

        assert!(
            handler.warnings.is_empty(),
            "expected no warnings for valid font, got: {:?}",
            handler.warnings
        );
        assert_eq!(handler.chars.len(), 1);
    }
}