Skip to main content

pdfplumber_parse/
interpreter.rs

1//! Content stream interpreter.
2//!
3//! Interprets tokenized PDF content stream operators, maintaining graphics and
4//! text state, and emitting events to a [`ContentHandler`]. Handles Form XObject
5//! recursion via the `Do` operator.
6
7use std::collections::HashMap;
8
9use crate::cid_font::{
10    CidFontMetrics, extract_cid_font_metrics, get_descendant_font, get_type0_encoding,
11    is_type0_font, parse_predefined_cmap_name, strip_subset_prefix,
12};
13use crate::cmap::CMap;
14use crate::error::BackendError;
15use crate::font_metrics::{FontMetrics, extract_font_metrics};
16use crate::handler::{CharEvent, ContentHandler, ImageEvent};
17use crate::interpreter_state::InterpreterState;
18use crate::lopdf_backend::object_to_f64;
19use crate::text_renderer::{
20    TjElement, show_string, show_string_cid, show_string_with_positioning_mode,
21};
22use crate::text_state::TextState;
23use crate::tokenizer::{Operand, tokenize};
24use pdfplumber_core::{ExtractOptions, ExtractWarning};
25
/// Cached font information for the interpreter.
///
/// One value is stored per font resource name the first time that name is
/// selected with `Tf`, so later text-showing operators can look up widths and
/// Unicode mappings without re-reading the font dictionary.
struct CachedFont {
    /// Glyph metrics used for width lookup on simple fonts, and as the
    /// fallback for CID fonts whose CID metrics could not be loaded.
    metrics: FontMetrics,
    /// Parsed `/ToUnicode` CMap, if the font has one that could be decoded.
    cmap: Option<CMap>,
    /// `/BaseFont` name with any subset prefix stripped; falls back to the
    /// resource name when `/BaseFont` is missing or the font was not found.
    base_name: String,
    /// CID font metrics (present for Type0/CID fonts).
    cid_metrics: Option<CidFontMetrics>,
    /// Whether this is a CID (composite/Type0) font.
    is_cid_font: bool,
    /// Writing mode: 0 = horizontal, 1 = vertical.
    /// Used in US-041 for vertical writing mode support.
    // Populated when the font is loaded but not read anywhere in this file
    // yet, hence the lint suppression.
    #[allow(dead_code)]
    writing_mode: u8,
}
40
41/// Interpret a content stream and emit events to the handler.
42///
43/// Processes tokenized PDF operators, updates graphics/text state, and calls
44/// handler methods for text, path, and image events. Handles Form XObject
45/// recursion via the `Do` operator.
46///
47/// # Arguments
48///
49/// * `doc` - The lopdf document (for resolving references)
50/// * `stream_bytes` - Decoded content stream bytes
51/// * `resources` - Resources dictionary for this scope
52/// * `handler` - Event callback handler
53/// * `options` - Resource limits and settings
54/// * `depth` - Current recursion depth (0 for page-level)
55/// * `gstate` - Current graphics/interpreter state
56/// * `tstate` - Current text state
57#[allow(clippy::too_many_arguments)]
58pub(crate) fn interpret_content_stream(
59    doc: &lopdf::Document,
60    stream_bytes: &[u8],
61    resources: &lopdf::Dictionary,
62    handler: &mut dyn ContentHandler,
63    options: &ExtractOptions,
64    depth: usize,
65    gstate: &mut InterpreterState,
66    tstate: &mut TextState,
67) -> Result<(), BackendError> {
68    if depth > options.max_recursion_depth {
69        return Err(BackendError::Interpreter(format!(
70            "Form XObject recursion depth {} exceeds limit {}",
71            depth, options.max_recursion_depth
72        )));
73    }
74
75    let operators = tokenize(stream_bytes)?;
76    let mut font_cache: HashMap<String, CachedFont> = HashMap::new();
77
78    for (op_index, op) in operators.iter().enumerate() {
79        match op.name.as_str() {
80            // --- Graphics state operators ---
81            "q" => gstate.save_state(),
82            "Q" => {
83                gstate.restore_state();
84            }
85            "cm" => {
86                if op.operands.len() >= 6 {
87                    let a = get_f64(&op.operands, 0).unwrap_or(1.0);
88                    let b = get_f64(&op.operands, 1).unwrap_or(0.0);
89                    let c = get_f64(&op.operands, 2).unwrap_or(0.0);
90                    let d = get_f64(&op.operands, 3).unwrap_or(1.0);
91                    let e = get_f64(&op.operands, 4).unwrap_or(0.0);
92                    let f = get_f64(&op.operands, 5).unwrap_or(0.0);
93                    gstate.concat_matrix(a, b, c, d, e, f);
94                }
95            }
96            "w" => {
97                if let Some(v) = get_f64(&op.operands, 0) {
98                    gstate.set_line_width(v);
99                }
100            }
101
102            // --- Color operators ---
103            "G" => {
104                if let Some(g) = get_f32(&op.operands, 0) {
105                    gstate.set_stroking_gray(g);
106                }
107            }
108            "g" => {
109                if let Some(g) = get_f32(&op.operands, 0) {
110                    gstate.set_non_stroking_gray(g);
111                }
112            }
113            "RG" => {
114                if op.operands.len() >= 3 {
115                    let r = get_f32(&op.operands, 0).unwrap_or(0.0);
116                    let g = get_f32(&op.operands, 1).unwrap_or(0.0);
117                    let b = get_f32(&op.operands, 2).unwrap_or(0.0);
118                    gstate.set_stroking_rgb(r, g, b);
119                }
120            }
121            "rg" => {
122                if op.operands.len() >= 3 {
123                    let r = get_f32(&op.operands, 0).unwrap_or(0.0);
124                    let g = get_f32(&op.operands, 1).unwrap_or(0.0);
125                    let b = get_f32(&op.operands, 2).unwrap_or(0.0);
126                    gstate.set_non_stroking_rgb(r, g, b);
127                }
128            }
129            "K" => {
130                if op.operands.len() >= 4 {
131                    let c = get_f32(&op.operands, 0).unwrap_or(0.0);
132                    let m = get_f32(&op.operands, 1).unwrap_or(0.0);
133                    let y = get_f32(&op.operands, 2).unwrap_or(0.0);
134                    let k = get_f32(&op.operands, 3).unwrap_or(0.0);
135                    gstate.set_stroking_cmyk(c, m, y, k);
136                }
137            }
138            "k" => {
139                if op.operands.len() >= 4 {
140                    let c = get_f32(&op.operands, 0).unwrap_or(0.0);
141                    let m = get_f32(&op.operands, 1).unwrap_or(0.0);
142                    let y = get_f32(&op.operands, 2).unwrap_or(0.0);
143                    let k = get_f32(&op.operands, 3).unwrap_or(0.0);
144                    gstate.set_non_stroking_cmyk(c, m, y, k);
145                }
146            }
147            "SC" | "SCN" => {
148                let components: Vec<f32> = op.operands.iter().filter_map(operand_to_f32).collect();
149                gstate.set_stroking_color(&components);
150            }
151            "sc" | "scn" => {
152                let components: Vec<f32> = op.operands.iter().filter_map(operand_to_f32).collect();
153                gstate.set_non_stroking_color(&components);
154            }
155
156            // --- Text state operators ---
157            "BT" => tstate.begin_text(),
158            "ET" => tstate.end_text(),
159            "Tf" => {
160                if op.operands.len() >= 2 {
161                    let font_name = operand_to_name(&op.operands[0]);
162                    let size = get_f64(&op.operands, 1).unwrap_or(0.0);
163                    tstate.set_font(font_name.clone(), size);
164                    load_font_if_needed(
165                        doc,
166                        resources,
167                        &font_name,
168                        &mut font_cache,
169                        handler,
170                        options,
171                        op_index,
172                    );
173                }
174            }
175            "Tm" => {
176                if op.operands.len() >= 6 {
177                    let a = get_f64(&op.operands, 0).unwrap_or(1.0);
178                    let b = get_f64(&op.operands, 1).unwrap_or(0.0);
179                    let c = get_f64(&op.operands, 2).unwrap_or(0.0);
180                    let d = get_f64(&op.operands, 3).unwrap_or(1.0);
181                    let e = get_f64(&op.operands, 4).unwrap_or(0.0);
182                    let f = get_f64(&op.operands, 5).unwrap_or(0.0);
183                    tstate.set_text_matrix(a, b, c, d, e, f);
184                }
185            }
186            "Td" => {
187                if op.operands.len() >= 2 {
188                    let tx = get_f64(&op.operands, 0).unwrap_or(0.0);
189                    let ty = get_f64(&op.operands, 1).unwrap_or(0.0);
190                    tstate.move_text_position(tx, ty);
191                }
192            }
193            "TD" => {
194                if op.operands.len() >= 2 {
195                    let tx = get_f64(&op.operands, 0).unwrap_or(0.0);
196                    let ty = get_f64(&op.operands, 1).unwrap_or(0.0);
197                    tstate.move_text_position_and_set_leading(tx, ty);
198                }
199            }
200            "T*" => tstate.move_to_next_line(),
201            "Tc" => {
202                if let Some(v) = get_f64(&op.operands, 0) {
203                    tstate.set_char_spacing(v);
204                }
205            }
206            "Tw" => {
207                if let Some(v) = get_f64(&op.operands, 0) {
208                    tstate.set_word_spacing(v);
209                }
210            }
211            "Tz" => {
212                if let Some(v) = get_f64(&op.operands, 0) {
213                    tstate.set_h_scaling(v);
214                }
215            }
216            "TL" => {
217                if let Some(v) = get_f64(&op.operands, 0) {
218                    tstate.set_leading(v);
219                }
220            }
221            "Tr" => {
222                if let Some(v) = get_i64(&op.operands, 0) {
223                    if let Some(mode) = crate::text_state::TextRenderMode::from_i64(v) {
224                        tstate.set_render_mode(mode);
225                    }
226                }
227            }
228            "Ts" => {
229                if let Some(v) = get_f64(&op.operands, 0) {
230                    tstate.set_rise(v);
231                }
232            }
233
234            // --- Text rendering operators ---
235            "Tj" => {
236                handle_tj(tstate, gstate, handler, &op.operands, &font_cache);
237            }
238            "TJ" => {
239                handle_tj_array(tstate, gstate, handler, &op.operands, &font_cache);
240            }
241            "'" => {
242                // T* then Tj
243                tstate.move_to_next_line();
244                handle_tj(tstate, gstate, handler, &op.operands, &font_cache);
245            }
246            "\"" => {
247                // aw ac (string) "
248                if op.operands.len() >= 3 {
249                    if let Some(aw) = get_f64(&op.operands, 0) {
250                        tstate.set_word_spacing(aw);
251                    }
252                    if let Some(ac) = get_f64(&op.operands, 1) {
253                        tstate.set_char_spacing(ac);
254                    }
255                    tstate.move_to_next_line();
256                    // Show the string (3rd operand)
257                    let string_operands = vec![op.operands[2].clone()];
258                    handle_tj(tstate, gstate, handler, &string_operands, &font_cache);
259                }
260            }
261
262            // --- XObject operator ---
263            "Do" => {
264                if let Some(Operand::Name(name)) = op.operands.first() {
265                    handle_do(
266                        doc, resources, handler, options, depth, gstate, tstate, name,
267                    )?;
268                }
269            }
270
271            // Other operators (paths, etc.) - not yet handled for this story
272            _ => {}
273        }
274    }
275
276    Ok(())
277}
278
279// --- Operand extraction helpers ---
280
281fn get_f64(operands: &[Operand], index: usize) -> Option<f64> {
282    operands.get(index).and_then(|o| match o {
283        Operand::Integer(i) => Some(*i as f64),
284        Operand::Real(f) => Some(*f),
285        _ => None,
286    })
287}
288
289fn get_f32(operands: &[Operand], index: usize) -> Option<f32> {
290    get_f64(operands, index).map(|v| v as f32)
291}
292
293fn get_i64(operands: &[Operand], index: usize) -> Option<i64> {
294    operands.get(index).and_then(|o| match o {
295        Operand::Integer(i) => Some(*i),
296        Operand::Real(f) => Some(*f as i64),
297        _ => None,
298    })
299}
300
301fn operand_to_f32(o: &Operand) -> Option<f32> {
302    match o {
303        Operand::Integer(i) => Some(*i as f32),
304        Operand::Real(f) => Some(*f as f32),
305        _ => None,
306    }
307}
308
309fn operand_to_name(o: &Operand) -> String {
310    match o {
311        Operand::Name(n) => n.clone(),
312        _ => String::new(),
313    }
314}
315
316fn operand_to_string_bytes(o: &Operand) -> Option<&[u8]> {
317    match o {
318        Operand::LiteralString(s) | Operand::HexString(s) => Some(s),
319        _ => None,
320    }
321}
322
323// --- Font loading ---
324
325#[allow(clippy::too_many_arguments)]
326fn load_font_if_needed(
327    doc: &lopdf::Document,
328    resources: &lopdf::Dictionary,
329    font_name: &str,
330    cache: &mut HashMap<String, CachedFont>,
331    handler: &mut dyn ContentHandler,
332    options: &ExtractOptions,
333    op_index: usize,
334) {
335    if cache.contains_key(font_name) {
336        return;
337    }
338
339    // Look up /Resources/Font/<font_name>
340    let font_dict = (|| -> Option<&lopdf::Dictionary> {
341        let fonts_obj = resources.get(b"Font").ok()?;
342        let fonts_obj = resolve_ref(doc, fonts_obj);
343        let fonts_dict = fonts_obj.as_dict().ok()?;
344        let font_obj = fonts_dict.get(font_name.as_bytes()).ok()?;
345        let font_obj = resolve_ref(doc, font_obj);
346        font_obj.as_dict().ok()
347    })();
348
349    let (metrics, cmap, base_name, cid_metrics, is_cid_font, writing_mode) =
350        if let Some(fd) = font_dict {
351            if is_type0_font(fd) {
352                // Type0 (composite/CID) font
353                let (cid_met, wm) = load_cid_font(doc, fd);
354                let metrics = if let Some(ref cm) = cid_met {
355                    // Create a FontMetrics from CID font data for backward compat
356                    FontMetrics::new(
357                        Vec::new(),
358                        0,
359                        0,
360                        cm.default_width(),
361                        cm.ascent(),
362                        cm.descent(),
363                        cm.font_bbox(),
364                    )
365                } else {
366                    if options.collect_warnings {
367                        handler.on_warning(ExtractWarning::with_operator_context(
368                            "CID font metrics not available, using defaults",
369                            op_index,
370                            font_name,
371                        ));
372                    }
373                    FontMetrics::default_metrics()
374                };
375
376                // Extract ToUnicode CMap if present
377                let cmap = extract_tounicode_cmap(doc, fd);
378
379                let raw_base_name = fd
380                    .get(b"BaseFont")
381                    .ok()
382                    .and_then(|o| o.as_name_str().ok())
383                    .unwrap_or(font_name);
384                let base_name = strip_subset_prefix(raw_base_name).to_string();
385
386                (metrics, cmap, base_name, cid_met, true, wm)
387            } else {
388                // Simple font
389                let metrics = match extract_font_metrics(doc, fd) {
390                    Ok(m) => m,
391                    Err(_) => {
392                        if options.collect_warnings {
393                            handler.on_warning(ExtractWarning::with_operator_context(
394                                "failed to extract font metrics, using defaults",
395                                op_index,
396                                font_name,
397                            ));
398                        }
399                        FontMetrics::default_metrics()
400                    }
401                };
402                let cmap = extract_tounicode_cmap(doc, fd);
403                let raw_base_name = fd
404                    .get(b"BaseFont")
405                    .ok()
406                    .and_then(|o| o.as_name_str().ok())
407                    .unwrap_or(font_name);
408                let base_name = strip_subset_prefix(raw_base_name).to_string();
409
410                (metrics, cmap, base_name, None, false, 0)
411            }
412        } else {
413            // Font not found in page resources — use defaults
414            if options.collect_warnings {
415                handler.on_warning(ExtractWarning::with_operator_context(
416                    "font not found in page resources, using defaults",
417                    op_index,
418                    font_name,
419                ));
420            }
421            (
422                FontMetrics::default_metrics(),
423                None,
424                font_name.to_string(),
425                None,
426                false,
427                0,
428            )
429        };
430
431    cache.insert(
432        font_name.to_string(),
433        CachedFont {
434            metrics,
435            cmap,
436            base_name,
437            cid_metrics,
438            is_cid_font,
439            writing_mode,
440        },
441    );
442}
443
444/// Extract ToUnicode CMap from a font dictionary.
445fn extract_tounicode_cmap(doc: &lopdf::Document, fd: &lopdf::Dictionary) -> Option<CMap> {
446    let tounicode_obj = fd.get(b"ToUnicode").ok()?;
447    let tounicode_obj = resolve_ref(doc, tounicode_obj);
448    let stream = tounicode_obj.as_stream().ok()?;
449    let data = decode_stream(stream).ok()?;
450    CMap::parse(&data).ok()
451}
452
453/// Load CID font information from a Type0 font dictionary.
454fn load_cid_font(
455    doc: &lopdf::Document,
456    type0_dict: &lopdf::Dictionary,
457) -> (Option<CidFontMetrics>, u8) {
458    // Determine writing mode from encoding name
459    let writing_mode = get_type0_encoding(type0_dict)
460        .and_then(|enc| parse_predefined_cmap_name(&enc))
461        .map(|info| info.writing_mode)
462        .unwrap_or(0);
463
464    // Get descendant CIDFont dictionary
465    let cid_metrics = get_descendant_font(doc, type0_dict)
466        .and_then(|desc| extract_cid_font_metrics(doc, desc).ok());
467
468    (cid_metrics, writing_mode)
469}
470
471// --- Text rendering ---
472
473/// Build a width lookup function for a cached font.
474/// For CID fonts, uses CidFontMetrics; for simple fonts, uses FontMetrics.
475fn get_width_fn(cached: Option<&CachedFont>) -> Box<dyn Fn(u32) -> f64 + '_> {
476    match cached {
477        Some(cf) if cf.is_cid_font => {
478            if let Some(ref cid_met) = cf.cid_metrics {
479                Box::new(move |code: u32| cid_met.get_width(code))
480            } else {
481                Box::new(move |code: u32| cf.metrics.get_width(code))
482            }
483        }
484        Some(cf) => Box::new(move |code: u32| cf.metrics.get_width(code)),
485        None => {
486            let default_metrics = FontMetrics::default_metrics();
487            Box::new(move |code: u32| default_metrics.get_width(code))
488        }
489    }
490}
491
492fn handle_tj(
493    tstate: &mut TextState,
494    gstate: &InterpreterState,
495    handler: &mut dyn ContentHandler,
496    operands: &[Operand],
497    font_cache: &HashMap<String, CachedFont>,
498) {
499    let string_bytes = match operands.first().and_then(operand_to_string_bytes) {
500        Some(bytes) => bytes,
501        None => return,
502    };
503
504    let cached = font_cache.get(&tstate.font_name);
505    let width_fn = get_width_fn(cached);
506    let is_cid = cached.is_some_and(|c| c.is_cid_font);
507    let raw_chars = if is_cid {
508        show_string_cid(tstate, string_bytes, &*width_fn)
509    } else {
510        show_string(tstate, string_bytes, &*width_fn)
511    };
512
513    emit_char_events(raw_chars, tstate, gstate, handler, cached);
514}
515
516fn handle_tj_array(
517    tstate: &mut TextState,
518    gstate: &InterpreterState,
519    handler: &mut dyn ContentHandler,
520    operands: &[Operand],
521    font_cache: &HashMap<String, CachedFont>,
522) {
523    let array = match operands.first() {
524        Some(Operand::Array(arr)) => arr,
525        _ => return,
526    };
527
528    // Convert Operand array to TjElement array
529    let elements: Vec<TjElement> = array
530        .iter()
531        .filter_map(|o| match o {
532            Operand::LiteralString(s) | Operand::HexString(s) => Some(TjElement::String(s.clone())),
533            Operand::Integer(i) => Some(TjElement::Adjustment(*i as f64)),
534            Operand::Real(f) => Some(TjElement::Adjustment(*f)),
535            _ => None,
536        })
537        .collect();
538
539    let cached = font_cache.get(&tstate.font_name);
540    let width_fn = get_width_fn(cached);
541    let is_cid = cached.is_some_and(|c| c.is_cid_font);
542    let raw_chars = show_string_with_positioning_mode(tstate, &elements, &*width_fn, is_cid);
543
544    emit_char_events(raw_chars, tstate, gstate, handler, cached);
545}
546
547fn emit_char_events(
548    raw_chars: Vec<crate::text_renderer::RawChar>,
549    tstate: &TextState,
550    gstate: &InterpreterState,
551    handler: &mut dyn ContentHandler,
552    cached: Option<&CachedFont>,
553) {
554    let ctm = gstate.ctm_array();
555    let font_name = cached.map_or_else(|| tstate.font_name.clone(), |c| c.base_name.clone());
556
557    for rc in raw_chars {
558        let unicode = cached.and_then(|c| {
559            c.cmap
560                .as_ref()
561                .and_then(|cm| cm.lookup(rc.char_code).map(|s| s.to_string()))
562        });
563
564        // Use CID font metrics for displacement if available
565        let displacement = match cached {
566            Some(cf) if cf.is_cid_font => cf
567                .cid_metrics
568                .as_ref()
569                .map_or(600.0, |cm| cm.get_width(rc.char_code)),
570            Some(cf) => cf.metrics.get_width(rc.char_code),
571            None => 600.0,
572        };
573
574        handler.on_char(CharEvent {
575            char_code: rc.char_code,
576            unicode,
577            font_name: font_name.clone(),
578            font_size: tstate.font_size,
579            text_matrix: rc.text_matrix,
580            ctm,
581            displacement,
582            char_spacing: tstate.char_spacing,
583            word_spacing: tstate.word_spacing,
584            h_scaling: tstate.h_scaling_normalized(),
585            rise: tstate.rise,
586        });
587    }
588}
589
590// --- Do operator: XObject handling ---
591
592#[allow(clippy::too_many_arguments)]
593fn handle_do(
594    doc: &lopdf::Document,
595    resources: &lopdf::Dictionary,
596    handler: &mut dyn ContentHandler,
597    options: &ExtractOptions,
598    depth: usize,
599    gstate: &mut InterpreterState,
600    tstate: &mut TextState,
601    name: &str,
602) -> Result<(), BackendError> {
603    // Look up /Resources/XObject/<name>
604    let xobj_dict = resources.get(b"XObject").map_err(|_| {
605        BackendError::Interpreter(format!(
606            "no /XObject dictionary in resources for Do /{name}"
607        ))
608    })?;
609    let xobj_dict = resolve_ref(doc, xobj_dict);
610    let xobj_dict = xobj_dict.as_dict().map_err(|_| {
611        BackendError::Interpreter("/XObject resource is not a dictionary".to_string())
612    })?;
613
614    let xobj_entry = xobj_dict.get(name.as_bytes()).map_err(|_| {
615        BackendError::Interpreter(format!("XObject /{name} not found in resources"))
616    })?;
617
618    let xobj_id = xobj_entry.as_reference().map_err(|_| {
619        BackendError::Interpreter(format!("XObject /{name} is not an indirect reference"))
620    })?;
621
622    let xobj = doc.get_object(xobj_id).map_err(|e| {
623        BackendError::Interpreter(format!("failed to resolve XObject /{name}: {e}"))
624    })?;
625
626    let stream = xobj
627        .as_stream()
628        .map_err(|e| BackendError::Interpreter(format!("XObject /{name} is not a stream: {e}")))?;
629
630    let subtype = stream
631        .dict
632        .get(b"Subtype")
633        .ok()
634        .and_then(|o| o.as_name_str().ok())
635        .unwrap_or("");
636
637    match subtype {
638        "Form" => handle_form_xobject(
639            doc, stream, name, resources, handler, options, depth, gstate, tstate,
640        ),
641        "Image" => {
642            handle_image_xobject(stream, name, gstate, handler);
643            Ok(())
644        }
645        _ => {
646            // Unknown XObject subtype — ignore
647            Ok(())
648        }
649    }
650}
651
652#[allow(clippy::too_many_arguments)]
653fn handle_form_xobject(
654    doc: &lopdf::Document,
655    stream: &lopdf::Stream,
656    name: &str,
657    parent_resources: &lopdf::Dictionary,
658    handler: &mut dyn ContentHandler,
659    options: &ExtractOptions,
660    depth: usize,
661    gstate: &mut InterpreterState,
662    tstate: &mut TextState,
663) -> Result<(), BackendError> {
664    // Save graphics state
665    gstate.save_state();
666
667    // Apply /Matrix if present (transforms Form XObject space to parent space)
668    if let Ok(matrix_obj) = stream.dict.get(b"Matrix") {
669        if let Ok(arr) = matrix_obj.as_array() {
670            if arr.len() == 6 {
671                let vals: Result<Vec<f64>, _> = arr.iter().map(object_to_f64).collect();
672                if let Ok(vals) = vals {
673                    gstate.concat_matrix(vals[0], vals[1], vals[2], vals[3], vals[4], vals[5]);
674                }
675            }
676        }
677    }
678
679    // Get Form XObject's resources (fall back to parent resources)
680    let form_resources_dict;
681    let form_resources = if let Ok(res_obj) = stream.dict.get(b"Resources") {
682        let res_obj = resolve_ref(doc, res_obj);
683        match res_obj.as_dict() {
684            Ok(d) => d,
685            Err(_) => parent_resources,
686        }
687    } else {
688        // Check if /Resources is an inline dictionary (common for Form XObjects)
689        // The dict.get already handles this, so use parent as fallback
690        // But also check if it's an indirect reference in the dict
691        if let Ok(res_ref) = stream.dict.get(b"Resources") {
692            if let Ok(id) = res_ref.as_reference() {
693                if let Ok(obj) = doc.get_object(id) {
694                    if let Ok(d) = obj.as_dict() {
695                        form_resources_dict = d.clone();
696                        &form_resources_dict
697                    } else {
698                        parent_resources
699                    }
700                } else {
701                    parent_resources
702                }
703            } else {
704                parent_resources
705            }
706        } else {
707            parent_resources
708        }
709    };
710
711    // Decode stream content
712    let content_bytes = decode_stream(stream).map_err(|e| {
713        BackendError::Interpreter(format!("failed to decode Form XObject /{name} stream: {e}"))
714    })?;
715
716    // Recursively interpret the Form XObject content stream
717    interpret_content_stream(
718        doc,
719        &content_bytes,
720        form_resources,
721        handler,
722        options,
723        depth + 1,
724        gstate,
725        tstate,
726    )?;
727
728    // Restore graphics state
729    gstate.restore_state();
730
731    Ok(())
732}
733
734fn handle_image_xobject(
735    stream: &lopdf::Stream,
736    name: &str,
737    gstate: &InterpreterState,
738    handler: &mut dyn ContentHandler,
739) {
740    let width = stream
741        .dict
742        .get(b"Width")
743        .ok()
744        .and_then(|o| o.as_i64().ok())
745        .unwrap_or(0) as u32;
746
747    let height = stream
748        .dict
749        .get(b"Height")
750        .ok()
751        .and_then(|o| o.as_i64().ok())
752        .unwrap_or(0) as u32;
753
754    let colorspace = stream
755        .dict
756        .get(b"ColorSpace")
757        .ok()
758        .and_then(|o| o.as_name_str().ok())
759        .map(|s| s.to_string());
760
761    let bits_per_component = stream
762        .dict
763        .get(b"BitsPerComponent")
764        .ok()
765        .and_then(|o| o.as_i64().ok())
766        .map(|v| v as u32);
767
768    handler.on_image(ImageEvent {
769        name: name.to_string(),
770        ctm: gstate.ctm_array(),
771        width,
772        height,
773        colorspace,
774        bits_per_component,
775    });
776}
777
778// --- Helpers ---
779
780/// Resolve an indirect reference, returning the referenced object.
781/// If the object is not a reference, returns it as-is.
782fn resolve_ref<'a>(doc: &'a lopdf::Document, obj: &'a lopdf::Object) -> &'a lopdf::Object {
783    match obj {
784        lopdf::Object::Reference(id) => doc.get_object(*id).unwrap_or(obj),
785        _ => obj,
786    }
787}
788
789/// Decode a PDF stream, decompressing if necessary.
790fn decode_stream(stream: &lopdf::Stream) -> Result<Vec<u8>, BackendError> {
791    // Check if stream has filters
792    if stream.dict.get(b"Filter").is_ok() {
793        stream
794            .decompressed_content()
795            .map_err(|e| BackendError::Interpreter(format!("stream decompression failed: {e}")))
796    } else {
797        Ok(stream.content.clone())
798    }
799}
800
801#[cfg(test)]
802mod tests {
803    use super::*;
804    use crate::handler::{CharEvent, ContentHandler, ImageEvent};
805
806    // --- Collecting handler ---
807
808    struct CollectingHandler {
809        chars: Vec<CharEvent>,
810        images: Vec<ImageEvent>,
811        warnings: Vec<ExtractWarning>,
812    }
813
814    impl CollectingHandler {
815        fn new() -> Self {
816            Self {
817                chars: Vec::new(),
818                images: Vec::new(),
819                warnings: Vec::new(),
820            }
821        }
822    }
823
824    impl ContentHandler for CollectingHandler {
825        fn on_char(&mut self, event: CharEvent) {
826            self.chars.push(event);
827        }
828        fn on_image(&mut self, event: ImageEvent) {
829            self.images.push(event);
830        }
831        fn on_warning(&mut self, warning: ExtractWarning) {
832            self.warnings.push(warning);
833        }
834    }
835
836    // --- Helper to create a minimal lopdf document for testing ---
837
838    fn empty_resources() -> lopdf::Dictionary {
839        lopdf::Dictionary::new()
840    }
841
842    fn default_options() -> ExtractOptions {
843        ExtractOptions::default()
844    }
845
846    // --- Basic text interpretation tests ---
847
848    #[test]
849    fn interpret_simple_text() {
850        let doc = lopdf::Document::with_version("1.5");
851        let resources = empty_resources();
852        let stream = b"BT /F1 12 Tf 72 700 Td (Hello) Tj ET";
853
854        let mut handler = CollectingHandler::new();
855        let mut gstate = InterpreterState::new();
856        let mut tstate = TextState::new();
857
858        interpret_content_stream(
859            &doc,
860            stream,
861            &resources,
862            &mut handler,
863            &default_options(),
864            0,
865            &mut gstate,
866            &mut tstate,
867        )
868        .unwrap();
869
870        // "Hello" = 5 characters
871        assert_eq!(handler.chars.len(), 5);
872        assert_eq!(handler.chars[0].char_code, b'H' as u32);
873        assert_eq!(handler.chars[1].char_code, b'e' as u32);
874        assert_eq!(handler.chars[4].char_code, b'o' as u32);
875        assert_eq!(handler.chars[0].font_size, 12.0);
876    }
877
878    #[test]
879    fn interpret_tj_array() {
880        let doc = lopdf::Document::with_version("1.5");
881        let resources = empty_resources();
882        let stream = b"BT /F1 12 Tf [(H) -20 (i)] TJ ET";
883
884        let mut handler = CollectingHandler::new();
885        let mut gstate = InterpreterState::new();
886        let mut tstate = TextState::new();
887
888        interpret_content_stream(
889            &doc,
890            stream,
891            &resources,
892            &mut handler,
893            &default_options(),
894            0,
895            &mut gstate,
896            &mut tstate,
897        )
898        .unwrap();
899
900        assert_eq!(handler.chars.len(), 2);
901        assert_eq!(handler.chars[0].char_code, b'H' as u32);
902        assert_eq!(handler.chars[1].char_code, b'i' as u32);
903    }
904
905    #[test]
906    fn interpret_ctm_passed_to_char_events() {
907        let doc = lopdf::Document::with_version("1.5");
908        let resources = empty_resources();
909        let stream = b"1 0 0 1 10 20 cm BT /F1 12 Tf (A) Tj ET";
910
911        let mut handler = CollectingHandler::new();
912        let mut gstate = InterpreterState::new();
913        let mut tstate = TextState::new();
914
915        interpret_content_stream(
916            &doc,
917            stream,
918            &resources,
919            &mut handler,
920            &default_options(),
921            0,
922            &mut gstate,
923            &mut tstate,
924        )
925        .unwrap();
926
927        assert_eq!(handler.chars.len(), 1);
928        assert_eq!(handler.chars[0].ctm, [1.0, 0.0, 0.0, 1.0, 10.0, 20.0]);
929    }
930
931    // --- Recursion limit tests ---
932
933    #[test]
934    fn recursion_depth_zero_allowed() {
935        let doc = lopdf::Document::with_version("1.5");
936        let resources = empty_resources();
937        let stream = b"BT ET";
938
939        let mut handler = CollectingHandler::new();
940        let mut gstate = InterpreterState::new();
941        let mut tstate = TextState::new();
942
943        let result = interpret_content_stream(
944            &doc,
945            stream,
946            &resources,
947            &mut handler,
948            &default_options(),
949            0,
950            &mut gstate,
951            &mut tstate,
952        );
953        assert!(result.is_ok());
954    }
955
956    #[test]
957    fn recursion_depth_exceeds_limit() {
958        let doc = lopdf::Document::with_version("1.5");
959        let resources = empty_resources();
960        let stream = b"BT ET";
961
962        let mut handler = CollectingHandler::new();
963        let mut gstate = InterpreterState::new();
964        let mut tstate = TextState::new();
965
966        let mut opts = ExtractOptions::default();
967        opts.max_recursion_depth = 3;
968
969        let result = interpret_content_stream(
970            &doc,
971            stream,
972            &resources,
973            &mut handler,
974            &opts,
975            4, // depth > max
976            &mut gstate,
977            &mut tstate,
978        );
979        assert!(result.is_err());
980        let err_msg = result.unwrap_err().to_string();
981        assert!(err_msg.contains("recursion depth"));
982    }
983
984    // --- Graphics state tests ---
985
986    #[test]
987    fn interpret_q_q_state_save_restore() {
988        let doc = lopdf::Document::with_version("1.5");
989        let resources = empty_resources();
990        // Set color, save, change color, restore
991        let stream = b"0.5 g q 1 0 0 rg Q";
992
993        let mut handler = CollectingHandler::new();
994        let mut gstate = InterpreterState::new();
995        let mut tstate = TextState::new();
996
997        interpret_content_stream(
998            &doc,
999            stream,
1000            &resources,
1001            &mut handler,
1002            &default_options(),
1003            0,
1004            &mut gstate,
1005            &mut tstate,
1006        )
1007        .unwrap();
1008
1009        // After Q, fill color should be restored to gray 0.5
1010        assert_eq!(
1011            gstate.graphics_state().fill_color,
1012            pdfplumber_core::Color::Gray(0.5)
1013        );
1014    }
1015
1016    // --- CID font / Identity-H tests ---
1017
1018    /// Build a resources dictionary containing a Type0 font with Identity-H encoding.
1019    fn make_cid_font_resources(doc: &mut lopdf::Document) -> lopdf::Dictionary {
1020        use lopdf::{Object, Stream, dictionary};
1021
1022        // ToUnicode CMap: map 0x4E2D → U+4E2D (中), 0x6587 → U+6587 (文)
1023        let tounicode_data = b"\
1024            /CIDInit /ProcSet findresource begin\n\
1025            12 dict begin\n\
1026            begincmap\n\
1027            /CMapName /Adobe-Identity-UCS def\n\
1028            /CMapType 2 def\n\
1029            1 begincodespacerange\n\
1030            <0000> <FFFF>\n\
1031            endcodespacerange\n\
1032            2 beginbfchar\n\
1033            <4E2D> <4E2D>\n\
1034            <6587> <6587>\n\
1035            endbfchar\n\
1036            endcmap\n";
1037        let tounicode_stream = Stream::new(dictionary! {}, tounicode_data.to_vec());
1038        let tounicode_id = doc.add_object(Object::Stream(tounicode_stream));
1039
1040        // CIDFont dictionary
1041        let cid_font_dict = dictionary! {
1042            "Type" => "Font",
1043            "Subtype" => "CIDFontType2",
1044            "BaseFont" => "MSGothic",
1045            "DW" => Object::Integer(1000),
1046            "CIDToGIDMap" => "Identity",
1047            "CIDSystemInfo" => Object::Dictionary(dictionary! {
1048                "Registry" => Object::String("Adobe".as_bytes().to_vec(), lopdf::StringFormat::Literal),
1049                "Ordering" => Object::String("Identity".as_bytes().to_vec(), lopdf::StringFormat::Literal),
1050                "Supplement" => Object::Integer(0),
1051            }),
1052        };
1053        let cid_font_id = doc.add_object(Object::Dictionary(cid_font_dict));
1054
1055        // Type0 font dictionary with Identity-H encoding
1056        let type0_dict = dictionary! {
1057            "Type" => "Font",
1058            "Subtype" => "Type0",
1059            "BaseFont" => "MSGothic",
1060            "Encoding" => "Identity-H",
1061            "DescendantFonts" => Object::Array(vec![Object::Reference(cid_font_id)]),
1062            "ToUnicode" => Object::Reference(tounicode_id),
1063        };
1064        let type0_id = doc.add_object(Object::Dictionary(type0_dict));
1065
1066        // Resources with Font entry
1067        dictionary! {
1068            "Font" => Object::Dictionary(dictionary! {
1069                "F1" => Object::Reference(type0_id),
1070            }),
1071        }
1072    }
1073
1074    #[test]
1075    fn interpret_cid_font_identity_h_two_byte_codes() {
1076        let mut doc = lopdf::Document::with_version("1.5");
1077        let resources = make_cid_font_resources(&mut doc);
1078
1079        // Content stream: use CID font F1 and show 2-byte character codes
1080        // 0x4E2D = 中, 0x6587 = 文
1081        let stream = b"BT /F1 12 Tf <4E2D6587> Tj ET";
1082
1083        let mut handler = CollectingHandler::new();
1084        let mut gstate = InterpreterState::new();
1085        let mut tstate = TextState::new();
1086
1087        interpret_content_stream(
1088            &doc,
1089            stream,
1090            &resources,
1091            &mut handler,
1092            &default_options(),
1093            0,
1094            &mut gstate,
1095            &mut tstate,
1096        )
1097        .unwrap();
1098
1099        // Should produce 2 characters (2-byte codes), not 4 (1-byte)
1100        assert_eq!(handler.chars.len(), 2);
1101        assert_eq!(handler.chars[0].char_code, 0x4E2D);
1102        assert_eq!(handler.chars[1].char_code, 0x6587);
1103        // Unicode should be resolved via ToUnicode CMap
1104        assert_eq!(handler.chars[0].unicode, Some("中".to_string()));
1105        assert_eq!(handler.chars[1].unicode, Some("文".to_string()));
1106        assert_eq!(handler.chars[0].font_name, "MSGothic");
1107    }
1108
1109    #[test]
1110    fn interpret_cid_font_tj_array_two_byte_codes() {
1111        let mut doc = lopdf::Document::with_version("1.5");
1112        let resources = make_cid_font_resources(&mut doc);
1113
1114        // TJ array with 2-byte CID strings and adjustments
1115        let stream = b"BT /F1 12 Tf [<4E2D> -100 <6587>] TJ ET";
1116
1117        let mut handler = CollectingHandler::new();
1118        let mut gstate = InterpreterState::new();
1119        let mut tstate = TextState::new();
1120
1121        interpret_content_stream(
1122            &doc,
1123            stream,
1124            &resources,
1125            &mut handler,
1126            &default_options(),
1127            0,
1128            &mut gstate,
1129            &mut tstate,
1130        )
1131        .unwrap();
1132
1133        assert_eq!(handler.chars.len(), 2);
1134        assert_eq!(handler.chars[0].char_code, 0x4E2D);
1135        assert_eq!(handler.chars[1].char_code, 0x6587);
1136    }
1137
1138    #[test]
1139    fn interpret_subset_font_name_stripped() {
1140        let mut doc = lopdf::Document::with_version("1.5");
1141
1142        use lopdf::{Object, Stream, dictionary};
1143
1144        // Create a ToUnicode CMap
1145        let tounicode_data = b"\
1146            beginbfchar\n\
1147            <4E2D> <4E2D>\n\
1148            endbfchar\n";
1149        let tounicode_stream = Stream::new(dictionary! {}, tounicode_data.to_vec());
1150        let tounicode_id = doc.add_object(Object::Stream(tounicode_stream));
1151
1152        // CIDFont with subset prefix
1153        let cid_font_dict = dictionary! {
1154            "Type" => "Font",
1155            "Subtype" => "CIDFontType2",
1156            "BaseFont" => "ABCDEF+MSGothic",
1157            "DW" => Object::Integer(1000),
1158            "CIDToGIDMap" => "Identity",
1159        };
1160        let cid_font_id = doc.add_object(Object::Dictionary(cid_font_dict));
1161
1162        // Type0 font with subset prefix in BaseFont
1163        let type0_dict = dictionary! {
1164            "Type" => "Font",
1165            "Subtype" => "Type0",
1166            "BaseFont" => "ABCDEF+MSGothic",
1167            "Encoding" => "Identity-H",
1168            "DescendantFonts" => Object::Array(vec![Object::Reference(cid_font_id)]),
1169            "ToUnicode" => Object::Reference(tounicode_id),
1170        };
1171        let type0_id = doc.add_object(Object::Dictionary(type0_dict));
1172
1173        let resources = dictionary! {
1174            "Font" => Object::Dictionary(dictionary! {
1175                "F1" => Object::Reference(type0_id),
1176            }),
1177        };
1178
1179        let stream = b"BT /F1 12 Tf <4E2D> Tj ET";
1180
1181        let mut handler = CollectingHandler::new();
1182        let mut gstate = InterpreterState::new();
1183        let mut tstate = TextState::new();
1184
1185        interpret_content_stream(
1186            &doc,
1187            stream,
1188            &resources,
1189            &mut handler,
1190            &default_options(),
1191            0,
1192            &mut gstate,
1193            &mut tstate,
1194        )
1195        .unwrap();
1196
1197        assert_eq!(handler.chars.len(), 1);
1198        // Subset prefix should be stripped
1199        assert_eq!(handler.chars[0].font_name, "MSGothic");
1200    }
1201
1202    /// Build resources for Identity-V (vertical writing mode).
1203    fn make_cid_font_resources_identity_v(doc: &mut lopdf::Document) -> lopdf::Dictionary {
1204        use lopdf::{Object, Stream, dictionary};
1205
1206        let tounicode_data = b"\
1207            beginbfchar\n\
1208            <4E2D> <4E2D>\n\
1209            endbfchar\n";
1210        let tounicode_stream = Stream::new(dictionary! {}, tounicode_data.to_vec());
1211        let tounicode_id = doc.add_object(Object::Stream(tounicode_stream));
1212
1213        let cid_font_dict = dictionary! {
1214            "Type" => "Font",
1215            "Subtype" => "CIDFontType2",
1216            "BaseFont" => "MSGothic",
1217            "DW" => Object::Integer(1000),
1218            "CIDToGIDMap" => "Identity",
1219        };
1220        let cid_font_id = doc.add_object(Object::Dictionary(cid_font_dict));
1221
1222        let type0_dict = dictionary! {
1223            "Type" => "Font",
1224            "Subtype" => "Type0",
1225            "BaseFont" => "MSGothic",
1226            "Encoding" => "Identity-V",
1227            "DescendantFonts" => Object::Array(vec![Object::Reference(cid_font_id)]),
1228            "ToUnicode" => Object::Reference(tounicode_id),
1229        };
1230        let type0_id = doc.add_object(Object::Dictionary(type0_dict));
1231
1232        dictionary! {
1233            "Font" => Object::Dictionary(dictionary! {
1234                "F1" => Object::Reference(type0_id),
1235            }),
1236        }
1237    }
1238
1239    #[test]
1240    fn interpret_cid_font_identity_v_detected() {
1241        let mut doc = lopdf::Document::with_version("1.5");
1242        let resources = make_cid_font_resources_identity_v(&mut doc);
1243
1244        // Show a CID character with Identity-V encoding
1245        let stream = b"BT /F1 12 Tf <4E2D> Tj ET";
1246
1247        let mut handler = CollectingHandler::new();
1248        let mut gstate = InterpreterState::new();
1249        let mut tstate = TextState::new();
1250
1251        interpret_content_stream(
1252            &doc,
1253            stream,
1254            &resources,
1255            &mut handler,
1256            &default_options(),
1257            0,
1258            &mut gstate,
1259            &mut tstate,
1260        )
1261        .unwrap();
1262
1263        // Should still produce characters (Identity-V uses same CID=charcode mapping)
1264        assert_eq!(handler.chars.len(), 1);
1265        assert_eq!(handler.chars[0].char_code, 0x4E2D);
1266        assert_eq!(handler.chars[0].unicode, Some("中".to_string()));
1267    }
1268
1269    // --- Warning emission tests ---
1270
1271    #[test]
1272    fn interpret_missing_font_emits_warning() {
1273        let doc = lopdf::Document::with_version("1.5");
1274        let resources = empty_resources(); // No fonts defined
1275        // Use font F1 which is not in resources
1276        let stream = b"BT /F1 12 Tf (Hi) Tj ET";
1277
1278        let mut handler = CollectingHandler::new();
1279        let mut gstate = InterpreterState::new();
1280        let mut tstate = TextState::new();
1281
1282        interpret_content_stream(
1283            &doc,
1284            stream,
1285            &resources,
1286            &mut handler,
1287            &default_options(),
1288            0,
1289            &mut gstate,
1290            &mut tstate,
1291        )
1292        .unwrap();
1293
1294        // Should emit a warning about missing font
1295        assert!(!handler.warnings.is_empty());
1296        assert!(
1297            handler.warnings[0]
1298                .description
1299                .contains("font not found in page resources"),
1300            "expected 'font not found' warning, got: {}",
1301            handler.warnings[0].description
1302        );
1303        assert_eq!(
1304            handler.warnings[0].font_name,
1305            Some("F1".to_string()),
1306            "warning should include font name"
1307        );
1308        assert!(
1309            handler.warnings[0].operator_index.is_some(),
1310            "warning should include operator index"
1311        );
1312
1313        // Characters should still be extracted (using default metrics)
1314        assert_eq!(handler.chars.len(), 2);
1315    }
1316
1317    #[test]
1318    fn interpret_no_warnings_when_collection_disabled() {
1319        let doc = lopdf::Document::with_version("1.5");
1320        let resources = empty_resources();
1321        let stream = b"BT /F1 12 Tf (Hi) Tj ET";
1322
1323        let mut handler = CollectingHandler::new();
1324        let mut gstate = InterpreterState::new();
1325        let mut tstate = TextState::new();
1326
1327        let opts = ExtractOptions {
1328            collect_warnings: false,
1329            ..ExtractOptions::default()
1330        };
1331
1332        interpret_content_stream(
1333            &doc,
1334            stream,
1335            &resources,
1336            &mut handler,
1337            &opts,
1338            0,
1339            &mut gstate,
1340            &mut tstate,
1341        )
1342        .unwrap();
1343
1344        // No warnings should be collected
1345        assert!(handler.warnings.is_empty());
1346
1347        // Characters should still be extracted normally
1348        assert_eq!(handler.chars.len(), 2);
1349    }
1350
1351    #[test]
1352    fn interpret_warnings_do_not_affect_output() {
1353        let doc = lopdf::Document::with_version("1.5");
1354        let resources = empty_resources();
1355        let stream = b"BT /F1 12 Tf (AB) Tj ET";
1356
1357        // With warnings enabled
1358        let mut handler_on = CollectingHandler::new();
1359        let mut gstate_on = InterpreterState::new();
1360        let mut tstate_on = TextState::new();
1361        let opts_on = ExtractOptions {
1362            collect_warnings: true,
1363            ..ExtractOptions::default()
1364        };
1365        interpret_content_stream(
1366            &doc,
1367            stream,
1368            &resources,
1369            &mut handler_on,
1370            &opts_on,
1371            0,
1372            &mut gstate_on,
1373            &mut tstate_on,
1374        )
1375        .unwrap();
1376
1377        // With warnings disabled
1378        let mut handler_off = CollectingHandler::new();
1379        let mut gstate_off = InterpreterState::new();
1380        let mut tstate_off = TextState::new();
1381        let opts_off = ExtractOptions {
1382            collect_warnings: false,
1383            ..ExtractOptions::default()
1384        };
1385        interpret_content_stream(
1386            &doc,
1387            stream,
1388            &resources,
1389            &mut handler_off,
1390            &opts_off,
1391            0,
1392            &mut gstate_off,
1393            &mut tstate_off,
1394        )
1395        .unwrap();
1396
1397        // Same output regardless of warning collection
1398        assert_eq!(handler_on.chars.len(), handler_off.chars.len());
1399        for (a, b) in handler_on.chars.iter().zip(handler_off.chars.iter()) {
1400            assert_eq!(a.char_code, b.char_code);
1401        }
1402    }
1403
1404    #[test]
1405    fn interpret_valid_font_no_warnings() {
1406        let mut doc = lopdf::Document::with_version("1.5");
1407        let resources = make_cid_font_resources(&mut doc);
1408        let stream = b"BT /F1 12 Tf <4E2D> Tj ET";
1409
1410        let mut handler = CollectingHandler::new();
1411        let mut gstate = InterpreterState::new();
1412        let mut tstate = TextState::new();
1413
1414        interpret_content_stream(
1415            &doc,
1416            stream,
1417            &resources,
1418            &mut handler,
1419            &default_options(),
1420            0,
1421            &mut gstate,
1422            &mut tstate,
1423        )
1424        .unwrap();
1425
1426        // Valid font should not produce warnings
1427        assert!(
1428            handler.warnings.is_empty(),
1429            "expected no warnings for valid font, got: {:?}",
1430            handler.warnings
1431        );
1432        assert_eq!(handler.chars.len(), 1);
1433    }
1434}