Skip to main content

edgeparse_core/pdf/
chunk_parser.rs

1//! Unified PDF content stream parser — matches the reference ChunkParser architecture.
2//!
3//! Single-pass content stream walker that produces text, image, and line chunks
4//! with shared graphics state. Handles:
5//! - Text operators (BT/ET/Tf/Td/Tm/Tj/TJ/etc.)
6//! - Image extraction via `Do` operator (XObject images with CTM-based bbox)
7//! - Form XObject recursive processing via `Do` operator
8//! - Inline images (BI/ID/EI)
9//! - Path/line operators (m/l/c/re/S/f/B/etc.)
10//! - Graphics state (q/Q/cm/gs)
11//! - Color operators (g/rg/k/cs/sc/etc.)
12//! - Marked content (BMC/BDC/EMC)
13
14use lopdf::{content::Content, Dictionary, Document, Object, ObjectId};
15
16use crate::models::bbox::{BoundingBox, Vertex};
17use crate::models::chunks::{ImageChunk, LineArtChunk, LineChunk, TextChunk};
18use crate::EdgePdfError;
19
20use super::font::{resolve_page_fonts, FontCache, PdfFont};
21use super::graphics_state::{GraphicsStateStack, Matrix};
22
/// Maximum recursion depth for Form XObject processing (prevents infinite loops).
/// `handle_do_operator` only descends into a Form XObject while the current depth
/// is strictly below this value.
const MAX_FORM_RECURSION_DEPTH: u32 = 10;

/// Minimum line width to consider a path segment (in points).
const MIN_LINE_WIDTH: f64 = 0.1;

/// Aspect ratio threshold: width/height > this means horizontal line.
const LINE_ASPECT_RATIO: f64 = 3.0;

/// Maximum thickness for a line (vs rectangle classification).
const MAX_LINE_THICKNESS: f64 = 10.0;
34
/// All chunks extracted from a single page.
///
/// Returned by `extract_page_chunks`; a page without content data yields the
/// `Default` value, i.e. four empty vectors.
#[derive(Debug, Default)]
pub struct PageChunks {
    /// Text chunks with position and font info
    pub text_chunks: Vec<TextChunk>,
    /// Image chunks with CTM-based bounding boxes
    pub image_chunks: Vec<ImageChunk>,
    /// Line segments (horizontal/vertical lines, rectangles)
    pub line_chunks: Vec<LineChunk>,
    /// Vector graphics (complex paths)
    pub line_art_chunks: Vec<LineArtChunk>,
}
47
48/// Extract all chunks from a single page in one content stream pass.
49pub fn extract_page_chunks(
50    doc: &Document,
51    page_number: u32,
52    page_id: ObjectId,
53) -> Result<PageChunks, EdgePdfError> {
54    let font_cache = resolve_page_fonts(doc, page_id);
55
56    let page_dict = doc
57        .get_object(page_id)
58        .map_err(|e| EdgePdfError::PipelineError {
59            stage: 1,
60            message: format!("Failed to get page {}: {}", page_number, e),
61        })?
62        .as_dict()
63        .map_err(|e| EdgePdfError::PipelineError {
64            stage: 1,
65            message: format!("Page {} is not a dictionary: {}", page_number, e),
66        })?
67        .clone();
68
69    // Get content stream(s)
70    let content_data = super::text_extractor::get_page_content(doc, &page_dict)?;
71    if content_data.is_empty() {
72        return Ok(PageChunks::default());
73    }
74
75    // Parse content stream operations
76    let content = Content::decode(&content_data).map_err(|e| EdgePdfError::PipelineError {
77        stage: 1,
78        message: format!(
79            "Failed to decode content stream for page {}: {}",
80            page_number, e
81        ),
82    })?;
83
84    // Resolve the Resources dictionary for this page (needed for Do/gs operators)
85    let resources = resolve_page_resources(doc, &page_dict);
86
87    let mut parser = ChunkParserState::new(page_number, font_cache);
88    parser.process_operations(doc, &content.operations, &resources, 0);
89
90    Ok(parser.into_page_chunks())
91}
92
93/// Resolve the /Resources dictionary for a page, following references.
94fn resolve_page_resources(doc: &Document, page_dict: &Dictionary) -> Dictionary {
95    match page_dict.get(b"Resources") {
96        Ok(obj) => {
97            let resolved = resolve_obj(doc, obj);
98            resolved.as_dict().cloned().unwrap_or_default()
99        }
100        Err(_) => Dictionary::new(),
101    }
102}
103
/// Internal state for the unified chunk parser — equivalent to the reference ChunkParser.
///
/// One instance is created per page by `extract_page_chunks`, mutated by
/// `process_operations`, and finally consumed by `into_page_chunks`.
struct ChunkParserState {
    /// Page number stamped onto emitted chunks.
    page_number: u32,
    /// Fonts resolved for the page; looked up by the current text-state font name.
    font_cache: FontCache,
    /// Graphics state stack driven by `q`/`Q`/`cm` and the text/color operators.
    gs_stack: GraphicsStateStack,

    // Chunk accumulators
    text_chunks: Vec<TextChunk>,
    image_chunks: Vec<ImageChunk>,
    line_chunks: Vec<LineChunk>,
    line_art_chunks: Vec<LineArtChunk>,

    // Indices
    /// Running text index, handed to `create_text_chunk` by mutable reference.
    text_index: usize,
    /// Incremented before each image chunk is pushed, so image indices start at 1.
    image_index: u32,
    line_index: u32,

    // Marked content tracking
    /// One entry per open marked-content sequence: `BMC` pushes `None`, `BDC` pushes
    /// the MCID extracted from its operands (if any), `EMC` pops.
    mcid_stack: Vec<Option<i64>>,

    // Path construction state
    /// Segments of the path under construction.
    current_path: Vec<PathSegment>,
    /// Start point of the current subpath (set by `m` and `re`).
    subpath_start: Option<(f64, f64)>,
    /// Current point; path operators store CTM-transformed coordinates here.
    current_point: Option<(f64, f64)>,
    /// Line width set by the `w` operator (initially 1.0).
    /// NOTE(review): this lives outside `gs_stack`, so `q`/`Q` do not save/restore it
    /// here — confirm whether that is intentional.
    line_width: f64,
}
130
131impl ChunkParserState {
132    fn new(page_number: u32, font_cache: FontCache) -> Self {
133        Self {
134            page_number,
135            font_cache,
136            gs_stack: GraphicsStateStack::default(),
137
138            text_chunks: Vec::new(),
139            image_chunks: Vec::new(),
140            line_chunks: Vec::new(),
141            line_art_chunks: Vec::new(),
142
143            text_index: 0,
144            image_index: 0,
145            line_index: 0,
146
147            mcid_stack: Vec::new(),
148
149            current_path: Vec::new(),
150            subpath_start: None,
151            current_point: None,
152            line_width: 1.0,
153        }
154    }
155
156    fn into_page_chunks(self) -> PageChunks {
157        PageChunks {
158            text_chunks: self.text_chunks,
159            image_chunks: self.image_chunks,
160            line_chunks: self.line_chunks,
161            line_art_chunks: self.line_art_chunks,
162        }
163    }
164
165    /// Process all content stream operations — the core parser loop.
166    fn process_operations(
167        &mut self,
168        doc: &Document,
169        operations: &[lopdf::content::Operation],
170        resources: &Dictionary,
171        recursion_depth: u32,
172    ) {
173        for op in operations {
174            match op.operator.as_str() {
175                // ── Marked content operators ──
176                "BMC" => {
177                    self.mcid_stack.push(None);
178                }
179                "BDC" => {
180                    let mcid = extract_mcid_from_bdc(&op.operands);
181                    self.mcid_stack.push(mcid);
182                }
183                "EMC" => {
184                    self.mcid_stack.pop();
185                }
186
187                // ── Graphics state ──
188                "q" => self.gs_stack.save(),
189                "Q" => self.gs_stack.restore(),
190                "cm" => {
191                    if op.operands.len() == 6 {
192                        let vals: Vec<f64> = op
193                            .operands
194                            .iter()
195                            .filter_map(|o| obj_to_f64(o.clone()))
196                            .collect();
197                        if vals.len() == 6 {
198                            self.gs_stack
199                                .concat_ctm(vals[0], vals[1], vals[2], vals[3], vals[4], vals[5]);
200                        }
201                    }
202                }
203                "gs" => {
204                    // Extended Graphics State — look up in /ExtGState resources
205                    if let Some(name) = op.operands.first().and_then(obj_name_bytes) {
206                        self.apply_ext_gstate(doc, resources, &name);
207                    }
208                }
209
210                // ── Text state operators ──
211                "BT" => self.gs_stack.current.begin_text(),
212                "ET" => {}
213
214                "Tf" => {
215                    if op.operands.len() == 2 {
216                        if let Object::Name(ref name) = op.operands[0] {
217                            self.gs_stack.current.text_state.font_name =
218                                String::from_utf8_lossy(name).to_string();
219                        }
220                        if let Some(size) = obj_to_f64(op.operands[1].clone()) {
221                            self.gs_stack.current.text_state.font_size = size;
222                        }
223                    }
224                }
225                "Tc" => {
226                    if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
227                        self.gs_stack.current.text_state.char_spacing = v;
228                    }
229                }
230                "Tw" => {
231                    if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
232                        self.gs_stack.current.text_state.word_spacing = v;
233                    }
234                }
235                "Tz" => {
236                    if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
237                        self.gs_stack.current.text_state.horizontal_scaling = v;
238                    }
239                }
240                "TL" => {
241                    if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
242                        self.gs_stack.current.text_state.leading = v;
243                    }
244                }
245                "Ts" => {
246                    if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
247                        self.gs_stack.current.text_state.rise = v;
248                    }
249                }
250                "Tr" => {
251                    if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
252                        self.gs_stack.current.text_state.render_mode = v as i32;
253                    }
254                }
255
256                // ── Text positioning ──
257                "Td" => {
258                    if op.operands.len() == 2 {
259                        let tx = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
260                        let ty = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
261                        self.gs_stack.current.translate_text(tx, ty);
262                    }
263                }
264                "TD" => {
265                    if op.operands.len() == 2 {
266                        let tx = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
267                        let ty = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
268                        self.gs_stack.current.text_state.leading = -ty;
269                        self.gs_stack.current.translate_text(tx, ty);
270                    }
271                }
272                "Tm" => {
273                    if op.operands.len() == 6 {
274                        let vals: Vec<f64> = op
275                            .operands
276                            .iter()
277                            .filter_map(|o| obj_to_f64(o.clone()))
278                            .collect();
279                        if vals.len() == 6 {
280                            self.gs_stack.current.set_text_matrix(
281                                vals[0], vals[1], vals[2], vals[3], vals[4], vals[5],
282                            );
283                        }
284                    }
285                }
286                "T*" => {
287                    self.gs_stack.current.next_line();
288                }
289
290                // ── Text showing operators → TextChunk ──
291                "Tj" => {
292                    if let Some(text_bytes) = op.operands.first().and_then(extract_string_bytes) {
293                        self.emit_text_chunk(&text_bytes);
294                    }
295                }
296                "TJ" => {
297                    if let Some(Object::Array(ref arr)) = op.operands.first() {
298                        self.process_tj_array(arr);
299                    }
300                }
301                "'" => {
302                    self.gs_stack.current.next_line();
303                    if let Some(text_bytes) = op.operands.first().and_then(extract_string_bytes) {
304                        self.emit_text_chunk(&text_bytes);
305                    }
306                }
307                "\"" => {
308                    if op.operands.len() == 3 {
309                        if let Some(aw) = obj_to_f64(op.operands[0].clone()) {
310                            self.gs_stack.current.text_state.word_spacing = aw;
311                        }
312                        if let Some(ac) = obj_to_f64(op.operands[1].clone()) {
313                            self.gs_stack.current.text_state.char_spacing = ac;
314                        }
315                        self.gs_stack.current.next_line();
316                        if let Some(text_bytes) = extract_string_bytes(&op.operands[2]) {
317                            self.emit_text_chunk(&text_bytes);
318                        }
319                    }
320                }
321
322                // ── Color operators ──
323                "g" => {
324                    if let Some(gray) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
325                        self.gs_stack.current.fill_color = vec![gray];
326                        self.gs_stack.current.fill_color_space_components = 1;
327                    }
328                }
329                "G" => {
330                    if let Some(gray) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
331                        self.gs_stack.current.stroke_color = vec![gray];
332                        self.gs_stack.current.stroke_color_space_components = 1;
333                    }
334                }
335                "rg" => {
336                    if op.operands.len() == 3 {
337                        let r = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
338                        let g = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
339                        let b = obj_to_f64(op.operands[2].clone()).unwrap_or(0.0);
340                        self.gs_stack.current.fill_color = vec![r, g, b];
341                        self.gs_stack.current.fill_color_space_components = 3;
342                    }
343                }
344                "RG" => {
345                    if op.operands.len() == 3 {
346                        let r = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
347                        let g = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
348                        let b = obj_to_f64(op.operands[2].clone()).unwrap_or(0.0);
349                        self.gs_stack.current.stroke_color = vec![r, g, b];
350                        self.gs_stack.current.stroke_color_space_components = 3;
351                    }
352                }
353                "k" => {
354                    if op.operands.len() == 4 {
355                        let c = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
356                        let m = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
357                        let y = obj_to_f64(op.operands[2].clone()).unwrap_or(0.0);
358                        let kk = obj_to_f64(op.operands[3].clone()).unwrap_or(0.0);
359                        self.gs_stack.current.fill_color = vec![c, m, y, kk];
360                        self.gs_stack.current.fill_color_space_components = 4;
361                    }
362                }
363                "K" => {
364                    if op.operands.len() == 4 {
365                        let c = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
366                        let m = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
367                        let y = obj_to_f64(op.operands[2].clone()).unwrap_or(0.0);
368                        let kk = obj_to_f64(op.operands[3].clone()).unwrap_or(0.0);
369                        self.gs_stack.current.stroke_color = vec![c, m, y, kk];
370                        self.gs_stack.current.stroke_color_space_components = 4;
371                    }
372                }
373                "cs" => {
374                    if let Some(name) = op.operands.first() {
375                        let cs_name = obj_to_name(name);
376                        let comps = color_space_components(&cs_name);
377                        self.gs_stack.current.fill_color_space_components = comps;
378                        // PDF spec 8.6.5.3: reset color to default for new space
379                        self.gs_stack.current.fill_color = default_color_for_space(comps);
380                    }
381                }
382                "CS" => {
383                    if let Some(name) = op.operands.first() {
384                        let cs_name = obj_to_name(name);
385                        let comps = color_space_components(&cs_name);
386                        self.gs_stack.current.stroke_color_space_components = comps;
387                        // PDF spec 8.6.5.3: reset color to default for new space
388                        self.gs_stack.current.stroke_color = default_color_for_space(comps);
389                    }
390                }
391                "sc" | "scn" => {
392                    let components: Vec<f64> = op
393                        .operands
394                        .iter()
395                        .filter_map(|o| obj_to_f64(o.clone()))
396                        .collect();
397                    if !components.is_empty() {
398                        self.gs_stack.current.fill_color = components;
399                    }
400                }
401                "SC" | "SCN" => {
402                    let components: Vec<f64> = op
403                        .operands
404                        .iter()
405                        .filter_map(|o| obj_to_f64(o.clone()))
406                        .collect();
407                    if !components.is_empty() {
408                        self.gs_stack.current.stroke_color = components;
409                    }
410                }
411
412                // ── Line width ──
413                "w" => {
414                    if let Some(w) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
415                        self.line_width = w;
416                    }
417                }
418
419                // ── Path construction ──
420                "m" => {
421                    if op.operands.len() >= 2 {
422                        if let (Some(x), Some(y)) = (
423                            op.operands.first().and_then(|o| obj_to_f64(o.clone())),
424                            op.operands.get(1).and_then(|o| obj_to_f64(o.clone())),
425                        ) {
426                            let (tx, ty) = self.transform_point(x, y);
427                            self.subpath_start = Some((tx, ty));
428                            self.current_point = Some((tx, ty));
429                        }
430                    }
431                }
432                "l" => {
433                    if op.operands.len() >= 2 {
434                        if let (Some(x), Some(y)) = (
435                            op.operands.first().and_then(|o| obj_to_f64(o.clone())),
436                            op.operands.get(1).and_then(|o| obj_to_f64(o.clone())),
437                        ) {
438                            let (tx, ty) = self.transform_point(x, y);
439                            if let Some((cx, cy)) = self.current_point {
440                                self.current_path.push(PathSegment::Line {
441                                    x1: cx,
442                                    y1: cy,
443                                    x2: tx,
444                                    y2: ty,
445                                });
446                            }
447                            self.current_point = Some((tx, ty));
448                        }
449                    }
450                }
451                "c" => {
452                    if op.operands.len() >= 6 {
453                        let vals: Vec<f64> = op
454                            .operands
455                            .iter()
456                            .filter_map(|o| obj_to_f64(o.clone()))
457                            .collect();
458                        if vals.len() >= 6 {
459                            let (tx, ty) = self.transform_point(vals[4], vals[5]);
460                            if let Some((cx, cy)) = self.current_point {
461                                let (cp1x, cp1y) = self.transform_point(vals[0], vals[1]);
462                                let (cp2x, cp2y) = self.transform_point(vals[2], vals[3]);
463                                self.current_path.push(PathSegment::Curve {
464                                    x1: cx,
465                                    y1: cy,
466                                    cp1x,
467                                    cp1y,
468                                    cp2x,
469                                    cp2y,
470                                    x2: tx,
471                                    y2: ty,
472                                });
473                            }
474                            self.current_point = Some((tx, ty));
475                        }
476                    }
477                }
478                "v" => {
479                    if op.operands.len() >= 4 {
480                        let vals: Vec<f64> = op
481                            .operands
482                            .iter()
483                            .filter_map(|o| obj_to_f64(o.clone()))
484                            .collect();
485                        if vals.len() >= 4 {
486                            let (tx, ty) = self.transform_point(vals[2], vals[3]);
487                            if let Some((cx, cy)) = self.current_point {
488                                let (cp2x, cp2y) = self.transform_point(vals[0], vals[1]);
489                                self.current_path.push(PathSegment::Curve {
490                                    x1: cx,
491                                    y1: cy,
492                                    cp1x: cx,
493                                    cp1y: cy,
494                                    cp2x,
495                                    cp2y,
496                                    x2: tx,
497                                    y2: ty,
498                                });
499                            }
500                            self.current_point = Some((tx, ty));
501                        }
502                    }
503                }
504                "y" => {
505                    if op.operands.len() >= 4 {
506                        let vals: Vec<f64> = op
507                            .operands
508                            .iter()
509                            .filter_map(|o| obj_to_f64(o.clone()))
510                            .collect();
511                        if vals.len() >= 4 {
512                            let (tx, ty) = self.transform_point(vals[2], vals[3]);
513                            if let Some((cx, cy)) = self.current_point {
514                                let (cp1x, cp1y) = self.transform_point(vals[0], vals[1]);
515                                self.current_path.push(PathSegment::Curve {
516                                    x1: cx,
517                                    y1: cy,
518                                    cp1x,
519                                    cp1y,
520                                    cp2x: tx,
521                                    cp2y: ty,
522                                    x2: tx,
523                                    y2: ty,
524                                });
525                            }
526                            self.current_point = Some((tx, ty));
527                        }
528                    }
529                }
530                "h" => {
531                    if let (Some((sx, sy)), Some((cx, cy))) =
532                        (self.subpath_start, self.current_point)
533                    {
534                        if (sx - cx).abs() > 0.01 || (sy - cy).abs() > 0.01 {
535                            self.current_path.push(PathSegment::Line {
536                                x1: cx,
537                                y1: cy,
538                                x2: sx,
539                                y2: sy,
540                            });
541                        }
542                        self.current_point = self.subpath_start;
543                    }
544                }
545                "re" => {
546                    if op.operands.len() >= 4 {
547                        let vals: Vec<f64> = op
548                            .operands
549                            .iter()
550                            .filter_map(|o| obj_to_f64(o.clone()))
551                            .collect();
552                        if vals.len() >= 4 {
553                            let (x, y, w, h) = (vals[0], vals[1], vals[2], vals[3]);
554                            let (x1, y1) = self.transform_point(x, y);
555                            let (x2, y2) = self.transform_point(x + w, y);
556                            let (x3, y3) = self.transform_point(x + w, y + h);
557                            let (x4, y4) = self.transform_point(x, y + h);
558                            self.current_path.push(PathSegment::Line { x1, y1, x2, y2 });
559                            self.current_path.push(PathSegment::Line {
560                                x1: x2,
561                                y1: y2,
562                                x2: x3,
563                                y2: y3,
564                            });
565                            self.current_path.push(PathSegment::Line {
566                                x1: x3,
567                                y1: y3,
568                                x2: x4,
569                                y2: y4,
570                            });
571                            self.current_path.push(PathSegment::Line {
572                                x1: x4,
573                                y1: y4,
574                                x2: x1,
575                                y2: y1,
576                            });
577                            self.subpath_start = Some((x1, y1));
578                            self.current_point = Some((x1, y1));
579                        }
580                    }
581                }
582
583                // ── Path painting ──
584                "S" => {
585                    self.classify_and_emit_path();
586                }
587                "s" => {
588                    // close and stroke
589                    self.close_subpath();
590                    self.classify_and_emit_path();
591                }
592                "f" | "F" | "f*" => {
593                    self.classify_and_emit_path();
594                }
595                "B" | "B*" | "b" | "b*" => {
596                    if op.operator.starts_with('b') {
597                        self.close_subpath();
598                    }
599                    self.classify_and_emit_path();
600                }
601                "n" => {
602                    // End path without painting
603                    self.current_path.clear();
604                    self.subpath_start = None;
605                    self.current_point = None;
606                }
607
608                // ── XObject (Do) — Image and Form XObject handling ──
609                "Do" => {
610                    if let Some(name_bytes) = op.operands.first().and_then(obj_name_bytes) {
611                        self.handle_do_operator(doc, resources, &name_bytes, recursion_depth);
612                    }
613                }
614
615                // ── Inline image (BI/ID/EI) ──
616                // lopdf parses BI inline images as a special operation;
617                // the operator is "BI" with the image dict + data as operands.
618                // We create an ImageChunk using the current CTM.
619                "BI" => {
620                    self.emit_inline_image();
621                }
622
623                _ => {
624                    // Ignore unknown/unhandled operators
625                }
626            }
627        }
628    }
629
630    // ── Text chunk creation ──
631
632    fn emit_text_chunk(&mut self, text_bytes: &[u8]) {
633        if text_bytes.is_empty() {
634            return;
635        }
636
637        let font = self
638            .font_cache
639            .get(&self.gs_stack.current.text_state.font_name)
640            .cloned()
641            .unwrap_or_else(|| PdfFont::default_font(&self.gs_stack.current.text_state.font_name));
642        let active_mcid = self.active_mcid();
643
644        if let Some(chunk) = create_text_chunk(
645            text_bytes,
646            &font,
647            &mut self.gs_stack,
648            self.page_number,
649            &mut self.text_index,
650            active_mcid,
651        ) {
652            self.text_chunks.push(chunk);
653        }
654    }
655
656    fn process_tj_array(&mut self, arr: &[Object]) {
657        let font = self
658            .font_cache
659            .get(&self.gs_stack.current.text_state.font_name)
660            .cloned()
661            .unwrap_or_else(|| PdfFont::default_font(&self.gs_stack.current.text_state.font_name));
662        let active_mcid = self.active_mcid();
663
664        for item in arr {
665            match item {
666                Object::String(bytes, _) => {
667                    if let Some(chunk) = create_text_chunk(
668                        bytes,
669                        &font,
670                        &mut self.gs_stack,
671                        self.page_number,
672                        &mut self.text_index,
673                        active_mcid,
674                    ) {
675                        self.text_chunks.push(chunk);
676                    }
677                }
678                _ => {
679                    if let Some(adj) = obj_to_f64(item.clone()) {
680                        let displacement =
681                            -adj / 1000.0 * self.gs_stack.current.text_state.font_size;
682                        self.gs_stack.current.advance_text(displacement);
683                    }
684                }
685            }
686        }
687    }
688
689    // ── Image handling ──
690
691    /// Handle `Do` operator — dispatches to image or form XObject processing.
692    fn handle_do_operator(
693        &mut self,
694        doc: &Document,
695        resources: &Dictionary,
696        name_bytes: &[u8],
697        recursion_depth: u32,
698    ) {
699        // Look up the XObject in /Resources/XObject
700        let xobject_dict = match resources.get(b"XObject") {
701            Ok(obj) => {
702                let resolved = resolve_obj(doc, obj);
703                match resolved.as_dict() {
704                    Ok(d) => d.clone(),
705                    Err(_) => return,
706                }
707            }
708            Err(_) => return,
709        };
710
711        let xobj_ref = match xobject_dict.get(name_bytes) {
712            Ok(obj) => resolve_obj(doc, obj),
713            Err(_) => return,
714        };
715
716        let stream = match xobj_ref.as_stream() {
717            Ok(s) => s.clone(),
718            Err(_) => return,
719        };
720
721        let subtype = stream
722            .dict
723            .get(b"Subtype")
724            .ok()
725            .and_then(|o| match resolve_obj(doc, o) {
726                Object::Name(n) => Some(String::from_utf8_lossy(&n).to_string()),
727                _ => None,
728            });
729
730        match subtype.as_deref() {
731            Some("Image") => {
732                // Image XObject → create ImageChunk with CTM-based bbox
733                self.emit_image_from_ctm();
734            }
735            Some("Form") => {
736                // Form XObject → recursive content stream processing
737                if recursion_depth < MAX_FORM_RECURSION_DEPTH {
738                    self.process_form_xobject(doc, &stream, resources, recursion_depth);
739                }
740            }
741            _ => {}
742        }
743    }
744
745    /// Create an ImageChunk using the current CTM to compute position.
746    /// Image occupies [0,0] to [1,1] in user space before CTM transform.
747    fn emit_image_from_ctm(&mut self) {
748        let ctm = &self.gs_stack.current.ctm;
749
750        // Transform the image unit square corners through CTM
751        let (x0, y0) = ctm.transform_point(0.0, 0.0);
752        let (x1, y1) = ctm.transform_point(1.0, 0.0);
753        let (x2, y2) = ctm.transform_point(1.0, 1.0);
754        let (x3, y3) = ctm.transform_point(0.0, 1.0);
755
756        let min_x = x0.min(x1).min(x2).min(x3);
757        let max_x = x0.max(x1).max(x2).max(x3);
758        let min_y = y0.min(y1).min(y2).min(y3);
759        let max_y = y0.max(y1).max(y2).max(y3);
760
761        // Skip degenerate images
762        if (max_x - min_x).abs() < 0.1 || (max_y - min_y).abs() < 0.1 {
763            return;
764        }
765
766        self.image_index += 1;
767        self.image_chunks.push(ImageChunk {
768            bbox: BoundingBox::new(Some(self.page_number), min_x, min_y, max_x, max_y),
769            index: Some(self.image_index),
770            level: None,
771        });
772    }
773
774    /// Create an ImageChunk for an inline image (BI/ID/EI).
775    fn emit_inline_image(&mut self) {
776        // Inline images also use the current CTM for positioning
777        self.emit_image_from_ctm();
778    }
779
780    /// Process a Form XObject — recursively parse its content stream.
781    fn process_form_xobject(
782        &mut self,
783        doc: &Document,
784        stream: &lopdf::Stream,
785        parent_resources: &Dictionary,
786        recursion_depth: u32,
787    ) {
788        // Get the form's /Matrix (default identity)
789        let form_matrix = get_form_matrix(doc, &stream.dict);
790
791        // Concatenate form matrix with current CTM (like the reference implementation: xFormGraphicsState.getCTM().concatenate(matrix))
792        self.gs_stack.save();
793        let m = form_matrix;
794        self.gs_stack.concat_ctm(m.a, m.b, m.c, m.d, m.e, m.f);
795
796        // Resolve form's own resources, falling back to parent
797        let form_resources = match stream.dict.get(b"Resources") {
798            Ok(obj) => {
799                let resolved = resolve_obj(doc, obj);
800                resolved
801                    .as_dict()
802                    .cloned()
803                    .unwrap_or_else(|_| parent_resources.clone())
804            }
805            Err(_) => parent_resources.clone(),
806        };
807
808        // Decompress the form's content stream
809        let form_content = if stream.dict.get(b"Filter").is_ok() {
810            match stream.decompressed_content() {
811                Ok(data) => data,
812                Err(_) => {
813                    self.gs_stack.restore();
814                    return;
815                }
816            }
817        } else {
818            stream.content.clone()
819        };
820
821        if form_content.is_empty() {
822            self.gs_stack.restore();
823            return;
824        }
825
826        // Parse the form's content stream
827        if let Ok(content) = Content::decode(&form_content) {
828            // Resolve fonts from form's resources and merge with page fonts
829            let form_font_cache = resolve_form_fonts(doc, &form_resources);
830            let mut merged_cache = FontCache::default();
831            // Copy page fonts first
832            for (name, font) in self.font_cache.iter() {
833                merged_cache.insert(name.clone(), font.clone());
834            }
835            // Override with form fonts
836            for (name, font) in form_font_cache.iter() {
837                merged_cache.insert(name.clone(), font.clone());
838            }
839
840            let saved_fc = std::mem::replace(&mut self.font_cache, merged_cache);
841            self.process_operations(
842                doc,
843                &content.operations,
844                &form_resources,
845                recursion_depth + 1,
846            );
847            self.font_cache = saved_fc;
848        }
849
850        self.gs_stack.restore();
851    }
852
853    // ── Extended Graphics State ──
854
855    fn apply_ext_gstate(&mut self, doc: &Document, resources: &Dictionary, name: &[u8]) {
856        let ext_gstate_dict = match resources.get(b"ExtGState") {
857            Ok(obj) => {
858                let resolved = resolve_obj(doc, obj);
859                match resolved.as_dict() {
860                    Ok(d) => d.clone(),
861                    Err(_) => return,
862                }
863            }
864            Err(_) => return,
865        };
866
867        let gs_obj = match ext_gstate_dict.get(name) {
868            Ok(obj) => resolve_obj(doc, obj),
869            Err(_) => return,
870        };
871
872        let gs_dict = match gs_obj.as_dict() {
873            Ok(d) => d,
874            Err(_) => return,
875        };
876
877        // Apply relevant properties from ExtGState
878        // /Font — set font and size
879        if let Ok(font_arr) = gs_dict.get(b"Font") {
880            if let Ok(arr) = resolve_obj(doc, font_arr).as_array() {
881                if arr.len() >= 2 {
882                    if let Object::Name(ref name) = arr[0] {
883                        self.gs_stack.current.text_state.font_name =
884                            String::from_utf8_lossy(name).to_string();
885                    }
886                    if let Some(size) = obj_to_f64(arr[1].clone()) {
887                        self.gs_stack.current.text_state.font_size = size;
888                    }
889                }
890            }
891        }
892
893        // /LW — line width
894        if let Ok(lw) = gs_dict.get(b"LW") {
895            if let Some(w) = obj_to_f64(resolve_obj(doc, lw)) {
896                self.line_width = w;
897            }
898        }
899    }
900
901    // ── Path classification ──
902
903    fn close_subpath(&mut self) {
904        if let (Some((sx, sy)), Some((cx, cy))) = (self.subpath_start, self.current_point) {
905            if (sx - cx).abs() > 0.01 || (sy - cy).abs() > 0.01 {
906                self.current_path.push(PathSegment::Line {
907                    x1: cx,
908                    y1: cy,
909                    x2: sx,
910                    y2: sy,
911                });
912            }
913            self.current_point = self.subpath_start;
914        }
915    }
916
917    fn classify_and_emit_path(&mut self) {
918        let path = std::mem::take(&mut self.current_path);
919        self.subpath_start = None;
920        self.current_point = None;
921
922        if path.is_empty() || self.line_width < MIN_LINE_WIDTH {
923            return;
924        }
925
926        let has_curves = path.iter().any(|s| matches!(s, PathSegment::Curve { .. }));
927
928        // Try to classify individual segments as line chunks
929        if !has_curves && path.len() <= 4 {
930            let mut classified_lines = Vec::new();
931            for seg in &path {
932                if let PathSegment::Line { x1, y1, x2, y2 } = seg {
933                    let dx = (x2 - x1).abs();
934                    let dy = (y2 - y1).abs();
935                    let length = (dx * dx + dy * dy).sqrt();
936
937                    if length < MIN_LINE_WIDTH {
938                        continue;
939                    }
940
941                    let is_horizontal = dy < MAX_LINE_THICKNESS && dx > dy * LINE_ASPECT_RATIO;
942                    let is_vertical = dx < MAX_LINE_THICKNESS && dy > dx * LINE_ASPECT_RATIO;
943
944                    if is_horizontal || is_vertical {
945                        self.line_index += 1;
946                        let min_x = x1.min(*x2);
947                        let max_x = x1.max(*x2);
948                        let min_y = y1.min(*y2);
949                        let max_y = y1.max(*y2);
950                        let half_w = self.line_width / 2.0;
951
952                        classified_lines.push(LineChunk {
953                            bbox: BoundingBox::new(
954                                Some(self.page_number),
955                                min_x - if is_vertical { half_w } else { 0.0 },
956                                min_y - if is_horizontal { half_w } else { 0.0 },
957                                max_x + if is_vertical { half_w } else { 0.0 },
958                                max_y + if is_horizontal { half_w } else { 0.0 },
959                            ),
960                            index: Some(self.line_index),
961                            level: None,
962                            start: Vertex {
963                                x: *x1,
964                                y: *y1,
965                                radius: 0.0,
966                            },
967                            end: Vertex {
968                                x: *x2,
969                                y: *y2,
970                                radius: 0.0,
971                            },
972                            width: self.line_width,
973                            is_horizontal_line: is_horizontal,
974                            is_vertical_line: is_vertical,
975                            is_square: false,
976                        });
977                    }
978                }
979            }
980            if !classified_lines.is_empty() {
981                self.line_chunks.extend(classified_lines);
982                return;
983            }
984        }
985
986        // Rectangle classification (4 line segments forming a box)
987        if !has_curves && path.len() == 4 {
988            if let Some(rect) = try_classify_rectangle(&path, self.line_width, self.page_number) {
989                self.line_index += 1;
990                let mut rect = rect;
991                rect.index = Some(self.line_index);
992                self.line_chunks.push(rect);
993                return;
994            }
995        }
996
997        // Complex path → LineArtChunk
998        if path.len() >= 2 {
999            let mut art_lines = Vec::new();
1000            let mut min_x = f64::MAX;
1001            let mut min_y = f64::MAX;
1002            let mut max_x = f64::MIN;
1003            let mut max_y = f64::MIN;
1004
1005            for seg in &path {
1006                let (sx, sy, ex, ey) = match seg {
1007                    PathSegment::Line { x1, y1, x2, y2 } => (*x1, *y1, *x2, *y2),
1008                    PathSegment::Curve { x1, y1, x2, y2, .. } => (*x1, *y1, *x2, *y2),
1009                };
1010                min_x = min_x.min(sx).min(ex);
1011                min_y = min_y.min(sy).min(ey);
1012                max_x = max_x.max(sx).max(ex);
1013                max_y = max_y.max(sy).max(ey);
1014
1015                self.line_index += 1;
1016                art_lines.push(LineChunk {
1017                    bbox: BoundingBox::new(
1018                        Some(self.page_number),
1019                        sx.min(ex),
1020                        sy.min(ey),
1021                        sx.max(ex),
1022                        sy.max(ey),
1023                    ),
1024                    index: Some(self.line_index),
1025                    level: None,
1026                    start: Vertex {
1027                        x: sx,
1028                        y: sy,
1029                        radius: 0.0,
1030                    },
1031                    end: Vertex {
1032                        x: ex,
1033                        y: ey,
1034                        radius: 0.0,
1035                    },
1036                    width: self.line_width,
1037                    is_horizontal_line: false,
1038                    is_vertical_line: false,
1039                    is_square: false,
1040                });
1041            }
1042
1043            self.line_index += 1;
1044            self.line_art_chunks.push(LineArtChunk {
1045                bbox: BoundingBox::new(Some(self.page_number), min_x, min_y, max_x, max_y),
1046                index: Some(self.line_index),
1047                level: None,
1048                line_chunks: art_lines,
1049            });
1050        }
1051    }
1052
1053    // ── Helpers ──
1054
1055    fn transform_point(&self, x: f64, y: f64) -> (f64, f64) {
1056        self.gs_stack.current.ctm.transform_point(x, y)
1057    }
1058
1059    fn active_mcid(&self) -> Option<i64> {
1060        self.mcid_stack.iter().rev().find_map(|&mcid| mcid)
1061    }
1062}
1063
1064// ═══════════════════════════════════════════════════════════════════
1065// Helper functions (shared with text_extractor for backward compat)
1066// ═══════════════════════════════════════════════════════════════════
1067
1068/// Create a TextChunk from raw text bytes — same logic as text_extractor.
1069fn create_text_chunk(
1070    text_bytes: &[u8],
1071    font: &PdfFont,
1072    state: &mut GraphicsStateStack,
1073    page_number: u32,
1074    chunk_index: &mut usize,
1075    mcid: Option<i64>,
1076) -> Option<TextChunk> {
1077    if text_bytes.is_empty() {
1078        return None;
1079    }
1080
1081    let trm = state.current.text_rendering_matrix();
1082    let start_x = trm.e;
1083    let font_size = trm.font_size_factor();
1084
1085    if font_size < 0.1 {
1086        return None;
1087    }
1088
1089    let mut text = String::new();
1090    let mut total_width = 0.0;
1091    let mut symbol_ends = Vec::new();
1092
1093    let bpc = font.bytes_per_code as usize;
1094    let mut pos = 0;
1095    while pos + bpc <= text_bytes.len() {
1096        let char_code = if bpc == 2 {
1097            ((text_bytes[pos] as u32) << 8) | (text_bytes[pos + 1] as u32)
1098        } else {
1099            text_bytes[pos] as u32
1100        };
1101        pos += bpc;
1102
1103        let decoded = font.decode_char(char_code);
1104        text.push_str(&decoded);
1105
1106        let glyph_w = font.glyph_width(char_code) / 1000.0;
1107        total_width += glyph_w;
1108        symbol_ends.push(start_x + total_width * font_size);
1109
1110        total_width += state.current.text_state.char_spacing / state.current.text_state.font_size;
1111
1112        if decoded == " " {
1113            total_width +=
1114                state.current.text_state.word_spacing / state.current.text_state.font_size;
1115        }
1116    }
1117
1118    let displacement = total_width * state.current.text_state.font_size;
1119    state.current.advance_text(displacement);
1120
1121    if text.is_empty() {
1122        return None;
1123    }
1124
1125    // Compute TRM_after (text rendering matrix after text advancement)
1126    let trm_after = state.current.text_rendering_matrix();
1127
1128    // Use font ascent/descent from font descriptor (glyph-space units, per-mille).
1129    let ascent = font.ascent;
1130    let descent = font.descent;
1131
1132    // TRM matrix: a=scaleX, b=shearY, c=shearX, d=scaleY, e=translateX, f=translateY
1133    let trm_before = &trm;
1134
1135    // The reference bbox formula with 4 branches based on text direction/orientation.
1136    let (x1, x2) = if trm_before.a >= 0.0 && trm_before.c >= 0.0 {
1137        (
1138            trm_before.e + descent * trm_before.c / 1000.0,
1139            trm_after.e + ascent * trm_after.c / 1000.0,
1140        )
1141    } else if trm_before.a < 0.0 && trm_before.c < 0.0 {
1142        (
1143            trm_after.e + ascent * trm_after.c / 1000.0,
1144            trm_before.e + descent * trm_before.c / 1000.0,
1145        )
1146    } else if trm_before.a >= 0.0 {
1147        (
1148            trm_before.e + ascent * trm_before.c / 1000.0,
1149            trm_after.e + descent * trm_after.c / 1000.0,
1150        )
1151    } else {
1152        (
1153            trm_after.e + descent * trm_after.c / 1000.0,
1154            trm_before.e + ascent * trm_before.c / 1000.0,
1155        )
1156    };
1157
1158    let (y1, y2) = if trm_before.d >= 0.0 && trm_before.b >= 0.0 {
1159        (
1160            trm_before.f + descent * trm_before.d / 1000.0,
1161            trm_after.f + ascent * trm_after.d / 1000.0,
1162        )
1163    } else if trm_before.d < 0.0 && trm_before.b < 0.0 {
1164        (
1165            trm_after.f + ascent * trm_after.d / 1000.0,
1166            trm_before.f + descent * trm_before.d / 1000.0,
1167        )
1168    } else if trm_before.d >= 0.0 {
1169        (
1170            trm_after.f + descent * trm_after.d / 1000.0,
1171            trm_before.f + ascent * trm_before.d / 1000.0,
1172        )
1173    } else {
1174        (
1175            trm_before.f + ascent * trm_before.d / 1000.0,
1176            trm_after.f + descent * trm_after.d / 1000.0,
1177        )
1178    };
1179
1180    let bbox = BoundingBox::new(Some(page_number), x1, y1, x2, y2);
1181
1182    let text_format = if state.current.text_state.rise > font_size * 0.1 {
1183        crate::models::enums::TextFormat::Superscript
1184    } else if state.current.text_state.rise < -font_size * 0.1 {
1185        crate::models::enums::TextFormat::Subscript
1186    } else {
1187        crate::models::enums::TextFormat::Normal
1188    };
1189
1190    *chunk_index += 1;
1191
1192    let fc = &state.current.fill_color;
1193    let font_color = format!(
1194        "[{}]",
1195        fc.iter()
1196            .map(|v| {
1197                // The reference veraPDF stores colors as the reference implementation float (f32), then serializes
1198                // via double's toString(), giving full f64 representation of the f32 value.
1199                // We replicate: parse as f64 (from lopdf) → round to f32 → back to f64.
1200                let f32_val = *v as f32;
1201                let f64_repr = f32_val as f64;
1202                if f32_val.fract() == 0.0 {
1203                    format!("{:.1}", f64_repr)
1204                } else {
1205                    format!("{}", f64_repr)
1206                }
1207            })
1208            .collect::<Vec<_>>()
1209            .join(", ")
1210    );
1211
1212    Some(TextChunk {
1213        value: text,
1214        bbox,
1215        font_name: font.base_font.clone(),
1216        font_size,
1217        font_weight: font.weight,
1218        italic_angle: font.italic_angle,
1219        font_color,
1220        contrast_ratio: 21.0,
1221        symbol_ends,
1222        text_format,
1223        text_type: crate::models::enums::TextType::Regular,
1224        pdf_layer: crate::models::enums::PdfLayer::Main,
1225        ocg_visible: true,
1226        index: Some(*chunk_index),
1227        page_number: Some(page_number),
1228        level: None,
1229        mcid,
1230    })
1231}
1232
/// A segment in a path being constructed.
///
/// Coordinates are stored exactly as the path-building code supplies them;
/// `(x1, y1)` is the segment's start point and `(x2, y2)` its end point.
#[derive(Debug, Clone)]
enum PathSegment {
    /// Straight segment from `(x1, y1)` to `(x2, y2)`.
    Line {
        x1: f64,
        y1: f64,
        x2: f64,
        y2: f64,
    },
    /// Cubic curve from `(x1, y1)` to `(x2, y2)` with control points
    /// `(cp1x, cp1y)` and `(cp2x, cp2y)`.
    // The classification code in this file reads only the end points of a
    // curve, so the control-point fields are presumably what the lint
    // allowance below is for.
    #[allow(dead_code)]
    Curve {
        x1: f64,
        y1: f64,
        cp1x: f64,
        cp1y: f64,
        cp2x: f64,
        cp2y: f64,
        x2: f64,
        y2: f64,
    },
}
1254
1255/// Try to classify 4 line segments as a rectangle.
1256fn try_classify_rectangle(
1257    segments: &[PathSegment],
1258    _line_width: f64,
1259    page_number: u32,
1260) -> Option<LineChunk> {
1261    let mut min_x = f64::MAX;
1262    let mut min_y = f64::MAX;
1263    let mut max_x = f64::MIN;
1264    let mut max_y = f64::MIN;
1265
1266    for seg in segments {
1267        if let PathSegment::Line { x1, y1, x2, y2 } = seg {
1268            min_x = min_x.min(*x1).min(*x2);
1269            min_y = min_y.min(*y1).min(*y2);
1270            max_x = max_x.max(*x1).max(*x2);
1271            max_y = max_y.max(*y1).max(*y2);
1272        } else {
1273            return None;
1274        }
1275    }
1276
1277    let w = max_x - min_x;
1278    let h = max_y - min_y;
1279
1280    if w < MIN_LINE_WIDTH || h < MIN_LINE_WIDTH {
1281        return None;
1282    }
1283
1284    let is_square = (w - h).abs() / w.max(h) < 0.3;
1285
1286    Some(LineChunk {
1287        bbox: BoundingBox::new(Some(page_number), min_x, min_y, max_x, max_y),
1288        index: None,
1289        level: None,
1290        start: Vertex {
1291            x: min_x,
1292            y: min_y,
1293            radius: 0.0,
1294        },
1295        end: Vertex {
1296            x: max_x,
1297            y: max_y,
1298            radius: 0.0,
1299        },
1300        width: w.min(h),
1301        is_horizontal_line: w > h * LINE_ASPECT_RATIO,
1302        is_vertical_line: h > w * LINE_ASPECT_RATIO,
1303        is_square,
1304    })
1305}
1306
1307/// Get the /Matrix from a Form XObject dictionary (defaults to identity).
1308fn get_form_matrix(doc: &Document, dict: &Dictionary) -> Matrix {
1309    match dict.get(b"Matrix") {
1310        Ok(obj) => {
1311            let resolved = resolve_obj(doc, obj);
1312            if let Ok(arr) = resolved.as_array() {
1313                let vals: Vec<f64> = arr.iter().filter_map(|o| obj_to_f64(o.clone())).collect();
1314                if vals.len() == 6 {
1315                    return Matrix {
1316                        a: vals[0],
1317                        b: vals[1],
1318                        c: vals[2],
1319                        d: vals[3],
1320                        e: vals[4],
1321                        f: vals[5],
1322                    };
1323                }
1324            }
1325            Matrix::identity()
1326        }
1327        Err(_) => Matrix::identity(),
1328    }
1329}
1330
1331/// Resolve fonts from a resources dictionary (for Form XObjects).
1332fn resolve_form_fonts(doc: &Document, resources: &Dictionary) -> FontCache {
1333    let font_dict = match resources.get(b"Font") {
1334        Ok(obj) => {
1335            let resolved = resolve_obj(doc, obj);
1336            match resolved.as_dict() {
1337                Ok(d) => d.clone(),
1338                Err(_) => return FontCache::default(),
1339            }
1340        }
1341        Err(_) => return FontCache::default(),
1342    };
1343
1344    let mut cache = FontCache::default();
1345    for (name_bytes, font_ref) in font_dict.iter() {
1346        let font_name = String::from_utf8_lossy(name_bytes).to_string();
1347        let font_obj = resolve_obj(doc, font_ref);
1348        if let Ok(font_dict) = font_obj.as_dict() {
1349            let font = super::font::resolve_font_dict(doc, &font_name, font_dict);
1350            cache.insert(font_name, font);
1351        }
1352    }
1353    cache
1354}
1355
1356// ── Utility functions ──
1357
1358fn extract_string_bytes(obj: &Object) -> Option<Vec<u8>> {
1359    match obj {
1360        Object::String(bytes, _) => Some(bytes.clone()),
1361        _ => None,
1362    }
1363}
1364
1365fn extract_mcid_from_bdc(operands: &[Object]) -> Option<i64> {
1366    if operands.len() < 2 {
1367        return None;
1368    }
1369    match &operands[1] {
1370        Object::Dictionary(dict) => {
1371            if let Ok(Object::Integer(n)) = dict.get(b"MCID") {
1372                return Some(*n);
1373            }
1374            None
1375        }
1376        _ => None,
1377    }
1378}
1379
1380fn obj_to_f64(obj: Object) -> Option<f64> {
1381    match obj {
1382        Object::Integer(i) => Some(i as f64),
1383        Object::Real(f) => Some(f),
1384        _ => None,
1385    }
1386}
1387
1388fn obj_to_name(obj: &Object) -> String {
1389    match obj {
1390        Object::Name(bytes) => String::from_utf8_lossy(bytes).to_string(),
1391        _ => String::new(),
1392    }
1393}
1394
1395/// Extract raw name bytes from a PDF Name object.
1396fn obj_name_bytes(obj: &Object) -> Option<Vec<u8>> {
1397    match obj {
1398        Object::Name(bytes) => Some(bytes.clone()),
1399        _ => None,
1400    }
1401}
1402
/// Number of colour components for the colour space called `name`.
///
/// Gray spaces have 1 component and CMYK spaces 4; every other name,
/// including unrecognised ones, is treated as a 3-component space.
fn color_space_components(name: &str) -> u8 {
    const GRAY_SPACES: [&str; 3] = ["DeviceGray", "CalGray", "G"];
    const CMYK_SPACES: [&str; 2] = ["DeviceCMYK", "CMYK"];

    if GRAY_SPACES.contains(&name) {
        1
    } else if CMYK_SPACES.contains(&name) {
        4
    } else {
        // "DeviceRGB", "CalRGB", "RGB" and anything unknown.
        3
    }
}
1411
/// Default color for a given color space component count (PDF spec 8.6.5.3).
///
/// Four components yield CMYK black, three yield RGB black, and any other
/// count yields single-component gray black.
fn default_color_for_space(components: u8) -> Vec<f64> {
    if components == 4 {
        // CMYK black: only the K channel is set.
        vec![0.0, 0.0, 0.0, 1.0]
    } else if components == 3 {
        // RGB black
        vec![0.0; 3]
    } else {
        // Gray black
        vec![0.0]
    }
}
1420
1421fn resolve_obj(doc: &Document, obj: &Object) -> Object {
1422    match obj {
1423        Object::Reference(id) => doc.get_object(*id).cloned().unwrap_or(Object::Null),
1424        other => other.clone(),
1425    }
1426}
1427
1428#[cfg(test)]
1429mod tests {
1430    use super::*;
1431    use lopdf::content::Operation;
1432    use lopdf::{dictionary, Stream};
1433
1434    fn create_test_pdf_with_text() -> Document {
1435        let mut doc = Document::with_version("1.5");
1436        let pages_id = doc.new_object_id();
1437
1438        let font_id = doc.add_object(dictionary! {
1439            "Type" => "Font",
1440            "Subtype" => "Type1",
1441            "BaseFont" => "Helvetica",
1442        });
1443
1444        let resources_id = doc.add_object(dictionary! {
1445            "Font" => dictionary! {
1446                "F1" => font_id,
1447            },
1448        });
1449
1450        let content = Content {
1451            operations: vec![
1452                Operation::new("BT", vec![]),
1453                Operation::new("Tf", vec!["F1".into(), 12.into()]),
1454                Operation::new("Td", vec![100.into(), 700.into()]),
1455                Operation::new("Tj", vec![Object::string_literal("Hello World!")]),
1456                Operation::new("ET", vec![]),
1457            ],
1458        };
1459
1460        let content_id = doc.add_object(Stream::new(dictionary! {}, content.encode().unwrap()));
1461
1462        let page_id = doc.add_object(dictionary! {
1463            "Type" => "Page",
1464            "Parent" => pages_id,
1465            "Contents" => content_id,
1466            "Resources" => resources_id,
1467            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
1468        });
1469
1470        let pages = dictionary! {
1471            "Type" => "Pages",
1472            "Kids" => vec![page_id.into()],
1473            "Count" => 1,
1474        };
1475        doc.objects.insert(pages_id, Object::Dictionary(pages));
1476
1477        let catalog_id = doc.add_object(dictionary! {
1478            "Type" => "Catalog",
1479            "Pages" => pages_id,
1480        });
1481        doc.trailer.set("Root", catalog_id);
1482        doc
1483    }
1484
1485    #[test]
1486    fn test_unified_text_extraction() {
1487        let doc = create_test_pdf_with_text();
1488        let pages = doc.get_pages();
1489        let (&page_num, &page_id) = pages.iter().next().unwrap();
1490
1491        let chunks = extract_page_chunks(&doc, page_num, page_id).unwrap();
1492        assert!(!chunks.text_chunks.is_empty(), "Expected text chunks");
1493        assert!(
1494            chunks.text_chunks[0].value.contains("Hello"),
1495            "Expected 'Hello' in text"
1496        );
1497        assert!(chunks.image_chunks.is_empty(), "No images expected");
1498    }
1499
1500    #[test]
1501    fn test_image_from_do_operator() {
1502        let mut doc = Document::with_version("1.5");
1503        let pages_id = doc.new_object_id();
1504
1505        // Create image XObject
1506        let img_stream = Stream::new(
1507            dictionary! {
1508                "Type" => "XObject",
1509                "Subtype" => "Image",
1510                "Width" => 200,
1511                "Height" => 100,
1512                "ColorSpace" => "DeviceRGB",
1513                "BitsPerComponent" => 8,
1514            },
1515            vec![0u8; 100],
1516        );
1517        let img_id = doc.add_object(img_stream);
1518
1519        let resources_id = doc.add_object(dictionary! {
1520            "XObject" => dictionary! {
1521                "Im1" => img_id,
1522            },
1523        });
1524
1525        // Content stream: scale image and place at (72, 500)
1526        let content = Content {
1527            operations: vec![
1528                Operation::new("q", vec![]),
1529                Operation::new(
1530                    "cm",
1531                    vec![
1532                        Object::Real(200.0), // a: width
1533                        0.into(),
1534                        0.into(),
1535                        Object::Real(100.0), // d: height
1536                        Object::Real(72.0),  // e: x position
1537                        Object::Real(500.0), // f: y position
1538                    ],
1539                ),
1540                Operation::new("Do", vec!["Im1".into()]),
1541                Operation::new("Q", vec![]),
1542            ],
1543        };
1544
1545        let content_id = doc.add_object(Stream::new(dictionary! {}, content.encode().unwrap()));
1546
1547        let page_id = doc.add_object(dictionary! {
1548            "Type" => "Page",
1549            "Parent" => pages_id,
1550            "Contents" => content_id,
1551            "Resources" => resources_id,
1552            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
1553        });
1554
1555        let pages = dictionary! {
1556            "Type" => "Pages",
1557            "Kids" => vec![page_id.into()],
1558            "Count" => 1,
1559        };
1560        doc.objects.insert(pages_id, Object::Dictionary(pages));
1561
1562        let catalog_id = doc.add_object(dictionary! {
1563            "Type" => "Catalog",
1564            "Pages" => pages_id,
1565        });
1566        doc.trailer.set("Root", catalog_id);
1567
1568        let pages = doc.get_pages();
1569        let (&page_num, &page_id) = pages.iter().next().unwrap();
1570
1571        let chunks = extract_page_chunks(&doc, page_num, page_id).unwrap();
1572        assert_eq!(chunks.image_chunks.len(), 1, "Expected 1 image chunk");
1573
1574        let img = &chunks.image_chunks[0];
1575        // Image should be at (72, 500) with size (200, 100)
1576        assert!(
1577            (img.bbox.left_x - 72.0).abs() < 1.0,
1578            "Expected left_x ~72, got {}",
1579            img.bbox.left_x
1580        );
1581        assert!(
1582            (img.bbox.bottom_y - 500.0).abs() < 1.0,
1583            "Expected bottom_y ~500, got {}",
1584            img.bbox.bottom_y
1585        );
1586        assert!(
1587            (img.bbox.right_x - 272.0).abs() < 1.0,
1588            "Expected right_x ~272, got {}",
1589            img.bbox.right_x
1590        );
1591        assert!(
1592            (img.bbox.top_y - 600.0).abs() < 1.0,
1593            "Expected top_y ~600, got {}",
1594            img.bbox.top_y
1595        );
1596    }
1597
1598    #[test]
1599    fn test_form_xobject_recursive() {
1600        let mut doc = Document::with_version("1.5");
1601        let pages_id = doc.new_object_id();
1602
1603        // Create a font
1604        let font_id = doc.add_object(dictionary! {
1605            "Type" => "Font",
1606            "Subtype" => "Type1",
1607            "BaseFont" => "Helvetica",
1608        });
1609
1610        // Create a Form XObject with text inside
1611        let form_content = Content {
1612            operations: vec![
1613                Operation::new("BT", vec![]),
1614                Operation::new("Tf", vec!["F1".into(), 10.into()]),
1615                Operation::new("Td", vec![0.into(), 0.into()]),
1616                Operation::new("Tj", vec![Object::string_literal("Form Text")]),
1617                Operation::new("ET", vec![]),
1618            ],
1619        };
1620
1621        let form_stream = Stream::new(
1622            dictionary! {
1623                "Type" => "XObject",
1624                "Subtype" => "Form",
1625                "BBox" => vec![0.into(), 0.into(), 200.into(), 50.into()],
1626                "Resources" => dictionary! {
1627                    "Font" => dictionary! {
1628                        "F1" => font_id,
1629                    },
1630                },
1631            },
1632            form_content.encode().unwrap(),
1633        );
1634        let form_id = doc.add_object(form_stream);
1635
1636        let resources_id = doc.add_object(dictionary! {
1637            "Font" => dictionary! {
1638                "F1" => font_id,
1639            },
1640            "XObject" => dictionary! {
1641                "Fm1" => form_id,
1642            },
1643        });
1644
1645        // Page content stream: invoke the form XObject
1646        let page_content = Content {
1647            operations: vec![
1648                Operation::new("q", vec![]),
1649                Operation::new(
1650                    "cm",
1651                    vec![
1652                        1.into(),
1653                        0.into(),
1654                        0.into(),
1655                        1.into(),
1656                        Object::Real(50.0),
1657                        Object::Real(400.0),
1658                    ],
1659                ),
1660                Operation::new("Do", vec!["Fm1".into()]),
1661                Operation::new("Q", vec![]),
1662            ],
1663        };
1664
1665        let content_id =
1666            doc.add_object(Stream::new(dictionary! {}, page_content.encode().unwrap()));
1667
1668        let page_id = doc.add_object(dictionary! {
1669            "Type" => "Page",
1670            "Parent" => pages_id,
1671            "Contents" => content_id,
1672            "Resources" => resources_id,
1673            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
1674        });
1675
1676        let pages = dictionary! {
1677            "Type" => "Pages",
1678            "Kids" => vec![page_id.into()],
1679            "Count" => 1,
1680        };
1681        doc.objects.insert(pages_id, Object::Dictionary(pages));
1682
1683        let catalog_id = doc.add_object(dictionary! {
1684            "Type" => "Catalog",
1685            "Pages" => pages_id,
1686        });
1687        doc.trailer.set("Root", catalog_id);
1688
1689        let pages = doc.get_pages();
1690        let (&page_num, &page_id) = pages.iter().next().unwrap();
1691
1692        let chunks = extract_page_chunks(&doc, page_num, page_id).unwrap();
1693        assert!(
1694            !chunks.text_chunks.is_empty(),
1695            "Expected text from Form XObject"
1696        );
1697        assert!(
1698            chunks.text_chunks[0].value.contains("Form"),
1699            "Expected 'Form' text, got: '{}'",
1700            chunks.text_chunks[0].value
1701        );
1702    }
1703
1704    #[test]
1705    fn test_line_extraction_unified() {
1706        let mut doc = Document::with_version("1.5");
1707        let pages_id = doc.new_object_id();
1708
1709        let content = Content {
1710            operations: vec![
1711                Operation::new("w", vec![Object::Real(1.0)]),
1712                Operation::new("m", vec![72.into(), 400.into()]),
1713                Operation::new("l", vec![500.into(), 400.into()]),
1714                Operation::new("S", vec![]),
1715            ],
1716        };
1717
1718        let content_id = doc.add_object(Stream::new(dictionary! {}, content.encode().unwrap()));
1719
1720        let page_id = doc.add_object(dictionary! {
1721            "Type" => "Page",
1722            "Parent" => pages_id,
1723            "Contents" => content_id,
1724            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
1725        });
1726
1727        let pages = dictionary! {
1728            "Type" => "Pages",
1729            "Kids" => vec![page_id.into()],
1730            "Count" => 1,
1731        };
1732        doc.objects.insert(pages_id, Object::Dictionary(pages));
1733
1734        let catalog_id = doc.add_object(dictionary! {
1735            "Type" => "Catalog",
1736            "Pages" => pages_id,
1737        });
1738        doc.trailer.set("Root", catalog_id);
1739
1740        let pages = doc.get_pages();
1741        let (&page_num, &page_id) = pages.iter().next().unwrap();
1742
1743        let chunks = extract_page_chunks(&doc, page_num, page_id).unwrap();
1744        assert_eq!(chunks.line_chunks.len(), 1, "Expected 1 horizontal line");
1745        assert!(chunks.line_chunks[0].is_horizontal_line);
1746    }
1747}