Skip to main content

edgeparse_core/pdf/
text_extractor.rs

1//! Text extraction from PDF content streams.
2//!
3//! Walks content stream operations and produces TextChunks with position,
4//! font, and Unicode text information.
5
6use lopdf::{content::Content, Document, Object};
7
8use crate::models::bbox::BoundingBox;
9use crate::models::chunks::TextChunk;
10use crate::EdgePdfError;
11
12use super::font::{resolve_page_fonts, FontCache, PdfFont};
13use super::graphics_state::GraphicsStateStack;
14
15/// Extract text chunks from a single page.
16///
17/// Returns a vector of positioned text chunks with font and bounding box info.
18pub fn extract_text_chunks(
19    doc: &Document,
20    page_number: u32,
21    page_id: lopdf::ObjectId,
22) -> Result<Vec<TextChunk>, EdgePdfError> {
23    let font_cache = resolve_page_fonts(doc, page_id);
24    let page_dict = doc
25        .get_object(page_id)
26        .map_err(|e| EdgePdfError::PipelineError {
27            stage: 1,
28            message: format!("Failed to get page {}: {}", page_number, e),
29        })?
30        .as_dict()
31        .map_err(|e| EdgePdfError::PipelineError {
32            stage: 1,
33            message: format!("Page {} is not a dictionary: {}", page_number, e),
34        })?
35        .clone();
36
37    // Get MediaBox for page dimensions
38    let media_box = get_media_box(doc, &page_dict);
39
40    // Get content stream(s)
41    let content_data = get_page_content(doc, &page_dict)?;
42    if content_data.is_empty() {
43        return Ok(Vec::new());
44    }
45
46    // Parse content stream operations
47    let content = Content::decode(&content_data).map_err(|e| EdgePdfError::PipelineError {
48        stage: 1,
49        message: format!(
50            "Failed to decode content stream for page {}: {}",
51            page_number, e
52        ),
53    })?;
54
55    // Process operations
56    let chunks = process_operations(&content.operations, &font_cache, page_number, &media_box);
57
58    Ok(chunks)
59}
60
61/// Get page content stream data.
62pub(crate) fn get_page_content(
63    doc: &Document,
64    page_dict: &lopdf::Dictionary,
65) -> Result<Vec<u8>, EdgePdfError> {
66    let contents = match page_dict.get(b"Contents") {
67        Ok(c) => c.clone(),
68        Err(_) => return Ok(Vec::new()),
69    };
70
71    // Helper: collect stream bytes from an array of items (each item may be a
72    // Reference-to-Stream or a direct Stream).
73    fn collect_array(doc: &Document, arr: &[Object]) -> Result<Vec<u8>, EdgePdfError> {
74        let mut data = Vec::new();
75        for item in arr {
76            let obj = match item {
77                Object::Reference(id) => {
78                    doc.get_object(*id)
79                        .map_err(|e| EdgePdfError::PipelineError {
80                            stage: 1,
81                            message: format!("Failed to resolve content array item: {}", e),
82                        })?
83                }
84                other => other,
85            };
86            if let Object::Stream(ref stream) = obj {
87                if let Ok(content) = get_stream_data(stream) {
88                    data.extend_from_slice(&content);
89                    data.push(b' '); // Separate stream contents
90                }
91            }
92        }
93        Ok(data)
94    }
95
96    match contents {
97        Object::Reference(id) => {
98            let obj = doc
99                .get_object(id)
100                .map_err(|e| EdgePdfError::PipelineError {
101                    stage: 1,
102                    message: format!("Failed to get content object: {}", e),
103                })?;
104            match obj {
105                // Most common: Contents → single stream
106                Object::Stream(ref stream) => get_stream_data(stream),
107                // Also valid: Contents → Reference → Array of stream references
108                Object::Array(ref arr) => collect_array(doc, arr),
109                _ => Ok(Vec::new()),
110            }
111        }
112        Object::Array(ref arr) => collect_array(doc, arr),
113        _ => Ok(Vec::new()),
114    }
115}
116
117/// Get data from a stream, handling both compressed and uncompressed cases.
118fn get_stream_data(stream: &lopdf::Stream) -> Result<Vec<u8>, EdgePdfError> {
119    // If no Filter key, the content is already uncompressed
120    if stream.dict.get(b"Filter").is_err() {
121        return Ok(stream.content.clone());
122    }
123    stream
124        .decompressed_content()
125        .map_err(|e| EdgePdfError::PipelineError {
126            stage: 1,
127            message: format!("Failed to decompress content stream: {}", e),
128        })
129}
130
131/// Get page MediaBox.
132fn get_media_box(doc: &Document, page_dict: &lopdf::Dictionary) -> BoundingBox {
133    if let Ok(mb) = page_dict.get(b"MediaBox") {
134        if let Ok(arr) = resolve_obj(doc, mb).as_array() {
135            if arr.len() == 4 {
136                let vals: Vec<f64> = arr
137                    .iter()
138                    .filter_map(|o| obj_to_f64(resolve_obj(doc, o)))
139                    .collect();
140                if vals.len() == 4 {
141                    return BoundingBox::new(None, vals[0], vals[1], vals[2], vals[3]);
142                }
143            }
144        }
145    }
146    // Default A4 page
147    BoundingBox::new(None, 0.0, 0.0, 595.0, 842.0)
148}
149
150/// Process content stream operations and extract text chunks.
151fn process_operations(
152    operations: &[lopdf::content::Operation],
153    font_cache: &FontCache,
154    page_number: u32,
155    _media_box: &BoundingBox,
156) -> Vec<TextChunk> {
157    let mut chunks = Vec::new();
158    let mut state = GraphicsStateStack::default();
159    let mut chunk_index: usize = 0;
160    // Track marked content sequence IDs (BDC/BMC/EMC)
161    let mut mcid_stack: Vec<Option<i64>> = Vec::new();
162
163    for op in operations {
164        match op.operator.as_str() {
165            // Marked content operators — track MCID for structure tree linkage
166            "BMC" => {
167                // Begin Marked Content (no properties) — push None
168                mcid_stack.push(None);
169            }
170            "BDC" => {
171                // Begin Marked Content with properties — extract MCID if present
172                let mcid = extract_mcid_from_bdc(&op.operands);
173                mcid_stack.push(mcid);
174            }
175            "EMC" => {
176                // End Marked Content — pop the stack
177                mcid_stack.pop();
178            }
179
180            // Graphics state operators
181            "q" => state.save(),
182            "Q" => state.restore(),
183            "cm" => {
184                if op.operands.len() == 6 {
185                    let vals: Vec<f64> = op
186                        .operands
187                        .iter()
188                        .filter_map(|o| obj_to_f64(o.clone()))
189                        .collect();
190                    if vals.len() == 6 {
191                        state.concat_ctm(vals[0], vals[1], vals[2], vals[3], vals[4], vals[5]);
192                    }
193                }
194            }
195
196            // Text state operators
197            "BT" => state.current.begin_text(),
198            "ET" => {} // End text object
199
200            "Tf" => {
201                // Set font and size
202                if op.operands.len() == 2 {
203                    if let Object::Name(ref name) = op.operands[0] {
204                        state.current.text_state.font_name =
205                            String::from_utf8_lossy(name).to_string();
206                    }
207                    if let Some(size) = obj_to_f64(op.operands[1].clone()) {
208                        state.current.text_state.font_size = size;
209                    }
210                }
211            }
212
213            "Tc" => {
214                if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
215                    state.current.text_state.char_spacing = v;
216                }
217            }
218
219            "Tw" => {
220                if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
221                    state.current.text_state.word_spacing = v;
222                }
223            }
224
225            "Tz" => {
226                if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
227                    state.current.text_state.horizontal_scaling = v;
228                }
229            }
230
231            "TL" => {
232                if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
233                    state.current.text_state.leading = v;
234                }
235            }
236
237            "Ts" => {
238                if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
239                    state.current.text_state.rise = v;
240                }
241            }
242
243            "Tr" => {
244                if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
245                    state.current.text_state.render_mode = v as i32;
246                }
247            }
248
249            // Text positioning operators
250            "Td" => {
251                if op.operands.len() == 2 {
252                    let tx = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
253                    let ty = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
254                    state.current.translate_text(tx, ty);
255                }
256            }
257
258            "TD" => {
259                // Same as Td but also sets leading
260                if op.operands.len() == 2 {
261                    let tx = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
262                    let ty = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
263                    state.current.text_state.leading = -ty;
264                    state.current.translate_text(tx, ty);
265                }
266            }
267
268            "Tm" => {
269                if op.operands.len() == 6 {
270                    let vals: Vec<f64> = op
271                        .operands
272                        .iter()
273                        .filter_map(|o| obj_to_f64(o.clone()))
274                        .collect();
275                    if vals.len() == 6 {
276                        state
277                            .current
278                            .set_text_matrix(vals[0], vals[1], vals[2], vals[3], vals[4], vals[5]);
279                    }
280                }
281            }
282
283            "T*" => {
284                state.current.next_line();
285            }
286
287            // Text showing operators
288            "Tj" => {
289                if let Some(text_bytes) = op.operands.first().and_then(extract_string_bytes) {
290                    let font = font_cache
291                        .get(&state.current.text_state.font_name)
292                        .cloned()
293                        .unwrap_or_else(|| {
294                            PdfFont::default_font(&state.current.text_state.font_name)
295                        });
296                    let active_mcid = active_mcid(&mcid_stack);
297
298                    if let Some(chunk) = create_text_chunk(
299                        &text_bytes,
300                        &font,
301                        &mut state,
302                        page_number,
303                        &mut chunk_index,
304                        active_mcid,
305                    ) {
306                        chunks.push(chunk);
307                    }
308                }
309            }
310
311            "TJ" => {
312                // Array of strings and positioning adjustments
313                if let Some(Object::Array(ref arr)) = op.operands.first() {
314                    let font = font_cache
315                        .get(&state.current.text_state.font_name)
316                        .cloned()
317                        .unwrap_or_else(|| {
318                            PdfFont::default_font(&state.current.text_state.font_name)
319                        });
320                    let active_mcid = active_mcid(&mcid_stack);
321
322                    for item in arr {
323                        match item {
324                            Object::String(bytes, _) => {
325                                if let Some(chunk) = create_text_chunk(
326                                    bytes,
327                                    &font,
328                                    &mut state,
329                                    page_number,
330                                    &mut chunk_index,
331                                    active_mcid,
332                                ) {
333                                    chunks.push(chunk);
334                                }
335                            }
336                            _ => {
337                                // Numeric adjustment: displacement in thousandths of text space
338                                if let Some(adj) = obj_to_f64(item.clone()) {
339                                    let displacement =
340                                        -adj / 1000.0 * state.current.text_state.font_size;
341                                    state.current.advance_text(displacement);
342                                }
343                            }
344                        }
345                    }
346                }
347            }
348
349            "'" => {
350                // Move to next line, show text
351                state.current.next_line();
352                if let Some(text_bytes) = op.operands.first().and_then(extract_string_bytes) {
353                    let font = font_cache
354                        .get(&state.current.text_state.font_name)
355                        .cloned()
356                        .unwrap_or_else(|| {
357                            PdfFont::default_font(&state.current.text_state.font_name)
358                        });
359                    let active_mcid = active_mcid(&mcid_stack);
360                    if let Some(chunk) = create_text_chunk(
361                        &text_bytes,
362                        &font,
363                        &mut state,
364                        page_number,
365                        &mut chunk_index,
366                        active_mcid,
367                    ) {
368                        chunks.push(chunk);
369                    }
370                }
371            }
372
373            "\"" => {
374                // Set spacing, move to next line, show text
375                if op.operands.len() == 3 {
376                    if let Some(aw) = obj_to_f64(op.operands[0].clone()) {
377                        state.current.text_state.word_spacing = aw;
378                    }
379                    if let Some(ac) = obj_to_f64(op.operands[1].clone()) {
380                        state.current.text_state.char_spacing = ac;
381                    }
382                    state.current.next_line();
383                    if let Some(text_bytes) = extract_string_bytes(&op.operands[2]) {
384                        let font = font_cache
385                            .get(&state.current.text_state.font_name)
386                            .cloned()
387                            .unwrap_or_else(|| {
388                                PdfFont::default_font(&state.current.text_state.font_name)
389                            });
390                        let active_mcid = active_mcid(&mcid_stack);
391                        if let Some(chunk) = create_text_chunk(
392                            &text_bytes,
393                            &font,
394                            &mut state,
395                            page_number,
396                            &mut chunk_index,
397                            active_mcid,
398                        ) {
399                            chunks.push(chunk);
400                        }
401                    }
402                }
403            }
404
405            // Color operators — preserve original color space components (reference approach)
406            "g" => {
407                if let Some(gray) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
408                    state.current.fill_color = vec![gray];
409                    state.current.fill_color_space_components = 1;
410                }
411            }
412            "G" => {
413                if let Some(gray) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
414                    state.current.stroke_color = vec![gray];
415                    state.current.stroke_color_space_components = 1;
416                }
417            }
418            "rg" => {
419                if op.operands.len() == 3 {
420                    let r = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
421                    let g = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
422                    let b = obj_to_f64(op.operands[2].clone()).unwrap_or(0.0);
423                    state.current.fill_color = vec![r, g, b];
424                    state.current.fill_color_space_components = 3;
425                }
426            }
427            "RG" => {
428                if op.operands.len() == 3 {
429                    let r = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
430                    let g = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
431                    let b = obj_to_f64(op.operands[2].clone()).unwrap_or(0.0);
432                    state.current.stroke_color = vec![r, g, b];
433                    state.current.stroke_color_space_components = 3;
434                }
435            }
436            "k" => {
437                if op.operands.len() == 4 {
438                    let c = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
439                    let m = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
440                    let y = obj_to_f64(op.operands[2].clone()).unwrap_or(0.0);
441                    let k = obj_to_f64(op.operands[3].clone()).unwrap_or(0.0);
442                    state.current.fill_color = vec![c, m, y, k];
443                    state.current.fill_color_space_components = 4;
444                }
445            }
446            "K" => {
447                if op.operands.len() == 4 {
448                    let c = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
449                    let m = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
450                    let y = obj_to_f64(op.operands[2].clone()).unwrap_or(0.0);
451                    let k = obj_to_f64(op.operands[3].clone()).unwrap_or(0.0);
452                    state.current.stroke_color = vec![c, m, y, k];
453                    state.current.stroke_color_space_components = 4;
454                }
455            }
456            // cs/CS — set color space; sc/SC/scn/SCN — set color in current space
457            "cs" => {
458                if let Some(name) = op.operands.first() {
459                    let cs_name = obj_to_name(name);
460                    state.current.fill_color_space_components = color_space_components(&cs_name);
461                }
462            }
463            "CS" => {
464                if let Some(name) = op.operands.first() {
465                    let cs_name = obj_to_name(name);
466                    state.current.stroke_color_space_components = color_space_components(&cs_name);
467                }
468            }
469            "sc" | "scn" => {
470                let components: Vec<f64> = op
471                    .operands
472                    .iter()
473                    .filter_map(|o| obj_to_f64(o.clone()))
474                    .collect();
475                if !components.is_empty() {
476                    state.current.fill_color = components;
477                }
478            }
479            "SC" | "SCN" => {
480                let components: Vec<f64> = op
481                    .operands
482                    .iter()
483                    .filter_map(|o| obj_to_f64(o.clone()))
484                    .collect();
485                if !components.is_empty() {
486                    state.current.stroke_color = components;
487                }
488            }
489
490            _ => {
491                // Ignore unknown operators
492            }
493        }
494    }
495
496    chunks
497}
498
499/// Create a TextChunk from raw text bytes using the current graphics state.
500fn create_text_chunk(
501    text_bytes: &[u8],
502    font: &PdfFont,
503    state: &mut GraphicsStateStack,
504    page_number: u32,
505    chunk_index: &mut usize,
506    mcid: Option<i64>,
507) -> Option<TextChunk> {
508    if text_bytes.is_empty() {
509        return None;
510    }
511
512    // Get text position before rendering
513    let trm = state.current.text_rendering_matrix();
514    let start_x = trm.e;
515    let font_size = trm.font_size_factor();
516
517    if font_size < 0.1 {
518        return None; // Skip invisible text
519    }
520
521    // Decode text to Unicode
522    let mut text = String::new();
523    let mut total_width = 0.0;
524    let mut symbol_ends = Vec::new();
525
526    let bpc = font.bytes_per_code as usize;
527    let mut pos = 0;
528    while pos + bpc <= text_bytes.len() {
529        let char_code = if bpc == 2 {
530            ((text_bytes[pos] as u32) << 8) | (text_bytes[pos + 1] as u32)
531        } else {
532            text_bytes[pos] as u32
533        };
534        pos += bpc;
535
536        let decoded = font.decode_char(char_code);
537        text.push_str(&decoded);
538
539        // Calculate glyph width
540        let glyph_w = font.glyph_width(char_code) / 1000.0;
541        total_width += glyph_w;
542        symbol_ends.push(start_x + total_width * font_size);
543
544        // Add character spacing
545        total_width += state.current.text_state.char_spacing / state.current.text_state.font_size;
546
547        // Add word spacing for space character
548        if decoded == " " {
549            total_width +=
550                state.current.text_state.word_spacing / state.current.text_state.font_size;
551        }
552    }
553
554    // Advance text position
555    let displacement = total_width * state.current.text_state.font_size;
556    state.current.advance_text(displacement);
557
558    if text.is_empty() {
559        return None;
560    }
561
562    // Compute TRM_after (text rendering matrix after text advancement)
563    let trm_after = state.current.text_rendering_matrix();
564
565    // Use font ascent/descent from font descriptor (glyph-space units, per-mille).
566    // The reference implementation: TextChunksHelper.calculateTextBoundingBox uses font.getAscent()/getDescent()
567    // with fallback to font bounding box.
568    let ascent = font.ascent;
569    let descent = font.descent;
570
571    // TRM matrix components:
572    //   a = scaleX,  b = shearY
573    //   c = shearX,  d = scaleY
574    //   e = translateX, f = translateY
575    let trm_before = &trm; // TRM at start of text
576
577    // The reference bbox formula with 4 branches based on text direction/orientation.
578    // scaleX = trm.a, shearX = trm.c, scaleY = trm.d, shearY = trm.b
579    let (x1, x2) = if trm_before.a >= 0.0 && trm_before.c >= 0.0 {
580        (
581            trm_before.e + descent * trm_before.c / 1000.0,
582            trm_after.e + ascent * trm_after.c / 1000.0,
583        )
584    } else if trm_before.a < 0.0 && trm_before.c < 0.0 {
585        (
586            trm_after.e + ascent * trm_after.c / 1000.0,
587            trm_before.e + descent * trm_before.c / 1000.0,
588        )
589    } else if trm_before.a >= 0.0 {
590        (
591            trm_before.e + ascent * trm_before.c / 1000.0,
592            trm_after.e + descent * trm_after.c / 1000.0,
593        )
594    } else {
595        (
596            trm_after.e + descent * trm_after.c / 1000.0,
597            trm_before.e + ascent * trm_before.c / 1000.0,
598        )
599    };
600
601    let (y1, y2) = if trm_before.d >= 0.0 && trm_before.b >= 0.0 {
602        (
603            trm_before.f + descent * trm_before.d / 1000.0,
604            trm_after.f + ascent * trm_after.d / 1000.0,
605        )
606    } else if trm_before.d < 0.0 && trm_before.b < 0.0 {
607        (
608            trm_after.f + ascent * trm_after.d / 1000.0,
609            trm_before.f + descent * trm_before.d / 1000.0,
610        )
611    } else if trm_before.d >= 0.0 {
612        (
613            trm_after.f + descent * trm_after.d / 1000.0,
614            trm_before.f + ascent * trm_before.d / 1000.0,
615        )
616    } else {
617        (
618            trm_before.f + ascent * trm_before.d / 1000.0,
619            trm_after.f + descent * trm_after.d / 1000.0,
620        )
621    };
622
623    let bbox = BoundingBox::new(Some(page_number), x1, y1, x2, y2);
624
625    // Determine text format from text rise (Ts operator).
626    let text_format = if state.current.text_state.rise > font_size * 0.1 {
627        crate::models::enums::TextFormat::Superscript
628    } else if state.current.text_state.rise < -font_size * 0.1 {
629        crate::models::enums::TextFormat::Subscript
630    } else {
631        crate::models::enums::TextFormat::Normal
632    };
633
634    *chunk_index += 1;
635
636    // Format fill color as the reference Arrays.toString() — preserves original color space.
637    // The reference veraPDF stores colors as the reference implementation float (f32), then serializes via double's toString(),
638    // giving full f64 representation of the f32 value. We replicate:
639    // parse as f64 (from lopdf) → round to f32 → back to f64 for full-precision display.
640    let fc = &state.current.fill_color;
641    let font_color = format!(
642        "[{}]",
643        fc.iter()
644            .map(|v| {
645                let f32_val = *v as f32;
646                let f64_repr = f32_val as f64;
647                if f32_val.fract() == 0.0 {
648                    format!("{:.1}", f64_repr)
649                } else {
650                    format!("{}", f64_repr)
651                }
652            })
653            .collect::<Vec<_>>()
654            .join(", ")
655    );
656
657    Some(TextChunk {
658        value: text,
659        bbox,
660        font_name: font.base_font.clone(),
661        font_size,
662        font_weight: font.weight,
663        italic_angle: font.italic_angle,
664        font_color,
665        contrast_ratio: 21.0, // Default: black on white = max contrast
666        symbol_ends,
667        text_format,
668        text_type: crate::models::enums::TextType::Regular,
669        pdf_layer: crate::models::enums::PdfLayer::Main,
670        ocg_visible: true,
671        index: Some(*chunk_index),
672        page_number: Some(page_number),
673        level: None,
674        mcid,
675    })
676}
677
678/// Extract string bytes from a PDF Object.
679fn extract_string_bytes(obj: &Object) -> Option<Vec<u8>> {
680    match obj {
681        Object::String(bytes, _) => Some(bytes.clone()),
682        _ => None,
683    }
684}
685
/// Find the MCID that applies at the current nesting depth.
///
/// Scans the marked-content stack from the innermost entry outwards and
/// returns the first MCID encountered; `None` when no open sequence has one.
fn active_mcid(stack: &[Option<i64>]) -> Option<i64> {
    stack
        .iter()
        .rfind(|entry| entry.is_some())
        .copied()
        .flatten()
}
691
692/// Extract the MCID from BDC operands.
693/// BDC can appear as: `/Tag <</MCID 0>>` or `/Tag /PropertyListName`
694/// We handle the inline dictionary case (most common in tagged PDFs).
695fn extract_mcid_from_bdc(operands: &[Object]) -> Option<i64> {
696    // BDC has 2 operands: tag name and properties
697    if operands.len() < 2 {
698        return None;
699    }
700    match &operands[1] {
701        Object::Dictionary(dict) => {
702            if let Ok(Object::Integer(n)) = dict.get(b"MCID") {
703                return Some(*n);
704            }
705            None
706        }
707        _ => None,
708    }
709}
710
711/// Convert PDF object to f64.
712fn obj_to_f64(obj: Object) -> Option<f64> {
713    match obj {
714        Object::Integer(i) => Some(i as f64),
715        Object::Real(f) => Some(f),
716        _ => None,
717    }
718}
719
720/// Extract a name string from a PDF object (Name type).
721fn obj_to_name(obj: &Object) -> String {
722    match obj {
723        Object::Name(bytes) => String::from_utf8_lossy(bytes).to_string(),
724        _ => String::new(),
725    }
726}
727
/// Number of color components implied by a color space name.
///
/// Gray spaces have 1 component and CMYK spaces have 4. The RGB names
/// (`DeviceRGB`, `CalRGB`, `RGB`) have 3, which is also the fallback for
/// every unrecognised name (for example an ICCBased resource name).
fn color_space_components(name: &str) -> u8 {
    const GRAY_SPACES: [&str; 3] = ["DeviceGray", "CalGray", "G"];
    const CMYK_SPACES: [&str; 2] = ["DeviceCMYK", "CMYK"];

    if GRAY_SPACES.contains(&name) {
        1
    } else if CMYK_SPACES.contains(&name) {
        4
    } else {
        // RGB spaces and anything unknown
        3
    }
}
737
738/// Resolve a PDF object via reference.
739fn resolve_obj<'a>(doc: &'a Document, obj: &'a Object) -> lopdf::Object {
740    match obj {
741        Object::Reference(id) => doc.get_object(*id).cloned().unwrap_or(Object::Null),
742        other => other.clone(),
743    }
744}
745
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::content::Operation;
    use lopdf::dictionary;

    /// Store the Pages node under the pre-reserved `pages_id` with `page_id`
    /// as its only kid, add a Catalog pointing at it, and register that
    /// catalog as the trailer's Root.
    fn finish_page_tree(doc: &mut Document, pages_id: lopdf::ObjectId, page_id: lopdf::ObjectId) {
        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
        };
        doc.objects.insert(pages_id, Object::Dictionary(pages));

        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);
    }

    /// Page number and object id of the first page in `doc`.
    fn first_page(doc: &Document) -> (u32, lopdf::ObjectId) {
        doc.get_pages().into_iter().next().unwrap()
    }

    /// Build a one-page PDF whose content stream shows "Hello World!" in
    /// Helvetica 12 at (100, 700).
    fn create_test_pdf() -> Document {
        let mut doc = Document::with_version("1.5");
        let pages_id = doc.new_object_id();

        let font_id = doc.add_object(dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        });
        let resources_id = doc.add_object(dictionary! {
            "Font" => dictionary! {
                "F1" => font_id,
            },
        });

        let operations = vec![
            Operation::new("BT", vec![]),
            Operation::new("Tf", vec!["F1".into(), 12.into()]),
            Operation::new("Td", vec![100.into(), 700.into()]),
            Operation::new("Tj", vec![Object::string_literal("Hello World!")]),
            Operation::new("ET", vec![]),
        ];
        let encoded = Content { operations }.encode().unwrap();
        let content_id = doc.add_object(lopdf::Stream::new(dictionary! {}, encoded));

        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Contents" => content_id,
            "Resources" => resources_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        finish_page_tree(&mut doc, pages_id, page_id);
        doc
    }

    #[test]
    fn test_extract_text_from_synthetic_pdf() {
        let doc = create_test_pdf();
        let (page_num, page_id) = first_page(&doc);

        let chunks = extract_text_chunks(&doc, page_num, page_id).unwrap();

        // At least one chunk must come out of the page
        assert!(!chunks.is_empty(), "Expected text chunks from test PDF");

        // The first chunk carries the shown string
        let first = &chunks[0];
        assert!(
            first.value.contains("Hello"),
            "Expected 'Hello' in chunk, got: '{}'",
            first.value
        );
    }

    #[test]
    fn test_extract_empty_page() {
        // A page without a Contents entry must yield no chunks and no error.
        let mut doc = Document::with_version("1.5");
        let pages_id = doc.new_object_id();

        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });
        finish_page_tree(&mut doc, pages_id, page_id);

        let (page_num, page_id) = first_page(&doc);

        let chunks = extract_text_chunks(&doc, page_num, page_id).unwrap();
        assert!(chunks.is_empty());
    }
}
853
854        let chunks = extract_text_chunks(&doc, page_num, page_id).unwrap();
855        assert!(chunks.is_empty());
856    }
857}