Skip to main content

pdf_ast/parser/
text_extraction.rs

1use crate::ast::{NodeId, PdfAstGraph};
2use crate::parser::cmap::{CMap, CMapParser};
3use crate::parser::content_stream::ContentOperator;
4use crate::types::{PdfDictionary, PdfValue};
5use std::collections::HashMap;
6
7/// Text extraction state machine
8#[allow(dead_code)]
9pub struct TextExtractor<'a> {
10    ast: &'a PdfAstGraph,
11    page_resources: &'a PdfDictionary,
12    fonts: HashMap<String, FontInfo>,
13    cmaps: HashMap<String, CMap>,
14    text_spans: Vec<TextSpan>,
15    graphics_state: GraphicsState,
16    text_state: TextState,
17}
18
19#[derive(Debug, Clone)]
20pub struct FontInfo {
21    pub font_type: String,
22    pub base_font: String,
23    pub encoding: String,
24    pub to_unicode: Option<NodeId>,
25    pub width_map: HashMap<u32, f64>,
26    pub default_width: f64,
27    pub font_matrix: [f64; 6],
28}
29
/// Matrices and text parameters that content-stream operators modify.
///
/// Matrices are stored as the six numbers `[a, b, c, d, e, f]`.
#[derive(Debug, Clone)]
pub struct GraphicsState {
    /// Current transformation matrix.
    pub ctm: [f64; 6],
    /// Text matrix; its translation is advanced as text is shown.
    pub text_matrix: [f64; 6],
    /// Text line matrix; the start of the current line.
    pub text_line_matrix: [f64; 6],
    /// Distance subtracted from the line position by "next line" operators.
    pub leading: f64,
    /// Extra advance added after every character.
    pub char_space: f64,
    /// Extra advance added after every space character.
    pub word_space: f64,
    /// Horizontal scaling as a percentage (100 means unscaled).
    pub horizontal_scale: f64,
    /// Text rise.
    pub text_rise: f64,
    /// Resource name of the selected font, if any.
    pub font: Option<String>,
    /// Selected font size.
    pub font_size: f64,
    /// Text rendering mode.
    pub render_mode: i32,
}
44
45#[derive(Debug, Clone)]
46pub struct TextState {
47    pub current_font: Option<FontInfo>,
48    pub current_cmap: Option<CMap>,
49}
50
51#[derive(Debug, Clone)]
52pub struct TextSpan {
53    pub text: String,
54    pub x: f64,
55    pub y: f64,
56    pub width: f64,
57    pub height: f64,
58    pub font_name: String,
59    pub font_size: f64,
60    pub space_width: f64,
61    pub chars: Vec<CharInfo>,
62}
63
/// Placement of a single decoded character inside a [`TextSpan`].
#[derive(Debug, Clone)]
pub struct CharInfo {
    /// The character, stored as a string.
    pub unicode: String,
    /// Horizontal position of the character.
    pub x: f64,
    /// Vertical position of the character.
    pub y: f64,
    /// Width of the character.
    pub width: f64,
    /// Height of the character (the font size in effect).
    pub height: f64,
}
72
73impl<'a> TextExtractor<'a> {
74    pub fn new(ast: &'a PdfAstGraph, page_resources: &'a PdfDictionary) -> Self {
75        TextExtractor {
76            ast,
77            page_resources,
78            fonts: HashMap::new(),
79            cmaps: HashMap::new(),
80            text_spans: Vec::new(),
81            graphics_state: GraphicsState::default(),
82            text_state: TextState {
83                current_font: None,
84                current_cmap: None,
85            },
86        }
87    }
88
89    pub fn extract_text(&mut self, operators: &[ContentOperator]) -> Vec<TextSpan> {
90        // Pre-load fonts from resources
91        self.load_fonts();
92
93        // Process operators
94        for op in operators {
95            self.process_operator(op);
96        }
97
98        // Sort spans by position
99        self.text_spans.sort_by(|a, b| {
100            a.y.partial_cmp(&b.y)
101                .unwrap()
102                .then(a.x.partial_cmp(&b.x).unwrap())
103        });
104
105        self.text_spans.clone()
106    }
107
108    fn load_fonts(&mut self) {
109        if let Some(PdfValue::Dictionary(fonts)) = self.page_resources.get("Font") {
110            for (name, font_ref) in fonts.iter() {
111                if let PdfValue::Reference(_obj_id) = font_ref {
112                    // Get font node from AST
113                    // Parse font info
114                    let font_info = self.parse_font_info(name.as_str(), font_ref);
115                    self.fonts.insert(name.to_string(), font_info);
116                }
117            }
118        }
119    }
120
121    fn parse_font_info(&mut self, name: &str, _font_value: &PdfValue) -> FontInfo {
122        // Default font info
123
124        // Parse font dictionary if available
125        // This would need access to the actual font object
126        // For now, return default
127
128        FontInfo {
129            font_type: "Type1".to_string(),
130            base_font: name.to_string(),
131            encoding: "StandardEncoding".to_string(),
132            to_unicode: None,
133            width_map: HashMap::new(),
134            default_width: 1000.0,
135            font_matrix: [0.001, 0.0, 0.0, 0.001, 0.0, 0.0],
136        }
137    }
138
139    fn process_operator(&mut self, op: &ContentOperator) {
140        match op {
141            ContentOperator::BeginText => {
142                self.graphics_state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
143                self.graphics_state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
144            }
145
146            ContentOperator::EndText => {
147                // Reset text state
148            }
149
150            ContentOperator::SetFont(name, size) => {
151                self.graphics_state.font = Some(name.clone());
152                self.graphics_state.font_size = *size;
153
154                // Update current font
155                if let Some(font_info) = self.fonts.get(name) {
156                    self.text_state.current_font = Some(font_info.clone());
157                }
158            }
159
160            ContentOperator::SetCharSpace(spacing) => {
161                self.graphics_state.char_space = *spacing;
162            }
163
164            ContentOperator::SetWordSpace(spacing) => {
165                self.graphics_state.word_space = *spacing;
166            }
167
168            ContentOperator::SetHorizontalScale(scale) => {
169                self.graphics_state.horizontal_scale = *scale;
170            }
171
172            ContentOperator::SetLeading(leading) => {
173                self.graphics_state.leading = *leading;
174            }
175
176            ContentOperator::SetTextRise(rise) => {
177                self.graphics_state.text_rise = *rise;
178            }
179
180            ContentOperator::MoveText(tx, ty) => {
181                let tm = &mut self.graphics_state.text_line_matrix;
182                tm[4] += tx;
183                tm[5] += ty;
184                self.graphics_state.text_matrix = *tm;
185            }
186
187            ContentOperator::MoveTextNextLine => {
188                let leading = self.graphics_state.leading;
189                self.process_operator(&ContentOperator::MoveText(0.0, -leading));
190            }
191
192            ContentOperator::SetTextMatrix(a, b, c, d, e, f) => {
193                self.graphics_state.text_matrix = [*a, *b, *c, *d, *e, *f];
194                self.graphics_state.text_line_matrix = [*a, *b, *c, *d, *e, *f];
195            }
196
197            ContentOperator::ShowText(text) => {
198                self.show_text(text);
199            }
200
201            ContentOperator::ShowTextArray(array) => {
202                for element in array {
203                    match element {
204                        crate::parser::content_stream::TextArrayElement::Text(text) => {
205                            self.show_text(text);
206                        }
207                        crate::parser::content_stream::TextArrayElement::Spacing(spacing) => {
208                            // Adjust text matrix by spacing
209                            let adj = -spacing / 1000.0
210                                * self.graphics_state.font_size
211                                * self.graphics_state.horizontal_scale
212                                / 100.0;
213                            self.graphics_state.text_matrix[4] -= adj;
214                        }
215                    }
216                }
217            }
218
219            ContentOperator::ShowTextNextLine(text) => {
220                self.process_operator(&ContentOperator::MoveTextNextLine);
221                self.show_text(text);
222            }
223
224            ContentOperator::ShowTextWithSpacing(tw, tc, text) => {
225                self.graphics_state.word_space = *tw;
226                self.graphics_state.char_space = *tc;
227                self.process_operator(&ContentOperator::MoveTextNextLine);
228                self.show_text(text);
229            }
230
231            ContentOperator::Save => {
232                // Push graphics state
233            }
234
235            ContentOperator::Restore => {
236                // Pop graphics state
237            }
238
239            ContentOperator::SetMatrix(a, b, c, d, e, f) => {
240                self.graphics_state.ctm = [*a, *b, *c, *d, *e, *f];
241            }
242
243            _ => {
244                // Other operators don't affect text extraction
245            }
246        }
247    }
248
249    fn show_text(&mut self, text_bytes: &[u8]) {
250        if self.text_state.current_font.is_none() {
251            return;
252        }
253
254        let font = self.text_state.current_font.as_ref().unwrap();
255        let mut chars = Vec::new();
256        let mut total_width = 0.0;
257
258        // Decode text using font encoding/ToUnicode
259        let decoded = self.decode_text(text_bytes, font);
260
261        // Calculate position for each character
262        let tm = &self.graphics_state.text_matrix;
263        let ctm = &self.graphics_state.ctm;
264
265        // Transform text space to device space
266        let (x, y) = self.transform_point(0.0, 0.0, tm, ctm);
267
268        for ch in decoded.chars() {
269            let char_width = self.get_char_width(ch, font);
270
271            let char_info = CharInfo {
272                unicode: ch.to_string(),
273                x: x + total_width,
274                y,
275                width: char_width * self.graphics_state.font_size,
276                height: self.graphics_state.font_size,
277            };
278
279            chars.push(char_info);
280
281            // Update position
282            total_width += char_width * self.graphics_state.font_size;
283            total_width += self.graphics_state.char_space;
284
285            if ch == ' ' {
286                total_width += self.graphics_state.word_space;
287            }
288        }
289
290        // Update text matrix
291        self.graphics_state.text_matrix[4] += total_width;
292
293        // Create text span
294        if !chars.is_empty() {
295            let span = TextSpan {
296                text: decoded,
297                x,
298                y,
299                width: total_width,
300                height: self.graphics_state.font_size,
301                font_name: self.graphics_state.font.clone().unwrap_or_default(),
302                font_size: self.graphics_state.font_size,
303                space_width: self.get_char_width(' ', font) * self.graphics_state.font_size,
304                chars,
305            };
306
307            self.text_spans.push(span);
308        }
309    }
310
311    fn decode_text(&self, text_bytes: &[u8], font: &FontInfo) -> String {
312        // Try ToUnicode CMap first
313        if let Some(cmap) = &self.text_state.current_cmap {
314            return self.decode_with_cmap(text_bytes, cmap);
315        }
316
317        // Fallback to encoding
318        match font.encoding.as_str() {
319            "WinAnsiEncoding" => self.decode_win_ansi(text_bytes),
320            "MacRomanEncoding" => self.decode_mac_roman(text_bytes),
321            "StandardEncoding" => {
322                // Simple ASCII decoding
323                String::from_utf8_lossy(text_bytes).to_string()
324            }
325            _ => {
326                // Default fallback encoding
327                String::from_utf8_lossy(text_bytes).to_string()
328            }
329        }
330    }
331
332    fn decode_with_cmap(&self, text_bytes: &[u8], cmap: &CMap) -> String {
333        let mut result = String::new();
334        let mut i = 0;
335
336        while i < text_bytes.len() {
337            // Try 2-byte code first
338            if i + 1 < text_bytes.len() {
339                let code = &text_bytes[i..i + 2];
340                if let Some(unicode) = CMapParser::new(
341                    &mut PdfAstGraph::new(),
342                    &crate::parser::reference_resolver::ObjectNodeMap::new(),
343                )
344                .map_code_to_unicode(cmap, code)
345                {
346                    result.push_str(&unicode);
347                    i += 2;
348                    continue;
349                }
350            }
351
352            // Try 1-byte code
353            let code = &text_bytes[i..i + 1];
354            if let Some(unicode) = CMapParser::new(
355                &mut PdfAstGraph::new(),
356                &crate::parser::reference_resolver::ObjectNodeMap::new(),
357            )
358            .map_code_to_unicode(cmap, code)
359            {
360                result.push_str(&unicode);
361            } else {
362                // Fallback to direct mapping
363                result.push(text_bytes[i] as char);
364            }
365
366            i += 1;
367        }
368
369        result
370    }
371
372    fn decode_win_ansi(&self, text_bytes: &[u8]) -> String {
373        text_bytes
374            .iter()
375            .map(|&b| {
376                if b < 128 {
377                    b as char
378                } else {
379                    // Windows-1252 mapping for 128-255
380                    match b {
381                        0x80 => '€',
382                        0x82 => '‚',
383                        0x83 => 'ƒ',
384                        0x84 => '„',
385                        0x85 => '…',
386                        0x86 => '†',
387                        0x87 => '‡',
388                        0x88 => 'ˆ',
389                        0x89 => '‰',
390                        0x8A => 'Š',
391                        0x8B => '‹',
392                        0x8C => 'Œ',
393                        0x8E => 'Ž',
394                        0x91 => '\'',
395                        0x92 => '\'',
396                        0x93 => '"',
397                        0x94 => '"',
398                        0x95 => '•',
399                        0x96 => '–',
400                        0x97 => '—',
401                        0x98 => '˜',
402                        0x99 => '™',
403                        0x9A => 'š',
404                        0x9B => '›',
405                        0x9C => 'œ',
406                        0x9E => 'ž',
407                        0x9F => 'Ÿ',
408                        _ => b as char,
409                    }
410                }
411            })
412            .collect()
413    }
414
415    fn decode_mac_roman(&self, text_bytes: &[u8]) -> String {
416        // Simplified MacRoman decoding
417        String::from_utf8_lossy(text_bytes).to_string()
418    }
419
420    fn get_char_width(&self, ch: char, font: &FontInfo) -> f64 {
421        // Get width from font metrics
422        let code = ch as u32;
423        font.width_map
424            .get(&code)
425            .copied()
426            .unwrap_or(font.default_width)
427            * font.font_matrix[0]
428    }
429
430    fn transform_point(&self, x: f64, y: f64, tm: &[f64; 6], ctm: &[f64; 6]) -> (f64, f64) {
431        // Apply text matrix
432        let tx = tm[0] * x + tm[2] * y + tm[4];
433        let ty = tm[1] * x + tm[3] * y + tm[5];
434
435        // Apply CTM
436        let dx = ctm[0] * tx + ctm[2] * ty + ctm[4];
437        let dy = ctm[1] * tx + ctm[3] * ty + ctm[5];
438
439        (dx, dy)
440    }
441
442    pub fn merge_spans(&mut self) -> Vec<TextLine> {
443        let mut lines = Vec::new();
444        let mut current_line = TextLine::new();
445
446        for span in &self.text_spans {
447            if current_line.should_add_span(span) {
448                current_line.add_span(span.clone());
449            } else {
450                if !current_line.spans.is_empty() {
451                    lines.push(current_line);
452                }
453                current_line = TextLine::new();
454                current_line.add_span(span.clone());
455            }
456        }
457
458        if !current_line.spans.is_empty() {
459            lines.push(current_line);
460        }
461
462        lines
463    }
464}
465
466impl Default for GraphicsState {
467    fn default() -> Self {
468        GraphicsState {
469            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
470            text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
471            text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
472            leading: 0.0,
473            char_space: 0.0,
474            word_space: 0.0,
475            horizontal_scale: 100.0,
476            text_rise: 0.0,
477            font: None,
478            font_size: 12.0,
479            render_mode: 0,
480        }
481    }
482}
483
484#[derive(Debug, Clone)]
485pub struct TextLine {
486    pub spans: Vec<TextSpan>,
487    pub x: f64,
488    pub y: f64,
489    pub width: f64,
490    pub height: f64,
491}
492
493impl Default for TextLine {
494    fn default() -> Self {
495        Self::new()
496    }
497}
498
499impl TextLine {
500    pub fn new() -> Self {
501        TextLine {
502            spans: Vec::new(),
503            x: 0.0,
504            y: 0.0,
505            width: 0.0,
506            height: 0.0,
507        }
508    }
509
510    pub fn should_add_span(&self, span: &TextSpan) -> bool {
511        if self.spans.is_empty() {
512            return true;
513        }
514
515        let last = &self.spans[self.spans.len() - 1];
516
517        // Check if on same line (within tolerance)
518        let y_diff = (span.y - last.y).abs();
519        if y_diff > last.height * 0.3 {
520            return false;
521        }
522
523        // Check horizontal distance
524        let expected_x = last.x + last.width;
525        let x_diff = span.x - expected_x;
526
527        // Allow reasonable spacing
528        x_diff < last.space_width * 3.0
529    }
530
531    pub fn add_span(&mut self, span: TextSpan) {
532        if self.spans.is_empty() {
533            self.x = span.x;
534            self.y = span.y;
535            self.height = span.height;
536        }
537
538        self.width = (span.x + span.width) - self.x;
539        self.spans.push(span);
540    }
541
542    pub fn get_text(&self) -> String {
543        self.spans
544            .iter()
545            .map(|s| s.text.as_str())
546            .collect::<Vec<_>>()
547            .join(" ")
548    }
549}