Skip to main content

justpdf_core/text/
mod.rs

1pub mod format;
2pub mod layout;
3pub mod search;
4pub mod text_layout;
5
6use std::collections::HashMap;
7
8use crate::content::{ContentOp, Operand, parse_content_stream};
9use crate::error::Result;
10use crate::font::{Encoding, FontInfo, ToUnicodeCMap, decode_text, parse_font_info};
11use crate::object::{PdfDict, PdfObject};
12use crate::page::{PageInfo, collect_pages};
13use crate::parser::PdfDocument;
14
15// ---------------------------------------------------------------------------
16// 2D affine transformation matrix [a b 0; c d 0; e f 1]
17// ---------------------------------------------------------------------------
18
/// A 2D affine transform in PDF notation `[a b c d e f]`.
///
/// It represents the 3×3 matrix `[a b 0; c d 0; e f 1]` applied to row
/// vectors `[x y 1]`, so `(1, 0)` maps to `(a, b)` and `(0, 1)` maps to
/// `(c, d)` (ignoring translation).
#[derive(Debug, Clone, Copy)]
pub struct Matrix {
    pub a: f64,
    pub b: f64,
    pub c: f64,
    pub d: f64,
    pub e: f64,
    pub f: f64,
}

impl Matrix {
    /// The identity transform.
    pub fn identity() -> Self {
        Self::translate(0.0, 0.0)
    }

    /// Multiply: self × other (row-vector convention as in PDF spec).
    ///
    /// The result applies `self` first and `other` second.
    pub fn concat(&self, other: &Matrix) -> Matrix {
        let Matrix { a, b, c, d, e, f } = *self;
        Matrix {
            a: a * other.a + b * other.c,
            b: a * other.b + b * other.d,
            c: c * other.a + d * other.c,
            d: c * other.b + d * other.d,
            e: e * other.a + f * other.c + other.e,
            f: e * other.b + f * other.d + other.f,
        }
    }

    /// Transform a point (x, y) by this matrix.
    pub fn transform_point(&self, x: f64, y: f64) -> (f64, f64) {
        let new_x = self.a * x + self.c * y + self.e;
        let new_y = self.b * x + self.d * y + self.f;
        (new_x, new_y)
    }

    /// Translation matrix moving points by `(tx, ty)`.
    pub fn translate(tx: f64, ty: f64) -> Self {
        Self {
            a: 1.0,
            b: 0.0,
            c: 0.0,
            d: 1.0,
            e: tx,
            f: ty,
        }
    }

    /// Get the effective font size (y-scale factor).
    ///
    /// This is the length of the image of the unit y vector. With the
    /// row-vector convention used by `transform_point`, `(0, 1)` maps to
    /// `(c, d)`, so the scale is `hypot(c, d)`. Using `b` and `d` instead
    /// would return the horizontal scale for rotated, non-uniformly scaled
    /// text.
    pub fn font_size_scale(&self) -> f64 {
        self.c.hypot(self.d)
    }
}
78
79// ---------------------------------------------------------------------------
80// Font entry resolved for text extraction
81// ---------------------------------------------------------------------------
82
83#[derive(Debug, Clone)]
84struct ResolvedFont {
85    info: FontInfo,
86    cmap: Option<ToUnicodeCMap>,
87}
88
89impl ResolvedFont {
90    /// Get the width of a character code in text space units (1/1000).
91    fn char_width(&self, code: u32) -> f64 {
92        self.info.widths.get_width(code)
93    }
94}
95
96// ---------------------------------------------------------------------------
97// Text state (per PDF spec 9.3)
98// ---------------------------------------------------------------------------
99
/// Text-state parameters tracked while interpreting a content stream.
#[derive(Clone, Debug)]
struct TextState {
    /// Tc: character spacing.
    char_spacing: f64,
    /// Tw: word spacing.
    word_spacing: f64,
    /// Tz: horizontal scaling kept as a fraction, so 1.0 means 100%.
    horiz_scaling: f64,
    /// TL: text leading.
    leading: f64,
    /// Resource name of the selected font (for example "F1").
    font_name: Vec<u8>,
    /// Tfs: font size.
    font_size: f64,
    /// Ts: text rise.
    text_rise: f64,
}

impl Default for TextState {
    /// Initial text state: no extra spacing, 100% horizontal scaling, no
    /// font selected, and a font size of 12.
    fn default() -> Self {
        let no_offset = 0.0;
        Self {
            font_name: Vec::new(),
            font_size: 12.0,
            horiz_scaling: 1.0,
            char_spacing: no_offset,
            word_spacing: no_offset,
            leading: no_offset,
            text_rise: no_offset,
        }
    }
}
131
132// ---------------------------------------------------------------------------
133// Graphics state (subset needed for text extraction)
134// ---------------------------------------------------------------------------
135
136#[derive(Debug, Clone)]
137struct GraphicsState {
138    ctm: Matrix,
139    text: TextState,
140}
141
142impl Default for GraphicsState {
143    fn default() -> Self {
144        Self {
145            ctm: Matrix::identity(),
146            text: TextState::default(),
147        }
148    }
149}
150
151// ---------------------------------------------------------------------------
152// Extracted text output types
153// ---------------------------------------------------------------------------
154
/// One extracted glyph together with where it was drawn.
#[derive(Clone, Debug)]
pub struct TextChar {
    /// Unicode text this glyph decodes to (may be more than one character).
    pub unicode: String,
    /// X coordinate of the glyph origin in user space.
    pub x: f64,
    /// Y coordinate of the glyph origin in user space.
    pub y: f64,
    /// Effective font size after applying the text and page transforms.
    pub font_size: f64,
    /// Resource name of the font the glyph was shown with.
    pub font_name: String,
    /// Advance width of the glyph (intended to be in user-space units).
    pub width: f64,
}
171
/// A run of glyphs with no large gap or space between them.
#[derive(Clone, Debug)]
pub struct TextWord {
    /// Concatenated text of the glyphs in the word.
    pub text: String,
    /// X coordinate where the word starts.
    pub x: f64,
    /// Y coordinate of the word.
    pub y: f64,
    /// Horizontal extent from the word start to the end of its last glyph.
    pub width: f64,
    /// Font size associated with the word.
    pub font_size: f64,
}
181
182/// A line of text: a sequence of words on roughly the same baseline.
183#[derive(Debug, Clone)]
184pub struct TextLine {
185    pub text: String,
186    pub words: Vec<TextWord>,
187    pub x: f64,
188    pub y: f64,
189}
190
191/// A block of text: a sequence of lines grouped spatially.
192#[derive(Debug, Clone)]
193pub struct TextBlock {
194    pub text: String,
195    pub lines: Vec<TextLine>,
196}
197
198/// Full text extraction result for a page.
199#[derive(Debug, Clone)]
200pub struct PageText {
201    pub page_index: usize,
202    pub chars: Vec<TextChar>,
203    pub lines: Vec<TextLine>,
204    pub blocks: Vec<TextBlock>,
205}
206
207impl PageText {
208    /// Get plain text, joining lines with newlines.
209    pub fn plain_text(&self) -> String {
210        self.blocks
211            .iter()
212            .map(|b| b.text.as_str())
213            .collect::<Vec<_>>()
214            .join("\n\n")
215    }
216}
217
218// ---------------------------------------------------------------------------
219// Content stream text interpreter
220// ---------------------------------------------------------------------------
221
222struct TextInterpreter {
223    /// Graphics state stack.
224    gs_stack: Vec<GraphicsState>,
225    /// Current graphics state.
226    gs: GraphicsState,
227    /// Text matrix (Tm).
228    tm: Matrix,
229    /// Text line matrix (Tlm) — set at the start of each text line.
230    tlm: Matrix,
231    /// Whether we're inside BT...ET.
232    in_text: bool,
233    /// Resolved fonts keyed by resource name.
234    fonts: HashMap<Vec<u8>, ResolvedFont>,
235    /// Extracted characters.
236    chars: Vec<TextChar>,
237}
238
239impl TextInterpreter {
240    fn new(fonts: HashMap<Vec<u8>, ResolvedFont>) -> Self {
241        Self {
242            gs_stack: Vec::new(),
243            gs: GraphicsState::default(),
244            tm: Matrix::identity(),
245            tlm: Matrix::identity(),
246            in_text: false,
247            fonts,
248            chars: Vec::new(),
249        }
250    }
251
252    fn run(mut self, ops: &[ContentOp]) -> Vec<TextChar> {
253        for op in ops {
254            self.process_op(op);
255        }
256        self.chars
257    }
258
259    fn process_op(&mut self, op: &ContentOp) {
260        match op.operator.as_slice() {
261            // Graphics state
262            b"q" => self.gs_stack.push(self.gs.clone()),
263            b"Q" => {
264                if let Some(gs) = self.gs_stack.pop() {
265                    self.gs = gs;
266                }
267            }
268            b"cm" => {
269                if let Some(m) = self.read_matrix(&op.operands) {
270                    self.gs.ctm = m.concat(&self.gs.ctm);
271                }
272            }
273
274            // Text state operators
275            b"Tc" => {
276                if let Some(v) = op.operands.first().and_then(|o| o.as_f64()) {
277                    self.gs.text.char_spacing = v;
278                }
279            }
280            b"Tw" => {
281                if let Some(v) = op.operands.first().and_then(|o| o.as_f64()) {
282                    self.gs.text.word_spacing = v;
283                }
284            }
285            b"Tz" => {
286                if let Some(v) = op.operands.first().and_then(|o| o.as_f64()) {
287                    self.gs.text.horiz_scaling = v / 100.0;
288                }
289            }
290            b"TL" => {
291                if let Some(v) = op.operands.first().and_then(|o| o.as_f64()) {
292                    self.gs.text.leading = v;
293                }
294            }
295            b"Tf" => {
296                if op.operands.len() >= 2 {
297                    if let Some(name) = op.operands[0].as_name() {
298                        self.gs.text.font_name = name.to_vec();
299                    }
300                    if let Some(size) = op.operands[1].as_f64() {
301                        self.gs.text.font_size = size;
302                    }
303                }
304            }
305            b"Ts" => {
306                if let Some(v) = op.operands.first().and_then(|o| o.as_f64()) {
307                    self.gs.text.text_rise = v;
308                }
309            }
310
311            // Text object
312            b"BT" => {
313                self.in_text = true;
314                self.tm = Matrix::identity();
315                self.tlm = Matrix::identity();
316            }
317            b"ET" => {
318                self.in_text = false;
319            }
320
321            // Text positioning
322            b"Td" => {
323                if op.operands.len() >= 2 {
324                    let tx = op.operands[0].as_f64().unwrap_or(0.0);
325                    let ty = op.operands[1].as_f64().unwrap_or(0.0);
326                    self.tlm = Matrix::translate(tx, ty).concat(&self.tlm);
327                    self.tm = self.tlm;
328                }
329            }
330            b"TD" => {
331                if op.operands.len() >= 2 {
332                    let tx = op.operands[0].as_f64().unwrap_or(0.0);
333                    let ty = op.operands[1].as_f64().unwrap_or(0.0);
334                    self.gs.text.leading = -ty;
335                    self.tlm = Matrix::translate(tx, ty).concat(&self.tlm);
336                    self.tm = self.tlm;
337                }
338            }
339            b"Tm" => {
340                if let Some(m) = self.read_matrix(&op.operands) {
341                    self.tm = m;
342                    self.tlm = m;
343                }
344            }
345            b"T*" => {
346                let tl = self.gs.text.leading;
347                self.tlm = Matrix::translate(0.0, -tl).concat(&self.tlm);
348                self.tm = self.tlm;
349            }
350
351            // Text showing
352            b"Tj" => {
353                if let Some(s) = op.operands.first().and_then(|o| o.as_str()) {
354                    self.show_string(s);
355                }
356            }
357            b"TJ" => {
358                if let Some(arr) = op.operands.first().and_then(|o| o.as_array()) {
359                    self.show_tj_array(arr);
360                }
361            }
362            b"'" => {
363                // Move to next line, then show string
364                let tl = self.gs.text.leading;
365                self.tlm = Matrix::translate(0.0, -tl).concat(&self.tlm);
366                self.tm = self.tlm;
367                if let Some(s) = op.operands.first().and_then(|o| o.as_str()) {
368                    self.show_string(s);
369                }
370            }
371            b"\"" => {
372                // Set Tw, Tc, then T* + Tj
373                if op.operands.len() >= 3 {
374                    if let Some(aw) = op.operands[0].as_f64() {
375                        self.gs.text.word_spacing = aw;
376                    }
377                    if let Some(ac) = op.operands[1].as_f64() {
378                        self.gs.text.char_spacing = ac;
379                    }
380                    let tl = self.gs.text.leading;
381                    self.tlm = Matrix::translate(0.0, -tl).concat(&self.tlm);
382                    self.tm = self.tlm;
383                    if let Some(s) = op.operands[2].as_str() {
384                        self.show_string(s);
385                    }
386                }
387            }
388
389            // ExtGState (may contain font)
390            b"gs" => {
391                // We don't resolve ExtGState for now — would need document access
392            }
393
394            _ => {} // Ignore non-text operators
395        }
396    }
397
398    /// Show a text string, extracting characters and updating the text matrix.
399    fn show_string(&mut self, raw: &[u8]) {
400        let font = self.fonts.get(&self.gs.text.font_name);
401
402        let is_two_byte = font
403            .map(|f| {
404                matches!(f.info.encoding, Encoding::Identity) || f.info.subtype == b"Type0"
405            })
406            .unwrap_or(false);
407
408        let tfs = self.gs.text.font_size;
409        let tc = self.gs.text.char_spacing;
410        let tw = self.gs.text.word_spacing;
411        let th = self.gs.text.horiz_scaling;
412        let rise = self.gs.text.text_rise;
413
414        // Iterate over character codes
415        let mut i = 0;
416        while i < raw.len() {
417            let (code, byte_len) = if is_two_byte && i + 1 < raw.len() {
418                (((raw[i] as u32) << 8) | raw[i + 1] as u32, 2)
419            } else {
420                (raw[i] as u32, 1)
421            };
422
423            // Decode to unicode
424            let unicode = if let Some(f) = font {
425                if let Some(ref cmap) = f.cmap {
426                    cmap.lookup(code)
427                        .unwrap_or_else(|| decode_text(&raw[i..i + byte_len], f.info.encoding))
428                } else {
429                    decode_text(&raw[i..i + byte_len], f.info.encoding)
430                }
431            } else {
432                String::from_utf8_lossy(&raw[i..i + byte_len]).into_owned()
433            };
434
435            // Get glyph width in text space (1/1000 units)
436            let w0 = font
437                .map(|f| f.char_width(code))
438                .unwrap_or(500.0);
439
440            // Calculate position in user space:
441            // Text rendering matrix = [fontSize*Tz 0 0; 0 fontSize 0; 0 rise 1] × Tm × CTM
442            let trm = self.text_rendering_matrix(tfs, th, rise);
443            let (x, y) = trm.transform_point(0.0, 0.0);
444            let effective_size = trm.font_size_scale();
445
446            // Glyph displacement in text space
447            let tx = w0 / 1000.0 * tfs;
448            let advance = (tx + tc) * th;
449
450            // Add word spacing for space character (code 32)
451            let total_advance = if code == 32 {
452                advance + tw * th
453            } else {
454                advance
455            };
456
457            // Calculate width in user space
458            let width = (w0 / 1000.0 * tfs * th).abs();
459
460            // Only emit non-control characters
461            if !unicode.is_empty() {
462                self.chars.push(TextChar {
463                    unicode,
464                    x,
465                    y,
466                    font_size: effective_size,
467                    font_name: String::from_utf8_lossy(&self.gs.text.font_name).into_owned(),
468                    width,
469                });
470            }
471
472            // Update text matrix: translate by glyph displacement
473            self.tm = Matrix::translate(total_advance, 0.0).concat(&self.tm);
474
475            i += byte_len;
476        }
477    }
478
479    /// Process a TJ array: [(string) number (string) number ...]
480    fn show_tj_array(&mut self, items: &[Operand]) {
481        let th = self.gs.text.horiz_scaling;
482        let tfs = self.gs.text.font_size;
483
484        for item in items {
485            match item {
486                Operand::String(s) => {
487                    self.show_string(s);
488                }
489                Operand::Integer(n) => {
490                    // Displacement in thousandths of text space unit
491                    let displacement = -*n as f64 / 1000.0 * tfs * th;
492                    self.tm = Matrix::translate(displacement, 0.0).concat(&self.tm);
493                }
494                Operand::Real(n) => {
495                    let displacement = -n / 1000.0 * tfs * th;
496                    self.tm = Matrix::translate(displacement, 0.0).concat(&self.tm);
497                }
498                _ => {}
499            }
500        }
501    }
502
503    /// Compute the text rendering matrix (Trm = text_state_matrix × Tm × CTM).
504    fn text_rendering_matrix(&self, tfs: f64, th: f64, rise: f64) -> Matrix {
505        let text_state = Matrix {
506            a: tfs * th,
507            b: 0.0,
508            c: 0.0,
509            d: tfs,
510            e: 0.0,
511            f: rise,
512        };
513        text_state.concat(&self.tm).concat(&self.gs.ctm)
514    }
515
516    fn read_matrix(&self, operands: &[Operand]) -> Option<Matrix> {
517        if operands.len() < 6 {
518            return None;
519        }
520        Some(Matrix {
521            a: operands[0].as_f64()?,
522            b: operands[1].as_f64()?,
523            c: operands[2].as_f64()?,
524            d: operands[3].as_f64()?,
525            e: operands[4].as_f64()?,
526            f: operands[5].as_f64()?,
527        })
528    }
529}
530
531// ---------------------------------------------------------------------------
532// Word / line / block grouping
533// ---------------------------------------------------------------------------
534
535/// Group extracted characters into words based on spatial gaps.
536fn group_into_words(chars: &[TextChar]) -> Vec<TextWord> {
537    if chars.is_empty() {
538        return Vec::new();
539    }
540
541    let mut words: Vec<TextWord> = Vec::new();
542    let mut current_text = String::new();
543    let mut word_x = chars[0].x;
544    let mut word_y = chars[0].y;
545    let mut word_end_x = chars[0].x;
546    let mut word_font_size = chars[0].font_size;
547
548    for (i, ch) in chars.iter().enumerate() {
549        if i > 0 {
550            let prev = &chars[i - 1];
551            let expected_x = prev.x + prev.width;
552            let gap = (ch.x - expected_x).abs();
553            let y_diff = (ch.y - prev.y).abs();
554            let threshold = prev.font_size * 0.3;
555
556            // Start new word if there's a significant gap or Y change
557            if gap > threshold || y_diff > prev.font_size * 0.5 {
558                if !current_text.is_empty() {
559                    words.push(TextWord {
560                        text: current_text.trim().to_string(),
561                        x: word_x,
562                        y: word_y,
563                        width: word_end_x - word_x,
564                        font_size: word_font_size,
565                    });
566                }
567                current_text = String::new();
568                word_x = ch.x;
569                word_y = ch.y;
570                word_font_size = ch.font_size;
571            }
572        }
573
574        // Treat space as word boundary
575        if ch.unicode == " " {
576            if !current_text.is_empty() {
577                words.push(TextWord {
578                    text: current_text.trim().to_string(),
579                    x: word_x,
580                    y: word_y,
581                    width: word_end_x - word_x,
582                    font_size: word_font_size,
583                });
584                current_text = String::new();
585                word_x = ch.x + ch.width;
586                word_y = ch.y;
587                word_font_size = ch.font_size;
588            }
589        } else {
590            current_text.push_str(&ch.unicode);
591            word_end_x = ch.x + ch.width;
592        }
593    }
594
595    // Flush last word
596    if !current_text.is_empty() {
597        words.push(TextWord {
598            text: current_text.trim().to_string(),
599            x: word_x,
600            y: word_y,
601            width: word_end_x - word_x,
602            font_size: word_font_size,
603        });
604    }
605
606    // Remove empty words
607    words.retain(|w| !w.text.is_empty());
608    words
609}
610
611/// Group words into lines (same baseline, sorted left to right).
612fn group_into_lines(words: &[TextWord]) -> Vec<TextLine> {
613    if words.is_empty() {
614        return Vec::new();
615    }
616
617    // Sort words by Y descending (top to bottom), then X ascending
618    let mut sorted: Vec<&TextWord> = words.iter().collect();
619    sorted.sort_by(|a, b| {
620        let y_cmp = b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal);
621        if y_cmp == std::cmp::Ordering::Equal {
622            a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal)
623        } else {
624            y_cmp
625        }
626    });
627
628    let mut lines: Vec<TextLine> = Vec::new();
629    let mut current_line_words: Vec<TextWord> = Vec::new();
630    let mut line_y = sorted[0].y;
631
632    for word in sorted {
633        let y_threshold = word.font_size * 0.5;
634        if (word.y - line_y).abs() > y_threshold && !current_line_words.is_empty() {
635            // Flush current line
636            lines.push(build_line(std::mem::take(&mut current_line_words)));
637            line_y = word.y;
638        }
639        current_line_words.push(word.clone());
640        if current_line_words.len() == 1 {
641            line_y = word.y;
642        }
643    }
644
645    if !current_line_words.is_empty() {
646        lines.push(build_line(current_line_words));
647    }
648
649    lines
650}
651
652fn build_line(mut words: Vec<TextWord>) -> TextLine {
653    // Sort words left to right within the line
654    words.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal));
655
656    let text = words
657        .iter()
658        .map(|w| w.text.as_str())
659        .collect::<Vec<_>>()
660        .join(" ");
661
662    let x = words.first().map(|w| w.x).unwrap_or(0.0);
663    let y = words.first().map(|w| w.y).unwrap_or(0.0);
664
665    TextLine {
666        text,
667        x,
668        y,
669        words,
670    }
671}
672
673// ---------------------------------------------------------------------------
674// Font resolution from page resources
675// ---------------------------------------------------------------------------
676
677/// Resolve all fonts from a page's Resources dictionary.
678fn resolve_fonts(
679    doc: &PdfDocument,
680    resources_ref: &Option<PdfObject>,
681) -> HashMap<Vec<u8>, ResolvedFont> {
682    let mut fonts = HashMap::new();
683
684    let resources = match resources_ref {
685        Some(PdfObject::Dict(d)) => d.clone(),
686        Some(PdfObject::Reference(r)) => {
687            let r = r.clone();
688            match doc.resolve(&r) {
689                Ok(PdfObject::Dict(d)) => d,
690                _ => return fonts,
691            }
692        }
693        _ => return fonts,
694    };
695
696    let font_dict = match resources.get(b"Font") {
697        Some(PdfObject::Dict(d)) => d.clone(),
698        Some(PdfObject::Reference(r)) => {
699            let r = r.clone();
700            match doc.resolve(&r) {
701                Ok(PdfObject::Dict(d)) => d,
702                _ => return fonts,
703            }
704        }
705        _ => return fonts,
706    };
707
708    for (name, obj) in font_dict.iter() {
709        let font_dict = match obj {
710            PdfObject::Dict(d) => d.clone(),
711            PdfObject::Reference(r) => {
712                let r = r.clone();
713                match doc.resolve(&r) {
714                    Ok(PdfObject::Dict(d)) => d.clone(),
715                    _ => continue,
716                }
717            }
718            _ => continue,
719        };
720
721        let mut info = parse_font_info(&font_dict);
722
723        // Resolve ToUnicode CMap
724        let cmap = resolve_to_unicode(doc, &font_dict);
725        if cmap.is_some() {
726            info.to_unicode = None; // We already parsed it
727        }
728
729        // For Type0 fonts, try to get widths from descendant CIDFont
730        if font_dict.get_name(b"Subtype") == Some(b"Type0") {
731            resolve_type0_descendant(doc, &font_dict, &mut info);
732        }
733
734        fonts.insert(name.clone(), ResolvedFont { info, cmap });
735    }
736
737    fonts
738}
739
740fn resolve_to_unicode(doc: &PdfDocument, font_dict: &PdfDict) -> Option<ToUnicodeCMap> {
741    let tu_obj = font_dict.get(b"ToUnicode")?;
742    match tu_obj {
743        PdfObject::Reference(r) => {
744            let r = r.clone();
745            let obj = doc.resolve(&r).ok()?;
746            match obj {
747                PdfObject::Stream { dict, data } => {
748                    let decoded = doc.decode_stream(&dict, &data).ok()?;
749                    Some(ToUnicodeCMap::parse(&decoded))
750                }
751                _ => None,
752            }
753        }
754        PdfObject::Stream { dict, data } => {
755            let decoded = doc.decode_stream(dict, data).ok()?;
756            Some(ToUnicodeCMap::parse(&decoded))
757        }
758        _ => None,
759    }
760}
761
762fn resolve_type0_descendant(doc: &PdfDocument, font_dict: &PdfDict, info: &mut FontInfo) {
763    let descendants = match font_dict.get(b"DescendantFonts") {
764        Some(PdfObject::Array(arr)) => arr.clone(),
765        _ => return,
766    };
767
768    let descendant_ref = match descendants.first() {
769        Some(PdfObject::Reference(r)) => r.clone(),
770        _ => return,
771    };
772
773    let descendant = match doc.resolve(&descendant_ref) {
774        Ok(PdfObject::Dict(d)) => d,
775        _ => return,
776    };
777
778    // Get /W array for CID widths
779    if let Some(PdfObject::Array(w_array)) = descendant.get(b"W") {
780        info.widths = parse_cid_widths(w_array);
781    }
782
783    // Get /DW (default width)
784    if let Some(dw) = descendant.get(b"DW").and_then(|o| o.as_f64()) {
785        match &mut info.widths {
786            crate::font::FontWidths::CID {
787                default_width,
788                ..
789            } => *default_width = dw,
790            crate::font::FontWidths::None {
791                default_width,
792            } => *default_width = dw,
793            _ => {}
794        }
795    }
796
797    // Mark as Identity encoding for Type0
798    info.encoding = Encoding::Identity;
799}
800
801fn parse_cid_widths(w_array: &[PdfObject]) -> crate::font::FontWidths {
802    use crate::font::{CIDWidthEntry, FontWidths};
803
804    let mut entries = Vec::new();
805    let mut i = 0;
806
807    while i < w_array.len() {
808        let first = match w_array[i].as_i64() {
809            Some(v) => v as u32,
810            None => {
811                i += 1;
812                continue;
813            }
814        };
815        i += 1;
816
817        if i >= w_array.len() {
818            break;
819        }
820
821        match &w_array[i] {
822            PdfObject::Array(widths) => {
823                let ws: Vec<f64> = widths.iter().filter_map(|o| o.as_f64()).collect();
824                entries.push(CIDWidthEntry::List {
825                    first,
826                    widths: ws,
827                });
828                i += 1;
829            }
830            PdfObject::Integer(_) | PdfObject::Real(_) => {
831                if i + 1 < w_array.len() {
832                    let last = w_array[i].as_f64().unwrap_or(0.0) as u32;
833                    i += 1;
834                    let width = w_array.get(i).and_then(|o| o.as_f64()).unwrap_or(1000.0);
835                    i += 1;
836                    entries.push(CIDWidthEntry::Range {
837                        first,
838                        last,
839                        width,
840                    });
841                } else {
842                    break;
843                }
844            }
845            _ => {
846                i += 1;
847            }
848        }
849    }
850
851    FontWidths::CID {
852        default_width: 1000.0,
853        w_entries: entries,
854    }
855}
856
857// ---------------------------------------------------------------------------
858// Get content stream data for a page
859// ---------------------------------------------------------------------------
860
861fn get_page_content_data(doc: &PdfDocument, page: &PageInfo) -> Result<Vec<u8>> {
862    let contents_obj = match &page.contents_ref {
863        Some(obj) => obj.clone(),
864        None => return Ok(Vec::new()),
865    };
866
867    match contents_obj {
868        PdfObject::Reference(r) => {
869            let obj = doc.resolve(&r)?;
870            decode_content_obj(doc, &obj)
871        }
872        PdfObject::Array(arr) => {
873            let mut combined = Vec::new();
874            for item in &arr {
875                let data = match item {
876                    PdfObject::Reference(r) => {
877                        let r = r.clone();
878                        let obj = doc.resolve(&r)?;
879                        decode_content_obj(doc, &obj)?
880                    }
881                    _ => Vec::new(),
882                };
883                if !combined.is_empty() {
884                    combined.push(b' ');
885                }
886                combined.extend_from_slice(&data);
887            }
888            Ok(combined)
889        }
890        PdfObject::Stream { dict, data } => doc.decode_stream(&dict, &data),
891        _ => Ok(Vec::new()),
892    }
893}
894
895fn decode_content_obj(doc: &PdfDocument, obj: &PdfObject) -> Result<Vec<u8>> {
896    match obj {
897        PdfObject::Stream { dict, data } => doc.decode_stream(dict, data),
898        _ => Ok(Vec::new()),
899    }
900}
901
902// ---------------------------------------------------------------------------
903// Public API
904// ---------------------------------------------------------------------------
905
906/// Extract text from a single page.
907pub fn extract_page_text(doc: &PdfDocument, page: &PageInfo) -> Result<PageText> {
908    // Resolve fonts
909    let fonts = resolve_fonts(doc, &page.resources_ref);
910
911    // Get content stream data
912    let content_data = get_page_content_data(doc, page)?;
913
914    if content_data.is_empty() {
915        return Ok(PageText {
916            page_index: page.index,
917            chars: Vec::new(),
918            lines: Vec::new(),
919            blocks: Vec::new(),
920        });
921    }
922
923    // Parse content stream
924    let ops = parse_content_stream(&content_data)?;
925
926    // Run text interpreter
927    let interpreter = TextInterpreter::new(fonts);
928    let chars = interpreter.run(&ops);
929
930    // Group into structure
931    let words = group_into_words(&chars);
932    let lines = group_into_lines(&words);
933    // Use advanced layout: column detection, reading order, dehyphenation
934    let blocks = layout::detect_columns_and_reorder(&lines);
935
936    Ok(PageText {
937        page_index: page.index,
938        chars,
939        lines,
940        blocks,
941    })
942}
943
944/// Extract text from all pages of a document.
945pub fn extract_all_text(doc: &PdfDocument) -> Result<Vec<PageText>> {
946    let pages = collect_pages(doc)?;
947    let mut results = Vec::with_capacity(pages.len());
948
949    for page in &pages {
950        results.push(extract_page_text(doc, page)?);
951    }
952
953    Ok(results)
954}
955
956/// Extract plain text from a single page as a string.
957pub fn extract_page_text_string(doc: &PdfDocument, page: &PageInfo) -> Result<String> {
958    let page_text = extract_page_text(doc, page)?;
959    Ok(page_text.plain_text())
960}
961
962/// Extract plain text from all pages, joining with form feeds.
963pub fn extract_all_text_string(doc: &PdfDocument) -> Result<String> {
964    let pages = extract_all_text(doc)?;
965    let texts: Vec<String> = pages.iter().map(|p| p.plain_text()).collect();
966    Ok(texts.join("\n\n"))
967}
968
969// ---------------------------------------------------------------------------
970// Tests
971// ---------------------------------------------------------------------------
972
#[cfg(test)]
mod tests {
    use super::*;
    use crate::font::FontWidths;

    // -- Fixture helpers ----------------------------------------------------

    /// Build a content-stream operation from an operator keyword and operands.
    fn op(operator: &[u8], operands: Vec<Operand>) -> ContentOp {
        ContentOp {
            operator: operator.to_vec(),
            operands,
        }
    }

    /// `BT`
    fn begin_text() -> ContentOp {
        op(b"BT", vec![])
    }

    /// `ET`
    fn end_text() -> ContentOp {
        op(b"ET", vec![])
    }

    /// `/F1 12 Tf`
    fn select_f1_12() -> ContentOp {
        op(
            b"Tf",
            vec![Operand::Name(b"F1".to_vec()), Operand::Integer(12)],
        )
    }

    /// `(text) Tj`
    fn show_text(text: &[u8]) -> ContentOp {
        op(b"Tj", vec![Operand::String(text.to_vec())])
    }

    /// Font table holding a single simple font registered as `F1`
    /// (Type1, WinAnsi, no ToUnicode data, no CMap).
    fn fonts_with_f1(base_font: &[u8], widths: FontWidths) -> HashMap<Vec<u8>, ResolvedFont> {
        let mut fonts = HashMap::new();
        fonts.insert(
            b"F1".to_vec(),
            ResolvedFont {
                info: FontInfo {
                    base_font: base_font.to_vec(),
                    subtype: b"Type1".to_vec(),
                    encoding: Encoding::WinAnsiEncoding,
                    widths,
                    to_unicode: None,
                    is_standard14: true,
                    descriptor: None,
                },
                cmap: None,
            },
        );
        fonts
    }

    // -- Matrix -------------------------------------------------------------

    #[test]
    fn test_matrix_identity() {
        let m = Matrix::identity();
        let (x, y) = m.transform_point(10.0, 20.0);
        assert!((x - 10.0).abs() < 1e-10);
        assert!((y - 20.0).abs() < 1e-10);
    }

    #[test]
    fn test_matrix_translate() {
        let m = Matrix::translate(100.0, 200.0);
        let (x, y) = m.transform_point(0.0, 0.0);
        assert!((x - 100.0).abs() < 1e-10);
        assert!((y - 200.0).abs() < 1e-10);
    }

    #[test]
    fn test_matrix_concat() {
        let a = Matrix::translate(10.0, 20.0);
        let b = Matrix::translate(30.0, 40.0);
        let c = a.concat(&b);
        let (x, y) = c.transform_point(0.0, 0.0);
        assert!((x - 40.0).abs() < 1e-10);
        assert!((y - 60.0).abs() < 1e-10);
    }

    #[test]
    fn test_matrix_concat_order() {
        // Two translations commute, so they cannot reveal a swapped
        // multiplication order. A scale/translate pair does.
        let scale = Matrix {
            a: 2.0,
            b: 0.0,
            c: 0.0,
            d: 2.0,
            e: 0.0,
            f: 0.0,
        };
        let shift = Matrix::translate(10.0, 20.0);

        // Scale first, then translate: (1, 1) -> (2, 2) -> (12, 22)
        let (x, y) = scale.concat(&shift).transform_point(1.0, 1.0);
        assert!((x - 12.0).abs() < 1e-10);
        assert!((y - 22.0).abs() < 1e-10);

        // Translate first, then scale: (1, 1) -> (11, 21) -> (22, 42)
        let (x, y) = shift.concat(&scale).transform_point(1.0, 1.0);
        assert!((x - 22.0).abs() < 1e-10);
        assert!((y - 42.0).abs() < 1e-10);
    }

    #[test]
    fn test_matrix_scale() {
        let m = Matrix {
            a: 2.0,
            b: 0.0,
            c: 0.0,
            d: 3.0,
            e: 0.0,
            f: 0.0,
        };
        let (x, y) = m.transform_point(10.0, 10.0);
        assert!((x - 20.0).abs() < 1e-10);
        assert!((y - 30.0).abs() < 1e-10);
    }

    // -- Interpreter --------------------------------------------------------

    #[test]
    fn test_interpreter_basic_text() {
        // Simulate: BT /F1 12 Tf 72 720 Td (Hello) Tj ET
        let ops = vec![
            begin_text(),
            select_f1_12(),
            op(b"Td", vec![Operand::Integer(72), Operand::Integer(720)]),
            show_text(b"Hello"),
            end_text(),
        ];

        let fonts = fonts_with_f1(
            b"Helvetica",
            FontWidths::None {
                default_width: 600.0,
            },
        );

        let interpreter = TextInterpreter::new(fonts);
        let chars = interpreter.run(&ops);

        assert_eq!(chars.len(), 5);
        assert_eq!(chars[0].unicode, "H");
        assert_eq!(chars[1].unicode, "e");
        assert_eq!(chars[4].unicode, "o");
        // Position: first char at (72, 720), font size 12
        assert!((chars[0].x - 72.0).abs() < 0.01);
        assert!((chars[0].y - 720.0).abs() < 0.01);
    }

    #[test]
    fn test_interpreter_tj_array() {
        // BT /F1 12 Tf [(H) -100 (i)] TJ ET
        let ops = vec![
            begin_text(),
            select_f1_12(),
            op(
                b"TJ",
                vec![Operand::Array(vec![
                    Operand::String(b"H".to_vec()),
                    Operand::Integer(-100),
                    Operand::String(b"i".to_vec()),
                ])],
            ),
            end_text(),
        ];

        let fonts = fonts_with_f1(
            b"Helvetica",
            FontWidths::None {
                default_width: 500.0,
            },
        );

        let interpreter = TextInterpreter::new(fonts);
        let chars = interpreter.run(&ops);

        assert_eq!(chars.len(), 2);
        assert_eq!(chars[0].unicode, "H");
        assert_eq!(chars[1].unicode, "i");
        // "i" should be displaced by the kerning value
        let h_advance = 500.0 / 1000.0 * 12.0; // 6.0
        let kern = 100.0 / 1000.0 * 12.0; // 1.2
        let expected_i_x = h_advance + kern;
        assert!((chars[1].x - expected_i_x).abs() < 0.01);
    }

    #[test]
    fn test_interpreter_multiline() {
        // BT /F1 12 Tf 72 720 Td (Line1) Tj 0 -14 Td (Line2) Tj ET
        let ops = vec![
            begin_text(),
            select_f1_12(),
            op(b"Td", vec![Operand::Integer(72), Operand::Integer(720)]),
            show_text(b"Line1"),
            op(b"Td", vec![Operand::Integer(0), Operand::Integer(-14)]),
            show_text(b"Line2"),
            end_text(),
        ];

        let fonts = fonts_with_f1(
            b"Courier",
            FontWidths::None {
                default_width: 600.0,
            },
        );

        let interpreter = TextInterpreter::new(fonts);
        let chars = interpreter.run(&ops);

        assert_eq!(chars.len(), 10);
        // Line1 starts at y=720
        assert!((chars[0].y - 720.0).abs() < 0.01);
        // Line2 starts at y=706 (720 - 14)
        assert!((chars[5].y - 706.0).abs() < 0.01);

        let words = group_into_words(&chars);
        let lines = group_into_lines(&words);
        assert_eq!(lines.len(), 2);
        assert_eq!(lines[0].text, "Line1");
        assert_eq!(lines[1].text, "Line2");
    }

    #[test]
    fn test_graphics_state_save_restore() {
        // q 2 0 0 2 0 0 cm BT /F1 12 Tf (A) Tj ET Q BT /F1 12 Tf (B) Tj ET
        let ops = vec![
            op(b"q", vec![]),
            op(
                b"cm",
                vec![
                    Operand::Integer(2),
                    Operand::Integer(0),
                    Operand::Integer(0),
                    Operand::Integer(2),
                    Operand::Integer(0),
                    Operand::Integer(0),
                ],
            ),
            begin_text(),
            select_f1_12(),
            show_text(b"A"),
            end_text(),
            op(b"Q", vec![]),
            begin_text(),
            select_f1_12(),
            show_text(b"B"),
            end_text(),
        ];

        let fonts = fonts_with_f1(
            b"Helvetica",
            FontWidths::None {
                default_width: 600.0,
            },
        );

        let interpreter = TextInterpreter::new(fonts);
        let chars = interpreter.run(&ops);

        assert_eq!(chars.len(), 2);
        assert_eq!(chars[0].unicode, "A");
        assert_eq!(chars[1].unicode, "B");
        // "A" is drawn under a 2x CTM; "B" is drawn after Q restored the
        // previous graphics state. Neither has a Td, so only the reported
        // font size is compared.
        assert!((chars[0].font_size - 24.0).abs() < 0.01); // 12 * 2
        assert!((chars[1].font_size - 12.0).abs() < 0.01); // 12 * 1
    }

    // -- Grouping -----------------------------------------------------------

    #[test]
    fn test_word_grouping() {
        // All characters share the baseline y=720, size 12 and font "F1".
        let ch = |unicode: &str, x, width| TextChar {
            unicode: unicode.into(),
            x,
            y: 720.0,
            font_size: 12.0,
            font_name: "F1".into(),
            width,
        };
        let chars = vec![
            ch("H", 72.0, 7.0),
            ch("i", 79.0, 3.0),
            ch(" ", 82.0, 3.0),
            ch("A", 90.0, 7.0),
        ];

        let words = group_into_words(&chars);
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "Hi");
        assert_eq!(words[1].text, "A");
    }

    #[test]
    fn test_line_grouping() {
        let word = |text: &str, x, y, width| TextWord {
            text: text.into(),
            x,
            y,
            width,
            font_size: 12.0,
        };
        let words = vec![
            word("Hello", 72.0, 720.0, 30.0),
            word("World", 110.0, 720.0, 30.0),
            word("Next", 72.0, 700.0, 24.0),
        ];

        let lines = group_into_lines(&words);
        assert_eq!(lines.len(), 2);
        assert_eq!(lines[0].text, "Hello World");
        assert_eq!(lines[1].text, "Next");
    }

    #[test]
    fn test_empty_chars() {
        let words = group_into_words(&[]);
        assert!(words.is_empty());
        let lines = group_into_lines(&[]);
        assert!(lines.is_empty());
        let blocks = layout::detect_columns_and_reorder(&[]);
        assert!(blocks.is_empty());
    }
}