oxidize_pdf/text/
extraction.rs

1//! Text extraction from PDF content streams
2//!
3//! This module provides functionality to extract text from PDF pages,
4//! handling text positioning, transformations, and basic encodings.
5
6use crate::parser::content::{ContentOperation, ContentParser, TextElement};
7use crate::parser::document::PdfDocument;
8use crate::parser::ParseResult;
9use std::io::{Read, Seek};
10
/// Text extraction options
///
/// These settings control how [`TextExtractor`] joins the text runs found in a
/// page's content streams and whether positioned fragments are recorded.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// Preserve the original layout (spacing and positioning)
    ///
    /// When `true`, the extractor records [`TextFragment`]s with position data
    /// in addition to the plain text. Defaults to `false`.
    pub preserve_layout: bool,
    /// Minimum space width to insert space character (in text space units)
    ///
    /// The extractor scales this value by the current font size and inserts a
    /// space when the horizontal gap to the previous text run exceeds the
    /// result. Defaults to `0.2`.
    pub space_threshold: f64,
    /// Minimum vertical distance to insert newline (in text space units)
    ///
    /// A newline is inserted when the absolute vertical distance between two
    /// consecutive text runs exceeds this value. Defaults to `10.0`.
    pub newline_threshold: f64,
}
21
22impl Default for ExtractionOptions {
23    fn default() -> Self {
24        Self {
25            preserve_layout: false,
26            space_threshold: 0.2,
27            newline_threshold: 10.0,
28        }
29    }
30}
31
/// Extracted text with position information
///
/// This is the per-page result produced by [`TextExtractor`].
#[derive(Debug, Clone)]
pub struct ExtractedText {
    /// The extracted text content, with spaces and newlines inserted by the
    /// extractor where the position of consecutive text runs suggests a gap
    pub text: String,
    /// Text fragments with position information (if preserve_layout is true);
    /// left empty when layout preservation is disabled
    pub fragments: Vec<TextFragment>,
}
40
/// A fragment of text with position information
///
/// Fragments are only recorded when `ExtractionOptions::preserve_layout` is
/// enabled.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// Text content (already decoded to a Rust string)
    pub text: String,
    /// X position in page coordinates
    ///
    /// Obtained by transforming the origin through the text matrix that was in
    /// effect when the text was shown.
    // NOTE(review): the extractor in this file never applies the CTM, so this
    // equals true page coordinates only while the CTM is the identity — confirm.
    pub x: f64,
    /// Y position in page coordinates (same caveat as `x`)
    pub y: f64,
    /// Width of the text
    ///
    /// This is an estimate derived from the character count and font size, not
    /// from real glyph metrics.
    pub width: f64,
    /// Height of the text (the extractor sets this to the font size)
    pub height: f64,
    /// Font size in effect when the text was shown
    pub font_size: f64,
}
57
/// Text extraction state
///
/// Mirrors the text-related graphics state parameters that the content-stream
/// operators update while a page is being processed. Matrices are stored as the
/// six-element `[a, b, c, d, e, f]` form used by the matrix helpers in this
/// file.
struct TextState {
    /// Current text matrix; advanced after each shown text run
    text_matrix: [f64; 6],
    /// Current text line matrix; the base that line-positioning operators
    /// translate from
    text_line_matrix: [f64; 6],
    /// Current transformation matrix (CTM)
    // Initialised to the identity and not read anywhere in this file, hence
    // the lint suppression below.
    #[allow(dead_code)]
    ctm: [f64; 6],
    /// Text leading (line spacing); used as the downward offset for next-line moves
    leading: f64,
    /// Character spacing (recorded from the content stream)
    char_space: f64,
    /// Word spacing (recorded from the content stream)
    word_space: f64,
    /// Horizontal scaling, as a percentage (the default is 100.0)
    horizontal_scale: f64,
    /// Text rise (recorded from the content stream)
    text_rise: f64,
    /// Current font size
    font_size: f64,
    /// Current font name; `None` until a font has been selected
    font_name: Option<String>,
    /// Render mode (0 = fill, 1 = stroke, etc.)
    render_mode: u8,
}
84
85impl Default for TextState {
86    fn default() -> Self {
87        Self {
88            text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
89            text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
90            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
91            leading: 0.0,
92            char_space: 0.0,
93            word_space: 0.0,
94            horizontal_scale: 100.0,
95            text_rise: 0.0,
96            font_size: 0.0,
97            font_name: None,
98            render_mode: 0,
99        }
100    }
101}
102
/// Text extractor for PDF pages
///
/// Walks the content streams of a page, tracks the text state, and assembles
/// the shown text into an [`ExtractedText`].
pub struct TextExtractor {
    /// Options that govern separator insertion and fragment recording
    options: ExtractionOptions,
}
107
108impl TextExtractor {
109    /// Create a new text extractor with default options
110    pub fn new() -> Self {
111        Self {
112            options: ExtractionOptions::default(),
113        }
114    }
115
116    /// Create a text extractor with custom options
117    pub fn with_options(options: ExtractionOptions) -> Self {
118        Self { options }
119    }
120
121    /// Extract text from a PDF document
122    pub fn extract_from_document<R: Read + Seek>(
123        &self,
124        document: &PdfDocument<R>,
125    ) -> ParseResult<Vec<ExtractedText>> {
126        let page_count = document.page_count()?;
127        let mut results = Vec::new();
128
129        for i in 0..page_count {
130            let text = self.extract_from_page(document, i)?;
131            results.push(text);
132        }
133
134        Ok(results)
135    }
136
137    /// Extract text from a specific page
138    pub fn extract_from_page<R: Read + Seek>(
139        &self,
140        document: &PdfDocument<R>,
141        page_index: u32,
142    ) -> ParseResult<ExtractedText> {
143        // Get the page
144        let page = document.get_page(page_index)?;
145
146        // Get content streams
147        let streams = page.content_streams_with_document(document)?;
148
149        let mut extracted_text = String::new();
150        let mut fragments = Vec::new();
151        let mut state = TextState::default();
152        let mut in_text_object = false;
153        let mut last_x = 0.0;
154        let mut last_y = 0.0;
155
156        // Process each content stream
157        for stream_data in streams {
158            let operations = ContentParser::parse_content(&stream_data)?;
159
160            for op in operations {
161                match op {
162                    ContentOperation::BeginText => {
163                        in_text_object = true;
164                        // Reset text matrix to identity
165                        state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
166                        state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
167                    }
168
169                    ContentOperation::EndText => {
170                        in_text_object = false;
171                    }
172
173                    ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
174                        state.text_matrix =
175                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
176                        state.text_line_matrix =
177                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
178                    }
179
180                    ContentOperation::MoveText(tx, ty) => {
181                        // Update text matrix by translation
182                        let new_matrix = multiply_matrix(
183                            &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
184                            &state.text_line_matrix,
185                        );
186                        state.text_matrix = new_matrix;
187                        state.text_line_matrix = new_matrix;
188                    }
189
190                    ContentOperation::NextLine => {
191                        // Move to next line using current leading
192                        let new_matrix = multiply_matrix(
193                            &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
194                            &state.text_line_matrix,
195                        );
196                        state.text_matrix = new_matrix;
197                        state.text_line_matrix = new_matrix;
198                    }
199
200                    ContentOperation::ShowText(text) => {
201                        if in_text_object {
202                            let text_bytes = &text;
203                            let decoded = self.decode_text(text_bytes, &state)?;
204
205                            // Calculate position
206                            let (x, y) = transform_point(0.0, 0.0, &state.text_matrix);
207
208                            // Add spacing based on position change
209                            if !extracted_text.is_empty() {
210                                let dx = x - last_x;
211                                let dy = (y - last_y).abs();
212
213                                if dy > self.options.newline_threshold {
214                                    extracted_text.push('\n');
215                                } else if dx > self.options.space_threshold * state.font_size {
216                                    extracted_text.push(' ');
217                                }
218                            }
219
220                            extracted_text.push_str(&decoded);
221
222                            if self.options.preserve_layout {
223                                fragments.push(TextFragment {
224                                    text: decoded.clone(),
225                                    x,
226                                    y,
227                                    width: calculate_text_width(&decoded, state.font_size),
228                                    height: state.font_size,
229                                    font_size: state.font_size,
230                                });
231                            }
232
233                            // Update position for next text
234                            last_x = x + calculate_text_width(&decoded, state.font_size);
235                            last_y = y;
236
237                            // Update text matrix for next show operation
238                            let text_width = calculate_text_width(&decoded, state.font_size);
239                            let tx = text_width * state.horizontal_scale / 100.0;
240                            state.text_matrix =
241                                multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
242                        }
243                    }
244
245                    ContentOperation::ShowTextArray(array) => {
246                        if in_text_object {
247                            for item in array {
248                                match item {
249                                    TextElement::Text(text_bytes) => {
250                                        let decoded = self.decode_text(&text_bytes, &state)?;
251                                        extracted_text.push_str(&decoded);
252
253                                        // Update text matrix
254                                        let text_width =
255                                            calculate_text_width(&decoded, state.font_size);
256                                        let tx = text_width * state.horizontal_scale / 100.0;
257                                        state.text_matrix = multiply_matrix(
258                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
259                                            &state.text_matrix,
260                                        );
261                                    }
262                                    TextElement::Spacing(adjustment) => {
263                                        // Text position adjustment (negative = move left)
264                                        let tx = -(adjustment as f64) / 1000.0 * state.font_size;
265                                        state.text_matrix = multiply_matrix(
266                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
267                                            &state.text_matrix,
268                                        );
269                                    }
270                                }
271                            }
272                        }
273                    }
274
275                    ContentOperation::SetFont(name, size) => {
276                        state.font_name = Some(name);
277                        state.font_size = size as f64;
278                    }
279
280                    ContentOperation::SetLeading(leading) => {
281                        state.leading = leading as f64;
282                    }
283
284                    ContentOperation::SetCharSpacing(spacing) => {
285                        state.char_space = spacing as f64;
286                    }
287
288                    ContentOperation::SetWordSpacing(spacing) => {
289                        state.word_space = spacing as f64;
290                    }
291
292                    ContentOperation::SetHorizontalScaling(scale) => {
293                        state.horizontal_scale = scale as f64;
294                    }
295
296                    ContentOperation::SetTextRise(rise) => {
297                        state.text_rise = rise as f64;
298                    }
299
300                    ContentOperation::SetTextRenderMode(mode) => {
301                        state.render_mode = mode as u8;
302                    }
303
304                    _ => {
305                        // Other operations don't affect text extraction
306                    }
307                }
308            }
309        }
310
311        Ok(ExtractedText {
312            text: extracted_text,
313            fragments,
314        })
315    }
316
317    /// Decode text using the current font encoding
318    fn decode_text(&self, text: &[u8], _state: &TextState) -> ParseResult<String> {
319        // TODO: Get actual font encoding from state
320        // For now, assume WinAnsiEncoding which is the default for most PDFs
321        use crate::text::encoding::TextEncoding;
322
323        let encoding = TextEncoding::WinAnsiEncoding;
324        Ok(encoding.decode(text))
325    }
326}
327
328impl Default for TextExtractor {
329    fn default() -> Self {
330        Self::new()
331    }
332}
333
/// Compute the product `a × b` of two affine transformation matrices.
///
/// Each matrix is given in the six-element form `[a, b, c, d, e, f]`, i.e. the
/// first two columns of a 3×3 matrix whose last column is `[0, 0, 1]`. The
/// result applies `a` first and then `b`.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    // Linear part: ordinary 2×2 product.
    let m00 = a0 * b0 + a1 * b2;
    let m01 = a0 * b1 + a1 * b3;
    let m10 = a2 * b0 + a3 * b2;
    let m11 = a2 * b1 + a3 * b3;

    // Translation part: `a`'s translation pushed through `b`, plus `b`'s own.
    let tx = a4 * b0 + a5 * b2 + b4;
    let ty = a4 * b1 + a5 * b3 + b5;

    [m00, m01, m10, m11, tx, ty]
}
345
/// Map the point `(x, y)` through an affine matrix `[a, b, c, d, e, f]`.
///
/// Returns `(a·x + c·y + e, b·x + d·y + f)`.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
352
/// Calculate approximate text width (simplified)
///
/// No font metrics are consulted: every character is assumed to be half the
/// font size wide. The width is based on the number of Unicode scalar values
/// in `text`, not on its UTF-8 byte length, so non-ASCII characters count once.
///
/// * `text` - decoded text to measure
/// * `font_size` - current font size
///
/// Returns the estimated width in the same units as `font_size`.
fn calculate_text_width(text: &str, font_size: f64) -> f64 {
    /// Assumed average glyph width as a fraction of the font size.
    const AVERAGE_GLYPH_WIDTH: f64 = 0.5;

    // `str::len` would count bytes and over-estimate multi-byte characters.
    text.chars().count() as f64 * font_size * AVERAGE_GLYPH_WIDTH
}
358
#[cfg(test)]
mod tests {
    use super::*;

    /// Multiplying by the identity on either side leaves a translation unchanged.
    #[test]
    fn test_matrix_multiplication() {
        let shift = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let unit = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

        assert_eq!(multiply_matrix(&unit, &shift), shift);
        assert_eq!(multiply_matrix(&shift, &unit), shift);
    }

    /// A pure translation offsets the point by the matrix's `e` and `f` terms.
    #[test]
    fn test_transform_point() {
        let shift = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];

        let (moved_x, moved_y) = transform_point(5.0, 5.0, &shift);

        assert_eq!(moved_x, 15.0);
        assert_eq!(moved_y, 25.0);
    }
}