oxidize_pdf/text/
extraction.rs

//! Text extraction from PDF content streams.
//!
//! This module provides functionality to extract text from PDF pages,
//! handling text positioning, transformations, and basic encodings.
5
6use crate::parser::ParseResult;
7use crate::parser::content::{ContentParser, ContentOperation, TextElement};
8use crate::parser::document::PdfDocument;
9use std::io::{Read, Seek};
10
/// Tunable settings that control how page text is assembled.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// When `true`, extraction also records positioned text fragments
    /// in addition to the plain text.
    pub preserve_layout: bool,
    /// Horizontal gap, expressed as a multiple of the current font size,
    /// above which a space character is inserted between two text runs.
    pub space_threshold: f64,
    /// Vertical distance between two text runs above which a newline is
    /// inserted.
    pub newline_threshold: f64,
}
21
22impl Default for ExtractionOptions {
23    fn default() -> Self {
24        Self {
25            preserve_layout: false,
26            space_threshold: 0.2,
27            newline_threshold: 10.0,
28        }
29    }
30}
31
32/// Extracted text with position information
33#[derive(Debug, Clone)]
34pub struct ExtractedText {
35    /// The extracted text content
36    pub text: String,
37    /// Text fragments with position information (if preserve_layout is true)
38    pub fragments: Vec<TextFragment>,
39}
40
/// One run of text together with where it was placed on the page.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// The decoded characters of this run.
    pub text: String,
    /// Horizontal position of the run's origin, in page coordinates.
    pub x: f64,
    /// Vertical position of the run's origin, in page coordinates.
    pub y: f64,
    /// Estimated width of the run.
    pub width: f64,
    /// Height of the run.
    pub height: f64,
    /// Font size that was active when the run was shown.
    pub font_size: f64,
}
57
/// Mutable text-related state tracked while walking a page's operations.
///
/// Matrices are stored as six-element arrays `[a, b, c, d, e, f]`.
struct TextState {
    /// Matrix giving the position of the next piece of text.
    text_matrix: [f64; 6],
    /// Matrix marking the start of the current line of text.
    text_line_matrix: [f64; 6],
    /// Current transformation matrix (CTM).
    ctm: [f64; 6],
    /// Leading: the vertical step used when moving to the next line.
    leading: f64,
    /// Extra spacing applied per character.
    char_space: f64,
    /// Extra spacing applied per word.
    word_space: f64,
    /// Horizontal scaling as a percentage (`100.0` means unscaled).
    horizontal_scale: f64,
    /// Text rise (baseline shift).
    text_rise: f64,
    /// Size of the currently selected font.
    font_size: f64,
    /// Name of the currently selected font, if one has been set.
    font_name: Option<String>,
    /// Text render mode (0 = fill, 1 = stroke, etc.).
    render_mode: u8,
}
83
84impl Default for TextState {
85    fn default() -> Self {
86        Self {
87            text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
88            text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
89            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
90            leading: 0.0,
91            char_space: 0.0,
92            word_space: 0.0,
93            horizontal_scale: 100.0,
94            text_rise: 0.0,
95            font_size: 0.0,
96            font_name: None,
97            render_mode: 0,
98        }
99    }
100}
101
102/// Text extractor for PDF pages
103pub struct TextExtractor {
104    options: ExtractionOptions,
105}
106
107impl TextExtractor {
108    /// Create a new text extractor with default options
109    pub fn new() -> Self {
110        Self {
111            options: ExtractionOptions::default(),
112        }
113    }
114    
115    /// Create a text extractor with custom options
116    pub fn with_options(options: ExtractionOptions) -> Self {
117        Self { options }
118    }
119    
120    /// Extract text from a PDF document
121    pub fn extract_from_document<R: Read + Seek>(
122        &self,
123        document: &PdfDocument<R>,
124    ) -> ParseResult<Vec<ExtractedText>> {
125        let page_count = document.page_count()?;
126        let mut results = Vec::new();
127        
128        for i in 0..page_count {
129            let text = self.extract_from_page(document, i)?;
130            results.push(text);
131        }
132        
133        Ok(results)
134    }
135    
136    /// Extract text from a specific page
137    pub fn extract_from_page<R: Read + Seek>(
138        &self,
139        document: &PdfDocument<R>,
140        page_index: u32,
141    ) -> ParseResult<ExtractedText> {
142        // Get the page
143        let page = document.get_page(page_index)?;
144        
145        // Get content streams
146        let streams = page.content_streams_with_document(document)?;
147        
148        let mut extracted_text = String::new();
149        let mut fragments = Vec::new();
150        let mut state = TextState::default();
151        let mut in_text_object = false;
152        let mut last_x = 0.0;
153        let mut last_y = 0.0;
154        
155        // Process each content stream
156        for stream_data in streams {
157            let operations = ContentParser::parse_content(&stream_data)?;
158            
159            for op in operations {
160                match op {
161                    ContentOperation::BeginText => {
162                        in_text_object = true;
163                        // Reset text matrix to identity
164                        state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
165                        state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
166                    }
167                    
168                    ContentOperation::EndText => {
169                        in_text_object = false;
170                    }
171                    
172                    ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
173                        state.text_matrix = [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
174                        state.text_line_matrix = [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
175                    }
176                    
177                    ContentOperation::MoveText(tx, ty) => {
178                        // Update text matrix by translation
179                        let new_matrix = multiply_matrix(
180                            &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
181                            &state.text_line_matrix,
182                        );
183                        state.text_matrix = new_matrix;
184                        state.text_line_matrix = new_matrix;
185                    }
186                    
187                    ContentOperation::NextLine => {
188                        // Move to next line using current leading
189                        let new_matrix = multiply_matrix(
190                            &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
191                            &state.text_line_matrix,
192                        );
193                        state.text_matrix = new_matrix;
194                        state.text_line_matrix = new_matrix;
195                    }
196                    
197                    ContentOperation::ShowText(text) => {
198                        if in_text_object {
199                            let text_bytes = &text;
200                            let decoded = self.decode_text(text_bytes, &state)?;
201                            
202                            // Calculate position
203                            let (x, y) = transform_point(0.0, 0.0, &state.text_matrix);
204                            
205                            // Add spacing based on position change
206                            if !extracted_text.is_empty() {
207                                let dx = x - last_x;
208                                let dy = (y - last_y).abs();
209                                
210                                if dy > self.options.newline_threshold {
211                                    extracted_text.push('\n');
212                                } else if dx > self.options.space_threshold * state.font_size {
213                                    extracted_text.push(' ');
214                                }
215                            }
216                            
217                            extracted_text.push_str(&decoded);
218                            
219                            if self.options.preserve_layout {
220                                fragments.push(TextFragment {
221                                    text: decoded.clone(),
222                                    x,
223                                    y,
224                                    width: calculate_text_width(&decoded, state.font_size),
225                                    height: state.font_size,
226                                    font_size: state.font_size,
227                                });
228                            }
229                            
230                            // Update position for next text
231                            last_x = x + calculate_text_width(&decoded, state.font_size);
232                            last_y = y;
233                            
234                            // Update text matrix for next show operation
235                            let text_width = calculate_text_width(&decoded, state.font_size);
236                            let tx = text_width * state.horizontal_scale / 100.0;
237                            state.text_matrix = multiply_matrix(
238                                &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
239                                &state.text_matrix,
240                            );
241                        }
242                    }
243                    
244                    ContentOperation::ShowTextArray(array) => {
245                        if in_text_object {
246                            for item in array {
247                                match item {
248                                    TextElement::Text(text_bytes) => {
249                                        let decoded = self.decode_text(&text_bytes, &state)?;
250                                        extracted_text.push_str(&decoded);
251                                        
252                                        // Update text matrix
253                                        let text_width = calculate_text_width(&decoded, state.font_size);
254                                        let tx = text_width * state.horizontal_scale / 100.0;
255                                        state.text_matrix = multiply_matrix(
256                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
257                                            &state.text_matrix,
258                                        );
259                                    }
260                                    TextElement::Spacing(adjustment) => {
261                                        // Text position adjustment (negative = move left)
262                                        let tx = -(adjustment as f64) / 1000.0 * state.font_size;
263                                        state.text_matrix = multiply_matrix(
264                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
265                                            &state.text_matrix,
266                                        );
267                                    }
268                                }
269                            }
270                        }
271                    }
272                    
273                    ContentOperation::SetFont(name, size) => {
274                        state.font_name = Some(name);
275                        state.font_size = size as f64;
276                    }
277                    
278                    ContentOperation::SetLeading(leading) => {
279                        state.leading = leading as f64;
280                    }
281                    
282                    ContentOperation::SetCharSpacing(spacing) => {
283                        state.char_space = spacing as f64;
284                    }
285                    
286                    ContentOperation::SetWordSpacing(spacing) => {
287                        state.word_space = spacing as f64;
288                    }
289                    
290                    ContentOperation::SetHorizontalScaling(scale) => {
291                        state.horizontal_scale = scale as f64;
292                    }
293                    
294                    ContentOperation::SetTextRise(rise) => {
295                        state.text_rise = rise as f64;
296                    }
297                    
298                    ContentOperation::SetTextRenderMode(mode) => {
299                        state.render_mode = mode as u8;
300                    }
301                    
302                    _ => {
303                        // Other operations don't affect text extraction
304                    }
305                }
306            }
307        }
308        
309        Ok(ExtractedText {
310            text: extracted_text,
311            fragments,
312        })
313    }
314    
315    /// Decode text using the current font encoding
316    fn decode_text(&self, text: &[u8], _state: &TextState) -> ParseResult<String> {
317        // TODO: Get actual font encoding from state
318        // For now, assume WinAnsiEncoding which is the default for most PDFs
319        use crate::text::encoding::TextEncoding;
320        
321        let encoding = TextEncoding::WinAnsiEncoding;
322        Ok(encoding.decode(text))
323    }
324}
325
326impl Default for TextExtractor {
327    fn default() -> Self {
328        Self::new()
329    }
330}
331
/// Computes the product `a × b` of two affine transformation matrices.
///
/// Each six-element array `[a, b, c, d, e, f]` stands for the 3×3 matrix
/// with rows `[a b 0]`, `[c d 0]` and `[e f 1]`.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    [
        // Linear part.
        a0 * b0 + a1 * b2,
        a0 * b1 + a1 * b3,
        a2 * b0 + a3 * b2,
        a2 * b1 + a3 * b3,
        // Translation part: `a`'s offset mapped through `b`, plus `b`'s own.
        a4 * b0 + a5 * b2 + b4,
        a4 * b1 + a5 * b3 + b5,
    ]
}
343
/// Applies the affine transform `matrix` (`[a, b, c, d, e, f]`) to the
/// point `(x, y)` and returns the transformed coordinates.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
350
/// Estimates the width of `text` when set at `font_size`.
///
/// This is a rough approximation that assumes every character is half the
/// font size wide; no real font metrics are consulted. The estimate is
/// based on the number of characters, not the number of UTF-8 bytes, so
/// non-ASCII characters count once like any other character.
fn calculate_text_width(text: &str, font_size: f64) -> f64 {
    // `str::len` would count bytes and over-estimate non-ASCII text.
    let char_count = text.chars().count();
    char_count as f64 * font_size * 0.5
}
356
#[cfg(test)]
mod tests {
    use super::*;

    /// Identity transform shared by the tests below.
    const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
    /// Pure translation by (10, 20).
    const TRANSLATION: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];

    /// Multiplying by the identity on either side leaves a matrix unchanged.
    #[test]
    fn test_matrix_multiplication() {
        assert_eq!(multiply_matrix(&IDENTITY, &TRANSLATION), TRANSLATION);
        assert_eq!(multiply_matrix(&TRANSLATION, &IDENTITY), TRANSLATION);
    }

    /// A translation shifts both coordinates by its offsets.
    #[test]
    fn test_transform_point() {
        let (x, y) = transform_point(5.0, 5.0, &TRANSLATION);
        assert_eq!(x, 15.0);
        assert_eq!(y, 25.0);
    }
}