oxidize_pdf/streaming/
text_streamer.rs

1//! Text streaming for incremental text extraction
2//!
3//! Extracts text from PDF content streams incrementally, processing
4//! text operations as they are encountered.
5
6use crate::error::Result;
7use crate::parser::content::{ContentOperation, ContentParser};
8use std::collections::VecDeque;
9
/// A chunk of extracted text with position information.
///
/// One `TextChunk` corresponds to a single show-text operation, tagged with
/// the font and position state that was active when the text was shown.
#[derive(Debug, Clone)]
pub struct TextChunk {
    /// The extracted text.
    pub text: String,
    /// Horizontal position on the page.
    pub x: f64,
    /// Vertical position on the page.
    pub y: f64,
    /// Font size in points.
    pub font_size: f64,
    /// Name of the font in effect, if one was set.
    pub font_name: Option<String>,
}
24
/// Options for text streaming.
#[derive(Debug, Clone)]
pub struct TextStreamOptions {
    /// Minimum font size a chunk must have to be included.
    pub min_font_size: f64,
    /// Maximum total bytes of chunk text kept in the internal buffer.
    pub max_buffer_size: usize,
    /// Whether to preserve formatting.
    pub preserve_formatting: bool,
    /// Whether extracted text is sorted by page position.
    pub sort_by_position: bool,
}

impl Default for TextStreamOptions {
    /// Defaults: keep all text regardless of size, buffer up to 1 MiB,
    /// preserve formatting, and sort output by position.
    fn default() -> Self {
        TextStreamOptions {
            min_font_size: 0.0,
            max_buffer_size: 1 << 20, // 1 MiB
            preserve_formatting: true,
            sort_by_position: true,
        }
    }
}
48
/// Streams text from PDF content
pub struct TextStreamer {
    // Streaming configuration (filtering, buffer cap, sorting).
    options: TextStreamOptions,
    // Bounded FIFO of chunks extracted so far; trimmed oldest-first.
    buffer: VecDeque<TextChunk>,
    // Name of the most recently selected font, if any Tf has been seen.
    current_font: Option<String>,
    // Size of the most recently selected font; starts at a 12.0 fallback.
    current_font_size: f64,
    // Current text X position, updated by Td and reset by BT.
    current_x: f64,
    // Current text Y position, updated by Td and reset by BT.
    current_y: f64,
}
58
59impl TextStreamer {
60    /// Create a new text streamer
61    pub fn new(options: TextStreamOptions) -> Self {
62        Self {
63            options,
64            buffer: VecDeque::new(),
65            current_font: None,
66            current_font_size: 12.0,
67            current_x: 0.0,
68            current_y: 0.0,
69        }
70    }
71
72    /// Process a content stream chunk
73    pub fn process_chunk(&mut self, data: &[u8]) -> Result<Vec<TextChunk>> {
74        let operations = ContentParser::parse(data)
75            .map_err(|e| crate::error::PdfError::ParseError(e.to_string()))?;
76
77        let mut chunks = Vec::new();
78
79        for op in operations {
80            match op {
81                ContentOperation::SetFont(name, size) => {
82                    self.current_font = Some(name);
83                    self.current_font_size = size as f64;
84                }
85                ContentOperation::MoveText(x, y) => {
86                    self.current_x += x as f64;
87                    self.current_y += y as f64;
88                }
89                ContentOperation::ShowText(bytes) => {
90                    if self.current_font_size >= self.options.min_font_size {
91                        let text = String::from_utf8_lossy(&bytes).to_string();
92                        let chunk = TextChunk {
93                            text,
94                            x: self.current_x,
95                            y: self.current_y,
96                            font_size: self.current_font_size,
97                            font_name: self.current_font.clone(),
98                        };
99                        chunks.push(chunk);
100                    }
101                }
102                ContentOperation::BeginText => {
103                    self.current_x = 0.0;
104                    self.current_y = 0.0;
105                }
106                _ => {} // Ignore other operations
107            }
108        }
109
110        // Add to buffer if needed
111        for chunk in &chunks {
112            self.buffer.push_back(chunk.clone());
113        }
114
115        // Check buffer size
116        self.check_buffer_size();
117
118        Ok(chunks)
119    }
120
121    /// Get all buffered text chunks
122    pub fn get_buffered_chunks(&self) -> Vec<TextChunk> {
123        self.buffer.iter().cloned().collect()
124    }
125
126    /// Clear the buffer
127    pub fn clear_buffer(&mut self) {
128        self.buffer.clear();
129    }
130
131    /// Extract text as a single string
132    pub fn extract_text(&self) -> String {
133        let mut chunks = self.get_buffered_chunks();
134
135        if self.options.sort_by_position {
136            // Sort by Y position (top to bottom), then X (left to right)
137            chunks.sort_by(|a, b| {
138                b.y.partial_cmp(&a.y)
139                    .unwrap_or(std::cmp::Ordering::Equal)
140                    .then(a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal))
141            });
142        }
143
144        chunks
145            .into_iter()
146            .map(|chunk| chunk.text)
147            .collect::<Vec<_>>()
148            .join(" ")
149    }
150
151    fn check_buffer_size(&mut self) {
152        let total_size: usize = self.buffer.iter().map(|chunk| chunk.text.len()).sum();
153
154        // Remove oldest chunks if buffer is too large
155        while total_size > self.options.max_buffer_size && !self.buffer.is_empty() {
156            self.buffer.pop_front();
157        }
158    }
159}
160
161/// Stream text from multiple content streams
162pub fn stream_text<F>(content_streams: Vec<Vec<u8>>, mut callback: F) -> Result<()>
163where
164    F: FnMut(TextChunk) -> Result<()>,
165{
166    let mut streamer = TextStreamer::new(TextStreamOptions::default());
167
168    for stream in content_streams {
169        let chunks = streamer.process_chunk(&stream)?;
170        for chunk in chunks {
171            callback(chunk)?;
172        }
173    }
174
175    Ok(())
176}
177
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_text_chunk() {
        let chunk = TextChunk {
            text: String::from("Hello"),
            x: 100.0,
            y: 700.0,
            font_size: 12.0,
            font_name: Some(String::from("Helvetica")),
        };

        assert_eq!(chunk.text, "Hello");
        assert_eq!(chunk.x, 100.0);
        assert_eq!(chunk.y, 700.0);
        assert_eq!(chunk.font_size, 12.0);
        assert_eq!(chunk.font_name, Some(String::from("Helvetica")));
    }

    #[test]
    fn test_text_stream_options_default() {
        let opts = TextStreamOptions::default();

        assert_eq!(opts.min_font_size, 0.0);
        assert_eq!(opts.max_buffer_size, 1024 * 1024);
        assert!(opts.preserve_formatting);
        assert!(opts.sort_by_position);
    }

    #[test]
    fn test_text_streamer_creation() {
        let streamer = TextStreamer::new(TextStreamOptions::default());

        assert!(streamer.buffer.is_empty());
        assert_eq!(streamer.current_font_size, 12.0);
        assert_eq!(streamer.current_x, 0.0);
        assert_eq!(streamer.current_y, 0.0);
    }

    #[test]
    fn test_process_chunk_text() {
        let mut streamer = TextStreamer::new(TextStreamOptions::default());

        // A minimal text object: select font, move, show text.
        let chunks = streamer
            .process_chunk(b"BT /F1 14 Tf 100 700 Td (Hello World) Tj ET")
            .unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].text, "Hello World");
        assert_eq!(chunks[0].font_size, 14.0);
    }

    #[test]
    fn test_min_font_size_filter() {
        let options = TextStreamOptions {
            min_font_size: 10.0,
            ..TextStreamOptions::default()
        };
        let mut streamer = TextStreamer::new(options);

        // 8pt text is below the 10pt threshold and must be dropped.
        let chunks = streamer
            .process_chunk(b"BT /F1 8 Tf 100 700 Td (Small Text) Tj ET")
            .unwrap();
        assert!(chunks.is_empty());

        // 12pt text clears the threshold and must come through.
        let chunks = streamer
            .process_chunk(b"BT /F1 12 Tf 100 650 Td (Large Text) Tj ET")
            .unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "Large Text");
    }

    #[test]
    fn test_extract_text_sorted() {
        let mut streamer = TextStreamer::new(TextStreamOptions::default());

        // Push chunks out of reading order; extraction should sort them
        // top-to-bottom (descending Y).
        for &(label, y) in &[("Bottom", 100.0), ("Top", 700.0), ("Middle", 400.0)] {
            streamer.buffer.push_back(TextChunk {
                text: label.to_string(),
                x: 100.0,
                y,
                font_size: 12.0,
                font_name: None,
            });
        }

        assert_eq!(streamer.extract_text(), "Top Middle Bottom");
    }

    #[test]
    fn test_buffer_management() {
        let options = TextStreamOptions {
            max_buffer_size: 10, // deliberately tiny buffer
            ..TextStreamOptions::default()
        };
        let mut streamer = TextStreamer::new(options);

        // 5 chunks x 5 bytes = 25 bytes, well past the 10-byte cap.
        for i in 0..5 {
            streamer.buffer.push_back(TextChunk {
                text: format!("Text{}", i),
                x: 0.0,
                y: 0.0,
                font_size: 12.0,
                font_name: None,
            });
        }

        streamer.check_buffer_size();

        // Oldest chunks must have been evicted.
        assert!(streamer.buffer.len() < 5);
    }

    #[test]
    fn test_stream_text_function() {
        let streams = vec![
            b"BT /F1 12 Tf 100 700 Td (Page 1) Tj ET".to_vec(),
            b"BT /F1 12 Tf 100 650 Td (Page 2) Tj ET".to_vec(),
        ];

        let mut collected = Vec::new();
        stream_text(streams, |chunk| {
            collected.push(chunk.text);
            Ok(())
        })
        .unwrap();

        assert_eq!(collected.len(), 2);
        assert_eq!(collected[0], "Page 1");
        assert_eq!(collected[1], "Page 2");
    }
}