oxidize_pdf/streaming/
text_streamer.rs1use crate::error::Result;
7use crate::parser::content::{ContentOperation, ContentParser};
8use std::collections::VecDeque;
9
/// A piece of text emitted by a show-text operation, together with the text
/// state that was current when it was emitted.
///
/// Derives `PartialEq` so chunks can be compared directly (for example with
/// `assert_eq!`) instead of field by field.
#[derive(Debug, Clone, PartialEq)]
pub struct TextChunk {
    /// The text, decoded lossily as UTF-8 from the operation's raw bytes.
    pub text: String,
    /// Horizontal position: the sum of the move-text x offsets seen since the
    /// last begin-text operation.
    pub x: f64,
    /// Vertical position: the sum of the move-text y offsets seen since the
    /// last begin-text operation.
    pub y: f64,
    /// Font size most recently selected by a set-font operation.
    pub font_size: f64,
    /// Font name most recently selected by a set-font operation, or `None`
    /// if no font had been selected yet.
    pub font_name: Option<String>,
}
24
/// Settings that control how a text streamer filters, buffers and orders the
/// text it extracts.
#[derive(Debug, Clone)]
pub struct TextStreamOptions {
    /// Text shown while the current font size is below this value is skipped.
    pub min_font_size: f64,
    /// Upper bound, in bytes of chunk text, on what the streamer keeps
    /// buffered.
    pub max_buffer_size: usize,
    /// Formatting-preservation flag. NOTE(review): nothing in this file reads
    /// it; confirm whether another module does.
    pub preserve_formatting: bool,
    /// When set, extracted text is ordered by descending `y`, then ascending
    /// `x`, rather than by arrival order.
    pub sort_by_position: bool,
}
37
38impl Default for TextStreamOptions {
39 fn default() -> Self {
40 Self {
41 min_font_size: 0.0,
42 max_buffer_size: 1024 * 1024, preserve_formatting: true,
44 sort_by_position: true,
45 }
46 }
47}
48
49pub struct TextStreamer {
51 options: TextStreamOptions,
52 buffer: VecDeque<TextChunk>,
53 current_font: Option<String>,
54 current_font_size: f64,
55 current_x: f64,
56 current_y: f64,
57}
58
59impl TextStreamer {
60 pub fn new(options: TextStreamOptions) -> Self {
62 Self {
63 options,
64 buffer: VecDeque::new(),
65 current_font: None,
66 current_font_size: 12.0,
67 current_x: 0.0,
68 current_y: 0.0,
69 }
70 }
71
72 pub fn process_chunk(&mut self, data: &[u8]) -> Result<Vec<TextChunk>> {
74 let operations = ContentParser::parse(data)
75 .map_err(|e| crate::error::PdfError::ParseError(e.to_string()))?;
76
77 let mut chunks = Vec::new();
78
79 for op in operations {
80 match op {
81 ContentOperation::SetFont(name, size) => {
82 self.current_font = Some(name);
83 self.current_font_size = size as f64;
84 }
85 ContentOperation::MoveText(x, y) => {
86 self.current_x += x as f64;
87 self.current_y += y as f64;
88 }
89 ContentOperation::ShowText(bytes) => {
90 if self.current_font_size >= self.options.min_font_size {
91 let text = String::from_utf8_lossy(&bytes).to_string();
92 let chunk = TextChunk {
93 text,
94 x: self.current_x,
95 y: self.current_y,
96 font_size: self.current_font_size,
97 font_name: self.current_font.clone(),
98 };
99 chunks.push(chunk);
100 }
101 }
102 ContentOperation::BeginText => {
103 self.current_x = 0.0;
104 self.current_y = 0.0;
105 }
106 _ => {} }
108 }
109
110 for chunk in &chunks {
112 self.buffer.push_back(chunk.clone());
113 }
114
115 self.check_buffer_size();
117
118 Ok(chunks)
119 }
120
121 pub fn get_buffered_chunks(&self) -> Vec<TextChunk> {
123 self.buffer.iter().cloned().collect()
124 }
125
126 pub fn clear_buffer(&mut self) {
128 self.buffer.clear();
129 }
130
131 pub fn extract_text(&self) -> String {
133 let mut chunks = self.get_buffered_chunks();
134
135 if self.options.sort_by_position {
136 chunks.sort_by(|a, b| {
138 b.y.partial_cmp(&a.y)
139 .unwrap_or(std::cmp::Ordering::Equal)
140 .then(a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal))
141 });
142 }
143
144 chunks
145 .into_iter()
146 .map(|chunk| chunk.text)
147 .collect::<Vec<_>>()
148 .join(" ")
149 }
150
151 fn check_buffer_size(&mut self) {
152 let total_size: usize = self.buffer.iter().map(|chunk| chunk.text.len()).sum();
153
154 while total_size > self.options.max_buffer_size && !self.buffer.is_empty() {
156 self.buffer.pop_front();
157 }
158 }
159}
160
161pub fn stream_text<F>(content_streams: Vec<Vec<u8>>, mut callback: F) -> Result<()>
163where
164 F: FnMut(TextChunk) -> Result<()>,
165{
166 let mut streamer = TextStreamer::new(TextStreamOptions::default());
167
168 for stream in content_streams {
169 let chunks = streamer.process_chunk(&stream)?;
170 for chunk in chunks {
171 callback(chunk)?;
172 }
173 }
174
175 Ok(())
176}
177
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a chunk at the given position with font size 12.0 and no font
    /// name.
    fn chunk_at(text: &str, x: f64, y: f64) -> TextChunk {
        TextChunk {
            text: text.to_string(),
            x,
            y,
            font_size: 12.0,
            font_name: None,
        }
    }

    #[test]
    fn test_text_chunk() {
        let TextChunk {
            text,
            x,
            y,
            font_size,
            font_name,
        } = TextChunk {
            text: "Hello".to_string(),
            x: 100.0,
            y: 700.0,
            font_size: 12.0,
            font_name: Some("Helvetica".to_string()),
        };

        assert_eq!(text, "Hello");
        assert_eq!(x, 100.0);
        assert_eq!(y, 700.0);
        assert_eq!(font_size, 12.0);
        assert_eq!(font_name, Some("Helvetica".to_string()));
    }

    #[test]
    fn test_text_stream_options_default() {
        let defaults = TextStreamOptions::default();

        assert_eq!(defaults.min_font_size, 0.0);
        assert_eq!(defaults.max_buffer_size, 1024 * 1024);
        assert!(defaults.preserve_formatting);
        assert!(defaults.sort_by_position);
    }

    #[test]
    fn test_text_streamer_creation() {
        let streamer = TextStreamer::new(TextStreamOptions::default());

        assert!(streamer.buffer.is_empty());
        assert_eq!(streamer.current_font_size, 12.0);
        assert_eq!(streamer.current_x, 0.0);
        assert_eq!(streamer.current_y, 0.0);
    }

    #[test]
    fn test_process_chunk_text() {
        let content = b"BT /F1 14 Tf 100 700 Td (Hello World) Tj ET";

        let mut streamer = TextStreamer::new(TextStreamOptions::default());
        let chunks = streamer.process_chunk(content).unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].text, "Hello World");
        assert_eq!(chunks[0].font_size, 14.0);
    }

    #[test]
    fn test_min_font_size_filter() {
        let mut streamer = TextStreamer::new(TextStreamOptions {
            min_font_size: 10.0,
            ..TextStreamOptions::default()
        });

        // Size 8 is below the threshold, so nothing is emitted.
        let small = b"BT /F1 8 Tf 100 700 Td (Small Text) Tj ET";
        let chunks = streamer.process_chunk(small).unwrap();
        assert!(chunks.is_empty());

        // Size 12 passes the threshold.
        let large = b"BT /F1 12 Tf 100 650 Td (Large Text) Tj ET";
        let chunks = streamer.process_chunk(large).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "Large Text");
    }

    #[test]
    fn test_extract_text_sorted() {
        let mut streamer = TextStreamer::new(TextStreamOptions::default());

        // Inserted out of vertical order on purpose.
        streamer.buffer.extend([
            chunk_at("Bottom", 100.0, 100.0),
            chunk_at("Top", 100.0, 700.0),
            chunk_at("Middle", 100.0, 400.0),
        ]);

        assert_eq!(streamer.extract_text(), "Top Middle Bottom");
    }

    #[test]
    fn test_buffer_management() {
        let mut streamer = TextStreamer::new(TextStreamOptions {
            max_buffer_size: 10,
            ..TextStreamOptions::default()
        });

        // Five 5-byte chunks: 25 bytes of text against a 10-byte limit.
        streamer
            .buffer
            .extend((0..5).map(|i| chunk_at(&format!("Text{}", i), 0.0, 0.0)));

        streamer.check_buffer_size();

        assert!(streamer.buffer.len() < 5);
    }

    #[test]
    fn test_stream_text_function() {
        let streams = vec![
            b"BT /F1 12 Tf 100 700 Td (Page 1) Tj ET".to_vec(),
            b"BT /F1 12 Tf 100 650 Td (Page 2) Tj ET".to_vec(),
        ];

        let mut collected = Vec::new();
        stream_text(streams, |chunk| {
            collected.push(chunk.text);
            Ok(())
        })
        .unwrap();

        assert_eq!(collected.len(), 2);
        assert_eq!(collected[0], "Page 1");
        assert_eq!(collected[1], "Page 2");
    }
}