Skip to main content

oxidize_pdf/streaming/
text_streamer.rs

1//! Text streaming for incremental text extraction
2//!
3//! Extracts text from PDF content streams incrementally, processing
4//! text operations as they are encountered.
5
6use crate::error::Result;
7use crate::parser::content::{ContentOperation, ContentParser};
8use std::collections::VecDeque;
9
/// One run of text shown by a content stream, paired with the text state
/// (position and font) that was current when it was shown.
#[derive(Clone, Debug)]
pub struct TextChunk {
    /// The text content of this run.
    pub text: String,
    /// Horizontal position associated with the text.
    pub x: f64,
    /// Vertical position associated with the text.
    pub y: f64,
    /// Size of the font the text was shown with.
    pub font_size: f64,
    /// Name of the font the text was shown with, or `None` when no font
    /// is known.
    pub font_name: Option<String>,
}
24
/// Settings that control how a text streamer filters, buffers and orders
/// the text it extracts.
#[derive(Clone, Debug)]
pub struct TextStreamOptions {
    /// Text shown with a font size below this value is left out.
    pub min_font_size: f64,
    /// Upper bound for the buffered text, compared against the summed
    /// byte length of the buffered chunk texts.
    pub max_buffer_size: usize,
    /// Whether formatting should be preserved.
    ///
    /// NOTE(review): nothing in this file reads this flag; confirm whether
    /// another module consumes it.
    pub preserve_formatting: bool,
    /// Whether extracted text is ordered by position (top to bottom, then
    /// left to right) instead of insertion order.
    pub sort_by_position: bool,
}
37
38impl Default for TextStreamOptions {
39    fn default() -> Self {
40        Self {
41            min_font_size: 0.0,
42            max_buffer_size: 1024 * 1024, // 1MB
43            preserve_formatting: true,
44            sort_by_position: true,
45        }
46    }
47}
48
49/// Streams text from PDF content
50pub struct TextStreamer {
51    options: TextStreamOptions,
52    buffer: VecDeque<TextChunk>,
53    current_font: Option<String>,
54    current_font_size: f64,
55    current_x: f64,
56    current_y: f64,
57}
58
59impl TextStreamer {
60    /// Create a new text streamer
61    pub fn new(options: TextStreamOptions) -> Self {
62        Self {
63            options,
64            buffer: VecDeque::new(),
65            current_font: None,
66            current_font_size: 12.0,
67            current_x: 0.0,
68            current_y: 0.0,
69        }
70    }
71
72    /// Process a content stream chunk
73    pub fn process_chunk(&mut self, data: &[u8]) -> Result<Vec<TextChunk>> {
74        let operations = ContentParser::parse(data)
75            .map_err(|e| crate::error::PdfError::ParseError(e.to_string()))?;
76
77        let mut chunks = Vec::new();
78
79        for op in operations {
80            match op {
81                ContentOperation::SetFont(name, size) => {
82                    self.current_font = Some(name);
83                    self.current_font_size = size as f64;
84                }
85                ContentOperation::MoveText(x, y) => {
86                    self.current_x += x as f64;
87                    self.current_y += y as f64;
88                }
89                ContentOperation::ShowText(bytes) => {
90                    if self.current_font_size >= self.options.min_font_size {
91                        let text = String::from_utf8_lossy(&bytes).to_string();
92                        let chunk = TextChunk {
93                            text,
94                            x: self.current_x,
95                            y: self.current_y,
96                            font_size: self.current_font_size,
97                            font_name: self.current_font.clone(),
98                        };
99                        chunks.push(chunk);
100                    }
101                }
102                ContentOperation::BeginText => {
103                    self.current_x = 0.0;
104                    self.current_y = 0.0;
105                }
106                _ => {} // Ignore other operations
107            }
108        }
109
110        // Add to buffer if needed
111        for chunk in &chunks {
112            self.buffer.push_back(chunk.clone());
113        }
114
115        // Check buffer size
116        self.check_buffer_size();
117
118        Ok(chunks)
119    }
120
121    /// Get all buffered text chunks
122    pub fn get_buffered_chunks(&self) -> Vec<TextChunk> {
123        self.buffer.iter().cloned().collect()
124    }
125
126    /// Clear the buffer
127    pub fn clear_buffer(&mut self) {
128        self.buffer.clear();
129    }
130
131    /// Extract text as a single string
132    pub fn extract_text(&self) -> String {
133        let mut chunks = self.get_buffered_chunks();
134
135        if self.options.sort_by_position {
136            // Sort by Y position (top to bottom), then X (left to right)
137            chunks.sort_by(|a, b| b.y.total_cmp(&a.y).then(a.x.total_cmp(&b.x)));
138        }
139
140        chunks
141            .into_iter()
142            .map(|chunk| chunk.text)
143            .collect::<Vec<_>>()
144            .join(" ")
145    }
146
147    fn check_buffer_size(&mut self) {
148        let total_size: usize = self.buffer.iter().map(|chunk| chunk.text.len()).sum();
149
150        // Remove oldest chunks if buffer is too large
151        while total_size > self.options.max_buffer_size && !self.buffer.is_empty() {
152            self.buffer.pop_front();
153        }
154    }
155}
156
157/// Stream text from multiple content streams
158pub fn stream_text<F>(content_streams: Vec<Vec<u8>>, mut callback: F) -> Result<()>
159where
160    F: FnMut(TextChunk) -> Result<()>,
161{
162    let mut streamer = TextStreamer::new(TextStreamOptions::default());
163
164    for stream in content_streams {
165        let chunks = streamer.process_chunk(&stream)?;
166        for chunk in chunks {
167            callback(chunk)?;
168        }
169    }
170
171    Ok(())
172}
173
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a 12pt chunk without a font name at the given position.
    fn chunk_at(text: &str, x: f64, y: f64) -> TextChunk {
        TextChunk {
            text: text.to_string(),
            x,
            y,
            font_size: 12.0,
            font_name: None,
        }
    }

    /// Builds a streamer that uses the default options.
    fn default_streamer() -> TextStreamer {
        TextStreamer::new(TextStreamOptions::default())
    }

    /// Sums the byte length of all texts currently held in the buffer.
    fn buffered_bytes(streamer: &TextStreamer) -> usize {
        streamer.buffer.iter().map(|c| c.text.len()).sum()
    }

    #[test]
    fn test_text_chunk() {
        let chunk = TextChunk {
            text: "Hello".to_string(),
            x: 100.0,
            y: 700.0,
            font_size: 12.0,
            font_name: Some("Helvetica".to_string()),
        };

        assert_eq!(chunk.text, "Hello");
        assert_eq!(chunk.x, 100.0);
        assert_eq!(chunk.y, 700.0);
        assert_eq!(chunk.font_size, 12.0);
        assert_eq!(chunk.font_name, Some("Helvetica".to_string()));
    }

    #[test]
    fn test_text_stream_options_default() {
        let options = TextStreamOptions::default();
        assert_eq!(options.min_font_size, 0.0);
        assert_eq!(options.max_buffer_size, 1024 * 1024);
        assert!(options.preserve_formatting);
        assert!(options.sort_by_position);
    }

    #[test]
    fn test_text_streamer_creation() {
        let streamer = default_streamer();

        assert!(streamer.buffer.is_empty());
        assert_eq!(streamer.current_font_size, 12.0);
        assert_eq!(streamer.current_x, 0.0);
        assert_eq!(streamer.current_y, 0.0);
    }

    #[test]
    fn test_process_chunk_text() {
        let mut streamer = default_streamer();

        // Simple text showing operation
        let content = b"BT /F1 14 Tf 100 700 Td (Hello World) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].text, "Hello World");
        assert_eq!(chunks[0].font_size, 14.0);
    }

    #[test]
    fn test_min_font_size_filter() {
        let options = TextStreamOptions {
            min_font_size: 10.0,
            ..TextStreamOptions::default()
        };
        let mut streamer = TextStreamer::new(options);

        // Text with small font (8pt) - should be filtered out
        let content = b"BT /F1 8 Tf 100 700 Td (Small Text) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();
        assert!(chunks.is_empty());

        // Text with large font (12pt) - should be included
        let content = b"BT /F1 12 Tf 100 650 Td (Large Text) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "Large Text");
    }

    #[test]
    fn test_extract_text_sorted() {
        let mut streamer = default_streamer();

        // Add text in random order
        streamer.buffer.push_back(chunk_at("Bottom", 100.0, 100.0));
        streamer.buffer.push_back(chunk_at("Top", 100.0, 700.0));
        streamer.buffer.push_back(chunk_at("Middle", 100.0, 400.0));

        let text = streamer.extract_text();
        assert_eq!(text, "Top Middle Bottom");
    }

    #[test]
    fn test_buffer_management() {
        let options = TextStreamOptions {
            max_buffer_size: 10, // Very small buffer
            ..TextStreamOptions::default()
        };
        let mut streamer = TextStreamer::new(options);

        // Add chunks that exceed buffer size
        for i in 0..5 {
            streamer
                .buffer
                .push_back(chunk_at(&format!("Text{i}"), 0.0, 0.0));
        }

        streamer.check_buffer_size();

        // Buffer should be limited
        assert!(streamer.buffer.len() < 5);
        assert!(buffered_bytes(&streamer) <= 10);
    }

    #[test]
    fn test_stream_text_function() {
        let content1 = b"BT /F1 12 Tf 100 700 Td (Page 1) Tj ET".to_vec();
        let content2 = b"BT /F1 12 Tf 100 650 Td (Page 2) Tj ET".to_vec();
        let streams = vec![content1, content2];

        let mut collected = Vec::new();
        stream_text(streams, |chunk| {
            collected.push(chunk.text);
            Ok(())
        })
        .unwrap();

        assert_eq!(collected.len(), 2);
        assert_eq!(collected[0], "Page 1");
        assert_eq!(collected[1], "Page 2");
    }

    #[test]
    fn test_text_chunk_debug_clone() {
        let chunk = TextChunk {
            text: "Test".to_string(),
            x: 50.0,
            y: 100.0,
            font_size: 10.0,
            font_name: Some("Arial".to_string()),
        };

        let debug_str = format!("{chunk:?}");
        assert!(debug_str.contains("TextChunk"));
        assert!(debug_str.contains("Test"));

        let cloned = chunk.clone();
        assert_eq!(cloned.text, chunk.text);
        assert_eq!(cloned.x, chunk.x);
        assert_eq!(cloned.y, chunk.y);
        assert_eq!(cloned.font_size, chunk.font_size);
        assert_eq!(cloned.font_name, chunk.font_name);
    }

    #[test]
    fn test_text_stream_options_custom() {
        let options = TextStreamOptions {
            min_font_size: 8.0,
            max_buffer_size: 2048,
            preserve_formatting: false,
            sort_by_position: false,
        };

        assert_eq!(options.min_font_size, 8.0);
        assert_eq!(options.max_buffer_size, 2048);
        assert!(!options.preserve_formatting);
        assert!(!options.sort_by_position);
    }

    #[test]
    fn test_text_stream_options_debug_clone() {
        let options = TextStreamOptions::default();

        let debug_str = format!("{options:?}");
        assert!(debug_str.contains("TextStreamOptions"));

        let cloned = options.clone();
        assert_eq!(cloned.min_font_size, options.min_font_size);
        assert_eq!(cloned.max_buffer_size, options.max_buffer_size);
        assert_eq!(cloned.preserve_formatting, options.preserve_formatting);
        assert_eq!(cloned.sort_by_position, options.sort_by_position);
    }

    #[test]
    fn test_text_streamer_process_empty_chunk() {
        let mut streamer = default_streamer();
        let chunks = streamer.process_chunk(b"").unwrap();
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_text_streamer_process_invalid_content() {
        let mut streamer = default_streamer();
        // Invalid PDF content should be handled gracefully
        let content = b"Not valid PDF content";
        // Should either succeed with no chunks or return an error
        if let Ok(chunks) = streamer.process_chunk(content) {
            assert!(chunks.is_empty());
        }
    }

    #[test]
    fn test_text_streamer_font_tracking() {
        let mut streamer = default_streamer();

        // Set font operation
        let content = b"BT /Helvetica-Bold 16 Tf ET";
        let _ = streamer.process_chunk(content).unwrap();

        assert_eq!(streamer.current_font, Some("Helvetica-Bold".to_string()));
        assert_eq!(streamer.current_font_size, 16.0);
    }

    #[test]
    fn test_text_streamer_position_tracking() {
        let mut streamer = default_streamer();

        // Move text position
        let content = b"BT 50 100 Td ET";
        let _ = streamer.process_chunk(content).unwrap();

        assert_eq!(streamer.current_x, 50.0);
        assert_eq!(streamer.current_y, 100.0);
    }

    #[test]
    fn test_text_streamer_begin_text_resets_position() {
        let mut streamer = default_streamer();

        // Set position
        streamer.current_x = 100.0;
        streamer.current_y = 200.0;

        // BeginText should reset position
        let content = b"BT ET";
        let _ = streamer.process_chunk(content).unwrap();

        assert_eq!(streamer.current_x, 0.0);
        assert_eq!(streamer.current_y, 0.0);
    }

    #[test]
    fn test_text_streamer_clear_buffer() {
        let mut streamer = default_streamer();

        // Add some chunks
        streamer.buffer.push_back(chunk_at("Chunk1", 0.0, 0.0));
        streamer.buffer.push_back(chunk_at("Chunk2", 0.0, 0.0));

        assert_eq!(streamer.buffer.len(), 2);

        streamer.clear_buffer();
        assert!(streamer.buffer.is_empty());
    }

    #[test]
    fn test_text_streamer_get_buffered_chunks() {
        let mut streamer = default_streamer();

        let chunk1 = TextChunk {
            text: "First".to_string(),
            x: 10.0,
            y: 20.0,
            font_size: 14.0,
            font_name: Some("Times".to_string()),
        };
        let chunk2 = TextChunk {
            text: "Second".to_string(),
            x: 30.0,
            y: 40.0,
            font_size: 16.0,
            font_name: Some("Arial".to_string()),
        };

        streamer.buffer.push_back(chunk1);
        streamer.buffer.push_back(chunk2);

        let chunks = streamer.get_buffered_chunks();
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].text, "First");
        assert_eq!(chunks[1].text, "Second");
    }

    #[test]
    fn test_extract_text_no_sorting() {
        let options = TextStreamOptions {
            sort_by_position: false,
            ..TextStreamOptions::default()
        };
        let mut streamer = TextStreamer::new(options);

        // Add text in specific order
        streamer.buffer.push_back(chunk_at("First", 200.0, 100.0));
        streamer.buffer.push_back(chunk_at("Second", 100.0, 200.0));

        let text = streamer.extract_text();
        assert_eq!(text, "First Second"); // Should maintain insertion order
    }

    #[test]
    fn test_extract_text_horizontal_sorting() {
        let mut streamer = default_streamer();

        // Add text on same line, different X positions
        streamer.buffer.push_back(chunk_at("Right", 300.0, 500.0));
        streamer.buffer.push_back(chunk_at("Left", 100.0, 500.0));
        streamer.buffer.push_back(chunk_at("Middle", 200.0, 500.0));

        let text = streamer.extract_text();
        assert_eq!(text, "Left Middle Right");
    }

    #[test]
    fn test_check_buffer_size_edge_cases() {
        let options = TextStreamOptions {
            max_buffer_size: 20,
            ..TextStreamOptions::default()
        };
        let mut streamer = TextStreamer::new(options);

        // Add chunk that exactly fills buffer
        streamer
            .buffer
            .push_back(chunk_at(&"a".repeat(20), 0.0, 0.0));

        streamer.check_buffer_size();
        assert_eq!(streamer.buffer.len(), 1); // Should keep the chunk

        // Add another chunk to exceed limit
        streamer.buffer.push_back(chunk_at("b", 0.0, 0.0));

        streamer.check_buffer_size();
        // Should have removed the first chunk
        assert!(streamer.buffer.len() <= 1);
    }

    #[test]
    fn test_stream_text_with_error_callback() {
        let content = b"BT /F1 12 Tf 100 700 Td (Test) Tj ET".to_vec();
        let streams = vec![content];

        let result = stream_text(streams, |_chunk| {
            Err(crate::error::PdfError::ParseError("Test error".to_string()))
        });

        assert!(result.is_err());
    }

    #[test]
    fn test_stream_text_empty_streams() {
        let streams: Vec<Vec<u8>> = vec![];

        let mut collected = Vec::new();
        stream_text(streams, |chunk| {
            collected.push(chunk);
            Ok(())
        })
        .unwrap();

        assert!(collected.is_empty());
    }

    #[test]
    fn test_text_chunk_without_font_name() {
        let chunk = chunk_at("No Font", 0.0, 0.0);

        assert_eq!(chunk.font_name, None);
    }

    #[test]
    fn test_process_chunk_multiple_operations() {
        let mut streamer = default_streamer();

        // Content with multiple text operations
        let content = b"BT /F1 10 Tf 100 700 Td (First) Tj 50 0 Td (Second) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();

        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].text, "First");
        assert_eq!(chunks[1].text, "Second");
        assert_eq!(chunks[0].x, 100.0);
        assert_eq!(chunks[1].x, 150.0); // 100 + 50
    }

    #[test]
    fn test_buffer_size_calculation() {
        let options = TextStreamOptions {
            max_buffer_size: 100,
            ..TextStreamOptions::default()
        };
        let mut streamer = TextStreamer::new(options);

        // Add chunks with known sizes
        for _ in 0..10 {
            // 10 bytes each
            streamer.buffer.push_back(chunk_at("1234567890", 0.0, 0.0));
        }

        // Total size is 100 bytes
        streamer.check_buffer_size();

        // Add one more to exceed
        streamer.buffer.push_back(chunk_at("x", 0.0, 0.0));

        streamer.check_buffer_size();

        // Should have removed oldest chunks
        assert!(buffered_bytes(&streamer) <= 100);
    }

    #[test]
    fn test_text_chunk_extreme_positions() {
        let chunk = TextChunk {
            text: "Extreme".to_string(),
            x: f64::MAX,
            y: f64::MIN,
            font_size: 0.1,
            font_name: Some("TinyFont".to_string()),
        };

        assert_eq!(chunk.x, f64::MAX);
        assert_eq!(chunk.y, f64::MIN);
        assert_eq!(chunk.font_size, 0.1);
    }

    #[test]
    fn test_text_streamer_accumulated_position() {
        let mut streamer = default_streamer();

        // Multiple move operations should accumulate
        let content = b"BT 10 20 Td 5 10 Td 15 -5 Td ET";
        let _ = streamer.process_chunk(content).unwrap();

        assert_eq!(streamer.current_x, 30.0); // 10 + 5 + 15
        assert_eq!(streamer.current_y, 25.0); // 20 + 10 + (-5)
    }

    #[test]
    fn test_process_chunk_with_multiple_font_changes() {
        let mut streamer = default_streamer();

        let content = b"BT /F1 10 Tf (Small) Tj /F2 24 Tf (Large) Tj /F3 16 Tf (Medium) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].font_size, 10.0);
        assert_eq!(chunks[1].font_size, 24.0);
        assert_eq!(chunks[2].font_size, 16.0);
    }

    #[test]
    fn test_empty_text_operations() {
        let mut streamer = default_streamer();

        // Empty text operations
        let content = b"BT /F1 12 Tf () Tj ( ) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();

        assert_eq!(chunks.len(), 2);
        assert!(chunks[0].text.is_empty());
        assert_eq!(chunks[1].text, " ");
    }

    #[test]
    fn test_text_with_special_characters() {
        let mut streamer = default_streamer();

        let content = b"BT /F1 12 Tf (\xC3\xA9\xC3\xA0\xC3\xB1) Tj ET"; // UTF-8: éàñ
        let chunks = streamer.process_chunk(content).unwrap();

        assert!(!chunks.is_empty());
        // The text should contain the special characters (lossy conversion)
        assert!(!chunks[0].text.is_empty());
    }

    #[test]
    fn test_sorting_with_equal_positions() {
        let mut streamer = default_streamer();

        // Add chunks with same position
        for i in 0..3 {
            streamer
                .buffer
                .push_back(chunk_at(&format!("Text{i}"), 100.0, 100.0));
        }

        // The sort is stable, so equal positions keep insertion order
        let text = streamer.extract_text();
        assert_eq!(text, "Text0 Text1 Text2");
    }

    #[test]
    fn test_max_buffer_size_zero() {
        let options = TextStreamOptions {
            max_buffer_size: 0,
            ..TextStreamOptions::default()
        };
        let mut streamer = TextStreamer::new(options);

        streamer
            .buffer
            .push_back(chunk_at("Should be removed", 0.0, 0.0));

        streamer.check_buffer_size();
        assert!(streamer.buffer.is_empty());
    }

    #[test]
    fn test_font_name_with_spaces() {
        let mut streamer = default_streamer();

        let content = b"BT /Times New Roman 14 Tf ET";
        let result = streamer.process_chunk(content);

        // This should fail because "New" is treated as an unknown operator
        assert!(result.is_err());

        // The font and size should remain unchanged (default values)
        assert_eq!(streamer.current_font, None);
        assert_eq!(streamer.current_font_size, 12.0);
    }

    #[test]
    fn test_stream_text_with_mixed_content() {
        let content1 = b"BT /F1 8 Tf (Small) Tj ET".to_vec();
        let content2 = b"Invalid content".to_vec();
        let content3 = b"BT /F2 16 Tf (Large) Tj ET".to_vec();
        let streams = vec![content1, content2, content3];

        let mut collected = Vec::new();
        let result = stream_text(streams, |chunk| {
            collected.push(chunk.text);
            Ok(())
        });

        // The first stream is always delivered. Whether the invalid stream is
        // rejected decides if processing reaches the third one.
        match result {
            Ok(()) => assert_eq!(collected, ["Small", "Large"]),
            Err(_) => assert_eq!(collected, ["Small"]),
        }
    }

    #[test]
    fn test_preserve_formatting_option() {
        let options = TextStreamOptions {
            preserve_formatting: false,
            ..TextStreamOptions::default()
        };
        let streamer = TextStreamer::new(options.clone());

        assert!(!streamer.options.preserve_formatting);
        assert_eq!(streamer.options.min_font_size, options.min_font_size);
    }

    #[test]
    fn test_very_large_font_size() {
        let mut streamer = default_streamer();

        let content = b"BT /F1 9999 Tf (Huge) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].font_size, 9999.0);
        assert_eq!(chunks[0].text, "Huge");
    }

    #[test]
    fn test_negative_font_size() {
        let options = TextStreamOptions {
            min_font_size: -10.0, // Allow negative sizes
            ..TextStreamOptions::default()
        };
        let mut streamer = TextStreamer::new(options);

        streamer.current_font_size = -5.0;
        let content = b"BT (Negative) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].font_size, -5.0);
    }

    #[test]
    fn test_text_position_nan_handling() {
        let mut streamer = default_streamer();

        // Create chunks with NaN positions
        streamer.buffer.push_back(chunk_at("NaN X", f64::NAN, 100.0));
        streamer.buffer.push_back(chunk_at("NaN Y", 100.0, f64::NAN));

        // extract_text should handle NaN gracefully and keep both chunks
        let text = streamer.extract_text();
        assert!(text.contains("NaN X"));
        assert!(text.contains("NaN Y"));
    }

    #[test]
    fn test_buffer_with_different_font_names() {
        let mut streamer = default_streamer();

        let fonts = ["Arial", "Times", "Courier", "Helvetica"];
        for (i, font) in fonts.iter().enumerate() {
            streamer.buffer.push_back(TextChunk {
                font_name: Some((*font).to_string()),
                ..chunk_at(&format!("Font{i}"), 0.0, 0.0)
            });
        }

        let chunks = streamer.get_buffered_chunks();
        assert_eq!(chunks.len(), 4);
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.font_name, Some(fonts[i].to_string()));
        }
    }

    #[test]
    fn test_process_chunk_error_propagation() {
        let mut streamer = default_streamer();

        // Bytes that are not valid UTF-8; the parser may or may not reject them
        let content = b"\xFF\xFE\xFD\xFC";

        match streamer.process_chunk(content) {
            // On success, everything returned was also buffered
            Ok(chunks) => assert_eq!(streamer.buffer.len(), chunks.len()),
            // On failure, the streamer must be left untouched
            Err(_) => {
                assert!(streamer.buffer.is_empty());
                assert_eq!(streamer.current_font, None);
                assert_eq!(streamer.current_font_size, 12.0);
            }
        }
    }

    #[test]
    fn test_extract_text_empty_buffer() {
        let streamer = default_streamer();
        let text = streamer.extract_text();
        assert!(text.is_empty());
    }

    #[test]
    fn test_extract_text_single_chunk() {
        let mut streamer = default_streamer();

        streamer.buffer.push_back(chunk_at("Single", 0.0, 0.0));

        let text = streamer.extract_text();
        assert_eq!(text, "Single");
    }

    #[test]
    fn test_check_buffer_size_empty() {
        let mut streamer = default_streamer();
        streamer.check_buffer_size(); // Should not panic on empty buffer
        assert!(streamer.buffer.is_empty());
    }

    #[test]
    fn test_complex_content_operations() {
        let mut streamer = default_streamer();

        // Complex PDF content with mixed operations
        let content = b"BT /F1 12 Tf 0 0 Td (Start) Tj ET q Q BT 50 50 Td (End) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();

        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].text, "Start");
        assert_eq!(chunks[1].text, "End");
        assert_eq!(chunks[0].x, 0.0);
        assert_eq!(chunks[1].x, 50.0);
    }

    #[test]
    fn test_stream_text_callback_state() {
        let content = b"BT /F1 12 Tf (Test) Tj ET".to_vec();
        let streams = vec![content; 3]; // Same content 3 times

        let mut count = 0;
        stream_text(streams, |_chunk| {
            count += 1;
            Ok(())
        })
        .unwrap();

        assert_eq!(count, 3);
    }
}