oxidize_pdf/streaming/
text_streamer.rs

1//! Text streaming for incremental text extraction
2//!
3//! Extracts text from PDF content streams incrementally, processing
4//! text operations as they are encountered.
5
6use crate::error::Result;
7use crate::parser::content::{ContentOperation, ContentParser};
8use std::collections::VecDeque;
9
/// A chunk of extracted text with position information
///
/// One chunk is produced per text-showing operation and records the text
/// state (position, font name, font size) that was current at that moment.
///
/// Equality compares every field. Because the coordinates and the font size
/// are `f64`, a chunk containing a NaN value never compares equal to
/// anything, including its own clone.
#[derive(Debug, Clone, PartialEq)]
pub struct TextChunk {
    /// The extracted text
    pub text: String,
    /// X position on the page
    pub x: f64,
    /// Y position on the page
    pub y: f64,
    /// Font size in effect when the text was shown
    pub font_size: f64,
    /// Font name (if known); `None` when no font had been selected yet
    pub font_name: Option<String>,
}
24
/// Options for text streaming
///
/// Use [`Default::default`] for the stock configuration and struct-update
/// syntax to override individual fields.
#[derive(Debug, Clone, PartialEq)]
pub struct TextStreamOptions {
    /// Minimum text size to include; text shown with a smaller font size is
    /// dropped
    pub min_font_size: f64,
    /// Maximum buffer size for text chunks, measured as the summed byte
    /// length of the buffered chunk text. When the total exceeds this limit
    /// the streamer evicts buffered chunks, starting from the oldest.
    pub max_buffer_size: usize,
    /// Whether to preserve formatting
    ///
    /// NOTE(review): nothing in this file reads this flag; confirm whether it
    /// is consumed elsewhere or still unimplemented.
    pub preserve_formatting: bool,
    /// Whether to sort by position (top to bottom, then left to right) when
    /// the buffered text is joined into a single string
    pub sort_by_position: bool,
}

impl Default for TextStreamOptions {
    /// Accept every font size, cap buffered text at 1 MiB, and enable both
    /// formatting preservation and positional sorting.
    fn default() -> Self {
        Self {
            min_font_size: 0.0,
            max_buffer_size: 1024 * 1024, // 1MB
            preserve_formatting: true,
            sort_by_position: true,
        }
    }
}
48
49/// Streams text from PDF content
50pub struct TextStreamer {
51    options: TextStreamOptions,
52    buffer: VecDeque<TextChunk>,
53    current_font: Option<String>,
54    current_font_size: f64,
55    current_x: f64,
56    current_y: f64,
57}
58
59impl TextStreamer {
60    /// Create a new text streamer
61    pub fn new(options: TextStreamOptions) -> Self {
62        Self {
63            options,
64            buffer: VecDeque::new(),
65            current_font: None,
66            current_font_size: 12.0,
67            current_x: 0.0,
68            current_y: 0.0,
69        }
70    }
71
72    /// Process a content stream chunk
73    pub fn process_chunk(&mut self, data: &[u8]) -> Result<Vec<TextChunk>> {
74        let operations = ContentParser::parse(data)
75            .map_err(|e| crate::error::PdfError::ParseError(e.to_string()))?;
76
77        let mut chunks = Vec::new();
78
79        for op in operations {
80            match op {
81                ContentOperation::SetFont(name, size) => {
82                    self.current_font = Some(name);
83                    self.current_font_size = size as f64;
84                }
85                ContentOperation::MoveText(x, y) => {
86                    self.current_x += x as f64;
87                    self.current_y += y as f64;
88                }
89                ContentOperation::ShowText(bytes) => {
90                    if self.current_font_size >= self.options.min_font_size {
91                        let text = String::from_utf8_lossy(&bytes).to_string();
92                        let chunk = TextChunk {
93                            text,
94                            x: self.current_x,
95                            y: self.current_y,
96                            font_size: self.current_font_size,
97                            font_name: self.current_font.clone(),
98                        };
99                        chunks.push(chunk);
100                    }
101                }
102                ContentOperation::BeginText => {
103                    self.current_x = 0.0;
104                    self.current_y = 0.0;
105                }
106                _ => {} // Ignore other operations
107            }
108        }
109
110        // Add to buffer if needed
111        for chunk in &chunks {
112            self.buffer.push_back(chunk.clone());
113        }
114
115        // Check buffer size
116        self.check_buffer_size();
117
118        Ok(chunks)
119    }
120
121    /// Get all buffered text chunks
122    pub fn get_buffered_chunks(&self) -> Vec<TextChunk> {
123        self.buffer.iter().cloned().collect()
124    }
125
126    /// Clear the buffer
127    pub fn clear_buffer(&mut self) {
128        self.buffer.clear();
129    }
130
131    /// Extract text as a single string
132    pub fn extract_text(&self) -> String {
133        let mut chunks = self.get_buffered_chunks();
134
135        if self.options.sort_by_position {
136            // Sort by Y position (top to bottom), then X (left to right)
137            chunks.sort_by(|a, b| {
138                b.y.partial_cmp(&a.y)
139                    .unwrap_or(std::cmp::Ordering::Equal)
140                    .then(a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal))
141            });
142        }
143
144        chunks
145            .into_iter()
146            .map(|chunk| chunk.text)
147            .collect::<Vec<_>>()
148            .join(" ")
149    }
150
151    fn check_buffer_size(&mut self) {
152        let total_size: usize = self.buffer.iter().map(|chunk| chunk.text.len()).sum();
153
154        // Remove oldest chunks if buffer is too large
155        while total_size > self.options.max_buffer_size && !self.buffer.is_empty() {
156            self.buffer.pop_front();
157        }
158    }
159}
160
161/// Stream text from multiple content streams
162pub fn stream_text<F>(content_streams: Vec<Vec<u8>>, mut callback: F) -> Result<()>
163where
164    F: FnMut(TextChunk) -> Result<()>,
165{
166    let mut streamer = TextStreamer::new(TextStreamOptions::default());
167
168    for stream in content_streams {
169        let chunks = streamer.process_chunk(&stream)?;
170        for chunk in chunks {
171            callback(chunk)?;
172        }
173    }
174
175    Ok(())
176}
177
#[cfg(test)]
mod tests {
    use super::*;

    /// Streamer built from `TextStreamOptions::default()`.
    fn default_streamer() -> TextStreamer {
        TextStreamer::new(TextStreamOptions::default())
    }

    /// A 12pt chunk with no font name, placed at `(x, y)`.
    fn chunk_at(text: &str, x: f64, y: f64) -> TextChunk {
        TextChunk {
            text: text.to_string(),
            x,
            y,
            font_size: 12.0,
            font_name: None,
        }
    }

    #[test]
    fn test_text_chunk() {
        let chunk = TextChunk {
            text: "Hello".to_string(),
            x: 100.0,
            y: 700.0,
            font_size: 12.0,
            font_name: Some("Helvetica".to_string()),
        };

        assert_eq!(chunk.text, "Hello");
        assert_eq!(chunk.x, 100.0);
        assert_eq!(chunk.y, 700.0);
        assert_eq!(chunk.font_size, 12.0);
        assert_eq!(chunk.font_name, Some("Helvetica".to_string()));
    }

    #[test]
    fn test_text_stream_options_default() {
        let options = TextStreamOptions::default();
        assert_eq!(options.min_font_size, 0.0);
        assert_eq!(options.max_buffer_size, 1024 * 1024);
        assert!(options.preserve_formatting);
        assert!(options.sort_by_position);
    }

    #[test]
    fn test_text_streamer_creation() {
        let streamer = default_streamer();

        assert!(streamer.buffer.is_empty());
        assert_eq!(streamer.current_font_size, 12.0);
        assert_eq!(streamer.current_x, 0.0);
        assert_eq!(streamer.current_y, 0.0);
    }

    #[test]
    fn test_process_chunk_text() {
        let mut streamer = default_streamer();

        // Simple text showing operation
        let content = b"BT /F1 14 Tf 100 700 Td (Hello World) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].text, "Hello World");
        assert_eq!(chunks[0].font_size, 14.0);
    }

    #[test]
    fn test_min_font_size_filter() {
        let mut streamer = TextStreamer::new(TextStreamOptions {
            min_font_size: 10.0,
            ..TextStreamOptions::default()
        });

        // Text with small font (8pt) - should be filtered out
        let content = b"BT /F1 8 Tf 100 700 Td (Small Text) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();
        assert!(chunks.is_empty());

        // Text with large font (12pt) - should be included
        let content = b"BT /F1 12 Tf 100 650 Td (Large Text) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "Large Text");
    }

    #[test]
    fn test_extract_text_sorted() {
        let mut streamer = default_streamer();

        // Add text in random order
        streamer.buffer.push_back(chunk_at("Bottom", 100.0, 100.0));
        streamer.buffer.push_back(chunk_at("Top", 100.0, 700.0));
        streamer.buffer.push_back(chunk_at("Middle", 100.0, 400.0));

        assert_eq!(streamer.extract_text(), "Top Middle Bottom");
    }

    #[test]
    fn test_buffer_management() {
        let mut streamer = TextStreamer::new(TextStreamOptions {
            max_buffer_size: 10, // Very small buffer
            ..TextStreamOptions::default()
        });

        // Add chunks that exceed buffer size
        for i in 0..5 {
            streamer.buffer.push_back(chunk_at(&format!("Text{i}"), 0.0, 0.0));
        }

        streamer.check_buffer_size();

        // Buffer should be limited
        assert!(streamer.buffer.len() < 5);
    }

    #[test]
    fn test_stream_text_function() {
        let content1 = b"BT /F1 12 Tf 100 700 Td (Page 1) Tj ET".to_vec();
        let content2 = b"BT /F1 12 Tf 100 650 Td (Page 2) Tj ET".to_vec();
        let streams = vec![content1, content2];

        let mut collected = Vec::new();
        stream_text(streams, |chunk| {
            collected.push(chunk.text);
            Ok(())
        })
        .unwrap();

        assert_eq!(collected.len(), 2);
        assert_eq!(collected[0], "Page 1");
        assert_eq!(collected[1], "Page 2");
    }

    #[test]
    fn test_text_chunk_debug_clone() {
        let chunk = TextChunk {
            text: "Test".to_string(),
            x: 50.0,
            y: 100.0,
            font_size: 10.0,
            font_name: Some("Arial".to_string()),
        };

        let debug_str = format!("{chunk:?}");
        assert!(debug_str.contains("TextChunk"));
        assert!(debug_str.contains("Test"));

        let cloned = chunk.clone();
        assert_eq!(cloned.text, chunk.text);
        assert_eq!(cloned.x, chunk.x);
        assert_eq!(cloned.y, chunk.y);
        assert_eq!(cloned.font_size, chunk.font_size);
        assert_eq!(cloned.font_name, chunk.font_name);
    }

    #[test]
    fn test_text_stream_options_custom() {
        let options = TextStreamOptions {
            min_font_size: 8.0,
            max_buffer_size: 2048,
            preserve_formatting: false,
            sort_by_position: false,
        };

        assert_eq!(options.min_font_size, 8.0);
        assert_eq!(options.max_buffer_size, 2048);
        assert!(!options.preserve_formatting);
        assert!(!options.sort_by_position);
    }

    #[test]
    fn test_text_stream_options_debug_clone() {
        let options = TextStreamOptions::default();

        let debug_str = format!("{options:?}");
        assert!(debug_str.contains("TextStreamOptions"));

        let cloned = options.clone();
        assert_eq!(cloned.min_font_size, options.min_font_size);
        assert_eq!(cloned.max_buffer_size, options.max_buffer_size);
        assert_eq!(cloned.preserve_formatting, options.preserve_formatting);
        assert_eq!(cloned.sort_by_position, options.sort_by_position);
    }

    #[test]
    fn test_text_streamer_process_empty_chunk() {
        let mut streamer = default_streamer();
        let chunks = streamer.process_chunk(b"").unwrap();
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_text_streamer_process_invalid_content() {
        let mut streamer = default_streamer();

        // Invalid PDF content may be rejected outright; if it is accepted it
        // must not yield any text.
        if let Ok(chunks) = streamer.process_chunk(b"Not valid PDF content") {
            assert!(chunks.is_empty());
        }
    }

    #[test]
    fn test_text_streamer_font_tracking() {
        let mut streamer = default_streamer();

        // Set font operation
        let content = b"BT /Helvetica-Bold 16 Tf ET";
        let _ = streamer.process_chunk(content).unwrap();

        assert_eq!(streamer.current_font, Some("Helvetica-Bold".to_string()));
        assert_eq!(streamer.current_font_size, 16.0);
    }

    #[test]
    fn test_text_streamer_position_tracking() {
        let mut streamer = default_streamer();

        // Move text position
        let content = b"BT 50 100 Td ET";
        let _ = streamer.process_chunk(content).unwrap();

        assert_eq!(streamer.current_x, 50.0);
        assert_eq!(streamer.current_y, 100.0);
    }

    #[test]
    fn test_text_streamer_begin_text_resets_position() {
        let mut streamer = default_streamer();

        // Set position
        streamer.current_x = 100.0;
        streamer.current_y = 200.0;

        // BeginText should reset position
        let _ = streamer.process_chunk(b"BT ET").unwrap();

        assert_eq!(streamer.current_x, 0.0);
        assert_eq!(streamer.current_y, 0.0);
    }

    #[test]
    fn test_text_streamer_clear_buffer() {
        let mut streamer = default_streamer();

        // Add some chunks
        streamer.buffer.push_back(chunk_at("Chunk1", 0.0, 0.0));
        streamer.buffer.push_back(chunk_at("Chunk2", 0.0, 0.0));
        assert_eq!(streamer.buffer.len(), 2);

        streamer.clear_buffer();
        assert!(streamer.buffer.is_empty());
    }

    #[test]
    fn test_text_streamer_get_buffered_chunks() {
        let mut streamer = default_streamer();

        streamer.buffer.push_back(TextChunk {
            font_size: 14.0,
            font_name: Some("Times".to_string()),
            ..chunk_at("First", 10.0, 20.0)
        });
        streamer.buffer.push_back(TextChunk {
            font_size: 16.0,
            font_name: Some("Arial".to_string()),
            ..chunk_at("Second", 30.0, 40.0)
        });

        let chunks = streamer.get_buffered_chunks();
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].text, "First");
        assert_eq!(chunks[1].text, "Second");
        assert_eq!(chunks[0].font_name.as_deref(), Some("Times"));
        assert_eq!(chunks[1].font_size, 16.0);
    }

    #[test]
    fn test_extract_text_no_sorting() {
        let mut streamer = TextStreamer::new(TextStreamOptions {
            sort_by_position: false,
            ..TextStreamOptions::default()
        });

        // Add text in specific order
        streamer.buffer.push_back(chunk_at("First", 200.0, 100.0));
        streamer.buffer.push_back(chunk_at("Second", 100.0, 200.0));

        // Should maintain insertion order
        assert_eq!(streamer.extract_text(), "First Second");
    }

    #[test]
    fn test_extract_text_horizontal_sorting() {
        let mut streamer = default_streamer();

        // Add text on same line, different X positions
        streamer.buffer.push_back(chunk_at("Right", 300.0, 500.0));
        streamer.buffer.push_back(chunk_at("Left", 100.0, 500.0));
        streamer.buffer.push_back(chunk_at("Middle", 200.0, 500.0));

        assert_eq!(streamer.extract_text(), "Left Middle Right");
    }

    #[test]
    fn test_check_buffer_size_edge_cases() {
        let mut streamer = TextStreamer::new(TextStreamOptions {
            max_buffer_size: 20,
            ..TextStreamOptions::default()
        });

        // Add chunk that exactly fills buffer
        streamer.buffer.push_back(chunk_at(&"a".repeat(20), 0.0, 0.0));
        streamer.check_buffer_size();
        assert_eq!(streamer.buffer.len(), 1); // Should keep the chunk

        // Add another chunk to exceed limit
        streamer.buffer.push_back(chunk_at("b", 0.0, 0.0));
        streamer.check_buffer_size();

        // The 20-byte chunk cannot stay, so at most one chunk is left
        assert!(streamer.buffer.len() <= 1);
    }

    #[test]
    fn test_stream_text_with_error_callback() {
        let content = b"BT /F1 12 Tf 100 700 Td (Test) Tj ET".to_vec();
        let streams = vec![content];

        let result = stream_text(streams, |_chunk| {
            Err(crate::error::PdfError::ParseError("Test error".to_string()))
        });

        assert!(result.is_err());
    }

    #[test]
    fn test_stream_text_empty_streams() {
        let streams: Vec<Vec<u8>> = vec![];

        let mut collected = Vec::new();
        stream_text(streams, |chunk| {
            collected.push(chunk);
            Ok(())
        })
        .unwrap();

        assert!(collected.is_empty());
    }

    #[test]
    fn test_text_chunk_without_font_name() {
        let chunk = chunk_at("No Font", 0.0, 0.0);
        assert_eq!(chunk.font_name, None);
    }

    #[test]
    fn test_process_chunk_multiple_operations() {
        let mut streamer = default_streamer();

        // Content with multiple text operations
        let content = b"BT /F1 10 Tf 100 700 Td (First) Tj 50 0 Td (Second) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();

        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].text, "First");
        assert_eq!(chunks[1].text, "Second");
        assert_eq!(chunks[0].x, 100.0);
        assert_eq!(chunks[1].x, 150.0); // 100 + 50
    }

    #[test]
    fn test_buffer_size_calculation() {
        let mut streamer = TextStreamer::new(TextStreamOptions {
            max_buffer_size: 100,
            ..TextStreamOptions::default()
        });

        // Add chunks with known sizes: 10 chunks of 10 bytes each
        for _ in 0..10 {
            streamer.buffer.push_back(chunk_at("1234567890", 0.0, 0.0));
        }

        // Total size is 100 bytes
        streamer.check_buffer_size();

        // Add one more to exceed
        streamer.buffer.push_back(chunk_at("x", 0.0, 0.0));
        streamer.check_buffer_size();

        // Should have removed oldest chunks
        let total_size: usize = streamer.buffer.iter().map(|c| c.text.len()).sum();
        assert!(total_size <= 100);
    }

    #[test]
    fn test_text_chunk_extreme_positions() {
        let chunk = TextChunk {
            text: "Extreme".to_string(),
            x: f64::MAX,
            y: f64::MIN,
            font_size: 0.1,
            font_name: Some("TinyFont".to_string()),
        };

        assert_eq!(chunk.x, f64::MAX);
        assert_eq!(chunk.y, f64::MIN);
        assert_eq!(chunk.font_size, 0.1);
    }

    #[test]
    fn test_text_streamer_accumulated_position() {
        let mut streamer = default_streamer();

        // Multiple move operations should accumulate
        let content = b"BT 10 20 Td 5 10 Td 15 -5 Td ET";
        let _ = streamer.process_chunk(content).unwrap();

        assert_eq!(streamer.current_x, 30.0); // 10 + 5 + 15
        assert_eq!(streamer.current_y, 25.0); // 20 + 10 + (-5)
    }

    #[test]
    fn test_process_chunk_with_multiple_font_changes() {
        let mut streamer = default_streamer();

        let content = b"BT /F1 10 Tf (Small) Tj /F2 24 Tf (Large) Tj /F3 16 Tf (Medium) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].font_size, 10.0);
        assert_eq!(chunks[1].font_size, 24.0);
        assert_eq!(chunks[2].font_size, 16.0);
    }

    #[test]
    fn test_empty_text_operations() {
        let mut streamer = default_streamer();

        // Empty text operations
        let content = b"BT /F1 12 Tf () Tj ( ) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();

        assert_eq!(chunks.len(), 2);
        assert!(chunks[0].text.is_empty());
        assert_eq!(chunks[1].text, " ");
    }

    #[test]
    fn test_text_with_special_characters() {
        let mut streamer = default_streamer();

        let content = b"BT /F1 12 Tf (\xC3\xA9\xC3\xA0\xC3\xB1) Tj ET"; // UTF-8: éàñ
        let chunks = streamer.process_chunk(content).unwrap();

        assert!(!chunks.is_empty());
        // The text should contain the special characters (lossy conversion)
        assert!(!chunks[0].text.is_empty());
    }

    #[test]
    fn test_sorting_with_equal_positions() {
        let mut streamer = default_streamer();

        // Add chunks with same position
        for i in 0..3 {
            streamer.buffer.push_back(chunk_at(&format!("Text{i}"), 100.0, 100.0));
        }

        // The sort is stable, so equal positions keep insertion order
        assert_eq!(streamer.extract_text(), "Text0 Text1 Text2");
    }

    #[test]
    fn test_max_buffer_size_zero() {
        let mut streamer = TextStreamer::new(TextStreamOptions {
            max_buffer_size: 0,
            ..TextStreamOptions::default()
        });

        streamer.buffer.push_back(chunk_at("Should be removed", 0.0, 0.0));

        streamer.check_buffer_size();
        assert!(streamer.buffer.is_empty());
    }

    #[test]
    fn test_font_name_with_spaces() {
        let mut streamer = default_streamer();

        let content = b"BT /Times New Roman 14 Tf ET";
        let result = streamer.process_chunk(content);

        // This should fail because "New" is treated as an unknown operator
        assert!(result.is_err());

        // The font and size should remain unchanged (default values)
        assert_eq!(streamer.current_font, None);
        assert_eq!(streamer.current_font_size, 12.0);
    }

    #[test]
    fn test_stream_text_with_mixed_content() {
        let content1 = b"BT /F1 8 Tf (Small) Tj ET".to_vec();
        let content2 = b"Invalid content".to_vec();
        let content3 = b"BT /F2 16 Tf (Large) Tj ET".to_vec();
        let streams = vec![content1, content2, content3];

        let mut collected = Vec::new();
        let result = stream_text(streams, |chunk| {
            collected.push(chunk.text);
            Ok(())
        });

        // The first stream is valid, so its text is always delivered.
        assert_eq!(collected.first().map(String::as_str), Some("Small"));

        match result {
            // The invalid stream was tolerated: the last stream still ran.
            Ok(()) => assert_eq!(collected.last().map(String::as_str), Some("Large")),
            // The invalid stream was rejected: nothing after it was delivered.
            Err(_) => assert_eq!(collected, ["Small"]),
        }
    }

    #[test]
    fn test_preserve_formatting_option() {
        let options = TextStreamOptions {
            preserve_formatting: false,
            ..TextStreamOptions::default()
        };
        let streamer = TextStreamer::new(options.clone());

        assert!(!streamer.options.preserve_formatting);
        assert_eq!(streamer.options.min_font_size, options.min_font_size);
    }

    #[test]
    fn test_very_large_font_size() {
        let mut streamer = default_streamer();

        let content = b"BT /F1 9999 Tf (Huge) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].font_size, 9999.0);
        assert_eq!(chunks[0].text, "Huge");
    }

    #[test]
    fn test_negative_font_size() {
        let mut streamer = TextStreamer::new(TextStreamOptions {
            min_font_size: -10.0, // Allow negative sizes
            ..TextStreamOptions::default()
        });

        streamer.current_font_size = -5.0;
        let chunks = streamer.process_chunk(b"BT (Negative) Tj ET").unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].font_size, -5.0);
    }

    #[test]
    fn test_text_position_nan_handling() {
        let mut streamer = default_streamer();

        // Create chunks with NaN positions
        streamer.buffer.push_back(chunk_at("NaN X", f64::NAN, 100.0));
        streamer.buffer.push_back(chunk_at("NaN Y", 100.0, f64::NAN));

        // extract_text should handle NaN gracefully and keep both chunks
        let text = streamer.extract_text();
        assert!(text.contains("NaN X"));
        assert!(text.contains("NaN Y"));
    }

    #[test]
    fn test_buffer_with_different_font_names() {
        let mut streamer = default_streamer();

        let fonts = ["Arial", "Times", "Courier", "Helvetica"];
        for (i, font) in fonts.iter().enumerate() {
            streamer.buffer.push_back(TextChunk {
                font_name: Some(font.to_string()),
                ..chunk_at(&format!("Font{i}"), 0.0, 0.0)
            });
        }

        let chunks = streamer.get_buffered_chunks();
        assert_eq!(chunks.len(), 4);
        for (chunk, font) in chunks.iter().zip(fonts) {
            assert_eq!(chunk.font_name.as_deref(), Some(font));
        }
    }

    #[test]
    fn test_process_chunk_error_propagation() {
        let mut streamer = default_streamer();

        // Bytes that are not valid UTF-8; the parser may or may not reject them
        let content = b"\xFF\xFE\xFD\xFC";

        match streamer.process_chunk(content) {
            // Accepted: everything returned must also have been buffered.
            Ok(chunks) => assert_eq!(streamer.buffer.len(), chunks.len()),
            // Rejected: the error must surface before the buffer is touched.
            Err(_) => assert!(streamer.buffer.is_empty()),
        }
    }

    #[test]
    fn test_extract_text_empty_buffer() {
        let streamer = default_streamer();
        assert!(streamer.extract_text().is_empty());
    }

    #[test]
    fn test_extract_text_single_chunk() {
        let mut streamer = default_streamer();
        streamer.buffer.push_back(chunk_at("Single", 0.0, 0.0));

        assert_eq!(streamer.extract_text(), "Single");
    }

    #[test]
    fn test_check_buffer_size_empty() {
        let mut streamer = default_streamer();
        streamer.check_buffer_size(); // Should not panic on empty buffer
        assert!(streamer.buffer.is_empty());
    }

    #[test]
    fn test_complex_content_operations() {
        let mut streamer = default_streamer();

        // Complex PDF content with mixed operations
        let content = b"BT /F1 12 Tf 0 0 Td (Start) Tj ET q Q BT 50 50 Td (End) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();

        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].text, "Start");
        assert_eq!(chunks[1].text, "End");
        assert_eq!(chunks[0].x, 0.0);
        assert_eq!(chunks[1].x, 50.0);
    }

    #[test]
    fn test_stream_text_callback_state() {
        let content = b"BT /F1 12 Tf (Test) Tj ET".to_vec();
        let streams = vec![content; 3]; // Same content 3 times

        let mut count = 0;
        stream_text(streams, |_chunk| {
            count += 1;
            Ok(())
        })
        .unwrap();

        assert_eq!(count, 3);
    }
}