use crate::error::Result;
use crate::parser::content::{ContentOperation, ContentParser};
use std::collections::VecDeque;
/// A piece of text together with the text state that applied to it.
///
/// `TextStreamer::process_chunk` creates one `TextChunk` per show-text
/// operation. `PartialEq` is derived so whole chunks can be compared directly
/// (for example with `assert_eq!`) instead of field by field.
#[derive(Debug, Clone, PartialEq)]
pub struct TextChunk {
    /// The text content. When produced by the streamer, bytes that are not
    /// valid UTF-8 have been replaced lossily.
    pub text: String,
    /// Horizontal position: the streamer's accumulated x offset at the time
    /// the text was shown.
    pub x: f64,
    /// Vertical position: the streamer's accumulated y offset at the time the
    /// text was shown.
    pub y: f64,
    /// Font size that was current when the text was shown.
    pub font_size: f64,
    /// Name of the font that was current, or `None` if no font had been set.
    pub font_name: Option<String>,
}
/// Tunables for a `TextStreamer`.
#[derive(Clone, Debug)]
pub struct TextStreamOptions {
    /// Text shown while the current font size is below this value is skipped.
    pub min_font_size: f64,
    /// Upper bound on the combined byte length of the text held in the
    /// streamer's internal buffer; exceeding it triggers eviction.
    pub max_buffer_size: usize,
    /// Formatting-preservation flag.
    // NOTE(review): this flag is stored but not read anywhere in this file —
    // confirm whether another module consumes it.
    pub preserve_formatting: bool,
    /// When `true`, `TextStreamer::extract_text` orders chunks by descending
    /// `y` and then ascending `x` before joining them.
    pub sort_by_position: bool,
}
impl Default for TextStreamOptions {
    /// Defaults: no font-size filtering, a 1 MiB text buffer, formatting
    /// preserved, and position-based sorting enabled.
    fn default() -> Self {
        // 1 MiB of buffered text.
        const DEFAULT_MAX_BUFFER_BYTES: usize = 1024 * 1024;

        Self {
            min_font_size: 0.0,
            max_buffer_size: DEFAULT_MAX_BUFFER_BYTES,
            preserve_formatting: true,
            sort_by_position: true,
        }
    }
}
/// Incremental text extractor: tracks font and position state across content
/// operations and keeps a bounded buffer of the chunks it has produced.
pub struct TextStreamer {
    /// Filtering, buffering and sorting settings.
    options: TextStreamOptions,
    /// Chunks produced so far, oldest at the front.
    buffer: VecDeque<TextChunk>,
    /// Font name from the most recent set-font operation, if any.
    current_font: Option<String>,
    /// Font size from the most recent set-font operation.
    current_font_size: f64,
    /// Accumulated horizontal text offset.
    current_x: f64,
    /// Accumulated vertical text offset.
    current_y: f64,
}
impl TextStreamer {
    /// Creates a streamer with an empty buffer and the initial text state:
    /// no font selected, font size `12.0`, position `(0, 0)`.
    pub fn new(options: TextStreamOptions) -> Self {
        Self {
            options,
            buffer: VecDeque::new(),
            current_font: None,
            current_font_size: 12.0,
            current_x: 0.0,
            current_y: 0.0,
        }
    }

    /// Parses one content stream and returns the text chunks it produces.
    ///
    /// Font and position state carries over between calls. Every returned
    /// chunk is also appended to the internal buffer, which is then trimmed
    /// to `max_buffer_size`.
    ///
    /// # Errors
    ///
    /// Returns `PdfError::ParseError` with the parser's message when `data`
    /// cannot be parsed. Parsing happens before any state is touched, so on
    /// error neither the text state nor the buffer is modified.
    pub fn process_chunk(&mut self, data: &[u8]) -> Result<Vec<TextChunk>> {
        let operations = ContentParser::parse(data)
            .map_err(|e| crate::error::PdfError::ParseError(e.to_string()))?;

        let mut chunks = Vec::new();
        for op in operations {
            match op {
                ContentOperation::SetFont(name, size) => {
                    self.current_font = Some(name);
                    self.current_font_size = size as f64;
                }
                ContentOperation::MoveText(x, y) => {
                    // Move operands are offsets, so they accumulate.
                    self.current_x += x as f64;
                    self.current_y += y as f64;
                }
                ContentOperation::ShowText(bytes) => {
                    // Text below the configured minimum font size is dropped.
                    if self.current_font_size >= self.options.min_font_size {
                        chunks.push(TextChunk {
                            text: String::from_utf8_lossy(&bytes).into_owned(),
                            x: self.current_x,
                            y: self.current_y,
                            font_size: self.current_font_size,
                            font_name: self.current_font.clone(),
                        });
                    }
                }
                ContentOperation::BeginText => {
                    // A new text object starts from the origin again.
                    self.current_x = 0.0;
                    self.current_y = 0.0;
                }
                // All other operations do not affect text extraction.
                _ => {}
            }
        }

        self.buffer.extend(chunks.iter().cloned());
        self.check_buffer_size();
        Ok(chunks)
    }

    /// Returns a copy of every buffered chunk, oldest first.
    pub fn get_buffered_chunks(&self) -> Vec<TextChunk> {
        self.buffer.iter().cloned().collect()
    }

    /// Discards all buffered chunks; font and position state are kept.
    pub fn clear_buffer(&mut self) {
        self.buffer.clear();
    }

    /// Joins the text of all buffered chunks with single spaces.
    ///
    /// When `sort_by_position` is set, chunks are ordered by descending `y`
    /// and then ascending `x` first; chunks at identical positions keep their
    /// buffer order because the sort is stable. Otherwise buffer order is
    /// used as is.
    pub fn extract_text(&self) -> String {
        // Sort references rather than cloned chunks; only the text is needed.
        let mut chunks: Vec<&TextChunk> = self.buffer.iter().collect();
        if self.options.sort_by_position {
            // `total_cmp` gives a total order, so NaN coordinates cannot panic.
            chunks.sort_by(|a, b| b.y.total_cmp(&a.y).then(a.x.total_cmp(&b.x)));
        }
        chunks
            .iter()
            .map(|chunk| chunk.text.as_str())
            .collect::<Vec<_>>()
            .join(" ")
    }

    /// Evicts the oldest chunks until the combined byte length of the
    /// buffered text is at most `max_buffer_size`.
    fn check_buffer_size(&mut self) {
        let mut total_size: usize = self.buffer.iter().map(|chunk| chunk.text.len()).sum();
        while total_size > self.options.max_buffer_size {
            match self.buffer.pop_front() {
                // Keep the running total in step with the buffer so eviction
                // stops as soon as the limit is met instead of emptying it.
                Some(evicted) => total_size -= evicted.text.len(),
                None => break,
            }
        }
    }
}
/// Feeds each content stream through one shared, default-configured
/// `TextStreamer` and passes every resulting chunk to `callback`, in order.
///
/// Font and position state carries over from one stream to the next because
/// a single streamer handles all of them.
///
/// # Errors
///
/// Stops at, and returns, the first error reported either by
/// `TextStreamer::process_chunk` or by `callback`; later streams are not
/// processed.
pub fn stream_text<F>(content_streams: Vec<Vec<u8>>, mut callback: F) -> Result<()>
where
    F: FnMut(TextChunk) -> Result<()>,
{
    let mut streamer = TextStreamer::new(TextStreamOptions::default());
    content_streams
        .into_iter()
        .try_for_each(|raw| -> Result<()> {
            streamer
                .process_chunk(&raw)?
                .into_iter()
                .try_for_each(&mut callback)
        })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a 12pt chunk with no font name at the given position.
    fn plain_chunk(text: &str, x: f64, y: f64) -> TextChunk {
        TextChunk {
            text: text.to_string(),
            x,
            y,
            font_size: 12.0,
            font_name: None,
        }
    }

    /// Builds a streamer using the default options.
    fn default_streamer() -> TextStreamer {
        TextStreamer::new(TextStreamOptions::default())
    }

    #[test]
    fn test_text_chunk() {
        let chunk = TextChunk {
            text: "Hello".to_string(),
            x: 100.0,
            y: 700.0,
            font_size: 12.0,
            font_name: Some("Helvetica".to_string()),
        };
        assert_eq!(chunk.text, "Hello");
        assert_eq!(chunk.x, 100.0);
        assert_eq!(chunk.y, 700.0);
        assert_eq!(chunk.font_size, 12.0);
        assert_eq!(chunk.font_name, Some("Helvetica".to_string()));
    }

    #[test]
    fn test_text_stream_options_default() {
        let options = TextStreamOptions::default();
        assert_eq!(options.min_font_size, 0.0);
        assert_eq!(options.max_buffer_size, 1024 * 1024);
        assert!(options.preserve_formatting);
        assert!(options.sort_by_position);
    }

    #[test]
    fn test_text_streamer_creation() {
        let streamer = default_streamer();
        assert!(streamer.buffer.is_empty());
        assert_eq!(streamer.current_font_size, 12.0);
        assert_eq!(streamer.current_x, 0.0);
        assert_eq!(streamer.current_y, 0.0);
    }

    #[test]
    fn test_process_chunk_text() {
        let mut streamer = default_streamer();
        let content = b"BT /F1 14 Tf 100 700 Td (Hello World) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();
        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].text, "Hello World");
        assert_eq!(chunks[0].font_size, 14.0);
    }

    #[test]
    fn test_min_font_size_filter() {
        let options = TextStreamOptions {
            min_font_size: 10.0,
            ..TextStreamOptions::default()
        };
        let mut streamer = TextStreamer::new(options);

        // 8pt text is below the 10pt threshold and must be dropped.
        let content = b"BT /F1 8 Tf 100 700 Td (Small Text) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();
        assert!(chunks.is_empty());

        // 12pt text passes the filter.
        let content = b"BT /F1 12 Tf 100 650 Td (Large Text) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "Large Text");
    }

    #[test]
    fn test_extract_text_sorted() {
        let mut streamer = default_streamer();
        streamer.buffer.push_back(plain_chunk("Bottom", 100.0, 100.0));
        streamer.buffer.push_back(plain_chunk("Top", 100.0, 700.0));
        streamer.buffer.push_back(plain_chunk("Middle", 100.0, 400.0));
        let text = streamer.extract_text();
        assert_eq!(text, "Top Middle Bottom");
    }

    #[test]
    fn test_buffer_management() {
        let options = TextStreamOptions {
            max_buffer_size: 10,
            ..TextStreamOptions::default()
        };
        let mut streamer = TextStreamer::new(options);
        for i in 0..5 {
            streamer
                .buffer
                .push_back(plain_chunk(&format!("Text{i}"), 0.0, 0.0));
        }
        streamer.check_buffer_size();
        assert!(streamer.buffer.len() < 5);
    }

    #[test]
    fn test_stream_text_function() {
        let content1 = b"BT /F1 12 Tf 100 700 Td (Page 1) Tj ET".to_vec();
        let content2 = b"BT /F1 12 Tf 100 650 Td (Page 2) Tj ET".to_vec();
        let streams = vec![content1, content2];
        let mut collected = Vec::new();
        stream_text(streams, |chunk| {
            collected.push(chunk.text);
            Ok(())
        })
        .unwrap();
        assert_eq!(collected.len(), 2);
        assert_eq!(collected[0], "Page 1");
        assert_eq!(collected[1], "Page 2");
    }

    #[test]
    fn test_text_chunk_debug_clone() {
        let chunk = TextChunk {
            text: "Test".to_string(),
            x: 50.0,
            y: 100.0,
            font_size: 10.0,
            font_name: Some("Arial".to_string()),
        };
        let debug_str = format!("{chunk:?}");
        assert!(debug_str.contains("TextChunk"));
        assert!(debug_str.contains("Test"));
        let cloned = chunk.clone();
        assert_eq!(cloned.text, chunk.text);
        assert_eq!(cloned.x, chunk.x);
        assert_eq!(cloned.y, chunk.y);
        assert_eq!(cloned.font_size, chunk.font_size);
        assert_eq!(cloned.font_name, chunk.font_name);
    }

    #[test]
    fn test_text_stream_options_custom() {
        let options = TextStreamOptions {
            min_font_size: 8.0,
            max_buffer_size: 2048,
            preserve_formatting: false,
            sort_by_position: false,
        };
        assert_eq!(options.min_font_size, 8.0);
        assert_eq!(options.max_buffer_size, 2048);
        assert!(!options.preserve_formatting);
        assert!(!options.sort_by_position);
    }

    #[test]
    fn test_text_stream_options_debug_clone() {
        let options = TextStreamOptions::default();
        let debug_str = format!("{options:?}");
        assert!(debug_str.contains("TextStreamOptions"));
        let cloned = options.clone();
        assert_eq!(cloned.min_font_size, options.min_font_size);
        assert_eq!(cloned.max_buffer_size, options.max_buffer_size);
        assert_eq!(cloned.preserve_formatting, options.preserve_formatting);
        assert_eq!(cloned.sort_by_position, options.sort_by_position);
    }

    #[test]
    fn test_text_streamer_process_empty_chunk() {
        let mut streamer = default_streamer();
        let chunks = streamer.process_chunk(b"").unwrap();
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_text_streamer_process_invalid_content() {
        let mut streamer = default_streamer();
        let content = b"Not valid PDF content";
        // Whether the parser rejects or tolerates this input, no text may
        // come out of it and nothing may end up in the buffer.
        if let Ok(chunks) = streamer.process_chunk(content) {
            assert!(chunks.is_empty());
        }
        assert!(streamer.buffer.is_empty());
    }

    #[test]
    fn test_text_streamer_font_tracking() {
        let mut streamer = default_streamer();
        let content = b"BT /Helvetica-Bold 16 Tf ET";
        let _ = streamer.process_chunk(content).unwrap();
        assert_eq!(streamer.current_font, Some("Helvetica-Bold".to_string()));
        assert_eq!(streamer.current_font_size, 16.0);
    }

    #[test]
    fn test_text_streamer_position_tracking() {
        let mut streamer = default_streamer();
        let content = b"BT 50 100 Td ET";
        let _ = streamer.process_chunk(content).unwrap();
        assert_eq!(streamer.current_x, 50.0);
        assert_eq!(streamer.current_y, 100.0);
    }

    #[test]
    fn test_text_streamer_begin_text_resets_position() {
        let mut streamer = default_streamer();
        streamer.current_x = 100.0;
        streamer.current_y = 200.0;
        let content = b"BT ET";
        let _ = streamer.process_chunk(content).unwrap();
        assert_eq!(streamer.current_x, 0.0);
        assert_eq!(streamer.current_y, 0.0);
    }

    #[test]
    fn test_text_streamer_clear_buffer() {
        let mut streamer = default_streamer();
        streamer.buffer.push_back(plain_chunk("Chunk1", 0.0, 0.0));
        streamer.buffer.push_back(plain_chunk("Chunk2", 0.0, 0.0));
        assert_eq!(streamer.buffer.len(), 2);
        streamer.clear_buffer();
        assert!(streamer.buffer.is_empty());
    }

    #[test]
    fn test_text_streamer_get_buffered_chunks() {
        let mut streamer = default_streamer();
        let chunk1 = TextChunk {
            text: "First".to_string(),
            x: 10.0,
            y: 20.0,
            font_size: 14.0,
            font_name: Some("Times".to_string()),
        };
        let chunk2 = TextChunk {
            text: "Second".to_string(),
            x: 30.0,
            y: 40.0,
            font_size: 16.0,
            font_name: Some("Arial".to_string()),
        };
        streamer.buffer.push_back(chunk1);
        streamer.buffer.push_back(chunk2);
        let chunks = streamer.get_buffered_chunks();
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].text, "First");
        assert_eq!(chunks[1].text, "Second");
    }

    #[test]
    fn test_extract_text_no_sorting() {
        let options = TextStreamOptions {
            sort_by_position: false,
            ..TextStreamOptions::default()
        };
        let mut streamer = TextStreamer::new(options);
        streamer.buffer.push_back(plain_chunk("First", 200.0, 100.0));
        streamer.buffer.push_back(plain_chunk("Second", 100.0, 200.0));
        let text = streamer.extract_text();
        // Insertion order is kept when sorting is disabled.
        assert_eq!(text, "First Second");
    }

    #[test]
    fn test_extract_text_horizontal_sorting() {
        let mut streamer = default_streamer();
        streamer.buffer.push_back(plain_chunk("Right", 300.0, 500.0));
        streamer.buffer.push_back(plain_chunk("Left", 100.0, 500.0));
        streamer.buffer.push_back(plain_chunk("Middle", 200.0, 500.0));
        let text = streamer.extract_text();
        assert_eq!(text, "Left Middle Right");
    }

    #[test]
    fn test_check_buffer_size_edge_cases() {
        let options = TextStreamOptions {
            max_buffer_size: 20,
            ..TextStreamOptions::default()
        };
        let mut streamer = TextStreamer::new(options);

        // Exactly at the limit: nothing is evicted.
        streamer.buffer.push_back(plain_chunk(&"a".repeat(20), 0.0, 0.0));
        streamer.check_buffer_size();
        assert_eq!(streamer.buffer.len(), 1);

        // One byte over the limit: eviction must kick in.
        streamer.buffer.push_back(plain_chunk("b", 0.0, 0.0));
        streamer.check_buffer_size();
        assert!(streamer.buffer.len() <= 1);
    }

    #[test]
    fn test_stream_text_with_error_callback() {
        let content = b"BT /F1 12 Tf 100 700 Td (Test) Tj ET".to_vec();
        let streams = vec![content];
        let result = stream_text(streams, |_chunk| {
            Err(crate::error::PdfError::ParseError("Test error".to_string()))
        });
        assert!(result.is_err());
    }

    #[test]
    fn test_stream_text_empty_streams() {
        let streams: Vec<Vec<u8>> = vec![];
        let mut collected = Vec::new();
        stream_text(streams, |chunk| {
            collected.push(chunk);
            Ok(())
        })
        .unwrap();
        assert!(collected.is_empty());
    }

    #[test]
    fn test_text_chunk_without_font_name() {
        let chunk = plain_chunk("No Font", 0.0, 0.0);
        assert_eq!(chunk.font_name, None);
    }

    #[test]
    fn test_process_chunk_multiple_operations() {
        let mut streamer = default_streamer();
        let content = b"BT /F1 10 Tf 100 700 Td (First) Tj 50 0 Td (Second) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].text, "First");
        assert_eq!(chunks[1].text, "Second");
        assert_eq!(chunks[0].x, 100.0);
        // The second move is added to the first: 100 + 50.
        assert_eq!(chunks[1].x, 150.0);
    }

    #[test]
    fn test_buffer_size_calculation() {
        let options = TextStreamOptions {
            max_buffer_size: 100,
            ..TextStreamOptions::default()
        };
        let mut streamer = TextStreamer::new(options);

        // Ten chunks of ten bytes each fill the buffer exactly.
        for _ in 0..10 {
            streamer.buffer.push_back(plain_chunk("1234567890", 0.0, 0.0));
        }
        streamer.check_buffer_size();

        streamer.buffer.push_back(plain_chunk("x", 0.0, 0.0));
        streamer.check_buffer_size();
        let total_size: usize = streamer.buffer.iter().map(|c| c.text.len()).sum();
        assert!(total_size <= 100);
    }

    #[test]
    fn test_text_chunk_extreme_positions() {
        let chunk = TextChunk {
            text: "Extreme".to_string(),
            x: f64::MAX,
            y: f64::MIN,
            font_size: 0.1,
            font_name: Some("TinyFont".to_string()),
        };
        assert_eq!(chunk.x, f64::MAX);
        assert_eq!(chunk.y, f64::MIN);
        assert_eq!(chunk.font_size, 0.1);
    }

    #[test]
    fn test_text_streamer_accumulated_position() {
        let mut streamer = default_streamer();
        let content = b"BT 10 20 Td 5 10 Td 15 -5 Td ET";
        let _ = streamer.process_chunk(content).unwrap();
        // 10 + 5 + 15
        assert_eq!(streamer.current_x, 30.0);
        // 20 + 10 - 5
        assert_eq!(streamer.current_y, 25.0);
    }

    #[test]
    fn test_process_chunk_with_multiple_font_changes() {
        let mut streamer = default_streamer();
        let content = b"BT /F1 10 Tf (Small) Tj /F2 24 Tf (Large) Tj /F3 16 Tf (Medium) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].font_size, 10.0);
        assert_eq!(chunks[1].font_size, 24.0);
        assert_eq!(chunks[2].font_size, 16.0);
    }

    #[test]
    fn test_empty_text_operations() {
        let mut streamer = default_streamer();
        let content = b"BT /F1 12 Tf () Tj ( ) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();
        assert_eq!(chunks.len(), 2);
        assert!(chunks[0].text.is_empty());
        assert_eq!(chunks[1].text, " ");
    }

    #[test]
    fn test_text_with_special_characters() {
        let mut streamer = default_streamer();
        let content = b"BT /F1 12 Tf (\xC3\xA9\xC3\xA0\xC3\xB1) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();
        assert!(!chunks.is_empty());
        assert!(!chunks[0].text.is_empty());
    }

    #[test]
    fn test_sorting_with_equal_positions() {
        let mut streamer = default_streamer();
        for i in 0..3 {
            streamer
                .buffer
                .push_back(plain_chunk(&format!("Text{i}"), 100.0, 100.0));
        }
        let text = streamer.extract_text();
        assert!(text.contains("Text0"));
        assert!(text.contains("Text1"));
        assert!(text.contains("Text2"));
    }

    #[test]
    fn test_max_buffer_size_zero() {
        let options = TextStreamOptions {
            max_buffer_size: 0,
            ..TextStreamOptions::default()
        };
        let mut streamer = TextStreamer::new(options);
        streamer
            .buffer
            .push_back(plain_chunk("Should be removed", 0.0, 0.0));
        streamer.check_buffer_size();
        assert!(streamer.buffer.is_empty());
    }

    #[test]
    fn test_font_name_with_spaces() {
        let mut streamer = default_streamer();
        let content = b"BT /Times New Roman 14 Tf ET";
        let result = streamer.process_chunk(content);
        assert!(result.is_err());
        // A failed parse must leave the font state untouched.
        assert_eq!(streamer.current_font, None);
        assert_eq!(streamer.current_font_size, 12.0);
    }

    #[test]
    fn test_stream_text_with_mixed_content() {
        let content1 = b"BT /F1 8 Tf (Small) Tj ET".to_vec();
        let content2 = b"Invalid content".to_vec();
        let content3 = b"BT /F2 16 Tf (Large) Tj ET".to_vec();
        let streams = vec![content1, content2, content3];
        let mut collected = Vec::new();
        let result = stream_text(streams, |chunk| {
            collected.push(chunk.text);
            Ok(())
        });
        match result {
            // The parser tolerated the junk stream: both real streams arrive.
            Ok(()) => assert_eq!(collected, ["Small", "Large"]),
            // Processing aborted at the junk stream, after the first stream
            // had already been delivered and before the third was reached.
            Err(_) => assert_eq!(collected, ["Small"]),
        }
    }

    #[test]
    fn test_preserve_formatting_option() {
        let options = TextStreamOptions {
            preserve_formatting: false,
            ..TextStreamOptions::default()
        };
        let streamer = TextStreamer::new(options.clone());
        assert!(!streamer.options.preserve_formatting);
        assert_eq!(streamer.options.min_font_size, options.min_font_size);
    }

    #[test]
    fn test_very_large_font_size() {
        let mut streamer = default_streamer();
        let content = b"BT /F1 9999 Tf (Huge) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();
        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].font_size, 9999.0);
        assert_eq!(chunks[0].text, "Huge");
    }

    #[test]
    fn test_negative_font_size() {
        // A negative threshold lets a negative font size through the filter.
        let options = TextStreamOptions {
            min_font_size: -10.0,
            ..TextStreamOptions::default()
        };
        let mut streamer = TextStreamer::new(options);
        streamer.current_font_size = -5.0;
        let content = b"BT (Negative) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();
        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].font_size, -5.0);
    }

    #[test]
    fn test_text_position_nan_handling() {
        let mut streamer = default_streamer();
        streamer.buffer.push_back(plain_chunk("NaN X", f64::NAN, 100.0));
        streamer.buffer.push_back(plain_chunk("NaN Y", 100.0, f64::NAN));
        // Sorting must not panic on NaN coordinates.
        let text = streamer.extract_text();
        assert!(text.contains("NaN"));
    }

    #[test]
    fn test_buffer_with_different_font_names() {
        let mut streamer = default_streamer();
        let fonts = ["Arial", "Times", "Courier", "Helvetica"];
        for (i, font) in fonts.iter().enumerate() {
            streamer.buffer.push_back(TextChunk {
                font_name: Some((*font).to_string()),
                ..plain_chunk(&format!("Font{i}"), 0.0, 0.0)
            });
        }
        let chunks = streamer.get_buffered_chunks();
        assert_eq!(chunks.len(), 4);
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.font_name, Some(fonts[i].to_string()));
        }
    }

    #[test]
    fn test_process_chunk_error_propagation() {
        let mut streamer = default_streamer();
        let content = b"\xFF\xFE\xFD\xFC";
        match streamer.process_chunk(content) {
            // Everything that was returned must also have been buffered.
            Ok(chunks) => assert_eq!(streamer.get_buffered_chunks().len(), chunks.len()),
            // A parse failure returns before anything is buffered.
            Err(_) => assert!(streamer.buffer.is_empty()),
        }
    }

    #[test]
    fn test_extract_text_empty_buffer() {
        let streamer = default_streamer();
        let text = streamer.extract_text();
        assert!(text.is_empty());
    }

    #[test]
    fn test_extract_text_single_chunk() {
        let mut streamer = default_streamer();
        streamer.buffer.push_back(plain_chunk("Single", 0.0, 0.0));
        let text = streamer.extract_text();
        assert_eq!(text, "Single");
    }

    #[test]
    fn test_check_buffer_size_empty() {
        let mut streamer = default_streamer();
        // Must be a no-op (and must not loop) on an empty buffer.
        streamer.check_buffer_size();
        assert!(streamer.buffer.is_empty());
    }

    #[test]
    fn test_complex_content_operations() {
        let mut streamer = default_streamer();
        let content = b"BT /F1 12 Tf 0 0 Td (Start) Tj ET q Q BT 50 50 Td (End) Tj ET";
        let chunks = streamer.process_chunk(content).unwrap();
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].text, "Start");
        assert_eq!(chunks[1].text, "End");
        assert_eq!(chunks[0].x, 0.0);
        assert_eq!(chunks[1].x, 50.0);
    }

    #[test]
    fn test_stream_text_callback_state() {
        let content = b"BT /F1 12 Tf (Test) Tj ET".to_vec();
        let streams = vec![content; 3];
        let mut count = 0;
        stream_text(streams, |_chunk| {
            count += 1;
            Ok(())
        })
        .unwrap();
        assert_eq!(count, 3);
    }
}