use crate::error::Result;
use std::io::Read;
#[derive(Debug, Clone, PartialEq)]
pub enum ChunkType {
Text,
Image,
Graphics,
Form,
Unknown,
}
#[derive(Debug, Clone)]
pub struct ContentChunk {
pub chunk_type: ChunkType,
pub data: Vec<u8>,
pub position: u64,
pub size: usize,
pub page_number: u32,
}
impl ContentChunk {
pub fn new(chunk_type: ChunkType, data: Vec<u8>, position: u64, page_number: u32) -> Self {
let size = data.len();
Self {
chunk_type,
data,
position,
size,
page_number,
}
}
pub fn is_text(&self) -> bool {
self.chunk_type == ChunkType::Text
}
pub fn is_image(&self) -> bool {
self.chunk_type == ChunkType::Image
}
pub fn as_text(&self) -> Option<String> {
if self.is_text() {
Some(String::from_utf8_lossy(&self.data).to_string())
} else {
None
}
}
}
#[derive(Debug, Clone)]
pub struct ChunkOptions {
pub max_chunk_size: usize,
pub split_large_objects: bool,
pub buffer_size: usize,
pub chunk_types: Vec<ChunkType>,
}
impl Default for ChunkOptions {
fn default() -> Self {
Self {
max_chunk_size: 1024 * 1024, split_large_objects: true,
buffer_size: 64 * 1024, chunk_types: vec![
ChunkType::Text,
ChunkType::Image,
ChunkType::Graphics,
ChunkType::Form,
],
}
}
}
impl ChunkOptions {
pub fn validate(&self) -> Result<()> {
if self.max_chunk_size == 0 {
return Err(crate::error::PdfError::InvalidStructure(
"max_chunk_size cannot be 0".to_string(),
));
}
if self.buffer_size == 0 {
return Err(crate::error::PdfError::InvalidStructure(
"buffer_size cannot be 0".to_string(),
));
}
Ok(())
}
}
pub struct ChunkProcessor {
options: ChunkOptions,
current_position: u64,
current_page: u32,
}
impl ChunkProcessor {
pub fn new(options: ChunkOptions) -> Self {
Self {
options,
current_position: 0,
current_page: 0,
}
}
pub fn process_content(&mut self, content: &[u8]) -> Result<Vec<ContentChunk>> {
if self.options.max_chunk_size == 0 {
return Ok(vec![]);
}
let mut chunks = Vec::new();
let mut offset = 0;
while offset < content.len() {
let remaining = content.len() - offset;
let chunk_size = remaining.min(self.options.max_chunk_size);
let chunk_type = self.detect_chunk_type(&content[offset..offset + chunk_size]);
if !self.options.chunk_types.contains(&chunk_type) {
offset += chunk_size;
continue;
}
let chunk = ContentChunk::new(
chunk_type,
content[offset..offset + chunk_size].to_vec(),
self.current_position + offset as u64,
self.current_page,
);
chunks.push(chunk);
offset += chunk_size;
}
self.current_position += content.len() as u64;
Ok(chunks)
}
pub fn set_page(&mut self, page_number: u32) {
self.current_page = page_number;
}
pub fn reset(&mut self) {
self.current_position = 0;
self.current_page = 0;
}
fn detect_chunk_type(&self, data: &[u8]) -> ChunkType {
if data.starts_with(b"BT")
|| (data.contains(&b'T') && data.contains(&b'j'))
|| (data.len() == 1 && data[0] == b'T')
{
ChunkType::Text
} else if data.starts_with(b"\xFF\xD8") || data.starts_with(b"\x89PNG") {
ChunkType::Image
} else if data.contains(&b'm') || data.contains(&b'l') || data.contains(&b'c') {
ChunkType::Graphics
} else {
ChunkType::Unknown
}
}
}
pub fn process_in_chunks<R, F>(mut reader: R, options: ChunkOptions, mut callback: F) -> Result<()>
where
R: Read,
F: FnMut(ContentChunk) -> Result<()>,
{
options.validate()?;
let mut processor = ChunkProcessor::new(options.clone());
let mut buffer = vec![0u8; options.buffer_size];
let mut _position = 0u64;
loop {
match reader.read(&mut buffer) {
Ok(0) => break, Ok(n) => {
let chunks = processor.process_content(&buffer[..n])?;
for chunk in chunks {
callback(chunk)?;
}
_position += n as u64;
}
Err(e) => return Err(crate::error::PdfError::Io(e)),
}
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_content_chunk() {
let chunk = ContentChunk::new(ChunkType::Text, b"Hello World".to_vec(), 1024, 0);
assert_eq!(chunk.chunk_type, ChunkType::Text);
assert_eq!(chunk.size, 11);
assert_eq!(chunk.position, 1024);
assert_eq!(chunk.page_number, 0);
assert!(chunk.is_text());
assert!(!chunk.is_image());
assert_eq!(chunk.as_text(), Some("Hello World".to_string()));
}
#[test]
fn test_chunk_options_default() {
let options = ChunkOptions::default();
assert_eq!(options.max_chunk_size, 1024 * 1024);
assert!(options.split_large_objects);
assert_eq!(options.buffer_size, 64 * 1024);
assert_eq!(options.chunk_types.len(), 4);
}
#[test]
fn test_chunk_processor() {
let options = ChunkOptions::default();
let mut processor = ChunkProcessor::new(options);
let content = b"BT /F1 12 Tf 100 700 Td (Hello) Tj ET";
let chunks = processor.process_content(content).unwrap();
assert!(!chunks.is_empty());
assert_eq!(chunks[0].chunk_type, ChunkType::Text);
assert_eq!(chunks[0].data, content);
}
#[test]
fn test_chunk_type_detection() {
let processor = ChunkProcessor::new(ChunkOptions::default());
let text = b"BT /F1 12 Tf (text) Tj ET";
assert_eq!(processor.detect_chunk_type(text), ChunkType::Text);
let jpeg = b"\xFF\xD8\xFF\xE0";
assert_eq!(processor.detect_chunk_type(jpeg), ChunkType::Image);
let png = b"\x89PNG\r\n\x1a\n";
assert_eq!(processor.detect_chunk_type(png), ChunkType::Image);
let graphics = b"100 200 m 300 400 l S";
assert_eq!(processor.detect_chunk_type(graphics), ChunkType::Graphics);
}
#[test]
fn test_large_content_splitting() {
let options = ChunkOptions {
max_chunk_size: 10, ..Default::default()
};
let mut processor = ChunkProcessor::new(options);
let content = b"This is a much longer content that should be split into multiple chunks";
let chunks = processor.process_content(content).unwrap();
assert!(chunks.len() > 1);
assert!(chunks.iter().all(|c| c.size <= 10));
}
#[test]
fn test_chunk_filtering() {
let options = ChunkOptions {
chunk_types: vec![ChunkType::Text], ..Default::default()
};
let mut processor = ChunkProcessor::new(options);
let text_content = b"BT (text) Tj ET";
let image_content = b"\xFF\xD8\xFF\xE0 image data";
let text_chunks = processor.process_content(text_content).unwrap();
assert_eq!(text_chunks.len(), 1);
let image_chunks = processor.process_content(image_content).unwrap();
assert_eq!(image_chunks.len(), 0); }
#[test]
fn test_process_in_chunks() {
use std::io::Cursor;
let data = b"BT /F1 12 Tf 100 700 Td (Hello World) Tj ET";
let cursor = Cursor::new(data);
let options = ChunkOptions {
buffer_size: 10,
..Default::default()
};
let mut chunks_received = Vec::new();
process_in_chunks(cursor, options, |chunk| {
chunks_received.push(chunk);
Ok(())
})
.unwrap();
assert!(!chunks_received.is_empty());
}
#[test]
fn test_page_tracking() {
let mut processor = ChunkProcessor::new(ChunkOptions::default());
processor.set_page(5);
let content = b"Page 5 content";
let chunks = processor.process_content(content).unwrap();
assert!(!chunks.is_empty());
assert_eq!(chunks[0].page_number, 5);
}
#[test]
fn test_processor_reset() {
let mut processor = ChunkProcessor::new(ChunkOptions::default());
processor.current_position = 1000;
processor.current_page = 10;
processor.reset();
assert_eq!(processor.current_position, 0);
assert_eq!(processor.current_page, 0);
}
#[test]
fn test_chunk_type_debug_clone_eq() {
let types = vec![
ChunkType::Text,
ChunkType::Image,
ChunkType::Graphics,
ChunkType::Form,
ChunkType::Unknown,
];
for chunk_type in types {
let debug_str = format!("{chunk_type:?}");
assert!(!debug_str.is_empty());
let cloned = chunk_type.clone();
assert_eq!(chunk_type, cloned);
}
}
#[test]
fn test_content_chunk_debug_clone() {
let chunk = ContentChunk {
chunk_type: ChunkType::Graphics,
data: vec![1, 2, 3, 4],
position: 512,
size: 4,
page_number: 2,
};
let debug_str = format!("{chunk:?}");
assert!(debug_str.contains("ContentChunk"));
assert!(debug_str.contains("Graphics"));
let cloned = chunk.clone();
assert_eq!(cloned.chunk_type, chunk.chunk_type);
assert_eq!(cloned.data, chunk.data);
assert_eq!(cloned.position, chunk.position);
assert_eq!(cloned.size, chunk.size);
assert_eq!(cloned.page_number, chunk.page_number);
}
#[test]
fn test_chunk_options_debug_clone() {
let options = ChunkOptions {
max_chunk_size: 2048,
split_large_objects: false,
buffer_size: 1024,
chunk_types: vec![ChunkType::Text, ChunkType::Image],
};
let debug_str = format!("{options:?}");
assert!(debug_str.contains("ChunkOptions"));
let cloned = options.clone();
assert_eq!(cloned.max_chunk_size, options.max_chunk_size);
assert_eq!(cloned.split_large_objects, options.split_large_objects);
assert_eq!(cloned.buffer_size, options.buffer_size);
assert_eq!(cloned.chunk_types, options.chunk_types);
}
#[test]
fn test_content_chunk_image_methods() {
let image_chunk = ContentChunk::new(ChunkType::Image, b"\xFF\xD8\xFF\xE0".to_vec(), 0, 0);
assert!(image_chunk.is_image());
assert!(!image_chunk.is_text());
assert_eq!(image_chunk.as_text(), None);
}
#[test]
fn test_content_chunk_non_text_as_text() {
let graphics_chunk =
ContentChunk::new(ChunkType::Graphics, b"100 200 m 300 400 l S".to_vec(), 0, 0);
assert!(!graphics_chunk.is_text());
assert!(!graphics_chunk.is_image());
assert_eq!(graphics_chunk.as_text(), None);
}
#[test]
fn test_content_chunk_size_calculation() {
let data = b"Hello, World!".to_vec();
let expected_size = data.len();
let chunk = ContentChunk::new(ChunkType::Text, data, 100, 1);
assert_eq!(chunk.size, expected_size);
assert_eq!(chunk.size, chunk.data.len());
}
#[test]
fn test_chunk_processor_position_tracking() {
let mut processor = ChunkProcessor::new(ChunkOptions::default());
let content1 = b"First chunk";
let content2 = b"Second chunk";
let chunks1 = processor.process_content(content1).unwrap();
assert_eq!(chunks1[0].position, 0);
let chunks2 = processor.process_content(content2).unwrap();
assert_eq!(chunks2[0].position, content1.len() as u64);
}
#[test]
fn test_detect_chunk_type_edge_cases() {
let processor = ChunkProcessor::new(ChunkOptions::default());
assert_eq!(processor.detect_chunk_type(b""), ChunkType::Unknown);
assert_eq!(processor.detect_chunk_type(b"T"), ChunkType::Text);
assert_eq!(
processor.detect_chunk_type(b"Hello Tj World"),
ChunkType::Text
);
assert_eq!(processor.detect_chunk_type(b"m l c"), ChunkType::Graphics);
assert_eq!(processor.detect_chunk_type(b"xyz123"), ChunkType::Unknown);
}
#[test]
fn test_chunk_options_all_chunk_types() {
let all_types = vec![
ChunkType::Text,
ChunkType::Image,
ChunkType::Graphics,
ChunkType::Form,
ChunkType::Unknown,
];
let options = ChunkOptions {
chunk_types: all_types,
..Default::default()
};
assert_eq!(options.chunk_types.len(), 5);
assert!(options.chunk_types.contains(&ChunkType::Text));
assert!(options.chunk_types.contains(&ChunkType::Image));
assert!(options.chunk_types.contains(&ChunkType::Graphics));
assert!(options.chunk_types.contains(&ChunkType::Form));
assert!(options.chunk_types.contains(&ChunkType::Unknown));
}
#[test]
fn test_chunk_filtering_multiple_types() {
let mut options = ChunkOptions::default();
options.chunk_types = vec![ChunkType::Text, ChunkType::Graphics];
let mut processor = ChunkProcessor::new(options);
let text_content = b"BT (text) Tj ET";
let graphics_content = b"100 200 m 300 400 l S";
let image_content = b"\xFF\xD8\xFF\xE0";
let text_chunks = processor.process_content(text_content).unwrap();
assert_eq!(text_chunks.len(), 1);
let graphics_chunks = processor.process_content(graphics_content).unwrap();
assert_eq!(graphics_chunks.len(), 1);
let image_chunks = processor.process_content(image_content).unwrap();
assert_eq!(image_chunks.len(), 0); }
#[test]
fn test_process_in_chunks_with_io_error() {
use std::io::Error;
struct ErrorReader;
impl Read for ErrorReader {
fn read(&mut self, _buf: &mut [u8]) -> std::io::Result<usize> {
Err(Error::other("Test error"))
}
}
let reader = ErrorReader;
let options = ChunkOptions::default();
let result = process_in_chunks(reader, options, |_chunk| Ok(()));
assert!(result.is_err());
}
#[test]
fn test_process_in_chunks_with_callback_error() {
use std::io::Cursor;
let data = b"BT (text) Tj ET";
let cursor = Cursor::new(data);
let options = ChunkOptions::default();
let result = process_in_chunks(cursor, options, |_chunk| {
Err(crate::error::PdfError::ParseError(
"Callback error".to_string(),
))
});
assert!(result.is_err());
}
#[test]
fn test_process_in_chunks_empty_data() {
use std::io::Cursor;
let data = b"";
let cursor = Cursor::new(data);
let options = ChunkOptions::default();
let mut chunks_received = Vec::new();
process_in_chunks(cursor, options, |chunk| {
chunks_received.push(chunk);
Ok(())
})
.unwrap();
assert!(chunks_received.is_empty());
}
#[test]
fn test_chunk_processor_with_zero_max_size() {
let mut options = ChunkOptions::default();
options.max_chunk_size = 0;
let mut processor = ChunkProcessor::new(options);
let content = b"Some content";
let chunks = processor.process_content(content).unwrap();
assert!(chunks.is_empty());
}
#[test]
fn test_chunk_processor_exact_chunk_size() {
let mut options = ChunkOptions::default();
options.max_chunk_size = 5;
let mut processor = ChunkProcessor::new(options);
let content = b"Hello";
let chunks = processor.process_content(content).unwrap();
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].size, 5);
}
#[test]
fn test_content_chunk_with_binary_data() {
let binary_data = vec![0, 1, 2, 3, 255, 254, 253];
let chunk = ContentChunk::new(ChunkType::Image, binary_data.clone(), 0, 0);
assert_eq!(chunk.data, binary_data);
assert_eq!(chunk.size, 7);
assert!(chunk.is_image());
assert_eq!(chunk.as_text(), None);
}
#[test]
fn test_content_chunk_as_text_with_utf8() {
let text_data = "Hello, 世界!".as_bytes().to_vec();
let chunk = ContentChunk::new(ChunkType::Text, text_data, 0, 0);
assert_eq!(chunk.as_text(), Some("Hello, 世界!".to_string()));
}
#[test]
fn test_content_chunk_as_text_with_invalid_utf8() {
let invalid_utf8 = vec![0xFF, 0xFE, 0xFD];
let chunk = ContentChunk::new(ChunkType::Text, invalid_utf8, 0, 0);
let text = chunk.as_text();
assert!(text.is_some());
assert!(!text.unwrap().is_empty());
}
#[test]
fn test_detect_form_xobject() {
let processor = ChunkProcessor::new(ChunkOptions::default());
let form_content = b"q 1 0 0 1 0 0 cm BT /F1 12 Tf (Form) Tj ET Q";
let detected_type = processor.detect_chunk_type(form_content);
assert_eq!(detected_type, ChunkType::Text);
}
#[test]
fn test_processor_multiple_pages() {
let mut processor = ChunkProcessor::new(ChunkOptions::default());
processor.set_page(0);
let content1 = b"Page 0 content";
let chunks1 = processor.process_content(content1).unwrap();
assert_eq!(chunks1[0].page_number, 0);
processor.set_page(1);
let content2 = b"Page 1 content";
let chunks2 = processor.process_content(content2).unwrap();
assert_eq!(chunks2[0].page_number, 1);
assert!(chunks2[0].position > chunks1[0].position);
}
#[test]
fn test_chunk_options_empty_chunk_types() {
let options = ChunkOptions {
chunk_types: vec![], ..Default::default()
};
let mut processor = ChunkProcessor::new(options);
let content = b"Any content";
let chunks = processor.process_content(content).unwrap();
assert!(chunks.is_empty()); }
#[test]
fn test_process_in_chunks_large_buffer() {
use std::io::Cursor;
let data = b"BT /F1 12 Tf 100 700 Td (Hello World) Tj ET";
let cursor = Cursor::new(data);
let options = ChunkOptions {
buffer_size: 1024, ..Default::default()
};
let mut chunks_received = Vec::new();
process_in_chunks(cursor, options, |chunk| {
chunks_received.push(chunk);
Ok(())
})
.unwrap();
assert!(!chunks_received.is_empty());
assert_eq!(chunks_received[0].data, data);
}
#[test]
fn test_chunk_options_validation() {
let mut options = ChunkOptions::default();
assert!(options.validate().is_ok());
options.max_chunk_size = 0;
assert!(options.validate().is_err());
options = ChunkOptions::default();
options.buffer_size = 0;
assert!(options.validate().is_err());
}
#[test]
fn test_process_in_chunks_with_invalid_options() {
use std::io::Cursor;
let data = b"test data";
let cursor = Cursor::new(data);
let mut options = ChunkOptions::default();
options.buffer_size = 0;
let result = process_in_chunks(cursor, options, |_| Ok(()));
assert!(result.is_err());
let cursor = Cursor::new(data);
let mut options = ChunkOptions::default();
options.max_chunk_size = 0;
let result = process_in_chunks(cursor, options, |_| Ok(()));
assert!(result.is_err());
}
}