use std::path::{Path, PathBuf};
use std::time::Instant;
use crate::document::PdfDocument;
use crate::error::Error;
/// Outcome of processing a single file as part of a batch.
///
/// `BatchProcessor` produces one of these per input path, whether or not
/// processing that file succeeded.
#[derive(Debug)]
pub struct BatchResult {
/// Path of the file this result refers to.
pub path: PathBuf,
/// Extracted text on success, or the error that stopped processing.
pub text: Result<String, Error>,
/// Elapsed wall-clock time spent on this file, in milliseconds.
pub time_ms: u64,
/// Page count reported by the document; `BatchProcessor` stores 0 here
/// whenever `text` is an error.
pub page_count: usize,
}
/// Progress hook invoked as `callback(completed, total)` each time a file
/// finishes processing.
///
/// The `Send + Sync` bounds let the same callback be called from the parallel
/// iterator closure when the `parallel` feature is enabled.
pub type ProgressCallback = Box<dyn Fn(usize, usize) + Send + Sync>;
/// Runs text extraction over many files in one call, optionally reporting
/// progress after each file.
pub struct BatchProcessor {
/// Callback notified after each processed file; `None` means no reporting.
progress: Option<ProgressCallback>,
}
impl Default for BatchProcessor {
fn default() -> Self {
Self::new()
}
}
impl BatchProcessor {
pub fn new() -> Self {
Self { progress: None }
}
pub fn with_progress(mut self, callback: ProgressCallback) -> Self {
self.progress = Some(callback);
self
}
pub fn extract_text_from_files(&self, paths: &[&Path]) -> Vec<BatchResult> {
let total = paths.len();
#[cfg(feature = "parallel")]
{
use rayon::prelude::*;
use std::sync::atomic::{AtomicUsize, Ordering};
let completed = AtomicUsize::new(0);
let results: Vec<BatchResult> = paths
.par_iter()
.map(|path| {
let result = Self::process_single_file(path);
let done = completed.fetch_add(1, Ordering::Relaxed) + 1;
if let Some(ref cb) = self.progress {
cb(done, total);
}
result
})
.collect();
results
}
#[cfg(not(feature = "parallel"))]
{
let mut results = Vec::with_capacity(total);
for (i, path) in paths.iter().enumerate() {
results.push(Self::process_single_file(path));
if let Some(ref cb) = self.progress {
cb(i + 1, total);
}
}
results
}
}
pub fn extract_text_from_directory(&self, dir: &Path) -> Result<Vec<BatchResult>, Error> {
let mut pdf_paths: Vec<PathBuf> = Vec::new();
let entries = std::fs::read_dir(dir).map_err(Error::Io)?;
for entry in entries.flatten() {
let path = entry.path();
if path.is_file() {
if let Some(ext) = path.extension() {
if ext.eq_ignore_ascii_case("pdf") {
pdf_paths.push(path);
}
}
}
}
pdf_paths.sort();
let path_refs: Vec<&Path> = pdf_paths.iter().map(|p| p.as_path()).collect();
Ok(self.extract_text_from_files(&path_refs))
}
fn process_single_file(path: &Path) -> BatchResult {
let start = Instant::now();
let result = (|| -> Result<(String, usize), Error> {
let mut doc = PdfDocument::open(path)?;
let page_count = doc.page_count()?;
let text = doc.extract_all_text()?;
Ok((text, page_count))
})();
let time_ms = start.elapsed().as_millis() as u64;
match result {
Ok((text, page_count)) => BatchResult {
path: path.to_path_buf(),
text: Ok(text),
time_ms,
page_count,
},
Err(e) => BatchResult {
path: path.to_path_buf(),
text: Err(e),
time_ms,
page_count: 0,
},
}
}
}
/// Aggregate statistics computed over a slice of [`BatchResult`]s.
#[derive(Debug)]
pub struct BatchSummary {
    /// Number of results that were summarized.
    pub total: usize,
    /// Number of results whose `text` is `Ok`.
    pub succeeded: usize,
    /// Number of results whose `text` is `Err`.
    pub failed: usize,
    /// Total characters (Unicode scalar values) across all successfully
    /// extracted text.
    pub total_chars: usize,
    /// Sum of `page_count` over all results.
    pub total_pages: usize,
    /// Sum of `time_ms` over all results, successes and failures alike.
    pub total_time_ms: u64,
}

impl BatchSummary {
    /// Builds a summary from `results`.
    ///
    /// Time and page counts are accumulated for every result; character
    /// counts only for results that carry text. An empty slice yields a
    /// summary of all zeros.
    pub fn from_results(results: &[BatchResult]) -> Self {
        let mut summary = BatchSummary {
            total: results.len(),
            succeeded: 0,
            failed: 0,
            total_chars: 0,
            total_pages: 0,
            total_time_ms: 0,
        };

        for result in results {
            summary.total_time_ms += result.time_ms;
            summary.total_pages += result.page_count;
            match &result.text {
                Ok(text) => {
                    summary.succeeded += 1;
                    // `str::len` is a UTF-8 byte count and would over-count
                    // non-ASCII text; count scalar values so the figure
                    // matches the field's name.
                    summary.total_chars += text.chars().count();
                }
                Err(_) => summary.failed += 1,
            }
        }

        summary
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly built processor has no progress callback.
    #[test]
    fn test_batch_processor_creation() {
        assert!(BatchProcessor::new().progress.is_none());
    }

    /// `with_progress` stores the supplied callback.
    #[test]
    fn test_batch_processor_with_progress() {
        let callback: ProgressCallback = Box::new(|done, total| {
            assert!(done <= total);
        });
        let processor = BatchProcessor::new().with_progress(callback);
        assert!(processor.progress.is_some());
    }

    /// No input paths means no results.
    #[test]
    fn test_batch_processor_empty_list() {
        let results = BatchProcessor::new().extract_text_from_files(&[]);
        assert!(results.is_empty());
    }

    /// A missing file is reported as a failed result, not a panic.
    #[test]
    fn test_batch_processor_nonexistent_file() {
        let missing = Path::new("/nonexistent.pdf");
        let results = BatchProcessor::new().extract_text_from_files(&[missing]);
        assert_eq!(results.len(), 1);

        let only = &results[0];
        assert!(only.text.is_err());
        assert_eq!(only.page_count, 0);
    }

    /// Summarizing nothing yields zero counts.
    #[test]
    fn test_batch_summary_empty() {
        let summary = BatchSummary::from_results(&[]);
        assert_eq!(summary.total, 0);
        assert_eq!(summary.succeeded, 0);
        assert_eq!(summary.failed, 0);
    }

    /// One success plus one failure are tallied into the expected totals.
    #[test]
    fn test_batch_summary_from_results() {
        let good = BatchResult {
            path: PathBuf::from("good.pdf"),
            text: Ok(String::from("Hello world")),
            time_ms: 100,
            page_count: 5,
        };
        let bad = BatchResult {
            path: PathBuf::from("bad.pdf"),
            text: Err(Error::InvalidPdf(String::from("test"))),
            time_ms: 50,
            page_count: 0,
        };

        let summary = BatchSummary::from_results(&[good, bad]);
        assert_eq!(summary.total, 2);
        assert_eq!(summary.succeeded, 1);
        assert_eq!(summary.failed, 1);
        assert_eq!(summary.total_chars, 11);
        assert_eq!(summary.total_pages, 5);
        assert_eq!(summary.total_time_ms, 150);
    }

    /// End-to-end check against the bundled fixture; silently skipped when
    /// the fixture file is not present.
    #[test]
    fn test_batch_result_fixture() {
        let mut fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
        fixture.extend(["tests", "fixtures", "simple.pdf"].iter());
        if !fixture.exists() {
            return;
        }

        let results = BatchProcessor::new().extract_text_from_files(&[fixture.as_path()]);
        assert_eq!(results.len(), 1);

        let only = &results[0];
        assert!(only.text.is_ok());
        assert!(only.page_count > 0);
        assert!(only.time_ms < 10_000);
    }
}