use crate::pdf::PdfDocument;
use anyhow::Result;
use rayon::prelude::*;
use std::path::Path;
/// Loads every input PDF in parallel, then merges the loaded documents into
/// `output_path` via `crate::pdf_ops::merge_pdfs_sequential`.
///
/// # Errors
///
/// Returns an error when:
/// - `input_paths` is empty,
/// - the output path or any input path is not valid UTF-8 (the underlying
///   loader and merger are handed `&str` paths),
/// - any input fails to load (`"Failed to load <path>: <cause>"`),
/// - the sequential merge itself fails.
pub fn merge_pdfs_parallel<P: AsRef<Path> + Send + Sync>(
    input_paths: &[P],
    output_path: P,
) -> Result<()> {
    if input_paths.is_empty() {
        anyhow::bail!("No input PDFs provided");
    }

    // Validate every path up front: a non-UTF-8 path is a caller error, not a
    // reason to panic, and rejecting it here avoids loading all inputs only to
    // fail on the output path afterwards.
    let output_str = output_path.as_ref().to_str().ok_or_else(|| {
        anyhow::anyhow!("Output path is not valid UTF-8: {:?}", output_path.as_ref())
    })?;
    let input_files = input_paths
        .iter()
        .map(|p| {
            let p = p.as_ref();
            p.to_str()
                .ok_or_else(|| anyhow::anyhow!("Input path is not valid UTF-8: {:?}", p))
        })
        .collect::<Result<Vec<&str>>>()?;

    // Loading is the parallel part; collecting into `Result` surfaces a load
    // failure as an error instead of a partial document list.
    let documents = input_files
        .par_iter()
        .map(|path| {
            PdfDocument::load_from_file(path)
                .map_err(|e| anyhow::anyhow!("Failed to load {}: {}", path, e))
        })
        .collect::<Result<Vec<_>>>()?;

    crate::pdf_ops::merge_pdfs_sequential(&documents, output_str)
}
/// Loads each PDF in parallel and extracts its text with `get_text`.
///
/// On success, returns one `(path, text)` pair per input, where `path` is the
/// input path rendered with `Path::display`.
///
/// # Errors
///
/// Returns `"Failed to process <path>: <cause>"` if any path is not valid
/// UTF-8, or if loading or text extraction fails for any input.
pub fn extract_text_parallel<P: AsRef<Path> + Send + Sync>(
    input_paths: &[P],
) -> Result<Vec<(String, String)>> {
    input_paths
        .par_iter()
        .map(|path| {
            let path_ref = path.as_ref();
            path_ref
                .to_str()
                // A non-UTF-8 path becomes an error rather than a panic inside
                // a rayon worker thread.
                .ok_or_else(|| anyhow::anyhow!("path is not valid UTF-8"))
                .and_then(|path_file| PdfDocument::load_from_file(path_file))
                .and_then(|doc| doc.get_text())
                // The display string is only built once the work succeeded.
                .map(|text| (path_ref.display().to_string(), text))
                .map_err(|e| anyhow::anyhow!("Failed to process {:?}: {}", path_ref, e))
        })
        .collect()
}
pub fn validate_pdfs_parallel<P: AsRef<Path> + Send + Sync>(input_paths: &[P]) -> Result<Vec<(String, bool)>> {
input_paths
.par_iter()
.map(|path| {
let path_ref = path.as_ref();
let path_str = path_ref.display().to_string();
let path_file = path_ref.to_str().unwrap();
let validation = crate::pdf::validate_pdf(path_file);
Ok(match validation {
Ok(v) => (path_str, v.valid),
Err(_) => (path_str, false),
})
})
.collect()
}
/// Estimates a page count for each input PDF in parallel.
///
/// The number reported is the count of stream objects whose content contains
/// one of the tokens `Tj`, `TJ` or `BT`. Streams that start with a zlib header
/// are inflated first. This is a heuristic based on text-bearing streams, not
/// a walk of the document's page tree, so image-only pages are not counted and
/// the result should be treated as an estimate.
///
/// On success, returns one `(path, count)` pair per input, where `path` is the
/// input path rendered with `Path::display`.
///
/// # Errors
///
/// Returns `"Failed to process <path>: <cause>"` if any path is not valid
/// UTF-8 or any document fails to load.
pub fn count_pages_parallel<P: AsRef<Path> + Send + Sync>(
    input_paths: &[P],
) -> Result<Vec<(String, usize)>> {
    /// Reports whether the bytes, read as lossy UTF-8, contain a text token.
    fn has_text_operators(bytes: &[u8]) -> bool {
        let content = String::from_utf8_lossy(bytes);
        content.contains("Tj") || content.contains("TJ") || content.contains("BT")
    }

    input_paths
        .par_iter()
        .map(|path| {
            let path_ref = path.as_ref();
            path_ref
                .to_str()
                // A non-UTF-8 path becomes an error rather than a panic inside
                // a rayon worker thread.
                .ok_or_else(|| anyhow::anyhow!("path is not valid UTF-8"))
                .and_then(|path_file| PdfDocument::load_from_file(path_file))
                .map(|doc| {
                    let page_count = doc
                        .objects
                        .iter()
                        .filter(|(_, obj)| {
                            if let crate::pdf::PdfObject::Stream { data, .. } = obj {
                                // 0x78 0x9C / 0x78 0xDA are the zlib headers for
                                // default and best compression.
                                let looks_deflated = data.len() > 2
                                    && data[0] == 0x78
                                    && (data[1] == 0x9C || data[1] == 0xDA);
                                if looks_deflated {
                                    // A failed inflate yields empty bytes, which
                                    // never match a text token.
                                    has_text_operators(
                                        &crate::compression::decompress_deflate(data)
                                            .unwrap_or_default(),
                                    )
                                } else {
                                    // Scan the raw bytes in place; no copy needed.
                                    has_text_operators(data)
                                }
                            } else {
                                false
                            }
                        })
                        .count();
                    (path_ref.display().to_string(), page_count)
                })
                .map_err(|e| anyhow::anyhow!("Failed to process {:?}: {}", path_ref, e))
        })
        .collect()
}
/// Loads each PDF in parallel and applies `processor` to the loaded document.
///
/// On success, returns one `(path, result)` pair per input, where `path` is
/// the input path rendered with `Path::display` and `result` is whatever
/// `processor` returned for that document.
///
/// # Errors
///
/// Returns `"Failed to process <path>: <cause>"` if any path is not valid
/// UTF-8, any document fails to load, or `processor` returns an error.
pub fn process_pdfs_parallel<P, F, R>(
    input_paths: &[P],
    processor: F,
) -> Result<Vec<(String, R)>>
where
    P: AsRef<Path> + Send + Sync,
    F: Fn(&PdfDocument) -> Result<R> + Sync + Send,
    R: Send,
{
    input_paths
        .par_iter()
        .map(|path| {
            let path_ref = path.as_ref();
            path_ref
                .to_str()
                // A non-UTF-8 path becomes an error rather than a panic inside
                // a rayon worker thread.
                .ok_or_else(|| anyhow::anyhow!("path is not valid UTF-8"))
                .and_then(|path_file| PdfDocument::load_from_file(path_file))
                .and_then(|doc| processor(&doc))
                // The display string is only built once the work succeeded.
                .map(|result| (path_ref.display().to_string(), result))
                .map_err(|e| anyhow::anyhow!("Failed to process {:?}: {}", path_ref, e))
        })
        .collect()
}
/// Settings shared by every PDF produced by
/// `generate_markdown_pdfs_parallel`.
///
/// NOTE(review): the fields carry a leading underscore although they are read
/// when generating PDFs; presumably a leftover from when they were unused.
pub struct ParallelPdfGenerator {
/// Page layout passed by value to `generate_pdf_bytes`.
_layout: crate::pdf_generator::PageLayout,
/// Font name passed to `generate_pdf_bytes`.
_font: String,
/// Font size passed to `generate_pdf_bytes` (presumably points; confirm
/// against the generator).
_font_size: f32,
}
impl ParallelPdfGenerator {
    /// Builds a generator using the portrait page layout, the "Helvetica"
    /// font and a font size of 12.0.
    pub fn new() -> Self {
        let layout = crate::pdf_generator::PageLayout::portrait();
        let font = String::from("Helvetica");
        Self {
            _layout: layout,
            _font: font,
            _font_size: 12.0,
        }
    }

    /// Converts every markdown document in `inputs` to PDF bytes in parallel.
    ///
    /// `inputs` maps a file name to its markdown source; the returned map uses
    /// the same file names as keys and holds the generated PDF bytes.
    ///
    /// # Errors
    ///
    /// Returns an error if PDF generation fails for any document.
    pub fn generate_markdown_pdfs_parallel(
        &self,
        inputs: &std::collections::HashMap<String, String>,
    ) -> Result<std::collections::HashMap<String, Vec<u8>>> {
        // Borrow the settings once so the worker closure only captures them.
        let Self {
            _layout: layout,
            _font: font,
            _font_size: size,
        } = self;

        inputs
            .par_iter()
            .map(|(name, source)| {
                let parsed = crate::elements::parse_markdown(source);
                let rendered =
                    crate::pdf_generator::generate_pdf_bytes(&parsed, font, *size, *layout)?;
                Ok((name.clone(), rendered))
            })
            .collect()
    }
}
impl Default for ParallelPdfGenerator {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Merging with no inputs is rejected by the up-front guard, so this
    /// exercises real behavior without reading or writing any file.
    #[test]
    fn test_parallel_merge() {
        let no_inputs: &[&str] = &[];
        let err = merge_pdfs_parallel(no_inputs, "unused-output.pdf")
            .expect_err("merging zero inputs must fail");
        assert_eq!(err.to_string(), "No input PDFs provided");
    }
}