use pdf_oxide::document::PdfDocument;
use pdf_oxide::pipeline::config::{DocumentType, TextPipelineConfig};
use std::fs::{self, File};
use std::io::Write;
use std::path::{Path, PathBuf};
use std::time::Instant;
struct ExportConfig {
pdf_dir: PathBuf,
output_dir: PathBuf,
verbose: bool,
}
impl ExportConfig {
fn from_args() -> Self {
let args: Vec<String> = std::env::args().collect();
let mut pdf_dir = PathBuf::from("test_datasets/pdfs");
let mut output_dir = PathBuf::from("markdown_exports/with_pipeline");
let mut verbose = false;
let mut i = 1;
while i < args.len() {
match args[i].as_str() {
"--input-dir" => {
i += 1;
if i < args.len() {
pdf_dir = PathBuf::from(&args[i]);
}
},
"--output-dir" => {
i += 1;
if i < args.len() {
output_dir = PathBuf::from(&args[i]);
}
},
"--verbose" | "-v" => {
verbose = true;
},
_ => {},
}
i += 1;
}
Self {
pdf_dir,
output_dir,
verbose,
}
}
}
fn discover_pdfs(base_dir: &Path) -> Vec<(PathBuf, String)> {
let mut pdfs = Vec::new();
if !base_dir.exists() {
eprintln!("Error: Directory {} does not exist", base_dir.display());
return pdfs;
}
fn walk_dir(dir: &Path, pdfs: &mut Vec<(PathBuf, String)>) {
if let Ok(entries) = fs::read_dir(dir) {
for entry in entries.flatten() {
let path = entry.path();
if path.is_dir() {
walk_dir(&path, pdfs);
} else if path.extension().and_then(|s| s.to_str()) == Some("pdf") {
let relative_path = path
.strip_prefix("test_datasets/pdfs")
.unwrap_or(&path)
.to_string_lossy()
.to_string();
pdfs.push((path, relative_path));
}
}
}
}
walk_dir(base_dir, &mut pdfs);
pdfs.sort();
pdfs
}
fn extract_pdf(pdf_path: &Path, verbose: bool) -> Result<String, Box<dyn std::error::Error>> {
let mut doc = PdfDocument::open(pdf_path)?;
let mut full_text = String::new();
let page_count = doc.page_count()?;
for page_idx in 0..page_count {
if verbose {
eprintln!(" Extracting page {}/{}", page_idx + 1, page_count);
}
match doc.extract_text(page_idx) {
Ok(text) => {
full_text.push_str(&text);
if page_idx < page_count - 1 {
full_text.push_str("\n\n---\n\n");
}
},
Err(e) => {
eprintln!(" Warning: Failed to extract page {}: {}", page_idx + 1, e);
},
}
}
Ok(full_text)
}
fn main() -> Result<(), Box<dyn std::error::Error>> {
let config = ExportConfig::from_args();
fs::create_dir_all(&config.output_dir)?;
println!("=== PDF Extraction with TextPipeline API ===");
println!("Input directory: {}", config.pdf_dir.display());
println!("Output directory: {}", config.output_dir.display());
println!();
let pdfs = discover_pdfs(&config.pdf_dir);
println!("Found {} PDFs", pdfs.len());
println!();
if pdfs.is_empty() {
println!("No PDFs found in {}", config.pdf_dir.display());
return Ok(());
}
let _pipeline_config = TextPipelineConfig::for_document_type(DocumentType::Academic);
let start = Instant::now();
let mut successful = 0;
let mut failed = 0;
for (pdf_path, relative_path) in &pdfs {
let output_filename = relative_path.replace(".pdf", ".md");
let output_path = config.output_dir.join(&output_filename);
if let Some(parent) = output_path.parent() {
fs::create_dir_all(parent)?;
}
if config.verbose {
println!("Processing: {}", relative_path);
}
match extract_pdf(pdf_path, config.verbose) {
Ok(markdown) => {
if let Ok(mut file) = File::create(&output_path) {
if file.write_all(markdown.as_bytes()).is_ok() {
if config.verbose {
println!(" ✓ Saved to {}", output_path.display());
}
successful += 1;
} else {
eprintln!("✗ Failed to write {}", relative_path);
failed += 1;
}
} else {
eprintln!("✗ Failed to create output file {}", relative_path);
failed += 1;
}
},
Err(e) => {
eprintln!("✗ Failed to extract {}: {}", relative_path, e);
failed += 1;
},
}
}
let elapsed = start.elapsed();
println!();
println!("=== Extraction Complete ===");
println!("Successfully extracted: {}/{}", successful, pdfs.len());
if failed > 0 {
println!("Failed: {}", failed);
}
println!("Total time: {:.2}s", elapsed.as_secs_f64());
println!(
"Average time per PDF: {:.2}ms",
elapsed.as_secs_f64() * 1000.0 / pdfs.len() as f64
);
println!();
println!("Output directory: {}", config.output_dir.display());
if let Ok(entries) = fs::read_dir(&config.output_dir) {
let count = entries.count();
println!("Generated files: {}", count);
}
Ok(())
}