pdf_oxide 0.3.38

The fastest Rust PDF library with text extraction: 0.8ms mean, 100% pass rate on 3,830 PDFs. 5× faster than pdf_extract, 17× faster than oxidize_pdf. Extract, create, and edit PDFs.
Documentation
//! Export PDFs to HTML using structured extraction
//!
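//! Exports every PDF found under `test_datasets/pdfs/<category>/` to HTML,
//! with clickable links and semantic structure. Each result is written to
//! `<output-dir>/<category>/<name>.html`.
//!
//! Usage:
//!   cargo run --release --bin export_to_html
//!   cargo run --release --bin export_to_html -- --output-dir custom/path
//!   cargo run --release --bin export_to_html -- --layout-mode  # Preserve PDF layout
//!   cargo run --release --bin export_to_html -- --verbose      # Print per-file details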

use pdf_oxide::converters::ConversionOptions;
use pdf_oxide::document::PdfDocument;
use std::fs::{self, File};
use std::io::Write;
use std::path::{Path, PathBuf};
use std::time::Instant;

/// Runtime settings for the exporter, assembled from the command line.
#[derive(Debug)]
struct ExportConfig {
    /// Root directory scanned for PDFs; always `test_datasets/pdfs`.
    pdf_dir: PathBuf,
    /// Directory the HTML files are written under (`--output-dir <path>`).
    output_dir: PathBuf,
    /// Print per-file details while exporting (`--verbose` / `-v`).
    verbose: bool,
    /// Ask the converter to preserve the PDF layout (`--layout-mode` / `-l`).
    preserve_layout: bool,
}

impl ExportConfig {
    /// Builds the configuration from the process command line.
    ///
    /// Recognized arguments:
    /// - `--output-dir <path>`: where to write HTML (default
    ///   `html_exports/our_library`)
    /// - `--verbose` / `-v`
    /// - `--layout-mode` / `-l`
    ///
    /// Anything else is reported on stderr and ignored.
    fn from_args() -> Self {
        // The first argument is the program name, not an option.
        Self::parse_args(std::env::args().skip(1))
    }

    /// Parses an argument list that does not include the program name.
    ///
    /// Kept separate from [`ExportConfig::from_args`] so the parsing rules
    /// can be exercised without touching the real process arguments.
    fn parse_args<I>(args: I) -> Self
    where
        I: IntoIterator<Item = String>,
    {
        let mut output_dir = PathBuf::from("html_exports/our_library");
        let mut verbose = false;
        let mut preserve_layout = false;

        let mut args = args.into_iter();
        while let Some(arg) = args.next() {
            match arg.as_str() {
                // The token following `--output-dir` is always taken as its value.
                "--output-dir" => match args.next() {
                    Some(dir) => output_dir = PathBuf::from(dir),
                    None => eprintln!(
                        "Warning: --output-dir requires a value; using {}",
                        output_dir.display()
                    ),
                },
                "--verbose" | "-v" => verbose = true,
                "--layout-mode" | "-l" => preserve_layout = true,
                // A typo such as `--layout` would otherwise silently run in
                // the default mode, so make the ignored argument visible.
                other => eprintln!("Warning: ignoring unrecognized argument '{}'", other),
            }
        }

        Self {
            pdf_dir: PathBuf::from("test_datasets/pdfs"),
            output_dir,
            verbose,
            preserve_layout,
        }
    }
}

/// Collects the PDFs stored one level below `base_dir`.
///
/// Every immediate subdirectory of `base_dir` is treated as a category, and
/// every entry inside it whose extension is `pdf` (compared without regard
/// to ASCII case, so `report.PDF` is included) is returned together with the
/// category name. Files sitting directly in `base_dir` are not returned.
///
/// Categories, and the files within each category, are returned in sorted
/// order so repeated runs process the corpus in the same sequence.
///
/// Problems are reported on stderr rather than returned: a missing or
/// unreadable `base_dir` yields an empty list, and an unreadable category is
/// skipped.
fn discover_pdfs(base_dir: &Path) -> Vec<(PathBuf, String)> {
    let mut pdfs = Vec::new();

    if !base_dir.exists() {
        eprintln!("Error: Directory {} does not exist", base_dir.display());
        return pdfs;
    }

    let mut categories = match fs::read_dir(base_dir) {
        Ok(entries) => entries
            .filter_map(Result::ok)
            .filter(|entry| entry.path().is_dir())
            .map(|entry| entry.file_name().to_string_lossy().into_owned())
            .collect::<Vec<_>>(),
        Err(e) => {
            eprintln!("Error reading directory: {}", e);
            return pdfs;
        },
    };
    // Directory iteration order is platform-dependent; sort for stable output.
    categories.sort();

    for category in categories {
        let category_path = base_dir.join(&category);

        let entries = match fs::read_dir(&category_path) {
            Ok(entries) => entries,
            Err(e) => {
                eprintln!("Error reading category {}: {}", category, e);
                continue;
            },
        };

        let mut found: Vec<PathBuf> = entries
            .filter_map(Result::ok)
            .map(|entry| entry.path())
            .filter(|path| {
                path.extension()
                    .is_some_and(|ext| ext.eq_ignore_ascii_case("pdf"))
            })
            .collect();
        found.sort();

        pdfs.extend(found.into_iter().map(|path| (path, category.clone())));
    }

    pdfs
}

/// Converts one PDF to HTML and writes it to
/// `<output_dir>/<category>/<stem>.html`.
///
/// `preserve_layout` is passed through to the converter; heading detection
/// is enabled only when it is `false`. When `verbose` is set, the input
/// being processed and the resulting file path and size are printed.
///
/// Returns the length of the generated HTML in bytes. Any failure while
/// opening the PDF, converting it, creating the category directory, or
/// writing the file is propagated to the caller.
fn export_pdf_to_html(
    pdf_path: &Path,
    category: &str,
    output_dir: &Path,
    preserve_layout: bool,
    verbose: bool,
) -> Result<usize, Box<dyn std::error::Error>> {
    // Fall back to a placeholder when the stem is absent or not valid UTF-8.
    let stem = match pdf_path.file_stem().and_then(|name| name.to_str()) {
        Some(name) => name,
        None => "unknown",
    };

    if verbose {
        println!("Processing: {}/{}.pdf", category, stem);
    }

    let mut document = PdfDocument::open(pdf_path)?;

    // Heading detection is only wanted for the semantic (non-layout) mode.
    let semantic_mode = !preserve_layout;
    let conversion = ConversionOptions {
        preserve_layout,
        detect_headings: semantic_mode,
        extract_tables: false,
        include_images: true,
        image_output_dir: None,
        reading_order_mode: pdf_oxide::converters::ReadingOrderMode::ColumnAware,
        bold_marker_behavior: pdf_oxide::converters::BoldMarkerBehavior::default(),
        table_detection_config: None,
        ..Default::default()
    };

    let rendered = document.to_html_all(&conversion)?;

    // One subdirectory per category, created on demand.
    let target_dir = output_dir.join(category);
    fs::create_dir_all(&target_dir)?;

    let target = target_dir.join(format!("{}.html", stem));
    let mut sink = File::create(&target)?;
    sink.write_all(rendered.as_bytes())?;

    let written = rendered.len();

    if verbose {
        println!("  ✅ Exported to: {}", target.display());
        println!("  📊 Size: {} bytes ({:.2} KB)", written, written as f64 / 1024.0);
    }

    Ok(written)
}

/// Entry point: prints a banner, exports every discovered PDF, and reports
/// per-file results followed by a summary with counts and elapsed time.
///
/// Returns early when no PDFs are found. Panics if the output directory
/// cannot be created or stdout cannot be flushed.
fn main() {
    let cfg = ExportConfig::from_args();
    let mode_label = if cfg.preserve_layout {
        "Layout-preserved"
    } else {
        "Semantic"
    };

    println!("PDF to HTML Exporter");
    println!("====================");
    println!("PDF directory: {}", cfg.pdf_dir.display());
    println!("Output directory: {}", cfg.output_dir.display());
    println!("Mode: {}", mode_label);
    println!();

    let inputs = discover_pdfs(&cfg.pdf_dir);
    let total = inputs.len();
    println!("Found {} PDF files\n", total);

    if total == 0 {
        eprintln!("No PDFs found in {}", cfg.pdf_dir.display());
        return;
    }

    fs::create_dir_all(&cfg.output_dir).expect("Failed to create output directory");

    let timer = Instant::now();
    let mut succeeded = 0;
    let mut failed = 0;

    for (index, (pdf_path, category)) in inputs.iter().enumerate() {
        let stem = pdf_path
            .file_stem()
            .and_then(|name| name.to_str())
            .unwrap_or("unknown");

        // Flush so the progress prefix is visible while the export runs.
        print!("[{}/{}] Exporting {}/{}.pdf ... ", index + 1, total, category, stem);
        std::io::stdout().flush().unwrap();

        let outcome = export_pdf_to_html(
            pdf_path,
            category,
            &cfg.output_dir,
            cfg.preserve_layout,
            cfg.verbose,
        );

        match outcome {
            Ok(size) => {
                println!("✓ ({} bytes)", size);
                succeeded += 1;
            },
            Err(err) => {
                println!("✗ Error: {}", err);
                failed += 1;
            },
        }
    }

    let took = timer.elapsed();
    let rule = "=".repeat(70);

    println!("\n{}", rule);
    println!("Export Complete");
    println!("{}", rule);
    println!("Success: {}/{}", succeeded, total);
    println!("Errors: {}", failed);
    println!("Time: {:?}", took);
    println!("Output: {}", cfg.output_dir.display());
    println!("{}", rule);
}