pdf-engine 1.0.0-beta.6

Unified PDF rendering engine — page rendering, text extraction, thumbnails.
Documentation
/// Fallback entry point used when the example is built without any OCR
/// feature: reports the required feature flags on stderr and exits with
/// status 1.
#[cfg(not(any(feature = "ocr", feature = "ocr-onnx")))]
fn main() {
    use std::process::exit;

    eprintln!("This example requires `--features ocr` or `--features ocr-onnx`.");
    exit(1);
}

#[cfg(feature = "ocr-onnx")]
use pdf_engine::ocr::PaddleOnnxBackend;
#[cfg(feature = "ocr")]
use pdf_engine::OcrsBackend;
#[cfg(any(feature = "ocr", feature = "ocr-onnx"))]
use pdf_engine::{OcrBackend, PdfDocument};
#[cfg(feature = "ocr-onnx")]
use pdf_ocr::paddle::{DetectionModel, Language, PaddleOcrConfig};
#[cfg(any(feature = "ocr", feature = "ocr-onnx"))]
use std::fs;
#[cfg(any(feature = "ocr", feature = "ocr-onnx"))]
use std::path::PathBuf;

/// Relative PDF paths that `main` processes when no positional path
/// arguments are given on the command line.
#[cfg(any(feature = "ocr", feature = "ocr-onnx"))]
const DEFAULT_CASES: &[&str] = &[
    "corpus/fw4v.pdf",
    "corpus/fw8ben.pdf",
    "corpus/f982.pdf",
    "corpus/sf181.pdf",
    "corpus/f461.pdf",
];

/// Compares OCR output against the extracted text layer for a set of PDFs.
///
/// Command line: any argument starting with `--backend=` selects the OCR
/// backend; every other argument is treated as a PDF path. When no path is
/// given, [`DEFAULT_CASES`] is used.
///
/// For each file, the text extracted from page index 0 and the OCR result for
/// the same page are whitespace-normalized and compared with a
/// Levenshtein-based similarity. One tab-separated row is printed per file,
/// followed by an `average` row.
///
/// # Errors
///
/// Returns an error if the backend cannot be selected, or if reading,
/// opening, text extraction or OCR fails for any file. Per-file errors name
/// the offending path so a failure in a multi-file run can be located.
#[cfg(any(feature = "ocr", feature = "ocr-onnx"))]
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let args: Vec<String> = std::env::args().skip(1).collect();

    let mut backend_name = String::new();
    let backend = select_backend(&args, &mut backend_name)?;

    // Everything that is not the backend selector is treated as an input path.
    let mut paths: Vec<PathBuf> = args
        .iter()
        .filter(|arg| !arg.starts_with("--backend="))
        .map(PathBuf::from)
        .collect();
    if paths.is_empty() {
        paths.extend(DEFAULT_CASES.iter().map(PathBuf::from));
    }

    println!("backend={backend_name}");
    println!("filename\ttext_length\tocr_length\tsimilarity%");

    let mut total = 0.0f64;
    let mut count = 0usize;

    for path in &paths {
        let shown = path.display();

        let data = fs::read(path).map_err(|err| format!("failed to read {shown}: {err}"))?;
        let doc =
            PdfDocument::open(data).map_err(|err| format!("failed to open {shown}: {err}"))?;

        let extracted = doc
            .extract_text(0)
            .map_err(|err| format!("text extraction failed for {shown}: {err}"))?;
        let expected = normalize_text(&extracted);

        // NOTE(review): 150.0 is presumably the render resolution in DPI — confirm
        // against `PdfDocument::ocr_page`.
        let ocr_result = doc
            .ocr_page(0, backend.as_ref(), 150.0)
            .map_err(|err| format!("OCR failed for {shown}: {err}"))?;
        let ocr = normalize_text(&ocr_result.text);

        let similarity = levenshtein_similarity(&expected, &ocr);
        let label = path
            .file_stem()
            .and_then(|name| name.to_str())
            .unwrap_or("unknown");

        println!(
            "{label}\t{}\t{}\t{:.1}",
            expected.chars().count(),
            ocr.chars().count(),
            similarity * 100.0
        );

        total += similarity;
        count += 1;
    }

    let average = if count == 0 {
        0.0
    } else {
        total / count as f64
    };
    println!("average\t-\t-\t{:.1}", average * 100.0);

    Ok(())
}

/// Builds the OCR backend requested on the command line.
///
/// The first argument of the form `--backend=<name>` selects the backend;
/// without one, [`default_backend_name`] decides. On success the chosen name
/// is written to `backend_name`.
///
/// # Errors
///
/// Fails when the name is not `ocrs` or `paddle-onnx`, when the matching
/// Cargo feature is not compiled in, or when constructing the backend fails.
#[cfg(any(feature = "ocr", feature = "ocr-onnx"))]
fn select_backend(
    args: &[String],
    backend_name: &mut String,
) -> Result<Box<dyn OcrBackend>, Box<dyn std::error::Error>> {
    let explicit = args.iter().find_map(|arg| arg.strip_prefix("--backend="));
    let requested = explicit.unwrap_or(default_backend_name());

    // Each known backend has two mutually exclusive arms: the real constructor
    // when its feature is compiled in, and an explanatory error otherwise.
    match requested {
        #[cfg(feature = "ocr")]
        "ocrs" => {
            *backend_name = String::from("ocrs");
            Ok(Box::new(OcrsBackend::try_default()?))
        }
        #[cfg(not(feature = "ocr"))]
        "ocrs" => Err("backend `ocrs` requires `--features ocr`".into()),
        #[cfg(feature = "ocr-onnx")]
        "paddle-onnx" => {
            *backend_name = String::from("paddle-onnx");
            Ok(Box::new(build_paddle_backend()?))
        }
        #[cfg(not(feature = "ocr-onnx"))]
        "paddle-onnx" => Err("backend `paddle-onnx` requires `--features ocr-onnx`".into()),
        other => Err(format!("unsupported backend `{other}`").into()),
    }
}

#[cfg(any(feature = "ocr", feature = "ocr-onnx"))]
/// Name of the backend used when no `--backend=` argument is supplied:
/// `paddle-onnx` whenever the `ocr-onnx` feature is compiled in, `ocrs`
/// otherwise.
fn default_backend_name() -> &'static str {
    if cfg!(feature = "ocr-onnx") {
        "paddle-onnx"
    } else {
        "ocrs"
    }
}

/// Creates the Paddle ONNX backend, starting from `PaddleOcrConfig::default()`
/// and applying overrides from the environment:
///
/// * `PADDLE_DET` — `v5` or `V5` selects `DetectionModel::V5`.
/// * `PADDLE_LANG` — `english`/`en`, `latin`, `japanese`/`jp`, `korean`/`ko`
///   or `arabic`/`ar` replaces the language list with that single language.
/// * `PADDLE_CLS` — `1`, `true` or `yes` turns on the angle classifier.
///
/// Unset or unrecognized values leave the corresponding default untouched.
///
/// # Errors
///
/// Propagates any error from `PaddleOnnxBackend::with_config`.
#[cfg(feature = "ocr-onnx")]
fn build_paddle_backend() -> Result<PaddleOnnxBackend, Box<dyn std::error::Error>> {
    let mut config = PaddleOcrConfig::default();

    let detection = std::env::var("PADDLE_DET").ok();
    if let Some("v5" | "V5") = detection.as_deref() {
        config.detection_model = DetectionModel::V5;
    }

    let language = std::env::var("PADDLE_LANG").ok();
    let language_override = match language.as_deref() {
        Some("english" | "en") => Some(Language::English),
        Some("latin") => Some(Language::Latin),
        Some("japanese" | "jp") => Some(Language::Japanese),
        Some("korean" | "ko") => Some(Language::Korean),
        Some("arabic" | "ar") => Some(Language::Arabic),
        _ => None,
    };
    if let Some(selected) = language_override {
        config.languages = vec![selected];
    }

    let classifier = std::env::var("PADDLE_CLS").ok();
    if let Some("1" | "true" | "yes") = classifier.as_deref() {
        config.use_angle_classifier = true;
    }

    let backend = PaddleOnnxBackend::with_config(config)?;
    Ok(backend)
}

#[cfg(any(feature = "ocr", feature = "ocr-onnx"))]
/// Collapses every run of whitespace in `text` into a single space and drops
/// leading and trailing whitespace.
fn normalize_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first one.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}

/// Similarity of two strings, computed as one minus the Levenshtein distance
/// divided by the longer length.
///
/// Lengths and edits are counted in `char`s, not bytes. Two empty strings are
/// defined to be fully similar (`1.0`).
#[cfg(any(feature = "ocr", feature = "ocr-onnx"))]
fn levenshtein_similarity(expected: &str, actual: &str) -> f64 {
    let lhs: Vec<char> = expected.chars().collect();
    let rhs: Vec<char> = actual.chars().collect();

    match lhs.len().max(rhs.len()) {
        // Nothing to compare on either side: treat as identical.
        0 => 1.0,
        longest => {
            let edits = levenshtein_distance(&lhs, &rhs);
            1.0 - edits as f64 / longest as f64
        }
    }
}

#[cfg(any(feature = "ocr", feature = "ocr-onnx"))]
/// Levenshtein edit distance between two character slices: the minimum number
/// of single-character insertions, deletions and substitutions (each costing
/// one) needed to turn `expected` into `actual`.
fn levenshtein_distance(expected: &[char], actual: &[char]) -> usize {
    // If either side is empty, the distance is the length of the other side.
    match (expected.len(), actual.len()) {
        (0, remaining) | (remaining, 0) => return remaining,
        _ => {}
    }

    // One rolling row of the DP table; `row[k]` is the distance between the
    // processed prefix of `expected` and the first `k` chars of `actual`.
    let mut row: Vec<usize> = (0..=actual.len()).collect();

    for (done, &source) in expected.iter().enumerate() {
        // Value of the previous row one column to the left of the current cell.
        let mut diagonal = row[0];
        row[0] = done + 1;

        for (col, &target) in actual.iter().enumerate() {
            let above = row[col + 1];
            let substitution = diagonal + usize::from(source != target);
            let deletion = above + 1;
            let insertion = row[col] + 1;

            row[col + 1] = deletion.min(insertion).min(substitution);
            diagonal = above;
        }
    }

    row[actual.len()]
}