#![allow(dead_code)]
use std::path::{Path, PathBuf};
/// Extracts the text of an in-memory PDF via `pdf_extract`.
///
/// # Panics
///
/// Panics with "should extract text from PDF" when `pdf_extract` returns an
/// error for `pdf_bytes`.
pub fn extract_pdf_text(pdf_bytes: &[u8]) -> String {
    let extraction = pdf_extract::extract_text_from_mem(pdf_bytes);
    extraction.expect("should extract text from PDF")
}
/// Validates `pdf_bytes` with `qpdf --check`, when validation is enabled.
///
/// Validation only runs when the `OFFICE2PDF_VALIDATE_PDF` environment
/// variable is exactly `"1"` and `qpdf --version` can be executed
/// successfully. The bytes are written to a uniquely named file in the
/// system temp directory, checked, and the file is removed again.
///
/// Returns `true` when the check ran and passed, and `false` when it was
/// skipped (validation not enabled, or `qpdf` unavailable).
///
/// # Panics
///
/// Panics if the temp file cannot be written, if `qpdf --check` cannot be
/// started, or if `qpdf --check` exits unsuccessfully (the message includes
/// qpdf's stdout and stderr).
pub fn validate_pdf_with_qpdf(pdf_bytes: &[u8]) -> bool {
    if std::env::var("OFFICE2PDF_VALIDATE_PDF").unwrap_or_default() != "1" {
        return false;
    }

    let qpdf_installed = std::process::Command::new("qpdf")
        .arg("--version")
        .output()
        .is_ok_and(|probe| probe.status.success());
    if !qpdf_installed {
        eprintln!("[WARN] qpdf not installed, skipping PDF validation");
        return false;
    }

    // The timestamp only makes the file name unique, so a clock set before
    // the Unix epoch falls back to 0 instead of aborting the validation.
    let nanos = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map_or(0, |elapsed| elapsed.as_nanos());
    let temp_path = std::env::temp_dir().join(format!(
        "office2pdf_test_{}_{}.pdf",
        std::process::id(),
        nanos
    ));
    std::fs::write(&temp_path, pdf_bytes).expect("should write temp PDF file");

    let check = std::process::Command::new("qpdf")
        .arg("--check")
        .arg(&temp_path)
        .output();
    // Remove the temp file before unwrapping the result so it is not left
    // behind when qpdf could not be started.
    let _ = std::fs::remove_file(&temp_path);
    let output = check.expect("should run qpdf");

    assert!(
        output.status.success(),
        "qpdf --check failed:\nstdout: {}\nstderr: {}",
        String::from_utf8_lossy(&output.stdout),
        String::from_utf8_lossy(&output.stderr),
    );
    true
}
/// Reports whether the `pdftoppm` executable can be run.
///
/// Runs `pdftoppm -v` and returns `true` if the process could be started and
/// either exited successfully or wrote anything to stderr. Returns `false`
/// when the process cannot be started at all.
pub fn is_pdftoppm_available() -> bool {
    match std::process::Command::new("pdftoppm").arg("-v").output() {
        // A non-empty stderr counts as "installed" even on a failing exit
        // status (presumably because the version banner goes to stderr).
        Ok(probe) => probe.status.success() || !probe.stderr.is_empty(),
        Err(_) => false,
    }
}
/// Reports whether the `pdftotext` executable can be run.
///
/// Runs `pdftotext -v` and returns `true` if the process could be started and
/// either exited successfully or wrote anything to stderr. Returns `false`
/// when the process cannot be started at all.
pub fn is_pdftotext_available() -> bool {
    match std::process::Command::new("pdftotext").arg("-v").output() {
        // A non-empty stderr counts as "installed" even on a failing exit
        // status (presumably because the version banner goes to stderr).
        Ok(probe) => probe.status.success() || !probe.stderr.is_empty(),
        Err(_) => false,
    }
}
/// Renders a PDF file to PNG images by invoking `pdftoppm`.
///
/// `output_dir` is created if needed, and `pdftoppm -png -r <dpi>` is run
/// with `output_dir/prefix` as its output root. The PNG files found in
/// `output_dir` for `prefix` are returned as gathered by `collect_pngs`.
///
/// # Panics
///
/// Panics if the output directory cannot be created, if `pdftoppm` cannot be
/// started, or if it exits unsuccessfully.
pub fn render_pdf_to_pngs(
    pdf_path: &Path,
    output_dir: &Path,
    prefix: &str,
    dpi: u32,
) -> Vec<PathBuf> {
    std::fs::create_dir_all(output_dir).expect("create output dir");

    let resolution = dpi.to_string();
    let mut pdftoppm = std::process::Command::new("pdftoppm");
    pdftoppm
        .arg("-png")
        .arg("-r")
        .arg(&resolution)
        .arg(pdf_path)
        .arg(output_dir.join(prefix));

    let exit = pdftoppm.status().expect("run pdftoppm");
    if !exit.success() {
        panic!("pdftoppm failed for {}", pdf_path.display());
    }

    collect_pngs(output_dir, prefix)
}
/// Renders in-memory PDF bytes to PNG images.
///
/// The bytes are written to `output_dir/<prefix>.pdf`, rendered through
/// `render_pdf_to_pngs`, and the intermediate PDF is then removed
/// (best-effort; a failed removal is ignored).
///
/// # Panics
///
/// Panics if the output directory cannot be created or the intermediate PDF
/// cannot be written, and propagates any panic from `render_pdf_to_pngs`.
pub fn render_pdf_bytes_to_pngs(
    pdf_bytes: &[u8],
    output_dir: &Path,
    prefix: &str,
    dpi: u32,
) -> Vec<PathBuf> {
    std::fs::create_dir_all(output_dir).expect("create output dir");

    let mut scratch_name = String::from(prefix);
    scratch_name.push_str(".pdf");
    let scratch_pdf = output_dir.join(scratch_name);
    std::fs::write(&scratch_pdf, pdf_bytes).expect("write temp PDF");

    let pages = render_pdf_to_pngs(&scratch_pdf, output_dir, prefix, dpi);
    std::fs::remove_file(&scratch_pdf).ok();
    pages
}
/// Extracts the text of a PDF file by running `pdftotext`.
///
/// Invokes `pdftotext -enc UTF-8 <pdf_path> -` and returns the captured
/// stdout, decoded lossily as UTF-8.
///
/// # Panics
///
/// Panics if `pdftotext` cannot be started or exits unsuccessfully.
pub fn extract_text_from_pdf_file(pdf_path: &Path) -> String {
    let mut pdftotext = std::process::Command::new("pdftotext");
    pdftotext.arg("-enc").arg("UTF-8").arg(pdf_path).arg("-");

    let run = pdftotext.output().expect("run pdftotext");
    if !run.status.success() {
        panic!("pdftotext failed for {}", pdf_path.display());
    }

    String::from_utf8_lossy(&run.stdout).into_owned()
}
/// Extracts the text of in-memory PDF bytes by running `pdftotext`.
///
/// The bytes are written to a uniquely named temporary PDF inside
/// `work_dir` (created if needed), passed to `extract_text_from_pdf_file`,
/// and the temporary file is then removed (best-effort).
///
/// # Panics
///
/// Panics if `work_dir` cannot be created or the temporary PDF cannot be
/// written, and propagates any panic from `extract_text_from_pdf_file`.
pub fn extract_text_from_pdf_bytes(pdf_bytes: &[u8], work_dir: &Path) -> String {
    use std::sync::atomic::{AtomicU64, Ordering};

    // The process id alone is shared by every test thread in the same
    // binary, so concurrent calls using one `work_dir` would clobber each
    // other's temp file. A per-call counter keeps the names distinct.
    static NEXT_TEMP_ID: AtomicU64 = AtomicU64::new(0);

    std::fs::create_dir_all(work_dir).expect("create work dir");

    // Relaxed suffices: only the uniqueness of the returned value matters.
    let call_id = NEXT_TEMP_ID.fetch_add(1, Ordering::Relaxed);
    let temp_pdf = work_dir.join(format!(
        "_pdftotext_tmp_{}_{}.pdf",
        std::process::id(),
        call_id
    ));
    std::fs::write(&temp_pdf, pdf_bytes).expect("write temp PDF");

    let result = extract_text_from_pdf_file(&temp_pdf);
    let _ = std::fs::remove_file(&temp_pdf);
    result
}
/// Returns the page count reported by `pdfinfo` for a PDF file.
///
/// Runs `pdfinfo <pdf_path>` and parses the value of the first output line
/// that starts with `Pages:`.
///
/// Returns `None` if `pdfinfo` cannot be started, exits unsuccessfully,
/// prints no `Pages:` line, or the value on that line is not a valid `u32`.
pub fn pdf_page_count(pdf_path: &Path) -> Option<u32> {
    let info = std::process::Command::new("pdfinfo")
        .arg(pdf_path)
        .output()
        .ok()
        .filter(|run| run.status.success())?;

    let report = String::from_utf8_lossy(&info.stdout);
    // Only the first `Pages:` line is considered; an unparsable value there
    // yields `None` rather than falling through to later lines.
    let pages_field = report
        .lines()
        .find_map(|row| row.strip_prefix("Pages:"))?;
    pages_field.trim().parse().ok()
}
/// Collects the PNG pages rendered under `prefix` in `dir`, sorted by path.
///
/// Only files named `<prefix>-<digits>.png` are returned, which is the
/// naming scheme `pdftoppm` uses for its per-page output. Matching the whole
/// stem (rather than just "starts with `prefix`") keeps images rendered
/// under a longer prefix — e.g. `doc2-1.png` when `prefix` is `doc` — out of
/// the result. Directory entries that cannot be read are skipped.
///
/// # Panics
///
/// Panics if `dir` cannot be read.
fn collect_pngs(dir: &Path, prefix: &str) -> Vec<PathBuf> {
    /// Returns whether `path` is named `<prefix>-<digits>.png`.
    fn is_rendered_page(path: &Path, prefix: &str) -> bool {
        if !path.extension().is_some_and(|ext| ext == "png") {
            return false;
        }
        let stem = match path.file_stem() {
            Some(stem) => stem.to_string_lossy(),
            None => return false,
        };
        let matched = stem
            .strip_prefix(prefix)
            .and_then(|rest| rest.strip_prefix('-'))
            .is_some_and(|page| !page.is_empty() && page.bytes().all(|b| b.is_ascii_digit()));
        matched
    }

    let mut pngs: Vec<PathBuf> = std::fs::read_dir(dir)
        .expect("read output dir")
        .filter_map(|entry| entry.ok())
        .map(|entry| entry.path())
        .filter(|path| is_rendered_page(path, prefix))
        .collect();
    pngs.sort();
    pngs
}