use super::big_api::fetch_url_with_retry;
use super::PipelineError;
use super::PipelineResultExt;
use std::fs;
use std::path::{Path, PathBuf};
/// Payload of a completed download together with its checksum.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct DownloadResult {
    /// Raw bytes of the downloaded resource.
    pub(crate) data: Vec<u8>,
    /// Hex-encoded SHA-256 digest of `data`.
    pub(crate) sha256: String,
}
/// Fetches `url` and returns the body together with its SHA-256 digest.
///
/// The digest is rendered as lowercase hexadecimal.
///
/// # Errors
///
/// Propagates any error returned by `fetch_url_with_retry`.
pub(crate) fn download_with_sha256(url: &str) -> Result<DownloadResult, PipelineError> {
    use sha2::Digest;

    // NOTE(review): `5` and `300` are presumably the retry count and a
    // timeout/back-off value; confirm against `fetch_url_with_retry`.
    let data = fetch_url_with_retry(url, 5, 300, "PDF download")?;

    // One-shot hashing: equivalent to new() + update() + finalize().
    let digest = sha2::Sha256::digest(&data);

    Ok(DownloadResult {
        sha256: format!("{:x}", digest),
        data,
    })
}
/// Ensures the Kemendagri PDF is present in `cache_dir`, downloading it from
/// `pdf_url` on a cache miss, and returns the path to the cached file.
///
/// The cache directory is created if it does not exist. When
/// `kemendagri.pdf` is already present, no network access happens and the
/// existing file is returned as-is.
///
/// On a cache miss the download is first written to `kemendagri.pdf.part` and
/// then renamed into place, so an interrupted write never leaves a truncated
/// file under the final name that later runs would mistake for a valid cache.
///
/// # Errors
///
/// Returns an error if the cache directory cannot be created, the download
/// fails, or the file cannot be written or moved into place.
pub(crate) fn ensure_pdf(pdf_url: &str, cache_dir: &Path) -> Result<PathBuf, PipelineError> {
    fs::create_dir_all(cache_dir).ctx("failed to create cache directory")?;
    let pdf_path = cache_dir.join("kemendagri.pdf");
    if !pdf_path.exists() {
        eprintln!("Downloading Kemendagri PDF (57 MB)...");
        let bytes = download_with_sha256(pdf_url)?;

        // Stage next to the final path so the rename stays on one filesystem.
        let partial_path = cache_dir.join("kemendagri.pdf.part");
        fs::write(&partial_path, &bytes.data).ctx("failed to write PDF")?;
        fs::rename(&partial_path, &pdf_path).ctx("failed to move PDF into place")?;

        eprintln!("PDF SHA-256: {}", bytes.sha256);
    }
    Ok(pdf_path)
}
/// Extracts the text of `pdf_path` by running the external `pdftotext` tool
/// with `-layout`, capturing its standard output.
///
/// Output that is not valid UTF-8 is converted lossily (invalid sequences
/// become U+FFFD) rather than rejected.
///
/// # Errors
///
/// Returns an error if `pdftotext` cannot be launched, or if it exits with a
/// non-success status (the error message then includes its stderr).
pub(crate) fn extract_text(pdf_path: &Path) -> Result<String, PipelineError> {
    eprintln!("Extracting text from PDF...");
    let output = std::process::Command::new("pdftotext")
        .arg("-layout")
        .arg(pdf_path)
        // "-" as the output file makes pdftotext write to stdout.
        .arg("-")
        .output()
        .ctx("pdftotext failed")?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        return Err(PipelineError::new(format!(
            "pdftotext exited with status {}: {}",
            output.status, stderr
        )));
    }

    // Reuse the stdout buffer when it is already valid UTF-8 instead of
    // copying it; only fall back to a lossy (allocating) conversion when the
    // bytes are invalid. The resulting string is the same either way.
    let text = String::from_utf8(output.stdout)
        .unwrap_or_else(|err| String::from_utf8_lossy(err.as_bytes()).into_owned());
    Ok(text)
}