use std::path::{Path, PathBuf};
use anyhow::{Context, Result};
/// Converts the PDF at `path` to Markdown by running an external Python script.
///
/// The script is located with `find_pdf_script` and the interpreter with
/// `find_python`. The interpreter is invoked as `<python> -- <script> <path>`
/// and whatever it writes to standard output is returned as the Markdown text.
///
/// # Errors
///
/// Returns an error when:
/// - the script or the interpreter cannot be located,
/// - the interpreter process cannot be run,
/// - the process exits unsuccessfully (with an install hint when its stderr
///   contains `pymupdf4llm not installed`),
/// - standard output is not valid UTF-8, or
/// - standard output is empty or whitespace only.
pub fn pdf_to_markdown(path: &Path) -> Result<String> {
    let _span = tracing::info_span!("pdf_to_markdown", path = %path.display()).entered();

    let script_path = find_pdf_script()?;
    let interpreter = find_python()?;

    // `--` is placed ahead of the script path; presumably this keeps a script
    // path that starts with `-` from being read as an interpreter option.
    let mut command = std::process::Command::new(&interpreter);
    command.arg("--").arg(&script_path).arg(path);
    let run = command
        .output()
        .with_context(|| format!("Failed to run `{}`. Is Python installed?", interpreter))?;

    if run.status.success() {
        let text = String::from_utf8(run.stdout)
            .context("PDF converter produced non-UTF-8 output")?;
        // A successful exit with blank output is still treated as a failure.
        if text.trim().is_empty() {
            tracing::warn!(path = %path.display(), "PDF produced no text (possibly image-only)");
            return Err(anyhow::anyhow!("PDF produced no text output"));
        }
        tracing::info!(path = %path.display(), bytes = text.len(), "PDF text extracted");
        return Ok(text);
    }

    // Failure path: classify the error from what the script wrote to stderr.
    let diagnostics = String::from_utf8_lossy(&run.stderr);
    let missing_dependency = diagnostics.contains("pymupdf4llm not installed");
    if missing_dependency {
        tracing::warn!("pymupdf4llm not installed");
        return Err(anyhow::anyhow!(
            "pymupdf4llm not installed. Run: pip install pymupdf4llm"
        ));
    }
    tracing::warn!(stderr = %diagnostics, "PDF conversion failed");
    Err(anyhow::anyhow!(
        "PDF conversion failed: {}",
        diagnostics.trim()
    ))
}
fn find_pdf_script() -> Result<String> {
if let Ok(script) = std::env::var("CQS_PDF_SCRIPT") {
tracing::warn!(script = %script, "Using custom PDF script from CQS_PDF_SCRIPT env var");
let p = PathBuf::from(&script);
if p.extension().is_none_or(|e| e != "py") {
anyhow::bail!("CQS_PDF_SCRIPT must have .py extension (got: {}).", script);
}
if p.exists() {
return Ok(script);
}
tracing::warn!(path = %script, "CQS_PDF_SCRIPT set but file not found");
}
let mut candidates = vec![PathBuf::from("scripts/pdf_to_md.py")];
if let Some(exe_relative) = std::env::current_exe()
.ok()
.and_then(|p| p.parent().map(|d| d.join("../scripts/pdf_to_md.py")))
{
candidates.push(exe_relative);
}
for candidate in &candidates {
if candidate.exists() {
return Ok(candidate.to_string_lossy().to_string());
}
}
anyhow::bail!(
"scripts/pdf_to_md.py not found. \
Run cqs convert from the project root, or set CQS_PDF_SCRIPT env var."
)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    // These tests change process-wide state (an environment variable and, in
    // some cases, the current working directory); each one is marked
    // `#[serial_test::serial]`.

    /// Remembers the value an environment variable had before a test changed
    /// it, and puts that value back (or removes the variable) when dropped.
    struct EnvGuard {
        key: &'static str,
        prev: Option<String>,
    }

    impl EnvGuard {
        /// Sets `key` to `val`, recording the previous value for restoration.
        fn set(key: &'static str, val: &str) -> Self {
            let previous = std::env::var(key).ok();
            std::env::set_var(key, val);
            Self {
                key,
                prev: previous,
            }
        }

        /// Removes `key`, recording the previous value for restoration.
        fn unset(key: &'static str) -> Self {
            let previous = std::env::var(key).ok();
            std::env::remove_var(key);
            Self {
                key,
                prev: previous,
            }
        }
    }

    impl Drop for EnvGuard {
        fn drop(&mut self) {
            if let Some(previous) = self.prev.as_deref() {
                std::env::set_var(self.key, previous);
            } else {
                std::env::remove_var(self.key);
            }
        }
    }

    /// An existing `.py` file named by `CQS_PDF_SCRIPT` is returned verbatim.
    #[test]
    #[serial_test::serial]
    fn test_find_pdf_script_env_var_existing_file() {
        let tmp = tempfile::TempDir::new().unwrap();
        let script = tmp.path().join("my_script.py");
        fs::write(&script, "# placeholder").unwrap();
        let script_str = script.to_str().unwrap();

        let _guard = EnvGuard::set("CQS_PDF_SCRIPT", script_str);
        let outcome = find_pdf_script();

        assert!(
            outcome.is_ok(),
            "should succeed when env var points to existing file"
        );
        assert_eq!(outcome.unwrap(), script_str);
    }

    /// A `CQS_PDF_SCRIPT` path that does not exist must never be the result;
    /// whether the fallback search succeeds is not asserted here.
    #[test]
    #[serial_test::serial]
    fn test_find_pdf_script_env_var_missing_file_falls_through() {
        let tmp = tempfile::TempDir::new().unwrap();
        let ghost = tmp.path().join("does_not_exist.py");
        assert!(!ghost.exists());
        let ghost_str = ghost.to_str().unwrap();

        let _guard = EnvGuard::set("CQS_PDF_SCRIPT", ghost_str);
        let outcome = find_pdf_script();

        if let Ok(found) = outcome.as_ref() {
            assert_ne!(
                found.as_str(),
                ghost_str,
                "env-var ghost path must not be returned"
            );
        }
    }

    /// With the env var unset, `scripts/pdf_to_md.py` under the current
    /// working directory is found.
    #[test]
    #[serial_test::serial]
    fn test_find_pdf_script_cwd_relative_path() {
        let tmp = tempfile::TempDir::new().unwrap();
        let scripts_dir = tmp.path().join("scripts");
        fs::create_dir_all(&scripts_dir).unwrap();
        fs::write(scripts_dir.join("pdf_to_md.py"), "# placeholder").unwrap();

        let original_cwd = std::env::current_dir().unwrap();
        std::env::set_current_dir(tmp.path()).unwrap();
        let _guard = EnvGuard::unset("CQS_PDF_SCRIPT");

        let outcome = find_pdf_script();

        // Put the working directory back before any assertion can panic.
        std::env::set_current_dir(&original_cwd).unwrap();

        assert!(
            outcome.is_ok(),
            "should find scripts/pdf_to_md.py relative to CWD"
        );
        let found = outcome.unwrap();
        assert!(
            found.contains("pdf_to_md.py"),
            "returned path should contain pdf_to_md.py, got: {}",
            found
        );
    }

    /// From an empty working directory with the env var unset, any error that
    /// comes back must name the script or the env var. An `Ok` result is
    /// tolerated (presumably because the executable-relative candidate may
    /// exist in some build layouts -- TODO confirm).
    #[test]
    #[serial_test::serial]
    fn test_find_pdf_script_not_found_returns_error() {
        let empty_dir = tempfile::TempDir::new().unwrap();
        let original_cwd = std::env::current_dir().unwrap();
        std::env::set_current_dir(empty_dir.path()).unwrap();
        let _guard = EnvGuard::unset("CQS_PDF_SCRIPT");

        let outcome = find_pdf_script();

        std::env::set_current_dir(&original_cwd).unwrap();

        if let Err(err) = outcome {
            let msg = err.to_string();
            let mentions_script = msg.contains("pdf_to_md.py");
            assert!(
                mentions_script || msg.contains("CQS_PDF_SCRIPT"),
                "error message should mention the script name or env var, got: {}",
                msg
            );
        }
    }

    /// An existing file without a `.py` extension is rejected outright.
    #[test]
    #[serial_test::serial]
    fn test_find_pdf_script_env_var_non_py_extension_rejected() {
        let tmp = tempfile::TempDir::new().unwrap();
        let script = tmp.path().join("converter.sh");
        fs::write(&script, "#!/bin/sh\necho hello").unwrap();

        let _guard = EnvGuard::set("CQS_PDF_SCRIPT", script.to_str().unwrap());
        let outcome = find_pdf_script();

        assert!(outcome.is_err(), "non-.py extension should be rejected");
        let msg = outcome.unwrap_err().to_string();
        assert!(
            msg.contains(".py extension"),
            "error should mention .py requirement, got: {}",
            msg
        );
    }
}
/// Thin wrapper that delegates to `find_python` in the parent module.
///
/// The returned string is used by `pdf_to_markdown` as the program passed to
/// `std::process::Command::new`. How the interpreter is chosen is decided by
/// the parent-module function and is not visible from here.
fn find_python() -> Result<String> {
super::find_python()
}