Skip to main content

bookforge_pdf/
tools.rs

1//! Discovery and invocation of poppler command-line tools.
2//!
3//! Resolution order: `POPPLER_PATH` environment variable (a directory
4//! containing the binaries), then the system `PATH`. External binaries
5//! follow the EPUBCheck precedent (ROADMAP §1.6, §8.4): subprocesses
6//! are acceptable, embedded runtimes are not.
7
8use std::{
9    path::{Path, PathBuf},
10    process::Command,
11};
12
13#[derive(Debug, thiserror::Error)]
14pub enum ToolError {
15    #[error(
16        "poppler tool '{0}' not found. Install poppler and either add it to PATH or set POPPLER_PATH to its bin directory."
17    )]
18    NotFound(&'static str),
19
20    #[error("'{tool}' failed (exit {code:?}): {stderr}")]
21    Failed {
22        tool: &'static str,
23        code: Option<i32>,
24        stderr: String,
25    },
26
27    #[error(transparent)]
28    Io(#[from] std::io::Error),
29}
30
/// Resolved locations of the poppler executables this module invokes.
#[derive(Clone, Debug)]
pub struct PopplerTools {
    /// Path to the `pdftohtml` binary.
    pub pdftohtml: PathBuf,
    /// Path to the `pdftotext` binary.
    pub pdftotext: PathBuf,
}
36
37impl PopplerTools {
38    /// Locate the required poppler binaries or explain what is missing.
39    pub fn discover() -> Result<Self, ToolError> {
40        Ok(Self {
41            pdftohtml: find_tool("pdftohtml")?,
42            pdftotext: find_tool("pdftotext")?,
43        })
44    }
45
46    /// `pdftohtml -v` prints its version banner on stderr.
47    pub fn version(&self) -> Option<String> {
48        let output = Command::new(&self.pdftohtml).arg("-v").output().ok()?;
49        let banner = String::from_utf8_lossy(&output.stderr);
50        banner.lines().next().map(|line| line.trim().to_string())
51    }
52
53    /// Run `pdftohtml -xml` and return the XML document.
54    pub fn pdf_to_xml(&self, pdf: &Path) -> Result<String, ToolError> {
55        let output = Command::new(&self.pdftohtml)
56            .args(["-xml", "-i", "-stdout", "-q", "-enc", "UTF-8"])
57            .arg(pdf)
58            .output()?;
59        if !output.status.success() {
60            return Err(ToolError::Failed {
61                tool: "pdftohtml",
62                code: output.status.code(),
63                stderr: String::from_utf8_lossy(&output.stderr).into_owned(),
64            });
65        }
66        Ok(String::from_utf8_lossy(&output.stdout).into_owned())
67    }
68
69    /// Raw text via `pdftotext`, used as the coverage baseline: it makes
70    /// no layout decisions, so its character count approximates "all the
71    /// text poppler can see".
72    pub fn pdf_to_text(&self, pdf: &Path) -> Result<String, ToolError> {
73        let output = Command::new(&self.pdftotext)
74            .args(["-enc", "UTF-8", "-q"])
75            .arg(pdf)
76            .arg("-")
77            .output()?;
78        if !output.status.success() {
79            return Err(ToolError::Failed {
80                tool: "pdftotext",
81                code: output.status.code(),
82                stderr: String::from_utf8_lossy(&output.stderr).into_owned(),
83            });
84        }
85        Ok(String::from_utf8_lossy(&output.stdout).into_owned())
86    }
87}
88
89fn find_tool(name: &'static str) -> Result<PathBuf, ToolError> {
90    let exe = if cfg!(windows) {
91        format!("{name}.exe")
92    } else {
93        name.to_string()
94    };
95
96    if let Ok(dir) = std::env::var("POPPLER_PATH") {
97        let candidate = Path::new(&dir).join(&exe);
98        if candidate.is_file() {
99            return Ok(candidate);
100        }
101    }
102
103    if let Some(paths) = std::env::var_os("PATH") {
104        for dir in std::env::split_paths(&paths) {
105            let candidate = dir.join(&exe);
106            if candidate.is_file() {
107                return Ok(candidate);
108            }
109        }
110    }
111
112    Err(ToolError::NotFound(name))
113}