use crate::{Document, Error, Extractor, Result};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::process::Command;
/// Lookup table pairing a file extension (lowercase, no leading dot) with the
/// pandoc reader name that is handed to `--from` for that extension.
///
/// Several extensions may share one reader (`tex`/`latex`, `html`/`htm`).
const SUPPORTED: &[(&str, &str)] = &[
    // Extensions whose reader name is identical to the extension.
    ("docx", "docx"),
    ("pptx", "pptx"),
    ("epub", "epub"),
    ("rtf", "rtf"),
    ("odt", "odt"),
    // Both LaTeX spellings use the `latex` reader.
    ("tex", "latex"),
    ("latex", "latex"),
    // Both HTML spellings use the `html` reader.
    ("html", "html"),
    ("htm", "html"),
];
/// Extensions reported by the `Extractor::extensions` implementation below.
///
/// This list is maintained by hand and mirrors the first column of
/// `SUPPORTED`; keep the two in sync when adding a format.
const EXTENSIONS: &[&str] = &[
    "docx",
    "pptx",
    "epub",
    "rtf",
    "odt",
    "tex",
    "latex",
    "html",
    "htm",
];
/// Extractor backend that converts documents by invoking a pandoc executable.
///
/// `Debug` is derived so that `Result<PandocExtractor, _>` works with
/// `unwrap_err`/`expect_err` and the type can be logged; `Clone` is cheap
/// because the only state is the executable path.
#[derive(Debug, Clone)]
pub struct PandocExtractor {
    /// Path (or bare command name) of the pandoc executable to run.
    binary: PathBuf,
}
impl PandocExtractor {
pub fn new() -> Result<Self> {
Self::with_binary("pandoc")
}
pub fn with_binary(binary: impl Into<PathBuf>) -> Result<Self> {
let binary = binary.into();
let result = Command::new(&binary).arg("--version").output();
match result {
Ok(output) if output.status.success() => Ok(Self { binary }),
Ok(output) => Err(Error::MissingDependency {
name: "pandoc".into(),
details: format!(
"{} --version exited {:?}: {}",
binary.display(),
output.status.code(),
String::from_utf8_lossy(&output.stderr).trim()
),
}),
Err(e) => Err(Error::MissingDependency {
name: "pandoc".into(),
details: format!("could not execute {}: {e}", binary.display()),
}),
}
}
pub fn binary(&self) -> &Path {
&self.binary
}
#[must_use]
pub fn pandoc_from(ext: &str) -> Option<&'static str> {
SUPPORTED
.iter()
.find(|(e, _)| *e == ext)
.map(|(_, fmt)| *fmt)
}
}
impl Extractor for PandocExtractor {
    /// File extensions (no leading dot) this backend advertises.
    fn extensions(&self) -> &[&'static str] {
        EXTENSIONS
    }

    /// Identifier of this backend.
    fn name(&self) -> &'static str {
        "pandoc"
    }

    /// Converts the file at `path` to Markdown by running
    /// `pandoc --from <reader> --to gfm -- <path>` and capturing stdout.
    ///
    /// The reader is chosen from the file extension, compared
    /// case-insensitively. The returned [`Document`] carries only the
    /// Markdown text; `title` is `None` and `metadata` is empty.
    ///
    /// # Errors
    ///
    /// * [`Error::ParseError`] when `path` has no extension, or the extension
    ///   is not valid UTF-8.
    /// * [`Error::UnsupportedFormat`] when the extension has no pandoc reader.
    /// * [`Error::SidecarFailure`] when pandoc cannot be spawned (`code` is
    ///   `None`) or exits unsuccessfully (`code` and trimmed stderr are set).
    fn extract(&self, path: &Path) -> Result<Document> {
        let ext = path
            .extension()
            .and_then(|s| s.to_str())
            .map(str::to_ascii_lowercase)
            .ok_or_else(|| Error::ParseError(format!("no file extension on {}", path.display())))?;

        let from = Self::pandoc_from(&ext).ok_or_else(|| {
            Error::UnsupportedFormat(format!("pandoc backend does not handle .{ext}"))
        })?;

        let output = Command::new(&self.binary)
            // `--` ends option parsing so a file name that begins with `-`
            // is treated as an input file rather than a pandoc flag.
            .args(["--from", from, "--to", "gfm", "--"])
            // Pass the path as an OS string: no UTF-8 round-trip is needed,
            // so non-UTF-8 file names are handed to pandoc unchanged.
            .arg(path)
            .output()
            .map_err(|e| Error::SidecarFailure {
                name: "pandoc".into(),
                code: None,
                stderr: format!("failed to spawn {}: {e}", self.binary.display()),
            })?;

        if !output.status.success() {
            return Err(Error::SidecarFailure {
                name: "pandoc".into(),
                code: output.status.code(),
                stderr: String::from_utf8_lossy(&output.stderr).trim().to_owned(),
            });
        }

        Ok(Document {
            // Lossy decoding: invalid UTF-8 in the output is replaced rather
            // than turned into an error.
            markdown: String::from_utf8_lossy(&output.stdout).into_owned(),
            title: None,
            metadata: HashMap::new(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Stand-in that exposes the same `extensions`/`name` values as the real
    /// backend without needing a pandoc executable to construct it.
    struct FakePandoc;

    impl Extractor for FakePandoc {
        fn extensions(&self) -> &[&'static str] {
            EXTENSIONS
        }

        fn extract(&self, _: &Path) -> Result<Document> {
            unreachable!("FakePandoc only used for trait-surface tests")
        }

        fn name(&self) -> &'static str {
            "pandoc"
        }
    }

    #[test]
    fn covers_expected_office_formats() {
        let exts = FakePandoc.extensions();
        for required in ["docx", "pptx", "epub", "rtf", "odt", "tex", "html"] {
            assert!(
                exts.contains(&required),
                "expected pandoc to handle .{required}, got {exts:?}"
            );
        }
    }

    #[test]
    fn name_identifies_backend() {
        assert_eq!(FakePandoc.name(), "pandoc");
    }

    #[test]
    fn pandoc_from_maps_extensions_to_reader_names() {
        assert_eq!(PandocExtractor::pandoc_from("docx"), Some("docx"));
        assert_eq!(PandocExtractor::pandoc_from("tex"), Some("latex"));
        assert_eq!(PandocExtractor::pandoc_from("htm"), Some("html"));
        assert_eq!(PandocExtractor::pandoc_from("pdf"), None);
        assert_eq!(PandocExtractor::pandoc_from("xyz"), None);
    }

    /// `EXTENSIONS` and `SUPPORTED` are maintained by hand; this guards
    /// against one being extended without the other.
    #[test]
    fn advertised_extensions_match_reader_table() {
        for (ext, _) in SUPPORTED {
            assert!(
                EXTENSIONS.contains(ext),
                ".{ext} has a reader but is not advertised in EXTENSIONS"
            );
        }
        for ext in EXTENSIONS {
            assert!(
                PandocExtractor::pandoc_from(ext).is_some(),
                ".{ext} is advertised but has no reader in SUPPORTED"
            );
        }
    }

    #[test]
    fn missing_pandoc_returns_typed_error() {
        let result = PandocExtractor::with_binary("/nonexistent-pandoc-path");
        assert!(matches!(
            result,
            Err(Error::MissingDependency { name, .. }) if name == "pandoc"
        ));
    }

    /// End-to-end check against a real pandoc; ignored by default because it
    /// needs the executable to be installed.
    #[test]
    #[ignore = "requires `pandoc` on PATH"]
    fn extracts_a_real_html_file() {
        use std::io::Write;

        let extractor = PandocExtractor::new().expect("pandoc not on PATH");
        let mut tmp = tempfile::Builder::new().suffix(".html").tempfile().unwrap();
        write!(tmp, "<html><body><h1>Hello</h1><p>World</p></body></html>").unwrap();
        // Make sure the bytes are on disk before pandoc opens the file.
        tmp.flush().unwrap();

        let doc = extractor.extract(tmp.path()).expect("extraction failed");
        assert!(
            doc.markdown.contains("Hello"),
            "expected 'Hello' in output: {:?}",
            doc.markdown
        );
        assert!(
            doc.markdown.contains("World"),
            "expected 'World' in output: {:?}",
            doc.markdown
        );
    }
}