use std::path::Path;
use dbmd_core::extract::{self, ExtractError};
use crate::cli::ExtractArgs;
use crate::context::Context;
use crate::error::{CliError, CliResult, ExitCode};
/// Entry point for the `extract` subcommand.
///
/// Extracts the document named by `args.file`, then hands the result to
/// [`emit`], which targets `args.out` when set and stdout otherwise:
///
/// * with `ctx.json` set, the whole extraction result is serialized as
///   pretty-printed JSON and emitted with a trailing newline requested;
/// * otherwise only the extracted text is emitted, with no newline added.
///
/// # Errors
///
/// Extraction failures are translated by [`map_extract_error`]; a JSON
/// encoding failure becomes a runtime `CliError`; write failures come back
/// from [`emit`] unchanged.
pub fn run(ctx: &Context, args: &ExtractArgs) -> CliResult {
    let document = extract::extract(Path::new(&args.file)).map_err(map_extract_error)?;

    // Plain-text mode needs no encoding step, so handle it first and return.
    if !ctx.json {
        return emit(&args.out, &document.text, false);
    }

    let encoded = serde_json::to_string_pretty(&document)
        .map_err(|e| CliError::runtime(format!("failed to encode JSON: {e}")))?;
    emit(&args.out, &encoded, true)
}
/// Writes `content` to the `--out` file when one is given, otherwise to stdout.
///
/// When `add_trailing_newline` is true the output is guaranteed to end with
/// exactly one `'\n'`: a newline is appended only if `content` does not already
/// end with one. The same rule applies to both sinks, so `--out FILE` and a
/// shell redirect of stdout produce identical bytes.
///
/// # Errors
///
/// * Whatever [`refuse_symlink_dest`] returns when the destination is, or is
///   reached through, a symlink.
/// * `IO_ERROR` when the file cannot be written, or when writing to or
///   flushing stdout fails for any reason other than a closed pipe.
///
/// A `BrokenPipe` on stdout (the reader went away, e.g. `| head`) is treated
/// as success rather than reported as a failure.
fn emit(out: &Option<String>, content: &str, add_trailing_newline: bool) -> CliResult {
    // One rule for both sinks: only add the newline when it is actually missing.
    let needs_newline = add_trailing_newline && !content.ends_with('\n');

    match out {
        Some(path) => {
            refuse_symlink_dest(path)?;

            // Borrow the caller's buffer unless a newline really has to be appended;
            // this avoids copying the whole extracted text in the common case.
            let body: std::borrow::Cow<'_, str> = if needs_newline {
                std::borrow::Cow::Owned(format!("{content}\n"))
            } else {
                std::borrow::Cow::Borrowed(content)
            };

            std::fs::write(path, body.as_bytes()).map_err(|e| {
                CliError::new(
                    ExitCode::Runtime,
                    "IO_ERROR",
                    format!("failed to write {path}: {e}"),
                )
            })?;
            Ok(())
        }
        None => {
            use std::io::Write;

            let stdout = std::io::stdout();
            let mut lock = stdout.lock();

            // Flush explicitly: stdout is buffered, and anything still buffered at
            // process exit is flushed with errors discarded. Without this a failed
            // write (for example a full disk behind a redirect) would go unnoticed
            // and the command would still report success.
            let mut write_and_flush = || -> std::io::Result<()> {
                lock.write_all(content.as_bytes())?;
                if needs_newline {
                    lock.write_all(b"\n")?;
                }
                lock.flush()
            };

            match write_and_flush() {
                Ok(()) => Ok(()),
                // The consumer closed the pipe early; that is not our failure.
                Err(e) if e.kind() == std::io::ErrorKind::BrokenPipe => Ok(()),
                Err(e) => Err(CliError::new(
                    ExitCode::Runtime,
                    "IO_ERROR",
                    format!("write failed: {e}"),
                )),
            }
        }
    }
}
/// Rejects an `--out` destination that is a symlink or is reached through one.
///
/// The caller writes with `std::fs::write`, which follows symlinks, so a
/// symlinked destination could overwrite a file somewhere else entirely. The
/// check runs in three stages:
///
/// 1. **Leaf.** If `path` itself exists and is a symlink, refuse. A missing
///    leaf is fine (the file is about to be created).
/// 2. **Deepest existing ancestor.** Starting from the parent directory, climb
///    until a path that exists is found. If nothing exists, or the path has no
///    parent component, there is nothing to traverse and the destination is
///    accepted.
/// 3. **Component walk.** Walk that ancestor one component at a time, keeping
///    `real` (the symlink-free location reached so far, used for probing) and
///    `lexical` (the path as the user spelled it, used in messages). A symlink
///    found while "on real ground" is refused.
///
/// "Real ground" means the walk is standing in a directory with no unresolved
/// symlink: a relative path starts there (at the canonicalized current
/// directory), and an absolute path gets there at its first real directory.
/// A symlink met *before* that point in an absolute path is resolved rather
/// than refused (presumably to tolerate system-level aliases such as macOS's
/// `/tmp`; confirm against the project's intent). Once it has been resolved
/// the walk is on real ground, so any later symlink is refused.
///
/// # Errors
///
/// * `OUT_IS_SYMLINK` when the leaf or a traversed component is a symlink.
/// * `IO_ERROR` when a path cannot be inspected for a reason other than
///   "not found", or the current directory cannot be determined/canonicalized.
///
/// This is a check-then-write guard: it inspects the filesystem before the
/// write and cannot prevent a symlink being swapped in between the two steps.
fn refuse_symlink_dest(path: &str) -> Result<(), CliError> {
    use std::path::Component;

    // Builds the refusal error; `p` is the offending component as the user spelled it.
    let refuse = |p: &Path| {
        CliError::new(
            ExitCode::Runtime,
            "OUT_IS_SYMLINK",
            format!(
                "refusing to write {path}: the path is reached through a symlink ({})",
                p.display()
            ),
        )
        .with_hint(
            "extract --out will not follow a symlink (it could overwrite a file elsewhere); \
             remove the symlink or choose a destination with no symlinked component",
        )
    };
    // Wraps an unexpected I/O failure hit while inspecting `p`.
    let inspect_io_err = |p: &Path, e: std::io::Error| {
        CliError::new(
            ExitCode::Runtime,
            "IO_ERROR",
            format!("failed to inspect {}: {e}", p.display()),
        )
    };

    // Stage 1: the destination itself. `symlink_metadata` does not follow a
    // symlink at the final component, so a symlinked leaf is seen as a symlink.
    let leaf = Path::new(path);
    match std::fs::symlink_metadata(leaf) {
        Ok(meta) if meta.file_type().is_symlink() => return Err(refuse(leaf)),
        Ok(_) => {}
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
        Err(e) => return Err(inspect_io_err(leaf, e)),
    }

    // A bare file name (no parent, or an empty one) has no directory chain to vet.
    let parent = match leaf.parent() {
        Some(p) if !p.as_os_str().is_empty() => p,
        _ => return Ok(()),
    };

    // Stage 2: climb to the deepest ancestor that exists. Components that do
    // not exist yet cannot be symlinks, so only the existing prefix matters.
    let mut existing = parent.to_path_buf();
    let exists = |p: &Path| match std::fs::symlink_metadata(p) {
        Ok(_) => Ok(true),
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false),
        Err(e) => Err(e),
    };
    loop {
        match exists(&existing) {
            Ok(true) => break,
            Ok(false) => {
                if !existing.pop() || existing.as_os_str().is_empty() {
                    return Ok(());
                }
            }
            Err(e) => return Err(inspect_io_err(&existing, e)),
        }
    }

    // Stage 3: walk the existing ancestor component by component.
    let mut real = std::path::PathBuf::new();
    let mut lexical = std::path::PathBuf::new();
    let mut on_real_ground = false;
    if existing.is_relative() {
        // A relative path is anchored at the current directory; canonicalize it
        // so probes are made against a symlink-free base.
        real = match std::env::current_dir() {
            Ok(cwd) => match cwd.canonicalize() {
                Ok(c) => c,
                Err(e) => return Err(inspect_io_err(&cwd, e)),
            },
            Err(e) => return Err(inspect_io_err(Path::new("."), e)),
        };
        on_real_ground = true;
    }
    for comp in existing.components() {
        match comp {
            Component::Prefix(_) | Component::RootDir => {
                real.push(comp.as_os_str());
                lexical.push(comp.as_os_str());
            }
            Component::CurDir => {}
            Component::ParentDir => {
                // `real` holds no unresolved symlink, so popping it mirrors how
                // `..` is resolved against the directory actually reached.
                real.pop();
                lexical.pop();
            }
            Component::Normal(name) => {
                lexical.push(name);
                let probe = real.join(name);
                match std::fs::symlink_metadata(&probe) {
                    Ok(meta) if meta.file_type().is_symlink() => {
                        if on_real_ground {
                            return Err(refuse(&lexical));
                        }
                        match probe.canonicalize() {
                            Ok(c) => {
                                real = c;
                                // The canonical target is a fully resolved location,
                                // so from here on we are on real ground. Without this
                                // a second symlink directly below a tolerated leading
                                // one (e.g. `/tmp/evil` where `/tmp` is itself a
                                // symlink) would be followed instead of refused.
                                on_real_ground = true;
                            }
                            Err(e) => return Err(inspect_io_err(&probe, e)),
                        }
                    }
                    Ok(meta) => {
                        if meta.is_dir() {
                            on_real_ground = true;
                        }
                        real = probe;
                    }
                    // Nothing exists here, so there is nothing further to follow.
                    Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(()),
                    Err(e) => return Err(inspect_io_err(&probe, e)),
                }
            }
        }
    }
    Ok(())
}
/// Translates a core `ExtractError` into the CLI's error type.
///
/// Every variant maps to `ExitCode::Runtime` with the error's display text as
/// the message. The error code is the one reported by `err.code()`, except for
/// I/O failures, which always use the fixed code `"IO_ERROR"`. Unsupported
/// formats and encrypted documents additionally carry a hint for the user.
fn map_extract_error(err: ExtractError) -> CliError {
    // Pick the optional user-facing hint first; the match stays exhaustive so a
    // new `ExtractError` variant forces a decision here.
    let hint: Option<&'static str> = match &err {
        ExtractError::UnsupportedFormat(_) => Some(
            "supported document types: .pdf, .docx, .xlsx/.xlsm/.xlsb/.ods, .epub, .html/.htm/.xhtml (detected by extension)",
        ),
        ExtractError::Encrypted(_) => {
            Some("the document is password-protected; dbmd extract cannot open it")
        }
        ExtractError::Parse { .. } => None,
        ExtractError::Io(_) => None,
    };

    let message = err.to_string();
    let base = if matches!(err, ExtractError::Io(_)) {
        CliError::new(ExitCode::Runtime, "IO_ERROR", message)
    } else {
        CliError::new(ExitCode::Runtime, err.code(), message)
    };

    match hint {
        Some(text) => base.with_hint(text),
        None => base,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Turns a filesystem path into the `--out` argument shape `emit` takes.
    fn out_arg(dest: &std::path::Path) -> Option<String> {
        Some(dest.to_string_lossy().into_owned())
    }

    /// Reads a file back as a string, panicking if it cannot be read.
    fn read_back(file: &std::path::Path) -> String {
        std::fs::read_to_string(file).unwrap()
    }

    /// A destination that is itself a symlink is refused, and neither the
    /// link nor the file it points at is modified.
    #[test]
    #[cfg(unix)]
    fn out_symlink_is_refused_and_target_untouched() {
        let sandbox = tempfile::tempdir().unwrap();
        let target = sandbox.path().join("victim.conf");
        std::fs::write(&target, "SENSITIVE-ORIGINAL\n").unwrap();
        let symlinked_out = sandbox.path().join("innocent-output.txt");
        std::os::unix::fs::symlink(&target, &symlinked_out).unwrap();

        let err = emit(&out_arg(&symlinked_out), "POISONED-BYTES-FROM-DOCUMENT", false)
            .expect_err("a symlinked --out must be refused");

        assert_eq!(err.code, "OUT_IS_SYMLINK", "got {err:?}");
        assert_eq!(
            read_back(&target),
            "SENSITIVE-ORIGINAL\n",
            "the symlink target must not be overwritten",
        );
        let still_a_link = std::fs::symlink_metadata(&symlinked_out)
            .unwrap()
            .file_type()
            .is_symlink();
        assert!(still_a_link, "the --out path must remain a symlink");
    }

    /// A destination whose immediate parent directory is a symlink is refused.
    #[test]
    #[cfg(unix)]
    fn out_through_symlinked_parent_is_refused_and_target_untouched() {
        let sandbox = tempfile::tempdir().unwrap();
        let root = sandbox.path();
        let store_dir = root.join("store");
        let outside_dir = root.join("external");
        std::fs::create_dir(&store_dir).unwrap();
        std::fs::create_dir(&outside_dir).unwrap();
        let target = outside_dir.join("victim.txt");
        std::fs::write(&target, "ORIGINAL_SECRET\n").unwrap();
        let dir_link = store_dir.join("linkdir");
        std::os::unix::fs::symlink(&outside_dir, &dir_link).unwrap();

        let through_link = dir_link.join("victim.txt");
        let err = emit(&out_arg(&through_link), "POISONED_BY_EXTRACT", false)
            .expect_err("a --out reached through a symlinked parent must be refused");

        assert_eq!(err.code, "OUT_IS_SYMLINK", "got {err:?}");
        assert_eq!(
            read_back(&target),
            "ORIGINAL_SECRET\n",
            "the symlinked-parent target must not be overwritten",
        );
    }

    /// The symlink may sit higher up the chain: a real subdirectory below a
    /// symlinked directory must not hide the link from the check.
    #[test]
    #[cfg(unix)]
    fn out_through_symlinked_parent_with_real_subdir_is_refused() {
        let sandbox = tempfile::tempdir().unwrap();
        let root = sandbox.path();
        let store_dir = root.join("store");
        std::fs::create_dir(&store_dir).unwrap();
        let outside_dir = root.join("external");
        let outside_sub = outside_dir.join("sub");
        std::fs::create_dir_all(&outside_sub).unwrap();
        let target = outside_sub.join("victim.txt");
        std::fs::write(&target, "ORIGINAL_SECRET\n").unwrap();
        let dir_link = store_dir.join("linkdir");
        std::os::unix::fs::symlink(&outside_dir, &dir_link).unwrap();

        let through_link = dir_link.join("sub").join("victim.txt");
        let err = emit(&out_arg(&through_link), "POISONED_BY_EXTRACT", false).expect_err(
            "a --out reached through a symlinked parent (with a real subdir below \
             the link) must be refused",
        );

        assert_eq!(err.code, "OUT_IS_SYMLINK", "got {err:?}");
        assert_eq!(
            read_back(&target),
            "ORIGINAL_SECRET\n",
            "the deep symlinked-parent target must not be overwritten",
        );
    }

    /// Deep nesting alone is not suspicious: real directories all the way
    /// down must be accepted.
    #[test]
    fn out_into_real_nested_subdir_is_written() {
        let sandbox = tempfile::tempdir().unwrap();
        let deep_dir = sandbox.path().join("a").join("b").join("c");
        std::fs::create_dir_all(&deep_dir).unwrap();
        let written = deep_dir.join("out.txt");

        emit(&out_arg(&written), "deep but real", false)
            .expect("a real deep-nested --out must succeed");

        assert_eq!(read_back(&written), "deep but real");
    }

    /// A plain file destination is created, and may be overwritten by a
    /// second call.
    #[test]
    fn out_regular_file_is_written() {
        let sandbox = tempfile::tempdir().unwrap();
        let written = sandbox.path().join("out.txt");
        let arg = out_arg(&written);

        emit(&arg, "hello extracted text", false).expect("a regular --out must succeed");
        assert_eq!(read_back(&written), "hello extracted text",);

        emit(&arg, "second write", false).expect("overwriting a regular file is allowed");
        assert_eq!(read_back(&written), "second write");
    }
}