use anyhow::{anyhow, bail, Result};
use compact_str::{format_compact, CompactString};
use core::fmt;
use fxhash::FxHashMap;
use infer::{Infer, Matcher};
use std::{
fs::{self, File},
path::{Path, PathBuf},
sync::{
atomic::{AtomicU32, Ordering},
Arc, Weak,
},
};
use tempfile::TempDir;
#[derive(Debug)]
struct DecodedInner {
original_path: PathBuf,
decoded_path: PathBuf,
open: Arc<AtomicU32>,
}
impl Drop for DecodedInner {
fn drop(&mut self) {
let _ = fs::remove_file(&self.decoded_path);
self.open.fetch_sub(1, Ordering::Relaxed);
}
}
#[derive(Debug, Clone)]
pub struct Decoded(Arc<DecodedInner>);
impl Decoded {
pub fn original_path(&self) -> &Path {
&self.0.original_path
}
pub fn decoded_path(&self) -> &Path {
&self.0.decoded_path
}
}
pub struct Decoder {
decoded: FxHashMap<PathBuf, Weak<DecodedInner>>,
decoders: FxHashMap<&'static str, Box<dyn FnMut(&Path, &Path) -> Result<()>>>,
tmpdir: TempDir,
open: Arc<AtomicU32>,
next_id: u64,
infer: Infer,
}
impl fmt::Debug for Decoder {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "GeneralDecoder")
}
}
impl Decoder {
pub fn add_decoder(
&mut self,
mime_type: &'static str,
decoder: Box<dyn FnMut(&Path, &Path) -> Result<()>>,
) {
self.decoders.insert(mime_type, decoder);
}
pub fn add_mime_inferer(
&mut self,
mime: &'static str,
extension: &'static str,
matcher: Matcher,
) {
self.infer.add(mime, extension, matcher)
}
pub fn new() -> Result<Self> {
Ok(Self {
decoded: FxHashMap::default(),
decoders: FxHashMap::default(),
tmpdir: TempDir::new()?,
open: Arc::new(AtomicU32::new(0)),
next_id: 0,
infer: Infer::new(),
})
}
pub fn decode<P: AsRef<Path>>(&mut self, path: P) -> Result<Decoded> {
let path = path.as_ref();
if let Some(weak) = self.decoded.get(path) {
if let Some(decoded) = weak.upgrade() {
return Ok(Decoded(decoded));
}
}
let decoded_path = self
.tmpdir
.path()
.join(&*format_compact!("{}", self.next_id));
self.next_id += 1;
let typ = infer_from_path(&self.infer, path)?;
match self.decoders.get_mut(&*typ) {
Some(decoder) => decoder(path, &decoded_path)?,
None => generic_decode(&self.infer, &typ, path, &decoded_path)?,
}
let decoded = Arc::new(DecodedInner {
original_path: path.to_path_buf(),
decoded_path,
open: Arc::clone(&self.open),
});
self.decoded
.insert(path.to_path_buf(), Arc::downgrade(&decoded));
self.open.fetch_add(1, Ordering::Relaxed);
self.gc();
Ok(Decoded(decoded))
}
fn gc(&mut self) {
if (self.decoded.len() as u32) >= self.open.load(Ordering::Relaxed) << 1 {
self.decoded.retain(|_, weak| weak.strong_count() > 0);
}
}
}
fn infer_from_path<P: AsRef<Path>>(infer: &Infer, path: P) -> Result<CompactString> {
let path = path.as_ref();
let typ = infer
.get_from_path(path)?
.map(|t| CompactString::from(t.mime_type()));
match typ {
Some(typ) => Ok(typ),
None => {
if cfg!(unix) {
use std::process::Command;
let out = Command::new("file")
.arg("-b")
.arg("--mime-type")
.arg(path)
.output()?;
Ok(CompactString::from_utf8_lossy(out.stdout.trim_ascii()))
} else {
Ok(CompactString::from("text/plain"))
}
}
}
}
#[cfg(unix)]
fn loconvert(ext: &str, path: &Path, decoded_path: &Path) -> Result<()> {
use std::process::Command;
use tempfile::tempdir;
let name = path.file_name().ok_or_else(|| anyhow!("no file name"))?;
let dir = tempdir()?;
fs::copy(path, dir.path().join(name))?;
let outfile = dir
.path()
.join(name)
.with_extension(ext.split_once(":").map(|(s, _)| s).unwrap_or(ext));
Command::new("libreoffice")
.arg("--convert-to")
.arg(ext)
.arg(path)
.arg("--outdir")
.arg(dir.path())
.status()?;
fs::rename(&outfile, decoded_path)?;
Ok(())
}
#[cfg(unix)]
fn uncompress(infer: &Infer, cmd: &str, path: &Path, decoded_path: &Path) -> Result<()> {
use std::process::Command;
let (out, tmp) = tempfile::NamedTempFile::new()?.into_parts();
Command::new(cmd).arg(path).stdout(out).status()?;
let typ = infer_from_path(infer, &tmp)?;
generic_decode(infer, &typ, &tmp, decoded_path)
}
#[cfg(unix)]
fn generic_decode(infer: &Infer, typ: &str, path: &Path, decoded_path: &Path) -> Result<()> {
use std::process::Command;
match typ {
"text/plain"
| "text/csv"
| "text/xml"
| "text/x-Algol68"
| "text/x-lisp"
| "text/x-c"
| "text/x-c++"
| "text/x-objective-c"
| "text/x-ruby"
| "text/x-asm"
| "text/x-fortran"
| "text/x-java"
| "text/x-pascal"
| "text/x-haskell"
| "text/x-makefile"
| "text/x-erlang"
| "text/x-lua"
| "text/x-php"
| "text/x-m4"
| "application/x-msdos-batch"
| "text/x-script.python"
| "text/x-perl"
| "text/x-shellscript"
| "text/x-tcl"
| "application/json" => {
fs::copy(path, decoded_path)?;
Ok(())
}
"application/vnd.oasis.opendocument.text"
| "text/html"
| "application/vnd.ms-powerpoint"
| "application/vnd.oasis.opendocument.presentation"
| "application/vnd.openxmlformats-officedocument.presentationml.presentation"
| "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
| "application/msword" => loconvert("txt:Text", path, decoded_path),
"application/vnd.ms-excel"
| "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
| "application/vnd.oasis.opendocument.spreadsheet" => loconvert("csv", path, decoded_path),
"application/epub+zip" => {
Command::new("epub2txt")
.arg(path)
.stdout(File::create(decoded_path)?)
.status()?;
Ok(())
}
"application/pdf" => {
Command::new("pdftotext")
.arg(path)
.arg(decoded_path)
.status()?;
Ok(())
}
"application/postscript" => {
Command::new("ps2txt")
.arg(path)
.stdout(File::create(decoded_path)?)
.status()?;
Ok(())
}
"application/x-bzip2" => uncompress(infer, "bzcat", path, decoded_path),
"application/gzip" => uncompress(infer, "zcat", path, decoded_path),
"application/x-xz" => uncompress(infer, "xzcat", path, decoded_path),
"application/zstd" => uncompress(infer, "zstdcat", path, decoded_path),
"application/x-lzma" => uncompress(infer, "lzcat", path, decoded_path),
"application/x-executable"
| "application/x-pie-executable"
| "application/x-sharedlib"
| "application/octet-stream" => {
Command::new("strings")
.arg(path)
.stdout(File::create(decoded_path)?)
.status()?;
Ok(())
}
"text/troff" => {
Command::new("man")
.arg(path)
.stdout(File::create(decoded_path)?)
.status()?;
Ok(())
}
_ => bail!("no decoder for mime type {}", typ),
}
}
#[cfg(windows)]
fn generic_decode(_infer: &Infer, typ: &str, path: &Path, decoded_path: &Path) -> Result<()> {
match typ {
"text/plain" => {
fs::copy(path, decoded_path)?;
Ok(())
}
typ => bail!("no decoder for mime type {}", typ),
}
}