use std::path::Path;
/// A document or media format, as recognised by [`Format::detect`].
///
/// Detection in this file only ever yields a subset of the variants; the
/// remaining ones (`Ocr` and most `Markdown*` variants) are never returned by
/// `detect` and are presumably selected explicitly by callers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Format {
    /// Spreadsheet: `.xlsx`, `.xls`, `.xlsb`, `.ods`, or a ZIP container with `xl/` entries.
    Excel,
    /// PDF document: `.pdf` or content starting with `%PDF`.
    Pdf,
    /// Presentation: `.pptx` or a ZIP container with `ppt/` entries.
    PowerPoint,
    /// Word-processing document: `.docx` or a ZIP container with `word/` entries.
    Word,
    /// Raster or vector image (PNG, JPEG, GIF, WebP, SVG, BMP, TIFF).
    Image,
    /// Generic ZIP archive with no recognised document structure inside.
    Zip,
    /// EPUB e-book.
    Epub,
    /// Audio file (MP3, WAV, FLAC, Ogg, M4A, AAC, WMA).
    Audio,
    /// Delimited text: `.csv` or `.tsv`.
    Csv,
    /// HTML document: `.html` or `.htm`.
    Html,
    /// JSON document.
    Json,
    /// YAML document: `.yaml` or `.yml`.
    Yaml,
    /// TOML document.
    Toml,
    /// XML document.
    Xml,
    /// SQLite database: `.sqlite`, `.sqlite3`, `.db`, or the SQLite 3 file header.
    Sqlite,
    /// Tar archive: `.tar`, `.tgz`, a gzip stream, or a `ustar` header.
    Tar,
    /// Video file (MP4, MKV, AVI, MOV, WebM, M4V, WMV, FLV); extension-only detection.
    Video,
    /// Never produced by detection in this file.
    Ocr,
    /// The variant returned for `.md` / `.markdown` file names.
    MarkdownDocx,
    /// Never produced by detection in this file.
    MarkdownHtml,
    /// Never produced by detection in this file.
    MarkdownText,
    /// Never produced by detection in this file.
    MarkdownLatex,
    /// Never produced by detection in this file.
    MarkdownRst,
    /// Never produced by detection in this file.
    MarkdownAsciidoc,
    /// Never produced by detection in this file.
    MarkdownOrg,
    /// Never produced by detection in this file.
    MarkdownEpub,
    /// Never produced by detection in this file.
    MarkdownJsonAst,
}

impl Format {
    /// Determines the format of a file from its name and/or leading bytes.
    ///
    /// The file extension, when present and recognised, always wins; the
    /// content signature is consulted only when the name is missing or its
    /// extension is unknown. Returns `None` when neither source identifies
    /// the format.
    pub fn detect(filename: Option<&str>, bytes: &[u8]) -> Option<Self> {
        filename
            .and_then(Self::from_extension)
            .or_else(|| Self::from_magic_bytes(bytes))
    }

    /// Maps the (ASCII case-insensitive) extension of `filename` to a format.
    ///
    /// Returns `None` when the name has no extension or the extension is not
    /// in the table below.
    fn from_extension(filename: &str) -> Option<Self> {
        let ext = Path::new(filename)
            .extension()?
            .to_str()?
            .to_ascii_lowercase();

        let format = match ext.as_str() {
            "xlsx" | "xls" | "xlsb" | "ods" => Self::Excel,
            "pdf" => Self::Pdf,
            "pptx" => Self::PowerPoint,
            "docx" => Self::Word,
            "png" | "jpg" | "jpeg" | "gif" | "webp" | "svg" | "bmp" | "tiff" | "tif" => Self::Image,
            "zip" => Self::Zip,
            "epub" => Self::Epub,
            "mp3" | "wav" | "flac" | "ogg" | "m4a" | "aac" | "wma" => Self::Audio,
            "csv" | "tsv" => Self::Csv,
            "html" | "htm" => Self::Html,
            "json" => Self::Json,
            "yaml" | "yml" => Self::Yaml,
            "toml" => Self::Toml,
            "xml" => Self::Xml,
            "sqlite" | "sqlite3" | "db" => Self::Sqlite,
            "tar" | "tgz" => Self::Tar,
            "mp4" | "mkv" | "avi" | "mov" | "webm" | "m4v" | "wmv" | "flv" => Self::Video,
            "md" | "markdown" => Self::MarkdownDocx,
            _ => return None,
        };
        Some(format)
    }

    /// Identifies a format from the signature at the start of `bytes`.
    ///
    /// Inputs shorter than four bytes are never classified. ZIP containers
    /// are handed to `detect_zip_content` for a closer look.
    fn from_magic_bytes(bytes: &[u8]) -> Option<Self> {
        // Fixed prefixes. None of them is a prefix of another, so the order
        // of this table does not affect the result.
        const SIGNATURES: &[(&[u8], Format)] = &[
            (b"%PDF", Format::Pdf),
            (b"\x89PNG", Format::Image),
            (b"\xFF\xD8\xFF", Format::Image), // JPEG
            (b"GIF87a", Format::Image),
            (b"GIF89a", Format::Image),
            (b"fLaC", Format::Audio),
            (b"OggS", Format::Audio),
            (b"ID3", Format::Audio),      // MP3 with an ID3v2 tag
            (b"\xFF\xFB", Format::Audio), // bare MPEG audio frame sync
            (b"\xFF\xF3", Format::Audio),
            (b"\xFF\xF2", Format::Audio),
            (b"BM", Format::Image),           // BMP
            (b"II\x2A\x00", Format::Image),   // TIFF, little-endian
            (b"MM\x00\x2A", Format::Image),   // TIFF, big-endian
            (b"SQLite format 3\0", Format::Sqlite),
            (b"\x1F\x8B", Format::Tar), // gzip stream, treated as a compressed tarball
        ];

        if bytes.len() < 4 {
            return None;
        }

        for &(magic, format) in SIGNATURES {
            if bytes.starts_with(magic) {
                return Some(format);
            }
        }

        // RIFF containers carry their actual type in bytes 8..12.
        if bytes.starts_with(b"RIFF") {
            match bytes.get(8..12) {
                Some(b"WAVE") => return Some(Self::Audio),
                Some(b"WEBP") => return Some(Self::Image),
                _ => {}
            }
        }

        // ZIP local-file header; Office documents and EPUBs are ZIP containers.
        if bytes.starts_with(b"PK\x03\x04") {
            return Self::detect_zip_content(bytes);
        }

        // Uncompressed POSIX tar: the header block carries "ustar" at offset
        // 257. Checked last so it only classifies input that nothing above
        // recognised.
        if bytes.get(257..262) == Some(&b"ustar"[..]) {
            return Some(Self::Tar);
        }

        None
    }

    /// Classifies a ZIP container by inspecting its entries.
    ///
    /// Entries are visited in archive order and the first recognised marker
    /// wins: `word/` → Word, `ppt/` → PowerPoint, `xl/` → Excel,
    /// `META-INF/container.xml` → EPUB. A `mimetype` entry is classified by
    /// the media type it declares, because OpenDocument files carry such an
    /// entry as well as EPUBs. Archives without any marker are plain `Zip`.
    ///
    /// Returns `None` when the archive structure itself cannot be parsed
    /// (for example when `bytes` is only a truncated prefix of the file).
    #[cfg(any(
        feature = "zip",
        feature = "word",
        feature = "powerpoint",
        feature = "excel",
        feature = "epub"
    ))]
    fn detect_zip_content(bytes: &[u8]) -> Option<Self> {
        use std::io::Read;

        // The declared media types are short; never read more than this.
        const MAX_MIMETYPE_LEN: u64 = 128;

        let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes)).ok()?;

        for index in 0..archive.len() {
            let is_mimetype = {
                // Raw access exposes the entry name without having to decrypt
                // or decompress it, so a single entry the zip crate cannot
                // open no longer aborts detection for the whole archive.
                let entry = match archive.by_index_raw(index) {
                    Ok(entry) => entry,
                    Err(_) => continue,
                };
                let name = entry.name();
                if name.starts_with("word/") {
                    return Some(Self::Word);
                }
                if name.starts_with("ppt/") {
                    return Some(Self::PowerPoint);
                }
                if name.starts_with("xl/") {
                    return Some(Self::Excel);
                }
                if name == "META-INF/container.xml" {
                    return Some(Self::Epub);
                }
                name == "mimetype"
            };
            if !is_mimetype {
                continue;
            }

            let mut declared = String::new();
            if let Ok(entry) = archive.by_index(index) {
                if entry
                    .take(MAX_MIMETYPE_LEN)
                    .read_to_string(&mut declared)
                    .is_err()
                {
                    declared.clear();
                }
            }
            match declared.trim() {
                "application/epub+zip" => return Some(Self::Epub),
                // Kept consistent with `from_extension`, which maps `.ods` to Excel.
                "application/vnd.oasis.opendocument.spreadsheet" => return Some(Self::Excel),
                // Unknown or unreadable declaration: keep scanning.
                _ => {}
            }
        }

        Some(Self::Zip)
    }

    /// Fallback used when no ZIP-based feature is enabled: the container is
    /// reported as a plain archive without looking inside it.
    #[cfg(not(any(
        feature = "zip",
        feature = "word",
        feature = "powerpoint",
        feature = "excel",
        feature = "epub"
    )))]
    fn detect_zip_content(_bytes: &[u8]) -> Option<Self> {
        Some(Self::Zip)
    }
}
/// Renders each variant as its lowercase, hyphen-separated identifier
/// (for example `powerpoint` or `markdown-json-ast`).
impl std::fmt::Display for Format {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Resolve the label first, then emit it with a single write. As with
        // a literal `write!`, formatter width/padding flags are not applied.
        let label = match self {
            Self::Excel => "excel",
            Self::Pdf => "pdf",
            Self::PowerPoint => "powerpoint",
            Self::Word => "word",
            Self::Image => "image",
            Self::Zip => "zip",
            Self::Epub => "epub",
            Self::Audio => "audio",
            Self::Csv => "csv",
            Self::Html => "html",
            Self::Json => "json",
            Self::Yaml => "yaml",
            Self::Toml => "toml",
            Self::Xml => "xml",
            Self::Sqlite => "sqlite",
            Self::Tar => "tar",
            Self::Video => "video",
            Self::Ocr => "ocr",
            Self::MarkdownDocx => "markdown-docx",
            Self::MarkdownHtml => "markdown-html",
            Self::MarkdownText => "markdown-text",
            Self::MarkdownLatex => "markdown-latex",
            Self::MarkdownRst => "markdown-rst",
            Self::MarkdownAsciidoc => "markdown-asciidoc",
            Self::MarkdownOrg => "markdown-org",
            Self::MarkdownEpub => "markdown-epub",
            Self::MarkdownJsonAst => "markdown-json-ast",
        };
        f.write_str(label)
    }
}