1use std::path::Path;
2
/// Kinds of input that format detection in this module can report.
///
/// Variants are plain unit variants; their declaration order (and therefore
/// their discriminant values) is part of the type and must not be shuffled.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Format {
    /// Spreadsheet workbook (`xlsx`, `xls`, `xlsb`, `ods`).
    Excel,
    /// PDF document.
    Pdf,
    /// Presentation (`pptx`).
    PowerPoint,
    /// Word-processing document (`docx`).
    Word,
    /// Raster or vector image (PNG, JPEG, GIF, WebP, SVG, BMP, TIFF).
    Image,
    /// ZIP archive that is not one of the recognised ZIP-based document types.
    Zip,
    /// EPUB e-book.
    Epub,
    /// Audio file (MP3, WAV, FLAC, Ogg, M4A, AAC, WMA).
    Audio,
    /// Delimited text (`csv`, `tsv`).
    Csv,
    /// HTML page (`html`, `htm`).
    Html,
    /// JSON document.
    Json,
    /// YAML document (`yaml`, `yml`).
    Yaml,
    /// TOML document.
    Toml,
    /// XML document.
    Xml,
    /// SQLite database file.
    Sqlite,
    /// Tar archive, including gzip-compressed ones (`tgz`).
    Tar,
    /// Video file (MP4, MKV, AVI, MOV, WebM, M4V, WMV, FLV).
    Video,
    /// NOTE(review): presumably an OCR processing mode selected explicitly by
    /// callers; nothing in the detection code visible here produces it.
    Ocr,

    // NOTE(review): the `Markdown*` variants presumably pair a Markdown input
    // with a conversion target. Only `MarkdownDocx` is produced by extension
    // detection (`md` / `markdown`); confirm the intended semantics of the
    // others against their callers.
    /// Markdown input (`md`, `markdown`); presumably targeting DOCX output.
    MarkdownDocx,
    /// Presumably Markdown targeting HTML output.
    MarkdownHtml,
    /// Presumably Markdown targeting plain-text output.
    MarkdownText,
    /// Presumably Markdown targeting LaTeX output.
    MarkdownLatex,
    /// Presumably Markdown targeting reStructuredText output.
    MarkdownRst,
    /// Presumably Markdown targeting AsciiDoc output.
    MarkdownAsciidoc,
    /// Presumably Markdown targeting Org-mode output.
    MarkdownOrg,
    /// Presumably Markdown targeting EPUB output.
    MarkdownEpub,
    /// Presumably Markdown targeting a JSON AST dump.
    MarkdownJsonAst,
}
33
34impl Format {
35 pub fn detect(filename: Option<&str>, bytes: &[u8]) -> Option<Self> {
36 if let Some(name) = filename
37 && let Some(fmt) = Self::from_extension(name) {
38 return Some(fmt);
39 }
40 Self::from_magic_bytes(bytes)
41 }
42
43 fn from_extension(filename: &str) -> Option<Self> {
44 let ext = Path::new(filename)
45 .extension()
46 .and_then(|e| e.to_str())
47 .map(|e| e.to_ascii_lowercase())?;
48
49 match ext.as_str() {
50 "xlsx" | "xls" | "xlsb" | "ods" => Some(Self::Excel),
51 "pdf" => Some(Self::Pdf),
52 "pptx" => Some(Self::PowerPoint),
53 "docx" => Some(Self::Word),
54 "png" | "jpg" | "jpeg" | "gif" | "webp" | "svg" | "bmp" | "tiff" | "tif" => {
55 Some(Self::Image)
56 }
57 "zip" => Some(Self::Zip),
58 "epub" => Some(Self::Epub),
59 "mp3" | "wav" | "flac" | "ogg" | "m4a" | "aac" | "wma" => Some(Self::Audio),
60 "csv" | "tsv" => Some(Self::Csv),
61 "html" | "htm" => Some(Self::Html),
62 "json" => Some(Self::Json),
63 "yaml" | "yml" => Some(Self::Yaml),
64 "toml" => Some(Self::Toml),
65 "xml" => Some(Self::Xml),
66 "sqlite" | "sqlite3" | "db" => Some(Self::Sqlite),
67 "tar" => Some(Self::Tar),
68 "tgz" => Some(Self::Tar),
69 "mp4" | "mkv" | "avi" | "mov" | "webm" | "m4v" | "wmv" | "flv" => {
70 Some(Self::Video)
71 }
72 "md" | "markdown" => Some(Self::MarkdownDocx),
73 _ => None,
74 }
75 }
76
77 fn from_magic_bytes(bytes: &[u8]) -> Option<Self> {
78 if bytes.len() < 4 {
79 return None;
80 }
81
82 if bytes.starts_with(b"%PDF") {
84 return Some(Self::Pdf);
85 }
86
87 if bytes.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
89 return Some(Self::Image);
90 }
91
92 if bytes.starts_with(&[0xFF, 0xD8, 0xFF]) {
94 return Some(Self::Image);
95 }
96
97 if bytes.starts_with(b"GIF87a") || bytes.starts_with(b"GIF89a") {
99 return Some(Self::Image);
100 }
101
102 if bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WAVE" {
104 return Some(Self::Audio);
105 }
106
107 if bytes.starts_with(b"fLaC") {
109 return Some(Self::Audio);
110 }
111
112 if bytes.starts_with(b"OggS") {
114 return Some(Self::Audio);
115 }
116
117 if bytes.starts_with(b"ID3")
119 || bytes.starts_with(&[0xFF, 0xFB])
120 || bytes.starts_with(&[0xFF, 0xF3])
121 || bytes.starts_with(&[0xFF, 0xF2])
122 {
123 return Some(Self::Audio);
124 }
125
126 if bytes.starts_with(b"BM") {
128 return Some(Self::Image);
129 }
130
131 if bytes.starts_with(&[0x49, 0x49, 0x2A, 0x00])
133 || bytes.starts_with(&[0x4D, 0x4D, 0x00, 0x2A])
134 {
135 return Some(Self::Image);
136 }
137
138 if bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP" {
140 return Some(Self::Image);
141 }
142
143 if bytes.len() >= 16 && bytes.starts_with(b"SQLite format 3\0") {
145 return Some(Self::Sqlite);
146 }
147
148 if bytes.starts_with(&[0x1F, 0x8B]) {
150 return Some(Self::Tar);
151 }
152
153 if bytes.starts_with(&[0x50, 0x4B, 0x03, 0x04]) {
155 #[cfg(any(
156 feature = "zip",
157 feature = "word",
158 feature = "powerpoint",
159 feature = "excel",
160 feature = "epub"
161 ))]
162 return Self::detect_zip_content(bytes);
163 #[cfg(not(any(
164 feature = "zip",
165 feature = "word",
166 feature = "powerpoint",
167 feature = "excel",
168 feature = "epub"
169 )))]
170 return Some(Self::Zip);
171 }
172
173 None
174 }
175
176 #[cfg(any(
177 feature = "zip",
178 feature = "word",
179 feature = "powerpoint",
180 feature = "excel",
181 feature = "epub"
182 ))]
183 fn detect_zip_content(bytes: &[u8]) -> Option<Self> {
184 let cursor = std::io::Cursor::new(bytes);
185 let mut archive = zip::ZipArchive::new(cursor).ok()?;
186
187 for i in 0..archive.len() {
188 let entry = archive.by_index(i).ok()?;
189 let name = entry.name().to_string();
190
191 if name.starts_with("word/") {
192 return Some(Self::Word);
193 }
194 if name.starts_with("ppt/") {
195 return Some(Self::PowerPoint);
196 }
197 if name.starts_with("xl/") {
198 return Some(Self::Excel);
199 }
200 if name == "mimetype" || name == "META-INF/container.xml" {
201 return Some(Self::Epub);
202 }
203 }
204
205 Some(Self::Zip)
206 }
207}
208
209impl std::fmt::Display for Format {
210 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
211 match self {
212 Self::Excel => write!(f, "excel"),
213 Self::Pdf => write!(f, "pdf"),
214 Self::PowerPoint => write!(f, "powerpoint"),
215 Self::Word => write!(f, "word"),
216 Self::Image => write!(f, "image"),
217 Self::Zip => write!(f, "zip"),
218 Self::Epub => write!(f, "epub"),
219 Self::Audio => write!(f, "audio"),
220 Self::Csv => write!(f, "csv"),
221 Self::Html => write!(f, "html"),
222 Self::Json => write!(f, "json"),
223 Self::Yaml => write!(f, "yaml"),
224 Self::Toml => write!(f, "toml"),
225 Self::Xml => write!(f, "xml"),
226 Self::Sqlite => write!(f, "sqlite"),
227 Self::Tar => write!(f, "tar"),
228 Self::Video => write!(f, "video"),
229 Self::Ocr => write!(f, "ocr"),
230 Self::MarkdownDocx => write!(f, "markdown-docx"),
231 Self::MarkdownHtml => write!(f, "markdown-html"),
232 Self::MarkdownText => write!(f, "markdown-text"),
233 Self::MarkdownLatex => write!(f, "markdown-latex"),
234 Self::MarkdownRst => write!(f, "markdown-rst"),
235 Self::MarkdownAsciidoc => write!(f, "markdown-asciidoc"),
236 Self::MarkdownOrg => write!(f, "markdown-org"),
237 Self::MarkdownEpub => write!(f, "markdown-epub"),
238 Self::MarkdownJsonAst => write!(f, "markdown-json-ast"),
239 }
240 }
241}