Skip to main content

mq_conv/
detect.rs

1use std::path::Path;
2
/// Kind of input document (or conversion mode) handled by the converter.
///
/// Values are produced by the detection helpers on `Format` from a file name
/// extension or from leading magic bytes. Several variants (`Ocr` and every
/// `Markdown*` variant except `MarkdownDocx`) are never produced by detection
/// in this file; presumably callers select them explicitly — TODO confirm.
///
/// `Hash` is derived alongside `Eq` so the enum can be used directly as a
/// `HashMap` / `HashSet` key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Format {
    /// Spreadsheet workbooks (`.xlsx`, `.xls`, `.xlsb`, `.ods`).
    Excel,
    /// PDF documents.
    Pdf,
    /// PowerPoint presentations (`.pptx`).
    PowerPoint,
    /// Word documents (`.docx`).
    Word,
    /// Raster and vector images (PNG, JPEG, GIF, WebP, SVG, BMP, TIFF).
    Image,
    /// Generic ZIP archives.
    Zip,
    /// EPUB e-books.
    Epub,
    /// Audio files (MP3, WAV, FLAC, OGG, M4A, AAC, WMA).
    Audio,
    /// Delimited text (`.csv`, `.tsv`).
    Csv,
    /// HTML documents.
    Html,
    /// JSON documents.
    Json,
    /// YAML documents.
    Yaml,
    /// TOML documents.
    Toml,
    /// XML documents.
    Xml,
    /// SQLite database files.
    Sqlite,
    /// Tar archives, including gzip-compressed ones.
    Tar,
    /// Video files (MP4, MKV, AVI, MOV, WebM, M4V, WMV, FLV).
    Video,
    /// OCR mode; not produced by extension or magic-byte detection here.
    Ocr,
    /// Markdown input; this is the variant chosen for `.md` / `.markdown` files.
    MarkdownDocx,
    /// Markdown variant displayed as `markdown-html`.
    MarkdownHtml,
    /// Markdown variant displayed as `markdown-text`.
    MarkdownText,
    /// Markdown variant displayed as `markdown-latex`.
    MarkdownLatex,
    /// Markdown variant displayed as `markdown-rst`.
    MarkdownRst,
    /// Markdown variant displayed as `markdown-asciidoc`.
    MarkdownAsciidoc,
    /// Markdown variant displayed as `markdown-org`.
    MarkdownOrg,
    /// Markdown variant displayed as `markdown-epub`.
    MarkdownEpub,
    /// Markdown variant displayed as `markdown-json-ast`.
    MarkdownJsonAst,
}
33
34impl Format {
35    pub fn detect(filename: Option<&str>, bytes: &[u8]) -> Option<Self> {
36        if let Some(name) = filename
37            && let Some(fmt) = Self::from_extension(name) {
38                return Some(fmt);
39            }
40        Self::from_magic_bytes(bytes)
41    }
42
43    fn from_extension(filename: &str) -> Option<Self> {
44        let ext = Path::new(filename)
45            .extension()
46            .and_then(|e| e.to_str())
47            .map(|e| e.to_ascii_lowercase())?;
48
49        match ext.as_str() {
50            "xlsx" | "xls" | "xlsb" | "ods" => Some(Self::Excel),
51            "pdf" => Some(Self::Pdf),
52            "pptx" => Some(Self::PowerPoint),
53            "docx" => Some(Self::Word),
54            "png" | "jpg" | "jpeg" | "gif" | "webp" | "svg" | "bmp" | "tiff" | "tif" => {
55                Some(Self::Image)
56            }
57            "zip" => Some(Self::Zip),
58            "epub" => Some(Self::Epub),
59            "mp3" | "wav" | "flac" | "ogg" | "m4a" | "aac" | "wma" => Some(Self::Audio),
60            "csv" | "tsv" => Some(Self::Csv),
61            "html" | "htm" => Some(Self::Html),
62            "json" => Some(Self::Json),
63            "yaml" | "yml" => Some(Self::Yaml),
64            "toml" => Some(Self::Toml),
65            "xml" => Some(Self::Xml),
66            "sqlite" | "sqlite3" | "db" => Some(Self::Sqlite),
67            "tar" => Some(Self::Tar),
68            "tgz" => Some(Self::Tar),
69            "mp4" | "mkv" | "avi" | "mov" | "webm" | "m4v" | "wmv" | "flv" => {
70                Some(Self::Video)
71            }
72            "md" | "markdown" => Some(Self::MarkdownDocx),
73            _ => None,
74        }
75    }
76
77    fn from_magic_bytes(bytes: &[u8]) -> Option<Self> {
78        if bytes.len() < 4 {
79            return None;
80        }
81
82        // PDF: %PDF
83        if bytes.starts_with(b"%PDF") {
84            return Some(Self::Pdf);
85        }
86
87        // PNG: \x89PNG
88        if bytes.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
89            return Some(Self::Image);
90        }
91
92        // JPEG: \xFF\xD8\xFF
93        if bytes.starts_with(&[0xFF, 0xD8, 0xFF]) {
94            return Some(Self::Image);
95        }
96
97        // GIF: GIF87a or GIF89a
98        if bytes.starts_with(b"GIF87a") || bytes.starts_with(b"GIF89a") {
99            return Some(Self::Image);
100        }
101
102        // RIFF....WAVE (WAV)
103        if bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WAVE" {
104            return Some(Self::Audio);
105        }
106
107        // FLAC
108        if bytes.starts_with(b"fLaC") {
109            return Some(Self::Audio);
110        }
111
112        // OGG
113        if bytes.starts_with(b"OggS") {
114            return Some(Self::Audio);
115        }
116
117        // MP3: ID3 tag or sync bytes
118        if bytes.starts_with(b"ID3")
119            || bytes.starts_with(&[0xFF, 0xFB])
120            || bytes.starts_with(&[0xFF, 0xF3])
121            || bytes.starts_with(&[0xFF, 0xF2])
122        {
123            return Some(Self::Audio);
124        }
125
126        // BMP
127        if bytes.starts_with(b"BM") {
128            return Some(Self::Image);
129        }
130
131        // TIFF
132        if bytes.starts_with(&[0x49, 0x49, 0x2A, 0x00])
133            || bytes.starts_with(&[0x4D, 0x4D, 0x00, 0x2A])
134        {
135            return Some(Self::Image);
136        }
137
138        // WEBP: RIFF....WEBP
139        if bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP" {
140            return Some(Self::Image);
141        }
142
143        // SQLite: "SQLite format 3\0"
144        if bytes.len() >= 16 && bytes.starts_with(b"SQLite format 3\0") {
145            return Some(Self::Sqlite);
146        }
147
148        // Gzip (tar.gz): \x1F\x8B
149        if bytes.starts_with(&[0x1F, 0x8B]) {
150            return Some(Self::Tar);
151        }
152
153        // ZIP-based formats: PK\x03\x04
154        if bytes.starts_with(&[0x50, 0x4B, 0x03, 0x04]) {
155            #[cfg(any(
156                feature = "zip",
157                feature = "word",
158                feature = "powerpoint",
159                feature = "excel",
160                feature = "epub"
161            ))]
162            return Self::detect_zip_content(bytes);
163            #[cfg(not(any(
164                feature = "zip",
165                feature = "word",
166                feature = "powerpoint",
167                feature = "excel",
168                feature = "epub"
169            )))]
170            return Some(Self::Zip);
171        }
172
173        None
174    }
175
176    #[cfg(any(
177        feature = "zip",
178        feature = "word",
179        feature = "powerpoint",
180        feature = "excel",
181        feature = "epub"
182    ))]
183    fn detect_zip_content(bytes: &[u8]) -> Option<Self> {
184        let cursor = std::io::Cursor::new(bytes);
185        let mut archive = zip::ZipArchive::new(cursor).ok()?;
186
187        for i in 0..archive.len() {
188            let entry = archive.by_index(i).ok()?;
189            let name = entry.name().to_string();
190
191            if name.starts_with("word/") {
192                return Some(Self::Word);
193            }
194            if name.starts_with("ppt/") {
195                return Some(Self::PowerPoint);
196            }
197            if name.starts_with("xl/") {
198                return Some(Self::Excel);
199            }
200            if name == "mimetype" || name == "META-INF/container.xml" {
201                return Some(Self::Epub);
202            }
203        }
204
205        Some(Self::Zip)
206    }
207}
208
209impl std::fmt::Display for Format {
210    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
211        match self {
212            Self::Excel => write!(f, "excel"),
213            Self::Pdf => write!(f, "pdf"),
214            Self::PowerPoint => write!(f, "powerpoint"),
215            Self::Word => write!(f, "word"),
216            Self::Image => write!(f, "image"),
217            Self::Zip => write!(f, "zip"),
218            Self::Epub => write!(f, "epub"),
219            Self::Audio => write!(f, "audio"),
220            Self::Csv => write!(f, "csv"),
221            Self::Html => write!(f, "html"),
222            Self::Json => write!(f, "json"),
223            Self::Yaml => write!(f, "yaml"),
224            Self::Toml => write!(f, "toml"),
225            Self::Xml => write!(f, "xml"),
226            Self::Sqlite => write!(f, "sqlite"),
227            Self::Tar => write!(f, "tar"),
228            Self::Video => write!(f, "video"),
229            Self::Ocr => write!(f, "ocr"),
230            Self::MarkdownDocx => write!(f, "markdown-docx"),
231            Self::MarkdownHtml => write!(f, "markdown-html"),
232            Self::MarkdownText => write!(f, "markdown-text"),
233            Self::MarkdownLatex => write!(f, "markdown-latex"),
234            Self::MarkdownRst => write!(f, "markdown-rst"),
235            Self::MarkdownAsciidoc => write!(f, "markdown-asciidoc"),
236            Self::MarkdownOrg => write!(f, "markdown-org"),
237            Self::MarkdownEpub => write!(f, "markdown-epub"),
238            Self::MarkdownJsonAst => write!(f, "markdown-json-ast"),
239        }
240    }
241}