omniparse 0.4.0

A Rust toolkit for detecting and extracting metadata, text, and content from various file formats
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
//! Magic byte patterns for file type detection

use std::io::{Cursor, Read};
use zip::ZipArchive;

/// A single magic-byte signature used for file type detection.
///
/// A pattern matches when the bytes of the input starting at `offset` equal
/// `pattern`, optionally compared through a per-byte `mask`.
#[derive(Debug, Clone)]
pub struct MagicPattern {
    /// MIME type this pattern identifies
    pub mime_type: String,
    /// Offset in bytes where pattern should be found
    pub offset: usize,
    /// Byte pattern to match
    pub pattern: Vec<u8>,
    /// Optional mask to apply before matching; a set bit means "this bit must
    /// agree between the data and the pattern"
    pub mask: Option<Vec<u8>>,
}

impl MagicPattern {
    /// Create a new magic pattern that is compared byte-for-byte (no mask).
    pub fn new(mime_type: &str, offset: usize, pattern: Vec<u8>) -> Self {
        Self {
            mime_type: mime_type.to_string(),
            offset,
            pattern,
            mask: None,
        }
    }

    /// Create a new magic pattern with a mask.
    ///
    /// `mask[i]` selects which bits of `pattern[i]` take part in the
    /// comparison. If `mask` is shorter than `pattern`, the pattern bytes
    /// without a mask byte are compared exactly (as if masked with `0xFF`);
    /// mask bytes beyond the end of `pattern` are ignored.
    pub fn with_mask(mime_type: &str, offset: usize, pattern: Vec<u8>, mask: Vec<u8>) -> Self {
        Self {
            mime_type: mime_type.to_string(),
            offset,
            pattern,
            mask: Some(mask),
        }
    }

    /// Check if this pattern matches the given data.
    ///
    /// Returns `false` when `data` is too short to contain the whole pattern
    /// at `offset`. This method never panics: an `offset` so large that
    /// `offset + pattern.len()` would overflow is treated as a non-match, and
    /// a mask shorter than the pattern falls back to exact comparison for the
    /// uncovered bytes.
    pub fn matches(&self, data: &[u8]) -> bool {
        // `checked_add` guards against overflow for pathological offsets and
        // `get` performs the bounds check without a panic path.
        let window = match self
            .offset
            .checked_add(self.pattern.len())
            .and_then(|end| data.get(self.offset..end))
        {
            Some(window) => window,
            None => return false,
        };

        match &self.mask {
            None => window == self.pattern.as_slice(),
            Some(mask) => window
                .iter()
                .zip(&self.pattern)
                .enumerate()
                .all(|(i, (&actual, &expected))| {
                    // Missing mask bytes mean "compare every bit".
                    let bits = mask.get(i).map_or(0xFF, |&m| m);
                    (actual & bits) == (expected & bits)
                }),
        }
    }
}

/// Detect ZIP-based office formats by inspecting well-known archive members.
///
/// The input must start with the ZIP local-file-header signature and be
/// readable as a ZIP archive; otherwise `None` is returned.
///
/// Two families are recognised:
///
/// * **OpenDocument** – if the archive has a `mimetype` member whose trimmed
///   content starts with `application/vnd.oasis.opendocument.`, that exact
///   string is returned.
/// * **Office Open XML** – otherwise `[Content_Types].xml` is read and the
///   spreadsheet / presentation / word-processing MIME type is returned.
///
/// For Office Open XML the main document part (content type ending in
/// `.main+xml`) is checked first. A presentation or text document that embeds
/// a workbook (for example behind a chart) also lists the plain spreadsheet
/// content type for the embedded `.xlsx`, so a bare substring check alone
/// would report such files as spreadsheets. The looser substring check is
/// kept as a fallback for packages without a recognised main part.
///
/// Returns `None` when neither family is identified or a member cannot be
/// read as UTF-8 text.
pub fn detect_openxml_type(data: &[u8]) -> Option<String> {
    // Checked in this order by the fallback pass (same order as before).
    const OOXML_TYPES: [&str; 3] = [
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ];

    // Check if it starts with ZIP signature
    if !data.starts_with(b"PK\x03\x04") {
        return None;
    }

    // Try to open as ZIP archive
    let mut archive = ZipArchive::new(Cursor::new(data)).ok()?;

    // First check for OpenDocument format (has mimetype file)
    if let Ok(mut mimetype_file) = archive.by_name("mimetype") {
        let mut mimetype = String::new();
        if mimetype_file.read_to_string(&mut mimetype).is_ok() {
            let trimmed = mimetype.trim();
            // Return the exact MIME type from the mimetype file
            if trimmed.starts_with("application/vnd.oasis.opendocument.") {
                return Some(trimmed.to_string());
            }
        }
    }

    // If not OpenDocument, check for OpenXML format (has [Content_Types].xml)
    let content = {
        let mut content_types_file = archive.by_name("[Content_Types].xml").ok()?;
        let mut buffer = String::new();
        content_types_file.read_to_string(&mut buffer).ok()?;
        buffer
    };

    // Pass 1: the main part's content type identifies the package itself,
    // regardless of what other documents are embedded in it.
    let by_main_part = OOXML_TYPES.iter().find(|mime| {
        let main_part = format!("{}.main+xml", mime);
        content.contains(main_part.as_str())
    });

    // Pass 2: original loose substring check, kept for backward compatibility.
    by_main_part
        .or_else(|| OOXML_TYPES.iter().find(|mime| content.contains(**mime)))
        .map(|mime| (*mime).to_string())
}

/// Detect the legacy Office document type stored in an OLE2 compound file.
///
/// The input must be at least 512 bytes long and start with the OLE2
/// signature `D0 CF 11 E0 A1 B1 1A E1`. The buffer is then probed (via
/// `contains_stream_name`) for stream names that are characteristic of each
/// application, in this order:
///
/// 1. `WordDocument` → `application/msword`
/// 2. `Workbook` or `Book` → `application/vnd.ms-excel`
/// 3. `PowerPoint Document` or `Current User` → `application/vnd.ms-powerpoint`
///
/// Returns `Some((mime_type, confidence))` with a fixed confidence of `0.90`
/// for the first group that is found, or `None` when the signature is missing
/// or no known stream name is present (so callers can fall back to other
/// detection strategies such as the file extension).
///
/// Only the full `WordDocument` name is probed for Word. A bare `Word` probe
/// would also fire on any spreadsheet or presentation that merely contains
/// the text "Word" and misreport it as a Word document.
pub fn detect_ole2_type(data: &[u8]) -> Option<(String, f32)> {
    // Check if it starts with OLE2 signature
    if data.len() < 512 || !data.starts_with(b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1") {
        return None;
    }

    let mime_type = if contains_stream_name(data, b"WordDocument") {
        // Word documents contain a "WordDocument" stream
        "application/msword"
    } else if contains_stream_name(data, b"Workbook") || contains_stream_name(data, b"Book") {
        // Excel workbooks contain a "Workbook" or "Book" stream
        "application/vnd.ms-excel"
    } else if contains_stream_name(data, b"PowerPoint Document")
        || contains_stream_name(data, b"Current User")
    {
        // PowerPoint presentations contain "PowerPoint Document" / "Current User" streams
        "application/vnd.ms-powerpoint"
    } else {
        // Unknown OLE2 payload: let the caller fall back to other heuristics
        return None;
    };

    Some((mime_type.to_string(), 0.90))
}

/// Report whether `stream_name` occurs anywhere in `data`.
///
/// The buffer is scanned as raw bytes; no OLE2 directory structure is parsed.
/// Two encodings of the name are tried, in order:
///
/// 1. the bytes of `stream_name` exactly as given, and
/// 2. the same bytes widened to UTF-16LE (each byte followed by `0x00`).
///
/// The UTF-16LE form is only built when the first search fails.
fn contains_stream_name(data: &[u8], stream_name: &[u8]) -> bool {
    search_bytes(data, stream_name) || search_bytes(data, &to_utf16le(stream_name))
}

/// Return `true` if `pattern` occurs as a contiguous run inside `data`.
///
/// An empty `pattern` never matches, and neither does a `pattern` longer
/// than `data`.
fn search_bytes(data: &[u8], pattern: &[u8]) -> bool {
    // `windows` rejects a width of zero, so the emptiness test must come first;
    // a pattern longer than the data simply yields no windows at all.
    !pattern.is_empty()
        && data
            .windows(pattern.len())
            .any(|candidate| candidate == pattern)
}

/// Widen each input byte to a little-endian 16-bit code unit.
///
/// Every byte of `ascii` is emitted unchanged and followed by a `0x00` high
/// byte, so the result is exactly twice as long as the input. For ASCII input
/// this is the UTF-16LE encoding of the same text.
fn to_utf16le(ascii: &[u8]) -> Vec<u8> {
    // Start from all zeroes (the high bytes) and fill in every even slot.
    let mut encoded = vec![0u8; ascii.len() * 2];
    for (low, &byte) in encoded.iter_mut().step_by(2).zip(ascii) {
        *low = byte;
    }
    encoded
}

/// Get the default magic byte patterns database.
///
/// Each call builds and returns a fresh `Vec` of [`MagicPattern`] entries,
/// grouped by category in the order written below.
///
/// Several entries intentionally share one signature (the ZIP local-file
/// header for DOCX/XLSX/PPTX/ODF/ZIP, `ftyp` for MP4/M4A, the EBML header for
/// Matroska/WebM), so a pattern hit alone cannot tell those types apart.
///
/// RIFF containers are told apart by their form type: the WAV and AVI entries
/// match `RIFF`, skip the 4-byte chunk size through a zero mask, and then
/// require `WAVE` or `AVI ` at bytes 8..12. Matching on the bare `RIFF` tag
/// would make every RIFF file (AVI, WebP, …) look like `audio/wav`.
pub fn get_magic_patterns() -> Vec<MagicPattern> {
    // Bytes 0..4 ("RIFF") and 8..12 (form type) are compared; 4..8 (size) are ignored.
    let riff_form_mask: Vec<u8> = vec![
        0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF,
    ];

    vec![
        // Document formats
        MagicPattern::new("application/pdf", 0, b"%PDF-".to_vec()),
        MagicPattern::new("application/vnd.openxmlformats-officedocument.wordprocessingml.document", 0,
            vec![0x50, 0x4B, 0x03, 0x04]), // ZIP signature (DOCX is ZIP-based)
        MagicPattern::new("application/vnd.oasis.opendocument.text", 0,
            vec![0x50, 0x4B, 0x03, 0x04]), // ZIP signature (ODT is ZIP-based)
        // Note: OLE2 signature (DOC, XLS, PPT) is handled by detect_ole2_type() function
        // to distinguish between different OLE2 formats
        MagicPattern::new("application/rtf", 0, b"{\\rtf".to_vec()),

        // Image formats
        MagicPattern::new("image/jpeg", 0, vec![0xFF, 0xD8, 0xFF]),
        MagicPattern::new("image/png", 0, vec![0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]),
        MagicPattern::new("image/gif", 0, b"GIF87a".to_vec()),
        MagicPattern::new("image/gif", 0, b"GIF89a".to_vec()),
        MagicPattern::new("image/tiff", 0, vec![0x49, 0x49, 0x2A, 0x00]), // Little-endian TIFF
        MagicPattern::new("image/tiff", 0, vec![0x4D, 0x4D, 0x00, 0x2A]), // Big-endian TIFF
        MagicPattern::new("image/bmp", 0, b"BM".to_vec()),
        MagicPattern::new("image/webp", 8, b"WEBP".to_vec()),
        MagicPattern::new("image/x-icon", 0, vec![0x00, 0x00, 0x01, 0x00]),
        // Require the literal <svg tag — the previous `<?xml` pattern
        // swallowed every XML file into image/svg+xml.
        MagicPattern::new("image/svg+xml", 0, b"<svg".to_vec()),

        // Archive formats
        MagicPattern::new("application/zip", 0, vec![0x50, 0x4B, 0x03, 0x04]),
        MagicPattern::new("application/zip", 0, vec![0x50, 0x4B, 0x05, 0x06]), // Empty ZIP
        MagicPattern::new("application/zip", 0, vec![0x50, 0x4B, 0x07, 0x08]), // Spanned ZIP
        MagicPattern::new("application/x-tar", 257, b"ustar".to_vec()),
        MagicPattern::new("application/gzip", 0, vec![0x1F, 0x8B]),
        MagicPattern::new("application/x-bzip2", 0, b"BZh".to_vec()),
        MagicPattern::new("application/x-7z-compressed", 0, vec![0x37, 0x7A, 0xBC, 0xAF, 0x27, 0x1C]),
        MagicPattern::new("application/x-rar-compressed", 0, b"Rar!\x1A\x07".to_vec()),
        MagicPattern::new("application/x-xz", 0, vec![0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00]),

        // Text formats
        MagicPattern::new("text/html", 0, b"<!DOCTYPE html".to_vec()),
        MagicPattern::new("text/html", 0, b"<html".to_vec()),
        MagicPattern::new("text/html", 0, b"<HTML".to_vec()),
        MagicPattern::new("text/xml", 0, b"<?xml".to_vec()),
        MagicPattern::new("application/json", 0, b"{".to_vec()),
        MagicPattern::new("application/json", 0, b"[".to_vec()),

        // Audio formats
        MagicPattern::new("audio/mpeg", 0, vec![0xFF, 0xFB]),
        MagicPattern::new("audio/mpeg", 0, vec![0xFF, 0xF3]),
        MagicPattern::new("audio/mpeg", 0, vec![0xFF, 0xF2]),
        MagicPattern::new("audio/mpeg", 0, b"ID3".to_vec()),
        // "RIFF" <size> "WAVE"
        MagicPattern::with_mask("audio/wav", 0, b"RIFF\0\0\0\0WAVE".to_vec(),
            riff_form_mask.clone()),
        MagicPattern::new("audio/x-flac", 0, b"fLaC".to_vec()),
        MagicPattern::new("audio/ogg", 0, b"OggS".to_vec()),
        MagicPattern::new("audio/x-m4a", 4, b"ftyp".to_vec()),

        // Video formats
        MagicPattern::new("video/mp4", 4, b"ftyp".to_vec()),
        // "RIFF" <size> "AVI "
        MagicPattern::with_mask("video/x-msvideo", 0, b"RIFF\0\0\0\0AVI ".to_vec(),
            riff_form_mask),
        MagicPattern::new("video/x-matroska", 0, vec![0x1A, 0x45, 0xDF, 0xA3]),
        MagicPattern::new("video/webm", 0, vec![0x1A, 0x45, 0xDF, 0xA3]),
        MagicPattern::new("video/quicktime", 4, b"moov".to_vec()),
        MagicPattern::new("video/quicktime", 4, b"mdat".to_vec()),

        // Executable formats
        MagicPattern::new("application/x-executable", 0, vec![0x7F, 0x45, 0x4C, 0x46]), // ELF
        MagicPattern::new("application/x-mach-binary", 0, vec![0xFE, 0xED, 0xFA, 0xCE]), // Mach-O 32-bit
        MagicPattern::new("application/x-mach-binary", 0, vec![0xFE, 0xED, 0xFA, 0xCF]), // Mach-O 64-bit
        MagicPattern::new("application/x-msdownload", 0, b"MZ".to_vec()), // PE/COFF

        // Font formats
        MagicPattern::new("font/ttf", 0, vec![0x00, 0x01, 0x00, 0x00]),
        MagicPattern::new("font/otf", 0, b"OTTO".to_vec()),
        MagicPattern::new("font/woff", 0, b"wOFF".to_vec()),
        MagicPattern::new("font/woff2", 0, b"wOF2".to_vec()),

        // Database formats
        MagicPattern::new("application/x-sqlite3", 0, b"SQLite format 3\0".to_vec()),

        // Office formats (additional)
        MagicPattern::new("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 0,
            vec![0x50, 0x4B, 0x03, 0x04]), // XLSX
        MagicPattern::new("application/vnd.openxmlformats-officedocument.presentationml.presentation", 0,
            vec![0x50, 0x4B, 0x03, 0x04]), // PPTX
        MagicPattern::new("application/vnd.oasis.opendocument.spreadsheet", 0,
            vec![0x50, 0x4B, 0x03, 0x04]), // ODS
        MagicPattern::new("application/vnd.oasis.opendocument.presentation", 0,
            vec![0x50, 0x4B, 0x03, 0x04]), // ODP
    ]
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Eight-byte signature that opens every OLE2 compound file.
    const OLE2_MAGIC: &[u8; 8] = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1";

    /// Build a zero-filled 1 KiB buffer that carries the OLE2 signature and
    /// the UTF-16LE form of `stream_name` starting at byte 512.
    fn ole2_fixture(stream_name: &[u8]) -> Vec<u8> {
        let mut data = vec![0u8; 1024];
        data[..8].copy_from_slice(OLE2_MAGIC);
        let encoded = to_utf16le(stream_name);
        data[512..512 + encoded.len()].copy_from_slice(&encoded);
        data
    }

    /// Assert that a fixture containing `stream_name` is detected as
    /// `expected_mime` with the fixed 0.90 confidence.
    fn assert_ole2_detects(stream_name: &[u8], expected_mime: &str) {
        let detected = detect_ole2_type(&ole2_fixture(stream_name));
        assert!(detected.is_some());
        let (mime_type, confidence) = detected.unwrap();
        assert_eq!(mime_type, expected_mime);
        assert_eq!(confidence, 0.90);
    }

    #[test]
    fn test_magic_pattern_matches() {
        let pdf = MagicPattern::new("application/pdf", 0, b"%PDF-".to_vec());
        assert!(pdf.matches(b"%PDF-1.4"));
        assert!(!pdf.matches(b"Not a PDF"));
    }

    #[test]
    fn test_magic_pattern_with_offset() {
        // 257 bytes of padding followed directly by the tar marker (262 bytes total).
        let mut data = vec![0u8; 257];
        data.extend_from_slice(b"ustar");

        let tar = MagicPattern::new("application/x-tar", 257, b"ustar".to_vec());
        assert!(tar.matches(&data));
    }

    #[test]
    fn test_magic_pattern_with_mask() {
        // Only the high nibble of byte 0 and the low nibble of byte 1 are compared.
        let masked = MagicPattern::with_mask("test/masked", 0, vec![0xF0, 0x0F], vec![0xF0, 0x0F]);

        // 0xF5 & 0xF0 == 0xF0 and 0x0F & 0x0F == 0x0F, so both bytes agree.
        assert!(masked.matches(&[0xF5, 0x0F]));

        // 0x0F & 0xF0 == 0x00, which differs from the expected 0xF0.
        assert!(!masked.matches(&[0x0F, 0xF0]));
    }

    #[test]
    fn test_get_magic_patterns_count() {
        let total = get_magic_patterns().len();
        assert!(total >= 50, "Should have at least 50 MIME types");
    }

    #[test]
    fn test_detect_ole2_type_with_word_document() {
        assert_ole2_detects(b"WordDocument", "application/msword");
    }

    #[test]
    fn test_detect_ole2_type_with_workbook() {
        assert_ole2_detects(b"Workbook", "application/vnd.ms-excel");
    }

    #[test]
    fn test_detect_ole2_type_with_powerpoint() {
        // "Current User" is one of the PowerPoint-specific stream names probed.
        assert_ole2_detects(b"Current User", "application/vnd.ms-powerpoint");
    }

    #[test]
    fn test_detect_ole2_type_invalid_header() {
        // Long enough, but all zeroes: the signature check must reject it.
        assert!(detect_ole2_type(&vec![0u8; 512]).is_none());
    }

    #[test]
    fn test_detect_ole2_type_too_short() {
        // Only the first half of the signature is present.
        let truncated = vec![0xD0, 0xCF, 0x11, 0xE0];
        assert!(detect_ole2_type(&truncated).is_none());
    }

    #[test]
    fn test_to_utf16le() {
        assert_eq!(
            to_utf16le(b"Test"),
            vec![b'T', 0, b'e', 0, b's', 0, b't', 0]
        );
    }

    #[test]
    fn test_search_bytes() {
        let haystack = b"Hello World";
        for needle in [&b"World"[..], &b"Hello"[..]].iter() {
            assert!(search_bytes(haystack, needle));
        }
        for needle in [&b"Goodbye"[..], &b""[..]].iter() {
            assert!(!search_bytes(haystack, needle));
        }
    }

    #[test]
    fn test_contains_stream_name() {
        // Place the UTF-16LE name at an arbitrary position inside a zeroed buffer.
        let encoded = to_utf16le(b"WordDocument");
        let mut data = vec![0u8; 1024];
        data[100..100 + encoded.len()].copy_from_slice(&encoded);

        assert!(contains_stream_name(&data, b"WordDocument"));
        assert!(!contains_stream_name(&data, b"Workbook"));
    }
}