exif_oxide/file_detection.rs
1//! File type detection engine following ExifTool's implementation
2//!
3//! This module implements ExifTool's sophisticated multi-tiered file type detection
4//! approach, ported from ExifTool.pm:2913-2999
5//!
6//! Detection Flow:
7//! 1. Extension-based candidates (via generated fileTypeLookup)
8//! 2. Magic number validation (via generated magicNumber patterns)
9//! 3. Last-ditch embedded signature recovery
10//!
11//! The implementation preserves ExifTool's exact logic including:
12//! - Weak magic types that defer to extension
13//! - Extension normalization rules
14//! - Conflict resolution patterns
15//! - Error recovery mechanisms
16
17use crate::generated::ExifTool_pm::lookup_mime_types;
18use std::io::{Read, Seek};
19use std::path::Path;
20
/// Number of leading bytes read from the input for magic number testing.
///
/// ExifTool uses exactly 1024 bytes - ExifTool.pm:2955
const MAGIC_TEST_BUFFER_SIZE: usize = 1024;

/// File types with weak magic numbers that defer to extension detection.
///
/// ExifTool.pm:1030 - only MP3 is marked as weak magic: `my %weakMagic = ( MP3 => 1 );`
const WEAK_MAGIC_TYPES: &[&str] = &["MP3"];
28
29// All magic number patterns are now generated from ExifTool.pm %magicNumber hash
30// See src/generated/file_types/magic_numbers.rs for the complete patterns
31// No hardcoded patterns needed - use lookup_magic_number_patterns() for all detection
32
/// Outcome of a successful file type detection.
///
/// All fields are owned strings, so the type supports full equality (`Eq`) in addition to
/// `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileTypeDetectionResult {
    /// Detected file type (e.g., "JPEG", "PNG", "CR2")
    pub file_type: String,
    /// Primary format for processing (e.g., "JPEG", "TIFF", "MOV")
    pub format: String,
    /// MIME type string
    pub mime_type: String,
    /// Human-readable description
    pub description: String,
}
44
/// Errors that can occur while detecting a file's type.
#[derive(Debug)]
pub enum FileDetectionError {
    /// File type could not be determined
    UnknownFileType,
    /// IO error reading file
    IoError(std::io::Error),
    /// Invalid file path
    InvalidPath,
}

impl From<std::io::Error> for FileDetectionError {
    /// Wrap an I/O failure so that `?` can be used on reader operations.
    fn from(error: std::io::Error) -> Self {
        FileDetectionError::IoError(error)
    }
}

impl std::fmt::Display for FileDetectionError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            FileDetectionError::UnknownFileType => f.write_str("Unknown file type"),
            FileDetectionError::IoError(e) => write!(f, "IO error: {e}"),
            FileDetectionError::InvalidPath => f.write_str("Invalid file path"),
        }
    }
}

impl std::error::Error for FileDetectionError {
    /// Expose the wrapped I/O error as the underlying cause so error-chain reporters can
    /// walk down to it; the other variants have no underlying cause.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            FileDetectionError::IoError(e) => Some(e),
            FileDetectionError::UnknownFileType | FileDetectionError::InvalidPath => None,
        }
    }
}
72
/// Main file type detector implementing ExifTool's detection algorithm.
///
/// This is a field-less unit struct, so it is trivially `Copy` and can be formatted for
/// diagnostics.
#[derive(Debug, Clone, Copy)]
pub struct FileTypeDetector;
75
76impl FileTypeDetector {
77 /// Create a new file type detector
78 pub fn new() -> Self {
79 Self
80 }
81
82 /// Detect file type from path and file content
83 ///
84 /// Implements ExifTool's detection flow from ExifTool.pm:2913-2999
85 pub fn detect_file_type<R: Read + Seek>(
86 &self,
87 path: &Path,
88 reader: &mut R,
89 ) -> Result<FileTypeDetectionResult, FileDetectionError> {
90 // Phase 1: Get extension-based candidates
91 // ExifTool.pm:2940 - GetFileType($filename)
92 let candidates = self.get_candidates_from_extension(path)?;
93
94 // Phase 2: Read test buffer for magic number validation
95 // ExifTool.pm:2955 - Read($raf, $buff, $testLen)
96 let mut buffer = vec![0u8; MAGIC_TEST_BUFFER_SIZE];
97 let bytes_read = reader.read(&mut buffer)?;
98 buffer.truncate(bytes_read);
99
100 // Reset reader position for subsequent processing
101 // This is critical so format-specific processors start at the beginning
102 reader.seek(std::io::SeekFrom::Start(0))?;
103
104 // Phase 3: Magic number validation against candidates
105 // ExifTool.pm:2960-2975 - Test candidates against magic numbers
106 // CRITICAL: Test all candidates before giving up, per TRUST-EXIFTOOL.md
107 let mut matched_type = None;
108 let mut recognized_ext = None;
109
110 for candidate in &candidates {
111 // Check if this is a weak magic type that defers to extension
112 if WEAK_MAGIC_TYPES.contains(&candidate.as_str()) {
113 // Weak magic types are fallback only if no strong magic matches
114 // ExifTool.pm:2970 - "next if $weakMagic{$type} and defined $recognizedExt"
115 if matched_type.is_none() {
116 matched_type = Some(candidate.clone());
117 }
118 continue;
119 }
120
121 // validate_magic_number now checks both file type and format patterns
122 if self.validate_magic_number(candidate, &buffer) {
123 // Strong magic match - use this type
124 matched_type = Some(candidate.clone());
125 break;
126 }
127
128 // ExifTool behavior: Keep track of recognized extensions with modules
129 // Even if magic pattern fails, ExifTool may still process the file
130 // if it has a module defined (like JXL -> Jpeg2000)
131 if recognized_ext.is_none() && self.has_processing_module(candidate) {
132 recognized_ext = Some(candidate.clone());
133 }
134 }
135
136 // If no magic match but we have a recognized extension with a module,
137 // use that as fallback (mimics ExifTool's behavior for JXL and others)
138 if matched_type.is_none() && recognized_ext.is_some() {
139 matched_type = recognized_ext;
140 }
141
142 if let Some(file_type) = matched_type {
143 // Special handling for MOV format to determine specific subtype
144 // ExifTool QuickTime.pm:9868-9877 - ftyp brand determines actual file type
145 // CRITICAL: Check against the format, not the file type
146 use crate::generated::file_types::resolve_file_type;
147 let format = if let Some((formats, _)) = resolve_file_type(&file_type) {
148 formats[0]
149 } else {
150 &file_type
151 };
152
153 let detected_type = if format == "MOV" {
154 self.determine_mov_subtype(&buffer)
155 .unwrap_or_else(|| file_type.clone())
156 } else if self.is_riff_based_format(&file_type) {
157 // For RIFF-based formats, detect the actual type from the header
158 // ExifTool RIFF.pm:2038-2039 - Sets file type based on RIFF format identifier
159 self.detect_riff_type(&buffer)
160 .unwrap_or_else(|| file_type.clone())
161 } else if file_type == "NEF" || file_type == "NRW" {
162 // NEF/NRW correction based on content analysis
163 // ExifTool Exif.pm distinguishes based on compression and linearization table
164 self.correct_nef_nrw_type(&file_type, &buffer)
165 .unwrap_or_else(|| file_type.clone())
166 } else {
167 file_type
168 };
169 return self.build_result(&detected_type, path);
170 }
171
172 // Phase 4: Last-ditch recovery - scan for embedded signatures
173 // ExifTool.pm:2976-2983 - Look for JPEG/TIFF embedded in unknown data
174 if let Some(embedded_type) = self.scan_for_embedded_signatures(&buffer) {
175 return self.build_result(&embedded_type, path);
176 }
177
178 Err(FileDetectionError::UnknownFileType)
179 }
180
181 /// Get file type candidates based on file extension
182 /// ExifTool equivalent: GetFileType() in ExifTool.pm:9010-9050
183 fn get_candidates_from_extension(
184 &self,
185 path: &Path,
186 ) -> Result<Vec<String>, FileDetectionError> {
187 let extension = path
188 .extension()
189 .and_then(|ext| ext.to_str())
190 .ok_or(FileDetectionError::InvalidPath)?;
191
192 // Normalize extension to uppercase (ExifTool convention)
193 let normalized_ext = self.normalize_extension(extension);
194
195 // Resolve through fileTypeLookup with alias following
196 // ExifTool.pm:258-404 %fileTypeLookup hash defines extension mappings
197 use crate::generated::file_types::resolve_file_type;
198
199 // Check if this extension is known to ExifTool
200 let is_known_extension = resolve_file_type(&normalized_ext).is_some();
201
202 // For HEIC/HEIF, we need special handling
203 // Even if not in the generated lookup, these are valid extensions
204 let is_heif_extension = matches!(normalized_ext.as_str(), "HEIC" | "HEIF" | "HIF");
205
206 if is_known_extension || is_heif_extension {
207 // For most formats, the extension itself is the file type candidate
208 // The formats array tells us what processing module to use, not the file type
209 // ExifTool.pm:2940-2950 - GetFileType returns the extension-based type
210
211 // Special case: Some extensions are aliases that should map to a different type
212 // These are hardcoded in ExifTool.pm GetFileType()
213 match normalized_ext.as_str() {
214 "3GP2" => Ok(vec!["3G2".to_string()]), // ExifTool.pm alias
215 "MTS" => Ok(vec!["M2TS".to_string()]), // ExifTool.pm alias
216 // HEIC/HEIF/HIF extensions should use MOV format for detection
217 // ExifTool QuickTime.pm handles these as MOV-based formats
218 "HEIC" | "HEIF" | "HIF" => Ok(vec!["MOV".to_string()]),
219 _ => Ok(vec![normalized_ext.clone()]), // Use the extension as the type
220 }
221 } else {
222 // Unknown extension - return normalized extension as candidate
223 Ok(vec![normalized_ext])
224 }
225 }
226
227 /// Normalize file extension following ExifTool's rules
228 /// ExifTool equivalent: GetFileExtension() in ExifTool.pm:9013-9040
229 fn normalize_extension(&self, extension: &str) -> String {
230 let upper_ext = extension.to_uppercase();
231
232 // ExifTool hardcoded extension conversions
233 // These are critical for consistency - TRUST-EXIFTOOL
234 match upper_ext.as_str() {
235 "TIF" => "TIFF".to_string(), // ExifTool.pm:9019 - hardcoded for TIFF consistency
236 "JPG" => "JPEG".to_string(),
237 "3GP2" => "3G2".to_string(),
238 "AIF" => "AIFF".to_string(),
239 _ => upper_ext,
240 }
241 }
242
243 /// Convert Perl regex pattern to Rust regex pattern
244 /// ExifTool patterns use Perl syntax that needs conversion for Rust regex crate
245 #[allow(dead_code)]
246 fn convert_perl_pattern_to_rust(&self, pattern: &str) -> String {
247 // Convert common Perl regex patterns to Rust-compatible patterns
248 // These conversions preserve ExifTool's exact matching behavior
249
250 let mut rust_pattern = pattern.to_string();
251
252 // Handle null bytes and their quantifiers
253 // \0 -> \x00, \0{3} -> \x00{3}, \0{0,3} -> \x00{0,3}
254 rust_pattern = rust_pattern.replace("\\0", "\\x00");
255
256 // Handle common escape sequences
257 rust_pattern = rust_pattern.replace("\\r", "\\x0d");
258 rust_pattern = rust_pattern.replace("\\n", "\\x0a");
259 rust_pattern = rust_pattern.replace("\\t", "\\x09");
260
261 // Handle Unicode characters by converting to byte sequences
262 // For BPG pattern "BPGû" - convert û (U+00FB) to \xfb
263 if rust_pattern.contains('û') {
264 rust_pattern = rust_pattern.replace('û', "\\xfb");
265 }
266
267 // Handle other common Unicode/extended ASCII characters
268 rust_pattern = rust_pattern.replace('é', "\\xe9");
269 rust_pattern = rust_pattern.replace('ñ', "\\xf1");
270
271 // Fix character classes with hex values - ensure proper escaping
272 // These are already mostly correct for Rust regex
273
274 // Handle dot patterns in specific contexts
275 // For JXL pattern, dots should match any byte in binary context
276 // This is already correct as . matches any byte in bytes regex
277
278 rust_pattern
279 }
280
281 // REMOVED: match_binary_magic_pattern function - replaced with generated patterns from ExifTool
282 // All magic number validation now uses generated patterns from ExifTool.pm %magicNumber hash
283
284 /// Validate magic number for a file type candidate
285 /// ExifTool equivalent: magic number testing in ExifTool.pm:2960-2975
286 /// CRITICAL: Must match ExifTool's exact logic per TRUST-EXIFTOOL.md
287 fn validate_magic_number(&self, file_type: &str, buffer: &[u8]) -> bool {
288 // Special handling for RIFF-based formats (AVI, WAV, WEBP, etc.)
289 // ExifTool RIFF.pm:2037-2046 - RIFF container detection with format analysis
290 if self.is_riff_based_format(file_type) {
291 return self.validate_riff_format(file_type, buffer);
292 }
293
294 // Special handling for TIFF-based RAW formats that need deeper analysis
295 // ExifTool.pm:8531-8612 - DoProcessTIFF() RAW format detection
296 if self.is_tiff_based_raw_format(file_type) {
297 return self.validate_tiff_raw_format(file_type, buffer);
298 }
299
300 // Use generated magic number patterns from ExifTool's %magicNumber hash
301 // ExifTool.pm:912-1027 - patterns extracted and compiled as regex::bytes::Regex
302 use crate::generated::file_types::{
303 magic_number_patterns::matches_magic_number, resolve_file_type,
304 };
305
306 // First try to match against the file type itself
307 if matches_magic_number(file_type, buffer) {
308 return true;
309 }
310
311 // If no direct match, check if this file type has a format that has magic patterns
312 // ExifTool uses the format (MOV, TIFF, etc.) for magic pattern matching
313 if let Some((formats, _desc)) = resolve_file_type(file_type) {
314 // Try magic pattern for the primary format
315 if matches_magic_number(formats[0], buffer) {
316 return true;
317 }
318 }
319
320 false
321 }
322
323 /// Detect actual RIFF format type from buffer
324 /// ExifTool RIFF.pm:2037-2046 - Detects specific RIFF variant
325 fn detect_riff_type(&self, buffer: &[u8]) -> Option<String> {
326 // Need at least 12 bytes for RIFF header analysis
327 if buffer.len() < 12 {
328 return None;
329 }
330
331 // Extract RIFF magic signature (bytes 0-3) and format identifier (bytes 8-11)
332 let magic = &buffer[0..4];
333 let format_id = &buffer[8..12];
334
335 // Check RIFF magic signature first
336 // ExifTool RIFF.pm:2040 - "if ($buff =~ /^(RIFF|RF64)....(.{4})/s)"
337 let is_riff = magic == b"RIFF" || magic == b"RF64";
338 if !is_riff {
339 // Check for obscure lossless audio variants
340 // ExifTool RIFF.pm:2044 - "return 0 unless $buff =~ /^(LA0[234]|OFR |LPAC|wvpk)/"
341 let is_audio_variant = magic == b"LA02"
342 || magic == b"LA03"
343 || magic == b"LA04"
344 || magic == b"OFR "
345 || magic == b"LPAC"
346 || magic == b"wvpk";
347 if !is_audio_variant {
348 return None;
349 }
350 }
351
352 // Map format identifier to file type using ExifTool's riffType mapping
353 // ExifTool RIFF.pm:49-53 - %riffType hash
354 match format_id {
355 b"WAVE" => Some("WAV".to_string()),
356 b"AVI " => Some("AVI".to_string()), // Note: AVI has trailing space
357 b"WEBP" => Some("WEBP".to_string()),
358 b"LA02" | b"LA03" | b"LA04" => Some("LA".to_string()),
359 b"OFR " => Some("OFR".to_string()),
360 b"LPAC" => Some("PAC".to_string()),
361 b"wvpk" => Some("WV".to_string()),
362 _ => Some("RIFF".to_string()), // Unknown RIFF format
363 }
364 }
365
366 /// Check if a file type is based on RIFF container format
367 /// ExifTool maps these extensions to RIFF format processing
368 fn is_riff_based_format(&self, file_type: &str) -> bool {
369 // Check against ExifTool's fileTypeLookup - formats that map to RIFF
370 // From file_type_lookup.rs analysis
371 matches!(
372 file_type,
373 "AVI" | "WAV" | "WEBP" | "LA" | "OFR" | "PAC" | "WV"
374 )
375 }
376
377 /// Check if a file type is a TIFF-based RAW format requiring deeper analysis
378 /// ExifTool.pm:8531-8612 - RAW formats detected in DoProcessTIFF()
379 fn is_tiff_based_raw_format(&self, file_type: &str) -> bool {
380 // RAW formats that use TIFF structure but need specific detection
381 // Based on ExifTool's DoProcessTIFF() implementation
382 // Note: CR3 is MOV-based, MRW has its own magic number pattern
383 matches!(
384 file_type,
385 "CR2" | "NEF" | "NRW" | "RW2" | "RWL" | "ARW" | "DNG" | "ORF" | "IIQ" | "3FR"
386 )
387 }
388
389 /// Validate RIFF container and detect specific format
390 /// ExifTool equivalent: RIFF.pm:2037-2046 ProcessRIFF()
391 /// CRITICAL: Follows ExifTool's exact RIFF detection logic
392 fn validate_riff_format(&self, expected_type: &str, buffer: &[u8]) -> bool {
393 // Need at least 12 bytes for RIFF header analysis
394 // ExifTool RIFF.pm:2039 - "return 0 unless $raf->Read($buff, 12) == 12;"
395 if buffer.len() < 12 {
396 return false;
397 }
398
399 // Extract RIFF magic signature (bytes 0-3) and format identifier (bytes 8-11)
400 let magic = &buffer[0..4];
401 let format_id = &buffer[8..12];
402
403 // Check RIFF magic signature first
404 // ExifTool RIFF.pm:2040 - "if ($buff =~ /^(RIFF|RF64)....(.{4})/s)"
405 let is_riff = magic == b"RIFF" || magic == b"RF64";
406 if !is_riff {
407 // Check for obscure lossless audio variants
408 // ExifTool RIFF.pm:2044 - "return 0 unless $buff =~ /^(LA0[234]|OFR |LPAC|wvpk)/"
409 let is_audio_variant = magic == b"LA02"
410 || magic == b"LA03"
411 || magic == b"LA04"
412 || magic == b"OFR "
413 || magic == b"LPAC"
414 || magic == b"wvpk";
415 if !is_audio_variant {
416 return false;
417 }
418 }
419
420 // Map format identifier to file type using ExifTool's riffType mapping
421 // ExifTool RIFF.pm:49-53 - %riffType hash
422 let detected_type = match format_id {
423 b"WAVE" => "WAV",
424 b"AVI " => "AVI", // Note: AVI has trailing space
425 b"WEBP" => "WEBP",
426 b"LA02" | b"LA03" | b"LA04" => "LA",
427 b"OFR " => "OFR",
428 b"LPAC" => "PAC",
429 b"wvpk" => "WV",
430 _ => {
431 // Unknown RIFF format - be conservative and allow generic RIFF detection
432 // This matches ExifTool's behavior of processing unknown RIFF types
433 return expected_type == "RIFF";
434 }
435 };
436
437 // Check if detected type matches expected type
438 expected_type == detected_type
439 }
440
441 /// Validate TIFF-based RAW format with specific signature detection
442 /// ExifTool equivalent: DoProcessTIFF() in ExifTool.pm:8531-8612
443 /// CRITICAL: Follows ExifTool's exact RAW format detection logic
444 fn validate_tiff_raw_format(&self, file_type: &str, buffer: &[u8]) -> bool {
445 // Need at least 16 bytes for TIFF header + potential signatures
446 if buffer.len() < 16 {
447 return false;
448 }
449
450 // First check basic TIFF magic number
451 if !buffer.starts_with(b"II") && !buffer.starts_with(b"MM") {
452 return false;
453 }
454
455 // CRITICAL: CR3 is MOV-based, not TIFF-based! Check for MOV signature first
456 // ExifTool.pm - CR3 uses QuickTime.pm not TIFF processing
457 if file_type == "CR3" && buffer.len() >= 12 && &buffer[4..8] == b"ftyp" {
458 // This is a MOV-based file, not TIFF - return false to prevent TIFF processing
459 return false;
460 }
461
462 // Extract byte order and TIFF identifier
463 let little_endian = buffer.starts_with(b"II");
464 let identifier = if little_endian {
465 u16::from_le_bytes([buffer[2], buffer[3]])
466 } else {
467 u16::from_be_bytes([buffer[2], buffer[3]])
468 };
469
470 // Extract IFD offset
471 let ifd_offset = if little_endian {
472 u32::from_le_bytes([buffer[4], buffer[5], buffer[6], buffer[7]])
473 } else {
474 u32::from_be_bytes([buffer[4], buffer[5], buffer[6], buffer[7]])
475 } as usize;
476
477 // Apply ExifTool's RAW format detection logic
478 match file_type {
479 "CR2" => {
480 // CR2 detection: ExifTool.pm:8534-8542
481 // identifier == 0x2a and offset >= 16, check for CR\x02\0 signature at offset 8
482 if identifier == 0x2a && ifd_offset >= 16 && buffer.len() >= 12 {
483 let sig = &buffer[8..12]; // CR2 signature is at offset 8, not at IFD offset
484 sig == b"CR\x02\0" || sig == b"\xba\xb0\xac\xbb"
485 } else {
486 false
487 }
488 }
489 "RW2" | "RWL" => {
490 // RW2 detection: ExifTool.pm:8544-8550
491 // identifier == 0x55 and specific magic signature at offset 8
492 if identifier == 0x55 && ifd_offset >= 0x18 && buffer.len() >= 0x18 {
493 let magic_signature = &buffer[0x08..0x18]; // Magic signature is at offset 8, not 0x18
494 magic_signature
495 == b"\x88\xe7\x74\xd8\xf8\x25\x1d\x4d\x94\x7a\x6e\x77\x82\x2b\x5d\x6a"
496 } else {
497 false
498 }
499 }
500 "ORF" => {
501 // ORF detection: ExifTool.pm:8552-8555
502 // identifier == 0x4f52 or 0x5352 (Olympus specific)
503 identifier == 0x4f52 || identifier == 0x5352
504 }
505 "NEF" | "NRW" => {
506 // NEF/NRW detection: ExifTool uses content analysis to distinguish
507 // ExifTool Exif.pm: NRW has JPEG compression in IFD0, NEF has linearization table
508 if identifier == 0x2a {
509 // Valid TIFF structure, now check content to distinguish NEF from NRW
510 use crate::tiff_utils::{read_tiff_ifd0_info, COMPRESSION_JPEG};
511 use std::io::Cursor;
512
513 let mut cursor = Cursor::new(buffer);
514 if let Some((compression, has_nef_linearization)) =
515 read_tiff_ifd0_info(&mut cursor)
516 {
517 match file_type {
518 "NEF" => {
519 // If NEF file has JPEG compression in IFD0, it's actually NRW
520 // ExifTool Exif.pm: "recognize NRW file from a JPEG-compressed thumbnail in IFD0"
521 if compression == Some(COMPRESSION_JPEG) {
522 // This will be corrected to NRW in post-processing
523 true
524 } else {
525 true // Valid NEF
526 }
527 }
528 "NRW" => {
529 // If NRW file has NEFLinearizationTable, it's actually NEF
530 // ExifTool.pm: "fix NEF type if misidentified as NRW"
531 if has_nef_linearization {
532 // This will be corrected to NEF in post-processing
533 true
534 } else {
535 true // Valid NRW
536 }
537 }
538 _ => false,
539 }
540 } else {
541 // If we can't read IFD0, trust the extension
542 true
543 }
544 } else {
545 false // Not even TIFF
546 }
547 }
548 "ARW" => {
549 // ARW detection: Standard TIFF structure (0x2a) but trust extension
550 // ExifTool confirms these based on Sony make/model, we trust the extension
551 identifier == 0x2a
552 }
553 "DNG" => {
554 // DNG detection: Standard TIFF structure (0x2a) but trust extension
555 // ExifTool confirms these based on DNGVersion tag, we trust the extension
556 identifier == 0x2a
557 }
558 "IIQ" => {
559 // IIQ detection: Standard TIFF structure (0x2a) but trust extension
560 // Phase One format, trust extension
561 identifier == 0x2a
562 }
563 "3FR" => {
564 // 3FR detection: Standard TIFF structure (0x2a) but trust extension
565 // Hasselblad format, trust extension
566 identifier == 0x2a
567 }
568 "MRW" => {
569 // MRW detection: Has its own magic number pattern in ExifTool
570 // Should be handled by magic number lookup, not here
571 false
572 }
573 "CR3" => {
574 // CR3 is MOV-based, not TIFF-based - should not reach here
575 // This case exists only for completeness - validate_tiff_raw_format checks MOV signature
576 false
577 }
578 _ => false,
579 }
580 }
581
582 /// Check if a file type has a processing module defined
583 /// This mimics ExifTool's %moduleName hash behavior
584 fn has_processing_module(&self, file_type: &str) -> bool {
585 // In ExifTool, having a module means it can be processed even without magic match
586 // Notable examples include JXL -> Jpeg2000 module
587 // We check if the file type has a defined format/processing path
588 use crate::generated::file_types::resolve_file_type;
589
590 // If resolve_file_type returns Some, it means ExifTool knows how to process this type
591 resolve_file_type(file_type).is_some()
592 }
593
594 /// Correct NEF/NRW type based on content analysis
595 /// ExifTool Exif.pm distinguishes based on compression and linearization table
596 fn correct_nef_nrw_type(&self, file_type: &str, buffer: &[u8]) -> Option<String> {
597 use crate::tiff_utils::{read_tiff_ifd0_info, COMPRESSION_JPEG};
598 use std::io::Cursor;
599
600 let mut cursor = Cursor::new(buffer);
601 if let Some((compression, has_nef_linearization)) = read_tiff_ifd0_info(&mut cursor) {
602 match file_type {
603 "NEF" => {
604 // ExifTool Exif.pm: "recognize NRW file from a JPEG-compressed thumbnail in IFD0"
605 if compression == Some(COMPRESSION_JPEG) {
606 Some("NRW".to_string()) // NEF with JPEG compression is actually NRW
607 } else {
608 None // Keep as NEF
609 }
610 }
611 "NRW" => {
612 // ExifTool.pm: "fix NEF type if misidentified as NRW"
613 if has_nef_linearization {
614 Some("NEF".to_string()) // NRW with linearization table is actually NEF
615 } else {
616 None // Keep as NRW
617 }
618 }
619 _ => None,
620 }
621 } else {
622 None // Can't determine, keep original
623 }
624 }
625
626 /// Last-ditch scan for embedded JPEG/TIFF signatures
627 /// ExifTool equivalent: ExifTool.pm:2976-2983
628 fn scan_for_embedded_signatures(&self, buffer: &[u8]) -> Option<String> {
629 // Look for JPEG signature: \xff\xd8\xff
630 if let Some(pos) = buffer.windows(3).position(|w| w == b"\xff\xd8\xff") {
631 if pos > 0 {
632 eprintln!("Warning: Processing JPEG-like data after unknown {pos}-byte header");
633 }
634 return Some("JPEG".to_string());
635 }
636
637 // Look for TIFF signatures: II*\0 or MM\0*
638 if let Some(pos) = buffer
639 .windows(4)
640 .position(|w| w == b"II*\0" || w == b"MM\0*")
641 {
642 if pos > 0 {
643 eprintln!("Warning: Processing TIFF-like data after unknown {pos}-byte header");
644 }
645 return Some("TIFF".to_string());
646 }
647
648 None
649 }
650
651 /// Determine specific file type for MOV/MP4 containers based on ftyp brand
652 /// ExifTool equivalent: QuickTime.pm:9868-9877 ftyp brand detection
653 fn determine_mov_subtype(&self, buffer: &[u8]) -> Option<String> {
654 // Need at least 12 bytes for ftyp atom structure
655 if buffer.len() >= 12 && &buffer[4..8] == b"ftyp" {
656 let brand = &buffer[8..12];
657 // Map ftyp brand to specific file type
658 // ExifTool QuickTime.pm:227-232 - %ftypLookup entries
659 match brand {
660 b"heic" | b"hevc" => Some("HEIC".to_string()),
661 b"mif1" | b"msf1" | b"heix" => Some("HEIF".to_string()),
662 b"avif" => Some("AVIF".to_string()),
663 b"crx " => Some("CR3".to_string()), // Canon RAW 3 format
664 // Common MP4 brands
665 b"mp41" | b"mp42" | b"mp4v" | b"isom" | b"M4A " | b"M4V " | b"dash" | b"avc1" => {
666 Some("MP4".to_string())
667 }
668 _ => None, // Keep as MOV for other brands
669 }
670 } else {
671 None
672 }
673 }
674
675 /// Validate XMP pattern: \0{0,3}(\xfe\xff|\xff\xfe|\xef\xbb\xbf)?\0{0,3}\s*<
676 /// ExifTool.pm:1018 - XMP files can start with optional BOM and null bytes, then whitespace, then '<'
677 #[allow(dead_code)]
678 fn validate_xmp_pattern(&self, buffer: &[u8]) -> bool {
679 if buffer.is_empty() {
680 return false;
681 }
682
683 let mut pos = 0;
684
685 // Skip up to 3 null bytes at the beginning
686 while pos < buffer.len() && pos < 3 && buffer[pos] == 0 {
687 pos += 1;
688 }
689
690 // Check for optional BOM (Byte Order Mark)
691 if pos + 3 <= buffer.len() {
692 // UTF-8 BOM: EF BB BF
693 if buffer[pos..pos + 3] == [0xef, 0xbb, 0xbf] {
694 pos += 3;
695 }
696 }
697 if pos + 2 <= buffer.len() {
698 // UTF-16 BE BOM: FE FF or UTF-16 LE BOM: FF FE
699 if buffer[pos..pos + 2] == [0xfe, 0xff] || buffer[pos..pos + 2] == [0xff, 0xfe] {
700 pos += 2;
701 }
702 }
703
704 // Skip up to 3 more null bytes after BOM
705 while pos < buffer.len() && pos < 6 && buffer[pos] == 0 {
706 pos += 1;
707 }
708
709 // Skip whitespace (space, tab, newline, carriage return)
710 while pos < buffer.len()
711 && (buffer[pos] == b' '
712 || buffer[pos] == b'\t'
713 || buffer[pos] == b'\n'
714 || buffer[pos] == b'\r')
715 {
716 pos += 1;
717 }
718
719 // Finally, check for '<' character
720 pos < buffer.len() && buffer[pos] == b'<'
721 }
722
723 /// Build final detection result from file type
724 pub fn build_result(
725 &self,
726 file_type: &str,
727 path: &Path,
728 ) -> Result<FileTypeDetectionResult, FileDetectionError> {
729 // Get primary format for processing
730 use crate::generated::file_types::resolve_file_type;
731 let (format, description) = if let Some((formats, desc)) = resolve_file_type(file_type) {
732 (formats[0].to_string(), desc.to_string())
733 } else {
734 (file_type.to_string(), format!("{file_type} file"))
735 };
736
737 // Get MIME type from generated lookup - try the file type first, then fallback, then the format
738 // This ensures file-type-specific MIME types take precedence over generic format MIME types
739 let mime_type = lookup_mime_types(file_type)
740 .or_else(|| self.get_fallback_mime_type(file_type))
741 .or_else(|| lookup_mime_types(&format))
742 .unwrap_or("application/octet-stream")
743 .to_string();
744
745 // Special case: ASF files with .wmv extension should use video/x-ms-wmv MIME type
746 // ExifTool.pm:9570-9592 SetFileType() applies extension-specific MIME types for ASF/WMV
747 // Reference: ExifTool.pm lines 557 (WMV->ASF mapping) and 816 (WMV MIME type)
748 let mime_type = if file_type == "ASF" {
749 if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
750 match ext.to_lowercase().as_str() {
751 "wmv" => "video/x-ms-wmv".to_string(),
752 _ => mime_type,
753 }
754 } else {
755 mime_type
756 }
757 } else {
758 mime_type
759 };
760
761 Ok(FileTypeDetectionResult {
762 file_type: file_type.to_string(),
763 format: format.to_string(),
764 mime_type,
765 description,
766 })
767 }
768
769 /// Get fallback MIME types for file types not covered by ExifTool's %mimeType hash
770 /// These are standard MIME types for common formats
771 fn get_fallback_mime_type(&self, file_type: &str) -> Option<&'static str> {
772 match file_type {
773 // Image formats
774 "JPEG" => Some("image/jpeg"),
775 "PNG" => Some("image/png"),
776 "TIFF" => Some("image/tiff"),
777 "GIF" => Some("image/gif"),
778 "BMP" => Some("image/bmp"),
779 "WEBP" => Some("image/webp"),
780 "HEIC" => Some("image/heic"), // HEIC gets its own MIME type
781 "HEIF" => Some("image/heif"), // High Efficiency Image Format (general)
782 "JP2" => Some("image/jp2"), // JPEG 2000 Part 1 (ISO/IEC 15444-1)
783 "J2C" => Some("image/x-j2c"), // JPEG 2000 Code Stream
784
785 // Video formats
786 "AVI" => Some("video/x-msvideo"),
787 "3GP" => Some("video/3gpp"), // 3GPP video format
788 "3G2" => Some("video/3gpp2"), // 3GPP2 video format
789 "M4V" => Some("video/x-m4v"), // Apple M4V video
790 "MTS" => Some("video/m2ts"), // MPEG-2 Transport Stream (alias for M2TS)
791 "M2TS" => Some("video/m2ts"), // MPEG-2 Transport Stream
792 "MP4" => Some("video/mp4"), // MPEG-4 Part 14
793 "FLV" => Some("video/x-flv"), // Flash Video
794 "WMV" => Some("video/x-ms-wmv"), // Windows Media Video
795 "ASF" => Some("video/x-ms-wmv"), // Advanced Systems Format (usually WMV)
796
797 // Audio formats
798 "WAV" => Some("audio/x-wav"), // WAV audio files
799
800 // Document formats
801 "XMP" => Some("application/rdf+xml"), // Extensible Metadata Platform
802 "PSD" => Some("application/vnd.adobe.photoshop"), // Adobe Photoshop Document
803 "EPS" => Some("application/postscript"), // Encapsulated PostScript
804
805 // Other common formats that might be missing
806 "RIFF" => Some("application/octet-stream"), // Generic RIFF container
807
808 _ => None,
809 }
810 }
811}
812
813impl Default for FileTypeDetector {
814 fn default() -> Self {
815 Self::new()
816 }
817}
818
819#[cfg(test)]
820mod mimetypes_validation;
821
/// Unit tests for [`FileTypeDetector`].
///
/// Each detection case runs a small in-memory byte fixture plus a file name
/// through `detect_file_type` and checks the reported type, format and MIME
/// type (or the returned error).
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Builds a complete ISO-BMFF `ftyp` box for use as a fixture.
    ///
    /// The leading big-endian size field is computed from the bytes actually
    /// emitted (16-byte fixed part plus 4 bytes per compatible brand), so the
    /// declared box size can never disagree with the fixture length.
    fn ftyp_box(major_brand: &[u8; 4], compatible_brands: &[&[u8; 4]]) -> Vec<u8> {
        let total_len = 16 + 4 * compatible_brands.len();
        let declared_size =
            u32::try_from(total_len).expect("ftyp fixture fits in a u32 size field");

        let mut data = Vec::with_capacity(total_len);
        data.extend_from_slice(&declared_size.to_be_bytes()); // Box size (bytes 0-3)
        data.extend_from_slice(b"ftyp"); // Box type (bytes 4-7)
        data.extend_from_slice(major_brand); // Major brand (bytes 8-11)
        data.extend_from_slice(&[0x00; 4]); // Minor version (bytes 12-15)
        for brand in compatible_brands {
            data.extend_from_slice(*brand); // Compatible brand
        }
        data
    }

    /// Builds the first 16 bytes of a RIFF file for use as a fixture.
    ///
    /// `declared_size` is written verbatim as the little-endian size field.
    /// The fixtures are truncated headers, so this value stands in for the
    /// size of a full file rather than describing the fixture's own length.
    fn riff_header(declared_size: u32, form_type: &[u8; 4], first_chunk_id: &[u8; 4]) -> Vec<u8> {
        let mut data = Vec::with_capacity(16);
        data.extend_from_slice(b"RIFF"); // RIFF magic (bytes 0-3)
        data.extend_from_slice(&declared_size.to_le_bytes()); // File size - 8 (bytes 4-7)
        data.extend_from_slice(form_type); // Format identifier (bytes 8-11)
        data.extend_from_slice(first_chunk_id); // Start of first chunk header (bytes 12-15)
        data
    }

    /// Lower-case extensions map to their canonical upper-case type names.
    #[test]
    fn test_extension_normalization() {
        let detector = FileTypeDetector::new();

        assert_eq!(detector.normalize_extension("tif"), "TIFF");
        assert_eq!(detector.normalize_extension("jpg"), "JPEG");
        assert_eq!(detector.normalize_extension("png"), "PNG");
    }

    /// A `.jpg` file starting with the JPEG signature is reported as JPEG.
    #[test]
    fn test_jpeg_detection() {
        let detector = FileTypeDetector::new();
        let path = Path::new("test.jpg");

        // JPEG magic number: \xff\xd8\xff
        let jpeg_data = vec![0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10];
        let mut cursor = Cursor::new(jpeg_data);

        let result = detector.detect_file_type(path, &mut cursor).unwrap();
        assert_eq!(result.file_type, "JPEG");
        assert_eq!(result.format, "JPEG");
        assert_eq!(result.mime_type, "image/jpeg");
    }

    /// A `.png` file starting with the PNG signature is reported as PNG.
    #[test]
    fn test_png_detection() {
        let detector = FileTypeDetector::new();
        let path = Path::new("test.png");

        // PNG magic number: \x89PNG\r\n\x1a\n
        let png_data = vec![0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a];
        let mut cursor = Cursor::new(png_data);

        let result = detector.detect_file_type(path, &mut cursor).unwrap();
        assert_eq!(result.file_type, "PNG");
        assert_eq!(result.format, "PNG");
        assert_eq!(result.mime_type, "image/png");
    }

    /// The `.tif` extension alias resolves to TIFF for little-endian TIFF data.
    #[test]
    fn test_tiff_extension_alias() {
        let detector = FileTypeDetector::new();
        let path = Path::new("test.tif");

        // TIFF magic number: II*\0 (little endian)
        let tiff_data = vec![0x49, 0x49, 0x2a, 0x00];
        let mut cursor = Cursor::new(tiff_data);

        let result = detector.detect_file_type(path, &mut cursor).unwrap();
        assert_eq!(result.file_type, "TIFF");
        assert_eq!(result.format, "TIFF");
        assert_eq!(result.mime_type, "image/tiff");
    }

    /// A JPEG signature that is not at offset 0 is still recovered when the
    /// extension gives no usable hint.
    #[test]
    fn test_embedded_jpeg_recovery() {
        let detector = FileTypeDetector::new();
        // Use a filename with unknown extension to trigger embedded signature scan
        let path = Path::new("unknown.xyz");

        // Unknown header followed by JPEG signature
        let mut data = vec![0x00, 0x01, 0x02, 0x03]; // Unknown header
        data.extend_from_slice(&[0xff, 0xd8, 0xff]); // JPEG signature
        let mut cursor = Cursor::new(data);

        let result = detector.detect_file_type(path, &mut cursor).unwrap();
        assert_eq!(result.file_type, "JPEG");
    }

    /// A `.mp3` file beginning with an ID3 tag is reported as MP3.
    #[test]
    fn test_weak_magic_mp3() {
        let detector = FileTypeDetector::new();
        let path = Path::new("test.mp3");

        // MP3 has weak magic, should rely on extension
        let mp3_data = vec![0x49, 0x44, 0x33]; // ID3 tag (valid MP3 start)
        let mut cursor = Cursor::new(mp3_data);

        let result = detector.detect_file_type(path, &mut cursor).unwrap();
        assert_eq!(result.file_type, "MP3");
        assert_eq!(result.mime_type, "audio/mpeg");
    }

    /// An unrecognized extension combined with unrecognized content yields
    /// `FileDetectionError::UnknownFileType`.
    #[test]
    fn test_unknown_file_type() {
        let detector = FileTypeDetector::new();
        let path = Path::new("test.unknown");

        let unknown_data = vec![0x00, 0x01, 0x02, 0x03];
        let mut cursor = Cursor::new(unknown_data);

        let result = detector.detect_file_type(path, &mut cursor);
        assert!(matches!(result, Err(FileDetectionError::UnknownFileType)));
    }

    /// An `ftyp` box with major brand `heic` is reported as HEIC in a MOV
    /// container.
    #[test]
    fn test_heic_detection() {
        let detector = FileTypeDetector::new();
        let path = Path::new("test.heic");

        // HEIC file with ftyp box (40 bytes) and heic major brand
        let heic_data = ftyp_box(
            b"heic",
            &[b"mif1", b"MiHE", b"MiPr", b"miaf", b"MiHB", b"heic"],
        );
        let mut cursor = Cursor::new(heic_data);

        let result = detector.detect_file_type(path, &mut cursor).unwrap();
        assert_eq!(result.file_type, "HEIC");
        assert_eq!(result.format, "MOV");
        assert_eq!(result.mime_type, "image/heic");
        assert_eq!(
            result.description,
            "High Efficiency Image Format still image"
        );
    }

    /// An AVI-form RIFF header with a `.avi` name is reported as AVI.
    #[test]
    fn test_avi_riff_detection() {
        let detector = FileTypeDetector::new();
        let path = Path::new("test.avi");

        // AVI RIFF header: RIFF + size + "AVI " format identifier + LIST chunk start
        let mut cursor = Cursor::new(riff_header(0x24, b"AVI ", b"LIST"));

        let result = detector.detect_file_type(path, &mut cursor).unwrap();
        assert_eq!(result.file_type, "AVI");
        assert_eq!(result.format, "RIFF");
        assert_eq!(result.mime_type, "video/x-msvideo");
    }

    /// A WAVE-form RIFF header with a `.wav` name is reported as WAV.
    #[test]
    fn test_wav_riff_detection() {
        let detector = FileTypeDetector::new();
        let path = Path::new("test.wav");

        // WAV RIFF header: RIFF + size + "WAVE" format identifier + fmt chunk start
        let mut cursor = Cursor::new(riff_header(0x24, b"WAVE", b"fmt "));

        let result = detector.detect_file_type(path, &mut cursor).unwrap();
        assert_eq!(result.file_type, "WAV");
        assert_eq!(result.format, "RIFF");
        assert_eq!(result.mime_type, "audio/x-wav");
    }

    /// A WEBP-form RIFF header with a `.webp` name is reported as WEBP.
    #[test]
    fn test_webp_riff_detection() {
        let detector = FileTypeDetector::new();
        let path = Path::new("test.webp");

        // WebP RIFF header: RIFF + size + "WEBP" format identifier + VP8 chunk header
        let mut cursor = Cursor::new(riff_header(0x20, b"WEBP", b"VP8 "));

        let result = detector.detect_file_type(path, &mut cursor).unwrap();
        assert_eq!(result.file_type, "WEBP");
        assert_eq!(result.format, "RIFF");
        assert_eq!(result.mime_type, "image/webp");
    }

    /// A `.heic` name does not override the content: an `ftyp` box whose major
    /// brand is `mif1` is reported as HEIF even though `heic` is listed as a
    /// compatible brand.
    #[test]
    fn test_heic_extension_detection() {
        let detector = FileTypeDetector::new();
        let path = Path::new("test.heic");

        // ftyp box with mif1 (HEIF) major brand and mif1/heic compatible brands
        let heic_data = ftyp_box(b"mif1", &[b"mif1", b"heic"]);
        let mut cursor = Cursor::new(heic_data);

        let result = detector
            .detect_file_type(path, &mut cursor)
            .unwrap_or_else(|e| panic!("Failed to detect HEIC file: {e:?}"));

        // Should detect as HEIF due to mif1 brand
        assert_eq!(result.file_type, "HEIF");
        assert_eq!(result.format, "MOV");
        assert_eq!(result.mime_type, "image/heif");
    }

    /// RIFF content wins over a conflicting extension: WAVE data named `.avi`
    /// is reported as WAV.
    #[test]
    fn test_riff_format_content_detection() {
        let detector = FileTypeDetector::new();
        let path = Path::new("test.avi"); // AVI extension

        // But contains WAV data - should detect as WAV based on content
        // Following ExifTool's behavior: content takes precedence over extension
        let mut cursor = Cursor::new(riff_header(0x24, b"WAVE", b"fmt "));

        // Should detect as WAV based on content, following ExifTool's behavior
        let detection = detector
            .detect_file_type(path, &mut cursor)
            .unwrap_or_else(|e| panic!("Expected WAV detection but got error: {e:?}"));

        assert_eq!(detection.file_type, "WAV");
        assert_eq!(detection.format, "RIFF");
        assert_eq!(detection.mime_type, "audio/x-wav");
    }
}