exif_oxide/
file_detection.rs

1//! File type detection engine following ExifTool's implementation
2//!
3//! This module implements ExifTool's sophisticated multi-tiered file type detection
4//! approach, ported from ExifTool.pm:2913-2999
5//!
6//! Detection Flow:
7//! 1. Extension-based candidates (via generated fileTypeLookup)
8//! 2. Magic number validation (via generated magicNumber patterns)
9//! 3. Last-ditch embedded signature recovery
10//!
11//! The implementation preserves ExifTool's exact logic including:
12//! - Weak magic types that defer to extension
13//! - Extension normalization rules
14//! - Conflict resolution patterns
15//! - Error recovery mechanisms
16
17use crate::generated::ExifTool_pm::lookup_mime_types;
18use std::io::{Read, Seek};
19use std::path::Path;
20
/// Maximum number of bytes read from the input for magic number testing.
/// `detect_file_type` sizes its test buffer with this value.
/// ExifTool uses exactly 1024 bytes - ExifTool.pm:2955
const MAGIC_TEST_BUFFER_SIZE: usize = 1024;
24
/// File types with weak magic numbers that defer to extension detection.
/// A candidate listed here is accepted by `detect_file_type` without running
/// magic number validation, unless an earlier candidate already matched.
/// ExifTool.pm:1030 - only MP3 is marked as weak magic: my %weakMagic = ( MP3 => 1 );
const WEAK_MAGIC_TYPES: &[&str] = &["MP3"];
28
// All magic number patterns are now generated from ExifTool.pm %magicNumber hash
// See src/generated/file_types/magic_numbers.rs for the complete patterns
// No hardcoded patterns are needed here - validate_magic_number() goes through the
// generated matches_magic_number() helper for all pattern-based detection
32
/// Outcome of a successful file type detection.
///
/// All fields are owned strings, so the type supports full equality (`Eq`)
/// in addition to `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileTypeDetectionResult {
    /// Detected file type (e.g., "JPEG", "PNG", "CR2")
    pub file_type: String,
    /// Primary format for processing (e.g., "JPEG", "TIFF", "MOV")
    pub format: String,
    /// MIME type string
    pub mime_type: String,
    /// Human-readable description
    pub description: String,
}
44
/// Errors that can occur while detecting a file's type.
#[derive(Debug)]
pub enum FileDetectionError {
    /// File type could not be determined
    UnknownFileType,
    /// IO error reading file
    IoError(std::io::Error),
    /// Invalid file path
    InvalidPath,
}

/// Lets `?` convert I/O failures into [`FileDetectionError::IoError`].
impl From<std::io::Error> for FileDetectionError {
    fn from(error: std::io::Error) -> Self {
        FileDetectionError::IoError(error)
    }
}

/// User-facing messages for each error variant.
impl std::fmt::Display for FileDetectionError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            FileDetectionError::UnknownFileType => write!(f, "Unknown file type"),
            FileDetectionError::IoError(e) => write!(f, "IO error: {e}"),
            FileDetectionError::InvalidPath => write!(f, "Invalid file path"),
        }
    }
}

impl std::error::Error for FileDetectionError {
    /// Expose the wrapped I/O error so callers walking the error chain
    /// can reach the underlying cause; the other variants have no cause.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            FileDetectionError::IoError(e) => Some(e),
            FileDetectionError::UnknownFileType | FileDetectionError::InvalidPath => None,
        }
    }
}
72
/// Main file type detector implementing ExifTool's detection algorithm
///
/// The detector is a stateless unit struct, so it is trivially copyable and
/// debuggable.
#[derive(Debug, Clone, Copy)]
pub struct FileTypeDetector;
75
76impl FileTypeDetector {
77    /// Create a new file type detector
78    pub fn new() -> Self {
79        Self
80    }
81
82    /// Detect file type from path and file content
83    ///
84    /// Implements ExifTool's detection flow from ExifTool.pm:2913-2999
85    pub fn detect_file_type<R: Read + Seek>(
86        &self,
87        path: &Path,
88        reader: &mut R,
89    ) -> Result<FileTypeDetectionResult, FileDetectionError> {
90        // Phase 1: Get extension-based candidates
91        // ExifTool.pm:2940 - GetFileType($filename)
92        let candidates = self.get_candidates_from_extension(path)?;
93
94        // Phase 2: Read test buffer for magic number validation
95        // ExifTool.pm:2955 - Read($raf, $buff, $testLen)
96        let mut buffer = vec![0u8; MAGIC_TEST_BUFFER_SIZE];
97        let bytes_read = reader.read(&mut buffer)?;
98        buffer.truncate(bytes_read);
99
100        // Reset reader position for subsequent processing
101        // This is critical so format-specific processors start at the beginning
102        reader.seek(std::io::SeekFrom::Start(0))?;
103
104        // Phase 3: Magic number validation against candidates
105        // ExifTool.pm:2960-2975 - Test candidates against magic numbers
106        // CRITICAL: Test all candidates before giving up, per TRUST-EXIFTOOL.md
107        let mut matched_type = None;
108        let mut recognized_ext = None;
109
110        for candidate in &candidates {
111            // Check if this is a weak magic type that defers to extension
112            if WEAK_MAGIC_TYPES.contains(&candidate.as_str()) {
113                // Weak magic types are fallback only if no strong magic matches
114                // ExifTool.pm:2970 - "next if $weakMagic{$type} and defined $recognizedExt"
115                if matched_type.is_none() {
116                    matched_type = Some(candidate.clone());
117                }
118                continue;
119            }
120
121            // validate_magic_number now checks both file type and format patterns
122            if self.validate_magic_number(candidate, &buffer) {
123                // Strong magic match - use this type
124                matched_type = Some(candidate.clone());
125                break;
126            }
127
128            // ExifTool behavior: Keep track of recognized extensions with modules
129            // Even if magic pattern fails, ExifTool may still process the file
130            // if it has a module defined (like JXL -> Jpeg2000)
131            if recognized_ext.is_none() && self.has_processing_module(candidate) {
132                recognized_ext = Some(candidate.clone());
133            }
134        }
135
136        // If no magic match but we have a recognized extension with a module,
137        // use that as fallback (mimics ExifTool's behavior for JXL and others)
138        if matched_type.is_none() && recognized_ext.is_some() {
139            matched_type = recognized_ext;
140        }
141
142        if let Some(file_type) = matched_type {
143            // Special handling for MOV format to determine specific subtype
144            // ExifTool QuickTime.pm:9868-9877 - ftyp brand determines actual file type
145            // CRITICAL: Check against the format, not the file type
146            use crate::generated::file_types::resolve_file_type;
147            let format = if let Some((formats, _)) = resolve_file_type(&file_type) {
148                formats[0]
149            } else {
150                &file_type
151            };
152
153            let detected_type = if format == "MOV" {
154                self.determine_mov_subtype(&buffer)
155                    .unwrap_or_else(|| file_type.clone())
156            } else if self.is_riff_based_format(&file_type) {
157                // For RIFF-based formats, detect the actual type from the header
158                // ExifTool RIFF.pm:2038-2039 - Sets file type based on RIFF format identifier
159                self.detect_riff_type(&buffer)
160                    .unwrap_or_else(|| file_type.clone())
161            } else if file_type == "NEF" || file_type == "NRW" {
162                // NEF/NRW correction based on content analysis
163                // ExifTool Exif.pm distinguishes based on compression and linearization table
164                self.correct_nef_nrw_type(&file_type, &buffer)
165                    .unwrap_or_else(|| file_type.clone())
166            } else {
167                file_type
168            };
169            return self.build_result(&detected_type, path);
170        }
171
172        // Phase 4: Last-ditch recovery - scan for embedded signatures
173        // ExifTool.pm:2976-2983 - Look for JPEG/TIFF embedded in unknown data
174        if let Some(embedded_type) = self.scan_for_embedded_signatures(&buffer) {
175            return self.build_result(&embedded_type, path);
176        }
177
178        Err(FileDetectionError::UnknownFileType)
179    }
180
181    /// Get file type candidates based on file extension
182    /// ExifTool equivalent: GetFileType() in ExifTool.pm:9010-9050
183    fn get_candidates_from_extension(
184        &self,
185        path: &Path,
186    ) -> Result<Vec<String>, FileDetectionError> {
187        let extension = path
188            .extension()
189            .and_then(|ext| ext.to_str())
190            .ok_or(FileDetectionError::InvalidPath)?;
191
192        // Normalize extension to uppercase (ExifTool convention)
193        let normalized_ext = self.normalize_extension(extension);
194
195        // Resolve through fileTypeLookup with alias following
196        // ExifTool.pm:258-404 %fileTypeLookup hash defines extension mappings
197        use crate::generated::file_types::resolve_file_type;
198
199        // Check if this extension is known to ExifTool
200        let is_known_extension = resolve_file_type(&normalized_ext).is_some();
201
202        // For HEIC/HEIF, we need special handling
203        // Even if not in the generated lookup, these are valid extensions
204        let is_heif_extension = matches!(normalized_ext.as_str(), "HEIC" | "HEIF" | "HIF");
205
206        if is_known_extension || is_heif_extension {
207            // For most formats, the extension itself is the file type candidate
208            // The formats array tells us what processing module to use, not the file type
209            // ExifTool.pm:2940-2950 - GetFileType returns the extension-based type
210
211            // Special case: Some extensions are aliases that should map to a different type
212            // These are hardcoded in ExifTool.pm GetFileType()
213            match normalized_ext.as_str() {
214                "3GP2" => Ok(vec!["3G2".to_string()]), // ExifTool.pm alias
215                "MTS" => Ok(vec!["M2TS".to_string()]), // ExifTool.pm alias
216                // HEIC/HEIF/HIF extensions should use MOV format for detection
217                // ExifTool QuickTime.pm handles these as MOV-based formats
218                "HEIC" | "HEIF" | "HIF" => Ok(vec!["MOV".to_string()]),
219                _ => Ok(vec![normalized_ext.clone()]), // Use the extension as the type
220            }
221        } else {
222            // Unknown extension - return normalized extension as candidate
223            Ok(vec![normalized_ext])
224        }
225    }
226
227    /// Normalize file extension following ExifTool's rules
228    /// ExifTool equivalent: GetFileExtension() in ExifTool.pm:9013-9040
229    fn normalize_extension(&self, extension: &str) -> String {
230        let upper_ext = extension.to_uppercase();
231
232        // ExifTool hardcoded extension conversions
233        // These are critical for consistency - TRUST-EXIFTOOL
234        match upper_ext.as_str() {
235            "TIF" => "TIFF".to_string(), // ExifTool.pm:9019 - hardcoded for TIFF consistency
236            "JPG" => "JPEG".to_string(),
237            "3GP2" => "3G2".to_string(),
238            "AIF" => "AIFF".to_string(),
239            _ => upper_ext,
240        }
241    }
242
243    /// Convert Perl regex pattern to Rust regex pattern
244    /// ExifTool patterns use Perl syntax that needs conversion for Rust regex crate
245    #[allow(dead_code)]
246    fn convert_perl_pattern_to_rust(&self, pattern: &str) -> String {
247        // Convert common Perl regex patterns to Rust-compatible patterns
248        // These conversions preserve ExifTool's exact matching behavior
249
250        let mut rust_pattern = pattern.to_string();
251
252        // Handle null bytes and their quantifiers
253        // \0 -> \x00, \0{3} -> \x00{3}, \0{0,3} -> \x00{0,3}
254        rust_pattern = rust_pattern.replace("\\0", "\\x00");
255
256        // Handle common escape sequences
257        rust_pattern = rust_pattern.replace("\\r", "\\x0d");
258        rust_pattern = rust_pattern.replace("\\n", "\\x0a");
259        rust_pattern = rust_pattern.replace("\\t", "\\x09");
260
261        // Handle Unicode characters by converting to byte sequences
262        // For BPG pattern "BPGû" - convert û (U+00FB) to \xfb
263        if rust_pattern.contains('û') {
264            rust_pattern = rust_pattern.replace('û', "\\xfb");
265        }
266
267        // Handle other common Unicode/extended ASCII characters
268        rust_pattern = rust_pattern.replace('é', "\\xe9");
269        rust_pattern = rust_pattern.replace('ñ', "\\xf1");
270
271        // Fix character classes with hex values - ensure proper escaping
272        // These are already mostly correct for Rust regex
273
274        // Handle dot patterns in specific contexts
275        // For JXL pattern, dots should match any byte in binary context
276        // This is already correct as . matches any byte in bytes regex
277
278        rust_pattern
279    }
280
281    // REMOVED: match_binary_magic_pattern function - replaced with generated patterns from ExifTool
282    // All magic number validation now uses generated patterns from ExifTool.pm %magicNumber hash
283
284    /// Validate magic number for a file type candidate
285    /// ExifTool equivalent: magic number testing in ExifTool.pm:2960-2975
286    /// CRITICAL: Must match ExifTool's exact logic per TRUST-EXIFTOOL.md
287    fn validate_magic_number(&self, file_type: &str, buffer: &[u8]) -> bool {
288        // Special handling for RIFF-based formats (AVI, WAV, WEBP, etc.)
289        // ExifTool RIFF.pm:2037-2046 - RIFF container detection with format analysis
290        if self.is_riff_based_format(file_type) {
291            return self.validate_riff_format(file_type, buffer);
292        }
293
294        // Special handling for TIFF-based RAW formats that need deeper analysis
295        // ExifTool.pm:8531-8612 - DoProcessTIFF() RAW format detection
296        if self.is_tiff_based_raw_format(file_type) {
297            return self.validate_tiff_raw_format(file_type, buffer);
298        }
299
300        // Use generated magic number patterns from ExifTool's %magicNumber hash
301        // ExifTool.pm:912-1027 - patterns extracted and compiled as regex::bytes::Regex
302        use crate::generated::file_types::{
303            magic_number_patterns::matches_magic_number, resolve_file_type,
304        };
305
306        // First try to match against the file type itself
307        if matches_magic_number(file_type, buffer) {
308            return true;
309        }
310
311        // If no direct match, check if this file type has a format that has magic patterns
312        // ExifTool uses the format (MOV, TIFF, etc.) for magic pattern matching
313        if let Some((formats, _desc)) = resolve_file_type(file_type) {
314            // Try magic pattern for the primary format
315            if matches_magic_number(formats[0], buffer) {
316                return true;
317            }
318        }
319
320        false
321    }
322
323    /// Detect actual RIFF format type from buffer
324    /// ExifTool RIFF.pm:2037-2046 - Detects specific RIFF variant
325    fn detect_riff_type(&self, buffer: &[u8]) -> Option<String> {
326        // Need at least 12 bytes for RIFF header analysis
327        if buffer.len() < 12 {
328            return None;
329        }
330
331        // Extract RIFF magic signature (bytes 0-3) and format identifier (bytes 8-11)
332        let magic = &buffer[0..4];
333        let format_id = &buffer[8..12];
334
335        // Check RIFF magic signature first
336        // ExifTool RIFF.pm:2040 - "if ($buff =~ /^(RIFF|RF64)....(.{4})/s)"
337        let is_riff = magic == b"RIFF" || magic == b"RF64";
338        if !is_riff {
339            // Check for obscure lossless audio variants
340            // ExifTool RIFF.pm:2044 - "return 0 unless $buff =~ /^(LA0[234]|OFR |LPAC|wvpk)/"
341            let is_audio_variant = magic == b"LA02"
342                || magic == b"LA03"
343                || magic == b"LA04"
344                || magic == b"OFR "
345                || magic == b"LPAC"
346                || magic == b"wvpk";
347            if !is_audio_variant {
348                return None;
349            }
350        }
351
352        // Map format identifier to file type using ExifTool's riffType mapping
353        // ExifTool RIFF.pm:49-53 - %riffType hash
354        match format_id {
355            b"WAVE" => Some("WAV".to_string()),
356            b"AVI " => Some("AVI".to_string()), // Note: AVI has trailing space
357            b"WEBP" => Some("WEBP".to_string()),
358            b"LA02" | b"LA03" | b"LA04" => Some("LA".to_string()),
359            b"OFR " => Some("OFR".to_string()),
360            b"LPAC" => Some("PAC".to_string()),
361            b"wvpk" => Some("WV".to_string()),
362            _ => Some("RIFF".to_string()), // Unknown RIFF format
363        }
364    }
365
366    /// Check if a file type is based on RIFF container format
367    /// ExifTool maps these extensions to RIFF format processing
368    fn is_riff_based_format(&self, file_type: &str) -> bool {
369        // Check against ExifTool's fileTypeLookup - formats that map to RIFF
370        // From file_type_lookup.rs analysis
371        matches!(
372            file_type,
373            "AVI" | "WAV" | "WEBP" | "LA" | "OFR" | "PAC" | "WV"
374        )
375    }
376
377    /// Check if a file type is a TIFF-based RAW format requiring deeper analysis
378    /// ExifTool.pm:8531-8612 - RAW formats detected in DoProcessTIFF()
379    fn is_tiff_based_raw_format(&self, file_type: &str) -> bool {
380        // RAW formats that use TIFF structure but need specific detection
381        // Based on ExifTool's DoProcessTIFF() implementation
382        // Note: CR3 is MOV-based, MRW has its own magic number pattern
383        matches!(
384            file_type,
385            "CR2" | "NEF" | "NRW" | "RW2" | "RWL" | "ARW" | "DNG" | "ORF" | "IIQ" | "3FR"
386        )
387    }
388
389    /// Validate RIFF container and detect specific format
390    /// ExifTool equivalent: RIFF.pm:2037-2046 ProcessRIFF()
391    /// CRITICAL: Follows ExifTool's exact RIFF detection logic
392    fn validate_riff_format(&self, expected_type: &str, buffer: &[u8]) -> bool {
393        // Need at least 12 bytes for RIFF header analysis
394        // ExifTool RIFF.pm:2039 - "return 0 unless $raf->Read($buff, 12) == 12;"
395        if buffer.len() < 12 {
396            return false;
397        }
398
399        // Extract RIFF magic signature (bytes 0-3) and format identifier (bytes 8-11)
400        let magic = &buffer[0..4];
401        let format_id = &buffer[8..12];
402
403        // Check RIFF magic signature first
404        // ExifTool RIFF.pm:2040 - "if ($buff =~ /^(RIFF|RF64)....(.{4})/s)"
405        let is_riff = magic == b"RIFF" || magic == b"RF64";
406        if !is_riff {
407            // Check for obscure lossless audio variants
408            // ExifTool RIFF.pm:2044 - "return 0 unless $buff =~ /^(LA0[234]|OFR |LPAC|wvpk)/"
409            let is_audio_variant = magic == b"LA02"
410                || magic == b"LA03"
411                || magic == b"LA04"
412                || magic == b"OFR "
413                || magic == b"LPAC"
414                || magic == b"wvpk";
415            if !is_audio_variant {
416                return false;
417            }
418        }
419
420        // Map format identifier to file type using ExifTool's riffType mapping
421        // ExifTool RIFF.pm:49-53 - %riffType hash
422        let detected_type = match format_id {
423            b"WAVE" => "WAV",
424            b"AVI " => "AVI", // Note: AVI has trailing space
425            b"WEBP" => "WEBP",
426            b"LA02" | b"LA03" | b"LA04" => "LA",
427            b"OFR " => "OFR",
428            b"LPAC" => "PAC",
429            b"wvpk" => "WV",
430            _ => {
431                // Unknown RIFF format - be conservative and allow generic RIFF detection
432                // This matches ExifTool's behavior of processing unknown RIFF types
433                return expected_type == "RIFF";
434            }
435        };
436
437        // Check if detected type matches expected type
438        expected_type == detected_type
439    }
440
441    /// Validate TIFF-based RAW format with specific signature detection
442    /// ExifTool equivalent: DoProcessTIFF() in ExifTool.pm:8531-8612
443    /// CRITICAL: Follows ExifTool's exact RAW format detection logic
444    fn validate_tiff_raw_format(&self, file_type: &str, buffer: &[u8]) -> bool {
445        // Need at least 16 bytes for TIFF header + potential signatures
446        if buffer.len() < 16 {
447            return false;
448        }
449
450        // First check basic TIFF magic number
451        if !buffer.starts_with(b"II") && !buffer.starts_with(b"MM") {
452            return false;
453        }
454
455        // CRITICAL: CR3 is MOV-based, not TIFF-based! Check for MOV signature first
456        // ExifTool.pm - CR3 uses QuickTime.pm not TIFF processing
457        if file_type == "CR3" && buffer.len() >= 12 && &buffer[4..8] == b"ftyp" {
458            // This is a MOV-based file, not TIFF - return false to prevent TIFF processing
459            return false;
460        }
461
462        // Extract byte order and TIFF identifier
463        let little_endian = buffer.starts_with(b"II");
464        let identifier = if little_endian {
465            u16::from_le_bytes([buffer[2], buffer[3]])
466        } else {
467            u16::from_be_bytes([buffer[2], buffer[3]])
468        };
469
470        // Extract IFD offset
471        let ifd_offset = if little_endian {
472            u32::from_le_bytes([buffer[4], buffer[5], buffer[6], buffer[7]])
473        } else {
474            u32::from_be_bytes([buffer[4], buffer[5], buffer[6], buffer[7]])
475        } as usize;
476
477        // Apply ExifTool's RAW format detection logic
478        match file_type {
479            "CR2" => {
480                // CR2 detection: ExifTool.pm:8534-8542
481                // identifier == 0x2a and offset >= 16, check for CR\x02\0 signature at offset 8
482                if identifier == 0x2a && ifd_offset >= 16 && buffer.len() >= 12 {
483                    let sig = &buffer[8..12]; // CR2 signature is at offset 8, not at IFD offset
484                    sig == b"CR\x02\0" || sig == b"\xba\xb0\xac\xbb"
485                } else {
486                    false
487                }
488            }
489            "RW2" | "RWL" => {
490                // RW2 detection: ExifTool.pm:8544-8550
491                // identifier == 0x55 and specific magic signature at offset 8
492                if identifier == 0x55 && ifd_offset >= 0x18 && buffer.len() >= 0x18 {
493                    let magic_signature = &buffer[0x08..0x18]; // Magic signature is at offset 8, not 0x18
494                    magic_signature
495                        == b"\x88\xe7\x74\xd8\xf8\x25\x1d\x4d\x94\x7a\x6e\x77\x82\x2b\x5d\x6a"
496                } else {
497                    false
498                }
499            }
500            "ORF" => {
501                // ORF detection: ExifTool.pm:8552-8555
502                // identifier == 0x4f52 or 0x5352 (Olympus specific)
503                identifier == 0x4f52 || identifier == 0x5352
504            }
505            "NEF" | "NRW" => {
506                // NEF/NRW detection: ExifTool uses content analysis to distinguish
507                // ExifTool Exif.pm: NRW has JPEG compression in IFD0, NEF has linearization table
508                if identifier == 0x2a {
509                    // Valid TIFF structure, now check content to distinguish NEF from NRW
510                    use crate::tiff_utils::{read_tiff_ifd0_info, COMPRESSION_JPEG};
511                    use std::io::Cursor;
512
513                    let mut cursor = Cursor::new(buffer);
514                    if let Some((compression, has_nef_linearization)) =
515                        read_tiff_ifd0_info(&mut cursor)
516                    {
517                        match file_type {
518                            "NEF" => {
519                                // If NEF file has JPEG compression in IFD0, it's actually NRW
520                                // ExifTool Exif.pm: "recognize NRW file from a JPEG-compressed thumbnail in IFD0"
521                                if compression == Some(COMPRESSION_JPEG) {
522                                    // This will be corrected to NRW in post-processing
523                                    true
524                                } else {
525                                    true // Valid NEF
526                                }
527                            }
528                            "NRW" => {
529                                // If NRW file has NEFLinearizationTable, it's actually NEF
530                                // ExifTool.pm: "fix NEF type if misidentified as NRW"
531                                if has_nef_linearization {
532                                    // This will be corrected to NEF in post-processing
533                                    true
534                                } else {
535                                    true // Valid NRW
536                                }
537                            }
538                            _ => false,
539                        }
540                    } else {
541                        // If we can't read IFD0, trust the extension
542                        true
543                    }
544                } else {
545                    false // Not even TIFF
546                }
547            }
548            "ARW" => {
549                // ARW detection: Standard TIFF structure (0x2a) but trust extension
550                // ExifTool confirms these based on Sony make/model, we trust the extension
551                identifier == 0x2a
552            }
553            "DNG" => {
554                // DNG detection: Standard TIFF structure (0x2a) but trust extension
555                // ExifTool confirms these based on DNGVersion tag, we trust the extension
556                identifier == 0x2a
557            }
558            "IIQ" => {
559                // IIQ detection: Standard TIFF structure (0x2a) but trust extension
560                // Phase One format, trust extension
561                identifier == 0x2a
562            }
563            "3FR" => {
564                // 3FR detection: Standard TIFF structure (0x2a) but trust extension
565                // Hasselblad format, trust extension
566                identifier == 0x2a
567            }
568            "MRW" => {
569                // MRW detection: Has its own magic number pattern in ExifTool
570                // Should be handled by magic number lookup, not here
571                false
572            }
573            "CR3" => {
574                // CR3 is MOV-based, not TIFF-based - should not reach here
575                // This case exists only for completeness - validate_tiff_raw_format checks MOV signature
576                false
577            }
578            _ => false,
579        }
580    }
581
582    /// Check if a file type has a processing module defined
583    /// This mimics ExifTool's %moduleName hash behavior
584    fn has_processing_module(&self, file_type: &str) -> bool {
585        // In ExifTool, having a module means it can be processed even without magic match
586        // Notable examples include JXL -> Jpeg2000 module
587        // We check if the file type has a defined format/processing path
588        use crate::generated::file_types::resolve_file_type;
589
590        // If resolve_file_type returns Some, it means ExifTool knows how to process this type
591        resolve_file_type(file_type).is_some()
592    }
593
594    /// Correct NEF/NRW type based on content analysis
595    /// ExifTool Exif.pm distinguishes based on compression and linearization table
596    fn correct_nef_nrw_type(&self, file_type: &str, buffer: &[u8]) -> Option<String> {
597        use crate::tiff_utils::{read_tiff_ifd0_info, COMPRESSION_JPEG};
598        use std::io::Cursor;
599
600        let mut cursor = Cursor::new(buffer);
601        if let Some((compression, has_nef_linearization)) = read_tiff_ifd0_info(&mut cursor) {
602            match file_type {
603                "NEF" => {
604                    // ExifTool Exif.pm: "recognize NRW file from a JPEG-compressed thumbnail in IFD0"
605                    if compression == Some(COMPRESSION_JPEG) {
606                        Some("NRW".to_string()) // NEF with JPEG compression is actually NRW
607                    } else {
608                        None // Keep as NEF
609                    }
610                }
611                "NRW" => {
612                    // ExifTool.pm: "fix NEF type if misidentified as NRW"
613                    if has_nef_linearization {
614                        Some("NEF".to_string()) // NRW with linearization table is actually NEF
615                    } else {
616                        None // Keep as NRW
617                    }
618                }
619                _ => None,
620            }
621        } else {
622            None // Can't determine, keep original
623        }
624    }
625
626    /// Last-ditch scan for embedded JPEG/TIFF signatures
627    /// ExifTool equivalent: ExifTool.pm:2976-2983
628    fn scan_for_embedded_signatures(&self, buffer: &[u8]) -> Option<String> {
629        // Look for JPEG signature: \xff\xd8\xff
630        if let Some(pos) = buffer.windows(3).position(|w| w == b"\xff\xd8\xff") {
631            if pos > 0 {
632                eprintln!("Warning: Processing JPEG-like data after unknown {pos}-byte header");
633            }
634            return Some("JPEG".to_string());
635        }
636
637        // Look for TIFF signatures: II*\0 or MM\0*
638        if let Some(pos) = buffer
639            .windows(4)
640            .position(|w| w == b"II*\0" || w == b"MM\0*")
641        {
642            if pos > 0 {
643                eprintln!("Warning: Processing TIFF-like data after unknown {pos}-byte header");
644            }
645            return Some("TIFF".to_string());
646        }
647
648        None
649    }
650
651    /// Determine specific file type for MOV/MP4 containers based on ftyp brand
652    /// ExifTool equivalent: QuickTime.pm:9868-9877 ftyp brand detection
653    fn determine_mov_subtype(&self, buffer: &[u8]) -> Option<String> {
654        // Need at least 12 bytes for ftyp atom structure
655        if buffer.len() >= 12 && &buffer[4..8] == b"ftyp" {
656            let brand = &buffer[8..12];
657            // Map ftyp brand to specific file type
658            // ExifTool QuickTime.pm:227-232 - %ftypLookup entries
659            match brand {
660                b"heic" | b"hevc" => Some("HEIC".to_string()),
661                b"mif1" | b"msf1" | b"heix" => Some("HEIF".to_string()),
662                b"avif" => Some("AVIF".to_string()),
663                b"crx " => Some("CR3".to_string()), // Canon RAW 3 format
664                // Common MP4 brands
665                b"mp41" | b"mp42" | b"mp4v" | b"isom" | b"M4A " | b"M4V " | b"dash" | b"avc1" => {
666                    Some("MP4".to_string())
667                }
668                _ => None, // Keep as MOV for other brands
669            }
670        } else {
671            None
672        }
673    }
674
675    /// Validate XMP pattern: \0{0,3}(\xfe\xff|\xff\xfe|\xef\xbb\xbf)?\0{0,3}\s*<
676    /// ExifTool.pm:1018 - XMP files can start with optional BOM and null bytes, then whitespace, then '<'
677    #[allow(dead_code)]
678    fn validate_xmp_pattern(&self, buffer: &[u8]) -> bool {
679        if buffer.is_empty() {
680            return false;
681        }
682
683        let mut pos = 0;
684
685        // Skip up to 3 null bytes at the beginning
686        while pos < buffer.len() && pos < 3 && buffer[pos] == 0 {
687            pos += 1;
688        }
689
690        // Check for optional BOM (Byte Order Mark)
691        if pos + 3 <= buffer.len() {
692            // UTF-8 BOM: EF BB BF
693            if buffer[pos..pos + 3] == [0xef, 0xbb, 0xbf] {
694                pos += 3;
695            }
696        }
697        if pos + 2 <= buffer.len() {
698            // UTF-16 BE BOM: FE FF or UTF-16 LE BOM: FF FE
699            if buffer[pos..pos + 2] == [0xfe, 0xff] || buffer[pos..pos + 2] == [0xff, 0xfe] {
700                pos += 2;
701            }
702        }
703
704        // Skip up to 3 more null bytes after BOM
705        while pos < buffer.len() && pos < 6 && buffer[pos] == 0 {
706            pos += 1;
707        }
708
709        // Skip whitespace (space, tab, newline, carriage return)
710        while pos < buffer.len()
711            && (buffer[pos] == b' '
712                || buffer[pos] == b'\t'
713                || buffer[pos] == b'\n'
714                || buffer[pos] == b'\r')
715        {
716            pos += 1;
717        }
718
719        // Finally, check for '<' character
720        pos < buffer.len() && buffer[pos] == b'<'
721    }
722
723    /// Build final detection result from file type
724    pub fn build_result(
725        &self,
726        file_type: &str,
727        path: &Path,
728    ) -> Result<FileTypeDetectionResult, FileDetectionError> {
729        // Get primary format for processing
730        use crate::generated::file_types::resolve_file_type;
731        let (format, description) = if let Some((formats, desc)) = resolve_file_type(file_type) {
732            (formats[0].to_string(), desc.to_string())
733        } else {
734            (file_type.to_string(), format!("{file_type} file"))
735        };
736
737        // Get MIME type from generated lookup - try the file type first, then fallback, then the format
738        // This ensures file-type-specific MIME types take precedence over generic format MIME types
739        let mime_type = lookup_mime_types(file_type)
740            .or_else(|| self.get_fallback_mime_type(file_type))
741            .or_else(|| lookup_mime_types(&format))
742            .unwrap_or("application/octet-stream")
743            .to_string();
744
745        // Special case: ASF files with .wmv extension should use video/x-ms-wmv MIME type
746        // ExifTool.pm:9570-9592 SetFileType() applies extension-specific MIME types for ASF/WMV
747        // Reference: ExifTool.pm lines 557 (WMV->ASF mapping) and 816 (WMV MIME type)
748        let mime_type = if file_type == "ASF" {
749            if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
750                match ext.to_lowercase().as_str() {
751                    "wmv" => "video/x-ms-wmv".to_string(),
752                    _ => mime_type,
753                }
754            } else {
755                mime_type
756            }
757        } else {
758            mime_type
759        };
760
761        Ok(FileTypeDetectionResult {
762            file_type: file_type.to_string(),
763            format: format.to_string(),
764            mime_type,
765            description,
766        })
767    }
768
769    /// Get fallback MIME types for file types not covered by ExifTool's %mimeType hash
770    /// These are standard MIME types for common formats
771    fn get_fallback_mime_type(&self, file_type: &str) -> Option<&'static str> {
772        match file_type {
773            // Image formats
774            "JPEG" => Some("image/jpeg"),
775            "PNG" => Some("image/png"),
776            "TIFF" => Some("image/tiff"),
777            "GIF" => Some("image/gif"),
778            "BMP" => Some("image/bmp"),
779            "WEBP" => Some("image/webp"),
780            "HEIC" => Some("image/heic"), // HEIC gets its own MIME type
781            "HEIF" => Some("image/heif"), // High Efficiency Image Format (general)
782            "JP2" => Some("image/jp2"),   // JPEG 2000 Part 1 (ISO/IEC 15444-1)
783            "J2C" => Some("image/x-j2c"), // JPEG 2000 Code Stream
784
785            // Video formats
786            "AVI" => Some("video/x-msvideo"),
787            "3GP" => Some("video/3gpp"),     // 3GPP video format
788            "3G2" => Some("video/3gpp2"),    // 3GPP2 video format
789            "M4V" => Some("video/x-m4v"),    // Apple M4V video
790            "MTS" => Some("video/m2ts"),     // MPEG-2 Transport Stream (alias for M2TS)
791            "M2TS" => Some("video/m2ts"),    // MPEG-2 Transport Stream
792            "MP4" => Some("video/mp4"),      // MPEG-4 Part 14
793            "FLV" => Some("video/x-flv"),    // Flash Video
794            "WMV" => Some("video/x-ms-wmv"), // Windows Media Video
795            "ASF" => Some("video/x-ms-wmv"), // Advanced Systems Format (usually WMV)
796
797            // Audio formats
798            "WAV" => Some("audio/x-wav"), // WAV audio files
799
800            // Document formats
801            "XMP" => Some("application/rdf+xml"), // Extensible Metadata Platform
802            "PSD" => Some("application/vnd.adobe.photoshop"), // Adobe Photoshop Document
803            "EPS" => Some("application/postscript"), // Encapsulated PostScript
804
805            // Other common formats that might be missing
806            "RIFF" => Some("application/octet-stream"), // Generic RIFF container
807
808            _ => None,
809        }
810    }
811}
812
813impl Default for FileTypeDetector {
814    fn default() -> Self {
815        Self::new()
816    }
817}
818
819#[cfg(test)]
820mod mimetypes_validation;
821
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Concatenate byte chunks into a single in-memory reader starting at offset 0.
    fn reader_from(chunks: &[&[u8]]) -> Cursor<Vec<u8>> {
        Cursor::new(chunks.concat())
    }

    #[test]
    fn test_extension_normalization() {
        let detector = FileTypeDetector::new();

        for (extension, expected) in [("tif", "TIFF"), ("jpg", "JPEG"), ("png", "PNG")] {
            assert_eq!(detector.normalize_extension(extension), expected);
        }
    }

    #[test]
    fn test_jpeg_detection() {
        let detector = FileTypeDetector::new();

        // JPEG magic number: \xff\xd8\xff
        let mut input = reader_from(&[&[0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10]]);

        let detected = detector
            .detect_file_type(Path::new("test.jpg"), &mut input)
            .unwrap();
        assert_eq!(detected.file_type, "JPEG");
        assert_eq!(detected.format, "JPEG");
        assert_eq!(detected.mime_type, "image/jpeg");
    }

    #[test]
    fn test_png_detection() {
        let detector = FileTypeDetector::new();

        // PNG magic number: \x89PNG\r\n\x1a\n
        let mut input = reader_from(&[b"\x89PNG\r\n\x1a\n"]);

        let detected = detector
            .detect_file_type(Path::new("test.png"), &mut input)
            .unwrap();
        assert_eq!(detected.file_type, "PNG");
        assert_eq!(detected.format, "PNG");
        assert_eq!(detected.mime_type, "image/png");
    }

    #[test]
    fn test_tiff_extension_alias() {
        let detector = FileTypeDetector::new();

        // TIFF magic number: II*\0 (little endian)
        let mut input = reader_from(&[b"II*\0"]);

        let detected = detector
            .detect_file_type(Path::new("test.tif"), &mut input)
            .unwrap();
        assert_eq!(detected.file_type, "TIFF");
        assert_eq!(detected.format, "TIFF");
        assert_eq!(detected.mime_type, "image/tiff");
    }

    #[test]
    fn test_embedded_jpeg_recovery() {
        let detector = FileTypeDetector::new();

        // An unknown extension is used to trigger the embedded signature scan;
        // the data is an unknown 4-byte header followed by the JPEG signature.
        let mut input = reader_from(&[
            &[0x00, 0x01, 0x02, 0x03], // Unknown header
            &[0xff, 0xd8, 0xff],       // JPEG signature
        ]);

        let detected = detector
            .detect_file_type(Path::new("unknown.xyz"), &mut input)
            .unwrap();
        assert_eq!(detected.file_type, "JPEG");
    }

    #[test]
    fn test_weak_magic_mp3() {
        let detector = FileTypeDetector::new();

        // MP3 has weak magic, should rely on extension.
        // The data is an ID3 tag marker (valid MP3 start): 0x49 0x44 0x33.
        let mut input = reader_from(&[b"ID3"]);

        let detected = detector
            .detect_file_type(Path::new("test.mp3"), &mut input)
            .unwrap();
        assert_eq!(detected.file_type, "MP3");
        assert_eq!(detected.mime_type, "audio/mpeg");
    }

    #[test]
    fn test_unknown_file_type() {
        let detector = FileTypeDetector::new();

        let mut input = reader_from(&[&[0x00, 0x01, 0x02, 0x03]]);

        let outcome = detector.detect_file_type(Path::new("test.unknown"), &mut input);
        assert!(matches!(outcome, Err(FileDetectionError::UnknownFileType)));
    }

    #[test]
    fn test_heic_detection() {
        let detector = FileTypeDetector::new();

        // HEIC file with ftyp box and heic brand
        let mut input = reader_from(&[
            &[0x00, 0x00, 0x00, 0x28], // Box size (40 bytes)
            b"ftyp",                   // Box type (bytes 4-7)
            b"heic",                   // Major brand (bytes 8-11)
            &[0x00, 0x00, 0x00, 0x00], // Minor version
            b"mif1",                   // Compatible brand
            b"MiHE",                   // Compatible brand
            b"MiPr",                   // Compatible brand
            b"miaf",                   // Compatible brand
            b"MiHB",                   // Compatible brand
            b"heic",                   // Compatible brand
        ]);

        let detected = detector
            .detect_file_type(Path::new("test.heic"), &mut input)
            .unwrap();
        assert_eq!(detected.file_type, "HEIC");
        assert_eq!(detected.format, "MOV");
        assert_eq!(detected.mime_type, "image/heic");
        assert_eq!(
            detected.description,
            "High Efficiency Image Format still image"
        );
    }

    #[test]
    fn test_avi_riff_detection() {
        let detector = FileTypeDetector::new();

        // AVI RIFF header: RIFF + size + "AVI " format identifier
        let mut input = reader_from(&[
            b"RIFF",                   // RIFF magic (bytes 0-3)
            &[0x24, 0x00, 0x00, 0x00], // File size - 8 (bytes 4-7)
            b"AVI ",                   // AVI format identifier (bytes 8-11)
            b"LIST",                   // Chunk header start (bytes 12+)
        ]);

        let detected = detector
            .detect_file_type(Path::new("test.avi"), &mut input)
            .unwrap();
        assert_eq!(detected.file_type, "AVI");
        assert_eq!(detected.format, "RIFF");
        assert_eq!(detected.mime_type, "video/x-msvideo");
    }

    #[test]
    fn test_wav_riff_detection() {
        let detector = FileTypeDetector::new();

        // WAV RIFF header: RIFF + size + "WAVE" format identifier
        let mut input = reader_from(&[
            b"RIFF",                   // RIFF magic (bytes 0-3)
            &[0x24, 0x00, 0x00, 0x00], // File size - 8 (bytes 4-7)
            b"WAVE",                   // WAVE format identifier (bytes 8-11)
            b"fmt ",                   // Format chunk start (bytes 12+)
        ]);

        let detected = detector
            .detect_file_type(Path::new("test.wav"), &mut input)
            .unwrap();
        assert_eq!(detected.file_type, "WAV");
        assert_eq!(detected.format, "RIFF");
        assert_eq!(detected.mime_type, "audio/x-wav");
    }

    #[test]
    fn test_webp_riff_detection() {
        let detector = FileTypeDetector::new();

        // WebP RIFF header: RIFF + size + "WEBP" format identifier
        let mut input = reader_from(&[
            b"RIFF",                   // RIFF magic (bytes 0-3)
            &[0x20, 0x00, 0x00, 0x00], // File size - 8 (bytes 4-7)
            b"WEBP",                   // WEBP format identifier (bytes 8-11)
            b"VP8 ",                   // VP8 chunk header (bytes 12+)
        ]);

        let detected = detector
            .detect_file_type(Path::new("test.webp"), &mut input)
            .unwrap();
        assert_eq!(detected.file_type, "WEBP");
        assert_eq!(detected.format, "RIFF");
        assert_eq!(detected.mime_type, "image/webp");
    }

    #[test]
    fn test_heic_extension_detection() {
        let detector = FileTypeDetector::new();

        // .heic extension, but the ftyp major brand is mif1 (HEIF)
        let mut input = reader_from(&[
            &[0x00, 0x00, 0x00, 0x20], // Size
            b"ftyp",                   // Box type
            b"mif1",                   // Major brand (HEIF)
            &[0x00, 0x00, 0x00, 0x00], // Minor version
            b"mif1heic",               // Compatible brands
        ]);

        let detected = detector
            .detect_file_type(Path::new("test.heic"), &mut input)
            .unwrap_or_else(|e| panic!("Failed to detect HEIC file: {e:?}"));

        println!(
            "HEIC detection result: file_type={}, format={}, mime_type={}",
            detected.file_type, detected.format, detected.mime_type
        );
        // Should detect as HEIF due to mif1 brand
        assert_eq!(detected.file_type, "HEIF");
        assert_eq!(detected.format, "MOV");
        assert_eq!(detected.mime_type, "image/heif");
    }

    #[test]
    fn test_riff_format_content_detection() {
        let detector = FileTypeDetector::new();

        // AVI extension but WAV content - should detect as WAV based on content.
        // Following ExifTool's behavior: content takes precedence over extension
        let mut input = reader_from(&[
            b"RIFF",                   // RIFF magic
            &[0x24, 0x00, 0x00, 0x00], // File size
            b"WAVE",                   // WAVE format (not AVI)
            b"fmt ",                   // Format chunk
        ]);

        let detection = detector
            .detect_file_type(Path::new("test.avi"), &mut input)
            .unwrap_or_else(|e| panic!("Expected WAV detection but got error: {e:?}"));

        assert_eq!(detection.file_type, "WAV");
        assert_eq!(detection.format, "RIFF");
        assert_eq!(detection.mime_type, "audio/x-wav");
    }
}