Skip to main content

libmagic_rs/parser/
loader.rs

1// Copyright (c) 2025-2026 the libmagic-rs contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! File and directory loading for magic files.
5//!
6//! Provides functions for loading magic rules from individual files and
7//! directories, with automatic format detection and error handling.
8
9use log::warn;
10
11use crate::error::ParseError;
12use crate::parser::ParsedMagic;
13use crate::parser::name_table::NameTable;
14use std::path::{Path, PathBuf};
15
16use super::format::{MagicFileFormat, detect_format};
17
/// Upper bound, in bytes, on the size of any magic file the loader will read
/// (1 GiB, i.e. 2^30 bytes).
///
/// The limit is checked for standalone magic files and for every file inside
/// a magic directory, so that a maliciously oversized input cannot be used
/// for a memory-exhaustion `DoS`.
///
/// The value mirrors `crate::io::FileBuffer::MAX_FILE_SIZE`. It is repeated
/// here instead of imported because `build.rs` also includes this module via
/// `#[path]`, and a build script cannot reach lib-only modules such as
/// `crate::io`. A unit test in this file asserts that both constants agree.
pub const MAX_MAGIC_FILE_SIZE: u64 = 1 << 30;
30
31/// Reads a magic file into a `String` after verifying its size does not
32/// exceed [`MAX_MAGIC_FILE_SIZE`].
33///
34/// Returns a `ParseError` if metadata cannot be read, the file exceeds the
35/// size limit, or the file contents cannot be read.
36///
37/// # Encoding
38///
39/// Magic files are parsed as byte streams (matching GNU `file`/libmagic
40/// behavior). Real-world magic files frequently contain non-UTF-8 bytes in
41/// comments and attribution lines (e.g., Latin-1 author names). Rather than
42/// rejecting such files, invalid UTF-8 sequences are replaced with U+FFFD
43/// via [`String::from_utf8_lossy`] and a warning is logged. ASCII rule
44/// syntax is preserved byte-for-byte; replacements only affect non-ASCII
45/// text which, in practice, appears almost exclusively inside comments
46/// that are stripped before tokenization.
47fn read_magic_file_bounded(path: &Path) -> Result<String, ParseError> {
48    let metadata = std::fs::metadata(path).map_err(|e| {
49        ParseError::IoError(std::io::Error::new(
50            e.kind(),
51            format!("Failed to read metadata for '{}': {}", path.display(), e),
52        ))
53    })?;
54
55    if metadata.len() > MAX_MAGIC_FILE_SIZE {
56        return Err(ParseError::invalid_syntax(
57            0,
58            format!(
59                "Magic file '{}' is too large: {} bytes (maximum allowed: {} bytes)",
60                path.display(),
61                metadata.len(),
62                MAX_MAGIC_FILE_SIZE
63            ),
64        ));
65    }
66
67    let bytes = std::fs::read(path).map_err(ParseError::from)?;
68
69    match String::from_utf8(bytes) {
70        Ok(s) => Ok(s),
71        Err(e) => {
72            warn!(
73                "Magic file '{}' contains non-UTF-8 bytes; they were replaced with U+FFFD. \
74                 Rule parsing proceeds, but replacements inside rule bodies may alter matching.",
75                path.display()
76            );
77            Ok(String::from_utf8_lossy(&e.into_bytes()).into_owned())
78        }
79    }
80}
81
82/// Loads and parses all magic files from a directory, merging them into a single rule set.
83///
84/// This function reads all regular files in the specified directory, parses each as a magic file,
85/// and combines the resulting rules into a single `Vec<MagicRule>`. Files are processed in
86/// alphabetical order by filename to ensure deterministic results.
87///
88/// # Error Handling Strategy
89///
90/// This function distinguishes between critical and non-critical errors:
91///
92/// - **Critical errors** (I/O failures, directory access issues, encoding problems):
93///   These cause immediate failure and return a `ParseError`. The function stops processing
94///   and propagates the error to the caller.
95///
96/// - **Non-critical errors** (individual file parse failures):
97///   These are logged at warn level and the file is skipped. Processing
98///   continues with remaining files.
99///
100/// # Behavior
101///
102/// - Subdirectories are skipped (not recursively processed)
103/// - Symbolic links are skipped
104/// - Empty directories return an empty rules vector
105/// - Files are processed in alphabetical order by filename
106/// - All successfully parsed rules are merged in order
107///
108/// # Examples
109///
110/// Loading a directory of magic files:
111///
112/// ```rust,no_run
113/// use libmagic_rs::parser::load_magic_directory;
114/// use std::path::Path;
115///
116/// let parsed = load_magic_directory(Path::new("/usr/share/file/magic.d"))?;
117/// println!("Loaded {} rules from directory", parsed.rules.len());
118/// # Ok::<(), libmagic_rs::ParseError>(())
119/// ```
120///
121/// Creating a Magdir-style directory structure:
122///
123/// ```rust,no_run
124/// use libmagic_rs::parser::load_magic_directory;
125/// use std::path::Path;
126///
127/// // Directory structure:
128/// // magic.d/
129/// //   ├── 01-elf
130/// //   ├── 02-archive
131/// //   └── 03-text
132///
133/// let parsed = load_magic_directory(Path::new("./magic.d"))?;
134/// // Rules from all three files are merged in alphabetical order
135/// # Ok::<(), libmagic_rs::ParseError>(())
136/// ```
137///
138/// # Errors
139///
140/// Returns `ParseError` if:
141/// - The directory does not exist or cannot be accessed
142/// - Directory entries cannot be read
143/// - A file cannot be read due to I/O errors
144/// - A file contains invalid UTF-8 encoding
145///
146/// # Panics
147///
148/// This function does not panic under normal operation.
149pub fn load_magic_directory(dir_path: &Path) -> Result<ParsedMagic, ParseError> {
150    use std::fs;
151
152    // Read directory entries
153    let entries = fs::read_dir(dir_path).map_err(|e| {
154        ParseError::invalid_syntax(
155            0,
156            format!("Failed to read directory '{}': {}", dir_path.display(), e),
157        )
158    })?;
159
160    // Collect and sort entries by filename for deterministic ordering
161    let mut file_paths: Vec<std::path::PathBuf> = Vec::new();
162    for entry in entries {
163        let entry = entry.map_err(|e| {
164            ParseError::invalid_syntax(
165                0,
166                format!(
167                    "Failed to read directory entry in '{}': {}",
168                    dir_path.display(),
169                    e
170                ),
171            )
172        })?;
173
174        let path = entry.path();
175        let file_type = entry.file_type().map_err(|e| {
176            ParseError::invalid_syntax(
177                0,
178                format!("Failed to read file type for '{}': {}", path.display(), e),
179            )
180        })?;
181
182        // Only process regular files, skip directories and symlinks
183        if file_type.is_file() && !file_type.is_symlink() {
184            file_paths.push(path);
185        }
186    }
187
188    // Sort by filename for deterministic ordering
189    file_paths.sort_by_key(|path| path.file_name().map(std::ffi::OsStr::to_os_string));
190
191    // Accumulate rules and name tables from all files
192    let mut all_rules = Vec::new();
193    let mut merged_table = NameTable::empty();
194    let mut parse_failures: Vec<(PathBuf, ParseError)> = Vec::new();
195    let mut any_success = false;
196    let file_count = file_paths.len();
197
198    for path in file_paths {
199        // Read file contents (size-bounded to prevent memory exhaustion)
200        let contents = match read_magic_file_bounded(&path) {
201            Ok(contents) => contents,
202            Err(e) => {
203                // I/O errors (including oversized files) are critical
204                return Err(ParseError::invalid_syntax(
205                    0,
206                    format!("Failed to read file '{}': {}", path.display(), e),
207                ));
208            }
209        };
210
211        // Parse the file
212        match super::parse_text_magic_file(&contents) {
213            Ok(parsed) => {
214                any_success = true;
215                all_rules.extend(parsed.rules);
216                merged_table.merge(parsed.name_table);
217            }
218            Err(e) => {
219                // Track parse failures for reporting
220                parse_failures.push((path, e));
221            }
222        }
223    }
224
225    // If all files failed to parse, return an error.
226    // Use `any_success` rather than `all_rules.is_empty()` so that directories
227    // whose files parse successfully but contain only meta-type definitions
228    // (e.g. a directory of pure `name`-subroutine files) are not mistaken for
229    // complete failure.
230    if !any_success && !parse_failures.is_empty() {
231        use std::fmt::Write;
232
233        let failure_details: Vec<String> = parse_failures
234            .iter()
235            .take(3) // Limit to first 3 failures for brevity
236            .map(|(path, e)| format!("  - {}: {}", path.display(), e))
237            .collect();
238
239        let mut message = format!("All {file_count} magic file(s) in directory failed to parse");
240        if !failure_details.is_empty() {
241            message.push_str(":\n");
242            message.push_str(&failure_details.join("\n"));
243            if parse_failures.len() > 3 {
244                let _ = write!(message, "\n  ... and {} more", parse_failures.len() - 3);
245            }
246        }
247
248        return Err(ParseError::invalid_syntax(0, message));
249    }
250
251    // Log warnings for partial failures (some files parsed, some failed)
252    for (path, e) in &parse_failures {
253        warn!("Failed to parse '{}': {}", path.display(), e);
254    }
255
256    Ok(ParsedMagic {
257        rules: all_rules,
258        name_table: merged_table,
259    })
260}
261
262/// Loads magic rules from a file or directory, automatically detecting the format.
263///
264/// This is the unified entry point for loading magic rules from the filesystem. It
265/// automatically detects whether the path points to a text magic file, a directory
266/// containing magic files, or a binary compiled magic file, and dispatches to the
267/// appropriate handler.
268///
269/// # Format Detection and Handling
270///
271/// The function uses [`detect_format()`] to determine the file type and handles each
272/// format as follows:
273///
274/// - **Text format**: Reads the file contents and parses using [`super::parse_text_magic_file()`]
275/// - **Directory format**: Loads all magic files from the directory using [`load_magic_directory()`]
276/// - **Binary format**: Returns an error with guidance to use the `--use-builtin` option
277///
278/// # Arguments
279///
280/// * `path` - Path to a magic file or directory. Can be absolute or relative.
281///
282/// # Returns
283///
284/// Returns `Ok(Vec<MagicRule>)` containing all successfully parsed magic rules. For
285/// directories, rules from all files are merged in alphabetical order by filename.
286///
287/// # Errors
288///
289/// This function returns a [`ParseError`] in the following cases:
290///
291/// - **File not found**: The specified path does not exist
292/// - **Unsupported format**: The file is a binary compiled magic file (`.mgc`)
293/// - **Parse errors**: The magic file contains syntax errors or invalid rules
294/// - **I/O errors**: File system errors during reading (permissions, disk errors, etc.)
295///
296/// # Examples
297///
298/// ## Loading a text magic file
299///
300/// ```no_run
301/// use libmagic_rs::parser::load_magic_file;
302/// use std::path::Path;
303///
304/// let parsed = load_magic_file(Path::new("/usr/share/misc/magic"))?;
305/// println!("Loaded {} magic rules", parsed.rules.len());
306/// # Ok::<(), libmagic_rs::ParseError>(())
307/// ```
308///
309/// ## Loading a directory of magic files
310///
311/// ```no_run
312/// use libmagic_rs::parser::load_magic_file;
313/// use std::path::Path;
314///
315/// let parsed = load_magic_file(Path::new("/usr/share/misc/magic.d"))?;
316/// println!("Loaded {} rules from directory", parsed.rules.len());
317/// # Ok::<(), libmagic_rs::ParseError>(())
318/// ```
319///
320/// ## Handling binary format errors
321///
322/// ```no_run
323/// use libmagic_rs::parser::load_magic_file;
324/// use std::path::Path;
325///
326/// match load_magic_file(Path::new("/usr/share/misc/magic.mgc")) {
327///     Ok(parsed) => println!("Loaded {} rules", parsed.rules.len()),
328///     Err(e) => {
329///         eprintln!("Error loading magic file: {}", e);
330///         eprintln!("Hint: Use --use-builtin for binary files");
331///     }
332/// }
333/// # Ok::<(), libmagic_rs::ParseError>(())
334/// ```
335///
336/// # Security
337///
338/// This function delegates to [`super::parse_text_magic_file()`] or [`load_magic_directory()`]
339/// based on format detection. Security considerations are handled by those functions:
340///
341/// - Rule hierarchy depth is bounded during parsing
342/// - Invalid syntax is rejected with descriptive errors
343/// - Binary `.mgc` files are rejected (not parsed)
344///
345/// A 1 GB size limit ([`MAX_MAGIC_FILE_SIZE`]) is enforced on each file loaded
346/// (both standalone files and files within a directory) to prevent memory
347/// exhaustion from maliciously oversized inputs. Files exceeding the limit are
348/// rejected with a `ParseError` before their contents are read.
349///
350/// # See Also
351///
352/// - [`detect_format()`] - Format detection logic
353/// - [`super::parse_text_magic_file()`] - Text file parser
354/// - [`load_magic_directory()`] - Directory loader
355pub fn load_magic_file(path: &Path) -> Result<ParsedMagic, ParseError> {
356    // Detect the magic file format
357    let format = detect_format(path)?;
358
359    // Dispatch to appropriate handler based on format
360    match format {
361        MagicFileFormat::Text => {
362            // Read file contents (size-bounded) and parse as text magic file
363            let content = read_magic_file_bounded(path)?;
364            super::parse_text_magic_file(&content)
365        }
366        MagicFileFormat::Directory => {
367            // Load all magic files from directory
368            load_magic_directory(path)
369        }
370        MagicFileFormat::Binary => {
371            // Binary compiled magic files are not supported
372            Err(ParseError::unsupported_format(
373                0,
374                "binary .mgc file",
375                "Binary compiled magic files (.mgc) are not supported for parsing.\n\
376                 Use the --use-builtin option to use the built-in magic rules instead,\n\
377                 or provide a text-based magic file or directory.",
378            ))
379        }
380    }
381}
382
#[cfg(test)]
mod tests {
    use super::*;

    use std::fs;
    use tempfile::TempDir;

    /// Creates a scratch directory for a test; it is deleted when the guard is dropped.
    fn scratch_dir() -> TempDir {
        TempDir::new().expect("Failed to create temp dir")
    }

    // ============================================================
    // load_magic_directory
    // ============================================================

    /// A directory that does not exist is a critical error.
    #[test]
    fn test_load_directory_critical_error_io() {
        let missing = std::path::Path::new("/this/should/not/exist/anywhere/at/all");

        let outcome = load_magic_directory(missing);

        assert!(
            outcome.is_err(),
            "Should return error for non-existent directory"
        );
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("Failed to read directory"));
    }

    /// A file that fails to parse is skipped when another file parses fine.
    #[test]
    fn test_load_directory_non_critical_error_parse() {
        let dir = scratch_dir();

        // One well-formed file ...
        fs::write(
            dir.path().join("valid.magic"),
            "0 string \\x01\\x02 valid\n",
        )
        .expect("Failed to write valid file");

        // ... and one that is not valid magic syntax.
        fs::write(dir.path().join("invalid.magic"), "this is invalid syntax\n")
            .expect("Failed to write invalid file");

        let parsed = load_magic_directory(dir.path()).expect("Should load valid files");

        assert_eq!(parsed.rules.len(), 1, "Should load only valid file");
        assert_eq!(parsed.rules[0].message, "valid");
    }

    /// Empty and comment-only files load successfully and contribute nothing.
    #[test]
    fn test_load_directory_empty_files() {
        let dir = scratch_dir();

        fs::write(dir.path().join("empty.magic"), "").expect("Failed to write empty file");
        fs::write(
            dir.path().join("comments.magic"),
            "# Just comments\n# Nothing else\n",
        )
        .expect("Failed to write comments file");

        let parsed = load_magic_directory(dir.path()).expect("Should handle empty files");

        assert_eq!(
            parsed.rules.len(),
            0,
            "Empty files should contribute no rules"
        );
    }

    /// A binary (non-UTF-8) file next to a valid one does not break the load.
    #[test]
    fn test_load_directory_binary_files() {
        let dir = scratch_dir();

        // These bytes are not valid UTF-8. After lossy decoding they become
        // U+FFFD characters, which the parser is expected to reject as a
        // rule; the directory loader then skips the file as a non-critical
        // parse failure.
        fs::write(dir.path().join("binary.dat"), [0xFF, 0xFE, 0xFF, 0xFE])
            .expect("Failed to write binary file");

        fs::write(
            dir.path().join("valid.magic"),
            "0 string \\x01\\x02 valid\n",
        )
        .expect("Failed to write valid file");

        let parsed = load_magic_directory(dir.path())
            .expect("Directory with a binary file alongside a valid file should still load");

        assert_eq!(
            parsed.rules.len(),
            1,
            "Only the valid magic file should contribute rules"
        );
        assert_eq!(parsed.rules[0].message, "valid");
    }

    /// File extensions play no role in which files are loaded.
    #[test]
    fn test_load_directory_mixed_extensions() {
        let dir = scratch_dir();

        let fixtures = [
            (
                "file.magic",
                "0 string \\x01\\x02 magic\n",
                "Failed to write .magic file",
            ),
            (
                "file.txt",
                "0 string \\x03\\x04 txt\n",
                "Failed to write .txt file",
            ),
            (
                "noext",
                "0 string \\x05\\x06 noext\n",
                "Failed to write no-ext file",
            ),
        ];
        for (name, contents, failure) in fixtures {
            fs::write(dir.path().join(name), contents).expect(failure);
        }

        let parsed = load_magic_directory(dir.path())
            .expect("Should load all files regardless of extension");

        assert_eq!(
            parsed.rules.len(),
            3,
            "Should process all files regardless of extension"
        );

        for expected in ["magic", "txt", "noext"] {
            assert!(parsed.rules.iter().any(|rule| rule.message == expected));
        }
    }

    /// Rules are merged in filename order, not creation order.
    #[test]
    fn test_load_directory_alphabetical_ordering() {
        let dir = scratch_dir();

        // Deliberately written out of order; contents use hex escapes to stay valid syntax.
        let fixtures = [
            (
                "03-third",
                "0 string \\x07\\x08\\x09 third\n",
                "Failed to write third file",
            ),
            (
                "01-first",
                "0 string \\x01\\x02\\x03 first\n",
                "Failed to write first file",
            ),
            (
                "02-second",
                "0 string \\x04\\x05\\x06 second\n",
                "Failed to write second file",
            ),
        ];
        for (name, contents, failure) in fixtures {
            fs::write(dir.path().join(name), contents).expect(failure);
        }

        let parsed = load_magic_directory(dir.path()).expect("Should load directory in order");

        assert_eq!(parsed.rules.len(), 3);
        for (rule, expected) in parsed.rules.iter().zip(["first", "second", "third"]) {
            assert_eq!(rule.message, expected);
        }
    }

    // ============================================================
    // load_magic_file
    // ============================================================

    /// A single text magic file is read and parsed.
    #[test]
    fn test_load_magic_file_text_format() {
        let dir = scratch_dir();
        let magic_path = dir.path().join("magic.txt");
        fs::write(&magic_path, "0 string \\x7fELF ELF executable\n")
            .expect("Failed to write magic file");

        let parsed = load_magic_file(&magic_path).expect("Failed to load text magic file");

        assert_eq!(parsed.rules.len(), 1);
        assert_eq!(parsed.rules[0].message, "ELF executable");
    }

    /// A directory path is dispatched to the directory loader.
    #[test]
    fn test_load_magic_file_directory_format() {
        let dir = scratch_dir();
        let magic_dir = dir.path().join("magic.d");
        fs::create_dir(&magic_dir).expect("Failed to create magic directory");

        fs::write(
            magic_dir.join("00_elf"),
            "0 string \\x7fELF ELF executable\n",
        )
        .expect("Failed to write elf file");
        fs::write(
            magic_dir.join("01_zip"),
            "0 string \\x50\\x4b\\x03\\x04 ZIP archive\n",
        )
        .expect("Failed to write zip file");

        let parsed = load_magic_file(&magic_dir).expect("Failed to load directory");

        assert_eq!(parsed.rules.len(), 2);
        assert_eq!(parsed.rules[0].message, "ELF executable");
        assert_eq!(parsed.rules[1].message, "ZIP archive");
    }

    /// A compiled `.mgc` file is rejected with a helpful message.
    #[test]
    fn test_load_magic_file_binary_format_error() {
        use std::io::Write;

        let dir = scratch_dir();
        let mgc_path = dir.path().join("magic.mgc");

        // Little-endian encoding of the 0xF11E041C magic number.
        let magic_number: [u8; 4] = [0x1C, 0x04, 0x1E, 0xF1];
        let mut handle = fs::File::create(&mgc_path).expect("Failed to create binary file");
        handle
            .write_all(&magic_number)
            .expect("Failed to write magic number");

        let outcome = load_magic_file(&mgc_path);

        assert!(outcome.is_err(), "Should fail to load binary .mgc file");

        let error_msg = outcome.unwrap_err().to_string();

        assert!(
            error_msg.contains("Binary") || error_msg.contains("binary"),
            "Error should mention binary format: {error_msg}",
        );
        assert!(
            error_msg.contains("--use-builtin") || error_msg.contains("built-in"),
            "Error should mention --use-builtin option: {error_msg}",
        );
    }

    /// A path that does not exist yields an error.
    #[test]
    fn test_load_magic_file_io_error() {
        let missing = std::path::Path::new("/this/path/should/not/exist/magic.txt");

        let outcome = load_magic_file(missing);

        assert!(outcome.is_err(), "Should fail for non-existent file");
    }

    /// Parse errors from a standalone file reach the caller as parse errors.
    #[test]
    fn test_load_magic_file_parse_error_propagation() {
        let dir = scratch_dir();
        let bad_path = dir.path().join("invalid.magic");

        // The rule has no offset, so it is not valid syntax.
        fs::write(&bad_path, "string test invalid\n").expect("Failed to write invalid file");

        let outcome = load_magic_file(&bad_path);

        assert!(outcome.is_err(), "Should fail for file with parse errors");

        // The debug rendering must identify a parse error rather than an I/O error.
        let error = outcome.unwrap_err();
        let error_msg = format!("{error:?}");
        assert!(
            error_msg.contains("InvalidSyntax") || error_msg.contains("syntax"),
            "Error should be parse error: {error_msg}",
        );
    }

    /// Guards the duplicated size limit against drifting from `FileBuffer::MAX_FILE_SIZE`.
    ///
    /// loader.rs cannot `use crate::io::FileBuffer` at module scope because
    /// build.rs pulls this file in via `#[path]`, but tests compile as part
    /// of the library and can reach it fine.
    #[test]
    fn test_max_magic_file_size_matches_file_buffer_limit() {
        assert_eq!(
            MAX_MAGIC_FILE_SIZE,
            crate::io::FileBuffer::MAX_FILE_SIZE,
            "MAX_MAGIC_FILE_SIZE must match FileBuffer::MAX_FILE_SIZE"
        );
    }

    /// A file whose reported size is over the limit is rejected.
    #[test]
    fn test_load_magic_file_rejects_oversized_file() {
        let dir = scratch_dir();
        let huge_path = dir.path().join("huge.magic");

        // Extending an empty file with `set_len` makes its reported size exceed the
        // limit without writing that much data.
        let handle = fs::File::create(&huge_path).expect("Failed to create oversized file");
        handle
            .set_len(MAX_MAGIC_FILE_SIZE + 1)
            .expect("Failed to set sparse file length");
        drop(handle);

        let outcome = load_magic_file(&huge_path);

        assert!(
            outcome.is_err(),
            "Loading a file larger than MAX_MAGIC_FILE_SIZE must fail"
        );

        let err_msg = outcome.unwrap_err().to_string();
        assert!(
            err_msg.contains("too large"),
            "Error should indicate size limit violation, got: {err_msg}"
        );
        assert!(
            err_msg.contains(&MAX_MAGIC_FILE_SIZE.to_string()),
            "Error should mention the maximum allowed size, got: {err_msg}"
        );
    }

    /// Regression test for non-UTF-8 bytes inside a comment.
    ///
    /// /usr/share/file/magic/filesystems on macOS contains a Latin-1 `ß`
    /// (0xdf) in a contributor attribution comment. Previously this was
    /// rejected by `fs::read_to_string` with an opaque "stream did not
    /// contain valid UTF-8" error. The loader must now tolerate non-UTF-8
    /// bytes in comments (and anywhere else they appear) by lossily
    /// replacing them.
    #[test]
    fn test_load_magic_file_tolerates_non_utf8_in_comment() {
        let dir = scratch_dir();
        let magic_path = dir.path().join("with-latin1-comment.magic");

        let mut bytes = b"# From: Thomas Wei".to_vec();
        bytes.push(0xdf); // invalid UTF-8 (Latin-1 encoding of `ß`)
        bytes.extend_from_slice(b"schuh <thomas@example.invalid>\n");
        bytes.extend_from_slice(b"0 string \\x7fELF ELF executable\n");
        fs::write(&magic_path, &bytes).expect("Failed to write magic file with non-UTF-8 byte");

        let parsed = load_magic_file(&magic_path)
            .expect("Magic file with non-UTF-8 bytes in a comment must still load");

        assert_eq!(
            parsed.rules.len(),
            1,
            "The ELF rule should be parsed; the comment is stripped"
        );
        assert_eq!(parsed.rules[0].message, "ELF executable");
    }

    /// Name tables from separate files end up in one merged table.
    #[test]
    fn test_load_directory_merges_name_tables() {
        let dir = scratch_dir();

        // Each file defines a different named subroutine.
        fs::write(
            dir.path().join("00_first"),
            "0 name sub_a\n>0 byte 1 a-body\n",
        )
        .expect("Failed to write sub_a file");
        fs::write(
            dir.path().join("01_second"),
            "0 name sub_b\n>0 byte 2 b-body\n",
        )
        .expect("Failed to write sub_b file");

        let parsed = load_magic_directory(dir.path()).expect("Should load both name subroutines");

        // Both `name` rules are hoisted out, so top-level rules list is empty.
        assert_eq!(parsed.rules.len(), 0);
        assert!(parsed.name_table.get("sub_a").is_some());
        assert!(parsed.name_table.get("sub_b").is_some());
    }
}