Skip to main content

libmagic_rs/parser/
mod.rs

1// Copyright (c) 2025-2026 the libmagic-rs contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Magic file parser module
5//!
6//! This module handles parsing of magic files into an Abstract Syntax Tree (AST)
7//! that can be evaluated against file buffers for type identification.
8//!
9//! # Overview
10//!
11//! The parser implements a complete pipeline for transforming magic file text into
12//! a hierarchical rule structure suitable for evaluation. The pipeline consists of:
13//!
14//! 1. **Preprocessing**: Line handling, comment removal, continuation processing
15//! 2. **Parsing**: Individual magic rule parsing using nom combinators
16//! 3. **Hierarchy Building**: Constructing parent-child relationships based on indentation
17//! 4. **Validation**: Type checking and offset resolution
18//!
19//! # Format Detection and Loading
20//!
21//! The module automatically detects and handles three types of magic file formats:
22//! - **Text files**: Human-readable magic rule definitions
23//! - **Directories**: Collections of magic files (Magdir pattern)
24//! - **Binary files**: Compiled .mgc files (currently unsupported)
25//!
26//! ## Unified Loading API
27//!
28//! The recommended entry point for loading magic files is [`load_magic_file()`], which
29//! automatically detects the format and dispatches to the appropriate handler:
30//!
31//! ```ignore
32//! use libmagic_rs::parser::load_magic_file;
33//! use std::path::Path;
34//!
35//! // Works with text files
36//! let rules = load_magic_file(Path::new("/usr/share/misc/magic"))?;
37//!
38//! // Also works with directories
39//! let rules = load_magic_file(Path::new("/usr/share/misc/magic.d"))?;
40//!
41//! // Binary .mgc files return an error with guidance
42//! match load_magic_file(Path::new("/usr/share/misc/magic.mgc")) {
43//!     Ok(rules) => { /* ... */ },
44//!     Err(e) => eprintln!("Use --use-builtin for binary files: {}", e),
45//! }
46//! # Ok::<(), Box<dyn std::error::Error>>(())
47//! ```
48//!
49//! ## Three-Tier Loading Strategy
50//!
51//! The loading process works as follows:
52//!
53//! 1. **Format Detection**: [`detect_format()`] examines the path to determine the file type
54//! 2. **Dispatch to Handler**:
55//!    - Text files -> [`parse_text_magic_file()`] after reading contents
56//!    - Directories -> [`load_magic_directory()`] to load and merge all files
57//!    - Binary files -> Returns error suggesting `--use-builtin` option
58//! 3. **Return Merged Rules**: All rules are returned in a single `Vec<MagicRule>`
59//!
60//! # Examples
61//!
62//! ## Loading Magic Files (Recommended)
63//!
64//! Use the unified [`load_magic_file()`] API for automatic format detection:
65//!
66//! ```ignore
67//! use libmagic_rs::parser::load_magic_file;
68//! use std::path::Path;
69//!
70//! let rules = load_magic_file(Path::new("/usr/share/misc/magic"))?;
71//! println!("Loaded {} magic rules", rules.len());
72//! # Ok::<(), Box<dyn std::error::Error>>(())
73//! ```
74//!
75//! ## Parsing Text Content Directly
76//!
77//! For parsing magic rule text that's already in memory:
78//!
79//! ```ignore
80//! use libmagic_rs::parser::parse_text_magic_file;
81//!
82//! let magic_content = r#"
83//! 0 string \x7fELF ELF executable
84//! >4 byte 1 32-bit
85//! >4 byte 2 64-bit
86//! "#;
87//!
88//! let rules = parse_text_magic_file(magic_content)?;
89//! assert_eq!(rules.len(), 1);
90//! assert_eq!(rules[0].children.len(), 2);
91//! # Ok::<(), Box<dyn std::error::Error>>(())
92//! ```
93//!
94//! ## Loading a Directory Explicitly
95//!
96//! For Magdir-style directories containing multiple magic files:
97//!
98//! ```ignore
99//! use libmagic_rs::parser::load_magic_directory;
100//! use std::path::Path;
101//!
102//! // Directory structure:
103//! // /usr/share/file/magic.d/
104//! //   ├── elf
105//! //   ├── archive
106//! //   └── text
107//!
108//! let rules = load_magic_directory(Path::new("/usr/share/file/magic.d"))?;
109//! // Rules from all files are merged in alphabetical order by filename
110//! # Ok::<(), Box<dyn std::error::Error>>(())
111//! ```
112//!
113//! ## Migration Note
114//!
115//! **For users upgrading from direct function calls:**
116//!
117//! - **Old approach**: Call `detect_format()` then dispatch manually
118//! - **New approach**: Use `load_magic_file()` for automatic dispatching
119//!
120//! The individual functions (`parse_text_magic_file()`, `load_magic_directory()`)
121//! remain available for advanced use cases where you need direct control.
122//!
123//! **Key differences:**
124//! - `load_magic_file()`: Unified API with automatic format detection (recommended)
125//! - `parse_text_magic_file()`: Parses a single text string containing magic rules
126//! - `load_magic_directory()`: Loads and merges all magic files from a directory
127//! - `detect_format()`: Low-level format detection (now called internally by `load_magic_file()`)
128//!
129//! **Error handling in `load_magic_directory()`:**
130//! - Critical errors (I/O failures, invalid UTF-8): Returns `ParseError` immediately
131//! - Non-critical errors (parse failures in individual files): Logs warning to stderr and continues
132
133pub mod ast;
134mod format;
135pub mod grammar;
136mod hierarchy;
137mod loader;
138pub(crate) mod preprocessing;
139
140// Re-export AST types for convenience
141pub use ast::{Endianness, MagicRule, OffsetSpec, Operator, StrengthModifier, TypeKind, Value};
142
143// Re-export parser functions for convenience
144pub use grammar::{parse_number, parse_offset};
145
146// Re-export format detection and loading
147pub use format::{MagicFileFormat, detect_format};
148pub use loader::{load_magic_directory, load_magic_file};
149
150// Internal re-exports for sibling modules and tests
151pub(crate) use hierarchy::build_rule_hierarchy;
152pub(crate) use preprocessing::preprocess_lines;
153
154use crate::error::ParseError;
155
156/// Parses a complete magic file from raw text input.
157///
158/// This is the main public-facing parser function that orchestrates the complete
159/// parsing pipeline: preprocessing, parsing individual rules, and building the
160/// hierarchical structure.
161///
162/// # Arguments
163///
164/// * `input` - The raw magic file content as a string
165///
166/// # Returns
167///
168/// `Result<Vec<MagicRule>, ParseError>` - A vector of root rules with nested children
169///
170/// # Errors
171///
172/// Returns an error if any stage of parsing fails:
173/// - Preprocessing errors
174/// - Rule parsing errors
175/// - Hierarchy building errors
176///
177/// # Example
178///
179/// ```ignore
180/// use libmagic_rs::parser::parse_text_magic_file;
181///
182/// let magic = r#"0 string \x7fELF ELF file
183/// >4 byte 1 32-bit
184/// >4 byte 2 64-bit"#;
185///
186/// let rules = parse_text_magic_file(magic)?;
187/// assert_eq!(rules.len(), 1);
188/// assert_eq!(rules[0].message, "ELF file");
189/// # Ok::<(), Box<dyn std::error::Error>>(())
190/// ```
191pub fn parse_text_magic_file(input: &str) -> Result<Vec<MagicRule>, ParseError> {
192    let lines = preprocess_lines(input)?;
193    build_rule_hierarchy(lines)
194}
195
#[cfg(test)]
mod unit_tests {
    use super::*;

    /// Feeds `source` through [`parse_text_magic_file`] and unwraps the outcome.
    ///
    /// Marked `#[track_caller]` so an unexpected parse failure is reported at
    /// the test that supplied the input rather than inside this helper.
    #[track_caller]
    fn parse_ok(source: &str) -> Vec<MagicRule> {
        parse_text_magic_file(source).unwrap()
    }

    // ------------------------------------------------------------
    // parse_text_magic_file: shape of the returned rule tree
    // ------------------------------------------------------------

    /// One line of input yields one root rule carrying the full message.
    #[test]
    fn test_parse_text_magic_file_single_rule() {
        let parsed = parse_ok("0 string 0 ZIP archive");
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].message, "ZIP archive");
    }

    /// `>`-prefixed lines are attached as children of the preceding root.
    #[test]
    fn test_parse_text_magic_file_hierarchical_rules() {
        let magic_text = r"
0 string 0 ELF
>4 byte 1 32-bit
>4 byte 2 64-bit
";
        let parsed = parse_ok(magic_text);
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].children.len(), 2);
    }

    /// A `#` comment line does not show up as a rule.
    #[test]
    fn test_parse_text_magic_file_with_comments() {
        let magic_text = r"
# ELF file format
0 string 0 ELF
>4 byte 1 32-bit
";
        let parsed = parse_ok(magic_text);
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].children.len(), 1);
    }

    /// Two level-0 lines produce two independent roots.
    #[test]
    fn test_parse_text_magic_file_multiple_roots() {
        let magic_text = r"
0 byte 1 ELF
>4 byte 1 32-bit

0 byte 2 PDF
>5 byte 1 v1
";
        let parsed = parse_ok(magic_text);
        assert_eq!(parsed.len(), 2);
    }

    /// The empty string parses successfully to no rules at all.
    #[test]
    fn test_parse_text_magic_file_empty_input() {
        let parsed = parse_ok("");
        assert_eq!(parsed.len(), 0);
    }

    /// Input made up solely of comments parses to no rules.
    #[test]
    fn test_parse_text_magic_file_only_comments() {
        let magic_text = r"
# Comment 1
# Comment 2
# Comment 3
";
        let parsed = parse_ok(magic_text);
        assert_eq!(parsed.len(), 0);
    }

    /// Blank lines around a single rule do not add or remove rules.
    #[test]
    fn test_parse_text_magic_file_empty_lines_only() {
        let magic_text = r"


0 string 0 Test file


";
        let parsed = parse_ok(magic_text);
        assert_eq!(parsed.len(), 1);
    }

    /// Words after the first message token are kept in the message.
    #[test]
    fn test_parse_text_magic_file_with_message_spaces() {
        let parsed = parse_ok("0 string 0 Long message continued here");
        assert!(parsed[0].message.contains("continued"));
    }

    /// Children, a grandchild and a second root all land at the right depth.
    #[test]
    fn test_parse_text_magic_file_mixed_indentation() {
        let magic_text = r"
0 byte 1 Root1
>4 byte 1 Child1
>4 byte 2 Child2
>>6 byte 3 Grandchild

0 byte 2 Root2
>4 byte 4 Child3
";
        let parsed = parse_ok(magic_text);
        assert_eq!(parsed.len(), 2);

        let (first_root, second_root) = (&parsed[0], &parsed[1]);
        assert_eq!(first_root.children.len(), 2);
        assert_eq!(first_root.children[1].children.len(), 1);
        assert_eq!(second_root.children.len(), 1);
    }

    /// A larger sample mixing comments, blank lines and nested rules.
    #[test]
    fn test_parse_text_magic_file_complex_real_world() {
        let magic_text = r"
# Magic file for common formats

# ELF binaries
0 byte 0x7f ELF executable
>4 byte 1 Intel 80386
>4 byte 2 x86-64
>>5 byte 1 LSB
>>5 byte 2 MSB

# PDF files
0 byte 0x25 PDF document
>5 byte 0x31 version 1.0
>5 byte 0x34 version 1.4
>5 byte 0x32 version 2.0
";
        let parsed = parse_ok(magic_text);
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].message, "ELF executable");
        assert!(parsed[0].children.len() > 1);
    }

    // ------------------------------------------------------------
    // `!:strength` directive handling through the full pipeline
    // ------------------------------------------------------------

    /// A `+N` directive is recorded on the rule that follows it.
    #[test]
    fn test_parse_text_magic_file_with_strength_directive() {
        let magic_text = r"
!:strength +10
0 string \\x7fELF ELF executable
";
        let parsed = parse_ok(magic_text);
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].strength_modifier, Some(StrengthModifier::Add(10)));
    }

    /// The directive is consumed by the next rule only; later rules get `None`.
    #[test]
    fn test_parse_text_magic_file_strength_applies_to_next_rule() {
        let magic_text = r"
!:strength *2
0 string \\x7fELF ELF executable
0 string \\x50\\x4b ZIP archive
";
        let parsed = parse_ok(magic_text);
        assert_eq!(parsed.len(), 2);

        let boosted = &parsed[0];
        let untouched = &parsed[1];
        assert_eq!(
            boosted.strength_modifier,
            Some(StrengthModifier::Multiply(2))
        );
        assert_eq!(untouched.strength_modifier, None);
    }

    /// The modifier stays on the root; its children carry none.
    #[test]
    fn test_parse_text_magic_file_strength_with_child_rules() {
        let magic_text = r"
!:strength =50
0 string \\x7fELF ELF executable
>4 byte 1 32-bit
>4 byte 2 64-bit
";
        let parsed = parse_ok(magic_text);
        assert_eq!(parsed.len(), 1);

        let root = &parsed[0];
        assert_eq!(root.strength_modifier, Some(StrengthModifier::Set(50)));
        assert_eq!(root.children[0].strength_modifier, None);
        assert_eq!(root.children[1].strength_modifier, None);
    }

    /// Each directive pairs with the rule directly beneath it.
    #[test]
    fn test_parse_text_magic_file_multiple_strength_directives() {
        let magic_text = r"
!:strength +10
0 string \\x7fELF ELF executable
!:strength -5
0 string \\x50\\x4b ZIP archive
";
        let parsed = parse_ok(magic_text);
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].strength_modifier, Some(StrengthModifier::Add(10)));
        assert_eq!(
            parsed[1].strength_modifier,
            Some(StrengthModifier::Subtract(5))
        );
    }

    /// Table-driven check of every operator form, including the bare number.
    #[test]
    fn test_parse_text_magic_file_strength_all_operators() {
        let cases = [
            ("!:strength +20\n0 byte 1 Test", StrengthModifier::Add(20)),
            (
                "!:strength -15\n0 byte 1 Test",
                StrengthModifier::Subtract(15),
            ),
            (
                "!:strength *3\n0 byte 1 Test",
                StrengthModifier::Multiply(3),
            ),
            ("!:strength /2\n0 byte 1 Test", StrengthModifier::Divide(2)),
            ("!:strength =100\n0 byte 1 Test", StrengthModifier::Set(100)),
            ("!:strength 50\n0 byte 1 Test", StrengthModifier::Set(50)),
        ];

        for (input, wanted) in cases {
            let parsed = parse_ok(input);
            assert_eq!(
                parsed[0].strength_modifier,
                Some(wanted),
                "Failed for input: {input}"
            );
        }
    }

    // ------------------------------------------------------------
    // Integration and edge cases
    // ------------------------------------------------------------

    /// A backslash-continued line that starts with `>` still yields one rule.
    #[test]
    fn test_continuation_with_indentation() {
        let magic_text = r">4 byte 1 Message \
continued";
        let parsed = parse_ok(magic_text);
        assert_eq!(parsed.len(), 1);
    }

    /// Hexadecimal offsets are accepted on consecutive root rules.
    #[test]
    fn test_multiple_hex_offsets() {
        let magic_text = r"
0x100 string 0 At 256
0x200 string 0 At 512
";
        let parsed = parse_ok(magic_text);
        assert_eq!(parsed.len(), 2);
    }

    // ------------------------------------------------------------
    // Overflow protection in `parse_number`
    // (the parent module re-exports it, so `super::*` provides the name)
    // ------------------------------------------------------------

    /// A 20-digit decimal literal lies above `i64::MAX` and must be refused.
    #[test]
    fn test_overflow_decimal_too_many_digits() {
        assert!(
            parse_number("12345678901234567890").is_err(),
            "Should reject 20+ decimal digits"
        );
    }

    /// Seventeen hex digits must be refused.
    #[test]
    fn test_overflow_hex_too_many_digits() {
        assert!(
            parse_number("0x10000000000000000").is_err(),
            "Should reject 17+ hex digits"
        );
    }

    /// `i64::MAX` (9223372036854775807) itself is still accepted.
    #[test]
    fn test_overflow_i64_max() {
        assert!(
            parse_number("9223372036854775807").is_ok(),
            "Should accept i64::MAX"
        );
    }

    /// One past `i64::MAX` must be refused.
    #[test]
    fn test_overflow_i64_max_plus_one() {
        assert!(
            parse_number("9223372036854775808").is_err(),
            "Should reject i64::MAX + 1"
        );
    }

    // ------------------------------------------------------------
    // Line number accuracy (exercised through parse_text_magic_file)
    // ------------------------------------------------------------

    /// The bad rule starts on line 2 and continues onto line 3; the error's
    /// debug output is expected to name line 2, where the rule begins.
    #[test]
    fn test_error_reports_correct_line_for_continuation() {
        let input = "0 string 0 valid\n0 invalid \\\nsyntax here\n0 string 0 valid2";

        let Err(failure) = parse_text_magic_file(input) else {
            panic!("Expected InvalidSyntax error");
        };

        let error_str = format!("{failure:?}");
        assert!(
            error_str.contains("line 2") || error_str.contains("line: 2"),
            "Error should reference line 2, got: {error_str}"
        );
    }
}
503
#[cfg(test)]
mod output_test {
    use super::{MagicRule, build_rule_hierarchy, parse_text_magic_file, preprocess_lines};

    /// Prints `rule` on one line, then each of its children one level deeper.
    ///
    /// `indent` is the nesting depth; every level adds two spaces of padding.
    fn print_rule(rule: &MagicRule, indent: usize) {
        let padding = "  ".repeat(indent);

        println!(
            "{}- level={} offset={:?} type={:?} op={:?} value={:?} message='{}'",
            padding, rule.level, rule.offset, rule.typ, rule.op, rule.value, rule.message
        );

        rule.children
            .iter()
            .for_each(|child| print_rule(child, indent + 1));
    }

    /// Walks a small sample through each parser stage and prints what every
    /// stage produced. The only checks are the `expect` calls: the test fails
    /// if any stage returns an error.
    #[test]
    fn demo_show_all_parser_outputs() {
        let input = r"
# ELF file
0 string 0 ELF
>4 byte 1 32-bit
>4 byte 2 64-bit

0 string 0 ZIP
>0 byte 3 zipped
";

        println!("\n================ RAW INPUT ================\n");
        println!("{input}");

        // Stage 1: preprocess_lines on its own.
        println!("\n================ PREPROCESS LINES ================\n");

        let preprocessed = preprocess_lines(input).expect("preprocess_lines failed");

        preprocessed.iter().enumerate().for_each(|(idx, line)| {
            println!(
                "[{}] line_no={} is_comment={} content='{}'",
                idx, line.line_number, line.is_comment, line.content
            );
        });

        // Stage 2: the complete pipeline via parse_text_magic_file.
        println!("\n================ PARSED MAGIC RULES ================\n");

        let pipeline_rules = parse_text_magic_file(input).expect("parse_text_magic_file failed");

        pipeline_rules.iter().enumerate().for_each(|(i, rule)| {
            println!("ROOT RULE [{i}]:");
            print_rule(rule, 1);
        });

        // Stage 3: build_rule_hierarchy called directly on the stage-1 lines.
        println!("\n================ EXPLICIT HIERARCHY BUILD ================\n");

        let rebuilt_rules =
            build_rule_hierarchy(preprocessed).expect("build_rule_hierarchy failed");

        rebuilt_rules.iter().enumerate().for_each(|(i, rule)| {
            println!("ROOT [{i}]:");
            print_rule(rule, 1);
        });
    }
}