// libmagic_rs/parser/mod.rs

// Copyright (c) 2025-2026 the libmagic-rs contributors
// SPDX-License-Identifier: Apache-2.0

//! Magic file parser module
//!
//! This module handles parsing of magic files into an Abstract Syntax Tree (AST)
//! that can be evaluated against file buffers for type identification.
//!
//! # Overview
//!
//! The parser implements a complete pipeline for transforming magic file text into
//! a hierarchical rule structure suitable for evaluation. The pipeline consists of:
//!
//! 1. **Preprocessing**: Line handling, comment removal, continuation processing
//! 2. **Parsing**: Individual magic rule parsing using nom combinators
//! 3. **Hierarchy Building**: Constructing parent-child relationships based on indentation
//! 4. **Validation**: Type checking and offset resolution
//!
//! # Format Detection and Loading
//!
//! The module automatically detects and handles three types of magic file formats:
//! - **Text files**: Human-readable magic rule definitions
//! - **Directories**: Collections of magic files (Magdir pattern)
//! - **Binary files**: Compiled .mgc files (currently unsupported)
//!
//! ## Unified Loading API
//!
//! The recommended entry point for loading magic files is [`load_magic_file()`], which
//! automatically detects the format and dispatches to the appropriate handler:
//!
//! ```ignore
//! use libmagic_rs::parser::load_magic_file;
//! use std::path::Path;
//!
//! // Works with text files
//! let rules = load_magic_file(Path::new("/usr/share/misc/magic"))?;
//!
//! // Also works with directories
//! let rules = load_magic_file(Path::new("/usr/share/misc/magic.d"))?;
//!
//! // Binary .mgc files return an error with guidance
//! match load_magic_file(Path::new("/usr/share/misc/magic.mgc")) {
//!     Ok(rules) => { /* ... */ },
//!     Err(e) => eprintln!("Use --use-builtin for binary files: {}", e),
//! }
//! # Ok::<(), Box<dyn std::error::Error>>(())
//! ```
//!
//! ## Three-Tier Loading Strategy
//!
//! The loading process works as follows:
//!
//! 1. **Format Detection**: [`detect_format()`] examines the path to determine the file type
//! 2. **Dispatch to Handler**:
//!    - Text files -> [`parse_text_magic_file()`] after reading contents
//!    - Directories -> [`load_magic_directory()`] to load and merge all files
//!    - Binary files -> Returns error suggesting `--use-builtin` option
//! 3. **Return Merged Rules**: All rules are returned in a single `Vec<MagicRule>`
//!
//! # Examples
//!
//! ## Loading Magic Files (Recommended)
//!
//! Use the unified [`load_magic_file()`] API for automatic format detection:
//!
//! ```ignore
//! use libmagic_rs::parser::load_magic_file;
//! use std::path::Path;
//!
//! let rules = load_magic_file(Path::new("/usr/share/misc/magic"))?;
//! println!("Loaded {} magic rules", rules.len());
//! # Ok::<(), Box<dyn std::error::Error>>(())
//! ```
//!
//! ## Parsing Text Content Directly
//!
//! For parsing magic rule text that's already in memory. The parser returns a
//! [`ParsedMagic`] value; the top-level rules live in its `rules` field:
//!
//! ```ignore
//! use libmagic_rs::parser::parse_text_magic_file;
//!
//! let magic_content = r#"
//! 0 string \x7fELF ELF executable
//! >4 byte 1 32-bit
//! >4 byte 2 64-bit
//! "#;
//!
//! let parsed = parse_text_magic_file(magic_content)?;
//! assert_eq!(parsed.rules.len(), 1);
//! assert_eq!(parsed.rules[0].children.len(), 2);
//! # Ok::<(), Box<dyn std::error::Error>>(())
//! ```
//!
//! ## Loading a Directory Explicitly
//!
//! For Magdir-style directories containing multiple magic files:
//!
//! ```ignore
//! use libmagic_rs::parser::load_magic_directory;
//! use std::path::Path;
//!
//! // Directory structure:
//! // /usr/share/file/magic.d/
//! //   ├── elf
//! //   ├── archive
//! //   └── text
//!
//! let rules = load_magic_directory(Path::new("/usr/share/file/magic.d"))?;
//! // Rules from all files are merged in alphabetical order by filename
//! # Ok::<(), Box<dyn std::error::Error>>(())
//! ```
//!
//! ## Migration Note
//!
//! **For users upgrading from direct function calls:**
//!
//! - **Old approach**: Call `detect_format()` then dispatch manually
//! - **New approach**: Use `load_magic_file()` for automatic dispatching
//!
//! The individual functions (`parse_text_magic_file()`, `load_magic_directory()`)
//! remain available for advanced use cases where you need direct control.
//!
//! **Key differences:**
//! - `load_magic_file()`: Unified API with automatic format detection (recommended)
//! - `parse_text_magic_file()`: Parses a single text string containing magic rules
//! - `load_magic_directory()`: Loads and merges all magic files from a directory
//! - `detect_format()`: Low-level format detection (now called internally by `load_magic_file()`)
//!
//! **Error handling in `load_magic_directory()`:**
//! - Critical errors (I/O failures, invalid UTF-8): Returns `ParseError` immediately
//! - Non-critical errors (parse failures in individual files): Logs warning to stderr and continues

133pub mod ast;
134#[allow(dead_code)]
135pub(crate) mod codegen;
136mod format;
137// `grammar` exposes nom-based parser combinators that are implementation
138// details of the magic-file parsing pipeline. Keep them visible to the rest
139// of the crate (for sibling modules and unit tests) but never to external
140// consumers -- the only supported parser entry points are the
141// `parse_text_magic_file` / `load_magic_file` functions in this module.
142pub(crate) mod grammar;
143mod hierarchy;
144mod loader;
145pub(crate) mod name_table;
146pub(crate) mod preprocessing;
147pub mod types;
148
149// Re-export AST types for convenience
150pub use ast::{Endianness, MagicRule, OffsetSpec, Operator, StrengthModifier, TypeKind, Value};
151
152// Re-export format detection and loading
153pub use format::{MagicFileFormat, detect_format};
154pub use loader::{load_magic_directory, load_magic_file};
155
156// Internal re-exports for sibling modules and tests
157pub(crate) use hierarchy::build_rule_hierarchy;
158pub(crate) use preprocessing::preprocess_lines;
159
160use crate::error::ParseError;
161
162/// Result of parsing a text magic file.
163///
164/// Contains the top-level rule list with any `name`-declared subroutines
165/// hoisted into a separate [`name_table::NameTable`] keyed by identifier.
166/// The rule list preserves the original ordering of all non-`Name` top-level
167/// rules, so strength-based sorting and evaluation semantics are unchanged
168/// for magic files that do not use the `name`/`use` directive pair.
169#[derive(Debug)]
170pub struct ParsedMagic {
171    /// Top-level rules after `Name` subroutines have been removed.
172    pub rules: Vec<MagicRule>,
173    /// Extracted `name` subroutine definitions, consulted by the evaluator
174    /// when a rule of type `TypeKind::Meta(MetaType::Use(_))` is reached.
175    pub(crate) name_table: name_table::NameTable,
176}
177
178/// Parses a complete magic file from raw text input.
179///
180/// This is the main public-facing parser function that orchestrates the complete
181/// parsing pipeline: preprocessing, parsing individual rules, and building the
182/// hierarchical structure.
183///
184/// # Arguments
185///
186/// * `input` - The raw magic file content as a string
187///
188/// # Returns
189///
190/// `Result<ParsedMagic, ParseError>` - A [`ParsedMagic`] value containing
191/// the top-level rules (with `name`-declared subroutines hoisted out) and
192/// the resulting name table.
193///
194/// # Errors
195///
196/// Returns an error if any stage of parsing fails:
197/// - Preprocessing errors
198/// - Rule parsing errors
199/// - Hierarchy building errors
200///
201/// # Example
202///
203/// ```ignore
204/// use libmagic_rs::parser::parse_text_magic_file;
205///
206/// let magic = r#"0 string \x7fELF ELF file
207/// >4 byte 1 32-bit
208/// >4 byte 2 64-bit"#;
209///
210/// let parsed = parse_text_magic_file(magic)?;
211/// assert_eq!(parsed.rules.len(), 1);
212/// assert_eq!(parsed.rules[0].message, "ELF file");
213/// # Ok::<(), Box<dyn std::error::Error>>(())
214/// ```
215pub fn parse_text_magic_file(input: &str) -> Result<ParsedMagic, ParseError> {
216    let lines = preprocess_lines(input)?;
217    let rules = build_rule_hierarchy(lines)?;
218    let (rules, name_table) = name_table::extract_name_table(rules);
219    Ok(ParsedMagic { rules, name_table })
220}
221
// Unit tests for `parse_text_magic_file` plus a few `grammar::parse_number`
// overflow checks. Raw-string fixtures start at column 0 on purpose: leading
// characters are part of the magic-file text being parsed.
#[cfg(test)]
mod unit_tests {
    use super::*;

    // ============================================================
    // Tests for parse_text_magic_file (10+ test cases)
    // ============================================================

    #[test]
    fn test_parse_text_magic_file_single_rule() {
        let input = "0 string 0 ZIP archive";
        let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap();
        assert_eq!(rules.len(), 1);
        assert_eq!(rules[0].message, "ZIP archive");
    }

    #[test]
    fn test_parse_text_magic_file_hierarchical_rules() {
        let input = r"
0 string 0 ELF
>4 byte 1 32-bit
>4 byte 2 64-bit
";
        let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap();
        assert_eq!(rules.len(), 1);
        assert_eq!(rules[0].children.len(), 2);
    }

    #[test]
    fn test_parse_text_magic_file_with_comments() {
        let input = r"
# ELF file format
0 string 0 ELF
>4 byte 1 32-bit
";
        let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap();
        assert_eq!(rules.len(), 1);
        assert_eq!(rules[0].children.len(), 1);
    }

    #[test]
    fn test_parse_text_magic_file_multiple_roots() {
        let input = r"
0 byte 1 ELF
>4 byte 1 32-bit

0 byte 2 PDF
>5 byte 1 v1
";
        let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap();
        assert_eq!(rules.len(), 2);
    }

    #[test]
    fn test_parse_text_magic_file_empty_input() {
        let input = "";
        let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap();
        assert_eq!(rules.len(), 0);
    }

    #[test]
    fn test_parse_text_magic_file_only_comments() {
        let input = r"
# Comment 1
# Comment 2
# Comment 3
";
        let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap();
        assert_eq!(rules.len(), 0);
    }

    #[test]
    fn test_parse_text_magic_file_empty_lines_only() {
        let input = r"


0 string 0 Test file


";
        let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap();
        assert_eq!(rules.len(), 1);
    }

    #[test]
    fn test_parse_text_magic_file_with_message_spaces() {
        let input = "0 string 0 Long message continued here";
        let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap();
        assert!(rules[0].message.contains("continued"));
    }

    #[test]
    fn test_parse_text_magic_file_mixed_indentation() {
        let input = r"
0 byte 1 Root1
>4 byte 1 Child1
>4 byte 2 Child2
>>6 byte 3 Grandchild

0 byte 2 Root2
>4 byte 4 Child3
";
        let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap();
        assert_eq!(rules.len(), 2);
        assert_eq!(rules[0].children.len(), 2);
        assert_eq!(rules[0].children[1].children.len(), 1);
        assert_eq!(rules[1].children.len(), 1);
    }

    #[test]
    fn test_parse_text_magic_file_complex_real_world() {
        let input = r"
# Magic file for common formats

# ELF binaries
0 byte 0x7f ELF executable
>4 byte 1 Intel 80386
>4 byte 2 x86-64
>>5 byte 1 LSB
>>5 byte 2 MSB

# PDF files
0 byte 0x25 PDF document
>5 byte 0x31 version 1.0
>5 byte 0x34 version 1.4
>5 byte 0x32 version 2.0
";
        let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap();
        assert_eq!(rules.len(), 2);
        assert_eq!(rules[0].message, "ELF executable");
        assert!(rules[0].children.len() > 1);
    }

    // ============================================================
    // Strength directive integration tests
    //
    // Note: the fixtures below are raw strings, so `\\x7f` is two literal
    // backslashes in the magic text. These tests only assert on the rule
    // count and the attached strength modifier.
    // ============================================================

    #[test]
    fn test_parse_text_magic_file_with_strength_directive() {
        let input = r"
!:strength +10
0 string \\x7fELF ELF executable
";
        let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap();
        assert_eq!(rules.len(), 1);
        assert_eq!(rules[0].strength_modifier, Some(StrengthModifier::Add(10)));
    }

    #[test]
    fn test_parse_text_magic_file_strength_applies_to_next_rule() {
        let input = r"
!:strength *2
0 string \\x7fELF ELF executable
0 string \\x50\\x4b ZIP archive
";
        let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap();
        assert_eq!(rules.len(), 2);
        // Strength should only apply to the immediately following rule
        assert_eq!(
            rules[0].strength_modifier,
            Some(StrengthModifier::Multiply(2))
        );
        assert_eq!(rules[1].strength_modifier, None);
    }

    #[test]
    fn test_parse_text_magic_file_strength_with_child_rules() {
        let input = r"
!:strength =50
0 string \\x7fELF ELF executable
>4 byte 1 32-bit
>4 byte 2 64-bit
";
        let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap();
        assert_eq!(rules.len(), 1);
        // Strength applies to root rule
        assert_eq!(rules[0].strength_modifier, Some(StrengthModifier::Set(50)));
        // Children should not have strength modifier
        assert_eq!(rules[0].children[0].strength_modifier, None);
        assert_eq!(rules[0].children[1].strength_modifier, None);
    }

    #[test]
    fn test_parse_text_magic_file_multiple_strength_directives() {
        let input = r"
!:strength +10
0 string \\x7fELF ELF executable
!:strength -5
0 string \\x50\\x4b ZIP archive
";
        let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap();
        assert_eq!(rules.len(), 2);
        assert_eq!(rules[0].strength_modifier, Some(StrengthModifier::Add(10)));
        assert_eq!(
            rules[1].strength_modifier,
            Some(StrengthModifier::Subtract(5))
        );
    }

    #[test]
    fn test_parse_text_magic_file_strength_all_operators() {
        let inputs = [
            ("!:strength +20\n0 byte 1 Test", StrengthModifier::Add(20)),
            (
                "!:strength -15\n0 byte 1 Test",
                StrengthModifier::Subtract(15),
            ),
            (
                "!:strength *3\n0 byte 1 Test",
                StrengthModifier::Multiply(3),
            ),
            ("!:strength /2\n0 byte 1 Test", StrengthModifier::Divide(2)),
            ("!:strength =100\n0 byte 1 Test", StrengthModifier::Set(100)),
            ("!:strength 50\n0 byte 1 Test", StrengthModifier::Set(50)),
        ];

        for (input, expected_modifier) in inputs {
            let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap();
            assert_eq!(
                rules[0].strength_modifier,
                Some(expected_modifier),
                "Failed for input: {input}"
            );
        }
    }

    // ============================================================
    // Integration and edge case tests
    // ============================================================

    #[test]
    fn test_continuation_with_indentation() {
        // Raw string: the trailing backslash and the newline after it are
        // both literal, forming a continuation in the magic text.
        let input = r">4 byte 1 Message \
continued";
        let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap();
        assert_eq!(rules.len(), 1);
    }

    #[test]
    fn test_multiple_hex_offsets() {
        let input = r"
0x100 string 0 At 256
0x200 string 0 At 512
";
        let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap();
        assert_eq!(rules.len(), 2);
    }

    // ============================================================
    // Overflow protection tests
    // ============================================================

    #[test]
    fn test_overflow_decimal_too_many_digits() {
        use crate::parser::grammar::parse_number;
        // Test exactly 20 digits (should fail - over i64 max)
        let result = parse_number("12345678901234567890");
        assert!(result.is_err(), "Should reject 20+ decimal digits");
    }

    #[test]
    fn test_overflow_hex_too_many_digits() {
        use crate::parser::grammar::parse_number;
        // Test 17 hex digits (should fail)
        let result = parse_number("0x10000000000000000");
        assert!(result.is_err(), "Should reject 17+ hex digits");
    }

    #[test]
    fn test_overflow_i64_max() {
        use crate::parser::grammar::parse_number;
        // i64::MAX = 9223372036854775807
        let result = parse_number("9223372036854775807");
        assert!(result.is_ok(), "Should accept i64::MAX");
    }

    #[test]
    fn test_overflow_i64_max_plus_one() {
        use crate::parser::grammar::parse_number;
        // i64::MAX + 1 should fail
        let result = parse_number("9223372036854775808");
        assert!(result.is_err(), "Should reject i64::MAX + 1");
    }

    // ============================================================
    // Line number accuracy test (uses parse_text_magic_file)
    // ============================================================

    #[test]
    fn test_error_reports_correct_line_for_continuation() {
        // When a continued rule fails to parse, error should show the starting line
        let input = "0 string 0 valid\n0 invalid \\\nsyntax here\n0 string 0 valid2";
        let result = parse_text_magic_file(input);

        match result {
            Err(ref e) => {
                // Error should mention line 2 (start of the bad rule), not line 3
                let error_str = format!("{e:?}");
                assert!(
                    error_str.contains("line 2") || error_str.contains("line: 2"),
                    "Error should reference line 2, got: {error_str}"
                );
            }
            Ok(_) => panic!("Expected InvalidSyntax error"),
        }
    }
}

// Demonstration test: prints the output of each parser stage for a small
// fixture. It makes no assertions beyond each stage succeeding (`expect`).
#[cfg(test)]
mod output_test {
    use crate::parser::{
        ParsedMagic, build_rule_hierarchy, parse_text_magic_file, preprocess_lines,
    };

    #[test]
    fn demo_show_all_parser_outputs() {
        let input = r"
# ELF file
0 string 0 ELF
>4 byte 1 32-bit
>4 byte 2 64-bit

0 string 0 ZIP
>0 byte 3 zipped
";

        println!("\n================ RAW INPUT ================\n");
        println!("{input}");

        // --------------------------------------------------
        // 1. preprocess_lines
        // --------------------------------------------------
        println!("\n================ PREPROCESS LINES ================\n");

        let lines = preprocess_lines(input).expect("preprocess_lines failed");

        for (idx, line) in lines.iter().enumerate() {
            println!(
                "[{}] line_no={} is_comment={} content='{}'",
                idx, line.line_number, line.is_comment, line.content
            );
        }

        // --------------------------------------------------
        // 2. parse_text_magic_file (full pipeline)
        // --------------------------------------------------
        println!("\n================ PARSED MAGIC RULES ================\n");

        let ParsedMagic { rules, .. } =
            parse_text_magic_file(input).expect("parse_text_magic_file failed");

        for (i, rule) in rules.iter().enumerate() {
            println!("ROOT RULE [{i}]:");
            print_rule(rule, 1);
        }

        // --------------------------------------------------
        // 3. build_rule_hierarchy (explicit)
        // --------------------------------------------------
        println!("\n================ EXPLICIT HIERARCHY BUILD ================\n");

        // `lines` is moved here, which is why this stage runs last.
        let rebuilt = build_rule_hierarchy(lines).expect("build_rule_hierarchy failed");

        for (i, rule) in rebuilt.iter().enumerate() {
            println!("ROOT [{i}]:");
            print_rule(rule, 1);
        }
    }

    // Helper to pretty-print rule trees: one line per rule, two spaces of
    // padding per `indent` level, recursing into `children`.
    fn print_rule(rule: &crate::parser::MagicRule, indent: usize) {
        let pad = "  ".repeat(indent);

        println!(
            "{}- level={} offset={:?} type={:?} op={:?} value={:?} message='{}'",
            pad, rule.level, rule.offset, rule.typ, rule.op, rule.value, rule.message
        );

        for child in &rule.children {
            print_rule(child, indent + 1);
        }
    }
}
604}