Skip to main content

libmagic_rs/parser/
mod.rs

1// Copyright (c) 2025-2026 the libmagic-rs contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Magic file parser module
5//!
6//! This module handles parsing of magic files into an Abstract Syntax Tree (AST)
7//! that can be evaluated against file buffers for type identification.
8//!
9//! # Overview
10//!
11//! The parser implements a complete pipeline for transforming magic file text into
12//! a hierarchical rule structure suitable for evaluation. The pipeline consists of:
13//!
14//! 1. **Preprocessing**: Line handling, comment removal, continuation processing
15//! 2. **Parsing**: Individual magic rule parsing using nom combinators
16//! 3. **Hierarchy Building**: Constructing parent-child relationships based on indentation
17//! 4. **Validation**: Type checking and offset resolution
18//!
19//! # Format Detection and Loading
20//!
21//! The module automatically detects and handles three types of magic file formats:
22//! - **Text files**: Human-readable magic rule definitions
23//! - **Directories**: Collections of magic files (Magdir pattern)
24//! - **Binary files**: Compiled .mgc files (currently unsupported)
25//!
26//! ## Unified Loading API
27//!
28//! The recommended entry point for loading magic files is [`load_magic_file()`], which
29//! automatically detects the format and dispatches to the appropriate handler:
30//!
31//! ```ignore
32//! use libmagic_rs::parser::load_magic_file;
33//! use std::path::Path;
34//!
35//! // Works with text files
36//! let rules = load_magic_file(Path::new("/usr/share/misc/magic"))?;
37//!
38//! // Also works with directories
39//! let rules = load_magic_file(Path::new("/usr/share/misc/magic.d"))?;
40//!
41//! // Binary .mgc files return an error with guidance
42//! match load_magic_file(Path::new("/usr/share/misc/magic.mgc")) {
43//!     Ok(rules) => { /* ... */ },
44//!     Err(e) => eprintln!("Use --use-builtin for binary files: {}", e),
45//! }
46//! # Ok::<(), Box<dyn std::error::Error>>(())
47//! ```
48//!
49//! ## Three-Tier Loading Strategy
50//!
51//! The loading process works as follows:
52//!
53//! 1. **Format Detection**: [`detect_format()`] examines the path to determine the file type
54//! 2. **Dispatch to Handler**:
55//!    - Text files -> [`parse_text_magic_file()`] after reading contents
56//!    - Directories -> [`load_magic_directory()`] to load and merge all files
57//!    - Binary files -> Returns error suggesting `--use-builtin` option
58//! 3. **Return Merged Rules**: All rules are returned in a single `Vec<MagicRule>`
59//!
60//! # Examples
61//!
62//! ## Loading Magic Files (Recommended)
63//!
64//! Use the unified [`load_magic_file()`] API for automatic format detection:
65//!
66//! ```ignore
67//! use libmagic_rs::parser::load_magic_file;
68//! use std::path::Path;
69//!
70//! let rules = load_magic_file(Path::new("/usr/share/misc/magic"))?;
71//! println!("Loaded {} magic rules", rules.len());
72//! # Ok::<(), Box<dyn std::error::Error>>(())
73//! ```
74//!
75//! ## Parsing Text Content Directly
76//!
77//! For parsing magic rule text that's already in memory:
78//!
79//! ```ignore
80//! use libmagic_rs::parser::parse_text_magic_file;
81//!
82//! let magic_content = r#"
83//! 0 string \x7fELF ELF executable
84//! >4 byte 1 32-bit
85//! >4 byte 2 64-bit
86//! "#;
87//!
88//! let rules = parse_text_magic_file(magic_content)?;
89//! assert_eq!(rules.len(), 1);
90//! assert_eq!(rules[0].children.len(), 2);
91//! # Ok::<(), Box<dyn std::error::Error>>(())
92//! ```
93//!
94//! ## Loading a Directory Explicitly
95//!
96//! For Magdir-style directories containing multiple magic files:
97//!
98//! ```ignore
99//! use libmagic_rs::parser::load_magic_directory;
100//! use std::path::Path;
101//!
102//! // Directory structure:
103//! // /usr/share/file/magic.d/
104//! //   ├── elf
105//! //   ├── archive
106//! //   └── text
107//!
108//! let rules = load_magic_directory(Path::new("/usr/share/file/magic.d"))?;
109//! // Rules from all files are merged in alphabetical order by filename
110//! # Ok::<(), Box<dyn std::error::Error>>(())
111//! ```
112//!
113//! ## Migration Note
114//!
115//! **For users upgrading from direct function calls:**
116//!
117//! - **Old approach**: Call `detect_format()` then dispatch manually
118//! - **New approach**: Use `load_magic_file()` for automatic dispatching
119//!
120//! The individual functions (`parse_text_magic_file()`, `load_magic_directory()`)
121//! remain available for advanced use cases where you need direct control.
122//!
123//! **Key differences:**
124//! - `load_magic_file()`: Unified API with automatic format detection (recommended)
125//! - `parse_text_magic_file()`: Parses a single text string containing magic rules
126//! - `load_magic_directory()`: Loads and merges all magic files from a directory
127//! - `detect_format()`: Low-level format detection (now called internally by `load_magic_file()`)
128//!
129//! **Error handling in `load_magic_directory()`:**
130//! - Critical errors (I/O failures, invalid UTF-8): Returns `ParseError` immediately
131//! - Non-critical errors (parse failures in individual files): Logs warning to stderr and continues
132
133pub mod ast;
134#[allow(dead_code)]
135pub(crate) mod codegen;
136mod format;
137pub mod grammar;
138mod hierarchy;
139mod loader;
140pub(crate) mod preprocessing;
141pub mod types;
142
143// Re-export AST types for convenience
144pub use ast::{Endianness, MagicRule, OffsetSpec, Operator, StrengthModifier, TypeKind, Value};
145
146// Re-export parser functions for convenience
147pub use grammar::{parse_number, parse_offset};
148
149// Re-export format detection and loading
150pub use format::{MagicFileFormat, detect_format};
151pub use loader::{load_magic_directory, load_magic_file};
152
153// Internal re-exports for sibling modules and tests
154pub(crate) use hierarchy::build_rule_hierarchy;
155pub(crate) use preprocessing::preprocess_lines;
156
157use crate::error::ParseError;
158
159/// Parses a complete magic file from raw text input.
160///
161/// This is the main public-facing parser function that orchestrates the complete
162/// parsing pipeline: preprocessing, parsing individual rules, and building the
163/// hierarchical structure.
164///
165/// # Arguments
166///
167/// * `input` - The raw magic file content as a string
168///
169/// # Returns
170///
171/// `Result<Vec<MagicRule>, ParseError>` - A vector of root rules with nested children
172///
173/// # Errors
174///
175/// Returns an error if any stage of parsing fails:
176/// - Preprocessing errors
177/// - Rule parsing errors
178/// - Hierarchy building errors
179///
180/// # Example
181///
182/// ```ignore
183/// use libmagic_rs::parser::parse_text_magic_file;
184///
185/// let magic = r#"0 string \x7fELF ELF file
186/// >4 byte 1 32-bit
187/// >4 byte 2 64-bit"#;
188///
189/// let rules = parse_text_magic_file(magic)?;
190/// assert_eq!(rules.len(), 1);
191/// assert_eq!(rules[0].message, "ELF file");
192/// # Ok::<(), Box<dyn std::error::Error>>(())
193/// ```
194pub fn parse_text_magic_file(input: &str) -> Result<Vec<MagicRule>, ParseError> {
195    let lines = preprocess_lines(input)?;
196    build_rule_hierarchy(lines)
197}
198
#[cfg(test)]
mod unit_tests {
    use super::*;

    /// Runs the full text pipeline and unwraps the result.
    ///
    /// `#[track_caller]` keeps the panic location pointing at the calling test
    /// when the input unexpectedly fails to parse.
    #[track_caller]
    fn parse_ok(source: &str) -> Vec<MagicRule> {
        parse_text_magic_file(source).unwrap()
    }

    // ------------------------------------------------------------
    // parse_text_magic_file: structure of the returned rule tree
    // ------------------------------------------------------------

    #[test]
    fn test_parse_text_magic_file_single_rule() {
        let roots = parse_ok("0 string 0 ZIP archive");
        assert_eq!(roots.len(), 1);
        assert_eq!(roots[0].message, "ZIP archive");
    }

    #[test]
    fn test_parse_text_magic_file_hierarchical_rules() {
        let source = r"
0 string 0 ELF
>4 byte 1 32-bit
>4 byte 2 64-bit
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 1);
        assert_eq!(roots[0].children.len(), 2);
    }

    #[test]
    fn test_parse_text_magic_file_with_comments() {
        let source = r"
# ELF file format
0 string 0 ELF
>4 byte 1 32-bit
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 1);
        assert_eq!(roots[0].children.len(), 1);
    }

    #[test]
    fn test_parse_text_magic_file_multiple_roots() {
        let source = r"
0 byte 1 ELF
>4 byte 1 32-bit

0 byte 2 PDF
>5 byte 1 v1
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 2);
    }

    #[test]
    fn test_parse_text_magic_file_empty_input() {
        let roots = parse_ok("");
        assert!(roots.is_empty());
    }

    #[test]
    fn test_parse_text_magic_file_only_comments() {
        let source = r"
# Comment 1
# Comment 2
# Comment 3
";
        let roots = parse_ok(source);
        assert!(roots.is_empty());
    }

    #[test]
    fn test_parse_text_magic_file_empty_lines_only() {
        // A single rule surrounded by blank lines on both sides.
        let source = r"


0 string 0 Test file


";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 1);
    }

    #[test]
    fn test_parse_text_magic_file_with_message_spaces() {
        let roots = parse_ok("0 string 0 Long message continued here");
        assert!(roots[0].message.contains("continued"));
    }

    #[test]
    fn test_parse_text_magic_file_mixed_indentation() {
        let source = r"
0 byte 1 Root1
>4 byte 1 Child1
>4 byte 2 Child2
>>6 byte 3 Grandchild

0 byte 2 Root2
>4 byte 4 Child3
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 2);

        let (first, second) = (&roots[0], &roots[1]);
        assert_eq!(first.children.len(), 2);
        assert_eq!(first.children[1].children.len(), 1);
        assert_eq!(second.children.len(), 1);
    }

    #[test]
    fn test_parse_text_magic_file_complex_real_world() {
        let source = r"
# Magic file for common formats

# ELF binaries
0 byte 0x7f ELF executable
>4 byte 1 Intel 80386
>4 byte 2 x86-64
>>5 byte 1 LSB
>>5 byte 2 MSB

# PDF files
0 byte 0x25 PDF document
>5 byte 0x31 version 1.0
>5 byte 0x34 version 1.4
>5 byte 0x32 version 2.0
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 2);
        assert_eq!(roots[0].message, "ELF executable");
        assert!(roots[0].children.len() > 1);
    }

    // ------------------------------------------------------------
    // `!:strength` directive handling through the full pipeline
    // ------------------------------------------------------------

    #[test]
    fn test_parse_text_magic_file_with_strength_directive() {
        let source = r"
!:strength +10
0 string \\x7fELF ELF executable
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 1);
        assert_eq!(roots[0].strength_modifier, Some(StrengthModifier::Add(10)));
    }

    #[test]
    fn test_parse_text_magic_file_strength_applies_to_next_rule() {
        let source = r"
!:strength *2
0 string \\x7fELF ELF executable
0 string \\x50\\x4b ZIP archive
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 2);
        // Only the rule directly after the directive picks up the modifier.
        assert_eq!(
            roots[0].strength_modifier,
            Some(StrengthModifier::Multiply(2))
        );
        assert_eq!(roots[1].strength_modifier, None);
    }

    #[test]
    fn test_parse_text_magic_file_strength_with_child_rules() {
        let source = r"
!:strength =50
0 string \\x7fELF ELF executable
>4 byte 1 32-bit
>4 byte 2 64-bit
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 1);

        let root = &roots[0];
        // The modifier lands on the root rule...
        assert_eq!(root.strength_modifier, Some(StrengthModifier::Set(50)));
        // ...and neither of the first two children carries one.
        for child in &root.children[..2] {
            assert_eq!(child.strength_modifier, None);
        }
    }

    #[test]
    fn test_parse_text_magic_file_multiple_strength_directives() {
        let source = r"
!:strength +10
0 string \\x7fELF ELF executable
!:strength -5
0 string \\x50\\x4b ZIP archive
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 2);
        assert_eq!(roots[0].strength_modifier, Some(StrengthModifier::Add(10)));
        assert_eq!(
            roots[1].strength_modifier,
            Some(StrengthModifier::Subtract(5))
        );
    }

    #[test]
    fn test_parse_text_magic_file_strength_all_operators() {
        // One (magic text, expected modifier) pair per supported operator form,
        // including the bare-number form.
        let cases = [
            ("!:strength +20\n0 byte 1 Test", StrengthModifier::Add(20)),
            (
                "!:strength -15\n0 byte 1 Test",
                StrengthModifier::Subtract(15),
            ),
            (
                "!:strength *3\n0 byte 1 Test",
                StrengthModifier::Multiply(3),
            ),
            ("!:strength /2\n0 byte 1 Test", StrengthModifier::Divide(2)),
            ("!:strength =100\n0 byte 1 Test", StrengthModifier::Set(100)),
            ("!:strength 50\n0 byte 1 Test", StrengthModifier::Set(50)),
        ];

        for (input, expected) in cases {
            let roots = parse_ok(input);
            assert_eq!(
                roots[0].strength_modifier,
                Some(expected),
                "Failed for input: {input}"
            );
        }
    }

    // ------------------------------------------------------------
    // Continuation lines and hex offsets
    // ------------------------------------------------------------

    #[test]
    fn test_continuation_with_indentation() {
        // Raw string: the trailing backslash and the newline are literal input.
        let source = r">4 byte 1 Message \
continued";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 1);
    }

    #[test]
    fn test_multiple_hex_offsets() {
        let source = r"
0x100 string 0 At 256
0x200 string 0 At 512
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 2);
    }

    // ------------------------------------------------------------
    // parse_number overflow guards
    // (`parse_number` is in scope through `use super::*`)
    // ------------------------------------------------------------

    #[test]
    fn test_overflow_decimal_too_many_digits() {
        // A 20-digit decimal value is larger than i64::MAX.
        assert!(
            parse_number("12345678901234567890").is_err(),
            "Should reject 20+ decimal digits"
        );
    }

    #[test]
    fn test_overflow_hex_too_many_digits() {
        // 17 hex digits after the 0x prefix.
        assert!(
            parse_number("0x10000000000000000").is_err(),
            "Should reject 17+ hex digits"
        );
    }

    #[test]
    fn test_overflow_i64_max() {
        // 9223372036854775807 == i64::MAX
        assert!(
            parse_number("9223372036854775807").is_ok(),
            "Should accept i64::MAX"
        );
    }

    #[test]
    fn test_overflow_i64_max_plus_one() {
        // One past i64::MAX.
        assert!(
            parse_number("9223372036854775808").is_err(),
            "Should reject i64::MAX + 1"
        );
    }

    // ------------------------------------------------------------
    // Error line numbers for continued rules
    // ------------------------------------------------------------

    #[test]
    fn test_error_reports_correct_line_for_continuation() {
        // The broken rule starts on line 2 and continues onto line 3; the
        // reported line must be where it starts.
        let source = "0 string 0 valid\n0 invalid \\\nsyntax here\n0 string 0 valid2";

        let error_str = match parse_text_magic_file(source) {
            Err(e) => format!("{e:?}"),
            Ok(_) => panic!("Expected InvalidSyntax error"),
        };

        let mentions_line_two = error_str.contains("line 2") || error_str.contains("line: 2");
        assert!(
            mentions_line_two,
            "Error should reference line 2, got: {error_str}"
        );
    }
}
506
#[cfg(test)]
mod output_test {
    use crate::parser::{
        MagicRule, build_rule_hierarchy, parse_text_magic_file, preprocess_lines,
    };

    /// Demo test: prints the output of each parser stage for a small input.
    ///
    /// Nothing is asserted beyond each stage succeeding (`expect`).
    #[test]
    fn demo_show_all_parser_outputs() {
        let input = r"
# ELF file
0 string 0 ELF
>4 byte 1 32-bit
>4 byte 2 64-bit

0 string 0 ZIP
>0 byte 3 zipped
";

        println!("\n================ RAW INPUT ================\n");
        println!("{input}");

        // Stage 1: preprocess_lines
        println!("\n================ PREPROCESS LINES ================\n");

        let preprocessed = preprocess_lines(input).expect("preprocess_lines failed");

        for (position, entry) in preprocessed.iter().enumerate() {
            println!(
                "[{}] line_no={} is_comment={} content='{}'",
                position, entry.line_number, entry.is_comment, entry.content
            );
        }

        // Stage 2: parse_text_magic_file (the whole pipeline in one call)
        println!("\n================ PARSED MAGIC RULES ================\n");

        let parsed = parse_text_magic_file(input).expect("parse_text_magic_file failed");

        for (i, root) in parsed.iter().enumerate() {
            println!("ROOT RULE [{i}]:");
            print_rule(root, 1);
        }

        // Stage 3: build_rule_hierarchy called directly on the stage-1 lines
        println!("\n================ EXPLICIT HIERARCHY BUILD ================\n");

        let explicit = build_rule_hierarchy(preprocessed).expect("build_rule_hierarchy failed");

        for (i, root) in explicit.iter().enumerate() {
            println!("ROOT [{i}]:");
            print_rule(root, 1);
        }
    }

    /// Prints `rule` on one line, indented two spaces per `indent` level, then
    /// prints each of its children one level deeper.
    fn print_rule(rule: &MagicRule, indent: usize) {
        let prefix = "  ".repeat(indent);

        println!(
            "{}- level={} offset={:?} type={:?} op={:?} value={:?} message='{}'",
            prefix, rule.level, rule.offset, rule.typ, rule.op, rule.value, rule.message
        );

        rule.children
            .iter()
            .for_each(|child| print_rule(child, indent + 1));
    }
}