libmagic_rs/parser/mod.rs
1// Copyright (c) 2025-2026 the libmagic-rs contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Magic file parser module
5//!
6//! This module handles parsing of magic files into an Abstract Syntax Tree (AST)
7//! that can be evaluated against file buffers for type identification.
8//!
9//! # Overview
10//!
11//! The parser implements a complete pipeline for transforming magic file text into
12//! a hierarchical rule structure suitable for evaluation. The pipeline consists of:
13//!
14//! 1. **Preprocessing**: Line handling, comment removal, continuation processing
15//! 2. **Parsing**: Individual magic rule parsing using nom combinators
16//! 3. **Hierarchy Building**: Constructing parent-child relationships based on indentation
17//! 4. **Validation**: Type checking and offset resolution
18//!
19//! # Format Detection and Loading
20//!
21//! The module automatically detects and handles three types of magic file formats:
22//! - **Text files**: Human-readable magic rule definitions
23//! - **Directories**: Collections of magic files (Magdir pattern)
24//! - **Binary files**: Compiled .mgc files (currently unsupported)
25//!
26//! ## Unified Loading API
27//!
28//! The recommended entry point for loading magic files is [`load_magic_file()`], which
29//! automatically detects the format and dispatches to the appropriate handler:
30//!
31//! ```ignore
32//! use libmagic_rs::parser::load_magic_file;
33//! use std::path::Path;
34//!
35//! // Works with text files
36//! let rules = load_magic_file(Path::new("/usr/share/misc/magic"))?;
37//!
38//! // Also works with directories
39//! let rules = load_magic_file(Path::new("/usr/share/misc/magic.d"))?;
40//!
41//! // Binary .mgc files return an error with guidance
42//! match load_magic_file(Path::new("/usr/share/misc/magic.mgc")) {
//!     Ok(rules) => { /* ... */ },
//!     Err(e) => eprintln!("Use --use-builtin for binary files: {}", e),
45//! }
46//! # Ok::<(), Box<dyn std::error::Error>>(())
47//! ```
48//!
49//! ## Three-Tier Loading Strategy
50//!
51//! The loading process works as follows:
52//!
53//! 1. **Format Detection**: [`detect_format()`] examines the path to determine the file type
54//! 2. **Dispatch to Handler**:
//!    - Text files -> [`parse_text_magic_file()`] after reading contents
//!    - Directories -> [`load_magic_directory()`] to load and merge all files
//!    - Binary files -> Returns error suggesting `--use-builtin` option
58//! 3. **Return Merged Rules**: All rules are returned in a single `Vec<MagicRule>`
59//!
60//! # Examples
61//!
62//! ## Loading Magic Files (Recommended)
63//!
64//! Use the unified [`load_magic_file()`] API for automatic format detection:
65//!
66//! ```ignore
67//! use libmagic_rs::parser::load_magic_file;
68//! use std::path::Path;
69//!
70//! let rules = load_magic_file(Path::new("/usr/share/misc/magic"))?;
71//! println!("Loaded {} magic rules", rules.len());
72//! # Ok::<(), Box<dyn std::error::Error>>(())
73//! ```
74//!
75//! ## Parsing Text Content Directly
76//!
77//! For parsing magic rule text that's already in memory:
78//!
79//! ```ignore
80//! use libmagic_rs::parser::parse_text_magic_file;
81//!
82//! let magic_content = r#"
83//! 0 string \x7fELF ELF executable
84//! >4 byte 1 32-bit
85//! >4 byte 2 64-bit
86//! "#;
87//!
//! let parsed = parse_text_magic_file(magic_content)?;
//! assert_eq!(parsed.rules.len(), 1);
//! assert_eq!(parsed.rules[0].children.len(), 2);
91//! # Ok::<(), Box<dyn std::error::Error>>(())
92//! ```
93//!
94//! ## Loading a Directory Explicitly
95//!
96//! For Magdir-style directories containing multiple magic files:
97//!
98//! ```ignore
99//! use libmagic_rs::parser::load_magic_directory;
100//! use std::path::Path;
101//!
102//! // Directory structure:
103//! // /usr/share/file/magic.d/
104//! // ├── elf
105//! // ├── archive
106//! // └── text
107//!
108//! let rules = load_magic_directory(Path::new("/usr/share/file/magic.d"))?;
109//! // Rules from all files are merged in alphabetical order by filename
110//! # Ok::<(), Box<dyn std::error::Error>>(())
111//! ```
112//!
113//! ## Migration Note
114//!
115//! **For users upgrading from direct function calls:**
116//!
117//! - **Old approach**: Call `detect_format()` then dispatch manually
118//! - **New approach**: Use `load_magic_file()` for automatic dispatching
119//!
120//! The individual functions (`parse_text_magic_file()`, `load_magic_directory()`)
121//! remain available for advanced use cases where you need direct control.
122//!
123//! **Key differences:**
124//! - `load_magic_file()`: Unified API with automatic format detection (recommended)
125//! - `parse_text_magic_file()`: Parses a single text string containing magic rules
126//! - `load_magic_directory()`: Loads and merges all magic files from a directory
127//! - `detect_format()`: Low-level format detection (now called internally by `load_magic_file()`)
128//!
129//! **Error handling in `load_magic_directory()`:**
130//! - Critical errors (I/O failures, invalid UTF-8): Returns `ParseError` immediately
131//! - Non-critical errors (parse failures in individual files): Logs warning to stderr and continues
132
133pub mod ast;
134#[allow(dead_code)]
135pub(crate) mod codegen;
136mod format;
137// `grammar` exposes nom-based parser combinators that are implementation
138// details of the magic-file parsing pipeline. Keep them visible to the rest
139// of the crate (for sibling modules and unit tests) but never to external
140// consumers -- the only supported parser entry points are the
141// `parse_text_magic_file` / `load_magic_file` functions in this module.
142pub(crate) mod grammar;
143mod hierarchy;
144mod loader;
145pub(crate) mod name_table;
146pub(crate) mod preprocessing;
147pub mod types;
148
149// Re-export AST types for convenience
150pub use ast::{Endianness, MagicRule, OffsetSpec, Operator, StrengthModifier, TypeKind, Value};
151
152// Re-export format detection and loading
153pub use format::{MagicFileFormat, detect_format};
154pub use loader::{load_magic_directory, load_magic_file};
155
156// Internal re-exports for sibling modules and tests
157pub(crate) use hierarchy::build_rule_hierarchy;
158pub(crate) use preprocessing::preprocess_lines;
159
160use crate::error::ParseError;
161
162/// Result of parsing a text magic file.
163///
164/// Contains the top-level rule list with any `name`-declared subroutines
165/// hoisted into a separate [`name_table::NameTable`] keyed by identifier.
166/// The rule list preserves the original ordering of all non-`Name` top-level
167/// rules, so strength-based sorting and evaluation semantics are unchanged
168/// for magic files that do not use the `name`/`use` directive pair.
169#[derive(Debug)]
170pub struct ParsedMagic {
171 /// Top-level rules after `Name` subroutines have been removed.
172 pub rules: Vec<MagicRule>,
173 /// Extracted `name` subroutine definitions, consulted by the evaluator
174 /// when a rule of type `TypeKind::Meta(MetaType::Use(_))` is reached.
175 pub(crate) name_table: name_table::NameTable,
176}
177
178/// Parses a complete magic file from raw text input.
179///
180/// This is the main public-facing parser function that orchestrates the complete
181/// parsing pipeline: preprocessing, parsing individual rules, and building the
182/// hierarchical structure.
183///
184/// # Arguments
185///
186/// * `input` - The raw magic file content as a string
187///
188/// # Returns
189///
190/// `Result<ParsedMagic, ParseError>` - A [`ParsedMagic`] value containing
191/// the top-level rules (with `name`-declared subroutines hoisted out) and
192/// the resulting name table.
193///
194/// # Errors
195///
196/// Returns an error if any stage of parsing fails:
197/// - Preprocessing errors
198/// - Rule parsing errors
199/// - Hierarchy building errors
200///
201/// # Example
202///
203/// ```ignore
204/// use libmagic_rs::parser::parse_text_magic_file;
205///
206/// let magic = r#"0 string \x7fELF ELF file
207/// >4 byte 1 32-bit
208/// >4 byte 2 64-bit"#;
209///
210/// let parsed = parse_text_magic_file(magic)?;
211/// assert_eq!(parsed.rules.len(), 1);
212/// assert_eq!(parsed.rules[0].message, "ELF file");
213/// # Ok::<(), Box<dyn std::error::Error>>(())
214/// ```
215pub fn parse_text_magic_file(input: &str) -> Result<ParsedMagic, ParseError> {
216 let lines = preprocess_lines(input)?;
217 let rules = build_rule_hierarchy(lines)?;
218 let (rules, name_table) = name_table::extract_name_table(rules);
219 Ok(ParsedMagic { rules, name_table })
220}
221
#[cfg(test)]
mod unit_tests {
    use super::*;
    use crate::parser::grammar::parse_number;

    /// Runs the complete text pipeline and returns only the top-level rules.
    ///
    /// Panics when parsing fails; `#[track_caller]` makes that panic point at
    /// the calling test instead of at this helper.
    #[track_caller]
    fn top_level_rules(input: &str) -> Vec<MagicRule> {
        parse_text_magic_file(input).unwrap().rules
    }

    // ------------------------------------------------------------
    // parse_text_magic_file: basic parsing and hierarchy
    // ------------------------------------------------------------

    /// A single line yields a single rule carrying its message.
    #[test]
    fn test_parse_text_magic_file_single_rule() {
        let rules = top_level_rules("0 string 0 ZIP archive");
        assert_eq!(rules.len(), 1);
        assert_eq!(rules[0].message, "ZIP archive");
    }

    /// `>`-prefixed lines become children of the preceding root rule.
    #[test]
    fn test_parse_text_magic_file_hierarchical_rules() {
        let magic = r"
0 string 0 ELF
>4 byte 1 32-bit
>4 byte 2 64-bit
";
        let rules = top_level_rules(magic);
        assert_eq!(rules.len(), 1);
        assert_eq!(rules[0].children.len(), 2);
    }

    /// Comment lines do not produce rules and do not break the hierarchy.
    #[test]
    fn test_parse_text_magic_file_with_comments() {
        let magic = r"
# ELF file format
0 string 0 ELF
>4 byte 1 32-bit
";
        let rules = top_level_rules(magic);
        assert_eq!(rules.len(), 1);
        assert_eq!(rules[0].children.len(), 1);
    }

    /// Two level-0 rules separated by a blank line give two roots.
    #[test]
    fn test_parse_text_magic_file_multiple_roots() {
        let magic = r"
0 byte 1 ELF
>4 byte 1 32-bit

0 byte 2 PDF
>5 byte 1 v1
";
        let rules = top_level_rules(magic);
        assert_eq!(rules.len(), 2);
    }

    /// Empty input parses successfully to an empty rule list.
    #[test]
    fn test_parse_text_magic_file_empty_input() {
        let rules = top_level_rules("");
        assert_eq!(rules.len(), 0);
    }

    /// Input consisting solely of comments yields no rules.
    #[test]
    fn test_parse_text_magic_file_only_comments() {
        let magic = r"
# Comment 1
# Comment 2
# Comment 3
";
        let rules = top_level_rules(magic);
        assert_eq!(rules.len(), 0);
    }

    /// Blank lines surrounding a rule are skipped.
    #[test]
    fn test_parse_text_magic_file_empty_lines_only() {
        let magic = r"


0 string 0 Test file


";
        let rules = top_level_rules(magic);
        assert_eq!(rules.len(), 1);
    }

    /// Messages may contain embedded spaces.
    #[test]
    fn test_parse_text_magic_file_with_message_spaces() {
        let rules = top_level_rules("0 string 0 Long message continued here");
        assert!(rules[0].message.contains("continued"));
    }

    /// Mixed nesting depths are attached to the correct parents.
    #[test]
    fn test_parse_text_magic_file_mixed_indentation() {
        let magic = r"
0 byte 1 Root1
>4 byte 1 Child1
>4 byte 2 Child2
>>6 byte 3 Grandchild

0 byte 2 Root2
>4 byte 4 Child3
";
        let rules = top_level_rules(magic);
        assert_eq!(rules.len(), 2);

        let (first_root, second_root) = (&rules[0], &rules[1]);
        assert_eq!(first_root.children.len(), 2);
        assert_eq!(first_root.children[1].children.len(), 1);
        assert_eq!(second_root.children.len(), 1);
    }

    /// A small multi-format file with comments, blank lines and nesting.
    #[test]
    fn test_parse_text_magic_file_complex_real_world() {
        let magic = r"
# Magic file for common formats

# ELF binaries
0 byte 0x7f ELF executable
>4 byte 1 Intel 80386
>4 byte 2 x86-64
>>5 byte 1 LSB
>>5 byte 2 MSB

# PDF files
0 byte 0x25 PDF document
>5 byte 0x31 version 1.0
>5 byte 0x34 version 1.4
>5 byte 0x32 version 2.0
";
        let rules = top_level_rules(magic);
        assert_eq!(rules.len(), 2);
        assert_eq!(rules[0].message, "ELF executable");
        assert!(rules[0].children.len() > 1);
    }

    // ------------------------------------------------------------
    // `!:strength` directive integration
    // ------------------------------------------------------------

    /// A strength directive is recorded on the rule that follows it.
    #[test]
    fn test_parse_text_magic_file_with_strength_directive() {
        let magic = r"
!:strength +10
0 string \\x7fELF ELF executable
";
        let rules = top_level_rules(magic);
        assert_eq!(rules.len(), 1);
        assert_eq!(rules[0].strength_modifier, Some(StrengthModifier::Add(10)));
    }

    /// Only the immediately following rule receives the modifier.
    #[test]
    fn test_parse_text_magic_file_strength_applies_to_next_rule() {
        let magic = r"
!:strength *2
0 string \\x7fELF ELF executable
0 string \\x50\\x4b ZIP archive
";
        let rules = top_level_rules(magic);
        assert_eq!(rules.len(), 2);
        assert_eq!(
            rules[0].strength_modifier,
            Some(StrengthModifier::Multiply(2))
        );
        assert_eq!(rules[1].strength_modifier, None);
    }

    /// The modifier lands on the root rule, never on its children.
    #[test]
    fn test_parse_text_magic_file_strength_with_child_rules() {
        let magic = r"
!:strength =50
0 string \\x7fELF ELF executable
>4 byte 1 32-bit
>4 byte 2 64-bit
";
        let rules = top_level_rules(magic);
        assert_eq!(rules.len(), 1);

        let root = &rules[0];
        assert_eq!(root.strength_modifier, Some(StrengthModifier::Set(50)));
        assert_eq!(root.children[0].strength_modifier, None);
        assert_eq!(root.children[1].strength_modifier, None);
    }

    /// Each directive pairs with its own following rule.
    #[test]
    fn test_parse_text_magic_file_multiple_strength_directives() {
        let magic = r"
!:strength +10
0 string \\x7fELF ELF executable
!:strength -5
0 string \\x50\\x4b ZIP archive
";
        let rules = top_level_rules(magic);
        assert_eq!(rules.len(), 2);
        assert_eq!(rules[0].strength_modifier, Some(StrengthModifier::Add(10)));
        assert_eq!(
            rules[1].strength_modifier,
            Some(StrengthModifier::Subtract(5))
        );
    }

    /// Every operator spelling maps to the matching `StrengthModifier`;
    /// a bare number behaves like `=`.
    #[test]
    fn test_parse_text_magic_file_strength_all_operators() {
        let cases = [
            ("!:strength +20\n0 byte 1 Test", StrengthModifier::Add(20)),
            (
                "!:strength -15\n0 byte 1 Test",
                StrengthModifier::Subtract(15),
            ),
            (
                "!:strength *3\n0 byte 1 Test",
                StrengthModifier::Multiply(3),
            ),
            ("!:strength /2\n0 byte 1 Test", StrengthModifier::Divide(2)),
            ("!:strength =100\n0 byte 1 Test", StrengthModifier::Set(100)),
            ("!:strength 50\n0 byte 1 Test", StrengthModifier::Set(50)),
        ];

        for (input, want) in cases {
            let rules = top_level_rules(input);
            assert_eq!(
                rules[0].strength_modifier,
                Some(want),
                "Failed for input: {input}"
            );
        }
    }

    // ------------------------------------------------------------
    // Integration and edge cases
    // ------------------------------------------------------------

    /// A backslash continuation is joined into a single rule.
    #[test]
    fn test_continuation_with_indentation() {
        let magic = r">4 byte 1 Message \
continued";
        let rules = top_level_rules(magic);
        assert_eq!(rules.len(), 1);
    }

    /// Hexadecimal offsets are accepted on consecutive root rules.
    #[test]
    fn test_multiple_hex_offsets() {
        let magic = r"
0x100 string 0 At 256
0x200 string 0 At 512
";
        let rules = top_level_rules(magic);
        assert_eq!(rules.len(), 2);
    }

    // ------------------------------------------------------------
    // Numeric overflow protection in `parse_number`
    // ------------------------------------------------------------

    /// Twenty decimal digits exceed `i64::MAX` and must be rejected.
    #[test]
    fn test_overflow_decimal_too_many_digits() {
        let outcome = parse_number("12345678901234567890");
        assert!(outcome.is_err(), "Should reject 20+ decimal digits");
    }

    /// Seventeen hex digits must be rejected.
    #[test]
    fn test_overflow_hex_too_many_digits() {
        let outcome = parse_number("0x10000000000000000");
        assert!(outcome.is_err(), "Should reject 17+ hex digits");
    }

    /// `i64::MAX` (9223372036854775807) itself is still accepted.
    #[test]
    fn test_overflow_i64_max() {
        let outcome = parse_number("9223372036854775807");
        assert!(outcome.is_ok(), "Should accept i64::MAX");
    }

    /// One past `i64::MAX` must be rejected.
    #[test]
    fn test_overflow_i64_max_plus_one() {
        let outcome = parse_number("9223372036854775808");
        assert!(outcome.is_err(), "Should reject i64::MAX + 1");
    }

    // ------------------------------------------------------------
    // Line number accuracy (through parse_text_magic_file)
    // ------------------------------------------------------------

    /// When a continued rule fails to parse, the error must name the line on
    /// which the rule started (2), not the line where the continuation ended (3).
    #[test]
    fn test_error_reports_correct_line_for_continuation() {
        let input = "0 string 0 valid\n0 invalid \\\nsyntax here\n0 string 0 valid2";

        let Err(e) = parse_text_magic_file(input) else {
            panic!("Expected InvalidSyntax error");
        };

        let error_str = format!("{e:?}");
        assert!(
            error_str.contains("line 2") || error_str.contains("line: 2"),
            "Error should reference line 2, got: {error_str}"
        );
    }
}
529
#[cfg(test)]
mod output_test {
    use crate::parser::{
        ParsedMagic, build_rule_hierarchy, parse_text_magic_file, preprocess_lines,
    };

    /// Walks one small magic file through every parser stage, printing each
    /// intermediate result (visible with `cargo test -- --nocapture`) and
    /// checking the shape of the resulting rule trees so the demo also fails
    /// when the pipeline regresses.
    #[test]
    fn demo_show_all_parser_outputs() {
        let input = r"
# ELF file
0 string 0 ELF
>4 byte 1 32-bit
>4 byte 2 64-bit

0 string 0 ZIP
>0 byte 3 zipped
";

        println!("\n================ RAW INPUT ================\n");
        println!("{input}");

        // --------------------------------------------------
        // 1. preprocess_lines
        // --------------------------------------------------
        println!("\n================ PREPROCESS LINES ================\n");

        let lines = preprocess_lines(input).expect("preprocess_lines failed");

        for (idx, line) in lines.iter().enumerate() {
            println!(
                "[{}] line_no={} is_comment={} content='{}'",
                idx, line.line_number, line.is_comment, line.content
            );
        }

        // --------------------------------------------------
        // 2. parse_text_magic_file (full pipeline)
        // --------------------------------------------------
        println!("\n================ PARSED MAGIC RULES ================\n");

        let ParsedMagic { rules, .. } =
            parse_text_magic_file(input).expect("parse_text_magic_file failed");

        for (i, rule) in rules.iter().enumerate() {
            println!("ROOT RULE [{i}]:");
            print_rule(rule, 1);
        }

        // Two level-0 lines give two roots; the `>` lines hang below them.
        assert_eq!(rules.len(), 2);
        assert_eq!(rules[0].children.len(), 2);
        assert_eq!(rules[1].children.len(), 1);

        // --------------------------------------------------
        // 3. build_rule_hierarchy (explicit)
        // --------------------------------------------------
        println!("\n================ EXPLICIT HIERARCHY BUILD ================\n");

        let rebuilt = build_rule_hierarchy(lines).expect("build_rule_hierarchy failed");

        for (i, rule) in rebuilt.iter().enumerate() {
            println!("ROOT [{i}]:");
            print_rule(rule, 1);
        }

        // The input declares no `name` subroutines, so nothing is hoisted out
        // and the explicit build must have as many roots as the full pipeline.
        assert_eq!(rebuilt.len(), rules.len());
    }

    /// Prints `rule` and, recursively, all of its children.
    ///
    /// `indent` is the number of spaces placed before the rule's own line;
    /// each nesting level adds one more.
    fn print_rule(rule: &crate::parser::MagicRule, indent: usize) {
        let pad = " ".repeat(indent);

        println!(
            "{}- level={} offset={:?} type={:?} op={:?} value={:?} message='{}'",
            pad, rule.level, rule.offset, rule.typ, rule.op, rule.value, rule.message
        );

        for child in &rule.children {
            print_rule(child, indent + 1);
        }
    }
}