libmagic_rs/parser/mod.rs
1// Copyright (c) 2025-2026 the libmagic-rs contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Magic file parser module
5//!
6//! This module handles parsing of magic files into an Abstract Syntax Tree (AST)
7//! that can be evaluated against file buffers for type identification.
8//!
9//! # Overview
10//!
11//! The parser implements a complete pipeline for transforming magic file text into
12//! a hierarchical rule structure suitable for evaluation. The pipeline consists of:
13//!
14//! 1. **Preprocessing**: Line handling, comment removal, continuation processing
15//! 2. **Parsing**: Individual magic rule parsing using nom combinators
16//! 3. **Hierarchy Building**: Constructing parent-child relationships based on indentation
17//! 4. **Validation**: Type checking and offset resolution
18//!
19//! # Format Detection and Loading
20//!
21//! The module automatically detects and handles three types of magic file formats:
22//! - **Text files**: Human-readable magic rule definitions
23//! - **Directories**: Collections of magic files (Magdir pattern)
24//! - **Binary files**: Compiled .mgc files (currently unsupported)
25//!
26//! ## Unified Loading API
27//!
28//! The recommended entry point for loading magic files is [`load_magic_file()`], which
29//! automatically detects the format and dispatches to the appropriate handler:
30//!
31//! ```ignore
32//! use libmagic_rs::parser::load_magic_file;
33//! use std::path::Path;
34//!
35//! // Works with text files
36//! let rules = load_magic_file(Path::new("/usr/share/misc/magic"))?;
37//!
38//! // Also works with directories
39//! let rules = load_magic_file(Path::new("/usr/share/misc/magic.d"))?;
40//!
41//! // Binary .mgc files return an error with guidance
42//! match load_magic_file(Path::new("/usr/share/misc/magic.mgc")) {
43//! Ok(rules) => { /* ... */ },
44//! Err(e) => eprintln!("Use --use-builtin for binary files: {}", e),
45//! }
46//! # Ok::<(), Box<dyn std::error::Error>>(())
47//! ```
48//!
49//! ## Three-Tier Loading Strategy
50//!
51//! The loading process works as follows:
52//!
53//! 1. **Format Detection**: [`detect_format()`] examines the path to determine the file type
54//! 2. **Dispatch to Handler**:
55//! - Text files -> [`parse_text_magic_file()`] after reading contents
56//! - Directories -> [`load_magic_directory()`] to load and merge all files
57//! - Binary files -> Returns error suggesting `--use-builtin` option
58//! 3. **Return Merged Rules**: All rules are returned in a single `Vec<MagicRule>`
59//!
60//! # Examples
61//!
62//! ## Loading Magic Files (Recommended)
63//!
64//! Use the unified [`load_magic_file()`] API for automatic format detection:
65//!
66//! ```ignore
67//! use libmagic_rs::parser::load_magic_file;
68//! use std::path::Path;
69//!
70//! let rules = load_magic_file(Path::new("/usr/share/misc/magic"))?;
71//! println!("Loaded {} magic rules", rules.len());
72//! # Ok::<(), Box<dyn std::error::Error>>(())
73//! ```
74//!
75//! ## Parsing Text Content Directly
76//!
77//! For parsing magic rule text that's already in memory:
78//!
79//! ```ignore
80//! use libmagic_rs::parser::parse_text_magic_file;
81//!
82//! let magic_content = r#"
83//! 0 string \x7fELF ELF executable
84//! >4 byte 1 32-bit
85//! >4 byte 2 64-bit
86//! "#;
87//!
88//! let rules = parse_text_magic_file(magic_content)?;
89//! assert_eq!(rules.len(), 1);
90//! assert_eq!(rules[0].children.len(), 2);
91//! # Ok::<(), Box<dyn std::error::Error>>(())
92//! ```
93//!
94//! ## Loading a Directory Explicitly
95//!
96//! For Magdir-style directories containing multiple magic files:
97//!
98//! ```ignore
99//! use libmagic_rs::parser::load_magic_directory;
100//! use std::path::Path;
101//!
102//! // Directory structure:
103//! // /usr/share/file/magic.d/
104//! // ├── elf
105//! // ├── archive
106//! // └── text
107//!
108//! let rules = load_magic_directory(Path::new("/usr/share/file/magic.d"))?;
109//! // Rules from all files are merged in alphabetical order by filename
110//! # Ok::<(), Box<dyn std::error::Error>>(())
111//! ```
112//!
113//! ## Migration Note
114//!
115//! **For users upgrading from direct function calls:**
116//!
117//! - **Old approach**: Call `detect_format()` then dispatch manually
118//! - **New approach**: Use `load_magic_file()` for automatic dispatching
119//!
120//! The individual functions (`parse_text_magic_file()`, `load_magic_directory()`)
121//! remain available for advanced use cases where you need direct control.
122//!
123//! **Key differences:**
124//! - `load_magic_file()`: Unified API with automatic format detection (recommended)
125//! - `parse_text_magic_file()`: Parses a single text string containing magic rules
126//! - `load_magic_directory()`: Loads and merges all magic files from a directory
127//! - `detect_format()`: Low-level format detection (now called internally by `load_magic_file()`)
128//!
129//! **Error handling in `load_magic_directory()`:**
130//! - Critical errors (I/O failures, invalid UTF-8): Returns `ParseError` immediately
131//! - Non-critical errors (parse failures in individual files): Logs warning to stderr and continues
132
133pub mod ast;
134#[allow(dead_code)]
135pub(crate) mod codegen;
136mod format;
137pub mod grammar;
138mod hierarchy;
139mod loader;
140pub(crate) mod preprocessing;
141pub mod types;
142
143// Re-export AST types for convenience
144pub use ast::{Endianness, MagicRule, OffsetSpec, Operator, StrengthModifier, TypeKind, Value};
145
146// Re-export parser functions for convenience
147pub use grammar::{parse_number, parse_offset};
148
149// Re-export format detection and loading
150pub use format::{MagicFileFormat, detect_format};
151pub use loader::{load_magic_directory, load_magic_file};
152
153// Internal re-exports for sibling modules and tests
154pub(crate) use hierarchy::build_rule_hierarchy;
155pub(crate) use preprocessing::preprocess_lines;
156
157use crate::error::ParseError;
158
159/// Parses a complete magic file from raw text input.
160///
161/// This is the main public-facing parser function that orchestrates the complete
162/// parsing pipeline: preprocessing, parsing individual rules, and building the
163/// hierarchical structure.
164///
165/// # Arguments
166///
167/// * `input` - The raw magic file content as a string
168///
169/// # Returns
170///
171/// `Result<Vec<MagicRule>, ParseError>` - A vector of root rules with nested children
172///
173/// # Errors
174///
175/// Returns an error if any stage of parsing fails:
176/// - Preprocessing errors
177/// - Rule parsing errors
178/// - Hierarchy building errors
179///
180/// # Example
181///
182/// ```ignore
183/// use libmagic_rs::parser::parse_text_magic_file;
184///
185/// let magic = r#"0 string \x7fELF ELF file
186/// >4 byte 1 32-bit
187/// >4 byte 2 64-bit"#;
188///
189/// let rules = parse_text_magic_file(magic)?;
190/// assert_eq!(rules.len(), 1);
191/// assert_eq!(rules[0].message, "ELF file");
192/// # Ok::<(), Box<dyn std::error::Error>>(())
193/// ```
194pub fn parse_text_magic_file(input: &str) -> Result<Vec<MagicRule>, ParseError> {
195 let lines = preprocess_lines(input)?;
196 build_rule_hierarchy(lines)
197}
198
#[cfg(test)]
mod unit_tests {
    use super::*;

    /// Runs the full text pipeline and unwraps the outcome.
    ///
    /// `#[track_caller]` keeps the panic location pointing at the calling test
    /// when the input is unexpectedly rejected.
    #[track_caller]
    fn parse_ok(source: &str) -> Vec<MagicRule> {
        parse_text_magic_file(source).unwrap()
    }

    // ------------------------------------------------------------
    // parse_text_magic_file: structure of the returned rule tree
    // ------------------------------------------------------------

    #[test]
    fn test_parse_text_magic_file_single_rule() {
        let roots = parse_ok("0 string 0 ZIP archive");
        assert_eq!(roots.len(), 1);
        assert_eq!(roots[0].message, "ZIP archive");
    }

    #[test]
    fn test_parse_text_magic_file_hierarchical_rules() {
        let source = r"
0 string 0 ELF
>4 byte 1 32-bit
>4 byte 2 64-bit
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 1);
        assert_eq!(roots[0].children.len(), 2);
    }

    #[test]
    fn test_parse_text_magic_file_with_comments() {
        let source = r"
# ELF file format
0 string 0 ELF
>4 byte 1 32-bit
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 1);
        assert_eq!(roots[0].children.len(), 1);
    }

    #[test]
    fn test_parse_text_magic_file_multiple_roots() {
        let source = r"
0 byte 1 ELF
>4 byte 1 32-bit

0 byte 2 PDF
>5 byte 1 v1
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 2);
    }

    #[test]
    fn test_parse_text_magic_file_empty_input() {
        let roots = parse_ok("");
        assert!(roots.is_empty());
    }

    #[test]
    fn test_parse_text_magic_file_only_comments() {
        let source = r"
# Comment 1
# Comment 2
# Comment 3
";
        let roots = parse_ok(source);
        assert!(roots.is_empty());
    }

    #[test]
    fn test_parse_text_magic_file_empty_lines_only() {
        // A single rule surrounded by blank lines on both sides.
        let source = r"


0 string 0 Test file


";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 1);
    }

    #[test]
    fn test_parse_text_magic_file_with_message_spaces() {
        let roots = parse_ok("0 string 0 Long message continued here");
        assert!(roots[0].message.contains("continued"));
    }

    #[test]
    fn test_parse_text_magic_file_mixed_indentation() {
        let source = r"
0 byte 1 Root1
>4 byte 1 Child1
>4 byte 2 Child2
>>6 byte 3 Grandchild

0 byte 2 Root2
>4 byte 4 Child3
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 2);

        let (first, second) = (&roots[0], &roots[1]);
        assert_eq!(first.children.len(), 2);
        // The grandchild hangs off the second child of the first root.
        assert_eq!(first.children[1].children.len(), 1);
        assert_eq!(second.children.len(), 1);
    }

    #[test]
    fn test_parse_text_magic_file_complex_real_world() {
        let source = r"
# Magic file for common formats

# ELF binaries
0 byte 0x7f ELF executable
>4 byte 1 Intel 80386
>4 byte 2 x86-64
>>5 byte 1 LSB
>>5 byte 2 MSB

# PDF files
0 byte 0x25 PDF document
>5 byte 0x31 version 1.0
>5 byte 0x34 version 1.4
>5 byte 0x32 version 2.0
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 2);
        assert_eq!(roots[0].message, "ELF executable");
        assert!(roots[0].children.len() >= 2);
    }

    // ------------------------------------------------------------
    // `!:strength` directive handling through the full pipeline
    // ------------------------------------------------------------

    #[test]
    fn test_parse_text_magic_file_with_strength_directive() {
        let source = r"
!:strength +10
0 string \\x7fELF ELF executable
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 1);
        assert_eq!(roots[0].strength_modifier, Some(StrengthModifier::Add(10)));
    }

    #[test]
    fn test_parse_text_magic_file_strength_applies_to_next_rule() {
        let source = r"
!:strength *2
0 string \\x7fELF ELF executable
0 string \\x50\\x4b ZIP archive
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 2);

        // Only the rule directly after the directive picks up the modifier.
        assert_eq!(
            roots[0].strength_modifier,
            Some(StrengthModifier::Multiply(2))
        );
        assert_eq!(roots[1].strength_modifier, None);
    }

    #[test]
    fn test_parse_text_magic_file_strength_with_child_rules() {
        let source = r"
!:strength =50
0 string \\x7fELF ELF executable
>4 byte 1 32-bit
>4 byte 2 64-bit
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 1);

        // The modifier lands on the root rule...
        assert_eq!(roots[0].strength_modifier, Some(StrengthModifier::Set(50)));
        // ...and neither of the two children inherits it.
        for child in &roots[0].children[..2] {
            assert_eq!(child.strength_modifier, None);
        }
    }

    #[test]
    fn test_parse_text_magic_file_multiple_strength_directives() {
        let source = r"
!:strength +10
0 string \\x7fELF ELF executable
!:strength -5
0 string \\x50\\x4b ZIP archive
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 2);
        assert_eq!(roots[0].strength_modifier, Some(StrengthModifier::Add(10)));
        assert_eq!(
            roots[1].strength_modifier,
            Some(StrengthModifier::Subtract(5))
        );
    }

    #[test]
    fn test_parse_text_magic_file_strength_all_operators() {
        // One case per operator spelling, plus the bare-number form.
        let cases = [
            ("!:strength +20\n0 byte 1 Test", StrengthModifier::Add(20)),
            (
                "!:strength -15\n0 byte 1 Test",
                StrengthModifier::Subtract(15),
            ),
            (
                "!:strength *3\n0 byte 1 Test",
                StrengthModifier::Multiply(3),
            ),
            ("!:strength /2\n0 byte 1 Test", StrengthModifier::Divide(2)),
            ("!:strength =100\n0 byte 1 Test", StrengthModifier::Set(100)),
            ("!:strength 50\n0 byte 1 Test", StrengthModifier::Set(50)),
        ];

        for (input, expected) in cases {
            let roots = parse_ok(input);
            assert_eq!(
                roots[0].strength_modifier,
                Some(expected),
                "Failed for input: {input}"
            );
        }
    }

    // ------------------------------------------------------------
    // Continuations and hexadecimal offsets
    // ------------------------------------------------------------

    #[test]
    fn test_continuation_with_indentation() {
        // Raw string: the line ends in a literal backslash before the newline.
        let source = r">4 byte 1 Message \
continued";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 1);
    }

    #[test]
    fn test_multiple_hex_offsets() {
        let source = r"
0x100 string 0 At 256
0x200 string 0 At 512
";
        let roots = parse_ok(source);
        assert_eq!(roots.len(), 2);
    }

    // ------------------------------------------------------------
    // Numeric overflow guards in `parse_number`
    // (`parse_number` is in scope through `use super::*`)
    // ------------------------------------------------------------

    #[test]
    fn test_overflow_decimal_too_many_digits() {
        // A 20-digit decimal literal, larger than i64::MAX.
        let outcome = parse_number("12345678901234567890");
        assert!(outcome.is_err(), "Should reject 20+ decimal digits");
    }

    #[test]
    fn test_overflow_hex_too_many_digits() {
        // Seventeen hex digits after the `0x` prefix.
        let outcome = parse_number("0x10000000000000000");
        assert!(outcome.is_err(), "Should reject 17+ hex digits");
    }

    #[test]
    fn test_overflow_i64_max() {
        // 9223372036854775807 == i64::MAX
        let outcome = parse_number("9223372036854775807");
        assert!(outcome.is_ok(), "Should accept i64::MAX");
    }

    #[test]
    fn test_overflow_i64_max_plus_one() {
        // One past i64::MAX.
        let outcome = parse_number("9223372036854775808");
        assert!(outcome.is_err(), "Should reject i64::MAX + 1");
    }

    // ------------------------------------------------------------
    // Line numbers reported for continued rules
    // ------------------------------------------------------------

    #[test]
    fn test_error_reports_correct_line_for_continuation() {
        // The malformed rule starts on line 2 and continues onto line 3.
        let input = "0 string 0 valid\n0 invalid \\\nsyntax here\n0 string 0 valid2";

        let Err(e) = parse_text_magic_file(input) else {
            panic!("Expected InvalidSyntax error")
        };

        // The report must name the line where the bad rule begins (2), not
        // the line where its continuation ends (3).
        let error_str = format!("{e:?}");
        assert!(
            error_str.contains("line 2") || error_str.contains("line: 2"),
            "Error should reference line 2, got: {error_str}"
        );
    }
}
506
#[cfg(test)]
mod output_test {
    use crate::parser::{
        MagicRule, build_rule_hierarchy, parse_text_magic_file, preprocess_lines,
    };

    /// Demo test: prints what each parser stage produces for a small sample.
    ///
    /// It fails only if one of the stages returns an error; the printed
    /// output is meant for manual inspection.
    #[test]
    fn demo_show_all_parser_outputs() {
        let input = r"
# ELF file
0 string 0 ELF
>4 byte 1 32-bit
>4 byte 2 64-bit

0 string 0 ZIP
>0 byte 3 zipped
";

        println!("\n================ RAW INPUT ================\n");
        println!("{input}");

        // Stage 1: preprocess_lines
        println!("\n================ PREPROCESS LINES ================\n");
        let lines = preprocess_lines(input).expect("preprocess_lines failed");
        let mut idx = 0;
        for line in &lines {
            println!(
                "[{}] line_no={} is_comment={} content='{}'",
                idx, line.line_number, line.is_comment, line.content
            );
            idx += 1;
        }

        // Stage 2: parse_text_magic_file (the full pipeline on the same text)
        println!("\n================ PARSED MAGIC RULES ================\n");
        let rules = parse_text_magic_file(input).expect("parse_text_magic_file failed");
        let mut i = 0;
        for rule in &rules {
            println!("ROOT RULE [{i}]:");
            print_rule(rule, 1);
            i += 1;
        }

        // Stage 3: build_rule_hierarchy, fed the lines produced in stage 1
        println!("\n================ EXPLICIT HIERARCHY BUILD ================\n");
        let rebuilt = build_rule_hierarchy(lines).expect("build_rule_hierarchy failed");
        let mut i = 0;
        for rule in &rebuilt {
            println!("ROOT [{i}]:");
            print_rule(rule, 1);
            i += 1;
        }
    }

    /// Prints `rule` on one line, then each of its children one level deeper.
    fn print_rule(rule: &MagicRule, indent: usize) {
        let pad = " ".repeat(indent);
        println!(
            "{}- level={} offset={:?} type={:?} op={:?} value={:?} message='{}'",
            pad, rule.level, rule.offset, rule.typ, rule.op, rule.value, rule.message
        );

        let deeper = indent + 1;
        rule.children
            .iter()
            .for_each(|child| print_rule(child, deeper));
    }
}