libmagic_rs/parser/grammar/mod.rs
1// Copyright (c) 2025-2026 the libmagic-rs contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Grammar parsing for magic files using nom parser combinators
5//!
6//! This module implements the parsing logic for magic file syntax, converting
7//! text-based magic rules into the AST representation defined in ast.rs.
8
9use nom::{
10 IResult, Parser,
11 branch::alt,
12 bytes::complete::{tag, take_while},
13 character::complete::{char, digit1, hex_digit1, multispace0, none_of, one_of},
14 combinator::{map, opt, recognize},
15 error::Error as NomError,
16 multi::many0,
17 sequence::pair,
18};
19
20use crate::parser::ast::{MagicRule, OffsetSpec, Operator, StrengthModifier, TypeKind, Value};
21
22/// Parse a decimal number with overflow protection
23fn parse_decimal_number(input: &str) -> IResult<&str, i64> {
24 let (input, digits) = digit1(input)?;
25
26 // Check for potential overflow before parsing
27 if digits.len() > 19 {
28 // i64::MAX has 19 digits, so anything longer will definitely overflow
29 return Err(nom::Err::Error(nom::error::Error::new(
30 input,
31 nom::error::ErrorKind::MapRes,
32 )));
33 }
34
35 let number = digits.parse::<i64>().map_err(|_| {
36 nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::MapRes))
37 })?;
38 Ok((input, number))
39}
40
41/// Parse a decimal number as unsigned `u64` with overflow protection
42fn parse_unsigned_decimal_number(input: &str) -> IResult<&str, u64> {
43 let (input, digits) = digit1(input)?;
44
45 // u64::MAX (18446744073709551615) has 20 digits
46 if digits.len() > 20 {
47 return Err(nom::Err::Error(nom::error::Error::new(
48 input,
49 nom::error::ErrorKind::MapRes,
50 )));
51 }
52
53 let number = digits.parse::<u64>().map_err(|_| {
54 nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::MapRes))
55 })?;
56 Ok((input, number))
57}
58
59/// Parse a hexadecimal number (with 0x prefix) with overflow protection
60fn parse_hex_number(input: &str) -> IResult<&str, i64> {
61 let (input, _) = tag("0x")(input)?;
62 let (input, hex_str) = hex_digit1(input)?;
63
64 // Check for potential overflow - i64 can hold up to 16 hex digits (0x7FFFFFFFFFFFFFFF)
65 if hex_str.len() > 16 {
66 return Err(nom::Err::Error(nom::error::Error::new(
67 input,
68 nom::error::ErrorKind::MapRes,
69 )));
70 }
71
72 let number = i64::from_str_radix(hex_str, 16).map_err(|_| {
73 nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::MapRes))
74 })?;
75
76 Ok((input, number))
77}
78
79/// Parse a hexadecimal number (with 0x prefix) as unsigned `u64`
80fn parse_unsigned_hex_number(input: &str) -> IResult<&str, u64> {
81 let (input, _) = tag("0x")(input)?;
82 let (input, hex_str) = hex_digit1(input)?;
83
84 // u64 can hold up to 16 hex digits (0xFFFFFFFFFFFFFFFF)
85 if hex_str.len() > 16 {
86 return Err(nom::Err::Error(nom::error::Error::new(
87 input,
88 nom::error::ErrorKind::MapRes,
89 )));
90 }
91
92 let number = u64::from_str_radix(hex_str, 16).map_err(|_| {
93 nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::MapRes))
94 })?;
95
96 Ok((input, number))
97}
98
99/// Parse a non-negative number as unsigned `u64`
100///
101/// Supports both decimal and hexadecimal (0x prefix) formats.
102/// Does not handle a leading minus sign -- callers handle sign detection.
103fn parse_unsigned_number(input: &str) -> IResult<&str, u64> {
104 if input.starts_with("0x") {
105 parse_unsigned_hex_number(input)
106 } else {
107 parse_unsigned_decimal_number(input)
108 }
109}
110
111/// Parse a decimal or hexadecimal number
112///
113/// Supports both decimal (123, -456) and hexadecimal (0x1a2b, -0xFF) formats.
114///
115/// # Examples
116///
117/// ```
118/// use libmagic_rs::parser::grammar::parse_number;
119///
120/// assert_eq!(parse_number("123"), Ok(("", 123)));
121/// assert_eq!(parse_number("0x1a"), Ok(("", 26)));
122/// assert_eq!(parse_number("-42"), Ok(("", -42)));
123/// assert_eq!(parse_number("-0xFF"), Ok(("", -255)));
124/// ```
125///
126/// # Errors
127///
128/// Returns a nom parsing error if:
129/// - Input is empty or contains no valid digits
130/// - Hexadecimal number lacks proper "0x" prefix or contains invalid hex digits
131/// - Number cannot be parsed as a valid `i64` value
132/// - Input contains invalid characters for the detected number format
133pub fn parse_number(input: &str) -> IResult<&str, i64> {
134 let (input, sign) = opt(char('-')).parse(input)?;
135 let is_negative = sign.is_some();
136
137 // Check if input starts with "0x" - if so, it must be a valid hex number
138 let (input, number) = if input.starts_with("0x") {
139 parse_hex_number(input)?
140 } else {
141 parse_decimal_number(input)?
142 };
143
144 // Apply sign with overflow checking
145 let result = if is_negative {
146 number.checked_neg().ok_or_else(|| {
147 nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::MapRes))
148 })?
149 } else {
150 number
151 };
152
153 Ok((input, result))
154}
155
156/// Parse an offset specification for absolute offsets
157///
158/// Supports decimal and hexadecimal formats, both positive and negative.
159///
160/// # Examples
161///
162/// ```
163/// use libmagic_rs::parser::grammar::parse_offset;
164/// use libmagic_rs::parser::ast::OffsetSpec;
165///
166/// assert_eq!(parse_offset("0"), Ok(("", OffsetSpec::Absolute(0))));
167/// assert_eq!(parse_offset("123"), Ok(("", OffsetSpec::Absolute(123))));
168/// assert_eq!(parse_offset("0x10"), Ok(("", OffsetSpec::Absolute(16))));
169/// assert_eq!(parse_offset("-4"), Ok(("", OffsetSpec::Absolute(-4))));
170/// assert_eq!(parse_offset("-0xFF"), Ok(("", OffsetSpec::Absolute(-255))));
171/// ```
172///
173/// # Errors
174///
175/// Returns a nom parsing error if:
176/// - The input contains invalid number format (propagated from `parse_number`)
177/// - Input is empty or contains no parseable offset value
178/// - The offset value cannot be represented as a valid `i64`
179pub fn parse_offset(input: &str) -> IResult<&str, OffsetSpec> {
180 let (input, _) = multispace0(input)?;
181 let (input, offset_value) = parse_number(input)?;
182 let (input, _) = multispace0(input)?;
183
184 Ok((input, OffsetSpec::Absolute(offset_value)))
185}
186
187/// Parse comparison operators for magic rules
188///
189/// Supports both symbolic and text representations of operators:
190/// - `=` or `==` for equality
191/// - `!=` or `<>` for inequality
192/// - `<` for less-than
193/// - `>` for greater-than
194/// - `<=` for less-than-or-equal
195/// - `>=` for greater-than-or-equal
196/// - `&` for bitwise AND
197/// - `^` for bitwise XOR
198/// - `~` for bitwise NOT
199/// - `x` for any value (always matches)
200///
201/// # Examples
202///
203/// ```
204/// use libmagic_rs::parser::grammar::parse_operator;
205/// use libmagic_rs::parser::ast::Operator;
206///
207/// assert_eq!(parse_operator("="), Ok(("", Operator::Equal)));
208/// assert_eq!(parse_operator("=="), Ok(("", Operator::Equal)));
209/// assert_eq!(parse_operator("!="), Ok(("", Operator::NotEqual)));
210/// assert_eq!(parse_operator("<>"), Ok(("", Operator::NotEqual)));
211/// assert_eq!(parse_operator("<"), Ok(("", Operator::LessThan)));
212/// assert_eq!(parse_operator(">"), Ok(("", Operator::GreaterThan)));
213/// assert_eq!(parse_operator("<="), Ok(("", Operator::LessEqual)));
214/// assert_eq!(parse_operator(">="), Ok(("", Operator::GreaterEqual)));
215/// assert_eq!(parse_operator("&"), Ok(("", Operator::BitwiseAnd)));
216/// assert_eq!(parse_operator("^"), Ok(("", Operator::BitwiseXor)));
217/// assert_eq!(parse_operator("~"), Ok(("", Operator::BitwiseNot)));
218/// assert_eq!(parse_operator("x"), Ok(("", Operator::AnyValue)));
219/// ```
220///
221/// # Errors
222///
223/// Returns a nom parsing error if:
224/// - Input does not start with a recognized operator symbol
225/// - Input is empty or contains no valid operator
226/// - Operator syntax is incomplete (e.g., just `!` without `=`)
227pub fn parse_operator(input: &str) -> IResult<&str, Operator> {
228 let (input, _) = multispace0(input)?;
229
230 // Try to parse each operator, starting with longer ones first
231 if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("==")(input) {
232 // Check that we don't have another '=' following (to reject "===")
233 if remaining.starts_with('=') {
234 return Err(nom::Err::Error(nom::error::Error::new(
235 input,
236 nom::error::ErrorKind::Tag,
237 )));
238 }
239 let (remaining, _) = multispace0(remaining)?;
240 return Ok((remaining, Operator::Equal));
241 }
242
243 if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("!=")(input) {
244 let (remaining, _) = multispace0(remaining)?;
245 return Ok((remaining, Operator::NotEqual));
246 }
247
248 if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("<>")(input) {
249 let (remaining, _) = multispace0(remaining)?;
250 return Ok((remaining, Operator::NotEqual));
251 }
252
253 if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("<=")(input) {
254 let (remaining, _) = multispace0(remaining)?;
255 return Ok((remaining, Operator::LessEqual));
256 }
257
258 if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>(">=")(input) {
259 let (remaining, _) = multispace0(remaining)?;
260 return Ok((remaining, Operator::GreaterEqual));
261 }
262
263 if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("=")(input) {
264 // Check that we don't have another '=' following (to reject "==")
265 if remaining.starts_with('=') {
266 return Err(nom::Err::Error(nom::error::Error::new(
267 input,
268 nom::error::ErrorKind::Tag,
269 )));
270 }
271 let (remaining, _) = multispace0(remaining)?;
272 return Ok((remaining, Operator::Equal));
273 }
274
275 if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("&")(input) {
276 // Check that we don't have another '&' following (to reject "&&")
277 if remaining.starts_with('&') {
278 return Err(nom::Err::Error(nom::error::Error::new(
279 input,
280 nom::error::ErrorKind::Tag,
281 )));
282 }
283 let (remaining, _) = multispace0(remaining)?;
284 return Ok((remaining, Operator::BitwiseAnd));
285 }
286
287 if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("^")(input) {
288 if remaining.starts_with('^') {
289 return Err(nom::Err::Error(nom::error::Error::new(
290 input,
291 nom::error::ErrorKind::Tag,
292 )));
293 }
294 let (remaining, _) = multispace0(remaining)?;
295 return Ok((remaining, Operator::BitwiseXor));
296 }
297
298 if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("~")(input) {
299 if remaining.starts_with('~') {
300 return Err(nom::Err::Error(nom::error::Error::new(
301 input,
302 nom::error::ErrorKind::Tag,
303 )));
304 }
305 let (remaining, _) = multispace0(remaining)?;
306 return Ok((remaining, Operator::BitwiseNot));
307 }
308
309 if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("x")(input) {
310 // Ensure 'x' is not followed by alphanumeric (e.g., "x42" is not AnyValue)
311 if remaining.starts_with(|c: char| c.is_alphanumeric() || c == '_') {
312 return Err(nom::Err::Error(nom::error::Error::new(
313 input,
314 nom::error::ErrorKind::Tag,
315 )));
316 }
317 let (remaining, _) = multispace0(remaining)?;
318 return Ok((remaining, Operator::AnyValue));
319 }
320
321 if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("<")(input) {
322 let (remaining, _) = multispace0(remaining)?;
323 return Ok((remaining, Operator::LessThan));
324 }
325
326 if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>(">")(input) {
327 let (remaining, _) = multispace0(remaining)?;
328 return Ok((remaining, Operator::GreaterThan));
329 }
330
331 // If no operator matches, return an error
332 Err(nom::Err::Error(nom::error::Error::new(
333 input,
334 nom::error::ErrorKind::Tag,
335 )))
336}
337
338/// Parse a single hex byte with \x prefix
339fn parse_hex_byte_with_prefix(input: &str) -> IResult<&str, u8> {
340 let (input, _) = tag("\\x")(input)?;
341 let (input, hex_str) = recognize(pair(
342 one_of("0123456789abcdefABCDEF"),
343 one_of("0123456789abcdefABCDEF"),
344 ))
345 .parse(input)?;
346 let byte_val = u8::from_str_radix(hex_str, 16)
347 .map_err(|_| nom::Err::Error(NomError::new(input, nom::error::ErrorKind::MapRes)))?;
348 Ok((input, byte_val))
349}
350
351/// Parse a hex byte sequence starting with \x prefix
352fn parse_hex_bytes_with_prefix(input: &str) -> IResult<&str, Vec<u8>> {
353 if input.starts_with("\\x") {
354 many0(parse_hex_byte_with_prefix).parse(input)
355 } else {
356 Err(nom::Err::Error(NomError::new(
357 input,
358 nom::error::ErrorKind::Tag,
359 )))
360 }
361}
362
363/// Parse a mixed hex and ASCII sequence (like \x7fELF)
364fn parse_mixed_hex_ascii(input: &str) -> IResult<&str, Vec<u8>> {
365 // Must start with \ to be considered an escape sequence
366 if !input.starts_with('\\') {
367 return Err(nom::Err::Error(NomError::new(
368 input,
369 nom::error::ErrorKind::Tag,
370 )));
371 }
372
373 let mut bytes = Vec::new();
374 let mut remaining = input;
375
376 while !remaining.is_empty() {
377 // Try to parse escape sequences first (hex, octal, etc.)
378 if let Ok((new_remaining, escaped_char)) = parse_escape_sequence(remaining) {
379 bytes.push(escaped_char as u8);
380 remaining = new_remaining;
381 } else if let Ok((new_remaining, hex_byte)) = parse_hex_byte_with_prefix(remaining) {
382 bytes.push(hex_byte);
383 remaining = new_remaining;
384 } else if let Ok((new_remaining, ascii_char)) =
385 none_of::<&str, &str, NomError<&str>>(" \t\n\r")(remaining)
386 {
387 // Parse regular ASCII character (not whitespace)
388 bytes.push(ascii_char as u8);
389 remaining = new_remaining;
390 } else {
391 // Stop if we can't parse anything more
392 break;
393 }
394 }
395
396 if bytes.is_empty() {
397 Err(nom::Err::Error(NomError::new(
398 input,
399 nom::error::ErrorKind::Tag,
400 )))
401 } else {
402 Ok((remaining, bytes))
403 }
404}
405
406/// Parse a hex byte sequence without prefix (only if it looks like pure hex bytes)
407fn parse_hex_bytes_no_prefix(input: &str) -> IResult<&str, Vec<u8>> {
408 // Only parse as hex bytes if:
409 // 1. Input has even number of hex digits (pairs)
410 // 2. All characters are hex digits
411 // 3. Doesn't start with 0x (that's a number)
412 // 4. Contains at least one non-decimal digit (a-f, A-F)
413
414 if input.starts_with("0x") || input.starts_with('-') {
415 return Err(nom::Err::Error(NomError::new(
416 input,
417 nom::error::ErrorKind::Tag,
418 )));
419 }
420
421 let hex_chars: String = input.chars().take_while(char::is_ascii_hexdigit).collect();
422
423 if hex_chars.is_empty() || !hex_chars.len().is_multiple_of(2) {
424 return Err(nom::Err::Error(NomError::new(
425 input,
426 nom::error::ErrorKind::Tag,
427 )));
428 }
429
430 // Check if it contains non-decimal hex digits (a-f, A-F)
431 let has_hex_letters = hex_chars
432 .chars()
433 .any(|c| matches!(c, 'a'..='f' | 'A'..='F'));
434 if !has_hex_letters {
435 return Err(nom::Err::Error(NomError::new(
436 input,
437 nom::error::ErrorKind::Tag,
438 )));
439 }
440
441 // Parse pairs of hex digits
442 let mut bytes = Vec::with_capacity(hex_chars.len() / 2);
443 let mut chars = hex_chars.chars();
444 while let (Some(c1), Some(c2)) = (chars.next(), chars.next()) {
445 // Avoid format! allocation by parsing digits directly
446 let digit1 = c1
447 .to_digit(16)
448 .ok_or_else(|| nom::Err::Error(NomError::new(input, nom::error::ErrorKind::MapRes)))?;
449 let digit2 = c2
450 .to_digit(16)
451 .ok_or_else(|| nom::Err::Error(NomError::new(input, nom::error::ErrorKind::MapRes)))?;
452 let byte_val = u8::try_from((digit1 << 4) | digit2)
453 .map_err(|_| nom::Err::Error(NomError::new(input, nom::error::ErrorKind::MapRes)))?;
454 bytes.push(byte_val);
455 }
456
457 let remaining = &input[hex_chars.len()..];
458 Ok((remaining, bytes))
459}
460
461/// Parse a hex byte sequence (e.g., "\\x7f\\x45\\x4c\\x46", "7f454c46", or "\\x7fELF")
462fn parse_hex_bytes(input: &str) -> IResult<&str, Vec<u8>> {
463 alt((
464 parse_mixed_hex_ascii,
465 parse_hex_bytes_with_prefix,
466 parse_hex_bytes_no_prefix,
467 ))
468 .parse(input)
469}
470
471/// Parse escape sequences in strings
472fn parse_escape_sequence(input: &str) -> IResult<&str, char> {
473 let (input, _) = char('\\')(input)?;
474
475 // Try to parse octal escape sequence first (\377, \123, etc.)
476 if let Ok((remaining, octal_str)) = recognize(pair(
477 one_of::<&str, &str, NomError<&str>>("0123"),
478 pair(
479 one_of::<&str, &str, NomError<&str>>("01234567"),
480 one_of::<&str, &str, NomError<&str>>("01234567"),
481 ),
482 ))
483 .parse(input)
484 && let Ok(octal_value) = u8::from_str_radix(octal_str, 8)
485 {
486 return Ok((remaining, octal_value as char));
487 }
488
489 // Parse standard escape sequences
490 let (input, escaped_char) = one_of("nrt\\\"'0")(input)?;
491
492 let result_char = match escaped_char {
493 'n' => '\n',
494 'r' => '\r',
495 't' => '\t',
496 '\\' => '\\',
497 '"' => '"',
498 '\'' => '\'',
499 '0' => '\0',
500 _ => unreachable!("one_of constrains input to known escape characters"),
501 };
502
503 Ok((input, result_char))
504}
505
506/// Parse a quoted string with escape sequences
507fn parse_quoted_string(input: &str) -> IResult<&str, String> {
508 let (input, _) = multispace0(input)?;
509 let (input, _) = char('"')(input)?;
510
511 let mut result = String::new();
512 let mut remaining = input;
513
514 loop {
515 // Try to parse an escape sequence first
516 if let Ok((new_remaining, escaped_char)) = parse_escape_sequence(remaining) {
517 result.push(escaped_char);
518 remaining = new_remaining;
519 continue;
520 }
521
522 // If no escape sequence, try to parse a regular character (not quote or backslash)
523 if let Ok((new_remaining, regular_char)) =
524 none_of::<&str, &str, NomError<&str>>("\"\\")(remaining)
525 {
526 result.push(regular_char);
527 remaining = new_remaining;
528 continue;
529 }
530
531 // If neither worked, we should be at the closing quote
532 break;
533 }
534
535 let (remaining, _) = char('"')(remaining)?;
536 let (remaining, _) = multispace0(remaining)?;
537
538 Ok((remaining, result))
539}
540
541/// Parse a numeric value (integer)
542///
543/// Non-negative literals are parsed directly as `u64` so the full unsigned
544/// 64-bit range is representable (required for `uquad` values above `i64::MAX`).
545/// Negative literals go through the signed `i64` path.
546fn parse_numeric_value(input: &str) -> IResult<&str, Value> {
547 let (input, _) = multispace0(input)?;
548
549 let (input, value) = if input.starts_with('-') {
550 // Negative: parse as i64
551 let (input, number) = parse_number(input)?;
552 (input, Value::Int(number))
553 } else {
554 // Non-negative: parse as u64 to support full unsigned 64-bit range
555 let (input, number) = parse_unsigned_number(input)?;
556 (input, Value::Uint(number))
557 };
558
559 let (input, _) = multispace0(input)?;
560 Ok((input, value))
561}
562
563/// Parse string and numeric literals for magic rule values
564///
565/// Supports:
566/// - Quoted strings with escape sequences: "Hello\nWorld", "ELF\0"
567/// - Numeric literals (decimal): 123, -456
568/// - Numeric literals (hexadecimal): 0x1a2b, -0xFF
569/// - Hex byte sequences: \\x7f\\x45\\x4c\\x46 or 7f454c46
570///
571/// # Examples
572///
573/// ```
574/// use libmagic_rs::parser::grammar::parse_value;
575/// use libmagic_rs::parser::ast::Value;
576///
577/// // String values
578/// assert_eq!(parse_value("\"Hello\""), Ok(("", Value::String("Hello".to_string()))));
579/// assert_eq!(parse_value("\"Line1\\nLine2\""), Ok(("", Value::String("Line1\nLine2".to_string()))));
580///
581/// // Numeric values
582/// assert_eq!(parse_value("123"), Ok(("", Value::Uint(123))));
583/// assert_eq!(parse_value("-456"), Ok(("", Value::Int(-456))));
584/// assert_eq!(parse_value("0x1a"), Ok(("", Value::Uint(26))));
585/// assert_eq!(parse_value("-0xFF"), Ok(("", Value::Int(-255))));
586///
587/// // Hex byte sequences
588/// assert_eq!(parse_value("\\x7f\\x45"), Ok(("", Value::Bytes(vec![0x7f, 0x45]))));
589/// ```
590///
591/// # Errors
592///
593/// Returns a nom parsing error if:
594/// - Input is empty or contains no valid value
595/// - Quoted string is not properly terminated
596/// - Numeric value cannot be parsed as a valid integer
597/// - Hex byte sequence contains invalid hex digits
598/// - Input contains invalid characters for the detected value format
599pub fn parse_value(input: &str) -> IResult<&str, Value> {
600 let (input, _) = multispace0(input)?;
601
602 // Handle empty input case - should fail for magic rules
603 if input.is_empty() {
604 return Err(nom::Err::Error(NomError::new(
605 input,
606 nom::error::ErrorKind::Tag,
607 )));
608 }
609
610 // Try to parse different value types in order of specificity
611 let (input, value) = alt((
612 // Try quoted string first
613 map(parse_quoted_string, Value::String),
614 // Try hex byte sequence before numeric (to catch patterns like "7f", "ab", "\\x7fELF", etc.)
615 map(parse_hex_bytes, Value::Bytes),
616 // Try numeric value last (for pure numbers like 0x123, 1, etc.)
617 parse_numeric_value,
618 ))
619 .parse(input)?;
620
621 Ok((input, value))
622}
623
624/// Parse a type specification with an optional attached bitwise-AND mask operator
625/// (e.g., `lelong&0xf0000000`).
626///
627/// Returns the `TypeKind` and an optional `Operator`.
628///
629/// # Examples
630///
631/// ```
632/// use libmagic_rs::parser::grammar::parse_type_and_operator;
633/// use libmagic_rs::parser::ast::{TypeKind, Operator, Endianness};
634///
635/// // Type without operator
636/// let (_, (kind, op)) = parse_type_and_operator("lelong").unwrap();
637/// assert_eq!(kind, TypeKind::Long { endian: Endianness::Little, signed: true });
638/// assert_eq!(op, None);
639///
640/// // Type with mask operator
641/// let (_, (kind, op)) = parse_type_and_operator("lelong&0xf0000000").unwrap();
642/// assert!(matches!(op, Some(Operator::BitwiseAndMask(_))));
643/// ```
644///
645/// # Errors
646/// Returns a nom parsing error if the input doesn't match the expected format
647pub fn parse_type_and_operator(input: &str) -> IResult<&str, (TypeKind, Option<Operator>)> {
648 let (input, _) = multispace0(input)?;
649
650 let (input, type_name) = crate::parser::types::parse_type_keyword(input)?;
651
652 // Check for attached operator with mask (like &0xf0000000)
653 // Uses unsigned parsing so full u64 masks (e.g. 0xffffffffffffffff) are supported.
654 // If '&' is followed by digits/0x but the mask parse fails (overflow, etc.),
655 // we return a hard error instead of silently falling back to standalone '&'.
656 let (input, attached_op) = if let Some(after_amp) = input.strip_prefix('&') {
657 if after_amp.starts_with("0x") || after_amp.starts_with(|c: char| c.is_ascii_digit()) {
658 // '&' followed by what looks like a number -- must parse as mask
659 let (rest, mask) = parse_unsigned_number(after_amp).map_err(|_| {
660 nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::MapRes))
661 })?;
662 (rest, Some(Operator::BitwiseAndMask(mask)))
663 } else if after_amp.starts_with('&') {
664 // Reject '&&' -- not valid operator syntax
665 return Err(nom::Err::Error(nom::error::Error::new(
666 input,
667 nom::error::ErrorKind::Tag,
668 )));
669 } else {
670 // Standalone '&' (no digits following)
671 (after_amp, Some(Operator::BitwiseAnd))
672 }
673 } else {
674 (input, None)
675 };
676
677 let (input, _) = multispace0(input)?;
678
679 let type_kind = crate::parser::types::type_keyword_to_kind(type_name);
680
681 Ok((input, (type_kind, attached_op)))
682}
683
684/// Parse a type specification (byte, short, long, quad, string, etc.)
685///
686/// Supports various type formats found in magic files:
687/// - `byte` / `ubyte` - single byte (signed / unsigned)
688/// - `short` / `ushort` - 16-bit integer (native endian, signed / unsigned)
689/// - `leshort` / `uleshort` - 16-bit little-endian integer
690/// - `beshort` / `ubeshort` - 16-bit big-endian integer
691/// - `long` / `ulong` - 32-bit integer (native endian, signed / unsigned)
692/// - `lelong` / `ulelong` - 32-bit little-endian integer
693/// - `belong` / `ubelong` - 32-bit big-endian integer
694/// - `quad` / `uquad` - 64-bit integer (native endian, signed / unsigned)
695/// - `lequad` / `ulequad` - 64-bit little-endian integer
696/// - `bequad` / `ubequad` - 64-bit big-endian integer
697/// - `string` - null-terminated string
698///
699/// # Examples
700///
701/// ```
702/// use libmagic_rs::parser::grammar::parse_type;
703/// use libmagic_rs::parser::ast::{TypeKind, Endianness};
704///
705/// assert_eq!(parse_type("byte"), Ok(("", TypeKind::Byte { signed: true })));
706/// assert_eq!(parse_type("leshort"), Ok(("", TypeKind::Short { endian: Endianness::Little, signed: true })));
707/// assert_eq!(parse_type("bequad"), Ok(("", TypeKind::Quad { endian: Endianness::Big, signed: true })));
708/// assert_eq!(parse_type("string"), Ok(("", TypeKind::String { max_length: None })));
709/// ```
710///
711/// # Errors
712/// Returns a nom parsing error if the input doesn't match any known type
713pub fn parse_type(input: &str) -> IResult<&str, TypeKind> {
714 let (input, (type_kind, _)) = parse_type_and_operator(input)?;
715 Ok((input, type_kind))
716}
717
718/// Parse the indentation level and offset for magic rules
719///
720/// Handles both absolute offsets and hierarchical child rules with `>` prefix.
721/// Child rules can be nested multiple levels deep with multiple `>` characters.
722///
723/// # Examples
724///
725/// ```
726/// use libmagic_rs::parser::grammar::parse_rule_offset;
727/// use libmagic_rs::parser::ast::OffsetSpec;
728///
729/// // Absolute offset
730/// assert_eq!(parse_rule_offset("0"), Ok(("", (0, OffsetSpec::Absolute(0)))));
731/// assert_eq!(parse_rule_offset("16"), Ok(("", (0, OffsetSpec::Absolute(16)))));
732///
733/// // Child rule (level 1)
734/// assert_eq!(parse_rule_offset(">4"), Ok(("", (1, OffsetSpec::Absolute(4)))));
735///
736/// // Nested child rule (level 2)
737/// assert_eq!(parse_rule_offset(">>8"), Ok(("", (2, OffsetSpec::Absolute(8)))));
738/// ```
739/// Parse rule offset with hierarchy level (> prefixes) and offset specification
740///
741/// # Errors
742/// Returns a nom parsing error if the input doesn't match the expected offset format
743pub fn parse_rule_offset(input: &str) -> IResult<&str, (u32, OffsetSpec)> {
744 let (input, _) = multispace0(input)?;
745
746 // Count the number of '>' characters for nesting level
747 let (input, level_chars) = many0(char('>')).parse(input)?;
748 let level = u32::try_from(level_chars.len()).unwrap_or(0);
749
750 // Parse the offset after the '>' characters
751 let (input, offset_spec) = parse_offset(input)?;
752
753 Ok((input, (level, offset_spec)))
754}
755
756/// Parse the message part of a magic rule
757///
758/// The message is everything after the value until the end of the line.
759/// It may contain format specifiers and can be empty.
760///
761/// # Examples
762///
763/// ```
764/// use libmagic_rs::parser::grammar::parse_message;
765///
766/// assert_eq!(parse_message("ELF executable"), Ok(("", "ELF executable".to_string())));
767/// assert_eq!(parse_message(""), Ok(("", "".to_string())));
768/// assert_eq!(parse_message(" \tPDF document "), Ok(("", "PDF document".to_string())));
769/// ```
770/// Parse the message/description part of a magic rule
771///
772/// # Errors
773/// Returns a nom parsing error if the input cannot be parsed as a message
774pub fn parse_message(input: &str) -> IResult<&str, String> {
775 let (input, _) = multispace0(input)?;
776
777 // Take everything until end of line, trimming whitespace
778 // Use take_while instead of take_while1 to handle empty messages
779 let (input, message_text) = take_while(|c: char| c != '\n' && c != '\r').parse(input)?;
780 let message = message_text.trim().to_string();
781
782 Ok((input, message))
783}
784
785/// Parse a strength directive (`!:strength` line)
786///
787/// Parses the `!:strength` directive that modifies rule strength.
788/// Format: `!:strength [+|-|*|/|=]N` or `!:strength N`
789///
790/// # Examples
791///
792/// ```
793/// use libmagic_rs::parser::grammar::parse_strength_directive;
794/// use libmagic_rs::parser::ast::StrengthModifier;
795///
796/// assert_eq!(parse_strength_directive("!:strength +10"), Ok(("", StrengthModifier::Add(10))));
797/// assert_eq!(parse_strength_directive("!:strength -5"), Ok(("", StrengthModifier::Subtract(5))));
798/// assert_eq!(parse_strength_directive("!:strength *2"), Ok(("", StrengthModifier::Multiply(2))));
799/// assert_eq!(parse_strength_directive("!:strength /2"), Ok(("", StrengthModifier::Divide(2))));
800/// assert_eq!(parse_strength_directive("!:strength =50"), Ok(("", StrengthModifier::Set(50))));
801/// assert_eq!(parse_strength_directive("!:strength 50"), Ok(("", StrengthModifier::Set(50))));
802/// ```
803///
804/// # Errors
805///
806/// Returns a nom parsing error if:
807/// - Input doesn't start with `!:strength`
808/// - The modifier value cannot be parsed as a valid integer
809/// - The operator is invalid
810pub fn parse_strength_directive(input: &str) -> IResult<&str, StrengthModifier> {
811 // Helper to safely convert i64 to i32 with clamping to valid strength range.
812 // This prevents silent truncation to 0 on overflow while keeping values in bounds.
813 fn clamp_to_i32(n: i64) -> i32 {
814 // Use i64::from for lossless conversion, then clamp and convert back
815 let clamped = n.clamp(i64::from(i32::MIN), i64::from(i32::MAX));
816 // Safe to unwrap: clamped value is guaranteed to be in i32 range
817 i32::try_from(clamped).unwrap()
818 }
819
820 let (input, _) = multispace0(input)?;
821 let (input, _) = tag("!:strength")(input)?;
822 let (input, _) = multispace0(input)?;
823
824 // Parse the operator: +, -, *, /, = or bare number (implies =)
825 let (input, modifier) = alt((
826 // +N -> Add
827 map(pair(char('+'), parse_number), |(_, n)| {
828 StrengthModifier::Add(clamp_to_i32(n))
829 }),
830 // -N -> Subtract (note: parse_number handles negative, so we need special handling)
831 map(pair(char('-'), parse_decimal_number), |(_, n)| {
832 StrengthModifier::Subtract(clamp_to_i32(n))
833 }),
834 // *N -> Multiply
835 map(pair(char('*'), parse_number), |(_, n)| {
836 StrengthModifier::Multiply(clamp_to_i32(n))
837 }),
838 // /N -> Divide
839 map(pair(char('/'), parse_number), |(_, n)| {
840 StrengthModifier::Divide(clamp_to_i32(n))
841 }),
842 // =N -> Set
843 map(pair(char('='), parse_number), |(_, n)| {
844 StrengthModifier::Set(clamp_to_i32(n))
845 }),
846 // Bare number -> Set
847 map(parse_number, |n| StrengthModifier::Set(clamp_to_i32(n))),
848 ))
849 .parse(input)?;
850
851 Ok((input, modifier))
852}
853
/// Report whether a line is a strength directive.
///
/// A line qualifies when, ignoring leading whitespace, it begins with
/// `!:strength`.
///
/// # Examples
///
/// ```
/// use libmagic_rs::parser::grammar::is_strength_directive;
///
/// assert!(is_strength_directive("!:strength +10"));
/// assert!(is_strength_directive("  !:strength -5"));
/// assert!(!is_strength_directive("0 byte 1"));
/// ```
#[must_use]
pub fn is_strength_directive(input: &str) -> bool {
    const DIRECTIVE: &str = "!:strength";

    // Only leading whitespace can affect a prefix test.
    input.trim_start().starts_with(DIRECTIVE)
}
869
870/// Parse a complete magic rule line from text format
871///
872/// Parses a complete magic rule in the format:
873/// `[>...]offset type [operator] value [message]`
874///
875/// Where:
876/// - `>...` indicates child rule nesting level (optional)
877/// - `offset` is the byte offset to read from
878/// - `type` is the data type (byte, short, long, string, etc.)
879/// - `operator` is the comparison operator (=, !=, &) - defaults to = if omitted
880/// - `value` is the expected value to compare against
881/// - `message` is the human-readable description (optional)
882///
883/// # Examples
884///
885/// ```
886/// use libmagic_rs::parser::grammar::parse_magic_rule;
887/// use libmagic_rs::parser::ast::{MagicRule, OffsetSpec, TypeKind, Operator, Value};
888///
889/// // Basic rule
890/// let input = "0 string \\x7fELF ELF executable";
891/// let (_, rule) = parse_magic_rule(input).unwrap();
892/// assert_eq!(rule.level, 0);
893/// assert_eq!(rule.message, "ELF executable");
894///
895/// // Child rule
896/// let input = ">4 byte 1 32-bit";
897/// let (_, rule) = parse_magic_rule(input).unwrap();
898/// assert_eq!(rule.level, 1);
899/// assert_eq!(rule.message, "32-bit");
900/// ```
901///
902/// # Errors
903///
904/// Returns a nom parsing error if:
905/// - The offset specification is invalid
906/// - The type specification is not recognized
907/// - The operator is invalid (if present)
908/// - The value cannot be parsed
909/// - The input format doesn't match the expected magic rule syntax
910pub fn parse_magic_rule(input: &str) -> IResult<&str, MagicRule> {
911 let (input, _) = multispace0(input)?;
912
913 // Parse the offset with nesting level
914 let (input, (level, offset)) = parse_rule_offset(input)?;
915
916 // Parse the type and any attached operator
917 let (input, (typ, attached_op)) = parse_type_and_operator(input)?;
918
919 // Try to parse a separate operator (optional - use attached operator if present)
920 let (input, separate_op) = opt(parse_operator).parse(input)?;
921 let op = attached_op.or(separate_op).unwrap_or(Operator::Equal);
922
923 // For AnyValue (`x`), no operand is needed -- treat remaining text as message
924 let (input, value) = if op == Operator::AnyValue {
925 (input, Value::Uint(0))
926 } else {
927 parse_value(input)?
928 };
929
930 // Parse the message (optional - everything remaining on the line)
931 let (input, message) = if input.trim().is_empty() {
932 (input, String::new())
933 } else {
934 parse_message(input)?
935 };
936
937 let rule = MagicRule {
938 offset,
939 typ,
940 op,
941 value,
942 message,
943 children: vec![], // Children will be added during hierarchical parsing
944 level,
945 strength_modifier: None, // Will be set during directive parsing
946 };
947
948 Ok((input, rule))
949}
950
951/// Parse a comment line (starts with #)
952///
953/// Comments in magic files start with '#' and continue to the end of the line.
954/// This function consumes the entire comment line.
955///
956/// # Examples
957///
958/// ```
959/// use libmagic_rs::parser::grammar::parse_comment;
960///
961/// assert_eq!(parse_comment("# This is a comment"), Ok(("", "This is a comment".to_string())));
962/// assert_eq!(parse_comment("#"), Ok(("", "".to_string())));
963/// ```
964/// Parse a comment line (starting with #)
965///
966/// # Errors
967/// Returns a nom parsing error if the input is not a valid comment
968pub fn parse_comment(input: &str) -> IResult<&str, String> {
969 let (input, _) = multispace0(input)?;
970 let (input, _) = char('#').parse(input)?;
971 let (input, comment_text) = take_while(|c: char| c != '\n' && c != '\r').parse(input)?;
972 let comment = comment_text.trim().to_string();
973 Ok((input, comment))
974}
975
/// Report whether a line is blank
///
/// A line is blank when it is empty or made up solely of whitespace characters.
///
/// # Examples
///
/// ```
/// use libmagic_rs::parser::grammar::is_empty_line;
///
/// assert!(is_empty_line(""));
/// assert!(is_empty_line(" "));
/// assert!(is_empty_line("\t\t"));
/// assert!(!is_empty_line("0 byte 1"));
/// ```
#[must_use]
pub fn is_empty_line(input: &str) -> bool {
    // No whitespace-separated token at all means there is nothing on the line.
    input.split_whitespace().next().is_none()
}
992
/// Report whether a line is a comment
///
/// Leading whitespace is ignored; the line is a comment when its first
/// non-whitespace character is `#`.
///
/// # Examples
///
/// ```
/// use libmagic_rs::parser::grammar::is_comment_line;
///
/// assert!(is_comment_line("# This is a comment"));
/// assert!(is_comment_line("#"));
/// assert!(is_comment_line(" # Indented comment"));
/// assert!(!is_comment_line("0 byte 1"));
/// ```
#[must_use]
pub fn is_comment_line(input: &str) -> bool {
    // Only the first visible character decides; the rest of the line is irrelevant.
    input.trim_start().chars().next() == Some('#')
}
1009
/// Report whether a line ends with the continuation character (`\`)
///
/// A trailing backslash marks a line that continues on the next one. Trailing
/// whitespace is ignored, so the check looks at the last non-whitespace
/// character only.
///
/// # Examples
///
/// ```
/// use libmagic_rs::parser::grammar::has_continuation;
///
/// assert!(has_continuation("0 string test \\"));
/// assert!(has_continuation("message continues \\"));
/// assert!(!has_continuation("0 string test"));
/// ```
#[must_use]
pub fn has_continuation(input: &str) -> bool {
    // Look at the final visible character; blank input has none.
    input.trim_end().chars().next_back() == Some('\\')
}
1027#[cfg(test)]
1028mod tests;