Skip to main content

libmagic_rs/parser/grammar/
mod.rs

1// Copyright (c) 2025-2026 the libmagic-rs contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Grammar parsing for magic files using nom parser combinators
5//!
6//! This module implements the parsing logic for magic file syntax, converting
7//! text-based magic rules into the AST representation defined in ast.rs.
8
9use nom::{
10    IResult, Parser,
11    branch::alt,
12    bytes::complete::{tag, take_while},
13    character::complete::{char, digit1, hex_digit1, multispace0, none_of, one_of},
14    combinator::{map, opt, recognize},
15    error::Error as NomError,
16    multi::many0,
17    sequence::pair,
18};
19
20use crate::parser::ast::{MagicRule, OffsetSpec, Operator, StrengthModifier, TypeKind, Value};
21
22/// Parse a decimal number with overflow protection
23fn parse_decimal_number(input: &str) -> IResult<&str, i64> {
24    let (input, digits) = digit1(input)?;
25
26    // Check for potential overflow before parsing
27    if digits.len() > 19 {
28        // i64::MAX has 19 digits, so anything longer will definitely overflow
29        return Err(nom::Err::Error(nom::error::Error::new(
30            input,
31            nom::error::ErrorKind::MapRes,
32        )));
33    }
34
35    let number = digits.parse::<i64>().map_err(|_| {
36        nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::MapRes))
37    })?;
38    Ok((input, number))
39}
40
41/// Parse a decimal number as unsigned `u64` with overflow protection
42fn parse_unsigned_decimal_number(input: &str) -> IResult<&str, u64> {
43    let (input, digits) = digit1(input)?;
44
45    // u64::MAX (18446744073709551615) has 20 digits
46    if digits.len() > 20 {
47        return Err(nom::Err::Error(nom::error::Error::new(
48            input,
49            nom::error::ErrorKind::MapRes,
50        )));
51    }
52
53    let number = digits.parse::<u64>().map_err(|_| {
54        nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::MapRes))
55    })?;
56    Ok((input, number))
57}
58
59/// Parse a hexadecimal number (with 0x prefix) with overflow protection
60fn parse_hex_number(input: &str) -> IResult<&str, i64> {
61    let (input, _) = tag("0x")(input)?;
62    let (input, hex_str) = hex_digit1(input)?;
63
64    // Check for potential overflow - i64 can hold up to 16 hex digits (0x7FFFFFFFFFFFFFFF)
65    if hex_str.len() > 16 {
66        return Err(nom::Err::Error(nom::error::Error::new(
67            input,
68            nom::error::ErrorKind::MapRes,
69        )));
70    }
71
72    let number = i64::from_str_radix(hex_str, 16).map_err(|_| {
73        nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::MapRes))
74    })?;
75
76    Ok((input, number))
77}
78
79/// Parse a hexadecimal number (with 0x prefix) as unsigned `u64`
80fn parse_unsigned_hex_number(input: &str) -> IResult<&str, u64> {
81    let (input, _) = tag("0x")(input)?;
82    let (input, hex_str) = hex_digit1(input)?;
83
84    // u64 can hold up to 16 hex digits (0xFFFFFFFFFFFFFFFF)
85    if hex_str.len() > 16 {
86        return Err(nom::Err::Error(nom::error::Error::new(
87            input,
88            nom::error::ErrorKind::MapRes,
89        )));
90    }
91
92    let number = u64::from_str_radix(hex_str, 16).map_err(|_| {
93        nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::MapRes))
94    })?;
95
96    Ok((input, number))
97}
98
99/// Parse a non-negative number as unsigned `u64`
100///
101/// Supports both decimal and hexadecimal (0x prefix) formats.
102/// Does not handle a leading minus sign -- callers handle sign detection.
103fn parse_unsigned_number(input: &str) -> IResult<&str, u64> {
104    if input.starts_with("0x") {
105        parse_unsigned_hex_number(input)
106    } else {
107        parse_unsigned_decimal_number(input)
108    }
109}
110
111/// Parse a decimal or hexadecimal number
112///
113/// Supports both decimal (123, -456) and hexadecimal (0x1a2b, -0xFF) formats.
114///
115/// # Examples
116///
117/// ```
118/// use libmagic_rs::parser::grammar::parse_number;
119///
120/// assert_eq!(parse_number("123"), Ok(("", 123)));
121/// assert_eq!(parse_number("0x1a"), Ok(("", 26)));
122/// assert_eq!(parse_number("-42"), Ok(("", -42)));
123/// assert_eq!(parse_number("-0xFF"), Ok(("", -255)));
124/// ```
125///
126/// # Errors
127///
128/// Returns a nom parsing error if:
129/// - Input is empty or contains no valid digits
130/// - Hexadecimal number lacks proper "0x" prefix or contains invalid hex digits
131/// - Number cannot be parsed as a valid `i64` value
132/// - Input contains invalid characters for the detected number format
133pub fn parse_number(input: &str) -> IResult<&str, i64> {
134    let (input, sign) = opt(char('-')).parse(input)?;
135    let is_negative = sign.is_some();
136
137    // Check if input starts with "0x" - if so, it must be a valid hex number
138    let (input, number) = if input.starts_with("0x") {
139        parse_hex_number(input)?
140    } else {
141        parse_decimal_number(input)?
142    };
143
144    // Apply sign with overflow checking
145    let result = if is_negative {
146        number.checked_neg().ok_or_else(|| {
147            nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::MapRes))
148        })?
149    } else {
150        number
151    };
152
153    Ok((input, result))
154}
155
156/// Parse an offset specification for absolute offsets
157///
158/// Supports decimal and hexadecimal formats, both positive and negative.
159///
160/// # Examples
161///
162/// ```
163/// use libmagic_rs::parser::grammar::parse_offset;
164/// use libmagic_rs::parser::ast::OffsetSpec;
165///
166/// assert_eq!(parse_offset("0"), Ok(("", OffsetSpec::Absolute(0))));
167/// assert_eq!(parse_offset("123"), Ok(("", OffsetSpec::Absolute(123))));
168/// assert_eq!(parse_offset("0x10"), Ok(("", OffsetSpec::Absolute(16))));
169/// assert_eq!(parse_offset("-4"), Ok(("", OffsetSpec::Absolute(-4))));
170/// assert_eq!(parse_offset("-0xFF"), Ok(("", OffsetSpec::Absolute(-255))));
171/// ```
172///
173/// # Errors
174///
175/// Returns a nom parsing error if:
176/// - The input contains invalid number format (propagated from `parse_number`)
177/// - Input is empty or contains no parseable offset value
178/// - The offset value cannot be represented as a valid `i64`
179pub fn parse_offset(input: &str) -> IResult<&str, OffsetSpec> {
180    let (input, _) = multispace0(input)?;
181    let (input, offset_value) = parse_number(input)?;
182    let (input, _) = multispace0(input)?;
183
184    Ok((input, OffsetSpec::Absolute(offset_value)))
185}
186
187/// Parse comparison operators for magic rules
188///
189/// Supports both symbolic and text representations of operators:
190/// - `=` or `==` for equality
191/// - `!=` or `<>` for inequality
192/// - `<` for less-than
193/// - `>` for greater-than
194/// - `<=` for less-than-or-equal
195/// - `>=` for greater-than-or-equal
196/// - `&` for bitwise AND
197/// - `^` for bitwise XOR
198/// - `~` for bitwise NOT
199/// - `x` for any value (always matches)
200///
201/// # Examples
202///
203/// ```
204/// use libmagic_rs::parser::grammar::parse_operator;
205/// use libmagic_rs::parser::ast::Operator;
206///
207/// assert_eq!(parse_operator("="), Ok(("", Operator::Equal)));
208/// assert_eq!(parse_operator("=="), Ok(("", Operator::Equal)));
209/// assert_eq!(parse_operator("!="), Ok(("", Operator::NotEqual)));
210/// assert_eq!(parse_operator("<>"), Ok(("", Operator::NotEqual)));
211/// assert_eq!(parse_operator("<"), Ok(("", Operator::LessThan)));
212/// assert_eq!(parse_operator(">"), Ok(("", Operator::GreaterThan)));
213/// assert_eq!(parse_operator("<="), Ok(("", Operator::LessEqual)));
214/// assert_eq!(parse_operator(">="), Ok(("", Operator::GreaterEqual)));
215/// assert_eq!(parse_operator("&"), Ok(("", Operator::BitwiseAnd)));
216/// assert_eq!(parse_operator("^"), Ok(("", Operator::BitwiseXor)));
217/// assert_eq!(parse_operator("~"), Ok(("", Operator::BitwiseNot)));
218/// assert_eq!(parse_operator("x"), Ok(("", Operator::AnyValue)));
219/// ```
220///
221/// # Errors
222///
223/// Returns a nom parsing error if:
224/// - Input does not start with a recognized operator symbol
225/// - Input is empty or contains no valid operator
226/// - Operator syntax is incomplete (e.g., just `!` without `=`)
227pub fn parse_operator(input: &str) -> IResult<&str, Operator> {
228    let (input, _) = multispace0(input)?;
229
230    // Try to parse each operator, starting with longer ones first
231    if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("==")(input) {
232        // Check that we don't have another '=' following (to reject "===")
233        if remaining.starts_with('=') {
234            return Err(nom::Err::Error(nom::error::Error::new(
235                input,
236                nom::error::ErrorKind::Tag,
237            )));
238        }
239        let (remaining, _) = multispace0(remaining)?;
240        return Ok((remaining, Operator::Equal));
241    }
242
243    if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("!=")(input) {
244        let (remaining, _) = multispace0(remaining)?;
245        return Ok((remaining, Operator::NotEqual));
246    }
247
248    if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("<>")(input) {
249        let (remaining, _) = multispace0(remaining)?;
250        return Ok((remaining, Operator::NotEqual));
251    }
252
253    if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("<=")(input) {
254        let (remaining, _) = multispace0(remaining)?;
255        return Ok((remaining, Operator::LessEqual));
256    }
257
258    if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>(">=")(input) {
259        let (remaining, _) = multispace0(remaining)?;
260        return Ok((remaining, Operator::GreaterEqual));
261    }
262
263    if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("=")(input) {
264        // Check that we don't have another '=' following (to reject "==")
265        if remaining.starts_with('=') {
266            return Err(nom::Err::Error(nom::error::Error::new(
267                input,
268                nom::error::ErrorKind::Tag,
269            )));
270        }
271        let (remaining, _) = multispace0(remaining)?;
272        return Ok((remaining, Operator::Equal));
273    }
274
275    if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("&")(input) {
276        // Check that we don't have another '&' following (to reject "&&")
277        if remaining.starts_with('&') {
278            return Err(nom::Err::Error(nom::error::Error::new(
279                input,
280                nom::error::ErrorKind::Tag,
281            )));
282        }
283        let (remaining, _) = multispace0(remaining)?;
284        return Ok((remaining, Operator::BitwiseAnd));
285    }
286
287    if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("^")(input) {
288        if remaining.starts_with('^') {
289            return Err(nom::Err::Error(nom::error::Error::new(
290                input,
291                nom::error::ErrorKind::Tag,
292            )));
293        }
294        let (remaining, _) = multispace0(remaining)?;
295        return Ok((remaining, Operator::BitwiseXor));
296    }
297
298    if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("~")(input) {
299        if remaining.starts_with('~') {
300            return Err(nom::Err::Error(nom::error::Error::new(
301                input,
302                nom::error::ErrorKind::Tag,
303            )));
304        }
305        let (remaining, _) = multispace0(remaining)?;
306        return Ok((remaining, Operator::BitwiseNot));
307    }
308
309    if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("x")(input) {
310        // Ensure 'x' is not followed by alphanumeric (e.g., "x42" is not AnyValue)
311        if remaining.starts_with(|c: char| c.is_alphanumeric() || c == '_') {
312            return Err(nom::Err::Error(nom::error::Error::new(
313                input,
314                nom::error::ErrorKind::Tag,
315            )));
316        }
317        let (remaining, _) = multispace0(remaining)?;
318        return Ok((remaining, Operator::AnyValue));
319    }
320
321    if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>("<")(input) {
322        let (remaining, _) = multispace0(remaining)?;
323        return Ok((remaining, Operator::LessThan));
324    }
325
326    if let Ok((remaining, _)) = tag::<&str, &str, nom::error::Error<&str>>(">")(input) {
327        let (remaining, _) = multispace0(remaining)?;
328        return Ok((remaining, Operator::GreaterThan));
329    }
330
331    // If no operator matches, return an error
332    Err(nom::Err::Error(nom::error::Error::new(
333        input,
334        nom::error::ErrorKind::Tag,
335    )))
336}
337
338/// Parse a single hex byte with \x prefix
339fn parse_hex_byte_with_prefix(input: &str) -> IResult<&str, u8> {
340    let (input, _) = tag("\\x")(input)?;
341    let (input, hex_str) = recognize(pair(
342        one_of("0123456789abcdefABCDEF"),
343        one_of("0123456789abcdefABCDEF"),
344    ))
345    .parse(input)?;
346    let byte_val = u8::from_str_radix(hex_str, 16)
347        .map_err(|_| nom::Err::Error(NomError::new(input, nom::error::ErrorKind::MapRes)))?;
348    Ok((input, byte_val))
349}
350
351/// Parse a hex byte sequence starting with \x prefix
352fn parse_hex_bytes_with_prefix(input: &str) -> IResult<&str, Vec<u8>> {
353    if input.starts_with("\\x") {
354        many0(parse_hex_byte_with_prefix).parse(input)
355    } else {
356        Err(nom::Err::Error(NomError::new(
357            input,
358            nom::error::ErrorKind::Tag,
359        )))
360    }
361}
362
363/// Parse a mixed hex and ASCII sequence (like \x7fELF)
364fn parse_mixed_hex_ascii(input: &str) -> IResult<&str, Vec<u8>> {
365    // Must start with \ to be considered an escape sequence
366    if !input.starts_with('\\') {
367        return Err(nom::Err::Error(NomError::new(
368            input,
369            nom::error::ErrorKind::Tag,
370        )));
371    }
372
373    let mut bytes = Vec::new();
374    let mut remaining = input;
375
376    while !remaining.is_empty() {
377        // Try to parse escape sequences first (hex, octal, etc.)
378        if let Ok((new_remaining, escaped_char)) = parse_escape_sequence(remaining) {
379            bytes.push(escaped_char as u8);
380            remaining = new_remaining;
381        } else if let Ok((new_remaining, hex_byte)) = parse_hex_byte_with_prefix(remaining) {
382            bytes.push(hex_byte);
383            remaining = new_remaining;
384        } else if let Ok((new_remaining, ascii_char)) =
385            none_of::<&str, &str, NomError<&str>>(" \t\n\r")(remaining)
386        {
387            // Parse regular ASCII character (not whitespace)
388            bytes.push(ascii_char as u8);
389            remaining = new_remaining;
390        } else {
391            // Stop if we can't parse anything more
392            break;
393        }
394    }
395
396    if bytes.is_empty() {
397        Err(nom::Err::Error(NomError::new(
398            input,
399            nom::error::ErrorKind::Tag,
400        )))
401    } else {
402        Ok((remaining, bytes))
403    }
404}
405
406/// Parse a hex byte sequence without prefix (only if it looks like pure hex bytes)
407fn parse_hex_bytes_no_prefix(input: &str) -> IResult<&str, Vec<u8>> {
408    // Only parse as hex bytes if:
409    // 1. Input has even number of hex digits (pairs)
410    // 2. All characters are hex digits
411    // 3. Doesn't start with 0x (that's a number)
412    // 4. Contains at least one non-decimal digit (a-f, A-F)
413
414    if input.starts_with("0x") || input.starts_with('-') {
415        return Err(nom::Err::Error(NomError::new(
416            input,
417            nom::error::ErrorKind::Tag,
418        )));
419    }
420
421    let hex_chars: String = input.chars().take_while(char::is_ascii_hexdigit).collect();
422
423    if hex_chars.is_empty() || !hex_chars.len().is_multiple_of(2) {
424        return Err(nom::Err::Error(NomError::new(
425            input,
426            nom::error::ErrorKind::Tag,
427        )));
428    }
429
430    // Check if it contains non-decimal hex digits (a-f, A-F)
431    let has_hex_letters = hex_chars
432        .chars()
433        .any(|c| matches!(c, 'a'..='f' | 'A'..='F'));
434    if !has_hex_letters {
435        return Err(nom::Err::Error(NomError::new(
436            input,
437            nom::error::ErrorKind::Tag,
438        )));
439    }
440
441    // Parse pairs of hex digits
442    let mut bytes = Vec::with_capacity(hex_chars.len() / 2);
443    let mut chars = hex_chars.chars();
444    while let (Some(c1), Some(c2)) = (chars.next(), chars.next()) {
445        // Avoid format! allocation by parsing digits directly
446        let digit1 = c1
447            .to_digit(16)
448            .ok_or_else(|| nom::Err::Error(NomError::new(input, nom::error::ErrorKind::MapRes)))?;
449        let digit2 = c2
450            .to_digit(16)
451            .ok_or_else(|| nom::Err::Error(NomError::new(input, nom::error::ErrorKind::MapRes)))?;
452        let byte_val = u8::try_from((digit1 << 4) | digit2)
453            .map_err(|_| nom::Err::Error(NomError::new(input, nom::error::ErrorKind::MapRes)))?;
454        bytes.push(byte_val);
455    }
456
457    let remaining = &input[hex_chars.len()..];
458    Ok((remaining, bytes))
459}
460
461/// Parse a hex byte sequence (e.g., "\\x7f\\x45\\x4c\\x46", "7f454c46", or "\\x7fELF")
462fn parse_hex_bytes(input: &str) -> IResult<&str, Vec<u8>> {
463    alt((
464        parse_mixed_hex_ascii,
465        parse_hex_bytes_with_prefix,
466        parse_hex_bytes_no_prefix,
467    ))
468    .parse(input)
469}
470
471/// Parse escape sequences in strings
472fn parse_escape_sequence(input: &str) -> IResult<&str, char> {
473    let (input, _) = char('\\')(input)?;
474
475    // Try to parse octal escape sequence first (\377, \123, etc.)
476    if let Ok((remaining, octal_str)) = recognize(pair(
477        one_of::<&str, &str, NomError<&str>>("0123"),
478        pair(
479            one_of::<&str, &str, NomError<&str>>("01234567"),
480            one_of::<&str, &str, NomError<&str>>("01234567"),
481        ),
482    ))
483    .parse(input)
484        && let Ok(octal_value) = u8::from_str_radix(octal_str, 8)
485    {
486        return Ok((remaining, octal_value as char));
487    }
488
489    // Parse standard escape sequences
490    let (input, escaped_char) = one_of("nrt\\\"'0")(input)?;
491
492    let result_char = match escaped_char {
493        'n' => '\n',
494        'r' => '\r',
495        't' => '\t',
496        '\\' => '\\',
497        '"' => '"',
498        '\'' => '\'',
499        '0' => '\0',
500        _ => unreachable!("one_of constrains input to known escape characters"),
501    };
502
503    Ok((input, result_char))
504}
505
506/// Parse a quoted string with escape sequences
507fn parse_quoted_string(input: &str) -> IResult<&str, String> {
508    let (input, _) = multispace0(input)?;
509    let (input, _) = char('"')(input)?;
510
511    let mut result = String::new();
512    let mut remaining = input;
513
514    loop {
515        // Try to parse an escape sequence first
516        if let Ok((new_remaining, escaped_char)) = parse_escape_sequence(remaining) {
517            result.push(escaped_char);
518            remaining = new_remaining;
519            continue;
520        }
521
522        // If no escape sequence, try to parse a regular character (not quote or backslash)
523        if let Ok((new_remaining, regular_char)) =
524            none_of::<&str, &str, NomError<&str>>("\"\\")(remaining)
525        {
526            result.push(regular_char);
527            remaining = new_remaining;
528            continue;
529        }
530
531        // If neither worked, we should be at the closing quote
532        break;
533    }
534
535    let (remaining, _) = char('"')(remaining)?;
536    let (remaining, _) = multispace0(remaining)?;
537
538    Ok((remaining, result))
539}
540
541/// Parse a floating-point literal into `Value::Float(f64)`
542///
543/// Recognizes numbers with a mandatory decimal point (to distinguish from
544/// integers), an optional leading minus sign, and an optional exponent part.
545/// Examples: `3.14`, `-1.0`, `2.5e10`, `-0.5E-3`
546fn parse_float_value(input: &str) -> IResult<&str, f64> {
547    let (input, _) = multispace0(input)?;
548
549    let (remaining, float_str) = recognize((
550        opt(char('-')),
551        digit1,
552        char('.'),
553        digit1,
554        opt((one_of("eE"), opt(one_of("+-")), digit1)),
555    ))
556    .parse(input)?;
557
558    let value: f64 = float_str
559        .parse()
560        .map_err(|_| nom::Err::Error(NomError::new(input, nom::error::ErrorKind::MapRes)))?;
561
562    // Reject non-finite floats (NaN, +inf, -inf) to keep AST, JSON, and codegen valid
563    if !value.is_finite() {
564        return Err(nom::Err::Error(NomError::new(
565            input,
566            nom::error::ErrorKind::Float,
567        )));
568    }
569
570    let (remaining, _) = multispace0(remaining)?;
571    Ok((remaining, value))
572}
573
574/// Parse a numeric value (integer)
575///
576/// Non-negative literals are parsed directly as `u64` so the full unsigned
577/// 64-bit range is representable (required for `uquad` values above `i64::MAX`).
578/// Negative literals go through the signed `i64` path.
579fn parse_numeric_value(input: &str) -> IResult<&str, Value> {
580    let (input, _) = multispace0(input)?;
581
582    let (input, value) = if input.starts_with('-') {
583        // Negative: parse as i64
584        let (input, number) = parse_number(input)?;
585        (input, Value::Int(number))
586    } else {
587        // Non-negative: parse as u64 to support full unsigned 64-bit range
588        let (input, number) = parse_unsigned_number(input)?;
589        (input, Value::Uint(number))
590    };
591
592    let (input, _) = multispace0(input)?;
593    Ok((input, value))
594}
595
596/// Parse string, float, and numeric literals for magic rule values
597///
598/// Supports:
599/// - Quoted strings with escape sequences: "Hello\nWorld", "ELF\0"
600/// - Floating-point literals: 3.14, -1.0, 2.5e10
601/// - Numeric literals (decimal): 123, -456
602/// - Numeric literals (hexadecimal): 0x1a2b, -0xFF
603/// - Hex byte sequences: \\x7f\\x45\\x4c\\x46 or 7f454c46
604///
605/// # Examples
606///
607/// ```
608/// use libmagic_rs::parser::grammar::parse_value;
609/// use libmagic_rs::parser::ast::Value;
610///
611/// // String values
612/// assert_eq!(parse_value("\"Hello\""), Ok(("", Value::String("Hello".to_string()))));
613/// assert_eq!(parse_value("\"Line1\\nLine2\""), Ok(("", Value::String("Line1\nLine2".to_string()))));
614///
615/// // Numeric values
616/// assert_eq!(parse_value("123"), Ok(("", Value::Uint(123))));
617/// assert_eq!(parse_value("-456"), Ok(("", Value::Int(-456))));
618/// assert_eq!(parse_value("0x1a"), Ok(("", Value::Uint(26))));
619/// assert_eq!(parse_value("-0xFF"), Ok(("", Value::Int(-255))));
620///
621/// // Hex byte sequences
622/// assert_eq!(parse_value("\\x7f\\x45"), Ok(("", Value::Bytes(vec![0x7f, 0x45]))));
623/// ```
624///
625/// # Errors
626///
627/// Returns a nom parsing error if:
628/// - Input is empty or contains no valid value
629/// - Quoted string is not properly terminated
630/// - Numeric value cannot be parsed as a valid integer
631/// - Hex byte sequence contains invalid hex digits
632/// - Input contains invalid characters for the detected value format
633pub fn parse_value(input: &str) -> IResult<&str, Value> {
634    let (input, _) = multispace0(input)?;
635
636    // Handle empty input case - should fail for magic rules
637    if input.is_empty() {
638        return Err(nom::Err::Error(NomError::new(
639            input,
640            nom::error::ErrorKind::Tag,
641        )));
642    }
643
644    // Try to parse different value types in order of specificity
645    let (input, value) = alt((
646        // Try quoted string first
647        map(parse_quoted_string, Value::String),
648        // Try hex byte sequence before numeric (to catch patterns like "7f", "ab", "\\x7fELF", etc.)
649        map(parse_hex_bytes, Value::Bytes),
650        // Try float before integer (a float literal is a superset of an integer prefix)
651        map(parse_float_value, Value::Float),
652        // Try numeric value last (for pure numbers like 0x123, 1, etc.)
653        parse_numeric_value,
654    ))
655    .parse(input)?;
656
657    Ok((input, value))
658}
659
660/// Parse a type specification with an optional attached bitwise-AND mask operator
661/// (e.g., `lelong&0xf0000000`).
662///
663/// Returns the `TypeKind` and an optional `Operator`.
664///
665/// # Examples
666///
667/// ```
668/// use libmagic_rs::parser::grammar::parse_type_and_operator;
669/// use libmagic_rs::parser::ast::{TypeKind, Operator, Endianness};
670///
671/// // Type without operator
672/// let (_, (kind, op)) = parse_type_and_operator("lelong").unwrap();
673/// assert_eq!(kind, TypeKind::Long { endian: Endianness::Little, signed: true });
674/// assert_eq!(op, None);
675///
676/// // Type with mask operator
677/// let (_, (kind, op)) = parse_type_and_operator("lelong&0xf0000000").unwrap();
678/// assert!(matches!(op, Some(Operator::BitwiseAndMask(_))));
679/// ```
680///
681/// # Errors
682/// Returns a nom parsing error if the input doesn't match the expected format
683pub fn parse_type_and_operator(input: &str) -> IResult<&str, (TypeKind, Option<Operator>)> {
684    let (input, _) = multispace0(input)?;
685
686    let (input, type_name) = crate::parser::types::parse_type_keyword(input)?;
687
688    // Check for attached operator with mask (like &0xf0000000)
689    // Uses unsigned parsing so full u64 masks (e.g. 0xffffffffffffffff) are supported.
690    // If '&' is followed by digits/0x but the mask parse fails (overflow, etc.),
691    // we return a hard error instead of silently falling back to standalone '&'.
692    let (input, attached_op) = if let Some(after_amp) = input.strip_prefix('&') {
693        if after_amp.starts_with("0x") || after_amp.starts_with(|c: char| c.is_ascii_digit()) {
694            // '&' followed by what looks like a number -- must parse as mask
695            let (rest, mask) = parse_unsigned_number(after_amp).map_err(|_| {
696                nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::MapRes))
697            })?;
698            (rest, Some(Operator::BitwiseAndMask(mask)))
699        } else if after_amp.starts_with('&') {
700            // Reject '&&' -- not valid operator syntax
701            return Err(nom::Err::Error(nom::error::Error::new(
702                input,
703                nom::error::ErrorKind::Tag,
704            )));
705        } else {
706            // Standalone '&' (no digits following)
707            (after_amp, Some(Operator::BitwiseAnd))
708        }
709    } else {
710        (input, None)
711    };
712
713    let (input, _) = multispace0(input)?;
714
715    let type_kind = crate::parser::types::type_keyword_to_kind(type_name);
716
717    Ok((input, (type_kind, attached_op)))
718}
719
720/// Parse a type specification (byte, short, long, quad, string, etc.)
721///
722/// Supports various type formats found in magic files:
723/// - `byte` / `ubyte` - single byte (signed / unsigned)
724/// - `short` / `ushort` - 16-bit integer (native endian, signed / unsigned)
725/// - `leshort` / `uleshort` - 16-bit little-endian integer
726/// - `beshort` / `ubeshort` - 16-bit big-endian integer
727/// - `long` / `ulong` - 32-bit integer (native endian, signed / unsigned)
728/// - `lelong` / `ulelong` - 32-bit little-endian integer
729/// - `belong` / `ubelong` - 32-bit big-endian integer
730/// - `quad` / `uquad` - 64-bit integer (native endian, signed / unsigned)
731/// - `lequad` / `ulequad` - 64-bit little-endian integer
732/// - `bequad` / `ubequad` - 64-bit big-endian integer
733/// - `string` - null-terminated string
734///
735/// # Examples
736///
737/// ```
738/// use libmagic_rs::parser::grammar::parse_type;
739/// use libmagic_rs::parser::ast::{TypeKind, Endianness};
740///
741/// assert_eq!(parse_type("byte"), Ok(("", TypeKind::Byte { signed: true })));
742/// assert_eq!(parse_type("leshort"), Ok(("", TypeKind::Short { endian: Endianness::Little, signed: true })));
743/// assert_eq!(parse_type("bequad"), Ok(("", TypeKind::Quad { endian: Endianness::Big, signed: true })));
744/// assert_eq!(parse_type("string"), Ok(("", TypeKind::String { max_length: None })));
745/// ```
746///
747/// # Errors
748/// Returns a nom parsing error if the input doesn't match any known type
749pub fn parse_type(input: &str) -> IResult<&str, TypeKind> {
750    let (input, (type_kind, _)) = parse_type_and_operator(input)?;
751    Ok((input, type_kind))
752}
753
754/// Parse the indentation level and offset for magic rules
755///
756/// Handles both absolute offsets and hierarchical child rules with `>` prefix.
757/// Child rules can be nested multiple levels deep with multiple `>` characters.
758///
759/// # Examples
760///
761/// ```
762/// use libmagic_rs::parser::grammar::parse_rule_offset;
763/// use libmagic_rs::parser::ast::OffsetSpec;
764///
765/// // Absolute offset
766/// assert_eq!(parse_rule_offset("0"), Ok(("", (0, OffsetSpec::Absolute(0)))));
767/// assert_eq!(parse_rule_offset("16"), Ok(("", (0, OffsetSpec::Absolute(16)))));
768///
769/// // Child rule (level 1)
770/// assert_eq!(parse_rule_offset(">4"), Ok(("", (1, OffsetSpec::Absolute(4)))));
771///
772/// // Nested child rule (level 2)
773/// assert_eq!(parse_rule_offset(">>8"), Ok(("", (2, OffsetSpec::Absolute(8)))));
774/// ```
775/// Parse rule offset with hierarchy level (> prefixes) and offset specification
776///
777/// # Errors
778/// Returns a nom parsing error if the input doesn't match the expected offset format
779pub fn parse_rule_offset(input: &str) -> IResult<&str, (u32, OffsetSpec)> {
780    let (input, _) = multispace0(input)?;
781
782    // Count the number of '>' characters for nesting level
783    let (input, level_chars) = many0(char('>')).parse(input)?;
784    let level = u32::try_from(level_chars.len()).unwrap_or(0);
785
786    // Parse the offset after the '>' characters
787    let (input, offset_spec) = parse_offset(input)?;
788
789    Ok((input, (level, offset_spec)))
790}
791
792/// Parse the message part of a magic rule
793///
794/// The message is everything after the value until the end of the line.
795/// It may contain format specifiers and can be empty.
796///
797/// # Examples
798///
799/// ```
800/// use libmagic_rs::parser::grammar::parse_message;
801///
802/// assert_eq!(parse_message("ELF executable"), Ok(("", "ELF executable".to_string())));
803/// assert_eq!(parse_message(""), Ok(("", "".to_string())));
804/// assert_eq!(parse_message("  \tPDF document  "), Ok(("", "PDF document".to_string())));
805/// ```
806/// Parse the message/description part of a magic rule
807///
808/// # Errors
809/// Returns a nom parsing error if the input cannot be parsed as a message
810pub fn parse_message(input: &str) -> IResult<&str, String> {
811    let (input, _) = multispace0(input)?;
812
813    // Take everything until end of line, trimming whitespace
814    // Use take_while instead of take_while1 to handle empty messages
815    let (input, message_text) = take_while(|c: char| c != '\n' && c != '\r').parse(input)?;
816    let message = message_text.trim().to_string();
817
818    Ok((input, message))
819}
820
821/// Parse a strength directive (`!:strength` line)
822///
823/// Parses the `!:strength` directive that modifies rule strength.
824/// Format: `!:strength [+|-|*|/|=]N` or `!:strength N`
825///
826/// # Examples
827///
828/// ```
829/// use libmagic_rs::parser::grammar::parse_strength_directive;
830/// use libmagic_rs::parser::ast::StrengthModifier;
831///
832/// assert_eq!(parse_strength_directive("!:strength +10"), Ok(("", StrengthModifier::Add(10))));
833/// assert_eq!(parse_strength_directive("!:strength -5"), Ok(("", StrengthModifier::Subtract(5))));
834/// assert_eq!(parse_strength_directive("!:strength *2"), Ok(("", StrengthModifier::Multiply(2))));
835/// assert_eq!(parse_strength_directive("!:strength /2"), Ok(("", StrengthModifier::Divide(2))));
836/// assert_eq!(parse_strength_directive("!:strength =50"), Ok(("", StrengthModifier::Set(50))));
837/// assert_eq!(parse_strength_directive("!:strength 50"), Ok(("", StrengthModifier::Set(50))));
838/// ```
839///
840/// # Errors
841///
842/// Returns a nom parsing error if:
843/// - Input doesn't start with `!:strength`
844/// - The modifier value cannot be parsed as a valid integer
845/// - The operator is invalid
846pub fn parse_strength_directive(input: &str) -> IResult<&str, StrengthModifier> {
847    // Helper to safely convert i64 to i32 with clamping to valid strength range.
848    // This prevents silent truncation to 0 on overflow while keeping values in bounds.
849    fn clamp_to_i32(n: i64) -> i32 {
850        // Use i64::from for lossless conversion, then clamp and convert back
851        let clamped = n.clamp(i64::from(i32::MIN), i64::from(i32::MAX));
852        // Safe to unwrap: clamped value is guaranteed to be in i32 range
853        i32::try_from(clamped).unwrap()
854    }
855
856    let (input, _) = multispace0(input)?;
857    let (input, _) = tag("!:strength")(input)?;
858    let (input, _) = multispace0(input)?;
859
860    // Parse the operator: +, -, *, /, = or bare number (implies =)
861    let (input, modifier) = alt((
862        // +N -> Add
863        map(pair(char('+'), parse_number), |(_, n)| {
864            StrengthModifier::Add(clamp_to_i32(n))
865        }),
866        // -N -> Subtract (note: parse_number handles negative, so we need special handling)
867        map(pair(char('-'), parse_decimal_number), |(_, n)| {
868            StrengthModifier::Subtract(clamp_to_i32(n))
869        }),
870        // *N -> Multiply
871        map(pair(char('*'), parse_number), |(_, n)| {
872            StrengthModifier::Multiply(clamp_to_i32(n))
873        }),
874        // /N -> Divide
875        map(pair(char('/'), parse_number), |(_, n)| {
876            StrengthModifier::Divide(clamp_to_i32(n))
877        }),
878        // =N -> Set
879        map(pair(char('='), parse_number), |(_, n)| {
880            StrengthModifier::Set(clamp_to_i32(n))
881        }),
882        // Bare number -> Set
883        map(parse_number, |n| StrengthModifier::Set(clamp_to_i32(n))),
884    ))
885    .parse(input)?;
886
887    Ok((input, modifier))
888}
889
/// Report whether a line is a strength directive
///
/// A line qualifies when its first non-whitespace text is `!:strength`. Nothing
/// after that prefix is validated here.
///
/// # Examples
///
/// ```
/// use libmagic_rs::parser::grammar::is_strength_directive;
///
/// assert!(is_strength_directive("!:strength +10"));
/// assert!(is_strength_directive("  !:strength -5"));
/// assert!(!is_strength_directive("0 byte 1"));
/// ```
#[must_use]
pub fn is_strength_directive(input: &str) -> bool {
    // Only leading whitespace can affect a prefix test, so trimming the start is enough.
    input.trim_start().strip_prefix("!:strength").is_some()
}
905
906/// Parse a complete magic rule line from text format
907///
908/// Parses a complete magic rule in the format:
909/// `[>...]offset type [operator] value [message]`
910///
911/// Where:
912/// - `>...` indicates child rule nesting level (optional)
913/// - `offset` is the byte offset to read from
914/// - `type` is the data type (byte, short, long, string, etc.)
915/// - `operator` is the comparison operator (=, !=, &) - defaults to = if omitted
916/// - `value` is the expected value to compare against
917/// - `message` is the human-readable description (optional)
918///
919/// # Examples
920///
921/// ```
922/// use libmagic_rs::parser::grammar::parse_magic_rule;
923/// use libmagic_rs::parser::ast::{MagicRule, OffsetSpec, TypeKind, Operator, Value};
924///
925/// // Basic rule
926/// let input = "0 string \\x7fELF ELF executable";
927/// let (_, rule) = parse_magic_rule(input).unwrap();
928/// assert_eq!(rule.level, 0);
929/// assert_eq!(rule.message, "ELF executable");
930///
931/// // Child rule
932/// let input = ">4 byte 1 32-bit";
933/// let (_, rule) = parse_magic_rule(input).unwrap();
934/// assert_eq!(rule.level, 1);
935/// assert_eq!(rule.message, "32-bit");
936/// ```
937///
938/// # Errors
939///
940/// Returns a nom parsing error if:
941/// - The offset specification is invalid
942/// - The type specification is not recognized
943/// - The operator is invalid (if present)
944/// - The value cannot be parsed
945/// - The input format doesn't match the expected magic rule syntax
946pub fn parse_magic_rule(input: &str) -> IResult<&str, MagicRule> {
947    let (input, _) = multispace0(input)?;
948
949    // Parse the offset with nesting level
950    let (input, (level, offset)) = parse_rule_offset(input)?;
951
952    // Parse the type and any attached operator
953    let (input, (typ, attached_op)) = parse_type_and_operator(input)?;
954
955    // Try to parse a separate operator (optional - use attached operator if present)
956    let (input, separate_op) = opt(parse_operator).parse(input)?;
957    let op = attached_op.or(separate_op).unwrap_or(Operator::Equal);
958
959    // For AnyValue (`x`), no operand is needed -- treat remaining text as message
960    let (input, value) = if op == Operator::AnyValue {
961        (input, Value::Uint(0))
962    } else {
963        parse_value(input)?
964    };
965
966    // Parse the message (optional - everything remaining on the line)
967    let (input, message) = if input.trim().is_empty() {
968        (input, String::new())
969    } else {
970        parse_message(input)?
971    };
972
973    let rule = MagicRule {
974        offset,
975        typ,
976        op,
977        value,
978        message,
979        children: vec![], // Children will be added during hierarchical parsing
980        level,
981        strength_modifier: None, // Will be set during directive parsing
982    };
983
984    Ok((input, rule))
985}
986
987/// Parse a comment line (starts with #)
988///
989/// Comments in magic files start with '#' and continue to the end of the line.
990/// This function consumes the entire comment line.
991///
992/// # Examples
993///
994/// ```
995/// use libmagic_rs::parser::grammar::parse_comment;
996///
997/// assert_eq!(parse_comment("# This is a comment"), Ok(("", "This is a comment".to_string())));
998/// assert_eq!(parse_comment("#"), Ok(("", "".to_string())));
999/// ```
1000/// Parse a comment line (starting with #)
1001///
1002/// # Errors
1003/// Returns a nom parsing error if the input is not a valid comment
1004pub fn parse_comment(input: &str) -> IResult<&str, String> {
1005    let (input, _) = multispace0(input)?;
1006    let (input, _) = char('#').parse(input)?;
1007    let (input, comment_text) = take_while(|c: char| c != '\n' && c != '\r').parse(input)?;
1008    let comment = comment_text.trim().to_string();
1009    Ok((input, comment))
1010}
1011
/// Report whether a line is blank
///
/// A line is blank when it is empty or made up solely of whitespace characters.
///
/// # Examples
///
/// ```
/// use libmagic_rs::parser::grammar::is_empty_line;
///
/// assert!(is_empty_line(""));
/// assert!(is_empty_line("   "));
/// assert!(is_empty_line("\t\t"));
/// assert!(!is_empty_line("0 byte 1"));
/// ```
#[must_use]
pub fn is_empty_line(input: &str) -> bool {
    // `all` is vacuously true for the empty string, which covers the empty-line case.
    input.chars().all(char::is_whitespace)
}
1028
/// Report whether a line is a comment
///
/// A line is a comment when its first non-whitespace character is `#`.
///
/// # Examples
///
/// ```
/// use libmagic_rs::parser::grammar::is_comment_line;
///
/// assert!(is_comment_line("# This is a comment"));
/// assert!(is_comment_line("#"));
/// assert!(is_comment_line("  # Indented comment"));
/// assert!(!is_comment_line("0 byte 1"));
/// ```
#[must_use]
pub fn is_comment_line(input: &str) -> bool {
    let first_visible = input.chars().find(|ch| !ch.is_whitespace());
    matches!(first_visible, Some('#'))
}
1045
/// Report whether a line ends with the continuation character (`\`)
///
/// Magic files support line continuation with a backslash at the end of a line.
/// Trailing whitespace after the backslash is ignored.
///
/// # Examples
///
/// ```
/// use libmagic_rs::parser::grammar::has_continuation;
///
/// assert!(has_continuation("0 string test \\"));
/// assert!(has_continuation("message continues \\"));
/// assert!(!has_continuation("0 string test"));
/// ```
#[must_use]
pub fn has_continuation(input: &str) -> bool {
    // Scan from the end for the last character that is not whitespace.
    let last_visible = input.chars().rfind(|ch| !ch.is_whitespace());
    matches!(last_visible, Some('\\'))
}
1063#[cfg(test)]
1064mod tests;