pure_magic/
lib.rs

1#![forbid(unsafe_code)]
2#![deny(unused_imports)]
3#![deny(missing_docs)]
4//! # `pure-magic`: A pure and safe Rust Reimplementation of `libmagic`
5//!
6//! Unlike many file identification crates, `pure-magic` is highly compatible with the standard
7//! `magic` rule format, allowing seamless reuse of existing
8//! [rules](https://github.com/qjerome/magic-rs/tree/main/magic-db/src/magdir). This makes it an ideal
9//! drop-in replacement for crates relying on **`libmagic` C bindings**, where memory safety is critical.
10//!
11//! **Key Features:**
12//! - File type detection
13//! - MIME type inference
14//! - Custom magic rule parsing
15//!
16//! ## Installation
17//! Add `pure-magic` to your `Cargo.toml`:
18//!
19//! ```toml
20//! [dependencies]
21//! pure-magic = "0.1"  # Replace with the latest version
22//! ```
23//!
24//! Or add the latest version with cargo:
25//!
26//! ```sh
27//! cargo add pure-magic
28//! ```
29//!
30//! ## Quick Start
31//!
32//! ### Detect File Types Programmatically
33//! ```rust
34//! use pure_magic::{MagicDb, MagicSource};
35//! use std::fs::File;
36//!
37//! fn main() -> Result<(), Box<dyn std::error::Error>> {
38//!     let mut db = MagicDb::new();
39//!     // Create a MagicSource from a file
40//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
41//!     db.load(rust_magic)?;
42//!
43//!     // Open a file and detect its type
44//!     let mut file = File::open("src/lib.rs")?;
45//!     let magic = db.first_magic(&mut file, None)?;
46//!
47//!     println!(
48//!         "File type: {} (MIME: {}, strength: {})",
49//!         magic.message(),
50//!         magic.mime_type(),
51//!         magic.strength()
52//!     );
53//!     Ok(())
54//! }
55//! ```
56//!
57//! ### Get All Matching Rules
58//! ```rust
59//! use pure_magic::{MagicDb, MagicSource};
60//! use std::fs::File;
61//!
62//! fn main() -> Result<(), Box<dyn std::error::Error>> {
63//!     let mut db = MagicDb::new();
64//!     // Create a MagicSource from a file
65//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
66//!     db.load(rust_magic)?;
67//!
68//!     // Open a file and detect its type
69//!     let mut file = File::open("src/lib.rs")?;
70//!
71//!     // Get all matching rules, sorted by strength
72//!     let magics = db.all_magics(&mut file)?;
73//!
74//!     // Must contain rust file magic and default text magic
75//!     assert!(magics.len() > 1);
76//!
77//!     for magic in magics {
78//!         println!(
79//!             "Match: {} (strength: {}, source: {})",
80//!             magic.message(),
81//!             magic.strength(),
82//!             magic.source().unwrap_or("unknown")
83//!         );
84//!     }
85//!     Ok(())
86//! }
87//! ```
88//!
89//! ### Serialize a Database to Disk
90//! ```rust
91//! use pure_magic::{MagicDb, MagicSource};
92//! use std::fs::File;
93//!
94//! fn main() -> Result<(), Box<dyn std::error::Error>> {
95//!     let mut db = MagicDb::new();
96//!     // Create a MagicSource from a file
97//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
98//!     db.load(rust_magic)?;
99//!
100//!     // Serialize the database to a file
101//!     let mut output = File::create("/tmp/compiled.db")?;
102//!     db.serialize(&mut output)?;
103//!
104//!     println!("Database saved to file");
105//!     Ok(())
106//! }
107//! ```
108//!
109//! ### Deserialize a Database
110//! ```rust
111//! use pure_magic::{MagicDb, MagicSource};
112//! use std::fs::File;
113//!
114//! fn main() -> Result<(), Box<dyn std::error::Error>> {
115//!     let mut db = MagicDb::new();
116//!     // Create a MagicSource from a file
117//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
118//!     db.load(rust_magic)?;
119//!
120//!     // Serialize the database in a vector
121//!     let mut ser = vec![];
122//!     db.serialize(&mut ser)?;
123//!     println!("Database saved to vector");
124//!
125//!     // We deserialize from slice
126//!     let db = MagicDb::deserialize(&mut ser.as_slice())?;
127//!
128//!     assert!(!db.rules().is_empty());
129//!
130//!     Ok(())
131//! }
132//! ```
133//!
134//! ## License
135//! This project is licensed under the **GPL-3.0 License**.
136//!
137//! ## Contributing
138//! Contributions are welcome! Open an issue or submit a pull request.
139//!
140//! ## Acknowledgments
141//! - Inspired by the original `libmagic` (part of the `file` command).
142
143use dyf::{DynDisplay, FormatString, dformat};
144use flagset::{FlagSet, flags};
145use flate2::{Compression, read::GzDecoder, write::GzEncoder};
146use lazy_cache::LazyCache;
147use memchr::memchr;
148use pest::{Span, error::ErrorVariant};
149use regex::bytes::{self};
150use serde::{Deserialize, Serialize};
151use std::{
152    borrow::Cow,
153    cmp::max,
154    collections::{HashMap, HashSet},
155    fmt::{self, Debug, Display},
156    io::{self, Read, Seek, SeekFrom, Write},
157    ops::{Add, BitAnd, BitOr, BitXor, Div, Mul, Rem, Sub},
158    path::Path,
159};
160use tar::Archive;
161use thiserror::Error;
162use tracing::{Level, debug, enabled, trace};
163
164use crate::{
165    numeric::{Float, FloatDataType, Scalar, ScalarDataType},
166    parser::{FileMagicParser, Rule},
167    utils::{decode_id3, find_json_boundaries, run_utf8_validation},
168};
169
170mod numeric;
171mod parser;
172mod utils;
173
// strength value used for hardcoded magics
// NOTE(review): the code consuming this constant is outside this view — confirm
const HARDCODED_MAGIC_STRENGTH: u64 = 2048;
// source name used for hardcoded magics
// NOTE(review): the code consuming this constant is outside this view — confirm
const HARDCODED_SOURCE: &str = "hardcoded";
// corresponds to the FILE_INDIR_MAX constant defined in libmagic
const MAX_RECURSION: usize = 50;
// constant found in libmagic. It is used as the upper limit for search tests
const FILE_BYTES_MAX: usize = 7 * 1024 * 1024;
// constant found in libmagic. It is used as the upper limit for regex tests
const FILE_REGEX_MAX: usize = 8192;

/// Default mimetype for un-identified binary data
pub const DEFAULT_BIN_MIMETYPE: &str = "application/octet-stream";
/// Default mimetype for un-identified text data
pub const DEFAULT_TEXT_MIMETYPE: &str = "text/plain";

// strftime-style pattern (`YYYY-MM-DD HH:MM:SS`)
// NOTE(review): presumably used to render date values — confirm against callers
pub(crate) const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
189
// Panics with the given `panic!` arguments, but only when the crate is
// compiled with `debug_assertions` enabled. In other builds the expansion
// does nothing and execution continues with the statement following the
// macro call.
macro_rules! debug_panic {
    ($($arg:tt)*) => {
        // `cfg!` evaluates to a boolean constant, so the arguments are still
        // type-checked in every build profile
        if cfg!(debug_assertions) {
            panic!($($arg)*);
        }
    };
}
197
// Reads exactly `size_of::<$ty>()` bytes from `$r` (through `read_exact`)
// and evaluates to the filled byte array.
//
// The expansion uses `?`, so it can only be used inside a function whose
// error type can be built from the error returned by `read_exact`.
macro_rules! read {
    ($r: expr, $ty: ty) => {{
        // zero-initialised buffer sized after the requested type
        let mut a = [0u8; std::mem::size_of::<$ty>()];
        $r.read_exact(&mut a)?;
        a
    }};
}
205
// Reads a value of type `$ty` from `$r`, decoding the bytes as little-endian.
// Relies on `read!`, so it propagates read errors with `?`.
macro_rules! read_le {
    ($r:expr, $ty: ty ) => {{ <$ty>::from_le_bytes(read!($r, $ty)) }};
}
209
// Reads a value of type `$ty` from `$r`, decoding the bytes as big-endian.
// Relies on `read!`, so it propagates read errors with `?`.
macro_rules! read_be {
    ($r:expr, $ty: ty ) => {{ <$ty>::from_be_bytes(read!($r, $ty)) }};
}
213
// Reads two consecutive little-endian `u16` from `$r` and combines them into
// an `i32`: the first value read becomes the upper 16 bits, the second one
// the lower 16 bits.
// NOTE(review): `me` presumably stands for libmagic's "middle-endian" types — confirm
macro_rules! read_me {
    ($r: expr) => {{ ((read_le!($r, u16) as i32) << 16) | (read_le!($r, u16) as i32) }};
}
217
218#[inline(always)]
219fn read_octal_u64<R: Read + Seek>(haystack: &mut LazyCache<R>) -> Option<u64> {
220    let s = haystack
221        .read_while_or_limit(|b| matches!(b, b'0'..=b'7'), 22)
222        .map(|buf| str::from_utf8(buf))
223        .ok()?
224        .ok()?;
225
226    if !s.starts_with("0") {
227        return None;
228    }
229
230    u64::from_str_radix(s, 8).ok()
231}
232
/// Represents all possible errors that can occur during file type detection and processing.
///
/// Variants tagged with `#[from]` can be created from their source error
/// with the `?` operator.
#[derive(Debug, Error)]
pub enum Error {
    /// A generic error with a custom message.
    #[error("{0}")]
    Msg(String),

    /// An error with a source location and a nested error.
    ///
    /// Fields are, in order: the source name, the line number and the
    /// nested error.
    #[error("source={0} line={1} error={2}")]
    Localized(String, usize, Box<Error>),

    /// Indicates a required rule was not found.
    #[error("missing rule: {0}")]
    MissingRule(String),

    /// Indicates the maximum recursion depth was reached.
    #[error("maximum recursion reached: {0}")]
    MaximumRecursion(usize),

    /// Wraps an I/O error.
    #[error("io: {0}")]
    Io(#[from] io::Error),

    /// Wraps a parsing error from the `pest` parser.
    ///
    /// The error is boxed.
    #[error("parser error: {0}")]
    Parse(#[from] Box<pest::error::Error<Rule>>),

    /// Wraps a formatting error from the `dyf` crate.
    #[error("formatting: {0}")]
    Format(#[from] dyf::Error),

    /// Wraps a regex-related error.
    #[error("regex: {0}")]
    Regex(#[from] regex::Error),

    /// Wraps a serialization error from `bincode`.
    #[error("{0}")]
    Serialize(#[from] bincode::error::EncodeError),

    /// Wraps a deserialization error from `bincode`.
    #[error("{0}")]
    Deserialize(#[from] bincode::error::DecodeError),
}
276
277impl Error {
278    #[inline]
279    fn parser<S: ToString>(msg: S, span: Span<'_>) -> Self {
280        Self::Parse(Box::new(pest::error::Error::new_from_span(
281            ErrorVariant::CustomError {
282                message: msg.to_string(),
283            },
284            span,
285        )))
286    }
287
288    fn msg<M: AsRef<str>>(msg: M) -> Self {
289        Self::Msg(msg.as_ref().into())
290    }
291
292    fn localized<S: AsRef<str>>(source: S, line: usize, err: Error) -> Self {
293        Self::Localized(source.as_ref().into(), line, err.into())
294    }
295
296    /// Unwraps the localized error
297    pub fn unwrap_localized(&self) -> &Self {
298        match self {
299            Self::Localized(_, _, e) => e,
300            _ => self,
301        }
302    }
303}
304
/// Message attached to a rule, either a plain string or a format string.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Message {
    /// A message used as is, without any formatting.
    String(String),
    /// A message which can be formatted with a matched value.
    Format {
        /// printf specifier of the message; `Message::format_with` compares
        /// it against `"c"` to print byte values as characters.
        printf_spec: String,
        /// Format string the message is rendered from.
        fs: FormatString,
    },
}
313
314impl Display for Message {
315    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
316        match self {
317            Self::String(s) => write!(f, "{s}"),
318            Self::Format { printf_spec: _, fs } => write!(f, "{}", fs.to_string_lossy()),
319        }
320    }
321}
322
323impl Message {
324    fn to_string_lossy(&self) -> Cow<'_, str> {
325        match self {
326            Message::String(s) => Cow::Borrowed(s),
327            Message::Format { printf_spec: _, fs } => fs.to_string_lossy(),
328        }
329    }
330
331    #[inline(always)]
332    fn format_with(&self, mr: Option<&MatchRes>) -> Result<Cow<'_, str>, Error> {
333        match self {
334            Self::String(s) => Ok(Cow::Borrowed(s.as_str())),
335            Self::Format {
336                printf_spec: c_spec,
337                fs,
338            } => {
339                if let Some(mr) = mr {
340                    match mr {
341                        MatchRes::Float(_, _) | MatchRes::Bytes(_, _, _, _) => {
342                            Ok(Cow::Owned(dformat!(fs, mr)?))
343                        }
344                        MatchRes::Scalar(_, scalar) => {
345                            // we want to print a byte as char
346                            if c_spec.as_str() == "c" {
347                                match scalar {
348                                    Scalar::byte(b) => {
349                                        let b = (*b as u8) as char;
350                                        Ok(Cow::Owned(dformat!(fs, b)?))
351                                    }
352                                    Scalar::ubyte(b) => {
353                                        let b = *b as char;
354                                        Ok(Cow::Owned(dformat!(fs, b)?))
355                                    }
356                                    _ => Ok(Cow::Owned(dformat!(fs, mr)?)),
357                                }
358                            } else {
359                                Ok(Cow::Owned(dformat!(fs, mr)?))
360                            }
361                        }
362                    }
363                } else {
364                    Ok(fs.to_string_lossy())
365                }
366            }
367        }
368    }
369}
370
impl ScalarDataType {
    /// Reads a [`Scalar`] of this data type from the current position of `from`.
    ///
    /// When `switch_endianness` is `true`, every multi-byte value read
    /// through the local `_read_le!` / `_read_be!` macros is decoded with the
    /// opposite byte order. Single byte values and `guid` do not go through
    /// those macros and are not affected. `offset` does not read anything, it
    /// holds the current stream position.
    ///
    /// # Errors
    ///
    /// Any I/O error raised while reading (or while getting the stream
    /// position) is propagated with `?`.
    #[inline(always)]
    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Scalar, Error> {
        // little-endian read, big-endian when endianness is switched
        macro_rules! _read_le {
            ($ty: ty) => {{
                if switch_endianness {
                    <$ty>::from_be_bytes(read!(from, $ty))
                } else {
                    <$ty>::from_le_bytes(read!(from, $ty))
                }
            }};
        }

        // big-endian read, little-endian when endianness is switched
        macro_rules! _read_be {
            ($ty: ty) => {{
                if switch_endianness {
                    <$ty>::from_le_bytes(read!(from, $ty))
                } else {
                    <$ty>::from_be_bytes(read!(from, $ty))
                }
            }};
        }

        // native-endian read: picks the macro matching the endianness of the
        // compilation target, so endianness switching applies as well
        macro_rules! _read_ne {
            ($ty: ty) => {{
                if cfg!(target_endian = "big") {
                    _read_be!($ty)
                } else {
                    _read_le!($ty)
                }
            }};
        }

        // two consecutive `u16` combined into an `i32`: the first value read
        // is the upper 16 bits, the second one the lower 16 bits
        macro_rules! _read_me {
            () => {
                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
            };
        }

        Ok(match self {
            // signed
            Self::byte => Scalar::byte(read!(from, u8)[0] as i8),
            Self::short => Scalar::short(_read_ne!(i16)),
            Self::long => Scalar::long(_read_ne!(i32)),
            Self::date => Scalar::date(_read_ne!(i32)),
            Self::ldate => Scalar::ldate(_read_ne!(i32)),
            Self::qwdate => Scalar::qwdate(_read_ne!(i64)),
            Self::leshort => Scalar::leshort(_read_le!(i16)),
            Self::lelong => Scalar::lelong(_read_le!(i32)),
            Self::lequad => Scalar::lequad(_read_le!(i64)),
            Self::bequad => Scalar::bequad(_read_be!(i64)),
            Self::belong => Scalar::belong(_read_be!(i32)),
            Self::bedate => Scalar::bedate(_read_be!(i32)),
            Self::beldate => Scalar::beldate(_read_be!(i32)),
            Self::beqdate => Scalar::beqdate(_read_be!(i64)),
            // unsigned
            Self::ubyte => Scalar::ubyte(read!(from, u8)[0]),
            Self::ushort => Scalar::ushort(_read_ne!(u16)),
            Self::uleshort => Scalar::uleshort(_read_le!(u16)),
            Self::ulelong => Scalar::ulelong(_read_le!(u32)),
            Self::uledate => Scalar::uledate(_read_le!(u32)),
            Self::ulequad => Scalar::ulequad(_read_le!(u64)),
            // nothing is read, the value is the current position in the stream
            Self::offset => Scalar::offset(from.stream_position()?),
            Self::ubequad => Scalar::ubequad(_read_be!(u64)),
            Self::medate => Scalar::medate(_read_me!()),
            Self::meldate => Scalar::meldate(_read_me!()),
            Self::melong => Scalar::melong(_read_me!()),
            Self::beshort => Scalar::beshort(_read_be!(i16)),
            Self::quad => Scalar::quad(_read_ne!(i64)),
            Self::uquad => Scalar::uquad(_read_ne!(u64)),
            Self::ledate => Scalar::ledate(_read_le!(i32)),
            Self::leldate => Scalar::leldate(_read_le!(i32)),
            Self::leqdate => Scalar::leqdate(_read_le!(i64)),
            Self::leqldate => Scalar::leqldate(_read_le!(i64)),
            Self::leqwdate => Scalar::leqwdate(_read_le!(i64)),
            Self::ubelong => Scalar::ubelong(_read_be!(u32)),
            Self::ulong => Scalar::ulong(_read_ne!(u32)),
            Self::ubeshort => Scalar::ubeshort(_read_be!(u16)),
            Self::ubeqdate => Scalar::ubeqdate(_read_be!(u64)),
            Self::lemsdosdate => Scalar::lemsdosdate(_read_le!(u16)),
            Self::lemsdostime => Scalar::lemsdostime(_read_le!(u16)),
            // always decoded as big-endian, endianness switching is not applied
            Self::guid => Scalar::guid(u128::from_be_bytes(read!(from, u128))),
        })
    }
}
456
457impl FloatDataType {
458    #[inline(always)]
459    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Float, Error> {
460        macro_rules! _read_le {
461            ($ty: ty) => {{
462                if switch_endianness {
463                    <$ty>::from_be_bytes(read!(from, $ty))
464                } else {
465                    <$ty>::from_le_bytes(read!(from, $ty))
466                }
467            }};
468        }
469
470        macro_rules! _read_be {
471            ($ty: ty) => {{
472                if switch_endianness {
473                    <$ty>::from_le_bytes(read!(from, $ty))
474                } else {
475                    <$ty>::from_be_bytes(read!(from, $ty))
476                }
477            }};
478        }
479
480        macro_rules! _read_ne {
481            ($ty: ty) => {{
482                if cfg!(target_endian = "big") {
483                    _read_be!($ty)
484                } else {
485                    _read_le!($ty)
486                }
487            }};
488        }
489
490        macro_rules! _read_me {
491            () => {
492                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
493            };
494        }
495
496        Ok(match self {
497            Self::lefloat => Float::lefloat(_read_le!(f32)),
498            Self::befloat => Float::befloat(_read_le!(f32)),
499            Self::ledouble => Float::ledouble(_read_le!(f64)),
500            Self::bedouble => Float::bedouble(_read_be!(f64)),
501        })
502    }
503}
504
/// Operator of a transformation applied to a value (see `ScalarTransform`
/// and `FloatTransform`).
///
/// The symbol given for each variant is the one written by the `Display`
/// implementation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Op {
    /// Multiplication `*`
    Mul,
    /// Addition `+`
    Add,
    /// Subtraction `-`
    Sub,
    /// Division `/`
    Div,
    /// Remainder `%`
    Mod,
    /// Bitwise and `&`
    And,
    /// Bitwise xor `^`
    Xor,
    /// Bitwise or `|`
    Or,
}
516
517impl Display for Op {
518    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
519        match self {
520            Op::Mul => write!(f, "*"),
521            Op::Add => write!(f, "+"),
522            Op::Sub => write!(f, "-"),
523            Op::Div => write!(f, "/"),
524            Op::Mod => write!(f, "%"),
525            Op::And => write!(f, "&"),
526            Op::Or => write!(f, "|"),
527            Op::Xor => write!(f, "^"),
528        }
529    }
530}
531
/// Comparison operator of a test.
// NOTE(review): the evaluation of each operator is outside this view, the
// meaning of the variants is presumably the one of the libmagic operators
// — confirm
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum CmpOp {
    Eq,
    Lt,
    Gt,
    BitAnd,
    Neq, // ! operator
    Xor,
    Not, // ~ operator
}
542
543impl CmpOp {
544    #[inline(always)]
545    fn is_neq(&self) -> bool {
546        matches!(self, Self::Neq)
547    }
548}
549
/// Transformation applied to a [`Scalar`]: `value <op> num`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTransform {
    /// Operator of the transformation.
    op: Op,
    /// Right-hand side operand of the transformation.
    num: Scalar,
}
555
556impl ScalarTransform {
557    fn apply(&self, s: Scalar) -> Option<Scalar> {
558        match self.op {
559            Op::Add => s.checked_add(self.num),
560            Op::Sub => s.checked_sub(self.num),
561            Op::Mul => s.checked_mul(self.num),
562            Op::Div => s.checked_div(self.num),
563            Op::Mod => s.checked_rem(self.num),
564            Op::And => Some(s.bitand(self.num)),
565            Op::Xor => Some(s.bitxor(self.num)),
566            Op::Or => Some(s.bitor(self.num)),
567        }
568    }
569}
570
/// Transformation applied to a [`Float`]: `value <op> num`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTransform {
    /// Operator of the transformation. Bitwise operators are not supported
    /// by `FloatTransform::apply`.
    op: Op,
    /// Right-hand side operand of the transformation.
    num: Float,
}
576
577impl FloatTransform {
578    fn apply(&self, s: Float) -> Float {
579        match self.op {
580            Op::Add => s.add(self.num),
581            Op::Sub => s.sub(self.num),
582            Op::Mul => s.mul(self.num),
583            // returns inf when div by 0
584            Op::Div => s.div(self.num),
585            // returns NaN when rem by 0
586            Op::Mod => s.rem(self.num),
587            // parser makes sure those operators cannot be used
588            Op::And | Op::Xor | Op::Or => {
589                debug_panic!("unsupported operation");
590                s
591            }
592        }
593    }
594}
595
/// Value a test compares against.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum TestValue<T> {
    /// A concrete value.
    Value(T),
    /// No concrete value.
    // NOTE(review): presumably the libmagic `x` test, matching any value — confirm
    Any,
}
601
602impl<T> TestValue<T> {
603    #[inline(always)]
604    fn as_ref(&self) -> TestValue<&T> {
605        match self {
606            Self::Value(v) => TestValue::Value(v),
607            Self::Any => TestValue::Any,
608        }
609    }
610}
611
// Modifiers of regex tests (`RegexTest::mods`), also carried by search
// tests (`SearchTest::re_mods`).
// NOTE(review): only `ForceBin` and `ForceText` are used in the visible code,
// the other flags are presumably the libmagic regex modifiers — confirm
flags! {
    enum ReMod: u8{
        CaseInsensitive,
        StartOffsetUpdate,
        LineLimit,
        // makes `is_binary` return true for the test holding it
        ForceBin,
        // makes `RegexTest::is_text` return true; cancels the binary status
        // of a search test
        ForceText,
        TrimMatch,
    }
}
622
623fn serialize_regex<S>(re: &bytes::Regex, serializer: S) -> Result<S::Ok, S::Error>
624where
625    S: serde::Serializer,
626{
627    re.as_str().serialize(serializer)
628}
629
630fn deserialize_regex<'de, D>(deserializer: D) -> Result<bytes::Regex, D::Error>
631where
632    D: serde::Deserializer<'de>,
633{
634    let wrapper = String::deserialize(deserializer)?;
635    bytes::Regex::new(&wrapper).map_err(serde::de::Error::custom)
636}
637
/// Test matching a regular expression against content.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct RegexTest {
    /// Compiled regex. It is serialized as its pattern string and compiled
    /// again at deserialization.
    #[serde(
        serialize_with = "serialize_regex",
        deserialize_with = "deserialize_regex"
    )]
    re: bytes::Regex,
    /// Optional length of the test. `match_buf` uses it as the maximum
    /// number of lines to scan for text streams.
    length: Option<usize>,
    /// Regex modifiers.
    mods: FlagSet<ReMod>,
    /// String modifiers.
    str_mods: FlagSet<StringMod>,
    // NOTE(review): not used in the visible code, presumably the length of
    // the pattern without its regex meta-characters — confirm
    non_magic_len: usize,
    /// Marks the test as a binary one (see `is_binary`).
    binary: bool,
    /// Comparison operator; with `CmpOp::Neq` the test matches when the
    /// regex does not.
    cmp_op: CmpOp,
}
652
653impl RegexTest {
654    #[inline(always)]
655    fn is_binary(&self) -> bool {
656        self.binary
657            || self.mods.contains(ReMod::ForceBin)
658            || self.str_mods.contains(StringMod::ForceBin)
659    }
660
661    #[inline(always)]
662    fn is_text(&self) -> bool {
663        self.mods.contains(ReMod::ForceText) || self.str_mods.contains(StringMod::ForceText)
664    }
665
666    fn match_buf<'buf>(
667        &self,
668        off_buf: u64, // absolute buffer offset in content
669        stream_kind: StreamKind,
670        buf: &'buf [u8],
671    ) -> Option<MatchRes<'buf>> {
672        let mr = match stream_kind {
673            StreamKind::Text(_) => {
674                let mut off_txt = off_buf;
675
676                let mut line_limit = self.length.unwrap_or(usize::MAX);
677
678                for line in buf.split(|c| c == &b'\n') {
679                    // we don't need to break on offset
680                    // limit as buf contains the good amount
681                    // of bytes to match against
682                    if line_limit == 0 {
683                        break;
684                    }
685
686                    if let Some(re_match) = self.re.find(line) {
687                        // the offset of the string is computed from the start of the buffer
688                        let start_offset = off_txt + re_match.start() as u64;
689
690                        // if we matched until EOL we need to add one to include the delimiter removed from the split
691                        let stop_offset = if re_match.end() == line.len() {
692                            Some(start_offset + re_match.as_bytes().len() as u64 + 1)
693                        } else {
694                            None
695                        };
696
697                        return Some(MatchRes::Bytes(
698                            start_offset,
699                            stop_offset,
700                            re_match.as_bytes(),
701                            Encoding::Utf8,
702                        ));
703                    }
704
705                    off_txt += line.len() as u64;
706                    // we have to add one because lines do not contain splitting character
707                    off_txt += 1;
708                    line_limit = line_limit.saturating_sub(1)
709                }
710                None
711            }
712
713            StreamKind::Binary => {
714                self.re.find(buf).map(|re_match| {
715                    MatchRes::Bytes(
716                        // the offset of the string is computed from the start of the buffer
717                        off_buf + re_match.start() as u64,
718                        None,
719                        re_match.as_bytes(),
720                        Encoding::Utf8,
721                    )
722                })
723            }
724        };
725
726        // handle the case where we want the regex not to match
727        if self.cmp_op.is_neq() && mr.is_none() {
728            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
729        }
730
731        mr
732    }
733}
734
735impl From<RegexTest> for Test {
736    fn from(value: RegexTest) -> Self {
737        Self::Regex(value)
738    }
739}
740
// Modifiers of string tests, also carried by regex and search tests.
// Comments below only describe what the code of `string_match` and of the
// `is_binary` / `is_text` helpers does with each flag.
flags! {
    enum StringMod: u8{
        // makes `is_binary` return true for the test holding it
        ForceBin,
        // upper case characters of the test string match both cases in the target
        UpperInsensitive,
        // lower case characters of the test string match both cases in the target
        LowerInsensitive,
        // the byte following the match, if any, must be an ASCII whitespace
        FullWordMatch,
        // NOTE(review): not used in the visible code — confirm semantics
        Trim,
        // makes `is_text` return true for the test holding it
        ForceText,
        // a run of spaces in the test string must match a run of spaces at
        // least as long in the target
        CompactWhitespace,
        // spaces are skipped, both in the test string and in the target
        OptBlank,
    }
}
753
/// Test comparing content against a string.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StringTest {
    /// Bytes to compare against, if any.
    test_val: TestValue<Vec<u8>>,
    /// Comparison operator.
    cmp_op: CmpOp,
    // NOTE(review): not used in the visible code, presumably an optional
    // length limit of the test — confirm
    length: Option<usize>,
    /// String modifiers.
    mods: FlagSet<StringMod>,
    /// Marks the test as a binary one (see `is_binary`).
    binary: bool,
}
762
763impl From<StringTest> for Test {
764    fn from(value: StringTest) -> Self {
765        Self::String(value)
766    }
767}
768
/// Checks whether `buf` starts with the test string `str`, taking the
/// string modifiers `mods` into account.
///
/// Returns a tuple made of:
/// - a boolean telling whether the match succeeded
/// - the number of bytes of `buf` consumed by the comparison. With some
///   modifiers it may differ from `str.len()`.
///
/// When none of the modifiers changing the comparison is set, this is a
/// plain prefix comparison.
#[inline(always)]
fn string_match(str: &[u8], mods: FlagSet<StringMod>, buf: &[u8]) -> (bool, usize) {
    // number of bytes of buf consumed so far
    let mut consumed = 0;
    // we can do a simple string comparison
    if mods.is_disjoint(
        StringMod::UpperInsensitive
            | StringMod::LowerInsensitive
            | StringMod::FullWordMatch
            | StringMod::CompactWhitespace
            | StringMod::OptBlank,
    ) {
        // we check if target starts with the test string
        if buf.starts_with(str) {
            (true, str.len())
        } else {
            // nothing got consumed here, so this is always (false, 0)
            (false, consumed)
        }
    } else {
        // index of the current byte in the test string
        let mut i_src = 0;
        // iterator over the bytes of the target
        let mut iter = buf.iter().peekable();

        // moves to the next byte of the target
        macro_rules! consume_target {
            () => {{
                if iter.next().is_some() {
                    consumed += 1;
                }
            }};
        }

        // current bytes match: moves to the next byte in both target and
        // test string and jumps to the next loop iteration
        macro_rules! continue_next_iteration {
            () => {{
                consume_target!();
                i_src += 1;
                continue;
            }};
        }

        // loop stops when either target or test string is exhausted
        while let Some(&&b) = iter.peek() {
            let Some(&ref_byte) = str.get(i_src) else {
                break;
            };

            if mods.contains(StringMod::OptBlank) && (b == b' ' || ref_byte == b' ') {
                if b == b' ' {
                    // we ignore whitespace in target
                    consume_target!();
                }

                if ref_byte == b' ' {
                    // we ignore whitespace in test
                    i_src += 1;
                }

                continue;
            }

            // when this check fails we do not give up yet, the checks
            // below still get a chance to match
            if mods.contains(StringMod::UpperInsensitive) {
                //upper case characters in the magic match both lower and upper case characters in the target
                if ref_byte.is_ascii_uppercase() && ref_byte == b.to_ascii_uppercase()
                    || ref_byte == b
                {
                    continue_next_iteration!()
                }
            }

            // lower case characters in the magic match both lower and upper case characters in the target
            if mods.contains(StringMod::LowerInsensitive)
                && (ref_byte.is_ascii_lowercase() && ref_byte == b.to_ascii_lowercase()
                    || ref_byte == b)
            {
                continue_next_iteration!()
            }

            if mods.contains(StringMod::CompactWhitespace) && ref_byte == b' ' {
                // we skip the run of spaces in the test string
                let mut src_blk = 0;
                while let Some(b' ') = str.get(i_src) {
                    src_blk += 1;
                    i_src += 1;
                }

                // we skip the run of spaces in the target
                let mut tgt_blk = 0;
                while let Some(b' ') = iter.peek() {
                    tgt_blk += 1;
                    consume_target!();
                }

                // target must contain at least as many spaces as the test string
                if src_blk > tgt_blk {
                    return (false, consumed);
                }

                continue;
            }

            // no modifier applied, bytes must be strictly equal
            if ref_byte == b {
                continue_next_iteration!()
            } else {
                return (false, consumed);
            }
        }

        // the byte following the match, if any, must be a whitespace
        if mods.contains(StringMod::FullWordMatch)
            && let Some(b) = iter.peek()
            && !b.is_ascii_whitespace()
        {
            return (false, consumed);
        }

        // we have a match if we consumed some bytes of the target
        // and the test string got entirely walked through
        (
            consumed > 0 && str.get(i_src).is_none() && consumed <= buf.len(),
            consumed,
        )
    }
}
881
882impl StringTest {
883    fn has_length_mod(&self) -> bool {
884        !self.mods.is_disjoint(
885            StringMod::UpperInsensitive
886                | StringMod::LowerInsensitive
887                | StringMod::FullWordMatch
888                | StringMod::CompactWhitespace
889                | StringMod::OptBlank,
890        )
891    }
892
893    #[inline(always)]
894    fn test_value_len(&self) -> usize {
895        match self.test_val.as_ref() {
896            TestValue::Value(s) => s.len(),
897            TestValue::Any => 0,
898        }
899    }
900
901    #[inline(always)]
902    fn is_binary(&self) -> bool {
903        self.binary || self.mods.contains(StringMod::ForceBin)
904    }
905
906    #[inline(always)]
907    fn is_text(&self) -> bool {
908        self.mods.contains(StringMod::ForceText)
909    }
910}
911
/// Test searching a string in content.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SearchTest {
    /// Bytes to search for.
    str: Vec<u8>,
    /// Optional search limit: `match_buf` stops searching once the
    /// candidate position in the buffer is greater than this value.
    n_pos: Option<usize>,
    /// String modifiers.
    str_mods: FlagSet<StringMod>,
    /// Regex modifiers.
    re_mods: FlagSet<ReMod>,
    /// Marks the test as a binary one (see `is_binary`).
    binary: bool,
    /// Comparison operator; with `CmpOp::Neq` the test matches when the
    /// string is not found.
    cmp_op: CmpOp,
}
921
922impl From<SearchTest> for Test {
923    fn from(value: SearchTest) -> Self {
924        Self::Search(value)
925    }
926}
927
928impl SearchTest {
929    #[inline(always)]
930    fn is_binary(&self) -> bool {
931        (self.binary
932            || self.str_mods.contains(StringMod::ForceBin)
933            || self.re_mods.contains(ReMod::ForceBin))
934            && !(self.str_mods.contains(StringMod::ForceText)
935                || self.re_mods.contains(ReMod::ForceText))
936    }
937
938    // off_buf: absolute buffer offset in content
939    #[inline]
940    fn match_buf<'buf>(&self, off_buf: u64, buf: &'buf [u8]) -> Option<MatchRes<'buf>> {
941        let mut i = 0;
942
943        let needle = self.str.first()?;
944
945        while i < buf.len() {
946            // we cannot match if the first character isn't the same
947            // so we accelerate the search by finding potential matches
948            i += memchr(*needle, &buf[i..])?;
949
950            // if we want a full word match
951            if self.str_mods.contains(StringMod::FullWordMatch) {
952                let prev_is_whitespace = buf
953                    .get(i.saturating_sub(1))
954                    .map(|c| c.is_ascii_whitespace())
955                    .unwrap_or_default();
956
957                // if it is not the first character
958                // and its previous character isn't
959                // a whitespace. It cannot be a
960                // fullword match
961                if i > 0 && !prev_is_whitespace {
962                    i += 1;
963                    continue;
964                }
965            }
966
967            if let Some(npos) = self.n_pos
968                && i > npos
969            {
970                break;
971            }
972
973            let pos = i;
974            let (ok, consumed) = string_match(&self.str, self.str_mods, &buf[i..]);
975
976            if ok {
977                return Some(MatchRes::Bytes(
978                    off_buf.saturating_add(pos as u64),
979                    None,
980                    &buf[i..i + consumed],
981                    Encoding::Utf8,
982                ));
983            } else {
984                i += max(consumed, 1)
985            }
986        }
987
988        // handles the case where we want the string not to be found
989        if self.cmp_op.is_neq() {
990            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
991        }
992
993        None
994    }
995}
996
// Test applied on a scalar value read from the content.
997#[derive(Debug, Clone, Serialize, Deserialize)]
998struct ScalarTest {
    // type of the scalar to read from the content
999    ty: ScalarDataType,
    // optional transformation applied on the value read, before comparing it
1000    transform: Option<ScalarTransform>,
    // operator used to compare the (transformed) value read with `test_val`
1001    cmp_op: CmpOp,
    // value to compare with, `TestValue::Any` accepts any value read
1002    test_val: TestValue<Scalar>,
1003}
1004
// Test applied on a floating point value read from the content.
1005#[derive(Debug, Clone, Serialize, Deserialize)]
1006struct FloatTest {
    // type of the float to read from the content
1007    ty: FloatDataType,
    // optional transformation applied on the value read, before comparing it
1008    transform: Option<FloatTransform>,
    // operator used to compare the (transformed) value read with `test_val`
1009    cmp_op: CmpOp,
    // value to compare with, `TestValue::Any` accepts any value read
1010    test_val: TestValue<Float>,
1011}
1012
1013// the value read from the haystack we want to match against
1014// 'buf is the lifetime of the buffer we are scanning
// The first member of every variant is the position of the haystack
// at the time the value got read (i.e. the offset of the value).
1015#[derive(Debug, PartialEq)]
1016enum ReadValue<'buf> {
    // floating point value
1017    Float(u64, Float),
    // scalar value
1018    Scalar(u64, Scalar),
    // raw bytes borrowed from the haystack
1019    Bytes(u64, &'buf [u8]),
1020}
1021
1022impl DynDisplay for ReadValue<'_> {
1023    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1024        match self {
1025            Self::Float(_, s) => DynDisplay::dyn_fmt(s, f),
1026            Self::Scalar(_, s) => DynDisplay::dyn_fmt(s, f),
1027            Self::Bytes(_, b) => Ok(format!("{b:?}")),
1028        }
1029    }
1030}
1031
1032impl DynDisplay for &ReadValue<'_> {
1033    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1034        // Dereference self to get the TestValue and call its fmt method
1035        DynDisplay::dyn_fmt(*self, f)
1036    }
1037}
1038
1039impl Display for ReadValue<'_> {
1040    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1041        match self {
1042            Self::Float(_, v) => write!(f, "{v}"),
1043            Self::Scalar(_, s) => write!(f, "{s}"),
1044            Self::Bytes(_, b) => write!(f, "{b:?}"),
1045        }
1046    }
1047}
1048
// Encoding of the bytes carried by a `MatchRes::Bytes`. It selects how
// the bytes are decoded into a string when the match gets formatted.
1049enum Encoding {
    // UTF-16, with the given byte order
1050    Utf16(String16Encoding),
    // UTF-8 (decoded lossily when formatted)
1051    Utf8,
1052}
1053
1054// Carry the offset of the start of the data in the stream
1055// and the data itself
1056enum MatchRes<'buf> {
1057    // Bytes.0: offset of the match
1058    // Bytes.1: optional end of match (to address the need of EOL adjustment in string regex)
    //          when missing, the end of the match is Bytes.0 + length of Bytes.2
1059    // Bytes.2: the bytes matching
1060    // Bytes.3: encoding of the buffer
1061    Bytes(u64, Option<u64>, &'buf [u8], Encoding),
    // Scalar.0: offset of the match
    // Scalar.1: the matching scalar
1062    Scalar(u64, Scalar),
    // Float.0: offset of the match
    // Float.1: the matching float
1063    Float(u64, Float),
1064}
1065
1066impl DynDisplay for &MatchRes<'_> {
1067    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1068        (*self).dyn_fmt(f)
1069    }
1070}
1071
1072impl DynDisplay for MatchRes<'_> {
1073    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1074        match self {
1075            Self::Scalar(_, v) => v.dyn_fmt(f),
1076            Self::Float(_, v) => v.dyn_fmt(f),
1077            Self::Bytes(_, _, v, enc) => match enc {
1078                Encoding::Utf8 => String::from_utf8_lossy(v).to_string().dyn_fmt(f),
1079                Encoding::Utf16(enc) => {
1080                    let utf16: Vec<u16> = slice_to_utf16_iter(v, *enc).collect();
1081                    String::from_utf16_lossy(&utf16).dyn_fmt(f)
1082                }
1083            },
1084        }
1085    }
1086}
1087
1088impl MatchRes<'_> {
1089    // start offset of the match
1090    #[inline]
1091    fn start_offset(&self) -> u64 {
1092        match self {
1093            MatchRes::Bytes(o, _, _, _) => *o,
1094            MatchRes::Scalar(o, _) => *o,
1095            MatchRes::Float(o, _) => *o,
1096        }
1097    }
1098
1099    // start offset of the match
1100    #[inline]
1101    fn end_offset(&self) -> u64 {
1102        match self {
1103            MatchRes::Bytes(start, end, buf, _) => match end {
1104                Some(end) => *end,
1105                None => start.saturating_add(buf.len() as u64),
1106            },
1107            MatchRes::Scalar(o, sc) => o.add(sc.size_of() as u64),
1108            MatchRes::Float(o, f) => o.add(f.size_of() as u64),
1109        }
1110    }
1111}
1112
1113fn slice_to_utf16_iter(read: &[u8], encoding: String16Encoding) -> impl Iterator<Item = u16> {
1114    let even = read
1115        .iter()
1116        .enumerate()
1117        .filter(|(i, _)| i % 2 == 0)
1118        .map(|t| t.1);
1119
1120    let odd = read
1121        .iter()
1122        .enumerate()
1123        .filter(|(i, _)| i % 2 != 0)
1124        .map(|t| t.1);
1125
1126    even.zip(odd).map(move |(e, o)| match encoding {
1127        String16Encoding::Le => u16::from_le_bytes([*e, *o]),
1128        String16Encoding::Be => u16::from_be_bytes([*e, *o]),
1129    })
1130}
1131
// Byte order of the 16 bits code units of a UTF-16 string.
1132#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
1133enum String16Encoding {
    // little endian
1134    Le,
    // big endian
1135    Be,
1136}
1137
// Test applied on a UTF-16 string read from the content.
1138#[derive(Debug, Clone, Serialize, Deserialize)]
1139struct String16Test {
    // string whose bytes are carried by the result of a successful
    // comparison against a specific value
    // NOTE(review): presumably the test string as written in the rule — confirm
1140    orig: String,
    // expected UTF-16 code units, `TestValue::Any` accepts any string
1141    test_val: TestValue<Vec<u16>>,
    // byte order used to decode the code units read from the content
1142    encoding: String16Encoding,
1143}
1144
1145impl String16Test {
1146    /// if the test value is a specific value this method returns
1147    /// the number of utf16 characters. To obtain the length in
1148    /// bytes the return value needs to be multiplied by two.
1149    #[inline(always)]
1150    fn test_value_len(&self) -> usize {
1151        match self.test_val.as_ref() {
1152            TestValue::Value(str16) => str16.len(),
1153            TestValue::Any => 0,
1154        }
1155    }
1156}
1157
// Modifiers of an indirect test (see `Test::Indirect`), defined as a flag set.
// NOTE(review): `Relative` presumably makes the indirection relative to the
// current offset — its handling is not visible here, confirm.
1158flags! {
1159    enum IndirectMod: u8{
1160        Relative,
1161    }
1162}
1163
// Set of `IndirectMod` flags.
1164type IndirectMods = FlagSet<IndirectMod>;
1165
// Kind of length prefix of a pstring: it defines both the size of the
// prefix and the byte order used to decode it. The trailing letter is
// presumably the matching modifier of the magic rule format.
1166#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
1167enum PStringLen {
    // length stored on 1 byte
1168    Byte,    // B
    // length stored on 2 bytes, big endian
1169    ShortBe, // H
    // length stored on 2 bytes, little endian
1170    ShortLe, // h
    // length stored on 4 bytes, big endian
1171    LongBe,  // L
    // length stored on 4 bytes, little endian
1172    LongLe,  // l
1173}
1174
1175impl PStringLen {
1176    #[inline(always)]
1177    const fn size_of_len(&self) -> usize {
1178        match self {
1179            PStringLen::Byte => 1,
1180            PStringLen::ShortBe => 2,
1181            PStringLen::ShortLe => 2,
1182            PStringLen::LongBe => 4,
1183            PStringLen::LongLe => 4,
1184        }
1185    }
1186}
1187
// Test applied on a length prefixed string (pstring) read from the content.
1188#[derive(Debug, Clone, Serialize, Deserialize)]
1189struct PStringTest {
    // kind of length prefix stored before the string
1190    len: PStringLen,
    // expected bytes of the string, `TestValue::Any` accepts any string
1191    test_val: TestValue<Vec<u8>>,
    // when set, the length read from the content also counts the bytes
    // of the length prefix itself
1192    include_len: bool,
1193}
1194
1195impl PStringTest {
1196    #[inline]
1197    fn read<'cache, R: Read + Seek>(
1198        &self,
1199        haystack: &'cache mut LazyCache<R>,
1200    ) -> Result<Option<&'cache [u8]>, Error> {
1201        let mut len = match self.len {
1202            PStringLen::Byte => read_le!(haystack, u8) as u32,
1203            PStringLen::ShortBe => read_be!(haystack, u16) as u32,
1204            PStringLen::ShortLe => read_le!(haystack, u16) as u32,
1205            PStringLen::LongBe => read_be!(haystack, u32),
1206            PStringLen::LongLe => read_le!(haystack, u32),
1207        } as usize;
1208
1209        if self.include_len {
1210            len = len.saturating_sub(self.len.size_of_len())
1211        }
1212
1213        if let TestValue::Value(s) = self.test_val.as_ref()
1214            && len != s.len()
1215        {
1216            return Ok(None);
1217        }
1218
1219        let read = haystack.read_exact_count(len as u64)?;
1220
1221        Ok(Some(read))
1222    }
1223
1224    #[inline(always)]
1225    fn test_value_len(&self) -> usize {
1226        match self.test_val.as_ref() {
1227            TestValue::Value(s) => s.len(),
1228            TestValue::Any => 0,
1229        }
1230    }
1231}
1232
// Kinds of test supported. Scalar, Float, String, Search, PString, Regex
// and String16 read a value from the content and match against it. The
// other kinds do not read any value.
1233#[derive(Debug, Clone, Serialize, Deserialize)]
1234enum Test {
    // no value to read
1235    Name(String),
    // no value to read, considered as a recursive test
1236    Use(bool, String),
    // comparison of a scalar value
1237    Scalar(ScalarTest),
    // comparison of a floating point value
1238    Float(FloatTest),
    // comparison of a string
1239    String(StringTest),
    // search of a string in a buffer
1240    Search(SearchTest),
    // comparison of a length prefixed string
1241    PString(PStringTest),
    // regular expression matched against a buffer
1242    Regex(RegexTest),
    // no value to read, considered as a recursive test
1243    Indirect(FlagSet<IndirectMod>),
    // comparison of a UTF-16 string
1244    String16(String16Test),
1245    // FIXME: placeholder for strength computation
1246    #[allow(dead_code)]
1247    Der,
    // no value to read
1248    Clear,
    // no value to read
1249    Default,
1250}
1251
1252impl Test {
1253    // read the value to test from the haystack
1254    #[inline]
1255    fn read_test_value<'haystack, R: Read + Seek>(
1256        &self,
1257        haystack: &'haystack mut LazyCache<R>,
1258        switch_endianness: bool,
1259    ) -> Result<Option<ReadValue<'haystack>>, Error> {
1260        let test_value_offset = haystack.lazy_stream_position();
1261
1262        match self {
1263            Self::Scalar(t) => {
1264                t.ty.read(haystack, switch_endianness)
1265                    .map(|s| Some(ReadValue::Scalar(test_value_offset, s)))
1266            }
1267
1268            Self::Float(t) => {
1269                t.ty.read(haystack, switch_endianness)
1270                    .map(|f| Some(ReadValue::Float(test_value_offset, f)))
1271            }
1272            Self::String(t) => {
1273                match t.test_val.as_ref() {
1274                    TestValue::Value(str) => {
1275                        let buf = if let Some(length) = t.length {
1276                            // if there is a length specified
1277                            haystack.read_exact_count(length as u64)?
1278                        } else {
1279                            // no length specified we read until end of string
1280
1281                            match t.cmp_op {
1282                                CmpOp::Eq | CmpOp::Neq => {
1283                                    if !t.has_length_mod() {
1284                                        haystack.read_exact_count(str.len() as u64)?
1285                                    } else {
1286                                        haystack.read_count(FILE_BYTES_MAX as u64)?
1287                                    }
1288                                }
1289                                CmpOp::Lt | CmpOp::Gt => {
1290                                    let read =
1291                                        haystack.read_until_any_delim_or_limit(b"\n\0", 8092)?;
1292
1293                                    if read.ends_with(b"\0") || read.ends_with(b"\n") {
1294                                        &read[..read.len() - 1]
1295                                    } else {
1296                                        read
1297                                    }
1298                                }
1299                                _ => {
1300                                    return Err(Error::Msg(format!(
1301                                        "string test does not support {:?} operator",
1302                                        t.cmp_op
1303                                    )));
1304                                }
1305                            }
1306                        };
1307
1308                        Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1309                    }
1310                    TestValue::Any => {
1311                        let read = haystack.read_until_any_delim_or_limit(b"\0\n", 8192)?;
1312                        // we don't take last byte if it matches end of string
1313                        let bytes = if read.ends_with(b"\0") || read.ends_with(b"\n") {
1314                            &read[..read.len() - 1]
1315                        } else {
1316                            read
1317                        };
1318
1319                        Ok(Some(ReadValue::Bytes(test_value_offset, bytes)))
1320                    }
1321                }
1322            }
1323
1324            Self::String16(t) => {
1325                match t.test_val.as_ref() {
1326                    TestValue::Value(str16) => {
1327                        let read = haystack.read_exact_count((str16.len() * 2) as u64)?;
1328
1329                        Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1330                    }
1331                    TestValue::Any => {
1332                        let read = haystack.read_until_utf16_or_limit(b"\x00\x00", 8192)?;
1333
1334                        // we make sure we have an even number of elements
1335                        let end = if read.len() % 2 == 0 {
1336                            read.len()
1337                        } else {
1338                            // we decide to read anyway even though
1339                            // length isn't even
1340                            read.len().saturating_sub(1)
1341                        };
1342
1343                        Ok(Some(ReadValue::Bytes(test_value_offset, &read[..end])))
1344                    }
1345                }
1346            }
1347
1348            Self::PString(t) => {
1349                let Some(read) = t.read(haystack)? else {
1350                    return Ok(None);
1351                };
1352                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1353            }
1354
1355            Self::Search(_) => {
1356                let buf = haystack.read_count(FILE_BYTES_MAX as u64)?;
1357                Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1358            }
1359
1360            Self::Regex(r) => {
1361                let length = {
1362                    match r.length {
1363                        Some(len) => {
1364                            if r.mods.contains(ReMod::LineLimit) {
1365                                len * 80
1366                            } else {
1367                                len
1368                            }
1369                        }
1370
1371                        None => FILE_REGEX_MAX,
1372                    }
1373                };
1374
1375                let read = haystack.read_count(length as u64)?;
1376                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1377            }
1378
1379            Self::Name(_)
1380            | Self::Use(_, _)
1381            | Self::Indirect(_)
1382            | Self::Clear
1383            | Self::Default
1384            | Self::Der => Err(Error::msg("no value to read for this test")),
1385        }
1386    }
1387
1388    #[inline(always)]
1389    fn match_value<'s>(
1390        &'s self,
1391        tv: &ReadValue<'s>,
1392        stream_kind: StreamKind,
1393    ) -> Option<MatchRes<'s>> {
1394        match (self, tv) {
1395            (Self::Scalar(t), ReadValue::Scalar(o, ts)) => {
1396                let read_value: Scalar = match t.transform.as_ref() {
1397                    Some(t) => t.apply(*ts)?,
1398                    None => *ts,
1399                };
1400
1401                match t.test_val {
1402                    TestValue::Value(test_value) => {
1403                        let ok = match t.cmp_op {
1404                            // NOTE: this should not happen in practice because
1405                            // we convert it into Eq equivalent at parsing time
1406                            CmpOp::Not => read_value == !test_value,
1407                            CmpOp::Eq => read_value == test_value,
1408                            CmpOp::Lt => read_value < test_value,
1409                            CmpOp::Gt => read_value > test_value,
1410                            CmpOp::Neq => read_value != test_value,
1411                            CmpOp::BitAnd => read_value & test_value == test_value,
1412                            CmpOp::Xor => (read_value & test_value).is_zero(),
1413                        };
1414
1415                        if ok {
1416                            Some(MatchRes::Scalar(*o, read_value))
1417                        } else {
1418                            None
1419                        }
1420                    }
1421
1422                    TestValue::Any => Some(MatchRes::Scalar(*o, read_value)),
1423                }
1424            }
1425
1426            (Self::Float(t), ReadValue::Float(o, f)) => {
1427                let read_value: Float = t.transform.as_ref().map(|t| t.apply(*f)).unwrap_or(*f);
1428
1429                match t.test_val {
1430                    TestValue::Value(tf) => {
1431                        let ok = match t.cmp_op {
1432                            CmpOp::Eq => read_value == tf,
1433                            CmpOp::Lt => read_value < tf,
1434                            CmpOp::Gt => read_value > tf,
1435                            CmpOp::Neq => read_value != tf,
1436                            _ => {
1437                                // this should never be reached as we validate
1438                                // operator in parser
1439                                debug_panic!("unsupported float comparison");
1440                                debug!("unsupported float comparison");
1441                                false
1442                            }
1443                        };
1444
1445                        if ok {
1446                            Some(MatchRes::Float(*o, read_value))
1447                        } else {
1448                            None
1449                        }
1450                    }
1451                    TestValue::Any => Some(MatchRes::Float(*o, read_value)),
1452                }
1453            }
1454
1455            (Self::String(st), ReadValue::Bytes(o, buf)) => {
1456                macro_rules! trim_buf {
1457                    ($buf: expr) => {{
1458                        if st.mods.contains(StringMod::Trim) {
1459                            $buf.trim_ascii()
1460                        } else {
1461                            $buf
1462                        }
1463                    }};
1464                }
1465
1466                match st.test_val.as_ref() {
1467                    TestValue::Value(str) => {
1468                        match st.cmp_op {
1469                            CmpOp::Eq => {
1470                                if let (true, _) = string_match(str, st.mods, buf) {
1471                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1472                                } else {
1473                                    None
1474                                }
1475                            }
1476                            CmpOp::Neq => {
1477                                if let (false, _) = string_match(str, st.mods, buf) {
1478                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1479                                } else {
1480                                    None
1481                                }
1482                            }
1483                            CmpOp::Gt => {
1484                                if buf.len() > str.len() {
1485                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1486                                } else {
1487                                    None
1488                                }
1489                            }
1490                            CmpOp::Lt => {
1491                                if buf.len() < str.len() {
1492                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1493                                } else {
1494                                    None
1495                                }
1496                            }
1497
1498                            // unsupported for strings
1499                            _ => {
1500                                // this should never be reached as we validate
1501                                // operator in parser
1502                                debug_panic!("unsupported string comparison");
1503                                debug!("unsupported string comparison");
1504                                None
1505                            }
1506                        }
1507                    }
1508                    TestValue::Any => {
1509                        Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1510                    }
1511                }
1512            }
1513
1514            (Self::PString(m), ReadValue::Bytes(o, buf)) => match m.test_val.as_ref() {
1515                TestValue::Value(psv) => {
1516                    if buf == psv {
1517                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8))
1518                    } else {
1519                        None
1520                    }
1521                }
1522                TestValue::Any => Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8)),
1523            },
1524
1525            (Self::String16(t), ReadValue::Bytes(o, buf)) => {
1526                match t.test_val.as_ref() {
1527                    TestValue::Value(str16) => {
1528                        // strings cannot be equal
1529                        if str16.len() * 2 != buf.len() {
1530                            return None;
1531                        }
1532
1533                        // we check string equality
1534                        for (i, utf16_char) in slice_to_utf16_iter(buf, t.encoding).enumerate() {
1535                            if str16[i] != utf16_char {
1536                                return None;
1537                            }
1538                        }
1539
1540                        Some(MatchRes::Bytes(
1541                            *o,
1542                            None,
1543                            t.orig.as_bytes(),
1544                            Encoding::Utf16(t.encoding),
1545                        ))
1546                    }
1547
1548                    TestValue::Any => {
1549                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf16(t.encoding)))
1550                    }
1551                }
1552            }
1553
1554            (Self::Regex(r), ReadValue::Bytes(o, buf)) => r.match_buf(*o, stream_kind, buf),
1555
1556            (Self::Search(t), ReadValue::Bytes(o, buf)) => t.match_buf(*o, buf),
1557
1558            _ => None,
1559        }
1560    }
1561
1562    #[inline(always)]
1563    fn strength(&self) -> u64 {
1564        const MULT: usize = 10;
1565
1566        let mut out = 2 * MULT;
1567
1568        // FIXME: octal is missing but it is not used in practice ...
1569        match self {
1570            Test::Scalar(s) => {
1571                out += s.ty.type_size() * MULT;
1572            }
1573
1574            Test::Float(t) => {
1575                out += t.ty.type_size() * MULT;
1576            }
1577
1578            Test::String(t) => out += t.test_value_len().saturating_mul(MULT),
1579
1580            Test::PString(t) => out += t.test_value_len().saturating_mul(MULT),
1581
1582            Test::Search(s) => {
1583                // NOTE: this implementation deviates from what is in
1584                // C libmagic. The purpose of this implementation is to
1585                // minimize the difference between similar tests,
1586                // implemented differently (ex: string test VS very localized search test).
1587                let n_pos = s.n_pos.unwrap_or(FILE_BYTES_MAX);
1588
1589                match n_pos {
1590                    // a search on one line should be equivalent to a string match
1591                    0..=80 => out += s.str.len().saturating_mul(MULT),
1592                    // search on the first 3 lines gets a little penalty
1593                    81..=240 => out += s.str.len() * s.str.len().clamp(0, MULT - 2),
1594                    // a search on more than 3 lines isn't considered very accurate
1595                    _ => out += s.str.len(),
1596                }
1597            }
1598
1599            Test::Regex(r) => {
1600                // NOTE: this implementation deviates from what is in
1601                // C libmagic. The purpose of this implementation is to
1602                // minimize the difference between similar tests,
1603                // implemented differently (ex: string test VS very localized regex test).
1604
1605                // we divide length by the number of capture group
1606                // which gives us a value close to he average string
1607                // length match in the regex.
1608                let v = r.non_magic_len / r.re.captures_len();
1609
1610                let len = r
1611                    .length
1612                    .map(|l| {
1613                        if r.mods.contains(ReMod::LineLimit) {
1614                            l * 80
1615                        } else {
1616                            l
1617                        }
1618                    })
1619                    .unwrap_or(FILE_BYTES_MAX);
1620
1621                match len {
1622                    // a search on one line should be equivalent to a string match
1623                    0..=80 => out += v.saturating_mul(MULT),
1624                    // search on the first 3 lines gets a little penalty
1625                    81..=240 => out += v * v.clamp(0, MULT - 2),
1626                    // a search on more than 3 lines isn't considered very accurate
1627                    _ => out += v,
1628                }
1629            }
1630
1631            Test::String16(t) => {
1632                // NOTE: in libmagic the result is div by 2
1633                // but I GUESS it is because the len is expressed
1634                // in number bytes. In our case length is expressed
1635                // in number of u16 so we shouldn't divide.
1636                out += t.test_value_len().saturating_mul(MULT);
1637            }
1638
1639            Test::Der => out += MULT,
1640
1641            Test::Default | Test::Name(_) | Test::Use(_, _) | Test::Indirect(_) | Test::Clear => {
1642                return 0;
1643            }
1644        }
1645
1646        // matching any output gets penalty
1647        if self.is_match_any() {
1648            return 0;
1649        }
1650
1651        if let Some(op) = self.cmp_op() {
1652            match op {
1653                // matching almost any gets penalty
1654                CmpOp::Neq => out = 0,
1655                CmpOp::Eq | CmpOp::Not => out += MULT,
1656                CmpOp::Lt | CmpOp::Gt => out -= 2 * MULT,
1657                CmpOp::Xor | CmpOp::BitAnd => out -= MULT,
1658            }
1659        }
1660
1661        out as u64
1662    }
1663
1664    #[inline(always)]
1665    fn cmp_op(&self) -> Option<CmpOp> {
1666        match self {
1667            Self::String(t) => Some(t.cmp_op),
1668            Self::Scalar(s) => Some(s.cmp_op),
1669            Self::Float(t) => Some(t.cmp_op),
1670            Self::Name(_)
1671            | Self::Use(_, _)
1672            | Self::Search(_)
1673            | Self::PString(_)
1674            | Self::Regex(_)
1675            | Self::Clear
1676            | Self::Default
1677            | Self::Indirect(_)
1678            | Self::String16(_)
1679            | Self::Der => None,
1680        }
1681    }
1682
1683    #[inline(always)]
1684    fn is_recursive(&self) -> bool {
1685        matches!(self, Test::Use(_, _) | Test::Indirect(_))
1686    }
1687
1688    #[inline(always)]
1689    fn is_match_any(&self) -> bool {
1690        match self {
1691            Test::Name(_) => false,
1692            Test::Use(_, _) => false,
1693            Test::Scalar(scalar_test) => matches!(scalar_test.test_val, TestValue::Any),
1694            Test::Float(float_test) => matches!(float_test.test_val, TestValue::Any),
1695            Test::String(string_test) => matches!(string_test.test_val, TestValue::Any),
1696            Test::Search(_) => false,
1697            Test::PString(pstring_test) => matches!(pstring_test.test_val, TestValue::Any),
1698            Test::Regex(_) => false,
1699            Test::Indirect(_) => false,
1700            Test::String16(string16_test) => matches!(string16_test.test_val, TestValue::Any),
1701            Test::Der => false,
1702            Test::Clear => false,
1703            Test::Default => false,
1704        }
1705    }
1706
1707    #[inline(always)]
1708    fn is_binary(&self) -> bool {
1709        match self {
1710            Self::Name(_) => true,
1711            Self::Use(_, _) => true,
1712            Self::Scalar(_) => true,
1713            Self::Float(_) => true,
1714            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1715            Self::Search(t) => t.is_binary(),
1716            Self::PString(_) => true,
1717            Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1718            Self::Clear => true,
1719            Self::Default => true,
1720            Self::Indirect(_) => true,
1721            Self::String16(_) => true,
1722            Self::Der => true,
1723        }
1724    }
1725
1726    #[inline(always)]
1727    fn is_text(&self) -> bool {
1728        match self {
1729            Self::Name(_) => true,
1730            Self::Use(_, _) => true,
1731            Self::Indirect(_) => true,
1732            Self::Clear => true,
1733            Self::Default => true,
1734            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1735            Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1736            _ => !self.is_binary(),
1737        }
1738    }
1739
1740    #[inline(always)]
1741    fn is_only_text(&self) -> bool {
1742        self.is_text() && !self.is_binary()
1743    }
1744
1745    #[inline(always)]
1746    fn is_only_binary(&self) -> bool {
1747        self.is_binary() && !self.is_text()
1748    }
1749}
1750
// Type of the value read from the content to obtain an indirect offset
// (see `IndOffset::read_offset`).
1751#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
1752enum OffsetType {
    // 8 bits integer
1753    Byte,
    // 64 bits float, little endian (cast into an integer)
1754    DoubleLe,
    // 64 bits float, big endian (cast into an integer)
1755    DoubleBe,
    // 16 bits integer, little endian
1756    ShortLe,
    // 16 bits integer, big endian
1757    ShortBe,
    // 32 bits little endian value, decoded with `decode_id3`
1758    Id3Le,
    // 32 bits big endian value, decoded with `decode_id3`
1759    Id3Be,
    // 32 bits integer, little endian
1760    LongLe,
    // 32 bits integer, big endian
1761    LongBe,
    // value read with `read_me!` (presumably middle endian — confirm)
1762    Middle,
    // octal number, read with `read_octal_u64`
1763    Octal,
    // presumably 64 bits integer, big endian (reading code not visible here)
1764    QuadBe,
    // 64 bits integer, little endian
1765    QuadLe,
1766}
1767
// Shift of an indirect offset (see `IndOffset::shift`).
// NOTE(review): the code using it is not visible from here. Presumably
// `Direct` carries the shift value itself while `Indirect` carries the
// location the shift has to be read from — confirm.
1768#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
1769enum Shift {
1770    Direct(u64),
1771    Indirect(i64),
1772}
1773
// Indirect offset: an offset which is not known in advance but has to
// be read from the content.
1774#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
1775struct IndOffset {
1776    // where to find the offset
1777    off_addr: DirOffset,
1778    // signed or unsigned
1779    signed: bool,
1780    // type of the offset
1781    ty: OffsetType,
    // NOTE(review): presumably the operation combining the value read
    // with `shift` — not visible from here, confirm
1782    op: Option<Op>,
    // optional shift (see `Shift`)
1783    shift: Option<Shift>,
1784}
1785
1786impl IndOffset {
1787    // if we overflow we must not return an offset
1788    fn read_offset<R: Read + Seek>(
1789        &self,
1790        haystack: &mut LazyCache<R>,
1791        rule_base_offset: Option<u64>,
1792        last_upper_match_offset: Option<u64>,
1793    ) -> Result<Option<u64>, io::Error> {
1794        let offset_address = match self.off_addr {
1795            DirOffset::Start(s) => {
1796                let Some(o) = s.checked_add(rule_base_offset.unwrap_or_default()) else {
1797                    return Ok(None);
1798                };
1799
1800                haystack.seek(SeekFrom::Start(o))?
1801            }
1802            DirOffset::LastUpper(c) => haystack.seek(SeekFrom::Start(
1803                (last_upper_match_offset.unwrap_or_default() as i64 + c) as u64,
1804            ))?,
1805            DirOffset::End(e) => haystack.seek(SeekFrom::End(e))?,
1806        };
1807
1808        macro_rules! read_value {
1809            () => {
1810                match self.ty {
1811                    OffsetType::Byte => {
1812                        if self.signed {
1813                            read_le!(haystack, u8) as u64
1814                        } else {
1815                            read_le!(haystack, i8) as u64
1816                        }
1817                    }
1818                    OffsetType::DoubleLe => read_le!(haystack, f64) as u64,
1819                    OffsetType::DoubleBe => read_be!(haystack, f64) as u64,
1820                    OffsetType::ShortLe => {
1821                        if self.signed {
1822                            read_le!(haystack, i16) as u64
1823                        } else {
1824                            read_le!(haystack, u16) as u64
1825                        }
1826                    }
1827                    OffsetType::ShortBe => {
1828                        if self.signed {
1829                            read_be!(haystack, i16) as u64
1830                        } else {
1831                            read_be!(haystack, u16) as u64
1832                        }
1833                    }
1834                    OffsetType::Id3Le => decode_id3(read_le!(haystack, u32)) as u64,
1835                    OffsetType::Id3Be => decode_id3(read_be!(haystack, u32)) as u64,
1836                    OffsetType::LongLe => {
1837                        if self.signed {
1838                            read_le!(haystack, i32) as u64
1839                        } else {
1840                            read_le!(haystack, u32) as u64
1841                        }
1842                    }
1843                    OffsetType::LongBe => {
1844                        if self.signed {
1845                            read_be!(haystack, i32) as u64
1846                        } else {
1847                            read_be!(haystack, u32) as u64
1848                        }
1849                    }
1850                    OffsetType::Middle => read_me!(haystack) as u64,
1851                    OffsetType::Octal => {
1852                        if let Some(o) = read_octal_u64(haystack) {
1853                            o
1854                        } else {
1855                            debug!("failed to read octal offset @ {offset_address}");
1856                            return Ok(None);
1857                        }
1858                    }
1859                    OffsetType::QuadLe => {
1860                        if self.signed {
1861                            read_le!(haystack, i64) as u64
1862                        } else {
1863                            read_le!(haystack, u64)
1864                        }
1865                    }
1866                    OffsetType::QuadBe => {
1867                        if self.signed {
1868                            read_be!(haystack, i64) as u64
1869                        } else {
1870                            read_be!(haystack, u64)
1871                        }
1872                    }
1873                }
1874            };
1875        }
1876
1877        // in theory every offset read should end up in something seekable from start, so we can use u64 to store the result
1878        let o = read_value!();
1879
1880        trace!(
1881            "offset read @ {offset_address} value={o} op={:?} shift={:?}",
1882            self.op, self.shift
1883        );
1884
1885        // apply transformation
1886        if let (Some(op), Some(shift)) = (self.op, self.shift) {
1887            let shift = match shift {
1888                Shift::Direct(i) => i,
1889                Shift::Indirect(i) => {
1890                    let tmp = offset_address as i128 + i as i128;
1891                    if tmp.is_negative() {
1892                        return Ok(None);
1893                    } else {
1894                        haystack.seek(SeekFrom::Start(tmp as u64))?;
1895                    };
1896                    // NOTE: here we assume that the shift has the same
1897                    // type as the main offset !
1898                    read_value!()
1899                }
1900            };
1901
1902            match op {
1903                Op::Add => return Ok(o.checked_add(shift)),
1904                Op::Mul => return Ok(o.checked_mul(shift)),
1905                Op::Sub => return Ok(o.checked_sub(shift)),
1906                Op::Div => return Ok(o.checked_div(shift)),
1907                Op::Mod => return Ok(o.checked_rem(shift)),
1908                Op::And => return Ok(Some(o & shift)),
1909                Op::Or => return Ok(Some(o | shift)),
1910                Op::Xor => return Ok(Some(o ^ shift)),
1911            }
1912        }
1913
1914        Ok(Some(o))
1915    }
1916}
1917
/// An offset expressed directly, as opposed to one read from the stream.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum DirOffset {
    // relative to the start (the rule base offset is added by the users of this type)
    Start(u64),
    // relative to the last up-level field
    LastUpper(i64),
    // relative to the end of the stream, passed as-is to `SeekFrom::End`
    End(i64),
}
1925
/// The offset at which a test applies.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Offset {
    // the offset is given directly
    Direct(DirOffset),
    // the offset has to be read from the stream
    Indirect(IndOffset),
}
1931
1932impl From<DirOffset> for Offset {
1933    fn from(value: DirOffset) -> Self {
1934        Self::Direct(value)
1935    }
1936}
1937
1938impl From<IndOffset> for Offset {
1939    fn from(value: IndOffset) -> Self {
1940        Self::Indirect(value)
1941    }
1942}
1943
1944impl Display for DirOffset {
1945    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1946        match self {
1947            DirOffset::Start(i) => write!(f, "{i}"),
1948            DirOffset::LastUpper(c) => write!(f, "&{c}"),
1949            DirOffset::End(e) => write!(f, "-{e}"),
1950        }
1951    }
1952}
1953
1954impl Default for DirOffset {
1955    fn default() -> Self {
1956        Self::LastUpper(0)
1957    }
1958}
1959
/// A single test of a magic rule, with the offset it applies at and the
/// message attached to it.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Match {
    // line number reported in logs and localized errors
    line: usize,
    // nesting depth of the entry, also used as its continuation level
    depth: u8,
    // where the test applies
    offset: Offset,
    // the test to run
    test: Test,
    // cached result of `test.strength()`
    test_strength: u64,
    // message attached to the entry, if any
    message: Option<Message>,
}
1969
1970impl From<Use> for Match {
1971    fn from(value: Use) -> Self {
1972        let test = Test::Use(value.switch_endianness, value.rule_name);
1973        let test_strength = test.strength();
1974        Self {
1975            line: value.line,
1976            depth: value.depth,
1977            offset: value.start_offset,
1978            test,
1979            test_strength,
1980            message: value.message,
1981        }
1982    }
1983}
1984
1985impl From<Name> for Match {
1986    fn from(value: Name) -> Self {
1987        let test = Test::Name(value.name);
1988        let test_strength = test.strength();
1989        Self {
1990            line: value.line,
1991            depth: 0,
1992            offset: Offset::Direct(DirOffset::Start(0)),
1993            test,
1994            test_strength,
1995            message: value.message,
1996        }
1997    }
1998}
1999
2000impl Match {
2001    /// Turns the `Match`'s offset into an absolute offset from the start of the stream
2002    #[inline(always)]
2003    fn offset_from_start<R: Read + Seek>(
2004        &self,
2005        haystack: &mut LazyCache<R>,
2006        rule_base_offset: Option<u64>,
2007        last_level_offset: Option<u64>,
2008    ) -> Result<Option<u64>, io::Error> {
2009        match self.offset {
2010            Offset::Direct(dir_offset) => match dir_offset {
2011                DirOffset::Start(s) => Ok(Some(s)),
2012                DirOffset::LastUpper(shift) => {
2013                    let o = last_level_offset.unwrap_or_default() as i64 + shift;
2014
2015                    if o >= 0 { Ok(Some(o as u64)) } else { Ok(None) }
2016                }
2017                DirOffset::End(e) => Ok(Some(haystack.offset_from_start(SeekFrom::End(e)))),
2018            },
2019            Offset::Indirect(ind_offset) => {
2020                let Some(o) =
2021                    ind_offset.read_offset(haystack, rule_base_offset, last_level_offset)?
2022                else {
2023                    return Ok(None);
2024                };
2025
2026                Ok(Some(o))
2027            }
2028        }
2029    }
2030
2031    /// this method emulates the buffer based matching
2032    /// logic implemented in libmagic. It needs some aweful
2033    /// and weird offset convertions to turn buffer
2034    /// relative offsets (libmagic is based on) into
2035    /// absolute offset in the file.
2036    ///
2037    /// this method shoud bubble up only critical errors
2038    /// all the other errors should make the match result
2039    /// false and be logged via debug!
2040    ///
2041    /// the function returns an error if the maximum recursion
2042    /// has been reached or if a dependency rule is missing.
2043    #[inline]
2044    #[allow(clippy::too_many_arguments)]
2045    fn matches<'a: 'h, 'h, R: Read + Seek>(
2046        &'a self,
2047        source: Option<&str>,
2048        magic: &mut Magic<'a>,
2049        stream_kind: StreamKind,
2050        state: &mut MatchState,
2051        buf_base_offset: Option<u64>,
2052        rule_base_offset: Option<u64>,
2053        last_level_offset: Option<u64>,
2054        haystack: &'h mut LazyCache<R>,
2055        switch_endianness: bool,
2056        db: &'a MagicDb,
2057        depth: usize,
2058    ) -> Result<(bool, Option<MatchRes<'h>>), Error> {
2059        let source = source.unwrap_or("unknown");
2060        let line = self.line;
2061
2062        if depth >= MAX_RECURSION {
2063            return Err(Error::localized(
2064                source,
2065                line,
2066                Error::MaximumRecursion(MAX_RECURSION),
2067            ));
2068        }
2069
2070        if self.test.is_only_binary() && stream_kind.is_text() {
2071            trace!("skip binary test source={source} line={line} stream_kind={stream_kind:?}",);
2072            return Ok((false, None));
2073        }
2074
2075        if self.test.is_only_text() && !stream_kind.is_text() {
2076            trace!("skip text test source={source} line={line} stream_kind={stream_kind:?}",);
2077            return Ok((false, None));
2078        }
2079
2080        let Ok(Some(mut offset)) = self
2081            .offset_from_start(haystack, rule_base_offset, last_level_offset)
2082            .inspect_err(|e| debug!("source={source} line={line} failed at computing offset: {e}"))
2083        else {
2084            return Ok((false, None));
2085        };
2086
2087        offset = match self.offset {
2088            Offset::Indirect(_) => {
2089                // the result we get for an indirect offset
2090                // is relative to the start of the libmagic
2091                // buffer so we need to add base to make it
2092                // absolute.
2093                buf_base_offset.unwrap_or_default().saturating_add(offset)
2094            }
2095            // offset from start are computed from rule base
2096            Offset::Direct(DirOffset::Start(_)) => {
2097                rule_base_offset.unwrap_or_default().saturating_add(offset)
2098            }
2099            _ => offset,
2100        };
2101
2102        match &self.test {
2103            Test::Clear => {
2104                trace!("source={source} line={line} clear");
2105                state.clear_continuation_level(&self.continuation_level());
2106                Ok((true, None))
2107            }
2108
2109            Test::Name(name) => {
2110                trace!(
2111                    "source={source} line={line} running rule {name} switch_endianness={switch_endianness}",
2112                );
2113                Ok((true, None))
2114            }
2115
2116            Test::Use(flip_endianness, rule_name) => {
2117                trace!(
2118                    "source={source} line={line} use {rule_name} switch_endianness={flip_endianness}",
2119                );
2120
2121                // switch_endianness must propagate down the rule call stack
2122                let switch_endianness = switch_endianness ^ flip_endianness;
2123
2124                let dr: &DependencyRule = db.dependencies.get(rule_name).ok_or(
2125                    Error::localized(source, line, Error::MissingRule(rule_name.clone())),
2126                )?;
2127
2128                // we push the message here otherwise we push message in depth first
2129                if let Some(msg) = self.message.as_ref() {
2130                    magic.push_message(msg.to_string_lossy());
2131                }
2132
2133                let nmatch = dr.rule.magic(
2134                    magic,
2135                    stream_kind,
2136                    buf_base_offset,
2137                    Some(offset),
2138                    haystack,
2139                    db,
2140                    switch_endianness,
2141                    depth.saturating_add(1),
2142                )?;
2143
2144                // The name is always true, so we consider there to be a match
2145                // if more than one test succeeded
2146                let matched = nmatch > 1;
2147                if matched {
2148                    state.set_continuation_level(self.continuation_level());
2149                }
2150
2151                Ok((matched, None))
2152            }
2153
2154            Test::Indirect(m) => {
2155                trace!(
2156                    "source={source} line={line} indirect mods={:?} offset={offset:#x}",
2157                    m
2158                );
2159
2160                let new_buf_base_off = if m.contains(IndirectMod::Relative) {
2161                    Some(offset)
2162                } else {
2163                    None
2164                };
2165
2166                // we push the message here otherwise we push message in depth first
2167                if let Some(msg) = self.message.as_ref() {
2168                    magic.push_message(msg.to_string_lossy());
2169                }
2170
2171                let mut nmatch = 0u64;
2172                for r in db.rules.iter() {
2173                    let messages_cnt = magic.message.len();
2174                    nmatch = nmatch.saturating_add(r.magic(
2175                        magic,
2176                        stream_kind,
2177                        new_buf_base_off,
2178                        Some(offset),
2179                        haystack,
2180                        db,
2181                        false,
2182                        depth.saturating_add(1),
2183                    )?);
2184
2185                    // this means we matched a rule
2186                    if magic.message.len() != messages_cnt {
2187                        break;
2188                    }
2189                }
2190
2191                // we return false not to push message again
2192                Ok((nmatch > 0, None))
2193            }
2194
2195            Test::Default => {
2196                // default matches if nothing else at the continuation level matched
2197                let ok = !state.get_continuation_level(&self.continuation_level());
2198
2199                trace!("source={source} line={line} default match={ok}");
2200                if ok {
2201                    state.set_continuation_level(self.continuation_level());
2202                }
2203
2204                Ok((ok, None))
2205            }
2206
2207            _ => {
2208                if let Err(e) = haystack.seek(SeekFrom::Start(offset)) {
2209                    debug!("source={source} line={line} failed to seek in haystack: {e}");
2210                    return Ok((false, None));
2211                }
2212
2213                let mut trace_msg = None;
2214
2215                if enabled!(Level::DEBUG) {
2216                    trace_msg = Some(vec![format!(
2217                        "source={source} line={line} depth={} stream_offset={:#x}",
2218                        self.depth,
2219                        haystack.lazy_stream_position()
2220                    )])
2221                }
2222
2223                // NOTE: we may have a way to optimize here. In case we do a Any
2224                // test and we don't use the value to format the message, we don't
2225                // need to read the value.
2226                if let Ok(opt_test_value) = self
2227                    .test
2228                    .read_test_value(haystack, switch_endianness)
2229                    .inspect_err(|e| {
2230                        debug!("source={source} line={line} error while reading test value @{offset}: {e}",)
2231                    })
2232                {
2233                    if let Some(v) = trace_msg
2234                        .as_mut() { v.push(format!("test={:?}", self.test)) }
2235
2236                    let match_res =
2237                        opt_test_value.and_then(|tv| self.test.match_value(&tv, stream_kind));
2238
2239                    if let Some(v) = trace_msg.as_mut() { v.push(format!(
2240                            "message=\"{}\" match={}",
2241                            self.message
2242                                .as_ref()
2243                                .map(|fs| fs.to_string_lossy())
2244                                .unwrap_or_default(),
2245                            match_res.is_some()
2246                        )) }
2247
2248                    // trace message
2249                    if enabled!(Level::DEBUG) && !enabled!(Level::TRACE) && match_res.is_some() {
2250                        if let Some(m) = trace_msg{
2251                            debug!("{}", m.join(" "));
2252                        }
2253                    } else if enabled!(Level::TRACE)
2254                        && let Some(m) = trace_msg{
2255                            trace!("{}", m.join(" "));
2256                        }
2257
2258                    if let Some(mr) = match_res {
2259                        state.set_continuation_level(self.continuation_level());
2260                        return Ok((true, Some(mr)));
2261                    }
2262                }
2263
2264                Ok((false, None))
2265            }
2266        }
2267    }
2268
2269    #[inline(always)]
2270    fn continuation_level(&self) -> ContinuationLevel {
2271        ContinuationLevel(self.depth)
2272    }
2273}
2274
/// A parsed entry referring to another rule by name; it is turned into a
/// `Match` running a `Test::Use`.
#[derive(Debug, Clone)]
struct Use {
    // line number of the entry
    line: usize,
    // nesting depth of the entry
    depth: u8,
    // offset of the resulting `Match`
    start_offset: Offset,
    // name of the rule to run
    rule_name: String,
    // whether endianness must be switched when running the rule
    switch_endianness: bool,
    // message attached to the entry, if any
    message: Option<Message>,
}
2284
/// A modifier applied to the strength of an entry.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StrengthMod {
    // operator applied to the strength
    op: Op,
    // right-hand operand of the operator
    by: u8,
}
2290
2291impl StrengthMod {
2292    #[inline(always)]
2293    fn apply(&self, strength: u64) -> u64 {
2294        let by = self.by as u64;
2295        debug!("applying strength modifier: {strength} {} {}", self.op, by);
2296        match self.op {
2297            Op::Mul => strength.saturating_mul(by),
2298            Op::Add => strength.saturating_add(by),
2299            Op::Sub => strength.saturating_sub(by),
2300            Op::Div => {
2301                if by > 0 {
2302                    strength.saturating_div(by)
2303                } else {
2304                    strength
2305                }
2306            }
2307            Op::Mod => strength % by,
2308            Op::And => strength & by,
2309            // this should never happen as strength operators
2310            // are enforced by our parser
2311            Op::Xor | Op::Or => {
2312                debug_panic!("unsupported strength operator");
2313                strength
2314            }
2315        }
2316    }
2317}
2318
/// A piece of metadata attached to a rule entry.
// NOTE(review): presumably maps to the `!:mime`, `!:ext`, `!:strength` and
// `!:apple` directives of the magic format; confirm against the parser.
#[derive(Debug, Clone)]
enum Flag {
    // a MIME type
    Mime(String),
    // a set of file extensions
    Ext(HashSet<String>),
    // a strength modifier
    Strength(StrengthMod),
    // an Apple related string (presumably the creator/type code, TODO confirm)
    Apple(String),
}
2326
/// A parsed entry declaring a named rule; it is turned into a `Match`
/// running a `Test::Name` at depth 0.
#[derive(Debug, Clone)]
struct Name {
    // line number of the entry
    line: usize,
    // name of the rule
    name: String,
    // message attached to the entry, if any
    message: Option<Message>,
}
2333
/// A parsed entry, either a test or a flag, paired with a `Span`
/// (presumably the location of the entry in the parsed input, TODO confirm).
#[derive(Debug, Clone)]
enum Entry<'span> {
    // a test entry
    Match(Span<'span>, Match),
    // a flag entry
    Flag(Span<'span>, Flag),
}
2339
/// A node of the tree of entries making a rule: a test together with the
/// metadata reported when it matches and the entries nested under it.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EntryNode {
    // whether the node is the root of a rule
    root: bool,
    // the test of this node
    entry: Match,
    // entries evaluated only when `entry` matches
    children: Vec<EntryNode>,
    // MIME type set on the result when `entry` matches
    mimetype: Option<String>,
    // creator code set on the result when `entry` matches
    apple: Option<String>,
    // modifier applied to the strength of `entry`
    strength_mod: Option<StrengthMod>,
    // extensions added to the result when `entry` matches
    exts: HashSet<String>,
}
2350
/// Accumulator filled while walking the entries of a rule.
#[derive(Debug, Default)]
struct EntryNodeVisitor {
    // every extension met so far
    exts: HashSet<String>,
    // score accumulated so far
    score: u64,
}

impl EntryNodeVisitor {
    /// Creates an empty visitor: no extension and a score of 0.
    fn new() -> Self {
        Self::default()
    }

    /// Folds `other` into `self`: extensions are unioned, scores are summed.
    fn merge(&mut self, other: Self) {
        let Self { exts, score } = other;
        self.exts.extend(exts);
        self.score += score;
    }
}
2369
2370impl EntryNode {
2371    #[inline]
2372    fn update_visitor(&self, v: &mut EntryNodeVisitor, depth: usize) {
2373        // update extensions
2374        for ext in self.exts.iter() {
2375            if !v.exts.contains(ext) {
2376                v.exts.insert(ext.clone());
2377            }
2378        }
2379
2380        // update score if depth
2381        if depth == 0 {
2382            v.score += self.entry.test_strength;
2383        }
2384
2385        // Tests at deeper levels contribute less to the overall score.
2386        // We use the minimum value to establish a lower bound for the rule's score,
2387        // which helps prioritize rules based on their importance.
2388        v.score += self
2389            .children
2390            .iter()
2391            .map(|e| e.entry.test_strength)
2392            .min()
2393            .unwrap_or_default()
2394            / max(1, depth as u64);
2395    }
2396
2397    fn visit(
2398        &self,
2399        v: &mut EntryNodeVisitor,
2400        deps: &HashMap<String, DependencyRule>,
2401        marked: &mut HashSet<String>,
2402        depth: usize,
2403    ) -> Result<(), Error> {
2404        // updating visitor
2405        self.update_visitor(v, depth);
2406
2407        // recursively visiting
2408        for c in self.children.iter() {
2409            if let Test::Use(_, ref name) = c.entry.test {
2410                if marked.contains(name) {
2411                    continue;
2412                }
2413
2414                marked.insert(name.clone());
2415
2416                if let Some(r) = deps.get(name) {
2417                    let dv = r.rule.visit_all_entries(deps, marked)?;
2418                    v.merge(dv);
2419                } else {
2420                    return Err(Error::MissingRule(name.clone()));
2421                }
2422            } else {
2423                c.visit(v, deps, marked, depth + 1)?;
2424            }
2425        }
2426
2427        Ok(())
2428    }
2429
2430    #[inline]
2431    #[allow(clippy::too_many_arguments)]
2432    fn matches<'r, R: Read + Seek>(
2433        &'r self,
2434        opt_source: Option<&str>,
2435        magic: &mut Magic<'r>,
2436        state: &mut MatchState,
2437        stream_kind: StreamKind,
2438        buf_base_offset: Option<u64>,
2439        rule_base_offset: Option<u64>,
2440        last_level_offset: Option<u64>,
2441        haystack: &mut LazyCache<R>,
2442        db: &'r MagicDb,
2443        switch_endianness: bool,
2444        depth: usize,
2445    ) -> Result<u64, Error> {
2446        let mut nmatch = 0u64;
2447
2448        let (ok, opt_match_res) = self.entry.matches(
2449            opt_source,
2450            magic,
2451            stream_kind,
2452            state,
2453            buf_base_offset,
2454            rule_base_offset,
2455            last_level_offset,
2456            haystack,
2457            switch_endianness,
2458            db,
2459            depth,
2460        )?;
2461
2462        let source = opt_source.unwrap_or("unknown");
2463        let line = self.entry.line;
2464
2465        if ok {
2466            nmatch = nmatch.saturating_add(1);
2467
2468            // Update the magic with the message if the match is successful
2469            // Skip updating if the test is recursive, as it's already handled
2470            // in the Match::matches function
2471            if !self.entry.test.is_recursive()
2472                && let Some(msg) = self.entry.message.as_ref()
2473                && let Ok(msg) = msg.format_with(opt_match_res.as_ref()).inspect_err(|e| {
2474                    debug!("source={source} line={line} failed to format message: {e}")
2475                })
2476            {
2477                magic.push_message(msg);
2478            }
2479
2480            // we need to adjust stream offset in case of regex/search tests
2481            if let Some(mr) = opt_match_res {
2482                match &self.entry.test {
2483                    Test::String(t) => {
2484                        if t.has_length_mod() {
2485                            let o = mr.end_offset();
2486                            haystack.seek(SeekFrom::Start(o))?;
2487                        }
2488                    }
2489                    Test::Search(t) => {
2490                        if t.re_mods.contains(ReMod::StartOffsetUpdate) {
2491                            let o = mr.start_offset();
2492                            haystack.seek(SeekFrom::Start(o))?;
2493                        } else {
2494                            let o = mr.end_offset();
2495                            haystack.seek(SeekFrom::Start(o))?;
2496                        }
2497                    }
2498
2499                    Test::Regex(t) => {
2500                        if t.mods.contains(ReMod::StartOffsetUpdate) {
2501                            let o = mr.start_offset();
2502                            haystack.seek(SeekFrom::Start(o))?;
2503                        } else {
2504                            let o = mr.end_offset();
2505                            haystack.seek(SeekFrom::Start(o))?;
2506                        }
2507                    }
2508                    // other types do not need offset adjustement
2509                    _ => {}
2510                }
2511            }
2512
2513            if let Some(mimetype) = self.mimetype.as_ref() {
2514                magic.set_mime_type(Cow::Borrowed(mimetype));
2515            }
2516
2517            if let Some(apple_ty) = self.apple.as_ref() {
2518                magic.set_creator_code(Cow::Borrowed(apple_ty));
2519            }
2520
2521            if !self.exts.is_empty() {
2522                magic.insert_extensions(self.exts.iter().map(|s| s.as_str()));
2523            }
2524
2525            // NOTE: here we try to implement a similar logic as in file_magic_strength.
2526            // Sticking to the exact same strength computation logic is complicated due
2527            // to implementation differences. Let's wait and see if that is a real issue.
2528            let mut strength = self.entry.test_strength;
2529
2530            let continuation_level = self.entry.continuation_level().0 as u64;
2531            if self.entry.message.is_none() && continuation_level < 3 {
2532                strength = strength.saturating_add(continuation_level);
2533            }
2534
2535            if let Some(sm) = self.strength_mod.as_ref() {
2536                strength = sm.apply(strength);
2537            }
2538
2539            // entries with no message get a bonus
2540            if self.entry.message.is_none() {
2541                strength += 1
2542            }
2543
2544            magic.update_strength(strength);
2545
2546            let end_upper_level = haystack.lazy_stream_position();
2547
2548            // we have to fix rule_base_offset if
2549            // the rule_base_starts from end otherwise it
2550            // breaks some offset computation in match
2551            // see test_offset_bug_1 and test_offset_bug_2
2552            // they implement the same test logic yet indirect
2553            // offsets have to be different so that it works
2554            // in libmagic/file
2555            let rule_base_offset = if self.root {
2556                match self.entry.offset {
2557                    Offset::Direct(DirOffset::End(o)) => {
2558                        Some(haystack.offset_from_start(SeekFrom::End(o)))
2559                    }
2560                    _ => rule_base_offset,
2561                }
2562            } else {
2563                rule_base_offset
2564            };
2565
2566            for e in self.children.iter() {
2567                nmatch = nmatch.saturating_add(e.matches(
2568                    opt_source,
2569                    magic,
2570                    state,
2571                    stream_kind,
2572                    buf_base_offset,
2573                    rule_base_offset,
2574                    Some(end_upper_level),
2575                    haystack,
2576                    db,
2577                    switch_endianness,
2578                    depth,
2579                )?);
2580            }
2581        }
2582
2583        Ok(nmatch)
2584    }
2585}
2586
/// Represents a parsed magic rule
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicRule {
    // identifier of the rule, assigned with `set_id`
    id: usize,
    // name of the source the rule comes from, if known
    source: Option<String>,
    // root of the tree of entries making the rule
    entries: EntryNode,
    // every extension found while finalizing the rule
    extensions: HashSet<String>,
    /// score used for rule ranking
    score: u64,
    // set once extensions and score have been computed
    finalized: bool,
}
2598
2599impl MagicRule {
2600    #[inline(always)]
2601    fn set_id(&mut self, id: usize) {
2602        self.id = id
2603    }
2604
2605    fn visit_all_entries(
2606        &self,
2607        deps: &HashMap<String, DependencyRule>,
2608        marked: &mut HashSet<String>,
2609    ) -> Result<EntryNodeVisitor, Error> {
2610        let mut v = EntryNodeVisitor::new();
2611        self.entries.visit(&mut v, deps, marked, 0)?;
2612        Ok(v)
2613    }
2614
2615    /// Finalize a rule by searching for all extensions and computing its score
2616    /// for ranking. In the `MagicRule` is already finalized it returns immediately.
2617    fn try_finalize(&mut self, deps: &HashMap<String, DependencyRule>) {
2618        if self.finalized {
2619            return;
2620        }
2621
2622        // rule can be finalized all deps are found
2623        if let Ok(v) = self.visit_all_entries(deps, &mut HashSet::new()) {
2624            self.extensions.extend(v.exts);
2625            self.score = v.score;
2626            self.finalized = true
2627        }
2628    }
2629
2630    #[inline]
2631    fn magic_entrypoint<'r, R: Read + Seek>(
2632        &'r self,
2633        magic: &mut Magic<'r>,
2634        stream_kind: StreamKind,
2635        haystack: &mut LazyCache<R>,
2636        db: &'r MagicDb,
2637        switch_endianness: bool,
2638        depth: usize,
2639    ) -> Result<u64, Error> {
2640        self.entries.matches(
2641            self.source.as_deref(),
2642            magic,
2643            &mut MatchState::empty(),
2644            stream_kind,
2645            None,
2646            None,
2647            None,
2648            haystack,
2649            db,
2650            switch_endianness,
2651            depth,
2652        )
2653    }
2654
2655    #[inline]
2656    #[allow(clippy::too_many_arguments)]
2657    fn magic<'r, R: Read + Seek>(
2658        &'r self,
2659        magic: &mut Magic<'r>,
2660        stream_kind: StreamKind,
2661        buf_base_offset: Option<u64>,
2662        rule_base_offset: Option<u64>,
2663        haystack: &mut LazyCache<R>,
2664        db: &'r MagicDb,
2665        switch_endianness: bool,
2666        depth: usize,
2667    ) -> Result<u64, Error> {
2668        self.entries.matches(
2669            self.source.as_deref(),
2670            magic,
2671            &mut MatchState::empty(),
2672            stream_kind,
2673            buf_base_offset,
2674            rule_base_offset,
2675            None,
2676            haystack,
2677            db,
2678            switch_endianness,
2679            depth,
2680        )
2681    }
2682
2683    /// Checks if the rule is for matching against text content
2684    ///
2685    /// # Returns
2686    ///
2687    /// * `bool` - True if the rule is for text files
2688    pub fn is_text(&self) -> bool {
2689        self.entries.entry.test.is_text()
2690            && self.entries.children.iter().all(|e| e.entry.test.is_text())
2691    }
2692
2693    /// Gets the rule's score used for ranking rules between them
2694    ///
2695    /// # Returns
2696    ///
2697    /// * `u64` - The rule's score
2698    #[inline(always)]
2699    pub fn score(&self) -> u64 {
2700        self.score
2701    }
2702
2703    /// Gets the rule's filename if any
2704    ///
2705    /// # Returns
2706    ///
2707    /// * `Option<&str>` - The rule's source if available
2708    #[inline(always)]
2709    pub fn source(&self) -> Option<&str> {
2710        self.source.as_deref()
2711    }
2712
2713    /// Gets the line number at which the rule is defined
2714    ///
2715    /// # Returns
2716    ///
2717    /// * `usize` - The rule's line number
2718    #[inline(always)]
2719    pub fn line(&self) -> usize {
2720        self.entries.entry.line
2721    }
2722
2723    /// Gets all the file extensions associated to the rule
2724    ///
2725    /// # Returns
2726    ///
2727    /// * `&HashSet<String>` - The set of all associated extensions
2728    #[inline(always)]
2729    pub fn extensions(&self) -> &HashSet<String> {
2730        &self.extensions
2731    }
2732}
2733
2734#[derive(Debug, Clone, Serialize, Deserialize)]
2735struct DependencyRule {
2736    name: String,
2737    rule: MagicRule,
2738}
2739
2740/// A parsed source of magic rules
2741///
2742/// # Methods
2743///
2744/// * `open` - Opens a magic file from a path
2745#[derive(Debug, Clone, Serialize, Deserialize)]
2746pub struct MagicSource {
2747    rules: Vec<MagicRule>,
2748    dependencies: HashMap<String, DependencyRule>,
2749}
2750
2751impl MagicSource {
2752    /// Opens and parses a magic file from a path
2753    ///
2754    /// # Arguments
2755    ///
2756    /// * `p` - The path to the magic file
2757    ///
2758    /// # Returns
2759    ///
2760    /// * `Result<Self, Error>` - The parsed magic file or an error
2761    pub fn open<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
2762        FileMagicParser::parse_file(p)
2763    }
2764}
2765
/// A continuation level, stored as a `u8`.
///
/// The inner value is what [`MatchState`] uses to index its flags.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct ContinuationLevel(u8);
2768
// FIXME: magic handles many more text encodings
/// Text encodings a stream can be labelled with
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum TextEncoding {
    /// ASCII text
    Ascii,
    /// UTF-8 text
    Utf8,
    /// Text whose encoding is not one of the above
    Unknown,
}

impl TextEncoding {
    /// Returns the name of the encoding as it is written in magic messages
    const fn as_magic_str(&self) -> &'static str {
        match *self {
            Self::Ascii => "ASCII",
            Self::Utf8 => "UTF-8",
            Self::Unknown => "Unknown",
        }
    }
}
2786
2787#[derive(Debug, PartialEq, Eq, Clone, Copy)]
2788enum StreamKind {
2789    Binary,
2790    Text(TextEncoding),
2791}
2792
2793impl StreamKind {
2794    const fn is_text(&self) -> bool {
2795        matches!(self, StreamKind::Text(_))
2796    }
2797}
2798
2799#[derive(Debug)]
2800struct MatchState {
2801    continuation_levels: [bool; 256],
2802}
2803
2804impl MatchState {
2805    #[inline(always)]
2806    fn empty() -> Self {
2807        MatchState {
2808            continuation_levels: [false; 256],
2809        }
2810    }
2811
2812    #[inline(always)]
2813    fn get_continuation_level(&mut self, level: &ContinuationLevel) -> bool {
2814        self.continuation_levels
2815            .get(level.0 as usize)
2816            .cloned()
2817            .unwrap_or_default()
2818    }
2819
2820    #[inline(always)]
2821    fn set_continuation_level(&mut self, level: ContinuationLevel) {
2822        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2823            *b = true
2824        }
2825    }
2826
2827    #[inline(always)]
2828    fn clear_continuation_level(&mut self, level: &ContinuationLevel) {
2829        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2830            *b = false;
2831        }
2832    }
2833}
2834
2835/// Represents a file magic detection result
2836#[derive(Debug, Default)]
2837pub struct Magic<'m> {
2838    stream_kind: Option<StreamKind>,
2839    source: Option<Cow<'m, str>>,
2840    message: Vec<Cow<'m, str>>,
2841    mime_type: Option<Cow<'m, str>>,
2842    creator_code: Option<Cow<'m, str>>,
2843    strength: u64,
2844    exts: HashSet<Cow<'m, str>>,
2845    is_default: bool,
2846}
2847
2848impl<'m> Magic<'m> {
2849    #[inline(always)]
2850    fn set_source(&mut self, source: Option<&'m str>) {
2851        self.source = source.map(Cow::Borrowed);
2852    }
2853
2854    #[inline(always)]
2855    fn set_stream_kind(&mut self, stream_kind: StreamKind) {
2856        self.stream_kind = Some(stream_kind)
2857    }
2858
2859    #[inline(always)]
2860    fn reset(&mut self) {
2861        self.stream_kind = None;
2862        self.source = None;
2863        self.message.clear();
2864        self.mime_type = None;
2865        self.creator_code = None;
2866        self.strength = 0;
2867        self.exts.clear();
2868        self.is_default = false;
2869    }
2870
2871    /// Converts borrowed data into owned data. This method involves
2872    /// data cloning, so you must use this method only if you need to
2873    /// extend the lifetime of a [`Magic`] struct.
2874    ///
2875    /// # Returns
2876    ///
2877    /// * `Magic<'owned>` - A new [`Magic`] with owned data
2878    #[inline]
2879    pub fn into_owned<'owned>(self) -> Magic<'owned> {
2880        Magic {
2881            stream_kind: self.stream_kind,
2882            source: self.source.map(|s| Cow::Owned(s.into_owned())),
2883            message: self
2884                .message
2885                .into_iter()
2886                .map(Cow::into_owned)
2887                .map(Cow::Owned)
2888                .collect(),
2889            mime_type: self.mime_type.map(|m| Cow::Owned(m.into_owned())),
2890            creator_code: self.creator_code.map(|m| Cow::Owned(m.into_owned())),
2891            strength: self.strength,
2892            exts: self
2893                .exts
2894                .into_iter()
2895                .map(|e| Cow::Owned(e.into_owned()))
2896                .collect(),
2897            is_default: self.is_default,
2898        }
2899    }
2900
2901    /// Gets the formatted message describing the file type
2902    ///
2903    /// # Returns
2904    ///
2905    /// * `String` - The formatted message
2906    #[inline(always)]
2907    pub fn message(&self) -> String {
2908        let mut out = String::new();
2909        for (i, m) in self.message.iter().enumerate() {
2910            if let Some(s) = m.strip_prefix(r#"\b"#) {
2911                out.push_str(s);
2912            } else {
2913                // don't put space on first string
2914                if i > 0 {
2915                    out.push(' ');
2916                }
2917                out.push_str(m);
2918            }
2919        }
2920        out
2921    }
2922
2923    /// Returns an iterator over the individual parts of the magic message
2924    ///
2925    /// A magic message is typically composed of multiple parts, each appended
2926    /// during successful magic tests. This method provides an efficient way to
2927    /// iterate over these parts without concatenating them into a new string,
2928    /// as done when calling [`Magic::message`].
2929    ///
2930    /// # Returns
2931    ///
2932    /// * `impl Iterator<Item = &str>` - An iterator yielding string slices of each message part
2933    #[inline]
2934    pub fn message_parts(&self) -> impl Iterator<Item = &str> {
2935        self.message.iter().map(|p| p.as_ref())
2936    }
2937
2938    #[inline(always)]
2939    fn update_strength(&mut self, value: u64) {
2940        self.strength = self.strength.saturating_add(value);
2941        debug!("updated strength = {:?}", self.strength)
2942    }
2943
2944    /// Gets the detected MIME type
2945    ///
2946    /// # Returns
2947    ///
2948    /// * `&str` - The MIME type or default based on stream kind
2949    #[inline(always)]
2950    pub fn mime_type(&self) -> &str {
2951        self.mime_type.as_deref().unwrap_or(match self.stream_kind {
2952            Some(StreamKind::Text(_)) => DEFAULT_TEXT_MIMETYPE,
2953            Some(StreamKind::Binary) | None => DEFAULT_BIN_MIMETYPE,
2954        })
2955    }
2956
2957    #[inline(always)]
2958    fn push_message<'a: 'm>(&mut self, msg: Cow<'a, str>) {
2959        if !msg.is_empty() {
2960            debug!("pushing message: msg={msg} len={}", msg.len());
2961            self.message.push(msg);
2962        }
2963    }
2964
2965    #[inline(always)]
2966    fn set_mime_type<'a: 'm>(&mut self, mime: Cow<'a, str>) {
2967        if self.mime_type.is_none() {
2968            debug!("insert mime: {:?}", mime);
2969            self.mime_type = Some(mime)
2970        }
2971    }
2972
2973    #[inline(always)]
2974    fn set_creator_code<'a: 'm>(&mut self, apple_ty: Cow<'a, str>) {
2975        if self.creator_code.is_none() {
2976            debug!("insert apple type: {apple_ty:?}");
2977            self.creator_code = Some(apple_ty)
2978        }
2979    }
2980
2981    #[inline(always)]
2982    fn insert_extensions<'a: 'm, I: Iterator<Item = &'a str>>(&mut self, exts: I) {
2983        if self.exts.is_empty() {
2984            self.exts.extend(exts.filter_map(|e| {
2985                if e.is_empty() {
2986                    None
2987                } else {
2988                    Some(Cow::Borrowed(e))
2989                }
2990            }));
2991        }
2992    }
2993
2994    /// Gets the confidence score of the detection. This
2995    /// value is used to sort [`Magic`] in [`MagicDb::best_magic`]
2996    /// and [`MagicDb::all_magics`].
2997    ///
2998    /// # Returns
2999    ///
3000    /// * `u64` - The confidence score attributed to that [`Magic`]
3001    #[inline(always)]
3002    pub fn strength(&self) -> u64 {
3003        self.strength
3004    }
3005
3006    /// Gets the filename where the magic rule was defined
3007    ///
3008    /// # Returns
3009    ///
3010    /// * `Option<&str>` - The source if available
3011    #[inline(always)]
3012    pub fn source(&self) -> Option<&str> {
3013        self.source.as_deref()
3014    }
3015
3016    /// Gets the Apple creator code if available
3017    ///
3018    /// # Returns
3019    ///
3020    /// * `Option<&str>` - The creator code if available
3021    #[inline(always)]
3022    pub fn creator_code(&self) -> Option<&str> {
3023        self.creator_code.as_deref()
3024    }
3025
3026    /// Gets the possible file extensions for the detected [`Magic`]
3027    ///
3028    /// # Returns
3029    ///
3030    /// * `&HashSet<Cow<'m, str>>` - The set of possible extensions
3031    #[inline(always)]
3032    pub fn extensions(&self) -> &HashSet<Cow<'m, str>> {
3033        &self.exts
3034    }
3035
3036    /// Checks if this is a default fallback detection
3037    ///
3038    /// # Returns
3039    ///
3040    /// * `bool` - True if this is a default detection
3041    #[inline(always)]
3042    pub fn is_default(&self) -> bool {
3043        self.is_default
3044    }
3045}
3046
3047/// Represents a database of [`MagicRule`]
3048#[derive(Debug, Default, Clone, Serialize, Deserialize)]
3049pub struct MagicDb {
3050    rule_id: usize,
3051    rules: Vec<MagicRule>,
3052    dependencies: HashMap<String, DependencyRule>,
3053}
3054
/// Returns `true` if the byte stream is likely text.
///
/// The heuristic works on raw bytes:
///
/// * an empty input is never text
/// * a single NUL byte makes the input binary
/// * tab, line feed, carriage return and printable ASCII (`0x20..=0x7E`)
///   count as printable
/// * every other byte (remaining control characters, `0x7F` and bytes above
///   it) counts against the input
///
/// The input is considered text when more than 85% of its bytes are
/// printable and less than 20% are not.
#[inline(always)]
fn is_likely_text(bytes: &[u8]) -> bool {
    if bytes.is_empty() {
        return false;
    }

    let mut printable = 0usize;
    let mut suspicious = 0usize;

    for &byte in bytes {
        match byte {
            // NUL is a strong hint for binary content
            0x00 => return false,
            // whitespace and printable ASCII
            0x09 | 0x0A | 0x0D | 0x20..=0x7E => printable += 1,
            // other control characters and anything >= 0x7F
            _ => suspicious += 1,
        }
    }

    let total = bytes.len() as f64;
    let printable_ratio = printable as f64 / total;
    let suspicious_ratio = suspicious as f64 / total;

    // Heuristic thresholds (adjust as needed):
    printable_ratio > 0.85 && suspicious_ratio < 0.20
}
3097
3098#[inline(always)]
3099fn guess_stream_kind<S: AsRef<[u8]>>(stream: S) -> StreamKind {
3100    let buf = stream.as_ref();
3101
3102    match run_utf8_validation(buf) {
3103        Ok(is_ascii) => {
3104            if is_ascii {
3105                StreamKind::Text(TextEncoding::Ascii)
3106            } else {
3107                StreamKind::Text(TextEncoding::Utf8)
3108            }
3109        }
3110        Err(e) => {
3111            if is_likely_text(&buf[e.valid_up_to..]) {
3112                StreamKind::Text(TextEncoding::Unknown)
3113            } else {
3114                StreamKind::Binary
3115            }
3116        }
3117    }
3118}
3119
3120impl MagicDb {
3121    /// Prepares an [`LazyCache`] configured with optimal parameters for
3122    /// **read** operations done during file identification
3123    pub fn optimal_lazy_cache<R: Read + Seek>(f: R) -> Result<LazyCache<R>, io::Error> {
3124        Ok(LazyCache::<R>::from_read_seek(f)
3125            .and_then(|lc| lc.with_hot_cache(2 * FILE_BYTES_MAX))?)
3126        .map(|lc| lc.with_warm_cache(100 << 20))
3127    }
3128
3129    /// Creates a new empty database
3130    ///
3131    /// # Returns
3132    ///
3133    /// * [`MagicDb`] - A new empty database
3134    pub fn new() -> Self {
3135        Self::default()
3136    }
3137
3138    #[inline(always)]
3139    fn next_rule_id(&mut self) -> usize {
3140        let t = self.rule_id;
3141        self.rule_id += 1;
3142        t
3143    }
3144
3145    #[inline(always)]
3146    fn try_json<R: Read + Seek>(
3147        haystack: &mut LazyCache<R>,
3148        stream_kind: StreamKind,
3149        magic: &mut Magic,
3150    ) -> Result<bool, Error> {
3151        // cannot be json if content is binary
3152        if matches!(stream_kind, StreamKind::Binary) {
3153            return Ok(false);
3154        }
3155
3156        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?.trim_ascii();
3157
3158        let Some((start, end)) = find_json_boundaries(buf) else {
3159            return Ok(false);
3160        };
3161
3162        // if anything else than whitespace before start
3163        // this is not json
3164        for c in buf[0..start].iter() {
3165            if !c.is_ascii_whitespace() {
3166                return Ok(false);
3167            }
3168        }
3169
3170        let mut is_ndjson = false;
3171
3172        trace!("maybe a json document");
3173        let ok = serde_json::from_slice::<serde_json::Value>(&buf[start..=end]).is_ok();
3174        if !ok {
3175            return Ok(false);
3176        }
3177
3178        // we are sure it is json now we must look if we are ndjson
3179        if end + 1 < buf.len() {
3180            // after first json
3181            let buf = &buf[end + 1..];
3182            if let Some((second_start, second_end)) = find_json_boundaries(buf) {
3183                // there is a new line between the two json docs
3184                if memchr(b'\n', &buf[..second_start]).is_some() {
3185                    trace!("might be ndjson");
3186                    is_ndjson = serde_json::from_slice::<serde_json::Value>(
3187                        &buf[second_start..=second_end],
3188                    )
3189                    .is_ok();
3190                }
3191            }
3192        }
3193
3194        if is_ndjson {
3195            magic.push_message(Cow::Borrowed("New Line Delimited"));
3196            magic.set_mime_type(Cow::Borrowed("application/x-ndjson"));
3197            magic.insert_extensions(["ndjson", "jsonl"].into_iter());
3198        } else {
3199            magic.set_mime_type(Cow::Borrowed("application/json"));
3200            magic.insert_extensions(["json"].into_iter());
3201        }
3202
3203        magic.push_message(Cow::Borrowed("JSON text data"));
3204        magic.set_source(Some(HARDCODED_SOURCE));
3205        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3206        Ok(true)
3207    }
3208
3209    #[inline(always)]
3210    fn try_csv<R: Read + Seek>(
3211        haystack: &mut LazyCache<R>,
3212        stream_kind: StreamKind,
3213        magic: &mut Magic,
3214    ) -> Result<bool, Error> {
3215        // cannot be csv if content is binary
3216        let StreamKind::Text(enc) = stream_kind else {
3217            return Ok(false);
3218        };
3219
3220        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3221        let mut reader = csv::Reader::from_reader(io::Cursor::new(buf));
3222        let mut records = reader.records();
3223
3224        let Some(Ok(first)) = records.next() else {
3225            return Ok(false);
3226        };
3227
3228        // very not likely a CSV otherwise all programming
3229        // languages having ; line terminator would be
3230        // considered as CSV
3231        if first.len() <= 1 {
3232            return Ok(false);
3233        }
3234
3235        // we already parsed first line
3236        let mut n = 1;
3237        for i in records.take(9) {
3238            if let Ok(rec) = i {
3239                if first.len() != rec.len() {
3240                    return Ok(false);
3241                }
3242            } else {
3243                return Ok(false);
3244            }
3245            n += 1;
3246        }
3247
3248        // we need at least 10 lines
3249        if n != 10 {
3250            return Ok(false);
3251        }
3252
3253        magic.set_mime_type(Cow::Borrowed("text/csv"));
3254        magic.push_message(Cow::Borrowed("CSV"));
3255        magic.push_message(Cow::Borrowed(enc.as_magic_str()));
3256        magic.push_message(Cow::Borrowed("text"));
3257        magic.insert_extensions(["csv"].into_iter());
3258        magic.set_source(Some(HARDCODED_SOURCE));
3259        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3260        Ok(true)
3261    }
3262
3263    #[inline(always)]
3264    fn try_tar<R: Read + Seek>(
3265        haystack: &mut LazyCache<R>,
3266        stream_kind: StreamKind,
3267        magic: &mut Magic,
3268    ) -> Result<bool, Error> {
3269        // cannot be json if content is not binary
3270        if !matches!(stream_kind, StreamKind::Binary) {
3271            return Ok(false);
3272        }
3273
3274        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3275        let mut ar = Archive::new(io::Cursor::new(buf));
3276
3277        let Ok(mut entries) = ar.entries() else {
3278            return Ok(false);
3279        };
3280
3281        let Some(Ok(first)) = entries.next() else {
3282            return Ok(false);
3283        };
3284
3285        let header = first.header();
3286
3287        if header.as_ustar().is_some() {
3288            magic.push_message(Cow::Borrowed("POSIX tar archive"));
3289        } else if header.as_gnu().is_some() {
3290            magic.push_message(Cow::Borrowed("POSIX tar archive (GNU)"));
3291        } else {
3292            magic.push_message(Cow::Borrowed("tar archive"));
3293        }
3294
3295        magic.set_mime_type(Cow::Borrowed("application/x-tar"));
3296        magic.set_source(Some(HARDCODED_SOURCE));
3297        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3298        magic.insert_extensions(["tar"].into_iter());
3299        Ok(true)
3300    }
3301
3302    #[inline(always)]
3303    fn try_hard_magic<R: Read + Seek>(
3304        haystack: &mut LazyCache<R>,
3305        stream_kind: StreamKind,
3306        magic: &mut Magic,
3307    ) -> Result<bool, Error> {
3308        Ok(Self::try_json(haystack, stream_kind, magic)?
3309            || Self::try_csv(haystack, stream_kind, magic)?
3310            || Self::try_tar(haystack, stream_kind, magic)?)
3311    }
3312
3313    #[inline(always)]
3314    fn magic_default<'m, R: Read + Seek>(
3315        cache: &mut LazyCache<R>,
3316        stream_kind: StreamKind,
3317        magic: &mut Magic<'m>,
3318    ) {
3319        magic.set_source(Some(HARDCODED_SOURCE));
3320        magic.set_stream_kind(stream_kind);
3321        magic.is_default = true;
3322
3323        if cache.data_size() == 0 {
3324            magic.push_message(Cow::Borrowed("empty"));
3325            magic.set_mime_type(Cow::Borrowed(DEFAULT_BIN_MIMETYPE));
3326        }
3327
3328        match stream_kind {
3329            StreamKind::Binary => {
3330                magic.push_message(Cow::Borrowed("data"));
3331            }
3332            StreamKind::Text(e) => {
3333                magic.push_message(Cow::Borrowed(e.as_magic_str()));
3334                magic.push_message(Cow::Borrowed("text"));
3335            }
3336        }
3337    }
3338
3339    /// Loads rules from a [`MagicSource`]
3340    ///
3341    /// # Arguments
3342    ///
3343    /// * `mf` - The [`MagicSource`] to load rules from
3344    ///
3345    /// # Returns
3346    ///
3347    /// * `Result<&mut Self, Error>` - Self for chaining or an error
3348    pub fn load(&mut self, mf: MagicSource) -> Result<&mut Self, Error> {
3349        for rule in mf.rules.into_iter() {
3350            let mut rule = rule;
3351            rule.set_id(self.next_rule_id());
3352
3353            self.rules.push(rule);
3354        }
3355
3356        self.dependencies.extend(mf.dependencies);
3357        self.prepare();
3358        Ok(self)
3359    }
3360
3361    /// Gets all rules in the database
3362    ///
3363    /// # Returns
3364    ///
3365    /// * `&[MagicRule]` - A slice of all rules
3366    pub fn rules(&self) -> &[MagicRule] {
3367        &self.rules
3368    }
3369
3370    #[inline]
3371    fn first_magic_with_stream_kind<R: Read + Seek>(
3372        &self,
3373        haystack: &mut LazyCache<R>,
3374        stream_kind: StreamKind,
3375        extension: Option<&str>,
3376    ) -> Result<Magic<'_>, Error> {
3377        // re-using magic makes this function faster
3378        let mut magic = Magic::default();
3379
3380        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3381            return Ok(magic);
3382        }
3383
3384        let mut marked = vec![false; self.rules.len()];
3385
3386        macro_rules! do_magic {
3387            ($rule: expr) => {{
3388                $rule.magic_entrypoint(&mut magic, stream_kind, haystack, &self, false, 0)?;
3389
3390                if !magic.message.is_empty() {
3391                    magic.set_stream_kind(stream_kind);
3392                    magic.set_source($rule.source.as_deref());
3393                    return Ok(magic);
3394                }
3395
3396                magic.reset();
3397            }};
3398        }
3399
3400        if let Some(ext) = extension.map(|e| e.to_lowercase())
3401            && !ext.is_empty()
3402        {
3403            for rule in self.rules.iter().filter(|r| r.extensions.contains(&ext)) {
3404                do_magic!(rule);
3405                if let Some(f) = marked.get_mut(rule.id) {
3406                    *f = true
3407                }
3408            }
3409        }
3410
3411        for rule in self
3412            .rules
3413            .iter()
3414            // we don't run again rules run by extension
3415            .filter(|r| !*marked.get(r.id).unwrap_or(&false))
3416        {
3417            do_magic!(rule)
3418        }
3419
3420        Self::magic_default(haystack, stream_kind, &mut magic);
3421
3422        Ok(magic)
3423    }
3424
3425    /// Detects file [`Magic`] stopping at the first matching magic. Magic
3426    /// rules are evaluated from the best to the least relevant, so this method
3427    /// returns most of the time the best magic. For the rare cases where
3428    /// it doesn't or if the best result is always required, use [`MagicDb::best_magic`]
3429    ///
3430    /// # Arguments
3431    ///
3432    /// * `r` - A readable and seekable input
3433    /// * `extension` - Optional file extension to use for acceleration
3434    ///
3435    /// # Returns
3436    ///
3437    /// * `Result<Magic<'_>, Error>` - The detection result or an error
3438    pub fn first_magic<R: Read + Seek>(
3439        &self,
3440        r: &mut R,
3441        extension: Option<&str>,
3442    ) -> Result<Magic<'_>, Error> {
3443        let mut cache = Self::optimal_lazy_cache(r)?;
3444        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3445        self.first_magic_with_stream_kind(&mut cache, stream_kind, extension)
3446    }
3447
3448    /// An alternative to [`Self::first_magic`] using a [`LazyCache`]
3449    /// to detects file [`Magic`] stopping at the first matching magic. Magic
3450    /// rules are evaluated from the best to the least relevant, so this method
3451    /// returns most of the time the best magic. For the rare cases where
3452    /// it doesn't or if the best result is always required, use [`MagicDb::best_magic`]
3453    ///
3454    /// # Arguments
3455    ///
3456    /// * `cache` - A [`LazyCache`] used for read operations
3457    /// * `extension` - Optional file extension to use for acceleration
3458    ///
3459    /// # Returns
3460    ///
3461    /// * `Result<Magic<'_>, Error>` - The detection result or an error
3462    ///
3463    /// # Notes
3464    ///
3465    /// * Use this method **only** if you need to re-use a [`LazyCache`] for future **read** operations.
3466    /// * Use [`Self::optimal_lazy_cache`] to prepare an optimal [`LazyCache`]
3467    pub fn first_magic_with_lazy_cache<R: Read + Seek>(
3468        &self,
3469        cache: &mut LazyCache<R>,
3470        extension: Option<&str>,
3471    ) -> Result<Magic<'_>, Error> {
3472        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3473        self.first_magic_with_stream_kind(cache, stream_kind, extension)
3474    }
3475
3476    #[inline(always)]
3477    fn all_magics_sort_with_stream_kind<R: Read + Seek>(
3478        &self,
3479        haystack: &mut LazyCache<R>,
3480        stream_kind: StreamKind,
3481    ) -> Result<Vec<Magic<'_>>, Error> {
3482        let mut out = Vec::new();
3483
3484        let mut magic = Magic::default();
3485
3486        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3487            out.push(magic);
3488            magic = Magic::default();
3489        }
3490
3491        for rule in self.rules.iter() {
3492            rule.magic_entrypoint(&mut magic, stream_kind, haystack, self, false, 0)?;
3493
3494            // it is possible we have a strength with no message
3495            if !magic.message.is_empty() {
3496                magic.set_stream_kind(stream_kind);
3497                magic.set_source(rule.source.as_deref());
3498                out.push(magic);
3499                magic = Magic::default();
3500            }
3501
3502            magic.reset();
3503        }
3504
3505        Self::magic_default(haystack, stream_kind, &mut magic);
3506        out.push(magic);
3507
3508        out.sort_by_key(|b| std::cmp::Reverse(b.strength()));
3509
3510        Ok(out)
3511    }
3512
3513    /// Detects all [`Magic`] matching a given content.
3514    ///
3515    /// # Arguments
3516    ///
3517    /// * `r` - A readable and seekable input
3518    ///
3519    /// # Returns
3520    ///
3521    /// * `Result<Vec<Magic<'_>>, Error>` - All detection results sorted by strength or an error
3522    pub fn all_magics<R: Read + Seek>(&self, r: &mut R) -> Result<Vec<Magic<'_>>, Error> {
3523        let mut cache = Self::optimal_lazy_cache(r)?;
3524        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3525        self.all_magics_sort_with_stream_kind(&mut cache, stream_kind)
3526    }
3527
3528    /// An alternative to [`Self::all_magics`] using a [`LazyCache`]
3529    /// to detects all [`Magic`] matching a given content.
3530    ///
3531    /// # Arguments
3532    ///
3533    /// * `r` - A readable and seekable input
3534    ///
3535    /// # Returns
3536    ///
3537    /// * `Result<Vec<Magic<'_>>, Error>` - All detection results sorted by strength or an error
3538    ///
3539    /// # Notes
3540    ///
3541    /// * Use this method **only** if you need to re-use a [`LazyCache`] for future **read** operations.
3542    /// * Use [`Self::optimal_lazy_cache`] to prepare an optimal [`LazyCache`]
3543    pub fn all_magics_with_lazy_cache<R: Read + Seek>(
3544        &self,
3545        cache: &mut LazyCache<R>,
3546    ) -> Result<Vec<Magic<'_>>, Error> {
3547        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3548        self.all_magics_sort_with_stream_kind(cache, stream_kind)
3549    }
3550
3551    #[inline(always)]
3552    fn best_magic_with_stream_kind<R: Read + Seek>(
3553        &self,
3554        haystack: &mut LazyCache<R>,
3555        stream_kind: StreamKind,
3556    ) -> Result<Magic<'_>, Error> {
3557        let magics = self.all_magics_sort_with_stream_kind(haystack, stream_kind)?;
3558
3559        // magics is guaranteed to contain at least the
3560        // default magic but we unwrap to avoid any panic
3561        Ok(magics.into_iter().next().unwrap_or_else(|| {
3562            let mut magic = Magic::default();
3563            Self::magic_default(haystack, stream_kind, &mut magic);
3564            magic
3565        }))
3566    }
3567
3568    /// Detects the best [`Magic`] matching a given content.
3569    ///
3570    /// # Arguments
3571    ///
3572    /// * `r` - A readable and seekable input
3573    ///
3574    /// # Returns
3575    ///
3576    /// * `Result<Magic<'_>, Error>` - The best detection result or an error
3577    pub fn best_magic<R: Read + Seek>(&self, r: &mut R) -> Result<Magic<'_>, Error> {
3578        let mut cache = Self::optimal_lazy_cache(r)?;
3579        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3580        self.best_magic_with_stream_kind(&mut cache, stream_kind)
3581    }
3582
3583    /// An alternative to [`Self::best_magic`] using a [`LazyCache`]
3584    /// to detect the best [`Magic`] matching a given content.
3585    ///
3586    /// # Arguments
3587    ///
3588    /// * `r` - A readable and seekable input
3589    ///
3590    /// # Returns
3591    ///
3592    /// * `Result<Magic<'_>, Error>` - The best detection result or an error
3593    ///
3594    /// # Notes
3595    ///
3596    /// * Use this method **only** if you need to re-use a [`LazyCache`] for future **read** operations.
3597    /// * Use [`Self::optimal_lazy_cache`] to prepare an optimal [`LazyCache`]
3598    pub fn best_magic_with_lazy_cache<R: Read + Seek>(
3599        &self,
3600        cache: &mut LazyCache<R>,
3601    ) -> Result<Magic<'_>, Error> {
3602        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3603        self.best_magic_with_stream_kind(cache, stream_kind)
3604    }
3605
3606    /// Serializes the database to a generic writer implementing [`io::Write`]
3607    ///
3608    /// # Returns
3609    ///
3610    /// * `Result<(), Error>` - The serialized database or an error
3611    pub fn serialize<W: Write>(self, w: &mut W) -> Result<(), Error> {
3612        let mut encoder = GzEncoder::new(w, Compression::best());
3613
3614        bincode::serde::encode_into_std_write(&self, &mut encoder, bincode::config::standard())?;
3615        encoder.finish()?;
3616        Ok(())
3617    }
3618
3619    /// Deserializes the database from a generic reader implementing [`io::Read`]
3620    ///
3621    /// # Arguments
3622    ///
3623    /// * `r` - The reader to deserialize from
3624    ///
3625    /// # Returns
3626    ///
3627    /// * `Result<Self, Error>` - The deserialized database or an error
3628    pub fn deserialize<R: Read>(r: &mut R) -> Result<Self, Error> {
3629        let mut buf = vec![];
3630        let mut gz = GzDecoder::new(r);
3631        gz.read_to_end(&mut buf).map_err(|e| {
3632            bincode::error::DecodeError::OtherString(format!("failed to read: {e}"))
3633        })?;
3634        let (sdb, _): (MagicDb, usize) =
3635            bincode::serde::decode_from_slice(&buf, bincode::config::standard())?;
3636        Ok(sdb)
3637    }
3638
3639    #[inline(always)]
3640    fn prepare(&mut self) {
3641        self.rules
3642            .iter_mut()
3643            .for_each(|r| r.try_finalize(&self.dependencies));
3644
3645        // put text rules at the end
3646        self.rules.sort_by_key(|r| (r.is_text(), -(r.score as i64)));
3647    }
3648}
3649
#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use regex::bytes::Regex;

    use crate::utils::unix_local_time_to_string;

    use super::*;

    /// Builds a [`LazyCache`] over an in-memory literal, panicking if the
    /// cache cannot be created.
    macro_rules! lazy_cache {
        ($l: literal) => {
            LazyCache::from_read_seek(Cursor::new($l)).unwrap()
        };
    }

    /// Loads `rule` into a fresh [`MagicDb`] and runs detection on `content`
    /// with the given `stream_kind`.
    ///
    /// Despite its name, detection goes through
    /// `MagicDb::best_magic_with_stream_kind`. A rule that fails to parse or
    /// load panics (the parse error is printed first); only detection errors
    /// are returned to the caller.
    fn first_magic(
        rule: &str,
        content: &[u8],
        stream_kind: StreamKind,
    ) -> Result<Magic<'static>, Error> {
        let mut md = MagicDb::new();
        md.load(
            FileMagicParser::parse_str(rule, None)
                .inspect_err(|e| eprintln!("{e}"))
                .unwrap(),
        )
        .unwrap();
        let mut reader = LazyCache::from_read_seek(Cursor::new(content)).unwrap();
        let v = md.best_magic_with_stream_kind(&mut reader, stream_kind)?;
        // detach the result from the local database so it can be returned
        Ok(v.into_owned())
    }

    /// helper macro to debug tests
    #[allow(unused_macros)]
    macro_rules! enable_trace {
        () => {
            // `try_init` reports a failure to install the subscriber; that is
            // deliberately ignored so the macro can be used in several tests
            let _ = tracing_subscriber::fmt()
                .with_max_level(tracing_subscriber::filter::LevelFilter::TRACE)
                .try_init();
        };
    }

    /// Asserts that `$rule` is accepted by the rule parser.
    macro_rules! parse_assert {
        ($rule:literal) => {
            FileMagicParser::parse_str($rule, None)
                .inspect_err(|e| eprintln!("{e}"))
                .unwrap();
        };
    }

    /// Runs `$rule` against `$content` handled as a binary stream.
    ///
    /// * two arguments: detection must not return an error, the resulting
    ///   [`Magic`] is the value of the macro
    /// * three arguments: the detected message must be equal to `$message`
    ///
    /// NOTE(review): the two-argument form only unwraps the `Result`. Since
    /// `assert_magic_not_match_bin!` recognises a miss through `is_default()`
    /// on an `Ok` value, a rule failing to match presumably still passes
    /// here — confirm and consider asserting `!is_default()`.
    macro_rules! assert_magic_match_bin {
        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Binary).unwrap() }};
        ($rule: literal, $content:literal, $message:expr) => {{
            assert_eq!(
                first_magic($rule, $content, StreamKind::Binary)
                    .unwrap()
                    .message(),
                $message
            );
        }};
    }

    /// Runs `$rule` against `$content` handled as a UTF-8 text stream.
    ///
    /// * two arguments: detection must not return an error, the resulting
    ///   [`Magic`] is the value of the macro
    /// * three arguments: the detected message must be equal to `$message`
    ///
    /// NOTE(review): the two-argument form only unwraps the `Result`. Since
    /// `assert_magic_not_match_text!` recognises a miss through `is_default()`
    /// on an `Ok` value, a rule failing to match presumably still passes
    /// here — confirm and consider asserting `!is_default()`.
    macro_rules! assert_magic_match_text {
        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8)).unwrap() }};
        ($rule: literal, $content:literal, $message:expr) => {{
            assert_eq!(
                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
                    .unwrap()
                    .message(),
                $message
            );
        }};
    }

    /// Asserts that detection of `$content` (UTF-8 text stream) with `$rule`
    /// succeeds but only yields the default [`Magic`].
    macro_rules! assert_magic_not_match_text {
        ($rule: literal, $content:literal) => {{
            assert!(
                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
                    .unwrap()
                    .is_default()
            );
        }};
    }

    /// Asserts that detection of `$content` (binary stream) with `$rule`
    /// succeeds but only yields the default [`Magic`].
    macro_rules! assert_magic_not_match_bin {
        ($rule: literal, $content:literal) => {{
            assert!(
                first_magic($rule, $content, StreamKind::Binary)
                    .unwrap()
                    .is_default()
            );
        }};
    }

    #[test]
    fn test_regex() {
        assert_magic_match_text!(
            r#"
0	regex/1024 \^#![[:space:]]*/usr/bin/env[[:space:]]+
!:mime	text/x-shellscript
>&0  regex/64 .*($|\\b) %s shell script text executable
    "#,
            br#"#!/usr/bin/env bash
        echo hello world"#,
            // the magic generated
            "bash shell script text executable"
        );

        // sanity check of the regex crate on raw (non UTF-8) bytes
        let re = Regex::new(r"(?-u)\x42\x82").unwrap();
        assert!(re.is_match(b"\x42\x82"));

        assert_magic_match_bin!(
            r#"0 regex \x42\x82 binary regex match"#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82"
        );

        // test regex continuation after match
        assert_magic_match_bin!(
            r#"
            0 regex \x42\x82
            >&0 string \xde\xad\xbe\xef it works
            "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        assert_magic_match_bin!(
            r#"
            0 regex/s \x42\x82
            >&0 string \x42\x82\xde\xad\xbe\xef it works
            "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        // ^ must match start of line when matching text
        assert_magic_match_text!(
            r#"
0	regex/1024 \^HelloWorld$ HelloWorld String"#,
            br#"
// this is a comment after an empty line
HelloWorld
            "#
        );
    }

    #[test]
    fn test_string_with_mods() {
        assert_magic_match_text!(
            r#"0	string/w	#!\ \ \ /usr/bin/env\ bash	BASH
        "#,
            b"#! /usr/bin/env bash i
        echo hello world"
        );

        // test uppercase insensitive
        assert_magic_match_text!(
            r#"0	string/C	HelloWorld	it works
        "#,
            b"helloworld"
        );

        assert_magic_not_match_text!(
            r#"0	string/C	HelloWorld	it works
        "#,
            b"hELLOwORLD"
        );

        // test lowercase insensitive
        assert_magic_match_text!(
            r#"0	string/c	HelloWorld	it works
        "#,
            b"HELLOWORLD"
        );

        assert_magic_not_match_text!(
            r#"0	string/c	HelloWorld	it works
        "#,
            b"helloworld"
        );

        // test full word match
        assert_magic_match_text!(
            r#"0	string/f	#!/usr/bin/env\ bash	BASH
        "#,
            b"#!/usr/bin/env bash"
        );

        assert_magic_not_match_text!(
            r#"0	string/f	#!/usr/bin/python PYTHON"#,
            b"#!/usr/bin/pythonic"
        );

        // testing whitespace compacting
        assert_magic_match_text!(
            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
            b"#!/usr/bin/env    python"
        );

        assert_magic_not_match_text!(
            r#"0	string/W	#!/usr/bin/env\ \ python  PYTHON"#,
            b"#!/usr/bin/env python"
        );
    }

    #[test]
    fn test_search_with_mods() {
        assert_magic_match_text!(
            r#"0	search/1/fwt	#!\ /usr/bin/luatex	LuaTex script text executable"#,
            b"#!          /usr/bin/luatex "
        );

        // test matching from the beginning
        assert_magic_match_text!(
            r#"
            0	search/s	/usr/bin/env
            >&0 string /usr/bin/env it works
            "#,
            b"#!/usr/bin/env    python"
        );

        assert_magic_not_match_text!(
            r#"
            0	search	/usr/bin/env
            >&0 string /usr/bin/env it works
            "#,
            b"#!/usr/bin/env    python"
        );
    }

    #[test]
    fn test_pstring() {
        assert_magic_match_bin!(r#"0 pstring Toast it works"#, b"\x05Toast");

        assert_magic_match_bin!(r#"0 pstring Toast %s"#, b"\x05Toast", "Toast");

        assert_magic_not_match_bin!(r#"0 pstring Toast Doesn't work"#, b"\x07Toaster");

        // testing with modifiers
        assert_magic_match_bin!(r#"0 pstring/H Toast it works"#, b"\x00\x05Toast");

        assert_magic_match_bin!(r#"0 pstring/HJ Toast it works"#, b"\x00\x07Toast");

        assert_magic_match_bin!(r#"0 pstring/HJ Toast %s"#, b"\x00\x07Toast", "Toast");

        assert_magic_match_bin!(r#"0 pstring/h Toast it works"#, b"\x05\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/hJ Toast it works"#, b"\x07\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/L Toast it works"#, b"\x00\x00\x00\x05Toast");

        assert_magic_match_bin!(r#"0 pstring/LJ Toast it works"#, b"\x00\x00\x00\x09Toast");

        assert_magic_match_bin!(r#"0 pstring/l Toast it works"#, b"\x05\x00\x00\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/lJ Toast it works"#, b"\x09\x00\x00\x00Toast");
    }

    #[test]
    fn test_max_recursion() {
        let res = first_magic(
            r#"0	indirect x"#,
            b"#!          /usr/bin/luatex ",
            StreamKind::Binary,
        );
        assert!(res.is_err());
        // the error must specifically be the recursion limit being hit
        let _ = res.inspect_err(|e| {
            assert!(matches!(
                e.unwrap_localized(),
                Error::MaximumRecursion(MAX_RECURSION)
            ))
        });
    }

    #[test]
    fn test_string_ops() {
        assert_magic_match_text!("0	string/b MZ MZ File", b"MZ\0");
        assert_magic_match_text!("0	string !MZ Not MZ File", b"AZ\0");
        assert_magic_match_text!("0	string >\0 Any String", b"A\0");
        assert_magic_match_text!("0	string >Test Any String", b"Test 1\0");
        assert_magic_match_text!("0	string <Test Any String", b"\0");
        assert_magic_not_match_text!("0	string >Test Any String", b"\0");
    }

    #[test]
    fn test_lestring16() {
        assert_magic_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        assert_magic_match_bin!(
            "0 lestring16 x %s",
            b"\x61\x00\x62\x00\x63\x00\x64\x00\x00",
            "abcd"
        );
        assert_magic_not_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        assert_magic_match_bin!(
            "4 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64\x00"
        );
    }

    #[test]
    fn test_bestring16() {
        assert_magic_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        assert_magic_match_bin!(
            "0 bestring16 x %s",
            b"\x00\x61\x00\x62\x00\x63\x00\x64",
            "abcd"
        );
        assert_magic_not_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        assert_magic_match_bin!(
            "4 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64"
        );
    }

    #[test]
    fn test_offset_from_end() {
        assert_magic_match_bin!("-1 ubyte 0x42 last byte ok", b"\x00\x00\x42");
        assert_magic_match_bin!("-2 ubyte 0x41 last byte ok", b"\x00\x41\x00");
    }

    #[test]
    fn test_relative_offset() {
        assert_magic_match_bin!(
            "
            0 ubyte 0x42
            >&0 ubyte 0x00
            >>&0 ubyte 0x41 third byte ok
            ",
            b"\x42\x00\x41\x00"
        );
    }

    #[test]
    fn test_indirect_offset() {
        assert_magic_match_bin!("(0.l) ubyte 0x42 it works", b"\x04\x00\x00\x00\x42");
        // adding fixed value to offset
        assert_magic_match_bin!("(0.l+3) ubyte 0x42 it works", b"\x01\x00\x00\x00\x42");
        // testing offset pair
        assert_magic_match_bin!(
            "(0.l+(4)) ubyte 0x42 it works",
            b"\x04\x00\x00\x00\x04\x00\x00\x00\x42"
        );
    }

    #[test]
    fn test_use_with_message() {
        assert_magic_match_bin!(
            r#"
0 string MZ
>0 use mz first match

0 name mz then second match
>0 string MZ
"#,
            b"MZ\0",
            "first match then second match"
        );
    }

    #[test]
    fn test_scalar_transform() {
        assert_magic_match_bin!("0 ubyte+1 0x1 add works", b"\x00");
        assert_magic_match_bin!("0 ubyte-1 0xfe sub works", b"\xff");
        assert_magic_match_bin!("0 ubyte%2 0 mod works", b"\x0a");
        assert_magic_match_bin!("0 ubyte&0x0f 0x0f bitand works", b"\xff");
        assert_magic_match_bin!("0 ubyte|0x0f 0xff bitor works", b"\xf0");
        assert_magic_match_bin!("0 ubyte^0x0f 0xf0 bitxor works", b"\xff");

        // a zero divisor must be rejected when the rule is parsed
        FileMagicParser::parse_str("0 ubyte%0 mod by zero", None)
            .expect_err("expect div by zero error");
        FileMagicParser::parse_str("0 ubyte/0 div by zero", None)
            .expect_err("expect div by zero error");
    }

    #[test]
    fn test_belong() {
        // Test that a file with a four-byte value at offset 0 that matches the given value in big-endian byte order
        assert_magic_match_bin!("0 belong 0x12345678 Big-endian long", b"\x12\x34\x56\x78");
        // Test that a file with a four-byte value at offset 0 that does not match the given value in big-endian byte order
        assert_magic_not_match_bin!("0 belong 0x12345678 Big-endian long", b"\x78\x56\x34\x12");
        // Test that a file with a four-byte value at a non-zero offset that matches the given value in big-endian byte order
        assert_magic_match_bin!(
            "4 belong 0x12345678 Big-endian long",
            b"\x00\x00\x00\x00\x12\x34\x56\x78"
        );
        // Test < operator
        assert_magic_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x77");
        assert_magic_not_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test > operator
        assert_magic_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x79");
        assert_magic_not_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test & operator
        assert_magic_match_bin!("0 belong &0x5678 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong &0x0000FFFF Big-endian long", b"\x12\x34\x56\x78");

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x01\x56\x78");

        // Test ~ operator
        assert_magic_match_bin!("0 belong ~0x12345678 Big-endian long", b"\xed\xcb\xa9\x87");
        assert_magic_not_match_bin!("0 belong ~0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test x operator
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x12\x34\x56\x78");
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x78\x56\x34\x12");
    }

    #[test]
    fn test_parse_search() {
        parse_assert!("0 search test");
        parse_assert!("0 search/24/s test");
        parse_assert!("0 search/s/24 test");
    }

    #[test]
    fn test_bedate() {
        assert_magic_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 bedate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }

    #[test]
    fn test_beldate() {
        assert_magic_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 beldate 946684800 {}",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            unix_local_time_to_string(946684800)
        );
    }

    #[test]
    fn test_beqdate() {
        assert_magic_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80"
        );

        assert_magic_not_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "0 beqdate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }

    #[test]
    fn test_medate() {
        assert_magic_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );

        assert_magic_not_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 medate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            "2000-01-01 00:00:00"
        );
    }

    #[test]
    fn test_meldate() {
        assert_magic_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );
        assert_magic_not_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 meldate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            unix_local_time_to_string(946684800)
        );
    }

    #[test]
    fn test_date() {
        assert_magic_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 date 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            "2000-01-01 00:00:00"
        );
    }

    #[test]
    fn test_leldate() {
        assert_magic_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 leldate 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            unix_local_time_to_string(946684800)
        );
    }

    #[test]
    fn test_leqdate() {
        assert_magic_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
        );

        assert_magic_not_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "8 leqdate 1577836800 %s",
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
            "2020-01-01 00:00:00"
        );
    }

    #[test]
    fn test_leqldate() {
        assert_magic_match_bin!(
            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
        );

        assert_magic_not_match_bin!(
            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "8 leqldate 1577836800 %s",
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
            unix_local_time_to_string(1577836800)
        );
    }

    #[test]
    fn test_melong() {
        // Test = operator
        // this case fixes the byte order used by the value comments below:
        // bytes 34 12 78 56 stand for 0x12345678
        assert_magic_match_bin!(
            "0 melong =0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        );
        assert_magic_not_match_bin!(
            "0 melong =0x12345678 Middle-endian long",
            b"\x00\x00\x00\x00"
        );

        // Test < operator
        assert_magic_match_bin!(
            "0 melong <0x12345678 Middle-endian long",
            b"\x34\x12\x78\x55"
        ); // 0x12345578 in middle-endian
        assert_magic_not_match_bin!(
            "0 melong <0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // 0x12345678 in middle-endian

        // Test > operator
        assert_magic_match_bin!(
            "0 melong >0x12345678 Middle-endian long",
            b"\x34\x12\x78\x57"
        ); // 0x12345778 in middle-endian
        assert_magic_not_match_bin!(
            "0 melong >0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // 0x12345678 in middle-endian

        // Test & operator
        assert_magic_match_bin!("0 melong &0x5678 Middle-endian long", b"\xab\xcd\x78\x56"); // 0xcdab5678 in middle-endian
        assert_magic_not_match_bin!(
            "0 melong &0x0000FFFF Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // 0x12345678 in middle-endian

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!(
            "0 melong ^0xFFFF0000 Middle-endian long",
            b"\x00\x00\x78\x56"
        ); // 0x00005678 in middle-endian
        assert_magic_not_match_bin!(
            "0 melong ^0xFFFF0000 Middle-endian long",
            b"\x00\x01\x78\x56"
        ); // 0x01005678 in middle-endian

        // Test ~ operator
        assert_magic_match_bin!(
            "0 melong ~0x12345678 Middle-endian long",
            b"\xCB\xED\x87\xA9"
        );
        assert_magic_not_match_bin!(
            "0 melong ~0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // The original value

        // Test x operator
        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x34\x12\x78\x56");
        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x00\x00\x00\x00");
    }

    #[test]
    fn test_uquad() {
        // Test = operator
        assert_magic_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        assert_magic_not_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        // Test < operator
        assert_magic_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x11"
        );
        assert_magic_not_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test > operator
        assert_magic_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x13"
        );
        assert_magic_not_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test & operator
        assert_magic_match_bin!(
            "0 uquad &0xF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        assert_magic_not_match_bin!(
            "0 uquad &0xFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        ); // All bits clear
        assert_magic_not_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        ); // Some bits set

        // Test ~ operator
        assert_magic_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\x0F\x21\x43\x65\x87\xA9\xCB\xED"
        );
        assert_magic_not_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        ); // The original value

        // Test x operator
        assert_magic_match_bin!(
            "0 uquad x {:#x}",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12",
            "0x123456789abcdef0"
        );
        assert_magic_match_bin!(
            "0 uquad x Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }

    #[test]
    fn test_guid() {
        assert_magic_match_bin!(
            "0 guid EC959539-6786-2D4E-8FDB-98814CE76C1E It works",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E"
        );

        assert_magic_not_match_bin!(
            "0 guid 399595EC-8667-4E2D-8FDB-98814CE76C1E It works",
            b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
        );

        assert_magic_match_bin!(
            "0 guid x %s",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E",
            "EC959539-6786-2D4E-8FDB-98814CE76C1E"
        );
    }

    #[test]
    fn test_ubeqdate() {
        assert_magic_match_bin!(
            "0 ubeqdate 1633046400 It works",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80"
        );

        assert_magic_match_bin!(
            "0 ubeqdate x %s",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80",
            "2021-10-01 00:00:00"
        );

        assert_magic_not_match_bin!(
            "0 ubeqdate 1633046400 It should not work",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }

    #[test]
    fn test_ldate() {
        assert_magic_match_bin!("0 ldate 1640551520 It works", b"\x60\xd4\xC8\x61");

        assert_magic_not_match_bin!("0 ldate 1633046400 It should not work", b"\x00\x00\x00\x00");

        assert_magic_match_bin!(
            "0 ldate x %s",
            b"\x60\xd4\xC8\x61",
            unix_local_time_to_string(1640551520)
        );
    }

    #[test]
    fn test_scalar_with_transform() {
        assert_magic_match_bin!("0 ubyte/10 2 {}", b"\x14", "2");
        assert_magic_match_bin!("0 ubyte/10 x {}", b"\x14", "2");
        assert_magic_match_bin!("0 ubyte%10 x {}", b"\x14", "0");
    }

    #[test]
    fn test_float_with_transform() {
        assert_magic_match_bin!("0 lefloat/10 2 {}", b"\x00\x00\xa0\x41", "2");
        assert_magic_match_bin!("0 lefloat/10 x {}", b"\x00\x00\xa0\x41", "2");
        assert_magic_match_bin!("0 lefloat%10 x {}", b"\x00\x00\xa0\x41", "0");
    }

    #[test]
    fn test_read_octal() {
        // Basic cases
        assert_eq!(read_octal_u64(&mut lazy_cache!("0")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("00")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("01")), Some(1));
        assert_eq!(read_octal_u64(&mut lazy_cache!("07")), Some(7));
        assert_eq!(read_octal_u64(&mut lazy_cache!("010")), Some(8));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0123")), Some(83));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0755")), Some(493));

        // With trailing non-octal characters
        assert_eq!(read_octal_u64(&mut lazy_cache!("0ABC")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("01ABC")), Some(1));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0755ABC")), Some(493));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0123ABC")), Some(83));

        // Invalid octal digits
        assert_eq!(read_octal_u64(&mut lazy_cache!("08")), Some(0)); // stops at '8'
        assert_eq!(read_octal_u64(&mut lazy_cache!("01238")), Some(83)); // stops at '8'

        // No leading '0'
        assert_eq!(read_octal_u64(&mut lazy_cache!("123")), None);
        assert_eq!(read_octal_u64(&mut lazy_cache!("755")), None);

        // Empty string
        assert_eq!(read_octal_u64(&mut lazy_cache!("")), None);

        // Only non-octal characters
        assert_eq!(read_octal_u64(&mut lazy_cache!("ABC")), None);
        assert_eq!(read_octal_u64(&mut lazy_cache!("8ABC")), None); // first char is not '0'

        // Longer valid octal (but within u64 range)
        assert_eq!(
            read_octal_u64(&mut lazy_cache!("01777777777")),
            Some(268435455)
        );
    }

    #[test]
    fn test_offset_bug_1() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
# offset computation is relative to
# rule start
>(5.b)	use toasted

0 name toasted
>0	string twice Toasted
>>0  use toasted_twice

0 name toasted_twice
>(6.b) string x %s
        ",
            b"\x00TEST\x06twice\x00\x06",
            "Bread is Toasted twice"
        );
    }

    // this test implement the exact same logic as
    // test_offset_bug_1 except that the rule starts
    // matching from end. Surprisingly we need to
    // adjust indirect offsets so that it works in
    // libmagic/file
    #[test]
    fn test_offset_bug_2() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
-12	string		TEST Bread is
>(4.b)	use toasted

0 name toasted
>0	string twice Toasted
>>0  use toasted_twice

0 name toasted_twice
>(6.b) string x %s
        ",
            b"\x00TEST\x06twice\x00\x06",
            "Bread is Toasted twice"
        )
    }

    #[test]
    fn test_offset_bug_3() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
>(5.b) indirect/r x

0	string twice Toasted
>0  use toasted_twice

0 name toasted_twice
>0 string x %s
        ",
            b"\x00TEST\x06twice\x00\x08",
            "Bread is Toasted twice"
        )
    }

    #[test]
    fn test_offset_bug_4() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		Bread %s
>(6.b) indirect/r x

# this one uses a based offset
# computed at indirection
1	string is\ Toasted %s
>(11.b)  use toasted_twice

# this one is using a new base
# offset being previous base
# offset + offset of use
0 name toasted_twice
>0 string x %s
            ",
            b"\x00Bread\x06is Toasted\x0ctwice\x00",
            "Bread is Toasted twice"
        )
    }

    #[test]
    fn test_offset_bug_5() {
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
>(5.b) indirect/r x

0	string twice Toasted
>0  use toasted_twice

0 name toasted_twice
>0 string twice
>>&1 byte 0x08 twice
            ",
            b"\x00TEST\x06twice\x00\x08",
            "Bread is Toasted twice"
        )
    }

    #[test]
    fn test_message_parts() {
        let m = first_magic(
            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
            b"#!/usr/bin/env    python",
            StreamKind::Text(TextEncoding::Ascii),
        )
        .unwrap();

        // the rule message must show up as one of the message parts
        assert!(m.message_parts().any(|p| p.eq_ignore_ascii_case("python")))
    }
}