pure_magic/
lib.rs

1#![forbid(unsafe_code)]
2#![deny(unused_imports)]
3#![deny(missing_docs)]
4//! # `pure-magic`: A pure and safe Rust Reimplementation of `libmagic`
5//!
6//! Unlike many file identification crates, `pure-magic` is highly compatible with the standard
7//! `magic` rule format, allowing seamless reuse of existing
8//! [rules](https://github.com/qjerome/magic-rs/tree/main/magic-db/src/magdir). This makes it an ideal
9//! drop-in replacement for crates relying on **`libmagic` C bindings**, where memory safety is critical.
10//!
11//! **Key Features:**
12//! - File type detection
13//! - MIME type inference
14//! - Custom magic rule parsing
15//!
16//! ## Installation
17//! Add `pure-magic` to your `Cargo.toml`:
18//!
19//! ```toml
20//! [dependencies]
21//! pure-magic = "0.1"  # Replace with the latest version
22//! ```
23//!
24//! Or add the latest version with cargo:
25//!
26//! ```sh
27//! cargo add pure-magic
28//! ```
29//!
30//! ## Quick Start
31//!
32//! ### Detect File Types Programmatically
33//! ```rust
34//! use pure_magic::{MagicDb, MagicSource};
35//! use std::fs::File;
36//!
37//! fn main() -> Result<(), Box<dyn std::error::Error>> {
38//!     let mut db = MagicDb::new();
39//!     // Create a MagicSource from a file
40//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
41//!     db.load(rust_magic)?;
42//!
43//!     // Open a file and detect its type
44//!     let mut file = File::open("src/lib.rs")?;
45//!     let magic = db.first_magic(&mut file, None)?;
46//!
47//!     println!(
48//!         "File type: {} (MIME: {}, strength: {})",
49//!         magic.message(),
50//!         magic.mime_type(),
51//!         magic.strength()
52//!     );
53//!     Ok(())
54//! }
55//! ```
56//!
57//! ### Get All Matching Rules
58//! ```rust
59//! use pure_magic::{MagicDb, MagicSource};
60//! use std::fs::File;
61//!
62//! fn main() -> Result<(), Box<dyn std::error::Error>> {
63//!     let mut db = MagicDb::new();
64//!     // Create a MagicSource from a file
65//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
66//!     db.load(rust_magic)?;
67//!
68//!     // Open a file and detect its type
69//!     let mut file = File::open("src/lib.rs")?;
70//!
71//!     // Get all matching rules, sorted by strength
72//!     let magics = db.all_magics(&mut file)?;
73//!
74//!     // Must contain rust file magic and default text magic
75//!     assert!(magics.len() > 1);
76//!
77//!     for magic in magics {
78//!         println!(
79//!             "Match: {} (strength: {}, source: {})",
80//!             magic.message(),
81//!             magic.strength(),
82//!             magic.source().unwrap_or("unknown")
83//!         );
84//!     }
85//!     Ok(())
86//! }
87//! ```
88//!
89//! ### Serialize a Database to Disk
90//! ```rust
91//! use pure_magic::{MagicDb, MagicSource};
92//! use std::fs::File;
93//!
94//! fn main() -> Result<(), Box<dyn std::error::Error>> {
95//!     let mut db = MagicDb::new();
96//!     // Create a MagicSource from a file
97//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
98//!     db.load(rust_magic)?;
99//!
100//!     // Serialize the database to a file
101//!     let mut output = File::create("/tmp/compiled.db")?;
102//!     db.serialize(&mut output)?;
103//!
104//!     println!("Database saved to file");
105//!     Ok(())
106//! }
107//! ```
108//!
109//! ### Deserialize a Database
110//! ```rust
111//! use pure_magic::{MagicDb, MagicSource};
112//! use std::fs::File;
113//!
114//! fn main() -> Result<(), Box<dyn std::error::Error>> {
115//!     let mut db = MagicDb::new();
116//!     // Create a MagicSource from a file
117//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
118//!     db.load(rust_magic)?;
119//!
120//!     // Serialize the database in a vector
121//!     let mut ser = vec![];
122//!     db.serialize(&mut ser)?;
123//!     println!("Database saved to vector");
124//!
125//!     // We deserialize from slice
126//!     let db = MagicDb::deserialize(&mut ser.as_slice())?;
127//!
128//!     assert!(!db.rules().is_empty());
129//!
130//!     Ok(())
131//! }
132//! ```
133//!
134//! ## License
135//! This project is licensed under the **GPL-3.0 License**.
136//!
137//! ## Contributing
138//! Contributions are welcome! Open an issue or submit a pull request.
139//!
140//! ## Acknowledgments
141//! - Inspired by the original `libmagic` (part of the `file` command).
142
143use dyf::{DynDisplay, FormatString, dformat};
144use flagset::{FlagSet, flags};
145use flate2::{Compression, read::GzDecoder, write::GzEncoder};
146use lazy_cache::LazyCache;
147use memchr::memchr;
148use pest::{Span, error::ErrorVariant};
149use regex::bytes::{self};
150use serde::{Deserialize, Serialize};
151use std::{
152    borrow::Cow,
153    cmp::max,
154    collections::{HashMap, HashSet},
155    fmt::{self, Debug, Display},
156    io::{self, Read, Seek, SeekFrom, Write},
157    ops::{Add, BitAnd, BitOr, BitXor, Div, Mul, Rem, Sub},
158    path::Path,
159};
160use tar::Archive;
161use thiserror::Error;
162use tracing::{Level, debug, enabled, trace};
163
164use crate::{
165    numeric::{Float, FloatDataType, Scalar, ScalarDataType},
166    parser::{FileMagicParser, Rule},
167    utils::{decode_id3, find_json_boundaries, run_utf8_validation},
168};
169
170mod numeric;
171mod parser;
172mod utils;
173
// Strength value used for hardcoded magics.
// NOTE(review): the users of this constant are outside this chunk — confirm
// that it is only attached to built-in (non rule-file) detections.
const HARDCODED_MAGIC_STRENGTH: u64 = 2048;
// Source label used for hardcoded magics (same caveat as above).
const HARDCODED_SOURCE: &str = "hardcoded";
// Maximum recursion depth; corresponds to the FILE_INDIR_MAX constant
// defined in libmagic.
const MAX_RECURSION: usize = 50;
// Constant taken from libmagic: upper bound (7 MiB) on the number of bytes
// considered by search tests.
const FILE_BYTES_MAX: usize = 7 * 1024 * 1024;
// Constant taken from libmagic: upper bound (8 KiB) on the number of bytes
// considered by regex tests.
const FILE_REGEX_MAX: usize = 8192;

/// Default MIME type reported for unidentified binary data.
pub const DEFAULT_BIN_MIMETYPE: &str = "application/octet-stream";
/// Default MIME type reported for unidentified text data.
pub const DEFAULT_TEXT_MIMETYPE: &str = "text/plain";

// strftime-style pattern (`YYYY-MM-DD HH:MM:SS`) used to render timestamps.
pub(crate) const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
189
// Panics with the given `panic!` arguments, but only in builds where
// `debug_assertions` are enabled. In other builds the check evaluates to
// `false` and execution simply continues after the macro call.
macro_rules! debug_panic {
    ($($panic_args:tt)*) => {
        if cfg!(debug_assertions) {
            panic!($($panic_args)*);
        }
    };
}
197
// Reads exactly `size_of::<$ty>()` bytes from `$r` and evaluates to them as a
// byte array. Uses `?`, so it must be expanded in a function whose error type
// can be built from the error returned by `read_exact`.
macro_rules! read {
    ($r:expr, $ty:ty) => {{
        let mut raw = [0u8; std::mem::size_of::<$ty>()];
        $r.read_exact(&mut raw)?;
        raw
    }};
}

// Reads a `$ty` stored in little-endian byte order from `$r`.
macro_rules! read_le {
    ($r:expr, $ty:ty) => {{
        let raw = read!($r, $ty);
        <$ty>::from_le_bytes(raw)
    }};
}

// Reads a `$ty` stored in big-endian byte order from `$r`.
macro_rules! read_be {
    ($r:expr, $ty:ty) => {{
        let raw = read!($r, $ty);
        <$ty>::from_be_bytes(raw)
    }};
}

// Reads a 32-bit middle-endian value from `$r`: two consecutive little-endian
// 16-bit words, the first one being the most significant half.
macro_rules! read_me {
    ($r:expr) => {{
        let high = read_le!($r, u16) as i32;
        let low = read_le!($r, u16) as i32;
        (high << 16) | low
    }};
}
217
218#[inline(always)]
219fn read_octal_u64<R: Read + Seek>(haystack: &mut LazyCache<R>) -> Option<u64> {
220    let s = haystack
221        .read_while_or_limit(|b| matches!(b, b'0'..=b'7'), 22)
222        .map(|buf| str::from_utf8(buf))
223        .ok()?
224        .ok()?;
225
226    if !s.starts_with("0") {
227        return None;
228    }
229
230    u64::from_str_radix(s, 8).ok()
231}
232
/// Represents all possible errors that can occur during file type detection and processing.
///
/// The `#[from]` attributes below generate `From` conversions, which is what
/// allows the wrapped error types to be propagated with `?`.
#[derive(Debug, Error)]
pub enum Error {
    /// A generic error with a custom message.
    #[error("{0}")]
    Msg(String),

    /// An error with a source location and a nested error.
    ///
    /// Fields are, in order: the source name, the line number and the
    /// underlying error.
    #[error("source={0} line={1} error={2}")]
    Localized(String, usize, Box<Error>),

    /// Indicates a required rule was not found.
    #[error("missing rule: {0}")]
    MissingRule(String),

    /// Indicates the maximum recursion depth was reached.
    #[error("maximum recursion reached: {0}")]
    MaximumRecursion(usize),

    /// Wraps an I/O error.
    #[error("io: {0}")]
    Io(#[from] io::Error),

    /// Wraps a parsing error from the `pest` parser.
    ///
    /// The `pest` error is boxed.
    #[error("parser error: {0}")]
    Parse(#[from] Box<pest::error::Error<Rule>>),

    /// Wraps a formatting error from the `dyf` crate.
    #[error("formatting: {0}")]
    Format(#[from] dyf::Error),

    /// Wraps a regex-related error.
    #[error("regex: {0}")]
    Regex(#[from] regex::Error),

    /// Wraps a serialization error from `bincode`.
    #[error("{0}")]
    Serialize(#[from] bincode::error::EncodeError),

    /// Wraps a deserialization error from `bincode`.
    #[error("{0}")]
    Deserialize(#[from] bincode::error::DecodeError),
}
276
277impl Error {
278    #[inline]
279    fn parser<S: ToString>(msg: S, span: Span<'_>) -> Self {
280        Self::Parse(Box::new(pest::error::Error::new_from_span(
281            ErrorVariant::CustomError {
282                message: msg.to_string(),
283            },
284            span,
285        )))
286    }
287
288    fn msg<M: AsRef<str>>(msg: M) -> Self {
289        Self::Msg(msg.as_ref().into())
290    }
291
292    fn localized<S: AsRef<str>>(source: S, line: usize, err: Error) -> Self {
293        Self::Localized(source.as_ref().into(), line, err.into())
294    }
295
296    /// Unwraps the localized error
297    pub fn unwrap_localized(&self) -> &Self {
298        match self {
299            Self::Localized(_, _, e) => e,
300            _ => self,
301        }
302    }
303}
304
// Message attached to a rule: either a plain string or a format string that
// is rendered with the matched value (see `Message::format_with`).
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Message {
    // Plain text, emitted as is.
    String(String),
    // Format string plus the printf conversion specifier it was built from;
    // `format_with` compares `printf_spec` against "c" to print bytes as chars.
    Format {
        printf_spec: String,
        fs: FormatString,
    },
}
313
314impl Display for Message {
315    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
316        match self {
317            Self::String(s) => write!(f, "{s}"),
318            Self::Format { printf_spec: _, fs } => write!(f, "{}", fs.to_string_lossy()),
319        }
320    }
321}
322
323impl Message {
324    fn to_string_lossy(&self) -> Cow<'_, str> {
325        match self {
326            Message::String(s) => Cow::Borrowed(s),
327            Message::Format { printf_spec: _, fs } => fs.to_string_lossy(),
328        }
329    }
330
331    #[inline(always)]
332    fn format_with(&self, mr: Option<&MatchRes>) -> Result<Cow<'_, str>, Error> {
333        match self {
334            Self::String(s) => Ok(Cow::Borrowed(s.as_str())),
335            Self::Format {
336                printf_spec: c_spec,
337                fs,
338            } => {
339                if let Some(mr) = mr {
340                    match mr {
341                        MatchRes::Float(_, _) | MatchRes::Bytes(_, _, _, _) => {
342                            Ok(Cow::Owned(dformat!(fs, mr)?))
343                        }
344                        MatchRes::Scalar(_, scalar) => {
345                            // we want to print a byte as char
346                            if c_spec.as_str() == "c" {
347                                match scalar {
348                                    Scalar::byte(b) => {
349                                        let b = (*b as u8) as char;
350                                        Ok(Cow::Owned(dformat!(fs, b)?))
351                                    }
352                                    Scalar::ubyte(b) => {
353                                        let b = *b as char;
354                                        Ok(Cow::Owned(dformat!(fs, b)?))
355                                    }
356                                    _ => Ok(Cow::Owned(dformat!(fs, mr)?)),
357                                }
358                            } else {
359                                Ok(Cow::Owned(dformat!(fs, mr)?))
360                            }
361                        }
362                    }
363                } else {
364                    Ok(fs.to_string_lossy())
365                }
366            }
367        }
368    }
369}
370
371impl ScalarDataType {
372    #[inline(always)]
373    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Scalar, Error> {
374        macro_rules! _read_le {
375            ($ty: ty) => {{
376                if switch_endianness {
377                    <$ty>::from_be_bytes(read!(from, $ty))
378                } else {
379                    <$ty>::from_le_bytes(read!(from, $ty))
380                }
381            }};
382        }
383
384        macro_rules! _read_be {
385            ($ty: ty) => {{
386                if switch_endianness {
387                    <$ty>::from_le_bytes(read!(from, $ty))
388                } else {
389                    <$ty>::from_be_bytes(read!(from, $ty))
390                }
391            }};
392        }
393
394        macro_rules! _read_ne {
395            ($ty: ty) => {{
396                if cfg!(target_endian = "big") {
397                    _read_be!($ty)
398                } else {
399                    _read_le!($ty)
400                }
401            }};
402        }
403
404        macro_rules! _read_me {
405            () => {
406                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
407            };
408        }
409
410        Ok(match self {
411            // signed
412            Self::byte => Scalar::byte(read!(from, u8)[0] as i8),
413            Self::short => Scalar::short(_read_ne!(i16)),
414            Self::long => Scalar::long(_read_ne!(i32)),
415            Self::date => Scalar::date(_read_ne!(i32)),
416            Self::ldate => Scalar::ldate(_read_ne!(i32)),
417            Self::qwdate => Scalar::qwdate(_read_ne!(i64)),
418            Self::leshort => Scalar::leshort(_read_le!(i16)),
419            Self::lelong => Scalar::lelong(_read_le!(i32)),
420            Self::lequad => Scalar::lequad(_read_le!(i64)),
421            Self::bequad => Scalar::bequad(_read_be!(i64)),
422            Self::belong => Scalar::belong(_read_be!(i32)),
423            Self::bedate => Scalar::bedate(_read_be!(i32)),
424            Self::beldate => Scalar::beldate(_read_be!(i32)),
425            Self::beqdate => Scalar::beqdate(_read_be!(i64)),
426            // unsigned
427            Self::ubyte => Scalar::ubyte(read!(from, u8)[0]),
428            Self::ushort => Scalar::ushort(_read_ne!(u16)),
429            Self::uleshort => Scalar::uleshort(_read_le!(u16)),
430            Self::ulelong => Scalar::ulelong(_read_le!(u32)),
431            Self::uledate => Scalar::uledate(_read_le!(u32)),
432            Self::ulequad => Scalar::ulequad(_read_le!(u64)),
433            Self::offset => Scalar::offset(from.stream_position()?),
434            Self::ubequad => Scalar::ubequad(_read_be!(u64)),
435            Self::medate => Scalar::medate(_read_me!()),
436            Self::meldate => Scalar::meldate(_read_me!()),
437            Self::melong => Scalar::melong(_read_me!()),
438            Self::beshort => Scalar::beshort(_read_be!(i16)),
439            Self::quad => Scalar::quad(_read_ne!(i64)),
440            Self::uquad => Scalar::uquad(_read_ne!(u64)),
441            Self::ledate => Scalar::ledate(_read_le!(i32)),
442            Self::leldate => Scalar::leldate(_read_le!(i32)),
443            Self::leqdate => Scalar::leqdate(_read_le!(i64)),
444            Self::leqldate => Scalar::leqldate(_read_le!(i64)),
445            Self::leqwdate => Scalar::leqwdate(_read_le!(i64)),
446            Self::ubelong => Scalar::ubelong(_read_be!(u32)),
447            Self::ulong => Scalar::ulong(_read_ne!(u32)),
448            Self::ubeshort => Scalar::ubeshort(_read_be!(u16)),
449            Self::ubeqdate => Scalar::ubeqdate(_read_be!(u64)),
450            Self::lemsdosdate => Scalar::lemsdosdate(_read_le!(u16)),
451            Self::lemsdostime => Scalar::lemsdostime(_read_le!(u16)),
452            Self::guid => Scalar::guid(u128::from_be_bytes(read!(from, u128))),
453        })
454    }
455}
456
457impl FloatDataType {
458    #[inline(always)]
459    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Float, Error> {
460        macro_rules! _read_le {
461            ($ty: ty) => {{
462                if switch_endianness {
463                    <$ty>::from_be_bytes(read!(from, $ty))
464                } else {
465                    <$ty>::from_le_bytes(read!(from, $ty))
466                }
467            }};
468        }
469
470        macro_rules! _read_be {
471            ($ty: ty) => {{
472                if switch_endianness {
473                    <$ty>::from_le_bytes(read!(from, $ty))
474                } else {
475                    <$ty>::from_be_bytes(read!(from, $ty))
476                }
477            }};
478        }
479
480        macro_rules! _read_ne {
481            ($ty: ty) => {{
482                if cfg!(target_endian = "big") {
483                    _read_be!($ty)
484                } else {
485                    _read_le!($ty)
486                }
487            }};
488        }
489
490        macro_rules! _read_me {
491            () => {
492                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
493            };
494        }
495
496        Ok(match self {
497            Self::lefloat => Float::lefloat(_read_le!(f32)),
498            Self::befloat => Float::befloat(_read_le!(f32)),
499            Self::ledouble => Float::ledouble(_read_le!(f64)),
500            Self::bedouble => Float::bedouble(_read_be!(f64)),
501        })
502    }
503}
504
// Arithmetic / bitwise operator applied by `ScalarTransform` and
// `FloatTransform` to a value before it is tested. The `Display`
// implementation renders each variant as its usual symbol.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Op {
    Mul,
    Add,
    Sub,
    Div,
    Mod,
    And,
    Xor,
    Or,
}
516
517impl Display for Op {
518    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
519        match self {
520            Op::Mul => write!(f, "*"),
521            Op::Add => write!(f, "+"),
522            Op::Sub => write!(f, "-"),
523            Op::Div => write!(f, "/"),
524            Op::Mod => write!(f, "%"),
525            Op::And => write!(f, "&"),
526            Op::Or => write!(f, "|"),
527            Op::Xor => write!(f, "^"),
528        }
529    }
530}
531
// Comparison operator of a test.
// NOTE(review): only `Neq` is interpreted in this part of the file (negated
// regex/search tests); the handling of the other variants lives elsewhere.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum CmpOp {
    Eq,
    Lt,
    Gt,
    BitAnd,
    Neq, // ! operator
    Xor,
    Not, // ~ operator
}
542
543impl CmpOp {
544    #[inline(always)]
545    fn is_neq(&self) -> bool {
546        matches!(self, Self::Neq)
547    }
548}
549
// Transformation applied to a scalar: `value <op> num`
// (see `ScalarTransform::apply`).
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTransform {
    // operator to apply
    op: Op,
    // right-hand operand
    num: Scalar,
}
555
556impl ScalarTransform {
557    fn apply(&self, s: Scalar) -> Option<Scalar> {
558        match self.op {
559            Op::Add => s.checked_add(self.num),
560            Op::Sub => s.checked_sub(self.num),
561            Op::Mul => s.checked_mul(self.num),
562            Op::Div => s.checked_div(self.num),
563            Op::Mod => s.checked_rem(self.num),
564            Op::And => Some(s.bitand(self.num)),
565            Op::Xor => Some(s.bitxor(self.num)),
566            Op::Or => Some(s.bitor(self.num)),
567        }
568    }
569}
570
// Transformation applied to a float: `value <op> num`
// (see `FloatTransform::apply`).
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTransform {
    // operator to apply; bitwise operators are not supported for floats
    op: Op,
    // right-hand operand
    num: Float,
}
576
577impl FloatTransform {
578    fn apply(&self, s: Float) -> Float {
579        match self.op {
580            Op::Add => s.add(self.num),
581            Op::Sub => s.sub(self.num),
582            Op::Mul => s.mul(self.num),
583            // returns inf when div by 0
584            Op::Div => s.div(self.num),
585            // returns NaN when rem by 0
586            Op::Mod => s.rem(self.num),
587            // parser makes sure those operators cannot be used
588            Op::And | Op::Xor | Op::Or => {
589                debug_panic!("unsupported operation");
590                s
591            }
592        }
593    }
594}
595
// Value a test compares against: either a concrete value or a wildcard.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum TestValue<T> {
    // a concrete value
    Value(T),
    // no concrete value (e.g. `StringTest::test_value_len` reports 0 for it)
    Any,
}
601
602impl<T> TestValue<T> {
603    #[inline(always)]
604    fn as_ref(&self) -> TestValue<&T> {
605        match self {
606            Self::Value(v) => TestValue::Value(v),
607            Self::Any => TestValue::Any,
608        }
609    }
610}
611
// Modifier flags of regex tests (`RegexTest::mods`), also stored on search
// tests (`SearchTest::re_mods`). In this part of the file only `ForceBin` and
// `ForceText` are interpreted (by the `is_binary` helpers).
// NOTE(review): the meaning of the remaining flags is only suggested by
// their names — confirm against the parser / matching code.
flags! {
    enum ReMod: u8{
        CaseInsensitive,
        StartOffsetUpdate,
        LineLimit,
        ForceBin,
        ForceText,
        TrimMatch,
    }
}
622
623fn serialize_regex<S>(re: &bytes::Regex, serializer: S) -> Result<S::Ok, S::Error>
624where
625    S: serde::Serializer,
626{
627    re.as_str().serialize(serializer)
628}
629
630fn deserialize_regex<'de, D>(deserializer: D) -> Result<bytes::Regex, D::Error>
631where
632    D: serde::Deserializer<'de>,
633{
634    let wrapper = String::deserialize(deserializer)?;
635    bytes::Regex::new(&wrapper).map_err(serde::de::Error::custom)
636}
637
// Regular expression test.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct RegexTest {
    // compiled regex; (de)serialized through its pattern string
    #[serde(
        serialize_with = "serialize_regex",
        deserialize_with = "deserialize_regex"
    )]
    re: bytes::Regex,
    // optional limit; `match_buf` uses it as a maximum number of lines when
    // scanning text streams
    length: Option<usize>,
    // regex modifiers
    mods: FlagSet<ReMod>,
    // string modifiers
    str_mods: FlagSet<StringMod>,
    // NOTE(review): not used in this part of the file — presumably the length
    // of the non-special part of the pattern; confirm
    non_magic_len: usize,
    // whether the test is binary (see `is_binary`)
    binary: bool,
    // comparison operator; `Neq` makes the test succeed when nothing matches
    cmp_op: CmpOp,
}
652
653impl RegexTest {
654    #[inline(always)]
655    fn is_binary(&self) -> bool {
656        self.binary
657            || self.mods.contains(ReMod::ForceBin)
658            || self.str_mods.contains(StringMod::ForceBin)
659    }
660
661    fn match_buf<'buf>(
662        &self,
663        off_buf: u64, // absolute buffer offset in content
664        stream_kind: StreamKind,
665        buf: &'buf [u8],
666    ) -> Option<MatchRes<'buf>> {
667        let mr = match stream_kind {
668            StreamKind::Text(_) => {
669                let mut off_txt = off_buf;
670
671                let mut line_limit = self.length.unwrap_or(usize::MAX);
672
673                for line in buf.split(|c| c == &b'\n') {
674                    // we don't need to break on offset
675                    // limit as buf contains the good amount
676                    // of bytes to match against
677                    if line_limit == 0 {
678                        break;
679                    }
680
681                    if let Some(re_match) = self.re.find(line) {
682                        // the offset of the string is computed from the start of the buffer
683                        let start_offset = off_txt + re_match.start() as u64;
684
685                        // if we matched until EOL we need to add one to include the delimiter removed from the split
686                        let stop_offset = if re_match.end() == line.len() {
687                            Some(start_offset + re_match.as_bytes().len() as u64 + 1)
688                        } else {
689                            None
690                        };
691
692                        return Some(MatchRes::Bytes(
693                            start_offset,
694                            stop_offset,
695                            re_match.as_bytes(),
696                            Encoding::Utf8,
697                        ));
698                    }
699
700                    off_txt += line.len() as u64;
701                    // we have to add one because lines do not contain splitting character
702                    off_txt += 1;
703                    line_limit = line_limit.saturating_sub(1)
704                }
705                None
706            }
707
708            StreamKind::Binary => {
709                self.re.find(buf).map(|re_match| {
710                    MatchRes::Bytes(
711                        // the offset of the string is computed from the start of the buffer
712                        off_buf + re_match.start() as u64,
713                        None,
714                        re_match.as_bytes(),
715                        Encoding::Utf8,
716                    )
717                })
718            }
719        };
720
721        // handle the case where we want the regex not to match
722        if self.cmp_op.is_neq() && mr.is_none() {
723            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
724        }
725
726        mr
727    }
728}
729
730impl From<RegexTest> for Test {
731    fn from(value: RegexTest) -> Self {
732        Self::Regex(value)
733    }
734}
735
// Modifier flags of string tests; the matching semantics of most of them are
// implemented by `string_match`.
flags! {
    enum StringMod: u8{
        // marks the test as binary (see the `is_binary` helpers)
        ForceBin,
        // upper case bytes of the test value match both cases in the target
        UpperInsensitive,
        // lower case bytes of the test value match both cases in the target
        LowerInsensitive,
        // the byte following the match must be a whitespace (or end of buffer)
        FullWordMatch,
        // NOTE(review): not interpreted in this part of the file
        Trim,
        // marks the test as text (see `StringTest::is_text`)
        ForceText,
        // a run of blanks in the test value must be matched by a run at least
        // as long in the target
        CompactWhitespace,
        // blanks are optional, both in the test value and in the target
        OptBlank,
    }
}
748
// String test.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StringTest {
    // bytes to compare against, or `Any`
    test_val: TestValue<Vec<u8>>,
    // comparison operator
    cmp_op: CmpOp,
    // optional length attached to the test
    // NOTE(review): its exact use is outside this part of the file
    length: Option<usize>,
    // string modifiers
    mods: FlagSet<StringMod>,
    // whether the test is binary (see `is_binary`)
    binary: bool,
}
757
758impl From<StringTest> for Test {
759    fn from(value: StringTest) -> Self {
760        Self::String(value)
761    }
762}
763
/// Checks whether `buf` starts with `str`, honouring the string modifiers
/// `mods`.
///
/// Returns `(matched, consumed)` where `consumed` is the number of bytes of
/// `buf` walked through. With modifiers, `consumed` may differ from
/// `str.len()` (blanks can be skipped on either side). When the plain prefix
/// comparison fails, `consumed` is 0.
#[inline(always)]
fn string_match(str: &[u8], mods: FlagSet<StringMod>, buf: &[u8]) -> (bool, usize) {
    let mut consumed = 0;
    // we can do a simple string comparison
    if mods.is_disjoint(
        StringMod::UpperInsensitive
            | StringMod::LowerInsensitive
            | StringMod::FullWordMatch
            | StringMod::CompactWhitespace
            | StringMod::OptBlank,
    ) {
        // we check whether the target starts with the test string
        if buf.starts_with(str) {
            (true, str.len())
        } else {
            (false, consumed)
        }
    } else {
        // index of the next byte of the test string to compare
        let mut i_src = 0;
        // iterator over the target bytes; peeked so a byte is only consumed
        // once we know how to handle it
        let mut iter = buf.iter().peekable();

        // advances in the target by one byte (if any is left)
        macro_rules! consume_target {
            () => {{
                if iter.next().is_some() {
                    consumed += 1;
                }
            }};
        }

        // advances in both the target and the test string, then moves on to
        // the next loop iteration
        macro_rules! continue_next_iteration {
            () => {{
                consume_target!();
                i_src += 1;
                continue;
            }};
        }

        while let Some(&&b) = iter.peek() {
            // the whole test string has been compared
            let Some(&ref_byte) = str.get(i_src) else {
                break;
            };

            if mods.contains(StringMod::OptBlank) && (b == b' ' || ref_byte == b' ') {
                if b == b' ' {
                    // we ignore whitespace in target
                    consume_target!();
                }

                if ref_byte == b' ' {
                    // we ignore whitespace in test
                    i_src += 1;
                }

                continue;
            }

            if mods.contains(StringMod::UpperInsensitive) {
                //upper case characters in the magic match both lower and upper case characters in the target
                if ref_byte.is_ascii_uppercase() && ref_byte == b.to_ascii_uppercase()
                    || ref_byte == b
                {
                    continue_next_iteration!()
                }
            }

            // lower case characters in the magic match both lower and upper
            // case characters in the target
            if mods.contains(StringMod::LowerInsensitive)
                && (ref_byte.is_ascii_lowercase() && ref_byte == b.to_ascii_lowercase()
                    || ref_byte == b)
            {
                continue_next_iteration!()
            }

            if mods.contains(StringMod::CompactWhitespace) && ref_byte == b' ' {
                // length of the run of blanks in the test string
                let mut src_blk = 0;
                while let Some(b' ') = str.get(i_src) {
                    src_blk += 1;
                    i_src += 1;
                }

                // length of the run of blanks in the target
                let mut tgt_blk = 0;
                while let Some(b' ') = iter.peek() {
                    tgt_blk += 1;
                    consume_target!();
                }

                // the target must hold at least as many blanks as the test
                if src_blk > tgt_blk {
                    return (false, consumed);
                }

                continue;
            }

            // no modifier applied to this byte: plain comparison
            if ref_byte == b {
                continue_next_iteration!()
            } else {
                return (false, consumed);
            }
        }

        // a full word match must be followed by a whitespace or by the end of
        // the target
        if mods.contains(StringMod::FullWordMatch)
            && let Some(b) = iter.peek()
            && !b.is_ascii_whitespace()
        {
            return (false, consumed);
        }

        // success requires that something was consumed and that the test
        // string was compared entirely
        (
            consumed > 0 && str.get(i_src).is_none() && consumed <= buf.len(),
            consumed,
        )
    }
}
876
877impl StringTest {
878    fn has_length_mod(&self) -> bool {
879        !self.mods.is_disjoint(
880            StringMod::UpperInsensitive
881                | StringMod::LowerInsensitive
882                | StringMod::FullWordMatch
883                | StringMod::CompactWhitespace
884                | StringMod::OptBlank,
885        )
886    }
887
888    #[inline(always)]
889    fn test_value_len(&self) -> usize {
890        match self.test_val.as_ref() {
891            TestValue::Value(s) => s.len(),
892            TestValue::Any => 0,
893        }
894    }
895
896    #[inline(always)]
897    fn is_binary(&self) -> bool {
898        self.binary || self.mods.contains(StringMod::ForceBin)
899    }
900
901    #[inline(always)]
902    fn is_text(&self) -> bool {
903        self.mods.contains(StringMod::ForceText)
904    }
905}
906
// Search test: looks for a byte string anywhere in a buffer.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SearchTest {
    // bytes to look for
    str: Vec<u8>,
    // optional highest position (relative to the buffer start) at which a
    // match may begin
    n_pos: Option<usize>,
    // string modifiers
    str_mods: FlagSet<StringMod>,
    // regex modifiers
    re_mods: FlagSet<ReMod>,
    // whether the test is binary (see `is_binary`)
    binary: bool,
    // comparison operator; `Neq` makes the test succeed when nothing is found
    cmp_op: CmpOp,
}
916
917impl From<SearchTest> for Test {
918    fn from(value: SearchTest) -> Self {
919        Self::Search(value)
920    }
921}
922
923impl SearchTest {
924    #[inline(always)]
925    fn is_binary(&self) -> bool {
926        (self.binary
927            || self.str_mods.contains(StringMod::ForceBin)
928            || self.re_mods.contains(ReMod::ForceBin))
929            && !(self.str_mods.contains(StringMod::ForceText)
930                || self.re_mods.contains(ReMod::ForceText))
931    }
932
933    // off_buf: absolute buffer offset in content
934    #[inline]
935    fn match_buf<'buf>(&self, off_buf: u64, buf: &'buf [u8]) -> Option<MatchRes<'buf>> {
936        let mut i = 0;
937
938        let needle = self.str.first()?;
939
940        while i < buf.len() {
941            // we cannot match if the first character isn't the same
942            // so we accelerate the search by finding potential matches
943            i += memchr(*needle, &buf[i..])?;
944
945            // if we want a full word match
946            if self.str_mods.contains(StringMod::FullWordMatch) {
947                let prev_is_whitespace = buf
948                    .get(i.saturating_sub(1))
949                    .map(|c| c.is_ascii_whitespace())
950                    .unwrap_or_default();
951
952                // if it is not the first character
953                // and its previous character isn't
954                // a whitespace. It cannot be a
955                // fullword match
956                if i > 0 && !prev_is_whitespace {
957                    i += 1;
958                    continue;
959                }
960            }
961
962            if let Some(npos) = self.n_pos
963                && i > npos
964            {
965                break;
966            }
967
968            let pos = i;
969            let (ok, consumed) = string_match(&self.str, self.str_mods, &buf[i..]);
970
971            if ok {
972                return Some(MatchRes::Bytes(
973                    off_buf.saturating_add(pos as u64),
974                    None,
975                    &buf[i..i + consumed],
976                    Encoding::Utf8,
977                ));
978            } else {
979                i += max(consumed, 1)
980            }
981        }
982
983        // handles the case where we want the string not to be found
984        if self.cmp_op.is_neq() {
985            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
986        }
987
988        None
989    }
990}
991
/// Test comparing a scalar value read from the haystack with an expected value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTest {
    /// type of the scalar to read from the haystack
    ty: ScalarDataType,
    /// optional transformation applied to the value read before it is compared
    transform: Option<ScalarTransform>,
    /// operator used to compare the (transformed) value read with `test_val`
    cmp_op: CmpOp,
    /// expected value, `TestValue::Any` matches whatever has been read
    test_val: TestValue<Scalar>,
}
999
/// Test comparing a floating point value read from the haystack with an expected value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTest {
    /// type of the float to read from the haystack
    ty: FloatDataType,
    /// optional transformation applied to the value read before it is compared
    transform: Option<FloatTransform>,
    /// operator used to compare the (transformed) value read with `test_val`
    cmp_op: CmpOp,
    /// expected value, `TestValue::Any` matches whatever has been read
    test_val: TestValue<Float>,
}
1007
/// The value read from the haystack we want to match against.
///
/// `'buf` is the lifetime of the buffer we are scanning. The first field of
/// every variant is the stream offset at which the value has been read.
#[derive(Debug, PartialEq)]
enum ReadValue<'buf> {
    /// a floating point value
    Float(u64, Float),
    /// a scalar value
    Scalar(u64, Scalar),
    /// raw bytes borrowed from the scanned buffer
    Bytes(u64, &'buf [u8]),
}
1016
1017impl DynDisplay for ReadValue<'_> {
1018    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1019        match self {
1020            Self::Float(_, s) => DynDisplay::dyn_fmt(s, f),
1021            Self::Scalar(_, s) => DynDisplay::dyn_fmt(s, f),
1022            Self::Bytes(_, b) => Ok(format!("{b:?}")),
1023        }
1024    }
1025}
1026
1027impl DynDisplay for &ReadValue<'_> {
1028    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1029        // Dereference self to get the TestValue and call its fmt method
1030        DynDisplay::dyn_fmt(*self, f)
1031    }
1032}
1033
1034impl Display for ReadValue<'_> {
1035    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1036        match self {
1037            Self::Float(_, v) => write!(f, "{v}"),
1038            Self::Scalar(_, s) => write!(f, "{s}"),
1039            Self::Bytes(_, b) => write!(f, "{b:?}"),
1040        }
1041    }
1042}
1043
/// Encoding of the bytes carried by `MatchRes::Bytes`, used to decode them
/// when the match is formatted.
enum Encoding {
    /// UTF-16 with the given byte order
    Utf16(String16Encoding),
    /// UTF-8 (decoded lossily when formatted)
    Utf8,
}
1048
/// Result of a successful test.
///
/// Carries the offset of the start of the data in the stream
/// and the data itself.
enum MatchRes<'buf> {
    // Bytes.0: offset of the match
    // Bytes.1: optional end of match (to address the need of EOL adjustment in string regex)
    // Bytes.2: the bytes matching
    // Bytes.3: encoding of the buffer
    Bytes(u64, Option<u64>, &'buf [u8], Encoding),
    // Scalar.0: offset of the match
    // Scalar.1: the scalar matching
    Scalar(u64, Scalar),
    // Float.0: offset of the match
    // Float.1: the float matching
    Float(u64, Float),
}
1060
1061impl DynDisplay for &MatchRes<'_> {
1062    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1063        (*self).dyn_fmt(f)
1064    }
1065}
1066
1067impl DynDisplay for MatchRes<'_> {
1068    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1069        match self {
1070            Self::Scalar(_, v) => v.dyn_fmt(f),
1071            Self::Float(_, v) => v.dyn_fmt(f),
1072            Self::Bytes(_, _, v, enc) => match enc {
1073                Encoding::Utf8 => String::from_utf8_lossy(v).to_string().dyn_fmt(f),
1074                Encoding::Utf16(enc) => {
1075                    let utf16: Vec<u16> = slice_to_utf16_iter(v, *enc).collect();
1076                    String::from_utf16_lossy(&utf16).dyn_fmt(f)
1077                }
1078            },
1079        }
1080    }
1081}
1082
1083impl MatchRes<'_> {
1084    // start offset of the match
1085    #[inline]
1086    fn start_offset(&self) -> u64 {
1087        match self {
1088            MatchRes::Bytes(o, _, _, _) => *o,
1089            MatchRes::Scalar(o, _) => *o,
1090            MatchRes::Float(o, _) => *o,
1091        }
1092    }
1093
1094    // start offset of the match
1095    #[inline]
1096    fn end_offset(&self) -> u64 {
1097        match self {
1098            MatchRes::Bytes(start, end, buf, _) => match end {
1099                Some(end) => *end,
1100                None => start.saturating_add(buf.len() as u64),
1101            },
1102            MatchRes::Scalar(o, sc) => o.add(sc.size_of() as u64),
1103            MatchRes::Float(o, f) => o.add(f.size_of() as u64),
1104        }
1105    }
1106}
1107
1108fn slice_to_utf16_iter(read: &[u8], encoding: String16Encoding) -> impl Iterator<Item = u16> {
1109    let even = read
1110        .iter()
1111        .enumerate()
1112        .filter(|(i, _)| i % 2 == 0)
1113        .map(|t| t.1);
1114
1115    let odd = read
1116        .iter()
1117        .enumerate()
1118        .filter(|(i, _)| i % 2 != 0)
1119        .map(|t| t.1);
1120
1121    even.zip(odd).map(move |(e, o)| match encoding {
1122        String16Encoding::Le => u16::from_le_bytes([*e, *o]),
1123        String16Encoding::Be => u16::from_be_bytes([*e, *o]),
1124    })
1125}
1126
/// Byte order of the 16 bits code units of a UTF-16 string.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum String16Encoding {
    /// little endian
    Le,
    /// big endian
    Be,
}
1132
/// Test on a UTF-16 encoded string.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct String16Test {
    // NOTE(review): presumably the test string as written in the rule,
    // before its conversion to UTF-16 — confirm against the parser
    orig: String,
    /// expected UTF-16 code units, `TestValue::Any` matches any string
    test_val: TestValue<Vec<u16>>,
    /// byte order used to decode the bytes read from the haystack
    encoding: String16Encoding,
}
1139
1140impl String16Test {
1141    /// if the test value is a specific value this method returns
1142    /// the number of utf16 characters. To obtain the length in
1143    /// bytes the return value needs to be multiplied by two.
1144    #[inline(always)]
1145    fn test_value_len(&self) -> usize {
1146        match self.test_val.as_ref() {
1147            TestValue::Value(str16) => str16.len(),
1148            TestValue::Any => 0,
1149        }
1150    }
1151}
1152
// Modifiers which can be attached to an indirect test (see `Test::Indirect`).
// NOTE(review): `Relative` presumably maps to the `r` modifier of libmagic
// `indirect` type — confirm against the parser
flags! {
    enum IndirectMod: u8{
        Relative,
    }
}

// Set of `IndirectMod` flags
type IndirectMods = FlagSet<IndirectMod>;
1160
/// Encoding of the length prefix of a pstring (length prefixed string).
///
/// The trailing letter on each variant is the one already noted in this
/// file for it.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum PStringLen {
    /// length stored on one byte
    Byte,    // B
    /// length stored on two bytes, big endian
    ShortBe, // H
    /// length stored on two bytes, little endian
    ShortLe, // h
    /// length stored on four bytes, big endian
    LongBe,  // L
    /// length stored on four bytes, little endian
    LongLe,  // l
}
1169
1170impl PStringLen {
1171    #[inline(always)]
1172    const fn size_of_len(&self) -> usize {
1173        match self {
1174            PStringLen::Byte => 1,
1175            PStringLen::ShortBe => 2,
1176            PStringLen::ShortLe => 2,
1177            PStringLen::LongBe => 4,
1178            PStringLen::LongLe => 4,
1179        }
1180    }
1181}
1182
/// Test on a length prefixed string (pstring).
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PStringTest {
    /// encoding of the length prefix
    len: PStringLen,
    /// expected string, `TestValue::Any` matches any string
    test_val: TestValue<Vec<u8>>,
    /// when set, the size of the length prefix is subtracted from the
    /// length read to obtain the length of the string
    include_len: bool,
}
1189
1190impl PStringTest {
1191    #[inline]
1192    fn read<'cache, R: Read + Seek>(
1193        &self,
1194        haystack: &'cache mut LazyCache<R>,
1195    ) -> Result<Option<&'cache [u8]>, Error> {
1196        let mut len = match self.len {
1197            PStringLen::Byte => read_le!(haystack, u8) as u32,
1198            PStringLen::ShortBe => read_be!(haystack, u16) as u32,
1199            PStringLen::ShortLe => read_le!(haystack, u16) as u32,
1200            PStringLen::LongBe => read_be!(haystack, u32),
1201            PStringLen::LongLe => read_le!(haystack, u32),
1202        } as usize;
1203
1204        if self.include_len {
1205            len = len.saturating_sub(self.len.size_of_len())
1206        }
1207
1208        if let TestValue::Value(s) = self.test_val.as_ref()
1209            && len != s.len()
1210        {
1211            return Ok(None);
1212        }
1213
1214        let read = haystack.read_exact_count(len as u64)?;
1215
1216        Ok(Some(read))
1217    }
1218
1219    #[inline(always)]
1220    fn test_value_len(&self) -> usize {
1221        match self.test_val.as_ref() {
1222            TestValue::Value(s) => s.len(),
1223            TestValue::Any => 0,
1224        }
1225    }
1226}
1227
/// A test which can be carried by a magic rule.
///
/// `Name`, `Use`, `Indirect`, `Clear`, `Default` and `Der` do not read any
/// value from the haystack (see `Test::read_test_value`).
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Test {
    // NOTE(review): presumably the name declared by a `name` rule — confirm
    Name(String),
    // NOTE(review): presumably (switch endianness, name of the rule to use) — confirm
    Use(bool, String),
    /// test on a scalar value
    Scalar(ScalarTest),
    /// test on a floating point value
    Float(FloatTest),
    /// test on a string
    String(StringTest),
    /// search of a string in a buffer
    Search(SearchTest),
    /// test on a length prefixed string
    PString(PStringTest),
    /// test matching a regular expression
    Regex(RegexTest),
    /// indirect test, with its modifiers
    Indirect(FlagSet<IndirectMod>),
    /// test on a UTF-16 string
    String16(String16Test),
    // FIXME: placeholder for strength computation
    #[allow(dead_code)]
    Der,
    Clear,
    Default,
}
1246
1247impl Test {
1248    // read the value to test from the haystack
1249    #[inline]
1250    fn read_test_value<'haystack, R: Read + Seek>(
1251        &self,
1252        haystack: &'haystack mut LazyCache<R>,
1253        switch_endianness: bool,
1254    ) -> Result<Option<ReadValue<'haystack>>, Error> {
1255        let test_value_offset = haystack.lazy_stream_position();
1256
1257        match self {
1258            Self::Scalar(t) => {
1259                t.ty.read(haystack, switch_endianness)
1260                    .map(|s| Some(ReadValue::Scalar(test_value_offset, s)))
1261            }
1262
1263            Self::Float(t) => {
1264                t.ty.read(haystack, switch_endianness)
1265                    .map(|f| Some(ReadValue::Float(test_value_offset, f)))
1266            }
1267            Self::String(t) => {
1268                match t.test_val.as_ref() {
1269                    TestValue::Value(str) => {
1270                        let buf = if let Some(length) = t.length {
1271                            // if there is a length specified
1272                            haystack.read_exact_count(length as u64)?
1273                        } else {
1274                            // no length specified we read until end of string
1275
1276                            match t.cmp_op {
1277                                CmpOp::Eq | CmpOp::Neq => {
1278                                    if !t.has_length_mod() {
1279                                        haystack.read_exact_count(str.len() as u64)?
1280                                    } else {
1281                                        haystack.read_count(FILE_BYTES_MAX as u64)?
1282                                    }
1283                                }
1284                                CmpOp::Lt | CmpOp::Gt => {
1285                                    let read =
1286                                        haystack.read_until_any_delim_or_limit(b"\n\0", 8092)?;
1287
1288                                    if read.ends_with(b"\0") || read.ends_with(b"\n") {
1289                                        &read[..read.len() - 1]
1290                                    } else {
1291                                        read
1292                                    }
1293                                }
1294                                _ => {
1295                                    return Err(Error::Msg(format!(
1296                                        "string test does not support {:?} operator",
1297                                        t.cmp_op
1298                                    )));
1299                                }
1300                            }
1301                        };
1302
1303                        Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1304                    }
1305                    TestValue::Any => {
1306                        let read = haystack.read_until_any_delim_or_limit(b"\0\n", 8192)?;
1307                        // we don't take last byte if it matches end of string
1308                        let bytes = if read.ends_with(b"\0") || read.ends_with(b"\n") {
1309                            &read[..read.len() - 1]
1310                        } else {
1311                            read
1312                        };
1313
1314                        Ok(Some(ReadValue::Bytes(test_value_offset, bytes)))
1315                    }
1316                }
1317            }
1318
1319            Self::String16(t) => {
1320                match t.test_val.as_ref() {
1321                    TestValue::Value(str16) => {
1322                        let read = haystack.read_exact_count((str16.len() * 2) as u64)?;
1323
1324                        Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1325                    }
1326                    TestValue::Any => {
1327                        let read = haystack.read_until_utf16_or_limit(b"\x00\x00", 8192)?;
1328
1329                        // we make sure we have an even number of elements
1330                        let end = if read.len() % 2 == 0 {
1331                            read.len()
1332                        } else {
1333                            // we decide to read anyway even though
1334                            // length isn't even
1335                            read.len().saturating_sub(1)
1336                        };
1337
1338                        Ok(Some(ReadValue::Bytes(test_value_offset, &read[..end])))
1339                    }
1340                }
1341            }
1342
1343            Self::PString(t) => {
1344                let Some(read) = t.read(haystack)? else {
1345                    return Ok(None);
1346                };
1347                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1348            }
1349
1350            Self::Search(_) => {
1351                let buf = haystack.read_count(FILE_BYTES_MAX as u64)?;
1352                Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1353            }
1354
1355            Self::Regex(r) => {
1356                let length = {
1357                    match r.length {
1358                        Some(len) => {
1359                            if r.mods.contains(ReMod::LineLimit) {
1360                                len * 80
1361                            } else {
1362                                len
1363                            }
1364                        }
1365
1366                        None => FILE_REGEX_MAX,
1367                    }
1368                };
1369
1370                let read = haystack.read_count(length as u64)?;
1371                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1372            }
1373
1374            Self::Name(_)
1375            | Self::Use(_, _)
1376            | Self::Indirect(_)
1377            | Self::Clear
1378            | Self::Default
1379            | Self::Der => Err(Error::msg("no value to read for this test")),
1380        }
1381    }
1382
1383    #[inline(always)]
1384    fn match_value<'s>(
1385        &'s self,
1386        tv: &ReadValue<'s>,
1387        stream_kind: StreamKind,
1388    ) -> Option<MatchRes<'s>> {
1389        match (self, tv) {
1390            (Self::Scalar(t), ReadValue::Scalar(o, ts)) => {
1391                let read_value: Scalar = match t.transform.as_ref() {
1392                    Some(t) => t.apply(*ts)?,
1393                    None => *ts,
1394                };
1395
1396                match t.test_val {
1397                    TestValue::Value(test_value) => {
1398                        let ok = match t.cmp_op {
1399                            // NOTE: this should not happen in practice because
1400                            // we convert it into Eq equivalent at parsing time
1401                            CmpOp::Not => read_value == !test_value,
1402                            CmpOp::Eq => read_value == test_value,
1403                            CmpOp::Lt => read_value < test_value,
1404                            CmpOp::Gt => read_value > test_value,
1405                            CmpOp::Neq => read_value != test_value,
1406                            CmpOp::BitAnd => read_value & test_value == test_value,
1407                            CmpOp::Xor => (read_value & test_value).is_zero(),
1408                        };
1409
1410                        if ok {
1411                            Some(MatchRes::Scalar(*o, read_value))
1412                        } else {
1413                            None
1414                        }
1415                    }
1416
1417                    TestValue::Any => Some(MatchRes::Scalar(*o, read_value)),
1418                }
1419            }
1420
1421            (Self::Float(t), ReadValue::Float(o, f)) => {
1422                let read_value: Float = t.transform.as_ref().map(|t| t.apply(*f)).unwrap_or(*f);
1423
1424                match t.test_val {
1425                    TestValue::Value(tf) => {
1426                        let ok = match t.cmp_op {
1427                            CmpOp::Eq => read_value == tf,
1428                            CmpOp::Lt => read_value < tf,
1429                            CmpOp::Gt => read_value > tf,
1430                            CmpOp::Neq => read_value != tf,
1431                            _ => {
1432                                // this should never be reached as we validate
1433                                // operator in parser
1434                                debug_panic!("unsupported float comparison");
1435                                debug!("unsupported float comparison");
1436                                false
1437                            }
1438                        };
1439
1440                        if ok {
1441                            Some(MatchRes::Float(*o, read_value))
1442                        } else {
1443                            None
1444                        }
1445                    }
1446                    TestValue::Any => Some(MatchRes::Float(*o, read_value)),
1447                }
1448            }
1449
1450            (Self::String(st), ReadValue::Bytes(o, buf)) => {
1451                macro_rules! trim_buf {
1452                    ($buf: expr) => {{
1453                        if st.mods.contains(StringMod::Trim) {
1454                            $buf.trim_ascii()
1455                        } else {
1456                            $buf
1457                        }
1458                    }};
1459                }
1460
1461                match st.test_val.as_ref() {
1462                    TestValue::Value(str) => {
1463                        match st.cmp_op {
1464                            CmpOp::Eq => {
1465                                if let (true, _) = string_match(str, st.mods, buf) {
1466                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1467                                } else {
1468                                    None
1469                                }
1470                            }
1471                            CmpOp::Neq => {
1472                                if let (false, _) = string_match(str, st.mods, buf) {
1473                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1474                                } else {
1475                                    None
1476                                }
1477                            }
1478                            CmpOp::Gt => {
1479                                if buf.len() > str.len() {
1480                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1481                                } else {
1482                                    None
1483                                }
1484                            }
1485                            CmpOp::Lt => {
1486                                if buf.len() < str.len() {
1487                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1488                                } else {
1489                                    None
1490                                }
1491                            }
1492
1493                            // unsupported for strings
1494                            _ => {
1495                                // this should never be reached as we validate
1496                                // operator in parser
1497                                debug_panic!("unsupported string comparison");
1498                                debug!("unsupported string comparison");
1499                                None
1500                            }
1501                        }
1502                    }
1503                    TestValue::Any => {
1504                        Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1505                    }
1506                }
1507            }
1508
1509            (Self::PString(m), ReadValue::Bytes(o, buf)) => match m.test_val.as_ref() {
1510                TestValue::Value(psv) => {
1511                    if buf == psv {
1512                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8))
1513                    } else {
1514                        None
1515                    }
1516                }
1517                TestValue::Any => Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8)),
1518            },
1519
1520            (Self::String16(t), ReadValue::Bytes(o, buf)) => {
1521                match t.test_val.as_ref() {
1522                    TestValue::Value(str16) => {
1523                        // strings cannot be equal
1524                        if str16.len() * 2 != buf.len() {
1525                            return None;
1526                        }
1527
1528                        // we check string equality
1529                        for (i, utf16_char) in slice_to_utf16_iter(buf, t.encoding).enumerate() {
1530                            if str16[i] != utf16_char {
1531                                return None;
1532                            }
1533                        }
1534
1535                        Some(MatchRes::Bytes(
1536                            *o,
1537                            None,
1538                            t.orig.as_bytes(),
1539                            Encoding::Utf16(t.encoding),
1540                        ))
1541                    }
1542
1543                    TestValue::Any => {
1544                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf16(t.encoding)))
1545                    }
1546                }
1547            }
1548
1549            (Self::Regex(r), ReadValue::Bytes(o, buf)) => r.match_buf(*o, stream_kind, buf),
1550
1551            (Self::Search(t), ReadValue::Bytes(o, buf)) => t.match_buf(*o, buf),
1552
1553            _ => None,
1554        }
1555    }
1556
1557    #[inline(always)]
1558    fn strength(&self) -> u64 {
1559        const MULT: usize = 10;
1560
1561        let mut out = 2 * MULT;
1562
1563        // FIXME: octal is missing but it is not used in practice ...
1564        match self {
1565            Test::Scalar(s) => {
1566                out += s.ty.type_size() * MULT;
1567            }
1568
1569            Test::Float(t) => {
1570                out += t.ty.type_size() * MULT;
1571            }
1572
1573            Test::String(t) => out += t.test_value_len().saturating_mul(MULT),
1574
1575            Test::PString(t) => out += t.test_value_len().saturating_mul(MULT),
1576
1577            Test::Search(s) => {
1578                // NOTE: this implementation deviates from what is in
1579                // C libmagic. The purpose of this implementation is to
1580                // minimize the difference between similar tests,
1581                // implemented differently (ex: string test VS very localized search test).
1582                let n_pos = s.n_pos.unwrap_or(FILE_BYTES_MAX);
1583
1584                match n_pos {
1585                    // a search on one line should be equivalent to a string match
1586                    0..=80 => out += s.str.len().saturating_mul(MULT),
1587                    // search on the first 3 lines gets a little penalty
1588                    81..=240 => out += s.str.len() * s.str.len().clamp(0, MULT - 2),
1589                    // a search on more than 3 lines isn't considered very accurate
1590                    _ => out += s.str.len(),
1591                }
1592            }
1593
1594            Test::Regex(r) => {
1595                // NOTE: this implementation deviates from what is in
1596                // C libmagic. The purpose of this implementation is to
1597                // minimize the difference between similar tests,
1598                // implemented differently (ex: string test VS very localized regex test).
1599
1600                // we divide length by the number of capture group
1601                // which gives us a value close to he average string
1602                // length match in the regex.
1603                let v = r.non_magic_len / r.re.captures_len();
1604
1605                let len = r
1606                    .length
1607                    .map(|l| {
1608                        if r.mods.contains(ReMod::LineLimit) {
1609                            l * 80
1610                        } else {
1611                            l
1612                        }
1613                    })
1614                    .unwrap_or(FILE_BYTES_MAX);
1615
1616                match len {
1617                    // a search on one line should be equivalent to a string match
1618                    0..=80 => out += v.saturating_mul(MULT),
1619                    // search on the first 3 lines gets a little penalty
1620                    81..=240 => out += v * v.clamp(0, MULT - 2),
1621                    // a search on more than 3 lines isn't considered very accurate
1622                    _ => out += v,
1623                }
1624            }
1625
1626            Test::String16(t) => {
1627                // NOTE: in libmagic the result is div by 2
1628                // but I GUESS it is because the len is expressed
1629                // in number bytes. In our case length is expressed
1630                // in number of u16 so we shouldn't divide.
1631                out += t.test_value_len().saturating_mul(MULT);
1632            }
1633
1634            Test::Der => out += MULT,
1635
1636            Test::Default | Test::Name(_) | Test::Use(_, _) | Test::Indirect(_) | Test::Clear => {
1637                return 0;
1638            }
1639        }
1640
1641        // matching any output gets penalty
1642        if self.is_match_any() {
1643            return 0;
1644        }
1645
1646        if let Some(op) = self.cmp_op() {
1647            match op {
1648                // matching almost any gets penalty
1649                CmpOp::Neq => out = 0,
1650                CmpOp::Eq | CmpOp::Not => out += MULT,
1651                CmpOp::Lt | CmpOp::Gt => out -= 2 * MULT,
1652                CmpOp::Xor | CmpOp::BitAnd => out -= MULT,
1653            }
1654        }
1655
1656        out as u64
1657    }
1658
1659    #[inline(always)]
1660    fn cmp_op(&self) -> Option<CmpOp> {
1661        match self {
1662            Self::String(t) => Some(t.cmp_op),
1663            Self::Scalar(s) => Some(s.cmp_op),
1664            Self::Float(t) => Some(t.cmp_op),
1665            Self::Name(_)
1666            | Self::Use(_, _)
1667            | Self::Search(_)
1668            | Self::PString(_)
1669            | Self::Regex(_)
1670            | Self::Clear
1671            | Self::Default
1672            | Self::Indirect(_)
1673            | Self::String16(_)
1674            | Self::Der => None,
1675        }
1676    }
1677
1678    #[inline(always)]
1679    fn is_match_any(&self) -> bool {
1680        match self {
1681            Test::Name(_) => false,
1682            Test::Use(_, _) => false,
1683            Test::Scalar(scalar_test) => matches!(scalar_test.test_val, TestValue::Any),
1684            Test::Float(float_test) => matches!(float_test.test_val, TestValue::Any),
1685            Test::String(string_test) => matches!(string_test.test_val, TestValue::Any),
1686            Test::Search(_) => false,
1687            Test::PString(pstring_test) => matches!(pstring_test.test_val, TestValue::Any),
1688            Test::Regex(_) => false,
1689            Test::Indirect(_) => false,
1690            Test::String16(string16_test) => matches!(string16_test.test_val, TestValue::Any),
1691            Test::Der => false,
1692            Test::Clear => false,
1693            Test::Default => false,
1694        }
1695    }
1696
1697    #[inline(always)]
1698    fn is_binary(&self) -> bool {
1699        match self {
1700            Self::Name(_) => true,
1701            Self::Use(_, _) => true,
1702            Self::Scalar(_) => true,
1703            Self::Float(_) => true,
1704            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1705            Self::Search(t) => t.is_binary(),
1706            Self::PString(_) => true,
1707            Self::Regex(t) => t.is_binary(),
1708            Self::Clear => true,
1709            Self::Default => true,
1710            Self::Indirect(_) => true,
1711            Self::String16(_) => true,
1712            Self::Der => true,
1713        }
1714    }
1715
1716    #[inline(always)]
1717    fn is_text(&self) -> bool {
1718        match self {
1719            Self::Name(_) => true,
1720            Self::Use(_, _) => true,
1721            Self::Indirect(_) => true,
1722            Self::Clear => true,
1723            Self::Default => true,
1724            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1725            _ => !self.is_binary(),
1726        }
1727    }
1728
1729    #[inline(always)]
1730    fn is_only_text(&self) -> bool {
1731        self.is_text() && !self.is_binary()
1732    }
1733
1734    #[inline(always)]
1735    fn is_only_binary(&self) -> bool {
1736        self.is_binary() && !self.is_text()
1737    }
1738}
1739
/// Type of the value read from the haystack to compute an indirect offset
/// (see `IndOffset::read_offset`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum OffsetType {
    /// one byte
    Byte,
    /// 64 bits float, little endian
    DoubleLe,
    /// 64 bits float, big endian
    DoubleBe,
    /// 16 bits integer, little endian
    ShortLe,
    /// 16 bits integer, big endian
    ShortBe,
    /// 32 bits little endian value decoded with `decode_id3`
    Id3Le,
    /// 32 bits big endian value decoded with `decode_id3`
    Id3Be,
    /// 32 bits integer, little endian
    LongLe,
    /// 32 bits integer, big endian
    LongBe,
    /// value read with the `read_me!` macro
    Middle,
    /// value read with `read_octal_u64`
    Octal,
    /// 64 bits integer, big endian
    QuadBe,
    /// 64 bits integer, little endian
    QuadLe,
}
1756
/// Operand of the operation stored in `IndOffset`.
// NOTE(review): presumably `Direct` is an immediate value while `Indirect`
// is the location of the value to read — confirm against
// `IndOffset::read_offset`
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Shift {
    Direct(u64),
    Indirect(i64),
}
1762
/// Indirect offset: an offset whose value must be read from the haystack.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
struct IndOffset {
    // where to find the offset
    off_addr: DirOffset,
    // signed or unsigned
    signed: bool,
    // type of the offset
    ty: OffsetType,
    // NOTE(review): presumably the operation applied to the value read,
    // with `shift` as its operand — confirm against `read_offset`
    op: Option<Op>,
    shift: Option<Shift>,
}
1774
1775impl IndOffset {
1776    // if we overflow we must not return an offset
1777    fn read_offset<R: Read + Seek>(
1778        &self,
1779        haystack: &mut LazyCache<R>,
1780        rule_base_offset: Option<u64>,
1781        last_upper_match_offset: Option<u64>,
1782    ) -> Result<Option<u64>, io::Error> {
1783        let offset_address = match self.off_addr {
1784            DirOffset::Start(s) => {
1785                let Some(o) = s.checked_add(rule_base_offset.unwrap_or_default()) else {
1786                    return Ok(None);
1787                };
1788
1789                haystack.seek(SeekFrom::Start(o))?
1790            }
1791            DirOffset::LastUpper(c) => haystack.seek(SeekFrom::Start(
1792                (last_upper_match_offset.unwrap_or_default() as i64 + c) as u64,
1793            ))?,
1794            DirOffset::End(e) => haystack.seek(SeekFrom::End(e))?,
1795        };
1796
1797        macro_rules! read_value {
1798            () => {
1799                match self.ty {
1800                    OffsetType::Byte => {
1801                        if self.signed {
1802                            read_le!(haystack, u8) as u64
1803                        } else {
1804                            read_le!(haystack, i8) as u64
1805                        }
1806                    }
1807                    OffsetType::DoubleLe => read_le!(haystack, f64) as u64,
1808                    OffsetType::DoubleBe => read_be!(haystack, f64) as u64,
1809                    OffsetType::ShortLe => {
1810                        if self.signed {
1811                            read_le!(haystack, i16) as u64
1812                        } else {
1813                            read_le!(haystack, u16) as u64
1814                        }
1815                    }
1816                    OffsetType::ShortBe => {
1817                        if self.signed {
1818                            read_be!(haystack, i16) as u64
1819                        } else {
1820                            read_be!(haystack, u16) as u64
1821                        }
1822                    }
1823                    OffsetType::Id3Le => decode_id3(read_le!(haystack, u32)) as u64,
1824                    OffsetType::Id3Be => decode_id3(read_be!(haystack, u32)) as u64,
1825                    OffsetType::LongLe => {
1826                        if self.signed {
1827                            read_le!(haystack, i32) as u64
1828                        } else {
1829                            read_le!(haystack, u32) as u64
1830                        }
1831                    }
1832                    OffsetType::LongBe => {
1833                        if self.signed {
1834                            read_be!(haystack, i32) as u64
1835                        } else {
1836                            read_be!(haystack, u32) as u64
1837                        }
1838                    }
1839                    OffsetType::Middle => read_me!(haystack) as u64,
1840                    OffsetType::Octal => {
1841                        if let Some(o) = read_octal_u64(haystack) {
1842                            o
1843                        } else {
1844                            debug!("failed to read octal offset @ {offset_address}");
1845                            return Ok(None);
1846                        }
1847                    }
1848                    OffsetType::QuadLe => {
1849                        if self.signed {
1850                            read_le!(haystack, i64) as u64
1851                        } else {
1852                            read_le!(haystack, u64)
1853                        }
1854                    }
1855                    OffsetType::QuadBe => {
1856                        if self.signed {
1857                            read_be!(haystack, i64) as u64
1858                        } else {
1859                            read_be!(haystack, u64)
1860                        }
1861                    }
1862                }
1863            };
1864        }
1865
1866        // in theory every offset read should end up in something seekable from start, so we can use u64 to store the result
1867        let o = read_value!();
1868
1869        trace!(
1870            "offset read @ {offset_address} value={o} op={:?} shift={:?}",
1871            self.op, self.shift
1872        );
1873
1874        // apply transformation
1875        if let (Some(op), Some(shift)) = (self.op, self.shift) {
1876            let shift = match shift {
1877                Shift::Direct(i) => i,
1878                Shift::Indirect(i) => {
1879                    let tmp = offset_address as i128 + i as i128;
1880                    if tmp.is_negative() {
1881                        return Ok(None);
1882                    } else {
1883                        haystack.seek(SeekFrom::Start(tmp as u64))?;
1884                    };
1885                    // NOTE: here we assume that the shift has the same
1886                    // type as the main offset !
1887                    read_value!()
1888                }
1889            };
1890
1891            match op {
1892                Op::Add => return Ok(o.checked_add(shift)),
1893                Op::Mul => return Ok(o.checked_mul(shift)),
1894                Op::Sub => return Ok(o.checked_sub(shift)),
1895                Op::Div => return Ok(o.checked_div(shift)),
1896                Op::Mod => return Ok(o.checked_rem(shift)),
1897                Op::And => return Ok(Some(o & shift)),
1898                Op::Or => return Ok(Some(o | shift)),
1899                Op::Xor => return Ok(Some(o ^ shift)),
1900            }
1901        }
1902
1903        Ok(Some(o))
1904    }
1905}
1906
/// An offset that is fully described by the rule itself, i.e. that
/// does not require reading a value from the stream to be resolved.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum DirOffset {
    // counted from the start (callers add the rule base offset to it)
    Start(u64),
    // relative to the last up-level field
    LastUpper(i64),
    // relative to the end of the stream (value handed to `SeekFrom::End`)
    End(i64),
}
1914
/// Offset at which a `Match` is tested.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Offset {
    // offset known without reading the stream
    Direct(DirOffset),
    // offset whose value must be read from the stream (see `IndOffset::read_offset`)
    Indirect(IndOffset),
}
1920
1921impl From<DirOffset> for Offset {
1922    fn from(value: DirOffset) -> Self {
1923        Self::Direct(value)
1924    }
1925}
1926
1927impl From<IndOffset> for Offset {
1928    fn from(value: IndOffset) -> Self {
1929        Self::Indirect(value)
1930    }
1931}
1932
1933impl Display for DirOffset {
1934    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1935        match self {
1936            DirOffset::Start(i) => write!(f, "{i}"),
1937            DirOffset::LastUpper(c) => write!(f, "&{c}"),
1938            DirOffset::End(e) => write!(f, "-{e}"),
1939        }
1940    }
1941}
1942
1943impl Default for DirOffset {
1944    fn default() -> Self {
1945        Self::LastUpper(0)
1946    }
1947}
1948
/// A single test of a magic rule together with the offset
/// it applies at and the message attached to it.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Match {
    // line reported in logs and localized errors
    line: usize,
    // depth of the entry, also used as its continuation level
    depth: u8,
    // where the test has to be applied
    offset: Offset,
    // the test to run
    test: Test,
    // strength of `test` (the `From<Use>` / `From<Name>` conversions fill it with `test.strength()`)
    test_strength: u64,
    // message pushed to the result when the test matches, if any
    message: Option<Message>,
}
1958
1959impl From<Use> for Match {
1960    fn from(value: Use) -> Self {
1961        let test = Test::Use(value.switch_endianness, value.rule_name);
1962        let test_strength = test.strength();
1963        Self {
1964            line: value.line,
1965            depth: value.depth,
1966            offset: value.start_offset,
1967            test,
1968            test_strength,
1969            message: value.message,
1970        }
1971    }
1972}
1973
1974impl From<Name> for Match {
1975    fn from(value: Name) -> Self {
1976        let test = Test::Name(value.name);
1977        let test_strength = test.strength();
1978        Self {
1979            line: value.line,
1980            depth: 0,
1981            offset: Offset::Direct(DirOffset::Start(0)),
1982            test,
1983            test_strength,
1984            message: value.message,
1985        }
1986    }
1987}
1988
1989impl Match {
1990    /// Turns the `Match`'s offset into an absolute offset from the start of the stream
1991    #[inline(always)]
1992    fn offset_from_start<R: Read + Seek>(
1993        &self,
1994        haystack: &mut LazyCache<R>,
1995        rule_base_offset: Option<u64>,
1996        last_level_offset: Option<u64>,
1997    ) -> Result<Option<u64>, io::Error> {
1998        match self.offset {
1999            Offset::Direct(dir_offset) => match dir_offset {
2000                DirOffset::Start(s) => Ok(Some(s)),
2001                DirOffset::LastUpper(shift) => {
2002                    let o = last_level_offset.unwrap_or_default() as i64 + shift;
2003
2004                    if o >= 0 { Ok(Some(o as u64)) } else { Ok(None) }
2005                }
2006                DirOffset::End(e) => Ok(Some(haystack.offset_from_start(SeekFrom::End(e)))),
2007            },
2008            Offset::Indirect(ind_offset) => {
2009                let Some(o) =
2010                    ind_offset.read_offset(haystack, rule_base_offset, last_level_offset)?
2011                else {
2012                    return Ok(None);
2013                };
2014
2015                Ok(Some(o))
2016            }
2017        }
2018    }
2019
2020    /// this method emulates the buffer based matching
2021    /// logic implemented in libmagic. It needs some aweful
2022    /// and weird offset convertions to turn buffer
2023    /// relative offsets (libmagic is based on) into
2024    /// absolute offset in the file.
2025    ///
2026    /// this method shoud bubble up only critical errors
2027    /// all the other errors should make the match result
2028    /// false and be logged via debug!
2029    ///
2030    /// the function returns an error if the maximum recursion
2031    /// has been reached or if a dependency rule is missing.
2032    #[inline]
2033    #[allow(clippy::too_many_arguments)]
2034    fn matches<'a: 'h, 'h, R: Read + Seek>(
2035        &'a self,
2036        source: Option<&str>,
2037        magic: &mut Magic<'a>,
2038        stream_kind: StreamKind,
2039        state: &mut MatchState,
2040        buf_base_offset: Option<u64>,
2041        rule_base_offset: Option<u64>,
2042        last_level_offset: Option<u64>,
2043        haystack: &'h mut LazyCache<R>,
2044        switch_endianness: bool,
2045        db: &'a MagicDb,
2046        depth: usize,
2047    ) -> Result<(bool, Option<MatchRes<'h>>), Error> {
2048        let source = source.unwrap_or("unknown");
2049        let line = self.line;
2050
2051        if depth >= MAX_RECURSION {
2052            return Err(Error::localized(
2053                source,
2054                line,
2055                Error::MaximumRecursion(MAX_RECURSION),
2056            ));
2057        }
2058
2059        if self.test.is_only_binary() && stream_kind.is_text() {
2060            trace!("skip binary test source={source} line={line} stream_kind={stream_kind:?}",);
2061            return Ok((false, None));
2062        }
2063
2064        if self.test.is_only_text() && !stream_kind.is_text() {
2065            trace!("skip text test source={source} line={line} stream_kind={stream_kind:?}",);
2066            return Ok((false, None));
2067        }
2068
2069        let Ok(Some(mut offset)) = self
2070            .offset_from_start(haystack, rule_base_offset, last_level_offset)
2071            .inspect_err(|e| debug!("source={source} line={line} failed at computing offset: {e}"))
2072        else {
2073            return Ok((false, None));
2074        };
2075
2076        offset = match self.offset {
2077            Offset::Indirect(_) => {
2078                // the result we get for an indirect offset
2079                // is relative to the start of the libmagic
2080                // buffer so we need to add base to make it
2081                // absolute.
2082                buf_base_offset.unwrap_or_default().saturating_add(offset)
2083            }
2084            // offset from start are computed from rule base
2085            Offset::Direct(DirOffset::Start(_)) => {
2086                rule_base_offset.unwrap_or_default().saturating_add(offset)
2087            }
2088            _ => offset,
2089        };
2090
2091        match &self.test {
2092            Test::Clear => {
2093                trace!("source={source} line={line} clear");
2094                state.clear_continuation_level(&self.continuation_level());
2095                Ok((true, None))
2096            }
2097
2098            Test::Name(name) => {
2099                trace!(
2100                    "source={source} line={line} running rule {name} switch_endianness={switch_endianness}",
2101                );
2102                Ok((true, None))
2103            }
2104
2105            Test::Use(flip_endianness, rule_name) => {
2106                trace!(
2107                    "source={source} line={line} use {rule_name} switch_endianness={flip_endianness}",
2108                );
2109
2110                // switch_endianness must propagate down the rule call stack
2111                let switch_endianness = switch_endianness ^ flip_endianness;
2112
2113                let dr: &DependencyRule = db.dependencies.get(rule_name).ok_or(
2114                    Error::localized(source, line, Error::MissingRule(rule_name.clone())),
2115                )?;
2116
2117                // we push the message here otherwise we push message in depth first
2118                if let Some(msg) = self.message.as_ref() {
2119                    magic.push_message(msg.to_string_lossy());
2120                }
2121
2122                dr.rule.magic(
2123                    magic,
2124                    stream_kind,
2125                    buf_base_offset,
2126                    Some(offset),
2127                    haystack,
2128                    db,
2129                    switch_endianness,
2130                    depth.saturating_add(1),
2131                )?;
2132
2133                // we return false not to push message again
2134                Ok((false, None))
2135            }
2136
2137            Test::Indirect(m) => {
2138                trace!(
2139                    "source={source} line={line} indirect mods={:?} offset={offset:#x}",
2140                    m
2141                );
2142
2143                let new_buf_base_off = if m.contains(IndirectMod::Relative) {
2144                    Some(offset)
2145                } else {
2146                    None
2147                };
2148
2149                // we push the message here otherwise we push message in depth first
2150                if let Some(msg) = self.message.as_ref() {
2151                    magic.push_message(msg.to_string_lossy());
2152                }
2153
2154                for r in db.rules.iter() {
2155                    let messages_cnt = magic.message.len();
2156
2157                    r.magic(
2158                        magic,
2159                        stream_kind,
2160                        new_buf_base_off,
2161                        Some(offset),
2162                        haystack,
2163                        db,
2164                        false,
2165                        depth.saturating_add(1),
2166                    )?;
2167
2168                    // this means we matched a rule
2169                    if magic.message.len() != messages_cnt {
2170                        break;
2171                    }
2172                }
2173
2174                // we return false not to push message again
2175                Ok((false, None))
2176            }
2177
2178            Test::Default => {
2179                // default matches if nothing else at the continuation level matched
2180                let ok = !state.get_continuation_level(&self.continuation_level());
2181
2182                trace!("source={source} line={line} default match={ok}");
2183                if ok {
2184                    state.set_continuation_level(self.continuation_level());
2185                }
2186
2187                Ok((ok, None))
2188            }
2189
2190            _ => {
2191                if let Err(e) = haystack.seek(SeekFrom::Start(offset)) {
2192                    debug!("source={source} line={line} failed to seek in haystack: {e}");
2193                    return Ok((false, None));
2194                }
2195
2196                let mut trace_msg = None;
2197
2198                if enabled!(Level::DEBUG) {
2199                    trace_msg = Some(vec![format!(
2200                        "source={source} line={line} depth={} stream_offset={:#x}",
2201                        self.depth,
2202                        haystack.lazy_stream_position()
2203                    )])
2204                }
2205
2206                // NOTE: we may have a way to optimize here. In case we do a Any
2207                // test and we don't use the value to format the message, we don't
2208                // need to read the value.
2209                if let Ok(opt_test_value) = self
2210                    .test
2211                    .read_test_value(haystack, switch_endianness)
2212                    .inspect_err(|e| {
2213                        debug!("source={source} line={line} error while reading test value @{offset}: {e}",)
2214                    })
2215                {
2216                    if let Some(v) = trace_msg
2217                        .as_mut() { v.push(format!("test={:?}", self.test)) }
2218
2219                    let match_res =
2220                        opt_test_value.and_then(|tv| self.test.match_value(&tv, stream_kind));
2221
2222                    if let Some(v) = trace_msg.as_mut() { v.push(format!(
2223                            "message=\"{}\" match={}",
2224                            self.message
2225                                .as_ref()
2226                                .map(|fs| fs.to_string_lossy())
2227                                .unwrap_or_default(),
2228                            match_res.is_some()
2229                        )) }
2230
2231                    // trace message
2232                    if enabled!(Level::DEBUG) && !enabled!(Level::TRACE) && match_res.is_some() {
2233                        if let Some(m) = trace_msg{
2234                            debug!("{}", m.join(" "));
2235                        }
2236                    } else if enabled!(Level::TRACE)
2237                        && let Some(m) = trace_msg{
2238                            trace!("{}", m.join(" "));
2239                        }
2240
2241                    if let Some(mr) = match_res {
2242                        state.set_continuation_level(self.continuation_level());
2243                        return Ok((true, Some(mr)));
2244                    }
2245                }
2246
2247                Ok((false, None))
2248            }
2249        }
2250    }
2251
2252    #[inline(always)]
2253    fn continuation_level(&self) -> ContinuationLevel {
2254        ContinuationLevel(self.depth)
2255    }
2256}
2257
/// A parsed entry calling another rule by name.
/// It is turned into a `Match` running a `Test::Use`.
#[derive(Debug, Clone)]
struct Use {
    // line of the entry
    line: usize,
    // depth of the entry
    depth: u8,
    // offset of the resulting `Match`
    start_offset: Offset,
    // name of the rule to call
    rule_name: String,
    // first member of `Test::Use`, xor-ed with the caller's endianness switch when matching
    switch_endianness: bool,
    // message attached to the entry, if any
    message: Option<Message>,
}
2267
/// A modifier applied to the strength of an entry (see `StrengthMod::apply`).
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StrengthMod {
    // operator used to combine the strength with `by`
    op: Op,
    // right hand side operand
    by: u8,
}
2273
2274impl StrengthMod {
2275    #[inline(always)]
2276    fn apply(&self, strength: u64) -> u64 {
2277        let by = self.by as u64;
2278        debug!("applying strength modifier: {strength} {} {}", self.op, by);
2279        match self.op {
2280            Op::Mul => strength.saturating_mul(by),
2281            Op::Add => strength.saturating_add(by),
2282            Op::Sub => strength.saturating_sub(by),
2283            Op::Div => {
2284                if by > 0 {
2285                    strength.saturating_div(by)
2286                } else {
2287                    strength
2288                }
2289            }
2290            Op::Mod => strength % by,
2291            Op::And => strength & by,
2292            // this should never happen as strength operators
2293            // are enforced by our parser
2294            Op::Xor | Op::Or => {
2295                debug_panic!("unsupported strength operator");
2296                strength
2297            }
2298        }
2299    }
2300}
2301
/// Metadata attached to a rule entry.
// NOTE(review): variants presumably end up in the `mimetype`, `exts`,
// `strength_mod` and `apple` fields of `EntryNode` — confirm against the parser
#[derive(Debug, Clone)]
enum Flag {
    // a MIME type
    Mime(String),
    // a set of file extensions
    Ext(HashSet<String>),
    // a strength modifier
    Strength(StrengthMod),
    // presumably an Apple creator/type code — TODO confirm
    Apple(String),
}
2309
/// A parsed entry giving a name to a rule.
/// It is turned into a `Match` running a `Test::Name`.
#[derive(Debug, Clone)]
struct Name {
    // line of the entry
    line: usize,
    // name given to the rule
    name: String,
    // message attached to the entry, if any
    message: Option<Message>,
}
2316
/// A parsed entry: either a match or a flag.
// NOTE(review): the `Span` is presumably the location the entry was parsed from — confirm
#[derive(Debug, Clone)]
enum Entry<'span> {
    // a test entry
    Match(Span<'span>, Match),
    // a metadata entry
    Flag(Span<'span>, Flag),
}
2322
/// A node of the tree of entries making up a magic rule.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EntryNode {
    // when set, an offset-from-end entry redefines the rule base offset used for its children
    root: bool,
    // the test carried by this node
    entry: Match,
    // entries evaluated only when `entry` matches
    children: Vec<EntryNode>,
    // MIME type set on the result when `entry` matches
    mimetype: Option<String>,
    // creator code set on the result when `entry` matches
    apple: Option<String>,
    // modifier applied to the strength of `entry`
    strength_mod: Option<StrengthMod>,
    // extensions added to the result when `entry` matches
    exts: HashSet<String>,
}
2333
2334impl EntryNode {
2335    fn update_exts_rec(
2336        &self,
2337        exts: &mut HashSet<String>,
2338        deps: &HashMap<String, DependencyRule>,
2339        marked: &mut HashSet<String>,
2340    ) -> Result<(), ()> {
2341        for ext in self.exts.iter() {
2342            if !exts.contains(ext) {
2343                exts.insert(ext.clone());
2344            }
2345        }
2346
2347        for c in self.children.iter() {
2348            if let Test::Use(_, ref name) = c.entry.test {
2349                if marked.contains(name) {
2350                    continue;
2351                }
2352                if let Some(r) = deps.get(name) {
2353                    marked.insert(name.clone());
2354                    exts.extend(r.rule.fetch_all_extensions(deps, marked)?);
2355                } else {
2356                    return Err(());
2357                }
2358            } else {
2359                c.update_exts_rec(exts, deps, marked)?;
2360            }
2361        }
2362
2363        Ok(())
2364    }
2365
2366    fn update_score_rec(
2367        &self,
2368        depth: usize,
2369        score: &mut u64,
2370        deps: &HashMap<String, DependencyRule>,
2371        marked: &mut HashSet<String>,
2372    ) {
2373        if depth == 3 {
2374            return;
2375        }
2376
2377        *score += self
2378            .children
2379            .iter()
2380            .map(|e| e.entry.test_strength)
2381            .min()
2382            .unwrap_or_default();
2383
2384        for c in self.children.iter() {
2385            if let Test::Use(_, ref name) = c.entry.test {
2386                if marked.contains(name) {
2387                    continue;
2388                }
2389
2390                if let Some(r) = deps.get(name) {
2391                    marked.insert(name.clone());
2392                    *score += r.rule.compute_score(depth, deps, marked);
2393                }
2394            }
2395            c.update_score_rec(depth + 1, score, deps, marked);
2396        }
2397    }
2398
2399    #[inline]
2400    #[allow(clippy::too_many_arguments)]
2401    fn matches<'r, R: Read + Seek>(
2402        &'r self,
2403        opt_source: Option<&str>,
2404        magic: &mut Magic<'r>,
2405        state: &mut MatchState,
2406        stream_kind: StreamKind,
2407        buf_base_offset: Option<u64>,
2408        rule_base_offset: Option<u64>,
2409        last_level_offset: Option<u64>,
2410        haystack: &mut LazyCache<R>,
2411        db: &'r MagicDb,
2412        switch_endianness: bool,
2413        depth: usize,
2414    ) -> Result<(), Error> {
2415        let (ok, opt_match_res) = self.entry.matches(
2416            opt_source,
2417            magic,
2418            stream_kind,
2419            state,
2420            buf_base_offset,
2421            rule_base_offset,
2422            last_level_offset,
2423            haystack,
2424            switch_endianness,
2425            db,
2426            depth,
2427        )?;
2428
2429        let source = opt_source.unwrap_or("unknown");
2430        let line = self.entry.line;
2431
2432        if ok {
2433            // update magic with message if match is successful
2434            if let Some(msg) = self.entry.message.as_ref()
2435                && let Ok(msg) = msg.format_with(opt_match_res.as_ref()).inspect_err(|e| {
2436                    debug!("source={source} line={line} failed to format message: {e}")
2437                })
2438            {
2439                magic.push_message(msg);
2440            }
2441
2442            // we need to adjust stream offset in case of regex/search tests
2443            if let Some(mr) = opt_match_res {
2444                match &self.entry.test {
2445                    Test::String(t) => {
2446                        if t.has_length_mod() {
2447                            let o = mr.end_offset();
2448                            haystack.seek(SeekFrom::Start(o))?;
2449                        }
2450                    }
2451                    Test::Search(t) => {
2452                        if t.re_mods.contains(ReMod::StartOffsetUpdate) {
2453                            let o = mr.start_offset();
2454                            haystack.seek(SeekFrom::Start(o))?;
2455                        } else {
2456                            let o = mr.end_offset();
2457                            haystack.seek(SeekFrom::Start(o))?;
2458                        }
2459                    }
2460
2461                    Test::Regex(t) => {
2462                        if t.mods.contains(ReMod::StartOffsetUpdate) {
2463                            let o = mr.start_offset();
2464                            haystack.seek(SeekFrom::Start(o))?;
2465                        } else {
2466                            let o = mr.end_offset();
2467                            haystack.seek(SeekFrom::Start(o))?;
2468                        }
2469                    }
2470                    // other types do not need offset adjustement
2471                    _ => {}
2472                }
2473            }
2474
2475            if let Some(mimetype) = self.mimetype.as_ref() {
2476                magic.set_mime_type(Cow::Borrowed(mimetype));
2477            }
2478
2479            if let Some(apple_ty) = self.apple.as_ref() {
2480                magic.set_creator_code(Cow::Borrowed(apple_ty));
2481            }
2482
2483            if !self.exts.is_empty() {
2484                magic.insert_extensions(self.exts.iter().map(|s| s.as_str()));
2485            }
2486
2487            // NOTE: here we try to implement a similar logic as in file_magic_strength.
2488            // Sticking to the exact same strength computation logic is complicated due
2489            // to implementation differences. Let's wait and see if that is a real issue.
2490            let mut strength = self.entry.test_strength;
2491
2492            let continuation_level = self.entry.continuation_level().0 as u64;
2493            if self.entry.message.is_none() && continuation_level < 3 {
2494                strength = strength.saturating_add(continuation_level);
2495            }
2496
2497            if let Some(sm) = self.strength_mod.as_ref() {
2498                strength = sm.apply(strength);
2499            }
2500
2501            // entries with no message get a bonus
2502            if self.entry.message.is_none() {
2503                strength += 1
2504            }
2505
2506            magic.update_strength(strength);
2507
2508            let end_upper_level = haystack.lazy_stream_position();
2509
2510            // we have to fix rule_base_offset if
2511            // the rule_base_starts from end otherwise it
2512            // breaks some offset computation in match
2513            // see test_offset_bug_1 and test_offset_bug_2
2514            // they implement the same test logic yet indirect
2515            // offsets have to be different so that it works
2516            // in libmagic/file
2517            let rule_base_offset = if self.root {
2518                match self.entry.offset {
2519                    Offset::Direct(DirOffset::End(o)) => {
2520                        Some(haystack.offset_from_start(SeekFrom::End(o)))
2521                    }
2522                    _ => rule_base_offset,
2523                }
2524            } else {
2525                rule_base_offset
2526            };
2527
2528            for e in self.children.iter() {
2529                e.matches(
2530                    opt_source,
2531                    magic,
2532                    state,
2533                    stream_kind,
2534                    buf_base_offset,
2535                    rule_base_offset,
2536                    Some(end_upper_level),
2537                    haystack,
2538                    db,
2539                    switch_endianness,
2540                    depth,
2541                )?
2542            }
2543        }
2544
2545        Ok(())
2546    }
2547}
2548
/// Represents a parsed magic rule
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicRule {
    // identifier of the rule, assigned through `set_id`
    id: usize,
    // where the rule comes from, if known (see `source()`)
    source: Option<String>,
    // root of the tree of entries of the rule
    entries: EntryNode,
    // extensions collected by `try_finalize`, dependencies included
    extensions: HashSet<String>,
    /// score used for rule ranking
    score: u64,
    // set once `try_finalize` has succeeded
    finalized: bool,
}
2560
2561impl MagicRule {
2562    #[inline(always)]
2563    fn set_id(&mut self, id: usize) {
2564        self.id = id
2565    }
2566
2567    /// Fetches all the extensions defined in the magic rule. This
2568    /// function goes recursive and find extensions also defined in
2569    /// dependencies
2570    fn fetch_all_extensions(
2571        &self,
2572        deps: &HashMap<String, DependencyRule>,
2573        marked: &mut HashSet<String>,
2574    ) -> Result<HashSet<String>, ()> {
2575        let mut exts = HashSet::new();
2576        self.entries.update_exts_rec(&mut exts, deps, marked)?;
2577        Ok(exts)
2578    }
2579
2580    /// Computes the ranking score of a magic rule by walking
2581    /// tests recursively, dependencies included.
2582    fn compute_score(
2583        &self,
2584        depth: usize,
2585        deps: &HashMap<String, DependencyRule>,
2586        marked: &mut HashSet<String>,
2587    ) -> u64 {
2588        let mut score = 0;
2589        score += self.entries.entry.test_strength;
2590        self.entries
2591            .update_score_rec(depth, &mut score, deps, marked);
2592        score
2593    }
2594
2595    /// Finalize a rule by searching for all extensions and computing its score
2596    /// for ranking. In the `MagicRule` is already finalized it returns immediately.
2597    fn try_finalize(&mut self, deps: &HashMap<String, DependencyRule>) {
2598        if self.finalized {
2599            return;
2600        }
2601
2602        let Ok(exts) = self.fetch_all_extensions(deps, &mut HashSet::new()) else {
2603            return;
2604        };
2605
2606        self.extensions.extend(exts);
2607
2608        // fetch_all_extensions walks through all the dependencies
2609        // so there is no reason for compute_score to fail as it is walking
2610        // only some of them
2611        self.score = self.compute_score(0, deps, &mut HashSet::new());
2612        self.finalized = true
2613    }
2614
2615    #[inline]
2616    fn magic_entrypoint<'r, R: Read + Seek>(
2617        &'r self,
2618        magic: &mut Magic<'r>,
2619        stream_kind: StreamKind,
2620        haystack: &mut LazyCache<R>,
2621        db: &'r MagicDb,
2622        switch_endianness: bool,
2623        depth: usize,
2624    ) -> Result<(), Error> {
2625        self.entries.matches(
2626            self.source.as_deref(),
2627            magic,
2628            &mut MatchState::empty(),
2629            stream_kind,
2630            None,
2631            None,
2632            None,
2633            haystack,
2634            db,
2635            switch_endianness,
2636            depth,
2637        )
2638    }
2639
2640    #[inline]
2641    #[allow(clippy::too_many_arguments)]
2642    fn magic<'r, R: Read + Seek>(
2643        &'r self,
2644        magic: &mut Magic<'r>,
2645        stream_kind: StreamKind,
2646        buf_base_offset: Option<u64>,
2647        rule_base_offset: Option<u64>,
2648        haystack: &mut LazyCache<R>,
2649        db: &'r MagicDb,
2650        switch_endianness: bool,
2651        depth: usize,
2652    ) -> Result<(), Error> {
2653        self.entries.matches(
2654            self.source.as_deref(),
2655            magic,
2656            &mut MatchState::empty(),
2657            stream_kind,
2658            buf_base_offset,
2659            rule_base_offset,
2660            None,
2661            haystack,
2662            db,
2663            switch_endianness,
2664            depth,
2665        )
2666    }
2667
2668    /// Checks if the rule is for matching against text content
2669    ///
2670    /// # Returns
2671    ///
2672    /// * `bool` - True if the rule is for text files
2673    pub fn is_text(&self) -> bool {
2674        self.entries.entry.test.is_text()
2675            && self.entries.children.iter().all(|e| e.entry.test.is_text())
2676    }
2677
2678    /// Gets the rule's score used for ranking rules between them
2679    ///
2680    /// # Returns
2681    ///
2682    /// * `u64` - The rule's score
2683    #[inline(always)]
2684    pub fn score(&self) -> u64 {
2685        self.score
2686    }
2687
2688    /// Gets the rule's filename if any
2689    ///
2690    /// # Returns
2691    ///
2692    /// * `Option<&str>` - The rule's source if available
2693    #[inline(always)]
2694    pub fn source(&self) -> Option<&str> {
2695        self.source.as_deref()
2696    }
2697
2698    /// Gets the line number at which the rule is defined
2699    ///
2700    /// # Returns
2701    ///
2702    /// * `usize` - The rule's line number
2703    #[inline(always)]
2704    pub fn line(&self) -> usize {
2705        self.entries.entry.line
2706    }
2707
2708    /// Gets all the file extensions associated to the rule
2709    ///
2710    /// # Returns
2711    ///
2712    /// * `&HashSet<String>` - The set of all associated extensions
2713    #[inline(always)]
2714    pub fn extensions(&self) -> &HashSet<String> {
2715        &self.extensions
2716    }
2717}
2718
2719#[derive(Debug, Clone, Serialize, Deserialize)]
2720struct DependencyRule {
2721    name: String,
2722    rule: MagicRule,
2723}
2724
2725/// A parsed source of magic rules
2726///
2727/// # Methods
2728///
2729/// * `open` - Opens a magic file from a path
2730#[derive(Debug, Clone, Serialize, Deserialize)]
2731pub struct MagicSource {
2732    rules: Vec<MagicRule>,
2733    dependencies: HashMap<String, DependencyRule>,
2734}
2735
2736impl MagicSource {
2737    /// Opens and parses a magic file from a path
2738    ///
2739    /// # Arguments
2740    ///
2741    /// * `p` - The path to the magic file
2742    ///
2743    /// # Returns
2744    ///
2745    /// * `Result<Self, Error>` - The parsed magic file or an error
2746    pub fn open<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
2747        FileMagicParser::parse_file(p)
2748    }
2749}
2750
/// Continuation level of a magic entry, stored as a single byte.
///
/// [`MatchState`] uses the wrapped value as an index into its flag table.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct ContinuationLevel(u8);
2753
// FIXME: magic handles many more text encodings
/// Text encodings that stream classification can report
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum TextEncoding {
    /// Content validated as UTF-8 and flagged as ASCII by the validator
    Ascii,
    /// Content validated as UTF-8 but not flagged as ASCII
    Utf8,
    /// Content that failed UTF-8 validation yet still looks like text
    Unknown,
}

impl TextEncoding {
    /// Name of the encoding as it is written in magic messages
    const fn as_magic_str(&self) -> &'static str {
        match *self {
            Self::Ascii => "ASCII",
            Self::Utf8 => "UTF-8",
            Self::Unknown => "Unknown",
        }
    }
}
2771
2772#[derive(Debug, PartialEq, Eq, Clone, Copy)]
2773enum StreamKind {
2774    Binary,
2775    Text(TextEncoding),
2776}
2777
2778impl StreamKind {
2779    const fn is_text(&self) -> bool {
2780        matches!(self, StreamKind::Text(_))
2781    }
2782}
2783
2784#[derive(Debug)]
2785struct MatchState {
2786    continuation_levels: [bool; 256],
2787}
2788
2789impl MatchState {
2790    #[inline(always)]
2791    fn empty() -> Self {
2792        MatchState {
2793            continuation_levels: [false; 256],
2794        }
2795    }
2796
2797    #[inline(always)]
2798    fn get_continuation_level(&mut self, level: &ContinuationLevel) -> bool {
2799        self.continuation_levels
2800            .get(level.0 as usize)
2801            .cloned()
2802            .unwrap_or_default()
2803    }
2804
2805    #[inline(always)]
2806    fn set_continuation_level(&mut self, level: ContinuationLevel) {
2807        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2808            *b = true
2809        }
2810    }
2811
2812    #[inline(always)]
2813    fn clear_continuation_level(&mut self, level: &ContinuationLevel) {
2814        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2815            *b = false;
2816        }
2817    }
2818}
2819
2820/// Represents a file magic detection result
2821#[derive(Debug, Default)]
2822pub struct Magic<'m> {
2823    stream_kind: Option<StreamKind>,
2824    source: Option<Cow<'m, str>>,
2825    message: Vec<Cow<'m, str>>,
2826    mime_type: Option<Cow<'m, str>>,
2827    creator_code: Option<Cow<'m, str>>,
2828    strength: u64,
2829    exts: HashSet<Cow<'m, str>>,
2830    is_default: bool,
2831}
2832
2833impl<'m> Magic<'m> {
2834    #[inline(always)]
2835    fn set_source(&mut self, source: Option<&'m str>) {
2836        self.source = source.map(Cow::Borrowed);
2837    }
2838
2839    #[inline(always)]
2840    fn set_stream_kind(&mut self, stream_kind: StreamKind) {
2841        self.stream_kind = Some(stream_kind)
2842    }
2843
2844    #[inline(always)]
2845    fn reset(&mut self) {
2846        self.stream_kind = None;
2847        self.source = None;
2848        self.message.clear();
2849        self.mime_type = None;
2850        self.creator_code = None;
2851        self.strength = 0;
2852        self.exts.clear();
2853        self.is_default = false;
2854    }
2855
2856    /// Converts borrowed data into owned data. This method involves
2857    /// data cloning, so you must use this method only if you need to
2858    /// extend the lifetime of a [`Magic`] struct.
2859    ///
2860    /// # Returns
2861    ///
2862    /// * `Magic<'owned>` - A new [`Magic`] with owned data
2863    #[inline]
2864    pub fn into_owned<'owned>(self) -> Magic<'owned> {
2865        Magic {
2866            stream_kind: self.stream_kind,
2867            source: self.source.map(|s| Cow::Owned(s.into_owned())),
2868            message: self
2869                .message
2870                .into_iter()
2871                .map(Cow::into_owned)
2872                .map(Cow::Owned)
2873                .collect(),
2874            mime_type: self.mime_type.map(|m| Cow::Owned(m.into_owned())),
2875            creator_code: self.creator_code.map(|m| Cow::Owned(m.into_owned())),
2876            strength: self.strength,
2877            exts: self
2878                .exts
2879                .into_iter()
2880                .map(|e| Cow::Owned(e.into_owned()))
2881                .collect(),
2882            is_default: self.is_default,
2883        }
2884    }
2885
2886    /// Gets the formatted message describing the file type
2887    ///
2888    /// # Returns
2889    ///
2890    /// * `String` - The formatted message
2891    #[inline(always)]
2892    pub fn message(&self) -> String {
2893        let mut out = String::new();
2894        for (i, m) in self.message.iter().enumerate() {
2895            if let Some(s) = m.strip_prefix(r#"\b"#) {
2896                out.push_str(s);
2897            } else {
2898                // don't put space on first string
2899                if i > 0 {
2900                    out.push(' ');
2901                }
2902                out.push_str(m);
2903            }
2904        }
2905        out
2906    }
2907
2908    /// Returns an iterator over the individual parts of the magic message
2909    ///
2910    /// A magic message is typically composed of multiple parts, each appended
2911    /// during successful magic tests. This method provides an efficient way to
2912    /// iterate over these parts without concatenating them into a new string,
2913    /// as done when calling [`Magic::message`].
2914    ///
2915    /// # Returns
2916    ///
2917    /// * `impl Iterator<Item = &str>` - An iterator yielding string slices of each message part
2918    #[inline]
2919    pub fn message_parts(&self) -> impl Iterator<Item = &str> {
2920        self.message.iter().map(|p| p.as_ref())
2921    }
2922
2923    #[inline(always)]
2924    fn update_strength(&mut self, value: u64) {
2925        self.strength = self.strength.saturating_add(value);
2926        debug!("updated strength = {:?}", self.strength)
2927    }
2928
2929    /// Gets the detected MIME type
2930    ///
2931    /// # Returns
2932    ///
2933    /// * `&str` - The MIME type or default based on stream kind
2934    #[inline(always)]
2935    pub fn mime_type(&self) -> &str {
2936        self.mime_type.as_deref().unwrap_or(match self.stream_kind {
2937            Some(StreamKind::Text(_)) => DEFAULT_TEXT_MIMETYPE,
2938            Some(StreamKind::Binary) | None => DEFAULT_BIN_MIMETYPE,
2939        })
2940    }
2941
2942    #[inline(always)]
2943    fn push_message<'a: 'm>(&mut self, msg: Cow<'a, str>) {
2944        if !msg.is_empty() {
2945            debug!("pushing message: msg={msg} len={}", msg.len());
2946            self.message.push(msg);
2947        }
2948    }
2949
2950    #[inline(always)]
2951    fn set_mime_type<'a: 'm>(&mut self, mime: Cow<'a, str>) {
2952        if self.mime_type.is_none() {
2953            debug!("insert mime: {:?}", mime);
2954            self.mime_type = Some(mime)
2955        }
2956    }
2957
2958    #[inline(always)]
2959    fn set_creator_code<'a: 'm>(&mut self, apple_ty: Cow<'a, str>) {
2960        if self.creator_code.is_none() {
2961            debug!("insert apple type: {apple_ty:?}");
2962            self.creator_code = Some(apple_ty)
2963        }
2964    }
2965
2966    #[inline(always)]
2967    fn insert_extensions<'a: 'm, I: Iterator<Item = &'a str>>(&mut self, exts: I) {
2968        if self.exts.is_empty() {
2969            self.exts.extend(exts.filter_map(|e| {
2970                if e.is_empty() {
2971                    None
2972                } else {
2973                    Some(Cow::Borrowed(e))
2974                }
2975            }));
2976        }
2977    }
2978
2979    /// Gets the confidence score of the detection. This
2980    /// value is used to sort [`Magic`] in [`MagicDb::best_magic`]
2981    /// and [`MagicDb::all_magics`].
2982    ///
2983    /// # Returns
2984    ///
2985    /// * `u64` - The confidence score attributed to that [`Magic`]
2986    #[inline(always)]
2987    pub fn strength(&self) -> u64 {
2988        self.strength
2989    }
2990
2991    /// Gets the filename where the magic rule was defined
2992    ///
2993    /// # Returns
2994    ///
2995    /// * `Option<&str>` - The source if available
2996    #[inline(always)]
2997    pub fn source(&self) -> Option<&str> {
2998        self.source.as_deref()
2999    }
3000
3001    /// Gets the Apple creator code if available
3002    ///
3003    /// # Returns
3004    ///
3005    /// * `Option<&str>` - The creator code if available
3006    #[inline(always)]
3007    pub fn creator_code(&self) -> Option<&str> {
3008        self.creator_code.as_deref()
3009    }
3010
3011    /// Gets the possible file extensions for the detected [`Magic`]
3012    ///
3013    /// # Returns
3014    ///
3015    /// * `&HashSet<Cow<'m, str>>` - The set of possible extensions
3016    #[inline(always)]
3017    pub fn extensions(&self) -> &HashSet<Cow<'m, str>> {
3018        &self.exts
3019    }
3020
3021    /// Checks if this is a default fallback detection
3022    ///
3023    /// # Returns
3024    ///
3025    /// * `bool` - True if this is a default detection
3026    #[inline(always)]
3027    pub fn is_default(&self) -> bool {
3028        self.is_default
3029    }
3030}
3031
3032/// Represents a database of [`MagicRule`]
3033#[derive(Debug, Default, Clone, Serialize, Deserialize)]
3034pub struct MagicDb {
3035    rule_id: usize,
3036    rules: Vec<MagicRule>,
3037    dependencies: HashMap<String, DependencyRule>,
3038}
3039
/// Returns `true` if the byte stream is likely text.
///
/// Empty input and any input containing a NUL byte are never text.
/// Otherwise every byte is put in one of two classes: "printable" (tab,
/// line feed, carriage return, or `0x20..=0x7E`) and "other" (everything
/// else: remaining control bytes, DEL and bytes above `0x7F`). The input
/// is considered text when more than 85% of it is printable and less than
/// 20% of it is other.
#[inline(always)]
fn is_likely_text(bytes: &[u8]) -> bool {
    if bytes.is_empty() {
        return false;
    }

    let mut printable = 0usize;
    let mut other = 0usize;

    for &byte in bytes {
        match byte {
            // a single NUL is enough to rule text out
            0x00 => return false,
            b'\t' | b'\n' | b'\r' | 0x20..=0x7E => printable += 1,
            _ => other += 1,
        }
    }

    let total = bytes.len() as f64;
    let printable_ratio = printable as f64 / total;
    let other_ratio = other as f64 / total;

    // Heuristic thresholds (adjust as needed):
    printable_ratio > 0.85 && other_ratio < 0.20
}
3082
3083#[inline(always)]
3084fn guess_stream_kind<S: AsRef<[u8]>>(stream: S) -> StreamKind {
3085    let buf = stream.as_ref();
3086
3087    match run_utf8_validation(buf) {
3088        Ok(is_ascii) => {
3089            if is_ascii {
3090                StreamKind::Text(TextEncoding::Ascii)
3091            } else {
3092                StreamKind::Text(TextEncoding::Utf8)
3093            }
3094        }
3095        Err(e) => {
3096            if is_likely_text(&buf[e.valid_up_to..]) {
3097                StreamKind::Text(TextEncoding::Unknown)
3098            } else {
3099                StreamKind::Binary
3100            }
3101        }
3102    }
3103}
3104
3105impl MagicDb {
3106    /// Prepares an [`LazyCache`] configured with optimal parameters for
3107    /// **read** operations done during file identification
3108    pub fn optimal_lazy_cache<R: Read + Seek>(f: R) -> Result<LazyCache<R>, io::Error> {
3109        Ok(LazyCache::<R>::from_read_seek(f)
3110            .and_then(|lc| lc.with_hot_cache(2 * FILE_BYTES_MAX))?)
3111        .map(|lc| lc.with_warm_cache(100 << 20))
3112    }
3113
3114    /// Creates a new empty database
3115    ///
3116    /// # Returns
3117    ///
3118    /// * [`MagicDb`] - A new empty database
3119    pub fn new() -> Self {
3120        Self::default()
3121    }
3122
3123    #[inline(always)]
3124    fn next_rule_id(&mut self) -> usize {
3125        let t = self.rule_id;
3126        self.rule_id += 1;
3127        t
3128    }
3129
3130    #[inline(always)]
3131    fn try_json<R: Read + Seek>(
3132        haystack: &mut LazyCache<R>,
3133        stream_kind: StreamKind,
3134        magic: &mut Magic,
3135    ) -> Result<bool, Error> {
3136        // cannot be json if content is binary
3137        if matches!(stream_kind, StreamKind::Binary) {
3138            return Ok(false);
3139        }
3140
3141        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?.trim_ascii();
3142
3143        let Some((start, end)) = find_json_boundaries(buf) else {
3144            return Ok(false);
3145        };
3146
3147        // if anything else than whitespace before start
3148        // this is not json
3149        for c in buf[0..start].iter() {
3150            if !c.is_ascii_whitespace() {
3151                return Ok(false);
3152            }
3153        }
3154
3155        let mut is_ndjson = false;
3156
3157        trace!("maybe a json document");
3158        let ok = serde_json::from_slice::<serde_json::Value>(&buf[start..=end]).is_ok();
3159        if !ok {
3160            return Ok(false);
3161        }
3162
3163        // we are sure it is json now we must look if we are ndjson
3164        if end + 1 < buf.len() {
3165            // after first json
3166            let buf = &buf[end + 1..];
3167            if let Some((second_start, second_end)) = find_json_boundaries(buf) {
3168                // there is a new line between the two json docs
3169                if memchr(b'\n', &buf[..second_start]).is_some() {
3170                    trace!("might be ndjson");
3171                    is_ndjson = serde_json::from_slice::<serde_json::Value>(
3172                        &buf[second_start..=second_end],
3173                    )
3174                    .is_ok();
3175                }
3176            }
3177        }
3178
3179        if is_ndjson {
3180            magic.push_message(Cow::Borrowed("New Line Delimited"));
3181            magic.set_mime_type(Cow::Borrowed("application/x-ndjson"));
3182            magic.insert_extensions(["ndjson", "jsonl"].into_iter());
3183        } else {
3184            magic.set_mime_type(Cow::Borrowed("application/json"));
3185            magic.insert_extensions(["json"].into_iter());
3186        }
3187
3188        magic.push_message(Cow::Borrowed("JSON text data"));
3189        magic.set_source(Some(HARDCODED_SOURCE));
3190        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3191        Ok(true)
3192    }
3193
3194    #[inline(always)]
3195    fn try_csv<R: Read + Seek>(
3196        haystack: &mut LazyCache<R>,
3197        stream_kind: StreamKind,
3198        magic: &mut Magic,
3199    ) -> Result<bool, Error> {
3200        // cannot be csv if content is binary
3201        let StreamKind::Text(enc) = stream_kind else {
3202            return Ok(false);
3203        };
3204
3205        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3206        let mut reader = csv::Reader::from_reader(io::Cursor::new(buf));
3207        let mut records = reader.records();
3208
3209        let Some(Ok(first)) = records.next() else {
3210            return Ok(false);
3211        };
3212
3213        // very not likely a CSV otherwise all programming
3214        // languages having ; line terminator would be
3215        // considered as CSV
3216        if first.len() <= 1 {
3217            return Ok(false);
3218        }
3219
3220        // we already parsed first line
3221        let mut n = 1;
3222        for i in records.take(9) {
3223            if let Ok(rec) = i {
3224                if first.len() != rec.len() {
3225                    return Ok(false);
3226                }
3227            } else {
3228                return Ok(false);
3229            }
3230            n += 1;
3231        }
3232
3233        // we need at least 10 lines
3234        if n != 10 {
3235            return Ok(false);
3236        }
3237
3238        magic.set_mime_type(Cow::Borrowed("text/csv"));
3239        magic.push_message(Cow::Borrowed("CSV"));
3240        magic.push_message(Cow::Borrowed(enc.as_magic_str()));
3241        magic.push_message(Cow::Borrowed("text"));
3242        magic.insert_extensions(["csv"].into_iter());
3243        magic.set_source(Some(HARDCODED_SOURCE));
3244        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3245        Ok(true)
3246    }
3247
3248    #[inline(always)]
3249    fn try_tar<R: Read + Seek>(
3250        haystack: &mut LazyCache<R>,
3251        stream_kind: StreamKind,
3252        magic: &mut Magic,
3253    ) -> Result<bool, Error> {
3254        // cannot be json if content is not binary
3255        if !matches!(stream_kind, StreamKind::Binary) {
3256            return Ok(false);
3257        }
3258
3259        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3260        let mut ar = Archive::new(io::Cursor::new(buf));
3261
3262        let Ok(mut entries) = ar.entries() else {
3263            return Ok(false);
3264        };
3265
3266        let Some(Ok(first)) = entries.next() else {
3267            return Ok(false);
3268        };
3269
3270        let header = first.header();
3271
3272        if header.as_ustar().is_some() {
3273            magic.push_message(Cow::Borrowed("POSIX tar archive"));
3274        } else if header.as_gnu().is_some() {
3275            magic.push_message(Cow::Borrowed("POSIX tar archive (GNU)"));
3276        } else {
3277            magic.push_message(Cow::Borrowed("tar archive"));
3278        }
3279
3280        magic.set_mime_type(Cow::Borrowed("application/x-tar"));
3281        magic.set_source(Some(HARDCODED_SOURCE));
3282        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3283        magic.insert_extensions(["tar"].into_iter());
3284        Ok(true)
3285    }
3286
3287    #[inline(always)]
3288    fn try_hard_magic<R: Read + Seek>(
3289        haystack: &mut LazyCache<R>,
3290        stream_kind: StreamKind,
3291        magic: &mut Magic,
3292    ) -> Result<bool, Error> {
3293        Ok(Self::try_json(haystack, stream_kind, magic)?
3294            || Self::try_csv(haystack, stream_kind, magic)?
3295            || Self::try_tar(haystack, stream_kind, magic)?)
3296    }
3297
3298    #[inline(always)]
3299    fn magic_default<'m, R: Read + Seek>(
3300        cache: &mut LazyCache<R>,
3301        stream_kind: StreamKind,
3302        magic: &mut Magic<'m>,
3303    ) {
3304        magic.set_source(Some(HARDCODED_SOURCE));
3305        magic.set_stream_kind(stream_kind);
3306        magic.is_default = true;
3307
3308        if cache.data_size() == 0 {
3309            magic.push_message(Cow::Borrowed("empty"));
3310            magic.set_mime_type(Cow::Borrowed(DEFAULT_BIN_MIMETYPE));
3311        }
3312
3313        match stream_kind {
3314            StreamKind::Binary => {
3315                magic.push_message(Cow::Borrowed("data"));
3316            }
3317            StreamKind::Text(e) => {
3318                magic.push_message(Cow::Borrowed(e.as_magic_str()));
3319                magic.push_message(Cow::Borrowed("text"));
3320            }
3321        }
3322    }
3323
3324    /// Loads rules from a [`MagicSource`]
3325    ///
3326    /// # Arguments
3327    ///
3328    /// * `mf` - The [`MagicSource`] to load rules from
3329    ///
3330    /// # Returns
3331    ///
3332    /// * `Result<&mut Self, Error>` - Self for chaining or an error
3333    pub fn load(&mut self, mf: MagicSource) -> Result<&mut Self, Error> {
3334        for rule in mf.rules.into_iter() {
3335            let mut rule = rule;
3336            rule.set_id(self.next_rule_id());
3337
3338            self.rules.push(rule);
3339        }
3340
3341        self.dependencies.extend(mf.dependencies);
3342        self.prepare();
3343        Ok(self)
3344    }
3345
3346    /// Gets all rules in the database
3347    ///
3348    /// # Returns
3349    ///
3350    /// * `&[MagicRule]` - A slice of all rules
3351    pub fn rules(&self) -> &[MagicRule] {
3352        &self.rules
3353    }
3354
3355    #[inline]
3356    fn first_magic_with_stream_kind<R: Read + Seek>(
3357        &self,
3358        haystack: &mut LazyCache<R>,
3359        stream_kind: StreamKind,
3360        extension: Option<&str>,
3361    ) -> Result<Magic<'_>, Error> {
3362        // re-using magic makes this function faster
3363        let mut magic = Magic::default();
3364
3365        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3366            return Ok(magic);
3367        }
3368
3369        let mut marked = vec![false; self.rules.len()];
3370
3371        macro_rules! do_magic {
3372            ($rule: expr) => {{
3373                $rule.magic_entrypoint(&mut magic, stream_kind, haystack, &self, false, 0)?;
3374
3375                if !magic.message.is_empty() {
3376                    magic.set_stream_kind(stream_kind);
3377                    magic.set_source($rule.source.as_deref());
3378                    return Ok(magic);
3379                }
3380
3381                magic.reset();
3382            }};
3383        }
3384
3385        if let Some(ext) = extension.map(|e| e.to_lowercase())
3386            && !ext.is_empty()
3387        {
3388            for rule in self.rules.iter().filter(|r| r.extensions.contains(&ext)) {
3389                do_magic!(rule);
3390                if let Some(f) = marked.get_mut(rule.id) {
3391                    *f = true
3392                }
3393            }
3394        }
3395
3396        for rule in self
3397            .rules
3398            .iter()
3399            // we don't run again rules run by extension
3400            .filter(|r| !*marked.get(r.id).unwrap_or(&false))
3401        {
3402            do_magic!(rule)
3403        }
3404
3405        Self::magic_default(haystack, stream_kind, &mut magic);
3406
3407        Ok(magic)
3408    }
3409
3410    /// Detects file [`Magic`] stopping at the first matching magic. Magic
3411    /// rules are evaluated from the best to the least relevant, so this method
3412    /// returns most of the time the best magic. For the rare cases where
3413    /// it doesn't or if the best result is always required, use [`MagicDb::best_magic`]
3414    ///
3415    /// # Arguments
3416    ///
3417    /// * `r` - A readable and seekable input
3418    /// * `extension` - Optional file extension to use for acceleration
3419    ///
3420    /// # Returns
3421    ///
3422    /// * `Result<Magic<'_>, Error>` - The detection result or an error
3423    pub fn first_magic<R: Read + Seek>(
3424        &self,
3425        r: &mut R,
3426        extension: Option<&str>,
3427    ) -> Result<Magic<'_>, Error> {
3428        let mut cache = Self::optimal_lazy_cache(r)?;
3429        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3430        self.first_magic_with_stream_kind(&mut cache, stream_kind, extension)
3431    }
3432
3433    /// An alternative to [`Self::first_magic`] using a [`LazyCache`]
3434    /// to detects file [`Magic`] stopping at the first matching magic. Magic
3435    /// rules are evaluated from the best to the least relevant, so this method
3436    /// returns most of the time the best magic. For the rare cases where
3437    /// it doesn't or if the best result is always required, use [`MagicDb::best_magic`]
3438    ///
3439    /// # Arguments
3440    ///
3441    /// * `cache` - A [`LazyCache`] used for read operations
3442    /// * `extension` - Optional file extension to use for acceleration
3443    ///
3444    /// # Returns
3445    ///
3446    /// * `Result<Magic<'_>, Error>` - The detection result or an error
3447    ///
3448    /// # Notes
3449    ///
3450    /// * Use this method **only** if you need to re-use a [`LazyCache`] for future **read** operations.
3451    /// * Use [`Self::optimal_lazy_cache`] to prepare an optimal [`LazyCache`]
3452    pub fn first_magic_with_lazy_cache<R: Read + Seek>(
3453        &self,
3454        cache: &mut LazyCache<R>,
3455        extension: Option<&str>,
3456    ) -> Result<Magic<'_>, Error> {
3457        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3458        self.first_magic_with_stream_kind(cache, stream_kind, extension)
3459    }
3460
3461    #[inline(always)]
3462    fn all_magics_sort_with_stream_kind<R: Read + Seek>(
3463        &self,
3464        haystack: &mut LazyCache<R>,
3465        stream_kind: StreamKind,
3466    ) -> Result<Vec<Magic<'_>>, Error> {
3467        let mut out = Vec::new();
3468
3469        let mut magic = Magic::default();
3470
3471        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3472            out.push(magic);
3473            magic = Magic::default();
3474        }
3475
3476        for rule in self.rules.iter() {
3477            rule.magic_entrypoint(&mut magic, stream_kind, haystack, self, false, 0)?;
3478
3479            // it is possible we have a strength with no message
3480            if !magic.message.is_empty() {
3481                magic.set_stream_kind(stream_kind);
3482                magic.set_source(rule.source.as_deref());
3483                out.push(magic);
3484                magic = Magic::default();
3485            }
3486
3487            magic.reset();
3488        }
3489
3490        Self::magic_default(haystack, stream_kind, &mut magic);
3491        out.push(magic);
3492
3493        out.sort_by_key(|b| std::cmp::Reverse(b.strength()));
3494
3495        Ok(out)
3496    }
3497
3498    /// Detects all [`Magic`] matching a given content.
3499    ///
3500    /// # Arguments
3501    ///
3502    /// * `r` - A readable and seekable input
3503    ///
3504    /// # Returns
3505    ///
3506    /// * `Result<Vec<Magic<'_>>, Error>` - All detection results sorted by strength or an error
3507    pub fn all_magics<R: Read + Seek>(&self, r: &mut R) -> Result<Vec<Magic<'_>>, Error> {
3508        let mut cache = Self::optimal_lazy_cache(r)?;
3509        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3510        self.all_magics_sort_with_stream_kind(&mut cache, stream_kind)
3511    }
3512
3513    /// An alternative to [`Self::all_magics`] using a [`LazyCache`]
3514    /// to detects all [`Magic`] matching a given content.
3515    ///
3516    /// # Arguments
3517    ///
3518    /// * `r` - A readable and seekable input
3519    ///
3520    /// # Returns
3521    ///
3522    /// * `Result<Vec<Magic<'_>>, Error>` - All detection results sorted by strength or an error
3523    ///
3524    /// # Notes
3525    ///
3526    /// * Use this method **only** if you need to re-use a [`LazyCache`] for future **read** operations.
3527    /// * Use [`Self::optimal_lazy_cache`] to prepare an optimal [`LazyCache`]
3528    pub fn all_magics_with_lazy_cache<R: Read + Seek>(
3529        &self,
3530        cache: &mut LazyCache<R>,
3531    ) -> Result<Vec<Magic<'_>>, Error> {
3532        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3533        self.all_magics_sort_with_stream_kind(cache, stream_kind)
3534    }
3535
3536    #[inline(always)]
3537    fn best_magic_with_stream_kind<R: Read + Seek>(
3538        &self,
3539        haystack: &mut LazyCache<R>,
3540        stream_kind: StreamKind,
3541    ) -> Result<Magic<'_>, Error> {
3542        let magics = self.all_magics_sort_with_stream_kind(haystack, stream_kind)?;
3543
3544        // magics is guaranteed to contain at least the
3545        // default magic but we unwrap to avoid any panic
3546        Ok(magics.into_iter().next().unwrap_or_else(|| {
3547            let mut magic = Magic::default();
3548            Self::magic_default(haystack, stream_kind, &mut magic);
3549            magic
3550        }))
3551    }
3552
3553    /// Detects the best [`Magic`] matching a given content.
3554    ///
3555    /// # Arguments
3556    ///
3557    /// * `r` - A readable and seekable input
3558    ///
3559    /// # Returns
3560    ///
3561    /// * `Result<Magic<'_>, Error>` - The best detection result or an error
3562    pub fn best_magic<R: Read + Seek>(&self, r: &mut R) -> Result<Magic<'_>, Error> {
3563        let mut cache = Self::optimal_lazy_cache(r)?;
3564        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3565        self.best_magic_with_stream_kind(&mut cache, stream_kind)
3566    }
3567
3568    /// An alternative to [`Self::best_magic`] using a [`LazyCache`]
3569    /// to detect the best [`Magic`] matching a given content.
3570    ///
3571    /// # Arguments
3572    ///
3573    /// * `r` - A readable and seekable input
3574    ///
3575    /// # Returns
3576    ///
3577    /// * `Result<Magic<'_>, Error>` - The best detection result or an error
3578    ///
3579    /// # Notes
3580    ///
3581    /// * Use this method **only** if you need to re-use a [`LazyCache`] for future **read** operations.
3582    /// * Use [`Self::optimal_lazy_cache`] to prepare an optimal [`LazyCache`]
3583    pub fn best_magic_with_lazy_cache<R: Read + Seek>(
3584        &self,
3585        cache: &mut LazyCache<R>,
3586    ) -> Result<Magic<'_>, Error> {
3587        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3588        self.best_magic_with_stream_kind(cache, stream_kind)
3589    }
3590
3591    /// Serializes the database to a generic writer implementing [`io::Write`]
3592    ///
3593    /// # Returns
3594    ///
3595    /// * `Result<(), Error>` - The serialized database or an error
3596    pub fn serialize<W: Write>(self, w: &mut W) -> Result<(), Error> {
3597        let mut encoder = GzEncoder::new(w, Compression::best());
3598
3599        bincode::serde::encode_into_std_write(&self, &mut encoder, bincode::config::standard())?;
3600        encoder.finish()?;
3601        Ok(())
3602    }
3603
3604    /// Deserializes the database from a generic reader implementing [`io::Read`]
3605    ///
3606    /// # Arguments
3607    ///
3608    /// * `r` - The reader to deserialize from
3609    ///
3610    /// # Returns
3611    ///
3612    /// * `Result<Self, Error>` - The deserialized database or an error
3613    pub fn deserialize<R: Read>(r: &mut R) -> Result<Self, Error> {
3614        let mut buf = vec![];
3615        let mut gz = GzDecoder::new(r);
3616        gz.read_to_end(&mut buf).map_err(|e| {
3617            bincode::error::DecodeError::OtherString(format!("failed to read: {e}"))
3618        })?;
3619        let (sdb, _): (MagicDb, usize) =
3620            bincode::serde::decode_from_slice(&buf, bincode::config::standard())?;
3621        Ok(sdb)
3622    }
3623
3624    #[inline(always)]
3625    fn prepare(&mut self) {
3626        self.rules
3627            .iter_mut()
3628            .for_each(|r| r.try_finalize(&self.dependencies));
3629
3630        // put text rules at the end
3631        self.rules.sort_by_key(|r| (r.is_text(), -(r.score as i64)));
3632    }
3633}
3634
3635#[cfg(test)]
3636mod tests {
3637    use std::io::Cursor;
3638
3639    use regex::bytes::Regex;
3640
3641    use crate::utils::unix_local_time_to_string;
3642
3643    use super::*;
3644
    // Builds a `LazyCache` over an in-memory `Cursor` wrapping the given
    // literal. Panics (via `unwrap`) if the cache cannot be created.
    macro_rules! lazy_cache {
        ($l: literal) => {
            LazyCache::from_read_seek(Cursor::new($l)).unwrap()
        };
    }
3650
3651    fn first_magic(
3652        rule: &str,
3653        content: &[u8],
3654        stream_kind: StreamKind,
3655    ) -> Result<Magic<'static>, Error> {
3656        let mut md = MagicDb::new();
3657        md.load(
3658            FileMagicParser::parse_str(rule, None)
3659                .inspect_err(|e| eprintln!("{e}"))
3660                .unwrap(),
3661        )
3662        .unwrap();
3663        let mut reader = LazyCache::from_read_seek(Cursor::new(content)).unwrap();
3664        let v = md.best_magic_with_stream_kind(&mut reader, stream_kind)?;
3665        Ok(v.into_owned())
3666    }
3667
    /// helper macro to debug tests
    ///
    /// Installs a `tracing_subscriber` formatter with the maximum level set
    /// to `TRACE`. The result of `try_init` is not inspected.
    #[allow(unused_macros)]
    macro_rules! enable_trace {
        () => {
            tracing_subscriber::fmt()
                .with_max_level(tracing_subscriber::filter::LevelFilter::TRACE)
                .try_init();
        };
    }
3677
    // Asserts that the rule literal parses: the parse error, if any, is
    // printed to stderr before `unwrap` panics.
    macro_rules! parse_assert {
        ($rule:literal) => {
            FileMagicParser::parse_str($rule, None)
                .inspect_err(|e| eprintln!("{e}"))
                .unwrap();
        };
    }
3685
    // Runs `first_magic` on `$content` as a binary stream.
    // - two arguments: unwraps the result and evaluates to the magic
    // - three arguments: additionally asserts the magic message equals `$message`
    // NOTE(review): the two-argument form only unwraps; it does not check
    // `is_default()`, so it may not actually assert a match — confirm intent.
    macro_rules! assert_magic_match_bin {
        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Binary).unwrap() }};
        ($rule: literal, $content:literal, $message:expr) => {{
            assert_eq!(
                first_magic($rule, $content, StreamKind::Binary)
                    .unwrap()
                    .message(),
                $message
            );
        }};
    }
3697
    // Runs `first_magic` on `$content` as a UTF-8 text stream.
    // - two arguments: unwraps the result and evaluates to the magic
    // - three arguments: additionally asserts the magic message equals `$message`
    // NOTE(review): the two-argument form only unwraps; it does not check
    // `is_default()`, so it may not actually assert a match — confirm intent.
    macro_rules! assert_magic_match_text {
        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8)).unwrap() }};
        ($rule: literal, $content:literal, $message:expr) => {{
            assert_eq!(
                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
                    .unwrap()
                    .message(),
                $message
            );
        }};
    }
3709
    // Asserts that, for `$content` handled as a UTF-8 text stream, the magic
    // returned by `first_magic` reports `is_default()`.
    macro_rules! assert_magic_not_match_text {
        ($rule: literal, $content:literal) => {{
            assert!(
                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
                    .unwrap()
                    .is_default()
            );
        }};
    }
3719
    // Asserts that, for `$content` handled as a binary stream, the magic
    // returned by `first_magic` reports `is_default()`.
    macro_rules! assert_magic_not_match_bin {
        ($rule: literal, $content:literal) => {{
            assert!(
                first_magic($rule, $content, StreamKind::Binary)
                    .unwrap()
                    .is_default()
            );
        }};
    }
3729
    // Exercises `regex` rules: message formatting from the matched text,
    // matching raw bytes, continuation rules relative to a match (with and
    // without the `/s` flag) and `^` anchoring in text content.
    #[test]
    fn test_regex() {
        assert_magic_match_text!(
            r#"
0	regex/1024 \^#![[:space:]]*/usr/bin/env[[:space:]]+
!:mime	text/x-shellscript
>&0  regex/64 .*($|\\b) %s shell script text executable
    "#,
            br#"#!/usr/bin/env bash
        echo hello world"#,
            // the magic generated
            "bash shell script text executable"
        );

        // sanity check: the regex crate matches these raw bytes with unicode disabled
        let re = Regex::new(r"(?-u)\x42\x82").unwrap();
        assert!(re.is_match(b"\x42\x82"));

        assert_magic_match_bin!(
            r#"0 regex \x42\x82 binary regex match"#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82"
        );

        // test regex continuation after match
        assert_magic_match_bin!(
            r#"
            0 regex \x42\x82
            >&0 string \xde\xad\xbe\xef it works
            "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        assert_magic_match_bin!(
            r#"
            0 regex/s \x42\x82
            >&0 string \x42\x82\xde\xad\xbe\xef it works
            "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        // ^ must match start of line when matching text
        assert_magic_match_text!(
            r#"
0	regex/1024 \^HelloWorld$ HelloWorld String"#,
            br#"
// this is a comment after an empty line
HelloWorld
            "#
        );
    }
3779
    // Exercises the `string` test with the `/w`, `/C`, `/c`, `/f` and `/W`
    // modifiers on text content.
    #[test]
    fn test_string_with_mods() {
        assert_magic_match_text!(
            r#"0	string/w	#!\ \ \ /usr/bin/env\ bash	BASH
        "#,
            b"#! /usr/bin/env bash i
        echo hello world"
        );

        // test uppercase insensitive
        assert_magic_match_text!(
            r#"0	string/C	HelloWorld	it works
        "#,
            b"helloworld"
        );

        assert_magic_not_match_text!(
            r#"0	string/C	HelloWorld	it works
        "#,
            b"hELLOwORLD"
        );

        // test lowercase insensitive
        assert_magic_match_text!(
            r#"0	string/c	HelloWorld	it works
        "#,
            b"HELLOWORLD"
        );

        assert_magic_not_match_text!(
            r#"0	string/c	HelloWorld	it works
        "#,
            b"helloworld"
        );

        // test full word match
        assert_magic_match_text!(
            r#"0	string/f	#!/usr/bin/env\ bash	BASH
        "#,
            b"#!/usr/bin/env bash"
        );

        assert_magic_not_match_text!(
            r#"0	string/f	#!/usr/bin/python PYTHON"#,
            b"#!/usr/bin/pythonic"
        );

        // testing whitespace compacting
        assert_magic_match_text!(
            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
            b"#!/usr/bin/env    python"
        );

        assert_magic_not_match_text!(
            r#"0	string/W	#!/usr/bin/env\ \ python  PYTHON"#,
            b"#!/usr/bin/env python"
        );
    }
3838
    // Exercises the `search` test with modifiers, including how `/s` changes
    // the offset used by a following `&0` relative rule.
    #[test]
    fn test_search_with_mods() {
        assert_magic_match_text!(
            r#"0	search/1/fwt	#!\ /usr/bin/luatex	LuaTex script text executable"#,
            b"#!          /usr/bin/luatex "
        );

        // test matching from the beginning
        assert_magic_match_text!(
            r#"
            0	search/s	/usr/bin/env
            >&0 string /usr/bin/env it works
            "#,
            b"#!/usr/bin/env    python"
        );

        // without `/s` the same continuation rule must not match
        assert_magic_not_match_text!(
            r#"
            0	search	/usr/bin/env
            >&0 string /usr/bin/env it works
            "#,
            b"#!/usr/bin/env    python"
        );
    }
3863
    // Exercises `pstring` (length-prefixed string) with the default one-byte
    // length and with the `H`, `h`, `L`, `l` and `J` modifiers.
    #[test]
    fn test_pstring() {
        assert_magic_match_bin!(r#"0 pstring Toast it works"#, b"\x05Toast");

        assert_magic_match_bin!(r#"0 pstring Toast %s"#, b"\x05Toast", "Toast");

        assert_magic_not_match_bin!(r#"0 pstring Toast Doesn't work"#, b"\x07Toaster");

        // testing with modifiers
        assert_magic_match_bin!(r#"0 pstring/H Toast it works"#, b"\x00\x05Toast");

        assert_magic_match_bin!(r#"0 pstring/HJ Toast it works"#, b"\x00\x07Toast");

        assert_magic_match_bin!(r#"0 pstring/HJ Toast %s"#, b"\x00\x07Toast", "Toast");

        assert_magic_match_bin!(r#"0 pstring/h Toast it works"#, b"\x05\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/hJ Toast it works"#, b"\x07\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/L Toast it works"#, b"\x00\x00\x00\x05Toast");

        assert_magic_match_bin!(r#"0 pstring/LJ Toast it works"#, b"\x00\x00\x00\x09Toast");

        assert_magic_match_bin!(r#"0 pstring/l Toast it works"#, b"\x05\x00\x00\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/lJ Toast it works"#, b"\x09\x00\x00\x00Toast");
    }
3891
    // A top-level `indirect` rule at offset 0 is expected to fail with
    // `Error::MaximumRecursion(MAX_RECURSION)` rather than succeed.
    #[test]
    fn test_max_recursion() {
        let res = first_magic(
            r#"0	indirect x"#,
            b"#!          /usr/bin/luatex ",
            StreamKind::Binary,
        );
        assert!(res.is_err());
        // check the error variant once the localization wrapper is removed
        let _ = res.inspect_err(|e| {
            assert!(matches!(
                e.unwrap_localized(),
                Error::MaximumRecursion(MAX_RECURSION)
            ))
        });
    }
3907
    // Exercises the `!`, `>` and `<` comparison operators of the `string` test.
    #[test]
    fn test_string_ops() {
        assert_magic_match_text!("0	string/b MZ MZ File", b"MZ\0");
        assert_magic_match_text!("0	string !MZ Not MZ File", b"AZ\0");
        assert_magic_match_text!("0	string >\0 Any String", b"A\0");
        assert_magic_match_text!("0	string >Test Any String", b"Test 1\0");
        assert_magic_match_text!("0	string <Test Any String", b"\0");
        assert_magic_not_match_text!("0	string >Test Any String", b"\0");
    }
3917
    // Exercises `lestring16`: little-endian UTF-16 matching, `%s` formatting,
    // rejection of big-endian data and matching at a non-zero offset.
    #[test]
    fn test_lestring16() {
        assert_magic_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        assert_magic_match_bin!(
            "0 lestring16 x %s",
            b"\x61\x00\x62\x00\x63\x00\x64\x00\x00",
            "abcd"
        );
        assert_magic_not_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        assert_magic_match_bin!(
            "4 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64\x00"
        );
    }
3938
    // Exercises `bestring16`: big-endian UTF-16 matching, `%s` formatting,
    // rejection of little-endian data and matching at a non-zero offset.
    #[test]
    fn test_bestring16() {
        assert_magic_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        assert_magic_match_bin!(
            "0 bestring16 x %s",
            b"\x00\x61\x00\x62\x00\x63\x00\x64",
            "abcd"
        );
        assert_magic_not_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        assert_magic_match_bin!(
            "4 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64"
        );
    }
3959
    // Negative offsets are taken from the end of the content: `-1` is the
    // last byte, `-2` the one before it.
    #[test]
    fn test_offset_from_end() {
        assert_magic_match_bin!("-1 ubyte 0x42 last byte ok", b"\x00\x00\x42");
        assert_magic_match_bin!("-2 ubyte 0x41 last byte ok", b"\x00\x41\x00");
    }
3965
    // Chains two `&0` relative offsets so that each level reads the byte
    // right after the one matched by its parent.
    #[test]
    fn test_relative_offset() {
        assert_magic_match_bin!(
            "
            0 ubyte 0x42
            >&0 ubyte 0x00
            >>&0 ubyte 0x41 third byte ok
            ",
            b"\x42\x00\x41\x00"
        );
    }
3977
    // Exercises indirect offsets: the offset is read from the content itself,
    // optionally adjusted by a constant or by a second indirect value.
    #[test]
    fn test_indirect_offset() {
        assert_magic_match_bin!("(0.l) ubyte 0x42 it works", b"\x04\x00\x00\x00\x42");
        // adding fixed value to offset
        assert_magic_match_bin!("(0.l+3) ubyte 0x42 it works", b"\x01\x00\x00\x00\x42");
        // testing offset pair
        assert_magic_match_bin!(
            "(0.l+(4)) ubyte 0x42 it works",
            b"\x04\x00\x00\x00\x04\x00\x00\x00\x42"
        );
    }
3989
    // The message carried by a `use` line must come before the message of the
    // `name` rule it calls.
    #[test]
    fn test_use_with_message() {
        assert_magic_match_bin!(
            r#"
0 string MZ
>0 use mz first match

0 name mz then second match
>0 string MZ
"#,
            b"MZ\0",
            "first match then second match"
        );
    }
4004
    // Exercises arithmetic/bitwise transforms applied to a scalar before the
    // comparison, and checks that `%0` and `/0` are rejected at parse time.
    #[test]
    fn test_scalar_transform() {
        assert_magic_match_bin!("0 ubyte+1 0x1 add works", b"\x00");
        assert_magic_match_bin!("0 ubyte-1 0xfe sub works", b"\xff");
        assert_magic_match_bin!("0 ubyte%2 0 mod works", b"\x0a");
        assert_magic_match_bin!("0 ubyte&0x0f 0x0f bitand works", b"\xff");
        assert_magic_match_bin!("0 ubyte|0x0f 0xff bitor works", b"\xf0");
        assert_magic_match_bin!("0 ubyte^0x0f 0xf0 bitxor works", b"\xff");

        FileMagicParser::parse_str("0 ubyte%0 mod by zero", None)
            .expect_err("expect div by zero error");
        FileMagicParser::parse_str("0 ubyte/0 div by zero", None)
            .expect_err("expect div by zero error");
    }
4019
    // Exercises `belong` (big-endian 32-bit value) with every comparison
    // operator: `=` (implicit), `<`, `>`, `&`, `^`, `~` and `x`.
    #[test]
    fn test_belong() {
        // Test that a file with a four-byte value at offset 0 that matches the given value in big-endian byte order
        assert_magic_match_bin!("0 belong 0x12345678 Big-endian long", b"\x12\x34\x56\x78");
        // Test that a file with a four-byte value at offset 0 that does not match the given value in big-endian byte order
        assert_magic_not_match_bin!("0 belong 0x12345678 Big-endian long", b"\x78\x56\x34\x12");
        // Test that a file with a four-byte value at a non-zero offset that matches the given value in big-endian byte order
        assert_magic_match_bin!(
            "4 belong 0x12345678 Big-endian long",
            b"\x00\x00\x00\x00\x12\x34\x56\x78"
        );
        // Test < operator
        assert_magic_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x77");
        assert_magic_not_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test > operator
        assert_magic_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x79");
        assert_magic_not_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test & operator
        assert_magic_match_bin!("0 belong &0x5678 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong &0x0000FFFF Big-endian long", b"\x12\x34\x56\x78");

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x01\x56\x78");

        // Test ~ operator
        assert_magic_match_bin!("0 belong ~0x12345678 Big-endian long", b"\xed\xcb\xa9\x87");
        assert_magic_not_match_bin!("0 belong ~0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test x operator
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x12\x34\x56\x78");
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x78\x56\x34\x12");
    }
4055
    // `search` must parse without modifiers and with the range and flag
    // modifiers given in either order.
    #[test]
    fn test_parse_search() {
        parse_assert!("0 search test");
        parse_assert!("0 search/24/s test");
        parse_assert!("0 search/s/24 test");
    }
4062
    // Exercises `bedate`: bytes 38 6D 43 80 are 946684800 in big-endian
    // order, which the `%s` format renders as "2000-01-01 00:00:00".
    #[test]
    fn test_bedate() {
        assert_magic_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 bedate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
    // Exercises `beldate`: same big-endian timestamp as `test_bedate`, but
    // the formatted output is compared to `unix_local_time_to_string`.
    #[test]
    fn test_beldate() {
        assert_magic_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 beldate 946684800 {}",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            unix_local_time_to_string(946684800)
        );
    }
4096
    // Exercises `beqdate`: the timestamp 946684800 stored as a big-endian
    // 64-bit value.
    #[test]
    fn test_beqdate() {
        assert_magic_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80"
        );

        assert_magic_not_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "0 beqdate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
4115
    // Exercises `medate`: the timestamp 946684800 (0x386D4380) stored in
    // middle-endian order as bytes 6D 38 80 43.
    #[test]
    fn test_medate() {
        assert_magic_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );

        assert_magic_not_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 medate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            "2000-01-01 00:00:00"
        );
    }
4134
    // Exercises `meldate`: same middle-endian timestamp as `test_medate`,
    // with the formatted output compared to `unix_local_time_to_string`.
    #[test]
    fn test_meldate() {
        assert_magic_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );
        assert_magic_not_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 meldate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            unix_local_time_to_string(946684800)
        );
    }
4152
    // Exercises `date` with the timestamp 946684800 stored as bytes
    // 80 43 6D 38 (little-endian order).
    // NOTE(review): presumably `date` uses the host byte order, in which case
    // this data assumes a little-endian host — confirm.
    #[test]
    fn test_date() {
        assert_magic_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 date 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            "2000-01-01 00:00:00"
        );
    }
4169
    // Exercises `leldate`: little-endian 32-bit timestamp whose formatted
    // output is compared to `unix_local_time_to_string`.
    #[test]
    fn test_leldate() {
        assert_magic_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 leldate 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            unix_local_time_to_string(946684800)
        );
    }
4186
    // Exercises `leqdate`: the timestamp 1577836800 (0x5E0BE100) stored as a
    // little-endian 64-bit value.
    #[test]
    fn test_leqdate() {
        assert_magic_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
        );

        assert_magic_not_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "8 leqdate 1577836800 %s",
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
            "2020-01-01 00:00:00"
        );
    }
4204
    // Exercises `leqldate`: same little-endian 64-bit timestamp as
    // `test_leqdate`, with output compared to `unix_local_time_to_string`.
    #[test]
    fn test_leqldate() {
        assert_magic_match_bin!(
            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
        );

        assert_magic_not_match_bin!(
            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "8 leqldate 1577836800 %s",
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
            unix_local_time_to_string(1577836800)
        );
    }
4222
    // Exercises `melong` (middle-endian 32-bit value) with every comparison
    // operator. As the `=` case shows, bytes 34 12 78 56 decode to
    // 0x12345678; the decoded values quoted below follow that byte order.
    #[test]
    fn test_melong() {
        // Test = operator
        assert_magic_match_bin!(
            "0 melong =0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        );
        assert_magic_not_match_bin!(
            "0 melong =0x12345678 Middle-endian long",
            b"\x00\x00\x00\x00"
        );

        // Test < operator
        assert_magic_match_bin!(
            "0 melong <0x12345678 Middle-endian long",
            b"\x34\x12\x78\x55"
        ); // decodes to 0x12345578
        assert_magic_not_match_bin!(
            "0 melong <0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // decodes to 0x12345678

        // Test > operator
        assert_magic_match_bin!(
            "0 melong >0x12345678 Middle-endian long",
            b"\x34\x12\x78\x57"
        ); // decodes to 0x12345778
        assert_magic_not_match_bin!(
            "0 melong >0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // decodes to 0x12345678

        // Test & operator
        assert_magic_match_bin!("0 melong &0x5678 Middle-endian long", b"\xab\xcd\x78\x56"); // decodes to 0xcdab5678
        assert_magic_not_match_bin!(
            "0 melong &0x0000FFFF Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // decodes to 0x12345678

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!(
            "0 melong ^0xFFFF0000 Middle-endian long",
            b"\x00\x00\x78\x56"
        ); // decodes to 0x00005678
        assert_magic_not_match_bin!(
            "0 melong ^0xFFFF0000 Middle-endian long",
            b"\x00\x01\x78\x56"
        ); // decodes to 0x01005678

        // Test ~ operator
        assert_magic_match_bin!(
            "0 melong ~0x12345678 Middle-endian long",
            b"\xCB\xED\x87\xA9"
        );
        assert_magic_not_match_bin!(
            "0 melong ~0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // The original value

        // Test x operator
        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x34\x12\x78\x56");
        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x00\x00\x00\x00");
    }
4286
    // Exercises `uquad` (unsigned 64-bit value, given here in little-endian
    // byte order) with every comparison operator and `{:#x}` formatting.
    #[test]
    fn test_uquad() {
        // Test = operator
        assert_magic_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        assert_magic_not_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        // Test < operator
        assert_magic_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x11"
        );
        assert_magic_not_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test > operator
        assert_magic_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x13"
        );
        assert_magic_not_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test & operator
        assert_magic_match_bin!(
            "0 uquad &0xF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        assert_magic_not_match_bin!(
            "0 uquad &0xFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        ); // All bits clear
        assert_magic_not_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        ); // Some bits set

        // Test ~ operator
        assert_magic_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\x0F\x21\x43\x65\x87\xA9\xCB\xED"
        );
        assert_magic_not_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        ); // The original value

        // Test x operator
        assert_magic_match_bin!(
            "0 uquad x {:#x}",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12",
            "0x123456789abcdef0"
        );
        assert_magic_match_bin!(
            "0 uquad x Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }
4360
    // Exercises `guid`: matching a 16-byte value against its textual form,
    // rejecting unrelated bytes, and formatting the value with `%s`.
    #[test]
    fn test_guid() {
        assert_magic_match_bin!(
            "0 guid EC959539-6786-2D4E-8FDB-98814CE76C1E It works",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E"
        );

        assert_magic_not_match_bin!(
            "0 guid 399595EC-8667-4E2D-8FDB-98814CE76C1E It works",
            b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
        );

        assert_magic_match_bin!(
            "0 guid x %s",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E",
            "EC959539-6786-2D4E-8FDB-98814CE76C1E"
        );
    }
4379
    // Exercises `ubeqdate`: the timestamp 1633046400 (0x61564F80) stored as a
    // big-endian 64-bit value.
    #[test]
    fn test_ubeqdate() {
        assert_magic_match_bin!(
            "0 ubeqdate 1633046400 It works",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80"
        );

        assert_magic_match_bin!(
            "0 ubeqdate x %s",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80",
            "2021-10-01 00:00:00"
        );

        assert_magic_not_match_bin!(
            "0 ubeqdate 1633046400 It should not work",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }
4398
    // Exercises `ldate` with the timestamp 1640551520 (0x61C8D460) stored as
    // bytes 60 D4 C8 61; output is compared to `unix_local_time_to_string`.
    // NOTE(review): presumably `ldate` uses the host byte order, in which
    // case this data assumes a little-endian host — confirm.
    #[test]
    fn test_ldate() {
        assert_magic_match_bin!("0 ldate 1640551520 It works", b"\x60\xd4\xC8\x61");

        assert_magic_not_match_bin!("0 ldate 1633046400 It should not work", b"\x00\x00\x00\x00");

        assert_magic_match_bin!(
            "0 ldate x %s",
            b"\x60\xd4\xC8\x61",
            unix_local_time_to_string(1640551520)
        );
    }
4411
    // The formatted value must be the transformed scalar (0x14 = 20, so
    // 20 / 10 = 2 and 20 % 10 = 0), not the raw byte.
    #[test]
    fn test_scalar_with_transform() {
        assert_magic_match_bin!("0 ubyte/10 2 {}", b"\x14", "2");
        assert_magic_match_bin!("0 ubyte/10 x {}", b"\x14", "2");
        assert_magic_match_bin!("0 ubyte%10 x {}", b"\x14", "0");
    }
4418
    // Same as the scalar transform test but on `lefloat`: bytes 00 00 A0 41
    // are the little-endian encoding of 20.0.
    #[test]
    fn test_float_with_transform() {
        assert_magic_match_bin!("0 lefloat/10 2 {}", b"\x00\x00\xa0\x41", "2");
        assert_magic_match_bin!("0 lefloat/10 x {}", b"\x00\x00\xa0\x41", "2");
        assert_magic_match_bin!("0 lefloat%10 x {}", b"\x00\x00\xa0\x41", "0");
    }
4425
    // Pins the behaviour of `read_octal_u64`: input must start with '0',
    // digits are consumed until the first non-octal character, and anything
    // else yields `None`.
    #[test]
    fn test_read_octal() {
        // Basic cases
        assert_eq!(read_octal_u64(&mut lazy_cache!("0")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("00")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("01")), Some(1));
        assert_eq!(read_octal_u64(&mut lazy_cache!("07")), Some(7));
        assert_eq!(read_octal_u64(&mut lazy_cache!("010")), Some(8));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0123")), Some(83));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0755")), Some(493));

        // With trailing non-octal characters
        assert_eq!(read_octal_u64(&mut lazy_cache!("0ABC")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("01ABC")), Some(1));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0755ABC")), Some(493));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0123ABC")), Some(83));

        // Invalid octal digits
        assert_eq!(read_octal_u64(&mut lazy_cache!("08")), Some(0)); // stops at '8'
        assert_eq!(read_octal_u64(&mut lazy_cache!("01238")), Some(83)); // stops at '8'

        // No leading '0'
        assert_eq!(read_octal_u64(&mut lazy_cache!("123")), None);
        assert_eq!(read_octal_u64(&mut lazy_cache!("755")), None);

        // Empty string
        assert_eq!(read_octal_u64(&mut lazy_cache!("")), None);

        // Only non-octal characters
        assert_eq!(read_octal_u64(&mut lazy_cache!("ABC")), None);
        assert_eq!(read_octal_u64(&mut lazy_cache!("8ABC")), None); // first char is not '0'

        // Longer valid octal (but within u64 range)
        assert_eq!(
            read_octal_u64(&mut lazy_cache!("01777777777")),
            Some(268435455)
        );
    }
4464
    // Regression test: indirect offsets used inside nested `use` calls are
    // computed relative to the start of the rule that matched.
    #[test]
    fn test_offset_bug_1() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
# offset computation is relative to
# rule start
>(5.b)	use toasted

0 name toasted
>0	string twice Toasted
>>0  use toasted_twice 

0 name toasted_twice
>(6.b) string x %s
        ",
            b"\x00TEST\x06twice\x00\x06",
            "Bread is Toasted twice"
        );
    }
4487
4488    // this test implement the exact same logic as
4489    // test_offset_bug_1 except that the rule starts
4490    // matching from end. Surprisingly we need to
4491    // adjust indirect offsets so that it works in
4492    // libmagic/file
4493    #[test]
4494    fn test_offset_bug_2() {
4495        // this tests the exact behaviour
4496        // expected by libmagic/file
4497        assert_magic_match_bin!(
4498            r"
4499-12	string		TEST Bread is
4500>(4.b)	use toasted
4501
45020 name toasted
4503>0	string twice Toasted
4504>>0  use toasted_twice
4505
45060 name toasted_twice
4507>(6.b) string x %
4508        ",
4509            b"\x00TEST\x06twice\x00\x06",
4510            "Bread is Toasted twice"
4511        )
4512    }
4513
    // Regression test: rules reached through `indirect/r`, and the `use`
    // they trigger, must read from the offset selected by the indirection.
    #[test]
    fn test_offset_bug_3() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
>(5.b) indirect/r x

0	string twice Toasted
>0  use toasted_twice

0 name toasted_twice
>0 string x %s
        ",
            b"\x00TEST\x06twice\x00\x08",
            "Bread is Toasted twice"
        )
    }
4533
    // Regression test: base offsets must accumulate across an `indirect/r`
    // followed by a `use` placed at an indirect offset.
    #[test]
    fn test_offset_bug_4() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		Bread %s
>(6.b) indirect/r x

# this one uses a based offset
# computed at indirection
1	string is\ Toasted %s
>(11.b)  use toasted_twice

# this one is using a new base
# offset being previous base 
# offset + offset of use
0 name toasted_twice
>0 string x %s
            ",
            b"\x00Bread\x06is Toasted\x0ctwice\x00",
            "Bread is Toasted twice"
        )
    }
4558
    // Regression test: a relative offset (`&1`) inside a named rule reached
    // through `indirect/r` + `use` must be resolved from the preceding match.
    #[test]
    fn test_offset_bug_5() {
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
>(5.b) indirect/r x

0	string twice Toasted
>0  use toasted_twice

0 name toasted_twice
>0 string twice
>>&1 byte 0x08 twice
            ",
            b"\x00TEST\x06twice\x00\x08",
            "Bread is Toasted twice"
        )
    }
4577
    // `message_parts()` must yield the rule message ("PYTHON") as one of its
    // parts; the comparison ignores ASCII case.
    #[test]
    fn test_message_parts() {
        let m = first_magic(
            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
            b"#!/usr/bin/env    python",
            StreamKind::Text(TextEncoding::Ascii),
        )
        .unwrap();

        assert!(m.message_parts().any(|p| p.eq_ignore_ascii_case("python")))
    }
4589}