Skip to main content

pure_magic/
lib.rs

1#![forbid(unsafe_code)]
2#![deny(unused_imports)]
3#![deny(missing_docs)]
4//! # `pure-magic`: A pure and safe Rust Reimplementation of `libmagic`
5//!
6//! Unlike many file identification crates, `pure-magic` is highly compatible with the standard
7//! `magic` rule format, allowing seamless reuse of existing
8//! [rules](https://github.com/qjerome/magic-rs/tree/main/magic-db/src/magdir). This makes it an ideal
9//! drop-in replacement for crates relying on **`libmagic` C bindings**, where memory safety is critical.
10//!
11//! **Key Features:**
12//! - File type detection
13//! - MIME type inference
14//! - Custom magic rule parsing
15//!
16//! ## Installation
17//! Add `pure-magic` to your `Cargo.toml`:
18//!
19//! ```toml
20//! [dependencies]
21//! pure-magic = "0.1"  # Replace with the latest version
22//! ```
23//!
24//! Or add the latest version with cargo:
25//!
26//! ```sh
27//! cargo add pure-magic
28//! ```
29//!
30//! ## Quick Start
31//!
32//! ### Detect File Types Programmatically
33//! ```rust
34//! use pure_magic::{MagicDb, MagicSource};
35//! use std::fs::File;
36//!
37//! fn main() -> Result<(), Box<dyn std::error::Error>> {
38//!     let mut db = MagicDb::new();
39//!     // Create a MagicSource from a file
40//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
41//!     db.load(rust_magic);
42//!     // Verification is not mandatory
43//!     db.verify()?;
44//!
45//!     // Open a file and detect its type
46//!     let mut file = File::open("src/lib.rs")?;
47//!     let magic = db.first_magic(&mut file, None)?;
48//!
49//!     println!(
50//!         "File type: {} (MIME: {}, strength: {})",
51//!         magic.message(),
52//!         magic.mime_type(),
53//!         magic.strength()
54//!     );
55//!     Ok(())
56//! }
57//! ```
58//!
59//! ### Get All Matching Rules
60//! ```rust
61//! use pure_magic::{MagicDb, MagicSource};
62//! use std::fs::File;
63//!
64//! fn main() -> Result<(), Box<dyn std::error::Error>> {
65//!     let mut db = MagicDb::new();
66//!     // Create a MagicSource from a file
67//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
68//!     db.load(rust_magic);
69//!
70//!     // Open a file and detect its type
71//!     let mut file = File::open("src/lib.rs")?;
72//!
73//!     // Get all matching rules, sorted by strength
74//!     let magics = db.all_magics(&mut file)?;
75//!
76//!     // Must contain rust file magic and default text magic
77//!     assert!(magics.len() > 1);
78//!
79//!     for magic in magics {
80//!         println!(
81//!             "Match: {} (strength: {}, source: {})",
82//!             magic.message(),
83//!             magic.strength(),
84//!             magic.source().unwrap_or("unknown")
85//!         );
86//!     }
87//!     Ok(())
88//! }
89//! ```
90//!
91//! ### Serialize a Database to Disk
92//! ```rust
93//! use pure_magic::{MagicDb, MagicSource};
94//! use std::fs::File;
95//!
96//! fn main() -> Result<(), Box<dyn std::error::Error>> {
97//!     let mut db = MagicDb::new();
98//!     // Create a MagicSource from a file
99//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
100//!     db.load(rust_magic);
101//!
102//!     // Serialize the database to a file
103//!     let mut output = File::create("/tmp/compiled.db")?;
104//!     db.serialize(&mut output)?;
105//!
106//!     println!("Database saved to file");
107//!     Ok(())
108//! }
109//! ```
110//!
111//! ### Deserialize a Database
112//! ```rust
113//! use pure_magic::{MagicDb, MagicSource};
114//! use std::fs::File;
115//!
116//! fn main() -> Result<(), Box<dyn std::error::Error>> {
117//!     let mut db = MagicDb::new();
118//!     // Create a MagicSource from a file
119//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
120//!     db.load(rust_magic);
121//!
//!     // Serialize the database into a vector
123//!     let mut ser = vec![];
124//!     db.serialize(&mut ser)?;
125//!     println!("Database saved to vector");
126//!
127//!     // We deserialize from slice
128//!     let db = MagicDb::deserialize(&mut ser.as_slice())?;
129//!
130//!     assert!(!db.rules().is_empty());
131//!
132//!     Ok(())
133//! }
134//! ```
135//!
136//! ## License
137//! This project is dual-licensed under either:
138//! - **GPL-3.0**
139//! - **BSD-2-Clause**
140//!
141//! ## Contributing
142//! Contributions are welcome! Open an issue or submit a pull request.
143//!
144//! ## Acknowledgments
145//! - Inspired by the original `libmagic` (part of the `file` command).
146
147use dyf::{DynDisplay, FormatString, dformat};
148use flagset::{FlagSet, flags};
149use flate2::{Compression, read::GzDecoder, write::GzEncoder};
150use lazy_cache::LazyCache;
151use memchr::memchr;
152use pest::{Span, error::ErrorVariant};
153use regex::bytes::{self};
154use serde::{Deserialize, Serialize};
155use std::{
156    borrow::Cow,
157    cmp::max,
158    collections::{HashMap, HashSet},
159    fmt::{self, Debug, Display},
160    io::{self, Read, Seek, SeekFrom, Write},
161    ops::{Add, BitAnd, BitOr, BitXor, Deref, Div, Mul, Rem, Sub},
162    path::Path,
163};
164use tar::Archive;
165use thiserror::Error;
166use tracing::{Level, debug, enabled, trace};
167
168use crate::{
169    numeric::{Float, FloatDataType, Scalar, ScalarDataType},
170    parser::{FileMagicParser, Rule},
171    utils::{
172        debug_string_from_vec_u8, debug_string_from_vec_u16, decode_id3, find_json_boundaries,
173        run_utf8_validation,
174    },
175};
176
177mod numeric;
178mod parser;
179mod utils;
180
// Strength value associated with hardcoded magic.
// NOTE(review): presumably assigned to built-in (non rule based) detections — confirm at use sites.
const HARDCODED_MAGIC_STRENGTH: u64 = 2048;
// Source label associated with hardcoded magic.
// NOTE(review): presumably reported in place of a rule file name — confirm at use sites.
const HARDCODED_SOURCE: &str = "hardcoded";
// corresponds to FILE_INDIR_MAX constant defined in libmagic
const MAX_RECURSION: usize = 50;
// constant found in libmagic. It is used as a limit for regex tests
const FILE_REGEX_MAX: usize = 8192;

/// Maximum number of bytes to read for search tests.
///
/// This constant is derived from `libmagic` and is used to limit the number of bytes
/// read during search tests to ensure performance and efficiency. The value is set
/// to 7 megabytes.
pub const FILE_BYTES_MAX: usize = 7 * 1024 * 1024;
/// Default MIME type for unidentified binary data
pub const DEFAULT_BIN_MIMETYPE: &str = "application/octet-stream";
/// Default MIME type for unidentified text data
pub const DEFAULT_TEXT_MIMETYPE: &str = "text/plain";

// strftime-style pattern: `year-month-day hour:minute:second`
pub(crate) const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
200
// Panics with the given `panic!` arguments, but only when `debug_assertions`
// are enabled. Because the check goes through `cfg!`, the `panic!` call is
// still type-checked in every build; without debug assertions the branch is
// simply never taken and execution continues after the macro.
macro_rules! debug_panic {
    ($($arg:tt)*) => {
        if cfg!(debug_assertions) {
            panic!($($arg)*);
        }
    };
}
208
// Reads exactly `size_of::<$ty>()` bytes from the reader `$reader` and
// evaluates to them as a fixed-size byte array.
//
// The expansion uses `?`, so it must be invoked in a function whose error
// type can be built from the error returned by `read_exact`.
macro_rules! read {
    ($reader: expr, $ty: ty) => {{
        let mut bytes = [0u8; std::mem::size_of::<$ty>()];
        $reader.read_exact(&mut bytes)?;
        bytes
    }};
}
216
// Reads a `$ty` from `$r`, decoding the bytes as little-endian.
// Relies on `read!`, so it propagates read errors with `?`.
macro_rules! read_le {
    ($r:expr, $ty: ty ) => {{ <$ty>::from_le_bytes(read!($r, $ty)) }};
}
220
// Reads a `$ty` from `$r`, decoding the bytes as big-endian.
// Relies on `read!`, so it propagates read errors with `?`.
macro_rules! read_be {
    ($r:expr, $ty: ty ) => {{ <$ty>::from_be_bytes(read!($r, $ty)) }};
}
224
// Reads two consecutive little-endian `u16` values from `$r` and combines
// them into an `i32`: the first value becomes the upper 16 bits, the second
// the lower 16 bits (presumably the "middle-endian" layout).
macro_rules! read_me {
    ($r: expr) => {{
        let high = read_le!($r, u16) as i32;
        let low = read_le!($r, u16) as i32;
        (high << 16) | low
    }};
}
228
229#[inline(always)]
230fn read_octal_u64<R: Read + Seek>(haystack: &mut LazyCache<R>) -> Option<u64> {
231    let s = haystack
232        .read_while_or_limit(|b| matches!(b, b'0'..=b'7'), 22)
233        .map(|buf| str::from_utf8(buf))
234        .ok()?
235        .ok()?;
236
237    if !s.starts_with("0") {
238        return None;
239    }
240
241    u64::from_str_radix(s, 8).ok()
242}
243
/// Represents all possible errors that can occur during file type detection and processing.
#[derive(Debug, Error)]
pub enum Error {
    /// A generic error with a custom message.
    #[error("{0}")]
    Msg(String),

    /// Indicates a rule load failure, carrying the source name, the line
    /// number and the underlying error.
    #[error("source={0} line={1} error={2}")]
    Verify(String, usize, Box<Error>),

    /// An error with a source location (source name and line number) and a
    /// nested error.
    #[error("source={0} line={1} error={2}")]
    Localized(String, usize, Box<Error>),

    /// Indicates a required rule was not found.
    #[error("missing rule: {0}")]
    MissingRule(String),

    /// Indicates the maximum recursion depth was reached.
    #[error("maximum recursion reached: {0}")]
    MaximumRecursion(usize),

    /// Wraps an I/O error.
    #[error("io: {0}")]
    Io(#[from] io::Error),

    /// Wraps a parsing error from the `pest` parser.
    #[error("parser error: {0}")]
    Parse(#[from] Box<pest::error::Error<Rule>>),

    /// Wraps a formatting error from the `dyf` crate.
    #[error("formatting: {0}")]
    Format(#[from] dyf::Error),

    /// Wraps a regex-related error.
    #[error("regex: {0}")]
    Regex(#[from] regex::Error),

    /// Wraps a serialization error from `bincode`.
    #[error("{0}")]
    Serialize(#[from] bincode::error::EncodeError),

    /// Wraps a deserialization error from `bincode`.
    #[error("{0}")]
    Deserialize(#[from] bincode::error::DecodeError),
}
291
292impl Error {
293    #[inline]
294    fn parser<S: ToString>(msg: S, span: Span<'_>) -> Self {
295        Self::Parse(Box::new(pest::error::Error::new_from_span(
296            ErrorVariant::CustomError {
297                message: msg.to_string(),
298            },
299            span,
300        )))
301    }
302
303    fn msg<M: AsRef<str>>(msg: M) -> Self {
304        Self::Msg(msg.as_ref().into())
305    }
306
307    fn localized<S: AsRef<str>>(source: S, line: usize, err: Error) -> Self {
308        Self::Localized(source.as_ref().into(), line, err.into())
309    }
310
311    /// Unwraps the localized error
312    pub fn unwrap_localized(&self) -> &Self {
313        match self {
314            Self::Localized(_, _, e) => e,
315            _ => self,
316        }
317    }
318}
319
/// A message that is either plain text or a format string.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Message {
    /// Literal text, used as is.
    String(String),
    /// Text rendered through a format string.
    Format {
        /// Conversion specifier; `format_with` compares it to `"c"` to decide
        /// whether a byte scalar is printed as a character.
        printf_spec: String,
        /// Format string used to render the message.
        fs: FormatString,
    },
}
328
329impl Display for Message {
330    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
331        match self {
332            Self::String(s) => write!(f, "{s}"),
333            Self::Format { printf_spec: _, fs } => write!(f, "{}", fs.to_string_lossy()),
334        }
335    }
336}
337
338impl Message {
339    fn to_string_lossy(&self) -> Cow<'_, str> {
340        match self {
341            Message::String(s) => Cow::Borrowed(s),
342            Message::Format { printf_spec: _, fs } => fs.to_string_lossy(),
343        }
344    }
345
346    #[inline(always)]
347    fn format_with(&self, mr: Option<&MatchRes>) -> Result<Cow<'_, str>, Error> {
348        match self {
349            Self::String(s) => Ok(Cow::Borrowed(s.as_str())),
350            Self::Format {
351                printf_spec: c_spec,
352                fs,
353            } => {
354                if let Some(mr) = mr {
355                    match mr {
356                        MatchRes::Float(_, _) | MatchRes::Bytes(_, _, _, _) => {
357                            Ok(Cow::Owned(dformat!(fs, mr)?))
358                        }
359                        MatchRes::Scalar(_, scalar) => {
360                            // we want to print a byte as char
361                            if c_spec.as_str() == "c" {
362                                match scalar {
363                                    Scalar::byte(b) => {
364                                        let b = (*b as u8) as char;
365                                        Ok(Cow::Owned(dformat!(fs, b)?))
366                                    }
367                                    Scalar::ubyte(b) => {
368                                        let b = *b as char;
369                                        Ok(Cow::Owned(dformat!(fs, b)?))
370                                    }
371                                    _ => Ok(Cow::Owned(dformat!(fs, mr)?)),
372                                }
373                            } else {
374                                Ok(Cow::Owned(dformat!(fs, mr)?))
375                            }
376                        }
377                    }
378                } else {
379                    Ok(fs.to_string_lossy())
380                }
381            }
382        }
383    }
384}
385
386impl ScalarDataType {
387    #[inline(always)]
388    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Scalar, Error> {
389        macro_rules! _read_le {
390            ($ty: ty) => {{
391                if switch_endianness {
392                    <$ty>::from_be_bytes(read!(from, $ty))
393                } else {
394                    <$ty>::from_le_bytes(read!(from, $ty))
395                }
396            }};
397        }
398
399        macro_rules! _read_be {
400            ($ty: ty) => {{
401                if switch_endianness {
402                    <$ty>::from_le_bytes(read!(from, $ty))
403                } else {
404                    <$ty>::from_be_bytes(read!(from, $ty))
405                }
406            }};
407        }
408
409        macro_rules! _read_ne {
410            ($ty: ty) => {{
411                if cfg!(target_endian = "big") {
412                    _read_be!($ty)
413                } else {
414                    _read_le!($ty)
415                }
416            }};
417        }
418
419        macro_rules! _read_me {
420            () => {
421                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
422            };
423        }
424
425        Ok(match self {
426            // signed
427            Self::byte => Scalar::byte(read!(from, u8)[0] as i8),
428            Self::short => Scalar::short(_read_ne!(i16)),
429            Self::long => Scalar::long(_read_ne!(i32)),
430            Self::date => Scalar::date(_read_ne!(i32)),
431            Self::ldate => Scalar::ldate(_read_ne!(i32)),
432            Self::qwdate => Scalar::qwdate(_read_ne!(i64)),
433            Self::leshort => Scalar::leshort(_read_le!(i16)),
434            Self::lelong => Scalar::lelong(_read_le!(i32)),
435            Self::lequad => Scalar::lequad(_read_le!(i64)),
436            Self::bequad => Scalar::bequad(_read_be!(i64)),
437            Self::belong => Scalar::belong(_read_be!(i32)),
438            Self::bedate => Scalar::bedate(_read_be!(i32)),
439            Self::beldate => Scalar::beldate(_read_be!(i32)),
440            Self::beqdate => Scalar::beqdate(_read_be!(i64)),
441            // unsigned
442            Self::ubyte => Scalar::ubyte(read!(from, u8)[0]),
443            Self::ushort => Scalar::ushort(_read_ne!(u16)),
444            Self::uleshort => Scalar::uleshort(_read_le!(u16)),
445            Self::ulelong => Scalar::ulelong(_read_le!(u32)),
446            Self::uledate => Scalar::uledate(_read_le!(u32)),
447            Self::ulequad => Scalar::ulequad(_read_le!(u64)),
448            Self::offset => Scalar::offset(from.stream_position()?),
449            Self::ubequad => Scalar::ubequad(_read_be!(u64)),
450            Self::medate => Scalar::medate(_read_me!()),
451            Self::meldate => Scalar::meldate(_read_me!()),
452            Self::melong => Scalar::melong(_read_me!()),
453            Self::beshort => Scalar::beshort(_read_be!(i16)),
454            Self::quad => Scalar::quad(_read_ne!(i64)),
455            Self::uquad => Scalar::uquad(_read_ne!(u64)),
456            Self::ledate => Scalar::ledate(_read_le!(i32)),
457            Self::leldate => Scalar::leldate(_read_le!(i32)),
458            Self::leqdate => Scalar::leqdate(_read_le!(i64)),
459            Self::leqldate => Scalar::leqldate(_read_le!(i64)),
460            Self::leqwdate => Scalar::leqwdate(_read_le!(i64)),
461            Self::ubelong => Scalar::ubelong(_read_be!(u32)),
462            Self::ulong => Scalar::ulong(_read_ne!(u32)),
463            Self::ubeshort => Scalar::ubeshort(_read_be!(u16)),
464            Self::ubeqdate => Scalar::ubeqdate(_read_be!(u64)),
465            Self::lemsdosdate => Scalar::lemsdosdate(_read_le!(u16)),
466            Self::lemsdostime => Scalar::lemsdostime(_read_le!(u16)),
467            Self::guid => Scalar::guid(u128::from_be_bytes(read!(from, u128))),
468        })
469    }
470}
471
472impl FloatDataType {
473    #[inline(always)]
474    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Float, Error> {
475        macro_rules! _read_le {
476            ($ty: ty) => {{
477                if switch_endianness {
478                    <$ty>::from_be_bytes(read!(from, $ty))
479                } else {
480                    <$ty>::from_le_bytes(read!(from, $ty))
481                }
482            }};
483        }
484
485        macro_rules! _read_be {
486            ($ty: ty) => {{
487                if switch_endianness {
488                    <$ty>::from_le_bytes(read!(from, $ty))
489                } else {
490                    <$ty>::from_be_bytes(read!(from, $ty))
491                }
492            }};
493        }
494
495        macro_rules! _read_ne {
496            ($ty: ty) => {{
497                if cfg!(target_endian = "big") {
498                    _read_be!($ty)
499                } else {
500                    _read_le!($ty)
501                }
502            }};
503        }
504
505        macro_rules! _read_me {
506            () => {
507                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
508            };
509        }
510
511        Ok(match self {
512            Self::lefloat => Float::lefloat(_read_le!(f32)),
513            Self::befloat => Float::befloat(_read_le!(f32)),
514            Self::ledouble => Float::ledouble(_read_le!(f64)),
515            Self::bedouble => Float::bedouble(_read_be!(f64)),
516        })
517    }
518}
519
/// Arithmetic or bitwise operator, as used by `ScalarTransform` and
/// `FloatTransform`. Its `Display` implementation prints the operator symbol.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Op {
    /// `*`
    Mul,
    /// `+`
    Add,
    /// `-`
    Sub,
    /// `/`
    Div,
    /// `%`
    Mod,
    /// `&`
    And,
    /// `^`
    Xor,
    /// `|`
    Or,
}
531
532impl Display for Op {
533    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
534        match self {
535            Op::Mul => write!(f, "*"),
536            Op::Add => write!(f, "+"),
537            Op::Sub => write!(f, "-"),
538            Op::Div => write!(f, "/"),
539            Op::Mod => write!(f, "%"),
540            Op::And => write!(f, "&"),
541            Op::Or => write!(f, "|"),
542            Op::Xor => write!(f, "^"),
543        }
544    }
545}
546
/// Comparison operator carried by tests (see the `cmp_op` field of
/// `RegexTest`, `StringTest` and `SearchTest`).
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum CmpOp {
    Eq,
    Lt,
    Gt,
    BitAnd,
    Neq, // ! operator
    Xor,
    Not, // ~ operator
}
557
558impl CmpOp {
559    #[inline(always)]
560    fn is_neq(&self) -> bool {
561        matches!(self, Self::Neq)
562    }
563}
564
/// Transformation applied to a scalar: `value <op> num`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTransform {
    /// Operator applied to the value.
    op: Op,
    /// Right-hand operand of the operation.
    num: Scalar,
}
570
571impl ScalarTransform {
572    fn apply(&self, s: Scalar) -> Option<Scalar> {
573        match self.op {
574            Op::Add => s.checked_add(self.num),
575            Op::Sub => s.checked_sub(self.num),
576            Op::Mul => s.checked_mul(self.num),
577            Op::Div => s.checked_div(self.num),
578            Op::Mod => s.checked_rem(self.num),
579            Op::And => Some(s.bitand(self.num)),
580            Op::Xor => Some(s.bitxor(self.num)),
581            Op::Or => Some(s.bitor(self.num)),
582        }
583    }
584}
585
/// Transformation applied to a float: `value <op> num`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTransform {
    /// Operator applied to the value.
    op: Op,
    /// Right-hand operand of the operation.
    num: Float,
}
591
592impl FloatTransform {
593    fn apply(&self, s: Float) -> Float {
594        match self.op {
595            Op::Add => s.add(self.num),
596            Op::Sub => s.sub(self.num),
597            Op::Mul => s.mul(self.num),
598            // returns inf when div by 0
599            Op::Div => s.div(self.num),
600            // returns NaN when rem by 0
601            Op::Mod => s.rem(self.num),
602            // parser makes sure those operators cannot be used
603            Op::And | Op::Xor | Op::Or => {
604                debug_panic!("unsupported operation");
605                s
606            }
607        }
608    }
609}
610
/// Value a test compares against.
#[derive(Clone, Serialize, Deserialize)]
enum TestValue<T> {
    /// A concrete value.
    Value(T),
    /// No concrete value (printed as `ANY` by the `Debug` implementations).
    /// NOTE(review): presumably matches any value — confirm against the matching code.
    Any,
}
616
617impl Debug for TestValue<Vec<u8>> {
618    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
619        match self {
620            Self::Value(v) => write!(f, "\"{}\"", debug_string_from_vec_u8(v)),
621            Self::Any => write!(f, "ANY"),
622        }
623    }
624}
625
626impl Debug for TestValue<Vec<u16>> {
627    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
628        match self {
629            Self::Value(v) => write!(f, "\"{}\"", debug_string_from_vec_u16(v)),
630            Self::Any => write!(f, "ANY"),
631        }
632    }
633}
634
635impl Debug for TestValue<Scalar> {
636    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
637        match self {
638            Self::Value(s) => write!(f, "{s:?}"),
639            Self::Any => write!(f, "ANY"),
640        }
641    }
642}
643
644impl Debug for TestValue<Float> {
645    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
646        match self {
647            Self::Value(fl) => write!(f, "{fl:?}"),
648            Self::Any => write!(f, "ANY"),
649        }
650    }
651}
652
653impl<T> TestValue<T> {
654    #[inline(always)]
655    fn as_ref(&self) -> TestValue<&T> {
656        match self {
657            Self::Value(v) => TestValue::Value(v),
658            Self::Any => TestValue::Any,
659        }
660    }
661}
662
// Modifier flags of regex tests (see `RegexTest::mods` and
// `SearchTest::re_mods`).
// `ForceBin` / `ForceText` are the flags checked by the `is_binary` /
// `is_text` helpers of the tests.
// NOTE(review): the other flags presumably mirror libmagic regex modifiers
// (case insensitivity, start offset update, line limit, trimming) — confirm
// at the use sites.
flags! {
    enum ReMod: u8{
        CaseInsensitive,
        StartOffsetUpdate,
        LineLimit,
        ForceBin,
        ForceText,
        TrimMatch,
    }
}
673
674fn serialize_regex<S>(re: &bytes::Regex, serializer: S) -> Result<S::Ok, S::Error>
675where
676    S: serde::Serializer,
677{
678    re.as_str().serialize(serializer)
679}
680
681fn deserialize_regex<'de, D>(deserializer: D) -> Result<bytes::Regex, D::Error>
682where
683    D: serde::Deserializer<'de>,
684{
685    let wrapper = String::deserialize(deserializer)?;
686    bytes::Regex::new(&wrapper).map_err(serde::de::Error::custom)
687}
688
/// A regular expression test.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct RegexTest {
    /// Compiled regex; (de)serialized through its pattern string.
    #[serde(
        serialize_with = "serialize_regex",
        deserialize_with = "deserialize_regex"
    )]
    re: bytes::Regex,
    /// Optional limit; `match_buf` uses it as the maximum number of lines
    /// inspected on text streams.
    length: Option<usize>,
    /// Regex modifiers.
    mods: FlagSet<ReMod>,
    /// String modifiers.
    str_mods: FlagSet<StringMod>,
    // NOTE(review): presumably the length of the non-magic (literal) part of
    // the pattern — confirm where it is computed.
    non_magic_len: usize,
    /// Whether the test is binary (see `is_binary`).
    binary: bool,
    /// Comparison operator; `Neq` makes `match_buf` succeed when the regex
    /// does not match.
    cmp_op: CmpOp,
}
703
704impl RegexTest {
705    #[inline(always)]
706    fn is_binary(&self) -> bool {
707        self.binary
708            || self.mods.contains(ReMod::ForceBin)
709            || self.str_mods.contains(StringMod::ForceBin)
710    }
711
712    #[inline(always)]
713    fn is_text(&self) -> bool {
714        self.mods.contains(ReMod::ForceText) || self.str_mods.contains(StringMod::ForceText)
715    }
716
717    fn match_buf<'buf>(
718        &self,
719        off_buf: u64, // absolute buffer offset in content
720        stream_kind: StreamKind,
721        buf: &'buf [u8],
722    ) -> Option<MatchRes<'buf>> {
723        let mr = match stream_kind {
724            StreamKind::Text(_) => {
725                let mut off_txt = off_buf;
726
727                let mut line_limit = self.length.unwrap_or(usize::MAX);
728
729                for line in buf.split(|c| c == &b'\n') {
730                    // we don't need to break on offset
731                    // limit as buf contains the good amount
732                    // of bytes to match against
733                    if line_limit == 0 {
734                        break;
735                    }
736
737                    if let Some(re_match) = self.re.find(line) {
738                        // the offset of the string is computed from the start of the buffer
739                        let start_offset = off_txt + re_match.start() as u64;
740
741                        // if we matched until EOL we need to add one to include the delimiter removed from the split
742                        let stop_offset = if re_match.end() == line.len() {
743                            Some(start_offset + re_match.as_bytes().len() as u64 + 1)
744                        } else {
745                            None
746                        };
747
748                        return Some(MatchRes::Bytes(
749                            start_offset,
750                            stop_offset,
751                            re_match.as_bytes(),
752                            Encoding::Utf8,
753                        ));
754                    }
755
756                    off_txt += line.len() as u64;
757                    // we have to add one because lines do not contain splitting character
758                    off_txt += 1;
759                    line_limit = line_limit.saturating_sub(1)
760                }
761                None
762            }
763
764            StreamKind::Binary => {
765                self.re.find(buf).map(|re_match| {
766                    MatchRes::Bytes(
767                        // the offset of the string is computed from the start of the buffer
768                        off_buf + re_match.start() as u64,
769                        None,
770                        re_match.as_bytes(),
771                        Encoding::Utf8,
772                    )
773                })
774            }
775        };
776
777        // handle the case where we want the regex not to match
778        if self.cmp_op.is_neq() && mr.is_none() {
779            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
780        }
781
782        mr
783    }
784}
785
786impl From<RegexTest> for Test {
787    fn from(value: RegexTest) -> Self {
788        Self::Regex(value)
789    }
790}
791
// Modifier flags of string tests. Behaviour visible in `string_match`:
// - UpperInsensitive: upper case bytes of the test string match both cases in the target
// - LowerInsensitive: lower case bytes of the test string match both cases in the target
// - FullWordMatch: the byte following the match, if any, must be a whitespace
// - CompactWhitespace: a run of blanks in the test string needs at least as many blanks in the target
// - OptBlank: blanks are skipped in both the test string and the target
// ForceBin / ForceText are checked by the `is_binary` / `is_text` helpers of the tests.
// NOTE(review): `Trim` is not used in the visible code — presumably trims the matched string; confirm.
flags! {
    enum StringMod: u8{
        ForceBin,
        UpperInsensitive,
        LowerInsensitive,
        FullWordMatch,
        Trim,
        ForceText,
        CompactWhitespace,
        OptBlank,
    }
}
804
/// A string test.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StringTest {
    /// Bytes to compare against, or `Any`.
    test_val: TestValue<Vec<u8>>,
    /// Comparison operator.
    cmp_op: CmpOp,
    // NOTE(review): optional length attached to the test — presumably limits
    // the number of bytes compared; confirm at the use site.
    length: Option<usize>,
    /// String modifiers.
    mods: FlagSet<StringMod>,
    /// Whether the test is binary (see `is_binary`).
    binary: bool,
}
813
814impl From<StringTest> for Test {
815    fn from(value: StringTest) -> Self {
816        Self::String(value)
817    }
818}
819
/// Matches the test string `str` against the beginning of the target `buf`,
/// honouring the string modifiers `mods`.
///
/// Returns `(matched, consumed)`, `consumed` being the number of bytes of
/// `buf` consumed by the comparison. Without any modifier altering the
/// comparison this is a plain prefix check (`consumed` is `str.len()` on
/// success and `0` on failure). Otherwise both inputs are walked byte by
/// byte and `consumed` counts the target bytes taken so far, even on failure.
#[inline(always)]
fn string_match(str: &[u8], mods: FlagSet<StringMod>, buf: &[u8]) -> (bool, usize) {
    let mut consumed = 0;
    // we can do a simple string comparison
    if mods.is_disjoint(
        StringMod::UpperInsensitive
            | StringMod::LowerInsensitive
            | StringMod::FullWordMatch
            | StringMod::CompactWhitespace
            | StringMod::OptBlank,
    ) {
        // the target must start with the test string
        if buf.starts_with(str) {
            (true, str.len())
        } else {
            (false, consumed)
        }
    } else {
        // index of the current byte in the test string
        let mut i_src = 0;
        // peekable so that a target byte is only consumed once it is accepted
        let mut iter = buf.iter().peekable();

        // consumes one byte of the target (if any) and accounts for it
        macro_rules! consume_target {
            () => {{
                if iter.next().is_some() {
                    consumed += 1;
                }
            }};
        }

        // the current bytes matched: advance in both the target and the test string
        macro_rules! continue_next_iteration {
            () => {{
                consume_target!();
                i_src += 1;
                continue;
            }};
        }

        while let Some(&&b) = iter.peek() {
            // test string exhausted: nothing more to compare
            let Some(&ref_byte) = str.get(i_src) else {
                break;
            };

            if mods.contains(StringMod::OptBlank) && (b == b' ' || ref_byte == b' ') {
                if b == b' ' {
                    // we ignore whitespace in target
                    consume_target!();
                }

                if ref_byte == b' ' {
                    // we ignore whitespace in test
                    i_src += 1;
                }

                continue;
            }

            if mods.contains(StringMod::UpperInsensitive) {
                //upper case characters in the magic match both lower and upper case characters in the target
                if ref_byte.is_ascii_uppercase() && ref_byte == b.to_ascii_uppercase()
                    || ref_byte == b
                {
                    continue_next_iteration!()
                }
            }

            // lower case characters in the magic match both lower and upper case characters in the target
            if mods.contains(StringMod::LowerInsensitive)
                && (ref_byte.is_ascii_lowercase() && ref_byte == b.to_ascii_lowercase()
                    || ref_byte == b)
            {
                continue_next_iteration!()
            }

            if mods.contains(StringMod::CompactWhitespace) && ref_byte == b' ' {
                // count (and skip) the run of blanks in the test string
                let mut src_blk = 0;
                while let Some(b' ') = str.get(i_src) {
                    src_blk += 1;
                    i_src += 1;
                }

                // count (and consume) the run of blanks in the target
                let mut tgt_blk = 0;
                while let Some(b' ') = iter.peek() {
                    tgt_blk += 1;
                    consume_target!();
                }

                // the target must contain at least as many blanks as the test string
                if src_blk > tgt_blk {
                    return (false, consumed);
                }

                continue;
            }

            // no modifier accepted the byte: fall back to an exact comparison
            if ref_byte == b {
                continue_next_iteration!()
            } else {
                return (false, consumed);
            }
        }

        // a full word match must be followed by a whitespace or the end of the target
        if mods.contains(StringMod::FullWordMatch)
            && let Some(b) = iter.peek()
            && !b.is_ascii_whitespace()
        {
            return (false, consumed);
        }

        // success requires that something was consumed and that the whole
        // test string has been walked through
        (
            consumed > 0 && str.get(i_src).is_none() && consumed <= buf.len(),
            consumed,
        )
    }
}
932
933impl StringTest {
934    fn has_length_mod(&self) -> bool {
935        !self.mods.is_disjoint(
936            StringMod::UpperInsensitive
937                | StringMod::LowerInsensitive
938                | StringMod::FullWordMatch
939                | StringMod::CompactWhitespace
940                | StringMod::OptBlank,
941        )
942    }
943
944    #[inline(always)]
945    fn test_value_len(&self) -> usize {
946        match self.test_val.as_ref() {
947            TestValue::Value(s) => s.len(),
948            TestValue::Any => 0,
949        }
950    }
951
952    #[inline(always)]
953    fn is_binary(&self) -> bool {
954        self.binary || self.mods.contains(StringMod::ForceBin)
955    }
956
957    #[inline(always)]
958    fn is_text(&self) -> bool {
959        self.mods.contains(StringMod::ForceText)
960    }
961}
962
/// Newtype around `Vec<u8>` with a custom `Debug` implementation printing
/// the bytes as a quoted debug string.
#[derive(Clone, Serialize, Deserialize)]
struct ByteVec(Vec<u8>);
965
966impl Debug for ByteVec {
967    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
968        write!(f, "\"{}\"", debug_string_from_vec_u8(self))
969    }
970}
971
972impl From<Vec<u8>> for ByteVec {
973    fn from(value: Vec<u8>) -> Self {
974        Self(value)
975    }
976}
977
978impl Deref for ByteVec {
979    type Target = Vec<u8>;
980
981    fn deref(&self) -> &Self::Target {
982        &self.0
983    }
984}
985
/// A search test: looks for a string inside a buffer.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SearchTest {
    /// Bytes searched for.
    str: ByteVec,
    /// Optional bound on the position of a candidate match: `match_buf`
    /// stops searching once a candidate starts beyond it.
    n_pos: Option<usize>,
    /// String modifiers.
    str_mods: FlagSet<StringMod>,
    /// Regex modifiers.
    re_mods: FlagSet<ReMod>,
    /// Whether the test is binary (see `is_binary`).
    binary: bool,
    /// Comparison operator.
    cmp_op: CmpOp,
}
995
996impl From<SearchTest> for Test {
997    fn from(value: SearchTest) -> Self {
998        Self::Search(value)
999    }
1000}
1001
1002impl SearchTest {
1003    #[inline(always)]
1004    fn is_binary(&self) -> bool {
1005        (self.binary
1006            || self.str_mods.contains(StringMod::ForceBin)
1007            || self.re_mods.contains(ReMod::ForceBin))
1008            && !(self.str_mods.contains(StringMod::ForceText)
1009                || self.re_mods.contains(ReMod::ForceText))
1010    }
1011
1012    // off_buf: absolute buffer offset in content
1013    #[inline]
1014    fn match_buf<'buf>(&self, off_buf: u64, buf: &'buf [u8]) -> Option<MatchRes<'buf>> {
1015        let mut i = 0;
1016
1017        let needle = self.str.first()?;
1018
1019        while i < buf.len() {
1020            // we cannot match if the first character isn't the same
1021            // so we accelerate the search by finding potential matches
1022            let Some(k) = memchr(*needle, &buf[i..]) else {
1023                break;
1024            };
1025
1026            i += k;
1027
1028            // if we want a full word match
1029            if self.str_mods.contains(StringMod::FullWordMatch) {
1030                let prev_is_whitespace = buf
1031                    .get(i.saturating_sub(1))
1032                    .map(|c| c.is_ascii_whitespace())
1033                    .unwrap_or_default();
1034
1035                // if it is not the first character
1036                // and its previous character isn't
1037                // a whitespace. It cannot be a
1038                // fullword match
1039                if i > 0 && !prev_is_whitespace {
1040                    i += 1;
1041                    continue;
1042                }
1043            }
1044
1045            if let Some(npos) = self.n_pos
1046                && i > npos
1047            {
1048                break;
1049            }
1050
1051            let pos = i;
1052            let (ok, consumed) = string_match(&self.str, self.str_mods, &buf[i..]);
1053
1054            if ok {
1055                return Some(MatchRes::Bytes(
1056                    off_buf.saturating_add(pos as u64),
1057                    None,
1058                    &buf[i..i + consumed],
1059                    Encoding::Utf8,
1060                ));
1061            } else {
1062                i += max(consumed, 1)
1063            }
1064        }
1065
1066        // handles the case where we want the string not to be found
1067        if self.cmp_op.is_neq() {
1068            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
1069        }
1070
1071        None
1072    }
1073}
1074
/// Test comparing a scalar read from the haystack against a test value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTest {
    /// Type of the scalar to read from the haystack.
    ty: ScalarDataType,
    /// Optional transformation applied to the value read before comparing it.
    transform: Option<ScalarTransform>,
    /// Operator used to compare the (transformed) value read with `test_val`.
    cmp_op: CmpOp,
    /// Value to compare against, or `TestValue::Any` to accept any value.
    test_val: TestValue<Scalar>,
}
1082
/// Test comparing a float read from the haystack against a test value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTest {
    /// Type of the float to read from the haystack.
    ty: FloatDataType,
    /// Optional transformation applied to the value read before comparing it.
    transform: Option<FloatTransform>,
    /// Operator used to compare the (transformed) value read with `test_val`.
    /// `Test::match_value` only handles `Eq`, `Lt`, `Gt` and `Neq` for floats.
    cmp_op: CmpOp,
    /// Value to compare against, or `TestValue::Any` to accept any value.
    test_val: TestValue<Float>,
}
1090
// The value read from the haystack that a test is matched against.
// 'buf is the lifetime of the buffer we are scanning.
//
// The first field of every variant is the offset of the haystack at which
// the value was read (see `Test::read_test_value`).
#[derive(PartialEq)]
enum ReadValue<'buf> {
    // a floating point value
    Float(u64, Float),
    // a scalar value
    Scalar(u64, Scalar),
    // raw bytes borrowed from the buffer
    Bytes(u64, &'buf [u8]),
}
1099
1100impl<'buf> Debug for ReadValue<'buf> {
1101    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1102        match self {
1103            Self::Float(_, fl) => write!(f, "{fl:?}"),
1104            Self::Scalar(_, s) => write!(f, "{s:?}"),
1105            Self::Bytes(_, b) => {
1106                if b.len() <= 128 {
1107                    write!(f, "\"{}\"", debug_string_from_vec_u8(b))
1108                } else {
1109                    let limit = 128;
1110                    write!(
1111                        f,
1112                        "\"{}\" (first {limit} bytes)",
1113                        debug_string_from_vec_u8(&b[..limit])
1114                    )
1115                }
1116            }
1117        }
1118    }
1119}
1120
1121impl DynDisplay for ReadValue<'_> {
1122    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1123        match self {
1124            Self::Float(_, s) => DynDisplay::dyn_fmt(s, f),
1125            Self::Scalar(_, s) => DynDisplay::dyn_fmt(s, f),
1126            Self::Bytes(_, b) => Ok(format!("{b:?}")),
1127        }
1128    }
1129}
1130
1131impl DynDisplay for &ReadValue<'_> {
1132    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1133        // Dereference self to get the TestValue and call its fmt method
1134        DynDisplay::dyn_fmt(*self, f)
1135    }
1136}
1137
1138impl Display for ReadValue<'_> {
1139    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1140        match self {
1141            Self::Float(_, v) => write!(f, "{v}"),
1142            Self::Scalar(_, s) => write!(f, "{s}"),
1143            Self::Bytes(_, b) => write!(f, "{b:?}"),
1144        }
1145    }
1146}
1147
/// Encoding of the bytes carried by `MatchRes::Bytes`; it selects how they
/// are decoded into a string when the match is formatted.
enum Encoding {
    /// UTF-16 code units, in the given byte order.
    Utf16(String16Encoding),
    /// UTF-8 (decoded lossily when formatted).
    Utf8,
}
1152
// Result of a successful test: carries the offset of the start of the data
// in the stream and the data itself.
enum MatchRes<'buf> {
    // Bytes.0: offset of the match
    // Bytes.1: optional end of match (to address the need of EOL adjustment in string regex);
    //          when absent the end is the start offset plus the length of Bytes.2
    // Bytes.2: the bytes matching
    // Bytes.3: encoding of the buffer
    Bytes(u64, Option<u64>, &'buf [u8], Encoding),
    // Scalar.0: offset of the match
    // Scalar.1: the scalar matching
    Scalar(u64, Scalar),
    // Float.0: offset of the match
    // Float.1: the float matching
    Float(u64, Float),
}
1164
1165impl DynDisplay for &MatchRes<'_> {
1166    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1167        (*self).dyn_fmt(f)
1168    }
1169}
1170
1171impl DynDisplay for MatchRes<'_> {
1172    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1173        match self {
1174            Self::Scalar(_, v) => v.dyn_fmt(f),
1175            Self::Float(_, v) => v.dyn_fmt(f),
1176            Self::Bytes(_, _, v, enc) => match enc {
1177                Encoding::Utf8 => String::from_utf8_lossy(v).to_string().dyn_fmt(f),
1178                Encoding::Utf16(enc) => {
1179                    let utf16: Vec<u16> = slice_to_utf16_iter(v, *enc).collect();
1180                    String::from_utf16_lossy(&utf16).dyn_fmt(f)
1181                }
1182            },
1183        }
1184    }
1185}
1186
1187impl MatchRes<'_> {
1188    // start offset of the match
1189    #[inline]
1190    fn start_offset(&self) -> u64 {
1191        match self {
1192            MatchRes::Bytes(o, _, _, _) => *o,
1193            MatchRes::Scalar(o, _) => *o,
1194            MatchRes::Float(o, _) => *o,
1195        }
1196    }
1197
1198    // start offset of the match
1199    #[inline]
1200    fn end_offset(&self) -> u64 {
1201        match self {
1202            MatchRes::Bytes(start, end, buf, _) => match end {
1203                Some(end) => *end,
1204                None => start.saturating_add(buf.len() as u64),
1205            },
1206            MatchRes::Scalar(o, sc) => o.add(sc.size_of() as u64),
1207            MatchRes::Float(o, f) => o.add(f.size_of() as u64),
1208        }
1209    }
1210}
1211
1212fn slice_to_utf16_iter(read: &[u8], encoding: String16Encoding) -> impl Iterator<Item = u16> {
1213    let even = read
1214        .iter()
1215        .enumerate()
1216        .filter(|(i, _)| i % 2 == 0)
1217        .map(|t| t.1);
1218
1219    let odd = read
1220        .iter()
1221        .enumerate()
1222        .filter(|(i, _)| i % 2 != 0)
1223        .map(|t| t.1);
1224
1225    even.zip(odd).map(move |(e, o)| match encoding {
1226        String16Encoding::Le => u16::from_le_bytes([*e, *o]),
1227        String16Encoding::Be => u16::from_be_bytes([*e, *o]),
1228    })
1229}
1230
/// Byte order of the 16-bit units of a UTF-16 string.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum String16Encoding {
    /// Little endian: units are decoded with `u16::from_le_bytes`.
    Le,
    /// Big endian: units are decoded with `u16::from_be_bytes`.
    Be,
}
1236
/// Test matching a UTF-16 encoded string.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct String16Test {
    /// String form of the test; `Test::match_value` returns its bytes as the
    /// matched data when the test value matches.
    /// NOTE(review): presumably the string as written in the magic rule,
    /// confirm against the parser.
    orig: String,
    /// Expected 16-bit units, or `TestValue::Any` to accept any string.
    test_val: TestValue<Vec<u16>>,
    /// Byte order used to decode the 16-bit units read from the haystack.
    encoding: String16Encoding,
}
1243
1244impl String16Test {
1245    /// if the test value is a specific value this method returns
1246    /// the number of utf16 characters. To obtain the length in
1247    /// bytes the return value needs to be multiplied by two.
1248    #[inline(always)]
1249    fn test_value_len(&self) -> usize {
1250        match self.test_val.as_ref() {
1251            TestValue::Value(str16) => str16.len(),
1252            TestValue::Any => 0,
1253        }
1254    }
1255}
1256
// Modifiers carried by `Test::Indirect`.
flags! {
    enum IndirectMod: u8{
        Relative,
    }
}

// Set of indirect modifiers.
type IndirectMods = FlagSet<IndirectMod>;
1264
/// Encoding of the length prefix stored in front of a pstring.
///
/// The letter next to each variant is kept from the original source; it is
/// presumably the corresponding modifier of the magic rule syntax.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum PStringLen {
    Byte,    // B: one byte
    ShortBe, // H: two bytes, big endian
    ShortLe, // h: two bytes, little endian
    LongBe,  // L: four bytes, big endian
    LongLe,  // l: four bytes, little endian
}
1273
1274impl PStringLen {
1275    #[inline(always)]
1276    const fn size_of_len(&self) -> usize {
1277        match self {
1278            PStringLen::Byte => 1,
1279            PStringLen::ShortBe => 2,
1280            PStringLen::ShortLe => 2,
1281            PStringLen::LongBe => 4,
1282            PStringLen::LongLe => 4,
1283        }
1284    }
1285}
1286
/// Test matching a length-prefixed string (pstring).
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PStringTest {
    /// Encoding of the length prefix.
    len: PStringLen,
    /// Expected string bytes, or `TestValue::Any` to accept any string.
    test_val: TestValue<Vec<u8>>,
    /// When `true`, the length read from the prefix also counts the prefix
    /// bytes, which are subtracted to get the string length (see `read`).
    include_len: bool,
}
1293
1294impl PStringTest {
1295    #[inline]
1296    fn read<'cache, R: Read + Seek>(
1297        &self,
1298        haystack: &'cache mut LazyCache<R>,
1299    ) -> Result<Option<&'cache [u8]>, Error> {
1300        let mut len = match self.len {
1301            PStringLen::Byte => read_le!(haystack, u8) as u32,
1302            PStringLen::ShortBe => read_be!(haystack, u16) as u32,
1303            PStringLen::ShortLe => read_le!(haystack, u16) as u32,
1304            PStringLen::LongBe => read_be!(haystack, u32),
1305            PStringLen::LongLe => read_le!(haystack, u32),
1306        } as usize;
1307
1308        if self.include_len {
1309            len = len.saturating_sub(self.len.size_of_len())
1310        }
1311
1312        if let TestValue::Value(s) = self.test_val.as_ref()
1313            && len != s.len()
1314        {
1315            return Ok(None);
1316        }
1317
1318        let read = haystack.read_exact_count(len as u64)?;
1319
1320        Ok(Some(read))
1321    }
1322
1323    #[inline(always)]
1324    fn test_value_len(&self) -> usize {
1325        match self.test_val.as_ref() {
1326            TestValue::Value(s) => s.len(),
1327            TestValue::Any => 0,
1328        }
1329    }
1330}
1331
/// All the kinds of test a magic rule can carry.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Test {
    /// Named rule; carries the name.
    Name(String),
    /// Use of a named rule: a boolean flag (bound as `flip` by the `Display`
    /// implementation) and the name of the rule.
    Use(bool, String),
    /// Scalar comparison.
    Scalar(ScalarTest),
    /// Float comparison.
    Float(FloatTest),
    /// String comparison.
    String(StringTest),
    /// Search of a string in a buffer.
    Search(SearchTest),
    /// Length-prefixed string comparison.
    PString(PStringTest),
    /// Regular expression match.
    Regex(RegexTest),
    /// Indirect test with its modifiers.
    Indirect(FlagSet<IndirectMod>),
    /// UTF-16 string comparison.
    String16(String16Test),
    // FIXME: placeholder for strength computation
    #[allow(dead_code)]
    Der,
    /// `clear` test.
    Clear,
    /// `default` test.
    Default,
}
1350
1351impl Display for Test {
1352    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1353        match self {
1354            Test::Name(name) => write!(f, "name {name}"),
1355            Test::Use(flip, rule) => {
1356                if *flip {
1357                    write!(f, "use {rule}")
1358                } else {
1359                    write!(f, "use ^{rule}")
1360                }
1361            }
1362            Test::Scalar(st) => write!(f, "{st:?}"),
1363            Test::Float(ft) => write!(f, "{ft:?}"),
1364            Test::String(st) => write!(f, "{st:?}"),
1365            Test::Search(st) => write!(f, "{st:?}"),
1366            Test::PString(pt) => write!(f, "{pt:?}"),
1367            Test::Regex(rt) => write!(f, "{rt:?}"),
1368            Test::Indirect(fs) => write!(f, "indirect {fs:?}"),
1369            Test::String16(s16t) => write!(f, "{s16t:?}"),
1370            Test::Der => write!(f, "unimplemented der"),
1371            Test::Clear => write!(f, "clear"),
1372            Test::Default => write!(f, "default"),
1373        }
1374    }
1375}
1376
1377impl Test {
1378    // read the value to test from the haystack
1379    #[inline]
1380    fn read_test_value<'haystack, R: Read + Seek>(
1381        &self,
1382        haystack: &'haystack mut LazyCache<R>,
1383        switch_endianness: bool,
1384    ) -> Result<Option<ReadValue<'haystack>>, Error> {
1385        let test_value_offset = haystack.lazy_stream_position();
1386
1387        match self {
1388            Self::Scalar(t) => {
1389                t.ty.read(haystack, switch_endianness)
1390                    .map(|s| Some(ReadValue::Scalar(test_value_offset, s)))
1391            }
1392
1393            Self::Float(t) => {
1394                t.ty.read(haystack, switch_endianness)
1395                    .map(|f| Some(ReadValue::Float(test_value_offset, f)))
1396            }
1397            Self::String(t) => {
1398                match t.test_val.as_ref() {
1399                    TestValue::Value(str) => {
1400                        let buf = if let Some(length) = t.length {
1401                            // if there is a length specified
1402                            haystack.read_exact_count(length as u64)?
1403                        } else {
1404                            // no length specified we read until end of string
1405
1406                            match t.cmp_op {
1407                                CmpOp::Eq | CmpOp::Neq => {
1408                                    if !t.has_length_mod() {
1409                                        haystack.read_exact_count(str.len() as u64)?
1410                                    } else {
1411                                        haystack.read_count(FILE_BYTES_MAX as u64)?
1412                                    }
1413                                }
1414                                CmpOp::Lt | CmpOp::Gt => {
1415                                    let read =
1416                                        haystack.read_until_any_delim_or_limit(b"\n\0", 8092)?;
1417
1418                                    if read.ends_with(b"\0") || read.ends_with(b"\n") {
1419                                        &read[..read.len() - 1]
1420                                    } else {
1421                                        read
1422                                    }
1423                                }
1424                                _ => {
1425                                    return Err(Error::Msg(format!(
1426                                        "string test does not support {:?} operator",
1427                                        t.cmp_op
1428                                    )));
1429                                }
1430                            }
1431                        };
1432
1433                        Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1434                    }
1435                    TestValue::Any => {
1436                        let read = haystack.read_until_any_delim_or_limit(b"\0\n", 8192)?;
1437                        // we don't take last byte if it matches end of string
1438                        let bytes = if read.ends_with(b"\0") || read.ends_with(b"\n") {
1439                            &read[..read.len() - 1]
1440                        } else {
1441                            read
1442                        };
1443
1444                        Ok(Some(ReadValue::Bytes(test_value_offset, bytes)))
1445                    }
1446                }
1447            }
1448
1449            Self::String16(t) => {
1450                match t.test_val.as_ref() {
1451                    TestValue::Value(str16) => {
1452                        let read = haystack.read_exact_count((str16.len() * 2) as u64)?;
1453
1454                        Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1455                    }
1456                    TestValue::Any => {
1457                        let read = haystack.read_until_utf16_or_limit(b"\x00\x00", 8192)?;
1458
1459                        // we make sure we have an even number of elements
1460                        let end = if read.len() % 2 == 0 {
1461                            read.len()
1462                        } else {
1463                            // we decide to read anyway even though
1464                            // length isn't even
1465                            read.len().saturating_sub(1)
1466                        };
1467
1468                        Ok(Some(ReadValue::Bytes(test_value_offset, &read[..end])))
1469                    }
1470                }
1471            }
1472
1473            Self::PString(t) => {
1474                let Some(read) = t.read(haystack)? else {
1475                    return Ok(None);
1476                };
1477                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1478            }
1479
1480            Self::Search(_) => {
1481                let buf = haystack.read_count(FILE_BYTES_MAX as u64)?;
1482                Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1483            }
1484
1485            Self::Regex(r) => {
1486                let length = {
1487                    match r.length {
1488                        Some(len) => {
1489                            if r.mods.contains(ReMod::LineLimit) {
1490                                len * 80
1491                            } else {
1492                                len
1493                            }
1494                        }
1495
1496                        None => FILE_REGEX_MAX,
1497                    }
1498                };
1499
1500                let read = haystack.read_count(length as u64)?;
1501                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1502            }
1503
1504            Self::Name(_)
1505            | Self::Use(_, _)
1506            | Self::Indirect(_)
1507            | Self::Clear
1508            | Self::Default
1509            | Self::Der => Err(Error::msg("no value to read for this test")),
1510        }
1511    }
1512
1513    #[inline(always)]
1514    fn match_value<'s>(
1515        &'s self,
1516        tv: &ReadValue<'s>,
1517        stream_kind: StreamKind,
1518    ) -> Option<MatchRes<'s>> {
1519        match (self, tv) {
1520            (Self::Scalar(t), ReadValue::Scalar(o, ts)) => {
1521                let read_value: Scalar = match t.transform.as_ref() {
1522                    Some(t) => t.apply(*ts)?,
1523                    None => *ts,
1524                };
1525
1526                match t.test_val {
1527                    TestValue::Value(test_value) => {
1528                        let ok = match t.cmp_op {
1529                            // NOTE: this should not happen in practice because
1530                            // we convert it into Eq equivalent at parsing time
1531                            CmpOp::Not => read_value == !test_value,
1532                            CmpOp::Eq => read_value == test_value,
1533                            CmpOp::Lt => read_value < test_value,
1534                            CmpOp::Gt => read_value > test_value,
1535                            CmpOp::Neq => read_value != test_value,
1536                            CmpOp::BitAnd => read_value & test_value == test_value,
1537                            CmpOp::Xor => (read_value & test_value).is_zero(),
1538                        };
1539
1540                        if ok {
1541                            Some(MatchRes::Scalar(*o, read_value))
1542                        } else {
1543                            None
1544                        }
1545                    }
1546
1547                    TestValue::Any => Some(MatchRes::Scalar(*o, read_value)),
1548                }
1549            }
1550
1551            (Self::Float(t), ReadValue::Float(o, f)) => {
1552                let read_value: Float = t.transform.as_ref().map(|t| t.apply(*f)).unwrap_or(*f);
1553
1554                match t.test_val {
1555                    TestValue::Value(tf) => {
1556                        let ok = match t.cmp_op {
1557                            CmpOp::Eq => read_value == tf,
1558                            CmpOp::Lt => read_value < tf,
1559                            CmpOp::Gt => read_value > tf,
1560                            CmpOp::Neq => read_value != tf,
1561                            _ => {
1562                                // this should never be reached as we validate
1563                                // operator in parser
1564                                debug_panic!("unsupported float comparison");
1565                                debug!("unsupported float comparison");
1566                                false
1567                            }
1568                        };
1569
1570                        if ok {
1571                            Some(MatchRes::Float(*o, read_value))
1572                        } else {
1573                            None
1574                        }
1575                    }
1576                    TestValue::Any => Some(MatchRes::Float(*o, read_value)),
1577                }
1578            }
1579
1580            (Self::String(st), ReadValue::Bytes(o, buf)) => {
1581                macro_rules! trim_buf {
1582                    ($buf: expr) => {{
1583                        if st.mods.contains(StringMod::Trim) {
1584                            $buf.trim_ascii()
1585                        } else {
1586                            $buf
1587                        }
1588                    }};
1589                }
1590
1591                match st.test_val.as_ref() {
1592                    TestValue::Value(str) => {
1593                        match st.cmp_op {
1594                            CmpOp::Eq => {
1595                                if let (true, _) = string_match(str, st.mods, buf) {
1596                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1597                                } else {
1598                                    None
1599                                }
1600                            }
1601                            CmpOp::Neq => {
1602                                if let (false, _) = string_match(str, st.mods, buf) {
1603                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1604                                } else {
1605                                    None
1606                                }
1607                            }
1608                            CmpOp::Gt => {
1609                                if buf.len() > str.len() {
1610                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1611                                } else {
1612                                    None
1613                                }
1614                            }
1615                            CmpOp::Lt => {
1616                                if buf.len() < str.len() {
1617                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1618                                } else {
1619                                    None
1620                                }
1621                            }
1622
1623                            // unsupported for strings
1624                            _ => {
1625                                // this should never be reached as we validate
1626                                // operator in parser
1627                                debug_panic!("unsupported string comparison");
1628                                debug!("unsupported string comparison");
1629                                None
1630                            }
1631                        }
1632                    }
1633                    TestValue::Any => {
1634                        Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1635                    }
1636                }
1637            }
1638
1639            (Self::PString(m), ReadValue::Bytes(o, buf)) => match m.test_val.as_ref() {
1640                TestValue::Value(psv) => {
1641                    if buf == psv {
1642                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8))
1643                    } else {
1644                        None
1645                    }
1646                }
1647                TestValue::Any => Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8)),
1648            },
1649
1650            (Self::String16(t), ReadValue::Bytes(o, buf)) => {
1651                match t.test_val.as_ref() {
1652                    TestValue::Value(str16) => {
1653                        // strings cannot be equal
1654                        if str16.len() * 2 != buf.len() {
1655                            return None;
1656                        }
1657
1658                        // we check string equality
1659                        for (i, utf16_char) in slice_to_utf16_iter(buf, t.encoding).enumerate() {
1660                            if str16[i] != utf16_char {
1661                                return None;
1662                            }
1663                        }
1664
1665                        Some(MatchRes::Bytes(
1666                            *o,
1667                            None,
1668                            t.orig.as_bytes(),
1669                            Encoding::Utf16(t.encoding),
1670                        ))
1671                    }
1672
1673                    TestValue::Any => {
1674                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf16(t.encoding)))
1675                    }
1676                }
1677            }
1678
1679            (Self::Regex(r), ReadValue::Bytes(o, buf)) => r.match_buf(*o, stream_kind, buf),
1680
1681            (Self::Search(t), ReadValue::Bytes(o, buf)) => t.match_buf(*o, buf),
1682
1683            _ => None,
1684        }
1685    }
1686
1687    #[inline(always)]
1688    fn strength(&self) -> u64 {
1689        const MULT: usize = 10;
1690
1691        let mut out = 2 * MULT;
1692
1693        // FIXME: octal is missing but it is not used in practice ...
1694        match self {
1695            Test::Scalar(s) => {
1696                out += s.ty.type_size() * MULT;
1697            }
1698
1699            Test::Float(t) => {
1700                out += t.ty.type_size() * MULT;
1701            }
1702
1703            Test::String(t) => out += t.test_value_len().saturating_mul(MULT),
1704
1705            Test::PString(t) => out += t.test_value_len().saturating_mul(MULT),
1706
1707            Test::Search(s) => {
1708                // NOTE: this implementation deviates from what is in
1709                // C libmagic. The purpose of this implementation is to
1710                // minimize the difference between similar tests,
1711                // implemented differently (ex: string test VS very localized search test).
1712                let n_pos = s.n_pos.unwrap_or(FILE_BYTES_MAX);
1713
1714                match n_pos {
1715                    // a search on one line should be equivalent to a string match
1716                    0..=80 => out += s.str.len().saturating_mul(MULT),
1717                    // search on the first 3 lines gets a little penalty
1718                    81..=240 => out += s.str.len() * s.str.len().clamp(0, MULT - 2),
1719                    // a search on more than 3 lines isn't considered very accurate
1720                    _ => out += s.str.len(),
1721                }
1722            }
1723
1724            Test::Regex(r) => {
1725                // NOTE: this implementation deviates from what is in
1726                // C libmagic. The purpose of this implementation is to
1727                // minimize the difference between similar tests,
1728                // implemented differently (ex: string test VS very localized regex test).
1729
1730                // we divide length by the number of capture group
1731                // which gives us a value close to he average string
1732                // length match in the regex.
1733                let v = r.non_magic_len / r.re.captures_len();
1734
1735                let len = r
1736                    .length
1737                    .map(|l| {
1738                        if r.mods.contains(ReMod::LineLimit) {
1739                            l * 80
1740                        } else {
1741                            l
1742                        }
1743                    })
1744                    .unwrap_or(FILE_BYTES_MAX);
1745
1746                match len {
1747                    // a search on one line should be equivalent to a string match
1748                    0..=80 => out += v.saturating_mul(MULT),
1749                    // search on the first 3 lines gets a little penalty
1750                    81..=240 => out += v * v.clamp(0, MULT - 2),
1751                    // a search on more than 3 lines isn't considered very accurate
1752                    _ => out += v,
1753                }
1754            }
1755
1756            Test::String16(t) => {
1757                // NOTE: in libmagic the result is div by 2
1758                // but I GUESS it is because the len is expressed
1759                // in number bytes. In our case length is expressed
1760                // in number of u16 so we shouldn't divide.
1761                out += t.test_value_len().saturating_mul(MULT);
1762            }
1763
1764            Test::Der => out += MULT,
1765
1766            Test::Default | Test::Name(_) | Test::Use(_, _) | Test::Indirect(_) | Test::Clear => {
1767                return 0;
1768            }
1769        }
1770
1771        // matching any output gets penalty
1772        if self.is_match_any() {
1773            return 0;
1774        }
1775
1776        if let Some(op) = self.cmp_op() {
1777            match op {
1778                // matching almost any gets penalty
1779                CmpOp::Neq => out = 0,
1780                CmpOp::Eq | CmpOp::Not => out += MULT,
1781                CmpOp::Lt | CmpOp::Gt => out -= 2 * MULT,
1782                CmpOp::Xor | CmpOp::BitAnd => out -= MULT,
1783            }
1784        }
1785
1786        out as u64
1787    }
1788
1789    #[inline(always)]
1790    fn cmp_op(&self) -> Option<CmpOp> {
1791        match self {
1792            Self::String(t) => Some(t.cmp_op),
1793            Self::Scalar(s) => Some(s.cmp_op),
1794            Self::Float(t) => Some(t.cmp_op),
1795            Self::Name(_)
1796            | Self::Use(_, _)
1797            | Self::Search(_)
1798            | Self::PString(_)
1799            | Self::Regex(_)
1800            | Self::Clear
1801            | Self::Default
1802            | Self::Indirect(_)
1803            | Self::String16(_)
1804            | Self::Der => None,
1805        }
1806    }
1807
1808    #[inline(always)]
1809    fn is_recursive(&self) -> bool {
1810        matches!(self, Test::Use(_, _) | Test::Indirect(_))
1811    }
1812
1813    #[inline(always)]
1814    fn is_match_any(&self) -> bool {
1815        match self {
1816            Test::Name(_) => false,
1817            Test::Use(_, _) => false,
1818            Test::Scalar(scalar_test) => matches!(scalar_test.test_val, TestValue::Any),
1819            Test::Float(float_test) => matches!(float_test.test_val, TestValue::Any),
1820            Test::String(string_test) => matches!(string_test.test_val, TestValue::Any),
1821            Test::Search(_) => false,
1822            Test::PString(pstring_test) => matches!(pstring_test.test_val, TestValue::Any),
1823            Test::Regex(_) => false,
1824            Test::Indirect(_) => false,
1825            Test::String16(string16_test) => matches!(string16_test.test_val, TestValue::Any),
1826            Test::Der => false,
1827            Test::Clear => false,
1828            Test::Default => false,
1829        }
1830    }
1831
1832    #[inline(always)]
1833    fn is_binary(&self) -> bool {
1834        match self {
1835            Self::Name(_) => true,
1836            Self::Use(_, _) => true,
1837            Self::Scalar(_) => true,
1838            Self::Float(_) => true,
1839            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1840            Self::Search(t) => t.is_binary(),
1841            Self::PString(_) => true,
1842            Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1843            Self::Clear => true,
1844            Self::Default => true,
1845            Self::Indirect(_) => true,
1846            Self::String16(_) => true,
1847            Self::Der => true,
1848        }
1849    }
1850
1851    #[inline(always)]
1852    fn is_text(&self) -> bool {
1853        match self {
1854            Self::Name(_) => true,
1855            Self::Use(_, _) => true,
1856            Self::Indirect(_) => true,
1857            Self::Clear => true,
1858            Self::Default => true,
1859            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1860            Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1861            _ => !self.is_binary(),
1862        }
1863    }
1864
1865    #[inline(always)]
1866    fn is_only_text(&self) -> bool {
1867        self.is_text() && !self.is_binary()
1868    }
1869
1870    #[inline(always)]
1871    fn is_only_binary(&self) -> bool {
1872        self.is_binary() && !self.is_text()
1873    }
1874}
1875
/// Kind of value read from the stream when resolving an indirect offset
/// (see `IndOffset::read_offset`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum OffsetType {
    /// 8 bits integer
    Byte,
    /// 64 bits float, little-endian
    DoubleLe,
    /// 64 bits float, big-endian
    DoubleBe,
    /// 16 bits integer, little-endian
    ShortLe,
    /// 16 bits integer, big-endian
    ShortBe,
    /// 32 bits little-endian value passed through `decode_id3`
    Id3Le,
    /// 32 bits big-endian value passed through `decode_id3`
    Id3Be,
    /// 32 bits integer, little-endian
    LongLe,
    /// 32 bits integer, big-endian
    LongBe,
    /// value read with the `read_me!` macro
    Middle,
    /// value parsed with `read_octal_u64`
    Octal,
    /// 64 bits integer, big-endian
    QuadBe,
    /// 64 bits integer, little-endian
    QuadLe,
}
1892
/// Right-hand operand of the operation applied to an indirect offset.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Shift {
    /// literal operand, used as is
    Direct(u64),
    /// the operand is itself read from the stream, at the address the
    /// offset was read from plus this signed displacement
    Indirect(i64),
}
1898
/// Offset whose actual value has to be read from the stream.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
struct IndOffset {
    // where to find the offset
    off_addr: DirOffset,
    // signed or unsigned
    signed: bool,
    // type of the offset
    ty: OffsetType,
    // operation applied to the value read; only used when `shift` is set too
    op: Option<Op>,
    // right-hand operand of `op`; only used when `op` is set too
    shift: Option<Shift>,
}
1910
1911impl IndOffset {
1912    // if we overflow we must not return an offset
1913    fn read_offset<R: Read + Seek>(
1914        &self,
1915        haystack: &mut LazyCache<R>,
1916        rule_base_offset: Option<u64>,
1917        last_upper_match_offset: Option<u64>,
1918    ) -> Result<Option<u64>, io::Error> {
1919        let offset_address = match self.off_addr {
1920            DirOffset::Start(s) => {
1921                let Some(o) = s.checked_add(rule_base_offset.unwrap_or_default()) else {
1922                    return Ok(None);
1923                };
1924
1925                haystack.seek(SeekFrom::Start(o))?
1926            }
1927            DirOffset::LastUpper(c) => haystack.seek(SeekFrom::Start(
1928                (last_upper_match_offset.unwrap_or_default() as i64 + c) as u64,
1929            ))?,
1930            DirOffset::End(e) => haystack.seek(SeekFrom::End(e))?,
1931        };
1932
1933        macro_rules! read_value {
1934            () => {
1935                match self.ty {
1936                    OffsetType::Byte => {
1937                        if self.signed {
1938                            read_le!(haystack, u8) as u64
1939                        } else {
1940                            read_le!(haystack, i8) as u64
1941                        }
1942                    }
1943                    OffsetType::DoubleLe => read_le!(haystack, f64) as u64,
1944                    OffsetType::DoubleBe => read_be!(haystack, f64) as u64,
1945                    OffsetType::ShortLe => {
1946                        if self.signed {
1947                            read_le!(haystack, i16) as u64
1948                        } else {
1949                            read_le!(haystack, u16) as u64
1950                        }
1951                    }
1952                    OffsetType::ShortBe => {
1953                        if self.signed {
1954                            read_be!(haystack, i16) as u64
1955                        } else {
1956                            read_be!(haystack, u16) as u64
1957                        }
1958                    }
1959                    OffsetType::Id3Le => decode_id3(read_le!(haystack, u32)) as u64,
1960                    OffsetType::Id3Be => decode_id3(read_be!(haystack, u32)) as u64,
1961                    OffsetType::LongLe => {
1962                        if self.signed {
1963                            read_le!(haystack, i32) as u64
1964                        } else {
1965                            read_le!(haystack, u32) as u64
1966                        }
1967                    }
1968                    OffsetType::LongBe => {
1969                        if self.signed {
1970                            read_be!(haystack, i32) as u64
1971                        } else {
1972                            read_be!(haystack, u32) as u64
1973                        }
1974                    }
1975                    OffsetType::Middle => read_me!(haystack) as u64,
1976                    OffsetType::Octal => {
1977                        if let Some(o) = read_octal_u64(haystack) {
1978                            o
1979                        } else {
1980                            debug!("failed to read octal offset @ {offset_address}");
1981                            return Ok(None);
1982                        }
1983                    }
1984                    OffsetType::QuadLe => {
1985                        if self.signed {
1986                            read_le!(haystack, i64) as u64
1987                        } else {
1988                            read_le!(haystack, u64)
1989                        }
1990                    }
1991                    OffsetType::QuadBe => {
1992                        if self.signed {
1993                            read_be!(haystack, i64) as u64
1994                        } else {
1995                            read_be!(haystack, u64)
1996                        }
1997                    }
1998                }
1999            };
2000        }
2001
2002        // in theory every offset read should end up in something seekable from start, so we can use u64 to store the result
2003        let o = read_value!();
2004
2005        trace!(
2006            "offset read @ {offset_address} value={o} op={:?} shift={:?}",
2007            self.op, self.shift
2008        );
2009
2010        // apply transformation
2011        if let (Some(op), Some(shift)) = (self.op, self.shift) {
2012            let shift = match shift {
2013                Shift::Direct(i) => i,
2014                Shift::Indirect(i) => {
2015                    let tmp = offset_address as i128 + i as i128;
2016                    if tmp.is_negative() {
2017                        return Ok(None);
2018                    } else {
2019                        haystack.seek(SeekFrom::Start(tmp as u64))?;
2020                    };
2021                    // NOTE: here we assume that the shift has the same
2022                    // type as the main offset !
2023                    read_value!()
2024                }
2025            };
2026
2027            match op {
2028                Op::Add => return Ok(o.checked_add(shift)),
2029                Op::Mul => return Ok(o.checked_mul(shift)),
2030                Op::Sub => return Ok(o.checked_sub(shift)),
2031                Op::Div => return Ok(o.checked_div(shift)),
2032                Op::Mod => return Ok(o.checked_rem(shift)),
2033                Op::And => return Ok(Some(o & shift)),
2034                Op::Or => return Ok(Some(o | shift)),
2035                Op::Xor => return Ok(Some(o ^ shift)),
2036            }
2037        }
2038
2039        Ok(Some(o))
2040    }
2041}
2042
/// Direct offset: a position relative to the start, to the last upper-level
/// field or to the end.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum DirOffset {
    // counted from the start (the rule base offset, if any, is added on
    // top of it when matching)
    Start(u64),
    // relative to the last up-level field
    LastUpper(i64),
    // value handed to `SeekFrom::End`
    End(i64),
}
2050
/// Offset at which a test is applied.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Offset {
    /// position expressed directly
    Direct(DirOffset),
    /// position whose value is read from the stream
    Indirect(IndOffset),
}
2056
2057impl Offset {
2058    #[inline(always)]
2059    fn is_indirect(&self) -> bool {
2060        matches!(self, Self::Indirect(_))
2061    }
2062}
2063
2064impl From<DirOffset> for Offset {
2065    fn from(value: DirOffset) -> Self {
2066        Self::Direct(value)
2067    }
2068}
2069
2070impl From<IndOffset> for Offset {
2071    fn from(value: IndOffset) -> Self {
2072        Self::Indirect(value)
2073    }
2074}
2075
2076impl Display for DirOffset {
2077    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2078        match self {
2079            DirOffset::Start(i) => write!(f, "{i}"),
2080            DirOffset::LastUpper(c) => write!(f, "&{c}"),
2081            DirOffset::End(e) => write!(f, "-{e}"),
2082        }
2083    }
2084}
2085
2086impl Default for DirOffset {
2087    fn default() -> Self {
2088        Self::LastUpper(0)
2089    }
2090}
2091
/// A single test of a magic rule together with the data needed to run it.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Match {
    /// line reported in errors and log messages
    line: usize,
    /// depth of the entry, used as its continuation level
    depth: u8,
    /// where the test is applied
    offset: Offset,
    /// the test to run
    test: Test,
    /// strength of `test`, computed with `Test::strength` when the `Match`
    /// is built from a `Use` or a `Name`
    test_strength: u64,
    /// optional message pushed to the result when the test matches
    message: Option<Message>,
}
2101
2102impl From<Use> for Match {
2103    fn from(value: Use) -> Self {
2104        let test = Test::Use(value.switch_endianness, value.rule_name);
2105        let test_strength = test.strength();
2106        Self {
2107            line: value.line,
2108            depth: value.depth,
2109            offset: value.start_offset,
2110            test,
2111            test_strength,
2112            message: value.message,
2113        }
2114    }
2115}
2116
2117impl From<Name> for Match {
2118    fn from(value: Name) -> Self {
2119        let test = Test::Name(value.name);
2120        let test_strength = test.strength();
2121        Self {
2122            line: value.line,
2123            depth: 0,
2124            offset: Offset::Direct(DirOffset::Start(0)),
2125            test,
2126            test_strength,
2127            message: value.message,
2128        }
2129    }
2130}
2131
2132impl Match {
2133    /// Turns the `Match`'s offset into an absolute offset from the start of the stream
2134    #[inline(always)]
2135    fn offset_from_start<R: Read + Seek>(
2136        &self,
2137        haystack: &mut LazyCache<R>,
2138        rule_base_offset: Option<u64>,
2139        last_level_offset: Option<u64>,
2140    ) -> Result<Option<u64>, io::Error> {
2141        match self.offset {
2142            Offset::Direct(dir_offset) => match dir_offset {
2143                DirOffset::Start(s) => Ok(Some(s)),
2144                DirOffset::LastUpper(shift) => {
2145                    let o = last_level_offset.unwrap_or_default() as i64 + shift;
2146
2147                    if o >= 0 { Ok(Some(o as u64)) } else { Ok(None) }
2148                }
2149                DirOffset::End(e) => Ok(Some(haystack.offset_from_start(SeekFrom::End(e)))),
2150            },
2151            Offset::Indirect(ind_offset) => {
2152                let Some(o) =
2153                    ind_offset.read_offset(haystack, rule_base_offset, last_level_offset)?
2154                else {
2155                    return Ok(None);
2156                };
2157
2158                Ok(Some(o))
2159            }
2160        }
2161    }
2162
2163    /// this method emulates the buffer based matching
2164    /// logic implemented in libmagic. It needs some aweful
2165    /// and weird offset convertions to turn buffer
2166    /// relative offsets (libmagic is based on) into
2167    /// absolute offset in the file.
2168    ///
2169    /// this method shoud bubble up only critical errors
2170    /// all the other errors should make the match result
2171    /// false and be logged via debug!
2172    ///
2173    /// the function returns an error if the maximum recursion
2174    /// has been reached or if a dependency rule is missing.
2175    #[inline]
2176    #[allow(clippy::too_many_arguments)]
2177    fn matches<'a: 'h, 'h, R: Read + Seek>(
2178        &'a self,
2179        source: Option<&str>,
2180        magic: &mut Magic<'a>,
2181        stream_kind: StreamKind,
2182        state: &mut MatchState,
2183        buf_base_offset: Option<u64>,
2184        rule_base_offset: Option<u64>,
2185        last_level_offset: Option<u64>,
2186        haystack: &'h mut LazyCache<R>,
2187        switch_endianness: bool,
2188        db: &'a MagicDb,
2189        depth: usize,
2190    ) -> Result<(bool, Option<MatchRes<'h>>), Error> {
2191        let source = source.unwrap_or("unknown");
2192        let line = self.line;
2193
2194        if depth >= MAX_RECURSION {
2195            return Err(Error::localized(
2196                source,
2197                line,
2198                Error::MaximumRecursion(MAX_RECURSION),
2199            ));
2200        }
2201
2202        if self.test.is_only_binary() && stream_kind.is_text() {
2203            trace!("skip binary test source={source} line={line} stream_kind={stream_kind:?}",);
2204            return Ok((false, None));
2205        }
2206
2207        if self.test.is_only_text() && !stream_kind.is_text() {
2208            trace!("skip text test source={source} line={line} stream_kind={stream_kind:?}",);
2209            return Ok((false, None));
2210        }
2211
2212        let Ok(Some(mut offset)) = self
2213            .offset_from_start(haystack, rule_base_offset, last_level_offset)
2214            .inspect_err(|e| debug!("source={source} line={line} failed at computing offset: {e}"))
2215        else {
2216            return Ok((false, None));
2217        };
2218
2219        offset = match self.offset {
2220            Offset::Indirect(_) => {
2221                // the result we get for an indirect offset
2222                // is relative to the start of the libmagic
2223                // buffer so we need to add base to make it
2224                // absolute.
2225                buf_base_offset.unwrap_or_default().saturating_add(offset)
2226            }
2227            // offset from start are computed from rule base
2228            Offset::Direct(DirOffset::Start(_)) => {
2229                rule_base_offset.unwrap_or_default().saturating_add(offset)
2230            }
2231            _ => offset,
2232        };
2233
2234        match &self.test {
2235            Test::Clear => {
2236                trace!("source={source} line={line} clear");
2237                state.clear_continuation_level(&self.continuation_level());
2238                Ok((true, None))
2239            }
2240
2241            Test::Name(name) => {
2242                trace!(
2243                    "source={source} line={line} running rule {name} switch_endianness={switch_endianness}",
2244                );
2245                Ok((true, None))
2246            }
2247
2248            Test::Use(flip_endianness, rule_name) => {
2249                trace!(
2250                    "source={source} line={line} use {rule_name} switch_endianness={flip_endianness}",
2251                );
2252
2253                // switch_endianness must propagate down the rule call stack
2254                let switch_endianness = switch_endianness ^ flip_endianness;
2255
2256                let dr: &DependencyRule = db.dependencies.get(rule_name).ok_or(
2257                    Error::localized(source, line, Error::MissingRule(rule_name.clone())),
2258                )?;
2259
2260                // we push the message here otherwise we push message in depth first
2261                if let Some(msg) = self.message.as_ref() {
2262                    magic.push_message(msg.to_string_lossy());
2263                }
2264
2265                let new_buf_base_off = if self.offset.is_indirect() {
2266                    Some(offset)
2267                } else {
2268                    None
2269                };
2270
2271                let nmatch = dr.rule.magic(
2272                    magic,
2273                    stream_kind,
2274                    new_buf_base_off,
2275                    Some(offset),
2276                    haystack,
2277                    db,
2278                    switch_endianness,
2279                    depth.saturating_add(1),
2280                )?;
2281
2282                // The name is always true, so we consider there to be a match
2283                // if more than one test succeeded
2284                let matched = nmatch > 0;
2285                if matched {
2286                    state.set_continuation_level(self.continuation_level());
2287                }
2288
2289                Ok((matched, None))
2290            }
2291
2292            Test::Indirect(m) => {
2293                trace!(
2294                    "source={source} line={line} indirect mods={:?} offset={offset:#x}",
2295                    m
2296                );
2297
2298                let new_buf_base_off = if m.contains(IndirectMod::Relative) {
2299                    Some(offset)
2300                } else {
2301                    None
2302                };
2303
2304                // we push the message here otherwise we push message in depth first
2305                if let Some(msg) = self.message.as_ref() {
2306                    magic.push_message(msg.to_string_lossy());
2307                }
2308
2309                let mut nmatch = 0u64;
2310                for r in db.rules.iter() {
2311                    nmatch = nmatch.saturating_add(r.magic(
2312                        magic,
2313                        stream_kind,
2314                        new_buf_base_off,
2315                        Some(offset),
2316                        haystack,
2317                        db,
2318                        false,
2319                        depth.saturating_add(1),
2320                    )?);
2321
2322                    if nmatch > 0 {
2323                        break;
2324                    }
2325                }
2326
2327                Ok((nmatch > 0, None))
2328            }
2329
2330            Test::Default => {
2331                // default matches if nothing else at the continuation level matched
2332                let ok = !state.get_continuation_level(&self.continuation_level());
2333
2334                trace!("source={source} line={line} default match={ok}");
2335                if ok {
2336                    state.set_continuation_level(self.continuation_level());
2337                }
2338
2339                Ok((ok, None))
2340            }
2341
2342            _ => {
2343                if let Err(e) = haystack.seek(SeekFrom::Start(offset)) {
2344                    debug!("source={source} line={line} failed to seek in haystack: {e}");
2345                    return Ok((false, None));
2346                }
2347
2348                let mut trace_msg = None;
2349
2350                if enabled!(Level::DEBUG) {
2351                    trace_msg = Some(vec![format!(
2352                        "source={source} line={line} depth={} stream_offset={:#x}",
2353                        self.depth,
2354                        haystack.lazy_stream_position()
2355                    )])
2356                }
2357
2358                // NOTE: we may have a way to optimize here. In case we do a Any
2359                // test and we don't use the value to format the message, we don't
2360                // need to read the value.
2361                if let Ok(opt_test_value) = self
2362                    .test
2363                    .read_test_value(haystack, switch_endianness)
2364                    .inspect_err(|e| {
2365                        debug!("source={source} line={line} error while reading test value @{offset}: {e}",)
2366                    })
2367                {
2368                    if let Some(v) = trace_msg
2369                        .as_mut() { v.push(format!("test={}", self.test)) }
2370
2371                    if let Some(v) = trace_msg.as_mut(){
2372                        let drv = match opt_test_value.as_ref(){
2373                            Some(r) => format!("{r:?}"),
2374                            None =>String::new(),
2375                        };
2376                        v.push(format!("read_in_stream={drv}"))
2377                    }
2378
2379                    let match_res =
2380                        opt_test_value.and_then(|tv| self.test.match_value(&tv, stream_kind));
2381
2382                    if let Some(v) = trace_msg.as_mut() { v.push(format!(
2383                            "message=\"{}\" match={}",
2384                            self.message
2385                                .as_ref()
2386                                .map(|fs| fs.to_string_lossy())
2387                                .unwrap_or_default(),
2388                            match_res.is_some()
2389                        )) }
2390
2391                    // trace message
2392                    if enabled!(Level::DEBUG) && !enabled!(Level::TRACE) && match_res.is_some() {
2393                        if let Some(m) = trace_msg{
2394                            debug!("{}", m.join(" "));
2395                        }
2396                    } else if enabled!(Level::TRACE)
2397                        && let Some(m) = trace_msg{
2398                            trace!("{}", m.join(" "));
2399                        }
2400
2401                    if let Some(mr) = match_res {
2402                        state.set_continuation_level(self.continuation_level());
2403                        return Ok((true, Some(mr)));
2404                    }
2405                }
2406
2407                Ok((false, None))
2408            }
2409        }
2410    }
2411
2412    #[inline(always)]
2413    fn continuation_level(&self) -> ContinuationLevel {
2414        ContinuationLevel(self.depth)
2415    }
2416}
2417
/// Entry running another named rule; it is converted into a `Match`
/// holding a `Test::Use`.
#[derive(Debug, Clone)]
struct Use {
    /// line of the entry
    line: usize,
    /// depth of the entry
    depth: u8,
    /// offset of the resulting `Match`
    start_offset: Offset,
    /// name of the rule to run
    rule_name: String,
    /// whether endianness must be flipped while running the rule
    switch_endianness: bool,
    /// optional message attached to the entry
    message: Option<Message>,
}
2427
/// Modifier applied to the strength of an entry (see `StrengthMod::apply`).
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StrengthMod {
    /// operation to apply to the strength
    op: Op,
    /// right-hand operand of the operation
    by: u8,
}
2433
2434impl StrengthMod {
2435    #[inline(always)]
2436    fn apply(&self, strength: u64) -> u64 {
2437        let by = self.by as u64;
2438        debug!("applying strength modifier: {strength} {} {}", self.op, by);
2439        match self.op {
2440            Op::Mul => strength.saturating_mul(by),
2441            Op::Add => strength.saturating_add(by),
2442            Op::Sub => strength.saturating_sub(by),
2443            Op::Div => {
2444                if by > 0 {
2445                    strength.saturating_div(by)
2446                } else {
2447                    strength
2448                }
2449            }
2450            Op::Mod => strength % by,
2451            Op::And => strength & by,
2452            // this should never happen as strength operators
2453            // are enforced by our parser
2454            Op::Xor | Op::Or => {
2455                debug_panic!("unsupported strength operator");
2456                strength
2457            }
2458        }
2459    }
2460}
2461
/// Piece of metadata that is not a test.
// NOTE(review): presumably these are attached to the preceding `Match`
// when the entry tree is built — confirm against the parser.
#[derive(Debug, Clone)]
enum Flag {
    /// MIME type
    Mime(String),
    /// set of file extensions
    Ext(HashSet<String>),
    /// strength modifier
    Strength(StrengthMod),
    /// Apple specific string
    // NOTE(review): presumably what ends up in `set_creator_code` — confirm
    Apple(String),
}
2469
/// Entry declaring a named rule; it is converted into a `Match` holding a
/// `Test::Name` at depth 0 and offset 0.
#[derive(Debug, Clone)]
struct Name {
    /// line of the entry
    line: usize,
    /// name of the rule
    name: String,
    /// optional message attached to the entry
    message: Option<Message>,
}
2476
/// Entry of a magic file paired with a `Span`.
// NOTE(review): the span presumably locates the entry in the parsed
// source — confirm against the parser.
#[derive(Debug, Clone)]
enum Entry<'span> {
    /// a test
    Match(Span<'span>, Match),
    /// a piece of metadata
    Flag(Span<'span>, Flag),
}
2482
/// Node of a rule's entry tree: a test, the metadata attached to it and the
/// nodes evaluated when the test matches.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EntryNode {
    /// whether the node is the root of its tree; while matching, a root
    /// with an end-relative offset redefines the rule base offset
    root: bool,
    /// test run for this node
    entry: Match,
    /// nodes evaluated only when `entry` matches
    children: Vec<EntryNode>,
    /// MIME type set on the result when `entry` matches
    mimetype: Option<String>,
    /// creator code set on the result when `entry` matches
    apple: Option<String>,
    /// modifier applied to the strength of `entry` when it matches
    strength_mod: Option<StrengthMod>,
    /// extensions added to the result when `entry` matches
    exts: HashSet<String>,
}
2493
/// Accumulator filled while walking the entries of a rule.
#[derive(Debug, Default)]
struct EntryNodeVisitor {
    /// extensions collected so far
    exts: HashSet<String>,
    /// score accumulated so far
    score: u64,
}

impl EntryNodeVisitor {
    /// Creates an empty visitor: no extension and a score of zero.
    fn new() -> Self {
        Self::default()
    }

    /// Folds `other` into `self`: extensions are united and scores added.
    ///
    /// The addition saturates at `u64::MAX` so that merging can neither
    /// panic nor wrap around.
    fn merge(&mut self, other: Self) {
        self.exts.extend(other.exts);
        self.score = self.score.saturating_add(other.score);
    }
}
2512
2513impl EntryNode {
2514    #[inline]
2515    fn update_visitor(&self, v: &mut EntryNodeVisitor, depth: usize) {
2516        // update extensions
2517        for ext in self.exts.iter() {
2518            if !v.exts.contains(ext) {
2519                v.exts.insert(ext.clone());
2520            }
2521        }
2522
2523        // update score if depth
2524        if depth == 0 {
2525            v.score += self.entry.test_strength;
2526        }
2527
2528        // Tests at deeper levels contribute less to the overall score.
2529        // We use the minimum value to establish a lower bound for the rule's score,
2530        // which helps prioritize rules based on their importance.
2531        v.score += self
2532            .children
2533            .iter()
2534            .map(|e| e.entry.test_strength)
2535            .min()
2536            .unwrap_or_default()
2537            / max(1, depth as u64);
2538    }
2539
2540    fn visit(
2541        &self,
2542        v: &mut EntryNodeVisitor,
2543        deps: &HashMap<String, DependencyRule>,
2544        marked: &mut HashSet<String>,
2545        depth: usize,
2546    ) -> Result<(), Error> {
2547        // updating visitor
2548        self.update_visitor(v, depth);
2549
2550        // recursively visiting
2551        for c in self.children.iter() {
2552            if let Test::Use(_, ref name) = c.entry.test {
2553                if marked.contains(name) {
2554                    continue;
2555                }
2556
2557                marked.insert(name.clone());
2558
2559                if let Some(r) = deps.get(name) {
2560                    let dv = r.rule.visit_all_entries(deps, marked)?;
2561                    v.merge(dv);
2562                } else {
2563                    return Err(Error::MissingRule(name.clone()));
2564                }
2565            } else {
2566                c.visit(v, deps, marked, depth + 1)?;
2567            }
2568        }
2569
2570        Ok(())
2571    }
2572
2573    /// Executes the magic matching logic recursively and returns the count of matches that produce messages.
2574    /// Matches that don't result in message appends are not counted, consistent with libmagic's behavior.
2575    #[inline]
2576    #[allow(clippy::too_many_arguments)]
2577    fn matches<'r, R: Read + Seek>(
2578        &'r self,
2579        opt_source: Option<&str>,
2580        magic: &mut Magic<'r>,
2581        state: &mut MatchState,
2582        stream_kind: StreamKind,
2583        buf_base_offset: Option<u64>,
2584        rule_base_offset: Option<u64>,
2585        last_level_offset: Option<u64>,
2586        haystack: &mut LazyCache<R>,
2587        db: &'r MagicDb,
2588        switch_endianness: bool,
2589        depth: usize,
2590    ) -> Result<u64, Error> {
2591        let mut nmatch = 0u64;
2592
2593        let (ok, opt_match_res) = self.entry.matches(
2594            opt_source,
2595            magic,
2596            stream_kind,
2597            state,
2598            buf_base_offset,
2599            rule_base_offset,
2600            last_level_offset,
2601            haystack,
2602            switch_endianness,
2603            db,
2604            depth,
2605        )?;
2606
2607        let source = opt_source.unwrap_or("unknown");
2608        let line = self.entry.line;
2609
2610        if ok {
2611            // Update the magic with the message if the match is successful
2612            // Skip updating if the test is recursive, as it's already handled
2613            // in the Match::matches function
2614            if !self.entry.test.is_recursive()
2615                && let Some(msg) = self.entry.message.as_ref()
2616                && let Ok(msg) = msg.format_with(opt_match_res.as_ref()).inspect_err(|e| {
2617                    debug!("source={source} line={line} failed to format message: {e}")
2618                })
2619            {
2620                nmatch = nmatch.saturating_add(1);
2621                magic.push_message(msg);
2622            }
2623
2624            // we need to adjust stream offset in case of regex/search tests
2625            if let Some(mr) = opt_match_res {
2626                match &self.entry.test {
2627                    Test::String(t) if t.has_length_mod() => {
2628                        let o = mr.end_offset();
2629                        haystack.seek(SeekFrom::Start(o))?;
2630                    }
2631                    Test::Search(t) => {
2632                        if t.re_mods.contains(ReMod::StartOffsetUpdate) {
2633                            let o = mr.start_offset();
2634                            haystack.seek(SeekFrom::Start(o))?;
2635                        } else {
2636                            let o = mr.end_offset();
2637                            haystack.seek(SeekFrom::Start(o))?;
2638                        }
2639                    }
2640
2641                    Test::Regex(t) => {
2642                        if t.mods.contains(ReMod::StartOffsetUpdate) {
2643                            let o = mr.start_offset();
2644                            haystack.seek(SeekFrom::Start(o))?;
2645                        } else {
2646                            let o = mr.end_offset();
2647                            haystack.seek(SeekFrom::Start(o))?;
2648                        }
2649                    }
2650                    // other types do not need offset adjustement
2651                    _ => {}
2652                }
2653            }
2654
2655            if let Some(mimetype) = self.mimetype.as_ref() {
2656                magic.set_mime_type(Cow::Borrowed(mimetype));
2657            }
2658
2659            if let Some(apple_ty) = self.apple.as_ref() {
2660                magic.set_creator_code(Cow::Borrowed(apple_ty));
2661            }
2662
2663            if !self.exts.is_empty() {
2664                magic.insert_extensions(self.exts.iter().map(|s| s.as_str()));
2665            }
2666
2667            // NOTE: here we try to implement a similar logic as in file_magic_strength.
2668            // Sticking to the exact same strength computation logic is complicated due
2669            // to implementation differences. Let's wait and see if that is a real issue.
2670            let mut strength = self.entry.test_strength;
2671
2672            let continuation_level = self.entry.continuation_level().0 as u64;
2673            if self.entry.message.is_none() && continuation_level < 3 {
2674                strength = strength.saturating_add(continuation_level);
2675            }
2676
2677            if let Some(sm) = self.strength_mod.as_ref() {
2678                strength = sm.apply(strength);
2679            }
2680
2681            // entries with no message get a bonus
2682            if self.entry.message.is_none() {
2683                strength += 1
2684            }
2685
2686            magic.update_strength(strength);
2687
2688            let end_upper_level = haystack.lazy_stream_position();
2689
2690            // we have to fix rule_base_offset if
2691            // the rule_base_starts from end otherwise it
2692            // breaks some offset computation in match
2693            // see test_offset_bug_1 and test_offset_bug_2
2694            // they implement the same test logic yet indirect
2695            // offsets have to be different so that it works
2696            // in libmagic/file
2697            let rule_base_offset = if self.root {
2698                match self.entry.offset {
2699                    Offset::Direct(DirOffset::End(o)) => {
2700                        Some(haystack.offset_from_start(SeekFrom::End(o)))
2701                    }
2702                    _ => rule_base_offset,
2703                }
2704            } else {
2705                rule_base_offset
2706            };
2707
2708            for e in self.children.iter() {
2709                nmatch = nmatch.saturating_add(e.matches(
2710                    opt_source,
2711                    magic,
2712                    state,
2713                    stream_kind,
2714                    buf_base_offset,
2715                    rule_base_offset,
2716                    Some(end_upper_level),
2717                    haystack,
2718                    db,
2719                    switch_endianness,
2720                    depth,
2721                )?);
2722            }
2723        }
2724
2725        Ok(nmatch)
2726    }
2727}
2728
/// Represents a parsed magic rule
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicRule {
    /// identifier assigned through `set_id`; `MagicDb` uses it to track
    /// which rules were already evaluated
    id: usize,
    /// source of the rule, if any (exposed by [`MagicRule::source`])
    source: Option<String>,
    /// root of the entry tree evaluated when the rule is run
    entries: EntryNode,
    /// extensions collected from the entries when the rule is finalized
    extensions: HashSet<String>,
    /// score used for rule ranking
    score: u64,
    /// set by `try_finalize` on success so that finalization runs only once
    finalized: bool,
}
2740
2741impl MagicRule {
2742    #[inline(always)]
2743    fn set_id(&mut self, id: usize) {
2744        self.id = id
2745    }
2746
2747    fn visit_all_entries(
2748        &self,
2749        deps: &HashMap<String, DependencyRule>,
2750        marked: &mut HashSet<String>,
2751    ) -> Result<EntryNodeVisitor, Error> {
2752        let mut v = EntryNodeVisitor::new();
2753        self.entries.visit(&mut v, deps, marked, 0)?;
2754        Ok(v)
2755    }
2756
2757    /// Finalize a rule by searching for all extensions and computing its score
2758    /// for ranking. In the `MagicRule` is already finalized it returns immediately.
2759    fn try_finalize(&mut self, deps: &HashMap<String, DependencyRule>) -> Result<(), Error> {
2760        if self.finalized {
2761            return Ok(());
2762        }
2763
2764        // rule can be finalized all deps are found
2765        let v = self.visit_all_entries(deps, &mut HashSet::new())?;
2766
2767        self.extensions.extend(v.exts);
2768        self.score = v.score;
2769        self.finalized = true;
2770
2771        Ok(())
2772    }
2773
2774    #[inline]
2775    fn magic_entrypoint<'r, R: Read + Seek>(
2776        &'r self,
2777        magic: &mut Magic<'r>,
2778        stream_kind: StreamKind,
2779        haystack: &mut LazyCache<R>,
2780        db: &'r MagicDb,
2781        switch_endianness: bool,
2782        depth: usize,
2783    ) -> Result<u64, Error> {
2784        self.entries.matches(
2785            self.source.as_deref(),
2786            magic,
2787            &mut MatchState::empty(),
2788            stream_kind,
2789            None,
2790            None,
2791            None,
2792            haystack,
2793            db,
2794            switch_endianness,
2795            depth,
2796        )
2797    }
2798
2799    /// Executes the magic matching logic and returns the count of matches that produce messages.
2800    /// Matches that don't result in message appends are not counted, consistent with libmagic's behavior.
2801    #[inline]
2802    #[allow(clippy::too_many_arguments)]
2803    fn magic<'r, R: Read + Seek>(
2804        &'r self,
2805        magic: &mut Magic<'r>,
2806        stream_kind: StreamKind,
2807        buf_base_offset: Option<u64>,
2808        rule_base_offset: Option<u64>,
2809        haystack: &mut LazyCache<R>,
2810        db: &'r MagicDb,
2811        switch_endianness: bool,
2812        depth: usize,
2813    ) -> Result<u64, Error> {
2814        self.entries.matches(
2815            self.source.as_deref(),
2816            magic,
2817            &mut MatchState::empty(),
2818            stream_kind,
2819            buf_base_offset,
2820            rule_base_offset,
2821            None,
2822            haystack,
2823            db,
2824            switch_endianness,
2825            depth,
2826        )
2827    }
2828
2829    /// Checks if the rule is for matching against text content
2830    ///
2831    /// # Returns
2832    ///
2833    /// * `bool` - True if the rule is for text files
2834    pub fn is_text(&self) -> bool {
2835        self.entries.entry.test.is_text()
2836            && self.entries.children.iter().all(|e| e.entry.test.is_text())
2837    }
2838
2839    /// Gets the rule's score used for ranking rules between them
2840    ///
2841    /// # Returns
2842    ///
2843    /// * `u64` - The rule's score
2844    #[inline(always)]
2845    pub fn score(&self) -> u64 {
2846        self.score
2847    }
2848
2849    /// Gets the rule's filename if any
2850    ///
2851    /// # Returns
2852    ///
2853    /// * `Option<&str>` - The rule's source if available
2854    #[inline(always)]
2855    pub fn source(&self) -> Option<&str> {
2856        self.source.as_deref()
2857    }
2858
2859    /// Gets the line number at which the rule is defined
2860    ///
2861    /// # Returns
2862    ///
2863    /// * `usize` - The rule's line number
2864    #[inline(always)]
2865    pub fn line(&self) -> usize {
2866        self.entries.entry.line
2867    }
2868
2869    /// Gets all the file extensions associated to the rule
2870    ///
2871    /// # Returns
2872    ///
2873    /// * `&HashSet<String>` - The set of all associated extensions
2874    #[inline(always)]
2875    pub fn extensions(&self) -> &HashSet<String> {
2876        &self.extensions
2877    }
2878}
2879
/// A named [`MagicRule`] kept in the `dependencies` maps of
/// [`MagicSource`] and [`MagicDb`].
#[derive(Debug, Clone, Serialize, Deserialize)]
struct DependencyRule {
    // presumably the name other rules use to refer to this one — TODO confirm
    name: String,
    // the rule backing this dependency
    rule: MagicRule,
}
2885
/// A parsed source of magic rules
///
/// # Methods
///
/// * `open` - Opens a magic file from a path
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicSource {
    // rules parsed from the source; moved into the database by `MagicDb::load`
    rules: Vec<MagicRule>,
    // dependency rules parsed from the source; merged into the database's
    // own map by `MagicDb::load`
    dependencies: HashMap<String, DependencyRule>,
}

impl MagicSource {
    /// Opens and parses a magic file from a path
    ///
    /// # Arguments
    ///
    /// * `p` - The path to the magic file
    ///
    /// # Returns
    ///
    /// * `Result<Self, Error>` - The parsed magic file or an error
    ///
    /// # Errors
    ///
    /// Forwards whatever error `FileMagicParser::parse_file` returns.
    pub fn open<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
        // all the work is delegated to the file parser
        FileMagicParser::parse_file(p)
    }
}
2911
/// Newtype around the `u8` continuation level of an entry. Its value is
/// used as an index into the flags held by `MatchState`.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct ContinuationLevel(u8);
2914
// FIXME: magic handles many more text encodings
/// Text encodings that can be attached to a text stream.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum TextEncoding {
    Ascii,
    Utf8,
    Unknown,
}

impl TextEncoding {
    /// Name of the encoding as written in magic messages.
    const fn as_magic_str(&self) -> &'static str {
        match *self {
            Self::Ascii => "ASCII",
            Self::Utf8 => "UTF-8",
            Self::Unknown => "Unknown",
        }
    }
}
2932
2933#[derive(Debug, PartialEq, Eq, Clone, Copy)]
2934enum StreamKind {
2935    Binary,
2936    Text(TextEncoding),
2937}
2938
2939impl StreamKind {
2940    const fn is_text(&self) -> bool {
2941        matches!(self, StreamKind::Text(_))
2942    }
2943}
2944
2945#[derive(Debug)]
2946struct MatchState {
2947    continuation_levels: [bool; 256],
2948}
2949
2950impl MatchState {
2951    #[inline(always)]
2952    fn empty() -> Self {
2953        MatchState {
2954            continuation_levels: [false; 256],
2955        }
2956    }
2957
2958    #[inline(always)]
2959    fn get_continuation_level(&mut self, level: &ContinuationLevel) -> bool {
2960        self.continuation_levels
2961            .get(level.0 as usize)
2962            .cloned()
2963            .unwrap_or_default()
2964    }
2965
2966    #[inline(always)]
2967    fn set_continuation_level(&mut self, level: ContinuationLevel) {
2968        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2969            *b = true
2970        }
2971    }
2972
2973    #[inline(always)]
2974    fn clear_continuation_level(&mut self, level: &ContinuationLevel) {
2975        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2976            *b = false;
2977        }
2978    }
2979}
2980
/// Represents a file magic detection result
#[derive(Debug, Default)]
pub struct Magic<'m> {
    // kind of the analyzed stream; picks the fallback MIME type in `mime_type`
    stream_kind: Option<StreamKind>,
    // where the matching rule was defined (see `source`)
    source: Option<Cow<'m, str>>,
    // message parts in push order; `message` joins them
    message: Vec<Cow<'m, str>>,
    // first MIME type set wins (see `set_mime_type`)
    mime_type: Option<Cow<'m, str>>,
    // Apple creator code; first value set wins (see `set_creator_code`)
    creator_code: Option<Cow<'m, str>>,
    // accumulated with saturating additions by `update_strength`
    strength: u64,
    // only filled while still empty (see `insert_extensions`)
    exts: HashSet<Cow<'m, str>>,
    // raised by `MagicDb::magic_default` for fallback detections
    is_default: bool,
}
2993
2994impl<'m> Magic<'m> {
2995    #[inline(always)]
2996    fn set_source(&mut self, source: Option<&'m str>) {
2997        self.source = source.map(Cow::Borrowed);
2998    }
2999
3000    #[inline(always)]
3001    fn set_stream_kind(&mut self, stream_kind: StreamKind) {
3002        self.stream_kind = Some(stream_kind)
3003    }
3004
3005    #[inline(always)]
3006    fn reset(&mut self) {
3007        self.stream_kind = None;
3008        self.source = None;
3009        self.message.clear();
3010        self.mime_type = None;
3011        self.creator_code = None;
3012        self.strength = 0;
3013        self.exts.clear();
3014        self.is_default = false;
3015    }
3016
3017    /// Converts borrowed data into owned data. This method involves
3018    /// data cloning, so you must use this method only if you need to
3019    /// extend the lifetime of a [`Magic`] struct.
3020    ///
3021    /// # Returns
3022    ///
3023    /// * `Magic<'owned>` - A new [`Magic`] with owned data
3024    #[inline]
3025    pub fn into_owned<'owned>(self) -> Magic<'owned> {
3026        Magic {
3027            stream_kind: self.stream_kind,
3028            source: self.source.map(|s| Cow::Owned(s.into_owned())),
3029            message: self
3030                .message
3031                .into_iter()
3032                .map(Cow::into_owned)
3033                .map(Cow::Owned)
3034                .collect(),
3035            mime_type: self.mime_type.map(|m| Cow::Owned(m.into_owned())),
3036            creator_code: self.creator_code.map(|m| Cow::Owned(m.into_owned())),
3037            strength: self.strength,
3038            exts: self
3039                .exts
3040                .into_iter()
3041                .map(|e| Cow::Owned(e.into_owned()))
3042                .collect(),
3043            is_default: self.is_default,
3044        }
3045    }
3046
3047    /// Gets the formatted message describing the file type
3048    ///
3049    /// # Returns
3050    ///
3051    /// * `String` - The formatted message
3052    #[inline(always)]
3053    pub fn message(&self) -> String {
3054        let mut out = String::new();
3055        for (i, m) in self.message.iter().enumerate() {
3056            if let Some(s) = m.strip_prefix(r#"\b"#) {
3057                out.push_str(s);
3058            } else {
3059                // don't put space on first string
3060                if i > 0 {
3061                    out.push(' ');
3062                }
3063                out.push_str(m);
3064            }
3065        }
3066        out
3067    }
3068
3069    /// Returns an iterator over the individual parts of the magic message
3070    ///
3071    /// A magic message is typically composed of multiple parts, each appended
3072    /// during successful magic tests. This method provides an efficient way to
3073    /// iterate over these parts without concatenating them into a new string,
3074    /// as done when calling [`Magic::message`].
3075    ///
3076    /// # Returns
3077    ///
3078    /// * `impl Iterator<Item = &str>` - An iterator yielding string slices of each message part
3079    #[inline]
3080    pub fn message_parts(&self) -> impl Iterator<Item = &str> {
3081        self.message.iter().map(|p| p.as_ref())
3082    }
3083
3084    #[inline(always)]
3085    fn update_strength(&mut self, value: u64) {
3086        self.strength = self.strength.saturating_add(value);
3087        debug!("updated strength = {:?}", self.strength)
3088    }
3089
3090    /// Gets the detected MIME type
3091    ///
3092    /// # Returns
3093    ///
3094    /// * `&str` - The MIME type or default based on stream kind
3095    #[inline(always)]
3096    pub fn mime_type(&self) -> &str {
3097        self.mime_type.as_deref().unwrap_or(match self.stream_kind {
3098            Some(StreamKind::Text(_)) => DEFAULT_TEXT_MIMETYPE,
3099            Some(StreamKind::Binary) | None => DEFAULT_BIN_MIMETYPE,
3100        })
3101    }
3102
3103    #[inline(always)]
3104    fn push_message<'a: 'm>(&mut self, msg: Cow<'a, str>) {
3105        if !msg.is_empty() {
3106            debug!("pushing message: msg={msg} len={}", msg.len());
3107            self.message.push(msg);
3108        }
3109    }
3110
3111    #[inline(always)]
3112    fn set_mime_type<'a: 'm>(&mut self, mime: Cow<'a, str>) {
3113        if self.mime_type.is_none() {
3114            debug!("insert mime: {:?}", mime);
3115            self.mime_type = Some(mime)
3116        }
3117    }
3118
3119    #[inline(always)]
3120    fn set_creator_code<'a: 'm>(&mut self, apple_ty: Cow<'a, str>) {
3121        if self.creator_code.is_none() {
3122            debug!("insert apple type: {apple_ty:?}");
3123            self.creator_code = Some(apple_ty)
3124        }
3125    }
3126
3127    #[inline(always)]
3128    fn insert_extensions<'a: 'm, I: Iterator<Item = &'a str>>(&mut self, exts: I) {
3129        if self.exts.is_empty() {
3130            self.exts.extend(exts.filter_map(|e| {
3131                if e.is_empty() {
3132                    None
3133                } else {
3134                    Some(Cow::Borrowed(e))
3135                }
3136            }));
3137        }
3138    }
3139
3140    /// Gets the confidence score of the detection. This
3141    /// value is used to sort [`Magic`] in [`MagicDb::best_magic`]
3142    /// and [`MagicDb::all_magics`].
3143    ///
3144    /// # Returns
3145    ///
3146    /// * `u64` - The confidence score attributed to that [`Magic`]
3147    #[inline(always)]
3148    pub fn strength(&self) -> u64 {
3149        self.strength
3150    }
3151
3152    /// Gets the filename where the magic rule was defined
3153    ///
3154    /// # Returns
3155    ///
3156    /// * `Option<&str>` - The source if available
3157    #[inline(always)]
3158    pub fn source(&self) -> Option<&str> {
3159        self.source.as_deref()
3160    }
3161
3162    /// Gets the Apple creator code if available
3163    ///
3164    /// # Returns
3165    ///
3166    /// * `Option<&str>` - The creator code if available
3167    #[inline(always)]
3168    pub fn creator_code(&self) -> Option<&str> {
3169        self.creator_code.as_deref()
3170    }
3171
3172    /// Gets the possible file extensions for the detected [`Magic`]
3173    ///
3174    /// # Returns
3175    ///
3176    /// * `&HashSet<Cow<'m, str>>` - The set of possible extensions
3177    #[inline(always)]
3178    pub fn extensions(&self) -> &HashSet<Cow<'m, str>> {
3179        &self.exts
3180    }
3181
3182    /// Checks if this is a default fallback detection
3183    ///
3184    /// # Returns
3185    ///
3186    /// * `bool` - True if this is a default detection
3187    #[inline(always)]
3188    pub fn is_default(&self) -> bool {
3189        self.is_default
3190    }
3191}
3192
/// Represents a database of [`MagicRule`]
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct MagicDb {
    // next identifier handed out by `next_rule_id`
    rule_id: usize,
    // rules loaded so far (exposed by `rules`)
    rules: Vec<MagicRule>,
    // dependency rules merged from every loaded `MagicSource`
    dependencies: HashMap<String, DependencyRule>,
    // NOTE(review): presumably bookkeeping for the finalization step (e.g. how
    // many rules are already finalized) — not used in the nearby code, confirm
    finalized: usize,
}
3201
/// Returns `true` if the byte stream is likely text.
///
/// The heuristic is:
/// * an empty stream is not text;
/// * a single NUL byte makes the stream binary;
/// * otherwise more than 85% of the bytes must be printable ASCII
///   (`0x20..=0x7E`) or common whitespace (tab, line feed, carriage return),
///   and less than 20% may be anything else (ASCII control bytes, `0x7F`
///   and bytes above it).
#[inline(always)]
fn is_likely_text(bytes: &[u8]) -> bool {
    if bytes.is_empty() {
        return false;
    }

    let mut printable = 0usize;
    // everything that is neither NUL nor printable: control bytes and
    // non-ASCII bytes alike
    let mut other = 0usize;

    for &byte in bytes {
        match byte {
            0x00 => return false,
            // common whitespace and printable ASCII
            0x09 | 0x0A | 0x0D | 0x20..=0x7E => printable += 1,
            _ => other += 1,
        }
    }

    let total = bytes.len() as f64;
    let printable_ratio = printable as f64 / total;
    let other_ratio = other as f64 / total;

    // Heuristic thresholds (adjust as needed). Every byte counted here is
    // either printable or other, so with the current values the second
    // condition follows from the first one; it is kept so that both
    // thresholds can be tuned independently.
    printable_ratio > 0.85 && other_ratio < 0.20
}
3244
3245#[inline(always)]
3246fn guess_stream_kind<S: AsRef<[u8]>>(stream: S) -> StreamKind {
3247    let buf = stream.as_ref();
3248
3249    match run_utf8_validation(buf) {
3250        Ok(is_ascii) => {
3251            if is_ascii {
3252                StreamKind::Text(TextEncoding::Ascii)
3253            } else {
3254                StreamKind::Text(TextEncoding::Utf8)
3255            }
3256        }
3257        Err(e) => {
3258            if is_likely_text(&buf[e.valid_up_to..]) {
3259                StreamKind::Text(TextEncoding::Unknown)
3260            } else {
3261                StreamKind::Binary
3262            }
3263        }
3264    }
3265}
3266
3267impl MagicDb {
3268    /// Prepares an [`LazyCache`] configured with optimal parameters for
3269    /// **read** operations done during file identification
3270    pub fn optimal_lazy_cache<R: Read + Seek>(f: R) -> Result<LazyCache<R>, io::Error> {
3271        Ok(LazyCache::<R>::from_read_seek(f)
3272            .and_then(|lc| lc.with_hot_cache(2 * FILE_BYTES_MAX))?)
3273        .map(|lc| lc.with_warm_cache(100 << 20))
3274    }
3275
3276    /// Creates a new empty database
3277    ///
3278    /// # Returns
3279    ///
3280    /// * [`MagicDb`] - A new empty database
3281    pub fn new() -> Self {
3282        Self::default()
3283    }
3284
3285    #[inline(always)]
3286    fn next_rule_id(&mut self) -> usize {
3287        let t = self.rule_id;
3288        self.rule_id += 1;
3289        t
3290    }
3291
3292    #[inline(always)]
3293    fn try_json<R: Read + Seek>(
3294        haystack: &mut LazyCache<R>,
3295        stream_kind: StreamKind,
3296        magic: &mut Magic,
3297    ) -> Result<bool, Error> {
3298        // cannot be json if content is binary
3299        if matches!(stream_kind, StreamKind::Binary) {
3300            return Ok(false);
3301        }
3302
3303        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?.trim_ascii();
3304
3305        let Some((start, end)) = find_json_boundaries(buf) else {
3306            return Ok(false);
3307        };
3308
3309        // if anything else than whitespace before start
3310        // this is not json
3311        for c in buf[0..start].iter() {
3312            if !c.is_ascii_whitespace() {
3313                return Ok(false);
3314            }
3315        }
3316
3317        let mut is_ndjson = false;
3318
3319        trace!("maybe a json document");
3320        let ok = serde_json::from_slice::<serde_json::Value>(&buf[start..=end]).is_ok();
3321        if !ok {
3322            return Ok(false);
3323        }
3324
3325        // we are sure it is json now we must look if we are ndjson
3326        if end + 1 < buf.len() {
3327            // after first json
3328            let buf = &buf[end + 1..];
3329            if let Some((second_start, second_end)) = find_json_boundaries(buf) {
3330                // there is a new line between the two json docs
3331                if memchr(b'\n', &buf[..second_start]).is_some() {
3332                    trace!("might be ndjson");
3333                    is_ndjson = serde_json::from_slice::<serde_json::Value>(
3334                        &buf[second_start..=second_end],
3335                    )
3336                    .is_ok();
3337                }
3338            }
3339        }
3340
3341        if is_ndjson {
3342            magic.push_message(Cow::Borrowed("New Line Delimited"));
3343            magic.set_mime_type(Cow::Borrowed("application/x-ndjson"));
3344            magic.insert_extensions(["ndjson", "jsonl"].into_iter());
3345        } else {
3346            magic.set_mime_type(Cow::Borrowed("application/json"));
3347            magic.insert_extensions(["json"].into_iter());
3348        }
3349
3350        magic.push_message(Cow::Borrowed("JSON text data"));
3351        magic.set_source(Some(HARDCODED_SOURCE));
3352        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3353        Ok(true)
3354    }
3355
3356    #[inline(always)]
3357    fn try_csv<R: Read + Seek>(
3358        haystack: &mut LazyCache<R>,
3359        stream_kind: StreamKind,
3360        magic: &mut Magic,
3361    ) -> Result<bool, Error> {
3362        // cannot be csv if content is binary
3363        let StreamKind::Text(enc) = stream_kind else {
3364            return Ok(false);
3365        };
3366
3367        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3368        let mut reader = csv::Reader::from_reader(io::Cursor::new(buf));
3369        let mut records = reader.records();
3370
3371        let Some(Ok(first)) = records.next() else {
3372            return Ok(false);
3373        };
3374
3375        // very not likely a CSV otherwise all programming
3376        // languages having ; line terminator would be
3377        // considered as CSV
3378        if first.len() <= 1 {
3379            return Ok(false);
3380        }
3381
3382        // we already parsed first line
3383        let mut n = 1;
3384        for i in records.take(9) {
3385            if let Ok(rec) = i {
3386                if first.len() != rec.len() {
3387                    return Ok(false);
3388                }
3389            } else {
3390                return Ok(false);
3391            }
3392            n += 1;
3393        }
3394
3395        // we need at least 10 lines
3396        if n != 10 {
3397            return Ok(false);
3398        }
3399
3400        magic.set_mime_type(Cow::Borrowed("text/csv"));
3401        magic.push_message(Cow::Borrowed("CSV"));
3402        magic.push_message(Cow::Borrowed(enc.as_magic_str()));
3403        magic.push_message(Cow::Borrowed("text"));
3404        magic.insert_extensions(["csv"].into_iter());
3405        magic.set_source(Some(HARDCODED_SOURCE));
3406        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3407        Ok(true)
3408    }
3409
3410    #[inline(always)]
3411    fn try_tar<R: Read + Seek>(
3412        haystack: &mut LazyCache<R>,
3413        stream_kind: StreamKind,
3414        magic: &mut Magic,
3415    ) -> Result<bool, Error> {
3416        // cannot be json if content is not binary
3417        if !matches!(stream_kind, StreamKind::Binary) {
3418            return Ok(false);
3419        }
3420
3421        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3422        let mut ar = Archive::new(io::Cursor::new(buf));
3423
3424        let Ok(mut entries) = ar.entries() else {
3425            return Ok(false);
3426        };
3427
3428        let Some(Ok(first)) = entries.next() else {
3429            return Ok(false);
3430        };
3431
3432        let header = first.header();
3433
3434        if header.as_ustar().is_some() {
3435            magic.push_message(Cow::Borrowed("POSIX tar archive"));
3436        } else if header.as_gnu().is_some() {
3437            magic.push_message(Cow::Borrowed("POSIX tar archive (GNU)"));
3438        } else {
3439            magic.push_message(Cow::Borrowed("tar archive"));
3440        }
3441
3442        magic.set_mime_type(Cow::Borrowed("application/x-tar"));
3443        magic.set_source(Some(HARDCODED_SOURCE));
3444        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3445        magic.insert_extensions(["tar"].into_iter());
3446        Ok(true)
3447    }
3448
3449    #[inline(always)]
3450    fn try_hard_magic<R: Read + Seek>(
3451        haystack: &mut LazyCache<R>,
3452        stream_kind: StreamKind,
3453        magic: &mut Magic,
3454    ) -> Result<bool, Error> {
3455        Ok(Self::try_json(haystack, stream_kind, magic)?
3456            || Self::try_csv(haystack, stream_kind, magic)?
3457            || Self::try_tar(haystack, stream_kind, magic)?)
3458    }
3459
3460    #[inline(always)]
3461    fn magic_default<'m, R: Read + Seek>(
3462        cache: &mut LazyCache<R>,
3463        stream_kind: StreamKind,
3464        magic: &mut Magic<'m>,
3465    ) {
3466        magic.set_source(Some(HARDCODED_SOURCE));
3467        magic.set_stream_kind(stream_kind);
3468        magic.is_default = true;
3469
3470        if cache.data_size() == 0 {
3471            magic.push_message(Cow::Borrowed("empty"));
3472            magic.set_mime_type(Cow::Borrowed(DEFAULT_BIN_MIMETYPE));
3473        }
3474
3475        match stream_kind {
3476            StreamKind::Binary => {
3477                magic.push_message(Cow::Borrowed("data"));
3478            }
3479            StreamKind::Text(e) => {
3480                magic.push_message(Cow::Borrowed(e.as_magic_str()));
3481                magic.push_message(Cow::Borrowed("text"));
3482            }
3483        }
3484    }
3485
3486    fn load_rules_no_prepare(&mut self, rules: Vec<MagicRule>) {
3487        for rule in rules.into_iter() {
3488            let mut rule = rule;
3489            rule.set_id(self.next_rule_id());
3490
3491            self.rules.push(rule);
3492        }
3493    }
3494
3495    /// Loads rules from a [`MagicSource`]
3496    ///
3497    /// # Arguments
3498    ///
3499    /// * `ms` - The [`MagicSource`] to load rules from
3500    pub fn load(&mut self, ms: MagicSource) -> &mut Self {
3501        self.load_rules_no_prepare(ms.rules);
3502        self.dependencies.extend(ms.dependencies);
3503        self.try_finalize();
3504        self
3505    }
3506
3507    /// Loads multiple [`MagicSource`] items efficiently in bulk.
3508    ///
3509    /// This is more efficient than loading each individually. After processing
3510    /// all sources, it applies finalization step only once.
3511    pub fn load_bulk<I: Iterator<Item = MagicSource>>(&mut self, it: I) -> &mut Self {
3512        for ms in it {
3513            self.load_rules_no_prepare(ms.rules);
3514            self.dependencies.extend(ms.dependencies);
3515        }
3516        self.try_finalize();
3517        self
3518    }
3519
3520    /// Gets all rules in the database
3521    ///
3522    /// # Returns
3523    ///
3524    /// * `&[MagicRule]` - A slice of all rules
3525    pub fn rules(&self) -> &[MagicRule] {
3526        &self.rules
3527    }
3528
3529    #[inline]
3530    fn first_magic_with_stream_kind<R: Read + Seek>(
3531        &self,
3532        haystack: &mut LazyCache<R>,
3533        stream_kind: StreamKind,
3534        extension: Option<&str>,
3535    ) -> Result<Magic<'_>, Error> {
3536        // re-using magic makes this function faster
3537        let mut magic = Magic::default();
3538
3539        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3540            return Ok(magic);
3541        }
3542
3543        let mut marked = vec![false; self.rules.len()];
3544
3545        macro_rules! do_magic {
3546            ($rule: expr) => {{
3547                $rule.magic_entrypoint(&mut magic, stream_kind, haystack, &self, false, 0)?;
3548
3549                if !magic.message.is_empty() {
3550                    magic.set_stream_kind(stream_kind);
3551                    magic.set_source($rule.source.as_deref());
3552                    return Ok(magic);
3553                }
3554
3555                magic.reset();
3556            }};
3557        }
3558
3559        if let Some(ext) = extension.map(|e| e.to_lowercase())
3560            && !ext.is_empty()
3561        {
3562            for rule in self.rules.iter().filter(|r| r.extensions.contains(&ext)) {
3563                do_magic!(rule);
3564                if let Some(f) = marked.get_mut(rule.id) {
3565                    *f = true
3566                }
3567            }
3568        }
3569
3570        for rule in self
3571            .rules
3572            .iter()
3573            // we don't run again rules run by extension
3574            .filter(|r| !*marked.get(r.id).unwrap_or(&false))
3575        {
3576            do_magic!(rule)
3577        }
3578
3579        Self::magic_default(haystack, stream_kind, &mut magic);
3580
3581        Ok(magic)
3582    }
3583
3584    /// Detects file [`Magic`] stopping at the first matching magic. Magic
3585    /// rules are evaluated from the best to the least relevant, so this method
3586    /// returns most of the time the best magic. For the rare cases where
3587    /// it doesn't or if the best result is always required, use [`MagicDb::best_magic`]
3588    ///
3589    /// # Arguments
3590    ///
3591    /// * `r` - A readable and seekable input
3592    /// * `extension` - Optional file extension to use for acceleration
3593    ///
3594    /// # Returns
3595    ///
3596    /// * `Result<Magic<'_>, Error>` - The detection result or an error
3597    ///
3598    /// # Warning
3599    ///
3600    /// File extension acceleration is made to evaluate rules faster by testing
3601    /// first the rules defining this extension with an `!:ext` entry.
3602    /// Whether you use `extension` acceleration or not with this function should not
3603    /// produce different results. Yet this makes the assumption rules are written
3604    /// correctly and every rule concerned defines `!:ext` when it is appropriate.
3605    /// If some rules are missing it, results might differ.
3606    pub fn first_magic<R: Read + Seek>(
3607        &self,
3608        r: &mut R,
3609        extension: Option<&str>,
3610    ) -> Result<Magic<'_>, Error> {
3611        let mut cache = Self::optimal_lazy_cache(r)?;
3612        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3613        self.first_magic_with_stream_kind(&mut cache, stream_kind, extension)
3614    }
3615
3616    /// An alternative to [`Self::first_magic`] using a [`LazyCache`]
3617    /// to detects file [`Magic`] stopping at the first matching magic. Magic
3618    /// rules are evaluated from the best to the least relevant, so this method
3619    /// returns most of the time the best magic. For the rare cases where
3620    /// it doesn't or if the best result is always required, use [`MagicDb::best_magic`]
3621    ///
3622    /// # Arguments
3623    ///
3624    /// * `cache` - A [`LazyCache`] used for read operations
3625    /// * `extension` - Optional file extension to use for acceleration
3626    ///
3627    /// # Returns
3628    ///
3629    /// * `Result<Magic<'_>, Error>` - The detection result or an error
3630    ///
3631    /// # Notes
3632    ///
3633    /// * Use this method **only** if you need to re-use a [`LazyCache`] for future **read** operations.
3634    /// * Use [`Self::optimal_lazy_cache`] to prepare an optimal [`LazyCache`]
3635    ///
3636    /// # Warning
3637    ///
3638    /// File extension acceleration is made to evaluate rules faster by testing
3639    /// first the rules defining this extension with an `!:ext` entry.
3640    /// Whether you use `extension` acceleration or not with this function should not
3641    /// produce different results. Yet this makes the assumption rules are written
3642    /// correctly and every rule concerned defines `!:ext` when it is appropriate.
3643    /// If some rules are missing it, results might differ.
3644    pub fn first_magic_with_lazy_cache<R: Read + Seek>(
3645        &self,
3646        cache: &mut LazyCache<R>,
3647        extension: Option<&str>,
3648    ) -> Result<Magic<'_>, Error> {
3649        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3650        self.first_magic_with_stream_kind(cache, stream_kind, extension)
3651    }
3652
3653    #[inline(always)]
3654    fn all_magics_sort_with_stream_kind<R: Read + Seek>(
3655        &self,
3656        haystack: &mut LazyCache<R>,
3657        stream_kind: StreamKind,
3658    ) -> Result<Vec<Magic<'_>>, Error> {
3659        let mut out = Vec::new();
3660
3661        let mut magic = Magic::default();
3662
3663        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3664            out.push(magic);
3665            magic = Magic::default();
3666        }
3667
3668        for rule in self.rules.iter() {
3669            rule.magic_entrypoint(&mut magic, stream_kind, haystack, self, false, 0)?;
3670
3671            // it is possible we have a strength with no message
3672            if !magic.message.is_empty() {
3673                magic.set_stream_kind(stream_kind);
3674                magic.set_source(rule.source.as_deref());
3675                out.push(magic);
3676                magic = Magic::default();
3677            }
3678
3679            magic.reset();
3680        }
3681
3682        Self::magic_default(haystack, stream_kind, &mut magic);
3683        out.push(magic);
3684
3685        out.sort_by_key(|b| std::cmp::Reverse(b.strength()));
3686
3687        Ok(out)
3688    }
3689
3690    /// Detects all [`Magic`] matching a given content.
3691    ///
3692    /// # Arguments
3693    ///
3694    /// * `r` - A readable and seekable input
3695    ///
3696    /// # Returns
3697    ///
3698    /// * `Result<Vec<Magic<'_>>, Error>` - All detection results sorted by strength or an error
3699    pub fn all_magics<R: Read + Seek>(&self, r: &mut R) -> Result<Vec<Magic<'_>>, Error> {
3700        let mut cache = Self::optimal_lazy_cache(r)?;
3701        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3702        self.all_magics_sort_with_stream_kind(&mut cache, stream_kind)
3703    }
3704
3705    /// An alternative to [`Self::all_magics`] using a [`LazyCache`]
3706    /// to detects all [`Magic`] matching a given content.
3707    ///
3708    /// # Arguments
3709    ///
3710    /// * `r` - A readable and seekable input
3711    ///
3712    /// # Returns
3713    ///
3714    /// * `Result<Vec<Magic<'_>>, Error>` - All detection results sorted by strength or an error
3715    ///
3716    /// # Notes
3717    ///
3718    /// * Use this method **only** if you need to re-use a [`LazyCache`] for future **read** operations.
3719    /// * Use [`Self::optimal_lazy_cache`] to prepare an optimal [`LazyCache`]
3720    pub fn all_magics_with_lazy_cache<R: Read + Seek>(
3721        &self,
3722        cache: &mut LazyCache<R>,
3723    ) -> Result<Vec<Magic<'_>>, Error> {
3724        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3725        self.all_magics_sort_with_stream_kind(cache, stream_kind)
3726    }
3727
3728    #[inline(always)]
3729    fn best_magic_with_stream_kind<R: Read + Seek>(
3730        &self,
3731        haystack: &mut LazyCache<R>,
3732        stream_kind: StreamKind,
3733    ) -> Result<Magic<'_>, Error> {
3734        let magics = self.all_magics_sort_with_stream_kind(haystack, stream_kind)?;
3735
3736        // magics is guaranteed to contain at least the
3737        // default magic but we unwrap to avoid any panic
3738        Ok(magics.into_iter().next().unwrap_or_else(|| {
3739            let mut magic = Magic::default();
3740            Self::magic_default(haystack, stream_kind, &mut magic);
3741            magic
3742        }))
3743    }
3744
3745    /// Detects the best [`Magic`] matching a given content.
3746    ///
3747    /// # Arguments
3748    ///
3749    /// * `r` - A readable and seekable input
3750    ///
3751    /// # Returns
3752    ///
3753    /// * `Result<Magic<'_>, Error>` - The best detection result or an error
3754    pub fn best_magic<R: Read + Seek>(&self, r: &mut R) -> Result<Magic<'_>, Error> {
3755        let mut cache = Self::optimal_lazy_cache(r)?;
3756        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3757        self.best_magic_with_stream_kind(&mut cache, stream_kind)
3758    }
3759
3760    /// An alternative to [`Self::best_magic`] using a [`LazyCache`]
3761    /// to detect the best [`Magic`] matching a given content.
3762    ///
3763    /// # Arguments
3764    ///
3765    /// * `r` - A readable and seekable input
3766    ///
3767    /// # Returns
3768    ///
3769    /// * `Result<Magic<'_>, Error>` - The best detection result or an error
3770    ///
3771    /// # Notes
3772    ///
3773    /// * Use this method **only** if you need to re-use a [`LazyCache`] for future **read** operations.
3774    /// * Use [`Self::optimal_lazy_cache`] to prepare an optimal [`LazyCache`]
3775    pub fn best_magic_with_lazy_cache<R: Read + Seek>(
3776        &self,
3777        cache: &mut LazyCache<R>,
3778    ) -> Result<Magic<'_>, Error> {
3779        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3780        self.best_magic_with_stream_kind(cache, stream_kind)
3781    }
3782
3783    /// Serializes the database to a generic writer implementing [`io::Write`]
3784    ///
3785    /// # Returns
3786    ///
3787    /// * `Result<(), Error>` - The serialized database or an error
3788    pub fn serialize<W: Write>(self, w: &mut W) -> Result<(), Error> {
3789        let mut encoder = GzEncoder::new(w, Compression::best());
3790
3791        bincode::serde::encode_into_std_write(&self, &mut encoder, bincode::config::standard())?;
3792        encoder.finish()?;
3793        Ok(())
3794    }
3795
3796    /// Deserializes the database from a generic reader implementing [`io::Read`]
3797    ///
3798    /// # Arguments
3799    ///
3800    /// * `r` - The reader to deserialize from
3801    ///
3802    /// # Returns
3803    ///
3804    /// * `Result<Self, Error>` - The deserialized database or an error
3805    pub fn deserialize<R: Read>(r: &mut R) -> Result<Self, Error> {
3806        let mut buf = vec![];
3807        let mut gz = GzDecoder::new(r);
3808        gz.read_to_end(&mut buf).map_err(|e| {
3809            bincode::error::DecodeError::OtherString(format!("failed to read: {e}"))
3810        })?;
3811        let (sdb, _): (MagicDb, usize) =
3812            bincode::serde::decode_from_slice(&buf, bincode::config::standard())?;
3813        Ok(sdb)
3814    }
3815
3816    /// Verifies the consistency of the [`MagicDb`] database.
3817    /// This method must be called when the database is built once and used later.
3818    /// It catches [`enum@Error`] that would raise at rule evaluation time.
3819    ///
3820    /// # Errors
3821    /// Returns an error if any rule fails verification
3822    pub fn verify(&mut self) -> Result<(), Error> {
3823        if self.rules.len() == self.finalized {
3824            return Ok(());
3825        }
3826
3827        for r in self.rules.iter_mut().filter(|r| !r.finalized) {
3828            // return at the first rule failing verification
3829            r.try_finalize(&self.dependencies).map_err(|e| {
3830                Error::Verify(
3831                    r.source.clone().unwrap_or(String::from("unknown")),
3832                    r.line(),
3833                    e.into(),
3834                )
3835            })?;
3836            self.finalized += 1;
3837        }
3838
3839        debug_assert!(self.finalized <= self.rules.len());
3840
3841        Ok(())
3842    }
3843
3844    #[inline(always)]
3845    fn try_finalize(&mut self) {
3846        if self.rules.len() == self.finalized {
3847            return;
3848        }
3849
3850        let mut finalized = 0usize;
3851        self.rules.iter_mut().for_each(|r| {
3852            if r.try_finalize(&self.dependencies).is_ok() {
3853                finalized += 1;
3854            }
3855        });
3856
3857        self.finalized = finalized;
3858
3859        debug_assert!(self.finalized <= self.rules.len());
3860
3861        // put text rules at the end
3862        self.rules.sort_by_key(|r| (r.is_text(), -(r.score as i64)));
3863    }
3864}
3865
3866#[cfg(test)]
3867mod tests {
3868    use std::io::Cursor;
3869
3870    use regex::bytes::Regex;
3871
3872    use crate::utils::unix_local_time_to_string;
3873
3874    use super::*;
3875
3876    macro_rules! lazy_cache {
3877        ($l: literal) => {
3878            LazyCache::from_read_seek(Cursor::new($l)).unwrap()
3879        };
3880    }
3881
3882    fn first_magic(
3883        rule: &str,
3884        content: &[u8],
3885        stream_kind: StreamKind,
3886    ) -> Result<Magic<'static>, Error> {
3887        let mut md = MagicDb::new();
3888        md.load(
3889            FileMagicParser::parse_str(rule, None)
3890                .inspect_err(|e| eprintln!("{e}"))
3891                .unwrap(),
3892        );
3893        let mut reader = LazyCache::from_read_seek(Cursor::new(content)).unwrap();
3894        let v = md.best_magic_with_stream_kind(&mut reader, stream_kind)?;
3895        Ok(v.into_owned())
3896    }
3897
3898    /// helper macro to debug tests
3899    #[allow(unused_macros)]
3900    macro_rules! enable_trace {
3901        () => {
3902            tracing_subscriber::fmt()
3903                .with_max_level(tracing_subscriber::filter::LevelFilter::TRACE)
3904                .try_init();
3905        };
3906    }
3907
3908    macro_rules! parse_assert {
3909        ($rule:literal) => {
3910            FileMagicParser::parse_str($rule, None)
3911                .inspect_err(|e| eprintln!("{e}"))
3912                .unwrap()
3913        };
3914    }
3915
3916    macro_rules! assert_magic_match_bin {
3917        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Binary).unwrap() }};
3918        ($rule: literal, $content:literal, $message:expr) => {{
3919            assert_eq!(
3920                first_magic($rule, $content, StreamKind::Binary)
3921                    .unwrap()
3922                    .message(),
3923                $message
3924            );
3925        }};
3926    }
3927
3928    macro_rules! assert_magic_match_text {
3929        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8)).unwrap() }};
3930        ($rule: literal, $content:literal, $message:expr) => {{
3931            assert_eq!(
3932                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
3933                    .unwrap()
3934                    .message(),
3935                $message
3936            );
3937        }};
3938    }
3939
3940    macro_rules! assert_magic_not_match_text {
3941        ($rule: literal, $content:literal) => {{
3942            assert!(
3943                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
3944                    .unwrap()
3945                    .is_default()
3946            );
3947        }};
3948    }
3949
3950    macro_rules! assert_magic_not_match_bin {
3951        ($rule: literal, $content:literal) => {{
3952            assert!(
3953                first_magic($rule, $content, StreamKind::Binary)
3954                    .unwrap()
3955                    .is_default()
3956            );
3957        }};
3958    }
3959
3960    #[test]
3961    fn test_regex() {
3962        assert_magic_match_text!(
3963            r#"
39640	regex/1024 \^#![[:space:]]*/usr/bin/env[[:space:]]+
3965!:mime	text/x-shellscript
3966>&0  regex/64 .*($|\\b) %s shell script text executable
3967    "#,
3968            br#"#!/usr/bin/env bash
3969        echo hello world"#,
3970            // the magic generated
3971            "bash shell script text executable"
3972        );
3973
3974        let re = Regex::new(r"(?-u)\x42\x82").unwrap();
3975        assert!(re.is_match(b"\x42\x82"));
3976
3977        assert_magic_match_bin!(
3978            r#"0 regex \x42\x82 binary regex match"#,
3979            b"\x00\x00\x00\x00\x00\x00\x42\x82"
3980        );
3981
3982        // test regex continuation after match
3983        assert_magic_match_bin!(
3984            r#"
3985            0 regex \x42\x82
3986            >&0 string \xde\xad\xbe\xef it works
3987            "#,
3988            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
3989        );
3990
3991        assert_magic_match_bin!(
3992            r#"
3993            0 regex/s \x42\x82
3994            >&0 string \x42\x82\xde\xad\xbe\xef it works
3995            "#,
3996            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
3997        );
3998
3999        // ^ must match stat of line when matching text
4000        assert_magic_match_text!(
4001            r#"
40020	regex/1024 \^HelloWorld$ HelloWorld String"#,
4003            br#"
4004// this is a comment after an empty line
4005HelloWorld
4006            "#
4007        );
4008    }
4009
4010    #[test]
4011    fn test_string_with_mods() {
4012        assert_magic_match_text!(
4013            r#"0	string/w	#!\ \ \ /usr/bin/env\ bash	BASH
4014        "#,
4015            b"#! /usr/bin/env bash i
4016        echo hello world"
4017        );
4018
4019        // test uppercase insensitive
4020        assert_magic_match_text!(
4021            r#"0	string/C	HelloWorld	it works
4022        "#,
4023            b"helloworld"
4024        );
4025
4026        assert_magic_not_match_text!(
4027            r#"0	string/C	HelloWorld	it works
4028        "#,
4029            b"hELLOwORLD"
4030        );
4031
4032        // test lowercase insensitive
4033        assert_magic_match_text!(
4034            r#"0	string/c	HelloWorld	it works
4035        "#,
4036            b"HELLOWORLD"
4037        );
4038
4039        assert_magic_not_match_text!(
4040            r#"0	string/c	HelloWorld	it works
4041        "#,
4042            b"helloworld"
4043        );
4044
4045        // test full word match
4046        assert_magic_match_text!(
4047            r#"0	string/f	#!/usr/bin/env\ bash	BASH
4048        "#,
4049            b"#!/usr/bin/env bash"
4050        );
4051
4052        assert_magic_not_match_text!(
4053            r#"0	string/f	#!/usr/bin/python PYTHON"#,
4054            b"#!/usr/bin/pythonic"
4055        );
4056
4057        // testing whitespace compacting
4058        assert_magic_match_text!(
4059            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
4060            b"#!/usr/bin/env    python"
4061        );
4062
4063        assert_magic_not_match_text!(
4064            r#"0	string/W	#!/usr/bin/env\ \ python  PYTHON"#,
4065            b"#!/usr/bin/env python"
4066        );
4067    }
4068
4069    #[test]
4070    fn test_search_with_mods() {
4071        assert_magic_match_text!(
4072            r#"0	search/1/fwt	#!\ /usr/bin/luatex	LuaTex script text executable"#,
4073            b"#!          /usr/bin/luatex "
4074        );
4075
4076        // test matching from the beginning
4077        assert_magic_match_text!(
4078            r#"
4079            0	search/s	/usr/bin/env
4080            >&0 string /usr/bin/env it works
4081            "#,
4082            b"#!/usr/bin/env    python"
4083        );
4084
4085        assert_magic_not_match_text!(
4086            r#"
4087            0	search	/usr/bin/env
4088            >&0 string /usr/bin/env it works
4089            "#,
4090            b"#!/usr/bin/env    python"
4091        );
4092    }
4093
4094    #[test]
4095    fn test_pstring() {
4096        assert_magic_match_bin!(r#"0 pstring Toast it works"#, b"\x05Toast");
4097
4098        assert_magic_match_bin!(r#"0 pstring Toast %s"#, b"\x05Toast", "Toast");
4099
4100        assert_magic_not_match_bin!(r#"0 pstring Toast Doesn't work"#, b"\x07Toaster");
4101
4102        // testing with modifiers
4103        assert_magic_match_bin!(r#"0 pstring/H Toast it works"#, b"\x00\x05Toast");
4104
4105        assert_magic_match_bin!(r#"0 pstring/HJ Toast it works"#, b"\x00\x07Toast");
4106
4107        assert_magic_match_bin!(r#"0 pstring/HJ Toast %s"#, b"\x00\x07Toast", "Toast");
4108
4109        assert_magic_match_bin!(r#"0 pstring/h Toast it works"#, b"\x05\x00Toast");
4110
4111        assert_magic_match_bin!(r#"0 pstring/hJ Toast it works"#, b"\x07\x00Toast");
4112
4113        assert_magic_match_bin!(r#"0 pstring/L Toast it works"#, b"\x00\x00\x00\x05Toast");
4114
4115        assert_magic_match_bin!(r#"0 pstring/LJ Toast it works"#, b"\x00\x00\x00\x09Toast");
4116
4117        assert_magic_match_bin!(r#"0 pstring/l Toast it works"#, b"\x05\x00\x00\x00Toast");
4118
4119        assert_magic_match_bin!(r#"0 pstring/lJ Toast it works"#, b"\x09\x00\x00\x00Toast");
4120    }
4121
4122    #[test]
4123    fn test_max_recursion() {
4124        let res = first_magic(
4125            r#"0	indirect x"#,
4126            b"#!          /usr/bin/luatex ",
4127            StreamKind::Binary,
4128        );
4129        assert!(res.is_err());
4130        let _ = res.inspect_err(|e| {
4131            assert!(matches!(
4132                e.unwrap_localized(),
4133                Error::MaximumRecursion(MAX_RECURSION)
4134            ))
4135        });
4136    }
4137
4138    #[test]
4139    fn test_string_ops() {
4140        assert_magic_match_text!("0	string/b MZ MZ File", b"MZ\0");
4141        assert_magic_match_text!("0	string !MZ Not MZ File", b"AZ\0");
4142        assert_magic_match_text!("0	string >\0 Any String", b"A\0");
4143        assert_magic_match_text!("0	string >Test Any String", b"Test 1\0");
4144        assert_magic_match_text!("0	string <Test Any String", b"\0");
4145        assert_magic_not_match_text!("0	string >Test Any String", b"\0");
4146    }
4147
4148    #[test]
4149    fn test_lestring16() {
4150        assert_magic_match_bin!(
4151            "0 lestring16 abcd Little-endian UTF-16 string",
4152            b"\x61\x00\x62\x00\x63\x00\x64\x00"
4153        );
4154        assert_magic_match_bin!(
4155            "0 lestring16 x %s",
4156            b"\x61\x00\x62\x00\x63\x00\x64\x00\x00",
4157            "abcd"
4158        );
4159        assert_magic_not_match_bin!(
4160            "0 lestring16 abcd Little-endian UTF-16 string",
4161            b"\x00\x61\x00\x62\x00\x63\x00\x64"
4162        );
4163        assert_magic_match_bin!(
4164            "4 lestring16 abcd Little-endian UTF-16 string",
4165            b"\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64\x00"
4166        );
4167    }
4168
4169    #[test]
4170    fn test_bestring16() {
4171        assert_magic_match_bin!(
4172            "0 bestring16 abcd Big-endian UTF-16 string",
4173            b"\x00\x61\x00\x62\x00\x63\x00\x64"
4174        );
4175        assert_magic_match_bin!(
4176            "0 bestring16 x %s",
4177            b"\x00\x61\x00\x62\x00\x63\x00\x64",
4178            "abcd"
4179        );
4180        assert_magic_not_match_bin!(
4181            "0 bestring16 abcd Big-endian UTF-16 string",
4182            b"\x61\x00\x62\x00\x63\x00\x64\x00"
4183        );
4184        assert_magic_match_bin!(
4185            "4 bestring16 abcd Big-endian UTF-16 string",
4186            b"\x00\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64"
4187        );
4188    }
4189
4190    #[test]
4191    fn test_offset_from_end() {
4192        assert_magic_match_bin!("-1 ubyte 0x42 last byte ok", b"\x00\x00\x42");
4193        assert_magic_match_bin!("-2 ubyte 0x41 last byte ok", b"\x00\x41\x00");
4194    }
4195
4196    #[test]
4197    fn test_relative_offset() {
4198        assert_magic_match_bin!(
4199            "
4200            0 ubyte 0x42
4201            >&0 ubyte 0x00
4202            >>&0 ubyte 0x41 third byte ok
4203            ",
4204            b"\x42\x00\x41\x00"
4205        );
4206    }
4207
4208    #[test]
4209    fn test_indirect_offset() {
4210        assert_magic_match_bin!("(0.l) ubyte 0x42 it works", b"\x04\x00\x00\x00\x42");
4211        // adding fixed value to offset
4212        assert_magic_match_bin!("(0.l+3) ubyte 0x42 it works", b"\x01\x00\x00\x00\x42");
4213        // testing offset pair
4214        assert_magic_match_bin!(
4215            "(0.l+(4)) ubyte 0x42 it works",
4216            b"\x04\x00\x00\x00\x04\x00\x00\x00\x42"
4217        );
4218    }
4219
4220    #[test]
4221    fn test_use_with_message() {
4222        assert_magic_match_bin!(
4223            r#"
42240 string MZ
4225>0 use mz first match
4226
42270 name mz then second match
4228>0 string MZ
4229"#,
4230            b"MZ\0",
4231            "first match then second match"
4232        );
4233    }
4234
4235    #[test]
4236    fn test_scalar_transform() {
4237        assert_magic_match_bin!("0 ubyte+1 0x1 add works", b"\x00");
4238        assert_magic_match_bin!("0 ubyte-1 0xfe sub works", b"\xff");
4239        assert_magic_match_bin!("0 ubyte%2 0 mod works", b"\x0a");
4240        assert_magic_match_bin!("0 ubyte&0x0f 0x0f bitand works", b"\xff");
4241        assert_magic_match_bin!("0 ubyte|0x0f 0xff bitor works", b"\xf0");
4242        assert_magic_match_bin!("0 ubyte^0x0f 0xf0 bitxor works", b"\xff");
4243
4244        FileMagicParser::parse_str("0 ubyte%0 mod by zero", None)
4245            .expect_err("expect div by zero error");
4246        FileMagicParser::parse_str("0 ubyte/0 div by zero", None)
4247            .expect_err("expect div by zero error");
4248    }
4249
4250    #[test]
4251    fn test_belong() {
4252        // Test that a file with a four-byte value at offset 0 that matches the given value in big-endian byte order
4253        assert_magic_match_bin!("0 belong 0x12345678 Big-endian long", b"\x12\x34\x56\x78");
4254        // Test that a file with a four-byte value at offset 0 that does not match the given value in big-endian byte order
4255        assert_magic_not_match_bin!("0 belong 0x12345678 Big-endian long", b"\x78\x56\x34\x12");
4256        // Test that a file with a four-byte value at a non-zero offset that matches the given value in big-endian byte order
4257        assert_magic_match_bin!(
4258            "4 belong 0x12345678 Big-endian long",
4259            b"\x00\x00\x00\x00\x12\x34\x56\x78"
4260        );
4261        // Test < operator
4262        assert_magic_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x77");
4263        assert_magic_not_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x78");
4264
4265        // Test > operator
4266        assert_magic_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x79");
4267        assert_magic_not_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x78");
4268
4269        // Test & operator
4270        assert_magic_match_bin!("0 belong &0x5678 Big-endian long", b"\x00\x00\x56\x78");
4271        assert_magic_not_match_bin!("0 belong &0x0000FFFF Big-endian long", b"\x12\x34\x56\x78");
4272
4273        // Test ^ operator (bitwise AND with complement)
4274        assert_magic_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x00\x56\x78");
4275        assert_magic_not_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x01\x56\x78");
4276
4277        // Test ~ operator
4278        assert_magic_match_bin!("0 belong ~0x12345678 Big-endian long", b"\xed\xcb\xa9\x87");
4279        assert_magic_not_match_bin!("0 belong ~0x12345678 Big-endian long", b"\x12\x34\x56\x78");
4280
4281        // Test x operator
4282        assert_magic_match_bin!("0 belong x Big-endian long", b"\x12\x34\x56\x78");
4283        assert_magic_match_bin!("0 belong x Big-endian long", b"\x78\x56\x34\x12");
4284    }
4285
4286    #[test]
4287    fn test_parse_search() {
4288        parse_assert!("0 search test");
4289        parse_assert!("0 search/24/s test");
4290        parse_assert!("0 search/s/24 test");
4291    }
4292
4293    #[test]
4294    fn test_bedate() {
4295        assert_magic_match_bin!(
4296            "0 bedate 946684800 Unix date (Jan 1, 2000)",
4297            b"\x38\x6D\x43\x80"
4298        );
4299        assert_magic_not_match_bin!(
4300            "0 bedate 946684800 Unix date (Jan 1, 2000)",
4301            b"\x00\x00\x00\x00"
4302        );
4303        assert_magic_match_bin!(
4304            "4 bedate 946684800 %s",
4305            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
4306            "2000-01-01 00:00:00"
4307        );
4308    }
4309    #[test]
4310    fn test_beldate() {
4311        assert_magic_match_bin!(
4312            "0 beldate 946684800 Local date (Jan 1, 2000)",
4313            b"\x38\x6D\x43\x80"
4314        );
4315        assert_magic_not_match_bin!(
4316            "0 beldate 946684800 Local date (Jan 1, 2000)",
4317            b"\x00\x00\x00\x00"
4318        );
4319
4320        assert_magic_match_bin!(
4321            "4 beldate 946684800 {}",
4322            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
4323            unix_local_time_to_string(946684800)
4324        );
4325    }
4326
4327    #[test]
4328    fn test_beqdate() {
4329        assert_magic_match_bin!(
4330            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
4331            b"\x00\x00\x00\x00\x38\x6D\x43\x80"
4332        );
4333
4334        assert_magic_not_match_bin!(
4335            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
4336            b"\x00\x00\x00\x00\x00\x00\x00\x00"
4337        );
4338
4339        assert_magic_match_bin!(
4340            "0 beqdate 946684800 %s",
4341            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
4342            "2000-01-01 00:00:00"
4343        );
4344    }
4345
4346    #[test]
4347    fn test_medate() {
4348        assert_magic_match_bin!(
4349            "0 medate 946684800 Unix date (Jan 1, 2000)",
4350            b"\x6D\x38\x80\x43"
4351        );
4352
4353        assert_magic_not_match_bin!(
4354            "0 medate 946684800 Unix date (Jan 1, 2000)",
4355            b"\x00\x00\x00\x00"
4356        );
4357
4358        assert_magic_match_bin!(
4359            "4 medate 946684800 %s",
4360            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
4361            "2000-01-01 00:00:00"
4362        );
4363    }
4364
4365    #[test]
4366    fn test_meldate() {
4367        assert_magic_match_bin!(
4368            "0 meldate 946684800 Local date (Jan 1, 2000)",
4369            b"\x6D\x38\x80\x43"
4370        );
4371        assert_magic_not_match_bin!(
4372            "0 meldate 946684800 Local date (Jan 1, 2000)",
4373            b"\x00\x00\x00\x00"
4374        );
4375
4376        assert_magic_match_bin!(
4377            "4 meldate 946684800 %s",
4378            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
4379            unix_local_time_to_string(946684800)
4380        );
4381    }
4382
4383    #[test]
4384    fn test_date() {
4385        assert_magic_match_bin!(
4386            "0 date 946684800 Local date (Jan 1, 2000)",
4387            b"\x80\x43\x6D\x38"
4388        );
4389        assert_magic_not_match_bin!(
4390            "0 date 946684800 Local date (Jan 1, 2000)",
4391            b"\x00\x00\x00\x00"
4392        );
4393        assert_magic_match_bin!(
4394            "4 date 946684800 {}",
4395            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
4396            "2000-01-01 00:00:00"
4397        );
4398    }
4399
4400    #[test]
4401    fn test_leldate() {
4402        assert_magic_match_bin!(
4403            "0 leldate 946684800 Local date (Jan 1, 2000)",
4404            b"\x80\x43\x6D\x38"
4405        );
4406        assert_magic_not_match_bin!(
4407            "0 leldate 946684800 Local date (Jan 1, 2000)",
4408            b"\x00\x00\x00\x00"
4409        );
4410        assert_magic_match_bin!(
4411            "4 leldate 946684800 {}",
4412            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
4413            unix_local_time_to_string(946684800)
4414        );
4415    }
4416
4417    #[test]
4418    fn test_leqdate() {
4419        assert_magic_match_bin!(
4420            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
4421            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
4422        );
4423
4424        assert_magic_not_match_bin!(
4425            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
4426            b"\x00\x00\x00\x00\x00\x00\x00\x00"
4427        );
4428        assert_magic_match_bin!(
4429            "8 leqdate 1577836800 %s",
4430            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
4431            "2020-01-01 00:00:00"
4432        );
4433    }
4434
4435    #[test]
4436    fn test_leqldate() {
4437        assert_magic_match_bin!(
4438            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
4439            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
4440        );
4441
4442        assert_magic_not_match_bin!(
4443            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
4444            b"\x00\x00\x00\x00\x00\x00\x00\x00"
4445        );
4446        assert_magic_match_bin!(
4447            "8 leqldate 1577836800 %s",
4448            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
4449            unix_local_time_to_string(1577836800)
4450        );
4451    }
4452
4453    #[test]
4454    fn test_melong() {
4455        // Test = operator
4456        assert_magic_match_bin!(
4457            "0 melong =0x12345678 Middle-endian long",
4458            b"\x34\x12\x78\x56"
4459        );
4460        assert_magic_not_match_bin!(
4461            "0 melong =0x12345678 Middle-endian long",
4462            b"\x00\x00\x00\x00"
4463        );
4464
4465        // Test < operator
4466        assert_magic_match_bin!(
4467            "0 melong <0x12345678 Middle-endian long",
4468            b"\x34\x12\x78\x55"
4469        ); // 0x12345677 in middle-endian
4470        assert_magic_not_match_bin!(
4471            "0 melong <0x12345678 Middle-endian long",
4472            b"\x34\x12\x78\x56"
4473        ); // 0x12345678 in middle-endian
4474
4475        // Test > operator
4476        assert_magic_match_bin!(
4477            "0 melong >0x12345678 Middle-endian long",
4478            b"\x34\x12\x78\x57"
4479        ); // 0x12345679 in middle-endian
4480        assert_magic_not_match_bin!(
4481            "0 melong >0x12345678 Middle-endian long",
4482            b"\x34\x12\x78\x56"
4483        ); // 0x12345678 in middle-endian
4484
4485        // Test & operator
4486        assert_magic_match_bin!("0 melong &0x5678 Middle-endian long", b"\xab\xcd\x78\x56"); // 0x00007856 in middle-endian
4487        assert_magic_not_match_bin!(
4488            "0 melong &0x0000FFFF Middle-endian long",
4489            b"\x34\x12\x78\x56"
4490        ); // 0x12347856 in middle-endian
4491
4492        // Test ^ operator (bitwise AND with complement)
4493        assert_magic_match_bin!(
4494            "0 melong ^0xFFFF0000 Middle-endian long",
4495            b"\x00\x00\x78\x56"
4496        ); // 0x00007856 in middle-endian
4497        assert_magic_not_match_bin!(
4498            "0 melong ^0xFFFF0000 Middle-endian long",
4499            b"\x00\x01\x78\x56"
4500        ); // 0x00017856 in middle-endian
4501
4502        // Test ~ operator
4503        assert_magic_match_bin!(
4504            "0 melong ~0x12345678 Middle-endian long",
4505            b"\xCB\xED\x87\xA9"
4506        );
4507        assert_magic_not_match_bin!(
4508            "0 melong ~0x12345678 Middle-endian long",
4509            b"\x34\x12\x78\x56"
4510        ); // The original value
4511
4512        // Test x operator
4513        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x34\x12\x78\x56");
4514        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x00\x00\x00\x00");
4515    }
4516
    // Checks every comparison operator accepted by the `uquad` type against
    // 8-byte inputs. The inputs store the least significant byte first: the
    // `x {:#x}` case below renders `\xF0\xDE\xBC\x9A\x78\x56\x34\x12` as
    // `0x123456789abcdef0`.
    #[test]
    fn test_uquad() {
        // Test = operator
        assert_magic_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        assert_magic_not_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        // Test < operator
        // (last input byte is the most significant one: 0x11 < 0x12)
        assert_magic_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x11"
        );
        // Equal values must not satisfy a strict comparison
        assert_magic_not_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test > operator
        // (most significant byte raised to 0x13)
        assert_magic_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x13"
        );
        // Equal values must not satisfy a strict comparison
        assert_magic_not_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test & operator
        // Lowest byte is 0xF0: every bit of the 0xF0 mask is set ...
        assert_magic_match_bin!(
            "0 uquad &0xF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        // ... but the low nibble required by the 0xFF mask is not
        assert_magic_not_match_bin!(
            "0 uquad &0xFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        ); // All bits clear
        assert_magic_not_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        ); // Some bits set

        // Test ~ operator
        // Each input byte is the bitwise complement of the matching byte of
        // the reference value (0xF0 -> 0x0F, 0xDE -> 0x21, ...)
        assert_magic_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\x0F\x21\x43\x65\x87\xA9\xCB\xED"
        );
        assert_magic_not_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        ); // The original value

        // Test x operator
        // The third macro argument is the expected rendered message
        assert_magic_match_bin!(
            "0 uquad x {:#x}",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12",
            "0x123456789abcdef0"
        );
        assert_magic_match_bin!(
            "0 uquad x Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }
4590
    // Checks the `guid` type: exact match, mismatch and formatting of the
    // value read from a 16-byte input.
    #[test]
    fn test_guid() {
        // The input bytes appear in the same order as the hex digits of the
        // textual GUID used in the rule
        assert_magic_match_bin!(
            "0 guid EC959539-6786-2D4E-8FDB-98814CE76C1E It works",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E"
        );

        // Unrelated bytes must not match
        assert_magic_not_match_bin!(
            "0 guid 399595EC-8667-4E2D-8FDB-98814CE76C1E It works",
            b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
        );

        // `x` matches any value; `%s` is expected to render the GUID in its
        // canonical dashed, upper-case form
        assert_magic_match_bin!(
            "0 guid x %s",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E",
            "EC959539-6786-2D4E-8FDB-98814CE76C1E"
        );
    }
4609
    // Checks the `ubeqdate` type on an 8-byte input. Read most significant
    // byte first, `\x00\x00\x00\x00\x61\x56\x4f\x80` is 0x61564f80, i.e.
    // 1633046400.
    #[test]
    fn test_ubeqdate() {
        // Numeric comparison against the raw timestamp
        assert_magic_match_bin!(
            "0 ubeqdate 1633046400 It works",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80"
        );

        // `%s` is expected to render the timestamp as a date-time string
        assert_magic_match_bin!(
            "0 ubeqdate x %s",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80",
            "2021-10-01 00:00:00"
        );

        // A zero timestamp must not equal the reference value
        assert_magic_not_match_bin!(
            "0 ubeqdate 1633046400 It should not work",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }
4628
    // Checks the `ldate` type on a 4-byte input. Read least significant byte
    // first, `\x60\xd4\xC8\x61` is 0x61c8d460, i.e. 1640551520.
    #[test]
    fn test_ldate() {
        assert_magic_match_bin!("0 ldate 1640551520 It works", b"\x60\xd4\xC8\x61");

        // A zero timestamp must not equal the reference value
        assert_magic_not_match_bin!("0 ldate 1633046400 It should not work", b"\x00\x00\x00\x00");

        // The expected text is produced by the `unix_local_time_to_string`
        // helper rather than hard-coded; presumably because the rendering
        // depends on the local time zone (NOTE(review): confirm against the
        // helper definition)
        assert_magic_match_bin!(
            "0 ldate x %s",
            b"\x60\xd4\xC8\x61",
            unix_local_time_to_string(1640551520)
        );
    }
4641
    // Checks that an arithmetic transform attached to an integer type
    // (`/10`, `%10`) is applied to the value read before it is compared and
    // before it is rendered. The input byte 0x14 is 20.
    #[test]
    fn test_scalar_with_transform() {
        // 20 / 10 == 2: used both for the comparison and for the message
        assert_magic_match_bin!("0 ubyte/10 2 {}", b"\x14", "2");
        assert_magic_match_bin!("0 ubyte/10 x {}", b"\x14", "2");
        // 20 % 10 == 0
        assert_magic_match_bin!("0 ubyte%10 x {}", b"\x14", "0");
    }
4648
    // Same as the integer transform test, for a floating point type. The
    // input `\x00\x00\xa0\x41` is the little-endian encoding of the 32-bit
    // float 20.0 (0x41a00000).
    #[test]
    fn test_float_with_transform() {
        // 20.0 / 10 == 2: used both for the comparison and for the message
        assert_magic_match_bin!("0 lefloat/10 2 {}", b"\x00\x00\xa0\x41", "2");
        assert_magic_match_bin!("0 lefloat/10 x {}", b"\x00\x00\xa0\x41", "2");
        // 20.0 % 10 == 0
        assert_magic_match_bin!("0 lefloat%10 x {}", b"\x00\x00\xa0\x41", "0");
    }
4655
    // Pins the contract of `read_octal_u64` as observed through these cases:
    // - the input must start with '0', otherwise `None` is returned;
    // - digits are consumed until the first non-octal character;
    // - a lone leading '0' followed by a non-octal character yields `Some(0)`.
    #[test]
    fn test_read_octal() {
        // Basic cases
        assert_eq!(read_octal_u64(&mut lazy_cache!("0")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("00")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("01")), Some(1));
        assert_eq!(read_octal_u64(&mut lazy_cache!("07")), Some(7));
        assert_eq!(read_octal_u64(&mut lazy_cache!("010")), Some(8));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0123")), Some(83));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0755")), Some(493));

        // With trailing non-octal characters
        assert_eq!(read_octal_u64(&mut lazy_cache!("0ABC")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("01ABC")), Some(1));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0755ABC")), Some(493));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0123ABC")), Some(83));

        // Invalid octal digits
        assert_eq!(read_octal_u64(&mut lazy_cache!("08")), Some(0)); // stops at '8'
        assert_eq!(read_octal_u64(&mut lazy_cache!("01238")), Some(83)); // stops at '8'

        // No leading '0'
        assert_eq!(read_octal_u64(&mut lazy_cache!("123")), None);
        assert_eq!(read_octal_u64(&mut lazy_cache!("755")), None);

        // Empty string
        assert_eq!(read_octal_u64(&mut lazy_cache!("")), None);

        // Only non-octal characters
        assert_eq!(read_octal_u64(&mut lazy_cache!("ABC")), None);
        assert_eq!(read_octal_u64(&mut lazy_cache!("8ABC")), None); // first char is not '0'

        // Longer valid octal (but within u64 range)
        // 0o1777777777 == 2 * 8^9 - 1 == 268435455
        assert_eq!(
            read_octal_u64(&mut lazy_cache!("01777777777")),
            Some(268435455)
        );
    }
4694
    // Regression test for indirect offsets combined with chained `use`
    // directives.
    //
    // Input layout: byte 0 is 0x00, bytes 1..=4 are `TEST`, byte 5 is 0x06,
    // bytes 6..=10 are `twice`, byte 11 is 0x00 and byte 12 is 0x06.
    // The rule reads the byte at offset 5 (0x06) to locate `twice`, and the
    // innermost named rule reads another 0x06 through `(6.b)` before printing
    // the string it lands on.
    #[test]
    fn test_offset_bug_1() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
# offset computation is relative to
# rule start
>(5.b)	use toasted

0 name toasted
>0	string twice Toasted
>>0  use toasted_twice

0 name toasted_twice
>(6.b) string x %s
        ",
            b"\x00TEST\x06twice\x00\x06",
            "Bread is Toasted twice"
        );
    }
4717
4718    // this test implement the exact same logic as
4719    // test_offset_bug_1 except that the rule starts
4720    // matching from end. Surprisingly we need to
4721    // adjust indirect offsets so that it works in
4722    // libmagic/file
4723    #[test]
4724    fn test_offset_bug_2() {
4725        // this tests the exact behaviour
4726        // expected by libmagic/file
4727        assert_magic_match_bin!(
4728            r"
4729-12	string		TEST Bread is
4730>(4.b)	use toasted
4731
47320 name toasted
4733>0	string twice Toasted
4734>>0  use toasted_twice
4735
47360 name toasted_twice
4737>(6.b) string x %
4738        ",
4739            b"\x00TEST\x06twice\x00\x06",
4740            "Bread is Toasted twice"
4741        )
4742    }
4743
    // Regression test for `indirect/r`: the byte at offset 5 (0x06) selects
    // where the whole rule set is re-applied, and the top-level rules
    // (`0 string twice ...`) are then expected to match at that new position.
    // The trailing 0x08 byte of the input is not referenced by this rule.
    #[test]
    fn test_offset_bug_3() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
>(5.b) indirect/r x

0	string twice Toasted
>0  use toasted_twice

0 name toasted_twice
>0 string x %s
        ",
            b"\x00TEST\x06twice\x00\x08",
            "Bread is Toasted twice"
        )
    }
4763
    // Regression test for offsets evaluated after an `indirect/r` jump and
    // then through a `use` directive.
    //
    // Input layout: byte 0 is 0x00, bytes 1..=5 are `Bread`, byte 6 is 0x06,
    // bytes 7..=16 are `is Toasted`, byte 17 is 0x0c and bytes 18..=22 are
    // `twice`. The comments embedded in the rule describe which base offset
    // each step is expected to use.
    #[test]
    fn test_offset_bug_4() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		Bread %s
>(6.b) indirect/r x

# this one uses a based offset
# computed at indirection
1	string is\ Toasted %s
>(11.b)  use toasted_twice

# this one is using a new base
# offset being previous base
# offset + offset of use
0 name toasted_twice
>0 string x %s
            ",
            b"\x00Bread\x06is Toasted\x0ctwice\x00",
            "Bread is Toasted twice"
        )
    }
4788
    // Variant of the `indirect/r` regression test where the named rule uses a
    // relative offset (`&1`): after matching `twice`, it checks the 0x08 byte
    // that ends the input and only then emits the final `twice` message.
    #[test]
    fn test_offset_bug_5() {
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
>(5.b) indirect/r x

0	string twice Toasted
>0  use toasted_twice

0 name toasted_twice
>0 string twice
>>&1 byte 0x08 twice
            ",
            b"\x00TEST\x06twice\x00\x08",
            "Bread is Toasted twice"
        )
    }
4807
    // Regression test for the success status of a `use` directive. The
    // expected message does not contain `but not burnt`, so the `default`
    // line nested under `use toasted` must not contribute to the output.
    #[test]
    fn test_bug_6() {
        // An indirect use test should not be successful
        // even if a match with no message occurs

        assert_magic_match_bin!(
            r"
1	string		TEST Bread is toasted
>&0 use toasted
>>&0 default x but not burnt

0 name toasted
>1 string toasted
            ",
            b"\x00TEST\x06toasted",
            "Bread is toasted"
        )
    }
4826
    // Regression test for two levels of `use`, each reached through an
    // indirect offset.
    //
    // Input layout: byte 0 is 0x00, bytes 1..=4 are `TEST`, byte 5 is 0x06,
    // bytes 6..=10 are `toast`, byte 11 is 0x00, byte 12 is 0x06 and bytes
    // 13..=17 are `twice`.
    #[test]
    fn test_offset_bug_7() {
        // Bug: nested 'use' directives with indirect offsets don't properly
        // adjust offsets during recursion. This test encodes the behavior
        // libmagic has when dealing with such scenarios.
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
# offset computation is relative to
# rule start
>(5.b)	use toasted

0 name toasted
>0	string toast Toasted
>>(6.b)  use toasted_twice

0 name toasted_twice
>1 string x %s
        ",
            b"\x00TEST\x06toast\x00\x06twice\x00",
            "Bread is Toasted twice"
        );
    }
4850
4851    #[test]
4852    fn test_message_parts() {
4853        let m = first_magic(
4854            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
4855            b"#!/usr/bin/env    python",
4856            StreamKind::Text(TextEncoding::Ascii),
4857        )
4858        .unwrap();
4859
4860        assert!(m.message_parts().any(|p| p.eq_ignore_ascii_case("python")))
4861    }
4862
4863    #[test]
4864    fn test_load_bulk() {
4865        let mut db = MagicDb::new();
4866
4867        let rules = vec![
4868            parse_assert!("0 search test"),
4869            parse_assert!("0 search/24/s test"),
4870            parse_assert!("0 search/s/24 test"),
4871        ];
4872
4873        db.load_bulk(rules.into_iter());
4874        db.verify().unwrap();
4875    }
4876
4877    #[test]
4878    fn test_load_bulk_failure() {
4879        let mut db = MagicDb::new();
4880
4881        let rules = vec![parse_assert!(
4882            r#"
48830 search/s/24 test
4884>0 use test
4885"#
4886        )];
4887
4888        db.load_bulk(rules.into_iter());
4889        assert!(matches!(db.verify(), Err(Error::Verify(_, _, _))));
4890    }
4891}