pure_magic/
lib.rs

1#![forbid(unsafe_code)]
2#![deny(unused_imports)]
3#![deny(missing_docs)]
4//! # `pure-magic`: A pure and safe Rust Reimplementation of `libmagic`
5//!
6//! Unlike many file identification crates, `pure-magic` is highly compatible with the standard
7//! `magic` rule format, allowing seamless reuse of existing
8//! [rules](https://github.com/qjerome/magic-rs/tree/main/magic-db/src/magdir). This makes it an ideal
9//! drop-in replacement for crates relying on **`libmagic` C bindings**, where memory safety is critical.
10//!
11//! **Key Features:**
12//! - File type detection
13//! - MIME type inference
14//! - Custom magic rule parsing
15//!
16//! ## Installation
17//! Add `pure-magic` to your `Cargo.toml`:
18//!
19//! ```toml
20//! [dependencies]
21//! pure-magic = "0.1"  # Replace with the latest version
22//! ```
23//!
24//! Or add the latest version with cargo:
25//!
26//! ```sh
27//! cargo add pure-magic
28//! ```
29//!
30//! ## Quick Start
31//!
32//! ### Detect File Types Programmatically
33//! ```rust
34//! use pure_magic::{MagicDb, MagicSource, DataReader};
35//! use std::fs::File;
36//!
37//! fn main() -> Result<(), Box<dyn std::error::Error>> {
38//!     let mut db = MagicDb::new();
39//!     // Create a MagicSource from a file
40//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
41//!     db.load(rust_magic);
42//!     // Verification is not mandatory
43//!     db.verify()?;
44//!
45//!     // Detect file type
46//!     let magic = db.first_magic_file("src/lib.rs")?;
47//!
48//!     println!(
49//!         "File type: {} (MIME: {}, strength: {})",
50//!         magic.message(),
51//!         magic.mime_type(),
52//!         magic.strength()
53//!     );
54//!     Ok(())
55//! }
56//! ```
57//!
58//! ### Get All Matching Rules
59//! ```rust
60//! use pure_magic::{MagicDb, MagicSource, DataReader};
61//! use std::fs::File;
62//!
63//! fn main() -> Result<(), Box<dyn std::error::Error>> {
64//!     let mut db = MagicDb::new();
65//!     // Create a MagicSource from a file
66//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
67//!     db.load(rust_magic);
68//!
69//!     // Get all matching rules, sorted by strength
70//!     let magics = db.all_magics_file("src/lib.rs")?;
71//!
72//!     // Must contain rust file magic and default text magic
73//!     assert!(magics.len() > 1);
74//!
75//!     for magic in magics {
76//!         println!(
77//!             "Match: {} (strength: {}, source: {})",
78//!             magic.message(),
79//!             magic.strength(),
80//!             magic.source().unwrap_or("unknown")
81//!         );
82//!     }
83//!     Ok(())
84//! }
85//! ```
86//!
87//! ### Serialize a Database to Disk
88//! ```rust
89//! use pure_magic::{MagicDb, MagicSource};
90//! use std::fs::File;
91//!
92//! fn main() -> Result<(), Box<dyn std::error::Error>> {
93//!     let mut db = MagicDb::new();
94//!     // Create a MagicSource from a file
95//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
96//!     db.load(rust_magic);
97//!
98//!     // Serialize the database to a file
99//!     let mut output = File::create("/tmp/compiled.db")?;
100//!     db.serialize(&mut output)?;
101//!
102//!     println!("Database saved to file");
103//!     Ok(())
104//! }
105//! ```
106//!
107//! ### Deserialize a Database
108//! ```rust
109//! use pure_magic::{MagicDb, MagicSource};
110//! use std::fs::File;
111//!
112//! fn main() -> Result<(), Box<dyn std::error::Error>> {
113//!     let mut db = MagicDb::new();
114//!     // Create a MagicSource from a file
115//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
116//!     db.load(rust_magic);
117//!
//!     // Serialize the database into a vector
119//!     let mut ser = vec![];
120//!     db.serialize(&mut ser)?;
121//!     println!("Database saved to vector");
122//!
123//!     // We deserialize from slice
124//!     let db = MagicDb::deserialize(&mut ser.as_slice())?;
125//!
126//!     assert!(!db.rules().is_empty());
127//!
128//!     Ok(())
129//! }
130//! ```
131//!
132//! ## License
133//! This project is dual-licensed under either:
134//! - **GPL-3.0**
135//! - **BSD-2-Clause**
136//!
137//! ## Contributing
138//! Contributions are welcome! Open an issue or submit a pull request.
139//!
140//! ## Acknowledgments
141//! - Inspired by the original `libmagic` (part of the `file` command).
142
143use dyf::{DynDisplay, FormatString, dformat};
144use flagset::{FlagSet, flags};
145use flate2::{Compression, read::GzDecoder, write::GzEncoder};
146use memchr::memchr;
147use pest::{Span, error::ErrorVariant};
148use regex::bytes::{self};
149use serde::{Deserialize, Serialize};
150use std::{
151    borrow::Cow,
152    cmp::max,
153    collections::{HashMap, HashSet},
154    fmt::{self, Debug, Display},
155    fs::File,
156    io::{self, Read, SeekFrom, Write},
157    ops::{Add, BitAnd, BitOr, BitXor, Deref, Div, Mul, Rem, Sub},
158    path::Path,
159};
160use tar::Archive;
161use thiserror::Error;
162use tracing::{Level, debug, enabled, trace};
163
164use crate::{
165    numeric::{Float, FloatDataType, Scalar, ScalarDataType},
166    parser::{FileMagicParser, Rule},
167    readers::DataRead,
168    utils::{
169        debug_string_from_vec_u8, debug_string_from_vec_u16, decode_id3, find_json_boundaries,
170        run_utf8_validation,
171    },
172};
173
174mod numeric;
175mod parser;
176pub mod readers;
177pub use readers::DataReader;
178mod utils;
179
// Strength value for hard-coded magics.
// NOTE(review): the code consuming this constant is outside this view — confirm.
const HARDCODED_MAGIC_STRENGTH: u64 = 2048;
// Source label for hard-coded magics (presumably reported where a rule file
// name would otherwise appear — confirm against the users of this constant).
const HARDCODED_SOURCE: &str = "hardcoded";
// corresponds to FILE_INDIR_MAX constant defined in libmagic
const MAX_RECURSION: usize = 50;
// constant found in libmagic. It is used as a limit for regex tests
const FILE_REGEX_MAX: usize = 8192;

/// Maximum number of bytes to read for search tests.
///
/// This constant is derived from `libmagic` and is used to limit the number of bytes
/// read during search tests to ensure performance and efficiency. The value is set
/// to 7 megabytes.
pub const FILE_BYTES_MAX: usize = 7 * 1024 * 1024;
/// Default mimetype for un-identified binary data
pub const DEFAULT_BIN_MIMETYPE: &str = "application/octet-stream";
/// Default mimetype for un-identified text data
pub const DEFAULT_TEXT_MIMETYPE: &str = "text/plain";

// strftime-like pattern (`YYYY-MM-DD HH:MM:SS`) shared inside the crate.
// NOTE(review): presumably used to render date values — confirm.
pub(crate) const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
199
/// Panics with the given `panic!`-style arguments, but only when the crate is
/// compiled with `debug_assertions`; otherwise the invocation does nothing.
macro_rules! debug_panic {
    ($($panic_args:tt)*) => {
        if cfg!(debug_assertions) {
            panic!($($panic_args)*);
        }
    };
}
207
/// Reads exactly `size_of::<$kind>()` bytes from `$src` through its
/// `read_exact_into` method and evaluates to the raw byte array.
///
/// The expansion uses `?`, so a read failure returns early from the
/// enclosing function.
macro_rules! read {
    ($src:expr, $kind:ty) => {{
        let mut raw = [0u8; std::mem::size_of::<$kind>()];
        $src.read_exact_into(&mut raw)?;
        raw
    }};
}

/// Reads a `$kind` stored in little-endian byte order from `$src`.
macro_rules! read_le {
    ($src:expr, $kind:ty) => {{
        let raw = read!($src, $kind);
        <$kind>::from_le_bytes(raw)
    }};
}

/// Reads a `$kind` stored in big-endian byte order from `$src`.
macro_rules! read_be {
    ($src:expr, $kind:ty) => {{
        let raw = read!($src, $kind);
        <$kind>::from_be_bytes(raw)
    }};
}

/// Reads a 32-bit middle-endian value from `$src`: two little-endian 16-bit
/// words, the first one being the most significant.
macro_rules! read_me {
    ($src:expr) => {{
        let high = read_le!($src, u16) as i32;
        let low = read_le!($src, u16) as i32;
        (high << 16) | low
    }};
}
227
228#[inline(always)]
229fn read_octal_u64<D: DataRead>(haystack: &mut D) -> Option<u64> {
230    let s = haystack
231        .read_while_or_limit(|b| matches!(b, b'0'..=b'7'), 22)
232        .map(|buf| str::from_utf8(buf))
233        .ok()?
234        .ok()?;
235
236    if !s.starts_with("0") {
237        return None;
238    }
239
240    u64::from_str_radix(s, 8).ok()
241}
242
243/// Represents all possible errors that can occur during file type detection and processing.
244#[derive(Debug, Error)]
245pub enum Error {
246    /// A generic error with a custom message.
247    #[error("{0}")]
248    Msg(String),
249
250    /// Indicate a rule load failure
251    #[error("source={0} line={1} error={2}")]
252    Verify(String, usize, Box<Error>),
253
254    /// An error with a source location and a nested error.
255    #[error("source={0} line={1} error={2}")]
256    Localized(String, usize, Box<Error>),
257
258    /// Indicates a required rule was not found.
259    #[error("missing rule: {0}")]
260    MissingRule(String),
261
262    /// Indicates the maximum recursion depth was reached.
263    #[error("maximum recursion reached: {0}")]
264    MaximumRecursion(usize),
265
266    /// Wraps an I/O error.
267    #[error("io: {0}")]
268    Io(#[from] io::Error),
269
270    /// Wraps a parsing error from the `pest` parser.
271    #[error("parser error: {0}")]
272    Parse(#[from] Box<pest::error::Error<Rule>>),
273
274    /// Wraps a formatting error from the `dyf` crate.
275    #[error("formatting: {0}")]
276    Format(#[from] dyf::Error),
277
278    /// Wraps a regex-related error.
279    #[error("regex: {0}")]
280    Regex(#[from] regex::Error),
281
282    /// Wraps a serialization error from `bincode`.
283    #[error("{0}")]
284    Serialize(#[from] bincode::error::EncodeError),
285
286    /// Wraps a deserialization error from `bincode`.
287    #[error("{0}")]
288    Deserialize(#[from] bincode::error::DecodeError),
289}
290
291impl Error {
292    #[inline]
293    fn parser<S: ToString>(msg: S, span: Span<'_>) -> Self {
294        Self::Parse(Box::new(pest::error::Error::new_from_span(
295            ErrorVariant::CustomError {
296                message: msg.to_string(),
297            },
298            span,
299        )))
300    }
301
302    fn msg<M: AsRef<str>>(msg: M) -> Self {
303        Self::Msg(msg.as_ref().into())
304    }
305
306    fn localized<S: AsRef<str>>(source: S, line: usize, err: Error) -> Self {
307        Self::Localized(source.as_ref().into(), line, err.into())
308    }
309
310    /// Unwraps the localized error
311    pub fn unwrap_localized(&self) -> &Self {
312        match self {
313            Self::Localized(_, _, e) => e,
314            _ => self,
315        }
316    }
317}
318
319#[derive(Debug, Clone, Serialize, Deserialize)]
320enum Message {
321    String(String),
322    Format {
323        printf_spec: String,
324        fs: FormatString,
325    },
326}
327
328impl Display for Message {
329    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
330        match self {
331            Self::String(s) => write!(f, "{s}"),
332            Self::Format { printf_spec: _, fs } => write!(f, "{}", fs.to_string_lossy()),
333        }
334    }
335}
336
337impl Message {
338    fn to_string_lossy(&self) -> Cow<'_, str> {
339        match self {
340            Message::String(s) => Cow::Borrowed(s),
341            Message::Format { printf_spec: _, fs } => fs.to_string_lossy(),
342        }
343    }
344
345    #[inline(always)]
346    fn format_with(&self, mr: Option<&MatchRes>) -> Result<Cow<'_, str>, Error> {
347        match self {
348            Self::String(s) => Ok(Cow::Borrowed(s.as_str())),
349            Self::Format {
350                printf_spec: c_spec,
351                fs,
352            } => {
353                if let Some(mr) = mr {
354                    match mr {
355                        MatchRes::Float(_, _) | MatchRes::Bytes(_, _, _, _) => {
356                            Ok(Cow::Owned(dformat!(fs, mr)?))
357                        }
358                        MatchRes::Scalar(_, scalar) => {
359                            // we want to print a byte as char
360                            if c_spec.as_str() == "c" {
361                                match scalar {
362                                    Scalar::byte(b) => {
363                                        let b = (*b as u8) as char;
364                                        Ok(Cow::Owned(dformat!(fs, b)?))
365                                    }
366                                    Scalar::ubyte(b) => {
367                                        let b = *b as char;
368                                        Ok(Cow::Owned(dformat!(fs, b)?))
369                                    }
370                                    _ => Ok(Cow::Owned(dformat!(fs, mr)?)),
371                                }
372                            } else {
373                                Ok(Cow::Owned(dformat!(fs, mr)?))
374                            }
375                        }
376                    }
377                } else {
378                    Ok(fs.to_string_lossy())
379                }
380            }
381        }
382    }
383}
384
385impl ScalarDataType {
386    #[inline(always)]
387    fn read<R: DataRead>(&self, from: &mut R, switch_endianness: bool) -> Result<Scalar, Error> {
388        macro_rules! _read_le {
389            ($ty: ty) => {{
390                if switch_endianness {
391                    <$ty>::from_be_bytes(read!(from, $ty))
392                } else {
393                    <$ty>::from_le_bytes(read!(from, $ty))
394                }
395            }};
396        }
397
398        macro_rules! _read_be {
399            ($ty: ty) => {{
400                if switch_endianness {
401                    <$ty>::from_le_bytes(read!(from, $ty))
402                } else {
403                    <$ty>::from_be_bytes(read!(from, $ty))
404                }
405            }};
406        }
407
408        macro_rules! _read_ne {
409            ($ty: ty) => {{
410                if cfg!(target_endian = "big") {
411                    _read_be!($ty)
412                } else {
413                    _read_le!($ty)
414                }
415            }};
416        }
417
418        macro_rules! _read_me {
419            () => {
420                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
421            };
422        }
423
424        Ok(match self {
425            // signed
426            Self::byte => Scalar::byte(read!(from, u8)[0] as i8),
427            Self::short => Scalar::short(_read_ne!(i16)),
428            Self::long => Scalar::long(_read_ne!(i32)),
429            Self::date => Scalar::date(_read_ne!(i32)),
430            Self::ldate => Scalar::ldate(_read_ne!(i32)),
431            Self::qwdate => Scalar::qwdate(_read_ne!(i64)),
432            Self::leshort => Scalar::leshort(_read_le!(i16)),
433            Self::lelong => Scalar::lelong(_read_le!(i32)),
434            Self::lequad => Scalar::lequad(_read_le!(i64)),
435            Self::bequad => Scalar::bequad(_read_be!(i64)),
436            Self::belong => Scalar::belong(_read_be!(i32)),
437            Self::bedate => Scalar::bedate(_read_be!(i32)),
438            Self::beldate => Scalar::beldate(_read_be!(i32)),
439            Self::beqdate => Scalar::beqdate(_read_be!(i64)),
440            // unsigned
441            Self::ubyte => Scalar::ubyte(read!(from, u8)[0]),
442            Self::ushort => Scalar::ushort(_read_ne!(u16)),
443            Self::uleshort => Scalar::uleshort(_read_le!(u16)),
444            Self::ulelong => Scalar::ulelong(_read_le!(u32)),
445            Self::uledate => Scalar::uledate(_read_le!(u32)),
446            Self::ulequad => Scalar::ulequad(_read_le!(u64)),
447            Self::offset => Scalar::offset(from.stream_position()),
448            Self::ubequad => Scalar::ubequad(_read_be!(u64)),
449            Self::medate => Scalar::medate(_read_me!()),
450            Self::meldate => Scalar::meldate(_read_me!()),
451            Self::melong => Scalar::melong(_read_me!()),
452            Self::beshort => Scalar::beshort(_read_be!(i16)),
453            Self::quad => Scalar::quad(_read_ne!(i64)),
454            Self::uquad => Scalar::uquad(_read_ne!(u64)),
455            Self::ledate => Scalar::ledate(_read_le!(i32)),
456            Self::leldate => Scalar::leldate(_read_le!(i32)),
457            Self::leqdate => Scalar::leqdate(_read_le!(i64)),
458            Self::leqldate => Scalar::leqldate(_read_le!(i64)),
459            Self::leqwdate => Scalar::leqwdate(_read_le!(i64)),
460            Self::ubelong => Scalar::ubelong(_read_be!(u32)),
461            Self::ulong => Scalar::ulong(_read_ne!(u32)),
462            Self::ubeshort => Scalar::ubeshort(_read_be!(u16)),
463            Self::ubeqdate => Scalar::ubeqdate(_read_be!(u64)),
464            Self::lemsdosdate => Scalar::lemsdosdate(_read_le!(u16)),
465            Self::lemsdostime => Scalar::lemsdostime(_read_le!(u16)),
466            Self::guid => Scalar::guid(u128::from_be_bytes(read!(from, u128))),
467        })
468    }
469}
470
471impl FloatDataType {
472    #[inline(always)]
473    fn read<R: DataRead>(&self, from: &mut R, switch_endianness: bool) -> Result<Float, Error> {
474        macro_rules! _read_le {
475            ($ty: ty) => {{
476                if switch_endianness {
477                    <$ty>::from_be_bytes(read!(from, $ty))
478                } else {
479                    <$ty>::from_le_bytes(read!(from, $ty))
480                }
481            }};
482        }
483
484        macro_rules! _read_be {
485            ($ty: ty) => {{
486                if switch_endianness {
487                    <$ty>::from_le_bytes(read!(from, $ty))
488                } else {
489                    <$ty>::from_be_bytes(read!(from, $ty))
490                }
491            }};
492        }
493
494        macro_rules! _read_ne {
495            ($ty: ty) => {{
496                if cfg!(target_endian = "big") {
497                    _read_be!($ty)
498                } else {
499                    _read_le!($ty)
500                }
501            }};
502        }
503
504        macro_rules! _read_me {
505            () => {
506                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
507            };
508        }
509
510        Ok(match self {
511            Self::lefloat => Float::lefloat(_read_le!(f32)),
512            Self::befloat => Float::befloat(_read_le!(f32)),
513            Self::ledouble => Float::ledouble(_read_le!(f64)),
514            Self::bedouble => Float::bedouble(_read_be!(f64)),
515        })
516    }
517}
518
519#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
520enum Op {
521    Mul,
522    Add,
523    Sub,
524    Div,
525    Mod,
526    And,
527    Xor,
528    Or,
529}
530
531impl Display for Op {
532    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
533        match self {
534            Op::Mul => write!(f, "*"),
535            Op::Add => write!(f, "+"),
536            Op::Sub => write!(f, "-"),
537            Op::Div => write!(f, "/"),
538            Op::Mod => write!(f, "%"),
539            Op::And => write!(f, "&"),
540            Op::Or => write!(f, "|"),
541            Op::Xor => write!(f, "^"),
542        }
543    }
544}
545
546#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
547enum CmpOp {
548    Eq,
549    Lt,
550    Gt,
551    BitAnd,
552    Neq, // ! operator
553    Xor,
554    Not, // ~ operator
555}
556
557impl CmpOp {
558    #[inline(always)]
559    fn is_neq(&self) -> bool {
560        matches!(self, Self::Neq)
561    }
562}
563
564#[derive(Debug, Clone, Serialize, Deserialize)]
565struct ScalarTransform {
566    op: Op,
567    num: Scalar,
568}
569
570impl ScalarTransform {
571    fn apply(&self, s: Scalar) -> Option<Scalar> {
572        match self.op {
573            Op::Add => s.checked_add(self.num),
574            Op::Sub => s.checked_sub(self.num),
575            Op::Mul => s.checked_mul(self.num),
576            Op::Div => s.checked_div(self.num),
577            Op::Mod => s.checked_rem(self.num),
578            Op::And => Some(s.bitand(self.num)),
579            Op::Xor => Some(s.bitxor(self.num)),
580            Op::Or => Some(s.bitor(self.num)),
581        }
582    }
583}
584
585#[derive(Debug, Clone, Serialize, Deserialize)]
586struct FloatTransform {
587    op: Op,
588    num: Float,
589}
590
591impl FloatTransform {
592    fn apply(&self, s: Float) -> Float {
593        match self.op {
594            Op::Add => s.add(self.num),
595            Op::Sub => s.sub(self.num),
596            Op::Mul => s.mul(self.num),
597            // returns inf when div by 0
598            Op::Div => s.div(self.num),
599            // returns NaN when rem by 0
600            Op::Mod => s.rem(self.num),
601            // parser makes sure those operators cannot be used
602            Op::And | Op::Xor | Op::Or => {
603                debug_panic!("unsupported operation");
604                s
605            }
606        }
607    }
608}
609
610#[derive(Clone, Serialize, Deserialize)]
611enum TestValue<T> {
612    Value(T),
613    Any,
614}
615
616impl Debug for TestValue<Vec<u8>> {
617    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
618        match self {
619            Self::Value(v) => write!(f, "\"{}\"", debug_string_from_vec_u8(v)),
620            Self::Any => write!(f, "ANY"),
621        }
622    }
623}
624
625impl Debug for TestValue<Vec<u16>> {
626    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
627        match self {
628            Self::Value(v) => write!(f, "\"{}\"", debug_string_from_vec_u16(v)),
629            Self::Any => write!(f, "ANY"),
630        }
631    }
632}
633
634impl Debug for TestValue<Scalar> {
635    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
636        match self {
637            Self::Value(s) => write!(f, "{s:?}"),
638            Self::Any => write!(f, "ANY"),
639        }
640    }
641}
642
643impl Debug for TestValue<Float> {
644    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
645        match self {
646            Self::Value(fl) => write!(f, "{fl:?}"),
647            Self::Any => write!(f, "ANY"),
648        }
649    }
650}
651
652impl<T> TestValue<T> {
653    #[inline(always)]
654    fn as_ref(&self) -> TestValue<&T> {
655        match self {
656            Self::Value(v) => TestValue::Value(v),
657            Self::Any => TestValue::Any,
658        }
659    }
660}
661
flags! {
    // Modifier flags stored in `RegexTest::mods` and `SearchTest::re_mods`.
    // Only `ForceBin` and `ForceText` are read by the code visible around this
    // definition; the notes on the other flags are inferred from their names.
    enum ReMod: u8{
        // NOTE(review): presumably case-insensitive matching — confirm
        CaseInsensitive,
        // NOTE(review): presumably changes how the offset is updated after a match — confirm
        StartOffsetUpdate,
        // NOTE(review): presumably makes the test length count lines — confirm
        LineLimit,
        // makes `is_binary` of the owning test report binary
        ForceBin,
        // makes `RegexTest::is_text` report text; `SearchTest::is_binary` is false when set
        ForceText,
        // NOTE(review): presumably trims the matched data — confirm
        TrimMatch,
    }
}
672
673fn serialize_regex<S>(re: &bytes::Regex, serializer: S) -> Result<S::Ok, S::Error>
674where
675    S: serde::Serializer,
676{
677    re.as_str().serialize(serializer)
678}
679
680fn deserialize_regex<'de, D>(deserializer: D) -> Result<bytes::Regex, D::Error>
681where
682    D: serde::Deserializer<'de>,
683{
684    let wrapper = String::deserialize(deserializer)?;
685    bytes::Regex::new(&wrapper).map_err(serde::de::Error::custom)
686}
687
688#[derive(Debug, Clone, Serialize, Deserialize)]
689struct RegexTest {
690    #[serde(
691        serialize_with = "serialize_regex",
692        deserialize_with = "deserialize_regex"
693    )]
694    re: bytes::Regex,
695    length: Option<usize>,
696    mods: FlagSet<ReMod>,
697    str_mods: FlagSet<StringMod>,
698    non_magic_len: usize,
699    binary: bool,
700    cmp_op: CmpOp,
701}
702
703impl RegexTest {
704    #[inline(always)]
705    fn is_binary(&self) -> bool {
706        self.binary
707            || self.mods.contains(ReMod::ForceBin)
708            || self.str_mods.contains(StringMod::ForceBin)
709    }
710
711    #[inline(always)]
712    fn is_text(&self) -> bool {
713        self.mods.contains(ReMod::ForceText) || self.str_mods.contains(StringMod::ForceText)
714    }
715
716    fn match_buf<'buf>(
717        &self,
718        off_buf: u64, // absolute buffer offset in content
719        stream_kind: StreamKind,
720        buf: &'buf [u8],
721    ) -> Option<MatchRes<'buf>> {
722        let mr = match stream_kind {
723            StreamKind::Text(_) => {
724                let mut off_txt = off_buf;
725
726                let mut line_limit = self.length.unwrap_or(usize::MAX);
727
728                for line in buf.split(|c| c == &b'\n') {
729                    // we don't need to break on offset
730                    // limit as buf contains the good amount
731                    // of bytes to match against
732                    if line_limit == 0 {
733                        break;
734                    }
735
736                    if let Some(re_match) = self.re.find(line) {
737                        // the offset of the string is computed from the start of the buffer
738                        let start_offset = off_txt + re_match.start() as u64;
739
740                        // if we matched until EOL we need to add one to include the delimiter removed from the split
741                        let stop_offset = if re_match.end() == line.len() {
742                            Some(start_offset + re_match.as_bytes().len() as u64 + 1)
743                        } else {
744                            None
745                        };
746
747                        return Some(MatchRes::Bytes(
748                            start_offset,
749                            stop_offset,
750                            re_match.as_bytes(),
751                            Encoding::Utf8,
752                        ));
753                    }
754
755                    off_txt += line.len() as u64;
756                    // we have to add one because lines do not contain splitting character
757                    off_txt += 1;
758                    line_limit = line_limit.saturating_sub(1)
759                }
760                None
761            }
762
763            StreamKind::Binary => {
764                self.re.find(buf).map(|re_match| {
765                    MatchRes::Bytes(
766                        // the offset of the string is computed from the start of the buffer
767                        off_buf + re_match.start() as u64,
768                        None,
769                        re_match.as_bytes(),
770                        Encoding::Utf8,
771                    )
772                })
773            }
774        };
775
776        // handle the case where we want the regex not to match
777        if self.cmp_op.is_neq() && mr.is_none() {
778            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
779        }
780
781        mr
782    }
783}
784
785impl From<RegexTest> for Test {
786    fn from(value: RegexTest) -> Self {
787        Self::Regex(value)
788    }
789}
790
flags! {
    // Modifier flags of string-like tests (`StringTest::mods`,
    // `SearchTest::str_mods`, `RegexTest::str_mods`).
    enum StringMod: u8{
        // makes `is_binary` of the owning test report binary
        ForceBin,
        // upper case characters in the magic match both cases in the target
        UpperInsensitive,
        // lower case characters in the magic match both cases in the target
        LowerInsensitive,
        // the byte following the match must be whitespace (or the end of the data)
        FullWordMatch,
        // NOTE(review): not read in the visible code — presumably trims the value, confirm
        Trim,
        // makes `is_text` of the owning test report text
        ForceText,
        // a run of blanks in the magic needs at least as many blanks in the target
        CompactWhitespace,
        // blanks are skipped on both sides while comparing
        OptBlank,
    }
}
803
804#[derive(Debug, Clone, Serialize, Deserialize)]
805struct StringTest {
806    test_val: TestValue<Vec<u8>>,
807    cmp_op: CmpOp,
808    length: Option<usize>,
809    mods: FlagSet<StringMod>,
810    binary: bool,
811}
812
813impl From<StringTest> for Test {
814    fn from(value: StringTest) -> Self {
815        Self::String(value)
816    }
817}
818
819#[inline(always)]
820fn string_match(str: &[u8], mods: FlagSet<StringMod>, buf: &[u8]) -> (bool, usize) {
821    let mut consumed = 0;
822    // we can do a simple string comparison
823    if mods.is_disjoint(
824        StringMod::UpperInsensitive
825            | StringMod::LowerInsensitive
826            | StringMod::FullWordMatch
827            | StringMod::CompactWhitespace
828            | StringMod::OptBlank,
829    ) {
830        // we check if target contains
831        if buf.starts_with(str) {
832            (true, str.len())
833        } else {
834            (false, consumed)
835        }
836    } else {
837        let mut i_src = 0;
838        let mut iter = buf.iter().peekable();
839
840        macro_rules! consume_target {
841            () => {{
842                if iter.next().is_some() {
843                    consumed += 1;
844                }
845            }};
846        }
847
848        macro_rules! continue_next_iteration {
849            () => {{
850                consume_target!();
851                i_src += 1;
852                continue;
853            }};
854        }
855
856        while let Some(&&b) = iter.peek() {
857            let Some(&ref_byte) = str.get(i_src) else {
858                break;
859            };
860
861            if mods.contains(StringMod::OptBlank) && (b == b' ' || ref_byte == b' ') {
862                if b == b' ' {
863                    // we ignore whitespace in target
864                    consume_target!();
865                }
866
867                if ref_byte == b' ' {
868                    // we ignore whitespace in test
869                    i_src += 1;
870                }
871
872                continue;
873            }
874
875            if mods.contains(StringMod::UpperInsensitive) {
876                //upper case characters in the magic match both lower and upper case characters in the target
877                if ref_byte.is_ascii_uppercase() && ref_byte == b.to_ascii_uppercase()
878                    || ref_byte == b
879                {
880                    continue_next_iteration!()
881                }
882            }
883
884            if mods.contains(StringMod::LowerInsensitive)
885                && (ref_byte.is_ascii_lowercase() && ref_byte == b.to_ascii_lowercase()
886                    || ref_byte == b)
887            {
888                continue_next_iteration!()
889            }
890
891            if mods.contains(StringMod::CompactWhitespace) && ref_byte == b' ' {
892                let mut src_blk = 0;
893                while let Some(b' ') = str.get(i_src) {
894                    src_blk += 1;
895                    i_src += 1;
896                }
897
898                let mut tgt_blk = 0;
899                while let Some(b' ') = iter.peek() {
900                    tgt_blk += 1;
901                    consume_target!();
902                }
903
904                if src_blk > tgt_blk {
905                    return (false, consumed);
906                }
907
908                continue;
909            }
910
911            if ref_byte == b {
912                continue_next_iteration!()
913            } else {
914                return (false, consumed);
915            }
916        }
917
918        if mods.contains(StringMod::FullWordMatch)
919            && let Some(b) = iter.peek()
920            && !b.is_ascii_whitespace()
921        {
922            return (false, consumed);
923        }
924
925        (
926            consumed > 0 && str.get(i_src).is_none() && consumed <= buf.len(),
927            consumed,
928        )
929    }
930}
931
932impl StringTest {
933    fn has_length_mod(&self) -> bool {
934        !self.mods.is_disjoint(
935            StringMod::UpperInsensitive
936                | StringMod::LowerInsensitive
937                | StringMod::FullWordMatch
938                | StringMod::CompactWhitespace
939                | StringMod::OptBlank,
940        )
941    }
942
943    #[inline(always)]
944    fn test_value_len(&self) -> usize {
945        match self.test_val.as_ref() {
946            TestValue::Value(s) => s.len(),
947            TestValue::Any => 0,
948        }
949    }
950
951    #[inline(always)]
952    fn is_binary(&self) -> bool {
953        self.binary || self.mods.contains(StringMod::ForceBin)
954    }
955
956    #[inline(always)]
957    fn is_text(&self) -> bool {
958        self.mods.contains(StringMod::ForceText)
959    }
960}
961
962#[derive(Clone, Serialize, Deserialize)]
963struct ByteVec(Vec<u8>);
964
965impl Debug for ByteVec {
966    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
967        write!(f, "\"{}\"", debug_string_from_vec_u8(self))
968    }
969}
970
971impl From<Vec<u8>> for ByteVec {
972    fn from(value: Vec<u8>) -> Self {
973        Self(value)
974    }
975}
976
977impl Deref for ByteVec {
978    type Target = Vec<u8>;
979
980    fn deref(&self) -> &Self::Target {
981        &self.0
982    }
983}
984
985#[derive(Debug, Clone, Serialize, Deserialize)]
986struct SearchTest {
987    str: ByteVec,
988    n_pos: Option<usize>,
989    str_mods: FlagSet<StringMod>,
990    re_mods: FlagSet<ReMod>,
991    binary: bool,
992    cmp_op: CmpOp,
993}
994
995impl From<SearchTest> for Test {
996    fn from(value: SearchTest) -> Self {
997        Self::Search(value)
998    }
999}
1000
1001impl SearchTest {
1002    #[inline(always)]
1003    fn is_binary(&self) -> bool {
1004        (self.binary
1005            || self.str_mods.contains(StringMod::ForceBin)
1006            || self.re_mods.contains(ReMod::ForceBin))
1007            && !(self.str_mods.contains(StringMod::ForceText)
1008                || self.re_mods.contains(ReMod::ForceText))
1009    }
1010
1011    // off_buf: absolute buffer offset in content
1012    #[inline]
1013    fn match_buf<'buf>(&self, off_buf: u64, buf: &'buf [u8]) -> Option<MatchRes<'buf>> {
1014        let mut i = 0;
1015
1016        let needle = self.str.first()?;
1017
1018        while i < buf.len() {
1019            // we cannot match if the first character isn't the same
1020            // so we accelerate the search by finding potential matches
1021            let Some(k) = memchr(*needle, &buf[i..]) else {
1022                break;
1023            };
1024
1025            i += k;
1026
1027            // if we want a full word match
1028            if self.str_mods.contains(StringMod::FullWordMatch) {
1029                let prev_is_whitespace = buf
1030                    .get(i.saturating_sub(1))
1031                    .map(|c| c.is_ascii_whitespace())
1032                    .unwrap_or_default();
1033
1034                // if it is not the first character
1035                // and its previous character isn't
1036                // a whitespace. It cannot be a
1037                // fullword match
1038                if i > 0 && !prev_is_whitespace {
1039                    i += 1;
1040                    continue;
1041                }
1042            }
1043
1044            if let Some(npos) = self.n_pos
1045                && i > npos
1046            {
1047                break;
1048            }
1049
1050            let pos = i;
1051            let (ok, consumed) = string_match(&self.str, self.str_mods, &buf[i..]);
1052
1053            if ok {
1054                return Some(MatchRes::Bytes(
1055                    off_buf.saturating_add(pos as u64),
1056                    None,
1057                    &buf[i..i + consumed],
1058                    Encoding::Utf8,
1059                ));
1060            } else {
1061                i += max(consumed, 1)
1062            }
1063        }
1064
1065        // handles the case where we want the string not to be found
1066        if self.cmp_op.is_neq() {
1067            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
1068        }
1069
1070        None
1071    }
1072}
1073
/// Test comparing a scalar read from the data against an expected value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTest {
    /// Type of the scalar to read.
    ty: ScalarDataType,
    /// Optional transformation applied to the value read before comparing it.
    transform: Option<ScalarTransform>,
    /// Comparison operator used between the value read and `test_val`.
    cmp_op: CmpOp,
    /// Expected value; `TestValue::Any` matches whatever is read.
    test_val: TestValue<Scalar>,
}
1081
/// Test comparing a float read from the data against an expected value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTest {
    /// Type of the float to read.
    ty: FloatDataType,
    /// Optional transformation applied to the value read before comparing it.
    transform: Option<FloatTransform>,
    /// Comparison operator used between the value read and `test_val`.
    cmp_op: CmpOp,
    /// Expected value; `TestValue::Any` matches whatever is read.
    test_val: TestValue<Float>,
}
1089
// The value read from the haystack we want to match against.
// 'buf is the lifetime of the buffer we are scanning.
//
// The first field of every variant is the stream position captured
// right before the value was read.
#[derive(PartialEq)]
enum ReadValue<'buf> {
    // a float value
    Float(u64, Float),
    // a scalar value
    Scalar(u64, Scalar),
    // raw bytes borrowed from the scanned buffer
    Bytes(u64, &'buf [u8]),
}
1098
1099impl<'buf> Debug for ReadValue<'buf> {
1100    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1101        match self {
1102            Self::Float(_, fl) => write!(f, "{fl:?}"),
1103            Self::Scalar(_, s) => write!(f, "{s:?}"),
1104            Self::Bytes(_, b) => {
1105                if b.len() <= 128 {
1106                    write!(f, "\"{}\"", debug_string_from_vec_u8(b))
1107                } else {
1108                    let limit = 128;
1109                    write!(
1110                        f,
1111                        "\"{}\" (first {limit} bytes)",
1112                        debug_string_from_vec_u8(&b[..limit])
1113                    )
1114                }
1115            }
1116        }
1117    }
1118}
1119
1120impl DynDisplay for ReadValue<'_> {
1121    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1122        match self {
1123            Self::Float(_, s) => DynDisplay::dyn_fmt(s, f),
1124            Self::Scalar(_, s) => DynDisplay::dyn_fmt(s, f),
1125            Self::Bytes(_, b) => Ok(format!("{b:?}")),
1126        }
1127    }
1128}
1129
1130impl DynDisplay for &ReadValue<'_> {
1131    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1132        // Dereference self to get the TestValue and call its fmt method
1133        DynDisplay::dyn_fmt(*self, f)
1134    }
1135}
1136
1137impl Display for ReadValue<'_> {
1138    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1139        match self {
1140            Self::Float(_, v) => write!(f, "{v}"),
1141            Self::Scalar(_, s) => write!(f, "{s}"),
1142            Self::Bytes(_, b) => write!(f, "{b:?}"),
1143        }
1144    }
1145}
1146
/// Text encoding of the bytes carried by a `MatchRes::Bytes`, used to decode
/// them when the match is formatted.
enum Encoding {
    /// UTF-16 with the given byte order.
    Utf16(String16Encoding),
    /// UTF-8 (decoded lossily when formatted).
    Utf8,
}
1151
// Result of a successful test: carries the offset of the start
// of the data in the stream and the data itself.
enum MatchRes<'buf> {
    // Bytes.0: offset of the match
    // Bytes.1: optional end of match (to address the need of EOL adjustment in string regex)
    // Bytes.2: the bytes matching
    // Bytes.3: encoding of the buffer
    Bytes(u64, Option<u64>, &'buf [u8], Encoding),
    // Scalar.0: offset of the match
    // Scalar.1: the scalar value
    Scalar(u64, Scalar),
    // Float.0: offset of the match
    // Float.1: the float value
    Float(u64, Float),
}
1163
1164impl DynDisplay for &MatchRes<'_> {
1165    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1166        (*self).dyn_fmt(f)
1167    }
1168}
1169
1170impl DynDisplay for MatchRes<'_> {
1171    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1172        match self {
1173            Self::Scalar(_, v) => v.dyn_fmt(f),
1174            Self::Float(_, v) => v.dyn_fmt(f),
1175            Self::Bytes(_, _, v, enc) => match enc {
1176                Encoding::Utf8 => String::from_utf8_lossy(v).to_string().dyn_fmt(f),
1177                Encoding::Utf16(enc) => {
1178                    let utf16: Vec<u16> = slice_to_utf16_iter(v, *enc).collect();
1179                    String::from_utf16_lossy(&utf16).dyn_fmt(f)
1180                }
1181            },
1182        }
1183    }
1184}
1185
1186impl MatchRes<'_> {
1187    // start offset of the match
1188    #[inline]
1189    fn start_offset(&self) -> u64 {
1190        match self {
1191            MatchRes::Bytes(o, _, _, _) => *o,
1192            MatchRes::Scalar(o, _) => *o,
1193            MatchRes::Float(o, _) => *o,
1194        }
1195    }
1196
1197    // start offset of the match
1198    #[inline]
1199    fn end_offset(&self) -> u64 {
1200        match self {
1201            MatchRes::Bytes(start, end, buf, _) => match end {
1202                Some(end) => *end,
1203                None => start.saturating_add(buf.len() as u64),
1204            },
1205            MatchRes::Scalar(o, sc) => o.add(sc.size_of() as u64),
1206            MatchRes::Float(o, f) => o.add(f.size_of() as u64),
1207        }
1208    }
1209}
1210
1211fn slice_to_utf16_iter(read: &[u8], encoding: String16Encoding) -> impl Iterator<Item = u16> {
1212    let even = read
1213        .iter()
1214        .enumerate()
1215        .filter(|(i, _)| i % 2 == 0)
1216        .map(|t| t.1);
1217
1218    let odd = read
1219        .iter()
1220        .enumerate()
1221        .filter(|(i, _)| i % 2 != 0)
1222        .map(|t| t.1);
1223
1224    even.zip(odd).map(move |(e, o)| match encoding {
1225        String16Encoding::Le => u16::from_le_bytes([*e, *o]),
1226        String16Encoding::Be => u16::from_be_bytes([*e, *o]),
1227    })
1228}
1229
/// Byte order of the 16-bit units of a UTF-16 string.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum String16Encoding {
    /// Little endian: units are decoded with `u16::from_le_bytes`.
    Le,
    /// Big endian: units are decoded with `u16::from_be_bytes`.
    Be,
}
1235
/// Test comparing a UTF-16 string read from the data against an expected value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct String16Test {
    /// NOTE(review): presumably the test string as written in the rule —
    /// confirm against the parser.
    orig: String,
    /// Expected string as UTF-16 code units; `TestValue::Any` matches any string.
    test_val: TestValue<Vec<u16>>,
    /// Byte order used to decode the data read.
    encoding: String16Encoding,
}
1242
1243impl String16Test {
1244    /// if the test value is a specific value this method returns
1245    /// the number of utf16 characters. To obtain the length in
1246    /// bytes the return value needs to be multiplied by two.
1247    #[inline(always)]
1248    fn test_value_len(&self) -> usize {
1249        match self.test_val.as_ref() {
1250            TestValue::Value(str16) => str16.len(),
1251            TestValue::Any => 0,
1252        }
1253    }
1254}
1255
// Modifier flags carried by `Test::Indirect`.
flags! {
    enum IndirectMod: u8{
        Relative,
    }
}
1261
/// Set of `IndirectMod` flags.
type IndirectMods = FlagSet<IndirectMod>;
1263
/// Encoding of the length prefix of a length-prefixed string: width of the
/// prefix and, for multi-byte prefixes, its byte order. The trailing letter
/// on each variant is the existing one-letter annotation.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum PStringLen {
    Byte,    // B
    ShortBe, // H
    ShortLe, // h
    LongBe,  // L
    LongLe,  // l
}
1272
1273impl PStringLen {
1274    #[inline(always)]
1275    const fn size_of_len(&self) -> usize {
1276        match self {
1277            PStringLen::Byte => 1,
1278            PStringLen::ShortBe => 2,
1279            PStringLen::ShortLe => 2,
1280            PStringLen::LongBe => 4,
1281            PStringLen::LongLe => 4,
1282        }
1283    }
1284}
1285
/// Test on a length-prefixed string.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PStringTest {
    /// Encoding of the length prefix.
    len: PStringLen,
    /// Expected string; `TestValue::Any` matches any string.
    test_val: TestValue<Vec<u8>>,
    /// When set, the size of the prefix is subtracted from the length read
    /// to obtain the length of the string.
    include_len: bool,
}
1292
1293impl PStringTest {
1294    #[inline]
1295    fn read<'cache, R: DataRead>(
1296        &self,
1297        haystack: &'cache mut R,
1298    ) -> Result<Option<&'cache [u8]>, Error> {
1299        let mut len = match self.len {
1300            PStringLen::Byte => read_le!(haystack, u8) as u32,
1301            PStringLen::ShortBe => read_be!(haystack, u16) as u32,
1302            PStringLen::ShortLe => read_le!(haystack, u16) as u32,
1303            PStringLen::LongBe => read_be!(haystack, u32),
1304            PStringLen::LongLe => read_le!(haystack, u32),
1305        } as usize;
1306
1307        if self.include_len {
1308            len = len.saturating_sub(self.len.size_of_len())
1309        }
1310
1311        if let TestValue::Value(s) = self.test_val.as_ref()
1312            && len != s.len()
1313        {
1314            return Ok(None);
1315        }
1316
1317        let read = haystack.read_exact_count(len as u64)?;
1318
1319        Ok(Some(read))
1320    }
1321
1322    #[inline(always)]
1323    fn test_value_len(&self) -> usize {
1324        match self.test_val.as_ref() {
1325            TestValue::Value(s) => s.len(),
1326            TestValue::Any => 0,
1327        }
1328    }
1329}
1330
/// All the kinds of test a rule entry can carry.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Test {
    /// `name` entry, carrying the name.
    Name(String),
    /// `use` entry: a flag and the name of the rule used.
    Use(bool, String),
    /// Scalar comparison.
    Scalar(ScalarTest),
    /// Float comparison.
    Float(FloatTest),
    /// String comparison.
    String(StringTest),
    /// String search inside a buffer.
    Search(SearchTest),
    /// Length-prefixed string comparison.
    PString(PStringTest),
    /// Regular expression match.
    Regex(RegexTest),
    /// `indirect` entry with its modifiers.
    Indirect(FlagSet<IndirectMod>),
    /// UTF-16 string comparison.
    String16(String16Test),
    // FIXME: placeholder for strength computation
    #[allow(dead_code)]
    Der,
    /// `clear` entry.
    Clear,
    /// `default` entry.
    Default,
}
1349
1350impl Display for Test {
1351    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1352        match self {
1353            Test::Name(name) => write!(f, "name {name}"),
1354            Test::Use(flip, rule) => {
1355                if *flip {
1356                    write!(f, "use {rule}")
1357                } else {
1358                    write!(f, "use ^{rule}")
1359                }
1360            }
1361            Test::Scalar(st) => write!(f, "{st:?}"),
1362            Test::Float(ft) => write!(f, "{ft:?}"),
1363            Test::String(st) => write!(f, "{st:?}"),
1364            Test::Search(st) => write!(f, "{st:?}"),
1365            Test::PString(pt) => write!(f, "{pt:?}"),
1366            Test::Regex(rt) => write!(f, "{rt:?}"),
1367            Test::Indirect(fs) => write!(f, "indirect {fs:?}"),
1368            Test::String16(s16t) => write!(f, "{s16t:?}"),
1369            Test::Der => write!(f, "unimplemented der"),
1370            Test::Clear => write!(f, "clear"),
1371            Test::Default => write!(f, "default"),
1372        }
1373    }
1374}
1375
1376impl Test {
1377    // read the value to test from the haystack
1378    #[inline]
1379    fn read_test_value<'haystack, D: DataRead>(
1380        &self,
1381        haystack: &'haystack mut D,
1382        switch_endianness: bool,
1383    ) -> Result<Option<ReadValue<'haystack>>, Error> {
1384        let test_value_offset = haystack.stream_position();
1385
1386        match self {
1387            Self::Scalar(t) => {
1388                t.ty.read(haystack, switch_endianness)
1389                    .map(|s| Some(ReadValue::Scalar(test_value_offset, s)))
1390            }
1391
1392            Self::Float(t) => {
1393                t.ty.read(haystack, switch_endianness)
1394                    .map(|f| Some(ReadValue::Float(test_value_offset, f)))
1395            }
1396            Self::String(t) => {
1397                match t.test_val.as_ref() {
1398                    TestValue::Value(str) => {
1399                        let buf = if let Some(length) = t.length {
1400                            // if there is a length specified
1401                            haystack.read_exact_count(length as u64)?
1402                        } else {
1403                            // no length specified we read until end of string
1404
1405                            match t.cmp_op {
1406                                CmpOp::Eq | CmpOp::Neq => {
1407                                    if !t.has_length_mod() {
1408                                        haystack.read_exact_count(str.len() as u64)?
1409                                    } else {
1410                                        haystack.read_count(FILE_BYTES_MAX as u64)?
1411                                    }
1412                                }
1413                                CmpOp::Lt | CmpOp::Gt => {
1414                                    let read =
1415                                        haystack.read_until_any_delim_or_limit(b"\n\0", 8092)?;
1416
1417                                    if read.ends_with(b"\0") || read.ends_with(b"\n") {
1418                                        &read[..read.len() - 1]
1419                                    } else {
1420                                        read
1421                                    }
1422                                }
1423                                _ => {
1424                                    return Err(Error::Msg(format!(
1425                                        "string test does not support {:?} operator",
1426                                        t.cmp_op
1427                                    )));
1428                                }
1429                            }
1430                        };
1431
1432                        Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1433                    }
1434                    TestValue::Any => {
1435                        let read = haystack.read_until_any_delim_or_limit(b"\0\n", 8192)?;
1436                        // we don't take last byte if it matches end of string
1437                        let bytes = if read.ends_with(b"\0") || read.ends_with(b"\n") {
1438                            &read[..read.len() - 1]
1439                        } else {
1440                            read
1441                        };
1442
1443                        Ok(Some(ReadValue::Bytes(test_value_offset, bytes)))
1444                    }
1445                }
1446            }
1447
1448            Self::String16(t) => {
1449                match t.test_val.as_ref() {
1450                    TestValue::Value(str16) => {
1451                        let read = haystack.read_exact_count((str16.len() * 2) as u64)?;
1452
1453                        Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1454                    }
1455                    TestValue::Any => {
1456                        let read = haystack.read_until_utf16_or_limit(b"\x00\x00", 8192)?;
1457
1458                        // we make sure we have an even number of elements
1459                        let end = if read.len() % 2 == 0 {
1460                            read.len()
1461                        } else {
1462                            // we decide to read anyway even though
1463                            // length isn't even
1464                            read.len().saturating_sub(1)
1465                        };
1466
1467                        Ok(Some(ReadValue::Bytes(test_value_offset, &read[..end])))
1468                    }
1469                }
1470            }
1471
1472            Self::PString(t) => {
1473                let Some(read) = t.read(haystack)? else {
1474                    return Ok(None);
1475                };
1476                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1477            }
1478
1479            Self::Search(_) => {
1480                let buf = haystack.read_count(FILE_BYTES_MAX as u64)?;
1481                Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1482            }
1483
1484            Self::Regex(r) => {
1485                let length = {
1486                    match r.length {
1487                        Some(len) => {
1488                            if r.mods.contains(ReMod::LineLimit) {
1489                                len * 80
1490                            } else {
1491                                len
1492                            }
1493                        }
1494
1495                        None => FILE_REGEX_MAX,
1496                    }
1497                };
1498
1499                let read = haystack.read_count(length as u64)?;
1500                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1501            }
1502
1503            Self::Name(_)
1504            | Self::Use(_, _)
1505            | Self::Indirect(_)
1506            | Self::Clear
1507            | Self::Default
1508            | Self::Der => Err(Error::msg("no value to read for this test")),
1509        }
1510    }
1511
1512    #[inline(always)]
1513    fn match_value<'s>(
1514        &'s self,
1515        tv: &ReadValue<'s>,
1516        stream_kind: StreamKind,
1517    ) -> Option<MatchRes<'s>> {
1518        match (self, tv) {
1519            (Self::Scalar(t), ReadValue::Scalar(o, ts)) => {
1520                let read_value: Scalar = match t.transform.as_ref() {
1521                    Some(t) => t.apply(*ts)?,
1522                    None => *ts,
1523                };
1524
1525                match t.test_val {
1526                    TestValue::Value(test_value) => {
1527                        let ok = match t.cmp_op {
1528                            // NOTE: this should not happen in practice because
1529                            // we convert it into Eq equivalent at parsing time
1530                            CmpOp::Not => read_value == !test_value,
1531                            CmpOp::Eq => read_value == test_value,
1532                            CmpOp::Lt => read_value < test_value,
1533                            CmpOp::Gt => read_value > test_value,
1534                            CmpOp::Neq => read_value != test_value,
1535                            CmpOp::BitAnd => read_value & test_value == test_value,
1536                            CmpOp::Xor => (read_value & test_value).is_zero(),
1537                        };
1538
1539                        if ok {
1540                            Some(MatchRes::Scalar(*o, read_value))
1541                        } else {
1542                            None
1543                        }
1544                    }
1545
1546                    TestValue::Any => Some(MatchRes::Scalar(*o, read_value)),
1547                }
1548            }
1549
1550            (Self::Float(t), ReadValue::Float(o, f)) => {
1551                let read_value: Float = t.transform.as_ref().map(|t| t.apply(*f)).unwrap_or(*f);
1552
1553                match t.test_val {
1554                    TestValue::Value(tf) => {
1555                        let ok = match t.cmp_op {
1556                            CmpOp::Eq => read_value == tf,
1557                            CmpOp::Lt => read_value < tf,
1558                            CmpOp::Gt => read_value > tf,
1559                            CmpOp::Neq => read_value != tf,
1560                            _ => {
1561                                // this should never be reached as we validate
1562                                // operator in parser
1563                                debug_panic!("unsupported float comparison");
1564                                debug!("unsupported float comparison");
1565                                false
1566                            }
1567                        };
1568
1569                        if ok {
1570                            Some(MatchRes::Float(*o, read_value))
1571                        } else {
1572                            None
1573                        }
1574                    }
1575                    TestValue::Any => Some(MatchRes::Float(*o, read_value)),
1576                }
1577            }
1578
1579            (Self::String(st), ReadValue::Bytes(o, buf)) => {
1580                macro_rules! trim_buf {
1581                    ($buf: expr) => {{
1582                        if st.mods.contains(StringMod::Trim) {
1583                            $buf.trim_ascii()
1584                        } else {
1585                            $buf
1586                        }
1587                    }};
1588                }
1589
1590                match st.test_val.as_ref() {
1591                    TestValue::Value(str) => {
1592                        match st.cmp_op {
1593                            CmpOp::Eq => {
1594                                if let (true, _) = string_match(str, st.mods, buf) {
1595                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1596                                } else {
1597                                    None
1598                                }
1599                            }
1600                            CmpOp::Neq => {
1601                                if let (false, _) = string_match(str, st.mods, buf) {
1602                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1603                                } else {
1604                                    None
1605                                }
1606                            }
1607                            CmpOp::Gt => {
1608                                if buf.len() > str.len() {
1609                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1610                                } else {
1611                                    None
1612                                }
1613                            }
1614                            CmpOp::Lt => {
1615                                if buf.len() < str.len() {
1616                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1617                                } else {
1618                                    None
1619                                }
1620                            }
1621
1622                            // unsupported for strings
1623                            _ => {
1624                                // this should never be reached as we validate
1625                                // operator in parser
1626                                debug_panic!("unsupported string comparison");
1627                                debug!("unsupported string comparison");
1628                                None
1629                            }
1630                        }
1631                    }
1632                    TestValue::Any => {
1633                        Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1634                    }
1635                }
1636            }
1637
1638            (Self::PString(m), ReadValue::Bytes(o, buf)) => match m.test_val.as_ref() {
1639                TestValue::Value(psv) => {
1640                    if buf == psv {
1641                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8))
1642                    } else {
1643                        None
1644                    }
1645                }
1646                TestValue::Any => Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8)),
1647            },
1648
1649            (Self::String16(t), ReadValue::Bytes(o, buf)) => {
1650                match t.test_val.as_ref() {
1651                    TestValue::Value(str16) => {
1652                        // strings cannot be equal
1653                        if str16.len() * 2 != buf.len() {
1654                            return None;
1655                        }
1656
1657                        // we check string equality
1658                        for (i, utf16_char) in slice_to_utf16_iter(buf, t.encoding).enumerate() {
1659                            if str16[i] != utf16_char {
1660                                return None;
1661                            }
1662                        }
1663
1664                        Some(MatchRes::Bytes(
1665                            *o,
1666                            None,
1667                            t.orig.as_bytes(),
1668                            Encoding::Utf16(t.encoding),
1669                        ))
1670                    }
1671
1672                    TestValue::Any => {
1673                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf16(t.encoding)))
1674                    }
1675                }
1676            }
1677
1678            (Self::Regex(r), ReadValue::Bytes(o, buf)) => r.match_buf(*o, stream_kind, buf),
1679
1680            (Self::Search(t), ReadValue::Bytes(o, buf)) => t.match_buf(*o, buf),
1681
1682            _ => None,
1683        }
1684    }
1685
1686    #[inline(always)]
1687    fn strength(&self) -> u64 {
1688        const MULT: usize = 10;
1689
1690        let mut out = 2 * MULT;
1691
1692        // FIXME: octal is missing but it is not used in practice ...
1693        match self {
1694            Test::Scalar(s) => {
1695                out += s.ty.type_size() * MULT;
1696            }
1697
1698            Test::Float(t) => {
1699                out += t.ty.type_size() * MULT;
1700            }
1701
1702            Test::String(t) => out += t.test_value_len().saturating_mul(MULT),
1703
1704            Test::PString(t) => out += t.test_value_len().saturating_mul(MULT),
1705
1706            Test::Search(s) => {
1707                // NOTE: this implementation deviates from what is in
1708                // C libmagic. The purpose of this implementation is to
1709                // minimize the difference between similar tests,
1710                // implemented differently (ex: string test VS very localized search test).
1711                let n_pos = s.n_pos.unwrap_or(FILE_BYTES_MAX);
1712
1713                match n_pos {
1714                    // a search on one line should be equivalent to a string match
1715                    0..=80 => out += s.str.len().saturating_mul(MULT),
1716                    // search on the first 3 lines gets a little penalty
1717                    81..=240 => out += s.str.len() * s.str.len().clamp(0, MULT - 2),
1718                    // a search on more than 3 lines isn't considered very accurate
1719                    _ => out += s.str.len(),
1720                }
1721            }
1722
1723            Test::Regex(r) => {
1724                // NOTE: this implementation deviates from what is in
1725                // C libmagic. The purpose of this implementation is to
1726                // minimize the difference between similar tests,
1727                // implemented differently (ex: string test VS very localized regex test).
1728
1729                // we divide length by the number of capture group
1730                // which gives us a value close to he average string
1731                // length match in the regex.
1732                let v = r.non_magic_len / r.re.captures_len();
1733
1734                let len = r
1735                    .length
1736                    .map(|l| {
1737                        if r.mods.contains(ReMod::LineLimit) {
1738                            l * 80
1739                        } else {
1740                            l
1741                        }
1742                    })
1743                    .unwrap_or(FILE_BYTES_MAX);
1744
1745                match len {
1746                    // a search on one line should be equivalent to a string match
1747                    0..=80 => out += v.saturating_mul(MULT),
1748                    // search on the first 3 lines gets a little penalty
1749                    81..=240 => out += v * v.clamp(0, MULT - 2),
1750                    // a search on more than 3 lines isn't considered very accurate
1751                    _ => out += v,
1752                }
1753            }
1754
1755            Test::String16(t) => {
1756                // NOTE: in libmagic the result is div by 2
1757                // but I GUESS it is because the len is expressed
1758                // in number bytes. In our case length is expressed
1759                // in number of u16 so we shouldn't divide.
1760                out += t.test_value_len().saturating_mul(MULT);
1761            }
1762
1763            Test::Der => out += MULT,
1764
1765            Test::Default | Test::Name(_) | Test::Use(_, _) | Test::Indirect(_) | Test::Clear => {
1766                return 0;
1767            }
1768        }
1769
1770        // matching any output gets penalty
1771        if self.is_match_any() {
1772            return 0;
1773        }
1774
1775        if let Some(op) = self.cmp_op() {
1776            match op {
1777                // matching almost any gets penalty
1778                CmpOp::Neq => out = 0,
1779                CmpOp::Eq | CmpOp::Not => out += MULT,
1780                CmpOp::Lt | CmpOp::Gt => out -= 2 * MULT,
1781                CmpOp::Xor | CmpOp::BitAnd => out -= MULT,
1782            }
1783        }
1784
1785        out as u64
1786    }
1787
1788    #[inline(always)]
1789    fn cmp_op(&self) -> Option<CmpOp> {
1790        match self {
1791            Self::String(t) => Some(t.cmp_op),
1792            Self::Scalar(s) => Some(s.cmp_op),
1793            Self::Float(t) => Some(t.cmp_op),
1794            Self::Name(_)
1795            | Self::Use(_, _)
1796            | Self::Search(_)
1797            | Self::PString(_)
1798            | Self::Regex(_)
1799            | Self::Clear
1800            | Self::Default
1801            | Self::Indirect(_)
1802            | Self::String16(_)
1803            | Self::Der => None,
1804        }
1805    }
1806
1807    #[inline(always)]
1808    fn is_recursive(&self) -> bool {
1809        matches!(self, Test::Use(_, _) | Test::Indirect(_))
1810    }
1811
1812    #[inline(always)]
1813    fn is_match_any(&self) -> bool {
1814        match self {
1815            Test::Name(_) => false,
1816            Test::Use(_, _) => false,
1817            Test::Scalar(scalar_test) => matches!(scalar_test.test_val, TestValue::Any),
1818            Test::Float(float_test) => matches!(float_test.test_val, TestValue::Any),
1819            Test::String(string_test) => matches!(string_test.test_val, TestValue::Any),
1820            Test::Search(_) => false,
1821            Test::PString(pstring_test) => matches!(pstring_test.test_val, TestValue::Any),
1822            Test::Regex(_) => false,
1823            Test::Indirect(_) => false,
1824            Test::String16(string16_test) => matches!(string16_test.test_val, TestValue::Any),
1825            Test::Der => false,
1826            Test::Clear => false,
1827            Test::Default => false,
1828        }
1829    }
1830
1831    #[inline(always)]
1832    fn is_binary(&self) -> bool {
1833        match self {
1834            Self::Name(_) => true,
1835            Self::Use(_, _) => true,
1836            Self::Scalar(_) => true,
1837            Self::Float(_) => true,
1838            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1839            Self::Search(t) => t.is_binary(),
1840            Self::PString(_) => true,
1841            Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1842            Self::Clear => true,
1843            Self::Default => true,
1844            Self::Indirect(_) => true,
1845            Self::String16(_) => true,
1846            Self::Der => true,
1847        }
1848    }
1849
1850    #[inline(always)]
1851    fn is_text(&self) -> bool {
1852        match self {
1853            Self::Name(_) => true,
1854            Self::Use(_, _) => true,
1855            Self::Indirect(_) => true,
1856            Self::Clear => true,
1857            Self::Default => true,
1858            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1859            Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1860            _ => !self.is_binary(),
1861        }
1862    }
1863
1864    #[inline(always)]
1865    fn is_only_text(&self) -> bool {
1866        self.is_text() && !self.is_binary()
1867    }
1868
1869    #[inline(always)]
1870    fn is_only_binary(&self) -> bool {
1871        self.is_binary() && !self.is_text()
1872    }
1873}
1874
/// Encoding of the value stored at the address of an indirect offset.
///
/// The variant selects how `IndOffset::read_offset` decodes the bytes
/// found in the stream before turning them into a `u64`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum OffsetType {
    // one byte
    Byte,
    // `f64` read with `read_le!`, then cast to an integer
    DoubleLe,
    // `f64` read with `read_be!`, then cast to an integer
    DoubleBe,
    // 16 bits integer read with `read_le!`
    ShortLe,
    // 16 bits integer read with `read_be!`
    ShortBe,
    // `u32` read with `read_le!` and passed through `decode_id3`
    Id3Le,
    // `u32` read with `read_be!` and passed through `decode_id3`
    Id3Be,
    // 32 bits integer read with `read_le!`
    LongLe,
    // 32 bits integer read with `read_be!`
    LongBe,
    // value read with `read_me!`
    Middle,
    // value parsed by `read_octal_u64`
    Octal,
    // 64 bits integer read with `read_be!`
    QuadBe,
    // 64 bits integer read with `read_le!`
    QuadLe,
}
1891
/// Right-hand operand of the operation applied to an indirect offset.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Shift {
    // the operand is the value itself
    Direct(u64),
    // the operand is read from the stream, at the address of the
    // offset displaced by this amount
    Indirect(i64),
}
1897
/// An offset whose value has to be read from the stream.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
struct IndOffset {
    // where to find the offset
    off_addr: DirOffset,
    // signed or unsigned
    signed: bool,
    // type of the offset
    ty: OffsetType,
    // operation applied to the value read, only
    // honoured when `shift` is set as well
    op: Option<Op>,
    // right-hand operand of `op`
    shift: Option<Shift>,
}
1909
1910impl IndOffset {
1911    // if we overflow we must not return an offset
1912    fn read_offset<D: DataRead>(
1913        &self,
1914        haystack: &mut D,
1915        rule_base_offset: Option<u64>,
1916        last_upper_match_offset: Option<u64>,
1917    ) -> Result<Option<u64>, io::Error> {
1918        let offset_address = match self.off_addr {
1919            DirOffset::Start(s) => {
1920                let Some(o) = s.checked_add(rule_base_offset.unwrap_or_default()) else {
1921                    return Ok(None);
1922                };
1923
1924                haystack.seek(SeekFrom::Start(o))?
1925            }
1926            DirOffset::LastUpper(c) => haystack.seek(SeekFrom::Start(
1927                (last_upper_match_offset.unwrap_or_default() as i64 + c) as u64,
1928            ))?,
1929            DirOffset::End(e) => haystack.seek(SeekFrom::End(e))?,
1930        };
1931
1932        macro_rules! read_value {
1933            () => {
1934                match self.ty {
1935                    OffsetType::Byte => {
1936                        if self.signed {
1937                            read_le!(haystack, u8) as u64
1938                        } else {
1939                            read_le!(haystack, i8) as u64
1940                        }
1941                    }
1942                    OffsetType::DoubleLe => read_le!(haystack, f64) as u64,
1943                    OffsetType::DoubleBe => read_be!(haystack, f64) as u64,
1944                    OffsetType::ShortLe => {
1945                        if self.signed {
1946                            read_le!(haystack, i16) as u64
1947                        } else {
1948                            read_le!(haystack, u16) as u64
1949                        }
1950                    }
1951                    OffsetType::ShortBe => {
1952                        if self.signed {
1953                            read_be!(haystack, i16) as u64
1954                        } else {
1955                            read_be!(haystack, u16) as u64
1956                        }
1957                    }
1958                    OffsetType::Id3Le => decode_id3(read_le!(haystack, u32)) as u64,
1959                    OffsetType::Id3Be => decode_id3(read_be!(haystack, u32)) as u64,
1960                    OffsetType::LongLe => {
1961                        if self.signed {
1962                            read_le!(haystack, i32) as u64
1963                        } else {
1964                            read_le!(haystack, u32) as u64
1965                        }
1966                    }
1967                    OffsetType::LongBe => {
1968                        if self.signed {
1969                            read_be!(haystack, i32) as u64
1970                        } else {
1971                            read_be!(haystack, u32) as u64
1972                        }
1973                    }
1974                    OffsetType::Middle => read_me!(haystack) as u64,
1975                    OffsetType::Octal => {
1976                        if let Some(o) = read_octal_u64(haystack) {
1977                            o
1978                        } else {
1979                            debug!("failed to read octal offset @ {offset_address}");
1980                            return Ok(None);
1981                        }
1982                    }
1983                    OffsetType::QuadLe => {
1984                        if self.signed {
1985                            read_le!(haystack, i64) as u64
1986                        } else {
1987                            read_le!(haystack, u64)
1988                        }
1989                    }
1990                    OffsetType::QuadBe => {
1991                        if self.signed {
1992                            read_be!(haystack, i64) as u64
1993                        } else {
1994                            read_be!(haystack, u64)
1995                        }
1996                    }
1997                }
1998            };
1999        }
2000
2001        // in theory every offset read should end up in something seekable from start, so we can use u64 to store the result
2002        let o = read_value!();
2003
2004        trace!(
2005            "offset read @ {offset_address} value={o} op={:?} shift={:?}",
2006            self.op, self.shift
2007        );
2008
2009        // apply transformation
2010        if let (Some(op), Some(shift)) = (self.op, self.shift) {
2011            let shift = match shift {
2012                Shift::Direct(i) => i,
2013                Shift::Indirect(i) => {
2014                    let tmp = offset_address as i128 + i as i128;
2015                    if tmp.is_negative() {
2016                        return Ok(None);
2017                    } else {
2018                        haystack.seek(SeekFrom::Start(tmp as u64))?;
2019                    };
2020                    // NOTE: here we assume that the shift has the same
2021                    // type as the main offset !
2022                    read_value!()
2023                }
2024            };
2025
2026            match op {
2027                Op::Add => return Ok(o.checked_add(shift)),
2028                Op::Mul => return Ok(o.checked_mul(shift)),
2029                Op::Sub => return Ok(o.checked_sub(shift)),
2030                Op::Div => return Ok(o.checked_div(shift)),
2031                Op::Mod => return Ok(o.checked_rem(shift)),
2032                Op::And => return Ok(Some(o & shift)),
2033                Op::Or => return Ok(Some(o | shift)),
2034                Op::Xor => return Ok(Some(o ^ shift)),
2035            }
2036        }
2037
2038        Ok(Some(o))
2039    }
2040}
2041
/// An offset whose value is known without reading the stream.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum DirOffset {
    // counted from the start (of the rule base when there is one)
    Start(u64),
    // relative to the last up-level field
    LastUpper(i64),
    // counted from the end, handed as is to `SeekFrom::End`
    End(i64),
}
2049
/// Offset at which a test must be run.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Offset {
    // the offset is known as is
    Direct(DirOffset),
    // the offset must be read from the stream
    Indirect(IndOffset),
}
2055
2056impl Offset {
2057    #[inline(always)]
2058    fn is_indirect(&self) -> bool {
2059        matches!(self, Self::Indirect(_))
2060    }
2061}
2062
2063impl From<DirOffset> for Offset {
2064    fn from(value: DirOffset) -> Self {
2065        Self::Direct(value)
2066    }
2067}
2068
2069impl From<IndOffset> for Offset {
2070    fn from(value: IndOffset) -> Self {
2071        Self::Indirect(value)
2072    }
2073}
2074
2075impl Display for DirOffset {
2076    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2077        match self {
2078            DirOffset::Start(i) => write!(f, "{i}"),
2079            DirOffset::LastUpper(c) => write!(f, "&{c}"),
2080            DirOffset::End(e) => write!(f, "-{e}"),
2081        }
2082    }
2083}
2084
2085impl Default for DirOffset {
2086    fn default() -> Self {
2087        Self::LastUpper(0)
2088    }
2089}
2090
/// A single test of a magic rule, with the offset it
/// applies at and the message attached to it.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Match {
    // line of the entry, reported in logs and localized errors
    line: usize,
    // depth of the entry, used as its continuation level
    depth: u8,
    // where the test must be run
    offset: Offset,
    // the test to run
    test: Test,
    // strength of the test (value of `Test::strength`)
    test_strength: u64,
    // optional message attached to the test
    message: Option<Message>,
}
2100
2101impl From<Use> for Match {
2102    fn from(value: Use) -> Self {
2103        let test = Test::Use(value.switch_endianness, value.rule_name);
2104        let test_strength = test.strength();
2105        Self {
2106            line: value.line,
2107            depth: value.depth,
2108            offset: value.start_offset,
2109            test,
2110            test_strength,
2111            message: value.message,
2112        }
2113    }
2114}
2115
2116impl From<Name> for Match {
2117    fn from(value: Name) -> Self {
2118        let test = Test::Name(value.name);
2119        let test_strength = test.strength();
2120        Self {
2121            line: value.line,
2122            depth: 0,
2123            offset: Offset::Direct(DirOffset::Start(0)),
2124            test,
2125            test_strength,
2126            message: value.message,
2127        }
2128    }
2129}
2130
2131impl Match {
2132    /// Turns the `Match`'s offset into an absolute offset from the start of the stream
2133    #[inline(always)]
2134    fn offset_from_start<D: DataRead>(
2135        &self,
2136        haystack: &mut D,
2137        rule_base_offset: Option<u64>,
2138        last_level_offset: Option<u64>,
2139    ) -> Result<Option<u64>, io::Error> {
2140        match self.offset {
2141            Offset::Direct(dir_offset) => match dir_offset {
2142                DirOffset::Start(s) => Ok(Some(s)),
2143                DirOffset::LastUpper(shift) => {
2144                    let o = last_level_offset.unwrap_or_default() as i64 + shift;
2145
2146                    if o >= 0 { Ok(Some(o as u64)) } else { Ok(None) }
2147                }
2148                DirOffset::End(e) => Ok(Some(haystack.offset_from_start(SeekFrom::End(e)))),
2149            },
2150            Offset::Indirect(ind_offset) => {
2151                let Some(o) =
2152                    ind_offset.read_offset(haystack, rule_base_offset, last_level_offset)?
2153                else {
2154                    return Ok(None);
2155                };
2156
2157                Ok(Some(o))
2158            }
2159        }
2160    }
2161
2162    /// this method emulates the buffer based matching
2163    /// logic implemented in libmagic. It needs some aweful
2164    /// and weird offset convertions to turn buffer
2165    /// relative offsets (libmagic is based on) into
2166    /// absolute offset in the file.
2167    ///
2168    /// this method shoud bubble up only critical errors
2169    /// all the other errors should make the match result
2170    /// false and be logged via debug!
2171    ///
2172    /// the function returns an error if the maximum recursion
2173    /// has been reached or if a dependency rule is missing.
2174    #[inline]
2175    #[allow(clippy::too_many_arguments)]
2176    fn matches<'a: 'h, 'h, D: DataRead>(
2177        &'a self,
2178        source: Option<&str>,
2179        magic: &mut Magic<'a>,
2180        stream_kind: StreamKind,
2181        state: &mut MatchState,
2182        buf_base_offset: Option<u64>,
2183        rule_base_offset: Option<u64>,
2184        last_level_offset: Option<u64>,
2185        haystack: &'h mut D,
2186        switch_endianness: bool,
2187        db: &'a MagicDb,
2188        depth: usize,
2189    ) -> Result<(bool, Option<MatchRes<'h>>), Error> {
2190        let source = source.unwrap_or("unknown");
2191        let line = self.line;
2192
2193        if depth >= MAX_RECURSION {
2194            return Err(Error::localized(
2195                source,
2196                line,
2197                Error::MaximumRecursion(MAX_RECURSION),
2198            ));
2199        }
2200
2201        if self.test.is_only_binary() && stream_kind.is_text() {
2202            trace!("skip binary test source={source} line={line} stream_kind={stream_kind:?}",);
2203            return Ok((false, None));
2204        }
2205
2206        if self.test.is_only_text() && !stream_kind.is_text() {
2207            trace!("skip text test source={source} line={line} stream_kind={stream_kind:?}",);
2208            return Ok((false, None));
2209        }
2210
2211        let Ok(Some(mut offset)) = self
2212            .offset_from_start(haystack, rule_base_offset, last_level_offset)
2213            .inspect_err(|e| debug!("source={source} line={line} failed at computing offset: {e}"))
2214        else {
2215            return Ok((false, None));
2216        };
2217
2218        offset = match self.offset {
2219            Offset::Indirect(_) => {
2220                // the result we get for an indirect offset
2221                // is relative to the start of the libmagic
2222                // buffer so we need to add base to make it
2223                // absolute.
2224                buf_base_offset.unwrap_or_default().saturating_add(offset)
2225            }
2226            // offset from start are computed from rule base
2227            Offset::Direct(DirOffset::Start(_)) => {
2228                rule_base_offset.unwrap_or_default().saturating_add(offset)
2229            }
2230            _ => offset,
2231        };
2232
2233        match &self.test {
2234            Test::Clear => {
2235                trace!("source={source} line={line} clear");
2236                state.clear_continuation_level(&self.continuation_level());
2237                Ok((true, None))
2238            }
2239
2240            Test::Name(name) => {
2241                trace!(
2242                    "source={source} line={line} running rule {name} switch_endianness={switch_endianness}",
2243                );
2244                Ok((true, None))
2245            }
2246
2247            Test::Use(flip_endianness, rule_name) => {
2248                trace!(
2249                    "source={source} line={line} use {rule_name} switch_endianness={flip_endianness}",
2250                );
2251
2252                // switch_endianness must propagate down the rule call stack
2253                let switch_endianness = switch_endianness ^ flip_endianness;
2254
2255                let dr: &DependencyRule = db.dependencies.get(rule_name).ok_or(
2256                    Error::localized(source, line, Error::MissingRule(rule_name.clone())),
2257                )?;
2258
2259                // we push the message here otherwise we push message in depth first
2260                if let Some(msg) = self.message.as_ref() {
2261                    magic.push_message(msg.to_string_lossy());
2262                }
2263
2264                let new_buf_base_off = if self.offset.is_indirect() {
2265                    Some(offset)
2266                } else {
2267                    None
2268                };
2269
2270                let nmatch = dr.rule.magic(
2271                    magic,
2272                    stream_kind,
2273                    new_buf_base_off,
2274                    Some(offset),
2275                    haystack,
2276                    db,
2277                    switch_endianness,
2278                    depth.saturating_add(1),
2279                )?;
2280
2281                // The name is always true, so we consider there to be a match
2282                // if more than one test succeeded
2283                let matched = nmatch > 0;
2284                if matched {
2285                    state.set_continuation_level(self.continuation_level());
2286                }
2287
2288                Ok((matched, None))
2289            }
2290
2291            Test::Indirect(m) => {
2292                trace!(
2293                    "source={source} line={line} indirect mods={:?} offset={offset:#x}",
2294                    m
2295                );
2296
2297                let new_buf_base_off = if m.contains(IndirectMod::Relative) {
2298                    Some(offset)
2299                } else {
2300                    None
2301                };
2302
2303                // we push the message here otherwise we push message in depth first
2304                if let Some(msg) = self.message.as_ref() {
2305                    magic.push_message(msg.to_string_lossy());
2306                }
2307
2308                let mut nmatch = 0u64;
2309                for r in db.rules.iter() {
2310                    nmatch = nmatch.saturating_add(r.magic(
2311                        magic,
2312                        stream_kind,
2313                        new_buf_base_off,
2314                        Some(offset),
2315                        haystack,
2316                        db,
2317                        false,
2318                        depth.saturating_add(1),
2319                    )?);
2320
2321                    if nmatch > 0 {
2322                        break;
2323                    }
2324                }
2325
2326                Ok((nmatch > 0, None))
2327            }
2328
2329            Test::Default => {
2330                // default matches if nothing else at the continuation level matched
2331                let ok = !state.get_continuation_level(&self.continuation_level());
2332
2333                trace!("source={source} line={line} default match={ok}");
2334                if ok {
2335                    state.set_continuation_level(self.continuation_level());
2336                }
2337
2338                Ok((ok, None))
2339            }
2340
2341            _ => {
2342                if let Err(e) = haystack.seek(SeekFrom::Start(offset)) {
2343                    debug!("source={source} line={line} failed to seek in haystack: {e}");
2344                    return Ok((false, None));
2345                }
2346
2347                let mut trace_msg = None;
2348
2349                if enabled!(Level::DEBUG) {
2350                    trace_msg = Some(vec![format!(
2351                        "source={source} line={line} depth={} stream_offset={:#x}",
2352                        self.depth,
2353                        haystack.stream_position()
2354                    )])
2355                }
2356
2357                // NOTE: we may have a way to optimize here. In case we do a Any
2358                // test and we don't use the value to format the message, we don't
2359                // need to read the value.
2360                if let Ok(opt_test_value) = self
2361                    .test
2362                    .read_test_value(haystack, switch_endianness)
2363                    .inspect_err(|e| {
2364                        debug!("source={source} line={line} error while reading test value @{offset}: {e}",)
2365                    })
2366                {
2367                    if let Some(v) = trace_msg
2368                        .as_mut() { v.push(format!("test={}", self.test)) }
2369
2370                    if let Some(v) = trace_msg.as_mut(){
2371                        let drv = match opt_test_value.as_ref(){
2372                            Some(r) => format!("{r:?}"),
2373                            None =>String::new(),
2374                        };
2375                        v.push(format!("read_in_stream={drv}"))
2376                    }
2377
2378                    let match_res =
2379                        opt_test_value.and_then(|tv| self.test.match_value(&tv, stream_kind));
2380
2381                    if let Some(v) = trace_msg.as_mut() { v.push(format!(
2382                            "message=\"{}\" match={}",
2383                            self.message
2384                                .as_ref()
2385                                .map(|fs| fs.to_string_lossy())
2386                                .unwrap_or_default(),
2387                            match_res.is_some()
2388                        )) }
2389
2390                    // trace message
2391                    if enabled!(Level::DEBUG) && !enabled!(Level::TRACE) && match_res.is_some() {
2392                        if let Some(m) = trace_msg{
2393                            debug!("{}", m.join(" "));
2394                        }
2395                    } else if enabled!(Level::TRACE)
2396                        && let Some(m) = trace_msg{
2397                            trace!("{}", m.join(" "));
2398                        }
2399
2400                    if let Some(mr) = match_res {
2401                        state.set_continuation_level(self.continuation_level());
2402                        return Ok((true, Some(mr)));
2403                    }
2404                }
2405
2406                Ok((false, None))
2407            }
2408        }
2409    }
2410
2411    #[inline(always)]
2412    fn continuation_level(&self) -> ContinuationLevel {
2413        ContinuationLevel(self.depth)
2414    }
2415}
2416
/// A parsed `use` entry, turned into a `Match` running a `Test::Use`.
#[derive(Debug, Clone)]
struct Use {
    // line of the entry
    line: usize,
    // depth of the entry
    depth: u8,
    // offset the rule used is run at
    start_offset: Offset,
    // name of the rule to use
    rule_name: String,
    // whether endianness must be flipped when running the rule
    switch_endianness: bool,
    // optional message attached to the entry
    message: Option<Message>,
}
2426
/// Modifier applied to the strength of an entry.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StrengthMod {
    // operation applied to the strength
    op: Op,
    // right-hand operand of the operation
    by: u8,
}
2432
2433impl StrengthMod {
2434    #[inline(always)]
2435    fn apply(&self, strength: u64) -> u64 {
2436        let by = self.by as u64;
2437        debug!("applying strength modifier: {strength} {} {}", self.op, by);
2438        match self.op {
2439            Op::Mul => strength.saturating_mul(by),
2440            Op::Add => strength.saturating_add(by),
2441            Op::Sub => strength.saturating_sub(by),
2442            Op::Div => {
2443                if by > 0 {
2444                    strength.saturating_div(by)
2445                } else {
2446                    strength
2447                }
2448            }
2449            Op::Mod => strength % by,
2450            Op::And => strength & by,
2451            // this should never happen as strength operators
2452            // are enforced by our parser
2453            Op::Xor | Op::Or => {
2454                debug_panic!("unsupported strength operator");
2455                strength
2456            }
2457        }
2458    }
2459}
2460
/// Metadata which can be attached to an entry.
// NOTE(review): each variant presumably ends up in the matching field of
// `EntryNode` (`mimetype`, `exts`, `strength_mod`, `apple`); confirm
// against the code building the nodes.
#[derive(Debug, Clone)]
enum Flag {
    // a MIME type
    Mime(String),
    // a set of file extensions
    Ext(HashSet<String>),
    // a strength modifier
    Strength(StrengthMod),
    // an Apple type string
    Apple(String),
}
2468
/// A parsed `name` entry, turned into a `Match` running a `Test::Name`.
#[derive(Debug, Clone)]
struct Name {
    // line of the entry
    line: usize,
    // name of the rule
    name: String,
    // optional message attached to the entry
    message: Option<Message>,
}
2475
/// An entry of a magic rule, either a test or a flag, coming with a `Span`.
// NOTE(review): the span is presumably the location of the entry in
// the source being parsed; confirm against the parser.
#[derive(Debug, Clone)]
enum Entry<'span> {
    // a test to run
    Match(Span<'span>, Match),
    // metadata attached to a test
    Flag(Span<'span>, Flag),
}
2481
/// Node of the tree of tests making a rule: a test, its
/// metadata and the tests to run when it matches.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EntryNode {
    // whether the node is the root of a rule
    root: bool,
    // the test of the node
    entry: Match,
    // tests run only if `entry` matched
    children: Vec<EntryNode>,
    // MIME type set on the magic when `entry` matches
    mimetype: Option<String>,
    // creator code set on the magic when `entry` matches
    apple: Option<String>,
    // modifier applied to the strength of `entry`
    strength_mod: Option<StrengthMod>,
    // extensions added to the magic when `entry` matches
    exts: HashSet<String>,
}
2492
/// Accumulates the extensions and the score found while visiting the
/// entries of a rule.
#[derive(Debug, Default)]
struct EntryNodeVisitor {
    // every extension met so far
    exts: HashSet<String>,
    // sum of the scores met so far
    score: u64,
}

impl EntryNodeVisitor {
    /// Creates an empty visitor: no extension and a score of zero.
    fn new() -> Self {
        Self::default()
    }

    /// Absorbs `other`: extensions are unioned and scores are summed.
    fn merge(&mut self, other: Self) {
        let Self { exts, score } = other;

        self.exts.extend(exts);
        self.score += score;
    }
}
2511
2512impl EntryNode {
2513    #[inline]
2514    fn update_visitor(&self, v: &mut EntryNodeVisitor, depth: usize) {
2515        // update extensions
2516        for ext in self.exts.iter() {
2517            if !v.exts.contains(ext) {
2518                v.exts.insert(ext.clone());
2519            }
2520        }
2521
2522        // update score if depth
2523        if depth == 0 {
2524            v.score += self.entry.test_strength;
2525        }
2526
2527        // Tests at deeper levels contribute less to the overall score.
2528        // We use the minimum value to establish a lower bound for the rule's score,
2529        // which helps prioritize rules based on their importance.
2530        v.score += self
2531            .children
2532            .iter()
2533            .map(|e| e.entry.test_strength)
2534            .min()
2535            .unwrap_or_default()
2536            / max(1, depth as u64);
2537    }
2538
2539    fn visit(
2540        &self,
2541        v: &mut EntryNodeVisitor,
2542        deps: &HashMap<String, DependencyRule>,
2543        marked: &mut HashSet<String>,
2544        depth: usize,
2545    ) -> Result<(), Error> {
2546        // updating visitor
2547        self.update_visitor(v, depth);
2548
2549        // recursively visiting
2550        for c in self.children.iter() {
2551            if let Test::Use(_, ref name) = c.entry.test {
2552                if marked.contains(name) {
2553                    continue;
2554                }
2555
2556                marked.insert(name.clone());
2557
2558                if let Some(r) = deps.get(name) {
2559                    let dv = r.rule.visit_all_entries(deps, marked)?;
2560                    v.merge(dv);
2561                } else {
2562                    return Err(Error::MissingRule(name.clone()));
2563                }
2564            } else {
2565                c.visit(v, deps, marked, depth + 1)?;
2566            }
2567        }
2568
2569        Ok(())
2570    }
2571
2572    /// Executes the magic matching logic recursively and returns the count of matches that produce messages.
2573    /// Matches that don't result in message appends are not counted, consistent with libmagic's behavior.
2574    #[inline]
2575    #[allow(clippy::too_many_arguments)]
2576    fn matches<'r, D: DataRead>(
2577        &'r self,
2578        opt_source: Option<&str>,
2579        magic: &mut Magic<'r>,
2580        state: &mut MatchState,
2581        stream_kind: StreamKind,
2582        buf_base_offset: Option<u64>,
2583        rule_base_offset: Option<u64>,
2584        last_level_offset: Option<u64>,
2585        haystack: &mut D,
2586        db: &'r MagicDb,
2587        switch_endianness: bool,
2588        depth: usize,
2589    ) -> Result<u64, Error> {
2590        let mut nmatch = 0u64;
2591
2592        let (ok, opt_match_res) = self.entry.matches(
2593            opt_source,
2594            magic,
2595            stream_kind,
2596            state,
2597            buf_base_offset,
2598            rule_base_offset,
2599            last_level_offset,
2600            haystack,
2601            switch_endianness,
2602            db,
2603            depth,
2604        )?;
2605
2606        let source = opt_source.unwrap_or("unknown");
2607        let line = self.entry.line;
2608
2609        if ok {
2610            // Update the magic with the message if the match is successful
2611            // Skip updating if the test is recursive, as it's already handled
2612            // in the Match::matches function
2613            if !self.entry.test.is_recursive()
2614                && let Some(msg) = self.entry.message.as_ref()
2615                && let Ok(msg) = msg.format_with(opt_match_res.as_ref()).inspect_err(|e| {
2616                    debug!("source={source} line={line} failed to format message: {e}")
2617                })
2618            {
2619                nmatch = nmatch.saturating_add(1);
2620                magic.push_message(msg);
2621            }
2622
2623            // we need to adjust stream offset in case of regex/search tests
2624            if let Some(mr) = opt_match_res {
2625                match &self.entry.test {
2626                    Test::String(t) if t.has_length_mod() => {
2627                        let o = mr.end_offset();
2628                        haystack.seek(SeekFrom::Start(o))?;
2629                    }
2630                    Test::Search(t) => {
2631                        if t.re_mods.contains(ReMod::StartOffsetUpdate) {
2632                            let o = mr.start_offset();
2633                            haystack.seek(SeekFrom::Start(o))?;
2634                        } else {
2635                            let o = mr.end_offset();
2636                            haystack.seek(SeekFrom::Start(o))?;
2637                        }
2638                    }
2639
2640                    Test::Regex(t) => {
2641                        if t.mods.contains(ReMod::StartOffsetUpdate) {
2642                            let o = mr.start_offset();
2643                            haystack.seek(SeekFrom::Start(o))?;
2644                        } else {
2645                            let o = mr.end_offset();
2646                            haystack.seek(SeekFrom::Start(o))?;
2647                        }
2648                    }
2649                    // other types do not need offset adjustement
2650                    _ => {}
2651                }
2652            }
2653
2654            if let Some(mimetype) = self.mimetype.as_ref() {
2655                magic.set_mime_type(Cow::Borrowed(mimetype));
2656            }
2657
2658            if let Some(apple_ty) = self.apple.as_ref() {
2659                magic.set_creator_code(Cow::Borrowed(apple_ty));
2660            }
2661
2662            if !self.exts.is_empty() {
2663                magic.insert_extensions(self.exts.iter().map(|s| s.as_str()));
2664            }
2665
2666            // NOTE: here we try to implement a similar logic as in file_magic_strength.
2667            // Sticking to the exact same strength computation logic is complicated due
2668            // to implementation differences. Let's wait and see if that is a real issue.
2669            let mut strength = self.entry.test_strength;
2670
2671            let continuation_level = self.entry.continuation_level().0 as u64;
2672            if self.entry.message.is_none() && continuation_level < 3 {
2673                strength = strength.saturating_add(continuation_level);
2674            }
2675
2676            if let Some(sm) = self.strength_mod.as_ref() {
2677                strength = sm.apply(strength);
2678            }
2679
2680            // entries with no message get a bonus
2681            if self.entry.message.is_none() {
2682                strength += 1
2683            }
2684
2685            magic.update_strength(strength);
2686
2687            let end_upper_level = haystack.stream_position();
2688
2689            // we have to fix rule_base_offset if
2690            // the rule_base_starts from end otherwise it
2691            // breaks some offset computation in match
2692            // see test_offset_bug_1 and test_offset_bug_2
2693            // they implement the same test logic yet indirect
2694            // offsets have to be different so that it works
2695            // in libmagic/file
2696            let rule_base_offset = if self.root {
2697                match self.entry.offset {
2698                    Offset::Direct(DirOffset::End(o)) => {
2699                        Some(haystack.offset_from_start(SeekFrom::End(o)))
2700                    }
2701                    _ => rule_base_offset,
2702                }
2703            } else {
2704                rule_base_offset
2705            };
2706
2707            for e in self.children.iter() {
2708                nmatch = nmatch.saturating_add(e.matches(
2709                    opt_source,
2710                    magic,
2711                    state,
2712                    stream_kind,
2713                    buf_base_offset,
2714                    rule_base_offset,
2715                    Some(end_upper_level),
2716                    haystack,
2717                    db,
2718                    switch_endianness,
2719                    depth,
2720                )?);
2721            }
2722        }
2723
2724        Ok(nmatch)
2725    }
2726}
2727
/// Represents a parsed magic rule
///
/// A rule owns the root of its entry tree together with the data derived
/// from that tree (associated extensions and ranking score).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicRule {
    /// identifier of the rule, assigned through `set_id`
    id: usize,
    /// filename of the rule, if any (exposed by `source()`)
    source: Option<String>,
    /// root node of the rule's entry tree
    entries: EntryNode,
    /// file extensions associated to the rule, extended by `try_finalize`
    extensions: HashSet<String>,
    /// score used for rule ranking
    score: u64,
    /// set to `true` once `try_finalize` has filled `extensions` and `score`
    finalized: bool,
}
2739
2740impl MagicRule {
2741    #[inline(always)]
2742    fn set_id(&mut self, id: usize) {
2743        self.id = id
2744    }
2745
2746    fn visit_all_entries(
2747        &self,
2748        deps: &HashMap<String, DependencyRule>,
2749        marked: &mut HashSet<String>,
2750    ) -> Result<EntryNodeVisitor, Error> {
2751        let mut v = EntryNodeVisitor::new();
2752        self.entries.visit(&mut v, deps, marked, 0)?;
2753        Ok(v)
2754    }
2755
2756    /// Finalize a rule by searching for all extensions and computing its score
2757    /// for ranking. In the `MagicRule` is already finalized it returns immediately.
2758    fn try_finalize(&mut self, deps: &HashMap<String, DependencyRule>) -> Result<(), Error> {
2759        if self.finalized {
2760            return Ok(());
2761        }
2762
2763        // rule can be finalized all deps are found
2764        let v = self.visit_all_entries(deps, &mut HashSet::new())?;
2765
2766        self.extensions.extend(v.exts);
2767        self.score = v.score;
2768        self.finalized = true;
2769
2770        Ok(())
2771    }
2772
2773    #[inline]
2774    fn magic_entrypoint<'r, D: DataRead>(
2775        &'r self,
2776        magic: &mut Magic<'r>,
2777        stream_kind: StreamKind,
2778        haystack: &mut D,
2779        db: &'r MagicDb,
2780        switch_endianness: bool,
2781        depth: usize,
2782    ) -> Result<u64, Error> {
2783        self.entries.matches(
2784            self.source.as_deref(),
2785            magic,
2786            &mut MatchState::empty(),
2787            stream_kind,
2788            None,
2789            None,
2790            None,
2791            haystack,
2792            db,
2793            switch_endianness,
2794            depth,
2795        )
2796    }
2797
2798    /// Executes the magic matching logic and returns the count of matches that produce messages.
2799    /// Matches that don't result in message appends are not counted, consistent with libmagic's behavior.
2800    #[inline]
2801    #[allow(clippy::too_many_arguments)]
2802    fn magic<'r, D: DataRead>(
2803        &'r self,
2804        magic: &mut Magic<'r>,
2805        stream_kind: StreamKind,
2806        buf_base_offset: Option<u64>,
2807        rule_base_offset: Option<u64>,
2808        haystack: &mut D,
2809        db: &'r MagicDb,
2810        switch_endianness: bool,
2811        depth: usize,
2812    ) -> Result<u64, Error> {
2813        self.entries.matches(
2814            self.source.as_deref(),
2815            magic,
2816            &mut MatchState::empty(),
2817            stream_kind,
2818            buf_base_offset,
2819            rule_base_offset,
2820            None,
2821            haystack,
2822            db,
2823            switch_endianness,
2824            depth,
2825        )
2826    }
2827
2828    /// Checks if the rule is for matching against text content
2829    ///
2830    /// # Returns
2831    ///
2832    /// * `bool` - True if the rule is for text files
2833    pub fn is_text(&self) -> bool {
2834        self.entries.entry.test.is_text()
2835            && self.entries.children.iter().all(|e| e.entry.test.is_text())
2836    }
2837
2838    /// Gets the rule's score used for ranking rules between them
2839    ///
2840    /// # Returns
2841    ///
2842    /// * `u64` - The rule's score
2843    #[inline(always)]
2844    pub fn score(&self) -> u64 {
2845        self.score
2846    }
2847
2848    /// Gets the rule's filename if any
2849    ///
2850    /// # Returns
2851    ///
2852    /// * `Option<&str>` - The rule's source if available
2853    #[inline(always)]
2854    pub fn source(&self) -> Option<&str> {
2855        self.source.as_deref()
2856    }
2857
2858    /// Gets the line number at which the rule is defined
2859    ///
2860    /// # Returns
2861    ///
2862    /// * `usize` - The rule's line number
2863    #[inline(always)]
2864    pub fn line(&self) -> usize {
2865        self.entries.entry.line
2866    }
2867
2868    /// Gets all the file extensions associated to the rule
2869    ///
2870    /// # Returns
2871    ///
2872    /// * `&HashSet<String>` - The set of all associated extensions
2873    #[inline(always)]
2874    pub fn extensions(&self) -> &HashSet<String> {
2875        &self.extensions
2876    }
2877}
2878
/// A [`MagicRule`] stored under a name in the `dependencies` maps of
/// [`MagicSource`] and [`MagicDb`].
// NOTE(review): presumably a rule other rules can refer to by `name`
// — confirm against the parser.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct DependencyRule {
    /// name of the dependency
    name: String,
    /// rule attached to that name
    rule: MagicRule,
}
2884
/// A parsed source of magic rules
///
/// It holds the parsed rules and the named dependency rules, both of
/// which are moved into a [`MagicDb`] by [`MagicDb::load`].
///
/// # Methods
///
/// * `open` - Opens a magic file from a path
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicSource {
    // rules handed over to the database when the source is loaded
    rules: Vec<MagicRule>,
    // named rules merged into the database dependency map when loaded
    dependencies: HashMap<String, DependencyRule>,
}

impl MagicSource {
    /// Opens and parses a magic file from a path
    ///
    /// The work is entirely delegated to `FileMagicParser::parse_file`.
    ///
    /// # Arguments
    ///
    /// * `p` - The path to the magic file
    ///
    /// # Returns
    ///
    /// * `Result<Self, Error>` - The parsed magic file or an error
    pub fn open<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
        FileMagicParser::parse_file(p)
    }
}
2910
/// Newtype wrapping the `u8` continuation level of an entry.
// NOTE(review): presumably the nesting depth of an entry inside its rule
// (number of leading `>` in a magic file) — confirm against the parser.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct ContinuationLevel(u8);
2913
// FIXME: magic handles many more text encodings
/// Text encodings distinguished when a stream is classified as text.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum TextEncoding {
    Ascii,
    Utf8,
    Unknown,
}

impl TextEncoding {
    /// Returns the encoding name as it is written in magic messages.
    const fn as_magic_str(&self) -> &'static str {
        match *self {
            Self::Ascii => "ASCII",
            Self::Utf8 => "UTF-8",
            Self::Unknown => "Unknown",
        }
    }
}
2931
2932#[derive(Debug, PartialEq, Eq, Clone, Copy)]
2933enum StreamKind {
2934    Binary,
2935    Text(TextEncoding),
2936}
2937
2938impl StreamKind {
2939    const fn is_text(&self) -> bool {
2940        matches!(self, StreamKind::Text(_))
2941    }
2942}
2943
2944#[derive(Debug)]
2945struct MatchState {
2946    continuation_levels: [bool; 256],
2947}
2948
2949impl MatchState {
2950    #[inline(always)]
2951    fn empty() -> Self {
2952        MatchState {
2953            continuation_levels: [false; 256],
2954        }
2955    }
2956
2957    #[inline(always)]
2958    fn get_continuation_level(&mut self, level: &ContinuationLevel) -> bool {
2959        self.continuation_levels
2960            .get(level.0 as usize)
2961            .cloned()
2962            .unwrap_or_default()
2963    }
2964
2965    #[inline(always)]
2966    fn set_continuation_level(&mut self, level: ContinuationLevel) {
2967        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2968            *b = true
2969        }
2970    }
2971
2972    #[inline(always)]
2973    fn clear_continuation_level(&mut self, level: &ContinuationLevel) {
2974        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2975            *b = false;
2976        }
2977    }
2978}
2979
/// Represents a file magic detection result
///
/// String data is stored as [`Cow`] so it can either borrow from the
/// rules for `'m` or be owned (see [`Magic::into_owned`]).
#[derive(Debug, Default)]
pub struct Magic<'m> {
    /// kind of stream the result applies to, used by `mime_type()` to pick a default
    stream_kind: Option<StreamKind>,
    /// filename where the matching rule was defined, if any
    source: Option<Cow<'m, str>>,
    /// message parts, joined on demand by `message()`
    message: Vec<Cow<'m, str>>,
    /// MIME type, only the first one set is kept
    mime_type: Option<Cow<'m, str>>,
    /// Apple creator code, only the first one set is kept
    creator_code: Option<Cow<'m, str>>,
    /// accumulated strength (saturating sum of the updates)
    strength: u64,
    /// possible file extensions, only filled while the set is empty
    exts: HashSet<Cow<'m, str>>,
    /// true when the result was built by the default fallback
    is_default: bool,
}
2992
2993impl<'m> Magic<'m> {
2994    #[inline(always)]
2995    fn set_source(&mut self, source: Option<&'m str>) {
2996        self.source = source.map(Cow::Borrowed);
2997    }
2998
2999    #[inline(always)]
3000    fn set_stream_kind(&mut self, stream_kind: StreamKind) {
3001        self.stream_kind = Some(stream_kind)
3002    }
3003
3004    #[inline(always)]
3005    fn reset(&mut self) {
3006        self.stream_kind = None;
3007        self.source = None;
3008        self.message.clear();
3009        self.mime_type = None;
3010        self.creator_code = None;
3011        self.strength = 0;
3012        self.exts.clear();
3013        self.is_default = false;
3014    }
3015
3016    /// Converts borrowed data into owned data. This method involves
3017    /// data cloning, so you must use this method only if you need to
3018    /// extend the lifetime of a [`Magic`] struct.
3019    ///
3020    /// # Returns
3021    ///
3022    /// * `Magic<'owned>` - A new [`Magic`] with owned data
3023    #[inline]
3024    pub fn into_owned<'owned>(self) -> Magic<'owned> {
3025        Magic {
3026            stream_kind: self.stream_kind,
3027            source: self.source.map(|s| Cow::Owned(s.into_owned())),
3028            message: self
3029                .message
3030                .into_iter()
3031                .map(Cow::into_owned)
3032                .map(Cow::Owned)
3033                .collect(),
3034            mime_type: self.mime_type.map(|m| Cow::Owned(m.into_owned())),
3035            creator_code: self.creator_code.map(|m| Cow::Owned(m.into_owned())),
3036            strength: self.strength,
3037            exts: self
3038                .exts
3039                .into_iter()
3040                .map(|e| Cow::Owned(e.into_owned()))
3041                .collect(),
3042            is_default: self.is_default,
3043        }
3044    }
3045
3046    /// Gets the formatted message describing the file type
3047    ///
3048    /// # Returns
3049    ///
3050    /// * `String` - The formatted message
3051    #[inline(always)]
3052    pub fn message(&self) -> String {
3053        let mut out = String::new();
3054        for (i, m) in self.message.iter().enumerate() {
3055            if let Some(s) = m.strip_prefix(r#"\b"#) {
3056                out.push_str(s);
3057            } else {
3058                // don't put space on first string
3059                if i > 0 {
3060                    out.push(' ');
3061                }
3062                out.push_str(m);
3063            }
3064        }
3065        out
3066    }
3067
3068    /// Returns an iterator over the individual parts of the magic message
3069    ///
3070    /// A magic message is typically composed of multiple parts, each appended
3071    /// during successful magic tests. This method provides an efficient way to
3072    /// iterate over these parts without concatenating them into a new string,
3073    /// as done when calling [`Magic::message`].
3074    ///
3075    /// # Returns
3076    ///
3077    /// * `impl Iterator<Item = &str>` - An iterator yielding string slices of each message part
3078    #[inline]
3079    pub fn message_parts(&self) -> impl Iterator<Item = &str> {
3080        self.message.iter().map(|p| p.as_ref())
3081    }
3082
3083    #[inline(always)]
3084    fn update_strength(&mut self, value: u64) {
3085        self.strength = self.strength.saturating_add(value);
3086        debug!("updated strength = {:?}", self.strength)
3087    }
3088
3089    /// Gets the detected MIME type
3090    ///
3091    /// # Returns
3092    ///
3093    /// * `&str` - The MIME type or default based on stream kind
3094    #[inline(always)]
3095    pub fn mime_type(&self) -> &str {
3096        self.mime_type.as_deref().unwrap_or(match self.stream_kind {
3097            Some(StreamKind::Text(_)) => DEFAULT_TEXT_MIMETYPE,
3098            Some(StreamKind::Binary) | None => DEFAULT_BIN_MIMETYPE,
3099        })
3100    }
3101
3102    #[inline(always)]
3103    fn push_message<'a: 'm>(&mut self, msg: Cow<'a, str>) {
3104        if !msg.is_empty() {
3105            debug!("pushing message: msg={msg} len={}", msg.len());
3106            self.message.push(msg);
3107        }
3108    }
3109
3110    #[inline(always)]
3111    fn set_mime_type<'a: 'm>(&mut self, mime: Cow<'a, str>) {
3112        if self.mime_type.is_none() {
3113            debug!("insert mime: {:?}", mime);
3114            self.mime_type = Some(mime)
3115        }
3116    }
3117
3118    #[inline(always)]
3119    fn set_creator_code<'a: 'm>(&mut self, apple_ty: Cow<'a, str>) {
3120        if self.creator_code.is_none() {
3121            debug!("insert apple type: {apple_ty:?}");
3122            self.creator_code = Some(apple_ty)
3123        }
3124    }
3125
3126    #[inline(always)]
3127    fn insert_extensions<'a: 'm, I: Iterator<Item = &'a str>>(&mut self, exts: I) {
3128        if self.exts.is_empty() {
3129            self.exts.extend(exts.filter_map(|e| {
3130                if e.is_empty() {
3131                    None
3132                } else {
3133                    Some(Cow::Borrowed(e))
3134                }
3135            }));
3136        }
3137    }
3138
3139    /// Gets the confidence score of the detection. This
3140    /// value is used to sort [`Magic`] in [`MagicDb::best_magic`]
3141    /// and [`MagicDb::all_magics`].
3142    ///
3143    /// # Returns
3144    ///
3145    /// * `u64` - The confidence score attributed to that [`Magic`]
3146    #[inline(always)]
3147    pub fn strength(&self) -> u64 {
3148        self.strength
3149    }
3150
3151    /// Gets the filename where the magic rule was defined
3152    ///
3153    /// # Returns
3154    ///
3155    /// * `Option<&str>` - The source if available
3156    #[inline(always)]
3157    pub fn source(&self) -> Option<&str> {
3158        self.source.as_deref()
3159    }
3160
3161    /// Gets the Apple creator code if available
3162    ///
3163    /// # Returns
3164    ///
3165    /// * `Option<&str>` - The creator code if available
3166    #[inline(always)]
3167    pub fn creator_code(&self) -> Option<&str> {
3168        self.creator_code.as_deref()
3169    }
3170
3171    /// Gets the possible file extensions for the detected [`Magic`]
3172    ///
3173    /// # Returns
3174    ///
3175    /// * `&HashSet<Cow<'m, str>>` - The set of possible extensions
3176    #[inline(always)]
3177    pub fn extensions(&self) -> &HashSet<Cow<'m, str>> {
3178        &self.exts
3179    }
3180
3181    /// Checks if this is a default fallback detection
3182    ///
3183    /// # Returns
3184    ///
3185    /// * `bool` - True if this is a default detection
3186    #[inline(always)]
3187    pub fn is_default(&self) -> bool {
3188        self.is_default
3189    }
3190}
3191
/// Represents a database of [`MagicRule`]
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct MagicDb {
    // identifier handed to the next loaded rule (see `next_rule_id`)
    rule_id: usize,
    // every rule loaded so far
    rules: Vec<MagicRule>,
    // named rules gathered from all the loaded sources
    dependencies: HashMap<String, DependencyRule>,
    // NOTE(review): presumably the number of rules already finalized,
    // maintained by `try_finalize` — confirm there.
    finalized: usize,
}
3200
/// Returns `true` if the byte stream is likely text.
///
/// The heuristic is the following:
/// * an empty input or an input holding a NUL byte is never text
/// * tab, line feed, carriage return and `0x20..=0x7E` count as printable
/// * anything else (other control bytes, `0x7F` and bytes above it) counts
///   as non-printable
///
/// The input is text when more than 85% of its bytes are printable and
/// less than 20% are non-printable.
#[inline(always)]
fn is_likely_text(bytes: &[u8]) -> bool {
    if bytes.is_empty() {
        return false;
    }

    let mut printable = 0usize;
    let mut non_printable = 0usize;

    for &byte in bytes {
        match byte {
            // a NUL byte is a strong binary marker, no need to go further
            0x00 => return false,
            // whitespace and printable ASCII
            0x09 | 0x0A | 0x0D | 0x20..=0x7E => printable += 1,
            _ => non_printable += 1,
        }
    }

    let total = bytes.len() as f64;
    let printable_ratio = printable as f64 / total;
    let non_printable_ratio = non_printable as f64 / total;

    // Heuristic thresholds (adjust as needed):
    printable_ratio > 0.85 && non_printable_ratio < 0.20
}
3243
3244#[inline(always)]
3245fn guess_stream_kind<S: AsRef<[u8]>>(stream: S) -> StreamKind {
3246    let buf = stream.as_ref();
3247
3248    match run_utf8_validation(buf) {
3249        Ok(is_ascii) => {
3250            if is_ascii {
3251                StreamKind::Text(TextEncoding::Ascii)
3252            } else {
3253                StreamKind::Text(TextEncoding::Utf8)
3254            }
3255        }
3256        Err(e) => {
3257            if is_likely_text(&buf[e.valid_up_to..]) {
3258                StreamKind::Text(TextEncoding::Unknown)
3259            } else {
3260                StreamKind::Binary
3261            }
3262        }
3263    }
3264}
3265
3266impl MagicDb {
3267    /// Creates a new empty database
3268    ///
3269    /// # Returns
3270    ///
3271    /// * [`MagicDb`] - A new empty database
3272    pub fn new() -> Self {
3273        Self::default()
3274    }
3275
3276    #[inline(always)]
3277    fn next_rule_id(&mut self) -> usize {
3278        let t = self.rule_id;
3279        self.rule_id += 1;
3280        t
3281    }
3282
3283    #[inline(always)]
3284    fn try_json<D: DataRead>(
3285        haystack: &mut D,
3286        stream_kind: StreamKind,
3287        magic: &mut Magic,
3288    ) -> Result<bool, Error> {
3289        // cannot be json if content is binary
3290        if matches!(stream_kind, StreamKind::Binary) {
3291            return Ok(false);
3292        }
3293
3294        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?.trim_ascii();
3295
3296        let Some((start, end)) = find_json_boundaries(buf) else {
3297            return Ok(false);
3298        };
3299
3300        // if anything else than whitespace before start
3301        // this is not json
3302        for c in buf[0..start].iter() {
3303            if !c.is_ascii_whitespace() {
3304                return Ok(false);
3305            }
3306        }
3307
3308        let mut is_ndjson = false;
3309
3310        trace!("maybe a json document");
3311        let ok = serde_json::from_slice::<serde_json::Value>(&buf[start..=end]).is_ok();
3312        if !ok {
3313            return Ok(false);
3314        }
3315
3316        // we are sure it is json now we must look if we are ndjson
3317        if end + 1 < buf.len() {
3318            // after first json
3319            let buf = &buf[end + 1..];
3320            if let Some((second_start, second_end)) = find_json_boundaries(buf) {
3321                // there is a new line between the two json docs
3322                if memchr(b'\n', &buf[..second_start]).is_some() {
3323                    trace!("might be ndjson");
3324                    is_ndjson = serde_json::from_slice::<serde_json::Value>(
3325                        &buf[second_start..=second_end],
3326                    )
3327                    .is_ok();
3328                }
3329            }
3330        }
3331
3332        if is_ndjson {
3333            magic.push_message(Cow::Borrowed("New Line Delimited"));
3334            magic.set_mime_type(Cow::Borrowed("application/x-ndjson"));
3335            magic.insert_extensions(["ndjson", "jsonl"].into_iter());
3336        } else {
3337            magic.set_mime_type(Cow::Borrowed("application/json"));
3338            magic.insert_extensions(["json"].into_iter());
3339        }
3340
3341        magic.push_message(Cow::Borrowed("JSON text data"));
3342        magic.set_source(Some(HARDCODED_SOURCE));
3343        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3344        Ok(true)
3345    }
3346
3347    #[inline(always)]
3348    fn try_csv<D: DataRead>(
3349        haystack: &mut D,
3350        stream_kind: StreamKind,
3351        magic: &mut Magic,
3352    ) -> Result<bool, Error> {
3353        // cannot be csv if content is binary
3354        let StreamKind::Text(enc) = stream_kind else {
3355            return Ok(false);
3356        };
3357
3358        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3359        let mut reader = csv::Reader::from_reader(io::Cursor::new(buf));
3360        let mut records = reader.records();
3361
3362        let Some(Ok(first)) = records.next() else {
3363            return Ok(false);
3364        };
3365
3366        // very not likely a CSV otherwise all programming
3367        // languages having ; line terminator would be
3368        // considered as CSV
3369        if first.len() <= 1 {
3370            return Ok(false);
3371        }
3372
3373        // we already parsed first line
3374        let mut n = 1;
3375        for i in records.take(9) {
3376            if let Ok(rec) = i {
3377                if first.len() != rec.len() {
3378                    return Ok(false);
3379                }
3380            } else {
3381                return Ok(false);
3382            }
3383            n += 1;
3384        }
3385
3386        // we need at least 10 lines
3387        if n != 10 {
3388            return Ok(false);
3389        }
3390
3391        magic.set_mime_type(Cow::Borrowed("text/csv"));
3392        magic.push_message(Cow::Borrowed("CSV"));
3393        magic.push_message(Cow::Borrowed(enc.as_magic_str()));
3394        magic.push_message(Cow::Borrowed("text"));
3395        magic.insert_extensions(["csv"].into_iter());
3396        magic.set_source(Some(HARDCODED_SOURCE));
3397        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3398        Ok(true)
3399    }
3400
3401    #[inline(always)]
3402    fn try_tar<D: DataRead>(
3403        haystack: &mut D,
3404        stream_kind: StreamKind,
3405        magic: &mut Magic,
3406    ) -> Result<bool, Error> {
3407        // cannot be json if content is not binary
3408        if !matches!(stream_kind, StreamKind::Binary) {
3409            return Ok(false);
3410        }
3411
3412        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3413        let mut ar = Archive::new(io::Cursor::new(buf));
3414
3415        let Ok(mut entries) = ar.entries() else {
3416            return Ok(false);
3417        };
3418
3419        let Some(Ok(first)) = entries.next() else {
3420            return Ok(false);
3421        };
3422
3423        let header = first.header();
3424
3425        if header.as_ustar().is_some() {
3426            magic.push_message(Cow::Borrowed("POSIX tar archive"));
3427        } else if header.as_gnu().is_some() {
3428            magic.push_message(Cow::Borrowed("POSIX tar archive (GNU)"));
3429        } else {
3430            magic.push_message(Cow::Borrowed("tar archive"));
3431        }
3432
3433        magic.set_mime_type(Cow::Borrowed("application/x-tar"));
3434        magic.set_source(Some(HARDCODED_SOURCE));
3435        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3436        magic.insert_extensions(["tar"].into_iter());
3437        Ok(true)
3438    }
3439
3440    #[inline(always)]
3441    fn try_hard_magic<D: DataRead>(
3442        haystack: &mut D,
3443        stream_kind: StreamKind,
3444        magic: &mut Magic,
3445    ) -> Result<bool, Error> {
3446        Ok(Self::try_json(haystack, stream_kind, magic)?
3447            || Self::try_csv(haystack, stream_kind, magic)?
3448            || Self::try_tar(haystack, stream_kind, magic)?)
3449    }
3450
3451    #[inline(always)]
3452    fn magic_default<'m, D: DataRead>(
3453        cache: &mut D,
3454        stream_kind: StreamKind,
3455        magic: &mut Magic<'m>,
3456    ) {
3457        magic.set_source(Some(HARDCODED_SOURCE));
3458        magic.set_stream_kind(stream_kind);
3459        magic.is_default = true;
3460
3461        if cache.data_size() == 0 {
3462            magic.push_message(Cow::Borrowed("empty"));
3463            magic.set_mime_type(Cow::Borrowed(DEFAULT_BIN_MIMETYPE));
3464        }
3465
3466        match stream_kind {
3467            StreamKind::Binary => {
3468                magic.push_message(Cow::Borrowed("data"));
3469            }
3470            StreamKind::Text(e) => {
3471                magic.push_message(Cow::Borrowed(e.as_magic_str()));
3472                magic.push_message(Cow::Borrowed("text"));
3473            }
3474        }
3475    }
3476
3477    fn load_rules_no_prepare(&mut self, rules: Vec<MagicRule>) {
3478        for rule in rules.into_iter() {
3479            let mut rule = rule;
3480            rule.set_id(self.next_rule_id());
3481
3482            self.rules.push(rule);
3483        }
3484    }
3485
3486    /// Loads rules from a [`MagicSource`]
3487    ///
3488    /// # Arguments
3489    ///
3490    /// * `ms` - The [`MagicSource`] to load rules from
3491    pub fn load(&mut self, ms: MagicSource) -> &mut Self {
3492        self.load_rules_no_prepare(ms.rules);
3493        self.dependencies.extend(ms.dependencies);
3494        self.try_finalize();
3495        self
3496    }
3497
3498    /// Loads multiple [`MagicSource`] items efficiently in bulk.
3499    ///
3500    /// This is more efficient than loading each individually. After processing
3501    /// all sources, it applies finalization step only once.
3502    pub fn load_bulk<I: Iterator<Item = MagicSource>>(&mut self, it: I) -> &mut Self {
3503        for ms in it {
3504            self.load_rules_no_prepare(ms.rules);
3505            self.dependencies.extend(ms.dependencies);
3506        }
3507        self.try_finalize();
3508        self
3509    }
3510
3511    /// Gets all rules in the database
3512    ///
3513    /// # Returns
3514    ///
3515    /// * `&[MagicRule]` - A slice of all rules
3516    pub fn rules(&self) -> &[MagicRule] {
3517        &self.rules
3518    }
3519
3520    #[inline]
3521    fn first_magic_with_stream_kind<D: DataRead>(
3522        &self,
3523        haystack: &mut D,
3524        stream_kind: StreamKind,
3525        extension: Option<&str>,
3526    ) -> Result<Magic<'_>, Error> {
3527        // re-using magic makes this function faster
3528        let mut magic = Magic::default();
3529
3530        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3531            return Ok(magic);
3532        }
3533
3534        let mut marked = vec![false; self.rules.len()];
3535
3536        macro_rules! do_magic {
3537            ($rule: expr) => {{
3538                $rule.magic_entrypoint(&mut magic, stream_kind, haystack, &self, false, 0)?;
3539
3540                if !magic.message.is_empty() {
3541                    magic.set_stream_kind(stream_kind);
3542                    magic.set_source($rule.source.as_deref());
3543                    return Ok(magic);
3544                }
3545
3546                magic.reset();
3547            }};
3548        }
3549
3550        if let Some(ext) = extension.map(|e| e.to_lowercase())
3551            && !ext.is_empty()
3552        {
3553            for rule in self.rules.iter().filter(|r| r.extensions.contains(&ext)) {
3554                do_magic!(rule);
3555                if let Some(f) = marked.get_mut(rule.id) {
3556                    *f = true
3557                }
3558            }
3559        }
3560
3561        for rule in self
3562            .rules
3563            .iter()
3564            // we don't run again rules run by extension
3565            .filter(|r| !*marked.get(r.id).unwrap_or(&false))
3566        {
3567            do_magic!(rule)
3568        }
3569
3570        Self::magic_default(haystack, stream_kind, &mut magic);
3571
3572        Ok(magic)
3573    }
3574
3575    /// Detects file [`Magic`] stopping at the first matching magic. Magic
3576    /// rules are evaluated from the best to the least relevant, so this method
3577    /// returns most of the time the best magic. For the rare cases where
3578    /// it doesn't or if the best result is always required, use [`MagicDb::best_magic`]
3579    ///
3580    /// # Arguments
3581    ///
3582    /// * `r` - A reader implementing [`DataRead`]
3583    /// * `extension` - Optional file extension to use for acceleration
3584    ///
3585    /// # Returns
3586    ///
3587    /// * `Result<Magic<'_>, Error>` - The detection result or an error
3588    ///
3589    /// # Notes
3590    ///
3591    /// * Use this method **only** if you need to re-use a `reader` for future **read** operations.
3592    /// * Use [`DataReader`] to create a generic `reader`
3593    ///
3594    /// # Warning
3595    ///
3596    /// File extension acceleration is made to evaluate rules faster by testing
3597    /// first the rules defining this extension with an `!:ext` entry.
3598    /// Whether you use `extension` acceleration or not with this function should not
3599    /// produce different results. Yet this makes the assumption rules are written
3600    /// correctly and every rule concerned defines `!:ext` when it is appropriate.
3601    /// If some rules are missing it, results might differ.
3602    pub fn first_magic<R: DataRead>(
3603        &self,
3604        r: &mut R,
3605        extension: Option<&str>,
3606    ) -> Result<Magic<'_>, Error> {
3607        let stream_kind = guess_stream_kind(r.read_range(0..FILE_BYTES_MAX as u64)?);
3608        self.first_magic_with_stream_kind(r, stream_kind, extension)
3609    }
3610
3611    /// Detects file [`Magic`] from a file path.
3612    ///
3613    /// This is a convenience method that opens the file and creates a [`DataReader::File`]
3614    /// internally. The file extension is automatically extracted and passed to
3615    /// [`MagicDb::first_magic`].
3616    ///
3617    /// # Errors
3618    ///
3619    /// Returns an error if the file cannot be opened or if magic detection fails.
3620    pub fn first_magic_file<P: AsRef<Path>>(&self, path: P) -> Result<Magic<'_>, Error> {
3621        let ext = path.as_ref().extension().and_then(|e| e.to_str());
3622        self.first_magic(&mut DataReader::from_file(File::open(path.as_ref())?)?, ext)
3623    }
3624
3625    /// Detects file [`Magic`] from an in-memory byte slice.
3626    ///
3627    /// This is a convenience method that creates a [`DataReader::Slice`] internally.
3628    ///
3629    /// # Errors
3630    ///
3631    /// Returns an error if magic detection fails.
3632    pub fn first_magic_slice<S: AsRef<[u8]>>(
3633        &self,
3634        s: S,
3635        extension: Option<&str>,
3636    ) -> Result<Magic<'_>, Error> {
3637        self.first_magic(&mut DataReader::from_slice(s.as_ref()), extension)
3638    }
3639
3640    #[inline(always)]
3641    fn all_magics_sort_with_stream_kind<R: DataRead>(
3642        &self,
3643        haystack: &mut R,
3644        stream_kind: StreamKind,
3645    ) -> Result<Vec<Magic<'_>>, Error> {
3646        let mut out = Vec::new();
3647
3648        let mut magic = Magic::default();
3649
3650        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3651            out.push(magic);
3652            magic = Magic::default();
3653        }
3654
3655        for rule in self.rules.iter() {
3656            rule.magic_entrypoint(&mut magic, stream_kind, haystack, self, false, 0)?;
3657
3658            // it is possible we have a strength with no message
3659            if !magic.message.is_empty() {
3660                magic.set_stream_kind(stream_kind);
3661                magic.set_source(rule.source.as_deref());
3662                out.push(magic);
3663                magic = Magic::default();
3664            }
3665
3666            magic.reset();
3667        }
3668
3669        Self::magic_default(haystack, stream_kind, &mut magic);
3670        out.push(magic);
3671
3672        out.sort_by_key(|b| std::cmp::Reverse(b.strength()));
3673
3674        Ok(out)
3675    }
3676
3677    /// Detects all [`Magic`] matching a given content.
3678    ///
3679    /// # Arguments
3680    ///
3681    /// * `r` - A reader implementing [`DataRead`]
3682    ///
3683    /// # Returns
3684    ///
3685    /// * `Result<Vec<Magic<'_>>, Error>` - All detection results sorted by strength or an error
3686    ///
3687    /// # Notes
3688    ///
3689    /// * Use this method **only** if you need to re-use a `reader` for future **read** operations.
3690    /// * Use [`DataReader`] to create a generic `reader`
3691    #[inline]
3692    pub fn all_magics<R: DataRead>(&self, r: &mut R) -> Result<Vec<Magic<'_>>, Error> {
3693        let stream_kind = guess_stream_kind(r.read_range(0..FILE_BYTES_MAX as u64)?);
3694        self.all_magics_sort_with_stream_kind(r, stream_kind)
3695    }
3696
3697    /// Detects all matching [`Magic`] entries from a file path.
3698    ///
3699    /// This is a convenience method that opens the file and creates a [`DataReader::File`]
3700    /// internally, then calls [`MagicDb::all_magics`].
3701    ///
3702    /// # Errors
3703    ///
3704    /// Returns an error if the file cannot be opened or if magic detection fails.
3705    pub fn all_magics_file<P: AsRef<Path>>(&self, path: P) -> Result<Vec<Magic<'_>>, Error> {
3706        self.all_magics(&mut DataReader::from_file(File::open(path)?)?)
3707    }
3708
3709    /// Detects all matching [`Magic`] entries from an in-memory byte slice.
3710    ///
3711    /// This is a convenience method that creates a [`DataReader::Slice`] internally,
3712    /// then calls [`MagicDb::all_magics`].
3713    ///
3714    /// # Errors
3715    ///
3716    /// Returns an error if magic detection fails.
3717    pub fn all_magics_slice<S: AsRef<[u8]>>(&self, slice: S) -> Result<Vec<Magic<'_>>, Error> {
3718        self.all_magics(&mut DataReader::from_slice(slice.as_ref()))
3719    }
3720
3721    #[inline(always)]
3722    fn best_magic_with_stream_kind<R: DataRead>(
3723        &self,
3724        reader: &mut R,
3725        stream_kind: StreamKind,
3726    ) -> Result<Magic<'_>, Error> {
3727        let magics = self.all_magics_sort_with_stream_kind(reader, stream_kind)?;
3728
3729        // magics is guaranteed to contain at least the
3730        // default magic but we unwrap to avoid any panic
3731        Ok(magics.into_iter().next().unwrap_or_else(|| {
3732            let mut magic = Magic::default();
3733            Self::magic_default(reader, stream_kind, &mut magic);
3734            magic
3735        }))
3736    }
3737
3738    /// Detects the best [`Magic`] matching a given content.
3739    ///
3740    /// # Arguments
3741    ///
3742    /// * `r` - A reader implementing [`DataRead`]
3743    ///
3744    /// # Returns
3745    ///
3746    /// * `Result<Magic<'_>, Error>` - The best detection result or an error
3747    ///
3748    /// # Notes
3749    ///
3750    /// * Use this method **only** if you need to re-use a `reader` for future **read** operations.
3751    /// * Use [`DataReader`] to create a generic `reader`
3752    #[inline]
3753    pub fn best_magic<R: DataRead>(&self, r: &mut R) -> Result<Magic<'_>, Error> {
3754        let stream_kind = guess_stream_kind(r.read_range(0..FILE_BYTES_MAX as u64)?);
3755        self.best_magic_with_stream_kind(r, stream_kind)
3756    }
3757
3758    /// Detects the best matching [`Magic`] from a file path.
3759    ///
3760    /// This is a convenience method that opens the file and creates a [`DataReader::File`]
3761    /// internally, then calls [`MagicDb::best_magic`].
3762    ///
3763    /// # Errors
3764    ///
3765    /// Returns an error if the file cannot be opened or if magic detection fails.
3766    pub fn best_magic_file<P: AsRef<Path>>(&self, path: P) -> Result<Magic<'_>, Error> {
3767        self.best_magic(&mut DataReader::from_file(File::open(path)?)?)
3768    }
3769
3770    /// Detects the best matching [`Magic`] from an in-memory byte slice.
3771    ///
3772    /// This is a convenience method that creates a [`DataReader::Slice`] internally,
3773    /// then calls [`MagicDb::best_magic`].
3774    ///
3775    /// # Errors
3776    ///
3777    /// Returns an error if magic detection fails.
3778    pub fn best_magic_slice<S: AsRef<[u8]>>(&self, slice: S) -> Result<Magic<'_>, Error> {
3779        self.best_magic(&mut DataReader::from_slice(slice.as_ref()))
3780    }
3781
3782    /// Serializes the database to a generic writer implementing [`io::Write`]
3783    ///
3784    /// # Returns
3785    ///
3786    /// * `Result<(), Error>` - The serialized database or an error
3787    pub fn serialize<W: Write>(self, w: &mut W) -> Result<(), Error> {
3788        let mut encoder = GzEncoder::new(w, Compression::best());
3789
3790        bincode::serde::encode_into_std_write(&self, &mut encoder, bincode::config::standard())?;
3791        encoder.finish()?;
3792        Ok(())
3793    }
3794
3795    /// Deserializes the database from a generic reader implementing [`io::Read`]
3796    ///
3797    /// # Arguments
3798    ///
3799    /// * `r` - The reader to deserialize from
3800    ///
3801    /// # Returns
3802    ///
3803    /// * `Result<Self, Error>` - The deserialized database or an error
3804    pub fn deserialize<R: Read>(r: &mut R) -> Result<Self, Error> {
3805        let mut buf = vec![];
3806        let mut gz = GzDecoder::new(r);
3807        gz.read_to_end(&mut buf).map_err(|e| {
3808            bincode::error::DecodeError::OtherString(format!("failed to read: {e}"))
3809        })?;
3810        let (sdb, _): (MagicDb, usize) =
3811            bincode::serde::decode_from_slice(&buf, bincode::config::standard())?;
3812        Ok(sdb)
3813    }
3814
3815    /// Verifies the consistency of the [`MagicDb`] database.
3816    /// This method must be called when the database is built once and used later.
3817    /// It catches [`enum@Error`] that would raise at rule evaluation time.
3818    ///
3819    /// # Errors
3820    /// Returns an error if any rule fails verification
3821    pub fn verify(&mut self) -> Result<(), Error> {
3822        if self.rules.len() == self.finalized {
3823            return Ok(());
3824        }
3825
3826        for r in self.rules.iter_mut().filter(|r| !r.finalized) {
3827            // return at the first rule failing verification
3828            r.try_finalize(&self.dependencies).map_err(|e| {
3829                Error::Verify(
3830                    r.source.clone().unwrap_or(String::from("unknown")),
3831                    r.line(),
3832                    e.into(),
3833                )
3834            })?;
3835            self.finalized += 1;
3836        }
3837
3838        debug_assert!(self.finalized <= self.rules.len());
3839
3840        Ok(())
3841    }
3842
3843    #[inline(always)]
3844    fn try_finalize(&mut self) {
3845        if self.rules.len() == self.finalized {
3846            return;
3847        }
3848
3849        let mut finalized = 0usize;
3850        self.rules.iter_mut().for_each(|r| {
3851            if r.try_finalize(&self.dependencies).is_ok() {
3852                finalized += 1;
3853            }
3854        });
3855
3856        self.finalized = finalized;
3857
3858        debug_assert!(self.finalized <= self.rules.len());
3859
3860        // put text rules at the end
3861        self.rules.sort_by_key(|r| (r.is_text(), -(r.score as i64)));
3862    }
3863}
3864
3865#[cfg(test)]
3866mod tests {
3867
3868    use regex::bytes::Regex;
3869
3870    use crate::{readers::BufReader, utils::unix_local_time_to_string};
3871
3872    use super::*;
3873
3874    macro_rules! buf_reader {
3875        ($l: literal) => {
3876            BufReader::from_slice($l.as_bytes())
3877        };
3878    }
3879
3880    fn first_magic(
3881        rule: &str,
3882        content: &[u8],
3883        stream_kind: StreamKind,
3884    ) -> Result<Magic<'static>, Error> {
3885        let mut md = MagicDb::new();
3886        md.load(
3887            FileMagicParser::parse_str(rule, None)
3888                .inspect_err(|e| eprintln!("{e}"))
3889                .unwrap(),
3890        );
3891        let mut reader = BufReader::from_slice(content);
3892        let v = md.best_magic_with_stream_kind(&mut reader, stream_kind)?;
3893        Ok(v.into_owned())
3894    }
3895
    /// helper macro to debug tests
    ///
    /// Installs a `tracing_subscriber` formatter with the maximum level set to
    /// `TRACE`. The result of `try_init` is discarded.
    // not referenced by default: meant to be invoked by hand while debugging
    #[allow(unused_macros)]
    macro_rules! enable_trace {
        () => {
            tracing_subscriber::fmt()
                .with_max_level(tracing_subscriber::filter::LevelFilter::TRACE)
                .try_init();
        };
    }
3905
3906    macro_rules! parse_assert {
3907        ($rule:literal) => {
3908            FileMagicParser::parse_str($rule, None)
3909                .inspect_err(|e| eprintln!("{e}"))
3910                .unwrap()
3911        };
3912    }
3913
3914    macro_rules! assert_magic_match_bin {
3915        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Binary).unwrap() }};
3916        ($rule: literal, $content:literal, $message:expr) => {{
3917            assert_eq!(
3918                first_magic($rule, $content, StreamKind::Binary)
3919                    .unwrap()
3920                    .message(),
3921                $message
3922            );
3923        }};
3924    }
3925
3926    macro_rules! assert_magic_match_text {
3927        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8)).unwrap() }};
3928        ($rule: literal, $content:literal, $message:expr) => {{
3929            assert_eq!(
3930                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
3931                    .unwrap()
3932                    .message(),
3933                $message
3934            );
3935        }};
3936    }
3937
3938    macro_rules! assert_magic_not_match_text {
3939        ($rule: literal, $content:literal) => {{
3940            assert!(
3941                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
3942                    .unwrap()
3943                    .is_default()
3944            );
3945        }};
3946    }
3947
3948    macro_rules! assert_magic_not_match_bin {
3949        ($rule: literal, $content:literal) => {{
3950            assert!(
3951                first_magic($rule, $content, StreamKind::Binary)
3952                    .unwrap()
3953                    .is_default()
3954            );
3955        }};
3956    }
3957
    /// Exercises `regex` rules on text and binary content, including child
    /// rules placed relative to the regex match.
    #[test]
    fn test_regex() {
        // shebang line: the `%s` of the child rule is expected to expand to
        // the interpreter name
        assert_magic_match_text!(
            r#"
0	regex/1024 \^#![[:space:]]*/usr/bin/env[[:space:]]+
!:mime	text/x-shellscript
>&0  regex/64 .*($|\\b) %s shell script text executable
    "#,
            br#"#!/usr/bin/env bash
        echo hello world"#,
            // the magic generated
            "bash shell script text executable"
        );

        // sanity check of the regex crate itself on non-UTF-8 bytes
        let re = Regex::new(r"(?-u)\x42\x82").unwrap();
        assert!(re.is_match(b"\x42\x82"));

        // the pattern is located after six leading zero bytes
        assert_magic_match_bin!(
            r#"0 regex \x42\x82 binary regex match"#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82"
        );

        // test regex continuation after match
        assert_magic_match_bin!(
            r#"
            0 regex \x42\x82
            >&0 string \xde\xad\xbe\xef it works
            "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        // with the `s` flag the child rule is expected to see the matched
        // bytes again (its string starts with \x42\x82)
        assert_magic_match_bin!(
            r#"
            0 regex/s \x42\x82
            >&0 string \x42\x82\xde\xad\xbe\xef it works
            "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        // ^ must match start of line when matching text
        assert_magic_match_text!(
            r#"
0	regex/1024 \^HelloWorld$ HelloWorld String"#,
            br#"
// this is a comment after an empty line
HelloWorld
            "#
        );
    }
4007
    /// Exercises the `string` test with its modifier flags
    /// (`w`, `C`, `c`, `f`, `W`).
    #[test]
    fn test_string_with_mods() {
        // the rule spells three blanks after `#!`, the content only one
        assert_magic_match_text!(
            r#"0	string/w	#!\ \ \ /usr/bin/env\ bash	BASH
        "#,
            b"#! /usr/bin/env bash i
        echo hello world"
        );

        // test uppercase insensitive
        assert_magic_match_text!(
            r#"0	string/C	HelloWorld	it works
        "#,
            b"helloworld"
        );

        // counterpart: content with the case of every letter swapped is
        // expected not to match with `C`
        assert_magic_not_match_text!(
            r#"0	string/C	HelloWorld	it works
        "#,
            b"hELLOwORLD"
        );

        // test lowercase insensitive
        assert_magic_match_text!(
            r#"0	string/c	HelloWorld	it works
        "#,
            b"HELLOWORLD"
        );

        // counterpart: all-lowercase content is expected not to match with `c`
        assert_magic_not_match_text!(
            r#"0	string/c	HelloWorld	it works
        "#,
            b"helloworld"
        );

        // test full word match
        assert_magic_match_text!(
            r#"0	string/f	#!/usr/bin/env\ bash	BASH
        "#,
            b"#!/usr/bin/env bash"
        );

        // `python` is only a prefix of `pythonic`: not a full word
        assert_magic_not_match_text!(
            r#"0	string/f	#!/usr/bin/python PYTHON"#,
            b"#!/usr/bin/pythonic"
        );

        // testing whitespace compacting
        assert_magic_match_text!(
            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
            b"#!/usr/bin/env    python"
        );

        // the rule requires two blanks, the content only has one
        assert_magic_not_match_text!(
            r#"0	string/W	#!/usr/bin/env\ \ python  PYTHON"#,
            b"#!/usr/bin/env python"
        );
    }
4066
    /// Exercises the `search` test with a range and modifier flags.
    #[test]
    fn test_search_with_mods() {
        // search range of 1 combined with the `f`, `w` and `t` flags
        assert_magic_match_text!(
            r#"0	search/1/fwt	#!\ /usr/bin/luatex	LuaTex script text executable"#,
            b"#!          /usr/bin/luatex "
        );

        // test matching from the beginning
        assert_magic_match_text!(
            r#"
            0	search/s	/usr/bin/env
            >&0 string /usr/bin/env it works
            "#,
            b"#!/usr/bin/env    python"
        );

        // same rules without the `s` flag: the child rule, which expects the
        // searched string again at the relative offset, must not match
        assert_magic_not_match_text!(
            r#"
            0	search	/usr/bin/env
            >&0 string /usr/bin/env it works
            "#,
            b"#!/usr/bin/env    python"
        );
    }
4091
    /// Exercises `pstring` (length-prefixed string) with the different
    /// length-prefix flags used in the fixtures below.
    #[test]
    fn test_pstring() {
        // default: one length byte (5) followed by the string
        assert_magic_match_bin!(r#"0 pstring Toast it works"#, b"\x05Toast");

        assert_magic_match_bin!(r#"0 pstring Toast %s"#, b"\x05Toast", "Toast");

        // length 7 / "Toaster" does not equal "Toast"
        assert_magic_not_match_bin!(r#"0 pstring Toast Doesn't work"#, b"\x07Toaster");

        // testing with modifiers
        // `H`: fixture uses a two-byte big-endian length
        assert_magic_match_bin!(r#"0 pstring/H Toast it works"#, b"\x00\x05Toast");

        // `J`: fixture length is 7 = 2 (prefix) + 5 (string); presumably the
        // length counts the prefix itself — confirm against magic(5)
        assert_magic_match_bin!(r#"0 pstring/HJ Toast it works"#, b"\x00\x07Toast");

        assert_magic_match_bin!(r#"0 pstring/HJ Toast %s"#, b"\x00\x07Toast", "Toast");

        // `h`: fixture uses a two-byte little-endian length
        assert_magic_match_bin!(r#"0 pstring/h Toast it works"#, b"\x05\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/hJ Toast it works"#, b"\x07\x00Toast");

        // `L`: fixture uses a four-byte big-endian length
        assert_magic_match_bin!(r#"0 pstring/L Toast it works"#, b"\x00\x00\x00\x05Toast");

        // with `J`: 9 = 4 (prefix) + 5 (string)
        assert_magic_match_bin!(r#"0 pstring/LJ Toast it works"#, b"\x00\x00\x00\x09Toast");

        // `l`: fixture uses a four-byte little-endian length
        assert_magic_match_bin!(r#"0 pstring/l Toast it works"#, b"\x05\x00\x00\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/lJ Toast it works"#, b"\x09\x00\x00\x00Toast");
    }
4119
4120    #[test]
4121    fn test_max_recursion() {
4122        let res = first_magic(
4123            r#"0	indirect x"#,
4124            b"#!          /usr/bin/luatex ",
4125            StreamKind::Binary,
4126        );
4127        assert!(res.is_err());
4128        let _ = res.inspect_err(|e| {
4129            assert!(matches!(
4130                e.unwrap_localized(),
4131                Error::MaximumRecursion(MAX_RECURSION)
4132            ))
4133        });
4134    }
4135
    /// Exercises the comparison operators accepted by the `string` test
    /// (implicit equality, `!`, `>` and `<`).
    #[test]
    fn test_string_ops() {
        // implicit equality
        assert_magic_match_text!("0	string/b MZ MZ File", b"MZ\0");
        // `!`: content differs from the rule value
        assert_magic_match_text!("0	string !MZ Not MZ File", b"AZ\0");
        // `>`: content compares greater than the rule value
        assert_magic_match_text!("0	string >\0 Any String", b"A\0");
        assert_magic_match_text!("0	string >Test Any String", b"Test 1\0");
        // `<`: content compares lower than the rule value
        assert_magic_match_text!("0	string <Test Any String", b"\0");
        assert_magic_not_match_text!("0	string >Test Any String", b"\0");
    }
4145
    /// Exercises `lestring16` (little-endian UTF-16 string) matching.
    #[test]
    fn test_lestring16() {
        // "abcd" encoded as little-endian UTF-16
        assert_magic_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        // `x` matches anything; the decoded string is injected in the message
        // (the fixture carries one extra trailing byte)
        assert_magic_match_bin!(
            "0 lestring16 x %s",
            b"\x61\x00\x62\x00\x63\x00\x64\x00\x00",
            "abcd"
        );
        // same characters in big-endian byte order must not match
        assert_magic_not_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        // non-zero offset
        assert_magic_match_bin!(
            "4 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64\x00"
        );
    }
4166
    /// Exercises `bestring16` (big-endian UTF-16 string) matching.
    #[test]
    fn test_bestring16() {
        // "abcd" encoded as big-endian UTF-16
        assert_magic_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        // `x` matches anything; the decoded string is injected in the message
        assert_magic_match_bin!(
            "0 bestring16 x %s",
            b"\x00\x61\x00\x62\x00\x63\x00\x64",
            "abcd"
        );
        // same characters in little-endian byte order must not match
        assert_magic_not_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        // non-zero offset
        assert_magic_match_bin!(
            "4 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64"
        );
    }
4187
    /// Exercises negative offsets, which the fixtures resolve from the end of
    /// the content.
    #[test]
    fn test_offset_from_end() {
        // -1: last byte of the content
        assert_magic_match_bin!("-1 ubyte 0x42 last byte ok", b"\x00\x00\x42");
        // -2: second-to-last byte of the content
        assert_magic_match_bin!("-2 ubyte 0x41 last byte ok", b"\x00\x41\x00");
    }
4193
    /// Exercises chained relative offsets (`&0`): each child rule tests the
    /// byte that follows the one matched by its parent.
    #[test]
    fn test_relative_offset() {
        assert_magic_match_bin!(
            "
            0 ubyte 0x42
            >&0 ubyte 0x00
            >>&0 ubyte 0x41 third byte ok
            ",
            b"\x42\x00\x41\x00"
        );
    }
4205
    /// Exercises indirect offsets, where the offset to test is read from the
    /// content itself.
    #[test]
    fn test_indirect_offset() {
        // the long stored at offset 0 is 4, and byte 4 holds 0x42
        assert_magic_match_bin!("(0.l) ubyte 0x42 it works", b"\x04\x00\x00\x00\x42");
        // adding fixed value to offset
        assert_magic_match_bin!("(0.l+3) ubyte 0x42 it works", b"\x01\x00\x00\x00\x42");
        // testing offset pair
        // 4 (long at offset 0) + 4 (value found at offset 4) = 8
        assert_magic_match_bin!(
            "(0.l+(4)) ubyte 0x42 it works",
            b"\x04\x00\x00\x00\x04\x00\x00\x00\x42"
        );
    }
4217
    /// Checks that a `use` rule carrying its own message is combined with the
    /// message of the named rule it refers to.
    #[test]
    fn test_use_with_message() {
        assert_magic_match_bin!(
            r#"
0 string MZ
>0 use mz first match

0 name mz then second match
>0 string MZ
"#,
            b"MZ\0",
            // message of the `use` rule followed by the one of the `name` rule
            "first match then second match"
        );
    }
4232
    /// Exercises arithmetic / bitwise transforms applied to a scalar before
    /// it is compared, and checks that a zero divisor is rejected at parse
    /// time.
    #[test]
    fn test_scalar_transform() {
        // 0x00 + 1 == 0x01
        assert_magic_match_bin!("0 ubyte+1 0x1 add works", b"\x00");
        // 0xff - 1 == 0xfe
        assert_magic_match_bin!("0 ubyte-1 0xfe sub works", b"\xff");
        // 0x0a % 2 == 0
        assert_magic_match_bin!("0 ubyte%2 0 mod works", b"\x0a");
        // 0xff & 0x0f == 0x0f
        assert_magic_match_bin!("0 ubyte&0x0f 0x0f bitand works", b"\xff");
        // 0xf0 | 0x0f == 0xff
        assert_magic_match_bin!("0 ubyte|0x0f 0xff bitor works", b"\xf0");
        // 0xff ^ 0x0f == 0xf0
        assert_magic_match_bin!("0 ubyte^0x0f 0xf0 bitxor works", b"\xff");

        // modulo and division by zero must both fail while parsing
        FileMagicParser::parse_str("0 ubyte%0 mod by zero", None)
            .expect_err("expect div by zero error");
        FileMagicParser::parse_str("0 ubyte/0 div by zero", None)
            .expect_err("expect div by zero error");
    }
4247
    /// Exercises `belong` (big-endian four-byte integer) with every
    /// comparison operator covered below (`=`, `<`, `>`, `&`, `^`, `~`, `x`).
    #[test]
    fn test_belong() {
        // Test that a file with a four-byte value at offset 0 that matches the given value in big-endian byte order
        assert_magic_match_bin!("0 belong 0x12345678 Big-endian long", b"\x12\x34\x56\x78");
        // Test that a file with a four-byte value at offset 0 that does not match the given value in big-endian byte order
        assert_magic_not_match_bin!("0 belong 0x12345678 Big-endian long", b"\x78\x56\x34\x12");
        // Test that a file with a four-byte value at a non-zero offset that matches the given value in big-endian byte order
        assert_magic_match_bin!(
            "4 belong 0x12345678 Big-endian long",
            b"\x00\x00\x00\x00\x12\x34\x56\x78"
        );
        // Test < operator
        assert_magic_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x77");
        assert_magic_not_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test > operator
        assert_magic_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x79");
        assert_magic_not_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test & operator
        // (every bit of the mask must be set in the value)
        assert_magic_match_bin!("0 belong &0x5678 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong &0x0000FFFF Big-endian long", b"\x12\x34\x56\x78");

        // Test ^ operator (bitwise AND with complement)
        // (every bit of the mask must be clear in the value)
        assert_magic_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x01\x56\x78");

        // Test ~ operator
        // (0xedcba987 is the bitwise complement of 0x12345678)
        assert_magic_match_bin!("0 belong ~0x12345678 Big-endian long", b"\xed\xcb\xa9\x87");
        assert_magic_not_match_bin!("0 belong ~0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test x operator
        // (matches whatever the value is)
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x12\x34\x56\x78");
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x78\x56\x34\x12");
    }
4283
    /// Checks that `search` rules parse with or without a range / flags, and
    /// that range and flags are accepted in either order.
    #[test]
    fn test_parse_search() {
        parse_assert!("0 search test");
        parse_assert!("0 search/24/s test");
        parse_assert!("0 search/s/24 test");
    }
4290
    /// Exercises `bedate` (big-endian four-byte Unix timestamp).
    #[test]
    fn test_bedate() {
        // 0x386D4380 == 946684800
        assert_magic_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        // non-zero offset; the formatted date is injected in the message
        assert_magic_match_bin!(
            "4 bedate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
    /// Exercises `beldate` (big-endian four-byte timestamp, formatted as
    /// local time).
    #[test]
    fn test_beldate() {
        // 0x386D4380 == 946684800
        assert_magic_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        // the expected message is computed with the crate helper so the test
        // does not depend on the timezone of the machine running it
        assert_magic_match_bin!(
            "4 beldate 946684800 {}",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            unix_local_time_to_string(946684800)
        );
    }
4324
    /// Exercises `beqdate` (big-endian eight-byte Unix timestamp).
    #[test]
    fn test_beqdate() {
        // 0x00000000386D4380 == 946684800
        assert_magic_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80"
        );

        assert_magic_not_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        // the formatted date is injected in the message
        assert_magic_match_bin!(
            "0 beqdate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
4343
    /// Exercises `medate` (middle-endian four-byte Unix timestamp).
    #[test]
    fn test_medate() {
        // 0x386D4380 (946684800) with the bytes of each 16-bit half swapped
        assert_magic_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );

        assert_magic_not_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        // non-zero offset; the formatted date is injected in the message
        assert_magic_match_bin!(
            "4 medate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            "2000-01-01 00:00:00"
        );
    }
4362
    /// Exercises `meldate` (middle-endian four-byte timestamp, formatted as
    /// local time).
    #[test]
    fn test_meldate() {
        // 0x386D4380 (946684800) with the bytes of each 16-bit half swapped
        assert_magic_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );
        assert_magic_not_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        // the expected message is computed with the crate helper so the test
        // does not depend on the timezone of the machine running it
        assert_magic_match_bin!(
            "4 meldate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            unix_local_time_to_string(946684800)
        );
    }
4380
    /// Exercises `date` (four-byte Unix timestamp).
    ///
    /// NOTE(review): the fixtures store 946684800 in little-endian order —
    /// presumably `date` reads the host byte order, which would make this
    /// test little-endian specific; confirm.
    #[test]
    fn test_date() {
        assert_magic_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        // non-zero offset; the formatted date is injected in the message
        assert_magic_match_bin!(
            "4 date 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            "2000-01-01 00:00:00"
        );
    }
4397
    /// Exercises `leldate` (little-endian four-byte timestamp, formatted as
    /// local time).
    #[test]
    fn test_leldate() {
        // 946684800 == 0x386D4380, stored least significant byte first
        assert_magic_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        // the expected message is computed with the crate helper so the test
        // does not depend on the timezone of the machine running it
        assert_magic_match_bin!(
            "4 leldate 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            unix_local_time_to_string(946684800)
        );
    }
4414
    /// Exercises `leqdate` (little-endian eight-byte Unix timestamp).
    #[test]
    fn test_leqdate() {
        // 1577836800 == 0x5E0BE100, stored least significant byte first
        assert_magic_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
        );

        assert_magic_not_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        // non-zero offset; the formatted date is injected in the message
        assert_magic_match_bin!(
            "8 leqdate 1577836800 %s",
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
            "2020-01-01 00:00:00"
        );
    }
4432
    /// Exercises `leqldate` (little-endian eight-byte timestamp, formatted as
    /// local time).
    #[test]
    fn test_leqldate() {
        // 1577836800 == 0x5E0BE100, stored least significant byte first
        assert_magic_match_bin!(
            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
        );

        assert_magic_not_match_bin!(
            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        // the expected message is computed with the crate helper so the test
        // does not depend on the timezone of the machine running it
        assert_magic_match_bin!(
            "8 leqldate 1577836800 %s",
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
            unix_local_time_to_string(1577836800)
        );
    }
4450
    /// Exercises `melong` (middle-endian four-byte integer) with every
    /// comparison operator.
    ///
    /// As the `=` case below shows, the bytes `b1 b0 b3 b2` decode to the
    /// value `0xb0b1b2b3`-style ordering where `\x34\x12\x78\x56` is
    /// `0x12345678`; the per-fixture values in the comments follow from that.
    #[test]
    fn test_melong() {
        // Test = operator
        assert_magic_match_bin!(
            "0 melong =0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        );
        assert_magic_not_match_bin!(
            "0 melong =0x12345678 Middle-endian long",
            b"\x00\x00\x00\x00"
        );

        // Test < operator
        assert_magic_match_bin!(
            "0 melong <0x12345678 Middle-endian long",
            b"\x34\x12\x78\x55"
        ); // 0x12345578 in middle-endian
        assert_magic_not_match_bin!(
            "0 melong <0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // 0x12345678 in middle-endian

        // Test > operator
        assert_magic_match_bin!(
            "0 melong >0x12345678 Middle-endian long",
            b"\x34\x12\x78\x57"
        ); // 0x12345778 in middle-endian
        assert_magic_not_match_bin!(
            "0 melong >0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // 0x12345678 in middle-endian

        // Test & operator
        assert_magic_match_bin!("0 melong &0x5678 Middle-endian long", b"\xab\xcd\x78\x56"); // 0xcdab5678 in middle-endian
        assert_magic_not_match_bin!(
            "0 melong &0x0000FFFF Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // 0x12345678 in middle-endian

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!(
            "0 melong ^0xFFFF0000 Middle-endian long",
            b"\x00\x00\x78\x56"
        ); // 0x00005678 in middle-endian
        assert_magic_not_match_bin!(
            "0 melong ^0xFFFF0000 Middle-endian long",
            b"\x00\x01\x78\x56"
        ); // 0x01005678 in middle-endian

        // Test ~ operator
        assert_magic_match_bin!(
            "0 melong ~0x12345678 Middle-endian long",
            b"\xCB\xED\x87\xA9"
        ); // 0xedcba987 in middle-endian, the complement of 0x12345678
        assert_magic_not_match_bin!(
            "0 melong ~0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // The original value

        // Test x operator
        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x34\x12\x78\x56");
        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x00\x00\x00\x00");
    }
4514
    /// Exercises `uquad` (unsigned eight-byte integer) with every comparison
    /// operator.
    ///
    /// NOTE(review): the fixtures store values least significant byte first —
    /// presumably `uquad` reads the host byte order, which would make this
    /// test little-endian specific; confirm.
    #[test]
    fn test_uquad() {
        // Test = operator
        assert_magic_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        assert_magic_not_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        // Test < operator
        // (most significant byte lowered from 0x12 to 0x11)
        assert_magic_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x11"
        );
        assert_magic_not_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test > operator
        // (most significant byte raised from 0x12 to 0x13)
        assert_magic_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x13"
        );
        assert_magic_not_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test & operator
        // (every bit of the mask must be set: the low byte is 0xF0, not 0xFF)
        assert_magic_match_bin!(
            "0 uquad &0xF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        assert_magic_not_match_bin!(
            "0 uquad &0xFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        ); // All bits clear
        assert_magic_not_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        ); // Some bits set

        // Test ~ operator
        // (the fixture is the bitwise complement of 0x123456789ABCDEF0)
        assert_magic_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\x0F\x21\x43\x65\x87\xA9\xCB\xED"
        );
        assert_magic_not_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        ); // The original value

        // Test x operator
        // (the value is rendered through the `{:#x}` format of the message)
        assert_magic_match_bin!(
            "0 uquad x {:#x}",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12",
            "0x123456789abcdef0"
        );
        assert_magic_match_bin!(
            "0 uquad x Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }
4588
    // Checks `guid` rules on 16-byte inputs: an exact match, a mismatch, and
    // `%s` rendering through the `x` test.
    #[test]
    fn test_guid() {
        // The input bytes appear in the same order as the hex digits of the
        // GUID text in the rule.
        assert_magic_match_bin!(
            "0 guid EC959539-6786-2D4E-8FDB-98814CE76C1E It works",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E"
        );

        // Unrelated bytes (0x00..=0x0F) must not match the GUID in the rule.
        assert_magic_not_match_bin!(
            "0 guid 399595EC-8667-4E2D-8FDB-98814CE76C1E It works",
            b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
        );

        // With `x` the rule accepts the input and `%s` is expected to render
        // the upper-case, dash-separated GUID text.
        assert_magic_match_bin!(
            "0 guid x %s",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E",
            "EC959539-6786-2D4E-8FDB-98814CE76C1E"
        );
    }
4607
    // Checks `ubeqdate` rules on 8-byte inputs. 1633046400 is 0x61564F80, and
    // the matching input stores it most significant byte first.
    #[test]
    fn test_ubeqdate() {
        assert_magic_match_bin!(
            "0 ubeqdate 1633046400 It works",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80"
        );

        // `%s` is expected to render the timestamp as a date-time string.
        // NOTE(review): the expected text is presumably UTC — confirm.
        assert_magic_match_bin!(
            "0 ubeqdate x %s",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80",
            "2021-10-01 00:00:00"
        );

        // An all-zero input is a different timestamp and must not match.
        assert_magic_not_match_bin!(
            "0 ubeqdate 1633046400 It should not work",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }
4626
    // Checks `ldate` rules on 4-byte inputs. 1640551520 is 0x61C8D460, and
    // the matching input stores it least significant byte first.
    #[test]
    fn test_ldate() {
        assert_magic_match_bin!("0 ldate 1640551520 It works", b"\x60\xd4\xC8\x61");

        // An all-zero input is a different timestamp and must not match.
        assert_magic_not_match_bin!("0 ldate 1633046400 It should not work", b"\x00\x00\x00\x00");

        // The expected message is computed by `unix_local_time_to_string`
        // (defined elsewhere) rather than hard-coded.
        // NOTE(review): presumably because the rendering depends on the
        // host's local time zone — confirm.
        assert_magic_match_bin!(
            "0 ldate x %s",
            b"\x60\xd4\xC8\x61",
            unix_local_time_to_string(1640551520)
        );
    }
4639
    // Checks that an arithmetic transform attached to an integer type
    // (`/10`, `%10`) is applied before both the comparison and the rendering.
    // The input byte 0x14 is 20.
    #[test]
    fn test_scalar_with_transform() {
        // 20 / 10 == 2: compared against the test value 2 and rendered as "2".
        assert_magic_match_bin!("0 ubyte/10 2 {}", b"\x14", "2");
        // Same transform with the match-anything `x` test.
        assert_magic_match_bin!("0 ubyte/10 x {}", b"\x14", "2");
        // 20 % 10 == 0.
        assert_magic_match_bin!("0 ubyte%10 x {}", b"\x14", "0");
    }
4646
    // Same as the integer transform test, but for `lefloat`. The input bytes
    // 00 00 A0 41 are the little-endian IEEE-754 single-precision encoding of
    // 20.0.
    #[test]
    fn test_float_with_transform() {
        // 20.0 / 10 == 2: compared against 2 and expected to render as "2".
        assert_magic_match_bin!("0 lefloat/10 2 {}", b"\x00\x00\xa0\x41", "2");
        // Same transform with the match-anything `x` test.
        assert_magic_match_bin!("0 lefloat/10 x {}", b"\x00\x00\xa0\x41", "2");
        // 20.0 % 10 == 0, expected to render as "0".
        assert_magic_match_bin!("0 lefloat%10 x {}", b"\x00\x00\xa0\x41", "0");
    }
4653
    // Pins the behaviour of `read_octal_u64` as observed through these cases:
    // input must start with '0', digits are consumed up to the first
    // non-octal character, and anything not starting with '0' (including an
    // empty input) yields `None`.
    #[test]
    fn test_read_octal() {
        // Basic cases
        assert_eq!(read_octal_u64(&mut buf_reader!("0")), Some(0));
        assert_eq!(read_octal_u64(&mut buf_reader!("00")), Some(0));
        assert_eq!(read_octal_u64(&mut buf_reader!("01")), Some(1));
        assert_eq!(read_octal_u64(&mut buf_reader!("07")), Some(7));
        assert_eq!(read_octal_u64(&mut buf_reader!("010")), Some(8));
        assert_eq!(read_octal_u64(&mut buf_reader!("0123")), Some(83)); // 0o123
        assert_eq!(read_octal_u64(&mut buf_reader!("0755")), Some(493)); // 0o755

        // With trailing non-octal characters
        assert_eq!(read_octal_u64(&mut buf_reader!("0ABC")), Some(0));
        assert_eq!(read_octal_u64(&mut buf_reader!("01ABC")), Some(1));
        assert_eq!(read_octal_u64(&mut buf_reader!("0755ABC")), Some(493));
        assert_eq!(read_octal_u64(&mut buf_reader!("0123ABC")), Some(83));

        // Invalid octal digits
        assert_eq!(read_octal_u64(&mut buf_reader!("08")), Some(0)); // stops at '8'
        assert_eq!(read_octal_u64(&mut buf_reader!("01238")), Some(83)); // stops at '8'

        // No leading '0'
        assert_eq!(read_octal_u64(&mut buf_reader!("123")), None);
        assert_eq!(read_octal_u64(&mut buf_reader!("755")), None);

        // Empty string
        assert_eq!(read_octal_u64(&mut buf_reader!("")), None);

        // Only non-octal characters
        assert_eq!(read_octal_u64(&mut buf_reader!("ABC")), None);
        assert_eq!(read_octal_u64(&mut buf_reader!("8ABC")), None); // first char is not '0'

        // Longer valid octal (but within u64 range)
        // 0o1777777777 == 2^28 - 1 == 268435455
        assert_eq!(
            read_octal_u64(&mut buf_reader!("01777777777")),
            Some(268435455)
        );
    }
4692
    // Regression test for indirect offsets combined with chained `use`
    // directives (`toasted` -> `toasted_twice`).
    #[test]
    fn test_offset_bug_1() {
        // this tests the exact behaviour
        // expected by libmagic/file
        //
        // Input layout (13 bytes):
        //   [0]      0x00
        //   [1..=4]  "TEST"
        //   [5]      0x06   (byte read by `(5.b)`)
        //   [6..=10] "twice"
        //   [11]     0x00
        //   [12]     0x06
        // The three messages are expected to join into
        // "Bread is Toasted twice".
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
# offset computation is relative to
# rule start
>(5.b)	use toasted

0 name toasted
>0	string twice Toasted
>>0  use toasted_twice

0 name toasted_twice
>(6.b) string x %s
        ",
            b"\x00TEST\x06twice\x00\x06",
            "Bread is Toasted twice"
        );
    }
4715
4716    // this test implement the exact same logic as
4717    // test_offset_bug_1 except that the rule starts
4718    // matching from end. Surprisingly we need to
4719    // adjust indirect offsets so that it works in
4720    // libmagic/file
4721    #[test]
4722    fn test_offset_bug_2() {
4723        // this tests the exact behaviour
4724        // expected by libmagic/file
4725        assert_magic_match_bin!(
4726            r"
4727-12	string		TEST Bread is
4728>(4.b)	use toasted
4729
47300 name toasted
4731>0	string twice Toasted
4732>>0  use toasted_twice
4733
47340 name toasted_twice
4735>(6.b) string x %
4736        ",
4737            b"\x00TEST\x06twice\x00\x06",
4738            "Bread is Toasted twice"
4739        )
4740    }
4741
    // Regression test for `indirect/r` reached through an indirect offset,
    // followed by a `use` of a named rule.
    #[test]
    fn test_offset_bug_3() {
        // this tests the exact behaviour
        // expected by libmagic/file
        //
        // Input layout (13 bytes):
        //   [0]      0x00
        //   [1..=4]  "TEST"
        //   [5]      0x06   (byte read by `(5.b)`)
        //   [6..=10] "twice"
        //   [11]     0x00
        //   [12]     0x08
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
>(5.b) indirect/r x

0	string twice Toasted
>0  use toasted_twice

0 name toasted_twice
>0 string x %s
        ",
            b"\x00TEST\x06twice\x00\x08",
            "Bread is Toasted twice"
        )
    }
4761
    // Regression test for base-offset handling when an `indirect/r` match is
    // followed by a `use` that itself goes through an indirect offset. The
    // comments embedded in the rule text describe the intended bases.
    #[test]
    fn test_offset_bug_4() {
        // this tests the exact behaviour
        // expected by libmagic/file
        //
        // Input layout (24 bytes):
        //   [0]       0x00
        //   [1..=5]   "Bread"
        //   [6]       0x06   (byte read by `(6.b)`)
        //   [7..=16]  "is Toasted"
        //   [17]      0x0c
        //   [18..=22] "twice"
        //   [23]      0x00
        assert_magic_match_bin!(
            r"
1	string		Bread %s
>(6.b) indirect/r x

# this one uses a based offset
# computed at indirection
1	string is\ Toasted %s
>(11.b)  use toasted_twice

# this one is using a new base
# offset being previous base
# offset + offset of use
0 name toasted_twice
>0 string x %s
            ",
            b"\x00Bread\x06is Toasted\x0ctwice\x00",
            "Bread is Toasted twice"
        )
    }
4786
    // Same rule chain and input as test_offset_bug_3, except that the named
    // rule matches "twice" without a message and a child rule using a
    // relative offset (`&1`) tests for the byte 0x08 and supplies the final
    // "twice" text.
    #[test]
    fn test_offset_bug_5() {
        // Input layout (13 bytes):
        //   [0]      0x00
        //   [1..=4]  "TEST"
        //   [5]      0x06   (byte read by `(5.b)`)
        //   [6..=10] "twice"
        //   [11]     0x00
        //   [12]     0x08   (value tested by `byte 0x08`)
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
>(5.b) indirect/r x

0	string twice Toasted
>0  use toasted_twice

0 name toasted_twice
>0 string twice
>>&1 byte 0x08 twice
            ",
            b"\x00TEST\x06twice\x00\x08",
            "Bread is Toasted twice"
        )
    }
4805
    // Regression test for how a `use` of a message-less named rule interacts
    // with a child `default` line.
    #[test]
    fn test_bug_6() {
        // An indirect use test should not be successful
        // even if a match with no message occurs

        // Input layout (13 bytes):
        //   [0]      0x00
        //   [1..=4]  "TEST"
        //   [5]      0x06
        //   [6..=12] "toasted"
        // The expected message holds only the top-level rule's text: the
        // named rule `toasted` has no message, and the `default` line's
        // "but not burnt" text is absent from the expected output.
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is toasted
>&0 use toasted
>>&0 default x but not burnt

0 name toasted
>1 string toasted
            ",
            b"\x00TEST\x06toasted",
            "Bread is toasted"
        )
    }
4824
    // Regression test for nested `use` directives that are both reached
    // through indirect offsets.
    #[test]
    fn test_offset_bug_7() {
        // Bug: nested 'use' directives with indirect offsets don't properly
        // adjust offsets during recursion. This test encodes the behavior
        // libmagic has when dealing with such scenarios.
        //
        // Input layout (19 bytes):
        //   [0]       0x00
        //   [1..=4]   "TEST"
        //   [5]       0x06   (byte read by `(5.b)`)
        //   [6..=10]  "toast"
        //   [11]      0x00
        //   [12]      0x06
        //   [13..=17] "twice"
        //   [18]      0x00
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
# offset computation is relative to
# rule start
>(5.b)	use toasted

0 name toasted
>0	string toast Toasted
>>(6.b)  use toasted_twice

0 name toasted_twice
>1 string x %s
        ",
            b"\x00TEST\x06toast\x00\x06twice\x00",
            "Bread is Toasted twice"
        );
    }
4848
4849    #[test]
4850    fn test_message_parts() {
4851        let m = first_magic(
4852            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
4853            b"#!/usr/bin/env    python",
4854            StreamKind::Text(TextEncoding::Ascii),
4855        )
4856        .unwrap();
4857
4858        assert!(m.message_parts().any(|p| p.eq_ignore_ascii_case("python")))
4859    }
4860
4861    #[test]
4862    fn test_load_bulk() {
4863        let mut db = MagicDb::new();
4864
4865        let rules = vec![
4866            parse_assert!("0 search test"),
4867            parse_assert!("0 search/24/s test"),
4868            parse_assert!("0 search/s/24 test"),
4869        ];
4870
4871        db.load_bulk(rules.into_iter());
4872        db.verify().unwrap();
4873    }
4874
4875    #[test]
4876    fn test_load_bulk_failure() {
4877        let mut db = MagicDb::new();
4878
4879        let rules = vec![parse_assert!(
4880            r#"
48810 search/s/24 test
4882>0 use test
4883"#
4884        )];
4885
4886        db.load_bulk(rules.into_iter());
4887        assert!(matches!(db.verify(), Err(Error::Verify(_, _, _))));
4888    }
4889}
pure_magic/lib.rs

pure_magic/
lib.rs