Skip to main content

pure_magic/
lib.rs

1#![forbid(unsafe_code)]
2#![deny(unused_imports)]
3#![deny(missing_docs)]
4//! # `pure-magic`: A pure and safe Rust Reimplementation of `libmagic`
5//!
6//! Unlike many file identification crates, `pure-magic` is highly compatible with the standard
7//! `magic` rule format, allowing seamless reuse of existing
8//! [rules](https://github.com/qjerome/magic-rs/tree/main/magic-db/src/magdir). This makes it an ideal
9//! drop-in replacement for crates relying on **`libmagic` C bindings**, where memory safety is critical.
10//!
11//! **Key Features:**
12//! - File type detection
13//! - MIME type inference
14//! - Custom magic rule parsing
15//!
16//! ## Installation
17//! Add `pure-magic` to your `Cargo.toml`:
18//!
19//! ```toml
20//! [dependencies]
21//! pure-magic = "0.1"  # Replace with the latest version
22//! ```
23//!
24//! Or add the latest version with cargo:
25//!
26//! ```sh
27//! cargo add pure-magic
28//! ```
29//!
30//! ## Quick Start
31//!
32//! ### Detect File Types Programmatically
33//! ```rust
34//! use pure_magic::{MagicDb, MagicSource};
35//! use std::fs::File;
36//!
37//! fn main() -> Result<(), Box<dyn std::error::Error>> {
38//!     let mut db = MagicDb::new();
39//!     // Create a MagicSource from a file
40//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
41//!     db.load(rust_magic);
42//!     // Verification is not mandatory
43//!     db.verify()?;
44//!
45//!     // Open a file and detect its type
46//!     let mut file = File::open("src/lib.rs")?;
47//!     let magic = db.first_magic(&mut file, None)?;
48//!
49//!     println!(
50//!         "File type: {} (MIME: {}, strength: {})",
51//!         magic.message(),
52//!         magic.mime_type(),
53//!         magic.strength()
54//!     );
55//!     Ok(())
56//! }
57//! ```
58//!
59//! ### Get All Matching Rules
60//! ```rust
61//! use pure_magic::{MagicDb, MagicSource};
62//! use std::fs::File;
63//!
64//! fn main() -> Result<(), Box<dyn std::error::Error>> {
65//!     let mut db = MagicDb::new();
66//!     // Create a MagicSource from a file
67//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
68//!     db.load(rust_magic);
69//!
70//!     // Open a file and detect its type
71//!     let mut file = File::open("src/lib.rs")?;
72//!
73//!     // Get all matching rules, sorted by strength
74//!     let magics = db.all_magics(&mut file)?;
75//!
76//!     // Must contain rust file magic and default text magic
77//!     assert!(magics.len() > 1);
78//!
79//!     for magic in magics {
80//!         println!(
81//!             "Match: {} (strength: {}, source: {})",
82//!             magic.message(),
83//!             magic.strength(),
84//!             magic.source().unwrap_or("unknown")
85//!         );
86//!     }
87//!     Ok(())
88//! }
89//! ```
90//!
91//! ### Serialize a Database to Disk
92//! ```rust
93//! use pure_magic::{MagicDb, MagicSource};
94//! use std::fs::File;
95//!
96//! fn main() -> Result<(), Box<dyn std::error::Error>> {
97//!     let mut db = MagicDb::new();
98//!     // Create a MagicSource from a file
99//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
100//!     db.load(rust_magic);
101//!
102//!     // Serialize the database to a file
103//!     let mut output = File::create("/tmp/compiled.db")?;
104//!     db.serialize(&mut output)?;
105//!
106//!     println!("Database saved to file");
107//!     Ok(())
108//! }
109//! ```
110//!
111//! ### Deserialize a Database
112//! ```rust
113//! use pure_magic::{MagicDb, MagicSource};
114//! use std::fs::File;
115//!
116//! fn main() -> Result<(), Box<dyn std::error::Error>> {
117//!     let mut db = MagicDb::new();
118//!     // Create a MagicSource from a file
119//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
120//!     db.load(rust_magic);
121//!
122//!     // Serialize the database in a vector
123//!     let mut ser = vec![];
124//!     db.serialize(&mut ser)?;
125//!     println!("Database saved to vector");
126//!
127//!     // We deserialize from slice
128//!     let db = MagicDb::deserialize(&mut ser.as_slice())?;
129//!
130//!     assert!(!db.rules().is_empty());
131//!
132//!     Ok(())
133//! }
134//! ```
135//!
136//! ## License
137//! This project is licensed under the **GPL-3.0 License**.
138//!
139//! ## Contributing
140//! Contributions are welcome! Open an issue or submit a pull request.
141//!
142//! ## Acknowledgments
143//! - Inspired by the original `libmagic` (part of the `file` command).
144
145use dyf::{DynDisplay, FormatString, dformat};
146use flagset::{FlagSet, flags};
147use flate2::{Compression, read::GzDecoder, write::GzEncoder};
148use lazy_cache::LazyCache;
149use memchr::memchr;
150use pest::{Span, error::ErrorVariant};
151use regex::bytes::{self};
152use serde::{Deserialize, Serialize};
153use std::{
154    borrow::Cow,
155    cmp::max,
156    collections::{HashMap, HashSet},
157    fmt::{self, Debug, Display},
158    io::{self, Read, Seek, SeekFrom, Write},
159    ops::{Add, BitAnd, BitOr, BitXor, Deref, Div, Mul, Rem, Sub},
160    path::Path,
161};
162use tar::Archive;
163use thiserror::Error;
164use tracing::{Level, debug, enabled, trace};
165
166use crate::{
167    numeric::{Float, FloatDataType, Scalar, ScalarDataType},
168    parser::{FileMagicParser, Rule},
169    utils::{
170        debug_string_from_vec_u8, debug_string_from_vec_u16, decode_id3, find_json_boundaries,
171        run_utf8_validation,
172    },
173};
174
175mod numeric;
176mod parser;
177mod utils;
178
// NOTE(review): presumably the strength and source label attached to results
// produced by built-in (non rule-file) detection — confirm at the use sites.
const HARDCODED_MAGIC_STRENGTH: u64 = 2048;
const HARDCODED_SOURCE: &str = "hardcoded";
// corresponds to the FILE_INDIR_MAX constant defined in libmagic
const MAX_RECURSION: usize = 50;
// constant found in libmagic; it is used as a limit for regex tests
const FILE_REGEX_MAX: usize = 8192;

/// Maximum number of bytes to read for search tests.
///
/// This constant is derived from `libmagic` and is used to limit the number of bytes
/// read during search tests to ensure performance and efficiency. The value is set
/// to 7 megabytes.
pub const FILE_BYTES_MAX: usize = 7 * 1024 * 1024;
/// Default mimetype for un-identified binary data
pub const DEFAULT_BIN_MIMETYPE: &str = "application/octet-stream";
/// Default mimetype for un-identified text data
pub const DEFAULT_TEXT_MIMETYPE: &str = "text/plain";

// strftime-style pattern rendering timestamps as `YYYY-MM-DD HH:MM:SS`
pub(crate) const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
198
// Panics with the given `panic!` arguments, but only in builds where
// `debug_assertions` is enabled. In other builds the branch is not taken and
// execution continues with whatever follows the macro call.
macro_rules! debug_panic {
    ($($arg:tt)*) => {
        if cfg!(debug_assertions) {
            panic!($($arg)*);
        }
    };
}
206
// Reads exactly `size_of::<$ty>()` bytes from `$r` into a zero-initialised
// stack array and evaluates to that array.
//
// The expansion uses `?` on `read_exact`, so it can only be used inside a
// function whose error type can be built from the reader's error.
macro_rules! read {
    ($r: expr, $ty: ty) => {{
        let mut a = [0u8; std::mem::size_of::<$ty>()];
        $r.read_exact(&mut a)?;
        a
    }};
}
214
// Reads a value of type `$ty` from `$r`, decoding its bytes as little-endian.
// Propagates read failures through the `?` embedded in `read!`.
macro_rules! read_le {
    ($r:expr, $ty: ty ) => {{ <$ty>::from_le_bytes(read!($r, $ty)) }};
}
218
// Reads a value of type `$ty` from `$r`, decoding its bytes as big-endian.
// Propagates read failures through the `?` embedded in `read!`.
macro_rules! read_be {
    ($r:expr, $ty: ty ) => {{ <$ty>::from_be_bytes(read!($r, $ty)) }};
}
222
// Reads a 32-bit "middle-endian" value from `$r` as an `i32`: two consecutive
// little-endian `u16` words are read, the first one forming the upper 16 bits
// and the second one the lower 16 bits (operands of `|` are evaluated left to
// right, so the reads happen in that order).
macro_rules! read_me {
    ($r: expr) => {{ ((read_le!($r, u16) as i32) << 16) | (read_le!($r, u16) as i32) }};
}
226
227#[inline(always)]
228fn read_octal_u64<R: Read + Seek>(haystack: &mut LazyCache<R>) -> Option<u64> {
229    let s = haystack
230        .read_while_or_limit(|b| matches!(b, b'0'..=b'7'), 22)
231        .map(|buf| str::from_utf8(buf))
232        .ok()?
233        .ok()?;
234
235    if !s.starts_with("0") {
236        return None;
237    }
238
239    u64::from_str_radix(s, 8).ok()
240}
241
/// Represents all possible errors that can occur during file type detection and processing.
///
/// Variants marked `#[from]` can be produced with `?` directly from the
/// wrapped error type.
#[derive(Debug, Error)]
pub enum Error {
    /// A generic error with a custom message.
    #[error("{0}")]
    Msg(String),

    /// Indicates a rule load failure.
    ///
    /// Fields are, in order: the source name, the line number and the
    /// underlying error.
    #[error("source={0} line={1} error={2}")]
    Verify(String, usize, Box<Error>),

    /// An error with a source location and a nested error.
    ///
    /// Fields are, in order: the source name, the line number and the
    /// underlying error. See [`Error::unwrap_localized`] to access the
    /// nested error.
    #[error("source={0} line={1} error={2}")]
    Localized(String, usize, Box<Error>),

    /// Indicates a required rule was not found.
    #[error("missing rule: {0}")]
    MissingRule(String),

    /// Indicates the maximum recursion depth was reached.
    #[error("maximum recursion reached: {0}")]
    MaximumRecursion(usize),

    /// Wraps an I/O error.
    #[error("io: {0}")]
    Io(#[from] io::Error),

    /// Wraps a parsing error from the `pest` parser.
    ///
    /// The error is boxed, so conversion through `?` applies to
    /// `Box<pest::error::Error<Rule>>`.
    #[error("parser error: {0}")]
    Parse(#[from] Box<pest::error::Error<Rule>>),

    /// Wraps a formatting error from the `dyf` crate.
    #[error("formatting: {0}")]
    Format(#[from] dyf::Error),

    /// Wraps a regex-related error.
    #[error("regex: {0}")]
    Regex(#[from] regex::Error),

    /// Wraps a serialization error from `bincode`.
    #[error("{0}")]
    Serialize(#[from] bincode::error::EncodeError),

    /// Wraps a deserialization error from `bincode`.
    #[error("{0}")]
    Deserialize(#[from] bincode::error::DecodeError),
}
289
290impl Error {
291    #[inline]
292    fn parser<S: ToString>(msg: S, span: Span<'_>) -> Self {
293        Self::Parse(Box::new(pest::error::Error::new_from_span(
294            ErrorVariant::CustomError {
295                message: msg.to_string(),
296            },
297            span,
298        )))
299    }
300
301    fn msg<M: AsRef<str>>(msg: M) -> Self {
302        Self::Msg(msg.as_ref().into())
303    }
304
305    fn localized<S: AsRef<str>>(source: S, line: usize, err: Error) -> Self {
306        Self::Localized(source.as_ref().into(), line, err.into())
307    }
308
309    /// Unwraps the localized error
310    pub fn unwrap_localized(&self) -> &Self {
311        match self {
312            Self::Localized(_, _, e) => e,
313            _ => self,
314        }
315    }
316}
317
// Message attached to a rule: either a plain string or a format string that
// can be rendered with a match result (see `Message::format_with`).
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Message {
    // literal text, rendered as is
    String(String),
    // text containing a format directive
    Format {
        // printf conversion specifier; `format_with` compares it against "c"
        // to decide whether a byte must be printed as a character
        printf_spec: String,
        // format string handed to `dformat!` when rendering
        fs: FormatString,
    },
}
326
327impl Display for Message {
328    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
329        match self {
330            Self::String(s) => write!(f, "{s}"),
331            Self::Format { printf_spec: _, fs } => write!(f, "{}", fs.to_string_lossy()),
332        }
333    }
334}
335
336impl Message {
337    fn to_string_lossy(&self) -> Cow<'_, str> {
338        match self {
339            Message::String(s) => Cow::Borrowed(s),
340            Message::Format { printf_spec: _, fs } => fs.to_string_lossy(),
341        }
342    }
343
344    #[inline(always)]
345    fn format_with(&self, mr: Option<&MatchRes>) -> Result<Cow<'_, str>, Error> {
346        match self {
347            Self::String(s) => Ok(Cow::Borrowed(s.as_str())),
348            Self::Format {
349                printf_spec: c_spec,
350                fs,
351            } => {
352                if let Some(mr) = mr {
353                    match mr {
354                        MatchRes::Float(_, _) | MatchRes::Bytes(_, _, _, _) => {
355                            Ok(Cow::Owned(dformat!(fs, mr)?))
356                        }
357                        MatchRes::Scalar(_, scalar) => {
358                            // we want to print a byte as char
359                            if c_spec.as_str() == "c" {
360                                match scalar {
361                                    Scalar::byte(b) => {
362                                        let b = (*b as u8) as char;
363                                        Ok(Cow::Owned(dformat!(fs, b)?))
364                                    }
365                                    Scalar::ubyte(b) => {
366                                        let b = *b as char;
367                                        Ok(Cow::Owned(dformat!(fs, b)?))
368                                    }
369                                    _ => Ok(Cow::Owned(dformat!(fs, mr)?)),
370                                }
371                            } else {
372                                Ok(Cow::Owned(dformat!(fs, mr)?))
373                            }
374                        }
375                    }
376                } else {
377                    Ok(fs.to_string_lossy())
378                }
379            }
380        }
381    }
382}
383
impl ScalarDataType {
    /// Reads one scalar of this data type from `from`.
    ///
    /// The width and byte order used for each variant are given by the table
    /// at the bottom of the function. When `switch_endianness` is `true`,
    /// every multi-byte read decodes with the opposite byte order.
    ///
    /// # Errors
    /// Propagates the I/O error raised when the bytes cannot be read (or, for
    /// `offset`, when the stream position cannot be queried).
    #[inline(always)]
    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Scalar, Error> {
        // little-endian read, flipped to big-endian when `switch_endianness` is set
        macro_rules! _read_le {
            ($ty: ty) => {{
                if switch_endianness {
                    <$ty>::from_be_bytes(read!(from, $ty))
                } else {
                    <$ty>::from_le_bytes(read!(from, $ty))
                }
            }};
        }

        // big-endian read, flipped to little-endian when `switch_endianness` is set
        macro_rules! _read_be {
            ($ty: ty) => {{
                if switch_endianness {
                    <$ty>::from_le_bytes(read!(from, $ty))
                } else {
                    <$ty>::from_be_bytes(read!(from, $ty))
                }
            }};
        }

        // native-endian read: picks the byte order of the compilation target,
        // still subject to `switch_endianness` through the macros above
        macro_rules! _read_ne {
            ($ty: ty) => {{
                if cfg!(target_endian = "big") {
                    _read_be!($ty)
                } else {
                    _read_le!($ty)
                }
            }};
        }

        // middle-endian 32-bit read: two 16-bit words, the first read forming
        // the upper half and the second the lower half
        macro_rules! _read_me {
            () => {
                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
            };
        }

        Ok(match self {
            // signed
            Self::byte => Scalar::byte(read!(from, u8)[0] as i8),
            Self::short => Scalar::short(_read_ne!(i16)),
            Self::long => Scalar::long(_read_ne!(i32)),
            Self::date => Scalar::date(_read_ne!(i32)),
            Self::ldate => Scalar::ldate(_read_ne!(i32)),
            Self::qwdate => Scalar::qwdate(_read_ne!(i64)),
            Self::leshort => Scalar::leshort(_read_le!(i16)),
            Self::lelong => Scalar::lelong(_read_le!(i32)),
            Self::lequad => Scalar::lequad(_read_le!(i64)),
            Self::bequad => Scalar::bequad(_read_be!(i64)),
            Self::belong => Scalar::belong(_read_be!(i32)),
            Self::bedate => Scalar::bedate(_read_be!(i32)),
            Self::beldate => Scalar::beldate(_read_be!(i32)),
            Self::beqdate => Scalar::beqdate(_read_be!(i64)),
            // unsigned
            Self::ubyte => Scalar::ubyte(read!(from, u8)[0]),
            Self::ushort => Scalar::ushort(_read_ne!(u16)),
            Self::uleshort => Scalar::uleshort(_read_le!(u16)),
            Self::ulelong => Scalar::ulelong(_read_le!(u32)),
            Self::uledate => Scalar::uledate(_read_le!(u32)),
            Self::ulequad => Scalar::ulequad(_read_le!(u64)),
            // no byte is consumed: the value is the current stream position
            Self::offset => Scalar::offset(from.stream_position()?),
            Self::ubequad => Scalar::ubequad(_read_be!(u64)),
            Self::medate => Scalar::medate(_read_me!()),
            Self::meldate => Scalar::meldate(_read_me!()),
            Self::melong => Scalar::melong(_read_me!()),
            Self::beshort => Scalar::beshort(_read_be!(i16)),
            Self::quad => Scalar::quad(_read_ne!(i64)),
            Self::uquad => Scalar::uquad(_read_ne!(u64)),
            Self::ledate => Scalar::ledate(_read_le!(i32)),
            Self::leldate => Scalar::leldate(_read_le!(i32)),
            Self::leqdate => Scalar::leqdate(_read_le!(i64)),
            Self::leqldate => Scalar::leqldate(_read_le!(i64)),
            Self::leqwdate => Scalar::leqwdate(_read_le!(i64)),
            Self::ubelong => Scalar::ubelong(_read_be!(u32)),
            Self::ulong => Scalar::ulong(_read_ne!(u32)),
            Self::ubeshort => Scalar::ubeshort(_read_be!(u16)),
            Self::ubeqdate => Scalar::ubeqdate(_read_be!(u64)),
            Self::lemsdosdate => Scalar::lemsdosdate(_read_le!(u16)),
            Self::lemsdostime => Scalar::lemsdostime(_read_le!(u16)),
            // always decoded big-endian, `switch_endianness` is not applied
            Self::guid => Scalar::guid(u128::from_be_bytes(read!(from, u128))),
        })
    }
}
469
470impl FloatDataType {
471    #[inline(always)]
472    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Float, Error> {
473        macro_rules! _read_le {
474            ($ty: ty) => {{
475                if switch_endianness {
476                    <$ty>::from_be_bytes(read!(from, $ty))
477                } else {
478                    <$ty>::from_le_bytes(read!(from, $ty))
479                }
480            }};
481        }
482
483        macro_rules! _read_be {
484            ($ty: ty) => {{
485                if switch_endianness {
486                    <$ty>::from_le_bytes(read!(from, $ty))
487                } else {
488                    <$ty>::from_be_bytes(read!(from, $ty))
489                }
490            }};
491        }
492
493        macro_rules! _read_ne {
494            ($ty: ty) => {{
495                if cfg!(target_endian = "big") {
496                    _read_be!($ty)
497                } else {
498                    _read_le!($ty)
499                }
500            }};
501        }
502
503        macro_rules! _read_me {
504            () => {
505                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
506            };
507        }
508
509        Ok(match self {
510            Self::lefloat => Float::lefloat(_read_le!(f32)),
511            Self::befloat => Float::befloat(_read_le!(f32)),
512            Self::ledouble => Float::ledouble(_read_le!(f64)),
513            Self::bedouble => Float::bedouble(_read_be!(f64)),
514        })
515    }
516}
517
// Arithmetic / bitwise operator applied by `ScalarTransform` and
// `FloatTransform`. The textual symbol of each variant is given by the
// `Display` implementation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Op {
    // `*`
    Mul,
    // `+`
    Add,
    // `-`
    Sub,
    // `/`
    Div,
    // `%`
    Mod,
    // `&`
    And,
    // `^`
    Xor,
    // `|`
    Or,
}
529
530impl Display for Op {
531    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
532        match self {
533            Op::Mul => write!(f, "*"),
534            Op::Add => write!(f, "+"),
535            Op::Sub => write!(f, "-"),
536            Op::Div => write!(f, "/"),
537            Op::Mod => write!(f, "%"),
538            Op::And => write!(f, "&"),
539            Op::Or => write!(f, "|"),
540            Op::Xor => write!(f, "^"),
541        }
542    }
543}
544
// Comparison operator of a test.
// NOTE(review): variants without an inline comment presumably map to the
// magic-file operators `=`, `<`, `>`, `&` and `^` — confirm in the parser.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum CmpOp {
    Eq,
    Lt,
    Gt,
    BitAnd,
    Neq, // ! operator
    Xor,
    Not, // ~ operator
}
555
556impl CmpOp {
557    #[inline(always)]
558    fn is_neq(&self) -> bool {
559        matches!(self, Self::Neq)
560    }
561}
562
// Transformation applied to a scalar: `value <op> num`
// (see `ScalarTransform::apply`).
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTransform {
    // operator to apply
    op: Op,
    // right-hand operand
    num: Scalar,
}
568
569impl ScalarTransform {
570    fn apply(&self, s: Scalar) -> Option<Scalar> {
571        match self.op {
572            Op::Add => s.checked_add(self.num),
573            Op::Sub => s.checked_sub(self.num),
574            Op::Mul => s.checked_mul(self.num),
575            Op::Div => s.checked_div(self.num),
576            Op::Mod => s.checked_rem(self.num),
577            Op::And => Some(s.bitand(self.num)),
578            Op::Xor => Some(s.bitxor(self.num)),
579            Op::Or => Some(s.bitor(self.num)),
580        }
581    }
582}
583
// Transformation applied to a float: `value <op> num`
// (see `FloatTransform::apply`).
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTransform {
    // operator to apply; bitwise operators are not supported for floats
    op: Op,
    // right-hand operand
    num: Float,
}
589
590impl FloatTransform {
591    fn apply(&self, s: Float) -> Float {
592        match self.op {
593            Op::Add => s.add(self.num),
594            Op::Sub => s.sub(self.num),
595            Op::Mul => s.mul(self.num),
596            // returns inf when div by 0
597            Op::Div => s.div(self.num),
598            // returns NaN when rem by 0
599            Op::Mod => s.rem(self.num),
600            // parser makes sure those operators cannot be used
601            Op::And | Op::Xor | Op::Or => {
602                debug_panic!("unsupported operation");
603                s
604            }
605        }
606    }
607}
608
// Value a test compares against: either a concrete value or a wildcard.
#[derive(Clone, Serialize, Deserialize)]
enum TestValue<T> {
    // concrete value to compare with
    Value(T),
    // wildcard, rendered as `ANY` by the `Debug` implementations
    Any,
}
614
615impl Debug for TestValue<Vec<u8>> {
616    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
617        match self {
618            Self::Value(v) => write!(f, "\"{}\"", debug_string_from_vec_u8(v)),
619            Self::Any => write!(f, "ANY"),
620        }
621    }
622}
623
624impl Debug for TestValue<Vec<u16>> {
625    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
626        match self {
627            Self::Value(v) => write!(f, "\"{}\"", debug_string_from_vec_u16(v)),
628            Self::Any => write!(f, "ANY"),
629        }
630    }
631}
632
633impl Debug for TestValue<Scalar> {
634    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
635        match self {
636            Self::Value(s) => write!(f, "{s:?}"),
637            Self::Any => write!(f, "ANY"),
638        }
639    }
640}
641
642impl Debug for TestValue<Float> {
643    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
644        match self {
645            Self::Value(fl) => write!(f, "{fl:?}"),
646            Self::Any => write!(f, "ANY"),
647        }
648    }
649}
650
651impl<T> TestValue<T> {
652    #[inline(always)]
653    fn as_ref(&self) -> TestValue<&T> {
654        match self {
655            Self::Value(v) => TestValue::Value(v),
656            Self::Any => TestValue::Any,
657        }
658    }
659}
660
// Modifiers attached to regex tests (also carried by search tests).
// NOTE(review): the exact effect of each flag is implemented outside of this
// declaration; the hints below only restate what the names suggest, except
// where the visible code confirms it.
flags! {
    enum ReMod: u8{
        CaseInsensitive,
        StartOffsetUpdate,
        LineLimit,
        // makes `RegexTest::is_binary` / `SearchTest::is_binary` report binary
        ForceBin,
        // makes `RegexTest::is_text` report text
        ForceText,
        TrimMatch,
    }
}
671
672fn serialize_regex<S>(re: &bytes::Regex, serializer: S) -> Result<S::Ok, S::Error>
673where
674    S: serde::Serializer,
675{
676    re.as_str().serialize(serializer)
677}
678
679fn deserialize_regex<'de, D>(deserializer: D) -> Result<bytes::Regex, D::Error>
680where
681    D: serde::Deserializer<'de>,
682{
683    let wrapper = String::deserialize(deserializer)?;
684    bytes::Regex::new(&wrapper).map_err(serde::de::Error::custom)
685}
686
// Regular expression test.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct RegexTest {
    // compiled pattern; (de)serialized through its pattern string
    #[serde(
        serialize_with = "serialize_regex",
        deserialize_with = "deserialize_regex"
    )]
    re: bytes::Regex,
    // optional limit; on text streams `match_buf` uses it as the maximum
    // number of lines scanned
    length: Option<usize>,
    // regex modifiers
    mods: FlagSet<ReMod>,
    // string modifiers
    str_mods: FlagSet<StringMod>,
    // NOTE(review): not used in the visible code — presumably the length of
    // the pattern once regex meta-characters are removed; confirm
    non_magic_len: usize,
    // true when the test must be considered binary (see `is_binary`)
    binary: bool,
    // comparison operator; `Neq` inverts the outcome of `match_buf`
    cmp_op: CmpOp,
}
701
702impl RegexTest {
703    #[inline(always)]
704    fn is_binary(&self) -> bool {
705        self.binary
706            || self.mods.contains(ReMod::ForceBin)
707            || self.str_mods.contains(StringMod::ForceBin)
708    }
709
710    #[inline(always)]
711    fn is_text(&self) -> bool {
712        self.mods.contains(ReMod::ForceText) || self.str_mods.contains(StringMod::ForceText)
713    }
714
715    fn match_buf<'buf>(
716        &self,
717        off_buf: u64, // absolute buffer offset in content
718        stream_kind: StreamKind,
719        buf: &'buf [u8],
720    ) -> Option<MatchRes<'buf>> {
721        let mr = match stream_kind {
722            StreamKind::Text(_) => {
723                let mut off_txt = off_buf;
724
725                let mut line_limit = self.length.unwrap_or(usize::MAX);
726
727                for line in buf.split(|c| c == &b'\n') {
728                    // we don't need to break on offset
729                    // limit as buf contains the good amount
730                    // of bytes to match against
731                    if line_limit == 0 {
732                        break;
733                    }
734
735                    if let Some(re_match) = self.re.find(line) {
736                        // the offset of the string is computed from the start of the buffer
737                        let start_offset = off_txt + re_match.start() as u64;
738
739                        // if we matched until EOL we need to add one to include the delimiter removed from the split
740                        let stop_offset = if re_match.end() == line.len() {
741                            Some(start_offset + re_match.as_bytes().len() as u64 + 1)
742                        } else {
743                            None
744                        };
745
746                        return Some(MatchRes::Bytes(
747                            start_offset,
748                            stop_offset,
749                            re_match.as_bytes(),
750                            Encoding::Utf8,
751                        ));
752                    }
753
754                    off_txt += line.len() as u64;
755                    // we have to add one because lines do not contain splitting character
756                    off_txt += 1;
757                    line_limit = line_limit.saturating_sub(1)
758                }
759                None
760            }
761
762            StreamKind::Binary => {
763                self.re.find(buf).map(|re_match| {
764                    MatchRes::Bytes(
765                        // the offset of the string is computed from the start of the buffer
766                        off_buf + re_match.start() as u64,
767                        None,
768                        re_match.as_bytes(),
769                        Encoding::Utf8,
770                    )
771                })
772            }
773        };
774
775        // handle the case where we want the regex not to match
776        if self.cmp_op.is_neq() && mr.is_none() {
777            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
778        }
779
780        mr
781    }
782}
783
784impl From<RegexTest> for Test {
785    fn from(value: RegexTest) -> Self {
786        Self::Regex(value)
787    }
788}
789
// Modifiers attached to string tests (also carried by regex and search
// tests). The hints below describe what `string_match` and the `is_binary` /
// `is_text` helpers do with each flag.
flags! {
    enum StringMod: u8{
        // makes the owning test report binary
        ForceBin,
        // upper case bytes of the pattern match both cases in the target
        UpperInsensitive,
        // lower case bytes of the pattern match both cases in the target
        LowerInsensitive,
        // the byte following the match, if any, must be an ASCII whitespace
        FullWordMatch,
        // NOTE(review): not handled by `string_match`; handled elsewhere
        Trim,
        // makes the owning test report text
        ForceText,
        // a run of blanks in the pattern matches a run at least as long in the target
        CompactWhitespace,
        // blanks are skipped on both the pattern and the target side
        OptBlank,
    }
}
802
// String test.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StringTest {
    // bytes to match, or wildcard
    test_val: TestValue<Vec<u8>>,
    // comparison operator
    cmp_op: CmpOp,
    // NOTE(review): optional length attached to the test; its use is not
    // visible in this part of the file
    length: Option<usize>,
    // string modifiers
    mods: FlagSet<StringMod>,
    // true when the test must be considered binary (see `is_binary`)
    binary: bool,
}
811
812impl From<StringTest> for Test {
813    fn from(value: StringTest) -> Self {
814        Self::String(value)
815    }
816}
817
/// Matches the pattern `str` against the beginning of `buf`, honouring `mods`.
///
/// Returns `(matched, consumed)` where `consumed` is the number of bytes of
/// `buf` the matcher went through.
///
/// When none of the case, full-word or whitespace modifiers is set, this is a
/// plain prefix comparison: `consumed` is `str.len()` on success and `0` on
/// failure. Otherwise pattern and target are walked byte by byte, applying
/// the modifiers in the order they are tested in the loop below.
#[inline(always)]
fn string_match(str: &[u8], mods: FlagSet<StringMod>, buf: &[u8]) -> (bool, usize) {
    let mut consumed = 0;
    // we can do a simple string comparison
    if mods.is_disjoint(
        StringMod::UpperInsensitive
            | StringMod::LowerInsensitive
            | StringMod::FullWordMatch
            | StringMod::CompactWhitespace
            | StringMod::OptBlank,
    ) {
        // we check if target starts with the pattern
        if buf.starts_with(str) {
            (true, str.len())
        } else {
            (false, consumed)
        }
    } else {
        // index of the next pattern byte to match
        let mut i_src = 0;
        let mut iter = buf.iter().peekable();

        // advances in the target by one byte, if any is left
        macro_rules! consume_target {
            () => {{
                if iter.next().is_some() {
                    consumed += 1;
                }
            }};
        }

        // current bytes matched: advances both target and pattern
        macro_rules! continue_next_iteration {
            () => {{
                consume_target!();
                i_src += 1;
                continue;
            }};
        }

        // the loop stops when either the target or the pattern is exhausted
        while let Some(&&b) = iter.peek() {
            let Some(&ref_byte) = str.get(i_src) else {
                break;
            };

            if mods.contains(StringMod::OptBlank) && (b == b' ' || ref_byte == b' ') {
                if b == b' ' {
                    // we ignore whitespace in target
                    consume_target!();
                }

                if ref_byte == b' ' {
                    // we ignore whitespace in test
                    i_src += 1;
                }

                continue;
            }

            if mods.contains(StringMod::UpperInsensitive) {
                //upper case characters in the magic match both lower and upper case characters in the target
                if ref_byte.is_ascii_uppercase() && ref_byte == b.to_ascii_uppercase()
                    || ref_byte == b
                {
                    continue_next_iteration!()
                }
            }

            // lower case characters in the magic match both lower and upper
            // case characters in the target
            if mods.contains(StringMod::LowerInsensitive)
                && (ref_byte.is_ascii_lowercase() && ref_byte == b.to_ascii_lowercase()
                    || ref_byte == b)
            {
                continue_next_iteration!()
            }

            if mods.contains(StringMod::CompactWhitespace) && ref_byte == b' ' {
                // count and skip the run of blanks in the pattern
                let mut src_blk = 0;
                while let Some(b' ') = str.get(i_src) {
                    src_blk += 1;
                    i_src += 1;
                }

                // count and skip the run of blanks in the target
                let mut tgt_blk = 0;
                while let Some(b' ') = iter.peek() {
                    tgt_blk += 1;
                    consume_target!();
                }

                // the target must hold at least as many blanks as the pattern
                if src_blk > tgt_blk {
                    return (false, consumed);
                }

                continue;
            }

            if ref_byte == b {
                continue_next_iteration!()
            } else {
                return (false, consumed);
            }
        }

        // a full word match requires the next target byte, if any, to be a whitespace
        if mods.contains(StringMod::FullWordMatch)
            && let Some(b) = iter.peek()
            && !b.is_ascii_whitespace()
        {
            return (false, consumed);
        }

        // success requires that something was consumed and that the whole
        // pattern was walked through
        (
            consumed > 0 && str.get(i_src).is_none() && consumed <= buf.len(),
            consumed,
        )
    }
}
930
931impl StringTest {
932    fn has_length_mod(&self) -> bool {
933        !self.mods.is_disjoint(
934            StringMod::UpperInsensitive
935                | StringMod::LowerInsensitive
936                | StringMod::FullWordMatch
937                | StringMod::CompactWhitespace
938                | StringMod::OptBlank,
939        )
940    }
941
942    #[inline(always)]
943    fn test_value_len(&self) -> usize {
944        match self.test_val.as_ref() {
945            TestValue::Value(s) => s.len(),
946            TestValue::Any => 0,
947        }
948    }
949
950    #[inline(always)]
951    fn is_binary(&self) -> bool {
952        self.binary || self.mods.contains(StringMod::ForceBin)
953    }
954
955    #[inline(always)]
956    fn is_text(&self) -> bool {
957        self.mods.contains(StringMod::ForceText)
958    }
959}
960
961#[derive(Clone, Serialize, Deserialize)]
962struct ByteVec(Vec<u8>);
963
964impl Debug for ByteVec {
965    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
966        write!(f, "\"{}\"", debug_string_from_vec_u8(self))
967    }
968}
969
970impl From<Vec<u8>> for ByteVec {
971    fn from(value: Vec<u8>) -> Self {
972        Self(value)
973    }
974}
975
976impl Deref for ByteVec {
977    type Target = Vec<u8>;
978
979    fn deref(&self) -> &Self::Target {
980        &self.0
981    }
982}
983
// Search test: looks for a string anywhere in a buffer.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SearchTest {
    // bytes to search for
    str: ByteVec,
    // optional bound on the start position of a candidate: `match_buf` stops
    // searching once the candidate index is greater than this value
    n_pos: Option<usize>,
    // string modifiers, forwarded to `string_match`
    str_mods: FlagSet<StringMod>,
    // regex modifiers
    re_mods: FlagSet<ReMod>,
    // true when the test must be considered binary (see `is_binary`)
    binary: bool,
    // comparison operator
    cmp_op: CmpOp,
}
993
994impl From<SearchTest> for Test {
995    fn from(value: SearchTest) -> Self {
996        Self::Search(value)
997    }
998}
999
1000impl SearchTest {
1001    #[inline(always)]
1002    fn is_binary(&self) -> bool {
1003        (self.binary
1004            || self.str_mods.contains(StringMod::ForceBin)
1005            || self.re_mods.contains(ReMod::ForceBin))
1006            && !(self.str_mods.contains(StringMod::ForceText)
1007                || self.re_mods.contains(ReMod::ForceText))
1008    }
1009
1010    // off_buf: absolute buffer offset in content
1011    #[inline]
1012    fn match_buf<'buf>(&self, off_buf: u64, buf: &'buf [u8]) -> Option<MatchRes<'buf>> {
1013        let mut i = 0;
1014
1015        let needle = self.str.first()?;
1016
1017        while i < buf.len() {
1018            // we cannot match if the first character isn't the same
1019            // so we accelerate the search by finding potential matches
1020            let Some(k) = memchr(*needle, &buf[i..]) else {
1021                break;
1022            };
1023
1024            i += k;
1025
1026            // if we want a full word match
1027            if self.str_mods.contains(StringMod::FullWordMatch) {
1028                let prev_is_whitespace = buf
1029                    .get(i.saturating_sub(1))
1030                    .map(|c| c.is_ascii_whitespace())
1031                    .unwrap_or_default();
1032
1033                // if it is not the first character
1034                // and its previous character isn't
1035                // a whitespace. It cannot be a
1036                // fullword match
1037                if i > 0 && !prev_is_whitespace {
1038                    i += 1;
1039                    continue;
1040                }
1041            }
1042
1043            if let Some(npos) = self.n_pos
1044                && i > npos
1045            {
1046                break;
1047            }
1048
1049            let pos = i;
1050            let (ok, consumed) = string_match(&self.str, self.str_mods, &buf[i..]);
1051
1052            if ok {
1053                return Some(MatchRes::Bytes(
1054                    off_buf.saturating_add(pos as u64),
1055                    None,
1056                    &buf[i..i + consumed],
1057                    Encoding::Utf8,
1058                ));
1059            } else {
1060                i += max(consumed, 1)
1061            }
1062        }
1063
1064        // handles the case where we want the string not to be found
1065        if self.cmp_op.is_neq() {
1066            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
1067        }
1068
1069        None
1070    }
1071}
1072
/// Test comparing an integer value read from the data with an expected
/// value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTest {
    // type of the scalar to read
    ty: ScalarDataType,
    // optional transformation applied to the value read before comparing
    transform: Option<ScalarTransform>,
    // operator used to compare the (transformed) value read with `test_val`
    cmp_op: CmpOp,
    // expected value, or `TestValue::Any` to accept whatever is read
    test_val: TestValue<Scalar>,
}
1080
/// Test comparing a floating point value read from the data with an
/// expected value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTest {
    // type of the float to read
    ty: FloatDataType,
    // optional transformation applied to the value read before comparing
    transform: Option<FloatTransform>,
    // operator used to compare the (transformed) value read with `test_val`
    cmp_op: CmpOp,
    // expected value, or `TestValue::Any` to accept whatever is read
    test_val: TestValue<Float>,
}
1088
// the value read from the haystack we want to match against
// 'buf is the lifetime of the buffer we are scanning
//
// The first field of every variant is the position of the haystack
// (as reported by `lazy_stream_position`) at which the value was read.
#[derive(PartialEq)]
enum ReadValue<'buf> {
    // floating point value
    Float(u64, Float),
    // integer value
    Scalar(u64, Scalar),
    // raw bytes, borrowed from the scanned buffer
    Bytes(u64, &'buf [u8]),
}
1097
1098impl<'buf> Debug for ReadValue<'buf> {
1099    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1100        match self {
1101            Self::Float(_, fl) => write!(f, "{fl:?}"),
1102            Self::Scalar(_, s) => write!(f, "{s:?}"),
1103            Self::Bytes(_, b) => {
1104                if b.len() <= 128 {
1105                    write!(f, "\"{}\"", debug_string_from_vec_u8(b))
1106                } else {
1107                    let limit = 128;
1108                    write!(
1109                        f,
1110                        "\"{}\" (first {limit} bytes)",
1111                        debug_string_from_vec_u8(&b[..limit])
1112                    )
1113                }
1114            }
1115        }
1116    }
1117}
1118
1119impl DynDisplay for ReadValue<'_> {
1120    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1121        match self {
1122            Self::Float(_, s) => DynDisplay::dyn_fmt(s, f),
1123            Self::Scalar(_, s) => DynDisplay::dyn_fmt(s, f),
1124            Self::Bytes(_, b) => Ok(format!("{b:?}")),
1125        }
1126    }
1127}
1128
1129impl DynDisplay for &ReadValue<'_> {
1130    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1131        // Dereference self to get the TestValue and call its fmt method
1132        DynDisplay::dyn_fmt(*self, f)
1133    }
1134}
1135
1136impl Display for ReadValue<'_> {
1137    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1138        match self {
1139            Self::Float(_, v) => write!(f, "{v}"),
1140            Self::Scalar(_, s) => write!(f, "{s}"),
1141            Self::Bytes(_, b) => write!(f, "{b:?}"),
1142        }
1143    }
1144}
1145
// Encoding of the bytes carried by a `MatchRes::Bytes`; it selects how
// the bytes are decoded into a string when the match is formatted.
enum Encoding {
    // UTF-16 with the given byte order
    Utf16(String16Encoding),
    // UTF-8 (decoded lossily when formatted)
    Utf8,
}
1150
// Carry the offset of the start of the data in the stream
// and the data itself
enum MatchRes<'buf> {
    // Bytes.0: offset of the match
    // Bytes.1: optional end of match (to address the need of EOL adjustment in string regex)
    // Bytes.2: the bytes matching
    // Bytes.3: encoding of the buffer
    Bytes(u64, Option<u64>, &'buf [u8], Encoding),
    // Scalar.0: offset of the match
    // Scalar.1: the integer value matching
    Scalar(u64, Scalar),
    // Float.0: offset of the match
    // Float.1: the floating point value matching
    Float(u64, Float),
}
1162
1163impl DynDisplay for &MatchRes<'_> {
1164    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1165        (*self).dyn_fmt(f)
1166    }
1167}
1168
1169impl DynDisplay for MatchRes<'_> {
1170    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1171        match self {
1172            Self::Scalar(_, v) => v.dyn_fmt(f),
1173            Self::Float(_, v) => v.dyn_fmt(f),
1174            Self::Bytes(_, _, v, enc) => match enc {
1175                Encoding::Utf8 => String::from_utf8_lossy(v).to_string().dyn_fmt(f),
1176                Encoding::Utf16(enc) => {
1177                    let utf16: Vec<u16> = slice_to_utf16_iter(v, *enc).collect();
1178                    String::from_utf16_lossy(&utf16).dyn_fmt(f)
1179                }
1180            },
1181        }
1182    }
1183}
1184
1185impl MatchRes<'_> {
1186    // start offset of the match
1187    #[inline]
1188    fn start_offset(&self) -> u64 {
1189        match self {
1190            MatchRes::Bytes(o, _, _, _) => *o,
1191            MatchRes::Scalar(o, _) => *o,
1192            MatchRes::Float(o, _) => *o,
1193        }
1194    }
1195
1196    // start offset of the match
1197    #[inline]
1198    fn end_offset(&self) -> u64 {
1199        match self {
1200            MatchRes::Bytes(start, end, buf, _) => match end {
1201                Some(end) => *end,
1202                None => start.saturating_add(buf.len() as u64),
1203            },
1204            MatchRes::Scalar(o, sc) => o.add(sc.size_of() as u64),
1205            MatchRes::Float(o, f) => o.add(f.size_of() as u64),
1206        }
1207    }
1208}
1209
1210fn slice_to_utf16_iter(read: &[u8], encoding: String16Encoding) -> impl Iterator<Item = u16> {
1211    let even = read
1212        .iter()
1213        .enumerate()
1214        .filter(|(i, _)| i % 2 == 0)
1215        .map(|t| t.1);
1216
1217    let odd = read
1218        .iter()
1219        .enumerate()
1220        .filter(|(i, _)| i % 2 != 0)
1221        .map(|t| t.1);
1222
1223    even.zip(odd).map(move |(e, o)| match encoding {
1224        String16Encoding::Le => u16::from_le_bytes([*e, *o]),
1225        String16Encoding::Be => u16::from_be_bytes([*e, *o]),
1226    })
1227}
1228
/// Byte order of the UTF-16 code units of a 16 bits string.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum String16Encoding {
    // little endian
    Le,
    // big endian
    Be,
}
1234
/// Test matching a UTF-16 encoded string.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct String16Test {
    // NOTE(review): presumably the string as written in the rule,
    // before conversion to UTF-16 — confirm against the parser
    orig: String,
    // expected UTF-16 code units, or `TestValue::Any` to accept any string
    test_val: TestValue<Vec<u16>>,
    // byte order used to decode the data read
    encoding: String16Encoding,
}
1241
1242impl String16Test {
1243    /// if the test value is a specific value this method returns
1244    /// the number of utf16 characters. To obtain the length in
1245    /// bytes the return value needs to be multiplied by two.
1246    #[inline(always)]
1247    fn test_value_len(&self) -> usize {
1248        match self.test_val.as_ref() {
1249            TestValue::Value(str16) => str16.len(),
1250            TestValue::Any => 0,
1251        }
1252    }
1253}
1254
// Modifier flags carried by an indirect test (`Test::Indirect`).
flags! {
    enum IndirectMod: u8{
        // NOTE(review): presumably makes the indirect offset relative —
        // confirm against the parser
        Relative,
    }
}

// Set of `IndirectMod` flags.
type IndirectMods = FlagSet<IndirectMod>;
1262
/// Width and byte order of the length prefix of a pstring. The trailing
/// letter on each variant is presumably the modifier letter used in magic
/// rules — confirm against the parser.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum PStringLen {
    // length prefix on one byte
    Byte,    // B
    // length prefix on two bytes, big endian
    ShortBe, // H
    // length prefix on two bytes, little endian
    ShortLe, // h
    // length prefix on four bytes, big endian
    LongBe,  // L
    // length prefix on four bytes, little endian
    LongLe,  // l
}
1271
1272impl PStringLen {
1273    #[inline(always)]
1274    const fn size_of_len(&self) -> usize {
1275        match self {
1276            PStringLen::Byte => 1,
1277            PStringLen::ShortBe => 2,
1278            PStringLen::ShortLe => 2,
1279            PStringLen::LongBe => 4,
1280            PStringLen::LongLe => 4,
1281        }
1282    }
1283}
1284
/// Test matching a length prefixed string (pstring).
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PStringTest {
    // width and byte order of the length prefix
    len: PStringLen,
    // expected string, or `TestValue::Any` to accept any string
    test_val: TestValue<Vec<u8>>,
    // when set, the length stored in the prefix also counts the bytes
    // of the prefix itself
    include_len: bool,
}
1291
1292impl PStringTest {
1293    #[inline]
1294    fn read<'cache, R: Read + Seek>(
1295        &self,
1296        haystack: &'cache mut LazyCache<R>,
1297    ) -> Result<Option<&'cache [u8]>, Error> {
1298        let mut len = match self.len {
1299            PStringLen::Byte => read_le!(haystack, u8) as u32,
1300            PStringLen::ShortBe => read_be!(haystack, u16) as u32,
1301            PStringLen::ShortLe => read_le!(haystack, u16) as u32,
1302            PStringLen::LongBe => read_be!(haystack, u32),
1303            PStringLen::LongLe => read_le!(haystack, u32),
1304        } as usize;
1305
1306        if self.include_len {
1307            len = len.saturating_sub(self.len.size_of_len())
1308        }
1309
1310        if let TestValue::Value(s) = self.test_val.as_ref()
1311            && len != s.len()
1312        {
1313            return Ok(None);
1314        }
1315
1316        let read = haystack.read_exact_count(len as u64)?;
1317
1318        Ok(Some(read))
1319    }
1320
1321    #[inline(always)]
1322    fn test_value_len(&self) -> usize {
1323        match self.test_val.as_ref() {
1324            TestValue::Value(s) => s.len(),
1325            TestValue::Any => 0,
1326        }
1327    }
1328}
1329
/// The test part of a magic rule.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Test {
    // `name` test, carrying a rule name
    Name(String),
    // `use` test: a flag and the name of the rule to use
    Use(bool, String),
    // integer comparison
    Scalar(ScalarTest),
    // floating point comparison
    Float(FloatTest),
    // string comparison at the test offset
    String(StringTest),
    // pattern search in the data read at the test offset
    Search(SearchTest),
    // length prefixed string comparison
    PString(PStringTest),
    // regular expression match
    Regex(RegexTest),
    // `indirect` test with its modifiers
    Indirect(FlagSet<IndirectMod>),
    // UTF-16 string comparison
    String16(String16Test),
    // FIXME: placeholder for strength computation
    #[allow(dead_code)]
    Der,
    // `clear` test
    Clear,
    // `default` test
    Default,
}
1348
1349impl Display for Test {
1350    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1351        match self {
1352            Test::Name(name) => write!(f, "name {name}"),
1353            Test::Use(flip, rule) => {
1354                if *flip {
1355                    write!(f, "use {rule}")
1356                } else {
1357                    write!(f, "use ^{rule}")
1358                }
1359            }
1360            Test::Scalar(st) => write!(f, "{st:?}"),
1361            Test::Float(ft) => write!(f, "{ft:?}"),
1362            Test::String(st) => write!(f, "{st:?}"),
1363            Test::Search(st) => write!(f, "{st:?}"),
1364            Test::PString(pt) => write!(f, "{pt:?}"),
1365            Test::Regex(rt) => write!(f, "{rt:?}"),
1366            Test::Indirect(fs) => write!(f, "indirect {fs:?}"),
1367            Test::String16(s16t) => write!(f, "{s16t:?}"),
1368            Test::Der => write!(f, "unimplemented der"),
1369            Test::Clear => write!(f, "clear"),
1370            Test::Default => write!(f, "default"),
1371        }
1372    }
1373}
1374
1375impl Test {
1376    // read the value to test from the haystack
1377    #[inline]
1378    fn read_test_value<'haystack, R: Read + Seek>(
1379        &self,
1380        haystack: &'haystack mut LazyCache<R>,
1381        switch_endianness: bool,
1382    ) -> Result<Option<ReadValue<'haystack>>, Error> {
1383        let test_value_offset = haystack.lazy_stream_position();
1384
1385        match self {
1386            Self::Scalar(t) => {
1387                t.ty.read(haystack, switch_endianness)
1388                    .map(|s| Some(ReadValue::Scalar(test_value_offset, s)))
1389            }
1390
1391            Self::Float(t) => {
1392                t.ty.read(haystack, switch_endianness)
1393                    .map(|f| Some(ReadValue::Float(test_value_offset, f)))
1394            }
1395            Self::String(t) => {
1396                match t.test_val.as_ref() {
1397                    TestValue::Value(str) => {
1398                        let buf = if let Some(length) = t.length {
1399                            // if there is a length specified
1400                            haystack.read_exact_count(length as u64)?
1401                        } else {
1402                            // no length specified we read until end of string
1403
1404                            match t.cmp_op {
1405                                CmpOp::Eq | CmpOp::Neq => {
1406                                    if !t.has_length_mod() {
1407                                        haystack.read_exact_count(str.len() as u64)?
1408                                    } else {
1409                                        haystack.read_count(FILE_BYTES_MAX as u64)?
1410                                    }
1411                                }
1412                                CmpOp::Lt | CmpOp::Gt => {
1413                                    let read =
1414                                        haystack.read_until_any_delim_or_limit(b"\n\0", 8092)?;
1415
1416                                    if read.ends_with(b"\0") || read.ends_with(b"\n") {
1417                                        &read[..read.len() - 1]
1418                                    } else {
1419                                        read
1420                                    }
1421                                }
1422                                _ => {
1423                                    return Err(Error::Msg(format!(
1424                                        "string test does not support {:?} operator",
1425                                        t.cmp_op
1426                                    )));
1427                                }
1428                            }
1429                        };
1430
1431                        Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1432                    }
1433                    TestValue::Any => {
1434                        let read = haystack.read_until_any_delim_or_limit(b"\0\n", 8192)?;
1435                        // we don't take last byte if it matches end of string
1436                        let bytes = if read.ends_with(b"\0") || read.ends_with(b"\n") {
1437                            &read[..read.len() - 1]
1438                        } else {
1439                            read
1440                        };
1441
1442                        Ok(Some(ReadValue::Bytes(test_value_offset, bytes)))
1443                    }
1444                }
1445            }
1446
1447            Self::String16(t) => {
1448                match t.test_val.as_ref() {
1449                    TestValue::Value(str16) => {
1450                        let read = haystack.read_exact_count((str16.len() * 2) as u64)?;
1451
1452                        Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1453                    }
1454                    TestValue::Any => {
1455                        let read = haystack.read_until_utf16_or_limit(b"\x00\x00", 8192)?;
1456
1457                        // we make sure we have an even number of elements
1458                        let end = if read.len() % 2 == 0 {
1459                            read.len()
1460                        } else {
1461                            // we decide to read anyway even though
1462                            // length isn't even
1463                            read.len().saturating_sub(1)
1464                        };
1465
1466                        Ok(Some(ReadValue::Bytes(test_value_offset, &read[..end])))
1467                    }
1468                }
1469            }
1470
1471            Self::PString(t) => {
1472                let Some(read) = t.read(haystack)? else {
1473                    return Ok(None);
1474                };
1475                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1476            }
1477
1478            Self::Search(_) => {
1479                let buf = haystack.read_count(FILE_BYTES_MAX as u64)?;
1480                Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1481            }
1482
1483            Self::Regex(r) => {
1484                let length = {
1485                    match r.length {
1486                        Some(len) => {
1487                            if r.mods.contains(ReMod::LineLimit) {
1488                                len * 80
1489                            } else {
1490                                len
1491                            }
1492                        }
1493
1494                        None => FILE_REGEX_MAX,
1495                    }
1496                };
1497
1498                let read = haystack.read_count(length as u64)?;
1499                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1500            }
1501
1502            Self::Name(_)
1503            | Self::Use(_, _)
1504            | Self::Indirect(_)
1505            | Self::Clear
1506            | Self::Default
1507            | Self::Der => Err(Error::msg("no value to read for this test")),
1508        }
1509    }
1510
1511    #[inline(always)]
1512    fn match_value<'s>(
1513        &'s self,
1514        tv: &ReadValue<'s>,
1515        stream_kind: StreamKind,
1516    ) -> Option<MatchRes<'s>> {
1517        match (self, tv) {
1518            (Self::Scalar(t), ReadValue::Scalar(o, ts)) => {
1519                let read_value: Scalar = match t.transform.as_ref() {
1520                    Some(t) => t.apply(*ts)?,
1521                    None => *ts,
1522                };
1523
1524                match t.test_val {
1525                    TestValue::Value(test_value) => {
1526                        let ok = match t.cmp_op {
1527                            // NOTE: this should not happen in practice because
1528                            // we convert it into Eq equivalent at parsing time
1529                            CmpOp::Not => read_value == !test_value,
1530                            CmpOp::Eq => read_value == test_value,
1531                            CmpOp::Lt => read_value < test_value,
1532                            CmpOp::Gt => read_value > test_value,
1533                            CmpOp::Neq => read_value != test_value,
1534                            CmpOp::BitAnd => read_value & test_value == test_value,
1535                            CmpOp::Xor => (read_value & test_value).is_zero(),
1536                        };
1537
1538                        if ok {
1539                            Some(MatchRes::Scalar(*o, read_value))
1540                        } else {
1541                            None
1542                        }
1543                    }
1544
1545                    TestValue::Any => Some(MatchRes::Scalar(*o, read_value)),
1546                }
1547            }
1548
1549            (Self::Float(t), ReadValue::Float(o, f)) => {
1550                let read_value: Float = t.transform.as_ref().map(|t| t.apply(*f)).unwrap_or(*f);
1551
1552                match t.test_val {
1553                    TestValue::Value(tf) => {
1554                        let ok = match t.cmp_op {
1555                            CmpOp::Eq => read_value == tf,
1556                            CmpOp::Lt => read_value < tf,
1557                            CmpOp::Gt => read_value > tf,
1558                            CmpOp::Neq => read_value != tf,
1559                            _ => {
1560                                // this should never be reached as we validate
1561                                // operator in parser
1562                                debug_panic!("unsupported float comparison");
1563                                debug!("unsupported float comparison");
1564                                false
1565                            }
1566                        };
1567
1568                        if ok {
1569                            Some(MatchRes::Float(*o, read_value))
1570                        } else {
1571                            None
1572                        }
1573                    }
1574                    TestValue::Any => Some(MatchRes::Float(*o, read_value)),
1575                }
1576            }
1577
1578            (Self::String(st), ReadValue::Bytes(o, buf)) => {
1579                macro_rules! trim_buf {
1580                    ($buf: expr) => {{
1581                        if st.mods.contains(StringMod::Trim) {
1582                            $buf.trim_ascii()
1583                        } else {
1584                            $buf
1585                        }
1586                    }};
1587                }
1588
1589                match st.test_val.as_ref() {
1590                    TestValue::Value(str) => {
1591                        match st.cmp_op {
1592                            CmpOp::Eq => {
1593                                if let (true, _) = string_match(str, st.mods, buf) {
1594                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1595                                } else {
1596                                    None
1597                                }
1598                            }
1599                            CmpOp::Neq => {
1600                                if let (false, _) = string_match(str, st.mods, buf) {
1601                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1602                                } else {
1603                                    None
1604                                }
1605                            }
1606                            CmpOp::Gt => {
1607                                if buf.len() > str.len() {
1608                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1609                                } else {
1610                                    None
1611                                }
1612                            }
1613                            CmpOp::Lt => {
1614                                if buf.len() < str.len() {
1615                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1616                                } else {
1617                                    None
1618                                }
1619                            }
1620
1621                            // unsupported for strings
1622                            _ => {
1623                                // this should never be reached as we validate
1624                                // operator in parser
1625                                debug_panic!("unsupported string comparison");
1626                                debug!("unsupported string comparison");
1627                                None
1628                            }
1629                        }
1630                    }
1631                    TestValue::Any => {
1632                        Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1633                    }
1634                }
1635            }
1636
1637            (Self::PString(m), ReadValue::Bytes(o, buf)) => match m.test_val.as_ref() {
1638                TestValue::Value(psv) => {
1639                    if buf == psv {
1640                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8))
1641                    } else {
1642                        None
1643                    }
1644                }
1645                TestValue::Any => Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8)),
1646            },
1647
1648            (Self::String16(t), ReadValue::Bytes(o, buf)) => {
1649                match t.test_val.as_ref() {
1650                    TestValue::Value(str16) => {
1651                        // strings cannot be equal
1652                        if str16.len() * 2 != buf.len() {
1653                            return None;
1654                        }
1655
1656                        // we check string equality
1657                        for (i, utf16_char) in slice_to_utf16_iter(buf, t.encoding).enumerate() {
1658                            if str16[i] != utf16_char {
1659                                return None;
1660                            }
1661                        }
1662
1663                        Some(MatchRes::Bytes(
1664                            *o,
1665                            None,
1666                            t.orig.as_bytes(),
1667                            Encoding::Utf16(t.encoding),
1668                        ))
1669                    }
1670
1671                    TestValue::Any => {
1672                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf16(t.encoding)))
1673                    }
1674                }
1675            }
1676
1677            (Self::Regex(r), ReadValue::Bytes(o, buf)) => r.match_buf(*o, stream_kind, buf),
1678
1679            (Self::Search(t), ReadValue::Bytes(o, buf)) => t.match_buf(*o, buf),
1680
1681            _ => None,
1682        }
1683    }
1684
1685    #[inline(always)]
1686    fn strength(&self) -> u64 {
1687        const MULT: usize = 10;
1688
1689        let mut out = 2 * MULT;
1690
1691        // FIXME: octal is missing but it is not used in practice ...
1692        match self {
1693            Test::Scalar(s) => {
1694                out += s.ty.type_size() * MULT;
1695            }
1696
1697            Test::Float(t) => {
1698                out += t.ty.type_size() * MULT;
1699            }
1700
1701            Test::String(t) => out += t.test_value_len().saturating_mul(MULT),
1702
1703            Test::PString(t) => out += t.test_value_len().saturating_mul(MULT),
1704
1705            Test::Search(s) => {
1706                // NOTE: this implementation deviates from what is in
1707                // C libmagic. The purpose of this implementation is to
1708                // minimize the difference between similar tests,
1709                // implemented differently (ex: string test VS very localized search test).
1710                let n_pos = s.n_pos.unwrap_or(FILE_BYTES_MAX);
1711
1712                match n_pos {
1713                    // a search on one line should be equivalent to a string match
1714                    0..=80 => out += s.str.len().saturating_mul(MULT),
1715                    // search on the first 3 lines gets a little penalty
1716                    81..=240 => out += s.str.len() * s.str.len().clamp(0, MULT - 2),
1717                    // a search on more than 3 lines isn't considered very accurate
1718                    _ => out += s.str.len(),
1719                }
1720            }
1721
1722            Test::Regex(r) => {
1723                // NOTE: this implementation deviates from what is in
1724                // C libmagic. The purpose of this implementation is to
1725                // minimize the difference between similar tests,
1726                // implemented differently (ex: string test VS very localized regex test).
1727
1728                // we divide length by the number of capture group
1729                // which gives us a value close to he average string
1730                // length match in the regex.
1731                let v = r.non_magic_len / r.re.captures_len();
1732
1733                let len = r
1734                    .length
1735                    .map(|l| {
1736                        if r.mods.contains(ReMod::LineLimit) {
1737                            l * 80
1738                        } else {
1739                            l
1740                        }
1741                    })
1742                    .unwrap_or(FILE_BYTES_MAX);
1743
1744                match len {
1745                    // a search on one line should be equivalent to a string match
1746                    0..=80 => out += v.saturating_mul(MULT),
1747                    // search on the first 3 lines gets a little penalty
1748                    81..=240 => out += v * v.clamp(0, MULT - 2),
1749                    // a search on more than 3 lines isn't considered very accurate
1750                    _ => out += v,
1751                }
1752            }
1753
1754            Test::String16(t) => {
1755                // NOTE: in libmagic the result is div by 2
1756                // but I GUESS it is because the len is expressed
1757                // in number bytes. In our case length is expressed
1758                // in number of u16 so we shouldn't divide.
1759                out += t.test_value_len().saturating_mul(MULT);
1760            }
1761
1762            Test::Der => out += MULT,
1763
1764            Test::Default | Test::Name(_) | Test::Use(_, _) | Test::Indirect(_) | Test::Clear => {
1765                return 0;
1766            }
1767        }
1768
1769        // matching any output gets penalty
1770        if self.is_match_any() {
1771            return 0;
1772        }
1773
1774        if let Some(op) = self.cmp_op() {
1775            match op {
1776                // matching almost any gets penalty
1777                CmpOp::Neq => out = 0,
1778                CmpOp::Eq | CmpOp::Not => out += MULT,
1779                CmpOp::Lt | CmpOp::Gt => out -= 2 * MULT,
1780                CmpOp::Xor | CmpOp::BitAnd => out -= MULT,
1781            }
1782        }
1783
1784        out as u64
1785    }
1786
1787    #[inline(always)]
1788    fn cmp_op(&self) -> Option<CmpOp> {
1789        match self {
1790            Self::String(t) => Some(t.cmp_op),
1791            Self::Scalar(s) => Some(s.cmp_op),
1792            Self::Float(t) => Some(t.cmp_op),
1793            Self::Name(_)
1794            | Self::Use(_, _)
1795            | Self::Search(_)
1796            | Self::PString(_)
1797            | Self::Regex(_)
1798            | Self::Clear
1799            | Self::Default
1800            | Self::Indirect(_)
1801            | Self::String16(_)
1802            | Self::Der => None,
1803        }
1804    }
1805
1806    #[inline(always)]
1807    fn is_recursive(&self) -> bool {
1808        matches!(self, Test::Use(_, _) | Test::Indirect(_))
1809    }
1810
1811    #[inline(always)]
1812    fn is_match_any(&self) -> bool {
1813        match self {
1814            Test::Name(_) => false,
1815            Test::Use(_, _) => false,
1816            Test::Scalar(scalar_test) => matches!(scalar_test.test_val, TestValue::Any),
1817            Test::Float(float_test) => matches!(float_test.test_val, TestValue::Any),
1818            Test::String(string_test) => matches!(string_test.test_val, TestValue::Any),
1819            Test::Search(_) => false,
1820            Test::PString(pstring_test) => matches!(pstring_test.test_val, TestValue::Any),
1821            Test::Regex(_) => false,
1822            Test::Indirect(_) => false,
1823            Test::String16(string16_test) => matches!(string16_test.test_val, TestValue::Any),
1824            Test::Der => false,
1825            Test::Clear => false,
1826            Test::Default => false,
1827        }
1828    }
1829
1830    #[inline(always)]
1831    fn is_binary(&self) -> bool {
1832        match self {
1833            Self::Name(_) => true,
1834            Self::Use(_, _) => true,
1835            Self::Scalar(_) => true,
1836            Self::Float(_) => true,
1837            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1838            Self::Search(t) => t.is_binary(),
1839            Self::PString(_) => true,
1840            Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1841            Self::Clear => true,
1842            Self::Default => true,
1843            Self::Indirect(_) => true,
1844            Self::String16(_) => true,
1845            Self::Der => true,
1846        }
1847    }
1848
1849    #[inline(always)]
1850    fn is_text(&self) -> bool {
1851        match self {
1852            Self::Name(_) => true,
1853            Self::Use(_, _) => true,
1854            Self::Indirect(_) => true,
1855            Self::Clear => true,
1856            Self::Default => true,
1857            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1858            Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1859            _ => !self.is_binary(),
1860        }
1861    }
1862
1863    #[inline(always)]
1864    fn is_only_text(&self) -> bool {
1865        self.is_text() && !self.is_binary()
1866    }
1867
1868    #[inline(always)]
1869    fn is_only_binary(&self) -> bool {
1870        self.is_binary() && !self.is_text()
1871    }
1872}
1873
1874#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
1875enum OffsetType {
1876    Byte,
1877    DoubleLe,
1878    DoubleBe,
1879    ShortLe,
1880    ShortBe,
1881    Id3Le,
1882    Id3Be,
1883    LongLe,
1884    LongBe,
1885    Middle,
1886    Octal,
1887    QuadBe,
1888    QuadLe,
1889}
1890
1891#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
1892enum Shift {
1893    Direct(u64),
1894    Indirect(i64),
1895}
1896
1897#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
1898struct IndOffset {
1899    // where to find the offset
1900    off_addr: DirOffset,
1901    // signed or unsigned
1902    signed: bool,
1903    // type of the offset
1904    ty: OffsetType,
1905    op: Option<Op>,
1906    shift: Option<Shift>,
1907}
1908
1909impl IndOffset {
1910    // if we overflow we must not return an offset
1911    fn read_offset<R: Read + Seek>(
1912        &self,
1913        haystack: &mut LazyCache<R>,
1914        rule_base_offset: Option<u64>,
1915        last_upper_match_offset: Option<u64>,
1916    ) -> Result<Option<u64>, io::Error> {
1917        let offset_address = match self.off_addr {
1918            DirOffset::Start(s) => {
1919                let Some(o) = s.checked_add(rule_base_offset.unwrap_or_default()) else {
1920                    return Ok(None);
1921                };
1922
1923                haystack.seek(SeekFrom::Start(o))?
1924            }
1925            DirOffset::LastUpper(c) => haystack.seek(SeekFrom::Start(
1926                (last_upper_match_offset.unwrap_or_default() as i64 + c) as u64,
1927            ))?,
1928            DirOffset::End(e) => haystack.seek(SeekFrom::End(e))?,
1929        };
1930
1931        macro_rules! read_value {
1932            () => {
1933                match self.ty {
1934                    OffsetType::Byte => {
1935                        if self.signed {
1936                            read_le!(haystack, u8) as u64
1937                        } else {
1938                            read_le!(haystack, i8) as u64
1939                        }
1940                    }
1941                    OffsetType::DoubleLe => read_le!(haystack, f64) as u64,
1942                    OffsetType::DoubleBe => read_be!(haystack, f64) as u64,
1943                    OffsetType::ShortLe => {
1944                        if self.signed {
1945                            read_le!(haystack, i16) as u64
1946                        } else {
1947                            read_le!(haystack, u16) as u64
1948                        }
1949                    }
1950                    OffsetType::ShortBe => {
1951                        if self.signed {
1952                            read_be!(haystack, i16) as u64
1953                        } else {
1954                            read_be!(haystack, u16) as u64
1955                        }
1956                    }
1957                    OffsetType::Id3Le => decode_id3(read_le!(haystack, u32)) as u64,
1958                    OffsetType::Id3Be => decode_id3(read_be!(haystack, u32)) as u64,
1959                    OffsetType::LongLe => {
1960                        if self.signed {
1961                            read_le!(haystack, i32) as u64
1962                        } else {
1963                            read_le!(haystack, u32) as u64
1964                        }
1965                    }
1966                    OffsetType::LongBe => {
1967                        if self.signed {
1968                            read_be!(haystack, i32) as u64
1969                        } else {
1970                            read_be!(haystack, u32) as u64
1971                        }
1972                    }
1973                    OffsetType::Middle => read_me!(haystack) as u64,
1974                    OffsetType::Octal => {
1975                        if let Some(o) = read_octal_u64(haystack) {
1976                            o
1977                        } else {
1978                            debug!("failed to read octal offset @ {offset_address}");
1979                            return Ok(None);
1980                        }
1981                    }
1982                    OffsetType::QuadLe => {
1983                        if self.signed {
1984                            read_le!(haystack, i64) as u64
1985                        } else {
1986                            read_le!(haystack, u64)
1987                        }
1988                    }
1989                    OffsetType::QuadBe => {
1990                        if self.signed {
1991                            read_be!(haystack, i64) as u64
1992                        } else {
1993                            read_be!(haystack, u64)
1994                        }
1995                    }
1996                }
1997            };
1998        }
1999
2000        // in theory every offset read should end up in something seekable from start, so we can use u64 to store the result
2001        let o = read_value!();
2002
2003        trace!(
2004            "offset read @ {offset_address} value={o} op={:?} shift={:?}",
2005            self.op, self.shift
2006        );
2007
2008        // apply transformation
2009        if let (Some(op), Some(shift)) = (self.op, self.shift) {
2010            let shift = match shift {
2011                Shift::Direct(i) => i,
2012                Shift::Indirect(i) => {
2013                    let tmp = offset_address as i128 + i as i128;
2014                    if tmp.is_negative() {
2015                        return Ok(None);
2016                    } else {
2017                        haystack.seek(SeekFrom::Start(tmp as u64))?;
2018                    };
2019                    // NOTE: here we assume that the shift has the same
2020                    // type as the main offset !
2021                    read_value!()
2022                }
2023            };
2024
2025            match op {
2026                Op::Add => return Ok(o.checked_add(shift)),
2027                Op::Mul => return Ok(o.checked_mul(shift)),
2028                Op::Sub => return Ok(o.checked_sub(shift)),
2029                Op::Div => return Ok(o.checked_div(shift)),
2030                Op::Mod => return Ok(o.checked_rem(shift)),
2031                Op::And => return Ok(Some(o & shift)),
2032                Op::Or => return Ok(Some(o | shift)),
2033                Op::Xor => return Ok(Some(o ^ shift)),
2034            }
2035        }
2036
2037        Ok(Some(o))
2038    }
2039}
2040
2041#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
2042enum DirOffset {
2043    Start(u64),
2044    // relative to the last up-level field
2045    LastUpper(i64),
2046    End(i64),
2047}
2048
2049#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
2050enum Offset {
2051    Direct(DirOffset),
2052    Indirect(IndOffset),
2053}
2054
2055impl From<DirOffset> for Offset {
2056    fn from(value: DirOffset) -> Self {
2057        Self::Direct(value)
2058    }
2059}
2060
2061impl From<IndOffset> for Offset {
2062    fn from(value: IndOffset) -> Self {
2063        Self::Indirect(value)
2064    }
2065}
2066
2067impl Display for DirOffset {
2068    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2069        match self {
2070            DirOffset::Start(i) => write!(f, "{i}"),
2071            DirOffset::LastUpper(c) => write!(f, "&{c}"),
2072            DirOffset::End(e) => write!(f, "-{e}"),
2073        }
2074    }
2075}
2076
2077impl Default for DirOffset {
2078    fn default() -> Self {
2079        Self::LastUpper(0)
2080    }
2081}
2082
2083#[derive(Debug, Clone, Serialize, Deserialize)]
2084struct Match {
2085    line: usize,
2086    depth: u8,
2087    offset: Offset,
2088    test: Test,
2089    test_strength: u64,
2090    message: Option<Message>,
2091}
2092
2093impl From<Use> for Match {
2094    fn from(value: Use) -> Self {
2095        let test = Test::Use(value.switch_endianness, value.rule_name);
2096        let test_strength = test.strength();
2097        Self {
2098            line: value.line,
2099            depth: value.depth,
2100            offset: value.start_offset,
2101            test,
2102            test_strength,
2103            message: value.message,
2104        }
2105    }
2106}
2107
2108impl From<Name> for Match {
2109    fn from(value: Name) -> Self {
2110        let test = Test::Name(value.name);
2111        let test_strength = test.strength();
2112        Self {
2113            line: value.line,
2114            depth: 0,
2115            offset: Offset::Direct(DirOffset::Start(0)),
2116            test,
2117            test_strength,
2118            message: value.message,
2119        }
2120    }
2121}
2122
2123impl Match {
2124    /// Turns the `Match`'s offset into an absolute offset from the start of the stream
2125    #[inline(always)]
2126    fn offset_from_start<R: Read + Seek>(
2127        &self,
2128        haystack: &mut LazyCache<R>,
2129        rule_base_offset: Option<u64>,
2130        last_level_offset: Option<u64>,
2131    ) -> Result<Option<u64>, io::Error> {
2132        match self.offset {
2133            Offset::Direct(dir_offset) => match dir_offset {
2134                DirOffset::Start(s) => Ok(Some(s)),
2135                DirOffset::LastUpper(shift) => {
2136                    let o = last_level_offset.unwrap_or_default() as i64 + shift;
2137
2138                    if o >= 0 { Ok(Some(o as u64)) } else { Ok(None) }
2139                }
2140                DirOffset::End(e) => Ok(Some(haystack.offset_from_start(SeekFrom::End(e)))),
2141            },
2142            Offset::Indirect(ind_offset) => {
2143                let Some(o) =
2144                    ind_offset.read_offset(haystack, rule_base_offset, last_level_offset)?
2145                else {
2146                    return Ok(None);
2147                };
2148
2149                Ok(Some(o))
2150            }
2151        }
2152    }
2153
2154    /// this method emulates the buffer based matching
2155    /// logic implemented in libmagic. It needs some aweful
2156    /// and weird offset convertions to turn buffer
2157    /// relative offsets (libmagic is based on) into
2158    /// absolute offset in the file.
2159    ///
2160    /// this method shoud bubble up only critical errors
2161    /// all the other errors should make the match result
2162    /// false and be logged via debug!
2163    ///
2164    /// the function returns an error if the maximum recursion
2165    /// has been reached or if a dependency rule is missing.
2166    #[inline]
2167    #[allow(clippy::too_many_arguments)]
2168    fn matches<'a: 'h, 'h, R: Read + Seek>(
2169        &'a self,
2170        source: Option<&str>,
2171        magic: &mut Magic<'a>,
2172        stream_kind: StreamKind,
2173        state: &mut MatchState,
2174        buf_base_offset: Option<u64>,
2175        rule_base_offset: Option<u64>,
2176        last_level_offset: Option<u64>,
2177        haystack: &'h mut LazyCache<R>,
2178        switch_endianness: bool,
2179        db: &'a MagicDb,
2180        depth: usize,
2181    ) -> Result<(bool, Option<MatchRes<'h>>), Error> {
2182        let source = source.unwrap_or("unknown");
2183        let line = self.line;
2184
2185        if depth >= MAX_RECURSION {
2186            return Err(Error::localized(
2187                source,
2188                line,
2189                Error::MaximumRecursion(MAX_RECURSION),
2190            ));
2191        }
2192
2193        if self.test.is_only_binary() && stream_kind.is_text() {
2194            trace!("skip binary test source={source} line={line} stream_kind={stream_kind:?}",);
2195            return Ok((false, None));
2196        }
2197
2198        if self.test.is_only_text() && !stream_kind.is_text() {
2199            trace!("skip text test source={source} line={line} stream_kind={stream_kind:?}",);
2200            return Ok((false, None));
2201        }
2202
2203        let Ok(Some(mut offset)) = self
2204            .offset_from_start(haystack, rule_base_offset, last_level_offset)
2205            .inspect_err(|e| debug!("source={source} line={line} failed at computing offset: {e}"))
2206        else {
2207            return Ok((false, None));
2208        };
2209
2210        offset = match self.offset {
2211            Offset::Indirect(_) => {
2212                // the result we get for an indirect offset
2213                // is relative to the start of the libmagic
2214                // buffer so we need to add base to make it
2215                // absolute.
2216                buf_base_offset.unwrap_or_default().saturating_add(offset)
2217            }
2218            // offset from start are computed from rule base
2219            Offset::Direct(DirOffset::Start(_)) => {
2220                rule_base_offset.unwrap_or_default().saturating_add(offset)
2221            }
2222            _ => offset,
2223        };
2224
2225        match &self.test {
2226            Test::Clear => {
2227                trace!("source={source} line={line} clear");
2228                state.clear_continuation_level(&self.continuation_level());
2229                Ok((true, None))
2230            }
2231
2232            Test::Name(name) => {
2233                trace!(
2234                    "source={source} line={line} running rule {name} switch_endianness={switch_endianness}",
2235                );
2236                Ok((true, None))
2237            }
2238
2239            Test::Use(flip_endianness, rule_name) => {
2240                trace!(
2241                    "source={source} line={line} use {rule_name} switch_endianness={flip_endianness}",
2242                );
2243
2244                // switch_endianness must propagate down the rule call stack
2245                let switch_endianness = switch_endianness ^ flip_endianness;
2246
2247                let dr: &DependencyRule = db.dependencies.get(rule_name).ok_or(
2248                    Error::localized(source, line, Error::MissingRule(rule_name.clone())),
2249                )?;
2250
2251                // we push the message here otherwise we push message in depth first
2252                if let Some(msg) = self.message.as_ref() {
2253                    magic.push_message(msg.to_string_lossy());
2254                }
2255
2256                let nmatch = dr.rule.magic(
2257                    magic,
2258                    stream_kind,
2259                    buf_base_offset,
2260                    Some(offset),
2261                    haystack,
2262                    db,
2263                    switch_endianness,
2264                    depth.saturating_add(1),
2265                )?;
2266
2267                // The name is always true, so we consider there to be a match
2268                // if more than one test succeeded
2269                let matched = nmatch > 0;
2270                if matched {
2271                    state.set_continuation_level(self.continuation_level());
2272                }
2273
2274                Ok((matched, None))
2275            }
2276
2277            Test::Indirect(m) => {
2278                trace!(
2279                    "source={source} line={line} indirect mods={:?} offset={offset:#x}",
2280                    m
2281                );
2282
2283                let new_buf_base_off = if m.contains(IndirectMod::Relative) {
2284                    Some(offset)
2285                } else {
2286                    None
2287                };
2288
2289                // we push the message here otherwise we push message in depth first
2290                if let Some(msg) = self.message.as_ref() {
2291                    magic.push_message(msg.to_string_lossy());
2292                }
2293
2294                let mut nmatch = 0u64;
2295                for r in db.rules.iter() {
2296                    nmatch = nmatch.saturating_add(r.magic(
2297                        magic,
2298                        stream_kind,
2299                        new_buf_base_off,
2300                        Some(offset),
2301                        haystack,
2302                        db,
2303                        false,
2304                        depth.saturating_add(1),
2305                    )?);
2306
2307                    if nmatch > 0 {
2308                        break;
2309                    }
2310                }
2311
2312                Ok((nmatch > 0, None))
2313            }
2314
2315            Test::Default => {
2316                // default matches if nothing else at the continuation level matched
2317                let ok = !state.get_continuation_level(&self.continuation_level());
2318
2319                trace!("source={source} line={line} default match={ok}");
2320                if ok {
2321                    state.set_continuation_level(self.continuation_level());
2322                }
2323
2324                Ok((ok, None))
2325            }
2326
2327            _ => {
2328                if let Err(e) = haystack.seek(SeekFrom::Start(offset)) {
2329                    debug!("source={source} line={line} failed to seek in haystack: {e}");
2330                    return Ok((false, None));
2331                }
2332
2333                let mut trace_msg = None;
2334
2335                if enabled!(Level::DEBUG) {
2336                    trace_msg = Some(vec![format!(
2337                        "source={source} line={line} depth={} stream_offset={:#x}",
2338                        self.depth,
2339                        haystack.lazy_stream_position()
2340                    )])
2341                }
2342
2343                // NOTE: we may have a way to optimize here. In case we do a Any
2344                // test and we don't use the value to format the message, we don't
2345                // need to read the value.
2346                if let Ok(opt_test_value) = self
2347                    .test
2348                    .read_test_value(haystack, switch_endianness)
2349                    .inspect_err(|e| {
2350                        debug!("source={source} line={line} error while reading test value @{offset}: {e}",)
2351                    })
2352                {
2353                    if let Some(v) = trace_msg
2354                        .as_mut() { v.push(format!("test={}", self.test)) }
2355
2356                    if let Some(v) = trace_msg.as_mut(){
2357                        let drv = match opt_test_value.as_ref(){
2358                            Some(r) => format!("{r:?}"),
2359                            None =>String::new(),
2360                        };
2361                        v.push(format!("read_in_stream={drv}"))
2362                    }
2363
2364                    let match_res =
2365                        opt_test_value.and_then(|tv| self.test.match_value(&tv, stream_kind));
2366
2367                    if let Some(v) = trace_msg.as_mut() { v.push(format!(
2368                            "message=\"{}\" match={}",
2369                            self.message
2370                                .as_ref()
2371                                .map(|fs| fs.to_string_lossy())
2372                                .unwrap_or_default(),
2373                            match_res.is_some()
2374                        )) }
2375
2376                    // trace message
2377                    if enabled!(Level::DEBUG) && !enabled!(Level::TRACE) && match_res.is_some() {
2378                        if let Some(m) = trace_msg{
2379                            debug!("{}", m.join(" "));
2380                        }
2381                    } else if enabled!(Level::TRACE)
2382                        && let Some(m) = trace_msg{
2383                            trace!("{}", m.join(" "));
2384                        }
2385
2386                    if let Some(mr) = match_res {
2387                        state.set_continuation_level(self.continuation_level());
2388                        return Ok((true, Some(mr)));
2389                    }
2390                }
2391
2392                Ok((false, None))
2393            }
2394        }
2395    }
2396
2397    #[inline(always)]
2398    fn continuation_level(&self) -> ContinuationLevel {
2399        ContinuationLevel(self.depth)
2400    }
2401}
2402
2403#[derive(Debug, Clone)]
2404struct Use {
2405    line: usize,
2406    depth: u8,
2407    start_offset: Offset,
2408    rule_name: String,
2409    switch_endianness: bool,
2410    message: Option<Message>,
2411}
2412
2413#[derive(Debug, Clone, Serialize, Deserialize)]
2414struct StrengthMod {
2415    op: Op,
2416    by: u8,
2417}
2418
2419impl StrengthMod {
2420    #[inline(always)]
2421    fn apply(&self, strength: u64) -> u64 {
2422        let by = self.by as u64;
2423        debug!("applying strength modifier: {strength} {} {}", self.op, by);
2424        match self.op {
2425            Op::Mul => strength.saturating_mul(by),
2426            Op::Add => strength.saturating_add(by),
2427            Op::Sub => strength.saturating_sub(by),
2428            Op::Div => {
2429                if by > 0 {
2430                    strength.saturating_div(by)
2431                } else {
2432                    strength
2433                }
2434            }
2435            Op::Mod => strength % by,
2436            Op::And => strength & by,
2437            // this should never happen as strength operators
2438            // are enforced by our parser
2439            Op::Xor | Op::Or => {
2440                debug_panic!("unsupported strength operator");
2441                strength
2442            }
2443        }
2444    }
2445}
2446
2447#[derive(Debug, Clone)]
2448enum Flag {
2449    Mime(String),
2450    Ext(HashSet<String>),
2451    Strength(StrengthMod),
2452    Apple(String),
2453}
2454
2455#[derive(Debug, Clone)]
2456struct Name {
2457    line: usize,
2458    name: String,
2459    message: Option<Message>,
2460}
2461
2462#[derive(Debug, Clone)]
2463enum Entry<'span> {
2464    Match(Span<'span>, Match),
2465    Flag(Span<'span>, Flag),
2466}
2467
2468#[derive(Debug, Clone, Serialize, Deserialize)]
2469struct EntryNode {
2470    root: bool,
2471    entry: Match,
2472    children: Vec<EntryNode>,
2473    mimetype: Option<String>,
2474    apple: Option<String>,
2475    strength_mod: Option<StrengthMod>,
2476    exts: HashSet<String>,
2477}
2478
/// Accumulator filled while walking a tree of [`EntryNode`].
#[derive(Debug, Default)]
struct EntryNodeVisitor {
    /// every extension found on the visited nodes
    exts: HashSet<String>,
    /// score accumulated from the strength of the visited nodes
    score: u64,
}
2484
2485impl EntryNodeVisitor {
2486    fn new() -> Self {
2487        Self {
2488            ..Default::default()
2489        }
2490    }
2491
2492    fn merge(&mut self, other: Self) {
2493        self.exts.extend(other.exts);
2494        self.score += other.score;
2495    }
2496}
2497
2498impl EntryNode {
2499    #[inline]
2500    fn update_visitor(&self, v: &mut EntryNodeVisitor, depth: usize) {
2501        // update extensions
2502        for ext in self.exts.iter() {
2503            if !v.exts.contains(ext) {
2504                v.exts.insert(ext.clone());
2505            }
2506        }
2507
2508        // update score if depth
2509        if depth == 0 {
2510            v.score += self.entry.test_strength;
2511        }
2512
2513        // Tests at deeper levels contribute less to the overall score.
2514        // We use the minimum value to establish a lower bound for the rule's score,
2515        // which helps prioritize rules based on their importance.
2516        v.score += self
2517            .children
2518            .iter()
2519            .map(|e| e.entry.test_strength)
2520            .min()
2521            .unwrap_or_default()
2522            / max(1, depth as u64);
2523    }
2524
2525    fn visit(
2526        &self,
2527        v: &mut EntryNodeVisitor,
2528        deps: &HashMap<String, DependencyRule>,
2529        marked: &mut HashSet<String>,
2530        depth: usize,
2531    ) -> Result<(), Error> {
2532        // updating visitor
2533        self.update_visitor(v, depth);
2534
2535        // recursively visiting
2536        for c in self.children.iter() {
2537            if let Test::Use(_, ref name) = c.entry.test {
2538                if marked.contains(name) {
2539                    continue;
2540                }
2541
2542                marked.insert(name.clone());
2543
2544                if let Some(r) = deps.get(name) {
2545                    let dv = r.rule.visit_all_entries(deps, marked)?;
2546                    v.merge(dv);
2547                } else {
2548                    return Err(Error::MissingRule(name.clone()));
2549                }
2550            } else {
2551                c.visit(v, deps, marked, depth + 1)?;
2552            }
2553        }
2554
2555        Ok(())
2556    }
2557
2558    /// Executes the magic matching logic recursively and returns the count of matches that produce messages.
2559    /// Matches that don't result in message appends are not counted, consistent with libmagic's behavior.
2560    #[inline]
2561    #[allow(clippy::too_many_arguments)]
2562    fn matches<'r, R: Read + Seek>(
2563        &'r self,
2564        opt_source: Option<&str>,
2565        magic: &mut Magic<'r>,
2566        state: &mut MatchState,
2567        stream_kind: StreamKind,
2568        buf_base_offset: Option<u64>,
2569        rule_base_offset: Option<u64>,
2570        last_level_offset: Option<u64>,
2571        haystack: &mut LazyCache<R>,
2572        db: &'r MagicDb,
2573        switch_endianness: bool,
2574        depth: usize,
2575    ) -> Result<u64, Error> {
2576        let mut nmatch = 0u64;
2577
2578        let (ok, opt_match_res) = self.entry.matches(
2579            opt_source,
2580            magic,
2581            stream_kind,
2582            state,
2583            buf_base_offset,
2584            rule_base_offset,
2585            last_level_offset,
2586            haystack,
2587            switch_endianness,
2588            db,
2589            depth,
2590        )?;
2591
2592        let source = opt_source.unwrap_or("unknown");
2593        let line = self.entry.line;
2594
2595        if ok {
2596            // Update the magic with the message if the match is successful
2597            // Skip updating if the test is recursive, as it's already handled
2598            // in the Match::matches function
2599            if !self.entry.test.is_recursive()
2600                && let Some(msg) = self.entry.message.as_ref()
2601                && let Ok(msg) = msg.format_with(opt_match_res.as_ref()).inspect_err(|e| {
2602                    debug!("source={source} line={line} failed to format message: {e}")
2603                })
2604            {
2605                nmatch = nmatch.saturating_add(1);
2606                magic.push_message(msg);
2607            }
2608
2609            // we need to adjust stream offset in case of regex/search tests
2610            if let Some(mr) = opt_match_res {
2611                match &self.entry.test {
2612                    Test::String(t) => {
2613                        if t.has_length_mod() {
2614                            let o = mr.end_offset();
2615                            haystack.seek(SeekFrom::Start(o))?;
2616                        }
2617                    }
2618                    Test::Search(t) => {
2619                        if t.re_mods.contains(ReMod::StartOffsetUpdate) {
2620                            let o = mr.start_offset();
2621                            haystack.seek(SeekFrom::Start(o))?;
2622                        } else {
2623                            let o = mr.end_offset();
2624                            haystack.seek(SeekFrom::Start(o))?;
2625                        }
2626                    }
2627
2628                    Test::Regex(t) => {
2629                        if t.mods.contains(ReMod::StartOffsetUpdate) {
2630                            let o = mr.start_offset();
2631                            haystack.seek(SeekFrom::Start(o))?;
2632                        } else {
2633                            let o = mr.end_offset();
2634                            haystack.seek(SeekFrom::Start(o))?;
2635                        }
2636                    }
2637                    // other types do not need offset adjustement
2638                    _ => {}
2639                }
2640            }
2641
2642            if let Some(mimetype) = self.mimetype.as_ref() {
2643                magic.set_mime_type(Cow::Borrowed(mimetype));
2644            }
2645
2646            if let Some(apple_ty) = self.apple.as_ref() {
2647                magic.set_creator_code(Cow::Borrowed(apple_ty));
2648            }
2649
2650            if !self.exts.is_empty() {
2651                magic.insert_extensions(self.exts.iter().map(|s| s.as_str()));
2652            }
2653
2654            // NOTE: here we try to implement a similar logic as in file_magic_strength.
2655            // Sticking to the exact same strength computation logic is complicated due
2656            // to implementation differences. Let's wait and see if that is a real issue.
2657            let mut strength = self.entry.test_strength;
2658
2659            let continuation_level = self.entry.continuation_level().0 as u64;
2660            if self.entry.message.is_none() && continuation_level < 3 {
2661                strength = strength.saturating_add(continuation_level);
2662            }
2663
2664            if let Some(sm) = self.strength_mod.as_ref() {
2665                strength = sm.apply(strength);
2666            }
2667
2668            // entries with no message get a bonus
2669            if self.entry.message.is_none() {
2670                strength += 1
2671            }
2672
2673            magic.update_strength(strength);
2674
2675            let end_upper_level = haystack.lazy_stream_position();
2676
2677            // we have to fix rule_base_offset if
2678            // the rule_base_starts from end otherwise it
2679            // breaks some offset computation in match
2680            // see test_offset_bug_1 and test_offset_bug_2
2681            // they implement the same test logic yet indirect
2682            // offsets have to be different so that it works
2683            // in libmagic/file
2684            let rule_base_offset = if self.root {
2685                match self.entry.offset {
2686                    Offset::Direct(DirOffset::End(o)) => {
2687                        Some(haystack.offset_from_start(SeekFrom::End(o)))
2688                    }
2689                    _ => rule_base_offset,
2690                }
2691            } else {
2692                rule_base_offset
2693            };
2694
2695            for e in self.children.iter() {
2696                nmatch = nmatch.saturating_add(e.matches(
2697                    opt_source,
2698                    magic,
2699                    state,
2700                    stream_kind,
2701                    buf_base_offset,
2702                    rule_base_offset,
2703                    Some(end_upper_level),
2704                    haystack,
2705                    db,
2706                    switch_endianness,
2707                    depth,
2708                )?);
2709            }
2710        }
2711
2712        Ok(nmatch)
2713    }
2714}
2715
2716/// Represents a parsed magic rule
2717#[derive(Debug, Clone, Serialize, Deserialize)]
2718pub struct MagicRule {
2719    id: usize,
2720    source: Option<String>,
2721    entries: EntryNode,
2722    extensions: HashSet<String>,
2723    /// score used for rule ranking
2724    score: u64,
2725    finalized: bool,
2726}
2727
2728impl MagicRule {
2729    #[inline(always)]
2730    fn set_id(&mut self, id: usize) {
2731        self.id = id
2732    }
2733
2734    fn visit_all_entries(
2735        &self,
2736        deps: &HashMap<String, DependencyRule>,
2737        marked: &mut HashSet<String>,
2738    ) -> Result<EntryNodeVisitor, Error> {
2739        let mut v = EntryNodeVisitor::new();
2740        self.entries.visit(&mut v, deps, marked, 0)?;
2741        Ok(v)
2742    }
2743
2744    /// Finalize a rule by searching for all extensions and computing its score
2745    /// for ranking. In the `MagicRule` is already finalized it returns immediately.
2746    fn try_finalize(&mut self, deps: &HashMap<String, DependencyRule>) -> Result<(), Error> {
2747        if self.finalized {
2748            return Ok(());
2749        }
2750
2751        // rule can be finalized all deps are found
2752        let v = self.visit_all_entries(deps, &mut HashSet::new())?;
2753
2754        self.extensions.extend(v.exts);
2755        self.score = v.score;
2756        self.finalized = true;
2757
2758        Ok(())
2759    }
2760
2761    #[inline]
2762    fn magic_entrypoint<'r, R: Read + Seek>(
2763        &'r self,
2764        magic: &mut Magic<'r>,
2765        stream_kind: StreamKind,
2766        haystack: &mut LazyCache<R>,
2767        db: &'r MagicDb,
2768        switch_endianness: bool,
2769        depth: usize,
2770    ) -> Result<u64, Error> {
2771        self.entries.matches(
2772            self.source.as_deref(),
2773            magic,
2774            &mut MatchState::empty(),
2775            stream_kind,
2776            None,
2777            None,
2778            None,
2779            haystack,
2780            db,
2781            switch_endianness,
2782            depth,
2783        )
2784    }
2785
2786    /// Executes the magic matching logic and returns the count of matches that produce messages.
2787    /// Matches that don't result in message appends are not counted, consistent with libmagic's behavior.
2788    #[inline]
2789    #[allow(clippy::too_many_arguments)]
2790    fn magic<'r, R: Read + Seek>(
2791        &'r self,
2792        magic: &mut Magic<'r>,
2793        stream_kind: StreamKind,
2794        buf_base_offset: Option<u64>,
2795        rule_base_offset: Option<u64>,
2796        haystack: &mut LazyCache<R>,
2797        db: &'r MagicDb,
2798        switch_endianness: bool,
2799        depth: usize,
2800    ) -> Result<u64, Error> {
2801        self.entries.matches(
2802            self.source.as_deref(),
2803            magic,
2804            &mut MatchState::empty(),
2805            stream_kind,
2806            buf_base_offset,
2807            rule_base_offset,
2808            None,
2809            haystack,
2810            db,
2811            switch_endianness,
2812            depth,
2813        )
2814    }
2815
2816    /// Checks if the rule is for matching against text content
2817    ///
2818    /// # Returns
2819    ///
2820    /// * `bool` - True if the rule is for text files
2821    pub fn is_text(&self) -> bool {
2822        self.entries.entry.test.is_text()
2823            && self.entries.children.iter().all(|e| e.entry.test.is_text())
2824    }
2825
2826    /// Gets the rule's score used for ranking rules between them
2827    ///
2828    /// # Returns
2829    ///
2830    /// * `u64` - The rule's score
2831    #[inline(always)]
2832    pub fn score(&self) -> u64 {
2833        self.score
2834    }
2835
2836    /// Gets the rule's filename if any
2837    ///
2838    /// # Returns
2839    ///
2840    /// * `Option<&str>` - The rule's source if available
2841    #[inline(always)]
2842    pub fn source(&self) -> Option<&str> {
2843        self.source.as_deref()
2844    }
2845
2846    /// Gets the line number at which the rule is defined
2847    ///
2848    /// # Returns
2849    ///
2850    /// * `usize` - The rule's line number
2851    #[inline(always)]
2852    pub fn line(&self) -> usize {
2853        self.entries.entry.line
2854    }
2855
2856    /// Gets all the file extensions associated to the rule
2857    ///
2858    /// # Returns
2859    ///
2860    /// * `&HashSet<String>` - The set of all associated extensions
2861    #[inline(always)]
2862    pub fn extensions(&self) -> &HashSet<String> {
2863        &self.extensions
2864    }
2865}
2866
// Pairs a name with the `MagicRule` it designates. Values of this type are
// stored in the `dependencies` maps of `MagicSource` and `MagicDb`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct DependencyRule {
    // NOTE(review): presumably the same string as the key used in the
    // `dependencies` maps — confirm against the parser
    name: String,
    // the rule itself
    rule: MagicRule,
}
2872
/// A parsed source of magic rules
///
/// # Methods
///
/// * `open` - Opens a magic file from a path
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicSource {
    // rules of the source; moved into the database by `MagicDb::load`
    rules: Vec<MagicRule>,
    // dependency rules of the source; merged into the database by `MagicDb::load`
    dependencies: HashMap<String, DependencyRule>,
}

impl MagicSource {
    /// Opens and parses a magic file from a path
    ///
    /// # Arguments
    ///
    /// * `p` - The path to the magic file
    ///
    /// # Returns
    ///
    /// * `Result<Self, Error>` - The parsed magic file or an error
    pub fn open<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
        // all the work is delegated to the parser
        FileMagicParser::parse_file(p)
    }
}
2898
// Newtype around the `u8` continuation level of an entry. `MatchState`
// uses the wrapped value as an index into its table of 256 flags.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct ContinuationLevel(u8);
2901
// FIXME: magic handles many more text encodings
/// Encodings a text stream can be tagged with
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum TextEncoding {
    Ascii,
    Utf8,
    Unknown,
}

impl TextEncoding {
    /// Name of the encoding as it is written in magic messages
    const fn as_magic_str(&self) -> &'static str {
        match *self {
            Self::Ascii => "ASCII",
            Self::Utf8 => "UTF-8",
            Self::Unknown => "Unknown",
        }
    }
}
2919
2920#[derive(Debug, PartialEq, Eq, Clone, Copy)]
2921enum StreamKind {
2922    Binary,
2923    Text(TextEncoding),
2924}
2925
2926impl StreamKind {
2927    const fn is_text(&self) -> bool {
2928        matches!(self, StreamKind::Text(_))
2929    }
2930}
2931
2932#[derive(Debug)]
2933struct MatchState {
2934    continuation_levels: [bool; 256],
2935}
2936
2937impl MatchState {
2938    #[inline(always)]
2939    fn empty() -> Self {
2940        MatchState {
2941            continuation_levels: [false; 256],
2942        }
2943    }
2944
2945    #[inline(always)]
2946    fn get_continuation_level(&mut self, level: &ContinuationLevel) -> bool {
2947        self.continuation_levels
2948            .get(level.0 as usize)
2949            .cloned()
2950            .unwrap_or_default()
2951    }
2952
2953    #[inline(always)]
2954    fn set_continuation_level(&mut self, level: ContinuationLevel) {
2955        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2956            *b = true
2957        }
2958    }
2959
2960    #[inline(always)]
2961    fn clear_continuation_level(&mut self, level: &ContinuationLevel) {
2962        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2963            *b = false;
2964        }
2965    }
2966}
2967
2968/// Represents a file magic detection result
2969#[derive(Debug, Default)]
2970pub struct Magic<'m> {
2971    stream_kind: Option<StreamKind>,
2972    source: Option<Cow<'m, str>>,
2973    message: Vec<Cow<'m, str>>,
2974    mime_type: Option<Cow<'m, str>>,
2975    creator_code: Option<Cow<'m, str>>,
2976    strength: u64,
2977    exts: HashSet<Cow<'m, str>>,
2978    is_default: bool,
2979}
2980
2981impl<'m> Magic<'m> {
2982    #[inline(always)]
2983    fn set_source(&mut self, source: Option<&'m str>) {
2984        self.source = source.map(Cow::Borrowed);
2985    }
2986
2987    #[inline(always)]
2988    fn set_stream_kind(&mut self, stream_kind: StreamKind) {
2989        self.stream_kind = Some(stream_kind)
2990    }
2991
2992    #[inline(always)]
2993    fn reset(&mut self) {
2994        self.stream_kind = None;
2995        self.source = None;
2996        self.message.clear();
2997        self.mime_type = None;
2998        self.creator_code = None;
2999        self.strength = 0;
3000        self.exts.clear();
3001        self.is_default = false;
3002    }
3003
3004    /// Converts borrowed data into owned data. This method involves
3005    /// data cloning, so you must use this method only if you need to
3006    /// extend the lifetime of a [`Magic`] struct.
3007    ///
3008    /// # Returns
3009    ///
3010    /// * `Magic<'owned>` - A new [`Magic`] with owned data
3011    #[inline]
3012    pub fn into_owned<'owned>(self) -> Magic<'owned> {
3013        Magic {
3014            stream_kind: self.stream_kind,
3015            source: self.source.map(|s| Cow::Owned(s.into_owned())),
3016            message: self
3017                .message
3018                .into_iter()
3019                .map(Cow::into_owned)
3020                .map(Cow::Owned)
3021                .collect(),
3022            mime_type: self.mime_type.map(|m| Cow::Owned(m.into_owned())),
3023            creator_code: self.creator_code.map(|m| Cow::Owned(m.into_owned())),
3024            strength: self.strength,
3025            exts: self
3026                .exts
3027                .into_iter()
3028                .map(|e| Cow::Owned(e.into_owned()))
3029                .collect(),
3030            is_default: self.is_default,
3031        }
3032    }
3033
3034    /// Gets the formatted message describing the file type
3035    ///
3036    /// # Returns
3037    ///
3038    /// * `String` - The formatted message
3039    #[inline(always)]
3040    pub fn message(&self) -> String {
3041        let mut out = String::new();
3042        for (i, m) in self.message.iter().enumerate() {
3043            if let Some(s) = m.strip_prefix(r#"\b"#) {
3044                out.push_str(s);
3045            } else {
3046                // don't put space on first string
3047                if i > 0 {
3048                    out.push(' ');
3049                }
3050                out.push_str(m);
3051            }
3052        }
3053        out
3054    }
3055
3056    /// Returns an iterator over the individual parts of the magic message
3057    ///
3058    /// A magic message is typically composed of multiple parts, each appended
3059    /// during successful magic tests. This method provides an efficient way to
3060    /// iterate over these parts without concatenating them into a new string,
3061    /// as done when calling [`Magic::message`].
3062    ///
3063    /// # Returns
3064    ///
3065    /// * `impl Iterator<Item = &str>` - An iterator yielding string slices of each message part
3066    #[inline]
3067    pub fn message_parts(&self) -> impl Iterator<Item = &str> {
3068        self.message.iter().map(|p| p.as_ref())
3069    }
3070
3071    #[inline(always)]
3072    fn update_strength(&mut self, value: u64) {
3073        self.strength = self.strength.saturating_add(value);
3074        debug!("updated strength = {:?}", self.strength)
3075    }
3076
3077    /// Gets the detected MIME type
3078    ///
3079    /// # Returns
3080    ///
3081    /// * `&str` - The MIME type or default based on stream kind
3082    #[inline(always)]
3083    pub fn mime_type(&self) -> &str {
3084        self.mime_type.as_deref().unwrap_or(match self.stream_kind {
3085            Some(StreamKind::Text(_)) => DEFAULT_TEXT_MIMETYPE,
3086            Some(StreamKind::Binary) | None => DEFAULT_BIN_MIMETYPE,
3087        })
3088    }
3089
3090    #[inline(always)]
3091    fn push_message<'a: 'm>(&mut self, msg: Cow<'a, str>) {
3092        if !msg.is_empty() {
3093            debug!("pushing message: msg={msg} len={}", msg.len());
3094            self.message.push(msg);
3095        }
3096    }
3097
3098    #[inline(always)]
3099    fn set_mime_type<'a: 'm>(&mut self, mime: Cow<'a, str>) {
3100        if self.mime_type.is_none() {
3101            debug!("insert mime: {:?}", mime);
3102            self.mime_type = Some(mime)
3103        }
3104    }
3105
3106    #[inline(always)]
3107    fn set_creator_code<'a: 'm>(&mut self, apple_ty: Cow<'a, str>) {
3108        if self.creator_code.is_none() {
3109            debug!("insert apple type: {apple_ty:?}");
3110            self.creator_code = Some(apple_ty)
3111        }
3112    }
3113
3114    #[inline(always)]
3115    fn insert_extensions<'a: 'm, I: Iterator<Item = &'a str>>(&mut self, exts: I) {
3116        if self.exts.is_empty() {
3117            self.exts.extend(exts.filter_map(|e| {
3118                if e.is_empty() {
3119                    None
3120                } else {
3121                    Some(Cow::Borrowed(e))
3122                }
3123            }));
3124        }
3125    }
3126
3127    /// Gets the confidence score of the detection. This
3128    /// value is used to sort [`Magic`] in [`MagicDb::best_magic`]
3129    /// and [`MagicDb::all_magics`].
3130    ///
3131    /// # Returns
3132    ///
3133    /// * `u64` - The confidence score attributed to that [`Magic`]
3134    #[inline(always)]
3135    pub fn strength(&self) -> u64 {
3136        self.strength
3137    }
3138
3139    /// Gets the filename where the magic rule was defined
3140    ///
3141    /// # Returns
3142    ///
3143    /// * `Option<&str>` - The source if available
3144    #[inline(always)]
3145    pub fn source(&self) -> Option<&str> {
3146        self.source.as_deref()
3147    }
3148
3149    /// Gets the Apple creator code if available
3150    ///
3151    /// # Returns
3152    ///
3153    /// * `Option<&str>` - The creator code if available
3154    #[inline(always)]
3155    pub fn creator_code(&self) -> Option<&str> {
3156        self.creator_code.as_deref()
3157    }
3158
3159    /// Gets the possible file extensions for the detected [`Magic`]
3160    ///
3161    /// # Returns
3162    ///
3163    /// * `&HashSet<Cow<'m, str>>` - The set of possible extensions
3164    #[inline(always)]
3165    pub fn extensions(&self) -> &HashSet<Cow<'m, str>> {
3166        &self.exts
3167    }
3168
3169    /// Checks if this is a default fallback detection
3170    ///
3171    /// # Returns
3172    ///
3173    /// * `bool` - True if this is a default detection
3174    #[inline(always)]
3175    pub fn is_default(&self) -> bool {
3176        self.is_default
3177    }
3178}
3179
/// Represents a database of [`MagicRule`]
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct MagicDb {
    // next identifier handed out by `next_rule_id`
    rule_id: usize,
    // rules pushed by `load_rules_no_prepare`
    rules: Vec<MagicRule>,
    // dependency rules merged from every loaded `MagicSource`
    dependencies: HashMap<String, DependencyRule>,
    // NOTE(review): presumably bookkeeping for rule finalization
    // (see `try_finalize`) — confirm, it is not used in this part of the file
    finalized: usize,
}
3188
/// Returns `true` if the byte stream is likely text.
///
/// An empty stream, or a stream containing a NUL byte, is never text.
/// Otherwise tab, line feed, carriage return and `0x20..=0x7E` count as
/// printable, and every remaining byte (other control characters, `0x7F`
/// and anything above) counts against the stream.
#[inline(always)]
fn is_likely_text(bytes: &[u8]) -> bool {
    if bytes.is_empty() {
        return false;
    }

    let mut printable = 0usize;
    let mut non_printable = 0usize;

    for &byte in bytes {
        match byte {
            // a single NUL is enough to rule text out
            0x00 => return false,
            0x09 | 0x0A | 0x0D | 0x20..=0x7E => printable += 1,
            _ => non_printable += 1,
        }
    }

    let total = bytes.len() as f64;
    let printable_ratio = printable as f64 / total;
    let non_printable_ratio = non_printable as f64 / total;

    // Heuristic thresholds (adjust as needed):
    printable_ratio > 0.85 && non_printable_ratio < 0.20
}
3231
3232#[inline(always)]
3233fn guess_stream_kind<S: AsRef<[u8]>>(stream: S) -> StreamKind {
3234    let buf = stream.as_ref();
3235
3236    match run_utf8_validation(buf) {
3237        Ok(is_ascii) => {
3238            if is_ascii {
3239                StreamKind::Text(TextEncoding::Ascii)
3240            } else {
3241                StreamKind::Text(TextEncoding::Utf8)
3242            }
3243        }
3244        Err(e) => {
3245            if is_likely_text(&buf[e.valid_up_to..]) {
3246                StreamKind::Text(TextEncoding::Unknown)
3247            } else {
3248                StreamKind::Binary
3249            }
3250        }
3251    }
3252}
3253
3254impl MagicDb {
3255    /// Prepares an [`LazyCache`] configured with optimal parameters for
3256    /// **read** operations done during file identification
3257    pub fn optimal_lazy_cache<R: Read + Seek>(f: R) -> Result<LazyCache<R>, io::Error> {
3258        Ok(LazyCache::<R>::from_read_seek(f)
3259            .and_then(|lc| lc.with_hot_cache(2 * FILE_BYTES_MAX))?)
3260        .map(|lc| lc.with_warm_cache(100 << 20))
3261    }
3262
3263    /// Creates a new empty database
3264    ///
3265    /// # Returns
3266    ///
3267    /// * [`MagicDb`] - A new empty database
3268    pub fn new() -> Self {
3269        Self::default()
3270    }
3271
3272    #[inline(always)]
3273    fn next_rule_id(&mut self) -> usize {
3274        let t = self.rule_id;
3275        self.rule_id += 1;
3276        t
3277    }
3278
3279    #[inline(always)]
3280    fn try_json<R: Read + Seek>(
3281        haystack: &mut LazyCache<R>,
3282        stream_kind: StreamKind,
3283        magic: &mut Magic,
3284    ) -> Result<bool, Error> {
3285        // cannot be json if content is binary
3286        if matches!(stream_kind, StreamKind::Binary) {
3287            return Ok(false);
3288        }
3289
3290        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?.trim_ascii();
3291
3292        let Some((start, end)) = find_json_boundaries(buf) else {
3293            return Ok(false);
3294        };
3295
3296        // if anything else than whitespace before start
3297        // this is not json
3298        for c in buf[0..start].iter() {
3299            if !c.is_ascii_whitespace() {
3300                return Ok(false);
3301            }
3302        }
3303
3304        let mut is_ndjson = false;
3305
3306        trace!("maybe a json document");
3307        let ok = serde_json::from_slice::<serde_json::Value>(&buf[start..=end]).is_ok();
3308        if !ok {
3309            return Ok(false);
3310        }
3311
3312        // we are sure it is json now we must look if we are ndjson
3313        if end + 1 < buf.len() {
3314            // after first json
3315            let buf = &buf[end + 1..];
3316            if let Some((second_start, second_end)) = find_json_boundaries(buf) {
3317                // there is a new line between the two json docs
3318                if memchr(b'\n', &buf[..second_start]).is_some() {
3319                    trace!("might be ndjson");
3320                    is_ndjson = serde_json::from_slice::<serde_json::Value>(
3321                        &buf[second_start..=second_end],
3322                    )
3323                    .is_ok();
3324                }
3325            }
3326        }
3327
3328        if is_ndjson {
3329            magic.push_message(Cow::Borrowed("New Line Delimited"));
3330            magic.set_mime_type(Cow::Borrowed("application/x-ndjson"));
3331            magic.insert_extensions(["ndjson", "jsonl"].into_iter());
3332        } else {
3333            magic.set_mime_type(Cow::Borrowed("application/json"));
3334            magic.insert_extensions(["json"].into_iter());
3335        }
3336
3337        magic.push_message(Cow::Borrowed("JSON text data"));
3338        magic.set_source(Some(HARDCODED_SOURCE));
3339        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3340        Ok(true)
3341    }
3342
3343    #[inline(always)]
3344    fn try_csv<R: Read + Seek>(
3345        haystack: &mut LazyCache<R>,
3346        stream_kind: StreamKind,
3347        magic: &mut Magic,
3348    ) -> Result<bool, Error> {
3349        // cannot be csv if content is binary
3350        let StreamKind::Text(enc) = stream_kind else {
3351            return Ok(false);
3352        };
3353
3354        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3355        let mut reader = csv::Reader::from_reader(io::Cursor::new(buf));
3356        let mut records = reader.records();
3357
3358        let Some(Ok(first)) = records.next() else {
3359            return Ok(false);
3360        };
3361
3362        // very not likely a CSV otherwise all programming
3363        // languages having ; line terminator would be
3364        // considered as CSV
3365        if first.len() <= 1 {
3366            return Ok(false);
3367        }
3368
3369        // we already parsed first line
3370        let mut n = 1;
3371        for i in records.take(9) {
3372            if let Ok(rec) = i {
3373                if first.len() != rec.len() {
3374                    return Ok(false);
3375                }
3376            } else {
3377                return Ok(false);
3378            }
3379            n += 1;
3380        }
3381
3382        // we need at least 10 lines
3383        if n != 10 {
3384            return Ok(false);
3385        }
3386
3387        magic.set_mime_type(Cow::Borrowed("text/csv"));
3388        magic.push_message(Cow::Borrowed("CSV"));
3389        magic.push_message(Cow::Borrowed(enc.as_magic_str()));
3390        magic.push_message(Cow::Borrowed("text"));
3391        magic.insert_extensions(["csv"].into_iter());
3392        magic.set_source(Some(HARDCODED_SOURCE));
3393        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3394        Ok(true)
3395    }
3396
3397    #[inline(always)]
3398    fn try_tar<R: Read + Seek>(
3399        haystack: &mut LazyCache<R>,
3400        stream_kind: StreamKind,
3401        magic: &mut Magic,
3402    ) -> Result<bool, Error> {
3403        // cannot be json if content is not binary
3404        if !matches!(stream_kind, StreamKind::Binary) {
3405            return Ok(false);
3406        }
3407
3408        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3409        let mut ar = Archive::new(io::Cursor::new(buf));
3410
3411        let Ok(mut entries) = ar.entries() else {
3412            return Ok(false);
3413        };
3414
3415        let Some(Ok(first)) = entries.next() else {
3416            return Ok(false);
3417        };
3418
3419        let header = first.header();
3420
3421        if header.as_ustar().is_some() {
3422            magic.push_message(Cow::Borrowed("POSIX tar archive"));
3423        } else if header.as_gnu().is_some() {
3424            magic.push_message(Cow::Borrowed("POSIX tar archive (GNU)"));
3425        } else {
3426            magic.push_message(Cow::Borrowed("tar archive"));
3427        }
3428
3429        magic.set_mime_type(Cow::Borrowed("application/x-tar"));
3430        magic.set_source(Some(HARDCODED_SOURCE));
3431        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3432        magic.insert_extensions(["tar"].into_iter());
3433        Ok(true)
3434    }
3435
3436    #[inline(always)]
3437    fn try_hard_magic<R: Read + Seek>(
3438        haystack: &mut LazyCache<R>,
3439        stream_kind: StreamKind,
3440        magic: &mut Magic,
3441    ) -> Result<bool, Error> {
3442        Ok(Self::try_json(haystack, stream_kind, magic)?
3443            || Self::try_csv(haystack, stream_kind, magic)?
3444            || Self::try_tar(haystack, stream_kind, magic)?)
3445    }
3446
3447    #[inline(always)]
3448    fn magic_default<'m, R: Read + Seek>(
3449        cache: &mut LazyCache<R>,
3450        stream_kind: StreamKind,
3451        magic: &mut Magic<'m>,
3452    ) {
3453        magic.set_source(Some(HARDCODED_SOURCE));
3454        magic.set_stream_kind(stream_kind);
3455        magic.is_default = true;
3456
3457        if cache.data_size() == 0 {
3458            magic.push_message(Cow::Borrowed("empty"));
3459            magic.set_mime_type(Cow::Borrowed(DEFAULT_BIN_MIMETYPE));
3460        }
3461
3462        match stream_kind {
3463            StreamKind::Binary => {
3464                magic.push_message(Cow::Borrowed("data"));
3465            }
3466            StreamKind::Text(e) => {
3467                magic.push_message(Cow::Borrowed(e.as_magic_str()));
3468                magic.push_message(Cow::Borrowed("text"));
3469            }
3470        }
3471    }
3472
3473    fn load_rules_no_prepare(&mut self, rules: Vec<MagicRule>) {
3474        for rule in rules.into_iter() {
3475            let mut rule = rule;
3476            rule.set_id(self.next_rule_id());
3477
3478            self.rules.push(rule);
3479        }
3480    }
3481
3482    /// Loads rules from a [`MagicSource`]
3483    ///
3484    /// # Arguments
3485    ///
3486    /// * `ms` - The [`MagicSource`] to load rules from
3487    pub fn load(&mut self, ms: MagicSource) -> &mut Self {
3488        self.load_rules_no_prepare(ms.rules);
3489        self.dependencies.extend(ms.dependencies);
3490        self.try_finalize();
3491        self
3492    }
3493
3494    /// Loads multiple [`MagicSource`] items efficiently in bulk.
3495    ///
3496    /// This is more efficient than loading each individually. After processing
3497    /// all sources, it applies finalization step only once.
3498    pub fn load_bulk<I: Iterator<Item = MagicSource>>(&mut self, it: I) -> &mut Self {
3499        for ms in it {
3500            self.load_rules_no_prepare(ms.rules);
3501            self.dependencies.extend(ms.dependencies);
3502        }
3503        self.try_finalize();
3504        self
3505    }
3506
3507    /// Gets all rules in the database
3508    ///
3509    /// # Returns
3510    ///
3511    /// * `&[MagicRule]` - A slice of all rules
3512    pub fn rules(&self) -> &[MagicRule] {
3513        &self.rules
3514    }
3515
3516    #[inline]
3517    fn first_magic_with_stream_kind<R: Read + Seek>(
3518        &self,
3519        haystack: &mut LazyCache<R>,
3520        stream_kind: StreamKind,
3521        extension: Option<&str>,
3522    ) -> Result<Magic<'_>, Error> {
3523        // re-using magic makes this function faster
3524        let mut magic = Magic::default();
3525
3526        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3527            return Ok(magic);
3528        }
3529
3530        let mut marked = vec![false; self.rules.len()];
3531
3532        macro_rules! do_magic {
3533            ($rule: expr) => {{
3534                $rule.magic_entrypoint(&mut magic, stream_kind, haystack, &self, false, 0)?;
3535
3536                if !magic.message.is_empty() {
3537                    magic.set_stream_kind(stream_kind);
3538                    magic.set_source($rule.source.as_deref());
3539                    return Ok(magic);
3540                }
3541
3542                magic.reset();
3543            }};
3544        }
3545
3546        if let Some(ext) = extension.map(|e| e.to_lowercase())
3547            && !ext.is_empty()
3548        {
3549            for rule in self.rules.iter().filter(|r| r.extensions.contains(&ext)) {
3550                do_magic!(rule);
3551                if let Some(f) = marked.get_mut(rule.id) {
3552                    *f = true
3553                }
3554            }
3555        }
3556
3557        for rule in self
3558            .rules
3559            .iter()
3560            // we don't run again rules run by extension
3561            .filter(|r| !*marked.get(r.id).unwrap_or(&false))
3562        {
3563            do_magic!(rule)
3564        }
3565
3566        Self::magic_default(haystack, stream_kind, &mut magic);
3567
3568        Ok(magic)
3569    }
3570
3571    /// Detects file [`Magic`] stopping at the first matching magic. Magic
3572    /// rules are evaluated from the best to the least relevant, so this method
3573    /// returns most of the time the best magic. For the rare cases where
3574    /// it doesn't or if the best result is always required, use [`MagicDb::best_magic`]
3575    ///
3576    /// # Arguments
3577    ///
3578    /// * `r` - A readable and seekable input
3579    /// * `extension` - Optional file extension to use for acceleration
3580    ///
3581    /// # Returns
3582    ///
3583    /// * `Result<Magic<'_>, Error>` - The detection result or an error
3584    ///
3585    /// # Warning
3586    ///
3587    /// File extension acceleration is made to evaluate rules faster by testing
3588    /// first the rules defining this extension with an `!:ext` entry.
3589    /// Whether you use `extension` acceleration or not with this function should not
3590    /// produce different results. Yet this makes the assumption rules are written
3591    /// correctly and every rule concerned defines `!:ext` when it is appropriate.
3592    /// If some rules are missing it, results might differ.
3593    pub fn first_magic<R: Read + Seek>(
3594        &self,
3595        r: &mut R,
3596        extension: Option<&str>,
3597    ) -> Result<Magic<'_>, Error> {
3598        let mut cache = Self::optimal_lazy_cache(r)?;
3599        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3600        self.first_magic_with_stream_kind(&mut cache, stream_kind, extension)
3601    }
3602
3603    /// An alternative to [`Self::first_magic`] using a [`LazyCache`]
3604    /// to detects file [`Magic`] stopping at the first matching magic. Magic
3605    /// rules are evaluated from the best to the least relevant, so this method
3606    /// returns most of the time the best magic. For the rare cases where
3607    /// it doesn't or if the best result is always required, use [`MagicDb::best_magic`]
3608    ///
3609    /// # Arguments
3610    ///
3611    /// * `cache` - A [`LazyCache`] used for read operations
3612    /// * `extension` - Optional file extension to use for acceleration
3613    ///
3614    /// # Returns
3615    ///
3616    /// * `Result<Magic<'_>, Error>` - The detection result or an error
3617    ///
3618    /// # Notes
3619    ///
3620    /// * Use this method **only** if you need to re-use a [`LazyCache`] for future **read** operations.
3621    /// * Use [`Self::optimal_lazy_cache`] to prepare an optimal [`LazyCache`]
3622    ///
3623    /// # Warning
3624    ///
3625    /// File extension acceleration is made to evaluate rules faster by testing
3626    /// first the rules defining this extension with an `!:ext` entry.
3627    /// Whether you use `extension` acceleration or not with this function should not
3628    /// produce different results. Yet this makes the assumption rules are written
3629    /// correctly and every rule concerned defines `!:ext` when it is appropriate.
3630    /// If some rules are missing it, results might differ.
3631    pub fn first_magic_with_lazy_cache<R: Read + Seek>(
3632        &self,
3633        cache: &mut LazyCache<R>,
3634        extension: Option<&str>,
3635    ) -> Result<Magic<'_>, Error> {
3636        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3637        self.first_magic_with_stream_kind(cache, stream_kind, extension)
3638    }
3639
3640    #[inline(always)]
3641    fn all_magics_sort_with_stream_kind<R: Read + Seek>(
3642        &self,
3643        haystack: &mut LazyCache<R>,
3644        stream_kind: StreamKind,
3645    ) -> Result<Vec<Magic<'_>>, Error> {
3646        let mut out = Vec::new();
3647
3648        let mut magic = Magic::default();
3649
3650        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3651            out.push(magic);
3652            magic = Magic::default();
3653        }
3654
3655        for rule in self.rules.iter() {
3656            rule.magic_entrypoint(&mut magic, stream_kind, haystack, self, false, 0)?;
3657
3658            // it is possible we have a strength with no message
3659            if !magic.message.is_empty() {
3660                magic.set_stream_kind(stream_kind);
3661                magic.set_source(rule.source.as_deref());
3662                out.push(magic);
3663                magic = Magic::default();
3664            }
3665
3666            magic.reset();
3667        }
3668
3669        Self::magic_default(haystack, stream_kind, &mut magic);
3670        out.push(magic);
3671
3672        out.sort_by_key(|b| std::cmp::Reverse(b.strength()));
3673
3674        Ok(out)
3675    }
3676
3677    /// Detects all [`Magic`] matching a given content.
3678    ///
3679    /// # Arguments
3680    ///
3681    /// * `r` - A readable and seekable input
3682    ///
3683    /// # Returns
3684    ///
3685    /// * `Result<Vec<Magic<'_>>, Error>` - All detection results sorted by strength or an error
3686    pub fn all_magics<R: Read + Seek>(&self, r: &mut R) -> Result<Vec<Magic<'_>>, Error> {
3687        let mut cache = Self::optimal_lazy_cache(r)?;
3688        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3689        self.all_magics_sort_with_stream_kind(&mut cache, stream_kind)
3690    }
3691
3692    /// An alternative to [`Self::all_magics`] using a [`LazyCache`]
3693    /// to detects all [`Magic`] matching a given content.
3694    ///
3695    /// # Arguments
3696    ///
3697    /// * `r` - A readable and seekable input
3698    ///
3699    /// # Returns
3700    ///
3701    /// * `Result<Vec<Magic<'_>>, Error>` - All detection results sorted by strength or an error
3702    ///
3703    /// # Notes
3704    ///
3705    /// * Use this method **only** if you need to re-use a [`LazyCache`] for future **read** operations.
3706    /// * Use [`Self::optimal_lazy_cache`] to prepare an optimal [`LazyCache`]
3707    pub fn all_magics_with_lazy_cache<R: Read + Seek>(
3708        &self,
3709        cache: &mut LazyCache<R>,
3710    ) -> Result<Vec<Magic<'_>>, Error> {
3711        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3712        self.all_magics_sort_with_stream_kind(cache, stream_kind)
3713    }
3714
3715    #[inline(always)]
3716    fn best_magic_with_stream_kind<R: Read + Seek>(
3717        &self,
3718        haystack: &mut LazyCache<R>,
3719        stream_kind: StreamKind,
3720    ) -> Result<Magic<'_>, Error> {
3721        let magics = self.all_magics_sort_with_stream_kind(haystack, stream_kind)?;
3722
3723        // magics is guaranteed to contain at least the
3724        // default magic but we unwrap to avoid any panic
3725        Ok(magics.into_iter().next().unwrap_or_else(|| {
3726            let mut magic = Magic::default();
3727            Self::magic_default(haystack, stream_kind, &mut magic);
3728            magic
3729        }))
3730    }
3731
3732    /// Detects the best [`Magic`] matching a given content.
3733    ///
3734    /// # Arguments
3735    ///
3736    /// * `r` - A readable and seekable input
3737    ///
3738    /// # Returns
3739    ///
3740    /// * `Result<Magic<'_>, Error>` - The best detection result or an error
3741    pub fn best_magic<R: Read + Seek>(&self, r: &mut R) -> Result<Magic<'_>, Error> {
3742        let mut cache = Self::optimal_lazy_cache(r)?;
3743        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3744        self.best_magic_with_stream_kind(&mut cache, stream_kind)
3745    }
3746
3747    /// An alternative to [`Self::best_magic`] using a [`LazyCache`]
3748    /// to detect the best [`Magic`] matching a given content.
3749    ///
3750    /// # Arguments
3751    ///
3752    /// * `r` - A readable and seekable input
3753    ///
3754    /// # Returns
3755    ///
3756    /// * `Result<Magic<'_>, Error>` - The best detection result or an error
3757    ///
3758    /// # Notes
3759    ///
3760    /// * Use this method **only** if you need to re-use a [`LazyCache`] for future **read** operations.
3761    /// * Use [`Self::optimal_lazy_cache`] to prepare an optimal [`LazyCache`]
3762    pub fn best_magic_with_lazy_cache<R: Read + Seek>(
3763        &self,
3764        cache: &mut LazyCache<R>,
3765    ) -> Result<Magic<'_>, Error> {
3766        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3767        self.best_magic_with_stream_kind(cache, stream_kind)
3768    }
3769
3770    /// Serializes the database to a generic writer implementing [`io::Write`]
3771    ///
3772    /// # Returns
3773    ///
3774    /// * `Result<(), Error>` - The serialized database or an error
3775    pub fn serialize<W: Write>(self, w: &mut W) -> Result<(), Error> {
3776        let mut encoder = GzEncoder::new(w, Compression::best());
3777
3778        bincode::serde::encode_into_std_write(&self, &mut encoder, bincode::config::standard())?;
3779        encoder.finish()?;
3780        Ok(())
3781    }
3782
3783    /// Deserializes the database from a generic reader implementing [`io::Read`]
3784    ///
3785    /// # Arguments
3786    ///
3787    /// * `r` - The reader to deserialize from
3788    ///
3789    /// # Returns
3790    ///
3791    /// * `Result<Self, Error>` - The deserialized database or an error
3792    pub fn deserialize<R: Read>(r: &mut R) -> Result<Self, Error> {
3793        let mut buf = vec![];
3794        let mut gz = GzDecoder::new(r);
3795        gz.read_to_end(&mut buf).map_err(|e| {
3796            bincode::error::DecodeError::OtherString(format!("failed to read: {e}"))
3797        })?;
3798        let (sdb, _): (MagicDb, usize) =
3799            bincode::serde::decode_from_slice(&buf, bincode::config::standard())?;
3800        Ok(sdb)
3801    }
3802
3803    /// Verifies the consistency of the [`MagicDb`] database.
3804    /// This method must be called when the database is built once and used later.
3805    /// It catches [`enum@Error`] that would raise at rule evaluation time.
3806    ///
3807    /// # Errors
3808    /// Returns an error if any rule fails verification
3809    pub fn verify(&mut self) -> Result<(), Error> {
3810        if self.rules.len() == self.finalized {
3811            return Ok(());
3812        }
3813
3814        for r in self.rules.iter_mut().filter(|r| !r.finalized) {
3815            // return at the first rule failing verification
3816            r.try_finalize(&self.dependencies).map_err(|e| {
3817                Error::Verify(
3818                    r.source.clone().unwrap_or(String::from("unknown")),
3819                    r.line(),
3820                    e.into(),
3821                )
3822            })?;
3823            self.finalized += 1;
3824        }
3825
3826        debug_assert!(self.finalized <= self.rules.len());
3827
3828        Ok(())
3829    }
3830
3831    #[inline(always)]
3832    fn try_finalize(&mut self) {
3833        if self.rules.len() == self.finalized {
3834            return;
3835        }
3836
3837        let mut finalized = 0usize;
3838        self.rules.iter_mut().for_each(|r| {
3839            if r.try_finalize(&self.dependencies).is_ok() {
3840                finalized += 1;
3841            }
3842        });
3843
3844        self.finalized = finalized;
3845
3846        debug_assert!(self.finalized <= self.rules.len());
3847
3848        // put text rules at the end
3849        self.rules.sort_by_key(|r| (r.is_text(), -(r.score as i64)));
3850    }
3851}
3852
3853#[cfg(test)]
3854mod tests {
3855    use std::io::Cursor;
3856
3857    use regex::bytes::Regex;
3858
3859    use crate::utils::unix_local_time_to_string;
3860
3861    use super::*;
3862
3863    macro_rules! lazy_cache {
3864        ($l: literal) => {
3865            LazyCache::from_read_seek(Cursor::new($l)).unwrap()
3866        };
3867    }
3868
3869    fn first_magic(
3870        rule: &str,
3871        content: &[u8],
3872        stream_kind: StreamKind,
3873    ) -> Result<Magic<'static>, Error> {
3874        let mut md = MagicDb::new();
3875        md.load(
3876            FileMagicParser::parse_str(rule, None)
3877                .inspect_err(|e| eprintln!("{e}"))
3878                .unwrap(),
3879        );
3880        let mut reader = LazyCache::from_read_seek(Cursor::new(content)).unwrap();
3881        let v = md.best_magic_with_stream_kind(&mut reader, stream_kind)?;
3882        Ok(v.into_owned())
3883    }
3884
3885    /// helper macro to debug tests
3886    #[allow(unused_macros)]
3887    macro_rules! enable_trace {
3888        () => {
3889            tracing_subscriber::fmt()
3890                .with_max_level(tracing_subscriber::filter::LevelFilter::TRACE)
3891                .try_init();
3892        };
3893    }
3894
3895    macro_rules! parse_assert {
3896        ($rule:literal) => {
3897            FileMagicParser::parse_str($rule, None)
3898                .inspect_err(|e| eprintln!("{e}"))
3899                .unwrap()
3900        };
3901    }
3902
3903    macro_rules! assert_magic_match_bin {
3904        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Binary).unwrap() }};
3905        ($rule: literal, $content:literal, $message:expr) => {{
3906            assert_eq!(
3907                first_magic($rule, $content, StreamKind::Binary)
3908                    .unwrap()
3909                    .message(),
3910                $message
3911            );
3912        }};
3913    }
3914
3915    macro_rules! assert_magic_match_text {
3916        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8)).unwrap() }};
3917        ($rule: literal, $content:literal, $message:expr) => {{
3918            assert_eq!(
3919                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
3920                    .unwrap()
3921                    .message(),
3922                $message
3923            );
3924        }};
3925    }
3926
3927    macro_rules! assert_magic_not_match_text {
3928        ($rule: literal, $content:literal) => {{
3929            assert!(
3930                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
3931                    .unwrap()
3932                    .is_default()
3933            );
3934        }};
3935    }
3936
3937    macro_rules! assert_magic_not_match_bin {
3938        ($rule: literal, $content:literal) => {{
3939            assert!(
3940                first_magic($rule, $content, StreamKind::Binary)
3941                    .unwrap()
3942                    .is_default()
3943            );
3944        }};
3945    }
3946
3947    #[test]
3948    fn test_regex() {
3949        assert_magic_match_text!(
3950            r#"
39510	regex/1024 \^#![[:space:]]*/usr/bin/env[[:space:]]+
3952!:mime	text/x-shellscript
3953>&0  regex/64 .*($|\\b) %s shell script text executable
3954    "#,
3955            br#"#!/usr/bin/env bash
3956        echo hello world"#,
3957            // the magic generated
3958            "bash shell script text executable"
3959        );
3960
3961        let re = Regex::new(r"(?-u)\x42\x82").unwrap();
3962        assert!(re.is_match(b"\x42\x82"));
3963
3964        assert_magic_match_bin!(
3965            r#"0 regex \x42\x82 binary regex match"#,
3966            b"\x00\x00\x00\x00\x00\x00\x42\x82"
3967        );
3968
3969        // test regex continuation after match
3970        assert_magic_match_bin!(
3971            r#"
3972            0 regex \x42\x82
3973            >&0 string \xde\xad\xbe\xef it works
3974            "#,
3975            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
3976        );
3977
3978        assert_magic_match_bin!(
3979            r#"
3980            0 regex/s \x42\x82
3981            >&0 string \x42\x82\xde\xad\xbe\xef it works
3982            "#,
3983            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
3984        );
3985
3986        // ^ must match stat of line when matching text
3987        assert_magic_match_text!(
3988            r#"
39890	regex/1024 \^HelloWorld$ HelloWorld String"#,
3990            br#"
3991// this is a comment after an empty line
3992HelloWorld
3993            "#
3994        );
3995    }
3996
3997    #[test]
3998    fn test_string_with_mods() {
3999        assert_magic_match_text!(
4000            r#"0	string/w	#!\ \ \ /usr/bin/env\ bash	BASH
4001        "#,
4002            b"#! /usr/bin/env bash i
4003        echo hello world"
4004        );
4005
4006        // test uppercase insensitive
4007        assert_magic_match_text!(
4008            r#"0	string/C	HelloWorld	it works
4009        "#,
4010            b"helloworld"
4011        );
4012
4013        assert_magic_not_match_text!(
4014            r#"0	string/C	HelloWorld	it works
4015        "#,
4016            b"hELLOwORLD"
4017        );
4018
4019        // test lowercase insensitive
4020        assert_magic_match_text!(
4021            r#"0	string/c	HelloWorld	it works
4022        "#,
4023            b"HELLOWORLD"
4024        );
4025
4026        assert_magic_not_match_text!(
4027            r#"0	string/c	HelloWorld	it works
4028        "#,
4029            b"helloworld"
4030        );
4031
4032        // test full word match
4033        assert_magic_match_text!(
4034            r#"0	string/f	#!/usr/bin/env\ bash	BASH
4035        "#,
4036            b"#!/usr/bin/env bash"
4037        );
4038
4039        assert_magic_not_match_text!(
4040            r#"0	string/f	#!/usr/bin/python PYTHON"#,
4041            b"#!/usr/bin/pythonic"
4042        );
4043
4044        // testing whitespace compacting
4045        assert_magic_match_text!(
4046            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
4047            b"#!/usr/bin/env    python"
4048        );
4049
4050        assert_magic_not_match_text!(
4051            r#"0	string/W	#!/usr/bin/env\ \ python  PYTHON"#,
4052            b"#!/usr/bin/env python"
4053        );
4054    }
4055
4056    #[test]
4057    fn test_search_with_mods() {
4058        assert_magic_match_text!(
4059            r#"0	search/1/fwt	#!\ /usr/bin/luatex	LuaTex script text executable"#,
4060            b"#!          /usr/bin/luatex "
4061        );
4062
4063        // test matching from the beginning
4064        assert_magic_match_text!(
4065            r#"
4066            0	search/s	/usr/bin/env
4067            >&0 string /usr/bin/env it works
4068            "#,
4069            b"#!/usr/bin/env    python"
4070        );
4071
4072        assert_magic_not_match_text!(
4073            r#"
4074            0	search	/usr/bin/env
4075            >&0 string /usr/bin/env it works
4076            "#,
4077            b"#!/usr/bin/env    python"
4078        );
4079    }
4080
4081    #[test]
4082    fn test_pstring() {
4083        assert_magic_match_bin!(r#"0 pstring Toast it works"#, b"\x05Toast");
4084
4085        assert_magic_match_bin!(r#"0 pstring Toast %s"#, b"\x05Toast", "Toast");
4086
4087        assert_magic_not_match_bin!(r#"0 pstring Toast Doesn't work"#, b"\x07Toaster");
4088
4089        // testing with modifiers
4090        assert_magic_match_bin!(r#"0 pstring/H Toast it works"#, b"\x00\x05Toast");
4091
4092        assert_magic_match_bin!(r#"0 pstring/HJ Toast it works"#, b"\x00\x07Toast");
4093
4094        assert_magic_match_bin!(r#"0 pstring/HJ Toast %s"#, b"\x00\x07Toast", "Toast");
4095
4096        assert_magic_match_bin!(r#"0 pstring/h Toast it works"#, b"\x05\x00Toast");
4097
4098        assert_magic_match_bin!(r#"0 pstring/hJ Toast it works"#, b"\x07\x00Toast");
4099
4100        assert_magic_match_bin!(r#"0 pstring/L Toast it works"#, b"\x00\x00\x00\x05Toast");
4101
4102        assert_magic_match_bin!(r#"0 pstring/LJ Toast it works"#, b"\x00\x00\x00\x09Toast");
4103
4104        assert_magic_match_bin!(r#"0 pstring/l Toast it works"#, b"\x05\x00\x00\x00Toast");
4105
4106        assert_magic_match_bin!(r#"0 pstring/lJ Toast it works"#, b"\x09\x00\x00\x00Toast");
4107    }
4108
4109    #[test]
4110    fn test_max_recursion() {
4111        let res = first_magic(
4112            r#"0	indirect x"#,
4113            b"#!          /usr/bin/luatex ",
4114            StreamKind::Binary,
4115        );
4116        assert!(res.is_err());
4117        let _ = res.inspect_err(|e| {
4118            assert!(matches!(
4119                e.unwrap_localized(),
4120                Error::MaximumRecursion(MAX_RECURSION)
4121            ))
4122        });
4123    }
4124
4125    #[test]
4126    fn test_string_ops() {
4127        assert_magic_match_text!("0	string/b MZ MZ File", b"MZ\0");
4128        assert_magic_match_text!("0	string !MZ Not MZ File", b"AZ\0");
4129        assert_magic_match_text!("0	string >\0 Any String", b"A\0");
4130        assert_magic_match_text!("0	string >Test Any String", b"Test 1\0");
4131        assert_magic_match_text!("0	string <Test Any String", b"\0");
4132        assert_magic_not_match_text!("0	string >Test Any String", b"\0");
4133    }
4134
4135    #[test]
4136    fn test_lestring16() {
4137        assert_magic_match_bin!(
4138            "0 lestring16 abcd Little-endian UTF-16 string",
4139            b"\x61\x00\x62\x00\x63\x00\x64\x00"
4140        );
4141        assert_magic_match_bin!(
4142            "0 lestring16 x %s",
4143            b"\x61\x00\x62\x00\x63\x00\x64\x00\x00",
4144            "abcd"
4145        );
4146        assert_magic_not_match_bin!(
4147            "0 lestring16 abcd Little-endian UTF-16 string",
4148            b"\x00\x61\x00\x62\x00\x63\x00\x64"
4149        );
4150        assert_magic_match_bin!(
4151            "4 lestring16 abcd Little-endian UTF-16 string",
4152            b"\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64\x00"
4153        );
4154    }
4155
4156    #[test]
4157    fn test_bestring16() {
4158        assert_magic_match_bin!(
4159            "0 bestring16 abcd Big-endian UTF-16 string",
4160            b"\x00\x61\x00\x62\x00\x63\x00\x64"
4161        );
4162        assert_magic_match_bin!(
4163            "0 bestring16 x %s",
4164            b"\x00\x61\x00\x62\x00\x63\x00\x64",
4165            "abcd"
4166        );
4167        assert_magic_not_match_bin!(
4168            "0 bestring16 abcd Big-endian UTF-16 string",
4169            b"\x61\x00\x62\x00\x63\x00\x64\x00"
4170        );
4171        assert_magic_match_bin!(
4172            "4 bestring16 abcd Big-endian UTF-16 string",
4173            b"\x00\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64"
4174        );
4175    }
4176
4177    #[test]
4178    fn test_offset_from_end() {
4179        assert_magic_match_bin!("-1 ubyte 0x42 last byte ok", b"\x00\x00\x42");
4180        assert_magic_match_bin!("-2 ubyte 0x41 last byte ok", b"\x00\x41\x00");
4181    }
4182
4183    #[test]
4184    fn test_relative_offset() {
4185        assert_magic_match_bin!(
4186            "
4187            0 ubyte 0x42
4188            >&0 ubyte 0x00
4189            >>&0 ubyte 0x41 third byte ok
4190            ",
4191            b"\x42\x00\x41\x00"
4192        );
4193    }
4194
4195    #[test]
4196    fn test_indirect_offset() {
4197        assert_magic_match_bin!("(0.l) ubyte 0x42 it works", b"\x04\x00\x00\x00\x42");
4198        // adding fixed value to offset
4199        assert_magic_match_bin!("(0.l+3) ubyte 0x42 it works", b"\x01\x00\x00\x00\x42");
4200        // testing offset pair
4201        assert_magic_match_bin!(
4202            "(0.l+(4)) ubyte 0x42 it works",
4203            b"\x04\x00\x00\x00\x04\x00\x00\x00\x42"
4204        );
4205    }
4206
4207    #[test]
4208    fn test_use_with_message() {
4209        assert_magic_match_bin!(
4210            r#"
42110 string MZ
4212>0 use mz first match
4213
42140 name mz then second match
4215>0 string MZ
4216"#,
4217            b"MZ\0",
4218            "first match then second match"
4219        );
4220    }
4221
4222    #[test]
4223    fn test_scalar_transform() {
4224        assert_magic_match_bin!("0 ubyte+1 0x1 add works", b"\x00");
4225        assert_magic_match_bin!("0 ubyte-1 0xfe sub works", b"\xff");
4226        assert_magic_match_bin!("0 ubyte%2 0 mod works", b"\x0a");
4227        assert_magic_match_bin!("0 ubyte&0x0f 0x0f bitand works", b"\xff");
4228        assert_magic_match_bin!("0 ubyte|0x0f 0xff bitor works", b"\xf0");
4229        assert_magic_match_bin!("0 ubyte^0x0f 0xf0 bitxor works", b"\xff");
4230
4231        FileMagicParser::parse_str("0 ubyte%0 mod by zero", None)
4232            .expect_err("expect div by zero error");
4233        FileMagicParser::parse_str("0 ubyte/0 div by zero", None)
4234            .expect_err("expect div by zero error");
4235    }
4236
4237    #[test]
4238    fn test_belong() {
4239        // Test that a file with a four-byte value at offset 0 that matches the given value in big-endian byte order
4240        assert_magic_match_bin!("0 belong 0x12345678 Big-endian long", b"\x12\x34\x56\x78");
4241        // Test that a file with a four-byte value at offset 0 that does not match the given value in big-endian byte order
4242        assert_magic_not_match_bin!("0 belong 0x12345678 Big-endian long", b"\x78\x56\x34\x12");
4243        // Test that a file with a four-byte value at a non-zero offset that matches the given value in big-endian byte order
4244        assert_magic_match_bin!(
4245            "4 belong 0x12345678 Big-endian long",
4246            b"\x00\x00\x00\x00\x12\x34\x56\x78"
4247        );
4248        // Test < operator
4249        assert_magic_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x77");
4250        assert_magic_not_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x78");
4251
4252        // Test > operator
4253        assert_magic_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x79");
4254        assert_magic_not_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x78");
4255
4256        // Test & operator
4257        assert_magic_match_bin!("0 belong &0x5678 Big-endian long", b"\x00\x00\x56\x78");
4258        assert_magic_not_match_bin!("0 belong &0x0000FFFF Big-endian long", b"\x12\x34\x56\x78");
4259
4260        // Test ^ operator (bitwise AND with complement)
4261        assert_magic_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x00\x56\x78");
4262        assert_magic_not_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x01\x56\x78");
4263
4264        // Test ~ operator
4265        assert_magic_match_bin!("0 belong ~0x12345678 Big-endian long", b"\xed\xcb\xa9\x87");
4266        assert_magic_not_match_bin!("0 belong ~0x12345678 Big-endian long", b"\x12\x34\x56\x78");
4267
4268        // Test x operator
4269        assert_magic_match_bin!("0 belong x Big-endian long", b"\x12\x34\x56\x78");
4270        assert_magic_match_bin!("0 belong x Big-endian long", b"\x78\x56\x34\x12");
4271    }
4272
4273    #[test]
4274    fn test_parse_search() {
4275        parse_assert!("0 search test");
4276        parse_assert!("0 search/24/s test");
4277        parse_assert!("0 search/s/24 test");
4278    }
4279
4280    #[test]
4281    fn test_bedate() {
4282        assert_magic_match_bin!(
4283            "0 bedate 946684800 Unix date (Jan 1, 2000)",
4284            b"\x38\x6D\x43\x80"
4285        );
4286        assert_magic_not_match_bin!(
4287            "0 bedate 946684800 Unix date (Jan 1, 2000)",
4288            b"\x00\x00\x00\x00"
4289        );
4290        assert_magic_match_bin!(
4291            "4 bedate 946684800 %s",
4292            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
4293            "2000-01-01 00:00:00"
4294        );
4295    }
4296    #[test]
4297    fn test_beldate() {
4298        assert_magic_match_bin!(
4299            "0 beldate 946684800 Local date (Jan 1, 2000)",
4300            b"\x38\x6D\x43\x80"
4301        );
4302        assert_magic_not_match_bin!(
4303            "0 beldate 946684800 Local date (Jan 1, 2000)",
4304            b"\x00\x00\x00\x00"
4305        );
4306
4307        assert_magic_match_bin!(
4308            "4 beldate 946684800 {}",
4309            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
4310            unix_local_time_to_string(946684800)
4311        );
4312    }
4313
4314    #[test]
4315    fn test_beqdate() {
4316        assert_magic_match_bin!(
4317            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
4318            b"\x00\x00\x00\x00\x38\x6D\x43\x80"
4319        );
4320
4321        assert_magic_not_match_bin!(
4322            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
4323            b"\x00\x00\x00\x00\x00\x00\x00\x00"
4324        );
4325
4326        assert_magic_match_bin!(
4327            "0 beqdate 946684800 %s",
4328            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
4329            "2000-01-01 00:00:00"
4330        );
4331    }
4332
4333    #[test]
4334    fn test_medate() {
4335        assert_magic_match_bin!(
4336            "0 medate 946684800 Unix date (Jan 1, 2000)",
4337            b"\x6D\x38\x80\x43"
4338        );
4339
4340        assert_magic_not_match_bin!(
4341            "0 medate 946684800 Unix date (Jan 1, 2000)",
4342            b"\x00\x00\x00\x00"
4343        );
4344
4345        assert_magic_match_bin!(
4346            "4 medate 946684800 %s",
4347            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
4348            "2000-01-01 00:00:00"
4349        );
4350    }
4351
4352    #[test]
4353    fn test_meldate() {
4354        assert_magic_match_bin!(
4355            "0 meldate 946684800 Local date (Jan 1, 2000)",
4356            b"\x6D\x38\x80\x43"
4357        );
4358        assert_magic_not_match_bin!(
4359            "0 meldate 946684800 Local date (Jan 1, 2000)",
4360            b"\x00\x00\x00\x00"
4361        );
4362
4363        assert_magic_match_bin!(
4364            "4 meldate 946684800 %s",
4365            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
4366            unix_local_time_to_string(946684800)
4367        );
4368    }
4369
4370    #[test]
4371    fn test_date() {
4372        assert_magic_match_bin!(
4373            "0 date 946684800 Local date (Jan 1, 2000)",
4374            b"\x80\x43\x6D\x38"
4375        );
4376        assert_magic_not_match_bin!(
4377            "0 date 946684800 Local date (Jan 1, 2000)",
4378            b"\x00\x00\x00\x00"
4379        );
4380        assert_magic_match_bin!(
4381            "4 date 946684800 {}",
4382            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
4383            "2000-01-01 00:00:00"
4384        );
4385    }
4386
4387    #[test]
4388    fn test_leldate() {
4389        assert_magic_match_bin!(
4390            "0 leldate 946684800 Local date (Jan 1, 2000)",
4391            b"\x80\x43\x6D\x38"
4392        );
4393        assert_magic_not_match_bin!(
4394            "0 leldate 946684800 Local date (Jan 1, 2000)",
4395            b"\x00\x00\x00\x00"
4396        );
4397        assert_magic_match_bin!(
4398            "4 leldate 946684800 {}",
4399            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
4400            unix_local_time_to_string(946684800)
4401        );
4402    }
4403
4404    #[test]
4405    fn test_leqdate() {
4406        assert_magic_match_bin!(
4407            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
4408            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
4409        );
4410
4411        assert_magic_not_match_bin!(
4412            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
4413            b"\x00\x00\x00\x00\x00\x00\x00\x00"
4414        );
4415        assert_magic_match_bin!(
4416            "8 leqdate 1577836800 %s",
4417            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
4418            "2020-01-01 00:00:00"
4419        );
4420    }
4421
4422    #[test]
4423    fn test_leqldate() {
4424        assert_magic_match_bin!(
4425            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
4426            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
4427        );
4428
4429        assert_magic_not_match_bin!(
4430            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
4431            b"\x00\x00\x00\x00\x00\x00\x00\x00"
4432        );
4433        assert_magic_match_bin!(
4434            "8 leqldate 1577836800 %s",
4435            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
4436            unix_local_time_to_string(1577836800)
4437        );
4438    }
4439
4440    #[test]
4441    fn test_melong() {
4442        // Test = operator
4443        assert_magic_match_bin!(
4444            "0 melong =0x12345678 Middle-endian long",
4445            b"\x34\x12\x78\x56"
4446        );
4447        assert_magic_not_match_bin!(
4448            "0 melong =0x12345678 Middle-endian long",
4449            b"\x00\x00\x00\x00"
4450        );
4451
4452        // Test < operator
4453        assert_magic_match_bin!(
4454            "0 melong <0x12345678 Middle-endian long",
4455            b"\x34\x12\x78\x55"
4456        ); // 0x12345677 in middle-endian
4457        assert_magic_not_match_bin!(
4458            "0 melong <0x12345678 Middle-endian long",
4459            b"\x34\x12\x78\x56"
4460        ); // 0x12345678 in middle-endian
4461
4462        // Test > operator
4463        assert_magic_match_bin!(
4464            "0 melong >0x12345678 Middle-endian long",
4465            b"\x34\x12\x78\x57"
4466        ); // 0x12345679 in middle-endian
4467        assert_magic_not_match_bin!(
4468            "0 melong >0x12345678 Middle-endian long",
4469            b"\x34\x12\x78\x56"
4470        ); // 0x12345678 in middle-endian
4471
4472        // Test & operator
4473        assert_magic_match_bin!("0 melong &0x5678 Middle-endian long", b"\xab\xcd\x78\x56"); // 0x00007856 in middle-endian
4474        assert_magic_not_match_bin!(
4475            "0 melong &0x0000FFFF Middle-endian long",
4476            b"\x34\x12\x78\x56"
4477        ); // 0x12347856 in middle-endian
4478
4479        // Test ^ operator (bitwise AND with complement)
4480        assert_magic_match_bin!(
4481            "0 melong ^0xFFFF0000 Middle-endian long",
4482            b"\x00\x00\x78\x56"
4483        ); // 0x00007856 in middle-endian
4484        assert_magic_not_match_bin!(
4485            "0 melong ^0xFFFF0000 Middle-endian long",
4486            b"\x00\x01\x78\x56"
4487        ); // 0x00017856 in middle-endian
4488
4489        // Test ~ operator
4490        assert_magic_match_bin!(
4491            "0 melong ~0x12345678 Middle-endian long",
4492            b"\xCB\xED\x87\xA9"
4493        );
4494        assert_magic_not_match_bin!(
4495            "0 melong ~0x12345678 Middle-endian long",
4496            b"\x34\x12\x78\x56"
4497        ); // The original value
4498
4499        // Test x operator
4500        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x34\x12\x78\x56");
4501        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x00\x00\x00\x00");
4502    }
4503
// Exercises every comparison operator of the `uquad` type.
//
// The reference value 0x123456789ABCDEF0 is stored in the fixtures
// least-significant byte first.
#[test]
fn test_uquad() {
    // Checks one rule against an input that must match, then against one
    // that must not. Arguments are forwarded as raw token trees, so the
    // assertion macros receive exactly the literals written at the call site.
    macro_rules! hit_then_miss {
        ($rule:tt, $hit:tt, $miss:tt) => {{
            assert_magic_match_bin!($rule, $hit);
            assert_magic_not_match_bin!($rule, $miss);
        }};
    }

    // `=`: the exact value matches, all zeroes do not.
    hit_then_miss!(
        "0 uquad =0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );

    // `<`: a smaller top byte matches, the equal value does not.
    hit_then_miss!(
        "0 uquad <0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x11",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `>`: a larger top byte matches, the equal value does not.
    hit_then_miss!(
        "0 uquad >0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x13",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `&`: the two checks use different masks against the same input, whose
    // low byte is 0xF0: mask 0xF0 is fully covered, mask 0xFF is not.
    assert_magic_match_bin!(
        "0 uquad &0xF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );
    assert_magic_not_match_bin!(
        "0 uquad &0xFF Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `^` with a full mask: only an input with every bit clear matches.
    hit_then_miss!(
        "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
        b"\x00\x00\x00\x00\x00\x00\x00\x00",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `~`: the bitwise complement of the operand matches, the operand
    // itself does not.
    hit_then_miss!(
        "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
        b"\x0F\x21\x43\x65\x87\xA9\xCB\xED",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `x`: always matches; the first check also pins the rendered message.
    assert_magic_match_bin!(
        "0 uquad x {:#x}",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12",
        "0x123456789abcdef0"
    );
    assert_magic_match_bin!(
        "0 uquad x Unsigned quad",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );
}
4577
// Covers the `guid` type: equality, formatting through `%s`, and rejection
// of unrelated bytes. In the matching fixture the bytes appear in the same
// order as the hex digits of the GUID text.
#[test]
fn test_guid() {
    // Same 16 bytes, once compared against the rule's GUID ...
    assert_magic_match_bin!(
        "0 guid EC959539-6786-2D4E-8FDB-98814CE76C1E It works",
        b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E"
    );

    // ... and once rendered back to text.
    assert_magic_match_bin!(
        "0 guid x %s",
        b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E",
        "EC959539-6786-2D4E-8FDB-98814CE76C1E"
    );

    // A counting byte sequence must not be taken for the GUID in the rule.
    assert_magic_not_match_bin!(
        "0 guid 399595EC-8667-4E2D-8FDB-98814CE76C1E It works",
        b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
    );
}
4596
// Covers the `ubeqdate` type. The fixture holds 1633046400 (0x61564F80) as
// eight bytes, most-significant byte first.
#[test]
fn test_ubeqdate() {
    // Equality against the raw timestamp: hit ...
    assert_magic_match_bin!(
        "0 ubeqdate 1633046400 It works",
        b"\x00\x00\x00\x00\x61\x56\x4f\x80"
    );

    // ... and miss on an all-zero timestamp.
    assert_magic_not_match_bin!(
        "0 ubeqdate 1633046400 It should not work",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );

    // `%s` renders the timestamp as a date-time string.
    assert_magic_match_bin!(
        "0 ubeqdate x %s",
        b"\x00\x00\x00\x00\x61\x56\x4f\x80",
        "2021-10-01 00:00:00"
    );
}
4615
// Covers the `ldate` type. The fixture holds 1640551520 (0x61C8D460) as
// four bytes, least-significant byte first.
#[test]
fn test_ldate() {
    // Equality against the raw timestamp.
    assert_magic_match_bin!("0 ldate 1640551520 It works", b"\x60\xd4\xC8\x61");

    // The rendered text must equal what `unix_local_time_to_string` yields
    // for the same timestamp, so the expectation is computed, not hard-coded.
    assert_magic_match_bin!(
        "0 ldate x %s",
        b"\x60\xd4\xC8\x61",
        unix_local_time_to_string(1640551520)
    );

    // An all-zero timestamp is not the one the rule asks for.
    assert_magic_not_match_bin!("0 ldate 1633046400 It should not work", b"\x00\x00\x00\x00");
}
4628
// Integer value transforms: the single input byte is 0x14 (20).
#[test]
fn test_scalar_with_transform() {
    // The transformed value is what gets printed: 20 / 10 and 20 % 10.
    assert_magic_match_bin!("0 ubyte/10 x {}", b"\x14", "2");
    assert_magic_match_bin!("0 ubyte%10 x {}", b"\x14", "0");

    // The transformed value is also what the test operand is compared to.
    assert_magic_match_bin!("0 ubyte/10 2 {}", b"\x14", "2");
}
4635
// Float value transforms: the input bytes are 0x41A00000 least-significant
// byte first, i.e. the IEEE-754 single-precision encoding of 20.0.
#[test]
fn test_float_with_transform() {
    // The transformed value is what gets printed: 20 / 10 and 20 % 10.
    assert_magic_match_bin!("0 lefloat/10 x {}", b"\x00\x00\xa0\x41", "2");
    assert_magic_match_bin!("0 lefloat%10 x {}", b"\x00\x00\xa0\x41", "0");

    // The transformed value is also what the test operand is compared to.
    assert_magic_match_bin!("0 lefloat/10 2 {}", b"\x00\x00\xa0\x41", "2");
}
4642
// Pins the parsing contract of `read_octal_u64`: input must start with '0',
// digits 0-7 are consumed, and parsing stops at the first other character.
#[test]
fn test_read_octal() {
    // Runs `read_octal_u64` over a fresh lazy cache built from `$text`. The
    // literal is forwarded as a raw token so `lazy_cache!` sees exactly what
    // a direct invocation would.
    macro_rules! octal {
        ($text:tt) => {
            read_octal_u64(&mut lazy_cache!($text))
        };
    }

    // Plain, well-formed octal numbers.
    assert_eq!(octal!("0"), Some(0));
    assert_eq!(octal!("00"), Some(0));
    assert_eq!(octal!("01"), Some(1));
    assert_eq!(octal!("07"), Some(7));
    assert_eq!(octal!("010"), Some(8));
    assert_eq!(octal!("0123"), Some(83));
    assert_eq!(octal!("0755"), Some(493));

    // Letters after the digits are left alone.
    assert_eq!(octal!("0ABC"), Some(0));
    assert_eq!(octal!("01ABC"), Some(1));
    assert_eq!(octal!("0755ABC"), Some(493));
    assert_eq!(octal!("0123ABC"), Some(83));

    // '8' is not an octal digit: parsing ends right before it.
    assert_eq!(octal!("08"), Some(0));
    assert_eq!(octal!("01238"), Some(83));

    // Without the leading '0' nothing is parsed at all.
    assert_eq!(octal!("123"), None);
    assert_eq!(octal!("755"), None);

    // Neither is an empty input ...
    assert_eq!(octal!(""), None);

    // ... nor one that starts with a non-'0' character.
    assert_eq!(octal!("ABC"), None);
    assert_eq!(octal!("8ABC"), None);

    // A longer number that still fits comfortably in a u64.
    assert_eq!(octal!("01777777777"), Some(268435455));
}
4681
// Regression test for indirect offsets resolved inside nested `use` rules.
//
// Fixture layout (13 bytes):
//   [0]       0x00
//   [1..=4]   "TEST"
//   [5]       0x06
//   [6..=10]  "twice"
//   [11]      0x00
//   [12]      0x06
//
// The expected message is made of three fragments: "Bread is" from the
// top-level rule, "Toasted" from `toasted`, and whatever the `%s` of
// `toasted_twice` prints (expected to be "twice").
#[test]
fn test_offset_bug_1() {
    // this tests the exact behaviour
    // expected by libmagic/file
    assert_magic_match_bin!(
        r"
1	string		TEST Bread is
# offset computation is relative to
# rule start
>(5.b)	use toasted

0 name toasted
>0	string twice Toasted
>>0  use toasted_twice

0 name toasted_twice
>(6.b) string x %s
        ",
        b"\x00TEST\x06twice\x00\x06",
        "Bread is Toasted twice"
    );
}
4704
// This test implements exactly the same logic as test_offset_bug_1, except
// that the top-level rule is anchored from the end of the input (`-12` on a
// 13-byte fixture). Surprisingly, the indirect offset of the first `use`
// has to be adjusted (`(4.b)` instead of `(5.b)`) for this to behave the
// way libmagic/file does.
//
// Fixture layout (13 bytes):
//   [0]       0x00
//   [1..=4]   "TEST"
//   [5]       0x06
//   [6..=10]  "twice"
//   [11]      0x00
//   [12]      0x06
#[test]
fn test_offset_bug_2() {
    // this tests the exact behaviour
    // expected by libmagic/file
    assert_magic_match_bin!(
        r"
-12	string		TEST Bread is
>(4.b)	use toasted

0 name toasted
>0	string twice Toasted
>>0  use toasted_twice

0 name toasted_twice
>(6.b) string x %s
        ",
        b"\x00TEST\x06twice\x00\x06",
        // The trailing "twice" comes from the `%s` of `toasted_twice`; a
        // bare `%` there would never render the matched string.
        "Bread is Toasted twice"
    )
}
4730
// Regression test for offsets after an `indirect/r` jump followed by a
// `use` rule.
//
// Fixture layout (13 bytes):
//   [0]       0x00
//   [1..=4]   "TEST"
//   [5]       0x06
//   [6..=10]  "twice"
//   [11]      0x00
//   [12]      0x08
//
// The expected message is made of three fragments: "Bread is" from the
// first rule, "Toasted" from the rule reached through the indirection, and
// whatever the `%s` of `toasted_twice` prints (expected to be "twice").
#[test]
fn test_offset_bug_3() {
    // this tests the exact behaviour
    // expected by libmagic/file
    assert_magic_match_bin!(
        r"
1	string		TEST Bread is
>(5.b) indirect/r x

0	string twice Toasted
>0  use toasted_twice

0 name toasted_twice
>0 string x %s
        ",
        b"\x00TEST\x06twice\x00\x08",
        "Bread is Toasted twice"
    )
}
4750
// Regression test for how base offsets accumulate across an `indirect/r`
// jump and a subsequent `use` (see the comments embedded in the rules).
//
// Fixture layout (24 bytes):
//   [0]        0x00
//   [1..=5]    "Bread"
//   [6]        0x06
//   [7..=16]   "is Toasted"
//   [17]       0x0c
//   [18..=22]  "twice"
//   [23]       0x00
//
// Every rule prints its matched string through `%s`, so the expected
// message is the three matched strings in order.
#[test]
fn test_offset_bug_4() {
    // this tests the exact behaviour
    // expected by libmagic/file
    assert_magic_match_bin!(
        r"
1	string		Bread %s
>(6.b) indirect/r x

# this one uses a based offset
# computed at indirection
1	string is\ Toasted %s
>(11.b)  use toasted_twice

# this one is using a new base
# offset being previous base
# offset + offset of use
0 name toasted_twice
>0 string x %s
            ",
        b"\x00Bread\x06is Toasted\x0ctwice\x00",
        "Bread is Toasted twice"
    )
}
4775
// Same setup as test_offset_bug_3, but the final fragment is produced by a
// relative (`&1`) byte test nested under a string match inside the named
// rule, rather than by a `%s`.
//
// Fixture layout (13 bytes):
//   [0]       0x00
//   [1..=4]   "TEST"
//   [5]       0x06
//   [6..=10]  "twice"
//   [11]      0x00
//   [12]      0x08   <- the byte the last rule compares against 0x08
#[test]
fn test_offset_bug_5() {
    assert_magic_match_bin!(
        r"
1	string		TEST Bread is
>(5.b) indirect/r x

0	string twice Toasted
>0  use toasted_twice

0 name toasted_twice
>0 string twice
>>&1 byte 0x08 twice
            ",
        b"\x00TEST\x06twice\x00\x08",
        "Bread is Toasted twice"
    )
}
4794
// Regression test: the `default` line nested under `use toasted` must not
// add "but not burnt" to the message.
//
// Fixture layout (13 bytes):
//   [0]       0x00
//   [1..=4]   "TEST"
//   [5]       0x06
//   [6..=12]  "toasted"
//
// The named rule `toasted` has a single string test and no message text.
#[test]
fn test_bug_6() {
    // An indirect use test should not be successful
    // even if a match with no message occurs
    assert_magic_match_bin!(
        r"
1	string		TEST Bread is toasted
>&0 use toasted
>>&0 default x but not burnt

0 name toasted
>1 string toasted
            ",
        b"\x00TEST\x06toasted",
        "Bread is toasted"
    )
}
4813
// Checks that the parts of a match's message can be iterated and searched.
// The rule's message is "PYTHON"; the lookup is ASCII case-insensitive.
#[test]
fn test_message_parts() {
    let magic = first_magic(
        r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
        b"#!/usr/bin/env    python",
        StreamKind::Text(TextEncoding::Ascii),
    )
    .unwrap();

    let mut parts = magic.message_parts();
    let mentions_python = parts.any(|part| part.eq_ignore_ascii_case("python"));

    assert!(mentions_python)
}
4825
// Loads several parsed rule sources in one `load_bulk` call and expects the
// resulting database to pass verification.
#[test]
fn test_load_bulk() {
    // Three spellings of a `search` rule, with and without modifiers.
    let sources = vec![
        parse_assert!("0 search test"),
        parse_assert!("0 search/24/s test"),
        parse_assert!("0 search/s/24 test"),
    ];

    let mut db = MagicDb::new();
    db.load_bulk(sources.into_iter());

    db.verify().unwrap();
}
4839
// Bulk-loads a source whose rule does `use test` although no rule named
// `test` is declared in it, and expects verification to report
// `Error::Verify`.
#[test]
fn test_load_bulk_failure() {
    let sources = vec![parse_assert!(
        r#"
0 search/s/24 test
>0 use test
"#
    )];

    let mut db = MagicDb::new();
    db.load_bulk(sources.into_iter());

    let verdict = db.verify();
    assert!(matches!(verdict, Err(Error::Verify(..))));
}
4854}