pure_magic/
lib.rs

1#![forbid(unsafe_code)]
2#![deny(unused_imports)]
3#![deny(missing_docs)]
4//! # `pure-magic`: A pure and safe Rust Reimplementation of `libmagic`
5//!
6//! Unlike many file identification crates, `pure-magic` is highly compatible with the standard
7//! `magic` rule format, allowing seamless reuse of existing
8//! [rules](https://github.com/qjerome/magic-rs/tree/main/magic-db/src/magdir). This makes it an ideal
9//! drop-in replacement for crates relying on **`libmagic` C bindings**, where memory safety is critical.
10//!
11//! **Key Features:**
12//! - File type detection
13//! - MIME type inference
14//! - Custom magic rule parsing
15//!
16//! ## Installation
17//! Add `pure-magic` to your `Cargo.toml`:
18//!
19//! ```toml
20//! [dependencies]
21//! pure-magic = "0.1"  # Replace with the latest version
22//! ```
23//!
24//! Or add the latest version with cargo:
25//!
26//! ```sh
27//! cargo add pure-magic
28//! ```
29//!
30//! ## Quick Start
31//!
32//! ### Detect File Types Programmatically
33//! ```rust
34//! use pure_magic::{MagicDb, MagicSource};
35//! use std::fs::File;
36//!
37//! fn main() -> Result<(), Box<dyn std::error::Error>> {
38//!     let mut db = MagicDb::new();
39//!     // Create a MagicSource from a file
40//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
41//!     db.load(rust_magic);
42//!     // Verification is not mandatory
43//!     db.verify()?;
44//!
45//!     // Open a file and detect its type
46//!     let mut file = File::open("src/lib.rs")?;
47//!     let magic = db.first_magic(&mut file, None)?;
48//!
49//!     println!(
50//!         "File type: {} (MIME: {}, strength: {})",
51//!         magic.message(),
52//!         magic.mime_type(),
53//!         magic.strength()
54//!     );
55//!     Ok(())
56//! }
57//! ```
58//!
59//! ### Get All Matching Rules
60//! ```rust
61//! use pure_magic::{MagicDb, MagicSource};
62//! use std::fs::File;
63//!
64//! fn main() -> Result<(), Box<dyn std::error::Error>> {
65//!     let mut db = MagicDb::new();
66//!     // Create a MagicSource from a file
67//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
68//!     db.load(rust_magic);
69//!
70//!     // Open a file and detect its type
71//!     let mut file = File::open("src/lib.rs")?;
72//!
73//!     // Get all matching rules, sorted by strength
74//!     let magics = db.all_magics(&mut file)?;
75//!
76//!     // Must contain rust file magic and default text magic
77//!     assert!(magics.len() > 1);
78//!
79//!     for magic in magics {
80//!         println!(
81//!             "Match: {} (strength: {}, source: {})",
82//!             magic.message(),
83//!             magic.strength(),
84//!             magic.source().unwrap_or("unknown")
85//!         );
86//!     }
87//!     Ok(())
88//! }
89//! ```
90//!
91//! ### Serialize a Database to Disk
92//! ```rust
93//! use pure_magic::{MagicDb, MagicSource};
94//! use std::fs::File;
95//!
96//! fn main() -> Result<(), Box<dyn std::error::Error>> {
97//!     let mut db = MagicDb::new();
98//!     // Create a MagicSource from a file
99//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
100//!     db.load(rust_magic);
101//!
102//!     // Serialize the database to a file
103//!     let mut output = File::create("/tmp/compiled.db")?;
104//!     db.serialize(&mut output)?;
105//!
106//!     println!("Database saved to file");
107//!     Ok(())
108//! }
109//! ```
110//!
111//! ### Deserialize a Database
112//! ```rust
113//! use pure_magic::{MagicDb, MagicSource};
114//! use std::fs::File;
115//!
116//! fn main() -> Result<(), Box<dyn std::error::Error>> {
117//!     let mut db = MagicDb::new();
118//!     // Create a MagicSource from a file
119//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
120//!     db.load(rust_magic);
121//!
122//!     // Serialize the database in a vector
123//!     let mut ser = vec![];
124//!     db.serialize(&mut ser)?;
125//!     println!("Database saved to vector");
126//!
127//!     // We deserialize from slice
128//!     let db = MagicDb::deserialize(&mut ser.as_slice())?;
129//!
130//!     assert!(!db.rules().is_empty());
131//!
132//!     Ok(())
133//! }
134//! ```
135//!
136//! ## License
137//! This project is licensed under the **GPL-3.0 License**.
138//!
139//! ## Contributing
140//! Contributions are welcome! Open an issue or submit a pull request.
141//!
142//! ## Acknowledgments
143//! - Inspired by the original `libmagic` (part of the `file` command).
144
145use dyf::{DynDisplay, FormatString, dformat};
146use flagset::{FlagSet, flags};
147use flate2::{Compression, read::GzDecoder, write::GzEncoder};
148use lazy_cache::LazyCache;
149use memchr::memchr;
150use pest::{Span, error::ErrorVariant};
151use regex::bytes::{self};
152use serde::{Deserialize, Serialize};
153use std::{
154    borrow::Cow,
155    cmp::max,
156    collections::{HashMap, HashSet},
157    fmt::{self, Debug, Display},
158    io::{self, Read, Seek, SeekFrom, Write},
159    ops::{Add, BitAnd, BitOr, BitXor, Div, Mul, Rem, Sub},
160    path::Path,
161};
162use tar::Archive;
163use thiserror::Error;
164use tracing::{Level, debug, enabled, trace};
165
166use crate::{
167    numeric::{Float, FloatDataType, Scalar, ScalarDataType},
168    parser::{FileMagicParser, Rule},
169    utils::{decode_id3, find_json_boundaries, run_utf8_validation},
170};
171
172mod numeric;
173mod parser;
174mod utils;
175
// Strength value attached to hardcoded magics.
// NOTE(review): the consumers of this constant are outside this section — confirm usage.
const HARDCODED_MAGIC_STRENGTH: u64 = 2048;
// Source label attached to hardcoded magics.
// NOTE(review): usage is outside this section — confirm.
const HARDCODED_SOURCE: &str = "hardcoded";
// Corresponds to the FILE_INDIR_MAX constant defined in libmagic.
const MAX_RECURSION: usize = 50;
// Constant found in libmagic, used as a limit for regex tests.
const FILE_REGEX_MAX: usize = 8192;

/// Maximum number of bytes to read for search tests.
///
/// This constant is derived from `libmagic` and is used to limit the number of bytes
/// read during search tests to ensure performance and efficiency. The value is set
/// to 7 megabytes.
pub const FILE_BYTES_MAX: usize = 7 * 1024 * 1024;
/// Default MIME type for unidentified binary data.
pub const DEFAULT_BIN_MIMETYPE: &str = "application/octet-stream";
/// Default MIME type for unidentified text data.
pub const DEFAULT_TEXT_MIMETYPE: &str = "text/plain";

// `strftime`-style pattern (`YYYY-MM-DD HH:MM:SS`) shared crate-wide for timestamps.
pub(crate) const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
195
/// Panics with the given `panic!`-style arguments, but only in builds where
/// `debug_assertions` is enabled. In other builds the branch is never taken,
/// so execution simply continues after the macro call.
macro_rules! debug_panic {
    ($($panic_args:tt)*) => {
        if cfg!(debug_assertions) {
            panic!($($panic_args)*);
        }
    };
}
203
// Reads exactly `size_of::<$ty>()` bytes from the reader `$r` and yields them
// as a byte array. I/O failures are propagated with `?`, so the macro can only
// be used in a function whose error type converts from `io::Error`.
macro_rules! read {
    ($r:expr, $ty:ty) => {{
        let mut bytes = [0u8; std::mem::size_of::<$ty>()];
        $r.read_exact(&mut bytes)?;
        bytes
    }};
}

// Reads a `$ty` stored in little-endian byte order.
macro_rules! read_le {
    ($r:expr, $ty:ty) => {{
        let raw = read!($r, $ty);
        <$ty>::from_le_bytes(raw)
    }};
}

// Reads a `$ty` stored in big-endian byte order.
macro_rules! read_be {
    ($r:expr, $ty:ty) => {{
        let raw = read!($r, $ty);
        <$ty>::from_be_bytes(raw)
    }};
}

// Reads a 32-bit middle-endian value: two little-endian 16-bit words, the
// first one read being the most significant half.
macro_rules! read_me {
    ($r:expr) => {{
        let high_word = read_le!($r, u16) as i32;
        let low_word = read_le!($r, u16) as i32;
        (high_word << 16) | low_word
    }};
}
223
224#[inline(always)]
225fn read_octal_u64<R: Read + Seek>(haystack: &mut LazyCache<R>) -> Option<u64> {
226    let s = haystack
227        .read_while_or_limit(|b| matches!(b, b'0'..=b'7'), 22)
228        .map(|buf| str::from_utf8(buf))
229        .ok()?
230        .ok()?;
231
232    if !s.starts_with("0") {
233        return None;
234    }
235
236    u64::from_str_radix(s, 8).ok()
237}
238
/// Represents all possible errors that can occur during file type detection and processing.
///
/// Variants carrying `#[from]` are created automatically by the `?` operator
/// from the corresponding underlying error type.
#[derive(Debug, Error)]
pub enum Error {
    /// A generic error with a custom message.
    #[error("{0}")]
    Msg(String),

    /// Indicates a rule load failure.
    ///
    /// Fields are, in order: the source name, the line number and the nested error.
    #[error("source={0} line={1} error={2}")]
    Verify(String, usize, Box<Error>),

    /// An error with a source location and a nested error.
    ///
    /// Fields are, in order: the source name, the line number and the nested error.
    #[error("source={0} line={1} error={2}")]
    Localized(String, usize, Box<Error>),

    /// Indicates a required rule was not found.
    #[error("missing rule: {0}")]
    MissingRule(String),

    /// Indicates the maximum recursion depth was reached.
    #[error("maximum recursion reached: {0}")]
    MaximumRecursion(usize),

    /// Wraps an I/O error.
    #[error("io: {0}")]
    Io(#[from] io::Error),

    /// Wraps a parsing error from the `pest` parser.
    #[error("parser error: {0}")]
    Parse(#[from] Box<pest::error::Error<Rule>>),

    /// Wraps a formatting error from the `dyf` crate.
    #[error("formatting: {0}")]
    Format(#[from] dyf::Error),

    /// Wraps a regex-related error.
    #[error("regex: {0}")]
    Regex(#[from] regex::Error),

    /// Wraps a serialization error from `bincode`.
    #[error("{0}")]
    Serialize(#[from] bincode::error::EncodeError),

    /// Wraps a deserialization error from `bincode`.
    #[error("{0}")]
    Deserialize(#[from] bincode::error::DecodeError),
}
286
287impl Error {
288    #[inline]
289    fn parser<S: ToString>(msg: S, span: Span<'_>) -> Self {
290        Self::Parse(Box::new(pest::error::Error::new_from_span(
291            ErrorVariant::CustomError {
292                message: msg.to_string(),
293            },
294            span,
295        )))
296    }
297
298    fn msg<M: AsRef<str>>(msg: M) -> Self {
299        Self::Msg(msg.as_ref().into())
300    }
301
302    fn localized<S: AsRef<str>>(source: S, line: usize, err: Error) -> Self {
303        Self::Localized(source.as_ref().into(), line, err.into())
304    }
305
306    /// Unwraps the localized error
307    pub fn unwrap_localized(&self) -> &Self {
308        match self {
309            Self::Localized(_, _, e) => e,
310            _ => self,
311        }
312    }
313}
314
/// A message attached to a rule: either plain text or a format string that
/// can be rendered with a matched value.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Message {
    /// Plain text, emitted as is.
    String(String),
    /// A message containing a format directive.
    Format {
        /// The printf-style conversion specifier; `"c"` makes byte scalars
        /// render as characters when the message is formatted.
        printf_spec: String,
        /// The format string handed to `dformat!` when rendering.
        fs: FormatString,
    },
}
323
324impl Display for Message {
325    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
326        match self {
327            Self::String(s) => write!(f, "{s}"),
328            Self::Format { printf_spec: _, fs } => write!(f, "{}", fs.to_string_lossy()),
329        }
330    }
331}
332
333impl Message {
334    fn to_string_lossy(&self) -> Cow<'_, str> {
335        match self {
336            Message::String(s) => Cow::Borrowed(s),
337            Message::Format { printf_spec: _, fs } => fs.to_string_lossy(),
338        }
339    }
340
341    #[inline(always)]
342    fn format_with(&self, mr: Option<&MatchRes>) -> Result<Cow<'_, str>, Error> {
343        match self {
344            Self::String(s) => Ok(Cow::Borrowed(s.as_str())),
345            Self::Format {
346                printf_spec: c_spec,
347                fs,
348            } => {
349                if let Some(mr) = mr {
350                    match mr {
351                        MatchRes::Float(_, _) | MatchRes::Bytes(_, _, _, _) => {
352                            Ok(Cow::Owned(dformat!(fs, mr)?))
353                        }
354                        MatchRes::Scalar(_, scalar) => {
355                            // we want to print a byte as char
356                            if c_spec.as_str() == "c" {
357                                match scalar {
358                                    Scalar::byte(b) => {
359                                        let b = (*b as u8) as char;
360                                        Ok(Cow::Owned(dformat!(fs, b)?))
361                                    }
362                                    Scalar::ubyte(b) => {
363                                        let b = *b as char;
364                                        Ok(Cow::Owned(dformat!(fs, b)?))
365                                    }
366                                    _ => Ok(Cow::Owned(dformat!(fs, mr)?)),
367                                }
368                            } else {
369                                Ok(Cow::Owned(dformat!(fs, mr)?))
370                            }
371                        }
372                    }
373                } else {
374                    Ok(fs.to_string_lossy())
375                }
376            }
377        }
378    }
379}
380
381impl ScalarDataType {
382    #[inline(always)]
383    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Scalar, Error> {
384        macro_rules! _read_le {
385            ($ty: ty) => {{
386                if switch_endianness {
387                    <$ty>::from_be_bytes(read!(from, $ty))
388                } else {
389                    <$ty>::from_le_bytes(read!(from, $ty))
390                }
391            }};
392        }
393
394        macro_rules! _read_be {
395            ($ty: ty) => {{
396                if switch_endianness {
397                    <$ty>::from_le_bytes(read!(from, $ty))
398                } else {
399                    <$ty>::from_be_bytes(read!(from, $ty))
400                }
401            }};
402        }
403
404        macro_rules! _read_ne {
405            ($ty: ty) => {{
406                if cfg!(target_endian = "big") {
407                    _read_be!($ty)
408                } else {
409                    _read_le!($ty)
410                }
411            }};
412        }
413
414        macro_rules! _read_me {
415            () => {
416                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
417            };
418        }
419
420        Ok(match self {
421            // signed
422            Self::byte => Scalar::byte(read!(from, u8)[0] as i8),
423            Self::short => Scalar::short(_read_ne!(i16)),
424            Self::long => Scalar::long(_read_ne!(i32)),
425            Self::date => Scalar::date(_read_ne!(i32)),
426            Self::ldate => Scalar::ldate(_read_ne!(i32)),
427            Self::qwdate => Scalar::qwdate(_read_ne!(i64)),
428            Self::leshort => Scalar::leshort(_read_le!(i16)),
429            Self::lelong => Scalar::lelong(_read_le!(i32)),
430            Self::lequad => Scalar::lequad(_read_le!(i64)),
431            Self::bequad => Scalar::bequad(_read_be!(i64)),
432            Self::belong => Scalar::belong(_read_be!(i32)),
433            Self::bedate => Scalar::bedate(_read_be!(i32)),
434            Self::beldate => Scalar::beldate(_read_be!(i32)),
435            Self::beqdate => Scalar::beqdate(_read_be!(i64)),
436            // unsigned
437            Self::ubyte => Scalar::ubyte(read!(from, u8)[0]),
438            Self::ushort => Scalar::ushort(_read_ne!(u16)),
439            Self::uleshort => Scalar::uleshort(_read_le!(u16)),
440            Self::ulelong => Scalar::ulelong(_read_le!(u32)),
441            Self::uledate => Scalar::uledate(_read_le!(u32)),
442            Self::ulequad => Scalar::ulequad(_read_le!(u64)),
443            Self::offset => Scalar::offset(from.stream_position()?),
444            Self::ubequad => Scalar::ubequad(_read_be!(u64)),
445            Self::medate => Scalar::medate(_read_me!()),
446            Self::meldate => Scalar::meldate(_read_me!()),
447            Self::melong => Scalar::melong(_read_me!()),
448            Self::beshort => Scalar::beshort(_read_be!(i16)),
449            Self::quad => Scalar::quad(_read_ne!(i64)),
450            Self::uquad => Scalar::uquad(_read_ne!(u64)),
451            Self::ledate => Scalar::ledate(_read_le!(i32)),
452            Self::leldate => Scalar::leldate(_read_le!(i32)),
453            Self::leqdate => Scalar::leqdate(_read_le!(i64)),
454            Self::leqldate => Scalar::leqldate(_read_le!(i64)),
455            Self::leqwdate => Scalar::leqwdate(_read_le!(i64)),
456            Self::ubelong => Scalar::ubelong(_read_be!(u32)),
457            Self::ulong => Scalar::ulong(_read_ne!(u32)),
458            Self::ubeshort => Scalar::ubeshort(_read_be!(u16)),
459            Self::ubeqdate => Scalar::ubeqdate(_read_be!(u64)),
460            Self::lemsdosdate => Scalar::lemsdosdate(_read_le!(u16)),
461            Self::lemsdostime => Scalar::lemsdostime(_read_le!(u16)),
462            Self::guid => Scalar::guid(u128::from_be_bytes(read!(from, u128))),
463        })
464    }
465}
466
467impl FloatDataType {
468    #[inline(always)]
469    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Float, Error> {
470        macro_rules! _read_le {
471            ($ty: ty) => {{
472                if switch_endianness {
473                    <$ty>::from_be_bytes(read!(from, $ty))
474                } else {
475                    <$ty>::from_le_bytes(read!(from, $ty))
476                }
477            }};
478        }
479
480        macro_rules! _read_be {
481            ($ty: ty) => {{
482                if switch_endianness {
483                    <$ty>::from_le_bytes(read!(from, $ty))
484                } else {
485                    <$ty>::from_be_bytes(read!(from, $ty))
486                }
487            }};
488        }
489
490        macro_rules! _read_ne {
491            ($ty: ty) => {{
492                if cfg!(target_endian = "big") {
493                    _read_be!($ty)
494                } else {
495                    _read_le!($ty)
496                }
497            }};
498        }
499
500        macro_rules! _read_me {
501            () => {
502                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
503            };
504        }
505
506        Ok(match self {
507            Self::lefloat => Float::lefloat(_read_le!(f32)),
508            Self::befloat => Float::befloat(_read_le!(f32)),
509            Self::ledouble => Float::ledouble(_read_le!(f64)),
510            Self::bedouble => Float::bedouble(_read_be!(f64)),
511        })
512    }
513}
514
/// Arithmetic or bitwise operator applied to a value read from the data
/// before it is compared (see `ScalarTransform` and `FloatTransform`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Op {
    /// Multiplication, displayed as `*`.
    Mul,
    /// Addition, displayed as `+`.
    Add,
    /// Subtraction, displayed as `-`.
    Sub,
    /// Division, displayed as `/`.
    Div,
    /// Remainder, displayed as `%`.
    Mod,
    /// Bitwise and, displayed as `&`.
    And,
    /// Bitwise xor, displayed as `^`.
    Xor,
    /// Bitwise or, displayed as `|`.
    Or,
}
526
527impl Display for Op {
528    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
529        match self {
530            Op::Mul => write!(f, "*"),
531            Op::Add => write!(f, "+"),
532            Op::Sub => write!(f, "-"),
533            Op::Div => write!(f, "/"),
534            Op::Mod => write!(f, "%"),
535            Op::And => write!(f, "&"),
536            Op::Or => write!(f, "|"),
537            Op::Xor => write!(f, "^"),
538        }
539    }
540}
541
/// Comparison operator of a test.
// NOTE(review): the evaluation of each operator is outside this section; the
// descriptions below follow the variant names only.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum CmpOp {
    /// Equality.
    Eq,
    /// Less than.
    Lt,
    /// Greater than.
    Gt,
    /// Bitwise and.
    BitAnd,
    /// Negated comparison.
    Neq, // ! operator
    /// Bitwise xor.
    Xor,
    /// Bitwise not.
    Not, // ~ operator
}
552
553impl CmpOp {
554    #[inline(always)]
555    fn is_neq(&self) -> bool {
556        matches!(self, Self::Neq)
557    }
558}
559
/// An operation applied to a scalar, with `num` as its right-hand operand.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTransform {
    /// The operator to apply.
    op: Op,
    /// The right-hand operand of the operation.
    num: Scalar,
}
565
566impl ScalarTransform {
567    fn apply(&self, s: Scalar) -> Option<Scalar> {
568        match self.op {
569            Op::Add => s.checked_add(self.num),
570            Op::Sub => s.checked_sub(self.num),
571            Op::Mul => s.checked_mul(self.num),
572            Op::Div => s.checked_div(self.num),
573            Op::Mod => s.checked_rem(self.num),
574            Op::And => Some(s.bitand(self.num)),
575            Op::Xor => Some(s.bitxor(self.num)),
576            Op::Or => Some(s.bitor(self.num)),
577        }
578    }
579}
580
/// An operation applied to a float, with `num` as its right-hand operand.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTransform {
    /// The operator to apply; bitwise operators are not supported on floats.
    op: Op,
    /// The right-hand operand of the operation.
    num: Float,
}
586
587impl FloatTransform {
588    fn apply(&self, s: Float) -> Float {
589        match self.op {
590            Op::Add => s.add(self.num),
591            Op::Sub => s.sub(self.num),
592            Op::Mul => s.mul(self.num),
593            // returns inf when div by 0
594            Op::Div => s.div(self.num),
595            // returns NaN when rem by 0
596            Op::Mod => s.rem(self.num),
597            // parser makes sure those operators cannot be used
598            Op::And | Op::Xor | Op::Or => {
599                debug_panic!("unsupported operation");
600                s
601            }
602        }
603    }
604}
605
/// The value a test compares against.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum TestValue<T> {
    /// A concrete value to compare with.
    Value(T),
    /// No concrete value.
    // NOTE(review): presumably the `x` wildcard of magic rules, matching any
    // value — confirm against the parser.
    Any,
}
611
612impl<T> TestValue<T> {
613    #[inline(always)]
614    fn as_ref(&self) -> TestValue<&T> {
615        match self {
616            Self::Value(v) => TestValue::Value(v),
617            Self::Any => TestValue::Any,
618        }
619    }
620}
621
// Modifiers of a regex test, stored as a bit set (one bit per variant, in
// declaration order).
// NOTE(review): only ForceBin / ForceText are consumed in this section; the
// meaning of the other flags follows their names and must be confirmed
// against the parser.
flags! {
    enum ReMod: u8{
        // case insensitive matching
        CaseInsensitive,
        // presumably: move the offset to the start of the match
        StartOffsetUpdate,
        // presumably: the test length is expressed in lines
        LineLimit,
        // force the test to be handled as a binary test
        ForceBin,
        // force the test to be handled as a text test
        ForceText,
        // presumably: trim the matched value
        TrimMatch,
    }
}
632
633fn serialize_regex<S>(re: &bytes::Regex, serializer: S) -> Result<S::Ok, S::Error>
634where
635    S: serde::Serializer,
636{
637    re.as_str().serialize(serializer)
638}
639
640fn deserialize_regex<'de, D>(deserializer: D) -> Result<bytes::Regex, D::Error>
641where
642    D: serde::Deserializer<'de>,
643{
644    let wrapper = String::deserialize(deserializer)?;
645    bytes::Regex::new(&wrapper).map_err(serde::de::Error::custom)
646}
647
/// A test matching a regular expression against the data.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct RegexTest {
    /// The compiled regex; (de)serialized through its pattern string.
    #[serde(
        serialize_with = "serialize_regex",
        deserialize_with = "deserialize_regex"
    )]
    re: bytes::Regex,
    /// Optional length of the test; for text streams it bounds the number
    /// of lines scanned by `match_buf`.
    length: Option<usize>,
    /// Regex modifiers.
    mods: FlagSet<ReMod>,
    /// String modifiers.
    str_mods: FlagSet<StringMod>,
    // NOTE(review): not used in this section — presumably the length of the
    // non-magic (literal) part of the pattern; confirm against the parser.
    non_magic_len: usize,
    /// Whether the test is to be handled as a binary test.
    binary: bool,
    /// Comparison operator; `Neq` makes the absence of a match a success.
    cmp_op: CmpOp,
}
662
663impl RegexTest {
664    #[inline(always)]
665    fn is_binary(&self) -> bool {
666        self.binary
667            || self.mods.contains(ReMod::ForceBin)
668            || self.str_mods.contains(StringMod::ForceBin)
669    }
670
671    #[inline(always)]
672    fn is_text(&self) -> bool {
673        self.mods.contains(ReMod::ForceText) || self.str_mods.contains(StringMod::ForceText)
674    }
675
676    fn match_buf<'buf>(
677        &self,
678        off_buf: u64, // absolute buffer offset in content
679        stream_kind: StreamKind,
680        buf: &'buf [u8],
681    ) -> Option<MatchRes<'buf>> {
682        let mr = match stream_kind {
683            StreamKind::Text(_) => {
684                let mut off_txt = off_buf;
685
686                let mut line_limit = self.length.unwrap_or(usize::MAX);
687
688                for line in buf.split(|c| c == &b'\n') {
689                    // we don't need to break on offset
690                    // limit as buf contains the good amount
691                    // of bytes to match against
692                    if line_limit == 0 {
693                        break;
694                    }
695
696                    if let Some(re_match) = self.re.find(line) {
697                        // the offset of the string is computed from the start of the buffer
698                        let start_offset = off_txt + re_match.start() as u64;
699
700                        // if we matched until EOL we need to add one to include the delimiter removed from the split
701                        let stop_offset = if re_match.end() == line.len() {
702                            Some(start_offset + re_match.as_bytes().len() as u64 + 1)
703                        } else {
704                            None
705                        };
706
707                        return Some(MatchRes::Bytes(
708                            start_offset,
709                            stop_offset,
710                            re_match.as_bytes(),
711                            Encoding::Utf8,
712                        ));
713                    }
714
715                    off_txt += line.len() as u64;
716                    // we have to add one because lines do not contain splitting character
717                    off_txt += 1;
718                    line_limit = line_limit.saturating_sub(1)
719                }
720                None
721            }
722
723            StreamKind::Binary => {
724                self.re.find(buf).map(|re_match| {
725                    MatchRes::Bytes(
726                        // the offset of the string is computed from the start of the buffer
727                        off_buf + re_match.start() as u64,
728                        None,
729                        re_match.as_bytes(),
730                        Encoding::Utf8,
731                    )
732                })
733            }
734        };
735
736        // handle the case where we want the regex not to match
737        if self.cmp_op.is_neq() && mr.is_none() {
738            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
739        }
740
741        mr
742    }
743}
744
745impl From<RegexTest> for Test {
746    fn from(value: RegexTest) -> Self {
747        Self::Regex(value)
748    }
749}
750
// Modifiers of string-based tests, stored as a bit set (one bit per variant,
// in declaration order).
flags! {
    enum StringMod: u8{
        // force the test to be handled as a binary test
        ForceBin,
        // upper case characters in the magic match both lower and upper case
        // characters in the target
        UpperInsensitive,
        // lower case characters in the magic match both lower and upper case
        // characters in the target
        LowerInsensitive,
        // the match must be delimited by whitespace
        FullWordMatch,
        // NOTE(review): not consumed in this section — presumably trims the
        // matched value; confirm
        Trim,
        // force the test to be handled as a text test
        ForceText,
        // a run of blanks in the magic needs at least as many blanks in the
        // target
        CompactWhitespace,
        // blanks are optional, in the magic as well as in the target
        OptBlank,
    }
}
763
/// A test comparing a byte string with the data.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StringTest {
    /// The bytes to compare with, or `Any`.
    test_val: TestValue<Vec<u8>>,
    /// Comparison operator.
    cmp_op: CmpOp,
    // NOTE(review): not consumed in this section — presumably an explicit
    // length for the test; confirm against the parser.
    length: Option<usize>,
    /// String modifiers.
    mods: FlagSet<StringMod>,
    /// Whether the test is to be handled as a binary test.
    binary: bool,
}
772
773impl From<StringTest> for Test {
774    fn from(value: StringTest) -> Self {
775        Self::String(value)
776    }
777}
778
/// Checks whether `buf` starts with `str`, honoring the string modifiers `mods`.
///
/// Returns a pair `(matched, consumed)` where `consumed` is the number of
/// bytes of `buf` covered by the comparison. With modifiers, `consumed` may
/// differ from `str.len()` since blanks can be skipped or compacted. On
/// failure, `consumed` is the number of bytes consumed before the mismatch
/// (always `0` on the plain comparison path).
#[inline(always)]
fn string_match(str: &[u8], mods: FlagSet<StringMod>, buf: &[u8]) -> (bool, usize) {
    let mut consumed = 0;
    // we can do a simple string comparison
    if mods.is_disjoint(
        StringMod::UpperInsensitive
            | StringMod::LowerInsensitive
            | StringMod::FullWordMatch
            | StringMod::CompactWhitespace
            | StringMod::OptBlank,
    ) {
        // we check if target starts with the test string
        if buf.starts_with(str) {
            (true, str.len())
        } else {
            (false, consumed)
        }
    } else {
        // index of the next byte of `str` to compare
        let mut i_src = 0;
        let mut iter = buf.iter().peekable();

        // advances in the target, counting the byte as consumed
        macro_rules! consume_target {
            () => {{
                if iter.next().is_some() {
                    consumed += 1;
                }
            }};
        }

        // both bytes matched: advance in target and test, then loop again
        macro_rules! continue_next_iteration {
            () => {{
                consume_target!();
                i_src += 1;
                continue;
            }};
        }

        while let Some(&&b) = iter.peek() {
            // the whole test string has been compared
            let Some(&ref_byte) = str.get(i_src) else {
                break;
            };

            if mods.contains(StringMod::OptBlank) && (b == b' ' || ref_byte == b' ') {
                if b == b' ' {
                    // we ignore whitespace in target
                    consume_target!();
                }

                if ref_byte == b' ' {
                    // we ignore whitespace in test
                    i_src += 1;
                }

                continue;
            }

            if mods.contains(StringMod::UpperInsensitive) {
                //upper case characters in the magic match both lower and upper case characters in the target
                if ref_byte.is_ascii_uppercase() && ref_byte == b.to_ascii_uppercase()
                    || ref_byte == b
                {
                    continue_next_iteration!()
                }
            }

            // lower case characters in the magic match both lower and upper case characters in the target
            if mods.contains(StringMod::LowerInsensitive)
                && (ref_byte.is_ascii_lowercase() && ref_byte == b.to_ascii_lowercase()
                    || ref_byte == b)
            {
                continue_next_iteration!()
            }

            if mods.contains(StringMod::CompactWhitespace) && ref_byte == b' ' {
                // count (and skip) the run of blanks in the test
                let mut src_blk = 0;
                while let Some(b' ') = str.get(i_src) {
                    src_blk += 1;
                    i_src += 1;
                }

                // count (and consume) the run of blanks in the target
                let mut tgt_blk = 0;
                while let Some(b' ') = iter.peek() {
                    tgt_blk += 1;
                    consume_target!();
                }

                // the target must hold at least as many blanks as the test
                if src_blk > tgt_blk {
                    return (false, consumed);
                }

                continue;
            }

            if ref_byte == b {
                continue_next_iteration!()
            } else {
                return (false, consumed);
            }
        }

        // a full word match must be followed by a whitespace or the end of the target
        if mods.contains(StringMod::FullWordMatch)
            && let Some(b) = iter.peek()
            && !b.is_ascii_whitespace()
        {
            return (false, consumed);
        }

        // it is a match only if something got consumed and the test string
        // has been entirely compared
        (
            consumed > 0 && str.get(i_src).is_none() && consumed <= buf.len(),
            consumed,
        )
    }
}
891
892impl StringTest {
893    fn has_length_mod(&self) -> bool {
894        !self.mods.is_disjoint(
895            StringMod::UpperInsensitive
896                | StringMod::LowerInsensitive
897                | StringMod::FullWordMatch
898                | StringMod::CompactWhitespace
899                | StringMod::OptBlank,
900        )
901    }
902
903    #[inline(always)]
904    fn test_value_len(&self) -> usize {
905        match self.test_val.as_ref() {
906            TestValue::Value(s) => s.len(),
907            TestValue::Any => 0,
908        }
909    }
910
911    #[inline(always)]
912    fn is_binary(&self) -> bool {
913        self.binary || self.mods.contains(StringMod::ForceBin)
914    }
915
916    #[inline(always)]
917    fn is_text(&self) -> bool {
918        self.mods.contains(StringMod::ForceText)
919    }
920}
921
/// A test searching a byte string in the data.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SearchTest {
    /// The bytes to search for.
    str: Vec<u8>,
    /// Optional bound on the search: candidate positions (relative to the
    /// start of the scanned buffer) beyond this value are not tried.
    n_pos: Option<usize>,
    /// String modifiers, applied when comparing at a candidate position.
    str_mods: FlagSet<StringMod>,
    /// Regex modifiers; only `ForceBin` / `ForceText` are consulted in this
    /// section.
    re_mods: FlagSet<ReMod>,
    /// Whether the test is to be handled as a binary test.
    binary: bool,
    /// Comparison operator; `Neq` makes the absence of a match a success.
    cmp_op: CmpOp,
}
931
932impl From<SearchTest> for Test {
933    fn from(value: SearchTest) -> Self {
934        Self::Search(value)
935    }
936}
937
938impl SearchTest {
939    #[inline(always)]
940    fn is_binary(&self) -> bool {
941        (self.binary
942            || self.str_mods.contains(StringMod::ForceBin)
943            || self.re_mods.contains(ReMod::ForceBin))
944            && !(self.str_mods.contains(StringMod::ForceText)
945                || self.re_mods.contains(ReMod::ForceText))
946    }
947
948    // off_buf: absolute buffer offset in content
949    #[inline]
950    fn match_buf<'buf>(&self, off_buf: u64, buf: &'buf [u8]) -> Option<MatchRes<'buf>> {
951        let mut i = 0;
952
953        let needle = self.str.first()?;
954
955        while i < buf.len() {
956            // we cannot match if the first character isn't the same
957            // so we accelerate the search by finding potential matches
958            let Some(k) = memchr(*needle, &buf[i..]) else {
959                break;
960            };
961
962            i += k;
963
964            // if we want a full word match
965            if self.str_mods.contains(StringMod::FullWordMatch) {
966                let prev_is_whitespace = buf
967                    .get(i.saturating_sub(1))
968                    .map(|c| c.is_ascii_whitespace())
969                    .unwrap_or_default();
970
971                // if it is not the first character
972                // and its previous character isn't
973                // a whitespace. It cannot be a
974                // fullword match
975                if i > 0 && !prev_is_whitespace {
976                    i += 1;
977                    continue;
978                }
979            }
980
981            if let Some(npos) = self.n_pos
982                && i > npos
983            {
984                break;
985            }
986
987            let pos = i;
988            let (ok, consumed) = string_match(&self.str, self.str_mods, &buf[i..]);
989
990            if ok {
991                return Some(MatchRes::Bytes(
992                    off_buf.saturating_add(pos as u64),
993                    None,
994                    &buf[i..i + consumed],
995                    Encoding::Utf8,
996                ));
997            } else {
998                i += max(consumed, 1)
999            }
1000        }
1001
1002        // handles the case where we want the string not to be found
1003        if self.cmp_op.is_neq() {
1004            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
1005        }
1006
1007        None
1008    }
1009}
1010
// A test comparing a scalar value read from the haystack
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTest {
    // type of the value to read
    ty: ScalarDataType,
    // optional transformation applied to the value read before comparing it
    transform: Option<ScalarTransform>,
    // comparison operator
    cmp_op: CmpOp,
    // value to compare with, or any value
    test_val: TestValue<Scalar>,
}
1018
// A test comparing a floating point value read from the haystack
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTest {
    // type of the value to read
    ty: FloatDataType,
    // optional transformation applied to the value read before comparing it
    transform: Option<FloatTransform>,
    // comparison operator, only Eq, Lt, Gt and Neq are handled when matching
    cmp_op: CmpOp,
    // value to compare with, or any value
    test_val: TestValue<Float>,
}
1026
// the value read from the haystack we want to match against
// 'buf is the lifetime of the buffer we are scanning
//
// the first field of every variant is the position in the
// stream at which the value has been read
#[derive(Debug, PartialEq)]
enum ReadValue<'buf> {
    // a floating point value
    Float(u64, Float),
    // a scalar value
    Scalar(u64, Scalar),
    // raw bytes, used by every string like test (string, pstring,
    // string16, search and regex)
    Bytes(u64, &'buf [u8]),
}
1035
1036impl DynDisplay for ReadValue<'_> {
1037    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1038        match self {
1039            Self::Float(_, s) => DynDisplay::dyn_fmt(s, f),
1040            Self::Scalar(_, s) => DynDisplay::dyn_fmt(s, f),
1041            Self::Bytes(_, b) => Ok(format!("{b:?}")),
1042        }
1043    }
1044}
1045
1046impl DynDisplay for &ReadValue<'_> {
1047    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1048        // Dereference self to get the TestValue and call its fmt method
1049        DynDisplay::dyn_fmt(*self, f)
1050    }
1051}
1052
1053impl Display for ReadValue<'_> {
1054    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1055        match self {
1056            Self::Float(_, v) => write!(f, "{v}"),
1057            Self::Scalar(_, s) => write!(f, "{s}"),
1058            Self::Bytes(_, b) => write!(f, "{b:?}"),
1059        }
1060    }
1061}
1062
// Encoding of the bytes carried by a match, it tells how
// to decode them when the match gets formatted
enum Encoding {
    // UTF-16 with the given byte order
    Utf16(String16Encoding),
    // UTF-8, invalid sequences are decoded lossily
    Utf8,
}
1067
// Carry the offset of the start of the data in the stream
// and the data itself
enum MatchRes<'buf> {
    // Bytes.0: offset of the match
    // Bytes.1: optional end of match (to address the need of EOL adjustment in string regex)
    // Bytes.2: the bytes matching
    // Bytes.3: encoding of the buffer
    Bytes(u64, Option<u64>, &'buf [u8], Encoding),
    // Scalar.0: offset of the match
    // Scalar.1: the value matching (after transformation if there is one)
    Scalar(u64, Scalar),
    // Float.0: offset of the match
    // Float.1: the value matching (after transformation if there is one)
    Float(u64, Float),
}
1079
1080impl DynDisplay for &MatchRes<'_> {
1081    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1082        (*self).dyn_fmt(f)
1083    }
1084}
1085
1086impl DynDisplay for MatchRes<'_> {
1087    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1088        match self {
1089            Self::Scalar(_, v) => v.dyn_fmt(f),
1090            Self::Float(_, v) => v.dyn_fmt(f),
1091            Self::Bytes(_, _, v, enc) => match enc {
1092                Encoding::Utf8 => String::from_utf8_lossy(v).to_string().dyn_fmt(f),
1093                Encoding::Utf16(enc) => {
1094                    let utf16: Vec<u16> = slice_to_utf16_iter(v, *enc).collect();
1095                    String::from_utf16_lossy(&utf16).dyn_fmt(f)
1096                }
1097            },
1098        }
1099    }
1100}
1101
1102impl MatchRes<'_> {
1103    // start offset of the match
1104    #[inline]
1105    fn start_offset(&self) -> u64 {
1106        match self {
1107            MatchRes::Bytes(o, _, _, _) => *o,
1108            MatchRes::Scalar(o, _) => *o,
1109            MatchRes::Float(o, _) => *o,
1110        }
1111    }
1112
1113    // start offset of the match
1114    #[inline]
1115    fn end_offset(&self) -> u64 {
1116        match self {
1117            MatchRes::Bytes(start, end, buf, _) => match end {
1118                Some(end) => *end,
1119                None => start.saturating_add(buf.len() as u64),
1120            },
1121            MatchRes::Scalar(o, sc) => o.add(sc.size_of() as u64),
1122            MatchRes::Float(o, f) => o.add(f.size_of() as u64),
1123        }
1124    }
1125}
1126
1127fn slice_to_utf16_iter(read: &[u8], encoding: String16Encoding) -> impl Iterator<Item = u16> {
1128    let even = read
1129        .iter()
1130        .enumerate()
1131        .filter(|(i, _)| i % 2 == 0)
1132        .map(|t| t.1);
1133
1134    let odd = read
1135        .iter()
1136        .enumerate()
1137        .filter(|(i, _)| i % 2 != 0)
1138        .map(|t| t.1);
1139
1140    even.zip(odd).map(move |(e, o)| match encoding {
1141        String16Encoding::Le => u16::from_le_bytes([*e, *o]),
1142        String16Encoding::Be => u16::from_be_bytes([*e, *o]),
1143    })
1144}
1145
// Byte order of the 16 bits code units of a string16
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum String16Encoding {
    // little endian
    Le,
    // big endian
    Be,
}
1151
// A test on a string made of 16 bits code units
#[derive(Debug, Clone, Serialize, Deserialize)]
struct String16Test {
    // NOTE(review): presumably the test string as written in the rule,
    // its bytes are what a successful match on a specific value carries
    orig: String,
    // the code units to compare with, or any value
    test_val: TestValue<Vec<u16>>,
    // byte order used to decode the code units read from the haystack
    encoding: String16Encoding,
}
1158
1159impl String16Test {
1160    /// if the test value is a specific value this method returns
1161    /// the number of utf16 characters. To obtain the length in
1162    /// bytes the return value needs to be multiplied by two.
1163    #[inline(always)]
1164    fn test_value_len(&self) -> usize {
1165        match self.test_val.as_ref() {
1166            TestValue::Value(str16) => str16.len(),
1167            TestValue::Any => 0,
1168        }
1169    }
1170}
1171
// modifiers accepted by the indirect test
flags! {
    enum IndirectMod: u8{
        Relative,
    }
}

// a set of indirect test modifiers
type IndirectMods = FlagSet<IndirectMod>;
1179
// Width and byte order of the length prefix of a pstring,
// the trailing letter is the matching rule modifier
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum PStringLen {
    Byte,    // B
    ShortBe, // H
    ShortLe, // h
    LongBe,  // L
    LongLe,  // l
}
1188
1189impl PStringLen {
1190    #[inline(always)]
1191    const fn size_of_len(&self) -> usize {
1192        match self {
1193            PStringLen::Byte => 1,
1194            PStringLen::ShortBe => 2,
1195            PStringLen::ShortLe => 2,
1196            PStringLen::LongBe => 4,
1197            PStringLen::LongLe => 4,
1198        }
1199    }
1200}
1201
// A test on a string prefixed by its length
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PStringTest {
    // width and byte order of the length prefix
    len: PStringLen,
    // the bytes to compare with, or any value
    test_val: TestValue<Vec<u8>>,
    // when set, the length read from the prefix also counts the
    // prefix itself, so its size is subtracted to get the string length
    include_len: bool,
}
1208
1209impl PStringTest {
1210    #[inline]
1211    fn read<'cache, R: Read + Seek>(
1212        &self,
1213        haystack: &'cache mut LazyCache<R>,
1214    ) -> Result<Option<&'cache [u8]>, Error> {
1215        let mut len = match self.len {
1216            PStringLen::Byte => read_le!(haystack, u8) as u32,
1217            PStringLen::ShortBe => read_be!(haystack, u16) as u32,
1218            PStringLen::ShortLe => read_le!(haystack, u16) as u32,
1219            PStringLen::LongBe => read_be!(haystack, u32),
1220            PStringLen::LongLe => read_le!(haystack, u32),
1221        } as usize;
1222
1223        if self.include_len {
1224            len = len.saturating_sub(self.len.size_of_len())
1225        }
1226
1227        if let TestValue::Value(s) = self.test_val.as_ref()
1228            && len != s.len()
1229        {
1230            return Ok(None);
1231        }
1232
1233        let read = haystack.read_exact_count(len as u64)?;
1234
1235        Ok(Some(read))
1236    }
1237
1238    #[inline(always)]
1239    fn test_value_len(&self) -> usize {
1240        match self.test_val.as_ref() {
1241            TestValue::Value(s) => s.len(),
1242            TestValue::Any => 0,
1243        }
1244    }
1245}
1246
// The different kinds of test a rule entry can hold
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Test {
    // no value is read for this test
    // NOTE(review): presumably declares a named rule, to confirm
    Name(String),
    // recursive test
    // NOTE(review): presumably runs the rule with the given name, the
    // meaning of the flag is not visible from here, to confirm
    Use(bool, String),
    // comparison of a scalar value
    Scalar(ScalarTest),
    // comparison of a floating point value
    Float(FloatTest),
    // comparison of a string
    String(StringTest),
    // search of a string in a buffer
    Search(SearchTest),
    // comparison of a length prefixed string
    PString(PStringTest),
    // regular expression match
    Regex(RegexTest),
    // recursive test
    Indirect(FlagSet<IndirectMod>),
    // comparison of a string made of 16 bits code units
    String16(String16Test),
    // FIXME: placeholder for strength computation
    #[allow(dead_code)]
    Der,
    // no value is read for this test
    Clear,
    // no value is read for this test
    Default,
}
1265
1266impl Test {
1267    // read the value to test from the haystack
1268    #[inline]
1269    fn read_test_value<'haystack, R: Read + Seek>(
1270        &self,
1271        haystack: &'haystack mut LazyCache<R>,
1272        switch_endianness: bool,
1273    ) -> Result<Option<ReadValue<'haystack>>, Error> {
1274        let test_value_offset = haystack.lazy_stream_position();
1275
1276        match self {
1277            Self::Scalar(t) => {
1278                t.ty.read(haystack, switch_endianness)
1279                    .map(|s| Some(ReadValue::Scalar(test_value_offset, s)))
1280            }
1281
1282            Self::Float(t) => {
1283                t.ty.read(haystack, switch_endianness)
1284                    .map(|f| Some(ReadValue::Float(test_value_offset, f)))
1285            }
1286            Self::String(t) => {
1287                match t.test_val.as_ref() {
1288                    TestValue::Value(str) => {
1289                        let buf = if let Some(length) = t.length {
1290                            // if there is a length specified
1291                            haystack.read_exact_count(length as u64)?
1292                        } else {
1293                            // no length specified we read until end of string
1294
1295                            match t.cmp_op {
1296                                CmpOp::Eq | CmpOp::Neq => {
1297                                    if !t.has_length_mod() {
1298                                        haystack.read_exact_count(str.len() as u64)?
1299                                    } else {
1300                                        haystack.read_count(FILE_BYTES_MAX as u64)?
1301                                    }
1302                                }
1303                                CmpOp::Lt | CmpOp::Gt => {
1304                                    let read =
1305                                        haystack.read_until_any_delim_or_limit(b"\n\0", 8092)?;
1306
1307                                    if read.ends_with(b"\0") || read.ends_with(b"\n") {
1308                                        &read[..read.len() - 1]
1309                                    } else {
1310                                        read
1311                                    }
1312                                }
1313                                _ => {
1314                                    return Err(Error::Msg(format!(
1315                                        "string test does not support {:?} operator",
1316                                        t.cmp_op
1317                                    )));
1318                                }
1319                            }
1320                        };
1321
1322                        Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1323                    }
1324                    TestValue::Any => {
1325                        let read = haystack.read_until_any_delim_or_limit(b"\0\n", 8192)?;
1326                        // we don't take last byte if it matches end of string
1327                        let bytes = if read.ends_with(b"\0") || read.ends_with(b"\n") {
1328                            &read[..read.len() - 1]
1329                        } else {
1330                            read
1331                        };
1332
1333                        Ok(Some(ReadValue::Bytes(test_value_offset, bytes)))
1334                    }
1335                }
1336            }
1337
1338            Self::String16(t) => {
1339                match t.test_val.as_ref() {
1340                    TestValue::Value(str16) => {
1341                        let read = haystack.read_exact_count((str16.len() * 2) as u64)?;
1342
1343                        Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1344                    }
1345                    TestValue::Any => {
1346                        let read = haystack.read_until_utf16_or_limit(b"\x00\x00", 8192)?;
1347
1348                        // we make sure we have an even number of elements
1349                        let end = if read.len() % 2 == 0 {
1350                            read.len()
1351                        } else {
1352                            // we decide to read anyway even though
1353                            // length isn't even
1354                            read.len().saturating_sub(1)
1355                        };
1356
1357                        Ok(Some(ReadValue::Bytes(test_value_offset, &read[..end])))
1358                    }
1359                }
1360            }
1361
1362            Self::PString(t) => {
1363                let Some(read) = t.read(haystack)? else {
1364                    return Ok(None);
1365                };
1366                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1367            }
1368
1369            Self::Search(_) => {
1370                let buf = haystack.read_count(FILE_BYTES_MAX as u64)?;
1371                Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1372            }
1373
1374            Self::Regex(r) => {
1375                let length = {
1376                    match r.length {
1377                        Some(len) => {
1378                            if r.mods.contains(ReMod::LineLimit) {
1379                                len * 80
1380                            } else {
1381                                len
1382                            }
1383                        }
1384
1385                        None => FILE_REGEX_MAX,
1386                    }
1387                };
1388
1389                let read = haystack.read_count(length as u64)?;
1390                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1391            }
1392
1393            Self::Name(_)
1394            | Self::Use(_, _)
1395            | Self::Indirect(_)
1396            | Self::Clear
1397            | Self::Default
1398            | Self::Der => Err(Error::msg("no value to read for this test")),
1399        }
1400    }
1401
1402    #[inline(always)]
1403    fn match_value<'s>(
1404        &'s self,
1405        tv: &ReadValue<'s>,
1406        stream_kind: StreamKind,
1407    ) -> Option<MatchRes<'s>> {
1408        match (self, tv) {
1409            (Self::Scalar(t), ReadValue::Scalar(o, ts)) => {
1410                let read_value: Scalar = match t.transform.as_ref() {
1411                    Some(t) => t.apply(*ts)?,
1412                    None => *ts,
1413                };
1414
1415                match t.test_val {
1416                    TestValue::Value(test_value) => {
1417                        let ok = match t.cmp_op {
1418                            // NOTE: this should not happen in practice because
1419                            // we convert it into Eq equivalent at parsing time
1420                            CmpOp::Not => read_value == !test_value,
1421                            CmpOp::Eq => read_value == test_value,
1422                            CmpOp::Lt => read_value < test_value,
1423                            CmpOp::Gt => read_value > test_value,
1424                            CmpOp::Neq => read_value != test_value,
1425                            CmpOp::BitAnd => read_value & test_value == test_value,
1426                            CmpOp::Xor => (read_value & test_value).is_zero(),
1427                        };
1428
1429                        if ok {
1430                            Some(MatchRes::Scalar(*o, read_value))
1431                        } else {
1432                            None
1433                        }
1434                    }
1435
1436                    TestValue::Any => Some(MatchRes::Scalar(*o, read_value)),
1437                }
1438            }
1439
1440            (Self::Float(t), ReadValue::Float(o, f)) => {
1441                let read_value: Float = t.transform.as_ref().map(|t| t.apply(*f)).unwrap_or(*f);
1442
1443                match t.test_val {
1444                    TestValue::Value(tf) => {
1445                        let ok = match t.cmp_op {
1446                            CmpOp::Eq => read_value == tf,
1447                            CmpOp::Lt => read_value < tf,
1448                            CmpOp::Gt => read_value > tf,
1449                            CmpOp::Neq => read_value != tf,
1450                            _ => {
1451                                // this should never be reached as we validate
1452                                // operator in parser
1453                                debug_panic!("unsupported float comparison");
1454                                debug!("unsupported float comparison");
1455                                false
1456                            }
1457                        };
1458
1459                        if ok {
1460                            Some(MatchRes::Float(*o, read_value))
1461                        } else {
1462                            None
1463                        }
1464                    }
1465                    TestValue::Any => Some(MatchRes::Float(*o, read_value)),
1466                }
1467            }
1468
1469            (Self::String(st), ReadValue::Bytes(o, buf)) => {
1470                macro_rules! trim_buf {
1471                    ($buf: expr) => {{
1472                        if st.mods.contains(StringMod::Trim) {
1473                            $buf.trim_ascii()
1474                        } else {
1475                            $buf
1476                        }
1477                    }};
1478                }
1479
1480                match st.test_val.as_ref() {
1481                    TestValue::Value(str) => {
1482                        match st.cmp_op {
1483                            CmpOp::Eq => {
1484                                if let (true, _) = string_match(str, st.mods, buf) {
1485                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1486                                } else {
1487                                    None
1488                                }
1489                            }
1490                            CmpOp::Neq => {
1491                                if let (false, _) = string_match(str, st.mods, buf) {
1492                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1493                                } else {
1494                                    None
1495                                }
1496                            }
1497                            CmpOp::Gt => {
1498                                if buf.len() > str.len() {
1499                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1500                                } else {
1501                                    None
1502                                }
1503                            }
1504                            CmpOp::Lt => {
1505                                if buf.len() < str.len() {
1506                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1507                                } else {
1508                                    None
1509                                }
1510                            }
1511
1512                            // unsupported for strings
1513                            _ => {
1514                                // this should never be reached as we validate
1515                                // operator in parser
1516                                debug_panic!("unsupported string comparison");
1517                                debug!("unsupported string comparison");
1518                                None
1519                            }
1520                        }
1521                    }
1522                    TestValue::Any => {
1523                        Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1524                    }
1525                }
1526            }
1527
1528            (Self::PString(m), ReadValue::Bytes(o, buf)) => match m.test_val.as_ref() {
1529                TestValue::Value(psv) => {
1530                    if buf == psv {
1531                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8))
1532                    } else {
1533                        None
1534                    }
1535                }
1536                TestValue::Any => Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8)),
1537            },
1538
1539            (Self::String16(t), ReadValue::Bytes(o, buf)) => {
1540                match t.test_val.as_ref() {
1541                    TestValue::Value(str16) => {
1542                        // strings cannot be equal
1543                        if str16.len() * 2 != buf.len() {
1544                            return None;
1545                        }
1546
1547                        // we check string equality
1548                        for (i, utf16_char) in slice_to_utf16_iter(buf, t.encoding).enumerate() {
1549                            if str16[i] != utf16_char {
1550                                return None;
1551                            }
1552                        }
1553
1554                        Some(MatchRes::Bytes(
1555                            *o,
1556                            None,
1557                            t.orig.as_bytes(),
1558                            Encoding::Utf16(t.encoding),
1559                        ))
1560                    }
1561
1562                    TestValue::Any => {
1563                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf16(t.encoding)))
1564                    }
1565                }
1566            }
1567
1568            (Self::Regex(r), ReadValue::Bytes(o, buf)) => r.match_buf(*o, stream_kind, buf),
1569
1570            (Self::Search(t), ReadValue::Bytes(o, buf)) => t.match_buf(*o, buf),
1571
1572            _ => None,
1573        }
1574    }
1575
1576    #[inline(always)]
1577    fn strength(&self) -> u64 {
1578        const MULT: usize = 10;
1579
1580        let mut out = 2 * MULT;
1581
1582        // FIXME: octal is missing but it is not used in practice ...
1583        match self {
1584            Test::Scalar(s) => {
1585                out += s.ty.type_size() * MULT;
1586            }
1587
1588            Test::Float(t) => {
1589                out += t.ty.type_size() * MULT;
1590            }
1591
1592            Test::String(t) => out += t.test_value_len().saturating_mul(MULT),
1593
1594            Test::PString(t) => out += t.test_value_len().saturating_mul(MULT),
1595
1596            Test::Search(s) => {
1597                // NOTE: this implementation deviates from what is in
1598                // C libmagic. The purpose of this implementation is to
1599                // minimize the difference between similar tests,
1600                // implemented differently (ex: string test VS very localized search test).
1601                let n_pos = s.n_pos.unwrap_or(FILE_BYTES_MAX);
1602
1603                match n_pos {
1604                    // a search on one line should be equivalent to a string match
1605                    0..=80 => out += s.str.len().saturating_mul(MULT),
1606                    // search on the first 3 lines gets a little penalty
1607                    81..=240 => out += s.str.len() * s.str.len().clamp(0, MULT - 2),
1608                    // a search on more than 3 lines isn't considered very accurate
1609                    _ => out += s.str.len(),
1610                }
1611            }
1612
1613            Test::Regex(r) => {
1614                // NOTE: this implementation deviates from what is in
1615                // C libmagic. The purpose of this implementation is to
1616                // minimize the difference between similar tests,
1617                // implemented differently (ex: string test VS very localized regex test).
1618
1619                // we divide length by the number of capture group
1620                // which gives us a value close to he average string
1621                // length match in the regex.
1622                let v = r.non_magic_len / r.re.captures_len();
1623
1624                let len = r
1625                    .length
1626                    .map(|l| {
1627                        if r.mods.contains(ReMod::LineLimit) {
1628                            l * 80
1629                        } else {
1630                            l
1631                        }
1632                    })
1633                    .unwrap_or(FILE_BYTES_MAX);
1634
1635                match len {
1636                    // a search on one line should be equivalent to a string match
1637                    0..=80 => out += v.saturating_mul(MULT),
1638                    // search on the first 3 lines gets a little penalty
1639                    81..=240 => out += v * v.clamp(0, MULT - 2),
1640                    // a search on more than 3 lines isn't considered very accurate
1641                    _ => out += v,
1642                }
1643            }
1644
1645            Test::String16(t) => {
1646                // NOTE: in libmagic the result is div by 2
1647                // but I GUESS it is because the len is expressed
1648                // in number bytes. In our case length is expressed
1649                // in number of u16 so we shouldn't divide.
1650                out += t.test_value_len().saturating_mul(MULT);
1651            }
1652
1653            Test::Der => out += MULT,
1654
1655            Test::Default | Test::Name(_) | Test::Use(_, _) | Test::Indirect(_) | Test::Clear => {
1656                return 0;
1657            }
1658        }
1659
1660        // matching any output gets penalty
1661        if self.is_match_any() {
1662            return 0;
1663        }
1664
1665        if let Some(op) = self.cmp_op() {
1666            match op {
1667                // matching almost any gets penalty
1668                CmpOp::Neq => out = 0,
1669                CmpOp::Eq | CmpOp::Not => out += MULT,
1670                CmpOp::Lt | CmpOp::Gt => out -= 2 * MULT,
1671                CmpOp::Xor | CmpOp::BitAnd => out -= MULT,
1672            }
1673        }
1674
1675        out as u64
1676    }
1677
1678    #[inline(always)]
1679    fn cmp_op(&self) -> Option<CmpOp> {
1680        match self {
1681            Self::String(t) => Some(t.cmp_op),
1682            Self::Scalar(s) => Some(s.cmp_op),
1683            Self::Float(t) => Some(t.cmp_op),
1684            Self::Name(_)
1685            | Self::Use(_, _)
1686            | Self::Search(_)
1687            | Self::PString(_)
1688            | Self::Regex(_)
1689            | Self::Clear
1690            | Self::Default
1691            | Self::Indirect(_)
1692            | Self::String16(_)
1693            | Self::Der => None,
1694        }
1695    }
1696
1697    #[inline(always)]
1698    fn is_recursive(&self) -> bool {
1699        matches!(self, Test::Use(_, _) | Test::Indirect(_))
1700    }
1701
1702    #[inline(always)]
1703    fn is_match_any(&self) -> bool {
1704        match self {
1705            Test::Name(_) => false,
1706            Test::Use(_, _) => false,
1707            Test::Scalar(scalar_test) => matches!(scalar_test.test_val, TestValue::Any),
1708            Test::Float(float_test) => matches!(float_test.test_val, TestValue::Any),
1709            Test::String(string_test) => matches!(string_test.test_val, TestValue::Any),
1710            Test::Search(_) => false,
1711            Test::PString(pstring_test) => matches!(pstring_test.test_val, TestValue::Any),
1712            Test::Regex(_) => false,
1713            Test::Indirect(_) => false,
1714            Test::String16(string16_test) => matches!(string16_test.test_val, TestValue::Any),
1715            Test::Der => false,
1716            Test::Clear => false,
1717            Test::Default => false,
1718        }
1719    }
1720
1721    #[inline(always)]
1722    fn is_binary(&self) -> bool {
1723        match self {
1724            Self::Name(_) => true,
1725            Self::Use(_, _) => true,
1726            Self::Scalar(_) => true,
1727            Self::Float(_) => true,
1728            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1729            Self::Search(t) => t.is_binary(),
1730            Self::PString(_) => true,
1731            Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1732            Self::Clear => true,
1733            Self::Default => true,
1734            Self::Indirect(_) => true,
1735            Self::String16(_) => true,
1736            Self::Der => true,
1737        }
1738    }
1739
1740    #[inline(always)]
1741    fn is_text(&self) -> bool {
1742        match self {
1743            Self::Name(_) => true,
1744            Self::Use(_, _) => true,
1745            Self::Indirect(_) => true,
1746            Self::Clear => true,
1747            Self::Default => true,
1748            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1749            Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1750            _ => !self.is_binary(),
1751        }
1752    }
1753
1754    #[inline(always)]
1755    fn is_only_text(&self) -> bool {
1756        self.is_text() && !self.is_binary()
1757    }
1758
1759    #[inline(always)]
1760    fn is_only_binary(&self) -> bool {
1761        self.is_binary() && !self.is_text()
1762    }
1763}
1764
// Type of the value to read from the haystack in
// order to compute an indirect offset
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum OffsetType {
    // 8 bits integer
    Byte,
    // 64 bits float, little endian
    DoubleLe,
    // 64 bits float, big endian
    DoubleBe,
    // 16 bits integer, little endian
    ShortLe,
    // 16 bits integer, big endian
    ShortBe,
    // 32 bits little endian value passed through `decode_id3`
    Id3Le,
    // 32 bits big endian value passed through `decode_id3`
    Id3Be,
    // 32 bits integer, little endian
    LongLe,
    // 32 bits integer, big endian
    LongBe,
    // value read with `read_me!`
    // NOTE(review): presumably a 32 bits middle endian integer, to confirm
    Middle,
    // value read with `read_octal_u64`
    Octal,
    // NOTE(review): presumably a 64 bits integer, big endian, to confirm
    QuadBe,
    // NOTE(review): presumably a 64 bits integer, little endian, to confirm
    QuadLe,
}
1781
// Operand of the operation applied to an indirect offset
// NOTE(review): presumably Direct holds the operand itself while
// Indirect holds the location to read it from, to confirm
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Shift {
    Direct(u64),
    Indirect(i64),
}
1787
// An indirect offset: an offset whose value has
// to be read from the haystack
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
struct IndOffset {
    // where to find the offset
    off_addr: DirOffset,
    // signed or unsigned
    signed: bool,
    // type of the offset
    ty: OffsetType,
    // NOTE(review): presumably the operation to apply to the value
    // read, with `shift` as operand, to confirm
    op: Option<Op>,
    shift: Option<Shift>,
}
1799
1800impl IndOffset {
1801    // if we overflow we must not return an offset
1802    fn read_offset<R: Read + Seek>(
1803        &self,
1804        haystack: &mut LazyCache<R>,
1805        rule_base_offset: Option<u64>,
1806        last_upper_match_offset: Option<u64>,
1807    ) -> Result<Option<u64>, io::Error> {
1808        let offset_address = match self.off_addr {
1809            DirOffset::Start(s) => {
1810                let Some(o) = s.checked_add(rule_base_offset.unwrap_or_default()) else {
1811                    return Ok(None);
1812                };
1813
1814                haystack.seek(SeekFrom::Start(o))?
1815            }
1816            DirOffset::LastUpper(c) => haystack.seek(SeekFrom::Start(
1817                (last_upper_match_offset.unwrap_or_default() as i64 + c) as u64,
1818            ))?,
1819            DirOffset::End(e) => haystack.seek(SeekFrom::End(e))?,
1820        };
1821
1822        macro_rules! read_value {
1823            () => {
1824                match self.ty {
1825                    OffsetType::Byte => {
1826                        if self.signed {
1827                            read_le!(haystack, u8) as u64
1828                        } else {
1829                            read_le!(haystack, i8) as u64
1830                        }
1831                    }
1832                    OffsetType::DoubleLe => read_le!(haystack, f64) as u64,
1833                    OffsetType::DoubleBe => read_be!(haystack, f64) as u64,
1834                    OffsetType::ShortLe => {
1835                        if self.signed {
1836                            read_le!(haystack, i16) as u64
1837                        } else {
1838                            read_le!(haystack, u16) as u64
1839                        }
1840                    }
1841                    OffsetType::ShortBe => {
1842                        if self.signed {
1843                            read_be!(haystack, i16) as u64
1844                        } else {
1845                            read_be!(haystack, u16) as u64
1846                        }
1847                    }
1848                    OffsetType::Id3Le => decode_id3(read_le!(haystack, u32)) as u64,
1849                    OffsetType::Id3Be => decode_id3(read_be!(haystack, u32)) as u64,
1850                    OffsetType::LongLe => {
1851                        if self.signed {
1852                            read_le!(haystack, i32) as u64
1853                        } else {
1854                            read_le!(haystack, u32) as u64
1855                        }
1856                    }
1857                    OffsetType::LongBe => {
1858                        if self.signed {
1859                            read_be!(haystack, i32) as u64
1860                        } else {
1861                            read_be!(haystack, u32) as u64
1862                        }
1863                    }
1864                    OffsetType::Middle => read_me!(haystack) as u64,
1865                    OffsetType::Octal => {
1866                        if let Some(o) = read_octal_u64(haystack) {
1867                            o
1868                        } else {
1869                            debug!("failed to read octal offset @ {offset_address}");
1870                            return Ok(None);
1871                        }
1872                    }
1873                    OffsetType::QuadLe => {
1874                        if self.signed {
1875                            read_le!(haystack, i64) as u64
1876                        } else {
1877                            read_le!(haystack, u64)
1878                        }
1879                    }
1880                    OffsetType::QuadBe => {
1881                        if self.signed {
1882                            read_be!(haystack, i64) as u64
1883                        } else {
1884                            read_be!(haystack, u64)
1885                        }
1886                    }
1887                }
1888            };
1889        }
1890
1891        // in theory every offset read should end up in something seekable from start, so we can use u64 to store the result
1892        let o = read_value!();
1893
1894        trace!(
1895            "offset read @ {offset_address} value={o} op={:?} shift={:?}",
1896            self.op, self.shift
1897        );
1898
1899        // apply transformation
1900        if let (Some(op), Some(shift)) = (self.op, self.shift) {
1901            let shift = match shift {
1902                Shift::Direct(i) => i,
1903                Shift::Indirect(i) => {
1904                    let tmp = offset_address as i128 + i as i128;
1905                    if tmp.is_negative() {
1906                        return Ok(None);
1907                    } else {
1908                        haystack.seek(SeekFrom::Start(tmp as u64))?;
1909                    };
1910                    // NOTE: here we assume that the shift has the same
1911                    // type as the main offset !
1912                    read_value!()
1913                }
1914            };
1915
1916            match op {
1917                Op::Add => return Ok(o.checked_add(shift)),
1918                Op::Mul => return Ok(o.checked_mul(shift)),
1919                Op::Sub => return Ok(o.checked_sub(shift)),
1920                Op::Div => return Ok(o.checked_div(shift)),
1921                Op::Mod => return Ok(o.checked_rem(shift)),
1922                Op::And => return Ok(Some(o & shift)),
1923                Op::Or => return Ok(Some(o | shift)),
1924                Op::Xor => return Ok(Some(o ^ shift)),
1925            }
1926        }
1927
1928        Ok(Some(o))
1929    }
1930}
1931
/// An offset which can be resolved without reading the stream content.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum DirOffset {
    // counted from the start; callers add the rule base offset to it
    Start(u64),
    // relative to the last up-level field
    LastUpper(i64),
    // relative to the end of the stream (handed over to `SeekFrom::End`)
    End(i64),
}
1939
/// Location at which a test has to be performed.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Offset {
    /// offset known without reading the stream
    Direct(DirOffset),
    /// offset whose value must be read from the stream
    Indirect(IndOffset),
}
1945
1946impl From<DirOffset> for Offset {
1947    fn from(value: DirOffset) -> Self {
1948        Self::Direct(value)
1949    }
1950}
1951
1952impl From<IndOffset> for Offset {
1953    fn from(value: IndOffset) -> Self {
1954        Self::Indirect(value)
1955    }
1956}
1957
1958impl Display for DirOffset {
1959    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1960        match self {
1961            DirOffset::Start(i) => write!(f, "{i}"),
1962            DirOffset::LastUpper(c) => write!(f, "&{c}"),
1963            DirOffset::End(e) => write!(f, "-{e}"),
1964        }
1965    }
1966}
1967
1968impl Default for DirOffset {
1969    fn default() -> Self {
1970        Self::LastUpper(0)
1971    }
1972}
1973
/// A single test of a magic rule: where to look, what to check and what
/// to report when the check succeeds.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Match {
    // line reported in log messages and localized errors
    line: usize,
    // depth of the entry; it is used as its continuation level
    depth: u8,
    // where the test has to be performed
    offset: Offset,
    // the test itself
    test: Test,
    // strength of the test (result of `Test::strength`)
    test_strength: u64,
    // message to report when the test matches
    message: Option<Message>,
}
1983
1984impl From<Use> for Match {
1985    fn from(value: Use) -> Self {
1986        let test = Test::Use(value.switch_endianness, value.rule_name);
1987        let test_strength = test.strength();
1988        Self {
1989            line: value.line,
1990            depth: value.depth,
1991            offset: value.start_offset,
1992            test,
1993            test_strength,
1994            message: value.message,
1995        }
1996    }
1997}
1998
1999impl From<Name> for Match {
2000    fn from(value: Name) -> Self {
2001        let test = Test::Name(value.name);
2002        let test_strength = test.strength();
2003        Self {
2004            line: value.line,
2005            depth: 0,
2006            offset: Offset::Direct(DirOffset::Start(0)),
2007            test,
2008            test_strength,
2009            message: value.message,
2010        }
2011    }
2012}
2013
2014impl Match {
2015    /// Turns the `Match`'s offset into an absolute offset from the start of the stream
2016    #[inline(always)]
2017    fn offset_from_start<R: Read + Seek>(
2018        &self,
2019        haystack: &mut LazyCache<R>,
2020        rule_base_offset: Option<u64>,
2021        last_level_offset: Option<u64>,
2022    ) -> Result<Option<u64>, io::Error> {
2023        match self.offset {
2024            Offset::Direct(dir_offset) => match dir_offset {
2025                DirOffset::Start(s) => Ok(Some(s)),
2026                DirOffset::LastUpper(shift) => {
2027                    let o = last_level_offset.unwrap_or_default() as i64 + shift;
2028
2029                    if o >= 0 { Ok(Some(o as u64)) } else { Ok(None) }
2030                }
2031                DirOffset::End(e) => Ok(Some(haystack.offset_from_start(SeekFrom::End(e)))),
2032            },
2033            Offset::Indirect(ind_offset) => {
2034                let Some(o) =
2035                    ind_offset.read_offset(haystack, rule_base_offset, last_level_offset)?
2036                else {
2037                    return Ok(None);
2038                };
2039
2040                Ok(Some(o))
2041            }
2042        }
2043    }
2044
    /// This method emulates the buffer based matching
    /// logic implemented in libmagic. It needs some awful
    /// and weird offset conversions to turn buffer
    /// relative offsets (libmagic is based on) into
    /// absolute offsets in the file.
    ///
    /// This method should bubble up only critical errors,
    /// all the other errors should make the match result
    /// false and be logged via debug!
    ///
    /// The function returns an error if the maximum recursion
    /// has been reached or if a dependency rule is missing.
    ///
    /// # Returns
    ///
    /// A tuple made of a flag telling whether the test matched and,
    /// for tests reading a value from the stream, the match result
    /// (used by the caller to format the message and adjust the
    /// stream position).
    #[inline]
    #[allow(clippy::too_many_arguments)]
    fn matches<'a: 'h, 'h, R: Read + Seek>(
        &'a self,
        source: Option<&str>,
        magic: &mut Magic<'a>,
        stream_kind: StreamKind,
        state: &mut MatchState,
        buf_base_offset: Option<u64>,
        rule_base_offset: Option<u64>,
        last_level_offset: Option<u64>,
        haystack: &'h mut LazyCache<R>,
        switch_endianness: bool,
        db: &'a MagicDb,
        depth: usize,
    ) -> Result<(bool, Option<MatchRes<'h>>), Error> {
        let source = source.unwrap_or("unknown");
        let line = self.line;

        // guard against rules calling each other endlessly
        if depth >= MAX_RECURSION {
            return Err(Error::localized(
                source,
                line,
                Error::MaximumRecursion(MAX_RECURSION),
            ));
        }

        // binary-only tests are not run against text streams
        if self.test.is_only_binary() && stream_kind.is_text() {
            trace!("skip binary test source={source} line={line} stream_kind={stream_kind:?}",);
            return Ok((false, None));
        }

        // text-only tests are not run against non-text streams
        if self.test.is_only_text() && !stream_kind.is_text() {
            trace!("skip text test source={source} line={line} stream_kind={stream_kind:?}",);
            return Ok((false, None));
        }

        // a failure to compute the offset is not critical: it is
        // logged and the test is considered not to match
        let Ok(Some(mut offset)) = self
            .offset_from_start(haystack, rule_base_offset, last_level_offset)
            .inspect_err(|e| debug!("source={source} line={line} failed at computing offset: {e}"))
        else {
            return Ok((false, None));
        };

        offset = match self.offset {
            Offset::Indirect(_) => {
                // the result we get for an indirect offset
                // is relative to the start of the libmagic
                // buffer so we need to add base to make it
                // absolute.
                buf_base_offset.unwrap_or_default().saturating_add(offset)
            }
            // offset from start are computed from rule base
            Offset::Direct(DirOffset::Start(_)) => {
                rule_base_offset.unwrap_or_default().saturating_add(offset)
            }
            _ => offset,
        };

        match &self.test {
            Test::Clear => {
                trace!("source={source} line={line} clear");
                state.clear_continuation_level(&self.continuation_level());
                Ok((true, None))
            }

            Test::Name(name) => {
                trace!(
                    "source={source} line={line} running rule {name} switch_endianness={switch_endianness}",
                );
                Ok((true, None))
            }

            Test::Use(flip_endianness, rule_name) => {
                trace!(
                    "source={source} line={line} use {rule_name} switch_endianness={flip_endianness}",
                );

                // switch_endianness must propagate down the rule call stack
                let switch_endianness = switch_endianness ^ flip_endianness;

                // a missing dependency is a critical error
                let dr: &DependencyRule = db.dependencies.get(rule_name).ok_or(
                    Error::localized(source, line, Error::MissingRule(rule_name.clone())),
                )?;

                // we push the message here otherwise we push message in depth first
                if let Some(msg) = self.message.as_ref() {
                    magic.push_message(msg.to_string_lossy());
                }

                // the dependency rule is run with the offset of
                // this test as its rule base offset
                let nmatch = dr.rule.magic(
                    magic,
                    stream_kind,
                    buf_base_offset,
                    Some(offset),
                    haystack,
                    db,
                    switch_endianness,
                    depth.saturating_add(1),
                )?;

                // The name is always true, so we consider there to be a match
                // if more than one test succeeded
                let matched = nmatch > 1;
                if matched {
                    state.set_continuation_level(self.continuation_level());
                }

                Ok((matched, None))
            }

            Test::Indirect(m) => {
                trace!(
                    "source={source} line={line} indirect mods={:?} offset={offset:#x}",
                    m
                );

                // with the relative modifier the offset of this test
                // becomes the base of the (emulated) buffer
                let new_buf_base_off = if m.contains(IndirectMod::Relative) {
                    Some(offset)
                } else {
                    None
                };

                // we push the message here otherwise we push message in depth first
                if let Some(msg) = self.message.as_ref() {
                    magic.push_message(msg.to_string_lossy());
                }

                // run every rule of the database at the new offset
                // and stop at the first one pushing a message
                let mut nmatch = 0u64;
                for r in db.rules.iter() {
                    let messages_cnt = magic.message.len();
                    nmatch = nmatch.saturating_add(r.magic(
                        magic,
                        stream_kind,
                        new_buf_base_off,
                        Some(offset),
                        haystack,
                        db,
                        false,
                        depth.saturating_add(1),
                    )?);

                    // this means we matched a rule
                    if magic.message.len() != messages_cnt {
                        break;
                    }
                }

                // NOTE(review): the message has already been pushed above and
                // no match result is returned; the caller is presumably in
                // charge of not pushing the message again — confirm
                Ok((nmatch > 0, None))
            }

            Test::Default => {
                // default matches if nothing else at the continuation level matched
                let ok = !state.get_continuation_level(&self.continuation_level());

                trace!("source={source} line={line} default match={ok}");
                if ok {
                    state.set_continuation_level(self.continuation_level());
                }

                Ok((ok, None))
            }

            // every other test reads a value from the stream at `offset`
            _ => {
                if let Err(e) = haystack.seek(SeekFrom::Start(offset)) {
                    debug!("source={source} line={line} failed to seek in haystack: {e}");
                    return Ok((false, None));
                }

                // parts of the log line; only built when logging is enabled
                let mut trace_msg = None;

                if enabled!(Level::DEBUG) {
                    trace_msg = Some(vec![format!(
                        "source={source} line={line} depth={} stream_offset={:#x}",
                        self.depth,
                        haystack.lazy_stream_position()
                    )])
                }

                // NOTE: we may have a way to optimize here. In case we do a Any
                // test and we don't use the value to format the message, we don't
                // need to read the value.
                if let Ok(opt_test_value) = self
                    .test
                    .read_test_value(haystack, switch_endianness)
                    .inspect_err(|e| {
                        debug!("source={source} line={line} error while reading test value @{offset}: {e}",)
                    })
                {
                    if let Some(v) = trace_msg
                        .as_mut() { v.push(format!("test={:?}", self.test)) }

                    let match_res =
                        opt_test_value.and_then(|tv| self.test.match_value(&tv, stream_kind));

                    if let Some(v) = trace_msg.as_mut() { v.push(format!(
                            "message=\"{}\" match={}",
                            self.message
                                .as_ref()
                                .map(|fs| fs.to_string_lossy())
                                .unwrap_or_default(),
                            match_res.is_some()
                        )) }

                    // trace message: at debug level only successful matches
                    // are logged, at trace level every test is logged
                    if enabled!(Level::DEBUG) && !enabled!(Level::TRACE) && match_res.is_some() {
                        if let Some(m) = trace_msg{
                            debug!("{}", m.join(" "));
                        }
                    } else if enabled!(Level::TRACE)
                        && let Some(m) = trace_msg{
                            trace!("{}", m.join(" "));
                        }

                    if let Some(mr) = match_res {
                        state.set_continuation_level(self.continuation_level());
                        return Ok((true, Some(mr)));
                    }
                }

                Ok((false, None))
            }
        }
    }
2282
2283    #[inline(always)]
2284    fn continuation_level(&self) -> ContinuationLevel {
2285        ContinuationLevel(self.depth)
2286    }
2287}
2288
/// A `use` entry; it is converted into a `Match` carrying a `Test::Use` test.
#[derive(Debug, Clone)]
struct Use {
    // line of the entry
    line: usize,
    // depth of the entry
    depth: u8,
    // offset of the resulting `Match`
    start_offset: Offset,
    // name of the rule to run
    rule_name: String,
    // whether endianness must be flipped while running the rule
    switch_endianness: bool,
    // message of the resulting `Match`
    message: Option<Message>,
}
2298
/// Modifier applied to the strength of an entry.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StrengthMod {
    // operation to apply to the strength
    op: Op,
    // right-hand operand of the operation
    by: u8,
}
2304
2305impl StrengthMod {
2306    #[inline(always)]
2307    fn apply(&self, strength: u64) -> u64 {
2308        let by = self.by as u64;
2309        debug!("applying strength modifier: {strength} {} {}", self.op, by);
2310        match self.op {
2311            Op::Mul => strength.saturating_mul(by),
2312            Op::Add => strength.saturating_add(by),
2313            Op::Sub => strength.saturating_sub(by),
2314            Op::Div => {
2315                if by > 0 {
2316                    strength.saturating_div(by)
2317                } else {
2318                    strength
2319                }
2320            }
2321            Op::Mod => strength % by,
2322            Op::And => strength & by,
2323            // this should never happen as strength operators
2324            // are enforced by our parser
2325            Op::Xor | Op::Or => {
2326                debug_panic!("unsupported strength operator");
2327                strength
2328            }
2329        }
2330    }
2331}
2332
/// Metadata which can be attached to an entry.
#[derive(Debug, Clone)]
enum Flag {
    /// MIME type
    Mime(String),
    /// set of file extensions
    Ext(HashSet<String>),
    /// strength modifier
    Strength(StrengthMod),
    /// presumably the Apple creator/type code — TODO confirm against the parser
    Apple(String),
}
2340
/// A `name` entry; it is converted into a `Match` carrying a `Test::Name` test.
#[derive(Debug, Clone)]
struct Name {
    // line of the entry
    line: usize,
    // name given to the rule
    name: String,
    // message of the resulting `Match`
    message: Option<Message>,
}
2347
/// An entry of a magic file along with a `Span` (presumably the
/// location it was parsed from — TODO confirm against the parser).
#[derive(Debug, Clone)]
enum Entry<'span> {
    /// a test
    Match(Span<'span>, Match),
    /// a piece of metadata
    Flag(Span<'span>, Flag),
}
2353
/// Node of the tree of tests making a rule: a test, the metadata
/// to report when it matches and the tests to run afterwards.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EntryNode {
    // whether the node is the root of the tree
    root: bool,
    // the test of this node
    entry: Match,
    // nodes evaluated only if `entry` matches
    children: Vec<EntryNode>,
    // MIME type set on the result when `entry` matches
    mimetype: Option<String>,
    // creator code set on the result when `entry` matches
    apple: Option<String>,
    // modifier applied to the strength of `entry`
    strength_mod: Option<StrengthMod>,
    // extensions added to the result when `entry` matches
    exts: HashSet<String>,
}
2364
/// Accumulator filled while walking the entries of a rule.
#[derive(Debug, Default)]
struct EntryNodeVisitor {
    // every extension met so far
    exts: HashSet<String>,
    // score accumulated so far
    score: u64,
}

impl EntryNodeVisitor {
    /// Creates an empty visitor: no extension and a score of zero.
    fn new() -> Self {
        Self::default()
    }

    /// Folds `other` into `self`: extensions are unioned and scores summed.
    fn merge(&mut self, other: Self) {
        let Self { exts, score } = other;
        self.exts.extend(exts);
        self.score += score;
    }
}
2383
2384impl EntryNode {
2385    #[inline]
2386    fn update_visitor(&self, v: &mut EntryNodeVisitor, depth: usize) {
2387        // update extensions
2388        for ext in self.exts.iter() {
2389            if !v.exts.contains(ext) {
2390                v.exts.insert(ext.clone());
2391            }
2392        }
2393
2394        // update score if depth
2395        if depth == 0 {
2396            v.score += self.entry.test_strength;
2397        }
2398
2399        // Tests at deeper levels contribute less to the overall score.
2400        // We use the minimum value to establish a lower bound for the rule's score,
2401        // which helps prioritize rules based on their importance.
2402        v.score += self
2403            .children
2404            .iter()
2405            .map(|e| e.entry.test_strength)
2406            .min()
2407            .unwrap_or_default()
2408            / max(1, depth as u64);
2409    }
2410
2411    fn visit(
2412        &self,
2413        v: &mut EntryNodeVisitor,
2414        deps: &HashMap<String, DependencyRule>,
2415        marked: &mut HashSet<String>,
2416        depth: usize,
2417    ) -> Result<(), Error> {
2418        // updating visitor
2419        self.update_visitor(v, depth);
2420
2421        // recursively visiting
2422        for c in self.children.iter() {
2423            if let Test::Use(_, ref name) = c.entry.test {
2424                if marked.contains(name) {
2425                    continue;
2426                }
2427
2428                marked.insert(name.clone());
2429
2430                if let Some(r) = deps.get(name) {
2431                    let dv = r.rule.visit_all_entries(deps, marked)?;
2432                    v.merge(dv);
2433                } else {
2434                    return Err(Error::MissingRule(name.clone()));
2435                }
2436            } else {
2437                c.visit(v, deps, marked, depth + 1)?;
2438            }
2439        }
2440
2441        Ok(())
2442    }
2443
    /// Runs the test of this node against `haystack` and, if it matches,
    /// updates `magic` and recursively runs the children of the node.
    ///
    /// On a match the message (if any) is pushed into `magic`, the stream
    /// position is adjusted for string/search/regex tests, the MIME type,
    /// creator code and extensions of the node are reported and the strength
    /// of `magic` is updated.
    ///
    /// # Returns
    ///
    /// The number of entries which matched, this node and its
    /// descendants included. It is zero if this node did not match.
    ///
    /// # Errors
    ///
    /// Errors returned by `Match::matches` and seek failures are propagated.
    #[inline]
    #[allow(clippy::too_many_arguments)]
    fn matches<'r, R: Read + Seek>(
        &'r self,
        opt_source: Option<&str>,
        magic: &mut Magic<'r>,
        state: &mut MatchState,
        stream_kind: StreamKind,
        buf_base_offset: Option<u64>,
        rule_base_offset: Option<u64>,
        last_level_offset: Option<u64>,
        haystack: &mut LazyCache<R>,
        db: &'r MagicDb,
        switch_endianness: bool,
        depth: usize,
    ) -> Result<u64, Error> {
        let mut nmatch = 0u64;

        let (ok, opt_match_res) = self.entry.matches(
            opt_source,
            magic,
            stream_kind,
            state,
            buf_base_offset,
            rule_base_offset,
            last_level_offset,
            haystack,
            switch_endianness,
            db,
            depth,
        )?;

        let source = opt_source.unwrap_or("unknown");
        let line = self.entry.line;

        if ok {
            nmatch = nmatch.saturating_add(1);

            // Update the magic with the message if the match is successful
            // Skip updating if the test is recursive, as it's already handled
            // in the Match::matches function
            if !self.entry.test.is_recursive()
                && let Some(msg) = self.entry.message.as_ref()
                && let Ok(msg) = msg.format_with(opt_match_res.as_ref()).inspect_err(|e| {
                    debug!("source={source} line={line} failed to format message: {e}")
                })
            {
                magic.push_message(msg);
            }

            // we need to adjust stream offset in case of regex/search tests
            if let Some(mr) = opt_match_res {
                match &self.entry.test {
                    Test::String(t) => {
                        if t.has_length_mod() {
                            let o = mr.end_offset();
                            haystack.seek(SeekFrom::Start(o))?;
                        }
                    }
                    Test::Search(t) => {
                        if t.re_mods.contains(ReMod::StartOffsetUpdate) {
                            let o = mr.start_offset();
                            haystack.seek(SeekFrom::Start(o))?;
                        } else {
                            let o = mr.end_offset();
                            haystack.seek(SeekFrom::Start(o))?;
                        }
                    }

                    Test::Regex(t) => {
                        if t.mods.contains(ReMod::StartOffsetUpdate) {
                            let o = mr.start_offset();
                            haystack.seek(SeekFrom::Start(o))?;
                        } else {
                            let o = mr.end_offset();
                            haystack.seek(SeekFrom::Start(o))?;
                        }
                    }
                    // other types do not need offset adjustment
                    _ => {}
                }
            }

            if let Some(mimetype) = self.mimetype.as_ref() {
                magic.set_mime_type(Cow::Borrowed(mimetype));
            }

            if let Some(apple_ty) = self.apple.as_ref() {
                magic.set_creator_code(Cow::Borrowed(apple_ty));
            }

            if !self.exts.is_empty() {
                magic.insert_extensions(self.exts.iter().map(|s| s.as_str()));
            }

            // NOTE: here we try to implement a similar logic as in file_magic_strength.
            // Sticking to the exact same strength computation logic is complicated due
            // to implementation differences. Let's wait and see if that is a real issue.
            let mut strength = self.entry.test_strength;

            // entries without message at a continuation level below 3
            // get their continuation level added to their strength
            let continuation_level = self.entry.continuation_level().0 as u64;
            if self.entry.message.is_none() && continuation_level < 3 {
                strength = strength.saturating_add(continuation_level);
            }

            if let Some(sm) = self.strength_mod.as_ref() {
                strength = sm.apply(strength);
            }

            // entries with no message get a bonus
            if self.entry.message.is_none() {
                strength += 1
            }

            magic.update_strength(strength);

            // stream position once this entry is processed; it is handed
            // over to the children as their last level offset
            let end_upper_level = haystack.lazy_stream_position();

            // we have to fix rule_base_offset if
            // the rule_base_starts from end otherwise it
            // breaks some offset computation in match
            // see test_offset_bug_1 and test_offset_bug_2
            // they implement the same test logic yet indirect
            // offsets have to be different so that it works
            // in libmagic/file
            let rule_base_offset = if self.root {
                match self.entry.offset {
                    Offset::Direct(DirOffset::End(o)) => {
                        Some(haystack.offset_from_start(SeekFrom::End(o)))
                    }
                    _ => rule_base_offset,
                }
            } else {
                rule_base_offset
            };

            // children are run at the same recursion depth
            for e in self.children.iter() {
                nmatch = nmatch.saturating_add(e.matches(
                    opt_source,
                    magic,
                    state,
                    stream_kind,
                    buf_base_offset,
                    rule_base_offset,
                    Some(end_upper_level),
                    haystack,
                    db,
                    switch_endianness,
                    depth,
                )?);
            }
        }

        Ok(nmatch)
    }
2599}
2600
/// Represents a parsed magic rule
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicRule {
    // identifier of the rule
    id: usize,
    // where the rule comes from, if known
    source: Option<String>,
    // root of the tree of tests making the rule
    entries: EntryNode,
    // extensions associated to the rule, completed at finalization
    extensions: HashSet<String>,
    /// score used for rule ranking
    score: u64,
    // whether extensions and score have been computed
    finalized: bool,
}
2612
2613impl MagicRule {
2614    #[inline(always)]
2615    fn set_id(&mut self, id: usize) {
2616        self.id = id
2617    }
2618
2619    fn visit_all_entries(
2620        &self,
2621        deps: &HashMap<String, DependencyRule>,
2622        marked: &mut HashSet<String>,
2623    ) -> Result<EntryNodeVisitor, Error> {
2624        let mut v = EntryNodeVisitor::new();
2625        self.entries.visit(&mut v, deps, marked, 0)?;
2626        Ok(v)
2627    }
2628
2629    /// Finalize a rule by searching for all extensions and computing its score
2630    /// for ranking. In the `MagicRule` is already finalized it returns immediately.
2631    fn try_finalize(&mut self, deps: &HashMap<String, DependencyRule>) -> Result<(), Error> {
2632        if self.finalized {
2633            return Ok(());
2634        }
2635
2636        // rule can be finalized all deps are found
2637        let v = self.visit_all_entries(deps, &mut HashSet::new())?;
2638
2639        self.extensions.extend(v.exts);
2640        self.score = v.score;
2641        self.finalized = true;
2642
2643        Ok(())
2644    }
2645
2646    #[inline]
2647    fn magic_entrypoint<'r, R: Read + Seek>(
2648        &'r self,
2649        magic: &mut Magic<'r>,
2650        stream_kind: StreamKind,
2651        haystack: &mut LazyCache<R>,
2652        db: &'r MagicDb,
2653        switch_endianness: bool,
2654        depth: usize,
2655    ) -> Result<u64, Error> {
2656        self.entries.matches(
2657            self.source.as_deref(),
2658            magic,
2659            &mut MatchState::empty(),
2660            stream_kind,
2661            None,
2662            None,
2663            None,
2664            haystack,
2665            db,
2666            switch_endianness,
2667            depth,
2668        )
2669    }
2670
2671    #[inline]
2672    #[allow(clippy::too_many_arguments)]
2673    fn magic<'r, R: Read + Seek>(
2674        &'r self,
2675        magic: &mut Magic<'r>,
2676        stream_kind: StreamKind,
2677        buf_base_offset: Option<u64>,
2678        rule_base_offset: Option<u64>,
2679        haystack: &mut LazyCache<R>,
2680        db: &'r MagicDb,
2681        switch_endianness: bool,
2682        depth: usize,
2683    ) -> Result<u64, Error> {
2684        self.entries.matches(
2685            self.source.as_deref(),
2686            magic,
2687            &mut MatchState::empty(),
2688            stream_kind,
2689            buf_base_offset,
2690            rule_base_offset,
2691            None,
2692            haystack,
2693            db,
2694            switch_endianness,
2695            depth,
2696        )
2697    }
2698
2699    /// Checks if the rule is for matching against text content
2700    ///
2701    /// # Returns
2702    ///
2703    /// * `bool` - True if the rule is for text files
2704    pub fn is_text(&self) -> bool {
2705        self.entries.entry.test.is_text()
2706            && self.entries.children.iter().all(|e| e.entry.test.is_text())
2707    }
2708
    /// Gets the rule's score, used to rank rules relative to one another
    ///
    /// The score is assigned from the result of visiting all the rule's
    /// entries when the rule is finalized.
    ///
    /// # Returns
    ///
    /// * `u64` - The rule's score
    #[inline(always)]
    pub fn score(&self) -> u64 {
        self.score
    }
2718
    /// Gets the name of the source (filename) the rule comes from, if any
    ///
    /// # Returns
    ///
    /// * `Option<&str>` - The rule's source, or `None` when the rule
    ///   carries no source information
    #[inline(always)]
    pub fn source(&self) -> Option<&str> {
        self.source.as_deref()
    }
2728
    /// Gets the line number at which the rule is defined
    ///
    /// This is the line recorded on the rule's root entry.
    ///
    /// # Returns
    ///
    /// * `usize` - The rule's line number
    #[inline(always)]
    pub fn line(&self) -> usize {
        self.entries.entry.line
    }
2738
    /// Gets all the file extensions associated with the rule
    ///
    /// The set is extended with the extensions collected while visiting
    /// the rule's entries when the rule is finalized.
    ///
    /// # Returns
    ///
    /// * `&HashSet<String>` - The set of all associated extensions
    #[inline(always)]
    pub fn extensions(&self) -> &HashSet<String> {
        &self.extensions
    }
2748}
2749
/// A [`MagicRule`] stored together with a name.
///
/// Values of this type are kept in the `dependencies` maps of
/// [`MagicSource`] and [`MagicDb`].
// NOTE(review): presumably these are the named rules other rules can refer
// to — confirm against the parser.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct DependencyRule {
    /// Name attached to the rule
    name: String,
    /// The rule itself
    rule: MagicRule,
}
2755
/// A parsed source of magic rules
///
/// A source holds the rules it defines and a map of named dependency
/// rules. Both are moved into a [`MagicDb`] by [`MagicDb::load`] and
/// [`MagicDb::load_bulk`].
///
/// # Methods
///
/// * `open` - Opens a magic file from a path
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicSource {
    // rules defined by this source
    rules: Vec<MagicRule>,
    // named rules defined by this source, indexed by a `String` key
    // NOTE(review): the key is presumably the dependency name — confirm
    dependencies: HashMap<String, DependencyRule>,
}
2766
impl MagicSource {
    /// Opens and parses a magic file from a path
    ///
    /// The work is delegated to `FileMagicParser::parse_file`.
    ///
    /// # Arguments
    ///
    /// * `p` - The path to the magic file
    ///
    /// # Returns
    ///
    /// * `Result<Self, Error>` - The parsed magic file or the error
    ///   reported by the parser
    pub fn open<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
        FileMagicParser::parse_file(p)
    }
}
2781
/// Newtype around a `u8` continuation level.
///
/// The wrapped value is used as an index into the per-level flags kept
/// by [`MatchState`].
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct ContinuationLevel(u8);
2784
// FIXME: magic handles many more text encodings
/// Text encodings a stream can be classified as.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum TextEncoding {
    /// Reported as `"ASCII"`
    Ascii,
    /// Reported as `"UTF-8"`
    Utf8,
    /// Reported as `"Unknown"`
    Unknown,
}

impl TextEncoding {
    /// Name of the encoding as it appears in magic messages.
    const fn as_magic_str(&self) -> &'static str {
        match *self {
            Self::Ascii => "ASCII",
            Self::Utf8 => "UTF-8",
            Self::Unknown => "Unknown",
        }
    }
}
2802
2803#[derive(Debug, PartialEq, Eq, Clone, Copy)]
2804enum StreamKind {
2805    Binary,
2806    Text(TextEncoding),
2807}
2808
2809impl StreamKind {
2810    const fn is_text(&self) -> bool {
2811        matches!(self, StreamKind::Text(_))
2812    }
2813}
2814
2815#[derive(Debug)]
2816struct MatchState {
2817    continuation_levels: [bool; 256],
2818}
2819
2820impl MatchState {
2821    #[inline(always)]
2822    fn empty() -> Self {
2823        MatchState {
2824            continuation_levels: [false; 256],
2825        }
2826    }
2827
2828    #[inline(always)]
2829    fn get_continuation_level(&mut self, level: &ContinuationLevel) -> bool {
2830        self.continuation_levels
2831            .get(level.0 as usize)
2832            .cloned()
2833            .unwrap_or_default()
2834    }
2835
2836    #[inline(always)]
2837    fn set_continuation_level(&mut self, level: ContinuationLevel) {
2838        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2839            *b = true
2840        }
2841    }
2842
2843    #[inline(always)]
2844    fn clear_continuation_level(&mut self, level: &ContinuationLevel) {
2845        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2846            *b = false;
2847        }
2848    }
2849}
2850
/// Represents a file magic detection result
///
/// String data is held as [`Cow`] so a result can borrow from the rules
/// for the lifetime `'m`; use [`Magic::into_owned`] to detach it.
#[derive(Debug, Default)]
pub struct Magic<'m> {
    // kind of stream the result was produced for; used by `mime_type`
    // to pick a default when no explicit MIME type is set
    stream_kind: Option<StreamKind>,
    // name of the source the result originates from
    source: Option<Cow<'m, str>>,
    // message parts, joined by `message`
    message: Vec<Cow<'m, str>>,
    // explicit MIME type; only the first one set is kept
    mime_type: Option<Cow<'m, str>>,
    // Apple creator code; only the first one set is kept
    creator_code: Option<Cow<'m, str>>,
    // accumulated strength, grown with saturating additions
    strength: u64,
    // file extensions; only filled while the set is still empty
    exts: HashSet<Cow<'m, str>>,
    // true when the result was produced by the default fallback
    is_default: bool,
}
2863
2864impl<'m> Magic<'m> {
2865    #[inline(always)]
2866    fn set_source(&mut self, source: Option<&'m str>) {
2867        self.source = source.map(Cow::Borrowed);
2868    }
2869
2870    #[inline(always)]
2871    fn set_stream_kind(&mut self, stream_kind: StreamKind) {
2872        self.stream_kind = Some(stream_kind)
2873    }
2874
2875    #[inline(always)]
2876    fn reset(&mut self) {
2877        self.stream_kind = None;
2878        self.source = None;
2879        self.message.clear();
2880        self.mime_type = None;
2881        self.creator_code = None;
2882        self.strength = 0;
2883        self.exts.clear();
2884        self.is_default = false;
2885    }
2886
2887    /// Converts borrowed data into owned data. This method involves
2888    /// data cloning, so you must use this method only if you need to
2889    /// extend the lifetime of a [`Magic`] struct.
2890    ///
2891    /// # Returns
2892    ///
2893    /// * `Magic<'owned>` - A new [`Magic`] with owned data
2894    #[inline]
2895    pub fn into_owned<'owned>(self) -> Magic<'owned> {
2896        Magic {
2897            stream_kind: self.stream_kind,
2898            source: self.source.map(|s| Cow::Owned(s.into_owned())),
2899            message: self
2900                .message
2901                .into_iter()
2902                .map(Cow::into_owned)
2903                .map(Cow::Owned)
2904                .collect(),
2905            mime_type: self.mime_type.map(|m| Cow::Owned(m.into_owned())),
2906            creator_code: self.creator_code.map(|m| Cow::Owned(m.into_owned())),
2907            strength: self.strength,
2908            exts: self
2909                .exts
2910                .into_iter()
2911                .map(|e| Cow::Owned(e.into_owned()))
2912                .collect(),
2913            is_default: self.is_default,
2914        }
2915    }
2916
2917    /// Gets the formatted message describing the file type
2918    ///
2919    /// # Returns
2920    ///
2921    /// * `String` - The formatted message
2922    #[inline(always)]
2923    pub fn message(&self) -> String {
2924        let mut out = String::new();
2925        for (i, m) in self.message.iter().enumerate() {
2926            if let Some(s) = m.strip_prefix(r#"\b"#) {
2927                out.push_str(s);
2928            } else {
2929                // don't put space on first string
2930                if i > 0 {
2931                    out.push(' ');
2932                }
2933                out.push_str(m);
2934            }
2935        }
2936        out
2937    }
2938
2939    /// Returns an iterator over the individual parts of the magic message
2940    ///
2941    /// A magic message is typically composed of multiple parts, each appended
2942    /// during successful magic tests. This method provides an efficient way to
2943    /// iterate over these parts without concatenating them into a new string,
2944    /// as done when calling [`Magic::message`].
2945    ///
2946    /// # Returns
2947    ///
2948    /// * `impl Iterator<Item = &str>` - An iterator yielding string slices of each message part
2949    #[inline]
2950    pub fn message_parts(&self) -> impl Iterator<Item = &str> {
2951        self.message.iter().map(|p| p.as_ref())
2952    }
2953
2954    #[inline(always)]
2955    fn update_strength(&mut self, value: u64) {
2956        self.strength = self.strength.saturating_add(value);
2957        debug!("updated strength = {:?}", self.strength)
2958    }
2959
2960    /// Gets the detected MIME type
2961    ///
2962    /// # Returns
2963    ///
2964    /// * `&str` - The MIME type or default based on stream kind
2965    #[inline(always)]
2966    pub fn mime_type(&self) -> &str {
2967        self.mime_type.as_deref().unwrap_or(match self.stream_kind {
2968            Some(StreamKind::Text(_)) => DEFAULT_TEXT_MIMETYPE,
2969            Some(StreamKind::Binary) | None => DEFAULT_BIN_MIMETYPE,
2970        })
2971    }
2972
2973    #[inline(always)]
2974    fn push_message<'a: 'm>(&mut self, msg: Cow<'a, str>) {
2975        if !msg.is_empty() {
2976            debug!("pushing message: msg={msg} len={}", msg.len());
2977            self.message.push(msg);
2978        }
2979    }
2980
2981    #[inline(always)]
2982    fn set_mime_type<'a: 'm>(&mut self, mime: Cow<'a, str>) {
2983        if self.mime_type.is_none() {
2984            debug!("insert mime: {:?}", mime);
2985            self.mime_type = Some(mime)
2986        }
2987    }
2988
2989    #[inline(always)]
2990    fn set_creator_code<'a: 'm>(&mut self, apple_ty: Cow<'a, str>) {
2991        if self.creator_code.is_none() {
2992            debug!("insert apple type: {apple_ty:?}");
2993            self.creator_code = Some(apple_ty)
2994        }
2995    }
2996
2997    #[inline(always)]
2998    fn insert_extensions<'a: 'm, I: Iterator<Item = &'a str>>(&mut self, exts: I) {
2999        if self.exts.is_empty() {
3000            self.exts.extend(exts.filter_map(|e| {
3001                if e.is_empty() {
3002                    None
3003                } else {
3004                    Some(Cow::Borrowed(e))
3005                }
3006            }));
3007        }
3008    }
3009
3010    /// Gets the confidence score of the detection. This
3011    /// value is used to sort [`Magic`] in [`MagicDb::best_magic`]
3012    /// and [`MagicDb::all_magics`].
3013    ///
3014    /// # Returns
3015    ///
3016    /// * `u64` - The confidence score attributed to that [`Magic`]
3017    #[inline(always)]
3018    pub fn strength(&self) -> u64 {
3019        self.strength
3020    }
3021
3022    /// Gets the filename where the magic rule was defined
3023    ///
3024    /// # Returns
3025    ///
3026    /// * `Option<&str>` - The source if available
3027    #[inline(always)]
3028    pub fn source(&self) -> Option<&str> {
3029        self.source.as_deref()
3030    }
3031
3032    /// Gets the Apple creator code if available
3033    ///
3034    /// # Returns
3035    ///
3036    /// * `Option<&str>` - The creator code if available
3037    #[inline(always)]
3038    pub fn creator_code(&self) -> Option<&str> {
3039        self.creator_code.as_deref()
3040    }
3041
3042    /// Gets the possible file extensions for the detected [`Magic`]
3043    ///
3044    /// # Returns
3045    ///
3046    /// * `&HashSet<Cow<'m, str>>` - The set of possible extensions
3047    #[inline(always)]
3048    pub fn extensions(&self) -> &HashSet<Cow<'m, str>> {
3049        &self.exts
3050    }
3051
3052    /// Checks if this is a default fallback detection
3053    ///
3054    /// # Returns
3055    ///
3056    /// * `bool` - True if this is a default detection
3057    #[inline(always)]
3058    pub fn is_default(&self) -> bool {
3059        self.is_default
3060    }
3061}
3062
/// Represents a database of [`MagicRule`]
///
/// Rules and named dependency rules are added with [`MagicDb::load`] or
/// [`MagicDb::load_bulk`].
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct MagicDb {
    // next identifier handed out to a loaded rule
    rule_id: usize,
    // every loaded rule
    rules: Vec<MagicRule>,
    // named dependency rules merged from every loaded source
    dependencies: HashMap<String, DependencyRule>,
    // NOTE(review): presumably tracks how many rules are already
    // finalized — confirm against `try_finalize`
    finalized: usize,
}
3071
/// Returns `true` if the byte stream is likely text.
///
/// The heuristic works as follows:
/// * an empty stream is never text;
/// * a single NUL byte makes the stream binary;
/// * tab, line feed, carriage return and `0x20..=0x7E` count as
///   printable, every other byte (remaining control bytes, DEL and bytes
///   above `0x7F`) counts as non-printable;
/// * the stream is text when more than 85% of its bytes are printable
///   and less than 20% are non-printable.
#[inline(always)]
fn is_likely_text(bytes: &[u8]) -> bool {
    if bytes.is_empty() {
        return false;
    }

    let mut printable = 0usize;
    let mut non_printable = 0usize;

    for &byte in bytes {
        match byte {
            // NUL never appears in text
            0x00 => return false,
            // whitespace and printable ASCII
            0x09 | 0x0A | 0x0D | 0x20..=0x7E => printable += 1,
            _ => non_printable += 1,
        }
    }

    let total = bytes.len() as f64;
    let printable_ratio = printable as f64 / total;
    let non_printable_ratio = non_printable as f64 / total;

    // Heuristic thresholds (adjust as needed):
    printable_ratio > 0.85 && non_printable_ratio < 0.20
}
3114
3115#[inline(always)]
3116fn guess_stream_kind<S: AsRef<[u8]>>(stream: S) -> StreamKind {
3117    let buf = stream.as_ref();
3118
3119    match run_utf8_validation(buf) {
3120        Ok(is_ascii) => {
3121            if is_ascii {
3122                StreamKind::Text(TextEncoding::Ascii)
3123            } else {
3124                StreamKind::Text(TextEncoding::Utf8)
3125            }
3126        }
3127        Err(e) => {
3128            if is_likely_text(&buf[e.valid_up_to..]) {
3129                StreamKind::Text(TextEncoding::Unknown)
3130            } else {
3131                StreamKind::Binary
3132            }
3133        }
3134    }
3135}
3136
3137impl MagicDb {
3138    /// Prepares an [`LazyCache`] configured with optimal parameters for
3139    /// **read** operations done during file identification
3140    pub fn optimal_lazy_cache<R: Read + Seek>(f: R) -> Result<LazyCache<R>, io::Error> {
3141        Ok(LazyCache::<R>::from_read_seek(f)
3142            .and_then(|lc| lc.with_hot_cache(2 * FILE_BYTES_MAX))?)
3143        .map(|lc| lc.with_warm_cache(100 << 20))
3144    }
3145
    /// Creates a new empty database
    ///
    /// This is equivalent to [`MagicDb::default`].
    ///
    /// # Returns
    ///
    /// * [`MagicDb`] - A new empty database
    pub fn new() -> Self {
        Self::default()
    }
3154
3155    #[inline(always)]
3156    fn next_rule_id(&mut self) -> usize {
3157        let t = self.rule_id;
3158        self.rule_id += 1;
3159        t
3160    }
3161
3162    #[inline(always)]
3163    fn try_json<R: Read + Seek>(
3164        haystack: &mut LazyCache<R>,
3165        stream_kind: StreamKind,
3166        magic: &mut Magic,
3167    ) -> Result<bool, Error> {
3168        // cannot be json if content is binary
3169        if matches!(stream_kind, StreamKind::Binary) {
3170            return Ok(false);
3171        }
3172
3173        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?.trim_ascii();
3174
3175        let Some((start, end)) = find_json_boundaries(buf) else {
3176            return Ok(false);
3177        };
3178
3179        // if anything else than whitespace before start
3180        // this is not json
3181        for c in buf[0..start].iter() {
3182            if !c.is_ascii_whitespace() {
3183                return Ok(false);
3184            }
3185        }
3186
3187        let mut is_ndjson = false;
3188
3189        trace!("maybe a json document");
3190        let ok = serde_json::from_slice::<serde_json::Value>(&buf[start..=end]).is_ok();
3191        if !ok {
3192            return Ok(false);
3193        }
3194
3195        // we are sure it is json now we must look if we are ndjson
3196        if end + 1 < buf.len() {
3197            // after first json
3198            let buf = &buf[end + 1..];
3199            if let Some((second_start, second_end)) = find_json_boundaries(buf) {
3200                // there is a new line between the two json docs
3201                if memchr(b'\n', &buf[..second_start]).is_some() {
3202                    trace!("might be ndjson");
3203                    is_ndjson = serde_json::from_slice::<serde_json::Value>(
3204                        &buf[second_start..=second_end],
3205                    )
3206                    .is_ok();
3207                }
3208            }
3209        }
3210
3211        if is_ndjson {
3212            magic.push_message(Cow::Borrowed("New Line Delimited"));
3213            magic.set_mime_type(Cow::Borrowed("application/x-ndjson"));
3214            magic.insert_extensions(["ndjson", "jsonl"].into_iter());
3215        } else {
3216            magic.set_mime_type(Cow::Borrowed("application/json"));
3217            magic.insert_extensions(["json"].into_iter());
3218        }
3219
3220        magic.push_message(Cow::Borrowed("JSON text data"));
3221        magic.set_source(Some(HARDCODED_SOURCE));
3222        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3223        Ok(true)
3224    }
3225
3226    #[inline(always)]
3227    fn try_csv<R: Read + Seek>(
3228        haystack: &mut LazyCache<R>,
3229        stream_kind: StreamKind,
3230        magic: &mut Magic,
3231    ) -> Result<bool, Error> {
3232        // cannot be csv if content is binary
3233        let StreamKind::Text(enc) = stream_kind else {
3234            return Ok(false);
3235        };
3236
3237        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3238        let mut reader = csv::Reader::from_reader(io::Cursor::new(buf));
3239        let mut records = reader.records();
3240
3241        let Some(Ok(first)) = records.next() else {
3242            return Ok(false);
3243        };
3244
3245        // very not likely a CSV otherwise all programming
3246        // languages having ; line terminator would be
3247        // considered as CSV
3248        if first.len() <= 1 {
3249            return Ok(false);
3250        }
3251
3252        // we already parsed first line
3253        let mut n = 1;
3254        for i in records.take(9) {
3255            if let Ok(rec) = i {
3256                if first.len() != rec.len() {
3257                    return Ok(false);
3258                }
3259            } else {
3260                return Ok(false);
3261            }
3262            n += 1;
3263        }
3264
3265        // we need at least 10 lines
3266        if n != 10 {
3267            return Ok(false);
3268        }
3269
3270        magic.set_mime_type(Cow::Borrowed("text/csv"));
3271        magic.push_message(Cow::Borrowed("CSV"));
3272        magic.push_message(Cow::Borrowed(enc.as_magic_str()));
3273        magic.push_message(Cow::Borrowed("text"));
3274        magic.insert_extensions(["csv"].into_iter());
3275        magic.set_source(Some(HARDCODED_SOURCE));
3276        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3277        Ok(true)
3278    }
3279
3280    #[inline(always)]
3281    fn try_tar<R: Read + Seek>(
3282        haystack: &mut LazyCache<R>,
3283        stream_kind: StreamKind,
3284        magic: &mut Magic,
3285    ) -> Result<bool, Error> {
3286        // cannot be json if content is not binary
3287        if !matches!(stream_kind, StreamKind::Binary) {
3288            return Ok(false);
3289        }
3290
3291        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3292        let mut ar = Archive::new(io::Cursor::new(buf));
3293
3294        let Ok(mut entries) = ar.entries() else {
3295            return Ok(false);
3296        };
3297
3298        let Some(Ok(first)) = entries.next() else {
3299            return Ok(false);
3300        };
3301
3302        let header = first.header();
3303
3304        if header.as_ustar().is_some() {
3305            magic.push_message(Cow::Borrowed("POSIX tar archive"));
3306        } else if header.as_gnu().is_some() {
3307            magic.push_message(Cow::Borrowed("POSIX tar archive (GNU)"));
3308        } else {
3309            magic.push_message(Cow::Borrowed("tar archive"));
3310        }
3311
3312        magic.set_mime_type(Cow::Borrowed("application/x-tar"));
3313        magic.set_source(Some(HARDCODED_SOURCE));
3314        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3315        magic.insert_extensions(["tar"].into_iter());
3316        Ok(true)
3317    }
3318
3319    #[inline(always)]
3320    fn try_hard_magic<R: Read + Seek>(
3321        haystack: &mut LazyCache<R>,
3322        stream_kind: StreamKind,
3323        magic: &mut Magic,
3324    ) -> Result<bool, Error> {
3325        Ok(Self::try_json(haystack, stream_kind, magic)?
3326            || Self::try_csv(haystack, stream_kind, magic)?
3327            || Self::try_tar(haystack, stream_kind, magic)?)
3328    }
3329
3330    #[inline(always)]
3331    fn magic_default<'m, R: Read + Seek>(
3332        cache: &mut LazyCache<R>,
3333        stream_kind: StreamKind,
3334        magic: &mut Magic<'m>,
3335    ) {
3336        magic.set_source(Some(HARDCODED_SOURCE));
3337        magic.set_stream_kind(stream_kind);
3338        magic.is_default = true;
3339
3340        if cache.data_size() == 0 {
3341            magic.push_message(Cow::Borrowed("empty"));
3342            magic.set_mime_type(Cow::Borrowed(DEFAULT_BIN_MIMETYPE));
3343        }
3344
3345        match stream_kind {
3346            StreamKind::Binary => {
3347                magic.push_message(Cow::Borrowed("data"));
3348            }
3349            StreamKind::Text(e) => {
3350                magic.push_message(Cow::Borrowed(e.as_magic_str()));
3351                magic.push_message(Cow::Borrowed("text"));
3352            }
3353        }
3354    }
3355
3356    fn load_rules_no_prepare(&mut self, rules: Vec<MagicRule>) {
3357        for rule in rules.into_iter() {
3358            let mut rule = rule;
3359            rule.set_id(self.next_rule_id());
3360
3361            self.rules.push(rule);
3362        }
3363    }
3364
3365    /// Loads rules from a [`MagicSource`]
3366    ///
3367    /// # Arguments
3368    ///
3369    /// * `ms` - The [`MagicSource`] to load rules from
3370    pub fn load(&mut self, ms: MagicSource) -> &mut Self {
3371        self.load_rules_no_prepare(ms.rules);
3372        self.dependencies.extend(ms.dependencies);
3373        self.try_finalize();
3374        self
3375    }
3376
3377    /// Loads multiple [`MagicSource`] items efficiently in bulk.
3378    ///
3379    /// This is more efficient than loading each individually. After processing
3380    /// all sources, it applies finalization step only once.
3381    pub fn load_bulk<I: Iterator<Item = MagicSource>>(&mut self, it: I) -> &mut Self {
3382        for ms in it {
3383            self.load_rules_no_prepare(ms.rules);
3384            self.dependencies.extend(ms.dependencies);
3385        }
3386        self.try_finalize();
3387        self
3388    }
3389
3390    /// Gets all rules in the database
3391    ///
3392    /// # Returns
3393    ///
3394    /// * `&[MagicRule]` - A slice of all rules
3395    pub fn rules(&self) -> &[MagicRule] {
3396        &self.rules
3397    }
3398
3399    #[inline]
3400    fn first_magic_with_stream_kind<R: Read + Seek>(
3401        &self,
3402        haystack: &mut LazyCache<R>,
3403        stream_kind: StreamKind,
3404        extension: Option<&str>,
3405    ) -> Result<Magic<'_>, Error> {
3406        // re-using magic makes this function faster
3407        let mut magic = Magic::default();
3408
3409        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3410            return Ok(magic);
3411        }
3412
3413        let mut marked = vec![false; self.rules.len()];
3414
3415        macro_rules! do_magic {
3416            ($rule: expr) => {{
3417                $rule.magic_entrypoint(&mut magic, stream_kind, haystack, &self, false, 0)?;
3418
3419                if !magic.message.is_empty() {
3420                    magic.set_stream_kind(stream_kind);
3421                    magic.set_source($rule.source.as_deref());
3422                    return Ok(magic);
3423                }
3424
3425                magic.reset();
3426            }};
3427        }
3428
3429        if let Some(ext) = extension.map(|e| e.to_lowercase())
3430            && !ext.is_empty()
3431        {
3432            for rule in self.rules.iter().filter(|r| r.extensions.contains(&ext)) {
3433                do_magic!(rule);
3434                if let Some(f) = marked.get_mut(rule.id) {
3435                    *f = true
3436                }
3437            }
3438        }
3439
3440        for rule in self
3441            .rules
3442            .iter()
3443            // we don't run again rules run by extension
3444            .filter(|r| !*marked.get(r.id).unwrap_or(&false))
3445        {
3446            do_magic!(rule)
3447        }
3448
3449        Self::magic_default(haystack, stream_kind, &mut magic);
3450
3451        Ok(magic)
3452    }
3453
3454    /// Detects file [`Magic`] stopping at the first matching magic. Magic
3455    /// rules are evaluated from the best to the least relevant, so this method
3456    /// returns most of the time the best magic. For the rare cases where
3457    /// it doesn't or if the best result is always required, use [`MagicDb::best_magic`]
3458    ///
3459    /// # Arguments
3460    ///
3461    /// * `r` - A readable and seekable input
3462    /// * `extension` - Optional file extension to use for acceleration
3463    ///
3464    /// # Returns
3465    ///
3466    /// * `Result<Magic<'_>, Error>` - The detection result or an error
3467    ///
3468    /// # Warning
3469    ///
3470    /// File extension acceleration is made to evaluate rules faster by testing
3471    /// first the rules defining this extension with an `!:ext` entry.
3472    /// Whether you use `extension` acceleration or not with this function should not
3473    /// produce different results. Yet this makes the assumption rules are written
3474    /// correctly and every rule concerned defines `!:ext` when it is appropriate.
3475    /// If some rules are missing it, results might differ.
3476    pub fn first_magic<R: Read + Seek>(
3477        &self,
3478        r: &mut R,
3479        extension: Option<&str>,
3480    ) -> Result<Magic<'_>, Error> {
3481        let mut cache = Self::optimal_lazy_cache(r)?;
3482        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3483        self.first_magic_with_stream_kind(&mut cache, stream_kind, extension)
3484    }
3485
3486    /// An alternative to [`Self::first_magic`] using a [`LazyCache`]
3487    /// to detects file [`Magic`] stopping at the first matching magic. Magic
3488    /// rules are evaluated from the best to the least relevant, so this method
3489    /// returns most of the time the best magic. For the rare cases where
3490    /// it doesn't or if the best result is always required, use [`MagicDb::best_magic`]
3491    ///
3492    /// # Arguments
3493    ///
3494    /// * `cache` - A [`LazyCache`] used for read operations
3495    /// * `extension` - Optional file extension to use for acceleration
3496    ///
3497    /// # Returns
3498    ///
3499    /// * `Result<Magic<'_>, Error>` - The detection result or an error
3500    ///
3501    /// # Notes
3502    ///
3503    /// * Use this method **only** if you need to re-use a [`LazyCache`] for future **read** operations.
3504    /// * Use [`Self::optimal_lazy_cache`] to prepare an optimal [`LazyCache`]
3505    ///
3506    /// # Warning
3507    ///
3508    /// File extension acceleration is made to evaluate rules faster by testing
3509    /// first the rules defining this extension with an `!:ext` entry.
3510    /// Whether you use `extension` acceleration or not with this function should not
3511    /// produce different results. Yet this makes the assumption rules are written
3512    /// correctly and every rule concerned defines `!:ext` when it is appropriate.
3513    /// If some rules are missing it, results might differ.
3514    pub fn first_magic_with_lazy_cache<R: Read + Seek>(
3515        &self,
3516        cache: &mut LazyCache<R>,
3517        extension: Option<&str>,
3518    ) -> Result<Magic<'_>, Error> {
3519        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3520        self.first_magic_with_stream_kind(cache, stream_kind, extension)
3521    }
3522
3523    #[inline(always)]
3524    fn all_magics_sort_with_stream_kind<R: Read + Seek>(
3525        &self,
3526        haystack: &mut LazyCache<R>,
3527        stream_kind: StreamKind,
3528    ) -> Result<Vec<Magic<'_>>, Error> {
3529        let mut out = Vec::new();
3530
3531        let mut magic = Magic::default();
3532
3533        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3534            out.push(magic);
3535            magic = Magic::default();
3536        }
3537
3538        for rule in self.rules.iter() {
3539            rule.magic_entrypoint(&mut magic, stream_kind, haystack, self, false, 0)?;
3540
3541            // it is possible we have a strength with no message
3542            if !magic.message.is_empty() {
3543                magic.set_stream_kind(stream_kind);
3544                magic.set_source(rule.source.as_deref());
3545                out.push(magic);
3546                magic = Magic::default();
3547            }
3548
3549            magic.reset();
3550        }
3551
3552        Self::magic_default(haystack, stream_kind, &mut magic);
3553        out.push(magic);
3554
3555        out.sort_by_key(|b| std::cmp::Reverse(b.strength()));
3556
3557        Ok(out)
3558    }
3559
3560    /// Detects all [`Magic`] matching a given content.
3561    ///
3562    /// # Arguments
3563    ///
3564    /// * `r` - A readable and seekable input
3565    ///
3566    /// # Returns
3567    ///
3568    /// * `Result<Vec<Magic<'_>>, Error>` - All detection results sorted by strength or an error
3569    pub fn all_magics<R: Read + Seek>(&self, r: &mut R) -> Result<Vec<Magic<'_>>, Error> {
3570        let mut cache = Self::optimal_lazy_cache(r)?;
3571        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3572        self.all_magics_sort_with_stream_kind(&mut cache, stream_kind)
3573    }
3574
3575    /// An alternative to [`Self::all_magics`] using a [`LazyCache`]
3576    /// to detects all [`Magic`] matching a given content.
3577    ///
3578    /// # Arguments
3579    ///
3580    /// * `r` - A readable and seekable input
3581    ///
3582    /// # Returns
3583    ///
3584    /// * `Result<Vec<Magic<'_>>, Error>` - All detection results sorted by strength or an error
3585    ///
3586    /// # Notes
3587    ///
3588    /// * Use this method **only** if you need to re-use a [`LazyCache`] for future **read** operations.
3589    /// * Use [`Self::optimal_lazy_cache`] to prepare an optimal [`LazyCache`]
3590    pub fn all_magics_with_lazy_cache<R: Read + Seek>(
3591        &self,
3592        cache: &mut LazyCache<R>,
3593    ) -> Result<Vec<Magic<'_>>, Error> {
3594        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3595        self.all_magics_sort_with_stream_kind(cache, stream_kind)
3596    }
3597
3598    #[inline(always)]
3599    fn best_magic_with_stream_kind<R: Read + Seek>(
3600        &self,
3601        haystack: &mut LazyCache<R>,
3602        stream_kind: StreamKind,
3603    ) -> Result<Magic<'_>, Error> {
3604        let magics = self.all_magics_sort_with_stream_kind(haystack, stream_kind)?;
3605
3606        // magics is guaranteed to contain at least the
3607        // default magic but we unwrap to avoid any panic
3608        Ok(magics.into_iter().next().unwrap_or_else(|| {
3609            let mut magic = Magic::default();
3610            Self::magic_default(haystack, stream_kind, &mut magic);
3611            magic
3612        }))
3613    }
3614
3615    /// Detects the best [`Magic`] matching a given content.
3616    ///
3617    /// # Arguments
3618    ///
3619    /// * `r` - A readable and seekable input
3620    ///
3621    /// # Returns
3622    ///
3623    /// * `Result<Magic<'_>, Error>` - The best detection result or an error
3624    pub fn best_magic<R: Read + Seek>(&self, r: &mut R) -> Result<Magic<'_>, Error> {
3625        let mut cache = Self::optimal_lazy_cache(r)?;
3626        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3627        self.best_magic_with_stream_kind(&mut cache, stream_kind)
3628    }
3629
3630    /// An alternative to [`Self::best_magic`] using a [`LazyCache`]
3631    /// to detect the best [`Magic`] matching a given content.
3632    ///
3633    /// # Arguments
3634    ///
3635    /// * `r` - A readable and seekable input
3636    ///
3637    /// # Returns
3638    ///
3639    /// * `Result<Magic<'_>, Error>` - The best detection result or an error
3640    ///
3641    /// # Notes
3642    ///
3643    /// * Use this method **only** if you need to re-use a [`LazyCache`] for future **read** operations.
3644    /// * Use [`Self::optimal_lazy_cache`] to prepare an optimal [`LazyCache`]
3645    pub fn best_magic_with_lazy_cache<R: Read + Seek>(
3646        &self,
3647        cache: &mut LazyCache<R>,
3648    ) -> Result<Magic<'_>, Error> {
3649        let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3650        self.best_magic_with_stream_kind(cache, stream_kind)
3651    }
3652
3653    /// Serializes the database to a generic writer implementing [`io::Write`]
3654    ///
3655    /// # Returns
3656    ///
3657    /// * `Result<(), Error>` - The serialized database or an error
3658    pub fn serialize<W: Write>(self, w: &mut W) -> Result<(), Error> {
3659        let mut encoder = GzEncoder::new(w, Compression::best());
3660
3661        bincode::serde::encode_into_std_write(&self, &mut encoder, bincode::config::standard())?;
3662        encoder.finish()?;
3663        Ok(())
3664    }
3665
3666    /// Deserializes the database from a generic reader implementing [`io::Read`]
3667    ///
3668    /// # Arguments
3669    ///
3670    /// * `r` - The reader to deserialize from
3671    ///
3672    /// # Returns
3673    ///
3674    /// * `Result<Self, Error>` - The deserialized database or an error
3675    pub fn deserialize<R: Read>(r: &mut R) -> Result<Self, Error> {
3676        let mut buf = vec![];
3677        let mut gz = GzDecoder::new(r);
3678        gz.read_to_end(&mut buf).map_err(|e| {
3679            bincode::error::DecodeError::OtherString(format!("failed to read: {e}"))
3680        })?;
3681        let (sdb, _): (MagicDb, usize) =
3682            bincode::serde::decode_from_slice(&buf, bincode::config::standard())?;
3683        Ok(sdb)
3684    }
3685
3686    /// Verifies the consistency of the [`MagicDb`] database.
3687    /// This method must be called when the database is built once and used later.
3688    /// It catches [`enum@Error`] that would raise at rule evaluation time.
3689    ///
3690    /// # Errors
3691    /// Returns an error if any rule fails verification
3692    pub fn verify(&mut self) -> Result<(), Error> {
3693        if self.rules.len() == self.finalized {
3694            return Ok(());
3695        }
3696
3697        for r in self.rules.iter_mut().filter(|r| !r.finalized) {
3698            // return at the first rule failing verification
3699            r.try_finalize(&self.dependencies).map_err(|e| {
3700                Error::Verify(
3701                    r.source.clone().unwrap_or(String::from("unknown")),
3702                    r.line(),
3703                    e.into(),
3704                )
3705            })?;
3706            self.finalized += 1;
3707        }
3708
3709        debug_assert!(self.finalized <= self.rules.len());
3710
3711        Ok(())
3712    }
3713
3714    #[inline(always)]
3715    fn try_finalize(&mut self) {
3716        if self.rules.len() == self.finalized {
3717            return;
3718        }
3719
3720        let mut finalized = 0usize;
3721        self.rules.iter_mut().for_each(|r| {
3722            if r.try_finalize(&self.dependencies).is_ok() {
3723                finalized += 1;
3724            }
3725        });
3726
3727        self.finalized = finalized;
3728
3729        debug_assert!(self.finalized <= self.rules.len());
3730
3731        // put text rules at the end
3732        self.rules.sort_by_key(|r| (r.is_text(), -(r.score as i64)));
3733    }
3734}
3735
3736#[cfg(test)]
3737mod tests {
3738    use std::io::Cursor;
3739
3740    use regex::bytes::Regex;
3741
3742    use crate::utils::unix_local_time_to_string;
3743
3744    use super::*;
3745
3746    macro_rules! lazy_cache {
3747        ($l: literal) => {
3748            LazyCache::from_read_seek(Cursor::new($l)).unwrap()
3749        };
3750    }
3751
3752    fn first_magic(
3753        rule: &str,
3754        content: &[u8],
3755        stream_kind: StreamKind,
3756    ) -> Result<Magic<'static>, Error> {
3757        let mut md = MagicDb::new();
3758        md.load(
3759            FileMagicParser::parse_str(rule, None)
3760                .inspect_err(|e| eprintln!("{e}"))
3761                .unwrap(),
3762        );
3763        let mut reader = LazyCache::from_read_seek(Cursor::new(content)).unwrap();
3764        let v = md.best_magic_with_stream_kind(&mut reader, stream_kind)?;
3765        Ok(v.into_owned())
3766    }
3767
3768    /// helper macro to debug tests
3769    #[allow(unused_macros)]
3770    macro_rules! enable_trace {
3771        () => {
3772            tracing_subscriber::fmt()
3773                .with_max_level(tracing_subscriber::filter::LevelFilter::TRACE)
3774                .try_init();
3775        };
3776    }
3777
3778    macro_rules! parse_assert {
3779        ($rule:literal) => {
3780            FileMagicParser::parse_str($rule, None)
3781                .inspect_err(|e| eprintln!("{e}"))
3782                .unwrap()
3783        };
3784    }
3785
3786    macro_rules! assert_magic_match_bin {
3787        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Binary).unwrap() }};
3788        ($rule: literal, $content:literal, $message:expr) => {{
3789            assert_eq!(
3790                first_magic($rule, $content, StreamKind::Binary)
3791                    .unwrap()
3792                    .message(),
3793                $message
3794            );
3795        }};
3796    }
3797
3798    macro_rules! assert_magic_match_text {
3799        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8)).unwrap() }};
3800        ($rule: literal, $content:literal, $message:expr) => {{
3801            assert_eq!(
3802                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
3803                    .unwrap()
3804                    .message(),
3805                $message
3806            );
3807        }};
3808    }
3809
3810    macro_rules! assert_magic_not_match_text {
3811        ($rule: literal, $content:literal) => {{
3812            assert!(
3813                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
3814                    .unwrap()
3815                    .is_default()
3816            );
3817        }};
3818    }
3819
3820    macro_rules! assert_magic_not_match_bin {
3821        ($rule: literal, $content:literal) => {{
3822            assert!(
3823                first_magic($rule, $content, StreamKind::Binary)
3824                    .unwrap()
3825                    .is_default()
3826            );
3827        }};
3828    }
3829
    /// Exercises `regex` rules on text and binary content, including relative
    /// (`&0`) child rules evaluated after a regex match.
    #[test]
    fn test_regex() {
        // shebang detection: the child regex captures the interpreter name
        // which is then substituted into the message
        assert_magic_match_text!(
            r#"
0	regex/1024 \^#![[:space:]]*/usr/bin/env[[:space:]]+
!:mime	text/x-shellscript
>&0  regex/64 .*($|\\b) %s shell script text executable
    "#,
            br#"#!/usr/bin/env bash
        echo hello world"#,
            // the magic generated
            "bash shell script text executable"
        );

        // sanity check: with unicode mode disabled the regex crate matches raw bytes
        let re = Regex::new(r"(?-u)\x42\x82").unwrap();
        assert!(re.is_match(b"\x42\x82"));

        // the pattern is found past the start of the content
        assert_magic_match_bin!(
            r#"0 regex \x42\x82 binary regex match"#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82"
        );

        // test regex continuation after match
        assert_magic_match_bin!(
            r#"
            0 regex \x42\x82
            >&0 string \xde\xad\xbe\xef it works
            "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        // with the `s` flag the child rule is expected to start at the
        // beginning of the match, so it sees the matched bytes again
        assert_magic_match_bin!(
            r#"
            0 regex/s \x42\x82
            >&0 string \x42\x82\xde\xad\xbe\xef it works
            "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        // ^ must match start of line when matching text
        assert_magic_match_text!(
            r#"
0	regex/1024 \^HelloWorld$ HelloWorld String"#,
            br#"
// this is a comment after an empty line
HelloWorld
            "#
        );
    }
3879
    /// Exercises the `string` test with its modifier flags (`w`, `C`, `c`, `f`, `W`).
    #[test]
    fn test_string_with_mods() {
        // `w`: the rule has three blanks after `#!`, the content only one
        assert_magic_match_text!(
            r#"0	string/w	#!\ \ \ /usr/bin/env\ bash	BASH
        "#,
            b"#! /usr/bin/env bash i
        echo hello world"
        );

        // test uppercase insensitive
        assert_magic_match_text!(
            r#"0	string/C	HelloWorld	it works
        "#,
            b"helloworld"
        );

        assert_magic_not_match_text!(
            r#"0	string/C	HelloWorld	it works
        "#,
            b"hELLOwORLD"
        );

        // test lowercase insensitive
        assert_magic_match_text!(
            r#"0	string/c	HelloWorld	it works
        "#,
            b"HELLOWORLD"
        );

        assert_magic_not_match_text!(
            r#"0	string/c	HelloWorld	it works
        "#,
            b"helloworld"
        );

        // test full word match
        assert_magic_match_text!(
            r#"0	string/f	#!/usr/bin/env\ bash	BASH
        "#,
            b"#!/usr/bin/env bash"
        );

        // `pythonic` only starts with the expected word, so `f` must reject it
        assert_magic_not_match_text!(
            r#"0	string/f	#!/usr/bin/python PYTHON"#,
            b"#!/usr/bin/pythonic"
        );

        // testing whitespace compacting
        assert_magic_match_text!(
            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
            b"#!/usr/bin/env    python"
        );

        // the rule requires two blanks, the content has only one
        assert_magic_not_match_text!(
            r#"0	string/W	#!/usr/bin/env\ \ python  PYTHON"#,
            b"#!/usr/bin/env python"
        );
    }
3938
    /// Exercises the `search` test with modifier flags and relative child rules.
    #[test]
    fn test_search_with_mods() {
        // several flags combined with a search range of 1
        assert_magic_match_text!(
            r#"0	search/1/fwt	#!\ /usr/bin/luatex	LuaTex script text executable"#,
            b"#!          /usr/bin/luatex "
        );

        // test matching from the beginning
        assert_magic_match_text!(
            r#"
            0	search/s	/usr/bin/env
            >&0 string /usr/bin/env it works
            "#,
            b"#!/usr/bin/env    python"
        );

        // without `s` the child rule is not expected to see the searched
        // string again, so the same rule pair must not match
        assert_magic_not_match_text!(
            r#"
            0	search	/usr/bin/env
            >&0 string /usr/bin/env it works
            "#,
            b"#!/usr/bin/env    python"
        );
    }
3963
    /// Exercises length-prefixed strings (`pstring`) with every length-prefix
    /// modifier used by the fixtures below.
    #[test]
    fn test_pstring() {
        // default: a single length byte precedes the string
        assert_magic_match_bin!(r#"0 pstring Toast it works"#, b"\x05Toast");

        assert_magic_match_bin!(r#"0 pstring Toast %s"#, b"\x05Toast", "Toast");

        assert_magic_not_match_bin!(r#"0 pstring Toast Doesn't work"#, b"\x07Toaster");

        // testing with modifiers
        // `H`: two-byte big-endian length
        assert_magic_match_bin!(r#"0 pstring/H Toast it works"#, b"\x00\x05Toast");

        // `J`: the stored length also counts the length field itself (5 + 2)
        assert_magic_match_bin!(r#"0 pstring/HJ Toast it works"#, b"\x00\x07Toast");

        assert_magic_match_bin!(r#"0 pstring/HJ Toast %s"#, b"\x00\x07Toast", "Toast");

        // `h`: two-byte little-endian length
        assert_magic_match_bin!(r#"0 pstring/h Toast it works"#, b"\x05\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/hJ Toast it works"#, b"\x07\x00Toast");

        // `L`: four-byte big-endian length (with `J`: 5 + 4)
        assert_magic_match_bin!(r#"0 pstring/L Toast it works"#, b"\x00\x00\x00\x05Toast");

        assert_magic_match_bin!(r#"0 pstring/LJ Toast it works"#, b"\x00\x00\x00\x09Toast");

        // `l`: four-byte little-endian length (with `J`: 5 + 4)
        assert_magic_match_bin!(r#"0 pstring/l Toast it works"#, b"\x05\x00\x00\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/lJ Toast it works"#, b"\x09\x00\x00\x00Toast");
    }
3991
3992    #[test]
3993    fn test_max_recursion() {
3994        let res = first_magic(
3995            r#"0	indirect x"#,
3996            b"#!          /usr/bin/luatex ",
3997            StreamKind::Binary,
3998        );
3999        assert!(res.is_err());
4000        let _ = res.inspect_err(|e| {
4001            assert!(matches!(
4002                e.unwrap_localized(),
4003                Error::MaximumRecursion(MAX_RECURSION)
4004            ))
4005        });
4006    }
4007
    /// Exercises the comparison operators (`=`, `!`, `>`, `<`) of the `string` test.
    #[test]
    fn test_string_ops() {
        // implicit equality
        assert_magic_match_text!("0	string/b MZ MZ File", b"MZ\0");
        // `!`: anything but the given string
        assert_magic_match_text!("0	string !MZ Not MZ File", b"AZ\0");
        // `>`: content comparing greater than the given string
        assert_magic_match_text!("0	string >\0 Any String", b"A\0");
        assert_magic_match_text!("0	string >Test Any String", b"Test 1\0");
        // `<`: content comparing lower than the given string
        assert_magic_match_text!("0	string <Test Any String", b"\0");
        assert_magic_not_match_text!("0	string >Test Any String", b"\0");
    }
4017
    /// Exercises little-endian UTF-16 strings (`lestring16`).
    #[test]
    fn test_lestring16() {
        assert_magic_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        // `x` accepts any value, the decoded string is substituted for `%s`
        assert_magic_match_bin!(
            "0 lestring16 x %s",
            b"\x61\x00\x62\x00\x63\x00\x64\x00\x00",
            "abcd"
        );
        // same characters encoded big-endian must not match
        assert_magic_not_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        // non-zero offset
        assert_magic_match_bin!(
            "4 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64\x00"
        );
    }
4038
    /// Exercises big-endian UTF-16 strings (`bestring16`).
    #[test]
    fn test_bestring16() {
        assert_magic_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        // `x` accepts any value, the decoded string is substituted for `%s`
        assert_magic_match_bin!(
            "0 bestring16 x %s",
            b"\x00\x61\x00\x62\x00\x63\x00\x64",
            "abcd"
        );
        // same characters encoded little-endian must not match
        assert_magic_not_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        // non-zero offset
        assert_magic_match_bin!(
            "4 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64"
        );
    }
4059
    /// Negative offsets are resolved from the end of the content.
    #[test]
    fn test_offset_from_end() {
        // -1 addresses the last byte
        assert_magic_match_bin!("-1 ubyte 0x42 last byte ok", b"\x00\x00\x42");
        // -2 addresses the second-to-last byte
        assert_magic_match_bin!("-2 ubyte 0x41 last byte ok", b"\x00\x41\x00");
    }
4065
    /// Chained relative offsets (`&0`): each child rule reads the byte that
    /// follows the one consumed by its parent.
    #[test]
    fn test_relative_offset() {
        assert_magic_match_bin!(
            "
            0 ubyte 0x42
            >&0 ubyte 0x00
            >>&0 ubyte 0x41 third byte ok
            ",
            b"\x42\x00\x41\x00"
        );
    }
4077
    /// Indirect offsets: the offset of the test is itself read from the content.
    #[test]
    fn test_indirect_offset() {
        // the little-endian long at offset 0 holds 4, byte 4 is 0x42
        assert_magic_match_bin!("(0.l) ubyte 0x42 it works", b"\x04\x00\x00\x00\x42");
        // adding fixed value to offset
        assert_magic_match_bin!("(0.l+3) ubyte 0x42 it works", b"\x01\x00\x00\x00\x42");
        // testing offset pair
        // 4 (read at offset 0) + 4 (read at offset 4) = 8, byte 8 is 0x42
        assert_magic_match_bin!(
            "(0.l+(4)) ubyte 0x42 it works",
            b"\x04\x00\x00\x00\x04\x00\x00\x00\x42"
        );
    }
4089
    /// A `use` rule carrying its own message: the final message is expected to
    /// be the message of the `use` line followed by the one of the named rule.
    #[test]
    fn test_use_with_message() {
        assert_magic_match_bin!(
            r#"
0 string MZ
>0 use mz first match

0 name mz then second match
>0 string MZ
"#,
            b"MZ\0",
            "first match then second match"
        );
    }
4104
    /// Exercises the arithmetic and bitwise transforms applied to a scalar
    /// before it is compared with the test value.
    #[test]
    fn test_scalar_transform() {
        assert_magic_match_bin!("0 ubyte+1 0x1 add works", b"\x00");
        assert_magic_match_bin!("0 ubyte-1 0xfe sub works", b"\xff");
        assert_magic_match_bin!("0 ubyte%2 0 mod works", b"\x0a");
        assert_magic_match_bin!("0 ubyte&0x0f 0x0f bitand works", b"\xff");
        assert_magic_match_bin!("0 ubyte|0x0f 0xff bitor works", b"\xf0");
        assert_magic_match_bin!("0 ubyte^0x0f 0xf0 bitxor works", b"\xff");

        // a zero divisor (modulo, then division) must be rejected at parse time
        FileMagicParser::parse_str("0 ubyte%0 mod by zero", None)
            .expect_err("expect div by zero error");
        FileMagicParser::parse_str("0 ubyte/0 div by zero", None)
            .expect_err("expect div by zero error");
    }
4119
    /// Exercises the big-endian 32-bit `belong` type with every comparison operator.
    #[test]
    fn test_belong() {
        // Test that a file with a four-byte value at offset 0 that matches the given value in big-endian byte order
        assert_magic_match_bin!("0 belong 0x12345678 Big-endian long", b"\x12\x34\x56\x78");
        // Test that a file with a four-byte value at offset 0 that does not match the given value in big-endian byte order
        assert_magic_not_match_bin!("0 belong 0x12345678 Big-endian long", b"\x78\x56\x34\x12");
        // Test that a file with a four-byte value at a non-zero offset that matches the given value in big-endian byte order
        assert_magic_match_bin!(
            "4 belong 0x12345678 Big-endian long",
            b"\x00\x00\x00\x00\x12\x34\x56\x78"
        );
        // Test < operator
        assert_magic_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x77");
        assert_magic_not_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test > operator
        assert_magic_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x79");
        assert_magic_not_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test & operator (every bit of the test value must be set)
        assert_magic_match_bin!("0 belong &0x5678 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong &0x0000FFFF Big-endian long", b"\x12\x34\x56\x78");

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x01\x56\x78");

        // Test ~ operator (the content holds the bitwise complement of the test value)
        assert_magic_match_bin!("0 belong ~0x12345678 Big-endian long", b"\xed\xcb\xa9\x87");
        assert_magic_not_match_bin!("0 belong ~0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test x operator (any value)
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x12\x34\x56\x78");
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x78\x56\x34\x12");
    }
4155
4156    #[test]
4157    fn test_parse_search() {
4158        parse_assert!("0 search test");
4159        parse_assert!("0 search/24/s test");
4160        parse_assert!("0 search/s/24 test");
4161    }
4162
    /// Exercises `bedate`: a big-endian 4-byte Unix timestamp.
    #[test]
    fn test_bedate() {
        // 946684800 == 0x386D4380
        assert_magic_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        // non-zero offset, the formatted date is substituted for `%s`
        assert_magic_match_bin!(
            "4 bedate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
    /// Exercises `beldate`: a big-endian 4-byte Unix timestamp whose message is
    /// compared against the local-time rendering of the same timestamp.
    #[test]
    fn test_beldate() {
        assert_magic_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        // the expected text depends on the local timezone, hence the helper
        assert_magic_match_bin!(
            "4 beldate 946684800 {}",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            unix_local_time_to_string(946684800)
        );
    }
4196
    /// Exercises `beqdate`: a big-endian 8-byte Unix timestamp.
    #[test]
    fn test_beqdate() {
        assert_magic_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80"
        );

        assert_magic_not_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        // the formatted date is substituted for `%s`
        assert_magic_match_bin!(
            "0 beqdate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
4215
    /// Exercises `medate`: a middle-endian 4-byte Unix timestamp. The fixture
    /// stores 0x386D4380 as the bytes `6D 38 80 43` (each 16-bit half swapped).
    #[test]
    fn test_medate() {
        assert_magic_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );

        assert_magic_not_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        // non-zero offset, the formatted date is substituted for `%s`
        assert_magic_match_bin!(
            "4 medate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            "2000-01-01 00:00:00"
        );
    }
4234
    /// Exercises `meldate`: a middle-endian 4-byte Unix timestamp whose message
    /// is compared against the local-time rendering of the same timestamp.
    #[test]
    fn test_meldate() {
        assert_magic_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );
        assert_magic_not_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        // the expected text depends on the local timezone, hence the helper
        assert_magic_match_bin!(
            "4 meldate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            unix_local_time_to_string(946684800)
        );
    }
4252
    /// Exercises `date`: a 4-byte Unix timestamp without explicit endianness.
    ///
    /// NOTE(review): the fixtures are laid out little-endian; presumably `date`
    /// reads in native byte order, so confirm this test on big-endian targets.
    #[test]
    fn test_date() {
        assert_magic_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        // non-zero offset, the formatted date is substituted for `{}`
        assert_magic_match_bin!(
            "4 date 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            "2000-01-01 00:00:00"
        );
    }
4269
    /// Exercises `leldate`: a little-endian 4-byte Unix timestamp whose message
    /// is compared against the local-time rendering of the same timestamp.
    #[test]
    fn test_leldate() {
        assert_magic_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        // the expected text depends on the local timezone, hence the helper
        assert_magic_match_bin!(
            "4 leldate 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            unix_local_time_to_string(946684800)
        );
    }
4286
    /// Exercises `leqdate`: a little-endian 8-byte Unix timestamp.
    #[test]
    fn test_leqdate() {
        // 1577836800 == 0x5E0BE100
        assert_magic_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
        );

        assert_magic_not_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        // non-zero offset, the formatted date is substituted for `%s`
        assert_magic_match_bin!(
            "8 leqdate 1577836800 %s",
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
            "2020-01-01 00:00:00"
        );
    }
4304
    /// Exercises `leqldate`: a little-endian 8-byte Unix timestamp whose message
    /// is compared against the local-time rendering of the same timestamp.
    #[test]
    fn test_leqldate() {
        // 1577836800 == 0x5E0BE100
        assert_magic_match_bin!(
            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
        );

        assert_magic_not_match_bin!(
            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        // the expected text depends on the local timezone, hence the helper
        assert_magic_match_bin!(
            "8 leqldate 1577836800 %s",
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
            unix_local_time_to_string(1577836800)
        );
    }
4322
    /// Exercises the middle-endian 32-bit `melong` type with every comparison
    /// operator.
    ///
    /// As the `=` case shows, the bytes `b0 b1 b2 b3` decode to the value
    /// `b1 b0 b3 b2`; the values quoted in the comments below follow that order.
    #[test]
    fn test_melong() {
        // Test = operator
        assert_magic_match_bin!(
            "0 melong =0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        );
        assert_magic_not_match_bin!(
            "0 melong =0x12345678 Middle-endian long",
            b"\x00\x00\x00\x00"
        );

        // Test < operator
        assert_magic_match_bin!(
            "0 melong <0x12345678 Middle-endian long",
            b"\x34\x12\x78\x55"
        ); // decodes to 0x12345578
        assert_magic_not_match_bin!(
            "0 melong <0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // decodes to 0x12345678

        // Test > operator
        assert_magic_match_bin!(
            "0 melong >0x12345678 Middle-endian long",
            b"\x34\x12\x78\x57"
        ); // decodes to 0x12345778
        assert_magic_not_match_bin!(
            "0 melong >0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // decodes to 0x12345678

        // Test & operator
        assert_magic_match_bin!("0 melong &0x5678 Middle-endian long", b"\xab\xcd\x78\x56"); // decodes to 0xCDAB5678
        assert_magic_not_match_bin!(
            "0 melong &0x0000FFFF Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // decodes to 0x12345678

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!(
            "0 melong ^0xFFFF0000 Middle-endian long",
            b"\x00\x00\x78\x56"
        ); // decodes to 0x00005678
        assert_magic_not_match_bin!(
            "0 melong ^0xFFFF0000 Middle-endian long",
            b"\x00\x01\x78\x56"
        ); // decodes to 0x01005678

        // Test ~ operator
        assert_magic_match_bin!(
            "0 melong ~0x12345678 Middle-endian long",
            b"\xCB\xED\x87\xA9"
        ); // decodes to 0xEDCBA987, the complement of 0x12345678
        assert_magic_not_match_bin!(
            "0 melong ~0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // The original value

        // Test x operator
        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x34\x12\x78\x56");
        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x00\x00\x00\x00");
    }
4386
    /// Exercises the unsigned 64-bit `uquad` type with every comparison operator.
    ///
    /// NOTE(review): the fixtures are laid out little-endian; presumably `uquad`
    /// reads in native byte order, so confirm this test on big-endian targets.
    #[test]
    fn test_uquad() {
        // Test = operator
        assert_magic_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        assert_magic_not_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        // Test < operator
        assert_magic_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x11"
        );
        assert_magic_not_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test > operator
        assert_magic_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x13"
        );
        assert_magic_not_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test & operator (every bit of the test value must be set)
        assert_magic_match_bin!(
            "0 uquad &0xF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        assert_magic_not_match_bin!(
            "0 uquad &0xFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        ); // All bits clear
        assert_magic_not_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        ); // Some bits set

        // Test ~ operator
        assert_magic_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\x0F\x21\x43\x65\x87\xA9\xCB\xED"
        );
        assert_magic_not_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        ); // The original value

        // Test x operator
        // the value read is rendered through the `{:#x}` placeholder
        assert_magic_match_bin!(
            "0 uquad x {:#x}",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12",
            "0x123456789abcdef0"
        );
        assert_magic_match_bin!(
            "0 uquad x Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }
4460
    /// Exercises the 16-byte `guid` type. In these fixtures the bytes appear in
    /// the same order as the hexadecimal digits of the textual GUID.
    #[test]
    fn test_guid() {
        assert_magic_match_bin!(
            "0 guid EC959539-6786-2D4E-8FDB-98814CE76C1E It works",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E"
        );

        // unrelated bytes must not match
        assert_magic_not_match_bin!(
            "0 guid 399595EC-8667-4E2D-8FDB-98814CE76C1E It works",
            b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
        );

        // `x` accepts any value, the formatted GUID is substituted for `%s`
        assert_magic_match_bin!(
            "0 guid x %s",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E",
            "EC959539-6786-2D4E-8FDB-98814CE76C1E"
        );
    }
4479
    /// Exercises `ubeqdate`: an unsigned big-endian 8-byte Unix timestamp.
    #[test]
    fn test_ubeqdate() {
        // 1633046400 == 0x61564F80
        assert_magic_match_bin!(
            "0 ubeqdate 1633046400 It works",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80"
        );

        // `x` accepts any value, the formatted date is substituted for `%s`
        assert_magic_match_bin!(
            "0 ubeqdate x %s",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80",
            "2021-10-01 00:00:00"
        );

        assert_magic_not_match_bin!(
            "0 ubeqdate 1633046400 It should not work",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }
4498
    /// Exercises `ldate`: a 4-byte Unix timestamp whose message is compared
    /// against the local-time rendering of the same timestamp.
    ///
    /// NOTE(review): the fixtures are laid out little-endian; presumably `ldate`
    /// reads in native byte order, so confirm this test on big-endian targets.
    #[test]
    fn test_ldate() {
        // 1640551520 == 0x61C8D460
        assert_magic_match_bin!("0 ldate 1640551520 It works", b"\x60\xd4\xC8\x61");

        assert_magic_not_match_bin!("0 ldate 1633046400 It should not work", b"\x00\x00\x00\x00");

        // the expected text depends on the local timezone, hence the helper
        assert_magic_match_bin!(
            "0 ldate x %s",
            b"\x60\xd4\xC8\x61",
            unix_local_time_to_string(1640551520)
        );
    }
4511
    /// The transformed scalar (0x14 == 20, divided by or taken modulo 10) is
    /// both what gets compared and what gets printed through `{}`.
    #[test]
    fn test_scalar_with_transform() {
        assert_magic_match_bin!("0 ubyte/10 2 {}", b"\x14", "2");
        assert_magic_match_bin!("0 ubyte/10 x {}", b"\x14", "2");
        assert_magic_match_bin!("0 ubyte%10 x {}", b"\x14", "0");
    }
4518
    /// Same as the scalar variant for a little-endian float: the bytes
    /// `00 00 a0 41` encode 20.0, which is divided by or taken modulo 10.
    #[test]
    fn test_float_with_transform() {
        assert_magic_match_bin!("0 lefloat/10 2 {}", b"\x00\x00\xa0\x41", "2");
        assert_magic_match_bin!("0 lefloat/10 x {}", b"\x00\x00\xa0\x41", "2");
        assert_magic_match_bin!("0 lefloat%10 x {}", b"\x00\x00\xa0\x41", "0");
    }
4525
4526    #[test]
4527    fn test_read_octal() {
4528        // Basic cases
4529        assert_eq!(read_octal_u64(&mut lazy_cache!("0")), Some(0));
4530        assert_eq!(read_octal_u64(&mut lazy_cache!("00")), Some(0));
4531        assert_eq!(read_octal_u64(&mut lazy_cache!("01")), Some(1));
4532        assert_eq!(read_octal_u64(&mut lazy_cache!("07")), Some(7));
4533        assert_eq!(read_octal_u64(&mut lazy_cache!("010")), Some(8));
4534        assert_eq!(read_octal_u64(&mut lazy_cache!("0123")), Some(83));
4535        assert_eq!(read_octal_u64(&mut lazy_cache!("0755")), Some(493));
4536
4537        // With trailing non-octal characters
4538        assert_eq!(read_octal_u64(&mut lazy_cache!("0ABC")), Some(0));
4539        assert_eq!(read_octal_u64(&mut lazy_cache!("01ABC")), Some(1));
4540        assert_eq!(read_octal_u64(&mut lazy_cache!("0755ABC")), Some(493));
4541        assert_eq!(read_octal_u64(&mut lazy_cache!("0123ABC")), Some(83));
4542
4543        // Invalid octal digits
4544        assert_eq!(read_octal_u64(&mut lazy_cache!("08")), Some(0)); // stops at '8'
4545        assert_eq!(read_octal_u64(&mut lazy_cache!("01238")), Some(83)); // stops at '8'
4546
4547        // No leading '0'
4548        assert_eq!(read_octal_u64(&mut lazy_cache!("123")), None);
4549        assert_eq!(read_octal_u64(&mut lazy_cache!("755")), None);
4550
4551        // Empty string
4552        assert_eq!(read_octal_u64(&mut lazy_cache!("")), None);
4553
4554        // Only non-octal characters
4555        assert_eq!(read_octal_u64(&mut lazy_cache!("ABC")), None);
4556        assert_eq!(read_octal_u64(&mut lazy_cache!("8ABC")), None); // first char is not '0'
4557
4558        // Longer valid octal (but within u64 range)
4559        assert_eq!(
4560            read_octal_u64(&mut lazy_cache!("01777777777")),
4561            Some(268435455)
4562        );
4563    }
4564
4565    #[test]
4566    fn test_offset_bug_1() {
4567        // this tests the exact behaviour
4568        // expected by libmagic/file
4569        assert_magic_match_bin!(
4570            r"
45711	string		TEST Bread is
4572# offset computation is relative to
4573# rule start
4574>(5.b)	use toasted
4575
45760 name toasted
4577>0	string twice Toasted
4578>>0  use toasted_twice
4579
45800 name toasted_twice
4581>(6.b) string x %s
4582        ",
4583            b"\x00TEST\x06twice\x00\x06",
4584            "Bread is Toasted twice"
4585        );
4586    }
4587
4588    // this test implement the exact same logic as
4589    // test_offset_bug_1 except that the rule starts
4590    // matching from end. Surprisingly we need to
4591    // adjust indirect offsets so that it works in
4592    // libmagic/file
4593    #[test]
4594    fn test_offset_bug_2() {
4595        // this tests the exact behaviour
4596        // expected by libmagic/file
4597        assert_magic_match_bin!(
4598            r"
4599-12	string		TEST Bread is
4600>(4.b)	use toasted
4601
46020 name toasted
4603>0	string twice Toasted
4604>>0  use toasted_twice
4605
46060 name toasted_twice
4607>(6.b) string x %
4608        ",
4609            b"\x00TEST\x06twice\x00\x06",
4610            "Bread is Toasted twice"
4611        )
4612    }
4613
4614    #[test]
4615    fn test_offset_bug_3() {
4616        // this tests the exact behaviour
4617        // expected by libmagic/file
4618        assert_magic_match_bin!(
4619            r"
46201	string		TEST Bread is
4621>(5.b) indirect/r x
4622
46230	string twice Toasted
4624>0  use toasted_twice
4625
46260 name toasted_twice
4627>0 string x %s
4628        ",
4629            b"\x00TEST\x06twice\x00\x08",
4630            "Bread is Toasted twice"
4631        )
4632    }
4633
4634    #[test]
4635    fn test_offset_bug_4() {
4636        // this tests the exact behaviour
4637        // expected by libmagic/file
4638        assert_magic_match_bin!(
4639            r"
46401	string		Bread %s
4641>(6.b) indirect/r x
4642
4643# this one uses a based offset
4644# computed at indirection
46451	string is\ Toasted %s
4646>(11.b)  use toasted_twice
4647
4648# this one is using a new base
4649# offset being previous base
4650# offset + offset of use
46510 name toasted_twice
4652>0 string x %s
4653            ",
4654            b"\x00Bread\x06is Toasted\x0ctwice\x00",
4655            "Bread is Toasted twice"
4656        )
4657    }
4658
4659    #[test]
4660    fn test_offset_bug_5() {
4661        assert_magic_match_bin!(
4662            r"
46631	string		TEST Bread is
4664>(5.b) indirect/r x
4665
46660	string twice Toasted
4667>0  use toasted_twice
4668
46690 name toasted_twice
4670>0 string twice
4671>>&1 byte 0x08 twice
4672            ",
4673            b"\x00TEST\x06twice\x00\x08",
4674            "Bread is Toasted twice"
4675        )
4676    }
4677
4678    #[test]
4679    fn test_message_parts() {
4680        let m = first_magic(
4681            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
4682            b"#!/usr/bin/env    python",
4683            StreamKind::Text(TextEncoding::Ascii),
4684        )
4685        .unwrap();
4686
4687        assert!(m.message_parts().any(|p| p.eq_ignore_ascii_case("python")))
4688    }
4689
4690    #[test]
4691    fn test_load_bulk() {
4692        let mut db = MagicDb::new();
4693
4694        let rules = vec![
4695            parse_assert!("0 search test"),
4696            parse_assert!("0 search/24/s test"),
4697            parse_assert!("0 search/s/24 test"),
4698        ];
4699
4700        db.load_bulk(rules.into_iter());
4701        db.verify().unwrap();
4702    }
4703
4704    #[test]
4705    fn test_load_bulk_failure() {
4706        let mut db = MagicDb::new();
4707
4708        let rules = vec![parse_assert!(
4709            r#"
47100 search/s/24 test
4711>0 use test
4712"#
4713        )];
4714
4715        db.load_bulk(rules.into_iter());
4716        assert!(matches!(db.verify(), Err(Error::Verify(_, _, _))));
4717    }
4718}
pure_magic/lib.rs

pure_magic/
lib.rs