pure_magic/
lib.rs

1#![forbid(unsafe_code)]
2#![deny(unused_imports)]
3//! # `pure-magic`: A pure and safe Rust Reimplementation of `libmagic`
4//!
5//! Unlike many file identification crates, `pure-magic` is highly compatible with the standard
6//! `magic` rule format, allowing seamless reuse of existing
7//! [rules](https://github.com/qjerome/magic-rs/tree/main/magic-db/src/magdir). This makes it an ideal
8//! drop-in replacement for crates relying on **`libmagic` C bindings**, where memory safety is critical.
9//!
10//! **Key Features:**
11//! - File type detection
12//! - MIME type inference
13//! - Custom magic rule parsing
14//!
15//! ## Installation
16//! Add `pure-magic` to your `Cargo.toml`:
17//!
18//! ```toml
19//! [dependencies]
20//! pure-magic = "0.1"  # Replace with the latest version
21//! ```
22//!
23//! Or add the latest version with cargo:
24//!
25//! ```sh
26//! cargo add pure-magic
27//! ```
28//!
29//! ## Quick Start
30//!
31//! ### Detect File Types Programmatically
32//! ```rust
33//! use pure_magic::{MagicDb, MagicSource};
34//! use std::fs::File;
35//!
36//! fn main() -> Result<(), Box<dyn std::error::Error>> {
37//!     let mut db = MagicDb::new();
38//!     // Create a MagicSource from a file
39//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
40//!     db.load(rust_magic)?;
41//!
42//!     // Open a file and detect its type
43//!     let mut file = File::open("src/lib.rs")?;
44//!     let magic = db.first_magic(&mut file, None)?;
45//!
46//!     println!(
47//!         "File type: {} (MIME: {}, strength: {})",
48//!         magic.message(),
49//!         magic.mime_type(),
50//!         magic.strength()
51//!     );
52//!     Ok(())
53//! }
54//! ```
55//!
56//! ### Get All Matching Rules
57//! ```rust
58//! use pure_magic::{MagicDb, MagicSource};
59//! use std::fs::File;
60//!
61//! fn main() -> Result<(), Box<dyn std::error::Error>> {
62//!     let mut db = MagicDb::new();
63//!     // Create a MagicSource from a file
64//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
65//!     db.load(rust_magic)?;
66//!
67//!     // Open a file and detect its type
68//!     let mut file = File::open("src/lib.rs")?;
69//!
70//!     // Get all matching rules, sorted by strength
71//!     let magics = db.all_magics(&mut file)?;
72//!
73//!     // Must contain rust file magic and default text magic
74//!     assert!(magics.len() > 1);
75//!
76//!     for magic in magics {
77//!         println!(
78//!             "Match: {} (strength: {}, source: {})",
79//!             magic.message(),
80//!             magic.strength(),
81//!             magic.source().unwrap_or("unknown")
82//!         );
83//!     }
84//!     Ok(())
85//! }
86//! ```
87//!
88//! ### Serialize a Database to Disk
89//! ```rust
90//! use pure_magic::{MagicDb, MagicSource};
91//! use std::fs::File;
92//!
93//! fn main() -> Result<(), Box<dyn std::error::Error>> {
94//!     let mut db = MagicDb::new();
95//!     // Create a MagicSource from a file
96//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
97//!     db.load(rust_magic)?;
98//!
99//!     // Serialize the database to a file
100//!     let mut output = File::create("/tmp/compiled.db")?;
101//!     db.serialize(&mut output)?;
102//!
103//!     println!("Database saved to file");
104//!     Ok(())
105//! }
106//! ```
107//!
108//! ### Deserialize a Database
109//! ```rust
110//! use pure_magic::{MagicDb, MagicSource};
111//! use std::fs::File;
112//!
113//! fn main() -> Result<(), Box<dyn std::error::Error>> {
114//!     let mut db = MagicDb::new();
115//!     // Create a MagicSource from a file
116//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
117//!     db.load(rust_magic)?;
118//!
//!     // Serialize the database into a vector
120//!     let mut ser = vec![];
121//!     db.serialize(&mut ser)?;
122//!     println!("Database saved to vector");
123//!
//!     // Deserialize the database back from a slice
125//!     let db = MagicDb::deserialize(&mut ser.as_slice())?;
126//!
127//!     assert!(!db.rules().is_empty());
128//!
129//!     Ok(())
130//! }
131//! ```
132//!
133//! ## License
134//! This project is licensed under the **GPL-3.0 License**.
135//!
136//! ## Contributing
137//! Contributions are welcome! Open an issue or submit a pull request.
138//!
139//! ## Acknowledgments
140//! - Inspired by the original `libmagic` (part of the `file` command).
141
142use dyf::{DynDisplay, FormatString, dformat};
143use flagset::{FlagSet, flags};
144use flate2::{Compression, read::GzDecoder, write::GzEncoder};
145use lazy_cache::LazyCache;
146use memchr::memchr;
147use pest::{Span, error::ErrorVariant};
148use regex::bytes::{self};
149use serde::{Deserialize, Serialize};
150use std::{
151    borrow::Cow,
152    cmp::max,
153    collections::{HashMap, HashSet},
154    fmt::{self, Debug, Display},
155    io::{self, Read, Seek, SeekFrom, Write},
156    ops::{Add, BitAnd, BitOr, BitXor, Div, Mul, Rem, Sub},
157    path::Path,
158};
159use tar::Archive;
160use thiserror::Error;
161use tracing::{Level, debug, enabled, trace};
162
163use crate::{
164    numeric::{Float, FloatDataType, Scalar, ScalarDataType},
165    parser::{FileMagicParser, Rule},
166    utils::{decode_id3, find_json_boundaries},
167};
168
169mod numeric;
170mod parser;
171mod utils;
172
// Strength value for hardcoded magic — presumably assigned to detections that
// do not come from a rule file; usage is outside this view, TODO confirm.
const HARDCODED_MAGIC_STRENGTH: u64 = 2048;
// Source label for hardcoded magic — presumably reported in place of a rule
// file name; usage is outside this view, TODO confirm.
const HARDCODED_SOURCE: &str = "hardcoded";
// Corresponds to the FILE_INDIR_MAX constant defined in libmagic.
const MAX_RECURSION: usize = 50;
/// Constant found in libmagic (7 MiB), used as the byte limit for search tests.
pub const FILE_BYTES_MAX: usize = 7 * 1024 * 1024;
// Constant found in libmagic, used as the byte limit for regex tests.
const FILE_REGEX_MAX: usize = 8192;

/// MIME type string for generic binary content.
pub const DEFAULT_BIN_MIMETYPE: &str = "application/octet-stream";
/// MIME type string for generic text content.
pub const DEFAULT_TEXT_MIMETYPE: &str = "text/plain";

// strftime-style pattern (`YYYY-MM-DD HH:MM:SS`) — presumably used when
// rendering date values; usage is outside this view.
pub(crate) const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
186
/// Panics with the given `panic!` arguments, but only in builds where
/// `debug_assertions` are enabled.
///
/// In other builds the branch is never taken, so execution simply continues
/// after the macro invocation.
macro_rules! debug_panic {
    ($($arg:tt)*) => {
        // `cfg!` evaluates to a compile-time boolean, so the arguments are
        // still type-checked in every build
        if cfg!(debug_assertions) {
            panic!($($arg)*);
        }
    };
}
194
/// Reads exactly `size_of::<$ty>()` bytes from the reader and evaluates to the
/// raw byte array.
///
/// Uses `?`, so it must be expanded inside a function whose error type can be
/// built from the reader's `read_exact` error.
macro_rules! read {
    ($reader:expr, $ty:ty) => {{
        let mut raw = [0u8; std::mem::size_of::<$ty>()];
        $reader.read_exact(&mut raw)?;
        raw
    }};
}

/// Reads a `$ty` stored in little-endian byte order.
macro_rules! read_le {
    ($reader:expr, $ty:ty) => {{
        let raw = read!($reader, $ty);
        <$ty>::from_le_bytes(raw)
    }};
}

/// Reads a `$ty` stored in big-endian byte order.
macro_rules! read_be {
    ($reader:expr, $ty:ty) => {{
        let raw = read!($reader, $ty);
        <$ty>::from_be_bytes(raw)
    }};
}

/// Reads a 32-bit middle-endian value: two consecutive little-endian 16-bit
/// words, the first one being the most significant.
macro_rules! read_me {
    ($reader:expr) => {{
        let high = read_le!($reader, u16) as i32;
        let low = read_le!($reader, u16) as i32;
        (high << 16) | low
    }};
}
214
215#[inline(always)]
216fn read_octal_u64<R: Read + Seek>(haystack: &mut LazyCache<R>) -> Option<u64> {
217    let s = haystack
218        .read_while_or_limit(|b| matches!(b, b'0'..=b'7'), 22)
219        .map(|buf| str::from_utf8(buf))
220        .ok()?
221        .ok()?;
222
223    if !s.starts_with("0") {
224        return None;
225    }
226
227    u64::from_str_radix(s, 8).ok()
228}
229
/// Represents all possible errors that can occur during file type detection and processing.
///
/// The `#[error(...)]` attributes define the `Display` output of each variant.
/// Variants marked with `#[from]` get a `From` conversion from the wrapped
/// error type, so those errors can be propagated with `?`.
#[derive(Debug, Error)]
pub enum Error {
    /// A generic error with a custom message.
    #[error("{0}")]
    Msg(String),

    /// An error with a source location and a nested error.
    ///
    /// Fields are, in order: the source name, the line number and the wrapped error.
    #[error("source={0} line={1} error={2}")]
    Localized(String, usize, Box<Error>),

    /// Indicates a required rule was not found.
    #[error("missing rule: {0}")]
    MissingRule(String),

    /// Indicates the maximum recursion depth was reached.
    #[error("maximum recursion reached: {0}")]
    MaximumRecursion(usize),

    /// Wraps an I/O error.
    #[error("io: {0}")]
    Io(#[from] io::Error),

    /// Wraps a parsing error from the `pest` parser.
    ///
    /// The error is boxed, which keeps this variant pointer-sized.
    #[error("parser error: {0}")]
    Parse(#[from] Box<pest::error::Error<Rule>>),

    /// Wraps a formatting error from the `dyf` crate.
    #[error("formatting: {0}")]
    Format(#[from] dyf::Error),

    /// Wraps a regex-related error.
    #[error("regex: {0}")]
    Regex(#[from] regex::Error),

    /// Wraps a serialization error from `bincode`.
    #[error("{0}")]
    Serialize(#[from] bincode::error::EncodeError),

    /// Wraps a deserialization error from `bincode`.
    #[error("{0}")]
    Deserialize(#[from] bincode::error::DecodeError),
}
273
274impl Error {
275    #[inline]
276    fn parser<S: ToString>(msg: S, span: Span<'_>) -> Self {
277        Self::Parse(Box::new(pest::error::Error::new_from_span(
278            ErrorVariant::CustomError {
279                message: msg.to_string(),
280            },
281            span,
282        )))
283    }
284
285    fn msg<M: AsRef<str>>(msg: M) -> Self {
286        Self::Msg(msg.as_ref().into())
287    }
288
289    fn localized<S: AsRef<str>>(source: S, line: usize, err: Error) -> Self {
290        Self::Localized(source.as_ref().into(), line, err.into())
291    }
292
293    /// Unwraps the localized error
294    pub fn unwrap_localized(&self) -> &Self {
295        match self {
296            Self::Localized(_, _, e) => e,
297            _ => self,
298        }
299    }
300}
301
/// Message attached to a rule, either plain text or a format template.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Message {
    /// Plain text, emitted as is.
    String(String),
    /// Template that is rendered with the value matched by the test.
    Format {
        /// Printf-like conversion specifier of the template; `Message::format_with`
        /// compares it to `"c"` to decide whether a byte is printed as a character.
        printf_spec: String,
        /// The `dyf` format string used to render the message.
        fs: FormatString,
    },
}
310
311impl Display for Message {
312    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
313        match self {
314            Self::String(s) => write!(f, "{s}"),
315            Self::Format { printf_spec: _, fs } => write!(f, "{}", fs.to_string_lossy()),
316        }
317    }
318}
319
320impl Message {
321    fn to_string_lossy(&self) -> Cow<'_, str> {
322        match self {
323            Message::String(s) => Cow::Borrowed(s),
324            Message::Format { printf_spec: _, fs } => fs.to_string_lossy(),
325        }
326    }
327
328    #[inline(always)]
329    fn format_with(&self, mr: Option<&MatchRes>) -> Result<Cow<'_, str>, Error> {
330        match self {
331            Self::String(s) => Ok(Cow::Borrowed(s.as_str())),
332            Self::Format {
333                printf_spec: c_spec,
334                fs,
335            } => {
336                if let Some(mr) = mr {
337                    match mr {
338                        MatchRes::Float(_, _) | MatchRes::Bytes(_, _, _, _) => {
339                            Ok(Cow::Owned(dformat!(fs, mr)?))
340                        }
341                        MatchRes::Scalar(_, scalar) => {
342                            // we want to print a byte as char
343                            if c_spec.as_str() == "c" {
344                                match scalar {
345                                    Scalar::byte(b) => {
346                                        let b = (*b as u8) as char;
347                                        Ok(Cow::Owned(dformat!(fs, b)?))
348                                    }
349                                    Scalar::ubyte(b) => {
350                                        let b = *b as char;
351                                        Ok(Cow::Owned(dformat!(fs, b)?))
352                                    }
353                                    _ => Ok(Cow::Owned(dformat!(fs, mr)?)),
354                                }
355                            } else {
356                                Ok(Cow::Owned(dformat!(fs, mr)?))
357                            }
358                        }
359                    }
360                } else {
361                    Ok(fs.to_string_lossy())
362                }
363            }
364        }
365    }
366}
367
impl ScalarDataType {
    /// Reads one scalar of this data type from `from`.
    ///
    /// Each variant selects the width and byte order of the read. When
    /// `switch_endianness` is `true`, little-endian reads are decoded as
    /// big-endian and vice versa. Single-byte types and `guid` ignore the
    /// flag, and `offset` reads nothing: it reports the current stream position.
    ///
    /// # Errors
    /// Returns an error when the underlying read (or position query) fails,
    /// for instance when fewer bytes than required are available.
    #[inline(always)]
    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Scalar, Error> {
        // little-endian read, flipped to big-endian when requested
        macro_rules! _read_le {
            ($ty: ty) => {{
                if switch_endianness {
                    <$ty>::from_be_bytes(read!(from, $ty))
                } else {
                    <$ty>::from_le_bytes(read!(from, $ty))
                }
            }};
        }

        // big-endian read, flipped to little-endian when requested
        macro_rules! _read_be {
            ($ty: ty) => {{
                if switch_endianness {
                    <$ty>::from_le_bytes(read!(from, $ty))
                } else {
                    <$ty>::from_be_bytes(read!(from, $ty))
                }
            }};
        }

        // native-endian read: follows the byte order of the compilation target
        macro_rules! _read_ne {
            ($ty: ty) => {{
                if cfg!(target_endian = "big") {
                    _read_be!($ty)
                } else {
                    _read_le!($ty)
                }
            }};
        }

        // middle-endian read: two 16-bit words, the first one being the most significant
        macro_rules! _read_me {
            () => {
                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
            };
        }

        Ok(match self {
            // signed
            Self::byte => Scalar::byte(read!(from, u8)[0] as i8),
            Self::short => Scalar::short(_read_ne!(i16)),
            Self::long => Scalar::long(_read_ne!(i32)),
            Self::date => Scalar::date(_read_ne!(i32)),
            Self::ldate => Scalar::ldate(_read_ne!(i32)),
            Self::qwdate => Scalar::qwdate(_read_ne!(i64)),
            Self::leshort => Scalar::leshort(_read_le!(i16)),
            Self::lelong => Scalar::lelong(_read_le!(i32)),
            Self::lequad => Scalar::lequad(_read_le!(i64)),
            Self::bequad => Scalar::bequad(_read_be!(i64)),
            Self::belong => Scalar::belong(_read_be!(i32)),
            Self::bedate => Scalar::bedate(_read_be!(i32)),
            Self::beldate => Scalar::beldate(_read_be!(i32)),
            Self::beqdate => Scalar::beqdate(_read_be!(i64)),
            // unsigned
            // NOTE(review): signed types are also listed below this marker
            Self::ubyte => Scalar::ubyte(read!(from, u8)[0]),
            Self::ushort => Scalar::ushort(_read_ne!(u16)),
            Self::uleshort => Scalar::uleshort(_read_le!(u16)),
            Self::ulelong => Scalar::ulelong(_read_le!(u32)),
            Self::uledate => Scalar::uledate(_read_le!(u32)),
            Self::ulequad => Scalar::ulequad(_read_le!(u64)),
            // no bytes are consumed: the value is the current position in the stream
            Self::offset => Scalar::offset(from.stream_position()?),
            Self::ubequad => Scalar::ubequad(_read_be!(u64)),
            Self::medate => Scalar::medate(_read_me!()),
            Self::meldate => Scalar::meldate(_read_me!()),
            Self::melong => Scalar::melong(_read_me!()),
            Self::beshort => Scalar::beshort(_read_be!(i16)),
            Self::quad => Scalar::quad(_read_ne!(i64)),
            Self::uquad => Scalar::uquad(_read_ne!(u64)),
            Self::ledate => Scalar::ledate(_read_le!(i32)),
            Self::leldate => Scalar::leldate(_read_le!(i32)),
            Self::leqdate => Scalar::leqdate(_read_le!(i64)),
            Self::leqldate => Scalar::leqldate(_read_le!(i64)),
            Self::leqwdate => Scalar::leqwdate(_read_le!(i64)),
            Self::ubelong => Scalar::ubelong(_read_be!(u32)),
            Self::ulong => Scalar::ulong(_read_ne!(u32)),
            Self::ubeshort => Scalar::ubeshort(_read_be!(u16)),
            Self::ubeqdate => Scalar::ubeqdate(_read_be!(u64)),
            Self::lemsdosdate => Scalar::lemsdosdate(_read_le!(u16)),
            Self::lemsdostime => Scalar::lemsdostime(_read_le!(u16)),
            // always decoded as big-endian, regardless of `switch_endianness`
            Self::guid => Scalar::guid(u128::from_be_bytes(read!(from, u128))),
        })
    }
}
453
454impl FloatDataType {
455    #[inline(always)]
456    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Float, Error> {
457        macro_rules! _read_le {
458            ($ty: ty) => {{
459                if switch_endianness {
460                    <$ty>::from_be_bytes(read!(from, $ty))
461                } else {
462                    <$ty>::from_le_bytes(read!(from, $ty))
463                }
464            }};
465        }
466
467        macro_rules! _read_be {
468            ($ty: ty) => {{
469                if switch_endianness {
470                    <$ty>::from_le_bytes(read!(from, $ty))
471                } else {
472                    <$ty>::from_be_bytes(read!(from, $ty))
473                }
474            }};
475        }
476
477        macro_rules! _read_ne {
478            ($ty: ty) => {{
479                if cfg!(target_endian = "big") {
480                    _read_be!($ty)
481                } else {
482                    _read_le!($ty)
483                }
484            }};
485        }
486
487        macro_rules! _read_me {
488            () => {
489                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
490            };
491        }
492
493        Ok(match self {
494            Self::lefloat => Float::lefloat(_read_le!(f32)),
495            Self::befloat => Float::befloat(_read_le!(f32)),
496            Self::ledouble => Float::ledouble(_read_le!(f64)),
497            Self::bedouble => Float::bedouble(_read_be!(f64)),
498        })
499    }
500}
501
502#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
503enum Op {
504    Mul,
505    Add,
506    Sub,
507    Div,
508    Mod,
509    And,
510    Xor,
511    Or,
512}
513
514impl Display for Op {
515    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
516        match self {
517            Op::Mul => write!(f, "*"),
518            Op::Add => write!(f, "+"),
519            Op::Sub => write!(f, "-"),
520            Op::Div => write!(f, "/"),
521            Op::Mod => write!(f, "%"),
522            Op::And => write!(f, "&"),
523            Op::Or => write!(f, "|"),
524            Op::Xor => write!(f, "^"),
525        }
526    }
527}
528
529#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
530enum CmpOp {
531    Eq,
532    Lt,
533    Gt,
534    BitAnd,
535    Neq, // ! operator
536    Xor,
537    Not, // ~ operator
538}
539
540impl CmpOp {
541    #[inline(always)]
542    fn is_neq(&self) -> bool {
543        matches!(self, Self::Neq)
544    }
545}
546
547#[derive(Debug, Clone, Serialize, Deserialize)]
548struct ScalarTransform {
549    op: Op,
550    num: Scalar,
551}
552
553impl ScalarTransform {
554    fn apply(&self, s: Scalar) -> Option<Scalar> {
555        match self.op {
556            Op::Add => s.checked_add(self.num),
557            Op::Sub => s.checked_sub(self.num),
558            Op::Mul => s.checked_mul(self.num),
559            Op::Div => s.checked_div(self.num),
560            Op::Mod => s.checked_rem(self.num),
561            Op::And => Some(s.bitand(self.num)),
562            Op::Xor => Some(s.bitxor(self.num)),
563            Op::Or => Some(s.bitor(self.num)),
564        }
565    }
566}
567
568#[derive(Debug, Clone, Serialize, Deserialize)]
569struct FloatTransform {
570    op: Op,
571    num: Float,
572}
573
574impl FloatTransform {
575    fn apply(&self, s: Float) -> Float {
576        match self.op {
577            Op::Add => s.add(self.num),
578            Op::Sub => s.sub(self.num),
579            Op::Mul => s.mul(self.num),
580            // returns inf when div by 0
581            Op::Div => s.div(self.num),
582            // returns NaN when rem by 0
583            Op::Mod => s.rem(self.num),
584            // parser makes sure those operators cannot be used
585            Op::And | Op::Xor | Op::Or => {
586                debug_panic!("unsupported operation");
587                s
588            }
589        }
590    }
591}
592
593#[derive(Debug, Clone, Serialize, Deserialize)]
594enum TestValue<T> {
595    Value(T),
596    Any,
597}
598
599impl<T> TestValue<T> {
600    #[inline(always)]
601    fn as_ref(&self) -> TestValue<&T> {
602        match self {
603            Self::Value(v) => TestValue::Value(v),
604            Self::Any => TestValue::Any,
605        }
606    }
607}
608
// Modifier flags of regex tests, combined in a `FlagSet<ReMod>`.
// NOTE(review): only the `ForceBin`/`ForceText` handling is visible in this
// part of the file; the meaning of the other flags is inferred from their
// names and should be checked against the parser.
flags! {
    enum ReMod: u8{
        CaseInsensitive,
        StartOffsetUpdate,
        LineLimit,
        // makes the test binary (see `RegexTest::is_binary`)
        ForceBin,
        ForceText,
        TrimMatch,
    }
}
619
620fn serialize_regex<S>(re: &bytes::Regex, serializer: S) -> Result<S::Ok, S::Error>
621where
622    S: serde::Serializer,
623{
624    re.as_str().serialize(serializer)
625}
626
627fn deserialize_regex<'de, D>(deserializer: D) -> Result<bytes::Regex, D::Error>
628where
629    D: serde::Deserializer<'de>,
630{
631    let wrapper = String::deserialize(deserializer)?;
632    bytes::Regex::new(&wrapper).map_err(serde::de::Error::custom)
633}
634
/// A regex test.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct RegexTest {
    // compiled regex; (de)serialized through its pattern string
    #[serde(
        serialize_with = "serialize_regex",
        deserialize_with = "deserialize_regex"
    )]
    re: bytes::Regex,
    // optional limit; `match_buf` uses it as the maximum number of lines
    // scanned in text streams
    length: Option<usize>,
    mods: FlagSet<ReMod>,
    str_mods: FlagSet<StringMod>,
    // NOTE(review): presumably the length of the non-magic part of the
    // pattern — not used in this part of the file, confirm
    non_magic_len: usize,
    // whether the test is binary, before modifiers are considered
    binary: bool,
    // `Neq` turns the test into "the regex must not match"
    cmp_op: CmpOp,
}
649
650impl RegexTest {
651    #[inline(always)]
652    fn is_binary(&self) -> bool {
653        self.binary
654            || self.mods.contains(ReMod::ForceBin)
655            || self.str_mods.contains(StringMod::ForceBin)
656    }
657
658    fn match_buf<'buf>(
659        &self,
660        off_buf: u64, // absolute buffer offset in content
661        stream_kind: StreamKind,
662        buf: &'buf [u8],
663    ) -> Option<MatchRes<'buf>> {
664        let mr = match stream_kind {
665            StreamKind::Text(_) => {
666                let mut off_txt = off_buf;
667
668                let mut line_limit = self.length.unwrap_or(usize::MAX);
669
670                for line in buf.split(|c| c == &b'\n') {
671                    // we don't need to break on offset
672                    // limit as buf contains the good amount
673                    // of bytes to match against
674                    if line_limit == 0 {
675                        break;
676                    }
677
678                    if let Some(re_match) = self.re.find(line) {
679                        // the offset of the string is computed from the start of the buffer
680                        let start_offset = off_txt + re_match.start() as u64;
681
682                        // if we matched until EOL we need to add one to include the delimiter removed from the split
683                        let stop_offset = if re_match.end() == line.len() {
684                            Some(start_offset + re_match.as_bytes().len() as u64 + 1)
685                        } else {
686                            None
687                        };
688
689                        return Some(MatchRes::Bytes(
690                            start_offset,
691                            stop_offset,
692                            re_match.as_bytes(),
693                            Encoding::Utf8,
694                        ));
695                    }
696
697                    off_txt += line.len() as u64;
698                    // we have to add one because lines do not contain splitting character
699                    off_txt += 1;
700                    line_limit = line_limit.saturating_sub(1)
701                }
702                None
703            }
704
705            StreamKind::Binary => {
706                self.re.find(buf).map(|re_match| {
707                    MatchRes::Bytes(
708                        // the offset of the string is computed from the start of the buffer
709                        off_buf + re_match.start() as u64,
710                        None,
711                        re_match.as_bytes(),
712                        Encoding::Utf8,
713                    )
714                })
715            }
716        };
717
718        // handle the case where we want the regex not to match
719        if self.cmp_op.is_neq() && mr.is_none() {
720            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
721        }
722
723        mr
724    }
725}
726
727impl From<RegexTest> for Test {
728    fn from(value: RegexTest) -> Self {
729        Self::Regex(value)
730    }
731}
732
// Modifier flags of string tests, combined in a `FlagSet<StringMod>`.
// Comments below describe what `string_match` and the `is_binary`/`is_text`
// helpers do with each flag; uncommented flags are not handled in this
// part of the file.
flags! {
    enum StringMod: u8{
        // makes the test binary
        ForceBin,
        // upper case characters of the pattern match both cases in the target
        UpperInsensitive,
        // lower case characters of the pattern match both cases in the target
        LowerInsensitive,
        // the byte following the match, if any, must be an ASCII whitespace
        FullWordMatch,
        Trim,
        // marks the test as text
        ForceText,
        // a run of blanks in the pattern needs at least as many blanks in the target
        CompactWhitespace,
        // blanks are optional, in the pattern as well as in the target
        OptBlank,
    }
}
745
/// A string test.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StringTest {
    // bytes to compare against, or `Any`
    test_val: TestValue<Vec<u8>>,
    cmp_op: CmpOp,
    // NOTE(review): optional length attached to the test — its use is outside
    // this part of the file
    length: Option<usize>,
    mods: FlagSet<StringMod>,
    // whether the test is binary, before modifiers are considered
    binary: bool,
}
754
755impl From<StringTest> for Test {
756    fn from(value: StringTest) -> Self {
757        Self::String(value)
758    }
759}
760
761#[inline(always)]
762fn string_match(str: &[u8], mods: FlagSet<StringMod>, buf: &[u8]) -> (bool, usize) {
763    let mut consumed = 0;
764    // we can do a simple string comparison
765    if mods.is_disjoint(
766        StringMod::UpperInsensitive
767            | StringMod::LowerInsensitive
768            | StringMod::FullWordMatch
769            | StringMod::CompactWhitespace
770            | StringMod::OptBlank,
771    ) {
772        // we check if target contains
773        if buf.starts_with(str) {
774            (true, str.len())
775        } else {
776            (false, consumed)
777        }
778    } else {
779        let mut i_src = 0;
780        let mut iter = buf.iter().peekable();
781
782        macro_rules! consume_target {
783            () => {{
784                iter.next();
785                consumed += 1;
786            }};
787        }
788
789        macro_rules! continue_next_iteration {
790            () => {{
791                consume_target!();
792                i_src += 1;
793                continue;
794            }};
795        }
796
797        while let Some(&&b) = iter.peek() {
798            let Some(&ref_byte) = str.get(i_src) else {
799                break;
800            };
801
802            if mods.contains(StringMod::OptBlank) && (b == b' ' || ref_byte == b' ') {
803                if b == b' ' {
804                    // we ignore whitespace in target
805                    consume_target!();
806                }
807
808                if ref_byte == b' ' {
809                    // we ignore whitespace in test
810                    i_src += 1;
811                }
812
813                continue;
814            }
815
816            if mods.contains(StringMod::UpperInsensitive) {
817                //upper case characters in the magic match both lower and upper case characters in the target
818                if ref_byte.is_ascii_uppercase() && ref_byte == b.to_ascii_uppercase()
819                    || ref_byte == b
820                {
821                    continue_next_iteration!()
822                }
823            }
824
825            if mods.contains(StringMod::LowerInsensitive)
826                && (ref_byte.is_ascii_lowercase() && ref_byte == b.to_ascii_lowercase()
827                    || ref_byte == b)
828            {
829                continue_next_iteration!()
830            }
831
832            if mods.contains(StringMod::CompactWhitespace) && ref_byte == b' ' {
833                let mut src_blk = 0;
834                while let Some(b' ') = str.get(i_src) {
835                    src_blk += 1;
836                    i_src += 1;
837                }
838
839                let mut tgt_blk = 0;
840                while let Some(b' ') = iter.peek() {
841                    tgt_blk += 1;
842                    consume_target!();
843                }
844
845                if src_blk > tgt_blk {
846                    return (false, consumed);
847                }
848
849                continue;
850            }
851
852            if ref_byte == b {
853                continue_next_iteration!()
854            } else {
855                return (false, consumed);
856            }
857        }
858
859        if mods.contains(StringMod::FullWordMatch)
860            && let Some(b) = iter.peek()
861            && !b.is_ascii_whitespace()
862        {
863            return (false, consumed);
864        }
865
866        (consumed > 0 && consumed <= buf.len(), consumed)
867    }
868}
869
870impl StringTest {
871    fn has_length_mod(&self) -> bool {
872        !self.mods.is_disjoint(
873            StringMod::UpperInsensitive
874                | StringMod::LowerInsensitive
875                | StringMod::FullWordMatch
876                | StringMod::CompactWhitespace
877                | StringMod::OptBlank,
878        )
879    }
880
881    #[inline(always)]
882    fn test_value_len(&self) -> usize {
883        match self.test_val.as_ref() {
884            TestValue::Value(s) => s.len(),
885            TestValue::Any => 0,
886        }
887    }
888
889    #[inline(always)]
890    fn is_binary(&self) -> bool {
891        self.binary || self.mods.contains(StringMod::ForceBin)
892    }
893
894    #[inline(always)]
895    fn is_text(&self) -> bool {
896        self.mods.contains(StringMod::ForceText)
897    }
898}
899
/// A search test: looks for a byte string anywhere in a buffer.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SearchTest {
    // bytes to look for
    str: Vec<u8>,
    // optional limit: `match_buf` stops once the candidate position exceeds it
    n_pos: Option<usize>,
    str_mods: FlagSet<StringMod>,
    re_mods: FlagSet<ReMod>,
    // whether the test is binary, before modifiers are considered
    binary: bool,
    // `Neq` turns the test into "the string must not be found"
    cmp_op: CmpOp,
}
909
910impl From<SearchTest> for Test {
911    fn from(value: SearchTest) -> Self {
912        Self::Search(value)
913    }
914}
915
916impl SearchTest {
917    #[inline(always)]
918    fn is_binary(&self) -> bool {
919        (self.binary
920            || self.str_mods.contains(StringMod::ForceBin)
921            || self.re_mods.contains(ReMod::ForceBin))
922            && !(self.str_mods.contains(StringMod::ForceText)
923                || self.re_mods.contains(ReMod::ForceText))
924    }
925
926    // off_buf: absolute buffer offset in content
927    #[inline]
928    fn match_buf<'buf>(&self, off_buf: u64, buf: &'buf [u8]) -> Option<MatchRes<'buf>> {
929        let mut i = 0;
930
931        let needle = self.str.first()?;
932
933        while i < buf.len() {
934            // we cannot match if the first character isn't the same
935            // so we accelerate the search by finding potential matches
936            i += memchr(*needle, &buf[i..])?;
937
938            // if we want a full word match
939            if self.str_mods.contains(StringMod::FullWordMatch) {
940                let prev_is_whitespace = buf
941                    .get(i.saturating_sub(1))
942                    .map(|c| c.is_ascii_whitespace())
943                    .unwrap_or_default();
944
945                // if it is not the first character
946                // and its previous character isn't
947                // a whitespace. It cannot be a
948                // fullword match
949                if i > 0 && !prev_is_whitespace {
950                    i += 1;
951                    continue;
952                }
953            }
954
955            if let Some(npos) = self.n_pos
956                && i > npos
957            {
958                break;
959            }
960
961            let pos = i;
962            let (ok, consumed) = string_match(&self.str, self.str_mods, &buf[i..]);
963
964            if ok {
965                return Some(MatchRes::Bytes(
966                    off_buf.saturating_add(pos as u64),
967                    None,
968                    &buf[i..i + consumed],
969                    Encoding::Utf8,
970                ));
971            } else {
972                i += max(consumed, 1)
973            }
974        }
975
976        // handles the case where we want the string not to be found
977        if self.cmp_op.is_neq() {
978            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
979        }
980
981        None
982    }
983}
984
/// A test on a scalar value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTest {
    // data type deciding how the value is read (see `ScalarDataType::read`)
    ty: ScalarDataType,
    // optional operation attached to the test (see `ScalarTransform::apply`)
    transform: Option<ScalarTransform>,
    cmp_op: CmpOp,
    // value to compare against, or `Any`
    test_val: TestValue<Scalar>,
}
992
// test working on a floating point value read from the haystack
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTest {
    // type of the float to read
    ty: FloatDataType,
    // optional transformation applied to the value read, before comparing it
    transform: Option<FloatTransform>,
    // operator used to compare the value read with the test value
    cmp_op: CmpOp,
    // value to compare to, TestValue::Any matches whatever has been read
    test_val: TestValue<Float>,
}
1000
// the value read from the haystack we want to match against
// 'buf is the lifetime of the buffer we are scanning
//
// the first field of every variant is the position, in the
// haystack, at which the value has been read
#[derive(Debug, PartialEq)]
enum ReadValue<'buf> {
    // a floating point value
    Float(u64, Float),
    // a scalar value
    Scalar(u64, Scalar),
    // raw bytes borrowed from the buffer
    Bytes(u64, &'buf [u8]),
}
1009
1010impl DynDisplay for ReadValue<'_> {
1011    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1012        match self {
1013            Self::Float(_, s) => DynDisplay::dyn_fmt(s, f),
1014            Self::Scalar(_, s) => DynDisplay::dyn_fmt(s, f),
1015            Self::Bytes(_, b) => Ok(format!("{b:?}")),
1016        }
1017    }
1018}
1019
1020impl DynDisplay for &ReadValue<'_> {
1021    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1022        // Dereference self to get the TestValue and call its fmt method
1023        DynDisplay::dyn_fmt(*self, f)
1024    }
1025}
1026
1027impl Display for ReadValue<'_> {
1028    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1029        match self {
1030            Self::Float(_, v) => write!(f, "{v}"),
1031            Self::Scalar(_, s) => write!(f, "{s}"),
1032            Self::Bytes(_, b) => write!(f, "{b:?}"),
1033        }
1034    }
1035}
1036
// encoding of the bytes carried by a MatchRes::Bytes, it tells
// how to turn the bytes into a string when formatting a match
enum Encoding {
    // bytes are UTF-16 code units stored with the given byte order
    Utf16(String16Encoding),
    // bytes are decoded as UTF-8 (lossy)
    Utf8,
}
1041
// Carry the offset of the start of the data in the stream
// and the data itself
enum MatchRes<'buf> {
    // Bytes.0: offset of the match
    // Bytes.1: optional end of match (to address the need of EOL adjustment in string regex)
    // Bytes.2: the bytes matching
    // Bytes.3: encoding of the buffer
    Bytes(u64, Option<u64>, &'buf [u8], Encoding),
    // Scalar.0: offset of the match
    // Scalar.1: the scalar value matching
    Scalar(u64, Scalar),
    // Float.0: offset of the match
    // Float.1: the float value matching
    Float(u64, Float),
}
1053
1054impl DynDisplay for &MatchRes<'_> {
1055    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1056        (*self).dyn_fmt(f)
1057    }
1058}
1059
1060impl DynDisplay for MatchRes<'_> {
1061    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1062        match self {
1063            Self::Scalar(_, v) => v.dyn_fmt(f),
1064            Self::Float(_, v) => v.dyn_fmt(f),
1065            Self::Bytes(_, _, v, enc) => match enc {
1066                Encoding::Utf8 => String::from_utf8_lossy(v).to_string().dyn_fmt(f),
1067                Encoding::Utf16(enc) => {
1068                    let utf16: Vec<u16> = slice_to_utf16_iter(v, *enc).collect();
1069                    String::from_utf16_lossy(&utf16).dyn_fmt(f)
1070                }
1071            },
1072        }
1073    }
1074}
1075
1076impl MatchRes<'_> {
1077    // start offset of the match
1078    #[inline]
1079    fn start_offset(&self) -> u64 {
1080        match self {
1081            MatchRes::Bytes(o, _, _, _) => *o,
1082            MatchRes::Scalar(o, _) => *o,
1083            MatchRes::Float(o, _) => *o,
1084        }
1085    }
1086
1087    // start offset of the match
1088    #[inline]
1089    fn end_offset(&self) -> u64 {
1090        match self {
1091            MatchRes::Bytes(start, end, buf, _) => match end {
1092                Some(end) => *end,
1093                None => start.saturating_add(buf.len() as u64),
1094            },
1095            MatchRes::Scalar(o, sc) => o.add(sc.size_of() as u64),
1096            MatchRes::Float(o, f) => o.add(f.size_of() as u64),
1097        }
1098    }
1099}
1100
1101fn slice_to_utf16_iter(read: &[u8], encoding: String16Encoding) -> impl Iterator<Item = u16> {
1102    let even = read
1103        .iter()
1104        .enumerate()
1105        .filter(|(i, _)| i % 2 == 0)
1106        .map(|t| t.1);
1107
1108    let odd = read
1109        .iter()
1110        .enumerate()
1111        .filter(|(i, _)| i % 2 != 0)
1112        .map(|t| t.1);
1113
1114    even.zip(odd).map(move |(e, o)| match encoding {
1115        String16Encoding::Le => u16::from_le_bytes([*e, *o]),
1116        String16Encoding::Be => u16::from_be_bytes([*e, *o]),
1117    })
1118}
1119
// byte order of the UTF-16 code units of a string16
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum String16Encoding {
    // little endian
    Le,
    // big endian
    Be,
}
1125
// test matching a UTF-16 string
#[derive(Debug, Clone, Serialize, Deserialize)]
struct String16Test {
    // string whose bytes are reported as the matching bytes when a
    // specific value is matched
    // NOTE(review): presumably the test value as written in the rule — confirm
    orig: String,
    // UTF-16 code units to match, TestValue::Any matches any string
    test_val: TestValue<Vec<u16>>,
    // byte order used to decode the bytes read from the haystack
    encoding: String16Encoding,
}
1132
1133impl String16Test {
1134    /// if the test value is a specific value this method returns
1135    /// the number of utf16 characters. To obtain the length in
1136    /// bytes the return value needs to be multiplied by two.
1137    #[inline(always)]
1138    fn test_value_len(&self) -> usize {
1139        match self.test_val.as_ref() {
1140            TestValue::Value(str16) => str16.len(),
1141            TestValue::Any => 0,
1142        }
1143    }
1144}
1145
// modifiers supported by the indirect test
// NOTE(review): Relative presumably makes the indirect offset relative — confirm
flags! {
    enum IndirectMod: u8{
        Relative,
    }
}

// set of IndirectMod
type IndirectMods = FlagSet<IndirectMod>;
1153
// type of the length prefix of a pstring
// NOTE(review): the trailing letters are presumably the matching
// modifiers of the magic rule format — confirm
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum PStringLen {
    // one byte length
    Byte,    // B
    // two bytes length, big endian
    ShortBe, // H
    // two bytes length, little endian
    ShortLe, // h
    // four bytes length, big endian
    LongBe,  // L
    // four bytes length, little endian
    LongLe,  // l
}
1162
1163impl PStringLen {
1164    #[inline(always)]
1165    const fn size_of_len(&self) -> usize {
1166        match self {
1167            PStringLen::Byte => 1,
1168            PStringLen::ShortBe => 2,
1169            PStringLen::ShortLe => 2,
1170            PStringLen::LongBe => 4,
1171            PStringLen::LongLe => 4,
1172        }
1173    }
1174}
1175
// test matching a string prefixed by its length
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PStringTest {
    // type of the length prefix
    len: PStringLen,
    // bytes to match, TestValue::Any matches any string
    test_val: TestValue<Vec<u8>>,
    // when set, the length read accounts for the prefix itself, so the
    // size of the prefix is subtracted to get the length of the string
    include_len: bool,
}
1182
1183impl PStringTest {
1184    #[inline]
1185    fn read<'cache, R: Read + Seek>(
1186        &self,
1187        haystack: &'cache mut LazyCache<R>,
1188    ) -> Result<Option<&'cache [u8]>, Error> {
1189        let mut len = match self.len {
1190            PStringLen::Byte => read_le!(haystack, u8) as u32,
1191            PStringLen::ShortBe => read_be!(haystack, u16) as u32,
1192            PStringLen::ShortLe => read_le!(haystack, u16) as u32,
1193            PStringLen::LongBe => read_be!(haystack, u32),
1194            PStringLen::LongLe => read_le!(haystack, u32),
1195        } as usize;
1196
1197        if self.include_len {
1198            len = len.saturating_sub(self.len.size_of_len())
1199        }
1200
1201        if let TestValue::Value(s) = self.test_val.as_ref()
1202            && len != s.len()
1203        {
1204            return Ok(None);
1205        }
1206
1207        let read = haystack.read_exact_count(len as u64)?;
1208
1209        Ok(Some(read))
1210    }
1211
1212    #[inline(always)]
1213    fn test_value_len(&self) -> usize {
1214        match self.test_val.as_ref() {
1215            TestValue::Value(s) => s.len(),
1216            TestValue::Any => 0,
1217        }
1218    }
1219}
1220
// all the kinds of tests supported
//
// Scalar, Float, String, Search, PString, Regex and String16 read a
// value from the haystack and match it, the other variants have no
// value to read
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Test {
    // NOTE(review): presumably declares a named rule — confirm
    Name(String),
    // NOTE(review): presumably calls a named rule, meaning of the
    // boolean isn't visible from here — confirm
    Use(bool, String),
    // match a scalar value
    Scalar(ScalarTest),
    // match a floating point value
    Float(FloatTest),
    // match a string at the current offset
    String(StringTest),
    // search a string in a buffer
    Search(SearchTest),
    // match a string prefixed by its length
    PString(PStringTest),
    // match a regular expression against a buffer
    Regex(RegexTest),
    Indirect(FlagSet<IndirectMod>),
    // match a UTF-16 string
    String16(String16Test),
    // FIXME: placeholder for strength computation
    #[allow(dead_code)]
    Der,
    Clear,
    Default,
}
1239
1240impl Test {
1241    // read the value to test from the haystack
1242    #[inline]
1243    fn read_test_value<'haystack, R: Read + Seek>(
1244        &self,
1245        haystack: &'haystack mut LazyCache<R>,
1246        switch_endianness: bool,
1247    ) -> Result<Option<ReadValue<'haystack>>, Error> {
1248        let test_value_offset = haystack.lazy_stream_position();
1249
1250        match self {
1251            Self::Scalar(t) => {
1252                t.ty.read(haystack, switch_endianness)
1253                    .map(|s| Some(ReadValue::Scalar(test_value_offset, s)))
1254            }
1255
1256            Self::Float(t) => {
1257                t.ty.read(haystack, switch_endianness)
1258                    .map(|f| Some(ReadValue::Float(test_value_offset, f)))
1259            }
1260            Self::String(t) => {
1261                match t.test_val.as_ref() {
1262                    TestValue::Value(str) => {
1263                        let buf = if let Some(length) = t.length {
1264                            // if there is a length specified
1265                            haystack.read_exact_count(length as u64)?
1266                        } else {
1267                            // no length specified we read until end of string
1268
1269                            match t.cmp_op {
1270                                CmpOp::Eq | CmpOp::Neq => {
1271                                    if !t.has_length_mod() {
1272                                        haystack.read_exact_count(str.len() as u64)?
1273                                    } else {
1274                                        haystack.read_count(FILE_BYTES_MAX as u64)?
1275                                    }
1276                                }
1277                                CmpOp::Lt | CmpOp::Gt => {
1278                                    let read =
1279                                        haystack.read_until_any_delim_or_limit(b"\n\0", 8092)?;
1280
1281                                    if read.ends_with(b"\0") || read.ends_with(b"\n") {
1282                                        &read[..read.len() - 1]
1283                                    } else {
1284                                        read
1285                                    }
1286                                }
1287                                _ => {
1288                                    return Err(Error::Msg(format!(
1289                                        "string test does not support {:?} operator",
1290                                        t.cmp_op
1291                                    )));
1292                                }
1293                            }
1294                        };
1295
1296                        Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1297                    }
1298                    TestValue::Any => {
1299                        let read = haystack.read_until_any_delim_or_limit(b"\0\n", 8192)?;
1300                        // we don't take last byte if it matches end of string
1301                        let bytes = if read.ends_with(b"\0") || read.ends_with(b"\n") {
1302                            &read[..read.len() - 1]
1303                        } else {
1304                            read
1305                        };
1306
1307                        Ok(Some(ReadValue::Bytes(test_value_offset, bytes)))
1308                    }
1309                }
1310            }
1311
1312            Self::String16(t) => {
1313                match t.test_val.as_ref() {
1314                    TestValue::Value(str16) => {
1315                        let read = haystack.read_exact_count((str16.len() * 2) as u64)?;
1316
1317                        Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1318                    }
1319                    TestValue::Any => {
1320                        let read = haystack.read_until_utf16_or_limit(b"\x00\x00", 8192)?;
1321
1322                        // we make sure we have an even number of elements
1323                        let end = if read.len() % 2 == 0 {
1324                            read.len()
1325                        } else {
1326                            // we decide to read anyway even though
1327                            // length isn't even
1328                            read.len().saturating_sub(1)
1329                        };
1330
1331                        Ok(Some(ReadValue::Bytes(test_value_offset, &read[..end])))
1332                    }
1333                }
1334            }
1335
1336            Self::PString(t) => {
1337                let Some(read) = t.read(haystack)? else {
1338                    return Ok(None);
1339                };
1340                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1341            }
1342
1343            Self::Search(_) => {
1344                let buf = haystack.read_count(FILE_BYTES_MAX as u64)?;
1345                Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1346            }
1347
1348            Self::Regex(r) => {
1349                let length = {
1350                    match r.length {
1351                        Some(len) => {
1352                            if r.mods.contains(ReMod::LineLimit) {
1353                                len * 80
1354                            } else {
1355                                len
1356                            }
1357                        }
1358
1359                        None => FILE_REGEX_MAX,
1360                    }
1361                };
1362
1363                let read = haystack.read_count(length as u64)?;
1364                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1365            }
1366
1367            Self::Name(_)
1368            | Self::Use(_, _)
1369            | Self::Indirect(_)
1370            | Self::Clear
1371            | Self::Default
1372            | Self::Der => Err(Error::msg("no value to read for this test")),
1373        }
1374    }
1375
1376    #[inline(always)]
1377    fn match_value<'s>(
1378        &'s self,
1379        tv: &ReadValue<'s>,
1380        stream_kind: StreamKind,
1381    ) -> Option<MatchRes<'s>> {
1382        match (self, tv) {
1383            (Self::Scalar(t), ReadValue::Scalar(o, ts)) => {
1384                let read_value: Scalar = match t.transform.as_ref() {
1385                    Some(t) => t.apply(*ts)?,
1386                    None => *ts,
1387                };
1388
1389                match t.test_val {
1390                    TestValue::Value(test_value) => {
1391                        let ok = match t.cmp_op {
1392                            // NOTE: this should not happen in practice because
1393                            // we convert it into Eq equivalent at parsing time
1394                            CmpOp::Not => read_value == !test_value,
1395                            CmpOp::Eq => read_value == test_value,
1396                            CmpOp::Lt => read_value < test_value,
1397                            CmpOp::Gt => read_value > test_value,
1398                            CmpOp::Neq => read_value != test_value,
1399                            CmpOp::BitAnd => read_value & test_value == test_value,
1400                            CmpOp::Xor => (read_value & test_value).is_zero(),
1401                        };
1402
1403                        if ok {
1404                            Some(MatchRes::Scalar(*o, read_value))
1405                        } else {
1406                            None
1407                        }
1408                    }
1409
1410                    TestValue::Any => Some(MatchRes::Scalar(*o, read_value)),
1411                }
1412            }
1413
1414            (Self::Float(t), ReadValue::Float(o, f)) => {
1415                let read_value: Float = t.transform.as_ref().map(|t| t.apply(*f)).unwrap_or(*f);
1416
1417                match t.test_val {
1418                    TestValue::Value(tf) => {
1419                        let ok = match t.cmp_op {
1420                            CmpOp::Eq => read_value == tf,
1421                            CmpOp::Lt => read_value < tf,
1422                            CmpOp::Gt => read_value > tf,
1423                            CmpOp::Neq => read_value != tf,
1424                            _ => {
1425                                // this should never be reached as we validate
1426                                // operator in parser
1427                                debug_panic!("unsupported float comparison");
1428                                debug!("unsupported float comparison");
1429                                false
1430                            }
1431                        };
1432
1433                        if ok {
1434                            Some(MatchRes::Float(*o, read_value))
1435                        } else {
1436                            None
1437                        }
1438                    }
1439                    TestValue::Any => Some(MatchRes::Float(*o, read_value)),
1440                }
1441            }
1442
1443            (Self::String(st), ReadValue::Bytes(o, buf)) => {
1444                macro_rules! trim_buf {
1445                    ($buf: expr) => {{
1446                        if st.mods.contains(StringMod::Trim) {
1447                            $buf.trim_ascii()
1448                        } else {
1449                            $buf
1450                        }
1451                    }};
1452                }
1453
1454                match st.test_val.as_ref() {
1455                    TestValue::Value(str) => {
1456                        match st.cmp_op {
1457                            CmpOp::Eq => {
1458                                if let (true, _) = string_match(str, st.mods, buf) {
1459                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1460                                } else {
1461                                    None
1462                                }
1463                            }
1464                            CmpOp::Neq => {
1465                                if let (false, _) = string_match(str, st.mods, buf) {
1466                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1467                                } else {
1468                                    None
1469                                }
1470                            }
1471                            CmpOp::Gt => {
1472                                if buf.len() > str.len() {
1473                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1474                                } else {
1475                                    None
1476                                }
1477                            }
1478                            CmpOp::Lt => {
1479                                if buf.len() < str.len() {
1480                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1481                                } else {
1482                                    None
1483                                }
1484                            }
1485
1486                            // unsupported for strings
1487                            _ => {
1488                                // this should never be reached as we validate
1489                                // operator in parser
1490                                debug_panic!("unsupported string comparison");
1491                                debug!("unsupported string comparison");
1492                                None
1493                            }
1494                        }
1495                    }
1496                    TestValue::Any => {
1497                        Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1498                    }
1499                }
1500            }
1501
1502            (Self::PString(m), ReadValue::Bytes(o, buf)) => match m.test_val.as_ref() {
1503                TestValue::Value(psv) => {
1504                    if buf == psv {
1505                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8))
1506                    } else {
1507                        None
1508                    }
1509                }
1510                TestValue::Any => Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8)),
1511            },
1512
1513            (Self::String16(t), ReadValue::Bytes(o, buf)) => {
1514                match t.test_val.as_ref() {
1515                    TestValue::Value(str16) => {
1516                        // strings cannot be equal
1517                        if str16.len() * 2 != buf.len() {
1518                            return None;
1519                        }
1520
1521                        // we check string equality
1522                        for (i, utf16_char) in slice_to_utf16_iter(buf, t.encoding).enumerate() {
1523                            if str16[i] != utf16_char {
1524                                return None;
1525                            }
1526                        }
1527
1528                        Some(MatchRes::Bytes(
1529                            *o,
1530                            None,
1531                            t.orig.as_bytes(),
1532                            Encoding::Utf16(t.encoding),
1533                        ))
1534                    }
1535
1536                    TestValue::Any => {
1537                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf16(t.encoding)))
1538                    }
1539                }
1540            }
1541
1542            (Self::Regex(r), ReadValue::Bytes(o, buf)) => r.match_buf(*o, stream_kind, buf),
1543
1544            (Self::Search(t), ReadValue::Bytes(o, buf)) => t.match_buf(*o, buf),
1545
1546            _ => None,
1547        }
1548    }
1549
1550    #[inline(always)]
1551    fn strength(&self) -> u64 {
1552        const MULT: usize = 10;
1553
1554        let mut out = 2 * MULT;
1555
1556        // FIXME: octal is missing but it is not used in practice ...
1557        match self {
1558            Test::Scalar(s) => {
1559                out += s.ty.type_size() * MULT;
1560            }
1561
1562            Test::Float(t) => {
1563                out += t.ty.type_size() * MULT;
1564            }
1565
1566            Test::String(t) => out += t.test_value_len().saturating_mul(MULT),
1567
1568            Test::PString(t) => out += t.test_value_len().saturating_mul(MULT),
1569
1570            Test::Search(s) => {
1571                // NOTE: this implementation deviates from what is in
1572                // C libmagic. The purpose of this implementation is to
1573                // minimize the difference between similar tests,
1574                // implemented differently (ex: string test VS very localized search test).
1575                let n_pos = s.n_pos.unwrap_or(FILE_BYTES_MAX);
1576
1577                match n_pos {
1578                    // a search on one line should be equivalent to a string match
1579                    0..=80 => out += s.str.len().saturating_mul(MULT),
1580                    // search on the first 3 lines gets a little penalty
1581                    81..=240 => out += s.str.len() * s.str.len().clamp(0, MULT - 2),
1582                    // a search on more than 3 lines isn't considered very accurate
1583                    _ => out += s.str.len(),
1584                }
1585            }
1586
1587            Test::Regex(r) => {
1588                // NOTE: this implementation deviates from what is in
1589                // C libmagic. The purpose of this implementation is to
1590                // minimize the difference between similar tests,
1591                // implemented differently (ex: string test VS very localized regex test).
1592
1593                // we divide length by the number of capture group
1594                // which gives us a value close to he average string
1595                // length match in the regex.
1596                let v = r.non_magic_len / r.re.captures_len();
1597
1598                let len = r
1599                    .length
1600                    .map(|l| {
1601                        if r.mods.contains(ReMod::LineLimit) {
1602                            l * 80
1603                        } else {
1604                            l
1605                        }
1606                    })
1607                    .unwrap_or(FILE_BYTES_MAX);
1608
1609                match len {
1610                    // a search on one line should be equivalent to a string match
1611                    0..=80 => out += v.saturating_mul(MULT),
1612                    // search on the first 3 lines gets a little penalty
1613                    81..=240 => out += v * v.clamp(0, MULT - 2),
1614                    // a search on more than 3 lines isn't considered very accurate
1615                    _ => out += v,
1616                }
1617            }
1618
1619            Test::String16(t) => {
1620                // NOTE: in libmagic the result is div by 2
1621                // but I GUESS it is because the len is expressed
1622                // in number bytes. In our case length is expressed
1623                // in number of u16 so we shouldn't divide.
1624                out += t.test_value_len().saturating_mul(MULT);
1625            }
1626
1627            Test::Der => out += MULT,
1628
1629            Test::Default | Test::Name(_) | Test::Use(_, _) | Test::Indirect(_) | Test::Clear => {
1630                return 0;
1631            }
1632        }
1633
1634        // matching any output gets penalty
1635        if self.is_match_any() {
1636            return 0;
1637        }
1638
1639        if let Some(op) = self.cmp_op() {
1640            match op {
1641                // matching almost any gets penalty
1642                CmpOp::Neq => out = 0,
1643                CmpOp::Eq | CmpOp::Not => out += MULT,
1644                CmpOp::Lt | CmpOp::Gt => out -= 2 * MULT,
1645                CmpOp::Xor | CmpOp::BitAnd => out -= MULT,
1646            }
1647        }
1648
1649        out as u64
1650    }
1651
1652    #[inline(always)]
1653    fn cmp_op(&self) -> Option<CmpOp> {
1654        match self {
1655            Self::String(t) => Some(t.cmp_op),
1656            Self::Scalar(s) => Some(s.cmp_op),
1657            Self::Float(t) => Some(t.cmp_op),
1658            Self::Name(_)
1659            | Self::Use(_, _)
1660            | Self::Search(_)
1661            | Self::PString(_)
1662            | Self::Regex(_)
1663            | Self::Clear
1664            | Self::Default
1665            | Self::Indirect(_)
1666            | Self::String16(_)
1667            | Self::Der => None,
1668        }
1669    }
1670
1671    #[inline(always)]
1672    fn is_match_any(&self) -> bool {
1673        match self {
1674            Test::Name(_) => false,
1675            Test::Use(_, _) => false,
1676            Test::Scalar(scalar_test) => matches!(scalar_test.test_val, TestValue::Any),
1677            Test::Float(float_test) => matches!(float_test.test_val, TestValue::Any),
1678            Test::String(string_test) => matches!(string_test.test_val, TestValue::Any),
1679            Test::Search(_) => false,
1680            Test::PString(pstring_test) => matches!(pstring_test.test_val, TestValue::Any),
1681            Test::Regex(_) => false,
1682            Test::Indirect(_) => false,
1683            Test::String16(string16_test) => matches!(string16_test.test_val, TestValue::Any),
1684            Test::Der => false,
1685            Test::Clear => false,
1686            Test::Default => false,
1687        }
1688    }
1689
1690    #[inline(always)]
1691    fn is_binary(&self) -> bool {
1692        match self {
1693            Self::Name(_) => true,
1694            Self::Use(_, _) => true,
1695            Self::Scalar(_) => true,
1696            Self::Float(_) => true,
1697            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1698            Self::Search(t) => t.is_binary(),
1699            Self::PString(_) => true,
1700            Self::Regex(t) => t.is_binary(),
1701            Self::Clear => true,
1702            Self::Default => true,
1703            Self::Indirect(_) => true,
1704            Self::String16(_) => true,
1705            Self::Der => true,
1706        }
1707    }
1708
1709    #[inline(always)]
1710    fn is_text(&self) -> bool {
1711        match self {
1712            Self::Name(_) => true,
1713            Self::Use(_, _) => true,
1714            Self::Indirect(_) => true,
1715            Self::Clear => true,
1716            Self::Default => true,
1717            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1718            _ => !self.is_binary(),
1719        }
1720    }
1721
1722    #[inline(always)]
1723    fn is_only_text(&self) -> bool {
1724        self.is_text() && !self.is_binary()
1725    }
1726
1727    #[inline(always)]
1728    fn is_only_binary(&self) -> bool {
1729        self.is_binary() && !self.is_text()
1730    }
1731}
1732
// type of the value to read, from the haystack, to get an indirect offset
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum OffsetType {
    // 8 bits integer
    Byte,
    // 64 bits float, little endian
    DoubleLe,
    // 64 bits float, big endian
    DoubleBe,
    // 16 bits integer, little endian
    ShortLe,
    // 16 bits integer, big endian
    ShortBe,
    // 32 bits little endian value going through decode_id3
    Id3Le,
    // 32 bits big endian value going through decode_id3
    Id3Be,
    // 32 bits integer, little endian
    LongLe,
    // 32 bits integer, big endian
    LongBe,
    // value read with read_me!
    // NOTE(review): presumably a middle endian value — confirm
    Middle,
    // value read with read_octal_u64
    Octal,
    // 64 bits integer, big endian
    QuadBe,
    // 64 bits integer, little endian
    QuadLe,
}
1749
// shift of an indirect offset (see IndOffset::shift)
// NOTE(review): the code using it is outside of this chunk. Presumably
// Direct is an immediate value while Indirect is the location of the
// value to use — confirm
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Shift {
    Direct(u64),
    Indirect(i64),
}
1755
// an offset which has to be read from the haystack
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
struct IndOffset {
    // where to find the offset
    off_addr: DirOffset,
    // signed or unsigned
    signed: bool,
    // type of the offset
    ty: OffsetType,
    // NOTE(review): presumably the operation combining the value
    // read with the shift — confirm
    op: Option<Op>,
    // NOTE(review): presumably the operand of op — confirm
    shift: Option<Shift>,
}
1767
1768impl IndOffset {
1769    // if we overflow we must not return an offset
1770    fn read_offset<R: Read + Seek>(
1771        &self,
1772        haystack: &mut LazyCache<R>,
1773        rule_base_offset: Option<u64>,
1774        last_upper_match_offset: Option<u64>,
1775    ) -> Result<Option<u64>, io::Error> {
1776        let offset_address = match self.off_addr {
1777            DirOffset::Start(s) => {
1778                let Some(o) = s.checked_add(rule_base_offset.unwrap_or_default()) else {
1779                    return Ok(None);
1780                };
1781
1782                haystack.seek(SeekFrom::Start(o))?
1783            }
1784            DirOffset::LastUpper(c) => haystack.seek(SeekFrom::Start(
1785                (last_upper_match_offset.unwrap_or_default() as i64 + c) as u64,
1786            ))?,
1787            DirOffset::End(e) => haystack.seek(SeekFrom::End(e))?,
1788        };
1789
1790        macro_rules! read_value {
1791            () => {
1792                match self.ty {
1793                    OffsetType::Byte => {
1794                        if self.signed {
1795                            read_le!(haystack, u8) as u64
1796                        } else {
1797                            read_le!(haystack, i8) as u64
1798                        }
1799                    }
1800                    OffsetType::DoubleLe => read_le!(haystack, f64) as u64,
1801                    OffsetType::DoubleBe => read_be!(haystack, f64) as u64,
1802                    OffsetType::ShortLe => {
1803                        if self.signed {
1804                            read_le!(haystack, i16) as u64
1805                        } else {
1806                            read_le!(haystack, u16) as u64
1807                        }
1808                    }
1809                    OffsetType::ShortBe => {
1810                        if self.signed {
1811                            read_be!(haystack, i16) as u64
1812                        } else {
1813                            read_be!(haystack, u16) as u64
1814                        }
1815                    }
1816                    OffsetType::Id3Le => decode_id3(read_le!(haystack, u32)) as u64,
1817                    OffsetType::Id3Be => decode_id3(read_be!(haystack, u32)) as u64,
1818                    OffsetType::LongLe => {
1819                        if self.signed {
1820                            read_le!(haystack, i32) as u64
1821                        } else {
1822                            read_le!(haystack, u32) as u64
1823                        }
1824                    }
1825                    OffsetType::LongBe => {
1826                        if self.signed {
1827                            read_be!(haystack, i32) as u64
1828                        } else {
1829                            read_be!(haystack, u32) as u64
1830                        }
1831                    }
1832                    OffsetType::Middle => read_me!(haystack) as u64,
1833                    OffsetType::Octal => {
1834                        if let Some(o) = read_octal_u64(haystack) {
1835                            o
1836                        } else {
1837                            debug!("failed to read octal offset @ {offset_address}");
1838                            return Ok(None);
1839                        }
1840                    }
1841                    OffsetType::QuadLe => {
1842                        if self.signed {
1843                            read_le!(haystack, i64) as u64
1844                        } else {
1845                            read_le!(haystack, u64)
1846                        }
1847                    }
1848                    OffsetType::QuadBe => {
1849                        if self.signed {
1850                            read_be!(haystack, i64) as u64
1851                        } else {
1852                            read_be!(haystack, u64)
1853                        }
1854                    }
1855                }
1856            };
1857        }
1858
1859        // in theory every offset read should end up in something seekable from start, so we can use u64 to store the result
1860        let o = read_value!();
1861
1862        trace!(
1863            "offset read @ {offset_address} value={o} op={:?} shift={:?}",
1864            self.op, self.shift
1865        );
1866
1867        // apply transformation
1868        if let (Some(op), Some(shift)) = (self.op, self.shift) {
1869            let shift = match shift {
1870                Shift::Direct(i) => i,
1871                Shift::Indirect(i) => {
1872                    let tmp = offset_address as i128 + i as i128;
1873                    if tmp.is_negative() {
1874                        return Ok(None);
1875                    } else {
1876                        haystack.seek(SeekFrom::Start(tmp as u64))?;
1877                    };
1878                    // NOTE: here we assume that the shift has the same
1879                    // type as the main offset !
1880                    read_value!()
1881                }
1882            };
1883
1884            match op {
1885                Op::Add => return Ok(o.checked_add(shift)),
1886                Op::Mul => return Ok(o.checked_mul(shift)),
1887                Op::Sub => return Ok(o.checked_sub(shift)),
1888                Op::Div => return Ok(o.checked_div(shift)),
1889                Op::Mod => return Ok(o.checked_rem(shift)),
1890                Op::And => return Ok(Some(o & shift)),
1891                Op::Or => return Ok(Some(o | shift)),
1892                Op::Xor => return Ok(Some(o ^ shift)),
1893            }
1894        }
1895
1896        Ok(Some(o))
1897    }
1898}
1899
1900#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
1901enum DirOffset {
1902    Start(u64),
1903    // relative to the last up-level field
1904    LastUpper(i64),
1905    End(i64),
1906}
1907
1908#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
1909enum Offset {
1910    Direct(DirOffset),
1911    Indirect(IndOffset),
1912}
1913
1914impl From<DirOffset> for Offset {
1915    fn from(value: DirOffset) -> Self {
1916        Self::Direct(value)
1917    }
1918}
1919
1920impl From<IndOffset> for Offset {
1921    fn from(value: IndOffset) -> Self {
1922        Self::Indirect(value)
1923    }
1924}
1925
1926impl Display for DirOffset {
1927    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1928        match self {
1929            DirOffset::Start(i) => write!(f, "{i}"),
1930            DirOffset::LastUpper(c) => write!(f, "&{c}"),
1931            DirOffset::End(e) => write!(f, "-{e}"),
1932        }
1933    }
1934}
1935
1936impl Default for DirOffset {
1937    fn default() -> Self {
1938        Self::LastUpper(0)
1939    }
1940}
1941
1942#[derive(Debug, Clone, Serialize, Deserialize)]
1943struct Match {
1944    line: usize,
1945    depth: u8,
1946    offset: Offset,
1947    test: Test,
1948    test_strength: u64,
1949    message: Option<Message>,
1950}
1951
1952impl From<Use> for Match {
1953    fn from(value: Use) -> Self {
1954        let test = Test::Use(value.switch_endianness, value.rule_name);
1955        let test_strength = test.strength();
1956        Self {
1957            line: value.line,
1958            depth: value.depth,
1959            offset: value.start_offset,
1960            test,
1961            test_strength,
1962            message: value.message,
1963        }
1964    }
1965}
1966
1967impl From<Name> for Match {
1968    fn from(value: Name) -> Self {
1969        let test = Test::Name(value.name);
1970        let test_strength = test.strength();
1971        Self {
1972            line: value.line,
1973            depth: 0,
1974            offset: Offset::Direct(DirOffset::Start(0)),
1975            test,
1976            test_strength,
1977            message: value.message,
1978        }
1979    }
1980}
1981
1982impl Match {
1983    /// Turns the `Match`'s offset into an absolute offset from the start of the stream
1984    #[inline(always)]
1985    fn offset_from_start<R: Read + Seek>(
1986        &self,
1987        haystack: &mut LazyCache<R>,
1988        rule_base_offset: Option<u64>,
1989        last_level_offset: Option<u64>,
1990    ) -> Result<Option<u64>, io::Error> {
1991        match self.offset {
1992            Offset::Direct(dir_offset) => match dir_offset {
1993                DirOffset::Start(s) => Ok(Some(s)),
1994                DirOffset::LastUpper(shift) => {
1995                    let o = last_level_offset.unwrap_or_default() as i64 + shift;
1996
1997                    if o >= 0 { Ok(Some(o as u64)) } else { Ok(None) }
1998                }
1999                DirOffset::End(e) => Ok(Some(haystack.offset_from_start(SeekFrom::End(e)))),
2000            },
2001            Offset::Indirect(ind_offset) => {
2002                let Some(o) =
2003                    ind_offset.read_offset(haystack, rule_base_offset, last_level_offset)?
2004                else {
2005                    return Ok(None);
2006                };
2007
2008                Ok(Some(o))
2009            }
2010        }
2011    }
2012
2013    /// this method emulates the buffer based matching
2014    /// logic implemented in libmagic. It needs some aweful
2015    /// and weird offset convertions to turn buffer
2016    /// relative offsets (libmagic is based on) into
2017    /// absolute offset in the file.
2018    ///
2019    /// this method shoud bubble up only critical errors
2020    /// all the other errors should make the match result
2021    /// false and be logged via debug!
2022    ///
2023    /// the function returns an error if the maximum recursion
2024    /// has been reached or if a dependency rule is missing.
2025    #[inline]
2026    #[allow(clippy::too_many_arguments)]
2027    fn matches<'a: 'h, 'h, R: Read + Seek>(
2028        &'a self,
2029        source: Option<&str>,
2030        magic: &mut Magic<'a>,
2031        stream_kind: StreamKind,
2032        state: &mut MatchState,
2033        buf_base_offset: Option<u64>,
2034        rule_base_offset: Option<u64>,
2035        last_level_offset: Option<u64>,
2036        haystack: &'h mut LazyCache<R>,
2037        switch_endianness: bool,
2038        db: &'a MagicDb,
2039        depth: usize,
2040    ) -> Result<(bool, Option<MatchRes<'h>>), Error> {
2041        let source = source.unwrap_or("unknown");
2042        let line = self.line;
2043
2044        if depth >= MAX_RECURSION {
2045            return Err(Error::localized(
2046                source,
2047                line,
2048                Error::MaximumRecursion(MAX_RECURSION),
2049            ));
2050        }
2051
2052        if self.test.is_only_binary() && stream_kind.is_text() {
2053            trace!("skip binary test source={source} line={line} stream_kind={stream_kind:?}",);
2054            return Ok((false, None));
2055        }
2056
2057        if self.test.is_only_text() && !stream_kind.is_text() {
2058            trace!("skip text test source={source} line={line} stream_kind={stream_kind:?}",);
2059            return Ok((false, None));
2060        }
2061
2062        let Ok(Some(mut offset)) = self
2063            .offset_from_start(haystack, rule_base_offset, last_level_offset)
2064            .inspect_err(|e| debug!("source={source} line={line} failed at computing offset: {e}"))
2065        else {
2066            return Ok((false, None));
2067        };
2068
2069        offset = match self.offset {
2070            Offset::Indirect(_) => {
2071                // the result we get for an indirect offset
2072                // is relative to the start of the libmagic
2073                // buffer so we need to add base to make it
2074                // absolute.
2075                buf_base_offset.unwrap_or_default().saturating_add(offset)
2076            }
2077            // offset from start are computed from rule base
2078            Offset::Direct(DirOffset::Start(_)) => {
2079                rule_base_offset.unwrap_or_default().saturating_add(offset)
2080            }
2081            _ => offset,
2082        };
2083
2084        match &self.test {
2085            Test::Clear => {
2086                trace!("source={source} line={line} clear");
2087                state.clear_continuation_level(&self.continuation_level());
2088                Ok((true, None))
2089            }
2090
2091            Test::Name(name) => {
2092                trace!(
2093                    "source={source} line={line} running rule {name} switch_endianness={switch_endianness}",
2094                );
2095                Ok((true, None))
2096            }
2097
2098            Test::Use(flip_endianness, rule_name) => {
2099                trace!(
2100                    "source={source} line={line} use {rule_name} switch_endianness={flip_endianness}",
2101                );
2102
2103                // switch_endianness must propagate down the rule call stack
2104                let switch_endianness = switch_endianness ^ flip_endianness;
2105
2106                let dr: &DependencyRule = db.dependencies.get(rule_name).ok_or(
2107                    Error::localized(source, line, Error::MissingRule(rule_name.clone())),
2108                )?;
2109
2110                // we push the message here otherwise we push message in depth first
2111                if let Some(msg) = self.message.as_ref() {
2112                    magic.push_message(msg.to_string_lossy());
2113                }
2114
2115                dr.rule.magic(
2116                    magic,
2117                    stream_kind,
2118                    buf_base_offset,
2119                    Some(offset),
2120                    haystack,
2121                    db,
2122                    switch_endianness,
2123                    depth.saturating_add(1),
2124                )?;
2125
2126                // we return false not to push message again
2127                Ok((false, None))
2128            }
2129
2130            Test::Indirect(m) => {
2131                trace!(
2132                    "source={source} line={line} indirect mods={:?} offset={offset:#x}",
2133                    m
2134                );
2135
2136                let new_buf_base_off = if m.contains(IndirectMod::Relative) {
2137                    Some(offset)
2138                } else {
2139                    None
2140                };
2141
2142                // we push the message here otherwise we push message in depth first
2143                if let Some(msg) = self.message.as_ref() {
2144                    magic.push_message(msg.to_string_lossy());
2145                }
2146
2147                for r in db.rules.iter() {
2148                    let messages_cnt = magic.message.len();
2149
2150                    r.magic(
2151                        magic,
2152                        stream_kind,
2153                        new_buf_base_off,
2154                        Some(offset),
2155                        haystack,
2156                        db,
2157                        false,
2158                        depth.saturating_add(1),
2159                    )?;
2160
2161                    // this means we matched a rule
2162                    if magic.message.len() != messages_cnt {
2163                        break;
2164                    }
2165                }
2166
2167                // we return false not to push message again
2168                Ok((false, None))
2169            }
2170
2171            Test::Default => {
2172                // default matches if nothing else at the continuation level matched
2173                let ok = !state.get_continuation_level(&self.continuation_level());
2174
2175                trace!("source={source} line={line} default match={ok}");
2176                if ok {
2177                    state.set_continuation_level(self.continuation_level());
2178                }
2179
2180                Ok((ok, None))
2181            }
2182
2183            _ => {
2184                if let Err(e) = haystack.seek(SeekFrom::Start(offset)) {
2185                    debug!("source={source} line={line} failed to seek in haystack: {e}");
2186                    return Ok((false, None));
2187                }
2188
2189                let mut trace_msg = None;
2190
2191                if enabled!(Level::DEBUG) {
2192                    trace_msg = Some(vec![format!(
2193                        "source={source} line={line} depth={} stream_offset={:#x}",
2194                        self.depth,
2195                        haystack.lazy_stream_position()
2196                    )])
2197                }
2198
2199                // NOTE: we may have a way to optimize here. In case we do a Any
2200                // test and we don't use the value to format the message, we don't
2201                // need to read the value.
2202                if let Ok(opt_test_value) = self
2203                    .test
2204                    .read_test_value(haystack, switch_endianness)
2205                    .inspect_err(|e| {
2206                        debug!("source={source} line={line} error while reading test value @{offset}: {e}",)
2207                    })
2208                {
2209                    if let Some(v) = trace_msg
2210                        .as_mut() { v.push(format!("test={:?}", self.test)) }
2211
2212                    let match_res =
2213                        opt_test_value.and_then(|tv| self.test.match_value(&tv, stream_kind));
2214
2215                    if let Some(v) = trace_msg.as_mut() { v.push(format!(
2216                            "message=\"{}\" match={}",
2217                            self.message
2218                                .as_ref()
2219                                .map(|fs| fs.to_string_lossy())
2220                                .unwrap_or_default(),
2221                            match_res.is_some()
2222                        )) }
2223
2224                    // trace message
2225                    if enabled!(Level::DEBUG) && !enabled!(Level::TRACE) && match_res.is_some() {
2226                        if let Some(m) = trace_msg{
2227                            debug!("{}", m.join(" "));
2228                        }
2229                    } else if enabled!(Level::TRACE)
2230                        && let Some(m) = trace_msg{
2231                            trace!("{}", m.join(" "));
2232                        }
2233
2234                    if let Some(mr) = match_res {
2235                        state.set_continuation_level(self.continuation_level());
2236                        return Ok((true, Some(mr)));
2237                    }
2238                }
2239
2240                Ok((false, None))
2241            }
2242        }
2243    }
2244
2245    #[inline(always)]
2246    fn continuation_level(&self) -> ContinuationLevel {
2247        ContinuationLevel(self.depth)
2248    }
2249}
2250
2251#[derive(Debug, Clone)]
2252struct Use {
2253    line: usize,
2254    depth: u8,
2255    start_offset: Offset,
2256    rule_name: String,
2257    switch_endianness: bool,
2258    message: Option<Message>,
2259}
2260
2261#[derive(Debug, Clone, Serialize, Deserialize)]
2262struct StrengthMod {
2263    op: Op,
2264    by: u8,
2265}
2266
2267impl StrengthMod {
2268    #[inline(always)]
2269    fn apply(&self, strength: u64) -> u64 {
2270        let by = self.by as u64;
2271        debug!("applying strength modifier: {strength} {} {}", self.op, by);
2272        match self.op {
2273            Op::Mul => strength.saturating_mul(by),
2274            Op::Add => strength.saturating_add(by),
2275            Op::Sub => strength.saturating_sub(by),
2276            Op::Div => {
2277                if by > 0 {
2278                    strength.saturating_div(by)
2279                } else {
2280                    strength
2281                }
2282            }
2283            Op::Mod => strength % by,
2284            Op::And => strength & by,
2285            // this should never happen as strength operators
2286            // are enforced by our parser
2287            Op::Xor | Op::Or => {
2288                debug_panic!("unsupported strength operator");
2289                strength
2290            }
2291        }
2292    }
2293}
2294
2295#[derive(Debug, Clone)]
2296enum Flag {
2297    Mime(String),
2298    Ext(HashSet<String>),
2299    Strength(StrengthMod),
2300    Apple(String),
2301}
2302
2303#[derive(Debug, Clone)]
2304struct Name {
2305    line: usize,
2306    name: String,
2307    message: Option<Message>,
2308}
2309
2310#[derive(Debug, Clone)]
2311enum Entry<'span> {
2312    Match(Span<'span>, Match),
2313    Flag(Span<'span>, Flag),
2314}
2315
2316#[derive(Debug, Clone, Serialize, Deserialize)]
2317struct EntryNode {
2318    root: bool,
2319    entry: Match,
2320    children: Vec<EntryNode>,
2321    mimetype: Option<String>,
2322    apple: Option<String>,
2323    strength_mod: Option<StrengthMod>,
2324    exts: HashSet<String>,
2325}
2326
2327impl EntryNode {
2328    fn update_exts_rec(
2329        &self,
2330        exts: &mut HashSet<String>,
2331        deps: &HashMap<String, DependencyRule>,
2332        marked: &mut HashSet<String>,
2333    ) -> Result<(), ()> {
2334        for ext in self.exts.iter() {
2335            if !exts.contains(ext) {
2336                exts.insert(ext.clone());
2337            }
2338        }
2339
2340        for c in self.children.iter() {
2341            if let Test::Use(_, ref name) = c.entry.test {
2342                if marked.contains(name) {
2343                    continue;
2344                }
2345                if let Some(r) = deps.get(name) {
2346                    marked.insert(name.clone());
2347                    exts.extend(r.rule.fetch_all_extensions(deps, marked)?);
2348                } else {
2349                    return Err(());
2350                }
2351            } else {
2352                c.update_exts_rec(exts, deps, marked)?;
2353            }
2354        }
2355
2356        Ok(())
2357    }
2358
2359    fn update_score_rec(
2360        &self,
2361        depth: usize,
2362        score: &mut u64,
2363        deps: &HashMap<String, DependencyRule>,
2364        marked: &mut HashSet<String>,
2365    ) {
2366        if depth == 3 {
2367            return;
2368        }
2369
2370        *score += self
2371            .children
2372            .iter()
2373            .map(|e| e.entry.test_strength)
2374            .min()
2375            .unwrap_or_default();
2376
2377        for c in self.children.iter() {
2378            if let Test::Use(_, ref name) = c.entry.test {
2379                if marked.contains(name) {
2380                    continue;
2381                }
2382
2383                if let Some(r) = deps.get(name) {
2384                    marked.insert(name.clone());
2385                    *score += r.rule.compute_score(depth, deps, marked);
2386                }
2387            }
2388            c.update_score_rec(depth + 1, score, deps, marked);
2389        }
2390    }
2391
2392    #[inline]
2393    #[allow(clippy::too_many_arguments)]
2394    fn matches<'r, R: Read + Seek>(
2395        &'r self,
2396        opt_source: Option<&str>,
2397        magic: &mut Magic<'r>,
2398        state: &mut MatchState,
2399        stream_kind: StreamKind,
2400        buf_base_offset: Option<u64>,
2401        rule_base_offset: Option<u64>,
2402        last_level_offset: Option<u64>,
2403        haystack: &mut LazyCache<R>,
2404        db: &'r MagicDb,
2405        switch_endianness: bool,
2406        depth: usize,
2407    ) -> Result<(), Error> {
2408        let (ok, opt_match_res) = self.entry.matches(
2409            opt_source,
2410            magic,
2411            stream_kind,
2412            state,
2413            buf_base_offset,
2414            rule_base_offset,
2415            last_level_offset,
2416            haystack,
2417            switch_endianness,
2418            db,
2419            depth,
2420        )?;
2421
2422        let source = opt_source.unwrap_or("unknown");
2423        let line = self.entry.line;
2424
2425        if ok {
2426            // update magic with message if match is successful
2427            if let Some(msg) = self.entry.message.as_ref()
2428                && let Ok(msg) = msg.format_with(opt_match_res.as_ref()).inspect_err(|e| {
2429                    debug!("source={source} line={line} failed to format message: {e}")
2430                })
2431            {
2432                magic.push_message(msg);
2433            }
2434
2435            // we need to adjust stream offset in case of regex/search tests
2436            if let Some(mr) = opt_match_res {
2437                match &self.entry.test {
2438                    Test::String(t) => {
2439                        if t.has_length_mod() {
2440                            let o = mr.end_offset();
2441                            haystack.seek(SeekFrom::Start(o))?;
2442                        }
2443                    }
2444                    Test::Search(t) => {
2445                        if t.re_mods.contains(ReMod::StartOffsetUpdate) {
2446                            let o = mr.start_offset();
2447                            haystack.seek(SeekFrom::Start(o))?;
2448                        } else {
2449                            let o = mr.end_offset();
2450                            haystack.seek(SeekFrom::Start(o))?;
2451                        }
2452                    }
2453
2454                    Test::Regex(t) => {
2455                        if t.mods.contains(ReMod::StartOffsetUpdate) {
2456                            let o = mr.start_offset();
2457                            haystack.seek(SeekFrom::Start(o))?;
2458                        } else {
2459                            let o = mr.end_offset();
2460                            haystack.seek(SeekFrom::Start(o))?;
2461                        }
2462                    }
2463                    // other types do not need offset adjustement
2464                    _ => {}
2465                }
2466            }
2467
2468            if let Some(mimetype) = self.mimetype.as_ref() {
2469                magic.set_mime_type(Cow::Borrowed(mimetype));
2470            }
2471
2472            if let Some(apple_ty) = self.apple.as_ref() {
2473                magic.set_creator_code(Cow::Borrowed(apple_ty));
2474            }
2475
2476            if !self.exts.is_empty() {
2477                magic.insert_extensions(self.exts.iter().map(|s| s.as_str()));
2478            }
2479
2480            // NOTE: here we try to implement a similar logic as in file_magic_strength.
2481            // Sticking to the exact same strength computation logic is complicated due
2482            // to implementation differences. Let's wait and see if that is a real issue.
2483            let mut strength = self.entry.test_strength;
2484
2485            let continuation_level = self.entry.continuation_level().0 as u64;
2486            if self.entry.message.is_none() && continuation_level < 3 {
2487                strength = strength.saturating_add(continuation_level);
2488            }
2489
2490            if let Some(sm) = self.strength_mod.as_ref() {
2491                strength = sm.apply(strength);
2492            }
2493
2494            // entries with no message get a bonus
2495            if self.entry.message.is_none() {
2496                strength += 1
2497            }
2498
2499            magic.update_strength(strength);
2500
2501            let end_upper_level = haystack.lazy_stream_position();
2502
2503            // we have to fix rule_base_offset if
2504            // the rule_base_starts from end otherwise it
2505            // breaks some offset computation in match
2506            // see test_offset_bug_1 and test_offset_bug_2
2507            // they implement the same test logic yet indirect
2508            // offsets have to be different so that it works
2509            // in libmagic/file
2510            let rule_base_offset = if self.root {
2511                match self.entry.offset {
2512                    Offset::Direct(DirOffset::End(o)) => {
2513                        Some(haystack.offset_from_start(SeekFrom::End(o)))
2514                    }
2515                    _ => rule_base_offset,
2516                }
2517            } else {
2518                rule_base_offset
2519            };
2520
2521            for e in self.children.iter() {
2522                e.matches(
2523                    opt_source,
2524                    magic,
2525                    state,
2526                    stream_kind,
2527                    buf_base_offset,
2528                    rule_base_offset,
2529                    Some(end_upper_level),
2530                    haystack,
2531                    db,
2532                    switch_endianness,
2533                    depth,
2534                )?
2535            }
2536        }
2537
2538        Ok(())
2539    }
2540}
2541
2542/// Represents a parsed magic rule
2543#[derive(Debug, Clone, Serialize, Deserialize)]
2544pub struct MagicRule {
2545    id: usize,
2546    source: Option<String>,
2547    entries: EntryNode,
2548    extensions: HashSet<String>,
2549    /// score used for rule ranking
2550    score: u64,
2551    finalized: bool,
2552}
2553
2554impl MagicRule {
2555    #[inline(always)]
2556    fn set_id(&mut self, id: usize) {
2557        self.id = id
2558    }
2559
2560    /// Fetches all the extensions defined in the magic rule. This
2561    /// function goes recursive and find extensions also defined in
2562    /// dependencies
2563    fn fetch_all_extensions(
2564        &self,
2565        deps: &HashMap<String, DependencyRule>,
2566        marked: &mut HashSet<String>,
2567    ) -> Result<HashSet<String>, ()> {
2568        let mut exts = HashSet::new();
2569        self.entries.update_exts_rec(&mut exts, deps, marked)?;
2570        Ok(exts)
2571    }
2572
2573    /// Computes the ranking score of a magic rule by walking
2574    /// tests recursively, dependencies included.
2575    fn compute_score(
2576        &self,
2577        depth: usize,
2578        deps: &HashMap<String, DependencyRule>,
2579        marked: &mut HashSet<String>,
2580    ) -> u64 {
2581        let mut score = 0;
2582        score += self.entries.entry.test_strength;
2583        self.entries
2584            .update_score_rec(depth, &mut score, deps, marked);
2585        score
2586    }
2587
2588    /// Finalize a rule by searching for all extensions and computing its score
2589    /// for ranking. In the `MagicRule` is already finalized it returns immediately.
2590    fn try_finalize(&mut self, deps: &HashMap<String, DependencyRule>) {
2591        if self.finalized {
2592            return;
2593        }
2594
2595        let Ok(exts) = self.fetch_all_extensions(deps, &mut HashSet::new()) else {
2596            return;
2597        };
2598
2599        self.extensions.extend(exts);
2600
2601        // fetch_all_extensions walks through all the dependencies
2602        // so there is no reason for compute_score to fail as it is walking
2603        // only some of them
2604        self.score = self.compute_score(0, deps, &mut HashSet::new());
2605        self.finalized = true
2606    }
2607
2608    #[inline]
2609    fn magic_entrypoint<'r, R: Read + Seek>(
2610        &'r self,
2611        magic: &mut Magic<'r>,
2612        stream_kind: StreamKind,
2613        haystack: &mut LazyCache<R>,
2614        db: &'r MagicDb,
2615        switch_endianness: bool,
2616        depth: usize,
2617    ) -> Result<(), Error> {
2618        self.entries.matches(
2619            self.source.as_deref(),
2620            magic,
2621            &mut MatchState::empty(),
2622            stream_kind,
2623            None,
2624            None,
2625            None,
2626            haystack,
2627            db,
2628            switch_endianness,
2629            depth,
2630        )
2631    }
2632
2633    #[inline]
2634    #[allow(clippy::too_many_arguments)]
2635    fn magic<'r, R: Read + Seek>(
2636        &'r self,
2637        magic: &mut Magic<'r>,
2638        stream_kind: StreamKind,
2639        buf_base_offset: Option<u64>,
2640        rule_base_offset: Option<u64>,
2641        haystack: &mut LazyCache<R>,
2642        db: &'r MagicDb,
2643        switch_endianness: bool,
2644        depth: usize,
2645    ) -> Result<(), Error> {
2646        self.entries.matches(
2647            self.source.as_deref(),
2648            magic,
2649            &mut MatchState::empty(),
2650            stream_kind,
2651            buf_base_offset,
2652            rule_base_offset,
2653            None,
2654            haystack,
2655            db,
2656            switch_endianness,
2657            depth,
2658        )
2659    }
2660
2661    /// Checks if the rule is for matching against text content
2662    ///
2663    /// # Returns
2664    ///
2665    /// * `bool` - True if the rule is for text files
2666    pub fn is_text(&self) -> bool {
2667        self.entries.entry.test.is_text()
2668            && self.entries.children.iter().all(|e| e.entry.test.is_text())
2669    }
2670
2671    /// Gets the rule's score used for ranking rules between them
2672    ///
2673    /// # Returns
2674    ///
2675    /// * `u64` - The rule's score
2676    #[inline(always)]
2677    pub fn score(&self) -> u64 {
2678        self.score
2679    }
2680
2681    /// Gets the rule's filename if any
2682    ///
2683    /// # Returns
2684    ///
2685    /// * `Option<&str>` - The rule's source if available
2686    #[inline(always)]
2687    pub fn source(&self) -> Option<&str> {
2688        self.source.as_deref()
2689    }
2690
2691    /// Gets the line number at which the rule is defined
2692    ///
2693    /// # Returns
2694    ///
2695    /// * `usize` - The rule's line number
2696    #[inline(always)]
2697    pub fn line(&self) -> usize {
2698        self.entries.entry.line
2699    }
2700
2701    /// Gets all the file extensions associated to the rule
2702    ///
2703    /// # Returns
2704    ///
2705    /// * `&HashSet<String>` - The set of all associated extensions
2706    #[inline(always)]
2707    pub fn extensions(&self) -> &HashSet<String> {
2708        &self.extensions
2709    }
2710}
2711
/// A [`MagicRule`] stored together with a name.
///
/// Instances live in the `dependencies` maps of [`MagicSource`] and
/// [`MagicDb`]; the database hands that map to `MagicRule::try_finalize`
/// whenever it is prepared.
// NOTE(review): presumably these are the named rules other rules can refer
// to — confirm against the parser.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct DependencyRule {
    /// Name associated with the rule
    name: String,
    /// The rule itself
    rule: MagicRule,
}
2717
/// A parsed source of magic rules
///
/// A source is consumed by [`MagicDb::load`], which moves both its rules and
/// its dependencies into the database.
///
/// # Methods
///
/// * `open` - Opens a magic file from a path
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicSource {
    /// Rules parsed from the source
    rules: Vec<MagicRule>,
    /// Dependency rules parsed from the source
    // NOTE(review): the key is presumably the dependency name — confirm
    dependencies: HashMap<String, DependencyRule>,
}
2728
impl MagicSource {
    /// Opens and parses a magic file from a path
    ///
    /// This is a thin wrapper around `FileMagicParser::parse_file`.
    ///
    /// # Arguments
    ///
    /// * `p` - The path to the magic file
    ///
    /// # Returns
    ///
    /// * `Result<Self, Error>` - The parsed magic file or an error
    ///
    /// # Errors
    ///
    /// Returns whatever error `FileMagicParser::parse_file` reports for `p`.
    pub fn open<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
        FileMagicParser::parse_file(p)
    }
}
2743
/// Continuation level of a magic entry, stored as a `u8`.
///
/// The wrapped value is used as an index into the per-level flags kept by
/// `MatchState`.
// NOTE(review): presumably the nesting depth (number of `>` prefixes) of a
// magic line — confirm against the parser.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct ContinuationLevel(u8);
2746
// FIXME: magic handles many more text encodings
/// Text encodings the stream kind detection is able to report.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum TextEncoding {
    /// Plain ASCII text
    Ascii,
    /// UTF-8 encoded text
    Utf8,
    /// Text-like content whose encoding could not be determined
    Unknown,
}

impl TextEncoding {
    /// Returns the encoding label inserted in magic messages
    /// (e.g. `"ASCII"` in `"ASCII text"`).
    const fn as_magic_str(&self) -> &'static str {
        match *self {
            Self::Ascii => "ASCII",
            Self::Utf8 => "UTF-8",
            Self::Unknown => "Unknown",
        }
    }
}
2764
2765#[derive(Debug, PartialEq, Eq, Clone, Copy)]
2766enum StreamKind {
2767    Binary,
2768    Text(TextEncoding),
2769}
2770
2771impl StreamKind {
2772    const fn is_text(&self) -> bool {
2773        matches!(self, StreamKind::Text(_))
2774    }
2775}
2776
2777#[derive(Debug)]
2778struct MatchState {
2779    continuation_levels: [bool; 256],
2780}
2781
2782impl MatchState {
2783    #[inline(always)]
2784    fn empty() -> Self {
2785        MatchState {
2786            continuation_levels: [false; 256],
2787        }
2788    }
2789
2790    #[inline(always)]
2791    fn get_continuation_level(&mut self, level: &ContinuationLevel) -> bool {
2792        self.continuation_levels
2793            .get(level.0 as usize)
2794            .cloned()
2795            .unwrap_or_default()
2796    }
2797
2798    #[inline(always)]
2799    fn set_continuation_level(&mut self, level: ContinuationLevel) {
2800        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2801            *b = true
2802        }
2803    }
2804
2805    #[inline(always)]
2806    fn clear_continuation_level(&mut self, level: &ContinuationLevel) {
2807        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2808            *b = false;
2809        }
2810    }
2811}
2812
/// Represents a file magic detection result
///
/// String data is held as [`Cow`] so a result can borrow from the database
/// that produced it; use [`Magic::into_owned`] to detach it.
#[derive(Debug, Default)]
pub struct Magic<'m> {
    /// Kind of stream the result was computed for; drives the MIME type
    /// fallback when no explicit MIME type was set
    stream_kind: Option<StreamKind>,
    /// Source of the rule which produced the result, if any
    source: Option<Cow<'m, str>>,
    /// Message parts, in the order they were pushed
    message: Vec<Cow<'m, str>>,
    /// MIME type; once set it is never overwritten
    mime_type: Option<Cow<'m, str>>,
    /// Apple creator code; once set it is never overwritten
    creator_code: Option<Cow<'m, str>>,
    /// Accumulated (saturating) strength of the result
    strength: u64,
    /// Possible file extensions; only filled while still empty
    exts: HashSet<Cow<'m, str>>,
    /// True when the result comes from the default fallback detection
    is_default: bool,
}
2825
2826impl<'m> Magic<'m> {
2827    #[inline(always)]
2828    fn set_source(&mut self, source: Option<&'m str>) {
2829        self.source = source.map(Cow::Borrowed);
2830    }
2831
2832    #[inline(always)]
2833    fn set_stream_kind(&mut self, stream_kind: StreamKind) {
2834        self.stream_kind = Some(stream_kind)
2835    }
2836
2837    #[inline(always)]
2838    fn reset(&mut self) {
2839        self.stream_kind = None;
2840        self.source = None;
2841        self.message.clear();
2842        self.mime_type = None;
2843        self.creator_code = None;
2844        self.strength = 0;
2845        self.exts.clear();
2846        self.is_default = false;
2847    }
2848
2849    /// Converts borrowed data into owned data. This method involves
2850    /// data cloning, so you must use this method only if you need to
2851    /// extend the lifetime of a [`Magic`] struct.
2852    ///
2853    /// # Returns
2854    ///
2855    /// * `Magic<'owned>` - A new [`Magic`] with owned data
2856    #[inline]
2857    pub fn into_owned<'owned>(self) -> Magic<'owned> {
2858        Magic {
2859            stream_kind: self.stream_kind,
2860            source: self.source.map(|s| Cow::Owned(s.into_owned())),
2861            message: self
2862                .message
2863                .into_iter()
2864                .map(Cow::into_owned)
2865                .map(Cow::Owned)
2866                .collect(),
2867            mime_type: self.mime_type.map(|m| Cow::Owned(m.into_owned())),
2868            creator_code: self.creator_code.map(|m| Cow::Owned(m.into_owned())),
2869            strength: self.strength,
2870            exts: self
2871                .exts
2872                .into_iter()
2873                .map(|e| Cow::Owned(e.into_owned()))
2874                .collect(),
2875            is_default: self.is_default,
2876        }
2877    }
2878
2879    /// Gets the formatted message describing the file type
2880    ///
2881    /// # Returns
2882    ///
2883    /// * `String` - The formatted message
2884    #[inline(always)]
2885    pub fn message(&self) -> String {
2886        let mut out = String::new();
2887        for (i, m) in self.message.iter().enumerate() {
2888            if let Some(s) = m.strip_prefix(r#"\b"#) {
2889                out.push_str(s);
2890            } else {
2891                // don't put space on first string
2892                if i > 0 {
2893                    out.push(' ');
2894                }
2895                out.push_str(m);
2896            }
2897        }
2898        out
2899    }
2900
2901    /// Returns an iterator over the individual parts of the magic message
2902    ///
2903    /// A magic message is typically composed of multiple parts, each appended
2904    /// during successful magic tests. This method provides an efficient way to
2905    /// iterate over these parts without concatenating them into a new string,
2906    /// as done when calling [`Magic::message`].
2907    ///
2908    /// # Returns
2909    ///
2910    /// * `impl Iterator<Item = &str>` - An iterator yielding string slices of each message part
2911    #[inline]
2912    pub fn message_parts(&self) -> impl Iterator<Item = &str> {
2913        self.message.iter().map(|p| p.as_ref())
2914    }
2915
2916    #[inline(always)]
2917    fn update_strength(&mut self, value: u64) {
2918        self.strength = self.strength.saturating_add(value);
2919        debug!("updated strength = {:?}", self.strength)
2920    }
2921
2922    /// Gets the detected MIME type
2923    ///
2924    /// # Returns
2925    ///
2926    /// * `&str` - The MIME type or default based on stream kind
2927    #[inline(always)]
2928    pub fn mime_type(&self) -> &str {
2929        self.mime_type.as_deref().unwrap_or(match self.stream_kind {
2930            Some(StreamKind::Text(_)) => DEFAULT_TEXT_MIMETYPE,
2931            Some(StreamKind::Binary) | None => DEFAULT_BIN_MIMETYPE,
2932        })
2933    }
2934
2935    #[inline(always)]
2936    fn push_message<'a: 'm>(&mut self, msg: Cow<'a, str>) {
2937        if !msg.is_empty() {
2938            debug!("pushing message: msg={msg} len={}", msg.len());
2939            self.message.push(msg);
2940        }
2941    }
2942
2943    #[inline(always)]
2944    fn set_mime_type<'a: 'm>(&mut self, mime: Cow<'a, str>) {
2945        if self.mime_type.is_none() {
2946            debug!("insert mime: {:?}", mime);
2947            self.mime_type = Some(mime)
2948        }
2949    }
2950
2951    #[inline(always)]
2952    fn set_creator_code<'a: 'm>(&mut self, apple_ty: Cow<'a, str>) {
2953        if self.creator_code.is_none() {
2954            debug!("insert apple type: {apple_ty:?}");
2955            self.creator_code = Some(apple_ty)
2956        }
2957    }
2958
2959    #[inline(always)]
2960    fn insert_extensions<'a: 'm, I: Iterator<Item = &'a str>>(&mut self, exts: I) {
2961        if self.exts.is_empty() {
2962            self.exts.extend(exts.filter_map(|e| {
2963                if e.is_empty() {
2964                    None
2965                } else {
2966                    Some(Cow::Borrowed(e))
2967                }
2968            }));
2969        }
2970    }
2971
2972    /// Gets the confidence score of the detection. This
2973    /// value is used to sort [`Magic`] in [`MagicDb::best_magic`]
2974    /// and [`MagicDb::all_magics`].
2975    ///
2976    /// # Returns
2977    ///
2978    /// * `u64` - The confidence score attributed to that [`Magic`]
2979    #[inline(always)]
2980    pub fn strength(&self) -> u64 {
2981        self.strength
2982    }
2983
2984    /// Gets the filename where the magic rule was defined
2985    ///
2986    /// # Returns
2987    ///
2988    /// * `Option<&str>` - The source if available
2989    #[inline(always)]
2990    pub fn source(&self) -> Option<&str> {
2991        self.source.as_deref()
2992    }
2993
2994    /// Gets the Apple creator code if available
2995    ///
2996    /// # Returns
2997    ///
2998    /// * `Option<&str>` - The creator code if available
2999    #[inline(always)]
3000    pub fn creator_code(&self) -> Option<&str> {
3001        self.creator_code.as_deref()
3002    }
3003
3004    /// Gets the possible file extensions for the detected [`Magic`]
3005    ///
3006    /// # Returns
3007    ///
3008    /// * `&HashSet<Cow<'m, str>>` - The set of possible extensions
3009    #[inline(always)]
3010    pub fn extensions(&self) -> &HashSet<Cow<'m, str>> {
3011        &self.exts
3012    }
3013
3014    /// Checks if this is a default fallback detection
3015    ///
3016    /// # Returns
3017    ///
3018    /// * `bool` - True if this is a default detection
3019    #[inline(always)]
3020    pub fn is_default(&self) -> bool {
3021        self.is_default
3022    }
3023}
3024
/// Represents a database of [`MagicRule`]
///
/// A database is filled with [`MagicDb::load`] and can be stored and
/// restored with [`MagicDb::serialize`] / [`MagicDb::deserialize`].
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct MagicDb {
    /// Next identifier to hand out to a loaded rule
    rule_id: usize,
    /// All loaded rules, kept sorted (non-text rules first, then by
    /// descending score) every time a source is loaded
    rules: Vec<MagicRule>,
    /// Dependency rules collected from every loaded source
    dependencies: HashMap<String, DependencyRule>,
}
3032
#[inline(always)]
/// Returns `true` if the byte stream is likely text.
///
/// An empty stream or a stream holding a NUL byte is never text. Otherwise
/// bytes are split in two groups: printable ones (tab, line feed, carriage
/// return and `0x20..=0x7E`) and all the others (remaining control bytes,
/// DEL and every byte above `0x7F`). The stream is text when more than 85%
/// of it is printable and less than 20% of it falls in the other group.
fn is_likely_text(bytes: &[u8]) -> bool {
    if bytes.is_empty() || bytes.contains(&0x00) {
        return false;
    }

    let printable = bytes
        .iter()
        .filter(|&&b| matches!(b, 0x09 | 0x0A | 0x0D | 0x20..=0x7E))
        .count();
    // no NUL at this point: whatever is not printable belongs to the other group
    let others = bytes.len() - printable;

    let total = bytes.len() as f64;
    let printable_ratio = printable as f64 / total;
    let others_ratio = others as f64 / total;

    // Heuristic thresholds (adjust as needed):
    printable_ratio > 0.85 && others_ratio < 0.20
}
3059
3060#[inline(always)]
3061fn guess_stream_kind<S: AsRef<[u8]>>(stream: S) -> StreamKind {
3062    let Ok(s) = str::from_utf8(stream.as_ref()) else {
3063        if is_likely_text(stream.as_ref()) {
3064            return StreamKind::Text(TextEncoding::Unknown);
3065        } else {
3066            return StreamKind::Binary;
3067        }
3068    };
3069
3070    let count = s.chars().count();
3071    let mut is_ascii = true;
3072
3073    for c in s.chars().take(count.saturating_sub(1)) {
3074        is_ascii &= c.is_ascii()
3075    }
3076
3077    if is_ascii {
3078        StreamKind::Text(TextEncoding::Ascii)
3079    } else {
3080        StreamKind::Text(TextEncoding::Utf8)
3081    }
3082}
3083
3084impl MagicDb {
3085    fn open_reader<R: Read + Seek>(f: R) -> Result<LazyCache<R>, Error> {
3086        Ok(LazyCache::<R>::from_read_seek(f)
3087            .and_then(|lc| lc.with_hot_cache(2 * FILE_BYTES_MAX))?)
3088        .map(|lc| lc.with_warm_cache(100 << 20))
3089    }
3090
    /// Creates a new empty database
    ///
    /// Equivalent to [`MagicDb::default`]: no rules, no dependencies and a
    /// rule identifier counter starting at 0.
    ///
    /// # Returns
    ///
    /// * [`MagicDb`] - A new empty database
    pub fn new() -> Self {
        Self::default()
    }
3099
3100    #[inline(always)]
3101    fn next_rule_id(&mut self) -> usize {
3102        let t = self.rule_id;
3103        self.rule_id += 1;
3104        t
3105    }
3106
3107    #[inline(always)]
3108    fn try_json<R: Read + Seek>(
3109        haystack: &mut LazyCache<R>,
3110        stream_kind: StreamKind,
3111        magic: &mut Magic,
3112    ) -> Result<bool, Error> {
3113        // cannot be json if content is binary
3114        if matches!(stream_kind, StreamKind::Binary) {
3115            return Ok(false);
3116        }
3117
3118        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?.trim_ascii();
3119
3120        let Some((start, end)) = find_json_boundaries(buf) else {
3121            return Ok(false);
3122        };
3123
3124        // if anything else than whitespace before start
3125        // this is not json
3126        for c in buf[0..start].iter() {
3127            if !c.is_ascii_whitespace() {
3128                return Ok(false);
3129            }
3130        }
3131
3132        let mut is_ndjson = false;
3133
3134        trace!("maybe a json document");
3135        let ok = serde_json::from_slice::<serde_json::Value>(&buf[start..=end]).is_ok();
3136        if !ok {
3137            return Ok(false);
3138        }
3139
3140        // we are sure it is json now we must look if we are ndjson
3141        if end + 1 < buf.len() {
3142            // after first json
3143            let buf = &buf[end + 1..];
3144            if let Some((second_start, second_end)) = find_json_boundaries(buf) {
3145                // there is a new line between the two json docs
3146                if memchr(b'\n', &buf[..second_start]).is_some() {
3147                    trace!("might be ndjson");
3148                    is_ndjson = serde_json::from_slice::<serde_json::Value>(
3149                        &buf[second_start..=second_end],
3150                    )
3151                    .is_ok();
3152                }
3153            }
3154        }
3155
3156        if is_ndjson {
3157            magic.push_message(Cow::Borrowed("New Line Delimited"));
3158            magic.set_mime_type(Cow::Borrowed("application/x-ndjson"));
3159            magic.insert_extensions(["ndjson", "jsonl"].into_iter());
3160        } else {
3161            magic.set_mime_type(Cow::Borrowed("application/json"));
3162            magic.insert_extensions(["json"].into_iter());
3163        }
3164
3165        magic.push_message(Cow::Borrowed("JSON text data"));
3166        magic.set_source(Some(HARDCODED_SOURCE));
3167        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3168        Ok(true)
3169    }
3170
3171    #[inline(always)]
3172    fn try_csv<R: Read + Seek>(
3173        haystack: &mut LazyCache<R>,
3174        stream_kind: StreamKind,
3175        magic: &mut Magic,
3176    ) -> Result<bool, Error> {
3177        // cannot be csv if content is binary
3178        let StreamKind::Text(enc) = stream_kind else {
3179            return Ok(false);
3180        };
3181
3182        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3183        let mut reader = csv::Reader::from_reader(io::Cursor::new(buf));
3184        let mut records = reader.records();
3185
3186        let Some(Ok(first)) = records.next() else {
3187            return Ok(false);
3188        };
3189
3190        // very not likely a CSV otherwise all programming
3191        // languages having ; line terminator would be
3192        // considered as CSV
3193        if first.len() <= 1 {
3194            return Ok(false);
3195        }
3196
3197        // we already parsed first line
3198        let mut n = 1;
3199        for i in records.take(9) {
3200            if let Ok(rec) = i {
3201                if first.len() != rec.len() {
3202                    return Ok(false);
3203                }
3204            } else {
3205                return Ok(false);
3206            }
3207            n += 1;
3208        }
3209
3210        // we need at least 10 lines
3211        if n != 10 {
3212            return Ok(false);
3213        }
3214
3215        magic.set_mime_type(Cow::Borrowed("text/csv"));
3216        magic.push_message(Cow::Borrowed("CSV"));
3217        magic.push_message(Cow::Borrowed(enc.as_magic_str()));
3218        magic.push_message(Cow::Borrowed("text"));
3219        magic.insert_extensions(["csv"].into_iter());
3220        magic.set_source(Some(HARDCODED_SOURCE));
3221        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3222        Ok(true)
3223    }
3224
3225    #[inline(always)]
3226    fn try_tar<R: Read + Seek>(
3227        haystack: &mut LazyCache<R>,
3228        stream_kind: StreamKind,
3229        magic: &mut Magic,
3230    ) -> Result<bool, Error> {
3231        // cannot be json if content is not binary
3232        if !matches!(stream_kind, StreamKind::Binary) {
3233            return Ok(false);
3234        }
3235
3236        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3237        let mut ar = Archive::new(io::Cursor::new(buf));
3238
3239        let Ok(mut entries) = ar.entries() else {
3240            return Ok(false);
3241        };
3242
3243        let Some(Ok(first)) = entries.next() else {
3244            return Ok(false);
3245        };
3246
3247        let header = first.header();
3248
3249        if header.as_ustar().is_some() {
3250            magic.push_message(Cow::Borrowed("POSIX tar archive"));
3251        } else if header.as_gnu().is_some() {
3252            magic.push_message(Cow::Borrowed("POSIX tar archive (GNU)"));
3253        } else {
3254            magic.push_message(Cow::Borrowed("tar archive"));
3255        }
3256
3257        magic.set_mime_type(Cow::Borrowed("application/x-tar"));
3258        magic.set_source(Some(HARDCODED_SOURCE));
3259        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3260        magic.insert_extensions(["tar"].into_iter());
3261        Ok(true)
3262    }
3263
3264    #[inline(always)]
3265    fn try_hard_magic<R: Read + Seek>(
3266        haystack: &mut LazyCache<R>,
3267        stream_kind: StreamKind,
3268        magic: &mut Magic,
3269    ) -> Result<bool, Error> {
3270        Ok(Self::try_json(haystack, stream_kind, magic)?
3271            || Self::try_csv(haystack, stream_kind, magic)?
3272            || Self::try_tar(haystack, stream_kind, magic)?)
3273    }
3274
3275    #[inline(always)]
3276    fn magic_default<'m, R: Read + Seek>(
3277        haystack: &mut LazyCache<R>,
3278        stream_kind: StreamKind,
3279        magic: &mut Magic<'m>,
3280    ) -> Result<(), Error> {
3281        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3282
3283        magic.set_source(Some(HARDCODED_SOURCE));
3284        magic.set_stream_kind(stream_kind);
3285        magic.is_default = true;
3286
3287        if buf.is_empty() {
3288            magic.push_message(Cow::Borrowed("empty"));
3289            magic.set_mime_type(Cow::Borrowed(DEFAULT_BIN_MIMETYPE));
3290            return Ok(());
3291        }
3292
3293        match stream_kind {
3294            StreamKind::Binary => {
3295                magic.push_message(Cow::Borrowed("data"));
3296            }
3297            StreamKind::Text(e) => {
3298                magic.push_message(Cow::Borrowed(e.as_magic_str()));
3299                magic.push_message(Cow::Borrowed("text"));
3300            }
3301        }
3302
3303        Ok(())
3304    }
3305
3306    /// Loads rules from a [`MagicSource`]
3307    ///
3308    /// # Arguments
3309    ///
3310    /// * `mf` - The [`MagicSource`] to load rules from
3311    ///
3312    /// # Returns
3313    ///
3314    /// * `Result<&mut Self, Error>` - Self for chaining or an error
3315    pub fn load(&mut self, mf: MagicSource) -> Result<&mut Self, Error> {
3316        for rule in mf.rules.into_iter() {
3317            let mut rule = rule;
3318            rule.set_id(self.next_rule_id());
3319
3320            self.rules.push(rule);
3321        }
3322
3323        self.dependencies.extend(mf.dependencies);
3324        self.prepare();
3325        Ok(self)
3326    }
3327
3328    /// Gets all rules in the database
3329    ///
3330    /// # Returns
3331    ///
3332    /// * `&[MagicRule]` - A slice of all rules
3333    pub fn rules(&self) -> &[MagicRule] {
3334        &self.rules
3335    }
3336
    /// Returns the first [`Magic`] produced for `haystack`, given an already
    /// known stream kind.
    ///
    /// Detection order:
    /// 1. hardcoded detections (`try_hard_magic`)
    /// 2. rules declaring the (lowercased) `extension`, if one is given
    /// 3. all the remaining rules, in database order
    /// 4. the default fallback (`magic_default`)
    ///
    /// A rule is considered matching as soon as it produces a message.
    #[inline]
    fn first_magic_with_stream_kind<R: Read + Seek>(
        &self,
        haystack: &mut LazyCache<R>,
        stream_kind: StreamKind,
        extension: Option<&str>,
    ) -> Result<Magic<'_>, Error> {
        // a single magic is re-used (reset between rules) instead of being
        // re-created for every rule: this makes this function faster
        let mut magic = Magic::default();

        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
            return Ok(magic);
        }

        // marked[rule.id] is true for rules already run by extension
        let mut marked = vec![false; self.rules.len()];

        // runs one rule: returns from the enclosing function if the rule
        // produced a message, otherwise resets magic for the next rule
        macro_rules! do_magic {
            ($rule: expr) => {{
                $rule.magic_entrypoint(&mut magic, stream_kind, haystack, &self, false, 0)?;

                if !magic.message.is_empty() {
                    magic.set_stream_kind(stream_kind);
                    magic.set_source($rule.source.as_deref());
                    return Ok(magic);
                }

                magic.reset();
            }};
        }

        // first pass: only the rules declaring the extension we were given
        if let Some(ext) = extension.map(|e| e.to_lowercase())
            && !ext.is_empty()
        {
            for rule in self.rules.iter().filter(|r| r.extensions.contains(&ext)) {
                do_magic!(rule);
                // only reached when the rule did not match
                if let Some(f) = marked.get_mut(rule.id) {
                    *f = true
                }
            }
        }

        // second pass: every other rule
        for rule in self
            .rules
            .iter()
            // we don't run again rules run by extension
            .filter(|r| !*marked.get(r.id).unwrap_or(&false))
        {
            do_magic!(rule)
        }

        // nothing matched
        Self::magic_default(haystack, stream_kind, &mut magic)?;

        Ok(magic)
    }
3391
3392    /// Detects file [`Magic`] stopping at the first matching magic. Magic
3393    /// rules are evaluated from the best to the least relevant, so this method
3394    /// returns most of the time the best magic. For the rare cases where
3395    /// it doesn't or if the best result is always required, use [`MagicDb::best_magic`]
3396    ///
3397    /// # Arguments
3398    ///
3399    /// * `r` - A readable and seekable input
3400    /// * `extension` - Optional file extension to use for acceleration
3401    ///
3402    /// # Returns
3403    ///
3404    /// * `Result<Magic<'_>, Error>` - The detection result or an error
3405    pub fn first_magic<R: Read + Seek>(
3406        &self,
3407        r: &mut R,
3408        extension: Option<&str>,
3409    ) -> Result<Magic<'_>, Error> {
3410        let mut haystack = Self::open_reader(r)?;
3411        let stream_kind = guess_stream_kind(haystack.read_range(0..FILE_BYTES_MAX as u64)?);
3412        self.first_magic_with_stream_kind(&mut haystack, stream_kind, extension)
3413    }
3414
3415    #[inline(always)]
3416    fn all_magics_sort_with_stream_kind<R: Read + Seek>(
3417        &self,
3418        haystack: &mut LazyCache<R>,
3419        stream_kind: StreamKind,
3420    ) -> Result<Vec<Magic<'_>>, Error> {
3421        let mut out = Vec::new();
3422
3423        let mut magic = Magic::default();
3424
3425        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3426            out.push(magic);
3427            magic = Magic::default();
3428        }
3429
3430        for rule in self.rules.iter() {
3431            rule.magic_entrypoint(&mut magic, stream_kind, haystack, self, false, 0)?;
3432
3433            // it is possible we have a strength with no message
3434            if !magic.message.is_empty() {
3435                magic.set_stream_kind(stream_kind);
3436                magic.set_source(rule.source.as_deref());
3437                out.push(magic);
3438                magic = Magic::default();
3439            }
3440
3441            magic.reset();
3442        }
3443
3444        Self::magic_default(haystack, stream_kind, &mut magic)?;
3445        out.push(magic);
3446
3447        out.sort_by_key(|b| std::cmp::Reverse(b.strength()));
3448
3449        Ok(out)
3450    }
3451
3452    /// Detects all [`Magic`] matching a given content.
3453    ///
3454    /// # Arguments
3455    ///
3456    /// * `r` - A readable and seekable input
3457    ///
3458    /// # Returns
3459    ///
3460    /// * `Result<Vec<Magic<'_>>, Error>` - All detection results sorted by strength or an error
3461    pub fn all_magics<R: Read + Seek>(&self, r: &mut R) -> Result<Vec<Magic<'_>>, Error> {
3462        let mut haystack = Self::open_reader(r)?;
3463        let stream_kind = guess_stream_kind(haystack.read_range(0..FILE_BYTES_MAX as u64)?);
3464        self.all_magics_sort_with_stream_kind(&mut haystack, stream_kind)
3465    }
3466
3467    #[inline(always)]
3468    fn best_magic_with_stream_kind<R: Read + Seek>(
3469        &self,
3470        haystack: &mut LazyCache<R>,
3471        stream_kind: StreamKind,
3472    ) -> Result<Magic<'_>, Error> {
3473        let magics = self.all_magics_sort_with_stream_kind(haystack, stream_kind)?;
3474
3475        // magics is guaranteed to contain at least the default magic
3476        return Ok(magics
3477            .into_iter()
3478            .next()
3479            .expect("magics must at least contain default"));
3480    }
3481
3482    /// Detects the best [`Magic`] matching a given content.
3483    ///
3484    /// # Arguments
3485    ///
3486    /// * `r` - A readable and seekable input
3487    ///
3488    /// # Returns
3489    ///
3490    /// * `Result<Magic<'_>, Error>` - The best detection result or an error
3491    pub fn best_magic<R: Read + Seek>(&self, r: &mut R) -> Result<Magic<'_>, Error> {
3492        let mut haystack = Self::open_reader(r)?;
3493        let stream_kind = guess_stream_kind(haystack.read_range(0..FILE_BYTES_MAX as u64)?);
3494        self.best_magic_with_stream_kind(&mut haystack, stream_kind)
3495    }
3496
3497    /// Serializes the database to a generic writer implementing [`io::Write`]
3498    ///
3499    /// # Returns
3500    ///
3501    /// * `Result<(), Error>` - The serialized database or an error
3502    pub fn serialize<W: Write>(self, w: &mut W) -> Result<(), Error> {
3503        let mut encoder = GzEncoder::new(w, Compression::best());
3504
3505        bincode::serde::encode_into_std_write(&self, &mut encoder, bincode::config::standard())?;
3506        encoder.finish()?;
3507        Ok(())
3508    }
3509
3510    /// Deserializes the database from a generic reader implementing [`io::Read`]
3511    ///
3512    /// # Arguments
3513    ///
3514    /// * `r` - The reader to deserialize from
3515    ///
3516    /// # Returns
3517    ///
3518    /// * `Result<Self, Error>` - The deserialized database or an error
3519    pub fn deserialize<R: Read>(r: &mut R) -> Result<Self, Error> {
3520        let mut buf = vec![];
3521        let mut gz = GzDecoder::new(r);
3522        gz.read_to_end(&mut buf).map_err(|e| {
3523            bincode::error::DecodeError::OtherString(format!("failed to read: {e}"))
3524        })?;
3525        let (sdb, _): (MagicDb, usize) =
3526            bincode::serde::decode_from_slice(&buf, bincode::config::standard())?;
3527        Ok(sdb)
3528    }
3529
3530    #[inline(always)]
3531    fn prepare(&mut self) {
3532        self.rules
3533            .iter_mut()
3534            .for_each(|r| r.try_finalize(&self.dependencies));
3535
3536        // put text rules at the end
3537        self.rules.sort_by_key(|r| (r.is_text(), -(r.score as i64)));
3538    }
3539}
3540
3541#[cfg(test)]
3542mod tests {
3543    use std::io::Cursor;
3544
3545    use regex::bytes::Regex;
3546
3547    use crate::utils::unix_local_time_to_string;
3548
3549    use super::*;
3550
3551    macro_rules! lazy_cache {
3552        ($l: literal) => {
3553            LazyCache::from_read_seek(Cursor::new($l)).unwrap()
3554        };
3555    }
3556
3557    fn first_magic(
3558        rule: &str,
3559        content: &[u8],
3560        stream_kind: StreamKind,
3561    ) -> Result<Magic<'static>, Error> {
3562        let mut md = MagicDb::new();
3563        md.load(
3564            FileMagicParser::parse_str(rule, None)
3565                .inspect_err(|e| eprintln!("{e}"))
3566                .unwrap(),
3567        )
3568        .unwrap();
3569        let mut reader = LazyCache::from_read_seek(Cursor::new(content)).unwrap();
3570        let v = md.best_magic_with_stream_kind(&mut reader, stream_kind)?;
3571        Ok(v.into_owned())
3572    }
3573
3574    /// helper macro to debug tests
3575    #[allow(unused_macros)]
3576    macro_rules! enable_trace {
3577        () => {
3578            tracing_subscriber::fmt()
3579                .with_max_level(tracing_subscriber::filter::LevelFilter::TRACE)
3580                .try_init();
3581        };
3582    }
3583
3584    macro_rules! parse_assert {
3585        ($rule:literal) => {
3586            FileMagicParser::parse_str($rule, None)
3587                .inspect_err(|e| eprintln!("{e}"))
3588                .unwrap();
3589        };
3590    }
3591
3592    macro_rules! assert_magic_match_bin {
3593        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Binary).unwrap() }};
3594        ($rule: literal, $content:literal, $message:expr) => {{
3595            assert_eq!(
3596                first_magic($rule, $content, StreamKind::Binary)
3597                    .unwrap()
3598                    .message(),
3599                $message
3600            );
3601        }};
3602    }
3603
3604    macro_rules! assert_magic_match_text {
3605        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8)).unwrap() }};
3606        ($rule: literal, $content:literal, $message:expr) => {{
3607            assert_eq!(
3608                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
3609                    .unwrap()
3610                    .message(),
3611                $message
3612            );
3613        }};
3614    }
3615
3616    macro_rules! assert_magic_not_match_text {
3617        ($rule: literal, $content:literal) => {{
3618            assert!(
3619                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
3620                    .unwrap()
3621                    .is_default()
3622            );
3623        }};
3624    }
3625
3626    macro_rules! assert_magic_not_match_bin {
3627        ($rule: literal, $content:literal) => {{
3628            assert!(
3629                first_magic($rule, $content, StreamKind::Binary)
3630                    .unwrap()
3631                    .is_default()
3632            );
3633        }};
3634    }
3635
    // Exercises `regex` rules: text matching with a relative follow-up rule,
    // raw byte patterns on binary input, the `/s` flag and `^`/`$` anchoring
    // on text input.
    #[test]
    fn test_regex() {
        // shebang detection: the `%s` of the child rule is expected to be
        // replaced by what the second regex matched ("bash" here)
        assert_magic_match_text!(
            r#"
0	regex/1024 \^#![[:space:]]*/usr/bin/env[[:space:]]+
!:mime	text/x-shellscript
>&0  regex/64 .*($|\\b) %s shell script text executable
    "#,
            br#"#!/usr/bin/env bash
        echo hello world"#,
            // the magic generated
            "bash shell script text executable"
        );

        // sanity check of the regex engine itself: with `(?-u)` the pattern
        // is expected to match the raw bytes 0x42 0x82
        let re = Regex::new(r"(?-u)\x42\x82").unwrap();
        assert!(re.is_match(b"\x42\x82"));

        // same byte pattern used from a rule; in this input the bytes are
        // located at index 6 while the rule offset is 0
        assert_magic_match_bin!(
            r#"0 regex \x42\x82 binary regex match"#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82"
        );

        // test regex continuation after match
        assert_magic_match_bin!(
            r#"
            0 regex \x42\x82
            >&0 string \xde\xad\xbe\xef it works
            "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        // with the `/s` flag the continuation rule has to match the regex
        // bytes again (the string below starts with \x42\x82)
        assert_magic_match_bin!(
            r#"
            0 regex/s \x42\x82
            >&0 string \x42\x82\xde\xad\xbe\xef it works
            "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        // ^ must match start of line when matching text
        assert_magic_match_text!(
            r#"
0	regex/1024 \^HelloWorld$ HelloWorld String"#,
            br#"
// this is a comment after an empty line
HelloWorld
            "#
        );
    }
3685
    // Exercises the `string` test with its modifier flags
    // (`/w`, `/C`, `/c`, `/f`, `/W`), pairing a matching input with a
    // non-matching one where relevant.
    #[test]
    fn test_string_with_mods() {
        // `/w`: the rule contains three escaped blanks after "#!" while the
        // input only has one
        assert_magic_match_text!(
            r#"0	string/w	#!\ \ \ /usr/bin/env\ bash	BASH
        "#,
            b"#! /usr/bin/env bash i
        echo hello world"
        );

        // test uppercase insensitive
        assert_magic_match_text!(
            r#"0	string/C	HelloWorld	it works
        "#,
            b"helloworld"
        );

        // inverted case of the pattern must not match with `/C`
        assert_magic_not_match_text!(
            r#"0	string/C	HelloWorld	it works
        "#,
            b"hELLOwORLD"
        );

        // test lowercase insensitive
        assert_magic_match_text!(
            r#"0	string/c	HelloWorld	it works
        "#,
            b"HELLOWORLD"
        );

        // an all-lowercase input must not match with `/c`
        assert_magic_not_match_text!(
            r#"0	string/c	HelloWorld	it works
        "#,
            b"helloworld"
        );

        // test full word match
        assert_magic_match_text!(
            r#"0	string/f	#!/usr/bin/env\ bash	BASH
        "#,
            b"#!/usr/bin/env bash"
        );

        // "python" is only a prefix of "pythonic": no full word match
        assert_magic_not_match_text!(
            r#"0	string/f	#!/usr/bin/python PYTHON"#,
            b"#!/usr/bin/pythonic"
        );

        // testing whitespace compacting
        assert_magic_match_text!(
            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
            b"#!/usr/bin/env    python"
        );

        // the rule asks for two blanks, the input provides only one
        assert_magic_not_match_text!(
            r#"0	string/W	#!/usr/bin/env\ \ python  PYTHON"#,
            b"#!/usr/bin/env python"
        );
    }
3744
    // Exercises the `search` test with modifier flags and checks how the
    // `/s` flag changes where a relative (`&`) child rule is evaluated.
    #[test]
    fn test_search_with_mods() {
        // range and flags combined; the input has many blanks after "#!"
        // and a trailing blank
        assert_magic_match_text!(
            r#"0	search/1/fwt	#!\ /usr/bin/luatex	LuaTex script text executable"#,
            b"#!          /usr/bin/luatex "
        );

        // test matching from the beginning
        // (with `/s` the child rule can match the searched string again)
        assert_magic_match_text!(
            r#"
            0	search/s	/usr/bin/env
            >&0 string /usr/bin/env it works
            "#,
            b"#!/usr/bin/env    python"
        );

        // without `/s` the same child rule is expected not to match
        assert_magic_not_match_text!(
            r#"
            0	search	/usr/bin/env
            >&0 string /usr/bin/env it works
            "#,
            b"#!/usr/bin/env    python"
        );
    }
3769
    // Exercises `pstring` (length-prefixed string) rules, with the default
    // prefix and with the `H`, `h`, `L`, `l` and `J` modifiers.
    #[test]
    fn test_pstring() {
        // default form: one length byte (5) followed by the string
        assert_magic_match_bin!(r#"0 pstring Toast it works"#, b"\x05Toast");

        // `%s` is expected to be replaced by the string itself
        assert_magic_match_bin!(r#"0 pstring Toast %s"#, b"\x05Toast", "Toast");

        // the prefixed string is "Toaster" (length 7), not "Toast"
        assert_magic_not_match_bin!(r#"0 pstring Toast Doesn't work"#, b"\x07Toaster");

        // testing with modifiers
        // `H`: the input carries a two-byte prefix 00 05
        assert_magic_match_bin!(r#"0 pstring/H Toast it works"#, b"\x00\x05Toast");

        // `J`: in the inputs below the prefix value also counts the prefix
        // bytes (2 + 5 = 7 here, 4 + 5 = 9 further down)
        assert_magic_match_bin!(r#"0 pstring/HJ Toast it works"#, b"\x00\x07Toast");

        assert_magic_match_bin!(r#"0 pstring/HJ Toast %s"#, b"\x00\x07Toast", "Toast");

        // `h`: two-byte prefix 05 00
        assert_magic_match_bin!(r#"0 pstring/h Toast it works"#, b"\x05\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/hJ Toast it works"#, b"\x07\x00Toast");

        // `L`: four-byte prefix 00 00 00 05
        assert_magic_match_bin!(r#"0 pstring/L Toast it works"#, b"\x00\x00\x00\x05Toast");

        assert_magic_match_bin!(r#"0 pstring/LJ Toast it works"#, b"\x00\x00\x00\x09Toast");

        // `l`: four-byte prefix 05 00 00 00
        assert_magic_match_bin!(r#"0 pstring/l Toast it works"#, b"\x05\x00\x00\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/lJ Toast it works"#, b"\x09\x00\x00\x00Toast");
    }
3797
3798    #[test]
3799    fn test_max_recursion() {
3800        let res = first_magic(
3801            r#"0	indirect x"#,
3802            b"#!          /usr/bin/luatex ",
3803            StreamKind::Binary,
3804        );
3805        assert!(res.is_err());
3806        let _ = res.inspect_err(|e| {
3807            assert!(matches!(
3808                e.unwrap_localized(),
3809                Error::MaximumRecursion(MAX_RECURSION)
3810            ))
3811        });
3812    }
3813
    // Exercises the comparison operators accepted by `string` rules:
    // plain equality, `!`, `>` and `<`.
    #[test]
    fn test_string_ops() {
        // equality (with the `/b` flag)
        assert_magic_match_text!("0	string/b MZ MZ File", b"MZ\0");
        // `!`: the input starts with "AZ", not "MZ"
        assert_magic_match_text!("0	string !MZ Not MZ File", b"AZ\0");
        // `>` against a NUL character
        assert_magic_match_text!("0	string >\0 Any String", b"A\0");
        // `>`: "Test 1" compared to "Test"
        assert_magic_match_text!("0	string >Test Any String", b"Test 1\0");
        // `<`: a lone NUL byte compared to "Test"
        assert_magic_match_text!("0	string <Test Any String", b"\0");
        // ... and the same input must not satisfy `>Test`
        assert_magic_not_match_text!("0	string >Test Any String", b"\0");
    }
3823
    // Exercises `lestring16` rules on inputs where every character is
    // stored as two bytes, low byte first.
    #[test]
    fn test_lestring16() {
        // "abcd" stored as 61 00 62 00 63 00 64 00
        assert_magic_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        // `x` matches any value; `%s` is expected to render the decoded
        // string (the input has one extra trailing zero byte)
        assert_magic_match_bin!(
            "0 lestring16 x %s",
            b"\x61\x00\x62\x00\x63\x00\x64\x00\x00",
            "abcd"
        );
        // same characters with the bytes of each pair swapped: no match
        assert_magic_not_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        // string located at a non-zero offset
        assert_magic_match_bin!(
            "4 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64\x00"
        );
    }
3844
    // Exercises `bestring16` rules on inputs where every character is
    // stored as two bytes, high byte first.
    #[test]
    fn test_bestring16() {
        // "abcd" stored as 00 61 00 62 00 63 00 64
        assert_magic_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        // `x` matches any value; `%s` is expected to render the decoded string
        assert_magic_match_bin!(
            "0 bestring16 x %s",
            b"\x00\x61\x00\x62\x00\x63\x00\x64",
            "abcd"
        );
        // same characters with the bytes of each pair swapped: no match
        assert_magic_not_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        // string located at a non-zero offset (four padding bytes first)
        assert_magic_match_bin!(
            "4 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64"
        );
    }
3865
    // Negative offsets: in both inputs the expected byte is found by
    // counting from the end (-1 is the last byte, -2 the one before it).
    #[test]
    fn test_offset_from_end() {
        assert_magic_match_bin!("-1 ubyte 0x42 last byte ok", b"\x00\x00\x42");
        assert_magic_match_bin!("-2 ubyte 0x41 last byte ok", b"\x00\x41\x00");
    }
3871
    // Relative offsets (`&0`) chained over two levels: each child rule
    // tests the byte following the one matched by its parent
    // (0x42, then 0x00, then 0x41 in the input).
    #[test]
    fn test_relative_offset() {
        assert_magic_match_bin!(
            "
            0 ubyte 0x42
            >&0 ubyte 0x00
            >>&0 ubyte 0x41 third byte ok
            ",
            b"\x42\x00\x41\x00"
        );
    }
3883
    // Indirect offsets: the offset of the tested byte is read from the
    // input itself.
    #[test]
    fn test_indirect_offset() {
        // the four bytes at offset 0 hold 04 00 00 00 and 0x42 sits at index 4
        assert_magic_match_bin!("(0.l) ubyte 0x42 it works", b"\x04\x00\x00\x00\x42");
        // adding fixed value to offset
        // (stored value 1, plus 3, 0x42 sits at index 4)
        assert_magic_match_bin!("(0.l+3) ubyte 0x42 it works", b"\x01\x00\x00\x00\x42");
        // testing offset pair
        // (stored value 4, plus the value stored at offset 4 which is also 4;
        // 0x42 sits at index 8)
        assert_magic_match_bin!(
            "(0.l+(4)) ubyte 0x42 it works",
            b"\x04\x00\x00\x00\x04\x00\x00\x00\x42"
        );
    }
3895
    // A `use` line and the `name` line it refers to both carry a message:
    // the expected result is the `use` message followed by the `name` message.
    #[test]
    fn test_use_with_message() {
        assert_magic_match_bin!(
            r#"
0 string MZ
>0 use mz first match

0 name mz then second match
>0 string MZ
"#,
            b"MZ\0",
            "first match then second match"
        );
    }
3910
3911    #[test]
3912    fn test_scalar_transform() {
3913        assert_magic_match_bin!("0 ubyte+1 0x1 add works", b"\x00");
3914        assert_magic_match_bin!("0 ubyte-1 0xfe sub works", b"\xff");
3915        assert_magic_match_bin!("0 ubyte%2 0 mod works", b"\x0a");
3916        assert_magic_match_bin!("0 ubyte&0x0f 0x0f bitand works", b"\xff");
3917        assert_magic_match_bin!("0 ubyte|0x0f 0xff bitor works", b"\xf0");
3918        assert_magic_match_bin!("0 ubyte^0x0f 0xf0 bitxor works", b"\xff");
3919
3920        FileMagicParser::parse_str("0 ubyte%0 mod by zero", None)
3921            .expect_err("expect div by zero error");
3922        FileMagicParser::parse_str("0 ubyte/0 div by zero", None)
3923            .expect_err("expect div by zero error");
3924    }
3925
    // Exercises `belong` (four-byte big-endian value) rules with every
    // comparison operator: `=` (implicit), `<`, `>`, `&`, `^`, `~` and `x`.
    #[test]
    fn test_belong() {
        // Test that a file with a four-byte value at offset 0 that matches the given value in big-endian byte order
        assert_magic_match_bin!("0 belong 0x12345678 Big-endian long", b"\x12\x34\x56\x78");
        // Test that a file with a four-byte value at offset 0 that does not match the given value in big-endian byte order
        assert_magic_not_match_bin!("0 belong 0x12345678 Big-endian long", b"\x78\x56\x34\x12");
        // Test that a file with a four-byte value at a non-zero offset that matches the given value in big-endian byte order
        assert_magic_match_bin!(
            "4 belong 0x12345678 Big-endian long",
            b"\x00\x00\x00\x00\x12\x34\x56\x78"
        );
        // Test < operator
        assert_magic_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x77");
        assert_magic_not_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test > operator
        assert_magic_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x79");
        assert_magic_not_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test & operator
        // (matches when every bit of the mask is set in the value: 0x1234 is
        // missing some of the 0xFFFF bits in the second case)
        assert_magic_match_bin!("0 belong &0x5678 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong &0x0000FFFF Big-endian long", b"\x12\x34\x56\x78");

        // Test ^ operator (bitwise AND with complement)
        // (matches when none of the mask bits is set in the value)
        assert_magic_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x01\x56\x78");

        // Test ~ operator
        // (0xEDCBA987 is the bitwise complement of 0x12345678)
        assert_magic_match_bin!("0 belong ~0x12345678 Big-endian long", b"\xed\xcb\xa9\x87");
        assert_magic_not_match_bin!("0 belong ~0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test x operator
        // (any value matches)
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x12\x34\x56\x78");
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x78\x56\x34\x12");
    }
3961
    // `search` rules written without modifiers and with the range and the
    // flag given in either order.
    // NOTE(review): `parse_assert!` is defined outside this view; presumably
    // it asserts that the rule parses successfully - confirm.
    #[test]
    fn test_parse_search() {
        parse_assert!("0 search test");
        parse_assert!("0 search/24/s test");
        parse_assert!("0 search/s/24 test");
    }
3968
    // `bedate`: four-byte big-endian timestamp.
    // 946684800 == 0x386D4380, stored here as 38 6D 43 80.
    #[test]
    fn test_bedate() {
        assert_magic_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        // an all-zero timestamp must not match
        assert_magic_not_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        // value at a non-zero offset, rendered through `%s`
        assert_magic_match_bin!(
            "4 bedate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
    // `beldate`: four-byte big-endian timestamp (946684800 == 0x386D4380).
    // The expected rendering is produced by `unix_local_time_to_string`
    // rather than hard-coded.
    #[test]
    fn test_beldate() {
        assert_magic_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        // an all-zero timestamp must not match
        assert_magic_not_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        // value at a non-zero offset, rendered through `{}`
        assert_magic_match_bin!(
            "4 beldate 946684800 {}",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            unix_local_time_to_string(946684800)
        );
    }
4002
    // `beqdate`: eight-byte big-endian timestamp
    // (946684800 stored as 00 00 00 00 38 6D 43 80).
    #[test]
    fn test_beqdate() {
        assert_magic_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80"
        );

        // an all-zero timestamp must not match
        assert_magic_not_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        // same value rendered through `%s`
        assert_magic_match_bin!(
            "0 beqdate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
4021
    // `medate`: four-byte middle-endian timestamp.
    // 946684800 == 0x386D4380 is stored here as 6D 38 80 43
    // (the two bytes of each 16-bit half are swapped).
    #[test]
    fn test_medate() {
        assert_magic_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );

        // an all-zero timestamp must not match
        assert_magic_not_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        // value at a non-zero offset, rendered through `%s`
        assert_magic_match_bin!(
            "4 medate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            "2000-01-01 00:00:00"
        );
    }
4040
    // `meldate`: four-byte middle-endian timestamp (946684800 stored as
    // 6D 38 80 43). The expected rendering is produced by
    // `unix_local_time_to_string` rather than hard-coded.
    #[test]
    fn test_meldate() {
        assert_magic_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );
        // an all-zero timestamp must not match
        assert_magic_not_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        // value at a non-zero offset, rendered through `%s`
        assert_magic_match_bin!(
            "4 meldate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            unix_local_time_to_string(946684800)
        );
    }
4058
    // `date`: four-byte timestamp without an explicit endianness prefix.
    // NOTE(review): 946684800 (0x386D4380) is stored low byte first
    // (80 43 6D 38), so this presumably relies on host byte order being
    // little-endian - confirm.
    #[test]
    fn test_date() {
        assert_magic_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        // an all-zero timestamp must not match
        assert_magic_not_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        // value at a non-zero offset, rendered through `{}`
        assert_magic_match_bin!(
            "4 date 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            "2000-01-01 00:00:00"
        );
    }
4075
    // `leldate`: four-byte little-endian timestamp (946684800 ==
    // 0x386D4380 stored as 80 43 6D 38). The expected rendering is produced
    // by `unix_local_time_to_string` rather than hard-coded.
    #[test]
    fn test_leldate() {
        assert_magic_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        // an all-zero timestamp must not match
        assert_magic_not_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        // value at a non-zero offset, rendered through `{}`
        assert_magic_match_bin!(
            "4 leldate 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            unix_local_time_to_string(946684800)
        );
    }
4092
    // `leqdate`: eight-byte little-endian timestamp.
    // 1577836800 == 0x5E0BE100, stored as 00 E1 0B 5E 00 00 00 00.
    #[test]
    fn test_leqdate() {
        assert_magic_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
        );

        // an all-zero timestamp must not match
        assert_magic_not_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        // value at a non-zero offset, rendered through `%s`
        assert_magic_match_bin!(
            "8 leqdate 1577836800 %s",
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
            "2020-01-01 00:00:00"
        );
    }
4110
    // `leqldate`: eight-byte little-endian timestamp (1577836800 ==
    // 0x5E0BE100 stored as 00 E1 0B 5E 00 00 00 00). The expected rendering
    // is produced by `unix_local_time_to_string` rather than hard-coded.
    #[test]
    fn test_leqldate() {
        assert_magic_match_bin!(
            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
        );

        // an all-zero timestamp must not match
        assert_magic_not_match_bin!(
            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        // value at a non-zero offset, rendered through `%s`
        assert_magic_match_bin!(
            "8 leqldate 1577836800 %s",
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
            unix_local_time_to_string(1577836800)
        );
    }
4128
    // Exercises `melong` (four-byte middle-endian value) rules with every
    // comparison operator. As established by the `=` case, the input bytes
    // b0 b1 b2 b3 decode to the value b1 b0 b3 b2 (most significant byte
    // first), e.g. 34 12 78 56 decodes to 0x12345678.
    #[test]
    fn test_melong() {
        // Test = operator
        assert_magic_match_bin!(
            "0 melong =0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        );
        assert_magic_not_match_bin!(
            "0 melong =0x12345678 Middle-endian long",
            b"\x00\x00\x00\x00"
        );

        // Test < operator
        assert_magic_match_bin!(
            "0 melong <0x12345678 Middle-endian long",
            b"\x34\x12\x78\x55"
        ); // decodes to 0x12345578, which is below 0x12345678
        assert_magic_not_match_bin!(
            "0 melong <0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // 0x12345678 in middle-endian

        // Test > operator
        assert_magic_match_bin!(
            "0 melong >0x12345678 Middle-endian long",
            b"\x34\x12\x78\x57"
        ); // decodes to 0x12345778, which is above 0x12345678
        assert_magic_not_match_bin!(
            "0 melong >0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // 0x12345678 in middle-endian

        // Test & operator
        assert_magic_match_bin!("0 melong &0x5678 Middle-endian long", b"\xab\xcd\x78\x56"); // decodes to 0xCDAB5678: every bit of 0x5678 is set
        assert_magic_not_match_bin!(
            "0 melong &0x0000FFFF Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // decodes to 0x12345678: the low half is not 0xFFFF

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!(
            "0 melong ^0xFFFF0000 Middle-endian long",
            b"\x00\x00\x78\x56"
        ); // decodes to 0x00005678: no bit of the mask is set
        assert_magic_not_match_bin!(
            "0 melong ^0xFFFF0000 Middle-endian long",
            b"\x00\x01\x78\x56"
        ); // decodes to 0x01005678: one bit of the mask is set

        // Test ~ operator
        // (CB ED 87 A9 decodes to 0xEDCBA987, the complement of 0x12345678)
        assert_magic_match_bin!(
            "0 melong ~0x12345678 Middle-endian long",
            b"\xCB\xED\x87\xA9"
        );
        assert_magic_not_match_bin!(
            "0 melong ~0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // The original value

        // Test x operator
        // (any value matches)
        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x34\x12\x78\x56");
        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x00\x00\x00\x00");
    }
4192
    // Exercises `uquad` (unsigned eight-byte value) rules with every
    // comparison operator.
    // NOTE(review): the inputs store 0x123456789ABCDEF0 low byte first, so
    // this presumably relies on host byte order being little-endian - confirm.
    #[test]
    fn test_uquad() {
        // Test = operator
        assert_magic_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        assert_magic_not_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        // Test < operator
        // (last input byte is 0x11 instead of 0x12)
        assert_magic_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x11"
        );
        assert_magic_not_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test > operator
        // (last input byte is 0x13 instead of 0x12)
        assert_magic_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x13"
        );
        assert_magic_not_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test & operator
        // (the first input byte is 0xF0: all bits of 0xF0 are set, but not
        // all bits of 0xFF)
        assert_magic_match_bin!(
            "0 uquad &0xF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        assert_magic_not_match_bin!(
            "0 uquad &0xFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        ); // All bits clear
        assert_magic_not_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        ); // Some bits set

        // Test ~ operator
        // (every input byte is the complement of the matching byte above)
        assert_magic_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\x0F\x21\x43\x65\x87\xA9\xCB\xED"
        );
        assert_magic_not_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        ); // The original value

        // Test x operator
        // (any value matches; `{:#x}` renders it as prefixed lowercase hex)
        assert_magic_match_bin!(
            "0 uquad x {:#x}",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12",
            "0x123456789abcdef0"
        );
        assert_magic_match_bin!(
            "0 uquad x Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }
4266
    // Exercises `guid` rules on sixteen-byte inputs. In the matching case
    // the textual GUID lists the input bytes in the order they are stored.
    #[test]
    fn test_guid() {
        assert_magic_match_bin!(
            "0 guid EC959539-6786-2D4E-8FDB-98814CE76C1E It works",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E"
        );

        // unrelated input bytes must not match
        assert_magic_not_match_bin!(
            "0 guid 399595EC-8667-4E2D-8FDB-98814CE76C1E It works",
            b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
        );

        // `x` matches any value; `%s` is expected to render the GUID text
        assert_magic_match_bin!(
            "0 guid x %s",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E",
            "EC959539-6786-2D4E-8FDB-98814CE76C1E"
        );
    }
4285
    // `ubeqdate`: eight-byte big-endian timestamp.
    // 1633046400 == 0x61564F80, stored as 00 00 00 00 61 56 4F 80.
    #[test]
    fn test_ubeqdate() {
        assert_magic_match_bin!(
            "0 ubeqdate 1633046400 It works",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80"
        );

        // `x` matches any value; `%s` is expected to render the date
        assert_magic_match_bin!(
            "0 ubeqdate x %s",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80",
            "2021-10-01 00:00:00"
        );

        // an all-zero timestamp must not match
        assert_magic_not_match_bin!(
            "0 ubeqdate 1633046400 It should not work",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }
4304
    // `ldate`: four-byte timestamp without an explicit endianness prefix.
    // NOTE(review): 1640551520 (0x61C8D460) is stored low byte first
    // (60 D4 C8 61), so this presumably relies on host byte order being
    // little-endian - confirm.
    #[test]
    fn test_ldate() {
        assert_magic_match_bin!("0 ldate 1640551520 It works", b"\x60\xd4\xC8\x61");

        // an all-zero timestamp must not match
        assert_magic_not_match_bin!("0 ldate 1633046400 It should not work", b"\x00\x00\x00\x00");

        // expected rendering is produced by `unix_local_time_to_string`
        assert_magic_match_bin!(
            "0 ldate x %s",
            b"\x60\xd4\xC8\x61",
            unix_local_time_to_string(1640551520)
        );
    }
4317
    // The transformed value (not the raw byte) is both compared and
    // formatted: the input byte is 0x14 (20), 20 / 10 = 2 and 20 % 10 = 0.
    #[test]
    fn test_scalar_with_transform() {
        assert_magic_match_bin!("0 ubyte/10 2 {}", b"\x14", "2");
        assert_magic_match_bin!("0 ubyte/10 x {}", b"\x14", "2");
        assert_magic_match_bin!("0 ubyte%10 x {}", b"\x14", "0");
    }
4324
    // Same checks as for scalars, on a little-endian float: the input
    // 00 00 A0 41 is the IEEE-754 single precision encoding of 20.0,
    // 20 / 10 = 2 and 20 % 10 = 0.
    #[test]
    fn test_float_with_transform() {
        assert_magic_match_bin!("0 lefloat/10 2 {}", b"\x00\x00\xa0\x41", "2");
        assert_magic_match_bin!("0 lefloat/10 x {}", b"\x00\x00\xa0\x41", "2");
        assert_magic_match_bin!("0 lefloat%10 x {}", b"\x00\x00\xa0\x41", "0");
    }
4331
    // Pins the contract of `read_octal_u64` as seen through these cases:
    // the input must start with '0', digits are consumed up to the first
    // non-octal character, and `None` is returned when there is no leading '0'.
    #[test]
    fn test_read_octal() {
        // Basic cases
        assert_eq!(read_octal_u64(&mut lazy_cache!("0")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("00")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("01")), Some(1));
        assert_eq!(read_octal_u64(&mut lazy_cache!("07")), Some(7));
        assert_eq!(read_octal_u64(&mut lazy_cache!("010")), Some(8));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0123")), Some(83));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0755")), Some(493));

        // With trailing non-octal characters
        assert_eq!(read_octal_u64(&mut lazy_cache!("0ABC")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("01ABC")), Some(1));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0755ABC")), Some(493));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0123ABC")), Some(83));

        // Invalid octal digits
        assert_eq!(read_octal_u64(&mut lazy_cache!("08")), Some(0)); // stops at '8'
        assert_eq!(read_octal_u64(&mut lazy_cache!("01238")), Some(83)); // stops at '8'

        // No leading '0'
        assert_eq!(read_octal_u64(&mut lazy_cache!("123")), None);
        assert_eq!(read_octal_u64(&mut lazy_cache!("755")), None);

        // Empty string
        assert_eq!(read_octal_u64(&mut lazy_cache!("")), None);

        // Only non-octal characters
        assert_eq!(read_octal_u64(&mut lazy_cache!("ABC")), None);
        assert_eq!(read_octal_u64(&mut lazy_cache!("8ABC")), None); // first char is not '0'

        // Longer valid octal (but within u64 range)
        // (octal 1777777777 == 2 * 8^9 - 1 == 268435455)
        assert_eq!(
            read_octal_u64(&mut lazy_cache!("01777777777")),
            Some(268435455)
        );
    }
4370
    // Regression test for offset handling across nested `use` calls:
    // a top-level rule at a positive offset calls `toasted` through an
    // indirect offset, which in turn calls `toasted_twice`; the three
    // messages are expected to be combined.
    #[test]
    fn test_offset_bug_1() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
# offset computation is relative to
# rule start
>(5.b)	use toasted

0 name toasted
>0	string twice Toasted
>>0  use toasted_twice 

0 name toasted_twice
>(6.b) string x %s
        ",
            b"\x00TEST\x06twice\x00\x06",
            "Bread is Toasted twice"
        );
    }
4393
4394    // this test implement the exact same logic as
4395    // test_offset_bug_1 except that the rule starts
4396    // matching from end. Surprisingly we need to
4397    // adjust indirect offsets so that it works in
4398    // libmagic/file
4399    #[test]
4400    fn test_offset_bug_2() {
4401        // this tests the exact behaviour
4402        // expected by libmagic/file
4403        assert_magic_match_bin!(
4404            r"
4405-12	string		TEST Bread is
4406>(4.b)	use toasted
4407
44080 name toasted
4409>0	string twice Toasted
4410>>0  use toasted_twice
4411
44120 name toasted_twice
4413>(6.b) string x %
4414        ",
4415            b"\x00TEST\x06twice\x00\x06",
4416            "Bread is Toasted twice"
4417        )
4418    }
4419
    // Regression test for offset handling when the second rule is reached
    // through `indirect/r` instead of `use`, and then calls `toasted_twice`.
    #[test]
    fn test_offset_bug_3() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
>(5.b) indirect/r x

0	string twice Toasted
>0  use toasted_twice

0 name toasted_twice
>0 string x %s
        ",
            b"\x00TEST\x06twice\x00\x08",
            "Bread is Toasted twice"
        )
    }
4439
    // Regression test combining `indirect/r` with a `use` reached through
    // an indirect offset; see the comments embedded in the rule text for
    // the base offset expected at each step.
    #[test]
    fn test_offset_bug_4() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		Bread %s
>(6.b) indirect/r x

# this one uses a based offset
# computed at indirection
1	string is\ Toasted %s
>(11.b)  use toasted_twice

# this one is using a new base
# offset being previous base 
# offset + offset of use
0 name toasted_twice
>0 string x %s
            ",
            b"\x00Bread\x06is Toasted\x0ctwice\x00",
            "Bread is Toasted twice"
        )
    }
4464
    // Regression test for a relative offset (`&1`) used inside a rule
    // reached through `indirect/r` followed by `use`: the byte tested by the
    // last line is the trailing 0x08 of the input.
    #[test]
    fn test_offset_bug_5() {
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
>(5.b) indirect/r x

0	string twice Toasted
>0  use toasted_twice

0 name toasted_twice
>0 string twice
>>&1 byte 0x08 twice
            ",
            b"\x00TEST\x06twice\x00\x08",
            "Bread is Toasted twice"
        )
    }
4483
4484    #[test]
4485    fn test_message_parts() {
4486        let m = first_magic(
4487            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
4488            b"#!/usr/bin/env    python",
4489            StreamKind::Text(TextEncoding::Ascii),
4490        )
4491        .unwrap();
4492
4493        assert!(m.message_parts().any(|p| p.eq_ignore_ascii_case("python")))
4494    }
4495}