pure_magic/
lib.rs

1#![forbid(unsafe_code)]
2#![deny(unused_imports)]
3//! # `pure-magic`: A Safe Rust Reimplementation of `libmagic`
4//!
5//! This crate provides a high-performance, memory-safe alternative to the traditional
6//! `libmagic` (used by the `file` command). It supports **file type detection**,
7//! **MIME type inference**, and **custom magic rule parsing**.
8//!
9//! ## Installation
10//! Add `pure-magic` to your `Cargo.toml`:
11//!
12//! ```toml
13//! [dependencies]
14//! pure-magic = "0.1"  # Replace with the latest version
15//! ```
16//!
17//! Or add the latest version with cargo:
18//!
19//! ```sh
20//! cargo add pure-magic
21//! ```
22//!
23//! ## Quick Start
24//!
25//! ### Detect File Types Programmatically
26//! ```rust
27//! use pure_magic::{MagicDb, MagicSource};
28//! use std::fs::File;
29//!
30//! fn main() -> Result<(), Box<dyn std::error::Error>> {
31//!     let mut db = MagicDb::new();
32//!     // Create a MagicSource from a file
33//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
34//!     db.load(rust_magic)?;
35//!
36//!     // Open a file and detect its type
37//!     let mut file = File::open("src/lib.rs")?;
38//!     let magic = db.first_magic(&mut file, None)?;
39//!
40//!     println!(
41//!         "File type: {} (MIME: {}, strength: {})",
42//!         magic.message(),
43//!         magic.mime_type(),
44//!         magic.strength()
45//!     );
46//!     Ok(())
47//! }
48//! ```
49//!
50//! ### Get All Matching Rules
51//! ```rust
52//! use pure_magic::{MagicDb, MagicSource};
53//! use std::fs::File;
54//!
55//! fn main() -> Result<(), Box<dyn std::error::Error>> {
56//!     let mut db = MagicDb::new();
57//!     // Create a MagicSource from a file
58//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
59//!     db.load(rust_magic)?;
60//!
61//!     // Open a file and detect its type
62//!     let mut file = File::open("src/lib.rs")?;
63//!
64//!     // Get all matching rules, sorted by strength
65//!     let magics = db.all_magics(&mut file)?;
66//!
67//!     // Must contain rust file magic and default text magic
68//!     assert!(magics.len() > 1);
69//!
70//!     for magic in magics {
71//!         println!(
72//!             "Match: {} (strength: {}, source: {})",
73//!             magic.message(),
74//!             magic.strength(),
75//!             magic.source().unwrap_or("unknown")
76//!         );
77//!     }
78//!     Ok(())
79//! }
80//! ```
81//!
82//! ### Serialize a Database to Disk
83//! ```rust
84//! use pure_magic::{MagicDb, MagicSource};
85//! use std::fs::File;
86//!
87//! fn main() -> Result<(), Box<dyn std::error::Error>> {
88//!     let mut db = MagicDb::new();
89//!     // Create a MagicSource from a file
90//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
91//!     db.load(rust_magic)?;
92//!
93//!     // Serialize the database to a file
94//!     let mut output = File::create("/tmp/compiled.db")?;
95//!     db.serialize(&mut output)?;
96//!
97//!     println!("Database saved to file");
98//!     Ok(())
99//! }
100//! ```
101//!
102//! ### Deserialize a Database
103//! ```rust
104//! use pure_magic::{MagicDb, MagicSource};
105//! use std::fs::File;
106//!
107//! fn main() -> Result<(), Box<dyn std::error::Error>> {
108//!     let mut db = MagicDb::new();
109//!     // Create a MagicSource from a file
110//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
111//!     db.load(rust_magic)?;
112//!
//!     // Serialize the database into a vector
114//!     let mut ser = vec![];
115//!     db.serialize(&mut ser)?;
116//!     println!("Database saved to vector");
117//!
//!     // Deserialize from a slice
119//!     let db = MagicDb::deserialize(&mut ser.as_slice())?;
120//!
121//!     assert!(!db.rules().is_empty());
122//!
123//!     Ok(())
124//! }
125//! ```
126//!
127//! ## License
128//! This project is licensed under the **GPL-3.0 License**.
129//!
130//! ## Contributing
131//! Contributions are welcome! Open an issue or submit a pull request.
132//!
133//! ## Acknowledgments
134//! - Inspired by the original `libmagic` (part of the `file` command).
135
136use dyf::{DynDisplay, FormatString, dformat};
137use flagset::{FlagSet, flags};
138use flate2::{Compression, read::GzDecoder, write::GzEncoder};
139use lazy_cache::LazyCache;
140use memchr::memchr;
141use pest::{Span, error::ErrorVariant};
142use regex::bytes::{self};
143use serde::{Deserialize, Serialize};
144use std::{
145    borrow::Cow,
146    cmp::max,
147    collections::{HashMap, HashSet},
148    fmt::{self, Debug, Display},
149    io::{self, Read, Seek, SeekFrom, Write},
150    ops::{Add, BitAnd, BitOr, BitXor, Div, Mul, Rem, Sub},
151    path::Path,
152};
153use tar::Archive;
154use thiserror::Error;
155use tracing::{Level, debug, enabled, trace};
156
157use crate::{
158    numeric::{Float, FloatDataType, Scalar, ScalarDataType},
159    parser::{FileMagicParser, Rule},
160    utils::{decode_id3, find_json_boundaries},
161};
162
163mod numeric;
164mod parser;
165mod utils;
166
// Strength value (2048) for "hardcoded" magics — presumably the built-in
// matches that do not come from a rule file; usage is not visible here.
const HARDCODED_MAGIC_STRENGTH: u64 = 2048;
// Source label paired with the hardcoded magics — presumably reported where a
// rule file name would otherwise appear; verify against callers.
const HARDCODED_SOURCE: &str = "hardcoded";
// Corresponds to the FILE_INDIR_MAX constant defined in libmagic.
const MAX_RECURSION: usize = 50;
/// Constant found in libmagic (7 MiB). It is used as a limit for search tests.
pub const FILE_BYTES_MAX: usize = 7 * 1024 * 1024;
// Constant found in libmagic (8 KiB). It is used as a limit for regex tests.
const FILE_REGEX_MAX: usize = 8192;

/// The `application/octet-stream` MIME type — presumably the fallback reported
/// for binary content; the code using it is outside this part of the file.
pub const DEFAULT_BIN_MIMETYPE: &str = "application/octet-stream";
/// The `text/plain` MIME type — presumably the fallback reported for text
/// content; the code using it is outside this part of the file.
pub const DEFAULT_TEXT_MIMETYPE: &str = "text/plain";

// strftime-like pattern (`YYYY-MM-DD HH:MM:SS`) — presumably used to render
// date values; TODO confirm against the formatting code.
pub(crate) const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
180
// Panics with the given `panic!` arguments, but only in builds where
// `debug_assertions` is enabled. In other builds the condition is `false`
// and execution simply continues after the macro invocation.
macro_rules! debug_panic {
    ($($arg:tt)*) => {
        if cfg!(debug_assertions) {
            panic!($($arg)*);
        }
    };
}
188
// Reads exactly `size_of::<$ty>()` bytes from `$r` with `read_exact` and
// evaluates to the resulting byte array. The read error is propagated with
// `?`, so the macro can only be used inside a function whose error type can
// be built from the error returned by `read_exact`.
macro_rules! read {
    ($r: expr, $ty: ty) => {{
        let mut a = [0u8; std::mem::size_of::<$ty>()];
        $r.read_exact(&mut a)?;
        a
    }};
}

// Reads a `$ty` from `$r`, interpreting the bytes as little-endian.
macro_rules! read_le {
    ($r:expr, $ty: ty ) => {{ <$ty>::from_le_bytes(read!($r, $ty)) }};
}

// Reads a `$ty` from `$r`, interpreting the bytes as big-endian.
macro_rules! read_be {
    ($r:expr, $ty: ty ) => {{ <$ty>::from_be_bytes(read!($r, $ty)) }};
}

// Reads two consecutive little-endian `u16` from `$r` and combines them into
// an `i32`: the first one read becomes the upper 16 bits, the second one the
// lower 16 bits ("me" presumably stands for middle-endian).
macro_rules! read_me {
    ($r: expr) => {{ ((read_le!($r, u16) as i32) << 16) | (read_le!($r, u16) as i32) }};
}
208
209#[inline(always)]
210fn read_octal_u64<R: Read + Seek>(haystack: &mut LazyCache<R>) -> Option<u64> {
211    let s = haystack
212        .read_while_or_limit(|b| matches!(b, b'0'..=b'7'), 22)
213        .map(|buf| str::from_utf8(buf))
214        .ok()?
215        .ok()?;
216
217    if !s.starts_with("0") {
218        return None;
219    }
220
221    u64::from_str_radix(s, 8).ok()
222}
223
224/// Represents all possible errors that can occur during file type detection and processing.
225#[derive(Debug, Error)]
226pub enum Error {
227    /// A generic error with a custom message.
228    #[error("{0}")]
229    Msg(String),
230
231    /// An error with a source location and a nested error.
232    #[error("source={0} line={1} error={2}")]
233    Localized(String, usize, Box<Error>),
234
235    /// Indicates a required rule was not found.
236    #[error("missing rule: {0}")]
237    MissingRule(String),
238
239    /// Indicates the maximum recursion depth was reached.
240    #[error("maximum recursion reached: {0}")]
241    MaximumRecursion(usize),
242
243    /// Wraps an I/O error.
244    #[error("io: {0}")]
245    Io(#[from] io::Error),
246
247    /// Wraps a parsing error from the `pest` parser.
248    #[error("parser error: {0}")]
249    Parse(#[from] Box<pest::error::Error<Rule>>),
250
251    /// Wraps a formatting error from the `dyf` crate.
252    #[error("formatting: {0}")]
253    Format(#[from] dyf::Error),
254
255    /// Wraps a regex-related error.
256    #[error("regex: {0}")]
257    Regex(#[from] regex::Error),
258
259    /// Wraps a serialization error from `bincode`.
260    #[error("{0}")]
261    Serialize(#[from] bincode::error::EncodeError),
262
263    /// Wraps a deserialization error from `bincode`.
264    #[error("{0}")]
265    Deserialize(#[from] bincode::error::DecodeError),
266}
267
268impl Error {
269    #[inline]
270    fn parser<S: ToString>(msg: S, span: Span<'_>) -> Self {
271        Self::Parse(Box::new(pest::error::Error::new_from_span(
272            ErrorVariant::CustomError {
273                message: msg.to_string(),
274            },
275            span,
276        )))
277    }
278
279    fn msg<M: AsRef<str>>(msg: M) -> Self {
280        Self::Msg(msg.as_ref().into())
281    }
282
283    fn localized<S: AsRef<str>>(source: S, line: usize, err: Error) -> Self {
284        Self::Localized(source.as_ref().into(), line, err.into())
285    }
286
287    /// Unwraps the localized error
288    pub fn unwrap_localized(&self) -> &Self {
289        match self {
290            Self::Localized(_, _, e) => e,
291            _ => self,
292        }
293    }
294}
295
296#[derive(Debug, Clone, Serialize, Deserialize)]
297enum Message {
298    String(String),
299    Format {
300        printf_spec: String,
301        fs: FormatString,
302    },
303}
304
305impl Display for Message {
306    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
307        match self {
308            Self::String(s) => write!(f, "{s}"),
309            Self::Format { printf_spec: _, fs } => write!(f, "{}", fs.to_string_lossy()),
310        }
311    }
312}
313
314impl Message {
315    fn to_string_lossy(&self) -> Cow<'_, str> {
316        match self {
317            Message::String(s) => Cow::Borrowed(s),
318            Message::Format { printf_spec: _, fs } => fs.to_string_lossy(),
319        }
320    }
321
322    #[inline(always)]
323    fn format_with(&self, mr: Option<&MatchRes>) -> Result<Cow<'_, str>, Error> {
324        match self {
325            Self::String(s) => Ok(Cow::Borrowed(s.as_str())),
326            Self::Format {
327                printf_spec: c_spec,
328                fs,
329            } => {
330                if let Some(mr) = mr {
331                    match mr {
332                        MatchRes::Float(_, _) | MatchRes::Bytes(_, _, _, _) => {
333                            Ok(Cow::Owned(dformat!(fs, mr)?))
334                        }
335                        MatchRes::Scalar(_, scalar) => {
336                            // we want to print a byte as char
337                            if c_spec.as_str() == "c" {
338                                match scalar {
339                                    Scalar::byte(b) => {
340                                        let b = (*b as u8) as char;
341                                        Ok(Cow::Owned(dformat!(fs, b)?))
342                                    }
343                                    Scalar::ubyte(b) => {
344                                        let b = *b as char;
345                                        Ok(Cow::Owned(dformat!(fs, b)?))
346                                    }
347                                    _ => Ok(Cow::Owned(dformat!(fs, mr)?)),
348                                }
349                            } else {
350                                Ok(Cow::Owned(dformat!(fs, mr)?))
351                            }
352                        }
353                    }
354                } else {
355                    Ok(fs.to_string_lossy())
356                }
357            }
358        }
359    }
360}
361
362impl ScalarDataType {
363    #[inline(always)]
364    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Scalar, Error> {
365        macro_rules! _read_le {
366            ($ty: ty) => {{
367                if switch_endianness {
368                    <$ty>::from_be_bytes(read!(from, $ty))
369                } else {
370                    <$ty>::from_le_bytes(read!(from, $ty))
371                }
372            }};
373        }
374
375        macro_rules! _read_be {
376            ($ty: ty) => {{
377                if switch_endianness {
378                    <$ty>::from_le_bytes(read!(from, $ty))
379                } else {
380                    <$ty>::from_be_bytes(read!(from, $ty))
381                }
382            }};
383        }
384
385        macro_rules! _read_ne {
386            ($ty: ty) => {{
387                if cfg!(target_endian = "big") {
388                    _read_be!($ty)
389                } else {
390                    _read_le!($ty)
391                }
392            }};
393        }
394
395        macro_rules! _read_me {
396            () => {
397                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
398            };
399        }
400
401        Ok(match self {
402            // signed
403            Self::byte => Scalar::byte(read!(from, u8)[0] as i8),
404            Self::short => Scalar::short(_read_ne!(i16)),
405            Self::long => Scalar::long(_read_ne!(i32)),
406            Self::date => Scalar::date(_read_ne!(i32)),
407            Self::ldate => Scalar::ldate(_read_ne!(i32)),
408            Self::qwdate => Scalar::qwdate(_read_ne!(i64)),
409            Self::leshort => Scalar::leshort(_read_le!(i16)),
410            Self::lelong => Scalar::lelong(_read_le!(i32)),
411            Self::lequad => Scalar::lequad(_read_le!(i64)),
412            Self::bequad => Scalar::bequad(_read_be!(i64)),
413            Self::belong => Scalar::belong(_read_be!(i32)),
414            Self::bedate => Scalar::bedate(_read_be!(i32)),
415            Self::beldate => Scalar::beldate(_read_be!(i32)),
416            Self::beqdate => Scalar::beqdate(_read_be!(i64)),
417            // unsigned
418            Self::ubyte => Scalar::ubyte(read!(from, u8)[0]),
419            Self::ushort => Scalar::ushort(_read_ne!(u16)),
420            Self::uleshort => Scalar::uleshort(_read_le!(u16)),
421            Self::ulelong => Scalar::ulelong(_read_le!(u32)),
422            Self::uledate => Scalar::uledate(_read_le!(u32)),
423            Self::ulequad => Scalar::ulequad(_read_le!(u64)),
424            Self::offset => Scalar::offset(from.stream_position()?),
425            Self::ubequad => Scalar::ubequad(_read_be!(u64)),
426            Self::medate => Scalar::medate(_read_me!()),
427            Self::meldate => Scalar::meldate(_read_me!()),
428            Self::melong => Scalar::melong(_read_me!()),
429            Self::beshort => Scalar::beshort(_read_be!(i16)),
430            Self::quad => Scalar::quad(_read_ne!(i64)),
431            Self::uquad => Scalar::uquad(_read_ne!(u64)),
432            Self::ledate => Scalar::ledate(_read_le!(i32)),
433            Self::leldate => Scalar::leldate(_read_le!(i32)),
434            Self::leqdate => Scalar::leqdate(_read_le!(i64)),
435            Self::leqldate => Scalar::leqldate(_read_le!(i64)),
436            Self::leqwdate => Scalar::leqwdate(_read_le!(i64)),
437            Self::ubelong => Scalar::ubelong(_read_be!(u32)),
438            Self::ulong => Scalar::ulong(_read_ne!(u32)),
439            Self::ubeshort => Scalar::ubeshort(_read_be!(u16)),
440            Self::ubeqdate => Scalar::ubeqdate(_read_be!(u64)),
441            Self::lemsdosdate => Scalar::lemsdosdate(_read_le!(u16)),
442            Self::lemsdostime => Scalar::lemsdostime(_read_le!(u16)),
443            Self::guid => Scalar::guid(u128::from_be_bytes(read!(from, u128))),
444        })
445    }
446}
447
448impl FloatDataType {
449    #[inline(always)]
450    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Float, Error> {
451        macro_rules! _read_le {
452            ($ty: ty) => {{
453                if switch_endianness {
454                    <$ty>::from_be_bytes(read!(from, $ty))
455                } else {
456                    <$ty>::from_le_bytes(read!(from, $ty))
457                }
458            }};
459        }
460
461        macro_rules! _read_be {
462            ($ty: ty) => {{
463                if switch_endianness {
464                    <$ty>::from_le_bytes(read!(from, $ty))
465                } else {
466                    <$ty>::from_be_bytes(read!(from, $ty))
467                }
468            }};
469        }
470
471        macro_rules! _read_ne {
472            ($ty: ty) => {{
473                if cfg!(target_endian = "big") {
474                    _read_be!($ty)
475                } else {
476                    _read_le!($ty)
477                }
478            }};
479        }
480
481        macro_rules! _read_me {
482            () => {
483                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
484            };
485        }
486
487        Ok(match self {
488            Self::lefloat => Float::lefloat(_read_le!(f32)),
489            Self::befloat => Float::befloat(_read_le!(f32)),
490            Self::ledouble => Float::ledouble(_read_le!(f64)),
491            Self::bedouble => Float::bedouble(_read_be!(f64)),
492        })
493    }
494}
495
496#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
497enum Op {
498    Mul,
499    Add,
500    Sub,
501    Div,
502    Mod,
503    And,
504    Xor,
505    Or,
506}
507
508impl Display for Op {
509    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
510        match self {
511            Op::Mul => write!(f, "*"),
512            Op::Add => write!(f, "+"),
513            Op::Sub => write!(f, "-"),
514            Op::Div => write!(f, "/"),
515            Op::Mod => write!(f, "%"),
516            Op::And => write!(f, "&"),
517            Op::Or => write!(f, "|"),
518            Op::Xor => write!(f, "^"),
519        }
520    }
521}
522
523#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
524enum CmpOp {
525    Eq,
526    Lt,
527    Gt,
528    BitAnd,
529    Neq, // ! operator
530    Xor,
531    Not, // ~ operator
532}
533
534impl CmpOp {
535    #[inline(always)]
536    fn is_neq(&self) -> bool {
537        matches!(self, Self::Neq)
538    }
539}
540
541#[derive(Debug, Clone, Serialize, Deserialize)]
542struct ScalarTransform {
543    op: Op,
544    num: Scalar,
545}
546
547impl ScalarTransform {
548    fn apply(&self, s: Scalar) -> Option<Scalar> {
549        match self.op {
550            Op::Add => s.checked_add(self.num),
551            Op::Sub => s.checked_sub(self.num),
552            Op::Mul => s.checked_mul(self.num),
553            Op::Div => s.checked_div(self.num),
554            Op::Mod => s.checked_rem(self.num),
555            Op::And => Some(s.bitand(self.num)),
556            Op::Xor => Some(s.bitxor(self.num)),
557            Op::Or => Some(s.bitor(self.num)),
558        }
559    }
560}
561
562#[derive(Debug, Clone, Serialize, Deserialize)]
563struct FloatTransform {
564    op: Op,
565    num: Float,
566}
567
568impl FloatTransform {
569    fn apply(&self, s: Float) -> Float {
570        match self.op {
571            Op::Add => s.add(self.num),
572            Op::Sub => s.sub(self.num),
573            Op::Mul => s.mul(self.num),
574            // returns inf when div by 0
575            Op::Div => s.div(self.num),
576            // returns NaN when rem by 0
577            Op::Mod => s.rem(self.num),
578            // parser makes sure those operators cannot be used
579            Op::And | Op::Xor | Op::Or => {
580                debug_panic!("unsupported operation");
581                s
582            }
583        }
584    }
585}
586
587#[derive(Debug, Clone, Serialize, Deserialize)]
588enum TestValue<T> {
589    Value(T),
590    Any,
591}
592
593impl<T> TestValue<T> {
594    #[inline(always)]
595    fn as_ref(&self) -> TestValue<&T> {
596        match self {
597            Self::Value(v) => TestValue::Value(v),
598            Self::Any => TestValue::Any,
599        }
600    }
601}
602
// Modifier flags stored in `RegexTest::mods` and `SearchTest::re_mods`.
//
// Usage visible in this part of the file:
// - `ForceBin` makes `RegexTest::is_binary` / `SearchTest::is_binary` return true
// - `ForceText` takes precedence over `ForceBin` in `SearchTest::is_binary`
// The handling of the other flags is outside this part of the file.
flags! {
    enum ReMod: u8{
        CaseInsensitive,
        StartOffsetUpdate,
        LineLimit,
        ForceBin,
        ForceText,
        TrimMatch,
    }
}
613
614fn serialize_regex<S>(re: &bytes::Regex, serializer: S) -> Result<S::Ok, S::Error>
615where
616    S: serde::Serializer,
617{
618    re.as_str().serialize(serializer)
619}
620
621fn deserialize_regex<'de, D>(deserializer: D) -> Result<bytes::Regex, D::Error>
622where
623    D: serde::Deserializer<'de>,
624{
625    let wrapper = String::deserialize(deserializer)?;
626    bytes::Regex::new(&wrapper).map_err(serde::de::Error::custom)
627}
628
629#[derive(Debug, Clone, Serialize, Deserialize)]
630struct RegexTest {
631    #[serde(
632        serialize_with = "serialize_regex",
633        deserialize_with = "deserialize_regex"
634    )]
635    re: bytes::Regex,
636    length: Option<usize>,
637    mods: FlagSet<ReMod>,
638    str_mods: FlagSet<StringMod>,
639    non_magic_len: usize,
640    binary: bool,
641    cmp_op: CmpOp,
642}
643
644impl RegexTest {
645    #[inline(always)]
646    fn is_binary(&self) -> bool {
647        self.binary
648            || self.mods.contains(ReMod::ForceBin)
649            || self.str_mods.contains(StringMod::ForceBin)
650    }
651
652    fn match_buf<'buf>(
653        &self,
654        off_buf: u64, // absolute buffer offset in content
655        stream_kind: StreamKind,
656        buf: &'buf [u8],
657    ) -> Option<MatchRes<'buf>> {
658        let mr = match stream_kind {
659            StreamKind::Text(_) => {
660                let mut off_txt = off_buf;
661
662                let mut line_limit = self.length.unwrap_or(usize::MAX);
663
664                for line in buf.split(|c| c == &b'\n') {
665                    // we don't need to break on offset
666                    // limit as buf contains the good amount
667                    // of bytes to match against
668                    if line_limit == 0 {
669                        break;
670                    }
671
672                    if let Some(re_match) = self.re.find(line) {
673                        // the offset of the string is computed from the start of the buffer
674                        let start_offset = off_txt + re_match.start() as u64;
675
676                        // if we matched until EOL we need to add one to include the delimiter removed from the split
677                        let stop_offset = if re_match.end() == line.len() {
678                            Some(start_offset + re_match.as_bytes().len() as u64 + 1)
679                        } else {
680                            None
681                        };
682
683                        return Some(MatchRes::Bytes(
684                            start_offset,
685                            stop_offset,
686                            re_match.as_bytes(),
687                            Encoding::Utf8,
688                        ));
689                    }
690
691                    off_txt += line.len() as u64;
692                    // we have to add one because lines do not contain splitting character
693                    off_txt += 1;
694                    line_limit = line_limit.saturating_sub(1)
695                }
696                None
697            }
698
699            StreamKind::Binary => {
700                self.re.find(buf).map(|re_match| {
701                    MatchRes::Bytes(
702                        // the offset of the string is computed from the start of the buffer
703                        off_buf + re_match.start() as u64,
704                        None,
705                        re_match.as_bytes(),
706                        Encoding::Utf8,
707                    )
708                })
709            }
710        };
711
712        // handle the case where we want the regex not to match
713        if self.cmp_op.is_neq() && mr.is_none() {
714            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
715        }
716
717        mr
718    }
719}
720
721impl From<RegexTest> for Test {
722    fn from(value: RegexTest) -> Self {
723        Self::Regex(value)
724    }
725}
726
// Modifier flags for string, search and regex tests.
//
// Usage visible in this part of the file:
// - `ForceBin` / `ForceText`: read by the `is_binary` / `is_text` helpers
// - `UpperInsensitive`: upper case characters of the test match both cases
//   in the target (see `string_match`)
// - `LowerInsensitive`: lower case characters of the test match both cases
//   in the target (see `string_match`)
// - `FullWordMatch`: the match must be delimited by whitespace
// - `CompactWhitespace`: a run of blanks in the test matches a run of blanks
//   at least as long in the target
// - `OptBlank`: blanks are ignored both in the test and in the target
// The handling of `Trim` is outside this part of the file.
flags! {
    enum StringMod: u8{
        ForceBin,
        UpperInsensitive,
        LowerInsensitive,
        FullWordMatch,
        Trim,
        ForceText,
        CompactWhitespace,
        OptBlank,
    }
}
739
740#[derive(Debug, Clone, Serialize, Deserialize)]
741struct StringTest {
742    test_val: TestValue<Vec<u8>>,
743    cmp_op: CmpOp,
744    length: Option<usize>,
745    mods: FlagSet<StringMod>,
746    binary: bool,
747}
748
749impl From<StringTest> for Test {
750    fn from(value: StringTest) -> Self {
751        Self::String(value)
752    }
753}
754
755#[inline(always)]
756fn string_match(str: &[u8], mods: FlagSet<StringMod>, buf: &[u8]) -> (bool, usize) {
757    let mut consumed = 0;
758    // we can do a simple string comparison
759    if mods.is_disjoint(
760        StringMod::UpperInsensitive
761            | StringMod::LowerInsensitive
762            | StringMod::FullWordMatch
763            | StringMod::CompactWhitespace
764            | StringMod::OptBlank,
765    ) {
766        // we check if target contains
767        if buf.starts_with(str) {
768            (true, str.len())
769        } else {
770            (false, consumed)
771        }
772    } else {
773        let mut i_src = 0;
774        let mut iter = buf.iter().peekable();
775
776        macro_rules! consume_target {
777            () => {{
778                iter.next();
779                consumed += 1;
780            }};
781        }
782
783        macro_rules! continue_next_iteration {
784            () => {{
785                consume_target!();
786                i_src += 1;
787                continue;
788            }};
789        }
790
791        while let Some(&&b) = iter.peek() {
792            let Some(&ref_byte) = str.get(i_src) else {
793                break;
794            };
795
796            if mods.contains(StringMod::OptBlank) && (b == b' ' || ref_byte == b' ') {
797                if b == b' ' {
798                    // we ignore whitespace in target
799                    consume_target!();
800                }
801
802                if ref_byte == b' ' {
803                    // we ignore whitespace in test
804                    i_src += 1;
805                }
806
807                continue;
808            }
809
810            if mods.contains(StringMod::UpperInsensitive) {
811                //upper case characters in the magic match both lower and upper case characters in the target
812                if ref_byte.is_ascii_uppercase() && ref_byte == b.to_ascii_uppercase()
813                    || ref_byte == b
814                {
815                    continue_next_iteration!()
816                }
817            }
818
819            if mods.contains(StringMod::LowerInsensitive)
820                && (ref_byte.is_ascii_lowercase() && ref_byte == b.to_ascii_lowercase()
821                    || ref_byte == b)
822            {
823                continue_next_iteration!()
824            }
825
826            if mods.contains(StringMod::CompactWhitespace) && ref_byte == b' ' {
827                let mut src_blk = 0;
828                while let Some(b' ') = str.get(i_src) {
829                    src_blk += 1;
830                    i_src += 1;
831                }
832
833                let mut tgt_blk = 0;
834                while let Some(b' ') = iter.peek() {
835                    tgt_blk += 1;
836                    consume_target!();
837                }
838
839                if src_blk > tgt_blk {
840                    return (false, consumed);
841                }
842
843                continue;
844            }
845
846            if ref_byte == b {
847                continue_next_iteration!()
848            } else {
849                return (false, consumed);
850            }
851        }
852
853        if mods.contains(StringMod::FullWordMatch)
854            && let Some(b) = iter.peek()
855            && !b.is_ascii_whitespace()
856        {
857            return (false, consumed);
858        }
859
860        (consumed > 0 && consumed <= buf.len(), consumed)
861    }
862}
863
864impl StringTest {
865    fn has_length_mod(&self) -> bool {
866        !self.mods.is_disjoint(
867            StringMod::UpperInsensitive
868                | StringMod::LowerInsensitive
869                | StringMod::FullWordMatch
870                | StringMod::CompactWhitespace
871                | StringMod::OptBlank,
872        )
873    }
874
875    #[inline(always)]
876    fn test_value_len(&self) -> usize {
877        match self.test_val.as_ref() {
878            TestValue::Value(s) => s.len(),
879            TestValue::Any => 0,
880        }
881    }
882
883    #[inline(always)]
884    fn is_binary(&self) -> bool {
885        self.binary || self.mods.contains(StringMod::ForceBin)
886    }
887
888    #[inline(always)]
889    fn is_text(&self) -> bool {
890        self.mods.contains(StringMod::ForceText)
891    }
892}
893
894#[derive(Debug, Clone, Serialize, Deserialize)]
895struct SearchTest {
896    str: Vec<u8>,
897    n_pos: Option<usize>,
898    str_mods: FlagSet<StringMod>,
899    re_mods: FlagSet<ReMod>,
900    binary: bool,
901    cmp_op: CmpOp,
902}
903
904impl From<SearchTest> for Test {
905    fn from(value: SearchTest) -> Self {
906        Self::Search(value)
907    }
908}
909
910impl SearchTest {
911    #[inline(always)]
912    fn is_binary(&self) -> bool {
913        (self.binary
914            || self.str_mods.contains(StringMod::ForceBin)
915            || self.re_mods.contains(ReMod::ForceBin))
916            && !(self.str_mods.contains(StringMod::ForceText)
917                || self.re_mods.contains(ReMod::ForceText))
918    }
919
920    // off_buf: absolute buffer offset in content
921    #[inline]
922    fn match_buf<'buf>(&self, off_buf: u64, buf: &'buf [u8]) -> Option<MatchRes<'buf>> {
923        let mut i = 0;
924
925        let needle = self.str.first()?;
926
927        while i < buf.len() {
928            // we cannot match if the first character isn't the same
929            // so we accelerate the search by finding potential matches
930            i += memchr(*needle, &buf[i..])?;
931
932            // if we want a full word match
933            if self.str_mods.contains(StringMod::FullWordMatch) {
934                let prev_is_whitespace = buf
935                    .get(i.saturating_sub(1))
936                    .map(|c| c.is_ascii_whitespace())
937                    .unwrap_or_default();
938
939                // if it is not the first character
940                // and its previous character isn't
941                // a whitespace. It cannot be a
942                // fullword match
943                if i > 0 && !prev_is_whitespace {
944                    i += 1;
945                    continue;
946                }
947            }
948
949            if let Some(npos) = self.n_pos
950                && i > npos
951            {
952                break;
953            }
954
955            let pos = i;
956            let (ok, consumed) = string_match(&self.str, self.str_mods, &buf[i..]);
957
958            if ok {
959                return Some(MatchRes::Bytes(
960                    off_buf.saturating_add(pos as u64),
961                    None,
962                    &buf[i..i + consumed],
963                    Encoding::Utf8,
964                ));
965            } else {
966                i += max(consumed, 1)
967            }
968        }
969
970        // handles the case where we want the string not to be found
971        if self.cmp_op.is_neq() {
972            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
973        }
974
975        None
976    }
977}
978
// Test on a scalar value. The code evaluating it is outside this part of
// the file; the fields hold:
// - `ty`: the scalar data type (see `ScalarDataType::read`)
// - `transform`: optional operation, presumably applied to the value read
//   before comparing — TODO confirm
// - `cmp_op`: the comparison operator
// - `test_val`: the value to compare with, or `TestValue::Any`
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTest {
    ty: ScalarDataType,
    transform: Option<ScalarTransform>,
    cmp_op: CmpOp,
    test_val: TestValue<Scalar>,
}
986
// Test on a floating point value. The code evaluating it is outside this
// part of the file; the fields hold:
// - `ty`: the float data type (see `FloatDataType::read`)
// - `transform`: optional operation, presumably applied to the value read
//   before comparing — TODO confirm
// - `cmp_op`: the comparison operator
// - `test_val`: the value to compare with, or `TestValue::Any`
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTest {
    ty: FloatDataType,
    transform: Option<FloatTransform>,
    cmp_op: CmpOp,
    test_val: TestValue<Float>,
}
994
995// the value read from the haystack we want to match against
996// 'buf is the lifetime of the buffer we are scanning
// In every variant the first field is the stream position recorded
// right before the value was read.
997#[derive(Debug, PartialEq)]
998enum ReadValue<'buf> {
    // a floating point value
999    Float(u64, Float),
    // a scalar value
1000    Scalar(u64, Scalar),
    // raw bytes borrowed from the scanned buffer
1001    Bytes(u64, &'buf [u8]),
1002}
1003
1004impl DynDisplay for ReadValue<'_> {
1005    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1006        match self {
1007            Self::Float(_, s) => DynDisplay::dyn_fmt(s, f),
1008            Self::Scalar(_, s) => DynDisplay::dyn_fmt(s, f),
1009            Self::Bytes(_, b) => Ok(format!("{b:?}")),
1010        }
1011    }
1012}
1013
1014impl DynDisplay for &ReadValue<'_> {
1015    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1016        // Dereference self to get the TestValue and call its fmt method
1017        DynDisplay::dyn_fmt(*self, f)
1018    }
1019}
1020
1021impl Display for ReadValue<'_> {
1022    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1023        match self {
1024            Self::Float(_, v) => write!(f, "{v}"),
1025            Self::Scalar(_, s) => write!(f, "{s}"),
1026            Self::Bytes(_, b) => write!(f, "{b:?}"),
1027        }
1028    }
1029}
1030
// Text encoding of the bytes carried by `MatchRes::Bytes`. It selects how
// those bytes are decoded into a string when the match is formatted.
1031enum Encoding {
    // UTF-16, with the given byte order
1032    Utf16(String16Encoding),
    // UTF-8 (decoded lossily when formatting)
1033    Utf8,
1034}
1035
1036// Carry the offset of the start of the data in the stream
1037// and the data itself
1038enum MatchRes<'buf> {
1039    // Bytes.0: offset of the match
1040    // Bytes.1: optional end of match (to address the need of EOL adjustment in string regex)
1041    // Bytes.2: the bytes matching
1042    // Bytes.3: encoding of the buffer
1043    Bytes(u64, Option<u64>, &'buf [u8], Encoding),
    // Scalar.0: offset of the match
    // Scalar.1: the scalar matching (after transformation, if any)
1044    Scalar(u64, Scalar),
    // Float.0: offset of the match
    // Float.1: the float matching (after transformation, if any)
1045    Float(u64, Float),
1046}
1047
1048impl DynDisplay for &MatchRes<'_> {
1049    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1050        (*self).dyn_fmt(f)
1051    }
1052}
1053
1054impl DynDisplay for MatchRes<'_> {
1055    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1056        match self {
1057            Self::Scalar(_, v) => v.dyn_fmt(f),
1058            Self::Float(_, v) => v.dyn_fmt(f),
1059            Self::Bytes(_, _, v, enc) => match enc {
1060                Encoding::Utf8 => String::from_utf8_lossy(v).to_string().dyn_fmt(f),
1061                Encoding::Utf16(enc) => {
1062                    let utf16: Vec<u16> = slice_to_utf16_iter(v, *enc).collect();
1063                    String::from_utf16_lossy(&utf16).dyn_fmt(f)
1064                }
1065            },
1066        }
1067    }
1068}
1069
1070impl MatchRes<'_> {
1071    // start offset of the match
1072    #[inline]
1073    fn start_offset(&self) -> u64 {
1074        match self {
1075            MatchRes::Bytes(o, _, _, _) => *o,
1076            MatchRes::Scalar(o, _) => *o,
1077            MatchRes::Float(o, _) => *o,
1078        }
1079    }
1080
1081    // start offset of the match
1082    #[inline]
1083    fn end_offset(&self) -> u64 {
1084        match self {
1085            MatchRes::Bytes(start, end, buf, _) => match end {
1086                Some(end) => *end,
1087                None => start.saturating_add(buf.len() as u64),
1088            },
1089            MatchRes::Scalar(o, sc) => o.add(sc.size_of() as u64),
1090            MatchRes::Float(o, f) => o.add(f.size_of() as u64),
1091        }
1092    }
1093}
1094
1095fn slice_to_utf16_iter(read: &[u8], encoding: String16Encoding) -> impl Iterator<Item = u16> {
1096    let even = read
1097        .iter()
1098        .enumerate()
1099        .filter(|(i, _)| i % 2 == 0)
1100        .map(|t| t.1);
1101
1102    let odd = read
1103        .iter()
1104        .enumerate()
1105        .filter(|(i, _)| i % 2 != 0)
1106        .map(|t| t.1);
1107
1108    even.zip(odd).map(move |(e, o)| match encoding {
1109        String16Encoding::Le => u16::from_le_bytes([*e, *o]),
1110        String16Encoding::Be => u16::from_be_bytes([*e, *o]),
1111    })
1112}
1113
// Byte order used to decode pairs of bytes into UTF-16 code units.
1114#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
1115enum String16Encoding {
    // little endian
1116    Le,
    // big endian
1117    Be,
1118}
1119
// Test matching a UTF-16 encoded string read from the haystack.
1120#[derive(Debug, Clone, Serialize, Deserialize)]
1121struct String16Test {
    // NOTE(review): presumably the test string as written in the rule,
    // before its conversion to UTF-16 code units — confirm against the parser
1122    orig: String,
    // expected UTF-16 code units, `TestValue::Any` accepts any string
1123    test_val: TestValue<Vec<u16>>,
    // byte order used to decode the bytes read from the haystack
1124    encoding: String16Encoding,
1125}
1126
1127impl String16Test {
1128    /// if the test value is a specific value this method returns
1129    /// the number of utf16 characters. To obtain the length in
1130    /// bytes the return value needs to be multiplied by two.
1131    #[inline(always)]
1132    fn test_value_len(&self) -> usize {
1133        match self.test_val.as_ref() {
1134            TestValue::Value(str16) => str16.len(),
1135            TestValue::Any => 0,
1136        }
1137    }
1138}
1139
// Flags carried by `Test::Indirect`.
1140flags! {
1141    enum IndirectMod: u8{
1142        Relative,
1143    }
1144}
1145
// A set of `IndirectMod` flags.
1146type IndirectMods = FlagSet<IndirectMod>;
1147
// Width and byte order of the length prefix stored in front of a pstring.
// The trailing letters look like the matching modifier characters of the
// magic syntax — confirm against the parser.
1148#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
1149enum PStringLen {
1150    Byte,    // B
1151    ShortBe, // H
1152    ShortLe, // h
1153    LongBe,  // L
1154    LongLe,  // l
1155}
1156
1157impl PStringLen {
1158    #[inline(always)]
1159    const fn size_of_len(&self) -> usize {
1160        match self {
1161            PStringLen::Byte => 1,
1162            PStringLen::ShortBe => 2,
1163            PStringLen::ShortLe => 2,
1164            PStringLen::LongBe => 4,
1165            PStringLen::LongLe => 4,
1166        }
1167    }
1168}
1169
// Test matching a length-prefixed string read from the haystack.
1170#[derive(Debug, Clone, Serialize, Deserialize)]
1171struct PStringTest {
    // width and byte order of the length prefix
1172    len: PStringLen,
    // expected bytes, `TestValue::Any` accepts any string
1173    test_val: TestValue<Vec<u8>>,
    // when set, the size of the length prefix is subtracted from the
    // length read, before reading the string
1174    include_len: bool,
1175}
1176
1177impl PStringTest {
1178    #[inline]
1179    fn read<'cache, R: Read + Seek>(
1180        &self,
1181        haystack: &'cache mut LazyCache<R>,
1182    ) -> Result<Option<&'cache [u8]>, Error> {
1183        let mut len = match self.len {
1184            PStringLen::Byte => read_le!(haystack, u8) as u32,
1185            PStringLen::ShortBe => read_be!(haystack, u16) as u32,
1186            PStringLen::ShortLe => read_le!(haystack, u16) as u32,
1187            PStringLen::LongBe => read_be!(haystack, u32),
1188            PStringLen::LongLe => read_le!(haystack, u32),
1189        } as usize;
1190
1191        if self.include_len {
1192            len = len.saturating_sub(self.len.size_of_len())
1193        }
1194
1195        if let TestValue::Value(s) = self.test_val.as_ref()
1196            && len != s.len()
1197        {
1198            return Ok(None);
1199        }
1200
1201        let read = haystack.read_exact_count(len as u64)?;
1202
1203        Ok(Some(read))
1204    }
1205
1206    #[inline(always)]
1207    fn test_value_len(&self) -> usize {
1208        match self.test_val.as_ref() {
1209            TestValue::Value(s) => s.len(),
1210            TestValue::Any => 0,
1211        }
1212    }
1213}
1214
// A test of a magic rule.
//
// Scalar, Float, String, Search, PString, Regex and String16 read a value
// from the haystack and match it. The other variants have no value to read
// (`read_test_value` returns an error for them).
1215#[derive(Debug, Clone, Serialize, Deserialize)]
1216enum Test {
1217    Name(String),
1218    Use(bool, String),
    // comparison of a scalar value
1219    Scalar(ScalarTest),
    // comparison of a floating point value
1220    Float(FloatTest),
    // string match at the current offset
1221    String(StringTest),
    // search of a string within a buffer
1222    Search(SearchTest),
    // length-prefixed string match
1223    PString(PStringTest),
    // regular expression match
1224    Regex(RegexTest),
1225    Indirect(FlagSet<IndirectMod>),
    // UTF-16 string match
1226    String16(String16Test),
1227    // FIXME: placeholder for strength computation
1228    #[allow(dead_code)]
1229    Der,
1230    Clear,
1231    Default,
1232}
1233
1234impl Test {
1235    // read the value to test from the haystack
1236    #[inline]
1237    fn read_test_value<'haystack, R: Read + Seek>(
1238        &self,
1239        haystack: &'haystack mut LazyCache<R>,
1240        switch_endianness: bool,
1241    ) -> Result<Option<ReadValue<'haystack>>, Error> {
1242        let test_value_offset = haystack.lazy_stream_position();
1243
1244        match self {
1245            Self::Scalar(t) => {
1246                t.ty.read(haystack, switch_endianness)
1247                    .map(|s| Some(ReadValue::Scalar(test_value_offset, s)))
1248            }
1249
1250            Self::Float(t) => {
1251                t.ty.read(haystack, switch_endianness)
1252                    .map(|f| Some(ReadValue::Float(test_value_offset, f)))
1253            }
1254            Self::String(t) => {
1255                match t.test_val.as_ref() {
1256                    TestValue::Value(str) => {
1257                        let buf = if let Some(length) = t.length {
1258                            // if there is a length specified
1259                            haystack.read_exact_count(length as u64)?
1260                        } else {
1261                            // no length specified we read until end of string
1262
1263                            match t.cmp_op {
1264                                CmpOp::Eq | CmpOp::Neq => {
1265                                    if !t.has_length_mod() {
1266                                        haystack.read_exact_count(str.len() as u64)?
1267                                    } else {
1268                                        haystack.read_count(FILE_BYTES_MAX as u64)?
1269                                    }
1270                                }
1271                                CmpOp::Lt | CmpOp::Gt => {
1272                                    let read =
1273                                        haystack.read_until_any_delim_or_limit(b"\n\0", 8092)?;
1274
1275                                    if read.ends_with(b"\0") || read.ends_with(b"\n") {
1276                                        &read[..read.len() - 1]
1277                                    } else {
1278                                        read
1279                                    }
1280                                }
1281                                _ => {
1282                                    return Err(Error::Msg(format!(
1283                                        "string test does not support {:?} operator",
1284                                        t.cmp_op
1285                                    )));
1286                                }
1287                            }
1288                        };
1289
1290                        Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1291                    }
1292                    TestValue::Any => {
1293                        let read = haystack.read_until_any_delim_or_limit(b"\0\n", 8192)?;
1294                        // we don't take last byte if it matches end of string
1295                        let bytes = if read.ends_with(b"\0") || read.ends_with(b"\n") {
1296                            &read[..read.len() - 1]
1297                        } else {
1298                            read
1299                        };
1300
1301                        Ok(Some(ReadValue::Bytes(test_value_offset, bytes)))
1302                    }
1303                }
1304            }
1305
1306            Self::String16(t) => {
1307                match t.test_val.as_ref() {
1308                    TestValue::Value(str16) => {
1309                        let read = haystack.read_exact_count((str16.len() * 2) as u64)?;
1310
1311                        Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1312                    }
1313                    TestValue::Any => {
1314                        let read = haystack.read_until_utf16_or_limit(b"\x00\x00", 8192)?;
1315
1316                        // we make sure we have an even number of elements
1317                        let end = if read.len() % 2 == 0 {
1318                            read.len()
1319                        } else {
1320                            // we decide to read anyway even though
1321                            // length isn't even
1322                            read.len().saturating_sub(1)
1323                        };
1324
1325                        Ok(Some(ReadValue::Bytes(test_value_offset, &read[..end])))
1326                    }
1327                }
1328            }
1329
1330            Self::PString(t) => {
1331                let Some(read) = t.read(haystack)? else {
1332                    return Ok(None);
1333                };
1334                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1335            }
1336
1337            Self::Search(_) => {
1338                let buf = haystack.read_count(FILE_BYTES_MAX as u64)?;
1339                Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1340            }
1341
1342            Self::Regex(r) => {
1343                let length = {
1344                    match r.length {
1345                        Some(len) => {
1346                            if r.mods.contains(ReMod::LineLimit) {
1347                                len * 80
1348                            } else {
1349                                len
1350                            }
1351                        }
1352
1353                        None => FILE_REGEX_MAX,
1354                    }
1355                };
1356
1357                let read = haystack.read_count(length as u64)?;
1358                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1359            }
1360
1361            Self::Name(_)
1362            | Self::Use(_, _)
1363            | Self::Indirect(_)
1364            | Self::Clear
1365            | Self::Default
1366            | Self::Der => Err(Error::msg("no value to read for this test")),
1367        }
1368    }
1369
1370    #[inline(always)]
1371    fn match_value<'s>(
1372        &'s self,
1373        tv: &ReadValue<'s>,
1374        stream_kind: StreamKind,
1375    ) -> Option<MatchRes<'s>> {
1376        match (self, tv) {
1377            (Self::Scalar(t), ReadValue::Scalar(o, ts)) => {
1378                let read_value: Scalar = match t.transform.as_ref() {
1379                    Some(t) => t.apply(*ts)?,
1380                    None => *ts,
1381                };
1382
1383                match t.test_val {
1384                    TestValue::Value(test_value) => {
1385                        let ok = match t.cmp_op {
1386                            // NOTE: this should not happen in practice because
1387                            // we convert it into Eq equivalent at parsing time
1388                            CmpOp::Not => read_value == !test_value,
1389                            CmpOp::Eq => read_value == test_value,
1390                            CmpOp::Lt => read_value < test_value,
1391                            CmpOp::Gt => read_value > test_value,
1392                            CmpOp::Neq => read_value != test_value,
1393                            CmpOp::BitAnd => read_value & test_value == test_value,
1394                            CmpOp::Xor => (read_value & test_value).is_zero(),
1395                        };
1396
1397                        if ok {
1398                            Some(MatchRes::Scalar(*o, read_value))
1399                        } else {
1400                            None
1401                        }
1402                    }
1403
1404                    TestValue::Any => Some(MatchRes::Scalar(*o, read_value)),
1405                }
1406            }
1407
1408            (Self::Float(t), ReadValue::Float(o, f)) => {
1409                let read_value: Float = t.transform.as_ref().map(|t| t.apply(*f)).unwrap_or(*f);
1410
1411                match t.test_val {
1412                    TestValue::Value(tf) => {
1413                        let ok = match t.cmp_op {
1414                            CmpOp::Eq => read_value == tf,
1415                            CmpOp::Lt => read_value < tf,
1416                            CmpOp::Gt => read_value > tf,
1417                            CmpOp::Neq => read_value != tf,
1418                            _ => {
1419                                // this should never be reached as we validate
1420                                // operator in parser
1421                                debug_panic!("unsupported float comparison");
1422                                debug!("unsupported float comparison");
1423                                false
1424                            }
1425                        };
1426
1427                        if ok {
1428                            Some(MatchRes::Float(*o, read_value))
1429                        } else {
1430                            None
1431                        }
1432                    }
1433                    TestValue::Any => Some(MatchRes::Float(*o, read_value)),
1434                }
1435            }
1436
1437            (Self::String(st), ReadValue::Bytes(o, buf)) => {
1438                macro_rules! trim_buf {
1439                    ($buf: expr) => {{
1440                        if st.mods.contains(StringMod::Trim) {
1441                            $buf.trim_ascii()
1442                        } else {
1443                            $buf
1444                        }
1445                    }};
1446                }
1447
1448                match st.test_val.as_ref() {
1449                    TestValue::Value(str) => {
1450                        match st.cmp_op {
1451                            CmpOp::Eq => {
1452                                if let (true, _) = string_match(str, st.mods, buf) {
1453                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1454                                } else {
1455                                    None
1456                                }
1457                            }
1458                            CmpOp::Neq => {
1459                                if let (false, _) = string_match(str, st.mods, buf) {
1460                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1461                                } else {
1462                                    None
1463                                }
1464                            }
1465                            CmpOp::Gt => {
1466                                if buf.len() > str.len() {
1467                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1468                                } else {
1469                                    None
1470                                }
1471                            }
1472                            CmpOp::Lt => {
1473                                if buf.len() < str.len() {
1474                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1475                                } else {
1476                                    None
1477                                }
1478                            }
1479
1480                            // unsupported for strings
1481                            _ => {
1482                                // this should never be reached as we validate
1483                                // operator in parser
1484                                debug_panic!("unsupported string comparison");
1485                                debug!("unsupported string comparison");
1486                                None
1487                            }
1488                        }
1489                    }
1490                    TestValue::Any => {
1491                        Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1492                    }
1493                }
1494            }
1495
1496            (Self::PString(m), ReadValue::Bytes(o, buf)) => match m.test_val.as_ref() {
1497                TestValue::Value(psv) => {
1498                    if buf == psv {
1499                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8))
1500                    } else {
1501                        None
1502                    }
1503                }
1504                TestValue::Any => Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8)),
1505            },
1506
1507            (Self::String16(t), ReadValue::Bytes(o, buf)) => {
1508                match t.test_val.as_ref() {
1509                    TestValue::Value(str16) => {
1510                        // strings cannot be equal
1511                        if str16.len() * 2 != buf.len() {
1512                            return None;
1513                        }
1514
1515                        // we check string equality
1516                        for (i, utf16_char) in slice_to_utf16_iter(buf, t.encoding).enumerate() {
1517                            if str16[i] != utf16_char {
1518                                return None;
1519                            }
1520                        }
1521
1522                        Some(MatchRes::Bytes(
1523                            *o,
1524                            None,
1525                            t.orig.as_bytes(),
1526                            Encoding::Utf16(t.encoding),
1527                        ))
1528                    }
1529
1530                    TestValue::Any => {
1531                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf16(t.encoding)))
1532                    }
1533                }
1534            }
1535
1536            (Self::Regex(r), ReadValue::Bytes(o, buf)) => r.match_buf(*o, stream_kind, buf),
1537
1538            (Self::Search(t), ReadValue::Bytes(o, buf)) => t.match_buf(*o, buf),
1539
1540            _ => None,
1541        }
1542    }
1543
1544    #[inline(always)]
1545    fn strength(&self) -> u64 {
1546        const MULT: usize = 10;
1547
1548        let mut out = 2 * MULT;
1549
1550        // FIXME: octal is missing but it is not used in practice ...
1551        match self {
1552            Test::Scalar(s) => {
1553                out += s.ty.type_size() * MULT;
1554            }
1555
1556            Test::Float(t) => {
1557                out += t.ty.type_size() * MULT;
1558            }
1559
1560            Test::String(t) => out += t.test_value_len().saturating_mul(MULT),
1561
1562            Test::PString(t) => out += t.test_value_len().saturating_mul(MULT),
1563
1564            Test::Search(s) => {
1565                // NOTE: this implementation deviates from what is in
1566                // C libmagic. The purpose of this implementation is to
1567                // minimize the difference between similar tests,
1568                // implemented differently (ex: string test VS very localized search test).
1569                let n_pos = s.n_pos.unwrap_or(FILE_BYTES_MAX);
1570
1571                match n_pos {
1572                    // a search on one line should be equivalent to a string match
1573                    0..=80 => out += s.str.len().saturating_mul(MULT),
1574                    // search on the first 3 lines gets a little penalty
1575                    81..=240 => out += s.str.len() * s.str.len().clamp(0, MULT - 2),
1576                    // a search on more than 3 lines isn't considered very accurate
1577                    _ => out += s.str.len(),
1578                }
1579            }
1580
1581            Test::Regex(r) => {
1582                // NOTE: this implementation deviates from what is in
1583                // C libmagic. The purpose of this implementation is to
1584                // minimize the difference between similar tests,
1585                // implemented differently (ex: string test VS very localized regex test).
1586
1587                // we divide length by the number of capture group
1588                // which gives us a value close to he average string
1589                // length match in the regex.
1590                let v = r.non_magic_len / r.re.captures_len();
1591
1592                let len = r
1593                    .length
1594                    .map(|l| {
1595                        if r.mods.contains(ReMod::LineLimit) {
1596                            l * 80
1597                        } else {
1598                            l
1599                        }
1600                    })
1601                    .unwrap_or(FILE_BYTES_MAX);
1602
1603                match len {
1604                    // a search on one line should be equivalent to a string match
1605                    0..=80 => out += v.saturating_mul(MULT),
1606                    // search on the first 3 lines gets a little penalty
1607                    81..=240 => out += v * v.clamp(0, MULT - 2),
1608                    // a search on more than 3 lines isn't considered very accurate
1609                    _ => out += v,
1610                }
1611            }
1612
1613            Test::String16(t) => {
1614                // NOTE: in libmagic the result is div by 2
1615                // but I GUESS it is because the len is expressed
1616                // in number bytes. In our case length is expressed
1617                // in number of u16 so we shouldn't divide.
1618                out += t.test_value_len().saturating_mul(MULT);
1619            }
1620
1621            Test::Der => out += MULT,
1622
1623            Test::Default | Test::Name(_) | Test::Use(_, _) | Test::Indirect(_) | Test::Clear => {
1624                return 0;
1625            }
1626        }
1627
1628        // matching any output gets penalty
1629        if self.is_match_any() {
1630            return 0;
1631        }
1632
1633        if let Some(op) = self.cmp_op() {
1634            match op {
1635                // matching almost any gets penalty
1636                CmpOp::Neq => out = 0,
1637                CmpOp::Eq | CmpOp::Not => out += MULT,
1638                CmpOp::Lt | CmpOp::Gt => out -= 2 * MULT,
1639                CmpOp::Xor | CmpOp::BitAnd => out -= MULT,
1640            }
1641        }
1642
1643        out as u64
1644    }
1645
1646    #[inline(always)]
1647    fn cmp_op(&self) -> Option<CmpOp> {
1648        match self {
1649            Self::String(t) => Some(t.cmp_op),
1650            Self::Scalar(s) => Some(s.cmp_op),
1651            Self::Float(t) => Some(t.cmp_op),
1652            Self::Name(_)
1653            | Self::Use(_, _)
1654            | Self::Search(_)
1655            | Self::PString(_)
1656            | Self::Regex(_)
1657            | Self::Clear
1658            | Self::Default
1659            | Self::Indirect(_)
1660            | Self::String16(_)
1661            | Self::Der => None,
1662        }
1663    }
1664
1665    #[inline(always)]
1666    fn is_match_any(&self) -> bool {
1667        match self {
1668            Test::Name(_) => false,
1669            Test::Use(_, _) => false,
1670            Test::Scalar(scalar_test) => matches!(scalar_test.test_val, TestValue::Any),
1671            Test::Float(float_test) => matches!(float_test.test_val, TestValue::Any),
1672            Test::String(string_test) => matches!(string_test.test_val, TestValue::Any),
1673            Test::Search(_) => false,
1674            Test::PString(pstring_test) => matches!(pstring_test.test_val, TestValue::Any),
1675            Test::Regex(_) => false,
1676            Test::Indirect(_) => false,
1677            Test::String16(string16_test) => matches!(string16_test.test_val, TestValue::Any),
1678            Test::Der => false,
1679            Test::Clear => false,
1680            Test::Default => false,
1681        }
1682    }
1683
1684    #[inline(always)]
1685    fn is_binary(&self) -> bool {
1686        match self {
1687            Self::Name(_) => true,
1688            Self::Use(_, _) => true,
1689            Self::Scalar(_) => true,
1690            Self::Float(_) => true,
1691            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1692            Self::Search(t) => t.is_binary(),
1693            Self::PString(_) => true,
1694            Self::Regex(t) => t.is_binary(),
1695            Self::Clear => true,
1696            Self::Default => true,
1697            Self::Indirect(_) => true,
1698            Self::String16(_) => true,
1699            Self::Der => true,
1700        }
1701    }
1702
1703    #[inline(always)]
1704    fn is_text(&self) -> bool {
1705        match self {
1706            Self::Name(_) => true,
1707            Self::Use(_, _) => true,
1708            Self::Indirect(_) => true,
1709            Self::Clear => true,
1710            Self::Default => true,
1711            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1712            _ => !self.is_binary(),
1713        }
1714    }
1715
1716    #[inline(always)]
1717    fn is_only_text(&self) -> bool {
1718        self.is_text() && !self.is_binary()
1719    }
1720
1721    #[inline(always)]
1722    fn is_only_binary(&self) -> bool {
1723        self.is_binary() && !self.is_text()
1724    }
1725}
1726
// Type of the value read from the haystack to get an indirect offset.
1727#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
1728enum OffsetType {
    // 8 bits integer
1729    Byte,
    // 64 bits float, little / big endian
1730    DoubleLe,
1731    DoubleBe,
    // 16 bits integer, little / big endian
1732    ShortLe,
1733    ShortBe,
    // 32 bits integer, little / big endian, decoded with `decode_id3`
1734    Id3Le,
1735    Id3Be,
    // 32 bits integer, little / big endian
1736    LongLe,
1737    LongBe,
    // value read with `read_me!`
1738    Middle,
    // value read with `read_octal_u64`
1739    Octal,
    // 64 bits integer, big / little endian
1740    QuadBe,
1741    QuadLe,
1742}
1743
// Operand of the operation applied to an indirect offset.
// NOTE(review): `Direct` carries the operand itself, `Indirect` presumably
// tells where to read the operand from — confirm in `IndOffset::read_offset`
1744#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
1745enum Shift {
1746    Direct(u64),
1747    Indirect(i64),
1748}
1749
// An indirect offset: an offset whose value has to be read from the haystack.
1750#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
1751struct IndOffset {
1752    // where to find the offset
1753    off_addr: DirOffset,
1754    // signed or unsigned
1755    signed: bool,
1756    // type of the offset
1757    ty: OffsetType,
    // operation and operand transforming the offset read,
    // the transformation is applied only if both are set
1758    op: Option<Op>,
1759    shift: Option<Shift>,
1760}
1761
1762impl IndOffset {
1763    // if we overflow we must not return an offset
1764    fn read_offset<R: Read + Seek>(
1765        &self,
1766        haystack: &mut LazyCache<R>,
1767        rule_base_offset: Option<u64>,
1768        last_upper_match_offset: Option<u64>,
1769    ) -> Result<Option<u64>, io::Error> {
1770        let offset_address = match self.off_addr {
1771            DirOffset::Start(s) => {
1772                let Some(o) = s.checked_add(rule_base_offset.unwrap_or_default()) else {
1773                    return Ok(None);
1774                };
1775
1776                haystack.seek(SeekFrom::Start(o))?
1777            }
1778            DirOffset::LastUpper(c) => haystack.seek(SeekFrom::Start(
1779                (last_upper_match_offset.unwrap_or_default() as i64 + c) as u64,
1780            ))?,
1781            DirOffset::End(e) => haystack.seek(SeekFrom::End(e))?,
1782        };
1783
1784        macro_rules! read_value {
1785            () => {
1786                match self.ty {
1787                    OffsetType::Byte => {
1788                        if self.signed {
1789                            read_le!(haystack, u8) as u64
1790                        } else {
1791                            read_le!(haystack, i8) as u64
1792                        }
1793                    }
1794                    OffsetType::DoubleLe => read_le!(haystack, f64) as u64,
1795                    OffsetType::DoubleBe => read_be!(haystack, f64) as u64,
1796                    OffsetType::ShortLe => {
1797                        if self.signed {
1798                            read_le!(haystack, i16) as u64
1799                        } else {
1800                            read_le!(haystack, u16) as u64
1801                        }
1802                    }
1803                    OffsetType::ShortBe => {
1804                        if self.signed {
1805                            read_be!(haystack, i16) as u64
1806                        } else {
1807                            read_be!(haystack, u16) as u64
1808                        }
1809                    }
1810                    OffsetType::Id3Le => decode_id3(read_le!(haystack, u32)) as u64,
1811                    OffsetType::Id3Be => decode_id3(read_be!(haystack, u32)) as u64,
1812                    OffsetType::LongLe => {
1813                        if self.signed {
1814                            read_le!(haystack, i32) as u64
1815                        } else {
1816                            read_le!(haystack, u32) as u64
1817                        }
1818                    }
1819                    OffsetType::LongBe => {
1820                        if self.signed {
1821                            read_be!(haystack, i32) as u64
1822                        } else {
1823                            read_be!(haystack, u32) as u64
1824                        }
1825                    }
1826                    OffsetType::Middle => read_me!(haystack) as u64,
1827                    OffsetType::Octal => {
1828                        if let Some(o) = read_octal_u64(haystack) {
1829                            o
1830                        } else {
1831                            debug!("failed to read octal offset @ {offset_address}");
1832                            return Ok(None);
1833                        }
1834                    }
1835                    OffsetType::QuadLe => {
1836                        if self.signed {
1837                            read_le!(haystack, i64) as u64
1838                        } else {
1839                            read_le!(haystack, u64)
1840                        }
1841                    }
1842                    OffsetType::QuadBe => {
1843                        if self.signed {
1844                            read_be!(haystack, i64) as u64
1845                        } else {
1846                            read_be!(haystack, u64)
1847                        }
1848                    }
1849                }
1850            };
1851        }
1852
1853        // in theory every offset read should end up in something seekable from start, so we can use u64 to store the result
1854        let o = read_value!();
1855
1856        trace!(
1857            "offset read @ {offset_address} value={o} op={:?} shift={:?}",
1858            self.op, self.shift
1859        );
1860
1861        // apply transformation
1862        if let (Some(op), Some(shift)) = (self.op, self.shift) {
1863            let shift = match shift {
1864                Shift::Direct(i) => i,
1865                Shift::Indirect(i) => {
1866                    let tmp = offset_address as i128 + i as i128;
1867                    if tmp.is_negative() {
1868                        return Ok(None);
1869                    } else {
1870                        haystack.seek(SeekFrom::Start(tmp as u64))?;
1871                    };
1872                    // NOTE: here we assume that the shift has the same
1873                    // type as the main offset !
1874                    read_value!()
1875                }
1876            };
1877
1878            match op {
1879                Op::Add => return Ok(o.checked_add(shift)),
1880                Op::Mul => return Ok(o.checked_mul(shift)),
1881                Op::Sub => return Ok(o.checked_sub(shift)),
1882                Op::Div => return Ok(o.checked_div(shift)),
1883                Op::Mod => return Ok(o.checked_rem(shift)),
1884                Op::And => return Ok(Some(o & shift)),
1885                Op::Or => return Ok(Some(o | shift)),
1886                Op::Xor => return Ok(Some(o ^ shift)),
1887            }
1888        }
1889
1890        Ok(Some(o))
1891    }
1892}
1893
/// A direct offset: a position expressed by the rule itself, as opposed
/// to an indirect offset whose value is read from the stream.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum DirOffset {
    /// offset counted from the start (the rule base offset, when there
    /// is one, is added to it at match time)
    Start(u64),
    // relative to the last up-level field
    LastUpper(i64),
    /// offset relative to the end of the stream, handed to `SeekFrom::End`
    End(i64),
}
1901
/// Location at which a test is applied.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Offset {
    /// position expressed directly, see [`DirOffset`]
    Direct(DirOffset),
    /// position whose value is obtained by reading the stream
    /// (through `IndOffset::read_offset`)
    Indirect(IndOffset),
}
1907
/// Wraps a [`DirOffset`] into [`Offset::Direct`].
impl From<DirOffset> for Offset {
    fn from(value: DirOffset) -> Self {
        Self::Direct(value)
    }
}
1913
/// Wraps an `IndOffset` into [`Offset::Indirect`].
impl From<IndOffset> for Offset {
    fn from(value: IndOffset) -> Self {
        Self::Indirect(value)
    }
}
1919
1920impl Display for DirOffset {
1921    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1922        match self {
1923            DirOffset::Start(i) => write!(f, "{i}"),
1924            DirOffset::LastUpper(c) => write!(f, "&{c}"),
1925            DirOffset::End(e) => write!(f, "-{e}"),
1926        }
1927    }
1928}
1929
/// The default direct offset is relative to the last upper level, with
/// no shift.
impl Default for DirOffset {
    fn default() -> Self {
        Self::LastUpper(0)
    }
}
1935
/// A single entry of a magic rule: where to look, which test to run
/// and which message to report when the test matches.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Match {
    /// line number of the entry, reported in logs and localized errors
    line: usize,
    /// depth of the entry, also used as its continuation level
    depth: u8,
    /// where the test is applied
    offset: Offset,
    /// the test to run
    test: Test,
    /// strength of `test`, as returned by `Test::strength` when the
    /// `Match` is built through the `From` conversions
    test_strength: u64,
    /// message pushed to the result when the entry matches, if any
    message: Option<Message>,
}
1945
1946impl From<Use> for Match {
1947    fn from(value: Use) -> Self {
1948        let test = Test::Use(value.switch_endianness, value.rule_name);
1949        let test_strength = test.strength();
1950        Self {
1951            line: value.line,
1952            depth: value.depth,
1953            offset: value.start_offset,
1954            test,
1955            test_strength,
1956            message: value.message,
1957        }
1958    }
1959}
1960
1961impl From<Name> for Match {
1962    fn from(value: Name) -> Self {
1963        let test = Test::Name(value.name);
1964        let test_strength = test.strength();
1965        Self {
1966            line: value.line,
1967            depth: 0,
1968            offset: Offset::Direct(DirOffset::Start(0)),
1969            test,
1970            test_strength,
1971            message: value.message,
1972        }
1973    }
1974}
1975
1976impl Match {
1977    /// Turns the `Match`'s offset into an absolute offset from the start of the stream
1978    #[inline(always)]
1979    fn offset_from_start<R: Read + Seek>(
1980        &self,
1981        haystack: &mut LazyCache<R>,
1982        rule_base_offset: Option<u64>,
1983        last_level_offset: Option<u64>,
1984    ) -> Result<Option<u64>, io::Error> {
1985        match self.offset {
1986            Offset::Direct(dir_offset) => match dir_offset {
1987                DirOffset::Start(s) => Ok(Some(s)),
1988                DirOffset::LastUpper(shift) => {
1989                    let o = last_level_offset.unwrap_or_default() as i64 + shift;
1990
1991                    if o.is_positive() {
1992                        Ok(Some(o as u64))
1993                    } else {
1994                        Ok(None)
1995                    }
1996                }
1997                DirOffset::End(e) => Ok(Some(haystack.offset_from_start(SeekFrom::End(e)))),
1998            },
1999            Offset::Indirect(ind_offset) => {
2000                let Some(o) =
2001                    ind_offset.read_offset(haystack, rule_base_offset, last_level_offset)?
2002                else {
2003                    return Ok(None);
2004                };
2005
2006                Ok(Some(o))
2007            }
2008        }
2009    }
2010
2011    /// this method emulates the buffer based matching
2012    /// logic implemented in libmagic. It needs some aweful
2013    /// and weird offset convertions to turn buffer
2014    /// relative offsets (libmagic is based on) into
2015    /// absolute offset in the file.
2016    ///
2017    /// this method shoud bubble up only critical errors
2018    /// all the other errors should make the match result
2019    /// false and be logged via debug!
2020    ///
2021    /// the function returns an error if the maximum recursion
2022    /// has been reached or if a dependency rule is missing.
2023    #[inline]
2024    #[allow(clippy::too_many_arguments)]
2025    fn matches<'a: 'h, 'h, R: Read + Seek>(
2026        &'a self,
2027        source: Option<&str>,
2028        magic: &mut Magic<'a>,
2029        stream_kind: StreamKind,
2030        state: &mut MatchState,
2031        buf_base_offset: Option<u64>,
2032        rule_base_offset: Option<u64>,
2033        last_level_offset: Option<u64>,
2034        haystack: &'h mut LazyCache<R>,
2035        switch_endianness: bool,
2036        db: &'a MagicDb,
2037        depth: usize,
2038    ) -> Result<(bool, Option<MatchRes<'h>>), Error> {
2039        let source = source.unwrap_or("unknown");
2040        let line = self.line;
2041
2042        if depth >= MAX_RECURSION {
2043            return Err(Error::localized(
2044                source,
2045                line,
2046                Error::MaximumRecursion(MAX_RECURSION),
2047            ));
2048        }
2049
2050        if self.test.is_only_binary() && stream_kind.is_text() {
2051            trace!("skip binary test source={source} line={line} stream_kind={stream_kind:?}",);
2052            return Ok((false, None));
2053        }
2054
2055        if self.test.is_only_text() && !stream_kind.is_text() {
2056            trace!("skip text test source={source} line={line} stream_kind={stream_kind:?}",);
2057            return Ok((false, None));
2058        }
2059
2060        let Ok(Some(mut offset)) = self
2061            .offset_from_start(haystack, rule_base_offset, last_level_offset)
2062            .inspect_err(|e| debug!("source={source} line={line} failed at computing offset: {e}"))
2063        else {
2064            return Ok((false, None));
2065        };
2066
2067        offset = match self.offset {
2068            Offset::Indirect(_) => {
2069                // the result we get for an indirect offset
2070                // is relative to the start of the libmagic
2071                // buffer so we need to add base to make it
2072                // absolute.
2073                buf_base_offset.unwrap_or_default().saturating_add(offset)
2074            }
2075            // offset from start are computed from rule base
2076            Offset::Direct(DirOffset::Start(_)) => {
2077                rule_base_offset.unwrap_or_default().saturating_add(offset)
2078            }
2079            _ => offset,
2080        };
2081
2082        match &self.test {
2083            Test::Clear => {
2084                trace!("source={source} line={line} clear");
2085                state.clear_continuation_level(&self.continuation_level());
2086                Ok((true, None))
2087            }
2088
2089            Test::Name(name) => {
2090                trace!(
2091                    "source={source} line={line} running rule {name} switch_endianness={switch_endianness}",
2092                );
2093                Ok((true, None))
2094            }
2095
2096            Test::Use(flip_endianness, rule_name) => {
2097                trace!(
2098                    "source={source} line={line} use {rule_name} switch_endianness={flip_endianness}",
2099                );
2100
2101                // switch_endianness must propagate down the rule call stack
2102                let switch_endianness = switch_endianness ^ flip_endianness;
2103
2104                let dr: &DependencyRule = db.dependencies.get(rule_name).ok_or(
2105                    Error::localized(source, line, Error::MissingRule(rule_name.clone())),
2106                )?;
2107
2108                // we push the message here otherwise we push message in depth first
2109                if let Some(msg) = self.message.as_ref() {
2110                    magic.push_message(msg.to_string_lossy());
2111                }
2112
2113                dr.rule.magic(
2114                    magic,
2115                    stream_kind,
2116                    buf_base_offset,
2117                    Some(offset),
2118                    haystack,
2119                    db,
2120                    switch_endianness,
2121                    depth.saturating_add(1),
2122                )?;
2123
2124                // we return false not to push message again
2125                Ok((false, None))
2126            }
2127
2128            Test::Indirect(m) => {
2129                trace!(
2130                    "source={source} line={line} indirect mods={:?} offset={offset:#x}",
2131                    m
2132                );
2133
2134                let new_buf_base_off = if m.contains(IndirectMod::Relative) {
2135                    Some(offset)
2136                } else {
2137                    None
2138                };
2139
2140                // we push the message here otherwise we push message in depth first
2141                if let Some(msg) = self.message.as_ref() {
2142                    magic.push_message(msg.to_string_lossy());
2143                }
2144
2145                for r in db.rules.iter() {
2146                    let messages_cnt = magic.message.len();
2147
2148                    r.magic(
2149                        magic,
2150                        stream_kind,
2151                        new_buf_base_off,
2152                        Some(offset),
2153                        haystack,
2154                        db,
2155                        false,
2156                        depth.saturating_add(1),
2157                    )?;
2158
2159                    // this means we matched a rule
2160                    if magic.message.len() != messages_cnt {
2161                        break;
2162                    }
2163                }
2164
2165                // we return false not to push message again
2166                Ok((false, None))
2167            }
2168
2169            Test::Default => {
2170                // default matches if nothing else at the continuation level matched
2171                let ok = !state.get_continuation_level(&self.continuation_level());
2172
2173                trace!("source={source} line={line} default match={ok}");
2174                if ok {
2175                    state.set_continuation_level(self.continuation_level());
2176                }
2177
2178                Ok((ok, None))
2179            }
2180
2181            _ => {
2182                if let Err(e) = haystack.seek(SeekFrom::Start(offset)) {
2183                    debug!("source={source} line={line} failed to seek in haystack: {e}");
2184                    return Ok((false, None));
2185                }
2186
2187                let mut trace_msg = None;
2188
2189                if enabled!(Level::DEBUG) {
2190                    trace_msg = Some(vec![format!(
2191                        "source={source} line={line} depth={} stream_offset={:#x}",
2192                        self.depth,
2193                        haystack.lazy_stream_position()
2194                    )])
2195                }
2196
2197                // NOTE: we may have a way to optimize here. In case we do a Any
2198                // test and we don't use the value to format the message, we don't
2199                // need to read the value.
2200                if let Ok(opt_test_value) = self
2201                    .test
2202                    .read_test_value(haystack, switch_endianness)
2203                    .inspect_err(|e| {
2204                        debug!("source={source} line={line} error while reading test value @{offset}: {e}",)
2205                    })
2206                {
2207                    if let Some(v) = trace_msg
2208                        .as_mut() { v.push(format!("test={:?}", self.test)) }
2209
2210                    let match_res =
2211                        opt_test_value.and_then(|tv| self.test.match_value(&tv, stream_kind));
2212
2213                    if let Some(v) = trace_msg.as_mut() { v.push(format!(
2214                            "message=\"{}\" match={}",
2215                            self.message
2216                                .as_ref()
2217                                .map(|fs| fs.to_string_lossy())
2218                                .unwrap_or_default(),
2219                            match_res.is_some()
2220                        )) }
2221
2222                    // trace message
2223                    if enabled!(Level::DEBUG) && !enabled!(Level::TRACE) && match_res.is_some() {
2224                        if let Some(m) = trace_msg{
2225                            debug!("{}", m.join(" "));
2226                        }
2227                    } else if enabled!(Level::TRACE)
2228                        && let Some(m) = trace_msg{
2229                            trace!("{}", m.join(" "));
2230                        }
2231
2232                    if let Some(mr) = match_res {
2233                        state.set_continuation_level(self.continuation_level());
2234                        return Ok((true, Some(mr)));
2235                    }
2236                }
2237
2238                Ok((false, None))
2239            }
2240        }
2241    }
2242
    /// Returns the continuation level of the entry, which is its `depth`.
    #[inline(always)]
    fn continuation_level(&self) -> ContinuationLevel {
        ContinuationLevel(self.depth)
    }
2247}
2248
/// A `use` entry, i.e. a call to a named rule. It is turned into a
/// [`Match`] holding a `Test::Use` through `From<Use>`.
#[derive(Debug, Clone)]
struct Use {
    /// line number of the entry
    line: usize,
    /// depth of the entry
    depth: u8,
    /// offset of the resulting `Match`
    start_offset: Offset,
    /// name of the rule to run, looked up in the database dependencies
    rule_name: String,
    /// whether the endianness must be flipped while running the rule
    switch_endianness: bool,
    /// message pushed before the called rule is run, if any
    message: Option<Message>,
}
2258
/// A modifier applied to the strength of a matching entry.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StrengthMod {
    /// operation applied to the strength
    op: Op,
    /// right-hand operand of the operation
    by: u8,
}
2264
2265impl StrengthMod {
2266    #[inline(always)]
2267    fn apply(&self, strength: u64) -> u64 {
2268        let by = self.by as u64;
2269        debug!("applying strength modifier: {strength} {} {}", self.op, by);
2270        match self.op {
2271            Op::Mul => strength.saturating_mul(by),
2272            Op::Add => strength.saturating_add(by),
2273            Op::Sub => strength.saturating_sub(by),
2274            Op::Div => {
2275                if by > 0 {
2276                    strength.saturating_div(by)
2277                } else {
2278                    strength
2279                }
2280            }
2281            Op::Mod => strength % by,
2282            Op::And => strength & by,
2283            // this should never happen as strength operators
2284            // are enforced by our parser
2285            Op::Xor | Op::Or => {
2286                debug_panic!("unsupported strength operator");
2287                strength
2288            }
2289        }
2290    }
2291}
2292
/// Metadata attached to a rule entry rather than a test.
// NOTE(review): presumably these map to the libmagic `!:mime`, `!:ext`,
// `!:strength` and `!:apple` directives — confirm against the parser
#[derive(Debug, Clone)]
enum Flag {
    /// a MIME type
    Mime(String),
    /// a set of file extensions
    Ext(HashSet<String>),
    /// a strength modifier
    Strength(StrengthMod),
    /// an Apple type (presumably the creator code reported by
    /// `Magic::set_creator_code` — TODO confirm)
    Apple(String),
}
2300
/// A `name` entry, i.e. the declaration of a named rule. It is turned
/// into a [`Match`] holding a `Test::Name` through `From<Name>`.
#[derive(Debug, Clone)]
struct Name {
    /// line number of the entry
    line: usize,
    /// name of the declared rule
    name: String,
    /// message attached to the entry, if any
    message: Option<Message>,
}
2307
/// A parsed entry together with a `Span`.
// NOTE(review): the span is presumably the location of the entry in the
// parsed source — confirm against the parser
#[derive(Debug, Clone)]
enum Entry<'span> {
    /// a test entry
    Match(Span<'span>, Match),
    /// a metadata entry
    Flag(Span<'span>, Flag),
}
2313
/// A node of a rule tree: an entry, its metadata and its child entries,
/// which are only evaluated when the entry matches.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EntryNode {
    /// true for the root node of a rule
    root: bool,
    /// the test entry held by the node
    entry: Match,
    /// entries evaluated after `entry` matched
    children: Vec<EntryNode>,
    /// MIME type set on the result when the entry matches
    mimetype: Option<String>,
    /// creator code set on the result when the entry matches
    apple: Option<String>,
    /// modifier applied to the entry strength when it matches
    strength_mod: Option<StrengthMod>,
    /// extensions added to the result when the entry matches
    exts: HashSet<String>,
}
2324
2325impl EntryNode {
2326    fn update_exts_rec(
2327        &self,
2328        exts: &mut HashSet<String>,
2329        deps: &HashMap<String, DependencyRule>,
2330        marked: &mut HashSet<String>,
2331    ) -> Result<(), ()> {
2332        for ext in self.exts.iter() {
2333            if !exts.contains(ext) {
2334                exts.insert(ext.clone());
2335            }
2336        }
2337
2338        for c in self.children.iter() {
2339            if let Test::Use(_, ref name) = c.entry.test {
2340                if marked.contains(name) {
2341                    continue;
2342                }
2343                if let Some(r) = deps.get(name) {
2344                    marked.insert(name.clone());
2345                    exts.extend(r.rule.fetch_all_extensions(deps, marked)?);
2346                } else {
2347                    return Err(());
2348                }
2349            } else {
2350                c.update_exts_rec(exts, deps, marked)?;
2351            }
2352        }
2353
2354        Ok(())
2355    }
2356
2357    fn update_score_rec(
2358        &self,
2359        depth: usize,
2360        score: &mut u64,
2361        deps: &HashMap<String, DependencyRule>,
2362        marked: &mut HashSet<String>,
2363    ) {
2364        if depth == 3 {
2365            return;
2366        }
2367
2368        *score += self
2369            .children
2370            .iter()
2371            .map(|e| e.entry.test_strength)
2372            .min()
2373            .unwrap_or_default();
2374
2375        for c in self.children.iter() {
2376            if let Test::Use(_, ref name) = c.entry.test {
2377                if marked.contains(name) {
2378                    continue;
2379                }
2380
2381                if let Some(r) = deps.get(name) {
2382                    marked.insert(name.clone());
2383                    *score += r.rule.compute_score(depth, deps, marked);
2384                }
2385            }
2386            c.update_score_rec(depth + 1, score, deps, marked);
2387        }
2388    }
2389
2390    #[inline]
2391    #[allow(clippy::too_many_arguments)]
2392    fn matches<'r, R: Read + Seek>(
2393        &'r self,
2394        opt_source: Option<&str>,
2395        magic: &mut Magic<'r>,
2396        state: &mut MatchState,
2397        stream_kind: StreamKind,
2398        buf_base_offset: Option<u64>,
2399        rule_base_offset: Option<u64>,
2400        last_level_offset: Option<u64>,
2401        haystack: &mut LazyCache<R>,
2402        db: &'r MagicDb,
2403        switch_endianness: bool,
2404        depth: usize,
2405    ) -> Result<(), Error> {
2406        let (ok, opt_match_res) = self.entry.matches(
2407            opt_source,
2408            magic,
2409            stream_kind,
2410            state,
2411            buf_base_offset,
2412            rule_base_offset,
2413            last_level_offset,
2414            haystack,
2415            switch_endianness,
2416            db,
2417            depth,
2418        )?;
2419
2420        let source = opt_source.unwrap_or("unknown");
2421        let line = self.entry.line;
2422
2423        if ok {
2424            // update magic with message if match is successful
2425            if let Some(msg) = self.entry.message.as_ref()
2426                && let Ok(msg) = msg.format_with(opt_match_res.as_ref()).inspect_err(|e| {
2427                    debug!("source={source} line={line} failed to format message: {e}")
2428                })
2429            {
2430                magic.push_message(msg);
2431            }
2432
2433            // we need to adjust stream offset in case of regex/search tests
2434            if let Some(mr) = opt_match_res {
2435                match &self.entry.test {
2436                    Test::String(t) => {
2437                        if t.has_length_mod() {
2438                            let o = mr.end_offset();
2439                            haystack.seek(SeekFrom::Start(o))?;
2440                        }
2441                    }
2442                    Test::Search(t) => {
2443                        if t.re_mods.contains(ReMod::StartOffsetUpdate) {
2444                            let o = mr.start_offset();
2445                            haystack.seek(SeekFrom::Start(o))?;
2446                        } else {
2447                            let o = mr.end_offset();
2448                            haystack.seek(SeekFrom::Start(o))?;
2449                        }
2450                    }
2451
2452                    Test::Regex(t) => {
2453                        if t.mods.contains(ReMod::StartOffsetUpdate) {
2454                            let o = mr.start_offset();
2455                            haystack.seek(SeekFrom::Start(o))?;
2456                        } else {
2457                            let o = mr.end_offset();
2458                            haystack.seek(SeekFrom::Start(o))?;
2459                        }
2460                    }
2461                    // other types do not need offset adjustement
2462                    _ => {}
2463                }
2464            }
2465
2466            if let Some(mimetype) = self.mimetype.as_ref() {
2467                magic.set_mime_type(Cow::Borrowed(mimetype));
2468            }
2469
2470            if let Some(apple_ty) = self.apple.as_ref() {
2471                magic.set_creator_code(Cow::Borrowed(apple_ty));
2472            }
2473
2474            if !self.exts.is_empty() {
2475                magic.insert_extensions(self.exts.iter().map(|s| s.as_str()));
2476            }
2477
2478            // NOTE: here we try to implement a similar logic as in file_magic_strength.
2479            // Sticking to the exact same strength computation logic is complicated due
2480            // to implementation differences. Let's wait and see if that is a real issue.
2481            let mut strength = self.entry.test_strength;
2482
2483            let continuation_level = self.entry.continuation_level().0 as u64;
2484            if self.entry.message.is_none() && continuation_level < 3 {
2485                strength = strength.saturating_add(continuation_level);
2486            }
2487
2488            if let Some(sm) = self.strength_mod.as_ref() {
2489                strength = sm.apply(strength);
2490            }
2491
2492            // entries with no message get a bonus
2493            if self.entry.message.is_none() {
2494                strength += 1
2495            }
2496
2497            magic.update_strength(strength);
2498
2499            let end_upper_level = haystack.lazy_stream_position();
2500
2501            // we have to fix rule_base_offset if
2502            // the rule_base_starts from end otherwise it
2503            // breaks some offset computation in match
2504            // see test_offset_bug_1 and test_offset_bug_2
2505            // they implement the same test logic yet indirect
2506            // offsets have to be different so that it works
2507            // in libmagic/file
2508            let rule_base_offset = if self.root {
2509                match self.entry.offset {
2510                    Offset::Direct(DirOffset::End(o)) => {
2511                        Some(haystack.offset_from_start(SeekFrom::End(o)))
2512                    }
2513                    _ => rule_base_offset,
2514                }
2515            } else {
2516                rule_base_offset
2517            };
2518
2519            for e in self.children.iter() {
2520                e.matches(
2521                    opt_source,
2522                    magic,
2523                    state,
2524                    stream_kind,
2525                    buf_base_offset,
2526                    rule_base_offset,
2527                    Some(end_upper_level),
2528                    haystack,
2529                    db,
2530                    switch_endianness,
2531                    depth,
2532                )?
2533            }
2534        }
2535
2536        Ok(())
2537    }
2538}
2539
/// Represents a parsed magic rule
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicRule {
    /// identifier of the rule, assigned through `set_id`
    id: usize,
    /// name of the source the rule comes from, if known
    source: Option<String>,
    /// root of the rule's entry tree
    entries: EntryNode,
    /// all the extensions associated to the rule, dependencies
    /// included once the rule is finalized
    extensions: HashSet<String>,
    /// score used for rule ranking
    score: u64,
    /// true once extensions and score have been computed
    finalized: bool,
}
2551
2552impl MagicRule {
    /// Sets the rule's identifier
    #[inline(always)]
    fn set_id(&mut self, id: usize) {
        self.id = id
    }
2557
2558    /// Fetches all the extensions defined in the magic rule. This
2559    /// function goes recursive and find extensions also defined in
2560    /// dependencies
2561    fn fetch_all_extensions(
2562        &self,
2563        deps: &HashMap<String, DependencyRule>,
2564        marked: &mut HashSet<String>,
2565    ) -> Result<HashSet<String>, ()> {
2566        let mut exts = HashSet::new();
2567        self.entries.update_exts_rec(&mut exts, deps, marked)?;
2568        Ok(exts)
2569    }
2570
2571    /// Computes the ranking score of a magic rule by walking
2572    /// tests recursively, dependencies included.
2573    fn compute_score(
2574        &self,
2575        depth: usize,
2576        deps: &HashMap<String, DependencyRule>,
2577        marked: &mut HashSet<String>,
2578    ) -> u64 {
2579        let mut score = 0;
2580        score += self.entries.entry.test_strength;
2581        self.entries
2582            .update_score_rec(depth, &mut score, deps, marked);
2583        score
2584    }
2585
2586    /// Finalize a rule by searching for all extensions and computing its score
2587    /// for ranking. In the `MagicRule` is already finalized it returns immediately.
2588    fn try_finalize(&mut self, deps: &HashMap<String, DependencyRule>) {
2589        if self.finalized {
2590            return;
2591        }
2592
2593        let Ok(exts) = self.fetch_all_extensions(deps, &mut HashSet::new()) else {
2594            return;
2595        };
2596
2597        self.extensions.extend(exts);
2598
2599        // fetch_all_extensions walks through all the dependencies
2600        // so there is no reason for compute_score to fail as it is walking
2601        // only some of them
2602        self.score = self.compute_score(0, deps, &mut HashSet::new());
2603        self.finalized = true
2604    }
2605
2606    #[inline]
2607    fn magic_entrypoint<'r, R: Read + Seek>(
2608        &'r self,
2609        magic: &mut Magic<'r>,
2610        stream_kind: StreamKind,
2611        haystack: &mut LazyCache<R>,
2612        db: &'r MagicDb,
2613        switch_endianness: bool,
2614        depth: usize,
2615    ) -> Result<(), Error> {
2616        self.entries.matches(
2617            self.source.as_deref(),
2618            magic,
2619            &mut MatchState::empty(),
2620            stream_kind,
2621            None,
2622            None,
2623            None,
2624            haystack,
2625            db,
2626            switch_endianness,
2627            depth,
2628        )
2629    }
2630
2631    #[inline]
2632    #[allow(clippy::too_many_arguments)]
2633    fn magic<'r, R: Read + Seek>(
2634        &'r self,
2635        magic: &mut Magic<'r>,
2636        stream_kind: StreamKind,
2637        buf_base_offset: Option<u64>,
2638        rule_base_offset: Option<u64>,
2639        haystack: &mut LazyCache<R>,
2640        db: &'r MagicDb,
2641        switch_endianness: bool,
2642        depth: usize,
2643    ) -> Result<(), Error> {
2644        self.entries.matches(
2645            self.source.as_deref(),
2646            magic,
2647            &mut MatchState::empty(),
2648            stream_kind,
2649            buf_base_offset,
2650            rule_base_offset,
2651            None,
2652            haystack,
2653            db,
2654            switch_endianness,
2655            depth,
2656        )
2657    }
2658
2659    /// Checks if the rule is for matching against text content
2660    ///
2661    /// # Returns
2662    ///
2663    /// * `bool` - True if the rule is for text files
2664    pub fn is_text(&self) -> bool {
2665        self.entries.entry.test.is_text()
2666            && self.entries.children.iter().all(|e| e.entry.test.is_text())
2667    }
2668
    /// Gets the rule's score used for ranking rules between them
    ///
    /// The score is the value computed when the rule is finalized.
    ///
    /// # Returns
    ///
    /// * `u64` - The rule's score
    #[inline(always)]
    pub fn score(&self) -> u64 {
        self.score
    }
2678
    /// Gets the name of the source the rule comes from, if any
    ///
    /// # Returns
    ///
    /// * `Option<&str>` - The rule's source if available
    #[inline(always)]
    pub fn source(&self) -> Option<&str> {
        self.source.as_deref()
    }
2688
    /// Gets the line number at which the rule is defined, i.e. the line
    /// of the rule's root entry
    ///
    /// # Returns
    ///
    /// * `usize` - The rule's line number
    #[inline(always)]
    pub fn line(&self) -> usize {
        self.entries.entry.line
    }
2698
    /// Gets all the file extensions associated to the rule
    ///
    /// # Returns
    ///
    /// * `&HashSet<String>` - The set of all associated extensions
    #[inline(always)]
    pub fn extensions(&self) -> &HashSet<String> {
        &self.extensions
    }
2708}
2709
/// A named rule which other rules run through a `use` test.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct DependencyRule {
    /// name of the rule (presumably the key it is stored under in the
    /// dependency maps — TODO confirm)
    name: String,
    /// the rule to run
    rule: MagicRule,
}
2715
/// A parsed source of magic rules
///
/// # Methods
///
/// * `open` - Opens a magic file from a path
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicSource {
    /// the rules defined in the source
    rules: Vec<MagicRule>,
    /// the named rules defined in the source, indexed by name
    dependencies: HashMap<String, DependencyRule>,
}
2726
2727impl MagicSource {
2728    /// Opens and parses a magic file from a path
2729    ///
2730    /// # Arguments
2731    ///
2732    /// * `p` - The path to the magic file
2733    ///
2734    /// # Returns
2735    ///
2736    /// * `Result<Self, Error>` - The parsed magic file or an error
2737    pub fn open<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
2738        FileMagicParser::parse_file(p)
2739    }
2740}
2741
/// A continuation level, stored as a `u8`.
///
/// [`MatchState`] uses the wrapped value as an index into its table of flags.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct ContinuationLevel(u8);
2744
// FIXME: magic handles many more text encodings
/// Encoding guessed for a text stream
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum TextEncoding {
    Ascii,
    Utf8,
    Unknown,
}

impl TextEncoding {
    /// Returns the encoding name as it is written in magic messages
    const fn as_magic_str(&self) -> &'static str {
        match *self {
            Self::Ascii => "ASCII",
            Self::Utf8 => "UTF-8",
            Self::Unknown => "Unknown",
        }
    }
}
2762
2763#[derive(Debug, PartialEq, Eq, Clone, Copy)]
2764enum StreamKind {
2765    Binary,
2766    Text(TextEncoding),
2767}
2768
2769impl StreamKind {
2770    const fn is_text(&self) -> bool {
2771        matches!(self, StreamKind::Text(_))
2772    }
2773}
2774
2775#[derive(Debug)]
2776struct MatchState {
2777    continuation_levels: [bool; 256],
2778}
2779
2780impl MatchState {
2781    #[inline(always)]
2782    fn empty() -> Self {
2783        MatchState {
2784            continuation_levels: [false; 256],
2785        }
2786    }
2787
2788    #[inline(always)]
2789    fn get_continuation_level(&mut self, level: &ContinuationLevel) -> bool {
2790        self.continuation_levels
2791            .get(level.0 as usize)
2792            .cloned()
2793            .unwrap_or_default()
2794    }
2795
2796    #[inline(always)]
2797    fn set_continuation_level(&mut self, level: ContinuationLevel) {
2798        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2799            *b = true
2800        }
2801    }
2802
2803    #[inline(always)]
2804    fn clear_continuation_level(&mut self, level: &ContinuationLevel) {
2805        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2806            *b = false;
2807        }
2808    }
2809}
2810
/// Represents a file magic detection result
///
/// String data is held in [`Cow`] so it can be borrowed for `'m`; use
/// [`Magic::into_owned`] to get a value that no longer borrows.
#[derive(Debug, Default)]
pub struct Magic<'m> {
    /// Kind of stream the magic was computed for; selects the default
    /// MIME type in [`Magic::mime_type`] when no MIME type was set
    stream_kind: Option<StreamKind>,
    /// Source returned by [`Magic::source`]
    source: Option<Cow<'m, str>>,
    /// Message parts, joined by [`Magic::message`]
    message: Vec<Cow<'m, str>>,
    /// MIME type, if one was set
    mime_type: Option<Cow<'m, str>>,
    /// Apple creator code, if one was set
    creator_code: Option<Cow<'m, str>>,
    /// Accumulated strength, returned by [`Magic::strength`]
    strength: u64,
    /// Possible file extensions
    exts: HashSet<Cow<'m, str>>,
    /// True when this is the default fallback detection
    is_default: bool,
}
2823
2824impl<'m> Magic<'m> {
2825    #[inline(always)]
2826    fn set_source(&mut self, source: Option<&'m str>) {
2827        self.source = source.map(Cow::Borrowed);
2828    }
2829
2830    #[inline(always)]
2831    fn set_stream_kind(&mut self, stream_kind: StreamKind) {
2832        self.stream_kind = Some(stream_kind)
2833    }
2834
2835    #[inline(always)]
2836    fn reset(&mut self) {
2837        self.stream_kind = None;
2838        self.source = None;
2839        self.message.clear();
2840        self.mime_type = None;
2841        self.creator_code = None;
2842        self.strength = 0;
2843        self.exts.clear();
2844        self.is_default = false;
2845    }
2846
2847    /// Converts borrowed data into owned data. This method involves
2848    /// data cloning, so you must use this method only if you need to
2849    /// extend the lifetime of a [`Magic`] struct.
2850    ///
2851    /// # Returns
2852    ///
2853    /// * `Magic<'owned>` - A new [`Magic`] with owned data
2854    #[inline]
2855    pub fn into_owned<'owned>(self) -> Magic<'owned> {
2856        Magic {
2857            stream_kind: self.stream_kind,
2858            source: self.source.map(|s| Cow::Owned(s.into_owned())),
2859            message: self
2860                .message
2861                .into_iter()
2862                .map(Cow::into_owned)
2863                .map(Cow::Owned)
2864                .collect(),
2865            mime_type: self.mime_type.map(|m| Cow::Owned(m.into_owned())),
2866            creator_code: self.creator_code.map(|m| Cow::Owned(m.into_owned())),
2867            strength: self.strength,
2868            exts: self
2869                .exts
2870                .into_iter()
2871                .map(|e| Cow::Owned(e.into_owned()))
2872                .collect(),
2873            is_default: self.is_default,
2874        }
2875    }
2876
2877    /// Gets the formatted message describing the file type
2878    ///
2879    /// # Returns
2880    ///
2881    /// * `String` - The formatted message
2882    #[inline(always)]
2883    pub fn message(&self) -> String {
2884        let mut out = String::new();
2885        for (i, m) in self.message.iter().enumerate() {
2886            if let Some(s) = m.strip_prefix(r#"\b"#) {
2887                out.push_str(s);
2888            } else {
2889                // don't put space on first string
2890                if i > 0 {
2891                    out.push(' ');
2892                }
2893                out.push_str(m);
2894            }
2895        }
2896        out
2897    }
2898
2899    /// Returns an iterator over the individual parts of the magic message
2900    ///
2901    /// A magic message is typically composed of multiple parts, each appended
2902    /// during successful magic tests. This method provides an efficient way to
2903    /// iterate over these parts without concatenating them into a new string,
2904    /// as done when calling [`Magic::message`].
2905    ///
2906    /// # Returns
2907    ///
2908    /// * `impl Iterator<Item = &str>` - An iterator yielding string slices of each message part
2909    #[inline]
2910    pub fn message_parts(&self) -> impl Iterator<Item = &str> {
2911        self.message.iter().map(|p| p.as_ref())
2912    }
2913
2914    #[inline(always)]
2915    fn update_strength(&mut self, value: u64) {
2916        self.strength = self.strength.saturating_add(value);
2917        debug!("updated strength = {:?}", self.strength)
2918    }
2919
2920    /// Gets the detected MIME type
2921    ///
2922    /// # Returns
2923    ///
2924    /// * `&str` - The MIME type or default based on stream kind
2925    #[inline(always)]
2926    pub fn mime_type(&self) -> &str {
2927        self.mime_type.as_deref().unwrap_or(match self.stream_kind {
2928            Some(StreamKind::Text(_)) => DEFAULT_TEXT_MIMETYPE,
2929            Some(StreamKind::Binary) | None => DEFAULT_BIN_MIMETYPE,
2930        })
2931    }
2932
2933    #[inline(always)]
2934    fn push_message<'a: 'm>(&mut self, msg: Cow<'a, str>) {
2935        if !msg.is_empty() {
2936            debug!("pushing message: msg={msg} len={}", msg.len());
2937            self.message.push(msg);
2938        }
2939    }
2940
2941    #[inline(always)]
2942    fn set_mime_type<'a: 'm>(&mut self, mime: Cow<'a, str>) {
2943        if self.mime_type.is_none() {
2944            debug!("insert mime: {:?}", mime);
2945            self.mime_type = Some(mime)
2946        }
2947    }
2948
2949    #[inline(always)]
2950    fn set_creator_code<'a: 'm>(&mut self, apple_ty: Cow<'a, str>) {
2951        if self.creator_code.is_none() {
2952            debug!("insert apple type: {apple_ty:?}");
2953            self.creator_code = Some(apple_ty)
2954        }
2955    }
2956
2957    #[inline(always)]
2958    fn insert_extensions<'a: 'm, I: Iterator<Item = &'a str>>(&mut self, exts: I) {
2959        if self.exts.is_empty() {
2960            self.exts.extend(exts.filter_map(|e| {
2961                if e.is_empty() {
2962                    None
2963                } else {
2964                    Some(Cow::Borrowed(e))
2965                }
2966            }));
2967        }
2968    }
2969
2970    /// Gets the confidence score of the detection. This
2971    /// value is used to sort [`Magic`] in [`MagicDb::best_magic`]
2972    /// and [`MagicDb::all_magics`].
2973    ///
2974    /// # Returns
2975    ///
2976    /// * `u64` - The confidence score attributed to that [`Magic`]
2977    #[inline(always)]
2978    pub fn strength(&self) -> u64 {
2979        self.strength
2980    }
2981
2982    /// Gets the filename where the magic rule was defined
2983    ///
2984    /// # Returns
2985    ///
2986    /// * `Option<&str>` - The source if available
2987    #[inline(always)]
2988    pub fn source(&self) -> Option<&str> {
2989        self.source.as_deref()
2990    }
2991
2992    /// Gets the Apple creator code if available
2993    ///
2994    /// # Returns
2995    ///
2996    /// * `Option<&str>` - The creator code if available
2997    #[inline(always)]
2998    pub fn creator_code(&self) -> Option<&str> {
2999        self.creator_code.as_deref()
3000    }
3001
3002    /// Gets the possible file extensions for the detected [`Magic`]
3003    ///
3004    /// # Returns
3005    ///
3006    /// * `&HashSet<Cow<'m, str>>` - The set of possible extensions
3007    #[inline(always)]
3008    pub fn extensions(&self) -> &HashSet<Cow<'m, str>> {
3009        &self.exts
3010    }
3011
3012    /// Checks if this is a default fallback detection
3013    ///
3014    /// # Returns
3015    ///
3016    /// * `bool` - True if this is a default detection
3017    #[inline(always)]
3018    pub fn is_default(&self) -> bool {
3019        self.is_default
3020    }
3021}
3022
/// Represents a database of [`MagicRule`]
///
/// Rules are added with [`MagicDb::load`]. The database can be written with
/// [`MagicDb::serialize`] and read back with [`MagicDb::deserialize`].
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct MagicDb {
    /// Identifier handed to the next loaded rule
    rule_id: usize,
    /// All loaded rules
    rules: Vec<MagicRule>,
    /// Dependency rules collected from every loaded [`MagicSource`]
    dependencies: HashMap<String, DependencyRule>,
}
3030
/// Returns `true` if the byte stream is likely text.
///
/// An empty input, or an input containing a NUL byte, is never text.
/// Otherwise the input is considered text when more than 85% of its bytes
/// are tab, line feed, carriage return or printable ASCII (`0x20..=0x7E`)
/// and fewer than 20% are anything else.
#[inline(always)]
fn is_likely_text(bytes: &[u8]) -> bool {
    if bytes.is_empty() {
        return false;
    }

    let mut printable = 0usize;
    // everything that is not counted as printable: control characters,
    // 0x7F and bytes above it
    let mut other = 0usize;

    for &byte in bytes {
        match byte {
            0x00 => return false,
            // whitespace and printable ASCII
            b'\t' | b'\n' | b'\r' | 0x20..=0x7E => printable += 1,
            _ => other += 1,
        }
    }

    let total = bytes.len() as f64;
    let printable_ratio = printable as f64 / total;
    let other_ratio = other as f64 / total;

    // Heuristic thresholds (adjust as needed):
    printable_ratio > 0.85 && other_ratio < 0.20
}
3057
3058#[inline(always)]
3059fn guess_stream_kind<S: AsRef<[u8]>>(stream: S) -> StreamKind {
3060    let Ok(s) = str::from_utf8(stream.as_ref()) else {
3061        if is_likely_text(stream.as_ref()) {
3062            return StreamKind::Text(TextEncoding::Unknown);
3063        } else {
3064            return StreamKind::Binary;
3065        }
3066    };
3067
3068    let count = s.chars().count();
3069    let mut is_ascii = true;
3070
3071    for c in s.chars().take(count.saturating_sub(1)) {
3072        is_ascii &= c.is_ascii()
3073    }
3074
3075    if is_ascii {
3076        StreamKind::Text(TextEncoding::Ascii)
3077    } else {
3078        StreamKind::Text(TextEncoding::Utf8)
3079    }
3080}
3081
3082impl MagicDb {
3083    fn open_reader<R: Read + Seek>(f: R) -> Result<LazyCache<R>, Error> {
3084        Ok(LazyCache::<R>::from_read_seek(f)
3085            .and_then(|lc| lc.with_hot_cache(2 * FILE_BYTES_MAX))?)
3086        .map(|lc| lc.with_warm_cache(100 << 20))
3087    }
3088
3089    /// Creates a new empty database
3090    ///
3091    /// # Returns
3092    ///
3093    /// * [`MagicDb`] - A new empty database
3094    pub fn new() -> Self {
3095        Self::default()
3096    }
3097
3098    #[inline(always)]
3099    fn next_rule_id(&mut self) -> usize {
3100        let t = self.rule_id;
3101        self.rule_id += 1;
3102        t
3103    }
3104
3105    #[inline(always)]
3106    fn try_json<R: Read + Seek>(
3107        haystack: &mut LazyCache<R>,
3108        stream_kind: StreamKind,
3109        magic: &mut Magic,
3110    ) -> Result<bool, Error> {
3111        // cannot be json if content is binary
3112        if matches!(stream_kind, StreamKind::Binary) {
3113            return Ok(false);
3114        }
3115
3116        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?.trim_ascii();
3117
3118        let Some((start, end)) = find_json_boundaries(buf) else {
3119            return Ok(false);
3120        };
3121
3122        // if anything else than whitespace before start
3123        // this is not json
3124        for c in buf[0..start].iter() {
3125            if !c.is_ascii_whitespace() {
3126                return Ok(false);
3127            }
3128        }
3129
3130        let mut is_ndjson = false;
3131
3132        trace!("maybe a json document");
3133        let ok = serde_json::from_slice::<serde_json::Value>(&buf[start..=end]).is_ok();
3134        if !ok {
3135            return Ok(false);
3136        }
3137
3138        // we are sure it is json now we must look if we are ndjson
3139        if end + 1 < buf.len() {
3140            // after first json
3141            let buf = &buf[end + 1..];
3142            if let Some((second_start, second_end)) = find_json_boundaries(buf) {
3143                // there is a new line between the two json docs
3144                if memchr(b'\n', &buf[..second_start]).is_some() {
3145                    trace!("might be ndjson");
3146                    is_ndjson = serde_json::from_slice::<serde_json::Value>(
3147                        &buf[second_start..=second_end],
3148                    )
3149                    .is_ok();
3150                }
3151            }
3152        }
3153
3154        if is_ndjson {
3155            magic.push_message(Cow::Borrowed("New Line Delimited"));
3156            magic.set_mime_type(Cow::Borrowed("application/x-ndjson"));
3157            magic.insert_extensions(["ndjson", "jsonl"].into_iter());
3158        } else {
3159            magic.set_mime_type(Cow::Borrowed("application/json"));
3160            magic.insert_extensions(["json"].into_iter());
3161        }
3162
3163        magic.push_message(Cow::Borrowed("JSON text data"));
3164        magic.set_source(Some(HARDCODED_SOURCE));
3165        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3166        Ok(true)
3167    }
3168
3169    #[inline(always)]
3170    fn try_csv<R: Read + Seek>(
3171        haystack: &mut LazyCache<R>,
3172        stream_kind: StreamKind,
3173        magic: &mut Magic,
3174    ) -> Result<bool, Error> {
3175        // cannot be csv if content is binary
3176        let StreamKind::Text(enc) = stream_kind else {
3177            return Ok(false);
3178        };
3179
3180        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3181        let mut reader = csv::Reader::from_reader(io::Cursor::new(buf));
3182        let mut records = reader.records();
3183
3184        let Some(Ok(first)) = records.next() else {
3185            return Ok(false);
3186        };
3187
3188        // very not likely a CSV otherwise all programming
3189        // languages having ; line terminator would be
3190        // considered as CSV
3191        if first.len() <= 1 {
3192            return Ok(false);
3193        }
3194
3195        // we already parsed first line
3196        let mut n = 1;
3197        for i in records.take(9) {
3198            if let Ok(rec) = i {
3199                if first.len() != rec.len() {
3200                    return Ok(false);
3201                }
3202            } else {
3203                return Ok(false);
3204            }
3205            n += 1;
3206        }
3207
3208        // we need at least 10 lines
3209        if n != 10 {
3210            return Ok(false);
3211        }
3212
3213        magic.set_mime_type(Cow::Borrowed("text/csv"));
3214        magic.push_message(Cow::Borrowed("CSV"));
3215        magic.push_message(Cow::Borrowed(enc.as_magic_str()));
3216        magic.push_message(Cow::Borrowed("text"));
3217        magic.insert_extensions(["csv"].into_iter());
3218        magic.set_source(Some(HARDCODED_SOURCE));
3219        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3220        Ok(true)
3221    }
3222
3223    #[inline(always)]
3224    fn try_tar<R: Read + Seek>(
3225        haystack: &mut LazyCache<R>,
3226        stream_kind: StreamKind,
3227        magic: &mut Magic,
3228    ) -> Result<bool, Error> {
3229        // cannot be json if content is not binary
3230        if !matches!(stream_kind, StreamKind::Binary) {
3231            return Ok(false);
3232        }
3233
3234        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3235        let mut ar = Archive::new(io::Cursor::new(buf));
3236
3237        let Ok(mut entries) = ar.entries() else {
3238            return Ok(false);
3239        };
3240
3241        let Some(Ok(first)) = entries.next() else {
3242            return Ok(false);
3243        };
3244
3245        let header = first.header();
3246
3247        if header.as_ustar().is_some() {
3248            magic.push_message(Cow::Borrowed("POSIX tar archive"));
3249        } else if header.as_gnu().is_some() {
3250            magic.push_message(Cow::Borrowed("POSIX tar archive (GNU)"));
3251        } else {
3252            magic.push_message(Cow::Borrowed("tar archive"));
3253        }
3254
3255        magic.set_mime_type(Cow::Borrowed("application/x-tar"));
3256        magic.set_source(Some(HARDCODED_SOURCE));
3257        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3258        magic.insert_extensions(["tar"].into_iter());
3259        Ok(true)
3260    }
3261
3262    #[inline(always)]
3263    fn try_hard_magic<R: Read + Seek>(
3264        haystack: &mut LazyCache<R>,
3265        stream_kind: StreamKind,
3266        magic: &mut Magic,
3267    ) -> Result<bool, Error> {
3268        Ok(Self::try_json(haystack, stream_kind, magic)?
3269            || Self::try_csv(haystack, stream_kind, magic)?
3270            || Self::try_tar(haystack, stream_kind, magic)?)
3271    }
3272
3273    #[inline(always)]
3274    fn magic_default<'m, R: Read + Seek>(
3275        haystack: &mut LazyCache<R>,
3276        stream_kind: StreamKind,
3277        magic: &mut Magic<'m>,
3278    ) -> Result<(), Error> {
3279        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3280
3281        magic.set_source(Some(HARDCODED_SOURCE));
3282        magic.set_stream_kind(stream_kind);
3283        magic.is_default = true;
3284
3285        if buf.is_empty() {
3286            magic.push_message(Cow::Borrowed("empty"));
3287            magic.set_mime_type(Cow::Borrowed(DEFAULT_BIN_MIMETYPE));
3288            return Ok(());
3289        }
3290
3291        match stream_kind {
3292            StreamKind::Binary => {
3293                magic.push_message(Cow::Borrowed("data"));
3294            }
3295            StreamKind::Text(e) => {
3296                magic.push_message(Cow::Borrowed(e.as_magic_str()));
3297                magic.push_message(Cow::Borrowed("text"));
3298            }
3299        }
3300
3301        Ok(())
3302    }
3303
3304    /// Loads rules from a [`MagicSource`]
3305    ///
3306    /// # Arguments
3307    ///
3308    /// * `mf` - The [`MagicSource`] to load rules from
3309    ///
3310    /// # Returns
3311    ///
3312    /// * `Result<&mut Self, Error>` - Self for chaining or an error
3313    pub fn load(&mut self, mf: MagicSource) -> Result<&mut Self, Error> {
3314        for rule in mf.rules.into_iter() {
3315            let mut rule = rule;
3316            rule.set_id(self.next_rule_id());
3317
3318            self.rules.push(rule);
3319        }
3320
3321        self.dependencies.extend(mf.dependencies);
3322        self.prepare();
3323        Ok(self)
3324    }
3325
3326    /// Gets all rules in the database
3327    ///
3328    /// # Returns
3329    ///
3330    /// * `&[MagicRule]` - A slice of all rules
3331    pub fn rules(&self) -> &[MagicRule] {
3332        &self.rules
3333    }
3334
3335    #[inline]
3336    fn first_magic_with_stream_kind<R: Read + Seek>(
3337        &self,
3338        haystack: &mut LazyCache<R>,
3339        stream_kind: StreamKind,
3340        extension: Option<&str>,
3341    ) -> Result<Magic<'_>, Error> {
3342        // re-using magic makes this function faster
3343        let mut magic = Magic::default();
3344
3345        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3346            return Ok(magic);
3347        }
3348
3349        let mut marked = vec![false; self.rules.len()];
3350
3351        macro_rules! do_magic {
3352            ($rule: expr) => {{
3353                $rule.magic_entrypoint(&mut magic, stream_kind, haystack, &self, false, 0)?;
3354
3355                if !magic.message.is_empty() {
3356                    magic.set_stream_kind(stream_kind);
3357                    magic.set_source($rule.source.as_deref());
3358                    return Ok(magic);
3359                }
3360
3361                magic.reset();
3362            }};
3363        }
3364
3365        if let Some(ext) = extension.map(|e| e.to_lowercase())
3366            && !ext.is_empty()
3367        {
3368            for rule in self.rules.iter().filter(|r| r.extensions.contains(&ext)) {
3369                do_magic!(rule);
3370                if let Some(f) = marked.get_mut(rule.id) {
3371                    *f = true
3372                }
3373            }
3374        }
3375
3376        for rule in self
3377            .rules
3378            .iter()
3379            // we don't run again rules run by extension
3380            .filter(|r| !*marked.get(r.id).unwrap_or(&false))
3381        {
3382            do_magic!(rule)
3383        }
3384
3385        Self::magic_default(haystack, stream_kind, &mut magic)?;
3386
3387        Ok(magic)
3388    }
3389
3390    /// Detects file [`Magic`] stopping at the first matching magic. Magic
3391    /// rules are evaluated from the best to the least relevant, so this method
3392    /// returns most of the time the best magic. For the rare cases where
3393    /// it doesn't or if the best result is always required, use [`MagicDb::best_magic`]
3394    ///
3395    /// # Arguments
3396    ///
3397    /// * `r` - A readable and seekable input
3398    /// * `extension` - Optional file extension to use for acceleration
3399    ///
3400    /// # Returns
3401    ///
3402    /// * `Result<Magic<'_>, Error>` - The detection result or an error
3403    pub fn first_magic<R: Read + Seek>(
3404        &self,
3405        r: &mut R,
3406        extension: Option<&str>,
3407    ) -> Result<Magic<'_>, Error> {
3408        let mut haystack = Self::open_reader(r)?;
3409        let stream_kind = guess_stream_kind(haystack.read_range(0..FILE_BYTES_MAX as u64)?);
3410        self.first_magic_with_stream_kind(&mut haystack, stream_kind, extension)
3411    }
3412
3413    #[inline(always)]
3414    fn all_magics_sort_with_stream_kind<R: Read + Seek>(
3415        &self,
3416        haystack: &mut LazyCache<R>,
3417        stream_kind: StreamKind,
3418    ) -> Result<Vec<Magic<'_>>, Error> {
3419        let mut out = Vec::new();
3420
3421        let mut magic = Magic::default();
3422
3423        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3424            out.push(magic);
3425            magic = Magic::default();
3426        }
3427
3428        for rule in self.rules.iter() {
3429            rule.magic_entrypoint(&mut magic, stream_kind, haystack, self, false, 0)?;
3430
3431            // it is possible we have a strength with no message
3432            if !magic.message.is_empty() {
3433                magic.set_stream_kind(stream_kind);
3434                magic.set_source(rule.source.as_deref());
3435                out.push(magic);
3436                magic = Magic::default();
3437            }
3438
3439            magic.reset();
3440        }
3441
3442        Self::magic_default(haystack, stream_kind, &mut magic)?;
3443        out.push(magic);
3444
3445        out.sort_by_key(|b| std::cmp::Reverse(b.strength()));
3446
3447        Ok(out)
3448    }
3449
3450    /// Detects all [`Magic`] matching a given content.
3451    ///
3452    /// # Arguments
3453    ///
3454    /// * `r` - A readable and seekable input
3455    ///
3456    /// # Returns
3457    ///
3458    /// * `Result<Vec<Magic<'_>>, Error>` - All detection results sorted by strength or an error
3459    pub fn all_magics<R: Read + Seek>(&self, r: &mut R) -> Result<Vec<Magic<'_>>, Error> {
3460        let mut haystack = Self::open_reader(r)?;
3461        let stream_kind = guess_stream_kind(haystack.read_range(0..FILE_BYTES_MAX as u64)?);
3462        self.all_magics_sort_with_stream_kind(&mut haystack, stream_kind)
3463    }
3464
3465    #[inline(always)]
3466    fn best_magic_with_stream_kind<R: Read + Seek>(
3467        &self,
3468        haystack: &mut LazyCache<R>,
3469        stream_kind: StreamKind,
3470    ) -> Result<Magic<'_>, Error> {
3471        let magics = self.all_magics_sort_with_stream_kind(haystack, stream_kind)?;
3472
3473        // magics is guaranteed to contain at least the default magic
3474        return Ok(magics
3475            .into_iter()
3476            .next()
3477            .expect("magics must at least contain default"));
3478    }
3479
3480    /// Detects the best [`Magic`] matching a given content.
3481    ///
3482    /// # Arguments
3483    ///
3484    /// * `r` - A readable and seekable input
3485    ///
3486    /// # Returns
3487    ///
3488    /// * `Result<Magic<'_>, Error>` - The best detection result or an error
3489    pub fn best_magic<R: Read + Seek>(&self, r: &mut R) -> Result<Magic<'_>, Error> {
3490        let mut haystack = Self::open_reader(r)?;
3491        let stream_kind = guess_stream_kind(haystack.read_range(0..FILE_BYTES_MAX as u64)?);
3492        self.best_magic_with_stream_kind(&mut haystack, stream_kind)
3493    }
3494
3495    /// Serializes the database to a generic writer implementing [`io::Write`]
3496    ///
3497    /// # Returns
3498    ///
3499    /// * `Result<(), Error>` - The serialized database or an error
3500    pub fn serialize<W: Write>(self, w: &mut W) -> Result<(), Error> {
3501        let mut encoder = GzEncoder::new(w, Compression::best());
3502
3503        bincode::serde::encode_into_std_write(&self, &mut encoder, bincode::config::standard())?;
3504        encoder.finish()?;
3505        Ok(())
3506    }
3507
3508    /// Deserializes the database from a generic reader implementing [`io::Read`]
3509    ///
3510    /// # Arguments
3511    ///
3512    /// * `r` - The reader to deserialize from
3513    ///
3514    /// # Returns
3515    ///
3516    /// * `Result<Self, Error>` - The deserialized database or an error
3517    pub fn deserialize<R: Read>(r: &mut R) -> Result<Self, Error> {
3518        let mut buf = vec![];
3519        let mut gz = GzDecoder::new(r);
3520        gz.read_to_end(&mut buf).map_err(|e| {
3521            bincode::error::DecodeError::OtherString(format!("failed to read: {e}"))
3522        })?;
3523        let (sdb, _): (MagicDb, usize) =
3524            bincode::serde::decode_from_slice(&buf, bincode::config::standard())?;
3525        Ok(sdb)
3526    }
3527
3528    #[inline(always)]
3529    fn prepare(&mut self) {
3530        self.rules
3531            .iter_mut()
3532            .for_each(|r| r.try_finalize(&self.dependencies));
3533
3534        // put text rules at the end
3535        self.rules.sort_by_key(|r| (r.is_text(), -(r.score as i64)));
3536    }
3537}
3538
3539#[cfg(test)]
3540mod tests {
3541    use std::io::Cursor;
3542
3543    use regex::bytes::Regex;
3544
3545    use crate::utils::unix_local_time_to_string;
3546
3547    use super::*;
3548
3549    macro_rules! lazy_cache {
3550        ($l: literal) => {
3551            LazyCache::from_read_seek(Cursor::new($l)).unwrap()
3552        };
3553    }
3554
3555    fn first_magic(
3556        rule: &str,
3557        content: &[u8],
3558        stream_kind: StreamKind,
3559    ) -> Result<Magic<'static>, Error> {
3560        let mut md = MagicDb::new();
3561        md.load(
3562            FileMagicParser::parse_str(rule, None)
3563                .inspect_err(|e| eprintln!("{e}"))
3564                .unwrap(),
3565        )
3566        .unwrap();
3567        let mut reader = LazyCache::from_read_seek(Cursor::new(content)).unwrap();
3568        let v = md.best_magic_with_stream_kind(&mut reader, stream_kind)?;
3569        Ok(v.into_owned())
3570    }
3571
3572    /// helper macro to debug tests
3573    #[allow(unused_macros)]
3574    macro_rules! enable_trace {
3575        () => {
3576            tracing_subscriber::fmt()
3577                .with_max_level(tracing_subscriber::filter::LevelFilter::TRACE)
3578                .try_init();
3579        };
3580    }
3581
3582    macro_rules! parse_assert {
3583        ($rule:literal) => {
3584            FileMagicParser::parse_str($rule, None)
3585                .inspect_err(|e| eprintln!("{e}"))
3586                .unwrap();
3587        };
3588    }
3589
3590    macro_rules! assert_magic_match_bin {
3591        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Binary).unwrap() }};
3592        ($rule: literal, $content:literal, $message:expr) => {{
3593            assert_eq!(
3594                first_magic($rule, $content, StreamKind::Binary)
3595                    .unwrap()
3596                    .message(),
3597                $message
3598            );
3599        }};
3600    }
3601
3602    macro_rules! assert_magic_match_text {
3603        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8)).unwrap() }};
3604        ($rule: literal, $content:literal, $message:expr) => {{
3605            assert_eq!(
3606                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
3607                    .unwrap()
3608                    .message(),
3609                $message
3610            );
3611        }};
3612    }
3613
3614    macro_rules! assert_magic_not_match_text {
3615        ($rule: literal, $content:literal) => {{
3616            assert!(
3617                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
3618                    .unwrap()
3619                    .is_default()
3620            );
3621        }};
3622    }
3623
3624    macro_rules! assert_magic_not_match_bin {
3625        ($rule: literal, $content:literal) => {{
3626            assert!(
3627                first_magic($rule, $content, StreamKind::Binary)
3628                    .unwrap()
3629                    .is_default()
3630            );
3631        }};
3632    }
3633
3634    #[test]
3635    fn test_regex() {
3636        assert_magic_match_text!(
3637            r#"
36380	regex/1024 \^#![[:space:]]*/usr/bin/env[[:space:]]+
3639!:mime	text/x-shellscript
3640>&0  regex/64 .*($|\\b) %s shell script text executable
3641    "#,
3642            br#"#!/usr/bin/env bash
3643        echo hello world"#,
3644            // the magic generated
3645            "bash shell script text executable"
3646        );
3647
3648        let re = Regex::new(r"(?-u)\x42\x82").unwrap();
3649        assert!(re.is_match(b"\x42\x82"));
3650
3651        assert_magic_match_bin!(
3652            r#"0 regex \x42\x82 binary regex match"#,
3653            b"\x00\x00\x00\x00\x00\x00\x42\x82"
3654        );
3655
3656        // test regex continuation after match
3657        assert_magic_match_bin!(
3658            r#"
3659            0 regex \x42\x82
3660            >&0 string \xde\xad\xbe\xef it works
3661            "#,
3662            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
3663        );
3664
3665        assert_magic_match_bin!(
3666            r#"
3667            0 regex/s \x42\x82
3668            >&0 string \x42\x82\xde\xad\xbe\xef it works
3669            "#,
3670            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
3671        );
3672
3673        // ^ must match stat of line when matching text
3674        assert_magic_match_text!(
3675            r#"
36760	regex/1024 \^HelloWorld$ HelloWorld String"#,
3677            br#"
3678// this is a comment after an empty line
3679HelloWorld
3680            "#
3681        );
3682    }
3683
3684    #[test]
3685    fn test_string_with_mods() {
3686        assert_magic_match_text!(
3687            r#"0	string/w	#!\ \ \ /usr/bin/env\ bash	BASH
3688        "#,
3689            b"#! /usr/bin/env bash i
3690        echo hello world"
3691        );
3692
3693        // test uppercase insensitive
3694        assert_magic_match_text!(
3695            r#"0	string/C	HelloWorld	it works
3696        "#,
3697            b"helloworld"
3698        );
3699
3700        assert_magic_not_match_text!(
3701            r#"0	string/C	HelloWorld	it works
3702        "#,
3703            b"hELLOwORLD"
3704        );
3705
3706        // test lowercase insensitive
3707        assert_magic_match_text!(
3708            r#"0	string/c	HelloWorld	it works
3709        "#,
3710            b"HELLOWORLD"
3711        );
3712
3713        assert_magic_not_match_text!(
3714            r#"0	string/c	HelloWorld	it works
3715        "#,
3716            b"helloworld"
3717        );
3718
3719        // test full word match
3720        assert_magic_match_text!(
3721            r#"0	string/f	#!/usr/bin/env\ bash	BASH
3722        "#,
3723            b"#!/usr/bin/env bash"
3724        );
3725
3726        assert_magic_not_match_text!(
3727            r#"0	string/f	#!/usr/bin/python PYTHON"#,
3728            b"#!/usr/bin/pythonic"
3729        );
3730
3731        // testing whitespace compacting
3732        assert_magic_match_text!(
3733            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
3734            b"#!/usr/bin/env    python"
3735        );
3736
3737        assert_magic_not_match_text!(
3738            r#"0	string/W	#!/usr/bin/env\ \ python  PYTHON"#,
3739            b"#!/usr/bin/env python"
3740        );
3741    }
3742
3743    #[test]
3744    fn test_search_with_mods() {
3745        assert_magic_match_text!(
3746            r#"0	search/1/fwt	#!\ /usr/bin/luatex	LuaTex script text executable"#,
3747            b"#!          /usr/bin/luatex "
3748        );
3749
3750        // test matching from the beginning
3751        assert_magic_match_text!(
3752            r#"
3753            0	search/s	/usr/bin/env
3754            >&0 string /usr/bin/env it works
3755            "#,
3756            b"#!/usr/bin/env    python"
3757        );
3758
3759        assert_magic_not_match_text!(
3760            r#"
3761            0	search	/usr/bin/env
3762            >&0 string /usr/bin/env it works
3763            "#,
3764            b"#!/usr/bin/env    python"
3765        );
3766    }
3767
3768    #[test]
3769    fn test_pstring() {
3770        assert_magic_match_bin!(r#"0 pstring Toast it works"#, b"\x05Toast");
3771
3772        assert_magic_match_bin!(r#"0 pstring Toast %s"#, b"\x05Toast", "Toast");
3773
3774        assert_magic_not_match_bin!(r#"0 pstring Toast Doesn't work"#, b"\x07Toaster");
3775
3776        // testing with modifiers
3777        assert_magic_match_bin!(r#"0 pstring/H Toast it works"#, b"\x00\x05Toast");
3778
3779        assert_magic_match_bin!(r#"0 pstring/HJ Toast it works"#, b"\x00\x07Toast");
3780
3781        assert_magic_match_bin!(r#"0 pstring/HJ Toast %s"#, b"\x00\x07Toast", "Toast");
3782
3783        assert_magic_match_bin!(r#"0 pstring/h Toast it works"#, b"\x05\x00Toast");
3784
3785        assert_magic_match_bin!(r#"0 pstring/hJ Toast it works"#, b"\x07\x00Toast");
3786
3787        assert_magic_match_bin!(r#"0 pstring/L Toast it works"#, b"\x00\x00\x00\x05Toast");
3788
3789        assert_magic_match_bin!(r#"0 pstring/LJ Toast it works"#, b"\x00\x00\x00\x09Toast");
3790
3791        assert_magic_match_bin!(r#"0 pstring/l Toast it works"#, b"\x05\x00\x00\x00Toast");
3792
3793        assert_magic_match_bin!(r#"0 pstring/lJ Toast it works"#, b"\x09\x00\x00\x00Toast");
3794    }
3795
3796    #[test]
3797    fn test_max_recursion() {
3798        let res = first_magic(
3799            r#"0	indirect x"#,
3800            b"#!          /usr/bin/luatex ",
3801            StreamKind::Binary,
3802        );
3803        assert!(res.is_err());
3804        let _ = res.inspect_err(|e| {
3805            assert!(matches!(
3806                e.unwrap_localized(),
3807                Error::MaximumRecursion(MAX_RECURSION)
3808            ))
3809        });
3810    }
3811
3812    #[test]
3813    fn test_string_ops() {
3814        assert_magic_match_text!("0	string/b MZ MZ File", b"MZ\0");
3815        assert_magic_match_text!("0	string !MZ Not MZ File", b"AZ\0");
3816        assert_magic_match_text!("0	string >\0 Any String", b"A\0");
3817        assert_magic_match_text!("0	string >Test Any String", b"Test 1\0");
3818        assert_magic_match_text!("0	string <Test Any String", b"\0");
3819        assert_magic_not_match_text!("0	string >Test Any String", b"\0");
3820    }
3821
3822    #[test]
3823    fn test_lestring16() {
3824        assert_magic_match_bin!(
3825            "0 lestring16 abcd Little-endian UTF-16 string",
3826            b"\x61\x00\x62\x00\x63\x00\x64\x00"
3827        );
3828        assert_magic_match_bin!(
3829            "0 lestring16 x %s",
3830            b"\x61\x00\x62\x00\x63\x00\x64\x00\x00",
3831            "abcd"
3832        );
3833        assert_magic_not_match_bin!(
3834            "0 lestring16 abcd Little-endian UTF-16 string",
3835            b"\x00\x61\x00\x62\x00\x63\x00\x64"
3836        );
3837        assert_magic_match_bin!(
3838            "4 lestring16 abcd Little-endian UTF-16 string",
3839            b"\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64\x00"
3840        );
3841    }
3842
3843    #[test]
3844    fn test_bestring16() {
3845        assert_magic_match_bin!(
3846            "0 bestring16 abcd Big-endian UTF-16 string",
3847            b"\x00\x61\x00\x62\x00\x63\x00\x64"
3848        );
3849        assert_magic_match_bin!(
3850            "0 bestring16 x %s",
3851            b"\x00\x61\x00\x62\x00\x63\x00\x64",
3852            "abcd"
3853        );
3854        assert_magic_not_match_bin!(
3855            "0 bestring16 abcd Big-endian UTF-16 string",
3856            b"\x61\x00\x62\x00\x63\x00\x64\x00"
3857        );
3858        assert_magic_match_bin!(
3859            "4 bestring16 abcd Big-endian UTF-16 string",
3860            b"\x00\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64"
3861        );
3862    }
3863
3864    #[test]
3865    fn test_offset_from_end() {
3866        assert_magic_match_bin!("-1 ubyte 0x42 last byte ok", b"\x00\x00\x42");
3867        assert_magic_match_bin!("-2 ubyte 0x41 last byte ok", b"\x00\x41\x00");
3868    }
3869
3870    #[test]
3871    fn test_relative_offset() {
3872        assert_magic_match_bin!(
3873            "
3874            0 ubyte 0x42
3875            >&0 ubyte 0x00
3876            >>&0 ubyte 0x41 third byte ok
3877            ",
3878            b"\x42\x00\x41\x00"
3879        );
3880    }
3881
3882    #[test]
3883    fn test_indirect_offset() {
3884        assert_magic_match_bin!("(0.l) ubyte 0x42 it works", b"\x04\x00\x00\x00\x42");
3885        // adding fixed value to offset
3886        assert_magic_match_bin!("(0.l+3) ubyte 0x42 it works", b"\x01\x00\x00\x00\x42");
3887        // testing offset pair
3888        assert_magic_match_bin!(
3889            "(0.l+(4)) ubyte 0x42 it works",
3890            b"\x04\x00\x00\x00\x04\x00\x00\x00\x42"
3891        );
3892    }
3893
3894    #[test]
3895    fn test_use_with_message() {
3896        assert_magic_match_bin!(
3897            r#"
38980 string MZ
3899>0 use mz first match
3900
39010 name mz then second match
3902>0 string MZ
3903"#,
3904            b"MZ\0",
3905            "first match then second match"
3906        );
3907    }
3908
3909    #[test]
3910    fn test_scalar_transform() {
3911        assert_magic_match_bin!("0 ubyte+1 0x1 add works", b"\x00");
3912        assert_magic_match_bin!("0 ubyte-1 0xfe sub works", b"\xff");
3913        assert_magic_match_bin!("0 ubyte%2 0 mod works", b"\x0a");
3914        assert_magic_match_bin!("0 ubyte&0x0f 0x0f bitand works", b"\xff");
3915        assert_magic_match_bin!("0 ubyte|0x0f 0xff bitor works", b"\xf0");
3916        assert_magic_match_bin!("0 ubyte^0x0f 0xf0 bitxor works", b"\xff");
3917
3918        FileMagicParser::parse_str("0 ubyte%0 mod by zero", None)
3919            .expect_err("expect div by zero error");
3920        FileMagicParser::parse_str("0 ubyte/0 div by zero", None)
3921            .expect_err("expect div by zero error");
3922    }
3923
3924    #[test]
3925    fn test_belong() {
3926        // Test that a file with a four-byte value at offset 0 that matches the given value in big-endian byte order
3927        assert_magic_match_bin!("0 belong 0x12345678 Big-endian long", b"\x12\x34\x56\x78");
3928        // Test that a file with a four-byte value at offset 0 that does not match the given value in big-endian byte order
3929        assert_magic_not_match_bin!("0 belong 0x12345678 Big-endian long", b"\x78\x56\x34\x12");
3930        // Test that a file with a four-byte value at a non-zero offset that matches the given value in big-endian byte order
3931        assert_magic_match_bin!(
3932            "4 belong 0x12345678 Big-endian long",
3933            b"\x00\x00\x00\x00\x12\x34\x56\x78"
3934        );
3935        // Test < operator
3936        assert_magic_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x77");
3937        assert_magic_not_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x78");
3938
3939        // Test > operator
3940        assert_magic_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x79");
3941        assert_magic_not_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x78");
3942
3943        // Test & operator
3944        assert_magic_match_bin!("0 belong &0x5678 Big-endian long", b"\x00\x00\x56\x78");
3945        assert_magic_not_match_bin!("0 belong &0x0000FFFF Big-endian long", b"\x12\x34\x56\x78");
3946
3947        // Test ^ operator (bitwise AND with complement)
3948        assert_magic_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x00\x56\x78");
3949        assert_magic_not_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x01\x56\x78");
3950
3951        // Test ~ operator
3952        assert_magic_match_bin!("0 belong ~0x12345678 Big-endian long", b"\xed\xcb\xa9\x87");
3953        assert_magic_not_match_bin!("0 belong ~0x12345678 Big-endian long", b"\x12\x34\x56\x78");
3954
3955        // Test x operator
3956        assert_magic_match_bin!("0 belong x Big-endian long", b"\x12\x34\x56\x78");
3957        assert_magic_match_bin!("0 belong x Big-endian long", b"\x78\x56\x34\x12");
3958    }
3959
3960    #[test]
3961    fn test_parse_search() {
3962        parse_assert!("0 search test");
3963        parse_assert!("0 search/24/s test");
3964        parse_assert!("0 search/s/24 test");
3965    }
3966
3967    #[test]
3968    fn test_bedate() {
3969        assert_magic_match_bin!(
3970            "0 bedate 946684800 Unix date (Jan 1, 2000)",
3971            b"\x38\x6D\x43\x80"
3972        );
3973        assert_magic_not_match_bin!(
3974            "0 bedate 946684800 Unix date (Jan 1, 2000)",
3975            b"\x00\x00\x00\x00"
3976        );
3977        assert_magic_match_bin!(
3978            "4 bedate 946684800 %s",
3979            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
3980            "2000-01-01 00:00:00"
3981        );
3982    }
3983    #[test]
3984    fn test_beldate() {
3985        assert_magic_match_bin!(
3986            "0 beldate 946684800 Local date (Jan 1, 2000)",
3987            b"\x38\x6D\x43\x80"
3988        );
3989        assert_magic_not_match_bin!(
3990            "0 beldate 946684800 Local date (Jan 1, 2000)",
3991            b"\x00\x00\x00\x00"
3992        );
3993
3994        assert_magic_match_bin!(
3995            "4 beldate 946684800 {}",
3996            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
3997            unix_local_time_to_string(946684800)
3998        );
3999    }
4000
4001    #[test]
4002    fn test_beqdate() {
4003        assert_magic_match_bin!(
4004            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
4005            b"\x00\x00\x00\x00\x38\x6D\x43\x80"
4006        );
4007
4008        assert_magic_not_match_bin!(
4009            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
4010            b"\x00\x00\x00\x00\x00\x00\x00\x00"
4011        );
4012
4013        assert_magic_match_bin!(
4014            "0 beqdate 946684800 %s",
4015            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
4016            "2000-01-01 00:00:00"
4017        );
4018    }
4019
4020    #[test]
4021    fn test_medate() {
4022        assert_magic_match_bin!(
4023            "0 medate 946684800 Unix date (Jan 1, 2000)",
4024            b"\x6D\x38\x80\x43"
4025        );
4026
4027        assert_magic_not_match_bin!(
4028            "0 medate 946684800 Unix date (Jan 1, 2000)",
4029            b"\x00\x00\x00\x00"
4030        );
4031
4032        assert_magic_match_bin!(
4033            "4 medate 946684800 %s",
4034            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
4035            "2000-01-01 00:00:00"
4036        );
4037    }
4038
4039    #[test]
4040    fn test_meldate() {
4041        assert_magic_match_bin!(
4042            "0 meldate 946684800 Local date (Jan 1, 2000)",
4043            b"\x6D\x38\x80\x43"
4044        );
4045        assert_magic_not_match_bin!(
4046            "0 meldate 946684800 Local date (Jan 1, 2000)",
4047            b"\x00\x00\x00\x00"
4048        );
4049
4050        assert_magic_match_bin!(
4051            "4 meldate 946684800 %s",
4052            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
4053            unix_local_time_to_string(946684800)
4054        );
4055    }
4056
4057    #[test]
4058    fn test_date() {
4059        assert_magic_match_bin!(
4060            "0 date 946684800 Local date (Jan 1, 2000)",
4061            b"\x80\x43\x6D\x38"
4062        );
4063        assert_magic_not_match_bin!(
4064            "0 date 946684800 Local date (Jan 1, 2000)",
4065            b"\x00\x00\x00\x00"
4066        );
4067        assert_magic_match_bin!(
4068            "4 date 946684800 {}",
4069            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
4070            "2000-01-01 00:00:00"
4071        );
4072    }
4073
4074    #[test]
4075    fn test_leldate() {
4076        assert_magic_match_bin!(
4077            "0 leldate 946684800 Local date (Jan 1, 2000)",
4078            b"\x80\x43\x6D\x38"
4079        );
4080        assert_magic_not_match_bin!(
4081            "0 leldate 946684800 Local date (Jan 1, 2000)",
4082            b"\x00\x00\x00\x00"
4083        );
4084        assert_magic_match_bin!(
4085            "4 leldate 946684800 {}",
4086            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
4087            unix_local_time_to_string(946684800)
4088        );
4089    }
4090
4091    #[test]
4092    fn test_leqdate() {
4093        assert_magic_match_bin!(
4094            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
4095            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
4096        );
4097
4098        assert_magic_not_match_bin!(
4099            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
4100            b"\x00\x00\x00\x00\x00\x00\x00\x00"
4101        );
4102        assert_magic_match_bin!(
4103            "8 leqdate 1577836800 %s",
4104            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
4105            "2020-01-01 00:00:00"
4106        );
4107    }
4108
4109    #[test]
4110    fn test_leqldate() {
4111        assert_magic_match_bin!(
4112            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
4113            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
4114        );
4115
4116        assert_magic_not_match_bin!(
4117            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
4118            b"\x00\x00\x00\x00\x00\x00\x00\x00"
4119        );
4120        assert_magic_match_bin!(
4121            "8 leqldate 1577836800 %s",
4122            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
4123            unix_local_time_to_string(1577836800)
4124        );
4125    }
4126
4127    #[test]
4128    fn test_melong() {
4129        // Test = operator
4130        assert_magic_match_bin!(
4131            "0 melong =0x12345678 Middle-endian long",
4132            b"\x34\x12\x78\x56"
4133        );
4134        assert_magic_not_match_bin!(
4135            "0 melong =0x12345678 Middle-endian long",
4136            b"\x00\x00\x00\x00"
4137        );
4138
4139        // Test < operator
4140        assert_magic_match_bin!(
4141            "0 melong <0x12345678 Middle-endian long",
4142            b"\x34\x12\x78\x55"
4143        ); // 0x12345677 in middle-endian
4144        assert_magic_not_match_bin!(
4145            "0 melong <0x12345678 Middle-endian long",
4146            b"\x34\x12\x78\x56"
4147        ); // 0x12345678 in middle-endian
4148
4149        // Test > operator
4150        assert_magic_match_bin!(
4151            "0 melong >0x12345678 Middle-endian long",
4152            b"\x34\x12\x78\x57"
4153        ); // 0x12345679 in middle-endian
4154        assert_magic_not_match_bin!(
4155            "0 melong >0x12345678 Middle-endian long",
4156            b"\x34\x12\x78\x56"
4157        ); // 0x12345678 in middle-endian
4158
4159        // Test & operator
4160        assert_magic_match_bin!("0 melong &0x5678 Middle-endian long", b"\xab\xcd\x78\x56"); // 0x00007856 in middle-endian
4161        assert_magic_not_match_bin!(
4162            "0 melong &0x0000FFFF Middle-endian long",
4163            b"\x34\x12\x78\x56"
4164        ); // 0x12347856 in middle-endian
4165
4166        // Test ^ operator (bitwise AND with complement)
4167        assert_magic_match_bin!(
4168            "0 melong ^0xFFFF0000 Middle-endian long",
4169            b"\x00\x00\x78\x56"
4170        ); // 0x00007856 in middle-endian
4171        assert_magic_not_match_bin!(
4172            "0 melong ^0xFFFF0000 Middle-endian long",
4173            b"\x00\x01\x78\x56"
4174        ); // 0x00017856 in middle-endian
4175
4176        // Test ~ operator
4177        assert_magic_match_bin!(
4178            "0 melong ~0x12345678 Middle-endian long",
4179            b"\xCB\xED\x87\xA9"
4180        );
4181        assert_magic_not_match_bin!(
4182            "0 melong ~0x12345678 Middle-endian long",
4183            b"\x34\x12\x78\x56"
4184        ); // The original value
4185
4186        // Test x operator
4187        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x34\x12\x78\x56");
4188        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x00\x00\x00\x00");
4189    }
4190
4191    #[test]
4192    fn test_uquad() {
4193        // Test = operator
4194        assert_magic_match_bin!(
4195            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
4196            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
4197        );
4198        assert_magic_not_match_bin!(
4199            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
4200            b"\x00\x00\x00\x00\x00\x00\x00\x00"
4201        );
4202
4203        // Test < operator
4204        assert_magic_match_bin!(
4205            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
4206            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x11"
4207        );
4208        assert_magic_not_match_bin!(
4209            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
4210            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
4211        );
4212
4213        // Test > operator
4214        assert_magic_match_bin!(
4215            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
4216            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x13"
4217        );
4218        assert_magic_not_match_bin!(
4219            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
4220            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
4221        );
4222
4223        // Test & operator
4224        assert_magic_match_bin!(
4225            "0 uquad &0xF0 Unsigned quad",
4226            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
4227        );
4228        assert_magic_not_match_bin!(
4229            "0 uquad &0xFF Unsigned quad",
4230            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
4231        );
4232
4233        // Test ^ operator (bitwise AND with complement)
4234        assert_magic_match_bin!(
4235            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
4236            b"\x00\x00\x00\x00\x00\x00\x00\x00"
4237        ); // All bits clear
4238        assert_magic_not_match_bin!(
4239            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
4240            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
4241        ); // Some bits set
4242
4243        // Test ~ operator
4244        assert_magic_match_bin!(
4245            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
4246            b"\x0F\x21\x43\x65\x87\xA9\xCB\xED"
4247        );
4248        assert_magic_not_match_bin!(
4249            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
4250            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
4251        ); // The original value
4252
4253        // Test x operator
4254        assert_magic_match_bin!(
4255            "0 uquad x {:#x}",
4256            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12",
4257            "0x123456789abcdef0"
4258        );
4259        assert_magic_match_bin!(
4260            "0 uquad x Unsigned quad",
4261            b"\x00\x00\x00\x00\x00\x00\x00\x00"
4262        );
4263    }
4264
4265    #[test]
4266    fn test_guid() {
4267        assert_magic_match_bin!(
4268            "0 guid EC959539-6786-2D4E-8FDB-98814CE76C1E It works",
4269            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E"
4270        );
4271
4272        assert_magic_not_match_bin!(
4273            "0 guid 399595EC-8667-4E2D-8FDB-98814CE76C1E It works",
4274            b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
4275        );
4276
4277        assert_magic_match_bin!(
4278            "0 guid x %s",
4279            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E",
4280            "EC959539-6786-2D4E-8FDB-98814CE76C1E"
4281        );
4282    }
4283
4284    #[test]
4285    fn test_ubeqdate() {
4286        assert_magic_match_bin!(
4287            "0 ubeqdate 1633046400 It works",
4288            b"\x00\x00\x00\x00\x61\x56\x4f\x80"
4289        );
4290
4291        assert_magic_match_bin!(
4292            "0 ubeqdate x %s",
4293            b"\x00\x00\x00\x00\x61\x56\x4f\x80",
4294            "2021-10-01 00:00:00"
4295        );
4296
4297        assert_magic_not_match_bin!(
4298            "0 ubeqdate 1633046400 It should not work",
4299            b"\x00\x00\x00\x00\x00\x00\x00\x00"
4300        );
4301    }
4302
4303    #[test]
4304    fn test_ldate() {
4305        assert_magic_match_bin!("0 ldate 1640551520 It works", b"\x60\xd4\xC8\x61");
4306
4307        assert_magic_not_match_bin!("0 ldate 1633046400 It should not work", b"\x00\x00\x00\x00");
4308
4309        assert_magic_match_bin!(
4310            "0 ldate x %s",
4311            b"\x60\xd4\xC8\x61",
4312            unix_local_time_to_string(1640551520)
4313        );
4314    }
4315
4316    #[test]
4317    fn test_scalar_with_transform() {
4318        assert_magic_match_bin!("0 ubyte/10 2 {}", b"\x14", "2");
4319        assert_magic_match_bin!("0 ubyte/10 x {}", b"\x14", "2");
4320        assert_magic_match_bin!("0 ubyte%10 x {}", b"\x14", "0");
4321    }
4322
4323    #[test]
4324    fn test_float_with_transform() {
4325        assert_magic_match_bin!("0 lefloat/10 2 {}", b"\x00\x00\xa0\x41", "2");
4326        assert_magic_match_bin!("0 lefloat/10 x {}", b"\x00\x00\xa0\x41", "2");
4327        assert_magic_match_bin!("0 lefloat%10 x {}", b"\x00\x00\xa0\x41", "0");
4328    }
4329
4330    #[test]
4331    fn test_read_octal() {
4332        // Basic cases
4333        assert_eq!(read_octal_u64(&mut lazy_cache!("0")), Some(0));
4334        assert_eq!(read_octal_u64(&mut lazy_cache!("00")), Some(0));
4335        assert_eq!(read_octal_u64(&mut lazy_cache!("01")), Some(1));
4336        assert_eq!(read_octal_u64(&mut lazy_cache!("07")), Some(7));
4337        assert_eq!(read_octal_u64(&mut lazy_cache!("010")), Some(8));
4338        assert_eq!(read_octal_u64(&mut lazy_cache!("0123")), Some(83));
4339        assert_eq!(read_octal_u64(&mut lazy_cache!("0755")), Some(493));
4340
4341        // With trailing non-octal characters
4342        assert_eq!(read_octal_u64(&mut lazy_cache!("0ABC")), Some(0));
4343        assert_eq!(read_octal_u64(&mut lazy_cache!("01ABC")), Some(1));
4344        assert_eq!(read_octal_u64(&mut lazy_cache!("0755ABC")), Some(493));
4345        assert_eq!(read_octal_u64(&mut lazy_cache!("0123ABC")), Some(83));
4346
4347        // Invalid octal digits
4348        assert_eq!(read_octal_u64(&mut lazy_cache!("08")), Some(0)); // stops at '8'
4349        assert_eq!(read_octal_u64(&mut lazy_cache!("01238")), Some(83)); // stops at '8'
4350
4351        // No leading '0'
4352        assert_eq!(read_octal_u64(&mut lazy_cache!("123")), None);
4353        assert_eq!(read_octal_u64(&mut lazy_cache!("755")), None);
4354
4355        // Empty string
4356        assert_eq!(read_octal_u64(&mut lazy_cache!("")), None);
4357
4358        // Only non-octal characters
4359        assert_eq!(read_octal_u64(&mut lazy_cache!("ABC")), None);
4360        assert_eq!(read_octal_u64(&mut lazy_cache!("8ABC")), None); // first char is not '0'
4361
4362        // Longer valid octal (but within u64 range)
4363        assert_eq!(
4364            read_octal_u64(&mut lazy_cache!("01777777777")),
4365            Some(268435455)
4366        );
4367    }
4368
4369    #[test]
4370    fn test_offset_bug_1() {
4371        // this tests the exact behaviour
4372        // expected by libmagic/file
4373        assert_magic_match_bin!(
4374            r"
43751	string		TEST Bread is
4376# offset computation is relative to
4377# rule start
4378>(5.b)	use toasted
4379
43800 name toasted
4381>0	string twice Toasted
4382>>0  use toasted_twice 
4383
43840 name toasted_twice
4385>(6.b) string x %s
4386        ",
4387            b"\x00TEST\x06twice\x00\x06",
4388            "Bread is Toasted twice"
4389        );
4390    }
4391
4392    // this test implement the exact same logic as
4393    // test_offset_bug_1 except that the rule starts
4394    // matching from end. Surprisingly we need to
4395    // adjust indirect offsets so that it works in
4396    // libmagic/file
4397    #[test]
4398    fn test_offset_bug_2() {
4399        // this tests the exact behaviour
4400        // expected by libmagic/file
4401        assert_magic_match_bin!(
4402            r"
4403-12	string		TEST Bread is
4404>(4.b)	use toasted
4405
44060 name toasted
4407>0	string twice Toasted
4408>>0  use toasted_twice
4409
44100 name toasted_twice
4411>(6.b) string x %
4412        ",
4413            b"\x00TEST\x06twice\x00\x06",
4414            "Bread is Toasted twice"
4415        )
4416    }
4417
4418    #[test]
4419    fn test_offset_bug_3() {
4420        // this tests the exact behaviour
4421        // expected by libmagic/file
4422        assert_magic_match_bin!(
4423            r"
44241	string		TEST Bread is
4425>(5.b) indirect/r x
4426
44270	string twice Toasted
4428>0  use toasted_twice
4429
44300 name toasted_twice
4431>0 string x %s
4432        ",
4433            b"\x00TEST\x06twice\x00\x08",
4434            "Bread is Toasted twice"
4435        )
4436    }
4437
4438    #[test]
4439    fn test_offset_bug_4() {
4440        // this tests the exact behaviour
4441        // expected by libmagic/file
4442        assert_magic_match_bin!(
4443            r"
44441	string		Bread %s
4445>(6.b) indirect/r x
4446
4447# this one uses a based offset
4448# computed at indirection
44491	string is\ Toasted %s
4450>(11.b)  use toasted_twice
4451
4452# this one is using a new base
4453# offset being previous base 
4454# offset + offset of use
44550 name toasted_twice
4456>0 string x %s
4457            ",
4458            b"\x00Bread\x06is Toasted\x0ctwice\x00",
4459            "Bread is Toasted twice"
4460        )
4461    }
4462
4463    #[test]
4464    fn test_offset_bug_5() {
4465        assert_magic_match_bin!(
4466            r"
44671	string		TEST Bread is
4468>(5.b) indirect/r x
4469
44700	string twice Toasted
4471>0  use toasted_twice
4472
44730 name toasted_twice
4474>0 string twice
4475>>&1 byte 0x08 twice
4476            ",
4477            b"\x00TEST\x06twice\x00\x08",
4478            "Bread is Toasted twice"
4479        )
4480    }
4481
4482    #[test]
4483    fn test_message_parts() {
4484        let m = first_magic(
4485            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
4486            b"#!/usr/bin/env    python",
4487            StreamKind::Text(TextEncoding::Ascii),
4488        )
4489        .unwrap();
4490
4491        assert!(m.message_parts().any(|p| p.eq_ignore_ascii_case("python")))
4492    }
4493}