pure_magic/
lib.rs

1#![forbid(unsafe_code)]
2#![deny(unused_imports)]
3//! # `pure-magic`: A pure and safe Rust Reimplementation of `libmagic`
4//!
5//! Unlike many file identification crates, `pure-magic` is highly compatible with the standard
6//! `magic` rule format, allowing seamless reuse of existing
7//! [rules](https://github.com/qjerome/magic-rs/tree/main/magic-db/src/magdir). This makes it an ideal
8//! drop-in replacement for crates relying on **`libmagic` C bindings**, where memory safety is critical.
9//!
10//! **Key Features:**
11//! - File type detection
12//! - MIME type inference
13//! - Custom magic rule parsing
14//!
15//! ## Installation
16//! Add `pure-magic` to your `Cargo.toml`:
17//!
18//! ```toml
19//! [dependencies]
20//! pure-magic = "0.1"  # Replace with the latest version
21//! ```
22//!
23//! Or add the latest version with cargo:
24//!
25//! ```sh
26//! cargo add pure-magic
27//! ```
28//!
29//! ## Quick Start
30//!
31//! ### Detect File Types Programmatically
32//! ```rust
33//! use pure_magic::{MagicDb, MagicSource};
34//! use std::fs::File;
35//!
36//! fn main() -> Result<(), Box<dyn std::error::Error>> {
37//!     let mut db = MagicDb::new();
38//!     // Create a MagicSource from a file
39//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
40//!     db.load(rust_magic)?;
41//!
42//!     // Open a file and detect its type
43//!     let mut file = File::open("src/lib.rs")?;
44//!     let magic = db.first_magic(&mut file, None)?;
45//!
46//!     println!(
47//!         "File type: {} (MIME: {}, strength: {})",
48//!         magic.message(),
49//!         magic.mime_type(),
50//!         magic.strength()
51//!     );
52//!     Ok(())
53//! }
54//! ```
55//!
56//! ### Get All Matching Rules
57//! ```rust
58//! use pure_magic::{MagicDb, MagicSource};
59//! use std::fs::File;
60//!
61//! fn main() -> Result<(), Box<dyn std::error::Error>> {
62//!     let mut db = MagicDb::new();
63//!     // Create a MagicSource from a file
64//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
65//!     db.load(rust_magic)?;
66//!
67//!     // Open a file and detect its type
68//!     let mut file = File::open("src/lib.rs")?;
69//!
70//!     // Get all matching rules, sorted by strength
71//!     let magics = db.all_magics(&mut file)?;
72//!
73//!     // Must contain rust file magic and default text magic
74//!     assert!(magics.len() > 1);
75//!
76//!     for magic in magics {
77//!         println!(
78//!             "Match: {} (strength: {}, source: {})",
79//!             magic.message(),
80//!             magic.strength(),
81//!             magic.source().unwrap_or("unknown")
82//!         );
83//!     }
84//!     Ok(())
85//! }
86//! ```
87//!
88//! ### Serialize a Database to Disk
89//! ```rust
90//! use pure_magic::{MagicDb, MagicSource};
91//! use std::fs::File;
92//!
93//! fn main() -> Result<(), Box<dyn std::error::Error>> {
94//!     let mut db = MagicDb::new();
95//!     // Create a MagicSource from a file
96//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
97//!     db.load(rust_magic)?;
98//!
99//!     // Serialize the database to a file
100//!     let mut output = File::create("/tmp/compiled.db")?;
101//!     db.serialize(&mut output)?;
102//!
103//!     println!("Database saved to file");
104//!     Ok(())
105//! }
106//! ```
107//!
108//! ### Deserialize a Database
109//! ```rust
110//! use pure_magic::{MagicDb, MagicSource};
111//! use std::fs::File;
112//!
113//! fn main() -> Result<(), Box<dyn std::error::Error>> {
114//!     let mut db = MagicDb::new();
115//!     // Create a MagicSource from a file
116//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
117//!     db.load(rust_magic)?;
118//!
//!     // Serialize the database into a vector
120//!     let mut ser = vec![];
121//!     db.serialize(&mut ser)?;
122//!     println!("Database saved to vector");
123//!
//!     // Deserialize it back from a slice
125//!     let db = MagicDb::deserialize(&mut ser.as_slice())?;
126//!
127//!     assert!(!db.rules().is_empty());
128//!
129//!     Ok(())
130//! }
131//! ```
132//!
133//! ## License
134//! This project is licensed under the **GPL-3.0 License**.
135//!
136//! ## Contributing
137//! Contributions are welcome! Open an issue or submit a pull request.
138//!
139//! ## Acknowledgments
140//! - Inspired by the original `libmagic` (part of the `file` command).
141
142use dyf::{DynDisplay, FormatString, dformat};
143use flagset::{FlagSet, flags};
144use flate2::{Compression, read::GzDecoder, write::GzEncoder};
145use lazy_cache::LazyCache;
146use memchr::memchr;
147use pest::{Span, error::ErrorVariant};
148use regex::bytes::{self};
149use serde::{Deserialize, Serialize};
150use std::{
151    borrow::Cow,
152    cmp::max,
153    collections::{HashMap, HashSet},
154    fmt::{self, Debug, Display},
155    io::{self, Read, Seek, SeekFrom, Write},
156    ops::{Add, BitAnd, BitOr, BitXor, Div, Mul, Rem, Sub},
157    path::Path,
158};
159use tar::Archive;
160use thiserror::Error;
161use tracing::{Level, debug, enabled, trace};
162
163use crate::{
164    numeric::{Float, FloatDataType, Scalar, ScalarDataType},
165    parser::{FileMagicParser, Rule},
166    utils::{decode_id3, find_json_boundaries, run_utf8_validation},
167};
168
169mod numeric;
170mod parser;
171mod utils;
172
/// Strength value associated with hard-coded magics.
/// NOTE(review): the users of this constant are outside this excerpt — confirm.
const HARDCODED_MAGIC_STRENGTH: u64 = 2048;
/// Source label associated with hard-coded magics (same caveat as above).
const HARDCODED_SOURCE: &str = "hardcoded";
/// Recursion limit; corresponds to the `FILE_INDIR_MAX` constant defined in libmagic.
const MAX_RECURSION: usize = 50;
/// Limit (7 MiB) taken from libmagic; it bounds search tests.
pub const FILE_BYTES_MAX: usize = 7 << 20;
/// Limit (8 KiB) taken from libmagic; it bounds regex tests.
const FILE_REGEX_MAX: usize = 8 << 10;

/// MIME type string for generic binary content.
pub const DEFAULT_BIN_MIMETYPE: &str = "application/octet-stream";
/// MIME type string for generic text content.
pub const DEFAULT_TEXT_MIMETYPE: &str = "text/plain";

/// `strftime`-style pattern (`YYYY-MM-DD HH:MM:SS`) used for timestamps.
pub(crate) const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
186
/// Panics with the given `panic!`-style arguments, but only in builds where
/// `debug_assertions` is enabled; in other builds the branch is never taken.
macro_rules! debug_panic {
    ($($panic_args:tt)*) => {
        if cfg!(debug_assertions) {
            panic!($($panic_args)*);
        }
    };
}
194
/// Reads exactly `size_of::<$ty>()` bytes from the reader `$r` and evaluates
/// to them as a fixed-size byte array.
///
/// The expansion uses `?`, so it must be invoked inside a function whose
/// error type can be built from the reader's error.
macro_rules! read {
    ($r:expr, $ty:ty) => {{
        let mut raw = [0u8; std::mem::size_of::<$ty>()];
        $r.read_exact(&mut raw)?;
        raw
    }};
}
202
/// Reads one `$ty` from `$r`, decoding its bytes as little-endian.
/// Relies on `read!`, so it propagates read errors with `?`.
macro_rules! read_le {
    ($r:expr, $ty:ty) => {{
        let raw = read!($r, $ty);
        <$ty>::from_le_bytes(raw)
    }};
}
206
/// Reads one `$ty` from `$r`, decoding its bytes as big-endian.
/// Relies on `read!`, so it propagates read errors with `?`.
macro_rules! read_be {
    ($r:expr, $ty:ty) => {{
        let raw = read!($r, $ty);
        <$ty>::from_be_bytes(raw)
    }};
}
210
/// Reads a 32-bit middle-endian value from `$r` as an `i32`.
///
/// Two little-endian `u16` words are read in sequence; the first becomes the
/// high 16 bits and the second the low 16 bits.
macro_rules! read_me {
    ($r:expr) => {{
        let high = read_le!($r, u16) as i32;
        let low = read_le!($r, u16) as i32;
        (high << 16) | low
    }};
}
214
215#[inline(always)]
216fn read_octal_u64<R: Read + Seek>(haystack: &mut LazyCache<R>) -> Option<u64> {
217    let s = haystack
218        .read_while_or_limit(|b| matches!(b, b'0'..=b'7'), 22)
219        .map(|buf| str::from_utf8(buf))
220        .ok()?
221        .ok()?;
222
223    if !s.starts_with("0") {
224        return None;
225    }
226
227    u64::from_str_radix(s, 8).ok()
228}
229
230/// Represents all possible errors that can occur during file type detection and processing.
231#[derive(Debug, Error)]
232pub enum Error {
233    /// A generic error with a custom message.
234    #[error("{0}")]
235    Msg(String),
236
237    /// An error with a source location and a nested error.
238    #[error("source={0} line={1} error={2}")]
239    Localized(String, usize, Box<Error>),
240
241    /// Indicates a required rule was not found.
242    #[error("missing rule: {0}")]
243    MissingRule(String),
244
245    /// Indicates the maximum recursion depth was reached.
246    #[error("maximum recursion reached: {0}")]
247    MaximumRecursion(usize),
248
249    /// Wraps an I/O error.
250    #[error("io: {0}")]
251    Io(#[from] io::Error),
252
253    /// Wraps a parsing error from the `pest` parser.
254    #[error("parser error: {0}")]
255    Parse(#[from] Box<pest::error::Error<Rule>>),
256
257    /// Wraps a formatting error from the `dyf` crate.
258    #[error("formatting: {0}")]
259    Format(#[from] dyf::Error),
260
261    /// Wraps a regex-related error.
262    #[error("regex: {0}")]
263    Regex(#[from] regex::Error),
264
265    /// Wraps a serialization error from `bincode`.
266    #[error("{0}")]
267    Serialize(#[from] bincode::error::EncodeError),
268
269    /// Wraps a deserialization error from `bincode`.
270    #[error("{0}")]
271    Deserialize(#[from] bincode::error::DecodeError),
272}
273
274impl Error {
275    #[inline]
276    fn parser<S: ToString>(msg: S, span: Span<'_>) -> Self {
277        Self::Parse(Box::new(pest::error::Error::new_from_span(
278            ErrorVariant::CustomError {
279                message: msg.to_string(),
280            },
281            span,
282        )))
283    }
284
285    fn msg<M: AsRef<str>>(msg: M) -> Self {
286        Self::Msg(msg.as_ref().into())
287    }
288
289    fn localized<S: AsRef<str>>(source: S, line: usize, err: Error) -> Self {
290        Self::Localized(source.as_ref().into(), line, err.into())
291    }
292
293    /// Unwraps the localized error
294    pub fn unwrap_localized(&self) -> &Self {
295        match self {
296            Self::Localized(_, _, e) => e,
297            _ => self,
298        }
299    }
300}
301
302#[derive(Debug, Clone, Serialize, Deserialize)]
303enum Message {
304    String(String),
305    Format {
306        printf_spec: String,
307        fs: FormatString,
308    },
309}
310
311impl Display for Message {
312    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
313        match self {
314            Self::String(s) => write!(f, "{s}"),
315            Self::Format { printf_spec: _, fs } => write!(f, "{}", fs.to_string_lossy()),
316        }
317    }
318}
319
320impl Message {
321    fn to_string_lossy(&self) -> Cow<'_, str> {
322        match self {
323            Message::String(s) => Cow::Borrowed(s),
324            Message::Format { printf_spec: _, fs } => fs.to_string_lossy(),
325        }
326    }
327
328    #[inline(always)]
329    fn format_with(&self, mr: Option<&MatchRes>) -> Result<Cow<'_, str>, Error> {
330        match self {
331            Self::String(s) => Ok(Cow::Borrowed(s.as_str())),
332            Self::Format {
333                printf_spec: c_spec,
334                fs,
335            } => {
336                if let Some(mr) = mr {
337                    match mr {
338                        MatchRes::Float(_, _) | MatchRes::Bytes(_, _, _, _) => {
339                            Ok(Cow::Owned(dformat!(fs, mr)?))
340                        }
341                        MatchRes::Scalar(_, scalar) => {
342                            // we want to print a byte as char
343                            if c_spec.as_str() == "c" {
344                                match scalar {
345                                    Scalar::byte(b) => {
346                                        let b = (*b as u8) as char;
347                                        Ok(Cow::Owned(dformat!(fs, b)?))
348                                    }
349                                    Scalar::ubyte(b) => {
350                                        let b = *b as char;
351                                        Ok(Cow::Owned(dformat!(fs, b)?))
352                                    }
353                                    _ => Ok(Cow::Owned(dformat!(fs, mr)?)),
354                                }
355                            } else {
356                                Ok(Cow::Owned(dformat!(fs, mr)?))
357                            }
358                        }
359                    }
360                } else {
361                    Ok(fs.to_string_lossy())
362                }
363            }
364        }
365    }
366}
367
368impl ScalarDataType {
369    #[inline(always)]
370    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Scalar, Error> {
371        macro_rules! _read_le {
372            ($ty: ty) => {{
373                if switch_endianness {
374                    <$ty>::from_be_bytes(read!(from, $ty))
375                } else {
376                    <$ty>::from_le_bytes(read!(from, $ty))
377                }
378            }};
379        }
380
381        macro_rules! _read_be {
382            ($ty: ty) => {{
383                if switch_endianness {
384                    <$ty>::from_le_bytes(read!(from, $ty))
385                } else {
386                    <$ty>::from_be_bytes(read!(from, $ty))
387                }
388            }};
389        }
390
391        macro_rules! _read_ne {
392            ($ty: ty) => {{
393                if cfg!(target_endian = "big") {
394                    _read_be!($ty)
395                } else {
396                    _read_le!($ty)
397                }
398            }};
399        }
400
401        macro_rules! _read_me {
402            () => {
403                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
404            };
405        }
406
407        Ok(match self {
408            // signed
409            Self::byte => Scalar::byte(read!(from, u8)[0] as i8),
410            Self::short => Scalar::short(_read_ne!(i16)),
411            Self::long => Scalar::long(_read_ne!(i32)),
412            Self::date => Scalar::date(_read_ne!(i32)),
413            Self::ldate => Scalar::ldate(_read_ne!(i32)),
414            Self::qwdate => Scalar::qwdate(_read_ne!(i64)),
415            Self::leshort => Scalar::leshort(_read_le!(i16)),
416            Self::lelong => Scalar::lelong(_read_le!(i32)),
417            Self::lequad => Scalar::lequad(_read_le!(i64)),
418            Self::bequad => Scalar::bequad(_read_be!(i64)),
419            Self::belong => Scalar::belong(_read_be!(i32)),
420            Self::bedate => Scalar::bedate(_read_be!(i32)),
421            Self::beldate => Scalar::beldate(_read_be!(i32)),
422            Self::beqdate => Scalar::beqdate(_read_be!(i64)),
423            // unsigned
424            Self::ubyte => Scalar::ubyte(read!(from, u8)[0]),
425            Self::ushort => Scalar::ushort(_read_ne!(u16)),
426            Self::uleshort => Scalar::uleshort(_read_le!(u16)),
427            Self::ulelong => Scalar::ulelong(_read_le!(u32)),
428            Self::uledate => Scalar::uledate(_read_le!(u32)),
429            Self::ulequad => Scalar::ulequad(_read_le!(u64)),
430            Self::offset => Scalar::offset(from.stream_position()?),
431            Self::ubequad => Scalar::ubequad(_read_be!(u64)),
432            Self::medate => Scalar::medate(_read_me!()),
433            Self::meldate => Scalar::meldate(_read_me!()),
434            Self::melong => Scalar::melong(_read_me!()),
435            Self::beshort => Scalar::beshort(_read_be!(i16)),
436            Self::quad => Scalar::quad(_read_ne!(i64)),
437            Self::uquad => Scalar::uquad(_read_ne!(u64)),
438            Self::ledate => Scalar::ledate(_read_le!(i32)),
439            Self::leldate => Scalar::leldate(_read_le!(i32)),
440            Self::leqdate => Scalar::leqdate(_read_le!(i64)),
441            Self::leqldate => Scalar::leqldate(_read_le!(i64)),
442            Self::leqwdate => Scalar::leqwdate(_read_le!(i64)),
443            Self::ubelong => Scalar::ubelong(_read_be!(u32)),
444            Self::ulong => Scalar::ulong(_read_ne!(u32)),
445            Self::ubeshort => Scalar::ubeshort(_read_be!(u16)),
446            Self::ubeqdate => Scalar::ubeqdate(_read_be!(u64)),
447            Self::lemsdosdate => Scalar::lemsdosdate(_read_le!(u16)),
448            Self::lemsdostime => Scalar::lemsdostime(_read_le!(u16)),
449            Self::guid => Scalar::guid(u128::from_be_bytes(read!(from, u128))),
450        })
451    }
452}
453
454impl FloatDataType {
455    #[inline(always)]
456    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Float, Error> {
457        macro_rules! _read_le {
458            ($ty: ty) => {{
459                if switch_endianness {
460                    <$ty>::from_be_bytes(read!(from, $ty))
461                } else {
462                    <$ty>::from_le_bytes(read!(from, $ty))
463                }
464            }};
465        }
466
467        macro_rules! _read_be {
468            ($ty: ty) => {{
469                if switch_endianness {
470                    <$ty>::from_le_bytes(read!(from, $ty))
471                } else {
472                    <$ty>::from_be_bytes(read!(from, $ty))
473                }
474            }};
475        }
476
477        macro_rules! _read_ne {
478            ($ty: ty) => {{
479                if cfg!(target_endian = "big") {
480                    _read_be!($ty)
481                } else {
482                    _read_le!($ty)
483                }
484            }};
485        }
486
487        macro_rules! _read_me {
488            () => {
489                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
490            };
491        }
492
493        Ok(match self {
494            Self::lefloat => Float::lefloat(_read_le!(f32)),
495            Self::befloat => Float::befloat(_read_le!(f32)),
496            Self::ledouble => Float::ledouble(_read_le!(f64)),
497            Self::bedouble => Float::bedouble(_read_be!(f64)),
498        })
499    }
500}
501
502#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
503enum Op {
504    Mul,
505    Add,
506    Sub,
507    Div,
508    Mod,
509    And,
510    Xor,
511    Or,
512}
513
514impl Display for Op {
515    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
516        match self {
517            Op::Mul => write!(f, "*"),
518            Op::Add => write!(f, "+"),
519            Op::Sub => write!(f, "-"),
520            Op::Div => write!(f, "/"),
521            Op::Mod => write!(f, "%"),
522            Op::And => write!(f, "&"),
523            Op::Or => write!(f, "|"),
524            Op::Xor => write!(f, "^"),
525        }
526    }
527}
528
529#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
530enum CmpOp {
531    Eq,
532    Lt,
533    Gt,
534    BitAnd,
535    Neq, // ! operator
536    Xor,
537    Not, // ~ operator
538}
539
540impl CmpOp {
541    #[inline(always)]
542    fn is_neq(&self) -> bool {
543        matches!(self, Self::Neq)
544    }
545}
546
547#[derive(Debug, Clone, Serialize, Deserialize)]
548struct ScalarTransform {
549    op: Op,
550    num: Scalar,
551}
552
553impl ScalarTransform {
554    fn apply(&self, s: Scalar) -> Option<Scalar> {
555        match self.op {
556            Op::Add => s.checked_add(self.num),
557            Op::Sub => s.checked_sub(self.num),
558            Op::Mul => s.checked_mul(self.num),
559            Op::Div => s.checked_div(self.num),
560            Op::Mod => s.checked_rem(self.num),
561            Op::And => Some(s.bitand(self.num)),
562            Op::Xor => Some(s.bitxor(self.num)),
563            Op::Or => Some(s.bitor(self.num)),
564        }
565    }
566}
567
568#[derive(Debug, Clone, Serialize, Deserialize)]
569struct FloatTransform {
570    op: Op,
571    num: Float,
572}
573
574impl FloatTransform {
575    fn apply(&self, s: Float) -> Float {
576        match self.op {
577            Op::Add => s.add(self.num),
578            Op::Sub => s.sub(self.num),
579            Op::Mul => s.mul(self.num),
580            // returns inf when div by 0
581            Op::Div => s.div(self.num),
582            // returns NaN when rem by 0
583            Op::Mod => s.rem(self.num),
584            // parser makes sure those operators cannot be used
585            Op::And | Op::Xor | Op::Or => {
586                debug_panic!("unsupported operation");
587                s
588            }
589        }
590    }
591}
592
593#[derive(Debug, Clone, Serialize, Deserialize)]
594enum TestValue<T> {
595    Value(T),
596    Any,
597}
598
599impl<T> TestValue<T> {
600    #[inline(always)]
601    fn as_ref(&self) -> TestValue<&T> {
602        match self {
603            Self::Value(v) => TestValue::Value(v),
604            Self::Any => TestValue::Any,
605        }
606    }
607}
608
// Modifier flags stored as a `FlagSet<ReMod>` on regex and search tests.
// The order of the flags is significant: do not reorder them.
flags! {
    enum ReMod: u8{
        CaseInsensitive,
        StartOffsetUpdate,
        LineLimit,
        // consulted by the `is_binary` helpers of regex and search tests
        ForceBin,
        // consulted by `SearchTest::is_binary`, where it overrides binary mode
        ForceText,
        TrimMatch,
    }
}
619
620fn serialize_regex<S>(re: &bytes::Regex, serializer: S) -> Result<S::Ok, S::Error>
621where
622    S: serde::Serializer,
623{
624    re.as_str().serialize(serializer)
625}
626
627fn deserialize_regex<'de, D>(deserializer: D) -> Result<bytes::Regex, D::Error>
628where
629    D: serde::Deserializer<'de>,
630{
631    let wrapper = String::deserialize(deserializer)?;
632    bytes::Regex::new(&wrapper).map_err(serde::de::Error::custom)
633}
634
635#[derive(Debug, Clone, Serialize, Deserialize)]
636struct RegexTest {
637    #[serde(
638        serialize_with = "serialize_regex",
639        deserialize_with = "deserialize_regex"
640    )]
641    re: bytes::Regex,
642    length: Option<usize>,
643    mods: FlagSet<ReMod>,
644    str_mods: FlagSet<StringMod>,
645    non_magic_len: usize,
646    binary: bool,
647    cmp_op: CmpOp,
648}
649
650impl RegexTest {
651    #[inline(always)]
652    fn is_binary(&self) -> bool {
653        self.binary
654            || self.mods.contains(ReMod::ForceBin)
655            || self.str_mods.contains(StringMod::ForceBin)
656    }
657
658    fn match_buf<'buf>(
659        &self,
660        off_buf: u64, // absolute buffer offset in content
661        stream_kind: StreamKind,
662        buf: &'buf [u8],
663    ) -> Option<MatchRes<'buf>> {
664        let mr = match stream_kind {
665            StreamKind::Text(_) => {
666                let mut off_txt = off_buf;
667
668                let mut line_limit = self.length.unwrap_or(usize::MAX);
669
670                for line in buf.split(|c| c == &b'\n') {
671                    // we don't need to break on offset
672                    // limit as buf contains the good amount
673                    // of bytes to match against
674                    if line_limit == 0 {
675                        break;
676                    }
677
678                    if let Some(re_match) = self.re.find(line) {
679                        // the offset of the string is computed from the start of the buffer
680                        let start_offset = off_txt + re_match.start() as u64;
681
682                        // if we matched until EOL we need to add one to include the delimiter removed from the split
683                        let stop_offset = if re_match.end() == line.len() {
684                            Some(start_offset + re_match.as_bytes().len() as u64 + 1)
685                        } else {
686                            None
687                        };
688
689                        return Some(MatchRes::Bytes(
690                            start_offset,
691                            stop_offset,
692                            re_match.as_bytes(),
693                            Encoding::Utf8,
694                        ));
695                    }
696
697                    off_txt += line.len() as u64;
698                    // we have to add one because lines do not contain splitting character
699                    off_txt += 1;
700                    line_limit = line_limit.saturating_sub(1)
701                }
702                None
703            }
704
705            StreamKind::Binary => {
706                self.re.find(buf).map(|re_match| {
707                    MatchRes::Bytes(
708                        // the offset of the string is computed from the start of the buffer
709                        off_buf + re_match.start() as u64,
710                        None,
711                        re_match.as_bytes(),
712                        Encoding::Utf8,
713                    )
714                })
715            }
716        };
717
718        // handle the case where we want the regex not to match
719        if self.cmp_op.is_neq() && mr.is_none() {
720            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
721        }
722
723        mr
724    }
725}
726
727impl From<RegexTest> for Test {
728    fn from(value: RegexTest) -> Self {
729        Self::Regex(value)
730    }
731}
732
// Modifier flags stored as a `FlagSet<StringMod>` on string, search and
// regex tests. The order of the flags is significant: do not reorder them.
flags! {
    enum StringMod: u8{
        // consulted by the `is_binary` helpers
        ForceBin,
        // upper case bytes of the pattern match both cases in the target
        UpperInsensitive,
        // lower case bytes of the pattern match both cases in the target
        LowerInsensitive,
        // the byte following the match must be ASCII whitespace (or the end)
        FullWordMatch,
        Trim,
        // consulted by `StringTest::is_text` and `SearchTest::is_binary`
        ForceText,
        // a run of blanks in the pattern needs at least as many in the target
        CompactWhitespace,
        // blanks are skipped in both the pattern and the target
        OptBlank,
    }
}
745
746#[derive(Debug, Clone, Serialize, Deserialize)]
747struct StringTest {
748    test_val: TestValue<Vec<u8>>,
749    cmp_op: CmpOp,
750    length: Option<usize>,
751    mods: FlagSet<StringMod>,
752    binary: bool,
753}
754
755impl From<StringTest> for Test {
756    fn from(value: StringTest) -> Self {
757        Self::String(value)
758    }
759}
760
761#[inline(always)]
762fn string_match(str: &[u8], mods: FlagSet<StringMod>, buf: &[u8]) -> (bool, usize) {
763    let mut consumed = 0;
764    // we can do a simple string comparison
765    if mods.is_disjoint(
766        StringMod::UpperInsensitive
767            | StringMod::LowerInsensitive
768            | StringMod::FullWordMatch
769            | StringMod::CompactWhitespace
770            | StringMod::OptBlank,
771    ) {
772        // we check if target contains
773        if buf.starts_with(str) {
774            (true, str.len())
775        } else {
776            (false, consumed)
777        }
778    } else {
779        let mut i_src = 0;
780        let mut iter = buf.iter().peekable();
781
782        macro_rules! consume_target {
783            () => {{
784                if iter.next().is_some() {
785                    consumed += 1;
786                }
787            }};
788        }
789
790        macro_rules! continue_next_iteration {
791            () => {{
792                consume_target!();
793                i_src += 1;
794                continue;
795            }};
796        }
797
798        while let Some(&&b) = iter.peek() {
799            let Some(&ref_byte) = str.get(i_src) else {
800                break;
801            };
802
803            if mods.contains(StringMod::OptBlank) && (b == b' ' || ref_byte == b' ') {
804                if b == b' ' {
805                    // we ignore whitespace in target
806                    consume_target!();
807                }
808
809                if ref_byte == b' ' {
810                    // we ignore whitespace in test
811                    i_src += 1;
812                }
813
814                continue;
815            }
816
817            if mods.contains(StringMod::UpperInsensitive) {
818                //upper case characters in the magic match both lower and upper case characters in the target
819                if ref_byte.is_ascii_uppercase() && ref_byte == b.to_ascii_uppercase()
820                    || ref_byte == b
821                {
822                    continue_next_iteration!()
823                }
824            }
825
826            if mods.contains(StringMod::LowerInsensitive)
827                && (ref_byte.is_ascii_lowercase() && ref_byte == b.to_ascii_lowercase()
828                    || ref_byte == b)
829            {
830                continue_next_iteration!()
831            }
832
833            if mods.contains(StringMod::CompactWhitespace) && ref_byte == b' ' {
834                let mut src_blk = 0;
835                while let Some(b' ') = str.get(i_src) {
836                    src_blk += 1;
837                    i_src += 1;
838                }
839
840                let mut tgt_blk = 0;
841                while let Some(b' ') = iter.peek() {
842                    tgt_blk += 1;
843                    consume_target!();
844                }
845
846                if src_blk > tgt_blk {
847                    return (false, consumed);
848                }
849
850                continue;
851            }
852
853            if ref_byte == b {
854                continue_next_iteration!()
855            } else {
856                return (false, consumed);
857            }
858        }
859
860        if mods.contains(StringMod::FullWordMatch)
861            && let Some(b) = iter.peek()
862            && !b.is_ascii_whitespace()
863        {
864            return (false, consumed);
865        }
866
867        (
868            consumed > 0 && str.get(i_src).is_none() && consumed <= buf.len(),
869            consumed,
870        )
871    }
872}
873
874impl StringTest {
875    fn has_length_mod(&self) -> bool {
876        !self.mods.is_disjoint(
877            StringMod::UpperInsensitive
878                | StringMod::LowerInsensitive
879                | StringMod::FullWordMatch
880                | StringMod::CompactWhitespace
881                | StringMod::OptBlank,
882        )
883    }
884
885    #[inline(always)]
886    fn test_value_len(&self) -> usize {
887        match self.test_val.as_ref() {
888            TestValue::Value(s) => s.len(),
889            TestValue::Any => 0,
890        }
891    }
892
893    #[inline(always)]
894    fn is_binary(&self) -> bool {
895        self.binary || self.mods.contains(StringMod::ForceBin)
896    }
897
898    #[inline(always)]
899    fn is_text(&self) -> bool {
900        self.mods.contains(StringMod::ForceText)
901    }
902}
903
904#[derive(Debug, Clone, Serialize, Deserialize)]
905struct SearchTest {
906    str: Vec<u8>,
907    n_pos: Option<usize>,
908    str_mods: FlagSet<StringMod>,
909    re_mods: FlagSet<ReMod>,
910    binary: bool,
911    cmp_op: CmpOp,
912}
913
914impl From<SearchTest> for Test {
915    fn from(value: SearchTest) -> Self {
916        Self::Search(value)
917    }
918}
919
920impl SearchTest {
921    #[inline(always)]
922    fn is_binary(&self) -> bool {
923        (self.binary
924            || self.str_mods.contains(StringMod::ForceBin)
925            || self.re_mods.contains(ReMod::ForceBin))
926            && !(self.str_mods.contains(StringMod::ForceText)
927                || self.re_mods.contains(ReMod::ForceText))
928    }
929
930    // off_buf: absolute buffer offset in content
931    #[inline]
932    fn match_buf<'buf>(&self, off_buf: u64, buf: &'buf [u8]) -> Option<MatchRes<'buf>> {
933        let mut i = 0;
934
935        let needle = self.str.first()?;
936
937        while i < buf.len() {
938            // we cannot match if the first character isn't the same
939            // so we accelerate the search by finding potential matches
940            i += memchr(*needle, &buf[i..])?;
941
942            // if we want a full word match
943            if self.str_mods.contains(StringMod::FullWordMatch) {
944                let prev_is_whitespace = buf
945                    .get(i.saturating_sub(1))
946                    .map(|c| c.is_ascii_whitespace())
947                    .unwrap_or_default();
948
949                // if it is not the first character
950                // and its previous character isn't
951                // a whitespace. It cannot be a
952                // fullword match
953                if i > 0 && !prev_is_whitespace {
954                    i += 1;
955                    continue;
956                }
957            }
958
959            if let Some(npos) = self.n_pos
960                && i > npos
961            {
962                break;
963            }
964
965            let pos = i;
966            let (ok, consumed) = string_match(&self.str, self.str_mods, &buf[i..]);
967
968            if ok {
969                return Some(MatchRes::Bytes(
970                    off_buf.saturating_add(pos as u64),
971                    None,
972                    &buf[i..i + consumed],
973                    Encoding::Utf8,
974                ));
975            } else {
976                i += max(consumed, 1)
977            }
978        }
979
980        // handles the case where we want the string not to be found
981        if self.cmp_op.is_neq() {
982            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
983        }
984
985        None
986    }
987}
988
// Test comparing a scalar read from the content with an expected value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTest {
    // type of the scalar to read
    ty: ScalarDataType,
    // optional transformation applied to the value read before comparing it
    transform: Option<ScalarTransform>,
    // comparison operator
    cmp_op: CmpOp,
    // value to compare with (or any value)
    test_val: TestValue<Scalar>,
}
996
// Test comparing a float read from the content with an expected value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTest {
    // type of the float to read
    ty: FloatDataType,
    // optional transformation applied to the value read before comparing it
    transform: Option<FloatTransform>,
    // comparison operator
    cmp_op: CmpOp,
    // value to compare with (or any value)
    test_val: TestValue<Float>,
}
1004
// the value read from the haystack we want to match against
// 'buf is the lifetime of the buffer we are scanning
// the first field of every variant is the offset the value was read at
#[derive(Debug, PartialEq)]
enum ReadValue<'buf> {
    // a floating point value
    Float(u64, Float),
    // a scalar value
    Scalar(u64, Scalar),
    // raw bytes borrowed from the scanned buffer
    Bytes(u64, &'buf [u8]),
}
1013
1014impl DynDisplay for ReadValue<'_> {
1015    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1016        match self {
1017            Self::Float(_, s) => DynDisplay::dyn_fmt(s, f),
1018            Self::Scalar(_, s) => DynDisplay::dyn_fmt(s, f),
1019            Self::Bytes(_, b) => Ok(format!("{b:?}")),
1020        }
1021    }
1022}
1023
1024impl DynDisplay for &ReadValue<'_> {
1025    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1026        // Dereference self to get the TestValue and call its fmt method
1027        DynDisplay::dyn_fmt(*self, f)
1028    }
1029}
1030
1031impl Display for ReadValue<'_> {
1032    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1033        match self {
1034            Self::Float(_, v) => write!(f, "{v}"),
1035            Self::Scalar(_, s) => write!(f, "{s}"),
1036            Self::Bytes(_, b) => write!(f, "{b:?}"),
1037        }
1038    }
1039}
1040
// Encoding used to decode the bytes of a match when formatting them
enum Encoding {
    // UTF-16 with the given byte order
    Utf16(String16Encoding),
    // UTF-8
    Utf8,
}
1045
// Carry the offset of the start of the data in the stream
// and the data itself
enum MatchRes<'buf> {
    // Bytes.0: offset of the match
    // Bytes.1: optional end of match (to address the need of EOL adjustment in string regex)
    // Bytes.2: the bytes matching
    // Bytes.3: encoding of the buffer
    Bytes(u64, Option<u64>, &'buf [u8], Encoding),
    // Scalar.0: offset of the match
    // Scalar.1: the scalar matching
    Scalar(u64, Scalar),
    // Float.0: offset of the match
    // Float.1: the float matching
    Float(u64, Float),
}
1057
1058impl DynDisplay for &MatchRes<'_> {
1059    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1060        (*self).dyn_fmt(f)
1061    }
1062}
1063
1064impl DynDisplay for MatchRes<'_> {
1065    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1066        match self {
1067            Self::Scalar(_, v) => v.dyn_fmt(f),
1068            Self::Float(_, v) => v.dyn_fmt(f),
1069            Self::Bytes(_, _, v, enc) => match enc {
1070                Encoding::Utf8 => String::from_utf8_lossy(v).to_string().dyn_fmt(f),
1071                Encoding::Utf16(enc) => {
1072                    let utf16: Vec<u16> = slice_to_utf16_iter(v, *enc).collect();
1073                    String::from_utf16_lossy(&utf16).dyn_fmt(f)
1074                }
1075            },
1076        }
1077    }
1078}
1079
1080impl MatchRes<'_> {
1081    // start offset of the match
1082    #[inline]
1083    fn start_offset(&self) -> u64 {
1084        match self {
1085            MatchRes::Bytes(o, _, _, _) => *o,
1086            MatchRes::Scalar(o, _) => *o,
1087            MatchRes::Float(o, _) => *o,
1088        }
1089    }
1090
1091    // start offset of the match
1092    #[inline]
1093    fn end_offset(&self) -> u64 {
1094        match self {
1095            MatchRes::Bytes(start, end, buf, _) => match end {
1096                Some(end) => *end,
1097                None => start.saturating_add(buf.len() as u64),
1098            },
1099            MatchRes::Scalar(o, sc) => o.add(sc.size_of() as u64),
1100            MatchRes::Float(o, f) => o.add(f.size_of() as u64),
1101        }
1102    }
1103}
1104
1105fn slice_to_utf16_iter(read: &[u8], encoding: String16Encoding) -> impl Iterator<Item = u16> {
1106    let even = read
1107        .iter()
1108        .enumerate()
1109        .filter(|(i, _)| i % 2 == 0)
1110        .map(|t| t.1);
1111
1112    let odd = read
1113        .iter()
1114        .enumerate()
1115        .filter(|(i, _)| i % 2 != 0)
1116        .map(|t| t.1);
1117
1118    even.zip(odd).map(move |(e, o)| match encoding {
1119        String16Encoding::Le => u16::from_le_bytes([*e, *o]),
1120        String16Encoding::Be => u16::from_be_bytes([*e, *o]),
1121    })
1122}
1123
// Byte order of the UTF-16 code units of a 16 bits string
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum String16Encoding {
    // little endian
    Le,
    // big endian
    Be,
}
1129
// Test matching a UTF-16 encoded string
#[derive(Debug, Clone, Serialize, Deserialize)]
struct String16Test {
    // NOTE(review): presumably the test string as written in the rule — confirm
    orig: String,
    // expected UTF-16 code units (or any value)
    test_val: TestValue<Vec<u16>>,
    // byte order used to decode the content
    encoding: String16Encoding,
}
1136
1137impl String16Test {
1138    /// if the test value is a specific value this method returns
1139    /// the number of utf16 characters. To obtain the length in
1140    /// bytes the return value needs to be multiplied by two.
1141    #[inline(always)]
1142    fn test_value_len(&self) -> usize {
1143        match self.test_val.as_ref() {
1144            TestValue::Value(str16) => str16.len(),
1145            TestValue::Any => 0,
1146        }
1147    }
1148}
1149
// Modifiers of the indirect test
// NOTE(review): `Relative` presumably makes the indirect offset relative — confirm
flags! {
    enum IndirectMod: u8{
        Relative,
    }
}
1155
// Set of `IndirectMod` flags
type IndirectMods = FlagSet<IndirectMod>;
1157
// Format of the length prefix of a pstring: its size and byte order
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum PStringLen {
    Byte,    // B: one byte
    ShortBe, // H: two bytes, big endian
    ShortLe, // h: two bytes, little endian
    LongBe,  // L: four bytes, big endian
    LongLe,  // l: four bytes, little endian
}
1166
1167impl PStringLen {
1168    #[inline(always)]
1169    const fn size_of_len(&self) -> usize {
1170        match self {
1171            PStringLen::Byte => 1,
1172            PStringLen::ShortBe => 2,
1173            PStringLen::ShortLe => 2,
1174            PStringLen::LongBe => 4,
1175            PStringLen::LongLe => 4,
1176        }
1177    }
1178}
1179
// Test matching a string prefixed by its length
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PStringTest {
    // format of the length prefix
    len: PStringLen,
    // expected bytes (or any value)
    test_val: TestValue<Vec<u8>>,
    // when set, the length read counts the length prefix itself,
    // so the size of the prefix is subtracted from it
    include_len: bool,
}
1186
1187impl PStringTest {
1188    #[inline]
1189    fn read<'cache, R: Read + Seek>(
1190        &self,
1191        haystack: &'cache mut LazyCache<R>,
1192    ) -> Result<Option<&'cache [u8]>, Error> {
1193        let mut len = match self.len {
1194            PStringLen::Byte => read_le!(haystack, u8) as u32,
1195            PStringLen::ShortBe => read_be!(haystack, u16) as u32,
1196            PStringLen::ShortLe => read_le!(haystack, u16) as u32,
1197            PStringLen::LongBe => read_be!(haystack, u32),
1198            PStringLen::LongLe => read_le!(haystack, u32),
1199        } as usize;
1200
1201        if self.include_len {
1202            len = len.saturating_sub(self.len.size_of_len())
1203        }
1204
1205        if let TestValue::Value(s) = self.test_val.as_ref()
1206            && len != s.len()
1207        {
1208            return Ok(None);
1209        }
1210
1211        let read = haystack.read_exact_count(len as u64)?;
1212
1213        Ok(Some(read))
1214    }
1215
1216    #[inline(always)]
1217    fn test_value_len(&self) -> usize {
1218        match self.test_val.as_ref() {
1219            TestValue::Value(s) => s.len(),
1220            TestValue::Any => 0,
1221        }
1222    }
1223}
1224
// All the kinds of test supported.
// Scalar, Float, String, Search, PString, Regex and String16 read a
// value from the content; the other variants have no value to read.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Test {
    Name(String),
    // NOTE(review): meaning of the boolean isn't visible here — confirm
    Use(bool, String),
    Scalar(ScalarTest),
    Float(FloatTest),
    String(StringTest),
    Search(SearchTest),
    PString(PStringTest),
    Regex(RegexTest),
    Indirect(FlagSet<IndirectMod>),
    String16(String16Test),
    // FIXME: placeholder for strength computation
    #[allow(dead_code)]
    Der,
    Clear,
    Default,
}
1243
1244impl Test {
1245    // read the value to test from the haystack
1246    #[inline]
1247    fn read_test_value<'haystack, R: Read + Seek>(
1248        &self,
1249        haystack: &'haystack mut LazyCache<R>,
1250        switch_endianness: bool,
1251    ) -> Result<Option<ReadValue<'haystack>>, Error> {
1252        let test_value_offset = haystack.lazy_stream_position();
1253
1254        match self {
1255            Self::Scalar(t) => {
1256                t.ty.read(haystack, switch_endianness)
1257                    .map(|s| Some(ReadValue::Scalar(test_value_offset, s)))
1258            }
1259
1260            Self::Float(t) => {
1261                t.ty.read(haystack, switch_endianness)
1262                    .map(|f| Some(ReadValue::Float(test_value_offset, f)))
1263            }
1264            Self::String(t) => {
1265                match t.test_val.as_ref() {
1266                    TestValue::Value(str) => {
1267                        let buf = if let Some(length) = t.length {
1268                            // if there is a length specified
1269                            haystack.read_exact_count(length as u64)?
1270                        } else {
1271                            // no length specified we read until end of string
1272
1273                            match t.cmp_op {
1274                                CmpOp::Eq | CmpOp::Neq => {
1275                                    if !t.has_length_mod() {
1276                                        haystack.read_exact_count(str.len() as u64)?
1277                                    } else {
1278                                        haystack.read_count(FILE_BYTES_MAX as u64)?
1279                                    }
1280                                }
1281                                CmpOp::Lt | CmpOp::Gt => {
1282                                    let read =
1283                                        haystack.read_until_any_delim_or_limit(b"\n\0", 8092)?;
1284
1285                                    if read.ends_with(b"\0") || read.ends_with(b"\n") {
1286                                        &read[..read.len() - 1]
1287                                    } else {
1288                                        read
1289                                    }
1290                                }
1291                                _ => {
1292                                    return Err(Error::Msg(format!(
1293                                        "string test does not support {:?} operator",
1294                                        t.cmp_op
1295                                    )));
1296                                }
1297                            }
1298                        };
1299
1300                        Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1301                    }
1302                    TestValue::Any => {
1303                        let read = haystack.read_until_any_delim_or_limit(b"\0\n", 8192)?;
1304                        // we don't take last byte if it matches end of string
1305                        let bytes = if read.ends_with(b"\0") || read.ends_with(b"\n") {
1306                            &read[..read.len() - 1]
1307                        } else {
1308                            read
1309                        };
1310
1311                        Ok(Some(ReadValue::Bytes(test_value_offset, bytes)))
1312                    }
1313                }
1314            }
1315
1316            Self::String16(t) => {
1317                match t.test_val.as_ref() {
1318                    TestValue::Value(str16) => {
1319                        let read = haystack.read_exact_count((str16.len() * 2) as u64)?;
1320
1321                        Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1322                    }
1323                    TestValue::Any => {
1324                        let read = haystack.read_until_utf16_or_limit(b"\x00\x00", 8192)?;
1325
1326                        // we make sure we have an even number of elements
1327                        let end = if read.len() % 2 == 0 {
1328                            read.len()
1329                        } else {
1330                            // we decide to read anyway even though
1331                            // length isn't even
1332                            read.len().saturating_sub(1)
1333                        };
1334
1335                        Ok(Some(ReadValue::Bytes(test_value_offset, &read[..end])))
1336                    }
1337                }
1338            }
1339
1340            Self::PString(t) => {
1341                let Some(read) = t.read(haystack)? else {
1342                    return Ok(None);
1343                };
1344                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1345            }
1346
1347            Self::Search(_) => {
1348                let buf = haystack.read_count(FILE_BYTES_MAX as u64)?;
1349                Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1350            }
1351
1352            Self::Regex(r) => {
1353                let length = {
1354                    match r.length {
1355                        Some(len) => {
1356                            if r.mods.contains(ReMod::LineLimit) {
1357                                len * 80
1358                            } else {
1359                                len
1360                            }
1361                        }
1362
1363                        None => FILE_REGEX_MAX,
1364                    }
1365                };
1366
1367                let read = haystack.read_count(length as u64)?;
1368                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1369            }
1370
1371            Self::Name(_)
1372            | Self::Use(_, _)
1373            | Self::Indirect(_)
1374            | Self::Clear
1375            | Self::Default
1376            | Self::Der => Err(Error::msg("no value to read for this test")),
1377        }
1378    }
1379
1380    #[inline(always)]
1381    fn match_value<'s>(
1382        &'s self,
1383        tv: &ReadValue<'s>,
1384        stream_kind: StreamKind,
1385    ) -> Option<MatchRes<'s>> {
1386        match (self, tv) {
1387            (Self::Scalar(t), ReadValue::Scalar(o, ts)) => {
1388                let read_value: Scalar = match t.transform.as_ref() {
1389                    Some(t) => t.apply(*ts)?,
1390                    None => *ts,
1391                };
1392
1393                match t.test_val {
1394                    TestValue::Value(test_value) => {
1395                        let ok = match t.cmp_op {
1396                            // NOTE: this should not happen in practice because
1397                            // we convert it into Eq equivalent at parsing time
1398                            CmpOp::Not => read_value == !test_value,
1399                            CmpOp::Eq => read_value == test_value,
1400                            CmpOp::Lt => read_value < test_value,
1401                            CmpOp::Gt => read_value > test_value,
1402                            CmpOp::Neq => read_value != test_value,
1403                            CmpOp::BitAnd => read_value & test_value == test_value,
1404                            CmpOp::Xor => (read_value & test_value).is_zero(),
1405                        };
1406
1407                        if ok {
1408                            Some(MatchRes::Scalar(*o, read_value))
1409                        } else {
1410                            None
1411                        }
1412                    }
1413
1414                    TestValue::Any => Some(MatchRes::Scalar(*o, read_value)),
1415                }
1416            }
1417
1418            (Self::Float(t), ReadValue::Float(o, f)) => {
1419                let read_value: Float = t.transform.as_ref().map(|t| t.apply(*f)).unwrap_or(*f);
1420
1421                match t.test_val {
1422                    TestValue::Value(tf) => {
1423                        let ok = match t.cmp_op {
1424                            CmpOp::Eq => read_value == tf,
1425                            CmpOp::Lt => read_value < tf,
1426                            CmpOp::Gt => read_value > tf,
1427                            CmpOp::Neq => read_value != tf,
1428                            _ => {
1429                                // this should never be reached as we validate
1430                                // operator in parser
1431                                debug_panic!("unsupported float comparison");
1432                                debug!("unsupported float comparison");
1433                                false
1434                            }
1435                        };
1436
1437                        if ok {
1438                            Some(MatchRes::Float(*o, read_value))
1439                        } else {
1440                            None
1441                        }
1442                    }
1443                    TestValue::Any => Some(MatchRes::Float(*o, read_value)),
1444                }
1445            }
1446
1447            (Self::String(st), ReadValue::Bytes(o, buf)) => {
1448                macro_rules! trim_buf {
1449                    ($buf: expr) => {{
1450                        if st.mods.contains(StringMod::Trim) {
1451                            $buf.trim_ascii()
1452                        } else {
1453                            $buf
1454                        }
1455                    }};
1456                }
1457
1458                match st.test_val.as_ref() {
1459                    TestValue::Value(str) => {
1460                        match st.cmp_op {
1461                            CmpOp::Eq => {
1462                                if let (true, _) = string_match(str, st.mods, buf) {
1463                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1464                                } else {
1465                                    None
1466                                }
1467                            }
1468                            CmpOp::Neq => {
1469                                if let (false, _) = string_match(str, st.mods, buf) {
1470                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1471                                } else {
1472                                    None
1473                                }
1474                            }
1475                            CmpOp::Gt => {
1476                                if buf.len() > str.len() {
1477                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1478                                } else {
1479                                    None
1480                                }
1481                            }
1482                            CmpOp::Lt => {
1483                                if buf.len() < str.len() {
1484                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1485                                } else {
1486                                    None
1487                                }
1488                            }
1489
1490                            // unsupported for strings
1491                            _ => {
1492                                // this should never be reached as we validate
1493                                // operator in parser
1494                                debug_panic!("unsupported string comparison");
1495                                debug!("unsupported string comparison");
1496                                None
1497                            }
1498                        }
1499                    }
1500                    TestValue::Any => {
1501                        Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1502                    }
1503                }
1504            }
1505
1506            (Self::PString(m), ReadValue::Bytes(o, buf)) => match m.test_val.as_ref() {
1507                TestValue::Value(psv) => {
1508                    if buf == psv {
1509                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8))
1510                    } else {
1511                        None
1512                    }
1513                }
1514                TestValue::Any => Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8)),
1515            },
1516
1517            (Self::String16(t), ReadValue::Bytes(o, buf)) => {
1518                match t.test_val.as_ref() {
1519                    TestValue::Value(str16) => {
1520                        // strings cannot be equal
1521                        if str16.len() * 2 != buf.len() {
1522                            return None;
1523                        }
1524
1525                        // we check string equality
1526                        for (i, utf16_char) in slice_to_utf16_iter(buf, t.encoding).enumerate() {
1527                            if str16[i] != utf16_char {
1528                                return None;
1529                            }
1530                        }
1531
1532                        Some(MatchRes::Bytes(
1533                            *o,
1534                            None,
1535                            t.orig.as_bytes(),
1536                            Encoding::Utf16(t.encoding),
1537                        ))
1538                    }
1539
1540                    TestValue::Any => {
1541                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf16(t.encoding)))
1542                    }
1543                }
1544            }
1545
1546            (Self::Regex(r), ReadValue::Bytes(o, buf)) => r.match_buf(*o, stream_kind, buf),
1547
1548            (Self::Search(t), ReadValue::Bytes(o, buf)) => t.match_buf(*o, buf),
1549
1550            _ => None,
1551        }
1552    }
1553
1554    #[inline(always)]
1555    fn strength(&self) -> u64 {
1556        const MULT: usize = 10;
1557
1558        let mut out = 2 * MULT;
1559
1560        // FIXME: octal is missing but it is not used in practice ...
1561        match self {
1562            Test::Scalar(s) => {
1563                out += s.ty.type_size() * MULT;
1564            }
1565
1566            Test::Float(t) => {
1567                out += t.ty.type_size() * MULT;
1568            }
1569
1570            Test::String(t) => out += t.test_value_len().saturating_mul(MULT),
1571
1572            Test::PString(t) => out += t.test_value_len().saturating_mul(MULT),
1573
1574            Test::Search(s) => {
1575                // NOTE: this implementation deviates from what is in
1576                // C libmagic. The purpose of this implementation is to
1577                // minimize the difference between similar tests,
1578                // implemented differently (ex: string test VS very localized search test).
1579                let n_pos = s.n_pos.unwrap_or(FILE_BYTES_MAX);
1580
1581                match n_pos {
1582                    // a search on one line should be equivalent to a string match
1583                    0..=80 => out += s.str.len().saturating_mul(MULT),
1584                    // search on the first 3 lines gets a little penalty
1585                    81..=240 => out += s.str.len() * s.str.len().clamp(0, MULT - 2),
1586                    // a search on more than 3 lines isn't considered very accurate
1587                    _ => out += s.str.len(),
1588                }
1589            }
1590
1591            Test::Regex(r) => {
1592                // NOTE: this implementation deviates from what is in
1593                // C libmagic. The purpose of this implementation is to
1594                // minimize the difference between similar tests,
1595                // implemented differently (ex: string test VS very localized regex test).
1596
1597                // we divide length by the number of capture group
1598                // which gives us a value close to he average string
1599                // length match in the regex.
1600                let v = r.non_magic_len / r.re.captures_len();
1601
1602                let len = r
1603                    .length
1604                    .map(|l| {
1605                        if r.mods.contains(ReMod::LineLimit) {
1606                            l * 80
1607                        } else {
1608                            l
1609                        }
1610                    })
1611                    .unwrap_or(FILE_BYTES_MAX);
1612
1613                match len {
1614                    // a search on one line should be equivalent to a string match
1615                    0..=80 => out += v.saturating_mul(MULT),
1616                    // search on the first 3 lines gets a little penalty
1617                    81..=240 => out += v * v.clamp(0, MULT - 2),
1618                    // a search on more than 3 lines isn't considered very accurate
1619                    _ => out += v,
1620                }
1621            }
1622
1623            Test::String16(t) => {
1624                // NOTE: in libmagic the result is div by 2
1625                // but I GUESS it is because the len is expressed
1626                // in number bytes. In our case length is expressed
1627                // in number of u16 so we shouldn't divide.
1628                out += t.test_value_len().saturating_mul(MULT);
1629            }
1630
1631            Test::Der => out += MULT,
1632
1633            Test::Default | Test::Name(_) | Test::Use(_, _) | Test::Indirect(_) | Test::Clear => {
1634                return 0;
1635            }
1636        }
1637
1638        // matching any output gets penalty
1639        if self.is_match_any() {
1640            return 0;
1641        }
1642
1643        if let Some(op) = self.cmp_op() {
1644            match op {
1645                // matching almost any gets penalty
1646                CmpOp::Neq => out = 0,
1647                CmpOp::Eq | CmpOp::Not => out += MULT,
1648                CmpOp::Lt | CmpOp::Gt => out -= 2 * MULT,
1649                CmpOp::Xor | CmpOp::BitAnd => out -= MULT,
1650            }
1651        }
1652
1653        out as u64
1654    }
1655
1656    #[inline(always)]
1657    fn cmp_op(&self) -> Option<CmpOp> {
1658        match self {
1659            Self::String(t) => Some(t.cmp_op),
1660            Self::Scalar(s) => Some(s.cmp_op),
1661            Self::Float(t) => Some(t.cmp_op),
1662            Self::Name(_)
1663            | Self::Use(_, _)
1664            | Self::Search(_)
1665            | Self::PString(_)
1666            | Self::Regex(_)
1667            | Self::Clear
1668            | Self::Default
1669            | Self::Indirect(_)
1670            | Self::String16(_)
1671            | Self::Der => None,
1672        }
1673    }
1674
1675    #[inline(always)]
1676    fn is_match_any(&self) -> bool {
1677        match self {
1678            Test::Name(_) => false,
1679            Test::Use(_, _) => false,
1680            Test::Scalar(scalar_test) => matches!(scalar_test.test_val, TestValue::Any),
1681            Test::Float(float_test) => matches!(float_test.test_val, TestValue::Any),
1682            Test::String(string_test) => matches!(string_test.test_val, TestValue::Any),
1683            Test::Search(_) => false,
1684            Test::PString(pstring_test) => matches!(pstring_test.test_val, TestValue::Any),
1685            Test::Regex(_) => false,
1686            Test::Indirect(_) => false,
1687            Test::String16(string16_test) => matches!(string16_test.test_val, TestValue::Any),
1688            Test::Der => false,
1689            Test::Clear => false,
1690            Test::Default => false,
1691        }
1692    }
1693
1694    #[inline(always)]
1695    fn is_binary(&self) -> bool {
1696        match self {
1697            Self::Name(_) => true,
1698            Self::Use(_, _) => true,
1699            Self::Scalar(_) => true,
1700            Self::Float(_) => true,
1701            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1702            Self::Search(t) => t.is_binary(),
1703            Self::PString(_) => true,
1704            Self::Regex(t) => t.is_binary(),
1705            Self::Clear => true,
1706            Self::Default => true,
1707            Self::Indirect(_) => true,
1708            Self::String16(_) => true,
1709            Self::Der => true,
1710        }
1711    }
1712
1713    #[inline(always)]
1714    fn is_text(&self) -> bool {
1715        match self {
1716            Self::Name(_) => true,
1717            Self::Use(_, _) => true,
1718            Self::Indirect(_) => true,
1719            Self::Clear => true,
1720            Self::Default => true,
1721            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1722            _ => !self.is_binary(),
1723        }
1724    }
1725
1726    #[inline(always)]
1727    fn is_only_text(&self) -> bool {
1728        self.is_text() && !self.is_binary()
1729    }
1730
1731    #[inline(always)]
1732    fn is_only_binary(&self) -> bool {
1733        self.is_binary() && !self.is_text()
1734    }
1735}
1736
// Type of the value to read to get an indirect offset
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum OffsetType {
    // 8 bits integer
    Byte,
    // 64 bits float, little endian
    DoubleLe,
    // 64 bits float, big endian
    DoubleBe,
    // 16 bits integer, little endian
    ShortLe,
    // 16 bits integer, big endian
    ShortBe,
    // 32 bits little endian value decoded with decode_id3
    Id3Le,
    // 32 bits big endian value decoded with decode_id3
    Id3Be,
    // 32 bits integer, little endian
    LongLe,
    // 32 bits integer, big endian
    LongBe,
    // value read with read_me!
    // NOTE(review): presumably a middle endian integer — confirm
    Middle,
    // value read with read_octal_u64
    Octal,
    // 64 bits integer, big endian
    QuadBe,
    // 64 bits integer, little endian
    QuadLe,
}
1753
// Operand of the operation of an indirect offset
// NOTE(review): presumably Direct is an immediate value and Indirect the
// location of a value to read — confirm against the code using it
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Shift {
    Direct(u64),
    Indirect(i64),
}
1759
// Indirect offset: an offset whose value is read from the content
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
struct IndOffset {
    // where to find the offset
    off_addr: DirOffset,
    // signed or unsigned
    signed: bool,
    // type of the offset
    ty: OffsetType,
    // NOTE(review): presumably the operation applied to the value read — confirm
    op: Option<Op>,
    // NOTE(review): presumably the operand of `op` — confirm
    shift: Option<Shift>,
}
1771
1772impl IndOffset {
1773    // if we overflow we must not return an offset
1774    fn read_offset<R: Read + Seek>(
1775        &self,
1776        haystack: &mut LazyCache<R>,
1777        rule_base_offset: Option<u64>,
1778        last_upper_match_offset: Option<u64>,
1779    ) -> Result<Option<u64>, io::Error> {
1780        let offset_address = match self.off_addr {
1781            DirOffset::Start(s) => {
1782                let Some(o) = s.checked_add(rule_base_offset.unwrap_or_default()) else {
1783                    return Ok(None);
1784                };
1785
1786                haystack.seek(SeekFrom::Start(o))?
1787            }
1788            DirOffset::LastUpper(c) => haystack.seek(SeekFrom::Start(
1789                (last_upper_match_offset.unwrap_or_default() as i64 + c) as u64,
1790            ))?,
1791            DirOffset::End(e) => haystack.seek(SeekFrom::End(e))?,
1792        };
1793
1794        macro_rules! read_value {
1795            () => {
1796                match self.ty {
1797                    OffsetType::Byte => {
1798                        if self.signed {
1799                            read_le!(haystack, u8) as u64
1800                        } else {
1801                            read_le!(haystack, i8) as u64
1802                        }
1803                    }
1804                    OffsetType::DoubleLe => read_le!(haystack, f64) as u64,
1805                    OffsetType::DoubleBe => read_be!(haystack, f64) as u64,
1806                    OffsetType::ShortLe => {
1807                        if self.signed {
1808                            read_le!(haystack, i16) as u64
1809                        } else {
1810                            read_le!(haystack, u16) as u64
1811                        }
1812                    }
1813                    OffsetType::ShortBe => {
1814                        if self.signed {
1815                            read_be!(haystack, i16) as u64
1816                        } else {
1817                            read_be!(haystack, u16) as u64
1818                        }
1819                    }
1820                    OffsetType::Id3Le => decode_id3(read_le!(haystack, u32)) as u64,
1821                    OffsetType::Id3Be => decode_id3(read_be!(haystack, u32)) as u64,
1822                    OffsetType::LongLe => {
1823                        if self.signed {
1824                            read_le!(haystack, i32) as u64
1825                        } else {
1826                            read_le!(haystack, u32) as u64
1827                        }
1828                    }
1829                    OffsetType::LongBe => {
1830                        if self.signed {
1831                            read_be!(haystack, i32) as u64
1832                        } else {
1833                            read_be!(haystack, u32) as u64
1834                        }
1835                    }
1836                    OffsetType::Middle => read_me!(haystack) as u64,
1837                    OffsetType::Octal => {
1838                        if let Some(o) = read_octal_u64(haystack) {
1839                            o
1840                        } else {
1841                            debug!("failed to read octal offset @ {offset_address}");
1842                            return Ok(None);
1843                        }
1844                    }
1845                    OffsetType::QuadLe => {
1846                        if self.signed {
1847                            read_le!(haystack, i64) as u64
1848                        } else {
1849                            read_le!(haystack, u64)
1850                        }
1851                    }
1852                    OffsetType::QuadBe => {
1853                        if self.signed {
1854                            read_be!(haystack, i64) as u64
1855                        } else {
1856                            read_be!(haystack, u64)
1857                        }
1858                    }
1859                }
1860            };
1861        }
1862
1863        // in theory every offset read should end up in something seekable from start, so we can use u64 to store the result
1864        let o = read_value!();
1865
1866        trace!(
1867            "offset read @ {offset_address} value={o} op={:?} shift={:?}",
1868            self.op, self.shift
1869        );
1870
1871        // apply transformation
1872        if let (Some(op), Some(shift)) = (self.op, self.shift) {
1873            let shift = match shift {
1874                Shift::Direct(i) => i,
1875                Shift::Indirect(i) => {
1876                    let tmp = offset_address as i128 + i as i128;
1877                    if tmp.is_negative() {
1878                        return Ok(None);
1879                    } else {
1880                        haystack.seek(SeekFrom::Start(tmp as u64))?;
1881                    };
1882                    // NOTE: here we assume that the shift has the same
1883                    // type as the main offset !
1884                    read_value!()
1885                }
1886            };
1887
1888            match op {
1889                Op::Add => return Ok(o.checked_add(shift)),
1890                Op::Mul => return Ok(o.checked_mul(shift)),
1891                Op::Sub => return Ok(o.checked_sub(shift)),
1892                Op::Div => return Ok(o.checked_div(shift)),
1893                Op::Mod => return Ok(o.checked_rem(shift)),
1894                Op::And => return Ok(Some(o & shift)),
1895                Op::Or => return Ok(Some(o | shift)),
1896                Op::Xor => return Ok(Some(o ^ shift)),
1897            }
1898        }
1899
1900        Ok(Some(o))
1901    }
1902}
1903
/// A direct offset, i.e. one that is fully described by the rule and does
/// not require reading a value from the inspected stream.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum DirOffset {
    /// Offset counted from the start; resolved through `SeekFrom::Start`.
    Start(u64),
    /// Signed shift relative to the last up-level field.
    LastUpper(i64),
    /// Offset resolved through `SeekFrom::End`.
    End(i64),
}
1911
/// Location at which a test applies: either a direct offset or an indirect
/// one whose value has to be read first.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Offset {
    /// Offset known without reading anything.
    Direct(DirOffset),
    /// Offset obtained by calling `read_offset` on the wrapped `IndOffset`.
    Indirect(IndOffset),
}
1917
1918impl From<DirOffset> for Offset {
1919    fn from(value: DirOffset) -> Self {
1920        Self::Direct(value)
1921    }
1922}
1923
1924impl From<IndOffset> for Offset {
1925    fn from(value: IndOffset) -> Self {
1926        Self::Indirect(value)
1927    }
1928}
1929
1930impl Display for DirOffset {
1931    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1932        match self {
1933            DirOffset::Start(i) => write!(f, "{i}"),
1934            DirOffset::LastUpper(c) => write!(f, "&{c}"),
1935            DirOffset::End(e) => write!(f, "-{e}"),
1936        }
1937    }
1938}
1939
1940impl Default for DirOffset {
1941    fn default() -> Self {
1942        Self::LastUpper(0)
1943    }
1944}
1945
/// A single test entry of a magic rule: where to look, what to test and
/// which message to report when the test succeeds.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Match {
    /// Line number of the entry, used to localize errors and log messages.
    line: usize,
    /// Nesting depth of the entry; also used as its continuation level.
    depth: u8,
    /// Where the test has to be applied.
    offset: Offset,
    /// The test to run.
    test: Test,
    /// Value of `test.strength()`, computed when the `Match` is built.
    test_strength: u64,
    /// Optional message reported when the entry matches.
    message: Option<Message>,
}
1955
1956impl From<Use> for Match {
1957    fn from(value: Use) -> Self {
1958        let test = Test::Use(value.switch_endianness, value.rule_name);
1959        let test_strength = test.strength();
1960        Self {
1961            line: value.line,
1962            depth: value.depth,
1963            offset: value.start_offset,
1964            test,
1965            test_strength,
1966            message: value.message,
1967        }
1968    }
1969}
1970
1971impl From<Name> for Match {
1972    fn from(value: Name) -> Self {
1973        let test = Test::Name(value.name);
1974        let test_strength = test.strength();
1975        Self {
1976            line: value.line,
1977            depth: 0,
1978            offset: Offset::Direct(DirOffset::Start(0)),
1979            test,
1980            test_strength,
1981            message: value.message,
1982        }
1983    }
1984}
1985
1986impl Match {
1987    /// Turns the `Match`'s offset into an absolute offset from the start of the stream
1988    #[inline(always)]
1989    fn offset_from_start<R: Read + Seek>(
1990        &self,
1991        haystack: &mut LazyCache<R>,
1992        rule_base_offset: Option<u64>,
1993        last_level_offset: Option<u64>,
1994    ) -> Result<Option<u64>, io::Error> {
1995        match self.offset {
1996            Offset::Direct(dir_offset) => match dir_offset {
1997                DirOffset::Start(s) => Ok(Some(s)),
1998                DirOffset::LastUpper(shift) => {
1999                    let o = last_level_offset.unwrap_or_default() as i64 + shift;
2000
2001                    if o >= 0 { Ok(Some(o as u64)) } else { Ok(None) }
2002                }
2003                DirOffset::End(e) => Ok(Some(haystack.offset_from_start(SeekFrom::End(e)))),
2004            },
2005            Offset::Indirect(ind_offset) => {
2006                let Some(o) =
2007                    ind_offset.read_offset(haystack, rule_base_offset, last_level_offset)?
2008                else {
2009                    return Ok(None);
2010                };
2011
2012                Ok(Some(o))
2013            }
2014        }
2015    }
2016
2017    /// this method emulates the buffer based matching
2018    /// logic implemented in libmagic. It needs some aweful
2019    /// and weird offset convertions to turn buffer
2020    /// relative offsets (libmagic is based on) into
2021    /// absolute offset in the file.
2022    ///
2023    /// this method shoud bubble up only critical errors
2024    /// all the other errors should make the match result
2025    /// false and be logged via debug!
2026    ///
2027    /// the function returns an error if the maximum recursion
2028    /// has been reached or if a dependency rule is missing.
2029    #[inline]
2030    #[allow(clippy::too_many_arguments)]
2031    fn matches<'a: 'h, 'h, R: Read + Seek>(
2032        &'a self,
2033        source: Option<&str>,
2034        magic: &mut Magic<'a>,
2035        stream_kind: StreamKind,
2036        state: &mut MatchState,
2037        buf_base_offset: Option<u64>,
2038        rule_base_offset: Option<u64>,
2039        last_level_offset: Option<u64>,
2040        haystack: &'h mut LazyCache<R>,
2041        switch_endianness: bool,
2042        db: &'a MagicDb,
2043        depth: usize,
2044    ) -> Result<(bool, Option<MatchRes<'h>>), Error> {
2045        let source = source.unwrap_or("unknown");
2046        let line = self.line;
2047
2048        if depth >= MAX_RECURSION {
2049            return Err(Error::localized(
2050                source,
2051                line,
2052                Error::MaximumRecursion(MAX_RECURSION),
2053            ));
2054        }
2055
2056        if self.test.is_only_binary() && stream_kind.is_text() {
2057            trace!("skip binary test source={source} line={line} stream_kind={stream_kind:?}",);
2058            return Ok((false, None));
2059        }
2060
2061        if self.test.is_only_text() && !stream_kind.is_text() {
2062            trace!("skip text test source={source} line={line} stream_kind={stream_kind:?}",);
2063            return Ok((false, None));
2064        }
2065
2066        let Ok(Some(mut offset)) = self
2067            .offset_from_start(haystack, rule_base_offset, last_level_offset)
2068            .inspect_err(|e| debug!("source={source} line={line} failed at computing offset: {e}"))
2069        else {
2070            return Ok((false, None));
2071        };
2072
2073        offset = match self.offset {
2074            Offset::Indirect(_) => {
2075                // the result we get for an indirect offset
2076                // is relative to the start of the libmagic
2077                // buffer so we need to add base to make it
2078                // absolute.
2079                buf_base_offset.unwrap_or_default().saturating_add(offset)
2080            }
2081            // offset from start are computed from rule base
2082            Offset::Direct(DirOffset::Start(_)) => {
2083                rule_base_offset.unwrap_or_default().saturating_add(offset)
2084            }
2085            _ => offset,
2086        };
2087
2088        match &self.test {
2089            Test::Clear => {
2090                trace!("source={source} line={line} clear");
2091                state.clear_continuation_level(&self.continuation_level());
2092                Ok((true, None))
2093            }
2094
2095            Test::Name(name) => {
2096                trace!(
2097                    "source={source} line={line} running rule {name} switch_endianness={switch_endianness}",
2098                );
2099                Ok((true, None))
2100            }
2101
2102            Test::Use(flip_endianness, rule_name) => {
2103                trace!(
2104                    "source={source} line={line} use {rule_name} switch_endianness={flip_endianness}",
2105                );
2106
2107                // switch_endianness must propagate down the rule call stack
2108                let switch_endianness = switch_endianness ^ flip_endianness;
2109
2110                let dr: &DependencyRule = db.dependencies.get(rule_name).ok_or(
2111                    Error::localized(source, line, Error::MissingRule(rule_name.clone())),
2112                )?;
2113
2114                // we push the message here otherwise we push message in depth first
2115                if let Some(msg) = self.message.as_ref() {
2116                    magic.push_message(msg.to_string_lossy());
2117                }
2118
2119                dr.rule.magic(
2120                    magic,
2121                    stream_kind,
2122                    buf_base_offset,
2123                    Some(offset),
2124                    haystack,
2125                    db,
2126                    switch_endianness,
2127                    depth.saturating_add(1),
2128                )?;
2129
2130                // we return false not to push message again
2131                Ok((false, None))
2132            }
2133
2134            Test::Indirect(m) => {
2135                trace!(
2136                    "source={source} line={line} indirect mods={:?} offset={offset:#x}",
2137                    m
2138                );
2139
2140                let new_buf_base_off = if m.contains(IndirectMod::Relative) {
2141                    Some(offset)
2142                } else {
2143                    None
2144                };
2145
2146                // we push the message here otherwise we push message in depth first
2147                if let Some(msg) = self.message.as_ref() {
2148                    magic.push_message(msg.to_string_lossy());
2149                }
2150
2151                for r in db.rules.iter() {
2152                    let messages_cnt = magic.message.len();
2153
2154                    r.magic(
2155                        magic,
2156                        stream_kind,
2157                        new_buf_base_off,
2158                        Some(offset),
2159                        haystack,
2160                        db,
2161                        false,
2162                        depth.saturating_add(1),
2163                    )?;
2164
2165                    // this means we matched a rule
2166                    if magic.message.len() != messages_cnt {
2167                        break;
2168                    }
2169                }
2170
2171                // we return false not to push message again
2172                Ok((false, None))
2173            }
2174
2175            Test::Default => {
2176                // default matches if nothing else at the continuation level matched
2177                let ok = !state.get_continuation_level(&self.continuation_level());
2178
2179                trace!("source={source} line={line} default match={ok}");
2180                if ok {
2181                    state.set_continuation_level(self.continuation_level());
2182                }
2183
2184                Ok((ok, None))
2185            }
2186
2187            _ => {
2188                if let Err(e) = haystack.seek(SeekFrom::Start(offset)) {
2189                    debug!("source={source} line={line} failed to seek in haystack: {e}");
2190                    return Ok((false, None));
2191                }
2192
2193                let mut trace_msg = None;
2194
2195                if enabled!(Level::DEBUG) {
2196                    trace_msg = Some(vec![format!(
2197                        "source={source} line={line} depth={} stream_offset={:#x}",
2198                        self.depth,
2199                        haystack.lazy_stream_position()
2200                    )])
2201                }
2202
2203                // NOTE: we may have a way to optimize here. In case we do a Any
2204                // test and we don't use the value to format the message, we don't
2205                // need to read the value.
2206                if let Ok(opt_test_value) = self
2207                    .test
2208                    .read_test_value(haystack, switch_endianness)
2209                    .inspect_err(|e| {
2210                        debug!("source={source} line={line} error while reading test value @{offset}: {e}",)
2211                    })
2212                {
2213                    if let Some(v) = trace_msg
2214                        .as_mut() { v.push(format!("test={:?}", self.test)) }
2215
2216                    let match_res =
2217                        opt_test_value.and_then(|tv| self.test.match_value(&tv, stream_kind));
2218
2219                    if let Some(v) = trace_msg.as_mut() { v.push(format!(
2220                            "message=\"{}\" match={}",
2221                            self.message
2222                                .as_ref()
2223                                .map(|fs| fs.to_string_lossy())
2224                                .unwrap_or_default(),
2225                            match_res.is_some()
2226                        )) }
2227
2228                    // trace message
2229                    if enabled!(Level::DEBUG) && !enabled!(Level::TRACE) && match_res.is_some() {
2230                        if let Some(m) = trace_msg{
2231                            debug!("{}", m.join(" "));
2232                        }
2233                    } else if enabled!(Level::TRACE)
2234                        && let Some(m) = trace_msg{
2235                            trace!("{}", m.join(" "));
2236                        }
2237
2238                    if let Some(mr) = match_res {
2239                        state.set_continuation_level(self.continuation_level());
2240                        return Ok((true, Some(mr)));
2241                    }
2242                }
2243
2244                Ok((false, None))
2245            }
2246        }
2247    }
2248
2249    #[inline(always)]
2250    fn continuation_level(&self) -> ContinuationLevel {
2251        ContinuationLevel(self.depth)
2252    }
2253}
2254
/// A parsed `use` entry; it is turned into a [`Match`] carrying a
/// `Test::Use` through the `From<Use>` implementation.
#[derive(Debug, Clone)]
struct Use {
    /// Line number of the entry.
    line: usize,
    /// Nesting depth of the entry.
    depth: u8,
    /// Offset of the entry, becomes the `offset` of the resulting `Match`.
    start_offset: Offset,
    /// Name of the rule to run.
    rule_name: String,
    /// Whether endianness has to be flipped while running the rule.
    switch_endianness: bool,
    /// Optional message attached to the entry.
    message: Option<Message>,
}
2264
/// Modifier applied to the strength of a matching entry.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StrengthMod {
    /// Operator used to combine the strength with `by`.
    op: Op,
    /// Right-hand operand of `op`.
    by: u8,
}
2270
2271impl StrengthMod {
2272    #[inline(always)]
2273    fn apply(&self, strength: u64) -> u64 {
2274        let by = self.by as u64;
2275        debug!("applying strength modifier: {strength} {} {}", self.op, by);
2276        match self.op {
2277            Op::Mul => strength.saturating_mul(by),
2278            Op::Add => strength.saturating_add(by),
2279            Op::Sub => strength.saturating_sub(by),
2280            Op::Div => {
2281                if by > 0 {
2282                    strength.saturating_div(by)
2283                } else {
2284                    strength
2285                }
2286            }
2287            Op::Mod => strength % by,
2288            Op::And => strength & by,
2289            // this should never happen as strength operators
2290            // are enforced by our parser
2291            Op::Xor | Op::Or => {
2292                debug_panic!("unsupported strength operator");
2293                strength
2294            }
2295        }
2296    }
2297}
2298
/// Metadata that can be attached to an entry, as opposed to a test.
// NOTE(review): presumably produced by the parser from the `!:` lines of a
// magic file — the parsing code is not visible here, confirm.
#[derive(Debug, Clone)]
enum Flag {
    /// A MIME type.
    Mime(String),
    /// A set of file extensions.
    Ext(HashSet<String>),
    /// A strength modifier.
    Strength(StrengthMod),
    /// An Apple related string (see `EntryNode::apple`, reported through
    /// `set_creator_code`).
    Apple(String),
}
2306
/// A parsed `name` entry; it is turned into a [`Match`] carrying a
/// `Test::Name` through the `From<Name>` implementation.
#[derive(Debug, Clone)]
struct Name {
    /// Line number of the entry.
    line: usize,
    /// Name given to the rule.
    name: String,
    /// Optional message attached to the entry.
    message: Option<Message>,
}
2313
/// A parsed item paired with its `Span`: either a test or a flag.
#[derive(Debug, Clone)]
enum Entry<'span> {
    /// A test entry.
    Match(Span<'span>, Match),
    /// A flag entry.
    Flag(Span<'span>, Flag),
}
2319
/// A node of the tree of entries making a rule: a test, its metadata and
/// the entries to evaluate when the test matches.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EntryNode {
    /// Whether the node is a root; only roots get their rule base offset
    /// recomputed when their own offset is relative to the end.
    root: bool,
    /// The test carried by the node.
    entry: Match,
    /// Nodes evaluated, in order, once `entry` matched.
    children: Vec<EntryNode>,
    /// MIME type reported when the node matches.
    mimetype: Option<String>,
    /// Creator code reported when the node matches.
    apple: Option<String>,
    /// Modifier applied to the strength of the node when it matches.
    strength_mod: Option<StrengthMod>,
    /// Extensions reported when the node matches.
    exts: HashSet<String>,
}
2330
2331impl EntryNode {
2332    fn update_exts_rec(
2333        &self,
2334        exts: &mut HashSet<String>,
2335        deps: &HashMap<String, DependencyRule>,
2336        marked: &mut HashSet<String>,
2337    ) -> Result<(), ()> {
2338        for ext in self.exts.iter() {
2339            if !exts.contains(ext) {
2340                exts.insert(ext.clone());
2341            }
2342        }
2343
2344        for c in self.children.iter() {
2345            if let Test::Use(_, ref name) = c.entry.test {
2346                if marked.contains(name) {
2347                    continue;
2348                }
2349                if let Some(r) = deps.get(name) {
2350                    marked.insert(name.clone());
2351                    exts.extend(r.rule.fetch_all_extensions(deps, marked)?);
2352                } else {
2353                    return Err(());
2354                }
2355            } else {
2356                c.update_exts_rec(exts, deps, marked)?;
2357            }
2358        }
2359
2360        Ok(())
2361    }
2362
2363    fn update_score_rec(
2364        &self,
2365        depth: usize,
2366        score: &mut u64,
2367        deps: &HashMap<String, DependencyRule>,
2368        marked: &mut HashSet<String>,
2369    ) {
2370        if depth == 3 {
2371            return;
2372        }
2373
2374        *score += self
2375            .children
2376            .iter()
2377            .map(|e| e.entry.test_strength)
2378            .min()
2379            .unwrap_or_default();
2380
2381        for c in self.children.iter() {
2382            if let Test::Use(_, ref name) = c.entry.test {
2383                if marked.contains(name) {
2384                    continue;
2385                }
2386
2387                if let Some(r) = deps.get(name) {
2388                    marked.insert(name.clone());
2389                    *score += r.rule.compute_score(depth, deps, marked);
2390                }
2391            }
2392            c.update_score_rec(depth + 1, score, deps, marked);
2393        }
2394    }
2395
2396    #[inline]
2397    #[allow(clippy::too_many_arguments)]
2398    fn matches<'r, R: Read + Seek>(
2399        &'r self,
2400        opt_source: Option<&str>,
2401        magic: &mut Magic<'r>,
2402        state: &mut MatchState,
2403        stream_kind: StreamKind,
2404        buf_base_offset: Option<u64>,
2405        rule_base_offset: Option<u64>,
2406        last_level_offset: Option<u64>,
2407        haystack: &mut LazyCache<R>,
2408        db: &'r MagicDb,
2409        switch_endianness: bool,
2410        depth: usize,
2411    ) -> Result<(), Error> {
2412        let (ok, opt_match_res) = self.entry.matches(
2413            opt_source,
2414            magic,
2415            stream_kind,
2416            state,
2417            buf_base_offset,
2418            rule_base_offset,
2419            last_level_offset,
2420            haystack,
2421            switch_endianness,
2422            db,
2423            depth,
2424        )?;
2425
2426        let source = opt_source.unwrap_or("unknown");
2427        let line = self.entry.line;
2428
2429        if ok {
2430            // update magic with message if match is successful
2431            if let Some(msg) = self.entry.message.as_ref()
2432                && let Ok(msg) = msg.format_with(opt_match_res.as_ref()).inspect_err(|e| {
2433                    debug!("source={source} line={line} failed to format message: {e}")
2434                })
2435            {
2436                magic.push_message(msg);
2437            }
2438
2439            // we need to adjust stream offset in case of regex/search tests
2440            if let Some(mr) = opt_match_res {
2441                match &self.entry.test {
2442                    Test::String(t) => {
2443                        if t.has_length_mod() {
2444                            let o = mr.end_offset();
2445                            haystack.seek(SeekFrom::Start(o))?;
2446                        }
2447                    }
2448                    Test::Search(t) => {
2449                        if t.re_mods.contains(ReMod::StartOffsetUpdate) {
2450                            let o = mr.start_offset();
2451                            haystack.seek(SeekFrom::Start(o))?;
2452                        } else {
2453                            let o = mr.end_offset();
2454                            haystack.seek(SeekFrom::Start(o))?;
2455                        }
2456                    }
2457
2458                    Test::Regex(t) => {
2459                        if t.mods.contains(ReMod::StartOffsetUpdate) {
2460                            let o = mr.start_offset();
2461                            haystack.seek(SeekFrom::Start(o))?;
2462                        } else {
2463                            let o = mr.end_offset();
2464                            haystack.seek(SeekFrom::Start(o))?;
2465                        }
2466                    }
2467                    // other types do not need offset adjustement
2468                    _ => {}
2469                }
2470            }
2471
2472            if let Some(mimetype) = self.mimetype.as_ref() {
2473                magic.set_mime_type(Cow::Borrowed(mimetype));
2474            }
2475
2476            if let Some(apple_ty) = self.apple.as_ref() {
2477                magic.set_creator_code(Cow::Borrowed(apple_ty));
2478            }
2479
2480            if !self.exts.is_empty() {
2481                magic.insert_extensions(self.exts.iter().map(|s| s.as_str()));
2482            }
2483
2484            // NOTE: here we try to implement a similar logic as in file_magic_strength.
2485            // Sticking to the exact same strength computation logic is complicated due
2486            // to implementation differences. Let's wait and see if that is a real issue.
2487            let mut strength = self.entry.test_strength;
2488
2489            let continuation_level = self.entry.continuation_level().0 as u64;
2490            if self.entry.message.is_none() && continuation_level < 3 {
2491                strength = strength.saturating_add(continuation_level);
2492            }
2493
2494            if let Some(sm) = self.strength_mod.as_ref() {
2495                strength = sm.apply(strength);
2496            }
2497
2498            // entries with no message get a bonus
2499            if self.entry.message.is_none() {
2500                strength += 1
2501            }
2502
2503            magic.update_strength(strength);
2504
2505            let end_upper_level = haystack.lazy_stream_position();
2506
2507            // we have to fix rule_base_offset if
2508            // the rule_base_starts from end otherwise it
2509            // breaks some offset computation in match
2510            // see test_offset_bug_1 and test_offset_bug_2
2511            // they implement the same test logic yet indirect
2512            // offsets have to be different so that it works
2513            // in libmagic/file
2514            let rule_base_offset = if self.root {
2515                match self.entry.offset {
2516                    Offset::Direct(DirOffset::End(o)) => {
2517                        Some(haystack.offset_from_start(SeekFrom::End(o)))
2518                    }
2519                    _ => rule_base_offset,
2520                }
2521            } else {
2522                rule_base_offset
2523            };
2524
2525            for e in self.children.iter() {
2526                e.matches(
2527                    opt_source,
2528                    magic,
2529                    state,
2530                    stream_kind,
2531                    buf_base_offset,
2532                    rule_base_offset,
2533                    Some(end_upper_level),
2534                    haystack,
2535                    db,
2536                    switch_endianness,
2537                    depth,
2538                )?
2539            }
2540        }
2541
2542        Ok(())
2543    }
2544}
2545
/// Represents a parsed magic rule
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicRule {
    /// Identifier of the rule, assigned through `set_id`.
    id: usize,
    /// Source of the rule (exposed by `source()`), if known.
    source: Option<String>,
    /// Root of the tree of entries making the rule.
    entries: EntryNode,
    /// Extensions associated to the rule; extended with the extensions
    /// found in the entries and dependencies when the rule is finalized.
    extensions: HashSet<String>,
    /// score used for rule ranking
    score: u64,
    /// Whether `try_finalize` already completed for this rule.
    finalized: bool,
}
2557
2558impl MagicRule {
2559    #[inline(always)]
2560    fn set_id(&mut self, id: usize) {
2561        self.id = id
2562    }
2563
2564    /// Fetches all the extensions defined in the magic rule. This
2565    /// function goes recursive and find extensions also defined in
2566    /// dependencies
2567    fn fetch_all_extensions(
2568        &self,
2569        deps: &HashMap<String, DependencyRule>,
2570        marked: &mut HashSet<String>,
2571    ) -> Result<HashSet<String>, ()> {
2572        let mut exts = HashSet::new();
2573        self.entries.update_exts_rec(&mut exts, deps, marked)?;
2574        Ok(exts)
2575    }
2576
2577    /// Computes the ranking score of a magic rule by walking
2578    /// tests recursively, dependencies included.
2579    fn compute_score(
2580        &self,
2581        depth: usize,
2582        deps: &HashMap<String, DependencyRule>,
2583        marked: &mut HashSet<String>,
2584    ) -> u64 {
2585        let mut score = 0;
2586        score += self.entries.entry.test_strength;
2587        self.entries
2588            .update_score_rec(depth, &mut score, deps, marked);
2589        score
2590    }
2591
2592    /// Finalize a rule by searching for all extensions and computing its score
2593    /// for ranking. In the `MagicRule` is already finalized it returns immediately.
2594    fn try_finalize(&mut self, deps: &HashMap<String, DependencyRule>) {
2595        if self.finalized {
2596            return;
2597        }
2598
2599        let Ok(exts) = self.fetch_all_extensions(deps, &mut HashSet::new()) else {
2600            return;
2601        };
2602
2603        self.extensions.extend(exts);
2604
2605        // fetch_all_extensions walks through all the dependencies
2606        // so there is no reason for compute_score to fail as it is walking
2607        // only some of them
2608        self.score = self.compute_score(0, deps, &mut HashSet::new());
2609        self.finalized = true
2610    }
2611
2612    #[inline]
2613    fn magic_entrypoint<'r, R: Read + Seek>(
2614        &'r self,
2615        magic: &mut Magic<'r>,
2616        stream_kind: StreamKind,
2617        haystack: &mut LazyCache<R>,
2618        db: &'r MagicDb,
2619        switch_endianness: bool,
2620        depth: usize,
2621    ) -> Result<(), Error> {
2622        self.entries.matches(
2623            self.source.as_deref(),
2624            magic,
2625            &mut MatchState::empty(),
2626            stream_kind,
2627            None,
2628            None,
2629            None,
2630            haystack,
2631            db,
2632            switch_endianness,
2633            depth,
2634        )
2635    }
2636
2637    #[inline]
2638    #[allow(clippy::too_many_arguments)]
2639    fn magic<'r, R: Read + Seek>(
2640        &'r self,
2641        magic: &mut Magic<'r>,
2642        stream_kind: StreamKind,
2643        buf_base_offset: Option<u64>,
2644        rule_base_offset: Option<u64>,
2645        haystack: &mut LazyCache<R>,
2646        db: &'r MagicDb,
2647        switch_endianness: bool,
2648        depth: usize,
2649    ) -> Result<(), Error> {
2650        self.entries.matches(
2651            self.source.as_deref(),
2652            magic,
2653            &mut MatchState::empty(),
2654            stream_kind,
2655            buf_base_offset,
2656            rule_base_offset,
2657            None,
2658            haystack,
2659            db,
2660            switch_endianness,
2661            depth,
2662        )
2663    }
2664
2665    /// Checks if the rule is for matching against text content
2666    ///
2667    /// # Returns
2668    ///
2669    /// * `bool` - True if the rule is for text files
2670    pub fn is_text(&self) -> bool {
2671        self.entries.entry.test.is_text()
2672            && self.entries.children.iter().all(|e| e.entry.test.is_text())
2673    }
2674
2675    /// Gets the rule's score used for ranking rules between them
2676    ///
2677    /// # Returns
2678    ///
2679    /// * `u64` - The rule's score
2680    #[inline(always)]
2681    pub fn score(&self) -> u64 {
2682        self.score
2683    }
2684
2685    /// Gets the rule's filename if any
2686    ///
2687    /// # Returns
2688    ///
2689    /// * `Option<&str>` - The rule's source if available
2690    #[inline(always)]
2691    pub fn source(&self) -> Option<&str> {
2692        self.source.as_deref()
2693    }
2694
2695    /// Gets the line number at which the rule is defined
2696    ///
2697    /// # Returns
2698    ///
2699    /// * `usize` - The rule's line number
2700    #[inline(always)]
2701    pub fn line(&self) -> usize {
2702        self.entries.entry.line
2703    }
2704
2705    /// Gets all the file extensions associated to the rule
2706    ///
2707    /// # Returns
2708    ///
2709    /// * `&HashSet<String>` - The set of all associated extensions
2710    #[inline(always)]
2711    pub fn extensions(&self) -> &HashSet<String> {
2712        &self.extensions
2713    }
2714}
2715
/// Associates a name with the [`MagicRule`] registered under it.
///
/// Values of this type populate the `dependencies` maps held by
/// [`MagicSource`] and [`MagicDb`].
#[derive(Debug, Clone, Serialize, Deserialize)]
struct DependencyRule {
    /// Name of the dependency
    name: String,
    /// Rule attached to that name
    rule: MagicRule,
}
2721
/// A parsed source of magic rules
///
/// A source is consumed by [`MagicDb::load`], which moves its rules and
/// dependencies into the database.
///
/// # Methods
///
/// * `open` - Opens a magic file from a path
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicSource {
    /// Rules parsed from the source
    rules: Vec<MagicRule>,
    /// Dependency rules parsed from the source, indexed by a `String` key
    // NOTE(review): the key presumably equals `DependencyRule::name` — confirm against the parser
    dependencies: HashMap<String, DependencyRule>,
}
2732
2733impl MagicSource {
2734    /// Opens and parses a magic file from a path
2735    ///
2736    /// # Arguments
2737    ///
2738    /// * `p` - The path to the magic file
2739    ///
2740    /// # Returns
2741    ///
2742    /// * `Result<Self, Error>` - The parsed magic file or an error
2743    pub fn open<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
2744        FileMagicParser::parse_file(p)
2745    }
2746}
2747
/// Newtype over the `u8` level of a continuation.
///
/// The wrapped value is used as an index into the table of flags kept
/// by `MatchState`.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct ContinuationLevel(u8);
2750
// FIXME: magic handles many more text encodings
/// Text encoding attributed to a stream classified as text.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum TextEncoding {
    /// Plain ASCII text
    Ascii,
    /// UTF-8 text
    Utf8,
    /// Text whose encoding is not identified
    Unknown,
}

impl TextEncoding {
    /// Returns the label of the encoding as it appears in magic messages.
    const fn as_magic_str(&self) -> &'static str {
        match *self {
            Self::Ascii => "ASCII",
            Self::Utf8 => "UTF-8",
            Self::Unknown => "Unknown",
        }
    }
}
2768
/// Coarse classification of the content being analyzed.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum StreamKind {
    /// Content is not considered text
    Binary,
    /// Content is considered text, with the given encoding
    Text(TextEncoding),
}
2774
2775impl StreamKind {
2776    const fn is_text(&self) -> bool {
2777        matches!(self, StreamKind::Text(_))
2778    }
2779}
2780
/// State mutated while a rule is being matched.
///
/// It holds one boolean flag per possible [`ContinuationLevel`] value.
#[derive(Debug)]
struct MatchState {
    /// Flag of each continuation level, indexed by the level's `u8` value
    continuation_levels: [bool; 256],
}
2785
2786impl MatchState {
2787    #[inline(always)]
2788    fn empty() -> Self {
2789        MatchState {
2790            continuation_levels: [false; 256],
2791        }
2792    }
2793
2794    #[inline(always)]
2795    fn get_continuation_level(&mut self, level: &ContinuationLevel) -> bool {
2796        self.continuation_levels
2797            .get(level.0 as usize)
2798            .cloned()
2799            .unwrap_or_default()
2800    }
2801
2802    #[inline(always)]
2803    fn set_continuation_level(&mut self, level: ContinuationLevel) {
2804        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2805            *b = true
2806        }
2807    }
2808
2809    #[inline(always)]
2810    fn clear_continuation_level(&mut self, level: &ContinuationLevel) {
2811        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2812            *b = false;
2813        }
2814    }
2815}
2816
/// Represents a file magic detection result
///
/// String data is held in [`Cow`] so a result can borrow from the
/// database for `'m` or own its data (see [`Magic::into_owned`]).
#[derive(Debug, Default)]
pub struct Magic<'m> {
    /// Kind of stream the detection was run against, if recorded
    stream_kind: Option<StreamKind>,
    /// Name of the source the matching rule was defined in, if any
    source: Option<Cow<'m, str>>,
    /// Message parts, in the order they were pushed
    message: Vec<Cow<'m, str>>,
    /// MIME type, if one was set
    mime_type: Option<Cow<'m, str>>,
    /// Apple creator code, if one was set
    creator_code: Option<Cow<'m, str>>,
    /// Accumulated strength of the detection
    strength: u64,
    /// File extensions associated with the detection
    exts: HashSet<Cow<'m, str>>,
    /// Set when the result is the default fallback detection
    is_default: bool,
}
2829
2830impl<'m> Magic<'m> {
2831    #[inline(always)]
2832    fn set_source(&mut self, source: Option<&'m str>) {
2833        self.source = source.map(Cow::Borrowed);
2834    }
2835
2836    #[inline(always)]
2837    fn set_stream_kind(&mut self, stream_kind: StreamKind) {
2838        self.stream_kind = Some(stream_kind)
2839    }
2840
2841    #[inline(always)]
2842    fn reset(&mut self) {
2843        self.stream_kind = None;
2844        self.source = None;
2845        self.message.clear();
2846        self.mime_type = None;
2847        self.creator_code = None;
2848        self.strength = 0;
2849        self.exts.clear();
2850        self.is_default = false;
2851    }
2852
2853    /// Converts borrowed data into owned data. This method involves
2854    /// data cloning, so you must use this method only if you need to
2855    /// extend the lifetime of a [`Magic`] struct.
2856    ///
2857    /// # Returns
2858    ///
2859    /// * `Magic<'owned>` - A new [`Magic`] with owned data
2860    #[inline]
2861    pub fn into_owned<'owned>(self) -> Magic<'owned> {
2862        Magic {
2863            stream_kind: self.stream_kind,
2864            source: self.source.map(|s| Cow::Owned(s.into_owned())),
2865            message: self
2866                .message
2867                .into_iter()
2868                .map(Cow::into_owned)
2869                .map(Cow::Owned)
2870                .collect(),
2871            mime_type: self.mime_type.map(|m| Cow::Owned(m.into_owned())),
2872            creator_code: self.creator_code.map(|m| Cow::Owned(m.into_owned())),
2873            strength: self.strength,
2874            exts: self
2875                .exts
2876                .into_iter()
2877                .map(|e| Cow::Owned(e.into_owned()))
2878                .collect(),
2879            is_default: self.is_default,
2880        }
2881    }
2882
2883    /// Gets the formatted message describing the file type
2884    ///
2885    /// # Returns
2886    ///
2887    /// * `String` - The formatted message
2888    #[inline(always)]
2889    pub fn message(&self) -> String {
2890        let mut out = String::new();
2891        for (i, m) in self.message.iter().enumerate() {
2892            if let Some(s) = m.strip_prefix(r#"\b"#) {
2893                out.push_str(s);
2894            } else {
2895                // don't put space on first string
2896                if i > 0 {
2897                    out.push(' ');
2898                }
2899                out.push_str(m);
2900            }
2901        }
2902        out
2903    }
2904
2905    /// Returns an iterator over the individual parts of the magic message
2906    ///
2907    /// A magic message is typically composed of multiple parts, each appended
2908    /// during successful magic tests. This method provides an efficient way to
2909    /// iterate over these parts without concatenating them into a new string,
2910    /// as done when calling [`Magic::message`].
2911    ///
2912    /// # Returns
2913    ///
2914    /// * `impl Iterator<Item = &str>` - An iterator yielding string slices of each message part
2915    #[inline]
2916    pub fn message_parts(&self) -> impl Iterator<Item = &str> {
2917        self.message.iter().map(|p| p.as_ref())
2918    }
2919
2920    #[inline(always)]
2921    fn update_strength(&mut self, value: u64) {
2922        self.strength = self.strength.saturating_add(value);
2923        debug!("updated strength = {:?}", self.strength)
2924    }
2925
2926    /// Gets the detected MIME type
2927    ///
2928    /// # Returns
2929    ///
2930    /// * `&str` - The MIME type or default based on stream kind
2931    #[inline(always)]
2932    pub fn mime_type(&self) -> &str {
2933        self.mime_type.as_deref().unwrap_or(match self.stream_kind {
2934            Some(StreamKind::Text(_)) => DEFAULT_TEXT_MIMETYPE,
2935            Some(StreamKind::Binary) | None => DEFAULT_BIN_MIMETYPE,
2936        })
2937    }
2938
2939    #[inline(always)]
2940    fn push_message<'a: 'm>(&mut self, msg: Cow<'a, str>) {
2941        if !msg.is_empty() {
2942            debug!("pushing message: msg={msg} len={}", msg.len());
2943            self.message.push(msg);
2944        }
2945    }
2946
2947    #[inline(always)]
2948    fn set_mime_type<'a: 'm>(&mut self, mime: Cow<'a, str>) {
2949        if self.mime_type.is_none() {
2950            debug!("insert mime: {:?}", mime);
2951            self.mime_type = Some(mime)
2952        }
2953    }
2954
2955    #[inline(always)]
2956    fn set_creator_code<'a: 'm>(&mut self, apple_ty: Cow<'a, str>) {
2957        if self.creator_code.is_none() {
2958            debug!("insert apple type: {apple_ty:?}");
2959            self.creator_code = Some(apple_ty)
2960        }
2961    }
2962
2963    #[inline(always)]
2964    fn insert_extensions<'a: 'm, I: Iterator<Item = &'a str>>(&mut self, exts: I) {
2965        if self.exts.is_empty() {
2966            self.exts.extend(exts.filter_map(|e| {
2967                if e.is_empty() {
2968                    None
2969                } else {
2970                    Some(Cow::Borrowed(e))
2971                }
2972            }));
2973        }
2974    }
2975
2976    /// Gets the confidence score of the detection. This
2977    /// value is used to sort [`Magic`] in [`MagicDb::best_magic`]
2978    /// and [`MagicDb::all_magics`].
2979    ///
2980    /// # Returns
2981    ///
2982    /// * `u64` - The confidence score attributed to that [`Magic`]
2983    #[inline(always)]
2984    pub fn strength(&self) -> u64 {
2985        self.strength
2986    }
2987
2988    /// Gets the filename where the magic rule was defined
2989    ///
2990    /// # Returns
2991    ///
2992    /// * `Option<&str>` - The source if available
2993    #[inline(always)]
2994    pub fn source(&self) -> Option<&str> {
2995        self.source.as_deref()
2996    }
2997
2998    /// Gets the Apple creator code if available
2999    ///
3000    /// # Returns
3001    ///
3002    /// * `Option<&str>` - The creator code if available
3003    #[inline(always)]
3004    pub fn creator_code(&self) -> Option<&str> {
3005        self.creator_code.as_deref()
3006    }
3007
3008    /// Gets the possible file extensions for the detected [`Magic`]
3009    ///
3010    /// # Returns
3011    ///
3012    /// * `&HashSet<Cow<'m, str>>` - The set of possible extensions
3013    #[inline(always)]
3014    pub fn extensions(&self) -> &HashSet<Cow<'m, str>> {
3015        &self.exts
3016    }
3017
3018    /// Checks if this is a default fallback detection
3019    ///
3020    /// # Returns
3021    ///
3022    /// * `bool` - True if this is a default detection
3023    #[inline(always)]
3024    pub fn is_default(&self) -> bool {
3025        self.is_default
3026    }
3027}
3028
/// Represents a database of [`MagicRule`]
///
/// A database is populated with [`MagicDb::load`] and can be persisted
/// with [`MagicDb::serialize`] / [`MagicDb::deserialize`].
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct MagicDb {
    /// Identifier handed to the next rule loaded in the database
    rule_id: usize,
    /// All the rules loaded so far, re-sorted after each load
    rules: Vec<MagicRule>,
    /// Dependency rules gathered from every loaded source
    dependencies: HashMap<String, DependencyRule>,
}
3036
/// Returns `true` if the byte stream is likely text.
///
/// Every byte is classified as either printable (tab, line feed, carriage
/// return and `0x20..=0x7E`) or non-printable (everything else: remaining
/// ASCII control bytes, `0x7F` and bytes above `0x7F`).
///
/// # Returns
///
/// * `bool` - `false` for an empty slice or as soon as a NUL byte is met,
///   otherwise `true` when more than 85% of the bytes are printable and
///   less than 20% are non-printable
#[inline(always)]
fn is_likely_text(bytes: &[u8]) -> bool {
    if bytes.is_empty() {
        return false;
    }

    // integer counters: exact, and converted to float only once at the end
    let mut printable = 0usize;
    let mut non_printable = 0usize;

    for &byte in bytes {
        match byte {
            // a NUL byte is a strong marker of binary content
            0x00 => return false,
            // whitespace control characters and printable ASCII
            0x09 | 0x0A | 0x0D | 0x20..=0x7E => printable += 1,
            _ => non_printable += 1,
        }
    }

    let total = bytes.len() as f64;
    let printable_ratio = printable as f64 / total;
    let non_printable_ratio = non_printable as f64 / total;

    // Heuristic thresholds (adjust as needed):
    printable_ratio > 0.85 && non_printable_ratio < 0.20
}
3079
3080#[inline(always)]
3081fn guess_stream_kind<S: AsRef<[u8]>>(stream: S) -> StreamKind {
3082    let buf = stream.as_ref();
3083
3084    match run_utf8_validation(buf) {
3085        Ok(is_ascii) => {
3086            if is_ascii {
3087                StreamKind::Text(TextEncoding::Ascii)
3088            } else {
3089                StreamKind::Text(TextEncoding::Utf8)
3090            }
3091        }
3092        Err(e) => {
3093            if is_likely_text(&buf[e.valid_up_to..]) {
3094                StreamKind::Text(TextEncoding::Unknown)
3095            } else {
3096                StreamKind::Binary
3097            }
3098        }
3099    }
3100}
3101
3102impl MagicDb {
3103    fn open_reader<R: Read + Seek>(f: R) -> Result<LazyCache<R>, Error> {
3104        Ok(LazyCache::<R>::from_read_seek(f)
3105            .and_then(|lc| lc.with_hot_cache(2 * FILE_BYTES_MAX))?)
3106        .map(|lc| lc.with_warm_cache(100 << 20))
3107    }
3108
3109    /// Creates a new empty database
3110    ///
3111    /// # Returns
3112    ///
3113    /// * [`MagicDb`] - A new empty database
3114    pub fn new() -> Self {
3115        Self::default()
3116    }
3117
3118    #[inline(always)]
3119    fn next_rule_id(&mut self) -> usize {
3120        let t = self.rule_id;
3121        self.rule_id += 1;
3122        t
3123    }
3124
3125    #[inline(always)]
3126    fn try_json<R: Read + Seek>(
3127        haystack: &mut LazyCache<R>,
3128        stream_kind: StreamKind,
3129        magic: &mut Magic,
3130    ) -> Result<bool, Error> {
3131        // cannot be json if content is binary
3132        if matches!(stream_kind, StreamKind::Binary) {
3133            return Ok(false);
3134        }
3135
3136        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?.trim_ascii();
3137
3138        let Some((start, end)) = find_json_boundaries(buf) else {
3139            return Ok(false);
3140        };
3141
3142        // if anything else than whitespace before start
3143        // this is not json
3144        for c in buf[0..start].iter() {
3145            if !c.is_ascii_whitespace() {
3146                return Ok(false);
3147            }
3148        }
3149
3150        let mut is_ndjson = false;
3151
3152        trace!("maybe a json document");
3153        let ok = serde_json::from_slice::<serde_json::Value>(&buf[start..=end]).is_ok();
3154        if !ok {
3155            return Ok(false);
3156        }
3157
3158        // we are sure it is json now we must look if we are ndjson
3159        if end + 1 < buf.len() {
3160            // after first json
3161            let buf = &buf[end + 1..];
3162            if let Some((second_start, second_end)) = find_json_boundaries(buf) {
3163                // there is a new line between the two json docs
3164                if memchr(b'\n', &buf[..second_start]).is_some() {
3165                    trace!("might be ndjson");
3166                    is_ndjson = serde_json::from_slice::<serde_json::Value>(
3167                        &buf[second_start..=second_end],
3168                    )
3169                    .is_ok();
3170                }
3171            }
3172        }
3173
3174        if is_ndjson {
3175            magic.push_message(Cow::Borrowed("New Line Delimited"));
3176            magic.set_mime_type(Cow::Borrowed("application/x-ndjson"));
3177            magic.insert_extensions(["ndjson", "jsonl"].into_iter());
3178        } else {
3179            magic.set_mime_type(Cow::Borrowed("application/json"));
3180            magic.insert_extensions(["json"].into_iter());
3181        }
3182
3183        magic.push_message(Cow::Borrowed("JSON text data"));
3184        magic.set_source(Some(HARDCODED_SOURCE));
3185        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3186        Ok(true)
3187    }
3188
3189    #[inline(always)]
3190    fn try_csv<R: Read + Seek>(
3191        haystack: &mut LazyCache<R>,
3192        stream_kind: StreamKind,
3193        magic: &mut Magic,
3194    ) -> Result<bool, Error> {
3195        // cannot be csv if content is binary
3196        let StreamKind::Text(enc) = stream_kind else {
3197            return Ok(false);
3198        };
3199
3200        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3201        let mut reader = csv::Reader::from_reader(io::Cursor::new(buf));
3202        let mut records = reader.records();
3203
3204        let Some(Ok(first)) = records.next() else {
3205            return Ok(false);
3206        };
3207
3208        // very not likely a CSV otherwise all programming
3209        // languages having ; line terminator would be
3210        // considered as CSV
3211        if first.len() <= 1 {
3212            return Ok(false);
3213        }
3214
3215        // we already parsed first line
3216        let mut n = 1;
3217        for i in records.take(9) {
3218            if let Ok(rec) = i {
3219                if first.len() != rec.len() {
3220                    return Ok(false);
3221                }
3222            } else {
3223                return Ok(false);
3224            }
3225            n += 1;
3226        }
3227
3228        // we need at least 10 lines
3229        if n != 10 {
3230            return Ok(false);
3231        }
3232
3233        magic.set_mime_type(Cow::Borrowed("text/csv"));
3234        magic.push_message(Cow::Borrowed("CSV"));
3235        magic.push_message(Cow::Borrowed(enc.as_magic_str()));
3236        magic.push_message(Cow::Borrowed("text"));
3237        magic.insert_extensions(["csv"].into_iter());
3238        magic.set_source(Some(HARDCODED_SOURCE));
3239        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3240        Ok(true)
3241    }
3242
3243    #[inline(always)]
3244    fn try_tar<R: Read + Seek>(
3245        haystack: &mut LazyCache<R>,
3246        stream_kind: StreamKind,
3247        magic: &mut Magic,
3248    ) -> Result<bool, Error> {
3249        // cannot be json if content is not binary
3250        if !matches!(stream_kind, StreamKind::Binary) {
3251            return Ok(false);
3252        }
3253
3254        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3255        let mut ar = Archive::new(io::Cursor::new(buf));
3256
3257        let Ok(mut entries) = ar.entries() else {
3258            return Ok(false);
3259        };
3260
3261        let Some(Ok(first)) = entries.next() else {
3262            return Ok(false);
3263        };
3264
3265        let header = first.header();
3266
3267        if header.as_ustar().is_some() {
3268            magic.push_message(Cow::Borrowed("POSIX tar archive"));
3269        } else if header.as_gnu().is_some() {
3270            magic.push_message(Cow::Borrowed("POSIX tar archive (GNU)"));
3271        } else {
3272            magic.push_message(Cow::Borrowed("tar archive"));
3273        }
3274
3275        magic.set_mime_type(Cow::Borrowed("application/x-tar"));
3276        magic.set_source(Some(HARDCODED_SOURCE));
3277        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3278        magic.insert_extensions(["tar"].into_iter());
3279        Ok(true)
3280    }
3281
3282    #[inline(always)]
3283    fn try_hard_magic<R: Read + Seek>(
3284        haystack: &mut LazyCache<R>,
3285        stream_kind: StreamKind,
3286        magic: &mut Magic,
3287    ) -> Result<bool, Error> {
3288        Ok(Self::try_json(haystack, stream_kind, magic)?
3289            || Self::try_csv(haystack, stream_kind, magic)?
3290            || Self::try_tar(haystack, stream_kind, magic)?)
3291    }
3292
3293    #[inline(always)]
3294    fn magic_default<'m, R: Read + Seek>(
3295        haystack: &mut LazyCache<R>,
3296        stream_kind: StreamKind,
3297        magic: &mut Magic<'m>,
3298    ) -> Result<(), Error> {
3299        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3300
3301        magic.set_source(Some(HARDCODED_SOURCE));
3302        magic.set_stream_kind(stream_kind);
3303        magic.is_default = true;
3304
3305        if buf.is_empty() {
3306            magic.push_message(Cow::Borrowed("empty"));
3307            magic.set_mime_type(Cow::Borrowed(DEFAULT_BIN_MIMETYPE));
3308            return Ok(());
3309        }
3310
3311        match stream_kind {
3312            StreamKind::Binary => {
3313                magic.push_message(Cow::Borrowed("data"));
3314            }
3315            StreamKind::Text(e) => {
3316                magic.push_message(Cow::Borrowed(e.as_magic_str()));
3317                magic.push_message(Cow::Borrowed("text"));
3318            }
3319        }
3320
3321        Ok(())
3322    }
3323
3324    /// Loads rules from a [`MagicSource`]
3325    ///
3326    /// # Arguments
3327    ///
3328    /// * `mf` - The [`MagicSource`] to load rules from
3329    ///
3330    /// # Returns
3331    ///
3332    /// * `Result<&mut Self, Error>` - Self for chaining or an error
3333    pub fn load(&mut self, mf: MagicSource) -> Result<&mut Self, Error> {
3334        for rule in mf.rules.into_iter() {
3335            let mut rule = rule;
3336            rule.set_id(self.next_rule_id());
3337
3338            self.rules.push(rule);
3339        }
3340
3341        self.dependencies.extend(mf.dependencies);
3342        self.prepare();
3343        Ok(self)
3344    }
3345
3346    /// Gets all rules in the database
3347    ///
3348    /// # Returns
3349    ///
3350    /// * `&[MagicRule]` - A slice of all rules
3351    pub fn rules(&self) -> &[MagicRule] {
3352        &self.rules
3353    }
3354
3355    #[inline]
3356    fn first_magic_with_stream_kind<R: Read + Seek>(
3357        &self,
3358        haystack: &mut LazyCache<R>,
3359        stream_kind: StreamKind,
3360        extension: Option<&str>,
3361    ) -> Result<Magic<'_>, Error> {
3362        // re-using magic makes this function faster
3363        let mut magic = Magic::default();
3364
3365        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3366            return Ok(magic);
3367        }
3368
3369        let mut marked = vec![false; self.rules.len()];
3370
3371        macro_rules! do_magic {
3372            ($rule: expr) => {{
3373                $rule.magic_entrypoint(&mut magic, stream_kind, haystack, &self, false, 0)?;
3374
3375                if !magic.message.is_empty() {
3376                    magic.set_stream_kind(stream_kind);
3377                    magic.set_source($rule.source.as_deref());
3378                    return Ok(magic);
3379                }
3380
3381                magic.reset();
3382            }};
3383        }
3384
3385        if let Some(ext) = extension.map(|e| e.to_lowercase())
3386            && !ext.is_empty()
3387        {
3388            for rule in self.rules.iter().filter(|r| r.extensions.contains(&ext)) {
3389                do_magic!(rule);
3390                if let Some(f) = marked.get_mut(rule.id) {
3391                    *f = true
3392                }
3393            }
3394        }
3395
3396        for rule in self
3397            .rules
3398            .iter()
3399            // we don't run again rules run by extension
3400            .filter(|r| !*marked.get(r.id).unwrap_or(&false))
3401        {
3402            do_magic!(rule)
3403        }
3404
3405        Self::magic_default(haystack, stream_kind, &mut magic)?;
3406
3407        Ok(magic)
3408    }
3409
3410    /// Detects file [`Magic`] stopping at the first matching magic. Magic
3411    /// rules are evaluated from the best to the least relevant, so this method
3412    /// returns most of the time the best magic. For the rare cases where
3413    /// it doesn't or if the best result is always required, use [`MagicDb::best_magic`]
3414    ///
3415    /// # Arguments
3416    ///
3417    /// * `r` - A readable and seekable input
3418    /// * `extension` - Optional file extension to use for acceleration
3419    ///
3420    /// # Returns
3421    ///
3422    /// * `Result<Magic<'_>, Error>` - The detection result or an error
3423    pub fn first_magic<R: Read + Seek>(
3424        &self,
3425        r: &mut R,
3426        extension: Option<&str>,
3427    ) -> Result<Magic<'_>, Error> {
3428        let mut haystack = Self::open_reader(r)?;
3429        let stream_kind = guess_stream_kind(haystack.read_range(0..FILE_BYTES_MAX as u64)?);
3430        self.first_magic_with_stream_kind(&mut haystack, stream_kind, extension)
3431    }
3432
3433    #[inline(always)]
3434    fn all_magics_sort_with_stream_kind<R: Read + Seek>(
3435        &self,
3436        haystack: &mut LazyCache<R>,
3437        stream_kind: StreamKind,
3438    ) -> Result<Vec<Magic<'_>>, Error> {
3439        let mut out = Vec::new();
3440
3441        let mut magic = Magic::default();
3442
3443        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3444            out.push(magic);
3445            magic = Magic::default();
3446        }
3447
3448        for rule in self.rules.iter() {
3449            rule.magic_entrypoint(&mut magic, stream_kind, haystack, self, false, 0)?;
3450
3451            // it is possible we have a strength with no message
3452            if !magic.message.is_empty() {
3453                magic.set_stream_kind(stream_kind);
3454                magic.set_source(rule.source.as_deref());
3455                out.push(magic);
3456                magic = Magic::default();
3457            }
3458
3459            magic.reset();
3460        }
3461
3462        Self::magic_default(haystack, stream_kind, &mut magic)?;
3463        out.push(magic);
3464
3465        out.sort_by_key(|b| std::cmp::Reverse(b.strength()));
3466
3467        Ok(out)
3468    }
3469
3470    /// Detects all [`Magic`] matching a given content.
3471    ///
3472    /// # Arguments
3473    ///
3474    /// * `r` - A readable and seekable input
3475    ///
3476    /// # Returns
3477    ///
3478    /// * `Result<Vec<Magic<'_>>, Error>` - All detection results sorted by strength or an error
3479    pub fn all_magics<R: Read + Seek>(&self, r: &mut R) -> Result<Vec<Magic<'_>>, Error> {
3480        let mut haystack = Self::open_reader(r)?;
3481        let stream_kind = guess_stream_kind(haystack.read_range(0..FILE_BYTES_MAX as u64)?);
3482        self.all_magics_sort_with_stream_kind(&mut haystack, stream_kind)
3483    }
3484
3485    #[inline(always)]
3486    fn best_magic_with_stream_kind<R: Read + Seek>(
3487        &self,
3488        haystack: &mut LazyCache<R>,
3489        stream_kind: StreamKind,
3490    ) -> Result<Magic<'_>, Error> {
3491        let magics = self.all_magics_sort_with_stream_kind(haystack, stream_kind)?;
3492
3493        // magics is guaranteed to contain at least the default magic
3494        return Ok(magics
3495            .into_iter()
3496            .next()
3497            .expect("magics must at least contain default"));
3498    }
3499
3500    /// Detects the best [`Magic`] matching a given content.
3501    ///
3502    /// # Arguments
3503    ///
3504    /// * `r` - A readable and seekable input
3505    ///
3506    /// # Returns
3507    ///
3508    /// * `Result<Magic<'_>, Error>` - The best detection result or an error
3509    pub fn best_magic<R: Read + Seek>(&self, r: &mut R) -> Result<Magic<'_>, Error> {
3510        let mut haystack = Self::open_reader(r)?;
3511        let stream_kind = guess_stream_kind(haystack.read_range(0..FILE_BYTES_MAX as u64)?);
3512        self.best_magic_with_stream_kind(&mut haystack, stream_kind)
3513    }
3514
3515    /// Serializes the database to a generic writer implementing [`io::Write`]
3516    ///
3517    /// # Returns
3518    ///
3519    /// * `Result<(), Error>` - The serialized database or an error
3520    pub fn serialize<W: Write>(self, w: &mut W) -> Result<(), Error> {
3521        let mut encoder = GzEncoder::new(w, Compression::best());
3522
3523        bincode::serde::encode_into_std_write(&self, &mut encoder, bincode::config::standard())?;
3524        encoder.finish()?;
3525        Ok(())
3526    }
3527
3528    /// Deserializes the database from a generic reader implementing [`io::Read`]
3529    ///
3530    /// # Arguments
3531    ///
3532    /// * `r` - The reader to deserialize from
3533    ///
3534    /// # Returns
3535    ///
3536    /// * `Result<Self, Error>` - The deserialized database or an error
3537    pub fn deserialize<R: Read>(r: &mut R) -> Result<Self, Error> {
3538        let mut buf = vec![];
3539        let mut gz = GzDecoder::new(r);
3540        gz.read_to_end(&mut buf).map_err(|e| {
3541            bincode::error::DecodeError::OtherString(format!("failed to read: {e}"))
3542        })?;
3543        let (sdb, _): (MagicDb, usize) =
3544            bincode::serde::decode_from_slice(&buf, bincode::config::standard())?;
3545        Ok(sdb)
3546    }
3547
3548    #[inline(always)]
3549    fn prepare(&mut self) {
3550        self.rules
3551            .iter_mut()
3552            .for_each(|r| r.try_finalize(&self.dependencies));
3553
3554        // put text rules at the end
3555        self.rules.sort_by_key(|r| (r.is_text(), -(r.score as i64)));
3556    }
3557}
3558
3559#[cfg(test)]
3560mod tests {
3561    use std::io::Cursor;
3562
3563    use regex::bytes::Regex;
3564
3565    use crate::utils::unix_local_time_to_string;
3566
3567    use super::*;
3568
3569    macro_rules! lazy_cache {
3570        ($l: literal) => {
3571            LazyCache::from_read_seek(Cursor::new($l)).unwrap()
3572        };
3573    }
3574
3575    fn first_magic(
3576        rule: &str,
3577        content: &[u8],
3578        stream_kind: StreamKind,
3579    ) -> Result<Magic<'static>, Error> {
3580        let mut md = MagicDb::new();
3581        md.load(
3582            FileMagicParser::parse_str(rule, None)
3583                .inspect_err(|e| eprintln!("{e}"))
3584                .unwrap(),
3585        )
3586        .unwrap();
3587        let mut reader = LazyCache::from_read_seek(Cursor::new(content)).unwrap();
3588        let v = md.best_magic_with_stream_kind(&mut reader, stream_kind)?;
3589        Ok(v.into_owned())
3590    }
3591
3592    /// helper macro to debug tests
3593    #[allow(unused_macros)]
3594    macro_rules! enable_trace {
3595        () => {
3596            tracing_subscriber::fmt()
3597                .with_max_level(tracing_subscriber::filter::LevelFilter::TRACE)
3598                .try_init();
3599        };
3600    }
3601
3602    macro_rules! parse_assert {
3603        ($rule:literal) => {
3604            FileMagicParser::parse_str($rule, None)
3605                .inspect_err(|e| eprintln!("{e}"))
3606                .unwrap();
3607        };
3608    }
3609
3610    macro_rules! assert_magic_match_bin {
3611        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Binary).unwrap() }};
3612        ($rule: literal, $content:literal, $message:expr) => {{
3613            assert_eq!(
3614                first_magic($rule, $content, StreamKind::Binary)
3615                    .unwrap()
3616                    .message(),
3617                $message
3618            );
3619        }};
3620    }
3621
3622    macro_rules! assert_magic_match_text {
3623        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8)).unwrap() }};
3624        ($rule: literal, $content:literal, $message:expr) => {{
3625            assert_eq!(
3626                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
3627                    .unwrap()
3628                    .message(),
3629                $message
3630            );
3631        }};
3632    }
3633
3634    macro_rules! assert_magic_not_match_text {
3635        ($rule: literal, $content:literal) => {{
3636            assert!(
3637                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
3638                    .unwrap()
3639                    .is_default()
3640            );
3641        }};
3642    }
3643
3644    macro_rules! assert_magic_not_match_bin {
3645        ($rule: literal, $content:literal) => {{
3646            assert!(
3647                first_magic($rule, $content, StreamKind::Binary)
3648                    .unwrap()
3649                    .is_default()
3650            );
3651        }};
3652    }
3653
    // Exercises `regex` rules on both text and binary streams: capture
    // formatting through `%s`, raw byte patterns, continuation offsets after
    // a match and line anchoring.
    #[test]
    fn test_regex() {
        // a shebang rule followed by a relative regex whose match is
        // substituted into the message
        assert_magic_match_text!(
            r#"
0	regex/1024 \^#![[:space:]]*/usr/bin/env[[:space:]]+
!:mime	text/x-shellscript
>&0  regex/64 .*($|\\b) %s shell script text executable
    "#,
            br#"#!/usr/bin/env bash
        echo hello world"#,
            // the magic generated
            "bash shell script text executable"
        );

        // sanity check: with unicode mode disabled the regex engine itself
        // matches raw (non UTF-8) bytes
        let re = Regex::new(r"(?-u)\x42\x82").unwrap();
        assert!(re.is_match(b"\x42\x82"));

        // same byte pattern through a magic rule, the match being located
        // after a few leading NUL bytes
        assert_magic_match_bin!(
            r#"0 regex \x42\x82 binary regex match"#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82"
        );

        // test regex continuation after match
        assert_magic_match_bin!(
            r#"
            0 regex \x42\x82
            >&0 string \xde\xad\xbe\xef it works
            "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        // with the /s flag the child rule has to re-match the regex bytes
        // themselves, i.e. `&0` points at the beginning of the match
        assert_magic_match_bin!(
            r#"
            0 regex/s \x42\x82
            >&0 string \x42\x82\xde\xad\xbe\xef it works
            "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        // ^ must match start of line when matching text
        assert_magic_match_text!(
            r#"
0	regex/1024 \^HelloWorld$ HelloWorld String"#,
            br#"
// this is a comment after an empty line
HelloWorld
            "#
        );
    }
3703
    // Exercises the `string` test flags /w, /C, /c, /f and /W on text
    // streams, pairing each positive case with a negative one when available.
    #[test]
    fn test_string_with_mods() {
        // /w: the pattern carries three escaped blanks after `#!` while the
        // content only has one
        assert_magic_match_text!(
            r#"0	string/w	#!\ \ \ /usr/bin/env\ bash	BASH
        "#,
            b"#! /usr/bin/env bash i
        echo hello world"
        );

        // test uppercase insensitive
        // (the all-lowercase content matches, the case-swapped one does not)
        assert_magic_match_text!(
            r#"0	string/C	HelloWorld	it works
        "#,
            b"helloworld"
        );

        assert_magic_not_match_text!(
            r#"0	string/C	HelloWorld	it works
        "#,
            b"hELLOwORLD"
        );

        // test lowercase insensitive
        // (the all-uppercase content matches, the all-lowercase one does not)
        assert_magic_match_text!(
            r#"0	string/c	HelloWorld	it works
        "#,
            b"HELLOWORLD"
        );

        assert_magic_not_match_text!(
            r#"0	string/c	HelloWorld	it works
        "#,
            b"helloworld"
        );

        // test full word match
        assert_magic_match_text!(
            r#"0	string/f	#!/usr/bin/env\ bash	BASH
        "#,
            b"#!/usr/bin/env bash"
        );

        // `python` is only a prefix of `pythonic`, so /f must reject it
        assert_magic_not_match_text!(
            r#"0	string/f	#!/usr/bin/python PYTHON"#,
            b"#!/usr/bin/pythonic"
        );

        // testing whitespace compacting
        assert_magic_match_text!(
            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
            b"#!/usr/bin/env    python"
        );

        // two blanks in the pattern against a single blank in the content
        // must not match
        assert_magic_not_match_text!(
            r#"0	string/W	#!/usr/bin/env\ \ python  PYTHON"#,
            b"#!/usr/bin/env python"
        );
    }
3762
    // Exercises the `search` test with a range and flags, and the effect of
    // the /s flag on the offset seen by a continuation rule.
    #[test]
    fn test_search_with_mods() {
        // range of 1 combined with the f, w and t flags; the content has many
        // blanks after `#!` where the pattern has a single escaped one
        assert_magic_match_text!(
            r#"0	search/1/fwt	#!\ /usr/bin/luatex	LuaTex script text executable"#,
            b"#!          /usr/bin/luatex "
        );

        // test matching from the beginning
        // (with /s the child at `&0` can match the searched string again)
        assert_magic_match_text!(
            r#"
            0	search/s	/usr/bin/env
            >&0 string /usr/bin/env it works
            "#,
            b"#!/usr/bin/env    python"
        );

        // without /s the same child rule must not match
        assert_magic_not_match_text!(
            r#"
            0	search	/usr/bin/env
            >&0 string /usr/bin/env it works
            "#,
            b"#!/usr/bin/env    python"
        );
    }
3787
    // Exercises `pstring` (length-prefixed string) with the default one-byte
    // prefix and with the H/h/L/l/J modifiers.
    #[test]
    fn test_pstring() {
        // default: a single length byte precedes the string
        assert_magic_match_bin!(r#"0 pstring Toast it works"#, b"\x05Toast");

        assert_magic_match_bin!(r#"0 pstring Toast %s"#, b"\x05Toast", "Toast");

        // the prefixed length (7) covers `Toaster`, which is not `Toast`
        assert_magic_not_match_bin!(r#"0 pstring Toast Doesn't work"#, b"\x07Toaster");

        // testing with modifiers
        // H/h: two-byte length, fixtures in big/little-endian byte order
        // L/l: four-byte length, fixtures in big/little-endian byte order
        // J: the stored length also counts the prefix bytes (2 + 5, 4 + 5)
        assert_magic_match_bin!(r#"0 pstring/H Toast it works"#, b"\x00\x05Toast");

        assert_magic_match_bin!(r#"0 pstring/HJ Toast it works"#, b"\x00\x07Toast");

        assert_magic_match_bin!(r#"0 pstring/HJ Toast %s"#, b"\x00\x07Toast", "Toast");

        assert_magic_match_bin!(r#"0 pstring/h Toast it works"#, b"\x05\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/hJ Toast it works"#, b"\x07\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/L Toast it works"#, b"\x00\x00\x00\x05Toast");

        assert_magic_match_bin!(r#"0 pstring/LJ Toast it works"#, b"\x00\x00\x00\x09Toast");

        assert_magic_match_bin!(r#"0 pstring/l Toast it works"#, b"\x05\x00\x00\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/lJ Toast it works"#, b"\x09\x00\x00\x00Toast");
    }
3815
3816    #[test]
3817    fn test_max_recursion() {
3818        let res = first_magic(
3819            r#"0	indirect x"#,
3820            b"#!          /usr/bin/luatex ",
3821            StreamKind::Binary,
3822        );
3823        assert!(res.is_err());
3824        let _ = res.inspect_err(|e| {
3825            assert!(matches!(
3826                e.unwrap_localized(),
3827                Error::MaximumRecursion(MAX_RECURSION)
3828            ))
3829        });
3830    }
3831
    // Exercises the comparison operators accepted in front of a `string`
    // value: plain equality (with /b), `!`, `>` and `<`.
    #[test]
    fn test_string_ops() {
        assert_magic_match_text!("0	string/b MZ MZ File", b"MZ\0");
        // `!`: content differs from the value
        assert_magic_match_text!("0	string !MZ Not MZ File", b"AZ\0");
        // `>`: content compares greater than the value
        assert_magic_match_text!("0	string >\0 Any String", b"A\0");
        assert_magic_match_text!("0	string >Test Any String", b"Test 1\0");
        // `<`: a lone NUL compares lower than `Test` ...
        assert_magic_match_text!("0	string <Test Any String", b"\0");
        // ... and therefore is not greater
        assert_magic_not_match_text!("0	string >Test Any String", b"\0");
    }
3841
    // Exercises `lestring16`: two-byte code units, low byte first.
    #[test]
    fn test_lestring16() {
        assert_magic_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        // `x` matches anything and `%s` prints the decoded string; the
        // fixture ends with one extra NUL byte
        assert_magic_match_bin!(
            "0 lestring16 x %s",
            b"\x61\x00\x62\x00\x63\x00\x64\x00\x00",
            "abcd"
        );
        // same text with the bytes of each unit swapped must not match
        assert_magic_not_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        // non-zero offset
        assert_magic_match_bin!(
            "4 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64\x00"
        );
    }
3862
    // Exercises `bestring16`: two-byte code units, high byte first.
    #[test]
    fn test_bestring16() {
        assert_magic_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        // `x` matches anything and `%s` prints the decoded string
        assert_magic_match_bin!(
            "0 bestring16 x %s",
            b"\x00\x61\x00\x62\x00\x63\x00\x64",
            "abcd"
        );
        // same text with the bytes of each unit swapped must not match
        assert_magic_not_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        // non-zero offset
        assert_magic_match_bin!(
            "4 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64"
        );
    }
3883
    // Negative offsets are counted from the end of the content: -1 is the
    // last byte and -2 the one before it.
    #[test]
    fn test_offset_from_end() {
        assert_magic_match_bin!("-1 ubyte 0x42 last byte ok", b"\x00\x00\x42");
        // the message text is reused as is, the tested byte being the
        // second to last one
        assert_magic_match_bin!("-2 ubyte 0x41 last byte ok", b"\x00\x41\x00");
    }
3889
    // Chained relative offsets (`&0`): each level is tested right after the
    // byte matched by its parent, walking bytes 0, 1 and 2 of the content.
    #[test]
    fn test_relative_offset() {
        assert_magic_match_bin!(
            "
            0 ubyte 0x42
            >&0 ubyte 0x00
            >>&0 ubyte 0x41 third byte ok
            ",
            b"\x42\x00\x41\x00"
        );
    }
3901
    // Indirect offsets: the offset to test at is read from the content
    // itself (`(0.l)` reads a four-byte value at offset 0, the fixtures
    // storing it low byte first).
    #[test]
    fn test_indirect_offset() {
        // value 4 read at offset 0 -> test byte 4
        assert_magic_match_bin!("(0.l) ubyte 0x42 it works", b"\x04\x00\x00\x00\x42");
        // adding fixed value to offset
        assert_magic_match_bin!("(0.l+3) ubyte 0x42 it works", b"\x01\x00\x00\x00\x42");
        // testing offset pair
        // (4 read at offset 0, plus 4 read at offset 4 -> test byte 8)
        assert_magic_match_bin!(
            "(0.l+(4)) ubyte 0x42 it works",
            b"\x04\x00\x00\x00\x04\x00\x00\x00\x42"
        );
    }
3913
    // A `use` rule carrying its own message: the expected output is the
    // message of the `use` line followed by the message of the `name` rule
    // it refers to.
    #[test]
    fn test_use_with_message() {
        assert_magic_match_bin!(
            r#"
0 string MZ
>0 use mz first match

0 name mz then second match
>0 string MZ
"#,
            b"MZ\0",
            "first match then second match"
        );
    }
3928
    // Arithmetic / bitwise transforms attached to a scalar type
    // (`ubyte+1`, `ubyte&0x0f`, ...): the transform is applied to the value
    // read before it is compared with the test value.
    #[test]
    fn test_scalar_transform() {
        assert_magic_match_bin!("0 ubyte+1 0x1 add works", b"\x00");
        assert_magic_match_bin!("0 ubyte-1 0xfe sub works", b"\xff");
        assert_magic_match_bin!("0 ubyte%2 0 mod works", b"\x0a");
        assert_magic_match_bin!("0 ubyte&0x0f 0x0f bitand works", b"\xff");
        assert_magic_match_bin!("0 ubyte|0x0f 0xff bitor works", b"\xf0");
        assert_magic_match_bin!("0 ubyte^0x0f 0xf0 bitxor works", b"\xff");

        // a zero divisor / modulus must be rejected when parsing the rule
        FileMagicParser::parse_str("0 ubyte%0 mod by zero", None)
            .expect_err("expect div by zero error");
        FileMagicParser::parse_str("0 ubyte/0 div by zero", None)
            .expect_err("expect div by zero error");
    }
3943
    // Exercises `belong` (four-byte big-endian integer) with every
    // comparison operator: =, <, >, &, ^, ~ and x.
    #[test]
    fn test_belong() {
        // Test that a file with a four-byte value at offset 0 that matches the given value in big-endian byte order
        assert_magic_match_bin!("0 belong 0x12345678 Big-endian long", b"\x12\x34\x56\x78");
        // Test that a file with a four-byte value at offset 0 that does not match the given value in big-endian byte order
        assert_magic_not_match_bin!("0 belong 0x12345678 Big-endian long", b"\x78\x56\x34\x12");
        // Test that a file with a four-byte value at a non-zero offset that matches the given value in big-endian byte order
        assert_magic_match_bin!(
            "4 belong 0x12345678 Big-endian long",
            b"\x00\x00\x00\x00\x12\x34\x56\x78"
        );
        // Test < operator
        assert_magic_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x77");
        assert_magic_not_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test > operator
        assert_magic_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x79");
        assert_magic_not_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test & operator
        // (every bit of the mask has to be set in the value read)
        assert_magic_match_bin!("0 belong &0x5678 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong &0x0000FFFF Big-endian long", b"\x12\x34\x56\x78");

        // Test ^ operator (bitwise AND with complement)
        // (no bit of the mask may be set in the value read)
        assert_magic_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x01\x56\x78");

        // Test ~ operator
        // (the value read is the bitwise complement of the operand)
        assert_magic_match_bin!("0 belong ~0x12345678 Big-endian long", b"\xed\xcb\xa9\x87");
        assert_magic_not_match_bin!("0 belong ~0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test x operator
        // (matches whatever the value is)
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x12\x34\x56\x78");
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x78\x56\x34\x12");
    }
3979
    // Parsing of `search` rules: without modifiers, and with the range and
    // flag modifiers given in either order.
    #[test]
    fn test_parse_search() {
        parse_assert!("0 search test");
        parse_assert!("0 search/24/s test");
        parse_assert!("0 search/s/24 test");
    }
3986
    // Exercises `bedate`: four-byte big-endian Unix timestamp.
    // 946684800 is 0x386D4380, i.e. 2000-01-01 00:00:00.
    #[test]
    fn test_bedate() {
        assert_magic_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        // non-zero offset, the date being formatted into the message
        assert_magic_match_bin!(
            "4 bedate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
    // Exercises `beldate`: same encoding as `bedate`, the expected message
    // being built with `unix_local_time_to_string`.
    #[test]
    fn test_beldate() {
        assert_magic_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        // the expected string depends on the time zone of the host, hence
        // it is computed rather than hard-coded
        assert_magic_match_bin!(
            "4 beldate 946684800 {}",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            unix_local_time_to_string(946684800)
        );
    }
4020
    // Exercises `beqdate`: eight-byte big-endian Unix timestamp.
    #[test]
    fn test_beqdate() {
        assert_magic_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80"
        );

        assert_magic_not_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        // the date is formatted into the message
        assert_magic_match_bin!(
            "0 beqdate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
4039
    // Exercises `medate`: four-byte middle-endian Unix timestamp.
    // 0x386D4380 is laid out as 6D 38 80 43 in the fixtures.
    #[test]
    fn test_medate() {
        assert_magic_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );

        assert_magic_not_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        // non-zero offset, the date being formatted into the message
        assert_magic_match_bin!(
            "4 medate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            "2000-01-01 00:00:00"
        );
    }
4058
    // Exercises `meldate`: same encoding as `medate`, the expected message
    // being built with `unix_local_time_to_string`.
    #[test]
    fn test_meldate() {
        assert_magic_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );
        assert_magic_not_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        // the expected string depends on the time zone of the host, hence
        // it is computed rather than hard-coded
        assert_magic_match_bin!(
            "4 meldate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            unix_local_time_to_string(946684800)
        );
    }
4076
    // Exercises `date`: four-byte Unix timestamp without explicit endianness.
    // NOTE(review): the fixtures store 0x386D4380 low byte first, so this
    // presumably only holds on little-endian hosts — confirm.
    #[test]
    fn test_date() {
        // the rule message says "Local date" but the formatted case below
        // expects a fixed string, not a host dependent one
        assert_magic_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 date 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            "2000-01-01 00:00:00"
        );
    }
4093
    // Exercises `leldate`: four-byte little-endian timestamp, the expected
    // message being built with `unix_local_time_to_string`.
    #[test]
    fn test_leldate() {
        assert_magic_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        // the expected string depends on the time zone of the host, hence
        // it is computed rather than hard-coded
        assert_magic_match_bin!(
            "4 leldate 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            unix_local_time_to_string(946684800)
        );
    }
4110
    // Exercises `leqdate`: eight-byte little-endian Unix timestamp.
    // 1577836800 is 0x5E0BE100, i.e. 2020-01-01 00:00:00.
    #[test]
    fn test_leqdate() {
        assert_magic_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
        );

        assert_magic_not_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        // non-zero offset, the date being formatted into the message
        assert_magic_match_bin!(
            "8 leqdate 1577836800 %s",
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
            "2020-01-01 00:00:00"
        );
    }
4128
    // Exercises `leqldate`: same encoding as `leqdate`, the expected message
    // being built with `unix_local_time_to_string`.
    #[test]
    fn test_leqldate() {
        assert_magic_match_bin!(
            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
        );

        assert_magic_not_match_bin!(
            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        // the expected string depends on the time zone of the host, hence
        // it is computed rather than hard-coded
        assert_magic_match_bin!(
            "8 leqldate 1577836800 %s",
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
            unix_local_time_to_string(1577836800)
        );
    }
4146
    // Exercises `melong` (four-byte middle-endian integer) with every
    // comparison operator. As shown by the `=` case, bytes b0 b1 b2 b3 decode
    // to the value b1 b0 b3 b2, so 0x12345678 is stored as 34 12 78 56.
    #[test]
    fn test_melong() {
        // Test = operator
        assert_magic_match_bin!(
            "0 melong =0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        );
        assert_magic_not_match_bin!(
            "0 melong =0x12345678 Middle-endian long",
            b"\x00\x00\x00\x00"
        );

        // Test < operator
        assert_magic_match_bin!(
            "0 melong <0x12345678 Middle-endian long",
            b"\x34\x12\x78\x55"
        ); // decodes to 0x12345578
        assert_magic_not_match_bin!(
            "0 melong <0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // decodes to 0x12345678

        // Test > operator
        assert_magic_match_bin!(
            "0 melong >0x12345678 Middle-endian long",
            b"\x34\x12\x78\x57"
        ); // decodes to 0x12345778
        assert_magic_not_match_bin!(
            "0 melong >0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // decodes to 0x12345678

        // Test & operator
        assert_magic_match_bin!("0 melong &0x5678 Middle-endian long", b"\xab\xcd\x78\x56"); // decodes to 0xcdab5678
        assert_magic_not_match_bin!(
            "0 melong &0x0000FFFF Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // decodes to 0x12345678, low 16 bits are not all set

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!(
            "0 melong ^0xFFFF0000 Middle-endian long",
            b"\x00\x00\x78\x56"
        ); // decodes to 0x00005678
        assert_magic_not_match_bin!(
            "0 melong ^0xFFFF0000 Middle-endian long",
            b"\x00\x01\x78\x56"
        ); // decodes to 0x01005678

        // Test ~ operator
        // (0xEDCBA987, the complement of the operand, stored middle-endian)
        assert_magic_match_bin!(
            "0 melong ~0x12345678 Middle-endian long",
            b"\xCB\xED\x87\xA9"
        );
        assert_magic_not_match_bin!(
            "0 melong ~0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // The original value

        // Test x operator
        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x34\x12\x78\x56");
        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x00\x00\x00\x00");
    }
4210
    // Exercises `uquad` (unsigned eight-byte integer) with every comparison
    // operator.
    // NOTE(review): the fixtures store values low byte first while `uquad`
    // names no endianness, so this presumably only holds on little-endian
    // hosts — confirm.
    #[test]
    fn test_uquad() {
        // Test = operator
        assert_magic_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        assert_magic_not_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        // Test < operator
        // (most significant byte lowered from 0x12 to 0x11)
        assert_magic_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x11"
        );
        assert_magic_not_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test > operator
        // (most significant byte raised from 0x12 to 0x13)
        assert_magic_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x13"
        );
        assert_magic_not_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test & operator
        // (the lowest byte is 0xF0: all bits of 0xF0 are set, not all of 0xFF)
        assert_magic_match_bin!(
            "0 uquad &0xF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        assert_magic_not_match_bin!(
            "0 uquad &0xFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        ); // All bits clear
        assert_magic_not_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        ); // Some bits set

        // Test ~ operator
        // (0xEDCBA9876543210F is the complement of the operand)
        assert_magic_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\x0F\x21\x43\x65\x87\xA9\xCB\xED"
        );
        assert_magic_not_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        ); // The original value

        // Test x operator
        // (first case also checks `{:#x}` formatting of the value read)
        assert_magic_match_bin!(
            "0 uquad x {:#x}",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12",
            "0x123456789abcdef0"
        );
        assert_magic_match_bin!(
            "0 uquad x Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }
4284
    // Exercises `guid`: a sixteen-byte value compared with / formatted as a
    // textual GUID. In these fixtures the bytes appear in the same order as
    // the hexadecimal digits of the text form.
    #[test]
    fn test_guid() {
        assert_magic_match_bin!(
            "0 guid EC959539-6786-2D4E-8FDB-98814CE76C1E It works",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E"
        );

        // unrelated bytes must not match
        assert_magic_not_match_bin!(
            "0 guid 399595EC-8667-4E2D-8FDB-98814CE76C1E It works",
            b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
        );

        // `x` matches anything and `%s` prints the GUID read
        assert_magic_match_bin!(
            "0 guid x %s",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E",
            "EC959539-6786-2D4E-8FDB-98814CE76C1E"
        );
    }
4303
    // Exercises `ubeqdate`: eight-byte big-endian Unix timestamp.
    // 1633046400 is 0x61564F80, i.e. 2021-10-01 00:00:00.
    #[test]
    fn test_ubeqdate() {
        assert_magic_match_bin!(
            "0 ubeqdate 1633046400 It works",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80"
        );

        // `x` matches anything and `%s` prints the date read
        assert_magic_match_bin!(
            "0 ubeqdate x %s",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80",
            "2021-10-01 00:00:00"
        );

        assert_magic_not_match_bin!(
            "0 ubeqdate 1633046400 It should not work",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }
4322
    // Exercises `ldate`: four-byte timestamp, the expected message being
    // built with `unix_local_time_to_string`. 1640551520 is 0x61C8D460.
    // NOTE(review): the fixture stores the value low byte first while `ldate`
    // names no endianness, so this presumably only holds on little-endian
    // hosts — confirm.
    #[test]
    fn test_ldate() {
        assert_magic_match_bin!("0 ldate 1640551520 It works", b"\x60\xd4\xC8\x61");

        assert_magic_not_match_bin!("0 ldate 1633046400 It should not work", b"\x00\x00\x00\x00");

        // the expected string depends on the time zone of the host, hence
        // it is computed rather than hard-coded
        assert_magic_match_bin!(
            "0 ldate x %s",
            b"\x60\xd4\xC8\x61",
            unix_local_time_to_string(1640551520)
        );
    }
4335
    // The value formatted into the message is the transformed one:
    // 0x14 (20) divided by 10 prints 2, and 20 modulo 10 prints 0.
    #[test]
    fn test_scalar_with_transform() {
        assert_magic_match_bin!("0 ubyte/10 2 {}", b"\x14", "2");
        assert_magic_match_bin!("0 ubyte/10 x {}", b"\x14", "2");
        assert_magic_match_bin!("0 ubyte%10 x {}", b"\x14", "0");
    }
4342
    // Same as the scalar variant with a `lefloat`: the fixture bytes are the
    // little-endian encoding of 20.0 (0x41A00000).
    #[test]
    fn test_float_with_transform() {
        assert_magic_match_bin!("0 lefloat/10 2 {}", b"\x00\x00\xa0\x41", "2");
        assert_magic_match_bin!("0 lefloat/10 x {}", b"\x00\x00\xa0\x41", "2");
        assert_magic_match_bin!("0 lefloat%10 x {}", b"\x00\x00\xa0\x41", "0");
    }
4349
    // Pins the behaviour of `read_octal_u64`: a leading `0` is mandatory,
    // parsing stops at the first non-octal character and `None` is returned
    // when the input does not start with `0`.
    #[test]
    fn test_read_octal() {
        // Basic cases
        assert_eq!(read_octal_u64(&mut lazy_cache!("0")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("00")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("01")), Some(1));
        assert_eq!(read_octal_u64(&mut lazy_cache!("07")), Some(7));
        assert_eq!(read_octal_u64(&mut lazy_cache!("010")), Some(8));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0123")), Some(83));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0755")), Some(493));

        // With trailing non-octal characters
        assert_eq!(read_octal_u64(&mut lazy_cache!("0ABC")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("01ABC")), Some(1));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0755ABC")), Some(493));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0123ABC")), Some(83));

        // Invalid octal digits
        assert_eq!(read_octal_u64(&mut lazy_cache!("08")), Some(0)); // stops at '8'
        assert_eq!(read_octal_u64(&mut lazy_cache!("01238")), Some(83)); // stops at '8'

        // No leading '0'
        assert_eq!(read_octal_u64(&mut lazy_cache!("123")), None);
        assert_eq!(read_octal_u64(&mut lazy_cache!("755")), None);

        // Empty string
        assert_eq!(read_octal_u64(&mut lazy_cache!("")), None);

        // Only non-octal characters
        assert_eq!(read_octal_u64(&mut lazy_cache!("ABC")), None);
        assert_eq!(read_octal_u64(&mut lazy_cache!("8ABC")), None); // first char is not '0'

        // Longer valid octal (but within u64 range)
        // (0o1777777777 == 2^28 - 1)
        assert_eq!(
            read_octal_u64(&mut lazy_cache!("01777777777")),
            Some(268435455)
        );
    }
4388
    // Regression test for offset computation through nested `use` rules:
    // a rule starting at offset 1 jumps through two named rules, each one
    // using an indirect offset, and the last one prints the string found.
    #[test]
    fn test_offset_bug_1() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
# offset computation is relative to
# rule start
>(5.b)	use toasted

0 name toasted
>0	string twice Toasted
>>0  use toasted_twice 

0 name toasted_twice
>(6.b) string x %s
        ",
            b"\x00TEST\x06twice\x00\x06",
            "Bread is Toasted twice"
        );
    }
4411
4412    // this test implement the exact same logic as
4413    // test_offset_bug_1 except that the rule starts
4414    // matching from end. Surprisingly we need to
4415    // adjust indirect offsets so that it works in
4416    // libmagic/file
4417    #[test]
4418    fn test_offset_bug_2() {
4419        // this tests the exact behaviour
4420        // expected by libmagic/file
4421        assert_magic_match_bin!(
4422            r"
4423-12	string		TEST Bread is
4424>(4.b)	use toasted
4425
44260 name toasted
4427>0	string twice Toasted
4428>>0  use toasted_twice
4429
44300 name toasted_twice
4431>(6.b) string x %
4432        ",
4433            b"\x00TEST\x06twice\x00\x06",
4434            "Bread is Toasted twice"
4435        )
4436    }
4437
    // Regression test for offsets through `indirect/r`: the top-level rule
    // re-enters the rule set at an indirect offset, where a second top-level
    // rule matches and calls a named rule printing the string found.
    #[test]
    fn test_offset_bug_3() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
>(5.b) indirect/r x

0	string twice Toasted
>0  use toasted_twice

0 name toasted_twice
>0 string x %s
        ",
            b"\x00TEST\x06twice\x00\x08",
            "Bread is Toasted twice"
        )
    }
4457
    // Regression test combining `indirect/r` and `use` with an indirect
    // offset: each of the three rules contributes one part of the final
    // message through `%s`.
    #[test]
    fn test_offset_bug_4() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		Bread %s
>(6.b) indirect/r x

# this one uses a based offset
# computed at indirection
1	string is\ Toasted %s
>(11.b)  use toasted_twice

# this one is using a new base
# offset being previous base 
# offset + offset of use
0 name toasted_twice
>0 string x %s
            ",
            b"\x00Bread\x06is Toasted\x0ctwice\x00",
            "Bread is Toasted twice"
        )
    }
4482
    // Regression test for a relative offset (`&1`) used inside a named rule
    // reached through `indirect/r` then `use`: the byte 0x08 tested last is
    // located one byte past the end of the `twice` match.
    #[test]
    fn test_offset_bug_5() {
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
>(5.b) indirect/r x

0	string twice Toasted
>0  use toasted_twice

0 name toasted_twice
>0 string twice
>>&1 byte 0x08 twice
            ",
            b"\x00TEST\x06twice\x00\x08",
            "Bread is Toasted twice"
        )
    }
4501
4502    #[test]
4503    fn test_message_parts() {
4504        let m = first_magic(
4505            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
4506            b"#!/usr/bin/env    python",
4507            StreamKind::Text(TextEncoding::Ascii),
4508        )
4509        .unwrap();
4510
4511        assert!(m.message_parts().any(|p| p.eq_ignore_ascii_case("python")))
4512    }
4513}