magic_rs/
lib.rs

1#![forbid(unsafe_code)]
2#![deny(unused_imports)]
3//! # `magic-rs`: A Safe Rust Reimplementation of `libmagic`
4//!
5//! This crate provides a high-performance, memory-safe alternative to the traditional
6//! `libmagic` (used by the `file` command). It supports **file type detection**,
7//! **MIME type inference**, and **custom magic rule parsing**.
8//!
9//! ## Installation
10//! Add `magic-rs` to your `Cargo.toml`:
11//!
12//! ```toml
13//! [dependencies]
14//! magic-rs = "0.1"  # Replace with the latest version
15//! ```
16//!
17//! Or add the latest version with cargo:
18//!
19//! ```sh
20//! cargo add magic-rs
21//! ```
22//!
23//! ## Quick Start
24//!
25//! ### Detect File Types Programmatically
26//! ```rust
27//! use magic_rs::{MagicDb, MagicSource};
28//! use std::fs::File;
29//!
30//! fn main() -> Result<(), Box<dyn std::error::Error>> {
31//!     let mut db = MagicDb::new();
32//!     // Create a MagicSource from a file
33//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
34//!     db.load(rust_magic)?;
35//!
36//!     // Open a file and detect its type
37//!     let mut file = File::open("src/lib.rs")?;
38//!     let magic = db.magic_first(&mut file, None)?;
39//!
40//!     println!(
41//!         "File type: {} (MIME: {}, strength: {})",
42//!         magic.message(),
43//!         magic.mime_type(),
44//!         magic.strength()
45//!     );
46//!     Ok(())
47//! }
48//! ```
49//!
50//! ### Get All Matching Rules
51//! ```rust
52//! use magic_rs::{MagicDb, MagicSource};
53//! use std::fs::File;
54//!
55//! fn main() -> Result<(), Box<dyn std::error::Error>> {
56//!     let mut db = MagicDb::new();
57//!     // Create a MagicSource from a file
58//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
59//!     db.load(rust_magic)?;
60//!
61//!     // Open a file and detect its type
62//!     let mut file = File::open("src/lib.rs")?;
63//!
64//!     // Get all matching rules, sorted by strength
65//!     let magics = db.magic_all(&mut file)?;
66//!
67//!     // Must contain rust file magic and default text magic
68//!     assert!(magics.len() > 1);
69//!
70//!     for magic in magics {
71//!         println!(
72//!             "Match: {} (strength: {}, source: {})",
73//!             magic.message(),
74//!             magic.strength(),
75//!             magic.source().unwrap_or("unknown")
76//!         );
77//!     }
78//!     Ok(())
79//! }
80//! ```
81//!
82//! ### Serialize a Database to Disk
83//! ```rust
84//! use magic_rs::{MagicDb, MagicSource};
85//! use std::fs::File;
86//!
87//! fn main() -> Result<(), Box<dyn std::error::Error>> {
88//!     let mut db = MagicDb::new();
89//!     // Create a MagicSource from a file
90//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
91//!     db.load(rust_magic)?;
92//!
93//!     // Serialize the database to a file
94//!     let mut output = File::create("/tmp/compiled.db")?;
95//!     db.serialize(&mut output)?;
96//!
97//!     println!("Database saved to file");
98//!     Ok(())
99//! }
100//! ```
101//!
102//! ### Deserialize a Database
103//! ```rust
104//! use magic_rs::{MagicDb, MagicSource};
105//! use std::fs::File;
106//!
107//! fn main() -> Result<(), Box<dyn std::error::Error>> {
108//!     let mut db = MagicDb::new();
109//!     // Create a MagicSource from a file
110//!     let rust_magic = MagicSource::open("../magic-db/src/magdir/rust")?;
111//!     db.load(rust_magic)?;
112//!
//!     // Serialize the database into a vector
114//!     let mut ser = vec![];
115//!     db.serialize(&mut ser)?;
116//!     println!("Database saved to vector");
117//!
//!     // Deserialize the database from a slice
119//!     let db = MagicDb::deserialize(&mut ser.as_slice())?;
120//!
121//!     assert!(!db.rules().is_empty());
122//!
123//!     Ok(())
124//! }
125//! ```
126//!
127//! ## License
128//! This project is licensed under the **GPL-3.0 License**.
129//!
130//! ## Contributing
131//! Contributions are welcome! Open an issue or submit a pull request.
132//!
133//! ## Acknowledgments
134//! - Inspired by the original `libmagic` (part of the `file` command).
135
136use dyf::{DynDisplay, FormatString, dformat};
137use flagset::{FlagSet, flags};
138use flate2::{Compression, read::GzDecoder, write::GzEncoder};
139use lazy_cache::LazyCache;
140use memchr::memchr;
141use pest::{Span, error::ErrorVariant};
142use regex::bytes::{self};
143use serde::{Deserialize, Serialize};
144use std::{
145    borrow::Cow,
146    cmp::max,
147    collections::{HashMap, HashSet},
148    fmt::{self, Debug, Display},
149    io::{self, Read, Seek, SeekFrom, Write},
150    ops::{Add, BitAnd, BitOr, BitXor, Div, Mul, Rem, Sub},
151    path::Path,
152};
153use tar::Archive;
154use thiserror::Error;
155use tracing::{Level, debug, enabled, trace};
156
157use crate::{
158    numeric::{Float, FloatDataType, Scalar, ScalarDataType},
159    parser::{FileMagicParser, Rule},
160    utils::{decode_id3, find_json_boundaries},
161};
162
163mod numeric;
164mod parser;
165mod utils;
166
// strength value associated with hardcoded magic
// NOTE(review): usage is outside this part of the file — confirm
const HARDCODED_MAGIC_STRENGTH: u64 = 2048;
// source name associated with hardcoded magic
const HARDCODED_SOURCE: &str = "hardcoded";
// corresponds to the FILE_INDIR_MAX constant defined in libmagic
const MAX_RECURSION: usize = 50;
/// Constant taken from libmagic: upper limit (7 MiB) applied to search tests.
pub const FILE_BYTES_MAX: usize = 7 * 1024 * 1024;
// constant taken from libmagic: upper limit (8 KiB) applied to regex tests
const FILE_REGEX_MAX: usize = 8192;

/// Default MIME type for binary content.
pub const DEFAULT_BIN_MIMETYPE: &str = "application/octet-stream";
/// Default MIME type for text content.
pub const DEFAULT_TEXT_MIMETYPE: &str = "text/plain";

// strftime-like pattern: `YYYY-MM-DD HH:MM:SS`
pub(crate) const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
180
// Panics with the given `panic!` arguments, but only in builds compiled with
// `debug_assertions`; in other builds the macro does nothing.
macro_rules! debug_panic {
    ($($arg:tt)*) => {
        if cfg!(debug_assertions) {
            panic!($($arg)*);
        }
    };
}
188
// Reads exactly `size_of::<$ty>()` bytes from `$r` with `read_exact` and
// evaluates to the raw byte array. A failed read is propagated with `?`, so
// the macro can only be used where `?` on that error is valid.
macro_rules! read {
    ($r: expr, $ty: ty) => {{
        // buffer sized after the target type, filled in one call
        let mut a = [0u8; std::mem::size_of::<$ty>()];
        $r.read_exact(&mut a)?;
        a
    }};
}
196
// Reads a `$ty` from `$r`, decoding the bytes as little-endian.
macro_rules! read_le {
    ($r:expr, $ty: ty ) => {{ <$ty>::from_le_bytes(read!($r, $ty)) }};
}
200
// Reads a `$ty` from `$r`, decoding the bytes as big-endian.
macro_rules! read_be {
    ($r:expr, $ty: ty ) => {{ <$ty>::from_be_bytes(read!($r, $ty)) }};
}
204
// Reads a 32-bit "middle-endian" value from `$r` as an `i32`: two consecutive
// little-endian `u16` are read, the first one becomes the upper 16 bits and
// the second one the lower 16 bits. `$r` is expanded (evaluated) twice.
macro_rules! read_me {
    ($r: expr) => {{ ((read_le!($r, u16) as i32) << 16) | (read_le!($r, u16) as i32) }};
}
208
209#[inline(always)]
210fn read_octal_u64<R: Read + Seek>(haystack: &mut LazyCache<R>) -> Option<u64> {
211    let s = haystack
212        .read_while_or_limit(|b| matches!(b, b'0'..=b'7'), 22)
213        .map(|buf| str::from_utf8(buf))
214        .ok()?
215        .ok()?;
216
217    if !s.starts_with("0") {
218        return None;
219    }
220
221    u64::from_str_radix(s, 8).ok()
222}
223
/// Represents all possible errors that can occur during file type detection and processing.
///
/// Most variants wrap an error of a dependency and are created through the
/// `From` conversions generated by `#[from]`, which makes them usable with `?`.
#[derive(Debug, Error)]
pub enum Error {
    /// A generic error with a custom message.
    #[error("{0}")]
    Msg(String),

    /// An error with a source location and a nested error.
    ///
    /// Fields are, in order: the source name, the line number and the
    /// underlying error.
    #[error("source={0} line={1} error={2}")]
    Localized(String, usize, Box<Error>),

    /// Indicates a required rule was not found.
    #[error("missing rule: {0}")]
    MissingRule(String),

    /// Indicates the maximum recursion depth was reached.
    #[error("maximum recursion reached: {0}")]
    MaximumRecursion(usize),

    /// Wraps an I/O error.
    #[error("io: {0}")]
    Io(#[from] io::Error),

    /// Wraps a parsing error from the `pest` parser.
    ///
    /// The error is boxed.
    #[error("parser error: {0}")]
    Parse(#[from] Box<pest::error::Error<Rule>>),

    /// Wraps a formatting error from the `dyf` crate.
    #[error("formatting: {0}")]
    Format(#[from] dyf::Error),

    /// Wraps a regex-related error.
    #[error("regex: {0}")]
    Regex(#[from] regex::Error),

    /// Wraps a serialization error from `bincode`.
    #[error("{0}")]
    Serialize(#[from] bincode::error::EncodeError),

    /// Wraps a deserialization error from `bincode`.
    #[error("{0}")]
    Deserialize(#[from] bincode::error::DecodeError),
}
267
268impl Error {
269    #[inline]
270    fn parser<S: ToString>(msg: S, span: Span<'_>) -> Self {
271        Self::Parse(Box::new(pest::error::Error::new_from_span(
272            ErrorVariant::CustomError {
273                message: msg.to_string(),
274            },
275            span,
276        )))
277    }
278
279    fn msg<M: AsRef<str>>(msg: M) -> Self {
280        Self::Msg(msg.as_ref().into())
281    }
282
283    fn localized<S: AsRef<str>>(source: S, line: usize, err: Error) -> Self {
284        Self::Localized(source.as_ref().into(), line, err.into())
285    }
286
287    /// Unwraps the localized error
288    pub fn unwrap_localized(&self) -> &Self {
289        match self {
290            Self::Localized(_, _, e) => e,
291            _ => self,
292        }
293    }
294}
295
/// Message attached to a rule: either plain text or a format string.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Message {
    /// Plain text, used as is.
    String(String),
    /// Text rendered through a `dyf` format string.
    Format {
        /// printf specifier of the message; `Message::format_with` compares
        /// it against `"c"` to print byte values as characters
        printf_spec: String,
        /// format string used to render the message
        fs: FormatString,
    },
}
304
305impl Display for Message {
306    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
307        match self {
308            Self::String(s) => write!(f, "{s}"),
309            Self::Format { printf_spec: _, fs } => write!(f, "{}", fs.to_string_lossy()),
310        }
311    }
312}
313
314impl Message {
315    fn to_string_lossy(&self) -> Cow<'_, str> {
316        match self {
317            Message::String(s) => Cow::Borrowed(s),
318            Message::Format { printf_spec: _, fs } => fs.to_string_lossy(),
319        }
320    }
321
322    #[inline(always)]
323    fn format_with(&self, mr: Option<&MatchRes>) -> Result<Cow<'_, str>, Error> {
324        match self {
325            Self::String(s) => Ok(Cow::Borrowed(s.as_str())),
326            Self::Format {
327                printf_spec: c_spec,
328                fs,
329            } => {
330                if let Some(mr) = mr {
331                    match mr {
332                        MatchRes::Float(_, _) | MatchRes::Bytes(_, _, _, _) => {
333                            Ok(Cow::Owned(dformat!(fs, mr)?))
334                        }
335                        MatchRes::Scalar(_, scalar) => {
336                            // we want to print a byte as char
337                            if c_spec.as_str() == "c" {
338                                match scalar {
339                                    Scalar::byte(b) => {
340                                        let b = (*b as u8) as char;
341                                        Ok(Cow::Owned(dformat!(fs, b)?))
342                                    }
343                                    Scalar::ubyte(b) => {
344                                        let b = *b as char;
345                                        Ok(Cow::Owned(dformat!(fs, b)?))
346                                    }
347                                    _ => Ok(Cow::Owned(dformat!(fs, mr)?)),
348                                }
349                            } else {
350                                Ok(Cow::Owned(dformat!(fs, mr)?))
351                            }
352                        }
353                    }
354                } else {
355                    Ok(fs.to_string_lossy())
356                }
357            }
358        }
359    }
360}
361
impl ScalarDataType {
    /// Reads one scalar of this data type from `from`.
    ///
    /// The number of bytes read and their byte order depend on the variant.
    /// When `switch_endianness` is `true`, the byte order of every variant
    /// read through the local `_read_le!`/`_read_be!`/`_read_ne!`/`_read_me!`
    /// macros is inverted. Read errors are returned as [`Error`].
    #[inline(always)]
    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Scalar, Error> {
        // little-endian read, big-endian if `switch_endianness` is set
        macro_rules! _read_le {
            ($ty: ty) => {{
                if switch_endianness {
                    <$ty>::from_be_bytes(read!(from, $ty))
                } else {
                    <$ty>::from_le_bytes(read!(from, $ty))
                }
            }};
        }

        // big-endian read, little-endian if `switch_endianness` is set
        macro_rules! _read_be {
            ($ty: ty) => {{
                if switch_endianness {
                    <$ty>::from_le_bytes(read!(from, $ty))
                } else {
                    <$ty>::from_be_bytes(read!(from, $ty))
                }
            }};
        }

        // read in the byte order of the compilation target
        // (still subject to `switch_endianness`)
        macro_rules! _read_ne {
            ($ty: ty) => {{
                if cfg!(target_endian = "big") {
                    _read_be!($ty)
                } else {
                    _read_le!($ty)
                }
            }};
        }

        // middle-endian 32-bit read: two consecutive `u16`, the first one
        // being the upper half of the result
        macro_rules! _read_me {
            () => {
                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
            };
        }

        Ok(match self {
            // signed
            Self::byte => Scalar::byte(read!(from, u8)[0] as i8),
            Self::short => Scalar::short(_read_ne!(i16)),
            Self::long => Scalar::long(_read_ne!(i32)),
            Self::date => Scalar::date(_read_ne!(i32)),
            Self::ldate => Scalar::ldate(_read_ne!(i32)),
            Self::qwdate => Scalar::qwdate(_read_ne!(i64)),
            Self::leshort => Scalar::leshort(_read_le!(i16)),
            Self::lelong => Scalar::lelong(_read_le!(i32)),
            Self::lequad => Scalar::lequad(_read_le!(i64)),
            Self::bequad => Scalar::bequad(_read_be!(i64)),
            Self::belong => Scalar::belong(_read_be!(i32)),
            Self::bedate => Scalar::bedate(_read_be!(i32)),
            Self::beldate => Scalar::beldate(_read_be!(i32)),
            Self::beqdate => Scalar::beqdate(_read_be!(i64)),
            // unsigned
            Self::ubyte => Scalar::ubyte(read!(from, u8)[0]),
            Self::ushort => Scalar::ushort(_read_ne!(u16)),
            Self::uleshort => Scalar::uleshort(_read_le!(u16)),
            Self::ulelong => Scalar::ulelong(_read_le!(u32)),
            Self::uledate => Scalar::uledate(_read_le!(u32)),
            Self::ulequad => Scalar::ulequad(_read_le!(u64)),
            // no byte is read: the value is the current position in the stream
            Self::offset => Scalar::offset(from.stream_position()?),
            Self::ubequad => Scalar::ubequad(_read_be!(u64)),
            Self::medate => Scalar::medate(_read_me!()),
            Self::meldate => Scalar::meldate(_read_me!()),
            Self::melong => Scalar::melong(_read_me!()),
            Self::beshort => Scalar::beshort(_read_be!(i16)),
            Self::quad => Scalar::quad(_read_ne!(i64)),
            Self::uquad => Scalar::uquad(_read_ne!(u64)),
            Self::ledate => Scalar::ledate(_read_le!(i32)),
            Self::leldate => Scalar::leldate(_read_le!(i32)),
            Self::leqdate => Scalar::leqdate(_read_le!(i64)),
            Self::leqldate => Scalar::leqldate(_read_le!(i64)),
            Self::leqwdate => Scalar::leqwdate(_read_le!(i64)),
            Self::ubelong => Scalar::ubelong(_read_be!(u32)),
            Self::ulong => Scalar::ulong(_read_ne!(u32)),
            Self::ubeshort => Scalar::ubeshort(_read_be!(u16)),
            Self::ubeqdate => Scalar::ubeqdate(_read_be!(u64)),
            Self::lemsdosdate => Scalar::lemsdosdate(_read_le!(u16)),
            Self::lemsdostime => Scalar::lemsdostime(_read_le!(u16)),
            // 16 bytes, always decoded as big-endian (`switch_endianness` is ignored)
            Self::guid => Scalar::guid(u128::from_be_bytes(read!(from, u128))),
        })
    }
}
447
448impl FloatDataType {
449    #[inline(always)]
450    fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Float, Error> {
451        macro_rules! _read_le {
452            ($ty: ty) => {{
453                if switch_endianness {
454                    <$ty>::from_be_bytes(read!(from, $ty))
455                } else {
456                    <$ty>::from_le_bytes(read!(from, $ty))
457                }
458            }};
459        }
460
461        macro_rules! _read_be {
462            ($ty: ty) => {{
463                if switch_endianness {
464                    <$ty>::from_le_bytes(read!(from, $ty))
465                } else {
466                    <$ty>::from_be_bytes(read!(from, $ty))
467                }
468            }};
469        }
470
471        macro_rules! _read_ne {
472            ($ty: ty) => {{
473                if cfg!(target_endian = "big") {
474                    _read_be!($ty)
475                } else {
476                    _read_le!($ty)
477                }
478            }};
479        }
480
481        macro_rules! _read_me {
482            () => {
483                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
484            };
485        }
486
487        Ok(match self {
488            Self::lefloat => Float::lefloat(_read_le!(f32)),
489            Self::befloat => Float::befloat(_read_le!(f32)),
490            Self::ledouble => Float::ledouble(_read_le!(f64)),
491            Self::bedouble => Float::bedouble(_read_be!(f64)),
492        })
493    }
494}
495
/// Arithmetic or bitwise operator applied to a value read from the content
/// (see `ScalarTransform` and `FloatTransform`).
// NOTE(review): the type is serialized; variant order may be part of the
// on-disk format — confirm before reordering.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Op {
    /// `*`
    Mul,
    /// `+`
    Add,
    /// `-`
    Sub,
    /// `/`
    Div,
    /// `%`
    Mod,
    /// `&`
    And,
    /// `^`
    Xor,
    /// `|`
    Or,
}
507
508impl Display for Op {
509    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
510        match self {
511            Op::Mul => write!(f, "*"),
512            Op::Add => write!(f, "+"),
513            Op::Sub => write!(f, "-"),
514            Op::Div => write!(f, "/"),
515            Op::Mod => write!(f, "%"),
516            Op::And => write!(f, "&"),
517            Op::Or => write!(f, "|"),
518            Op::Xor => write!(f, "^"),
519        }
520    }
521}
522
/// Comparison operator of a test.
// NOTE(review): the type is serialized; variant order may be part of the
// on-disk format — confirm before reordering.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum CmpOp {
    Eq,
    Lt,
    Gt,
    BitAnd,
    Neq, // ! operator
    Xor,
    Not, // ~ operator
}
533
534impl CmpOp {
535    #[inline(always)]
536    fn is_neq(&self) -> bool {
537        matches!(self, Self::Neq)
538    }
539}
540
/// Operation applied to a scalar value: `value <op> num`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTransform {
    /// operator to apply
    op: Op,
    /// right-hand operand
    num: Scalar,
}
546
547impl ScalarTransform {
548    fn apply(&self, s: Scalar) -> Option<Scalar> {
549        match self.op {
550            Op::Add => s.checked_add(self.num),
551            Op::Sub => s.checked_sub(self.num),
552            Op::Mul => s.checked_mul(self.num),
553            Op::Div => s.checked_div(self.num),
554            Op::Mod => s.checked_rem(self.num),
555            Op::And => Some(s.bitand(self.num)),
556            Op::Xor => Some(s.bitxor(self.num)),
557            Op::Or => Some(s.bitor(self.num)),
558        }
559    }
560}
561
/// Operation applied to a floating point value: `value <op> num`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTransform {
    /// operator to apply; bitwise operators are not supported by `apply`
    op: Op,
    /// right-hand operand
    num: Float,
}
567
568impl FloatTransform {
569    fn apply(&self, s: Float) -> Float {
570        match self.op {
571            Op::Add => s.add(self.num),
572            Op::Sub => s.sub(self.num),
573            Op::Mul => s.mul(self.num),
574            // returns inf when div by 0
575            Op::Div => s.div(self.num),
576            // returns NaN when rem by 0
577            Op::Mod => s.rem(self.num),
578            // parser makes sure those operators cannot be used
579            Op::And | Op::Xor | Op::Or => {
580                debug_panic!("unsupported operation");
581                s
582            }
583        }
584    }
585}
586
/// Value a test compares against.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum TestValue<T> {
    /// a concrete value
    Value(T),
    /// no concrete value to compare against
    // NOTE(review): presumably the `x` (match anything) test value of magic
    // files — confirm against the parser
    Any,
}
592
593impl<T> TestValue<T> {
594    #[inline(always)]
595    fn as_ref(&self) -> TestValue<&T> {
596        match self {
597            Self::Value(v) => TestValue::Value(v),
598            Self::Any => TestValue::Any,
599        }
600    }
601}
602
// Modifiers of regex tests, stored as a `FlagSet` over a `u8`.
// NOTE(review): the declaration order determines the bit of each flag and
// flag sets are serialized — confirm before reordering.
flags! {
    enum ReMod: u8{
        CaseInsensitive,
        StartOffsetUpdate,
        LineLimit,
        // forces the test to be considered binary (see `is_binary` methods)
        ForceBin,
        // forces the test to be considered text
        ForceText,
        TrimMatch,
    }
}
613
614fn serialize_regex<S>(re: &bytes::Regex, serializer: S) -> Result<S::Ok, S::Error>
615where
616    S: serde::Serializer,
617{
618    re.as_str().serialize(serializer)
619}
620
621fn deserialize_regex<'de, D>(deserializer: D) -> Result<bytes::Regex, D::Error>
622where
623    D: serde::Deserializer<'de>,
624{
625    let wrapper = String::deserialize(deserializer)?;
626    bytes::Regex::new(&wrapper).map_err(serde::de::Error::custom)
627}
628
/// Test matching a regular expression against the content.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct RegexTest {
    /// compiled regex; (de)serialized through its pattern string
    #[serde(
        serialize_with = "serialize_regex",
        deserialize_with = "deserialize_regex"
    )]
    re: bytes::Regex,
    /// in text mode, maximum number of lines `match_buf` scans
    /// (unlimited when `None`)
    length: Option<usize>,
    /// regex modifiers
    mods: FlagSet<ReMod>,
    /// string modifiers
    str_mods: FlagSet<StringMod>,
    // NOTE(review): not used in the visible part of the file — meaning to be
    // confirmed against the parser
    non_magic_len: usize,
    /// marks the test as binary (see `is_binary`)
    binary: bool,
    /// comparison operator; `match_buf` treats `Neq` specially
    cmp_op: CmpOp,
}
643
644impl RegexTest {
645    #[inline(always)]
646    fn is_binary(&self) -> bool {
647        self.binary
648            || self.mods.contains(ReMod::ForceBin)
649            || self.str_mods.contains(StringMod::ForceBin)
650    }
651
652    fn match_buf<'buf>(
653        &self,
654        off_buf: u64, // absolute buffer offset in content
655        stream_kind: StreamKind,
656        buf: &'buf [u8],
657    ) -> Option<MatchRes<'buf>> {
658        let mr = match stream_kind {
659            StreamKind::Text(_) => {
660                let mut off_txt = off_buf;
661
662                let mut line_limit = self.length.unwrap_or(usize::MAX);
663
664                for line in buf.split(|c| c == &b'\n') {
665                    // we don't need to break on offset
666                    // limit as buf contains the good amount
667                    // of bytes to match against
668                    if line_limit == 0 {
669                        break;
670                    }
671
672                    if let Some(re_match) = self.re.find(line) {
673                        // the offset of the string is computed from the start of the buffer
674                        let start_offset = off_txt + re_match.start() as u64;
675
676                        // if we matched until EOL we need to add one to include the delimiter removed from the split
677                        let stop_offset = if re_match.end() == line.len() {
678                            Some(start_offset + re_match.as_bytes().len() as u64 + 1)
679                        } else {
680                            None
681                        };
682
683                        return Some(MatchRes::Bytes(
684                            start_offset,
685                            stop_offset,
686                            re_match.as_bytes(),
687                            Encoding::Utf8,
688                        ));
689                    }
690
691                    off_txt += line.len() as u64;
692                    // we have to add one because lines do not contain splitting character
693                    off_txt += 1;
694                    line_limit = line_limit.saturating_sub(1)
695                }
696                None
697            }
698
699            StreamKind::Binary => {
700                self.re.find(buf).map(|re_match| {
701                    MatchRes::Bytes(
702                        // the offset of the string is computed from the start of the buffer
703                        off_buf + re_match.start() as u64,
704                        None,
705                        re_match.as_bytes(),
706                        Encoding::Utf8,
707                    )
708                })
709            }
710        };
711
712        // handle the case where we want the regex not to match
713        if self.cmp_op.is_neq() && mr.is_none() {
714            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
715        }
716
717        mr
718    }
719}
720
721impl From<RegexTest> for Test {
722    fn from(value: RegexTest) -> Self {
723        Self::Regex(value)
724    }
725}
726
// Modifiers of string and search tests, stored as a `FlagSet` over a `u8`.
// The effect of the matching related flags is implemented by `string_match`.
// NOTE(review): the declaration order determines the bit of each flag and
// flag sets are serialized — confirm before reordering.
flags! {
    enum StringMod: u8{
        // forces the test to be considered binary
        ForceBin,
        // upper case bytes of the pattern match both cases in the target
        UpperInsensitive,
        // lower case bytes of the pattern match both cases in the target
        LowerInsensitive,
        // the match must be followed by a whitespace (or the end of the buffer)
        FullWordMatch,
        Trim,
        // forces the test to be considered text
        ForceText,
        // a run of blanks in the pattern needs at least as many blanks in the target
        CompactWhitespace,
        // blanks are optional, both in the pattern and in the target
        OptBlank,
    }
}
739
/// Test comparing a byte string against the content.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StringTest {
    /// bytes to compare against, or `Any`
    test_val: TestValue<Vec<u8>>,
    /// comparison operator
    cmp_op: CmpOp,
    // NOTE(review): not used in the visible part of the file — presumably a
    // length limit on the compared data, to be confirmed
    length: Option<usize>,
    /// string modifiers
    mods: FlagSet<StringMod>,
    /// marks the test as binary (see `is_binary`)
    binary: bool,
}
748
749impl From<StringTest> for Test {
750    fn from(value: StringTest) -> Self {
751        Self::String(value)
752    }
753}
754
755#[inline(always)]
756fn string_match(str: &[u8], mods: FlagSet<StringMod>, buf: &[u8]) -> (bool, usize) {
757    let mut consumed = 0;
758    // we can do a simple string comparison
759    if mods.is_disjoint(
760        StringMod::UpperInsensitive
761            | StringMod::LowerInsensitive
762            | StringMod::FullWordMatch
763            | StringMod::CompactWhitespace
764            | StringMod::OptBlank,
765    ) {
766        // we check if target contains
767        if buf.starts_with(str) {
768            (true, str.len())
769        } else {
770            (false, consumed)
771        }
772    } else {
773        let mut i_src = 0;
774        let mut iter = buf.iter().peekable();
775
776        macro_rules! consume_target {
777            () => {{
778                iter.next();
779                consumed += 1;
780            }};
781        }
782
783        macro_rules! continue_next_iteration {
784            () => {{
785                consume_target!();
786                i_src += 1;
787                continue;
788            }};
789        }
790
791        while let Some(&&b) = iter.peek() {
792            let Some(&ref_byte) = str.get(i_src) else {
793                break;
794            };
795
796            if mods.contains(StringMod::OptBlank) && (b == b' ' || ref_byte == b' ') {
797                if b == b' ' {
798                    // we ignore whitespace in target
799                    consume_target!();
800                }
801
802                if ref_byte == b' ' {
803                    // we ignore whitespace in test
804                    i_src += 1;
805                }
806
807                continue;
808            }
809
810            if mods.contains(StringMod::UpperInsensitive) {
811                //upper case characters in the magic match both lower and upper case characters in the target
812                if ref_byte.is_ascii_uppercase() && ref_byte == b.to_ascii_uppercase()
813                    || ref_byte == b
814                {
815                    continue_next_iteration!()
816                }
817            }
818
819            if mods.contains(StringMod::LowerInsensitive)
820                && (ref_byte.is_ascii_lowercase() && ref_byte == b.to_ascii_lowercase()
821                    || ref_byte == b)
822            {
823                continue_next_iteration!()
824            }
825
826            if mods.contains(StringMod::CompactWhitespace) && ref_byte == b' ' {
827                let mut src_blk = 0;
828                while let Some(b' ') = str.get(i_src) {
829                    src_blk += 1;
830                    i_src += 1;
831                }
832
833                let mut tgt_blk = 0;
834                while let Some(b' ') = iter.peek() {
835                    tgt_blk += 1;
836                    consume_target!();
837                }
838
839                if src_blk > tgt_blk {
840                    return (false, consumed);
841                }
842
843                continue;
844            }
845
846            if ref_byte == b {
847                continue_next_iteration!()
848            } else {
849                return (false, consumed);
850            }
851        }
852
853        if mods.contains(StringMod::FullWordMatch) {
854            if let Some(b) = iter.peek() {
855                if !b.is_ascii_whitespace() {
856                    return (false, consumed);
857                }
858            }
859        }
860
861        (consumed > 0 && consumed < buf.len(), consumed)
862    }
863}
864
865impl StringTest {
866    fn has_length_mod(&self) -> bool {
867        !self.mods.is_disjoint(
868            StringMod::UpperInsensitive
869                | StringMod::LowerInsensitive
870                | StringMod::FullWordMatch
871                | StringMod::CompactWhitespace
872                | StringMod::OptBlank,
873        )
874    }
875
876    #[inline(always)]
877    fn test_value_len(&self) -> usize {
878        match self.test_val.as_ref() {
879            TestValue::Value(s) => s.len(),
880            TestValue::Any => 0,
881        }
882    }
883
884    #[inline(always)]
885    fn is_binary(&self) -> bool {
886        self.binary || self.mods.contains(StringMod::ForceBin)
887    }
888
889    #[inline(always)]
890    fn is_text(&self) -> bool {
891        self.mods.contains(StringMod::ForceText)
892    }
893}
894
/// Test searching a byte string in the content.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SearchTest {
    /// bytes to search for
    str: Vec<u8>,
    /// when set, `match_buf` stops once the candidate position in the buffer
    /// exceeds this value
    n_pos: Option<usize>,
    /// string modifiers, forwarded to `string_match`
    str_mods: FlagSet<StringMod>,
    /// regex modifiers; only `ForceBin`/`ForceText` are consulted in the
    /// visible part of the file
    re_mods: FlagSet<ReMod>,
    /// marks the test as binary (see `is_binary`)
    binary: bool,
    /// comparison operator; `match_buf` treats `Neq` specially
    cmp_op: CmpOp,
}
904
905impl From<SearchTest> for Test {
906    fn from(value: SearchTest) -> Self {
907        Self::Search(value)
908    }
909}
910
911impl SearchTest {
912    #[inline(always)]
913    fn is_binary(&self) -> bool {
914        (self.binary
915            || self.str_mods.contains(StringMod::ForceBin)
916            || self.re_mods.contains(ReMod::ForceBin))
917            && !(self.str_mods.contains(StringMod::ForceText)
918                || self.re_mods.contains(ReMod::ForceText))
919    }
920
921    // off_buf: absolute buffer offset in content
922    #[inline]
923    fn match_buf<'buf>(&self, off_buf: u64, buf: &'buf [u8]) -> Option<MatchRes<'buf>> {
924        let mut i = 0;
925
926        let needle = self.str.first()?;
927
928        while i < buf.len() {
929            // we cannot match if the first character isn't the same
930            // so we accelerate the search by finding potential matches
931            i += memchr(*needle, &buf[i..])?;
932
933            // if we want a full word match
934            if self.str_mods.contains(StringMod::FullWordMatch) {
935                let prev_is_whitespace = buf
936                    .get(i.saturating_sub(1))
937                    .map(|c| c.is_ascii_whitespace())
938                    .unwrap_or_default();
939
940                // if it is not the first character
941                // and its previous character isn't
942                // a whitespace. It cannot be a
943                // fullword match
944                if i > 0 && !prev_is_whitespace {
945                    i += 1;
946                    continue;
947                }
948            }
949
950            if let Some(npos) = self.n_pos {
951                if i > npos {
952                    break;
953                }
954            }
955
956            let pos = i;
957            let (ok, consumed) = string_match(&self.str, self.str_mods, &buf[i..]);
958
959            if ok {
960                return Some(MatchRes::Bytes(
961                    off_buf.saturating_add(pos as u64),
962                    None,
963                    &buf[i..i + consumed],
964                    Encoding::Utf8,
965                ));
966            } else {
967                i += max(consumed, 1)
968            }
969        }
970
971        // handles the case where we want the string not to be found
972        if self.cmp_op.is_neq() {
973            return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
974        }
975
976        None
977    }
978}
979
/// Test comparing a scalar read from the content.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTest {
    /// data type of the scalar
    ty: ScalarDataType,
    /// optional operation attached to the test
    transform: Option<ScalarTransform>,
    /// comparison operator
    cmp_op: CmpOp,
    /// value to compare against, or `Any`
    test_val: TestValue<Scalar>,
}
987
/// Test comparing a floating point value read from the content.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTest {
    /// data type of the float
    ty: FloatDataType,
    /// optional operation attached to the test
    transform: Option<FloatTransform>,
    /// comparison operator
    cmp_op: CmpOp,
    /// value to compare against, or `Any`
    test_val: TestValue<Float>,
}
995
996// the value read from the haystack we want to match against
997// 'buf is the lifetime of the buffer we are scanning
998#[derive(Debug, PartialEq)]
999enum ReadValue<'buf> {
1000    Float(u64, Float),
1001    Scalar(u64, Scalar),
1002    Bytes(u64, &'buf [u8]),
1003}
1004
1005impl DynDisplay for ReadValue<'_> {
1006    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1007        match self {
1008            Self::Float(_, s) => DynDisplay::dyn_fmt(s, f),
1009            Self::Scalar(_, s) => DynDisplay::dyn_fmt(s, f),
1010            Self::Bytes(_, b) => Ok(format!("{b:?}")),
1011        }
1012    }
1013}
1014
1015impl DynDisplay for &ReadValue<'_> {
1016    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1017        // Dereference self to get the TestValue and call its fmt method
1018        DynDisplay::dyn_fmt(*self, f)
1019    }
1020}
1021
1022impl Display for ReadValue<'_> {
1023    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1024        match self {
1025            Self::Float(_, v) => write!(f, "{v}"),
1026            Self::Scalar(_, s) => write!(f, "{s}"),
1027            Self::Bytes(_, b) => write!(f, "{b:?}"),
1028        }
1029    }
1030}
1031
1032enum Encoding {
1033    Utf16(String16Encoding),
1034    Utf8,
1035}
1036
1037// Carry the offset of the start of the data in the stream
1038// and the data itself
1039enum MatchRes<'buf> {
1040    // Bytes.0: offset of the match
1041    // Bytes.1: optional end of match (to address the need of EOL adjustment in string regex)
1042    // Bytes.2: the bytes matching
1043    // Bytes.3: encoding of the buffer
1044    Bytes(u64, Option<u64>, &'buf [u8], Encoding),
1045    Scalar(u64, Scalar),
1046    Float(u64, Float),
1047}
1048
1049impl DynDisplay for &MatchRes<'_> {
1050    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1051        (*self).dyn_fmt(f)
1052    }
1053}
1054
1055impl DynDisplay for MatchRes<'_> {
1056    fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1057        match self {
1058            Self::Scalar(_, v) => v.dyn_fmt(f),
1059            Self::Float(_, v) => v.dyn_fmt(f),
1060            Self::Bytes(_, _, v, enc) => match enc {
1061                Encoding::Utf8 => String::from_utf8_lossy(v).to_string().dyn_fmt(f),
1062                Encoding::Utf16(enc) => {
1063                    let utf16: Vec<u16> = slice_to_utf16_iter(v, *enc).collect();
1064                    String::from_utf16_lossy(&utf16).dyn_fmt(f)
1065                }
1066            },
1067        }
1068    }
1069}
1070
1071impl MatchRes<'_> {
1072    // start offset of the match
1073    #[inline]
1074    fn start_offset(&self) -> u64 {
1075        match self {
1076            MatchRes::Bytes(o, _, _, _) => *o,
1077            MatchRes::Scalar(o, _) => *o,
1078            MatchRes::Float(o, _) => *o,
1079        }
1080    }
1081
1082    // start offset of the match
1083    #[inline]
1084    fn end_offset(&self) -> u64 {
1085        match self {
1086            MatchRes::Bytes(start, end, buf, _) => match end {
1087                Some(end) => *end,
1088                None => start.saturating_add(buf.len() as u64),
1089            },
1090            MatchRes::Scalar(o, sc) => o.add(sc.size_of() as u64),
1091            MatchRes::Float(o, f) => o.add(f.size_of() as u64),
1092        }
1093    }
1094}
1095
1096fn slice_to_utf16_iter(read: &[u8], encoding: String16Encoding) -> impl Iterator<Item = u16> {
1097    let even = read
1098        .iter()
1099        .enumerate()
1100        .filter(|(i, _)| i % 2 == 0)
1101        .map(|t| t.1);
1102
1103    let odd = read
1104        .iter()
1105        .enumerate()
1106        .filter(|(i, _)| i % 2 != 0)
1107        .map(|t| t.1);
1108
1109    even.zip(odd).map(move |(e, o)| match encoding {
1110        String16Encoding::Le => u16::from_le_bytes([*e, *o]),
1111        String16Encoding::Be => u16::from_be_bytes([*e, *o]),
1112    })
1113}
1114
1115#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
1116enum String16Encoding {
1117    Le,
1118    Be,
1119}
1120
1121#[derive(Debug, Clone, Serialize, Deserialize)]
1122struct String16Test {
1123    orig: String,
1124    test_val: TestValue<Vec<u16>>,
1125    encoding: String16Encoding,
1126}
1127
1128impl String16Test {
1129    /// if the test value is a specific value this method returns
1130    /// the number of utf16 characters. To obtain the length in
1131    /// bytes the return value needs to be multiplied by two.
1132    #[inline(always)]
1133    fn test_value_len(&self) -> usize {
1134        match self.test_val.as_ref() {
1135            TestValue::Value(str16) => str16.len(),
1136            TestValue::Any => 0,
1137        }
1138    }
1139}
1140
1141flags! {
1142    enum IndirectMod: u8{
1143        Relative,
1144    }
1145}
1146
1147type IndirectMods = FlagSet<IndirectMod>;
1148
1149#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
1150enum PStringLen {
1151    Byte,    // B
1152    ShortBe, // H
1153    ShortLe, // h
1154    LongBe,  // L
1155    LongLe,  // l
1156}
1157
1158impl PStringLen {
1159    #[inline(always)]
1160    const fn size_of_len(&self) -> usize {
1161        match self {
1162            PStringLen::Byte => 1,
1163            PStringLen::ShortBe => 2,
1164            PStringLen::ShortLe => 2,
1165            PStringLen::LongBe => 4,
1166            PStringLen::LongLe => 4,
1167        }
1168    }
1169}
1170
1171#[derive(Debug, Clone, Serialize, Deserialize)]
1172struct PStringTest {
1173    len: PStringLen,
1174    test_val: TestValue<Vec<u8>>,
1175    include_len: bool,
1176}
1177
1178impl PStringTest {
1179    #[inline]
1180    fn read<'cache, R: Read + Seek>(
1181        &self,
1182        haystack: &'cache mut LazyCache<R>,
1183    ) -> Result<Option<&'cache [u8]>, Error> {
1184        let mut len = match self.len {
1185            PStringLen::Byte => read_le!(haystack, u8) as u32,
1186            PStringLen::ShortBe => read_be!(haystack, u16) as u32,
1187            PStringLen::ShortLe => read_le!(haystack, u16) as u32,
1188            PStringLen::LongBe => read_be!(haystack, u32),
1189            PStringLen::LongLe => read_le!(haystack, u32),
1190        } as usize;
1191
1192        if self.include_len {
1193            len = len.saturating_sub(self.len.size_of_len())
1194        }
1195
1196        if let TestValue::Value(s) = self.test_val.as_ref() {
1197            if len != s.len() {
1198                return Ok(None);
1199            }
1200        }
1201
1202        let read = haystack.read_exact_count(len as u64)?;
1203
1204        Ok(Some(read))
1205    }
1206
1207    #[inline(always)]
1208    fn test_value_len(&self) -> usize {
1209        match self.test_val.as_ref() {
1210            TestValue::Value(s) => s.len(),
1211            TestValue::Any => 0,
1212        }
1213    }
1214}
1215
1216#[derive(Debug, Clone, Serialize, Deserialize)]
1217enum Test {
1218    Name(String),
1219    Use(bool, String),
1220    Scalar(ScalarTest),
1221    Float(FloatTest),
1222    String(StringTest),
1223    Search(SearchTest),
1224    PString(PStringTest),
1225    Regex(RegexTest),
1226    Indirect(FlagSet<IndirectMod>),
1227    String16(String16Test),
1228    // FIXME: placeholder for strength computation
1229    #[allow(dead_code)]
1230    Der,
1231    Clear,
1232    Default,
1233}
1234
1235impl Test {
1236    // read the value to test from the haystack
1237    #[inline]
1238    fn read_test_value<'haystack, R: Read + Seek>(
1239        &self,
1240        haystack: &'haystack mut LazyCache<R>,
1241        switch_endianness: bool,
1242    ) -> Result<Option<ReadValue<'haystack>>, Error> {
1243        let test_value_offset = haystack.lazy_stream_position();
1244
1245        match self {
1246            Self::Scalar(t) => {
1247                t.ty.read(haystack, switch_endianness)
1248                    .map(|s| Some(ReadValue::Scalar(test_value_offset, s)))
1249            }
1250
1251            Self::Float(t) => {
1252                t.ty.read(haystack, switch_endianness)
1253                    .map(|f| Some(ReadValue::Float(test_value_offset, f)))
1254            }
1255            Self::String(t) => {
1256                match t.test_val.as_ref() {
1257                    TestValue::Value(str) => {
1258                        let buf = if let Some(length) = t.length {
1259                            // if there is a length specified
1260                            haystack.read_exact_count(length as u64)?
1261                        } else {
1262                            // no length specified we read until end of string
1263
1264                            match t.cmp_op {
1265                                CmpOp::Eq | CmpOp::Neq => {
1266                                    if !t.has_length_mod() {
1267                                        haystack.read_exact_count(str.len() as u64)?
1268                                    } else {
1269                                        haystack.read_count(FILE_BYTES_MAX as u64)?
1270                                    }
1271                                }
1272                                CmpOp::Lt | CmpOp::Gt => {
1273                                    let read =
1274                                        haystack.read_until_any_delim_or_limit(b"\n\0", 8092)?;
1275
1276                                    if read.ends_with(b"\0") || read.ends_with(b"\n") {
1277                                        &read[..read.len() - 1]
1278                                    } else {
1279                                        read
1280                                    }
1281                                }
1282                                _ => {
1283                                    return Err(Error::Msg(format!(
1284                                        "string test does not support {:?} operator",
1285                                        t.cmp_op
1286                                    )));
1287                                }
1288                            }
1289                        };
1290
1291                        Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1292                    }
1293                    TestValue::Any => {
1294                        let read = haystack.read_until_any_delim_or_limit(b"\0\n", 8192)?;
1295                        // we don't take last byte if it matches end of string
1296                        let bytes = if read.ends_with(b"\0") || read.ends_with(b"\n") {
1297                            &read[..read.len() - 1]
1298                        } else {
1299                            read
1300                        };
1301
1302                        Ok(Some(ReadValue::Bytes(test_value_offset, bytes)))
1303                    }
1304                }
1305            }
1306
1307            Self::String16(t) => {
1308                match t.test_val.as_ref() {
1309                    TestValue::Value(str16) => {
1310                        let read = haystack.read_exact_count((str16.len() * 2) as u64)?;
1311
1312                        Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1313                    }
1314                    TestValue::Any => {
1315                        let read = haystack.read_until_utf16_or_limit(b"\x00\x00", 8192)?;
1316
1317                        // we make sure we have an even number of elements
1318                        let end = if read.len() % 2 == 0 {
1319                            read.len()
1320                        } else {
1321                            // we decide to read anyway even though
1322                            // length isn't even
1323                            read.len().saturating_sub(1)
1324                        };
1325
1326                        Ok(Some(ReadValue::Bytes(test_value_offset, &read[..end])))
1327                    }
1328                }
1329            }
1330
1331            Self::PString(t) => {
1332                let Some(read) = t.read(haystack)? else {
1333                    return Ok(None);
1334                };
1335                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1336            }
1337
1338            Self::Search(_) => {
1339                let buf = haystack.read_count(FILE_BYTES_MAX as u64)?;
1340                Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1341            }
1342
1343            Self::Regex(r) => {
1344                let length = {
1345                    match r.length {
1346                        Some(len) => {
1347                            if r.mods.contains(ReMod::LineLimit) {
1348                                len * 80
1349                            } else {
1350                                len
1351                            }
1352                        }
1353
1354                        None => FILE_REGEX_MAX,
1355                    }
1356                };
1357
1358                let read = haystack.read_count(length as u64)?;
1359                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1360            }
1361
1362            Self::Name(_)
1363            | Self::Use(_, _)
1364            | Self::Indirect(_)
1365            | Self::Clear
1366            | Self::Default
1367            | Self::Der => Err(Error::msg("no value to read for this test")),
1368        }
1369    }
1370
1371    #[inline(always)]
1372    fn match_value<'s>(
1373        &'s self,
1374        tv: &ReadValue<'s>,
1375        stream_kind: StreamKind,
1376    ) -> Option<MatchRes<'s>> {
1377        match (self, tv) {
1378            (Self::Scalar(t), ReadValue::Scalar(o, ts)) => {
1379                let read_value: Scalar = match t.transform.as_ref() {
1380                    Some(t) => t.apply(*ts)?,
1381                    None => *ts,
1382                };
1383
1384                match t.test_val {
1385                    TestValue::Value(test_value) => {
1386                        let ok = match t.cmp_op {
1387                            // NOTE: this should not happen in practice because
1388                            // we convert it into Eq equivalent at parsing time
1389                            CmpOp::Not => read_value == !test_value,
1390                            CmpOp::Eq => read_value == test_value,
1391                            CmpOp::Lt => read_value < test_value,
1392                            CmpOp::Gt => read_value > test_value,
1393                            CmpOp::Neq => read_value != test_value,
1394                            CmpOp::BitAnd => read_value & test_value == test_value,
1395                            CmpOp::Xor => (read_value & test_value).is_zero(),
1396                        };
1397
1398                        if ok {
1399                            Some(MatchRes::Scalar(*o, read_value))
1400                        } else {
1401                            None
1402                        }
1403                    }
1404
1405                    TestValue::Any => Some(MatchRes::Scalar(*o, read_value)),
1406                }
1407            }
1408
1409            (Self::Float(t), ReadValue::Float(o, f)) => {
1410                let read_value: Float = t.transform.as_ref().map(|t| t.apply(*f)).unwrap_or(*f);
1411
1412                match t.test_val {
1413                    TestValue::Value(tf) => {
1414                        let ok = match t.cmp_op {
1415                            CmpOp::Eq => read_value == tf,
1416                            CmpOp::Lt => read_value < tf,
1417                            CmpOp::Gt => read_value > tf,
1418                            CmpOp::Neq => read_value != tf,
1419                            _ => {
1420                                // this should never be reached as we validate
1421                                // operator in parser
1422                                debug_panic!("unsupported float comparison");
1423                                debug!("unsupported float comparison");
1424                                false
1425                            }
1426                        };
1427
1428                        if ok {
1429                            Some(MatchRes::Float(*o, read_value))
1430                        } else {
1431                            None
1432                        }
1433                    }
1434                    TestValue::Any => Some(MatchRes::Float(*o, read_value)),
1435                }
1436            }
1437
1438            (Self::String(st), ReadValue::Bytes(o, buf)) => {
1439                macro_rules! trim_buf {
1440                    ($buf: expr) => {{
1441                        if st.mods.contains(StringMod::Trim) {
1442                            $buf.trim_ascii()
1443                        } else {
1444                            $buf
1445                        }
1446                    }};
1447                }
1448
1449                match st.test_val.as_ref() {
1450                    TestValue::Value(str) => {
1451                        match st.cmp_op {
1452                            CmpOp::Eq => {
1453                                if let (true, _) = string_match(str, st.mods, buf) {
1454                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1455                                } else {
1456                                    None
1457                                }
1458                            }
1459                            CmpOp::Neq => {
1460                                if let (false, _) = string_match(str, st.mods, buf) {
1461                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1462                                } else {
1463                                    None
1464                                }
1465                            }
1466                            CmpOp::Gt => {
1467                                if buf.len() > str.len() {
1468                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1469                                } else {
1470                                    None
1471                                }
1472                            }
1473                            CmpOp::Lt => {
1474                                if buf.len() < str.len() {
1475                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1476                                } else {
1477                                    None
1478                                }
1479                            }
1480
1481                            // unsupported for strings
1482                            _ => {
1483                                // this should never be reached as we validate
1484                                // operator in parser
1485                                debug_panic!("unsupported string comparison");
1486                                debug!("unsupported string comparison");
1487                                None
1488                            }
1489                        }
1490                    }
1491                    TestValue::Any => {
1492                        Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1493                    }
1494                }
1495            }
1496
1497            (Self::PString(m), ReadValue::Bytes(o, buf)) => match m.test_val.as_ref() {
1498                TestValue::Value(psv) => {
1499                    if buf == psv {
1500                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8))
1501                    } else {
1502                        None
1503                    }
1504                }
1505                TestValue::Any => Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8)),
1506            },
1507
1508            (Self::String16(t), ReadValue::Bytes(o, buf)) => {
1509                match t.test_val.as_ref() {
1510                    TestValue::Value(str16) => {
1511                        // strings cannot be equal
1512                        if str16.len() * 2 != buf.len() {
1513                            return None;
1514                        }
1515
1516                        // we check string equality
1517                        for (i, utf16_char) in slice_to_utf16_iter(buf, t.encoding).enumerate() {
1518                            if str16[i] != utf16_char {
1519                                return None;
1520                            }
1521                        }
1522
1523                        Some(MatchRes::Bytes(
1524                            *o,
1525                            None,
1526                            t.orig.as_bytes(),
1527                            Encoding::Utf16(t.encoding),
1528                        ))
1529                    }
1530
1531                    TestValue::Any => {
1532                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf16(t.encoding)))
1533                    }
1534                }
1535            }
1536
1537            (Self::Regex(r), ReadValue::Bytes(o, buf)) => r.match_buf(*o, stream_kind, buf),
1538
1539            (Self::Search(t), ReadValue::Bytes(o, buf)) => t.match_buf(*o, buf),
1540
1541            _ => None,
1542        }
1543    }
1544
1545    #[inline(always)]
1546    fn strength(&self) -> u64 {
1547        const MULT: usize = 10;
1548
1549        let mut out = 2 * MULT;
1550
1551        // FIXME: octal is missing but it is not used in practice ...
1552        match self {
1553            Test::Scalar(s) => {
1554                out += s.ty.type_size() * MULT;
1555            }
1556
1557            Test::Float(t) => {
1558                out += t.ty.type_size() * MULT;
1559            }
1560
1561            Test::String(t) => out += t.test_value_len().saturating_mul(MULT),
1562
1563            Test::PString(t) => out += t.test_value_len().saturating_mul(MULT),
1564
1565            Test::Search(s) => {
1566                // NOTE: this implementation deviates from what is in
1567                // C libmagic. The purpose of this implementation is to
1568                // minimize the difference between similar tests,
1569                // implemented differently (ex: string test VS very localized search test).
1570                let n_pos = s.n_pos.unwrap_or(FILE_BYTES_MAX);
1571
1572                match n_pos {
1573                    // a search on one line should be equivalent to a string match
1574                    0..=80 => out += s.str.len().saturating_mul(MULT),
1575                    // search on the first 3 lines gets a little penalty
1576                    81..=240 => out += s.str.len() * s.str.len().clamp(0, MULT - 2),
1577                    // a search on more than 3 lines isn't considered very accurate
1578                    _ => out += s.str.len(),
1579                }
1580            }
1581
1582            Test::Regex(r) => {
1583                // NOTE: this implementation deviates from what is in
1584                // C libmagic. The purpose of this implementation is to
1585                // minimize the difference between similar tests,
1586                // implemented differently (ex: string test VS very localized regex test).
1587
1588                // we divide length by the number of capture group
1589                // which gives us a value close to he average string
1590                // length match in the regex.
1591                let v = r.non_magic_len / r.re.captures_len();
1592
1593                let len = r
1594                    .length
1595                    .map(|l| {
1596                        if r.mods.contains(ReMod::LineLimit) {
1597                            l * 80
1598                        } else {
1599                            l
1600                        }
1601                    })
1602                    .unwrap_or(FILE_BYTES_MAX);
1603
1604                match len {
1605                    // a search on one line should be equivalent to a string match
1606                    0..=80 => out += v.saturating_mul(MULT),
1607                    // search on the first 3 lines gets a little penalty
1608                    81..=240 => out += v * v.clamp(0, MULT - 2),
1609                    // a search on more than 3 lines isn't considered very accurate
1610                    _ => out += v,
1611                }
1612            }
1613
1614            Test::String16(t) => {
1615                // NOTE: in libmagic the result is div by 2
1616                // but I GUESS it is because the len is expressed
1617                // in number bytes. In our case length is expressed
1618                // in number of u16 so we shouldn't divide.
1619                out += t.test_value_len().saturating_mul(MULT);
1620            }
1621
1622            Test::Der => out += MULT,
1623
1624            Test::Default | Test::Name(_) | Test::Use(_, _) | Test::Indirect(_) | Test::Clear => {
1625                return 0;
1626            }
1627        }
1628
1629        // matching any output gets penalty
1630        if self.is_match_any() {
1631            return 0;
1632        }
1633
1634        if let Some(op) = self.cmp_op() {
1635            match op {
1636                // matching almost any gets penalty
1637                CmpOp::Neq => out = 0,
1638                CmpOp::Eq | CmpOp::Not => out += MULT,
1639                CmpOp::Lt | CmpOp::Gt => out -= 2 * MULT,
1640                CmpOp::Xor | CmpOp::BitAnd => out -= MULT,
1641            }
1642        }
1643
1644        out as u64
1645    }
1646
1647    #[inline(always)]
1648    fn cmp_op(&self) -> Option<CmpOp> {
1649        match self {
1650            Self::String(t) => Some(t.cmp_op),
1651            Self::Scalar(s) => Some(s.cmp_op),
1652            Self::Float(t) => Some(t.cmp_op),
1653            Self::Name(_)
1654            | Self::Use(_, _)
1655            | Self::Search(_)
1656            | Self::PString(_)
1657            | Self::Regex(_)
1658            | Self::Clear
1659            | Self::Default
1660            | Self::Indirect(_)
1661            | Self::String16(_)
1662            | Self::Der => None,
1663        }
1664    }
1665
1666    #[inline(always)]
1667    fn is_match_any(&self) -> bool {
1668        match self {
1669            Test::Name(_) => false,
1670            Test::Use(_, _) => false,
1671            Test::Scalar(scalar_test) => matches!(scalar_test.test_val, TestValue::Any),
1672            Test::Float(float_test) => matches!(float_test.test_val, TestValue::Any),
1673            Test::String(string_test) => matches!(string_test.test_val, TestValue::Any),
1674            Test::Search(_) => false,
1675            Test::PString(pstring_test) => matches!(pstring_test.test_val, TestValue::Any),
1676            Test::Regex(_) => false,
1677            Test::Indirect(_) => false,
1678            Test::String16(string16_test) => matches!(string16_test.test_val, TestValue::Any),
1679            Test::Der => false,
1680            Test::Clear => false,
1681            Test::Default => false,
1682        }
1683    }
1684
1685    #[inline(always)]
1686    fn is_binary(&self) -> bool {
1687        match self {
1688            Self::Name(_) => true,
1689            Self::Use(_, _) => true,
1690            Self::Scalar(_) => true,
1691            Self::Float(_) => true,
1692            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1693            Self::Search(t) => t.is_binary(),
1694            Self::PString(_) => true,
1695            Self::Regex(t) => t.is_binary(),
1696            Self::Clear => true,
1697            Self::Default => true,
1698            Self::Indirect(_) => true,
1699            Self::String16(_) => true,
1700            Self::Der => true,
1701        }
1702    }
1703
1704    #[inline(always)]
1705    fn is_text(&self) -> bool {
1706        match self {
1707            Self::Name(_) => true,
1708            Self::Use(_, _) => true,
1709            Self::Indirect(_) => true,
1710            Self::Clear => true,
1711            Self::Default => true,
1712            Self::String(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1713            _ => !self.is_binary(),
1714        }
1715    }
1716
1717    #[inline(always)]
1718    fn is_only_text(&self) -> bool {
1719        self.is_text() && !self.is_binary()
1720    }
1721
1722    #[inline(always)]
1723    fn is_only_binary(&self) -> bool {
1724        self.is_binary() && !self.is_text()
1725    }
1726}
1727
1728#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
1729enum OffsetType {
1730    Byte,
1731    DoubleLe,
1732    DoubleBe,
1733    ShortLe,
1734    ShortBe,
1735    Id3Le,
1736    Id3Be,
1737    LongLe,
1738    LongBe,
1739    Middle,
1740    Octal,
1741    QuadBe,
1742    QuadLe,
1743}
1744
1745#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
1746enum Shift {
1747    Direct(u64),
1748    Indirect(i64),
1749}
1750
1751#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
1752struct IndOffset {
1753    // where to find the offset
1754    off_addr: DirOffset,
1755    // signed or unsigned
1756    signed: bool,
1757    // type of the offset
1758    ty: OffsetType,
1759    op: Option<Op>,
1760    shift: Option<Shift>,
1761}
1762
1763impl IndOffset {
1764    // if we overflow we must not return an offset
1765    fn read_offset<R: Read + Seek>(
1766        &self,
1767        haystack: &mut LazyCache<R>,
1768        rule_base_offset: Option<u64>,
1769        last_upper_match_offset: Option<u64>,
1770    ) -> Result<Option<u64>, io::Error> {
1771        let offset_address = match self.off_addr {
1772            DirOffset::Start(s) => {
1773                let Some(o) = s.checked_add(rule_base_offset.unwrap_or_default()) else {
1774                    return Ok(None);
1775                };
1776
1777                haystack.seek(SeekFrom::Start(o))?
1778            }
1779            DirOffset::LastUpper(c) => haystack.seek(SeekFrom::Start(
1780                (last_upper_match_offset.unwrap_or_default() as i64 + c) as u64,
1781            ))?,
1782            DirOffset::End(e) => haystack.seek(SeekFrom::End(e))?,
1783        };
1784
1785        macro_rules! read_value {
1786            () => {
1787                match self.ty {
1788                    OffsetType::Byte => {
1789                        if self.signed {
1790                            read_le!(haystack, u8) as u64
1791                        } else {
1792                            read_le!(haystack, i8) as u64
1793                        }
1794                    }
1795                    OffsetType::DoubleLe => read_le!(haystack, f64) as u64,
1796                    OffsetType::DoubleBe => read_be!(haystack, f64) as u64,
1797                    OffsetType::ShortLe => {
1798                        if self.signed {
1799                            read_le!(haystack, i16) as u64
1800                        } else {
1801                            read_le!(haystack, u16) as u64
1802                        }
1803                    }
1804                    OffsetType::ShortBe => {
1805                        if self.signed {
1806                            read_be!(haystack, i16) as u64
1807                        } else {
1808                            read_be!(haystack, u16) as u64
1809                        }
1810                    }
1811                    OffsetType::Id3Le => decode_id3(read_le!(haystack, u32)) as u64,
1812                    OffsetType::Id3Be => decode_id3(read_be!(haystack, u32)) as u64,
1813                    OffsetType::LongLe => {
1814                        if self.signed {
1815                            read_le!(haystack, i32) as u64
1816                        } else {
1817                            read_le!(haystack, u32) as u64
1818                        }
1819                    }
1820                    OffsetType::LongBe => {
1821                        if self.signed {
1822                            read_be!(haystack, i32) as u64
1823                        } else {
1824                            read_be!(haystack, u32) as u64
1825                        }
1826                    }
1827                    OffsetType::Middle => read_me!(haystack) as u64,
1828                    OffsetType::Octal => {
1829                        if let Some(o) = read_octal_u64(haystack) {
1830                            o
1831                        } else {
1832                            debug!("failed to read octal offset @ {offset_address}");
1833                            return Ok(None);
1834                        }
1835                    }
1836                    OffsetType::QuadLe => {
1837                        if self.signed {
1838                            read_le!(haystack, i64) as u64
1839                        } else {
1840                            read_le!(haystack, u64)
1841                        }
1842                    }
1843                    OffsetType::QuadBe => {
1844                        if self.signed {
1845                            read_be!(haystack, i64) as u64
1846                        } else {
1847                            read_be!(haystack, u64)
1848                        }
1849                    }
1850                }
1851            };
1852        }
1853
1854        // in theory every offset read should end up in something seekable from start, so we can use u64 to store the result
1855        let o = read_value!();
1856
1857        trace!(
1858            "offset read @ {offset_address} value={o} op={:?} shift={:?}",
1859            self.op, self.shift
1860        );
1861
1862        // apply transformation
1863        if let (Some(op), Some(shift)) = (self.op, self.shift) {
1864            let shift = match shift {
1865                Shift::Direct(i) => i,
1866                Shift::Indirect(i) => {
1867                    let tmp = offset_address as i128 + i as i128;
1868                    if tmp.is_negative() {
1869                        return Ok(None);
1870                    } else {
1871                        haystack.seek(SeekFrom::Start(tmp as u64))?;
1872                    };
1873                    // NOTE: here we assume that the shift has the same
1874                    // type as the main offset !
1875                    read_value!()
1876                }
1877            };
1878
1879            match op {
1880                Op::Add => return Ok(o.checked_add(shift)),
1881                Op::Mul => return Ok(o.checked_mul(shift)),
1882                Op::Sub => return Ok(o.checked_sub(shift)),
1883                Op::Div => return Ok(o.checked_div(shift)),
1884                Op::Mod => return Ok(o.checked_rem(shift)),
1885                Op::And => return Ok(Some(o & shift)),
1886                Op::Or => return Ok(Some(o | shift)),
1887                Op::Xor => return Ok(Some(o ^ shift)),
1888            }
1889        }
1890
1891        Ok(Some(o))
1892    }
1893}
1894
/// A direct offset, i.e. one that is known without reading a value from the stream.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum DirOffset {
    /// Offset counted from the start; `Match::matches` adds the rule base offset to it
    Start(u64),
    /// Signed shift relative to the last up-level field
    LastUpper(i64),
    /// Signed value handed to `SeekFrom::End`, i.e. relative to the end of the stream
    End(i64),
}
1902
/// Offset at which a test applies.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Offset {
    /// Offset known without reading the stream
    Direct(DirOffset),
    /// Offset resolved at match time by calling `read_offset` on the `IndOffset`
    Indirect(IndOffset),
}
1908
1909impl From<DirOffset> for Offset {
1910    fn from(value: DirOffset) -> Self {
1911        Self::Direct(value)
1912    }
1913}
1914
1915impl From<IndOffset> for Offset {
1916    fn from(value: IndOffset) -> Self {
1917        Self::Indirect(value)
1918    }
1919}
1920
1921impl Display for DirOffset {
1922    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1923        match self {
1924            DirOffset::Start(i) => write!(f, "{i}"),
1925            DirOffset::LastUpper(c) => write!(f, "&{c}"),
1926            DirOffset::End(e) => write!(f, "-{e}"),
1927        }
1928    }
1929}
1930
1931impl Default for DirOffset {
1932    fn default() -> Self {
1933        Self::LastUpper(0)
1934    }
1935}
1936
/// A single magic test: where to look (`offset`), what to check (`test`)
/// and the optional message attached to it.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Match {
    /// line number reported in logs and in localized errors
    line: usize,
    /// depth of the entry; wrapped as is into its `ContinuationLevel`
    depth: u8,
    /// where the test applies
    offset: Offset,
    /// the test to run
    test: Test,
    /// strength of `test`; the `From<Use>` and `From<Name>` conversions
    /// fill it with `test.strength()`
    test_strength: u64,
    /// message attached to the entry, if any
    message: Option<Message>,
}
1946
1947impl From<Use> for Match {
1948    fn from(value: Use) -> Self {
1949        let test = Test::Use(value.switch_endianness, value.rule_name);
1950        let test_strength = test.strength();
1951        Self {
1952            line: value.line,
1953            depth: value.depth,
1954            offset: value.start_offset,
1955            test,
1956            test_strength,
1957            message: value.message,
1958        }
1959    }
1960}
1961
1962impl From<Name> for Match {
1963    fn from(value: Name) -> Self {
1964        let test = Test::Name(value.name);
1965        let test_strength = test.strength();
1966        Self {
1967            line: value.line,
1968            depth: 0,
1969            offset: Offset::Direct(DirOffset::Start(0)),
1970            test,
1971            test_strength,
1972            message: value.message,
1973        }
1974    }
1975}
1976
1977impl Match {
1978    /// Turns the `Match`'s offset into an absolute offset from the start of the stream
1979    #[inline(always)]
1980    fn offset_from_start<R: Read + Seek>(
1981        &self,
1982        haystack: &mut LazyCache<R>,
1983        rule_base_offset: Option<u64>,
1984        last_level_offset: Option<u64>,
1985    ) -> Result<Option<u64>, io::Error> {
1986        match self.offset {
1987            Offset::Direct(dir_offset) => match dir_offset {
1988                DirOffset::Start(s) => Ok(Some(s)),
1989                DirOffset::LastUpper(shift) => {
1990                    let o = last_level_offset.unwrap_or_default() as i64 + shift;
1991
1992                    if o.is_positive() {
1993                        Ok(Some(o as u64))
1994                    } else {
1995                        Ok(None)
1996                    }
1997                }
1998                DirOffset::End(e) => Ok(Some(haystack.offset_from_start(SeekFrom::End(e)))),
1999            },
2000            Offset::Indirect(ind_offset) => {
2001                let Some(o) =
2002                    ind_offset.read_offset(haystack, rule_base_offset, last_level_offset)?
2003                else {
2004                    return Ok(None);
2005                };
2006
2007                Ok(Some(o))
2008            }
2009        }
2010    }
2011
2012    /// this method emulates the buffer based matching
2013    /// logic implemented in libmagic. It needs some aweful
2014    /// and weird offset convertions to turn buffer
2015    /// relative offsets (libmagic is based on) into
2016    /// absolute offset in the file.
2017    ///
2018    /// this method shoud bubble up only critical errors
2019    /// all the other errors should make the match result
2020    /// false and be logged via debug!
2021    ///
2022    /// the function returns an error if the maximum recursion
2023    /// has been reached or if a dependency rule is missing.
2024    #[inline]
2025    #[allow(clippy::too_many_arguments)]
2026    fn matches<'a: 'h, 'h, R: Read + Seek>(
2027        &'a self,
2028        source: Option<&str>,
2029        magic: &mut Magic<'a>,
2030        stream_kind: StreamKind,
2031        state: &mut MatchState,
2032        buf_base_offset: Option<u64>,
2033        rule_base_offset: Option<u64>,
2034        last_level_offset: Option<u64>,
2035        haystack: &'h mut LazyCache<R>,
2036        switch_endianness: bool,
2037        db: &'a MagicDb,
2038        depth: usize,
2039    ) -> Result<(bool, Option<MatchRes<'h>>), Error> {
2040        let source = source.unwrap_or("unknown");
2041        let line = self.line;
2042
2043        if depth >= MAX_RECURSION {
2044            return Err(Error::localized(
2045                source,
2046                line,
2047                Error::MaximumRecursion(MAX_RECURSION),
2048            ));
2049        }
2050
2051        if self.test.is_only_binary() && stream_kind.is_text() {
2052            trace!("skip binary test source={source} line={line} stream_kind={stream_kind:?}",);
2053            return Ok((false, None));
2054        }
2055
2056        if self.test.is_only_text() && !stream_kind.is_text() {
2057            trace!("skip text test source={source} line={line} stream_kind={stream_kind:?}",);
2058            return Ok((false, None));
2059        }
2060
2061        let Ok(Some(mut offset)) = self
2062            .offset_from_start(haystack, rule_base_offset, last_level_offset)
2063            .inspect_err(|e| debug!("source={source} line={line} failed at computing offset: {e}"))
2064        else {
2065            return Ok((false, None));
2066        };
2067
2068        offset = match self.offset {
2069            Offset::Indirect(_) => {
2070                // the result we get for an indirect offset
2071                // is relative to the start of the libmagic
2072                // buffer so we need to add base to make it
2073                // absolute.
2074                buf_base_offset.unwrap_or_default().saturating_add(offset)
2075            }
2076            // offset from start are computed from rule base
2077            Offset::Direct(DirOffset::Start(_)) => {
2078                rule_base_offset.unwrap_or_default().saturating_add(offset)
2079            }
2080            _ => offset,
2081        };
2082
2083        match &self.test {
2084            Test::Clear => {
2085                trace!("source={source} line={line} clear");
2086                state.clear_continuation_level(&self.continuation_level());
2087                Ok((true, None))
2088            }
2089
2090            Test::Name(name) => {
2091                trace!(
2092                    "source={source} line={line} running rule {name} switch_endianness={switch_endianness}",
2093                );
2094                Ok((true, None))
2095            }
2096
2097            Test::Use(flip_endianness, rule_name) => {
2098                trace!(
2099                    "source={source} line={line} use {rule_name} switch_endianness={flip_endianness}",
2100                );
2101
2102                // switch_endianness must propagate down the rule call stack
2103                let switch_endianness = switch_endianness ^ flip_endianness;
2104
2105                let dr: &DependencyRule = db.dependencies.get(rule_name).ok_or(
2106                    Error::localized(source, line, Error::MissingRule(rule_name.clone())),
2107                )?;
2108
2109                // we push the message here otherwise we push message in depth first
2110                if let Some(msg) = self.message.as_ref() {
2111                    magic.push_message(msg.to_string_lossy());
2112                }
2113
2114                dr.rule.magic(
2115                    magic,
2116                    stream_kind,
2117                    buf_base_offset,
2118                    Some(offset),
2119                    haystack,
2120                    db,
2121                    switch_endianness,
2122                    depth.saturating_add(1),
2123                )?;
2124
2125                // we return false not to push message again
2126                Ok((false, None))
2127            }
2128
2129            Test::Indirect(m) => {
2130                trace!(
2131                    "source={source} line={line} indirect mods={:?} offset={offset:#x}",
2132                    m
2133                );
2134
2135                let new_buf_base_off = if m.contains(IndirectMod::Relative) {
2136                    Some(offset)
2137                } else {
2138                    None
2139                };
2140
2141                // we push the message here otherwise we push message in depth first
2142                if let Some(msg) = self.message.as_ref() {
2143                    magic.push_message(msg.to_string_lossy());
2144                }
2145
2146                for r in db.rules.iter() {
2147                    let messages_cnt = magic.message.len();
2148
2149                    r.magic(
2150                        magic,
2151                        stream_kind,
2152                        new_buf_base_off,
2153                        Some(offset),
2154                        haystack,
2155                        db,
2156                        false,
2157                        depth.saturating_add(1),
2158                    )?;
2159
2160                    // this means we matched a rule
2161                    if magic.message.len() != messages_cnt {
2162                        break;
2163                    }
2164                }
2165
2166                // we return false not to push message again
2167                Ok((false, None))
2168            }
2169
2170            Test::Default => {
2171                // default matches if nothing else at the continuation level matched
2172                let ok = !state.get_continuation_level(&self.continuation_level());
2173
2174                trace!("source={source} line={line} default match={ok}");
2175                if ok {
2176                    state.set_continuation_level(self.continuation_level());
2177                }
2178
2179                Ok((ok, None))
2180            }
2181
2182            _ => {
2183                if let Err(e) = haystack.seek(SeekFrom::Start(offset)) {
2184                    debug!("source={source} line={line} failed to seek in haystack: {e}");
2185                    return Ok((false, None));
2186                }
2187
2188                let mut trace_msg = None;
2189
2190                if enabled!(Level::DEBUG) {
2191                    trace_msg = Some(vec![format!(
2192                        "source={source} line={line} depth={} stream_offset={:#x}",
2193                        self.depth,
2194                        haystack.lazy_stream_position()
2195                    )])
2196                }
2197
2198                // NOTE: we may have a way to optimize here. In case we do a Any
2199                // test and we don't use the value to format the message, we don't
2200                // need to read the value.
2201                if let Ok(opt_test_value) = self
2202                    .test
2203                    .read_test_value(haystack, switch_endianness)
2204                    .inspect_err(|e| {
2205                        debug!("source={source} line={line} error while reading test value @{offset}: {e}",)
2206                    })
2207                {
2208                    if let Some(v) = trace_msg
2209                        .as_mut() { v.push(format!("test={:?}", self.test)) }
2210
2211                    let match_res =
2212                        opt_test_value.and_then(|tv| self.test.match_value(&tv, stream_kind));
2213
2214                    if let Some(v) = trace_msg.as_mut() { v.push(format!(
2215                            "message=\"{}\" match={}",
2216                            self.message
2217                                .as_ref()
2218                                .map(|fs| fs.to_string_lossy())
2219                                .unwrap_or_default(),
2220                            match_res.is_some()
2221                        )) }
2222
2223                    // trace message
2224                    if enabled!(Level::DEBUG) && !enabled!(Level::TRACE) && match_res.is_some() {
2225                        if let Some(m) = trace_msg{
2226                            debug!("{}", m.join(" "));
2227                        }
2228                    } else if enabled!(Level::TRACE) {
2229                        if let Some(m) = trace_msg{
2230                            trace!("{}", m.join(" "));
2231                        }
2232                    }
2233
2234                    if let Some(mr) = match_res {
2235                        state.set_continuation_level(self.continuation_level());
2236                        return Ok((true, Some(mr)));
2237                    }
2238                }
2239
2240                Ok((false, None))
2241            }
2242        }
2243    }
2244
2245    #[inline(always)]
2246    fn continuation_level(&self) -> ContinuationLevel {
2247        ContinuationLevel(self.depth)
2248    }
2249}
2250
/// A parsed `use` entry; `From<Use> for Match` turns it into a `Match`
/// running a `Test::Use`.
#[derive(Debug, Clone)]
struct Use {
    /// line of the entry
    line: usize,
    /// depth of the entry
    depth: u8,
    /// becomes the offset of the resulting `Match`
    start_offset: Offset,
    /// name of the rule to run
    rule_name: String,
    /// whether endianness must be flipped while running the rule
    switch_endianness: bool,
    /// message attached to the entry, if any
    message: Option<Message>,
}
2260
/// A strength modifier: an operator and its operand, applied
/// to a strength value by `StrengthMod::apply`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StrengthMod {
    /// operator to apply
    op: Op,
    /// right-hand operand of the operator
    by: u8,
}
2266
2267impl StrengthMod {
2268    #[inline(always)]
2269    fn apply(&self, strength: u64) -> u64 {
2270        let by = self.by as u64;
2271        debug!("applying strength modifier: {strength} {} {}", self.op, by);
2272        match self.op {
2273            Op::Mul => strength.saturating_mul(by),
2274            Op::Add => strength.saturating_add(by),
2275            Op::Sub => strength.saturating_sub(by),
2276            Op::Div => {
2277                if by > 0 {
2278                    strength.saturating_div(by)
2279                } else {
2280                    strength
2281                }
2282            }
2283            Op::Mod => strength % by,
2284            Op::And => strength & by,
2285            // this should never happen as strength operators
2286            // are enforced by our parser
2287            Op::Xor | Op::Or => {
2288                debug_panic!("unsupported strength operator");
2289                strength
2290            }
2291        }
2292    }
2293}
2294
/// An entry of a magic rule which is not a test (see `Entry::Flag`).
// NOTE(review): the variants presumably feed the `mimetype`, `exts`,
// `strength_mod` and `apple` fields of `EntryNode` -- the conversion is
// not visible from here, confirm in the parser.
#[derive(Debug, Clone)]
enum Flag {
    /// MIME type
    Mime(String),
    /// set of file extensions
    Ext(HashSet<String>),
    /// strength modifier
    Strength(StrengthMod),
    /// Apple related string
    Apple(String),
}
2302
/// A parsed `name` entry; `From<Name> for Match` turns it into a
/// depth 0 `Match` running a `Test::Name`.
#[derive(Debug, Clone)]
struct Name {
    /// line of the entry
    line: usize,
    /// name of the rule
    name: String,
    /// message attached to the entry, if any
    message: Option<Message>,
}
2309
/// A parsed entry, either a test or a flag, paired with a `Span`
/// (presumably the span of source text it was parsed from -- confirm
/// in the parser).
#[derive(Debug, Clone)]
enum Entry<'span> {
    /// a test entry
    Match(Span<'span>, Match),
    /// a flag entry
    Flag(Span<'span>, Flag),
}
2315
/// A node of a rule's tree of tests: one test and the children
/// evaluated when that test matches.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EntryNode {
    /// when set, `matches` recomputes the rule base offset handed to the
    /// children if the entry offset is relative to the end of the stream
    root: bool,
    /// the test run for this node
    entry: Match,
    /// nodes evaluated only when `entry` matches
    children: Vec<EntryNode>,
    /// MIME type set on the result when `entry` matches
    mimetype: Option<String>,
    /// creator code set on the result when `entry` matches
    apple: Option<String>,
    /// modifier applied to the strength when `entry` matches
    strength_mod: Option<StrengthMod>,
    /// extensions added to the result when `entry` matches
    exts: HashSet<String>,
}
2326
2327impl EntryNode {
2328    fn update_exts_rec(
2329        &self,
2330        exts: &mut HashSet<String>,
2331        deps: &HashMap<String, DependencyRule>,
2332        marked: &mut HashSet<String>,
2333    ) -> Result<(), ()> {
2334        for ext in self.exts.iter() {
2335            if !exts.contains(ext) {
2336                exts.insert(ext.clone());
2337            }
2338        }
2339
2340        for c in self.children.iter() {
2341            if let Test::Use(_, ref name) = c.entry.test {
2342                if marked.contains(name) {
2343                    continue;
2344                }
2345                if let Some(r) = deps.get(name) {
2346                    marked.insert(name.clone());
2347                    exts.extend(r.rule.fetch_all_extensions(deps, marked)?);
2348                } else {
2349                    return Err(());
2350                }
2351            } else {
2352                c.update_exts_rec(exts, deps, marked)?;
2353            }
2354        }
2355
2356        Ok(())
2357    }
2358
2359    fn update_score_rec(
2360        &self,
2361        depth: usize,
2362        score: &mut u64,
2363        deps: &HashMap<String, DependencyRule>,
2364        marked: &mut HashSet<String>,
2365    ) {
2366        if depth == 3 {
2367            return;
2368        }
2369
2370        *score += self
2371            .children
2372            .iter()
2373            .map(|e| e.entry.test_strength)
2374            .min()
2375            .unwrap_or_default();
2376
2377        for c in self.children.iter() {
2378            if let Test::Use(_, ref name) = c.entry.test {
2379                if marked.contains(name) {
2380                    continue;
2381                }
2382
2383                if let Some(r) = deps.get(name) {
2384                    marked.insert(name.clone());
2385                    *score += r.rule.compute_score(depth, deps, marked);
2386                }
2387            }
2388            c.update_score_rec(depth + 1, score, deps, marked);
2389        }
2390    }
2391
2392    #[inline]
2393    #[allow(clippy::too_many_arguments)]
2394    fn matches<'r, R: Read + Seek>(
2395        &'r self,
2396        opt_source: Option<&str>,
2397        magic: &mut Magic<'r>,
2398        state: &mut MatchState,
2399        stream_kind: StreamKind,
2400        buf_base_offset: Option<u64>,
2401        rule_base_offset: Option<u64>,
2402        last_level_offset: Option<u64>,
2403        haystack: &mut LazyCache<R>,
2404        db: &'r MagicDb,
2405        switch_endianness: bool,
2406        depth: usize,
2407    ) -> Result<(), Error> {
2408        let (ok, opt_match_res) = self.entry.matches(
2409            opt_source,
2410            magic,
2411            stream_kind,
2412            state,
2413            buf_base_offset,
2414            rule_base_offset,
2415            last_level_offset,
2416            haystack,
2417            switch_endianness,
2418            db,
2419            depth,
2420        )?;
2421
2422        let source = opt_source.unwrap_or("unknown");
2423        let line = self.entry.line;
2424
2425        if ok {
2426            // update magic with message if match is successful
2427            if let Some(msg) = self.entry.message.as_ref() {
2428                if let Ok(msg) = msg.format_with(opt_match_res.as_ref()).inspect_err(|e| {
2429                    debug!("source={source} line={line} failed to format message: {e}")
2430                }) {
2431                    magic.push_message(msg);
2432                }
2433            }
2434
2435            // we need to adjust stream offset in case of regex/search tests
2436            if let Some(mr) = opt_match_res {
2437                match &self.entry.test {
2438                    Test::String(t) => {
2439                        if t.has_length_mod() {
2440                            let o = mr.end_offset();
2441                            haystack.seek(SeekFrom::Start(o))?;
2442                        }
2443                    }
2444                    Test::Search(t) => {
2445                        if t.re_mods.contains(ReMod::StartOffsetUpdate) {
2446                            let o = mr.start_offset();
2447                            haystack.seek(SeekFrom::Start(o))?;
2448                        } else {
2449                            let o = mr.end_offset();
2450                            haystack.seek(SeekFrom::Start(o))?;
2451                        }
2452                    }
2453
2454                    Test::Regex(t) => {
2455                        if t.mods.contains(ReMod::StartOffsetUpdate) {
2456                            let o = mr.start_offset();
2457                            haystack.seek(SeekFrom::Start(o))?;
2458                        } else {
2459                            let o = mr.end_offset();
2460                            haystack.seek(SeekFrom::Start(o))?;
2461                        }
2462                    }
2463                    // other types do not need offset adjustement
2464                    _ => {}
2465                }
2466            }
2467
2468            if let Some(mimetype) = self.mimetype.as_ref() {
2469                magic.set_mime_type(Cow::Borrowed(mimetype));
2470            }
2471
2472            if let Some(apple_ty) = self.apple.as_ref() {
2473                magic.set_creator_code(Cow::Borrowed(apple_ty));
2474            }
2475
2476            if !self.exts.is_empty() {
2477                magic.insert_extensions(self.exts.iter().map(|s| s.as_str()));
2478            }
2479
2480            // NOTE: here we try to implement a similar logic as in file_magic_strength.
2481            // Sticking to the exact same strength computation logic is complicated due
2482            // to implementation differences. Let's wait and see if that is a real issue.
2483            let mut strength = self.entry.test_strength;
2484
2485            let continuation_level = self.entry.continuation_level().0 as u64;
2486            if self.entry.message.is_none() && continuation_level < 3 {
2487                strength = strength.saturating_add(continuation_level);
2488            }
2489
2490            if let Some(sm) = self.strength_mod.as_ref() {
2491                strength = sm.apply(strength);
2492            }
2493
2494            // entries with no message get a bonus
2495            if self.entry.message.is_none() {
2496                strength += 1
2497            }
2498
2499            magic.update_strength(strength);
2500
2501            let end_upper_level = haystack.lazy_stream_position();
2502
2503            // we have to fix rule_base_offset if
2504            // the rule_base_starts from end otherwise it
2505            // breaks some offset computation in match
2506            // see test_offset_bug_1 and test_offset_bug_2
2507            // they implement the same test logic yet indirect
2508            // offsets have to be different so that it works
2509            // in libmagic/file
2510            let rule_base_offset = if self.root {
2511                match self.entry.offset {
2512                    Offset::Direct(DirOffset::End(o)) => {
2513                        Some(haystack.offset_from_start(SeekFrom::End(o)))
2514                    }
2515                    _ => rule_base_offset,
2516                }
2517            } else {
2518                rule_base_offset
2519            };
2520
2521            for e in self.children.iter() {
2522                e.matches(
2523                    opt_source,
2524                    magic,
2525                    state,
2526                    stream_kind,
2527                    buf_base_offset,
2528                    rule_base_offset,
2529                    Some(end_upper_level),
2530                    haystack,
2531                    db,
2532                    switch_endianness,
2533                    depth,
2534                )?
2535            }
2536        }
2537
2538        Ok(())
2539    }
2540}
2541
/// Represents a parsed magic rule
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicRule {
    /// identifier assigned through `set_id`
    id: usize,
    /// the rule's filename if any
    source: Option<String>,
    /// root of the tree of tests
    entries: EntryNode,
    /// extensions associated to the rule, extended by `try_finalize`
    extensions: HashSet<String>,
    /// score used for rule ranking
    score: u64,
    /// set once `try_finalize` has gathered the extensions and computed the score
    finalized: bool,
}
2553
2554impl MagicRule {
2555    #[inline(always)]
2556    fn set_id(&mut self, id: usize) {
2557        self.id = id
2558    }
2559
2560    /// Fetches all the extensions defined in the magic rule. This
2561    /// function goes recursive and find extensions also defined in
2562    /// dependencies
2563    fn fetch_all_extensions(
2564        &self,
2565        deps: &HashMap<String, DependencyRule>,
2566        marked: &mut HashSet<String>,
2567    ) -> Result<HashSet<String>, ()> {
2568        let mut exts = HashSet::new();
2569        self.entries.update_exts_rec(&mut exts, deps, marked)?;
2570        Ok(exts)
2571    }
2572
2573    /// Computes the ranking score of a magic rule by walking
2574    /// tests recursively, dependencies included.
2575    fn compute_score(
2576        &self,
2577        depth: usize,
2578        deps: &HashMap<String, DependencyRule>,
2579        marked: &mut HashSet<String>,
2580    ) -> u64 {
2581        let mut score = 0;
2582        score += self.entries.entry.test_strength;
2583        self.entries
2584            .update_score_rec(depth, &mut score, deps, marked);
2585        score
2586    }
2587
2588    /// Finalize a rule by searching for all extensions and computing its score
2589    /// for ranking. In the `MagicRule` is already finalized it returns immediately.
2590    fn try_finalize(&mut self, deps: &HashMap<String, DependencyRule>) {
2591        if self.finalized {
2592            return;
2593        }
2594
2595        let Ok(exts) = self.fetch_all_extensions(deps, &mut HashSet::new()) else {
2596            return;
2597        };
2598
2599        self.extensions.extend(exts);
2600
2601        // fetch_all_extensions walks through all the dependencies
2602        // so there is no reason for compute_score to fail as it is walking
2603        // only some of them
2604        self.score = self.compute_score(0, deps, &mut HashSet::new());
2605        self.finalized = true
2606    }
2607
2608    #[inline]
2609    fn magic_entrypoint<'r, R: Read + Seek>(
2610        &'r self,
2611        magic: &mut Magic<'r>,
2612        stream_kind: StreamKind,
2613        haystack: &mut LazyCache<R>,
2614        db: &'r MagicDb,
2615        switch_endianness: bool,
2616        depth: usize,
2617    ) -> Result<(), Error> {
2618        self.entries.matches(
2619            self.source.as_deref(),
2620            magic,
2621            &mut MatchState::empty(),
2622            stream_kind,
2623            None,
2624            None,
2625            None,
2626            haystack,
2627            db,
2628            switch_endianness,
2629            depth,
2630        )
2631    }
2632
2633    #[inline]
2634    #[allow(clippy::too_many_arguments)]
2635    fn magic<'r, R: Read + Seek>(
2636        &'r self,
2637        magic: &mut Magic<'r>,
2638        stream_kind: StreamKind,
2639        buf_base_offset: Option<u64>,
2640        rule_base_offset: Option<u64>,
2641        haystack: &mut LazyCache<R>,
2642        db: &'r MagicDb,
2643        switch_endianness: bool,
2644        depth: usize,
2645    ) -> Result<(), Error> {
2646        self.entries.matches(
2647            self.source.as_deref(),
2648            magic,
2649            &mut MatchState::empty(),
2650            stream_kind,
2651            buf_base_offset,
2652            rule_base_offset,
2653            None,
2654            haystack,
2655            db,
2656            switch_endianness,
2657            depth,
2658        )
2659    }
2660
2661    /// Checks if the rule is for matching against text content
2662    ///
2663    /// # Returns
2664    ///
2665    /// * `bool` - True if the rule is for text files
2666    pub fn is_text(&self) -> bool {
2667        self.entries.entry.test.is_text()
2668            && self.entries.children.iter().all(|e| e.entry.test.is_text())
2669    }
2670
2671    /// Gets the rule's score used for ranking rules between them
2672    ///
2673    /// # Returns
2674    ///
2675    /// * `u64` - The rule's score
2676    #[inline(always)]
2677    pub fn score(&self) -> u64 {
2678        self.score
2679    }
2680
2681    /// Gets the rule's filename if any
2682    ///
2683    /// # Returns
2684    ///
2685    /// * `Option<&str>` - The rule's source if available
2686    #[inline(always)]
2687    pub fn source(&self) -> Option<&str> {
2688        self.source.as_deref()
2689    }
2690
2691    /// Gets the line number at which the rule is defined
2692    ///
2693    /// # Returns
2694    ///
2695    /// * `usize` - The rule's line number
2696    #[inline(always)]
2697    pub fn line(&self) -> usize {
2698        self.entries.entry.line
2699    }
2700
2701    /// Gets all the file extensions associated to the rule
2702    ///
2703    /// # Returns
2704    ///
2705    /// * `&HashSet<String>` - The set of all associated extensions
2706    #[inline(always)]
2707    pub fn extensions(&self) -> &HashSet<String> {
2708        &self.extensions
2709    }
2710}
2711
/// A rule other rules can run by name through a `use` test; such rules
/// are looked up in the `dependencies` maps.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct DependencyRule {
    /// name of the rule (presumably the key it is stored under -- confirm in the parser)
    name: String,
    /// the rule itself
    rule: MagicRule,
}
2717
/// A parsed source of magic rules
///
/// A [`MagicSource`] is consumed by [`MagicDb::load`], which moves its rules
/// and its dependencies into the database.
///
/// # Methods
///
/// * `open` - Opens a magic file from a path
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicSource {
    /// Rules found in the source
    rules: Vec<MagicRule>,
    /// Named rules found in the source, keyed by a `String`
    dependencies: HashMap<String, DependencyRule>,
}
2728
impl MagicSource {
    /// Opens and parses a magic file from a path
    ///
    /// The work is delegated to `FileMagicParser::parse_file`.
    ///
    /// # Arguments
    ///
    /// * `p` - The path to the magic file
    ///
    /// # Returns
    ///
    /// * `Result<Self, Error>` - The parsed magic file, or the error reported
    ///   by the parser
    pub fn open<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
        FileMagicParser::parse_file(p)
    }
}
2743
/// Newtype around the `u8` identifying a continuation level
///
/// The wrapped value is used as an index into the flags kept by `MatchState`.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct ContinuationLevel(u8);
2746
// FIXME: magic handles many more text encodings
/// Text encodings that can be attached to a text stream
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum TextEncoding {
    Ascii,
    Utf8,
    Unknown,
}

impl TextEncoding {
    /// Gets the label used for this encoding in detection messages
    ///
    /// # Returns
    ///
    /// * `&'static str` - `"ASCII"`, `"UTF-8"` or `"Unknown"`
    const fn as_magic_str(&self) -> &'static str {
        match *self {
            Self::Ascii => "ASCII",
            Self::Utf8 => "UTF-8",
            Self::Unknown => "Unknown",
        }
    }
}
2764
2765#[derive(Debug, PartialEq, Eq, Clone, Copy)]
2766enum StreamKind {
2767    Binary,
2768    Text(TextEncoding),
2769}
2770
2771impl StreamKind {
2772    const fn is_text(&self) -> bool {
2773        matches!(self, StreamKind::Text(_))
2774    }
2775}
2776
2777#[derive(Debug)]
2778struct MatchState {
2779    continuation_levels: [bool; 256],
2780}
2781
2782impl MatchState {
2783    #[inline(always)]
2784    fn empty() -> Self {
2785        MatchState {
2786            continuation_levels: [false; 256],
2787        }
2788    }
2789
2790    #[inline(always)]
2791    fn get_continuation_level(&mut self, level: &ContinuationLevel) -> bool {
2792        self.continuation_levels
2793            .get(level.0 as usize)
2794            .cloned()
2795            .unwrap_or_default()
2796    }
2797
2798    #[inline(always)]
2799    fn set_continuation_level(&mut self, level: ContinuationLevel) {
2800        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2801            *b = true
2802        }
2803    }
2804
2805    #[inline(always)]
2806    fn clear_continuation_level(&mut self, level: &ContinuationLevel) {
2807        if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2808            *b = false;
2809        }
2810    }
2811}
2812
/// Represents a file magic detection result
///
/// String data is stored as [`Cow`] so that a result can borrow it for `'m`;
/// [`Magic::into_owned`] turns every borrowed value into an owned one.
#[derive(Debug, Default)]
pub struct Magic<'m> {
    /// Stream classification recorded with the result, used by
    /// [`Magic::mime_type`] to pick a fallback MIME type
    stream_kind: Option<StreamKind>,
    /// Name of the source which produced the result
    source: Option<Cow<'m, str>>,
    /// Message fragments, assembled by [`Magic::message`]
    message: Vec<Cow<'m, str>>,
    /// MIME type; once set it is not overwritten by later matches
    mime_type: Option<Cow<'m, str>>,
    /// Apple creator code; once set it is not overwritten by later matches
    creator_code: Option<Cow<'m, str>>,
    /// Accumulated strength, exposed by [`Magic::strength`]
    strength: u64,
    /// Possible file extensions
    exts: HashSet<Cow<'m, str>>,
    /// True when the result comes from the default fallback detection
    is_default: bool,
}
2825
2826impl<'m> Magic<'m> {
    /// Records the source the result comes from, replacing any previous value
    ///
    /// # Arguments
    ///
    /// * `source` - The source name borrowed for `'m`, or `None` to clear it
    #[inline(always)]
    fn set_source(&mut self, source: Option<&'m str>) {
        self.source = source.map(Cow::Borrowed);
    }
2831
    /// Records the kind of stream the result was computed for, replacing any
    /// previous value
    ///
    /// # Arguments
    ///
    /// * `stream_kind` - The stream classification to store
    #[inline(always)]
    fn set_stream_kind(&mut self, stream_kind: StreamKind) {
        self.stream_kind = Some(stream_kind)
    }
2836
    /// Puts the result back into its empty state so that it can be reused
    ///
    /// The message list and the extension set are cleared in place rather
    /// than replaced by new collections.
    #[inline(always)]
    fn reset(&mut self) {
        self.stream_kind = None;
        self.source = None;
        self.message.clear();
        self.mime_type = None;
        self.creator_code = None;
        self.strength = 0;
        self.exts.clear();
        self.is_default = false;
    }
2848
2849    /// Converts borrowed data into owned data. This method involves
2850    /// data cloning, so you must use this method only if you need to
2851    /// extend the lifetime of a [`Magic`] struct.
2852    ///
2853    /// # Returns
2854    ///
2855    /// * `Magic<'owned>` - A new [`Magic`] with owned data
2856    #[inline]
2857    pub fn into_owned<'owned>(self) -> Magic<'owned> {
2858        Magic {
2859            stream_kind: self.stream_kind,
2860            source: self.source.map(|s| Cow::Owned(s.into_owned())),
2861            message: self
2862                .message
2863                .into_iter()
2864                .map(Cow::into_owned)
2865                .map(Cow::Owned)
2866                .collect(),
2867            mime_type: self.mime_type.map(|m| Cow::Owned(m.into_owned())),
2868            creator_code: self.creator_code.map(|m| Cow::Owned(m.into_owned())),
2869            strength: self.strength,
2870            exts: self
2871                .exts
2872                .into_iter()
2873                .map(|e| Cow::Owned(e.into_owned()))
2874                .collect(),
2875            is_default: self.is_default,
2876        }
2877    }
2878
2879    /// Gets the formatted message describing the file type
2880    ///
2881    /// # Returns
2882    ///
2883    /// * `String` - The formatted message
2884    #[inline(always)]
2885    pub fn message(&self) -> String {
2886        let mut out = String::new();
2887        for (i, m) in self.message.iter().enumerate() {
2888            if let Some(s) = m.strip_prefix(r#"\b"#) {
2889                out.push_str(s);
2890            } else {
2891                // don't put space on first string
2892                if i > 0 {
2893                    out.push(' ');
2894                }
2895                out.push_str(m);
2896            }
2897        }
2898        out
2899    }
2900
    /// Adds `value` to the strength of the result
    ///
    /// The addition saturates at `u64::MAX` instead of overflowing.
    ///
    /// # Arguments
    ///
    /// * `value` - The amount to add to the current strength
    #[inline(always)]
    fn update_strength(&mut self, value: u64) {
        self.strength = self.strength.saturating_add(value);
        debug!("updated strength = {:?}", self.strength)
    }
2906
2907    /// Gets the detected MIME type
2908    ///
2909    /// # Returns
2910    ///
2911    /// * `&str` - The MIME type or default based on stream kind
2912    #[inline(always)]
2913    pub fn mime_type(&self) -> &str {
2914        self.mime_type.as_deref().unwrap_or(match self.stream_kind {
2915            Some(StreamKind::Text(_)) => DEFAULT_TEXT_MIMETYPE,
2916            Some(StreamKind::Binary) | None => DEFAULT_BIN_MIMETYPE,
2917        })
2918    }
2919
2920    #[inline(always)]
2921    fn push_message<'a: 'm>(&mut self, msg: Cow<'a, str>) {
2922        if !msg.is_empty() {
2923            debug!("pushing message: msg={msg} len={}", msg.len());
2924            self.message.push(msg);
2925        }
2926    }
2927
2928    #[inline(always)]
2929    fn set_mime_type<'a: 'm>(&mut self, mime: Cow<'a, str>) {
2930        if self.mime_type.is_none() {
2931            debug!("insert mime: {:?}", mime);
2932            self.mime_type = Some(mime)
2933        }
2934    }
2935
2936    #[inline(always)]
2937    fn set_creator_code<'a: 'm>(&mut self, apple_ty: Cow<'a, str>) {
2938        if self.creator_code.is_none() {
2939            debug!("insert apple type: {apple_ty:?}");
2940            self.creator_code = Some(apple_ty)
2941        }
2942    }
2943
2944    #[inline(always)]
2945    fn insert_extensions<'a: 'm, I: Iterator<Item = &'a str>>(&mut self, exts: I) {
2946        if self.exts.is_empty() {
2947            self.exts.extend(exts.filter_map(|e| {
2948                if e.is_empty() {
2949                    None
2950                } else {
2951                    Some(Cow::Borrowed(e))
2952                }
2953            }));
2954        }
2955    }
2956
    /// Gets the confidence score of the detection. This
    /// value is used to sort [`Magic`] in [`MagicDb::magic_best`]
    /// and [`MagicDb::magic_all`], the strongest result coming first.
    ///
    /// # Returns
    ///
    /// * `u64` - The confidence score attributed to that [`Magic`]
    #[inline(always)]
    pub fn strength(&self) -> u64 {
        self.strength
    }
2968
    /// Gets the name of the source where the matching magic rule was defined
    ///
    /// # Returns
    ///
    /// * `Option<&str>` - The source if available, `None` otherwise
    #[inline(always)]
    pub fn source(&self) -> Option<&str> {
        self.source.as_deref()
    }
2978
    /// Gets the Apple creator code if available
    ///
    /// # Returns
    ///
    /// * `Option<&str>` - The creator code, or `None` when no rule provided one
    #[inline(always)]
    pub fn creator_code(&self) -> Option<&str> {
        self.creator_code.as_deref()
    }
2988
    /// Gets the possible file extensions for the detected [`Magic`]
    ///
    /// # Returns
    ///
    /// * `&HashSet<Cow<'m, str>>` - The set of possible extensions, which is
    ///   empty when none is known
    #[inline(always)]
    pub fn extensions(&self) -> &HashSet<Cow<'m, str>> {
        &self.exts
    }
2998
    /// Checks if this is a default fallback detection, i.e. a result
    /// produced when no other detection applied
    ///
    /// # Returns
    ///
    /// * `bool` - True if this is a default detection
    #[inline(always)]
    pub fn is_default(&self) -> bool {
        self.is_default
    }
3008}
3009
/// Represents a database of [`MagicRule`]
///
/// Rules are added with [`MagicDb::load`]; the database can be written with
/// [`MagicDb::serialize`] and read back with [`MagicDb::deserialize`].
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct MagicDb {
    /// Identifier given to the next rule loaded into the database
    rule_id: usize,
    /// All the rules of the database
    rules: Vec<MagicRule>,
    /// Named rules gathered from every loaded source
    dependencies: HashMap<String, DependencyRule>,
}
3017
#[inline(always)]
/// Returns `true` if the byte stream is likely text.
///
/// An empty stream or a stream holding a NUL byte is never text. Otherwise
/// tab, line feed, carriage return and the bytes `0x20..=0x7E` count as
/// printable, while every other byte (remaining control bytes, `0x7F` and
/// anything above) counts against the stream. The stream is considered text
/// when more than 85% of it is printable and less than 20% is not.
fn is_likely_text(bytes: &[u8]) -> bool {
    if bytes.is_empty() || bytes.contains(&0x00) {
        return false;
    }

    let printable = bytes
        .iter()
        .filter(|b| matches!(**b, 0x09 | 0x0A | 0x0D | 0x20..=0x7E))
        .count();
    // no NUL byte at this point, so the rest is everything not printable
    let other = bytes.len() - printable;

    let total = bytes.len() as f64;
    let printable_ratio = printable as f64 / total;
    let other_ratio = other as f64 / total;

    // Heuristic thresholds (adjust as needed):
    printable_ratio > 0.85 && other_ratio < 0.20
}
3044
3045#[inline(always)]
3046fn guess_stream_kind<S: AsRef<[u8]>>(stream: S) -> StreamKind {
3047    let Ok(s) = str::from_utf8(stream.as_ref()) else {
3048        if is_likely_text(stream.as_ref()) {
3049            return StreamKind::Text(TextEncoding::Unknown);
3050        } else {
3051            return StreamKind::Binary;
3052        }
3053    };
3054
3055    let count = s.chars().count();
3056    let mut is_ascii = true;
3057
3058    for c in s.chars().take(count.saturating_sub(1)) {
3059        is_ascii &= c.is_ascii()
3060    }
3061
3062    if is_ascii {
3063        StreamKind::Text(TextEncoding::Ascii)
3064    } else {
3065        StreamKind::Text(TextEncoding::Utf8)
3066    }
3067}
3068
3069impl MagicDb {
3070    fn open_reader<R: Read + Seek>(f: R) -> Result<LazyCache<R>, Error> {
3071        Ok(LazyCache::<R>::from_read_seek(f)
3072            .and_then(|lc| lc.with_hot_cache(2 * FILE_BYTES_MAX))?)
3073        .map(|lc| lc.with_warm_cache(100 << 20))
3074    }
3075
    /// Creates a new empty database
    ///
    /// This is the same as [`MagicDb::default`].
    ///
    /// # Returns
    ///
    /// * [`MagicDb`] - A new empty database
    pub fn new() -> Self {
        Self::default()
    }
3084
    /// Hands out the next rule identifier and advances the counter by one
    ///
    /// # Returns
    ///
    /// * `usize` - The identifier to give to the rule being loaded
    #[inline(always)]
    fn next_rule_id(&mut self) -> usize {
        let t = self.rule_id;
        self.rule_id += 1;
        t
    }
3091
3092    #[inline(always)]
3093    fn try_json<R: Read + Seek>(
3094        haystack: &mut LazyCache<R>,
3095        stream_kind: StreamKind,
3096        magic: &mut Magic,
3097    ) -> Result<bool, Error> {
3098        // cannot be json if content is binary
3099        if matches!(stream_kind, StreamKind::Binary) {
3100            return Ok(false);
3101        }
3102
3103        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?.trim_ascii();
3104
3105        let Some((start, end)) = find_json_boundaries(buf) else {
3106            return Ok(false);
3107        };
3108
3109        // if anything else than whitespace before start
3110        // this is not json
3111        for c in buf[0..start].iter() {
3112            if !c.is_ascii_whitespace() {
3113                return Ok(false);
3114            }
3115        }
3116
3117        let mut is_ndjson = false;
3118
3119        trace!("maybe a json document");
3120        let ok = serde_json::from_slice::<serde_json::Value>(&buf[start..=end]).is_ok();
3121        if !ok {
3122            return Ok(false);
3123        }
3124
3125        // we are sure it is json now we must look if we are ndjson
3126        if end + 1 < buf.len() {
3127            // after first json
3128            let buf = &buf[end + 1..];
3129            if let Some((second_start, second_end)) = find_json_boundaries(buf) {
3130                // there is a new line between the two json docs
3131                if memchr(b'\n', &buf[..second_start]).is_some() {
3132                    trace!("might be ndjson");
3133                    is_ndjson = serde_json::from_slice::<serde_json::Value>(
3134                        &buf[second_start..=second_end],
3135                    )
3136                    .is_ok();
3137                }
3138            }
3139        }
3140
3141        if is_ndjson {
3142            magic.push_message(Cow::Borrowed("New Line Delimited"));
3143            magic.set_mime_type(Cow::Borrowed("application/x-ndjson"));
3144            magic.insert_extensions(["ndjson", "jsonl"].into_iter());
3145        } else {
3146            magic.set_mime_type(Cow::Borrowed("application/json"));
3147            magic.insert_extensions(["json"].into_iter());
3148        }
3149
3150        magic.push_message(Cow::Borrowed("JSON text data"));
3151        magic.set_source(Some(HARDCODED_SOURCE));
3152        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3153        Ok(true)
3154    }
3155
3156    #[inline(always)]
3157    fn try_csv<R: Read + Seek>(
3158        haystack: &mut LazyCache<R>,
3159        stream_kind: StreamKind,
3160        magic: &mut Magic,
3161    ) -> Result<bool, Error> {
3162        // cannot be csv if content is binary
3163        let StreamKind::Text(enc) = stream_kind else {
3164            return Ok(false);
3165        };
3166
3167        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3168        let mut reader = csv::Reader::from_reader(io::Cursor::new(buf));
3169        let mut records = reader.records();
3170
3171        let Some(Ok(first)) = records.next() else {
3172            return Ok(false);
3173        };
3174
3175        // very not likely a CSV otherwise all programming
3176        // languages having ; line terminator would be
3177        // considered as CSV
3178        if first.len() <= 1 {
3179            return Ok(false);
3180        }
3181
3182        // we already parsed first line
3183        let mut n = 1;
3184        for i in records.take(9) {
3185            if let Ok(rec) = i {
3186                if first.len() != rec.len() {
3187                    return Ok(false);
3188                }
3189            } else {
3190                return Ok(false);
3191            }
3192            n += 1;
3193        }
3194
3195        // we need at least 10 lines
3196        if n != 10 {
3197            return Ok(false);
3198        }
3199
3200        magic.set_mime_type(Cow::Borrowed("text/csv"));
3201        magic.push_message(Cow::Borrowed("CSV"));
3202        magic.push_message(Cow::Borrowed(enc.as_magic_str()));
3203        magic.push_message(Cow::Borrowed("text"));
3204        magic.insert_extensions(["csv"].into_iter());
3205        magic.set_source(Some(HARDCODED_SOURCE));
3206        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3207        Ok(true)
3208    }
3209
3210    #[inline(always)]
3211    fn try_tar<R: Read + Seek>(
3212        haystack: &mut LazyCache<R>,
3213        stream_kind: StreamKind,
3214        magic: &mut Magic,
3215    ) -> Result<bool, Error> {
3216        // cannot be json if content is not binary
3217        if !matches!(stream_kind, StreamKind::Binary) {
3218            return Ok(false);
3219        }
3220
3221        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3222        let mut ar = Archive::new(io::Cursor::new(buf));
3223
3224        let Ok(mut entries) = ar.entries() else {
3225            return Ok(false);
3226        };
3227
3228        let Some(Ok(first)) = entries.next() else {
3229            return Ok(false);
3230        };
3231
3232        let header = first.header();
3233
3234        if header.as_ustar().is_some() {
3235            magic.push_message(Cow::Borrowed("POSIX tar archive"));
3236        } else if header.as_gnu().is_some() {
3237            magic.push_message(Cow::Borrowed("POSIX tar archive (GNU)"));
3238        } else {
3239            magic.push_message(Cow::Borrowed("tar archive"));
3240        }
3241
3242        magic.set_mime_type(Cow::Borrowed("application/x-tar"));
3243        magic.set_source(Some(HARDCODED_SOURCE));
3244        magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3245        magic.insert_extensions(["tar"].into_iter());
3246        Ok(true)
3247    }
3248
3249    #[inline(always)]
3250    fn try_hard_magic<R: Read + Seek>(
3251        haystack: &mut LazyCache<R>,
3252        stream_kind: StreamKind,
3253        magic: &mut Magic,
3254    ) -> Result<bool, Error> {
3255        Ok(Self::try_json(haystack, stream_kind, magic)?
3256            || Self::try_csv(haystack, stream_kind, magic)?
3257            || Self::try_tar(haystack, stream_kind, magic)?)
3258    }
3259
3260    #[inline(always)]
3261    fn magic_default<'m, R: Read + Seek>(
3262        haystack: &mut LazyCache<R>,
3263        stream_kind: StreamKind,
3264        magic: &mut Magic<'m>,
3265    ) -> Result<(), Error> {
3266        let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3267
3268        magic.set_source(Some(HARDCODED_SOURCE));
3269        magic.set_stream_kind(stream_kind);
3270        magic.is_default = true;
3271
3272        if buf.is_empty() {
3273            magic.push_message(Cow::Borrowed("empty"));
3274            magic.set_mime_type(Cow::Borrowed(DEFAULT_BIN_MIMETYPE));
3275            return Ok(());
3276        }
3277
3278        match stream_kind {
3279            StreamKind::Binary => {
3280                magic.push_message(Cow::Borrowed("data"));
3281            }
3282            StreamKind::Text(e) => {
3283                magic.push_message(Cow::Borrowed(e.as_magic_str()));
3284                magic.push_message(Cow::Borrowed("text"));
3285            }
3286        }
3287
3288        Ok(())
3289    }
3290
3291    /// Loads rules from a [`MagicSource`]
3292    ///
3293    /// # Arguments
3294    ///
3295    /// * `mf` - The [`MagicSource`] to load rules from
3296    ///
3297    /// # Returns
3298    ///
3299    /// * `Result<&mut Self, Error>` - Self for chaining or an error
3300    pub fn load(&mut self, mf: MagicSource) -> Result<&mut Self, Error> {
3301        for rule in mf.rules.into_iter() {
3302            let mut rule = rule;
3303            rule.set_id(self.next_rule_id());
3304
3305            self.rules.push(rule);
3306        }
3307
3308        self.dependencies.extend(mf.dependencies);
3309        self.prepare();
3310        Ok(self)
3311    }
3312
    /// Gets all rules in the database, in the order in which the database
    /// keeps them
    ///
    /// # Returns
    ///
    /// * `&[MagicRule]` - A slice of all rules
    pub fn rules(&self) -> &[MagicRule] {
        &self.rules
    }
3321
    /// Finds the first [`Magic`] matching the content, for a stream kind
    /// which is already known
    ///
    /// Detection goes through these steps and stops at the first one giving
    /// a result: the hardcoded detections, the rules associated to
    /// `extension` (if any), every remaining rule in database order, and
    /// finally the default detection.
    ///
    /// # Arguments
    ///
    /// * `haystack` - The content to inspect
    /// * `stream_kind` - The kind of stream of the content
    /// * `extension` - Optional file extension, lowercased before being
    ///   compared to the extensions of the rules
    ///
    /// # Returns
    ///
    /// * `Result<Magic<'_>, Error>` - The detection result, or the first
    ///   error met
    #[inline]
    fn magic_first_with_stream_kind<R: Read + Seek>(
        &self,
        haystack: &mut LazyCache<R>,
        stream_kind: StreamKind,
        extension: Option<&str>,
    ) -> Result<Magic<'_>, Error> {
        // re-using magic makes this function faster
        let mut magic = Magic::default();

        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
            return Ok(magic);
        }

        // marked[id] is true once the rule with that id has been run
        let mut marked = vec![false; self.rules.len()];

        // runs one rule: returns from the enclosing function as soon as the
        // rule produced a message, otherwise resets `magic` for the next rule
        macro_rules! do_magic {
            ($rule: expr) => {{
                $rule.magic_entrypoint(&mut magic, stream_kind, haystack, &self, false, 0)?;

                if !magic.message.is_empty() {
                    magic.set_stream_kind(stream_kind);
                    magic.set_source($rule.source.as_deref());
                    return Ok(magic);
                }

                magic.reset();
            }};
        }

        // first pass: only the rules associated to the given extension
        if let Some(ext) = extension.map(|e| e.to_lowercase()) {
            if !ext.is_empty() {
                for rule in self.rules.iter().filter(|r| r.extensions.contains(&ext)) {
                    do_magic!(rule);
                    if let Some(f) = marked.get_mut(rule.id) {
                        *f = true
                    }
                }
            }
        }

        // second pass: all the other rules
        for rule in self
            .rules
            .iter()
            // we don't run again rules run by extension
            .filter(|r| !*marked.get(r.id).unwrap_or(&false))
        {
            do_magic!(rule)
        }

        // no rule produced a message
        Self::magic_default(haystack, stream_kind, &mut magic)?;

        Ok(magic)
    }
3376
    /// Detects file [`Magic`] stopping at the first matching magic. Magic
    /// rules are evaluated from the best to the least relevant, so this method
    /// returns most of the time the best magic. For the rare cases where
    /// it doesn't or if the best result is always required, use [`MagicDb::magic_best`]
    ///
    /// The kind of stream (binary or text) is guessed from the first
    /// `FILE_BYTES_MAX` bytes of the input.
    ///
    /// # Arguments
    ///
    /// * `r` - A readable and seekable input
    /// * `extension` - Optional file extension to use for acceleration
    ///
    /// # Returns
    ///
    /// * `Result<Magic<'_>, Error>` - The detection result or an error
    pub fn magic_first<R: Read + Seek>(
        &self,
        r: &mut R,
        extension: Option<&str>,
    ) -> Result<Magic<'_>, Error> {
        let mut haystack = Self::open_reader(r)?;
        let stream_kind = guess_stream_kind(haystack.read_range(0..FILE_BYTES_MAX as u64)?);
        self.magic_first_with_stream_kind(&mut haystack, stream_kind, extension)
    }
3399
3400    #[inline(always)]
3401    fn magic_all_sort_with_stream_kind<R: Read + Seek>(
3402        &self,
3403        haystack: &mut LazyCache<R>,
3404        stream_kind: StreamKind,
3405    ) -> Result<Vec<Magic<'_>>, Error> {
3406        let mut out = Vec::new();
3407
3408        let mut magic = Magic::default();
3409
3410        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3411            out.push(magic);
3412            magic = Magic::default();
3413        }
3414
3415        for rule in self.rules.iter() {
3416            rule.magic_entrypoint(&mut magic, stream_kind, haystack, self, false, 0)?;
3417
3418            // it is possible we have a strength with no message
3419            if !magic.message.is_empty() {
3420                magic.set_stream_kind(stream_kind);
3421                magic.set_source(rule.source.as_deref());
3422                out.push(magic);
3423                magic = Magic::default();
3424            }
3425
3426            magic.reset();
3427        }
3428
3429        Self::magic_default(haystack, stream_kind, &mut magic)?;
3430        out.push(magic);
3431
3432        out.sort_by_key(|b| std::cmp::Reverse(b.strength()));
3433
3434        Ok(out)
3435    }
3436
    /// Detects all [`Magic`] matching a given content.
    ///
    /// The kind of stream (binary or text) is guessed from the first
    /// `FILE_BYTES_MAX` bytes of the input. The output always holds at least
    /// the default detection.
    ///
    /// # Arguments
    ///
    /// * `r` - A readable and seekable input
    ///
    /// # Returns
    ///
    /// * `Result<Vec<Magic<'_>>, Error>` - All detection results sorted by strength or an error
    pub fn magic_all<R: Read + Seek>(&self, r: &mut R) -> Result<Vec<Magic<'_>>, Error> {
        let mut haystack = Self::open_reader(r)?;
        let stream_kind = guess_stream_kind(haystack.read_range(0..FILE_BYTES_MAX as u64)?);
        self.magic_all_sort_with_stream_kind(&mut haystack, stream_kind)
    }
3451
3452    #[inline(always)]
3453    fn magic_best_with_stream_kind<R: Read + Seek>(
3454        &self,
3455        haystack: &mut LazyCache<R>,
3456        stream_kind: StreamKind,
3457    ) -> Result<Magic<'_>, Error> {
3458        let magics = self.magic_all_sort_with_stream_kind(haystack, stream_kind)?;
3459
3460        // magics is guaranteed to contain at least the default magic
3461        return Ok(magics
3462            .into_iter()
3463            .next()
3464            .expect("magics must at least contain default"));
3465    }
3466
    /// Detects the best [`Magic`] matching a given content.
    ///
    /// The kind of stream (binary or text) is guessed from the first
    /// `FILE_BYTES_MAX` bytes of the input. Every matching [`Magic`] is
    /// computed, the one with the highest strength is returned.
    ///
    /// # Arguments
    ///
    /// * `r` - A readable and seekable input
    ///
    /// # Returns
    ///
    /// * `Result<Magic<'_>, Error>` - The best detection result or an error
    pub fn magic_best<R: Read + Seek>(&self, r: &mut R) -> Result<Magic<'_>, Error> {
        let mut haystack = Self::open_reader(r)?;
        let stream_kind = guess_stream_kind(haystack.read_range(0..FILE_BYTES_MAX as u64)?);
        self.magic_best_with_stream_kind(&mut haystack, stream_kind)
    }
3481
    /// Serializes the database to a generic writer implementing [`io::Write`]
    ///
    /// The database is encoded with `bincode` (standard configuration) and
    /// compressed with gzip at the best compression level. The database is
    /// consumed by the call.
    ///
    /// # Arguments
    ///
    /// * `w` - The writer receiving the compressed database
    ///
    /// # Returns
    ///
    /// * `Result<(), Error>` - Nothing once the compressed stream is
    ///   finished, or an error if encoding or writing fails
    pub fn serialize<W: Write>(self, w: &mut W) -> Result<(), Error> {
        let mut encoder = GzEncoder::new(w, Compression::best());

        bincode::serde::encode_into_std_write(&self, &mut encoder, bincode::config::standard())?;
        // writes what is left of the gzip stream
        encoder.finish()?;
        Ok(())
    }
3494
3495    /// Deserializes the database from a generic reader implementing [`io::Read`]
3496    ///
3497    /// # Arguments
3498    ///
3499    /// * `r` - The reader to deserialize from
3500    ///
3501    /// # Returns
3502    ///
3503    /// * `Result<Self, Error>` - The deserialized database or an error
3504    pub fn deserialize<R: Read>(r: &mut R) -> Result<Self, Error> {
3505        let mut buf = vec![];
3506        let mut gz = GzDecoder::new(r);
3507        gz.read_to_end(&mut buf).map_err(|e| {
3508            bincode::error::DecodeError::OtherString(format!("failed to read: {e}"))
3509        })?;
3510        let (sdb, _): (MagicDb, usize) =
3511            bincode::serde::decode_from_slice(&buf, bincode::config::standard())?;
3512        Ok(sdb)
3513    }
3514
3515    #[inline(always)]
3516    fn prepare(&mut self) {
3517        self.rules
3518            .iter_mut()
3519            .for_each(|r| r.try_finalize(&self.dependencies));
3520
3521        // put text rules at the end
3522        self.rules.sort_by_key(|r| (r.is_text(), -(r.score as i64)));
3523    }
3524}
3525
3526#[cfg(test)]
3527mod tests {
3528    use std::io::Cursor;
3529
3530    use regex::bytes::Regex;
3531
3532    use crate::utils::unix_local_time_to_string;
3533
3534    use super::*;
3535
    /// builds a `LazyCache` reading from an in-memory literal, panicking if
    /// the cache cannot be created
    macro_rules! lazy_cache {
        ($l: literal) => {
            LazyCache::from_read_seek(Cursor::new($l)).unwrap()
        };
    }
3541
3542    fn first_magic(
3543        rule: &str,
3544        content: &[u8],
3545        stream_kind: StreamKind,
3546    ) -> Result<Magic<'static>, Error> {
3547        let mut md = MagicDb::new();
3548        md.load(
3549            FileMagicParser::parse_str(rule, None)
3550                .inspect_err(|e| eprintln!("{e}"))
3551                .unwrap(),
3552        )
3553        .unwrap();
3554        let mut reader = LazyCache::from_read_seek(Cursor::new(content)).unwrap();
3555        let v = md.magic_best_with_stream_kind(&mut reader, stream_kind)?;
3556        Ok(v.into_owned())
3557    }
3558
    /// helper macro to debug tests: installs a `tracing_subscriber`
    /// formatter at TRACE level, the outcome of `try_init` being ignored
    #[allow(unused_macros)]
    macro_rules! enable_trace {
        () => {
            tracing_subscriber::fmt()
                .with_max_level(tracing_subscriber::filter::LevelFilter::TRACE)
                .try_init();
        };
    }
3568
    /// asserts that a rule given as a literal can be parsed: the parsing
    /// error, if any, is printed on stderr before panicking
    macro_rules! parse_assert {
        ($rule:literal) => {
            FileMagicParser::parse_str($rule, None)
                .inspect_err(|e| eprintln!("{e}"))
                .unwrap();
        };
    }
3576
    /// runs `first_magic` on `$content` handled as a binary stream
    ///
    /// * with two arguments, evaluates to the unwrapped magic
    /// * with three arguments, asserts that the message of the magic
    ///   is equal to `$message`
    ///
    /// NOTE(review): the two-argument form only unwraps the result, it does
    /// not check `is_default()`. As a default magic is always available, it
    /// does not fail when the rule does not match.
    macro_rules! assert_magic_match_bin {
        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Binary).unwrap() }};
        ($rule: literal, $content:literal, $message:expr) => {{
            assert_eq!(
                first_magic($rule, $content, StreamKind::Binary)
                    .unwrap()
                    .message(),
                $message
            );
        }};
    }
3588
    /// runs `first_magic` on `$content` handled as a UTF-8 text stream
    ///
    /// * with two arguments, evaluates to the unwrapped magic
    /// * with three arguments, asserts that the message of the magic
    ///   is equal to `$message`
    ///
    /// NOTE(review): the two-argument form only unwraps the result, it does
    /// not check `is_default()`. As a default magic is always available, it
    /// does not fail when the rule does not match.
    macro_rules! assert_magic_match_text {
        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8)).unwrap() }};
        ($rule: literal, $content:literal, $message:expr) => {{
            assert_eq!(
                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
                    .unwrap()
                    .message(),
                $message
            );
        }};
    }
3600
    /// asserts that `$rule` does not match `$content` handled as a UTF-8
    /// text stream, i.e. that the best magic found is the default one
    macro_rules! assert_magic_not_match_text {
        ($rule: literal, $content:literal) => {{
            assert!(
                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
                    .unwrap()
                    .is_default()
            );
        }};
    }
3610
    /// asserts that `$rule` does not match `$content` handled as a binary
    /// stream, i.e. that the best magic found is the default one
    macro_rules! assert_magic_not_match_bin {
        ($rule: literal, $content:literal) => {{
            assert!(
                first_magic($rule, $content, StreamKind::Binary)
                    .unwrap()
                    .is_default()
            );
        }};
    }
3620
    /// regex tests on text and binary content, including continuation
    /// entries relative to the regex match
    #[test]
    fn test_regex() {
        // the string captured by the second regex is formatted in the message
        assert_magic_match_text!(
            r#"
0	regex/1024 \^#![[:space:]]*/usr/bin/env[[:space:]]+
!:mime	text/x-shellscript
>&0  regex/64 .*($|\\b) %s shell script text executable
    "#,
            br#"#!/usr/bin/env bash
        echo hello world"#,
            // the magic generated
            "bash shell script text executable"
        );

        // sanity check: with unicode disabled the regex matches raw bytes
        let re = Regex::new(r"(?-u)\x42\x82").unwrap();
        assert!(re.is_match(b"\x42\x82"));

        assert_magic_match_bin!(
            r#"0 regex \x42\x82 binary regex match"#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82"
        );

        // test regex continuation after match
        assert_magic_match_bin!(
            r#"
            0 regex \x42\x82
            >&0 string \xde\xad\xbe\xef it works
            "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        // same with the `s` flag: the continuation string includes the
        // bytes matched by the regex
        assert_magic_match_bin!(
            r#"
            0 regex/s \x42\x82
            >&0 string \x42\x82\xde\xad\xbe\xef it works
            "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        // ^ must match start of line when matching text
        assert_magic_match_text!(
            r#"
0	regex/1024 \^HelloWorld$ HelloWorld String"#,
            br#"
// this is a comment after an empty line
HelloWorld
            "#
        );
    }
3670
    /// string tests using the `w`, `C`, `c`, `f` and `W` modifiers, each
    /// with content which must match and, for most of them, content which
    /// must not
    #[test]
    fn test_string_with_mods() {
        // test with the `w` modifier: fewer spaces in the content than in the rule
        assert_magic_match_text!(
            r#"0	string/w	#!\ \ \ /usr/bin/env\ bash	BASH
        "#,
            b"#! /usr/bin/env bash i
        echo hello world"
        );

        // test uppercase insensitive
        assert_magic_match_text!(
            r#"0	string/C	HelloWorld	it works
        "#,
            b"helloworld"
        );

        assert_magic_not_match_text!(
            r#"0	string/C	HelloWorld	it works
        "#,
            b"hELLOwORLD"
        );

        // test lowercase insensitive
        assert_magic_match_text!(
            r#"0	string/c	HelloWorld	it works
        "#,
            b"HELLOWORLD"
        );

        assert_magic_not_match_text!(
            r#"0	string/c	HelloWorld	it works
        "#,
            b"helloworld"
        );

        // test full word match
        assert_magic_match_text!(
            r#"0	string/f	#!/usr/bin/env\ bash	BASH
        "#,
            b"#!/usr/bin/env bash"
        );

        assert_magic_not_match_text!(
            r#"0	string/f	#!/usr/bin/python PYTHON"#,
            b"#!/usr/bin/pythonic"
        );

        // testing whitespace compacting
        assert_magic_match_text!(
            r#"0	string/W	#!/usr/bin/env\ python  PYTHON"#,
            b"#!/usr/bin/env    python"
        );

        assert_magic_not_match_text!(
            r#"0	string/W	#!/usr/bin/env\ \ python  PYTHON"#,
            b"#!/usr/bin/env python"
        );
    }
3729
    /// Exercises the `search` type with string modifiers and checks where a
    /// relative child offset (`&0`) lands with and without the `/s` flag.
    #[test]
    fn test_search_with_mods() {
        // range of 1 combined with the `f`, `w` and `t` flags: the single
        // blank of the pattern matches the run of blanks in the content
        assert_magic_match_text!(
            r#"0	search/1/fwt	#!\ /usr/bin/luatex	LuaTex script text executable"#,
            b"#!          /usr/bin/luatex "
        );

        // test matching from the beginning
        // (with `/s` the child at `&0` sees the searched string again)
        assert_magic_match_text!(
            r#"
            0	search/s	/usr/bin/env
            >&0 string /usr/bin/env it works
            "#,
            b"#!/usr/bin/env    python"
        );

        // without `/s` the same child rule does not match
        assert_magic_not_match_text!(
            r#"
            0	search	/usr/bin/env
            >&0 string /usr/bin/env it works
            "#,
            b"#!/usr/bin/env    python"
        );
    }
3754
    /// Exercises `pstring` (length-prefixed strings) with the default one byte
    /// length and with the `H`, `h`, `L`, `l` and `J` modifiers.
    ///
    /// As laid out in the test contents: `H`/`h` use a two byte length
    /// (most/least significant byte first), `L`/`l` a four byte length
    /// (most/least significant byte first) and with `J` the stored length
    /// also counts the length field itself.
    #[test]
    fn test_pstring() {
        // default: a single length byte precedes the string
        assert_magic_match_bin!(r#"0 pstring Toast it works"#, b"\x05Toast");

        // the string read can be substituted into the message
        assert_magic_match_bin!(r#"0 pstring Toast %s"#, b"\x05Toast", "Toast");

        // a 7 bytes long `Toaster` is not `Toast`
        assert_magic_not_match_bin!(r#"0 pstring Toast Doesn't work"#, b"\x07Toaster");

        // testing with modifiers
        assert_magic_match_bin!(r#"0 pstring/H Toast it works"#, b"\x00\x05Toast");

        // 7 = 5 bytes of string + 2 bytes of length
        assert_magic_match_bin!(r#"0 pstring/HJ Toast it works"#, b"\x00\x07Toast");

        assert_magic_match_bin!(r#"0 pstring/HJ Toast %s"#, b"\x00\x07Toast", "Toast");

        assert_magic_match_bin!(r#"0 pstring/h Toast it works"#, b"\x05\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/hJ Toast it works"#, b"\x07\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/L Toast it works"#, b"\x00\x00\x00\x05Toast");

        // 9 = 5 bytes of string + 4 bytes of length
        assert_magic_match_bin!(r#"0 pstring/LJ Toast it works"#, b"\x00\x00\x00\x09Toast");

        assert_magic_match_bin!(r#"0 pstring/l Toast it works"#, b"\x05\x00\x00\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/lJ Toast it works"#, b"\x09\x00\x00\x00Toast");
    }
3782
3783    #[test]
3784    fn test_max_recursion() {
3785        let res = first_magic(
3786            r#"0	indirect x"#,
3787            b"#!          /usr/bin/luatex ",
3788            StreamKind::Binary,
3789        );
3790        assert!(res.is_err());
3791        let _ = res.inspect_err(|e| {
3792            assert!(matches!(
3793                e.unwrap_localized(),
3794                Error::MaximumRecursion(MAX_RECURSION)
3795            ))
3796        });
3797    }
3798
    /// Exercises the comparison operators accepted in front of a `string`
    /// test value: equality (default), `!`, `>` and `<`.
    #[test]
    fn test_string_ops() {
        // plain equality (with the `/b` flag)
        assert_magic_match_text!("0	string/b MZ MZ File", b"MZ\0");
        // `!`: content differs from the test value
        assert_magic_match_text!("0	string !MZ Not MZ File", b"AZ\0");
        // `>` against a NUL test value: matched by the non-empty content `A`
        assert_magic_match_text!("0	string >\0 Any String", b"A\0");
        // `>`: `Test 1` compares greater than `Test`
        assert_magic_match_text!("0	string >Test Any String", b"Test 1\0");
        // `<`: content starting with NUL compares lower than `Test`
        assert_magic_match_text!("0	string <Test Any String", b"\0");
        // ... and therefore it is not greater
        assert_magic_not_match_text!("0	string >Test Any String", b"\0");
    }
3808
    /// Exercises `lestring16`: `abcd` encoded with 16-bit units, least
    /// significant byte first.
    #[test]
    fn test_lestring16() {
        assert_magic_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        // `x` matches anything and `%s` prints the decoded string; the single
        // trailing byte is not part of the expected output
        assert_magic_match_bin!(
            "0 lestring16 x %s",
            b"\x61\x00\x62\x00\x63\x00\x64\x00\x00",
            "abcd"
        );
        // same characters with the opposite byte order: no match
        assert_magic_not_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        // same string located at a non-zero offset
        assert_magic_match_bin!(
            "4 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64\x00"
        );
    }
3829
    /// Exercises `bestring16`: `abcd` encoded with 16-bit units, most
    /// significant byte first.
    #[test]
    fn test_bestring16() {
        assert_magic_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        // `x` matches anything and `%s` prints the decoded string
        assert_magic_match_bin!(
            "0 bestring16 x %s",
            b"\x00\x61\x00\x62\x00\x63\x00\x64",
            "abcd"
        );
        // same characters with the opposite byte order: no match
        assert_magic_not_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        // same string located at a non-zero offset (four padding bytes first)
        assert_magic_match_bin!(
            "4 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64"
        );
    }
3850
    /// Negative offsets are counted from the end of the content.
    #[test]
    fn test_offset_from_end() {
        // -1 addresses the last byte
        assert_magic_match_bin!("-1 ubyte 0x42 last byte ok", b"\x00\x00\x42");
        // -2 addresses the byte before the last one
        assert_magic_match_bin!("-2 ubyte 0x41 last byte ok", b"\x00\x41\x00");
    }
3856
    /// Chained relative offsets: each `&0` child reads the byte that follows
    /// the one matched by its parent, so the three rules walk over the first
    /// three bytes of the content.
    #[test]
    fn test_relative_offset() {
        assert_magic_match_bin!(
            "
            0 ubyte 0x42
            >&0 ubyte 0x00
            >>&0 ubyte 0x41 third byte ok
            ",
            b"\x42\x00\x41\x00"
        );
    }
3868
    /// Indirect offsets: the offset to test is itself read from the content.
    #[test]
    fn test_indirect_offset() {
        // the value stored at offset 0 (4) is the offset of the tested byte
        assert_magic_match_bin!("(0.l) ubyte 0x42 it works", b"\x04\x00\x00\x00\x42");
        // adding fixed value to offset
        // (1 read from the content + 3 = 4)
        assert_magic_match_bin!("(0.l+3) ubyte 0x42 it works", b"\x01\x00\x00\x00\x42");
        // testing offset pair
        // (4 read at offset 0 + 4 read at offset 4 = 8)
        assert_magic_match_bin!(
            "(0.l+(4)) ubyte 0x42 it works",
            b"\x04\x00\x00\x00\x04\x00\x00\x00\x42"
        );
    }
3880
    /// A `use` line carrying its own message: that message comes first and is
    /// followed by the message attached to the named rule being used.
    #[test]
    fn test_use_with_message() {
        assert_magic_match_bin!(
            r#"
0 string MZ
>0 use mz first match

0 name mz then second match
>0 string MZ
"#,
            b"MZ\0",
            "first match then second match"
        );
    }
3895
    /// Exercises the arithmetic/bitwise transformation applied to a scalar
    /// before it is compared with the test value, and checks that a division
    /// or modulo by zero is rejected when the rule is parsed.
    #[test]
    fn test_scalar_transform() {
        // 0x00 + 1 == 0x01
        assert_magic_match_bin!("0 ubyte+1 0x1 add works", b"\x00");
        // 0xff - 1 == 0xfe
        assert_magic_match_bin!("0 ubyte-1 0xfe sub works", b"\xff");
        // 0x0a % 2 == 0
        assert_magic_match_bin!("0 ubyte%2 0 mod works", b"\x0a");
        // 0xff & 0x0f == 0x0f
        assert_magic_match_bin!("0 ubyte&0x0f 0x0f bitand works", b"\xff");
        // 0xf0 | 0x0f == 0xff
        assert_magic_match_bin!("0 ubyte|0x0f 0xff bitor works", b"\xf0");
        // 0xff ^ 0x0f == 0xf0
        assert_magic_match_bin!("0 ubyte^0x0f 0xf0 bitxor works", b"\xff");

        // a zero right-hand operand for `%` or `/` must make parsing fail
        FileMagicParser::parse_str("0 ubyte%0 mod by zero", None)
            .expect_err("expect div by zero error");
        FileMagicParser::parse_str("0 ubyte/0 div by zero", None)
            .expect_err("expect div by zero error");
    }
3910
    /// Exercises `belong` (four bytes, most significant byte first) with every
    /// comparison operator: equality, `<`, `>`, `&`, `^`, `~` and `x`.
    #[test]
    fn test_belong() {
        // Test that a file with a four-byte value at offset 0 that matches the given value in big-endian byte order
        assert_magic_match_bin!("0 belong 0x12345678 Big-endian long", b"\x12\x34\x56\x78");
        // Test that a file with a four-byte value at offset 0 that does not match the given value in big-endian byte order
        assert_magic_not_match_bin!("0 belong 0x12345678 Big-endian long", b"\x78\x56\x34\x12");
        // Test that a file with a four-byte value at a non-zero offset that matches the given value in big-endian byte order
        assert_magic_match_bin!(
            "4 belong 0x12345678 Big-endian long",
            b"\x00\x00\x00\x00\x12\x34\x56\x78"
        );
        // Test < operator
        assert_magic_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x77");
        assert_magic_not_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test > operator
        assert_magic_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x79");
        assert_magic_not_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test & operator
        // (every bit of the test value must be set in the value read:
        // 0x12345678 does not have all of the low 16 bits set)
        assert_magic_match_bin!("0 belong &0x5678 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong &0x0000FFFF Big-endian long", b"\x12\x34\x56\x78");

        // Test ^ operator (bitwise AND with complement)
        // (every bit of the test value must be clear in the value read)
        assert_magic_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x01\x56\x78");

        // Test ~ operator
        // (0xEDCBA987 is the bitwise complement of 0x12345678)
        assert_magic_match_bin!("0 belong ~0x12345678 Big-endian long", b"\xed\xcb\xa9\x87");
        assert_magic_not_match_bin!("0 belong ~0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Test x operator
        // (matches whatever the value is)
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x12\x34\x56\x78");
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x78\x56\x34\x12");
    }
3946
    /// Parsing of `search` rules: without modifier, and with a range plus the
    /// `s` flag given in either order.
    #[test]
    fn test_parse_search() {
        parse_assert!("0 search test");
        // range first, flag second
        parse_assert!("0 search/24/s test");
        // flag first, range second
        parse_assert!("0 search/s/24 test");
    }
3953
    /// Exercises `bedate`: a four byte timestamp stored most significant byte
    /// first (946684800 == 0x386D4380).
    #[test]
    fn test_bedate() {
        assert_magic_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        // a zero timestamp is not the expected date
        assert_magic_not_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        // at a non-zero offset, checking the text produced for `%s`
        assert_magic_match_bin!(
            "4 bedate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
    /// Exercises `beldate`: same byte layout as in `test_bedate`, but the
    /// expected text is built with `unix_local_time_to_string`.
    #[test]
    fn test_beldate() {
        assert_magic_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        // a zero timestamp is not the expected date
        assert_magic_not_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        // presumably computed at run time so that the assertion does not
        // depend on the time zone of the machine -- TODO confirm
        assert_magic_match_bin!(
            "4 beldate 946684800 {}",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            unix_local_time_to_string(946684800)
        );
    }
3987
    /// Exercises `beqdate`: an eight byte timestamp stored most significant
    /// byte first.
    #[test]
    fn test_beqdate() {
        assert_magic_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80"
        );

        // a zero timestamp is not the expected date
        assert_magic_not_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        // checking the text produced for `%s`
        assert_magic_match_bin!(
            "0 beqdate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
4006
    /// Exercises `medate`: a four byte middle-endian timestamp. As laid out
    /// here, 0x386D4380 is stored as `6D 38 80 43`: the 16-bit halves keep
    /// their order and the two bytes inside each half are swapped.
    #[test]
    fn test_medate() {
        assert_magic_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );

        // a zero timestamp is not the expected date
        assert_magic_not_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        // at a non-zero offset, checking the text produced for `%s`
        assert_magic_match_bin!(
            "4 medate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            "2000-01-01 00:00:00"
        );
    }
4025
    /// Exercises `meldate`: same byte layout as in `test_medate`, but the
    /// expected text is built with `unix_local_time_to_string`.
    #[test]
    fn test_meldate() {
        assert_magic_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );
        // a zero timestamp is not the expected date
        assert_magic_not_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        // at a non-zero offset, checking the text produced for `%s`
        assert_magic_match_bin!(
            "4 meldate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            unix_local_time_to_string(946684800)
        );
    }
4043
    /// Exercises `date`: a four byte timestamp.
    ///
    /// NOTE(review): the bytes are laid out least significant byte first, like
    /// in `test_leldate`; presumably `date` reads in host byte order, which
    /// would make this test assume a little-endian host -- confirm.
    #[test]
    fn test_date() {
        assert_magic_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        // a zero timestamp is not the expected date
        assert_magic_not_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        // at a non-zero offset, checking the text produced for `{}`
        assert_magic_match_bin!(
            "4 date 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            "2000-01-01 00:00:00"
        );
    }
4060
    /// Exercises `leldate`: a four byte timestamp stored least significant
    /// byte first; the expected text is built with
    /// `unix_local_time_to_string`.
    #[test]
    fn test_leldate() {
        assert_magic_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        // a zero timestamp is not the expected date
        assert_magic_not_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        // at a non-zero offset, checking the text produced for `{}`
        assert_magic_match_bin!(
            "4 leldate 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            unix_local_time_to_string(946684800)
        );
    }
4077
    /// Exercises `leqdate`: an eight byte timestamp stored least significant
    /// byte first (1577836800 == 0x5E0BE100).
    #[test]
    fn test_leqdate() {
        assert_magic_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
        );

        // a zero timestamp is not the expected date
        assert_magic_not_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        // at a non-zero offset, checking the text produced for `%s`
        assert_magic_match_bin!(
            "8 leqdate 1577836800 %s",
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
            "2020-01-01 00:00:00"
        );
    }
4095
    /// Exercises `leqldate`: same byte layout as in `test_leqdate`, but the
    /// expected text is built with `unix_local_time_to_string`.
    #[test]
    fn test_leqldate() {
        assert_magic_match_bin!(
            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
        );

        // a zero timestamp is not the expected date
        assert_magic_not_match_bin!(
            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        // at a non-zero offset, checking the text produced for `%s`
        assert_magic_match_bin!(
            "8 leqldate 1577836800 %s",
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
            unix_local_time_to_string(1577836800)
        );
    }
4113
    /// Exercises `melong` (four bytes, middle-endian) with every comparison
    /// operator.
    ///
    /// Byte layout, as established by the `=` case: 0x12345678 is stored as
    /// `34 12 78 56`, i.e. content bytes `b0 b1 b2 b3` decode to the value
    /// `b1 b0 b3 b2`.
    #[test]
    fn test_melong() {
        // Test = operator
        assert_magic_match_bin!(
            "0 melong =0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        );
        assert_magic_not_match_bin!(
            "0 melong =0x12345678 Middle-endian long",
            b"\x00\x00\x00\x00"
        );

        // Test < operator
        assert_magic_match_bin!(
            "0 melong <0x12345678 Middle-endian long",
            b"\x34\x12\x78\x55"
        ); // 0x12345578 in middle-endian
        assert_magic_not_match_bin!(
            "0 melong <0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // 0x12345678 in middle-endian

        // Test > operator
        assert_magic_match_bin!(
            "0 melong >0x12345678 Middle-endian long",
            b"\x34\x12\x78\x57"
        ); // 0x12345778 in middle-endian
        assert_magic_not_match_bin!(
            "0 melong >0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // 0x12345678 in middle-endian

        // Test & operator
        assert_magic_match_bin!("0 melong &0x5678 Middle-endian long", b"\xab\xcd\x78\x56"); // 0xCDAB5678 in middle-endian
        assert_magic_not_match_bin!(
            "0 melong &0x0000FFFF Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // 0x12345678 in middle-endian

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!(
            "0 melong ^0xFFFF0000 Middle-endian long",
            b"\x00\x00\x78\x56"
        ); // 0x00005678 in middle-endian
        assert_magic_not_match_bin!(
            "0 melong ^0xFFFF0000 Middle-endian long",
            b"\x00\x01\x78\x56"
        ); // 0x01005678 in middle-endian

        // Test ~ operator
        // (0xEDCBA987, the bitwise complement of 0x12345678, in middle-endian)
        assert_magic_match_bin!(
            "0 melong ~0x12345678 Middle-endian long",
            b"\xCB\xED\x87\xA9"
        );
        assert_magic_not_match_bin!(
            "0 melong ~0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        ); // The original value

        // Test x operator
        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x34\x12\x78\x56");
        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x00\x00\x00\x00");
    }
4177
    /// Exercises `uquad` (unsigned eight byte value) with every comparison
    /// operator.
    ///
    /// NOTE(review): all contents store 0x123456789ABCDEF0 least significant
    /// byte first; presumably `uquad` reads in host byte order, which would
    /// make this test assume a little-endian host -- confirm.
    #[test]
    fn test_uquad() {
        // Test = operator
        assert_magic_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        assert_magic_not_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        // Test < operator
        // (the last content byte, 0x11, is the most significant one)
        assert_magic_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x11"
        );
        assert_magic_not_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test > operator
        assert_magic_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x13"
        );
        assert_magic_not_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test & operator
        // (the low byte is 0xF0: all bits of 0xF0 are set, not all of 0xFF)
        assert_magic_match_bin!(
            "0 uquad &0xF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        assert_magic_not_match_bin!(
            "0 uquad &0xFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // Test ^ operator (bitwise AND with complement)
        assert_magic_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        ); // All bits clear
        assert_magic_not_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        ); // Some bits set

        // Test ~ operator
        // (0xEDCBA9876543210F is the bitwise complement of the test value)
        assert_magic_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\x0F\x21\x43\x65\x87\xA9\xCB\xED"
        );
        assert_magic_not_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        ); // The original value

        // Test x operator
        // (first checking the value rendered through the `{:#x}` format)
        assert_magic_match_bin!(
            "0 uquad x {:#x}",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12",
            "0x123456789abcdef0"
        );
        assert_magic_match_bin!(
            "0 uquad x Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }
4251
    /// Exercises `guid`: a sixteen byte identifier. In the matching cases the
    /// content bytes appear in the same order as the hexadecimal digits of
    /// the textual GUID.
    #[test]
    fn test_guid() {
        assert_magic_match_bin!(
            "0 guid EC959539-6786-2D4E-8FDB-98814CE76C1E It works",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E"
        );

        // unrelated bytes: no match
        assert_magic_not_match_bin!(
            "0 guid 399595EC-8667-4E2D-8FDB-98814CE76C1E It works",
            b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
        );

        // `x` matches anything and `%s` prints the textual GUID
        assert_magic_match_bin!(
            "0 guid x %s",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E",
            "EC959539-6786-2D4E-8FDB-98814CE76C1E"
        );
    }
4270
    /// Exercises `ubeqdate`: an eight byte timestamp stored most significant
    /// byte first (1633046400 == 0x61564F80).
    #[test]
    fn test_ubeqdate() {
        assert_magic_match_bin!(
            "0 ubeqdate 1633046400 It works",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80"
        );

        // `x` matches anything and `%s` prints the date
        assert_magic_match_bin!(
            "0 ubeqdate x %s",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80",
            "2021-10-01 00:00:00"
        );

        // a zero timestamp is not the expected date
        assert_magic_not_match_bin!(
            "0 ubeqdate 1633046400 It should not work",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }
4289
    /// Exercises `ldate`: a four byte timestamp whose expected text is built
    /// with `unix_local_time_to_string` (1640551520 == 0x61C8D460).
    ///
    /// NOTE(review): the content stores the value least significant byte
    /// first; presumably `ldate` reads in host byte order, which would make
    /// this test assume a little-endian host -- confirm.
    #[test]
    fn test_ldate() {
        assert_magic_match_bin!("0 ldate 1640551520 It works", b"\x60\xd4\xC8\x61");

        // a zero timestamp is not the expected date
        assert_magic_not_match_bin!("0 ldate 1633046400 It should not work", b"\x00\x00\x00\x00");

        // `x` matches anything and `%s` prints the date
        assert_magic_match_bin!(
            "0 ldate x %s",
            b"\x60\xd4\xC8\x61",
            unix_local_time_to_string(1640551520)
        );
    }
4302
    /// The value substituted into the message is the transformed scalar, not
    /// the raw byte: the content byte is 0x14 (20).
    #[test]
    fn test_scalar_with_transform() {
        // 20 / 10 == 2, compared with the test value and printed
        assert_magic_match_bin!("0 ubyte/10 2 {}", b"\x14", "2");
        // same with the match-anything operator
        assert_magic_match_bin!("0 ubyte/10 x {}", b"\x14", "2");
        // 20 % 10 == 0
        assert_magic_match_bin!("0 ubyte%10 x {}", b"\x14", "0");
    }
4309
    /// Same checks as `test_scalar_with_transform` for a float: the content
    /// is 0x41A00000 stored least significant byte first, i.e. 20.0 as an
    /// IEEE-754 single precision value.
    #[test]
    fn test_float_with_transform() {
        // 20.0 / 10 == 2, compared with the test value and printed as `2`
        assert_magic_match_bin!("0 lefloat/10 2 {}", b"\x00\x00\xa0\x41", "2");
        // same with the match-anything operator
        assert_magic_match_bin!("0 lefloat/10 x {}", b"\x00\x00\xa0\x41", "2");
        // 20.0 % 10 == 0, printed as `0`
        assert_magic_match_bin!("0 lefloat%10 x {}", b"\x00\x00\xa0\x41", "0");
    }
4316
    /// Pins the behaviour of `read_octal_u64` on inputs wrapped with
    /// `lazy_cache!`: a leading `0` is mandatory, reading stops at the first
    /// non-octal character and `None` is returned when there is no leading `0`.
    #[test]
    fn test_read_octal() {
        // Basic cases
        assert_eq!(read_octal_u64(&mut lazy_cache!("0")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("00")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("01")), Some(1));
        assert_eq!(read_octal_u64(&mut lazy_cache!("07")), Some(7));
        assert_eq!(read_octal_u64(&mut lazy_cache!("010")), Some(8));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0123")), Some(83));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0755")), Some(493));

        // With trailing non-octal characters
        // (the digits read so far are still returned)
        assert_eq!(read_octal_u64(&mut lazy_cache!("0ABC")), Some(0));
        assert_eq!(read_octal_u64(&mut lazy_cache!("01ABC")), Some(1));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0755ABC")), Some(493));
        assert_eq!(read_octal_u64(&mut lazy_cache!("0123ABC")), Some(83));

        // Invalid octal digits
        assert_eq!(read_octal_u64(&mut lazy_cache!("08")), Some(0)); // stops at '8'
        assert_eq!(read_octal_u64(&mut lazy_cache!("01238")), Some(83)); // stops at '8'

        // No leading '0'
        assert_eq!(read_octal_u64(&mut lazy_cache!("123")), None);
        assert_eq!(read_octal_u64(&mut lazy_cache!("755")), None);

        // Empty string
        assert_eq!(read_octal_u64(&mut lazy_cache!("")), None);

        // Only non-octal characters
        assert_eq!(read_octal_u64(&mut lazy_cache!("ABC")), None);
        assert_eq!(read_octal_u64(&mut lazy_cache!("8ABC")), None); // first char is not '0'

        // Longer valid octal (but within u64 range)
        // (0o1777777777 == 2^28 - 1)
        assert_eq!(
            read_octal_u64(&mut lazy_cache!("01777777777")),
            Some(268435455)
        );
    }
4355
    /// Regression test for offset computation across nested `use` rules: a
    /// top level rule matching at offset 1 calls `toasted` through the
    /// indirect offset `(5.b)`, which in turn calls `toasted_twice` where the
    /// indirect offset `(6.b)` is used. The three messages must chain into
    /// `Bread is Toasted twice`.
    #[test]
    fn test_offset_bug_1() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
# offset computation is relative to
# rule start
>(5.b)	use toasted

0 name toasted
>0	string twice Toasted
>>0  use toasted_twice 

0 name toasted_twice
>(6.b) string x %s
        ",
            // layout: 0x00, `TEST`, 0x06, `twice`, 0x00, 0x06
            b"\x00TEST\x06twice\x00\x06",
            "Bread is Toasted twice"
        );
    }
4378
4379    // this test implement the exact same logic as
4380    // test_offset_bug_1 except that the rule starts
4381    // matching from end. Surprisingly we need to
4382    // adjust indirect offsets so that it works in
4383    // libmagic/file
4384    #[test]
4385    fn test_offset_bug_2() {
4386        // this tests the exact behaviour
4387        // expected by libmagic/file
4388        assert_magic_match_bin!(
4389            r"
4390-12	string		TEST Bread is
4391>(4.b)	use toasted
4392
43930 name toasted
4394>0	string twice Toasted
4395>>0  use toasted_twice
4396
43970 name toasted_twice
4398>(6.b) string x %
4399        ",
4400            b"\x00TEST\x06twice\x00\x06",
4401            "Bread is Toasted twice"
4402        )
4403    }
4404
    /// Regression test for offsets combined with `indirect/r`: the top level
    /// rule matching at offset 1 triggers an indirection at the indirect
    /// offset `(5.b)`, where a second top level rule matches `twice` and
    /// calls `toasted_twice`. The three messages must chain into
    /// `Bread is Toasted twice`.
    #[test]
    fn test_offset_bug_3() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
>(5.b) indirect/r x

0	string twice Toasted
>0  use toasted_twice

0 name toasted_twice
>0 string x %s
        ",
            // layout: 0x00, `TEST`, 0x06, `twice`, 0x00, 0x08
            b"\x00TEST\x06twice\x00\x08",
            "Bread is Toasted twice"
        )
    }
4424
    /// Regression test chaining `indirect/r` and `use`, each reached through
    /// an indirect offset; every rule prints the string it reads with `%s`.
    /// The three messages must chain into `Bread is Toasted twice`.
    #[test]
    fn test_offset_bug_4() {
        // this tests the exact behaviour
        // expected by libmagic/file
        assert_magic_match_bin!(
            r"
1	string		Bread %s
>(6.b) indirect/r x

# this one uses a based offset
# computed at indirection
1	string is\ Toasted %s
>(11.b)  use toasted_twice

# this one is using a new base
# offset being previous base 
# offset + offset of use
0 name toasted_twice
>0 string x %s
            ",
            // layout: 0x00, `Bread`, 0x06, `is Toasted`, 0x0c, `twice`, 0x00
            b"\x00Bread\x06is Toasted\x0ctwice\x00",
            "Bread is Toasted twice"
        )
    }
4449
    /// Same rules as `test_offset_bug_3` up to the `use`, but the named rule
    /// then applies a relative offset (`&1`) after its own string match to
    /// reach the 0x08 byte closing the content. The messages must chain into
    /// `Bread is Toasted twice`.
    #[test]
    fn test_offset_bug_5() {
        assert_magic_match_bin!(
            r"
1	string		TEST Bread is
>(5.b) indirect/r x

0	string twice Toasted
>0  use toasted_twice

0 name toasted_twice
>0 string twice
>>&1 byte 0x08 twice
            ",
            // layout: 0x00, `TEST`, 0x06, `twice`, 0x00, 0x08
            b"\x00TEST\x06twice\x00\x08",
            "Bread is Toasted twice"
        )
    }
4468}