1#![forbid(unsafe_code)]
2#![deny(unused_imports)]
3#![deny(missing_docs)]
4use dyf::{DynDisplay, FormatString, dformat};
146use flagset::{FlagSet, flags};
147use flate2::{Compression, read::GzDecoder, write::GzEncoder};
148use lazy_cache::LazyCache;
149use memchr::memchr;
150use pest::{Span, error::ErrorVariant};
151use regex::bytes::{self};
152use serde::{Deserialize, Serialize};
153use std::{
154 borrow::Cow,
155 cmp::max,
156 collections::{HashMap, HashSet},
157 fmt::{self, Debug, Display},
158 io::{self, Read, Seek, SeekFrom, Write},
159 ops::{Add, BitAnd, BitOr, BitXor, Div, Mul, Rem, Sub},
160 path::Path,
161};
162use tar::Archive;
163use thiserror::Error;
164use tracing::{Level, debug, enabled, trace};
165
166use crate::{
167 numeric::{Float, FloatDataType, Scalar, ScalarDataType},
168 parser::{FileMagicParser, Rule},
169 utils::{decode_id3, find_json_boundaries, run_utf8_validation},
170};
171
172mod numeric;
173mod parser;
174mod utils;
175
// Strength constant for hardcoded magic.
// NOTE(review): its consumer is outside this section — confirm how it is applied.
const HARDCODED_MAGIC_STRENGTH: u64 = 2048;
// Source label, presumably reported for hardcoded rules — TODO confirm.
const HARDCODED_SOURCE: &str = "hardcoded";
// Recursion bound, presumably paired with `Error::MaximumRecursion` — TODO confirm.
const MAX_RECURSION: usize = 50;
// Number of bytes read for a regex test that declares no explicit length.
const FILE_REGEX_MAX: usize = 8192;

/// Upper bound (7 MiB) on the number of bytes read from the input when a
/// test needs an open-ended buffer (e.g. search tests).
pub const FILE_BYTES_MAX: usize = 7 * 1024 * 1024;
/// Default MIME type for binary content: `application/octet-stream`.
pub const DEFAULT_BIN_MIMETYPE: &str = "application/octet-stream";
/// Default MIME type for text content: `text/plain`.
pub const DEFAULT_TEXT_MIMETYPE: &str = "text/plain";

// strftime-style pattern: `year-month-day hour:minute:second`.
pub(crate) const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
195
/// Panics with the given `panic!`-style arguments, but only when the crate is
/// built with debug assertions; expands to a no-op otherwise.
macro_rules! debug_panic {
    ($($msg:tt)*) => {{
        if cfg!(debug_assertions) {
            panic!($($msg)*);
        }
    }};
}
203
/// Reads exactly `size_of::<$ty>()` bytes from `$r` and evaluates to them as a
/// byte array. Propagates the read error with `?`, so it must be expanded in
/// a function returning a compatible `Result`.
macro_rules! read {
    ($r: expr, $ty: ty) => {{
        let mut raw = [0u8; std::mem::size_of::<$ty>()];
        $r.read_exact(&mut raw)?;
        raw
    }};
}

/// Reads a `$ty` from `$r`, decoding its bytes as little-endian.
macro_rules! read_le {
    ($r:expr, $ty: ty ) => {{
        let raw = read!($r, $ty);
        <$ty>::from_le_bytes(raw)
    }};
}

/// Reads a `$ty` from `$r`, decoding its bytes as big-endian.
macro_rules! read_be {
    ($r:expr, $ty: ty ) => {{
        let raw = read!($r, $ty);
        <$ty>::from_be_bytes(raw)
    }};
}

/// Reads a 32-bit value made of two consecutive little-endian `u16` halves;
/// the first half read becomes the upper 16 bits of the resulting `i32`.
macro_rules! read_me {
    ($r: expr) => {{
        let high = read_le!($r, u16) as i32;
        let low = read_le!($r, u16) as i32;
        (high << 16) | low
    }};
}
223
224#[inline(always)]
225fn read_octal_u64<R: Read + Seek>(haystack: &mut LazyCache<R>) -> Option<u64> {
226 let s = haystack
227 .read_while_or_limit(|b| matches!(b, b'0'..=b'7'), 22)
228 .map(|buf| str::from_utf8(buf))
229 .ok()?
230 .ok()?;
231
232 if !s.starts_with("0") {
233 return None;
234 }
235
236 u64::from_str_radix(s, 8).ok()
237}
238
/// Error type returned by the fallible operations of this crate.
#[derive(Debug, Error)]
pub enum Error {
    /// Free-form error message.
    #[error("{0}")]
    Msg(String),

    /// An error tagged with a source name and a line number.
    // NOTE(review): same payload as `Localized`; the distinction is made by callers
    // outside this section — confirm.
    #[error("source={0} line={1} error={2}")]
    Verify(String, usize, Box<Error>),

    /// An error tagged with the source name and line number where it occurred.
    #[error("source={0} line={1} error={2}")]
    Localized(String, usize, Box<Error>),

    /// A rule referenced by name could not be found.
    #[error("missing rule: {0}")]
    MissingRule(String),

    /// The maximum recursion depth (carried as payload) was reached.
    #[error("maximum recursion reached: {0}")]
    MaximumRecursion(usize),

    /// Wrapper around an I/O error.
    #[error("io: {0}")]
    Io(#[from] io::Error),

    /// Wrapper around a parser (pest) error.
    #[error("parser error: {0}")]
    Parse(#[from] Box<pest::error::Error<Rule>>),

    /// Wrapper around a dynamic formatting error.
    #[error("formatting: {0}")]
    Format(#[from] dyf::Error),

    /// Wrapper around a regex compilation error.
    #[error("regex: {0}")]
    Regex(#[from] regex::Error),

    /// Wrapper around a bincode encoding error.
    #[error("{0}")]
    Serialize(#[from] bincode::error::EncodeError),

    /// Wrapper around a bincode decoding error.
    #[error("{0}")]
    Deserialize(#[from] bincode::error::DecodeError),
}
286
287impl Error {
288 #[inline]
289 fn parser<S: ToString>(msg: S, span: Span<'_>) -> Self {
290 Self::Parse(Box::new(pest::error::Error::new_from_span(
291 ErrorVariant::CustomError {
292 message: msg.to_string(),
293 },
294 span,
295 )))
296 }
297
298 fn msg<M: AsRef<str>>(msg: M) -> Self {
299 Self::Msg(msg.as_ref().into())
300 }
301
302 fn localized<S: AsRef<str>>(source: S, line: usize, err: Error) -> Self {
303 Self::Localized(source.as_ref().into(), line, err.into())
304 }
305
306 pub fn unwrap_localized(&self) -> &Self {
308 match self {
309 Self::Localized(_, _, e) => e,
310 _ => self,
311 }
312 }
313}
314
// Message attached to a rule: either plain text or a format string.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Message {
    // Literal text, emitted as is.
    String(String),
    // Text containing a format directive.
    Format {
        // Conversion specifier of the original printf-style directive; compared
        // against "c" in `format_with` to render byte values as characters.
        printf_spec: String,
        // Format string used to render the matched value.
        fs: FormatString,
    },
}
323
324impl Display for Message {
325 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
326 match self {
327 Self::String(s) => write!(f, "{s}"),
328 Self::Format { printf_spec: _, fs } => write!(f, "{}", fs.to_string_lossy()),
329 }
330 }
331}
332
333impl Message {
334 fn to_string_lossy(&self) -> Cow<'_, str> {
335 match self {
336 Message::String(s) => Cow::Borrowed(s),
337 Message::Format { printf_spec: _, fs } => fs.to_string_lossy(),
338 }
339 }
340
341 #[inline(always)]
342 fn format_with(&self, mr: Option<&MatchRes>) -> Result<Cow<'_, str>, Error> {
343 match self {
344 Self::String(s) => Ok(Cow::Borrowed(s.as_str())),
345 Self::Format {
346 printf_spec: c_spec,
347 fs,
348 } => {
349 if let Some(mr) = mr {
350 match mr {
351 MatchRes::Float(_, _) | MatchRes::Bytes(_, _, _, _) => {
352 Ok(Cow::Owned(dformat!(fs, mr)?))
353 }
354 MatchRes::Scalar(_, scalar) => {
355 if c_spec.as_str() == "c" {
357 match scalar {
358 Scalar::byte(b) => {
359 let b = (*b as u8) as char;
360 Ok(Cow::Owned(dformat!(fs, b)?))
361 }
362 Scalar::ubyte(b) => {
363 let b = *b as char;
364 Ok(Cow::Owned(dformat!(fs, b)?))
365 }
366 _ => Ok(Cow::Owned(dformat!(fs, mr)?)),
367 }
368 } else {
369 Ok(Cow::Owned(dformat!(fs, mr)?))
370 }
371 }
372 }
373 } else {
374 Ok(fs.to_string_lossy())
375 }
376 }
377 }
378 }
379}
380
381impl ScalarDataType {
382 #[inline(always)]
383 fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Scalar, Error> {
384 macro_rules! _read_le {
385 ($ty: ty) => {{
386 if switch_endianness {
387 <$ty>::from_be_bytes(read!(from, $ty))
388 } else {
389 <$ty>::from_le_bytes(read!(from, $ty))
390 }
391 }};
392 }
393
394 macro_rules! _read_be {
395 ($ty: ty) => {{
396 if switch_endianness {
397 <$ty>::from_le_bytes(read!(from, $ty))
398 } else {
399 <$ty>::from_be_bytes(read!(from, $ty))
400 }
401 }};
402 }
403
404 macro_rules! _read_ne {
405 ($ty: ty) => {{
406 if cfg!(target_endian = "big") {
407 _read_be!($ty)
408 } else {
409 _read_le!($ty)
410 }
411 }};
412 }
413
414 macro_rules! _read_me {
415 () => {
416 ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
417 };
418 }
419
420 Ok(match self {
421 Self::byte => Scalar::byte(read!(from, u8)[0] as i8),
423 Self::short => Scalar::short(_read_ne!(i16)),
424 Self::long => Scalar::long(_read_ne!(i32)),
425 Self::date => Scalar::date(_read_ne!(i32)),
426 Self::ldate => Scalar::ldate(_read_ne!(i32)),
427 Self::qwdate => Scalar::qwdate(_read_ne!(i64)),
428 Self::leshort => Scalar::leshort(_read_le!(i16)),
429 Self::lelong => Scalar::lelong(_read_le!(i32)),
430 Self::lequad => Scalar::lequad(_read_le!(i64)),
431 Self::bequad => Scalar::bequad(_read_be!(i64)),
432 Self::belong => Scalar::belong(_read_be!(i32)),
433 Self::bedate => Scalar::bedate(_read_be!(i32)),
434 Self::beldate => Scalar::beldate(_read_be!(i32)),
435 Self::beqdate => Scalar::beqdate(_read_be!(i64)),
436 Self::ubyte => Scalar::ubyte(read!(from, u8)[0]),
438 Self::ushort => Scalar::ushort(_read_ne!(u16)),
439 Self::uleshort => Scalar::uleshort(_read_le!(u16)),
440 Self::ulelong => Scalar::ulelong(_read_le!(u32)),
441 Self::uledate => Scalar::uledate(_read_le!(u32)),
442 Self::ulequad => Scalar::ulequad(_read_le!(u64)),
443 Self::offset => Scalar::offset(from.stream_position()?),
444 Self::ubequad => Scalar::ubequad(_read_be!(u64)),
445 Self::medate => Scalar::medate(_read_me!()),
446 Self::meldate => Scalar::meldate(_read_me!()),
447 Self::melong => Scalar::melong(_read_me!()),
448 Self::beshort => Scalar::beshort(_read_be!(i16)),
449 Self::quad => Scalar::quad(_read_ne!(i64)),
450 Self::uquad => Scalar::uquad(_read_ne!(u64)),
451 Self::ledate => Scalar::ledate(_read_le!(i32)),
452 Self::leldate => Scalar::leldate(_read_le!(i32)),
453 Self::leqdate => Scalar::leqdate(_read_le!(i64)),
454 Self::leqldate => Scalar::leqldate(_read_le!(i64)),
455 Self::leqwdate => Scalar::leqwdate(_read_le!(i64)),
456 Self::ubelong => Scalar::ubelong(_read_be!(u32)),
457 Self::ulong => Scalar::ulong(_read_ne!(u32)),
458 Self::ubeshort => Scalar::ubeshort(_read_be!(u16)),
459 Self::ubeqdate => Scalar::ubeqdate(_read_be!(u64)),
460 Self::lemsdosdate => Scalar::lemsdosdate(_read_le!(u16)),
461 Self::lemsdostime => Scalar::lemsdostime(_read_le!(u16)),
462 Self::guid => Scalar::guid(u128::from_be_bytes(read!(from, u128))),
463 })
464 }
465}
466
467impl FloatDataType {
468 #[inline(always)]
469 fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Float, Error> {
470 macro_rules! _read_le {
471 ($ty: ty) => {{
472 if switch_endianness {
473 <$ty>::from_be_bytes(read!(from, $ty))
474 } else {
475 <$ty>::from_le_bytes(read!(from, $ty))
476 }
477 }};
478 }
479
480 macro_rules! _read_be {
481 ($ty: ty) => {{
482 if switch_endianness {
483 <$ty>::from_le_bytes(read!(from, $ty))
484 } else {
485 <$ty>::from_be_bytes(read!(from, $ty))
486 }
487 }};
488 }
489
490 macro_rules! _read_ne {
491 ($ty: ty) => {{
492 if cfg!(target_endian = "big") {
493 _read_be!($ty)
494 } else {
495 _read_le!($ty)
496 }
497 }};
498 }
499
500 macro_rules! _read_me {
501 () => {
502 ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
503 };
504 }
505
506 Ok(match self {
507 Self::lefloat => Float::lefloat(_read_le!(f32)),
508 Self::befloat => Float::befloat(_read_le!(f32)),
509 Self::ledouble => Float::ledouble(_read_le!(f64)),
510 Self::bedouble => Float::bedouble(_read_be!(f64)),
511 })
512 }
513}
514
// Arithmetic or bitwise operation applied to a value read from the input
// before it is compared (see `ScalarTransform` / `FloatTransform`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Op {
    // Multiplication, displayed as `*`.
    Mul,
    // Addition, displayed as `+`.
    Add,
    // Subtraction, displayed as `-`.
    Sub,
    // Division, displayed as `/`.
    Div,
    // Remainder, displayed as `%`.
    Mod,
    // Bitwise AND, displayed as `&`.
    And,
    // Bitwise XOR, displayed as `^`.
    Xor,
    // Bitwise OR, displayed as `|`.
    Or,
}
526
527impl Display for Op {
528 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
529 match self {
530 Op::Mul => write!(f, "*"),
531 Op::Add => write!(f, "+"),
532 Op::Sub => write!(f, "-"),
533 Op::Div => write!(f, "/"),
534 Op::Mod => write!(f, "%"),
535 Op::And => write!(f, "&"),
536 Op::Or => write!(f, "|"),
537 Op::Xor => write!(f, "^"),
538 }
539 }
540}
541
// Comparison operator of a test. The semantics noted below are those applied
// to scalar tests in `Test::match_value`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum CmpOp {
    // Read value equals the test value.
    Eq,
    // Read value is lower than the test value.
    Lt,
    // Read value is greater than the test value.
    Gt,
    // All bits of the test value are set in the read value.
    BitAnd,
    // Read value differs from the test value.
    Neq,
    // No bit of the test value is set in the read value.
    Xor,
    // Read value equals the bitwise negation of the test value.
    Not,
}
552
553impl CmpOp {
554 #[inline(always)]
555 fn is_neq(&self) -> bool {
556 matches!(self, Self::Neq)
557 }
558}
559
// Operation applied to a scalar read from the input before it is compared.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTransform {
    // Operator to apply.
    op: Op,
    // Right-hand operand of the operation.
    num: Scalar,
}
565
566impl ScalarTransform {
567 fn apply(&self, s: Scalar) -> Option<Scalar> {
568 match self.op {
569 Op::Add => s.checked_add(self.num),
570 Op::Sub => s.checked_sub(self.num),
571 Op::Mul => s.checked_mul(self.num),
572 Op::Div => s.checked_div(self.num),
573 Op::Mod => s.checked_rem(self.num),
574 Op::And => Some(s.bitand(self.num)),
575 Op::Xor => Some(s.bitxor(self.num)),
576 Op::Or => Some(s.bitor(self.num)),
577 }
578 }
579}
580
// Operation applied to a float read from the input before it is compared.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTransform {
    // Operator to apply; bitwise operators are not supported for floats.
    op: Op,
    // Right-hand operand of the operation.
    num: Float,
}
586
587impl FloatTransform {
588 fn apply(&self, s: Float) -> Float {
589 match self.op {
590 Op::Add => s.add(self.num),
591 Op::Sub => s.sub(self.num),
592 Op::Mul => s.mul(self.num),
593 Op::Div => s.div(self.num),
595 Op::Mod => s.rem(self.num),
597 Op::And | Op::Xor | Op::Or => {
599 debug_panic!("unsupported operation");
600 s
601 }
602 }
603 }
604}
605
// Value a test compares against.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum TestValue<T> {
    // A concrete value to compare with.
    Value(T),
    // Wildcard: whatever was read is accepted.
    Any,
}
611
612impl<T> TestValue<T> {
613 #[inline(always)]
614 fn as_ref(&self) -> TestValue<&T> {
615 match self {
616 Self::Value(v) => TestValue::Value(v),
617 Self::Any => TestValue::Any,
618 }
619 }
620}
621
// Modifier flags of regex tests (also carried by search tests).
flags! {
    enum ReMod: u8{
        // NOTE(review): presumably case-insensitive matching — not consumed in this section.
        CaseInsensitive,
        // NOTE(review): presumably controls how the offset is updated after a match — confirm.
        StartOffsetUpdate,
        // The test length counts lines rather than bytes (80 bytes are read per line).
        LineLimit,
        // Marks the test as binary.
        ForceBin,
        // Marks the test as text.
        ForceText,
        // NOTE(review): presumably trims the matched bytes — not consumed in this section.
        TrimMatch,
    }
}
632
633fn serialize_regex<S>(re: &bytes::Regex, serializer: S) -> Result<S::Ok, S::Error>
634where
635 S: serde::Serializer,
636{
637 re.as_str().serialize(serializer)
638}
639
640fn deserialize_regex<'de, D>(deserializer: D) -> Result<bytes::Regex, D::Error>
641where
642 D: serde::Deserializer<'de>,
643{
644 let wrapper = String::deserialize(deserializer)?;
645 bytes::Regex::new(&wrapper).map_err(serde::de::Error::custom)
646}
647
// Test matching a regular expression against the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct RegexTest {
    // Compiled regex; (de)serialized through its pattern string.
    #[serde(
        serialize_with = "serialize_regex",
        deserialize_with = "deserialize_regex"
    )]
    re: bytes::Regex,
    // Optional limit: a byte count, or a line count with `ReMod::LineLimit`.
    length: Option<usize>,
    // Regex modifiers.
    mods: FlagSet<ReMod>,
    // String modifiers.
    str_mods: FlagSet<StringMod>,
    // Used to compute the strength of the test.
    // NOTE(review): presumably the pattern length without regex metacharacters — confirm.
    non_magic_len: usize,
    // Marks the test as binary (see `is_binary`).
    binary: bool,
    // Comparison operator; `Neq` turns an absence of match into a match.
    cmp_op: CmpOp,
}
662
663impl RegexTest {
664 #[inline(always)]
665 fn is_binary(&self) -> bool {
666 self.binary
667 || self.mods.contains(ReMod::ForceBin)
668 || self.str_mods.contains(StringMod::ForceBin)
669 }
670
671 #[inline(always)]
672 fn is_text(&self) -> bool {
673 self.mods.contains(ReMod::ForceText) || self.str_mods.contains(StringMod::ForceText)
674 }
675
676 fn match_buf<'buf>(
677 &self,
678 off_buf: u64, stream_kind: StreamKind,
680 buf: &'buf [u8],
681 ) -> Option<MatchRes<'buf>> {
682 let mr = match stream_kind {
683 StreamKind::Text(_) => {
684 let mut off_txt = off_buf;
685
686 let mut line_limit = self.length.unwrap_or(usize::MAX);
687
688 for line in buf.split(|c| c == &b'\n') {
689 if line_limit == 0 {
693 break;
694 }
695
696 if let Some(re_match) = self.re.find(line) {
697 let start_offset = off_txt + re_match.start() as u64;
699
700 let stop_offset = if re_match.end() == line.len() {
702 Some(start_offset + re_match.as_bytes().len() as u64 + 1)
703 } else {
704 None
705 };
706
707 return Some(MatchRes::Bytes(
708 start_offset,
709 stop_offset,
710 re_match.as_bytes(),
711 Encoding::Utf8,
712 ));
713 }
714
715 off_txt += line.len() as u64;
716 off_txt += 1;
718 line_limit = line_limit.saturating_sub(1)
719 }
720 None
721 }
722
723 StreamKind::Binary => {
724 self.re.find(buf).map(|re_match| {
725 MatchRes::Bytes(
726 off_buf + re_match.start() as u64,
728 None,
729 re_match.as_bytes(),
730 Encoding::Utf8,
731 )
732 })
733 }
734 };
735
736 if self.cmp_op.is_neq() && mr.is_none() {
738 return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
739 }
740
741 mr
742 }
743}
744
745impl From<RegexTest> for Test {
746 fn from(value: RegexTest) -> Self {
747 Self::Regex(value)
748 }
749}
750
// Modifier flags of string tests (also carried by search and regex tests).
flags! {
    enum StringMod: u8{
        // Marks the test as binary.
        ForceBin,
        // Uppercase bytes of the pattern also match their lowercase form in the input.
        UpperInsensitive,
        // Lowercase bytes of the pattern also match their uppercase form in the input.
        LowerInsensitive,
        // The match must be followed by ASCII whitespace or the end of the buffer.
        FullWordMatch,
        // ASCII whitespace is trimmed from both ends of the reported bytes.
        Trim,
        // Marks the test as text.
        ForceText,
        // A run of spaces in the pattern needs at least as many spaces in the input.
        CompactWhitespace,
        // Spaces are optional, in the pattern as well as in the input.
        OptBlank,
    }
}
763
// Test comparing bytes of the input with a string.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StringTest {
    // Bytes to compare with, or `Any` to accept whatever is read.
    test_val: TestValue<Vec<u8>>,
    // Comparison operator.
    cmp_op: CmpOp,
    // Optional number of bytes to read, overriding the default read size.
    length: Option<usize>,
    // String modifiers.
    mods: FlagSet<StringMod>,
    // Marks the test as binary (see `is_binary`).
    binary: bool,
}
772
773impl From<StringTest> for Test {
774 fn from(value: StringTest) -> Self {
775 Self::String(value)
776 }
777}
778
/// Matches the pattern `str` against the beginning of `buf`, honoring `mods`.
///
/// Returns whether the pattern matched and how many bytes of `buf` were
/// consumed. Without any modifier altering the comparison this is a plain
/// prefix check consuming `str.len()` bytes on success and none on failure.
#[inline(always)]
fn string_match(str: &[u8], mods: FlagSet<StringMod>, buf: &[u8]) -> (bool, usize) {
    let mut consumed = 0;
    // Fast path: no modifier changes how bytes compare.
    if mods.is_disjoint(
        StringMod::UpperInsensitive
            | StringMod::LowerInsensitive
            | StringMod::FullWordMatch
            | StringMod::CompactWhitespace
            | StringMod::OptBlank,
    ) {
        if buf.starts_with(str) {
            (true, str.len())
        } else {
            (false, consumed)
        }
    } else {
        // Index of the next pattern byte to match.
        let mut i_src = 0;
        let mut iter = buf.iter().peekable();

        // Advances in the target buffer, counting the consumed byte.
        macro_rules! consume_target {
            () => {{
                if iter.next().is_some() {
                    consumed += 1;
                }
            }};
        }

        // Advances in both the target buffer and the pattern, then loops.
        macro_rules! continue_next_iteration {
            () => {{
                consume_target!();
                i_src += 1;
                continue;
            }};
        }

        while let Some(&&b) = iter.peek() {
            // Pattern exhausted: stop comparing.
            let Some(&ref_byte) = str.get(i_src) else {
                break;
            };

            // Optional blanks: skip a space on whichever side(s) has one.
            if mods.contains(StringMod::OptBlank) && (b == b' ' || ref_byte == b' ') {
                if b == b' ' {
                    consume_target!();
                }

                if ref_byte == b' ' {
                    i_src += 1;
                }

                continue;
            }

            if mods.contains(StringMod::UpperInsensitive) {
                // An uppercase pattern byte matches both cases in the target.
                if ref_byte.is_ascii_uppercase() && ref_byte == b.to_ascii_uppercase()
                    || ref_byte == b
                {
                    continue_next_iteration!()
                }
            }

            // A lowercase pattern byte matches both cases in the target.
            if mods.contains(StringMod::LowerInsensitive)
                && (ref_byte.is_ascii_lowercase() && ref_byte == b.to_ascii_lowercase()
                    || ref_byte == b)
            {
                continue_next_iteration!()
            }

            if mods.contains(StringMod::CompactWhitespace) && ref_byte == b' ' {
                // Count the run of spaces in the pattern...
                let mut src_blk = 0;
                while let Some(b' ') = str.get(i_src) {
                    src_blk += 1;
                    i_src += 1;
                }

                // ...and the run of spaces in the target.
                let mut tgt_blk = 0;
                while let Some(b' ') = iter.peek() {
                    tgt_blk += 1;
                    consume_target!();
                }

                // The target must hold at least as many spaces as the pattern.
                if src_blk > tgt_blk {
                    return (false, consumed);
                }

                continue;
            }

            if ref_byte == b {
                continue_next_iteration!()
            } else {
                return (false, consumed);
            }
        }

        // Full word: the byte following the match, if any, must be whitespace.
        if mods.contains(StringMod::FullWordMatch)
            && let Some(b) = iter.peek()
            && !b.is_ascii_whitespace()
        {
            return (false, consumed);
        }

        // Success requires some target bytes consumed and the pattern exhausted.
        (
            consumed > 0 && str.get(i_src).is_none() && consumed <= buf.len(),
            consumed,
        )
    }
}
891
892impl StringTest {
893 fn has_length_mod(&self) -> bool {
894 !self.mods.is_disjoint(
895 StringMod::UpperInsensitive
896 | StringMod::LowerInsensitive
897 | StringMod::FullWordMatch
898 | StringMod::CompactWhitespace
899 | StringMod::OptBlank,
900 )
901 }
902
903 #[inline(always)]
904 fn test_value_len(&self) -> usize {
905 match self.test_val.as_ref() {
906 TestValue::Value(s) => s.len(),
907 TestValue::Any => 0,
908 }
909 }
910
911 #[inline(always)]
912 fn is_binary(&self) -> bool {
913 self.binary || self.mods.contains(StringMod::ForceBin)
914 }
915
916 #[inline(always)]
917 fn is_text(&self) -> bool {
918 self.mods.contains(StringMod::ForceText)
919 }
920}
921
// Test searching for a string anywhere in a range of the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SearchTest {
    // Bytes to search for.
    str: Vec<u8>,
    // Highest position, relative to the start of the buffer, at which a match
    // may begin; unlimited when `None`.
    n_pos: Option<usize>,
    // String modifiers.
    str_mods: FlagSet<StringMod>,
    // Regex modifiers.
    re_mods: FlagSet<ReMod>,
    // Marks the test as binary (see `is_binary`).
    binary: bool,
    // Comparison operator; `Neq` turns an absence of match into a match.
    cmp_op: CmpOp,
}
931
932impl From<SearchTest> for Test {
933 fn from(value: SearchTest) -> Self {
934 Self::Search(value)
935 }
936}
937
938impl SearchTest {
939 #[inline(always)]
940 fn is_binary(&self) -> bool {
941 (self.binary
942 || self.str_mods.contains(StringMod::ForceBin)
943 || self.re_mods.contains(ReMod::ForceBin))
944 && !(self.str_mods.contains(StringMod::ForceText)
945 || self.re_mods.contains(ReMod::ForceText))
946 }
947
948 #[inline]
950 fn match_buf<'buf>(&self, off_buf: u64, buf: &'buf [u8]) -> Option<MatchRes<'buf>> {
951 let mut i = 0;
952
953 let needle = self.str.first()?;
954
955 while i < buf.len() {
956 let Some(k) = memchr(*needle, &buf[i..]) else {
959 break;
960 };
961
962 i += k;
963
964 if self.str_mods.contains(StringMod::FullWordMatch) {
966 let prev_is_whitespace = buf
967 .get(i.saturating_sub(1))
968 .map(|c| c.is_ascii_whitespace())
969 .unwrap_or_default();
970
971 if i > 0 && !prev_is_whitespace {
976 i += 1;
977 continue;
978 }
979 }
980
981 if let Some(npos) = self.n_pos
982 && i > npos
983 {
984 break;
985 }
986
987 let pos = i;
988 let (ok, consumed) = string_match(&self.str, self.str_mods, &buf[i..]);
989
990 if ok {
991 return Some(MatchRes::Bytes(
992 off_buf.saturating_add(pos as u64),
993 None,
994 &buf[i..i + consumed],
995 Encoding::Utf8,
996 ));
997 } else {
998 i += max(consumed, 1)
999 }
1000 }
1001
1002 if self.cmp_op.is_neq() {
1004 return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
1005 }
1006
1007 None
1008 }
1009}
1010
// Test comparing a scalar read from the input with a value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTest {
    // Data type of the scalar to read.
    ty: ScalarDataType,
    // Optional transformation applied to the read value before comparison.
    transform: Option<ScalarTransform>,
    // Comparison operator.
    cmp_op: CmpOp,
    // Value to compare with, or `Any` to accept whatever is read.
    test_val: TestValue<Scalar>,
}
1018
// Test comparing a float read from the input with a value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTest {
    // Data type of the float to read.
    ty: FloatDataType,
    // Optional transformation applied to the read value before comparison.
    transform: Option<FloatTransform>,
    // Comparison operator; only `Eq`, `Lt`, `Gt` and `Neq` are supported.
    cmp_op: CmpOp,
    // Value to compare with, or `Any` to accept whatever is read.
    test_val: TestValue<Float>,
}
1026
// Value read from the input for a test, along with the offset it was read at.
#[derive(Debug, PartialEq)]
enum ReadValue<'buf> {
    // Offset and float value.
    Float(u64, Float),
    // Offset and scalar value.
    Scalar(u64, Scalar),
    // Offset and raw bytes borrowed from the input buffer.
    Bytes(u64, &'buf [u8]),
}
1035
1036impl DynDisplay for ReadValue<'_> {
1037 fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1038 match self {
1039 Self::Float(_, s) => DynDisplay::dyn_fmt(s, f),
1040 Self::Scalar(_, s) => DynDisplay::dyn_fmt(s, f),
1041 Self::Bytes(_, b) => Ok(format!("{b:?}")),
1042 }
1043 }
1044}
1045
1046impl DynDisplay for &ReadValue<'_> {
1047 fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1048 DynDisplay::dyn_fmt(*self, f)
1050 }
1051}
1052
1053impl Display for ReadValue<'_> {
1054 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1055 match self {
1056 Self::Float(_, v) => write!(f, "{v}"),
1057 Self::Scalar(_, s) => write!(f, "{s}"),
1058 Self::Bytes(_, b) => write!(f, "{b:?}"),
1059 }
1060 }
1061}
1062
// Encoding of the bytes carried by a `MatchRes::Bytes`, used to turn them
// into text when formatting.
enum Encoding {
    // UTF-16 with the given byte order.
    Utf16(String16Encoding),
    // UTF-8 (decoded lossily).
    Utf8,
}
1067
// Result of a successful test.
enum MatchRes<'buf> {
    // Start offset, optional explicit end offset (when `None` the end is the
    // start offset plus the length of the bytes), matched bytes and their encoding.
    Bytes(u64, Option<u64>, &'buf [u8], Encoding),
    // Offset and matched scalar.
    Scalar(u64, Scalar),
    // Offset and matched float.
    Float(u64, Float),
}
1079
1080impl DynDisplay for &MatchRes<'_> {
1081 fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1082 (*self).dyn_fmt(f)
1083 }
1084}
1085
1086impl DynDisplay for MatchRes<'_> {
1087 fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1088 match self {
1089 Self::Scalar(_, v) => v.dyn_fmt(f),
1090 Self::Float(_, v) => v.dyn_fmt(f),
1091 Self::Bytes(_, _, v, enc) => match enc {
1092 Encoding::Utf8 => String::from_utf8_lossy(v).to_string().dyn_fmt(f),
1093 Encoding::Utf16(enc) => {
1094 let utf16: Vec<u16> = slice_to_utf16_iter(v, *enc).collect();
1095 String::from_utf16_lossy(&utf16).dyn_fmt(f)
1096 }
1097 },
1098 }
1099 }
1100}
1101
1102impl MatchRes<'_> {
1103 #[inline]
1105 fn start_offset(&self) -> u64 {
1106 match self {
1107 MatchRes::Bytes(o, _, _, _) => *o,
1108 MatchRes::Scalar(o, _) => *o,
1109 MatchRes::Float(o, _) => *o,
1110 }
1111 }
1112
1113 #[inline]
1115 fn end_offset(&self) -> u64 {
1116 match self {
1117 MatchRes::Bytes(start, end, buf, _) => match end {
1118 Some(end) => *end,
1119 None => start.saturating_add(buf.len() as u64),
1120 },
1121 MatchRes::Scalar(o, sc) => o.add(sc.size_of() as u64),
1122 MatchRes::Float(o, f) => o.add(f.size_of() as u64),
1123 }
1124 }
1125}
1126
1127fn slice_to_utf16_iter(read: &[u8], encoding: String16Encoding) -> impl Iterator<Item = u16> {
1128 let even = read
1129 .iter()
1130 .enumerate()
1131 .filter(|(i, _)| i % 2 == 0)
1132 .map(|t| t.1);
1133
1134 let odd = read
1135 .iter()
1136 .enumerate()
1137 .filter(|(i, _)| i % 2 != 0)
1138 .map(|t| t.1);
1139
1140 even.zip(odd).map(move |(e, o)| match encoding {
1141 String16Encoding::Le => u16::from_le_bytes([*e, *o]),
1142 String16Encoding::Be => u16::from_be_bytes([*e, *o]),
1143 })
1144}
1145
// Byte order of UTF-16 code units.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum String16Encoding {
    // Little-endian.
    Le,
    // Big-endian.
    Be,
}
1151
// Test comparing UTF-16 encoded bytes of the input with a string.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct String16Test {
    // String reported as the matched bytes when the test value matches.
    // NOTE(review): presumably the string as written in the rule — confirm.
    orig: String,
    // UTF-16 code units to compare with, or `Any` to accept whatever is read.
    test_val: TestValue<Vec<u16>>,
    // Byte order used to decode the input.
    encoding: String16Encoding,
}
1158
1159impl String16Test {
1160 #[inline(always)]
1164 fn test_value_len(&self) -> usize {
1165 match self.test_val.as_ref() {
1166 TestValue::Value(str16) => str16.len(),
1167 TestValue::Any => 0,
1168 }
1169 }
1170}
1171
// Modifier flags of indirect tests.
flags! {
    enum IndirectMod: u8{
        // NOTE(review): presumably makes the indirect offset relative — not
        // consumed in this section, confirm.
        Relative,
    }
}

// Set of indirect test modifiers.
type IndirectMods = FlagSet<IndirectMod>;
1179
// Encoding of the length prefix of a Pascal-style string.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum PStringLen {
    // One byte.
    Byte,
    // Two bytes, big-endian.
    ShortBe,
    // Two bytes, little-endian.
    ShortLe,
    // Four bytes, big-endian.
    LongBe,
    // Four bytes, little-endian.
    LongLe,
}
1188
1189impl PStringLen {
1190 #[inline(always)]
1191 const fn size_of_len(&self) -> usize {
1192 match self {
1193 PStringLen::Byte => 1,
1194 PStringLen::ShortBe => 2,
1195 PStringLen::ShortLe => 2,
1196 PStringLen::LongBe => 4,
1197 PStringLen::LongLe => 4,
1198 }
1199 }
1200}
1201
// Test comparing a length-prefixed (Pascal-style) string with a value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PStringTest {
    // Encoding of the length prefix.
    len: PStringLen,
    // Bytes to compare with, or `Any` to accept whatever is read.
    test_val: TestValue<Vec<u8>>,
    // Whether the stored length also counts the length prefix itself.
    include_len: bool,
}
1208
1209impl PStringTest {
1210 #[inline]
1211 fn read<'cache, R: Read + Seek>(
1212 &self,
1213 haystack: &'cache mut LazyCache<R>,
1214 ) -> Result<Option<&'cache [u8]>, Error> {
1215 let mut len = match self.len {
1216 PStringLen::Byte => read_le!(haystack, u8) as u32,
1217 PStringLen::ShortBe => read_be!(haystack, u16) as u32,
1218 PStringLen::ShortLe => read_le!(haystack, u16) as u32,
1219 PStringLen::LongBe => read_be!(haystack, u32),
1220 PStringLen::LongLe => read_le!(haystack, u32),
1221 } as usize;
1222
1223 if self.include_len {
1224 len = len.saturating_sub(self.len.size_of_len())
1225 }
1226
1227 if let TestValue::Value(s) = self.test_val.as_ref()
1228 && len != s.len()
1229 {
1230 return Ok(None);
1231 }
1232
1233 let read = haystack.read_exact_count(len as u64)?;
1234
1235 Ok(Some(read))
1236 }
1237
1238 #[inline(always)]
1239 fn test_value_len(&self) -> usize {
1240 match self.test_val.as_ref() {
1241 TestValue::Value(s) => s.len(),
1242 TestValue::Any => 0,
1243 }
1244 }
1245}
1246
// Test carried by a rule entry. Only the variants reading a value from the
// input (scalar, float, string, search, pstring, regex, string16) are
// handled by `read_test_value`.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Test {
    // Carries a name.
    // NOTE(review): presumably declares a named rule — confirm.
    Name(String),
    // Carries a flag and a name.
    // NOTE(review): presumably references a named rule; the meaning of the
    // flag is not visible in this section — confirm.
    Use(bool, String),
    // Scalar comparison.
    Scalar(ScalarTest),
    // Float comparison.
    Float(FloatTest),
    // String comparison.
    String(StringTest),
    // String search.
    Search(SearchTest),
    // Length-prefixed string comparison.
    PString(PStringTest),
    // Regular expression match.
    Regex(RegexTest),
    // Indirect test with its modifiers.
    Indirect(FlagSet<IndirectMod>),
    // UTF-16 string comparison.
    String16(String16Test),
    // No value is read for this variant.
    #[allow(dead_code)]
    Der,
    // No value is read for this variant.
    Clear,
    // No value is read for this variant.
    Default,
}
1265
1266impl Test {
    /// Reads from `haystack`, at its current position, the value this test
    /// must be matched against.
    ///
    /// `switch_endianness` is forwarded to scalar and float reads. The
    /// returned value carries the stream position observed before reading.
    /// `Ok(None)` is returned when a pstring test rules out a match from the
    /// length prefix alone.
    ///
    /// # Errors
    /// Propagates read errors, and fails for string tests using an
    /// unsupported operator as well as for tests that read no value.
    #[inline]
    fn read_test_value<'haystack, R: Read + Seek>(
        &self,
        haystack: &'haystack mut LazyCache<R>,
        switch_endianness: bool,
    ) -> Result<Option<ReadValue<'haystack>>, Error> {
        // Offset reported along with the value.
        let test_value_offset = haystack.lazy_stream_position();

        match self {
            Self::Scalar(t) => {
                t.ty.read(haystack, switch_endianness)
                    .map(|s| Some(ReadValue::Scalar(test_value_offset, s)))
            }

            Self::Float(t) => {
                t.ty.read(haystack, switch_endianness)
                    .map(|f| Some(ReadValue::Float(test_value_offset, f)))
            }
            Self::String(t) => {
                match t.test_val.as_ref() {
                    TestValue::Value(str) => {
                        // An explicit length always wins.
                        let buf = if let Some(length) = t.length {
                            haystack.read_exact_count(length as u64)?
                        } else {
                            match t.cmp_op {
                                CmpOp::Eq | CmpOp::Neq => {
                                    if !t.has_length_mod() {
                                        // Plain comparison: exactly the pattern length.
                                        haystack.read_exact_count(str.len() as u64)?
                                    } else {
                                        // Modifiers can make the matched input longer
                                        // than the pattern: read as much as allowed.
                                        haystack.read_count(FILE_BYTES_MAX as u64)?
                                    }
                                }
                                CmpOp::Lt | CmpOp::Gt => {
                                    // NOTE(review): the limit is 8092 here while the
                                    // wildcard branch below uses 8192 — confirm whether
                                    // this is intended.
                                    let read =
                                        haystack.read_until_any_delim_or_limit(b"\n\0", 8092)?;

                                    // Drop the trailing delimiter, if any.
                                    if read.ends_with(b"\0") || read.ends_with(b"\n") {
                                        &read[..read.len() - 1]
                                    } else {
                                        read
                                    }
                                }
                                _ => {
                                    return Err(Error::Msg(format!(
                                        "string test does not support {:?} operator",
                                        t.cmp_op
                                    )));
                                }
                            }
                        };

                        Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
                    }
                    TestValue::Any => {
                        // Wildcard: read up to a NUL or newline, limit included.
                        let read = haystack.read_until_any_delim_or_limit(b"\0\n", 8192)?;
                        // Drop the trailing delimiter, if any.
                        let bytes = if read.ends_with(b"\0") || read.ends_with(b"\n") {
                            &read[..read.len() - 1]
                        } else {
                            read
                        };

                        Ok(Some(ReadValue::Bytes(test_value_offset, bytes)))
                    }
                }
            }

            Self::String16(t) => {
                match t.test_val.as_ref() {
                    TestValue::Value(str16) => {
                        // Two bytes per UTF-16 code unit.
                        let read = haystack.read_exact_count((str16.len() * 2) as u64)?;

                        Ok(Some(ReadValue::Bytes(test_value_offset, read)))
                    }
                    TestValue::Any => {
                        let read = haystack.read_until_utf16_or_limit(b"\x00\x00", 8192)?;

                        // Keep an even number of bytes so that only whole
                        // code units remain.
                        let end = if read.len() % 2 == 0 {
                            read.len()
                        } else {
                            read.len().saturating_sub(1)
                        };

                        Ok(Some(ReadValue::Bytes(test_value_offset, &read[..end])))
                    }
                }
            }

            Self::PString(t) => {
                let Some(read) = t.read(haystack)? else {
                    return Ok(None);
                };
                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
            }

            Self::Search(_) => {
                let buf = haystack.read_count(FILE_BYTES_MAX as u64)?;
                Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
            }

            Self::Regex(r) => {
                let length = {
                    match r.length {
                        Some(len) => {
                            // With a line limit the length counts lines: read
                            // 80 bytes per line.
                            if r.mods.contains(ReMod::LineLimit) {
                                len * 80
                            } else {
                                len
                            }
                        }

                        None => FILE_REGEX_MAX,
                    }
                };

                let read = haystack.read_count(length as u64)?;
                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
            }

            // These tests do not compare anything read from the input.
            Self::Name(_)
            | Self::Use(_, _)
            | Self::Indirect(_)
            | Self::Clear
            | Self::Default
            | Self::Der => Err(Error::msg("no value to read for this test")),
        }
    }
1401
    /// Matches the value `tv`, previously read from the input, against this
    /// test.
    ///
    /// `stream_kind` is only used by regex tests. Returns the match result,
    /// or `None` when the test fails, when a scalar transformation yields no
    /// value, or when the kind of value does not fit the kind of test.
    #[inline(always)]
    fn match_value<'s>(
        &'s self,
        tv: &ReadValue<'s>,
        stream_kind: StreamKind,
    ) -> Option<MatchRes<'s>> {
        match (self, tv) {
            (Self::Scalar(t), ReadValue::Scalar(o, ts)) => {
                // Apply the optional transformation; a failed one is a non-match.
                let read_value: Scalar = match t.transform.as_ref() {
                    Some(t) => t.apply(*ts)?,
                    None => *ts,
                };

                match t.test_val {
                    TestValue::Value(test_value) => {
                        let ok = match t.cmp_op {
                            // Equal to the bitwise negation of the test value.
                            CmpOp::Not => read_value == !test_value,
                            CmpOp::Eq => read_value == test_value,
                            CmpOp::Lt => read_value < test_value,
                            CmpOp::Gt => read_value > test_value,
                            CmpOp::Neq => read_value != test_value,
                            // All bits of the test value must be set.
                            CmpOp::BitAnd => read_value & test_value == test_value,
                            // No bit of the test value may be set.
                            CmpOp::Xor => (read_value & test_value).is_zero(),
                        };

                        if ok {
                            Some(MatchRes::Scalar(*o, read_value))
                        } else {
                            None
                        }
                    }

                    TestValue::Any => Some(MatchRes::Scalar(*o, read_value)),
                }
            }

            (Self::Float(t), ReadValue::Float(o, f)) => {
                let read_value: Float = t.transform.as_ref().map(|t| t.apply(*f)).unwrap_or(*f);

                match t.test_val {
                    TestValue::Value(tf) => {
                        let ok = match t.cmp_op {
                            CmpOp::Eq => read_value == tf,
                            CmpOp::Lt => read_value < tf,
                            CmpOp::Gt => read_value > tf,
                            CmpOp::Neq => read_value != tf,
                            _ => {
                                // Panics with debug assertions, otherwise logs
                                // and fails the test.
                                debug_panic!("unsupported float comparison");
                                debug!("unsupported float comparison");
                                false
                            }
                        };

                        if ok {
                            Some(MatchRes::Float(*o, read_value))
                        } else {
                            None
                        }
                    }
                    TestValue::Any => Some(MatchRes::Float(*o, read_value)),
                }
            }

            (Self::String(st), ReadValue::Bytes(o, buf)) => {
                // Trims ASCII whitespace when the `Trim` modifier is set.
                macro_rules! trim_buf {
                    ($buf: expr) => {{
                        if st.mods.contains(StringMod::Trim) {
                            $buf.trim_ascii()
                        } else {
                            $buf
                        }
                    }};
                }

                match st.test_val.as_ref() {
                    TestValue::Value(str) => {
                        match st.cmp_op {
                            // `Eq` and `Neq` report the test value itself,
                            // not the bytes read.
                            CmpOp::Eq => {
                                if let (true, _) = string_match(str, st.mods, buf) {
                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
                                } else {
                                    None
                                }
                            }
                            CmpOp::Neq => {
                                if let (false, _) = string_match(str, st.mods, buf) {
                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
                                } else {
                                    None
                                }
                            }
                            // `Gt` and `Lt` compare lengths and report the
                            // bytes read.
                            CmpOp::Gt => {
                                if buf.len() > str.len() {
                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
                                } else {
                                    None
                                }
                            }
                            CmpOp::Lt => {
                                if buf.len() < str.len() {
                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
                                } else {
                                    None
                                }
                            }

                            _ => {
                                // Panics with debug assertions, otherwise logs
                                // and fails the test.
                                debug_panic!("unsupported string comparison");
                                debug!("unsupported string comparison");
                                None
                            }
                        }
                    }
                    TestValue::Any => {
                        Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
                    }
                }
            }

            (Self::PString(m), ReadValue::Bytes(o, buf)) => match m.test_val.as_ref() {
                TestValue::Value(psv) => {
                    if buf == psv {
                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8))
                    } else {
                        None
                    }
                }
                TestValue::Any => Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8)),
            },

            (Self::String16(t), ReadValue::Bytes(o, buf)) => {
                match t.test_val.as_ref() {
                    TestValue::Value(str16) => {
                        // Two bytes per code unit: different sizes cannot match.
                        if str16.len() * 2 != buf.len() {
                            return None;
                        }

                        // Compare code unit by code unit.
                        for (i, utf16_char) in slice_to_utf16_iter(buf, t.encoding).enumerate() {
                            if str16[i] != utf16_char {
                                return None;
                            }
                        }

                        // The bytes of `orig` are reported, tagged as UTF-16.
                        Some(MatchRes::Bytes(
                            *o,
                            None,
                            t.orig.as_bytes(),
                            Encoding::Utf16(t.encoding),
                        ))
                    }

                    TestValue::Any => {
                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf16(t.encoding)))
                    }
                }
            }

            (Self::Regex(r), ReadValue::Bytes(o, buf)) => r.match_buf(*o, stream_kind, buf),

            (Self::Search(t), ReadValue::Bytes(o, buf)) => t.match_buf(*o, buf),

            // Any other combination of test and value cannot match.
            _ => None,
        }
    }
1575
1576 #[inline(always)]
1577 fn strength(&self) -> u64 {
1578 const MULT: usize = 10;
1579
1580 let mut out = 2 * MULT;
1581
1582 match self {
1584 Test::Scalar(s) => {
1585 out += s.ty.type_size() * MULT;
1586 }
1587
1588 Test::Float(t) => {
1589 out += t.ty.type_size() * MULT;
1590 }
1591
1592 Test::String(t) => out += t.test_value_len().saturating_mul(MULT),
1593
1594 Test::PString(t) => out += t.test_value_len().saturating_mul(MULT),
1595
1596 Test::Search(s) => {
1597 let n_pos = s.n_pos.unwrap_or(FILE_BYTES_MAX);
1602
1603 match n_pos {
1604 0..=80 => out += s.str.len().saturating_mul(MULT),
1606 81..=240 => out += s.str.len() * s.str.len().clamp(0, MULT - 2),
1608 _ => out += s.str.len(),
1610 }
1611 }
1612
1613 Test::Regex(r) => {
1614 let v = r.non_magic_len / r.re.captures_len();
1623
1624 let len = r
1625 .length
1626 .map(|l| {
1627 if r.mods.contains(ReMod::LineLimit) {
1628 l * 80
1629 } else {
1630 l
1631 }
1632 })
1633 .unwrap_or(FILE_BYTES_MAX);
1634
1635 match len {
1636 0..=80 => out += v.saturating_mul(MULT),
1638 81..=240 => out += v * v.clamp(0, MULT - 2),
1640 _ => out += v,
1642 }
1643 }
1644
1645 Test::String16(t) => {
1646 out += t.test_value_len().saturating_mul(MULT);
1651 }
1652
1653 Test::Der => out += MULT,
1654
1655 Test::Default | Test::Name(_) | Test::Use(_, _) | Test::Indirect(_) | Test::Clear => {
1656 return 0;
1657 }
1658 }
1659
1660 if self.is_match_any() {
1662 return 0;
1663 }
1664
1665 if let Some(op) = self.cmp_op() {
1666 match op {
1667 CmpOp::Neq => out = 0,
1669 CmpOp::Eq | CmpOp::Not => out += MULT,
1670 CmpOp::Lt | CmpOp::Gt => out -= 2 * MULT,
1671 CmpOp::Xor | CmpOp::BitAnd => out -= MULT,
1672 }
1673 }
1674
1675 out as u64
1676 }
1677
1678 #[inline(always)]
1679 fn cmp_op(&self) -> Option<CmpOp> {
1680 match self {
1681 Self::String(t) => Some(t.cmp_op),
1682 Self::Scalar(s) => Some(s.cmp_op),
1683 Self::Float(t) => Some(t.cmp_op),
1684 Self::Name(_)
1685 | Self::Use(_, _)
1686 | Self::Search(_)
1687 | Self::PString(_)
1688 | Self::Regex(_)
1689 | Self::Clear
1690 | Self::Default
1691 | Self::Indirect(_)
1692 | Self::String16(_)
1693 | Self::Der => None,
1694 }
1695 }
1696
    /// Returns `true` for `use` and `indirect` tests, i.e. the tests which
    /// are evaluated by running other rules.
    #[inline(always)]
    fn is_recursive(&self) -> bool {
        matches!(self, Test::Use(_, _) | Test::Indirect(_))
    }
1701
1702 #[inline(always)]
1703 fn is_match_any(&self) -> bool {
1704 match self {
1705 Test::Name(_) => false,
1706 Test::Use(_, _) => false,
1707 Test::Scalar(scalar_test) => matches!(scalar_test.test_val, TestValue::Any),
1708 Test::Float(float_test) => matches!(float_test.test_val, TestValue::Any),
1709 Test::String(string_test) => matches!(string_test.test_val, TestValue::Any),
1710 Test::Search(_) => false,
1711 Test::PString(pstring_test) => matches!(pstring_test.test_val, TestValue::Any),
1712 Test::Regex(_) => false,
1713 Test::Indirect(_) => false,
1714 Test::String16(string16_test) => matches!(string16_test.test_val, TestValue::Any),
1715 Test::Der => false,
1716 Test::Clear => false,
1717 Test::Default => false,
1718 }
1719 }
1720
1721 #[inline(always)]
1722 fn is_binary(&self) -> bool {
1723 match self {
1724 Self::Name(_) => true,
1725 Self::Use(_, _) => true,
1726 Self::Scalar(_) => true,
1727 Self::Float(_) => true,
1728 Self::String(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1729 Self::Search(t) => t.is_binary(),
1730 Self::PString(_) => true,
1731 Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1732 Self::Clear => true,
1733 Self::Default => true,
1734 Self::Indirect(_) => true,
1735 Self::String16(_) => true,
1736 Self::Der => true,
1737 }
1738 }
1739
1740 #[inline(always)]
1741 fn is_text(&self) -> bool {
1742 match self {
1743 Self::Name(_) => true,
1744 Self::Use(_, _) => true,
1745 Self::Indirect(_) => true,
1746 Self::Clear => true,
1747 Self::Default => true,
1748 Self::String(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1749 Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1750 _ => !self.is_binary(),
1751 }
1752 }
1753
    /// Returns `true` when the test applies to text streams and not to
    /// binary streams.
    #[inline(always)]
    fn is_only_text(&self) -> bool {
        self.is_text() && !self.is_binary()
    }
1758
    /// Returns `true` when the test applies to binary streams and not to
    /// text streams.
    #[inline(always)]
    fn is_only_binary(&self) -> bool {
        self.is_binary() && !self.is_text()
    }
1763}
1764
/// Encoding of the value read from the stream when resolving an indirect
/// offset (see `IndOffset::read_offset`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum OffsetType {
    /// One byte.
    Byte,
    /// Little-endian `f64`, converted to an integer.
    DoubleLe,
    /// Big-endian `f64`, converted to an integer.
    DoubleBe,
    /// Little-endian 16-bit integer.
    ShortLe,
    /// Big-endian 16-bit integer.
    ShortBe,
    /// Little-endian 32-bit value passed through `decode_id3`.
    Id3Le,
    /// Big-endian 32-bit value passed through `decode_id3`.
    Id3Be,
    /// Little-endian 32-bit integer.
    LongLe,
    /// Big-endian 32-bit integer.
    LongBe,
    /// 32-bit value made of two little-endian 16-bit halves, the first one
    /// being the most significant (see the `read_me!` macro).
    Middle,
    /// Value parsed by `read_octal_u64`.
    Octal,
    /// Big-endian 64-bit integer.
    QuadBe,
    /// Little-endian 64-bit integer.
    QuadLe,
}
1781
/// Operand of the operation applied to an indirect offset.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Shift {
    /// The operand is the literal value.
    Direct(u64),
    /// The operand is itself read from the stream, at the given distance
    /// from the address the offset was read from.
    Indirect(i64),
}
1787
1788#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
1789struct IndOffset {
1790 off_addr: DirOffset,
1792 signed: bool,
1794 ty: OffsetType,
1796 op: Option<Op>,
1797 shift: Option<Shift>,
1798}
1799
1800impl IndOffset {
1801 fn read_offset<R: Read + Seek>(
1803 &self,
1804 haystack: &mut LazyCache<R>,
1805 rule_base_offset: Option<u64>,
1806 last_upper_match_offset: Option<u64>,
1807 ) -> Result<Option<u64>, io::Error> {
1808 let offset_address = match self.off_addr {
1809 DirOffset::Start(s) => {
1810 let Some(o) = s.checked_add(rule_base_offset.unwrap_or_default()) else {
1811 return Ok(None);
1812 };
1813
1814 haystack.seek(SeekFrom::Start(o))?
1815 }
1816 DirOffset::LastUpper(c) => haystack.seek(SeekFrom::Start(
1817 (last_upper_match_offset.unwrap_or_default() as i64 + c) as u64,
1818 ))?,
1819 DirOffset::End(e) => haystack.seek(SeekFrom::End(e))?,
1820 };
1821
1822 macro_rules! read_value {
1823 () => {
1824 match self.ty {
1825 OffsetType::Byte => {
1826 if self.signed {
1827 read_le!(haystack, u8) as u64
1828 } else {
1829 read_le!(haystack, i8) as u64
1830 }
1831 }
1832 OffsetType::DoubleLe => read_le!(haystack, f64) as u64,
1833 OffsetType::DoubleBe => read_be!(haystack, f64) as u64,
1834 OffsetType::ShortLe => {
1835 if self.signed {
1836 read_le!(haystack, i16) as u64
1837 } else {
1838 read_le!(haystack, u16) as u64
1839 }
1840 }
1841 OffsetType::ShortBe => {
1842 if self.signed {
1843 read_be!(haystack, i16) as u64
1844 } else {
1845 read_be!(haystack, u16) as u64
1846 }
1847 }
1848 OffsetType::Id3Le => decode_id3(read_le!(haystack, u32)) as u64,
1849 OffsetType::Id3Be => decode_id3(read_be!(haystack, u32)) as u64,
1850 OffsetType::LongLe => {
1851 if self.signed {
1852 read_le!(haystack, i32) as u64
1853 } else {
1854 read_le!(haystack, u32) as u64
1855 }
1856 }
1857 OffsetType::LongBe => {
1858 if self.signed {
1859 read_be!(haystack, i32) as u64
1860 } else {
1861 read_be!(haystack, u32) as u64
1862 }
1863 }
1864 OffsetType::Middle => read_me!(haystack) as u64,
1865 OffsetType::Octal => {
1866 if let Some(o) = read_octal_u64(haystack) {
1867 o
1868 } else {
1869 debug!("failed to read octal offset @ {offset_address}");
1870 return Ok(None);
1871 }
1872 }
1873 OffsetType::QuadLe => {
1874 if self.signed {
1875 read_le!(haystack, i64) as u64
1876 } else {
1877 read_le!(haystack, u64)
1878 }
1879 }
1880 OffsetType::QuadBe => {
1881 if self.signed {
1882 read_be!(haystack, i64) as u64
1883 } else {
1884 read_be!(haystack, u64)
1885 }
1886 }
1887 }
1888 };
1889 }
1890
1891 let o = read_value!();
1893
1894 trace!(
1895 "offset read @ {offset_address} value={o} op={:?} shift={:?}",
1896 self.op, self.shift
1897 );
1898
1899 if let (Some(op), Some(shift)) = (self.op, self.shift) {
1901 let shift = match shift {
1902 Shift::Direct(i) => i,
1903 Shift::Indirect(i) => {
1904 let tmp = offset_address as i128 + i as i128;
1905 if tmp.is_negative() {
1906 return Ok(None);
1907 } else {
1908 haystack.seek(SeekFrom::Start(tmp as u64))?;
1909 };
1910 read_value!()
1913 }
1914 };
1915
1916 match op {
1917 Op::Add => return Ok(o.checked_add(shift)),
1918 Op::Mul => return Ok(o.checked_mul(shift)),
1919 Op::Sub => return Ok(o.checked_sub(shift)),
1920 Op::Div => return Ok(o.checked_div(shift)),
1921 Op::Mod => return Ok(o.checked_rem(shift)),
1922 Op::And => return Ok(Some(o & shift)),
1923 Op::Or => return Ok(Some(o | shift)),
1924 Op::Xor => return Ok(Some(o ^ shift)),
1925 }
1926 }
1927
1928 Ok(Some(o))
1929 }
1930}
1931
/// Offset which can be resolved without reading the stream content.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum DirOffset {
    /// Offset counted from the start of the stream.
    Start(u64),
    /// Signed distance from the offset reached by the upper level match.
    LastUpper(i64),
    /// Offset handed to `SeekFrom::End`.
    End(i64),
}
1939
/// Offset at which a test is applied.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Offset {
    /// The offset is known without reading the stream content.
    Direct(DirOffset),
    /// The offset must be read from the stream.
    Indirect(IndOffset),
}
1945
impl From<DirOffset> for Offset {
    /// Wraps a direct offset into [`Offset::Direct`].
    fn from(value: DirOffset) -> Self {
        Self::Direct(value)
    }
}
1951
impl From<IndOffset> for Offset {
    /// Wraps an indirect offset into [`Offset::Indirect`].
    fn from(value: IndOffset) -> Self {
        Self::Indirect(value)
    }
}
1957
1958impl Display for DirOffset {
1959 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1960 match self {
1961 DirOffset::Start(i) => write!(f, "{i}"),
1962 DirOffset::LastUpper(c) => write!(f, "&{c}"),
1963 DirOffset::End(e) => write!(f, "-{e}"),
1964 }
1965 }
1966}
1967
impl Default for DirOffset {
    /// The default offset is `LastUpper(0)`: no shift from the offset of the
    /// upper level match.
    fn default() -> Self {
        Self::LastUpper(0)
    }
}
1973
/// A single test of a magic rule together with the location it applies to.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Match {
    /// Line of the entry, reported in errors and logs.
    line: usize,
    /// Depth of the entry; it is also its continuation level.
    depth: u8,
    /// Offset at which the test is applied.
    offset: Offset,
    /// Test to run.
    test: Test,
    /// Cached value of `test.strength()`.
    test_strength: u64,
    /// Message attached to the entry, if any.
    message: Option<Message>,
}
1983
1984impl From<Use> for Match {
1985 fn from(value: Use) -> Self {
1986 let test = Test::Use(value.switch_endianness, value.rule_name);
1987 let test_strength = test.strength();
1988 Self {
1989 line: value.line,
1990 depth: value.depth,
1991 offset: value.start_offset,
1992 test,
1993 test_strength,
1994 message: value.message,
1995 }
1996 }
1997}
1998
1999impl From<Name> for Match {
2000 fn from(value: Name) -> Self {
2001 let test = Test::Name(value.name);
2002 let test_strength = test.strength();
2003 Self {
2004 line: value.line,
2005 depth: 0,
2006 offset: Offset::Direct(DirOffset::Start(0)),
2007 test,
2008 test_strength,
2009 message: value.message,
2010 }
2011 }
2012}
2013
2014impl Match {
2015 #[inline(always)]
2017 fn offset_from_start<R: Read + Seek>(
2018 &self,
2019 haystack: &mut LazyCache<R>,
2020 rule_base_offset: Option<u64>,
2021 last_level_offset: Option<u64>,
2022 ) -> Result<Option<u64>, io::Error> {
2023 match self.offset {
2024 Offset::Direct(dir_offset) => match dir_offset {
2025 DirOffset::Start(s) => Ok(Some(s)),
2026 DirOffset::LastUpper(shift) => {
2027 let o = last_level_offset.unwrap_or_default() as i64 + shift;
2028
2029 if o >= 0 { Ok(Some(o as u64)) } else { Ok(None) }
2030 }
2031 DirOffset::End(e) => Ok(Some(haystack.offset_from_start(SeekFrom::End(e)))),
2032 },
2033 Offset::Indirect(ind_offset) => {
2034 let Some(o) =
2035 ind_offset.read_offset(haystack, rule_base_offset, last_level_offset)?
2036 else {
2037 return Ok(None);
2038 };
2039
2040 Ok(Some(o))
2041 }
2042 }
2043 }
2044
2045 #[inline]
2058 #[allow(clippy::too_many_arguments)]
2059 fn matches<'a: 'h, 'h, R: Read + Seek>(
2060 &'a self,
2061 source: Option<&str>,
2062 magic: &mut Magic<'a>,
2063 stream_kind: StreamKind,
2064 state: &mut MatchState,
2065 buf_base_offset: Option<u64>,
2066 rule_base_offset: Option<u64>,
2067 last_level_offset: Option<u64>,
2068 haystack: &'h mut LazyCache<R>,
2069 switch_endianness: bool,
2070 db: &'a MagicDb,
2071 depth: usize,
2072 ) -> Result<(bool, Option<MatchRes<'h>>), Error> {
2073 let source = source.unwrap_or("unknown");
2074 let line = self.line;
2075
2076 if depth >= MAX_RECURSION {
2077 return Err(Error::localized(
2078 source,
2079 line,
2080 Error::MaximumRecursion(MAX_RECURSION),
2081 ));
2082 }
2083
2084 if self.test.is_only_binary() && stream_kind.is_text() {
2085 trace!("skip binary test source={source} line={line} stream_kind={stream_kind:?}",);
2086 return Ok((false, None));
2087 }
2088
2089 if self.test.is_only_text() && !stream_kind.is_text() {
2090 trace!("skip text test source={source} line={line} stream_kind={stream_kind:?}",);
2091 return Ok((false, None));
2092 }
2093
2094 let Ok(Some(mut offset)) = self
2095 .offset_from_start(haystack, rule_base_offset, last_level_offset)
2096 .inspect_err(|e| debug!("source={source} line={line} failed at computing offset: {e}"))
2097 else {
2098 return Ok((false, None));
2099 };
2100
2101 offset = match self.offset {
2102 Offset::Indirect(_) => {
2103 buf_base_offset.unwrap_or_default().saturating_add(offset)
2108 }
2109 Offset::Direct(DirOffset::Start(_)) => {
2111 rule_base_offset.unwrap_or_default().saturating_add(offset)
2112 }
2113 _ => offset,
2114 };
2115
2116 match &self.test {
2117 Test::Clear => {
2118 trace!("source={source} line={line} clear");
2119 state.clear_continuation_level(&self.continuation_level());
2120 Ok((true, None))
2121 }
2122
2123 Test::Name(name) => {
2124 trace!(
2125 "source={source} line={line} running rule {name} switch_endianness={switch_endianness}",
2126 );
2127 Ok((true, None))
2128 }
2129
2130 Test::Use(flip_endianness, rule_name) => {
2131 trace!(
2132 "source={source} line={line} use {rule_name} switch_endianness={flip_endianness}",
2133 );
2134
2135 let switch_endianness = switch_endianness ^ flip_endianness;
2137
2138 let dr: &DependencyRule = db.dependencies.get(rule_name).ok_or(
2139 Error::localized(source, line, Error::MissingRule(rule_name.clone())),
2140 )?;
2141
2142 if let Some(msg) = self.message.as_ref() {
2144 magic.push_message(msg.to_string_lossy());
2145 }
2146
2147 let nmatch = dr.rule.magic(
2148 magic,
2149 stream_kind,
2150 buf_base_offset,
2151 Some(offset),
2152 haystack,
2153 db,
2154 switch_endianness,
2155 depth.saturating_add(1),
2156 )?;
2157
2158 let matched = nmatch > 1;
2161 if matched {
2162 state.set_continuation_level(self.continuation_level());
2163 }
2164
2165 Ok((matched, None))
2166 }
2167
2168 Test::Indirect(m) => {
2169 trace!(
2170 "source={source} line={line} indirect mods={:?} offset={offset:#x}",
2171 m
2172 );
2173
2174 let new_buf_base_off = if m.contains(IndirectMod::Relative) {
2175 Some(offset)
2176 } else {
2177 None
2178 };
2179
2180 if let Some(msg) = self.message.as_ref() {
2182 magic.push_message(msg.to_string_lossy());
2183 }
2184
2185 let mut nmatch = 0u64;
2186 for r in db.rules.iter() {
2187 let messages_cnt = magic.message.len();
2188 nmatch = nmatch.saturating_add(r.magic(
2189 magic,
2190 stream_kind,
2191 new_buf_base_off,
2192 Some(offset),
2193 haystack,
2194 db,
2195 false,
2196 depth.saturating_add(1),
2197 )?);
2198
2199 if magic.message.len() != messages_cnt {
2201 break;
2202 }
2203 }
2204
2205 Ok((nmatch > 0, None))
2207 }
2208
2209 Test::Default => {
2210 let ok = !state.get_continuation_level(&self.continuation_level());
2212
2213 trace!("source={source} line={line} default match={ok}");
2214 if ok {
2215 state.set_continuation_level(self.continuation_level());
2216 }
2217
2218 Ok((ok, None))
2219 }
2220
2221 _ => {
2222 if let Err(e) = haystack.seek(SeekFrom::Start(offset)) {
2223 debug!("source={source} line={line} failed to seek in haystack: {e}");
2224 return Ok((false, None));
2225 }
2226
2227 let mut trace_msg = None;
2228
2229 if enabled!(Level::DEBUG) {
2230 trace_msg = Some(vec![format!(
2231 "source={source} line={line} depth={} stream_offset={:#x}",
2232 self.depth,
2233 haystack.lazy_stream_position()
2234 )])
2235 }
2236
2237 if let Ok(opt_test_value) = self
2241 .test
2242 .read_test_value(haystack, switch_endianness)
2243 .inspect_err(|e| {
2244 debug!("source={source} line={line} error while reading test value @{offset}: {e}",)
2245 })
2246 {
2247 if let Some(v) = trace_msg
2248 .as_mut() { v.push(format!("test={:?}", self.test)) }
2249
2250 let match_res =
2251 opt_test_value.and_then(|tv| self.test.match_value(&tv, stream_kind));
2252
2253 if let Some(v) = trace_msg.as_mut() { v.push(format!(
2254 "message=\"{}\" match={}",
2255 self.message
2256 .as_ref()
2257 .map(|fs| fs.to_string_lossy())
2258 .unwrap_or_default(),
2259 match_res.is_some()
2260 )) }
2261
2262 if enabled!(Level::DEBUG) && !enabled!(Level::TRACE) && match_res.is_some() {
2264 if let Some(m) = trace_msg{
2265 debug!("{}", m.join(" "));
2266 }
2267 } else if enabled!(Level::TRACE)
2268 && let Some(m) = trace_msg{
2269 trace!("{}", m.join(" "));
2270 }
2271
2272 if let Some(mr) = match_res {
2273 state.set_continuation_level(self.continuation_level());
2274 return Ok((true, Some(mr)));
2275 }
2276 }
2277
2278 Ok((false, None))
2279 }
2280 }
2281 }
2282
    /// Returns the continuation level of this entry, which is its depth.
    #[inline(always)]
    fn continuation_level(&self) -> ContinuationLevel {
        ContinuationLevel(self.depth)
    }
2287}
2288
/// A `use` entry, converted into a [`Match`] holding a `Test::Use`.
#[derive(Debug, Clone)]
struct Use {
    /// Line of the entry.
    line: usize,
    /// Depth of the entry.
    depth: u8,
    /// Offset of the entry.
    start_offset: Offset,
    /// Name of the rule to run.
    rule_name: String,
    /// Whether endianness is flipped while running the rule.
    switch_endianness: bool,
    /// Message attached to the entry, if any.
    message: Option<Message>,
}
2298
2299#[derive(Debug, Clone, Serialize, Deserialize)]
2300struct StrengthMod {
2301 op: Op,
2302 by: u8,
2303}
2304
2305impl StrengthMod {
2306 #[inline(always)]
2307 fn apply(&self, strength: u64) -> u64 {
2308 let by = self.by as u64;
2309 debug!("applying strength modifier: {strength} {} {}", self.op, by);
2310 match self.op {
2311 Op::Mul => strength.saturating_mul(by),
2312 Op::Add => strength.saturating_add(by),
2313 Op::Sub => strength.saturating_sub(by),
2314 Op::Div => {
2315 if by > 0 {
2316 strength.saturating_div(by)
2317 } else {
2318 strength
2319 }
2320 }
2321 Op::Mod => strength % by,
2322 Op::And => strength & by,
2323 Op::Xor | Op::Or => {
2326 debug_panic!("unsupported strength operator");
2327 strength
2328 }
2329 }
2330 }
2331}
2332
/// Metadata entry of a magic rule.
#[derive(Debug, Clone)]
enum Flag {
    /// MIME type.
    Mime(String),
    /// Set of file extensions.
    Ext(HashSet<String>),
    /// Strength modifier.
    Strength(StrengthMod),
    /// Apple type (reported as creator code).
    Apple(String),
}
2340
/// A `name` entry, converted into a [`Match`] holding a `Test::Name`.
#[derive(Debug, Clone)]
struct Name {
    /// Line of the entry.
    line: usize,
    /// Name of the rule.
    name: String,
    /// Message attached to the entry, if any.
    message: Option<Message>,
}
2347
/// Parsed entry of a magic file, paired with the span it was parsed from.
#[derive(Debug, Clone)]
enum Entry<'span> {
    /// A test entry.
    Match(Span<'span>, Match),
    /// A metadata entry.
    Flag(Span<'span>, Flag),
}
2353
/// Node of the entry tree of a rule: a test, its metadata and the entries
/// evaluated when the test matches.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EntryNode {
    /// Whether this node is the root of its rule.
    root: bool,
    /// Test run for this node.
    entry: Match,
    /// Entries evaluated when `entry` matches.
    children: Vec<EntryNode>,
    /// MIME type set on the result when `entry` matches.
    mimetype: Option<String>,
    /// Creator code set on the result when `entry` matches.
    apple: Option<String>,
    /// Modifier applied to the strength of `entry`.
    strength_mod: Option<StrengthMod>,
    /// Extensions inserted in the result when `entry` matches.
    exts: HashSet<String>,
}
2364
/// Accumulator filled while walking the entry tree of a rule.
#[derive(Debug, Default)]
struct EntryNodeVisitor {
    /// Extensions collected so far.
    exts: HashSet<String>,
    /// Score accumulated so far.
    score: u64,
}

impl EntryNodeVisitor {
    /// Creates an empty visitor: no extension and a score of `0`.
    fn new() -> Self {
        Self::default()
    }

    /// Absorbs `other`: extensions are unioned and scores are added,
    /// saturating at `u64::MAX` instead of overflowing.
    fn merge(&mut self, other: Self) {
        self.exts.extend(other.exts);
        self.score = self.score.saturating_add(other.score);
    }
}
2383
2384impl EntryNode {
2385 #[inline]
2386 fn update_visitor(&self, v: &mut EntryNodeVisitor, depth: usize) {
2387 for ext in self.exts.iter() {
2389 if !v.exts.contains(ext) {
2390 v.exts.insert(ext.clone());
2391 }
2392 }
2393
2394 if depth == 0 {
2396 v.score += self.entry.test_strength;
2397 }
2398
2399 v.score += self
2403 .children
2404 .iter()
2405 .map(|e| e.entry.test_strength)
2406 .min()
2407 .unwrap_or_default()
2408 / max(1, depth as u64);
2409 }
2410
2411 fn visit(
2412 &self,
2413 v: &mut EntryNodeVisitor,
2414 deps: &HashMap<String, DependencyRule>,
2415 marked: &mut HashSet<String>,
2416 depth: usize,
2417 ) -> Result<(), Error> {
2418 self.update_visitor(v, depth);
2420
2421 for c in self.children.iter() {
2423 if let Test::Use(_, ref name) = c.entry.test {
2424 if marked.contains(name) {
2425 continue;
2426 }
2427
2428 marked.insert(name.clone());
2429
2430 if let Some(r) = deps.get(name) {
2431 let dv = r.rule.visit_all_entries(deps, marked)?;
2432 v.merge(dv);
2433 } else {
2434 return Err(Error::MissingRule(name.clone()));
2435 }
2436 } else {
2437 c.visit(v, deps, marked, depth + 1)?;
2438 }
2439 }
2440
2441 Ok(())
2442 }
2443
2444 #[inline]
2445 #[allow(clippy::too_many_arguments)]
2446 fn matches<'r, R: Read + Seek>(
2447 &'r self,
2448 opt_source: Option<&str>,
2449 magic: &mut Magic<'r>,
2450 state: &mut MatchState,
2451 stream_kind: StreamKind,
2452 buf_base_offset: Option<u64>,
2453 rule_base_offset: Option<u64>,
2454 last_level_offset: Option<u64>,
2455 haystack: &mut LazyCache<R>,
2456 db: &'r MagicDb,
2457 switch_endianness: bool,
2458 depth: usize,
2459 ) -> Result<u64, Error> {
2460 let mut nmatch = 0u64;
2461
2462 let (ok, opt_match_res) = self.entry.matches(
2463 opt_source,
2464 magic,
2465 stream_kind,
2466 state,
2467 buf_base_offset,
2468 rule_base_offset,
2469 last_level_offset,
2470 haystack,
2471 switch_endianness,
2472 db,
2473 depth,
2474 )?;
2475
2476 let source = opt_source.unwrap_or("unknown");
2477 let line = self.entry.line;
2478
2479 if ok {
2480 nmatch = nmatch.saturating_add(1);
2481
2482 if !self.entry.test.is_recursive()
2486 && let Some(msg) = self.entry.message.as_ref()
2487 && let Ok(msg) = msg.format_with(opt_match_res.as_ref()).inspect_err(|e| {
2488 debug!("source={source} line={line} failed to format message: {e}")
2489 })
2490 {
2491 magic.push_message(msg);
2492 }
2493
2494 if let Some(mr) = opt_match_res {
2496 match &self.entry.test {
2497 Test::String(t) => {
2498 if t.has_length_mod() {
2499 let o = mr.end_offset();
2500 haystack.seek(SeekFrom::Start(o))?;
2501 }
2502 }
2503 Test::Search(t) => {
2504 if t.re_mods.contains(ReMod::StartOffsetUpdate) {
2505 let o = mr.start_offset();
2506 haystack.seek(SeekFrom::Start(o))?;
2507 } else {
2508 let o = mr.end_offset();
2509 haystack.seek(SeekFrom::Start(o))?;
2510 }
2511 }
2512
2513 Test::Regex(t) => {
2514 if t.mods.contains(ReMod::StartOffsetUpdate) {
2515 let o = mr.start_offset();
2516 haystack.seek(SeekFrom::Start(o))?;
2517 } else {
2518 let o = mr.end_offset();
2519 haystack.seek(SeekFrom::Start(o))?;
2520 }
2521 }
2522 _ => {}
2524 }
2525 }
2526
2527 if let Some(mimetype) = self.mimetype.as_ref() {
2528 magic.set_mime_type(Cow::Borrowed(mimetype));
2529 }
2530
2531 if let Some(apple_ty) = self.apple.as_ref() {
2532 magic.set_creator_code(Cow::Borrowed(apple_ty));
2533 }
2534
2535 if !self.exts.is_empty() {
2536 magic.insert_extensions(self.exts.iter().map(|s| s.as_str()));
2537 }
2538
2539 let mut strength = self.entry.test_strength;
2543
2544 let continuation_level = self.entry.continuation_level().0 as u64;
2545 if self.entry.message.is_none() && continuation_level < 3 {
2546 strength = strength.saturating_add(continuation_level);
2547 }
2548
2549 if let Some(sm) = self.strength_mod.as_ref() {
2550 strength = sm.apply(strength);
2551 }
2552
2553 if self.entry.message.is_none() {
2555 strength += 1
2556 }
2557
2558 magic.update_strength(strength);
2559
2560 let end_upper_level = haystack.lazy_stream_position();
2561
2562 let rule_base_offset = if self.root {
2570 match self.entry.offset {
2571 Offset::Direct(DirOffset::End(o)) => {
2572 Some(haystack.offset_from_start(SeekFrom::End(o)))
2573 }
2574 _ => rule_base_offset,
2575 }
2576 } else {
2577 rule_base_offset
2578 };
2579
2580 for e in self.children.iter() {
2581 nmatch = nmatch.saturating_add(e.matches(
2582 opt_source,
2583 magic,
2584 state,
2585 stream_kind,
2586 buf_base_offset,
2587 rule_base_offset,
2588 Some(end_upper_level),
2589 haystack,
2590 db,
2591 switch_endianness,
2592 depth,
2593 )?);
2594 }
2595 }
2596
2597 Ok(nmatch)
2598 }
2599}
2600
/// A magic rule: a tree of entries evaluated against a stream.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicRule {
    /// Identifier of the rule.
    id: usize,
    /// Name of the source the rule comes from, if known.
    source: Option<String>,
    /// Root of the entry tree.
    entries: EntryNode,
    /// Extensions collected over the whole rule when it is finalized.
    extensions: HashSet<String>,
    /// Score computed when the rule is finalized.
    score: u64,
    /// Whether `try_finalize` already ran successfully.
    finalized: bool,
}
2612
2613impl MagicRule {
2614 #[inline(always)]
2615 fn set_id(&mut self, id: usize) {
2616 self.id = id
2617 }
2618
2619 fn visit_all_entries(
2620 &self,
2621 deps: &HashMap<String, DependencyRule>,
2622 marked: &mut HashSet<String>,
2623 ) -> Result<EntryNodeVisitor, Error> {
2624 let mut v = EntryNodeVisitor::new();
2625 self.entries.visit(&mut v, deps, marked, 0)?;
2626 Ok(v)
2627 }
2628
2629 fn try_finalize(&mut self, deps: &HashMap<String, DependencyRule>) -> Result<(), Error> {
2632 if self.finalized {
2633 return Ok(());
2634 }
2635
2636 let v = self.visit_all_entries(deps, &mut HashSet::new())?;
2638
2639 self.extensions.extend(v.exts);
2640 self.score = v.score;
2641 self.finalized = true;
2642
2643 Ok(())
2644 }
2645
2646 #[inline]
2647 fn magic_entrypoint<'r, R: Read + Seek>(
2648 &'r self,
2649 magic: &mut Magic<'r>,
2650 stream_kind: StreamKind,
2651 haystack: &mut LazyCache<R>,
2652 db: &'r MagicDb,
2653 switch_endianness: bool,
2654 depth: usize,
2655 ) -> Result<u64, Error> {
2656 self.entries.matches(
2657 self.source.as_deref(),
2658 magic,
2659 &mut MatchState::empty(),
2660 stream_kind,
2661 None,
2662 None,
2663 None,
2664 haystack,
2665 db,
2666 switch_endianness,
2667 depth,
2668 )
2669 }
2670
2671 #[inline]
2672 #[allow(clippy::too_many_arguments)]
2673 fn magic<'r, R: Read + Seek>(
2674 &'r self,
2675 magic: &mut Magic<'r>,
2676 stream_kind: StreamKind,
2677 buf_base_offset: Option<u64>,
2678 rule_base_offset: Option<u64>,
2679 haystack: &mut LazyCache<R>,
2680 db: &'r MagicDb,
2681 switch_endianness: bool,
2682 depth: usize,
2683 ) -> Result<u64, Error> {
2684 self.entries.matches(
2685 self.source.as_deref(),
2686 magic,
2687 &mut MatchState::empty(),
2688 stream_kind,
2689 buf_base_offset,
2690 rule_base_offset,
2691 None,
2692 haystack,
2693 db,
2694 switch_endianness,
2695 depth,
2696 )
2697 }
2698
2699 pub fn is_text(&self) -> bool {
2705 self.entries.entry.test.is_text()
2706 && self.entries.children.iter().all(|e| e.entry.test.is_text())
2707 }
2708
2709 #[inline(always)]
2715 pub fn score(&self) -> u64 {
2716 self.score
2717 }
2718
2719 #[inline(always)]
2725 pub fn source(&self) -> Option<&str> {
2726 self.source.as_deref()
2727 }
2728
2729 #[inline(always)]
2735 pub fn line(&self) -> usize {
2736 self.entries.entry.line
2737 }
2738
2739 #[inline(always)]
2745 pub fn extensions(&self) -> &HashSet<String> {
2746 &self.extensions
2747 }
2748}
2749
/// A named rule, looked up by `use` tests.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct DependencyRule {
    /// Name of the rule.
    name: String,
    /// The rule itself.
    rule: MagicRule,
}
2755
/// Rules and named dependency rules parsed from a magic file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicSource {
    /// Rules of the source.
    rules: Vec<MagicRule>,
    /// Named rules of the source, indexed by name.
    dependencies: HashMap<String, DependencyRule>,
}

impl MagicSource {
    /// Parses the magic file located at `p`.
    ///
    /// # Errors
    ///
    /// Returns the error reported by the parser.
    pub fn open<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
        FileMagicParser::parse_file(p)
    }
}
2781
/// Continuation level of an entry, i.e. its depth in the rule.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct ContinuationLevel(u8);
2784
/// Encoding of a text stream.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum TextEncoding {
    /// ASCII text.
    Ascii,
    /// UTF-8 text.
    Utf8,
    /// Text in an unidentified encoding.
    Unknown,
}

impl TextEncoding {
    /// Returns the display name of the encoding.
    const fn as_magic_str(&self) -> &'static str {
        match *self {
            Self::Ascii => "ASCII",
            Self::Utf8 => "UTF-8",
            Self::Unknown => "Unknown",
        }
    }
}
2802
2803#[derive(Debug, PartialEq, Eq, Clone, Copy)]
2804enum StreamKind {
2805 Binary,
2806 Text(TextEncoding),
2807}
2808
2809impl StreamKind {
2810 const fn is_text(&self) -> bool {
2811 matches!(self, StreamKind::Text(_))
2812 }
2813}
2814
2815#[derive(Debug)]
2816struct MatchState {
2817 continuation_levels: [bool; 256],
2818}
2819
2820impl MatchState {
2821 #[inline(always)]
2822 fn empty() -> Self {
2823 MatchState {
2824 continuation_levels: [false; 256],
2825 }
2826 }
2827
2828 #[inline(always)]
2829 fn get_continuation_level(&mut self, level: &ContinuationLevel) -> bool {
2830 self.continuation_levels
2831 .get(level.0 as usize)
2832 .cloned()
2833 .unwrap_or_default()
2834 }
2835
2836 #[inline(always)]
2837 fn set_continuation_level(&mut self, level: ContinuationLevel) {
2838 if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2839 *b = true
2840 }
2841 }
2842
2843 #[inline(always)]
2844 fn clear_continuation_level(&mut self, level: &ContinuationLevel) {
2845 if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2846 *b = false;
2847 }
2848 }
2849}
2850
/// Result of the identification of a stream.
#[derive(Debug, Default)]
pub struct Magic<'m> {
    /// Kind of the stream; selects the fallback MIME type.
    stream_kind: Option<StreamKind>,
    /// Source of the rule which produced the result.
    source: Option<Cow<'m, str>>,
    /// Message parts, in the order they were pushed.
    message: Vec<Cow<'m, str>>,
    /// MIME type; only the first one set is kept.
    mime_type: Option<Cow<'m, str>>,
    /// Creator code; only the first one set is kept.
    creator_code: Option<Cow<'m, str>>,
    /// Sum of the strengths of the matching entries.
    strength: u64,
    /// File extensions; only the first non-empty insertion is kept.
    exts: HashSet<Cow<'m, str>>,
    /// Flag returned by `is_default()`.
    is_default: bool,
}
2863
2864impl<'m> Magic<'m> {
2865 #[inline(always)]
2866 fn set_source(&mut self, source: Option<&'m str>) {
2867 self.source = source.map(Cow::Borrowed);
2868 }
2869
2870 #[inline(always)]
2871 fn set_stream_kind(&mut self, stream_kind: StreamKind) {
2872 self.stream_kind = Some(stream_kind)
2873 }
2874
2875 #[inline(always)]
2876 fn reset(&mut self) {
2877 self.stream_kind = None;
2878 self.source = None;
2879 self.message.clear();
2880 self.mime_type = None;
2881 self.creator_code = None;
2882 self.strength = 0;
2883 self.exts.clear();
2884 self.is_default = false;
2885 }
2886
2887 #[inline]
2895 pub fn into_owned<'owned>(self) -> Magic<'owned> {
2896 Magic {
2897 stream_kind: self.stream_kind,
2898 source: self.source.map(|s| Cow::Owned(s.into_owned())),
2899 message: self
2900 .message
2901 .into_iter()
2902 .map(Cow::into_owned)
2903 .map(Cow::Owned)
2904 .collect(),
2905 mime_type: self.mime_type.map(|m| Cow::Owned(m.into_owned())),
2906 creator_code: self.creator_code.map(|m| Cow::Owned(m.into_owned())),
2907 strength: self.strength,
2908 exts: self
2909 .exts
2910 .into_iter()
2911 .map(|e| Cow::Owned(e.into_owned()))
2912 .collect(),
2913 is_default: self.is_default,
2914 }
2915 }
2916
2917 #[inline(always)]
2923 pub fn message(&self) -> String {
2924 let mut out = String::new();
2925 for (i, m) in self.message.iter().enumerate() {
2926 if let Some(s) = m.strip_prefix(r#"\b"#) {
2927 out.push_str(s);
2928 } else {
2929 if i > 0 {
2931 out.push(' ');
2932 }
2933 out.push_str(m);
2934 }
2935 }
2936 out
2937 }
2938
2939 #[inline]
2950 pub fn message_parts(&self) -> impl Iterator<Item = &str> {
2951 self.message.iter().map(|p| p.as_ref())
2952 }
2953
2954 #[inline(always)]
2955 fn update_strength(&mut self, value: u64) {
2956 self.strength = self.strength.saturating_add(value);
2957 debug!("updated strength = {:?}", self.strength)
2958 }
2959
2960 #[inline(always)]
2966 pub fn mime_type(&self) -> &str {
2967 self.mime_type.as_deref().unwrap_or(match self.stream_kind {
2968 Some(StreamKind::Text(_)) => DEFAULT_TEXT_MIMETYPE,
2969 Some(StreamKind::Binary) | None => DEFAULT_BIN_MIMETYPE,
2970 })
2971 }
2972
2973 #[inline(always)]
2974 fn push_message<'a: 'm>(&mut self, msg: Cow<'a, str>) {
2975 if !msg.is_empty() {
2976 debug!("pushing message: msg={msg} len={}", msg.len());
2977 self.message.push(msg);
2978 }
2979 }
2980
2981 #[inline(always)]
2982 fn set_mime_type<'a: 'm>(&mut self, mime: Cow<'a, str>) {
2983 if self.mime_type.is_none() {
2984 debug!("insert mime: {:?}", mime);
2985 self.mime_type = Some(mime)
2986 }
2987 }
2988
2989 #[inline(always)]
2990 fn set_creator_code<'a: 'm>(&mut self, apple_ty: Cow<'a, str>) {
2991 if self.creator_code.is_none() {
2992 debug!("insert apple type: {apple_ty:?}");
2993 self.creator_code = Some(apple_ty)
2994 }
2995 }
2996
2997 #[inline(always)]
2998 fn insert_extensions<'a: 'm, I: Iterator<Item = &'a str>>(&mut self, exts: I) {
2999 if self.exts.is_empty() {
3000 self.exts.extend(exts.filter_map(|e| {
3001 if e.is_empty() {
3002 None
3003 } else {
3004 Some(Cow::Borrowed(e))
3005 }
3006 }));
3007 }
3008 }
3009
3010 #[inline(always)]
3018 pub fn strength(&self) -> u64 {
3019 self.strength
3020 }
3021
3022 #[inline(always)]
3028 pub fn source(&self) -> Option<&str> {
3029 self.source.as_deref()
3030 }
3031
3032 #[inline(always)]
3038 pub fn creator_code(&self) -> Option<&str> {
3039 self.creator_code.as_deref()
3040 }
3041
3042 #[inline(always)]
3048 pub fn extensions(&self) -> &HashSet<Cow<'m, str>> {
3049 &self.exts
3050 }
3051
3052 #[inline(always)]
3058 pub fn is_default(&self) -> bool {
3059 self.is_default
3060 }
3061}
3062
3063#[derive(Debug, Default, Clone, Serialize, Deserialize)]
/// Database of magic rules used to identify the content of a stream.
///
/// Field order matters for the serialized form; do not reorder fields
/// without considering previously serialized databases.
pub struct MagicDb {
    // Identifier that will be assigned to the next rule being loaded.
    rule_id: usize,
    // Every loaded rule; re-sorted each time finalization is attempted.
    rules: Vec<MagicRule>,
    // Dependency rules looked up by name while finalizing `rules`.
    dependencies: HashMap<String, DependencyRule>,
    // Number of rules in `rules` that were successfully finalized.
    finalized: usize,
}
3071
3072#[inline(always)]
/// Heuristically decides whether `bytes` looks like text in some unknown
/// encoding.
///
/// Returns `false` for an empty slice and as soon as a NUL byte is met.
/// Otherwise the slice is considered text when more than 85% of its bytes
/// are printable ASCII (including tab, line feed and carriage return) and
/// fewer than 20% are anything else (other control characters, `0x7F`
/// and bytes `>= 0x80`).
fn is_likely_text(bytes: &[u8]) -> bool {
    if bytes.is_empty() {
        return false;
    }

    let mut printable = 0usize;
    let mut other = 0usize;

    for &byte in bytes {
        match byte {
            // a NUL byte is taken as proof of binary content
            0x00 => return false,
            // tab, line feed, carriage return and printable ASCII
            0x09 | 0x0A | 0x0D | 0x20..=0x7E => printable += 1,
            // remaining control characters and non-ASCII bytes
            _ => other += 1,
        }
    }

    let total = bytes.len() as f64;
    let printable_ratio = printable as f64 / total;
    let other_ratio = other as f64 / total;

    printable_ratio > 0.85 && other_ratio < 0.20
}
3114
3115#[inline(always)]
3116fn guess_stream_kind<S: AsRef<[u8]>>(stream: S) -> StreamKind {
3117 let buf = stream.as_ref();
3118
3119 match run_utf8_validation(buf) {
3120 Ok(is_ascii) => {
3121 if is_ascii {
3122 StreamKind::Text(TextEncoding::Ascii)
3123 } else {
3124 StreamKind::Text(TextEncoding::Utf8)
3125 }
3126 }
3127 Err(e) => {
3128 if is_likely_text(&buf[e.valid_up_to..]) {
3129 StreamKind::Text(TextEncoding::Unknown)
3130 } else {
3131 StreamKind::Binary
3132 }
3133 }
3134 }
3135}
3136
3137impl MagicDb {
3138 pub fn optimal_lazy_cache<R: Read + Seek>(f: R) -> Result<LazyCache<R>, io::Error> {
3141 Ok(LazyCache::<R>::from_read_seek(f)
3142 .and_then(|lc| lc.with_hot_cache(2 * FILE_BYTES_MAX))?)
3143 .map(|lc| lc.with_warm_cache(100 << 20))
3144 }
3145
3146 pub fn new() -> Self {
3152 Self::default()
3153 }
3154
3155 #[inline(always)]
3156 fn next_rule_id(&mut self) -> usize {
3157 let t = self.rule_id;
3158 self.rule_id += 1;
3159 t
3160 }
3161
3162 #[inline(always)]
3163 fn try_json<R: Read + Seek>(
3164 haystack: &mut LazyCache<R>,
3165 stream_kind: StreamKind,
3166 magic: &mut Magic,
3167 ) -> Result<bool, Error> {
3168 if matches!(stream_kind, StreamKind::Binary) {
3170 return Ok(false);
3171 }
3172
3173 let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?.trim_ascii();
3174
3175 let Some((start, end)) = find_json_boundaries(buf) else {
3176 return Ok(false);
3177 };
3178
3179 for c in buf[0..start].iter() {
3182 if !c.is_ascii_whitespace() {
3183 return Ok(false);
3184 }
3185 }
3186
3187 let mut is_ndjson = false;
3188
3189 trace!("maybe a json document");
3190 let ok = serde_json::from_slice::<serde_json::Value>(&buf[start..=end]).is_ok();
3191 if !ok {
3192 return Ok(false);
3193 }
3194
3195 if end + 1 < buf.len() {
3197 let buf = &buf[end + 1..];
3199 if let Some((second_start, second_end)) = find_json_boundaries(buf) {
3200 if memchr(b'\n', &buf[..second_start]).is_some() {
3202 trace!("might be ndjson");
3203 is_ndjson = serde_json::from_slice::<serde_json::Value>(
3204 &buf[second_start..=second_end],
3205 )
3206 .is_ok();
3207 }
3208 }
3209 }
3210
3211 if is_ndjson {
3212 magic.push_message(Cow::Borrowed("New Line Delimited"));
3213 magic.set_mime_type(Cow::Borrowed("application/x-ndjson"));
3214 magic.insert_extensions(["ndjson", "jsonl"].into_iter());
3215 } else {
3216 magic.set_mime_type(Cow::Borrowed("application/json"));
3217 magic.insert_extensions(["json"].into_iter());
3218 }
3219
3220 magic.push_message(Cow::Borrowed("JSON text data"));
3221 magic.set_source(Some(HARDCODED_SOURCE));
3222 magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3223 Ok(true)
3224 }
3225
3226 #[inline(always)]
3227 fn try_csv<R: Read + Seek>(
3228 haystack: &mut LazyCache<R>,
3229 stream_kind: StreamKind,
3230 magic: &mut Magic,
3231 ) -> Result<bool, Error> {
3232 let StreamKind::Text(enc) = stream_kind else {
3234 return Ok(false);
3235 };
3236
3237 let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3238 let mut reader = csv::Reader::from_reader(io::Cursor::new(buf));
3239 let mut records = reader.records();
3240
3241 let Some(Ok(first)) = records.next() else {
3242 return Ok(false);
3243 };
3244
3245 if first.len() <= 1 {
3249 return Ok(false);
3250 }
3251
3252 let mut n = 1;
3254 for i in records.take(9) {
3255 if let Ok(rec) = i {
3256 if first.len() != rec.len() {
3257 return Ok(false);
3258 }
3259 } else {
3260 return Ok(false);
3261 }
3262 n += 1;
3263 }
3264
3265 if n != 10 {
3267 return Ok(false);
3268 }
3269
3270 magic.set_mime_type(Cow::Borrowed("text/csv"));
3271 magic.push_message(Cow::Borrowed("CSV"));
3272 magic.push_message(Cow::Borrowed(enc.as_magic_str()));
3273 magic.push_message(Cow::Borrowed("text"));
3274 magic.insert_extensions(["csv"].into_iter());
3275 magic.set_source(Some(HARDCODED_SOURCE));
3276 magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3277 Ok(true)
3278 }
3279
3280 #[inline(always)]
3281 fn try_tar<R: Read + Seek>(
3282 haystack: &mut LazyCache<R>,
3283 stream_kind: StreamKind,
3284 magic: &mut Magic,
3285 ) -> Result<bool, Error> {
3286 if !matches!(stream_kind, StreamKind::Binary) {
3288 return Ok(false);
3289 }
3290
3291 let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3292 let mut ar = Archive::new(io::Cursor::new(buf));
3293
3294 let Ok(mut entries) = ar.entries() else {
3295 return Ok(false);
3296 };
3297
3298 let Some(Ok(first)) = entries.next() else {
3299 return Ok(false);
3300 };
3301
3302 let header = first.header();
3303
3304 if header.as_ustar().is_some() {
3305 magic.push_message(Cow::Borrowed("POSIX tar archive"));
3306 } else if header.as_gnu().is_some() {
3307 magic.push_message(Cow::Borrowed("POSIX tar archive (GNU)"));
3308 } else {
3309 magic.push_message(Cow::Borrowed("tar archive"));
3310 }
3311
3312 magic.set_mime_type(Cow::Borrowed("application/x-tar"));
3313 magic.set_source(Some(HARDCODED_SOURCE));
3314 magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3315 magic.insert_extensions(["tar"].into_iter());
3316 Ok(true)
3317 }
3318
3319 #[inline(always)]
3320 fn try_hard_magic<R: Read + Seek>(
3321 haystack: &mut LazyCache<R>,
3322 stream_kind: StreamKind,
3323 magic: &mut Magic,
3324 ) -> Result<bool, Error> {
3325 Ok(Self::try_json(haystack, stream_kind, magic)?
3326 || Self::try_csv(haystack, stream_kind, magic)?
3327 || Self::try_tar(haystack, stream_kind, magic)?)
3328 }
3329
3330 #[inline(always)]
3331 fn magic_default<'m, R: Read + Seek>(
3332 cache: &mut LazyCache<R>,
3333 stream_kind: StreamKind,
3334 magic: &mut Magic<'m>,
3335 ) {
3336 magic.set_source(Some(HARDCODED_SOURCE));
3337 magic.set_stream_kind(stream_kind);
3338 magic.is_default = true;
3339
3340 if cache.data_size() == 0 {
3341 magic.push_message(Cow::Borrowed("empty"));
3342 magic.set_mime_type(Cow::Borrowed(DEFAULT_BIN_MIMETYPE));
3343 }
3344
3345 match stream_kind {
3346 StreamKind::Binary => {
3347 magic.push_message(Cow::Borrowed("data"));
3348 }
3349 StreamKind::Text(e) => {
3350 magic.push_message(Cow::Borrowed(e.as_magic_str()));
3351 magic.push_message(Cow::Borrowed("text"));
3352 }
3353 }
3354 }
3355
3356 fn load_rules_no_prepare(&mut self, rules: Vec<MagicRule>) {
3357 for rule in rules.into_iter() {
3358 let mut rule = rule;
3359 rule.set_id(self.next_rule_id());
3360
3361 self.rules.push(rule);
3362 }
3363 }
3364
3365 pub fn load(&mut self, ms: MagicSource) -> &mut Self {
3371 self.load_rules_no_prepare(ms.rules);
3372 self.dependencies.extend(ms.dependencies);
3373 self.try_finalize();
3374 self
3375 }
3376
3377 pub fn load_bulk<I: Iterator<Item = MagicSource>>(&mut self, it: I) -> &mut Self {
3382 for ms in it {
3383 self.load_rules_no_prepare(ms.rules);
3384 self.dependencies.extend(ms.dependencies);
3385 }
3386 self.try_finalize();
3387 self
3388 }
3389
3390 pub fn rules(&self) -> &[MagicRule] {
3396 &self.rules
3397 }
3398
3399 #[inline]
    /// Returns the first magic matching `haystack`, given an already
    /// determined `stream_kind`.
    ///
    /// Evaluation order:
    /// 1. the hardcoded detectors;
    /// 2. when `extension` is provided and non-empty, the rules whose
    ///    extension list contains its lowercase form;
    /// 3. every rule not already tried in step 2;
    /// 4. the default (fallback) magic.
    ///
    /// # Errors
    ///
    /// Propagates any error raised by the hardcoded detectors or by a rule
    /// evaluation.
    fn first_magic_with_stream_kind<R: Read + Seek>(
        &self,
        haystack: &mut LazyCache<R>,
        stream_kind: StreamKind,
        extension: Option<&str>,
    ) -> Result<Magic<'_>, Error> {
        let mut magic = Magic::default();

        // hardcoded detectors take precedence over any loaded rule
        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
            return Ok(magic);
        }

        // marked[id] is set for rules already tried during the extension
        // pass so that they are not evaluated a second time below
        let mut marked = vec![false; self.rules.len()];

        // Evaluates one rule: returns from the enclosing function as soon as
        // the rule produced a message, otherwise resets `magic` so the next
        // rule starts from a clean state.
        macro_rules! do_magic {
            ($rule: expr) => {{
                $rule.magic_entrypoint(&mut magic, stream_kind, haystack, &self, false, 0)?;

                if !magic.message.is_empty() {
                    magic.set_stream_kind(stream_kind);
                    magic.set_source($rule.source.as_deref());
                    return Ok(magic);
                }

                magic.reset();
            }};
        }

        // first pass: rules advertising the (lowercased) extension
        if let Some(ext) = extension.map(|e| e.to_lowercase())
            && !ext.is_empty()
        {
            for rule in self.rules.iter().filter(|r| r.extensions.contains(&ext)) {
                do_magic!(rule);
                // only reached when the rule did not match
                if let Some(f) = marked.get_mut(rule.id) {
                    *f = true
                }
            }
        }

        // second pass: every rule not tried yet; a rule whose id is out of
        // the bounds of `marked` is treated as not tried
        for rule in self
            .rules
            .iter()
            .filter(|r| !*marked.get(r.id).unwrap_or(&false))
        {
            do_magic!(rule)
        }

        // nothing matched: fall back to the generic description
        Self::magic_default(haystack, stream_kind, &mut magic);

        Ok(magic)
    }
3453
3454 pub fn first_magic<R: Read + Seek>(
3477 &self,
3478 r: &mut R,
3479 extension: Option<&str>,
3480 ) -> Result<Magic<'_>, Error> {
3481 let mut cache = Self::optimal_lazy_cache(r)?;
3482 let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3483 self.first_magic_with_stream_kind(&mut cache, stream_kind, extension)
3484 }
3485
3486 pub fn first_magic_with_lazy_cache<R: Read + Seek>(
3515 &self,
3516 cache: &mut LazyCache<R>,
3517 extension: Option<&str>,
3518 ) -> Result<Magic<'_>, Error> {
3519 let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3520 self.first_magic_with_stream_kind(cache, stream_kind, extension)
3521 }
3522
3523 #[inline(always)]
3524 fn all_magics_sort_with_stream_kind<R: Read + Seek>(
3525 &self,
3526 haystack: &mut LazyCache<R>,
3527 stream_kind: StreamKind,
3528 ) -> Result<Vec<Magic<'_>>, Error> {
3529 let mut out = Vec::new();
3530
3531 let mut magic = Magic::default();
3532
3533 if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3534 out.push(magic);
3535 magic = Magic::default();
3536 }
3537
3538 for rule in self.rules.iter() {
3539 rule.magic_entrypoint(&mut magic, stream_kind, haystack, self, false, 0)?;
3540
3541 if !magic.message.is_empty() {
3543 magic.set_stream_kind(stream_kind);
3544 magic.set_source(rule.source.as_deref());
3545 out.push(magic);
3546 magic = Magic::default();
3547 }
3548
3549 magic.reset();
3550 }
3551
3552 Self::magic_default(haystack, stream_kind, &mut magic);
3553 out.push(magic);
3554
3555 out.sort_by_key(|b| std::cmp::Reverse(b.strength()));
3556
3557 Ok(out)
3558 }
3559
3560 pub fn all_magics<R: Read + Seek>(&self, r: &mut R) -> Result<Vec<Magic<'_>>, Error> {
3570 let mut cache = Self::optimal_lazy_cache(r)?;
3571 let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3572 self.all_magics_sort_with_stream_kind(&mut cache, stream_kind)
3573 }
3574
3575 pub fn all_magics_with_lazy_cache<R: Read + Seek>(
3591 &self,
3592 cache: &mut LazyCache<R>,
3593 ) -> Result<Vec<Magic<'_>>, Error> {
3594 let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3595 self.all_magics_sort_with_stream_kind(cache, stream_kind)
3596 }
3597
3598 #[inline(always)]
3599 fn best_magic_with_stream_kind<R: Read + Seek>(
3600 &self,
3601 haystack: &mut LazyCache<R>,
3602 stream_kind: StreamKind,
3603 ) -> Result<Magic<'_>, Error> {
3604 let magics = self.all_magics_sort_with_stream_kind(haystack, stream_kind)?;
3605
3606 Ok(magics.into_iter().next().unwrap_or_else(|| {
3609 let mut magic = Magic::default();
3610 Self::magic_default(haystack, stream_kind, &mut magic);
3611 magic
3612 }))
3613 }
3614
3615 pub fn best_magic<R: Read + Seek>(&self, r: &mut R) -> Result<Magic<'_>, Error> {
3625 let mut cache = Self::optimal_lazy_cache(r)?;
3626 let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3627 self.best_magic_with_stream_kind(&mut cache, stream_kind)
3628 }
3629
3630 pub fn best_magic_with_lazy_cache<R: Read + Seek>(
3646 &self,
3647 cache: &mut LazyCache<R>,
3648 ) -> Result<Magic<'_>, Error> {
3649 let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3650 self.best_magic_with_stream_kind(cache, stream_kind)
3651 }
3652
3653 pub fn serialize<W: Write>(self, w: &mut W) -> Result<(), Error> {
3659 let mut encoder = GzEncoder::new(w, Compression::best());
3660
3661 bincode::serde::encode_into_std_write(&self, &mut encoder, bincode::config::standard())?;
3662 encoder.finish()?;
3663 Ok(())
3664 }
3665
3666 pub fn deserialize<R: Read>(r: &mut R) -> Result<Self, Error> {
3676 let mut buf = vec![];
3677 let mut gz = GzDecoder::new(r);
3678 gz.read_to_end(&mut buf).map_err(|e| {
3679 bincode::error::DecodeError::OtherString(format!("failed to read: {e}"))
3680 })?;
3681 let (sdb, _): (MagicDb, usize) =
3682 bincode::serde::decode_from_slice(&buf, bincode::config::standard())?;
3683 Ok(sdb)
3684 }
3685
3686 pub fn verify(&mut self) -> Result<(), Error> {
3693 if self.rules.len() == self.finalized {
3694 return Ok(());
3695 }
3696
3697 for r in self.rules.iter_mut().filter(|r| !r.finalized) {
3698 r.try_finalize(&self.dependencies).map_err(|e| {
3700 Error::Verify(
3701 r.source.clone().unwrap_or(String::from("unknown")),
3702 r.line(),
3703 e.into(),
3704 )
3705 })?;
3706 self.finalized += 1;
3707 }
3708
3709 debug_assert!(self.finalized <= self.rules.len());
3710
3711 Ok(())
3712 }
3713
3714 #[inline(always)]
3715 fn try_finalize(&mut self) {
3716 if self.rules.len() == self.finalized {
3717 return;
3718 }
3719
3720 let mut finalized = 0usize;
3721 self.rules.iter_mut().for_each(|r| {
3722 if r.try_finalize(&self.dependencies).is_ok() {
3723 finalized += 1;
3724 }
3725 });
3726
3727 self.finalized = finalized;
3728
3729 debug_assert!(self.finalized <= self.rules.len());
3730
3731 self.rules.sort_by_key(|r| (r.is_text(), -(r.score as i64)));
3733 }
3734}
3735
3736#[cfg(test)]
3737mod tests {
3738 use std::io::Cursor;
3739
3740 use regex::bytes::Regex;
3741
3742 use crate::utils::unix_local_time_to_string;
3743
3744 use super::*;
3745
    // Builds a `LazyCache` over an in-memory `Cursor` wrapping the given
    // literal, panicking if the cache cannot be created.
    macro_rules! lazy_cache {
        ($l: literal) => {
            LazyCache::from_read_seek(Cursor::new($l)).unwrap()
        };
    }
3751
3752 fn first_magic(
3753 rule: &str,
3754 content: &[u8],
3755 stream_kind: StreamKind,
3756 ) -> Result<Magic<'static>, Error> {
3757 let mut md = MagicDb::new();
3758 md.load(
3759 FileMagicParser::parse_str(rule, None)
3760 .inspect_err(|e| eprintln!("{e}"))
3761 .unwrap(),
3762 );
3763 let mut reader = LazyCache::from_read_seek(Cursor::new(content)).unwrap();
3764 let v = md.best_magic_with_stream_kind(&mut reader, stream_kind)?;
3765 Ok(v.into_owned())
3766 }
3767
3768 #[allow(unused_macros)]
    // Debugging aid: installs a `tracing_subscriber` formatter at TRACE
    // level. `try_init` is used so that repeated calls do not panic.
    macro_rules! enable_trace {
        () => {
            tracing_subscriber::fmt()
                .with_max_level(tracing_subscriber::filter::LevelFilter::TRACE)
                .try_init();
        };
    }
3777
    // Parses the given rule literal, printing the error and panicking when
    // parsing fails.
    macro_rules! parse_assert {
        ($rule:literal) => {
            FileMagicParser::parse_str($rule, None)
                .inspect_err(|e| eprintln!("{e}"))
                .unwrap()
        };
    }
3785
    // Evaluates a rule against content treated as a binary stream.
    // Two-argument form: only requires the evaluation to succeed (it does
    // not check which magic was returned).
    // Three-argument form: additionally asserts the resulting message.
    macro_rules! assert_magic_match_bin {
        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Binary).unwrap() }};
        ($rule: literal, $content:literal, $message:expr) => {{
            assert_eq!(
                first_magic($rule, $content, StreamKind::Binary)
                    .unwrap()
                    .message(),
                $message
            );
        }};
    }
3797
    // Evaluates a rule against content treated as a UTF-8 text stream.
    // Two-argument form: only requires the evaluation to succeed (it does
    // not check which magic was returned).
    // Three-argument form: additionally asserts the resulting message.
    macro_rules! assert_magic_match_text {
        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8)).unwrap() }};
        ($rule: literal, $content:literal, $message:expr) => {{
            assert_eq!(
                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
                    .unwrap()
                    .message(),
                $message
            );
        }};
    }
3809
    // Asserts that the rule does NOT match the content, treated as a UTF-8
    // text stream: the best magic must be the default fallback.
    macro_rules! assert_magic_not_match_text {
        ($rule: literal, $content:literal) => {{
            assert!(
                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
                    .unwrap()
                    .is_default()
            );
        }};
    }
3819
    // Asserts that the rule does NOT match the content, treated as a binary
    // stream: the best magic must be the default fallback.
    macro_rules! assert_magic_not_match_bin {
        ($rule: literal, $content:literal) => {{
            assert!(
                first_magic($rule, $content, StreamKind::Binary)
                    .unwrap()
                    .is_default()
            );
        }};
    }
3829
3830 #[test]
3831 fn test_regex() {
3832 assert_magic_match_text!(
3833 r#"
38340 regex/1024 \^#![[:space:]]*/usr/bin/env[[:space:]]+
3835!:mime text/x-shellscript
3836>&0 regex/64 .*($|\\b) %s shell script text executable
3837 "#,
3838 br#"#!/usr/bin/env bash
3839 echo hello world"#,
3840 "bash shell script text executable"
3842 );
3843
3844 let re = Regex::new(r"(?-u)\x42\x82").unwrap();
3845 assert!(re.is_match(b"\x42\x82"));
3846
3847 assert_magic_match_bin!(
3848 r#"0 regex \x42\x82 binary regex match"#,
3849 b"\x00\x00\x00\x00\x00\x00\x42\x82"
3850 );
3851
3852 assert_magic_match_bin!(
3854 r#"
3855 0 regex \x42\x82
3856 >&0 string \xde\xad\xbe\xef it works
3857 "#,
3858 b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
3859 );
3860
3861 assert_magic_match_bin!(
3862 r#"
3863 0 regex/s \x42\x82
3864 >&0 string \x42\x82\xde\xad\xbe\xef it works
3865 "#,
3866 b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
3867 );
3868
3869 assert_magic_match_text!(
3871 r#"
38720 regex/1024 \^HelloWorld$ HelloWorld String"#,
3873 br#"
3874// this is a comment after an empty line
3875HelloWorld
3876 "#
3877 );
3878 }
3879
3880 #[test]
3881 fn test_string_with_mods() {
3882 assert_magic_match_text!(
3883 r#"0 string/w #!\ \ \ /usr/bin/env\ bash BASH
3884 "#,
3885 b"#! /usr/bin/env bash i
3886 echo hello world"
3887 );
3888
3889 assert_magic_match_text!(
3891 r#"0 string/C HelloWorld it works
3892 "#,
3893 b"helloworld"
3894 );
3895
3896 assert_magic_not_match_text!(
3897 r#"0 string/C HelloWorld it works
3898 "#,
3899 b"hELLOwORLD"
3900 );
3901
3902 assert_magic_match_text!(
3904 r#"0 string/c HelloWorld it works
3905 "#,
3906 b"HELLOWORLD"
3907 );
3908
3909 assert_magic_not_match_text!(
3910 r#"0 string/c HelloWorld it works
3911 "#,
3912 b"helloworld"
3913 );
3914
3915 assert_magic_match_text!(
3917 r#"0 string/f #!/usr/bin/env\ bash BASH
3918 "#,
3919 b"#!/usr/bin/env bash"
3920 );
3921
3922 assert_magic_not_match_text!(
3923 r#"0 string/f #!/usr/bin/python PYTHON"#,
3924 b"#!/usr/bin/pythonic"
3925 );
3926
3927 assert_magic_match_text!(
3929 r#"0 string/W #!/usr/bin/env\ python PYTHON"#,
3930 b"#!/usr/bin/env python"
3931 );
3932
3933 assert_magic_not_match_text!(
3934 r#"0 string/W #!/usr/bin/env\ \ python PYTHON"#,
3935 b"#!/usr/bin/env python"
3936 );
3937 }
3938
3939 #[test]
3940 fn test_search_with_mods() {
3941 assert_magic_match_text!(
3942 r#"0 search/1/fwt #!\ /usr/bin/luatex LuaTex script text executable"#,
3943 b"#! /usr/bin/luatex "
3944 );
3945
3946 assert_magic_match_text!(
3948 r#"
3949 0 search/s /usr/bin/env
3950 >&0 string /usr/bin/env it works
3951 "#,
3952 b"#!/usr/bin/env python"
3953 );
3954
3955 assert_magic_not_match_text!(
3956 r#"
3957 0 search /usr/bin/env
3958 >&0 string /usr/bin/env it works
3959 "#,
3960 b"#!/usr/bin/env python"
3961 );
3962 }
3963
3964 #[test]
3965 fn test_pstring() {
3966 assert_magic_match_bin!(r#"0 pstring Toast it works"#, b"\x05Toast");
3967
3968 assert_magic_match_bin!(r#"0 pstring Toast %s"#, b"\x05Toast", "Toast");
3969
3970 assert_magic_not_match_bin!(r#"0 pstring Toast Doesn't work"#, b"\x07Toaster");
3971
3972 assert_magic_match_bin!(r#"0 pstring/H Toast it works"#, b"\x00\x05Toast");
3974
3975 assert_magic_match_bin!(r#"0 pstring/HJ Toast it works"#, b"\x00\x07Toast");
3976
3977 assert_magic_match_bin!(r#"0 pstring/HJ Toast %s"#, b"\x00\x07Toast", "Toast");
3978
3979 assert_magic_match_bin!(r#"0 pstring/h Toast it works"#, b"\x05\x00Toast");
3980
3981 assert_magic_match_bin!(r#"0 pstring/hJ Toast it works"#, b"\x07\x00Toast");
3982
3983 assert_magic_match_bin!(r#"0 pstring/L Toast it works"#, b"\x00\x00\x00\x05Toast");
3984
3985 assert_magic_match_bin!(r#"0 pstring/LJ Toast it works"#, b"\x00\x00\x00\x09Toast");
3986
3987 assert_magic_match_bin!(r#"0 pstring/l Toast it works"#, b"\x05\x00\x00\x00Toast");
3988
3989 assert_magic_match_bin!(r#"0 pstring/lJ Toast it works"#, b"\x09\x00\x00\x00Toast");
3990 }
3991
3992 #[test]
3993 fn test_max_recursion() {
3994 let res = first_magic(
3995 r#"0 indirect x"#,
3996 b"#! /usr/bin/luatex ",
3997 StreamKind::Binary,
3998 );
3999 assert!(res.is_err());
4000 let _ = res.inspect_err(|e| {
4001 assert!(matches!(
4002 e.unwrap_localized(),
4003 Error::MaximumRecursion(MAX_RECURSION)
4004 ))
4005 });
4006 }
4007
4008 #[test]
4009 fn test_string_ops() {
4010 assert_magic_match_text!("0 string/b MZ MZ File", b"MZ\0");
4011 assert_magic_match_text!("0 string !MZ Not MZ File", b"AZ\0");
4012 assert_magic_match_text!("0 string >\0 Any String", b"A\0");
4013 assert_magic_match_text!("0 string >Test Any String", b"Test 1\0");
4014 assert_magic_match_text!("0 string <Test Any String", b"\0");
4015 assert_magic_not_match_text!("0 string >Test Any String", b"\0");
4016 }
4017
4018 #[test]
4019 fn test_lestring16() {
4020 assert_magic_match_bin!(
4021 "0 lestring16 abcd Little-endian UTF-16 string",
4022 b"\x61\x00\x62\x00\x63\x00\x64\x00"
4023 );
4024 assert_magic_match_bin!(
4025 "0 lestring16 x %s",
4026 b"\x61\x00\x62\x00\x63\x00\x64\x00\x00",
4027 "abcd"
4028 );
4029 assert_magic_not_match_bin!(
4030 "0 lestring16 abcd Little-endian UTF-16 string",
4031 b"\x00\x61\x00\x62\x00\x63\x00\x64"
4032 );
4033 assert_magic_match_bin!(
4034 "4 lestring16 abcd Little-endian UTF-16 string",
4035 b"\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64\x00"
4036 );
4037 }
4038
4039 #[test]
4040 fn test_bestring16() {
4041 assert_magic_match_bin!(
4042 "0 bestring16 abcd Big-endian UTF-16 string",
4043 b"\x00\x61\x00\x62\x00\x63\x00\x64"
4044 );
4045 assert_magic_match_bin!(
4046 "0 bestring16 x %s",
4047 b"\x00\x61\x00\x62\x00\x63\x00\x64",
4048 "abcd"
4049 );
4050 assert_magic_not_match_bin!(
4051 "0 bestring16 abcd Big-endian UTF-16 string",
4052 b"\x61\x00\x62\x00\x63\x00\x64\x00"
4053 );
4054 assert_magic_match_bin!(
4055 "4 bestring16 abcd Big-endian UTF-16 string",
4056 b"\x00\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64"
4057 );
4058 }
4059
4060 #[test]
4061 fn test_offset_from_end() {
4062 assert_magic_match_bin!("-1 ubyte 0x42 last byte ok", b"\x00\x00\x42");
4063 assert_magic_match_bin!("-2 ubyte 0x41 last byte ok", b"\x00\x41\x00");
4064 }
4065
4066 #[test]
4067 fn test_relative_offset() {
4068 assert_magic_match_bin!(
4069 "
4070 0 ubyte 0x42
4071 >&0 ubyte 0x00
4072 >>&0 ubyte 0x41 third byte ok
4073 ",
4074 b"\x42\x00\x41\x00"
4075 );
4076 }
4077
4078 #[test]
4079 fn test_indirect_offset() {
4080 assert_magic_match_bin!("(0.l) ubyte 0x42 it works", b"\x04\x00\x00\x00\x42");
4081 assert_magic_match_bin!("(0.l+3) ubyte 0x42 it works", b"\x01\x00\x00\x00\x42");
4083 assert_magic_match_bin!(
4085 "(0.l+(4)) ubyte 0x42 it works",
4086 b"\x04\x00\x00\x00\x04\x00\x00\x00\x42"
4087 );
4088 }
4089
4090 #[test]
4091 fn test_use_with_message() {
4092 assert_magic_match_bin!(
4093 r#"
40940 string MZ
4095>0 use mz first match
4096
40970 name mz then second match
4098>0 string MZ
4099"#,
4100 b"MZ\0",
4101 "first match then second match"
4102 );
4103 }
4104
4105 #[test]
4106 fn test_scalar_transform() {
4107 assert_magic_match_bin!("0 ubyte+1 0x1 add works", b"\x00");
4108 assert_magic_match_bin!("0 ubyte-1 0xfe sub works", b"\xff");
4109 assert_magic_match_bin!("0 ubyte%2 0 mod works", b"\x0a");
4110 assert_magic_match_bin!("0 ubyte&0x0f 0x0f bitand works", b"\xff");
4111 assert_magic_match_bin!("0 ubyte|0x0f 0xff bitor works", b"\xf0");
4112 assert_magic_match_bin!("0 ubyte^0x0f 0xf0 bitxor works", b"\xff");
4113
4114 FileMagicParser::parse_str("0 ubyte%0 mod by zero", None)
4115 .expect_err("expect div by zero error");
4116 FileMagicParser::parse_str("0 ubyte/0 div by zero", None)
4117 .expect_err("expect div by zero error");
4118 }
4119
4120 #[test]
4121 fn test_belong() {
4122 assert_magic_match_bin!("0 belong 0x12345678 Big-endian long", b"\x12\x34\x56\x78");
4124 assert_magic_not_match_bin!("0 belong 0x12345678 Big-endian long", b"\x78\x56\x34\x12");
4126 assert_magic_match_bin!(
4128 "4 belong 0x12345678 Big-endian long",
4129 b"\x00\x00\x00\x00\x12\x34\x56\x78"
4130 );
4131 assert_magic_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x77");
4133 assert_magic_not_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x78");
4134
4135 assert_magic_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x79");
4137 assert_magic_not_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x78");
4138
4139 assert_magic_match_bin!("0 belong &0x5678 Big-endian long", b"\x00\x00\x56\x78");
4141 assert_magic_not_match_bin!("0 belong &0x0000FFFF Big-endian long", b"\x12\x34\x56\x78");
4142
4143 assert_magic_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x00\x56\x78");
4145 assert_magic_not_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x01\x56\x78");
4146
4147 assert_magic_match_bin!("0 belong ~0x12345678 Big-endian long", b"\xed\xcb\xa9\x87");
4149 assert_magic_not_match_bin!("0 belong ~0x12345678 Big-endian long", b"\x12\x34\x56\x78");
4150
4151 assert_magic_match_bin!("0 belong x Big-endian long", b"\x12\x34\x56\x78");
4153 assert_magic_match_bin!("0 belong x Big-endian long", b"\x78\x56\x34\x12");
4154 }
4155
4156 #[test]
4157 fn test_parse_search() {
4158 parse_assert!("0 search test");
4159 parse_assert!("0 search/24/s test");
4160 parse_assert!("0 search/s/24 test");
4161 }
4162
4163 #[test]
4164 fn test_bedate() {
4165 assert_magic_match_bin!(
4166 "0 bedate 946684800 Unix date (Jan 1, 2000)",
4167 b"\x38\x6D\x43\x80"
4168 );
4169 assert_magic_not_match_bin!(
4170 "0 bedate 946684800 Unix date (Jan 1, 2000)",
4171 b"\x00\x00\x00\x00"
4172 );
4173 assert_magic_match_bin!(
4174 "4 bedate 946684800 %s",
4175 b"\x00\x00\x00\x00\x38\x6D\x43\x80",
4176 "2000-01-01 00:00:00"
4177 );
4178 }
4179 #[test]
4180 fn test_beldate() {
4181 assert_magic_match_bin!(
4182 "0 beldate 946684800 Local date (Jan 1, 2000)",
4183 b"\x38\x6D\x43\x80"
4184 );
4185 assert_magic_not_match_bin!(
4186 "0 beldate 946684800 Local date (Jan 1, 2000)",
4187 b"\x00\x00\x00\x00"
4188 );
4189
4190 assert_magic_match_bin!(
4191 "4 beldate 946684800 {}",
4192 b"\x00\x00\x00\x00\x38\x6D\x43\x80",
4193 unix_local_time_to_string(946684800)
4194 );
4195 }
4196
4197 #[test]
4198 fn test_beqdate() {
4199 assert_magic_match_bin!(
4200 "0 beqdate 946684800 Unix date (Jan 1, 2000)",
4201 b"\x00\x00\x00\x00\x38\x6D\x43\x80"
4202 );
4203
4204 assert_magic_not_match_bin!(
4205 "0 beqdate 946684800 Unix date (Jan 1, 2000)",
4206 b"\x00\x00\x00\x00\x00\x00\x00\x00"
4207 );
4208
4209 assert_magic_match_bin!(
4210 "0 beqdate 946684800 %s",
4211 b"\x00\x00\x00\x00\x38\x6D\x43\x80",
4212 "2000-01-01 00:00:00"
4213 );
4214 }
4215
4216 #[test]
4217 fn test_medate() {
4218 assert_magic_match_bin!(
4219 "0 medate 946684800 Unix date (Jan 1, 2000)",
4220 b"\x6D\x38\x80\x43"
4221 );
4222
4223 assert_magic_not_match_bin!(
4224 "0 medate 946684800 Unix date (Jan 1, 2000)",
4225 b"\x00\x00\x00\x00"
4226 );
4227
4228 assert_magic_match_bin!(
4229 "4 medate 946684800 %s",
4230 b"\x00\x00\x00\x00\x6D\x38\x80\x43",
4231 "2000-01-01 00:00:00"
4232 );
4233 }
4234
4235 #[test]
4236 fn test_meldate() {
4237 assert_magic_match_bin!(
4238 "0 meldate 946684800 Local date (Jan 1, 2000)",
4239 b"\x6D\x38\x80\x43"
4240 );
4241 assert_magic_not_match_bin!(
4242 "0 meldate 946684800 Local date (Jan 1, 2000)",
4243 b"\x00\x00\x00\x00"
4244 );
4245
4246 assert_magic_match_bin!(
4247 "4 meldate 946684800 %s",
4248 b"\x00\x00\x00\x00\x6D\x38\x80\x43",
4249 unix_local_time_to_string(946684800)
4250 );
4251 }
4252
4253 #[test]
4254 fn test_date() {
4255 assert_magic_match_bin!(
4256 "0 date 946684800 Local date (Jan 1, 2000)",
4257 b"\x80\x43\x6D\x38"
4258 );
4259 assert_magic_not_match_bin!(
4260 "0 date 946684800 Local date (Jan 1, 2000)",
4261 b"\x00\x00\x00\x00"
4262 );
4263 assert_magic_match_bin!(
4264 "4 date 946684800 {}",
4265 b"\x00\x00\x00\x00\x80\x43\x6D\x38",
4266 "2000-01-01 00:00:00"
4267 );
4268 }
4269
4270 #[test]
4271 fn test_leldate() {
4272 assert_magic_match_bin!(
4273 "0 leldate 946684800 Local date (Jan 1, 2000)",
4274 b"\x80\x43\x6D\x38"
4275 );
4276 assert_magic_not_match_bin!(
4277 "0 leldate 946684800 Local date (Jan 1, 2000)",
4278 b"\x00\x00\x00\x00"
4279 );
4280 assert_magic_match_bin!(
4281 "4 leldate 946684800 {}",
4282 b"\x00\x00\x00\x00\x80\x43\x6D\x38",
4283 unix_local_time_to_string(946684800)
4284 );
4285 }
4286
4287 #[test]
4288 fn test_leqdate() {
4289 assert_magic_match_bin!(
4290 "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
4291 b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
4292 );
4293
4294 assert_magic_not_match_bin!(
4295 "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
4296 b"\x00\x00\x00\x00\x00\x00\x00\x00"
4297 );
4298 assert_magic_match_bin!(
4299 "8 leqdate 1577836800 %s",
4300 b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
4301 "2020-01-01 00:00:00"
4302 );
4303 }
4304
4305 #[test]
4306 fn test_leqldate() {
4307 assert_magic_match_bin!(
4308 "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
4309 b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
4310 );
4311
4312 assert_magic_not_match_bin!(
4313 "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
4314 b"\x00\x00\x00\x00\x00\x00\x00\x00"
4315 );
4316 assert_magic_match_bin!(
4317 "8 leqldate 1577836800 %s",
4318 b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
4319 unix_local_time_to_string(1577836800)
4320 );
4321 }
4322
4323 #[test]
4324 fn test_melong() {
4325 assert_magic_match_bin!(
4327 "0 melong =0x12345678 Middle-endian long",
4328 b"\x34\x12\x78\x56"
4329 );
4330 assert_magic_not_match_bin!(
4331 "0 melong =0x12345678 Middle-endian long",
4332 b"\x00\x00\x00\x00"
4333 );
4334
4335 assert_magic_match_bin!(
4337 "0 melong <0x12345678 Middle-endian long",
4338 b"\x34\x12\x78\x55"
4339 ); assert_magic_not_match_bin!(
4341 "0 melong <0x12345678 Middle-endian long",
4342 b"\x34\x12\x78\x56"
4343 ); assert_magic_match_bin!(
4347 "0 melong >0x12345678 Middle-endian long",
4348 b"\x34\x12\x78\x57"
4349 ); assert_magic_not_match_bin!(
4351 "0 melong >0x12345678 Middle-endian long",
4352 b"\x34\x12\x78\x56"
4353 ); assert_magic_match_bin!("0 melong &0x5678 Middle-endian long", b"\xab\xcd\x78\x56"); assert_magic_not_match_bin!(
4358 "0 melong &0x0000FFFF Middle-endian long",
4359 b"\x34\x12\x78\x56"
4360 ); assert_magic_match_bin!(
4364 "0 melong ^0xFFFF0000 Middle-endian long",
4365 b"\x00\x00\x78\x56"
4366 ); assert_magic_not_match_bin!(
4368 "0 melong ^0xFFFF0000 Middle-endian long",
4369 b"\x00\x01\x78\x56"
4370 ); assert_magic_match_bin!(
4374 "0 melong ~0x12345678 Middle-endian long",
4375 b"\xCB\xED\x87\xA9"
4376 );
4377 assert_magic_not_match_bin!(
4378 "0 melong ~0x12345678 Middle-endian long",
4379 b"\x34\x12\x78\x56"
4380 ); assert_magic_match_bin!("0 melong x Middle-endian long", b"\x34\x12\x78\x56");
4384 assert_magic_match_bin!("0 melong x Middle-endian long", b"\x00\x00\x00\x00");
4385 }
4386
#[test]
fn test_uquad() {
    // The fixtures spell 0x123456789ABCDEF0 least significant byte first; the comparison
    // cases only alter the last (most significant) byte.

    // `=`: exact equality.
    assert_magic_match_bin!(
        "0 uquad =0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );
    assert_magic_not_match_bin!(
        "0 uquad =0x123456789ABCDEF0 Unsigned quad",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );

    // `<`: top byte 0x11 is below the reference, the reference itself is not.
    assert_magic_match_bin!(
        "0 uquad <0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x11"
    );
    assert_magic_not_match_bin!(
        "0 uquad <0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `>`: top byte 0x13 is above the reference, the reference itself is not.
    assert_magic_match_bin!(
        "0 uquad >0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x13"
    );
    assert_magic_not_match_bin!(
        "0 uquad >0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `&`: the low byte 0xF0 contains every bit of 0xF0 but not every bit of 0xFF.
    assert_magic_match_bin!(
        "0 uquad &0xF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );
    assert_magic_not_match_bin!(
        "0 uquad &0xFF Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `^`: an all-zero value has none of the mask bits set; the reference value has some.
    assert_magic_match_bin!(
        "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );
    assert_magic_not_match_bin!(
        "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `~`: the matching bytes are the bitwise complement of the reference bytes.
    assert_magic_match_bin!(
        "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
        b"\x0F\x21\x43\x65\x87\xA9\xCB\xED"
    );
    assert_magic_not_match_bin!(
        "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `x`: any value is accepted; the first case also checks the rendered message.
    assert_magic_match_bin!(
        "0 uquad x {:#x}",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12",
        "0x123456789abcdef0"
    );
    assert_magic_match_bin!(
        "0 uquad x Unsigned quad",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );
}
4460
#[test]
fn test_guid() {
    // The 16 fixture bytes, read in order, spell the textual GUID used by the rule.
    assert_magic_match_bin!(
        "0 guid EC959539-6786-2D4E-8FDB-98814CE76C1E It works",
        b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E"
    );

    // Unrelated bytes (00..0F) must not satisfy a GUID rule.
    assert_magic_not_match_bin!(
        "0 guid 399595EC-8667-4E2D-8FDB-98814CE76C1E It works",
        b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
    );

    // With the `x` test the GUID is accepted as-is and `%s` renders it as upper-case text.
    assert_magic_match_bin!(
        "0 guid x %s",
        b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E",
        "EC959539-6786-2D4E-8FDB-98814CE76C1E"
    );
}
4479
#[test]
fn test_ubeqdate() {
    // 00 00 00 00 61 56 4F 80 is 1633046400 as a big-endian 64-bit integer.
    assert_magic_match_bin!(
        "0 ubeqdate 1633046400 It works",
        b"\x00\x00\x00\x00\x61\x56\x4f\x80"
    );

    // The same timestamp rendered through `%s`.
    assert_magic_match_bin!(
        "0 ubeqdate x %s",
        b"\x00\x00\x00\x00\x61\x56\x4f\x80",
        "2021-10-01 00:00:00"
    );

    // An all-zero timestamp differs from the expected value.
    assert_magic_not_match_bin!(
        "0 ubeqdate 1633046400 It should not work",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );
}
4498
#[test]
fn test_ldate() {
    // 60 D4 C8 61 read least significant byte first is 0x61C8D460 == 1640551520.
    assert_magic_match_bin!(
        "0 ldate 1640551520 It works",
        b"\x60\xd4\xC8\x61"
    );

    // A zero timestamp must not match a non-zero expected value.
    assert_magic_not_match_bin!(
        "0 ldate 1633046400 It should not work",
        b"\x00\x00\x00\x00"
    );

    // The rendered text is compared against the helper's formatting of the same timestamp,
    // so the expectation does not hard-code a time zone.
    assert_magic_match_bin!(
        "0 ldate x %s",
        b"\x60\xd4\xC8\x61",
        unix_local_time_to_string(1640551520)
    );
}
4511
#[test]
fn test_scalar_with_transform() {
    // The single input byte is 0x14 == 20.

    // Division: 20 / 10 == 2, both when compared against 2 and when merely printed.
    assert_magic_match_bin!("0 ubyte/10 2 {}", b"\x14", "2");
    assert_magic_match_bin!("0 ubyte/10 x {}", b"\x14", "2");

    // Remainder: 20 % 10 == 0.
    assert_magic_match_bin!("0 ubyte%10 x {}", b"\x14", "0");
}
4518
#[test]
fn test_float_with_transform() {
    // 00 00 A0 41 is the little-endian encoding of the f32 bit pattern 0x41A00000 == 20.0.

    // Division: 20.0 / 10 is expected to render as "2".
    assert_magic_match_bin!("0 lefloat/10 2 {}", b"\x00\x00\xa0\x41", "2");
    assert_magic_match_bin!("0 lefloat/10 x {}", b"\x00\x00\xa0\x41", "2");

    // Remainder: 20.0 % 10 is expected to render as "0".
    assert_magic_match_bin!("0 lefloat%10 x {}", b"\x00\x00\xa0\x41", "0");
}
4525
#[test]
fn test_read_octal() {
    // Shorthand for "reading this text as octal yields this result". The input is forwarded
    // as a raw token so `lazy_cache!` receives exactly the literal written at the call site.
    macro_rules! octal_is {
        ($text:tt, $expected:expr) => {
            assert_eq!(read_octal_u64(&mut lazy_cache!($text)), $expected)
        };
    }

    // Well-formed values with a leading zero.
    octal_is!("0", Some(0));
    octal_is!("00", Some(0));
    octal_is!("01", Some(1));
    octal_is!("07", Some(7));
    octal_is!("010", Some(8));
    octal_is!("0123", Some(83));
    octal_is!("0755", Some(493));

    // Trailing non-digit characters: the value read before them is returned.
    octal_is!("0ABC", Some(0));
    octal_is!("01ABC", Some(1));
    octal_is!("0755ABC", Some(493));
    octal_is!("0123ABC", Some(83));

    // `8` is not an octal digit, so it ends the number just like a letter does.
    octal_is!("08", Some(0));
    octal_is!("01238", Some(83));

    // Digits without a leading zero yield nothing.
    octal_is!("123", None);
    octal_is!("755", None);

    // Empty input yields nothing.
    octal_is!("", None);

    // Input that does not start with `0` yields nothing, whatever follows.
    octal_is!("ABC", None);
    octal_is!("8ABC", None);

    // A longer value: octal 1777777777 == 268435455.
    octal_is!("01777777777", Some(268435455));
}
4564
#[test]
fn test_offset_bug_1() {
    // Fixture layout: [0]=0x00, [1..=4]="TEST", [5]=0x06, [6..=10]="twice", [11]=0x00,
    // [12]=0x06.
    //
    // The rule chains two `use` calls, and the expected message is the concatenation of the
    // messages of all three rules. The rule text is kept verbatim, including its
    // whitespace-only last line.
    assert_magic_match_bin!(
        r"
1 string TEST Bread is
# offset computation is relative to
# rule start
>(5.b) use toasted

0 name toasted
>0 string twice Toasted
>>0 use toasted_twice

0 name toasted_twice
>(6.b) string x %s
 ",
        b"\x00TEST\x06twice\x00\x06",
        "Bread is Toasted twice"
    );
}
4587
#[test]
fn test_offset_bug_2() {
    // Same fixture and same named rules as `test_offset_bug_1`, but the entry rule is
    // anchored with a negative offset (`-12`) and its indirect offset is `(4.b)`.
    //
    // Fixture layout: [0]=0x00, [1..=4]="TEST", [5]=0x06, [6..=10]="twice", [11]=0x00,
    // [12]=0x06.
    //
    // The last rule must use `%s`: that conversion renders the matched string, which is
    // what supplies the trailing "twice" in the expected message. The rule text is
    // otherwise kept verbatim, including its whitespace-only last line.
    assert_magic_match_bin!(
        r"
-12 string TEST Bread is
>(4.b) use toasted

0 name toasted
>0 string twice Toasted
>>0 use toasted_twice

0 name toasted_twice
>(6.b) string x %s
 ",
        b"\x00TEST\x06twice\x00\x06",
        "Bread is Toasted twice"
    );
}
4613
#[test]
fn test_offset_bug_3() {
    // Fixture layout: [0]=0x00, [1..=4]="TEST", [5]=0x06, [6..=10]="twice", [11]=0x00,
    // [12]=0x08.
    //
    // The entry rule hands over through `indirect/r` instead of `use`; the second top-level
    // rule then calls the named rule. The rule text is kept verbatim, including its
    // whitespace-only last line.
    assert_magic_match_bin!(
        r"
1 string TEST Bread is
>(5.b) indirect/r x

0 string twice Toasted
>0 use toasted_twice

0 name toasted_twice
>0 string x %s
 ",
        b"\x00TEST\x06twice\x00\x08",
        "Bread is Toasted twice"
    );
}
4633
#[test]
fn test_offset_bug_4() {
    // Fixture layout: [0]=0x00, [1..=5]="Bread", [6]=0x06, [7..=16]="is Toasted",
    // [17]=0x0c, [18..=22]="twice", [23]=0x00.
    //
    // Combines `indirect/r` with a later `use`; every stage prints the string it matched, so
    // the message is assembled from three `%s` conversions. The rule text is kept verbatim,
    // including its whitespace-only last line.
    assert_magic_match_bin!(
        r"
1 string Bread %s
>(6.b) indirect/r x

# this one uses a based offset
# computed at indirection
1 string is\ Toasted %s
>(11.b) use toasted_twice

# this one is using a new base
# offset being previous base
# offset + offset of use
0 name toasted_twice
>0 string x %s
 ",
        b"\x00Bread\x06is Toasted\x0ctwice\x00",
        "Bread is Toasted twice"
    );
}
4658
#[test]
fn test_offset_bug_5() {
    // Fixture layout: [0]=0x00, [1..=4]="TEST", [5]=0x06, [6..=10]="twice", [11]=0x00,
    // [12]=0x08.
    //
    // Like `test_offset_bug_3`, except the named rule first matches "twice" silently and
    // then uses a relative offset (`&1`) to test for the 0x08 byte, which carries the final
    // "twice" message. The rule text is kept verbatim, including its whitespace-only last
    // line.
    assert_magic_match_bin!(
        r"
1 string TEST Bread is
>(5.b) indirect/r x

0 string twice Toasted
>0 use toasted_twice

0 name toasted_twice
>0 string twice
>>&1 byte 0x08 twice
 ",
        b"\x00TEST\x06twice\x00\x08",
        "Bread is Toasted twice"
    );
}
4677
#[test]
fn test_message_parts() {
    // One text rule whose message is the single word "PYTHON".
    let rule = r#"0 string/W #!/usr/bin/env\ python PYTHON"#;
    let content = b"#!/usr/bin/env python";

    let magic = first_magic(rule, content, StreamKind::Text(TextEncoding::Ascii)).unwrap();

    // At least one message part must equal "python", ignoring ASCII case.
    let mentions_python = magic
        .message_parts()
        .any(|part| part.eq_ignore_ascii_case("python"));
    assert!(mentions_python)
}
4689
#[test]
fn test_load_bulk() {
    let mut db = MagicDb::new();

    // Three self-contained `search` rules, differing only in their modifiers.
    db.load_bulk(
        vec![
            parse_assert!("0 search test"),
            parse_assert!("0 search/24/s test"),
            parse_assert!("0 search/s/24 test"),
        ]
        .into_iter(),
    );

    // None of the rules refers to another one, so verification must succeed.
    db.verify().unwrap();
}
4703
#[test]
fn test_load_bulk_failure() {
    let mut db = MagicDb::new();

    // The only rule loaded says `use test`, yet nothing in it declares a rule named `test`.
    db.load_bulk(
        vec![parse_assert!(
            r#"
0 search/s/24 test
>0 use test
"#
        )]
        .into_iter(),
    );

    // Verification is expected to report this as `Error::Verify`.
    let outcome = db.verify();
    assert!(matches!(outcome, Err(Error::Verify(..))));
}
4718}