1#![forbid(unsafe_code)]
2#![deny(unused_imports)]
3#![deny(missing_docs)]
4use dyf::{DynDisplay, FormatString, dformat};
146use flagset::{FlagSet, flags};
147use flate2::{Compression, read::GzDecoder, write::GzEncoder};
148use lazy_cache::LazyCache;
149use memchr::memchr;
150use pest::{Span, error::ErrorVariant};
151use regex::bytes::{self};
152use serde::{Deserialize, Serialize};
153use std::{
154 borrow::Cow,
155 cmp::max,
156 collections::{HashMap, HashSet},
157 fmt::{self, Debug, Display},
158 io::{self, Read, Seek, SeekFrom, Write},
159 ops::{Add, BitAnd, BitOr, BitXor, Deref, Div, Mul, Rem, Sub},
160 path::Path,
161};
162use tar::Archive;
163use thiserror::Error;
164use tracing::{Level, debug, enabled, trace};
165
166use crate::{
167 numeric::{Float, FloatDataType, Scalar, ScalarDataType},
168 parser::{FileMagicParser, Rule},
169 utils::{
170 debug_string_from_vec_u8, debug_string_from_vec_u16, decode_id3, find_json_boundaries,
171 run_utf8_validation,
172 },
173};
174
175mod numeric;
176mod parser;
177mod utils;
178
/// Strength value for hardcoded magic.
/// NOTE(review): the consumer of this constant is outside this excerpt — confirm usage.
const HARDCODED_MAGIC_STRENGTH: u64 = 2048;
/// Source label `"hardcoded"` — presumably reported in place of a magic file name; verify against callers.
const HARDCODED_SOURCE: &str = "hardcoded";
/// Recursion depth limit — presumably the bound behind `Error::MaximumRecursion`; TODO confirm.
const MAX_RECURSION: usize = 50;
/// Number of bytes read for a regex test that declares no explicit length.
const FILE_REGEX_MAX: usize = 8192;

/// Maximum number of bytes (7 MiB) read in one go for search tests and for
/// string tests whose modifiers make the consumed length unpredictable.
pub const FILE_BYTES_MAX: usize = 7 * 1024 * 1024;
/// The MIME type string `application/octet-stream` — presumably the fallback for binary content; verify against callers.
pub const DEFAULT_BIN_MIMETYPE: &str = "application/octet-stream";
/// The MIME type string `text/plain` — presumably the fallback for text content; verify against callers.
pub const DEFAULT_TEXT_MIMETYPE: &str = "text/plain";

/// Timestamp layout `YYYY-MM-DD HH:MM:SS` written with strftime-style specifiers.
pub(crate) const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
198
/// Panics with the given `panic!`-style arguments, but only when the crate is
/// compiled with `debug_assertions`; otherwise the invocation does nothing.
macro_rules! debug_panic {
    ($($panic_args:tt)*) => {
        if cfg!(debug_assertions) {
            panic!($($panic_args)*);
        }
    };
}
206
/// Reads exactly `size_of::<$ty>()` bytes from `$r` and evaluates to them as a
/// byte array. A `read_exact` failure is propagated with `?`, so the macro is
/// only usable where `?` on that error type is valid.
macro_rules! read {
    ($r: expr, $ty: ty) => {{
        let mut raw = [0u8; std::mem::size_of::<$ty>()];
        $r.read_exact(&mut raw)?;
        raw
    }};
}

/// Reads a `$ty` from `$r`, decoding the bytes as little-endian.
macro_rules! read_le {
    ($r:expr, $ty: ty ) => {{
        let raw = read!($r, $ty);
        <$ty>::from_le_bytes(raw)
    }};
}

/// Reads a `$ty` from `$r`, decoding the bytes as big-endian.
macro_rules! read_be {
    ($r:expr, $ty: ty ) => {{
        let raw = read!($r, $ty);
        <$ty>::from_be_bytes(raw)
    }};
}

/// Reads a 32-bit value stored as two consecutive little-endian 16-bit words,
/// the first word being the most significant one.
macro_rules! read_me {
    ($r: expr) => {{
        // Order matters: the high word is read before the low word.
        let high = read_le!($r, u16) as i32;
        let low = read_le!($r, u16) as i32;
        (high << 16) | low
    }};
}
226
227#[inline(always)]
228fn read_octal_u64<R: Read + Seek>(haystack: &mut LazyCache<R>) -> Option<u64> {
229 let s = haystack
230 .read_while_or_limit(|b| matches!(b, b'0'..=b'7'), 22)
231 .map(|buf| str::from_utf8(buf))
232 .ok()?
233 .ok()?;
234
235 if !s.starts_with("0") {
236 return None;
237 }
238
239 u64::from_str_radix(s, 8).ok()
240}
241
/// Errors returned by this crate.
#[derive(Debug, Error)]
pub enum Error {
    /// Free-form error message.
    #[error("{0}")]
    Msg(String),

    /// An error attached to a source name and a line number.
    /// NOTE(review): rendered exactly like [`Error::Localized`]; presumably
    /// raised during rule verification — confirm against callers.
    #[error("source={0} line={1} error={2}")]
    Verify(String, usize, Box<Error>),

    /// An error attached to the source name and the line number it relates to.
    #[error("source={0} line={1} error={2}")]
    Localized(String, usize, Box<Error>),

    /// A rule referenced by name could not be found.
    #[error("missing rule: {0}")]
    MissingRule(String),

    /// The maximum recursion level has been reached.
    #[error("maximum recursion reached: {0}")]
    MaximumRecursion(usize),

    /// Wrapper around [`io::Error`].
    #[error("io: {0}")]
    Io(#[from] io::Error),

    /// Error reported by the `pest` based parser.
    #[error("parser error: {0}")]
    Parse(#[from] Box<pest::error::Error<Rule>>),

    /// Error reported by the `dyf` formatting crate.
    #[error("formatting: {0}")]
    Format(#[from] dyf::Error),

    /// Error reported by the `regex` crate.
    #[error("regex: {0}")]
    Regex(#[from] regex::Error),

    /// `bincode` encoding error.
    #[error("{0}")]
    Serialize(#[from] bincode::error::EncodeError),

    /// `bincode` decoding error.
    #[error("{0}")]
    Deserialize(#[from] bincode::error::DecodeError),
}
289
290impl Error {
291 #[inline]
292 fn parser<S: ToString>(msg: S, span: Span<'_>) -> Self {
293 Self::Parse(Box::new(pest::error::Error::new_from_span(
294 ErrorVariant::CustomError {
295 message: msg.to_string(),
296 },
297 span,
298 )))
299 }
300
301 fn msg<M: AsRef<str>>(msg: M) -> Self {
302 Self::Msg(msg.as_ref().into())
303 }
304
305 fn localized<S: AsRef<str>>(source: S, line: usize, err: Error) -> Self {
306 Self::Localized(source.as_ref().into(), line, err.into())
307 }
308
309 pub fn unwrap_localized(&self) -> &Self {
311 match self {
312 Self::Localized(_, _, e) => e,
313 _ => self,
314 }
315 }
316}
317
/// Message attached to a rule, either a plain string or a format string
/// rendered with the value matched by the test.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Message {
    /// Plain text, used verbatim.
    String(String),
    /// A message containing a format directive.
    Format {
        /// The printf-style conversion specifier; `"c"` makes byte scalars
        /// render as characters.
        printf_spec: String,
        /// The `dyf` format string used to render the matched value.
        fs: FormatString,
    },
}
326
327impl Display for Message {
328 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
329 match self {
330 Self::String(s) => write!(f, "{s}"),
331 Self::Format { printf_spec: _, fs } => write!(f, "{}", fs.to_string_lossy()),
332 }
333 }
334}
335
336impl Message {
337 fn to_string_lossy(&self) -> Cow<'_, str> {
338 match self {
339 Message::String(s) => Cow::Borrowed(s),
340 Message::Format { printf_spec: _, fs } => fs.to_string_lossy(),
341 }
342 }
343
344 #[inline(always)]
345 fn format_with(&self, mr: Option<&MatchRes>) -> Result<Cow<'_, str>, Error> {
346 match self {
347 Self::String(s) => Ok(Cow::Borrowed(s.as_str())),
348 Self::Format {
349 printf_spec: c_spec,
350 fs,
351 } => {
352 if let Some(mr) = mr {
353 match mr {
354 MatchRes::Float(_, _) | MatchRes::Bytes(_, _, _, _) => {
355 Ok(Cow::Owned(dformat!(fs, mr)?))
356 }
357 MatchRes::Scalar(_, scalar) => {
358 if c_spec.as_str() == "c" {
360 match scalar {
361 Scalar::byte(b) => {
362 let b = (*b as u8) as char;
363 Ok(Cow::Owned(dformat!(fs, b)?))
364 }
365 Scalar::ubyte(b) => {
366 let b = *b as char;
367 Ok(Cow::Owned(dformat!(fs, b)?))
368 }
369 _ => Ok(Cow::Owned(dformat!(fs, mr)?)),
370 }
371 } else {
372 Ok(Cow::Owned(dformat!(fs, mr)?))
373 }
374 }
375 }
376 } else {
377 Ok(fs.to_string_lossy())
378 }
379 }
380 }
381 }
382}
383
384impl ScalarDataType {
385 #[inline(always)]
386 fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Scalar, Error> {
387 macro_rules! _read_le {
388 ($ty: ty) => {{
389 if switch_endianness {
390 <$ty>::from_be_bytes(read!(from, $ty))
391 } else {
392 <$ty>::from_le_bytes(read!(from, $ty))
393 }
394 }};
395 }
396
397 macro_rules! _read_be {
398 ($ty: ty) => {{
399 if switch_endianness {
400 <$ty>::from_le_bytes(read!(from, $ty))
401 } else {
402 <$ty>::from_be_bytes(read!(from, $ty))
403 }
404 }};
405 }
406
407 macro_rules! _read_ne {
408 ($ty: ty) => {{
409 if cfg!(target_endian = "big") {
410 _read_be!($ty)
411 } else {
412 _read_le!($ty)
413 }
414 }};
415 }
416
417 macro_rules! _read_me {
418 () => {
419 ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
420 };
421 }
422
423 Ok(match self {
424 Self::byte => Scalar::byte(read!(from, u8)[0] as i8),
426 Self::short => Scalar::short(_read_ne!(i16)),
427 Self::long => Scalar::long(_read_ne!(i32)),
428 Self::date => Scalar::date(_read_ne!(i32)),
429 Self::ldate => Scalar::ldate(_read_ne!(i32)),
430 Self::qwdate => Scalar::qwdate(_read_ne!(i64)),
431 Self::leshort => Scalar::leshort(_read_le!(i16)),
432 Self::lelong => Scalar::lelong(_read_le!(i32)),
433 Self::lequad => Scalar::lequad(_read_le!(i64)),
434 Self::bequad => Scalar::bequad(_read_be!(i64)),
435 Self::belong => Scalar::belong(_read_be!(i32)),
436 Self::bedate => Scalar::bedate(_read_be!(i32)),
437 Self::beldate => Scalar::beldate(_read_be!(i32)),
438 Self::beqdate => Scalar::beqdate(_read_be!(i64)),
439 Self::ubyte => Scalar::ubyte(read!(from, u8)[0]),
441 Self::ushort => Scalar::ushort(_read_ne!(u16)),
442 Self::uleshort => Scalar::uleshort(_read_le!(u16)),
443 Self::ulelong => Scalar::ulelong(_read_le!(u32)),
444 Self::uledate => Scalar::uledate(_read_le!(u32)),
445 Self::ulequad => Scalar::ulequad(_read_le!(u64)),
446 Self::offset => Scalar::offset(from.stream_position()?),
447 Self::ubequad => Scalar::ubequad(_read_be!(u64)),
448 Self::medate => Scalar::medate(_read_me!()),
449 Self::meldate => Scalar::meldate(_read_me!()),
450 Self::melong => Scalar::melong(_read_me!()),
451 Self::beshort => Scalar::beshort(_read_be!(i16)),
452 Self::quad => Scalar::quad(_read_ne!(i64)),
453 Self::uquad => Scalar::uquad(_read_ne!(u64)),
454 Self::ledate => Scalar::ledate(_read_le!(i32)),
455 Self::leldate => Scalar::leldate(_read_le!(i32)),
456 Self::leqdate => Scalar::leqdate(_read_le!(i64)),
457 Self::leqldate => Scalar::leqldate(_read_le!(i64)),
458 Self::leqwdate => Scalar::leqwdate(_read_le!(i64)),
459 Self::ubelong => Scalar::ubelong(_read_be!(u32)),
460 Self::ulong => Scalar::ulong(_read_ne!(u32)),
461 Self::ubeshort => Scalar::ubeshort(_read_be!(u16)),
462 Self::ubeqdate => Scalar::ubeqdate(_read_be!(u64)),
463 Self::lemsdosdate => Scalar::lemsdosdate(_read_le!(u16)),
464 Self::lemsdostime => Scalar::lemsdostime(_read_le!(u16)),
465 Self::guid => Scalar::guid(u128::from_be_bytes(read!(from, u128))),
466 })
467 }
468}
469
470impl FloatDataType {
471 #[inline(always)]
472 fn read<R: Read + Seek>(&self, from: &mut R, switch_endianness: bool) -> Result<Float, Error> {
473 macro_rules! _read_le {
474 ($ty: ty) => {{
475 if switch_endianness {
476 <$ty>::from_be_bytes(read!(from, $ty))
477 } else {
478 <$ty>::from_le_bytes(read!(from, $ty))
479 }
480 }};
481 }
482
483 macro_rules! _read_be {
484 ($ty: ty) => {{
485 if switch_endianness {
486 <$ty>::from_le_bytes(read!(from, $ty))
487 } else {
488 <$ty>::from_be_bytes(read!(from, $ty))
489 }
490 }};
491 }
492
493 macro_rules! _read_ne {
494 ($ty: ty) => {{
495 if cfg!(target_endian = "big") {
496 _read_be!($ty)
497 } else {
498 _read_le!($ty)
499 }
500 }};
501 }
502
503 macro_rules! _read_me {
504 () => {
505 ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
506 };
507 }
508
509 Ok(match self {
510 Self::lefloat => Float::lefloat(_read_le!(f32)),
511 Self::befloat => Float::befloat(_read_le!(f32)),
512 Self::ledouble => Float::ledouble(_read_le!(f64)),
513 Self::bedouble => Float::bedouble(_read_be!(f64)),
514 })
515 }
516}
517
/// Arithmetic or bitwise operator applied to a value read from the input
/// before it is compared.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Op {
    /// Multiplication (`*`).
    Mul,
    /// Addition (`+`).
    Add,
    /// Subtraction (`-`).
    Sub,
    /// Division (`/`).
    Div,
    /// Remainder (`%`).
    Mod,
    /// Bitwise and (`&`).
    And,
    /// Bitwise exclusive or (`^`).
    Xor,
    /// Bitwise or (`|`).
    Or,
}
529
530impl Display for Op {
531 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
532 match self {
533 Op::Mul => write!(f, "*"),
534 Op::Add => write!(f, "+"),
535 Op::Sub => write!(f, "-"),
536 Op::Div => write!(f, "/"),
537 Op::Mod => write!(f, "%"),
538 Op::And => write!(f, "&"),
539 Op::Or => write!(f, "|"),
540 Op::Xor => write!(f, "^"),
541 }
542 }
543}
544
/// Comparison operator of a test, relating the value read from the input to
/// the test value.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum CmpOp {
    /// The read value equals the test value.
    Eq,
    /// The read value is lower than the test value.
    Lt,
    /// The read value is greater than the test value.
    Gt,
    /// Every bit set in the test value is also set in the read value.
    BitAnd,
    /// The read value differs from the test value.
    Neq,
    /// No bit set in the test value is set in the read value.
    Xor,
    /// The read value equals the bitwise negation of the test value.
    Not,
}
555
556impl CmpOp {
557 #[inline(always)]
558 fn is_neq(&self) -> bool {
559 matches!(self, Self::Neq)
560 }
561}
562
563#[derive(Debug, Clone, Serialize, Deserialize)]
564struct ScalarTransform {
565 op: Op,
566 num: Scalar,
567}
568
569impl ScalarTransform {
570 fn apply(&self, s: Scalar) -> Option<Scalar> {
571 match self.op {
572 Op::Add => s.checked_add(self.num),
573 Op::Sub => s.checked_sub(self.num),
574 Op::Mul => s.checked_mul(self.num),
575 Op::Div => s.checked_div(self.num),
576 Op::Mod => s.checked_rem(self.num),
577 Op::And => Some(s.bitand(self.num)),
578 Op::Xor => Some(s.bitxor(self.num)),
579 Op::Or => Some(s.bitor(self.num)),
580 }
581 }
582}
583
584#[derive(Debug, Clone, Serialize, Deserialize)]
585struct FloatTransform {
586 op: Op,
587 num: Float,
588}
589
590impl FloatTransform {
591 fn apply(&self, s: Float) -> Float {
592 match self.op {
593 Op::Add => s.add(self.num),
594 Op::Sub => s.sub(self.num),
595 Op::Mul => s.mul(self.num),
596 Op::Div => s.div(self.num),
598 Op::Mod => s.rem(self.num),
600 Op::And | Op::Xor | Op::Or => {
602 debug_panic!("unsupported operation");
603 s
604 }
605 }
606 }
607}
608
/// Value a test compares against.
#[derive(Clone, Serialize, Deserialize)]
enum TestValue<T> {
    /// A concrete value to compare with.
    Value(T),
    /// Wildcard, printed as `ANY` by the `Debug` implementations.
    Any,
}
614
615impl Debug for TestValue<Vec<u8>> {
616 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
617 match self {
618 Self::Value(v) => write!(f, "\"{}\"", debug_string_from_vec_u8(v)),
619 Self::Any => write!(f, "ANY"),
620 }
621 }
622}
623
624impl Debug for TestValue<Vec<u16>> {
625 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
626 match self {
627 Self::Value(v) => write!(f, "\"{}\"", debug_string_from_vec_u16(v)),
628 Self::Any => write!(f, "ANY"),
629 }
630 }
631}
632
633impl Debug for TestValue<Scalar> {
634 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
635 match self {
636 Self::Value(s) => write!(f, "{s:?}"),
637 Self::Any => write!(f, "ANY"),
638 }
639 }
640}
641
642impl Debug for TestValue<Float> {
643 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
644 match self {
645 Self::Value(fl) => write!(f, "{fl:?}"),
646 Self::Any => write!(f, "ANY"),
647 }
648 }
649}
650
651impl<T> TestValue<T> {
652 #[inline(always)]
653 fn as_ref(&self) -> TestValue<&T> {
654 match self {
655 Self::Value(v) => TestValue::Value(v),
656 Self::Any => TestValue::Any,
657 }
658 }
659}
660
// Modifier flags attached to regex tests (also carried by search tests).
flags! {
    enum ReMod: u8{
        // presumably case-insensitive matching — applied outside this excerpt, TODO confirm
        CaseInsensitive,
        // NOTE(review): effect not visible in this excerpt
        StartOffsetUpdate,
        // the test length counts lines: the amount of bytes read is `length * 80`
        LineLimit,
        // forces the test to be considered binary
        ForceBin,
        // forces the test to be considered text
        ForceText,
        // NOTE(review): effect not visible in this excerpt
        TrimMatch,
    }
}
671
672fn serialize_regex<S>(re: &bytes::Regex, serializer: S) -> Result<S::Ok, S::Error>
673where
674 S: serde::Serializer,
675{
676 re.as_str().serialize(serializer)
677}
678
679fn deserialize_regex<'de, D>(deserializer: D) -> Result<bytes::Regex, D::Error>
680where
681 D: serde::Deserializer<'de>,
682{
683 let wrapper = String::deserialize(deserializer)?;
684 bytes::Regex::new(&wrapper).map_err(serde::de::Error::custom)
685}
686
/// Test matching a regular expression against the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct RegexTest {
    /// Compiled regex; (de)serialized through its pattern string.
    #[serde(
        serialize_with = "serialize_regex",
        deserialize_with = "deserialize_regex"
    )]
    re: bytes::Regex,
    /// Optional length: bounds the bytes read for the test and, on text
    /// streams, the number of lines scanned.
    length: Option<usize>,
    /// Regex modifiers.
    mods: FlagSet<ReMod>,
    /// String modifiers.
    str_mods: FlagSet<StringMod>,
    /// NOTE(review): not used in this excerpt — presumably the length of the
    /// non-magic part of the pattern; confirm against the parser.
    non_magic_len: usize,
    /// Whether the test is considered binary.
    binary: bool,
    /// Comparison operator; `Neq` turns an absence of match into a success.
    cmp_op: CmpOp,
}
701
702impl RegexTest {
703 #[inline(always)]
704 fn is_binary(&self) -> bool {
705 self.binary
706 || self.mods.contains(ReMod::ForceBin)
707 || self.str_mods.contains(StringMod::ForceBin)
708 }
709
710 #[inline(always)]
711 fn is_text(&self) -> bool {
712 self.mods.contains(ReMod::ForceText) || self.str_mods.contains(StringMod::ForceText)
713 }
714
715 fn match_buf<'buf>(
716 &self,
717 off_buf: u64, stream_kind: StreamKind,
719 buf: &'buf [u8],
720 ) -> Option<MatchRes<'buf>> {
721 let mr = match stream_kind {
722 StreamKind::Text(_) => {
723 let mut off_txt = off_buf;
724
725 let mut line_limit = self.length.unwrap_or(usize::MAX);
726
727 for line in buf.split(|c| c == &b'\n') {
728 if line_limit == 0 {
732 break;
733 }
734
735 if let Some(re_match) = self.re.find(line) {
736 let start_offset = off_txt + re_match.start() as u64;
738
739 let stop_offset = if re_match.end() == line.len() {
741 Some(start_offset + re_match.as_bytes().len() as u64 + 1)
742 } else {
743 None
744 };
745
746 return Some(MatchRes::Bytes(
747 start_offset,
748 stop_offset,
749 re_match.as_bytes(),
750 Encoding::Utf8,
751 ));
752 }
753
754 off_txt += line.len() as u64;
755 off_txt += 1;
757 line_limit = line_limit.saturating_sub(1)
758 }
759 None
760 }
761
762 StreamKind::Binary => {
763 self.re.find(buf).map(|re_match| {
764 MatchRes::Bytes(
765 off_buf + re_match.start() as u64,
767 None,
768 re_match.as_bytes(),
769 Encoding::Utf8,
770 )
771 })
772 }
773 };
774
775 if self.cmp_op.is_neq() && mr.is_none() {
777 return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
778 }
779
780 mr
781 }
782}
783
784impl From<RegexTest> for Test {
785 fn from(value: RegexTest) -> Self {
786 Self::Regex(value)
787 }
788}
789
// Modifier flags attached to string tests (also carried by search and regex tests).
flags! {
    enum StringMod: u8{
        // forces the test to be considered binary
        ForceBin,
        // uppercase pattern bytes match both cases of the input byte
        UpperInsensitive,
        // lowercase pattern bytes match both cases of the input byte
        LowerInsensitive,
        // the match must be followed by whitespace or the end of the data
        FullWordMatch,
        // ASCII whitespace is trimmed from the matched value
        Trim,
        // forces the test to be considered text
        ForceText,
        // a run of spaces in the pattern needs at least as many spaces in the input
        CompactWhitespace,
        // spaces are optional on both the pattern and the input side
        OptBlank,
    }
}
802
/// Test comparing a byte string with the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StringTest {
    /// String to compare with, or any string.
    test_val: TestValue<Vec<u8>>,
    /// Comparison operator.
    cmp_op: CmpOp,
    /// Optional number of bytes to read from the input.
    length: Option<usize>,
    /// String modifiers.
    mods: FlagSet<StringMod>,
    /// Whether the test is considered binary.
    binary: bool,
}
811
812impl From<StringTest> for Test {
813 fn from(value: StringTest) -> Self {
814 Self::String(value)
815 }
816}
817
/// Matches the pattern `str` against the beginning of `buf`, honouring the
/// string modifiers in `mods`.
///
/// Returns `(matched, consumed)`, `consumed` being the number of bytes of
/// `buf` taken by the comparison (`str.len()` on the fast path, `0` when the
/// fast path fails).
#[inline(always)]
fn string_match(str: &[u8], mods: FlagSet<StringMod>, buf: &[u8]) -> (bool, usize) {
    let mut consumed = 0;
    // Fast path: no modifier altering the comparison is set, a plain prefix
    // check is enough.
    if mods.is_disjoint(
        StringMod::UpperInsensitive
            | StringMod::LowerInsensitive
            | StringMod::FullWordMatch
            | StringMod::CompactWhitespace
            | StringMod::OptBlank,
    ) {
        if buf.starts_with(str) {
            (true, str.len())
        } else {
            (false, consumed)
        }
    } else {
        // Index of the next pattern byte to match.
        let mut i_src = 0;
        let mut iter = buf.iter().peekable();

        // Advances in `buf` by one byte, counting it as consumed.
        macro_rules! consume_target {
            () => {{
                if iter.next().is_some() {
                    consumed += 1;
                }
            }};
        }

        // Advances by one byte in both `buf` and the pattern, then moves on
        // to the next loop iteration.
        macro_rules! continue_next_iteration {
            () => {{
                consume_target!();
                i_src += 1;
                continue;
            }};
        }

        while let Some(&&b) = iter.peek() {
            // Pattern exhausted: nothing left to compare.
            let Some(&ref_byte) = str.get(i_src) else {
                break;
            };

            // Optional blanks: a space on either side is skipped on that
            // side only.
            if mods.contains(StringMod::OptBlank) && (b == b' ' || ref_byte == b' ') {
                if b == b' ' {
                    consume_target!();
                }

                if ref_byte == b' ' {
                    i_src += 1;
                }

                continue;
            }

            // An uppercase pattern byte matches the input byte whatever its
            // case; other bytes need an exact match.
            if mods.contains(StringMod::UpperInsensitive) {
                if ref_byte.is_ascii_uppercase() && ref_byte == b.to_ascii_uppercase()
                    || ref_byte == b
                {
                    continue_next_iteration!()
                }
            }

            // Same as above for lowercase pattern bytes.
            if mods.contains(StringMod::LowerInsensitive)
                && (ref_byte.is_ascii_lowercase() && ref_byte == b.to_ascii_lowercase()
                    || ref_byte == b)
            {
                continue_next_iteration!()
            }

            // A run of spaces in the pattern needs at least as many spaces
            // in the input; extra input spaces are swallowed.
            if mods.contains(StringMod::CompactWhitespace) && ref_byte == b' ' {
                let mut src_blk = 0;
                while let Some(b' ') = str.get(i_src) {
                    src_blk += 1;
                    i_src += 1;
                }

                let mut tgt_blk = 0;
                while let Some(b' ') = iter.peek() {
                    tgt_blk += 1;
                    consume_target!();
                }

                if src_blk > tgt_blk {
                    return (false, consumed);
                }

                continue;
            }

            // Default: strict byte comparison.
            if ref_byte == b {
                continue_next_iteration!()
            } else {
                return (false, consumed);
            }
        }

        // Full word match: the byte following the match, if any, must be
        // whitespace.
        if mods.contains(StringMod::FullWordMatch)
            && let Some(b) = iter.peek()
            && !b.is_ascii_whitespace()
        {
            return (false, consumed);
        }

        // Success requires that something was consumed and that the whole
        // pattern was used.
        (
            consumed > 0 && str.get(i_src).is_none() && consumed <= buf.len(),
            consumed,
        )
    }
}
930
931impl StringTest {
932 fn has_length_mod(&self) -> bool {
933 !self.mods.is_disjoint(
934 StringMod::UpperInsensitive
935 | StringMod::LowerInsensitive
936 | StringMod::FullWordMatch
937 | StringMod::CompactWhitespace
938 | StringMod::OptBlank,
939 )
940 }
941
942 #[inline(always)]
943 fn test_value_len(&self) -> usize {
944 match self.test_val.as_ref() {
945 TestValue::Value(s) => s.len(),
946 TestValue::Any => 0,
947 }
948 }
949
950 #[inline(always)]
951 fn is_binary(&self) -> bool {
952 self.binary || self.mods.contains(StringMod::ForceBin)
953 }
954
955 #[inline(always)]
956 fn is_text(&self) -> bool {
957 self.mods.contains(StringMod::ForceText)
958 }
959}
960
/// Newtype over `Vec<u8>` with a `Debug` implementation printing the bytes
/// as a quoted string.
#[derive(Clone, Serialize, Deserialize)]
struct ByteVec(Vec<u8>);
963
964impl Debug for ByteVec {
965 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
966 write!(f, "\"{}\"", debug_string_from_vec_u8(self))
967 }
968}
969
970impl From<Vec<u8>> for ByteVec {
971 fn from(value: Vec<u8>) -> Self {
972 Self(value)
973 }
974}
975
976impl Deref for ByteVec {
977 type Target = Vec<u8>;
978
979 fn deref(&self) -> &Self::Target {
980 &self.0
981 }
982}
983
/// Test searching a byte string inside the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SearchTest {
    /// String searched for.
    str: ByteVec,
    /// Optional highest position, relative to the start of the searched
    /// buffer, at which a match may begin.
    n_pos: Option<usize>,
    /// String modifiers.
    str_mods: FlagSet<StringMod>,
    /// Regex modifiers.
    re_mods: FlagSet<ReMod>,
    /// Whether the test is considered binary.
    binary: bool,
    /// Comparison operator; `Neq` turns an absence of match into a success.
    cmp_op: CmpOp,
}
993
994impl From<SearchTest> for Test {
995 fn from(value: SearchTest) -> Self {
996 Self::Search(value)
997 }
998}
999
1000impl SearchTest {
1001 #[inline(always)]
1002 fn is_binary(&self) -> bool {
1003 (self.binary
1004 || self.str_mods.contains(StringMod::ForceBin)
1005 || self.re_mods.contains(ReMod::ForceBin))
1006 && !(self.str_mods.contains(StringMod::ForceText)
1007 || self.re_mods.contains(ReMod::ForceText))
1008 }
1009
1010 #[inline]
1012 fn match_buf<'buf>(&self, off_buf: u64, buf: &'buf [u8]) -> Option<MatchRes<'buf>> {
1013 let mut i = 0;
1014
1015 let needle = self.str.first()?;
1016
1017 while i < buf.len() {
1018 let Some(k) = memchr(*needle, &buf[i..]) else {
1021 break;
1022 };
1023
1024 i += k;
1025
1026 if self.str_mods.contains(StringMod::FullWordMatch) {
1028 let prev_is_whitespace = buf
1029 .get(i.saturating_sub(1))
1030 .map(|c| c.is_ascii_whitespace())
1031 .unwrap_or_default();
1032
1033 if i > 0 && !prev_is_whitespace {
1038 i += 1;
1039 continue;
1040 }
1041 }
1042
1043 if let Some(npos) = self.n_pos
1044 && i > npos
1045 {
1046 break;
1047 }
1048
1049 let pos = i;
1050 let (ok, consumed) = string_match(&self.str, self.str_mods, &buf[i..]);
1051
1052 if ok {
1053 return Some(MatchRes::Bytes(
1054 off_buf.saturating_add(pos as u64),
1055 None,
1056 &buf[i..i + consumed],
1057 Encoding::Utf8,
1058 ));
1059 } else {
1060 i += max(consumed, 1)
1061 }
1062 }
1063
1064 if self.cmp_op.is_neq() {
1066 return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
1067 }
1068
1069 None
1070 }
1071}
1072
/// Test comparing an integer-like value read from the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTest {
    /// Data type to read from the input.
    ty: ScalarDataType,
    /// Optional transformation applied to the read value before comparing.
    transform: Option<ScalarTransform>,
    /// Comparison operator.
    cmp_op: CmpOp,
    /// Value to compare with, or any value.
    test_val: TestValue<Scalar>,
}

/// Test comparing a floating point value read from the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTest {
    /// Data type to read from the input.
    ty: FloatDataType,
    /// Optional transformation applied to the read value before comparing.
    transform: Option<FloatTransform>,
    /// Comparison operator.
    cmp_op: CmpOp,
    /// Value to compare with, or any value.
    test_val: TestValue<Float>,
}
1088
/// Value read from the input for a test, paired with the offset it was read
/// at (first field of every variant).
#[derive(PartialEq)]
enum ReadValue<'buf> {
    /// A floating point value.
    Float(u64, Float),
    /// An integer-like value.
    Scalar(u64, Scalar),
    /// Raw bytes borrowed from the read buffer.
    Bytes(u64, &'buf [u8]),
}
1097
1098impl<'buf> Debug for ReadValue<'buf> {
1099 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1100 match self {
1101 Self::Float(_, fl) => write!(f, "{fl:?}"),
1102 Self::Scalar(_, s) => write!(f, "{s:?}"),
1103 Self::Bytes(_, b) => {
1104 if b.len() <= 128 {
1105 write!(f, "\"{}\"", debug_string_from_vec_u8(b))
1106 } else {
1107 let limit = 128;
1108 write!(
1109 f,
1110 "\"{}\" (first {limit} bytes)",
1111 debug_string_from_vec_u8(&b[..limit])
1112 )
1113 }
1114 }
1115 }
1116 }
1117}
1118
1119impl DynDisplay for ReadValue<'_> {
1120 fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1121 match self {
1122 Self::Float(_, s) => DynDisplay::dyn_fmt(s, f),
1123 Self::Scalar(_, s) => DynDisplay::dyn_fmt(s, f),
1124 Self::Bytes(_, b) => Ok(format!("{b:?}")),
1125 }
1126 }
1127}
1128
1129impl DynDisplay for &ReadValue<'_> {
1130 fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1131 DynDisplay::dyn_fmt(*self, f)
1133 }
1134}
1135
1136impl Display for ReadValue<'_> {
1137 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1138 match self {
1139 Self::Float(_, v) => write!(f, "{v}"),
1140 Self::Scalar(_, s) => write!(f, "{s}"),
1141 Self::Bytes(_, b) => write!(f, "{b:?}"),
1142 }
1143 }
1144}
1145
/// Text encoding of matched bytes, used when turning them into a string.
enum Encoding {
    /// UTF-16 with the given byte order.
    Utf16(String16Encoding),
    /// UTF-8.
    Utf8,
}

/// Result of a successful test.
enum MatchRes<'buf> {
    /// Matched bytes: start offset, optional explicit end offset, the bytes
    /// themselves and their encoding. Without an explicit end offset, the end
    /// is the start offset plus the length of the bytes.
    Bytes(u64, Option<u64>, &'buf [u8], Encoding),
    /// Matched scalar and the offset it starts at.
    Scalar(u64, Scalar),
    /// Matched float and the offset it starts at.
    Float(u64, Float),
}
1162
1163impl DynDisplay for &MatchRes<'_> {
1164 fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1165 (*self).dyn_fmt(f)
1166 }
1167}
1168
1169impl DynDisplay for MatchRes<'_> {
1170 fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1171 match self {
1172 Self::Scalar(_, v) => v.dyn_fmt(f),
1173 Self::Float(_, v) => v.dyn_fmt(f),
1174 Self::Bytes(_, _, v, enc) => match enc {
1175 Encoding::Utf8 => String::from_utf8_lossy(v).to_string().dyn_fmt(f),
1176 Encoding::Utf16(enc) => {
1177 let utf16: Vec<u16> = slice_to_utf16_iter(v, *enc).collect();
1178 String::from_utf16_lossy(&utf16).dyn_fmt(f)
1179 }
1180 },
1181 }
1182 }
1183}
1184
1185impl MatchRes<'_> {
1186 #[inline]
1188 fn start_offset(&self) -> u64 {
1189 match self {
1190 MatchRes::Bytes(o, _, _, _) => *o,
1191 MatchRes::Scalar(o, _) => *o,
1192 MatchRes::Float(o, _) => *o,
1193 }
1194 }
1195
1196 #[inline]
1198 fn end_offset(&self) -> u64 {
1199 match self {
1200 MatchRes::Bytes(start, end, buf, _) => match end {
1201 Some(end) => *end,
1202 None => start.saturating_add(buf.len() as u64),
1203 },
1204 MatchRes::Scalar(o, sc) => o.add(sc.size_of() as u64),
1205 MatchRes::Float(o, f) => o.add(f.size_of() as u64),
1206 }
1207 }
1208}
1209
1210fn slice_to_utf16_iter(read: &[u8], encoding: String16Encoding) -> impl Iterator<Item = u16> {
1211 let even = read
1212 .iter()
1213 .enumerate()
1214 .filter(|(i, _)| i % 2 == 0)
1215 .map(|t| t.1);
1216
1217 let odd = read
1218 .iter()
1219 .enumerate()
1220 .filter(|(i, _)| i % 2 != 0)
1221 .map(|t| t.1);
1222
1223 even.zip(odd).map(move |(e, o)| match encoding {
1224 String16Encoding::Le => u16::from_le_bytes([*e, *o]),
1225 String16Encoding::Be => u16::from_be_bytes([*e, *o]),
1226 })
1227}
1228
/// Byte order of a 16-bit encoded string.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum String16Encoding {
    /// Little-endian.
    Le,
    /// Big-endian.
    Be,
}

/// Test comparing a 16-bit encoded string with the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct String16Test {
    /// NOTE(review): presumably the test string as written in the rule —
    /// confirm against the parser.
    orig: String,
    /// 16-bit units to compare with, or any string.
    test_val: TestValue<Vec<u16>>,
    /// Byte order used to decode the input.
    encoding: String16Encoding,
}
1241
1242impl String16Test {
1243 #[inline(always)]
1247 fn test_value_len(&self) -> usize {
1248 match self.test_val.as_ref() {
1249 TestValue::Value(str16) => str16.len(),
1250 TestValue::Any => 0,
1251 }
1252 }
1253}
1254
// Modifier flags attached to indirect tests.
flags! {
    enum IndirectMod: u8{
        // NOTE(review): presumably makes the indirect offset relative — the
        // flag is not consumed in this excerpt, confirm against callers
        Relative,
    }
}

/// Set of [`IndirectMod`] flags.
type IndirectMods = FlagSet<IndirectMod>;
1262
/// Encoding of the length prefix stored in front of a length-prefixed string.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum PStringLen {
    /// One byte length.
    Byte,
    /// Two bytes length, big-endian.
    ShortBe,
    /// Two bytes length, little-endian.
    ShortLe,
    /// Four bytes length, big-endian.
    LongBe,
    /// Four bytes length, little-endian.
    LongLe,
}
1271
1272impl PStringLen {
1273 #[inline(always)]
1274 const fn size_of_len(&self) -> usize {
1275 match self {
1276 PStringLen::Byte => 1,
1277 PStringLen::ShortBe => 2,
1278 PStringLen::ShortLe => 2,
1279 PStringLen::LongBe => 4,
1280 PStringLen::LongLe => 4,
1281 }
1282 }
1283}
1284
/// Test comparing a length-prefixed string with the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PStringTest {
    /// Encoding of the length prefix.
    len: PStringLen,
    /// String to compare with, or any string.
    test_val: TestValue<Vec<u8>>,
    /// Whether the stored length also counts the bytes of the prefix itself.
    include_len: bool,
}
1291
1292impl PStringTest {
1293 #[inline]
1294 fn read<'cache, R: Read + Seek>(
1295 &self,
1296 haystack: &'cache mut LazyCache<R>,
1297 ) -> Result<Option<&'cache [u8]>, Error> {
1298 let mut len = match self.len {
1299 PStringLen::Byte => read_le!(haystack, u8) as u32,
1300 PStringLen::ShortBe => read_be!(haystack, u16) as u32,
1301 PStringLen::ShortLe => read_le!(haystack, u16) as u32,
1302 PStringLen::LongBe => read_be!(haystack, u32),
1303 PStringLen::LongLe => read_le!(haystack, u32),
1304 } as usize;
1305
1306 if self.include_len {
1307 len = len.saturating_sub(self.len.size_of_len())
1308 }
1309
1310 if let TestValue::Value(s) = self.test_val.as_ref()
1311 && len != s.len()
1312 {
1313 return Ok(None);
1314 }
1315
1316 let read = haystack.read_exact_count(len as u64)?;
1317
1318 Ok(Some(read))
1319 }
1320
1321 #[inline(always)]
1322 fn test_value_len(&self) -> usize {
1323 match self.test_val.as_ref() {
1324 TestValue::Value(s) => s.len(),
1325 TestValue::Any => 0,
1326 }
1327 }
1328}
1329
/// The different kinds of test a rule can carry.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Test {
    /// Declares a named rule (`name <name>`).
    Name(String),
    /// Calls a named rule (`use <name>`).
    /// NOTE(review): the boolean is bound as `flip` by the `Display`
    /// implementation — presumably an endianness switch; confirm against the
    /// parser.
    Use(bool, String),
    /// Integer-like value test.
    Scalar(ScalarTest),
    /// Floating point value test.
    Float(FloatTest),
    /// Byte string test.
    String(StringTest),
    /// Byte string search test.
    Search(SearchTest),
    /// Length-prefixed string test.
    PString(PStringTest),
    /// Regular expression test.
    Regex(RegexTest),
    /// Indirect test with its modifiers.
    Indirect(FlagSet<IndirectMod>),
    /// 16-bit encoded string test.
    String16(String16Test),
    /// DER test, displayed as "unimplemented der".
    #[allow(dead_code)]
    Der,
    /// `clear` test.
    Clear,
    /// `default` test.
    Default,
}
1348
1349impl Display for Test {
1350 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1351 match self {
1352 Test::Name(name) => write!(f, "name {name}"),
1353 Test::Use(flip, rule) => {
1354 if *flip {
1355 write!(f, "use {rule}")
1356 } else {
1357 write!(f, "use ^{rule}")
1358 }
1359 }
1360 Test::Scalar(st) => write!(f, "{st:?}"),
1361 Test::Float(ft) => write!(f, "{ft:?}"),
1362 Test::String(st) => write!(f, "{st:?}"),
1363 Test::Search(st) => write!(f, "{st:?}"),
1364 Test::PString(pt) => write!(f, "{pt:?}"),
1365 Test::Regex(rt) => write!(f, "{rt:?}"),
1366 Test::Indirect(fs) => write!(f, "indirect {fs:?}"),
1367 Test::String16(s16t) => write!(f, "{s16t:?}"),
1368 Test::Der => write!(f, "unimplemented der"),
1369 Test::Clear => write!(f, "clear"),
1370 Test::Default => write!(f, "default"),
1371 }
1372 }
1373}
1374
1375impl Test {
1376 #[inline]
1378 fn read_test_value<'haystack, R: Read + Seek>(
1379 &self,
1380 haystack: &'haystack mut LazyCache<R>,
1381 switch_endianness: bool,
1382 ) -> Result<Option<ReadValue<'haystack>>, Error> {
1383 let test_value_offset = haystack.lazy_stream_position();
1384
1385 match self {
1386 Self::Scalar(t) => {
1387 t.ty.read(haystack, switch_endianness)
1388 .map(|s| Some(ReadValue::Scalar(test_value_offset, s)))
1389 }
1390
1391 Self::Float(t) => {
1392 t.ty.read(haystack, switch_endianness)
1393 .map(|f| Some(ReadValue::Float(test_value_offset, f)))
1394 }
1395 Self::String(t) => {
1396 match t.test_val.as_ref() {
1397 TestValue::Value(str) => {
1398 let buf = if let Some(length) = t.length {
1399 haystack.read_exact_count(length as u64)?
1401 } else {
1402 match t.cmp_op {
1405 CmpOp::Eq | CmpOp::Neq => {
1406 if !t.has_length_mod() {
1407 haystack.read_exact_count(str.len() as u64)?
1408 } else {
1409 haystack.read_count(FILE_BYTES_MAX as u64)?
1410 }
1411 }
1412 CmpOp::Lt | CmpOp::Gt => {
1413 let read =
1414 haystack.read_until_any_delim_or_limit(b"\n\0", 8092)?;
1415
1416 if read.ends_with(b"\0") || read.ends_with(b"\n") {
1417 &read[..read.len() - 1]
1418 } else {
1419 read
1420 }
1421 }
1422 _ => {
1423 return Err(Error::Msg(format!(
1424 "string test does not support {:?} operator",
1425 t.cmp_op
1426 )));
1427 }
1428 }
1429 };
1430
1431 Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1432 }
1433 TestValue::Any => {
1434 let read = haystack.read_until_any_delim_or_limit(b"\0\n", 8192)?;
1435 let bytes = if read.ends_with(b"\0") || read.ends_with(b"\n") {
1437 &read[..read.len() - 1]
1438 } else {
1439 read
1440 };
1441
1442 Ok(Some(ReadValue::Bytes(test_value_offset, bytes)))
1443 }
1444 }
1445 }
1446
1447 Self::String16(t) => {
1448 match t.test_val.as_ref() {
1449 TestValue::Value(str16) => {
1450 let read = haystack.read_exact_count((str16.len() * 2) as u64)?;
1451
1452 Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1453 }
1454 TestValue::Any => {
1455 let read = haystack.read_until_utf16_or_limit(b"\x00\x00", 8192)?;
1456
1457 let end = if read.len() % 2 == 0 {
1459 read.len()
1460 } else {
1461 read.len().saturating_sub(1)
1464 };
1465
1466 Ok(Some(ReadValue::Bytes(test_value_offset, &read[..end])))
1467 }
1468 }
1469 }
1470
1471 Self::PString(t) => {
1472 let Some(read) = t.read(haystack)? else {
1473 return Ok(None);
1474 };
1475 Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1476 }
1477
1478 Self::Search(_) => {
1479 let buf = haystack.read_count(FILE_BYTES_MAX as u64)?;
1480 Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1481 }
1482
1483 Self::Regex(r) => {
1484 let length = {
1485 match r.length {
1486 Some(len) => {
1487 if r.mods.contains(ReMod::LineLimit) {
1488 len * 80
1489 } else {
1490 len
1491 }
1492 }
1493
1494 None => FILE_REGEX_MAX,
1495 }
1496 };
1497
1498 let read = haystack.read_count(length as u64)?;
1499 Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1500 }
1501
1502 Self::Name(_)
1503 | Self::Use(_, _)
1504 | Self::Indirect(_)
1505 | Self::Clear
1506 | Self::Default
1507 | Self::Der => Err(Error::msg("no value to read for this test")),
1508 }
1509 }
1510
1511 #[inline(always)]
1512 fn match_value<'s>(
1513 &'s self,
1514 tv: &ReadValue<'s>,
1515 stream_kind: StreamKind,
1516 ) -> Option<MatchRes<'s>> {
1517 match (self, tv) {
1518 (Self::Scalar(t), ReadValue::Scalar(o, ts)) => {
1519 let read_value: Scalar = match t.transform.as_ref() {
1520 Some(t) => t.apply(*ts)?,
1521 None => *ts,
1522 };
1523
1524 match t.test_val {
1525 TestValue::Value(test_value) => {
1526 let ok = match t.cmp_op {
1527 CmpOp::Not => read_value == !test_value,
1530 CmpOp::Eq => read_value == test_value,
1531 CmpOp::Lt => read_value < test_value,
1532 CmpOp::Gt => read_value > test_value,
1533 CmpOp::Neq => read_value != test_value,
1534 CmpOp::BitAnd => read_value & test_value == test_value,
1535 CmpOp::Xor => (read_value & test_value).is_zero(),
1536 };
1537
1538 if ok {
1539 Some(MatchRes::Scalar(*o, read_value))
1540 } else {
1541 None
1542 }
1543 }
1544
1545 TestValue::Any => Some(MatchRes::Scalar(*o, read_value)),
1546 }
1547 }
1548
1549 (Self::Float(t), ReadValue::Float(o, f)) => {
1550 let read_value: Float = t.transform.as_ref().map(|t| t.apply(*f)).unwrap_or(*f);
1551
1552 match t.test_val {
1553 TestValue::Value(tf) => {
1554 let ok = match t.cmp_op {
1555 CmpOp::Eq => read_value == tf,
1556 CmpOp::Lt => read_value < tf,
1557 CmpOp::Gt => read_value > tf,
1558 CmpOp::Neq => read_value != tf,
1559 _ => {
1560 debug_panic!("unsupported float comparison");
1563 debug!("unsupported float comparison");
1564 false
1565 }
1566 };
1567
1568 if ok {
1569 Some(MatchRes::Float(*o, read_value))
1570 } else {
1571 None
1572 }
1573 }
1574 TestValue::Any => Some(MatchRes::Float(*o, read_value)),
1575 }
1576 }
1577
1578 (Self::String(st), ReadValue::Bytes(o, buf)) => {
1579 macro_rules! trim_buf {
1580 ($buf: expr) => {{
1581 if st.mods.contains(StringMod::Trim) {
1582 $buf.trim_ascii()
1583 } else {
1584 $buf
1585 }
1586 }};
1587 }
1588
1589 match st.test_val.as_ref() {
1590 TestValue::Value(str) => {
1591 match st.cmp_op {
1592 CmpOp::Eq => {
1593 if let (true, _) = string_match(str, st.mods, buf) {
1594 Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1595 } else {
1596 None
1597 }
1598 }
1599 CmpOp::Neq => {
1600 if let (false, _) = string_match(str, st.mods, buf) {
1601 Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1602 } else {
1603 None
1604 }
1605 }
1606 CmpOp::Gt => {
1607 if buf.len() > str.len() {
1608 Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1609 } else {
1610 None
1611 }
1612 }
1613 CmpOp::Lt => {
1614 if buf.len() < str.len() {
1615 Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1616 } else {
1617 None
1618 }
1619 }
1620
1621 _ => {
1623 debug_panic!("unsupported string comparison");
1626 debug!("unsupported string comparison");
1627 None
1628 }
1629 }
1630 }
1631 TestValue::Any => {
1632 Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1633 }
1634 }
1635 }
1636
1637 (Self::PString(m), ReadValue::Bytes(o, buf)) => match m.test_val.as_ref() {
1638 TestValue::Value(psv) => {
1639 if buf == psv {
1640 Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8))
1641 } else {
1642 None
1643 }
1644 }
1645 TestValue::Any => Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8)),
1646 },
1647
1648 (Self::String16(t), ReadValue::Bytes(o, buf)) => {
1649 match t.test_val.as_ref() {
1650 TestValue::Value(str16) => {
1651 if str16.len() * 2 != buf.len() {
1653 return None;
1654 }
1655
1656 for (i, utf16_char) in slice_to_utf16_iter(buf, t.encoding).enumerate() {
1658 if str16[i] != utf16_char {
1659 return None;
1660 }
1661 }
1662
1663 Some(MatchRes::Bytes(
1664 *o,
1665 None,
1666 t.orig.as_bytes(),
1667 Encoding::Utf16(t.encoding),
1668 ))
1669 }
1670
1671 TestValue::Any => {
1672 Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf16(t.encoding)))
1673 }
1674 }
1675 }
1676
1677 (Self::Regex(r), ReadValue::Bytes(o, buf)) => r.match_buf(*o, stream_kind, buf),
1678
1679 (Self::Search(t), ReadValue::Bytes(o, buf)) => t.match_buf(*o, buf),
1680
1681 _ => None,
1682 }
1683 }
1684
1685 #[inline(always)]
1686 fn strength(&self) -> u64 {
1687 const MULT: usize = 10;
1688
1689 let mut out = 2 * MULT;
1690
1691 match self {
1693 Test::Scalar(s) => {
1694 out += s.ty.type_size() * MULT;
1695 }
1696
1697 Test::Float(t) => {
1698 out += t.ty.type_size() * MULT;
1699 }
1700
1701 Test::String(t) => out += t.test_value_len().saturating_mul(MULT),
1702
1703 Test::PString(t) => out += t.test_value_len().saturating_mul(MULT),
1704
1705 Test::Search(s) => {
1706 let n_pos = s.n_pos.unwrap_or(FILE_BYTES_MAX);
1711
1712 match n_pos {
1713 0..=80 => out += s.str.len().saturating_mul(MULT),
1715 81..=240 => out += s.str.len() * s.str.len().clamp(0, MULT - 2),
1717 _ => out += s.str.len(),
1719 }
1720 }
1721
1722 Test::Regex(r) => {
1723 let v = r.non_magic_len / r.re.captures_len();
1732
1733 let len = r
1734 .length
1735 .map(|l| {
1736 if r.mods.contains(ReMod::LineLimit) {
1737 l * 80
1738 } else {
1739 l
1740 }
1741 })
1742 .unwrap_or(FILE_BYTES_MAX);
1743
1744 match len {
1745 0..=80 => out += v.saturating_mul(MULT),
1747 81..=240 => out += v * v.clamp(0, MULT - 2),
1749 _ => out += v,
1751 }
1752 }
1753
1754 Test::String16(t) => {
1755 out += t.test_value_len().saturating_mul(MULT);
1760 }
1761
1762 Test::Der => out += MULT,
1763
1764 Test::Default | Test::Name(_) | Test::Use(_, _) | Test::Indirect(_) | Test::Clear => {
1765 return 0;
1766 }
1767 }
1768
1769 if self.is_match_any() {
1771 return 0;
1772 }
1773
1774 if let Some(op) = self.cmp_op() {
1775 match op {
1776 CmpOp::Neq => out = 0,
1778 CmpOp::Eq | CmpOp::Not => out += MULT,
1779 CmpOp::Lt | CmpOp::Gt => out -= 2 * MULT,
1780 CmpOp::Xor | CmpOp::BitAnd => out -= MULT,
1781 }
1782 }
1783
1784 out as u64
1785 }
1786
1787 #[inline(always)]
1788 fn cmp_op(&self) -> Option<CmpOp> {
1789 match self {
1790 Self::String(t) => Some(t.cmp_op),
1791 Self::Scalar(s) => Some(s.cmp_op),
1792 Self::Float(t) => Some(t.cmp_op),
1793 Self::Name(_)
1794 | Self::Use(_, _)
1795 | Self::Search(_)
1796 | Self::PString(_)
1797 | Self::Regex(_)
1798 | Self::Clear
1799 | Self::Default
1800 | Self::Indirect(_)
1801 | Self::String16(_)
1802 | Self::Der => None,
1803 }
1804 }
1805
1806 #[inline(always)]
1807 fn is_recursive(&self) -> bool {
1808 matches!(self, Test::Use(_, _) | Test::Indirect(_))
1809 }
1810
1811 #[inline(always)]
1812 fn is_match_any(&self) -> bool {
1813 match self {
1814 Test::Name(_) => false,
1815 Test::Use(_, _) => false,
1816 Test::Scalar(scalar_test) => matches!(scalar_test.test_val, TestValue::Any),
1817 Test::Float(float_test) => matches!(float_test.test_val, TestValue::Any),
1818 Test::String(string_test) => matches!(string_test.test_val, TestValue::Any),
1819 Test::Search(_) => false,
1820 Test::PString(pstring_test) => matches!(pstring_test.test_val, TestValue::Any),
1821 Test::Regex(_) => false,
1822 Test::Indirect(_) => false,
1823 Test::String16(string16_test) => matches!(string16_test.test_val, TestValue::Any),
1824 Test::Der => false,
1825 Test::Clear => false,
1826 Test::Default => false,
1827 }
1828 }
1829
1830 #[inline(always)]
1831 fn is_binary(&self) -> bool {
1832 match self {
1833 Self::Name(_) => true,
1834 Self::Use(_, _) => true,
1835 Self::Scalar(_) => true,
1836 Self::Float(_) => true,
1837 Self::String(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1838 Self::Search(t) => t.is_binary(),
1839 Self::PString(_) => true,
1840 Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1841 Self::Clear => true,
1842 Self::Default => true,
1843 Self::Indirect(_) => true,
1844 Self::String16(_) => true,
1845 Self::Der => true,
1846 }
1847 }
1848
1849 #[inline(always)]
1850 fn is_text(&self) -> bool {
1851 match self {
1852 Self::Name(_) => true,
1853 Self::Use(_, _) => true,
1854 Self::Indirect(_) => true,
1855 Self::Clear => true,
1856 Self::Default => true,
1857 Self::String(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1858 Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1859 _ => !self.is_binary(),
1860 }
1861 }
1862
1863 #[inline(always)]
1864 fn is_only_text(&self) -> bool {
1865 self.is_text() && !self.is_binary()
1866 }
1867
1868 #[inline(always)]
1869 fn is_only_binary(&self) -> bool {
1870 self.is_binary() && !self.is_text()
1871 }
1872}
1873
/// Encoding of the value stored at the address of an indirect offset.
///
/// It selects how `IndOffset::read_offset` decodes the bytes it reads. For
/// the integer kinds, the `signed` flag of the enclosing [`IndOffset`] picks
/// between the signed and unsigned variant of the same width.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum OffsetType {
    /// A single byte.
    Byte,
    /// A little-endian `f64`, cast to an integer.
    DoubleLe,
    /// A big-endian `f64`, cast to an integer.
    DoubleBe,
    /// A 16-bit little-endian integer.
    ShortLe,
    /// A 16-bit big-endian integer.
    ShortBe,
    /// A little-endian `u32` passed through `decode_id3`.
    Id3Le,
    /// A big-endian `u32` passed through `decode_id3`.
    Id3Be,
    /// A 32-bit little-endian integer.
    LongLe,
    /// A 32-bit big-endian integer.
    LongBe,
    /// A 32-bit value decoded by the `read_me!` macro (two little-endian
    /// 16-bit halves, the first one being the most significant).
    Middle,
    /// A value parsed by `read_octal_u64`.
    Octal,
    /// A 64-bit big-endian integer.
    QuadBe,
    /// A 64-bit little-endian integer.
    QuadLe,
}
1890
/// Right-hand operand of the operation applied to an indirect offset.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Shift {
    /// The operand is this literal value.
    Direct(u64),
    /// The operand is itself read from the stream, at the address of the
    /// indirect offset plus this signed displacement.
    Indirect(i64),
}
1896
/// An offset whose value has to be read from the stream being identified.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
struct IndOffset {
    /// Address at which the offset value is read.
    off_addr: DirOffset,
    /// Whether the integer value is read as signed.
    signed: bool,
    /// Encoding of the value to read.
    ty: OffsetType,
    /// Operation applied to the value read. It is only applied when `shift`
    /// is set as well.
    op: Option<Op>,
    /// Operand of `op`.
    shift: Option<Shift>,
}
1908
1909impl IndOffset {
1910 fn read_offset<R: Read + Seek>(
1912 &self,
1913 haystack: &mut LazyCache<R>,
1914 rule_base_offset: Option<u64>,
1915 last_upper_match_offset: Option<u64>,
1916 ) -> Result<Option<u64>, io::Error> {
1917 let offset_address = match self.off_addr {
1918 DirOffset::Start(s) => {
1919 let Some(o) = s.checked_add(rule_base_offset.unwrap_or_default()) else {
1920 return Ok(None);
1921 };
1922
1923 haystack.seek(SeekFrom::Start(o))?
1924 }
1925 DirOffset::LastUpper(c) => haystack.seek(SeekFrom::Start(
1926 (last_upper_match_offset.unwrap_or_default() as i64 + c) as u64,
1927 ))?,
1928 DirOffset::End(e) => haystack.seek(SeekFrom::End(e))?,
1929 };
1930
1931 macro_rules! read_value {
1932 () => {
1933 match self.ty {
1934 OffsetType::Byte => {
1935 if self.signed {
1936 read_le!(haystack, u8) as u64
1937 } else {
1938 read_le!(haystack, i8) as u64
1939 }
1940 }
1941 OffsetType::DoubleLe => read_le!(haystack, f64) as u64,
1942 OffsetType::DoubleBe => read_be!(haystack, f64) as u64,
1943 OffsetType::ShortLe => {
1944 if self.signed {
1945 read_le!(haystack, i16) as u64
1946 } else {
1947 read_le!(haystack, u16) as u64
1948 }
1949 }
1950 OffsetType::ShortBe => {
1951 if self.signed {
1952 read_be!(haystack, i16) as u64
1953 } else {
1954 read_be!(haystack, u16) as u64
1955 }
1956 }
1957 OffsetType::Id3Le => decode_id3(read_le!(haystack, u32)) as u64,
1958 OffsetType::Id3Be => decode_id3(read_be!(haystack, u32)) as u64,
1959 OffsetType::LongLe => {
1960 if self.signed {
1961 read_le!(haystack, i32) as u64
1962 } else {
1963 read_le!(haystack, u32) as u64
1964 }
1965 }
1966 OffsetType::LongBe => {
1967 if self.signed {
1968 read_be!(haystack, i32) as u64
1969 } else {
1970 read_be!(haystack, u32) as u64
1971 }
1972 }
1973 OffsetType::Middle => read_me!(haystack) as u64,
1974 OffsetType::Octal => {
1975 if let Some(o) = read_octal_u64(haystack) {
1976 o
1977 } else {
1978 debug!("failed to read octal offset @ {offset_address}");
1979 return Ok(None);
1980 }
1981 }
1982 OffsetType::QuadLe => {
1983 if self.signed {
1984 read_le!(haystack, i64) as u64
1985 } else {
1986 read_le!(haystack, u64)
1987 }
1988 }
1989 OffsetType::QuadBe => {
1990 if self.signed {
1991 read_be!(haystack, i64) as u64
1992 } else {
1993 read_be!(haystack, u64)
1994 }
1995 }
1996 }
1997 };
1998 }
1999
2000 let o = read_value!();
2002
2003 trace!(
2004 "offset read @ {offset_address} value={o} op={:?} shift={:?}",
2005 self.op, self.shift
2006 );
2007
2008 if let (Some(op), Some(shift)) = (self.op, self.shift) {
2010 let shift = match shift {
2011 Shift::Direct(i) => i,
2012 Shift::Indirect(i) => {
2013 let tmp = offset_address as i128 + i as i128;
2014 if tmp.is_negative() {
2015 return Ok(None);
2016 } else {
2017 haystack.seek(SeekFrom::Start(tmp as u64))?;
2018 };
2019 read_value!()
2022 }
2023 };
2024
2025 match op {
2026 Op::Add => return Ok(o.checked_add(shift)),
2027 Op::Mul => return Ok(o.checked_mul(shift)),
2028 Op::Sub => return Ok(o.checked_sub(shift)),
2029 Op::Div => return Ok(o.checked_div(shift)),
2030 Op::Mod => return Ok(o.checked_rem(shift)),
2031 Op::And => return Ok(Some(o & shift)),
2032 Op::Or => return Ok(Some(o | shift)),
2033 Op::Xor => return Ok(Some(o ^ shift)),
2034 }
2035 }
2036
2037 Ok(Some(o))
2038 }
2039}
2040
/// An offset which can be resolved without reading the stream content.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum DirOffset {
    /// Offset counted from the start; the rule base offset, when there is
    /// one, is added to it at resolution time.
    Start(u64),
    /// Signed displacement from the offset reached by the upper level match.
    LastUpper(i64),
    /// Offset relative to the end of the stream, handed as is to
    /// `SeekFrom::End`.
    End(i64),
}
2048
/// Location at which a test has to be run.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Offset {
    /// The location is known without reading the stream content.
    Direct(DirOffset),
    /// The location has to be read from the stream.
    Indirect(IndOffset),
}
2054
2055impl From<DirOffset> for Offset {
2056 fn from(value: DirOffset) -> Self {
2057 Self::Direct(value)
2058 }
2059}
2060
2061impl From<IndOffset> for Offset {
2062 fn from(value: IndOffset) -> Self {
2063 Self::Indirect(value)
2064 }
2065}
2066
2067impl Display for DirOffset {
2068 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2069 match self {
2070 DirOffset::Start(i) => write!(f, "{i}"),
2071 DirOffset::LastUpper(c) => write!(f, "&{c}"),
2072 DirOffset::End(e) => write!(f, "-{e}"),
2073 }
2074 }
2075}
2076
2077impl Default for DirOffset {
2078 fn default() -> Self {
2079 Self::LastUpper(0)
2080 }
2081}
2082
/// A single test line of a magic rule: where to look, what to test and what
/// to report.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Match {
    /// Line number reported in logs and errors.
    line: usize,
    /// Nesting depth of the entry; it is also its continuation level.
    depth: u8,
    /// Where the test has to be run.
    offset: Offset,
    /// The test to run.
    test: Test,
    /// Strength of `test`, computed once at construction.
    test_strength: u64,
    /// Message attached to the entry, if any.
    message: Option<Message>,
}
2092
2093impl From<Use> for Match {
2094 fn from(value: Use) -> Self {
2095 let test = Test::Use(value.switch_endianness, value.rule_name);
2096 let test_strength = test.strength();
2097 Self {
2098 line: value.line,
2099 depth: value.depth,
2100 offset: value.start_offset,
2101 test,
2102 test_strength,
2103 message: value.message,
2104 }
2105 }
2106}
2107
2108impl From<Name> for Match {
2109 fn from(value: Name) -> Self {
2110 let test = Test::Name(value.name);
2111 let test_strength = test.strength();
2112 Self {
2113 line: value.line,
2114 depth: 0,
2115 offset: Offset::Direct(DirOffset::Start(0)),
2116 test,
2117 test_strength,
2118 message: value.message,
2119 }
2120 }
2121}
2122
2123impl Match {
2124 #[inline(always)]
2126 fn offset_from_start<R: Read + Seek>(
2127 &self,
2128 haystack: &mut LazyCache<R>,
2129 rule_base_offset: Option<u64>,
2130 last_level_offset: Option<u64>,
2131 ) -> Result<Option<u64>, io::Error> {
2132 match self.offset {
2133 Offset::Direct(dir_offset) => match dir_offset {
2134 DirOffset::Start(s) => Ok(Some(s)),
2135 DirOffset::LastUpper(shift) => {
2136 let o = last_level_offset.unwrap_or_default() as i64 + shift;
2137
2138 if o >= 0 { Ok(Some(o as u64)) } else { Ok(None) }
2139 }
2140 DirOffset::End(e) => Ok(Some(haystack.offset_from_start(SeekFrom::End(e)))),
2141 },
2142 Offset::Indirect(ind_offset) => {
2143 let Some(o) =
2144 ind_offset.read_offset(haystack, rule_base_offset, last_level_offset)?
2145 else {
2146 return Ok(None);
2147 };
2148
2149 Ok(Some(o))
2150 }
2151 }
2152 }
2153
2154 #[inline]
2167 #[allow(clippy::too_many_arguments)]
2168 fn matches<'a: 'h, 'h, R: Read + Seek>(
2169 &'a self,
2170 source: Option<&str>,
2171 magic: &mut Magic<'a>,
2172 stream_kind: StreamKind,
2173 state: &mut MatchState,
2174 buf_base_offset: Option<u64>,
2175 rule_base_offset: Option<u64>,
2176 last_level_offset: Option<u64>,
2177 haystack: &'h mut LazyCache<R>,
2178 switch_endianness: bool,
2179 db: &'a MagicDb,
2180 depth: usize,
2181 ) -> Result<(bool, Option<MatchRes<'h>>), Error> {
2182 let source = source.unwrap_or("unknown");
2183 let line = self.line;
2184
2185 if depth >= MAX_RECURSION {
2186 return Err(Error::localized(
2187 source,
2188 line,
2189 Error::MaximumRecursion(MAX_RECURSION),
2190 ));
2191 }
2192
2193 if self.test.is_only_binary() && stream_kind.is_text() {
2194 trace!("skip binary test source={source} line={line} stream_kind={stream_kind:?}",);
2195 return Ok((false, None));
2196 }
2197
2198 if self.test.is_only_text() && !stream_kind.is_text() {
2199 trace!("skip text test source={source} line={line} stream_kind={stream_kind:?}",);
2200 return Ok((false, None));
2201 }
2202
2203 let Ok(Some(mut offset)) = self
2204 .offset_from_start(haystack, rule_base_offset, last_level_offset)
2205 .inspect_err(|e| debug!("source={source} line={line} failed at computing offset: {e}"))
2206 else {
2207 return Ok((false, None));
2208 };
2209
2210 offset = match self.offset {
2211 Offset::Indirect(_) => {
2212 buf_base_offset.unwrap_or_default().saturating_add(offset)
2217 }
2218 Offset::Direct(DirOffset::Start(_)) => {
2220 rule_base_offset.unwrap_or_default().saturating_add(offset)
2221 }
2222 _ => offset,
2223 };
2224
2225 match &self.test {
2226 Test::Clear => {
2227 trace!("source={source} line={line} clear");
2228 state.clear_continuation_level(&self.continuation_level());
2229 Ok((true, None))
2230 }
2231
2232 Test::Name(name) => {
2233 trace!(
2234 "source={source} line={line} running rule {name} switch_endianness={switch_endianness}",
2235 );
2236 Ok((true, None))
2237 }
2238
2239 Test::Use(flip_endianness, rule_name) => {
2240 trace!(
2241 "source={source} line={line} use {rule_name} switch_endianness={flip_endianness}",
2242 );
2243
2244 let switch_endianness = switch_endianness ^ flip_endianness;
2246
2247 let dr: &DependencyRule = db.dependencies.get(rule_name).ok_or(
2248 Error::localized(source, line, Error::MissingRule(rule_name.clone())),
2249 )?;
2250
2251 if let Some(msg) = self.message.as_ref() {
2253 magic.push_message(msg.to_string_lossy());
2254 }
2255
2256 let nmatch = dr.rule.magic(
2257 magic,
2258 stream_kind,
2259 buf_base_offset,
2260 Some(offset),
2261 haystack,
2262 db,
2263 switch_endianness,
2264 depth.saturating_add(1),
2265 )?;
2266
2267 let matched = nmatch > 1;
2270 if matched {
2271 state.set_continuation_level(self.continuation_level());
2272 }
2273
2274 Ok((matched, None))
2275 }
2276
2277 Test::Indirect(m) => {
2278 trace!(
2279 "source={source} line={line} indirect mods={:?} offset={offset:#x}",
2280 m
2281 );
2282
2283 let new_buf_base_off = if m.contains(IndirectMod::Relative) {
2284 Some(offset)
2285 } else {
2286 None
2287 };
2288
2289 if let Some(msg) = self.message.as_ref() {
2291 magic.push_message(msg.to_string_lossy());
2292 }
2293
2294 let mut nmatch = 0u64;
2295 for r in db.rules.iter() {
2296 let messages_cnt = magic.message.len();
2297 nmatch = nmatch.saturating_add(r.magic(
2298 magic,
2299 stream_kind,
2300 new_buf_base_off,
2301 Some(offset),
2302 haystack,
2303 db,
2304 false,
2305 depth.saturating_add(1),
2306 )?);
2307
2308 if magic.message.len() != messages_cnt {
2310 break;
2311 }
2312 }
2313
2314 Ok((nmatch > 0, None))
2316 }
2317
2318 Test::Default => {
2319 let ok = !state.get_continuation_level(&self.continuation_level());
2321
2322 trace!("source={source} line={line} default match={ok}");
2323 if ok {
2324 state.set_continuation_level(self.continuation_level());
2325 }
2326
2327 Ok((ok, None))
2328 }
2329
2330 _ => {
2331 if let Err(e) = haystack.seek(SeekFrom::Start(offset)) {
2332 debug!("source={source} line={line} failed to seek in haystack: {e}");
2333 return Ok((false, None));
2334 }
2335
2336 let mut trace_msg = None;
2337
2338 if enabled!(Level::DEBUG) {
2339 trace_msg = Some(vec![format!(
2340 "source={source} line={line} depth={} stream_offset={:#x}",
2341 self.depth,
2342 haystack.lazy_stream_position()
2343 )])
2344 }
2345
2346 if let Ok(opt_test_value) = self
2350 .test
2351 .read_test_value(haystack, switch_endianness)
2352 .inspect_err(|e| {
2353 debug!("source={source} line={line} error while reading test value @{offset}: {e}",)
2354 })
2355 {
2356 if let Some(v) = trace_msg
2357 .as_mut() { v.push(format!("test={}", self.test)) }
2358
2359 if let Some(v) = trace_msg.as_mut(){
2360 let drv = match opt_test_value.as_ref(){
2361 Some(r) => format!("{r:?}"),
2362 None =>String::new(),
2363 };
2364 v.push(format!("read_in_stream={drv}"))
2365 }
2366
2367 let match_res =
2368 opt_test_value.and_then(|tv| self.test.match_value(&tv, stream_kind));
2369
2370 if let Some(v) = trace_msg.as_mut() { v.push(format!(
2371 "message=\"{}\" match={}",
2372 self.message
2373 .as_ref()
2374 .map(|fs| fs.to_string_lossy())
2375 .unwrap_or_default(),
2376 match_res.is_some()
2377 )) }
2378
2379 if enabled!(Level::DEBUG) && !enabled!(Level::TRACE) && match_res.is_some() {
2381 if let Some(m) = trace_msg{
2382 debug!("{}", m.join(" "));
2383 }
2384 } else if enabled!(Level::TRACE)
2385 && let Some(m) = trace_msg{
2386 trace!("{}", m.join(" "));
2387 }
2388
2389 if let Some(mr) = match_res {
2390 state.set_continuation_level(self.continuation_level());
2391 return Ok((true, Some(mr)));
2392 }
2393 }
2394
2395 Ok((false, None))
2396 }
2397 }
2398 }
2399
2400 #[inline(always)]
2401 fn continuation_level(&self) -> ContinuationLevel {
2402 ContinuationLevel(self.depth)
2403 }
2404}
2405
/// A `use` directive, converted into a [`Match`] carrying a `Test::Use`.
#[derive(Debug, Clone)]
struct Use {
    /// Line number of the directive.
    line: usize,
    /// Nesting depth of the directive.
    depth: u8,
    /// Offset handed to the rule being used.
    start_offset: Offset,
    /// Name of the rule to run.
    rule_name: String,
    /// Whether endianness must be flipped while running the rule.
    switch_endianness: bool,
    /// Message attached to the directive, if any.
    message: Option<Message>,
}
2415
/// A modification to apply to the strength of an entry.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StrengthMod {
    /// Operation to apply to the strength.
    op: Op,
    /// Right-hand operand of the operation.
    by: u8,
}
2421
2422impl StrengthMod {
2423 #[inline(always)]
2424 fn apply(&self, strength: u64) -> u64 {
2425 let by = self.by as u64;
2426 debug!("applying strength modifier: {strength} {} {}", self.op, by);
2427 match self.op {
2428 Op::Mul => strength.saturating_mul(by),
2429 Op::Add => strength.saturating_add(by),
2430 Op::Sub => strength.saturating_sub(by),
2431 Op::Div => {
2432 if by > 0 {
2433 strength.saturating_div(by)
2434 } else {
2435 strength
2436 }
2437 }
2438 Op::Mod => strength % by,
2439 Op::And => strength & by,
2440 Op::Xor | Op::Or => {
2443 debug_panic!("unsupported strength operator");
2444 strength
2445 }
2446 }
2447 }
2448}
2449
/// Metadata which can be attached to an entry of a rule.
///
/// NOTE(review): presumably these end up in the `mimetype`, `exts`,
/// `strength_mod` and `apple` fields of `EntryNode`; confirm against the
/// parser.
#[derive(Debug, Clone)]
enum Flag {
    /// A MIME type.
    Mime(String),
    /// A set of file extensions.
    Ext(HashSet<String>),
    /// A strength modifier.
    Strength(StrengthMod),
    /// An Apple specific string.
    Apple(String),
}
2457
/// A `name` directive, converted into a [`Match`] carrying a `Test::Name`.
#[derive(Debug, Clone)]
struct Name {
    /// Line number of the directive.
    line: usize,
    /// Name given to the rule.
    name: String,
    /// Message attached to the directive, if any.
    message: Option<Message>,
}
2464
/// An item of a rule, along with the span of input it comes from.
#[derive(Debug, Clone)]
enum Entry<'span> {
    /// A test entry.
    Match(Span<'span>, Match),
    /// A piece of metadata.
    Flag(Span<'span>, Flag),
}
2470
/// Node of the tree of entries making a rule: an entry, its metadata and the
/// entries nested under it.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EntryNode {
    /// Whether this node is the root of its rule.
    root: bool,
    /// The entry to match.
    entry: Match,
    /// Entries evaluated only once `entry` matched.
    children: Vec<EntryNode>,
    /// MIME type set on the result when `entry` matches.
    mimetype: Option<String>,
    /// Creator code set on the result when `entry` matches.
    apple: Option<String>,
    /// Modifier applied to the strength of `entry` when it matches.
    strength_mod: Option<StrengthMod>,
    /// Extensions added to the result when `entry` matches.
    exts: HashSet<String>,
}
2481
/// Accumulator filled while walking the entries of a rule.
#[derive(Debug, Default)]
struct EntryNodeVisitor {
    /// Extensions collected so far.
    exts: HashSet<String>,
    /// Score accumulated so far.
    score: u64,
}
2487
2488impl EntryNodeVisitor {
2489 fn new() -> Self {
2490 Self {
2491 ..Default::default()
2492 }
2493 }
2494
2495 fn merge(&mut self, other: Self) {
2496 self.exts.extend(other.exts);
2497 self.score += other.score;
2498 }
2499}
2500
2501impl EntryNode {
2502 #[inline]
2503 fn update_visitor(&self, v: &mut EntryNodeVisitor, depth: usize) {
2504 for ext in self.exts.iter() {
2506 if !v.exts.contains(ext) {
2507 v.exts.insert(ext.clone());
2508 }
2509 }
2510
2511 if depth == 0 {
2513 v.score += self.entry.test_strength;
2514 }
2515
2516 v.score += self
2520 .children
2521 .iter()
2522 .map(|e| e.entry.test_strength)
2523 .min()
2524 .unwrap_or_default()
2525 / max(1, depth as u64);
2526 }
2527
2528 fn visit(
2529 &self,
2530 v: &mut EntryNodeVisitor,
2531 deps: &HashMap<String, DependencyRule>,
2532 marked: &mut HashSet<String>,
2533 depth: usize,
2534 ) -> Result<(), Error> {
2535 self.update_visitor(v, depth);
2537
2538 for c in self.children.iter() {
2540 if let Test::Use(_, ref name) = c.entry.test {
2541 if marked.contains(name) {
2542 continue;
2543 }
2544
2545 marked.insert(name.clone());
2546
2547 if let Some(r) = deps.get(name) {
2548 let dv = r.rule.visit_all_entries(deps, marked)?;
2549 v.merge(dv);
2550 } else {
2551 return Err(Error::MissingRule(name.clone()));
2552 }
2553 } else {
2554 c.visit(v, deps, marked, depth + 1)?;
2555 }
2556 }
2557
2558 Ok(())
2559 }
2560
2561 #[inline]
2562 #[allow(clippy::too_many_arguments)]
2563 fn matches<'r, R: Read + Seek>(
2564 &'r self,
2565 opt_source: Option<&str>,
2566 magic: &mut Magic<'r>,
2567 state: &mut MatchState,
2568 stream_kind: StreamKind,
2569 buf_base_offset: Option<u64>,
2570 rule_base_offset: Option<u64>,
2571 last_level_offset: Option<u64>,
2572 haystack: &mut LazyCache<R>,
2573 db: &'r MagicDb,
2574 switch_endianness: bool,
2575 depth: usize,
2576 ) -> Result<u64, Error> {
2577 let mut nmatch = 0u64;
2578
2579 let (ok, opt_match_res) = self.entry.matches(
2580 opt_source,
2581 magic,
2582 stream_kind,
2583 state,
2584 buf_base_offset,
2585 rule_base_offset,
2586 last_level_offset,
2587 haystack,
2588 switch_endianness,
2589 db,
2590 depth,
2591 )?;
2592
2593 let source = opt_source.unwrap_or("unknown");
2594 let line = self.entry.line;
2595
2596 if ok {
2597 nmatch = nmatch.saturating_add(1);
2598
2599 if !self.entry.test.is_recursive()
2603 && let Some(msg) = self.entry.message.as_ref()
2604 && let Ok(msg) = msg.format_with(opt_match_res.as_ref()).inspect_err(|e| {
2605 debug!("source={source} line={line} failed to format message: {e}")
2606 })
2607 {
2608 magic.push_message(msg);
2609 }
2610
2611 if let Some(mr) = opt_match_res {
2613 match &self.entry.test {
2614 Test::String(t) => {
2615 if t.has_length_mod() {
2616 let o = mr.end_offset();
2617 haystack.seek(SeekFrom::Start(o))?;
2618 }
2619 }
2620 Test::Search(t) => {
2621 if t.re_mods.contains(ReMod::StartOffsetUpdate) {
2622 let o = mr.start_offset();
2623 haystack.seek(SeekFrom::Start(o))?;
2624 } else {
2625 let o = mr.end_offset();
2626 haystack.seek(SeekFrom::Start(o))?;
2627 }
2628 }
2629
2630 Test::Regex(t) => {
2631 if t.mods.contains(ReMod::StartOffsetUpdate) {
2632 let o = mr.start_offset();
2633 haystack.seek(SeekFrom::Start(o))?;
2634 } else {
2635 let o = mr.end_offset();
2636 haystack.seek(SeekFrom::Start(o))?;
2637 }
2638 }
2639 _ => {}
2641 }
2642 }
2643
2644 if let Some(mimetype) = self.mimetype.as_ref() {
2645 magic.set_mime_type(Cow::Borrowed(mimetype));
2646 }
2647
2648 if let Some(apple_ty) = self.apple.as_ref() {
2649 magic.set_creator_code(Cow::Borrowed(apple_ty));
2650 }
2651
2652 if !self.exts.is_empty() {
2653 magic.insert_extensions(self.exts.iter().map(|s| s.as_str()));
2654 }
2655
2656 let mut strength = self.entry.test_strength;
2660
2661 let continuation_level = self.entry.continuation_level().0 as u64;
2662 if self.entry.message.is_none() && continuation_level < 3 {
2663 strength = strength.saturating_add(continuation_level);
2664 }
2665
2666 if let Some(sm) = self.strength_mod.as_ref() {
2667 strength = sm.apply(strength);
2668 }
2669
2670 if self.entry.message.is_none() {
2672 strength += 1
2673 }
2674
2675 magic.update_strength(strength);
2676
2677 let end_upper_level = haystack.lazy_stream_position();
2678
2679 let rule_base_offset = if self.root {
2687 match self.entry.offset {
2688 Offset::Direct(DirOffset::End(o)) => {
2689 Some(haystack.offset_from_start(SeekFrom::End(o)))
2690 }
2691 _ => rule_base_offset,
2692 }
2693 } else {
2694 rule_base_offset
2695 };
2696
2697 for e in self.children.iter() {
2698 nmatch = nmatch.saturating_add(e.matches(
2699 opt_source,
2700 magic,
2701 state,
2702 stream_kind,
2703 buf_base_offset,
2704 rule_base_offset,
2705 Some(end_upper_level),
2706 haystack,
2707 db,
2708 switch_endianness,
2709 depth,
2710 )?);
2711 }
2712 }
2713
2714 Ok(nmatch)
2715 }
2716}
2717
/// A magic rule: a tree of entries to match against a stream.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicRule {
    /// Identifier of the rule.
    id: usize,
    /// Source the rule comes from, if known.
    source: Option<String>,
    /// Root of the tree of entries.
    entries: EntryNode,
    /// Extensions the rule may report; completed when the rule is finalized.
    extensions: HashSet<String>,
    /// Score of the rule, computed when the rule is finalized.
    score: u64,
    /// Whether the rule has been finalized.
    finalized: bool,
}
2729
2730impl MagicRule {
2731 #[inline(always)]
2732 fn set_id(&mut self, id: usize) {
2733 self.id = id
2734 }
2735
2736 fn visit_all_entries(
2737 &self,
2738 deps: &HashMap<String, DependencyRule>,
2739 marked: &mut HashSet<String>,
2740 ) -> Result<EntryNodeVisitor, Error> {
2741 let mut v = EntryNodeVisitor::new();
2742 self.entries.visit(&mut v, deps, marked, 0)?;
2743 Ok(v)
2744 }
2745
2746 fn try_finalize(&mut self, deps: &HashMap<String, DependencyRule>) -> Result<(), Error> {
2749 if self.finalized {
2750 return Ok(());
2751 }
2752
2753 let v = self.visit_all_entries(deps, &mut HashSet::new())?;
2755
2756 self.extensions.extend(v.exts);
2757 self.score = v.score;
2758 self.finalized = true;
2759
2760 Ok(())
2761 }
2762
2763 #[inline]
2764 fn magic_entrypoint<'r, R: Read + Seek>(
2765 &'r self,
2766 magic: &mut Magic<'r>,
2767 stream_kind: StreamKind,
2768 haystack: &mut LazyCache<R>,
2769 db: &'r MagicDb,
2770 switch_endianness: bool,
2771 depth: usize,
2772 ) -> Result<u64, Error> {
2773 self.entries.matches(
2774 self.source.as_deref(),
2775 magic,
2776 &mut MatchState::empty(),
2777 stream_kind,
2778 None,
2779 None,
2780 None,
2781 haystack,
2782 db,
2783 switch_endianness,
2784 depth,
2785 )
2786 }
2787
2788 #[inline]
2789 #[allow(clippy::too_many_arguments)]
2790 fn magic<'r, R: Read + Seek>(
2791 &'r self,
2792 magic: &mut Magic<'r>,
2793 stream_kind: StreamKind,
2794 buf_base_offset: Option<u64>,
2795 rule_base_offset: Option<u64>,
2796 haystack: &mut LazyCache<R>,
2797 db: &'r MagicDb,
2798 switch_endianness: bool,
2799 depth: usize,
2800 ) -> Result<u64, Error> {
2801 self.entries.matches(
2802 self.source.as_deref(),
2803 magic,
2804 &mut MatchState::empty(),
2805 stream_kind,
2806 buf_base_offset,
2807 rule_base_offset,
2808 None,
2809 haystack,
2810 db,
2811 switch_endianness,
2812 depth,
2813 )
2814 }
2815
2816 pub fn is_text(&self) -> bool {
2822 self.entries.entry.test.is_text()
2823 && self.entries.children.iter().all(|e| e.entry.test.is_text())
2824 }
2825
2826 #[inline(always)]
2832 pub fn score(&self) -> u64 {
2833 self.score
2834 }
2835
2836 #[inline(always)]
2842 pub fn source(&self) -> Option<&str> {
2843 self.source.as_deref()
2844 }
2845
2846 #[inline(always)]
2852 pub fn line(&self) -> usize {
2853 self.entries.entry.line
2854 }
2855
2856 #[inline(always)]
2862 pub fn extensions(&self) -> &HashSet<String> {
2863 &self.extensions
2864 }
2865}
2866
/// A named rule, which other rules run through a `use` test.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct DependencyRule {
    /// Name of the rule.
    name: String,
    /// The rule itself.
    rule: MagicRule,
}
2872
/// The rules parsed from a magic file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicSource {
    /// Rules found in the file.
    rules: Vec<MagicRule>,
    /// Named rules found in the file, by name.
    dependencies: HashMap<String, DependencyRule>,
}
2883
impl MagicSource {
    /// Opens and parses the magic file located at `p`.
    ///
    /// # Errors
    ///
    /// Whatever error the parser returns for that file.
    pub fn open<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
        FileMagicParser::parse_file(p)
    }
}
2898
/// Continuation level of an entry, i.e. its depth. It indexes the flags held
/// by `MatchState`.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct ContinuationLevel(u8);
2901
/// Encoding of a text stream.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum TextEncoding {
    /// ASCII text.
    Ascii,
    /// UTF-8 text.
    Utf8,
    /// Text in an unidentified encoding.
    Unknown,
}
2909
2910impl TextEncoding {
2911 const fn as_magic_str(&self) -> &'static str {
2912 match self {
2913 TextEncoding::Ascii => "ASCII",
2914 TextEncoding::Utf8 => "UTF-8",
2915 TextEncoding::Unknown => "Unknown",
2916 }
2917 }
2918}
2919
/// Kind of content a stream holds.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum StreamKind {
    /// Binary content.
    Binary,
    /// Text content, in the given encoding.
    Text(TextEncoding),
}
2925
2926impl StreamKind {
2927 const fn is_text(&self) -> bool {
2928 matches!(self, StreamKind::Text(_))
2929 }
2930}
2931
/// State kept while matching the entries of a rule.
#[derive(Debug)]
struct MatchState {
    /// One flag per continuation level, indexed by the level value.
    continuation_levels: [bool; 256],
}
2936
2937impl MatchState {
2938 #[inline(always)]
2939 fn empty() -> Self {
2940 MatchState {
2941 continuation_levels: [false; 256],
2942 }
2943 }
2944
2945 #[inline(always)]
2946 fn get_continuation_level(&mut self, level: &ContinuationLevel) -> bool {
2947 self.continuation_levels
2948 .get(level.0 as usize)
2949 .cloned()
2950 .unwrap_or_default()
2951 }
2952
2953 #[inline(always)]
2954 fn set_continuation_level(&mut self, level: ContinuationLevel) {
2955 if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2956 *b = true
2957 }
2958 }
2959
2960 #[inline(always)]
2961 fn clear_continuation_level(&mut self, level: &ContinuationLevel) {
2962 if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2963 *b = false;
2964 }
2965 }
2966}
2967
/// Result of the identification of a stream.
#[derive(Debug, Default)]
pub struct Magic<'m> {
    /// Kind of the stream; it decides the MIME type reported when none is set.
    stream_kind: Option<StreamKind>,
    /// Source of the rule the result comes from, if known.
    source: Option<Cow<'m, str>>,
    /// Parts of the message, in the order they have been pushed.
    message: Vec<Cow<'m, str>>,
    /// MIME type, if a matching entry set one.
    mime_type: Option<Cow<'m, str>>,
    /// Creator code, if a matching entry set one.
    creator_code: Option<Cow<'m, str>>,
    /// Sum (saturating) of the strengths of the entries which matched.
    strength: u64,
    /// Extensions collected from the entries which matched.
    exts: HashSet<Cow<'m, str>>,
    /// NOTE(review): presumably tells that the result is a default one, i.e.
    /// that no rule matched; to be confirmed against the code setting it.
    is_default: bool,
}
2980
2981impl<'m> Magic<'m> {
2982 #[inline(always)]
2983 fn set_source(&mut self, source: Option<&'m str>) {
2984 self.source = source.map(Cow::Borrowed);
2985 }
2986
    /// Records the stream classification, overwriting any previous value.
    #[inline(always)]
    fn set_stream_kind(&mut self, stream_kind: StreamKind) {
        self.stream_kind = Some(stream_kind)
    }
2991
2992 #[inline(always)]
2993 fn reset(&mut self) {
2994 self.stream_kind = None;
2995 self.source = None;
2996 self.message.clear();
2997 self.mime_type = None;
2998 self.creator_code = None;
2999 self.strength = 0;
3000 self.exts.clear();
3001 self.is_default = false;
3002 }
3003
3004 #[inline]
3012 pub fn into_owned<'owned>(self) -> Magic<'owned> {
3013 Magic {
3014 stream_kind: self.stream_kind,
3015 source: self.source.map(|s| Cow::Owned(s.into_owned())),
3016 message: self
3017 .message
3018 .into_iter()
3019 .map(Cow::into_owned)
3020 .map(Cow::Owned)
3021 .collect(),
3022 mime_type: self.mime_type.map(|m| Cow::Owned(m.into_owned())),
3023 creator_code: self.creator_code.map(|m| Cow::Owned(m.into_owned())),
3024 strength: self.strength,
3025 exts: self
3026 .exts
3027 .into_iter()
3028 .map(|e| Cow::Owned(e.into_owned()))
3029 .collect(),
3030 is_default: self.is_default,
3031 }
3032 }
3033
3034 #[inline(always)]
3040 pub fn message(&self) -> String {
3041 let mut out = String::new();
3042 for (i, m) in self.message.iter().enumerate() {
3043 if let Some(s) = m.strip_prefix(r#"\b"#) {
3044 out.push_str(s);
3045 } else {
3046 if i > 0 {
3048 out.push(' ');
3049 }
3050 out.push_str(m);
3051 }
3052 }
3053 out
3054 }
3055
3056 #[inline]
3067 pub fn message_parts(&self) -> impl Iterator<Item = &str> {
3068 self.message.iter().map(|p| p.as_ref())
3069 }
3070
3071 #[inline(always)]
3072 fn update_strength(&mut self, value: u64) {
3073 self.strength = self.strength.saturating_add(value);
3074 debug!("updated strength = {:?}", self.strength)
3075 }
3076
3077 #[inline(always)]
3083 pub fn mime_type(&self) -> &str {
3084 self.mime_type.as_deref().unwrap_or(match self.stream_kind {
3085 Some(StreamKind::Text(_)) => DEFAULT_TEXT_MIMETYPE,
3086 Some(StreamKind::Binary) | None => DEFAULT_BIN_MIMETYPE,
3087 })
3088 }
3089
3090 #[inline(always)]
3091 fn push_message<'a: 'm>(&mut self, msg: Cow<'a, str>) {
3092 if !msg.is_empty() {
3093 debug!("pushing message: msg={msg} len={}", msg.len());
3094 self.message.push(msg);
3095 }
3096 }
3097
3098 #[inline(always)]
3099 fn set_mime_type<'a: 'm>(&mut self, mime: Cow<'a, str>) {
3100 if self.mime_type.is_none() {
3101 debug!("insert mime: {:?}", mime);
3102 self.mime_type = Some(mime)
3103 }
3104 }
3105
3106 #[inline(always)]
3107 fn set_creator_code<'a: 'm>(&mut self, apple_ty: Cow<'a, str>) {
3108 if self.creator_code.is_none() {
3109 debug!("insert apple type: {apple_ty:?}");
3110 self.creator_code = Some(apple_ty)
3111 }
3112 }
3113
3114 #[inline(always)]
3115 fn insert_extensions<'a: 'm, I: Iterator<Item = &'a str>>(&mut self, exts: I) {
3116 if self.exts.is_empty() {
3117 self.exts.extend(exts.filter_map(|e| {
3118 if e.is_empty() {
3119 None
3120 } else {
3121 Some(Cow::Borrowed(e))
3122 }
3123 }));
3124 }
3125 }
3126
    /// Returns the accumulated strength of this result.
    #[inline(always)]
    pub fn strength(&self) -> u64 {
        self.strength
    }
3138
    /// Returns the name of the source that produced this result, if any.
    #[inline(always)]
    pub fn source(&self) -> Option<&str> {
        self.source.as_deref()
    }
3148
    /// Returns the creator code attached to this result, if any.
    #[inline(always)]
    pub fn creator_code(&self) -> Option<&str> {
        self.creator_code.as_deref()
    }
3158
    /// Returns the set of file extensions attached to this result.
    #[inline(always)]
    pub fn extensions(&self) -> &HashSet<Cow<'m, str>> {
        &self.exts
    }
3168
    /// Tells whether this result was produced by the default fallback
    /// rather than by a matching rule.
    #[inline(always)]
    pub fn is_default(&self) -> bool {
        self.is_default
    }
3178}
3179
/// Database of magic rules that inputs are matched against.
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct MagicDb {
    /// Next identifier to hand out to a loaded rule.
    rule_id: usize,
    /// Loaded rules.
    rules: Vec<MagicRule>,
    /// Dependency rules, looked up by name.
    dependencies: HashMap<String, DependencyRule>,
    /// Number of rules counted as finalized.
    finalized: usize,
}
3188
/// Heuristically decides whether `bytes` looks like text.
///
/// An empty buffer, or one containing a NUL byte, is never text. Otherwise
/// tab, line feed, carriage return and the printable ASCII range
/// (`0x20..=0x7E`) count as printable, and every other byte (remaining
/// control characters, `0x7F` and anything `>= 0x80`) counts as a high byte.
/// The buffer is reported as text when more than 85% of its bytes are
/// printable and fewer than 20% are high bytes.
#[inline(always)]
fn is_likely_text(bytes: &[u8]) -> bool {
    if bytes.is_empty() {
        return false;
    }

    // Integer counters are exact; they are only turned into ratios at the end.
    let mut printable = 0usize;
    let mut high_bytes = 0usize;

    for &byte in bytes {
        match byte {
            // a NUL byte settles the question immediately
            0x00 => return false,
            b'\t' | b'\n' | b'\r' | 0x20..=0x7E => printable += 1,
            _ => high_bytes += 1,
        }
    }

    let total = bytes.len() as f64;
    let printable_ratio = printable as f64 / total;
    let high_bytes_ratio = high_bytes as f64 / total;

    printable_ratio > 0.85 && high_bytes_ratio < 0.20
}
3231
3232#[inline(always)]
3233fn guess_stream_kind<S: AsRef<[u8]>>(stream: S) -> StreamKind {
3234 let buf = stream.as_ref();
3235
3236 match run_utf8_validation(buf) {
3237 Ok(is_ascii) => {
3238 if is_ascii {
3239 StreamKind::Text(TextEncoding::Ascii)
3240 } else {
3241 StreamKind::Text(TextEncoding::Utf8)
3242 }
3243 }
3244 Err(e) => {
3245 if is_likely_text(&buf[e.valid_up_to..]) {
3246 StreamKind::Text(TextEncoding::Unknown)
3247 } else {
3248 StreamKind::Binary
3249 }
3250 }
3251 }
3252}
3253
3254impl MagicDb {
3255 pub fn optimal_lazy_cache<R: Read + Seek>(f: R) -> Result<LazyCache<R>, io::Error> {
3258 Ok(LazyCache::<R>::from_read_seek(f)
3259 .and_then(|lc| lc.with_hot_cache(2 * FILE_BYTES_MAX))?)
3260 .map(|lc| lc.with_warm_cache(100 << 20))
3261 }
3262
    /// Creates an empty database, with no rules and no dependencies.
    pub fn new() -> Self {
        Self::default()
    }
3271
3272 #[inline(always)]
3273 fn next_rule_id(&mut self) -> usize {
3274 let t = self.rule_id;
3275 self.rule_id += 1;
3276 t
3277 }
3278
3279 #[inline(always)]
3280 fn try_json<R: Read + Seek>(
3281 haystack: &mut LazyCache<R>,
3282 stream_kind: StreamKind,
3283 magic: &mut Magic,
3284 ) -> Result<bool, Error> {
3285 if matches!(stream_kind, StreamKind::Binary) {
3287 return Ok(false);
3288 }
3289
3290 let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?.trim_ascii();
3291
3292 let Some((start, end)) = find_json_boundaries(buf) else {
3293 return Ok(false);
3294 };
3295
3296 for c in buf[0..start].iter() {
3299 if !c.is_ascii_whitespace() {
3300 return Ok(false);
3301 }
3302 }
3303
3304 let mut is_ndjson = false;
3305
3306 trace!("maybe a json document");
3307 let ok = serde_json::from_slice::<serde_json::Value>(&buf[start..=end]).is_ok();
3308 if !ok {
3309 return Ok(false);
3310 }
3311
3312 if end + 1 < buf.len() {
3314 let buf = &buf[end + 1..];
3316 if let Some((second_start, second_end)) = find_json_boundaries(buf) {
3317 if memchr(b'\n', &buf[..second_start]).is_some() {
3319 trace!("might be ndjson");
3320 is_ndjson = serde_json::from_slice::<serde_json::Value>(
3321 &buf[second_start..=second_end],
3322 )
3323 .is_ok();
3324 }
3325 }
3326 }
3327
3328 if is_ndjson {
3329 magic.push_message(Cow::Borrowed("New Line Delimited"));
3330 magic.set_mime_type(Cow::Borrowed("application/x-ndjson"));
3331 magic.insert_extensions(["ndjson", "jsonl"].into_iter());
3332 } else {
3333 magic.set_mime_type(Cow::Borrowed("application/json"));
3334 magic.insert_extensions(["json"].into_iter());
3335 }
3336
3337 magic.push_message(Cow::Borrowed("JSON text data"));
3338 magic.set_source(Some(HARDCODED_SOURCE));
3339 magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3340 Ok(true)
3341 }
3342
3343 #[inline(always)]
3344 fn try_csv<R: Read + Seek>(
3345 haystack: &mut LazyCache<R>,
3346 stream_kind: StreamKind,
3347 magic: &mut Magic,
3348 ) -> Result<bool, Error> {
3349 let StreamKind::Text(enc) = stream_kind else {
3351 return Ok(false);
3352 };
3353
3354 let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3355 let mut reader = csv::Reader::from_reader(io::Cursor::new(buf));
3356 let mut records = reader.records();
3357
3358 let Some(Ok(first)) = records.next() else {
3359 return Ok(false);
3360 };
3361
3362 if first.len() <= 1 {
3366 return Ok(false);
3367 }
3368
3369 let mut n = 1;
3371 for i in records.take(9) {
3372 if let Ok(rec) = i {
3373 if first.len() != rec.len() {
3374 return Ok(false);
3375 }
3376 } else {
3377 return Ok(false);
3378 }
3379 n += 1;
3380 }
3381
3382 if n != 10 {
3384 return Ok(false);
3385 }
3386
3387 magic.set_mime_type(Cow::Borrowed("text/csv"));
3388 magic.push_message(Cow::Borrowed("CSV"));
3389 magic.push_message(Cow::Borrowed(enc.as_magic_str()));
3390 magic.push_message(Cow::Borrowed("text"));
3391 magic.insert_extensions(["csv"].into_iter());
3392 magic.set_source(Some(HARDCODED_SOURCE));
3393 magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3394 Ok(true)
3395 }
3396
3397 #[inline(always)]
3398 fn try_tar<R: Read + Seek>(
3399 haystack: &mut LazyCache<R>,
3400 stream_kind: StreamKind,
3401 magic: &mut Magic,
3402 ) -> Result<bool, Error> {
3403 if !matches!(stream_kind, StreamKind::Binary) {
3405 return Ok(false);
3406 }
3407
3408 let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3409 let mut ar = Archive::new(io::Cursor::new(buf));
3410
3411 let Ok(mut entries) = ar.entries() else {
3412 return Ok(false);
3413 };
3414
3415 let Some(Ok(first)) = entries.next() else {
3416 return Ok(false);
3417 };
3418
3419 let header = first.header();
3420
3421 if header.as_ustar().is_some() {
3422 magic.push_message(Cow::Borrowed("POSIX tar archive"));
3423 } else if header.as_gnu().is_some() {
3424 magic.push_message(Cow::Borrowed("POSIX tar archive (GNU)"));
3425 } else {
3426 magic.push_message(Cow::Borrowed("tar archive"));
3427 }
3428
3429 magic.set_mime_type(Cow::Borrowed("application/x-tar"));
3430 magic.set_source(Some(HARDCODED_SOURCE));
3431 magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3432 magic.insert_extensions(["tar"].into_iter());
3433 Ok(true)
3434 }
3435
3436 #[inline(always)]
3437 fn try_hard_magic<R: Read + Seek>(
3438 haystack: &mut LazyCache<R>,
3439 stream_kind: StreamKind,
3440 magic: &mut Magic,
3441 ) -> Result<bool, Error> {
3442 Ok(Self::try_json(haystack, stream_kind, magic)?
3443 || Self::try_csv(haystack, stream_kind, magic)?
3444 || Self::try_tar(haystack, stream_kind, magic)?)
3445 }
3446
3447 #[inline(always)]
3448 fn magic_default<'m, R: Read + Seek>(
3449 cache: &mut LazyCache<R>,
3450 stream_kind: StreamKind,
3451 magic: &mut Magic<'m>,
3452 ) {
3453 magic.set_source(Some(HARDCODED_SOURCE));
3454 magic.set_stream_kind(stream_kind);
3455 magic.is_default = true;
3456
3457 if cache.data_size() == 0 {
3458 magic.push_message(Cow::Borrowed("empty"));
3459 magic.set_mime_type(Cow::Borrowed(DEFAULT_BIN_MIMETYPE));
3460 }
3461
3462 match stream_kind {
3463 StreamKind::Binary => {
3464 magic.push_message(Cow::Borrowed("data"));
3465 }
3466 StreamKind::Text(e) => {
3467 magic.push_message(Cow::Borrowed(e.as_magic_str()));
3468 magic.push_message(Cow::Borrowed("text"));
3469 }
3470 }
3471 }
3472
3473 fn load_rules_no_prepare(&mut self, rules: Vec<MagicRule>) {
3474 for rule in rules.into_iter() {
3475 let mut rule = rule;
3476 rule.set_id(self.next_rule_id());
3477
3478 self.rules.push(rule);
3479 }
3480 }
3481
3482 pub fn load(&mut self, ms: MagicSource) -> &mut Self {
3488 self.load_rules_no_prepare(ms.rules);
3489 self.dependencies.extend(ms.dependencies);
3490 self.try_finalize();
3491 self
3492 }
3493
3494 pub fn load_bulk<I: Iterator<Item = MagicSource>>(&mut self, it: I) -> &mut Self {
3499 for ms in it {
3500 self.load_rules_no_prepare(ms.rules);
3501 self.dependencies.extend(ms.dependencies);
3502 }
3503 self.try_finalize();
3504 self
3505 }
3506
    /// Returns the rules currently held by the database.
    pub fn rules(&self) -> &[MagicRule] {
        &self.rules
    }
3515
3516 #[inline]
3517 fn first_magic_with_stream_kind<R: Read + Seek>(
3518 &self,
3519 haystack: &mut LazyCache<R>,
3520 stream_kind: StreamKind,
3521 extension: Option<&str>,
3522 ) -> Result<Magic<'_>, Error> {
3523 let mut magic = Magic::default();
3525
3526 if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3527 return Ok(magic);
3528 }
3529
3530 let mut marked = vec![false; self.rules.len()];
3531
3532 macro_rules! do_magic {
3533 ($rule: expr) => {{
3534 $rule.magic_entrypoint(&mut magic, stream_kind, haystack, &self, false, 0)?;
3535
3536 if !magic.message.is_empty() {
3537 magic.set_stream_kind(stream_kind);
3538 magic.set_source($rule.source.as_deref());
3539 return Ok(magic);
3540 }
3541
3542 magic.reset();
3543 }};
3544 }
3545
3546 if let Some(ext) = extension.map(|e| e.to_lowercase())
3547 && !ext.is_empty()
3548 {
3549 for rule in self.rules.iter().filter(|r| r.extensions.contains(&ext)) {
3550 do_magic!(rule);
3551 if let Some(f) = marked.get_mut(rule.id) {
3552 *f = true
3553 }
3554 }
3555 }
3556
3557 for rule in self
3558 .rules
3559 .iter()
3560 .filter(|r| !*marked.get(r.id).unwrap_or(&false))
3562 {
3563 do_magic!(rule)
3564 }
3565
3566 Self::magic_default(haystack, stream_kind, &mut magic);
3567
3568 Ok(magic)
3569 }
3570
3571 pub fn first_magic<R: Read + Seek>(
3594 &self,
3595 r: &mut R,
3596 extension: Option<&str>,
3597 ) -> Result<Magic<'_>, Error> {
3598 let mut cache = Self::optimal_lazy_cache(r)?;
3599 let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3600 self.first_magic_with_stream_kind(&mut cache, stream_kind, extension)
3601 }
3602
3603 pub fn first_magic_with_lazy_cache<R: Read + Seek>(
3632 &self,
3633 cache: &mut LazyCache<R>,
3634 extension: Option<&str>,
3635 ) -> Result<Magic<'_>, Error> {
3636 let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3637 self.first_magic_with_stream_kind(cache, stream_kind, extension)
3638 }
3639
3640 #[inline(always)]
3641 fn all_magics_sort_with_stream_kind<R: Read + Seek>(
3642 &self,
3643 haystack: &mut LazyCache<R>,
3644 stream_kind: StreamKind,
3645 ) -> Result<Vec<Magic<'_>>, Error> {
3646 let mut out = Vec::new();
3647
3648 let mut magic = Magic::default();
3649
3650 if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3651 out.push(magic);
3652 magic = Magic::default();
3653 }
3654
3655 for rule in self.rules.iter() {
3656 rule.magic_entrypoint(&mut magic, stream_kind, haystack, self, false, 0)?;
3657
3658 if !magic.message.is_empty() {
3660 magic.set_stream_kind(stream_kind);
3661 magic.set_source(rule.source.as_deref());
3662 out.push(magic);
3663 magic = Magic::default();
3664 }
3665
3666 magic.reset();
3667 }
3668
3669 Self::magic_default(haystack, stream_kind, &mut magic);
3670 out.push(magic);
3671
3672 out.sort_by_key(|b| std::cmp::Reverse(b.strength()));
3673
3674 Ok(out)
3675 }
3676
3677 pub fn all_magics<R: Read + Seek>(&self, r: &mut R) -> Result<Vec<Magic<'_>>, Error> {
3687 let mut cache = Self::optimal_lazy_cache(r)?;
3688 let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3689 self.all_magics_sort_with_stream_kind(&mut cache, stream_kind)
3690 }
3691
3692 pub fn all_magics_with_lazy_cache<R: Read + Seek>(
3708 &self,
3709 cache: &mut LazyCache<R>,
3710 ) -> Result<Vec<Magic<'_>>, Error> {
3711 let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3712 self.all_magics_sort_with_stream_kind(cache, stream_kind)
3713 }
3714
3715 #[inline(always)]
3716 fn best_magic_with_stream_kind<R: Read + Seek>(
3717 &self,
3718 haystack: &mut LazyCache<R>,
3719 stream_kind: StreamKind,
3720 ) -> Result<Magic<'_>, Error> {
3721 let magics = self.all_magics_sort_with_stream_kind(haystack, stream_kind)?;
3722
3723 Ok(magics.into_iter().next().unwrap_or_else(|| {
3726 let mut magic = Magic::default();
3727 Self::magic_default(haystack, stream_kind, &mut magic);
3728 magic
3729 }))
3730 }
3731
3732 pub fn best_magic<R: Read + Seek>(&self, r: &mut R) -> Result<Magic<'_>, Error> {
3742 let mut cache = Self::optimal_lazy_cache(r)?;
3743 let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3744 self.best_magic_with_stream_kind(&mut cache, stream_kind)
3745 }
3746
3747 pub fn best_magic_with_lazy_cache<R: Read + Seek>(
3763 &self,
3764 cache: &mut LazyCache<R>,
3765 ) -> Result<Magic<'_>, Error> {
3766 let stream_kind = guess_stream_kind(cache.read_range(0..FILE_BYTES_MAX as u64)?);
3767 self.best_magic_with_stream_kind(cache, stream_kind)
3768 }
3769
3770 pub fn serialize<W: Write>(self, w: &mut W) -> Result<(), Error> {
3776 let mut encoder = GzEncoder::new(w, Compression::best());
3777
3778 bincode::serde::encode_into_std_write(&self, &mut encoder, bincode::config::standard())?;
3779 encoder.finish()?;
3780 Ok(())
3781 }
3782
3783 pub fn deserialize<R: Read>(r: &mut R) -> Result<Self, Error> {
3793 let mut buf = vec![];
3794 let mut gz = GzDecoder::new(r);
3795 gz.read_to_end(&mut buf).map_err(|e| {
3796 bincode::error::DecodeError::OtherString(format!("failed to read: {e}"))
3797 })?;
3798 let (sdb, _): (MagicDb, usize) =
3799 bincode::serde::decode_from_slice(&buf, bincode::config::standard())?;
3800 Ok(sdb)
3801 }
3802
3803 pub fn verify(&mut self) -> Result<(), Error> {
3810 if self.rules.len() == self.finalized {
3811 return Ok(());
3812 }
3813
3814 for r in self.rules.iter_mut().filter(|r| !r.finalized) {
3815 r.try_finalize(&self.dependencies).map_err(|e| {
3817 Error::Verify(
3818 r.source.clone().unwrap_or(String::from("unknown")),
3819 r.line(),
3820 e.into(),
3821 )
3822 })?;
3823 self.finalized += 1;
3824 }
3825
3826 debug_assert!(self.finalized <= self.rules.len());
3827
3828 Ok(())
3829 }
3830
3831 #[inline(always)]
3832 fn try_finalize(&mut self) {
3833 if self.rules.len() == self.finalized {
3834 return;
3835 }
3836
3837 let mut finalized = 0usize;
3838 self.rules.iter_mut().for_each(|r| {
3839 if r.try_finalize(&self.dependencies).is_ok() {
3840 finalized += 1;
3841 }
3842 });
3843
3844 self.finalized = finalized;
3845
3846 debug_assert!(self.finalized <= self.rules.len());
3847
3848 self.rules.sort_by_key(|r| (r.is_text(), -(r.score as i64)));
3850 }
3851}
3852
3853#[cfg(test)]
3854mod tests {
3855 use std::io::Cursor;
3856
3857 use regex::bytes::Regex;
3858
3859 use crate::utils::unix_local_time_to_string;
3860
3861 use super::*;
3862
    // Builds a `LazyCache` over an in-memory cursor wrapping the given literal.
    macro_rules! lazy_cache {
        ($l: literal) => {
            LazyCache::from_read_seek(Cursor::new($l)).unwrap()
        };
    }
3868
3869 fn first_magic(
3870 rule: &str,
3871 content: &[u8],
3872 stream_kind: StreamKind,
3873 ) -> Result<Magic<'static>, Error> {
3874 let mut md = MagicDb::new();
3875 md.load(
3876 FileMagicParser::parse_str(rule, None)
3877 .inspect_err(|e| eprintln!("{e}"))
3878 .unwrap(),
3879 );
3880 let mut reader = LazyCache::from_read_seek(Cursor::new(content)).unwrap();
3881 let v = md.best_magic_with_stream_kind(&mut reader, stream_kind)?;
3882 Ok(v.into_owned())
3883 }
3884
    // Debugging aid: installs a tracing subscriber at TRACE level. The
    // result of `try_init` is not inspected. Allowed to be unused because
    // it is only invoked while debugging a test.
    #[allow(unused_macros)]
    macro_rules! enable_trace {
        () => {
            tracing_subscriber::fmt()
                .with_max_level(tracing_subscriber::filter::LevelFilter::TRACE)
                .try_init();
        };
    }
3894
    // Parses the given rule literal, printing the error and panicking when
    // parsing fails.
    macro_rules! parse_assert {
        ($rule:literal) => {
            FileMagicParser::parse_str($rule, None)
                .inspect_err(|e| eprintln!("{e}"))
                .unwrap()
        };
    }
3902
    // Runs `first_magic` on binary content. The two-argument form only
    // requires detection to succeed; the three-argument form also compares
    // the resulting message with `$message`.
    macro_rules! assert_magic_match_bin {
        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Binary).unwrap() }};
        ($rule: literal, $content:literal, $message:expr) => {{
            assert_eq!(
                first_magic($rule, $content, StreamKind::Binary)
                    .unwrap()
                    .message(),
                $message
            );
        }};
    }
3914
    // Runs `first_magic` on UTF-8 text content. The two-argument form only
    // requires detection to succeed; the three-argument form also compares
    // the resulting message with `$message`.
    macro_rules! assert_magic_match_text {
        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8)).unwrap() }};
        ($rule: literal, $content:literal, $message:expr) => {{
            assert_eq!(
                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
                    .unwrap()
                    .message(),
                $message
            );
        }};
    }
3926
    // Asserts that, for UTF-8 text content, the rule does not match, i.e.
    // that the default magic is returned.
    macro_rules! assert_magic_not_match_text {
        ($rule: literal, $content:literal) => {{
            assert!(
                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
                    .unwrap()
                    .is_default()
            );
        }};
    }
3936
    // Asserts that, for binary content, the rule does not match, i.e. that
    // the default magic is returned.
    macro_rules! assert_magic_not_match_bin {
        ($rule: literal, $content:literal) => {{
            assert!(
                first_magic($rule, $content, StreamKind::Binary)
                    .unwrap()
                    .is_default()
            );
        }};
    }
3946
    // Exercises `regex` rules on text and binary inputs, including relative
    // offsets (`&0`) following a regex match and the `/s` modifier.
    #[test]
    fn test_regex() {
        assert_magic_match_text!(
            r#"
0 regex/1024 \^#![[:space:]]*/usr/bin/env[[:space:]]+
!:mime text/x-shellscript
>&0 regex/64 .*($|\\b) %s shell script text executable
 "#,
            br#"#!/usr/bin/env bash
 echo hello world"#,
            "bash shell script text executable"
        );

        // sanity check of the regex crate itself on raw bytes
        let re = Regex::new(r"(?-u)\x42\x82").unwrap();
        assert!(re.is_match(b"\x42\x82"));

        assert_magic_match_bin!(
            r#"0 regex \x42\x82 binary regex match"#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82"
        );

        assert_magic_match_bin!(
            r#"
 0 regex \x42\x82
 >&0 string \xde\xad\xbe\xef it works
 "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        assert_magic_match_bin!(
            r#"
 0 regex/s \x42\x82
 >&0 string \x42\x82\xde\xad\xbe\xef it works
 "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        assert_magic_match_text!(
            r#"
0 regex/1024 \^HelloWorld$ HelloWorld String"#,
            br#"
// this is a comment after an empty line
HelloWorld
 "#
        );
    }
3996
    // Exercises `string` rules with the `w`, `C`, `c`, `f` and `W`
    // modifiers, with one matching and/or one non-matching input each.
    #[test]
    fn test_string_with_mods() {
        assert_magic_match_text!(
            r#"0 string/w #!\ \ \ /usr/bin/env\ bash BASH
 "#,
            b"#! /usr/bin/env bash i
 echo hello world"
        );

        assert_magic_match_text!(
            r#"0 string/C HelloWorld it works
 "#,
            b"helloworld"
        );

        assert_magic_not_match_text!(
            r#"0 string/C HelloWorld it works
 "#,
            b"hELLOwORLD"
        );

        assert_magic_match_text!(
            r#"0 string/c HelloWorld it works
 "#,
            b"HELLOWORLD"
        );

        assert_magic_not_match_text!(
            r#"0 string/c HelloWorld it works
 "#,
            b"helloworld"
        );

        assert_magic_match_text!(
            r#"0 string/f #!/usr/bin/env\ bash BASH
 "#,
            b"#!/usr/bin/env bash"
        );

        assert_magic_not_match_text!(
            r#"0 string/f #!/usr/bin/python PYTHON"#,
            b"#!/usr/bin/pythonic"
        );

        assert_magic_match_text!(
            r#"0 string/W #!/usr/bin/env\ python PYTHON"#,
            b"#!/usr/bin/env python"
        );

        assert_magic_not_match_text!(
            r#"0 string/W #!/usr/bin/env\ \ python PYTHON"#,
            b"#!/usr/bin/env python"
        );
    }
4055
    // Exercises `search` rules with modifiers; the last two cases differ
    // only by the `/s` modifier and expect opposite outcomes for the
    // relative `&0` child rule.
    #[test]
    fn test_search_with_mods() {
        assert_magic_match_text!(
            r#"0 search/1/fwt #!\ /usr/bin/luatex LuaTex script text executable"#,
            b"#! /usr/bin/luatex "
        );

        assert_magic_match_text!(
            r#"
 0 search/s /usr/bin/env
 >&0 string /usr/bin/env it works
 "#,
            b"#!/usr/bin/env python"
        );

        assert_magic_not_match_text!(
            r#"
 0 search /usr/bin/env
 >&0 string /usr/bin/env it works
 "#,
            b"#!/usr/bin/env python"
        );
    }
4080
    // Exercises `pstring` rules with the plain form and with the `H`, `h`,
    // `L`, `l` modifiers, each with and without `J`.
    #[test]
    fn test_pstring() {
        assert_magic_match_bin!(r#"0 pstring Toast it works"#, b"\x05Toast");

        assert_magic_match_bin!(r#"0 pstring Toast %s"#, b"\x05Toast", "Toast");

        assert_magic_not_match_bin!(r#"0 pstring Toast Doesn't work"#, b"\x07Toaster");

        assert_magic_match_bin!(r#"0 pstring/H Toast it works"#, b"\x00\x05Toast");

        assert_magic_match_bin!(r#"0 pstring/HJ Toast it works"#, b"\x00\x07Toast");

        assert_magic_match_bin!(r#"0 pstring/HJ Toast %s"#, b"\x00\x07Toast", "Toast");

        assert_magic_match_bin!(r#"0 pstring/h Toast it works"#, b"\x05\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/hJ Toast it works"#, b"\x07\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/L Toast it works"#, b"\x00\x00\x00\x05Toast");

        assert_magic_match_bin!(r#"0 pstring/LJ Toast it works"#, b"\x00\x00\x00\x09Toast");

        assert_magic_match_bin!(r#"0 pstring/l Toast it works"#, b"\x05\x00\x00\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/lJ Toast it works"#, b"\x09\x00\x00\x00Toast");
    }
4108
    // An `indirect` rule at offset 0 must fail with a maximum-recursion
    // error carrying `MAX_RECURSION`.
    #[test]
    fn test_max_recursion() {
        let res = first_magic(
            r#"0 indirect x"#,
            b"#! /usr/bin/luatex ",
            StreamKind::Binary,
        );
        assert!(res.is_err());
        let _ = res.inspect_err(|e| {
            assert!(matches!(
                e.unwrap_localized(),
                Error::MaximumRecursion(MAX_RECURSION)
            ))
        });
    }
4124
    // Exercises `string` rules using the `!`, `>` and `<` operators as well
    // as the `/b` modifier.
    #[test]
    fn test_string_ops() {
        assert_magic_match_text!("0 string/b MZ MZ File", b"MZ\0");
        assert_magic_match_text!("0 string !MZ Not MZ File", b"AZ\0");
        assert_magic_match_text!("0 string >\0 Any String", b"A\0");
        assert_magic_match_text!("0 string >Test Any String", b"Test 1\0");
        assert_magic_match_text!("0 string <Test Any String", b"\0");
        assert_magic_not_match_text!("0 string >Test Any String", b"\0");
    }
4134
    // Exercises `lestring16` rules: exact match, `%s` formatting, a
    // wrong-endianness input that must not match, and a non-zero offset.
    #[test]
    fn test_lestring16() {
        assert_magic_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        assert_magic_match_bin!(
            "0 lestring16 x %s",
            b"\x61\x00\x62\x00\x63\x00\x64\x00\x00",
            "abcd"
        );
        assert_magic_not_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        assert_magic_match_bin!(
            "4 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64\x00"
        );
    }
4155
    // Exercises `bestring16` rules: exact match, `%s` formatting, a
    // wrong-endianness input that must not match, and a non-zero offset.
    #[test]
    fn test_bestring16() {
        assert_magic_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        assert_magic_match_bin!(
            "0 bestring16 x %s",
            b"\x00\x61\x00\x62\x00\x63\x00\x64",
            "abcd"
        );
        assert_magic_not_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        assert_magic_match_bin!(
            "4 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64"
        );
    }
4176
    // Negative offsets are expected to address bytes counted from the end
    // of the input.
    #[test]
    fn test_offset_from_end() {
        assert_magic_match_bin!("-1 ubyte 0x42 last byte ok", b"\x00\x00\x42");
        assert_magic_match_bin!("-2 ubyte 0x41 last byte ok", b"\x00\x41\x00");
    }
4182
    // Chains two relative offsets (`&0`) over three consecutive bytes.
    #[test]
    fn test_relative_offset() {
        assert_magic_match_bin!(
            "
 0 ubyte 0x42
 >&0 ubyte 0x00
 >>&0 ubyte 0x41 third byte ok
 ",
            b"\x42\x00\x41\x00"
        );
    }
4194
    // Exercises indirect offsets: plain, with a constant added, and with a
    // nested indirect value added.
    #[test]
    fn test_indirect_offset() {
        assert_magic_match_bin!("(0.l) ubyte 0x42 it works", b"\x04\x00\x00\x00\x42");
        assert_magic_match_bin!("(0.l+3) ubyte 0x42 it works", b"\x01\x00\x00\x00\x42");
        assert_magic_match_bin!(
            "(0.l+(4)) ubyte 0x42 it works",
            b"\x04\x00\x00\x00\x04\x00\x00\x00\x42"
        );
    }
4206
    // A `use` rule carrying its own message: that message is expected to
    // come before the message of the named rule it refers to.
    #[test]
    fn test_use_with_message() {
        assert_magic_match_bin!(
            r#"
0 string MZ
>0 use mz first match

0 name mz then second match
>0 string MZ
"#,
            b"MZ\0",
            "first match then second match"
        );
    }
4221
    // Exercises value transformations on `ubyte` (+, -, %, &, |, ^) and
    // checks that a zero divisor in `%` or `/` is rejected at parse time.
    #[test]
    fn test_scalar_transform() {
        assert_magic_match_bin!("0 ubyte+1 0x1 add works", b"\x00");
        assert_magic_match_bin!("0 ubyte-1 0xfe sub works", b"\xff");
        assert_magic_match_bin!("0 ubyte%2 0 mod works", b"\x0a");
        assert_magic_match_bin!("0 ubyte&0x0f 0x0f bitand works", b"\xff");
        assert_magic_match_bin!("0 ubyte|0x0f 0xff bitor works", b"\xf0");
        assert_magic_match_bin!("0 ubyte^0x0f 0xf0 bitxor works", b"\xff");

        FileMagicParser::parse_str("0 ubyte%0 mod by zero", None)
            .expect_err("expect div by zero error");
        FileMagicParser::parse_str("0 ubyte/0 div by zero", None)
            .expect_err("expect div by zero error");
    }
4236
    // Exercises `belong` rules with equality, a non-zero offset, the `<`,
    // `>`, `&`, `^` and `~` operators, and the match-anything `x` value.
    #[test]
    fn test_belong() {
        assert_magic_match_bin!("0 belong 0x12345678 Big-endian long", b"\x12\x34\x56\x78");
        assert_magic_not_match_bin!("0 belong 0x12345678 Big-endian long", b"\x78\x56\x34\x12");
        assert_magic_match_bin!(
            "4 belong 0x12345678 Big-endian long",
            b"\x00\x00\x00\x00\x12\x34\x56\x78"
        );
        assert_magic_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x77");
        assert_magic_not_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        assert_magic_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x79");
        assert_magic_not_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        assert_magic_match_bin!("0 belong &0x5678 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong &0x0000FFFF Big-endian long", b"\x12\x34\x56\x78");

        assert_magic_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x01\x56\x78");

        assert_magic_match_bin!("0 belong ~0x12345678 Big-endian long", b"\xed\xcb\xa9\x87");
        assert_magic_not_match_bin!("0 belong ~0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        assert_magic_match_bin!("0 belong x Big-endian long", b"\x12\x34\x56\x78");
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x78\x56\x34\x12");
    }
4272
    // `search` rules must parse with no modifier and with the range and
    // flag modifiers given in either order.
    #[test]
    fn test_parse_search() {
        parse_assert!("0 search test");
        parse_assert!("0 search/24/s test");
        parse_assert!("0 search/s/24 test");
    }
4279
    // Exercises `bedate` rules: match, non-match, and `%s` formatting at a
    // non-zero offset.
    #[test]
    fn test_bedate() {
        assert_magic_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 bedate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
    // Exercises `beldate` rules; the formatted value is compared with the
    // local-time rendering of the same timestamp.
    #[test]
    fn test_beldate() {
        assert_magic_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 beldate 946684800 {}",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            unix_local_time_to_string(946684800)
        );
    }
4313
    // Exercises `beqdate` rules on eight-byte inputs: match, non-match, and
    // `%s` formatting.
    #[test]
    fn test_beqdate() {
        assert_magic_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80"
        );

        assert_magic_not_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "0 beqdate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
4332
    // Exercises `medate` rules: match, non-match, and `%s` formatting at a
    // non-zero offset.
    #[test]
    fn test_medate() {
        assert_magic_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );

        assert_magic_not_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 medate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            "2000-01-01 00:00:00"
        );
    }
4351
    // Exercises `meldate` rules; the formatted value is compared with the
    // local-time rendering of the same timestamp.
    #[test]
    fn test_meldate() {
        assert_magic_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );
        assert_magic_not_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 meldate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            unix_local_time_to_string(946684800)
        );
    }
4369
    // Exercises `date` rules: match, non-match, and formatting at a
    // non-zero offset, compared with a fixed timestamp string.
    #[test]
    fn test_date() {
        assert_magic_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 date 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            "2000-01-01 00:00:00"
        );
    }
4386
    // Exercises `leldate` rules; the formatted value is compared with the
    // local-time rendering of the same timestamp.
    #[test]
    fn test_leldate() {
        assert_magic_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 leldate 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            unix_local_time_to_string(946684800)
        );
    }
4403
/// Exercises `leqdate` on an 8-byte little-endian buffer holding
/// 1577836800 (0x5E0BE100) and checks `%s` rendering.
#[test]
fn test_leqdate() {
    // The encoded timestamp satisfies the rule.
    assert_magic_match_bin!(
        "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
        b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
    );

    // An all-zero buffer must be rejected.
    assert_magic_not_match_bin!(
        "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );

    // Same value read at offset 8, rendered through `%s`.
    assert_magic_match_bin!(
        "8 leqdate 1577836800 %s",
        b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
        "2020-01-01 00:00:00"
    );
}
4421
/// Exercises `leqldate` on the same 8-byte little-endian buffer as
/// `leqdate`; the rendered text is compared with
/// `unix_local_time_to_string`.
#[test]
fn test_leqldate() {
    // The encoded timestamp satisfies the rule.
    assert_magic_match_bin!(
        "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
        b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
    );

    // An all-zero buffer must be rejected.
    assert_magic_not_match_bin!(
        "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );

    // Value read at offset 8; expected text comes from the local-time helper.
    assert_magic_match_bin!(
        "8 leqldate 1577836800 %s",
        b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
        unix_local_time_to_string(1_577_836_800)
    );
}
4439
/// Exercises every comparison operator of the `melong` type.
///
/// The reference buffer `34 12 78 56` decodes to 0x12345678 (two
/// little-endian 16-bit halves, most significant half first).
#[test]
fn test_melong() {
    // `=` operator
    assert_magic_match_bin!(
        "0 melong =0x12345678 Middle-endian long",
        b"\x34\x12\x78\x56"
    );
    assert_magic_not_match_bin!(
        "0 melong =0x12345678 Middle-endian long",
        b"\x00\x00\x00\x00"
    );

    // `<` operator: 0x12345578 is below the reference, the reference is not
    assert_magic_match_bin!(
        "0 melong <0x12345678 Middle-endian long",
        b"\x34\x12\x78\x55"
    );
    assert_magic_not_match_bin!(
        "0 melong <0x12345678 Middle-endian long",
        b"\x34\x12\x78\x56"
    );

    // `>` operator: 0x12345778 is above the reference, the reference is not
    assert_magic_match_bin!(
        "0 melong >0x12345678 Middle-endian long",
        b"\x34\x12\x78\x57"
    );
    assert_magic_not_match_bin!(
        "0 melong >0x12345678 Middle-endian long",
        b"\x34\x12\x78\x56"
    );

    // `&` operator
    assert_magic_match_bin!("0 melong &0x5678 Middle-endian long", b"\xab\xcd\x78\x56");
    assert_magic_not_match_bin!(
        "0 melong &0x0000FFFF Middle-endian long",
        b"\x34\x12\x78\x56"
    );

    // `^` operator
    assert_magic_match_bin!(
        "0 melong ^0xFFFF0000 Middle-endian long",
        b"\x00\x00\x78\x56"
    );
    assert_magic_not_match_bin!(
        "0 melong ^0xFFFF0000 Middle-endian long",
        b"\x00\x01\x78\x56"
    );

    // `~` operator: the matching buffer decodes to 0xEDCBA987
    assert_magic_match_bin!(
        "0 melong ~0x12345678 Middle-endian long",
        b"\xCB\xED\x87\xA9"
    );
    assert_magic_not_match_bin!(
        "0 melong ~0x12345678 Middle-endian long",
        b"\x34\x12\x78\x56"
    );

    // `x` matches whatever the buffer holds
    assert_magic_match_bin!("0 melong x Middle-endian long", b"\x34\x12\x78\x56");
    assert_magic_match_bin!("0 melong x Middle-endian long", b"\x00\x00\x00\x00");
}
4503
/// Exercises every comparison operator of the `uquad` type.
///
/// The reference buffer `F0 DE BC 9A 78 56 34 12` is 0x123456789ABCDEF0
/// stored low-order byte first.
#[test]
fn test_uquad() {
    // `=` operator
    assert_magic_match_bin!(
        "0 uquad =0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );
    assert_magic_not_match_bin!(
        "0 uquad =0x123456789ABCDEF0 Unsigned quad",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );

    // `<` operator: only the last byte is lowered (0x11 instead of 0x12)
    assert_magic_match_bin!(
        "0 uquad <0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x11"
    );
    assert_magic_not_match_bin!(
        "0 uquad <0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `>` operator: only the last byte is raised (0x13 instead of 0x12)
    assert_magic_match_bin!(
        "0 uquad >0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x13"
    );
    assert_magic_not_match_bin!(
        "0 uquad >0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `&` operator
    assert_magic_match_bin!(
        "0 uquad &0xF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );
    assert_magic_not_match_bin!(
        "0 uquad &0xFF Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `^` operator
    assert_magic_match_bin!(
        "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );
    assert_magic_not_match_bin!(
        "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `~` operator: the matching buffer is the bitwise complement of the reference
    assert_magic_match_bin!(
        "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
        b"\x0F\x21\x43\x65\x87\xA9\xCB\xED"
    );
    assert_magic_not_match_bin!(
        "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `x` matches any value; `{:#x}` renders it as prefixed lowercase hex
    assert_magic_match_bin!(
        "0 uquad x {:#x}",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12",
        "0x123456789abcdef0"
    );
    assert_magic_match_bin!(
        "0 uquad x Unsigned quad",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );
}
4577
/// Exercises the `guid` type: textual GUID comparison against 16 raw bytes
/// and `%s` rendering of the value read.
#[test]
fn test_guid() {
    // The GUID text lists the buffer bytes in the same order as stored.
    assert_magic_match_bin!(
        "0 guid EC959539-6786-2D4E-8FDB-98814CE76C1E It works",
        b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E"
    );

    // An unrelated 16-byte buffer must be rejected.
    assert_magic_not_match_bin!(
        "0 guid 399595EC-8667-4E2D-8FDB-98814CE76C1E It works",
        b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
    );

    // `x` matches any GUID; `%s` renders it in uppercase dashed form.
    assert_magic_match_bin!(
        "0 guid x %s",
        b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E",
        "EC959539-6786-2D4E-8FDB-98814CE76C1E"
    );
}
4596
/// Exercises `ubeqdate` on an 8-byte buffer holding 1633046400
/// (0x61564F80) with the high-order byte first.
#[test]
fn test_ubeqdate() {
    // The encoded timestamp satisfies the rule.
    assert_magic_match_bin!(
        "0 ubeqdate 1633046400 It works",
        b"\x00\x00\x00\x00\x61\x56\x4f\x80"
    );

    // `x` matches any value; `%s` renders the formatted timestamp.
    assert_magic_match_bin!(
        "0 ubeqdate x %s",
        b"\x00\x00\x00\x00\x61\x56\x4f\x80",
        "2021-10-01 00:00:00"
    );

    // An all-zero buffer must be rejected.
    assert_magic_not_match_bin!(
        "0 ubeqdate 1633046400 It should not work",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );
}
4615
/// Exercises `ldate` on a 4-byte buffer; the rendered text is compared with
/// `unix_local_time_to_string`.
///
/// NOTE(review): the buffer stores 1640551520 (0x61C8D460) low-order byte
/// first; if `ldate` is read in host byte order this presumably assumes a
/// little-endian host — confirm.
#[test]
fn test_ldate() {
    // The encoded timestamp satisfies the rule.
    assert_magic_match_bin!("0 ldate 1640551520 It works", b"\x60\xd4\xC8\x61");

    // An all-zero buffer must be rejected.
    assert_magic_not_match_bin!("0 ldate 1633046400 It should not work", b"\x00\x00\x00\x00");

    // `x` matches any value; expected text comes from the local-time helper.
    assert_magic_match_bin!(
        "0 ldate x %s",
        b"\x60\xd4\xC8\x61",
        unix_local_time_to_string(1_640_551_520)
    );
}
4628
/// Checks that an arithmetic transform attached to a scalar type is applied
/// to the value read (0x14 == 20) before comparison and before rendering.
#[test]
fn test_scalar_with_transform() {
    // 20 / 10 == 2: compared against the test value 2
    assert_magic_match_bin!("0 ubyte/10 2 {}", b"\x14", "2");
    // 20 / 10 == 2: rendered when any value matches
    assert_magic_match_bin!("0 ubyte/10 x {}", b"\x14", "2");
    // 20 % 10 == 0
    assert_magic_match_bin!("0 ubyte%10 x {}", b"\x14", "0");
}
4635
/// Checks that an arithmetic transform attached to a float type is applied
/// to the value read before comparison and before rendering.
///
/// The buffer `00 00 A0 41` is the little-endian IEEE-754 encoding of 20.0.
#[test]
fn test_float_with_transform() {
    // 20.0 / 10 == 2: compared against the test value 2
    assert_magic_match_bin!("0 lefloat/10 2 {}", b"\x00\x00\xa0\x41", "2");
    // 20.0 / 10 == 2: rendered when any value matches
    assert_magic_match_bin!("0 lefloat/10 x {}", b"\x00\x00\xa0\x41", "2");
    // 20.0 % 10 == 0
    assert_magic_match_bin!("0 lefloat%10 x {}", b"\x00\x00\xa0\x41", "0");
}
4642
/// Pins the behaviour of `read_octal_u64` on well-formed, truncated and
/// malformed inputs. Expected values are written as octal literals so they
/// can be compared with the input text at a glance.
#[test]
fn test_read_octal() {
    // Well-formed octal numbers with a leading `0`
    assert_eq!(read_octal_u64(&mut lazy_cache!("0")), Some(0o0));
    assert_eq!(read_octal_u64(&mut lazy_cache!("00")), Some(0o0));
    assert_eq!(read_octal_u64(&mut lazy_cache!("01")), Some(0o1));
    assert_eq!(read_octal_u64(&mut lazy_cache!("07")), Some(0o7));
    assert_eq!(read_octal_u64(&mut lazy_cache!("010")), Some(0o10));
    assert_eq!(read_octal_u64(&mut lazy_cache!("0123")), Some(0o123));
    assert_eq!(read_octal_u64(&mut lazy_cache!("0755")), Some(0o755));

    // Trailing non-digit characters end the number
    assert_eq!(read_octal_u64(&mut lazy_cache!("0ABC")), Some(0o0));
    assert_eq!(read_octal_u64(&mut lazy_cache!("01ABC")), Some(0o1));
    assert_eq!(read_octal_u64(&mut lazy_cache!("0755ABC")), Some(0o755));
    assert_eq!(read_octal_u64(&mut lazy_cache!("0123ABC")), Some(0o123));

    // A digit outside 0-7 also ends the number
    assert_eq!(read_octal_u64(&mut lazy_cache!("08")), Some(0o0));
    assert_eq!(read_octal_u64(&mut lazy_cache!("01238")), Some(0o123));

    // Without the leading `0` nothing is parsed
    assert_eq!(read_octal_u64(&mut lazy_cache!("123")), None);
    assert_eq!(read_octal_u64(&mut lazy_cache!("755")), None);

    // Empty input
    assert_eq!(read_octal_u64(&mut lazy_cache!("")), None);

    // Input that does not start with `0`
    assert_eq!(read_octal_u64(&mut lazy_cache!("ABC")), None);
    assert_eq!(read_octal_u64(&mut lazy_cache!("8ABC")), None);

    // Larger value: 0o1777777777 == 268435455
    assert_eq!(
        read_octal_u64(&mut lazy_cache!("01777777777")),
        Some(0o1_777_777_777)
    );
}
4681
/// Regression test for offset handling when an indirect offset leads into a
/// chain of named rules (`use toasted` -> `use toasted_twice`).
///
/// The rule set below must render "Bread is Toasted twice" for the buffer.
#[test]
fn test_offset_bug_1() {
    assert_magic_match_bin!(
        // Rule lines start at column 0 inside the raw string.
        r"
1 string TEST Bread is
# offset computation is relative to
# rule start
>(5.b) use toasted

0 name toasted
>0 string twice Toasted
>>0 use toasted_twice

0 name toasted_twice
>(6.b) string x %s
 ",
        b"\x00TEST\x06twice\x00\x06",
        "Bread is Toasted twice"
    );
}
4704
/// Regression test for offset handling when the top-level rule uses a
/// negative offset (`-12`) and then chains through named rules.
///
/// NOTE(review): the last rule's message is a bare `%`, whereas the sibling
/// tests use `%s` — confirm this is intended.
#[test]
fn test_offset_bug_2() {
    assert_magic_match_bin!(
        // Rule lines start at column 0 inside the raw string.
        r"
-12 string TEST Bread is
>(4.b) use toasted

0 name toasted
>0 string twice Toasted
>>0 use toasted_twice

0 name toasted_twice
>(6.b) string x %
 ",
        b"\x00TEST\x06twice\x00\x06",
        "Bread is Toasted twice"
    );
}
4730
/// Regression test for offset handling through `indirect/r`: the rules
/// reached by the indirection use offset 0 and a named rule.
///
/// The rule set below must render "Bread is Toasted twice" for the buffer.
#[test]
fn test_offset_bug_3() {
    assert_magic_match_bin!(
        // Rule lines start at column 0 inside the raw string.
        r"
1 string TEST Bread is
>(5.b) indirect/r x

0 string twice Toasted
>0 use toasted_twice

0 name toasted_twice
>0 string x %s
 ",
        b"\x00TEST\x06twice\x00\x08",
        "Bread is Toasted twice"
    );
}
4750
/// Regression test for offset handling when an `indirect/r` is followed by a
/// `use` at an indirect offset; the in-rule comments describe how each base
/// offset is expected to be derived.
///
/// The rule set below must render "Bread is Toasted twice" for the buffer.
#[test]
fn test_offset_bug_4() {
    assert_magic_match_bin!(
        // Rule lines start at column 0 inside the raw string.
        r"
1 string Bread %s
>(6.b) indirect/r x

# this one uses a based offset
# computed at indirection
1 string is\ Toasted %s
>(11.b) use toasted_twice

# this one is using a new base
# offset being previous base
# offset + offset of use
0 name toasted_twice
>0 string x %s
 ",
        b"\x00Bread\x06is Toasted\x0ctwice\x00",
        "Bread is Toasted twice"
    );
}
4775
/// Regression test for a relative offset (`&1`) used inside a named rule
/// that was reached through `indirect/r` and `use`.
///
/// The rule set below must render "Bread is Toasted twice" for the buffer.
#[test]
fn test_offset_bug_5() {
    assert_magic_match_bin!(
        // Rule lines start at column 0 inside the raw string.
        r"
1 string TEST Bread is
>(5.b) indirect/r x

0 string twice Toasted
>0 use toasted_twice

0 name toasted_twice
>0 string twice
>>&1 byte 0x08 twice
 ",
        b"\x00TEST\x06twice\x00\x08",
        "Bread is Toasted twice"
    );
}
4794
/// Matches a Python shebang line with a `string/W` rule and checks that one
/// of the resulting message parts equals "python", ignoring ASCII case.
#[test]
fn test_message_parts() {
    let rule = r#"0 string/W #!/usr/bin/env\ python PYTHON"#;
    let content = b"#!/usr/bin/env python";

    let magic = first_magic(rule, content, StreamKind::Text(TextEncoding::Ascii)).unwrap();

    let has_python_part = magic
        .message_parts()
        .any(|part| part.eq_ignore_ascii_case("python"));
    assert!(has_python_part);
}
4806
/// Loads several independently parsed `search` rules in one `load_bulk`
/// call and expects the resulting database to pass `verify`.
#[test]
fn test_load_bulk() {
    // Same search rule, written with and without modifiers in both orders.
    let parsed = vec![
        parse_assert!("0 search test"),
        parse_assert!("0 search/24/s test"),
        parse_assert!("0 search/s/24 test"),
    ];

    let mut database = MagicDb::new();
    database.load_bulk(parsed.into_iter());

    database.verify().unwrap();
}
4820
/// Loads a rule that `use`s a name (`test`) not defined by any loaded rule
/// and expects `verify` to report `Error::Verify`.
#[test]
fn test_load_bulk_failure() {
    let parsed = vec![parse_assert!(
        r#"
0 search/s/24 test
>0 use test
"#
    )];

    let mut database = MagicDb::new();
    database.load_bulk(parsed.into_iter());

    let outcome = database.verify();
    assert!(matches!(outcome, Err(Error::Verify(..))));
}
4835}