1#![forbid(unsafe_code)]
2#![deny(unused_imports)]
3#![deny(missing_docs)]
4use dyf::{DynDisplay, FormatString, dformat};
144use flagset::{FlagSet, flags};
145use flate2::{Compression, read::GzDecoder, write::GzEncoder};
146use memchr::memchr;
147use pest::{Span, error::ErrorVariant};
148use regex::bytes::{self};
149use serde::{Deserialize, Serialize};
150use std::{
151 borrow::Cow,
152 cmp::max,
153 collections::{HashMap, HashSet},
154 fmt::{self, Debug, Display},
155 fs::File,
156 io::{self, Read, SeekFrom, Write},
157 ops::{Add, BitAnd, BitOr, BitXor, Deref, Div, Mul, Rem, Sub},
158 path::Path,
159};
160use tar::Archive;
161use thiserror::Error;
162use tracing::{Level, debug, enabled, trace};
163
164use crate::{
165 numeric::{Float, FloatDataType, Scalar, ScalarDataType},
166 parser::{FileMagicParser, Rule},
167 readers::DataRead,
168 utils::{
169 debug_string_from_vec_u8, debug_string_from_vec_u16, decode_id3, find_json_boundaries,
170 run_utf8_validation,
171 },
172};
173
174mod numeric;
175mod parser;
176pub mod readers;
177pub use readers::DataReader;
178mod utils;
179
// Strength value associated with hardcoded magic.
// NOTE(review): consumers are outside this view — confirm how it is used.
const HARDCODED_MAGIC_STRENGTH: u64 = 2048;
// Source label associated with hardcoded magic.
// NOTE(review): presumably reported where a rule file name would be — confirm.
const HARDCODED_SOURCE: &str = "hardcoded";
// Recursion depth limit.
// NOTE(review): presumably paired with `Error::MaximumRecursion` — confirm.
const MAX_RECURSION: usize = 50;
// Number of bytes read for a regex test that does not specify a length.
const FILE_REGEX_MAX: usize = 8192;

/// Upper bound, in bytes (7 MiB), requested for open-ended reads such as
/// search tests.
pub const FILE_BYTES_MAX: usize = 7 * 1024 * 1024;
/// The `application/octet-stream` MIME type, intended as the default for
/// binary content.
pub const DEFAULT_BIN_MIMETYPE: &str = "application/octet-stream";
/// The `text/plain` MIME type, intended as the default for text content.
pub const DEFAULT_TEXT_MIMETYPE: &str = "text/plain";

// strftime-style pattern (`YYYY-MM-DD HH:MM:SS`) for timestamps.
// NOTE(review): the formatting code is outside this view — confirm.
pub(crate) const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
199
// Panics with the given `panic!` arguments, but only in builds compiled with
// debug assertions; in other builds the invocation is a no-op.
macro_rules! debug_panic {
    ($($panic_args:tt)*) => {
        if cfg!(debug_assertions) {
            panic!($($panic_args)*);
        }
    };
}
207
// Reads exactly `size_of::<$ty>()` bytes from `$r` (through its
// `read_exact_into` method) and evaluates to the raw byte array.
// Uses `?`, so it must be expanded inside a function returning a compatible
// `Result`.
macro_rules! read {
    ($r: expr, $ty: ty) => {{
        let mut raw = [0u8; std::mem::size_of::<$ty>()];
        $r.read_exact_into(&mut raw)?;
        raw
    }};
}
215
// Reads a `$ty` from `$r`, decoding the bytes as little-endian.
macro_rules! read_le {
    ($r:expr, $ty: ty ) => {{
        let raw = read!($r, $ty);
        <$ty>::from_le_bytes(raw)
    }};
}
219
// Reads a `$ty` from `$r`, decoding the bytes as big-endian.
macro_rules! read_be {
    ($r:expr, $ty: ty ) => {{
        let raw = read!($r, $ty);
        <$ty>::from_be_bytes(raw)
    }};
}
223
// Reads a 32-bit value made of two consecutive little-endian 16-bit words,
// the first word read being the most significant half. Evaluates to an `i32`.
macro_rules! read_me {
    ($r: expr) => {{
        let high = read_le!($r, u16) as i32;
        let low = read_le!($r, u16) as i32;
        (high << 16) | low
    }};
}
227
228#[inline(always)]
229fn read_octal_u64<D: DataRead>(haystack: &mut D) -> Option<u64> {
230 let s = haystack
231 .read_while_or_limit(|b| matches!(b, b'0'..=b'7'), 22)
232 .map(|buf| str::from_utf8(buf))
233 .ok()?
234 .ok()?;
235
236 if !s.starts_with("0") {
237 return None;
238 }
239
240 u64::from_str_radix(s, 8).ok()
241}
242
/// Errors returned by this crate.
#[derive(Debug, Error)]
pub enum Error {
    /// Free-form error message.
    #[error("{0}")]
    Msg(String),

    /// An error carrying a source name, a line number and the underlying
    /// error. NOTE(review): presumably raised while verifying a rule — confirm.
    #[error("source={0} line={1} error={2}")]
    Verify(String, usize, Box<Error>),

    /// An error wrapped with the source name and line number it relates to.
    /// See [`Error::unwrap_localized`] to access the wrapped error.
    #[error("source={0} line={1} error={2}")]
    Localized(String, usize, Box<Error>),

    /// A rule referenced by name is missing; the payload is that name.
    #[error("missing rule: {0}")]
    MissingRule(String),

    /// The maximum recursion depth was reached; the payload is displayed
    /// after the message.
    #[error("maximum recursion reached: {0}")]
    MaximumRecursion(usize),

    /// An I/O error.
    #[error("io: {0}")]
    Io(#[from] io::Error),

    /// A parser error, boxed.
    #[error("parser error: {0}")]
    Parse(#[from] Box<pest::error::Error<Rule>>),

    /// A message formatting error.
    #[error("formatting: {0}")]
    Format(#[from] dyf::Error),

    /// A regular expression error.
    #[error("regex: {0}")]
    Regex(#[from] regex::Error),

    /// A `bincode` encoding error.
    #[error("{0}")]
    Serialize(#[from] bincode::error::EncodeError),

    /// A `bincode` decoding error.
    #[error("{0}")]
    Deserialize(#[from] bincode::error::DecodeError),
}
290
291impl Error {
292 #[inline]
293 fn parser<S: ToString>(msg: S, span: Span<'_>) -> Self {
294 Self::Parse(Box::new(pest::error::Error::new_from_span(
295 ErrorVariant::CustomError {
296 message: msg.to_string(),
297 },
298 span,
299 )))
300 }
301
302 fn msg<M: AsRef<str>>(msg: M) -> Self {
303 Self::Msg(msg.as_ref().into())
304 }
305
306 fn localized<S: AsRef<str>>(source: S, line: usize, err: Error) -> Self {
307 Self::Localized(source.as_ref().into(), line, err.into())
308 }
309
310 pub fn unwrap_localized(&self) -> &Self {
312 match self {
313 Self::Localized(_, _, e) => e,
314 _ => self,
315 }
316 }
317}
318
/// A message that is either plain text or a format string to be filled with
/// a match result.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Message {
    /// Plain text, used as is.
    String(String),
    /// A format string.
    Format {
        /// Specifier string; `Message::format_with` special-cases the value
        /// `"c"` to render byte scalars as characters.
        printf_spec: String,
        /// The format string handed to `dformat!`.
        fs: FormatString,
    },
}
327
328impl Display for Message {
329 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
330 match self {
331 Self::String(s) => write!(f, "{s}"),
332 Self::Format { printf_spec: _, fs } => write!(f, "{}", fs.to_string_lossy()),
333 }
334 }
335}
336
337impl Message {
338 fn to_string_lossy(&self) -> Cow<'_, str> {
339 match self {
340 Message::String(s) => Cow::Borrowed(s),
341 Message::Format { printf_spec: _, fs } => fs.to_string_lossy(),
342 }
343 }
344
345 #[inline(always)]
346 fn format_with(&self, mr: Option<&MatchRes>) -> Result<Cow<'_, str>, Error> {
347 match self {
348 Self::String(s) => Ok(Cow::Borrowed(s.as_str())),
349 Self::Format {
350 printf_spec: c_spec,
351 fs,
352 } => {
353 if let Some(mr) = mr {
354 match mr {
355 MatchRes::Float(_, _) | MatchRes::Bytes(_, _, _, _) => {
356 Ok(Cow::Owned(dformat!(fs, mr)?))
357 }
358 MatchRes::Scalar(_, scalar) => {
359 if c_spec.as_str() == "c" {
361 match scalar {
362 Scalar::byte(b) => {
363 let b = (*b as u8) as char;
364 Ok(Cow::Owned(dformat!(fs, b)?))
365 }
366 Scalar::ubyte(b) => {
367 let b = *b as char;
368 Ok(Cow::Owned(dformat!(fs, b)?))
369 }
370 _ => Ok(Cow::Owned(dformat!(fs, mr)?)),
371 }
372 } else {
373 Ok(Cow::Owned(dformat!(fs, mr)?))
374 }
375 }
376 }
377 } else {
378 Ok(fs.to_string_lossy())
379 }
380 }
381 }
382 }
383}
384
impl ScalarDataType {
    /// Reads one scalar of this data type from `from`.
    ///
    /// When `switch_endianness` is true, little-endian reads are decoded as
    /// big-endian and vice versa. Native-endian types follow the target's
    /// endianness before that switch is applied.
    ///
    /// # Errors
    /// Propagates the error of the underlying `read_exact_into` call.
    #[inline(always)]
    fn read<R: DataRead>(&self, from: &mut R, switch_endianness: bool) -> Result<Scalar, Error> {
        // Little-endian read, flipped to big-endian when switching.
        macro_rules! _read_le {
            ($ty: ty) => {{
                if switch_endianness {
                    <$ty>::from_be_bytes(read!(from, $ty))
                } else {
                    <$ty>::from_le_bytes(read!(from, $ty))
                }
            }};
        }

        // Big-endian read, flipped to little-endian when switching.
        macro_rules! _read_be {
            ($ty: ty) => {{
                if switch_endianness {
                    <$ty>::from_le_bytes(read!(from, $ty))
                } else {
                    <$ty>::from_be_bytes(read!(from, $ty))
                }
            }};
        }

        // Native-endian read: picks the macro matching the compilation target.
        macro_rules! _read_ne {
            ($ty: ty) => {{
                if cfg!(target_endian = "big") {
                    _read_be!($ty)
                } else {
                    _read_le!($ty)
                }
            }};
        }

        // Two 16-bit words read through `_read_le!`; the first word is the
        // high half of the resulting `i32`.
        macro_rules! _read_me {
            () => {
                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
            };
        }

        Ok(match self {
            // Single bytes have no endianness.
            Self::byte => Scalar::byte(read!(from, u8)[0] as i8),
            Self::short => Scalar::short(_read_ne!(i16)),
            Self::long => Scalar::long(_read_ne!(i32)),
            Self::date => Scalar::date(_read_ne!(i32)),
            Self::ldate => Scalar::ldate(_read_ne!(i32)),
            Self::qwdate => Scalar::qwdate(_read_ne!(i64)),
            Self::leshort => Scalar::leshort(_read_le!(i16)),
            Self::lelong => Scalar::lelong(_read_le!(i32)),
            Self::lequad => Scalar::lequad(_read_le!(i64)),
            Self::bequad => Scalar::bequad(_read_be!(i64)),
            Self::belong => Scalar::belong(_read_be!(i32)),
            Self::bedate => Scalar::bedate(_read_be!(i32)),
            Self::beldate => Scalar::beldate(_read_be!(i32)),
            Self::beqdate => Scalar::beqdate(_read_be!(i64)),
            Self::ubyte => Scalar::ubyte(read!(from, u8)[0]),
            Self::ushort => Scalar::ushort(_read_ne!(u16)),
            Self::uleshort => Scalar::uleshort(_read_le!(u16)),
            Self::ulelong => Scalar::ulelong(_read_le!(u32)),
            Self::uledate => Scalar::uledate(_read_le!(u32)),
            Self::ulequad => Scalar::ulequad(_read_le!(u64)),
            // No bytes are read: the value is the current stream position.
            Self::offset => Scalar::offset(from.stream_position()),
            Self::ubequad => Scalar::ubequad(_read_be!(u64)),
            Self::medate => Scalar::medate(_read_me!()),
            Self::meldate => Scalar::meldate(_read_me!()),
            Self::melong => Scalar::melong(_read_me!()),
            Self::beshort => Scalar::beshort(_read_be!(i16)),
            Self::quad => Scalar::quad(_read_ne!(i64)),
            Self::uquad => Scalar::uquad(_read_ne!(u64)),
            Self::ledate => Scalar::ledate(_read_le!(i32)),
            Self::leldate => Scalar::leldate(_read_le!(i32)),
            Self::leqdate => Scalar::leqdate(_read_le!(i64)),
            Self::leqldate => Scalar::leqldate(_read_le!(i64)),
            Self::leqwdate => Scalar::leqwdate(_read_le!(i64)),
            Self::ubelong => Scalar::ubelong(_read_be!(u32)),
            Self::ulong => Scalar::ulong(_read_ne!(u32)),
            Self::ubeshort => Scalar::ubeshort(_read_be!(u16)),
            Self::ubeqdate => Scalar::ubeqdate(_read_be!(u64)),
            Self::lemsdosdate => Scalar::lemsdosdate(_read_le!(u16)),
            Self::lemsdostime => Scalar::lemsdostime(_read_le!(u16)),
            // Always decoded big-endian, regardless of `switch_endianness`.
            Self::guid => Scalar::guid(u128::from_be_bytes(read!(from, u128))),
        })
    }
}
470
471impl FloatDataType {
472 #[inline(always)]
473 fn read<R: DataRead>(&self, from: &mut R, switch_endianness: bool) -> Result<Float, Error> {
474 macro_rules! _read_le {
475 ($ty: ty) => {{
476 if switch_endianness {
477 <$ty>::from_be_bytes(read!(from, $ty))
478 } else {
479 <$ty>::from_le_bytes(read!(from, $ty))
480 }
481 }};
482 }
483
484 macro_rules! _read_be {
485 ($ty: ty) => {{
486 if switch_endianness {
487 <$ty>::from_le_bytes(read!(from, $ty))
488 } else {
489 <$ty>::from_be_bytes(read!(from, $ty))
490 }
491 }};
492 }
493
494 macro_rules! _read_ne {
495 ($ty: ty) => {{
496 if cfg!(target_endian = "big") {
497 _read_be!($ty)
498 } else {
499 _read_le!($ty)
500 }
501 }};
502 }
503
504 macro_rules! _read_me {
505 () => {
506 ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
507 };
508 }
509
510 Ok(match self {
511 Self::lefloat => Float::lefloat(_read_le!(f32)),
512 Self::befloat => Float::befloat(_read_le!(f32)),
513 Self::ledouble => Float::ledouble(_read_le!(f64)),
514 Self::bedouble => Float::bedouble(_read_be!(f64)),
515 })
516 }
517}
518
/// Arithmetic or bitwise operator applied to a value read from the input
/// before it is tested.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Op {
    /// Multiplication, displayed as `*`.
    Mul,
    /// Addition, displayed as `+`.
    Add,
    /// Subtraction, displayed as `-`.
    Sub,
    /// Division, displayed as `/`.
    Div,
    /// Remainder, displayed as `%`.
    Mod,
    /// Bitwise and, displayed as `&`.
    And,
    /// Bitwise exclusive or, displayed as `^`.
    Xor,
    /// Bitwise or, displayed as `|`.
    Or,
}
530
531impl Display for Op {
532 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
533 match self {
534 Op::Mul => write!(f, "*"),
535 Op::Add => write!(f, "+"),
536 Op::Sub => write!(f, "-"),
537 Op::Div => write!(f, "/"),
538 Op::Mod => write!(f, "%"),
539 Op::And => write!(f, "&"),
540 Op::Or => write!(f, "|"),
541 Op::Xor => write!(f, "^"),
542 }
543 }
544}
545
/// Comparison operator of a test. The descriptions below state how scalar
/// tests evaluate each operator.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum CmpOp {
    /// The read value equals the test value.
    Eq,
    /// The read value is lower than the test value.
    Lt,
    /// The read value is greater than the test value.
    Gt,
    /// All bits set in the test value are set in the read value.
    BitAnd,
    /// The read value differs from the test value.
    Neq,
    /// No bit set in the test value is set in the read value.
    Xor,
    /// The read value equals the bitwise negation of the test value.
    Not,
}
556
557impl CmpOp {
558 #[inline(always)]
559 fn is_neq(&self) -> bool {
560 matches!(self, Self::Neq)
561 }
562}
563
/// Operation applied to a scalar read from the input before it is compared.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTransform {
    /// Operator to apply.
    op: Op,
    /// Right-hand operand.
    num: Scalar,
}
569
570impl ScalarTransform {
571 fn apply(&self, s: Scalar) -> Option<Scalar> {
572 match self.op {
573 Op::Add => s.checked_add(self.num),
574 Op::Sub => s.checked_sub(self.num),
575 Op::Mul => s.checked_mul(self.num),
576 Op::Div => s.checked_div(self.num),
577 Op::Mod => s.checked_rem(self.num),
578 Op::And => Some(s.bitand(self.num)),
579 Op::Xor => Some(s.bitxor(self.num)),
580 Op::Or => Some(s.bitor(self.num)),
581 }
582 }
583}
584
/// Operation applied to a float read from the input before it is compared.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTransform {
    /// Operator to apply; bitwise operators are not supported on floats.
    op: Op,
    /// Right-hand operand.
    num: Float,
}
590
591impl FloatTransform {
592 fn apply(&self, s: Float) -> Float {
593 match self.op {
594 Op::Add => s.add(self.num),
595 Op::Sub => s.sub(self.num),
596 Op::Mul => s.mul(self.num),
597 Op::Div => s.div(self.num),
599 Op::Mod => s.rem(self.num),
601 Op::And | Op::Xor | Op::Or => {
603 debug_panic!("unsupported operation");
604 s
605 }
606 }
607 }
608}
609
/// Value a test compares against.
#[derive(Clone, Serialize, Deserialize)]
enum TestValue<T> {
    /// A concrete value to compare with.
    Value(T),
    /// No specific value: anything read is accepted. Shown as `ANY` in debug
    /// output.
    Any,
}
615
616impl Debug for TestValue<Vec<u8>> {
617 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
618 match self {
619 Self::Value(v) => write!(f, "\"{}\"", debug_string_from_vec_u8(v)),
620 Self::Any => write!(f, "ANY"),
621 }
622 }
623}
624
625impl Debug for TestValue<Vec<u16>> {
626 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
627 match self {
628 Self::Value(v) => write!(f, "\"{}\"", debug_string_from_vec_u16(v)),
629 Self::Any => write!(f, "ANY"),
630 }
631 }
632}
633
634impl Debug for TestValue<Scalar> {
635 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
636 match self {
637 Self::Value(s) => write!(f, "{s:?}"),
638 Self::Any => write!(f, "ANY"),
639 }
640 }
641}
642
643impl Debug for TestValue<Float> {
644 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
645 match self {
646 Self::Value(fl) => write!(f, "{fl:?}"),
647 Self::Any => write!(f, "ANY"),
648 }
649 }
650}
651
652impl<T> TestValue<T> {
653 #[inline(always)]
654 fn as_ref(&self) -> TestValue<&T> {
655 match self {
656 Self::Value(v) => TestValue::Value(v),
657 Self::Any => TestValue::Any,
658 }
659 }
660}
661
// Modifier flags of a regex test (also carried by search tests).
// NOTE(review): `CaseInsensitive`, `StartOffsetUpdate` and `TrimMatch` are not
// consumed in the visible part of the file; their meaning is inferred from
// their names only.
flags! {
    enum ReMod: u8{
        CaseInsensitive,
        StartOffsetUpdate,
        // The test length counts lines; the read size becomes `length * 80`.
        LineLimit,
        // Forces the test to be considered binary.
        ForceBin,
        // Forces the test to be considered text.
        ForceText,
        TrimMatch,
    }
}
672
673fn serialize_regex<S>(re: &bytes::Regex, serializer: S) -> Result<S::Ok, S::Error>
674where
675 S: serde::Serializer,
676{
677 re.as_str().serialize(serializer)
678}
679
680fn deserialize_regex<'de, D>(deserializer: D) -> Result<bytes::Regex, D::Error>
681where
682 D: serde::Deserializer<'de>,
683{
684 let wrapper = String::deserialize(deserializer)?;
685 bytes::Regex::new(&wrapper).map_err(serde::de::Error::custom)
686}
687
/// A test matching a regular expression against the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct RegexTest {
    /// Compiled expression; (de)serialized through its pattern string.
    #[serde(
        serialize_with = "serialize_regex",
        deserialize_with = "deserialize_regex"
    )]
    re: bytes::Regex,
    /// Optional length: bounds the amount of data read and, on text streams,
    /// the number of lines scanned.
    length: Option<usize>,
    /// Regex modifiers.
    mods: FlagSet<ReMod>,
    /// String modifiers.
    str_mods: FlagSet<StringMod>,
    // NOTE(review): not used in the visible part of the file; presumably the
    // length of the literal (non-metacharacter) part of the pattern — confirm.
    non_magic_len: usize,
    /// Whether the test is considered binary.
    binary: bool,
    /// Comparison operator; `Neq` turns an absent match into a success.
    cmp_op: CmpOp,
}
702
703impl RegexTest {
704 #[inline(always)]
705 fn is_binary(&self) -> bool {
706 self.binary
707 || self.mods.contains(ReMod::ForceBin)
708 || self.str_mods.contains(StringMod::ForceBin)
709 }
710
711 #[inline(always)]
712 fn is_text(&self) -> bool {
713 self.mods.contains(ReMod::ForceText) || self.str_mods.contains(StringMod::ForceText)
714 }
715
716 fn match_buf<'buf>(
717 &self,
718 off_buf: u64, stream_kind: StreamKind,
720 buf: &'buf [u8],
721 ) -> Option<MatchRes<'buf>> {
722 let mr = match stream_kind {
723 StreamKind::Text(_) => {
724 let mut off_txt = off_buf;
725
726 let mut line_limit = self.length.unwrap_or(usize::MAX);
727
728 for line in buf.split(|c| c == &b'\n') {
729 if line_limit == 0 {
733 break;
734 }
735
736 if let Some(re_match) = self.re.find(line) {
737 let start_offset = off_txt + re_match.start() as u64;
739
740 let stop_offset = if re_match.end() == line.len() {
742 Some(start_offset + re_match.as_bytes().len() as u64 + 1)
743 } else {
744 None
745 };
746
747 return Some(MatchRes::Bytes(
748 start_offset,
749 stop_offset,
750 re_match.as_bytes(),
751 Encoding::Utf8,
752 ));
753 }
754
755 off_txt += line.len() as u64;
756 off_txt += 1;
758 line_limit = line_limit.saturating_sub(1)
759 }
760 None
761 }
762
763 StreamKind::Binary => {
764 self.re.find(buf).map(|re_match| {
765 MatchRes::Bytes(
766 off_buf + re_match.start() as u64,
768 None,
769 re_match.as_bytes(),
770 Encoding::Utf8,
771 )
772 })
773 }
774 };
775
776 if self.cmp_op.is_neq() && mr.is_none() {
778 return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
779 }
780
781 mr
782 }
783}
784
785impl From<RegexTest> for Test {
786 fn from(value: RegexTest) -> Self {
787 Self::Regex(value)
788 }
789}
790
// Modifier flags of string-like tests.
flags! {
    enum StringMod: u8{
        // Forces the test to be considered binary.
        ForceBin,
        // Upper case letters of the test string also match their lower case
        // counterpart in the input.
        UpperInsensitive,
        // Lower case letters of the test string also match their upper case
        // counterpart in the input.
        LowerInsensitive,
        // The byte following the match, if any, must be ASCII whitespace.
        FullWordMatch,
        // Trims ASCII whitespace from the reported match.
        Trim,
        // Forces the test to be considered text.
        ForceText,
        // A run of spaces in the test string must be matched by a run at
        // least as long in the input.
        CompactWhitespace,
        // Spaces are optional on both the test string and the input side.
        OptBlank,
    }
}
803
/// A test comparing bytes read from the input with a string.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StringTest {
    /// String to compare with, or any string.
    test_val: TestValue<Vec<u8>>,
    /// Comparison operator.
    cmp_op: CmpOp,
    /// Optional number of bytes to read for the comparison.
    length: Option<usize>,
    /// String modifiers.
    mods: FlagSet<StringMod>,
    /// Whether the test is considered binary.
    binary: bool,
}
812
813impl From<StringTest> for Test {
814 fn from(value: StringTest) -> Self {
815 Self::String(value)
816 }
817}
818
/// Matches the test string `str` against the beginning of `buf`, honoring
/// the string modifiers in `mods`.
///
/// Returns `(matched, consumed)` where `consumed` is the number of bytes of
/// `buf` walked over. Without any modifier affecting the comparison, this is
/// a plain prefix check and `consumed` is `str.len()` on success and `0` on
/// failure.
#[inline(always)]
fn string_match(str: &[u8], mods: FlagSet<StringMod>, buf: &[u8]) -> (bool, usize) {
    let mut consumed = 0;
    // Fast path: none of the modifiers altering the comparison is set.
    if mods.is_disjoint(
        StringMod::UpperInsensitive
            | StringMod::LowerInsensitive
            | StringMod::FullWordMatch
            | StringMod::CompactWhitespace
            | StringMod::OptBlank,
    ) {
        if buf.starts_with(str) {
            (true, str.len())
        } else {
            (false, consumed)
        }
    } else {
        // Index of the next byte of the test string to match.
        let mut i_src = 0;
        let mut iter = buf.iter().peekable();

        // Advances in the input by one byte, if any is left.
        macro_rules! consume_target {
            () => {{
                if iter.next().is_some() {
                    consumed += 1;
                }
            }};
        }

        // Advances both in the input and in the test string, then moves on to
        // the next loop iteration.
        macro_rules! continue_next_iteration {
            () => {{
                consume_target!();
                i_src += 1;
                continue;
            }};
        }

        while let Some(&&b) = iter.peek() {
            // The whole test string has been matched.
            let Some(&ref_byte) = str.get(i_src) else {
                break;
            };

            // Optional blanks: a space on either side is skipped on that side.
            if mods.contains(StringMod::OptBlank) && (b == b' ' || ref_byte == b' ') {
                if b == b' ' {
                    consume_target!();
                }

                if ref_byte == b' ' {
                    i_src += 1;
                }

                continue;
            }

            // Upper case letters of the test string match both cases.
            if mods.contains(StringMod::UpperInsensitive) {
                if ref_byte.is_ascii_uppercase() && ref_byte == b.to_ascii_uppercase()
                    || ref_byte == b
                {
                    continue_next_iteration!()
                }
            }

            // Lower case letters of the test string match both cases.
            if mods.contains(StringMod::LowerInsensitive)
                && (ref_byte.is_ascii_lowercase() && ref_byte == b.to_ascii_lowercase()
                    || ref_byte == b)
            {
                continue_next_iteration!()
            }

            // A run of spaces in the test string requires a run at least as
            // long in the input; extra input spaces are consumed as well.
            if mods.contains(StringMod::CompactWhitespace) && ref_byte == b' ' {
                let mut src_blk = 0;
                while let Some(b' ') = str.get(i_src) {
                    src_blk += 1;
                    i_src += 1;
                }

                let mut tgt_blk = 0;
                while let Some(b' ') = iter.peek() {
                    tgt_blk += 1;
                    consume_target!();
                }

                if src_blk > tgt_blk {
                    return (false, consumed);
                }

                continue;
            }

            if ref_byte == b {
                continue_next_iteration!()
            } else {
                return (false, consumed);
            }
        }

        // Full word match: the byte following the match must be whitespace.
        if mods.contains(StringMod::FullWordMatch)
            && let Some(b) = iter.peek()
            && !b.is_ascii_whitespace()
        {
            return (false, consumed);
        }

        // Success requires some input consumed and the test string exhausted.
        (
            consumed > 0 && str.get(i_src).is_none() && consumed <= buf.len(),
            consumed,
        )
    }
}
931
932impl StringTest {
933 fn has_length_mod(&self) -> bool {
934 !self.mods.is_disjoint(
935 StringMod::UpperInsensitive
936 | StringMod::LowerInsensitive
937 | StringMod::FullWordMatch
938 | StringMod::CompactWhitespace
939 | StringMod::OptBlank,
940 )
941 }
942
943 #[inline(always)]
944 fn test_value_len(&self) -> usize {
945 match self.test_val.as_ref() {
946 TestValue::Value(s) => s.len(),
947 TestValue::Any => 0,
948 }
949 }
950
951 #[inline(always)]
952 fn is_binary(&self) -> bool {
953 self.binary || self.mods.contains(StringMod::ForceBin)
954 }
955
956 #[inline(always)]
957 fn is_text(&self) -> bool {
958 self.mods.contains(StringMod::ForceText)
959 }
960}
961
/// Newtype over `Vec<u8>` whose `Debug` output is a quoted debug string.
#[derive(Clone, Serialize, Deserialize)]
struct ByteVec(Vec<u8>);
964
965impl Debug for ByteVec {
966 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
967 write!(f, "\"{}\"", debug_string_from_vec_u8(self))
968 }
969}
970
971impl From<Vec<u8>> for ByteVec {
972 fn from(value: Vec<u8>) -> Self {
973 Self(value)
974 }
975}
976
977impl Deref for ByteVec {
978 type Target = Vec<u8>;
979
980 fn deref(&self) -> &Self::Target {
981 &self.0
982 }
983}
984
/// A test searching for a string in the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SearchTest {
    /// String to search for.
    str: ByteVec,
    /// Optional highest position, relative to the start of the searched
    /// buffer, at which a match may start.
    n_pos: Option<usize>,
    /// String modifiers.
    str_mods: FlagSet<StringMod>,
    /// Regex modifiers.
    re_mods: FlagSet<ReMod>,
    /// Whether the test is considered binary.
    binary: bool,
    /// Comparison operator; `Neq` turns an absent match into a success.
    cmp_op: CmpOp,
}
994
995impl From<SearchTest> for Test {
996 fn from(value: SearchTest) -> Self {
997 Self::Search(value)
998 }
999}
1000
1001impl SearchTest {
1002 #[inline(always)]
1003 fn is_binary(&self) -> bool {
1004 (self.binary
1005 || self.str_mods.contains(StringMod::ForceBin)
1006 || self.re_mods.contains(ReMod::ForceBin))
1007 && !(self.str_mods.contains(StringMod::ForceText)
1008 || self.re_mods.contains(ReMod::ForceText))
1009 }
1010
1011 #[inline]
1013 fn match_buf<'buf>(&self, off_buf: u64, buf: &'buf [u8]) -> Option<MatchRes<'buf>> {
1014 let mut i = 0;
1015
1016 let needle = self.str.first()?;
1017
1018 while i < buf.len() {
1019 let Some(k) = memchr(*needle, &buf[i..]) else {
1022 break;
1023 };
1024
1025 i += k;
1026
1027 if self.str_mods.contains(StringMod::FullWordMatch) {
1029 let prev_is_whitespace = buf
1030 .get(i.saturating_sub(1))
1031 .map(|c| c.is_ascii_whitespace())
1032 .unwrap_or_default();
1033
1034 if i > 0 && !prev_is_whitespace {
1039 i += 1;
1040 continue;
1041 }
1042 }
1043
1044 if let Some(npos) = self.n_pos
1045 && i > npos
1046 {
1047 break;
1048 }
1049
1050 let pos = i;
1051 let (ok, consumed) = string_match(&self.str, self.str_mods, &buf[i..]);
1052
1053 if ok {
1054 return Some(MatchRes::Bytes(
1055 off_buf.saturating_add(pos as u64),
1056 None,
1057 &buf[i..i + consumed],
1058 Encoding::Utf8,
1059 ));
1060 } else {
1061 i += max(consumed, 1)
1062 }
1063 }
1064
1065 if self.cmp_op.is_neq() {
1067 return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
1068 }
1069
1070 None
1071 }
1072}
1073
/// A test comparing a scalar read from the input with a value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTest {
    /// Type of the scalar to read.
    ty: ScalarDataType,
    /// Optional operation applied to the read value before comparison.
    transform: Option<ScalarTransform>,
    /// Comparison operator.
    cmp_op: CmpOp,
    /// Value to compare with, or any value.
    test_val: TestValue<Scalar>,
}
1081
/// A test comparing a float read from the input with a value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTest {
    /// Type of the float to read.
    ty: FloatDataType,
    /// Optional operation applied to the read value before comparison.
    transform: Option<FloatTransform>,
    /// Comparison operator.
    cmp_op: CmpOp,
    /// Value to compare with, or any value.
    test_val: TestValue<Float>,
}
1089
/// Value read from the input for a test. The first field of every variant
/// is the stream position the value was read from.
#[derive(PartialEq)]
enum ReadValue<'buf> {
    /// A floating point value.
    Float(u64, Float),
    /// A scalar value.
    Scalar(u64, Scalar),
    /// Raw bytes borrowed from the reader.
    Bytes(u64, &'buf [u8]),
}
1098
1099impl<'buf> Debug for ReadValue<'buf> {
1100 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1101 match self {
1102 Self::Float(_, fl) => write!(f, "{fl:?}"),
1103 Self::Scalar(_, s) => write!(f, "{s:?}"),
1104 Self::Bytes(_, b) => {
1105 if b.len() <= 128 {
1106 write!(f, "\"{}\"", debug_string_from_vec_u8(b))
1107 } else {
1108 let limit = 128;
1109 write!(
1110 f,
1111 "\"{}\" (first {limit} bytes)",
1112 debug_string_from_vec_u8(&b[..limit])
1113 )
1114 }
1115 }
1116 }
1117 }
1118}
1119
1120impl DynDisplay for ReadValue<'_> {
1121 fn dyn_fmt(&self, f: &mut dyf::Formatter<'_>) -> dyf::Result {
1122 use std::fmt::Write;
1123 match self {
1124 Self::Float(_, s) => DynDisplay::dyn_fmt(s, f),
1125 Self::Scalar(_, s) => DynDisplay::dyn_fmt(s, f),
1126 Self::Bytes(_, b) => Ok(write!(f, "{b:?}")?),
1127 }
1128 }
1129}
1130
1131impl DynDisplay for &ReadValue<'_> {
1132 fn dyn_fmt(&self, f: &mut dyf::Formatter<'_>) -> dyf::Result {
1133 DynDisplay::dyn_fmt(*self, f)
1135 }
1136}
1137
1138impl Display for ReadValue<'_> {
1139 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1140 match self {
1141 Self::Float(_, v) => write!(f, "{v}"),
1142 Self::Scalar(_, s) => write!(f, "{s}"),
1143 Self::Bytes(_, b) => write!(f, "{b:?}"),
1144 }
1145 }
1146}
1147
/// Text encoding used to render matched bytes.
enum Encoding {
    /// UTF-16 with the given byte order.
    Utf16(String16Encoding),
    /// UTF-8.
    Utf8,
}
1152
/// Result of a successful test.
enum MatchRes<'buf> {
    /// Matched bytes: start offset, optional explicit end offset, the bytes
    /// and the encoding used to render them. Without an explicit end offset,
    /// the end is the start offset plus the length of the bytes.
    Bytes(u64, Option<u64>, &'buf [u8], Encoding),
    /// Matched scalar: start offset and value.
    Scalar(u64, Scalar),
    /// Matched float: start offset and value.
    Float(u64, Float),
}
1164
1165impl DynDisplay for &MatchRes<'_> {
1166 fn dyn_fmt(&self, f: &mut dyf::Formatter) -> dyf::Result {
1167 (*self).dyn_fmt(f)
1168 }
1169}
1170
1171impl DynDisplay for MatchRes<'_> {
1172 fn dyn_fmt(&self, f: &mut dyf::Formatter) -> dyf::Result {
1173 match self {
1174 Self::Scalar(_, v) => v.dyn_fmt(f),
1175 Self::Float(_, v) => v.dyn_fmt(f),
1176 Self::Bytes(_, _, v, enc) => match enc {
1177 Encoding::Utf8 => String::from_utf8_lossy(v).to_string().dyn_fmt(f),
1178 Encoding::Utf16(enc) => {
1179 let utf16: Vec<u16> = slice_to_utf16_iter(v, *enc).collect();
1180 String::from_utf16_lossy(&utf16).dyn_fmt(f)
1181 }
1182 },
1183 }
1184 }
1185}
1186
1187impl MatchRes<'_> {
1188 #[inline]
1190 fn start_offset(&self) -> u64 {
1191 match self {
1192 MatchRes::Bytes(o, _, _, _) => *o,
1193 MatchRes::Scalar(o, _) => *o,
1194 MatchRes::Float(o, _) => *o,
1195 }
1196 }
1197
1198 #[inline]
1200 fn end_offset(&self) -> u64 {
1201 match self {
1202 MatchRes::Bytes(start, end, buf, _) => match end {
1203 Some(end) => *end,
1204 None => start.saturating_add(buf.len() as u64),
1205 },
1206 MatchRes::Scalar(o, sc) => o.add(sc.size_of() as u64),
1207 MatchRes::Float(o, f) => o.add(f.size_of() as u64),
1208 }
1209 }
1210}
1211
1212fn slice_to_utf16_iter(read: &[u8], encoding: String16Encoding) -> impl Iterator<Item = u16> {
1213 let even = read
1214 .iter()
1215 .enumerate()
1216 .filter(|(i, _)| i % 2 == 0)
1217 .map(|t| t.1);
1218
1219 let odd = read
1220 .iter()
1221 .enumerate()
1222 .filter(|(i, _)| i % 2 != 0)
1223 .map(|t| t.1);
1224
1225 even.zip(odd).map(move |(e, o)| match encoding {
1226 String16Encoding::Le => u16::from_le_bytes([*e, *o]),
1227 String16Encoding::Be => u16::from_be_bytes([*e, *o]),
1228 })
1229}
1230
/// Byte order of a 16-bit string.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum String16Encoding {
    /// Little-endian units.
    Le,
    /// Big-endian units.
    Be,
}
1236
/// A test comparing a 16-bit string read from the input with a value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct String16Test {
    // NOTE(review): presumably the test string as written in the rule — not
    // used in the visible part of the file, confirm.
    orig: String,
    /// 16-bit units to compare with, or any string.
    test_val: TestValue<Vec<u16>>,
    /// Byte order of the string in the input.
    encoding: String16Encoding,
}
1243
1244impl String16Test {
1245 #[inline(always)]
1249 fn test_value_len(&self) -> usize {
1250 match self.test_val.as_ref() {
1251 TestValue::Value(str16) => str16.len(),
1252 TestValue::Any => 0,
1253 }
1254 }
1255}
1256
// Modifier flags of an indirect test.
// NOTE(review): `Relative` is not consumed in the visible part of the file;
// presumably it makes the indirect offset relative — confirm.
flags! {
    enum IndirectMod: u8{
        Relative,
    }
}

// Set of indirect modifiers.
type IndirectMods = FlagSet<IndirectMod>;
1264
/// Encoding of the length prefix of a length-prefixed string.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum PStringLen {
    /// One byte length.
    Byte,
    /// Two bytes big-endian length.
    ShortBe,
    /// Two bytes little-endian length.
    ShortLe,
    /// Four bytes big-endian length.
    LongBe,
    /// Four bytes little-endian length.
    LongLe,
}
1273
1274impl PStringLen {
1275 #[inline(always)]
1276 const fn size_of_len(&self) -> usize {
1277 match self {
1278 PStringLen::Byte => 1,
1279 PStringLen::ShortBe => 2,
1280 PStringLen::ShortLe => 2,
1281 PStringLen::LongBe => 4,
1282 PStringLen::LongLe => 4,
1283 }
1284 }
1285}
1286
/// A test on a length-prefixed string.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PStringTest {
    /// Encoding of the length prefix.
    len: PStringLen,
    /// String to compare with, or any string.
    test_val: TestValue<Vec<u8>>,
    /// Whether the stored length also counts the length prefix itself.
    include_len: bool,
}
1293
1294impl PStringTest {
1295 #[inline]
1296 fn read<'cache, R: DataRead>(
1297 &self,
1298 haystack: &'cache mut R,
1299 ) -> Result<Option<&'cache [u8]>, Error> {
1300 let mut len = match self.len {
1301 PStringLen::Byte => read_le!(haystack, u8) as u32,
1302 PStringLen::ShortBe => read_be!(haystack, u16) as u32,
1303 PStringLen::ShortLe => read_le!(haystack, u16) as u32,
1304 PStringLen::LongBe => read_be!(haystack, u32),
1305 PStringLen::LongLe => read_le!(haystack, u32),
1306 } as usize;
1307
1308 if self.include_len {
1309 len = len.saturating_sub(self.len.size_of_len())
1310 }
1311
1312 if let TestValue::Value(s) = self.test_val.as_ref()
1313 && len != s.len()
1314 {
1315 return Ok(None);
1316 }
1317
1318 let read = haystack.read_exact_count(len as u64)?;
1319
1320 Ok(Some(read))
1321 }
1322
1323 #[inline(always)]
1324 fn test_value_len(&self) -> usize {
1325 match self.test_val.as_ref() {
1326 TestValue::Value(s) => s.len(),
1327 TestValue::Any => 0,
1328 }
1329 }
1330}
1331
/// A test of a rule.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Test {
    /// Declares a named rule; displayed as `name <name>`.
    Name(String),
    /// Refers to a named rule; displayed as `use <name>` or `use ^<name>`
    /// depending on the flag.
    /// NOTE(review): the flag presumably requests an endianness switch —
    /// confirm.
    Use(bool, String),
    /// Scalar comparison.
    Scalar(ScalarTest),
    /// Float comparison.
    Float(FloatTest),
    /// String comparison.
    String(StringTest),
    /// String search.
    Search(SearchTest),
    /// Length-prefixed string comparison.
    PString(PStringTest),
    /// Regular expression match.
    Regex(RegexTest),
    /// Indirect test with its modifiers.
    Indirect(FlagSet<IndirectMod>),
    /// 16-bit string comparison.
    String16(String16Test),
    /// DER test, displayed as `unimplemented der`.
    #[allow(dead_code)]
    Der,
    /// Displayed as `clear`.
    Clear,
    /// Displayed as `default`.
    Default,
}
1350
1351impl Display for Test {
1352 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1353 match self {
1354 Test::Name(name) => write!(f, "name {name}"),
1355 Test::Use(flip, rule) => {
1356 if *flip {
1357 write!(f, "use {rule}")
1358 } else {
1359 write!(f, "use ^{rule}")
1360 }
1361 }
1362 Test::Scalar(st) => write!(f, "{st:?}"),
1363 Test::Float(ft) => write!(f, "{ft:?}"),
1364 Test::String(st) => write!(f, "{st:?}"),
1365 Test::Search(st) => write!(f, "{st:?}"),
1366 Test::PString(pt) => write!(f, "{pt:?}"),
1367 Test::Regex(rt) => write!(f, "{rt:?}"),
1368 Test::Indirect(fs) => write!(f, "indirect {fs:?}"),
1369 Test::String16(s16t) => write!(f, "{s16t:?}"),
1370 Test::Der => write!(f, "unimplemented der"),
1371 Test::Clear => write!(f, "clear"),
1372 Test::Default => write!(f, "default"),
1373 }
1374 }
1375}
1376
1377impl Test {
    /// Reads from `haystack` the value this test has to be evaluated on.
    ///
    /// The offset stored in the returned [`ReadValue`] is the stream position
    /// before reading. `switch_endianness` is forwarded to scalar and float
    /// reads. `Ok(None)` is returned when a length-prefixed string does not
    /// have the expected length.
    ///
    /// # Errors
    /// Propagates read failures, rejects string tests using an unsupported
    /// operator and tests that have no value to read (`Name`, `Use`,
    /// `Indirect`, `Clear`, `Default`, `Der`).
    #[inline]
    fn read_test_value<'haystack, D: DataRead>(
        &self,
        haystack: &'haystack mut D,
        switch_endianness: bool,
    ) -> Result<Option<ReadValue<'haystack>>, Error> {
        let test_value_offset = haystack.stream_position();

        match self {
            Self::Scalar(t) => {
                t.ty.read(haystack, switch_endianness)
                    .map(|s| Some(ReadValue::Scalar(test_value_offset, s)))
            }

            Self::Float(t) => {
                t.ty.read(haystack, switch_endianness)
                    .map(|f| Some(ReadValue::Float(test_value_offset, f)))
            }
            Self::String(t) => {
                match t.test_val.as_ref() {
                    TestValue::Value(str) => {
                        let buf = if let Some(length) = t.length {
                            // An explicit length takes precedence.
                            haystack.read_exact_count(length as u64)?
                        } else {
                            match t.cmp_op {
                                CmpOp::Eq | CmpOp::Neq => {
                                    if !t.has_length_mod() {
                                        // Plain comparison: exactly the length
                                        // of the test string is needed.
                                        haystack.read_exact_count(str.len() as u64)?
                                    } else {
                                        // Modifiers may make the match longer
                                        // than the test string.
                                        haystack.read_count(FILE_BYTES_MAX as u64)?
                                    }
                                }
                                CmpOp::Lt | CmpOp::Gt => {
                                    // NOTE(review): 8092 differs from the 8192
                                    // limit used by the other delimited reads
                                    // of this function — possibly a typo.
                                    let read =
                                        haystack.read_until_any_delim_or_limit(b"\n\0", 8092)?;

                                    // Drop the trailing delimiter, if any.
                                    if read.ends_with(b"\0") || read.ends_with(b"\n") {
                                        &read[..read.len() - 1]
                                    } else {
                                        read
                                    }
                                }
                                _ => {
                                    return Err(Error::Msg(format!(
                                        "string test does not support {:?} operator",
                                        t.cmp_op
                                    )));
                                }
                            }
                        };

                        Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
                    }
                    TestValue::Any => {
                        let read = haystack.read_until_any_delim_or_limit(b"\0\n", 8192)?;
                        // Drop the trailing delimiter, if any.
                        let bytes = if read.ends_with(b"\0") || read.ends_with(b"\n") {
                            &read[..read.len() - 1]
                        } else {
                            read
                        };

                        Ok(Some(ReadValue::Bytes(test_value_offset, bytes)))
                    }
                }
            }

            Self::String16(t) => {
                match t.test_val.as_ref() {
                    TestValue::Value(str16) => {
                        // Two bytes per 16-bit unit.
                        let read = haystack.read_exact_count((str16.len() * 2) as u64)?;

                        Ok(Some(ReadValue::Bytes(test_value_offset, read)))
                    }
                    TestValue::Any => {
                        let read = haystack.read_until_utf16_or_limit(b"\x00\x00", 8192)?;

                        // Keep an even number of bytes so that only complete
                        // units are returned.
                        let end = if read.len() % 2 == 0 {
                            read.len()
                        } else {
                            read.len().saturating_sub(1)
                        };

                        Ok(Some(ReadValue::Bytes(test_value_offset, &read[..end])))
                    }
                }
            }

            Self::PString(t) => {
                let Some(read) = t.read(haystack)? else {
                    return Ok(None);
                };
                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
            }

            Self::Search(_) => {
                let buf = haystack.read_count(FILE_BYTES_MAX as u64)?;
                Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
            }

            Self::Regex(r) => {
                let length = {
                    match r.length {
                        Some(len) => {
                            // With `LineLimit` the length counts lines, each
                            // accounted for as 80 bytes.
                            if r.mods.contains(ReMod::LineLimit) {
                                len * 80
                            } else {
                                len
                            }
                        }

                        None => FILE_REGEX_MAX,
                    }
                };

                let read = haystack.read_count(length as u64)?;
                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
            }

            Self::Name(_)
            | Self::Use(_, _)
            | Self::Indirect(_)
            | Self::Clear
            | Self::Default
            | Self::Der => Err(Error::msg("no value to read for this test")),
        }
    }
1512
    /// Checks a value read from the stream against this test.
    ///
    /// `tv` is the value previously read for this test and `stream_kind` is
    /// forwarded to the regex matcher. Returns the match result when the test
    /// is satisfied, and `None` when it is not or when the kind of `tv` does
    /// not fit the kind of test.
    #[inline(always)]
    fn match_value<'s>(
        &'s self,
        tv: &ReadValue<'s>,
        stream_kind: StreamKind,
    ) -> Option<MatchRes<'s>> {
        match (self, tv) {
            (Self::Scalar(t), ReadValue::Scalar(o, ts)) => {
                // Apply the optional transformation first; a transformation
                // yielding `None` makes the whole test fail.
                let read_value: Scalar = match t.transform.as_ref() {
                    Some(t) => t.apply(*ts)?,
                    None => *ts,
                };

                match t.test_val {
                    TestValue::Value(test_value) => {
                        let ok = match t.cmp_op {
                            // equality against the bitwise complement of the test value
                            CmpOp::Not => read_value == !test_value,
                            CmpOp::Eq => read_value == test_value,
                            CmpOp::Lt => read_value < test_value,
                            CmpOp::Gt => read_value > test_value,
                            CmpOp::Neq => read_value != test_value,
                            // every bit of the test value must be set in the read value
                            CmpOp::BitAnd => read_value & test_value == test_value,
                            // no bit of the test value may be set in the read value
                            CmpOp::Xor => (read_value & test_value).is_zero(),
                        };

                        if ok {
                            Some(MatchRes::Scalar(*o, read_value))
                        } else {
                            None
                        }
                    }

                    TestValue::Any => Some(MatchRes::Scalar(*o, read_value)),
                }
            }

            (Self::Float(t), ReadValue::Float(o, f)) => {
                let read_value: Float = t.transform.as_ref().map(|t| t.apply(*f)).unwrap_or(*f);

                match t.test_val {
                    TestValue::Value(tf) => {
                        let ok = match t.cmp_op {
                            CmpOp::Eq => read_value == tf,
                            CmpOp::Lt => read_value < tf,
                            CmpOp::Gt => read_value > tf,
                            CmpOp::Neq => read_value != tf,
                            _ => {
                                // Panics in debug builds only; release builds
                                // log and treat the test as not matching.
                                debug_panic!("unsupported float comparison");
                                debug!("unsupported float comparison");
                                false
                            }
                        };

                        if ok {
                            Some(MatchRes::Float(*o, read_value))
                        } else {
                            None
                        }
                    }
                    TestValue::Any => Some(MatchRes::Float(*o, read_value)),
                }
            }

            (Self::String(st), ReadValue::Bytes(o, buf)) => {
                // Strips surrounding ASCII whitespace from the reported bytes
                // when the test carries the `Trim` modifier.
                macro_rules! trim_buf {
                    ($buf: expr) => {{
                        if st.mods.contains(StringMod::Trim) {
                            $buf.trim_ascii()
                        } else {
                            $buf
                        }
                    }};
                }

                match st.test_val.as_ref() {
                    TestValue::Value(str) => {
                        match st.cmp_op {
                            // On (in)equality the reported bytes are those of
                            // the test string, not of the stream buffer.
                            CmpOp::Eq => {
                                if let (true, _) = string_match(str, st.mods, buf) {
                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
                                } else {
                                    None
                                }
                            }
                            CmpOp::Neq => {
                                if let (false, _) = string_match(str, st.mods, buf) {
                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
                                } else {
                                    None
                                }
                            }
                            // NOTE(review): `>` and `<` compare the lengths of
                            // the buffer and of the test string, not their
                            // content — confirm this is the intended semantics.
                            CmpOp::Gt => {
                                if buf.len() > str.len() {
                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
                                } else {
                                    None
                                }
                            }
                            CmpOp::Lt => {
                                if buf.len() < str.len() {
                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
                                } else {
                                    None
                                }
                            }

                            _ => {
                                // Panics in debug builds only; release builds
                                // log and treat the test as not matching.
                                debug_panic!("unsupported string comparison");
                                debug!("unsupported string comparison");
                                None
                            }
                        }
                    }
                    TestValue::Any => {
                        Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
                    }
                }
            }

            (Self::PString(m), ReadValue::Bytes(o, buf)) => match m.test_val.as_ref() {
                TestValue::Value(psv) => {
                    if buf == psv {
                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8))
                    } else {
                        None
                    }
                }
                TestValue::Any => Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8)),
            },

            (Self::String16(t), ReadValue::Bytes(o, buf)) => {
                match t.test_val.as_ref() {
                    TestValue::Value(str16) => {
                        // Two bytes per UTF-16 code unit: a size mismatch can
                        // never match and keeps the indexing below in bounds.
                        if str16.len() * 2 != buf.len() {
                            return None;
                        }

                        for (i, utf16_char) in slice_to_utf16_iter(buf, t.encoding).enumerate() {
                            if str16[i] != utf16_char {
                                return None;
                            }
                        }

                        // The reported bytes come from `t.orig` rather than
                        // from the stream buffer.
                        Some(MatchRes::Bytes(
                            *o,
                            None,
                            t.orig.as_bytes(),
                            Encoding::Utf16(t.encoding),
                        ))
                    }

                    TestValue::Any => {
                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf16(t.encoding)))
                    }
                }
            }

            (Self::Regex(r), ReadValue::Bytes(o, buf)) => r.match_buf(*o, stream_kind, buf),

            (Self::Search(t), ReadValue::Bytes(o, buf)) => t.match_buf(*o, buf),

            // Any other (test, value) pairing cannot match.
            _ => None,
        }
    }
1686
1687 #[inline(always)]
1688 fn strength(&self) -> u64 {
1689 const MULT: usize = 10;
1690
1691 let mut out = 2 * MULT;
1692
1693 match self {
1695 Test::Scalar(s) => {
1696 out += s.ty.type_size() * MULT;
1697 }
1698
1699 Test::Float(t) => {
1700 out += t.ty.type_size() * MULT;
1701 }
1702
1703 Test::String(t) => out += t.test_value_len().saturating_mul(MULT),
1704
1705 Test::PString(t) => out += t.test_value_len().saturating_mul(MULT),
1706
1707 Test::Search(s) => {
1708 let n_pos = s.n_pos.unwrap_or(FILE_BYTES_MAX);
1713
1714 match n_pos {
1715 0..=80 => out += s.str.len().saturating_mul(MULT),
1717 81..=240 => out += s.str.len() * s.str.len().clamp(0, MULT - 2),
1719 _ => out += s.str.len(),
1721 }
1722 }
1723
1724 Test::Regex(r) => {
1725 let v = r.non_magic_len / r.re.captures_len();
1734
1735 let len = r
1736 .length
1737 .map(|l| {
1738 if r.mods.contains(ReMod::LineLimit) {
1739 l * 80
1740 } else {
1741 l
1742 }
1743 })
1744 .unwrap_or(FILE_BYTES_MAX);
1745
1746 match len {
1747 0..=80 => out += v.saturating_mul(MULT),
1749 81..=240 => out += v * v.clamp(0, MULT - 2),
1751 _ => out += v,
1753 }
1754 }
1755
1756 Test::String16(t) => {
1757 out += t.test_value_len().saturating_mul(MULT);
1762 }
1763
1764 Test::Der => out += MULT,
1765
1766 Test::Default | Test::Name(_) | Test::Use(_, _) | Test::Indirect(_) | Test::Clear => {
1767 return 0;
1768 }
1769 }
1770
1771 if self.is_match_any() {
1773 return 0;
1774 }
1775
1776 if let Some(op) = self.cmp_op() {
1777 match op {
1778 CmpOp::Neq => out = 0,
1780 CmpOp::Eq | CmpOp::Not => out += MULT,
1781 CmpOp::Lt | CmpOp::Gt => out -= 2 * MULT,
1782 CmpOp::Xor | CmpOp::BitAnd => out -= MULT,
1783 }
1784 }
1785
1786 out as u64
1787 }
1788
1789 #[inline(always)]
1790 fn cmp_op(&self) -> Option<CmpOp> {
1791 match self {
1792 Self::String(t) => Some(t.cmp_op),
1793 Self::Scalar(s) => Some(s.cmp_op),
1794 Self::Float(t) => Some(t.cmp_op),
1795 Self::Name(_)
1796 | Self::Use(_, _)
1797 | Self::Search(_)
1798 | Self::PString(_)
1799 | Self::Regex(_)
1800 | Self::Clear
1801 | Self::Default
1802 | Self::Indirect(_)
1803 | Self::String16(_)
1804 | Self::Der => None,
1805 }
1806 }
1807
1808 #[inline(always)]
1809 fn is_recursive(&self) -> bool {
1810 matches!(self, Test::Use(_, _) | Test::Indirect(_))
1811 }
1812
1813 #[inline(always)]
1814 fn is_match_any(&self) -> bool {
1815 match self {
1816 Test::Name(_) => false,
1817 Test::Use(_, _) => false,
1818 Test::Scalar(scalar_test) => matches!(scalar_test.test_val, TestValue::Any),
1819 Test::Float(float_test) => matches!(float_test.test_val, TestValue::Any),
1820 Test::String(string_test) => matches!(string_test.test_val, TestValue::Any),
1821 Test::Search(_) => false,
1822 Test::PString(pstring_test) => matches!(pstring_test.test_val, TestValue::Any),
1823 Test::Regex(_) => false,
1824 Test::Indirect(_) => false,
1825 Test::String16(string16_test) => matches!(string16_test.test_val, TestValue::Any),
1826 Test::Der => false,
1827 Test::Clear => false,
1828 Test::Default => false,
1829 }
1830 }
1831
1832 #[inline(always)]
1833 fn is_binary(&self) -> bool {
1834 match self {
1835 Self::Name(_) => true,
1836 Self::Use(_, _) => true,
1837 Self::Scalar(_) => true,
1838 Self::Float(_) => true,
1839 Self::String(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1840 Self::Search(t) => t.is_binary(),
1841 Self::PString(_) => true,
1842 Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1843 Self::Clear => true,
1844 Self::Default => true,
1845 Self::Indirect(_) => true,
1846 Self::String16(_) => true,
1847 Self::Der => true,
1848 }
1849 }
1850
1851 #[inline(always)]
1852 fn is_text(&self) -> bool {
1853 match self {
1854 Self::Name(_) => true,
1855 Self::Use(_, _) => true,
1856 Self::Indirect(_) => true,
1857 Self::Clear => true,
1858 Self::Default => true,
1859 Self::String(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1860 Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1861 _ => !self.is_binary(),
1862 }
1863 }
1864
1865 #[inline(always)]
1866 fn is_only_text(&self) -> bool {
1867 self.is_text() && !self.is_binary()
1868 }
1869
1870 #[inline(always)]
1871 fn is_only_binary(&self) -> bool {
1872 self.is_binary() && !self.is_text()
1873 }
1874}
1875
/// Encoding of the value stored at the location of an indirect offset.
///
/// It selects how `IndOffset::read_offset` decodes the bytes it reads; for
/// the integer variants, signedness comes from `IndOffset::signed`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum OffsetType {
    /// One byte.
    Byte,
    /// Little-endian `f64`, converted to an integer.
    DoubleLe,
    /// Big-endian `f64`, converted to an integer.
    DoubleBe,
    /// Little-endian 16-bit integer.
    ShortLe,
    /// Big-endian 16-bit integer.
    ShortBe,
    /// Little-endian `u32` passed through `decode_id3`.
    Id3Le,
    /// Big-endian `u32` passed through `decode_id3`.
    Id3Be,
    /// Little-endian 32-bit integer.
    LongLe,
    /// Big-endian 32-bit integer.
    LongBe,
    /// Value read with the `read_me!` macro.
    // NOTE(review): presumably a middle-endian 32-bit integer — confirm.
    Middle,
    /// Value parsed by `read_octal_u64`.
    Octal,
    /// Big-endian 64-bit integer.
    QuadBe,
    /// Little-endian 64-bit integer.
    QuadLe,
}
1892
/// Right-hand operand of the operation applied to an indirect offset.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Shift {
    /// The operand is the given value itself.
    Direct(u64),
    /// The operand is read from the stream, at the address the offset was
    /// read from plus the given displacement.
    Indirect(i64),
}
1898
/// An offset whose value has to be read from the stream.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
struct IndOffset {
    /// Location in the stream the offset value is read from.
    off_addr: DirOffset,
    /// Whether integer values are decoded as signed.
    signed: bool,
    /// Encoding of the value stored at `off_addr`.
    ty: OffsetType,
    /// Operation applied to the value read; only used when `shift` is set too.
    op: Option<Op>,
    /// Right-hand operand of `op`; only used when `op` is set too.
    shift: Option<Shift>,
}
1910
1911impl IndOffset {
1912 fn read_offset<D: DataRead>(
1914 &self,
1915 haystack: &mut D,
1916 rule_base_offset: Option<u64>,
1917 last_upper_match_offset: Option<u64>,
1918 ) -> Result<Option<u64>, io::Error> {
1919 let offset_address = match self.off_addr {
1920 DirOffset::Start(s) => {
1921 let Some(o) = s.checked_add(rule_base_offset.unwrap_or_default()) else {
1922 return Ok(None);
1923 };
1924
1925 haystack.seek(SeekFrom::Start(o))?
1926 }
1927 DirOffset::LastUpper(c) => haystack.seek(SeekFrom::Start(
1928 (last_upper_match_offset.unwrap_or_default() as i64 + c) as u64,
1929 ))?,
1930 DirOffset::End(e) => haystack.seek(SeekFrom::End(e))?,
1931 };
1932
1933 macro_rules! read_value {
1934 () => {
1935 match self.ty {
1936 OffsetType::Byte => {
1937 if self.signed {
1938 read_le!(haystack, u8) as u64
1939 } else {
1940 read_le!(haystack, i8) as u64
1941 }
1942 }
1943 OffsetType::DoubleLe => read_le!(haystack, f64) as u64,
1944 OffsetType::DoubleBe => read_be!(haystack, f64) as u64,
1945 OffsetType::ShortLe => {
1946 if self.signed {
1947 read_le!(haystack, i16) as u64
1948 } else {
1949 read_le!(haystack, u16) as u64
1950 }
1951 }
1952 OffsetType::ShortBe => {
1953 if self.signed {
1954 read_be!(haystack, i16) as u64
1955 } else {
1956 read_be!(haystack, u16) as u64
1957 }
1958 }
1959 OffsetType::Id3Le => decode_id3(read_le!(haystack, u32)) as u64,
1960 OffsetType::Id3Be => decode_id3(read_be!(haystack, u32)) as u64,
1961 OffsetType::LongLe => {
1962 if self.signed {
1963 read_le!(haystack, i32) as u64
1964 } else {
1965 read_le!(haystack, u32) as u64
1966 }
1967 }
1968 OffsetType::LongBe => {
1969 if self.signed {
1970 read_be!(haystack, i32) as u64
1971 } else {
1972 read_be!(haystack, u32) as u64
1973 }
1974 }
1975 OffsetType::Middle => read_me!(haystack) as u64,
1976 OffsetType::Octal => {
1977 if let Some(o) = read_octal_u64(haystack) {
1978 o
1979 } else {
1980 debug!("failed to read octal offset @ {offset_address}");
1981 return Ok(None);
1982 }
1983 }
1984 OffsetType::QuadLe => {
1985 if self.signed {
1986 read_le!(haystack, i64) as u64
1987 } else {
1988 read_le!(haystack, u64)
1989 }
1990 }
1991 OffsetType::QuadBe => {
1992 if self.signed {
1993 read_be!(haystack, i64) as u64
1994 } else {
1995 read_be!(haystack, u64)
1996 }
1997 }
1998 }
1999 };
2000 }
2001
2002 let o = read_value!();
2004
2005 trace!(
2006 "offset read @ {offset_address} value={o} op={:?} shift={:?}",
2007 self.op, self.shift
2008 );
2009
2010 if let (Some(op), Some(shift)) = (self.op, self.shift) {
2012 let shift = match shift {
2013 Shift::Direct(i) => i,
2014 Shift::Indirect(i) => {
2015 let tmp = offset_address as i128 + i as i128;
2016 if tmp.is_negative() {
2017 return Ok(None);
2018 } else {
2019 haystack.seek(SeekFrom::Start(tmp as u64))?;
2020 };
2021 read_value!()
2024 }
2025 };
2026
2027 match op {
2028 Op::Add => return Ok(o.checked_add(shift)),
2029 Op::Mul => return Ok(o.checked_mul(shift)),
2030 Op::Sub => return Ok(o.checked_sub(shift)),
2031 Op::Div => return Ok(o.checked_div(shift)),
2032 Op::Mod => return Ok(o.checked_rem(shift)),
2033 Op::And => return Ok(Some(o & shift)),
2034 Op::Or => return Ok(Some(o | shift)),
2035 Op::Xor => return Ok(Some(o ^ shift)),
2036 }
2037 }
2038
2039 Ok(Some(o))
2040 }
2041}
2042
/// An offset expressed directly, without reading the stream.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum DirOffset {
    /// Offset counted from the start.
    Start(u64),
    /// Displacement relative to the offset reached by the upper level match.
    LastUpper(i64),
    /// Displacement relative to the end of the stream, handed over to
    /// `SeekFrom::End`.
    End(i64),
}
2050
/// Location at which a test is applied.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Offset {
    /// The location is known without reading the stream.
    Direct(DirOffset),
    /// The location has to be read from the stream.
    Indirect(IndOffset),
}
2056
2057impl Offset {
2058 #[inline(always)]
2059 fn is_indirect(&self) -> bool {
2060 matches!(self, Self::Indirect(_))
2061 }
2062}
2063
2064impl From<DirOffset> for Offset {
2065 fn from(value: DirOffset) -> Self {
2066 Self::Direct(value)
2067 }
2068}
2069
2070impl From<IndOffset> for Offset {
2071 fn from(value: IndOffset) -> Self {
2072 Self::Indirect(value)
2073 }
2074}
2075
2076impl Display for DirOffset {
2077 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2078 match self {
2079 DirOffset::Start(i) => write!(f, "{i}"),
2080 DirOffset::LastUpper(c) => write!(f, "&{c}"),
2081 DirOffset::End(e) => write!(f, "-{e}"),
2082 }
2083 }
2084}
2085
2086impl Default for DirOffset {
2087 fn default() -> Self {
2088 Self::LastUpper(0)
2089 }
2090}
2091
/// A single test entry of a magic rule.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Match {
    /// Line of the entry in its source, reported in logs and errors.
    line: usize,
    /// Nesting depth of the entry, also used as its continuation level.
    depth: u8,
    /// Where the test is applied.
    offset: Offset,
    /// The test to run.
    test: Test,
    /// Strength of `test`, computed once when the entry is built.
    test_strength: u64,
    /// Message emitted when the test matches, if any.
    message: Option<Message>,
}
2101
2102impl From<Use> for Match {
2103 fn from(value: Use) -> Self {
2104 let test = Test::Use(value.switch_endianness, value.rule_name);
2105 let test_strength = test.strength();
2106 Self {
2107 line: value.line,
2108 depth: value.depth,
2109 offset: value.start_offset,
2110 test,
2111 test_strength,
2112 message: value.message,
2113 }
2114 }
2115}
2116
2117impl From<Name> for Match {
2118 fn from(value: Name) -> Self {
2119 let test = Test::Name(value.name);
2120 let test_strength = test.strength();
2121 Self {
2122 line: value.line,
2123 depth: 0,
2124 offset: Offset::Direct(DirOffset::Start(0)),
2125 test,
2126 test_strength,
2127 message: value.message,
2128 }
2129 }
2130}
2131
2132impl Match {
2133 #[inline(always)]
2135 fn offset_from_start<D: DataRead>(
2136 &self,
2137 haystack: &mut D,
2138 rule_base_offset: Option<u64>,
2139 last_level_offset: Option<u64>,
2140 ) -> Result<Option<u64>, io::Error> {
2141 match self.offset {
2142 Offset::Direct(dir_offset) => match dir_offset {
2143 DirOffset::Start(s) => Ok(Some(s)),
2144 DirOffset::LastUpper(shift) => {
2145 let o = last_level_offset.unwrap_or_default() as i64 + shift;
2146
2147 if o >= 0 { Ok(Some(o as u64)) } else { Ok(None) }
2148 }
2149 DirOffset::End(e) => Ok(Some(haystack.offset_from_start(SeekFrom::End(e)))),
2150 },
2151 Offset::Indirect(ind_offset) => {
2152 let Some(o) =
2153 ind_offset.read_offset(haystack, rule_base_offset, last_level_offset)?
2154 else {
2155 return Ok(None);
2156 };
2157
2158 Ok(Some(o))
2159 }
2160 }
2161 }
2162
2163 #[inline]
2176 #[allow(clippy::too_many_arguments)]
2177 fn matches<'a: 'h, 'h, D: DataRead>(
2178 &'a self,
2179 source: Option<&str>,
2180 magic: &mut Magic<'a>,
2181 stream_kind: StreamKind,
2182 state: &mut MatchState,
2183 buf_base_offset: Option<u64>,
2184 rule_base_offset: Option<u64>,
2185 last_level_offset: Option<u64>,
2186 haystack: &'h mut D,
2187 switch_endianness: bool,
2188 db: &'a MagicDb,
2189 depth: usize,
2190 ) -> Result<(bool, Option<MatchRes<'h>>), Error> {
2191 let source = source.unwrap_or("unknown");
2192 let line = self.line;
2193
2194 if depth >= MAX_RECURSION {
2195 return Err(Error::localized(
2196 source,
2197 line,
2198 Error::MaximumRecursion(MAX_RECURSION),
2199 ));
2200 }
2201
2202 if self.test.is_only_binary() && stream_kind.is_text() {
2203 trace!("skip binary test source={source} line={line} stream_kind={stream_kind:?}",);
2204 return Ok((false, None));
2205 }
2206
2207 if self.test.is_only_text() && !stream_kind.is_text() {
2208 trace!("skip text test source={source} line={line} stream_kind={stream_kind:?}",);
2209 return Ok((false, None));
2210 }
2211
2212 let Ok(Some(mut offset)) = self
2213 .offset_from_start(haystack, rule_base_offset, last_level_offset)
2214 .inspect_err(|e| debug!("source={source} line={line} failed at computing offset: {e}"))
2215 else {
2216 return Ok((false, None));
2217 };
2218
2219 offset = match self.offset {
2220 Offset::Indirect(_) => {
2221 buf_base_offset.unwrap_or_default().saturating_add(offset)
2226 }
2227 Offset::Direct(DirOffset::Start(_)) => {
2229 rule_base_offset.unwrap_or_default().saturating_add(offset)
2230 }
2231 _ => offset,
2232 };
2233
2234 match &self.test {
2235 Test::Clear => {
2236 trace!("source={source} line={line} clear");
2237 state.clear_continuation_level(&self.continuation_level());
2238 Ok((true, None))
2239 }
2240
2241 Test::Name(name) => {
2242 trace!(
2243 "source={source} line={line} running rule {name} switch_endianness={switch_endianness}",
2244 );
2245 Ok((true, None))
2246 }
2247
2248 Test::Use(flip_endianness, rule_name) => {
2249 trace!(
2250 "source={source} line={line} use {rule_name} switch_endianness={flip_endianness}",
2251 );
2252
2253 let switch_endianness = switch_endianness ^ flip_endianness;
2255
2256 let dr: &DependencyRule = db.dependencies.get(rule_name).ok_or(
2257 Error::localized(source, line, Error::MissingRule(rule_name.clone())),
2258 )?;
2259
2260 if let Some(msg) = self.message.as_ref() {
2262 magic.push_message(msg.to_string_lossy());
2263 }
2264
2265 let new_buf_base_off = if self.offset.is_indirect() {
2266 Some(offset)
2267 } else {
2268 None
2269 };
2270
2271 let nmatch = dr.rule.magic(
2272 magic,
2273 stream_kind,
2274 new_buf_base_off,
2275 Some(offset),
2276 haystack,
2277 db,
2278 switch_endianness,
2279 depth.saturating_add(1),
2280 )?;
2281
2282 let matched = nmatch > 0;
2285 if matched {
2286 state.set_continuation_level(self.continuation_level());
2287 }
2288
2289 Ok((matched, None))
2290 }
2291
2292 Test::Indirect(m) => {
2293 trace!(
2294 "source={source} line={line} indirect mods={:?} offset={offset:#x}",
2295 m
2296 );
2297
2298 let new_buf_base_off = if m.contains(IndirectMod::Relative) {
2299 Some(offset)
2300 } else {
2301 None
2302 };
2303
2304 if let Some(msg) = self.message.as_ref() {
2306 magic.push_message(msg.to_string_lossy());
2307 }
2308
2309 let mut nmatch = 0u64;
2310 for r in db.rules.iter() {
2311 nmatch = nmatch.saturating_add(r.magic(
2312 magic,
2313 stream_kind,
2314 new_buf_base_off,
2315 Some(offset),
2316 haystack,
2317 db,
2318 false,
2319 depth.saturating_add(1),
2320 )?);
2321
2322 if nmatch > 0 {
2323 break;
2324 }
2325 }
2326
2327 Ok((nmatch > 0, None))
2328 }
2329
2330 Test::Default => {
2331 let ok = !state.get_continuation_level(&self.continuation_level());
2333
2334 trace!("source={source} line={line} default match={ok}");
2335 if ok {
2336 state.set_continuation_level(self.continuation_level());
2337 }
2338
2339 Ok((ok, None))
2340 }
2341
2342 _ => {
2343 if let Err(e) = haystack.seek(SeekFrom::Start(offset)) {
2344 debug!("source={source} line={line} failed to seek in haystack: {e}");
2345 return Ok((false, None));
2346 }
2347
2348 let mut trace_msg = None;
2349
2350 if enabled!(Level::DEBUG) {
2351 trace_msg = Some(vec![format!(
2352 "source={source} line={line} depth={} stream_offset={:#x}",
2353 self.depth,
2354 haystack.stream_position()
2355 )])
2356 }
2357
2358 if let Ok(opt_test_value) = self
2362 .test
2363 .read_test_value(haystack, switch_endianness)
2364 .inspect_err(|e| {
2365 debug!("source={source} line={line} error while reading test value @{offset}: {e}",)
2366 })
2367 {
2368 if let Some(v) = trace_msg
2369 .as_mut() { v.push(format!("test={}", self.test)) }
2370
2371 if let Some(v) = trace_msg.as_mut(){
2372 let drv = match opt_test_value.as_ref(){
2373 Some(r) => format!("{r:?}"),
2374 None =>String::new(),
2375 };
2376 v.push(format!("read_in_stream={drv}"))
2377 }
2378
2379 let match_res =
2380 opt_test_value.and_then(|tv| self.test.match_value(&tv, stream_kind));
2381
2382 if let Some(v) = trace_msg.as_mut() { v.push(format!(
2383 "message=\"{}\" match={}",
2384 self.message
2385 .as_ref()
2386 .map(|fs| fs.to_string_lossy())
2387 .unwrap_or_default(),
2388 match_res.is_some()
2389 )) }
2390
2391 if enabled!(Level::DEBUG) && !enabled!(Level::TRACE) && match_res.is_some() {
2393 if let Some(m) = trace_msg{
2394 debug!("{}", m.join(" "));
2395 }
2396 } else if enabled!(Level::TRACE)
2397 && let Some(m) = trace_msg{
2398 trace!("{}", m.join(" "));
2399 }
2400
2401 if let Some(mr) = match_res {
2402 state.set_continuation_level(self.continuation_level());
2403 return Ok((true, Some(mr)));
2404 }
2405 }
2406
2407 Ok((false, None))
2408 }
2409 }
2410 }
2411
2412 #[inline(always)]
2413 fn continuation_level(&self) -> ContinuationLevel {
2414 ContinuationLevel(self.depth)
2415 }
2416}
2417
/// A parsed `use` directive, converted into a [`Match`] running a
/// `Test::Use`.
#[derive(Debug, Clone)]
struct Use {
    /// Line of the directive in its source.
    line: usize,
    /// Nesting depth of the directive.
    depth: u8,
    /// Offset at which the referenced rule is applied.
    start_offset: Offset,
    /// Name of the rule to run.
    rule_name: String,
    /// Whether endianness is flipped while running the referenced rule.
    switch_endianness: bool,
    /// Message attached to the directive, if any.
    message: Option<Message>,
}
2427
/// Modifier applied to the strength of an entry.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StrengthMod {
    /// Operation combining the strength with `by`.
    op: Op,
    /// Right-hand operand of the operation.
    by: u8,
}
2433
2434impl StrengthMod {
2435 #[inline(always)]
2436 fn apply(&self, strength: u64) -> u64 {
2437 let by = self.by as u64;
2438 debug!("applying strength modifier: {strength} {} {}", self.op, by);
2439 match self.op {
2440 Op::Mul => strength.saturating_mul(by),
2441 Op::Add => strength.saturating_add(by),
2442 Op::Sub => strength.saturating_sub(by),
2443 Op::Div => {
2444 if by > 0 {
2445 strength.saturating_div(by)
2446 } else {
2447 strength
2448 }
2449 }
2450 Op::Mod => strength % by,
2451 Op::And => strength & by,
2452 Op::Xor | Op::Or => {
2455 debug_panic!("unsupported strength operator");
2456 strength
2457 }
2458 }
2459 }
2460}
2461
/// Metadata attached to an entry rather than a test.
// NOTE(review): presumably built by the parser from `!:` directives — confirm.
#[derive(Debug, Clone)]
enum Flag {
    /// A MIME type.
    Mime(String),
    /// A set of file extensions.
    Ext(HashSet<String>),
    /// A strength modifier.
    Strength(StrengthMod),
    /// An Apple related code.
    // NOTE(review): likely the creator/type code exposed via `Magic` — confirm.
    Apple(String),
}
2469
/// A parsed `name` directive, converted into a [`Match`] running a
/// `Test::Name`.
#[derive(Debug, Clone)]
struct Name {
    /// Line of the directive in its source.
    line: usize,
    /// Name given to the rule.
    name: String,
    /// Message attached to the directive, if any.
    message: Option<Message>,
}
2476
/// An item of a rule paired with the `Span` it comes with.
// NOTE(review): the span presumably locates the item in the parsed input — confirm.
#[derive(Debug, Clone)]
enum Entry<'span> {
    /// A test entry.
    Match(Span<'span>, Match),
    /// A metadata flag.
    Flag(Span<'span>, Flag),
}
2482
/// A node of the tree of entries making a rule.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EntryNode {
    /// Whether the node is the root of its rule.
    root: bool,
    /// The test entry of the node.
    entry: Match,
    /// Entries evaluated only when `entry` matches.
    children: Vec<EntryNode>,
    /// MIME type set on the result when `entry` matches.
    mimetype: Option<String>,
    /// Creator code set on the result when `entry` matches.
    apple: Option<String>,
    /// Modifier applied to the strength of `entry` when it matches.
    strength_mod: Option<StrengthMod>,
    /// Extensions added to the result when `entry` matches.
    exts: HashSet<String>,
}
2493
/// Accumulator filled while walking the entries of a rule.
#[derive(Debug, Default)]
struct EntryNodeVisitor {
    /// Every extension met so far.
    exts: HashSet<String>,
    /// Score accumulated so far.
    score: u64,
}

impl EntryNodeVisitor {
    /// Creates an empty visitor: no extension and a score of 0.
    fn new() -> Self {
        Self::default()
    }

    /// Folds `other` into `self`: extensions are united, scores are summed.
    fn merge(&mut self, other: Self) {
        let Self {
            exts: other_exts,
            score: other_score,
        } = other;

        self.exts.extend(other_exts);
        self.score += other_score;
    }
}
2512
2513impl EntryNode {
2514 #[inline]
2515 fn update_visitor(&self, v: &mut EntryNodeVisitor, depth: usize) {
2516 for ext in self.exts.iter() {
2518 if !v.exts.contains(ext) {
2519 v.exts.insert(ext.clone());
2520 }
2521 }
2522
2523 if depth == 0 {
2525 v.score += self.entry.test_strength;
2526 }
2527
2528 v.score += self
2532 .children
2533 .iter()
2534 .map(|e| e.entry.test_strength)
2535 .min()
2536 .unwrap_or_default()
2537 / max(1, depth as u64);
2538 }
2539
2540 fn visit(
2541 &self,
2542 v: &mut EntryNodeVisitor,
2543 deps: &HashMap<String, DependencyRule>,
2544 marked: &mut HashSet<String>,
2545 depth: usize,
2546 ) -> Result<(), Error> {
2547 self.update_visitor(v, depth);
2549
2550 for c in self.children.iter() {
2552 if let Test::Use(_, ref name) = c.entry.test {
2553 if marked.contains(name) {
2554 continue;
2555 }
2556
2557 marked.insert(name.clone());
2558
2559 if let Some(r) = deps.get(name) {
2560 let dv = r.rule.visit_all_entries(deps, marked)?;
2561 v.merge(dv);
2562 } else {
2563 return Err(Error::MissingRule(name.clone()));
2564 }
2565 } else {
2566 c.visit(v, deps, marked, depth + 1)?;
2567 }
2568 }
2569
2570 Ok(())
2571 }
2572
2573 #[inline]
2576 #[allow(clippy::too_many_arguments)]
2577 fn matches<'r, D: DataRead>(
2578 &'r self,
2579 opt_source: Option<&str>,
2580 magic: &mut Magic<'r>,
2581 state: &mut MatchState,
2582 stream_kind: StreamKind,
2583 buf_base_offset: Option<u64>,
2584 rule_base_offset: Option<u64>,
2585 last_level_offset: Option<u64>,
2586 haystack: &mut D,
2587 db: &'r MagicDb,
2588 switch_endianness: bool,
2589 depth: usize,
2590 ) -> Result<u64, Error> {
2591 let mut nmatch = 0u64;
2592
2593 let (ok, opt_match_res) = self.entry.matches(
2594 opt_source,
2595 magic,
2596 stream_kind,
2597 state,
2598 buf_base_offset,
2599 rule_base_offset,
2600 last_level_offset,
2601 haystack,
2602 switch_endianness,
2603 db,
2604 depth,
2605 )?;
2606
2607 let source = opt_source.unwrap_or("unknown");
2608 let line = self.entry.line;
2609
2610 if ok {
2611 if !self.entry.test.is_recursive()
2615 && let Some(msg) = self.entry.message.as_ref()
2616 && let Ok(msg) = msg.format_with(opt_match_res.as_ref()).inspect_err(|e| {
2617 debug!("source={source} line={line} failed to format message: {e}")
2618 })
2619 {
2620 nmatch = nmatch.saturating_add(1);
2621 magic.push_message(msg);
2622 }
2623
2624 if let Some(mr) = opt_match_res {
2626 match &self.entry.test {
2627 Test::String(t) if t.has_length_mod() => {
2628 let o = mr.end_offset();
2629 haystack.seek(SeekFrom::Start(o))?;
2630 }
2631 Test::Search(t) => {
2632 if t.re_mods.contains(ReMod::StartOffsetUpdate) {
2633 let o = mr.start_offset();
2634 haystack.seek(SeekFrom::Start(o))?;
2635 } else {
2636 let o = mr.end_offset();
2637 haystack.seek(SeekFrom::Start(o))?;
2638 }
2639 }
2640
2641 Test::Regex(t) => {
2642 if t.mods.contains(ReMod::StartOffsetUpdate) {
2643 let o = mr.start_offset();
2644 haystack.seek(SeekFrom::Start(o))?;
2645 } else {
2646 let o = mr.end_offset();
2647 haystack.seek(SeekFrom::Start(o))?;
2648 }
2649 }
2650 _ => {}
2652 }
2653 }
2654
2655 if let Some(mimetype) = self.mimetype.as_ref() {
2656 magic.set_mime_type(Cow::Borrowed(mimetype));
2657 }
2658
2659 if let Some(apple_ty) = self.apple.as_ref() {
2660 magic.set_creator_code(Cow::Borrowed(apple_ty));
2661 }
2662
2663 if !self.exts.is_empty() {
2664 magic.insert_extensions(self.exts.iter().map(|s| s.as_str()));
2665 }
2666
2667 let mut strength = self.entry.test_strength;
2671
2672 let continuation_level = self.entry.continuation_level().0 as u64;
2673 if self.entry.message.is_none() && continuation_level < 3 {
2674 strength = strength.saturating_add(continuation_level);
2675 }
2676
2677 if let Some(sm) = self.strength_mod.as_ref() {
2678 strength = sm.apply(strength);
2679 }
2680
2681 if self.entry.message.is_none() {
2683 strength += 1
2684 }
2685
2686 magic.update_strength(strength);
2687
2688 let end_upper_level = haystack.stream_position();
2689
2690 let rule_base_offset = if self.root {
2698 match self.entry.offset {
2699 Offset::Direct(DirOffset::End(o)) => {
2700 Some(haystack.offset_from_start(SeekFrom::End(o)))
2701 }
2702 _ => rule_base_offset,
2703 }
2704 } else {
2705 rule_base_offset
2706 };
2707
2708 for e in self.children.iter() {
2709 nmatch = nmatch.saturating_add(e.matches(
2710 opt_source,
2711 magic,
2712 state,
2713 stream_kind,
2714 buf_base_offset,
2715 rule_base_offset,
2716 Some(end_upper_level),
2717 haystack,
2718 db,
2719 switch_endianness,
2720 depth,
2721 )?);
2722 }
2723 }
2724
2725 Ok(nmatch)
2726 }
2727}
2728
/// A magic rule: a tree of test entries along with the metadata computed
/// for it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicRule {
    /// Identifier assigned to the rule.
    id: usize,
    /// Name of the source the rule comes from, if known.
    source: Option<String>,
    /// Root of the tree of entries.
    entries: EntryNode,
    /// Extensions collected over the rule when it is finalized.
    extensions: HashSet<String>,
    /// Score computed over the rule when it is finalized.
    score: u64,
    /// Whether `extensions` and `score` have been computed already.
    finalized: bool,
}
2740
2741impl MagicRule {
2742 #[inline(always)]
2743 fn set_id(&mut self, id: usize) {
2744 self.id = id
2745 }
2746
2747 fn visit_all_entries(
2748 &self,
2749 deps: &HashMap<String, DependencyRule>,
2750 marked: &mut HashSet<String>,
2751 ) -> Result<EntryNodeVisitor, Error> {
2752 let mut v = EntryNodeVisitor::new();
2753 self.entries.visit(&mut v, deps, marked, 0)?;
2754 Ok(v)
2755 }
2756
2757 fn try_finalize(&mut self, deps: &HashMap<String, DependencyRule>) -> Result<(), Error> {
2760 if self.finalized {
2761 return Ok(());
2762 }
2763
2764 let v = self.visit_all_entries(deps, &mut HashSet::new())?;
2766
2767 self.extensions.extend(v.exts);
2768 self.score = v.score;
2769 self.finalized = true;
2770
2771 Ok(())
2772 }
2773
2774 #[inline]
2775 fn magic_entrypoint<'r, D: DataRead>(
2776 &'r self,
2777 magic: &mut Magic<'r>,
2778 stream_kind: StreamKind,
2779 haystack: &mut D,
2780 db: &'r MagicDb,
2781 switch_endianness: bool,
2782 depth: usize,
2783 ) -> Result<u64, Error> {
2784 self.entries.matches(
2785 self.source.as_deref(),
2786 magic,
2787 &mut MatchState::empty(),
2788 stream_kind,
2789 None,
2790 None,
2791 None,
2792 haystack,
2793 db,
2794 switch_endianness,
2795 depth,
2796 )
2797 }
2798
2799 #[inline]
2802 #[allow(clippy::too_many_arguments)]
2803 fn magic<'r, D: DataRead>(
2804 &'r self,
2805 magic: &mut Magic<'r>,
2806 stream_kind: StreamKind,
2807 buf_base_offset: Option<u64>,
2808 rule_base_offset: Option<u64>,
2809 haystack: &mut D,
2810 db: &'r MagicDb,
2811 switch_endianness: bool,
2812 depth: usize,
2813 ) -> Result<u64, Error> {
2814 self.entries.matches(
2815 self.source.as_deref(),
2816 magic,
2817 &mut MatchState::empty(),
2818 stream_kind,
2819 buf_base_offset,
2820 rule_base_offset,
2821 None,
2822 haystack,
2823 db,
2824 switch_endianness,
2825 depth,
2826 )
2827 }
2828
2829 pub fn is_text(&self) -> bool {
2835 self.entries.entry.test.is_text()
2836 && self.entries.children.iter().all(|e| e.entry.test.is_text())
2837 }
2838
2839 #[inline(always)]
2845 pub fn score(&self) -> u64 {
2846 self.score
2847 }
2848
2849 #[inline(always)]
2855 pub fn source(&self) -> Option<&str> {
2856 self.source.as_deref()
2857 }
2858
2859 #[inline(always)]
2865 pub fn line(&self) -> usize {
2866 self.entries.entry.line
2867 }
2868
2869 #[inline(always)]
2875 pub fn extensions(&self) -> &HashSet<String> {
2876 &self.extensions
2877 }
2878}
2879
/// A named rule, which `use` tests refer to by name.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct DependencyRule {
    /// Name of the rule.
    name: String,
    /// The rule itself.
    rule: MagicRule,
}
2885
/// The rules parsed out of a magic file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicSource {
    /// The rules of the source.
    rules: Vec<MagicRule>,
    /// The named rules of the source, indexed by name.
    dependencies: HashMap<String, DependencyRule>,
}
2896
2897impl MagicSource {
2898 pub fn open<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
2908 FileMagicParser::parse_file(p)
2909 }
2910}
2911
/// Continuation level of an entry: a wrapper around its depth, used to
/// index the flags held by `MatchState`.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct ContinuationLevel(u8);
2914
/// Encoding of a text stream.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum TextEncoding {
    /// ASCII text.
    Ascii,
    /// UTF-8 text.
    Utf8,
    /// Text in an encoding that is not identified.
    Unknown,
}

impl TextEncoding {
    /// Returns the label of the encoding.
    const fn as_magic_str(&self) -> &'static str {
        match *self {
            Self::Ascii => "ASCII",
            Self::Utf8 => "UTF-8",
            Self::Unknown => "Unknown",
        }
    }
}
2932
2933#[derive(Debug, PartialEq, Eq, Clone, Copy)]
2934enum StreamKind {
2935 Binary,
2936 Text(TextEncoding),
2937}
2938
2939impl StreamKind {
2940 const fn is_text(&self) -> bool {
2941 matches!(self, StreamKind::Text(_))
2942 }
2943}
2944
2945#[derive(Debug)]
2946struct MatchState {
2947 continuation_levels: [bool; 256],
2948}
2949
2950impl MatchState {
2951 #[inline(always)]
2952 fn empty() -> Self {
2953 MatchState {
2954 continuation_levels: [false; 256],
2955 }
2956 }
2957
2958 #[inline(always)]
2959 fn get_continuation_level(&mut self, level: &ContinuationLevel) -> bool {
2960 self.continuation_levels
2961 .get(level.0 as usize)
2962 .cloned()
2963 .unwrap_or_default()
2964 }
2965
2966 #[inline(always)]
2967 fn set_continuation_level(&mut self, level: ContinuationLevel) {
2968 if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2969 *b = true
2970 }
2971 }
2972
2973 #[inline(always)]
2974 fn clear_continuation_level(&mut self, level: &ContinuationLevel) {
2975 if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2976 *b = false;
2977 }
2978 }
2979}
2980
/// Result of the identification of a stream.
#[derive(Debug, Default)]
pub struct Magic<'m> {
    /// Kind of stream the result was computed for, if set.
    stream_kind: Option<StreamKind>,
    /// Source of the rule which produced the result, if set.
    source: Option<Cow<'m, str>>,
    /// Parts of the message, in the order they were pushed.
    message: Vec<Cow<'m, str>>,
    /// MIME type set by the matching entries, if any.
    mime_type: Option<Cow<'m, str>>,
    /// Creator code set by the matching entries, if any.
    creator_code: Option<Cow<'m, str>>,
    /// Strength accumulated over the matching entries.
    strength: u64,
    /// Extensions collected over the matching entries.
    exts: HashSet<Cow<'m, str>>,
    /// Default-result marker.
    // NOTE(review): presumably set when no rule matched — confirm.
    is_default: bool,
}
2993
2994impl<'m> Magic<'m> {
2995 #[inline(always)]
2996 fn set_source(&mut self, source: Option<&'m str>) {
2997 self.source = source.map(Cow::Borrowed);
2998 }
2999
3000 #[inline(always)]
3001 fn set_stream_kind(&mut self, stream_kind: StreamKind) {
3002 self.stream_kind = Some(stream_kind)
3003 }
3004
3005 #[inline(always)]
3006 fn reset(&mut self) {
3007 self.stream_kind = None;
3008 self.source = None;
3009 self.message.clear();
3010 self.mime_type = None;
3011 self.creator_code = None;
3012 self.strength = 0;
3013 self.exts.clear();
3014 self.is_default = false;
3015 }
3016
3017 #[inline]
3025 pub fn into_owned<'owned>(self) -> Magic<'owned> {
3026 Magic {
3027 stream_kind: self.stream_kind,
3028 source: self.source.map(|s| Cow::Owned(s.into_owned())),
3029 message: self
3030 .message
3031 .into_iter()
3032 .map(Cow::into_owned)
3033 .map(Cow::Owned)
3034 .collect(),
3035 mime_type: self.mime_type.map(|m| Cow::Owned(m.into_owned())),
3036 creator_code: self.creator_code.map(|m| Cow::Owned(m.into_owned())),
3037 strength: self.strength,
3038 exts: self
3039 .exts
3040 .into_iter()
3041 .map(|e| Cow::Owned(e.into_owned()))
3042 .collect(),
3043 is_default: self.is_default,
3044 }
3045 }
3046
3047 #[inline(always)]
3053 pub fn message(&self) -> String {
3054 let mut out = String::new();
3055 for (i, m) in self.message.iter().enumerate() {
3056 if let Some(s) = m.strip_prefix(r#"\b"#) {
3057 out.push_str(s);
3058 } else {
3059 if i > 0 {
3061 out.push(' ');
3062 }
3063 out.push_str(m);
3064 }
3065 }
3066 out
3067 }
3068
3069 #[inline]
3080 pub fn message_parts(&self) -> impl Iterator<Item = &str> {
3081 self.message.iter().map(|p| p.as_ref())
3082 }
3083
3084 #[inline(always)]
3085 fn update_strength(&mut self, value: u64) {
3086 self.strength = self.strength.saturating_add(value);
3087 debug!("updated strength = {:?}", self.strength)
3088 }
3089
3090 #[inline(always)]
3096 pub fn mime_type(&self) -> &str {
3097 self.mime_type.as_deref().unwrap_or(match self.stream_kind {
3098 Some(StreamKind::Text(_)) => DEFAULT_TEXT_MIMETYPE,
3099 Some(StreamKind::Binary) | None => DEFAULT_BIN_MIMETYPE,
3100 })
3101 }
3102
3103 #[inline(always)]
3104 fn push_message<'a: 'm>(&mut self, msg: Cow<'a, str>) {
3105 if !msg.is_empty() {
3106 debug!("pushing message: msg={msg} len={}", msg.len());
3107 self.message.push(msg);
3108 }
3109 }
3110
3111 #[inline(always)]
3112 fn set_mime_type<'a: 'm>(&mut self, mime: Cow<'a, str>) {
3113 if self.mime_type.is_none() {
3114 debug!("insert mime: {:?}", mime);
3115 self.mime_type = Some(mime)
3116 }
3117 }
3118
3119 #[inline(always)]
3120 fn set_creator_code<'a: 'm>(&mut self, apple_ty: Cow<'a, str>) {
3121 if self.creator_code.is_none() {
3122 debug!("insert apple type: {apple_ty:?}");
3123 self.creator_code = Some(apple_ty)
3124 }
3125 }
3126
3127 #[inline(always)]
3128 fn insert_extensions<'a: 'm, I: Iterator<Item = &'a str>>(&mut self, exts: I) {
3129 if self.exts.is_empty() {
3130 self.exts.extend(exts.filter_map(|e| {
3131 if e.is_empty() {
3132 None
3133 } else {
3134 Some(Cow::Borrowed(e))
3135 }
3136 }));
3137 }
3138 }
3139
3140 #[inline(always)]
3148 pub fn strength(&self) -> u64 {
3149 self.strength
3150 }
3151
3152 #[inline(always)]
3158 pub fn source(&self) -> Option<&str> {
3159 self.source.as_deref()
3160 }
3161
3162 #[inline(always)]
3168 pub fn creator_code(&self) -> Option<&str> {
3169 self.creator_code.as_deref()
3170 }
3171
3172 #[inline(always)]
3178 pub fn extensions(&self) -> &HashSet<Cow<'m, str>> {
3179 &self.exts
3180 }
3181
3182 #[inline(always)]
3188 pub fn is_default(&self) -> bool {
3189 self.is_default
3190 }
3191}
3192
/// Database of magic rules together with the dependency rules they may need.
///
/// The type is serializable; see `MagicDb::serialize` / `MagicDb::deserialize`.
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct MagicDb {
    /// Identifier that will be given to the next loaded rule.
    rule_id: usize,
    /// Loaded rules; their order is recomputed when rules are finalized after a load.
    rules: Vec<MagicRule>,
    /// Dependency rules, keyed by a string (presumably the dependency name — confirm).
    dependencies: HashMap<String, DependencyRule>,
    /// Number of rules currently counted as finalized.
    finalized: usize,
}
3201
/// Heuristically tells whether `bytes` looks like text.
///
/// Returns `false` for empty input and as soon as a NUL byte is found.
/// Otherwise tab, line feed, carriage return and `0x20..=0x7E` count as
/// printable and every other byte counts as "other". The input is accepted
/// when more than 85% of the bytes are printable and less than 20% are
/// "other".
#[inline(always)]
fn is_likely_text(bytes: &[u8]) -> bool {
    /// Share of printable bytes that must be exceeded.
    const MIN_PRINTABLE_RATIO: f64 = 0.85;
    /// Share of non-printable bytes that must not be reached.
    const MAX_OTHER_RATIO: f64 = 0.20;

    if bytes.is_empty() {
        return false;
    }

    // Integer counters: exact, and converted to floating point only once
    // when the ratios are computed.
    let mut printable = 0usize;
    let mut other = 0usize;

    for &byte in bytes {
        match byte {
            // A NUL byte is taken as proof of binary content.
            0x00 => return false,
            b'\t' | b'\n' | b'\r' | 0x20..=0x7E => printable += 1,
            _ => other += 1,
        }
    }

    let total = bytes.len() as f64;
    let printable_ratio = printable as f64 / total;
    let other_ratio = other as f64 / total;

    printable_ratio > MIN_PRINTABLE_RATIO && other_ratio < MAX_OTHER_RATIO
}
3244
3245#[inline(always)]
3246fn guess_stream_kind<S: AsRef<[u8]>>(stream: S) -> StreamKind {
3247 let buf = stream.as_ref();
3248
3249 match run_utf8_validation(buf) {
3250 Ok(is_ascii) => {
3251 if is_ascii {
3252 StreamKind::Text(TextEncoding::Ascii)
3253 } else {
3254 StreamKind::Text(TextEncoding::Utf8)
3255 }
3256 }
3257 Err(e) => {
3258 if is_likely_text(&buf[e.valid_up_to..]) {
3259 StreamKind::Text(TextEncoding::Unknown)
3260 } else {
3261 StreamKind::Binary
3262 }
3263 }
3264 }
3265}
3266
3267impl MagicDb {
3268 pub fn new() -> Self {
3274 Self::default()
3275 }
3276
3277 #[inline(always)]
3278 fn next_rule_id(&mut self) -> usize {
3279 let t = self.rule_id;
3280 self.rule_id += 1;
3281 t
3282 }
3283
3284 #[inline(always)]
3285 fn try_json<D: DataRead>(
3286 haystack: &mut D,
3287 stream_kind: StreamKind,
3288 magic: &mut Magic,
3289 ) -> Result<bool, Error> {
3290 if matches!(stream_kind, StreamKind::Binary) {
3292 return Ok(false);
3293 }
3294
3295 let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?.trim_ascii();
3296
3297 let Some((start, end)) = find_json_boundaries(buf) else {
3298 return Ok(false);
3299 };
3300
3301 for c in buf[0..start].iter() {
3304 if !c.is_ascii_whitespace() {
3305 return Ok(false);
3306 }
3307 }
3308
3309 let mut is_ndjson = false;
3310
3311 trace!("maybe a json document");
3312 let ok = serde_json::from_slice::<serde_json::Value>(&buf[start..=end]).is_ok();
3313 if !ok {
3314 return Ok(false);
3315 }
3316
3317 if end + 1 < buf.len() {
3319 let buf = &buf[end + 1..];
3321 if let Some((second_start, second_end)) = find_json_boundaries(buf) {
3322 if memchr(b'\n', &buf[..second_start]).is_some() {
3324 trace!("might be ndjson");
3325 is_ndjson = serde_json::from_slice::<serde_json::Value>(
3326 &buf[second_start..=second_end],
3327 )
3328 .is_ok();
3329 }
3330 }
3331 }
3332
3333 if is_ndjson {
3334 magic.push_message(Cow::Borrowed("New Line Delimited"));
3335 magic.set_mime_type(Cow::Borrowed("application/x-ndjson"));
3336 magic.insert_extensions(["ndjson", "jsonl"].into_iter());
3337 } else {
3338 magic.set_mime_type(Cow::Borrowed("application/json"));
3339 magic.insert_extensions(["json"].into_iter());
3340 }
3341
3342 magic.push_message(Cow::Borrowed("JSON text data"));
3343 magic.set_source(Some(HARDCODED_SOURCE));
3344 magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3345 Ok(true)
3346 }
3347
3348 #[inline(always)]
3349 fn try_csv<D: DataRead>(
3350 haystack: &mut D,
3351 stream_kind: StreamKind,
3352 magic: &mut Magic,
3353 ) -> Result<bool, Error> {
3354 let StreamKind::Text(enc) = stream_kind else {
3356 return Ok(false);
3357 };
3358
3359 let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3360 let mut reader = csv::Reader::from_reader(io::Cursor::new(buf));
3361 let mut records = reader.records();
3362
3363 let Some(Ok(first)) = records.next() else {
3364 return Ok(false);
3365 };
3366
3367 if first.len() <= 1 {
3371 return Ok(false);
3372 }
3373
3374 let mut n = 1;
3376 for i in records.take(9) {
3377 if let Ok(rec) = i {
3378 if first.len() != rec.len() {
3379 return Ok(false);
3380 }
3381 } else {
3382 return Ok(false);
3383 }
3384 n += 1;
3385 }
3386
3387 if n != 10 {
3389 return Ok(false);
3390 }
3391
3392 magic.set_mime_type(Cow::Borrowed("text/csv"));
3393 magic.push_message(Cow::Borrowed("CSV"));
3394 magic.push_message(Cow::Borrowed(enc.as_magic_str()));
3395 magic.push_message(Cow::Borrowed("text"));
3396 magic.insert_extensions(["csv"].into_iter());
3397 magic.set_source(Some(HARDCODED_SOURCE));
3398 magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3399 Ok(true)
3400 }
3401
3402 #[inline(always)]
3403 fn try_tar<D: DataRead>(
3404 haystack: &mut D,
3405 stream_kind: StreamKind,
3406 magic: &mut Magic,
3407 ) -> Result<bool, Error> {
3408 if !matches!(stream_kind, StreamKind::Binary) {
3410 return Ok(false);
3411 }
3412
3413 let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3414 let mut ar = Archive::new(io::Cursor::new(buf));
3415
3416 let Ok(mut entries) = ar.entries() else {
3417 return Ok(false);
3418 };
3419
3420 let Some(Ok(first)) = entries.next() else {
3421 return Ok(false);
3422 };
3423
3424 let header = first.header();
3425
3426 if header.as_ustar().is_some() {
3427 magic.push_message(Cow::Borrowed("POSIX tar archive"));
3428 } else if header.as_gnu().is_some() {
3429 magic.push_message(Cow::Borrowed("POSIX tar archive (GNU)"));
3430 } else {
3431 magic.push_message(Cow::Borrowed("tar archive"));
3432 }
3433
3434 magic.set_mime_type(Cow::Borrowed("application/x-tar"));
3435 magic.set_source(Some(HARDCODED_SOURCE));
3436 magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3437 magic.insert_extensions(["tar"].into_iter());
3438 Ok(true)
3439 }
3440
3441 #[inline(always)]
3442 fn try_hard_magic<D: DataRead>(
3443 haystack: &mut D,
3444 stream_kind: StreamKind,
3445 magic: &mut Magic,
3446 ) -> Result<bool, Error> {
3447 Ok(Self::try_json(haystack, stream_kind, magic)?
3448 || Self::try_csv(haystack, stream_kind, magic)?
3449 || Self::try_tar(haystack, stream_kind, magic)?)
3450 }
3451
3452 #[inline(always)]
3453 fn magic_default<'m, D: DataRead>(
3454 cache: &mut D,
3455 stream_kind: StreamKind,
3456 magic: &mut Magic<'m>,
3457 ) {
3458 magic.set_source(Some(HARDCODED_SOURCE));
3459 magic.set_stream_kind(stream_kind);
3460 magic.is_default = true;
3461
3462 if cache.data_size() == 0 {
3463 magic.push_message(Cow::Borrowed("empty"));
3464 magic.set_mime_type(Cow::Borrowed(DEFAULT_BIN_MIMETYPE));
3465 }
3466
3467 match stream_kind {
3468 StreamKind::Binary => {
3469 magic.push_message(Cow::Borrowed("data"));
3470 }
3471 StreamKind::Text(e) => {
3472 magic.push_message(Cow::Borrowed(e.as_magic_str()));
3473 magic.push_message(Cow::Borrowed("text"));
3474 }
3475 }
3476 }
3477
3478 fn load_rules_no_prepare(&mut self, rules: Vec<MagicRule>) {
3479 for rule in rules.into_iter() {
3480 let mut rule = rule;
3481 rule.set_id(self.next_rule_id());
3482
3483 self.rules.push(rule);
3484 }
3485 }
3486
3487 pub fn load(&mut self, ms: MagicSource) -> &mut Self {
3493 self.load_rules_no_prepare(ms.rules);
3494 self.dependencies.extend(ms.dependencies);
3495 self.try_finalize();
3496 self
3497 }
3498
3499 pub fn load_bulk<I: Iterator<Item = MagicSource>>(&mut self, it: I) -> &mut Self {
3504 for ms in it {
3505 self.load_rules_no_prepare(ms.rules);
3506 self.dependencies.extend(ms.dependencies);
3507 }
3508 self.try_finalize();
3509 self
3510 }
3511
3512 pub fn rules(&self) -> &[MagicRule] {
3518 &self.rules
3519 }
3520
3521 #[inline]
3522 fn first_magic_with_stream_kind<D: DataRead>(
3523 &self,
3524 haystack: &mut D,
3525 stream_kind: StreamKind,
3526 extension: Option<&str>,
3527 ) -> Result<Magic<'_>, Error> {
3528 let mut magic = Magic::default();
3530
3531 if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3532 return Ok(magic);
3533 }
3534
3535 let mut marked = vec![false; self.rules.len()];
3536
3537 macro_rules! do_magic {
3538 ($rule: expr) => {{
3539 $rule.magic_entrypoint(&mut magic, stream_kind, haystack, &self, false, 0)?;
3540
3541 if !magic.message.is_empty() {
3542 magic.set_stream_kind(stream_kind);
3543 magic.set_source($rule.source.as_deref());
3544 return Ok(magic);
3545 }
3546
3547 magic.reset();
3548 }};
3549 }
3550
3551 if let Some(ext) = extension.map(|e| e.to_lowercase())
3552 && !ext.is_empty()
3553 {
3554 for rule in self.rules.iter().filter(|r| r.extensions.contains(&ext)) {
3555 do_magic!(rule);
3556 if let Some(f) = marked.get_mut(rule.id) {
3557 *f = true
3558 }
3559 }
3560 }
3561
3562 for rule in self
3563 .rules
3564 .iter()
3565 .filter(|r| !*marked.get(r.id).unwrap_or(&false))
3567 {
3568 do_magic!(rule)
3569 }
3570
3571 Self::magic_default(haystack, stream_kind, &mut magic);
3572
3573 Ok(magic)
3574 }
3575
3576 pub fn first_magic<R: DataRead>(
3604 &self,
3605 r: &mut R,
3606 extension: Option<&str>,
3607 ) -> Result<Magic<'_>, Error> {
3608 let stream_kind = guess_stream_kind(r.read_range(0..FILE_BYTES_MAX as u64)?);
3609 self.first_magic_with_stream_kind(r, stream_kind, extension)
3610 }
3611
3612 pub fn first_magic_file<P: AsRef<Path>>(&self, path: P) -> Result<Magic<'_>, Error> {
3622 let ext = path.as_ref().extension().and_then(|e| e.to_str());
3623 self.first_magic(&mut DataReader::from_file(File::open(path.as_ref())?)?, ext)
3624 }
3625
3626 pub fn first_magic_slice<S: AsRef<[u8]>>(
3634 &self,
3635 s: S,
3636 extension: Option<&str>,
3637 ) -> Result<Magic<'_>, Error> {
3638 self.first_magic(&mut DataReader::from_slice(s.as_ref()), extension)
3639 }
3640
3641 #[inline(always)]
3642 fn all_magics_sort_with_stream_kind<R: DataRead>(
3643 &self,
3644 haystack: &mut R,
3645 stream_kind: StreamKind,
3646 ) -> Result<Vec<Magic<'_>>, Error> {
3647 let mut out = Vec::new();
3648
3649 let mut magic = Magic::default();
3650
3651 if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3652 out.push(magic);
3653 magic = Magic::default();
3654 }
3655
3656 for rule in self.rules.iter() {
3657 rule.magic_entrypoint(&mut magic, stream_kind, haystack, self, false, 0)?;
3658
3659 if !magic.message.is_empty() {
3661 magic.set_stream_kind(stream_kind);
3662 magic.set_source(rule.source.as_deref());
3663 out.push(magic);
3664 magic = Magic::default();
3665 }
3666
3667 magic.reset();
3668 }
3669
3670 Self::magic_default(haystack, stream_kind, &mut magic);
3671 out.push(magic);
3672
3673 out.sort_by_key(|b| std::cmp::Reverse(b.strength()));
3674
3675 Ok(out)
3676 }
3677
3678 #[inline]
3693 pub fn all_magics<R: DataRead>(&self, r: &mut R) -> Result<Vec<Magic<'_>>, Error> {
3694 let stream_kind = guess_stream_kind(r.read_range(0..FILE_BYTES_MAX as u64)?);
3695 self.all_magics_sort_with_stream_kind(r, stream_kind)
3696 }
3697
3698 pub fn all_magics_file<P: AsRef<Path>>(&self, path: P) -> Result<Vec<Magic<'_>>, Error> {
3707 self.all_magics(&mut DataReader::from_file(File::open(path)?)?)
3708 }
3709
3710 pub fn all_magics_slice<S: AsRef<[u8]>>(&self, slice: S) -> Result<Vec<Magic<'_>>, Error> {
3719 self.all_magics(&mut DataReader::from_slice(slice.as_ref()))
3720 }
3721
3722 #[inline(always)]
3723 fn best_magic_with_stream_kind<R: DataRead>(
3724 &self,
3725 reader: &mut R,
3726 stream_kind: StreamKind,
3727 ) -> Result<Magic<'_>, Error> {
3728 let magics = self.all_magics_sort_with_stream_kind(reader, stream_kind)?;
3729
3730 Ok(magics.into_iter().next().unwrap_or_else(|| {
3733 let mut magic = Magic::default();
3734 Self::magic_default(reader, stream_kind, &mut magic);
3735 magic
3736 }))
3737 }
3738
3739 #[inline]
3754 pub fn best_magic<R: DataRead>(&self, r: &mut R) -> Result<Magic<'_>, Error> {
3755 let stream_kind = guess_stream_kind(r.read_range(0..FILE_BYTES_MAX as u64)?);
3756 self.best_magic_with_stream_kind(r, stream_kind)
3757 }
3758
3759 pub fn best_magic_file<P: AsRef<Path>>(&self, path: P) -> Result<Magic<'_>, Error> {
3768 self.best_magic(&mut DataReader::from_file(File::open(path)?)?)
3769 }
3770
3771 pub fn best_magic_slice<S: AsRef<[u8]>>(&self, slice: S) -> Result<Magic<'_>, Error> {
3780 self.best_magic(&mut DataReader::from_slice(slice.as_ref()))
3781 }
3782
3783 pub fn serialize<W: Write>(self, w: &mut W) -> Result<(), Error> {
3789 let mut encoder = GzEncoder::new(w, Compression::best());
3790
3791 bincode::serde::encode_into_std_write(&self, &mut encoder, bincode::config::standard())?;
3792 encoder.finish()?;
3793 Ok(())
3794 }
3795
3796 pub fn deserialize<R: Read>(r: &mut R) -> Result<Self, Error> {
3806 let mut buf = vec![];
3807 let mut gz = GzDecoder::new(r);
3808 gz.read_to_end(&mut buf).map_err(|e| {
3809 bincode::error::DecodeError::OtherString(format!("failed to read: {e}"))
3810 })?;
3811 let (sdb, _): (MagicDb, usize) =
3812 bincode::serde::decode_from_slice(&buf, bincode::config::standard())?;
3813 Ok(sdb)
3814 }
3815
3816 pub fn verify(&mut self) -> Result<(), Error> {
3823 if self.rules.len() == self.finalized {
3824 return Ok(());
3825 }
3826
3827 for r in self.rules.iter_mut().filter(|r| !r.finalized) {
3828 r.try_finalize(&self.dependencies).map_err(|e| {
3830 Error::Verify(
3831 r.source.clone().unwrap_or(String::from("unknown")),
3832 r.line(),
3833 e.into(),
3834 )
3835 })?;
3836 self.finalized += 1;
3837 }
3838
3839 debug_assert!(self.finalized <= self.rules.len());
3840
3841 Ok(())
3842 }
3843
3844 #[inline(always)]
3845 fn try_finalize(&mut self) {
3846 if self.rules.len() == self.finalized {
3847 return;
3848 }
3849
3850 let mut finalized = 0usize;
3851 self.rules.iter_mut().for_each(|r| {
3852 if r.try_finalize(&self.dependencies).is_ok() {
3853 finalized += 1;
3854 }
3855 });
3856
3857 self.finalized = finalized;
3858
3859 debug_assert!(self.finalized <= self.rules.len());
3860
3861 self.rules.sort_by_key(|r| (r.is_text(), -(r.score as i64)));
3863 }
3864}
3865
3866#[cfg(test)]
3867mod tests {
3868
3869 use regex::bytes::Regex;
3870
3871 use crate::{readers::BufReader, utils::unix_local_time_to_string};
3872
3873 use super::*;
3874
    // Builds a `BufReader` over the bytes of a string literal.
    macro_rules! buf_reader {
        ($l: literal) => {
            BufReader::from_slice($l.as_bytes())
        };
    }
3880
3881 fn first_magic(
3882 rule: &str,
3883 content: &[u8],
3884 stream_kind: StreamKind,
3885 ) -> Result<Magic<'static>, Error> {
3886 let mut md = MagicDb::new();
3887 md.load(
3888 FileMagicParser::parse_str(rule, None)
3889 .inspect_err(|e| eprintln!("{e}"))
3890 .unwrap(),
3891 );
3892 let mut reader = BufReader::from_slice(content);
3893 let v = md.best_magic_with_stream_kind(&mut reader, stream_kind)?;
3894 Ok(v.into_owned())
3895 }
3896
    // Debugging aid: installs a `tracing_subscriber` formatter at TRACE level.
    // `try_init` is used and its result is discarded.
    // Allowed to be unused because it is only invoked while debugging a test.
    #[allow(unused_macros)]
    macro_rules! enable_trace {
        () => {
            tracing_subscriber::fmt()
                .with_max_level(tracing_subscriber::filter::LevelFilter::TRACE)
                .try_init();
        };
    }
3906
    // Parses a rule literal and panics (after printing the error) when the
    // parser rejects it; evaluates to the parsed value.
    macro_rules! parse_assert {
        ($rule:literal) => {
            FileMagicParser::parse_str($rule, None)
                .inspect_err(|e| eprintln!("{e}"))
                .unwrap()
        };
    }
3914
    // Evaluates a rule against content treated as a binary stream.
    //
    // * Two arguments: evaluates to the result of `first_magic(..).unwrap()`.
    //   NOTE(review): this arm only fails when evaluation returns an error;
    //   it does not check that the result is not the default fallback.
    // * Three arguments: additionally asserts that the assembled message
    //   equals `$message`.
    macro_rules! assert_magic_match_bin {
        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Binary).unwrap() }};
        ($rule: literal, $content:literal, $message:expr) => {{
            assert_eq!(
                first_magic($rule, $content, StreamKind::Binary)
                    .unwrap()
                    .message(),
                $message
            );
        }};
    }
3926
    // Evaluates a rule against content treated as a UTF-8 text stream.
    //
    // * Two arguments: evaluates to the result of `first_magic(..).unwrap()`.
    //   NOTE(review): this arm only fails when evaluation returns an error;
    //   it does not check that the result is not the default fallback.
    // * Three arguments: additionally asserts that the assembled message
    //   equals `$message`.
    macro_rules! assert_magic_match_text {
        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8)).unwrap() }};
        ($rule: literal, $content:literal, $message:expr) => {{
            assert_eq!(
                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
                    .unwrap()
                    .message(),
                $message
            );
        }};
    }
3938
    // Asserts that evaluating the rule against content treated as a UTF-8
    // text stream yields the default fallback result.
    macro_rules! assert_magic_not_match_text {
        ($rule: literal, $content:literal) => {{
            assert!(
                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
                    .unwrap()
                    .is_default()
            );
        }};
    }
3948
    // Asserts that evaluating the rule against content treated as a binary
    // stream yields the default fallback result.
    macro_rules! assert_magic_not_match_bin {
        ($rule: literal, $content:literal) => {{
            assert!(
                first_magic($rule, $content, StreamKind::Binary)
                    .unwrap()
                    .is_default()
            );
        }};
    }
3958
    /// Exercises `regex` rules on text and binary content, including `>&0`
    /// continuations placed after a regex match with and without the `/s`
    /// modifier.
    #[test]
    fn test_regex() {
        // Shebang rule: the continuation regex supplies the text for `%s`.
        assert_magic_match_text!(
            r#"
0 regex/1024 \^#![[:space:]]*/usr/bin/env[[:space:]]+
!:mime text/x-shellscript
>&0 regex/64 .*($|\\b) %s shell script text executable
 "#,
            br#"#!/usr/bin/env bash
 echo hello world"#,
            "bash shell script text executable"
        );

        // Sanity check of the regex engine itself on non-UTF-8 bytes.
        let re = Regex::new(r"(?-u)\x42\x82").unwrap();
        assert!(re.is_match(b"\x42\x82"));

        // Raw byte pattern found past the start of binary content.
        assert_magic_match_bin!(
            r#"0 regex \x42\x82 binary regex match"#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82"
        );

        // Without `/s` the continuation is expected right after the match.
        assert_magic_match_bin!(
            r#"
 0 regex \x42\x82
 >&0 string \xde\xad\xbe\xef it works
 "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        // With `/s` the continuation is expected at the start of the match.
        assert_magic_match_bin!(
            r#"
 0 regex/s \x42\x82
 >&0 string \x42\x82\xde\xad\xbe\xef it works
 "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        // Anchored pattern whose target sits on a later line of the content.
        assert_magic_match_text!(
            r#"
0 regex/1024 \^HelloWorld$ HelloWorld String"#,
            br#"
// this is a comment after an empty line
HelloWorld
 "#
        );
    }
4008
    /// Exercises the `string` modifiers `/w`, `/C`, `/c`, `/f` and `/W` with
    /// one fixture expected to match and, where present, one expected not to.
    #[test]
    fn test_string_with_mods() {
        // `/w`: pattern with escaped spaces against the shebang content.
        assert_magic_match_text!(
            r#"0 string/w #!\ \ \ /usr/bin/env\ bash BASH
 "#,
            b"#! /usr/bin/env bash i
 echo hello world"
        );

        // `/C`: the all-lowercase content is expected to match...
        assert_magic_match_text!(
            r#"0 string/C HelloWorld it works
 "#,
            b"helloworld"
        );

        // ...but the case-inverted content is not.
        assert_magic_not_match_text!(
            r#"0 string/C HelloWorld it works
 "#,
            b"hELLOwORLD"
        );

        // `/c`: the all-uppercase content is expected to match...
        assert_magic_match_text!(
            r#"0 string/c HelloWorld it works
 "#,
            b"HELLOWORLD"
        );

        // ...but the all-lowercase content is not.
        assert_magic_not_match_text!(
            r#"0 string/c HelloWorld it works
 "#,
            b"helloworld"
        );

        // `/f`: exact content is expected to match...
        assert_magic_match_text!(
            r#"0 string/f #!/usr/bin/env\ bash BASH
 "#,
            b"#!/usr/bin/env bash"
        );

        // ...but a pattern that is only a prefix of a longer word is not.
        assert_magic_not_match_text!(
            r#"0 string/f #!/usr/bin/python PYTHON"#,
            b"#!/usr/bin/pythonic"
        );

        // `/W`: same number of spaces in pattern and content matches...
        assert_magic_match_text!(
            r#"0 string/W #!/usr/bin/env\ python PYTHON"#,
            b"#!/usr/bin/env python"
        );

        // ...but a pattern with more spaces than the content does not.
        assert_magic_not_match_text!(
            r#"0 string/W #!/usr/bin/env\ \ python PYTHON"#,
            b"#!/usr/bin/env python"
        );
    }
4067
    /// Exercises `search` rules with modifiers and checks where a `>&0`
    /// continuation lands after a search, with and without `/s`.
    #[test]
    fn test_search_with_mods() {
        // Combined range and `fwt` modifiers.
        assert_magic_match_text!(
            r#"0 search/1/fwt #!\ /usr/bin/luatex LuaTex script text executable"#,
            b"#! /usr/bin/luatex "
        );

        // With `/s` the continuation can re-read the searched string.
        assert_magic_match_text!(
            r#"
 0 search/s /usr/bin/env
 >&0 string /usr/bin/env it works
 "#,
            b"#!/usr/bin/env python"
        );

        // Without `/s` the same continuation is expected not to match.
        assert_magic_not_match_text!(
            r#"
 0 search /usr/bin/env
 >&0 string /usr/bin/env it works
 "#,
            b"#!/usr/bin/env python"
        );
    }
4092
    /// Exercises `pstring` (length-prefixed string) rules with 1-byte,
    /// 2-byte (`H`/`h`) and 4-byte (`L`/`l`) length prefixes in both byte
    /// orders. In the `J` fixtures the stored length also counts the prefix
    /// bytes (7 = 2 + 5, 9 = 4 + 5).
    #[test]
    fn test_pstring() {
        assert_magic_match_bin!(r#"0 pstring Toast it works"#, b"\x05Toast");

        assert_magic_match_bin!(r#"0 pstring Toast %s"#, b"\x05Toast", "Toast");

        assert_magic_not_match_bin!(r#"0 pstring Toast Doesn't work"#, b"\x07Toaster");

        assert_magic_match_bin!(r#"0 pstring/H Toast it works"#, b"\x00\x05Toast");

        assert_magic_match_bin!(r#"0 pstring/HJ Toast it works"#, b"\x00\x07Toast");

        assert_magic_match_bin!(r#"0 pstring/HJ Toast %s"#, b"\x00\x07Toast", "Toast");

        assert_magic_match_bin!(r#"0 pstring/h Toast it works"#, b"\x05\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/hJ Toast it works"#, b"\x07\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/L Toast it works"#, b"\x00\x00\x00\x05Toast");

        assert_magic_match_bin!(r#"0 pstring/LJ Toast it works"#, b"\x00\x00\x00\x09Toast");

        assert_magic_match_bin!(r#"0 pstring/l Toast it works"#, b"\x05\x00\x00\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/lJ Toast it works"#, b"\x09\x00\x00\x00Toast");
    }
4120
    /// An `indirect` rule at offset 0 must fail with
    /// `Error::MaximumRecursion(MAX_RECURSION)` instead of recursing forever.
    #[test]
    fn test_max_recursion() {
        let res = first_magic(
            r#"0 indirect x"#,
            b"#! /usr/bin/luatex ",
            StreamKind::Binary,
        );
        assert!(res.is_err());
        // `res` is known to be an error here, so the closure always runs.
        let _ = res.inspect_err(|e| {
            assert!(matches!(
                e.unwrap_localized(),
                Error::MaximumRecursion(MAX_RECURSION)
            ))
        });
    }
4136
    /// Exercises the comparison operators accepted by `string` rules:
    /// plain equality, `!`, `>` and `<`.
    #[test]
    fn test_string_ops() {
        assert_magic_match_text!("0 string/b MZ MZ File", b"MZ\0");
        assert_magic_match_text!("0 string !MZ Not MZ File", b"AZ\0");
        assert_magic_match_text!("0 string >\0 Any String", b"A\0");
        assert_magic_match_text!("0 string >Test Any String", b"Test 1\0");
        assert_magic_match_text!("0 string <Test Any String", b"\0");
        assert_magic_not_match_text!("0 string >Test Any String", b"\0");
    }
4146
    /// Exercises `lestring16` (little-endian 16-bit string) rules: a match,
    /// `%s` formatting, a byte-swapped fixture that must not match, and a
    /// non-zero offset.
    #[test]
    fn test_lestring16() {
        assert_magic_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        assert_magic_match_bin!(
            "0 lestring16 x %s",
            b"\x61\x00\x62\x00\x63\x00\x64\x00\x00",
            "abcd"
        );
        assert_magic_not_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        assert_magic_match_bin!(
            "4 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64\x00"
        );
    }
4167
    /// Exercises `bestring16` (big-endian 16-bit string) rules: a match,
    /// `%s` formatting, a byte-swapped fixture that must not match, and a
    /// non-zero offset.
    #[test]
    fn test_bestring16() {
        assert_magic_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        assert_magic_match_bin!(
            "0 bestring16 x %s",
            b"\x00\x61\x00\x62\x00\x63\x00\x64",
            "abcd"
        );
        assert_magic_not_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        assert_magic_match_bin!(
            "4 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64"
        );
    }
4188
    /// Negative offsets are counted from the end of the data: `-1` is the
    /// last byte, `-2` the one before it.
    #[test]
    fn test_offset_from_end() {
        assert_magic_match_bin!("-1 ubyte 0x42 last byte ok", b"\x00\x00\x42");
        assert_magic_match_bin!("-2 ubyte 0x41 last byte ok", b"\x00\x41\x00");
    }
4194
    /// Chains two `&0` relative offsets: each continuation reads the byte
    /// that follows the one matched by its parent.
    #[test]
    fn test_relative_offset() {
        assert_magic_match_bin!(
            "
 0 ubyte 0x42
 >&0 ubyte 0x00
 >>&0 ubyte 0x41 third byte ok
 ",
            b"\x42\x00\x41\x00"
        );
    }
4206
    /// Exercises indirect offsets: the offset is read from the data
    /// (`(0.l)`), optionally adjusted by a constant (`+3`) or by another
    /// value read from the data (`+(4)`).
    #[test]
    fn test_indirect_offset() {
        assert_magic_match_bin!("(0.l) ubyte 0x42 it works", b"\x04\x00\x00\x00\x42");
        assert_magic_match_bin!("(0.l+3) ubyte 0x42 it works", b"\x01\x00\x00\x00\x42");
        assert_magic_match_bin!(
            "(0.l+(4)) ubyte 0x42 it works",
            b"\x04\x00\x00\x00\x04\x00\x00\x00\x42"
        );
    }
4218
    /// A `use` line carrying its own message: that message comes first,
    /// followed by the message of the named rule it refers to.
    #[test]
    fn test_use_with_message() {
        assert_magic_match_bin!(
            r#"
0 string MZ
>0 use mz first match

0 name mz then second match
>0 string MZ
"#,
            b"MZ\0",
            "first match then second match"
        );
    }
4233
    /// Exercises arithmetic and bitwise transforms applied to a scalar before
    /// comparison, and checks that `%0` and `/0` are rejected by the parser.
    #[test]
    fn test_scalar_transform() {
        assert_magic_match_bin!("0 ubyte+1 0x1 add works", b"\x00");
        assert_magic_match_bin!("0 ubyte-1 0xfe sub works", b"\xff");
        assert_magic_match_bin!("0 ubyte%2 0 mod works", b"\x0a");
        assert_magic_match_bin!("0 ubyte&0x0f 0x0f bitand works", b"\xff");
        assert_magic_match_bin!("0 ubyte|0x0f 0xff bitor works", b"\xf0");
        assert_magic_match_bin!("0 ubyte^0x0f 0xf0 bitxor works", b"\xff");

        // Division and modulo by a literal zero must fail at parse time.
        FileMagicParser::parse_str("0 ubyte%0 mod by zero", None)
            .expect_err("expect div by zero error");
        FileMagicParser::parse_str("0 ubyte/0 div by zero", None)
            .expect_err("expect div by zero error");
    }
4248
    /// Exercises `belong` (big-endian 32-bit) rules with every comparison
    /// operator used in the fixtures: equality, `<`, `>`, `&`, `^`, `~` and
    /// the match-anything `x`.
    #[test]
    fn test_belong() {
        // Equality, byte order, and a non-zero offset.
        assert_magic_match_bin!("0 belong 0x12345678 Big-endian long", b"\x12\x34\x56\x78");
        assert_magic_not_match_bin!("0 belong 0x12345678 Big-endian long", b"\x78\x56\x34\x12");
        assert_magic_match_bin!(
            "4 belong 0x12345678 Big-endian long",
            b"\x00\x00\x00\x00\x12\x34\x56\x78"
        );
        // Strictly less than.
        assert_magic_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x77");
        assert_magic_not_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // Strictly greater than.
        assert_magic_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x79");
        assert_magic_not_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // `&` operator.
        assert_magic_match_bin!("0 belong &0x5678 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong &0x0000FFFF Big-endian long", b"\x12\x34\x56\x78");

        // `^` operator.
        assert_magic_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x01\x56\x78");

        // `~` operator: the fixture is the bitwise complement of the value.
        assert_magic_match_bin!("0 belong ~0x12345678 Big-endian long", b"\xed\xcb\xa9\x87");
        assert_magic_not_match_bin!("0 belong ~0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // `x` accepts any value.
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x12\x34\x56\x78");
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x78\x56\x34\x12");
    }
4284
    /// The parser accepts `search` without modifiers and with the range and
    /// flag modifiers given in either order.
    #[test]
    fn test_parse_search() {
        parse_assert!("0 search test");
        parse_assert!("0 search/24/s test");
        parse_assert!("0 search/s/24 test");
    }
4291
    /// Exercises `bedate` (big-endian 32-bit timestamp) rules: a match, a
    /// non-match, and `%s` formatting. `0x386D4380` is 946684800.
    #[test]
    fn test_bedate() {
        assert_magic_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 bedate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
    /// Exercises `beldate` rules (big-endian 32-bit timestamp); the expected
    /// text comes from `unix_local_time_to_string`.
    #[test]
    fn test_beldate() {
        assert_magic_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 beldate 946684800 {}",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            unix_local_time_to_string(946684800)
        );
    }
4325
    /// Exercises `beqdate` (big-endian 64-bit timestamp) rules: a match, a
    /// non-match, and `%s` formatting.
    #[test]
    fn test_beqdate() {
        assert_magic_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80"
        );

        assert_magic_not_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "0 beqdate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
4344
    /// Exercises `medate` (middle-endian 32-bit timestamp) rules: the
    /// fixture stores 0x386D4380 as the bytes `6D 38 80 43`.
    #[test]
    fn test_medate() {
        assert_magic_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );

        assert_magic_not_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 medate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            "2000-01-01 00:00:00"
        );
    }
4363
    /// Exercises `meldate` rules (middle-endian 32-bit timestamp); the
    /// expected text comes from `unix_local_time_to_string`.
    #[test]
    fn test_meldate() {
        assert_magic_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );
        assert_magic_not_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 meldate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            unix_local_time_to_string(946684800)
        );
    }
4381
    /// Exercises `date` rules (32-bit timestamp without an explicit byte
    /// order).
    ///
    /// NOTE(review): the fixtures store the value in little-endian byte
    /// order, so this presumably assumes a little-endian host; confirm.
    #[test]
    fn test_date() {
        assert_magic_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 date 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            "2000-01-01 00:00:00"
        );
    }
4398
    /// Exercises `leldate` rules (little-endian 32-bit timestamp); the
    /// expected text comes from `unix_local_time_to_string`.
    #[test]
    fn test_leldate() {
        assert_magic_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 leldate 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            unix_local_time_to_string(946684800)
        );
    }
4415
    /// Exercises `leqdate` (little-endian 64-bit timestamp) rules: a match,
    /// a non-match, and `%s` formatting at a non-zero offset.
    /// `0x5E0BE100` is 1577836800.
    #[test]
    fn test_leqdate() {
        assert_magic_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
        );

        assert_magic_not_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "8 leqdate 1577836800 %s",
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
            "2020-01-01 00:00:00"
        );
    }
4433
/// Exercises the `leqldate` type (little-endian 8-byte local-time date).
#[test]
fn test_leqldate() {
    // 1577836800 == 0x5E0BE100, stored least-significant byte first on 8 bytes.
    assert_magic_match_bin!(
        "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
        b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
    );

    // Zeroed data must be rejected.
    assert_magic_not_match_bin!(
        "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );

    // The rendered text is compared against the local-time helper.
    assert_magic_match_bin!(
        "8 leqldate 1577836800 %s",
        b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
        unix_local_time_to_string(1577836800)
    );
}
4451
/// Runs the `melong` type through each comparison operator of the test column.
#[test]
fn test_melong() {
    // `=`: bytes `34 12 78 56` are expected to read as 0x12345678.
    assert_magic_match_bin!(
        "0 melong =0x12345678 Middle-endian long",
        b"\x34\x12\x78\x56"
    );
    assert_magic_not_match_bin!(
        "0 melong =0x12345678 Middle-endian long",
        b"\x00\x00\x00\x00"
    );

    // `<`: last byte lowered to 0x55 gives a smaller value; equality fails.
    assert_magic_match_bin!(
        "0 melong <0x12345678 Middle-endian long",
        b"\x34\x12\x78\x55"
    );
    assert_magic_not_match_bin!(
        "0 melong <0x12345678 Middle-endian long",
        b"\x34\x12\x78\x56"
    );

    // `>`: last byte raised to 0x57 gives a larger value; equality fails.
    assert_magic_match_bin!(
        "0 melong >0x12345678 Middle-endian long",
        b"\x34\x12\x78\x57"
    );
    assert_magic_not_match_bin!(
        "0 melong >0x12345678 Middle-endian long",
        b"\x34\x12\x78\x56"
    );

    // `&`: every bit of the mask has to be set in the data.
    assert_magic_match_bin!(
        "0 melong &0x5678 Middle-endian long",
        b"\xab\xcd\x78\x56"
    );
    assert_magic_not_match_bin!(
        "0 melong &0x0000FFFF Middle-endian long",
        b"\x34\x12\x78\x56"
    );

    // `^`: every bit of the mask has to be clear in the data.
    assert_magic_match_bin!(
        "0 melong ^0xFFFF0000 Middle-endian long",
        b"\x00\x00\x78\x56"
    );
    assert_magic_not_match_bin!(
        "0 melong ^0xFFFF0000 Middle-endian long",
        b"\x00\x01\x78\x56"
    );

    // `~`: the data holds the bitwise complement of 0x12345678 (0xEDCBA987).
    assert_magic_match_bin!(
        "0 melong ~0x12345678 Middle-endian long",
        b"\xCB\xED\x87\xA9"
    );
    assert_magic_not_match_bin!(
        "0 melong ~0x12345678 Middle-endian long",
        b"\x34\x12\x78\x56"
    );

    // `x`: any value is accepted, including zero.
    assert_magic_match_bin!("0 melong x Middle-endian long", b"\x34\x12\x78\x56");
    assert_magic_match_bin!("0 melong x Middle-endian long", b"\x00\x00\x00\x00");
}
4515
/// Runs the `uquad` type through each comparison operator of the test column.
#[test]
fn test_uquad() {
    // `=`: 0x123456789ABCDEF0 stored least-significant byte first.
    assert_magic_match_bin!(
        "0 uquad =0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );
    assert_magic_not_match_bin!(
        "0 uquad =0x123456789ABCDEF0 Unsigned quad",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );

    // `<`: most significant byte lowered to 0x11; equality fails.
    assert_magic_match_bin!(
        "0 uquad <0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x11"
    );
    assert_magic_not_match_bin!(
        "0 uquad <0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `>`: most significant byte raised to 0x13; equality fails.
    assert_magic_match_bin!(
        "0 uquad >0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x13"
    );
    assert_magic_not_match_bin!(
        "0 uquad >0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `&`: the low byte is 0xF0, so mask 0xF0 is covered but 0xFF is not.
    assert_magic_match_bin!(
        "0 uquad &0xF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );
    assert_magic_not_match_bin!(
        "0 uquad &0xFF Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `^`: with a full mask only an all-zero value has every masked bit clear.
    assert_magic_match_bin!(
        "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );
    assert_magic_not_match_bin!(
        "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `~`: the data holds the bitwise complement (0xEDCBA9876543210F).
    assert_magic_match_bin!(
        "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
        b"\x0F\x21\x43\x65\x87\xA9\xCB\xED"
    );
    assert_magic_not_match_bin!(
        "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `x`: any value is accepted; the first case also checks `{:#x}` rendering.
    assert_magic_match_bin!(
        "0 uquad x {:#x}",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12",
        "0x123456789abcdef0"
    );
    assert_magic_match_bin!(
        "0 uquad x Unsigned quad",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );
}
4589
/// Exercises the `guid` type: literal comparison and `%s` rendering.
#[test]
fn test_guid() {
    // The 16 data bytes appear in the same order as the hex digits of the
    // textual GUID used in the rule.
    assert_magic_match_bin!(
        "0 guid EC959539-6786-2D4E-8FDB-98814CE76C1E It works",
        b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E"
    );

    // Unrelated bytes must not satisfy a GUID rule.
    assert_magic_not_match_bin!(
        "0 guid 399595EC-8667-4E2D-8FDB-98814CE76C1E It works",
        b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
    );

    // With the `x` test the value is printed back in canonical text form.
    assert_magic_match_bin!(
        "0 guid x %s",
        b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E",
        "EC959539-6786-2D4E-8FDB-98814CE76C1E"
    );
}
4608
/// Exercises the `ubeqdate` type (unsigned big-endian 8-byte date).
#[test]
fn test_ubeqdate() {
    // 1633046400 == 0x61564F80, stored most-significant byte first on 8 bytes.
    assert_magic_match_bin!(
        "0 ubeqdate 1633046400 It works",
        b"\x00\x00\x00\x00\x61\x56\x4f\x80"
    );

    // With the `x` test the timestamp is rendered through `%s`.
    assert_magic_match_bin!(
        "0 ubeqdate x %s",
        b"\x00\x00\x00\x00\x61\x56\x4f\x80",
        "2021-10-01 00:00:00"
    );

    // Zeroed data must be rejected.
    assert_magic_not_match_bin!(
        "0 ubeqdate 1633046400 It should not work",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );
}
4627
/// Exercises the `ldate` type (4-byte local-time date).
#[test]
fn test_ldate() {
    // `60 D4 C8 61` is 1640551520 (0x61C8D460) least-significant byte first.
    assert_magic_match_bin!(
        "0 ldate 1640551520 It works",
        b"\x60\xd4\xC8\x61"
    );

    // Zeroed data must be rejected.
    assert_magic_not_match_bin!(
        "0 ldate 1633046400 It should not work",
        b"\x00\x00\x00\x00"
    );

    // The rendered text is compared against the local-time helper.
    assert_magic_match_bin!(
        "0 ldate x %s",
        b"\x60\xd4\xC8\x61",
        unix_local_time_to_string(1640551520)
    );
}
4640
/// Checks arithmetic transforms attached to an integer type (`ubyte/10`, `ubyte%10`).
#[test]
fn test_scalar_with_transform() {
    // The data byte is 0x14 (20): 20 / 10 == 2, both as test value and output.
    assert_magic_match_bin!("0 ubyte/10 2 {}", b"\x14", "2");
    assert_magic_match_bin!("0 ubyte/10 x {}", b"\x14", "2");

    // 20 % 10 == 0.
    assert_magic_match_bin!("0 ubyte%10 x {}", b"\x14", "0");
}
4647
/// Checks arithmetic transforms attached to a float type (`lefloat/10`, `lefloat%10`).
#[test]
fn test_float_with_transform() {
    // `00 00 A0 41` is the little-endian IEEE-754 encoding of 20.0: 20 / 10 == 2.
    assert_magic_match_bin!("0 lefloat/10 2 {}", b"\x00\x00\xa0\x41", "2");
    assert_magic_match_bin!("0 lefloat/10 x {}", b"\x00\x00\xa0\x41", "2");

    // 20 % 10 == 0.
    assert_magic_match_bin!("0 lefloat%10 x {}", b"\x00\x00\xa0\x41", "0");
}
4654
/// Covers `read_octal_u64` on well-formed, partially valid and invalid inputs.
#[test]
fn test_read_octal() {
    // Shorthand: run the parser on a fresh reader built from a literal.
    macro_rules! octal {
        ($text:literal) => {
            read_octal_u64(&mut buf_reader!($text))
        };
    }

    // Well-formed octal numbers, all introduced by a leading `0`.
    assert_eq!(octal!("0"), Some(0));
    assert_eq!(octal!("00"), Some(0));
    assert_eq!(octal!("01"), Some(1));
    assert_eq!(octal!("07"), Some(7));
    assert_eq!(octal!("010"), Some(8));
    assert_eq!(octal!("0123"), Some(83));
    assert_eq!(octal!("0755"), Some(493));

    // Trailing non-octal characters: only the octal prefix is returned.
    assert_eq!(octal!("0ABC"), Some(0));
    assert_eq!(octal!("01ABC"), Some(1));
    assert_eq!(octal!("0755ABC"), Some(493));
    assert_eq!(octal!("0123ABC"), Some(83));

    // The digit `8` is not octal and ends the number like any other character.
    assert_eq!(octal!("08"), Some(0));
    assert_eq!(octal!("01238"), Some(83));

    // Without the leading `0` nothing is parsed.
    assert_eq!(octal!("123"), None);
    assert_eq!(octal!("755"), None);

    // Empty input.
    assert_eq!(octal!(""), None);

    // Input that does not start with `0`.
    assert_eq!(octal!("ABC"), None);
    assert_eq!(octal!("8ABC"), None);

    // Larger value: 0o1777777777 == 2^28 - 1.
    assert_eq!(octal!("01777777777"), Some(268435455));
}
4693
/// Regression test: indirect offsets (`(N.b)`) used across nested `use`
/// calls must still lead to the expected message.
#[test]
fn test_offset_bug_1() {
    // The closing quote of the rule text keeps its single leading space so the
    // raw string stays unchanged.
    assert_magic_match_bin!(
        r"
1 string TEST Bread is
# offset computation is relative to
# rule start
>(5.b) use toasted

0 name toasted
>0 string twice Toasted
>>0 use toasted_twice

0 name toasted_twice
>(6.b) string x %s
 ",
        b"\x00TEST\x06twice\x00\x06",
        "Bread is Toasted twice"
    );
}
4716
/// Regression test: same rule chain as the other offset tests, but the top
/// level rule is anchored with a negative offset (`-12`).
///
/// The innermost rule has to print the string it matched, so its message is
/// `%s`; a bare `%` carries no conversion and could never yield the trailing
/// "twice" of the expected message.
#[test]
fn test_offset_bug_2() {
    // The closing quote of the rule text keeps its single leading space so the
    // rest of the raw string stays unchanged.
    assert_magic_match_bin!(
        r"
-12 string TEST Bread is
>(4.b) use toasted

0 name toasted
>0 string twice Toasted
>>0 use toasted_twice

0 name toasted_twice
>(6.b) string x %s
 ",
        b"\x00TEST\x06twice\x00\x06",
        "Bread is Toasted twice"
    );
}
4742
/// Regression test: `indirect/r` at an indirect offset, followed by a `use`
/// of a named rule, must produce the full message.
#[test]
fn test_offset_bug_3() {
    // The closing quote of the rule text keeps its single leading space so the
    // raw string stays unchanged.
    assert_magic_match_bin!(
        r"
1 string TEST Bread is
>(5.b) indirect/r x

0 string twice Toasted
>0 use toasted_twice

0 name toasted_twice
>0 string x %s
 ",
        b"\x00TEST\x06twice\x00\x08",
        "Bread is Toasted twice"
    );
}
4762
/// Regression test: base-offset handling when `indirect/r` is followed by a
/// `use` placed at an indirect offset (see the comments embedded in the rules).
#[test]
fn test_offset_bug_4() {
    // The closing quote of the rule text keeps its single leading space so the
    // raw string stays unchanged.
    assert_magic_match_bin!(
        r"
1 string Bread %s
>(6.b) indirect/r x

# this one uses a based offset
# computed at indirection
1 string is\ Toasted %s
>(11.b) use toasted_twice

# this one is using a new base
# offset being previous base
# offset + offset of use
0 name toasted_twice
>0 string x %s
 ",
        b"\x00Bread\x06is Toasted\x0ctwice\x00",
        "Bread is Toasted twice"
    );
}
4787
/// Regression test: a relative offset (`&1`) inside a named rule reached
/// through `indirect/r` and `use`.
#[test]
fn test_offset_bug_5() {
    // The closing quote of the rule text keeps its single leading space so the
    // raw string stays unchanged.
    assert_magic_match_bin!(
        r"
1 string TEST Bread is
>(5.b) indirect/r x

0 string twice Toasted
>0 use toasted_twice

0 name toasted_twice
>0 string twice
>>&1 byte 0x08 twice
 ",
        b"\x00TEST\x06twice\x00\x08",
        "Bread is Toasted twice"
    );
}
4806
/// Regression test: a `default` entry placed under a `use` line.
///
/// The expected message is only "Bread is toasted", i.e. the text of the
/// `default` entry ("but not burnt") must not appear in the result.
#[test]
fn test_bug_6() {
    // The closing quote of the rule text keeps its single leading space so the
    // raw string stays unchanged.
    assert_magic_match_bin!(
        r"
1 string TEST Bread is toasted
>&0 use toasted
>>&0 default x but not burnt

0 name toasted
>1 string toasted
 ",
        b"\x00TEST\x06toasted",
        "Bread is toasted"
    );
}
4825
/// Regression test: two chained `use` calls, each placed at an indirect
/// offset, with the final string read at offset 1 of the innermost rule.
#[test]
fn test_offset_bug_7() {
    // The closing quote of the rule text keeps its single leading space so the
    // raw string stays unchanged.
    assert_magic_match_bin!(
        r"
1 string TEST Bread is
# offset computation is relative to
# rule start
>(5.b) use toasted

0 name toasted
>0 string toast Toasted
>>(6.b) use toasted_twice

0 name toasted_twice
>1 string x %s
 ",
        b"\x00TEST\x06toast\x00\x06twice\x00",
        "Bread is Toasted twice"
    );
}
4849
/// Checks that one of the parts yielded by `message_parts()` equals
/// "python" when compared case-insensitively.
#[test]
fn test_message_parts() {
    // The rule prints `PYTHON` for an ASCII text stream starting with a
    // python shebang.
    let magic = first_magic(
        r#"0 string/W #!/usr/bin/env\ python PYTHON"#,
        b"#!/usr/bin/env python",
        StreamKind::Text(TextEncoding::Ascii),
    )
    .unwrap();

    assert!(
        magic
            .message_parts()
            .any(|part| part.eq_ignore_ascii_case("python"))
    );
}
4861
/// Loads several parsed rule sets at once and expects verification to pass.
#[test]
fn test_load_bulk() {
    let mut database = MagicDb::new();

    // Three spellings of a `search` rule, with and without modifiers.
    database.load_bulk(
        vec![
            parse_assert!("0 search test"),
            parse_assert!("0 search/24/s test"),
            parse_assert!("0 search/s/24 test"),
        ]
        .into_iter(),
    );

    database.verify().unwrap();
}
4875
/// Loads a rule set that `use`s a name no loaded rule declares and expects
/// `verify` to report `Error::Verify`.
#[test]
fn test_load_bulk_failure() {
    let mut database = MagicDb::new();

    // `use test` has no matching `name test` entry anywhere in the input.
    database.load_bulk(
        vec![parse_assert!(
            r#"
0 search/s/24 test
>0 use test
"#
        )]
        .into_iter(),
    );

    let outcome = database.verify();
    assert!(matches!(outcome, Err(Error::Verify(_, _, _))));
}
4890}