1#![forbid(unsafe_code)]
2#![deny(unused_imports)]
3#![deny(missing_docs)]
4use dyf::{DynDisplay, FormatString, dformat};
144use flagset::{FlagSet, flags};
145use flate2::{Compression, read::GzDecoder, write::GzEncoder};
146use memchr::memchr;
147use pest::{Span, error::ErrorVariant};
148use regex::bytes::{self};
149use serde::{Deserialize, Serialize};
150use std::{
151 borrow::Cow,
152 cmp::max,
153 collections::{HashMap, HashSet},
154 fmt::{self, Debug, Display},
155 fs::File,
156 io::{self, Read, SeekFrom, Write},
157 ops::{Add, BitAnd, BitOr, BitXor, Deref, Div, Mul, Rem, Sub},
158 path::Path,
159};
160use tar::Archive;
161use thiserror::Error;
162use tracing::{Level, debug, enabled, trace};
163
164use crate::{
165 numeric::{Float, FloatDataType, Scalar, ScalarDataType},
166 parser::{FileMagicParser, Rule},
167 readers::DataRead,
168 utils::{
169 debug_string_from_vec_u8, debug_string_from_vec_u16, decode_id3, find_json_boundaries,
170 run_utf8_validation,
171 },
172};
173
174mod numeric;
175mod parser;
176pub mod readers;
177pub use readers::DataReader;
178mod utils;
179
// Strength value attached to hardcoded rules.
// NOTE(review): the use site is outside this section — presumably used to rank
// built-in detections against rules parsed from magic files; confirm.
const HARDCODED_MAGIC_STRENGTH: u64 = 2048;
// Source label for hardcoded rules.
// NOTE(review): presumably reported where a magic file name would otherwise appear — confirm.
const HARDCODED_SOURCE: &str = "hardcoded";
// Recursion bound.
// NOTE(review): presumably the depth at which `Error::MaximumRecursion` is raised — confirm.
const MAX_RECURSION: usize = 50;
// Number of bytes read for a regex test that carries no explicit length.
const FILE_REGEX_MAX: usize = 8192;

/// Upper bound, in bytes (7 MiB), on how much input a single search test or
/// variable-length string test reads.
pub const FILE_BYTES_MAX: usize = 7 * 1024 * 1024;
/// MIME type string for generic binary content (`application/octet-stream`).
pub const DEFAULT_BIN_MIMETYPE: &str = "application/octet-stream";
/// MIME type string for generic text content (`text/plain`).
pub const DEFAULT_TEXT_MIMETYPE: &str = "text/plain";

/// Timestamp rendering pattern (year-month-day hour:minute:second).
// NOTE(review): looks like a `strftime`-style pattern; the formatter consuming it
// is not visible in this section.
pub(crate) const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
199
/// Panics with the given `panic!`-style arguments, but only when the crate is
/// compiled with debug assertions. In other builds the macro does nothing and
/// execution continues with the statement that follows the invocation.
macro_rules! debug_panic {
    ($($arg:tt)*) => {
        // `cfg!` keeps the arguments type-checked in every build profile.
        if cfg!(debug_assertions) {
            panic!($($arg)*);
        }
    };
}
207
/// Reads exactly `size_of::<$ty>()` bytes from `$r` (through its
/// `read_exact_into` method) and evaluates to them as a fixed-size byte array.
///
/// Uses `?`, so it can only be expanded inside a function whose error type
/// accepts the error returned by `read_exact_into`.
macro_rules! read {
    ($r: expr, $ty: ty) => {{
        let mut a = [0u8; std::mem::size_of::<$ty>()];
        $r.read_exact_into(&mut a)?;
        a
    }};
}
215
/// Reads a value of type `$ty` from `$r`, decoding the bytes as little-endian.
/// Propagates read failures with `?` (see `read!`).
macro_rules! read_le {
    ($r:expr, $ty: ty ) => {{ <$ty>::from_le_bytes(read!($r, $ty)) }};
}
219
/// Reads a value of type `$ty` from `$r`, decoding the bytes as big-endian.
/// Propagates read failures with `?` (see `read!`).
macro_rules! read_be {
    ($r:expr, $ty: ty ) => {{ <$ty>::from_be_bytes(read!($r, $ty)) }};
}
223
/// Reads two consecutive little-endian `u16` words from `$r` and combines them
/// into an `i32`: the first word read becomes the upper 16 bits, the second the
/// lower 16 bits. Propagates read failures with `?` (see `read!`).
macro_rules! read_me {
    ($r: expr) => {{ ((read_le!($r, u16) as i32) << 16) | (read_le!($r, u16) as i32) }};
}
227
228#[inline(always)]
229fn read_octal_u64<D: DataRead>(haystack: &mut D) -> Option<u64> {
230 let s = haystack
231 .read_while_or_limit(|b| matches!(b, b'0'..=b'7'), 22)
232 .map(|buf| str::from_utf8(buf))
233 .ok()?
234 .ok()?;
235
236 if !s.starts_with("0") {
237 return None;
238 }
239
240 u64::from_str_radix(s, 8).ok()
241}
242
/// Errors produced by this crate.
#[derive(Debug, Error)]
pub enum Error {
    /// Free-form error message.
    #[error("{0}")]
    Msg(String),

    /// An error tagged with a source name and a line number, rendered as
    /// `source=… line=… error=…`.
    // NOTE(review): presumably raised while verifying rules — confirm against call sites.
    #[error("source={0} line={1} error={2}")]
    Verify(String, usize, Box<Error>),

    /// An error tagged with the source name and line number it relates to,
    /// rendered as `source=… line=… error=…`. The wrapped error can be
    /// retrieved with [`Error::unwrap_localized`].
    #[error("source={0} line={1} error={2}")]
    Localized(String, usize, Box<Error>),

    /// A rule referenced by name is missing; the payload is rendered after
    /// `missing rule:`.
    #[error("missing rule: {0}")]
    MissingRule(String),

    /// The maximum recursion was reached; the payload is rendered after
    /// `maximum recursion reached:`.
    #[error("maximum recursion reached: {0}")]
    MaximumRecursion(usize),

    /// Wrapper around a standard I/O error.
    #[error("io: {0}")]
    Io(#[from] io::Error),

    /// Wrapper around a (boxed) `pest` parser error.
    #[error("parser error: {0}")]
    Parse(#[from] Box<pest::error::Error<Rule>>),

    /// Wrapper around a `dyf` formatting error.
    #[error("formatting: {0}")]
    Format(#[from] dyf::Error),

    /// Wrapper around a `regex` error.
    #[error("regex: {0}")]
    Regex(#[from] regex::Error),

    /// Wrapper around a `bincode` encoding error.
    #[error("{0}")]
    Serialize(#[from] bincode::error::EncodeError),

    /// Wrapper around a `bincode` decoding error.
    #[error("{0}")]
    Deserialize(#[from] bincode::error::DecodeError),
}
290
291impl Error {
292 #[inline]
293 fn parser<S: ToString>(msg: S, span: Span<'_>) -> Self {
294 Self::Parse(Box::new(pest::error::Error::new_from_span(
295 ErrorVariant::CustomError {
296 message: msg.to_string(),
297 },
298 span,
299 )))
300 }
301
302 fn msg<M: AsRef<str>>(msg: M) -> Self {
303 Self::Msg(msg.as_ref().into())
304 }
305
306 fn localized<S: AsRef<str>>(source: S, line: usize, err: Error) -> Self {
307 Self::Localized(source.as_ref().into(), line, err.into())
308 }
309
310 pub fn unwrap_localized(&self) -> &Self {
312 match self {
313 Self::Localized(_, _, e) => e,
314 _ => self,
315 }
316 }
317}
318
/// Text attached to a rule: either a plain string or a format string that can
/// be rendered with a match result (see `Message::format_with`).
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Message {
    /// Plain text, emitted as is.
    String(String),
    /// Text containing a format directive.
    Format {
        /// Conversion specifier of the directive; `format_with` compares it
        /// against `"c"` to render byte scalars as characters.
        printf_spec: String,
        /// Format string handed to `dformat!`.
        fs: FormatString,
    },
}
327
328impl Display for Message {
329 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
330 match self {
331 Self::String(s) => write!(f, "{s}"),
332 Self::Format { printf_spec: _, fs } => write!(f, "{}", fs.to_string_lossy()),
333 }
334 }
335}
336
337impl Message {
338 fn to_string_lossy(&self) -> Cow<'_, str> {
339 match self {
340 Message::String(s) => Cow::Borrowed(s),
341 Message::Format { printf_spec: _, fs } => fs.to_string_lossy(),
342 }
343 }
344
345 #[inline(always)]
346 fn format_with(&self, mr: Option<&MatchRes>) -> Result<Cow<'_, str>, Error> {
347 match self {
348 Self::String(s) => Ok(Cow::Borrowed(s.as_str())),
349 Self::Format {
350 printf_spec: c_spec,
351 fs,
352 } => {
353 if let Some(mr) = mr {
354 match mr {
355 MatchRes::Float(_, _) | MatchRes::Bytes(_, _, _, _) => {
356 Ok(Cow::Owned(dformat!(fs, mr)?))
357 }
358 MatchRes::Scalar(_, scalar) => {
359 if c_spec.as_str() == "c" {
361 match scalar {
362 Scalar::byte(b) => {
363 let b = (*b as u8) as char;
364 Ok(Cow::Owned(dformat!(fs, b)?))
365 }
366 Scalar::ubyte(b) => {
367 let b = *b as char;
368 Ok(Cow::Owned(dformat!(fs, b)?))
369 }
370 _ => Ok(Cow::Owned(dformat!(fs, mr)?)),
371 }
372 } else {
373 Ok(Cow::Owned(dformat!(fs, mr)?))
374 }
375 }
376 }
377 } else {
378 Ok(fs.to_string_lossy())
379 }
380 }
381 }
382 }
383}
384
impl ScalarDataType {
    /// Reads one scalar of this data type from `from`.
    ///
    /// `switch_endianness` inverts the byte order of every multi-byte type that
    /// goes through the local `_read_le!` / `_read_be!` / `_read_ne!` /
    /// `_read_me!` macros. `byte`, `ubyte`, `guid` (always decoded big-endian)
    /// and `offset` (the current stream position, nothing is read) ignore it.
    ///
    /// # Errors
    /// Propagates the error of the underlying read (via `read!`).
    #[inline(always)]
    fn read<R: DataRead>(&self, from: &mut R, switch_endianness: bool) -> Result<Scalar, Error> {
        // Little-endian read, decoded big-endian when the switch is set.
        macro_rules! _read_le {
            ($ty: ty) => {{
                if switch_endianness {
                    <$ty>::from_be_bytes(read!(from, $ty))
                } else {
                    <$ty>::from_le_bytes(read!(from, $ty))
                }
            }};
        }

        // Big-endian read, decoded little-endian when the switch is set.
        macro_rules! _read_be {
            ($ty: ty) => {{
                if switch_endianness {
                    <$ty>::from_le_bytes(read!(from, $ty))
                } else {
                    <$ty>::from_be_bytes(read!(from, $ty))
                }
            }};
        }

        // Native-endian read: follows the endianness of the compilation target
        // (and still honors the switch through the macro it delegates to).
        macro_rules! _read_ne {
            ($ty: ty) => {{
                if cfg!(target_endian = "big") {
                    _read_be!($ty)
                } else {
                    _read_le!($ty)
                }
            }};
        }

        // Two 16-bit words; the first one read forms the upper half of the i32.
        macro_rules! _read_me {
            () => {
                ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
            };
        }

        Ok(match self {
            Self::byte => Scalar::byte(read!(from, u8)[0] as i8),
            Self::short => Scalar::short(_read_ne!(i16)),
            Self::long => Scalar::long(_read_ne!(i32)),
            Self::date => Scalar::date(_read_ne!(i32)),
            Self::ldate => Scalar::ldate(_read_ne!(i32)),
            Self::qwdate => Scalar::qwdate(_read_ne!(i64)),
            Self::leshort => Scalar::leshort(_read_le!(i16)),
            Self::lelong => Scalar::lelong(_read_le!(i32)),
            Self::lequad => Scalar::lequad(_read_le!(i64)),
            Self::bequad => Scalar::bequad(_read_be!(i64)),
            Self::belong => Scalar::belong(_read_be!(i32)),
            Self::bedate => Scalar::bedate(_read_be!(i32)),
            Self::beldate => Scalar::beldate(_read_be!(i32)),
            Self::beqdate => Scalar::beqdate(_read_be!(i64)),
            Self::ubyte => Scalar::ubyte(read!(from, u8)[0]),
            Self::ushort => Scalar::ushort(_read_ne!(u16)),
            Self::uleshort => Scalar::uleshort(_read_le!(u16)),
            Self::ulelong => Scalar::ulelong(_read_le!(u32)),
            Self::uledate => Scalar::uledate(_read_le!(u32)),
            Self::ulequad => Scalar::ulequad(_read_le!(u64)),
            // No bytes are consumed: the value is the reader's current position.
            Self::offset => Scalar::offset(from.stream_position()),
            Self::ubequad => Scalar::ubequad(_read_be!(u64)),
            Self::medate => Scalar::medate(_read_me!()),
            Self::meldate => Scalar::meldate(_read_me!()),
            Self::melong => Scalar::melong(_read_me!()),
            Self::beshort => Scalar::beshort(_read_be!(i16)),
            Self::quad => Scalar::quad(_read_ne!(i64)),
            Self::uquad => Scalar::uquad(_read_ne!(u64)),
            Self::ledate => Scalar::ledate(_read_le!(i32)),
            Self::leldate => Scalar::leldate(_read_le!(i32)),
            Self::leqdate => Scalar::leqdate(_read_le!(i64)),
            Self::leqldate => Scalar::leqldate(_read_le!(i64)),
            Self::leqwdate => Scalar::leqwdate(_read_le!(i64)),
            Self::ubelong => Scalar::ubelong(_read_be!(u32)),
            Self::ulong => Scalar::ulong(_read_ne!(u32)),
            Self::ubeshort => Scalar::ubeshort(_read_be!(u16)),
            Self::ubeqdate => Scalar::ubeqdate(_read_be!(u64)),
            Self::lemsdosdate => Scalar::lemsdosdate(_read_le!(u16)),
            Self::lemsdostime => Scalar::lemsdostime(_read_le!(u16)),
            // Always big-endian, regardless of `switch_endianness`.
            Self::guid => Scalar::guid(u128::from_be_bytes(read!(from, u128))),
        })
    }
}
470
471impl FloatDataType {
472 #[inline(always)]
473 fn read<R: DataRead>(&self, from: &mut R, switch_endianness: bool) -> Result<Float, Error> {
474 macro_rules! _read_le {
475 ($ty: ty) => {{
476 if switch_endianness {
477 <$ty>::from_be_bytes(read!(from, $ty))
478 } else {
479 <$ty>::from_le_bytes(read!(from, $ty))
480 }
481 }};
482 }
483
484 macro_rules! _read_be {
485 ($ty: ty) => {{
486 if switch_endianness {
487 <$ty>::from_le_bytes(read!(from, $ty))
488 } else {
489 <$ty>::from_be_bytes(read!(from, $ty))
490 }
491 }};
492 }
493
494 macro_rules! _read_ne {
495 ($ty: ty) => {{
496 if cfg!(target_endian = "big") {
497 _read_be!($ty)
498 } else {
499 _read_le!($ty)
500 }
501 }};
502 }
503
504 macro_rules! _read_me {
505 () => {
506 ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
507 };
508 }
509
510 Ok(match self {
511 Self::lefloat => Float::lefloat(_read_le!(f32)),
512 Self::befloat => Float::befloat(_read_le!(f32)),
513 Self::ledouble => Float::ledouble(_read_le!(f64)),
514 Self::bedouble => Float::bedouble(_read_be!(f64)),
515 })
516 }
517}
518
/// Arithmetic or bitwise operator used by value transforms
/// (`ScalarTransform`, `FloatTransform`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Op {
    /// Multiplication (`*`).
    Mul,
    /// Addition (`+`).
    Add,
    /// Subtraction (`-`).
    Sub,
    /// Division (`/`).
    Div,
    /// Remainder (`%`).
    Mod,
    /// Bitwise and (`&`).
    And,
    /// Bitwise exclusive or (`^`).
    Xor,
    /// Bitwise or (`|`).
    Or,
}
530
531impl Display for Op {
532 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
533 match self {
534 Op::Mul => write!(f, "*"),
535 Op::Add => write!(f, "+"),
536 Op::Sub => write!(f, "-"),
537 Op::Div => write!(f, "/"),
538 Op::Mod => write!(f, "%"),
539 Op::And => write!(f, "&"),
540 Op::Or => write!(f, "|"),
541 Op::Xor => write!(f, "^"),
542 }
543 }
544}
545
/// Comparison operator of a test.
///
/// The per-variant notes describe how scalar tests evaluate the operator
/// (`Test::match_value`); other test kinds support only a subset.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum CmpOp {
    /// Read value equals the test value.
    Eq,
    /// Read value is lower than the test value.
    Lt,
    /// Read value is greater than the test value.
    Gt,
    /// All bits of the test value are set in the read value.
    BitAnd,
    /// Read value differs from the test value.
    Neq,
    /// No bit of the test value is set in the read value.
    Xor,
    /// Read value equals the bitwise negation of the test value.
    Not,
}
556
557impl CmpOp {
558 #[inline(always)]
559 fn is_neq(&self) -> bool {
560 matches!(self, Self::Neq)
561 }
562}
563
/// Operation applied to a scalar read from the input before it is compared
/// with the test value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTransform {
    /// Operator to apply.
    op: Op,
    /// Right-hand operand of the operation.
    num: Scalar,
}
569
570impl ScalarTransform {
571 fn apply(&self, s: Scalar) -> Option<Scalar> {
572 match self.op {
573 Op::Add => s.checked_add(self.num),
574 Op::Sub => s.checked_sub(self.num),
575 Op::Mul => s.checked_mul(self.num),
576 Op::Div => s.checked_div(self.num),
577 Op::Mod => s.checked_rem(self.num),
578 Op::And => Some(s.bitand(self.num)),
579 Op::Xor => Some(s.bitxor(self.num)),
580 Op::Or => Some(s.bitor(self.num)),
581 }
582 }
583}
584
/// Operation applied to a float read from the input before it is compared
/// with the test value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTransform {
    /// Operator to apply (bitwise operators are not supported, see `apply`).
    op: Op,
    /// Right-hand operand of the operation.
    num: Float,
}
590
591impl FloatTransform {
592 fn apply(&self, s: Float) -> Float {
593 match self.op {
594 Op::Add => s.add(self.num),
595 Op::Sub => s.sub(self.num),
596 Op::Mul => s.mul(self.num),
597 Op::Div => s.div(self.num),
599 Op::Mod => s.rem(self.num),
601 Op::And | Op::Xor | Op::Or => {
603 debug_panic!("unsupported operation");
604 s
605 }
606 }
607 }
608}
609
/// Value a test compares against.
#[derive(Clone, Serialize, Deserialize)]
enum TestValue<T> {
    /// A concrete value.
    Value(T),
    /// Wildcard: no concrete value to compare with. Scalar and float tests
    /// accept whatever was read in that case.
    Any,
}
615
616impl Debug for TestValue<Vec<u8>> {
617 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
618 match self {
619 Self::Value(v) => write!(f, "\"{}\"", debug_string_from_vec_u8(v)),
620 Self::Any => write!(f, "ANY"),
621 }
622 }
623}
624
625impl Debug for TestValue<Vec<u16>> {
626 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
627 match self {
628 Self::Value(v) => write!(f, "\"{}\"", debug_string_from_vec_u16(v)),
629 Self::Any => write!(f, "ANY"),
630 }
631 }
632}
633
634impl Debug for TestValue<Scalar> {
635 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
636 match self {
637 Self::Value(s) => write!(f, "{s:?}"),
638 Self::Any => write!(f, "ANY"),
639 }
640 }
641}
642
643impl Debug for TestValue<Float> {
644 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
645 match self {
646 Self::Value(fl) => write!(f, "{fl:?}"),
647 Self::Any => write!(f, "ANY"),
648 }
649 }
650}
651
652impl<T> TestValue<T> {
653 #[inline(always)]
654 fn as_ref(&self) -> TestValue<&T> {
655 match self {
656 Self::Value(v) => TestValue::Value(v),
657 Self::Any => TestValue::Any,
658 }
659 }
660}
661
// Modifiers attached to regex tests (also carried by search tests).
flags! {
    enum ReMod: u8{
        // NOTE(review): presumably makes the regex case insensitive; the place
        // where it is applied is outside this section.
        CaseInsensitive,
        // NOTE(review): presumably changes how the offset is updated after a
        // match; not used in this section — confirm.
        StartOffsetUpdate,
        // The test length counts lines instead of bytes: `read_test_value`
        // reads 80 bytes per requested line.
        LineLimit,
        // Forces the test to be treated as binary (see `is_binary`).
        ForceBin,
        // Forces the test to be treated as text (see `is_text`).
        ForceText,
        // NOTE(review): presumably trims the matched text; not used in this
        // section — confirm.
        TrimMatch,
    }
}
672
673fn serialize_regex<S>(re: &bytes::Regex, serializer: S) -> Result<S::Ok, S::Error>
674where
675 S: serde::Serializer,
676{
677 re.as_str().serialize(serializer)
678}
679
680fn deserialize_regex<'de, D>(deserializer: D) -> Result<bytes::Regex, D::Error>
681where
682 D: serde::Deserializer<'de>,
683{
684 let wrapper = String::deserialize(deserializer)?;
685 bytes::Regex::new(&wrapper).map_err(serde::de::Error::custom)
686}
687
/// Test matching a regular expression against the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct RegexTest {
    /// Compiled regex; (de)serialized through its pattern string.
    #[serde(
        serialize_with = "serialize_regex",
        deserialize_with = "deserialize_regex"
    )]
    re: bytes::Regex,
    /// Optional length of the test: bounds the amount of data read
    /// (`Test::read_test_value`) and the number of lines scanned on text
    /// streams (`match_buf`).
    length: Option<usize>,
    /// Regex modifiers.
    mods: FlagSet<ReMod>,
    /// String modifiers.
    str_mods: FlagSet<StringMod>,
    // NOTE(review): not used in this section — presumably the length of the
    // non-special part of the pattern, used for rule strength; confirm.
    non_magic_len: usize,
    /// Whether the test is considered binary (see `is_binary`).
    binary: bool,
    /// Comparison operator; `Neq` turns the absence of a match into a match.
    cmp_op: CmpOp,
}
702
703impl RegexTest {
704 #[inline(always)]
705 fn is_binary(&self) -> bool {
706 self.binary
707 || self.mods.contains(ReMod::ForceBin)
708 || self.str_mods.contains(StringMod::ForceBin)
709 }
710
711 #[inline(always)]
712 fn is_text(&self) -> bool {
713 self.mods.contains(ReMod::ForceText) || self.str_mods.contains(StringMod::ForceText)
714 }
715
716 fn match_buf<'buf>(
717 &self,
718 off_buf: u64, stream_kind: StreamKind,
720 buf: &'buf [u8],
721 ) -> Option<MatchRes<'buf>> {
722 let mr = match stream_kind {
723 StreamKind::Text(_) => {
724 let mut off_txt = off_buf;
725
726 let mut line_limit = self.length.unwrap_or(usize::MAX);
727
728 for line in buf.split(|c| c == &b'\n') {
729 if line_limit == 0 {
733 break;
734 }
735
736 if let Some(re_match) = self.re.find(line) {
737 let start_offset = off_txt + re_match.start() as u64;
739
740 let stop_offset = if re_match.end() == line.len() {
742 Some(start_offset + re_match.as_bytes().len() as u64 + 1)
743 } else {
744 None
745 };
746
747 return Some(MatchRes::Bytes(
748 start_offset,
749 stop_offset,
750 re_match.as_bytes(),
751 Encoding::Utf8,
752 ));
753 }
754
755 off_txt += line.len() as u64;
756 off_txt += 1;
758 line_limit = line_limit.saturating_sub(1)
759 }
760 None
761 }
762
763 StreamKind::Binary => {
764 self.re.find(buf).map(|re_match| {
765 MatchRes::Bytes(
766 off_buf + re_match.start() as u64,
768 None,
769 re_match.as_bytes(),
770 Encoding::Utf8,
771 )
772 })
773 }
774 };
775
776 if self.cmp_op.is_neq() && mr.is_none() {
778 return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
779 }
780
781 mr
782 }
783}
784
785impl From<RegexTest> for Test {
786 fn from(value: RegexTest) -> Self {
787 Self::Regex(value)
788 }
789}
790
// Modifiers attached to string tests (also carried by search and regex tests).
flags! {
    enum StringMod: u8{
        // Forces the test to be treated as binary (see `is_binary`).
        ForceBin,
        // An uppercase letter of the test string matches both cases in the
        // target (see `string_match`).
        UpperInsensitive,
        // A lowercase letter of the test string matches both cases in the
        // target (see `string_match`).
        LowerInsensitive,
        // The match must be followed by whitespace or the end of the buffer;
        // search tests also require whitespace (or buffer start) before it.
        FullWordMatch,
        // The reported match is trimmed of ASCII whitespace.
        Trim,
        // Forces the test to be treated as text (see `is_text`).
        ForceText,
        // A run of blanks in the test string matches a run of blanks at least
        // as long in the target (see `string_match`).
        CompactWhitespace,
        // Blanks are optional: they are skipped on both sides while matching
        // (see `string_match`).
        OptBlank,
    }
}
803
/// Test comparing bytes of the input with a string.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StringTest {
    /// String to compare with, or the wildcard.
    test_val: TestValue<Vec<u8>>,
    /// Comparison operator (`Eq`, `Neq`, `Lt` and `Gt` are supported when
    /// reading the value).
    cmp_op: CmpOp,
    /// Optional number of bytes to read for the comparison.
    length: Option<usize>,
    /// String modifiers.
    mods: FlagSet<StringMod>,
    /// Whether the test is considered binary (see `is_binary`).
    binary: bool,
}
812
813impl From<StringTest> for Test {
814 fn from(value: StringTest) -> Self {
815 Self::String(value)
816 }
817}
818
/// Matches the test string `str` against the beginning of `buf`, honoring the
/// string modifiers in `mods`.
///
/// Returns `(matched, consumed)` where `consumed` is the number of bytes of
/// `buf` that were walked through. Without any matching-related modifier this
/// is a plain prefix comparison (`consumed` is `str.len()` on success and `0`
/// on failure).
#[inline(always)]
fn string_match(str: &[u8], mods: FlagSet<StringMod>, buf: &[u8]) -> (bool, usize) {
    let mut consumed = 0;
    // fast path: none of the modifiers altering the comparison is set
    if mods.is_disjoint(
        StringMod::UpperInsensitive
            | StringMod::LowerInsensitive
            | StringMod::FullWordMatch
            | StringMod::CompactWhitespace
            | StringMod::OptBlank,
    ) {
        if buf.starts_with(str) {
            (true, str.len())
        } else {
            (false, consumed)
        }
    } else {
        // index of the current byte in the test string
        let mut i_src = 0;
        // cursor over the target buffer
        let mut iter = buf.iter().peekable();

        // advances the target cursor by one byte
        macro_rules! consume_target {
            () => {{
                if iter.next().is_some() {
                    consumed += 1;
                }
            }};
        }

        // current bytes matched: advances both sides and moves on
        macro_rules! continue_next_iteration {
            () => {{
                consume_target!();
                i_src += 1;
                continue;
            }};
        }

        while let Some(&&b) = iter.peek() {
            // the test string is exhausted: nothing left to compare
            let Some(&ref_byte) = str.get(i_src) else {
                break;
            };

            if mods.contains(StringMod::OptBlank) && (b == b' ' || ref_byte == b' ') {
                if b == b' ' {
                    // blank in the target is skipped
                    consume_target!();
                }

                if ref_byte == b' ' {
                    // blank in the test string is skipped
                    i_src += 1;
                }

                continue;
            }

            if mods.contains(StringMod::UpperInsensitive) {
                // an uppercase test byte matches either case in the target
                if ref_byte.is_ascii_uppercase() && ref_byte == b.to_ascii_uppercase()
                    || ref_byte == b
                {
                    continue_next_iteration!()
                }
            }

            // a lowercase test byte matches either case in the target
            if mods.contains(StringMod::LowerInsensitive)
                && (ref_byte.is_ascii_lowercase() && ref_byte == b.to_ascii_lowercase()
                    || ref_byte == b)
            {
                continue_next_iteration!()
            }

            if mods.contains(StringMod::CompactWhitespace) && ref_byte == b' ' {
                // count the run of blanks in the test string...
                let mut src_blk = 0;
                while let Some(b' ') = str.get(i_src) {
                    src_blk += 1;
                    i_src += 1;
                }

                // ...and the run of blanks in the target
                let mut tgt_blk = 0;
                while let Some(b' ') = iter.peek() {
                    tgt_blk += 1;
                    consume_target!();
                }

                // the target must hold at least as many blanks as the test string
                if src_blk > tgt_blk {
                    return (false, consumed);
                }

                continue;
            }

            if ref_byte == b {
                continue_next_iteration!()
            } else {
                return (false, consumed);
            }
        }

        // a full word match must be followed by whitespace (or end of buffer)
        if mods.contains(StringMod::FullWordMatch)
            && let Some(b) = iter.peek()
            && !b.is_ascii_whitespace()
        {
            return (false, consumed);
        }

        // success requires the whole test string to have been walked through
        (
            consumed > 0 && str.get(i_src).is_none() && consumed <= buf.len(),
            consumed,
        )
    }
}
931
932impl StringTest {
933 fn has_length_mod(&self) -> bool {
934 !self.mods.is_disjoint(
935 StringMod::UpperInsensitive
936 | StringMod::LowerInsensitive
937 | StringMod::FullWordMatch
938 | StringMod::CompactWhitespace
939 | StringMod::OptBlank,
940 )
941 }
942
943 #[inline(always)]
944 fn test_value_len(&self) -> usize {
945 match self.test_val.as_ref() {
946 TestValue::Value(s) => s.len(),
947 TestValue::Any => 0,
948 }
949 }
950
951 #[inline(always)]
952 fn is_binary(&self) -> bool {
953 self.binary || self.mods.contains(StringMod::ForceBin)
954 }
955
956 #[inline(always)]
957 fn is_text(&self) -> bool {
958 self.mods.contains(StringMod::ForceText)
959 }
960}
961
/// Newtype over `Vec<u8>` whose `Debug` output is a quoted, readable string
/// instead of a list of numbers.
#[derive(Clone, Serialize, Deserialize)]
struct ByteVec(Vec<u8>);
964
965impl Debug for ByteVec {
966 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
967 write!(f, "\"{}\"", debug_string_from_vec_u8(self))
968 }
969}
970
971impl From<Vec<u8>> for ByteVec {
972 fn from(value: Vec<u8>) -> Self {
973 Self(value)
974 }
975}
976
977impl Deref for ByteVec {
978 type Target = Vec<u8>;
979
980 fn deref(&self) -> &Self::Target {
981 &self.0
982 }
983}
984
/// Test searching a string within a window of the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SearchTest {
    /// String searched for.
    str: ByteVec,
    /// Optional highest position, relative to the start of the searched
    /// buffer, at which a match may begin.
    n_pos: Option<usize>,
    /// String modifiers used while matching.
    str_mods: FlagSet<StringMod>,
    /// Regex modifiers; only `ForceBin` / `ForceText` are read in this section.
    re_mods: FlagSet<ReMod>,
    /// Whether the test is considered binary (see `is_binary`).
    binary: bool,
    /// Comparison operator; `Neq` turns the absence of a match into a match.
    cmp_op: CmpOp,
}
994
995impl From<SearchTest> for Test {
996 fn from(value: SearchTest) -> Self {
997 Self::Search(value)
998 }
999}
1000
1001impl SearchTest {
1002 #[inline(always)]
1003 fn is_binary(&self) -> bool {
1004 (self.binary
1005 || self.str_mods.contains(StringMod::ForceBin)
1006 || self.re_mods.contains(ReMod::ForceBin))
1007 && !(self.str_mods.contains(StringMod::ForceText)
1008 || self.re_mods.contains(ReMod::ForceText))
1009 }
1010
1011 #[inline]
1013 fn match_buf<'buf>(&self, off_buf: u64, buf: &'buf [u8]) -> Option<MatchRes<'buf>> {
1014 let mut i = 0;
1015
1016 let needle = self.str.first()?;
1017
1018 while i < buf.len() {
1019 let Some(k) = memchr(*needle, &buf[i..]) else {
1022 break;
1023 };
1024
1025 i += k;
1026
1027 if self.str_mods.contains(StringMod::FullWordMatch) {
1029 let prev_is_whitespace = buf
1030 .get(i.saturating_sub(1))
1031 .map(|c| c.is_ascii_whitespace())
1032 .unwrap_or_default();
1033
1034 if i > 0 && !prev_is_whitespace {
1039 i += 1;
1040 continue;
1041 }
1042 }
1043
1044 if let Some(npos) = self.n_pos
1045 && i > npos
1046 {
1047 break;
1048 }
1049
1050 let pos = i;
1051 let (ok, consumed) = string_match(&self.str, self.str_mods, &buf[i..]);
1052
1053 if ok {
1054 return Some(MatchRes::Bytes(
1055 off_buf.saturating_add(pos as u64),
1056 None,
1057 &buf[i..i + consumed],
1058 Encoding::Utf8,
1059 ));
1060 } else {
1061 i += max(consumed, 1)
1062 }
1063 }
1064
1065 if self.cmp_op.is_neq() {
1067 return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
1068 }
1069
1070 None
1071 }
1072}
1073
/// Test comparing an integer-like value read from the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTest {
    /// Data type to read.
    ty: ScalarDataType,
    /// Optional operation applied to the value read, before the comparison.
    transform: Option<ScalarTransform>,
    /// Comparison operator.
    cmp_op: CmpOp,
    /// Value to compare with, or the wildcard.
    test_val: TestValue<Scalar>,
}
1081
/// Test comparing a floating point value read from the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTest {
    /// Data type to read.
    ty: FloatDataType,
    /// Optional operation applied to the value read, before the comparison.
    transform: Option<FloatTransform>,
    /// Comparison operator (`Eq`, `Lt`, `Gt` and `Neq` are supported).
    cmp_op: CmpOp,
    /// Value to compare with, or the wildcard.
    test_val: TestValue<Float>,
}
1089
/// Value read from the input for a test. The first field of every variant is
/// the stream position at which the read started.
#[derive(PartialEq)]
enum ReadValue<'buf> {
    /// A floating point value.
    Float(u64, Float),
    /// An integer-like value.
    Scalar(u64, Scalar),
    /// Raw bytes borrowed from the reader.
    Bytes(u64, &'buf [u8]),
}
1098
1099impl<'buf> Debug for ReadValue<'buf> {
1100 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1101 match self {
1102 Self::Float(_, fl) => write!(f, "{fl:?}"),
1103 Self::Scalar(_, s) => write!(f, "{s:?}"),
1104 Self::Bytes(_, b) => {
1105 if b.len() <= 128 {
1106 write!(f, "\"{}\"", debug_string_from_vec_u8(b))
1107 } else {
1108 let limit = 128;
1109 write!(
1110 f,
1111 "\"{}\" (first {limit} bytes)",
1112 debug_string_from_vec_u8(&b[..limit])
1113 )
1114 }
1115 }
1116 }
1117 }
1118}
1119
1120impl DynDisplay for ReadValue<'_> {
1121 fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1122 match self {
1123 Self::Float(_, s) => DynDisplay::dyn_fmt(s, f),
1124 Self::Scalar(_, s) => DynDisplay::dyn_fmt(s, f),
1125 Self::Bytes(_, b) => Ok(format!("{b:?}")),
1126 }
1127 }
1128}
1129
1130impl DynDisplay for &ReadValue<'_> {
1131 fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1132 DynDisplay::dyn_fmt(*self, f)
1134 }
1135}
1136
1137impl Display for ReadValue<'_> {
1138 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1139 match self {
1140 Self::Float(_, v) => write!(f, "{v}"),
1141 Self::Scalar(_, s) => write!(f, "{s}"),
1142 Self::Bytes(_, b) => write!(f, "{b:?}"),
1143 }
1144 }
1145}
1146
/// Encoding of matched bytes, used to decode them when a match is formatted.
enum Encoding {
    /// UTF-16 with the given byte order.
    Utf16(String16Encoding),
    /// UTF-8 (decoded lossily).
    Utf8,
}
1151
/// Result of a successful test.
enum MatchRes<'buf> {
    /// Matched bytes: start offset, optional explicit end offset (when absent
    /// the end is the start plus the length of the bytes), the bytes and their
    /// encoding.
    Bytes(u64, Option<u64>, &'buf [u8], Encoding),
    /// Matched scalar: start offset and value.
    Scalar(u64, Scalar),
    /// Matched float: start offset and value.
    Float(u64, Float),
}
1163
1164impl DynDisplay for &MatchRes<'_> {
1165 fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1166 (*self).dyn_fmt(f)
1167 }
1168}
1169
1170impl DynDisplay for MatchRes<'_> {
1171 fn dyn_fmt(&self, f: &dyf::FormatSpec) -> Result<String, dyf::Error> {
1172 match self {
1173 Self::Scalar(_, v) => v.dyn_fmt(f),
1174 Self::Float(_, v) => v.dyn_fmt(f),
1175 Self::Bytes(_, _, v, enc) => match enc {
1176 Encoding::Utf8 => String::from_utf8_lossy(v).to_string().dyn_fmt(f),
1177 Encoding::Utf16(enc) => {
1178 let utf16: Vec<u16> = slice_to_utf16_iter(v, *enc).collect();
1179 String::from_utf16_lossy(&utf16).dyn_fmt(f)
1180 }
1181 },
1182 }
1183 }
1184}
1185
1186impl MatchRes<'_> {
1187 #[inline]
1189 fn start_offset(&self) -> u64 {
1190 match self {
1191 MatchRes::Bytes(o, _, _, _) => *o,
1192 MatchRes::Scalar(o, _) => *o,
1193 MatchRes::Float(o, _) => *o,
1194 }
1195 }
1196
1197 #[inline]
1199 fn end_offset(&self) -> u64 {
1200 match self {
1201 MatchRes::Bytes(start, end, buf, _) => match end {
1202 Some(end) => *end,
1203 None => start.saturating_add(buf.len() as u64),
1204 },
1205 MatchRes::Scalar(o, sc) => o.add(sc.size_of() as u64),
1206 MatchRes::Float(o, f) => o.add(f.size_of() as u64),
1207 }
1208 }
1209}
1210
1211fn slice_to_utf16_iter(read: &[u8], encoding: String16Encoding) -> impl Iterator<Item = u16> {
1212 let even = read
1213 .iter()
1214 .enumerate()
1215 .filter(|(i, _)| i % 2 == 0)
1216 .map(|t| t.1);
1217
1218 let odd = read
1219 .iter()
1220 .enumerate()
1221 .filter(|(i, _)| i % 2 != 0)
1222 .map(|t| t.1);
1223
1224 even.zip(odd).map(move |(e, o)| match encoding {
1225 String16Encoding::Le => u16::from_le_bytes([*e, *o]),
1226 String16Encoding::Be => u16::from_be_bytes([*e, *o]),
1227 })
1228}
1229
/// Byte order of 16-bit string code units.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum String16Encoding {
    /// Little-endian code units.
    Le,
    /// Big-endian code units.
    Be,
}
1235
/// Test comparing the input with a string made of 16-bit code units.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct String16Test {
    // NOTE(review): presumably the test string as written in the rule; it is
    // not read in this section — confirm.
    orig: String,
    /// Code units to compare with, or the wildcard.
    test_val: TestValue<Vec<u16>>,
    /// Byte order of the code units in the input.
    encoding: String16Encoding,
}
1242
1243impl String16Test {
1244 #[inline(always)]
1248 fn test_value_len(&self) -> usize {
1249 match self.test_val.as_ref() {
1250 TestValue::Value(str16) => str16.len(),
1251 TestValue::Any => 0,
1252 }
1253 }
1254}
1255
// Modifiers of the indirect test.
flags! {
    enum IndirectMod: u8{
        // NOTE(review): presumably makes the indirect offset relative; the
        // code applying it is outside this section — confirm.
        Relative,
    }
}

/// Set of [`IndirectMod`] flags.
type IndirectMods = FlagSet<IndirectMod>;
1263
/// Type of the length prefix of a length-prefixed string.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum PStringLen {
    /// One byte.
    Byte,
    /// Two bytes, big-endian.
    ShortBe,
    /// Two bytes, little-endian.
    ShortLe,
    /// Four bytes, big-endian.
    LongBe,
    /// Four bytes, little-endian.
    LongLe,
}
1272
1273impl PStringLen {
1274 #[inline(always)]
1275 const fn size_of_len(&self) -> usize {
1276 match self {
1277 PStringLen::Byte => 1,
1278 PStringLen::ShortBe => 2,
1279 PStringLen::ShortLe => 2,
1280 PStringLen::LongBe => 4,
1281 PStringLen::LongLe => 4,
1282 }
1283 }
1284}
1285
/// Test on a length-prefixed string.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PStringTest {
    /// Type of the length prefix.
    len: PStringLen,
    /// String to compare with, or the wildcard.
    test_val: TestValue<Vec<u8>>,
    /// Whether the stored length also counts the length prefix itself.
    include_len: bool,
}
1292
1293impl PStringTest {
1294 #[inline]
1295 fn read<'cache, R: DataRead>(
1296 &self,
1297 haystack: &'cache mut R,
1298 ) -> Result<Option<&'cache [u8]>, Error> {
1299 let mut len = match self.len {
1300 PStringLen::Byte => read_le!(haystack, u8) as u32,
1301 PStringLen::ShortBe => read_be!(haystack, u16) as u32,
1302 PStringLen::ShortLe => read_le!(haystack, u16) as u32,
1303 PStringLen::LongBe => read_be!(haystack, u32),
1304 PStringLen::LongLe => read_le!(haystack, u32),
1305 } as usize;
1306
1307 if self.include_len {
1308 len = len.saturating_sub(self.len.size_of_len())
1309 }
1310
1311 if let TestValue::Value(s) = self.test_val.as_ref()
1312 && len != s.len()
1313 {
1314 return Ok(None);
1315 }
1316
1317 let read = haystack.read_exact_count(len as u64)?;
1318
1319 Ok(Some(read))
1320 }
1321
1322 #[inline(always)]
1323 fn test_value_len(&self) -> usize {
1324 match self.test_val.as_ref() {
1325 TestValue::Value(s) => s.len(),
1326 TestValue::Any => 0,
1327 }
1328 }
1329}
1330
/// A test of a magic rule.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Test {
    /// Declares a named rule.
    Name(String),
    /// Calls a named rule.
    // NOTE(review): the meaning of the boolean is not visible in this section;
    // `Display` prints the `^` prefix when it is `false` — confirm.
    Use(bool, String),
    /// Integer-like comparison.
    Scalar(ScalarTest),
    /// Floating point comparison.
    Float(FloatTest),
    /// String comparison.
    String(StringTest),
    /// String search.
    Search(SearchTest),
    /// Length-prefixed string comparison.
    PString(PStringTest),
    /// Regular expression match.
    Regex(RegexTest),
    /// Indirect test with its modifiers.
    Indirect(FlagSet<IndirectMod>),
    /// 16-bit string comparison.
    String16(String16Test),
    /// DER test; displayed as `unimplemented der`.
    #[allow(dead_code)]
    Der,
    /// `clear` test.
    Clear,
    /// `default` test.
    Default,
}
1349
1350impl Display for Test {
1351 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1352 match self {
1353 Test::Name(name) => write!(f, "name {name}"),
1354 Test::Use(flip, rule) => {
1355 if *flip {
1356 write!(f, "use {rule}")
1357 } else {
1358 write!(f, "use ^{rule}")
1359 }
1360 }
1361 Test::Scalar(st) => write!(f, "{st:?}"),
1362 Test::Float(ft) => write!(f, "{ft:?}"),
1363 Test::String(st) => write!(f, "{st:?}"),
1364 Test::Search(st) => write!(f, "{st:?}"),
1365 Test::PString(pt) => write!(f, "{pt:?}"),
1366 Test::Regex(rt) => write!(f, "{rt:?}"),
1367 Test::Indirect(fs) => write!(f, "indirect {fs:?}"),
1368 Test::String16(s16t) => write!(f, "{s16t:?}"),
1369 Test::Der => write!(f, "unimplemented der"),
1370 Test::Clear => write!(f, "clear"),
1371 Test::Default => write!(f, "default"),
1372 }
1373 }
1374}
1375
1376impl Test {
1377 #[inline]
1379 fn read_test_value<'haystack, D: DataRead>(
1380 &self,
1381 haystack: &'haystack mut D,
1382 switch_endianness: bool,
1383 ) -> Result<Option<ReadValue<'haystack>>, Error> {
1384 let test_value_offset = haystack.stream_position();
1385
1386 match self {
1387 Self::Scalar(t) => {
1388 t.ty.read(haystack, switch_endianness)
1389 .map(|s| Some(ReadValue::Scalar(test_value_offset, s)))
1390 }
1391
1392 Self::Float(t) => {
1393 t.ty.read(haystack, switch_endianness)
1394 .map(|f| Some(ReadValue::Float(test_value_offset, f)))
1395 }
1396 Self::String(t) => {
1397 match t.test_val.as_ref() {
1398 TestValue::Value(str) => {
1399 let buf = if let Some(length) = t.length {
1400 haystack.read_exact_count(length as u64)?
1402 } else {
1403 match t.cmp_op {
1406 CmpOp::Eq | CmpOp::Neq => {
1407 if !t.has_length_mod() {
1408 haystack.read_exact_count(str.len() as u64)?
1409 } else {
1410 haystack.read_count(FILE_BYTES_MAX as u64)?
1411 }
1412 }
1413 CmpOp::Lt | CmpOp::Gt => {
1414 let read =
1415 haystack.read_until_any_delim_or_limit(b"\n\0", 8092)?;
1416
1417 if read.ends_with(b"\0") || read.ends_with(b"\n") {
1418 &read[..read.len() - 1]
1419 } else {
1420 read
1421 }
1422 }
1423 _ => {
1424 return Err(Error::Msg(format!(
1425 "string test does not support {:?} operator",
1426 t.cmp_op
1427 )));
1428 }
1429 }
1430 };
1431
1432 Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1433 }
1434 TestValue::Any => {
1435 let read = haystack.read_until_any_delim_or_limit(b"\0\n", 8192)?;
1436 let bytes = if read.ends_with(b"\0") || read.ends_with(b"\n") {
1438 &read[..read.len() - 1]
1439 } else {
1440 read
1441 };
1442
1443 Ok(Some(ReadValue::Bytes(test_value_offset, bytes)))
1444 }
1445 }
1446 }
1447
1448 Self::String16(t) => {
1449 match t.test_val.as_ref() {
1450 TestValue::Value(str16) => {
1451 let read = haystack.read_exact_count((str16.len() * 2) as u64)?;
1452
1453 Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1454 }
1455 TestValue::Any => {
1456 let read = haystack.read_until_utf16_or_limit(b"\x00\x00", 8192)?;
1457
1458 let end = if read.len() % 2 == 0 {
1460 read.len()
1461 } else {
1462 read.len().saturating_sub(1)
1465 };
1466
1467 Ok(Some(ReadValue::Bytes(test_value_offset, &read[..end])))
1468 }
1469 }
1470 }
1471
1472 Self::PString(t) => {
1473 let Some(read) = t.read(haystack)? else {
1474 return Ok(None);
1475 };
1476 Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1477 }
1478
1479 Self::Search(_) => {
1480 let buf = haystack.read_count(FILE_BYTES_MAX as u64)?;
1481 Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
1482 }
1483
1484 Self::Regex(r) => {
1485 let length = {
1486 match r.length {
1487 Some(len) => {
1488 if r.mods.contains(ReMod::LineLimit) {
1489 len * 80
1490 } else {
1491 len
1492 }
1493 }
1494
1495 None => FILE_REGEX_MAX,
1496 }
1497 };
1498
1499 let read = haystack.read_count(length as u64)?;
1500 Ok(Some(ReadValue::Bytes(test_value_offset, read)))
1501 }
1502
1503 Self::Name(_)
1504 | Self::Use(_, _)
1505 | Self::Indirect(_)
1506 | Self::Clear
1507 | Self::Default
1508 | Self::Der => Err(Error::msg("no value to read for this test")),
1509 }
1510 }
1511
1512 #[inline(always)]
1513 fn match_value<'s>(
1514 &'s self,
1515 tv: &ReadValue<'s>,
1516 stream_kind: StreamKind,
1517 ) -> Option<MatchRes<'s>> {
1518 match (self, tv) {
1519 (Self::Scalar(t), ReadValue::Scalar(o, ts)) => {
1520 let read_value: Scalar = match t.transform.as_ref() {
1521 Some(t) => t.apply(*ts)?,
1522 None => *ts,
1523 };
1524
1525 match t.test_val {
1526 TestValue::Value(test_value) => {
1527 let ok = match t.cmp_op {
1528 CmpOp::Not => read_value == !test_value,
1531 CmpOp::Eq => read_value == test_value,
1532 CmpOp::Lt => read_value < test_value,
1533 CmpOp::Gt => read_value > test_value,
1534 CmpOp::Neq => read_value != test_value,
1535 CmpOp::BitAnd => read_value & test_value == test_value,
1536 CmpOp::Xor => (read_value & test_value).is_zero(),
1537 };
1538
1539 if ok {
1540 Some(MatchRes::Scalar(*o, read_value))
1541 } else {
1542 None
1543 }
1544 }
1545
1546 TestValue::Any => Some(MatchRes::Scalar(*o, read_value)),
1547 }
1548 }
1549
1550 (Self::Float(t), ReadValue::Float(o, f)) => {
1551 let read_value: Float = t.transform.as_ref().map(|t| t.apply(*f)).unwrap_or(*f);
1552
1553 match t.test_val {
1554 TestValue::Value(tf) => {
1555 let ok = match t.cmp_op {
1556 CmpOp::Eq => read_value == tf,
1557 CmpOp::Lt => read_value < tf,
1558 CmpOp::Gt => read_value > tf,
1559 CmpOp::Neq => read_value != tf,
1560 _ => {
1561 debug_panic!("unsupported float comparison");
1564 debug!("unsupported float comparison");
1565 false
1566 }
1567 };
1568
1569 if ok {
1570 Some(MatchRes::Float(*o, read_value))
1571 } else {
1572 None
1573 }
1574 }
1575 TestValue::Any => Some(MatchRes::Float(*o, read_value)),
1576 }
1577 }
1578
1579 (Self::String(st), ReadValue::Bytes(o, buf)) => {
1580 macro_rules! trim_buf {
1581 ($buf: expr) => {{
1582 if st.mods.contains(StringMod::Trim) {
1583 $buf.trim_ascii()
1584 } else {
1585 $buf
1586 }
1587 }};
1588 }
1589
1590 match st.test_val.as_ref() {
1591 TestValue::Value(str) => {
1592 match st.cmp_op {
1593 CmpOp::Eq => {
1594 if let (true, _) = string_match(str, st.mods, buf) {
1595 Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1596 } else {
1597 None
1598 }
1599 }
1600 CmpOp::Neq => {
1601 if let (false, _) = string_match(str, st.mods, buf) {
1602 Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
1603 } else {
1604 None
1605 }
1606 }
1607 CmpOp::Gt => {
1608 if buf.len() > str.len() {
1609 Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1610 } else {
1611 None
1612 }
1613 }
1614 CmpOp::Lt => {
1615 if buf.len() < str.len() {
1616 Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1617 } else {
1618 None
1619 }
1620 }
1621
1622 _ => {
1624 debug_panic!("unsupported string comparison");
1627 debug!("unsupported string comparison");
1628 None
1629 }
1630 }
1631 }
1632 TestValue::Any => {
1633 Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
1634 }
1635 }
1636 }
1637
1638 (Self::PString(m), ReadValue::Bytes(o, buf)) => match m.test_val.as_ref() {
1639 TestValue::Value(psv) => {
1640 if buf == psv {
1641 Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8))
1642 } else {
1643 None
1644 }
1645 }
1646 TestValue::Any => Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8)),
1647 },
1648
1649 (Self::String16(t), ReadValue::Bytes(o, buf)) => {
1650 match t.test_val.as_ref() {
1651 TestValue::Value(str16) => {
1652 if str16.len() * 2 != buf.len() {
1654 return None;
1655 }
1656
1657 for (i, utf16_char) in slice_to_utf16_iter(buf, t.encoding).enumerate() {
1659 if str16[i] != utf16_char {
1660 return None;
1661 }
1662 }
1663
1664 Some(MatchRes::Bytes(
1665 *o,
1666 None,
1667 t.orig.as_bytes(),
1668 Encoding::Utf16(t.encoding),
1669 ))
1670 }
1671
1672 TestValue::Any => {
1673 Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf16(t.encoding)))
1674 }
1675 }
1676 }
1677
1678 (Self::Regex(r), ReadValue::Bytes(o, buf)) => r.match_buf(*o, stream_kind, buf),
1679
1680 (Self::Search(t), ReadValue::Bytes(o, buf)) => t.match_buf(*o, buf),
1681
1682 _ => None,
1683 }
1684 }
1685
1686 #[inline(always)]
1687 fn strength(&self) -> u64 {
1688 const MULT: usize = 10;
1689
1690 let mut out = 2 * MULT;
1691
1692 match self {
1694 Test::Scalar(s) => {
1695 out += s.ty.type_size() * MULT;
1696 }
1697
1698 Test::Float(t) => {
1699 out += t.ty.type_size() * MULT;
1700 }
1701
1702 Test::String(t) => out += t.test_value_len().saturating_mul(MULT),
1703
1704 Test::PString(t) => out += t.test_value_len().saturating_mul(MULT),
1705
1706 Test::Search(s) => {
1707 let n_pos = s.n_pos.unwrap_or(FILE_BYTES_MAX);
1712
1713 match n_pos {
1714 0..=80 => out += s.str.len().saturating_mul(MULT),
1716 81..=240 => out += s.str.len() * s.str.len().clamp(0, MULT - 2),
1718 _ => out += s.str.len(),
1720 }
1721 }
1722
1723 Test::Regex(r) => {
1724 let v = r.non_magic_len / r.re.captures_len();
1733
1734 let len = r
1735 .length
1736 .map(|l| {
1737 if r.mods.contains(ReMod::LineLimit) {
1738 l * 80
1739 } else {
1740 l
1741 }
1742 })
1743 .unwrap_or(FILE_BYTES_MAX);
1744
1745 match len {
1746 0..=80 => out += v.saturating_mul(MULT),
1748 81..=240 => out += v * v.clamp(0, MULT - 2),
1750 _ => out += v,
1752 }
1753 }
1754
1755 Test::String16(t) => {
1756 out += t.test_value_len().saturating_mul(MULT);
1761 }
1762
1763 Test::Der => out += MULT,
1764
1765 Test::Default | Test::Name(_) | Test::Use(_, _) | Test::Indirect(_) | Test::Clear => {
1766 return 0;
1767 }
1768 }
1769
1770 if self.is_match_any() {
1772 return 0;
1773 }
1774
1775 if let Some(op) = self.cmp_op() {
1776 match op {
1777 CmpOp::Neq => out = 0,
1779 CmpOp::Eq | CmpOp::Not => out += MULT,
1780 CmpOp::Lt | CmpOp::Gt => out -= 2 * MULT,
1781 CmpOp::Xor | CmpOp::BitAnd => out -= MULT,
1782 }
1783 }
1784
1785 out as u64
1786 }
1787
1788 #[inline(always)]
1789 fn cmp_op(&self) -> Option<CmpOp> {
1790 match self {
1791 Self::String(t) => Some(t.cmp_op),
1792 Self::Scalar(s) => Some(s.cmp_op),
1793 Self::Float(t) => Some(t.cmp_op),
1794 Self::Name(_)
1795 | Self::Use(_, _)
1796 | Self::Search(_)
1797 | Self::PString(_)
1798 | Self::Regex(_)
1799 | Self::Clear
1800 | Self::Default
1801 | Self::Indirect(_)
1802 | Self::String16(_)
1803 | Self::Der => None,
1804 }
1805 }
1806
1807 #[inline(always)]
1808 fn is_recursive(&self) -> bool {
1809 matches!(self, Test::Use(_, _) | Test::Indirect(_))
1810 }
1811
1812 #[inline(always)]
1813 fn is_match_any(&self) -> bool {
1814 match self {
1815 Test::Name(_) => false,
1816 Test::Use(_, _) => false,
1817 Test::Scalar(scalar_test) => matches!(scalar_test.test_val, TestValue::Any),
1818 Test::Float(float_test) => matches!(float_test.test_val, TestValue::Any),
1819 Test::String(string_test) => matches!(string_test.test_val, TestValue::Any),
1820 Test::Search(_) => false,
1821 Test::PString(pstring_test) => matches!(pstring_test.test_val, TestValue::Any),
1822 Test::Regex(_) => false,
1823 Test::Indirect(_) => false,
1824 Test::String16(string16_test) => matches!(string16_test.test_val, TestValue::Any),
1825 Test::Der => false,
1826 Test::Clear => false,
1827 Test::Default => false,
1828 }
1829 }
1830
1831 #[inline(always)]
1832 fn is_binary(&self) -> bool {
1833 match self {
1834 Self::Name(_) => true,
1835 Self::Use(_, _) => true,
1836 Self::Scalar(_) => true,
1837 Self::Float(_) => true,
1838 Self::String(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1839 Self::Search(t) => t.is_binary(),
1840 Self::PString(_) => true,
1841 Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1842 Self::Clear => true,
1843 Self::Default => true,
1844 Self::Indirect(_) => true,
1845 Self::String16(_) => true,
1846 Self::Der => true,
1847 }
1848 }
1849
1850 #[inline(always)]
1851 fn is_text(&self) -> bool {
1852 match self {
1853 Self::Name(_) => true,
1854 Self::Use(_, _) => true,
1855 Self::Indirect(_) => true,
1856 Self::Clear => true,
1857 Self::Default => true,
1858 Self::String(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1859 Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1860 _ => !self.is_binary(),
1861 }
1862 }
1863
1864 #[inline(always)]
1865 fn is_only_text(&self) -> bool {
1866 self.is_text() && !self.is_binary()
1867 }
1868
1869 #[inline(always)]
1870 fn is_only_binary(&self) -> bool {
1871 self.is_binary() && !self.is_text()
1872 }
1873}
1874
/// Type of the value read from the stream to resolve an indirect offset.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum OffsetType {
    /// One byte
    Byte,
    /// Little-endian 64-bit float, converted to an integer
    DoubleLe,
    /// Big-endian 64-bit float, converted to an integer
    DoubleBe,
    /// Little-endian 16-bit integer
    ShortLe,
    /// Big-endian 16-bit integer
    ShortBe,
    /// Little-endian 32-bit value decoded with `decode_id3`
    Id3Le,
    /// Big-endian 32-bit value decoded with `decode_id3`
    Id3Be,
    /// Little-endian 32-bit integer
    LongLe,
    /// Big-endian 32-bit integer
    LongBe,
    /// Value read with the `read_me!` macro (presumably middle-endian)
    Middle,
    /// Value parsed with `read_octal_u64`
    Octal,
    /// Big-endian 64-bit integer
    QuadBe,
    /// Little-endian 64-bit integer
    QuadLe,
}
1891
/// Operand combined with the value read while resolving an indirect offset.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Shift {
    /// The operand is the value itself
    Direct(u64),
    /// The operand is read from the stream, at the address of the offset
    /// shifted by this amount
    Indirect(i64),
}
1897
1898#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
1899struct IndOffset {
1900 off_addr: DirOffset,
1902 signed: bool,
1904 ty: OffsetType,
1906 op: Option<Op>,
1907 shift: Option<Shift>,
1908}
1909
1910impl IndOffset {
1911 fn read_offset<D: DataRead>(
1913 &self,
1914 haystack: &mut D,
1915 rule_base_offset: Option<u64>,
1916 last_upper_match_offset: Option<u64>,
1917 ) -> Result<Option<u64>, io::Error> {
1918 let offset_address = match self.off_addr {
1919 DirOffset::Start(s) => {
1920 let Some(o) = s.checked_add(rule_base_offset.unwrap_or_default()) else {
1921 return Ok(None);
1922 };
1923
1924 haystack.seek(SeekFrom::Start(o))?
1925 }
1926 DirOffset::LastUpper(c) => haystack.seek(SeekFrom::Start(
1927 (last_upper_match_offset.unwrap_or_default() as i64 + c) as u64,
1928 ))?,
1929 DirOffset::End(e) => haystack.seek(SeekFrom::End(e))?,
1930 };
1931
1932 macro_rules! read_value {
1933 () => {
1934 match self.ty {
1935 OffsetType::Byte => {
1936 if self.signed {
1937 read_le!(haystack, u8) as u64
1938 } else {
1939 read_le!(haystack, i8) as u64
1940 }
1941 }
1942 OffsetType::DoubleLe => read_le!(haystack, f64) as u64,
1943 OffsetType::DoubleBe => read_be!(haystack, f64) as u64,
1944 OffsetType::ShortLe => {
1945 if self.signed {
1946 read_le!(haystack, i16) as u64
1947 } else {
1948 read_le!(haystack, u16) as u64
1949 }
1950 }
1951 OffsetType::ShortBe => {
1952 if self.signed {
1953 read_be!(haystack, i16) as u64
1954 } else {
1955 read_be!(haystack, u16) as u64
1956 }
1957 }
1958 OffsetType::Id3Le => decode_id3(read_le!(haystack, u32)) as u64,
1959 OffsetType::Id3Be => decode_id3(read_be!(haystack, u32)) as u64,
1960 OffsetType::LongLe => {
1961 if self.signed {
1962 read_le!(haystack, i32) as u64
1963 } else {
1964 read_le!(haystack, u32) as u64
1965 }
1966 }
1967 OffsetType::LongBe => {
1968 if self.signed {
1969 read_be!(haystack, i32) as u64
1970 } else {
1971 read_be!(haystack, u32) as u64
1972 }
1973 }
1974 OffsetType::Middle => read_me!(haystack) as u64,
1975 OffsetType::Octal => {
1976 if let Some(o) = read_octal_u64(haystack) {
1977 o
1978 } else {
1979 debug!("failed to read octal offset @ {offset_address}");
1980 return Ok(None);
1981 }
1982 }
1983 OffsetType::QuadLe => {
1984 if self.signed {
1985 read_le!(haystack, i64) as u64
1986 } else {
1987 read_le!(haystack, u64)
1988 }
1989 }
1990 OffsetType::QuadBe => {
1991 if self.signed {
1992 read_be!(haystack, i64) as u64
1993 } else {
1994 read_be!(haystack, u64)
1995 }
1996 }
1997 }
1998 };
1999 }
2000
2001 let o = read_value!();
2003
2004 trace!(
2005 "offset read @ {offset_address} value={o} op={:?} shift={:?}",
2006 self.op, self.shift
2007 );
2008
2009 if let (Some(op), Some(shift)) = (self.op, self.shift) {
2011 let shift = match shift {
2012 Shift::Direct(i) => i,
2013 Shift::Indirect(i) => {
2014 let tmp = offset_address as i128 + i as i128;
2015 if tmp.is_negative() {
2016 return Ok(None);
2017 } else {
2018 haystack.seek(SeekFrom::Start(tmp as u64))?;
2019 };
2020 read_value!()
2023 }
2024 };
2025
2026 match op {
2027 Op::Add => return Ok(o.checked_add(shift)),
2028 Op::Mul => return Ok(o.checked_mul(shift)),
2029 Op::Sub => return Ok(o.checked_sub(shift)),
2030 Op::Div => return Ok(o.checked_div(shift)),
2031 Op::Mod => return Ok(o.checked_rem(shift)),
2032 Op::And => return Ok(Some(o & shift)),
2033 Op::Or => return Ok(Some(o | shift)),
2034 Op::Xor => return Ok(Some(o ^ shift)),
2035 }
2036 }
2037
2038 Ok(Some(o))
2039 }
2040}
2041
/// Offset which does not require reading the stream to be resolved.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum DirOffset {
    /// Offset relative to the start
    Start(u64),
    /// Offset relative to the offset of the last upper level match
    LastUpper(i64),
    /// Offset relative to the end of the stream
    End(i64),
}
2049
2050#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
2051enum Offset {
2052 Direct(DirOffset),
2053 Indirect(IndOffset),
2054}
2055
2056impl Offset {
2057 #[inline(always)]
2058 fn is_indirect(&self) -> bool {
2059 matches!(self, Self::Indirect(_))
2060 }
2061}
2062
2063impl From<DirOffset> for Offset {
2064 fn from(value: DirOffset) -> Self {
2065 Self::Direct(value)
2066 }
2067}
2068
2069impl From<IndOffset> for Offset {
2070 fn from(value: IndOffset) -> Self {
2071 Self::Indirect(value)
2072 }
2073}
2074
2075impl Display for DirOffset {
2076 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2077 match self {
2078 DirOffset::Start(i) => write!(f, "{i}"),
2079 DirOffset::LastUpper(c) => write!(f, "&{c}"),
2080 DirOffset::End(e) => write!(f, "-{e}"),
2081 }
2082 }
2083}
2084
2085impl Default for DirOffset {
2086 fn default() -> Self {
2087 Self::LastUpper(0)
2088 }
2089}
2090
/// A test to run at a given offset, with the message attached to it.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Match {
    /// Line number reported in logs and localized errors
    line: usize,
    /// Depth of the entry, also used as its continuation level
    depth: u8,
    /// Offset at which the test has to be run
    offset: Offset,
    /// Test to run
    test: Test,
    /// Strength of the test, as computed by `Test::strength`
    test_strength: u64,
    /// Optional message attached to the entry
    message: Option<Message>,
}
2100
2101impl From<Use> for Match {
2102 fn from(value: Use) -> Self {
2103 let test = Test::Use(value.switch_endianness, value.rule_name);
2104 let test_strength = test.strength();
2105 Self {
2106 line: value.line,
2107 depth: value.depth,
2108 offset: value.start_offset,
2109 test,
2110 test_strength,
2111 message: value.message,
2112 }
2113 }
2114}
2115
2116impl From<Name> for Match {
2117 fn from(value: Name) -> Self {
2118 let test = Test::Name(value.name);
2119 let test_strength = test.strength();
2120 Self {
2121 line: value.line,
2122 depth: 0,
2123 offset: Offset::Direct(DirOffset::Start(0)),
2124 test,
2125 test_strength,
2126 message: value.message,
2127 }
2128 }
2129}
2130
2131impl Match {
2132 #[inline(always)]
2134 fn offset_from_start<D: DataRead>(
2135 &self,
2136 haystack: &mut D,
2137 rule_base_offset: Option<u64>,
2138 last_level_offset: Option<u64>,
2139 ) -> Result<Option<u64>, io::Error> {
2140 match self.offset {
2141 Offset::Direct(dir_offset) => match dir_offset {
2142 DirOffset::Start(s) => Ok(Some(s)),
2143 DirOffset::LastUpper(shift) => {
2144 let o = last_level_offset.unwrap_or_default() as i64 + shift;
2145
2146 if o >= 0 { Ok(Some(o as u64)) } else { Ok(None) }
2147 }
2148 DirOffset::End(e) => Ok(Some(haystack.offset_from_start(SeekFrom::End(e)))),
2149 },
2150 Offset::Indirect(ind_offset) => {
2151 let Some(o) =
2152 ind_offset.read_offset(haystack, rule_base_offset, last_level_offset)?
2153 else {
2154 return Ok(None);
2155 };
2156
2157 Ok(Some(o))
2158 }
2159 }
2160 }
2161
2162 #[inline]
2175 #[allow(clippy::too_many_arguments)]
2176 fn matches<'a: 'h, 'h, D: DataRead>(
2177 &'a self,
2178 source: Option<&str>,
2179 magic: &mut Magic<'a>,
2180 stream_kind: StreamKind,
2181 state: &mut MatchState,
2182 buf_base_offset: Option<u64>,
2183 rule_base_offset: Option<u64>,
2184 last_level_offset: Option<u64>,
2185 haystack: &'h mut D,
2186 switch_endianness: bool,
2187 db: &'a MagicDb,
2188 depth: usize,
2189 ) -> Result<(bool, Option<MatchRes<'h>>), Error> {
2190 let source = source.unwrap_or("unknown");
2191 let line = self.line;
2192
2193 if depth >= MAX_RECURSION {
2194 return Err(Error::localized(
2195 source,
2196 line,
2197 Error::MaximumRecursion(MAX_RECURSION),
2198 ));
2199 }
2200
2201 if self.test.is_only_binary() && stream_kind.is_text() {
2202 trace!("skip binary test source={source} line={line} stream_kind={stream_kind:?}",);
2203 return Ok((false, None));
2204 }
2205
2206 if self.test.is_only_text() && !stream_kind.is_text() {
2207 trace!("skip text test source={source} line={line} stream_kind={stream_kind:?}",);
2208 return Ok((false, None));
2209 }
2210
2211 let Ok(Some(mut offset)) = self
2212 .offset_from_start(haystack, rule_base_offset, last_level_offset)
2213 .inspect_err(|e| debug!("source={source} line={line} failed at computing offset: {e}"))
2214 else {
2215 return Ok((false, None));
2216 };
2217
2218 offset = match self.offset {
2219 Offset::Indirect(_) => {
2220 buf_base_offset.unwrap_or_default().saturating_add(offset)
2225 }
2226 Offset::Direct(DirOffset::Start(_)) => {
2228 rule_base_offset.unwrap_or_default().saturating_add(offset)
2229 }
2230 _ => offset,
2231 };
2232
2233 match &self.test {
2234 Test::Clear => {
2235 trace!("source={source} line={line} clear");
2236 state.clear_continuation_level(&self.continuation_level());
2237 Ok((true, None))
2238 }
2239
2240 Test::Name(name) => {
2241 trace!(
2242 "source={source} line={line} running rule {name} switch_endianness={switch_endianness}",
2243 );
2244 Ok((true, None))
2245 }
2246
2247 Test::Use(flip_endianness, rule_name) => {
2248 trace!(
2249 "source={source} line={line} use {rule_name} switch_endianness={flip_endianness}",
2250 );
2251
2252 let switch_endianness = switch_endianness ^ flip_endianness;
2254
2255 let dr: &DependencyRule = db.dependencies.get(rule_name).ok_or(
2256 Error::localized(source, line, Error::MissingRule(rule_name.clone())),
2257 )?;
2258
2259 if let Some(msg) = self.message.as_ref() {
2261 magic.push_message(msg.to_string_lossy());
2262 }
2263
2264 let new_buf_base_off = if self.offset.is_indirect() {
2265 Some(offset)
2266 } else {
2267 None
2268 };
2269
2270 let nmatch = dr.rule.magic(
2271 magic,
2272 stream_kind,
2273 new_buf_base_off,
2274 Some(offset),
2275 haystack,
2276 db,
2277 switch_endianness,
2278 depth.saturating_add(1),
2279 )?;
2280
2281 let matched = nmatch > 0;
2284 if matched {
2285 state.set_continuation_level(self.continuation_level());
2286 }
2287
2288 Ok((matched, None))
2289 }
2290
2291 Test::Indirect(m) => {
2292 trace!(
2293 "source={source} line={line} indirect mods={:?} offset={offset:#x}",
2294 m
2295 );
2296
2297 let new_buf_base_off = if m.contains(IndirectMod::Relative) {
2298 Some(offset)
2299 } else {
2300 None
2301 };
2302
2303 if let Some(msg) = self.message.as_ref() {
2305 magic.push_message(msg.to_string_lossy());
2306 }
2307
2308 let mut nmatch = 0u64;
2309 for r in db.rules.iter() {
2310 nmatch = nmatch.saturating_add(r.magic(
2311 magic,
2312 stream_kind,
2313 new_buf_base_off,
2314 Some(offset),
2315 haystack,
2316 db,
2317 false,
2318 depth.saturating_add(1),
2319 )?);
2320
2321 if nmatch > 0 {
2322 break;
2323 }
2324 }
2325
2326 Ok((nmatch > 0, None))
2327 }
2328
2329 Test::Default => {
2330 let ok = !state.get_continuation_level(&self.continuation_level());
2332
2333 trace!("source={source} line={line} default match={ok}");
2334 if ok {
2335 state.set_continuation_level(self.continuation_level());
2336 }
2337
2338 Ok((ok, None))
2339 }
2340
2341 _ => {
2342 if let Err(e) = haystack.seek(SeekFrom::Start(offset)) {
2343 debug!("source={source} line={line} failed to seek in haystack: {e}");
2344 return Ok((false, None));
2345 }
2346
2347 let mut trace_msg = None;
2348
2349 if enabled!(Level::DEBUG) {
2350 trace_msg = Some(vec![format!(
2351 "source={source} line={line} depth={} stream_offset={:#x}",
2352 self.depth,
2353 haystack.stream_position()
2354 )])
2355 }
2356
2357 if let Ok(opt_test_value) = self
2361 .test
2362 .read_test_value(haystack, switch_endianness)
2363 .inspect_err(|e| {
2364 debug!("source={source} line={line} error while reading test value @{offset}: {e}",)
2365 })
2366 {
2367 if let Some(v) = trace_msg
2368 .as_mut() { v.push(format!("test={}", self.test)) }
2369
2370 if let Some(v) = trace_msg.as_mut(){
2371 let drv = match opt_test_value.as_ref(){
2372 Some(r) => format!("{r:?}"),
2373 None =>String::new(),
2374 };
2375 v.push(format!("read_in_stream={drv}"))
2376 }
2377
2378 let match_res =
2379 opt_test_value.and_then(|tv| self.test.match_value(&tv, stream_kind));
2380
2381 if let Some(v) = trace_msg.as_mut() { v.push(format!(
2382 "message=\"{}\" match={}",
2383 self.message
2384 .as_ref()
2385 .map(|fs| fs.to_string_lossy())
2386 .unwrap_or_default(),
2387 match_res.is_some()
2388 )) }
2389
2390 if enabled!(Level::DEBUG) && !enabled!(Level::TRACE) && match_res.is_some() {
2392 if let Some(m) = trace_msg{
2393 debug!("{}", m.join(" "));
2394 }
2395 } else if enabled!(Level::TRACE)
2396 && let Some(m) = trace_msg{
2397 trace!("{}", m.join(" "));
2398 }
2399
2400 if let Some(mr) = match_res {
2401 state.set_continuation_level(self.continuation_level());
2402 return Ok((true, Some(mr)));
2403 }
2404 }
2405
2406 Ok((false, None))
2407 }
2408 }
2409 }
2410
2411 #[inline(always)]
2412 fn continuation_level(&self) -> ContinuationLevel {
2413 ContinuationLevel(self.depth)
2414 }
2415}
2416
/// Parsed `use` entry, converted into a [`Match`] running a `Test::Use`.
#[derive(Debug, Clone)]
struct Use {
    /// Line number of the entry
    line: usize,
    /// Depth of the entry
    depth: u8,
    /// Offset passed to the rule being used
    start_offset: Offset,
    /// Name of the rule to run
    rule_name: String,
    /// Whether endianness must be flipped while running the rule
    switch_endianness: bool,
    /// Optional message attached to the entry
    message: Option<Message>,
}
2426
2427#[derive(Debug, Clone, Serialize, Deserialize)]
2428struct StrengthMod {
2429 op: Op,
2430 by: u8,
2431}
2432
2433impl StrengthMod {
2434 #[inline(always)]
2435 fn apply(&self, strength: u64) -> u64 {
2436 let by = self.by as u64;
2437 debug!("applying strength modifier: {strength} {} {}", self.op, by);
2438 match self.op {
2439 Op::Mul => strength.saturating_mul(by),
2440 Op::Add => strength.saturating_add(by),
2441 Op::Sub => strength.saturating_sub(by),
2442 Op::Div => {
2443 if by > 0 {
2444 strength.saturating_div(by)
2445 } else {
2446 strength
2447 }
2448 }
2449 Op::Mod => strength % by,
2450 Op::And => strength & by,
2451 Op::Xor | Op::Or => {
2454 debug_panic!("unsupported strength operator");
2455 strength
2456 }
2457 }
2458 }
2459}
2460
/// Metadata entry which can be attached to a match.
// NOTE(review): presumably each variant feeds the `EntryNode` field of the
// same nature (mimetype, exts, strength_mod, apple); confirm in the parser
#[derive(Debug, Clone)]
enum Flag {
    /// MIME type
    Mime(String),
    /// Set of file extensions
    Ext(HashSet<String>),
    /// Strength modifier
    Strength(StrengthMod),
    /// Apple specific string (presumably the creator/type code)
    Apple(String),
}
2468
/// Parsed `name` entry, converted into a [`Match`] running a `Test::Name`.
#[derive(Debug, Clone)]
struct Name {
    /// Line number of the entry
    line: usize,
    /// Name of the rule
    name: String,
    /// Optional message attached to the entry
    message: Option<Message>,
}
2475
/// Entry of a magic rule together with the parser span it comes from.
// NOTE(review): presumably produced by the pest parser; confirm in `parser`
#[derive(Debug, Clone)]
enum Entry<'span> {
    /// A test entry
    Match(Span<'span>, Match),
    /// A metadata entry
    Flag(Span<'span>, Flag),
}
2481
/// Node of the tree of entries making a rule.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EntryNode {
    /// Whether the node is the root of the tree
    root: bool,
    /// Entry to run for this node
    entry: Match,
    /// Nodes run only when `entry` matches
    children: Vec<EntryNode>,
    /// MIME type set on the result when `entry` matches
    mimetype: Option<String>,
    /// Creator code set on the result when `entry` matches
    apple: Option<String>,
    /// Modifier applied to the strength of `entry`
    strength_mod: Option<StrengthMod>,
    /// Extensions added to the result when `entry` matches
    exts: HashSet<String>,
}
2492
/// Accumulator filled while walking a tree of entries.
#[derive(Debug, Default)]
struct EntryNodeVisitor {
    /// Extensions collected so far
    exts: HashSet<String>,
    /// Score accumulated so far
    score: u64,
}

impl EntryNodeVisitor {
    /// Creates an empty visitor.
    fn new() -> Self {
        Self::default()
    }

    /// Merges the extensions and the score of `other` into `self`.
    fn merge(&mut self, other: Self) {
        let Self { exts, score } = other;
        self.score += score;
        self.exts.extend(exts);
    }
}
2511
2512impl EntryNode {
2513 #[inline]
2514 fn update_visitor(&self, v: &mut EntryNodeVisitor, depth: usize) {
2515 for ext in self.exts.iter() {
2517 if !v.exts.contains(ext) {
2518 v.exts.insert(ext.clone());
2519 }
2520 }
2521
2522 if depth == 0 {
2524 v.score += self.entry.test_strength;
2525 }
2526
2527 v.score += self
2531 .children
2532 .iter()
2533 .map(|e| e.entry.test_strength)
2534 .min()
2535 .unwrap_or_default()
2536 / max(1, depth as u64);
2537 }
2538
2539 fn visit(
2540 &self,
2541 v: &mut EntryNodeVisitor,
2542 deps: &HashMap<String, DependencyRule>,
2543 marked: &mut HashSet<String>,
2544 depth: usize,
2545 ) -> Result<(), Error> {
2546 self.update_visitor(v, depth);
2548
2549 for c in self.children.iter() {
2551 if let Test::Use(_, ref name) = c.entry.test {
2552 if marked.contains(name) {
2553 continue;
2554 }
2555
2556 marked.insert(name.clone());
2557
2558 if let Some(r) = deps.get(name) {
2559 let dv = r.rule.visit_all_entries(deps, marked)?;
2560 v.merge(dv);
2561 } else {
2562 return Err(Error::MissingRule(name.clone()));
2563 }
2564 } else {
2565 c.visit(v, deps, marked, depth + 1)?;
2566 }
2567 }
2568
2569 Ok(())
2570 }
2571
2572 #[inline]
2575 #[allow(clippy::too_many_arguments)]
2576 fn matches<'r, D: DataRead>(
2577 &'r self,
2578 opt_source: Option<&str>,
2579 magic: &mut Magic<'r>,
2580 state: &mut MatchState,
2581 stream_kind: StreamKind,
2582 buf_base_offset: Option<u64>,
2583 rule_base_offset: Option<u64>,
2584 last_level_offset: Option<u64>,
2585 haystack: &mut D,
2586 db: &'r MagicDb,
2587 switch_endianness: bool,
2588 depth: usize,
2589 ) -> Result<u64, Error> {
2590 let mut nmatch = 0u64;
2591
2592 let (ok, opt_match_res) = self.entry.matches(
2593 opt_source,
2594 magic,
2595 stream_kind,
2596 state,
2597 buf_base_offset,
2598 rule_base_offset,
2599 last_level_offset,
2600 haystack,
2601 switch_endianness,
2602 db,
2603 depth,
2604 )?;
2605
2606 let source = opt_source.unwrap_or("unknown");
2607 let line = self.entry.line;
2608
2609 if ok {
2610 if !self.entry.test.is_recursive()
2614 && let Some(msg) = self.entry.message.as_ref()
2615 && let Ok(msg) = msg.format_with(opt_match_res.as_ref()).inspect_err(|e| {
2616 debug!("source={source} line={line} failed to format message: {e}")
2617 })
2618 {
2619 nmatch = nmatch.saturating_add(1);
2620 magic.push_message(msg);
2621 }
2622
2623 if let Some(mr) = opt_match_res {
2625 match &self.entry.test {
2626 Test::String(t) if t.has_length_mod() => {
2627 let o = mr.end_offset();
2628 haystack.seek(SeekFrom::Start(o))?;
2629 }
2630 Test::Search(t) => {
2631 if t.re_mods.contains(ReMod::StartOffsetUpdate) {
2632 let o = mr.start_offset();
2633 haystack.seek(SeekFrom::Start(o))?;
2634 } else {
2635 let o = mr.end_offset();
2636 haystack.seek(SeekFrom::Start(o))?;
2637 }
2638 }
2639
2640 Test::Regex(t) => {
2641 if t.mods.contains(ReMod::StartOffsetUpdate) {
2642 let o = mr.start_offset();
2643 haystack.seek(SeekFrom::Start(o))?;
2644 } else {
2645 let o = mr.end_offset();
2646 haystack.seek(SeekFrom::Start(o))?;
2647 }
2648 }
2649 _ => {}
2651 }
2652 }
2653
2654 if let Some(mimetype) = self.mimetype.as_ref() {
2655 magic.set_mime_type(Cow::Borrowed(mimetype));
2656 }
2657
2658 if let Some(apple_ty) = self.apple.as_ref() {
2659 magic.set_creator_code(Cow::Borrowed(apple_ty));
2660 }
2661
2662 if !self.exts.is_empty() {
2663 magic.insert_extensions(self.exts.iter().map(|s| s.as_str()));
2664 }
2665
2666 let mut strength = self.entry.test_strength;
2670
2671 let continuation_level = self.entry.continuation_level().0 as u64;
2672 if self.entry.message.is_none() && continuation_level < 3 {
2673 strength = strength.saturating_add(continuation_level);
2674 }
2675
2676 if let Some(sm) = self.strength_mod.as_ref() {
2677 strength = sm.apply(strength);
2678 }
2679
2680 if self.entry.message.is_none() {
2682 strength += 1
2683 }
2684
2685 magic.update_strength(strength);
2686
2687 let end_upper_level = haystack.stream_position();
2688
2689 let rule_base_offset = if self.root {
2697 match self.entry.offset {
2698 Offset::Direct(DirOffset::End(o)) => {
2699 Some(haystack.offset_from_start(SeekFrom::End(o)))
2700 }
2701 _ => rule_base_offset,
2702 }
2703 } else {
2704 rule_base_offset
2705 };
2706
2707 for e in self.children.iter() {
2708 nmatch = nmatch.saturating_add(e.matches(
2709 opt_source,
2710 magic,
2711 state,
2712 stream_kind,
2713 buf_base_offset,
2714 rule_base_offset,
2715 Some(end_upper_level),
2716 haystack,
2717 db,
2718 switch_endianness,
2719 depth,
2720 )?);
2721 }
2722 }
2723
2724 Ok(nmatch)
2725 }
2726}
2727
2728#[derive(Debug, Clone, Serialize, Deserialize)]
2730pub struct MagicRule {
2731 id: usize,
2732 source: Option<String>,
2733 entries: EntryNode,
2734 extensions: HashSet<String>,
2735 score: u64,
2737 finalized: bool,
2738}
2739
2740impl MagicRule {
2741 #[inline(always)]
2742 fn set_id(&mut self, id: usize) {
2743 self.id = id
2744 }
2745
2746 fn visit_all_entries(
2747 &self,
2748 deps: &HashMap<String, DependencyRule>,
2749 marked: &mut HashSet<String>,
2750 ) -> Result<EntryNodeVisitor, Error> {
2751 let mut v = EntryNodeVisitor::new();
2752 self.entries.visit(&mut v, deps, marked, 0)?;
2753 Ok(v)
2754 }
2755
2756 fn try_finalize(&mut self, deps: &HashMap<String, DependencyRule>) -> Result<(), Error> {
2759 if self.finalized {
2760 return Ok(());
2761 }
2762
2763 let v = self.visit_all_entries(deps, &mut HashSet::new())?;
2765
2766 self.extensions.extend(v.exts);
2767 self.score = v.score;
2768 self.finalized = true;
2769
2770 Ok(())
2771 }
2772
2773 #[inline]
2774 fn magic_entrypoint<'r, D: DataRead>(
2775 &'r self,
2776 magic: &mut Magic<'r>,
2777 stream_kind: StreamKind,
2778 haystack: &mut D,
2779 db: &'r MagicDb,
2780 switch_endianness: bool,
2781 depth: usize,
2782 ) -> Result<u64, Error> {
2783 self.entries.matches(
2784 self.source.as_deref(),
2785 magic,
2786 &mut MatchState::empty(),
2787 stream_kind,
2788 None,
2789 None,
2790 None,
2791 haystack,
2792 db,
2793 switch_endianness,
2794 depth,
2795 )
2796 }
2797
2798 #[inline]
2801 #[allow(clippy::too_many_arguments)]
2802 fn magic<'r, D: DataRead>(
2803 &'r self,
2804 magic: &mut Magic<'r>,
2805 stream_kind: StreamKind,
2806 buf_base_offset: Option<u64>,
2807 rule_base_offset: Option<u64>,
2808 haystack: &mut D,
2809 db: &'r MagicDb,
2810 switch_endianness: bool,
2811 depth: usize,
2812 ) -> Result<u64, Error> {
2813 self.entries.matches(
2814 self.source.as_deref(),
2815 magic,
2816 &mut MatchState::empty(),
2817 stream_kind,
2818 buf_base_offset,
2819 rule_base_offset,
2820 None,
2821 haystack,
2822 db,
2823 switch_endianness,
2824 depth,
2825 )
2826 }
2827
2828 pub fn is_text(&self) -> bool {
2834 self.entries.entry.test.is_text()
2835 && self.entries.children.iter().all(|e| e.entry.test.is_text())
2836 }
2837
2838 #[inline(always)]
2844 pub fn score(&self) -> u64 {
2845 self.score
2846 }
2847
2848 #[inline(always)]
2854 pub fn source(&self) -> Option<&str> {
2855 self.source.as_deref()
2856 }
2857
2858 #[inline(always)]
2864 pub fn line(&self) -> usize {
2865 self.entries.entry.line
2866 }
2867
2868 #[inline(always)]
2874 pub fn extensions(&self) -> &HashSet<String> {
2875 &self.extensions
2876 }
2877}
2878
/// Named rule which can be run by other rules through a `use` test.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct DependencyRule {
    /// Name of the rule
    name: String,
    /// The rule itself
    rule: MagicRule,
}
2884
/// Rules and named dependency rules parsed from a magic file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicSource {
    /// Rules of the source
    rules: Vec<MagicRule>,
    /// Rules which can be used by name, indexed by their name
    dependencies: HashMap<String, DependencyRule>,
}

impl MagicSource {
    /// Parses the magic file located at path `p`.
    ///
    /// # Errors
    ///
    /// Returns the error reported by the parser, if any.
    pub fn open<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
        FileMagicParser::parse_file(p)
    }
}
2910
/// Continuation level of an entry, which is the depth of the entry.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct ContinuationLevel(u8);
2913
/// Encoding of a text stream.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum TextEncoding {
    /// ASCII text
    Ascii,
    /// UTF-8 text
    Utf8,
    /// Encoding not identified
    Unknown,
}

impl TextEncoding {
    /// Returns the name of the encoding.
    const fn as_magic_str(&self) -> &'static str {
        match *self {
            Self::Ascii => "ASCII",
            Self::Utf8 => "UTF-8",
            Self::Unknown => "Unknown",
        }
    }
}
2931
2932#[derive(Debug, PartialEq, Eq, Clone, Copy)]
2933enum StreamKind {
2934 Binary,
2935 Text(TextEncoding),
2936}
2937
2938impl StreamKind {
2939 const fn is_text(&self) -> bool {
2940 matches!(self, StreamKind::Text(_))
2941 }
2942}
2943
2944#[derive(Debug)]
2945struct MatchState {
2946 continuation_levels: [bool; 256],
2947}
2948
2949impl MatchState {
2950 #[inline(always)]
2951 fn empty() -> Self {
2952 MatchState {
2953 continuation_levels: [false; 256],
2954 }
2955 }
2956
2957 #[inline(always)]
2958 fn get_continuation_level(&mut self, level: &ContinuationLevel) -> bool {
2959 self.continuation_levels
2960 .get(level.0 as usize)
2961 .cloned()
2962 .unwrap_or_default()
2963 }
2964
2965 #[inline(always)]
2966 fn set_continuation_level(&mut self, level: ContinuationLevel) {
2967 if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2968 *b = true
2969 }
2970 }
2971
2972 #[inline(always)]
2973 fn clear_continuation_level(&mut self, level: &ContinuationLevel) {
2974 if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2975 *b = false;
2976 }
2977 }
2978}
2979
/// Result of the identification of some content.
#[derive(Debug, Default)]
pub struct Magic<'m> {
    /// Kind of stream the content has been identified as
    stream_kind: Option<StreamKind>,
    /// Source of the rule which produced the result
    source: Option<Cow<'m, str>>,
    /// Parts of the message, in the order they have been pushed
    message: Vec<Cow<'m, str>>,
    /// MIME type, if one has been set
    mime_type: Option<Cow<'m, str>>,
    /// Creator code, if one has been set
    creator_code: Option<Cow<'m, str>>,
    /// Accumulated strength
    strength: u64,
    /// Extensions collected
    exts: HashSet<Cow<'m, str>>,
    /// NOTE(review): presumably set when the result comes from a default
    /// (fallback) identification; confirm where the flag is assigned
    is_default: bool,
}
2992
2993impl<'m> Magic<'m> {
2994 #[inline(always)]
2995 fn set_source(&mut self, source: Option<&'m str>) {
2996 self.source = source.map(Cow::Borrowed);
2997 }
2998
2999 #[inline(always)]
3000 fn set_stream_kind(&mut self, stream_kind: StreamKind) {
3001 self.stream_kind = Some(stream_kind)
3002 }
3003
3004 #[inline(always)]
3005 fn reset(&mut self) {
3006 self.stream_kind = None;
3007 self.source = None;
3008 self.message.clear();
3009 self.mime_type = None;
3010 self.creator_code = None;
3011 self.strength = 0;
3012 self.exts.clear();
3013 self.is_default = false;
3014 }
3015
3016 #[inline]
3024 pub fn into_owned<'owned>(self) -> Magic<'owned> {
3025 Magic {
3026 stream_kind: self.stream_kind,
3027 source: self.source.map(|s| Cow::Owned(s.into_owned())),
3028 message: self
3029 .message
3030 .into_iter()
3031 .map(Cow::into_owned)
3032 .map(Cow::Owned)
3033 .collect(),
3034 mime_type: self.mime_type.map(|m| Cow::Owned(m.into_owned())),
3035 creator_code: self.creator_code.map(|m| Cow::Owned(m.into_owned())),
3036 strength: self.strength,
3037 exts: self
3038 .exts
3039 .into_iter()
3040 .map(|e| Cow::Owned(e.into_owned()))
3041 .collect(),
3042 is_default: self.is_default,
3043 }
3044 }
3045
3046 #[inline(always)]
3052 pub fn message(&self) -> String {
3053 let mut out = String::new();
3054 for (i, m) in self.message.iter().enumerate() {
3055 if let Some(s) = m.strip_prefix(r#"\b"#) {
3056 out.push_str(s);
3057 } else {
3058 if i > 0 {
3060 out.push(' ');
3061 }
3062 out.push_str(m);
3063 }
3064 }
3065 out
3066 }
3067
3068 #[inline]
3079 pub fn message_parts(&self) -> impl Iterator<Item = &str> {
3080 self.message.iter().map(|p| p.as_ref())
3081 }
3082
3083 #[inline(always)]
3084 fn update_strength(&mut self, value: u64) {
3085 self.strength = self.strength.saturating_add(value);
3086 debug!("updated strength = {:?}", self.strength)
3087 }
3088
3089 #[inline(always)]
3095 pub fn mime_type(&self) -> &str {
3096 self.mime_type.as_deref().unwrap_or(match self.stream_kind {
3097 Some(StreamKind::Text(_)) => DEFAULT_TEXT_MIMETYPE,
3098 Some(StreamKind::Binary) | None => DEFAULT_BIN_MIMETYPE,
3099 })
3100 }
3101
3102 #[inline(always)]
3103 fn push_message<'a: 'm>(&mut self, msg: Cow<'a, str>) {
3104 if !msg.is_empty() {
3105 debug!("pushing message: msg={msg} len={}", msg.len());
3106 self.message.push(msg);
3107 }
3108 }
3109
3110 #[inline(always)]
3111 fn set_mime_type<'a: 'm>(&mut self, mime: Cow<'a, str>) {
3112 if self.mime_type.is_none() {
3113 debug!("insert mime: {:?}", mime);
3114 self.mime_type = Some(mime)
3115 }
3116 }
3117
3118 #[inline(always)]
3119 fn set_creator_code<'a: 'm>(&mut self, apple_ty: Cow<'a, str>) {
3120 if self.creator_code.is_none() {
3121 debug!("insert apple type: {apple_ty:?}");
3122 self.creator_code = Some(apple_ty)
3123 }
3124 }
3125
3126 #[inline(always)]
3127 fn insert_extensions<'a: 'm, I: Iterator<Item = &'a str>>(&mut self, exts: I) {
3128 if self.exts.is_empty() {
3129 self.exts.extend(exts.filter_map(|e| {
3130 if e.is_empty() {
3131 None
3132 } else {
3133 Some(Cow::Borrowed(e))
3134 }
3135 }));
3136 }
3137 }
3138
3139 #[inline(always)]
3147 pub fn strength(&self) -> u64 {
3148 self.strength
3149 }
3150
3151 #[inline(always)]
3157 pub fn source(&self) -> Option<&str> {
3158 self.source.as_deref()
3159 }
3160
3161 #[inline(always)]
3167 pub fn creator_code(&self) -> Option<&str> {
3168 self.creator_code.as_deref()
3169 }
3170
3171 #[inline(always)]
3177 pub fn extensions(&self) -> &HashSet<Cow<'m, str>> {
3178 &self.exts
3179 }
3180
3181 #[inline(always)]
3187 pub fn is_default(&self) -> bool {
3188 self.is_default
3189 }
3190}
3191
/// Database of magic rules together with the dependency rules handed to each
/// rule when it is finalized.
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct MagicDb {
    // Next identifier handed out by `next_rule_id`.
    rule_id: usize,
    // Loaded rules; `try_finalize` re-sorts them after a load.
    rules: Vec<MagicRule>,
    // Dependency rules keyed by string (presumably the rule name; confirm
    // against the parser).
    dependencies: HashMap<String, DependencyRule>,
    // Number of rules counted as successfully finalized.
    finalized: usize,
}
3200
/// Heuristically decides whether `bytes` looks like text.
///
/// Returns `false` for empty input and as soon as a NUL byte is met.
/// Otherwise tab, line feed, carriage return and `0x20..=0x7E` count as
/// printable and every other byte counts against the data; the input is
/// considered text when more than 85% of the bytes are printable and fewer
/// than 20% are not.
#[inline(always)]
fn is_likely_text(bytes: &[u8]) -> bool {
    if bytes.is_empty() {
        return false;
    }

    let mut printable = 0usize;
    let mut other = 0usize;

    for &byte in bytes {
        match byte {
            0x00 => return false,
            b'\t' | b'\n' | b'\r' | 0x20..=0x7E => printable += 1,
            _ => other += 1,
        }
    }

    let total = bytes.len() as f64;
    let printable_ratio = printable as f64 / total;
    let other_ratio = other as f64 / total;

    printable_ratio > 0.85 && other_ratio < 0.20
}
3243
3244#[inline(always)]
3245fn guess_stream_kind<S: AsRef<[u8]>>(stream: S) -> StreamKind {
3246 let buf = stream.as_ref();
3247
3248 match run_utf8_validation(buf) {
3249 Ok(is_ascii) => {
3250 if is_ascii {
3251 StreamKind::Text(TextEncoding::Ascii)
3252 } else {
3253 StreamKind::Text(TextEncoding::Utf8)
3254 }
3255 }
3256 Err(e) => {
3257 if is_likely_text(&buf[e.valid_up_to..]) {
3258 StreamKind::Text(TextEncoding::Unknown)
3259 } else {
3260 StreamKind::Binary
3261 }
3262 }
3263 }
3264}
3265
3266impl MagicDb {
3267 pub fn new() -> Self {
3273 Self::default()
3274 }
3275
3276 #[inline(always)]
3277 fn next_rule_id(&mut self) -> usize {
3278 let t = self.rule_id;
3279 self.rule_id += 1;
3280 t
3281 }
3282
3283 #[inline(always)]
3284 fn try_json<D: DataRead>(
3285 haystack: &mut D,
3286 stream_kind: StreamKind,
3287 magic: &mut Magic,
3288 ) -> Result<bool, Error> {
3289 if matches!(stream_kind, StreamKind::Binary) {
3291 return Ok(false);
3292 }
3293
3294 let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?.trim_ascii();
3295
3296 let Some((start, end)) = find_json_boundaries(buf) else {
3297 return Ok(false);
3298 };
3299
3300 for c in buf[0..start].iter() {
3303 if !c.is_ascii_whitespace() {
3304 return Ok(false);
3305 }
3306 }
3307
3308 let mut is_ndjson = false;
3309
3310 trace!("maybe a json document");
3311 let ok = serde_json::from_slice::<serde_json::Value>(&buf[start..=end]).is_ok();
3312 if !ok {
3313 return Ok(false);
3314 }
3315
3316 if end + 1 < buf.len() {
3318 let buf = &buf[end + 1..];
3320 if let Some((second_start, second_end)) = find_json_boundaries(buf) {
3321 if memchr(b'\n', &buf[..second_start]).is_some() {
3323 trace!("might be ndjson");
3324 is_ndjson = serde_json::from_slice::<serde_json::Value>(
3325 &buf[second_start..=second_end],
3326 )
3327 .is_ok();
3328 }
3329 }
3330 }
3331
3332 if is_ndjson {
3333 magic.push_message(Cow::Borrowed("New Line Delimited"));
3334 magic.set_mime_type(Cow::Borrowed("application/x-ndjson"));
3335 magic.insert_extensions(["ndjson", "jsonl"].into_iter());
3336 } else {
3337 magic.set_mime_type(Cow::Borrowed("application/json"));
3338 magic.insert_extensions(["json"].into_iter());
3339 }
3340
3341 magic.push_message(Cow::Borrowed("JSON text data"));
3342 magic.set_source(Some(HARDCODED_SOURCE));
3343 magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3344 Ok(true)
3345 }
3346
3347 #[inline(always)]
3348 fn try_csv<D: DataRead>(
3349 haystack: &mut D,
3350 stream_kind: StreamKind,
3351 magic: &mut Magic,
3352 ) -> Result<bool, Error> {
3353 let StreamKind::Text(enc) = stream_kind else {
3355 return Ok(false);
3356 };
3357
3358 let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3359 let mut reader = csv::Reader::from_reader(io::Cursor::new(buf));
3360 let mut records = reader.records();
3361
3362 let Some(Ok(first)) = records.next() else {
3363 return Ok(false);
3364 };
3365
3366 if first.len() <= 1 {
3370 return Ok(false);
3371 }
3372
3373 let mut n = 1;
3375 for i in records.take(9) {
3376 if let Ok(rec) = i {
3377 if first.len() != rec.len() {
3378 return Ok(false);
3379 }
3380 } else {
3381 return Ok(false);
3382 }
3383 n += 1;
3384 }
3385
3386 if n != 10 {
3388 return Ok(false);
3389 }
3390
3391 magic.set_mime_type(Cow::Borrowed("text/csv"));
3392 magic.push_message(Cow::Borrowed("CSV"));
3393 magic.push_message(Cow::Borrowed(enc.as_magic_str()));
3394 magic.push_message(Cow::Borrowed("text"));
3395 magic.insert_extensions(["csv"].into_iter());
3396 magic.set_source(Some(HARDCODED_SOURCE));
3397 magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3398 Ok(true)
3399 }
3400
3401 #[inline(always)]
3402 fn try_tar<D: DataRead>(
3403 haystack: &mut D,
3404 stream_kind: StreamKind,
3405 magic: &mut Magic,
3406 ) -> Result<bool, Error> {
3407 if !matches!(stream_kind, StreamKind::Binary) {
3409 return Ok(false);
3410 }
3411
3412 let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3413 let mut ar = Archive::new(io::Cursor::new(buf));
3414
3415 let Ok(mut entries) = ar.entries() else {
3416 return Ok(false);
3417 };
3418
3419 let Some(Ok(first)) = entries.next() else {
3420 return Ok(false);
3421 };
3422
3423 let header = first.header();
3424
3425 if header.as_ustar().is_some() {
3426 magic.push_message(Cow::Borrowed("POSIX tar archive"));
3427 } else if header.as_gnu().is_some() {
3428 magic.push_message(Cow::Borrowed("POSIX tar archive (GNU)"));
3429 } else {
3430 magic.push_message(Cow::Borrowed("tar archive"));
3431 }
3432
3433 magic.set_mime_type(Cow::Borrowed("application/x-tar"));
3434 magic.set_source(Some(HARDCODED_SOURCE));
3435 magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3436 magic.insert_extensions(["tar"].into_iter());
3437 Ok(true)
3438 }
3439
3440 #[inline(always)]
3441 fn try_hard_magic<D: DataRead>(
3442 haystack: &mut D,
3443 stream_kind: StreamKind,
3444 magic: &mut Magic,
3445 ) -> Result<bool, Error> {
3446 Ok(Self::try_json(haystack, stream_kind, magic)?
3447 || Self::try_csv(haystack, stream_kind, magic)?
3448 || Self::try_tar(haystack, stream_kind, magic)?)
3449 }
3450
3451 #[inline(always)]
3452 fn magic_default<'m, D: DataRead>(
3453 cache: &mut D,
3454 stream_kind: StreamKind,
3455 magic: &mut Magic<'m>,
3456 ) {
3457 magic.set_source(Some(HARDCODED_SOURCE));
3458 magic.set_stream_kind(stream_kind);
3459 magic.is_default = true;
3460
3461 if cache.data_size() == 0 {
3462 magic.push_message(Cow::Borrowed("empty"));
3463 magic.set_mime_type(Cow::Borrowed(DEFAULT_BIN_MIMETYPE));
3464 }
3465
3466 match stream_kind {
3467 StreamKind::Binary => {
3468 magic.push_message(Cow::Borrowed("data"));
3469 }
3470 StreamKind::Text(e) => {
3471 magic.push_message(Cow::Borrowed(e.as_magic_str()));
3472 magic.push_message(Cow::Borrowed("text"));
3473 }
3474 }
3475 }
3476
3477 fn load_rules_no_prepare(&mut self, rules: Vec<MagicRule>) {
3478 for rule in rules.into_iter() {
3479 let mut rule = rule;
3480 rule.set_id(self.next_rule_id());
3481
3482 self.rules.push(rule);
3483 }
3484 }
3485
3486 pub fn load(&mut self, ms: MagicSource) -> &mut Self {
3492 self.load_rules_no_prepare(ms.rules);
3493 self.dependencies.extend(ms.dependencies);
3494 self.try_finalize();
3495 self
3496 }
3497
3498 pub fn load_bulk<I: Iterator<Item = MagicSource>>(&mut self, it: I) -> &mut Self {
3503 for ms in it {
3504 self.load_rules_no_prepare(ms.rules);
3505 self.dependencies.extend(ms.dependencies);
3506 }
3507 self.try_finalize();
3508 self
3509 }
3510
3511 pub fn rules(&self) -> &[MagicRule] {
3517 &self.rules
3518 }
3519
    /// Returns the first result found for `haystack`, given an already
    /// determined `stream_kind`.
    ///
    /// The hardcoded detectors run first. Then, when a non-empty `extension`
    /// is given, rules listing that extension (compared in lowercase) are
    /// evaluated before all remaining rules. If no rule produces a message,
    /// the default fallback is returned.
    ///
    /// Errors from reading `haystack` or from rule evaluation are propagated.
    #[inline]
    fn first_magic_with_stream_kind<D: DataRead>(
        &self,
        haystack: &mut D,
        stream_kind: StreamKind,
        extension: Option<&str>,
    ) -> Result<Magic<'_>, Error> {
        let mut magic = Magic::default();

        if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
            return Ok(magic);
        }

        // One flag per rule, indexed by rule id: set once a rule has been
        // evaluated in the extension pass so it is not evaluated twice.
        let mut marked = vec![false; self.rules.len()];

        // Evaluates one rule; returns from the enclosing function as soon as
        // the rule produced a message, otherwise clears `magic` for the next rule.
        macro_rules! do_magic {
            ($rule: expr) => {{
                $rule.magic_entrypoint(&mut magic, stream_kind, haystack, &self, false, 0)?;

                if !magic.message.is_empty() {
                    magic.set_stream_kind(stream_kind);
                    magic.set_source($rule.source.as_deref());
                    return Ok(magic);
                }

                magic.reset();
            }};
        }

        // First pass: rules that declare the caller-supplied extension.
        if let Some(ext) = extension.map(|e| e.to_lowercase())
            && !ext.is_empty()
        {
            for rule in self.rules.iter().filter(|r| r.extensions.contains(&ext)) {
                do_magic!(rule);
                if let Some(f) = marked.get_mut(rule.id) {
                    *f = true
                }
            }
        }

        // Second pass: every rule not already evaluated above. An id outside
        // `marked` is treated as not yet evaluated.
        for rule in self
            .rules
            .iter()
            .filter(|r| !*marked.get(r.id).unwrap_or(&false))
        {
            do_magic!(rule)
        }

        Self::magic_default(haystack, stream_kind, &mut magic);

        Ok(magic)
    }
3574
3575 pub fn first_magic<R: DataRead>(
3603 &self,
3604 r: &mut R,
3605 extension: Option<&str>,
3606 ) -> Result<Magic<'_>, Error> {
3607 let stream_kind = guess_stream_kind(r.read_range(0..FILE_BYTES_MAX as u64)?);
3608 self.first_magic_with_stream_kind(r, stream_kind, extension)
3609 }
3610
3611 pub fn first_magic_file<P: AsRef<Path>>(&self, path: P) -> Result<Magic<'_>, Error> {
3621 let ext = path.as_ref().extension().and_then(|e| e.to_str());
3622 self.first_magic(&mut DataReader::from_file(File::open(path.as_ref())?)?, ext)
3623 }
3624
3625 pub fn first_magic_slice<S: AsRef<[u8]>>(
3633 &self,
3634 s: S,
3635 extension: Option<&str>,
3636 ) -> Result<Magic<'_>, Error> {
3637 self.first_magic(&mut DataReader::from_slice(s.as_ref()), extension)
3638 }
3639
3640 #[inline(always)]
3641 fn all_magics_sort_with_stream_kind<R: DataRead>(
3642 &self,
3643 haystack: &mut R,
3644 stream_kind: StreamKind,
3645 ) -> Result<Vec<Magic<'_>>, Error> {
3646 let mut out = Vec::new();
3647
3648 let mut magic = Magic::default();
3649
3650 if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3651 out.push(magic);
3652 magic = Magic::default();
3653 }
3654
3655 for rule in self.rules.iter() {
3656 rule.magic_entrypoint(&mut magic, stream_kind, haystack, self, false, 0)?;
3657
3658 if !magic.message.is_empty() {
3660 magic.set_stream_kind(stream_kind);
3661 magic.set_source(rule.source.as_deref());
3662 out.push(magic);
3663 magic = Magic::default();
3664 }
3665
3666 magic.reset();
3667 }
3668
3669 Self::magic_default(haystack, stream_kind, &mut magic);
3670 out.push(magic);
3671
3672 out.sort_by_key(|b| std::cmp::Reverse(b.strength()));
3673
3674 Ok(out)
3675 }
3676
3677 #[inline]
3692 pub fn all_magics<R: DataRead>(&self, r: &mut R) -> Result<Vec<Magic<'_>>, Error> {
3693 let stream_kind = guess_stream_kind(r.read_range(0..FILE_BYTES_MAX as u64)?);
3694 self.all_magics_sort_with_stream_kind(r, stream_kind)
3695 }
3696
3697 pub fn all_magics_file<P: AsRef<Path>>(&self, path: P) -> Result<Vec<Magic<'_>>, Error> {
3706 self.all_magics(&mut DataReader::from_file(File::open(path)?)?)
3707 }
3708
3709 pub fn all_magics_slice<S: AsRef<[u8]>>(&self, slice: S) -> Result<Vec<Magic<'_>>, Error> {
3718 self.all_magics(&mut DataReader::from_slice(slice.as_ref()))
3719 }
3720
3721 #[inline(always)]
3722 fn best_magic_with_stream_kind<R: DataRead>(
3723 &self,
3724 reader: &mut R,
3725 stream_kind: StreamKind,
3726 ) -> Result<Magic<'_>, Error> {
3727 let magics = self.all_magics_sort_with_stream_kind(reader, stream_kind)?;
3728
3729 Ok(magics.into_iter().next().unwrap_or_else(|| {
3732 let mut magic = Magic::default();
3733 Self::magic_default(reader, stream_kind, &mut magic);
3734 magic
3735 }))
3736 }
3737
3738 #[inline]
3753 pub fn best_magic<R: DataRead>(&self, r: &mut R) -> Result<Magic<'_>, Error> {
3754 let stream_kind = guess_stream_kind(r.read_range(0..FILE_BYTES_MAX as u64)?);
3755 self.best_magic_with_stream_kind(r, stream_kind)
3756 }
3757
3758 pub fn best_magic_file<P: AsRef<Path>>(&self, path: P) -> Result<Magic<'_>, Error> {
3767 self.best_magic(&mut DataReader::from_file(File::open(path)?)?)
3768 }
3769
3770 pub fn best_magic_slice<S: AsRef<[u8]>>(&self, slice: S) -> Result<Magic<'_>, Error> {
3779 self.best_magic(&mut DataReader::from_slice(slice.as_ref()))
3780 }
3781
3782 pub fn serialize<W: Write>(self, w: &mut W) -> Result<(), Error> {
3788 let mut encoder = GzEncoder::new(w, Compression::best());
3789
3790 bincode::serde::encode_into_std_write(&self, &mut encoder, bincode::config::standard())?;
3791 encoder.finish()?;
3792 Ok(())
3793 }
3794
3795 pub fn deserialize<R: Read>(r: &mut R) -> Result<Self, Error> {
3805 let mut buf = vec![];
3806 let mut gz = GzDecoder::new(r);
3807 gz.read_to_end(&mut buf).map_err(|e| {
3808 bincode::error::DecodeError::OtherString(format!("failed to read: {e}"))
3809 })?;
3810 let (sdb, _): (MagicDb, usize) =
3811 bincode::serde::decode_from_slice(&buf, bincode::config::standard())?;
3812 Ok(sdb)
3813 }
3814
3815 pub fn verify(&mut self) -> Result<(), Error> {
3822 if self.rules.len() == self.finalized {
3823 return Ok(());
3824 }
3825
3826 for r in self.rules.iter_mut().filter(|r| !r.finalized) {
3827 r.try_finalize(&self.dependencies).map_err(|e| {
3829 Error::Verify(
3830 r.source.clone().unwrap_or(String::from("unknown")),
3831 r.line(),
3832 e.into(),
3833 )
3834 })?;
3835 self.finalized += 1;
3836 }
3837
3838 debug_assert!(self.finalized <= self.rules.len());
3839
3840 Ok(())
3841 }
3842
3843 #[inline(always)]
3844 fn try_finalize(&mut self) {
3845 if self.rules.len() == self.finalized {
3846 return;
3847 }
3848
3849 let mut finalized = 0usize;
3850 self.rules.iter_mut().for_each(|r| {
3851 if r.try_finalize(&self.dependencies).is_ok() {
3852 finalized += 1;
3853 }
3854 });
3855
3856 self.finalized = finalized;
3857
3858 debug_assert!(self.finalized <= self.rules.len());
3859
3860 self.rules.sort_by_key(|r| (r.is_text(), -(r.score as i64)));
3862 }
3863}
3864
3865#[cfg(test)]
3866mod tests {
3867
3868 use regex::bytes::Regex;
3869
3870 use crate::{readers::BufReader, utils::unix_local_time_to_string};
3871
3872 use super::*;
3873
    /// Builds a `BufReader` over the bytes of a string literal.
    macro_rules! buf_reader {
        ($l: literal) => {
            BufReader::from_slice($l.as_bytes())
        };
    }
3879
3880 fn first_magic(
3881 rule: &str,
3882 content: &[u8],
3883 stream_kind: StreamKind,
3884 ) -> Result<Magic<'static>, Error> {
3885 let mut md = MagicDb::new();
3886 md.load(
3887 FileMagicParser::parse_str(rule, None)
3888 .inspect_err(|e| eprintln!("{e}"))
3889 .unwrap(),
3890 );
3891 let mut reader = BufReader::from_slice(content);
3892 let v = md.best_magic_with_stream_kind(&mut reader, stream_kind)?;
3893 Ok(v.into_owned())
3894 }
3895
    // Debugging aid that is normally not invoked by any test, hence the allow.
    #[allow(unused_macros)]
    /// Installs a tracing subscriber at TRACE level (best effort: the
    /// `try_init` result is not checked).
    macro_rules! enable_trace {
        () => {
            tracing_subscriber::fmt()
                .with_max_level(tracing_subscriber::filter::LevelFilter::TRACE)
                .try_init();
        };
    }
3905
    /// Parses a rule literal, printing the error and panicking when parsing fails.
    macro_rules! parse_assert {
        ($rule:literal) => {
            FileMagicParser::parse_str($rule, None)
                .inspect_err(|e| eprintln!("{e}"))
                .unwrap()
        };
    }
3913
    /// Evaluates a rule against content treated as a binary stream.
    ///
    /// The two-argument form only requires evaluation to succeed (it returns
    /// the result without inspecting it); the three-argument form also
    /// asserts the joined message.
    macro_rules! assert_magic_match_bin {
        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Binary).unwrap() }};
        ($rule: literal, $content:literal, $message:expr) => {{
            assert_eq!(
                first_magic($rule, $content, StreamKind::Binary)
                    .unwrap()
                    .message(),
                $message
            );
        }};
    }
3925
    /// Evaluates a rule against content treated as UTF-8 text.
    ///
    /// The two-argument form only requires evaluation to succeed (it returns
    /// the result without inspecting it); the three-argument form also
    /// asserts the joined message.
    macro_rules! assert_magic_match_text {
        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8)).unwrap() }};
        ($rule: literal, $content:literal, $message:expr) => {{
            assert_eq!(
                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
                    .unwrap()
                    .message(),
                $message
            );
        }};
    }
3937
    /// Asserts that a rule does not match content treated as UTF-8 text,
    /// i.e. that the best result is the default fallback.
    macro_rules! assert_magic_not_match_text {
        ($rule: literal, $content:literal) => {{
            assert!(
                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
                    .unwrap()
                    .is_default()
            );
        }};
    }
3947
    /// Asserts that a rule does not match content treated as a binary stream,
    /// i.e. that the best result is the default fallback.
    macro_rules! assert_magic_not_match_bin {
        ($rule: literal, $content:literal) => {{
            assert!(
                first_magic($rule, $content, StreamKind::Binary)
                    .unwrap()
                    .is_default()
            );
        }};
    }
3957
    /// Exercises `regex` rules on text and binary content, including the
    /// relative offset (`&0`) that follows a regex match.
    #[test]
    fn test_regex() {
        // Shebang line: the continuation captures the interpreter name for `%s`.
        assert_magic_match_text!(
            r#"
0 regex/1024 \^#![[:space:]]*/usr/bin/env[[:space:]]+
!:mime text/x-shellscript
>&0 regex/64 .*($|\\b) %s shell script text executable
 "#,
            br#"#!/usr/bin/env bash
 echo hello world"#,
            "bash shell script text executable"
        );

        // Sanity check of the underlying engine: with Unicode mode disabled,
        // `\x82` matches the raw byte.
        let re = Regex::new(r"(?-u)\x42\x82").unwrap();
        assert!(re.is_match(b"\x42\x82"));

        assert_magic_match_bin!(
            r#"0 regex \x42\x82 binary regex match"#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82"
        );

        // Here `&0` is expected to point just past the regex match.
        assert_magic_match_bin!(
            r#"
 0 regex \x42\x82
 >&0 string \xde\xad\xbe\xef it works
 "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        // With the `s` flag `&0` is expected to point at the start of the
        // match, so the continuation sees the matched bytes again.
        assert_magic_match_bin!(
            r#"
 0 regex/s \x42\x82
 >&0 string \x42\x82\xde\xad\xbe\xef it works
 "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        // `^`/`$` must be able to anchor on a line that is not the first one.
        assert_magic_match_text!(
            r#"
0 regex/1024 \^HelloWorld$ HelloWorld String"#,
            br#"
// this is a comment after an empty line
HelloWorld
 "#
        );
    }
4007
    /// Exercises the `string` modifiers `w`, `C`, `c`, `f` and `W`.
    #[test]
    fn test_string_with_mods() {
        // `w`: blanks in the pattern are expected to be optional in the data.
        assert_magic_match_text!(
            r#"0 string/w #!\ \ \ /usr/bin/env\ bash BASH
 "#,
            b"#! /usr/bin/env bash i
 echo hello world"
        );

        // `C`: upper-case pattern letters accept either case in the data ...
        assert_magic_match_text!(
            r#"0 string/C HelloWorld it works
 "#,
            b"helloworld"
        );

        // ... but lower-case pattern letters must still match exactly.
        assert_magic_not_match_text!(
            r#"0 string/C HelloWorld it works
 "#,
            b"hELLOwORLD"
        );

        // `c`: lower-case pattern letters accept either case in the data ...
        assert_magic_match_text!(
            r#"0 string/c HelloWorld it works
 "#,
            b"HELLOWORLD"
        );

        // ... but upper-case pattern letters must still match exactly.
        assert_magic_not_match_text!(
            r#"0 string/c HelloWorld it works
 "#,
            b"helloworld"
        );

        // `f`: the match must be a full word.
        assert_magic_match_text!(
            r#"0 string/f #!/usr/bin/env\ bash BASH
 "#,
            b"#!/usr/bin/env bash"
        );

        assert_magic_not_match_text!(
            r#"0 string/f #!/usr/bin/python PYTHON"#,
            b"#!/usr/bin/pythonic"
        );

        // `W`: every blank in the pattern needs at least one blank in the data.
        assert_magic_match_text!(
            r#"0 string/W #!/usr/bin/env\ python PYTHON"#,
            b"#!/usr/bin/env python"
        );

        assert_magic_not_match_text!(
            r#"0 string/W #!/usr/bin/env\ \ python PYTHON"#,
            b"#!/usr/bin/env python"
        );
    }
4066
    /// Exercises `search` with modifiers and the relative offset after a search.
    #[test]
    fn test_search_with_mods() {
        assert_magic_match_text!(
            r#"0 search/1/fwt #!\ /usr/bin/luatex LuaTex script text executable"#,
            b"#! /usr/bin/luatex "
        );

        // With `s`, `&0` is expected to point at the start of the match, so
        // the continuation can match the searched string again.
        assert_magic_match_text!(
            r#"
 0 search/s /usr/bin/env
 >&0 string /usr/bin/env it works
 "#,
            b"#!/usr/bin/env python"
        );

        // Without `s`, `&0` points past the match and the continuation fails.
        assert_magic_not_match_text!(
            r#"
 0 search /usr/bin/env
 >&0 string /usr/bin/env it works
 "#,
            b"#!/usr/bin/env python"
        );
    }
4091
    /// Exercises length-prefixed strings (`pstring`) with 1-, 2- and 4-byte
    /// length fields in both byte orders, with and without the `J` flag.
    #[test]
    fn test_pstring() {
        // Default: one-byte length prefix.
        assert_magic_match_bin!(r#"0 pstring Toast it works"#, b"\x05Toast");

        assert_magic_match_bin!(r#"0 pstring Toast %s"#, b"\x05Toast", "Toast");

        assert_magic_not_match_bin!(r#"0 pstring Toast Doesn't work"#, b"\x07Toaster");

        // `H`: two-byte big-endian length.
        assert_magic_match_bin!(r#"0 pstring/H Toast it works"#, b"\x00\x05Toast");

        // `J`: the stored length includes the length field itself (2 + 5 = 7).
        assert_magic_match_bin!(r#"0 pstring/HJ Toast it works"#, b"\x00\x07Toast");

        assert_magic_match_bin!(r#"0 pstring/HJ Toast %s"#, b"\x00\x07Toast", "Toast");

        // `h`: two-byte little-endian length.
        assert_magic_match_bin!(r#"0 pstring/h Toast it works"#, b"\x05\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/hJ Toast it works"#, b"\x07\x00Toast");

        // `L`: four-byte big-endian length.
        assert_magic_match_bin!(r#"0 pstring/L Toast it works"#, b"\x00\x00\x00\x05Toast");

        assert_magic_match_bin!(r#"0 pstring/LJ Toast it works"#, b"\x00\x00\x00\x09Toast");

        // `l`: four-byte little-endian length.
        assert_magic_match_bin!(r#"0 pstring/l Toast it works"#, b"\x05\x00\x00\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/lJ Toast it works"#, b"\x09\x00\x00\x00Toast");
    }
4119
    /// A rule that is `indirect` at offset 0 re-enters itself; evaluation
    /// must fail with `Error::MaximumRecursion(MAX_RECURSION)`.
    #[test]
    fn test_max_recursion() {
        let res = first_magic(
            r#"0 indirect x"#,
            b"#! /usr/bin/luatex ",
            StreamKind::Binary,
        );
        assert!(res.is_err());
        // `res` is known to be an error here, so the closure always runs.
        let _ = res.inspect_err(|e| {
            assert!(matches!(
                e.unwrap_localized(),
                Error::MaximumRecursion(MAX_RECURSION)
            ))
        });
    }
4135
    /// Exercises the comparison operators accepted by `string` rules:
    /// equality (default), `!`, `>` and `<`.
    #[test]
    fn test_string_ops() {
        assert_magic_match_text!("0 string/b MZ MZ File", b"MZ\0");
        assert_magic_match_text!("0 string !MZ Not MZ File", b"AZ\0");
        assert_magic_match_text!("0 string >\0 Any String", b"A\0");
        assert_magic_match_text!("0 string >Test Any String", b"Test 1\0");
        assert_magic_match_text!("0 string <Test Any String", b"\0");
        assert_magic_not_match_text!("0 string >Test Any String", b"\0");
    }
4145
    /// Exercises little-endian UTF-16 strings (`lestring16`).
    #[test]
    fn test_lestring16() {
        assert_magic_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        // `x` matches anything; the decoded string is substituted for `%s`.
        assert_magic_match_bin!(
            "0 lestring16 x %s",
            b"\x61\x00\x62\x00\x63\x00\x64\x00\x00",
            "abcd"
        );
        // Big-endian data must not match.
        assert_magic_not_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        assert_magic_match_bin!(
            "4 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64\x00"
        );
    }
4166
    /// Exercises big-endian UTF-16 strings (`bestring16`).
    #[test]
    fn test_bestring16() {
        assert_magic_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        // `x` matches anything; the decoded string is substituted for `%s`.
        assert_magic_match_bin!(
            "0 bestring16 x %s",
            b"\x00\x61\x00\x62\x00\x63\x00\x64",
            "abcd"
        );
        // Little-endian data must not match.
        assert_magic_not_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        assert_magic_match_bin!(
            "4 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64"
        );
    }
4187
    /// Negative offsets are counted from the end of the data.
    #[test]
    fn test_offset_from_end() {
        assert_magic_match_bin!("-1 ubyte 0x42 last byte ok", b"\x00\x00\x42");
        assert_magic_match_bin!("-2 ubyte 0x41 last byte ok", b"\x00\x41\x00");
    }
4193
    /// Chained `&0` offsets: each level reads the byte right after the one
    /// matched by its parent.
    #[test]
    fn test_relative_offset() {
        assert_magic_match_bin!(
            "
 0 ubyte 0x42
 >&0 ubyte 0x00
 >>&0 ubyte 0x41 third byte ok
 ",
            b"\x42\x00\x41\x00"
        );
    }
4205
    /// Indirect offsets: the target offset is read from the data itself.
    #[test]
    fn test_indirect_offset() {
        // The little-endian long at offset 0 is 4, so the byte at 4 is tested.
        assert_magic_match_bin!("(0.l) ubyte 0x42 it works", b"\x04\x00\x00\x00\x42");
        // 1 (read at offset 0) + 3 = 4.
        assert_magic_match_bin!("(0.l+3) ubyte 0x42 it works", b"\x01\x00\x00\x00\x42");
        // 4 (read at offset 0) + 4 (read at offset 4) = 8.
        assert_magic_match_bin!(
            "(0.l+(4)) ubyte 0x42 it works",
            b"\x04\x00\x00\x00\x04\x00\x00\x00\x42"
        );
    }
4217
    /// A `use` entry carrying its own message: that message comes first,
    /// followed by the message of the named rule it refers to.
    #[test]
    fn test_use_with_message() {
        assert_magic_match_bin!(
            r#"
0 string MZ
>0 use mz first match

0 name mz then second match
>0 string MZ
"#,
            b"MZ\0",
            "first match then second match"
        );
    }
4232
    /// Arithmetic and bitwise transforms applied to the value read before it
    /// is compared, plus rejection of division/modulo by zero at parse time.
    #[test]
    fn test_scalar_transform() {
        assert_magic_match_bin!("0 ubyte+1 0x1 add works", b"\x00");
        assert_magic_match_bin!("0 ubyte-1 0xfe sub works", b"\xff");
        assert_magic_match_bin!("0 ubyte%2 0 mod works", b"\x0a");
        assert_magic_match_bin!("0 ubyte&0x0f 0x0f bitand works", b"\xff");
        assert_magic_match_bin!("0 ubyte|0x0f 0xff bitor works", b"\xf0");
        assert_magic_match_bin!("0 ubyte^0x0f 0xf0 bitxor works", b"\xff");

        // A zero divisor must be refused by the parser.
        FileMagicParser::parse_str("0 ubyte%0 mod by zero", None)
            .expect_err("expect div by zero error");
        FileMagicParser::parse_str("0 ubyte/0 div by zero", None)
            .expect_err("expect div by zero error");
    }
4247
    /// Exercises every comparison operator on a big-endian 32-bit value.
    #[test]
    fn test_belong() {
        // Equality (default operator), including at a non-zero offset.
        assert_magic_match_bin!("0 belong 0x12345678 Big-endian long", b"\x12\x34\x56\x78");
        assert_magic_not_match_bin!("0 belong 0x12345678 Big-endian long", b"\x78\x56\x34\x12");
        assert_magic_match_bin!(
            "4 belong 0x12345678 Big-endian long",
            b"\x00\x00\x00\x00\x12\x34\x56\x78"
        );
        // `<`: strictly less than.
        assert_magic_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x77");
        assert_magic_not_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // `>`: strictly greater than.
        assert_magic_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x79");
        assert_magic_not_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // `&`: every bit of the test value must be set in the data.
        assert_magic_match_bin!("0 belong &0x5678 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong &0x0000FFFF Big-endian long", b"\x12\x34\x56\x78");

        // `^`: every bit of the test value must be clear in the data.
        assert_magic_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x01\x56\x78");

        // `~`: the data must equal the bitwise complement of the test value.
        assert_magic_match_bin!("0 belong ~0x12345678 Big-endian long", b"\xed\xcb\xa9\x87");
        assert_magic_not_match_bin!("0 belong ~0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        // `x`: any value matches.
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x12\x34\x56\x78");
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x78\x56\x34\x12");
    }
4283
    /// `search` must parse without modifiers and with range and flags given
    /// in either order.
    #[test]
    fn test_parse_search() {
        parse_assert!("0 search test");
        parse_assert!("0 search/24/s test");
        parse_assert!("0 search/s/24 test");
    }
4290
    /// Big-endian 32-bit timestamp (`bedate`); 946684800 is 2000-01-01
    /// 00:00:00 and the formatted value is expected as a fixed string.
    #[test]
    fn test_bedate() {
        assert_magic_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 bedate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
    /// Big-endian 32-bit local-time timestamp (`beldate`); the expected text
    /// is computed with `unix_local_time_to_string` so the test does not
    /// depend on the machine's time zone.
    #[test]
    fn test_beldate() {
        assert_magic_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 beldate 946684800 {}",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            unix_local_time_to_string(946684800)
        );
    }
4324
    /// Big-endian 64-bit timestamp (`beqdate`).
    #[test]
    fn test_beqdate() {
        assert_magic_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80"
        );

        assert_magic_not_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "0 beqdate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
4343
    /// Middle-endian 32-bit timestamp (`medate`): the bytes of 0x386D4380
    /// appear as 6D 38 80 43.
    #[test]
    fn test_medate() {
        assert_magic_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );

        assert_magic_not_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 medate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            "2000-01-01 00:00:00"
        );
    }
4362
    /// Middle-endian 32-bit local-time timestamp (`meldate`); the expected
    /// text is computed with `unix_local_time_to_string`.
    #[test]
    fn test_meldate() {
        assert_magic_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );
        assert_magic_not_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 meldate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            unix_local_time_to_string(946684800)
        );
    }
4380
    /// 32-bit timestamp without an explicit byte order (`date`).
    ///
    /// NOTE(review): the fixture bytes are little-endian, so this presumably
    /// assumes a little-endian host; confirm how `date` picks its byte order.
    #[test]
    fn test_date() {
        assert_magic_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 date 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            "2000-01-01 00:00:00"
        );
    }
4397
    /// Little-endian 32-bit local-time timestamp (`leldate`); the expected
    /// text is computed with `unix_local_time_to_string`.
    #[test]
    fn test_leldate() {
        assert_magic_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 leldate 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            unix_local_time_to_string(946684800)
        );
    }
4414
    /// Little-endian 64-bit timestamp (`leqdate`); 1577836800 is 2020-01-01
    /// 00:00:00.
    #[test]
    fn test_leqdate() {
        assert_magic_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
        );

        assert_magic_not_match_bin!(
            "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "8 leqdate 1577836800 %s",
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
            "2020-01-01 00:00:00"
        );
    }
4432
    /// Exercises the `leqldate` type: a 64-bit little-endian timestamp whose
    /// formatted output is compared against `unix_local_time_to_string`.
    #[test]
    fn test_leqldate() {
        // 1577836800 == 0x5E0BE100, stored little-endian in an 8-byte field.
        assert_magic_match_bin!(
            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
        );

        // An all-zero buffer must not match.
        assert_magic_not_match_bin!(
            "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        // Read at offset 8; the expected text comes from the helper instead of
        // a hard-coded string.
        assert_magic_match_bin!(
            "8 leqldate 1577836800 %s",
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
            unix_local_time_to_string(1577836800)
        );
    }
4450
    /// Exercises every comparison operator on the `melong` type (32-bit
    /// middle-endian integer).
    ///
    /// Fixture decoding, as established by the `=` case: bytes `34 12 78 56`
    /// read as 0x12345678 (high 16-bit word first, bytes swapped inside each word).
    #[test]
    fn test_melong() {
        // `=`: exact equality.
        assert_magic_match_bin!(
            "0 melong =0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        );
        assert_magic_not_match_bin!(
            "0 melong =0x12345678 Middle-endian long",
            b"\x00\x00\x00\x00"
        );

        // `<`: 0x12345578 is below the test value; the equal value is rejected.
        assert_magic_match_bin!(
            "0 melong <0x12345678 Middle-endian long",
            b"\x34\x12\x78\x55"
        );
        assert_magic_not_match_bin!(
            "0 melong <0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        );

        // `>`: 0x12345778 is above the test value; the equal value is rejected.
        assert_magic_match_bin!(
            "0 melong >0x12345678 Middle-endian long",
            b"\x34\x12\x78\x57"
        );
        assert_magic_not_match_bin!(
            "0 melong >0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        );

        // `&`: 0xCDAB5678 has every bit of 0x5678 set, so it matches;
        // 0x12345678 does not have every bit of 0x0000FFFF set.
        assert_magic_match_bin!("0 melong &0x5678 Middle-endian long", b"\xab\xcd\x78\x56");
        assert_magic_not_match_bin!(
            "0 melong &0x0000FFFF Middle-endian long",
            b"\x34\x12\x78\x56"
        );

        // `^`: 0x00005678 has none of the 0xFFFF0000 bits set and matches;
        // 0x01005678 has one of them set and is rejected.
        assert_magic_match_bin!(
            "0 melong ^0xFFFF0000 Middle-endian long",
            b"\x00\x00\x78\x56"
        );
        assert_magic_not_match_bin!(
            "0 melong ^0xFFFF0000 Middle-endian long",
            b"\x00\x01\x78\x56"
        );

        // `~`: the fixture decodes to 0xEDCBA987, the bitwise complement of
        // 0x12345678; the uncomplemented value is rejected.
        assert_magic_match_bin!(
            "0 melong ~0x12345678 Middle-endian long",
            b"\xCB\xED\x87\xA9"
        );
        assert_magic_not_match_bin!(
            "0 melong ~0x12345678 Middle-endian long",
            b"\x34\x12\x78\x56"
        );

        // `x`: matches whatever value is read.
        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x34\x12\x78\x56");
        assert_magic_match_bin!("0 melong x Middle-endian long", b"\x00\x00\x00\x00");
    }
4514
    /// Exercises every comparison operator on the `uquad` type (unsigned
    /// 64-bit integer).
    ///
    /// The fixtures are laid out least-significant byte first: the `x {:#x}`
    /// case shows `F0 DE BC 9A 78 56 34 12` printing as `0x123456789abcdef0`.
    #[test]
    fn test_uquad() {
        // `=`: exact equality.
        assert_magic_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        assert_magic_not_match_bin!(
            "0 uquad =0x123456789ABCDEF0 Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        // `<`: top byte 0x11 makes the value smaller; the equal value is rejected.
        assert_magic_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x11"
        );
        assert_magic_not_match_bin!(
            "0 uquad <0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // `>`: top byte 0x13 makes the value larger; the equal value is rejected.
        assert_magic_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x13"
        );
        assert_magic_not_match_bin!(
            "0 uquad >0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // `&`: the low byte 0xF0 has every bit of 0xF0 set but not every bit
        // of 0xFF.
        assert_magic_match_bin!(
            "0 uquad &0xF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );
        assert_magic_not_match_bin!(
            "0 uquad &0xFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // `^`: an all-zero value has none of the masked bits set and matches;
        // a value with masked bits set is rejected.
        assert_magic_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
        assert_magic_not_match_bin!(
            "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // `~`: the fixture is 0xEDCBA9876543210F, the bitwise complement of
        // the test value; the uncomplemented value is rejected.
        assert_magic_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\x0F\x21\x43\x65\x87\xA9\xCB\xED"
        );
        assert_magic_not_match_bin!(
            "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
        );

        // `x`: matches any value; the first case also checks `{:#x}` formatting.
        assert_magic_match_bin!(
            "0 uquad x {:#x}",
            b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12",
            "0x123456789abcdef0"
        );
        assert_magic_match_bin!(
            "0 uquad x Unsigned quad",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }
4588
    /// Exercises the `guid` type: matching a 16-byte buffer against a textual
    /// GUID and printing it back.
    #[test]
    fn test_guid() {
        // The fixture bytes appear in the same order as the hex digits of the
        // textual GUID in the rule.
        assert_magic_match_bin!(
            "0 guid EC959539-6786-2D4E-8FDB-98814CE76C1E It works",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E"
        );

        // An unrelated buffer must not match the GUID in the rule.
        assert_magic_not_match_bin!(
            "0 guid 399595EC-8667-4E2D-8FDB-98814CE76C1E It works",
            b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
        );

        // `x` matches any value; `%s` is replaced by the GUID rendered as
        // upper-case hyphenated text.
        assert_magic_match_bin!(
            "0 guid x %s",
            b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E",
            "EC959539-6786-2D4E-8FDB-98814CE76C1E"
        );
    }
4607
    /// Exercises the `ubeqdate` type: a 64-bit big-endian timestamp.
    #[test]
    fn test_ubeqdate() {
        // 1633046400 == 0x61564F80, stored big-endian in an 8-byte field.
        assert_magic_match_bin!(
            "0 ubeqdate 1633046400 It works",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80"
        );

        // `x` matches any value; `%s` is replaced by the formatted date,
        // compared against a fixed string.
        assert_magic_match_bin!(
            "0 ubeqdate x %s",
            b"\x00\x00\x00\x00\x61\x56\x4f\x80",
            "2021-10-01 00:00:00"
        );

        // An all-zero buffer must not match the expected timestamp.
        assert_magic_not_match_bin!(
            "0 ubeqdate 1633046400 It should not work",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );
    }
4626
    /// Exercises the `ldate` type: a 32-bit timestamp whose formatted output
    /// is compared against `unix_local_time_to_string`.
    #[test]
    fn test_ldate() {
        // 1640551520 == 0x61C8D460; the fixture bytes 60 D4 C8 61 are in
        // little-endian order.
        assert_magic_match_bin!("0 ldate 1640551520 It works", b"\x60\xd4\xC8\x61");

        // An all-zero buffer must not match.
        assert_magic_not_match_bin!("0 ldate 1633046400 It should not work", b"\x00\x00\x00\x00");

        // `x` matches any value; the expected text comes from the helper
        // instead of a hard-coded string.
        assert_magic_match_bin!(
            "0 ldate x %s",
            b"\x60\xd4\xC8\x61",
            unix_local_time_to_string(1640551520)
        );
    }
4639
    /// Checks that an arithmetic transform on an integer type (`/10`, `%10`)
    /// is applied to the value read before it is compared and printed.
    #[test]
    fn test_scalar_with_transform() {
        // The fixture byte 0x14 is 20: 20 / 10 == 2 and 20 % 10 == 0.
        assert_magic_match_bin!("0 ubyte/10 2 {}", b"\x14", "2");
        assert_magic_match_bin!("0 ubyte/10 x {}", b"\x14", "2");
        assert_magic_match_bin!("0 ubyte%10 x {}", b"\x14", "0");
    }
4646
    /// Checks that an arithmetic transform on a float type (`/10`, `%10`) is
    /// applied to the value read before it is compared and printed.
    #[test]
    fn test_float_with_transform() {
        // 00 00 A0 41 is the little-endian IEEE-754 encoding of 20.0
        // (0x41A00000): 20.0 / 10 prints as "2" and 20.0 % 10 prints as "0".
        assert_magic_match_bin!("0 lefloat/10 2 {}", b"\x00\x00\xa0\x41", "2");
        assert_magic_match_bin!("0 lefloat/10 x {}", b"\x00\x00\xa0\x41", "2");
        assert_magic_match_bin!("0 lefloat%10 x {}", b"\x00\x00\xa0\x41", "0");
    }
4653
    /// Pins the behaviour of `read_octal_u64` on valid, partially valid and
    /// invalid inputs.
    #[test]
    fn test_read_octal() {
        // Well-formed octal literals: a leading `0` followed by octal digits.
        assert_eq!(read_octal_u64(&mut buf_reader!("0")), Some(0));
        assert_eq!(read_octal_u64(&mut buf_reader!("00")), Some(0));
        assert_eq!(read_octal_u64(&mut buf_reader!("01")), Some(1));
        assert_eq!(read_octal_u64(&mut buf_reader!("07")), Some(7));
        assert_eq!(read_octal_u64(&mut buf_reader!("010")), Some(8));
        assert_eq!(read_octal_u64(&mut buf_reader!("0123")), Some(83));
        assert_eq!(read_octal_u64(&mut buf_reader!("0755")), Some(493));

        // Trailing non-digit characters: only the leading octal part is used.
        assert_eq!(read_octal_u64(&mut buf_reader!("0ABC")), Some(0));
        assert_eq!(read_octal_u64(&mut buf_reader!("01ABC")), Some(1));
        assert_eq!(read_octal_u64(&mut buf_reader!("0755ABC")), Some(493));
        assert_eq!(read_octal_u64(&mut buf_reader!("0123ABC")), Some(83));

        // `8` is not an octal digit, so it ends the number like any other
        // trailing character.
        assert_eq!(read_octal_u64(&mut buf_reader!("08")), Some(0));
        assert_eq!(read_octal_u64(&mut buf_reader!("01238")), Some(83));

        // Without the leading `0` nothing is parsed.
        assert_eq!(read_octal_u64(&mut buf_reader!("123")), None);
        assert_eq!(read_octal_u64(&mut buf_reader!("755")), None);

        // Empty input.
        assert_eq!(read_octal_u64(&mut buf_reader!("")), None);

        // Input that does not start with `0` at all.
        assert_eq!(read_octal_u64(&mut buf_reader!("ABC")), None);
        assert_eq!(read_octal_u64(&mut buf_reader!("8ABC")), None);

        // A longer literal: 0o1777777777 == 2^28 - 1.
        assert_eq!(
            read_octal_u64(&mut buf_reader!("01777777777")),
            Some(268435455)
        );
    }
4692
    /// Regression test for offset computation through two nested `use` calls
    /// that are reached via indirect offsets.
    #[test]
    fn test_offset_bug_1() {
        // Buffer layout: [0]=NUL, [1..=4]="TEST", [5]=0x06, [6..=10]="twice",
        // [11]=NUL, [12]=0x06.
        // The expected message is the concatenation of the three rule
        // messages, the last one being the string substituted for `%s`.
        assert_magic_match_bin!(
            r"
1 string TEST Bread is
# offset computation is relative to
# rule start
>(5.b) use toasted

0 name toasted
>0 string twice Toasted
>>0 use toasted_twice

0 name toasted_twice
>(6.b) string x %s
            ",
            b"\x00TEST\x06twice\x00\x06",
            "Bread is Toasted twice"
        );
    }
4715
4716 #[test]
4722 fn test_offset_bug_2() {
4723 assert_magic_match_bin!(
4726 r"
4727-12 string TEST Bread is
4728>(4.b) use toasted
4729
47300 name toasted
4731>0 string twice Toasted
4732>>0 use toasted_twice
4733
47340 name toasted_twice
4735>(6.b) string x %
4736 ",
4737 b"\x00TEST\x06twice\x00\x06",
4738 "Bread is Toasted twice"
4739 )
4740 }
4741
    /// Regression test for offset computation when a `use` call is reached
    /// through a relative indirection (`indirect/r`).
    #[test]
    fn test_offset_bug_3() {
        // Buffer layout: [0]=NUL, [1..=4]="TEST", [5]=0x06, [6..=10]="twice",
        // [11]=NUL, [12]=0x08.
        // The expected message is the concatenation of the three rule
        // messages, the last one being the string substituted for `%s`.
        assert_magic_match_bin!(
            r"
1 string TEST Bread is
>(5.b) indirect/r x

0 string twice Toasted
>0 use toasted_twice

0 name toasted_twice
>0 string x %s
            ",
            b"\x00TEST\x06twice\x00\x08",
            "Bread is Toasted twice"
        )
    }
4761
    /// Regression test for base-offset handling when an indirect offset is
    /// used inside a rule that was itself reached through `indirect/r`.
    #[test]
    fn test_offset_bug_4() {
        // Buffer layout: [0]=NUL, [1..=5]="Bread", [6]=0x06,
        // [7..=16]="is Toasted", [17]=0x0C, [18..=22]="twice", [23]=NUL.
        // Each of the three `%s` placeholders contributes one part of the
        // expected message.
        assert_magic_match_bin!(
            r"
1 string Bread %s
>(6.b) indirect/r x

# this one uses a based offset
# computed at indirection
1 string is\ Toasted %s
>(11.b) use toasted_twice

# this one is using a new base
# offset being previous base
# offset + offset of use
0 name toasted_twice
>0 string x %s
            ",
            b"\x00Bread\x06is Toasted\x0ctwice\x00",
            "Bread is Toasted twice"
        )
    }
4786
    /// Regression test for a relative offset (`&1`) evaluated inside a named
    /// rule that was reached through `indirect/r` and `use`.
    #[test]
    fn test_offset_bug_5() {
        // Buffer layout: [0]=NUL, [1..=4]="TEST", [5]=0x06, [6..=10]="twice",
        // [11]=NUL, [12]=0x08.
        // Here the trailing "twice" of the expected message is the literal
        // message of the `byte 0x08` rule, not a `%s` substitution.
        assert_magic_match_bin!(
            r"
1 string TEST Bread is
>(5.b) indirect/r x

0 string twice Toasted
>0 use toasted_twice

0 name toasted_twice
>0 string twice
>>&1 byte 0x08 twice
            ",
            b"\x00TEST\x06twice\x00\x08",
            "Bread is Toasted twice"
        )
    }
4805
    /// Regression test for a `default` continuation placed under a `use` call.
    #[test]
    fn test_bug_6() {
        // Buffer layout: [0]=NUL, [1..=4]="TEST", [5]=0x06, [6..=12]="toasted".
        // The expected message does not contain "but not burnt", i.e. the
        // `default` rule nested under the `use` is expected not to contribute.
        assert_magic_match_bin!(
            r"
1 string TEST Bread is toasted
>&0 use toasted
>>&0 default x but not burnt

0 name toasted
>1 string toasted
            ",
            b"\x00TEST\x06toasted",
            "Bread is toasted"
        )
    }
4824
    /// Regression test for offset computation when a named rule calls another
    /// named rule through an indirect offset.
    #[test]
    fn test_offset_bug_7() {
        // Buffer layout: [0]=NUL, [1..=4]="TEST", [5]=0x06, [6..=10]="toast",
        // [11]=NUL, [12]=0x06, [13..=17]="twice", [18]=NUL.
        // The expected message is the concatenation of the three rule
        // messages, the last one being the string substituted for `%s`.
        assert_magic_match_bin!(
            r"
1 string TEST Bread is
# offset computation is relative to
# rule start
>(5.b) use toasted

0 name toasted
>0 string toast Toasted
>>(6.b) use toasted_twice

0 name toasted_twice
>1 string x %s
            ",
            b"\x00TEST\x06toast\x00\x06twice\x00",
            "Bread is Toasted twice"
        );
    }
4848
4849 #[test]
4850 fn test_message_parts() {
4851 let m = first_magic(
4852 r#"0 string/W #!/usr/bin/env\ python PYTHON"#,
4853 b"#!/usr/bin/env python",
4854 StreamKind::Text(TextEncoding::Ascii),
4855 )
4856 .unwrap();
4857
4858 assert!(m.message_parts().any(|p| p.eq_ignore_ascii_case("python")))
4859 }
4860
4861 #[test]
4862 fn test_load_bulk() {
4863 let mut db = MagicDb::new();
4864
4865 let rules = vec![
4866 parse_assert!("0 search test"),
4867 parse_assert!("0 search/24/s test"),
4868 parse_assert!("0 search/s/24 test"),
4869 ];
4870
4871 db.load_bulk(rules.into_iter());
4872 db.verify().unwrap();
4873 }
4874
4875 #[test]
4876 fn test_load_bulk_failure() {
4877 let mut db = MagicDb::new();
4878
4879 let rules = vec![parse_assert!(
4880 r#"
48810 search/s/24 test
4882>0 use test
4883"#
4884 )];
4885
4886 db.load_bulk(rules.into_iter());
4887 assert!(matches!(db.verify(), Err(Error::Verify(_, _, _))));
4888 }
4889}