1#![forbid(unsafe_code)]
2#![deny(unused_imports)]
3#![deny(missing_docs)]
4use dyf::{DynDisplay, FormatString, dformat};
144use flaglet::flags;
145use flate2::{Compression, read::GzDecoder, write::GzEncoder};
146use memchr::memchr;
147use pest::{Span, error::ErrorVariant};
148use regex::bytes::{self};
149use serde::{Deserialize, Serialize};
150use std::{
151 borrow::Cow,
152 cmp::max,
153 collections::{HashMap, HashSet},
154 fmt::{self, Debug, Display},
155 fs::File,
156 io::{self, Read, SeekFrom, Write},
157 ops::{Add, BitAnd, BitOr, BitXor, Deref, Div, Mul, Rem, Sub},
158 path::Path,
159};
160use tar::Archive;
161use thiserror::Error;
162use tracing::{Level, debug, enabled, trace};
163
164use crate::{
165 numeric::{Float, FloatDataType, Scalar, ScalarDataType},
166 parser::{FileMagicParser, Rule},
167 readers::DataRead,
168 utils::{
169 debug_string_from_vec_u8, debug_string_from_vec_u16, decode_id3, find_json_boundaries,
170 run_utf8_validation,
171 },
172};
173
174mod numeric;
175mod parser;
176pub mod readers;
177pub use readers::DataReader;
178mod utils;
179
// NOTE(review): presumably the strength assigned to built-in (hard-coded) magic
// entries; the use site is outside this chunk, confirm there.
const HARDCODED_MAGIC_STRENGTH: u64 = 2048;
// NOTE(review): presumably the source label reported for built-in entries; confirm.
const HARDCODED_SOURCE: &str = "hardcoded";
// NOTE(review): presumably the depth bound behind `Error::MaximumRecursion`; confirm.
const MAX_RECURSION: usize = 50;
// Number of bytes read for a regex test that declares no explicit length.
const FILE_REGEX_MAX: usize = 8192;

/// Upper bound, in bytes (7 MiB), on the data read for search tests and for
/// string tests whose modifiers make the matched length variable.
pub const FILE_BYTES_MAX: usize = 7 * 1024 * 1024;
/// The `application/octet-stream` MIME type string.
pub const DEFAULT_BIN_MIMETYPE: &str = "application/octet-stream";
/// The `text/plain` MIME type string.
pub const DEFAULT_TEXT_MIMETYPE: &str = "text/plain";

// Timestamp pattern in `strftime`-like syntax: `YYYY-MM-DD HH:MM:SS`.
pub(crate) const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
199
/// Panics with the given `panic!`-style arguments, but only when the crate is
/// compiled with `debug_assertions`; otherwise the generated `if` does nothing.
macro_rules! debug_panic {
    ($($panic_args:tt)*) => {
        if cfg!(debug_assertions) {
            panic!($($panic_args)*);
        }
    };
}
207
/// Fills a `[u8; size_of::<$ty>()]` array from `$src` via `read_exact_into`
/// and evaluates to that array. Uses `?`, so it must be expanded inside a
/// function whose error type accepts the reader's error.
macro_rules! read {
    ($src:expr, $ty:ty) => {{
        let mut bytes = [0u8; ::std::mem::size_of::<$ty>()];
        $src.read_exact_into(&mut bytes)?;
        bytes
    }};
}
215
/// Reads a `$int` from `$src`, interpreting the bytes as little-endian.
macro_rules! read_le {
    ($src:expr, $int:ty) => {{
        let raw = read!($src, $int);
        <$int>::from_le_bytes(raw)
    }};
}
219
/// Reads a `$int` from `$src`, interpreting the bytes as big-endian.
macro_rules! read_be {
    ($src:expr, $int:ty) => {{
        let raw = read!($src, $int);
        <$int>::from_be_bytes(raw)
    }};
}
223
/// Reads a 32-bit value as two consecutive little-endian `u16` words, the
/// first word forming the upper 16 bits, and evaluates to an `i32`.
macro_rules! read_me {
    ($src:expr) => {{
        let upper = read_le!($src, u16) as i32;
        let lower = read_le!($src, u16) as i32;
        (upper << 16) | lower
    }};
}
227
228#[inline(always)]
229fn read_octal_u64<D: DataRead>(haystack: &mut D) -> Option<u64> {
230 let s = haystack
231 .read_while_or_limit(|b| matches!(b, b'0'..=b'7'), 22)
232 .map(|buf| str::from_utf8(buf))
233 .ok()?
234 .ok()?;
235
236 if !s.starts_with("0") {
237 return None;
238 }
239
240 u64::from_str_radix(s, 8).ok()
241}
242
/// Errors produced by this crate.
#[derive(Debug, Error)]
pub enum Error {
    /// Free-form error message.
    #[error("{0}")]
    Msg(String),

    /// An error tagged with a source name and a line number
    /// (rendered as `source=… line=… error=…`).
    #[error("source={0} line={1} error={2}")]
    Verify(String, usize, Box<Error>),

    /// An error wrapped together with the source name and line number it
    /// relates to; [`Error::unwrap_localized`] returns the wrapped error.
    #[error("source={0} line={1} error={2}")]
    Localized(String, usize, Box<Error>),

    /// A rule referenced by name could not be found.
    #[error("missing rule: {0}")]
    MissingRule(String),

    /// The maximum recursion level was reached; carries the reported level.
    #[error("maximum recursion reached: {0}")]
    MaximumRecursion(usize),

    /// Wrapper around [`std::io::Error`].
    #[error("io: {0}")]
    Io(#[from] io::Error),

    /// Wrapper around a boxed pest parsing error.
    #[error("parser error: {0}")]
    Parse(#[from] Box<pest::error::Error<Rule>>),

    /// Wrapper around a `dyf` formatting error.
    #[error("formatting: {0}")]
    Format(#[from] dyf::Error),

    /// Wrapper around a [`regex::Error`].
    #[error("regex: {0}")]
    Regex(#[from] regex::Error),

    /// Wrapper around a bincode encoding error.
    #[error("{0}")]
    Serialize(#[from] bincode::error::EncodeError),

    /// Wrapper around a bincode decoding error.
    #[error("{0}")]
    Deserialize(#[from] bincode::error::DecodeError),
}
290
291impl Error {
292 #[inline]
293 fn parser<S: ToString>(msg: S, span: Span<'_>) -> Self {
294 Self::Parse(Box::new(pest::error::Error::new_from_span(
295 ErrorVariant::CustomError {
296 message: msg.to_string(),
297 },
298 span,
299 )))
300 }
301
302 fn msg<M: AsRef<str>>(msg: M) -> Self {
303 Self::Msg(msg.as_ref().into())
304 }
305
306 fn localized<S: AsRef<str>>(source: S, line: usize, err: Error) -> Self {
307 Self::Localized(source.as_ref().into(), line, err.into())
308 }
309
310 pub fn unwrap_localized(&self) -> &Self {
312 match self {
313 Self::Localized(_, _, e) => e,
314 _ => self,
315 }
316 }
317}
318
/// Message attached to a rule: either plain text or a format string that is
/// filled in with the matched value.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Message {
    /// Literal text, emitted as is.
    String(String),
    /// Text containing a format specification.
    Format {
        /// Specifier string; `format_with` compares it to `"c"` to decide
        /// whether byte scalars are rendered as characters.
        printf_spec: String,
        /// The `dyf` format string used to render the message.
        fs: FormatString,
    },
}
327
328impl Display for Message {
329 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
330 match self {
331 Self::String(s) => write!(f, "{s}"),
332 Self::Format { printf_spec: _, fs } => write!(f, "{}", fs.to_string_lossy()),
333 }
334 }
335}
336
337impl Message {
338 fn to_string_lossy(&self) -> Cow<'_, str> {
339 match self {
340 Message::String(s) => Cow::Borrowed(s),
341 Message::Format { printf_spec: _, fs } => fs.to_string_lossy(),
342 }
343 }
344
345 #[inline(always)]
346 fn format_with(&self, mr: Option<&MatchRes>) -> Result<Cow<'_, str>, Error> {
347 match self {
348 Self::String(s) => Ok(Cow::Borrowed(s.as_str())),
349 Self::Format {
350 printf_spec: c_spec,
351 fs,
352 } => {
353 if let Some(mr) = mr {
354 match mr {
355 MatchRes::Float(_, _) | MatchRes::Bytes(_, _, _, _) => {
356 Ok(Cow::Owned(dformat!(fs, mr)?))
357 }
358 MatchRes::Scalar(_, scalar) => {
359 if c_spec.as_str() == "c" {
361 match scalar {
362 Scalar::byte(b) => {
363 let b = (*b as u8) as char;
364 Ok(Cow::Owned(dformat!(fs, b)?))
365 }
366 Scalar::ubyte(b) => {
367 let b = *b as char;
368 Ok(Cow::Owned(dformat!(fs, b)?))
369 }
370 _ => Ok(Cow::Owned(dformat!(fs, mr)?)),
371 }
372 } else {
373 Ok(Cow::Owned(dformat!(fs, mr)?))
374 }
375 }
376 }
377 } else {
378 Ok(fs.to_string_lossy())
379 }
380 }
381 }
382 }
383}
384
385impl ScalarDataType {
386 #[inline(always)]
387 fn read<R: DataRead>(&self, from: &mut R, switch_endianness: bool) -> Result<Scalar, Error> {
388 macro_rules! _read_le {
389 ($ty: ty) => {{
390 if switch_endianness {
391 <$ty>::from_be_bytes(read!(from, $ty))
392 } else {
393 <$ty>::from_le_bytes(read!(from, $ty))
394 }
395 }};
396 }
397
398 macro_rules! _read_be {
399 ($ty: ty) => {{
400 if switch_endianness {
401 <$ty>::from_le_bytes(read!(from, $ty))
402 } else {
403 <$ty>::from_be_bytes(read!(from, $ty))
404 }
405 }};
406 }
407
408 macro_rules! _read_ne {
409 ($ty: ty) => {{
410 if cfg!(target_endian = "big") {
411 _read_be!($ty)
412 } else {
413 _read_le!($ty)
414 }
415 }};
416 }
417
418 macro_rules! _read_me {
419 () => {
420 ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
421 };
422 }
423
424 Ok(match self {
425 Self::byte => Scalar::byte(read!(from, u8)[0] as i8),
427 Self::short => Scalar::short(_read_ne!(i16)),
428 Self::long => Scalar::long(_read_ne!(i32)),
429 Self::date => Scalar::date(_read_ne!(i32)),
430 Self::ldate => Scalar::ldate(_read_ne!(i32)),
431 Self::qwdate => Scalar::qwdate(_read_ne!(i64)),
432 Self::leshort => Scalar::leshort(_read_le!(i16)),
433 Self::lelong => Scalar::lelong(_read_le!(i32)),
434 Self::lequad => Scalar::lequad(_read_le!(i64)),
435 Self::bequad => Scalar::bequad(_read_be!(i64)),
436 Self::belong => Scalar::belong(_read_be!(i32)),
437 Self::bedate => Scalar::bedate(_read_be!(i32)),
438 Self::beldate => Scalar::beldate(_read_be!(i32)),
439 Self::beqdate => Scalar::beqdate(_read_be!(i64)),
440 Self::ubyte => Scalar::ubyte(read!(from, u8)[0]),
442 Self::ushort => Scalar::ushort(_read_ne!(u16)),
443 Self::uleshort => Scalar::uleshort(_read_le!(u16)),
444 Self::ulelong => Scalar::ulelong(_read_le!(u32)),
445 Self::uledate => Scalar::uledate(_read_le!(u32)),
446 Self::ulequad => Scalar::ulequad(_read_le!(u64)),
447 Self::offset => Scalar::offset(from.stream_position()),
448 Self::ubequad => Scalar::ubequad(_read_be!(u64)),
449 Self::medate => Scalar::medate(_read_me!()),
450 Self::meldate => Scalar::meldate(_read_me!()),
451 Self::melong => Scalar::melong(_read_me!()),
452 Self::beshort => Scalar::beshort(_read_be!(i16)),
453 Self::quad => Scalar::quad(_read_ne!(i64)),
454 Self::uquad => Scalar::uquad(_read_ne!(u64)),
455 Self::ledate => Scalar::ledate(_read_le!(i32)),
456 Self::leldate => Scalar::leldate(_read_le!(i32)),
457 Self::leqdate => Scalar::leqdate(_read_le!(i64)),
458 Self::leqldate => Scalar::leqldate(_read_le!(i64)),
459 Self::leqwdate => Scalar::leqwdate(_read_le!(i64)),
460 Self::ubelong => Scalar::ubelong(_read_be!(u32)),
461 Self::ulong => Scalar::ulong(_read_ne!(u32)),
462 Self::ubeshort => Scalar::ubeshort(_read_be!(u16)),
463 Self::ubeqdate => Scalar::ubeqdate(_read_be!(u64)),
464 Self::lemsdosdate => Scalar::lemsdosdate(_read_le!(u16)),
465 Self::lemsdostime => Scalar::lemsdostime(_read_le!(u16)),
466 Self::guid => Scalar::guid(u128::from_be_bytes(read!(from, u128))),
467 })
468 }
469}
470
471impl FloatDataType {
472 #[inline(always)]
473 fn read<R: DataRead>(&self, from: &mut R, switch_endianness: bool) -> Result<Float, Error> {
474 macro_rules! _read_le {
475 ($ty: ty) => {{
476 if switch_endianness {
477 <$ty>::from_be_bytes(read!(from, $ty))
478 } else {
479 <$ty>::from_le_bytes(read!(from, $ty))
480 }
481 }};
482 }
483
484 macro_rules! _read_be {
485 ($ty: ty) => {{
486 if switch_endianness {
487 <$ty>::from_le_bytes(read!(from, $ty))
488 } else {
489 <$ty>::from_be_bytes(read!(from, $ty))
490 }
491 }};
492 }
493
494 macro_rules! _read_ne {
495 ($ty: ty) => {{
496 if cfg!(target_endian = "big") {
497 _read_be!($ty)
498 } else {
499 _read_le!($ty)
500 }
501 }};
502 }
503
504 macro_rules! _read_me {
505 () => {
506 ((_read_le!(u16) as i32) << 16) | (_read_le!(u16) as i32)
507 };
508 }
509
510 Ok(match self {
511 Self::lefloat => Float::lefloat(_read_le!(f32)),
512 Self::befloat => Float::befloat(_read_le!(f32)),
513 Self::ledouble => Float::ledouble(_read_le!(f64)),
514 Self::bedouble => Float::bedouble(_read_be!(f64)),
515 })
516 }
517}
518
/// Operator used by value transforms (see `ScalarTransform` and
/// `FloatTransform`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Op {
    /// Multiplication, displayed as `*`.
    Mul,
    /// Addition, displayed as `+`.
    Add,
    /// Subtraction, displayed as `-`.
    Sub,
    /// Division, displayed as `/`.
    Div,
    /// Remainder, displayed as `%`.
    Mod,
    /// Bitwise AND, displayed as `&`.
    And,
    /// Bitwise XOR, displayed as `^`.
    Xor,
    /// Bitwise OR, displayed as `|`.
    Or,
}
530
531impl Display for Op {
532 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
533 match self {
534 Op::Mul => write!(f, "*"),
535 Op::Add => write!(f, "+"),
536 Op::Sub => write!(f, "-"),
537 Op::Div => write!(f, "/"),
538 Op::Mod => write!(f, "%"),
539 Op::And => write!(f, "&"),
540 Op::Or => write!(f, "|"),
541 Op::Xor => write!(f, "^"),
542 }
543 }
544}
545
/// Comparison operator applied between the value read from the input and the
/// value declared by a test.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum CmpOp {
    /// Read value equals the test value.
    Eq,
    /// Read value is lower than the test value.
    Lt,
    /// Read value is greater than the test value.
    Gt,
    /// For scalars: every bit set in the test value is set in the read value.
    BitAnd,
    /// Read value differs from the test value.
    Neq,
    /// For scalars: no bit set in the test value is set in the read value.
    Xor,
    /// For scalars: read value equals the bitwise negation of the test value.
    Not,
}
556
557impl CmpOp {
558 #[inline(always)]
559 fn is_neq(&self) -> bool {
560 matches!(self, Self::Neq)
561 }
562}
563
/// Arithmetic or bitwise operation applied to a scalar read from the input
/// before it is compared.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTransform {
    /// Operation to apply.
    op: Op,
    /// Right-hand operand of the operation.
    num: Scalar,
}
569
570impl ScalarTransform {
571 fn apply(&self, s: Scalar) -> Option<Scalar> {
572 match self.op {
573 Op::Add => s.checked_add(self.num),
574 Op::Sub => s.checked_sub(self.num),
575 Op::Mul => s.checked_mul(self.num),
576 Op::Div => s.checked_div(self.num),
577 Op::Mod => s.checked_rem(self.num),
578 Op::And => Some(s.bitand(self.num)),
579 Op::Xor => Some(s.bitxor(self.num)),
580 Op::Or => Some(s.bitor(self.num)),
581 }
582 }
583}
584
/// Arithmetic operation applied to a float read from the input before it is
/// compared.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTransform {
    /// Operation to apply; bitwise operators are not supported for floats.
    op: Op,
    /// Right-hand operand of the operation.
    num: Float,
}
590
591impl FloatTransform {
592 fn apply(&self, s: Float) -> Float {
593 match self.op {
594 Op::Add => s.add(self.num),
595 Op::Sub => s.sub(self.num),
596 Op::Mul => s.mul(self.num),
597 Op::Div => s.div(self.num),
599 Op::Mod => s.rem(self.num),
601 Op::And | Op::Xor | Op::Or => {
603 debug_panic!("unsupported operation");
604 s
605 }
606 }
607 }
608}
609
/// Value a test compares against.
#[derive(Clone, Serialize, Deserialize)]
enum TestValue<T> {
    /// A concrete value to compare with.
    Value(T),
    /// Wildcard: no concrete value is declared (rendered as `ANY` in debug
    /// output).
    Any,
}
615
616impl Debug for TestValue<Vec<u8>> {
617 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
618 match self {
619 Self::Value(v) => write!(f, "\"{}\"", debug_string_from_vec_u8(v)),
620 Self::Any => write!(f, "ANY"),
621 }
622 }
623}
624
625impl Debug for TestValue<Vec<u16>> {
626 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
627 match self {
628 Self::Value(v) => write!(f, "\"{}\"", debug_string_from_vec_u16(v)),
629 Self::Any => write!(f, "ANY"),
630 }
631 }
632}
633
634impl Debug for TestValue<Scalar> {
635 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
636 match self {
637 Self::Value(s) => write!(f, "{s:?}"),
638 Self::Any => write!(f, "ANY"),
639 }
640 }
641}
642
643impl Debug for TestValue<Float> {
644 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
645 match self {
646 Self::Value(fl) => write!(f, "{fl:?}"),
647 Self::Any => write!(f, "ANY"),
648 }
649 }
650}
651
652impl<T> TestValue<T> {
653 #[inline(always)]
654 fn as_ref(&self) -> TestValue<&T> {
655 match self {
656 Self::Value(v) => TestValue::Value(v),
657 Self::Any => TestValue::Any,
658 }
659 }
660}
661
// Modifier flags attached to regex tests (also carried by search tests).
#[flags(u8)]
#[derive(Debug, Serialize, Deserialize)]
enum ReMod {
    // NOTE(review): presumably case-insensitive matching; not used in this chunk.
    CaseInsensitive = 1 << 0,
    // NOTE(review): presumably moves the offset to the match start; confirm at use site.
    StartOffsetUpdate = 1 << 1,
    // The regex length counts lines: the number of bytes read is `length * 80`.
    LineLimit = 1 << 2,
    // Forces the test to be considered binary.
    ForceBin = 1 << 3,
    // Forces the test to be considered text.
    ForceText = 1 << 4,
    // NOTE(review): presumably trims the matched value; not used in this chunk.
    TrimMatch = 1 << 5,
}
672
673fn serialize_regex<S>(re: &bytes::Regex, serializer: S) -> Result<S::Ok, S::Error>
674where
675 S: serde::Serializer,
676{
677 re.as_str().serialize(serializer)
678}
679
680fn deserialize_regex<'de, D>(deserializer: D) -> Result<bytes::Regex, D::Error>
681where
682 D: serde::Deserializer<'de>,
683{
684 let wrapper = String::deserialize(deserializer)?;
685 bytes::Regex::new(&wrapper).map_err(serde::de::Error::custom)
686}
687
/// Test matching a regular expression against the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct RegexTest {
    /// Compiled byte regex; (de)serialized through its pattern string.
    #[serde(
        serialize_with = "serialize_regex",
        deserialize_with = "deserialize_regex"
    )]
    re: bytes::Regex,
    /// Optional length: bounds the bytes read (multiplied by 80 with
    /// `ReMod::LineLimit`) and the number of lines scanned in text mode.
    length: Option<usize>,
    /// Regex modifiers.
    mods: ReModFlags,
    /// String modifiers.
    str_mods: StringModFlags,
    /// NOTE(review): not used in this chunk; meaning to be confirmed at use site.
    non_magic_len: usize,
    /// Whether the test is considered binary (see `is_binary`).
    binary: bool,
    /// Comparison operator; with `Neq` a failed search counts as a match.
    cmp_op: CmpOp,
}
702
703impl RegexTest {
704 #[inline(always)]
705 fn is_binary(&self) -> bool {
706 self.binary
707 || self.mods.contains(ReMod::ForceBin)
708 || self.str_mods.contains(StringMod::ForceBin)
709 }
710
711 #[inline(always)]
712 fn is_text(&self) -> bool {
713 self.mods.contains(ReMod::ForceText) || self.str_mods.contains(StringMod::ForceText)
714 }
715
716 fn match_buf<'buf>(
717 &self,
718 off_buf: u64, stream_kind: StreamKind,
720 buf: &'buf [u8],
721 ) -> Option<MatchRes<'buf>> {
722 let mr = match stream_kind {
723 StreamKind::Text(_) => {
724 let mut off_txt = off_buf;
725
726 let mut line_limit = self.length.unwrap_or(usize::MAX);
727
728 for line in buf.split(|c| c == &b'\n') {
729 if line_limit == 0 {
733 break;
734 }
735
736 if let Some(re_match) = self.re.find(line) {
737 let start_offset = off_txt + re_match.start() as u64;
739
740 let stop_offset = if re_match.end() == line.len() {
742 Some(start_offset + re_match.as_bytes().len() as u64 + 1)
743 } else {
744 None
745 };
746
747 return Some(MatchRes::Bytes(
748 start_offset,
749 stop_offset,
750 re_match.as_bytes(),
751 Encoding::Utf8,
752 ));
753 }
754
755 off_txt += line.len() as u64;
756 off_txt += 1;
758 line_limit = line_limit.saturating_sub(1)
759 }
760 None
761 }
762
763 StreamKind::Binary => {
764 self.re.find(buf).map(|re_match| {
765 MatchRes::Bytes(
766 off_buf + re_match.start() as u64,
768 None,
769 re_match.as_bytes(),
770 Encoding::Utf8,
771 )
772 })
773 }
774 };
775
776 if self.cmp_op.is_neq() && mr.is_none() {
778 return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
779 }
780
781 mr
782 }
783}
784
785impl From<RegexTest> for Test {
786 fn from(value: RegexTest) -> Self {
787 Self::Regex(value)
788 }
789}
790
// Modifier flags attached to string-like tests.
#[flags(u8)]
#[derive(Debug, Serialize, Deserialize)]
enum StringMod {
    // Forces the test to be considered binary.
    ForceBin = 1 << 0,
    // Upper-case bytes of the pattern also match their lower-case form.
    UpperInsensitive = 1 << 1,
    // Lower-case bytes of the pattern also match their upper-case form.
    LowerInsensitive = 1 << 2,
    // The match must be delimited by whitespace (see `string_match` and
    // `SearchTest::match_buf`).
    FullWordMatch = 1 << 3,
    // ASCII whitespace is trimmed from the reported value.
    Trim = 1 << 4,
    // Forces the test to be considered text.
    ForceText = 1 << 5,
    // A run of spaces in the pattern matches a run at least as long in the input.
    CompactWhitespace = 1 << 6,
    // Spaces are optional on both the pattern and the input side.
    OptBlank = 1 << 7,
}
803
/// Test comparing bytes of the input with a string.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StringTest {
    /// String to compare with, or the wildcard.
    test_val: TestValue<Vec<u8>>,
    /// Comparison operator.
    cmp_op: CmpOp,
    /// When set, exactly this many bytes are read for the comparison.
    length: Option<usize>,
    /// String modifiers.
    mods: StringModFlags,
    /// Whether the test is considered binary (see `is_binary`).
    binary: bool,
}
812
813impl From<StringTest> for Test {
814 fn from(value: StringTest) -> Self {
815 Self::String(value)
816 }
817}
818
/// Matches the pattern `str` against the beginning of `buf`, honoring `mods`.
///
/// Returns `(matched, consumed)` where `consumed` is the number of bytes of
/// `buf` that were walked. Without any of the modifiers altering the
/// comparison, this is a plain prefix test and `consumed` is `str.len()` on
/// success and `0` on failure.
#[inline(always)]
fn string_match(str: &[u8], mods: StringModFlags, buf: &[u8]) -> (bool, usize) {
    let mut consumed = 0;
    // Fast path: none of the modifiers changing the comparison is set.
    if mods.is_disjoint(
        StringMod::UpperInsensitive
            | StringMod::LowerInsensitive
            | StringMod::FullWordMatch
            | StringMod::CompactWhitespace
            | StringMod::OptBlank,
    ) {
        if buf.starts_with(str) {
            (true, str.len())
        } else {
            (false, consumed)
        }
    } else {
        // Index of the next pattern byte to match.
        let mut i_src = 0;
        let mut iter = buf.iter().peekable();

        // Advances in `buf` by one byte, keeping `consumed` in sync.
        macro_rules! consume_target {
            () => {{
                if iter.next().is_some() {
                    consumed += 1;
                }
            }};
        }

        // Both bytes matched: advance in `buf` and in the pattern.
        macro_rules! continue_next_iteration {
            () => {{
                consume_target!();
                i_src += 1;
                continue;
            }};
        }

        while let Some(&&b) = iter.peek() {
            // Pattern exhausted: stop walking the input.
            let Some(&ref_byte) = str.get(i_src) else {
                break;
            };

            // Optional blanks: a space on either side is skipped on that side.
            if mods.contains(StringMod::OptBlank) && (b == b' ' || ref_byte == b' ') {
                if b == b' ' {
                    consume_target!();
                }

                if ref_byte == b' ' {
                    i_src += 1;
                }

                continue;
            }

            // An upper-case pattern byte matches both cases in the input.
            if mods.contains(StringMod::UpperInsensitive) {
                if ref_byte.is_ascii_uppercase() && ref_byte == b.to_ascii_uppercase()
                    || ref_byte == b
                {
                    continue_next_iteration!()
                }
            }

            // A lower-case pattern byte matches both cases in the input.
            if mods.contains(StringMod::LowerInsensitive)
                && (ref_byte.is_ascii_lowercase() && ref_byte == b.to_ascii_lowercase()
                    || ref_byte == b)
            {
                continue_next_iteration!()
            }

            // Compare runs of spaces: the input run must be at least as long
            // as the pattern run.
            if mods.contains(StringMod::CompactWhitespace) && ref_byte == b' ' {
                let mut src_blk = 0;
                while let Some(b' ') = str.get(i_src) {
                    src_blk += 1;
                    i_src += 1;
                }

                let mut tgt_blk = 0;
                while let Some(b' ') = iter.peek() {
                    tgt_blk += 1;
                    consume_target!();
                }

                if src_blk > tgt_blk {
                    return (false, consumed);
                }

                continue;
            }

            if ref_byte == b {
                continue_next_iteration!()
            } else {
                return (false, consumed);
            }
        }

        // Full word match: the byte following the match, if any, must be
        // whitespace.
        if mods.contains(StringMod::FullWordMatch)
            && let Some(b) = iter.peek()
            && !b.is_ascii_whitespace()
        {
            return (false, consumed);
        }

        // Success requires some input to be consumed and the whole pattern to
        // be walked.
        (
            consumed > 0 && str.get(i_src).is_none() && consumed <= buf.len(),
            consumed,
        )
    }
}
931
932impl StringTest {
933 fn has_length_mod(&self) -> bool {
934 !self.mods.is_disjoint(
935 StringMod::UpperInsensitive
936 | StringMod::LowerInsensitive
937 | StringMod::FullWordMatch
938 | StringMod::CompactWhitespace
939 | StringMod::OptBlank,
940 )
941 }
942
943 #[inline(always)]
944 fn test_value_len(&self) -> usize {
945 match self.test_val.as_ref() {
946 TestValue::Value(s) => s.len(),
947 TestValue::Any => 0,
948 }
949 }
950
951 #[inline(always)]
952 fn is_binary(&self) -> bool {
953 self.binary || self.mods.contains(StringMod::ForceBin)
954 }
955
956 #[inline(always)]
957 fn is_text(&self) -> bool {
958 self.mods.contains(StringMod::ForceText)
959 }
960}
961
/// Newtype around `Vec<u8>` whose `Debug` output is a quoted debug string
/// instead of a list of numbers.
#[derive(Clone, Serialize, Deserialize)]
struct ByteVec(Vec<u8>);
964
965impl Debug for ByteVec {
966 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
967 write!(f, "\"{}\"", debug_string_from_vec_u8(self))
968 }
969}
970
971impl From<Vec<u8>> for ByteVec {
972 fn from(value: Vec<u8>) -> Self {
973 Self(value)
974 }
975}
976
977impl Deref for ByteVec {
978 type Target = Vec<u8>;
979
980 fn deref(&self) -> &Self::Target {
981 &self.0
982 }
983}
984
/// Test searching a string inside the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SearchTest {
    /// String to search for.
    str: ByteVec,
    /// When set, candidate positions beyond this index are not considered.
    n_pos: Option<usize>,
    /// String modifiers.
    str_mods: StringModFlags,
    /// Regex modifiers.
    re_mods: ReModFlags,
    /// Whether the test is considered binary (see `is_binary`).
    binary: bool,
    /// Comparison operator; with `Neq` a failed search counts as a match.
    cmp_op: CmpOp,
}
994
995impl From<SearchTest> for Test {
996 fn from(value: SearchTest) -> Self {
997 Self::Search(value)
998 }
999}
1000
1001impl SearchTest {
1002 #[inline(always)]
1003 fn is_binary(&self) -> bool {
1004 (self.binary
1005 || self.str_mods.contains(StringMod::ForceBin)
1006 || self.re_mods.contains(ReMod::ForceBin))
1007 && !(self.str_mods.contains(StringMod::ForceText)
1008 || self.re_mods.contains(ReMod::ForceText))
1009 }
1010
1011 #[inline]
1013 fn match_buf<'buf>(&self, off_buf: u64, buf: &'buf [u8]) -> Option<MatchRes<'buf>> {
1014 let mut i = 0;
1015
1016 let needle = self.str.first()?;
1017
1018 while i < buf.len() {
1019 let Some(k) = memchr(*needle, &buf[i..]) else {
1022 break;
1023 };
1024
1025 i += k;
1026
1027 if self.str_mods.contains(StringMod::FullWordMatch) {
1029 let prev_is_whitespace = buf
1030 .get(i.saturating_sub(1))
1031 .map(|c| c.is_ascii_whitespace())
1032 .unwrap_or_default();
1033
1034 if i > 0 && !prev_is_whitespace {
1039 i += 1;
1040 continue;
1041 }
1042 }
1043
1044 if let Some(npos) = self.n_pos
1045 && i > npos
1046 {
1047 break;
1048 }
1049
1050 let pos = i;
1051 let (ok, consumed) = string_match(&self.str, self.str_mods, &buf[i..]);
1052
1053 if ok {
1054 return Some(MatchRes::Bytes(
1055 off_buf.saturating_add(pos as u64),
1056 None,
1057 &buf[i..i + consumed],
1058 Encoding::Utf8,
1059 ));
1060 } else {
1061 i += max(consumed, 1)
1062 }
1063 }
1064
1065 if self.cmp_op.is_neq() {
1067 return Some(MatchRes::Bytes(off_buf, None, buf, Encoding::Utf8));
1068 }
1069
1070 None
1071 }
1072}
1073
/// Test comparing a scalar read from the input with a value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ScalarTest {
    /// Data type of the scalar to read.
    ty: ScalarDataType,
    /// Optional transform applied to the read value before the comparison.
    transform: Option<ScalarTransform>,
    /// Comparison operator.
    cmp_op: CmpOp,
    /// Value to compare with, or the wildcard.
    test_val: TestValue<Scalar>,
}
1081
/// Test comparing a float read from the input with a value.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FloatTest {
    /// Data type of the float to read.
    ty: FloatDataType,
    /// Optional transform applied to the read value before the comparison.
    transform: Option<FloatTransform>,
    /// Comparison operator.
    cmp_op: CmpOp,
    /// Value to compare with, or the wildcard.
    test_val: TestValue<Float>,
}
1089
/// Value read from the input for a test, together with the stream position at
/// which the read started (first field of every variant).
#[derive(PartialEq)]
enum ReadValue<'buf> {
    /// A floating point value.
    Float(u64, Float),
    /// A scalar value.
    Scalar(u64, Scalar),
    /// Raw bytes borrowed from the reader.
    Bytes(u64, &'buf [u8]),
}
1098
1099impl<'buf> Debug for ReadValue<'buf> {
1100 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1101 match self {
1102 Self::Float(_, fl) => write!(f, "{fl:?}"),
1103 Self::Scalar(_, s) => write!(f, "{s:?}"),
1104 Self::Bytes(_, b) => {
1105 if b.len() <= 128 {
1106 write!(f, "\"{}\"", debug_string_from_vec_u8(b))
1107 } else {
1108 let limit = 128;
1109 write!(
1110 f,
1111 "\"{}\" (first {limit} bytes)",
1112 debug_string_from_vec_u8(&b[..limit])
1113 )
1114 }
1115 }
1116 }
1117 }
1118}
1119
1120impl DynDisplay for ReadValue<'_> {
1121 fn dyn_fmt(&self, f: &mut dyf::Formatter<'_>) -> dyf::Result {
1122 use std::fmt::Write;
1123 match self {
1124 Self::Float(_, s) => DynDisplay::dyn_fmt(s, f),
1125 Self::Scalar(_, s) => DynDisplay::dyn_fmt(s, f),
1126 Self::Bytes(_, b) => Ok(write!(f, "{b:?}")?),
1127 }
1128 }
1129}
1130
1131impl DynDisplay for &ReadValue<'_> {
1132 fn dyn_fmt(&self, f: &mut dyf::Formatter<'_>) -> dyf::Result {
1133 DynDisplay::dyn_fmt(*self, f)
1135 }
1136}
1137
1138impl Display for ReadValue<'_> {
1139 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1140 match self {
1141 Self::Float(_, v) => write!(f, "{v}"),
1142 Self::Scalar(_, s) => write!(f, "{s}"),
1143 Self::Bytes(_, b) => write!(f, "{b:?}"),
1144 }
1145 }
1146}
1147
/// Text encoding used to render matched bytes.
enum Encoding {
    /// UTF-16 with the given byte order.
    Utf16(String16Encoding),
    /// UTF-8.
    Utf8,
}
1152
/// Result of a successful test.
enum MatchRes<'buf> {
    /// Matched bytes: start offset, optional explicit end offset (when absent
    /// the end is the start plus the length of the bytes), the bytes and
    /// their encoding.
    Bytes(u64, Option<u64>, &'buf [u8], Encoding),
    /// Matched scalar and the offset it was read at.
    Scalar(u64, Scalar),
    /// Matched float and the offset it was read at.
    Float(u64, Float),
}
1164
1165impl DynDisplay for &MatchRes<'_> {
1166 fn dyn_fmt(&self, f: &mut dyf::Formatter) -> dyf::Result {
1167 (*self).dyn_fmt(f)
1168 }
1169}
1170
1171impl DynDisplay for MatchRes<'_> {
1172 fn dyn_fmt(&self, f: &mut dyf::Formatter) -> dyf::Result {
1173 match self {
1174 Self::Scalar(_, v) => v.dyn_fmt(f),
1175 Self::Float(_, v) => v.dyn_fmt(f),
1176 Self::Bytes(_, _, v, enc) => match enc {
1177 Encoding::Utf8 => String::from_utf8_lossy(v).to_string().dyn_fmt(f),
1178 Encoding::Utf16(enc) => {
1179 let utf16: Vec<u16> = slice_to_utf16_iter(v, *enc).collect();
1180 String::from_utf16_lossy(&utf16).dyn_fmt(f)
1181 }
1182 },
1183 }
1184 }
1185}
1186
1187impl MatchRes<'_> {
1188 #[inline]
1190 fn start_offset(&self) -> u64 {
1191 match self {
1192 MatchRes::Bytes(o, _, _, _) => *o,
1193 MatchRes::Scalar(o, _) => *o,
1194 MatchRes::Float(o, _) => *o,
1195 }
1196 }
1197
1198 #[inline]
1200 fn end_offset(&self) -> u64 {
1201 match self {
1202 MatchRes::Bytes(start, end, buf, _) => match end {
1203 Some(end) => *end,
1204 None => start.saturating_add(buf.len() as u64),
1205 },
1206 MatchRes::Scalar(o, sc) => o.add(sc.size_of() as u64),
1207 MatchRes::Float(o, f) => o.add(f.size_of() as u64),
1208 }
1209 }
1210}
1211
1212fn slice_to_utf16_iter(read: &[u8], encoding: String16Encoding) -> impl Iterator<Item = u16> {
1213 let even = read
1214 .iter()
1215 .enumerate()
1216 .filter(|(i, _)| i % 2 == 0)
1217 .map(|t| t.1);
1218
1219 let odd = read
1220 .iter()
1221 .enumerate()
1222 .filter(|(i, _)| i % 2 != 0)
1223 .map(|t| t.1);
1224
1225 even.zip(odd).map(move |(e, o)| match encoding {
1226 String16Encoding::Le => u16::from_le_bytes([*e, *o]),
1227 String16Encoding::Be => u16::from_be_bytes([*e, *o]),
1228 })
1229}
1230
/// Byte order of 16-bit string units.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum String16Encoding {
    /// Little-endian units.
    Le,
    /// Big-endian units.
    Be,
}
1236
/// Test comparing a 16-bit unit string with the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct String16Test {
    /// NOTE(review): presumably the string as written in the rule; not used in
    /// this chunk, confirm at use site.
    orig: String,
    /// 16-bit units to compare with, or the wildcard.
    test_val: TestValue<Vec<u16>>,
    /// Byte order of the units in the input.
    encoding: String16Encoding,
}
1243
1244impl String16Test {
1245 #[inline(always)]
1249 fn test_value_len(&self) -> usize {
1250 match self.test_val.as_ref() {
1251 TestValue::Value(str16) => str16.len(),
1252 TestValue::Any => 0,
1253 }
1254 }
1255}
1256
// Modifier flags attached to indirect tests.
#[flags(u8)]
#[derive(Debug, Serialize, Deserialize)]
enum IndirectMod {
    // NOTE(review): presumably makes the indirect offset relative; not used in
    // this chunk, confirm at use site.
    Relative = 1 << 0,
}
1262
/// Encoding of the length prefix preceding a length-prefixed string.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum PStringLen {
    /// One byte.
    Byte,
    /// Two bytes, big-endian.
    ShortBe,
    /// Two bytes, little-endian.
    ShortLe,
    /// Four bytes, big-endian.
    LongBe,
    /// Four bytes, little-endian.
    LongLe,
}
1271
1272impl PStringLen {
1273 #[inline(always)]
1274 const fn size_of_len(&self) -> usize {
1275 match self {
1276 PStringLen::Byte => 1,
1277 PStringLen::ShortBe => 2,
1278 PStringLen::ShortLe => 2,
1279 PStringLen::LongBe => 4,
1280 PStringLen::LongLe => 4,
1281 }
1282 }
1283}
1284
/// Test on a length-prefixed string.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PStringTest {
    /// Encoding of the length prefix.
    len: PStringLen,
    /// String to compare with, or the wildcard.
    test_val: TestValue<Vec<u8>>,
    /// Whether the stored length also counts the bytes of the prefix itself.
    include_len: bool,
}
1291
1292impl PStringTest {
1293 #[inline]
1294 fn read<'cache, R: DataRead>(
1295 &self,
1296 haystack: &'cache mut R,
1297 ) -> Result<Option<&'cache [u8]>, Error> {
1298 let mut len = match self.len {
1299 PStringLen::Byte => read_le!(haystack, u8) as u32,
1300 PStringLen::ShortBe => read_be!(haystack, u16) as u32,
1301 PStringLen::ShortLe => read_le!(haystack, u16) as u32,
1302 PStringLen::LongBe => read_be!(haystack, u32),
1303 PStringLen::LongLe => read_le!(haystack, u32),
1304 } as usize;
1305
1306 if self.include_len {
1307 len = len.saturating_sub(self.len.size_of_len())
1308 }
1309
1310 if let TestValue::Value(s) = self.test_val.as_ref()
1311 && len != s.len()
1312 {
1313 return Ok(None);
1314 }
1315
1316 let read = haystack.read_exact_count(len as u64)?;
1317
1318 Ok(Some(read))
1319 }
1320
1321 #[inline(always)]
1322 fn test_value_len(&self) -> usize {
1323 match self.test_val.as_ref() {
1324 TestValue::Value(s) => s.len(),
1325 TestValue::Any => 0,
1326 }
1327 }
1328}
1329
/// The different kinds of tests a rule entry can carry.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum Test {
    /// `name` entry carrying the given name.
    Name(String),
    /// `use` entry: a flag (bound as `flip` when displayed) and the name used.
    Use(bool, String),
    /// Scalar comparison.
    Scalar(ScalarTest),
    /// Floating point comparison.
    Float(FloatTest),
    /// String comparison.
    String(StringTest),
    /// String search.
    Search(SearchTest),
    /// Length-prefixed string comparison.
    PString(PStringTest),
    /// Regular expression match.
    Regex(RegexTest),
    /// `indirect` entry with its modifiers.
    Indirect(IndirectModFlags),
    /// 16-bit unit string comparison.
    String16(String16Test),
    /// DER test; displayed as "unimplemented der".
    #[allow(dead_code)]
    Der,
    /// `clear` entry.
    Clear,
    /// `default` entry.
    Default,
}
1348
1349impl Display for Test {
1350 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1351 match self {
1352 Test::Name(name) => write!(f, "name {name}"),
1353 Test::Use(flip, rule) => {
1354 if *flip {
1355 write!(f, "use {rule}")
1356 } else {
1357 write!(f, "use ^{rule}")
1358 }
1359 }
1360 Test::Scalar(st) => write!(f, "{st:?}"),
1361 Test::Float(ft) => write!(f, "{ft:?}"),
1362 Test::String(st) => write!(f, "{st:?}"),
1363 Test::Search(st) => write!(f, "{st:?}"),
1364 Test::PString(pt) => write!(f, "{pt:?}"),
1365 Test::Regex(rt) => write!(f, "{rt:?}"),
1366 Test::Indirect(fs) => write!(f, "indirect {fs:?}"),
1367 Test::String16(s16t) => write!(f, "{s16t:?}"),
1368 Test::Der => write!(f, "unimplemented der"),
1369 Test::Clear => write!(f, "clear"),
1370 Test::Default => write!(f, "default"),
1371 }
1372 }
1373}
1374
1375impl Test {
    /// Reads from `haystack` the value this test has to be compared with.
    ///
    /// The stream position is captured before reading and stored in the
    /// returned [`ReadValue`]. `switch_endianness` is forwarded to scalar and
    /// float reads.
    ///
    /// Returns `Ok(None)` when a length-prefixed string read reports no
    /// candidate value.
    ///
    /// # Errors
    ///
    /// Propagates read errors. Also fails for tests that do not read any
    /// value (`Name`, `Use`, `Indirect`, `Clear`, `Default`, `Der`) and for
    /// string tests using an unsupported comparison operator.
    #[inline]
    fn read_test_value<'haystack, D: DataRead>(
        &self,
        haystack: &'haystack mut D,
        switch_endianness: bool,
    ) -> Result<Option<ReadValue<'haystack>>, Error> {
        // Position of the value, recorded before anything is consumed.
        let test_value_offset = haystack.stream_position();

        match self {
            Self::Scalar(t) => {
                t.ty.read(haystack, switch_endianness)
                    .map(|s| Some(ReadValue::Scalar(test_value_offset, s)))
            }

            Self::Float(t) => {
                t.ty.read(haystack, switch_endianness)
                    .map(|f| Some(ReadValue::Float(test_value_offset, f)))
            }
            Self::String(t) => {
                match t.test_val.as_ref() {
                    TestValue::Value(str) => {
                        // An explicit length takes precedence over anything else.
                        let buf = if let Some(length) = t.length {
                            haystack.read_exact_count(length as u64)?
                        } else {
                            match t.cmp_op {
                                CmpOp::Eq | CmpOp::Neq => {
                                    // Without length-affecting modifiers exactly the
                                    // pattern length is needed; otherwise read up to
                                    // the global limit.
                                    if !t.has_length_mod() {
                                        haystack.read_exact_count(str.len() as u64)?
                                    } else {
                                        haystack.read_count(FILE_BYTES_MAX as u64)?
                                    }
                                }
                                CmpOp::Lt | CmpOp::Gt => {
                                    // NOTE(review): the limit here is 8092 while the
                                    // wildcard branch below uses 8192; possibly a typo,
                                    // to be confirmed.
                                    let read =
                                        haystack.read_until_any_delim_or_limit(b"\n\0", 8092)?;

                                    // Drop the trailing delimiter, if any.
                                    if read.ends_with(b"\0") || read.ends_with(b"\n") {
                                        &read[..read.len() - 1]
                                    } else {
                                        read
                                    }
                                }
                                _ => {
                                    return Err(Error::Msg(format!(
                                        "string test does not support {:?} operator",
                                        t.cmp_op
                                    )));
                                }
                            }
                        };

                        Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
                    }
                    TestValue::Any => {
                        // Wildcard: read up to a NUL or a newline, within the limit.
                        let read = haystack.read_until_any_delim_or_limit(b"\0\n", 8192)?;
                        // Drop the trailing delimiter, if any.
                        let bytes = if read.ends_with(b"\0") || read.ends_with(b"\n") {
                            &read[..read.len() - 1]
                        } else {
                            read
                        };

                        Ok(Some(ReadValue::Bytes(test_value_offset, bytes)))
                    }
                }
            }

            Self::String16(t) => {
                match t.test_val.as_ref() {
                    TestValue::Value(str16) => {
                        // Two bytes per 16-bit unit.
                        let read = haystack.read_exact_count((str16.len() * 2) as u64)?;

                        Ok(Some(ReadValue::Bytes(test_value_offset, read)))
                    }
                    TestValue::Any => {
                        let read = haystack.read_until_utf16_or_limit(b"\x00\x00", 8192)?;

                        // Keep an even number of bytes so that only whole units
                        // are returned.
                        let end = if read.len() % 2 == 0 {
                            read.len()
                        } else {
                            read.len().saturating_sub(1)
                        };

                        Ok(Some(ReadValue::Bytes(test_value_offset, &read[..end])))
                    }
                }
            }

            Self::PString(t) => {
                let Some(read) = t.read(haystack)? else {
                    return Ok(None);
                };
                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
            }

            Self::Search(_) => {
                // The search itself happens later; read up to the global limit.
                let buf = haystack.read_count(FILE_BYTES_MAX as u64)?;
                Ok(Some(ReadValue::Bytes(test_value_offset, buf)))
            }

            Self::Regex(r) => {
                let length = {
                    match r.length {
                        Some(len) => {
                            // With the line limit modifier the length counts
                            // lines, converted to bytes at 80 bytes per line.
                            if r.mods.contains(ReMod::LineLimit) {
                                len * 80
                            } else {
                                len
                            }
                        }

                        None => FILE_REGEX_MAX,
                    }
                };

                let read = haystack.read_count(length as u64)?;
                Ok(Some(ReadValue::Bytes(test_value_offset, read)))
            }

            // These tests do not compare against data read from the input.
            Self::Name(_)
            | Self::Use(_, _)
            | Self::Indirect(_)
            | Self::Clear
            | Self::Default
            | Self::Der => Err(Error::msg("no value to read for this test")),
        }
    }
1510
    /// Checks a value previously read from the stream against this test.
    ///
    /// Returns `Some(MatchRes)` when the test succeeds. Returns `None` when the
    /// comparison fails, when a scalar transformation yields `None`, or when
    /// the kind of `tv` does not correspond to the kind of test.
    #[inline(always)]
    fn match_value<'s>(
        &'s self,
        tv: &ReadValue<'s>,
        stream_kind: StreamKind,
    ) -> Option<MatchRes<'s>> {
        match (self, tv) {
            (Self::Scalar(t), ReadValue::Scalar(o, ts)) => {
                // apply the optional transformation first; `?` turns a
                // failed transformation into a failed test
                let read_value: Scalar = match t.transform.as_ref() {
                    Some(t) => t.apply(*ts)?,
                    None => *ts,
                };

                match t.test_val {
                    TestValue::Value(test_value) => {
                        let ok = match t.cmp_op {
                            // equality against the bitwise negation of the test value
                            CmpOp::Not => read_value == !test_value,
                            CmpOp::Eq => read_value == test_value,
                            CmpOp::Lt => read_value < test_value,
                            CmpOp::Gt => read_value > test_value,
                            CmpOp::Neq => read_value != test_value,
                            // every bit of the test value must be set in the read value
                            CmpOp::BitAnd => read_value & test_value == test_value,
                            // no bit of the test value may be set in the read value
                            CmpOp::Xor => (read_value & test_value).is_zero(),
                        };

                        if ok {
                            Some(MatchRes::Scalar(*o, read_value))
                        } else {
                            None
                        }
                    }

                    // nothing to compare against: the read value always matches
                    TestValue::Any => Some(MatchRes::Scalar(*o, read_value)),
                }
            }

            (Self::Float(t), ReadValue::Float(o, f)) => {
                let read_value: Float = t.transform.as_ref().map(|t| t.apply(*f)).unwrap_or(*f);

                match t.test_val {
                    TestValue::Value(tf) => {
                        let ok = match t.cmp_op {
                            CmpOp::Eq => read_value == tf,
                            CmpOp::Lt => read_value < tf,
                            CmpOp::Gt => read_value > tf,
                            CmpOp::Neq => read_value != tf,
                            _ => {
                                // panics in debug builds only, otherwise the
                                // test is simply considered as failed
                                debug_panic!("unsupported float comparison");
                                debug!("unsupported float comparison");
                                false
                            }
                        };

                        if ok {
                            Some(MatchRes::Float(*o, read_value))
                        } else {
                            None
                        }
                    }
                    TestValue::Any => Some(MatchRes::Float(*o, read_value)),
                }
            }

            (Self::String(st), ReadValue::Bytes(o, buf)) => {
                // strips ASCII whitespace on both ends when the Trim
                // modifier is set, otherwise returns the slice untouched
                macro_rules! trim_buf {
                    ($buf: expr) => {{
                        if st.mods.contains(StringMod::Trim) {
                            $buf.trim_ascii()
                        } else {
                            $buf
                        }
                    }};
                }

                match st.test_val.as_ref() {
                    TestValue::Value(str) => {
                        match st.cmp_op {
                            // Eq / Neq report the test string, not the bytes read
                            CmpOp::Eq => {
                                if let (true, _) = string_match(str, st.mods, buf) {
                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
                                } else {
                                    None
                                }
                            }
                            CmpOp::Neq => {
                                if let (false, _) = string_match(str, st.mods, buf) {
                                    Some(MatchRes::Bytes(*o, None, trim_buf!(str), Encoding::Utf8))
                                } else {
                                    None
                                }
                            }
                            // Gt / Lt only compare the lengths of the buffers
                            CmpOp::Gt => {
                                if buf.len() > str.len() {
                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
                                } else {
                                    None
                                }
                            }
                            CmpOp::Lt => {
                                if buf.len() < str.len() {
                                    Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
                                } else {
                                    None
                                }
                            }

                            _ => {
                                // panics in debug builds only
                                debug_panic!("unsupported string comparison");
                                debug!("unsupported string comparison");
                                None
                            }
                        }
                    }
                    TestValue::Any => {
                        Some(MatchRes::Bytes(*o, None, trim_buf!(buf), Encoding::Utf8))
                    }
                }
            }

            (Self::PString(m), ReadValue::Bytes(o, buf)) => match m.test_val.as_ref() {
                TestValue::Value(psv) => {
                    if buf == psv {
                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8))
                    } else {
                        None
                    }
                }
                TestValue::Any => Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf8)),
            },

            (Self::String16(t), ReadValue::Bytes(o, buf)) => {
                match t.test_val.as_ref() {
                    TestValue::Value(str16) => {
                        // two bytes per UTF-16 code unit: a size mismatch
                        // cannot match and the check keeps `str16[i]` in bounds
                        if str16.len() * 2 != buf.len() {
                            return None;
                        }

                        for (i, utf16_char) in slice_to_utf16_iter(buf, t.encoding).enumerate() {
                            if str16[i] != utf16_char {
                                return None;
                            }
                        }

                        // on success the original test string is reported
                        Some(MatchRes::Bytes(
                            *o,
                            None,
                            t.orig.as_bytes(),
                            Encoding::Utf16(t.encoding),
                        ))
                    }

                    TestValue::Any => {
                        Some(MatchRes::Bytes(*o, None, buf, Encoding::Utf16(t.encoding)))
                    }
                }
            }

            // regex and search tests carry their own matching logic
            (Self::Regex(r), ReadValue::Bytes(o, buf)) => r.match_buf(*o, stream_kind, buf),

            (Self::Search(t), ReadValue::Bytes(o, buf)) => t.match_buf(*o, buf),

            // any other (test, value) combination cannot match
            _ => None,
        }
    }
1684
1685 #[inline(always)]
1686 fn strength(&self) -> u64 {
1687 const MULT: usize = 10;
1688
1689 let mut out = 2 * MULT;
1690
1691 match self {
1693 Test::Scalar(s) => {
1694 out += s.ty.type_size() * MULT;
1695 }
1696
1697 Test::Float(t) => {
1698 out += t.ty.type_size() * MULT;
1699 }
1700
1701 Test::String(t) => out += t.test_value_len().saturating_mul(MULT),
1702
1703 Test::PString(t) => out += t.test_value_len().saturating_mul(MULT),
1704
1705 Test::Search(s) => {
1706 let n_pos = s.n_pos.unwrap_or(FILE_BYTES_MAX);
1711
1712 match n_pos {
1713 0..=80 => out += s.str.len().saturating_mul(MULT),
1715 81..=240 => out += s.str.len() * s.str.len().clamp(0, MULT - 2),
1717 _ => out += s.str.len(),
1719 }
1720 }
1721
1722 Test::Regex(r) => {
1723 let v = r.non_magic_len / r.re.captures_len();
1732
1733 let len = r
1734 .length
1735 .map(|l| {
1736 if r.mods.contains(ReMod::LineLimit) {
1737 l * 80
1738 } else {
1739 l
1740 }
1741 })
1742 .unwrap_or(FILE_BYTES_MAX);
1743
1744 match len {
1745 0..=80 => out += v.saturating_mul(MULT),
1747 81..=240 => out += v * v.clamp(0, MULT - 2),
1749 _ => out += v,
1751 }
1752 }
1753
1754 Test::String16(t) => {
1755 out += t.test_value_len().saturating_mul(MULT);
1760 }
1761
1762 Test::Der => out += MULT,
1763
1764 Test::Default | Test::Name(_) | Test::Use(_, _) | Test::Indirect(_) | Test::Clear => {
1765 return 0;
1766 }
1767 }
1768
1769 if self.is_match_any() {
1771 return 0;
1772 }
1773
1774 if let Some(op) = self.cmp_op() {
1775 match op {
1776 CmpOp::Neq => out = 0,
1778 CmpOp::Eq | CmpOp::Not => out += MULT,
1779 CmpOp::Lt | CmpOp::Gt => out -= 2 * MULT,
1780 CmpOp::Xor | CmpOp::BitAnd => out -= MULT,
1781 }
1782 }
1783
1784 out as u64
1785 }
1786
1787 #[inline(always)]
1788 fn cmp_op(&self) -> Option<CmpOp> {
1789 match self {
1790 Self::String(t) => Some(t.cmp_op),
1791 Self::Scalar(s) => Some(s.cmp_op),
1792 Self::Float(t) => Some(t.cmp_op),
1793 Self::Name(_)
1794 | Self::Use(_, _)
1795 | Self::Search(_)
1796 | Self::PString(_)
1797 | Self::Regex(_)
1798 | Self::Clear
1799 | Self::Default
1800 | Self::Indirect(_)
1801 | Self::String16(_)
1802 | Self::Der => None,
1803 }
1804 }
1805
1806 #[inline(always)]
1807 fn is_recursive(&self) -> bool {
1808 matches!(self, Test::Use(_, _) | Test::Indirect(_))
1809 }
1810
1811 #[inline(always)]
1812 fn is_match_any(&self) -> bool {
1813 match self {
1814 Test::Name(_) => false,
1815 Test::Use(_, _) => false,
1816 Test::Scalar(scalar_test) => matches!(scalar_test.test_val, TestValue::Any),
1817 Test::Float(float_test) => matches!(float_test.test_val, TestValue::Any),
1818 Test::String(string_test) => matches!(string_test.test_val, TestValue::Any),
1819 Test::Search(_) => false,
1820 Test::PString(pstring_test) => matches!(pstring_test.test_val, TestValue::Any),
1821 Test::Regex(_) => false,
1822 Test::Indirect(_) => false,
1823 Test::String16(string16_test) => matches!(string16_test.test_val, TestValue::Any),
1824 Test::Der => false,
1825 Test::Clear => false,
1826 Test::Default => false,
1827 }
1828 }
1829
1830 #[inline(always)]
1831 fn is_binary(&self) -> bool {
1832 match self {
1833 Self::Name(_) => true,
1834 Self::Use(_, _) => true,
1835 Self::Scalar(_) => true,
1836 Self::Float(_) => true,
1837 Self::String(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1838 Self::Search(t) => t.is_binary(),
1839 Self::PString(_) => true,
1840 Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_binary(),
1841 Self::Clear => true,
1842 Self::Default => true,
1843 Self::Indirect(_) => true,
1844 Self::String16(_) => true,
1845 Self::Der => true,
1846 }
1847 }
1848
1849 #[inline(always)]
1850 fn is_text(&self) -> bool {
1851 match self {
1852 Self::Name(_) => true,
1853 Self::Use(_, _) => true,
1854 Self::Indirect(_) => true,
1855 Self::Clear => true,
1856 Self::Default => true,
1857 Self::String(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1858 Self::Regex(t) => !t.is_binary() & !t.is_text() || t.is_text(),
1859 _ => !self.is_binary(),
1860 }
1861 }
1862
1863 #[inline(always)]
1864 fn is_only_text(&self) -> bool {
1865 self.is_text() && !self.is_binary()
1866 }
1867
1868 #[inline(always)]
1869 fn is_only_binary(&self) -> bool {
1870 self.is_binary() && !self.is_text()
1871 }
1872}
1873
/// Type of the value read from the stream to compute an indirect offset
/// (see `IndOffset::read_offset`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum OffsetType {
    /// One byte value
    Byte,
    /// Little-endian `f64`, cast to an integer
    DoubleLe,
    /// Big-endian `f64`, cast to an integer
    DoubleBe,
    /// Little-endian 16 bits integer
    ShortLe,
    /// Big-endian 16 bits integer
    ShortBe,
    /// Little-endian 32 bits integer decoded with `decode_id3`
    Id3Le,
    /// Big-endian 32 bits integer decoded with `decode_id3`
    Id3Be,
    /// Little-endian 32 bits integer
    LongLe,
    /// Big-endian 32 bits integer
    LongBe,
    /// Value read with the `read_me!` macro
    Middle,
    /// Value parsed by `read_octal_u64`
    Octal,
    /// Big-endian 64 bits integer
    QuadBe,
    /// Little-endian 64 bits integer
    QuadLe,
}
1890
/// Right-hand operand of the operation applied to an indirect offset.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Shift {
    /// The operand is the value itself
    Direct(u64),
    /// The operand is read from the stream, at the address of the
    /// indirect offset shifted by this (signed) amount
    Indirect(i64),
}
1896
/// Offset whose value has to be read from the stream.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
struct IndOffset {
    /// Where to read the offset value from
    off_addr: DirOffset,
    /// Whether the value must be read as a signed integer
    signed: bool,
    /// Type of the value to read
    ty: OffsetType,
    /// Operation to apply on the value read, only used when `shift` is also set
    op: Option<Op>,
    /// Right-hand operand of `op`
    shift: Option<Shift>,
}
1908
1909impl IndOffset {
1910 fn read_offset<D: DataRead>(
1912 &self,
1913 haystack: &mut D,
1914 rule_base_offset: Option<u64>,
1915 last_upper_match_offset: Option<u64>,
1916 ) -> Result<Option<u64>, io::Error> {
1917 let offset_address = match self.off_addr {
1918 DirOffset::Start(s) => {
1919 let Some(o) = s.checked_add(rule_base_offset.unwrap_or_default()) else {
1920 return Ok(None);
1921 };
1922
1923 haystack.seek(SeekFrom::Start(o))?
1924 }
1925 DirOffset::LastUpper(c) => haystack.seek(SeekFrom::Start(
1926 (last_upper_match_offset.unwrap_or_default() as i64 + c) as u64,
1927 ))?,
1928 DirOffset::End(e) => haystack.seek(SeekFrom::End(e))?,
1929 };
1930
1931 macro_rules! read_value {
1932 () => {
1933 match self.ty {
1934 OffsetType::Byte => {
1935 if self.signed {
1936 read_le!(haystack, u8) as u64
1937 } else {
1938 read_le!(haystack, i8) as u64
1939 }
1940 }
1941 OffsetType::DoubleLe => read_le!(haystack, f64) as u64,
1942 OffsetType::DoubleBe => read_be!(haystack, f64) as u64,
1943 OffsetType::ShortLe => {
1944 if self.signed {
1945 read_le!(haystack, i16) as u64
1946 } else {
1947 read_le!(haystack, u16) as u64
1948 }
1949 }
1950 OffsetType::ShortBe => {
1951 if self.signed {
1952 read_be!(haystack, i16) as u64
1953 } else {
1954 read_be!(haystack, u16) as u64
1955 }
1956 }
1957 OffsetType::Id3Le => decode_id3(read_le!(haystack, u32)) as u64,
1958 OffsetType::Id3Be => decode_id3(read_be!(haystack, u32)) as u64,
1959 OffsetType::LongLe => {
1960 if self.signed {
1961 read_le!(haystack, i32) as u64
1962 } else {
1963 read_le!(haystack, u32) as u64
1964 }
1965 }
1966 OffsetType::LongBe => {
1967 if self.signed {
1968 read_be!(haystack, i32) as u64
1969 } else {
1970 read_be!(haystack, u32) as u64
1971 }
1972 }
1973 OffsetType::Middle => read_me!(haystack) as u64,
1974 OffsetType::Octal => {
1975 if let Some(o) = read_octal_u64(haystack) {
1976 o
1977 } else {
1978 debug!("failed to read octal offset @ {offset_address}");
1979 return Ok(None);
1980 }
1981 }
1982 OffsetType::QuadLe => {
1983 if self.signed {
1984 read_le!(haystack, i64) as u64
1985 } else {
1986 read_le!(haystack, u64)
1987 }
1988 }
1989 OffsetType::QuadBe => {
1990 if self.signed {
1991 read_be!(haystack, i64) as u64
1992 } else {
1993 read_be!(haystack, u64)
1994 }
1995 }
1996 }
1997 };
1998 }
1999
2000 let o = read_value!();
2002
2003 trace!(
2004 "offset read @ {offset_address} value={o} op={:?} shift={:?}",
2005 self.op, self.shift
2006 );
2007
2008 if let (Some(op), Some(shift)) = (self.op, self.shift) {
2010 let shift = match shift {
2011 Shift::Direct(i) => i,
2012 Shift::Indirect(i) => {
2013 let tmp = offset_address as i128 + i as i128;
2014 if tmp.is_negative() {
2015 return Ok(None);
2016 } else {
2017 haystack.seek(SeekFrom::Start(tmp as u64))?;
2018 };
2019 read_value!()
2022 }
2023 };
2024
2025 match op {
2026 Op::Add => return Ok(o.checked_add(shift)),
2027 Op::Mul => return Ok(o.checked_mul(shift)),
2028 Op::Sub => return Ok(o.checked_sub(shift)),
2029 Op::Div => return Ok(o.checked_div(shift)),
2030 Op::Mod => return Ok(o.checked_rem(shift)),
2031 Op::And => return Ok(Some(o & shift)),
2032 Op::Or => return Ok(Some(o | shift)),
2033 Op::Xor => return Ok(Some(o ^ shift)),
2034 }
2035 }
2036
2037 Ok(Some(o))
2038 }
2039}
2040
/// Offset which can be computed without reading the stream.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum DirOffset {
    /// Offset from the start (of the stream or of the rule base offset)
    Start(u64),
    /// Offset relative to the offset of the previous (upper) level
    LastUpper(i64),
    /// Offset relative to the end of the stream (handed to `SeekFrom::End`)
    End(i64),
}
2048
/// Offset at which a test must be run.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
enum Offset {
    /// Offset known without reading the stream
    Direct(DirOffset),
    /// Offset whose value must be read from the stream
    Indirect(IndOffset),
}
2054
2055impl Offset {
2056 #[inline(always)]
2057 fn is_indirect(&self) -> bool {
2058 matches!(self, Self::Indirect(_))
2059 }
2060}
2061
2062impl From<DirOffset> for Offset {
2063 fn from(value: DirOffset) -> Self {
2064 Self::Direct(value)
2065 }
2066}
2067
2068impl From<IndOffset> for Offset {
2069 fn from(value: IndOffset) -> Self {
2070 Self::Indirect(value)
2071 }
2072}
2073
2074impl Display for DirOffset {
2075 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2076 match self {
2077 DirOffset::Start(i) => write!(f, "{i}"),
2078 DirOffset::LastUpper(c) => write!(f, "&{c}"),
2079 DirOffset::End(e) => write!(f, "-{e}"),
2080 }
2081 }
2082}
2083
2084impl Default for DirOffset {
2085 fn default() -> Self {
2086 Self::LastUpper(0)
2087 }
2088}
2089
/// A single test of a magic rule, with the offset where to run it.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Match {
    /// Line number, reported in logs and localized errors
    line: usize,
    /// Depth of the match, used as its continuation level
    depth: u8,
    /// Where to run the test
    offset: Offset,
    /// The test to run
    test: Test,
    /// Strength of the test (the `From<Use>` / `From<Name>` conversions
    /// fill it with `Test::strength`)
    test_strength: u64,
    /// Optional message to report when the test matches
    message: Option<Message>,
}
2099
2100impl From<Use> for Match {
2101 fn from(value: Use) -> Self {
2102 let test = Test::Use(value.switch_endianness, value.rule_name);
2103 let test_strength = test.strength();
2104 Self {
2105 line: value.line,
2106 depth: value.depth,
2107 offset: value.start_offset,
2108 test,
2109 test_strength,
2110 message: value.message,
2111 }
2112 }
2113}
2114
2115impl From<Name> for Match {
2116 fn from(value: Name) -> Self {
2117 let test = Test::Name(value.name);
2118 let test_strength = test.strength();
2119 Self {
2120 line: value.line,
2121 depth: 0,
2122 offset: Offset::Direct(DirOffset::Start(0)),
2123 test,
2124 test_strength,
2125 message: value.message,
2126 }
2127 }
2128}
2129
2130impl Match {
2131 #[inline(always)]
2133 fn offset_from_start<D: DataRead>(
2134 &self,
2135 haystack: &mut D,
2136 rule_base_offset: Option<u64>,
2137 last_level_offset: Option<u64>,
2138 ) -> Result<Option<u64>, io::Error> {
2139 match self.offset {
2140 Offset::Direct(dir_offset) => match dir_offset {
2141 DirOffset::Start(s) => Ok(Some(s)),
2142 DirOffset::LastUpper(shift) => {
2143 let o = last_level_offset.unwrap_or_default() as i64 + shift;
2144
2145 if o >= 0 { Ok(Some(o as u64)) } else { Ok(None) }
2146 }
2147 DirOffset::End(e) => Ok(Some(haystack.offset_from_start(SeekFrom::End(e)))),
2148 },
2149 Offset::Indirect(ind_offset) => {
2150 let Some(o) =
2151 ind_offset.read_offset(haystack, rule_base_offset, last_level_offset)?
2152 else {
2153 return Ok(None);
2154 };
2155
2156 Ok(Some(o))
2157 }
2158 }
2159 }
2160
    /// Runs the test of this match against `haystack`.
    ///
    /// Returns a tuple made of a boolean telling whether the test matched
    /// and an optional match result (control tests such as `Clear`, `Name`,
    /// `Use`, `Indirect` and `Default` never produce one).
    ///
    /// Failures to compute the offset, to seek or to read the test value are
    /// logged and reported as a non-match rather than as an error.
    ///
    /// # Errors
    ///
    /// * `Error::MaximumRecursion` when `depth` reaches `MAX_RECURSION`
    /// * `Error::MissingRule` when a `Use` test references an unknown rule
    /// * any error raised by the rules run by `Use` and `Indirect` tests
    #[inline]
    #[allow(clippy::too_many_arguments)]
    fn matches<'a: 'h, 'h, D: DataRead>(
        &'a self,
        source: Option<&str>,
        magic: &mut Magic<'a>,
        stream_kind: StreamKind,
        state: &mut MatchState,
        buf_base_offset: Option<u64>,
        rule_base_offset: Option<u64>,
        last_level_offset: Option<u64>,
        haystack: &'h mut D,
        switch_endianness: bool,
        db: &'a MagicDb,
        depth: usize,
    ) -> Result<(bool, Option<MatchRes<'h>>), Error> {
        let source = source.unwrap_or("unknown");
        let line = self.line;

        if depth >= MAX_RECURSION {
            return Err(Error::localized(
                source,
                line,
                Error::MaximumRecursion(MAX_RECURSION),
            ));
        }

        // do not waste time running tests which cannot apply to the stream
        if self.test.is_only_binary() && stream_kind.is_text() {
            trace!("skip binary test source={source} line={line} stream_kind={stream_kind:?}",);
            return Ok((false, None));
        }

        if self.test.is_only_text() && !stream_kind.is_text() {
            trace!("skip text test source={source} line={line} stream_kind={stream_kind:?}",);
            return Ok((false, None));
        }

        // an offset which cannot be computed is not an error, the test
        // just does not match
        let Ok(Some(mut offset)) = self
            .offset_from_start(haystack, rule_base_offset, last_level_offset)
            .inspect_err(|e| debug!("source={source} line={line} failed at computing offset: {e}"))
        else {
            return Ok((false, None));
        };

        // indirect offsets are relative to the buffer base offset, offsets
        // from start are relative to the rule base offset
        offset = match self.offset {
            Offset::Indirect(_) => {
                buf_base_offset.unwrap_or_default().saturating_add(offset)
            }
            Offset::Direct(DirOffset::Start(_)) => {
                rule_base_offset.unwrap_or_default().saturating_add(offset)
            }
            _ => offset,
        };

        match &self.test {
            Test::Clear => {
                trace!("source={source} line={line} clear");
                state.clear_continuation_level(&self.continuation_level());
                Ok((true, None))
            }

            Test::Name(name) => {
                trace!(
                    "source={source} line={line} running rule {name} switch_endianness={switch_endianness}",
                );
                Ok((true, None))
            }

            Test::Use(flip_endianness, rule_name) => {
                trace!(
                    "source={source} line={line} use {rule_name} switch_endianness={flip_endianness}",
                );

                // flipping an already flipped endianness restores the original one
                let switch_endianness = switch_endianness ^ flip_endianness;

                let dr: &DependencyRule = db.dependencies.get(rule_name).ok_or(
                    Error::localized(source, line, Error::MissingRule(rule_name.clone())),
                )?;

                // the message is pushed before running the rule so that it
                // comes before the messages of the rule
                if let Some(msg) = self.message.as_ref() {
                    magic.push_message(msg.to_string_lossy());
                }

                let new_buf_base_off = if self.offset.is_indirect() {
                    Some(offset)
                } else {
                    None
                };

                let nmatch = dr.rule.magic(
                    magic,
                    stream_kind,
                    new_buf_base_off,
                    Some(offset),
                    haystack,
                    db,
                    switch_endianness,
                    depth.saturating_add(1),
                )?;

                let matched = nmatch > 0;
                if matched {
                    state.set_continuation_level(self.continuation_level());
                }

                Ok((matched, None))
            }

            Test::Indirect(m) => {
                trace!(
                    "source={source} line={line} indirect mods={:?} offset={offset:#x}",
                    m
                );

                let new_buf_base_off = if m.contains(IndirectMod::Relative) {
                    Some(offset)
                } else {
                    None
                };

                if let Some(msg) = self.message.as_ref() {
                    magic.push_message(msg.to_string_lossy());
                }

                // run the rules of the database at the given offset and
                // stop at the first rule producing a match
                let mut nmatch = 0u64;
                for r in db.rules.iter() {
                    nmatch = nmatch.saturating_add(r.magic(
                        magic,
                        stream_kind,
                        new_buf_base_off,
                        Some(offset),
                        haystack,
                        db,
                        false,
                        depth.saturating_add(1),
                    )?);

                    if nmatch > 0 {
                        break;
                    }
                }

                Ok((nmatch > 0, None))
            }

            Test::Default => {
                // default matches only if nothing matched yet at this level
                let ok = !state.get_continuation_level(&self.continuation_level());

                trace!("source={source} line={line} default match={ok}");
                if ok {
                    state.set_continuation_level(self.continuation_level());
                }

                Ok((ok, None))
            }

            _ => {
                if let Err(e) = haystack.seek(SeekFrom::Start(offset)) {
                    debug!("source={source} line={line} failed to seek in haystack: {e}");
                    return Ok((false, None));
                }

                // pieces of the log message, only built when logging at
                // DEBUG level (or above) is enabled
                let mut trace_msg = None;

                if enabled!(Level::DEBUG) {
                    trace_msg = Some(vec![format!(
                        "source={source} line={line} depth={} stream_offset={:#x}",
                        self.depth,
                        haystack.stream_position()
                    )])
                }

                if let Ok(opt_test_value) = self
                    .test
                    .read_test_value(haystack, switch_endianness)
                    .inspect_err(|e| {
                        debug!("source={source} line={line} error while reading test value @{offset}: {e}",)
                    })
                {
                    if let Some(v) = trace_msg
                        .as_mut() { v.push(format!("test={}", self.test)) }

                    if let Some(v) = trace_msg.as_mut(){
                        let drv = match opt_test_value.as_ref(){
                            Some(r) => format!("{r:?}"),
                            None =>String::new(),
                        };
                        v.push(format!("read_in_stream={drv}"))
                    }

                    let match_res =
                        opt_test_value.and_then(|tv| self.test.match_value(&tv, stream_kind));

                    if let Some(v) = trace_msg.as_mut() { v.push(format!(
                        "message=\"{}\" match={}",
                        self.message
                            .as_ref()
                            .map(|fs| fs.to_string_lossy())
                            .unwrap_or_default(),
                        match_res.is_some()
                    )) }

                    // at DEBUG level only successful tests are logged, at
                    // TRACE level every test is logged
                    if enabled!(Level::DEBUG) && !enabled!(Level::TRACE) && match_res.is_some() {
                        if let Some(m) = trace_msg{
                            debug!("{}", m.join(" "));
                        }
                    } else if enabled!(Level::TRACE)
                        && let Some(m) = trace_msg{
                        trace!("{}", m.join(" "));
                    }

                    if let Some(mr) = match_res {
                        state.set_continuation_level(self.continuation_level());
                        return Ok((true, Some(mr)));
                    }
                }

                Ok((false, None))
            }
        }
    }
2409
2410 #[inline(always)]
2411 fn continuation_level(&self) -> ContinuationLevel {
2412 ContinuationLevel(self.depth)
2413 }
2414}
2415
/// Entry calling another (named) rule, converted into a [`Match`] running
/// a `Test::Use`.
#[derive(Debug, Clone)]
struct Use {
    /// Line number of the entry
    line: usize,
    /// Depth of the entry
    depth: u8,
    /// Offset at which the rule must be run
    start_offset: Offset,
    /// Name of the rule to run
    rule_name: String,
    /// Whether endianness must be flipped while running the rule
    switch_endianness: bool,
    /// Optional message attached to the entry
    message: Option<Message>,
}
2425
/// Modifier applied on the strength of an entry (see [`StrengthMod::apply`]).
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StrengthMod {
    /// Operation to apply on the strength
    op: Op,
    /// Right-hand operand of the operation
    by: u8,
}
2431
2432impl StrengthMod {
2433 #[inline(always)]
2434 fn apply(&self, strength: u64) -> u64 {
2435 let by = self.by as u64;
2436 debug!("applying strength modifier: {strength} {} {}", self.op, by);
2437 match self.op {
2438 Op::Mul => strength.saturating_mul(by),
2439 Op::Add => strength.saturating_add(by),
2440 Op::Sub => strength.saturating_sub(by),
2441 Op::Div => {
2442 if by > 0 {
2443 strength.saturating_div(by)
2444 } else {
2445 strength
2446 }
2447 }
2448 Op::Mod => strength % by,
2449 Op::And => strength & by,
2450 Op::Xor | Op::Or => {
2453 debug_panic!("unsupported strength operator");
2454 strength
2455 }
2456 }
2457 }
2458}
2459
/// Additional information which can be attached to an entry.
// NOTE(review): how flags end up in `EntryNode` fields (mimetype, exts,
// strength_mod, apple) is not visible from this part of the file.
#[derive(Debug, Clone)]
enum Flag {
    /// A MIME type
    Mime(String),
    /// A set of file extensions
    Ext(HashSet<String>),
    /// A strength modifier
    Strength(StrengthMod),
    /// An Apple specific string (presumably the creator code, to be confirmed)
    Apple(String),
}
2467
/// Entry naming a rule, converted into a [`Match`] running a `Test::Name`.
#[derive(Debug, Clone)]
struct Name {
    /// Line number of the entry
    line: usize,
    /// Name of the rule
    name: String,
    /// Optional message attached to the entry
    message: Option<Message>,
}
2474
/// An entry of a magic file, along with the span it comes from.
#[derive(Debug, Clone)]
enum Entry<'span> {
    /// A test entry
    Match(Span<'span>, Match),
    /// A flag entry
    Flag(Span<'span>, Flag),
}
2480
/// Node of the tree of entries making a rule.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EntryNode {
    /// Whether the node is the root of the tree
    root: bool,
    /// The match to run for this node
    entry: Match,
    /// Nodes to evaluate when `entry` matches
    children: Vec<EntryNode>,
    /// MIME type to set when `entry` matches
    mimetype: Option<String>,
    /// Creator code to set when `entry` matches
    apple: Option<String>,
    /// Modifier to apply on the strength of `entry` when it matches
    strength_mod: Option<StrengthMod>,
    /// File extensions to report when `entry` matches
    exts: HashSet<String>,
}
2491
/// Accumulator filled while visiting a tree of [`EntryNode`].
#[derive(Debug, Default)]
struct EntryNodeVisitor {
    /// Extensions collected from the visited nodes
    exts: HashSet<String>,
    /// Score accumulated over the visited nodes
    score: u64,
}
2497
2498impl EntryNodeVisitor {
2499 fn new() -> Self {
2500 Self {
2501 ..Default::default()
2502 }
2503 }
2504
2505 fn merge(&mut self, other: Self) {
2506 self.exts.extend(other.exts);
2507 self.score += other.score;
2508 }
2509}
2510
2511impl EntryNode {
2512 #[inline]
2513 fn update_visitor(&self, v: &mut EntryNodeVisitor, depth: usize) {
2514 for ext in self.exts.iter() {
2516 if !v.exts.contains(ext) {
2517 v.exts.insert(ext.clone());
2518 }
2519 }
2520
2521 if depth == 0 {
2523 v.score += self.entry.test_strength;
2524 }
2525
2526 v.score += self
2530 .children
2531 .iter()
2532 .map(|e| e.entry.test_strength)
2533 .min()
2534 .unwrap_or_default()
2535 / max(1, depth as u64);
2536 }
2537
2538 fn visit(
2539 &self,
2540 v: &mut EntryNodeVisitor,
2541 deps: &HashMap<String, DependencyRule>,
2542 marked: &mut HashSet<String>,
2543 depth: usize,
2544 ) -> Result<(), Error> {
2545 self.update_visitor(v, depth);
2547
2548 for c in self.children.iter() {
2550 if let Test::Use(_, ref name) = c.entry.test {
2551 if marked.contains(name) {
2552 continue;
2553 }
2554
2555 marked.insert(name.clone());
2556
2557 if let Some(r) = deps.get(name) {
2558 let dv = r.rule.visit_all_entries(deps, marked)?;
2559 v.merge(dv);
2560 } else {
2561 return Err(Error::MissingRule(name.clone()));
2562 }
2563 } else {
2564 c.visit(v, deps, marked, depth + 1)?;
2565 }
2566 }
2567
2568 Ok(())
2569 }
2570
2571 #[inline]
2574 #[allow(clippy::too_many_arguments)]
2575 fn matches<'r, D: DataRead>(
2576 &'r self,
2577 opt_source: Option<&str>,
2578 magic: &mut Magic<'r>,
2579 state: &mut MatchState,
2580 stream_kind: StreamKind,
2581 buf_base_offset: Option<u64>,
2582 rule_base_offset: Option<u64>,
2583 last_level_offset: Option<u64>,
2584 haystack: &mut D,
2585 db: &'r MagicDb,
2586 switch_endianness: bool,
2587 depth: usize,
2588 ) -> Result<u64, Error> {
2589 let mut nmatch = 0u64;
2590
2591 let (ok, opt_match_res) = self.entry.matches(
2592 opt_source,
2593 magic,
2594 stream_kind,
2595 state,
2596 buf_base_offset,
2597 rule_base_offset,
2598 last_level_offset,
2599 haystack,
2600 switch_endianness,
2601 db,
2602 depth,
2603 )?;
2604
2605 let source = opt_source.unwrap_or("unknown");
2606 let line = self.entry.line;
2607
2608 if ok {
2609 if !self.entry.test.is_recursive()
2613 && let Some(msg) = self.entry.message.as_ref()
2614 && let Ok(msg) = msg.format_with(opt_match_res.as_ref()).inspect_err(|e| {
2615 debug!("source={source} line={line} failed to format message: {e}")
2616 })
2617 {
2618 nmatch = nmatch.saturating_add(1);
2619 magic.push_message(msg);
2620 }
2621
2622 if let Some(mr) = opt_match_res {
2624 match &self.entry.test {
2625 Test::String(t) if t.has_length_mod() => {
2626 let o = mr.end_offset();
2627 haystack.seek(SeekFrom::Start(o))?;
2628 }
2629 Test::Search(t) => {
2630 if t.re_mods.contains(ReMod::StartOffsetUpdate) {
2631 let o = mr.start_offset();
2632 haystack.seek(SeekFrom::Start(o))?;
2633 } else {
2634 let o = mr.end_offset();
2635 haystack.seek(SeekFrom::Start(o))?;
2636 }
2637 }
2638
2639 Test::Regex(t) => {
2640 if t.mods.contains(ReMod::StartOffsetUpdate) {
2641 let o = mr.start_offset();
2642 haystack.seek(SeekFrom::Start(o))?;
2643 } else {
2644 let o = mr.end_offset();
2645 haystack.seek(SeekFrom::Start(o))?;
2646 }
2647 }
2648 _ => {}
2650 }
2651 }
2652
2653 if let Some(mimetype) = self.mimetype.as_ref() {
2654 magic.set_mime_type(Cow::Borrowed(mimetype));
2655 }
2656
2657 if let Some(apple_ty) = self.apple.as_ref() {
2658 magic.set_creator_code(Cow::Borrowed(apple_ty));
2659 }
2660
2661 if !self.exts.is_empty() {
2662 magic.insert_extensions(self.exts.iter().map(|s| s.as_str()));
2663 }
2664
2665 let mut strength = self.entry.test_strength;
2669
2670 let continuation_level = self.entry.continuation_level().0 as u64;
2671 if self.entry.message.is_none() && continuation_level < 3 {
2672 strength = strength.saturating_add(continuation_level);
2673 }
2674
2675 if let Some(sm) = self.strength_mod.as_ref() {
2676 strength = sm.apply(strength);
2677 }
2678
2679 if self.entry.message.is_none() {
2681 strength += 1
2682 }
2683
2684 magic.update_strength(strength);
2685
2686 let end_upper_level = haystack.stream_position();
2687
2688 let rule_base_offset = if self.root {
2696 match self.entry.offset {
2697 Offset::Direct(DirOffset::End(o)) => {
2698 Some(haystack.offset_from_start(SeekFrom::End(o)))
2699 }
2700 _ => rule_base_offset,
2701 }
2702 } else {
2703 rule_base_offset
2704 };
2705
2706 for e in self.children.iter() {
2707 nmatch = nmatch.saturating_add(e.matches(
2708 opt_source,
2709 magic,
2710 state,
2711 stream_kind,
2712 buf_base_offset,
2713 rule_base_offset,
2714 Some(end_upper_level),
2715 haystack,
2716 db,
2717 switch_endianness,
2718 depth,
2719 )?);
2720 }
2721 }
2722
2723 Ok(nmatch)
2724 }
2725}
2726
/// A magic rule, made of a tree of entries.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicRule {
    /// Identifier of the rule
    id: usize,
    /// Source of the rule, if known
    source: Option<String>,
    /// Root of the tree of entries
    entries: EntryNode,
    /// Extensions of the rule, completed when the rule gets finalized
    extensions: HashSet<String>,
    /// Score of the rule, computed when the rule gets finalized
    score: u64,
    /// Whether the rule has been finalized
    finalized: bool,
}
2738
2739impl MagicRule {
2740 #[inline(always)]
2741 fn set_id(&mut self, id: usize) {
2742 self.id = id
2743 }
2744
2745 fn visit_all_entries(
2746 &self,
2747 deps: &HashMap<String, DependencyRule>,
2748 marked: &mut HashSet<String>,
2749 ) -> Result<EntryNodeVisitor, Error> {
2750 let mut v = EntryNodeVisitor::new();
2751 self.entries.visit(&mut v, deps, marked, 0)?;
2752 Ok(v)
2753 }
2754
2755 fn try_finalize(&mut self, deps: &HashMap<String, DependencyRule>) -> Result<(), Error> {
2758 if self.finalized {
2759 return Ok(());
2760 }
2761
2762 let v = self.visit_all_entries(deps, &mut HashSet::new())?;
2764
2765 self.extensions.extend(v.exts);
2766 self.score = v.score;
2767 self.finalized = true;
2768
2769 Ok(())
2770 }
2771
2772 #[inline]
2773 fn magic_entrypoint<'r, D: DataRead>(
2774 &'r self,
2775 magic: &mut Magic<'r>,
2776 stream_kind: StreamKind,
2777 haystack: &mut D,
2778 db: &'r MagicDb,
2779 switch_endianness: bool,
2780 depth: usize,
2781 ) -> Result<u64, Error> {
2782 self.entries.matches(
2783 self.source.as_deref(),
2784 magic,
2785 &mut MatchState::empty(),
2786 stream_kind,
2787 None,
2788 None,
2789 None,
2790 haystack,
2791 db,
2792 switch_endianness,
2793 depth,
2794 )
2795 }
2796
2797 #[inline]
2800 #[allow(clippy::too_many_arguments)]
2801 fn magic<'r, D: DataRead>(
2802 &'r self,
2803 magic: &mut Magic<'r>,
2804 stream_kind: StreamKind,
2805 buf_base_offset: Option<u64>,
2806 rule_base_offset: Option<u64>,
2807 haystack: &mut D,
2808 db: &'r MagicDb,
2809 switch_endianness: bool,
2810 depth: usize,
2811 ) -> Result<u64, Error> {
2812 self.entries.matches(
2813 self.source.as_deref(),
2814 magic,
2815 &mut MatchState::empty(),
2816 stream_kind,
2817 buf_base_offset,
2818 rule_base_offset,
2819 None,
2820 haystack,
2821 db,
2822 switch_endianness,
2823 depth,
2824 )
2825 }
2826
2827 pub fn is_text(&self) -> bool {
2833 self.entries.entry.test.is_text()
2834 && self.entries.children.iter().all(|e| e.entry.test.is_text())
2835 }
2836
2837 #[inline(always)]
2843 pub fn score(&self) -> u64 {
2844 self.score
2845 }
2846
2847 #[inline(always)]
2853 pub fn source(&self) -> Option<&str> {
2854 self.source.as_deref()
2855 }
2856
2857 #[inline(always)]
2863 pub fn line(&self) -> usize {
2864 self.entries.entry.line
2865 }
2866
2867 #[inline(always)]
2873 pub fn extensions(&self) -> &HashSet<String> {
2874 &self.extensions
2875 }
2876}
2877
/// A named rule, which other rules can reference by its name.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct DependencyRule {
    /// Name of the rule
    name: String,
    /// The rule itself
    rule: MagicRule,
}
2883
/// Set of rules obtained by parsing a magic file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MagicSource {
    /// Rules of the source
    rules: Vec<MagicRule>,
    /// Named rules of the source, indexed by their name
    dependencies: HashMap<String, DependencyRule>,
}
2894
impl MagicSource {
    /// Opens and parses the magic file located at path `p`.
    ///
    /// # Errors
    ///
    /// Whatever error `FileMagicParser::parse_file` returns.
    pub fn open<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
        FileMagicParser::parse_file(p)
    }
}
2909
/// Continuation level of a match, built from the depth of the match.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct ContinuationLevel(u8);
2912
/// Encoding of a text stream.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum TextEncoding {
    /// Reported as "ASCII"
    Ascii,
    /// Reported as "UTF-8"
    Utf8,
    /// Reported as "Unknown"
    Unknown,
}
2920
2921impl TextEncoding {
2922 const fn as_magic_str(&self) -> &'static str {
2923 match self {
2924 TextEncoding::Ascii => "ASCII",
2925 TextEncoding::Utf8 => "UTF-8",
2926 TextEncoding::Unknown => "Unknown",
2927 }
2928 }
2929}
2930
/// Kind of data a stream is made of.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum StreamKind {
    /// Binary data
    Binary,
    /// Text data, along with its encoding
    Text(TextEncoding),
}
2936
2937impl StreamKind {
2938 const fn is_text(&self) -> bool {
2939 matches!(self, StreamKind::Text(_))
2940 }
2941}
2942
/// State shared by the matches of a rule while it is being run.
#[derive(Debug)]
struct MatchState {
    /// One flag per possible continuation level (a `u8`), set once a match
    /// happened at that level
    continuation_levels: [bool; 256],
}
2947
2948impl MatchState {
2949 #[inline(always)]
2950 fn empty() -> Self {
2951 MatchState {
2952 continuation_levels: [false; 256],
2953 }
2954 }
2955
2956 #[inline(always)]
2957 fn get_continuation_level(&mut self, level: &ContinuationLevel) -> bool {
2958 self.continuation_levels
2959 .get(level.0 as usize)
2960 .cloned()
2961 .unwrap_or_default()
2962 }
2963
2964 #[inline(always)]
2965 fn set_continuation_level(&mut self, level: ContinuationLevel) {
2966 if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2967 *b = true
2968 }
2969 }
2970
2971 #[inline(always)]
2972 fn clear_continuation_level(&mut self, level: &ContinuationLevel) {
2973 if let Some(b) = self.continuation_levels.get_mut(level.0 as usize) {
2974 *b = false;
2975 }
2976 }
2977}
2978
/// Result of the identification of a stream.
#[derive(Debug, Default)]
pub struct Magic<'m> {
    /// Kind of the stream, if known
    stream_kind: Option<StreamKind>,
    /// Source of the magic, if known
    source: Option<Cow<'m, str>>,
    /// Parts of the message, joined by [`Magic::message`]
    message: Vec<Cow<'m, str>>,
    /// MIME type, if any was set
    mime_type: Option<Cow<'m, str>>,
    /// Creator code, if any was set
    creator_code: Option<Cow<'m, str>>,
    /// Sum of the strengths of the entries which matched
    strength: u64,
    /// File extensions
    exts: HashSet<Cow<'m, str>>,
    /// NOTE(review): presumably set when the magic comes from a default /
    /// fallback detection, to be confirmed against the code setting it
    is_default: bool,
}
2991
2992impl<'m> Magic<'m> {
2993 #[inline(always)]
2994 fn set_source(&mut self, source: Option<&'m str>) {
2995 self.source = source.map(Cow::Borrowed);
2996 }
2997
2998 #[inline(always)]
2999 fn set_stream_kind(&mut self, stream_kind: StreamKind) {
3000 self.stream_kind = Some(stream_kind)
3001 }
3002
3003 #[inline(always)]
3004 fn reset(&mut self) {
3005 self.stream_kind = None;
3006 self.source = None;
3007 self.message.clear();
3008 self.mime_type = None;
3009 self.creator_code = None;
3010 self.strength = 0;
3011 self.exts.clear();
3012 self.is_default = false;
3013 }
3014
3015 #[inline]
3023 pub fn into_owned<'owned>(self) -> Magic<'owned> {
3024 Magic {
3025 stream_kind: self.stream_kind,
3026 source: self.source.map(|s| Cow::Owned(s.into_owned())),
3027 message: self
3028 .message
3029 .into_iter()
3030 .map(Cow::into_owned)
3031 .map(Cow::Owned)
3032 .collect(),
3033 mime_type: self.mime_type.map(|m| Cow::Owned(m.into_owned())),
3034 creator_code: self.creator_code.map(|m| Cow::Owned(m.into_owned())),
3035 strength: self.strength,
3036 exts: self
3037 .exts
3038 .into_iter()
3039 .map(|e| Cow::Owned(e.into_owned()))
3040 .collect(),
3041 is_default: self.is_default,
3042 }
3043 }
3044
3045 #[inline(always)]
3051 pub fn message(&self) -> String {
3052 let mut out = String::new();
3053 for (i, m) in self.message.iter().enumerate() {
3054 if let Some(s) = m.strip_prefix(r#"\b"#) {
3055 out.push_str(s);
3056 } else {
3057 if i > 0 {
3059 out.push(' ');
3060 }
3061 out.push_str(m);
3062 }
3063 }
3064 out
3065 }
3066
3067 #[inline]
3078 pub fn message_parts(&self) -> impl Iterator<Item = &str> {
3079 self.message.iter().map(|p| p.as_ref())
3080 }
3081
3082 #[inline(always)]
3083 fn update_strength(&mut self, value: u64) {
3084 self.strength = self.strength.saturating_add(value);
3085 debug!("updated strength = {:?}", self.strength)
3086 }
3087
3088 #[inline(always)]
3094 pub fn mime_type(&self) -> &str {
3095 self.mime_type.as_deref().unwrap_or(match self.stream_kind {
3096 Some(StreamKind::Text(_)) => DEFAULT_TEXT_MIMETYPE,
3097 Some(StreamKind::Binary) | None => DEFAULT_BIN_MIMETYPE,
3098 })
3099 }
3100
3101 #[inline(always)]
3102 fn push_message<'a: 'm>(&mut self, msg: Cow<'a, str>) {
3103 if !msg.is_empty() {
3104 debug!("pushing message: msg={msg} len={}", msg.len());
3105 self.message.push(msg);
3106 }
3107 }
3108
3109 #[inline(always)]
3110 fn set_mime_type<'a: 'm>(&mut self, mime: Cow<'a, str>) {
3111 if self.mime_type.is_none() {
3112 debug!("insert mime: {:?}", mime);
3113 self.mime_type = Some(mime)
3114 }
3115 }
3116
3117 #[inline(always)]
3118 fn set_creator_code<'a: 'm>(&mut self, apple_ty: Cow<'a, str>) {
3119 if self.creator_code.is_none() {
3120 debug!("insert apple type: {apple_ty:?}");
3121 self.creator_code = Some(apple_ty)
3122 }
3123 }
3124
3125 #[inline(always)]
3126 fn insert_extensions<'a: 'm, I: Iterator<Item = &'a str>>(&mut self, exts: I) {
3127 if self.exts.is_empty() {
3128 self.exts.extend(exts.filter_map(|e| {
3129 if e.is_empty() {
3130 None
3131 } else {
3132 Some(Cow::Borrowed(e))
3133 }
3134 }));
3135 }
3136 }
3137
3138 #[inline(always)]
3146 pub fn strength(&self) -> u64 {
3147 self.strength
3148 }
3149
3150 #[inline(always)]
3156 pub fn source(&self) -> Option<&str> {
3157 self.source.as_deref()
3158 }
3159
3160 #[inline(always)]
3166 pub fn creator_code(&self) -> Option<&str> {
3167 self.creator_code.as_deref()
3168 }
3169
3170 #[inline(always)]
3176 pub fn extensions(&self) -> &HashSet<Cow<'m, str>> {
3177 &self.exts
3178 }
3179
3180 #[inline(always)]
3186 pub fn is_default(&self) -> bool {
3187 self.is_default
3188 }
3189}
3190
/// Database of magic rules that streams are matched against.
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct MagicDb {
    // Next identifier handed out by `next_rule_id`.
    rule_id: usize,
    // Loaded rules; re-sorted by `try_finalize`.
    rules: Vec<MagicRule>,
    // Named dependency rules passed to each rule when it is finalized.
    dependencies: HashMap<String, DependencyRule>,
    // Number of rules counted as finalized so far.
    finalized: usize,
}
3199
/// Heuristically decides whether `bytes` looks like text.
///
/// Returns `false` for empty input or as soon as a NUL byte is seen.
/// Otherwise tab, LF, CR and `0x20..=0x7E` count as printable and every other
/// byte counts as non-printable; the slice is considered text when more than
/// 85% of its bytes are printable and fewer than 20% are not.
#[inline(always)]
fn is_likely_text(bytes: &[u8]) -> bool {
    if bytes.is_empty() {
        return false;
    }

    let mut printable = 0usize;
    let mut other = 0usize;

    for &byte in bytes {
        match byte {
            0x00 => return false,
            b'\t' | b'\n' | b'\r' | 0x20..=0x7E => printable += 1,
            _ => other += 1,
        }
    }

    let total = bytes.len() as f64;
    let printable_ratio = printable as f64 / total;
    let other_ratio = other as f64 / total;

    printable_ratio > 0.85 && other_ratio < 0.20
}
3242
3243#[inline(always)]
3244fn guess_stream_kind<S: AsRef<[u8]>>(stream: S) -> StreamKind {
3245 let buf = stream.as_ref();
3246
3247 match run_utf8_validation(buf) {
3248 Ok(is_ascii) => {
3249 if is_ascii {
3250 StreamKind::Text(TextEncoding::Ascii)
3251 } else {
3252 StreamKind::Text(TextEncoding::Utf8)
3253 }
3254 }
3255 Err(e) => {
3256 if is_likely_text(&buf[e.valid_up_to..]) {
3257 StreamKind::Text(TextEncoding::Unknown)
3258 } else {
3259 StreamKind::Binary
3260 }
3261 }
3262 }
3263}
3264
3265impl MagicDb {
    /// Creates an empty database (equivalent to `MagicDb::default()`).
    pub fn new() -> Self {
        Self::default()
    }
3274
3275 #[inline(always)]
3276 fn next_rule_id(&mut self) -> usize {
3277 let t = self.rule_id;
3278 self.rule_id += 1;
3279 t
3280 }
3281
3282 #[inline(always)]
3283 fn try_json<D: DataRead>(
3284 haystack: &mut D,
3285 stream_kind: StreamKind,
3286 magic: &mut Magic,
3287 ) -> Result<bool, Error> {
3288 if matches!(stream_kind, StreamKind::Binary) {
3290 return Ok(false);
3291 }
3292
3293 let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?.trim_ascii();
3294
3295 let Some((start, end)) = find_json_boundaries(buf) else {
3296 return Ok(false);
3297 };
3298
3299 for c in buf[0..start].iter() {
3302 if !c.is_ascii_whitespace() {
3303 return Ok(false);
3304 }
3305 }
3306
3307 let mut is_ndjson = false;
3308
3309 trace!("maybe a json document");
3310 let ok = serde_json::from_slice::<serde_json::Value>(&buf[start..=end]).is_ok();
3311 if !ok {
3312 return Ok(false);
3313 }
3314
3315 if end + 1 < buf.len() {
3317 let buf = &buf[end + 1..];
3319 if let Some((second_start, second_end)) = find_json_boundaries(buf) {
3320 if memchr(b'\n', &buf[..second_start]).is_some() {
3322 trace!("might be ndjson");
3323 is_ndjson = serde_json::from_slice::<serde_json::Value>(
3324 &buf[second_start..=second_end],
3325 )
3326 .is_ok();
3327 }
3328 }
3329 }
3330
3331 if is_ndjson {
3332 magic.push_message(Cow::Borrowed("New Line Delimited"));
3333 magic.set_mime_type(Cow::Borrowed("application/x-ndjson"));
3334 magic.insert_extensions(["ndjson", "jsonl"].into_iter());
3335 } else {
3336 magic.set_mime_type(Cow::Borrowed("application/json"));
3337 magic.insert_extensions(["json"].into_iter());
3338 }
3339
3340 magic.push_message(Cow::Borrowed("JSON text data"));
3341 magic.set_source(Some(HARDCODED_SOURCE));
3342 magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3343 Ok(true)
3344 }
3345
3346 #[inline(always)]
3347 fn try_csv<D: DataRead>(
3348 haystack: &mut D,
3349 stream_kind: StreamKind,
3350 magic: &mut Magic,
3351 ) -> Result<bool, Error> {
3352 let StreamKind::Text(enc) = stream_kind else {
3354 return Ok(false);
3355 };
3356
3357 let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3358 let mut reader = csv::Reader::from_reader(io::Cursor::new(buf));
3359 let mut records = reader.records();
3360
3361 let Some(Ok(first)) = records.next() else {
3362 return Ok(false);
3363 };
3364
3365 if first.len() <= 1 {
3369 return Ok(false);
3370 }
3371
3372 let mut n = 1;
3374 for i in records.take(9) {
3375 if let Ok(rec) = i {
3376 if first.len() != rec.len() {
3377 return Ok(false);
3378 }
3379 } else {
3380 return Ok(false);
3381 }
3382 n += 1;
3383 }
3384
3385 if n != 10 {
3387 return Ok(false);
3388 }
3389
3390 magic.set_mime_type(Cow::Borrowed("text/csv"));
3391 magic.push_message(Cow::Borrowed("CSV"));
3392 magic.push_message(Cow::Borrowed(enc.as_magic_str()));
3393 magic.push_message(Cow::Borrowed("text"));
3394 magic.insert_extensions(["csv"].into_iter());
3395 magic.set_source(Some(HARDCODED_SOURCE));
3396 magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3397 Ok(true)
3398 }
3399
3400 #[inline(always)]
3401 fn try_tar<D: DataRead>(
3402 haystack: &mut D,
3403 stream_kind: StreamKind,
3404 magic: &mut Magic,
3405 ) -> Result<bool, Error> {
3406 if !matches!(stream_kind, StreamKind::Binary) {
3408 return Ok(false);
3409 }
3410
3411 let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
3412 let mut ar = Archive::new(io::Cursor::new(buf));
3413
3414 let Ok(mut entries) = ar.entries() else {
3415 return Ok(false);
3416 };
3417
3418 let Some(Ok(first)) = entries.next() else {
3419 return Ok(false);
3420 };
3421
3422 let header = first.header();
3423
3424 if header.as_ustar().is_some() {
3425 magic.push_message(Cow::Borrowed("POSIX tar archive"));
3426 } else if header.as_gnu().is_some() {
3427 magic.push_message(Cow::Borrowed("POSIX tar archive (GNU)"));
3428 } else {
3429 magic.push_message(Cow::Borrowed("tar archive"));
3430 }
3431
3432 magic.set_mime_type(Cow::Borrowed("application/x-tar"));
3433 magic.set_source(Some(HARDCODED_SOURCE));
3434 magic.update_strength(HARDCODED_MAGIC_STRENGTH);
3435 magic.insert_extensions(["tar"].into_iter());
3436 Ok(true)
3437 }
3438
3439 #[inline(always)]
3440 fn try_hard_magic<D: DataRead>(
3441 haystack: &mut D,
3442 stream_kind: StreamKind,
3443 magic: &mut Magic,
3444 ) -> Result<bool, Error> {
3445 Ok(Self::try_json(haystack, stream_kind, magic)?
3446 || Self::try_csv(haystack, stream_kind, magic)?
3447 || Self::try_tar(haystack, stream_kind, magic)?)
3448 }
3449
3450 #[inline(always)]
3451 fn magic_default<'m, D: DataRead>(
3452 cache: &mut D,
3453 stream_kind: StreamKind,
3454 magic: &mut Magic<'m>,
3455 ) {
3456 magic.set_source(Some(HARDCODED_SOURCE));
3457 magic.set_stream_kind(stream_kind);
3458 magic.is_default = true;
3459
3460 if cache.data_size() == 0 {
3461 magic.push_message(Cow::Borrowed("empty"));
3462 magic.set_mime_type(Cow::Borrowed(DEFAULT_BIN_MIMETYPE));
3463 }
3464
3465 match stream_kind {
3466 StreamKind::Binary => {
3467 magic.push_message(Cow::Borrowed("data"));
3468 }
3469 StreamKind::Text(e) => {
3470 magic.push_message(Cow::Borrowed(e.as_magic_str()));
3471 magic.push_message(Cow::Borrowed("text"));
3472 }
3473 }
3474 }
3475
3476 fn load_rules_no_prepare(&mut self, rules: Vec<MagicRule>) {
3477 for rule in rules.into_iter() {
3478 let mut rule = rule;
3479 rule.set_id(self.next_rule_id());
3480
3481 self.rules.push(rule);
3482 }
3483 }
3484
3485 pub fn load(&mut self, ms: MagicSource) -> &mut Self {
3491 self.load_rules_no_prepare(ms.rules);
3492 self.dependencies.extend(ms.dependencies);
3493 self.try_finalize();
3494 self
3495 }
3496
3497 pub fn load_bulk<I: Iterator<Item = MagicSource>>(&mut self, it: I) -> &mut Self {
3502 for ms in it {
3503 self.load_rules_no_prepare(ms.rules);
3504 self.dependencies.extend(ms.dependencies);
3505 }
3506 self.try_finalize();
3507 self
3508 }
3509
    /// Returns the loaded rules in their current order.
    pub fn rules(&self) -> &[MagicRule] {
        &self.rules
    }
3518
3519 #[inline]
3520 fn first_magic_with_stream_kind<D: DataRead>(
3521 &self,
3522 haystack: &mut D,
3523 stream_kind: StreamKind,
3524 extension: Option<&str>,
3525 ) -> Result<Magic<'_>, Error> {
3526 let mut magic = Magic::default();
3528
3529 if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3530 return Ok(magic);
3531 }
3532
3533 let mut marked = vec![false; self.rules.len()];
3534
3535 macro_rules! do_magic {
3536 ($rule: expr) => {{
3537 $rule.magic_entrypoint(&mut magic, stream_kind, haystack, &self, false, 0)?;
3538
3539 if !magic.message.is_empty() {
3540 magic.set_stream_kind(stream_kind);
3541 magic.set_source($rule.source.as_deref());
3542 return Ok(magic);
3543 }
3544
3545 magic.reset();
3546 }};
3547 }
3548
3549 if let Some(ext) = extension.map(|e| e.to_lowercase())
3550 && !ext.is_empty()
3551 {
3552 for rule in self.rules.iter().filter(|r| r.extensions.contains(&ext)) {
3553 do_magic!(rule);
3554 if let Some(f) = marked.get_mut(rule.id) {
3555 *f = true
3556 }
3557 }
3558 }
3559
3560 for rule in self
3561 .rules
3562 .iter()
3563 .filter(|r| !*marked.get(r.id).unwrap_or(&false))
3565 {
3566 do_magic!(rule)
3567 }
3568
3569 Self::magic_default(haystack, stream_kind, &mut magic);
3570
3571 Ok(magic)
3572 }
3573
3574 pub fn first_magic<R: DataRead>(
3602 &self,
3603 r: &mut R,
3604 extension: Option<&str>,
3605 ) -> Result<Magic<'_>, Error> {
3606 let stream_kind = guess_stream_kind(r.read_range(0..FILE_BYTES_MAX as u64)?);
3607 self.first_magic_with_stream_kind(r, stream_kind, extension)
3608 }
3609
3610 pub fn first_magic_file<P: AsRef<Path>>(&self, path: P) -> Result<Magic<'_>, Error> {
3620 let ext = path.as_ref().extension().and_then(|e| e.to_str());
3621 self.first_magic(&mut DataReader::from_file(File::open(path.as_ref())?)?, ext)
3622 }
3623
3624 pub fn first_magic_slice<S: AsRef<[u8]>>(
3632 &self,
3633 s: S,
3634 extension: Option<&str>,
3635 ) -> Result<Magic<'_>, Error> {
3636 self.first_magic(&mut DataReader::from_slice(s.as_ref()), extension)
3637 }
3638
3639 #[inline(always)]
3640 fn all_magics_sort_with_stream_kind<R: DataRead>(
3641 &self,
3642 haystack: &mut R,
3643 stream_kind: StreamKind,
3644 ) -> Result<Vec<Magic<'_>>, Error> {
3645 let mut out = Vec::new();
3646
3647 let mut magic = Magic::default();
3648
3649 if Self::try_hard_magic(haystack, stream_kind, &mut magic)? {
3650 out.push(magic);
3651 magic = Magic::default();
3652 }
3653
3654 for rule in self.rules.iter() {
3655 rule.magic_entrypoint(&mut magic, stream_kind, haystack, self, false, 0)?;
3656
3657 if !magic.message.is_empty() {
3659 magic.set_stream_kind(stream_kind);
3660 magic.set_source(rule.source.as_deref());
3661 out.push(magic);
3662 magic = Magic::default();
3663 }
3664
3665 magic.reset();
3666 }
3667
3668 Self::magic_default(haystack, stream_kind, &mut magic);
3669 out.push(magic);
3670
3671 out.sort_by_key(|b| std::cmp::Reverse(b.strength()));
3672
3673 Ok(out)
3674 }
3675
3676 #[inline]
3691 pub fn all_magics<R: DataRead>(&self, r: &mut R) -> Result<Vec<Magic<'_>>, Error> {
3692 let stream_kind = guess_stream_kind(r.read_range(0..FILE_BYTES_MAX as u64)?);
3693 self.all_magics_sort_with_stream_kind(r, stream_kind)
3694 }
3695
3696 pub fn all_magics_file<P: AsRef<Path>>(&self, path: P) -> Result<Vec<Magic<'_>>, Error> {
3705 self.all_magics(&mut DataReader::from_file(File::open(path)?)?)
3706 }
3707
3708 pub fn all_magics_slice<S: AsRef<[u8]>>(&self, slice: S) -> Result<Vec<Magic<'_>>, Error> {
3717 self.all_magics(&mut DataReader::from_slice(slice.as_ref()))
3718 }
3719
3720 #[inline(always)]
3721 fn best_magic_with_stream_kind<R: DataRead>(
3722 &self,
3723 reader: &mut R,
3724 stream_kind: StreamKind,
3725 ) -> Result<Magic<'_>, Error> {
3726 let magics = self.all_magics_sort_with_stream_kind(reader, stream_kind)?;
3727
3728 Ok(magics.into_iter().next().unwrap_or_else(|| {
3731 let mut magic = Magic::default();
3732 Self::magic_default(reader, stream_kind, &mut magic);
3733 magic
3734 }))
3735 }
3736
3737 #[inline]
3752 pub fn best_magic<R: DataRead>(&self, r: &mut R) -> Result<Magic<'_>, Error> {
3753 let stream_kind = guess_stream_kind(r.read_range(0..FILE_BYTES_MAX as u64)?);
3754 self.best_magic_with_stream_kind(r, stream_kind)
3755 }
3756
3757 pub fn best_magic_file<P: AsRef<Path>>(&self, path: P) -> Result<Magic<'_>, Error> {
3766 self.best_magic(&mut DataReader::from_file(File::open(path)?)?)
3767 }
3768
3769 pub fn best_magic_slice<S: AsRef<[u8]>>(&self, slice: S) -> Result<Magic<'_>, Error> {
3778 self.best_magic(&mut DataReader::from_slice(slice.as_ref()))
3779 }
3780
3781 pub fn serialize<W: Write>(self, w: &mut W) -> Result<(), Error> {
3787 let mut encoder = GzEncoder::new(w, Compression::best());
3788
3789 bincode::serde::encode_into_std_write(&self, &mut encoder, bincode::config::standard())?;
3790 encoder.finish()?;
3791 Ok(())
3792 }
3793
3794 pub fn deserialize<R: Read>(r: &mut R) -> Result<Self, Error> {
3804 let mut buf = vec![];
3805 let mut gz = GzDecoder::new(r);
3806 gz.read_to_end(&mut buf).map_err(|e| {
3807 bincode::error::DecodeError::OtherString(format!("failed to read: {e}"))
3808 })?;
3809 let (sdb, _): (MagicDb, usize) =
3810 bincode::serde::decode_from_slice(&buf, bincode::config::standard())?;
3811 Ok(sdb)
3812 }
3813
3814 pub fn verify(&mut self) -> Result<(), Error> {
3821 if self.rules.len() == self.finalized {
3822 return Ok(());
3823 }
3824
3825 for r in self.rules.iter_mut().filter(|r| !r.finalized) {
3826 r.try_finalize(&self.dependencies).map_err(|e| {
3828 Error::Verify(
3829 r.source.clone().unwrap_or(String::from("unknown")),
3830 r.line(),
3831 e.into(),
3832 )
3833 })?;
3834 self.finalized += 1;
3835 }
3836
3837 debug_assert!(self.finalized <= self.rules.len());
3838
3839 Ok(())
3840 }
3841
3842 #[inline(always)]
3843 fn try_finalize(&mut self) {
3844 if self.rules.len() == self.finalized {
3845 return;
3846 }
3847
3848 let mut finalized = 0usize;
3849 self.rules.iter_mut().for_each(|r| {
3850 if r.try_finalize(&self.dependencies).is_ok() {
3851 finalized += 1;
3852 }
3853 });
3854
3855 self.finalized = finalized;
3856
3857 debug_assert!(self.finalized <= self.rules.len());
3858
3859 self.rules.sort_by_key(|r| (r.is_text(), -(r.score as i64)));
3861 }
3862}
3863
3864#[cfg(test)]
3865mod tests {
3866
3867 use regex::bytes::Regex;
3868
3869 use crate::{readers::BufReader, utils::unix_local_time_to_string};
3870
3871 use super::*;
3872
    // Builds a `BufReader` over the bytes of a string literal.
    macro_rules! buf_reader {
        ($l: literal) => {
            BufReader::from_slice($l.as_bytes())
        };
    }
3878
3879 fn first_magic(
3880 rule: &str,
3881 content: &[u8],
3882 stream_kind: StreamKind,
3883 ) -> Result<Magic<'static>, Error> {
3884 let mut md = MagicDb::new();
3885 md.load(
3886 FileMagicParser::parse_str(rule, None)
3887 .inspect_err(|e| eprintln!("{e}"))
3888 .unwrap(),
3889 );
3890 let mut reader = BufReader::from_slice(content);
3891 let v = md.best_magic_with_stream_kind(&mut reader, stream_kind)?;
3892 Ok(v.into_owned())
3893 }
3894
    // Debugging aid: installs a TRACE-level `tracing_subscriber` formatter.
    // `try_init` is used, so a failed installation is returned as an error
    // (which this macro discards) rather than panicking.
    #[allow(unused_macros)]
    macro_rules! enable_trace {
        () => {
            tracing_subscriber::fmt()
                .with_max_level(tracing_subscriber::filter::LevelFilter::TRACE)
                .try_init();
        };
    }
3904
    // Parses a rule literal, printing the error and panicking on failure.
    macro_rules! parse_assert {
        ($rule:literal) => {
            FileMagicParser::parse_str($rule, None)
                .inspect_err(|e| eprintln!("{e}"))
                .unwrap()
        };
    }
3912
    // Runs `first_magic` with `StreamKind::Binary`.
    // Two-argument form: only requires that evaluation succeeds.
    // Three-argument form: additionally compares the rendered message.
    macro_rules! assert_magic_match_bin {
        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Binary).unwrap() }};
        ($rule: literal, $content:literal, $message:expr) => {{
            assert_eq!(
                first_magic($rule, $content, StreamKind::Binary)
                    .unwrap()
                    .message(),
                $message
            );
        }};
    }
3924
    // Runs `first_magic` with `StreamKind::Text(TextEncoding::Utf8)`.
    // Two-argument form: only requires that evaluation succeeds.
    // Three-argument form: additionally compares the rendered message.
    macro_rules! assert_magic_match_text {
        ($rule: literal, $content:literal) => {{ first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8)).unwrap() }};
        ($rule: literal, $content:literal, $message:expr) => {{
            assert_eq!(
                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
                    .unwrap()
                    .message(),
                $message
            );
        }};
    }
3936
    // Asserts that, for UTF-8 text input, the rule does not match: the best
    // result must be the default description.
    macro_rules! assert_magic_not_match_text {
        ($rule: literal, $content:literal) => {{
            assert!(
                first_magic($rule, $content, StreamKind::Text(TextEncoding::Utf8))
                    .unwrap()
                    .is_default()
            );
        }};
    }
3946
    // Asserts that, for binary input, the rule does not match: the best
    // result must be the default description.
    macro_rules! assert_magic_not_match_bin {
        ($rule: literal, $content:literal) => {{
            assert!(
                first_magic($rule, $content, StreamKind::Binary)
                    .unwrap()
                    .is_default()
            );
        }};
    }
3956
    // Exercises `regex` rules on text and binary input, including follow-up
    // entries at a relative (`&0`) offset and the `/s` modifier.
    #[test]
    fn test_regex() {
        assert_magic_match_text!(
            r#"
0 regex/1024 \^#![[:space:]]*/usr/bin/env[[:space:]]+
!:mime text/x-shellscript
>&0 regex/64 .*($|\\b) %s shell script text executable
 "#,
            br#"#!/usr/bin/env bash
 echo hello world"#,
            "bash shell script text executable"
        );

        // Sanity check that the regex crate matches raw bytes with `(?-u)`.
        let re = Regex::new(r"(?-u)\x42\x82").unwrap();
        assert!(re.is_match(b"\x42\x82"));

        assert_magic_match_bin!(
            r#"0 regex \x42\x82 binary regex match"#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82"
        );

        assert_magic_match_bin!(
            r#"
 0 regex \x42\x82
 >&0 string \xde\xad\xbe\xef it works
 "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        assert_magic_match_bin!(
            r#"
 0 regex/s \x42\x82
 >&0 string \x42\x82\xde\xad\xbe\xef it works
 "#,
            b"\x00\x00\x00\x00\x00\x00\x42\x82\xde\xad\xbe\xef"
        );

        assert_magic_match_text!(
            r#"
0 regex/1024 \^HelloWorld$ HelloWorld String"#,
            br#"
// this is a comment after an empty line
HelloWorld
 "#
        );
    }
4006
    // Exercises `string` rules with the `/w`, `/C`, `/c`, `/f` and `/W`
    // modifiers, with one matching and (where present) one non-matching input
    // per modifier.
    #[test]
    fn test_string_with_mods() {
        assert_magic_match_text!(
            r#"0 string/w #!\ \ \ /usr/bin/env\ bash BASH
 "#,
            b"#! /usr/bin/env bash i
 echo hello world"
        );

        assert_magic_match_text!(
            r#"0 string/C HelloWorld it works
 "#,
            b"helloworld"
        );

        assert_magic_not_match_text!(
            r#"0 string/C HelloWorld it works
 "#,
            b"hELLOwORLD"
        );

        assert_magic_match_text!(
            r#"0 string/c HelloWorld it works
 "#,
            b"HELLOWORLD"
        );

        assert_magic_not_match_text!(
            r#"0 string/c HelloWorld it works
 "#,
            b"helloworld"
        );

        assert_magic_match_text!(
            r#"0 string/f #!/usr/bin/env\ bash BASH
 "#,
            b"#!/usr/bin/env bash"
        );

        assert_magic_not_match_text!(
            r#"0 string/f #!/usr/bin/python PYTHON"#,
            b"#!/usr/bin/pythonic"
        );

        assert_magic_match_text!(
            r#"0 string/W #!/usr/bin/env\ python PYTHON"#,
            b"#!/usr/bin/env python"
        );

        assert_magic_not_match_text!(
            r#"0 string/W #!/usr/bin/env\ \ python PYTHON"#,
            b"#!/usr/bin/env python"
        );
    }
4065
    // Exercises `search` rules with modifiers. The last two cases differ only
    // in `/s`: with it, the relative follow-up `string` entry matches the
    // searched text itself; without it, it does not.
    #[test]
    fn test_search_with_mods() {
        assert_magic_match_text!(
            r#"0 search/1/fwt #!\ /usr/bin/luatex LuaTex script text executable"#,
            b"#! /usr/bin/luatex "
        );

        assert_magic_match_text!(
            r#"
 0 search/s /usr/bin/env
 >&0 string /usr/bin/env it works
 "#,
            b"#!/usr/bin/env python"
        );

        assert_magic_not_match_text!(
            r#"
 0 search /usr/bin/env
 >&0 string /usr/bin/env it works
 "#,
            b"#!/usr/bin/env python"
        );
    }
4090
    // Exercises `pstring` rules with one-, two- and four-byte length
    // prefixes in both byte orders (`H`/`h`, `L`/`l`), with and without the
    // `J` modifier.
    #[test]
    fn test_pstring() {
        assert_magic_match_bin!(r#"0 pstring Toast it works"#, b"\x05Toast");

        assert_magic_match_bin!(r#"0 pstring Toast %s"#, b"\x05Toast", "Toast");

        assert_magic_not_match_bin!(r#"0 pstring Toast Doesn't work"#, b"\x07Toaster");

        assert_magic_match_bin!(r#"0 pstring/H Toast it works"#, b"\x00\x05Toast");

        assert_magic_match_bin!(r#"0 pstring/HJ Toast it works"#, b"\x00\x07Toast");

        assert_magic_match_bin!(r#"0 pstring/HJ Toast %s"#, b"\x00\x07Toast", "Toast");

        assert_magic_match_bin!(r#"0 pstring/h Toast it works"#, b"\x05\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/hJ Toast it works"#, b"\x07\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/L Toast it works"#, b"\x00\x00\x00\x05Toast");

        assert_magic_match_bin!(r#"0 pstring/LJ Toast it works"#, b"\x00\x00\x00\x09Toast");

        assert_magic_match_bin!(r#"0 pstring/l Toast it works"#, b"\x05\x00\x00\x00Toast");

        assert_magic_match_bin!(r#"0 pstring/lJ Toast it works"#, b"\x09\x00\x00\x00Toast");
    }
4118
    // A rule that is `indirect` at offset 0 must fail with
    // `Error::MaximumRecursion(MAX_RECURSION)` rather than recurse forever.
    #[test]
    fn test_max_recursion() {
        let res = first_magic(
            r#"0 indirect x"#,
            b"#! /usr/bin/luatex ",
            StreamKind::Binary,
        );
        assert!(res.is_err());
        let _ = res.inspect_err(|e| {
            assert!(matches!(
                e.unwrap_localized(),
                Error::MaximumRecursion(MAX_RECURSION)
            ))
        });
    }
4134
    // Exercises the comparison operators (`!`, `>`, `<`) and the `/b`
    // modifier of `string` rules.
    #[test]
    fn test_string_ops() {
        assert_magic_match_text!("0 string/b MZ MZ File", b"MZ\0");
        assert_magic_match_text!("0 string !MZ Not MZ File", b"AZ\0");
        assert_magic_match_text!("0 string >\0 Any String", b"A\0");
        assert_magic_match_text!("0 string >Test Any String", b"Test 1\0");
        assert_magic_match_text!("0 string <Test Any String", b"\0");
        assert_magic_not_match_text!("0 string >Test Any String", b"\0");
    }
4144
    // Exercises `lestring16`: exact match, `%s` formatting, rejection of
    // big-endian data and matching at a non-zero offset.
    #[test]
    fn test_lestring16() {
        assert_magic_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        assert_magic_match_bin!(
            "0 lestring16 x %s",
            b"\x61\x00\x62\x00\x63\x00\x64\x00\x00",
            "abcd"
        );
        assert_magic_not_match_bin!(
            "0 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        assert_magic_match_bin!(
            "4 lestring16 abcd Little-endian UTF-16 string",
            b"\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64\x00"
        );
    }
4165
    // Exercises `bestring16`: exact match, `%s` formatting, rejection of
    // little-endian data and matching at a non-zero offset.
    #[test]
    fn test_bestring16() {
        assert_magic_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x61\x00\x62\x00\x63\x00\x64"
        );
        assert_magic_match_bin!(
            "0 bestring16 x %s",
            b"\x00\x61\x00\x62\x00\x63\x00\x64",
            "abcd"
        );
        assert_magic_not_match_bin!(
            "0 bestring16 abcd Big-endian UTF-16 string",
            b"\x61\x00\x62\x00\x63\x00\x64\x00"
        );
        assert_magic_match_bin!(
            "4 bestring16 abcd Big-endian UTF-16 string",
            b"\x00\x00\x00\x00\x00\x61\x00\x62\x00\x63\x00\x64"
        );
    }
4186
    // Negative offsets are resolved from the end of the data.
    #[test]
    fn test_offset_from_end() {
        assert_magic_match_bin!("-1 ubyte 0x42 last byte ok", b"\x00\x00\x42");
        assert_magic_match_bin!("-2 ubyte 0x41 last byte ok", b"\x00\x41\x00");
    }
4192
    // Chained `&0` relative offsets across three nesting levels.
    #[test]
    fn test_relative_offset() {
        assert_magic_match_bin!(
            "
 0 ubyte 0x42
 >&0 ubyte 0x00
 >>&0 ubyte 0x41 third byte ok
 ",
            b"\x42\x00\x41\x00"
        );
    }
4204
    // Indirect offsets: plain, with a constant added, and with a nested
    // indirect operand.
    #[test]
    fn test_indirect_offset() {
        assert_magic_match_bin!("(0.l) ubyte 0x42 it works", b"\x04\x00\x00\x00\x42");
        assert_magic_match_bin!("(0.l+3) ubyte 0x42 it works", b"\x01\x00\x00\x00\x42");
        assert_magic_match_bin!(
            "(0.l+(4)) ubyte 0x42 it works",
            b"\x04\x00\x00\x00\x04\x00\x00\x00\x42"
        );
    }
4216
    // The message on a `use` entry comes first, followed by the message of
    // the named rule it refers to.
    #[test]
    fn test_use_with_message() {
        assert_magic_match_bin!(
            r#"
0 string MZ
>0 use mz first match

0 name mz then second match
>0 string MZ
"#,
            b"MZ\0",
            "first match then second match"
        );
    }
4231
    // Arithmetic and bitwise transforms applied to the value read before the
    // comparison. Division or modulo by zero must be rejected by the parser.
    #[test]
    fn test_scalar_transform() {
        assert_magic_match_bin!("0 ubyte+1 0x1 add works", b"\x00");
        assert_magic_match_bin!("0 ubyte-1 0xfe sub works", b"\xff");
        assert_magic_match_bin!("0 ubyte%2 0 mod works", b"\x0a");
        assert_magic_match_bin!("0 ubyte&0x0f 0x0f bitand works", b"\xff");
        assert_magic_match_bin!("0 ubyte|0x0f 0xff bitor works", b"\xf0");
        assert_magic_match_bin!("0 ubyte^0x0f 0xf0 bitxor works", b"\xff");

        FileMagicParser::parse_str("0 ubyte%0 mod by zero", None)
            .expect_err("expect div by zero error");
        FileMagicParser::parse_str("0 ubyte/0 div by zero", None)
            .expect_err("expect div by zero error");
    }
4246
    // Exercises `belong` with every comparison operator (`=` implied, `<`,
    // `>`, `&`, `^`, `~`, `x`), each with a matching and, where present, a
    // non-matching input.
    #[test]
    fn test_belong() {
        assert_magic_match_bin!("0 belong 0x12345678 Big-endian long", b"\x12\x34\x56\x78");
        assert_magic_not_match_bin!("0 belong 0x12345678 Big-endian long", b"\x78\x56\x34\x12");
        assert_magic_match_bin!(
            "4 belong 0x12345678 Big-endian long",
            b"\x00\x00\x00\x00\x12\x34\x56\x78"
        );
        assert_magic_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x77");
        assert_magic_not_match_bin!("0 belong <0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        assert_magic_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x79");
        assert_magic_not_match_bin!("0 belong >0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        assert_magic_match_bin!("0 belong &0x5678 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong &0x0000FFFF Big-endian long", b"\x12\x34\x56\x78");

        assert_magic_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x00\x56\x78");
        assert_magic_not_match_bin!("0 belong ^0xFFFF0000 Big-endian long", b"\x00\x01\x56\x78");

        assert_magic_match_bin!("0 belong ~0x12345678 Big-endian long", b"\xed\xcb\xa9\x87");
        assert_magic_not_match_bin!("0 belong ~0x12345678 Big-endian long", b"\x12\x34\x56\x78");

        assert_magic_match_bin!("0 belong x Big-endian long", b"\x12\x34\x56\x78");
        assert_magic_match_bin!("0 belong x Big-endian long", b"\x78\x56\x34\x12");
    }
4282
    // `search` must parse without modifiers and with the range and flag
    // modifiers given in either order.
    #[test]
    fn test_parse_search() {
        parse_assert!("0 search test");
        parse_assert!("0 search/24/s test");
        parse_assert!("0 search/s/24 test");
    }
4289
    // `bedate`: 4-byte big-endian timestamp; the `%s` case expects the fixed
    // string "2000-01-01 00:00:00".
    #[test]
    fn test_bedate() {
        assert_magic_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 bedate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 bedate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
    // `beldate`: 4-byte big-endian timestamp; the formatted case expects the
    // output of `unix_local_time_to_string`.
    #[test]
    fn test_beldate() {
        assert_magic_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x38\x6D\x43\x80"
        );
        assert_magic_not_match_bin!(
            "0 beldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 beldate 946684800 {}",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            unix_local_time_to_string(946684800)
        );
    }
4323
    // `beqdate`: 8-byte big-endian timestamp; the `%s` case expects the fixed
    // string "2000-01-01 00:00:00".
    #[test]
    fn test_beqdate() {
        assert_magic_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80"
        );

        assert_magic_not_match_bin!(
            "0 beqdate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "0 beqdate 946684800 %s",
            b"\x00\x00\x00\x00\x38\x6D\x43\x80",
            "2000-01-01 00:00:00"
        );
    }
4342
    // `medate`: 4-byte middle-endian timestamp; the `%s` case expects the
    // fixed string "2000-01-01 00:00:00".
    #[test]
    fn test_medate() {
        assert_magic_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );

        assert_magic_not_match_bin!(
            "0 medate 946684800 Unix date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 medate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            "2000-01-01 00:00:00"
        );
    }
4361
    // `meldate`: 4-byte middle-endian timestamp; the formatted case expects
    // the output of `unix_local_time_to_string`.
    #[test]
    fn test_meldate() {
        assert_magic_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x6D\x38\x80\x43"
        );
        assert_magic_not_match_bin!(
            "0 meldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );

        assert_magic_match_bin!(
            "4 meldate 946684800 %s",
            b"\x00\x00\x00\x00\x6D\x38\x80\x43",
            unix_local_time_to_string(946684800)
        );
    }
4379
    // `date`: 4-byte timestamp; the test data is laid out least-significant
    // byte first. The formatted case expects the fixed string
    // "2000-01-01 00:00:00".
    #[test]
    fn test_date() {
        assert_magic_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 date 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 date 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            "2000-01-01 00:00:00"
        );
    }
4396
    // `leldate`: 4-byte little-endian timestamp; the formatted case expects
    // the output of `unix_local_time_to_string`.
    #[test]
    fn test_leldate() {
        assert_magic_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x80\x43\x6D\x38"
        );
        assert_magic_not_match_bin!(
            "0 leldate 946684800 Local date (Jan 1, 2000)",
            b"\x00\x00\x00\x00"
        );
        assert_magic_match_bin!(
            "4 leldate 946684800 {}",
            b"\x00\x00\x00\x00\x80\x43\x6D\x38",
            unix_local_time_to_string(946684800)
        );
    }
4413
#[test]
// `leqdate`: an 8-byte little-endian timestamp; 1577836800 is expected to be
// printed as "2020-01-01 00:00:00".
fn test_leqdate() {
    // Value present at offset 0.
    assert_magic_match_bin!(
        "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
        b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
    );

    // An all-zero buffer must not satisfy the rule.
    assert_magic_not_match_bin!(
        "0 leqdate 1577836800 Unix date (Jan 1, 2020)",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );

    // Value behind eight padding bytes, rendered through `%s`.
    assert_magic_match_bin!(
        "8 leqdate 1577836800 %s",
        b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
        "2020-01-01 00:00:00"
    );
}
4431
#[test]
// `leqldate`: same 8-byte fixture as the `leqdate` test; the printed form is
// compared against `unix_local_time_to_string`.
fn test_leqldate() {
    // Value present at offset 0.
    assert_magic_match_bin!(
        "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
        b"\x00\xe1\x0b\x5E\x00\x00\x00\x00"
    );

    // An all-zero buffer must not satisfy the rule.
    assert_magic_not_match_bin!(
        "0 leqldate 1577836800 Unix date (Jan 1, 2020)",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );

    // Value behind eight padding bytes, rendered through `%s`.
    assert_magic_match_bin!(
        "8 leqldate 1577836800 %s",
        b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xE1\x0B\x5E\x00\x00\x00\x00",
        unix_local_time_to_string(1577836800)
    );
}
4449
#[test]
// `melong`: exercises every comparison operator against a middle-endian
// 32-bit value. The reference fixture 34 12 78 56 is expected to equal
// 0x12345678.
fn test_melong() {
    // `=`: exact equality.
    assert_magic_match_bin!(
        "0 melong =0x12345678 Middle-endian long",
        b"\x34\x12\x78\x56"
    );
    assert_magic_not_match_bin!(
        "0 melong =0x12345678 Middle-endian long",
        b"\x00\x00\x00\x00"
    );

    // `<`: strictly less than; an equal value must be rejected.
    assert_magic_match_bin!(
        "0 melong <0x12345678 Middle-endian long",
        b"\x34\x12\x78\x55"
    );
    assert_magic_not_match_bin!(
        "0 melong <0x12345678 Middle-endian long",
        b"\x34\x12\x78\x56"
    );

    // `>`: strictly greater than; an equal value must be rejected.
    assert_magic_match_bin!(
        "0 melong >0x12345678 Middle-endian long",
        b"\x34\x12\x78\x57"
    );
    assert_magic_not_match_bin!(
        "0 melong >0x12345678 Middle-endian long",
        b"\x34\x12\x78\x56"
    );

    // `&`: every bit of the test value has to be set in the data.
    assert_magic_match_bin!("0 melong &0x5678 Middle-endian long", b"\xab\xcd\x78\x56");
    assert_magic_not_match_bin!(
        "0 melong &0x0000FFFF Middle-endian long",
        b"\x34\x12\x78\x56"
    );

    // `^`: every bit of the test value has to be clear in the data.
    assert_magic_match_bin!(
        "0 melong ^0xFFFF0000 Middle-endian long",
        b"\x00\x00\x78\x56"
    );
    assert_magic_not_match_bin!(
        "0 melong ^0xFFFF0000 Middle-endian long",
        b"\x00\x01\x78\x56"
    );

    // `~`: the data has to equal the bitwise complement of the test value.
    assert_magic_match_bin!(
        "0 melong ~0x12345678 Middle-endian long",
        b"\xCB\xED\x87\xA9"
    );
    assert_magic_not_match_bin!(
        "0 melong ~0x12345678 Middle-endian long",
        b"\x34\x12\x78\x56"
    );

    // `x`: matches whatever the content is.
    assert_magic_match_bin!("0 melong x Middle-endian long", b"\x34\x12\x78\x56");
    assert_magic_match_bin!("0 melong x Middle-endian long", b"\x00\x00\x00\x00");
}
4513
#[test]
// `uquad`: exercises every comparison operator against an unsigned 64-bit
// value. The reference fixture F0 DE BC 9A 78 56 34 12 is expected to equal
// 0x123456789ABCDEF0.
// NOTE(review): the fixture is laid out least-significant byte first; confirm
// whether `uquad` reads in host byte order before running on big-endian targets.
fn test_uquad() {
    // `=`: exact equality.
    assert_magic_match_bin!(
        "0 uquad =0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );
    assert_magic_not_match_bin!(
        "0 uquad =0x123456789ABCDEF0 Unsigned quad",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );

    // `<`: strictly less than; an equal value must be rejected.
    assert_magic_match_bin!(
        "0 uquad <0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x11"
    );
    assert_magic_not_match_bin!(
        "0 uquad <0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `>`: strictly greater than; an equal value must be rejected.
    assert_magic_match_bin!(
        "0 uquad >0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x13"
    );
    assert_magic_not_match_bin!(
        "0 uquad >0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `&`: every bit of the test value has to be set in the data.
    assert_magic_match_bin!(
        "0 uquad &0xF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );
    assert_magic_not_match_bin!(
        "0 uquad &0xFF Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `^`: every bit of the test value has to be clear in the data.
    assert_magic_match_bin!(
        "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );
    assert_magic_not_match_bin!(
        "0 uquad ^0xFFFFFFFFFFFFFFFF Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `~`: the data has to equal the bitwise complement of the test value.
    assert_magic_match_bin!(
        "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
        b"\x0F\x21\x43\x65\x87\xA9\xCB\xED"
    );
    assert_magic_not_match_bin!(
        "0 uquad ~0x123456789ABCDEF0 Unsigned quad",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12"
    );

    // `x`: matches whatever the content is; also checks `{:#x}` rendering.
    assert_magic_match_bin!(
        "0 uquad x {:#x}",
        b"\xF0\xDE\xBC\x9A\x78\x56\x34\x12",
        "0x123456789abcdef0"
    );
    assert_magic_match_bin!(
        "0 uquad x Unsigned quad",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );
}
4587
#[test]
// `guid`: a 16-byte buffer is compared against a textual GUID and printed
// back through `%s`.
fn test_guid() {
    // The textual GUID lists the buffer's bytes in order.
    assert_magic_match_bin!(
        "0 guid EC959539-6786-2D4E-8FDB-98814CE76C1E It works",
        b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E"
    );

    // Unrelated content must not match.
    assert_magic_not_match_bin!(
        "0 guid 399595EC-8667-4E2D-8FDB-98814CE76C1E It works",
        b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
    );

    // `x` accepts any value; the rendered text is the upper-case GUID.
    assert_magic_match_bin!(
        "0 guid x %s",
        b"\xEC\x95\x95\x39\x67\x86\x2D\x4E\x8F\xDB\x98\x81\x4C\xE7\x6C\x1E",
        "EC959539-6786-2D4E-8FDB-98814CE76C1E"
    );
}
4606
#[test]
// `ubeqdate`: an 8-byte big-endian timestamp; 1633046400 is expected to be
// printed as "2021-10-01 00:00:00".
fn test_ubeqdate() {
    // Exact value comparison.
    assert_magic_match_bin!(
        "0 ubeqdate 1633046400 It works",
        b"\x00\x00\x00\x00\x61\x56\x4f\x80"
    );

    // `x` accepts any value; `%s` prints the formatted date.
    assert_magic_match_bin!(
        "0 ubeqdate x %s",
        b"\x00\x00\x00\x00\x61\x56\x4f\x80",
        "2021-10-01 00:00:00"
    );

    // An all-zero buffer must not satisfy the rule.
    assert_magic_not_match_bin!(
        "0 ubeqdate 1633046400 It should not work",
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    );
}
4625
#[test]
// `ldate`: the bytes 60 D4 C8 61 are expected to match 1640551520; the
// printed form is compared against `unix_local_time_to_string`.
fn test_ldate() {
    // Exact value comparison.
    assert_magic_match_bin!("0 ldate 1640551520 It works", b"\x60\xd4\xC8\x61");

    // An all-zero buffer must not satisfy the rule.
    assert_magic_not_match_bin!("0 ldate 1633046400 It should not work", b"\x00\x00\x00\x00");

    // `x` accepts any value; `%s` prints the formatted date.
    assert_magic_match_bin!(
        "0 ldate x %s",
        b"\x60\xd4\xC8\x61",
        unix_local_time_to_string(1640551520)
    );
}
4638
#[test]
// Integer transforms: the byte 0x14 (20) is divided by, or reduced modulo, 10
// before being compared and printed.
fn test_scalar_with_transform() {
    // 20 / 10 == 2, compared against the literal 2.
    assert_magic_match_bin!("0 ubyte/10 2 {}", b"\x14", "2");

    // Same division, with `x` accepting any result.
    assert_magic_match_bin!("0 ubyte/10 x {}", b"\x14", "2");

    // 20 % 10 == 0.
    assert_magic_match_bin!("0 ubyte%10 x {}", b"\x14", "0");
}
4645
#[test]
// Float transforms: the little-endian float 00 00 A0 41 (20.0) is divided by,
// or reduced modulo, 10 before being compared and printed.
fn test_float_with_transform() {
    // 20.0 / 10 is expected to print as "2".
    assert_magic_match_bin!("0 lefloat/10 2 {}", b"\x00\x00\xa0\x41", "2");

    // Same division, with `x` accepting any result.
    assert_magic_match_bin!("0 lefloat/10 x {}", b"\x00\x00\xa0\x41", "2");

    // 20.0 % 10 is expected to print as "0".
    assert_magic_match_bin!("0 lefloat%10 x {}", b"\x00\x00\xa0\x41", "0");
}
4652
#[test]
// Pins the behaviour of `read_octal_u64` on well-formed, partially valid and
// invalid inputs.
fn test_read_octal() {
    // Well-formed octal literals, all starting with `0`.
    assert_eq!(read_octal_u64(&mut buf_reader!("0")), Some(0));
    assert_eq!(read_octal_u64(&mut buf_reader!("00")), Some(0));
    assert_eq!(read_octal_u64(&mut buf_reader!("01")), Some(1));
    assert_eq!(read_octal_u64(&mut buf_reader!("07")), Some(7));
    assert_eq!(read_octal_u64(&mut buf_reader!("010")), Some(8));
    assert_eq!(read_octal_u64(&mut buf_reader!("0123")), Some(83));
    assert_eq!(read_octal_u64(&mut buf_reader!("0755")), Some(493));

    // Trailing non-octal characters: only the leading octal digits count.
    assert_eq!(read_octal_u64(&mut buf_reader!("0ABC")), Some(0));
    assert_eq!(read_octal_u64(&mut buf_reader!("01ABC")), Some(1));
    assert_eq!(read_octal_u64(&mut buf_reader!("0755ABC")), Some(493));
    assert_eq!(read_octal_u64(&mut buf_reader!("0123ABC")), Some(83));

    // The digit 8 is not octal, so it ends the number as well.
    assert_eq!(read_octal_u64(&mut buf_reader!("08")), Some(0));
    assert_eq!(read_octal_u64(&mut buf_reader!("01238")), Some(83));

    // Inputs that do not start with `0` are rejected.
    assert_eq!(read_octal_u64(&mut buf_reader!("123")), None);
    assert_eq!(read_octal_u64(&mut buf_reader!("755")), None);

    // Empty input.
    assert_eq!(read_octal_u64(&mut buf_reader!("")), None);

    // Inputs starting with a non-octal character.
    assert_eq!(read_octal_u64(&mut buf_reader!("ABC")), None);
    assert_eq!(read_octal_u64(&mut buf_reader!("8ABC")), None);

    // A larger value.
    assert_eq!(
        read_octal_u64(&mut buf_reader!("01777777777")),
        Some(268435455)
    );
}
4691
#[test]
// Offset regression: a `use` reached through an indirect offset calls a named
// rule which itself calls a second named rule that uses an indirect offset.
// The three messages are expected to chain into "Bread is Toasted twice".
fn test_offset_bug_1() {
    assert_magic_match_bin!(
        r"
1 string TEST Bread is
# offset computation is relative to
# rule start
>(5.b) use toasted

0 name toasted
>0 string twice Toasted
>>0 use toasted_twice

0 name toasted_twice
>(6.b) string x %s
        ",
        b"\x00TEST\x06twice\x00\x06",
        "Bread is Toasted twice"
    );
}
4714
#[test]
// Offset regression: same rule chain as `test_offset_bug_1`, but the top-level
// rule uses a negative offset (`-12`), presumably resolved from the end of the
// 13-byte buffer.
//
// The last rule must print the matched string, so its message has to be the
// complete `%s` conversion: a bare `%` cannot produce the "twice" that the
// expected output requires.
fn test_offset_bug_2() {
    assert_magic_match_bin!(
        r"
-12 string TEST Bread is
>(4.b) use toasted

0 name toasted
>0 string twice Toasted
>>0 use toasted_twice

0 name toasted_twice
>(6.b) string x %s
        ",
        b"\x00TEST\x06twice\x00\x06",
        "Bread is Toasted twice"
    );
}
4740
#[test]
// Offset regression: an `indirect/r` jump re-runs the top-level rules, one of
// which calls a named rule with `use`. The messages are expected to chain into
// "Bread is Toasted twice".
fn test_offset_bug_3() {
    assert_magic_match_bin!(
        r"
1 string TEST Bread is
>(5.b) indirect/r x

0 string twice Toasted
>0 use toasted_twice

0 name toasted_twice
>0 string x %s
        ",
        b"\x00TEST\x06twice\x00\x08",
        "Bread is Toasted twice"
    );
}
4760
#[test]
// Offset regression: an `indirect/r` jump followed by a `use` through an
// indirect offset; each level prints the string it matched. The expected
// output is "Bread is Toasted twice".
fn test_offset_bug_4() {
    assert_magic_match_bin!(
        r"
1 string Bread %s
>(6.b) indirect/r x

# this one uses a based offset
# computed at indirection
1 string is\ Toasted %s
>(11.b) use toasted_twice

# this one is using a new base
# offset being previous base
# offset + offset of use
0 name toasted_twice
>0 string x %s
        ",
        b"\x00Bread\x06is Toasted\x0ctwice\x00",
        "Bread is Toasted twice"
    );
}
4785
#[test]
// Offset regression: inside a named rule reached via `indirect/r` + `use`, a
// relative offset (`&1`) must locate the byte 0x08 after the matched string.
// The expected output is "Bread is Toasted twice".
fn test_offset_bug_5() {
    assert_magic_match_bin!(
        r"
1 string TEST Bread is
>(5.b) indirect/r x

0 string twice Toasted
>0 use toasted_twice

0 name toasted_twice
>0 string twice
>>&1 byte 0x08 twice
        ",
        b"\x00TEST\x06twice\x00\x08",
        "Bread is Toasted twice"
    );
}
4804
#[test]
// Regression: a `default` entry nested under a `use` must not contribute its
// message here; the expected output is just "Bread is toasted".
fn test_bug_6() {
    assert_magic_match_bin!(
        r"
1 string TEST Bread is toasted
>&0 use toasted
>>&0 default x but not burnt

0 name toasted
>1 string toasted
        ",
        b"\x00TEST\x06toasted",
        "Bread is toasted"
    );
}
4823
#[test]
// Offset regression: two nested `use` calls, both reached through indirect
// offsets, with the innermost rule reading its string at offset 1. The
// expected output is "Bread is Toasted twice".
fn test_offset_bug_7() {
    assert_magic_match_bin!(
        r"
1 string TEST Bread is
# offset computation is relative to
# rule start
>(5.b) use toasted

0 name toasted
>0 string toast Toasted
>>(6.b) use toasted_twice

0 name toasted_twice
>1 string x %s
        ",
        b"\x00TEST\x06toast\x00\x06twice\x00",
        "Bread is Toasted twice"
    );
}
4847
#[test]
// The parts yielded by `message_parts()` must include "python" (compared
// case-insensitively) for a shebang rule whose message is "PYTHON".
fn test_message_parts() {
    let magic = first_magic(
        r#"0 string/W #!/usr/bin/env\ python PYTHON"#,
        b"#!/usr/bin/env python",
        StreamKind::Text(TextEncoding::Ascii),
    )
    .unwrap();

    let has_python = magic
        .message_parts()
        .any(|part| part.eq_ignore_ascii_case("python"));
    assert!(has_python);
}
4859
#[test]
// Bulk-loading several parsed rule sets into a fresh database must leave it
// in a state that passes `verify()`.
fn test_load_bulk() {
    let parsed = vec![
        parse_assert!("0 search test"),
        parse_assert!("0 search/24/s test"),
        parse_assert!("0 search/s/24 test"),
    ];

    let mut magic_db = MagicDb::new();
    magic_db.load_bulk(parsed.into_iter());

    magic_db.verify().unwrap();
}
4873
#[test]
// A rule that `use`s a name never defined must make `verify()` fail with
// `Error::Verify` after a bulk load.
fn test_load_bulk_failure() {
    let parsed = vec![parse_assert!(
        r#"
0 search/s/24 test
>0 use test
"#
    )];

    let mut magic_db = MagicDb::new();
    magic_db.load_bulk(parsed.into_iter());

    let outcome = magic_db.verify();
    assert!(matches!(outcome, Err(Error::Verify(..))));
}
4888}