Skip to main content

patterns/
pattern.rs

1use core::{
2    marker::PhantomData,
3    num::IntErrorKind,
4    ops::Not,
5    simd::{Mask, Simd},
6    str::FromStr,
7};
8
9use crate::{const_utils, Scanner, VUNKNOWN as DEFAULT_BYTES};
10
11/// A prepared pattern. Allows to search for a given byte sequence in data.
12/// Supports masking and alignment requirements.
13///
14/// `BYTES` determines the LANES size.
15/// Every block of data is processed in chunks of `BYTES` bytes.
16/// Rust will compile this to other targets without issue, but will use
17/// inner loops for that.
18/// It is also the max length for patterns.
19#[derive(Clone, Debug)]
20#[must_use]
21pub struct Pattern<const ALIGNMENT: usize = 1, const BYTES: usize = DEFAULT_BYTES> {
22    /// raw pattern
23    pub(crate) bytes: Simd<u8, BYTES>,
24    /// pattern mask
25    pub(crate) mask: Mask<i8, BYTES>,
26    /// preprocessed chunk of `self.bytes` of size ALIGNMENT,
27    /// repeated `BYTES / ALIGNMENT` times
28    ///
29    /// first aligned chunk that has the fewest wildcards
30    pub(crate) first_bytes: Simd<u8, BYTES>,
31    /// mask for `self.first_bytes`
32    ///
33    /// first bytes mask is inverted
34    /// x & mask == mask === x | ^mask == -1
35    pub(crate) first_bytes_mask: Mask<i8, BYTES>,
36    /// offset of `self.first_bytes` from the start of the pattern
37    pub(crate) first_byte_offset: u8,
38    /// length of the pattern
39    pub(crate) length: u8,
40    phantom: PhantomData<[u8; ALIGNMENT]>,
41}
42
43impl<const ALIGNMENT: usize, const BYTES: usize> Pattern<ALIGNMENT, BYTES> {
44    const _ALIGNED: () = {
45        if ALIGNMENT > BYTES {
46            panic!("Pattern ALIGNMENT must be less or equal to BYTES");
47        }
48    };
49
50    /// Parse a pattern. Use [`Pattern::from_str`] to return an error instead of
51    /// panicking.
52    ///
53    /// # Panics
54    /// Panics if [`ParsePatternError`] is returned.
55    #[inline]
56    pub const fn new(pattern: &str) -> Self {
57        match Self::from_str(pattern) {
58            Ok(p) => p,
59            Err(ParsePatternError::PatternTooLong) => panic!("PatternTooLong"),
60            Err(ParsePatternError::InvalidHexNumber(..)) => panic!("InvalidHexNumber"),
61            Err(ParsePatternError::MissingNonWildcardByte) => panic!("MissingNonWildcardByte"),
62        }
63    }
64
65    /// Create a pattern from a byte slice and a mask.
66    /// Byte slices longer than `BYTES` are cut short.
67    /// Mask expects a [`u64`] bitencoding. A 0 bit marks the byte as wildcard.
68    /// Mask is trimmed to `bytes.len()`.
69    ///
70    /// # Panics
71    /// Panics when all bytes are masked as wildcards.
72    pub fn from_slice(bytes: &[u8], mask: u64) -> Self {
73        #[expect(
74            path_statements,
75            reason = "This forces evaluation of the const and emits a compile time error."
76        )]
77        Self::_ALIGNED;
78        let mut input: [u8; BYTES] = [0; BYTES];
79        let length = bytes.len().min(BYTES);
80        input[..length].copy_from_slice(bytes);
81        let mask = u64::MAX.checked_shr(length as u32).unwrap_or(0).not() & mask;
82        let bytes = Simd::<u8, BYTES>::from_array(input);
83        let mask = Mask::<i8, BYTES>::from_bitmask(mask.reverse_bits());
84        let mask_array = mask.to_simd();
85        let mask_array = mask_array.as_array().as_slice();
86
87        let first_byte_offset = find_first_byte_offset::<ALIGNMENT>(mask_array).unwrap();
88
89        let (first_bytes, first_bytes_mask) = fill_first_bytes::<ALIGNMENT, BYTES>(
90            &input[first_byte_offset..],
91            &mask_array[first_byte_offset..],
92        );
93
94        Self {
95            bytes,
96            mask,
97            first_bytes,
98            first_bytes_mask,
99            first_byte_offset: first_byte_offset as _,
100            length: length as _,
101            phantom: PhantomData,
102        }
103    }
104
105    /// Parses a string to a pattern.
106    ///
107    /// Expected format:
108    /// ```text
109    /// ? 1A . B4 ?? e0 .. 5f
110    /// ```
111    /// `?` and `.` mark wildcards. Only the first character is checked for
112    /// wildcards. `10 .error 00` is equivalent to `10 ? 00`.
113    pub const fn from_str(s: &str) -> Result<Self, ParsePatternError> {
114        #[expect(
115            path_statements,
116            reason = "This forces evaluation of the const and emits a compile time error."
117        )]
118        Self::_ALIGNED;
119        let bytes = const_utils::SplitAsciiWhitespace::new(s);
120
121        let length = bytes.clone().count();
122        if length > BYTES {
123            return Err(ParsePatternError::PatternTooLong);
124        }
125
126        let (buffer, mask) = {
127            let mut buffer = [0_u8; BYTES];
128            let mut mask = [0_i8; BYTES];
129            let mut index = 0;
130            let mut bytes = bytes;
131
132            loop {
133                let byte;
134                (bytes, byte) = bytes.next();
135                let byte = match byte {
136                    Some(b) => b,
137                    None => break,
138                };
139
140                if !const_utils::is_wildcard(byte) {
141                    let parsed = match const_utils::hex_to_u8(byte) {
142                        Ok(parsed) => parsed,
143                        Err(e) => return Err(ParsePatternError::InvalidHexNumber(e)),
144                    };
145                    buffer[index] = parsed;
146                    mask[index] = -1;
147                }
148
149                index += 1;
150            }
151
152            (buffer, mask)
153        };
154
155        let first_byte_offset = match find_first_byte_offset::<ALIGNMENT>(&mask) {
156            Ok(offset) => offset,
157            Err(e) => return Err(e),
158        };
159
160        let (_, chunk) = buffer.split_at(first_byte_offset);
161        let (_, mask_chunk) = mask.split_at(first_byte_offset);
162        let (first_bytes, first_bytes_mask) =
163            fill_first_bytes::<ALIGNMENT, BYTES>(chunk, mask_chunk);
164
165        // There is no const way to create a Mask
166        // # Safety: Mask is defined as repr transparent over Simd<T>
167        //
168        // transmute doesn't accept types with sizes depending on BYTES
169        let mask = Simd::from_array(mask);
170        let mask = unsafe { *(&mask as *const _ as *const _) };
171
172        Ok(Self {
173            bytes: Simd::<u8, BYTES>::from_array(buffer),
174            mask,
175            first_bytes,
176            first_bytes_mask,
177            first_byte_offset: first_byte_offset as _,
178            length: length as _,
179            phantom: PhantomData,
180        })
181    }
182
183    /// Creates an [`Iterator`] over data.
184    #[inline]
185    pub fn matches<'pattern, 'data>(
186        &'pattern self,
187        data: &'data [u8],
188    ) -> Scanner<'pattern, 'data, ALIGNMENT, BYTES> {
189        Scanner::new(self, data)
190    }
191}
192
193const fn find_first_byte_offset<const ALIGNMENT: usize>(
194    mut mask: &[i8],
195) -> Result<usize, ParsePatternError> {
196    let mut i = 0;
197    let mut smallest = 0;
198    let mut highest_count = 0;
199    loop {
200        if mask.len() < ALIGNMENT {
201            break;
202        }
203        let chunk;
204        (chunk, mask) = mask.split_at(ALIGNMENT);
205
206        let mut j = 0;
207        let mut count = 0;
208        while j < chunk.len() {
209            count += (chunk[j] != 0) as usize;
210            j += 1;
211        }
212
213        let chunk_count = count;
214        if chunk_count > highest_count {
215            highest_count = chunk_count;
216            smallest = i;
217        }
218
219        i += 1;
220    }
221
222    if highest_count == 0 {
223        Err(ParsePatternError::MissingNonWildcardByte)
224    } else {
225        Ok(smallest * ALIGNMENT)
226    }
227}
228
229const fn fill_first_bytes<const ALIGNMENT: usize, const BYTES: usize>(
230    chunk: &[u8],
231    mask: &[i8],
232) -> (Simd<u8, BYTES>, Mask<i8, BYTES>) {
233    let mut first = [0u8; BYTES];
234    let mut first_mask = [0i8; BYTES];
235
236    let mut i = 0;
237
238    while i < BYTES / ALIGNMENT {
239        let mut j = 0;
240        while j < ALIGNMENT {
241            first[i * ALIGNMENT + j] = chunk[j];
242            first_mask[i * ALIGNMENT + j] = !mask[j];
243            j += 1;
244        }
245        i += 1;
246    }
247
248    let bytes = Simd::from_array(first);
249    // There is no const way to create a Mask
250    // # Safety: Mask is defined as repr transparent over Simd<T>
251    //
252    // transmute doesn't accept types with sizes depending on BYTES
253    let mask = Simd::from_array(first_mask);
254    let mask = unsafe { *(&mask as *const _ as *const _) };
255
256    (bytes, mask)
257}
258
259impl<const ALIGNMENT: usize, const BYTES: usize> FromStr for Pattern<ALIGNMENT, BYTES> {
260    type Err = ParsePatternError;
261
262    #[inline]
263    fn from_str(s: &str) -> Result<Self, Self::Err> {
264        Self::from_str(s)
265    }
266}
267
268#[derive(Debug, Clone, PartialEq, Eq)]
269#[non_exhaustive]
270pub enum ParsePatternError {
271    PatternTooLong,
272    InvalidHexNumber(IntErrorKind),
273    MissingNonWildcardByte,
274}
275
276impl From<IntErrorKind> for ParsePatternError {
277    #[inline]
278    fn from(value: IntErrorKind) -> Self {
279        Self::InvalidHexNumber(value)
280    }
281}