vectorscan/
expression.rs

1/* Copyright 2022-2023 Danny McClanahan */
2/* SPDX-License-Identifier: BSD-3-Clause */
3
4//! FFI wrappers for different types of pattern strings.
5//!
6//! Vectorscan supports 3 distinct types of pattern strings which can be formed
7//! to produce a database:
8//! - [`Expression`]: Vectorscan PCRE-like regex syntax (null-terminated
9//!   [`CString`]).
10//! - [`Literal`]: Literal byte string (`Vec<u8>`) which may contain nulls.
11//! - [`chimera::ChimeraExpression`]: PCRE regex syntax.
12//!
13//! Each vectorscan database only supports matching against *exactly one* type
14//! of these patterns, but each pattern string variant also has a `*Set` form,
15//! and all of these forms support the same interface to vectorscan's most
16//! powerful feature: multi-pattern matching, where patterns registered with
17//! [`ExprId`] in a set can be associated to
18//! [`ExpressionIndex`](crate::matchers::ExpressionIndex) instances when matched
19//! against.
20//!
21//! Creating instances of these structs performs no pattern compilation itself,
22//! which is instead performed in a subsequent step by e.g.
23//! [`Database::compile()`]. References to these structs can be reused multiple
24//! times to create multiple databases without re-allocating the underlying
25//! pattern string data:
26//!
27//!```
28//! # #[allow(unused_variables)]
29//! # fn main() -> Result<(), vectorscan::error::VectorscanError> {
30//! use vectorscan::{expression::*, flags::*};
31//!
32//! let a: Expression = "a+".parse()?;
33//! let b: Expression = "b+".parse()?;
34//! let c: Expression = "c+".parse()?;
35//!
36//! let ab_db = ExpressionSet::from_exprs([&a, &b]).compile(Mode::BLOCK)?;
37//! let bc_db = ExpressionSet::from_exprs([&b, &c]).compile(Mode::BLOCK)?;
38//! let ca_db = ExpressionSet::from_exprs([&c, &a]).compile(Mode::BLOCK)?;
39//! # Ok(())
40//! # }
41//! ```
42
43use crate::{
44  database::Database,
45  error::{VectorscanCompileError, VectorscanRuntimeError},
46  flags::{ExtFlags, Flags, Mode},
47  hs,
48};
49
50use std::{
51  ffi::{CStr, CString},
52  fmt,
53  marker::PhantomData,
54  mem, ops,
55  os::raw::{c_char, c_uint, c_ulonglong},
56  ptr, slice, str,
57};
58
59/// Vectorscan regex pattern string.
60///
61/// Vectorscan itself supports a subset of PCRE syntax in the pattern string;
62/// see [Pattern Support] for reference. The use of unsupported constructs will
63/// result in compilation errors.
64///
65/// Note that as the underlying vectorscan library interprets pattern strings as
66/// null-terminated [`CStr`]s, null bytes are *not* supported within
67/// `Expression` strings. Use a [`Literal`] or [`LiteralSet`] database if you
68/// need to match against pattern strings containing explicit null bytes.
69///
70/// Instances can be created equivalently with [`Self::new()`] or
71/// [`str::parse()`] via the [`str::FromStr`] impl:
72///
73///```
74/// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
75/// use vectorscan::expression::Expression;
76///
77/// let e1: Expression = "asdf+".parse()?;
78/// let e2 = Expression::new("asdf+")?;
79/// assert_eq!(e1, e2);
80/// # Ok(())
81/// # }
82/// ```
83///
84/// [Pattern Support]: https://intel.github.io/hyperscan/dev-reference/compilation.html#pattern-support
// Newtype over an owned `CString`: the stored buffer always ends in a null
// terminator and, per `CString`'s construction rules, has no interior nulls.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Expression(CString);
87
88impl fmt::Display for Expression {
89  fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
90    let b = self.as_bytes();
91    match str::from_utf8(b) {
92      Ok(s) => write!(f, "{}", s),
93      Err(_) => write!(f, "(non-utf8: {:?})", b),
94    }
95  }
96}
97
98impl Expression {
99  /// Reference the underlying bytes, *without* the trailing null terminator.
100  ///
101  ///```
102  /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
103  /// let e = vectorscan::expression::Expression::new("asdf")?;
104  /// assert_eq!(e.as_bytes(), b"asdf");
105  /// # Ok(())
106  /// # }
107  /// ```
108  pub fn as_bytes(&self) -> &[u8] { self.0.as_bytes() }
109
110  pub(crate) fn as_ptr(&self) -> *const c_char { self.0.as_c_str().as_ptr() }
111
112  /// Produce a `NULL`-terminated C-style wrapper for the given pattern string.
113  ///
114  /// This will fail if the string contains any internal `NULL` bytes, as those
115  /// are not supported by the vectorscan regex compiler:
116  ///```
117  /// use vectorscan::{expression::*, error::*};
118  ///
119  /// let pat = "as\0df";
120  /// let e = match Expression::new(pat) {
121  ///    Err(VectorscanCompileError::NullByte(e)) => e,
122  ///    _ => unreachable!(),
123  /// };
124  /// assert_eq!(e.nul_position(), 2);
125  /// ```
126  pub fn new(x: impl Into<Vec<u8>>) -> Result<Self, VectorscanCompileError> {
127    Ok(Self(CString::new(x)?))
128  }
129
130  /// Utility function providing information about a regular expression. The
131  /// information provided in [`info::ExprInfo`] includes the minimum and
132  /// maximum width of a pattern match.
133  ///
134  /// Note: successful analysis of an expression with this function does not
135  /// imply that compilation of the same expression (via
136  /// [`Database::compile()`] or [`Database::compile_multi()`]) would succeed.
137  /// This function may return [`Ok`] for regular expressions that
138  /// Vectorscan cannot compile.
139  ///
140  /// Note: some per-pattern flags (such as [`Flags::ALLOWEMPTY`] and
141  /// [`Flags::SOM_LEFTMOST`]) are accepted by this call, but as they do not
142  /// affect the properties returned in the [`info::ExprInfo`] structure,
143  /// they will not affect the outcome of this function.
144  ///
145  ///```
146  /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
147  /// use vectorscan::{expression::{*, info::*}, flags::Flags};
148  ///
149  /// let expr: Expression = "(he)llo".parse()?;
150  ///
151  /// let info = expr.info(Flags::default())?;
152  ///
153  /// assert_eq!(info, ExprInfo {
154  ///   min_width: ExprWidth(5),
155  ///   max_width: Some(ExprWidth(5)),
156  ///   unordered_matches: UnorderedMatchBehavior::OnlyOrdered,
157  ///   matches_at_eod: MatchAtEndBehavior::WillNeverMatchAtEOD,
158  /// });
159  /// # Ok(())
160  /// # }
161  /// ```
162  pub fn info(&self, flags: Flags) -> Result<info::ExprInfo, VectorscanCompileError> {
163    let mut info = ptr::null_mut();
164    let mut compile_err = ptr::null_mut();
165    VectorscanRuntimeError::copy_from_native_compile_error(
166      unsafe {
167        hs::hs_expression_info(
168          self.as_ptr(),
169          flags.into_native(),
170          &mut info,
171          &mut compile_err,
172        )
173      },
174      compile_err,
175    )?;
176
177    let ret = info::ExprInfo::from_native(unsafe { *info });
178
179    unsafe {
180      crate::free_misc(info as *mut u8);
181    }
182
183    Ok(ret)
184  }
185
186  /// Utility function providing information about a regular expression, with
187  /// extended parameter support. The information provided in [`info::ExprInfo`]
188  /// includes the minimum and maximum width of a pattern match.
189  ///
190  /// Note: successful analysis of an expression with this function does not
191  /// imply that compilation of the same expression (via
192  /// [`Database::compile()`] or [`Database::compile_multi()`]) would succeed.
193  /// This function may return [`Ok`] for regular expressions that
194  /// Vectorscan cannot compile.
195  ///
196  /// Note: some per-pattern flags (such as [`Flags::ALLOWEMPTY`] and
197  /// [`Flags::SOM_LEFTMOST`]) are accepted by this call, but as they do not
198  /// affect the properties returned in the [`info::ExprInfo`] structure,
199  /// they will not affect the outcome of this function.
200  ///
201  ///```
202  /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
203  /// use vectorscan::{expression::{*, info::*}, flags::Flags};
204  ///
205  /// let expr: Expression = ".*lo".parse()?;
206  ///
207  /// let ext = ExprExt::from_min_length(4);
208  ///
209  /// let info = expr.ext_info(Flags::default(), &ext)?;
210  ///
211  /// assert_eq!(info, ExprInfo {
212  ///   min_width: ExprWidth(4),
213  ///   max_width: None,
214  ///   unordered_matches: UnorderedMatchBehavior::OnlyOrdered,
215  ///   matches_at_eod: MatchAtEndBehavior::WillNeverMatchAtEOD,
216  /// });
217  /// # Ok(())
218  /// # }
219  /// ```
220  pub fn ext_info(
221    &self,
222    flags: Flags,
223    ext_flags: &ExprExt,
224  ) -> Result<info::ExprInfo, VectorscanCompileError> {
225    let mut info = ptr::null_mut();
226    let mut compile_err = ptr::null_mut();
227    VectorscanRuntimeError::copy_from_native_compile_error(
228      unsafe {
229        hs::hs_expression_ext_info(
230          self.as_ptr(),
231          flags.into_native(),
232          ext_flags.as_ref_native(),
233          &mut info,
234          &mut compile_err,
235        )
236      },
237      compile_err,
238    )?;
239
240    let ret = info::ExprInfo::from_native(unsafe { *info });
241
242    unsafe {
243      crate::free_misc(info as *mut u8);
244    }
245
246    Ok(ret)
247  }
248
  /// Compile this single pattern into a [`Database`].
  ///
  /// Convenience wrapper which calls [`Database::compile()`] with [`None`] for
  /// the platform.
  pub fn compile(&self, flags: Flags, mode: Mode) -> Result<Database, VectorscanCompileError> {
    Database::compile(self, flags, mode, None)
  }
253}
254
impl str::FromStr for Expression {
  type Err = VectorscanCompileError;

  /// Equivalent to [`Expression::new()`]; fails if `s` contains a null byte.
  fn from_str(s: &str) -> Result<Self, Self::Err> { Self::new(s) }
}
260
261/// A literal byte string.
262///
263/// Unlike for [`Expression`], [`Database::compile_literal()`] will parse the
264/// string content in a literal sense without any regular grammars. For example,
265/// the expression `abc?` simply means a char sequence of `a`, `b`, `c`,
266/// and `?`. The `?` here doesn't mean 0 or 1 quantifier under regular
267/// semantics.
268///
269/// Also unlike [`Expression`], the underlying vectorscan library interprets
270/// literal patterns with a pointer and a length instead of a `NULL`-terminated
271/// string. **Importantly, this allows it to contain `\0` or `NULL` bytes
272/// itself!**
273///
274/// Finally note that literal expressions do not support an "info" interface
275/// like [`Expression::info()`] and [`Expression::ext_info()`], since most of
276/// those properties can be inferred from the literal string itself.
277///
278/// Instances can be created equivalently with [`Self::new()`] or
279/// [`str::parse()`] via the [`str::FromStr`] impl:
280///```
281/// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
282/// use vectorscan::expression::Literal;
283///
284/// let e1: Literal = "as\0df".parse()?;
285/// let e2 = Literal::new("as\0df")?;
286/// assert_eq!(e1, e2);
287/// # Ok(())
288/// # }
289/// ```
// `Debug` is implemented by hand (rather than derived) so that UTF-8 content
// prints as a quoted string instead of a list of byte values.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Literal(Vec<u8>);
292
293impl fmt::Debug for Literal {
294  fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
295    let b = self.as_bytes();
296    match str::from_utf8(b) {
297      Ok(s) => write!(f, "Literal({:?})", s),
298      Err(_) => write!(f, "Literal({:?})", b),
299    }
300  }
301}
302
303impl fmt::Display for Literal {
304  fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
305    let b = self.as_bytes();
306    match str::from_utf8(b) {
307      Ok(s) => write!(f, "{}", s),
308      Err(_) => write!(f, "(non-utf8 literal: {:?})", b),
309    }
310  }
311}
312
313impl Literal {
314  /// Reference the underlying bytes. This wrapper does *not* allocate any null
315  /// terminator.
316  ///
317  ///```
318  /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
319  /// let e = vectorscan::expression::Literal::new("as\0df")?;
320  /// assert_eq!(e.as_bytes(), b"as\0df");
321  /// # Ok(())
322  /// # }
323  /// ```
324  pub fn as_bytes(&self) -> &[u8] { &self.0 }
325
326  pub(crate) fn as_ptr(&self) -> *const c_char {
327    unsafe { mem::transmute(self.as_bytes().as_ptr()) }
328  }
329
330  /// Wrap a byte slice to be interpreted literally. This does *not* allocate
331  /// any null terminator.
332  pub fn new(x: impl Into<Vec<u8>>) -> Result<Self, VectorscanCompileError> { Ok(Self(x.into())) }
333
  /// Compile this single literal into a [`Database`].
  ///
  /// Convenience wrapper which calls [`Database::compile_literal()`] with
  /// [`None`] for the platform.
  pub fn compile(&self, flags: Flags, mode: Mode) -> Result<Database, VectorscanCompileError> {
    Database::compile_literal(self, flags, mode, None)
  }
338}
339
impl str::FromStr for Literal {
  type Err = VectorscanCompileError;

  /// Equivalent to [`Literal::new()`], copying the bytes of `s`.
  fn from_str(s: &str) -> Result<Self, Self::Err> { Self::new(s) }
}
345
346/// The ID number to associate with a pattern match in an expression set.
347///
348/// When provided to an expression set, this value is converted into an
349/// [`ExpressionIndex`](crate::matchers::ExpressionIndex) in a
350/// [`Match`](crate::matchers::Match),
351/// [`VectoredMatch`](crate::matchers::VectoredMatch), or
352/// [`ChimeraMatch`](crate::matchers::chimera::ChimeraMatch) upon matching the
353/// given pattern.
354///
355/// This ID is used in [`ExpressionSet::with_ids()`],
356/// [`LiteralSet::with_ids()`], and
357/// [`ChimeraExpressionSet::with_ids()`](chimera::ChimeraExpressionSet::with_ids).
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
// `repr(transparent)` gives this the same layout as `c_uint`, which is what
// lets `ExpressionSet::ids_ptr()` expose a `Vec<ExprId>` buffer as a plain
// `c_uint` array.
#[repr(transparent)]
pub struct ExprId(pub c_uint);
361
362/// Collection of regular expressions.
363///
364/// This is the main entry point to vectorscan's primary functionality: matching
365/// against sets of patterns at once, which is typically poorly supported or
366/// less featureful than single-pattern matching in many other regex engines.
367///
368/// This struct provides an immutable (returning `Self`) builder interface
369/// to attach additional configuration to the initial set of patterns
370/// constructed with [`Self::from_exprs()`].
#[derive(Clone)]
pub struct ExpressionSet<'a> {
  /// Null-terminated pattern pointers taken from the [`Expression`]s given to
  /// `from_exprs()`; one entry per pattern.
  ptrs: Vec<*const c_char>,
  /// Per-pattern flags; [`None`] until `with_flags()` is called.
  flags: Option<Vec<Flags>>,
  /// Per-pattern IDs; [`None`] until `with_ids()` is called.
  ids: Option<Vec<ExprId>>,
  /// Per-pattern extended parameters; [`None`] until `with_exts()` is called.
  /// Individual entries are null for patterns that were given `None`.
  exts: Option<Vec<*const hs::hs_expr_ext>>,
  /// Ties the raw pointers above to the `'a` borrow of the data they point to.
  _ph: PhantomData<&'a u8>,
}
379
380impl<'a> fmt::Debug for ExpressionSet<'a> {
381  fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
382    let exprs: Vec<&'a CStr> = self
383      .ptrs
384      .iter()
385      .map(|p| unsafe { CStr::from_ptr(*p) })
386      .collect();
387    let exts: Option<&[Option<&ExprExt>]> = self
388      .exts
389      .as_ref()
390      .map(|exts| unsafe { slice::from_raw_parts(mem::transmute(exts.as_ptr()), exprs.len()) });
391    write!(
392      f,
393      "ExpressionSet(exprs={:?}, flags={:?}, ids={:?}, exts={:?})",
394      exprs, &self.flags, &self.ids, exts,
395    )
396  }
397}
398
399impl<'a> ExpressionSet<'a> {
400  /// Construct a pattern set from references to parsed expressions.
401  ///
402  /// The length of this initial `exprs` argument is returned by
403  /// [`Self::len()`], and all subsequent configuration methods are checked to
404  /// provide iterators of the same length:
405  ///
406  ///```should_panic
407  /// use vectorscan::expression::*;
408  ///
409  /// let a: Expression = "a+".parse().unwrap();
410  /// // Fails due to argument length mismatch:
411  /// ExpressionSet::from_exprs([&a])
412  ///   .with_flags([]);
413  /// ```
414  pub fn from_exprs(exprs: impl IntoIterator<Item=&'a Expression>) -> Self {
415    Self {
416      ptrs: exprs.into_iter().map(|e| e.as_ptr()).collect(),
417      flags: None,
418      ids: None,
419      exts: None,
420      _ph: PhantomData,
421    }
422  }
423
424  /// Provide flags which modify the behavior of each expression.
425  ///
426  /// The length of `flags` is checked to be the same as [`Self::len()`].
427  ///
428  /// If this builder method is not used, [`Flags::default()`] will be assigned
429  /// to all patterns.
430  ///
431  ///```
432  /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
433  /// use vectorscan::{expression::*, flags::*, matchers::*};
434  ///
435  /// // Create two expressions to demonstrate separate flags for each pattern:
436  /// let a: Expression = "a+[^a]".parse()?;
437  /// let b: Expression = "b+[^b]".parse()?;
438  ///
439  /// // Get the start of match for one pattern, but not the other:
440  /// let db = ExpressionSet::from_exprs([&a, &b])
441  ///   .with_flags([Flags::default(), Flags::SOM_LEFTMOST])
442  ///   .compile(Mode::BLOCK)?;
443  ///
444  /// let mut scratch = db.allocate_scratch()?;
445  ///
446  /// let mut matches: Vec<&str> = Vec::new();
447  /// scratch.scan_sync(&db, "aardvark imbibbe".into(), |m| {
448  ///   matches.push(unsafe { m.source.as_str() });
449  ///   MatchResult::Continue
450  /// })?;
451  /// // Start of match is preserved for only one pattern:
452  /// assert_eq!(&matches, &["aar", "aardvar", "bi", "bbe"]);
453  /// # Ok(())
454  /// # }
455  /// ```
456  pub fn with_flags(mut self, flags: impl IntoIterator<Item=Flags>) -> Self {
457    let flags: Vec<_> = flags.into_iter().collect();
458    assert_eq!(self.len(), flags.len());
459    self.flags = Some(flags);
460    self
461  }
462
463  /// Assign an ID number to each pattern.
464  ///
465  /// The length of `ids` is checked to be the same as [`Self::len()`]. Multiple
466  /// patterns can be assigned the same ID.
467  ///
468  /// If this builder method is not used, vectorscan will assign them all the ID
469  /// number 0:
470  ///
471  ///```
472  /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
473  /// use vectorscan::{expression::*, flags::*, state::*, matchers::*, sources::*};
474  ///
475  /// // Create two expressions to demonstrate multiple pattern IDs.
476  /// let a: Expression = "a+[^a]".parse()?;
477  /// let b: Expression = "b+[^b]".parse()?;
478  ///
479  /// // Create one db with ID numbers, and one without.
480  /// let set1 = ExpressionSet::from_exprs([&a, &b]).compile(Mode::BLOCK)?;
481  /// let set2 = ExpressionSet::from_exprs([&a, &b])
482  ///   .with_ids([ExprId(300), ExprId(12)])
483  ///   .compile(Mode::BLOCK)?;
484  ///
485  /// let mut scratch = Scratch::blank();
486  /// scratch.setup_for_db(&set1)?;
487  /// scratch.setup_for_db(&set2)?;
488  ///
489  /// let msg: ByteSlice = "aardvark imbibbe".into();
490  ///
491  /// // The first db doesn't differentiate matches by ID number:
492  /// let mut matches1: Vec<ExpressionIndex> = Vec::new();
493  /// scratch.scan_sync(&set1, msg, |m| {
494  ///   matches1.push(m.id);
495  ///   MatchResult::Continue
496  /// })?;
497  /// assert_eq!(
498  ///   &matches1,
499  ///   &[ExpressionIndex(0), ExpressionIndex(0), ExpressionIndex(0), ExpressionIndex(0)],
500  /// );
501  ///
502  /// // The second db returns corresponding ExpressionIndex instances:
503  /// let mut matches2: Vec<ExpressionIndex> = Vec::new();
504  /// scratch.scan_sync(&set2, msg, |m| {
505  ///   matches2.push(m.id);
506  ///   MatchResult::Continue
507  /// })?;
508  /// assert_eq!(
509  ///   &matches2,
510  ///   &[ExpressionIndex(300), ExpressionIndex(300), ExpressionIndex(12), ExpressionIndex(12)],
511  /// );
512  /// # Ok(())
513  /// # }
514  /// ```
515  pub fn with_ids(mut self, ids: impl IntoIterator<Item=ExprId>) -> Self {
516    let ids: Vec<_> = ids.into_iter().collect();
517    assert_eq!(self.len(), ids.len());
518    self.ids = Some(ids);
519    self
520  }
521
522  /// Optionally assign [`ExprExt`] configuration to each pattern.
523  ///
524  /// This is the only available entry point to compiling a database with
525  /// [`ExprExt`] configuration for a given pattern (i.e. the single
526  /// expression compiler does not support extended configuration).
527  ///
528  /// If [`Expression::ext_info()`] succeeds with a given
529  /// [`Expression`]/[`ExprExt`] pair, then compiling the same pattern and
530  /// configuration into a vectorscan database via an expression set with this
531  /// method is likely but not guaranteed to succeed.
532  ///
533  ///```
534  /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
535  /// use vectorscan::{expression::*, flags::*, matchers::*};
536  ///
537  /// // Apply extended configuration to one version of the pattern, but not the other:
538  /// let a: Expression = "a.*b".parse()?;
539  /// let a_ext = ExprExt::from_min_length(4);
540  /// let set = ExpressionSet::from_exprs([&a, &a])
541  ///   .with_exts([Some(&a_ext), None])
542  ///   .with_ids([ExprId(1), ExprId(2)])
543  ///   .compile(Mode::BLOCK)?;
544  /// let mut scratch = set.allocate_scratch()?;
545  ///
546  /// // The configured pattern does not match because of its min length attribute:
547  /// let mut matches: Vec<ExpressionIndex> = Vec::new();
548  /// scratch.scan_sync(&set, "ab".into(), |m| {
549  ///   matches.push(m.id);
550  ///   MatchResult::Continue
551  /// })?;
552  /// assert_eq!(&matches, &[ExpressionIndex(2)]);
553  ///
  /// // However, both patterns match a longer input:
555  /// matches.clear();
556  /// scratch.scan_sync(&set, "asssssb".into(), |m| {
557  ///   matches.push(m.id);
558  ///   MatchResult::Continue
559  /// })?;
560  /// assert_eq!(&matches, &[ExpressionIndex(1), ExpressionIndex(2)]);
561  /// # Ok(())
562  /// # }
563  /// ```
564  pub fn with_exts(mut self, exts: impl IntoIterator<Item=Option<&'a ExprExt>>) -> Self {
565    let exts: Vec<*const hs::hs_expr_ext> = exts
566      .into_iter()
567      .map(|e| {
568        e.map(|e| e.as_ref_native() as *const hs::hs_expr_ext)
569          .unwrap_or(ptr::null())
570      })
571      .collect();
572    assert_eq!(self.len(), exts.len());
573    self.exts = Some(exts);
574    self
575  }
576
  /// Call [`Database::compile_multi()`] with [`None`] for the platform.
  ///
  /// This consumes the builder; only a shared reference to it is handed to
  /// the compiler for the duration of the call.
  pub fn compile(self, mode: Mode) -> Result<Database, VectorscanCompileError> {
    Database::compile_multi(&self, mode, None)
  }
581
  /// The number of patterns in this set.
  ///
  /// This is fixed by [`Self::from_exprs()`]; the other builder methods assert
  /// that their argument has exactly this many entries.
  pub fn len(&self) -> usize { self.ptrs.len() }
584
585  /// Whether this set contains any patterns.
586  pub fn is_empty(&self) -> bool { self.len() == 0 }
587
588  pub(crate) fn num_elements(&self) -> c_uint { self.len() as c_uint }
589
590  pub(crate) fn exts_ptr(&self) -> Option<*const *const hs::hs_expr_ext> {
591    self.exts.as_ref().map(|e| e.as_ptr())
592  }
593
  /// Pointer to the first of [`Self::len()`] pattern pointers. It borrows from
  /// `self` and must not be used after the set is dropped.
  pub(crate) fn expressions_ptr(&self) -> *const *const c_char { self.ptrs.as_ptr() }
595
596  pub(crate) fn flags_ptr(&self) -> *const c_uint {
597    self
598      .flags
599      .as_ref()
600      .map(|f| unsafe { mem::transmute(f.as_ptr()) })
601      .unwrap_or(ptr::null())
602  }
603
604  pub(crate) fn ids_ptr(&self) -> *const c_uint {
605    self
606      .ids
607      .as_ref()
608      .map(|i| unsafe { mem::transmute(i.as_ptr()) })
609      .unwrap_or(ptr::null())
610  }
611}
612
613/// Data produced by vectorscan to analyze a particular expression.
614///
615/// These structs cover the output of [`Expression::info()`] and
616/// [`Expression::ext_info()`].
617pub mod info {
618  use crate::hs;
619
620  use displaydoc::Display;
621
622  use std::os::raw::{c_char, c_uint};
623
624  /// The upper or lower bound for the length of any matches returned by a
625  /// pattern.
626  #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
627  #[repr(transparent)]
628  pub struct ExprWidth(pub usize);
629
630  impl ExprWidth {
631    pub(crate) const fn parse_min_width(x: c_uint) -> Self { Self(x as usize) }
632
633    pub(crate) const fn parse_max_width(x: c_uint) -> Option<Self> {
634      if x == c_uint::MAX {
635        None
636      } else {
637        Some(Self(x as usize))
638      }
639    }
640  }
641
642  /// Whether the expression can produce matches that are not returned in order,
643  /// such as those produced by assertions.
  // NOTE: `Display` here is `displaydoc::Display`, which renders each variant
  // using its doc comment text, so editing the variant docs below changes
  // runtime output.
  // NOTE(review): `#[num_enum(default)]` makes the derived `FromPrimitive` map
  // unlisted values to `OnlyOrdered`, whereas `from_native()` maps every
  // nonzero value to `AllowsUnordered` — confirm the difference is intended.
  #[derive(
    Debug,
    Display,
    Copy,
    Clone,
    PartialEq,
    Eq,
    PartialOrd,
    Ord,
    Hash,
    num_enum::IntoPrimitive,
    num_enum::FromPrimitive,
  )]
  #[repr(i8)]
  pub enum UnorderedMatchBehavior {
    /// Disallows matches that are not returned in order.
    #[num_enum(default)]
    OnlyOrdered = 0,
    /// Allows matches that are not returned in order.
    AllowsUnordered = 1,
  }
665
666  impl UnorderedMatchBehavior {
667    pub(crate) const fn from_native(x: c_char) -> Self {
668      if x == 0 {
669        Self::OnlyOrdered
670      } else {
671        Self::AllowsUnordered
672      }
673    }
674  }
675
676  /// Whether this expression can produce matches at end of data (EOD).
677  ///
  /// In streaming mode, EOD matches are raised during
  /// [`Scratch::flush_eod_sync()`](crate::state::Scratch::flush_eod_sync),
  /// since it is only when `flush_eod()` is called that the EOD location is
  /// known.
683  ///
684  /// Note: trailing `\b` word boundary assertions may also result in EOD
685  /// matches as end-of-data can act as a word boundary.
  // NOTE: `Display` here is `displaydoc::Display`, which renders each variant
  // using its doc comment text, so editing the variant docs below changes
  // runtime output.
  #[derive(Debug, Display, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
  #[repr(i8)]
  pub enum MatchAtEndBehavior {
    /// Pattern will never match at EOD.
    WillNeverMatchAtEOD,
    /// Pattern *may* match at EOD.
    MayMatchAtEOD,
    /// Pattern will *only* match at EOD.
    WillOnlyMatchAtEOD,
  }
696
697  impl MatchAtEndBehavior {
698    pub(crate) fn from_native(matches_at_eod: c_char, matches_only_at_eod: c_char) -> Self {
699      match (matches_at_eod, matches_only_at_eod) {
700        (0, 0) => Self::WillNeverMatchAtEOD,
701        (x, 0) if x != 0 => Self::MayMatchAtEOD,
702        (_, x) if x != 0 => Self::WillOnlyMatchAtEOD,
703        x => unreachable!("unreachable pattern: {:?}", x),
704      }
705    }
706  }
707
708  /// Data produced by vectorscan to analyze a particular expression.
709  ///
710  /// This struct is produced by [`super::Expression::info()`]:
711  ///
712  ///```
713  /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
714  /// use vectorscan::{expression::{*, info::*}, flags::Flags};
715  ///
716  /// let expr: Expression = "(he)llo$".parse()?;
717  /// let info = expr.info(Flags::default())?;
718  /// assert_eq!(info, ExprInfo {
719  ///   min_width: ExprWidth(5),
720  ///   max_width: Some(ExprWidth(5)),
721  ///   unordered_matches: UnorderedMatchBehavior::AllowsUnordered,
722  ///   matches_at_eod: MatchAtEndBehavior::WillOnlyMatchAtEOD,
723  /// });
724  /// # Ok(())
725  /// # }
726  /// ```
727  ///
728  /// as well as [`super::Expression::ext_info()`]:
729  ///
730  ///```
731  /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
732  /// use vectorscan::{expression::{*, info::*}, flags::Flags};
733  ///
734  /// let expr: Expression = ".*lo($)?".parse()?;
735  /// let ext = ExprExt::from_min_length(4);
736  /// let info = expr.ext_info(Flags::default(), &ext)?;
737  /// assert_eq!(info, ExprInfo {
738  ///   min_width: ExprWidth(4),
739  ///   max_width: None,
740  ///   unordered_matches: UnorderedMatchBehavior::AllowsUnordered,
741  ///   matches_at_eod: MatchAtEndBehavior::MayMatchAtEOD,
742  /// });
743  /// # Ok(())
744  /// # }
745  /// ```
  #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
  pub struct ExprInfo {
    /// The minimum length in bytes of a match for the pattern. If the pattern
    /// has an unbounded minimum length, this will be 0.
    ///
    /// Note: in some cases when using advanced features to suppress matches
    /// (such as extended parameters or
    /// [`Flags::SINGLEMATCH`](crate::flags::Flags::SINGLEMATCH)) this
    /// may represent a conservative lower bound for the true minimum length of
    /// a match.
    pub min_width: ExprWidth,
    /// The maximum length in bytes of a match for the pattern. If the pattern
    /// has an unbounded maximum length, this will be [`None`] (the native
    /// library reports this case as the maximum `c_uint` value).
    ///
    /// Note: in some cases when using advanced features to suppress matches
    /// (such as extended parameters or
    /// [`Flags::SINGLEMATCH`](crate::flags::Flags::SINGLEMATCH)) this
    /// may represent a conservative upper bound for the true maximum length of
    /// a match.
    pub max_width: Option<ExprWidth>,
    /// Whether this expression can produce matches that are not returned in
    /// order, such as those produced by assertions.
    pub unordered_matches: UnorderedMatchBehavior,
    /// Whether this expression can produce matches at end of data (EOD).
    ///
    /// In streaming mode, EOD matches are raised during
    /// [`Scratch::flush_eod_sync()`](crate::state::Scratch::flush_eod_sync),
    /// since it is only when `flush_eod()` is called that the EOD location
    /// is known.
    ///
    /// Note: trailing `\b` word boundary assertions may also result in EOD
    /// matches as end-of-data can act as a word boundary.
    pub matches_at_eod: MatchAtEndBehavior,
  }
781
782  impl ExprInfo {
783    pub(crate) fn from_native(x: hs::hs_expr_info) -> Self {
784      let hs::hs_expr_info {
785        min_width,
786        max_width,
787        unordered_matches,
788        matches_at_eod,
789        matches_only_at_eod,
790      } = x;
791      let min_width = ExprWidth::parse_min_width(min_width);
792      let max_width = ExprWidth::parse_max_width(max_width);
793      let unordered_matches = UnorderedMatchBehavior::from_native(unordered_matches);
794      let matches_at_eod = MatchAtEndBehavior::from_native(matches_at_eod, matches_only_at_eod);
795      Self {
796        min_width,
797        max_width,
798        unordered_matches,
799        matches_at_eod,
800      }
801    }
802  }
803}
804
805/// Configuration for extended vectorscan parameters.
806///
807/// These parameters cover various types of fuzzy search as well as input
808/// subsetting features. See [Extended Parameters] for a further reference.
809///
810/// [Extended Parameters]: https://intel.github.io/hyperscan/dev-reference/compilation.html#extparam
811///
812/// This structure may be passed in when building a database with
813/// [`ExpressionSet::with_exts()`], or used to interrogate a single expression
814/// with [`Expression::ext_info()`].
815///
816/// Like many other flags arguments, this struct also supports [`ops::BitOr`]
817/// and the `|` operator for composition:
818///
819///```
820/// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
821/// use vectorscan::{expression::*, flags::*, matchers::*, sources::*};
822///
823/// // Apply extended configuration to one version of the pattern, but not the other:
824/// let a: Expression = "ab".parse()?;
825/// let ext = ExprExt::from_min_offset(3) | ExprExt::from_max_offset(15);
826/// let set = ExpressionSet::from_exprs([&a, &a])
827///   .with_exts([Some(&ext), None])
828///   .with_ids([ExprId(1), ExprId(2)])
829///   .compile(Mode::BLOCK)?;
830/// let mut scratch = set.allocate_scratch()?;
831///
832/// let msg: ByteSlice = "ab   ab                ab".into();
833///
834/// let mut matches: Vec<ExpressionIndex> = Vec::new();
835/// scratch.scan_sync(&set, msg, |m| {
836///   matches.push(m.id);
837///   MatchResult::Continue
838/// })?;
839///
840/// // The configured pattern misses out on the first and last match of "ab":
841/// assert_eq!(&matches, &[
842///   ExpressionIndex(2), ExpressionIndex(1), ExpressionIndex(2), ExpressionIndex(2),
843/// ]);
844/// # Ok(())
845/// # }
846/// ```
#[derive(Debug, Copy, Clone)]
// `repr(transparent)` guarantees the same layout as the native struct, which
// `ExpressionSet` relies on when it stores references to this wrapper as
// `*const hs::hs_expr_ext` and later views them as `Option<&ExprExt>`.
#[repr(transparent)]
pub struct ExprExt(hs::hs_expr_ext);
850
impl Default for ExprExt {
  /// Same as [`ExprExt::zeroed()`]: an instance with all features disabled.
  fn default() -> Self { Self::zeroed() }
}
854
855impl ExprExt {
856  /// Generate an empty instance with all features disabled.
857  /* FIXME: make this const when const zeroed() is stabilized! */
858  pub fn zeroed() -> Self { unsafe { mem::MaybeUninit::zeroed().assume_init() } }
859
860  /// The minimum end offset in the data stream at which this expression should
861  /// match successfully.
862  pub fn from_min_offset(x: usize) -> Self {
863    let ext_flags = ExtFlags::MIN_OFFSET;
864    let mut s = Self::zeroed();
865    s.0.flags = ext_flags.into_native();
866    s.0.min_offset = x as c_ulonglong;
867    s
868  }
869
870  /// The maximum end offset in the data stream at which this expression should
871  /// match successfully.
872  pub fn from_max_offset(x: usize) -> Self {
873    let ext_flags = ExtFlags::MAX_OFFSET;
874    let mut s = Self::zeroed();
875    s.0.flags = ext_flags.into_native();
876    s.0.max_offset = x as c_ulonglong;
877    s
878  }
879
880  /// The minimum match length (from start to end) required to successfully
881  /// match this expression.
882  ///
883  /// This is one alternative to the use of [`Flags::ALLOWEMPTY`].
884  ///
885  /// This does not require [`Flags::SOM_LEFTMOST`]:
886  ///
887  ///```
888  /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
889  /// use vectorscan::{expression::*, flags::*, matchers::*, sources::*};
890  ///
891  /// let a: Expression = "a.*b".parse()?;
892  /// let ext = ExprExt::from_min_length(4);
893  /// let set = ExpressionSet::from_exprs([&a, &a])
894  ///   // #1 has no min_length, #2 does:
895  ///   .with_exts([None, Some(&ext)])
896  ///   .with_ids([ExprId(1), ExprId(2)])
897  ///   .compile(Mode::BLOCK)?;
898  /// let mut scratch = set.allocate_scratch()?;
899  ///
900  /// let msg: ByteSlice = "   ab   ab   ".into();
901  ///
902  /// let mut matches: Vec<(u32, &str)> = Vec::new();
903  /// scratch.scan_sync(&set, msg, |m| {
904  ///   matches.push((m.id.0, unsafe { m.source.as_str() }));
905  ///   MatchResult::Continue
906  /// })?;
907  ///
908  /// assert_eq!(&matches, &[
909  ///   // Without min_length, both matches show up:
910  ///   (1, "   ab"),
911  ///   (1, "   ab   ab"),
912  ///   // SOM_LEFTMOST is disabled, so we don't know the match start,
913  ///   // but the min_length property is correctly applied regardless:
914  ///   (2, "   ab   ab"),
915  /// ]);
916  /// # Ok(())
917  /// # }
918  /// ```
919  pub fn from_min_length(x: usize) -> Self {
920    let ext_flags = ExtFlags::MIN_LENGTH;
921    let mut s = Self::zeroed();
922    s.0.flags = ext_flags.into_native();
923    s.0.min_length = x as c_ulonglong;
924    s
925  }
926
927  /// Allow patterns to approximately match within this [edit distance](https://en.wikipedia.org/wiki/Edit_distance).
928  pub fn from_edit_distance(x: usize) -> Self {
929    let ext_flags = ExtFlags::EDIT_DISTANCE;
930    let mut s = Self::zeroed();
931    s.0.flags = ext_flags.into_native();
932    assert!(x < c_uint::MAX as usize);
933    s.0.edit_distance = x as c_uint;
934    s
935  }
936
937  /// Allow patterns to approximately match within this [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance).
938  pub fn from_hamming_distance(x: usize) -> Self {
939    let ext_flags = ExtFlags::HAMMING_DISTANCE;
940    let mut s = Self::zeroed();
941    s.0.flags = ext_flags.into_native();
942    assert!(x < c_uint::MAX as usize);
943    s.0.hamming_distance = x as c_uint;
944    s
945  }
946
947  const fn ext_flags(&self) -> ExtFlags { ExtFlags::from_native(self.0.flags) }
948
949  fn min_offset(&self) -> Option<c_ulonglong> {
950    if self.ext_flags().has_min_offset() {
951      Some(self.0.min_offset)
952    } else {
953      None
954    }
955  }
956
957  fn max_offset(&self) -> Option<c_ulonglong> {
958    if self.ext_flags().has_max_offset() {
959      Some(self.0.max_offset)
960    } else {
961      None
962    }
963  }
964
965  fn min_length(&self) -> Option<c_ulonglong> {
966    if self.ext_flags().has_min_length() {
967      Some(self.0.min_length)
968    } else {
969      None
970    }
971  }
972
973  fn edit_distance(&self) -> Option<c_uint> {
974    if self.ext_flags().has_edit_distance() {
975      Some(self.0.edit_distance)
976    } else {
977      None
978    }
979  }
980
981  fn hamming_distance(&self) -> Option<c_uint> {
982    if self.ext_flags().has_hamming_distance() {
983      Some(self.0.hamming_distance)
984    } else {
985      None
986    }
987  }
988
989  fn compose(mut self, rhs: Self) -> Self {
990    self.0.flags = (self.ext_flags() | rhs.ext_flags()).into_native();
991    if let Some(min_offset) = rhs.min_offset() {
992      self.0.min_offset = min_offset;
993    }
994    if let Some(max_offset) = rhs.max_offset() {
995      self.0.max_offset = max_offset;
996    }
997    if let Some(min_length) = rhs.min_length() {
998      self.0.min_length = min_length;
999    }
1000    if let Some(edit_distance) = rhs.edit_distance() {
1001      self.0.edit_distance = edit_distance;
1002    }
1003    if let Some(hamming_distance) = rhs.hamming_distance() {
1004      self.0.hamming_distance = hamming_distance;
1005    }
1006    self
1007  }
1008
1009  pub(crate) fn as_ref_native(&self) -> &hs::hs_expr_ext { &self.0 }
1010}
1011
1012impl ops::BitOr for ExprExt {
1013  type Output = Self;
1014
1015  fn bitor(self, other: Self) -> Self { self.compose(other) }
1016}
1017
1018impl ops::BitOrAssign for ExprExt {
1019  fn bitor_assign(&mut self, rhs: Self) {
1020    use ops::BitOr;
1021    *self = self.bitor(rhs);
1022  }
1023}
1024
/// Collection of literals.
///
/// This is the analogue to [`ExpressionSet`] for [`Literal`] expressions, which
/// cannot be combined with [`Expression`] patterns in the same database.
///
/// This struct provides an immutable (returning `Self`) builder interface
/// to attach additional configuration to the initial set of patterns
/// constructed with [`Self::from_lits()`].
#[derive(Clone)]
pub struct LiteralSet<'a> {
  /// Start pointer of each literal's bytes, one entry per pattern.
  ptrs: Vec<*const c_char>,
  /// Byte length of each literal, parallel to `ptrs`.
  lens: Vec<usize>,
  /// Per-pattern flags, or `None` if [`Self::with_flags()`] was never called.
  flags: Option<Vec<Flags>>,
  /// Per-pattern IDs, or `None` if [`Self::with_ids()`] was never called.
  ids: Option<Vec<ExprId>>,
  /// Carries the lifetime `'a` of the [`Literal`] references that `ptrs` was
  /// collected from.
  _ph: PhantomData<&'a u8>,
}
1041
1042impl<'a> fmt::Debug for LiteralSet<'a> {
1043  fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1044    let exprs: Vec<&'a [u8]> = self
1045      .ptrs
1046      .iter()
1047      .zip(self.lens.iter())
1048      .map(|(p, n)| unsafe { slice::from_raw_parts(*p as *const u8, *n) })
1049      .collect();
1050    let joined_exprs: String = exprs
1051      .into_iter()
1052      .map(|s| {
1053        str::from_utf8(s)
1054          .map(|s| format!("{:?}", s))
1055          .unwrap_or_else(|_| format!("(non-utf8: {:?})", s))
1056      })
1057      .collect::<Vec<_>>()
1058      .join(", ");
1059    write!(
1060      f,
1061      "LiteralSet(exprs=[{}], flags={:?}, ids={:?})",
1062      joined_exprs, &self.flags, &self.ids
1063    )
1064  }
1065}
1066
1067impl<'a> LiteralSet<'a> {
1068  /// Construct a pattern set from references to parsed literals.
1069  ///
1070  /// The length of this initial `exprs` argument is returned by
1071  /// [`Self::len()`], and all subsequent configuration methods are checked to
1072  /// provide iterators of the same length:
1073  ///
1074  ///```should_panic
1075  /// use vectorscan::expression::*;
1076  ///
1077  /// let a: Literal = "a\0b".parse().unwrap();
1078  /// // Fails due to argument length mismatch:
1079  /// LiteralSet::from_lits([&a])
1080  ///   .with_flags([]);
1081  /// ```
1082  pub fn from_lits(lits: impl IntoIterator<Item=&'a Literal>) -> Self {
1083    let mut ptrs: Vec<_> = Vec::new();
1084    let mut lens: Vec<_> = Vec::new();
1085
1086    for l in lits.into_iter() {
1087      ptrs.push(l.as_ptr());
1088      lens.push(l.as_bytes().len());
1089    }
1090
1091    Self {
1092      ptrs,
1093      lens,
1094      flags: None,
1095      ids: None,
1096      _ph: PhantomData,
1097    }
1098  }
1099
1100  /// Provide flags which modify the behavior of each expression.
1101  ///
1102  /// The length of `flags` is checked to be the same as [`Self::len()`].
1103  ///
1104  /// If this builder method is not used, [`Flags::default()`] will be assigned
1105  /// to all patterns.
1106  ///
1107  ///```
1108  /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
1109  /// use vectorscan::{expression::*, flags::*, matchers::*};
1110  ///
1111  /// // Create two expressions to demonstrate separate flags for each pattern:
1112  /// let a: Literal = "a".parse()?;
1113  /// let b: Literal = "b".parse()?;
1114  ///
1115  /// // Get the start of match for one pattern, but not the other:
1116  /// let db = LiteralSet::from_lits([&a, &b])
1117  ///   .with_flags([Flags::default(), Flags::SOM_LEFTMOST])
1118  ///   .compile(Mode::BLOCK)?;
1119  ///
1120  /// let mut scratch = db.allocate_scratch()?;
1121  ///
1122  /// let mut matches: Vec<&str> = Vec::new();
1123  /// scratch.scan_sync(&db, "aardvark imbibbe".into(), |m| {
1124  ///   matches.push(unsafe { m.source.as_str() });
1125  ///   MatchResult::Continue
1126  /// })?;
1127  /// // Start of match is preserved for only one pattern:
1128  /// assert_eq!(&matches, &["a", "aa", "aardva", "b", "b", "b"]);
1129  /// # Ok(())
1130  /// # }
1131  /// ```
1132  pub fn with_flags(mut self, flags: impl IntoIterator<Item=Flags>) -> Self {
1133    let flags: Vec<_> = flags.into_iter().collect();
1134    assert_eq!(self.len(), flags.len());
1135    self.flags = Some(flags.to_vec());
1136    self
1137  }
1138
1139  /// Assign an ID number to each pattern.
1140  ///
1141  /// The length of `ids` is checked to be the same as [`Self::len()`]. Multiple
1142  /// patterns can be assigned the same ID.
1143  ///
1144  /// If this builder method is not used, vectorscan will assign them all the ID
1145  /// number 0:
1146  ///
1147  ///```
1148  /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
1149  /// use vectorscan::{expression::*, flags::*, state::*, matchers::*, sources::*};
1150  ///
1151  /// // Create two expressions to demonstrate multiple pattern IDs.
1152  /// let a: Literal = "a".parse()?;
1153  /// let b: Literal = "b".parse()?;
1154  ///
1155  /// // Create one db with ID numbers, and one without.
1156  /// let set1 = LiteralSet::from_lits([&a, &b]).compile(Mode::BLOCK)?;
1157  /// let set2 = LiteralSet::from_lits([&a, &b])
1158  ///   .with_ids([ExprId(300), ExprId(12)])
1159  ///   .compile(Mode::BLOCK)?;
1160  ///
1161  /// let mut scratch = Scratch::blank();
1162  /// scratch.setup_for_db(&set1)?;
1163  /// scratch.setup_for_db(&set2)?;
1164  ///
1165  /// let msg: ByteSlice = "aardvark imbibbe".into();
1166  ///
1167  /// // The first db doesn't differentiate matches by ID number:
1168  /// let mut matches1: Vec<ExpressionIndex> = Vec::new();
1169  /// scratch.scan_sync(&set1, msg, |m| {
1170  ///   matches1.push(m.id);
1171  ///   MatchResult::Continue
1172  /// })?;
1173  /// assert_eq!(
1174  ///   &matches1,
1175  ///   &[
1176  ///      ExpressionIndex(0), ExpressionIndex(0), ExpressionIndex(0), ExpressionIndex(0),
1177  ///      ExpressionIndex(0), ExpressionIndex(0),
1178  ///    ],
1179  /// );
1180  ///
1181  /// // The second db returns corresponding ExpressionIndex instances:
1182  /// let mut matches2: Vec<ExpressionIndex> = Vec::new();
1183  /// scratch.scan_sync(&set2, msg, |m| {
1184  ///   matches2.push(m.id);
1185  ///   MatchResult::Continue
1186  /// })?;
1187  /// assert_eq!(
1188  ///   &matches2,
1189  ///   &[
1190  ///      ExpressionIndex(300), ExpressionIndex(300), ExpressionIndex(300),
1191  ///      ExpressionIndex(12), ExpressionIndex(12), ExpressionIndex(12),
1192  ///    ],
1193  /// );
1194  /// # Ok(())
1195  /// # }
1196  /// ```
1197  pub fn with_ids(mut self, ids: impl IntoIterator<Item=ExprId>) -> Self {
1198    let ids: Vec<_> = ids.into_iter().collect();
1199    assert_eq!(self.len(), ids.len());
1200    self.ids = Some(ids.to_vec());
1201    self
1202  }
1203
1204  /// Call [`Database::compile_multi_literal()`] with [`None`] for the platform.
1205  pub fn compile(self, mode: Mode) -> Result<Database, VectorscanCompileError> {
1206    Database::compile_multi_literal(&self, mode, None)
1207  }
1208
1209  /// The number of literals in this set.
1210  pub fn len(&self) -> usize { self.ptrs.len() }
1211
1212  /// Whether this set contains any literals.
1213  pub fn is_empty(&self) -> bool { self.len() == 0 }
1214
1215  pub(crate) fn num_elements(&self) -> c_uint { self.len() as c_uint }
1216
1217  pub(crate) fn literals_ptr(&self) -> *const *const c_char { self.ptrs.as_ptr() }
1218
1219  pub(crate) fn lengths_ptr(&self) -> *const usize { self.lens.as_ptr() }
1220
1221  pub(crate) fn flags_ptr(&self) -> *const c_uint {
1222    self
1223      .flags
1224      .as_ref()
1225      .map(|f| unsafe { mem::transmute(f.as_ptr()) })
1226      .unwrap_or(ptr::null())
1227  }
1228
1229  pub(crate) fn ids_ptr(&self) -> *const c_uint {
1230    self
1231      .ids
1232      .as_ref()
1233      .map(|i| unsafe { mem::transmute(i.as_ptr()) })
1234      .unwrap_or(ptr::null())
1235  }
1236}
1237
/// Pattern strings for the chimera library.
///
/// As per [Pattern Support], chimera has full support for PCRE.
///
/// [Pattern Support]: https://intel.github.io/hyperscan/dev-reference/chimera.html#pattern-support
///
/// As chimera focuses mainly on supporting PCRE compatibility and group
/// matching support, this interface is less full-featured than the standard
/// vectorscan library [`super::expression`]. However, the same idioms apply:
/// creating expression instances performs no pattern compilation itself, and
/// references to these structs can be reused without re-allocating the
/// underlying pattern string data:
///
///```
/// # #[allow(unused_variables)]
/// # fn main() -> Result<(), vectorscan::error::chimera::ChimeraError> {
/// use vectorscan::{expression::chimera::*, flags::chimera::*};
///
/// let a: ChimeraExpression = "a+".parse()?;
/// let b: ChimeraExpression = "b+".parse()?;
/// let c: ChimeraExpression = "c+".parse()?;
///
/// let ab_db = ChimeraExpressionSet::from_exprs([&a, &b]).compile(ChimeraMode::NOGROUPS)?;
/// let bc_db = ChimeraExpressionSet::from_exprs([&b, &c]).compile(ChimeraMode::NOGROUPS)?;
/// let ca_db = ChimeraExpressionSet::from_exprs([&c, &a]).compile(ChimeraMode::NOGROUPS)?;
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "chimera")]
#[cfg_attr(docsrs, doc(cfg(feature = "chimera")))]
pub mod chimera {
  use super::ExprId;
  use crate::{
    database::chimera::ChimeraDb,
    error::chimera::ChimeraCompileError,
    flags::chimera::{ChimeraFlags, ChimeraMode},
  };

  use std::{
    ffi::{CStr, CString},
    fmt,
    marker::PhantomData,
    os::raw::{c_char, c_uint, c_ulong},
    ptr, str,
  };

  /// Chimera (PCRE) pattern string.
  ///
  /// Note that as the underlying chimera library interprets pattern strings as
  /// null-terminated [`CStr`]s, null bytes are *not* supported within
  /// `ChimeraExpression` strings. If matching against patterns containing
  /// explicit null bytes is necessary, consider [`super::Literal`] or
  /// [`super::LiteralSet`] from the base vectorscan library.
  ///
  /// Note also that the chimera library does not support an "info" interface
  /// such as [`super::Expression::info()`] and
  /// [`super::Expression::ext_info()`] from the base vectorscan library.
  ///
  /// Instances can be created equivalently with [`Self::new()`] or
  /// [`str::parse()`] via the [`str::FromStr`] impl:
  ///
  ///```
  /// # fn main() -> Result<(), vectorscan::error::chimera::ChimeraError> {
  /// use vectorscan::expression::chimera::ChimeraExpression;
  ///
  /// let e1: ChimeraExpression = "asd(f+)".parse()?;
  /// let e2 = ChimeraExpression::new("asd(f+)")?;
  /// assert_eq!(e1, e2);
  /// # Ok(())
  /// # }
  /// ```
  #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
  pub struct ChimeraExpression(CString);

  impl fmt::Debug for ChimeraExpression {
    /// Show the pattern as a quoted string when it is valid UTF-8, and as a
    /// byte list otherwise.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
      let b = self.as_bytes();
      match str::from_utf8(b) {
        Ok(s) => write!(f, "ChimeraExpression({:?})", s),
        Err(_) => write!(f, "ChimeraExpression({:?})", b),
      }
    }
  }

  impl fmt::Display for ChimeraExpression {
    /// Print the raw pattern text when it is valid UTF-8, and a marked byte
    /// list otherwise.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
      let b = self.as_bytes();
      match str::from_utf8(b) {
        Ok(s) => write!(f, "{}", s),
        Err(_) => write!(f, "(non-utf8: {:?})", b),
      }
    }
  }

  impl ChimeraExpression {
    /// Reference the underlying bytes, *without* the trailing null terminator.
    ///
    ///```
    /// # fn main() -> Result<(), vectorscan::error::chimera::ChimeraError> {
    /// let e = vectorscan::expression::chimera::ChimeraExpression::new("asd(f+)")?;
    /// assert_eq!(e.as_bytes(), b"asd(f+)");
    /// # Ok(())
    /// # }
    /// ```
    pub fn as_bytes(&self) -> &[u8] { self.0.as_bytes() }

    /// Pointer to the start of the null-terminated pattern string.
    pub(crate) fn as_ptr(&self) -> *const c_char { self.0.as_ptr() }

    /// Produce a `NULL`-terminated C-style wrapper for the given pattern
    /// string.
    ///
    /// This will fail if the string contains any internal `NULL` bytes, as
    /// those are not supported by the chimera library:
    ///```
    /// use vectorscan::{expression::chimera::*, error::chimera::*};
    ///
    /// let pat = "as\0df";
    /// let e = match ChimeraExpression::new(pat) {
    ///    Err(ChimeraCompileError::NullByte(e)) => e,
    ///    _ => unreachable!(),
    /// };
    /// assert_eq!(e.nul_position(), 2);
    /// ```
    pub fn new(x: impl Into<Vec<u8>>) -> Result<Self, ChimeraCompileError> {
      Ok(Self(CString::new(x)?))
    }

    /// Call [`ChimeraDb::compile()`] with [`None`] for the platform.
    pub fn compile(
      &self,
      flags: ChimeraFlags,
      mode: ChimeraMode,
    ) -> Result<ChimeraDb, ChimeraCompileError> {
      ChimeraDb::compile(self, flags, mode, None)
    }
  }

  impl str::FromStr for ChimeraExpression {
    type Err = ChimeraCompileError;

    /// Equivalent to [`ChimeraExpression::new()`].
    fn from_str(s: &str) -> Result<Self, Self::Err> { Self::new(s) }
  }

  /// Extended configuration for the PCRE matching phase of chimera.
  ///
  /// The only entry point to configuring this is
  /// [`ChimeraExpressionSet::with_limits()`].
  #[derive(Debug, Copy, Clone)]
  pub struct ChimeraMatchLimits {
    /// A limit from pcre_extra on the number of times the match function may
    /// be called in PCRE, which bounds the amount of backtracking that can
    /// take place.
    pub match_limit: c_ulong,
    /// A limit from pcre_extra on the recursion depth of the match function
    /// in PCRE.
    pub match_limit_recursion: c_ulong,
  }

  /// Collection of regular expressions.
  ///
  /// This is the analogue to [`super::ExpressionSet`] for [`ChimeraExpression`]
  /// instances.
  ///
  /// This struct provides an immutable (returning `Self`) builder interface
  /// to attach additional configuration to the initial set of patterns
  /// constructed with [`Self::from_exprs()`].
  #[derive(Clone)]
  pub struct ChimeraExpressionSet<'a> {
    /// Pointer to each null-terminated pattern string, one entry per pattern.
    ptrs: Vec<*const c_char>,
    /// Per-pattern flags, or `None` if [`Self::with_flags()`] was never called.
    flags: Option<Vec<ChimeraFlags>>,
    /// Per-pattern IDs, or `None` if [`Self::with_ids()`] was never called.
    ids: Option<Vec<ExprId>>,
    /// Set-wide PCRE limits, or `None` if [`Self::with_limits()`] was never
    /// called.
    limits: Option<ChimeraMatchLimits>,
    /// Carries the lifetime `'a` of the [`ChimeraExpression`] references that
    /// `ptrs` was collected from.
    _ph: PhantomData<&'a u8>,
  }

  impl<'a> fmt::Debug for ChimeraExpressionSet<'a> {
    /// Render every pattern string followed by the optional flags, ids, and
    /// limits.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
      let exprs: Vec<&'a CStr> = self
        .ptrs
        .iter()
        // SAFETY: each pointer was taken from a `&'a ChimeraExpression` in
        // `from_exprs()`, which wraps a null-terminated `CString`.
        .map(|p| unsafe { CStr::from_ptr(*p) })
        .collect();
      write!(
        f,
        "ChimeraExpressionSet(exprs={:?}, flags={:?}, ids={:?}, limits={:?})",
        exprs, &self.flags, &self.ids, &self.limits
      )
    }
  }

  impl<'a> ChimeraExpressionSet<'a> {
    /// Construct a pattern set from references to parsed expressions.
    ///
    /// The length of this initial `exprs` argument is returned by
    /// [`Self::len()`], and all subsequent configuration methods are checked to
    /// provide iterators of the same length:
    ///
    ///```should_panic
    /// use vectorscan::expression::chimera::*;
    ///
    /// let a: ChimeraExpression = "a+".parse().unwrap();
    /// // Fails due to argument length mismatch:
    /// ChimeraExpressionSet::from_exprs([&a])
    ///   .with_flags([]);
    /// ```
    pub fn from_exprs(exprs: impl IntoIterator<Item=&'a ChimeraExpression>) -> Self {
      Self {
        ptrs: exprs.into_iter().map(|e| e.as_ptr()).collect(),
        flags: None,
        ids: None,
        limits: None,
        _ph: PhantomData,
      }
    }

    /// Provide flags which modify the behavior of each expression.
    ///
    /// The length of `flags` is checked to be the same as [`Self::len()`].
    ///
    /// If this builder method is not used, [`ChimeraFlags::default()`] will be
    /// assigned to all patterns.
    ///
    /// # Panics
    ///
    /// Panics if `flags` does not yield exactly [`Self::len()`] elements.
    ///
    ///```
    /// # fn main() -> Result<(), vectorscan::error::chimera::ChimeraError> {
    /// use vectorscan::{expression::chimera::*, flags::chimera::*, matchers::chimera::*};
    ///
    /// // Create two expressions to demonstrate separate flags for each pattern:
    /// let a: ChimeraExpression = "a+[^a]".parse()?;
    /// let b: ChimeraExpression = "b+[^b]".parse()?;
    ///
    /// // Report only a single match for one pattern, but not the other:
    /// let db = ChimeraExpressionSet::from_exprs([&a, &b])
    ///   .with_flags([ChimeraFlags::default(), ChimeraFlags::SINGLEMATCH])
    ///   .compile(ChimeraMode::NOGROUPS)?;
    ///
    /// let mut scratch = db.allocate_scratch()?;
    ///
    /// let mut matches: Vec<&str> = Vec::new();
    /// scratch.scan_sync(&db, "aardvark imbibbe".into(), |m| {
    ///   matches.push(unsafe { m.source.as_str() });
    ///   ChimeraMatchResult::Continue
    /// }, |_| ChimeraMatchResult::Continue)?;
    /// // SINGLEMATCH is preserved for only one pattern:
    /// assert_eq!(&matches, &["aar", "ar", "bi"]);
    /// # Ok(())
    /// # }
    /// ```
    pub fn with_flags(mut self, flags: impl IntoIterator<Item=ChimeraFlags>) -> Self {
      let flags: Vec<_> = flags.into_iter().collect();
      assert_eq!(self.len(), flags.len());
      self.flags = Some(flags);
      self
    }

    /// Assign an ID number to each pattern.
    ///
    /// The length of `ids` is checked to be the same as [`Self::len()`].
    /// Multiple patterns can be assigned the same ID.
    ///
    /// If this builder method is not used, all patterns are reported with the
    /// ID number 0:
    ///
    /// # Panics
    ///
    /// Panics if `ids` does not yield exactly [`Self::len()`] elements.
    ///
    ///```
    /// # fn main() -> Result<(), vectorscan::error::chimera::ChimeraError> {
    /// use vectorscan::{sources::*, expression::{*, chimera::*}, flags::chimera::*, state::chimera::*, matchers::{*, chimera::*}};
    ///
    /// // Create two expressions to demonstrate multiple pattern IDs.
    /// let a: ChimeraExpression = "a+[^a]".parse()?;
    /// let b: ChimeraExpression = "b+[^b]".parse()?;
    ///
    /// // Create one db with ID numbers, and one without.
    /// let set1 = ChimeraExpressionSet::from_exprs([&a, &b]).compile(ChimeraMode::NOGROUPS)?;
    /// let set2 = ChimeraExpressionSet::from_exprs([&a, &b])
    ///   .with_ids([ExprId(300), ExprId(12)])
    ///   .compile(ChimeraMode::NOGROUPS)?;
    ///
    /// let mut scratch = ChimeraScratch::blank();
    /// scratch.setup_for_db(&set1)?;
    /// scratch.setup_for_db(&set2)?;
    ///
    /// let msg: ByteSlice = "aardvark imbibbe".into();
    ///
    /// // The first db doesn't differentiate matches by ID number:
    /// let mut matches1: Vec<ExpressionIndex> = Vec::new();
    /// scratch.scan_sync(&set1, msg, |m| {
    ///   matches1.push(m.id);
    ///   ChimeraMatchResult::Continue
    /// }, |_| ChimeraMatchResult::Continue)?;
    /// assert_eq!(
    ///   &matches1,
    ///   &[ExpressionIndex(0), ExpressionIndex(0), ExpressionIndex(0), ExpressionIndex(0)],
    /// );
    ///
    /// // The second db returns corresponding ExpressionIndex instances:
    /// let mut matches2: Vec<ExpressionIndex> = Vec::new();
    /// scratch.scan_sync(&set2, msg, |m| {
    ///   matches2.push(m.id);
    ///   ChimeraMatchResult::Continue
    /// }, |_| ChimeraMatchResult::Continue)?;
    /// assert_eq!(
    ///   &matches2,
    ///   &[ExpressionIndex(300), ExpressionIndex(300), ExpressionIndex(12), ExpressionIndex(12)],
    /// );
    /// # Ok(())
    /// # }
    /// ```
    pub fn with_ids(mut self, ids: impl IntoIterator<Item=ExprId>) -> Self {
      let ids: Vec<_> = ids.into_iter().collect();
      assert_eq!(self.len(), ids.len());
      self.ids = Some(ids);
      self
    }

    /// Assign extended PCRE configuration to the entire pattern set.
    ///
    /// This is the only entry point to configuring PCRE match limits (i.e. the
    /// single-pattern compiler does not support match limits).
    ///
    ///```
    /// # fn main() -> Result<(), vectorscan::error::chimera::ChimeraError> {
    /// use vectorscan::{sources::*, expression::chimera::*, flags::chimera::*, state::chimera::*, matchers::chimera::*, error::chimera::*};
    ///
    /// // Create one db with backtracking match limits, and one without.
    /// let a: ChimeraExpression = r"(asdf?)hey\1".parse()?;
    /// let set1 = ChimeraExpressionSet::from_exprs([&a]).compile(ChimeraMode::GROUPS)?;
    /// let set2 = ChimeraExpressionSet::from_exprs([&a])
    ///   .with_limits(ChimeraMatchLimits { match_limit: 1, match_limit_recursion: 1 })
    ///   .compile(ChimeraMode::GROUPS)?;
    ///
    /// let mut scratch = ChimeraScratch::blank();
    /// scratch.setup_for_db(&set1)?;
    /// scratch.setup_for_db(&set2)?;
    ///
    /// let msg: ByteSlice = "asdfheyasdf".into();
    ///
    /// // The first db doesn't stop the matching engine:
    /// let mut matches1: Vec<&str> = Vec::new();
    /// scratch.scan_sync(&set1, msg, |m| {
    ///   matches1.push(unsafe { m.captures.unwrap()[1].unwrap().as_str() });
    ///   ChimeraMatchResult::Continue
    /// }, |_| ChimeraMatchResult::Terminate)?;
    /// assert_eq!(&matches1, &["asdf"]);
    ///
    /// // The second db imposes a match limit, which triggers the second callback to return
    /// // `ChimeraMatchResult::Terminate`.
    /// let mut matches2: Vec<ChimeraMatchError> = Vec::new();
    /// let result = scratch.scan_sync(
    ///   &set2,
    ///   msg,
    ///   |_| unreachable!(),
    ///   |e| {
    ///     matches2.push(e);
    ///     ChimeraMatchResult::Terminate
    ///   },
    /// );
    /// assert!(matches![result, Err(ChimeraRuntimeError::ScanTerminated)]);
    /// assert_eq!(matches2.len(), 1);
    /// assert_eq!(matches2[0].error_type, ChimeraMatchErrorType::MatchLimit);
    /// # Ok(())
    /// # }
    /// ```
    pub fn with_limits(mut self, limits: ChimeraMatchLimits) -> Self {
      self.limits = Some(limits);
      self
    }

    /// Call [`ChimeraDb::compile_multi()`] with [`None`] for the platform.
    pub fn compile(self, mode: ChimeraMode) -> Result<ChimeraDb, ChimeraCompileError> {
      ChimeraDb::compile_multi(&self, mode, None)
    }

    /// The number of patterns in this set.
    pub fn len(&self) -> usize { self.ptrs.len() }

    /// Whether this set contains any patterns.
    pub fn is_empty(&self) -> bool { self.len() == 0 }

    /// The set-wide PCRE limits, if any were configured.
    pub(crate) fn limits(&self) -> Option<ChimeraMatchLimits> { self.limits }

    /// The number of patterns as a C `unsigned int`.
    ///
    /// Note that the `as` cast truncates if [`Self::len()`] exceeds
    /// `c_uint::MAX`.
    pub(crate) fn num_elements(&self) -> c_uint { self.len() as c_uint }

    /// Pointer to the array of pattern string pointers.
    pub(crate) fn expressions_ptr(&self) -> *const *const c_char { self.ptrs.as_ptr() }

    /// Pointer to the per-pattern flags array, or null if no flags were set.
    ///
    /// NOTE(review): reinterpreting `*const ChimeraFlags` as `*const c_uint`
    /// assumes `ChimeraFlags` has the same layout as `c_uint` — confirm at its
    /// definition.
    pub(crate) fn flags_ptr(&self) -> *const c_uint {
      self
        .flags
        .as_ref()
        .map_or(ptr::null(), |f| f.as_ptr().cast::<c_uint>())
    }

    /// Pointer to the per-pattern ID array, or null if no IDs were set.
    ///
    /// NOTE(review): reinterpreting `*const ExprId` as `*const c_uint` assumes
    /// `ExprId` has the same layout as `c_uint` — confirm at its definition.
    pub(crate) fn ids_ptr(&self) -> *const c_uint {
      self
        .ids
        .as_ref()
        .map_or(ptr::null(), |i| i.as_ptr().cast::<c_uint>())
    }
  }
}