vectorscan/expression.rs
1/* Copyright 2022-2023 Danny McClanahan */
2/* SPDX-License-Identifier: BSD-3-Clause */
3
4//! FFI wrappers for different types of pattern strings.
5//!
6//! Vectorscan supports 3 distinct types of pattern strings which can be formed
7//! to produce a database:
8//! - [`Expression`]: Vectorscan PCRE-like regex syntax (null-terminated
9//! [`CString`]).
10//! - [`Literal`]: Literal byte string (`Vec<u8>`) which may contain nulls.
11//! - [`chimera::ChimeraExpression`]: PCRE regex syntax.
12//!
13//! Each vectorscan database only supports matching against *exactly one* type
14//! of these patterns, but each pattern string variant also has a `*Set` form,
15//! and all of these forms support the same interface to vectorscan's most
16//! powerful feature: multi-pattern matching, where patterns registered with
17//! [`ExprId`] in a set can be associated to
18//! [`ExpressionIndex`](crate::matchers::ExpressionIndex) instances when matched
19//! against.
20//!
21//! Creating instances of these structs performs no pattern compilation itself,
22//! which is instead performed in a subsequent step by e.g.
23//! [`Database::compile()`]. References to these structs can be reused multiple
24//! times to create multiple databases without re-allocating the underlying
25//! pattern string data:
26//!
27//!```
28//! # #[allow(unused_variables)]
29//! # fn main() -> Result<(), vectorscan::error::VectorscanError> {
30//! use vectorscan::{expression::*, flags::*};
31//!
32//! let a: Expression = "a+".parse()?;
33//! let b: Expression = "b+".parse()?;
34//! let c: Expression = "c+".parse()?;
35//!
36//! let ab_db = ExpressionSet::from_exprs([&a, &b]).compile(Mode::BLOCK)?;
37//! let bc_db = ExpressionSet::from_exprs([&b, &c]).compile(Mode::BLOCK)?;
38//! let ca_db = ExpressionSet::from_exprs([&c, &a]).compile(Mode::BLOCK)?;
39//! # Ok(())
40//! # }
41//! ```
42
43use crate::{
44 database::Database,
45 error::{VectorscanCompileError, VectorscanRuntimeError},
46 flags::{ExtFlags, Flags, Mode},
47 hs,
48};
49
50use std::{
51 ffi::{CStr, CString},
52 fmt,
53 marker::PhantomData,
54 mem, ops,
55 os::raw::{c_char, c_uint, c_ulonglong},
56 ptr, slice, str,
57};
58
59/// Vectorscan regex pattern string.
60///
61/// Vectorscan itself supports a subset of PCRE syntax in the pattern string;
62/// see [Pattern Support] for reference. The use of unsupported constructs will
63/// result in compilation errors.
64///
65/// Note that as the underlying vectorscan library interprets pattern strings as
66/// null-terminated [`CStr`]s, null bytes are *not* supported within
67/// `Expression` strings. Use a [`Literal`] or [`LiteralSet`] database if you
68/// need to match against pattern strings containing explicit null bytes.
69///
70/// Instances can be created equivalently with [`Self::new()`] or
71/// [`str::parse()`] via the [`str::FromStr`] impl:
72///
73///```
74/// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
75/// use vectorscan::expression::Expression;
76///
77/// let e1: Expression = "asdf+".parse()?;
78/// let e2 = Expression::new("asdf+")?;
79/// assert_eq!(e1, e2);
80/// # Ok(())
81/// # }
82/// ```
83///
84/// [Pattern Support]: https://intel.github.io/hyperscan/dev-reference/compilation.html#pattern-support
// The wrapped `CString` owns the pattern bytes followed by a nul terminator;
// `as_ptr()` exposes that buffer to the native library and `as_bytes()` exposes
// it without the terminator.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Expression(CString);
87
88impl fmt::Display for Expression {
89 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
90 let b = self.as_bytes();
91 match str::from_utf8(b) {
92 Ok(s) => write!(f, "{}", s),
93 Err(_) => write!(f, "(non-utf8: {:?})", b),
94 }
95 }
96}
97
98impl Expression {
99 /// Reference the underlying bytes, *without* the trailing null terminator.
100 ///
101 ///```
102 /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
103 /// let e = vectorscan::expression::Expression::new("asdf")?;
104 /// assert_eq!(e.as_bytes(), b"asdf");
105 /// # Ok(())
106 /// # }
107 /// ```
108 pub fn as_bytes(&self) -> &[u8] { self.0.as_bytes() }
109
110 pub(crate) fn as_ptr(&self) -> *const c_char { self.0.as_c_str().as_ptr() }
111
112 /// Produce a `NULL`-terminated C-style wrapper for the given pattern string.
113 ///
114 /// This will fail if the string contains any internal `NULL` bytes, as those
115 /// are not supported by the vectorscan regex compiler:
116 ///```
117 /// use vectorscan::{expression::*, error::*};
118 ///
119 /// let pat = "as\0df";
120 /// let e = match Expression::new(pat) {
121 /// Err(VectorscanCompileError::NullByte(e)) => e,
122 /// _ => unreachable!(),
123 /// };
124 /// assert_eq!(e.nul_position(), 2);
125 /// ```
126 pub fn new(x: impl Into<Vec<u8>>) -> Result<Self, VectorscanCompileError> {
127 Ok(Self(CString::new(x)?))
128 }
129
130 /// Utility function providing information about a regular expression. The
131 /// information provided in [`info::ExprInfo`] includes the minimum and
132 /// maximum width of a pattern match.
133 ///
134 /// Note: successful analysis of an expression with this function does not
135 /// imply that compilation of the same expression (via
136 /// [`Database::compile()`] or [`Database::compile_multi()`]) would succeed.
137 /// This function may return [`Ok`] for regular expressions that
138 /// Vectorscan cannot compile.
139 ///
140 /// Note: some per-pattern flags (such as [`Flags::ALLOWEMPTY`] and
141 /// [`Flags::SOM_LEFTMOST`]) are accepted by this call, but as they do not
142 /// affect the properties returned in the [`info::ExprInfo`] structure,
143 /// they will not affect the outcome of this function.
144 ///
145 ///```
146 /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
147 /// use vectorscan::{expression::{*, info::*}, flags::Flags};
148 ///
149 /// let expr: Expression = "(he)llo".parse()?;
150 ///
151 /// let info = expr.info(Flags::default())?;
152 ///
153 /// assert_eq!(info, ExprInfo {
154 /// min_width: ExprWidth(5),
155 /// max_width: Some(ExprWidth(5)),
156 /// unordered_matches: UnorderedMatchBehavior::OnlyOrdered,
157 /// matches_at_eod: MatchAtEndBehavior::WillNeverMatchAtEOD,
158 /// });
159 /// # Ok(())
160 /// # }
161 /// ```
162 pub fn info(&self, flags: Flags) -> Result<info::ExprInfo, VectorscanCompileError> {
163 let mut info = ptr::null_mut();
164 let mut compile_err = ptr::null_mut();
165 VectorscanRuntimeError::copy_from_native_compile_error(
166 unsafe {
167 hs::hs_expression_info(
168 self.as_ptr(),
169 flags.into_native(),
170 &mut info,
171 &mut compile_err,
172 )
173 },
174 compile_err,
175 )?;
176
177 let ret = info::ExprInfo::from_native(unsafe { *info });
178
179 unsafe {
180 crate::free_misc(info as *mut u8);
181 }
182
183 Ok(ret)
184 }
185
186 /// Utility function providing information about a regular expression, with
187 /// extended parameter support. The information provided in [`info::ExprInfo`]
188 /// includes the minimum and maximum width of a pattern match.
189 ///
190 /// Note: successful analysis of an expression with this function does not
191 /// imply that compilation of the same expression (via
192 /// [`Database::compile()`] or [`Database::compile_multi()`]) would succeed.
193 /// This function may return [`Ok`] for regular expressions that
194 /// Vectorscan cannot compile.
195 ///
196 /// Note: some per-pattern flags (such as [`Flags::ALLOWEMPTY`] and
197 /// [`Flags::SOM_LEFTMOST`]) are accepted by this call, but as they do not
198 /// affect the properties returned in the [`info::ExprInfo`] structure,
199 /// they will not affect the outcome of this function.
200 ///
201 ///```
202 /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
203 /// use vectorscan::{expression::{*, info::*}, flags::Flags};
204 ///
205 /// let expr: Expression = ".*lo".parse()?;
206 ///
207 /// let ext = ExprExt::from_min_length(4);
208 ///
209 /// let info = expr.ext_info(Flags::default(), &ext)?;
210 ///
211 /// assert_eq!(info, ExprInfo {
212 /// min_width: ExprWidth(4),
213 /// max_width: None,
214 /// unordered_matches: UnorderedMatchBehavior::OnlyOrdered,
215 /// matches_at_eod: MatchAtEndBehavior::WillNeverMatchAtEOD,
216 /// });
217 /// # Ok(())
218 /// # }
219 /// ```
220 pub fn ext_info(
221 &self,
222 flags: Flags,
223 ext_flags: &ExprExt,
224 ) -> Result<info::ExprInfo, VectorscanCompileError> {
225 let mut info = ptr::null_mut();
226 let mut compile_err = ptr::null_mut();
227 VectorscanRuntimeError::copy_from_native_compile_error(
228 unsafe {
229 hs::hs_expression_ext_info(
230 self.as_ptr(),
231 flags.into_native(),
232 ext_flags.as_ref_native(),
233 &mut info,
234 &mut compile_err,
235 )
236 },
237 compile_err,
238 )?;
239
240 let ret = info::ExprInfo::from_native(unsafe { *info });
241
242 unsafe {
243 crate::free_misc(info as *mut u8);
244 }
245
246 Ok(ret)
247 }
248
249 /// Call [`Database::compile()`] with [`None`] for the platform.
250 pub fn compile(&self, flags: Flags, mode: Mode) -> Result<Database, VectorscanCompileError> {
251 Database::compile(self, flags, mode, None)
252 }
253}
254
255impl str::FromStr for Expression {
256 type Err = VectorscanCompileError;
257
258 fn from_str(s: &str) -> Result<Self, Self::Err> { Self::new(s) }
259}
260
261/// A literal byte string.
262///
263/// Unlike for [`Expression`], [`Database::compile_literal()`] will parse the
264/// string content in a literal sense without any regular grammars. For example,
265/// the expression `abc?` simply means a char sequence of `a`, `b`, `c`,
266/// and `?`. The `?` here doesn't mean 0 or 1 quantifier under regular
267/// semantics.
268///
269/// Also unlike [`Expression`], the underlying vectorscan library interprets
270/// literal patterns with a pointer and a length instead of a `NULL`-terminated
271/// string. **Importantly, this allows it to contain `\0` or `NULL` bytes
272/// itself!**
273///
274/// Finally note that literal expressions do not support an "info" interface
275/// like [`Expression::info()`] and [`Expression::ext_info()`], since most of
276/// those properties can be inferred from the literal string itself.
277///
278/// Instances can be created equivalently with [`Self::new()`] or
279/// [`str::parse()`] via the [`str::FromStr`] impl:
280///```
281/// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
282/// use vectorscan::expression::Literal;
283///
284/// let e1: Literal = "as\0df".parse()?;
285/// let e2 = Literal::new("as\0df")?;
286/// assert_eq!(e1, e2);
287/// # Ok(())
288/// # }
289/// ```
// The bytes are stored exactly as given to `new()`, with no nul terminator
// appended. `Debug` is implemented by hand rather than derived.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Literal(Vec<u8>);
292
293impl fmt::Debug for Literal {
294 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
295 let b = self.as_bytes();
296 match str::from_utf8(b) {
297 Ok(s) => write!(f, "Literal({:?})", s),
298 Err(_) => write!(f, "Literal({:?})", b),
299 }
300 }
301}
302
303impl fmt::Display for Literal {
304 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
305 let b = self.as_bytes();
306 match str::from_utf8(b) {
307 Ok(s) => write!(f, "{}", s),
308 Err(_) => write!(f, "(non-utf8 literal: {:?})", b),
309 }
310 }
311}
312
313impl Literal {
314 /// Reference the underlying bytes. This wrapper does *not* allocate any null
315 /// terminator.
316 ///
317 ///```
318 /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
319 /// let e = vectorscan::expression::Literal::new("as\0df")?;
320 /// assert_eq!(e.as_bytes(), b"as\0df");
321 /// # Ok(())
322 /// # }
323 /// ```
324 pub fn as_bytes(&self) -> &[u8] { &self.0 }
325
326 pub(crate) fn as_ptr(&self) -> *const c_char {
327 unsafe { mem::transmute(self.as_bytes().as_ptr()) }
328 }
329
330 /// Wrap a byte slice to be interpreted literally. This does *not* allocate
331 /// any null terminator.
332 pub fn new(x: impl Into<Vec<u8>>) -> Result<Self, VectorscanCompileError> { Ok(Self(x.into())) }
333
334 /// Call [`Database::compile_literal()`] with [`None`] for the platform.
335 pub fn compile(&self, flags: Flags, mode: Mode) -> Result<Database, VectorscanCompileError> {
336 Database::compile_literal(self, flags, mode, None)
337 }
338}
339
340impl str::FromStr for Literal {
341 type Err = VectorscanCompileError;
342
343 fn from_str(s: &str) -> Result<Self, Self::Err> { Self::new(s) }
344}
345
346/// The ID number to associate with a pattern match in an expression set.
347///
348/// When provided to an expression set, this value is converted into an
349/// [`ExpressionIndex`](crate::matchers::ExpressionIndex) in a
350/// [`Match`](crate::matchers::Match),
351/// [`VectoredMatch`](crate::matchers::VectoredMatch), or
352/// [`ChimeraMatch`](crate::matchers::chimera::ChimeraMatch) upon matching the
353/// given pattern.
354///
355/// This ID is used in [`ExpressionSet::with_ids()`],
356/// [`LiteralSet::with_ids()`], and
357/// [`ChimeraExpressionSet::with_ids()`](chimera::ChimeraExpressionSet::with_ids).
// `#[repr(transparent)]` is load-bearing: `ExpressionSet::ids_ptr()` hands a
// pointer to a `Vec<ExprId>` buffer out as `*const c_uint`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
pub struct ExprId(pub c_uint);
361
362/// Collection of regular expressions.
363///
364/// This is the main entry point to vectorscan's primary functionality: matching
365/// against sets of patterns at once, which is typically poorly supported or
366/// less featureful than single-pattern matching in many other regex engines.
367///
368/// This struct provides an immutable (returning `Self`) builder interface
369/// to attach additional configuration to the initial set of patterns
370/// constructed with [`Self::from_exprs()`].
#[derive(Clone)]
pub struct ExpressionSet<'a> {
  // One pointer per pattern, obtained from `Expression::as_ptr()` in
  // `from_exprs()`.
  ptrs: Vec<*const c_char>,
  // `None` until `with_flags()` is called; then one entry per pattern (the
  // builder asserts the lengths match).
  flags: Option<Vec<Flags>>,
  // `None` until `with_ids()` is called; then one entry per pattern.
  ids: Option<Vec<ExprId>>,
  // `None` until `with_exts()` is called; then one entry per pattern, holding a
  // null pointer for each pattern given no `ExprExt`.
  exts: Option<Vec<*const hs::hs_expr_ext>>,
  // Carries the lifetime `'a` of the expressions and `ExprExt`s borrowed by the
  // builder methods, since the raw pointers above do not.
  _ph: PhantomData<&'a u8>,
}
379
380impl<'a> fmt::Debug for ExpressionSet<'a> {
381 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
382 let exprs: Vec<&'a CStr> = self
383 .ptrs
384 .iter()
385 .map(|p| unsafe { CStr::from_ptr(*p) })
386 .collect();
387 let exts: Option<&[Option<&ExprExt>]> = self
388 .exts
389 .as_ref()
390 .map(|exts| unsafe { slice::from_raw_parts(mem::transmute(exts.as_ptr()), exprs.len()) });
391 write!(
392 f,
393 "ExpressionSet(exprs={:?}, flags={:?}, ids={:?}, exts={:?})",
394 exprs, &self.flags, &self.ids, exts,
395 )
396 }
397}
398
399impl<'a> ExpressionSet<'a> {
400 /// Construct a pattern set from references to parsed expressions.
401 ///
402 /// The length of this initial `exprs` argument is returned by
403 /// [`Self::len()`], and all subsequent configuration methods are checked to
404 /// provide iterators of the same length:
405 ///
406 ///```should_panic
407 /// use vectorscan::expression::*;
408 ///
409 /// let a: Expression = "a+".parse().unwrap();
410 /// // Fails due to argument length mismatch:
411 /// ExpressionSet::from_exprs([&a])
412 /// .with_flags([]);
413 /// ```
414 pub fn from_exprs(exprs: impl IntoIterator<Item=&'a Expression>) -> Self {
415 Self {
416 ptrs: exprs.into_iter().map(|e| e.as_ptr()).collect(),
417 flags: None,
418 ids: None,
419 exts: None,
420 _ph: PhantomData,
421 }
422 }
423
424 /// Provide flags which modify the behavior of each expression.
425 ///
426 /// The length of `flags` is checked to be the same as [`Self::len()`].
427 ///
428 /// If this builder method is not used, [`Flags::default()`] will be assigned
429 /// to all patterns.
430 ///
431 ///```
432 /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
433 /// use vectorscan::{expression::*, flags::*, matchers::*};
434 ///
435 /// // Create two expressions to demonstrate separate flags for each pattern:
436 /// let a: Expression = "a+[^a]".parse()?;
437 /// let b: Expression = "b+[^b]".parse()?;
438 ///
439 /// // Get the start of match for one pattern, but not the other:
440 /// let db = ExpressionSet::from_exprs([&a, &b])
441 /// .with_flags([Flags::default(), Flags::SOM_LEFTMOST])
442 /// .compile(Mode::BLOCK)?;
443 ///
444 /// let mut scratch = db.allocate_scratch()?;
445 ///
446 /// let mut matches: Vec<&str> = Vec::new();
447 /// scratch.scan_sync(&db, "aardvark imbibbe".into(), |m| {
448 /// matches.push(unsafe { m.source.as_str() });
449 /// MatchResult::Continue
450 /// })?;
451 /// // Start of match is preserved for only one pattern:
452 /// assert_eq!(&matches, &["aar", "aardvar", "bi", "bbe"]);
453 /// # Ok(())
454 /// # }
455 /// ```
456 pub fn with_flags(mut self, flags: impl IntoIterator<Item=Flags>) -> Self {
457 let flags: Vec<_> = flags.into_iter().collect();
458 assert_eq!(self.len(), flags.len());
459 self.flags = Some(flags);
460 self
461 }
462
463 /// Assign an ID number to each pattern.
464 ///
465 /// The length of `ids` is checked to be the same as [`Self::len()`]. Multiple
466 /// patterns can be assigned the same ID.
467 ///
468 /// If this builder method is not used, vectorscan will assign them all the ID
469 /// number 0:
470 ///
471 ///```
472 /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
473 /// use vectorscan::{expression::*, flags::*, state::*, matchers::*, sources::*};
474 ///
475 /// // Create two expressions to demonstrate multiple pattern IDs.
476 /// let a: Expression = "a+[^a]".parse()?;
477 /// let b: Expression = "b+[^b]".parse()?;
478 ///
479 /// // Create one db with ID numbers, and one without.
480 /// let set1 = ExpressionSet::from_exprs([&a, &b]).compile(Mode::BLOCK)?;
481 /// let set2 = ExpressionSet::from_exprs([&a, &b])
482 /// .with_ids([ExprId(300), ExprId(12)])
483 /// .compile(Mode::BLOCK)?;
484 ///
485 /// let mut scratch = Scratch::blank();
486 /// scratch.setup_for_db(&set1)?;
487 /// scratch.setup_for_db(&set2)?;
488 ///
489 /// let msg: ByteSlice = "aardvark imbibbe".into();
490 ///
491 /// // The first db doesn't differentiate matches by ID number:
492 /// let mut matches1: Vec<ExpressionIndex> = Vec::new();
493 /// scratch.scan_sync(&set1, msg, |m| {
494 /// matches1.push(m.id);
495 /// MatchResult::Continue
496 /// })?;
497 /// assert_eq!(
498 /// &matches1,
499 /// &[ExpressionIndex(0), ExpressionIndex(0), ExpressionIndex(0), ExpressionIndex(0)],
500 /// );
501 ///
502 /// // The second db returns corresponding ExpressionIndex instances:
503 /// let mut matches2: Vec<ExpressionIndex> = Vec::new();
504 /// scratch.scan_sync(&set2, msg, |m| {
505 /// matches2.push(m.id);
506 /// MatchResult::Continue
507 /// })?;
508 /// assert_eq!(
509 /// &matches2,
510 /// &[ExpressionIndex(300), ExpressionIndex(300), ExpressionIndex(12), ExpressionIndex(12)],
511 /// );
512 /// # Ok(())
513 /// # }
514 /// ```
515 pub fn with_ids(mut self, ids: impl IntoIterator<Item=ExprId>) -> Self {
516 let ids: Vec<_> = ids.into_iter().collect();
517 assert_eq!(self.len(), ids.len());
518 self.ids = Some(ids);
519 self
520 }
521
522 /// Optionally assign [`ExprExt`] configuration to each pattern.
523 ///
524 /// This is the only available entry point to compiling a database with
525 /// [`ExprExt`] configuration for a given pattern (i.e. the single
526 /// expression compiler does not support extended configuration).
527 ///
528 /// If [`Expression::ext_info()`] succeeds with a given
529 /// [`Expression`]/[`ExprExt`] pair, then compiling the same pattern and
530 /// configuration into a vectorscan database via an expression set with this
531 /// method is likely but not guaranteed to succeed.
532 ///
533 ///```
534 /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
535 /// use vectorscan::{expression::*, flags::*, matchers::*};
536 ///
537 /// // Apply extended configuration to one version of the pattern, but not the other:
538 /// let a: Expression = "a.*b".parse()?;
539 /// let a_ext = ExprExt::from_min_length(4);
540 /// let set = ExpressionSet::from_exprs([&a, &a])
541 /// .with_exts([Some(&a_ext), None])
542 /// .with_ids([ExprId(1), ExprId(2)])
543 /// .compile(Mode::BLOCK)?;
544 /// let mut scratch = set.allocate_scratch()?;
545 ///
546 /// // The configured pattern does not match because of its min length attribute:
547 /// let mut matches: Vec<ExpressionIndex> = Vec::new();
548 /// scratch.scan_sync(&set, "ab".into(), |m| {
549 /// matches.push(m.id);
550 /// MatchResult::Continue
551 /// })?;
552 /// assert_eq!(&matches, &[ExpressionIndex(2)]);
553 ///
554 /// // Howver, both patterns match a longer input:
555 /// matches.clear();
556 /// scratch.scan_sync(&set, "asssssb".into(), |m| {
557 /// matches.push(m.id);
558 /// MatchResult::Continue
559 /// })?;
560 /// assert_eq!(&matches, &[ExpressionIndex(1), ExpressionIndex(2)]);
561 /// # Ok(())
562 /// # }
563 /// ```
564 pub fn with_exts(mut self, exts: impl IntoIterator<Item=Option<&'a ExprExt>>) -> Self {
565 let exts: Vec<*const hs::hs_expr_ext> = exts
566 .into_iter()
567 .map(|e| {
568 e.map(|e| e.as_ref_native() as *const hs::hs_expr_ext)
569 .unwrap_or(ptr::null())
570 })
571 .collect();
572 assert_eq!(self.len(), exts.len());
573 self.exts = Some(exts);
574 self
575 }
576
577 /// Call [`Database::compile_multi()`] with [`None`] for the platform.
578 pub fn compile(self, mode: Mode) -> Result<Database, VectorscanCompileError> {
579 Database::compile_multi(&self, mode, None)
580 }
581
582 /// The number of patterns in this set.
583 pub fn len(&self) -> usize { self.ptrs.len() }
584
585 /// Whether this set contains any patterns.
586 pub fn is_empty(&self) -> bool { self.len() == 0 }
587
588 pub(crate) fn num_elements(&self) -> c_uint { self.len() as c_uint }
589
590 pub(crate) fn exts_ptr(&self) -> Option<*const *const hs::hs_expr_ext> {
591 self.exts.as_ref().map(|e| e.as_ptr())
592 }
593
594 pub(crate) fn expressions_ptr(&self) -> *const *const c_char { self.ptrs.as_ptr() }
595
596 pub(crate) fn flags_ptr(&self) -> *const c_uint {
597 self
598 .flags
599 .as_ref()
600 .map(|f| unsafe { mem::transmute(f.as_ptr()) })
601 .unwrap_or(ptr::null())
602 }
603
604 pub(crate) fn ids_ptr(&self) -> *const c_uint {
605 self
606 .ids
607 .as_ref()
608 .map(|i| unsafe { mem::transmute(i.as_ptr()) })
609 .unwrap_or(ptr::null())
610 }
611}
612
613/// Data produced by vectorscan to analyze a particular expression.
614///
615/// These structs cover the output of [`Expression::info()`] and
616/// [`Expression::ext_info()`].
617pub mod info {
618 use crate::hs;
619
620 use displaydoc::Display;
621
622 use std::os::raw::{c_char, c_uint};
623
624 /// The upper or lower bound for the length of any matches returned by a
625 /// pattern.
626 #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
627 #[repr(transparent)]
628 pub struct ExprWidth(pub usize);
629
630 impl ExprWidth {
631 pub(crate) const fn parse_min_width(x: c_uint) -> Self { Self(x as usize) }
632
633 pub(crate) const fn parse_max_width(x: c_uint) -> Option<Self> {
634 if x == c_uint::MAX {
635 None
636 } else {
637 Some(Self(x as usize))
638 }
639 }
640 }
641
642 /// Whether the expression can produce matches that are not returned in order,
643 /// such as those produced by assertions.
  // NOTE: `Display` here is `displaydoc::Display`, which builds each variant's
  // displayed text from its doc comment, so editing the variant doc comments
  // below changes runtime output.
  #[derive(
    Debug,
    Display,
    Copy,
    Clone,
    PartialEq,
    Eq,
    PartialOrd,
    Ord,
    Hash,
    num_enum::IntoPrimitive,
    num_enum::FromPrimitive,
  )]
  #[repr(i8)]
  pub enum UnorderedMatchBehavior {
    /// Disallows matches that are not returned in order.
    // NOTE(review): `#[num_enum(default)]` presumably makes this the fallback
    // for unknown values, whereas `from_native()` maps every nonzero input to
    // `AllowsUnordered` — confirm the difference is intended.
    #[num_enum(default)]
    OnlyOrdered = 0,
    /// Allows matches that are not returned in order.
    AllowsUnordered = 1,
  }
665
666 impl UnorderedMatchBehavior {
667 pub(crate) const fn from_native(x: c_char) -> Self {
668 if x == 0 {
669 Self::OnlyOrdered
670 } else {
671 Self::AllowsUnordered
672 }
673 }
674 }
675
676 /// Whether this expression can produce matches at end of data (EOD).
677 ///
678 /// In streaming mode, EOD matches are raised during
679 /// [`Scratch::flush_eod_sync()`](crate::state::Scratch::flush_eod_sync) or
680 /// [`Scratch::flush_eod_sync()`](crate::state::Scratch::flush_eod_sync),
681 /// since it is only when `flush_eod()` is called that the EOD location is
682 /// known.
683 ///
684 /// Note: trailing `\b` word boundary assertions may also result in EOD
685 /// matches as end-of-data can act as a word boundary.
  // NOTE: `Display` here is `displaydoc::Display`, which builds each variant's
  // displayed text from its doc comment, so editing the variant doc comments
  // below changes runtime output.
  #[derive(Debug, Display, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
  #[repr(i8)]
  pub enum MatchAtEndBehavior {
    /// Pattern will never match at EOD.
    WillNeverMatchAtEOD,
    /// Pattern *may* match at EOD.
    MayMatchAtEOD,
    /// Pattern will *only* match at EOD.
    WillOnlyMatchAtEOD,
  }
696
697 impl MatchAtEndBehavior {
698 pub(crate) fn from_native(matches_at_eod: c_char, matches_only_at_eod: c_char) -> Self {
699 match (matches_at_eod, matches_only_at_eod) {
700 (0, 0) => Self::WillNeverMatchAtEOD,
701 (x, 0) if x != 0 => Self::MayMatchAtEOD,
702 (_, x) if x != 0 => Self::WillOnlyMatchAtEOD,
703 x => unreachable!("unreachable pattern: {:?}", x),
704 }
705 }
706 }
707
708 /// Data produced by vectorscan to analyze a particular expression.
709 ///
710 /// This struct is produced by [`super::Expression::info()`]:
711 ///
712 ///```
713 /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
714 /// use vectorscan::{expression::{*, info::*}, flags::Flags};
715 ///
716 /// let expr: Expression = "(he)llo$".parse()?;
717 /// let info = expr.info(Flags::default())?;
718 /// assert_eq!(info, ExprInfo {
719 /// min_width: ExprWidth(5),
720 /// max_width: Some(ExprWidth(5)),
721 /// unordered_matches: UnorderedMatchBehavior::AllowsUnordered,
722 /// matches_at_eod: MatchAtEndBehavior::WillOnlyMatchAtEOD,
723 /// });
724 /// # Ok(())
725 /// # }
726 /// ```
727 ///
728 /// as well as [`super::Expression::ext_info()`]:
729 ///
730 ///```
731 /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
732 /// use vectorscan::{expression::{*, info::*}, flags::Flags};
733 ///
734 /// let expr: Expression = ".*lo($)?".parse()?;
735 /// let ext = ExprExt::from_min_length(4);
736 /// let info = expr.ext_info(Flags::default(), &ext)?;
737 /// assert_eq!(info, ExprInfo {
738 /// min_width: ExprWidth(4),
739 /// max_width: None,
740 /// unordered_matches: UnorderedMatchBehavior::AllowsUnordered,
741 /// matches_at_eod: MatchAtEndBehavior::MayMatchAtEOD,
742 /// });
743 /// # Ok(())
744 /// # }
745 /// ```
  // The derived `PartialOrd`/`Ord` compare fields in declaration order, so
  // reordering the fields below changes how `ExprInfo` values sort.
  #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
  pub struct ExprInfo {
    /// The minimum length in bytes of a match for the pattern. If the pattern
    /// has an unbounded minimum length, this will be 0.
    ///
    /// Note: in some cases when using advanced features to suppress matches
    /// (such as extended parameters or
    /// [`Flags::SINGLEMATCH`](crate::flags::Flags::SINGLEMATCH)) this
    /// may represent a conservative lower bound for the true minimum length of
    /// a match.
    pub min_width: ExprWidth,
    /// The maximum length in bytes of a match for the pattern. If the pattern
    /// has an unbounded maximum length, this will be [`None`].
    ///
    /// Note: in some cases when using advanced features to suppress matches
    /// (such as extended parameters or
    /// [`Flags::SINGLEMATCH`](crate::flags::Flags::SINGLEMATCH)) this
    /// may represent a conservative upper bound for the true maximum length of
    /// a match.
    pub max_width: Option<ExprWidth>,
    /// Whether this expression can produce matches that are not returned in
    /// order, such as those produced by assertions.
    pub unordered_matches: UnorderedMatchBehavior,
    /// Whether this expression can produce matches at end of data (EOD).
    ///
    /// In streaming mode, EOD matches are raised during
    /// [`Scratch::flush_eod_sync()`](crate::state::Scratch::flush_eod_sync) or
    /// [`Scratch::flush_eod_sync()`](crate::state::Scratch::flush_eod_sync),
    /// since it is only when `flush_eod()` is called that the EOD location
    /// is known.
    ///
    /// Note: trailing `\b` word boundary assertions may also result in EOD
    /// matches as end-of-data can act as a word boundary.
    pub matches_at_eod: MatchAtEndBehavior,
  }
781
782 impl ExprInfo {
783 pub(crate) fn from_native(x: hs::hs_expr_info) -> Self {
784 let hs::hs_expr_info {
785 min_width,
786 max_width,
787 unordered_matches,
788 matches_at_eod,
789 matches_only_at_eod,
790 } = x;
791 let min_width = ExprWidth::parse_min_width(min_width);
792 let max_width = ExprWidth::parse_max_width(max_width);
793 let unordered_matches = UnorderedMatchBehavior::from_native(unordered_matches);
794 let matches_at_eod = MatchAtEndBehavior::from_native(matches_at_eod, matches_only_at_eod);
795 Self {
796 min_width,
797 max_width,
798 unordered_matches,
799 matches_at_eod,
800 }
801 }
802 }
803}
804
805/// Configuration for extended vectorscan parameters.
806///
807/// These parameters cover various types of fuzzy search as well as input
808/// subsetting features. See [Extended Parameters] for a further reference.
809///
810/// [Extended Parameters]: https://intel.github.io/hyperscan/dev-reference/compilation.html#extparam
811///
812/// This structure may be passed in when building a database with
813/// [`ExpressionSet::with_exts()`], or used to interrogate a single expression
814/// with [`Expression::ext_info()`].
815///
816/// Like many other flags arguments, this struct also supports [`ops::BitOr`]
817/// and the `|` operator for composition:
818///
819///```
820/// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
821/// use vectorscan::{expression::*, flags::*, matchers::*, sources::*};
822///
823/// // Apply extended configuration to one version of the pattern, but not the other:
824/// let a: Expression = "ab".parse()?;
825/// let ext = ExprExt::from_min_offset(3) | ExprExt::from_max_offset(15);
826/// let set = ExpressionSet::from_exprs([&a, &a])
827/// .with_exts([Some(&ext), None])
828/// .with_ids([ExprId(1), ExprId(2)])
829/// .compile(Mode::BLOCK)?;
830/// let mut scratch = set.allocate_scratch()?;
831///
832/// let msg: ByteSlice = "ab ab ab".into();
833///
834/// let mut matches: Vec<ExpressionIndex> = Vec::new();
835/// scratch.scan_sync(&set, msg, |m| {
836/// matches.push(m.id);
837/// MatchResult::Continue
838/// })?;
839///
840/// // The configured pattern misses out on the first and last match of "ab":
841/// assert_eq!(&matches, &[
842/// ExpressionIndex(2), ExpressionIndex(1), ExpressionIndex(2), ExpressionIndex(2),
843/// ]);
844/// # Ok(())
845/// # }
846/// ```
// `#[repr(transparent)]` is load-bearing: the `Debug` impl for `ExpressionSet`
// views an array of `*const hs::hs_expr_ext` as `Option<&ExprExt>` entries.
#[derive(Debug, Copy, Clone)]
#[repr(transparent)]
pub struct ExprExt(hs::hs_expr_ext);
850
851impl Default for ExprExt {
852 fn default() -> Self { Self::zeroed() }
853}
854
855impl ExprExt {
856 /// Generate an empty instance with all features disabled.
857 /* FIXME: make this const when const zeroed() is stabilized! */
858 pub fn zeroed() -> Self { unsafe { mem::MaybeUninit::zeroed().assume_init() } }
859
860 /// The minimum end offset in the data stream at which this expression should
861 /// match successfully.
862 pub fn from_min_offset(x: usize) -> Self {
863 let ext_flags = ExtFlags::MIN_OFFSET;
864 let mut s = Self::zeroed();
865 s.0.flags = ext_flags.into_native();
866 s.0.min_offset = x as c_ulonglong;
867 s
868 }
869
870 /// The maximum end offset in the data stream at which this expression should
871 /// match successfully.
872 pub fn from_max_offset(x: usize) -> Self {
873 let ext_flags = ExtFlags::MAX_OFFSET;
874 let mut s = Self::zeroed();
875 s.0.flags = ext_flags.into_native();
876 s.0.max_offset = x as c_ulonglong;
877 s
878 }
879
880 /// The minimum match length (from start to end) required to successfully
881 /// match this expression.
882 ///
883 /// This is one alternative to the use of [`Flags::ALLOWEMPTY`].
884 ///
885 /// This does not require [`Flags::SOM_LEFTMOST`]:
886 ///
887 ///```
888 /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
889 /// use vectorscan::{expression::*, flags::*, matchers::*, sources::*};
890 ///
891 /// let a: Expression = "a.*b".parse()?;
892 /// let ext = ExprExt::from_min_length(4);
893 /// let set = ExpressionSet::from_exprs([&a, &a])
894 /// // #1 has no min_length, #2 does:
895 /// .with_exts([None, Some(&ext)])
896 /// .with_ids([ExprId(1), ExprId(2)])
897 /// .compile(Mode::BLOCK)?;
898 /// let mut scratch = set.allocate_scratch()?;
899 ///
900 /// let msg: ByteSlice = " ab ab ".into();
901 ///
902 /// let mut matches: Vec<(u32, &str)> = Vec::new();
903 /// scratch.scan_sync(&set, msg, |m| {
904 /// matches.push((m.id.0, unsafe { m.source.as_str() }));
905 /// MatchResult::Continue
906 /// })?;
907 ///
908 /// assert_eq!(&matches, &[
909 /// // Without min_length, both matches show up:
910 /// (1, " ab"),
911 /// (1, " ab ab"),
912 /// // SOM_LEFTMOST is disabled, so we don't know the match start,
913 /// // but the min_length property is correctly applied regardless:
914 /// (2, " ab ab"),
915 /// ]);
916 /// # Ok(())
917 /// # }
918 /// ```
919 pub fn from_min_length(x: usize) -> Self {
920 let ext_flags = ExtFlags::MIN_LENGTH;
921 let mut s = Self::zeroed();
922 s.0.flags = ext_flags.into_native();
923 s.0.min_length = x as c_ulonglong;
924 s
925 }
926
927 /// Allow patterns to approximately match within this [edit distance](https://en.wikipedia.org/wiki/Edit_distance).
928 pub fn from_edit_distance(x: usize) -> Self {
929 let ext_flags = ExtFlags::EDIT_DISTANCE;
930 let mut s = Self::zeroed();
931 s.0.flags = ext_flags.into_native();
932 assert!(x < c_uint::MAX as usize);
933 s.0.edit_distance = x as c_uint;
934 s
935 }
936
937 /// Allow patterns to approximately match within this [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance).
938 pub fn from_hamming_distance(x: usize) -> Self {
939 let ext_flags = ExtFlags::HAMMING_DISTANCE;
940 let mut s = Self::zeroed();
941 s.0.flags = ext_flags.into_native();
942 assert!(x < c_uint::MAX as usize);
943 s.0.hamming_distance = x as c_uint;
944 s
945 }
946
947 const fn ext_flags(&self) -> ExtFlags { ExtFlags::from_native(self.0.flags) }
948
949 fn min_offset(&self) -> Option<c_ulonglong> {
950 if self.ext_flags().has_min_offset() {
951 Some(self.0.min_offset)
952 } else {
953 None
954 }
955 }
956
957 fn max_offset(&self) -> Option<c_ulonglong> {
958 if self.ext_flags().has_max_offset() {
959 Some(self.0.max_offset)
960 } else {
961 None
962 }
963 }
964
965 fn min_length(&self) -> Option<c_ulonglong> {
966 if self.ext_flags().has_min_length() {
967 Some(self.0.min_length)
968 } else {
969 None
970 }
971 }
972
973 fn edit_distance(&self) -> Option<c_uint> {
974 if self.ext_flags().has_edit_distance() {
975 Some(self.0.edit_distance)
976 } else {
977 None
978 }
979 }
980
981 fn hamming_distance(&self) -> Option<c_uint> {
982 if self.ext_flags().has_hamming_distance() {
983 Some(self.0.hamming_distance)
984 } else {
985 None
986 }
987 }
988
989 fn compose(mut self, rhs: Self) -> Self {
990 self.0.flags = (self.ext_flags() | rhs.ext_flags()).into_native();
991 if let Some(min_offset) = rhs.min_offset() {
992 self.0.min_offset = min_offset;
993 }
994 if let Some(max_offset) = rhs.max_offset() {
995 self.0.max_offset = max_offset;
996 }
997 if let Some(min_length) = rhs.min_length() {
998 self.0.min_length = min_length;
999 }
1000 if let Some(edit_distance) = rhs.edit_distance() {
1001 self.0.edit_distance = edit_distance;
1002 }
1003 if let Some(hamming_distance) = rhs.hamming_distance() {
1004 self.0.hamming_distance = hamming_distance;
1005 }
1006 self
1007 }
1008
1009 pub(crate) fn as_ref_native(&self) -> &hs::hs_expr_ext { &self.0 }
1010}
1011
1012impl ops::BitOr for ExprExt {
1013 type Output = Self;
1014
1015 fn bitor(self, other: Self) -> Self { self.compose(other) }
1016}
1017
1018impl ops::BitOrAssign for ExprExt {
1019 fn bitor_assign(&mut self, rhs: Self) {
1020 use ops::BitOr;
1021 *self = self.bitor(rhs);
1022 }
1023}
1024
1025/// Collection of literals.
1026///
1027/// This is the analogue to [`ExpressionSet`] for [`Literal`] expressions, which
1028/// cannot be combined with [`Expression`] patterns in the same database.
1029///
1030/// This struct provides an immutable (returning `Self`) builder interface
1031/// to attach additional configuration to the initial set of patterns
1032/// constructed with [`Self::from_lits()`].
1033#[derive(Clone)]
1034pub struct LiteralSet<'a> {
1035 ptrs: Vec<*const c_char>,
1036 lens: Vec<usize>,
1037 flags: Option<Vec<Flags>>,
1038 ids: Option<Vec<ExprId>>,
1039 _ph: PhantomData<&'a u8>,
1040}
1041
1042impl<'a> fmt::Debug for LiteralSet<'a> {
1043 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1044 let exprs: Vec<&'a [u8]> = self
1045 .ptrs
1046 .iter()
1047 .zip(self.lens.iter())
1048 .map(|(p, n)| unsafe { slice::from_raw_parts(*p as *const u8, *n) })
1049 .collect();
1050 let joined_exprs: String = exprs
1051 .into_iter()
1052 .map(|s| {
1053 str::from_utf8(s)
1054 .map(|s| format!("{:?}", s))
1055 .unwrap_or_else(|_| format!("(non-utf8: {:?})", s))
1056 })
1057 .collect::<Vec<_>>()
1058 .join(", ");
1059 write!(
1060 f,
1061 "LiteralSet(exprs=[{}], flags={:?}, ids={:?})",
1062 joined_exprs, &self.flags, &self.ids
1063 )
1064 }
1065}
1066
1067impl<'a> LiteralSet<'a> {
1068 /// Construct a pattern set from references to parsed literals.
1069 ///
1070 /// The length of this initial `exprs` argument is returned by
1071 /// [`Self::len()`], and all subsequent configuration methods are checked to
1072 /// provide iterators of the same length:
1073 ///
1074 ///```should_panic
1075 /// use vectorscan::expression::*;
1076 ///
1077 /// let a: Literal = "a\0b".parse().unwrap();
1078 /// // Fails due to argument length mismatch:
1079 /// LiteralSet::from_lits([&a])
1080 /// .with_flags([]);
1081 /// ```
1082 pub fn from_lits(lits: impl IntoIterator<Item=&'a Literal>) -> Self {
1083 let mut ptrs: Vec<_> = Vec::new();
1084 let mut lens: Vec<_> = Vec::new();
1085
1086 for l in lits.into_iter() {
1087 ptrs.push(l.as_ptr());
1088 lens.push(l.as_bytes().len());
1089 }
1090
1091 Self {
1092 ptrs,
1093 lens,
1094 flags: None,
1095 ids: None,
1096 _ph: PhantomData,
1097 }
1098 }
1099
1100 /// Provide flags which modify the behavior of each expression.
1101 ///
1102 /// The length of `flags` is checked to be the same as [`Self::len()`].
1103 ///
1104 /// If this builder method is not used, [`Flags::default()`] will be assigned
1105 /// to all patterns.
1106 ///
1107 ///```
1108 /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
1109 /// use vectorscan::{expression::*, flags::*, matchers::*};
1110 ///
1111 /// // Create two expressions to demonstrate separate flags for each pattern:
1112 /// let a: Literal = "a".parse()?;
1113 /// let b: Literal = "b".parse()?;
1114 ///
1115 /// // Get the start of match for one pattern, but not the other:
1116 /// let db = LiteralSet::from_lits([&a, &b])
1117 /// .with_flags([Flags::default(), Flags::SOM_LEFTMOST])
1118 /// .compile(Mode::BLOCK)?;
1119 ///
1120 /// let mut scratch = db.allocate_scratch()?;
1121 ///
1122 /// let mut matches: Vec<&str> = Vec::new();
1123 /// scratch.scan_sync(&db, "aardvark imbibbe".into(), |m| {
1124 /// matches.push(unsafe { m.source.as_str() });
1125 /// MatchResult::Continue
1126 /// })?;
1127 /// // Start of match is preserved for only one pattern:
1128 /// assert_eq!(&matches, &["a", "aa", "aardva", "b", "b", "b"]);
1129 /// # Ok(())
1130 /// # }
1131 /// ```
1132 pub fn with_flags(mut self, flags: impl IntoIterator<Item=Flags>) -> Self {
1133 let flags: Vec<_> = flags.into_iter().collect();
1134 assert_eq!(self.len(), flags.len());
1135 self.flags = Some(flags.to_vec());
1136 self
1137 }
1138
1139 /// Assign an ID number to each pattern.
1140 ///
1141 /// The length of `ids` is checked to be the same as [`Self::len()`]. Multiple
1142 /// patterns can be assigned the same ID.
1143 ///
1144 /// If this builder method is not used, vectorscan will assign them all the ID
1145 /// number 0:
1146 ///
1147 ///```
1148 /// # fn main() -> Result<(), vectorscan::error::VectorscanError> {
1149 /// use vectorscan::{expression::*, flags::*, state::*, matchers::*, sources::*};
1150 ///
1151 /// // Create two expressions to demonstrate multiple pattern IDs.
1152 /// let a: Literal = "a".parse()?;
1153 /// let b: Literal = "b".parse()?;
1154 ///
1155 /// // Create one db with ID numbers, and one without.
1156 /// let set1 = LiteralSet::from_lits([&a, &b]).compile(Mode::BLOCK)?;
1157 /// let set2 = LiteralSet::from_lits([&a, &b])
1158 /// .with_ids([ExprId(300), ExprId(12)])
1159 /// .compile(Mode::BLOCK)?;
1160 ///
1161 /// let mut scratch = Scratch::blank();
1162 /// scratch.setup_for_db(&set1)?;
1163 /// scratch.setup_for_db(&set2)?;
1164 ///
1165 /// let msg: ByteSlice = "aardvark imbibbe".into();
1166 ///
1167 /// // The first db doesn't differentiate matches by ID number:
1168 /// let mut matches1: Vec<ExpressionIndex> = Vec::new();
1169 /// scratch.scan_sync(&set1, msg, |m| {
1170 /// matches1.push(m.id);
1171 /// MatchResult::Continue
1172 /// })?;
1173 /// assert_eq!(
1174 /// &matches1,
1175 /// &[
1176 /// ExpressionIndex(0), ExpressionIndex(0), ExpressionIndex(0), ExpressionIndex(0),
1177 /// ExpressionIndex(0), ExpressionIndex(0),
1178 /// ],
1179 /// );
1180 ///
1181 /// // The second db returns corresponding ExpressionIndex instances:
1182 /// let mut matches2: Vec<ExpressionIndex> = Vec::new();
1183 /// scratch.scan_sync(&set2, msg, |m| {
1184 /// matches2.push(m.id);
1185 /// MatchResult::Continue
1186 /// })?;
1187 /// assert_eq!(
1188 /// &matches2,
1189 /// &[
1190 /// ExpressionIndex(300), ExpressionIndex(300), ExpressionIndex(300),
1191 /// ExpressionIndex(12), ExpressionIndex(12), ExpressionIndex(12),
1192 /// ],
1193 /// );
1194 /// # Ok(())
1195 /// # }
1196 /// ```
1197 pub fn with_ids(mut self, ids: impl IntoIterator<Item=ExprId>) -> Self {
1198 let ids: Vec<_> = ids.into_iter().collect();
1199 assert_eq!(self.len(), ids.len());
1200 self.ids = Some(ids.to_vec());
1201 self
1202 }
1203
1204 /// Call [`Database::compile_multi_literal()`] with [`None`] for the platform.
1205 pub fn compile(self, mode: Mode) -> Result<Database, VectorscanCompileError> {
1206 Database::compile_multi_literal(&self, mode, None)
1207 }
1208
1209 /// The number of literals in this set.
1210 pub fn len(&self) -> usize { self.ptrs.len() }
1211
1212 /// Whether this set contains any literals.
1213 pub fn is_empty(&self) -> bool { self.len() == 0 }
1214
1215 pub(crate) fn num_elements(&self) -> c_uint { self.len() as c_uint }
1216
1217 pub(crate) fn literals_ptr(&self) -> *const *const c_char { self.ptrs.as_ptr() }
1218
1219 pub(crate) fn lengths_ptr(&self) -> *const usize { self.lens.as_ptr() }
1220
1221 pub(crate) fn flags_ptr(&self) -> *const c_uint {
1222 self
1223 .flags
1224 .as_ref()
1225 .map(|f| unsafe { mem::transmute(f.as_ptr()) })
1226 .unwrap_or(ptr::null())
1227 }
1228
1229 pub(crate) fn ids_ptr(&self) -> *const c_uint {
1230 self
1231 .ids
1232 .as_ref()
1233 .map(|i| unsafe { mem::transmute(i.as_ptr()) })
1234 .unwrap_or(ptr::null())
1235 }
1236}
1237
1238/// Pattern strings for the chimera library.
1239///
1240/// As per [Pattern Support], chimera has full support for PCRE.
1241///
1242/// [Pattern Support]: https://intel.github.io/hyperscan/dev-reference/chimera.html#pattern-support
1243///
1244/// As chimera focuses mainly on supporting PCRE compatibility and group
1245/// matching support, this interface is less full-featured than the standard
1246/// vectorscan library [`super::expression`]. However, the same idioms apply:
1247/// creating expression instances performs no pattern compilation itself, and
1248/// references to these structs can be reused without re-allocating the
1249/// underlying pattern string data:
1250///
1251///```
1252/// # #[allow(unused_variables)]
1253/// # fn main() -> Result<(), vectorscan::error::chimera::ChimeraError> {
1254/// use vectorscan::{expression::chimera::*, flags::chimera::*};
1255///
1256/// let a: ChimeraExpression = "a+".parse()?;
1257/// let b: ChimeraExpression = "b+".parse()?;
1258/// let c: ChimeraExpression = "c+".parse()?;
1259///
1260/// let ab_db = ChimeraExpressionSet::from_exprs([&a, &b]).compile(ChimeraMode::NOGROUPS)?;
1261/// let bc_db = ChimeraExpressionSet::from_exprs([&b, &c]).compile(ChimeraMode::NOGROUPS)?;
1262/// let ca_db = ChimeraExpressionSet::from_exprs([&c, &a]).compile(ChimeraMode::NOGROUPS)?;
1263/// # Ok(())
1264/// # }
1265/// ```
1266#[cfg(feature = "chimera")]
1267#[cfg_attr(docsrs, doc(cfg(feature = "chimera")))]
1268pub mod chimera {
1269 use super::ExprId;
1270 use crate::{
1271 database::chimera::ChimeraDb,
1272 error::chimera::ChimeraCompileError,
1273 flags::chimera::{ChimeraFlags, ChimeraMode},
1274 };
1275
1276 use std::{
1277 ffi::{CStr, CString},
1278 fmt,
1279 marker::PhantomData,
1280 mem,
1281 os::raw::{c_char, c_uint, c_ulong},
1282 ptr, str,
1283 };
1284
/// A pattern string for the chimera (PCRE) engine.
///
/// The pattern is stored as a null-terminated [`CString`], because the
/// underlying chimera library reads patterns as [`CStr`]s. As a consequence,
/// null bytes are *not* supported inside a `ChimeraExpression`. If matching
/// against patterns containing explicit null bytes is necessary, consider
/// [`super::Literal`] or [`super::LiteralSet`] from the base vectorscan
/// library.
///
/// Note also that the chimera library does not support an "info" interface
/// such as [`super::Expression::info()`] and
/// [`super::Expression::ext_info()`] from the base vectorscan library.
///
/// Instances can be created equivalently with [`Self::new()`] or
/// [`str::parse()`] via the [`str::FromStr`] impl:
///
///```
/// # fn main() -> Result<(), vectorscan::error::chimera::ChimeraError> {
/// use vectorscan::expression::chimera::ChimeraExpression;
///
/// let e1: ChimeraExpression = "asd(f+)".parse()?;
/// let e2 = ChimeraExpression::new("asd(f+)")?;
/// assert_eq!(e1, e2);
/// # Ok(())
/// # }
/// ```
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct ChimeraExpression(CString);
1312
1313 impl fmt::Debug for ChimeraExpression {
1314 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1315 let b = self.as_bytes();
1316 match str::from_utf8(b) {
1317 Ok(s) => write!(f, "ChimeraExpression({:?})", s),
1318 Err(_) => write!(f, "ChimeraExpression({:?})", b),
1319 }
1320 }
1321 }
1322
1323 impl fmt::Display for ChimeraExpression {
1324 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1325 let b = self.as_bytes();
1326 match str::from_utf8(b) {
1327 Ok(s) => write!(f, "{}", s),
1328 Err(_) => write!(f, "(non-utf8: {:?})", b),
1329 }
1330 }
1331 }
1332
1333 impl ChimeraExpression {
1334 /// Reference the underlying bytes, *without* the trailing null terminator.
1335 ///
1336 ///```
1337 /// # fn main() -> Result<(), vectorscan::error::chimera::ChimeraError> {
1338 /// let e = vectorscan::expression::chimera::ChimeraExpression::new("asd(f+)")?;
1339 /// assert_eq!(e.as_bytes(), b"asd(f+)");
1340 /// # Ok(())
1341 /// # }
1342 /// ```
1343 pub fn as_bytes(&self) -> &[u8] { self.0.as_bytes() }
1344
1345 pub(crate) fn as_ptr(&self) -> *const c_char { self.0.as_c_str().as_ptr() }
1346
1347 /// Produce a `NULL`-terminated C-style wrapper for the given pattern
1348 /// string.
1349 ///
1350 /// This will fail if the string contains any internal `NULL` bytes, as
1351 /// those are not supported by the chimera library:
1352 ///```
1353 /// use vectorscan::{expression::chimera::*, error::chimera::*};
1354 ///
1355 /// let pat = "as\0df";
1356 /// let e = match ChimeraExpression::new(pat) {
1357 /// Err(ChimeraCompileError::NullByte(e)) => e,
1358 /// _ => unreachable!(),
1359 /// };
1360 /// assert_eq!(e.nul_position(), 2);
1361 /// ```
1362 pub fn new(x: impl Into<Vec<u8>>) -> Result<Self, ChimeraCompileError> {
1363 Ok(Self(CString::new(x)?))
1364 }
1365
1366 /// Call [`ChimeraDb::compile()`] with [`None`] for the platform.
1367 pub fn compile(
1368 &self,
1369 flags: ChimeraFlags,
1370 mode: ChimeraMode,
1371 ) -> Result<ChimeraDb, ChimeraCompileError> {
1372 ChimeraDb::compile(self, flags, mode, None)
1373 }
1374 }
1375
1376 impl str::FromStr for ChimeraExpression {
1377 type Err = ChimeraCompileError;
1378
1379 fn from_str(s: &str) -> Result<Self, Self::Err> { Self::new(s) }
1380 }
1381
/// Extended configuration for the PCRE matching phase of chimera.
///
/// These limits can only be applied through
/// [`ChimeraExpressionSet::with_limits()`].
#[derive(Debug, Copy, Clone)]
pub struct ChimeraMatchLimits {
  /// The `pcre_extra` limit on how many times PCRE's match function may be
  /// called, which bounds the amount of backtracking that can take place.
  pub match_limit: c_ulong,
  /// The `pcre_extra` limit on the recursion depth of PCRE's match function.
  pub match_limit_recursion: c_ulong,
}
1395
1396 /// Collection of regular expressions.
1397 ///
1398 /// This is the analogue to [`super::ExpressionSet`] for [`ChimeraExpression`]
1399 /// instances.
1400 ///
1401 /// This struct provides an immutable (returning `Self`) builder interface
1402 /// to attach additional configuration to the initial set of patterns
1403 /// constructed with [`Self::from_exprs()`].
1404 #[derive(Clone)]
1405 pub struct ChimeraExpressionSet<'a> {
1406 ptrs: Vec<*const c_char>,
1407 flags: Option<Vec<ChimeraFlags>>,
1408 ids: Option<Vec<ExprId>>,
1409 limits: Option<ChimeraMatchLimits>,
1410 _ph: PhantomData<&'a u8>,
1411 }
1412
1413 impl<'a> fmt::Debug for ChimeraExpressionSet<'a> {
1414 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1415 let exprs: Vec<&'a CStr> = self
1416 .ptrs
1417 .iter()
1418 .map(|p| unsafe { CStr::from_ptr(*p) })
1419 .collect();
1420 write!(
1421 f,
1422 "ChimeraExpressionSet(exprs={:?}, flags={:?}, ids={:?}, limits={:?})",
1423 exprs, &self.flags, &self.ids, &self.limits
1424 )
1425 }
1426 }
1427
1428 impl<'a> ChimeraExpressionSet<'a> {
1429 /// Construct a pattern set from references to parsed expressions.
1430 ///
1431 /// The length of this initial `exprs` argument is returned by
1432 /// [`Self::len()`], and all subsequent configuration methods are checked to
1433 /// provide iterators of the same length:
1434 ///
1435 ///```should_panic
1436 /// use vectorscan::expression::chimera::*;
1437 ///
1438 /// let a: ChimeraExpression = "a+".parse().unwrap();
1439 /// // Fails due to argument length mismatch:
1440 /// ChimeraExpressionSet::from_exprs([&a])
1441 /// .with_flags([]);
1442 /// ```
1443 pub fn from_exprs(exprs: impl IntoIterator<Item=&'a ChimeraExpression>) -> Self {
1444 Self {
1445 ptrs: exprs.into_iter().map(|e| e.as_ptr()).collect(),
1446 flags: None,
1447 ids: None,
1448 limits: None,
1449 _ph: PhantomData,
1450 }
1451 }
1452
1453 /// Provide flags which modify the behavior of each expression.
1454 ///
1455 /// The length of `flags` is checked to be the same as [`Self::len()`].
1456 ///
1457 /// If this builder method is not used, [`ChimeraFlags::default()`] will be
1458 /// assigned to all patterns.
1459 ///
1460 ///```
1461 /// # fn main() -> Result<(), vectorscan::error::chimera::ChimeraError> {
1462 /// use vectorscan::{expression::chimera::*, flags::chimera::*, matchers::chimera::*};
1463 ///
1464 /// // Create two expressions to demonstrate separate flags for each pattern:
1465 /// let a: ChimeraExpression = "a+[^a]".parse()?;
1466 /// let b: ChimeraExpression = "b+[^b]".parse()?;
1467 ///
1468 /// // Get the start of match for one pattern, but not the other:
1469 /// let db = ChimeraExpressionSet::from_exprs([&a, &b])
1470 /// .with_flags([ChimeraFlags::default(), ChimeraFlags::SINGLEMATCH])
1471 /// .compile(ChimeraMode::NOGROUPS)?;
1472 ///
1473 /// let mut scratch = db.allocate_scratch()?;
1474 ///
1475 /// let mut matches: Vec<&str> = Vec::new();
1476 /// scratch.scan_sync(&db, "aardvark imbibbe".into(), |m| {
1477 /// matches.push(unsafe { m.source.as_str() });
1478 /// ChimeraMatchResult::Continue
1479 /// }, |_| ChimeraMatchResult::Continue)?;
1480 /// // SINGLEMATCH is preserved for only one pattern:
1481 /// assert_eq!(&matches, &["aar", "ar", "bi"]);
1482 /// # Ok(())
1483 /// # }
1484 /// ```
1485 pub fn with_flags(mut self, flags: impl IntoIterator<Item=ChimeraFlags>) -> Self {
1486 let flags: Vec<_> = flags.into_iter().collect();
1487 assert_eq!(self.len(), flags.len());
1488 self.flags = Some(flags);
1489 self
1490 }
1491
1492 /// Assign an ID number to each pattern.
1493 ///
1494 /// The length of `ids` is checked to be the same as [`Self::len()`].
1495 /// Multiple patterns can be assigned the same ID.
1496 ///
1497 /// If this builder method is not used, vectorscan will assign them all the
1498 /// ID number 0:
1499 ///
1500 ///```
1501 /// # fn main() -> Result<(), vectorscan::error::chimera::ChimeraError> {
1502 /// use vectorscan::{sources::*, expression::{*, chimera::*}, flags::chimera::*, state::chimera::*, matchers::{*, chimera::*}};
1503 ///
1504 /// // Create two expressions to demonstrate multiple pattern IDs.
1505 /// let a: ChimeraExpression = "a+[^a]".parse()?;
1506 /// let b: ChimeraExpression = "b+[^b]".parse()?;
1507 ///
1508 /// // Create one db with ID numbers, and one without.
1509 /// let set1 = ChimeraExpressionSet::from_exprs([&a, &b]).compile(ChimeraMode::NOGROUPS)?;
1510 /// let set2 = ChimeraExpressionSet::from_exprs([&a, &b])
1511 /// .with_ids([ExprId(300), ExprId(12)])
1512 /// .compile(ChimeraMode::NOGROUPS)?;
1513 ///
1514 /// let mut scratch = ChimeraScratch::blank();
1515 /// scratch.setup_for_db(&set1)?;
1516 /// scratch.setup_for_db(&set2)?;
1517 ///
1518 /// let msg: ByteSlice = "aardvark imbibbe".into();
1519 ///
1520 /// // The first db doesn't differentiate matches by ID number:
1521 /// let mut matches1: Vec<ExpressionIndex> = Vec::new();
1522 /// scratch.scan_sync(&set1, msg, |m| {
1523 /// matches1.push(m.id);
1524 /// ChimeraMatchResult::Continue
1525 /// }, |_| ChimeraMatchResult::Continue)?;
1526 /// assert_eq!(
1527 /// &matches1,
1528 /// &[ExpressionIndex(0), ExpressionIndex(0), ExpressionIndex(0), ExpressionIndex(0)],
1529 /// );
1530 ///
1531 /// // The second db returns corresponding ExpressionIndex instances:
1532 /// let mut matches2: Vec<ExpressionIndex> = Vec::new();
1533 /// scratch.scan_sync(&set2, msg, |m| {
1534 /// matches2.push(m.id);
1535 /// ChimeraMatchResult::Continue
1536 /// }, |_| ChimeraMatchResult::Continue)?;
1537 /// assert_eq!(
1538 /// &matches2,
1539 /// &[ExpressionIndex(300), ExpressionIndex(300), ExpressionIndex(12), ExpressionIndex(12)],
1540 /// );
1541 /// # Ok(())
1542 /// # }
1543 /// ```
1544 pub fn with_ids(mut self, ids: impl IntoIterator<Item=ExprId>) -> Self {
1545 let ids: Vec<_> = ids.into_iter().collect();
1546 assert_eq!(self.len(), ids.len());
1547 self.ids = Some(ids);
1548 self
1549 }
1550
1551 /// Assign extended PCRE configuration to the entire pattern set.
1552 ///
1553 /// This is the only entry point to configuring PCRE match limits (i.e. the
1554 /// single-pattern compiler does not support match limits).
1555 ///
1556 ///```
1557 /// # fn main() -> Result<(), vectorscan::error::chimera::ChimeraError> {
1558 /// use vectorscan::{sources::*, expression::chimera::*, flags::chimera::*, state::chimera::*, matchers::chimera::*, error::chimera::*};
1559 ///
1560 /// // Create one db with backtracking match limits, and one without.
1561 /// let a: ChimeraExpression = r"(asdf?)hey\1".parse()?;
1562 /// let set1 = ChimeraExpressionSet::from_exprs([&a]).compile(ChimeraMode::GROUPS)?;
1563 /// let set2 = ChimeraExpressionSet::from_exprs([&a])
1564 /// .with_limits(ChimeraMatchLimits { match_limit: 1, match_limit_recursion: 1 })
1565 /// .compile(ChimeraMode::GROUPS)?;
1566 ///
1567 /// let mut scratch = ChimeraScratch::blank();
1568 /// scratch.setup_for_db(&set1)?;
1569 /// scratch.setup_for_db(&set2)?;
1570 ///
1571 /// let msg: ByteSlice = "asdfheyasdf".into();
1572 ///
1573 /// // The first db doesn't stop the matching engine:
1574 /// let mut matches1: Vec<&str> = Vec::new();
1575 /// scratch.scan_sync(&set1, msg, |m| {
1576 /// matches1.push(unsafe { m.captures.unwrap()[1].unwrap().as_str() });
1577 /// ChimeraMatchResult::Continue
1578 /// }, |_| ChimeraMatchResult::Terminate)?;
1579 /// assert_eq!(&matches1, &["asdf"]);
1580 ///
1581 /// // The second db imposes a match limit, which triggers the second callback to return
1582 /// // `ChimeraMatchResult::Terminate`.
1583 /// let mut matches2: Vec<ChimeraMatchError> = Vec::new();
1584 /// let result = scratch.scan_sync(
1585 /// &set2,
1586 /// msg,
1587 /// |_| unreachable!(),
1588 /// |e| {
1589 /// matches2.push(e);
1590 /// ChimeraMatchResult::Terminate
1591 /// },
1592 /// );
1593 /// assert!(matches![result, Err(ChimeraRuntimeError::ScanTerminated)]);
1594 /// assert_eq!(matches2.len(), 1);
1595 /// assert_eq!(matches2[0].error_type, ChimeraMatchErrorType::MatchLimit);
1596 /// # Ok(())
1597 /// # }
1598 /// ```
1599 pub fn with_limits(mut self, limits: ChimeraMatchLimits) -> Self {
1600 self.limits = Some(limits);
1601 self
1602 }
1603
1604 /// Call [`ChimeraDb::compile_multi()`] with [`None`] for the platform.
1605 pub fn compile(self, mode: ChimeraMode) -> Result<ChimeraDb, ChimeraCompileError> {
1606 ChimeraDb::compile_multi(&self, mode, None)
1607 }
1608
1609 /// The number of patterns in this set.
1610 pub fn len(&self) -> usize { self.ptrs.len() }
1611
1612 /// Whether this set contains any patterns.
1613 pub fn is_empty(&self) -> bool { self.len() == 0 }
1614
1615 pub(crate) fn limits(&self) -> Option<ChimeraMatchLimits> { self.limits }
1616
1617 pub(crate) fn num_elements(&self) -> c_uint { self.len() as c_uint }
1618
1619 pub(crate) fn expressions_ptr(&self) -> *const *const c_char { self.ptrs.as_ptr() }
1620
1621 pub(crate) fn flags_ptr(&self) -> *const c_uint {
1622 self
1623 .flags
1624 .as_ref()
1625 .map(|f| unsafe { mem::transmute(f.as_ptr()) })
1626 .unwrap_or(ptr::null())
1627 }
1628
1629 pub(crate) fn ids_ptr(&self) -> *const c_uint {
1630 self
1631 .ids
1632 .as_ref()
1633 .map(|i| unsafe { mem::transmute(i.as_ptr()) })
1634 .unwrap_or(ptr::null())
1635 }
1636 }
1637}