multimatch 0.1.1

Multi-pattern matching engine — Aho-Corasick + regex with optional Hyperscan SIMD acceleration
Documentation
//! Pattern definitions and the builder for compiling pattern sets.

use crate::engine::MatchEngine;
use crate::MatchError;

/// A single pattern to match against.
///
/// Values of this type are collected by [`PatternSetBuilder`] and handed to
/// the engine when the set is built. The type is plain data: it can be
/// cloned, compared and hashed, which makes it easy to deduplicate or diff
/// pattern lists before building a set.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct PatternDef {
    /// User-assigned ID for this pattern (returned in match results).
    pub id: usize,
    /// The pattern content, tagged as either a literal or a regex.
    pub kind: PatternKind,
    /// Whether matching is case-insensitive.
    pub case_insensitive: bool,
}

/// Whether a pattern is a literal string or a regex.
///
/// Both variants carry the pattern text as an owned `String`; the variant
/// only records how that text is meant to be interpreted.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum PatternKind {
    /// Exact literal match (uses Aho-Corasick).
    Literal(String),
    /// Regex match (uses regex crate or Hyperscan).
    Regex(String),
}

/// A compiled set of patterns ready for scanning.
///
/// Instances are produced by [`PatternSetBuilder::build`]. Scanning is done
/// through the [`crate::Scanner`] implementation or the `scan_str`
/// convenience method.
pub struct PatternSet {
    /// Compiled engine that every scan call is delegated to.
    engine: MatchEngine,
    /// Number of pattern definitions that were handed to the engine at build time.
    pattern_count: usize,
}

impl crate::Scanner for PatternSet {
    /// Runs the compiled engine over `input` and returns the matches it reports.
    fn scan(&self, input: &[u8]) -> Vec<crate::MatchResult> {
        let engine = &self.engine;
        engine.scan(input)
    }

    /// Forwards to the compiled engine's `is_match` check for `input`.
    fn is_match(&self, input: &[u8]) -> bool {
        let engine = &self.engine;
        engine.is_match(input)
    }

    /// Number of patterns that were supplied when this set was built.
    fn pattern_count(&self) -> usize {
        let Self { pattern_count, .. } = self;
        *pattern_count
    }
}

impl PatternSet {
    /// Start building a new pattern set.
    pub fn builder() -> PatternSetBuilder {
        PatternSetBuilder::new()
    }

    /// Scan a string for all matching patterns.
    pub fn scan_str(&self, input: &str) -> Vec<crate::MatchResult> {
        crate::Scanner::scan(self, input.as_bytes())
    }
}

/// Builder for constructing a [`PatternSet`].
///
/// Every method that adds a pattern consumes the builder and returns it, so
/// calls are meant to be chained and finished with `build`. Dropping the
/// returned builder silently discards the patterns added so far, hence the
/// `must_use` marker on the type.
#[derive(Debug, Clone)]
#[must_use = "a builder does nothing until `build` is called"]
pub struct PatternSetBuilder {
    /// Pattern definitions collected so far, in the order they were added.
    patterns: Vec<PatternDef>,
}

impl PatternSetBuilder {
    /// Returns a builder with no patterns registered.
    pub fn new() -> Self {
        let patterns = Vec::new();
        Self { patterns }
    }

    /// Registers `literal` as a case-sensitive exact-string pattern under `id`.
    pub fn add_literal(self, literal: &str, id: usize) -> Self {
        self.add(PatternDef {
            id,
            kind: PatternKind::Literal(literal.to_owned()),
            case_insensitive: false,
        })
    }

    /// Registers `literal` as a case-insensitive exact-string pattern under `id`.
    pub fn add_literal_ci(self, literal: &str, id: usize) -> Self {
        self.add(PatternDef {
            id,
            kind: PatternKind::Literal(literal.to_owned()),
            case_insensitive: true,
        })
    }

    /// Registers `regex` as a case-sensitive regex pattern under `id`.
    ///
    /// The text is only stored here; this method cannot report an error.
    pub fn add_regex(self, regex: &str, id: usize) -> Self {
        self.add(PatternDef {
            id,
            kind: PatternKind::Regex(regex.to_owned()),
            case_insensitive: false,
        })
    }

    /// Registers `regex` as a case-insensitive regex pattern under `id`.
    ///
    /// The text is only stored here; this method cannot report an error.
    pub fn add_regex_ci(self, regex: &str, id: usize) -> Self {
        self.add(PatternDef {
            id,
            kind: PatternKind::Regex(regex.to_owned()),
            case_insensitive: true,
        })
    }

    /// Appends an already-constructed [`PatternDef`] after the patterns added so far.
    pub fn add(mut self, pattern: PatternDef) -> Self {
        self.patterns.push(pattern);
        self
    }

    /// Consumes the builder and compiles its patterns into a [`PatternSet`].
    ///
    /// # Errors
    ///
    /// Returns [`MatchError::Empty`] when no pattern was added, and propagates
    /// any error reported by `MatchEngine::compile`.
    pub fn build(self) -> Result<PatternSet, MatchError> {
        // Capture the count first: the pattern list is moved into the engine below.
        let pattern_count = self.patterns.len();
        if pattern_count == 0 {
            return Err(MatchError::Empty);
        }
        let engine = MatchEngine::compile(self.patterns)?;
        Ok(PatternSet {
            engine,
            pattern_count,
        })
    }
}

impl Default for PatternSetBuilder {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    //! Unit tests for the builder API and the basic scanning entry points.

    use super::*;
    use crate::Scanner;

    /// Building with no patterns must fail with the dedicated `Empty` error,
    /// not merely with *some* error.
    #[test]
    fn builder_empty_fails() {
        assert!(matches!(
            PatternSetBuilder::new().build(),
            Err(MatchError::Empty)
        ));
    }

    #[test]
    fn builder_literal() {
        let ps = PatternSet::builder()
            .add_literal("hello", 0)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 1);
    }

    #[test]
    fn builder_regex() {
        let ps = PatternSet::builder()
            .add_regex(r"\d+", 0)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 1);
    }

    #[test]
    fn builder_mixed() {
        let ps = PatternSet::builder()
            .add_literal("token", 0)
            .add_regex(r"[A-Z]{5}", 1)
            .add_literal_ci("SECRET", 2)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 3);
    }

    /// Smoke test: an empty literal must build and scan without panicking.
    /// How many matches an empty literal yields is engine-dependent, so the
    /// scan result is deliberately not asserted on.
    #[test]
    fn edge_case_empty_literal() {
        let ps = PatternSet::builder()
            .add_literal("", 0)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 1);
        let _res = ps.scan_str("test");
    }

    #[test]
    fn edge_case_large_pattern_count() {
        let mut builder = PatternSet::builder();
        for i in 0..100 {
            builder = builder.add_literal(&format!("literal{}", i), i);
        }
        let ps = builder.build().unwrap();
        assert_eq!(ps.pattern_count(), 100);
    }

    #[test]
    fn edge_case_complex_regex() {
        let ps = PatternSet::builder()
            .add_regex(r"^(abc|def)*[0-9]+$", 99)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 1);
    }

    #[test]
    fn edge_case_regex_ci_mixed() {
        let ps = PatternSet::builder()
            .add_regex_ci(r"[a-z]", 1)
            .add_literal_ci("TeSt", 2)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 2);
    }

    /// Two patterns may share one user-assigned ID; each still counts as its
    /// own pattern and each match reports the shared ID.
    #[test]
    fn edge_case_multiple_same_id() {
        let ps = PatternSet::builder()
            .add_literal("foo", 10)
            .add_literal("bar", 10)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 2);
        let matches = ps.scan_str("foobar");
        assert_eq!(matches.len(), 2);
        assert_eq!(matches[0].pattern_id, 10);
        assert_eq!(matches[1].pattern_id, 10);
    }

    /// `scan_str` and the byte-oriented `Scanner` methods must agree on the
    /// same input.
    #[test]
    fn edge_case_scan_str_vs_bytes() {
        let ps = PatternSet::builder()
            .add_literal("rust", 1)
            .build()
            .unwrap();
        let s = "learning rust is fun";
        assert_eq!(ps.scan_str(s).len(), 1);
        assert_eq!(ps.scan(s.as_bytes()).len(), 1);
        assert!(ps.is_match(s.as_bytes()));
    }

    #[test]
    fn edge_case_add_raw_pattern_def() {
        let def = PatternDef {
            id: 42,
            kind: PatternKind::Literal("raw".to_string()),
            case_insensitive: false,
        };
        let ps = PatternSet::builder().add(def).build().unwrap();
        assert_eq!(ps.pattern_count(), 1);
    }

    /// Regex metacharacters inside a *literal* pattern are matched verbatim.
    #[test]
    fn edge_case_special_chars_literal() {
        let ps = PatternSet::builder()
            .add_literal("!@#$%^&*()_+", 5)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 1);
        let m = ps.scan_str("here is !@#$%^&*()_+ special");
        assert_eq!(m.len(), 1);
    }

    /// The same text expressed as an escaped regex matches the same input.
    #[test]
    fn edge_case_special_chars_regex() {
        let ps = PatternSet::builder()
            .add_regex(r"\!\@#\$\%\^\&\*\(\)_\+", 5)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 1);
        let m = ps.scan_str("here is !@#$%^&*()_+ special");
        assert_eq!(m.len(), 1);
    }

    #[test]
    fn edge_case_very_long_input_no_match() {
        let ps = PatternSet::builder().add_literal("FINDME", 1).build().unwrap();
        let input = "x".repeat(100_000);
        assert!(!ps.is_match(input.as_bytes()));
    }

    /// A `Default`-constructed builder is empty, so building it must fail
    /// with the same `Empty` error as a freshly created one.
    #[test]
    fn edge_case_default_builder() {
        let builder = PatternSetBuilder::default();
        assert!(matches!(builder.build(), Err(MatchError::Empty)));
    }
}