Crate hyperscan_tokio

Crate hyperscan_tokio 

Source
Expand description

§hyperscan-tokio

The most complete high-performance async regular expression matching library for Rust, featuring both Hyperscan/VectorScan and Chimera (PCRE with capture groups) support.

§Core Features

  • Dual Engine Support: Both Hyperscan and Chimera in one library
  • PCRE Pattern Support: Full PCRE syntax with capture groups via Chimera
  • Async-first design: Built for Tokio with async/await support
  • High performance: Leverages Hyperscan/VectorScan’s SIMD acceleration
  • Multiple scanning modes: Block, streaming, and vectored scanning
  • Thread-safe: Databases can be shared across threads
  • Memory efficient: Scratch space pooling and zero-copy operations

§Why hyperscan-tokio?

This is the only Rust library that provides both:

  1. Hyperscan: For blazing-fast multi-pattern matching
  2. Chimera: For PCRE-compatible patterns with capture group support

Whether you need simple pattern matching or complex PCRE expressions with capture groups, this library has you covered.

§Optional Features

  • chimera: Enable PCRE-compatible patterns with capture groups
  • jemalloc/mimalloc: Alternative memory allocators
  • arrow: Apache Arrow integration for batch processing
  • simd-accel: SIMD acceleration features

§Quick Start

§Hyperscan - Fast Multi-Pattern Matching

use hyperscan_tokio::prelude::*;

// Build a database with multiple patterns
let db = DatabaseBuilder::new()
    .add(Pattern::new(r"\d+").id(1).build()?)
    .add(Pattern::new(r"[a-z]+").id(2).build()?)
    .build()?;

// Create a scanner
let scanner = Scanner::new(db)?;

// Scan data - finds all patterns simultaneously
let matches = scanner.scan("abc 123 def 456").await?;

§Chimera - PCRE Patterns with Capture Groups

To use Chimera, enable the chimera feature in your Cargo.toml:

[dependencies]
hyperscan-tokio = { version = "0.1", features = ["chimera"] }

Then compile a pattern and scan with it:

#[cfg(feature = "chimera")]
use hyperscan_tokio::prelude::*;

#[cfg(feature = "chimera")]
// Compile a PCRE pattern with named capture groups
let chimera = Chimera::compile(
    r"(?P<user>\w+)@(?P<domain>\w+\.\w+)",
    Flags::empty(),
    Mode::BLOCK
)?;

// Scan and extract capture groups
chimera.scan(b"Contact: alice@example.com", |m| {
    if let Some(user) = m.group_by_name("user") {
        println!("User: {:?}", user.as_str(m.data));
    }
    if let Some(domain) = m.group_by_name("domain") {
        println!("Domain: {:?}", domain.as_str(m.data));
    }
    MatchControl::Continue
})?;

§Performance

This library achieves scanning speeds of 20+ GB/s on modern hardware:

scanning_throughput/1048576     time:   [45.2 µs 45.8 µs 46.4 µs]
                                thrpt:  [21.5 GiB/s 21.9 GiB/s 22.1 GiB/s]

§Working with Patterns

§Pattern Building

use hyperscan_tokio::{Pattern, Flags};

// Simple pattern
let p1 = Pattern::new(r"\d+").id(1).build()?;

// Pattern with flags
let p2 = Pattern::new("test")
    .id(2)
    .flags(Flags::CASELESS | Flags::MULTILINE)
    .build()?;

// Extended pattern with constraints
let p3 = Pattern::new(r"secret")
    .id(3)
    .min_offset(100)  // Match must end at or after byte offset 100
    .max_offset(500)  // Match must end at or before byte offset 500
    .min_length(10)   // Match must be at least 10 bytes
    .build()?;

// Pattern with edit distance
let p4 = Pattern::new("password")
    .id(4)
    .edit_distance(2)  // Allow up to 2 character edits
    .build()?;

§Database Compilation

use hyperscan_tokio::{DatabaseBuilder, Mode};

// Block mode (for complete data)
let block_db = DatabaseBuilder::new()
    .add_pattern(pattern1)
    .add_pattern(pattern2)
    .mode(Mode::BLOCK)
    .build()?;

// Stream mode (for data streams)
let stream_db = DatabaseBuilder::new()
    .add_pattern(pattern)
    .mode(Mode::STREAM)
    .build()?;

// Vectored mode (for scattered data)
let vectored_db = DatabaseBuilder::new()
    .add_pattern(pattern)
    .mode(Mode::VECTORED)
    .build()?;

§Scanning Modes

§Block Scanning

For scanning complete data blocks:

let scanner = Scanner::new(database)?;
 
// Scan string data
let matches = scanner.scan("text to scan").await?;

// Scan bytes
let matches = scanner.scan_bytes(b"binary data").await?;

// Zero-copy with Bytes
let data = bytes::Bytes::from_static(b"zero-copy scan");
let matches = scanner.scan_bytes(data).await?;

§Stream Scanning

For scanning data streams:

let scanner = StreamScanner::new(database)?;

// Scan a Tokio stream
let stream = tokio::fs::File::open("large_file.txt").await?;
let mut match_stream = scanner.scan_stream(stream).await?;

while let Some(m) = match_stream.next().await {
    let match_result = m?;
    println!("Found match: {:?}", match_result);
}

§Vectored Scanning

For scanning multiple non-contiguous buffers:

let scanner = VectoredScanner::new(database)?;

let buffers = vec![
    b"first buffer",
    b"second buffer",
    b"third buffer",
];

let matches = scanner.scan_vectored(&buffers).await?;

§Advanced Features

§Worker Pool for Parallel Processing

let pool = WorkerPoolBuilder::default()
    .num_workers(8)
    .queue_size(10_000)
    .build(database)?;

// Process many items in parallel
let jobs: Vec<ScanJob> = data_items.into_iter()
    .map(|data| ScanJob { id: generate_id(), data })
    .collect();

let results = pool.scan_batch(jobs).await?;

§Hot-Reloadable Patterns

let reloadable = ReloadableDatabase::new(initial_database);

// In another task, reload patterns without stopping
tokio::spawn(async move {
    let new_db = load_new_patterns().await?;
    reloadable.reload(new_db).await?;
});

// Scanning continues with new patterns automatically
let scanner = Scanner::new(reloadable.current())?;

§Database Caching

let cache = DatabaseCache::builder()
    .max_size(100)
    .ttl(Duration::from_secs(3600))
    .build();

// Patterns are compiled only once and cached
let db = cache.get_or_compile(patterns, || {
    DatabaseBuilder::new()
        .patterns(patterns)
        .build()
}).await?;

§Error Handling

All operations return Result<T, Error> with detailed error information:

match Pattern::new("[invalid").build() {
    Ok(pattern) => { /* use pattern */ },
    Err(Error::Compile { message, pattern_id, position }) => {
        println!("Compile error in pattern {}: {} at position {:?}", 
                 pattern_id.unwrap_or(0), message, position);
    },
    Err(e) => println!("Other error: {}", e),
}

§Thread Safety

  • Database: Send + Sync - can be shared across threads
  • Scanner: Send + Clone - can be cloned for each thread
  • Scratch: Thread-local - each thread needs its own

§Memory Management

The library supports custom allocators for optimal performance:

# Use jemalloc (default)
hyperscan-tokio = { version = "0.1", features = ["jemalloc"] }

# Or use mimalloc
hyperscan-tokio = { version = "0.1", features = ["mimalloc"] }

Re-exports§

pub use builder::DatabaseBuilder;
pub use builder::Pattern;
pub use builder::PatternBuilder;
pub use cache::DatabaseCache;
pub use cache::DatabaseCacheBuilder;
pub use cache::CacheKey;
pub use cache::CacheStats;
pub use pattern::CaptureGroup;
pub use pattern::PatternInfo;
pub use database::Database;
pub use database::DatabaseInfo;
pub use database::ExpressionInfo;
pub use database::ReloadableDatabase;
pub use error::Error;
pub use error::Result;
pub use expression::ExpressionContext;
pub use expression::ExpressionContextBuilder;
pub use literal::Literal;
pub use literal::LiteralBuilder;
pub use literal::LiteralFlags;
pub use literal::LiteralDatabaseBuilder;
pub use scanner::Match;
pub use scanner::Scanner;
pub use scanner::BlockScanner;
pub use scratch_pool::Scratch;
pub use scratch_pool::ScratchPool;
pub use stream::StreamScanner;
pub use stream::StreamState;
pub use stream::MatchStream;
pub use vectored::VectoredScanner;
pub use worker_pool::WorkerPool;
pub use worker_pool::WorkerPoolBuilder;
pub use worker_pool::ScanJob;
pub use worker_pool::ScanResult;
pub use zero_copy::ScanInput;
pub use features::supported_features;
pub use features::SupportedFeatures;
pub use features::CpuArchitecture;

Modules§

allocator
Custom memory allocator support
builder
cache
database
error
expression
Expression context and metadata
features
Runtime CPU feature detection for optimal performance
literal
Literal (exact string) matching support
pattern
Pattern information and capture group structures
prelude
Prelude module for convenient imports
scanner
Scanner module for pattern matching operations
scratch_pool
stream
vectored
Vectored scanning - scan multiple non-contiguous buffers as one logical stream
worker_pool
zero_copy

Structs§

Flags
Mode
Platform
Platform information