Expand description
§hyperscan-tokio
The most complete high-performance async regular expression matching library for Rust, featuring both Hyperscan/VectorScan and Chimera (PCRE with capture groups) support.
§Core Features
- Dual Engine Support: Both Hyperscan and Chimera in one library
- PCRE Pattern Support: Full PCRE syntax with capture groups via Chimera
- Async-first design: Built for Tokio with async/await support
- High performance: Leverages Hyperscan/VectorScan’s SIMD acceleration
- Multiple scanning modes: Block, streaming, and vectored scanning
- Thread-safe: Databases can be shared across threads
- Memory efficient: Scratch space pooling and zero-copy operations
§Why hyperscan-tokio?
This is the only Rust library that provides both:
- Hyperscan: For blazing-fast multi-pattern matching
- Chimera: For PCRE-compatible patterns with capture group support
Whether you need simple pattern matching or complex PCRE expressions with capture groups, this library has you covered.
§Optional Features
- chimera: Enable PCRE-compatible patterns with capture groups
- jemalloc/mimalloc: Alternative memory allocators
- arrow: Apache Arrow integration for batch processing
- simd-accel: SIMD acceleration features
§Quick Start
§Hyperscan - Fast Multi-Pattern Matching
use hyperscan_tokio::prelude::*;
// Build a database with multiple patterns
let db = DatabaseBuilder::new()
.add(Pattern::new(r"\d+").id(1).build()?)
.add(Pattern::new(r"[a-z]+").id(2).build()?)
.build()?;
// Create a scanner
let scanner = Scanner::new(db)?;
// Scan data - finds all patterns simultaneously
let matches = scanner.scan("abc 123 def 456").await?;
§Chimera - PCRE Patterns with Capture Groups
To use Chimera, enable the chimera feature in your Cargo.toml:
[dependencies]
hyperscan-tokio = { version = "0.1", features = ["chimera"] }
#[cfg(feature = "chimera")]
use hyperscan_tokio::prelude::*;
#[cfg(feature = "chimera")]
// Compile a PCRE pattern with named capture groups
let chimera = Chimera::compile(
r"(?P<user>\w+)@(?P<domain>\w+\.\w+)",
Flags::empty(),
Mode::BLOCK
)?;
// Scan and extract capture groups
chimera.scan(b"Contact: alice@example.com", |m| {
if let Some(user) = m.group_by_name("user") {
println!("User: {:?}", user.as_str(m.data));
}
if let Some(domain) = m.group_by_name("domain") {
println!("Domain: {:?}", domain.as_str(m.data));
}
MatchControl::Continue
})?;
§Performance
This library achieves scanning speeds of 20+ GB/s on modern hardware:
scanning_throughput/1048576 time: [45.2 µs 45.8 µs 46.4 µs]
thrpt: [21.5 GiB/s 21.9 GiB/s 22.1 GiB/s]
§Working with Patterns
§Pattern Building
use hyperscan_tokio::{Pattern, Flags};
// Simple pattern
let p1 = Pattern::new(r"\d+").id(1).build()?;
// Pattern with flags
let p2 = Pattern::new("test")
.id(2)
.flags(Flags::CASELESS | Flags::MULTILINE)
.build()?;
// Extended pattern with constraints
let p3 = Pattern::new(r"secret")
.id(3)
.min_offset(100) // Match must end at or after byte offset 100
.max_offset(500) // Match must end at or before byte offset 500
.min_length(10) // Match must be at least 10 bytes
.build()?;
// Pattern with edit distance
let p4 = Pattern::new("password")
.id(4)
.edit_distance(2) // Allow up to 2 character edits
.build()?;
§Database Compilation
use hyperscan_tokio::{DatabaseBuilder, Mode};
// Block mode (for complete data)
let block_db = DatabaseBuilder::new()
.add_pattern(pattern1)
.add_pattern(pattern2)
.mode(Mode::BLOCK)
.build()?;
// Stream mode (for data streams)
let stream_db = DatabaseBuilder::new()
.add_pattern(pattern)
.mode(Mode::STREAM)
.build()?;
// Vectored mode (for scattered data)
let vectored_db = DatabaseBuilder::new()
.add_pattern(pattern)
.mode(Mode::VECTORED)
.build()?;
§Scanning Modes
§Block Scanning
For scanning complete data blocks:
let scanner = Scanner::new(database)?;
// Scan string data
let matches = scanner.scan("text to scan").await?;
// Scan bytes
let matches = scanner.scan_bytes(b"binary data").await?;
// Zero-copy with Bytes
let data = bytes::Bytes::from_static(b"zero-copy scan");
let matches = scanner.scan_bytes(data).await?;
§Stream Scanning
For scanning data streams:
let scanner = StreamScanner::new(database)?;
// Scan a Tokio stream
let stream = tokio::fs::File::open("large_file.txt").await?;
let mut match_stream = scanner.scan_stream(stream).await?;
while let Some(m) = match_stream.next().await {
let match_result = m?;
println!("Found match: {:?}", match_result);
}
§Vectored Scanning
For scanning multiple non-contiguous buffers:
let scanner = VectoredScanner::new(database)?;
let buffers = vec![
b"first buffer",
b"second buffer",
b"third buffer",
];
let matches = scanner.scan_vectored(&buffers).await?;
§Advanced Features
§Worker Pool for Parallel Processing
let pool = WorkerPoolBuilder::default()
.num_workers(8)
.queue_size(10_000)
.build(database)?;
// Process many items in parallel
let jobs: Vec<ScanJob> = data_items.into_iter()
.map(|data| ScanJob { id: generate_id(), data })
.collect();
let results = pool.scan_batch(jobs).await?;
§Hot-Reloadable Patterns
let reloadable = ReloadableDatabase::new(initial_database);
// In another task, reload patterns without stopping
tokio::spawn(async move {
let new_db = load_new_patterns().await?;
reloadable.reload(new_db).await?;
});
// Scanning continues with new patterns automatically
let scanner = Scanner::new(reloadable.current())?;
§Database Caching
let cache = DatabaseCache::builder()
.max_size(100)
.ttl(Duration::from_secs(3600))
.build();
// Patterns are compiled only once and cached
let db = cache.get_or_compile(patterns, || {
DatabaseBuilder::new()
.patterns(patterns)
.build()
}).await?;
§Error Handling
All operations return Result<T, Error> with detailed error information:
match Pattern::new("[invalid").build() {
Ok(pattern) => { /* use pattern */ },
Err(Error::Compile { message, pattern_id, position }) => {
println!("Compile error in pattern {}: {} at position {:?}",
pattern_id.unwrap_or(0), message, position);
},
Err(e) => println!("Other error: {}", e),
}
§Thread Safety
- Database: Send + Sync - can be shared across threads
- Scanner: Send + Clone - can be cloned for each thread
- Scratch: Thread-local - each thread needs its own
§Memory Management
The library supports custom allocators for optimal performance:
# Use jemalloc (default)
hyperscan-tokio = { version = "0.1", features = ["jemalloc"] }
# Or use mimalloc
hyperscan-tokio = { version = "0.1", features = ["mimalloc"] }
Re-exports§
pub use builder::DatabaseBuilder;
pub use builder::Pattern;
pub use builder::PatternBuilder;
pub use cache::DatabaseCache;
pub use cache::DatabaseCacheBuilder;
pub use cache::CacheKey;
pub use cache::CacheStats;
pub use pattern::CaptureGroup;
pub use pattern::PatternInfo;
pub use database::Database;
pub use database::DatabaseInfo;
pub use database::ExpressionInfo;
pub use database::ReloadableDatabase;
pub use error::Error;
pub use error::Result;
pub use expression::ExpressionContext;
pub use expression::ExpressionContextBuilder;
pub use literal::Literal;
pub use literal::LiteralBuilder;
pub use literal::LiteralFlags;
pub use literal::LiteralDatabaseBuilder;
pub use scanner::Match;
pub use scanner::Scanner;
pub use scanner::BlockScanner;
pub use scratch_pool::Scratch;
pub use scratch_pool::ScratchPool;
pub use stream::StreamScanner;
pub use stream::StreamState;
pub use stream::MatchStream;
pub use vectored::VectoredScanner;
pub use worker_pool::WorkerPool;
pub use worker_pool::WorkerPoolBuilder;
pub use worker_pool::ScanJob;
pub use worker_pool::ScanResult;
pub use zero_copy::ScanInput;
pub use features::supported_features;
pub use features::SupportedFeatures;
pub use features::CpuArchitecture;
Modules§
- allocator
- Custom memory allocator support
- builder
- cache
- database
- error
- expression
- Expression context and metadata
- features
- Runtime CPU feature detection for optimal performance
- literal
- Literal (exact string) matching support
- pattern
- Pattern information and capture group structures
- prelude
- Prelude module for convenient imports
- scanner
- Scanner module for pattern matching operations
- scratch_pool
- stream
- vectored
- Vectored scanning - scan multiple non-contiguous buffers as one logical stream
- worker_pool
- zero_copy