flashsieve 0.1.1

Storage-level pre-filtering for pattern matching — skip blocks that can't contain matches
Documentation
#![allow(clippy::expect_used, clippy::panic, clippy::unwrap_used)]
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use flashsieve::{BlockIndexBuilder, ByteFilter, ByteHistogram, NgramBloom, NgramFilter};
use rand::rngs::StdRng;
use rand::{RngCore, SeedableRng};

const BLOCK_SIZE: usize = 256 * 1024;
const BLOOM_BITS: usize = 4_096;
const DATASET_BYTES: usize = 100 * 1024 * 1024;

fn synthetic_block() -> Vec<u8> {
    let mut rng = StdRng::seed_from_u64(0xB100_0001);
    let mut block = vec![0_u8; BLOCK_SIZE];
    rng.fill_bytes(&mut block);
    block
}

fn synthetic_dataset() -> Vec<u8> {
    let mut rng = StdRng::seed_from_u64(0xB100_0002);
    let mut data = vec![0_u8; DATASET_BYTES];
    rng.fill_bytes(&mut data);

    for offset in (0..DATASET_BYTES).step_by(BLOCK_SIZE * 7) {
        let end = offset + b"needle".len();
        if end <= data.len() {
            data[offset..end].copy_from_slice(b"needle");
        }
    }

    data
}

fn bench_bloom_insert_query(criterion: &mut Criterion) {
    let block = synthetic_block();
    let mut group = criterion.benchmark_group("bloom");
    group.throughput(Throughput::Bytes(block.len() as u64));

    group.bench_function("insert_from_block", |bencher| {
        bencher.iter(|| {
            let mut bloom = NgramBloom::new(BLOOM_BITS).unwrap_or_else(|error| panic!("{error}"));
            for window in black_box(block.as_slice()).windows(2) {
                bloom.insert_ngram(window[0], window[1]);
            }
            black_box(bloom);
        });
    });

    let bloom =
        NgramBloom::from_block(&block, BLOOM_BITS).unwrap_or_else(|error| panic!("{error}"));
    group.bench_function("query_pattern", |bencher| {
        bencher.iter(|| black_box(bloom.maybe_contains_pattern(black_box(b"needle"))));
    });

    group.finish();
}

fn bench_histogram_build(criterion: &mut Criterion) {
    let block = synthetic_block();
    let mut group = criterion.benchmark_group("histogram");
    group.throughput(Throughput::Bytes(block.len() as u64));
    group.bench_function("build", |bencher| {
        bencher.iter(|| black_box(ByteHistogram::from_block(black_box(block.as_slice()))));
    });
    group.finish();
}

fn bench_skip_decision(criterion: &mut Criterion) {
    let data = synthetic_dataset();
    let index = BlockIndexBuilder::new()
        .block_size(BLOCK_SIZE)
        .bloom_bits(BLOOM_BITS)
        .build(&data)
        .unwrap_or_else(|error| panic!("{error}"));
    let byte_filter = ByteFilter::from_patterns(&[b"needle".as_slice()]);
    let ngram_filter = NgramFilter::from_patterns(&[b"needle".as_slice()]);

    let mut group = criterion.benchmark_group("skip_decision");
    group.throughput(Throughput::Elements(index.block_count() as u64));
    group.bench_function("candidate_blocks", |bencher| {
        bencher.iter(|| {
            black_box(index.candidate_blocks(black_box(&byte_filter), black_box(&ngram_filter)));
        });
    });
    group.finish();
}

fn bench_full_pipeline(criterion: &mut Criterion) {
    let data = synthetic_dataset();
    let byte_filter = ByteFilter::from_patterns(&[b"needle".as_slice()]);
    let ngram_filter = NgramFilter::from_patterns(&[b"needle".as_slice()]);
    let mut group = criterion.benchmark_group("pipeline");
    group.throughput(Throughput::Bytes(DATASET_BYTES as u64));
    group.bench_with_input(
        BenchmarkId::new("build_and_query", "100_mib"),
        &data,
        |bencher, data| {
            bencher.iter(|| {
                let index = BlockIndexBuilder::new()
                    .block_size(BLOCK_SIZE)
                    .bloom_bits(BLOOM_BITS)
                    .build(black_box(data))
                    .unwrap_or_else(|error| panic!("{error}"));
                black_box(
                    index.candidate_blocks(black_box(&byte_filter), black_box(&ngram_filter)),
                );
            });
        },
    );
    group.finish();
}

criterion_group!(
    benches,
    bench_bloom_insert_query,
    bench_histogram_build,
    bench_skip_decision,
    bench_full_pipeline
);
criterion_main!(benches);