iscrawl 1.2.0

Fast crawler/bot detection from User-Agent strings.
Documentation
#[cfg(feature = "database")]
use iscrawl::crawler_info;
#[cfg(feature = "database")]
use std::hint::black_box;
#[cfg(feature = "database")]
use std::time::{Duration, Instant};

#[cfg(feature = "database")]
/// Crawler fixture embedded at compile time; `main` reads it one entry per
/// line and counts those lines toward the crawler total.
const CRAWLER_UAS: &str = include_str!("../tests/fixtures/crawler_user_agents.txt");
#[cfg(feature = "database")]
/// Second crawler fixture, appended to the corpus after `CRAWLER_UAS`.
const LOADKPI_CRAWLERS: &str = include_str!("../tests/fixtures/loadkpi_crawlers.txt");
#[cfg(feature = "database")]
/// Third crawler fixture, appended to the corpus after `LOADKPI_CRAWLERS`.
const PGTS_CRAWLERS: &str = include_str!("../tests/fixtures/crawler_user_agents_pgts.txt");
#[cfg(feature = "database")]
/// Browser fixture; `main` appends it last and reports its line count as the
/// browser total.
const BROWSER_UAS: &str = include_str!("../tests/fixtures/browser_user_agents.txt");

#[cfg(feature = "database")]
/// Number of shuffled visiting orders generated, and therefore of timed runs.
const RUNS: usize = 10;
#[cfg(feature = "database")]
/// Number of full sweeps over the visiting order inside each timed run.
const PASSES: usize = 4;

#[cfg(feature = "database")]
/// Benchmarks `crawler_info` over every fixture line and prints a summary.
///
/// The corpus is the three crawler fixtures followed by the browser fixture,
/// one entry per line. All visiting orders are generated up front, each one is
/// timed once with `bench_database`, and the per-call statistics are written
/// to stdout.
fn main() {
    let crawler_fixtures = [CRAWLER_UAS, LOADKPI_CRAWLERS, PGTS_CRAWLERS];
    let crawler_count: usize = crawler_fixtures
        .iter()
        .map(|fixture| fixture.lines().count())
        .sum();
    let browser_count = BROWSER_UAS.lines().count();

    // Crawlers first, browsers last; the shuffled orders decide visiting order.
    let mut corpus: Vec<&str> = Vec::with_capacity(crawler_count + browser_count);
    for fixture in crawler_fixtures {
        corpus.extend(fixture.lines());
    }
    corpus.extend(BROWSER_UAS.lines());

    // Build every order before timing anything so shuffling is never measured.
    let orders = random_orders(corpus.len());
    let mut runs = Vec::with_capacity(orders.len());
    for order in &orders {
        runs.push(bench_database(&corpus, order));
    }
    let summary = summarize(runs);
    let throughput = calls_per_second(summary.median_ns);

    println!(
        "fixtures: {} total ({crawler_count} crawler, {browser_count} browser)",
        corpus.len()
    );
    println!("runs: {RUNS}, random order, passes/run: {PASSES}");
    println!(
        "database: {:>6.1} ns/call median, {:>6.1} best, \
         {:>6.1} mean, {:>7.2} M calls/s, {} matches/run",
        summary.median_ns,
        summary.best_ns,
        summary.mean_ns,
        throughput,
        summary.match_count
    );
}

#[cfg(not(feature = "database"))]
/// Fallback entry point used when the `database` feature is disabled: prints
/// the command that runs the real benchmark to stderr.
fn main() {
    let hint = "run with: cargo bench --features database --bench database";
    eprintln!("{hint}");
}

#[cfg(feature = "database")]
/// Times `PASSES` sweeps of `crawler_info` over `corpus`, visiting the entries
/// in the sequence given by `order`.
///
/// `order` holds indices into `corpus`. The returned [`Run`] carries the
/// elapsed wall time, the number of lookups that were actually timed
/// (`PASSES * order.len()`), and how many of them returned `Some`.
///
/// # Panics
///
/// Panics if `order` contains an index that is out of bounds for `corpus`.
fn bench_database(corpus: &[&str], order: &[usize]) -> Run {
    let mut match_count = 0usize;
    let start = Instant::now();

    for _ in 0..PASSES {
        for &index in order {
            // `black_box` on the input and on the result is an optimizer
            // barrier hint, so the lookup is neither specialized on the
            // fixture text nor discarded as unused.
            let matched = crawler_info(black_box(corpus[index])).is_some();
            match_count += usize::from(black_box(matched));
        }
    }
    let elapsed = start.elapsed();

    Run {
        elapsed,
        // One lookup per `order` entry per pass. Deriving this from
        // `corpus.len()` would misreport the per-call time whenever `order`
        // does not cover the corpus exactly once.
        calls: PASSES * order.len(),
        match_count,
    }
}

#[cfg(feature = "database")]
/// Builds `RUNS` visiting orders, each a shuffled copy of the indices `0..len`.
///
/// Every order gets its own seed (a fixed constant XORed with the run number),
/// so the same orders are produced on every invocation.
fn random_orders(len: usize) -> Vec<Vec<usize>> {
    const BASE_SEED: u64 = 0x9e37_79b9_7f4a_7c15;

    let mut orders = Vec::with_capacity(RUNS);
    for run in 0..RUNS {
        let mut indices: Vec<usize> = (0..len).collect();
        shuffle(&mut indices, BASE_SEED ^ run as u64);
        orders.push(indices);
    }
    orders
}

#[cfg(feature = "database")]
/// Shuffles `values` in place with a Fisher-Yates pass driven by `splitmix64`.
///
/// `state` is the seed: it is advanced once per swap, and the same seed always
/// produces the same permutation. Slices of length 0 or 1 are left untouched.
fn shuffle(values: &mut [usize], mut state: u64) {
    for i in (1..values.len()).rev() {
        state = splitmix64(state);
        // Reduce in `u64` before narrowing. Casting the raw state to `usize`
        // first would drop its upper bits on 32-bit targets and make the
        // permutation depend on pointer width.
        let bound = i as u64 + 1;
        let j = (state % bound) as usize; // j <= i, so it always fits in usize
        values.swap(i, j);
    }
}

#[cfg(feature = "database")]
/// One SplitMix64 step: adds the fixed increment to `value` and returns the
/// bit-mixed result.
///
/// All arithmetic wraps, so every `u64` input is valid.
fn splitmix64(value: u64) -> u64 {
    const INCREMENT: u64 = 0x9e37_79b9_7f4a_7c15;
    const MIX_A: u64 = 0xbf58_476d_1ce4_e5b9;
    const MIX_B: u64 = 0x94d0_49bb_1331_11eb;

    let z = value.wrapping_add(INCREMENT);
    let z = (z ^ (z >> 30)).wrapping_mul(MIX_A);
    let z = (z ^ (z >> 27)).wrapping_mul(MIX_B);
    z ^ (z >> 31)
}

#[cfg(feature = "database")]
/// Raw measurement produced by one call to `bench_database`.
struct Run {
    /// Wall-clock time spent inside the timed loop.
    elapsed: Duration,
    /// Number of lookups `elapsed` is divided by to get a per-call figure.
    calls: usize,
    /// How many of the timed lookups returned `Some`.
    match_count: usize,
}

#[cfg(feature = "database")]
/// Statistics over all runs, as printed by `main`.
struct Summary {
    /// Median of the per-run nanoseconds-per-call samples.
    median_ns: f64,
    /// Smallest per-run nanoseconds-per-call sample.
    best_ns: f64,
    /// Arithmetic mean of the per-run nanoseconds-per-call samples.
    mean_ns: f64,
    /// Match count shared by every run (`summarize` asserts they all agree).
    match_count: usize,
}

#[cfg(feature = "database")]
/// Reduces the timed runs to median, best and mean nanoseconds per call.
///
/// The median of an even number of samples is the average of the two middle
/// samples; for an odd number it is the middle sample.
///
/// # Panics
///
/// Panics if `runs` is empty, or if the runs do not all report the same
/// `match_count` (which would mean the lookups are not deterministic).
fn summarize(runs: Vec<Run>) -> Summary {
    assert!(!runs.is_empty(), "cannot summarize zero benchmark runs");

    let match_count = runs[0].match_count;
    assert!(
        runs.iter().all(|run| run.match_count == match_count),
        "benchmark runs produced inconsistent results"
    );

    let mut samples: Vec<f64> = runs
        .iter()
        .map(|run| ns_per_call(run.elapsed, run.calls))
        .collect();
    samples.sort_by(f64::total_cmp);

    // Taking `samples[len / 2]` alone would report the upper-middle sample for
    // even counts, which biases the "median" towards slower runs.
    let mid = samples.len() / 2;
    let median_ns = if samples.len() % 2 == 0 {
        (samples[mid - 1] + samples[mid]) / 2.0
    } else {
        samples[mid]
    };

    Summary {
        median_ns,
        best_ns: samples[0],
        mean_ns: samples.iter().sum::<f64>() / samples.len() as f64,
        match_count,
    }
}

#[cfg(feature = "database")]
/// Average cost of one call in nanoseconds: `elapsed` divided by `calls`.
///
/// This is floating-point division, so `calls == 0` yields NaN or infinity
/// instead of panicking.
fn ns_per_call(elapsed: Duration, calls: usize) -> f64 {
    let total_ns = elapsed.as_nanos() as f64;
    let call_count = calls as f64;
    total_ns / call_count
}

#[cfg(feature = "database")]
/// Converts a per-call cost in nanoseconds into throughput expressed in
/// **millions** of calls per second.
///
/// Despite the name, the result is already scaled down by 1e6 (1 ns/call maps
/// to 1000.0), which matches the "M calls/s" column printed by `main`.
fn calls_per_second(ns_per_call: f64) -> f64 {
    // 1e9 nanoseconds per second divided by 1e6 calls per "M" leaves 1e3.
    const NANOS_PER_SECOND_PER_MILLION: f64 = 1_000.0;
    NANOS_PER_SECOND_PER_MILLION / ns_per_call
}