ssb 0.1.1

Simple benchmarking for Rust with a hierarchical call tree, based on fastrace.
Documentation
use fastrace::Span;
use ssb::Bench;

/// Decodes `data` as a sequence of little-endian `u32` words.
///
/// Only complete 4-byte groups are decoded. Up to three trailing bytes that
/// do not form a full word are ignored, so the result always agrees with
/// `parse_by_bytes` on the same input.
///
/// A span named "parse" is opened with `Span::enter_with_local_parent` and
/// held until the function returns.
fn parse(data: &[u8]) -> Vec<u32> {
    let _s = Span::enter_with_local_parent("parse");
    // `chunks_exact` never yields a short chunk, so no placeholder word is
    // fabricated for a ragged tail.
    data.chunks_exact(4)
        .map(|c| u32::from_le_bytes(c.try_into().expect("chunks_exact(4) yields 4-byte chunks")))
        .collect()
}

/// Decodes `data` as little-endian `u32` words, assembling each word one
/// byte at a time.
///
/// A word is emitted after every fourth byte. Any bytes left over at the end
/// (fewer than four) are dropped.
///
/// A span named "parse_by_bytes" is opened with
/// `Span::enter_with_local_parent` and held until the function returns.
fn parse_by_bytes(data: &[u8]) -> Vec<u32> {
    let _span = Span::enter_with_local_parent("parse_by_bytes");
    let mut words = Vec::new();
    let mut word: u32 = 0;
    for (idx, byte) in data.iter().enumerate() {
        // Position of this byte inside the word currently being built.
        let lane = idx % 4;
        word |= (*byte as u32) << (lane * 8);
        if lane == 3 {
            words.push(word);
            word = 0;
        }
    }
    // An incomplete trailing word is intentionally discarded.
    words
}

/// Returns the sum of every item multiplied by 31, accumulated as `u64`.
///
/// Each item is widened to `u64` before the multiplication.
#[fastrace::trace]
fn subprocess(items: &[u32]) -> u64 {
    let mut total: u64 = 0;
    for &item in items {
        total += item as u64 * 31;
    }
    total
}

/// Runs `subprocess` over `items` inside a span named "process" and returns
/// its result.
fn process(items: Vec<u32>) -> u64 {
    let process_span = Span::enter_with_local_parent("process");
    // Without set_local_parent(), spans opened by callees (here: subprocess)
    // would be recorded as siblings of "process" rather than as its children.
    // This two-step setup is equivalent to using #[fastrace::trace].
    let _local_parent = process_span.set_local_parent();
    let borrowed: &[u32] = &items;
    subprocess(borrowed)
}

/// Encodes `result` as its eight little-endian bytes.
///
/// A span named "serialize" is opened with `Span::enter_with_local_parent`
/// and held until the function returns.
fn serialize(result: u64) -> Vec<u8> {
    let _span = Span::enter_with_local_parent("serialize");
    let bytes: [u8; 8] = result.to_le_bytes();
    bytes.to_vec()
}

/// Runs the three stages in order: `parse`, then `process`, then `serialize`.
fn pipeline(data: &[u8]) -> Vec<u8> {
    let items = parse(data);
    let total = process(items);
    serialize(total)
}

/// Benchmark body: feeds a fixed 2048-byte input through `pipeline`.
fn bench_pipeline() {
    // 0..1024 as u16, each written as two little-endian bytes.
    let input: Vec<u8> = (0..1024u16).flat_map(u16::to_le_bytes).collect();
    // Only the work matters here; the serialized output is discarded.
    let _ = pipeline(&input);
}

/// Benchmarks `parse` and `parse_by_bytes` on the same fixed input, both
/// registered under the "parsing" group.
fn bench_parse(bench: &mut Bench) {
    // 0..1024 as u16, each written as two little-endian bytes (2048 bytes).
    let input: Vec<u8> = (0..1024u16).flat_map(u16::to_le_bytes).collect();
    // Put both parser benchmarks in one named group.
    let mut parsing = bench.group("parsing");

    parsing.name("parse").run(|| parse(&input));
    parsing.name("parse_by_bytes").run(|| parse_by_bytes(&input));
}

// Benchmark entry point: runs bench_pipeline() first, then bench_parse().
// When the benchmarks are run again, each result is shown with a comparison
// against the previous run.
ssb::bench_main!(bench_pipeline, bench_parse);