// urldecoder 2.0.0
//
// High performance URL batch decoder
// Documentation
#![cfg(feature = "bin")]
#[cfg(feature = "verbose-log")]
use std::sync::atomic::AtomicUsize;
use std::{
    fs::File,
    io::{BufWriter, Write},
    path::PathBuf,
};

use criterion::{Criterion, Throughput, criterion_group, criterion_main};
use rayon::iter::{IntoParallelIterator as _, IntoParallelRefIterator, ParallelIterator};
use tempfile::TempDir;
use urldecoder::decode_file;

/// Result alias whose error side is any boxed [`std::error::Error`] trait object.
pub type Result<T> = ::std::result::Result<T, Box<dyn ::std::error::Error + 'static>>;

/// Builds one repeatable chunk of benchmark input as raw bytes.
///
/// The chunk is ten times as long as the sample URL: the first nine tenths are
/// plain English filler (the filler sentence repeated and cut off at the exact
/// length needed), and the final tenth is a single URL containing
/// percent-encoded bytes.
fn generate_mixed_data() -> Vec<u8> {
    const URL: &str = "https://2.com/1?q=%E5%A4%A9%E6%B0%94";
    const FILLER: &str = "This is a standard chunk of text used to simulate the payload which acts as the ninety percent of the content stream. It contains spaces and normal sentences. ";

    // The URL accounts for 10% of the chunk, so the whole chunk is URL length x 10.
    let chunk_len = URL.len() * 10;
    let filler_len = chunk_len - URL.len();

    let mut chunk = Vec::with_capacity(chunk_len);
    // Repeat the filler sentence endlessly and keep exactly as many bytes as needed.
    chunk.extend(FILLER.bytes().cycle().take(filler_len));
    chunk.extend_from_slice(URL.as_bytes());
    chunk
}

/// Creates the on-disk fixture files that the benchmarks decode.
///
/// A temporary directory is created and 32 identical files named
/// `test_file_<index>.txt` are written into it in parallel. Every file holds
/// whole repetitions of the `generate_mixed_data` chunk, as few as are needed
/// to reach at least 32 KB.
///
/// Returns the temporary-directory guard, the paths of the written files, and
/// the combined size of all files in bytes. Per the `tempfile` documentation
/// the directory is removed when the guard is dropped, so callers keep the
/// guard alive for as long as the files are in use.
///
/// # Panics
///
/// Panics if the temporary directory cannot be created or if any file cannot
/// be created, written, or flushed.
fn prepare_test_env() -> (TempDir, Vec<PathBuf>, u64) {
    let tmp = tempfile::tempdir().expect("failed to create temp dir");
    let file_count = 32;
    // NOTE(review): the throughput figures recorded beside the benchmarks are
    // labelled "4 MB"; presumably they were measured with `4 * 1024 * 1024`
    // here instead of the current value.
    let min_file_size = 32 * 1024; // 32 KB

    let chunk = generate_mixed_data();

    // Append whole chunks until the minimum size is reached; the result may
    // overshoot by up to one chunk, which the reserved capacity accounts for.
    let mut payload = Vec::with_capacity(min_file_size + chunk.len());
    while payload.len() < min_file_size {
        payload.extend_from_slice(&chunk);
    }
    let bytes_per_file = payload.len() as u64;

    let paths: Vec<PathBuf> = (0..file_count)
        .into_par_iter()
        .map(|idx| {
            let target = tmp.path().join(format!("test_file_{}.txt", idx));
            let handle = File::create(&target).unwrap();
            let mut sink = BufWriter::new(handle);
            sink.write_all(&payload).unwrap();
            // Flush explicitly so a write error surfaces here rather than being
            // swallowed when the writer is dropped.
            sink.flush().unwrap();
            target
        })
        .collect();

    let total_bytes = bytes_per_file * file_count as u64;
    (tmp, paths, total_bytes)
}

/// Measures `decode_file` throughput over the fixture files, with and without
/// `dry_run`.
///
/// Both benchmarks run `decode_file` on every fixture file in parallel via
/// rayon and report throughput against the combined size of all files.
///
/// # Panics
///
/// Panics if fixture preparation fails or if `decode_file` returns an error.
fn bench_decode_throughput(c: &mut Criterion) {
    let (temp_dir, paths, total_bytes) = prepare_test_env();

    // One complete parallel pass over all fixture files. The two benchmarks
    // differ only in the `dry_run` flag, so they share this body.
    let decode_all = |dry_run: bool| {
        let escape_space = false;
        #[cfg(feature = "verbose-log")]
        let processed_count = AtomicUsize::new(0);
        #[cfg(feature = "verbose-log")]
        let changed_count = AtomicUsize::new(0);
        paths.par_iter().for_each(|path| {
            decode_file(
                path,
                escape_space,
                dry_run,
                #[cfg(feature = "verbose-log")]
                false,
                #[cfg(feature = "verbose-log")]
                &processed_count,
                #[cfg(feature = "verbose-log")]
                &changed_count,
            )
            .unwrap()
        })
    };

    let mut group = c.benchmark_group("decode_throughput");
    group.throughput(Throughput::Bytes(total_bytes));

    // Previously recorded figures (4 MB files):
    // unsafe: 27.133 GiB/s
    group.bench_function("rayon_decode_dry_run", |b| b.iter(|| decode_all(true)));

    // Previously recorded figures (4 MB files):
    // unsafe: 21.860 GiB/s
    // safe: 11.954 GiB/s
    //
    // NOTE(review): with `dry_run = false`, `decode_file` presumably rewrites
    // the fixture files, so iterations after the first may be measuring
    // already-decoded input — confirm against `decode_file`.
    group.bench_function("rayon_decode", |b| b.iter(|| decode_all(false)));

    group.finish();
    // Keep the temporary directory alive until every benchmark has finished.
    drop(temp_dir);
}

// Register `bench_decode_throughput` in a Criterion group named `benches`, then
// have `criterion_main!` generate the `main` entry point that runs that group.
criterion_group!(benches, bench_decode_throughput);
criterion_main!(benches);