overclocked_sort 0.2.0

A hyper-optimized Parallel Counting Sort utilizing L2 Cache-oblivious block sizing, SIMD Auto-vectorization, Prefix-Sum, and Zero-Runtime Dynamic Work Stealing.
Documentation
use overclocked_sort::overclocked_sort;
use rand::Rng;
use std::time::Instant;

/// Smoke-test / timing driver for `overclocked_sort`.
///
/// Fills a `Vec<u32>` with `n_test` random values drawn from
/// `1..100_000_000`, sorts it in place with `overclocked_sort`, prints the
/// elapsed wall-clock time of the sort call, and finally asserts that the
/// result is in non-decreasing order.
///
/// # Panics
///
/// Panics if the sorted vector contains an adjacent pair that is out of order.
fn main() {
    // NOTE(review): the banner announces 1 billion ("1 Tỷ") elements, but only
    // `n_test` elements are generated below — confirm which size is intended.
    println!("Init Array 1 Tỷ phần tử...");
    // Reduced element count so the check finishes quickly.
    let n_test = 100_000_000;

    // Fetch the thread-local generator handle once instead of once per element.
    let mut rng = rand::thread_rng();
    let mut arr: Vec<u32> = (0..n_test)
        .map(|_| rng.gen_range(1..100_000_000))
        .collect();

    println!("Start sorting (Hybrid Partitioning) {} items...", n_test);
    // Only the sort call itself is timed; input generation is excluded.
    let start = Instant::now();
    overclocked_sort(&mut arr);
    let dur = start.elapsed();
    println!("Time: {:?}", dur);

    // Verify the output is non-decreasing before reporting success.
    assert!(arr.windows(2).all(|w| w[0] <= w[1]));
    println!("Sorted Correctly!");
}