overclocked_sort 0.2.0

A hyper-optimized Parallel Counting Sort utilizing L2 Cache-oblivious block sizing, SIMD Auto-vectorization, Prefix-Sum, and Zero-Runtime Dynamic Work Stealing.
Documentation
use overclocked_sort::overclocked_parallel_sort;
use std::time::Instant;

// Bộ sinh số ngẫu nhiên đơn giản gọn nhẹ (Linear Congruential Generator)
struct Lcg { seed: u64 }
impl Lcg {
    fn next(&mut self) -> u32 {
        self.seed = self.seed.wrapping_mul(6364136223846793005).wrapping_add(1);
        (self.seed >> 32) as u32
    }
}

fn main() {
    println!("=== SO SÁNH OVERCLOCKED SORT VÀ STD::SORT_UNSTABLE ===\n");
    
    let mut rng = Lcg { seed: 123456789 };

    // =======================================================
    // 1. Trường hợp Vượt trội (Dense Data)
    // =======================================================
    let n1 = 50_000_000;
    let m1 = 100_000;
    println!("[1] TEST DENSE DATA (N: {}, M: {})", n1, m1);
    println!("-> Vùng sức mạnh lõi. Dữ liệu dày đặc (Dense). L2 Cache Oblivious phát huy sức mạnh.");
    let mut data1 = Vec::with_capacity(n1);
    for _ in 0..n1 { data1.push((rng.next() % (m1 as u32 + 1)) as i32); }
    let mut data1_clone = data1.clone();
    
    let start = Instant::now();
    let _ = overclocked_parallel_sort(&data1, m1);
    let elapsed_overclocked = start.elapsed();
    println!("Overclocked Sort:\t {:?}", elapsed_overclocked);

    let start = Instant::now();
    data1_clone.sort_unstable();
    let elapsed_std = start.elapsed();
    println!("std::sort_unstable:\t {:?}", elapsed_std);


    // =======================================================
    // 2. Trường hợp "Ác Mộng" cũ (Sparse Data / Extreme M)
    // =======================================================
    let n2 = 10_000_000;
    let m2 = 20_000_000; // M > N, kích hoạt Sparse Adaptivity
    println!("\n[2] TEST SPARSE DATA / EXTREME M (N: {}, M: {})", n2, m2);
    println!("-> Vùng từng dính Cache Thrashing (35s). Giờ kích hoạt Fallback Adaptivity.");
    let mut data2 = Vec::with_capacity(n2);
    for _ in 0..n2 { data2.push((rng.next() % (m2 as u32 + 1)) as i32); }
    let mut data2_clone = data2.clone();

    let start = Instant::now();
    let _ = overclocked_parallel_sort(&data2, m2);
    let elapsed_overclocked2 = start.elapsed();
    println!("Overclocked (Fallback): {:?}", elapsed_overclocked2);

    let start = Instant::now();
    data2_clone.sort_unstable();
    let elapsed_std2 = start.elapsed();
    println!("std::sort_unstable:\t {:?}", elapsed_std2);


    // =======================================================
    // 3. Trường hợp Đã được Sort (Heuristic Check)
    // =======================================================
    let n3 = 50_000_000;
    let m3 = 50_000_000;
    println!("\n[3] TEST ALREADY-SORTED DATA (N: {})", n3);
    println!("-> Mảng đã được sort từ trước. Kích hoạt Heuristic Fast-Path O(N).");
    let mut data3: Vec<i32> = (0..n3 as i32).collect();
    let mut data3_clone = data3.clone();

    let start = Instant::now();
    let _ = overclocked_parallel_sort(&data3, m3);
    let elapsed_overclocked3 = start.elapsed();
    println!("Overclocked (Heuristic): {:?}", elapsed_overclocked3);

    let start = Instant::now();
    data3_clone.sort_unstable(); // Sẽ nhanh nhờ cơ chế PDQ (Pattern-Defeating Quicksort)
    let elapsed_std3 = start.elapsed();
    println!("std::sort_unstable:\t {:?}", elapsed_std3);
}