overclocked_sort 0.1.1

A hyper-optimized Parallel Counting Sort utilizing L2 Cache-oblivious block sizing, SIMD Auto-vectorization, Prefix-Sum, and Zero-Runtime Dynamic Work Stealing.
Documentation
use std::sync::atomic::{AtomicUsize, Ordering};

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct KeyPtr {
    pub key: i32,
    pub ptr: u64,
}

pub fn overclocked_kp_sort(input: &[KeyPtr], max_val: usize) -> Vec<KeyPtr> {
    let num_elements = input.len();
    if num_elements == 0 {
        return vec![];
    }
    
    // Heuristic Check
    if input.windows(2).all(|w| w[0].key <= w[1].key) {
        return input.to_vec();
    }
    
    let num_threads = std::thread::available_parallelism().map(|n| n.get()).unwrap_or(4);
    
    // Fallback if M is too large (Adaptivity)
    let bucket_mem_bytes = num_threads.saturating_mul(max_val).saturating_mul(8);
    if bucket_mem_bytes > 512_000_000 || max_val > num_elements {
        let mut fallback = input.to_vec();
        fallback.sort_unstable_by_key(|kp| kp.key);
        return fallback;
    }
    
    let chunk_sz = num_elements / num_threads;
    
    // PHA 1: LOCAL COUNTING
    let mut local_counts = vec![vec![0usize; max_val + 1]; num_threads];
    
    std::thread::scope(|s| {
        for (i, counts) in local_counts.iter_mut().enumerate() {
            let start = i * chunk_sz;
            let end = if i == num_threads - 1 { num_elements } else { (i + 1) * chunk_sz };
            
            s.spawn(move || {
                for item in &input[start..end] {
                    counts[item.key as usize] += 1;
                }
            });
        }
    });
    
    // PHA 2: GLOBAL OFFSET MAPPING
    let mut global_offsets = vec![vec![0usize; max_val + 1]; num_threads];
    let mut total_offset = 0;
    
    for val in 0..=max_val {
        for t in 0..num_threads {
            global_offsets[t][val] = total_offset;
            total_offset += local_counts[t][val];
        }
    }
    
    // PHA 3: SHUFFLING (Parallel Scatter with Small Local Buffers)
    let mut result_vec = vec![KeyPtr { key: 0, ptr: 0 }; num_elements];
    
    // Using UnsafeCell/pointers for parallel mutable access without locks
    // Since we computed exact non-overlapping offsets, this is safe.
    let result_ptr = result_vec.as_mut_ptr() as usize;
    let offsets_ref = &global_offsets;
    
    std::thread::scope(|s| {
        for t_id in 0..num_threads {
            s.spawn(move || {
                let start = t_id * chunk_sz;
                let end = if t_id == num_threads - 1 { num_elements } else { (t_id + 1) * chunk_sz };
                
                let mut current_offsets = offsets_ref[t_id].clone();

                for item in &input[start..end] {
                    let target_pos = current_offsets[item.key as usize];
                    unsafe {
                        let ptr = (result_ptr as *mut KeyPtr).add(target_pos);
                        std::ptr::write(ptr, *item);
                    }
                    current_offsets[item.key as usize] += 1;
                }
            });
        }
    });
    
    result_vec
}