overclocked_sort 0.2.0

A hyper-optimized Parallel Counting Sort utilizing L2 Cache-oblivious block sizing, SIMD Auto-vectorization, Prefix-Sum, and Zero-Runtime Dynamic Work Stealing.
Documentation
/// A signed 32-bit sort key paired with a 64-bit `ptr` value.
///
/// The derived `PartialEq`/`Eq` compare both `key` and `ptr`; `Default`
/// yields `key == 0` and `ptr == 0`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
pub struct KeyPtr {
    /// Value the element is ordered by.
    pub key: i32,
    /// Payload carried alongside the key.
    /// Presumably a reference to the associated record — TODO confirm.
    pub ptr: u64,
}

// `PartialOrd` defers to the total order defined by `Ord`, so the two traits
// can never disagree and `partial_cmp` never yields `None`.
impl PartialOrd for KeyPtr {
    /// Returns `Some` of whatever [`Ord::cmp`] reports for the same operands.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        let ordering = Ord::cmp(self, other);
        Some(ordering)
    }
}
// Ordering looks at `key` alone; `ptr` never participates.
// NOTE(review): the derived `PartialEq` compares `ptr` as well, so two values
// can compare `Ordering::Equal` here while being `!=`. `Ord` documents that
// `cmp` and `==` should agree — confirm this mismatch is intentional.
impl Ord for KeyPtr {
    /// Compares the two `key` fields and ignores `ptr`.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let (lhs, rhs) = (self.key, other.key);
        lhs.cmp(&rhs)
    }
}

impl crate::SortableKey for KeyPtr {
    type KeyType = usize;

    // NOTE(review): presumably tells the sorter it cannot rebuild elements via
    // `from_key` — confirm against the trait's documentation.
    const IS_PRIMITIVE: bool = false;

    /// Maps the signed `key` onto a non-negative index by subtracting
    /// `i32::MIN` (i.e. adding 2^31), so `i32::MIN` becomes `0` and the
    /// numeric order of keys is preserved.
    #[inline(always)]
    fn extract_key(&self) -> usize {
        // Widen to i64 first so the shift cannot overflow.
        let shifted = i64::from(self.key) - i64::from(i32::MIN);
        shifted as usize
    }

    /// Always panics: the `ptr` half of a `KeyPtr` cannot be recovered from a
    /// bare key.
    #[inline(always)]
    fn from_key(_k: usize) -> Self {
        unreachable!("Cannot synthesize KeyPtr from key")
    }
}

/// Delegates to `crate::overclocked_kp_sort`, forwarding both arguments
/// unchanged and returning the callee's `Vec<KeyPtr>` as-is.
///
/// `_max_val` is passed through even though its leading underscore would
/// normally mark an unused binding. Its meaning is defined by the callee —
/// presumably an upper bound on the extracted keys; TODO confirm.
// NOTE(review): this relies on the function living in a submodule. At the
// crate root the path `crate::overclocked_kp_sort` would name this function
// itself and recurse without end.
pub fn overclocked_kp_sort(input: &[KeyPtr], _max_val: usize) -> Vec<KeyPtr> {
    crate::overclocked_kp_sort(input, _max_val)
}