use std::sync::atomic::{AtomicUsize, Ordering};
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct KeyPtr {
pub key: i32,
pub ptr: u64,
}
pub fn overclocked_kp_sort(input: &[KeyPtr], max_val: usize) -> Vec<KeyPtr> {
let num_elements = input.len();
if num_elements == 0 {
return vec![];
}
if input.windows(2).all(|w| w[0].key <= w[1].key) {
return input.to_vec();
}
let num_threads = std::thread::available_parallelism().map(|n| n.get()).unwrap_or(4);
let bucket_mem_bytes = num_threads.saturating_mul(max_val).saturating_mul(8);
if bucket_mem_bytes > 512_000_000 || max_val > num_elements {
let mut fallback = input.to_vec();
fallback.sort_unstable_by_key(|kp| kp.key);
return fallback;
}
let chunk_sz = num_elements / num_threads;
let mut local_counts = vec![vec![0usize; max_val + 1]; num_threads];
std::thread::scope(|s| {
for (i, counts) in local_counts.iter_mut().enumerate() {
let start = i * chunk_sz;
let end = if i == num_threads - 1 { num_elements } else { (i + 1) * chunk_sz };
s.spawn(move || {
for item in &input[start..end] {
counts[item.key as usize] += 1;
}
});
}
});
let mut global_offsets = vec![vec![0usize; max_val + 1]; num_threads];
let mut total_offset = 0;
for val in 0..=max_val {
for t in 0..num_threads {
global_offsets[t][val] = total_offset;
total_offset += local_counts[t][val];
}
}
let mut result_vec = vec![KeyPtr { key: 0, ptr: 0 }; num_elements];
let result_ptr = result_vec.as_mut_ptr() as usize;
let offsets_ref = &global_offsets;
std::thread::scope(|s| {
for t_id in 0..num_threads {
s.spawn(move || {
let start = t_id * chunk_sz;
let end = if t_id == num_threads - 1 { num_elements } else { (t_id + 1) * chunk_sz };
let mut current_offsets = offsets_ref[t_id].clone();
for item in &input[start..end] {
let target_pos = current_offsets[item.key as usize];
unsafe {
let ptr = (result_ptr as *mut KeyPtr).add(target_pos);
std::ptr::write(ptr, *item);
}
current_offsets[item.key as usize] += 1;
}
});
}
});
result_vec
}