// learned-partition-sort 0.1.0

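//! Core implementation of Learned Partition Sort.
//!
//! The algorithm works in four phases:
//! 1. **Scan**: Scan the whole input to learn its min/max bounds
//! 2. **Count**: Map every element to a bucket index by linear interpolation
//!    between min and max, and count the elements per bucket
//! 3. **Scatter**: Move elements into their bucket regions (via an auxiliary
//!    buffer, or in place)
//! 4. **Refine**: Sort each bucket (in parallel with Rayon)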

use rayon::prelude::*;

/// Thin wrapper around a raw `*mut T` so the pointer can be captured by
/// closures that run on other threads.
///
/// The wrapper only stores and hands back the pointer; it never dereferences
/// it. Whoever dereferences the pointer must guarantee that concurrent users
/// touch non-overlapping memory.
#[derive(Clone, Copy)]
struct SendPtr<T>(*mut T);

// SAFETY: the wrapper performs no memory access itself. Soundness of any
// cross-thread use rests on the code that dereferences the pointer keeping
// its accesses disjoint, as stated in the type-level documentation.
unsafe impl<T> Send for SendPtr<T> where T: Send {}
unsafe impl<T> Sync for SendPtr<T> where T: Sync {}

impl<T> SendPtr<T> {
    /// Unwraps and returns the stored raw pointer. Dereferencing it is the
    /// caller's responsibility.
    #[inline]
    fn get(self) -> *mut T {
        let SendPtr(raw) = self;
        raw
    }
}

/// Hybridization threshold: slices shorter than this many elements are sorted
/// with the standard library's `sort_unstable` instead of being partitioned.
/// According to the author's benchmarks LPS wins at N >= 100K, so 32K (32 768)
/// is used as a safe cutoff.
const HYBRID_THRESHOLD: usize = 32768;

/// Target number of elements per bucket (fits in L2 cache).
/// The number of buckets is derived as `n / BUCKET_TARGET_SIZE`, with a
/// minimum of one bucket.
const BUCKET_TARGET_SIZE: usize = 512;

/// Maximum bucket size multiplier before a bucket is treated as overflowing.
/// A bucket holding more than this × the expected bucket size means the
/// linear distribution model did not fit the data for that bucket.
/// NOTE(review): in the bucket-refinement step the overflow branch and the
/// normal branch currently both call `sort_unstable`, so this factor does not
/// change behaviour yet.
const BUCKET_OVERFLOW_FACTOR: usize = 4;

/// Buckets with fewer elements than this are sorted with insertion sort;
/// larger buckets use `sort_unstable`.
const INSERTION_SORT_THRESHOLD: usize = 32;

/// Sorts a slice using the Learned Partition Sort algorithm.
///
/// Every element is converted to an `i64` key through `Into<i64>`. The key is
/// linearly interpolated between the smallest and largest key in the slice to
/// pick a bucket, elements are scattered into their buckets, and each bucket
/// is then sorted on its own. On well-distributed numerical data the buckets
/// stay small, so the total work is close to O(N).
///
/// The `Into<i64>` conversion must be order-preserving with respect to `Ord`
/// (as it is for the primitive integer types); otherwise bucket order and
/// element order disagree and the result is not sorted.
///
/// # Algorithm
///
/// 1. Inputs shorter than `HYBRID_THRESHOLD` (32 768 elements) are handed to
///    `sort_unstable`
/// 2. One full pass finds the min/max keys. If all keys are equal the slice is
///    returned untouched. If the key span `max - min` does not fit in an
///    `i64`, the slice is sorted with `sort_unstable` instead
/// 3. Elements are counted per bucket and scattered into an auxiliary buffer
///    of the same length as the input
/// 4. Each bucket is sorted in parallel using Rayon and the result is copied
///    back into `arr`
///
/// # Examples
///
/// ```
/// use learned_partition_sort::learned_sort;
///
/// let mut data: Vec<i64> = vec![5, 2, 8, 1, 9];
/// learned_sort(&mut data);
/// assert_eq!(data, vec![1, 2, 5, 8, 9]);
/// ```
pub fn learned_sort<T>(arr: &mut [T])
where
    T: Ord + Copy + Send + Sync + Into<i64>,
{
    let n = arr.len();

    // Hybridization: small inputs (including empty and single-element
    // slices, for which this is a no-op) go straight to the standard sort.
    if n < HYBRID_THRESHOLD {
        arr.sort_unstable();
        return;
    }

    // Phase 1: full scan for the key bounds.
    let (min_val, max_val) = sample_minmax(arr);

    // Handle edge case: all keys are identical, nothing to reorder.
    if min_val == max_val {
        return;
    }

    // The bucket mapping evaluates `max_val - min_val` and `val - min_val` in
    // `i64` arithmetic. When the span does not fit in an `i64` (e.g. data
    // containing both `i64::MIN` and `i64::MAX`) those subtractions overflow:
    // a panic in debug builds and a wrapped, meaningless bucket index in
    // release builds. Such inputs are sorted with the comparison sort.
    if max_val.checked_sub(min_val).is_none() {
        arr.sort_unstable();
        return;
    }

    // Roughly BUCKET_TARGET_SIZE elements per bucket, at least one bucket.
    let num_buckets = (n / BUCKET_TARGET_SIZE).max(1);

    // Phase 2: histogram of bucket sizes, then start offsets via prefix sum.
    let mut counts = count_buckets(arr, min_val, max_val, num_buckets);
    let offsets = prefix_sum(&counts);

    // Phase 3: scatter into an auxiliary buffer. The fill value is irrelevant;
    // every slot is overwritten by `scatter`, which also reuses `counts` as
    // its per-bucket write cursors.
    let mut aux = vec![arr[0]; n];
    scatter(arr, &mut aux, &mut counts, &offsets, min_val, max_val, num_buckets);

    // Phase 4: refine - sort each bucket in parallel.
    refine_buckets(&mut aux, &offsets, num_buckets, n);

    // Copy the sorted data back into the caller's slice.
    arr.copy_from_slice(&aux);
}

/// Sorts a slice using an in-place variant of Learned Partition Sort.
///
/// This version needs only **O(num_buckets)** additional memory (a handful of
/// `usize` values per bucket, with `num_buckets = n / 512`) instead of a
/// second buffer of N elements, making it suitable for very large arrays
/// (100M+ elements) or memory-constrained environments where allocating an
/// auxiliary buffer would fail.
///
/// As with [`learned_sort`], the `Into<i64>` conversion must be
/// order-preserving with respect to `Ord`.
///
/// # ⚠️ Performance Warning
///
/// According to the author's benchmarks this function is **~5x slower** than
/// [`learned_sort`] because the cycle-following permutation jumps between
/// bucket regions. Only use it when memory is more important than speed.
///
/// # Trade-offs
///
/// Timings are the original author's measurements.
///
/// | Metric | `learned_sort` | `learned_sort_inplace` |
/// |--------|----------------|------------------------|
/// | Memory | 2N (data + aux) | N + O(N / 512) bookkeeping |
/// | Time @ 100M | 3.3s | 16.3s |
/// | Throughput | 30 Melem/s | 6 Melem/s |
///
/// # When to Use
///
/// - ✅ Sorting 100M+ elements on 8GB RAM machines
/// - ✅ Serverless/embedded with strict memory limits
/// - ❌ When speed matters more than memory
///
/// # Algorithm
///
/// Uses cycle-sort style in-place permutation:
/// 1. Inputs shorter than `HYBRID_THRESHOLD` (32 768 elements) are handed to
///    `sort_unstable`; so are inputs whose key span `max - min` does not fit
///    in an `i64`
/// 2. Count elements per bucket and compute bucket start positions
/// 3. Follow permutation cycles to move elements to correct buckets in-place
/// 4. Sort each bucket in parallel
///
/// # Examples
///
/// ```
/// use learned_partition_sort::learned_sort_inplace;
///
/// let mut data: Vec<i64> = vec![5, 2, 8, 1, 9];
/// learned_sort_inplace(&mut data);
/// assert_eq!(data, vec![1, 2, 5, 8, 9]);
/// ```
pub fn learned_sort_inplace<T>(arr: &mut [T])
where
    T: Ord + Copy + Send + Sync + Into<i64>,
{
    let n = arr.len();

    // Hybridization: small inputs (including empty and single-element
    // slices, for which this is a no-op) go straight to the standard sort.
    if n < HYBRID_THRESHOLD {
        arr.sort_unstable();
        return;
    }

    // Phase 1: full scan for the key bounds.
    let (min_val, max_val) = sample_minmax(arr);

    // Handle edge case: all keys are identical, nothing to reorder.
    if min_val == max_val {
        return;
    }

    // The bucket mapping evaluates `max_val - min_val` and `val - min_val` in
    // `i64` arithmetic. When the span does not fit in an `i64` (e.g. data
    // containing both `i64::MIN` and `i64::MAX`) those subtractions overflow:
    // a panic in debug builds and a wrapped, meaningless bucket index in
    // release builds. Such inputs are sorted with the comparison sort, which
    // is also in place.
    if max_val.checked_sub(min_val).is_none() {
        arr.sort_unstable();
        return;
    }

    // Roughly BUCKET_TARGET_SIZE elements per bucket, at least one bucket.
    let num_buckets = (n / BUCKET_TARGET_SIZE).max(1);

    // Phase 2: histogram of bucket sizes, then start offsets via prefix sum.
    let counts = count_buckets(arr, min_val, max_val, num_buckets);
    let offsets = prefix_sum(&counts);

    // Phase 3: permute elements into their bucket regions without a buffer.
    scatter_inplace(arr, &offsets, min_val, max_val, num_buckets);

    // Phase 4: refine - sort each bucket in parallel.
    refine_buckets(arr, &offsets, num_buckets, n);
}

/// Maps a key to its bucket index.
///
/// The distance of `val` from `min_val` is multiplied by `scale`, truncated
/// to an integer (negative products saturate to 0), and clamped so the result
/// never exceeds `num_buckets - 1`.
#[inline]
fn compute_bucket(val: i64, min_val: i64, scale: f64, num_buckets: usize) -> usize {
    let distance = (val - min_val) as f64;
    let projected = (distance * scale) as usize;
    std::cmp::min(projected, num_buckets - 1)
}

/// Permutes elements in-place so that every element ends up inside the index
/// range of its bucket, using a cycle-sort style walk.
///
/// Uses O(num_buckets) additional memory: one write cursor per bucket.
/// Bucket regions are processed in ascending order.
///
/// # Parameters
/// - `arr`: the slice to permute; element order inside a bucket is not
///   defined afterwards.
/// - `offsets`: bucket boundaries; bucket `b` owns `arr[offsets[b]..offsets[b + 1]]`,
///   so at least `num_buckets + 1` entries are read.
/// - `min_val`, `max_val`: key bounds used to derive the interpolation scale.
/// - `num_buckets`: number of buckets.
///
/// # Preconditions
/// - NOTE(review): the walk appears to rely on `offsets` being the prefix sum
///   of per-bucket counts computed with the same `min_val`, `max_val` and
///   `num_buckets`; with inconsistent offsets a write cursor could run past
///   its region and the indexing below would panic — confirm against callers.
/// - `max_val - min_val` is evaluated in `i64`, so the span must not overflow,
///   and it should be non-zero because it is used as a divisor.
fn scatter_inplace<T>(
    arr: &mut [T],
    offsets: &[usize],
    min_val: i64,
    max_val: i64,
    num_buckets: usize,
) where
    T: Copy + Into<i64>,
{
    // Same scale formula as the counting pass; the 0.001 keeps the product for
    // `max_val` just below `num_buckets`.
    let range = (max_val - min_val) as f64;
    let scale = (num_buckets as f64 - 0.001) / range;

    // Write cursors: next position to write in each bucket.
    // Each cursor starts at the beginning of its bucket's region.
    let mut write_cursors: Vec<usize> = offsets[..num_buckets].to_vec();

    // Process each bucket region
    for bucket in 0..num_buckets {
        let bucket_start = offsets[bucket];
        let bucket_end = offsets[bucket + 1];

        // Process each position in the bucket region
        let mut pos = bucket_start;
        while pos < bucket_end {
            let current_val: i64 = arr[pos].into();
            let target_bucket = compute_bucket(current_val, min_val, scale, num_buckets);

            if target_bucket == bucket {
                // Element already in correct bucket region
                // Advance write cursor if needed
                if write_cursors[bucket] <= pos {
                    write_cursors[bucket] = pos + 1;
                }
                pos += 1;
                continue;
            }

            // Element needs to move - follow the permutation cycle.
            // `arr[pos]` is lifted out; `pos` acts as the hole that the cycle
            // must eventually fill with an element belonging to `bucket`.
            let mut current = arr[pos];
            let mut current_bucket = target_bucket;

            loop {
                // Get destination position: the next free slot of the bucket
                // that the element in hand belongs to.
                let dest_pos = write_cursors[current_bucket];
                
                // Advance cursor for this bucket
                write_cursors[current_bucket] += 1;

                // Swap: drop the element in hand into the slot and pick up
                // whatever was stored there.
                let next = arr[dest_pos];
                arr[dest_pos] = current;
                
                let next_bucket = compute_bucket(next.into(), min_val, scale, num_buckets);

                // Check if cycle is complete: the displaced element belongs to
                // the bucket being processed, so it can fill the hole.
                if next_bucket == bucket {
                    // Put the final element back
                    arr[pos] = next;
                    break;
                }

                // Otherwise keep walking with the displaced element. Note that
                // `current_bucket` never equals `bucket` inside this loop.
                current = next;
                current_bucket = next_bucket;
            }

            // Update write cursor for current bucket
            if write_cursors[bucket] <= pos {
                write_cursors[bucket] = pos + 1;
            }
            pos += 1;
        }
    }
}

/// Finds the smallest and largest `i64` key in `arr`.
///
/// Despite the name, every element is inspected: a partial sample could miss
/// outliers. Returns `(min, max)`.
///
/// # Panics
/// Panics if `arr` is empty, because the first element seeds both bounds.
#[inline]
fn sample_minmax<T>(arr: &[T]) -> (i64, i64)
where
    T: Ord + Copy + Into<i64>,
{
    let seed: i64 = arr[0].into();

    arr.iter().fold((seed, seed), |(lowest, highest), &element| {
        let key: i64 = element.into();
        (lowest.min(key), highest.max(key))
    })
}

/// Builds the bucket histogram for `arr`.
///
/// Each element's key is linearly interpolated between `min_val` and
/// `max_val` to a bucket index in `0..num_buckets`; the returned vector holds
/// the number of elements that landed in each bucket.
///
/// `max_val - min_val` is evaluated in `i64`, so the span must not overflow.
#[inline]
fn count_buckets<T>(arr: &[T], min_val: i64, max_val: i64, num_buckets: usize) -> Vec<usize>
where
    T: Copy + Into<i64>,
{
    let mut histogram = vec![0usize; num_buckets];
    let span = (max_val - min_val) as f64;
    // Slightly below `num_buckets / span` so the product for `max_val` stays
    // under `num_buckets`.
    let factor = (num_buckets as f64 - 0.001) / span;

    arr.iter()
        .map(|&element| {
            let key: i64 = element.into();
            let raw_index = ((key - min_val) as f64 * factor) as usize;
            // Safety clamp against rounding at the upper edge.
            std::cmp::min(raw_index, num_buckets - 1)
        })
        .for_each(|slot| histogram[slot] += 1);

    histogram
}

/// Turns per-bucket counts into bucket boundaries (exclusive prefix sum).
///
/// The result has `counts.len() + 1` entries: `offsets[i]` is the index at
/// which bucket `i` starts, and the final entry is the total element count,
/// i.e. the end of the last bucket.
#[inline]
fn prefix_sum(counts: &[usize]) -> Vec<usize> {
    let mut offsets = Vec::with_capacity(counts.len() + 1);
    // The first bucket always starts at index 0.
    offsets.push(0);

    let mut running_total = 0usize;
    offsets.extend(counts.iter().map(|&bucket_count| {
        running_total += bucket_count;
        running_total
    }));

    offsets
}

/// Copies every element of `src` into its bucket region inside `aux`.
///
/// `counts` is overwritten and reused as the per-bucket write cursors: each
/// entry is first reset to the bucket's start offset and then advanced once
/// per element written, so on return it holds the end of the written region.
///
/// # Safety
/// The hot loop writes through `get_unchecked_mut`. That is in bounds only if
/// `offsets` is the prefix sum of the histogram produced for this same `src`
/// with the same `min_val`, `max_val` and `num_buckets`, and `aux` is at
/// least as long as `src`.
#[inline]
fn scatter<T>(
    src: &[T],
    aux: &mut [T],
    counts: &mut [usize],
    offsets: &[usize],
    min_val: i64,
    max_val: i64,
    num_buckets: usize,
) where
    T: Copy + Into<i64>,
{
    // Turn the histogram into write cursors positioned at each bucket start.
    for bucket in 0..counts.len() {
        counts[bucket] = offsets[bucket];
    }

    let span = (max_val - min_val) as f64;
    let factor = (num_buckets as f64 - 0.001) / span;

    src.iter().for_each(|&element| {
        let key: i64 = element.into();
        let raw_index = ((key - min_val) as f64 * factor) as usize;

        // Claim the next free slot of the element's bucket.
        let cursor = &mut counts[raw_index.min(num_buckets - 1)];
        let destination = *cursor;
        *cursor += 1;

        // SAFETY: each cursor starts at its bucket's offset and is advanced
        // once per element mapped to that bucket. With offsets derived from
        // the matching histogram, a cursor never leaves its bucket's region,
        // which lies inside `aux`.
        unsafe {
            *aux.get_unchecked_mut(destination) = element;
        }
    });
}

/// Sorts each bucket in parallel using Rayon.
///
/// Bucket `i` is the sub-slice `aux[offsets[i]..offsets[i + 1]]`. Buckets with
/// fewer than `INSERTION_SORT_THRESHOLD` elements use insertion sort, larger
/// ones use `sort_unstable`. Buckets of length 0 or 1 are skipped.
///
/// # Parameters
/// - `aux`: the slice whose bucket regions are sorted in place.
/// - `offsets`: bucket boundaries, at least `num_buckets + 1` entries.
/// - `num_buckets`: number of buckets; must be non-zero because it is used as
///   a divisor.
/// - `total_len`: total element count, used to derive the expected bucket size.
///
/// # Panics
/// Panics if `num_buckets` is zero, if `offsets` has fewer than
/// `num_buckets + 1` entries, if the boundaries are not non-decreasing, or if
/// the last boundary lies beyond `aux.len()`. These checks are what make the
/// raw-pointer slicing below sound for any input.
fn refine_buckets<T>(aux: &mut [T], offsets: &[usize], num_buckets: usize, total_len: usize)
where
    T: Ord + Copy + Send + Sync,
{
    let expected_bucket_size = total_len / num_buckets;
    let overflow_threshold = expected_bucket_size * BUCKET_OVERFLOW_FACTOR;

    // Validate the boundaries once, up front. This is O(num_buckets) and is
    // the invariant the unsafe block relies on: without it, a decreasing pair
    // would make `end - start` wrap in release builds and an oversized
    // boundary would reach past the end of `aux`.
    assert!(
        offsets.len() > num_buckets,
        "refine_buckets: expected at least num_buckets + 1 offsets"
    );
    assert!(
        (0..num_buckets).all(|i| offsets[i] <= offsets[i + 1]),
        "refine_buckets: bucket offsets must be non-decreasing"
    );
    assert!(
        offsets[num_buckets] <= aux.len(),
        "refine_buckets: bucket offsets exceed the slice length"
    );

    let ptr = SendPtr(aux.as_mut_ptr());

    // Materialize the (start, end) pair of every bucket so Rayon can split
    // the work across threads.
    let bucket_ranges: Vec<(usize, usize)> = (0..num_buckets)
        .map(|i| (offsets[i], offsets[i + 1]))
        .collect();

    // Sort buckets in parallel
    bucket_ranges.par_iter().for_each(move |&(start, end)| {
        let bucket_len = end - start;
        if bucket_len <= 1 {
            return;
        }

        // SAFETY: the assertions above guarantee `start <= end <= aux.len()`
        // and that the boundaries are non-decreasing, so the ranges
        // `[offsets[i], offsets[i + 1])` are in bounds and pairwise disjoint.
        // Each range is visited exactly once, and `aux` is exclusively
        // borrowed by this function for the whole parallel loop.
        let bucket_slice =
            unsafe { std::slice::from_raw_parts_mut(ptr.get().add(start), bucket_len) };

        if bucket_len < INSERTION_SORT_THRESHOLD {
            insertion_sort(bucket_slice);
        } else if bucket_len > overflow_threshold {
            // Distribution model failed for this bucket - use robust fallback.
            // Currently the same routine as the normal case below.
            bucket_slice.sort_unstable();
        } else {
            // Normal bucket - still use sort_unstable (fast for small slices)
            bucket_slice.sort_unstable();
        }
    });
}

/// Sorts a slice in place with insertion sort.
///
/// Quadratic in the worst case, but with very little overhead, which makes it
/// a good fit for the tiny buckets it is used on. Equal elements keep their
/// relative order.
#[inline]
fn insertion_sort<T: Ord + Copy>(arr: &mut [T]) {
    for sorted_len in 1..arr.len() {
        // `arr[..sorted_len]` is already sorted; insert the next element.
        let pending = arr[sorted_len];
        let mut hole = sorted_len;

        // Shift larger elements one slot to the right until the hole reaches
        // the insertion point.
        while hole > 0 {
            let left = arr[hole - 1];
            if left <= pending {
                break;
            }
            arr[hole] = left;
            hole -= 1;
        }

        arr[hole] = pending;
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use rand::prelude::*;

    /// Input length that is guaranteed to take the bucket-partition path.
    /// Anything shorter than `HYBRID_THRESHOLD` is delegated to
    /// `sort_unstable` and would not exercise the algorithm under test.
    const LEARNED_LEN: usize = 2 * HYBRID_THRESHOLD;

    // ============ Tests for learned_sort ============

    #[test]
    fn test_empty_slice() {
        let mut data: Vec<i64> = vec![];
        learned_sort(&mut data);
        assert!(data.is_empty());
    }

    #[test]
    fn test_single_element() {
        let mut data = vec![42i64];
        learned_sort(&mut data);
        assert_eq!(data, vec![42]);
    }

    #[test]
    fn test_two_elements() {
        let mut data = vec![5i64, 3];
        learned_sort(&mut data);
        assert_eq!(data, vec![3, 5]);
    }

    #[test]
    fn test_small_array_uses_fallback() {
        // Deliberately below HYBRID_THRESHOLD: covers the fallback path.
        let mut data: Vec<i64> = (0..100).rev().collect();
        learned_sort(&mut data);
        assert_eq!(data, (0..100).collect::<Vec<_>>());
    }

    #[test]
    fn test_medium_array() {
        // Exactly HYBRID_THRESHOLD elements: the smallest input that takes
        // the partition path.
        let len = HYBRID_THRESHOLD as i64;
        let mut data: Vec<i64> = (0..len).rev().collect();
        learned_sort(&mut data);
        assert_eq!(data, (0..len).collect::<Vec<_>>());
    }

    #[test]
    fn test_large_uniform_distribution() {
        let mut rng = rand::thread_rng();
        let mut data: Vec<i64> = (0..100_000).map(|_| rng.gen_range(0..1_000_000)).collect();
        let mut expected = data.clone();
        expected.sort_unstable();

        learned_sort(&mut data);
        assert_eq!(data, expected);
    }

    #[test]
    fn test_sorted_input() {
        let mut data: Vec<i64> = (0..LEARNED_LEN as i64).collect();
        let expected = data.clone();
        learned_sort(&mut data);
        assert_eq!(data, expected);
    }

    #[test]
    fn test_reverse_sorted() {
        let mut data: Vec<i64> = (0..LEARNED_LEN as i64).rev().collect();
        let expected: Vec<i64> = (0..LEARNED_LEN as i64).collect();
        learned_sort(&mut data);
        assert_eq!(data, expected);
    }

    #[test]
    fn test_duplicates() {
        // All keys equal: exercises the min == max early return.
        let mut data: Vec<i64> = vec![5; LEARNED_LEN];
        let expected = data.clone();
        learned_sort(&mut data);
        assert_eq!(data, expected);
    }

    #[test]
    fn test_many_duplicates() {
        // Only ten distinct keys: most buckets stay empty, a few are huge.
        let mut rng = rand::thread_rng();
        let mut data: Vec<i64> = (0..LEARNED_LEN).map(|_| rng.gen_range(0..10)).collect();
        let mut expected = data.clone();
        expected.sort_unstable();

        learned_sort(&mut data);
        assert_eq!(data, expected);
    }

    #[test]
    fn test_negative_numbers() {
        let mut rng = rand::thread_rng();
        let mut data: Vec<i64> = (0..LEARNED_LEN)
            .map(|_| rng.gen_range(-500_000..500_000))
            .collect();
        let mut expected = data.clone();
        expected.sort_unstable();

        learned_sort(&mut data);
        assert_eq!(data, expected);
    }

    #[test]
    fn test_i32_type() {
        let mut rng = rand::thread_rng();
        let mut data: Vec<i32> = (0..LEARNED_LEN)
            .map(|_| rng.gen_range(0..1_000_000))
            .collect();
        let mut expected = data.clone();
        expected.sort_unstable();

        learned_sort(&mut data);
        assert_eq!(data, expected);
    }

    // ============ Tests for learned_sort_inplace ============

    #[test]
    fn test_inplace_empty_slice() {
        let mut data: Vec<i64> = vec![];
        learned_sort_inplace(&mut data);
        assert!(data.is_empty());
    }

    #[test]
    fn test_inplace_single_element() {
        let mut data = vec![42i64];
        learned_sort_inplace(&mut data);
        assert_eq!(data, vec![42]);
    }

    #[test]
    fn test_inplace_small_array() {
        // Deliberately below HYBRID_THRESHOLD: covers the fallback path.
        let mut data: Vec<i64> = (0..100).rev().collect();
        learned_sort_inplace(&mut data);
        assert_eq!(data, (0..100).collect::<Vec<_>>());
    }

    #[test]
    fn test_inplace_large_uniform() {
        let mut rng = rand::thread_rng();
        let mut data: Vec<i64> = (0..100_000).map(|_| rng.gen_range(0..1_000_000)).collect();
        let mut expected = data.clone();
        expected.sort_unstable();

        learned_sort_inplace(&mut data);
        assert_eq!(data, expected);
    }

    #[test]
    fn test_inplace_duplicates() {
        // All keys equal: exercises the min == max early return.
        let mut data: Vec<i64> = vec![5; 50_000];
        let expected = data.clone();
        learned_sort_inplace(&mut data);
        assert_eq!(data, expected);
    }

    #[test]
    fn test_inplace_many_duplicates() {
        let mut rng = rand::thread_rng();
        let mut data: Vec<i64> = (0..50_000).map(|_| rng.gen_range(0..10)).collect();
        let mut expected = data.clone();
        expected.sort_unstable();

        learned_sort_inplace(&mut data);
        assert_eq!(data, expected);
    }

    #[test]
    fn test_inplace_negative_numbers() {
        let mut rng = rand::thread_rng();
        let mut data: Vec<i64> = (0..50_000).map(|_| rng.gen_range(-500_000..500_000)).collect();
        let mut expected = data.clone();
        expected.sort_unstable();

        learned_sort_inplace(&mut data);
        assert_eq!(data, expected);
    }

    #[test]
    fn test_inplace_matches_regular() {
        let mut rng = rand::thread_rng();
        let original: Vec<i64> = (0..100_000).map(|_| rng.gen_range(0..1_000_000)).collect();

        let mut data_regular = original.clone();
        let mut data_inplace = original.clone();

        learned_sort(&mut data_regular);
        learned_sort_inplace(&mut data_inplace);

        assert_eq!(data_regular, data_inplace);
    }
}