warp-types 0.3.2

//! Prototype: How to handle loops with varying trip counts?
//!
//! **STATUS: Validated** — Research exploration complete. See conclusions below.
//!
//! Q6: "How to handle loops (recursive protocols)?"
//!
//! THE HARD PROBLEM:
//! ```ignore
//! for i in 0..data[lane_id] {  // Each lane has different trip count!
//!     process(i);
//! }
//! // After loop: all lanes reconverge
//! ```
//!
//! Lane 0 might do 5 iterations, lane 1 might do 7. This creates:
//! 1. Data-dependent divergence (trip count depends on runtime data)
//! 2. Temporal divergence (lanes are "out of sync" during loop)
//! 3. Implicit reconvergence (all lanes continue after their loops complete)
//!
//! The active set SHRINKS during the loop:
//! - Iteration 0: All 32 lanes active
//! - Iteration 5: Maybe 20 lanes still active
//! - Iteration 10: Maybe 3 lanes still active
//! - After loop: All 32 lanes active again
//!
//! We CAN'T know statically which lanes are active at iteration N.
//! This is fundamentally dynamic - depends on runtime data.

use std::marker::PhantomData;

// Import from core modules (reorganized from static_verify)
use crate::active_set::All;
use crate::data::PerLane;
use crate::warp::Warp;

// ============================================================================
// APPROACH A: FORBID WARP OPS IN VARYING LOOPS (Practical)
// ============================================================================
//
// Key insight: The problem isn't the loop itself - it's what you DO inside.
// If the loop body doesn't use warp operations, divergence doesn't matter.
//
// Solution: Loop body gets no warp access. After loop, warp is restored.

pub mod forbid_warp_ops {
    use super::*;

    /// Execute a varying-trip-count loop.
    ///
    /// **Type signature encodes the contract:**
    /// - Input: `Warp<All>` (all lanes active)
    /// - Output: `Warp<All>` (all lanes reconverge)
    /// - Body: `Fn(lane_id, iteration)` - NO warp parameter!
    ///
    /// The body cannot do warp operations because it doesn't have the warp.
    /// This is enforced by the type system - you can't conjure a Warp.
    pub fn varying_loop<F>(warp: Warp<All>, trip_counts: PerLane<u32>, mut body: F) -> Warp<All>
    where
        F: FnMut(u32, u32), // (lane_id, iteration) -> ()
    {
        // In real implementation:
        // - Each lane loops trip_counts[lane_id] times
        // - Hardware masks out finished lanes
        // - All lanes reconverge after loop

        // Simulate for lane 0
        for i in 0..trip_counts.get() {
            body(0, i);
        }

        warp // Returns Warp<All> - reconvergence guaranteed by hardware
    }

    /// What CAN the loop body do?
    /// - Per-lane computation (no warp ops needed)
    /// - Memory access (each lane accesses its own data)
    /// - Accumulate into per-lane variable
    ///
    /// What CAN'T the loop body do?
    /// - Shuffle (needs all lanes)
    /// - Reduce (needs all lanes)
    /// - Ballot (needs all lanes)
    /// - Sync (could deadlock)

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn test_varying_loop_compiles() {
            let warp: Warp<All> = Warp::new();
            let trips = PerLane::new(10u32); // Each lane does 10 iterations

            // Loop body has no warp access - can only do per-lane work
            let warp_after = varying_loop(warp, trips, |_lane, iter| {
                // Per-lane computation
                let _ = iter * 2;
            });

            // After loop, we have Warp<All> back
            assert_eq!(warp_after.active_set_name(), "All");
        }

        // This would NOT compile - body can't access warp:
        // varying_loop(warp, trips, |lane, iter| {
        //     warp.shuffle_xor(...);  // ERROR: warp not in scope!
        // });
    }
}

// ============================================================================
// APPROACH B: UNIFORM LOOPS WITH MASKING (Common Pattern)
// ============================================================================
//
// Instead of varying trip counts, use max(trip_counts) with predication.
// All lanes execute the same number of iterations, but inactive lanes no-op.

pub mod uniform_with_mask {
    use super::*;

    /// Execute a uniform loop where some lanes may be "done" early.
    ///
    /// All lanes execute `max_iters` iterations, but each lane has a
    /// `done_at` threshold. After their threshold, they execute no-ops.
    ///
    /// **Warp stays All throughout** - no divergence at type level!
    pub fn uniform_loop<F>(
        warp: Warp<All>,
        max_iters: u32,
        done_at: PerLane<u32>, // Each lane's actual trip count
        mut body: F,
    ) -> Warp<All>
    where
        F: FnMut(&Warp<All>, u32, bool), // (warp, iteration, is_active)
    {
        for i in 0..max_iters {
            // is_active = (i < done_at[lane_id])
            let is_active = i < done_at.get(); // Simplified for lane 0
            body(&warp, i, is_active);
        }
        warp
    }

    /// With uniform loops, we CAN do warp ops - but must handle inactive lanes.
    ///
    /// Pattern: Use ballot to find active lanes, then conditional shuffle.
    pub fn example_with_warp_ops(warp: Warp<All>) {
        let done_at = PerLane::new(5u32);

        let _ = uniform_loop(warp, 10, done_at, |w, iter, is_active| {
            if is_active {
                // Active lanes can contribute to warp ops
                // But must be careful - shuffle reads from ALL lanes
                let _ = w.broadcast(iter);
            }
            // Inactive lanes skip but warp ops still work
        });
    }

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn test_uniform_loop() {
            let warp: Warp<All> = Warp::new();
            let done_at = PerLane::new(3u32);
            let mut count = 0;

            let _ = uniform_loop(warp, 5, done_at, |_w, _iter, is_active| {
                if is_active {
                    count += 1;
                }
            });

            assert_eq!(count, 3); // Only 3 "active" iterations
        }
    }
}

// ============================================================================
// APPROACH C: PHASED LOOPS (Uniform + Cleanup)
// ============================================================================
//
// Split the loop into phases:
// 1. Uniform phase: All lanes do min(trip_counts) iterations together
// 2. Cleanup phase: Remaining lanes continue (with varying active set)
//
// This maximizes warp-efficient execution while handling variation.

pub mod phased_loop {
    use super::*;

    /// A phased loop with uniform and varying parts.
    ///
    /// Phase 1: All lanes together (warp ops OK)
    /// Phase 2: Stragglers continue (no warp ops)
    pub fn phased_loop<F1, F2>(
        warp: Warp<All>,
        trip_counts: PerLane<u32>,
        min_trips: u32, // min(trip_counts) - must be computed beforehand!
        mut uniform_body: F1,
        mut cleanup_body: F2,
    ) -> Warp<All>
    where
        F1: FnMut(&Warp<All>, u32), // Warp available in uniform phase
        F2: FnMut(u32, u32),        // No warp in cleanup phase
    {
        // Phase 1: Uniform - all lanes together
        for i in 0..min_trips {
            uniform_body(&warp, i);
        }

        // Phase 2: Cleanup - remaining iterations per lane
        // Each lane does (trip_counts[lane] - min_trips) more iterations
        let remaining = trip_counts.get().saturating_sub(min_trips);
        for i in 0..remaining {
            cleanup_body(0, min_trips + i);
        }

        warp
    }

    /// Computing min_trips requires a reduction - which needs Warp<All>!
    /// This is the "setup" before the phased loop.
    pub fn compute_min_trips(_warp: &Warp<All>, trip_counts: PerLane<u32>) -> u32 {
        // In real impl: warp.reduce_min(trip_counts)
        trip_counts.get() // Placeholder
    }

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn test_phased_loop() {
            let warp: Warp<All> = Warp::new();
            let trips = PerLane::new(10u32);
            let min_trips = 5;

            let mut uniform_count = 0;
            let mut cleanup_count = 0;

            let _ = phased_loop(
                warp,
                trips,
                min_trips,
                |_w, _i| {
                    uniform_count += 1;
                },
                |_lane, _i| {
                    cleanup_count += 1;
                },
            );

            assert_eq!(uniform_count, 5); // min_trips iterations
            assert_eq!(cleanup_count, 5); // remaining iterations
        }
    }
}

// ============================================================================
// APPROACH D: WORK REDISTRIBUTION (Algorithmic Transform)
// ============================================================================
//
// Instead of each lane looping independently, redistribute work.
// Transform varying-trip-count loop into uniform collective operation.
//
// This is the "GPU-efficient" approach used in real parallel algorithms.

pub mod work_redistribution {
    use super::*;

    /// Redistribute varying workloads into uniform parallel execution.
    ///
    /// Input: Each lane has N[lane] items to process
    /// Output: Process all items collectively, work distributed across lanes
    ///
    /// Key insight: Warp<All> THROUGHOUT - no divergence at type level.
    pub fn redistribute_work<T: Copy + Default, F>(
        warp: &Warp<All>,
        items_per_lane: PerLane<u32>,
        get_item: impl Fn(u32, u32) -> T, // (lane, index) -> item
        process: F,
    ) where
        F: Fn(T),
    {
        // Step 1: Compute total work (all lanes contribute)
        let _total_items = warp.reduce_sum(items_per_lane);

        // Step 2: Compute prefix sum for work distribution
        // prefix[lane] = sum of items_per_lane[0..lane]
        // This tells each lane where its items start in global order

        // Step 3: Parallel iteration over total_items
        // Each iteration, ONE lane owns the item; it processes, others wait
        //
        // In real GPU code, this is done with ballot + ffs to find owner

        // Simplified: just process lane 0's items as example
        for i in 0..items_per_lane.get() {
            let item = get_item(0, i);
            process(item);
        }
    }

    /// Example: Process variable-length arrays in each lane
    pub fn example_variable_arrays(warp: &Warp<All>) {
        let array_lengths = PerLane::new(5u32);

        redistribute_work(
            warp,
            array_lengths,
            |lane, idx| (lane, idx), // Item is (lane_id, index)
            |item| {
                let (_lane, _idx) = item;
                // Process item - all lanes participate
            },
        );
    }
}

// ============================================================================
// APPROACH E: EFFECT SYSTEM (Theoretical)
// ============================================================================
//
// Track "varying divergence" as an effect, not a type parameter.
//
// fn varying_loop() -> impl VaryingDivergence
//
// Operations requiring uniform execution reject VaryingDivergence effect.

pub mod effect_system {
    use super::*;

    /// Effect marker: code may have varying divergence
    pub trait MayDiverge {}

    /// Effect marker: code is uniform (all lanes in sync)
    pub trait Uniform {}

    /// A computation with tracked effects
    pub struct Computation<E, T> {
        _effect: PhantomData<E>,
        value: T,
    }

    #[derive(Copy, Clone)]
    pub struct UniformEffect;
    impl Uniform for UniformEffect {}

    #[derive(Copy, Clone)]
    pub struct DivergentEffect;
    impl MayDiverge for DivergentEffect {}

    // Shuffle only works with Uniform effect
    pub fn shuffle<T: Copy>(
        _warp: &Warp<All>,
        data: Computation<UniformEffect, T>,
    ) -> Computation<UniformEffect, T> {
        // Placeholder: on CPU, shuffle is a no-op (return same value)
        Computation {
            _effect: PhantomData,
            value: data.value,
        }
    }

    // Varying loop produces Divergent effect
    pub fn varying_loop<T>(body: impl Fn() -> T) -> Computation<DivergentEffect, T> {
        // Placeholder: on CPU, execute body once (no actual divergence)
        Computation {
            _effect: PhantomData,
            value: body(),
        }
    }

    // Can't compose: shuffle(varying_loop(...)) is a type error
    // because DivergentEffect doesn't implement Uniform

    // To use shuffle after loop, must "synchronize" which consumes
    // DivergentEffect and produces UniformEffect:
    pub fn synchronize<T>(comp: Computation<DivergentEffect, T>) -> Computation<UniformEffect, T> {
        // Placeholder: on CPU, synchronize is a no-op barrier (value passes through)
        Computation {
            _effect: PhantomData,
            value: comp.value,
        }
    }
}

// ============================================================================
// APPROACH F: SESSION TYPE WITH RECURSION (Theoretical)
// ============================================================================
//
// Model varying loop as recursive session type with bounded recursion.
//
// type VaryingLoop<N> =
//   if lane_active then (Body; VaryingLoop<N-1>)
//   else Wait<VaryingLoop<0>>
//
// This requires dependent types to track N per-lane.

pub mod recursive_session {
    // This is beyond what we can express in Rust.
    // Would need full dependent types to track iteration count per lane.
    //
    // Key insight: the TYPE of iteration N depends on RUNTIME data
    // (which lanes are still active). This is fundamentally dynamic.
    //
    // Possible approaches in a research language:
    //
    // 1. Existential quantification:
    //    ∃S ⊆ All. Warp<S>  -- "some subset of lanes"
    //    Problem: Can't recover All after loop without proof obligation
    //
    // 2. Refinement types:
    //    Warp<{s : LaneSet | s ⊆ active_at(iter)}
    //    Problem: active_at depends on runtime data
    //
    // 3. Temporal session types:
    //    Eventually<Warp<All>> -- "will eventually have all lanes"
    //    Problem: Need to verify termination
    //
    // 4. Linear temporal logic in types:
    //    □(in_loop → ◇reconverged) -- "always, if in loop, eventually reconverge"
    //    Problem: Very complex type system
    //
    // CONCLUSION: True varying-trip-count loops with full type tracking
    // require research-level type systems. Practical approaches must
    // either restrict expressiveness or accept some dynamic checking.
}

// ============================================================================
// SUMMARY: PRACTICAL RECOMMENDATIONS
// ============================================================================
//
// For practical use, we recommend a LAYERED approach:
//
// LAYER 1: Uniform loops (Approach B)
// - Same trip count for all lanes (computed via reduce_min beforehand)
// - Full warp ops available
// - Type: Warp<All> -> Warp<All>
// - Covers: Most GPU-efficient algorithms already use this pattern
//
// LAYER 2: Varying loops without warp ops (Approach A)
// - Different trip counts per lane
// - No warp ops in loop body
// - Type: Warp<All> -> Warp<All> with restricted body
// - Covers: Per-lane accumulation, memory traversal
//
// LAYER 3: Phased loops (Approach C)
// - Uniform phase with warp ops + cleanup phase without
// - Best of both worlds
// - Type: Warp<All> -> Warp<All> with two bodies
// - Covers: Real-world varying-length algorithms
//
// LAYER 4: Work redistribution (Approach D)
// - Algorithmic transformation to stay uniform
// - Warp<All> throughout
// - Covers: Load-balanced parallel algorithms
//
// The key insight: We don't need to TYPE the per-iteration active set.
// We just need to:
// 1. Know the loop STARTS with All
// 2. Restrict what happens INSIDE based on pattern
// 3. Know the loop ENDS with All (hardware guarantees reconvergence)
//
// This is achievable with marker types and careful API design.
// Full dependent types would be elegant but aren't necessary.

#[cfg(test)]
mod integration_tests {
    use super::forbid_warp_ops::varying_loop;
    use super::phased_loop::phased_loop;
    use super::uniform_with_mask::uniform_loop;
    use super::*;

    #[test]
    fn test_all_approaches_preserve_warp_all() {
        let warp: Warp<All> = Warp::new();

        // Approach A: varying loop
        let warp = varying_loop(warp, PerLane::new(5), |_, _| {});
        assert_eq!(warp.active_set_name(), "All");

        // Approach B: uniform with mask
        let warp = uniform_loop(warp, 5, PerLane::new(3), |_, _, _| {});
        assert_eq!(warp.active_set_name(), "All");

        // Approach C: phased loop
        let warp = phased_loop(warp, PerLane::new(10), 5, |_, _| {}, |_, _| {});
        assert_eq!(warp.active_set_name(), "All");

        // All approaches: Warp<All> -> Warp<All>
        // The TYPE SYSTEM verifies this!
    }
}