warp-types 0.3.2

//! Recursive Protocols: Session Types with Loops
//!
//! **STATUS: Research** — Exploratory prototype, not promoted to main API.
//!
//! This module explores how recursive (looping) protocols interact with
//! session-typed warp divergence.
//!
//! # The Challenge
//!
//! Traditional session types handle recursion with μ-types:
//! ```text
//! μX. send(data); recv(ack); X    // Infinite loop
//! μX. send(data); (recv(ack); X ⊕ done)  // Loop with exit
//! ```
//!
//! For warp sessions, recursion interacts with divergence:
//! ```text
//! μX. diverge(pred,
//!       left: shuffle; merge; X,
//!       right: skip; merge; X)
//! ```
//!
//! Key questions:
//! 1. Can the active set change across iterations?
//! 2. Must all lanes iterate the same number of times?
//! 3. How to type early exit (break)?
//!
//! # Patterns Explored
//!
//! 1. **Uniform iteration**: All lanes do N iterations (easy)
//! 2. **Convergent iteration**: Loop until condition (e.g., Newton-Raphson)
//! 3. **Reducing iteration**: Active set shrinks each iteration
//! 4. **Recursive diverge/merge**: Nested recursion with divergence

use std::marker::PhantomData;

// ============================================================================
// BACKGROUND: RECURSIVE SESSION TYPES
// ============================================================================

/// Traditional μ-types for session recursion.
///
/// In classical session types:
/// - `μX.P` defines a recursive protocol P that can refer to X
/// - `X` is the recursion variable (jump back to start)
/// - Unfolding: `μX.P ≡ P[μX.P/X]` (substitute definition for variable)
///
/// For GPU warps, we need to track how the active set evolves.
pub mod mu_types {
    use super::*;

    /// A protocol that can recur
    pub trait Protocol {
        /// The active set at protocol start
        type StartSet: ActiveSet;
        /// The active set at protocol end (before recursion)
        type EndSet: ActiveSet;
    }

    /// Recursion: μX.P
    ///
    /// For soundness, we require: P.EndSet == P.StartSet
    /// This ensures the loop can actually repeat.
    pub struct Mu<P: Protocol>(PhantomData<P>);

    impl<P: Protocol> Protocol for Mu<P>
    where
        P: Protocol<EndSet = <P as Protocol>::StartSet>, // Loop invariant
    {
        type StartSet = P::StartSet;
        type EndSet = P::EndSet;
    }

    /// End: protocol terminates
    pub struct End<S: ActiveSet>(PhantomData<S>);

    impl<S: ActiveSet> Protocol for End<S> {
        type StartSet = S;
        type EndSet = S;
    }

    /// Sequence: P1; P2
    pub struct Seq<P1: Protocol, P2: Protocol>(PhantomData<(P1, P2)>);

    impl<P1, P2> Protocol for Seq<P1, P2>
    where
        P1: Protocol,
        P2: Protocol<StartSet = P1::EndSet>,
    {
        type StartSet = P1::StartSet;
        type EndSet = P2::EndSet;
    }
}

// ============================================================================
// PATTERN 1: UNIFORM ITERATION
// ============================================================================

/// All lanes iterate exactly N times. No divergence concerns.
///
/// This is the easy case - equivalent to unrolling the loop.
/// Active set is constant throughout: Warp<S> -> Warp<S>
pub mod uniform_iteration {
    use super::*;

    /// A uniform loop that preserves active set
    pub struct UniformLoop<S: ActiveSet, const N: usize> {
        _marker: PhantomData<S>,
    }

    impl<S: ActiveSet, const N: usize> UniformLoop<S, N> {
        pub fn new() -> Self {
            UniformLoop {
                _marker: PhantomData,
            }
        }

        /// Execute body N times, preserving warp type
        pub fn execute<F>(self, mut warp: Warp<S>, mut body: F) -> Warp<S>
        where
            F: FnMut(&mut Warp<S>, usize),
        {
            for i in 0..N {
                body(&mut warp, i);
            }
            warp
        }
    }

    /// Type-level encoding: μX. body; X (N times)
    ///
    /// Protocol: Warp<S> -[body]-> Warp<S> -[recurse N]-> Warp<S>
    pub trait UniformLoopProtocol {
        type ActiveSet: ActiveSet;
        const ITERATIONS: usize;

        fn body(warp: &mut Warp<Self::ActiveSet>, iteration: usize);
    }

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn test_uniform_loop() {
            let loop_5: UniformLoop<All, 5> = UniformLoop::new();
            let warp: Warp<All> = Warp::new();

            let mut count = 0;
            let result = loop_5.execute(warp, |_w, _i| {
                count += 1;
            });

            assert_eq!(count, 5);
            let _: Warp<All> = result; // Type preserved
        }
    }
}

// ============================================================================
// PATTERN 2: CONVERGENT ITERATION
// ============================================================================

/// Loop until a warp-wide condition is met.
///
/// Example: Newton-Raphson iteration
/// ```text
/// while !warp.all(|lane| converged[lane]) {
///     x = x - f(x)/f'(x);  // Per-lane update
///     converged = |x - x_prev| < epsilon;
/// }
/// ```
///
/// Key insight: The loop body doesn't change the active set, but lanes
/// may "finish early" by not updating. The loop continues until ALL
/// active lanes satisfy the condition.
pub mod convergent_iteration {
    use super::*;

    /// A convergent loop: iterate until all lanes satisfy predicate
    pub struct ConvergentLoop<S: ActiveSet> {
        _marker: PhantomData<S>,
        max_iterations: usize,
    }

    impl<S: ActiveSet> ConvergentLoop<S> {
        pub fn new(max_iterations: usize) -> Self {
            ConvergentLoop {
                _marker: PhantomData,
                max_iterations,
            }
        }

        /// Execute until convergence or max iterations
        ///
        /// Returns: (final_warp, converged, iterations_used)
        pub fn execute<F, P>(
            self,
            warp: Warp<S>,
            mut body: F,
            mut converged: P,
        ) -> (Warp<S>, bool, usize)
        where
            F: FnMut(&Warp<S>),         // Loop body
            P: FnMut(&Warp<S>) -> bool, // All-lanes convergence check
        {
            for i in 0..self.max_iterations {
                if converged(&warp) {
                    return (warp, true, i);
                }
                body(&warp);
            }
            (warp, converged(&warp), self.max_iterations)
        }
    }

    /// Protocol: μX. body; (converged? End : X)
    ///
    /// This is a conditional recursion - the protocol either ends or repeats.
    /// Importantly, ALL lanes make the same choice (convergence is uniform).
    pub trait ConvergentProtocol {
        type ActiveSet: ActiveSet;

        /// Per-iteration body
        fn body(warp: &Warp<Self::ActiveSet>);

        /// Uniform convergence check (all lanes agree)
        fn converged(warp: &Warp<Self::ActiveSet>) -> bool;
    }

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn test_convergent_loop() {
            use std::cell::Cell;

            let conv_loop: ConvergentLoop<All> = ConvergentLoop::new(100);
            let warp: Warp<All> = Warp::new();

            let iteration = Cell::new(0);
            let (result, converged, iters) = conv_loop.execute(
                warp,
                |_w| {
                    iteration.set(iteration.get() + 1);
                },
                |_w| iteration.get() >= 5, // Converge after 5 iterations
            );

            assert!(converged);
            assert_eq!(iters, 5);
            let _: Warp<All> = result;
        }
    }
}

// ============================================================================
// PATTERN 3: REDUCING ITERATION (The Hard Case)
// ============================================================================

/// Loop where the active set shrinks each iteration.
///
/// Example: Processing variable-length lists per lane
/// ```text
/// while warp.any(|lane| !done[lane]) {
///     // Only non-done lanes execute this
///     process_next_item();
///     done = check_if_done();
/// }
/// ```
///
/// This is fundamentally different from convergent iteration:
/// - Convergent: All lanes do same work, exit together
/// - Reducing: Each lane does different amount of work, exits independently
///
/// The challenge: We CAN'T track the active set statically because it
/// depends on runtime data.
pub mod reducing_iteration {
    use super::*;

    /// A reducing loop: active set shrinks until empty
    ///
    /// Type signature: Warp<S> -> Warp<S> (reconverges after all done)
    ///
    /// But DURING the loop, we can't know the active set.
    /// Solution: Body doesn't get a typed Warp, just per-lane access.
    pub struct ReducingLoop<S: ActiveSet> {
        _marker: PhantomData<S>,
        max_iterations: usize,
    }

    impl<S: ActiveSet> ReducingLoop<S> {
        pub fn new(max_iterations: usize) -> Self {
            ReducingLoop {
                _marker: PhantomData,
                max_iterations,
            }
        }

        /// Execute with reducing active set
        ///
        /// Key: Body takes (lane_id, iteration) but NOT a Warp.
        /// This prevents warp operations in the body.
        pub fn execute<F, P>(self, warp: Warp<S>, mut body: F, mut any_active: P) -> Warp<S>
        where
            F: FnMut(u32, usize), // (lane_id, iteration)
            P: FnMut() -> bool,   // Any lanes still active?
        {
            for i in 0..self.max_iterations {
                if !any_active() {
                    break;
                }
                // In real GPU: only active lanes execute body
                body(0, i); // Simulated for lane 0
            }
            warp // All lanes reconverge (hardware guarantees)
        }
    }

    /// Alternative: Phased reducing loop
    ///
    /// Split into two phases:
    /// 1. Warp phase: All active lanes shuffle/communicate
    /// 2. Per-lane phase: Individual processing
    ///
    /// Each iteration does: warp_ops -> per_lane_ops -> check_done
    pub struct PhasedReducingLoop<S: ActiveSet> {
        _marker: PhantomData<S>,
    }

    impl<S: ActiveSet> PhasedReducingLoop<S> {
        /// Type: Warp<S> for warp_phase, nothing for per_lane_phase
        pub fn execute<W, L, P>(
            warp: Warp<S>,
            mut warp_phase: W,
            mut lane_phase: L,
            mut done: P,
            max_iters: usize,
        ) -> Warp<S>
        where
            W: FnMut(&Warp<S>), // Has warp access
            L: FnMut(u32),      // Per-lane only
            P: FnMut() -> bool, // All done?
        {
            for _ in 0..max_iters {
                warp_phase(&warp); // Warp ops allowed here
                lane_phase(0); // Per-lane only
                if done() {
                    break;
                }
            }
            warp
        }
    }
}

// ============================================================================
// PATTERN 4: RECURSIVE DIVERGE/MERGE
// ============================================================================

/// Recursion with divergence inside the loop.
///
/// Example: Tree traversal
/// ```text
/// μX. diverge(has_left_child,
///       left: visit(left_child); X,   // Recurse on left subtree
///       right: skip)
///     merge;
///     diverge(has_right_child,
///       left: visit(right_child); X,  // Recurse on right subtree
///       right: skip)
///     merge
/// ```
///
/// This is complex because:
/// 1. Recursion depth varies per lane
/// 2. Divergence creates sub-warps
/// 3. Must ensure proper merge at each level
pub mod recursive_diverge {
    use super::*;

    /// Recursive tree protocol
    ///
    /// Each iteration may diverge, recurse, and merge.
    /// Key invariant: Each diverge has a matching merge BEFORE recursion.
    pub trait RecursiveTreeProtocol {
        /// Process current node
        fn visit_node(warp: &Warp<All>, depth: usize);

        /// Check if lane has left child
        fn has_left(lane: u32) -> bool;

        /// Check if lane has right child
        fn has_right(lane: u32) -> bool;
    }

    /// Bounded recursive traversal
    ///
    /// We bound the recursion depth to make it tractable.
    /// Type: Warp<All> -> Warp<All>
    pub fn bounded_tree_traversal<P: RecursiveTreeProtocol>(
        warp: Warp<All>,
        max_depth: usize,
    ) -> Warp<All> {
        fn go<P: RecursiveTreeProtocol>(warp: Warp<All>, depth: usize, max: usize) -> Warp<All> {
            if depth >= max {
                return warp;
            }

            P::visit_node(&warp, depth);

            // In real implementation:
            // 1. diverge by has_left -> (left_warp, no_left_warp)
            // 2. left_warp recurses
            // 3. merge
            // 4. diverge by has_right -> (right_warp, no_right_warp)
            // 5. right_warp recurses
            // 6. merge

            // Simplified: just recurse uniformly
            go::<P>(warp, depth + 1, max)
        }

        go::<P>(warp, 0, max_depth)
    }

    /// The key insight for recursive diverge:
    ///
    /// Diverge/merge must be BALANCED within each recursive call.
    /// You cannot:
    /// - Diverge in one iteration, merge in the next
    /// - Leave a diverge unmatched across recursion boundary
    ///
    /// Valid: μX. diverge; body; merge; X
    /// Invalid: μX. diverge; X; merge  (merge outside recursion)
    pub struct BalancedRecursion;
}

// ============================================================================
// PATTERN 5: FOLD/UNFOLD RECURSION
// ============================================================================

/// Type-safe recursion via explicit fold/unfold.
///
/// Instead of μX.P, we use:
/// - `fold`: wrap a protocol as recursive
/// - `unfold`: expose one iteration
///
/// This makes the recursion structure explicit in the type.
pub mod fold_unfold {
    use super::*;

    /// A recursive protocol wrapper
    pub struct Rec<P>(PhantomData<P>);

    /// One unfolding of a recursive protocol
    pub struct Unfolded<P>(PhantomData<P>);

    /// Body of a recursive protocol
    pub trait RecBody {
        type ActiveSet: ActiveSet;

        /// The protocol body, parameterized by the recursion point
        /// F represents "continue to next iteration"
        fn body<F: FnOnce(Warp<Self::ActiveSet>) -> Warp<Self::ActiveSet>>(
            warp: Warp<Self::ActiveSet>,
            recurse: F,
        ) -> Warp<Self::ActiveSet>;
    }

    /// Execute a recursive protocol with bounded unfolding
    pub fn execute_rec<P: RecBody>(
        warp: Warp<P::ActiveSet>,
        max_unfolds: usize,
    ) -> Warp<P::ActiveSet> {
        fn go<P: RecBody>(warp: Warp<P::ActiveSet>, remaining: usize) -> Warp<P::ActiveSet> {
            if remaining == 0 {
                warp // Base case: stop recursing
            } else {
                P::body(warp, |w| go::<P>(w, remaining - 1))
            }
        }

        go::<P>(warp, max_unfolds)
    }

    #[cfg(test)]
    mod tests {
        use super::*;

        struct CountingProtocol;

        impl RecBody for CountingProtocol {
            type ActiveSet = All;

            fn body<F>(warp: Warp<All>, recurse: F) -> Warp<All>
            where
                F: FnOnce(Warp<All>) -> Warp<All>,
            {
                // Do some work, then recurse
                recurse(warp)
            }
        }

        #[test]
        fn test_fold_unfold() {
            let warp: Warp<All> = Warp::new();
            let result = execute_rec::<CountingProtocol>(warp, 10);
            let _: Warp<All> = result; // Type preserved
        }
    }
}

// ============================================================================
// DECIDABILITY ANALYSIS
// ============================================================================

/// Is the protocol encoding decidable?
///
/// For general recursive session types: UNDECIDABLE
/// - Equivalence of recursive types is undecidable in general
/// - Subtyping with recursion can be undecidable
///
/// For our restricted system: DECIDABLE (with restrictions)
///
/// Restrictions that ensure decidability:
/// 1. **Equi-recursive with structural equality**: μX.P = P[μX.P/X]
/// 2. **Finite active set lattice**: Only finitely many possible sets
/// 3. **Bounded recursion depth**: Max unfoldings specified
/// 4. **No recursive types in active set**: S doesn't depend on X
pub mod decidability {
    /// Conditions for decidable protocol checking:
    ///
    /// 1. Active set must be invariant across loop iterations
    ///    - OK: μX. shuffle; X  (Warp<All> -> Warp<All> -> ...)
    ///    - BAD: μX. diverge(p); X  (active set shrinks each iteration)
    ///
    /// 2. Diverge must have matching merge WITHIN each iteration
    ///    - OK: μX. diverge; body; merge; X
    ///    - BAD: μX. diverge; X; merge
    ///
    /// 3. Recursion variable only at tail position
    ///    - OK: μX. body; X
    ///    - BAD: μX. X; body  (infinite prefix)
    pub struct DecidabilityConditions;

    /// Given these restrictions, protocol equivalence reduces to:
    /// - Structural equality of protocol terms
    /// - Active set equality (decidable: finite lattice)
    /// - No need for coinductive reasoning
    pub fn protocol_equivalent<P1, P2>() -> bool
    where
        P1: super::mu_types::Protocol + 'static,
        P2: super::mu_types::Protocol + 'static,
    {
        // Check structural equality of protocol representations
        // This is a compile-time check via trait bounds
        std::any::TypeId::of::<P1>() == std::any::TypeId::of::<P2>()
    }
}

// ============================================================================
// COMPARISON: APPROACHES TO RECURSIVE WARP PROTOCOLS
// ============================================================================

/// Summary of approaches:
///
/// | Pattern | Active Set | Warp Ops in Body | Decidable |
/// |---------|------------|------------------|-----------|
/// | Uniform | Constant   | Yes              | Yes       |
/// | Convergent | Constant | Yes            | Yes       |
/// | Reducing | Shrinks   | No (restricted)  | Yes*      |
/// | Recursive Diverge | Varies | Yes (balanced) | Yes* |
/// | Fold/Unfold | Explicit | Yes           | Yes       |
///
/// *With bounded unfolding
///
/// Key insight: The challenge isn't recursion itself, but how active
/// sets evolve. If we maintain invariants (balanced diverge/merge,
/// constant active set across iterations), the system is tractable.
pub mod summary {
    /// Recommendation for GPU programmers:
    ///
    /// 1. Prefer UNIFORM loops when possible (all lanes same iterations)
    /// 2. Use CONVERGENT loops for iterative algorithms (Newton, etc.)
    /// 3. Use REDUCING loops with restricted body for variable work
    /// 4. Avoid RECURSIVE DIVERGE unless necessary (complex, error-prone)
    /// 5. Use FOLD/UNFOLD for explicit control over recursion structure
    pub struct Recommendations;
}

// ============================================================================
// INTEGRATION WITH SESSION-TYPED LANGUAGES
// ============================================================================

/// How recursive protocols could integrate with a session-typed language:
///
/// A language with μ-types for session recursion could extend them as follows:
///
/// 1. Extend session types with ActiveSet parameter
///    `session WarpProto<S: ActiveSet> = μX. shuffle<S>; X`
///
/// 2. Add warp primitives as session operations
///    `shuffle<All>`, `diverge<S, P>`, `merge<S1, S2>`
///
/// 3. Enforce active set invariants in type checker
///    - diverge/merge balance
///    - shuffle requires All
///    - recursion preserves active set
///
/// 4. Compile to GPU code with correct masking
pub mod language_integration {
    /// Example session-typed syntax (hypothetical):
    /// ```text
    /// session ButterflySum<S: ActiveSet> =
    ///   μX. shuffle_xor<1, S>;
    ///       shuffle_xor<2, S>;
    ///       shuffle_xor<4, S>;
    ///       shuffle_xor<8, S>;
    ///       shuffle_xor<16, S>;
    ///       end
    ///
    /// // Recursive version (bounded by warp size)
    /// session ButterflySum<S: ActiveSet, const N: u32> =
    ///   if N == 0 then end
    ///   else shuffle_xor<N, S>; ButterflySum<S, N/2>
    /// ```
    pub struct Example;
}

// ============================================================================
// ACTIVE SET TRAIT (for this module)
// ============================================================================

pub trait ActiveSet: Copy + Clone + 'static {
    const MASK: u32;
}

#[derive(Copy, Clone)]
pub struct All;
impl ActiveSet for All {
    const MASK: u32 = 0xFFFFFFFF;
}

#[derive(Copy, Clone)]
pub struct Even;
impl ActiveSet for Even {
    const MASK: u32 = 0x55555555;
}

#[derive(Copy, Clone)]
pub struct Warp<S: ActiveSet> {
    _marker: PhantomData<S>,
}

impl<S: ActiveSet> Warp<S> {
    pub fn new() -> Self {
        Warp {
            _marker: PhantomData,
        }
    }
}

// ============================================================================
// TESTS
// ============================================================================

#[cfg(test)]
mod tests {
    use super::convergent_iteration::*;
    use super::reducing_iteration::*;
    use super::uniform_iteration::*;
    use super::*;

    #[test]
    fn test_uniform_preserves_type() {
        let warp: Warp<All> = Warp::new();
        let loop_10: UniformLoop<All, 10> = UniformLoop::new();
        let result: Warp<All> = loop_10.execute(warp, |_w, _i| {});
        let _ = result;
    }

    #[test]
    fn test_convergent_preserves_type() {
        use std::cell::Cell;

        let warp: Warp<All> = Warp::new();
        let conv: ConvergentLoop<All> = ConvergentLoop::new(100);
        let i = Cell::new(0);
        let (result, _, _): (Warp<All>, _, _) = conv.execute(
            warp,
            |_w| {
                i.set(i.get() + 1);
            },
            |_w| i.get() >= 10,
        );
        let _ = result;
    }

    #[test]
    fn test_reducing_preserves_type() {
        use std::cell::Cell;

        let warp: Warp<All> = Warp::new();
        let red: ReducingLoop<All> = ReducingLoop::new(100);
        let count = Cell::new(0);
        let result: Warp<All> = red.execute(
            warp,
            |_lane, _iter| {
                count.set(count.get() + 1);
            },
            || count.get() < 10,
        );
        let _ = result;
    }

    #[test]
    fn test_protocol_types_compile() {
        // This test verifies the protocol type system compiles
        use mu_types::*;

        // μX. End<All>
        type SimpleRec = Mu<End<All>>;

        // Verify it implements Protocol
        fn check_protocol<P: Protocol>() {}
        check_protocol::<SimpleRec>();
    }

    #[test]
    fn test_bounded_recursion() {
        struct DummyTree;
        impl recursive_diverge::RecursiveTreeProtocol for DummyTree {
            fn visit_node(_warp: &Warp<All>, _depth: usize) {}
            fn has_left(_lane: u32) -> bool {
                false
            }
            fn has_right(_lane: u32) -> bool {
                false
            }
        }

        let warp: Warp<All> = Warp::new();
        let result = recursive_diverge::bounded_tree_traversal::<DummyTree>(warp, 5);
        let _: Warp<All> = result;
    }
}

// ============================================================================
// RESEARCH QUESTIONS ANSWERED
// ============================================================================

// Q: How to handle loops (recursive protocols)?
//
// A: Multiple patterns depending on how active set evolves:
//
// 1. UNIFORM: Active set constant, full warp ops allowed
//    Type: Warp<S> -> Warp<S>, decidable
//
// 2. CONVERGENT: Active set constant, exit when all lanes agree
//    Type: Warp<S> -> Warp<S>, decidable
//
// 3. REDUCING: Active set shrinks, restrict body to per-lane ops
//    Type: Warp<S> -> Warp<S> (reconverge), decidable with restrictions
//
// 4. RECURSIVE DIVERGE: Must balance diverge/merge within each iteration
//    Type: Complex, decidable with structural restrictions
//
// 5. FOLD/UNFOLD: Explicit recursion structure, bounded unfolding
//    Type: Explicit, decidable by construction
//
// Key insight: Decidability requires either:
// - Constant active set across iterations, OR
// - Restricted operations in body (no warp ops), OR
// - Bounded unfolding (max iterations specified)
//
// The varying_loops.rs module already implements the REDUCING pattern.
// This module adds formal session-type framing and other patterns.