// tulip_rs 0.1.15

//! SIMD-parallel state structs for the Ehlers TrendMode.
//!
//! Two sub-modules are provided for the two SIMD parallelism modes:
//!
//! - [`assets`] — `N` independent assets with the same α. Each lane has its own
//!   HD pipeline (`HdSimdState<N>`), its own CC pipeline, and its own price input.
//! - [`options`] — 1 asset with `N` different α values. HD is a single shared
//!   scalar state (all lanes see the same price); CC runs in SIMD with per-lane
//!   multipliers.
//!
//! ## Why two separate `SimdState` structs?
//!
//! The only structural difference is the `hd` field:
//! - `assets` needs `HdSimdState<N>` (N independent HD pipelines).
//! - `options` needs a scalar `homodynediscriminator::State` (one shared HD).
//!
//! ## Shared computation
//!
//! Everything after the CC step — peak envelope update and TrendMode classification
//! — is identical in both modes and is factored into the module-level
//! [`trendmode_pipeline`] function.
//!
//! `pk` is `Simd<f64, N>` so the peak update and mode detection are fully
//! vectorised with no per-lane scalar loop.

#[cfg(feature = "simd_assets")]
pub use crate::indicators::simd_indicators::by_asset::trendmode::indicator_by_assets;
#[cfg(feature = "simd_options")]
pub use crate::indicators::simd_indicators::by_option::trendmode::indicator_by_options;

use std::simd::{cmp::SimdPartialOrd, num::SimdFloat, Select, Simd};

/// Shared post-CC TrendMode pipeline used by both `assets` and `options` modes.
///
/// Updates the decaying peak envelope and returns the TrendMode classification
/// vector — all in SIMD, no per-lane loop.
///
/// Steps:
/// 1. `pk = max(pk × 0.991, |cycle|)`
/// 2. `trendmode = 1.0` if `pk > 0` and `|cycle| < 0.2 × pk`, else `0.0`
#[inline(always)]
fn trendmode_pipeline<const N: usize>(cycle: Simd<f64, N>, pk: &mut Simd<f64, N>) -> Simd<f64, N> {
    *pk = (*pk * Simd::splat(0.991)).simd_max(cycle.abs());
    let pk_positive = pk.simd_gt(Simd::splat(0.0));
    let small_cycle = cycle.abs().simd_lt(Simd::splat(0.2) * *pk);
    (pk_positive & small_cycle).select(Simd::splat(1.0_f64), Simd::splat(0.0_f64))
}

// ─────────────────────────────────────────────────────────────────────────────
// assets — N assets, same alpha
// ─────────────────────────────────────────────────────────────────────────────

/// SIMD state for `N` assets with the same α (used by `indicator_by_assets`).
pub mod assets {
    use super::trendmode_pipeline;
    use crate::indicators::simd_indicators::cybercycle_simd::SimdState as CcSimdState;
    use crate::indicators::simd_indicators::homodynediscriminator_simd::SimdState as HdSimdState;
    use crate::indicators::trendmode;
    use std::simd::{num::SimdFloat, Simd};

    /// Vectorised TrendMode state covering `N` assets that share one α.
    ///
    /// Every asset has its own price history, so `hd` and `cc` each carry one
    /// lane per asset. `pk` holds the per-lane peak envelope as a SIMD vector,
    /// which lets the peak update and mode detection run without a scalar loop.
    pub struct SimdState<const N: usize> {
        /// N independent HD pipelines — one per asset.
        pub hd: HdSimdState<N>,
        /// N independent CC pipelines — one per asset.
        pub cc: CcSimdState<N>,
        /// Per-asset decaying peak amplitude: `max(pk[1] × 0.991, |Cycle|)`.
        pub pk: Simd<f64, N>,
    }

    impl<const N: usize> SimdState<N> {
        /// Gathers `N` scalar [`trendmode::State`] references into a `SimdState`.
        ///
        /// Lane `j` of `pk` is read from `states[j].pk`; the HD and CC lanes are
        /// built by the respective SIMD state constructors from the `hd` / `cc`
        /// fields of `states`.
        ///
        /// # Panics
        ///
        /// Panics if `states` holds fewer than `N` entries.
        pub fn new(states: &mut [&mut trendmode::State]) -> Self {
            assert!(
                states.len() >= N,
                "assets::SimdState::new: expected at least {} states, got {}",
                N,
                states.len()
            );

            let pk = Simd::from_array(std::array::from_fn(|j| states[j].pk));
            let hd = {
                let refs: Vec<&mut _> = states.iter_mut().map(|s| &mut s.hd).collect();
                HdSimdState::new(&refs)
            };
            let cc = {
                let mut refs: Vec<&mut _> = states.iter_mut().map(|s| &mut s.cc).collect();
                CcSimdState::new(&mut refs)
            };
            Self { hd, cc, pk }
        }

        /// Scatters the SIMD state back into `N` scalar [`trendmode::State`] references.
        ///
        /// HD and CC lanes are written through their own `write_states`; lane `j`
        /// of `pk` is then stored into `states[j].pk`.
        ///
        /// # Panics
        ///
        /// Panics if `states` holds fewer than `N` entries. The length is checked
        /// before anything is written back.
        pub fn write_states(&self, states: &mut [&mut trendmode::State]) {
            // Validate up front so a short slice fails before any state is touched.
            assert!(
                states.len() >= N,
                "assets::SimdState::write_states: expected at least {} states, got {}",
                N,
                states.len()
            );

            {
                let mut refs: Vec<&mut _> = states.iter_mut().map(|s| &mut s.hd).collect();
                self.hd.write_states(&mut refs);
            }
            {
                let mut refs: Vec<&mut _> = states.iter_mut().map(|s| &mut s.cc).collect();
                self.cc.write_states(&mut refs);
            }
            // The array has exactly N entries, so the zip covers lanes 0..N.
            for (state, lane_pk) in states.iter_mut().zip(self.pk.to_array()) {
                state.pk = lane_pk;
            }
        }

        /// One bar of TrendMode for N assets simultaneously.
        ///
        /// Advances HD, then CC with the supplied `multipliers`, and feeds the CC
        /// output into [`trendmode_pipeline`] together with the stored peak
        /// envelope — no scalar loop.
        ///
        /// Returns `Simd<f64, N>` of `1.0` (Trend) / `0.0` (Cycle) per lane.
        ///
        /// # Safety
        ///
        /// All HD and CC ring buffers must be full. Guaranteed after
        /// [`trendmode::State::init_state`] for every lane.
        #[inline(always)]
        pub unsafe fn calc_simd_unchecked(
            &mut self,
            real: Simd<f64, N>,
            multipliers: (Simd<f64, N>, Simd<f64, N>, Simd<f64, N>),
        ) -> Simd<f64, N> {
            // SAFETY: forwarded from this function's contract — the caller
            // guarantees that all HD and CC ring buffers are full.
            let cycle = unsafe {
                self.hd.calc_simd_unchecked(real);
                self.cc.calc_simd_unchecked(real, multipliers)
            };
            trendmode_pipeline(cycle, &mut self.pk)
        }

        /// One bar of TrendMode for N assets using **adaptive alpha per lane**.
        ///
        /// HD runs first; its per-lane `smooth_period` is then clamped to at least
        /// `3` and turned into `alpha = 2 / (period + 1)`. With `c = 1 − 0.5·alpha`
        /// and `b = 1 − alpha`, the multipliers `(c², 2·b, b²)` are passed to CC,
        /// whose output goes through [`trendmode_pipeline`].
        ///
        /// Returns `Simd<f64, N>` of `1.0` (Trend) / `0.0` (Cycle) per lane.
        ///
        /// # Safety
        /// All HD and CC ring buffers must be full. Guaranteed after
        /// [`trendmode::State::init_state`] for every lane.
        #[inline(always)]
        pub unsafe fn calc_simd_unchecked_adaptive(&mut self, real: Simd<f64, N>) -> Simd<f64, N> {
            // SAFETY: forwarded from this function's contract — the caller
            // guarantees that all HD ring buffers are full.
            unsafe {
                self.hd.calc_simd_unchecked(real);
            }

            // `smooth_period` is read after the HD step, so it reflects this bar.
            let effective_period = self.hd.smooth_period.simd_max(Simd::splat(3.0_f64));
            let alpha = Simd::splat(2.0_f64) / (effective_period + Simd::splat(1.0_f64));
            let one = Simd::splat(1.0_f64);
            let c = one - Simd::splat(0.5_f64) * alpha;
            let b = one - alpha;
            let mults = (c * c, Simd::splat(2.0_f64) * b, b * b);

            // SAFETY: forwarded from this function's contract — the caller
            // guarantees that all CC ring buffers are full.
            let cycle = unsafe { self.cc.calc_simd_unchecked(real, mults) };
            trendmode_pipeline(cycle, &mut self.pk)
        }
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// options — 1 asset, N alpha values
// ─────────────────────────────────────────────────────────────────────────────

/// SIMD state for 1 asset with `N` different α values (used by `indicator_by_options`).
pub mod options {
    use super::trendmode_pipeline;
    use crate::indicators::homodynediscriminator;
    use crate::indicators::simd_indicators::cybercycle_simd::SimdState as CcSimdState;
    use crate::indicators::trendmode;
    use std::simd::Simd;

    /// Vectorised TrendMode state for one asset evaluated with `N` α values.
    ///
    /// All lanes process the same price series, so a single scalar HD state is
    /// shared between them. `cc` and `pk` stay per-lane, as in the assets case.
    pub struct SimdState<const N: usize> {
        /// Single shared HD state — same price input for all N lanes.
        pub hd: homodynediscriminator::State,
        /// N CC pipelines with per-lane α multipliers.
        pub cc: CcSimdState<N>,
        /// Per-lane decaying peak amplitude.
        pub pk: Simd<f64, N>,
    }

    impl<const N: usize> SimdState<N> {
        /// Gathers `N` scalar [`trendmode::State`] references into a `SimdState`.
        ///
        /// All N lanes are expected to have identical HD states (same price), so
        /// `states[0].hd` is cloned as the shared scalar HD. Lane `j` of `pk` is
        /// read from `states[j].pk`.
        ///
        /// # Panics
        ///
        /// Panics if `states` is empty or holds fewer than `N` entries.
        pub fn new(states: &mut [&mut trendmode::State]) -> Self {
            assert!(
                states.len() >= N,
                "options::SimdState::new: expected at least {} states, got {}",
                N,
                states.len()
            );

            let hd = states[0].hd.clone();
            let pk = Simd::from_array(std::array::from_fn(|j| states[j].pk));
            let cc = {
                let mut refs: Vec<&mut _> = states.iter_mut().map(|s| &mut s.cc).collect();
                CcSimdState::new(&mut refs)
            };
            Self { hd, cc, pk }
        }

        /// Scatters the SIMD state back into `N` scalar [`trendmode::State`] references.
        ///
        /// CC lanes are written through `cc.write_states`; each of the first `N`
        /// states then receives a clone of the shared HD state and its own `pk`
        /// lane.
        ///
        /// # Panics
        ///
        /// Panics if `states` holds fewer than `N` entries. The length is checked
        /// before anything is written back.
        pub fn write_states(&self, states: &mut [&mut trendmode::State]) {
            // Validate up front so a short slice fails before any state is touched.
            assert!(
                states.len() >= N,
                "options::SimdState::write_states: expected at least {} states, got {}",
                N,
                states.len()
            );

            {
                let mut refs: Vec<&mut _> = states.iter_mut().map(|s| &mut s.cc).collect();
                self.cc.write_states(&mut refs);
            }
            // The array has exactly N entries, so the zip covers lanes 0..N.
            for (state, lane_pk) in states.iter_mut().zip(self.pk.to_array()) {
                state.hd = self.hd.clone();
                state.pk = lane_pk;
            }
        }

        /// One bar of TrendMode for N α-option lanes simultaneously.
        ///
        /// HD advances once (shared price, via `real[0]`). CC runs in SIMD with
        /// per-lane multipliers. Post-CC via [`trendmode_pipeline`].
        ///
        /// Returns `Simd<f64, N>` of `1.0` (Trend) / `0.0` (Cycle) per lane.
        ///
        /// # Safety
        ///
        /// All HD and CC ring buffers must be full. Guaranteed after
        /// [`trendmode::State::init_state`] for every lane.
        #[inline(always)]
        pub unsafe fn calc_simd_unchecked(
            &mut self,
            real: Simd<f64, N>,
            multipliers: (Simd<f64, N>, Simd<f64, N>, Simd<f64, N>),
        ) -> Simd<f64, N> {
            // SAFETY: forwarded from this function's contract — the caller
            // guarantees that all HD and CC ring buffers are full.
            let cycle = unsafe {
                // All lanes share the same price — use lane 0 for the scalar HD.
                self.hd.calc_unchecked(real[0]);
                self.cc.calc_simd_unchecked(real, multipliers)
            };
            trendmode_pipeline(cycle, &mut self.pk)
        }

        /// Advances the shared scalar HD one bar and returns the updated `smooth_period`.
        ///
        /// Call this before [`Self::advance_cc`] when computing per-bar adaptive
        /// multipliers.
        ///
        /// # Safety
        /// All HD ring buffers must be full on entry.
        #[inline(always)]
        pub unsafe fn advance_hd(&mut self, price: f64) -> f64 {
            // SAFETY: forwarded from this function's contract — the caller
            // guarantees that all HD ring buffers are full.
            unsafe {
                self.hd.calc_unchecked(price);
            }
            self.hd.smooth_period
        }

        /// Advances CC and the peak pipeline for one bar with per-lane `multipliers`.
        ///
        /// Complements [`Self::advance_hd`]: the caller computes the per-lane SIMD
        /// multipliers (e.g. via adaptive mask+select) and passes them here. The
        /// HD state is not touched.
        ///
        /// Returns `Simd<f64, N>` of `1.0` (Trend) / `0.0` (Cycle) per lane.
        ///
        /// # Safety
        /// CC ring buffers must be full on entry.
        #[inline(always)]
        pub unsafe fn advance_cc(
            &mut self,
            real: Simd<f64, N>,
            multipliers: (Simd<f64, N>, Simd<f64, N>, Simd<f64, N>),
        ) -> Simd<f64, N> {
            // SAFETY: forwarded from this function's contract — the caller
            // guarantees that the CC ring buffers are full.
            let cycle = unsafe { self.cc.calc_simd_unchecked(real, multipliers) };
            trendmode_pipeline(cycle, &mut self.pk)
        }
    }
}