opendeviationbar-core 13.75.1

//! Tier 3 inter-bar features: Kaufman ER, Hurst R/S, Permutation entropy, Approximate entropy
//!
//! Advanced features requiring more computation.

use super::EntropyCache;
use crate::interbar_types::TradeSnapshot;
use crate::normalization_lut::soft_clamp_hurst_lut;
use libm;
use opendeviationbar_hurst;
use smallvec::SmallVec;
use wide::f64x2;

/// Compute the Kaufman Efficiency Ratio (ER) of a price series.
///
/// ER = |last - first| / Σ |p[i] - p[i-1]|, i.e. the net price movement
/// divided by the total path length travelled.
///
/// Returns:
/// - `0.0` when fewer than two prices are supplied,
/// - `0.0` when the path length does not exceed `f64::EPSILON` (flat series),
/// - otherwise the ratio, capped at `1.0`.
///
/// The cap exists because the numerator and denominator are rounded
/// independently: on a monotone series the mathematically exact ratio is 1,
/// but the floating-point quotient can land a few ULPs above it.
pub fn compute_kaufman_er(prices: &[f64]) -> f64 {
    if prices.len() < 2 {
        return 0.0;
    }

    // Issue #96 Task #210: Memoize first/last element access (Kaufman ER)
    let n = prices.len();
    let net_movement = (prices[n - 1] - prices[0]).abs();

    // Issue #96 Task #169: accumulate the path length in four independent
    // lanes (step k of each group of four goes to lane k). The lanes are
    // plain scalars so the optimizer is free to vectorize them; the
    // summation order is the same as the earlier explicit 4-wide version,
    // so results are bit-for-bit unchanged.
    let steps = n - 1;
    let bulk_steps = steps - steps % 4;
    let mut lanes = [0.0_f64; 4];
    for base in (0..bulk_steps).step_by(4) {
        for (offset, lane) in lanes.iter_mut().enumerate() {
            let i = base + offset;
            *lane += (prices[i + 1] - prices[i]).abs();
        }
    }

    // Horizontal sum of the four lanes, left to right.
    let mut volatility = lanes[0] + lanes[1] + lanes[2] + lanes[3];

    // Remaining 0-3 steps that did not fill a complete group of four.
    for pair in prices[bulk_steps..].windows(2) {
        volatility += (pair[1] - pair[0]).abs();
    }

    if volatility > f64::EPSILON {
        let ratio = net_movement / volatility;
        // Explicit comparison (not `f64::min`) so a NaN ratio is not masked.
        if ratio > 1.0 {
            1.0
        } else {
            ratio
        }
    } else {
        0.0 // No movement
    }
}

/// Normalization constant for M=2 permutation entropy: ln(2!) = ln(2).
///
/// The M=2 path divides its Shannon entropy by this value; ln(2) is the
/// largest entropy two patterns can produce, so the quotient tops out at 1.
const LN_2_FACTORIAL: f64 = std::f64::consts::LN_2;

/// Normalization constant for M=3 permutation entropy: ln(3!) = ln(6).
///
/// Exact value: ln(6) ≈ 1.791759469228055. The M=3 path divides its Shannon
/// entropy over the six ordinal patterns by this value.
const LN_3_FACTORIAL: f64 = 1.791_759_469_228_055;

/// Estimate the Hurst exponent of a price series via Rescaled Range (R/S) analysis.
///
/// The raw estimate comes from `opendeviationbar_hurst::rssimple` and is then
/// passed through `soft_clamp_hurst`. The function used to be called
/// `compute_hurst_dfa`; it was renamed to match the algorithm actually used.
///
/// Reading the result:
/// - below 0.5: anti-correlated (mean-reverting)
/// - exactly 0.5: random walk
/// - above 0.5: positively correlated (trending)
///
/// Windows shorter than 64 prices yield the neutral value `0.5` without
/// invoking the estimator. The output is soft-clamped to [0, 1] for ML
/// consumption.
#[inline]
pub fn compute_hurst_rescaled_range(prices: &[f64]) -> f64 {
    // Smallest window handed to the R/S estimator (Issue #96 Phase 3b).
    //
    // The upstream `evrom/hurst` implementation degrades on very short
    // windows. 64 was chosen as the smallest power-of-two window with an
    // acceptable bias-vs-variance trade-off in the Phase 3b microbenchmarks
    // and has NOT been re-derived since — treat it as a tuning knob, not a
    // derived bound.
    //
    // Audit context (3-axis column audit v2, 2026-04-29 in
    // opendeviationbar-patterns): flagged together with
    // `TIER2_PARALLEL_THRESHOLD_BASE = 80`, `TIER3_PARALLEL_THRESHOLD_BASE = 150`
    // and `HURST_ENTROPY_SHORTCIRCUIT_THRESHOLD = 0.75` (all in interbar.rs)
    // as un-derived tuning knobs, listed side by side for greppability and
    // to ease future re-derivation.
    const MIN_SAMPLES: usize = 64;

    if prices.len() >= MIN_SAMPLES {
        // Issue #96 Task #168: the slice is handed over as-is, no `.to_vec()` copy.
        let raw_estimate = opendeviationbar_hurst::rssimple(prices);
        soft_clamp_hurst(raw_estimate)
    } else {
        // Not enough data for a meaningful estimate: report "random walk".
        0.5
    }
}

/// Soft-clamp a raw Hurst exponent estimate via the precomputed lookup table.
///
/// Thin public wrapper around `soft_clamp_hurst_lut` from
/// `crate::normalization_lut`; the argument is forwarded unchanged and the
/// table's result is returned unchanged.
///
/// Intended mapping: 0.5 + 0.5 * tanh((x - 0.5) * 4), i.e. 0.5 -> 0.5 with the
/// output approaching 0 or 1 asymptotically for extreme inputs.
/// NOTE(review): the formula and output range are properties of the LUT, which
/// is not defined in this file — confirm against `normalization_lut`.
///
/// Issue #96 Task #198: Replace transcendental tanh() with O(1) lookup
#[inline]
pub fn soft_clamp_hurst(h: f64) -> f64 {
    soft_clamp_hurst_lut(h)
}

/// Compute normalized permutation entropy, choosing the embedding dimension
/// from the window length.
///
/// Dispatch by number of prices `n`:
/// - `n < 10`: insufficient data, returns `1.0`
/// - `10 <= n < 30`: M=2 (2 ordinal patterns)
/// - `n >= 30`: M=3 (6 ordinal patterns), the standard Bandt-Pompe choice
///
/// Issue #103 widened the M=2 range to 10-30 so that small windows skip the
/// M=3 path's monotonic pre-check. Issue #96 Task #93 routes the M=3 case
/// through the batch implementation.
///
/// Formula: H_PE = -sum p_pi * ln(p_pi) / ln(m!)
///
/// Reference: Bandt & Pompe (2002), Phys. Rev. Lett. 88, 174102
///
/// Output range: [0, 1] where 0 = deterministic, 1 = completely random
#[inline]
pub fn compute_permutation_entropy(prices: &[f64]) -> f64 {
    match prices.len() {
        // Too few samples to say anything: report maximum entropy.
        0..=9 => 1.0,
        // Small windows (Issue #103): two-pattern estimate.
        10..=29 => compute_permutation_entropy_m2(prices),
        // Everything else: six-pattern estimate (Task #93 batch path).
        _ => compute_permutation_entropy_m3_cached_batch(prices),
    }
}

/// M=3 permutation entropy entry point used by `compute_permutation_entropy`.
///
/// This function is a pure pass-through: it forwards `prices` to
/// `compute_permutation_entropy_m3_simd_batch` and returns that result
/// unchanged. It keeps no cache or other state of its own; the name is
/// historical (Task #93).
///
/// Issue #108: Dispatcher that delegates to the batch implementation
/// Issue #103: Small windows are routed to the M=2 path by the caller
#[inline]
fn compute_permutation_entropy_m3_cached_batch(prices: &[f64]) -> f64 {
    // Issue #108: all of the work (monotonic early-exit, pattern histogram,
    // entropy normalization) happens in the batch processor.
    compute_permutation_entropy_m3_simd_batch(prices)
}

/// Permutation entropy with embedding dimension M=2.
///
/// Each adjacent pair of prices is classified as one of two ordinal patterns
/// (`a <= b` or `b < a`); the Shannon entropy of the two pattern frequencies
/// is divided by ln(2!) so the result is normalized.
///
/// Expects at least 10 prices (checked with `debug_assert!`); the dispatcher
/// in this file sends windows of 10-29 prices here.
#[inline]
fn compute_permutation_entropy_m2(prices: &[f64]) -> f64 {
    debug_assert!(prices.len() >= 10);

    let pair_count = prices.len() - 1;

    // Issue #96 Task #204: a series with no strictly descending step produces
    // a single pattern, so its entropy is 0 and no histogram is needed.
    let has_descent = (0..pair_count).any(|i| prices[i] > prices[i + 1]);
    if !has_descent {
        return 0.0;
    }

    // Slot 0 counts `a <= b`, slot 1 counts `b < a`.
    // Issue #103: narrow u16 counters (enough for windows up to 65535).
    let mut histogram = [0u16; 2];
    for i in 0..pair_count {
        let slot = if prices[i] <= prices[i + 1] { 0 } else { 1 };
        histogram[slot] += 1;
    }

    // Issue #96 Task #212: one division up front, multiplications afterwards.
    let inv_pairs = 1.0 / pair_count as f64;

    // Shannon entropy over the non-empty slots, accumulated in slot order.
    let mut entropy = 0.0_f64;
    for &hits in &histogram {
        if hits > 0 {
            let p = f64::from(hits) * inv_pairs;
            entropy += -p * libm::log(p); // Issue #116: libm logarithm
        }
    }

    entropy / LN_2_FACTORIAL
}

/// Permutation entropy with embedding dimension M=3 (Issue #108 Phase 2).
///
/// Steps:
/// 1. Early exit with `0.0` when the series is non-decreasing throughout or
///    strictly decreasing throughout (every triplet maps to the same pattern).
/// 2. Build a six-bin histogram of the ordinal pattern of every overlapping
///    price triplet, using `ordinal_pattern_index_m3`.
/// 3. Return the Shannon entropy of the histogram divided by ln(3!).
///
/// The histogram only depends on how many triplets fall into each bin, not on
/// the order in which they are visited, so the triplets are walked with a
/// plain sliding window.
///
/// The caller in this file only passes windows of at least 30 prices.
#[inline]
fn compute_permutation_entropy_m3_simd_batch(prices: &[f64]) -> f64 {
    let n = prices.len();
    let n_patterns = n - 2;

    // Monotonic early exit: stop scanning as soon as both a strictly
    // descending step and a non-descending step have been observed.
    let mut saw_descent = false;
    let mut saw_non_descent = false;
    for i in 0..n - 1 {
        if prices[i] > prices[i + 1] {
            saw_descent = true;
        } else {
            saw_non_descent = true;
        }
        if saw_descent && saw_non_descent {
            break;
        }
    }
    if !(saw_descent && saw_non_descent) {
        return 0.0; // Single pattern = entropy 0
    }

    // u16 bins hold windows of up to 65535 trades; the earlier u8 bins capped
    // at 255 and produced wrong entropy for FixedCount(500) lookback windows.
    let mut histogram = [0u16; 6];
    for triplet in prices.windows(3) {
        let bin = ordinal_pattern_index_m3(triplet[0], triplet[1], triplet[2]);
        histogram[bin] += 1;
    }

    // Issue #96: reciprocal computed once (same approach as the M=2 path).
    let inv_total = 1.0 / n_patterns as f64;

    // Shannon entropy over the non-empty bins, accumulated in bin order.
    let mut entropy = 0.0_f64;
    for &hits in &histogram {
        if hits > 0 {
            let p = hits as f64 * inv_total;
            entropy += -p * libm::log(p); // Issue #116: libm logarithm
        }
    }

    entropy / LN_3_FACTORIAL
}

/// Map a price triplet `(a, b, c)` to its M=3 ordinal pattern index (0-5).
///
/// Patterns (lexicographic order):
/// 0: 012 (a <= b <= c)
/// 1: 021 (a <= c < b)
/// 2: 102 (b < a <= c)
/// 3: 120 (b <= c < a)
/// 4: 201 (c < a <= b)
/// 5: 210 (c < b < a)
///
/// Issue #108 Phase 1: the index is obtained without nested conditionals.
/// Three comparisons (`a <= b`, `b <= c`, `a <= c`) are packed into a 3-bit
/// key that selects an entry of a constant table.
///
/// Two keys cannot occur for ordinary numbers (they contradict transitivity)
/// and map to 0. They can still be produced by NaN inputs, for which every
/// `<=` comparison is false; the function then returns whatever the table
/// holds for that key and never indexes out of bounds.
#[inline(always)]
pub fn ordinal_pattern_index_m3(a: f64, b: f64, c: f64) -> usize {
    // key = (a<=b)*4 + (b<=c)*2 + (a<=c)
    const PATTERN_FOR_KEY: [usize; 8] = [
        5, // key 0: a>b,  b>c,  a>c   -> c < b < a
        0, // key 1: a>b,  b>c,  a<=c  -> contradictory, placeholder
        3, // key 2: a>b,  b<=c, a>c   -> b <= c < a
        2, // key 3: a>b,  b<=c, a<=c  -> b < a <= c
        4, // key 4: a<=b, b>c,  a>c   -> c < a <= b
        1, // key 5: a<=b, b>c,  a<=c  -> a <= c < b
        0, // key 6: a<=b, b<=c, a>c   -> contradictory, placeholder
        0, // key 7: a<=b, b<=c, a<=c  -> a <= b <= c
    ];

    let key = usize::from(a <= b) * 4 + usize::from(b <= c) * 2 + usize::from(a <= c);
    PATTERN_FOR_KEY[key]
}

/// Batch OHLC extraction from trade snapshots.
///
/// Reads the open from the first snapshot, the close from the last one, and
/// scans all snapshots once for the highest and lowest price.
///
/// Returns `(open_price, high_price, low_price, close_price)`, or
/// `(0.0, 0.0, 0.0, 0.0)` when `lookback` is empty.
#[inline]
pub fn extract_ohlc_batch(lookback: &[&TradeSnapshot]) -> (f64, f64, f64, f64) {
    // Issue #96 Task #210: grab the first/last snapshot once, up front.
    let (first, last) = match (lookback.first(), lookback.last()) {
        (Some(first), Some(last)) => (first, last),
        _ => return (0.0, 0.0, 0.0, 0.0),
    };
    let open = first.price.to_f64();
    let close = last.price.to_f64();

    // Running extremes start at the opposite ends of the f64 range so the
    // first price replaces both.
    let mut high = f64::MIN;
    let mut low = f64::MAX;
    for trade in lookback {
        let price = trade.price.to_f64();
        high = high.max(price);
        low = low.min(price);
    }

    (open, high, low, close)
}

/// Collect the price series and its OHLC summary in one pass over `lookback`.
///
/// Issue #96 Task #77: replaces a separate price-collection loop followed by
/// an `extract_ohlc_batch` call with a single traversal.
///
/// Returns `(prices, (open, high, low, close))`. For an empty `lookback` the
/// price vector is empty and all four OHLC values are `0.0`.
#[inline]
pub fn extract_prices_and_ohlc_cached(
    lookback: &[&TradeSnapshot],
) -> (SmallVec<[f64; 256]>, (f64, f64, f64, f64)) {
    // Issue #96 Task #210: grab the first/last snapshot once, up front.
    let (first, last) = match (lookback.first(), lookback.last()) {
        (Some(first), Some(last)) => (first, last),
        _ => return (SmallVec::new(), (0.0, 0.0, 0.0, 0.0)),
    };
    let open = first.price.to_f64();
    let close = last.price.to_f64();

    let mut prices = SmallVec::with_capacity(lookback.len());
    let mut high = f64::MIN;
    let mut low = f64::MAX;

    // One traversal: store each price and update the running extremes.
    for price in lookback.iter().map(|trade| trade.price.to_f64()) {
        prices.push(price);
        if price > high {
            high = price;
        }
        if price < low {
            low = price;
        }
    }

    (prices, (open, high, low, close))
}

/// Compute a normalized Approximate Entropy (ApEn) score.
///
/// Measures self-similarity using distance-based pattern matching and serves
/// as the large-window alternative to Permutation Entropy in this module.
///
/// Formula: ApEn(u, m, r) = φ(m) - φ(m+1), where both φ terms are evaluated by
/// `compute_phi`. The difference is divided by ln(n) and limited to [0, 1].
///
/// Reference: Pincus (1991), PNAS Vol. 88, No. 6
///
/// Parameters:
/// - `prices`: the window to analyse
/// - `m`: embedding dimension (callers in this file pass 2)
/// - `r`: tolerance (callers in this file pass 0.2 * std(prices))
///
/// Returns a value in [0, 1]; `0.0` when the window holds fewer than `m + 1`
/// prices.
#[inline]
pub fn compute_approximate_entropy(prices: &[f64], m: usize, r: f64) -> f64 {
    let n = prices.len();

    // Need at least one pattern of length m + 1.
    if n < m + 1 {
        return 0.0;
    }

    let phi_short = compute_phi(prices, m, r);
    let phi_long = compute_phi(prices, m + 1, r);

    // Issue #116: libm logarithm for the ln(n) normalization.
    let normalized = (phi_short - phi_long) / libm::log(n as f64);

    // `max` then `min` (rather than `clamp`) so that a NaN quotient collapses
    // to 0.0 instead of propagating.
    normalized.max(0.0).min(1.0)
}

/// Check whether two patterns lie within Chebyshev distance `r` of each other.
///
/// Two patterns match when every compared element pair satisfies
/// `|p1[k] - p2[k]| <= r`. At most `m` elements are compared, and never more
/// than the shorter of the two slices holds.
///
/// A NaN difference (or a NaN `r`) never counts as a match, on either code
/// path. Previously the m=2 path rejected NaN while the general path accepted
/// it, so φ(2) and φ(3) treated the same NaN price differently.
///
/// Issue #96 Task #161 Phase 2: for m=2 both absolute differences are
/// computed with one `wide::f64x2` subtraction.
/// Issue #96 Task #88: #[inline] — called in O(n²) loop for approximate entropy
#[inline]
fn patterns_within_distance_simd(p1: &[f64], p2: &[f64], r: f64, m: usize) -> bool {
    // Common case: m=2 (the embedding dimension used for ApEn in this module).
    if m == 2 && p1.len() >= 2 && p2.len() >= 2 {
        let lhs = f64x2::new([p1[0], p1[1]]);
        let rhs = f64x2::new([p2[0], p2[1]]);
        // Extract both lanes with a single conversion.
        let [d0, d1] = (lhs - rhs).abs().to_array();
        d0 <= r && d1 <= r
    } else {
        // General case: `zip` stops at the shorter slice, `take` at m.
        // Written as `<= r` so NaN is rejected exactly like on the m=2 path.
        p1.iter()
            .zip(p2)
            .take(m)
            .all(|(a, b)| (a - b).abs() <= r)
    }
}

/// φ(m) estimate with adaptive pattern sampling for large windows.
/// Issue #96 Task #161 Phase 3: Algorithm optimization via pattern sampling
///
/// Only every k-th pattern is compared against every k-th later pattern; the
/// resulting match count is multiplied by k² to approximate the count a full
/// comparison would have produced.
///
/// # Accuracy
/// Assumes uniform pattern distribution. Works well for random/high-entropy sequences.
/// May underestimate entropy for highly structured data.
///
/// # Strategy (by number of patterns, `prices.len() - m + 1`)
/// - fewer than 300: k = 1, full computation
/// - 300 to 499: k = 2 (about 4x fewer comparisons)
/// - 500 to 999: k = 3 (about 9x fewer comparisons)
/// - 1000 and more: k = 4 (about 16x fewer comparisons)
///
/// # Returns
/// `-c * ln(c)` where `c` is the estimated fraction of matching pattern
/// pairs, or `0.0` when there are no matches or `prices.len() < m`.
///
/// `c` is capped at 1.0. The k² scaling can overshoot the true number of
/// pairs (for 301 patterns and k = 2 a fully matching window scales to 45300
/// against 45150 real pairs), which would otherwise yield a fraction above 1
/// and a negative φ.
fn compute_phi_sampled(prices: &[f64], m: usize, r: f64) -> f64 {
    let n = prices.len();
    if n < m {
        return 0.0;
    }

    let num_patterns = n - m + 1;

    // Adaptive sampling: interval grows with the number of patterns.
    let sample_interval = if num_patterns >= 1000 {
        4
    } else if num_patterns >= 500 {
        3
    } else if num_patterns >= 300 {
        2
    } else {
        1 // no sampling: every pattern pair is compared
    };

    // With an interval of 1 this loop is the full pairwise comparison.
    let mut sampled_matches = 0usize;
    for i in (0..num_patterns).step_by(sample_interval) {
        let p1 = &prices[i..i + m];
        for j in ((i + sample_interval)..num_patterns).step_by(sample_interval) {
            let p2 = &prices[j..j + m];
            if patterns_within_distance_simd(p1, p2, r, m) {
                sampled_matches += 1;
            }
        }
    }

    // Avoid log(0)
    if sampled_matches == 0 {
        return 0.0;
    }

    // Sampling every k-th pattern compares ~(n/k)² pairs; scale back by k².
    let count = sampled_matches * sample_interval * sample_interval;

    // Issue #96: Pre-compute reciprocal of C(n,2) binomial coefficient
    let inv_total_pairs = 2.0 / (num_patterns as f64 * (num_patterns - 1) as f64);
    // A fraction of pairs cannot exceed 1; cap the scaled estimate.
    let c = (count as f64 * inv_total_pairs).min(1.0);
    -c * libm::log(c) // Issue #116: Use libm for 1.2-1.5x speedup
}

/// Helper for ApEn: evaluate φ(m) for a price window.
///
/// Counts the pattern pairs of length `m` whose Chebyshev distance is within
/// `r`, converts the count into a fraction `c` of all C(N, 2) pattern pairs
/// (N = `prices.len() - m + 1`) and returns `-c * ln(c)`.
///
/// Returns `0.0` when `prices.len() < m` or when no pair matches.
///
/// Issue #96 Task #161 Phase 3: windows with more than 300 patterns are
/// handed to `compute_phi_sampled` instead of being compared exhaustively.
fn compute_phi(prices: &[f64], m: usize, r: f64) -> f64 {
    let n = prices.len();
    if n < m {
        return 0.0;
    }

    let num_patterns = n - m + 1;
    if num_patterns > 300 {
        return compute_phi_sampled(prices, m, r);
    }

    // Exhaustive comparison: every pattern against every later pattern.
    let matches: usize = (0..num_patterns)
        .map(|i| {
            let anchor = &prices[i..i + m];
            ((i + 1)..num_patterns)
                .filter(|&j| patterns_within_distance_simd(anchor, &prices[j..j + m], r, m))
                .count()
        })
        .sum();

    // Avoid log(0)
    if matches == 0 {
        return 0.0;
    }

    // Issue #96: reciprocal of the C(n,2) pair count, then one multiplication.
    let ordered_pairs = num_patterns as f64 * (num_patterns - 1) as f64;
    let c = matches as f64 * (2.0 / ordered_pairs);
    -c * libm::log(c) // Issue #116: libm logarithm
}

/// Adaptive entropy: Permutation Entropy for small and medium windows,
/// Approximate Entropy (ApEn) for large ones.
///
/// Issue #96 Task #7 Phase 2: Strategy B - Approximate Entropy
///
/// Dispatch by number of prices `n`:
/// - `n < 500`: `compute_permutation_entropy`
/// - `n >= 500`: `compute_approximate_entropy` with embedding dimension 2 and
///   tolerance `r = 0.2 * std(prices)` (population standard deviation)
///
/// This variant performs no caching; see `compute_entropy_adaptive_cached`
/// for the memoizing counterpart.
///
/// Returns entropy in [0, 1] range
#[inline]
pub fn compute_entropy_adaptive(prices: &[f64]) -> f64 {
    let n = prices.len();

    if n >= 500 {
        // Issue #96: a single reciprocal serves both the mean and the variance.
        let inv_n = 1.0 / n as f64;
        let mean = prices.iter().sum::<f64>() * inv_n;

        // Issue #96 Task #168: squared deviation by plain multiplication.
        let squared_deviations = prices
            .iter()
            .map(|price| {
                let deviation = price - mean;
                deviation * deviation
            })
            .sum::<f64>();
        let tolerance = 0.2 * (squared_deviations * inv_n).sqrt();

        compute_approximate_entropy(prices, 2, tolerance)
    } else {
        compute_permutation_entropy(prices)
    }
}

/// Read-only entropy cache lookup (Issue #96 Task #156).
///
/// Needs only a shared reference to the cache, so callers can probe for an
/// already-computed value before taking the mutable access that
/// `compute_entropy_adaptive_cached` requires.
///
/// Returns:
/// - `cache.get(prices)` for windows of fewer than 500 prices (`Some(entropy)`
///   on a hit, `None` on a miss),
/// - `None` for windows of 500 or more prices; those use ApEn, whose results
///   are not cached.
///
/// A `None` result means the caller still has to compute the entropy.
#[inline]
pub fn compute_entropy_adaptive_cached_readonly(
    prices: &[f64],
    cache: &EntropyCache,
) -> Option<f64> {
    // Large windows are never stored, so there is nothing to look up.
    if prices.len() >= 500 {
        return None;
    }

    cache.get(prices)
}

#[inline]
pub fn compute_entropy_adaptive_cached(prices: &[f64], cache: &mut EntropyCache) -> f64 {
    let n = prices.len();

    // Small/medium windows: use Permutation Entropy with caching
    if n < 500 {
        // Check cache first
        if let Some(cached_entropy) = cache.get(prices) {
            return cached_entropy;
        }

        // Cache miss: compute and store
        let entropy = compute_permutation_entropy(prices);
        cache.insert(prices, entropy);
        return entropy;
    }

    // Large windows: use ApEn (no caching benefit, too variable)
    // Issue #96: Pre-compute reciprocal — replaces 2 divisions with 1 division + 2 multiplications
    let n_inv = 1.0 / n as f64;
    let mean = prices.iter().sum::<f64>() * n_inv;
    // Issue #96 Task #168: Optimize powi(2) to direct multiplication (0.5-1% speedup)
    let variance = prices
        .iter()
        .map(|p| {
            let d = p - mean;
            d * d
        })
        .sum::<f64>()
        * n_inv;
    let std = variance.sqrt();
    let r = 0.2 * std;

    compute_approximate_entropy(prices, 2, r)
}