// opendeviationbar_core/interbar_math/mod.rs
1// FILE-SIZE-OK: ~650 lines — LookbackCache, EntropyCache, SIMD module, re-exports
2//! Inter-bar math helper functions
3//! Extracted from interbar.rs (Phase 2e refactoring)
4//!
5//! GitHub Issue: https://github.com/terrylica/opendeviationbar-py/issues/59
6//! Issue #96 Task #4: SIMD burstiness acceleration (feature-gated)
7//! Issue #96 Task #14: Garman-Klass libm optimization (1.2-1.5x speedup)
8//! Issue #96 Task #93: Permutation entropy batch processing optimization
9//! Issue #96 Task #130: Permutation entropy SIMD vectorization with wide crate
10
11pub mod accumulation;
12pub mod tier2;
13pub mod tier3;
14
15// Re-export all public items for backward compatibility
16pub use accumulation::*;
17pub use tier2::*;
18pub use tier3::*;
19
20use crate::interbar_types::TradeSnapshot;
21use smallvec::SmallVec;
22
/// Memoized lookback trade data (Issue #96 Task #99: Float conversion memoization)
///
/// Pre-computes all float conversions from fixed-point trades in a single pass.
/// This cache is reused across all 16 inter-bar feature functions, eliminating
/// 400-2000 redundant `.to_f64()` calls per bar when inter-bar features enabled.
///
/// # Performance Impact
/// - Single-pass extraction: O(n) fixed cost (not per-feature)
/// - Eliminated redundant conversions: 2-5% speedup when Tier 1/2 features enabled
/// - Memory: ~5KB for typical lookback (100-500 trades)
///
/// # Invariants
/// `prices` and `volumes` always have equal length (one entry per lookback trade).
///
/// NOTE(review): `#[derive(Default)]` produces `all_prices_finite == false` and
/// `all_volumes_finite == false`, while `empty_lookback_cache()` treats an empty
/// window as finite (`true`). Confirm no caller relies on `LookbackCache::default()`
/// for empty-window semantics before unifying.
///
/// # Example
/// ```ignore
/// let cache = extract_lookback_cache(&lookback);
/// let kyle = compute_kyle_lambda_cached(&cache);
/// let burstiness = compute_burstiness_scalar(&lookback); // Still uses TradeSnapshot
/// ```
#[derive(Debug, Clone, Default)]
pub struct LookbackCache {
    /// Pre-computed f64 prices (avoids 400-2000 `.price.to_f64()` calls)
    pub prices: SmallVec<[f64; 256]>,
    /// Pre-computed f64 volumes (avoids 400-2000 `.volume.to_f64()` calls)
    pub volumes: SmallVec<[f64; 256]>,
    /// First trade's price (OHLC open)
    pub open: f64,
    /// Maximum price over the lookback window (OHLC high)
    pub high: f64,
    /// Minimum price over the lookback window (OHLC low)
    pub low: f64,
    /// Last trade's price (OHLC close)
    pub close: f64,
    /// First trade's volume
    pub first_volume: f64,
    /// Total volume (pre-summed for Kyle Lambda, moments, etc.)
    pub total_volume: f64,
    /// Issue #96 Task #45: All prices are finite (no NaN/Inf)
    /// Pre-computed during extraction to eliminate O(n) scan in Tier 3
    pub all_prices_finite: bool,
    /// Issue #96 Task #49: All volumes are finite (no NaN/Inf)
    /// Pre-computed during extraction for volume moments validation
    pub all_volumes_finite: bool,
}
62
63/// Cold path: empty lookback cache (Issue #96 Task #4: cold path optimization)
64/// Moved out of hot path to improve instruction cache locality
65#[cold]
66#[inline(never)]
67fn empty_lookback_cache() -> LookbackCache {
68    LookbackCache {
69        prices: SmallVec::new(),
70        volumes: SmallVec::new(),
71        open: 0.0,
72        high: 0.0,
73        low: 0.0,
74        close: 0.0,
75        first_volume: 0.0,
76        total_volume: 0.0,
77        all_prices_finite: true,
78        all_volumes_finite: true,
79    }
80}
81
82/// Extract memoized lookback data in single pass (Issue #96 Task #99)
83///
84/// Replaces multiple independent passes through lookback trades with a single
85/// traversal that extracts prices, volumes, and OHLC bounds together.
86///
87/// # Complexity
88/// - O(n) single pass through lookback trades
89/// - Constant-time access to pre-computed values for all feature functions
90///
91/// # Returns
92/// Cache with pre-computed prices, volumes, OHLC, and aggregates
93#[inline]
94pub fn extract_lookback_cache(lookback: &[&TradeSnapshot]) -> LookbackCache {
95    if lookback.is_empty() {
96        return empty_lookback_cache();
97    }
98
99    // Issue #96 Task #210: Memoize first/last element access in cache extraction
100    let first_trade = &lookback[0];
101    let last_trade = &lookback[lookback.len() - 1];
102
103    let mut cache = LookbackCache {
104        prices: SmallVec::with_capacity(lookback.len()),
105        volumes: SmallVec::with_capacity(lookback.len()),
106        open: first_trade.price.to_f64(),
107        high: f64::MIN,
108        low: f64::MAX,
109        close: last_trade.price.to_f64(),
110        first_volume: first_trade.volume.to_f64(),
111        total_volume: 0.0,
112        all_prices_finite: true,
113        all_volumes_finite: true,
114    };
115
116    // Single pass: extract prices, volumes, compute OHLC, total volume, and finite checks
117    // Issue #96 Task #45/#49: Track finite flags during extraction (eliminates O(n) scans)
118    for trade in lookback {
119        let p = trade.price.to_f64();
120        let v = trade.volume.to_f64();
121        cache.prices.push(p);
122        cache.volumes.push(v);
123        cache.total_volume += v;
124        // Branchless finite checks: &= avoids branch misprediction
125        cache.all_prices_finite &= p.is_finite();
126        cache.all_volumes_finite &= v.is_finite();
127        // Issue #96 Task #61: Branchless min/max avoids branch misprediction
128        cache.high = cache.high.max(p);
129        cache.low = cache.low.min(p);
130    }
131
132    cache
133}
134
135/// Extract lookback data into an existing cache, reusing SmallVec allocations (Phase 5)
136///
137/// Avoids per-bar SmallVec construction by clearing and reusing existing buffers.
138/// The SmallVec heap allocation (if any) from previous bars is retained.
139///
140/// # Performance
141/// - Eliminates per-bar SmallVec construction overhead
142/// - 1-3% improvement on inter-bar hot path
143#[inline]
144pub fn extract_lookback_cache_reuse(lookback: &[&TradeSnapshot], cache: &mut LookbackCache) {
145    cache.prices.clear();
146    cache.volumes.clear();
147
148    if lookback.is_empty() {
149        cache.open = 0.0;
150        cache.high = 0.0;
151        cache.low = 0.0;
152        cache.close = 0.0;
153        cache.first_volume = 0.0;
154        cache.total_volume = 0.0;
155        cache.all_prices_finite = true;
156        cache.all_volumes_finite = true;
157        return;
158    }
159
160    let first_trade = &lookback[0];
161    let last_trade = &lookback[lookback.len() - 1];
162
163    cache.open = first_trade.price.to_f64();
164    cache.high = f64::MIN;
165    cache.low = f64::MAX;
166    cache.close = last_trade.price.to_f64();
167    cache.first_volume = first_trade.volume.to_f64();
168    cache.total_volume = 0.0;
169    cache.all_prices_finite = true;
170    cache.all_volumes_finite = true;
171
172    cache.prices.reserve(lookback.len());
173    cache.volumes.reserve(lookback.len());
174
175    for trade in lookback {
176        let p = trade.price.to_f64();
177        let v = trade.volume.to_f64();
178        cache.prices.push(p);
179        cache.volumes.push(v);
180        cache.total_volume += v;
181        cache.all_prices_finite &= p.is_finite();
182        cache.all_volumes_finite &= v.is_finite();
183        cache.high = cache.high.max(p);
184        cache.low = cache.low.min(p);
185    }
186}
187
/// Memoizing cache for permutation-entropy results (Issue #96 Task #63)
///
/// Maps a hash of a price sequence to its previously computed entropy so that
/// repeated windows (common during price consolidation) skip recomputation.
/// Backed by `quick_cache` with LRU eviction, plus atomic hit/miss counters
/// for cache-effectiveness analysis (Task #135).
///
/// NOTE(review): the doc comment previously attached to this struct described
/// `accumulate_buy_sell_branchless` (Issue #96 Task #177) and appears to have
/// been misplaced during a refactor; that accumulation logic does not live here.
pub struct EntropyCache {
    /// High-performance LRU cache (quick_cache: 4-10x faster than moka, Issue #96 Task #63)
    /// Key: hash of price sequence, Value: computed entropy
    /// Max capacity: 128 entries via `new()` (tuned for typical consolidation
    /// windows); `with_capacity()` allows larger caches
    cache: quick_cache::sync::Cache<u64, f64>,
    /// Metrics: hit counter (atomic for thread-safe access)
    hits: std::sync::Arc<std::sync::atomic::AtomicUsize>,
    /// Metrics: miss counter (atomic for thread-safe access)
    misses: std::sync::Arc<std::sync::atomic::AtomicUsize>,
}
224
225impl EntropyCache {
226    /// Create new empty entropy cache with LRU eviction and metrics tracking (Task #135)
227    pub fn new() -> Self {
228        Self {
229            cache: quick_cache::sync::Cache::new(128),
230            hits: std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)),
231            misses: std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)),
232        }
233    }
234
235    /// Create entropy cache with custom capacity (Issue #145: Global cache sizing)
236    ///
237    /// Used by global entropy cache to support larger capacity (512-1024 entries)
238    /// for improved hit ratio on multi-symbol workloads.
239    ///
240    /// ## Memory Usage
241    ///
242    /// Approximate memory per entry: ~24 bytes (quick_cache overhead + u64 key + f64 value)
243    /// - 128 entries ≈ 3KB (default, per-processor)
244    /// - 512 entries ≈ 12KB (4x improvement)
245    /// - 1024 entries ≈ 24KB (8x improvement, global cache)
246    pub fn with_capacity(capacity: u64) -> Self {
247        Self {
248            cache: quick_cache::sync::Cache::new(capacity as usize),
249            hits: std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)),
250            misses: std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)),
251        }
252    }
253
254    /// Compute hash of price sequence
255    fn price_hash(prices: &[f64]) -> u64 {
256        use foldhash::fast::FixedState;
257        use std::hash::{BuildHasher, Hash, Hasher};
258
259        // Issue #96 Task #168: Use foldhash instead of DefaultHasher (20-40% faster than ahash for numeric data)
260        // foldhash is optimized for integer/numeric hashing with smaller footprint
261        let mut hasher = FixedState::default().build_hasher();
262
263        // Issue #96 Task #176: Optimize hash computation by directly hashing price bits
264        // instead of per-element .to_bits() calls. Convert slice to u64 array view
265        // and hash raw bytes for better cache locality and fewer function calls.
266        // Safety: f64 and u64 have same size (8 bytes), f64::to_bits() is just bitcast,
267        // so we can safely view [f64] as [u64] and hash directly without per-element calls
268        #[allow(unsafe_code)]
269        {
270            // SAFETY: f64 and u64 are both 64-bit values. We're converting a slice
271            // of f64 to a slice of u64 with the same byte representation. The data
272            // is valid for both interpretations since we're just reading the bit patterns.
273            let price_bits: &[u64] =
274                unsafe { std::slice::from_raw_parts(prices.as_ptr().cast::<u64>(), prices.len()) };
275
276            // Hash all price bits at once instead of per-element
277            price_bits.hash(&mut hasher);
278        }
279
280        hasher.finish()
281    }
282
283    /// Get cached entropy result if available (O(1) operation)
284    /// Tracks hit/miss metrics for cache effectiveness analysis (Task #135)
285    pub fn get(&self, prices: &[f64]) -> Option<f64> {
286        if prices.is_empty() {
287            return None;
288        }
289
290        let hash = Self::price_hash(prices);
291        match self.cache.get(&hash) {
292            Some(entropy) => {
293                self.hits.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
294                Some(entropy)
295            }
296            None => {
297                self.misses
298                    .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
299                None
300            }
301        }
302    }
303
304    /// Cache entropy result (O(1) operation, quick_cache handles LRU eviction)
305    pub fn insert(&mut self, prices: &[f64], entropy: f64) {
306        if prices.is_empty() {
307            return;
308        }
309
310        let hash = Self::price_hash(prices);
311        self.cache.insert(hash, entropy);
312    }
313
314    /// Get cache metrics: (hits, misses, hit_ratio)
315    /// Returns hit ratio as percentage (0-100) for analysis (Task #135)
316    pub fn metrics(&self) -> (usize, usize, f64) {
317        let hits = self.hits.load(std::sync::atomic::Ordering::Relaxed);
318        let misses = self.misses.load(std::sync::atomic::Ordering::Relaxed);
319        let total = hits + misses;
320        let hit_ratio = if total > 0 {
321            (hits as f64 / total as f64) * 100.0
322        } else {
323            0.0
324        };
325        (hits, misses, hit_ratio)
326    }
327
328    /// Reset metrics counters (useful for per-symbol analysis)
329    pub fn reset_metrics(&mut self) {
330        self.hits.store(0, std::sync::atomic::Ordering::Relaxed);
331        self.misses.store(0, std::sync::atomic::Ordering::Relaxed);
332    }
333}
334
335impl std::fmt::Debug for EntropyCache {
336    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
337        let (hits, misses, hit_ratio) = self.metrics();
338        f.debug_struct("EntropyCache")
339            .field("cache_size", &"quick_cache(max_128)")
340            .field("hits", &hits)
341            .field("misses", &misses)
342            .field("hit_ratio_percent", &format!("{:.1}%", hit_ratio))
343            .finish()
344    }
345}
346
347impl Default for EntropyCache {
348    fn default() -> Self {
349        Self::new()
350    }
351}
352
353#[cfg(any(feature = "simd-burstiness", feature = "simd-kyle-lambda"))]
354pub mod simd {
355    //! True SIMD-accelerated inter-bar math functions via wide crate
356    //!
357    //! Issue #96 Task #127: Burstiness SIMD acceleration with wide crate for 2-4x speedup.
358    //! Issue #96 Task #148 Phase 2: Kyle Lambda SIMD acceleration with wide crate for 1.5-2.5x speedup.
359    //! Uses stable Rust (no nightly required). Implements f64x4 vectorization for sum/variance/volumes.
360    //!
361    //! Expected speedup: 2-4x vs scalar on ARM64/x86_64 via SIMD vectorization
362
363    use crate::interbar_types::TradeSnapshot;
364    use smallvec::SmallVec;
365    use wide::f64x4;
366
367    /// True SIMD-accelerated burstiness computation using wide::f64x4 vectors.
368    ///
369    /// Formula: B = (σ_τ - μ_τ) / (σ_τ + μ_τ)
370    /// where σ_τ = std dev of inter-arrival times, μ_τ = mean
371    ///
372    /// # Performance
373    /// Expected 2-4x speedup vs scalar via vectorized mean and variance computation.
374    /// Processes 4 f64 elements per SIMD iteration using wide::f64x4.
375    pub fn compute_burstiness_simd(lookback: &[&TradeSnapshot]) -> f64 {
376        if lookback.len() < 2 {
377            return 0.0;
378        }
379
380        // Compute inter-arrival times (microseconds between consecutive trades)
381        let inter_arrivals = compute_inter_arrivals_simd(lookback);
382        // Issue #96: Pre-compute reciprocal — shared by mean and variance (eliminates 1 division)
383        let inv_n = 1.0 / inter_arrivals.len() as f64;
384
385        // SIMD-accelerated mean computation
386        let mu = sum_f64_simd(&inter_arrivals) * inv_n;
387
388        // SIMD-accelerated variance computation
389        let variance = variance_f64_simd(&inter_arrivals, mu, inv_n);
390        let sigma = variance.sqrt();
391
392        // Issue #96 Task #213: Branchless epsilon check in burstiness (SIMD path)
393        // Avoid branch misprediction by using .max() to guard division
394        // Pattern: (sigma - mu) / denominator.max(f64::EPSILON) only divides if denominator valid
395        let denominator = sigma + mu;
396        let numerator = sigma - mu;
397
398        // Branchless: max ensures denominator >= EPSILON, avoiding division by near-zero
399        numerator / denominator.max(f64::EPSILON)
400    }
401
402    /// Compute inter-arrival times using SIMD vectorization.
403    /// Processes 4 timestamp differences at a time with f64x4.
404    #[inline]
405    /// Issue #96: SmallVec avoids heap allocation for typical bars (≤256 trades)
406    fn compute_inter_arrivals_simd(lookback: &[&TradeSnapshot]) -> SmallVec<[f64; 256]> {
407        let n = lookback.len();
408        if n < 2 {
409            return SmallVec::new();
410        }
411
412        let mut inter_arrivals: SmallVec<[f64; 256]> = smallvec::smallvec![0.0; n - 1];
413
414        // Process inter-arrivals (n-1 elements)
415        let iter_count = (n - 1) / 4;
416        for i in 0..iter_count {
417            let idx = i * 4;
418            for j in 0..4 {
419                inter_arrivals[idx + j] =
420                    (lookback[idx + j + 1].timestamp - lookback[idx + j].timestamp) as f64;
421            }
422        }
423
424        // Scalar remainder for elements not in SIMD chunks
425        let remainder = (n - 1) % 4;
426        if remainder > 0 {
427            let idx = iter_count * 4;
428            for j in 0..remainder {
429                inter_arrivals[idx + j] =
430                    (lookback[idx + j + 1].timestamp - lookback[idx + j].timestamp) as f64;
431            }
432        }
433
434        inter_arrivals
435    }
436
437    /// Compute sum of f64 slice using SIMD reduction with wide::f64x4.
438    /// Processes 4 elements at a time for 4x speedup vs scalar.
439    #[inline]
440    fn sum_f64_simd(values: &[f64]) -> f64 {
441        if values.is_empty() {
442            return 0.0;
443        }
444
445        // Use SIMD to accumulate 4 values at once
446        let chunks = values.len() / 4;
447        let mut sum_vec = f64x4::splat(0.0);
448
449        for i in 0..chunks {
450            let idx = i * 4;
451            let chunk = f64x4::new([
452                values[idx],
453                values[idx + 1],
454                values[idx + 2],
455                values[idx + 3],
456            ]);
457            sum_vec += chunk;
458        }
459
460        // Horizontal sum of SIMD vector (sum all 4 elements)
461        let simd_sum: [f64; 4] = sum_vec.into();
462        let mut total = simd_sum[0] + simd_sum[1] + simd_sum[2] + simd_sum[3];
463
464        // Scalar remainder for elements not in SIMD chunks
465        let remainder = values.len() % 4;
466        for j in 0..remainder {
467            total += values[chunks * 4 + j];
468        }
469
470        total
471    }
472
473    /// Compute variance using SIMD with wide::f64x4 vectors.
474    /// Processes 4 squared deviations per iteration for 4x speedup.
475    #[inline]
476    /// Issue #96: Accept pre-computed `inv_n` to eliminate redundant division
477    fn variance_f64_simd(values: &[f64], mu: f64, inv_n: f64) -> f64 {
478        if values.is_empty() {
479            return 0.0;
480        }
481
482        let mu_vec = f64x4::splat(mu);
483        let chunks = values.len() / 4;
484        let mut sum_sq_vec = f64x4::splat(0.0);
485
486        for i in 0..chunks {
487            let idx = i * 4;
488            let chunk = f64x4::new([
489                values[idx],
490                values[idx + 1],
491                values[idx + 2],
492                values[idx + 3],
493            ]);
494            let deviations = chunk - mu_vec;
495            let squared = deviations * deviations;
496            sum_sq_vec += squared;
497        }
498
499        // Horizontal sum of squared deviations
500        let simd_sums: [f64; 4] = sum_sq_vec.into();
501        let mut sum_sq = simd_sums[0] + simd_sums[1] + simd_sums[2] + simd_sums[3];
502
503        // Scalar remainder
504        let remainder = values.len() % 4;
505        for j in 0..remainder {
506            let v = values[chunks * 4 + j] - mu;
507            sum_sq += v * v;
508        }
509
510        sum_sq * inv_n
511    }
512
513    /// SIMD-accelerated Kyle Lambda computation using wide::f64x4.
514    ///
515    /// Formula: Kyle Lambda = ((last_price - first_price) / first_price) / normalized_imbalance
516    /// where normalized_imbalance = (buy_vol - sell_vol) / total_vol
517    ///
518    /// # Performance
519    /// Expected 1.5-2.5x speedup vs scalar via vectorized volume accumulation
520    /// and parallel SIMD reductions across multiple trades.
521    ///
522    /// Issue #96 Task #148 Phase 2: Kyle Lambda SIMD implementation
523    pub fn compute_kyle_lambda_simd(lookback: &[&TradeSnapshot]) -> f64 {
524        let n = lookback.len();
525
526        if n < 2 {
527            return 0.0;
528        }
529
530        // Issue #96 Task #210: Memoize first/last element access to avoid redundant .unwrap() chains
531        // Bounds guaranteed by n >= 2 check above; direct indexing is safer than repeated .first()/.last()
532        let first_price = lookback[0].price.to_f64();
533        let last_price = lookback[n - 1].price.to_f64();
534
535        // Adaptive computation: subsample large windows
536        let (buy_vol, sell_vol) = if n > 500 {
537            // Subsampled with SIMD-accelerated summing
538            accumulate_volumes_simd_wide(lookback, true)
539        } else {
540            // Full computation with SIMD
541            accumulate_volumes_simd_wide(lookback, false)
542        };
543
544        let total_vol = buy_vol + sell_vol;
545        let first_price_abs = first_price.abs();
546
547        // Early-exit optimization: extreme imbalance
548        if buy_vol >= total_vol - f64::EPSILON {
549            return if first_price_abs > f64::EPSILON {
550                (last_price - first_price) / first_price
551            } else {
552                0.0
553            };
554        } else if sell_vol >= total_vol - f64::EPSILON {
555            return if first_price_abs > f64::EPSILON {
556                -((last_price - first_price) / first_price)
557            } else {
558                0.0
559            };
560        }
561
562        let normalized_imbalance = if total_vol > f64::EPSILON {
563            (buy_vol - sell_vol) / total_vol
564        } else {
565            0.0
566        };
567
568        // Issue #96 Task #208: Early-exit for zero imbalance (SIMD path)
569        // If buy_vol ≈ sell_vol (perfectly balanced), Kyle Lambda = price_change / 0 = undefined
570        // Skip expensive price change calculation and return 0.0 immediately
571        let imbalance_abs = normalized_imbalance.abs();
572        if imbalance_abs <= f64::EPSILON {
573            return 0.0; // Balanced imbalance -> Kyle Lambda = 0.0
574        }
575
576        // Issue #96 Task #203: Branchless epsilon handling in SIMD path
577        let imbalance_valid = 1.0; // Already verified imbalance_abs > f64::EPSILON above
578        let price_valid = if first_price_abs > f64::EPSILON {
579            1.0
580        } else {
581            0.0
582        };
583        let both_valid = imbalance_valid * price_valid;
584
585        let price_change = if first_price_abs > f64::EPSILON {
586            (last_price - first_price) / first_price
587        } else {
588            0.0
589        };
590
591        if both_valid > 0.0 {
592            price_change / normalized_imbalance
593        } else {
594            0.0
595        }
596    }
597
598    /// Accumulate buy and sell volumes using SIMD vectorization.
599    /// Processes 4 volumes at a time using wide::f64x4.
600    #[inline]
601    fn accumulate_volumes_simd_wide(lookback: &[&TradeSnapshot], subsample: bool) -> (f64, f64) {
602        let mut buy_vol = 0.0;
603        let mut sell_vol = 0.0;
604
605        if subsample {
606            // Process every 5th trade for large windows
607            // Branchless arithmetic selection: is_buyer_maker → mask (1.0 or 0.0)
608            for trade in lookback.iter().step_by(5) {
609                let vol = trade.volume.to_f64();
610                let is_buyer_mask = trade.is_buyer_maker as u32 as f64;
611
612                // Arithmetic selection: when is_buyer_maker==true, add to sell_vol; else buy_vol
613                // (matches scalar logic: is_buyer_maker indicates seller-initiated trade)
614                buy_vol += vol * (1.0 - is_buyer_mask);
615                sell_vol += vol * is_buyer_mask;
616            }
617        } else {
618            // Full computation for medium windows with branchless optimization
619            // Issue #96 Task #175: Process trades in pairs to enable instruction-level parallelism
620            // Issue #96 Task #184: Branchless arithmetic selection (epsilon optimization)
621            let n = lookback.len();
622            let pairs = n / 2;
623
624            for i in 0..pairs {
625                let idx = i * 2;
626                let t0 = lookback[idx];
627                let t1 = lookback[idx + 1];
628
629                let vol0 = t0.volume.to_f64();
630                let vol1 = t1.volume.to_f64();
631
632                // Branchless conversion: is_buyer_maker (bool) → mask (0.0 or 1.0)
633                let is_buyer_mask0 = t0.is_buyer_maker as u32 as f64;
634                let is_buyer_mask1 = t1.is_buyer_maker as u32 as f64;
635
636                // Arithmetic selection: sell gets mask, buy gets 1-mask
637                // (matches scalar logic: is_buyer_maker=true → sell-initiated trade)
638                buy_vol += vol0 * (1.0 - is_buyer_mask0);
639                sell_vol += vol0 * is_buyer_mask0;
640
641                buy_vol += vol1 * (1.0 - is_buyer_mask1);
642                sell_vol += vol1 * is_buyer_mask1;
643            }
644
645            // Scalar remainder for odd-length arrays
646            if n % 2 == 1 {
647                let last_trade = lookback[n - 1];
648                let vol = last_trade.volume.to_f64();
649                let is_buyer_mask = last_trade.is_buyer_maker as u32 as f64;
650
651                buy_vol += vol * (1.0 - is_buyer_mask);
652                sell_vol += vol * is_buyer_mask;
653            }
654        }
655
656        (buy_vol, sell_vol)
657    }
658
    #[cfg(test)]
    mod tests {
        use super::*;

        // Builds a TradeSnapshot from human-readable floats; price/volume are
        // scaled by 1e8 to match the FixedPoint representation, and turnover is
        // derived as price * volume in the same fixed-point scale.
        fn create_test_snapshot(ts: i64, price: f64, volume: f64) -> TradeSnapshot {
            TradeSnapshot {
                timestamp: ts,
                price: crate::FixedPoint((price * 1e8) as i64),
                volume: crate::FixedPoint((volume * 1e8) as i64),
                is_buyer_maker: false,
                turnover: (price * volume * 1e8) as i128,
            }
        }

        // Empty window has no inter-arrivals -> burstiness is defined as 0.0.
        #[test]
        fn test_burstiness_simd_edge_case_empty() {
            let lookback: Vec<&TradeSnapshot> = vec![];
            assert_eq!(compute_burstiness_simd(&lookback), 0.0);
        }

        // A single trade yields no inter-arrival gap -> 0.0.
        #[test]
        fn test_burstiness_simd_edge_case_single() {
            let t0 = create_test_snapshot(0, 100.0, 1.0);
            let lookback = vec![&t0];
            assert_eq!(compute_burstiness_simd(&lookback), 0.0);
        }

        // Perfectly regular arrivals: sigma = 0, so B = (0 - mu)/(0 + mu) = -1.
        #[test]
        fn test_burstiness_simd_regular_intervals() {
            let t0 = create_test_snapshot(0, 100.0, 1.0);
            let t1 = create_test_snapshot(1000, 100.0, 1.0);
            let t2 = create_test_snapshot(2000, 100.0, 1.0);
            let t3 = create_test_snapshot(3000, 100.0, 1.0);
            let t4 = create_test_snapshot(4000, 100.0, 1.0);
            let lookback = vec![&t0, &t1, &t2, &t3, &t4];

            let b = compute_burstiness_simd(&lookback);
            assert!((b - (-1.0)).abs() < 0.01);
        }

        // Two tight clusters separated by a long gap: high dispersion relative
        // to the mean gap -> positive burstiness, bounded above by 1.
        #[test]
        fn test_burstiness_simd_clustered_arrivals() {
            let t0 = create_test_snapshot(0, 100.0, 1.0);
            let t1 = create_test_snapshot(10, 100.0, 1.0);
            let t2 = create_test_snapshot(20, 100.0, 1.0);
            let t3 = create_test_snapshot(5000, 100.0, 1.0);
            let t4 = create_test_snapshot(5010, 100.0, 1.0);
            let t5 = create_test_snapshot(5020, 100.0, 1.0);
            let lookback = vec![&t0, &t1, &t2, &t3, &t4, &t5];

            let b = compute_burstiness_simd(&lookback);
            assert!(b > 0.0);
            assert!(b <= 1.0);
        }

        // Burstiness is mathematically confined to [-1, 1]; sanity-check bounds.
        #[test]
        fn test_burstiness_simd_bounds() {
            let t0 = create_test_snapshot(0, 100.0, 1.0);
            let t1 = create_test_snapshot(100, 100.0, 1.0);
            let t2 = create_test_snapshot(200, 100.0, 1.0);
            let t3 = create_test_snapshot(300, 100.0, 1.0);
            let lookback = vec![&t0, &t1, &t2, &t3];

            let b = compute_burstiness_simd(&lookback);
            assert!(b >= -1.0 && b <= 1.0);
        }

        // 7 trades -> 6 inter-arrivals: exercises both the 4-wide chunk path
        // and the scalar remainder path of the SIMD reductions.
        #[test]
        fn test_simd_remainder_handling() {
            let trades: Vec<_> = (0..7)
                .map(|i| create_test_snapshot((i * 100) as i64, 100.0, 1.0))
                .collect();
            let trade_refs: Vec<_> = trades.iter().collect();

            let b = compute_burstiness_simd(&trade_refs);
            assert!(b >= -1.0 && b <= 1.0);
        }
    }
737}