tulip_rs 0.1.11

High-performance technical analysis library — 100+ indicators and 60+ candlestick patterns with SIMD acceleration
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
//use crate::indicators::nightly::simd_types::F64Constants;
use std::simd::{
    cmp::{SimdPartialEq, SimdPartialOrd},
    num::{SimdFloat, SimdInt, SimdUint},
    Select, Simd, StdFloat,
};
/// High-performance SIMD natural logarithm
/// Based on Agner Fog's implementation
/// High-performance SIMD natural logarithm
/// Based on the wide crate implementation
/// High-performance SIMD natural logarithm
/// Based on fdlibm implementation
#[inline(always)]
pub fn ln<const N: usize>(x: Simd<f64, N>) -> Simd<f64, N> {
    // Pre-computed constants to reduce Simd::splat() calls
    const L2U: f64 = 0.6931471805599452862;
    const L2L: f64 = 2.3190468138462996e-17;
    const SQRT2: f64 = 1.4142135623730950488;
    const TWO54: f64 = 18014398509481984.0;
    const ZERO: f64 = 0.0;
    const ONE: f64 = 1.0;
    const HALF: f64 = 0.5;
    const TWO: f64 = 2.0;
    const NEG_INF: f64 = f64::NEG_INFINITY;
    const POS_INF: f64 = f64::INFINITY;
    const NAN: f64 = f64::NAN;
    
    // Full 8-term polynomial coefficients for accuracy
    const C1: f64 = 1.0;
    const C3: f64 = 0.3333333333333333333333333333333;
    const C5: f64 = 0.2;
    const C7: f64 = 0.14285714285714285714285714285714;
    const C9: f64 = 0.11111111111111111111111111111111;
    const C11: f64 = 0.09090909090909090909090909090909;
    const C13: f64 = 0.07692307692307692307692307692308;
    const C15: f64 = 0.06666666666666666666666666666667;
    
    // Pre-splat constants
    let zero_splat = Simd::splat(ZERO);
    let one_splat = Simd::splat(ONE);
    let half_splat = Simd::splat(HALF);
    let two_splat = Simd::splat(TWO);
    let sqrt2_splat = Simd::splat(SQRT2);
    let two54_splat = Simd::splat(TWO54);
    let pos_inf_splat = Simd::<f64, N>::splat(POS_INF);
    let neg_inf_splat = Simd::<f64, N>::splat(NEG_INF);
    
    // OPTIMIZATION 1: Reduced Special Case Checks (kept from previous)
    let is_zero = x.simd_eq(zero_splat);
    let is_neg = x.simd_lt(zero_splat);
    let is_inf = x.simd_eq(pos_inf_splat);
    let is_nan = x.is_nan();
    
    let special_mask = is_zero | is_neg | is_inf | is_nan;
    
    if special_mask.all() {
        return is_zero.select(
            neg_inf_splat,
            is_neg.select(
                Simd::splat(NAN),
                is_inf.select(
                    pos_inf_splat,
                    Simd::splat(NAN)
                )
            )
        );
    }
    
    // OPTIMIZATION 2: Combined Bit Operations
    // Extract bits once and do all bit operations together
    let ix = x.to_bits().cast::<i64>();
    let is_subnormal = ix.simd_lt(Simd::splat(0x0010000000000000_i64));
    
    // Combined subnormal handling and bit extraction
    let (x_normalized, exponent_adjustment) = if is_subnormal.any() {
        let x_scaled = x * two54_splat;
        let x_work = is_subnormal.select(x_scaled, x);
        let adj = is_subnormal.select(Simd::splat(-54_i32), Simd::splat(0_i32));
        (x_work, adj)
    } else {
        // Fast path when no subnormals
        (x, Simd::splat(0_i32))
    };
    
    // Single bit extraction for both exponent and mantissa
    let ix_work = x_normalized.to_bits().cast::<i64>();
    
    // Extract exponent and create mantissa in one step
    let raw_exponent = (ix_work >> 52).cast::<i32>();
    let e = exponent_adjustment + raw_exponent - Simd::splat(0x3ff_i32);
    
    // Create mantissa directly from the same bits
    let mantissa_bits = (ix_work & Simd::splat(0x000fffffffffffff_i64)) | Simd::splat(0x3ff0000000000000_i64);
    let mantissa = Simd::<f64, N>::from_bits(mantissa_bits.cast::<u64>());
    
    // Range reduction (kept the same)
    let adjust = mantissa.simd_ge(sqrt2_splat);
    let mantissa = adjust.select(mantissa * half_splat, mantissa);
    let e = adjust.select(e + Simd::splat(1_i32), e);
    
    // Optimized transformation s = (m-1)/(m+1)
    let numerator = mantissa - one_splat;
    let denominator = mantissa + one_splat;
    let s = numerator / denominator;
    let s2 = s * s;
    
    // Full 8-term polynomial evaluation for accuracy
    let mut poly = Simd::splat(C15);
    poly = s2.mul_add(poly, Simd::splat(C13));
    poly = s2.mul_add(poly, Simd::splat(C11));
    poly = s2.mul_add(poly, Simd::splat(C9));
    poly = s2.mul_add(poly, Simd::splat(C7));
    poly = s2.mul_add(poly, Simd::splat(C5));
    poly = s2.mul_add(poly, Simd::splat(C3));
    poly = s2.mul_add(poly, Simd::splat(C1));
    
    // Final result computation with optimized mul_add chain
    let log_mantissa = s * poly * two_splat;
    let e_f64 = e.cast::<f64>();
    
    // Single mul_add chain for final result
    let result = e_f64.mul_add(
        Simd::splat(L2U), 
        e_f64.mul_add(Simd::splat(L2L), log_mantissa)
    );
    
    // Apply special case mask only if there are any special cases
    if special_mask.any() {
        special_mask.select(
            is_zero.select(
                neg_inf_splat,
                is_neg.select(
                    Simd::splat(NAN),
                    is_inf.select(
                        pos_inf_splat,
                        Simd::splat(NAN)
                    )
                )
            ),
            result
        )
    } else {
        result
    }
}

/// Unsafe natural logarithm that assumes inputs are positive, finite, and non-NaN.
/// 
/// # Safety
/// 
/// The caller must guarantee that all input values satisfy:
/// - x > 0.0 (no negative values or zero)
/// - x is finite (no positive infinity)
/// - x is not NaN
/// 
/// Violating these preconditions may result in incorrect results or undefined behavior.
/// 
/// # Performance
/// 
/// This function skips all special case checks and branching, providing maximum
/// performance for hot paths where input constraints are guaranteed by the caller.
#[inline(always)]
pub unsafe fn ln_unchecked<const N: usize>(x: Simd<f64, N>) -> Simd<f64, N> {
    // Pre-computed constants
    const L2U: f64 = 0.6931471805599452862;
    const L2L: f64 = 2.3190468138462996e-17;
    const SQRT2: f64 = 1.4142135623730950488;
    const TWO54: f64 = 18014398509481984.0;
    const ONE: f64 = 1.0;
    const HALF: f64 = 0.5;
    const TWO: f64 = 2.0;
    
    // Full 8-term polynomial coefficients for accuracy
    const C1: f64 = 1.0;
    const C3: f64 = 0.3333333333333333333333333333333;
    const C5: f64 = 0.2;
    const C7: f64 = 0.14285714285714285714285714285714;
    const C9: f64 = 0.11111111111111111111111111111111;
    const C11: f64 = 0.09090909090909090909090909090909;
    const C13: f64 = 0.07692307692307692307692307692308;
    const C15: f64 = 0.06666666666666666666666666666667;
    
    // Pre-splat constants
    let one_splat = Simd::splat(ONE);
    let half_splat = Simd::splat(HALF);
    let two_splat = Simd::splat(TWO);
    let sqrt2_splat = Simd::splat(SQRT2);
    let two54_splat = Simd::splat(TWO54);
    
    // Debug assertion in debug builds to catch violations
    #[cfg(debug_assertions)]
    {
        debug_assert!(
            !x.simd_le(Simd::splat(0.0)).any(),
            "ln_unchecked called with non-positive value"
        );
        debug_assert!(
            !x.is_nan().any(),
            "ln_unchecked called with NaN"
        );
        debug_assert!(
            x.is_finite().all(),
            "ln_unchecked called with infinite value"
        );
    }
    
    // Combined bit operations for subnormal handling
    let ix = x.to_bits().cast::<i64>();
    let is_subnormal = ix.simd_lt(Simd::splat(0x0010000000000000_i64));
    
    // Handle subnormals by scaling
    let (x_normalized, exponent_adjustment) = if is_subnormal.any() {
        let x_scaled = x * two54_splat;
        let x_work = is_subnormal.select(x_scaled, x);
        let adj = is_subnormal.select(Simd::splat(-54_i32), Simd::splat(0_i32));
        (x_work, adj)
    } else {
        // Fast path when no subnormals
        (x, Simd::splat(0_i32))
    };
    
    // Single bit extraction for both exponent and mantissa
    let ix_work = x_normalized.to_bits().cast::<i64>();
    
    // Extract exponent and create mantissa in one step
    let raw_exponent = (ix_work >> 52).cast::<i32>();
    let e = exponent_adjustment + raw_exponent - Simd::splat(0x3ff_i32);
    
    // Create mantissa directly from the same bits
    let mantissa_bits = (ix_work & Simd::splat(0x000fffffffffffff_i64)) | Simd::splat(0x3ff0000000000000_i64);
    let mantissa = Simd::<f64, N>::from_bits(mantissa_bits.cast::<u64>());
    
    // Range reduction
    let adjust = mantissa.simd_ge(sqrt2_splat);
    let mantissa = adjust.select(mantissa * half_splat, mantissa);
    let e = adjust.select(e + Simd::splat(1_i32), e);
    
    // Optimized transformation s = (m-1)/(m+1)
    let numerator = mantissa - one_splat;
    let denominator = mantissa + one_splat;
    let s = numerator / denominator;
    let s2 = s * s;
    
    // Full 8-term polynomial evaluation for accuracy using Horner's method with FMA
    let poly = s2
        .mul_add(s2
            .mul_add(s2
                .mul_add(s2
                    .mul_add(s2
                        .mul_add(s2
                            .mul_add(s2
                                .mul_add(Simd::splat(C15), Simd::splat(C13)), 
                                Simd::splat(C11)), 
                            Simd::splat(C9)), 
                        Simd::splat(C7)), 
                    Simd::splat(C5)), 
                Simd::splat(C3)), 
            Simd::splat(C1));
    // Final result computation with optimized mul_add chain
    let log_mantissa = s * poly * two_splat;
    let e_f64 = e.cast::<f64>();
    
    // Single mul_add chain for final result
    e_f64.mul_add(
        Simd::splat(L2U), 
        e_f64.mul_add(Simd::splat(L2L), log_mantissa)
    )
}



/// High-performance SIMD exponential (e^x)
#[inline(always)]
pub fn exp<const N: usize>(x: Simd<f64, N>) -> Simd<f64, N> {
    // Coefficients for exp polynomial
    let log2e = Simd::splat(1.44269504088896340736);
    let ln2 = Simd::splat(0.693147180559945309417);

    let c1 = Simd::splat(1.0);
    let c2 = Simd::splat(0.5);
    let c3 = Simd::splat(1.66666666666666019037e-1);
    let c4 = Simd::splat(4.16666666666666019037e-2);
    let c5 = Simd::splat(8.33333333333329318027e-3);
    let c6 = Simd::splat(1.38888888889814059901e-3);
    let c7 = Simd::splat(1.98412698413242405037e-4);

    // Range reduction
    let k = (x * log2e).round();
    let r = x - k * ln2;

    // Polynomial approximation
    let r2 = r * r;
    let poly = c1 + r * (c1 + r2 * (c2 + r * (c3 + r * (c4 + r * (c5 + r * (c6 + r * c7))))));

    // Reconstruct with 2^k
    let k_int = unsafe { k.to_int_unchecked::<i64>() };
    let scale = Simd::<f64, N>::from_bits(((k_int + Simd::splat(1023)) << 52).cast::<u64>());

    poly * scale
}

/// SIMD power function (x^y)
#[inline(always)]
pub fn pow<const N: usize>(x: Simd<f64, N>, y: Simd<f64, N>) -> Simd<f64, N> {
    // x^y = exp(y * ln(x))
    exp(y * ln(x))
}

/// SIMD square root (already in std::simd but provided for completeness)
#[inline(always)]
pub fn sqrt<const N: usize>(x: Simd<f64, N>) -> Simd<f64, N> {
    x.sqrt() // std::simd has this optimized
}

pub mod trig {
    use std::simd::{
        cmp::{SimdPartialEq, SimdPartialOrd},
        num::{SimdFloat, SimdInt},
        Simd, StdFloat, Select
    };
    #[inline(always)]
    pub fn simd_atan<const N: usize>(x: Simd<f64, N>) -> Simd<f64, N> {
        // Polynomial coefficients for atan approximation
        const P4: f64 = -8.750608600031904122785E-1;
        const P3: f64 = -1.615753718733365076637E1;
        const P2: f64 = -7.500855792314704667340E1;
        const P1: f64 = -1.228866684490136173410E2;
        const P0: f64 = -6.485021904942025371773E1;

        const Q4: f64 = 2.485846490142306297962E1;
        const Q3: f64 = 1.650270098316988542046E2;
        const Q2: f64 = 4.328810604912902668951E2;
        const Q1: f64 = 4.853903996359136964868E2;
        const Q0: f64 = 1.945506571482613964425E2;

        const MORE_BITS: f64 = 6.123233995736765886130E-17;
        const MORE_BITS_O2: f64 = 6.123233995736765886130E-17 * 0.5;
        const T3PO8: f64 = std::f64::consts::SQRT_2 + 1.0;
        let zero_splat = Simd::<f64, N>::splat(0.0);
        let one_splat = Simd::<f64, N>::splat(1.0);
        let t = x.abs();

        // Range classification
        let notbig = t.simd_le(Simd::splat(T3PO8));
        let notsmal = t.simd_ge(Simd::splat(0.66));

        // Select offset angle
        let mut s = notbig.select(
            Simd::splat(std::f64::consts::FRAC_PI_4),
            Simd::splat(std::f64::consts::FRAC_PI_2),
        );
        s = notsmal.select(s, zero_splat);

        let mut fac = notbig.select(Simd::splat(MORE_BITS_O2), Simd::splat(MORE_BITS));
        fac = notsmal.select(fac, zero_splat);

        // Compute reduced argument z
        let mut a = notbig.select(t, zero_splat);
        a = notsmal.select(a - one_splat, a);

        let mut b = notbig.select(one_splat, zero_splat);
        b = notsmal.select(b + t, b);

        let z = a / b;
        let zz = z * z;

        // Numerator polynomial with mul_add (Horner's method)
        let px = zz.mul_add(
            zz.mul_add(
                zz.mul_add(
                    zz.mul_add(Simd::splat(P4), Simd::splat(P3)),
                    Simd::splat(P2)
                ),
                Simd::splat(P1)
            ),
            Simd::splat(P0)
        );

        // Denominator polynomial with mul_add (Horner's method)
        let qx = zz.mul_add(
            zz.mul_add(
                zz.mul_add(
                    zz.mul_add(Simd::splat(Q4), Simd::splat(Q3)),
                    Simd::splat(Q2)
                ),
                Simd::splat(Q1)
            ),
            Simd::splat(Q0)
        );
        
        // Compute final result: z * ((px/qx) * zz + 1)
        let ratio = px / qx;
        let temp = ratio.mul_add(zz, one_splat);
        let mut result = z * temp;

        // Add offset and correction
        result = result + s + fac;

        let sign_mask = x.to_bits() & Simd::splat(0x8000000000000000u64);
        let result_bits = result.to_bits() ^ sign_mask;
        Simd::from_bits(result_bits)

    }

    #[inline(always)]
    pub fn simd_sin_cos<const N: usize>(x: Simd<f64, N>) -> (Simd<f64, N>, Simd<f64, N>) {
        // Polynomial coefficients
        let p0sin = Simd::splat(-1.66666666666666307295E-1);
        let p1sin = Simd::splat(8.33333333332211858878E-3);
        let p2sin = Simd::splat(-1.98412698295895385996E-4);
        let p3sin = Simd::splat(2.75573136213857245213E-6);
        let p4sin = Simd::splat(-2.50507477628578072866E-8);
        let p5sin = Simd::splat(1.58962301576546568060E-10);

        let p0cos = Simd::splat(4.16666666666665929218E-2);
        let p1cos = Simd::splat(-1.38888888888730564116E-3);
        let p2cos = Simd::splat(2.48015872888517045348E-5);
        let p3cos = Simd::splat(-2.75573141792967388112E-7);
        let p4cos = Simd::splat(2.08757008419747316778E-9);
        let p5cos = Simd::splat(-1.13585365213876817300E-11);

        let dp1 = Simd::splat(7.853981554508209228515625E-1 * 2.0);
        let dp2 = Simd::splat(7.94662735614792836714E-9 * 2.0);
        let dp3 = Simd::splat(3.06161699786838294307E-17 * 2.0);
        let two_over_pi = Simd::splat(2.0 / std::f64::consts::PI);

        let i_one_splat = Simd::<i64, N>::splat(1);
        let i_zero_splat = Simd::<i64, N>::splat(0);
        let f64_zero_splat = Simd::<f64, N>::splat(0.0);
        let f64_one_splat = Simd::<f64, N>::splat(1.0);
        
        let xa = x.abs();
        let y = (xa * two_over_pi).round();
        let q: Simd<i64, N> = unsafe { y.to_int_unchecked() };

        // Cody-Waite range reduction
        //let x_reduced = xa - y * dp1 - y * dp2 - y * dp3;
        let x_reduced = y.mul_add(-dp3, y.mul_add(-dp2, y.mul_add(-dp1, xa)));

        let x2 = x_reduced * x_reduced;

        // Polynomial evaluation using Horner's method
        // Single expression for sine polynomial
        let s = x_reduced * (x2.mul_add(
            x2.mul_add(
                x2.mul_add(
                    x2.mul_add(
                        x2.mul_add(p5sin, p4sin),
                        p3sin
                    ),
                    p2sin
                ),
                p1sin
            ) * x2 + p0sin,
            f64_one_splat
        ));
        
        let c = (x2 * x2).mul_add(
            x2.mul_add(
                x2.mul_add(
                    x2.mul_add(
                        x2.mul_add(
                            x2.mul_add(p5cos, p4cos),
                            p3cos
                        ),
                        p2cos
                    ),
                    p1cos
                ),
                p0cos
            ),
            (-x2).mul_add(Simd::splat(0.5), f64_one_splat)
        );

        // Swap sin/cos for odd quadrants
        let swap = (q & i_one_splat).simd_ne(i_zero_splat);

        // Handle overflow (check for finite input)
        let overflow = q.simd_gt(Simd::splat(0x80000000000000i64)) & xa.is_finite();
        let s = overflow.select(f64_zero_splat, s);
        let c = overflow.select(f64_one_splat, c);

        // Apply swap using select
        let sin1 = swap.select(c, s);
        let cos1 = swap.select(s, c);

        // Apply signs using SIMD bit manipulation
        // Convert to bits for manipulation
        let sin1_bits: Simd<u64, N> = sin1.to_bits();
        let cos1_bits: Simd<u64, N> = cos1.to_bits();
        let x_bits: Simd<u64, N> = x.to_bits();

        // Sign for sin: XOR bit 63 (sign bit) based on quadrant and original sign
        let sign_sin_bits = (q << 62).cast::<u64>() ^ x_bits;
        let sign_sin_mask = sign_sin_bits & Simd::splat(0x8000000000000000u64);
        let sin_result_bits = sin1_bits ^ sign_sin_mask;

        // Sign for cos: bit 1 of (q+1) determines sign
        let sign_cos_bits = ((q + i_one_splat) & Simd::splat(2)) << 62;
        let sign_cos_mask = sign_cos_bits.cast::<u64>();
        let cos_result_bits = cos1_bits ^ sign_cos_mask;

        // Convert back to f64
        let sin_result = Simd::<f64, N>::from_bits(sin_result_bits);
        let cos_result = Simd::<f64, N>::from_bits(cos_result_bits);

        (sin_result, cos_result)
    }

    #[inline(always)]
    pub fn simd_sin<const N: usize>(x: Simd<f64, N>) -> Simd<f64, N> {
        let (sin, _) = simd_sin_cos(x);
        sin
    }

    /// High-performance SIMD cosine
    /// Wrapper around simd_sin_cos when only cosine is needed
    #[inline(always)]
    pub fn simd_cos<const N: usize>(x: Simd<f64, N>) -> Simd<f64, N> {
        let (_, cos) = simd_sin_cos(x);
        cos
    }
}