// u_analytics/correlation.rs
1//! Correlation analysis.
2//!
3//! Pearson, Spearman, and Kendall correlation coefficients with p-values,
4//! correlation matrices, and Fisher z-transformation confidence intervals.
5//!
6//! # Examples
7//!
8//! ```
9//! use u_analytics::correlation::{pearson, spearman, kendall_tau_b};
10//!
11//! let x = [1.0, 2.0, 3.0, 4.0, 5.0];
12//! let y = [2.0, 4.0, 5.0, 4.0, 5.0];
13//!
14//! let p = pearson(&x, &y).unwrap();
15//! assert!(p.r > 0.7);
16//! assert!(p.p_value < 0.2);
17//!
18//! let s = spearman(&x, &y).unwrap();
19//! assert!(s.r > 0.7);
20//!
21//! let k = kendall_tau_b(&x, &y).unwrap();
22//! assert!(k.r > 0.5);
23//! ```
24
25use u_numflow::matrix::Matrix;
26use u_numflow::special;
27use u_numflow::stats;
28
/// Result of a correlation computation.
///
/// Returned by [`pearson`], [`spearman`], and [`kendall_tau_b`].
#[derive(Debug, Clone, Copy)]
pub struct CorrelationResult {
    /// Correlation coefficient in [-1, 1] (clamped by the producers).
    pub r: f64,
    /// Two-tailed p-value for testing H₀: ρ = 0.
    pub p_value: f64,
    /// Sample size (number of paired observations used).
    pub n: usize,
}
39
/// Confidence interval for a correlation coefficient.
///
/// Produced by [`correlation_ci`]; bounds are computed on the Fisher z scale
/// and back-transformed with tanh, so they always lie in (-1, 1).
#[derive(Debug, Clone, Copy)]
pub struct CorrelationCI {
    /// Lower bound of the confidence interval (on the r scale).
    pub lower: f64,
    /// Upper bound of the confidence interval (on the r scale).
    pub upper: f64,
    /// Confidence level (e.g. 0.95).
    pub confidence: f64,
}
50
51// ---------------------------------------------------------------------------
52// Pearson
53// ---------------------------------------------------------------------------
54
55/// Computes Pearson product-moment correlation coefficient and p-value.
56///
57/// # Algorithm
58///
59/// r = cov(x,y) / (σ_x · σ_y)
60///
61/// p-value via t-test: t = r·√(n-2) / √(1-r²), df = n-2.
62///
63/// # Returns
64///
65/// `None` if either slice has fewer than 3 elements, the slices differ in
66/// length, or either variable has zero variance.
67///
68/// # References
69///
70/// Pearson (1895). "Note on regression and inheritance in the case of
71/// two parents". Proceedings of the Royal Society of London, 58, 240–242.
72///
73/// # Examples
74///
75/// ```
76/// use u_analytics::correlation::pearson;
77///
78/// let x = [1.0, 2.0, 3.0, 4.0, 5.0];
79/// let y = [2.0, 4.0, 6.0, 8.0, 10.0];
80/// let result = pearson(&x, &y).unwrap();
81/// assert!((result.r - 1.0).abs() < 1e-10);
82/// ```
83pub fn pearson(x: &[f64], y: &[f64]) -> Option<CorrelationResult> {
84    let n = x.len();
85    if n < 3 || n != y.len() {
86        return None;
87    }
88
89    if x.iter().any(|v| !v.is_finite()) || y.iter().any(|v| !v.is_finite()) {
90        return None;
91    }
92
93    let cov = stats::covariance(x, y)?;
94    let sx = stats::std_dev(x)?;
95    let sy = stats::std_dev(y)?;
96
97    if sx < 1e-300 || sy < 1e-300 {
98        return None; // zero variance
99    }
100
101    let r = (cov / (sx * sy)).clamp(-1.0, 1.0);
102    let p_value = correlation_p_value(r, n);
103
104    Some(CorrelationResult { r, p_value, n })
105}
106
107// ---------------------------------------------------------------------------
108// Spearman
109// ---------------------------------------------------------------------------
110
111/// Computes Spearman rank correlation coefficient and p-value.
112///
113/// # Algorithm
114///
115/// Ranks both variables using the mid-rank method for ties, then computes
116/// Pearson correlation on the ranks. P-value uses the same t-test
117/// approximation as Pearson.
118///
119/// # Returns
120///
121/// `None` if fewer than 3 observations, slices differ in length, or inputs
122/// contain non-finite values.
123///
124/// # References
125///
126/// Spearman (1904). "The proof and measurement of association between two
127/// things". The American Journal of Psychology, 15(1), 72–101.
128///
129/// # Examples
130///
131/// ```
132/// use u_analytics::correlation::spearman;
133///
134/// let x = [1.0, 2.0, 3.0, 4.0, 5.0];
135/// let y = [5.0, 6.0, 7.0, 8.0, 7.0];
136/// let result = spearman(&x, &y).unwrap();
137/// assert!(result.r > 0.5);
138/// ```
139pub fn spearman(x: &[f64], y: &[f64]) -> Option<CorrelationResult> {
140    let n = x.len();
141    if n < 3 || n != y.len() {
142        return None;
143    }
144
145    if x.iter().any(|v| !v.is_finite()) || y.iter().any(|v| !v.is_finite()) {
146        return None;
147    }
148
149    let rx = rank_data(x);
150    let ry = rank_data(y);
151
152    // Pearson on ranks
153    let cov = stats::covariance(&rx, &ry)?;
154    let srx = stats::std_dev(&rx)?;
155    let sry = stats::std_dev(&ry)?;
156
157    if srx < 1e-300 || sry < 1e-300 {
158        return None;
159    }
160
161    let r = (cov / (srx * sry)).clamp(-1.0, 1.0);
162    let p_value = correlation_p_value(r, n);
163
164    Some(CorrelationResult { r, p_value, n })
165}
166
167// ---------------------------------------------------------------------------
168// Kendall tau-b
169// ---------------------------------------------------------------------------
170
171/// Computes Kendall's tau-b correlation coefficient with tie correction.
172///
173/// # Algorithm
174///
175/// τ_b = (C - D) / √[(n₀ - n₁)(n₀ - n₂)]
176///
177/// where C = concordant pairs, D = discordant pairs,
178/// n₀ = n(n-1)/2, n₁ = Σ tᵢ(tᵢ-1)/2 (ties in x), n₂ = Σ uⱼ(uⱼ-1)/2 (ties in y).
179///
180/// # Complexity
181///
182/// O(n²) naive enumeration. For n > 10,000 consider O(n log n) Knight's
183/// algorithm (not implemented — sufficient for u-insight's typical data sizes).
184///
185/// # Returns
186///
187/// `None` if fewer than 3 observations, slices differ in length, or inputs
188/// contain non-finite values.
189///
190/// # References
191///
192/// Kendall (1938). "A new measure of rank correlation".
193/// Biometrika, 30(1/2), 81–93.
194///
195/// # Examples
196///
197/// ```
198/// use u_analytics::correlation::kendall_tau_b;
199///
200/// let x = [1.0, 2.0, 3.0, 4.0, 5.0];
201/// let y = [1.0, 2.0, 3.0, 4.0, 5.0];
202/// let result = kendall_tau_b(&x, &y).unwrap();
203/// assert!((result.r - 1.0).abs() < 1e-10);
204/// ```
205pub fn kendall_tau_b(x: &[f64], y: &[f64]) -> Option<CorrelationResult> {
206    let n = x.len();
207    if n < 3 || n != y.len() {
208        return None;
209    }
210
211    if x.iter().any(|v| !v.is_finite()) || y.iter().any(|v| !v.is_finite()) {
212        return None;
213    }
214
215    let mut concordant: i64 = 0;
216    let mut discordant: i64 = 0;
217    let mut ties_x: i64 = 0;
218    let mut ties_y: i64 = 0;
219    let mut _ties_xy: i64 = 0;
220
221    for i in 0..n {
222        for j in (i + 1)..n {
223            let dx = x[i] - x[j];
224            let dy = y[i] - y[j];
225            let product = dx * dy;
226
227            if dx == 0.0 && dy == 0.0 {
228                _ties_xy += 1;
229                ties_x += 1;
230                ties_y += 1;
231            } else if dx == 0.0 {
232                ties_x += 1;
233            } else if dy == 0.0 {
234                ties_y += 1;
235            } else if product > 0.0 {
236                concordant += 1;
237            } else {
238                discordant += 1;
239            }
240        }
241    }
242
243    let n0 = (n as i64) * (n as i64 - 1) / 2;
244    let denom_sq = (n0 - ties_x) as f64 * (n0 - ties_y) as f64;
245
246    if denom_sq <= 0.0 {
247        return None; // all values tied in x or y
248    }
249
250    let tau = (concordant - discordant) as f64 / denom_sq.sqrt();
251    let tau = tau.clamp(-1.0, 1.0);
252
253    // Normal approximation for p-value (valid for n > ~10)
254    // Variance under H0: var(tau) = 2(2n+5) / (9n(n-1))
255    // With ties: var(S) = (v0 - vt - vu) / 18 + ...
256    // Simplified: use standard formula for tau-b
257    let nf = n as f64;
258    let v0 = nf * (nf - 1.0) * (2.0 * nf + 5.0);
259    let vt = compute_tie_variance_term(x);
260    let vu = compute_tie_variance_term(y);
261    let var_s = (v0 - vt - vu) / 18.0;
262
263    let p_value = if var_s > 0.0 {
264        let s = (concordant - discordant) as f64;
265        let z = s / var_s.sqrt();
266        2.0 * (1.0 - special::standard_normal_cdf(z.abs()))
267    } else {
268        1.0
269    };
270
271    Some(CorrelationResult { r: tau, p_value, n })
272}
273
274// ---------------------------------------------------------------------------
275// Fisher z-transformation
276// ---------------------------------------------------------------------------
277
/// Computes Fisher z-transformation: z = arctanh(r).
///
/// Maps r ∈ (-1, 1) to z ∈ (-∞, +∞); z is approximately normal with
/// variance 1/(n-3), which makes it suitable for confidence intervals and
/// hypothesis tests on correlations.
///
/// # Returns
///
/// `None` if r is not a finite value strictly inside (-1, 1).
///
/// # References
///
/// Fisher (1921). "On the probable error of a coefficient of correlation".
/// Metron, 1, 3–32.
pub fn fisher_z(r: f64) -> Option<f64> {
    let in_open_unit_interval = r.is_finite() && r > -1.0 && r < 1.0;
    in_open_unit_interval.then(|| r.atanh())
}
297
/// Inverse Fisher z-transformation: maps z back to r = tanh(z) ∈ (-1, 1).
pub fn fisher_z_inv(z: f64) -> f64 {
    f64::tanh(z)
}
302
303/// Computes confidence interval for a Pearson correlation coefficient
304/// using Fisher z-transformation.
305///
306/// # Arguments
307///
308/// * `r` — Pearson correlation coefficient
309/// * `n` — Sample size
310/// * `confidence` — Confidence level (e.g. 0.95)
311///
312/// # Returns
313///
314/// `None` if n < 4, r is not in (-1, 1), or confidence is not in (0, 1).
315///
316/// # Examples
317///
318/// ```
319/// use u_analytics::correlation::correlation_ci;
320///
321/// let ci = correlation_ci(0.8, 30, 0.95).unwrap();
322/// assert!(ci.lower < 0.8);
323/// assert!(ci.upper > 0.8);
324/// assert!(ci.lower > 0.0);
325/// assert!(ci.upper < 1.0);
326/// ```
327pub fn correlation_ci(r: f64, n: usize, confidence: f64) -> Option<CorrelationCI> {
328    if n < 4 || r <= -1.0 || r >= 1.0 || !r.is_finite() {
329        return None;
330    }
331    if confidence <= 0.0 || confidence >= 1.0 {
332        return None;
333    }
334
335    let z = r.atanh();
336    let se = 1.0 / ((n as f64 - 3.0).sqrt());
337    let alpha = 1.0 - confidence;
338    let z_crit = special::inverse_normal_cdf(1.0 - alpha / 2.0);
339
340    let z_lower = z - z_crit * se;
341    let z_upper = z + z_crit * se;
342
343    Some(CorrelationCI {
344        lower: z_lower.tanh(),
345        upper: z_upper.tanh(),
346        confidence,
347    })
348}
349
350// ---------------------------------------------------------------------------
351// Correlation Matrix
352// ---------------------------------------------------------------------------
353
354/// Computes a pairwise Pearson correlation matrix.
355///
356/// # Arguments
357///
358/// * `variables` — Slice of variable data. Each inner slice is one variable's
359///   observations. All must have the same length.
360///
361/// # Returns
362///
363/// A symmetric n×n `Matrix` where entry (i,j) is the Pearson r between
364/// variables i and j. Diagonal is 1.0. Returns `None` if fewer than 2
365/// variables, observations < 3, or variable lengths differ.
366///
367/// # Examples
368///
369/// ```
370/// use u_analytics::correlation::correlation_matrix;
371///
372/// let x = [1.0, 2.0, 3.0, 4.0, 5.0];
373/// let y = [2.0, 4.0, 6.0, 8.0, 10.0];
374/// let z = [5.0, 4.0, 3.0, 2.0, 1.0];
375/// let mat = correlation_matrix(&[&x, &y, &z]).unwrap();
376/// assert!((mat.get(0, 1) - 1.0).abs() < 1e-10);   // x,y perfectly correlated
377/// assert!((mat.get(0, 2) + 1.0).abs() < 1e-10);   // x,z perfectly anti-correlated
378/// ```
379pub fn correlation_matrix(variables: &[&[f64]]) -> Option<Matrix> {
380    let p = variables.len();
381    if p < 2 {
382        return None;
383    }
384    let n = variables[0].len();
385    if n < 3 {
386        return None;
387    }
388    for v in variables {
389        if v.len() != n {
390            return None;
391        }
392    }
393
394    // Pre-compute means and std devs
395    let mut sds = Vec::with_capacity(p);
396    for v in variables {
397        let m = stats::mean(v)?;
398        if !m.is_finite() {
399            return None;
400        }
401        let s = stats::std_dev(v)?;
402        if !s.is_finite() || s < 1e-300 {
403            return None;
404        }
405        sds.push(s);
406    }
407
408    let mut data = vec![0.0; p * p];
409
410    for i in 0..p {
411        data[i * p + i] = 1.0; // diagonal
412        for j in (i + 1)..p {
413            let cov = stats::covariance(variables[i], variables[j])?;
414            let r = (cov / (sds[i] * sds[j])).clamp(-1.0, 1.0);
415            data[i * p + j] = r;
416            data[j * p + i] = r;
417        }
418    }
419
420    Matrix::new(p, p, data).ok()
421}
422
423/// Computes a pairwise Spearman rank correlation matrix.
424///
425/// Same as [`correlation_matrix`] but uses rank-transformed data.
426pub fn spearman_matrix(variables: &[&[f64]]) -> Option<Matrix> {
427    let p = variables.len();
428    if p < 2 {
429        return None;
430    }
431    let n = variables[0].len();
432    if n < 3 {
433        return None;
434    }
435    for v in variables {
436        if v.len() != n || v.iter().any(|x| !x.is_finite()) {
437            return None;
438        }
439    }
440
441    // Rank-transform all variables
442    let ranked: Vec<Vec<f64>> = variables.iter().map(|v| rank_data(v)).collect();
443    let ranked_refs: Vec<&[f64]> = ranked.iter().map(|v| v.as_slice()).collect();
444
445    correlation_matrix(&ranked_refs)
446}
447
448// ---------------------------------------------------------------------------
449// Internal helpers
450// ---------------------------------------------------------------------------
451
452/// Computes two-tailed p-value for a correlation coefficient via t-test.
453///
454/// t = r·√(n-2) / √(1-r²), df = n-2.
455fn correlation_p_value(r: f64, n: usize) -> f64 {
456    if n < 3 {
457        return 1.0;
458    }
459    let df = (n - 2) as f64;
460    let r2 = r * r;
461
462    // Handle r ≈ ±1.0 (denominator → 0)
463    if r2 >= 1.0 - 1e-15 {
464        return 0.0; // perfect correlation
465    }
466
467    let t = r * (df / (1.0 - r2)).sqrt();
468    2.0 * (1.0 - special::t_distribution_cdf(t.abs(), df))
469}
470
/// Ranks data using the mid-rank method for ties.
///
/// Returns a Vec of 1-based ranks in input order; tied values all receive
/// the average of the ranks they span.
fn rank_data(data: &[f64]) -> Vec<f64> {
    let n = data.len();

    // Sort indices by value (stable, so equal values keep input order).
    let mut order: Vec<usize> = (0..n).collect();
    order.sort_by(|&a, &b| data[a].partial_cmp(&data[b]).unwrap_or(std::cmp::Ordering::Equal));

    let mut ranks = vec![0.0; n];
    let mut start = 0;
    while start < n {
        // Extend the run of values tied with the run's first element.
        let mut end = start + 1;
        while end < n && (data[order[end]] - data[order[start]]).abs() < 1e-300 {
            end += 1;
        }

        // Mid-rank: the run occupies 1-based ranks start+1 ..= end,
        // whose average is (start + end + 1) / 2.
        let shared_rank = (start + end + 1) as f64 / 2.0;
        for &original_index in &order[start..end] {
            ranks[original_index] = shared_rank;
        }

        start = end;
    }

    ranks
}
497
/// Computes the tie variance term for Kendall's tau-b.
///
/// Returns Σ tᵢ(tᵢ-1)(2tᵢ+5) summed over each group of tᵢ tied values
/// (groups of size 1 contribute nothing).
fn compute_tie_variance_term(data: &[f64]) -> f64 {
    // Sort a copy so tied values become adjacent runs.
    let mut sorted = data.to_vec();
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

    let n = sorted.len();
    let mut total = 0.0;
    let mut run_start = 0;
    while run_start < n {
        let mut run_end = run_start + 1;
        while run_end < n && (sorted[run_end] - sorted[run_start]).abs() < 1e-300 {
            run_end += 1;
        }
        let t = (run_end - run_start) as f64;
        if t > 1.0 {
            total += t * (t - 1.0) * (2.0 * t + 5.0);
        }
        run_start = run_end;
    }
    total
}
520
521// ---------------------------------------------------------------------------
522// Autocorrelation (ACF) and Partial Autocorrelation (PACF)
523// ---------------------------------------------------------------------------
524
/// Result of autocorrelation analysis.
///
/// Produced by [`acf`].
#[derive(Debug, Clone)]
pub struct AcfResult {
    /// Autocorrelation values for lags 0, 1, ..., max_lag.
    /// `acf[0]` is always 1.0.
    pub acf: Vec<f64>,
    /// 95% confidence threshold = 1.96 / √n.
    /// Values with |acf\[k\]| > threshold (for k > 0) are significant.
    pub confidence_threshold: f64,
}
535
/// Result of partial autocorrelation analysis.
///
/// Produced by [`pacf`]. Unlike the ACF, lag 0 is omitted (it is trivially 1).
#[derive(Debug, Clone)]
pub struct PacfResult {
    /// Partial autocorrelation values for lags 1, ..., max_lag.
    pub pacf: Vec<f64>,
    /// 95% confidence threshold = 1.96 / √n.
    pub confidence_threshold: f64,
}
544
545/// Compute the sample autocorrelation function (ACF).
546///
547/// Uses the biased estimator (denominator N, not N-k) which guarantees a
548/// positive-semidefinite autocovariance matrix.
549///
550/// # Arguments
551///
552/// * `data` — time series observations (at least 2).
553/// * `max_lag` — maximum lag to compute. Clamped to `n - 1`.
554///
555/// # Returns
556///
557/// `None` if `data.len() < 2`, `max_lag == 0`, or data contains non-finite values.
558///
559/// # References
560///
561/// - Box & Jenkins (1976). *Time Series Analysis: Forecasting and Control*.
562///
563/// # Examples
564///
565/// ```
566/// use u_analytics::correlation::acf;
567///
568/// let data = [1.0, 2.0, 3.0, 4.0, 5.0, 4.0, 3.0, 2.0, 1.0, 2.0];
569/// let r = acf(&data, 5).unwrap();
570/// assert!((r.acf[0] - 1.0).abs() < 1e-10); // lag 0 is always 1
571/// assert!(r.acf.len() == 6); // lags 0..=5
572/// ```
573pub fn acf(data: &[f64], max_lag: usize) -> Option<AcfResult> {
574    let n = data.len();
575    if n < 2 || max_lag == 0 || data.iter().any(|v| !v.is_finite()) {
576        return None;
577    }
578    let max_lag = max_lag.min(n - 1);
579    let nf = n as f64;
580
581    let mean = data.iter().sum::<f64>() / nf;
582
583    // Lag-0 autocovariance (biased variance)
584    let c0: f64 = data.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / nf;
585
586    if c0 <= 0.0 {
587        return None; // constant series
588    }
589
590    let mut acf_vals = Vec::with_capacity(max_lag + 1);
591    acf_vals.push(1.0); // r(0) = 1
592
593    for lag in 1..=max_lag {
594        let ck: f64 = data[..n - lag]
595            .iter()
596            .zip(&data[lag..])
597            .map(|(&xt, &xt_h)| (xt - mean) * (xt_h - mean))
598            .sum::<f64>()
599            / nf;
600        acf_vals.push(ck / c0);
601    }
602
603    let threshold = 1.96 / nf.sqrt();
604
605    Some(AcfResult {
606        acf: acf_vals,
607        confidence_threshold: threshold,
608    })
609}
610
/// Compute the sample partial autocorrelation function (PACF) via
/// Durbin-Levinson recursion.
///
/// PACF at lag *h* measures the correlation between xₜ and xₜ₊ₕ after
/// removing the linear effect of intermediate lags.
///
/// # Arguments
///
/// * `data` — time series observations (at least 3).
/// * `max_lag` — maximum lag to compute. Clamped to `n - 1`.
///
/// # Returns
///
/// `None` if `data.len() < 3`, `max_lag == 0`, or data contains non-finite values.
///
/// # References
///
/// - Durbin (1960). "The fitting of time-series models". Revue de l'Institut
///   International de Statistique, 28(3), 233–244.
///
/// # Examples
///
/// ```
/// use u_analytics::correlation::pacf;
///
/// let data = [1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0, 5.0, 7.0];
/// let r = pacf(&data, 4).unwrap();
/// assert!(r.pacf.len() == 4); // lags 1..=4
/// // Lag 1 PACF equals lag 1 ACF
/// ```
pub fn pacf(data: &[f64], max_lag: usize) -> Option<PacfResult> {
    let n = data.len();
    if n < 3 || max_lag == 0 {
        return None;
    }

    // First compute ACF; this also rejects non-finite and constant data.
    let acf_result = acf(data, max_lag)?;
    let rho = &acf_result.acf;
    let max_lag = rho.len() - 1; // actual max lag (may be clamped to n - 1 by `acf`)

    if max_lag == 0 {
        return None;
    }

    let mut pacf_vals = Vec::with_capacity(max_lag);

    // φ₁₁ = ρ(1): at lag 1 there are no intermediate lags to partial out.
    pacf_vals.push(rho[1]);

    if max_lag == 1 {
        return Some(PacfResult {
            pacf: pacf_vals,
            confidence_threshold: 1.96 / (n as f64).sqrt(),
        });
    }

    // Durbin-Levinson recursion.
    // Invariant: entering iteration h, `phi_prev[k]` holds φ_{h-1, k+1}
    // (0-based storage of the order-(h-1) prediction coefficients).
    let mut phi_prev = vec![rho[1]];

    for h in 2..=max_lag {
        // Numerator: ρ(h) - Σⱼ₌₁ᴴ⁻¹ φ_{h-1,j} × ρ(h-j)
        let mut num = rho[h];
        for j in 0..h - 1 {
            num -= phi_prev[j] * rho[h - 1 - j];
        }

        // Denominator: 1 - Σⱼ₌₁ᴴ⁻¹ φ_{h-1,j} × ρ(j)
        let mut den = 1.0;
        for j in 0..h - 1 {
            den -= phi_prev[j] * rho[j + 1];
        }

        // Guard against a numerically singular denominator (near-perfectly
        // predictable series); report zero partial correlation in that case.
        let phi_hh = if den.abs() > 1e-14 { num / den } else { 0.0 };

        // Coefficient update (1-based j): φ_{h,j} = φ_{h-1,j} - φₕₕ × φ_{h-1,h-j};
        // with 0-based index j the mirrored element sits at h - 2 - j.
        let mut phi_new = Vec::with_capacity(h);
        for j in 0..h - 1 {
            phi_new.push(phi_prev[j] - phi_hh * phi_prev[h - 2 - j]);
        }
        phi_new.push(phi_hh);

        pacf_vals.push(phi_hh);
        phi_prev = phi_new;
    }

    Some(PacfResult {
        pacf: pacf_vals,
        confidence_threshold: 1.96 / (n as f64).sqrt(),
    })
}
702
703#[cfg(test)]
704mod tests {
705    use super::*;
706
707    // -----------------------------------------------------------------------
708    // Pearson tests
709    // -----------------------------------------------------------------------
710
711    #[test]
712    fn pearson_perfect_positive() {
713        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
714        let y = [2.0, 4.0, 6.0, 8.0, 10.0];
715        let result = pearson(&x, &y).expect("should compute");
716        assert!((result.r - 1.0).abs() < 1e-10);
717        assert!(result.p_value < 1e-10);
718    }
719
720    #[test]
721    fn pearson_perfect_negative() {
722        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
723        let y = [10.0, 8.0, 6.0, 4.0, 2.0];
724        let result = pearson(&x, &y).expect("should compute");
725        assert!((result.r + 1.0).abs() < 1e-10);
726        assert!(result.p_value < 1e-10);
727    }
728
    #[test]
    fn pearson_uncorrelated() {
        // Weak association by construction: deviations give Σ(x-x̄)(y-ȳ) = 4,
        // Sxx = 10, Syy = 16, so r = 4/√160 ≈ 0.316 — well below the 0.5 bound.
        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
        let y = [3.0, 1.0, 5.0, 1.0, 5.0];
        let result = pearson(&x, &y).expect("should compute");
        assert!(result.r.abs() < 0.5);
    }
737
738    #[test]
739    fn pearson_known_value() {
740        // Height (inches) vs GPA example
741        let x = [68.0, 71.0, 62.0, 75.0, 58.0, 60.0, 67.0, 68.0, 71.0, 69.0];
742        let y = [4.1, 4.6, 3.8, 4.4, 3.2, 3.1, 3.8, 4.1, 4.3, 3.7];
743        let result = pearson(&x, &y).expect("should compute");
744        // Computed: r ≈ 0.8816
745        assert!((result.r - 0.8816).abs() < 0.01, "r = {}", result.r);
746    }
747
748    /// Verifies exact Pearson r numeric reference from the spec.
749    ///
750    /// x = [1,2,3,4,5], y = [2,4,5,4,5]
751    /// Σ(xᵢ-x̄)(yᵢ-ȳ) = 6
752    /// Σ(xᵢ-x̄)² = 10, Σ(yᵢ-ȳ)² = 6
753    /// r = 6 / √(10·6) = 6/√60 = 6/7.7460 ≈ 0.7746
754    ///
755    /// Reference: Pearson (1895), Proc. Royal Society London, 58, 240–242.
756    #[test]
757    fn pearson_numeric_reference() {
758        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
759        let y = [2.0, 4.0, 5.0, 4.0, 5.0];
760        let result = pearson(&x, &y).expect("should compute");
761
762        let expected_r = 6.0_f64 / (60.0_f64).sqrt(); // = 0.774596...
763        assert!(
764            (result.r - expected_r).abs() < 1e-3,
765            "r expected {:.6}, got {}",
766            expected_r,
767            result.r
768        );
769        assert!(
770            (result.r - 0.7746).abs() < 1e-3,
771            "r expected ≈0.7746, got {}",
772            result.r
773        );
774    }
775
776    #[test]
777    fn pearson_insufficient_data() {
778        assert!(pearson(&[1.0, 2.0], &[3.0, 4.0]).is_none());
779        assert!(pearson(&[1.0], &[2.0]).is_none());
780    }
781
782    #[test]
783    fn pearson_length_mismatch() {
784        assert!(pearson(&[1.0, 2.0, 3.0], &[4.0, 5.0]).is_none());
785    }
786
787    #[test]
788    fn pearson_zero_variance() {
789        assert!(pearson(&[5.0, 5.0, 5.0], &[1.0, 2.0, 3.0]).is_none());
790    }
791
792    #[test]
793    fn pearson_nan_input() {
794        assert!(pearson(&[1.0, f64::NAN, 3.0], &[4.0, 5.0, 6.0]).is_none());
795    }
796
797    // -----------------------------------------------------------------------
798    // Spearman tests
799    // -----------------------------------------------------------------------
800
801    #[test]
802    fn spearman_perfect_monotone() {
803        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
804        let y = [1.0, 4.0, 9.0, 16.0, 25.0]; // monotone increasing
805        let result = spearman(&x, &y).expect("should compute");
806        assert!((result.r - 1.0).abs() < 1e-10);
807    }
808
809    #[test]
810    fn spearman_perfect_negative() {
811        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
812        let y = [5.0, 4.0, 3.0, 2.0, 1.0];
813        let result = spearman(&x, &y).expect("should compute");
814        assert!((result.r + 1.0).abs() < 1e-10);
815    }
816
817    #[test]
818    fn spearman_with_ties() {
819        let x = [1.0, 2.0, 2.0, 4.0, 5.0];
820        let y = [1.0, 3.0, 3.0, 4.0, 5.0];
821        let result = spearman(&x, &y).expect("should compute");
822        assert!(result.r > 0.9); // strong positive
823    }
824
825    #[test]
826    fn spearman_nonlinear_monotone() {
827        // Spearman should be 1.0 for any monotone function
828        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
829        let y: Vec<f64> = x.iter().map(|&v: &f64| v.powi(3)).collect();
830        let result = spearman(&x, &y).expect("should compute");
831        assert!((result.r - 1.0).abs() < 1e-10);
832    }
833
834    // -----------------------------------------------------------------------
835    // Kendall tests
836    // -----------------------------------------------------------------------
837
838    #[test]
839    fn kendall_perfect_concordance() {
840        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
841        let y = [1.0, 2.0, 3.0, 4.0, 5.0];
842        let result = kendall_tau_b(&x, &y).expect("should compute");
843        assert!((result.r - 1.0).abs() < 1e-10);
844    }
845
846    #[test]
847    fn kendall_perfect_discordance() {
848        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
849        let y = [5.0, 4.0, 3.0, 2.0, 1.0];
850        let result = kendall_tau_b(&x, &y).expect("should compute");
851        assert!((result.r + 1.0).abs() < 1e-10);
852    }
853
    #[test]
    fn kendall_known_value() {
        // x is strictly increasing and y = [3,4,1,2,5] yields 6 concordant
        // and 4 discordant pairs out of n0 = 10, so tau = (6-4)/10 = 0.2.
        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
        let y = [3.0, 4.0, 1.0, 2.0, 5.0];
        let result = kendall_tau_b(&x, &y).expect("should compute");
        assert!((result.r - 0.2).abs() < 0.01);
    }
862
863    #[test]
864    fn kendall_with_ties() {
865        let x = [1.0, 2.0, 2.0, 4.0, 5.0];
866        let y = [1.0, 3.0, 3.0, 4.0, 5.0];
867        let result = kendall_tau_b(&x, &y).expect("should compute");
868        assert!(result.r > 0.8);
869    }
870
871    #[test]
872    fn kendall_all_ties() {
873        let x = [1.0, 1.0, 1.0];
874        let y = [2.0, 2.0, 2.0];
875        assert!(kendall_tau_b(&x, &y).is_none()); // denom = 0
876    }
877
878    // -----------------------------------------------------------------------
879    // Fisher z tests
880    // -----------------------------------------------------------------------
881
882    #[test]
883    fn fisher_z_zero() {
884        let z = fisher_z(0.0).expect("should compute");
885        assert!(z.abs() < 1e-15);
886    }
887
888    #[test]
889    fn fisher_z_roundtrip() {
890        for &r in &[0.0, 0.3, 0.5, 0.8, 0.95, -0.5, -0.95] {
891            let z = fisher_z(r).expect("should compute");
892            let r_back = fisher_z_inv(z);
893            assert!((r - r_back).abs() < 1e-10, "Roundtrip failed for r={r}");
894        }
895    }
896
897    #[test]
898    fn fisher_z_boundary() {
899        assert!(fisher_z(1.0).is_none());
900        assert!(fisher_z(-1.0).is_none());
901        assert!(fisher_z(1.5).is_none());
902        assert!(fisher_z(f64::NAN).is_none());
903    }
904
905    // -----------------------------------------------------------------------
906    // Confidence interval tests
907    // -----------------------------------------------------------------------
908
909    #[test]
910    fn ci_contains_r() {
911        let ci = correlation_ci(0.6, 50, 0.95).expect("should compute");
912        assert!(ci.lower < 0.6);
913        assert!(ci.upper > 0.6);
914        assert!(ci.lower > -1.0);
915        assert!(ci.upper < 1.0);
916    }
917
918    #[test]
919    fn ci_wider_at_higher_confidence() {
920        let ci_95 = correlation_ci(0.5, 30, 0.95).expect("should compute");
921        let ci_99 = correlation_ci(0.5, 30, 0.99).expect("should compute");
922        assert!(ci_99.upper - ci_99.lower > ci_95.upper - ci_95.lower);
923    }
924
925    #[test]
926    fn ci_narrower_with_more_data() {
927        let ci_30 = correlation_ci(0.5, 30, 0.95).expect("should compute");
928        let ci_100 = correlation_ci(0.5, 100, 0.95).expect("should compute");
929        assert!(ci_100.upper - ci_100.lower < ci_30.upper - ci_30.lower);
930    }
931
932    #[test]
933    fn ci_edge_cases() {
934        assert!(correlation_ci(0.5, 3, 0.95).is_none()); // n too small
935        assert!(correlation_ci(1.0, 30, 0.95).is_none()); // r = 1
936        assert!(correlation_ci(0.5, 30, 0.0).is_none()); // invalid confidence
937        assert!(correlation_ci(0.5, 30, 1.0).is_none());
938    }
939
940    // -----------------------------------------------------------------------
941    // Correlation matrix tests
942    // -----------------------------------------------------------------------
943
944    #[test]
945    fn corr_matrix_identity() {
946        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
947        let y = [2.0, 4.0, 6.0, 8.0, 10.0];
948        let z = [5.0, 4.0, 3.0, 2.0, 1.0];
949        let mat = correlation_matrix(&[&x, &y, &z]).expect("should compute");
950
951        // Diagonal = 1
952        assert!((mat.get(0, 0) - 1.0).abs() < 1e-10);
953        assert!((mat.get(1, 1) - 1.0).abs() < 1e-10);
954        assert!((mat.get(2, 2) - 1.0).abs() < 1e-10);
955
956        // x,y perfectly correlated
957        assert!((mat.get(0, 1) - 1.0).abs() < 1e-10);
958
959        // x,z perfectly anti-correlated
960        assert!((mat.get(0, 2) + 1.0).abs() < 1e-10);
961
962        // Symmetric
963        assert!((mat.get(0, 1) - mat.get(1, 0)).abs() < 1e-15);
964        assert!((mat.get(0, 2) - mat.get(2, 0)).abs() < 1e-15);
965    }
966
967    #[test]
968    fn corr_matrix_insufficient_variables() {
969        let x = [1.0, 2.0, 3.0];
970        assert!(correlation_matrix(&[&x]).is_none());
971    }
972
973    #[test]
974    fn corr_matrix_length_mismatch() {
975        let x = [1.0, 2.0, 3.0];
976        let y = [4.0, 5.0];
977        assert!(correlation_matrix(&[&x, &y]).is_none());
978    }
979
980    #[test]
981    fn spearman_matrix_basic() {
982        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
983        let y = [1.0, 4.0, 9.0, 16.0, 25.0]; // monotone
984        let z = [5.0, 4.0, 3.0, 2.0, 1.0];
985        let mat = spearman_matrix(&[&x, &y, &z]).expect("should compute");
986
987        // x and y have perfect monotone relationship
988        assert!((mat.get(0, 1) - 1.0).abs() < 1e-10);
989        // x and z are perfectly anti-monotone
990        assert!((mat.get(0, 2) + 1.0).abs() < 1e-10);
991    }
992
993    // -----------------------------------------------------------------------
994    // Rank data tests
995    // -----------------------------------------------------------------------
996
997    #[test]
998    fn rank_no_ties() {
999        let ranks = rank_data(&[3.0, 1.0, 2.0]);
1000        assert!((ranks[0] - 3.0).abs() < 1e-10); // 3.0 → rank 3
1001        assert!((ranks[1] - 1.0).abs() < 1e-10); // 1.0 → rank 1
1002        assert!((ranks[2] - 2.0).abs() < 1e-10); // 2.0 → rank 2
1003    }
1004
1005    #[test]
1006    fn rank_with_ties() {
1007        let ranks = rank_data(&[1.0, 2.0, 2.0, 4.0]);
1008        assert!((ranks[0] - 1.0).abs() < 1e-10);
1009        assert!((ranks[1] - 2.5).abs() < 1e-10); // average of ranks 2,3
1010        assert!((ranks[2] - 2.5).abs() < 1e-10);
1011        assert!((ranks[3] - 4.0).abs() < 1e-10);
1012    }
1013
1014    #[test]
1015    fn rank_all_same() {
1016        let ranks = rank_data(&[5.0, 5.0, 5.0]);
1017        assert!((ranks[0] - 2.0).abs() < 1e-10); // average of 1,2,3
1018        assert!((ranks[1] - 2.0).abs() < 1e-10);
1019        assert!((ranks[2] - 2.0).abs() < 1e-10);
1020    }
1021
1022    // -----------------------------------------------------------------------
1023    // P-value tests
1024    // -----------------------------------------------------------------------
1025
1026    #[test]
1027    fn pvalue_significant() {
1028        // Large sample, strong correlation → very small p-value
1029        let x: Vec<f64> = (0..100).map(|i| i as f64).collect();
1030        let y: Vec<f64> = x.iter().map(|&v| v * 2.0 + 1.0).collect();
1031        let result = pearson(&x, &y).expect("should compute");
1032        assert!(result.p_value < 1e-10);
1033    }
1034
1035    #[test]
1036    fn pvalue_not_significant() {
1037        // Carefully constructed to have very low correlation
1038        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
1039        let y = [5.0, 1.0, 3.0, 5.0, 1.0];
1040        let result = pearson(&x, &y).expect("should compute");
1041        assert!(result.p_value > 0.3); // not significant
1042    }
1043
1044    // -----------------------------------------------------------------------
1045    // ACF tests
1046    // -----------------------------------------------------------------------
1047
1048    #[test]
1049    fn acf_lag_zero_is_one() {
1050        let data = [1.0, 3.0, 2.0, 5.0, 4.0, 7.0, 6.0, 8.0];
1051        let r = acf(&data, 3).expect("should compute");
1052        assert!((r.acf[0] - 1.0).abs() < 1e-10);
1053    }
1054
1055    #[test]
1056    fn acf_linear_trend() {
1057        // Linear trend → strong positive autocorrelation at low lags
1058        let data: Vec<f64> = (0..20).map(|i| i as f64).collect();
1059        let r = acf(&data, 5).expect("should compute");
1060        assert!(r.acf[1] > 0.8, "lag-1 ACF for linear trend should be high");
1061        assert!(r.acf.len() == 6);
1062    }
1063
1064    #[test]
1065    fn acf_alternating() {
1066        // Alternating series → negative lag-1 autocorrelation
1067        let data: Vec<f64> = (0..30)
1068            .map(|i| if i % 2 == 0 { 1.0 } else { -1.0 })
1069            .collect();
1070        let r = acf(&data, 5).expect("should compute");
1071        assert!(r.acf[1] < -0.8, "alternating → negative lag-1 ACF");
1072        assert!(r.acf[2] > 0.8, "alternating → positive lag-2 ACF");
1073    }
1074
1075    #[test]
1076    fn acf_white_noise_threshold() {
1077        let data: Vec<f64> = (0..100).map(|i| ((i * 7 + 3) % 13) as f64).collect();
1078        let r = acf(&data, 10).expect("should compute");
1079        // Threshold should be ~0.196 for n=100
1080        assert!((r.confidence_threshold - 1.96 / 10.0).abs() < 0.01);
1081    }
1082
1083    #[test]
1084    fn acf_edge_cases() {
1085        assert!(acf(&[1.0], 3).is_none()); // too short
1086        assert!(acf(&[1.0, 2.0], 0).is_none()); // max_lag 0
1087        assert!(acf(&[5.0, 5.0, 5.0, 5.0], 2).is_none()); // constant
1088        assert!(acf(&[1.0, f64::NAN, 3.0], 1).is_none()); // NaN
1089    }
1090
1091    #[test]
1092    fn acf_max_lag_clamped() {
1093        let data = [1.0, 2.0, 3.0, 4.0, 5.0];
1094        let r = acf(&data, 100).expect("should compute");
1095        assert_eq!(r.acf.len(), 5); // clamped to n-1 = 4, so lags 0..=4
1096    }
1097
1098    // -----------------------------------------------------------------------
1099    // PACF tests
1100    // -----------------------------------------------------------------------
1101
1102    #[test]
1103    fn pacf_lag1_equals_acf_lag1() {
1104        let data = [1.0, 3.0, 2.0, 5.0, 4.0, 7.0, 6.0, 8.0, 5.0, 3.0];
1105        let acf_r = acf(&data, 5).expect("should compute ACF");
1106        let pacf_r = pacf(&data, 5).expect("should compute PACF");
1107        assert!(
1108            (pacf_r.pacf[0] - acf_r.acf[1]).abs() < 1e-10,
1109            "PACF[1] should equal ACF[1]: {} vs {}",
1110            pacf_r.pacf[0],
1111            acf_r.acf[1]
1112        );
1113    }
1114
1115    #[test]
1116    fn pacf_linear_trend() {
1117        let data: Vec<f64> = (0..20).map(|i| i as f64).collect();
1118        let r = pacf(&data, 5).expect("should compute");
1119        // AR(1)-like: strong lag-1 PACF, weaker at higher lags
1120        assert!(r.pacf[0].abs() > 0.5, "lag-1 PACF should be strong");
1121    }
1122
1123    #[test]
1124    fn pacf_bounded() {
1125        let data: Vec<f64> = (0..30).map(|i| (i as f64 * 0.5).sin()).collect();
1126        let r = pacf(&data, 10).expect("should compute");
1127        for (i, &p) in r.pacf.iter().enumerate() {
1128            assert!(
1129                (-1.0..=1.0).contains(&p),
1130                "PACF[{}] = {} out of [-1, 1]",
1131                i + 1,
1132                p
1133            );
1134        }
1135    }
1136
1137    #[test]
1138    fn pacf_edge_cases() {
1139        assert!(pacf(&[1.0, 2.0], 3).is_none()); // too short (< 3)
1140        assert!(pacf(&[1.0, 2.0, 3.0], 0).is_none()); // max_lag 0
1141    }
1142}
1143
#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    // Strategy: a vector of bounded finite values, length in [min_len, max_len].
    fn bounded_vec(min_len: usize, max_len: usize) -> BoxedStrategy<Vec<f64>> {
        proptest::collection::vec(-1e6_f64..1e6, min_len..=max_len).boxed()
    }

    // Strategy: two independently drawn vectors of identical length, used by
    // every pairwise-correlation property below.
    fn paired_vecs(min_len: usize, max_len: usize) -> BoxedStrategy<(Vec<f64>, Vec<f64>)> {
        bounded_vec(min_len, max_len)
            .prop_flat_map(|x| {
                let n = x.len();
                (Just(x), proptest::collection::vec(-1e6_f64..1e6, n..=n))
            })
            .boxed()
    }

    proptest! {
        // r must lie in [-1, 1] and the p-value in [0, 1] for any input.
        #[test]
        fn pearson_bounded((x, y) in paired_vecs(5, 50)) {
            if let Some(result) = pearson(&x, &y) {
                prop_assert!((-1.0..=1.0).contains(&result.r), "r out of bounds: {}", result.r);
                prop_assert!((0.0..=1.0).contains(&result.p_value), "p out of bounds: {}", result.p_value);
            }
        }

        #[test]
        fn spearman_bounded((x, y) in paired_vecs(5, 50)) {
            if let Some(result) = spearman(&x, &y) {
                prop_assert!((-1.0..=1.0).contains(&result.r), "r out of bounds: {}", result.r);
                prop_assert!((0.0..=1.0).contains(&result.p_value), "p out of bounds: {}", result.p_value);
            }
        }

        #[test]
        fn kendall_bounded((x, y) in paired_vecs(5, 30)) {
            if let Some(result) = kendall_tau_b(&x, &y) {
                prop_assert!((-1.0..=1.0).contains(&result.r), "tau out of bounds: {}", result.r);
                prop_assert!((0.0..=1.0).contains(&result.p_value), "p out of bounds: {}", result.p_value);
            }
        }

        // pearson(x, y) and pearson(y, x) must agree, including on None.
        #[test]
        fn pearson_symmetric((x, y) in paired_vecs(5, 50)) {
            match (pearson(&x, &y), pearson(&y, &x)) {
                (Some(a), Some(b)) => {
                    prop_assert!((a.r - b.r).abs() < 1e-10, "not symmetric: {} vs {}", a.r, b.r);
                }
                (None, None) => {}
                _ => prop_assert!(false, "one is None but not the other"),
            }
        }

        // fisher_z_inv must invert fisher_z everywhere on (-1, 1).
        #[test]
        fn fisher_z_roundtrip_prop(r in -0.99_f64..0.99) {
            let z = fisher_z(r).expect("should compute");
            let r_back = fisher_z_inv(z);
            prop_assert!((r - r_back).abs() < 1e-10);
        }

        // CIs must be well-formed: ordered bounds, contained in [-1, 1].
        #[test]
        fn ci_contains_true_r(r in -0.99_f64..0.99, n in 10_usize..200) {
            let ci = correlation_ci(r, n, 0.95).expect("should compute");
            prop_assert!(ci.lower < ci.upper, "CI inverted");
            prop_assert!(ci.lower >= -1.0 && ci.upper <= 1.0, "CI out of bounds");
        }

        // ACF values stay in [-1, 1] and lag 0 is exactly 1.
        #[test]
        fn acf_bounded_prop(data in proptest::collection::vec(-1e3_f64..1e3, 5..=50)) {
            if let Some(r) = acf(&data, 10) {
                prop_assert!((r.acf[0] - 1.0).abs() < 1e-10, "ACF[0] must be 1.0");
                for (i, &v) in r.acf.iter().enumerate() {
                    prop_assert!((-1.0..=1.0).contains(&v), "ACF[{i}] = {v} out of [-1,1]");
                }
            }
        }

        // PACF values stay in [-1, 1], allowing a small numerical tolerance.
        #[test]
        fn pacf_bounded_prop(data in proptest::collection::vec(-1e3_f64..1e3, 5..=50)) {
            if let Some(r) = pacf(&data, 5) {
                for (i, &v) in r.pacf.iter().enumerate() {
                    prop_assert!((-1.0 - 1e-10..=1.0 + 1e-10).contains(&v), "PACF[{i}] = {v} out of bounds");
                }
            }
        }
    }
}