cyanea_stats/
popgen.rs

1//! Population genetics — allele frequencies, HWE, Fst, diversity, Tajima's D, LD, genotype PCA.
2//!
3//! All functions operate on 012-encoded genotype data (PLINK convention):
4//! - `0` = homozygous reference
5//! - `1` = heterozygous
6//! - `2` = homozygous alternate
7//! - `None` (or `255` for u8 variants) = missing
8//!
9//! No VCF dependency — genotypes are plain slices.
10
11// Genetics nomenclature uses mixed-case allele names (A/a, B/b) — suppress warnings.
12#![allow(non_snake_case)]
13
14use cyanea_core::{CyaneaError, Result};
15
16use crate::distribution::{ChiSquared, Distribution};
17
18// ── Result types ─────────────────────────────────────────────────────────
19
20/// Allele frequency summary for a single site.
21#[derive(Debug, Clone)]
22pub struct AlleleFrequencies {
23    /// Reference allele frequency.
24    pub freq_ref: f64,
25    /// Alternate allele frequency.
26    pub freq_alt: f64,
27    /// Total non-missing allele count (2 × non-missing individuals).
28    pub allele_count: usize,
29    /// Number of missing genotypes.
30    pub missing_count: usize,
31    /// Observed heterozygosity (proportion of hets among non-missing).
32    pub observed_het: f64,
33    /// Expected heterozygosity under HWE (2pq).
34    pub expected_het: f64,
35}
36
37/// Hardy-Weinberg equilibrium test result.
38#[derive(Debug, Clone)]
39pub struct HweResult {
40    /// Chi-squared statistic (1 df).
41    pub chi_squared: f64,
42    /// P-value from chi-squared distribution.
43    pub p_value: f64,
44    /// Observed genotype counts \[hom-ref, het, hom-alt\].
45    pub observed: [usize; 3],
46    /// Expected genotype counts under HWE.
47    pub expected: [f64; 3],
48    /// Inbreeding coefficient F = 1 - Ho/He.
49    pub inbreeding_coeff: f64,
50}
51
52/// Method used for Fst estimation.
53#[derive(Debug, Clone, Copy, PartialEq, Eq)]
54pub enum FstMethod {
55    /// Weir & Cockerham (1984) ANOVA-based estimator.
56    WeirCockerham,
57    /// Hudson (1992) estimator.
58    Hudson,
59}
60
61/// Fixation index (Fst) result.
62#[derive(Debug, Clone)]
63pub struct FstResult {
64    /// Estimated Fst value.
65    pub fst: f64,
66    /// Method used.
67    pub method: FstMethod,
68}
69
70/// Nucleotide diversity statistics.
71#[derive(Debug, Clone)]
72pub struct DiversityStats {
73    /// Nucleotide diversity (pi) — average pairwise differences per site.
74    pub pi: f64,
75    /// Watterson's theta (theta_w).
76    pub theta_w: f64,
77    /// Number of segregating sites.
78    pub segregating_sites: usize,
79    /// Number of sequences.
80    pub n_sequences: usize,
81}
82
83/// Tajima's D statistic.
84#[derive(Debug, Clone)]
85pub struct TajimaD {
86    /// Tajima's D value.
87    pub d: f64,
88    /// Nucleotide diversity (pi).
89    pub pi: f64,
90    /// Watterson's theta.
91    pub theta_w: f64,
92    /// Number of segregating sites.
93    pub segregating_sites: usize,
94    /// Number of sequences.
95    pub n_sequences: usize,
96}
97
98/// Linkage disequilibrium result between two sites.
99#[derive(Debug, Clone)]
100pub struct LdResult {
101    /// Squared correlation coefficient.
102    pub r_squared: f64,
103    /// Normalized LD coefficient (|D| / D_max).
104    pub d_prime: f64,
105    /// Raw LD coefficient D.
106    pub d: f64,
107    /// Number of haplotypes used (2 × non-missing diploid individuals).
108    pub n_haplotypes: usize,
109}
110
111// ── Internal helpers ─────────────────────────────────────────────────────
112
113/// Count genotypes: returns (n_0, n_1, n_2, n_missing).
114fn count_genotypes(genotypes: &[Option<u8>]) -> Result<(usize, usize, usize, usize)> {
115    let mut n0 = 0usize;
116    let mut n1 = 0usize;
117    let mut n2 = 0usize;
118    let mut n_miss = 0usize;
119    for (i, g) in genotypes.iter().enumerate() {
120        match g {
121            Some(0) => n0 += 1,
122            Some(1) => n1 += 1,
123            Some(2) => n2 += 1,
124            None => n_miss += 1,
125            Some(v) => {
126                return Err(CyaneaError::InvalidInput(format!(
127                    "invalid genotype {} at index {} (expected 0, 1, or 2)",
128                    v, i
129                )));
130            }
131        }
132    }
133    Ok((n0, n1, n2, n_miss))
134}
135
136/// Allele frequencies (p=ref, q=alt) from genotype counts.
137fn allele_freq_from_counts(n0: usize, n1: usize, n2: usize) -> (f64, f64) {
138    let total = 2 * (n0 + n1 + n2);
139    if total == 0 {
140        return (0.0, 0.0);
141    }
142    let p = (2 * n0 + n1) as f64 / total as f64;
143    let q = (2 * n2 + n1) as f64 / total as f64;
144    (p, q)
145}
146
147/// Harmonic number H_n = sum_{i=1}^{n} 1/i.
148fn harmonic(n: usize) -> f64 {
149    (1..=n).map(|i| 1.0 / i as f64).sum()
150}
151
152/// Sum of squared reciprocals: sum_{i=1}^{n} 1/i^2.
153fn harmonic_sq(n: usize) -> f64 {
154    (1..=n).map(|i| 1.0 / (i as f64 * i as f64)).sum()
155}
156
157/// EM algorithm for haplotype frequency estimation from unphased diploid data.
158///
159/// Returns (p_AB, p_Ab, p_aB, p_ab) haplotype frequencies.
160fn em_haplotype_freqs(
161    site_a: &[Option<u8>],
162    site_b: &[Option<u8>],
163    max_iter: usize,
164    tol: f64,
165) -> Result<(f64, f64, f64, f64)> {
166    // Count joint genotypes (only non-missing at both sites)
167    let mut counts = [[0usize; 3]; 3]; // counts[ga][gb]
168    let mut n = 0usize;
169    for (ga, gb) in site_a.iter().zip(site_b.iter()) {
170        if let (Some(a), Some(b)) = (ga, gb) {
171            if *a > 2 || *b > 2 {
172                return Err(CyaneaError::InvalidInput(
173                    "invalid genotype value (expected 0, 1, or 2)".into(),
174                ));
175            }
176            counts[*a as usize][*b as usize] += 1;
177            n += 1;
178        }
179    }
180
181    if n == 0 {
182        return Err(CyaneaError::InvalidInput(
183            "no non-missing genotype pairs".into(),
184        ));
185    }
186
187    // Compute marginal allele frequencies for initialization
188    let mut a_alt = 0usize;
189    let mut b_alt = 0usize;
190    for ga in 0..3 {
191        for gb in 0..3 {
192            a_alt += ga * counts[ga][gb];
193            b_alt += gb * counts[ga][gb];
194        }
195    }
196    let pa = a_alt as f64 / (2 * n) as f64; // freq of 'a' (alt at site A)
197    let pb = b_alt as f64 / (2 * n) as f64; // freq of 'b' (alt at site B)
198
199    // Initialize haplotype freqs assuming linkage equilibrium
200    let mut p_ab = pa * pb; // ab = both alt
201    let mut p_aB = pa * (1.0 - pb);
202    let mut p_Ab = (1.0 - pa) * pb;
203    let mut p_AB = (1.0 - pa) * (1.0 - pb);
204
205    for _ in 0..max_iter {
206        // E-step: estimate haplotype counts
207        let mut h_AB = 0.0;
208        let mut h_Ab = 0.0;
209        let mut h_aB = 0.0;
210        let mut h_ab = 0.0;
211
212        for ga in 0..3usize {
213            for gb in 0..3usize {
214                let c = counts[ga][gb] as f64;
215                if c == 0.0 {
216                    continue;
217                }
218                // For each diploid genotype pair, distribute haplotypes
219                match (ga, gb) {
220                    (0, 0) => h_AB += 2.0 * c, // AA,BB → 2×AB
221                    (0, 1) => {
222                        h_AB += c;
223                        h_Ab += c;
224                    } // AA,Bb → AB + Ab
225                    (0, 2) => h_Ab += 2.0 * c, // AA,bb → 2×Ab
226                    (1, 0) => {
227                        h_AB += c;
228                        h_aB += c;
229                    } // Aa,BB → AB + aB
230                    (1, 1) => {
231                        // Aa,Bb → ambiguous: AB+ab or Ab+aB
232                        let denom = p_AB * p_ab + p_Ab * p_aB;
233                        if denom > 0.0 {
234                            let w = p_AB * p_ab / denom;
235                            h_AB += c * w;
236                            h_ab += c * w;
237                            h_Ab += c * (1.0 - w);
238                            h_aB += c * (1.0 - w);
239                        } else {
240                            // Ambiguous — split evenly
241                            h_AB += 0.5 * c;
242                            h_ab += 0.5 * c;
243                            h_Ab += 0.5 * c;
244                            h_aB += 0.5 * c;
245                        }
246                    }
247                    (1, 2) => {
248                        h_Ab += c;
249                        h_ab += c;
250                    } // Aa,bb → Ab + ab
251                    (2, 0) => h_aB += 2.0 * c, // aa,BB → 2×aB
252                    (2, 1) => {
253                        h_aB += c;
254                        h_ab += c;
255                    } // aa,Bb → aB + ab
256                    (2, 2) => h_ab += 2.0 * c, // aa,bb → 2×ab
257                    _ => {}
258                }
259            }
260        }
261
262        // M-step: update frequencies
263        let total = h_AB + h_Ab + h_aB + h_ab;
264        let new_AB = h_AB / total;
265        let new_Ab = h_Ab / total;
266        let new_aB = h_aB / total;
267        let new_ab = h_ab / total;
268
269        let diff = (new_AB - p_AB).abs()
270            + (new_Ab - p_Ab).abs()
271            + (new_aB - p_aB).abs()
272            + (new_ab - p_ab).abs();
273
274        p_AB = new_AB;
275        p_Ab = new_Ab;
276        p_aB = new_aB;
277        p_ab = new_ab;
278
279        if diff < tol {
280            break;
281        }
282    }
283
284    Ok((p_AB, p_Ab, p_aB, p_ab))
285}
286
287// ── Public API ───────────────────────────────────────────────────────────
288
289/// Compute allele frequencies and heterozygosity from 012-encoded genotypes.
290///
291/// # Errors
292///
293/// Returns an error if genotypes is empty, all genotypes are missing,
294/// or any value is outside {0, 1, 2, None}.
295pub fn allele_frequencies(genotypes: &[Option<u8>]) -> Result<AlleleFrequencies> {
296    if genotypes.is_empty() {
297        return Err(CyaneaError::InvalidInput("empty genotype array".into()));
298    }
299    let (n0, n1, n2, n_miss) = count_genotypes(genotypes)?;
300    let n_called = n0 + n1 + n2;
301    if n_called == 0 {
302        return Err(CyaneaError::InvalidInput(
303            "all genotypes are missing".into(),
304        ));
305    }
306    let (p, q) = allele_freq_from_counts(n0, n1, n2);
307    let obs_het = n1 as f64 / n_called as f64;
308    let exp_het = 2.0 * p * q;
309
310    Ok(AlleleFrequencies {
311        freq_ref: p,
312        freq_alt: q,
313        allele_count: 2 * n_called,
314        missing_count: n_miss,
315        observed_het: obs_het,
316        expected_het: exp_het,
317    })
318}
319
320/// Convenience wrapper: compute allele frequencies from u8 genotypes
321/// where 255 encodes missing data.
322pub fn allele_frequencies_u8(genotypes: &[u8]) -> Result<AlleleFrequencies> {
323    let opts: Vec<Option<u8>> = genotypes
324        .iter()
325        .map(|&g| if g == 255 { None } else { Some(g) })
326        .collect();
327    allele_frequencies(&opts)
328}
329
330/// Hardy-Weinberg equilibrium chi-squared test (1 df).
331///
332/// Tests whether observed genotype frequencies deviate from HWE expectations.
333///
334/// # Errors
335///
336/// Returns an error if the site is monomorphic (freq_alt == 0 or freq_ref == 0),
337/// genotypes are empty, or all are missing.
338pub fn hwe_test(genotypes: &[Option<u8>]) -> Result<HweResult> {
339    if genotypes.is_empty() {
340        return Err(CyaneaError::InvalidInput("empty genotype array".into()));
341    }
342    let (n0, n1, n2, _) = count_genotypes(genotypes)?;
343    let n = n0 + n1 + n2;
344    if n == 0 {
345        return Err(CyaneaError::InvalidInput(
346            "all genotypes are missing".into(),
347        ));
348    }
349    let (p, q) = allele_freq_from_counts(n0, n1, n2);
350    if p == 0.0 || q == 0.0 {
351        return Err(CyaneaError::InvalidInput(
352            "monomorphic site: HWE test requires polymorphism".into(),
353        ));
354    }
355
356    let n_f = n as f64;
357    let exp = [p * p * n_f, 2.0 * p * q * n_f, q * q * n_f];
358    let obs = [n0, n1, n2];
359
360    let chi_sq: f64 = obs
361        .iter()
362        .zip(exp.iter())
363        .map(|(&o, &e)| {
364            if e > 0.0 {
365                (o as f64 - e).powi(2) / e
366            } else {
367                0.0
368            }
369        })
370        .sum();
371
372    let chi2_dist = ChiSquared::new(1.0)?;
373    let p_value = 1.0 - chi2_dist.cdf(chi_sq);
374
375    let obs_het = n1 as f64 / n_f;
376    let exp_het = 2.0 * p * q;
377    let f = if exp_het > 0.0 {
378        1.0 - obs_het / exp_het
379    } else {
380        0.0
381    };
382
383    Ok(HweResult {
384        chi_squared: chi_sq,
385        p_value,
386        observed: obs,
387        expected: exp,
388        inbreeding_coeff: f,
389    })
390}
391
392/// Weir & Cockerham (1984) Fst estimator across multiple loci.
393///
394/// Each element of `pop1` and `pop2` is the genotype vector for one locus.
395/// Uses ratio-of-averages: Fst = sum(a) / sum(a + b + c).
396///
397/// # Errors
398///
399/// Returns an error if no loci are provided or populations are empty.
400pub fn fst_weir_cockerham(
401    pop1: &[&[Option<u8>]],
402    pop2: &[&[Option<u8>]],
403) -> Result<FstResult> {
404    if pop1.is_empty() || pop2.is_empty() {
405        return Err(CyaneaError::InvalidInput("empty loci or populations".into()));
406    }
407
408    let mut sum_a = 0.0;
409    let mut sum_abc = 0.0;
410
411    for (locus1, locus2) in pop1.iter().zip(pop2.iter()) {
412        let (n0_1, n1_1, n2_1, _) = count_genotypes(locus1)?;
413        let (n0_2, n1_2, n2_2, _) = count_genotypes(locus2)?;
414        let ni1 = (n0_1 + n1_1 + n2_1) as f64;
415        let ni2 = (n0_2 + n1_2 + n2_2) as f64;
416        if ni1 == 0.0 || ni2 == 0.0 {
417            continue;
418        }
419
420        let r = 2.0; // number of populations
421        let n_bar = (ni1 + ni2) / r;
422        let n_total = ni1 + ni2;
423
424        let (_, q1) = allele_freq_from_counts(n0_1, n1_1, n2_1);
425        let (_, q2) = allele_freq_from_counts(n0_2, n1_2, n2_2);
426
427        let p_bar = (ni1 * q1 + ni2 * q2) / n_total;
428
429        let s_sq = (ni1 * (q1 - p_bar).powi(2) + ni2 * (q2 - p_bar).powi(2))
430            / ((r - 1.0) * n_bar);
431
432        let h_bar = (ni1 * n1_1 as f64 / ni1 + ni2 * n1_2 as f64 / ni2) / n_total;
433
434        let nc = n_total - (ni1 * ni1 + ni2 * ni2) / n_total;
435        let nc = nc / (r - 1.0);
436
437        // Weir-Cockerham variance components
438        let a = (n_bar / nc)
439            * (s_sq - (1.0 / (n_bar - 1.0)) * (p_bar * (1.0 - p_bar) - (r - 1.0) / r * s_sq - 0.25 * h_bar));
440
441        let b = (n_bar / (n_bar - 1.0))
442            * (p_bar * (1.0 - p_bar) - (r - 1.0) / r * s_sq
443                - (2.0 * n_bar - 1.0) / (4.0 * n_bar) * h_bar);
444
445        let c = 0.5 * h_bar;
446
447        sum_a += a;
448        sum_abc += a + b + c;
449    }
450
451    let fst = if sum_abc > 0.0 {
452        (sum_a / sum_abc).clamp(0.0, 1.0)
453    } else {
454        0.0
455    };
456
457    Ok(FstResult {
458        fst,
459        method: FstMethod::WeirCockerham,
460    })
461}
462
463/// Hudson (1992) Fst estimator across multiple loci.
464///
465/// Fst = 1 - Hw / Hb, averaged across loci (ratio of averages).
466///
467/// # Errors
468///
469/// Returns an error if no loci are provided or populations are empty.
470pub fn fst_hudson(
471    pop1: &[&[Option<u8>]],
472    pop2: &[&[Option<u8>]],
473) -> Result<FstResult> {
474    if pop1.is_empty() || pop2.is_empty() {
475        return Err(CyaneaError::InvalidInput("empty loci or populations".into()));
476    }
477
478    let mut sum_num = 0.0;
479    let mut sum_den = 0.0;
480
481    for (locus1, locus2) in pop1.iter().zip(pop2.iter()) {
482        let (n0_1, n1_1, n2_1, _) = count_genotypes(locus1)?;
483        let (n0_2, n1_2, n2_2, _) = count_genotypes(locus2)?;
484        let n1 = (n0_1 + n1_1 + n2_1) as f64;
485        let n2_pop = (n0_2 + n1_2 + n2_2) as f64;
486        if n1 == 0.0 || n2_pop == 0.0 {
487            continue;
488        }
489
490        let (_, q1) = allele_freq_from_counts(n0_1, n1_1, n2_1);
491        let (_, q2) = allele_freq_from_counts(n0_2, n1_2, n2_2);
492
493        // Within-population heterozygosity (bias-corrected)
494        let hw1 = 2.0 * q1 * (1.0 - q1) * n1 / (n1 - 1.0);
495        let hw2 = 2.0 * q2 * (1.0 - q2) * n2_pop / (n2_pop - 1.0);
496        let hw = (hw1 + hw2) / 2.0;
497
498        // Between-population heterozygosity
499        let hb = q1 * (1.0 - q2) + q2 * (1.0 - q1);
500
501        // Hudson's numerator/denominator
502        sum_num += hb - hw;
503        sum_den += hb;
504    }
505
506    let fst = if sum_den > 0.0 {
507        (sum_num / sum_den).clamp(0.0, 1.0)
508    } else {
509        0.0
510    };
511
512    Ok(FstResult {
513        fst,
514        method: FstMethod::Hudson,
515    })
516}
517
518/// Multi-population Fst using the Weir-Cockerham estimator.
519///
520/// `populations[i][j]` is the genotype vector for population i, locus j.
521/// All populations must have the same number of loci.
522///
523/// # Errors
524///
525/// Returns an error if fewer than 2 populations or no loci.
526pub fn fst_multi_population(populations: &[&[&[Option<u8>]]]) -> Result<FstResult> {
527    let r = populations.len();
528    if r < 2 {
529        return Err(CyaneaError::InvalidInput(
530            "need at least 2 populations for Fst".into(),
531        ));
532    }
533    let n_loci = populations[0].len();
534    if n_loci == 0 {
535        return Err(CyaneaError::InvalidInput("no loci provided".into()));
536    }
537    for pop in populations.iter() {
538        if pop.len() != n_loci {
539            return Err(CyaneaError::InvalidInput(
540                "all populations must have the same number of loci".into(),
541            ));
542        }
543    }
544
545    let mut sum_a = 0.0;
546    let mut sum_abc = 0.0;
547    let r_f = r as f64;
548
549    for locus_idx in 0..n_loci {
550        let mut ni = Vec::with_capacity(r);
551        let mut qi = Vec::with_capacity(r);
552        let mut hi = Vec::with_capacity(r);
553
554        for pop in populations.iter() {
555            let (n0, n1, n2, _) = count_genotypes(pop[locus_idx])?;
556            let n = (n0 + n1 + n2) as f64;
557            if n == 0.0 {
558                ni.push(0.0);
559                qi.push(0.0);
560                hi.push(0.0);
561                continue;
562            }
563            let (_, q) = allele_freq_from_counts(n0, n1, n2);
564            ni.push(n);
565            qi.push(q);
566            hi.push(n1 as f64 / n);
567        }
568
569        let n_total: f64 = ni.iter().sum();
570        if n_total == 0.0 {
571            continue;
572        }
573
574        let n_bar = n_total / r_f;
575        let p_bar: f64 = ni.iter().zip(qi.iter()).map(|(n, q)| n * q).sum::<f64>() / n_total;
576        let s_sq: f64 = ni
577            .iter()
578            .zip(qi.iter())
579            .map(|(n, q)| n * (q - p_bar).powi(2))
580            .sum::<f64>()
581            / ((r_f - 1.0) * n_bar);
582        let h_bar: f64 = ni.iter().zip(hi.iter()).map(|(n, h)| n * h).sum::<f64>() / n_total;
583
584        let nc = (n_total - ni.iter().map(|n| n * n).sum::<f64>() / n_total) / (r_f - 1.0);
585
586        let a = (n_bar / nc)
587            * (s_sq
588                - (1.0 / (n_bar - 1.0))
589                    * (p_bar * (1.0 - p_bar) - (r_f - 1.0) / r_f * s_sq - 0.25 * h_bar));
590
591        let b = (n_bar / (n_bar - 1.0))
592            * (p_bar * (1.0 - p_bar) - (r_f - 1.0) / r_f * s_sq
593                - (2.0 * n_bar - 1.0) / (4.0 * n_bar) * h_bar);
594
595        let c = 0.5 * h_bar;
596
597        sum_a += a;
598        sum_abc += a + b + c;
599    }
600
601    let fst = if sum_abc > 0.0 {
602        (sum_a / sum_abc).clamp(0.0, 1.0)
603    } else {
604        0.0
605    };
606
607    Ok(FstResult {
608        fst,
609        method: FstMethod::WeirCockerham,
610    })
611}
612
613/// Compute nucleotide diversity (pi) and Watterson's theta from a genotype matrix.
614///
615/// `genotype_matrix[i]` is the genotype vector for locus i.
616/// `n_sequences` is the number of haploid sequences (typically 2× diploid individuals).
617///
618/// Pi is estimated as the average number of pairwise differences per site.
619///
620/// # Errors
621///
622/// Returns an error if `n_sequences` < 2 or no loci are provided.
623pub fn diversity(
624    genotype_matrix: &[&[Option<u8>]],
625    n_sequences: usize,
626) -> Result<DiversityStats> {
627    if n_sequences < 2 {
628        return Err(CyaneaError::InvalidInput(
629            "need at least 2 sequences for diversity".into(),
630        ));
631    }
632    if genotype_matrix.is_empty() {
633        return Err(CyaneaError::InvalidInput("no loci provided".into()));
634    }
635
636    let mut seg_sites = 0usize;
637    let mut total_pi = 0.0;
638
639    for locus in genotype_matrix {
640        let (n0, n1, n2, _) = count_genotypes(locus)?;
641        let n_called = n0 + n1 + n2;
642        if n_called == 0 {
643            continue;
644        }
645        let (_, q) = allele_freq_from_counts(n0, n1, n2);
646        if q > 0.0 && q < 1.0 {
647            seg_sites += 1;
648        }
649        // Per-site pi: for diploid 012 data, estimate from allele frequency
650        let n_hap = (2 * n_called) as f64;
651        let pi_site = 2.0 * q * (1.0 - q) * n_hap / (n_hap - 1.0);
652        total_pi += pi_site;
653    }
654
655    let n_loci = genotype_matrix.len() as f64;
656    let pi = total_pi / n_loci;
657
658    let a1 = harmonic(n_sequences - 1);
659    let theta_w = if a1 > 0.0 {
660        (seg_sites as f64 / n_loci) / a1
661    } else {
662        0.0
663    };
664
665    Ok(DiversityStats {
666        pi,
667        theta_w,
668        segregating_sites: seg_sites,
669        n_sequences,
670    })
671}
672
673/// Compute diversity statistics from pre-computed summary values.
674///
675/// Useful when you already have segregating site counts and average pairwise
676/// differences (e.g., from a VCF summary).
677///
678/// # Errors
679///
680/// Returns an error if `n_sequences` < 2.
681pub fn diversity_from_counts(
682    segregating_sites: usize,
683    n_sequences: usize,
684    avg_pairwise_diff: f64,
685) -> Result<DiversityStats> {
686    if n_sequences < 2 {
687        return Err(CyaneaError::InvalidInput(
688            "need at least 2 sequences for diversity".into(),
689        ));
690    }
691
692    let a1 = harmonic(n_sequences - 1);
693    let theta_w = if a1 > 0.0 {
694        segregating_sites as f64 / a1
695    } else {
696        0.0
697    };
698
699    Ok(DiversityStats {
700        pi: avg_pairwise_diff,
701        theta_w,
702        segregating_sites,
703        n_sequences,
704    })
705}
706
707/// Compute Tajima's D statistic.
708///
709/// Compares pi (average pairwise differences) to theta_w (from segregating sites).
710/// Under neutrality, D ≈ 0. Positive D suggests balancing selection or population
711/// contraction; negative D suggests purifying selection or expansion.
712///
713/// No p-value is provided (the null distribution requires coalescent simulation).
714///
715/// # Errors
716///
717/// Returns an error if `n_sequences` < 4 or `segregating_sites` == 0.
718pub fn tajimas_d(
719    segregating_sites: usize,
720    n_sequences: usize,
721    avg_pairwise_diff: f64,
722) -> Result<TajimaD> {
723    if n_sequences < 4 {
724        return Err(CyaneaError::InvalidInput(
725            "Tajima's D requires at least 4 sequences".into(),
726        ));
727    }
728    if segregating_sites == 0 {
729        return Err(CyaneaError::InvalidInput(
730            "Tajima's D undefined with 0 segregating sites".into(),
731        ));
732    }
733
734    let n = n_sequences as f64;
735    let s = segregating_sites as f64;
736
737    let a1 = harmonic(n_sequences - 1);
738    let a2 = harmonic_sq(n_sequences - 1);
739
740    let theta_w = s / a1;
741
742    // Tajima's D variance components
743    let b1 = (n + 1.0) / (3.0 * (n - 1.0));
744    let b2 = 2.0 * (n * n + n + 3.0) / (9.0 * n * (n - 1.0));
745
746    let c1 = b1 - 1.0 / a1;
747    let c2 = b2 - (n + 2.0) / (a1 * n) + a2 / (a1 * a1);
748
749    let e1 = c1 / a1;
750    let e2 = c2 / (a1 * a1 + a2);
751
752    let var_d = e1 * s + e2 * s * (s - 1.0);
753
754    let d = if var_d > 0.0 {
755        (avg_pairwise_diff - theta_w) / var_d.sqrt()
756    } else {
757        0.0
758    };
759
760    Ok(TajimaD {
761        d,
762        pi: avg_pairwise_diff,
763        theta_w,
764        segregating_sites,
765        n_sequences,
766    })
767}
768
769/// Compute linkage disequilibrium (r², D', D) between two sites.
770///
771/// Uses the EM algorithm to estimate haplotype frequencies from unphased
772/// diploid genotype data.
773///
774/// # Errors
775///
776/// Returns an error if sites have different lengths, either site is monomorphic,
777/// or all genotypes are missing.
778pub fn linkage_disequilibrium(
779    site_a: &[Option<u8>],
780    site_b: &[Option<u8>],
781) -> Result<LdResult> {
782    if site_a.len() != site_b.len() {
783        return Err(CyaneaError::InvalidInput(
784            "site vectors must have the same length".into(),
785        ));
786    }
787    if site_a.is_empty() {
788        return Err(CyaneaError::InvalidInput("empty site vectors".into()));
789    }
790
791    // Count non-missing pairs
792    let mut n_pairs = 0usize;
793    for (a, b) in site_a.iter().zip(site_b.iter()) {
794        if a.is_some() && b.is_some() {
795            n_pairs += 1;
796        }
797    }
798    if n_pairs == 0 {
799        return Err(CyaneaError::InvalidInput(
800            "no non-missing genotype pairs".into(),
801        ));
802    }
803
804    // Check that both sites are polymorphic
805    let af_a = allele_frequencies(site_a)?;
806    let af_b = allele_frequencies(site_b)?;
807    if af_a.freq_alt == 0.0 || af_a.freq_alt == 1.0 {
808        return Err(CyaneaError::InvalidInput(
809            "site A is monomorphic: LD undefined".into(),
810        ));
811    }
812    if af_b.freq_alt == 0.0 || af_b.freq_alt == 1.0 {
813        return Err(CyaneaError::InvalidInput(
814            "site B is monomorphic: LD undefined".into(),
815        ));
816    }
817
818    let (p_AB, _p_Ab, _p_aB, _p_ab) = em_haplotype_freqs(site_a, site_b, 1000, 1e-10)?;
819
820    // Marginal frequencies (A = ref at site A, a = alt at site A)
821    let p_a = af_a.freq_alt; // freq of alt allele at site A
822    let p_A = af_a.freq_ref;
823    let p_b = af_b.freq_alt; // freq of alt allele at site B
824    let p_B = af_b.freq_ref;
825
826    // D = freq(AB) - freq(A)*freq(B)
827    let d = p_AB - p_A * p_B;
828
829    // D'
830    let d_max = if d >= 0.0 {
831        (p_A * p_b).min(p_a * p_B)
832    } else {
833        (p_A * p_B).min(p_a * p_b)
834    };
835    let d_prime = if d_max.abs() > 0.0 {
836        (d / d_max).abs().min(1.0)
837    } else {
838        0.0
839    };
840
841    // r²
842    let denom = p_A * p_a * p_B * p_b;
843    let r_squared = if denom > 0.0 {
844        (d * d / denom).min(1.0)
845    } else {
846        0.0
847    };
848
849    Ok(LdResult {
850        r_squared,
851        d_prime,
852        d,
853        n_haplotypes: 2 * n_pairs,
854    })
855}
856
857/// PCA on a genotype matrix with missing-value imputation.
858///
859/// `genotype_matrix[i]` is the genotype vector for sample i (each sample across all loci).
860/// Missing values are imputed with the per-site mean before delegating to
861/// [`crate::reduction::pca`].
862///
863/// # Errors
864///
865/// Returns an error if the matrix is empty, samples have inconsistent lengths,
866/// or `n_components` is invalid.
867pub fn genotype_pca(
868    genotype_matrix: &[&[Option<u8>]],
869    n_components: usize,
870) -> Result<crate::reduction::PcaResult> {
871    if genotype_matrix.is_empty() {
872        return Err(CyaneaError::InvalidInput(
873            "empty genotype matrix".into(),
874        ));
875    }
876    let n_features = genotype_matrix[0].len();
877    if n_features == 0 {
878        return Err(CyaneaError::InvalidInput("zero loci".into()));
879    }
880    for (i, row) in genotype_matrix.iter().enumerate() {
881        if row.len() != n_features {
882            return Err(CyaneaError::InvalidInput(format!(
883                "sample {} has {} loci, expected {}",
884                i,
885                row.len(),
886                n_features
887            )));
888        }
889    }
890
891    let n_samples = genotype_matrix.len();
892
893    // Compute per-site mean (for imputation)
894    let mut site_sum = vec![0.0; n_features];
895    let mut site_count = vec![0usize; n_features];
896    for row in genotype_matrix {
897        for (j, g) in row.iter().enumerate() {
898            if let Some(v) = g {
899                site_sum[j] += *v as f64;
900                site_count[j] += 1;
901            }
902        }
903    }
904    let site_mean: Vec<f64> = (0..n_features)
905        .map(|j| {
906            if site_count[j] > 0 {
907                site_sum[j] / site_count[j] as f64
908            } else {
909                0.0
910            }
911        })
912        .collect();
913
914    // Build imputed f64 matrix
915    let mut data: Vec<Vec<f64>> = Vec::with_capacity(n_samples);
916    for row in genotype_matrix {
917        let imputed: Vec<f64> = row
918            .iter()
919            .enumerate()
920            .map(|(j, g)| match g {
921                Some(v) => *v as f64,
922                None => site_mean[j],
923            })
924            .collect();
925        data.push(imputed);
926    }
927
928    let refs: Vec<&[f64]> = data.iter().map(|v| v.as_slice()).collect();
929    crate::reduction::pca(&refs, n_components)
930}
931
932// ── Tests ────────────────────────────────────────────────────────────────
933
934#[cfg(test)]
935mod tests {
936    use super::*;
937
938    const TOL: f64 = 1e-6;
939
940    // ── Allele frequencies ───────────────────────────────────────────
941
942    #[test]
943    fn af_basic() {
944        // 2 hom-ref, 2 het, 1 hom-alt → p=0.6, q=0.4
945        let g = [Some(0), Some(0), Some(1), Some(1), Some(2)];
946        let af = allele_frequencies(&g).unwrap();
947        assert!((af.freq_ref - 0.6).abs() < TOL);
948        assert!((af.freq_alt - 0.4).abs() < TOL);
949        assert_eq!(af.allele_count, 10);
950        assert_eq!(af.missing_count, 0);
951        assert!((af.observed_het - 0.4).abs() < TOL);
952    }
953
954    #[test]
955    fn af_all_hom_ref() {
956        let g = [Some(0), Some(0), Some(0)];
957        let af = allele_frequencies(&g).unwrap();
958        assert!((af.freq_ref - 1.0).abs() < TOL);
959        assert!((af.freq_alt - 0.0).abs() < TOL);
960        assert!((af.observed_het - 0.0).abs() < TOL);
961    }
962
963    #[test]
964    fn af_all_het() {
965        let g = [Some(1), Some(1), Some(1), Some(1)];
966        let af = allele_frequencies(&g).unwrap();
967        assert!((af.freq_ref - 0.5).abs() < TOL);
968        assert!((af.freq_alt - 0.5).abs() < TOL);
969        assert!((af.observed_het - 1.0).abs() < TOL);
970    }
971
972    #[test]
973    fn af_with_missing() {
974        let g = [Some(0), None, Some(1), Some(2), None];
975        let af = allele_frequencies(&g).unwrap();
976        assert_eq!(af.missing_count, 2);
977        assert_eq!(af.allele_count, 6);
978    }
979
980    #[test]
981    fn af_empty_error() {
982        let g: [Option<u8>; 0] = [];
983        assert!(allele_frequencies(&g).is_err());
984    }
985
986    #[test]
987    fn af_invalid_genotype() {
988        let g = [Some(0), Some(3), Some(1)];
989        assert!(allele_frequencies(&g).is_err());
990    }
991
992    // ── HWE ──────────────────────────────────────────────────────────
993
994    #[test]
995    fn hwe_perfect_equilibrium() {
996        // p=0.5 → expected 25% AA, 50% Aa, 25% aa
997        // 25 hom-ref, 50 het, 25 hom-alt
998        let mut g = Vec::new();
999        for _ in 0..25 {
1000            g.push(Some(0));
1001        }
1002        for _ in 0..50 {
1003            g.push(Some(1));
1004        }
1005        for _ in 0..25 {
1006            g.push(Some(2));
1007        }
1008        let result = hwe_test(&g).unwrap();
1009        assert!(result.chi_squared < 0.01, "chi_sq={}", result.chi_squared);
1010        assert!(result.p_value > 0.9);
1011        assert!(result.inbreeding_coeff.abs() < 0.01);
1012    }
1013
1014    #[test]
1015    fn hwe_excess_heterozygosity() {
1016        // All het → strong deviation
1017        let g: Vec<Option<u8>> = vec![Some(1); 100];
1018        let result = hwe_test(&g).unwrap();
1019        assert!(result.chi_squared > 10.0);
1020        assert!(result.p_value < 0.01);
1021        assert!(result.inbreeding_coeff < -0.5);
1022    }
1023
1024    #[test]
1025    fn hwe_deficit_het() {
1026        // 50 hom-ref, 0 het, 50 hom-alt → extreme het deficit
1027        let mut g = Vec::new();
1028        for _ in 0..50 {
1029            g.push(Some(0));
1030        }
1031        for _ in 0..50 {
1032            g.push(Some(2));
1033        }
1034        let result = hwe_test(&g).unwrap();
1035        assert!(result.chi_squared > 50.0);
1036        assert!(result.inbreeding_coeff > 0.9);
1037    }
1038
1039    #[test]
1040    fn hwe_textbook() {
1041        // 298 AA, 489 Aa, 213 aa (n=1000)
1042        // Classic textbook example: close to HWE
1043        let mut g = Vec::new();
1044        for _ in 0..298 {
1045            g.push(Some(0));
1046        }
1047        for _ in 0..489 {
1048            g.push(Some(1));
1049        }
1050        for _ in 0..213 {
1051            g.push(Some(2));
1052        }
1053        let result = hwe_test(&g).unwrap();
1054        // These genotype frequencies are close to HWE for p≈0.5425
1055        // so chi-squared should be small
1056        assert!(result.chi_squared < 10.0);
1057    }
1058
1059    #[test]
1060    fn hwe_monomorphic_error() {
1061        let g = [Some(0), Some(0), Some(0)];
1062        assert!(hwe_test(&g).is_err());
1063    }
1064
1065    #[test]
1066    fn hwe_with_missing() {
1067        let mut g: Vec<Option<u8>> = Vec::new();
1068        for _ in 0..20 {
1069            g.push(Some(0));
1070        }
1071        for _ in 0..40 {
1072            g.push(Some(1));
1073        }
1074        for _ in 0..20 {
1075            g.push(Some(2));
1076        }
1077        for _ in 0..10 {
1078            g.push(None);
1079        }
1080        let result = hwe_test(&g).unwrap();
1081        assert!(result.p_value > 0.5);
1082    }
1083
1084    // ── Fst ──────────────────────────────────────────────────────────
1085
1086    #[test]
1087    fn fst_identical_populations() {
1088        let locus: Vec<Option<u8>> = vec![Some(0), Some(1), Some(1), Some(2)];
1089        let pop1: Vec<&[Option<u8>]> = vec![locus.as_slice()];
1090        let pop2: Vec<&[Option<u8>]> = vec![locus.as_slice()];
1091        let result = fst_weir_cockerham(&pop1, &pop2).unwrap();
1092        assert!(result.fst < 0.05, "fst={}", result.fst);
1093    }
1094
1095    #[test]
1096    fn fst_fully_differentiated() {
1097        let pop1_locus: Vec<Option<u8>> = vec![Some(0); 50];
1098        let pop2_locus: Vec<Option<u8>> = vec![Some(2); 50];
1099        let pop1: Vec<&[Option<u8>]> = vec![pop1_locus.as_slice()];
1100        let pop2: Vec<&[Option<u8>]> = vec![pop2_locus.as_slice()];
1101        let result = fst_weir_cockerham(&pop1, &pop2).unwrap();
1102        assert!(result.fst > 0.8, "fst={}", result.fst);
1103    }
1104
1105    #[test]
1106    fn fst_wc_known() {
1107        // Two pops with moderate differentiation
1108        let pop1_locus: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0), Some(1), Some(1)];
1109        let pop2_locus: Vec<Option<u8>> = vec![Some(1), Some(1), Some(2), Some(2), Some(2)];
1110        let pop1: Vec<&[Option<u8>]> = vec![pop1_locus.as_slice()];
1111        let pop2: Vec<&[Option<u8>]> = vec![pop2_locus.as_slice()];
1112        let result = fst_weir_cockerham(&pop1, &pop2).unwrap();
1113        assert!(result.fst > 0.1 && result.fst < 0.9, "fst={}", result.fst);
1114        assert_eq!(result.method, FstMethod::WeirCockerham);
1115    }
1116
1117    #[test]
1118    fn fst_hudson_known() {
1119        let pop1_locus: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0), Some(1), Some(1)];
1120        let pop2_locus: Vec<Option<u8>> = vec![Some(1), Some(1), Some(2), Some(2), Some(2)];
1121        let pop1: Vec<&[Option<u8>]> = vec![pop1_locus.as_slice()];
1122        let pop2: Vec<&[Option<u8>]> = vec![pop2_locus.as_slice()];
1123        let result = fst_hudson(&pop1, &pop2).unwrap();
1124        assert!(result.fst > 0.1 && result.fst < 0.9, "fst={}", result.fst);
1125        assert_eq!(result.method, FstMethod::Hudson);
1126    }
1127
1128    #[test]
1129    fn fst_multi_locus() {
1130        // Multiple loci should give a more robust estimate
1131        let p1_l1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(1), Some(1)];
1132        let p1_l2: Vec<Option<u8>> = vec![Some(0), Some(1), Some(1), Some(2)];
1133        let p2_l1: Vec<Option<u8>> = vec![Some(1), Some(2), Some(2), Some(2)];
1134        let p2_l2: Vec<Option<u8>> = vec![Some(1), Some(1), Some(2), Some(2)];
1135        let pop1: Vec<&[Option<u8>]> = vec![p1_l1.as_slice(), p1_l2.as_slice()];
1136        let pop2: Vec<&[Option<u8>]> = vec![p2_l1.as_slice(), p2_l2.as_slice()];
1137        let result = fst_weir_cockerham(&pop1, &pop2).unwrap();
1138        assert!(result.fst >= 0.0 && result.fst <= 1.0);
1139    }
1140
1141    #[test]
1142    fn fst_three_populations() {
1143        let l1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(1)];
1144        let l2: Vec<Option<u8>> = vec![Some(1), Some(2), Some(2)];
1145        let l3: Vec<Option<u8>> = vec![Some(0), Some(1), Some(2)];
1146        let pop1: Vec<&[Option<u8>]> = vec![l1.as_slice()];
1147        let pop2: Vec<&[Option<u8>]> = vec![l2.as_slice()];
1148        let pop3: Vec<&[Option<u8>]> = vec![l3.as_slice()];
1149        let pops: Vec<&[&[Option<u8>]]> = vec![pop1.as_slice(), pop2.as_slice(), pop3.as_slice()];
1150        let result = super::fst_multi_population(&pops).unwrap();
1151        assert!(result.fst >= 0.0 && result.fst <= 1.0);
1152    }
1153
1154    #[test]
1155    fn fst_unequal_sizes() {
1156        let pop1_locus: Vec<Option<u8>> = vec![Some(0), Some(1)];
1157        let pop2_locus: Vec<Option<u8>> = vec![Some(1), Some(2), Some(2), Some(2), Some(2)];
1158        let pop1: Vec<&[Option<u8>]> = vec![pop1_locus.as_slice()];
1159        let pop2: Vec<&[Option<u8>]> = vec![pop2_locus.as_slice()];
1160        let result = fst_hudson(&pop1, &pop2).unwrap();
1161        assert!(result.fst >= 0.0 && result.fst <= 1.0);
1162    }
1163
1164    #[test]
1165    fn fst_error_empty() {
1166        let pop1: Vec<&[Option<u8>]> = vec![];
1167        let pop2: Vec<&[Option<u8>]> = vec![];
1168        assert!(fst_weir_cockerham(&pop1, &pop2).is_err());
1169    }
1170
1171    // ── Diversity ────────────────────────────────────────────────────
1172
1173    #[test]
1174    fn diversity_known_pi() {
1175        // 4 loci, 2 segregating. With known allele frequencies we can verify pi.
1176        let l1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(1), Some(2)]; // q=0.375
1177        let l2: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0), Some(0)]; // monomorphic
1178        let l3: Vec<Option<u8>> = vec![Some(1), Some(1), Some(1), Some(1)]; // q=0.5
1179        let l4: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0), Some(0)]; // monomorphic
1180        let matrix: Vec<&[Option<u8>]> =
1181            vec![l1.as_slice(), l2.as_slice(), l3.as_slice(), l4.as_slice()];
1182        let result = diversity(&matrix, 8).unwrap();
1183        assert!(result.pi > 0.0);
1184        assert_eq!(result.segregating_sites, 2);
1185        assert!(result.theta_w > 0.0);
1186    }
1187
1188    #[test]
1189    fn diversity_known_theta_w() {
1190        // S=10, n=20 → theta_w = S / H_{19}
1191        let result = diversity_from_counts(10, 20, 5.0).unwrap();
1192        let h19 = harmonic(19);
1193        assert!((result.theta_w - 10.0 / h19).abs() < TOL);
1194    }
1195
1196    #[test]
1197    fn diversity_zero_segregating() {
1198        let l1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0)];
1199        let matrix: Vec<&[Option<u8>]> = vec![l1.as_slice()];
1200        let result = diversity(&matrix, 6).unwrap();
1201        assert_eq!(result.segregating_sites, 0);
1202        assert!((result.pi - 0.0).abs() < TOL);
1203    }
1204
1205    #[test]
1206    fn diversity_error_n_lt_2() {
1207        assert!(diversity_from_counts(5, 1, 2.0).is_err());
1208    }
1209
1210    // ── Tajima's D ───────────────────────────────────────────────────
1211
1212    #[test]
1213    fn tajimas_d_neutral() {
1214        // When pi ≈ theta_w, D ≈ 0
1215        let n = 20;
1216        let s = 10;
1217        let a1 = harmonic(n - 1);
1218        let theta_w = s as f64 / a1;
1219        let result = tajimas_d(s, n, theta_w).unwrap();
1220        assert!(result.d.abs() < 0.01, "d={}", result.d);
1221    }
1222
1223    #[test]
1224    fn tajimas_d_positive() {
1225        // pi >> theta_w → positive D (balancing selection / contraction)
1226        let result = tajimas_d(5, 20, 10.0).unwrap();
1227        assert!(result.d > 0.0, "d={}", result.d);
1228    }
1229
1230    #[test]
1231    fn tajimas_d_negative() {
1232        // pi << theta_w → negative D (purifying selection / expansion)
1233        let result = tajimas_d(50, 20, 1.0).unwrap();
1234        assert!(result.d < 0.0, "d={}", result.d);
1235    }
1236
1237    #[test]
1238    fn tajimas_d_textbook() {
1239        // Textbook example: n=10, S=12, pi=3.5
1240        let result = tajimas_d(12, 10, 3.5).unwrap();
1241        // D should be negative since pi (3.5) < S/a1 (12/2.829 ≈ 4.24)
1242        assert!(result.d < 0.0, "d={}", result.d);
1243        assert_eq!(result.segregating_sites, 12);
1244        assert_eq!(result.n_sequences, 10);
1245    }
1246
1247    #[test]
1248    fn tajimas_d_n_lt_4_error() {
1249        assert!(tajimas_d(5, 3, 2.0).is_err());
1250    }
1251
1252    #[test]
1253    fn tajimas_d_zero_s_error() {
1254        assert!(tajimas_d(0, 20, 0.0).is_err());
1255    }
1256
1257    // ── Linkage disequilibrium ───────────────────────────────────────
1258
1259    #[test]
1260    fn ld_perfect() {
1261        // Perfect LD: sites perfectly correlated
1262        let site_a: Vec<Option<u8>> = vec![Some(0), Some(0), Some(2), Some(2)];
1263        let site_b: Vec<Option<u8>> = vec![Some(0), Some(0), Some(2), Some(2)];
1264        let result = linkage_disequilibrium(&site_a, &site_b).unwrap();
1265        assert!(result.r_squared > 0.9, "r²={}", result.r_squared);
1266        assert!(result.d_prime > 0.9, "D'={}", result.d_prime);
1267    }
1268
1269    #[test]
1270    fn ld_no_ld() {
1271        // Sites with no LD (independent)
1272        // Create a larger sample to get stable LD estimates
1273        let mut site_a = Vec::new();
1274        let mut site_b = Vec::new();
1275        // AB, Ab, aB, ab equally frequent → no LD
1276        for _ in 0..25 {
1277            site_a.push(Some(0));
1278            site_b.push(Some(0));
1279        }
1280        for _ in 0..25 {
1281            site_a.push(Some(0));
1282            site_b.push(Some(2));
1283        }
1284        for _ in 0..25 {
1285            site_a.push(Some(2));
1286            site_b.push(Some(0));
1287        }
1288        for _ in 0..25 {
1289            site_a.push(Some(2));
1290            site_b.push(Some(2));
1291        }
1292        let result = linkage_disequilibrium(&site_a, &site_b).unwrap();
1293        assert!(result.r_squared < 0.05, "r²={}", result.r_squared);
1294    }
1295
1296    #[test]
1297    fn ld_intermediate() {
1298        // Some LD but not perfect
1299        let site_a: Vec<Option<u8>> =
1300            vec![Some(0), Some(0), Some(0), Some(2), Some(2), Some(1)];
1301        let site_b: Vec<Option<u8>> =
1302            vec![Some(0), Some(0), Some(2), Some(2), Some(2), Some(1)];
1303        let result = linkage_disequilibrium(&site_a, &site_b).unwrap();
1304        assert!(result.r_squared >= 0.0 && result.r_squared <= 1.0);
1305        assert!(result.d_prime >= 0.0 && result.d_prime <= 1.0);
1306    }
1307
1308    #[test]
1309    fn ld_d_prime_vs_r_squared() {
1310        // D' and r² measure different things: D' can be 1 when r² < 1
1311        let site_a: Vec<Option<u8>> =
1312            vec![Some(0), Some(0), Some(0), Some(0), Some(1), Some(1), Some(2)];
1313        let site_b: Vec<Option<u8>> =
1314            vec![Some(0), Some(0), Some(0), Some(1), Some(1), Some(2), Some(2)];
1315        let result = linkage_disequilibrium(&site_a, &site_b).unwrap();
1316        // Both should be in valid range
1317        assert!(result.r_squared >= 0.0 && result.r_squared <= 1.0);
1318        assert!(result.d_prime >= 0.0 && result.d_prime <= 1.0);
1319    }
1320
1321    #[test]
1322    fn ld_with_missing() {
1323        let site_a: Vec<Option<u8>> = vec![Some(0), Some(0), None, Some(2), Some(2)];
1324        let site_b: Vec<Option<u8>> = vec![Some(0), None, Some(0), Some(2), Some(2)];
1325        let result = linkage_disequilibrium(&site_a, &site_b).unwrap();
1326        assert!(result.n_haplotypes > 0);
1327        assert!(result.r_squared >= 0.0);
1328    }
1329
1330    #[test]
1331    fn ld_monomorphic_error() {
1332        let site_a: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0)];
1333        let site_b: Vec<Option<u8>> = vec![Some(0), Some(1), Some(2)];
1334        assert!(linkage_disequilibrium(&site_a, &site_b).is_err());
1335    }
1336
1337    #[test]
1338    fn ld_length_mismatch() {
1339        let site_a: Vec<Option<u8>> = vec![Some(0), Some(1)];
1340        let site_b: Vec<Option<u8>> = vec![Some(0), Some(1), Some(2)];
1341        assert!(linkage_disequilibrium(&site_a, &site_b).is_err());
1342    }
1343
1344    // ── Genotype PCA ─────────────────────────────────────────────────
1345
1346    #[test]
1347    fn pca_basic() {
1348        let s1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(1), Some(2)];
1349        let s2: Vec<Option<u8>> = vec![Some(0), Some(1), Some(1), Some(2)];
1350        let s3: Vec<Option<u8>> = vec![Some(2), Some(2), Some(1), Some(0)];
1351        let s4: Vec<Option<u8>> = vec![Some(2), Some(1), Some(0), Some(0)];
1352        let matrix: Vec<&[Option<u8>]> =
1353            vec![s1.as_slice(), s2.as_slice(), s3.as_slice(), s4.as_slice()];
1354        let result = genotype_pca(&matrix, 2).unwrap();
1355        assert_eq!(result.components.len(), 2);
1356        assert_eq!(result.transformed.len(), 4);
1357        assert_eq!(result.transformed[0].len(), 2);
1358    }
1359
1360    #[test]
1361    fn pca_missing_imputation() {
1362        // Missing values should be imputed without error
1363        let s1: Vec<Option<u8>> = vec![Some(0), None, Some(1)];
1364        let s2: Vec<Option<u8>> = vec![Some(1), Some(1), Some(2)];
1365        let s3: Vec<Option<u8>> = vec![Some(2), Some(2), None];
1366        let matrix: Vec<&[Option<u8>]> = vec![s1.as_slice(), s2.as_slice(), s3.as_slice()];
1367        let result = genotype_pca(&matrix, 2).unwrap();
1368        assert_eq!(result.transformed.len(), 3);
1369    }
1370
1371    #[test]
1372    fn pca_variance() {
1373        // Variance ratios should be non-negative and sum ≤ 1
1374        let s1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(2), Some(2)];
1375        let s2: Vec<Option<u8>> = vec![Some(0), Some(1), Some(1), Some(2)];
1376        let s3: Vec<Option<u8>> = vec![Some(2), Some(2), Some(0), Some(0)];
1377        let s4: Vec<Option<u8>> = vec![Some(1), Some(1), Some(1), Some(1)];
1378        let s5: Vec<Option<u8>> = vec![Some(0), Some(2), Some(0), Some(2)];
1379        let matrix: Vec<&[Option<u8>]> = vec![
1380            s1.as_slice(),
1381            s2.as_slice(),
1382            s3.as_slice(),
1383            s4.as_slice(),
1384            s5.as_slice(),
1385        ];
1386        let result = genotype_pca(&matrix, 3).unwrap();
1387        for r in &result.explained_variance_ratio {
1388            assert!(*r >= 0.0);
1389        }
1390        let total: f64 = result.explained_variance_ratio.iter().sum();
1391        assert!(total <= 1.0 + 0.01);
1392    }
1393
1394    #[test]
1395    fn pca_population_structure() {
1396        // Two distinct populations should separate along PC1
1397        let mut matrix: Vec<Vec<Option<u8>>> = Vec::new();
1398        // Pop A: mostly 0s
1399        for _ in 0..10 {
1400            matrix.push(vec![Some(0), Some(0), Some(0), Some(0), Some(1)]);
1401        }
1402        // Pop B: mostly 2s
1403        for _ in 0..10 {
1404            matrix.push(vec![Some(2), Some(2), Some(2), Some(2), Some(1)]);
1405        }
1406        let refs: Vec<&[Option<u8>]> = matrix.iter().map(|v| v.as_slice()).collect();
1407        let result = genotype_pca(&refs, 2).unwrap();
1408        // PC1 should explain most variance
1409        assert!(result.explained_variance_ratio[0] > 0.5);
1410        // Pop A and Pop B should be on opposite sides of PC1
1411        let mean_a: f64 = result.transformed[..10].iter().map(|t| t[0]).sum::<f64>() / 10.0;
1412        let mean_b: f64 = result.transformed[10..].iter().map(|t| t[0]).sum::<f64>() / 10.0;
1413        assert!((mean_a - mean_b).abs() > 0.1);
1414    }
1415
1416    #[test]
1417    fn pca_empty_error() {
1418        let matrix: Vec<&[Option<u8>]> = vec![];
1419        assert!(genotype_pca(&matrix, 1).is_err());
1420    }
1421
1422    // ── Edge cases ───────────────────────────────────────────────────
1423
1424    #[test]
1425    fn u8_convenience() {
1426        let g: Vec<u8> = vec![0, 1, 2, 255, 0];
1427        let af = allele_frequencies_u8(&g).unwrap();
1428        assert_eq!(af.missing_count, 1);
1429        assert_eq!(af.allele_count, 8);
1430    }
1431
1432    #[test]
1433    fn all_missing_error() {
1434        let g = [None, None, None];
1435        assert!(allele_frequencies(&g).is_err());
1436    }
1437
1438    #[test]
1439    fn af_consistency() {
1440        // freq_ref + freq_alt should always sum to 1
1441        let g = [Some(0), Some(0), Some(1), Some(2), Some(2), Some(2)];
1442        let af = allele_frequencies(&g).unwrap();
1443        assert!((af.freq_ref + af.freq_alt - 1.0).abs() < TOL);
1444    }
1445}
cyanea_stats/popgen.rs

cyanea_stats/
popgen.rs