1#![allow(non_snake_case)]
13
14use cyanea_core::{CyaneaError, Result};
15
16use crate::distribution::{ChiSquared, Distribution};
17
18#[derive(Debug, Clone)]
22pub struct AlleleFrequencies {
23 pub freq_ref: f64,
25 pub freq_alt: f64,
27 pub allele_count: usize,
29 pub missing_count: usize,
31 pub observed_het: f64,
33 pub expected_het: f64,
35}
36
37#[derive(Debug, Clone)]
39pub struct HweResult {
40 pub chi_squared: f64,
42 pub p_value: f64,
44 pub observed: [usize; 3],
46 pub expected: [f64; 3],
48 pub inbreeding_coeff: f64,
50}
51
52#[derive(Debug, Clone, Copy, PartialEq, Eq)]
54pub enum FstMethod {
55 WeirCockerham,
57 Hudson,
59}
60
61#[derive(Debug, Clone)]
63pub struct FstResult {
64 pub fst: f64,
66 pub method: FstMethod,
68}
69
70#[derive(Debug, Clone)]
72pub struct DiversityStats {
73 pub pi: f64,
75 pub theta_w: f64,
77 pub segregating_sites: usize,
79 pub n_sequences: usize,
81}
82
83#[derive(Debug, Clone)]
85pub struct TajimaD {
86 pub d: f64,
88 pub pi: f64,
90 pub theta_w: f64,
92 pub segregating_sites: usize,
94 pub n_sequences: usize,
96}
97
98#[derive(Debug, Clone)]
100pub struct LdResult {
101 pub r_squared: f64,
103 pub d_prime: f64,
105 pub d: f64,
107 pub n_haplotypes: usize,
109}
110
111fn count_genotypes(genotypes: &[Option<u8>]) -> Result<(usize, usize, usize, usize)> {
115 let mut n0 = 0usize;
116 let mut n1 = 0usize;
117 let mut n2 = 0usize;
118 let mut n_miss = 0usize;
119 for (i, g) in genotypes.iter().enumerate() {
120 match g {
121 Some(0) => n0 += 1,
122 Some(1) => n1 += 1,
123 Some(2) => n2 += 1,
124 None => n_miss += 1,
125 Some(v) => {
126 return Err(CyaneaError::InvalidInput(format!(
127 "invalid genotype {} at index {} (expected 0, 1, or 2)",
128 v, i
129 )));
130 }
131 }
132 }
133 Ok((n0, n1, n2, n_miss))
134}
135
136fn allele_freq_from_counts(n0: usize, n1: usize, n2: usize) -> (f64, f64) {
138 let total = 2 * (n0 + n1 + n2);
139 if total == 0 {
140 return (0.0, 0.0);
141 }
142 let p = (2 * n0 + n1) as f64 / total as f64;
143 let q = (2 * n2 + n1) as f64 / total as f64;
144 (p, q)
145}
146
147fn harmonic(n: usize) -> f64 {
149 (1..=n).map(|i| 1.0 / i as f64).sum()
150}
151
152fn harmonic_sq(n: usize) -> f64 {
154 (1..=n).map(|i| 1.0 / (i as f64 * i as f64)).sum()
155}
156
157fn em_haplotype_freqs(
161 site_a: &[Option<u8>],
162 site_b: &[Option<u8>],
163 max_iter: usize,
164 tol: f64,
165) -> Result<(f64, f64, f64, f64)> {
166 let mut counts = [[0usize; 3]; 3]; let mut n = 0usize;
169 for (ga, gb) in site_a.iter().zip(site_b.iter()) {
170 if let (Some(a), Some(b)) = (ga, gb) {
171 if *a > 2 || *b > 2 {
172 return Err(CyaneaError::InvalidInput(
173 "invalid genotype value (expected 0, 1, or 2)".into(),
174 ));
175 }
176 counts[*a as usize][*b as usize] += 1;
177 n += 1;
178 }
179 }
180
181 if n == 0 {
182 return Err(CyaneaError::InvalidInput(
183 "no non-missing genotype pairs".into(),
184 ));
185 }
186
187 let mut a_alt = 0usize;
189 let mut b_alt = 0usize;
190 for ga in 0..3 {
191 for gb in 0..3 {
192 a_alt += ga * counts[ga][gb];
193 b_alt += gb * counts[ga][gb];
194 }
195 }
196 let pa = a_alt as f64 / (2 * n) as f64; let pb = b_alt as f64 / (2 * n) as f64; let mut p_ab = pa * pb; let mut p_aB = pa * (1.0 - pb);
202 let mut p_Ab = (1.0 - pa) * pb;
203 let mut p_AB = (1.0 - pa) * (1.0 - pb);
204
205 for _ in 0..max_iter {
206 let mut h_AB = 0.0;
208 let mut h_Ab = 0.0;
209 let mut h_aB = 0.0;
210 let mut h_ab = 0.0;
211
212 for ga in 0..3usize {
213 for gb in 0..3usize {
214 let c = counts[ga][gb] as f64;
215 if c == 0.0 {
216 continue;
217 }
218 match (ga, gb) {
220 (0, 0) => h_AB += 2.0 * c, (0, 1) => {
222 h_AB += c;
223 h_Ab += c;
224 } (0, 2) => h_Ab += 2.0 * c, (1, 0) => {
227 h_AB += c;
228 h_aB += c;
229 } (1, 1) => {
231 let denom = p_AB * p_ab + p_Ab * p_aB;
233 if denom > 0.0 {
234 let w = p_AB * p_ab / denom;
235 h_AB += c * w;
236 h_ab += c * w;
237 h_Ab += c * (1.0 - w);
238 h_aB += c * (1.0 - w);
239 } else {
240 h_AB += 0.5 * c;
242 h_ab += 0.5 * c;
243 h_Ab += 0.5 * c;
244 h_aB += 0.5 * c;
245 }
246 }
247 (1, 2) => {
248 h_Ab += c;
249 h_ab += c;
250 } (2, 0) => h_aB += 2.0 * c, (2, 1) => {
253 h_aB += c;
254 h_ab += c;
255 } (2, 2) => h_ab += 2.0 * c, _ => {}
258 }
259 }
260 }
261
262 let total = h_AB + h_Ab + h_aB + h_ab;
264 let new_AB = h_AB / total;
265 let new_Ab = h_Ab / total;
266 let new_aB = h_aB / total;
267 let new_ab = h_ab / total;
268
269 let diff = (new_AB - p_AB).abs()
270 + (new_Ab - p_Ab).abs()
271 + (new_aB - p_aB).abs()
272 + (new_ab - p_ab).abs();
273
274 p_AB = new_AB;
275 p_Ab = new_Ab;
276 p_aB = new_aB;
277 p_ab = new_ab;
278
279 if diff < tol {
280 break;
281 }
282 }
283
284 Ok((p_AB, p_Ab, p_aB, p_ab))
285}
286
287pub fn allele_frequencies(genotypes: &[Option<u8>]) -> Result<AlleleFrequencies> {
296 if genotypes.is_empty() {
297 return Err(CyaneaError::InvalidInput("empty genotype array".into()));
298 }
299 let (n0, n1, n2, n_miss) = count_genotypes(genotypes)?;
300 let n_called = n0 + n1 + n2;
301 if n_called == 0 {
302 return Err(CyaneaError::InvalidInput(
303 "all genotypes are missing".into(),
304 ));
305 }
306 let (p, q) = allele_freq_from_counts(n0, n1, n2);
307 let obs_het = n1 as f64 / n_called as f64;
308 let exp_het = 2.0 * p * q;
309
310 Ok(AlleleFrequencies {
311 freq_ref: p,
312 freq_alt: q,
313 allele_count: 2 * n_called,
314 missing_count: n_miss,
315 observed_het: obs_het,
316 expected_het: exp_het,
317 })
318}
319
320pub fn allele_frequencies_u8(genotypes: &[u8]) -> Result<AlleleFrequencies> {
323 let opts: Vec<Option<u8>> = genotypes
324 .iter()
325 .map(|&g| if g == 255 { None } else { Some(g) })
326 .collect();
327 allele_frequencies(&opts)
328}
329
330pub fn hwe_test(genotypes: &[Option<u8>]) -> Result<HweResult> {
339 if genotypes.is_empty() {
340 return Err(CyaneaError::InvalidInput("empty genotype array".into()));
341 }
342 let (n0, n1, n2, _) = count_genotypes(genotypes)?;
343 let n = n0 + n1 + n2;
344 if n == 0 {
345 return Err(CyaneaError::InvalidInput(
346 "all genotypes are missing".into(),
347 ));
348 }
349 let (p, q) = allele_freq_from_counts(n0, n1, n2);
350 if p == 0.0 || q == 0.0 {
351 return Err(CyaneaError::InvalidInput(
352 "monomorphic site: HWE test requires polymorphism".into(),
353 ));
354 }
355
356 let n_f = n as f64;
357 let exp = [p * p * n_f, 2.0 * p * q * n_f, q * q * n_f];
358 let obs = [n0, n1, n2];
359
360 let chi_sq: f64 = obs
361 .iter()
362 .zip(exp.iter())
363 .map(|(&o, &e)| {
364 if e > 0.0 {
365 (o as f64 - e).powi(2) / e
366 } else {
367 0.0
368 }
369 })
370 .sum();
371
372 let chi2_dist = ChiSquared::new(1.0)?;
373 let p_value = 1.0 - chi2_dist.cdf(chi_sq);
374
375 let obs_het = n1 as f64 / n_f;
376 let exp_het = 2.0 * p * q;
377 let f = if exp_het > 0.0 {
378 1.0 - obs_het / exp_het
379 } else {
380 0.0
381 };
382
383 Ok(HweResult {
384 chi_squared: chi_sq,
385 p_value,
386 observed: obs,
387 expected: exp,
388 inbreeding_coeff: f,
389 })
390}
391
392pub fn fst_weir_cockerham(
401 pop1: &[&[Option<u8>]],
402 pop2: &[&[Option<u8>]],
403) -> Result<FstResult> {
404 if pop1.is_empty() || pop2.is_empty() {
405 return Err(CyaneaError::InvalidInput("empty loci or populations".into()));
406 }
407
408 let mut sum_a = 0.0;
409 let mut sum_abc = 0.0;
410
411 for (locus1, locus2) in pop1.iter().zip(pop2.iter()) {
412 let (n0_1, n1_1, n2_1, _) = count_genotypes(locus1)?;
413 let (n0_2, n1_2, n2_2, _) = count_genotypes(locus2)?;
414 let ni1 = (n0_1 + n1_1 + n2_1) as f64;
415 let ni2 = (n0_2 + n1_2 + n2_2) as f64;
416 if ni1 == 0.0 || ni2 == 0.0 {
417 continue;
418 }
419
420 let r = 2.0; let n_bar = (ni1 + ni2) / r;
422 let n_total = ni1 + ni2;
423
424 let (_, q1) = allele_freq_from_counts(n0_1, n1_1, n2_1);
425 let (_, q2) = allele_freq_from_counts(n0_2, n1_2, n2_2);
426
427 let p_bar = (ni1 * q1 + ni2 * q2) / n_total;
428
429 let s_sq = (ni1 * (q1 - p_bar).powi(2) + ni2 * (q2 - p_bar).powi(2))
430 / ((r - 1.0) * n_bar);
431
432 let h_bar = (ni1 * n1_1 as f64 / ni1 + ni2 * n1_2 as f64 / ni2) / n_total;
433
434 let nc = n_total - (ni1 * ni1 + ni2 * ni2) / n_total;
435 let nc = nc / (r - 1.0);
436
437 let a = (n_bar / nc)
439 * (s_sq - (1.0 / (n_bar - 1.0)) * (p_bar * (1.0 - p_bar) - (r - 1.0) / r * s_sq - 0.25 * h_bar));
440
441 let b = (n_bar / (n_bar - 1.0))
442 * (p_bar * (1.0 - p_bar) - (r - 1.0) / r * s_sq
443 - (2.0 * n_bar - 1.0) / (4.0 * n_bar) * h_bar);
444
445 let c = 0.5 * h_bar;
446
447 sum_a += a;
448 sum_abc += a + b + c;
449 }
450
451 let fst = if sum_abc > 0.0 {
452 (sum_a / sum_abc).clamp(0.0, 1.0)
453 } else {
454 0.0
455 };
456
457 Ok(FstResult {
458 fst,
459 method: FstMethod::WeirCockerham,
460 })
461}
462
463pub fn fst_hudson(
471 pop1: &[&[Option<u8>]],
472 pop2: &[&[Option<u8>]],
473) -> Result<FstResult> {
474 if pop1.is_empty() || pop2.is_empty() {
475 return Err(CyaneaError::InvalidInput("empty loci or populations".into()));
476 }
477
478 let mut sum_num = 0.0;
479 let mut sum_den = 0.0;
480
481 for (locus1, locus2) in pop1.iter().zip(pop2.iter()) {
482 let (n0_1, n1_1, n2_1, _) = count_genotypes(locus1)?;
483 let (n0_2, n1_2, n2_2, _) = count_genotypes(locus2)?;
484 let n1 = (n0_1 + n1_1 + n2_1) as f64;
485 let n2_pop = (n0_2 + n1_2 + n2_2) as f64;
486 if n1 == 0.0 || n2_pop == 0.0 {
487 continue;
488 }
489
490 let (_, q1) = allele_freq_from_counts(n0_1, n1_1, n2_1);
491 let (_, q2) = allele_freq_from_counts(n0_2, n1_2, n2_2);
492
493 let hw1 = 2.0 * q1 * (1.0 - q1) * n1 / (n1 - 1.0);
495 let hw2 = 2.0 * q2 * (1.0 - q2) * n2_pop / (n2_pop - 1.0);
496 let hw = (hw1 + hw2) / 2.0;
497
498 let hb = q1 * (1.0 - q2) + q2 * (1.0 - q1);
500
501 sum_num += hb - hw;
503 sum_den += hb;
504 }
505
506 let fst = if sum_den > 0.0 {
507 (sum_num / sum_den).clamp(0.0, 1.0)
508 } else {
509 0.0
510 };
511
512 Ok(FstResult {
513 fst,
514 method: FstMethod::Hudson,
515 })
516}
517
518pub fn fst_multi_population(populations: &[&[&[Option<u8>]]]) -> Result<FstResult> {
527 let r = populations.len();
528 if r < 2 {
529 return Err(CyaneaError::InvalidInput(
530 "need at least 2 populations for Fst".into(),
531 ));
532 }
533 let n_loci = populations[0].len();
534 if n_loci == 0 {
535 return Err(CyaneaError::InvalidInput("no loci provided".into()));
536 }
537 for pop in populations.iter() {
538 if pop.len() != n_loci {
539 return Err(CyaneaError::InvalidInput(
540 "all populations must have the same number of loci".into(),
541 ));
542 }
543 }
544
545 let mut sum_a = 0.0;
546 let mut sum_abc = 0.0;
547 let r_f = r as f64;
548
549 for locus_idx in 0..n_loci {
550 let mut ni = Vec::with_capacity(r);
551 let mut qi = Vec::with_capacity(r);
552 let mut hi = Vec::with_capacity(r);
553
554 for pop in populations.iter() {
555 let (n0, n1, n2, _) = count_genotypes(pop[locus_idx])?;
556 let n = (n0 + n1 + n2) as f64;
557 if n == 0.0 {
558 ni.push(0.0);
559 qi.push(0.0);
560 hi.push(0.0);
561 continue;
562 }
563 let (_, q) = allele_freq_from_counts(n0, n1, n2);
564 ni.push(n);
565 qi.push(q);
566 hi.push(n1 as f64 / n);
567 }
568
569 let n_total: f64 = ni.iter().sum();
570 if n_total == 0.0 {
571 continue;
572 }
573
574 let n_bar = n_total / r_f;
575 let p_bar: f64 = ni.iter().zip(qi.iter()).map(|(n, q)| n * q).sum::<f64>() / n_total;
576 let s_sq: f64 = ni
577 .iter()
578 .zip(qi.iter())
579 .map(|(n, q)| n * (q - p_bar).powi(2))
580 .sum::<f64>()
581 / ((r_f - 1.0) * n_bar);
582 let h_bar: f64 = ni.iter().zip(hi.iter()).map(|(n, h)| n * h).sum::<f64>() / n_total;
583
584 let nc = (n_total - ni.iter().map(|n| n * n).sum::<f64>() / n_total) / (r_f - 1.0);
585
586 let a = (n_bar / nc)
587 * (s_sq
588 - (1.0 / (n_bar - 1.0))
589 * (p_bar * (1.0 - p_bar) - (r_f - 1.0) / r_f * s_sq - 0.25 * h_bar));
590
591 let b = (n_bar / (n_bar - 1.0))
592 * (p_bar * (1.0 - p_bar) - (r_f - 1.0) / r_f * s_sq
593 - (2.0 * n_bar - 1.0) / (4.0 * n_bar) * h_bar);
594
595 let c = 0.5 * h_bar;
596
597 sum_a += a;
598 sum_abc += a + b + c;
599 }
600
601 let fst = if sum_abc > 0.0 {
602 (sum_a / sum_abc).clamp(0.0, 1.0)
603 } else {
604 0.0
605 };
606
607 Ok(FstResult {
608 fst,
609 method: FstMethod::WeirCockerham,
610 })
611}
612
613pub fn diversity(
624 genotype_matrix: &[&[Option<u8>]],
625 n_sequences: usize,
626) -> Result<DiversityStats> {
627 if n_sequences < 2 {
628 return Err(CyaneaError::InvalidInput(
629 "need at least 2 sequences for diversity".into(),
630 ));
631 }
632 if genotype_matrix.is_empty() {
633 return Err(CyaneaError::InvalidInput("no loci provided".into()));
634 }
635
636 let mut seg_sites = 0usize;
637 let mut total_pi = 0.0;
638
639 for locus in genotype_matrix {
640 let (n0, n1, n2, _) = count_genotypes(locus)?;
641 let n_called = n0 + n1 + n2;
642 if n_called == 0 {
643 continue;
644 }
645 let (_, q) = allele_freq_from_counts(n0, n1, n2);
646 if q > 0.0 && q < 1.0 {
647 seg_sites += 1;
648 }
649 let n_hap = (2 * n_called) as f64;
651 let pi_site = 2.0 * q * (1.0 - q) * n_hap / (n_hap - 1.0);
652 total_pi += pi_site;
653 }
654
655 let n_loci = genotype_matrix.len() as f64;
656 let pi = total_pi / n_loci;
657
658 let a1 = harmonic(n_sequences - 1);
659 let theta_w = if a1 > 0.0 {
660 (seg_sites as f64 / n_loci) / a1
661 } else {
662 0.0
663 };
664
665 Ok(DiversityStats {
666 pi,
667 theta_w,
668 segregating_sites: seg_sites,
669 n_sequences,
670 })
671}
672
673pub fn diversity_from_counts(
682 segregating_sites: usize,
683 n_sequences: usize,
684 avg_pairwise_diff: f64,
685) -> Result<DiversityStats> {
686 if n_sequences < 2 {
687 return Err(CyaneaError::InvalidInput(
688 "need at least 2 sequences for diversity".into(),
689 ));
690 }
691
692 let a1 = harmonic(n_sequences - 1);
693 let theta_w = if a1 > 0.0 {
694 segregating_sites as f64 / a1
695 } else {
696 0.0
697 };
698
699 Ok(DiversityStats {
700 pi: avg_pairwise_diff,
701 theta_w,
702 segregating_sites,
703 n_sequences,
704 })
705}
706
707pub fn tajimas_d(
719 segregating_sites: usize,
720 n_sequences: usize,
721 avg_pairwise_diff: f64,
722) -> Result<TajimaD> {
723 if n_sequences < 4 {
724 return Err(CyaneaError::InvalidInput(
725 "Tajima's D requires at least 4 sequences".into(),
726 ));
727 }
728 if segregating_sites == 0 {
729 return Err(CyaneaError::InvalidInput(
730 "Tajima's D undefined with 0 segregating sites".into(),
731 ));
732 }
733
734 let n = n_sequences as f64;
735 let s = segregating_sites as f64;
736
737 let a1 = harmonic(n_sequences - 1);
738 let a2 = harmonic_sq(n_sequences - 1);
739
740 let theta_w = s / a1;
741
742 let b1 = (n + 1.0) / (3.0 * (n - 1.0));
744 let b2 = 2.0 * (n * n + n + 3.0) / (9.0 * n * (n - 1.0));
745
746 let c1 = b1 - 1.0 / a1;
747 let c2 = b2 - (n + 2.0) / (a1 * n) + a2 / (a1 * a1);
748
749 let e1 = c1 / a1;
750 let e2 = c2 / (a1 * a1 + a2);
751
752 let var_d = e1 * s + e2 * s * (s - 1.0);
753
754 let d = if var_d > 0.0 {
755 (avg_pairwise_diff - theta_w) / var_d.sqrt()
756 } else {
757 0.0
758 };
759
760 Ok(TajimaD {
761 d,
762 pi: avg_pairwise_diff,
763 theta_w,
764 segregating_sites,
765 n_sequences,
766 })
767}
768
769pub fn linkage_disequilibrium(
779 site_a: &[Option<u8>],
780 site_b: &[Option<u8>],
781) -> Result<LdResult> {
782 if site_a.len() != site_b.len() {
783 return Err(CyaneaError::InvalidInput(
784 "site vectors must have the same length".into(),
785 ));
786 }
787 if site_a.is_empty() {
788 return Err(CyaneaError::InvalidInput("empty site vectors".into()));
789 }
790
791 let mut n_pairs = 0usize;
793 for (a, b) in site_a.iter().zip(site_b.iter()) {
794 if a.is_some() && b.is_some() {
795 n_pairs += 1;
796 }
797 }
798 if n_pairs == 0 {
799 return Err(CyaneaError::InvalidInput(
800 "no non-missing genotype pairs".into(),
801 ));
802 }
803
804 let af_a = allele_frequencies(site_a)?;
806 let af_b = allele_frequencies(site_b)?;
807 if af_a.freq_alt == 0.0 || af_a.freq_alt == 1.0 {
808 return Err(CyaneaError::InvalidInput(
809 "site A is monomorphic: LD undefined".into(),
810 ));
811 }
812 if af_b.freq_alt == 0.0 || af_b.freq_alt == 1.0 {
813 return Err(CyaneaError::InvalidInput(
814 "site B is monomorphic: LD undefined".into(),
815 ));
816 }
817
818 let (p_AB, _p_Ab, _p_aB, _p_ab) = em_haplotype_freqs(site_a, site_b, 1000, 1e-10)?;
819
820 let p_a = af_a.freq_alt; let p_A = af_a.freq_ref;
823 let p_b = af_b.freq_alt; let p_B = af_b.freq_ref;
825
826 let d = p_AB - p_A * p_B;
828
829 let d_max = if d >= 0.0 {
831 (p_A * p_b).min(p_a * p_B)
832 } else {
833 (p_A * p_B).min(p_a * p_b)
834 };
835 let d_prime = if d_max.abs() > 0.0 {
836 (d / d_max).abs().min(1.0)
837 } else {
838 0.0
839 };
840
841 let denom = p_A * p_a * p_B * p_b;
843 let r_squared = if denom > 0.0 {
844 (d * d / denom).min(1.0)
845 } else {
846 0.0
847 };
848
849 Ok(LdResult {
850 r_squared,
851 d_prime,
852 d,
853 n_haplotypes: 2 * n_pairs,
854 })
855}
856
857pub fn genotype_pca(
868 genotype_matrix: &[&[Option<u8>]],
869 n_components: usize,
870) -> Result<crate::reduction::PcaResult> {
871 if genotype_matrix.is_empty() {
872 return Err(CyaneaError::InvalidInput(
873 "empty genotype matrix".into(),
874 ));
875 }
876 let n_features = genotype_matrix[0].len();
877 if n_features == 0 {
878 return Err(CyaneaError::InvalidInput("zero loci".into()));
879 }
880 for (i, row) in genotype_matrix.iter().enumerate() {
881 if row.len() != n_features {
882 return Err(CyaneaError::InvalidInput(format!(
883 "sample {} has {} loci, expected {}",
884 i,
885 row.len(),
886 n_features
887 )));
888 }
889 }
890
891 let n_samples = genotype_matrix.len();
892
893 let mut site_sum = vec![0.0; n_features];
895 let mut site_count = vec![0usize; n_features];
896 for row in genotype_matrix {
897 for (j, g) in row.iter().enumerate() {
898 if let Some(v) = g {
899 site_sum[j] += *v as f64;
900 site_count[j] += 1;
901 }
902 }
903 }
904 let site_mean: Vec<f64> = (0..n_features)
905 .map(|j| {
906 if site_count[j] > 0 {
907 site_sum[j] / site_count[j] as f64
908 } else {
909 0.0
910 }
911 })
912 .collect();
913
914 let mut data: Vec<Vec<f64>> = Vec::with_capacity(n_samples);
916 for row in genotype_matrix {
917 let imputed: Vec<f64> = row
918 .iter()
919 .enumerate()
920 .map(|(j, g)| match g {
921 Some(v) => *v as f64,
922 None => site_mean[j],
923 })
924 .collect();
925 data.push(imputed);
926 }
927
928 let refs: Vec<&[f64]> = data.iter().map(|v| v.as_slice()).collect();
929 crate::reduction::pca(&refs, n_components)
930}
931
932#[cfg(test)]
935mod tests {
936 use super::*;
937
938 const TOL: f64 = 1e-6;
939
940 #[test]
943 fn af_basic() {
944 let g = [Some(0), Some(0), Some(1), Some(1), Some(2)];
946 let af = allele_frequencies(&g).unwrap();
947 assert!((af.freq_ref - 0.6).abs() < TOL);
948 assert!((af.freq_alt - 0.4).abs() < TOL);
949 assert_eq!(af.allele_count, 10);
950 assert_eq!(af.missing_count, 0);
951 assert!((af.observed_het - 0.4).abs() < TOL);
952 }
953
954 #[test]
955 fn af_all_hom_ref() {
956 let g = [Some(0), Some(0), Some(0)];
957 let af = allele_frequencies(&g).unwrap();
958 assert!((af.freq_ref - 1.0).abs() < TOL);
959 assert!((af.freq_alt - 0.0).abs() < TOL);
960 assert!((af.observed_het - 0.0).abs() < TOL);
961 }
962
963 #[test]
964 fn af_all_het() {
965 let g = [Some(1), Some(1), Some(1), Some(1)];
966 let af = allele_frequencies(&g).unwrap();
967 assert!((af.freq_ref - 0.5).abs() < TOL);
968 assert!((af.freq_alt - 0.5).abs() < TOL);
969 assert!((af.observed_het - 1.0).abs() < TOL);
970 }
971
972 #[test]
973 fn af_with_missing() {
974 let g = [Some(0), None, Some(1), Some(2), None];
975 let af = allele_frequencies(&g).unwrap();
976 assert_eq!(af.missing_count, 2);
977 assert_eq!(af.allele_count, 6);
978 }
979
980 #[test]
981 fn af_empty_error() {
982 let g: [Option<u8>; 0] = [];
983 assert!(allele_frequencies(&g).is_err());
984 }
985
986 #[test]
987 fn af_invalid_genotype() {
988 let g = [Some(0), Some(3), Some(1)];
989 assert!(allele_frequencies(&g).is_err());
990 }
991
992 #[test]
995 fn hwe_perfect_equilibrium() {
996 let mut g = Vec::new();
999 for _ in 0..25 {
1000 g.push(Some(0));
1001 }
1002 for _ in 0..50 {
1003 g.push(Some(1));
1004 }
1005 for _ in 0..25 {
1006 g.push(Some(2));
1007 }
1008 let result = hwe_test(&g).unwrap();
1009 assert!(result.chi_squared < 0.01, "chi_sq={}", result.chi_squared);
1010 assert!(result.p_value > 0.9);
1011 assert!(result.inbreeding_coeff.abs() < 0.01);
1012 }
1013
1014 #[test]
1015 fn hwe_excess_heterozygosity() {
1016 let g: Vec<Option<u8>> = vec![Some(1); 100];
1018 let result = hwe_test(&g).unwrap();
1019 assert!(result.chi_squared > 10.0);
1020 assert!(result.p_value < 0.01);
1021 assert!(result.inbreeding_coeff < -0.5);
1022 }
1023
1024 #[test]
1025 fn hwe_deficit_het() {
1026 let mut g = Vec::new();
1028 for _ in 0..50 {
1029 g.push(Some(0));
1030 }
1031 for _ in 0..50 {
1032 g.push(Some(2));
1033 }
1034 let result = hwe_test(&g).unwrap();
1035 assert!(result.chi_squared > 50.0);
1036 assert!(result.inbreeding_coeff > 0.9);
1037 }
1038
1039 #[test]
1040 fn hwe_textbook() {
1041 let mut g = Vec::new();
1044 for _ in 0..298 {
1045 g.push(Some(0));
1046 }
1047 for _ in 0..489 {
1048 g.push(Some(1));
1049 }
1050 for _ in 0..213 {
1051 g.push(Some(2));
1052 }
1053 let result = hwe_test(&g).unwrap();
1054 assert!(result.chi_squared < 10.0);
1057 }
1058
1059 #[test]
1060 fn hwe_monomorphic_error() {
1061 let g = [Some(0), Some(0), Some(0)];
1062 assert!(hwe_test(&g).is_err());
1063 }
1064
1065 #[test]
1066 fn hwe_with_missing() {
1067 let mut g: Vec<Option<u8>> = Vec::new();
1068 for _ in 0..20 {
1069 g.push(Some(0));
1070 }
1071 for _ in 0..40 {
1072 g.push(Some(1));
1073 }
1074 for _ in 0..20 {
1075 g.push(Some(2));
1076 }
1077 for _ in 0..10 {
1078 g.push(None);
1079 }
1080 let result = hwe_test(&g).unwrap();
1081 assert!(result.p_value > 0.5);
1082 }
1083
1084 #[test]
1087 fn fst_identical_populations() {
1088 let locus: Vec<Option<u8>> = vec![Some(0), Some(1), Some(1), Some(2)];
1089 let pop1: Vec<&[Option<u8>]> = vec![locus.as_slice()];
1090 let pop2: Vec<&[Option<u8>]> = vec![locus.as_slice()];
1091 let result = fst_weir_cockerham(&pop1, &pop2).unwrap();
1092 assert!(result.fst < 0.05, "fst={}", result.fst);
1093 }
1094
1095 #[test]
1096 fn fst_fully_differentiated() {
1097 let pop1_locus: Vec<Option<u8>> = vec![Some(0); 50];
1098 let pop2_locus: Vec<Option<u8>> = vec![Some(2); 50];
1099 let pop1: Vec<&[Option<u8>]> = vec![pop1_locus.as_slice()];
1100 let pop2: Vec<&[Option<u8>]> = vec![pop2_locus.as_slice()];
1101 let result = fst_weir_cockerham(&pop1, &pop2).unwrap();
1102 assert!(result.fst > 0.8, "fst={}", result.fst);
1103 }
1104
1105 #[test]
1106 fn fst_wc_known() {
1107 let pop1_locus: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0), Some(1), Some(1)];
1109 let pop2_locus: Vec<Option<u8>> = vec![Some(1), Some(1), Some(2), Some(2), Some(2)];
1110 let pop1: Vec<&[Option<u8>]> = vec![pop1_locus.as_slice()];
1111 let pop2: Vec<&[Option<u8>]> = vec![pop2_locus.as_slice()];
1112 let result = fst_weir_cockerham(&pop1, &pop2).unwrap();
1113 assert!(result.fst > 0.1 && result.fst < 0.9, "fst={}", result.fst);
1114 assert_eq!(result.method, FstMethod::WeirCockerham);
1115 }
1116
1117 #[test]
1118 fn fst_hudson_known() {
1119 let pop1_locus: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0), Some(1), Some(1)];
1120 let pop2_locus: Vec<Option<u8>> = vec![Some(1), Some(1), Some(2), Some(2), Some(2)];
1121 let pop1: Vec<&[Option<u8>]> = vec![pop1_locus.as_slice()];
1122 let pop2: Vec<&[Option<u8>]> = vec![pop2_locus.as_slice()];
1123 let result = fst_hudson(&pop1, &pop2).unwrap();
1124 assert!(result.fst > 0.1 && result.fst < 0.9, "fst={}", result.fst);
1125 assert_eq!(result.method, FstMethod::Hudson);
1126 }
1127
1128 #[test]
1129 fn fst_multi_locus() {
1130 let p1_l1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(1), Some(1)];
1132 let p1_l2: Vec<Option<u8>> = vec![Some(0), Some(1), Some(1), Some(2)];
1133 let p2_l1: Vec<Option<u8>> = vec![Some(1), Some(2), Some(2), Some(2)];
1134 let p2_l2: Vec<Option<u8>> = vec![Some(1), Some(1), Some(2), Some(2)];
1135 let pop1: Vec<&[Option<u8>]> = vec![p1_l1.as_slice(), p1_l2.as_slice()];
1136 let pop2: Vec<&[Option<u8>]> = vec![p2_l1.as_slice(), p2_l2.as_slice()];
1137 let result = fst_weir_cockerham(&pop1, &pop2).unwrap();
1138 assert!(result.fst >= 0.0 && result.fst <= 1.0);
1139 }
1140
1141 #[test]
1142 fn fst_three_populations() {
1143 let l1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(1)];
1144 let l2: Vec<Option<u8>> = vec![Some(1), Some(2), Some(2)];
1145 let l3: Vec<Option<u8>> = vec![Some(0), Some(1), Some(2)];
1146 let pop1: Vec<&[Option<u8>]> = vec![l1.as_slice()];
1147 let pop2: Vec<&[Option<u8>]> = vec![l2.as_slice()];
1148 let pop3: Vec<&[Option<u8>]> = vec![l3.as_slice()];
1149 let pops: Vec<&[&[Option<u8>]]> = vec![pop1.as_slice(), pop2.as_slice(), pop3.as_slice()];
1150 let result = super::fst_multi_population(&pops).unwrap();
1151 assert!(result.fst >= 0.0 && result.fst <= 1.0);
1152 }
1153
1154 #[test]
1155 fn fst_unequal_sizes() {
1156 let pop1_locus: Vec<Option<u8>> = vec![Some(0), Some(1)];
1157 let pop2_locus: Vec<Option<u8>> = vec![Some(1), Some(2), Some(2), Some(2), Some(2)];
1158 let pop1: Vec<&[Option<u8>]> = vec![pop1_locus.as_slice()];
1159 let pop2: Vec<&[Option<u8>]> = vec![pop2_locus.as_slice()];
1160 let result = fst_hudson(&pop1, &pop2).unwrap();
1161 assert!(result.fst >= 0.0 && result.fst <= 1.0);
1162 }
1163
1164 #[test]
1165 fn fst_error_empty() {
1166 let pop1: Vec<&[Option<u8>]> = vec![];
1167 let pop2: Vec<&[Option<u8>]> = vec![];
1168 assert!(fst_weir_cockerham(&pop1, &pop2).is_err());
1169 }
1170
1171 #[test]
1174 fn diversity_known_pi() {
1175 let l1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(1), Some(2)]; let l2: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0), Some(0)]; let l3: Vec<Option<u8>> = vec![Some(1), Some(1), Some(1), Some(1)]; let l4: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0), Some(0)]; let matrix: Vec<&[Option<u8>]> =
1181 vec![l1.as_slice(), l2.as_slice(), l3.as_slice(), l4.as_slice()];
1182 let result = diversity(&matrix, 8).unwrap();
1183 assert!(result.pi > 0.0);
1184 assert_eq!(result.segregating_sites, 2);
1185 assert!(result.theta_w > 0.0);
1186 }
1187
1188 #[test]
1189 fn diversity_known_theta_w() {
1190 let result = diversity_from_counts(10, 20, 5.0).unwrap();
1192 let h19 = harmonic(19);
1193 assert!((result.theta_w - 10.0 / h19).abs() < TOL);
1194 }
1195
1196 #[test]
1197 fn diversity_zero_segregating() {
1198 let l1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0)];
1199 let matrix: Vec<&[Option<u8>]> = vec![l1.as_slice()];
1200 let result = diversity(&matrix, 6).unwrap();
1201 assert_eq!(result.segregating_sites, 0);
1202 assert!((result.pi - 0.0).abs() < TOL);
1203 }
1204
1205 #[test]
1206 fn diversity_error_n_lt_2() {
1207 assert!(diversity_from_counts(5, 1, 2.0).is_err());
1208 }
1209
1210 #[test]
1213 fn tajimas_d_neutral() {
1214 let n = 20;
1216 let s = 10;
1217 let a1 = harmonic(n - 1);
1218 let theta_w = s as f64 / a1;
1219 let result = tajimas_d(s, n, theta_w).unwrap();
1220 assert!(result.d.abs() < 0.01, "d={}", result.d);
1221 }
1222
1223 #[test]
1224 fn tajimas_d_positive() {
1225 let result = tajimas_d(5, 20, 10.0).unwrap();
1227 assert!(result.d > 0.0, "d={}", result.d);
1228 }
1229
1230 #[test]
1231 fn tajimas_d_negative() {
1232 let result = tajimas_d(50, 20, 1.0).unwrap();
1234 assert!(result.d < 0.0, "d={}", result.d);
1235 }
1236
1237 #[test]
1238 fn tajimas_d_textbook() {
1239 let result = tajimas_d(12, 10, 3.5).unwrap();
1241 assert!(result.d < 0.0, "d={}", result.d);
1243 assert_eq!(result.segregating_sites, 12);
1244 assert_eq!(result.n_sequences, 10);
1245 }
1246
1247 #[test]
1248 fn tajimas_d_n_lt_4_error() {
1249 assert!(tajimas_d(5, 3, 2.0).is_err());
1250 }
1251
1252 #[test]
1253 fn tajimas_d_zero_s_error() {
1254 assert!(tajimas_d(0, 20, 0.0).is_err());
1255 }
1256
1257 #[test]
1260 fn ld_perfect() {
1261 let site_a: Vec<Option<u8>> = vec![Some(0), Some(0), Some(2), Some(2)];
1263 let site_b: Vec<Option<u8>> = vec![Some(0), Some(0), Some(2), Some(2)];
1264 let result = linkage_disequilibrium(&site_a, &site_b).unwrap();
1265 assert!(result.r_squared > 0.9, "r²={}", result.r_squared);
1266 assert!(result.d_prime > 0.9, "D'={}", result.d_prime);
1267 }
1268
1269 #[test]
1270 fn ld_no_ld() {
1271 let mut site_a = Vec::new();
1274 let mut site_b = Vec::new();
1275 for _ in 0..25 {
1277 site_a.push(Some(0));
1278 site_b.push(Some(0));
1279 }
1280 for _ in 0..25 {
1281 site_a.push(Some(0));
1282 site_b.push(Some(2));
1283 }
1284 for _ in 0..25 {
1285 site_a.push(Some(2));
1286 site_b.push(Some(0));
1287 }
1288 for _ in 0..25 {
1289 site_a.push(Some(2));
1290 site_b.push(Some(2));
1291 }
1292 let result = linkage_disequilibrium(&site_a, &site_b).unwrap();
1293 assert!(result.r_squared < 0.05, "r²={}", result.r_squared);
1294 }
1295
1296 #[test]
1297 fn ld_intermediate() {
1298 let site_a: Vec<Option<u8>> =
1300 vec![Some(0), Some(0), Some(0), Some(2), Some(2), Some(1)];
1301 let site_b: Vec<Option<u8>> =
1302 vec![Some(0), Some(0), Some(2), Some(2), Some(2), Some(1)];
1303 let result = linkage_disequilibrium(&site_a, &site_b).unwrap();
1304 assert!(result.r_squared >= 0.0 && result.r_squared <= 1.0);
1305 assert!(result.d_prime >= 0.0 && result.d_prime <= 1.0);
1306 }
1307
1308 #[test]
1309 fn ld_d_prime_vs_r_squared() {
1310 let site_a: Vec<Option<u8>> =
1312 vec![Some(0), Some(0), Some(0), Some(0), Some(1), Some(1), Some(2)];
1313 let site_b: Vec<Option<u8>> =
1314 vec![Some(0), Some(0), Some(0), Some(1), Some(1), Some(2), Some(2)];
1315 let result = linkage_disequilibrium(&site_a, &site_b).unwrap();
1316 assert!(result.r_squared >= 0.0 && result.r_squared <= 1.0);
1318 assert!(result.d_prime >= 0.0 && result.d_prime <= 1.0);
1319 }
1320
1321 #[test]
1322 fn ld_with_missing() {
1323 let site_a: Vec<Option<u8>> = vec![Some(0), Some(0), None, Some(2), Some(2)];
1324 let site_b: Vec<Option<u8>> = vec![Some(0), None, Some(0), Some(2), Some(2)];
1325 let result = linkage_disequilibrium(&site_a, &site_b).unwrap();
1326 assert!(result.n_haplotypes > 0);
1327 assert!(result.r_squared >= 0.0);
1328 }
1329
1330 #[test]
1331 fn ld_monomorphic_error() {
1332 let site_a: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0)];
1333 let site_b: Vec<Option<u8>> = vec![Some(0), Some(1), Some(2)];
1334 assert!(linkage_disequilibrium(&site_a, &site_b).is_err());
1335 }
1336
1337 #[test]
1338 fn ld_length_mismatch() {
1339 let site_a: Vec<Option<u8>> = vec![Some(0), Some(1)];
1340 let site_b: Vec<Option<u8>> = vec![Some(0), Some(1), Some(2)];
1341 assert!(linkage_disequilibrium(&site_a, &site_b).is_err());
1342 }
1343
1344 #[test]
1347 fn pca_basic() {
1348 let s1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(1), Some(2)];
1349 let s2: Vec<Option<u8>> = vec![Some(0), Some(1), Some(1), Some(2)];
1350 let s3: Vec<Option<u8>> = vec![Some(2), Some(2), Some(1), Some(0)];
1351 let s4: Vec<Option<u8>> = vec![Some(2), Some(1), Some(0), Some(0)];
1352 let matrix: Vec<&[Option<u8>]> =
1353 vec![s1.as_slice(), s2.as_slice(), s3.as_slice(), s4.as_slice()];
1354 let result = genotype_pca(&matrix, 2).unwrap();
1355 assert_eq!(result.components.len(), 2);
1356 assert_eq!(result.transformed.len(), 4);
1357 assert_eq!(result.transformed[0].len(), 2);
1358 }
1359
1360 #[test]
1361 fn pca_missing_imputation() {
1362 let s1: Vec<Option<u8>> = vec![Some(0), None, Some(1)];
1364 let s2: Vec<Option<u8>> = vec![Some(1), Some(1), Some(2)];
1365 let s3: Vec<Option<u8>> = vec![Some(2), Some(2), None];
1366 let matrix: Vec<&[Option<u8>]> = vec![s1.as_slice(), s2.as_slice(), s3.as_slice()];
1367 let result = genotype_pca(&matrix, 2).unwrap();
1368 assert_eq!(result.transformed.len(), 3);
1369 }
1370
1371 #[test]
1372 fn pca_variance() {
1373 let s1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(2), Some(2)];
1375 let s2: Vec<Option<u8>> = vec![Some(0), Some(1), Some(1), Some(2)];
1376 let s3: Vec<Option<u8>> = vec![Some(2), Some(2), Some(0), Some(0)];
1377 let s4: Vec<Option<u8>> = vec![Some(1), Some(1), Some(1), Some(1)];
1378 let s5: Vec<Option<u8>> = vec![Some(0), Some(2), Some(0), Some(2)];
1379 let matrix: Vec<&[Option<u8>]> = vec![
1380 s1.as_slice(),
1381 s2.as_slice(),
1382 s3.as_slice(),
1383 s4.as_slice(),
1384 s5.as_slice(),
1385 ];
1386 let result = genotype_pca(&matrix, 3).unwrap();
1387 for r in &result.explained_variance_ratio {
1388 assert!(*r >= 0.0);
1389 }
1390 let total: f64 = result.explained_variance_ratio.iter().sum();
1391 assert!(total <= 1.0 + 0.01);
1392 }
1393
1394 #[test]
1395 fn pca_population_structure() {
1396 let mut matrix: Vec<Vec<Option<u8>>> = Vec::new();
1398 for _ in 0..10 {
1400 matrix.push(vec![Some(0), Some(0), Some(0), Some(0), Some(1)]);
1401 }
1402 for _ in 0..10 {
1404 matrix.push(vec![Some(2), Some(2), Some(2), Some(2), Some(1)]);
1405 }
1406 let refs: Vec<&[Option<u8>]> = matrix.iter().map(|v| v.as_slice()).collect();
1407 let result = genotype_pca(&refs, 2).unwrap();
1408 assert!(result.explained_variance_ratio[0] > 0.5);
1410 let mean_a: f64 = result.transformed[..10].iter().map(|t| t[0]).sum::<f64>() / 10.0;
1412 let mean_b: f64 = result.transformed[10..].iter().map(|t| t[0]).sum::<f64>() / 10.0;
1413 assert!((mean_a - mean_b).abs() > 0.1);
1414 }
1415
1416 #[test]
1417 fn pca_empty_error() {
1418 let matrix: Vec<&[Option<u8>]> = vec![];
1419 assert!(genotype_pca(&matrix, 1).is_err());
1420 }
1421
1422 #[test]
1425 fn u8_convenience() {
1426 let g: Vec<u8> = vec![0, 1, 2, 255, 0];
1427 let af = allele_frequencies_u8(&g).unwrap();
1428 assert_eq!(af.missing_count, 1);
1429 assert_eq!(af.allele_count, 8);
1430 }
1431
1432 #[test]
1433 fn all_missing_error() {
1434 let g = [None, None, None];
1435 assert!(allele_frequencies(&g).is_err());
1436 }
1437
1438 #[test]
1439 fn af_consistency() {
1440 let g = [Some(0), Some(0), Some(1), Some(2), Some(2), Some(2)];
1442 let af = allele_frequencies(&g).unwrap();
1443 assert!((af.freq_ref + af.freq_alt - 1.0).abs() < TOL);
1444 }
1445}