#![allow(non_snake_case)]
use cyanea_core::{CyaneaError, Result};
use crate::distribution::{ChiSquared, Distribution};
#[derive(Debug, Clone)]
pub struct AlleleFrequencies {
pub freq_ref: f64,
pub freq_alt: f64,
pub allele_count: usize,
pub missing_count: usize,
pub observed_het: f64,
pub expected_het: f64,
}
#[derive(Debug, Clone)]
pub struct HweResult {
pub chi_squared: f64,
pub p_value: f64,
pub observed: [usize; 3],
pub expected: [f64; 3],
pub inbreeding_coeff: f64,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FstMethod {
WeirCockerham,
Hudson,
}
#[derive(Debug, Clone)]
pub struct FstResult {
pub fst: f64,
pub method: FstMethod,
}
#[derive(Debug, Clone)]
pub struct DiversityStats {
pub pi: f64,
pub theta_w: f64,
pub segregating_sites: usize,
pub n_sequences: usize,
}
#[derive(Debug, Clone)]
pub struct TajimaD {
pub d: f64,
pub pi: f64,
pub theta_w: f64,
pub segregating_sites: usize,
pub n_sequences: usize,
}
#[derive(Debug, Clone)]
pub struct LdResult {
pub r_squared: f64,
pub d_prime: f64,
pub d: f64,
pub n_haplotypes: usize,
}
fn count_genotypes(genotypes: &[Option<u8>]) -> Result<(usize, usize, usize, usize)> {
let mut n0 = 0usize;
let mut n1 = 0usize;
let mut n2 = 0usize;
let mut n_miss = 0usize;
for (i, g) in genotypes.iter().enumerate() {
match g {
Some(0) => n0 += 1,
Some(1) => n1 += 1,
Some(2) => n2 += 1,
None => n_miss += 1,
Some(v) => {
return Err(CyaneaError::InvalidInput(format!(
"invalid genotype {} at index {} (expected 0, 1, or 2)",
v, i
)));
}
}
}
Ok((n0, n1, n2, n_miss))
}
fn allele_freq_from_counts(n0: usize, n1: usize, n2: usize) -> (f64, f64) {
let total = 2 * (n0 + n1 + n2);
if total == 0 {
return (0.0, 0.0);
}
let p = (2 * n0 + n1) as f64 / total as f64;
let q = (2 * n2 + n1) as f64 / total as f64;
(p, q)
}
fn harmonic(n: usize) -> f64 {
(1..=n).map(|i| 1.0 / i as f64).sum()
}
fn harmonic_sq(n: usize) -> f64 {
(1..=n).map(|i| 1.0 / (i as f64 * i as f64)).sum()
}
fn em_haplotype_freqs(
site_a: &[Option<u8>],
site_b: &[Option<u8>],
max_iter: usize,
tol: f64,
) -> Result<(f64, f64, f64, f64)> {
let mut counts = [[0usize; 3]; 3]; let mut n = 0usize;
for (ga, gb) in site_a.iter().zip(site_b.iter()) {
if let (Some(a), Some(b)) = (ga, gb) {
if *a > 2 || *b > 2 {
return Err(CyaneaError::InvalidInput(
"invalid genotype value (expected 0, 1, or 2)".into(),
));
}
counts[*a as usize][*b as usize] += 1;
n += 1;
}
}
if n == 0 {
return Err(CyaneaError::InvalidInput(
"no non-missing genotype pairs".into(),
));
}
let mut a_alt = 0usize;
let mut b_alt = 0usize;
for ga in 0..3 {
for gb in 0..3 {
a_alt += ga * counts[ga][gb];
b_alt += gb * counts[ga][gb];
}
}
let pa = a_alt as f64 / (2 * n) as f64; let pb = b_alt as f64 / (2 * n) as f64;
let mut p_ab = pa * pb; let mut p_aB = pa * (1.0 - pb);
let mut p_Ab = (1.0 - pa) * pb;
let mut p_AB = (1.0 - pa) * (1.0 - pb);
for _ in 0..max_iter {
let mut h_AB = 0.0;
let mut h_Ab = 0.0;
let mut h_aB = 0.0;
let mut h_ab = 0.0;
for ga in 0..3usize {
for gb in 0..3usize {
let c = counts[ga][gb] as f64;
if c == 0.0 {
continue;
}
match (ga, gb) {
(0, 0) => h_AB += 2.0 * c, (0, 1) => {
h_AB += c;
h_Ab += c;
} (0, 2) => h_Ab += 2.0 * c, (1, 0) => {
h_AB += c;
h_aB += c;
} (1, 1) => {
let denom = p_AB * p_ab + p_Ab * p_aB;
if denom > 0.0 {
let w = p_AB * p_ab / denom;
h_AB += c * w;
h_ab += c * w;
h_Ab += c * (1.0 - w);
h_aB += c * (1.0 - w);
} else {
h_AB += 0.5 * c;
h_ab += 0.5 * c;
h_Ab += 0.5 * c;
h_aB += 0.5 * c;
}
}
(1, 2) => {
h_Ab += c;
h_ab += c;
} (2, 0) => h_aB += 2.0 * c, (2, 1) => {
h_aB += c;
h_ab += c;
} (2, 2) => h_ab += 2.0 * c, _ => {}
}
}
}
let total = h_AB + h_Ab + h_aB + h_ab;
let new_AB = h_AB / total;
let new_Ab = h_Ab / total;
let new_aB = h_aB / total;
let new_ab = h_ab / total;
let diff = (new_AB - p_AB).abs()
+ (new_Ab - p_Ab).abs()
+ (new_aB - p_aB).abs()
+ (new_ab - p_ab).abs();
p_AB = new_AB;
p_Ab = new_Ab;
p_aB = new_aB;
p_ab = new_ab;
if diff < tol {
break;
}
}
Ok((p_AB, p_Ab, p_aB, p_ab))
}
pub fn allele_frequencies(genotypes: &[Option<u8>]) -> Result<AlleleFrequencies> {
if genotypes.is_empty() {
return Err(CyaneaError::InvalidInput("empty genotype array".into()));
}
let (n0, n1, n2, n_miss) = count_genotypes(genotypes)?;
let n_called = n0 + n1 + n2;
if n_called == 0 {
return Err(CyaneaError::InvalidInput(
"all genotypes are missing".into(),
));
}
let (p, q) = allele_freq_from_counts(n0, n1, n2);
let obs_het = n1 as f64 / n_called as f64;
let exp_het = 2.0 * p * q;
Ok(AlleleFrequencies {
freq_ref: p,
freq_alt: q,
allele_count: 2 * n_called,
missing_count: n_miss,
observed_het: obs_het,
expected_het: exp_het,
})
}
pub fn allele_frequencies_u8(genotypes: &[u8]) -> Result<AlleleFrequencies> {
let opts: Vec<Option<u8>> = genotypes
.iter()
.map(|&g| if g == 255 { None } else { Some(g) })
.collect();
allele_frequencies(&opts)
}
pub fn hwe_test(genotypes: &[Option<u8>]) -> Result<HweResult> {
if genotypes.is_empty() {
return Err(CyaneaError::InvalidInput("empty genotype array".into()));
}
let (n0, n1, n2, _) = count_genotypes(genotypes)?;
let n = n0 + n1 + n2;
if n == 0 {
return Err(CyaneaError::InvalidInput(
"all genotypes are missing".into(),
));
}
let (p, q) = allele_freq_from_counts(n0, n1, n2);
if p == 0.0 || q == 0.0 {
return Err(CyaneaError::InvalidInput(
"monomorphic site: HWE test requires polymorphism".into(),
));
}
let n_f = n as f64;
let exp = [p * p * n_f, 2.0 * p * q * n_f, q * q * n_f];
let obs = [n0, n1, n2];
let chi_sq: f64 = obs
.iter()
.zip(exp.iter())
.map(|(&o, &e)| {
if e > 0.0 {
(o as f64 - e).powi(2) / e
} else {
0.0
}
})
.sum();
let chi2_dist = ChiSquared::new(1.0)?;
let p_value = 1.0 - chi2_dist.cdf(chi_sq);
let obs_het = n1 as f64 / n_f;
let exp_het = 2.0 * p * q;
let f = if exp_het > 0.0 {
1.0 - obs_het / exp_het
} else {
0.0
};
Ok(HweResult {
chi_squared: chi_sq,
p_value,
observed: obs,
expected: exp,
inbreeding_coeff: f,
})
}
pub fn fst_weir_cockerham(
pop1: &[&[Option<u8>]],
pop2: &[&[Option<u8>]],
) -> Result<FstResult> {
if pop1.is_empty() || pop2.is_empty() {
return Err(CyaneaError::InvalidInput("empty loci or populations".into()));
}
let mut sum_a = 0.0;
let mut sum_abc = 0.0;
for (locus1, locus2) in pop1.iter().zip(pop2.iter()) {
let (n0_1, n1_1, n2_1, _) = count_genotypes(locus1)?;
let (n0_2, n1_2, n2_2, _) = count_genotypes(locus2)?;
let ni1 = (n0_1 + n1_1 + n2_1) as f64;
let ni2 = (n0_2 + n1_2 + n2_2) as f64;
if ni1 == 0.0 || ni2 == 0.0 {
continue;
}
let r = 2.0; let n_bar = (ni1 + ni2) / r;
let n_total = ni1 + ni2;
let (_, q1) = allele_freq_from_counts(n0_1, n1_1, n2_1);
let (_, q2) = allele_freq_from_counts(n0_2, n1_2, n2_2);
let p_bar = (ni1 * q1 + ni2 * q2) / n_total;
let s_sq = (ni1 * (q1 - p_bar).powi(2) + ni2 * (q2 - p_bar).powi(2))
/ ((r - 1.0) * n_bar);
let h_bar = (ni1 * n1_1 as f64 / ni1 + ni2 * n1_2 as f64 / ni2) / n_total;
let nc = n_total - (ni1 * ni1 + ni2 * ni2) / n_total;
let nc = nc / (r - 1.0);
let a = (n_bar / nc)
* (s_sq - (1.0 / (n_bar - 1.0)) * (p_bar * (1.0 - p_bar) - (r - 1.0) / r * s_sq - 0.25 * h_bar));
let b = (n_bar / (n_bar - 1.0))
* (p_bar * (1.0 - p_bar) - (r - 1.0) / r * s_sq
- (2.0 * n_bar - 1.0) / (4.0 * n_bar) * h_bar);
let c = 0.5 * h_bar;
sum_a += a;
sum_abc += a + b + c;
}
let fst = if sum_abc > 0.0 {
(sum_a / sum_abc).clamp(0.0, 1.0)
} else {
0.0
};
Ok(FstResult {
fst,
method: FstMethod::WeirCockerham,
})
}
pub fn fst_hudson(
pop1: &[&[Option<u8>]],
pop2: &[&[Option<u8>]],
) -> Result<FstResult> {
if pop1.is_empty() || pop2.is_empty() {
return Err(CyaneaError::InvalidInput("empty loci or populations".into()));
}
let mut sum_num = 0.0;
let mut sum_den = 0.0;
for (locus1, locus2) in pop1.iter().zip(pop2.iter()) {
let (n0_1, n1_1, n2_1, _) = count_genotypes(locus1)?;
let (n0_2, n1_2, n2_2, _) = count_genotypes(locus2)?;
let n1 = (n0_1 + n1_1 + n2_1) as f64;
let n2_pop = (n0_2 + n1_2 + n2_2) as f64;
if n1 == 0.0 || n2_pop == 0.0 {
continue;
}
let (_, q1) = allele_freq_from_counts(n0_1, n1_1, n2_1);
let (_, q2) = allele_freq_from_counts(n0_2, n1_2, n2_2);
let hw1 = 2.0 * q1 * (1.0 - q1) * n1 / (n1 - 1.0);
let hw2 = 2.0 * q2 * (1.0 - q2) * n2_pop / (n2_pop - 1.0);
let hw = (hw1 + hw2) / 2.0;
let hb = q1 * (1.0 - q2) + q2 * (1.0 - q1);
sum_num += hb - hw;
sum_den += hb;
}
let fst = if sum_den > 0.0 {
(sum_num / sum_den).clamp(0.0, 1.0)
} else {
0.0
};
Ok(FstResult {
fst,
method: FstMethod::Hudson,
})
}
pub fn fst_multi_population(populations: &[&[&[Option<u8>]]]) -> Result<FstResult> {
let r = populations.len();
if r < 2 {
return Err(CyaneaError::InvalidInput(
"need at least 2 populations for Fst".into(),
));
}
let n_loci = populations[0].len();
if n_loci == 0 {
return Err(CyaneaError::InvalidInput("no loci provided".into()));
}
for pop in populations.iter() {
if pop.len() != n_loci {
return Err(CyaneaError::InvalidInput(
"all populations must have the same number of loci".into(),
));
}
}
let mut sum_a = 0.0;
let mut sum_abc = 0.0;
let r_f = r as f64;
for locus_idx in 0..n_loci {
let mut ni = Vec::with_capacity(r);
let mut qi = Vec::with_capacity(r);
let mut hi = Vec::with_capacity(r);
for pop in populations.iter() {
let (n0, n1, n2, _) = count_genotypes(pop[locus_idx])?;
let n = (n0 + n1 + n2) as f64;
if n == 0.0 {
ni.push(0.0);
qi.push(0.0);
hi.push(0.0);
continue;
}
let (_, q) = allele_freq_from_counts(n0, n1, n2);
ni.push(n);
qi.push(q);
hi.push(n1 as f64 / n);
}
let n_total: f64 = ni.iter().sum();
if n_total == 0.0 {
continue;
}
let n_bar = n_total / r_f;
let p_bar: f64 = ni.iter().zip(qi.iter()).map(|(n, q)| n * q).sum::<f64>() / n_total;
let s_sq: f64 = ni
.iter()
.zip(qi.iter())
.map(|(n, q)| n * (q - p_bar).powi(2))
.sum::<f64>()
/ ((r_f - 1.0) * n_bar);
let h_bar: f64 = ni.iter().zip(hi.iter()).map(|(n, h)| n * h).sum::<f64>() / n_total;
let nc = (n_total - ni.iter().map(|n| n * n).sum::<f64>() / n_total) / (r_f - 1.0);
let a = (n_bar / nc)
* (s_sq
- (1.0 / (n_bar - 1.0))
* (p_bar * (1.0 - p_bar) - (r_f - 1.0) / r_f * s_sq - 0.25 * h_bar));
let b = (n_bar / (n_bar - 1.0))
* (p_bar * (1.0 - p_bar) - (r_f - 1.0) / r_f * s_sq
- (2.0 * n_bar - 1.0) / (4.0 * n_bar) * h_bar);
let c = 0.5 * h_bar;
sum_a += a;
sum_abc += a + b + c;
}
let fst = if sum_abc > 0.0 {
(sum_a / sum_abc).clamp(0.0, 1.0)
} else {
0.0
};
Ok(FstResult {
fst,
method: FstMethod::WeirCockerham,
})
}
pub fn diversity(
genotype_matrix: &[&[Option<u8>]],
n_sequences: usize,
) -> Result<DiversityStats> {
if n_sequences < 2 {
return Err(CyaneaError::InvalidInput(
"need at least 2 sequences for diversity".into(),
));
}
if genotype_matrix.is_empty() {
return Err(CyaneaError::InvalidInput("no loci provided".into()));
}
let mut seg_sites = 0usize;
let mut total_pi = 0.0;
for locus in genotype_matrix {
let (n0, n1, n2, _) = count_genotypes(locus)?;
let n_called = n0 + n1 + n2;
if n_called == 0 {
continue;
}
let (_, q) = allele_freq_from_counts(n0, n1, n2);
if q > 0.0 && q < 1.0 {
seg_sites += 1;
}
let n_hap = (2 * n_called) as f64;
let pi_site = 2.0 * q * (1.0 - q) * n_hap / (n_hap - 1.0);
total_pi += pi_site;
}
let n_loci = genotype_matrix.len() as f64;
let pi = total_pi / n_loci;
let a1 = harmonic(n_sequences - 1);
let theta_w = if a1 > 0.0 {
(seg_sites as f64 / n_loci) / a1
} else {
0.0
};
Ok(DiversityStats {
pi,
theta_w,
segregating_sites: seg_sites,
n_sequences,
})
}
pub fn diversity_from_counts(
segregating_sites: usize,
n_sequences: usize,
avg_pairwise_diff: f64,
) -> Result<DiversityStats> {
if n_sequences < 2 {
return Err(CyaneaError::InvalidInput(
"need at least 2 sequences for diversity".into(),
));
}
let a1 = harmonic(n_sequences - 1);
let theta_w = if a1 > 0.0 {
segregating_sites as f64 / a1
} else {
0.0
};
Ok(DiversityStats {
pi: avg_pairwise_diff,
theta_w,
segregating_sites,
n_sequences,
})
}
pub fn tajimas_d(
segregating_sites: usize,
n_sequences: usize,
avg_pairwise_diff: f64,
) -> Result<TajimaD> {
if n_sequences < 4 {
return Err(CyaneaError::InvalidInput(
"Tajima's D requires at least 4 sequences".into(),
));
}
if segregating_sites == 0 {
return Err(CyaneaError::InvalidInput(
"Tajima's D undefined with 0 segregating sites".into(),
));
}
let n = n_sequences as f64;
let s = segregating_sites as f64;
let a1 = harmonic(n_sequences - 1);
let a2 = harmonic_sq(n_sequences - 1);
let theta_w = s / a1;
let b1 = (n + 1.0) / (3.0 * (n - 1.0));
let b2 = 2.0 * (n * n + n + 3.0) / (9.0 * n * (n - 1.0));
let c1 = b1 - 1.0 / a1;
let c2 = b2 - (n + 2.0) / (a1 * n) + a2 / (a1 * a1);
let e1 = c1 / a1;
let e2 = c2 / (a1 * a1 + a2);
let var_d = e1 * s + e2 * s * (s - 1.0);
let d = if var_d > 0.0 {
(avg_pairwise_diff - theta_w) / var_d.sqrt()
} else {
0.0
};
Ok(TajimaD {
d,
pi: avg_pairwise_diff,
theta_w,
segregating_sites,
n_sequences,
})
}
pub fn linkage_disequilibrium(
site_a: &[Option<u8>],
site_b: &[Option<u8>],
) -> Result<LdResult> {
if site_a.len() != site_b.len() {
return Err(CyaneaError::InvalidInput(
"site vectors must have the same length".into(),
));
}
if site_a.is_empty() {
return Err(CyaneaError::InvalidInput("empty site vectors".into()));
}
let mut n_pairs = 0usize;
for (a, b) in site_a.iter().zip(site_b.iter()) {
if a.is_some() && b.is_some() {
n_pairs += 1;
}
}
if n_pairs == 0 {
return Err(CyaneaError::InvalidInput(
"no non-missing genotype pairs".into(),
));
}
let af_a = allele_frequencies(site_a)?;
let af_b = allele_frequencies(site_b)?;
if af_a.freq_alt == 0.0 || af_a.freq_alt == 1.0 {
return Err(CyaneaError::InvalidInput(
"site A is monomorphic: LD undefined".into(),
));
}
if af_b.freq_alt == 0.0 || af_b.freq_alt == 1.0 {
return Err(CyaneaError::InvalidInput(
"site B is monomorphic: LD undefined".into(),
));
}
let (p_AB, _p_Ab, _p_aB, _p_ab) = em_haplotype_freqs(site_a, site_b, 1000, 1e-10)?;
let p_a = af_a.freq_alt; let p_A = af_a.freq_ref;
let p_b = af_b.freq_alt; let p_B = af_b.freq_ref;
let d = p_AB - p_A * p_B;
let d_max = if d >= 0.0 {
(p_A * p_b).min(p_a * p_B)
} else {
(p_A * p_B).min(p_a * p_b)
};
let d_prime = if d_max.abs() > 0.0 {
(d / d_max).abs().min(1.0)
} else {
0.0
};
let denom = p_A * p_a * p_B * p_b;
let r_squared = if denom > 0.0 {
(d * d / denom).min(1.0)
} else {
0.0
};
Ok(LdResult {
r_squared,
d_prime,
d,
n_haplotypes: 2 * n_pairs,
})
}
pub fn genotype_pca(
genotype_matrix: &[&[Option<u8>]],
n_components: usize,
) -> Result<crate::reduction::PcaResult> {
if genotype_matrix.is_empty() {
return Err(CyaneaError::InvalidInput(
"empty genotype matrix".into(),
));
}
let n_features = genotype_matrix[0].len();
if n_features == 0 {
return Err(CyaneaError::InvalidInput("zero loci".into()));
}
for (i, row) in genotype_matrix.iter().enumerate() {
if row.len() != n_features {
return Err(CyaneaError::InvalidInput(format!(
"sample {} has {} loci, expected {}",
i,
row.len(),
n_features
)));
}
}
let n_samples = genotype_matrix.len();
let mut site_sum = vec![0.0; n_features];
let mut site_count = vec![0usize; n_features];
for row in genotype_matrix {
for (j, g) in row.iter().enumerate() {
if let Some(v) = g {
site_sum[j] += *v as f64;
site_count[j] += 1;
}
}
}
let site_mean: Vec<f64> = (0..n_features)
.map(|j| {
if site_count[j] > 0 {
site_sum[j] / site_count[j] as f64
} else {
0.0
}
})
.collect();
let mut data: Vec<Vec<f64>> = Vec::with_capacity(n_samples);
for row in genotype_matrix {
let imputed: Vec<f64> = row
.iter()
.enumerate()
.map(|(j, g)| match g {
Some(v) => *v as f64,
None => site_mean[j],
})
.collect();
data.push(imputed);
}
let refs: Vec<&[f64]> = data.iter().map(|v| v.as_slice()).collect();
crate::reduction::pca(&refs, n_components)
}
#[cfg(test)]
mod tests {
use super::*;
const TOL: f64 = 1e-6;
#[test]
fn af_basic() {
let g = [Some(0), Some(0), Some(1), Some(1), Some(2)];
let af = allele_frequencies(&g).unwrap();
assert!((af.freq_ref - 0.6).abs() < TOL);
assert!((af.freq_alt - 0.4).abs() < TOL);
assert_eq!(af.allele_count, 10);
assert_eq!(af.missing_count, 0);
assert!((af.observed_het - 0.4).abs() < TOL);
}
#[test]
fn af_all_hom_ref() {
let g = [Some(0), Some(0), Some(0)];
let af = allele_frequencies(&g).unwrap();
assert!((af.freq_ref - 1.0).abs() < TOL);
assert!((af.freq_alt - 0.0).abs() < TOL);
assert!((af.observed_het - 0.0).abs() < TOL);
}
#[test]
fn af_all_het() {
let g = [Some(1), Some(1), Some(1), Some(1)];
let af = allele_frequencies(&g).unwrap();
assert!((af.freq_ref - 0.5).abs() < TOL);
assert!((af.freq_alt - 0.5).abs() < TOL);
assert!((af.observed_het - 1.0).abs() < TOL);
}
#[test]
fn af_with_missing() {
let g = [Some(0), None, Some(1), Some(2), None];
let af = allele_frequencies(&g).unwrap();
assert_eq!(af.missing_count, 2);
assert_eq!(af.allele_count, 6);
}
#[test]
fn af_empty_error() {
let g: [Option<u8>; 0] = [];
assert!(allele_frequencies(&g).is_err());
}
#[test]
fn af_invalid_genotype() {
let g = [Some(0), Some(3), Some(1)];
assert!(allele_frequencies(&g).is_err());
}
#[test]
fn hwe_perfect_equilibrium() {
let mut g = Vec::new();
for _ in 0..25 {
g.push(Some(0));
}
for _ in 0..50 {
g.push(Some(1));
}
for _ in 0..25 {
g.push(Some(2));
}
let result = hwe_test(&g).unwrap();
assert!(result.chi_squared < 0.01, "chi_sq={}", result.chi_squared);
assert!(result.p_value > 0.9);
assert!(result.inbreeding_coeff.abs() < 0.01);
}
#[test]
fn hwe_excess_heterozygosity() {
let g: Vec<Option<u8>> = vec![Some(1); 100];
let result = hwe_test(&g).unwrap();
assert!(result.chi_squared > 10.0);
assert!(result.p_value < 0.01);
assert!(result.inbreeding_coeff < -0.5);
}
#[test]
fn hwe_deficit_het() {
let mut g = Vec::new();
for _ in 0..50 {
g.push(Some(0));
}
for _ in 0..50 {
g.push(Some(2));
}
let result = hwe_test(&g).unwrap();
assert!(result.chi_squared > 50.0);
assert!(result.inbreeding_coeff > 0.9);
}
#[test]
fn hwe_textbook() {
let mut g = Vec::new();
for _ in 0..298 {
g.push(Some(0));
}
for _ in 0..489 {
g.push(Some(1));
}
for _ in 0..213 {
g.push(Some(2));
}
let result = hwe_test(&g).unwrap();
assert!(result.chi_squared < 10.0);
}
#[test]
fn hwe_monomorphic_error() {
let g = [Some(0), Some(0), Some(0)];
assert!(hwe_test(&g).is_err());
}
#[test]
fn hwe_with_missing() {
let mut g: Vec<Option<u8>> = Vec::new();
for _ in 0..20 {
g.push(Some(0));
}
for _ in 0..40 {
g.push(Some(1));
}
for _ in 0..20 {
g.push(Some(2));
}
for _ in 0..10 {
g.push(None);
}
let result = hwe_test(&g).unwrap();
assert!(result.p_value > 0.5);
}
#[test]
fn fst_identical_populations() {
let locus: Vec<Option<u8>> = vec![Some(0), Some(1), Some(1), Some(2)];
let pop1: Vec<&[Option<u8>]> = vec![locus.as_slice()];
let pop2: Vec<&[Option<u8>]> = vec![locus.as_slice()];
let result = fst_weir_cockerham(&pop1, &pop2).unwrap();
assert!(result.fst < 0.05, "fst={}", result.fst);
}
#[test]
fn fst_fully_differentiated() {
let pop1_locus: Vec<Option<u8>> = vec![Some(0); 50];
let pop2_locus: Vec<Option<u8>> = vec![Some(2); 50];
let pop1: Vec<&[Option<u8>]> = vec![pop1_locus.as_slice()];
let pop2: Vec<&[Option<u8>]> = vec![pop2_locus.as_slice()];
let result = fst_weir_cockerham(&pop1, &pop2).unwrap();
assert!(result.fst > 0.8, "fst={}", result.fst);
}
#[test]
fn fst_wc_known() {
let pop1_locus: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0), Some(1), Some(1)];
let pop2_locus: Vec<Option<u8>> = vec![Some(1), Some(1), Some(2), Some(2), Some(2)];
let pop1: Vec<&[Option<u8>]> = vec![pop1_locus.as_slice()];
let pop2: Vec<&[Option<u8>]> = vec![pop2_locus.as_slice()];
let result = fst_weir_cockerham(&pop1, &pop2).unwrap();
assert!(result.fst > 0.1 && result.fst < 0.9, "fst={}", result.fst);
assert_eq!(result.method, FstMethod::WeirCockerham);
}
#[test]
fn fst_hudson_known() {
let pop1_locus: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0), Some(1), Some(1)];
let pop2_locus: Vec<Option<u8>> = vec![Some(1), Some(1), Some(2), Some(2), Some(2)];
let pop1: Vec<&[Option<u8>]> = vec![pop1_locus.as_slice()];
let pop2: Vec<&[Option<u8>]> = vec![pop2_locus.as_slice()];
let result = fst_hudson(&pop1, &pop2).unwrap();
assert!(result.fst > 0.1 && result.fst < 0.9, "fst={}", result.fst);
assert_eq!(result.method, FstMethod::Hudson);
}
#[test]
fn fst_multi_locus() {
let p1_l1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(1), Some(1)];
let p1_l2: Vec<Option<u8>> = vec![Some(0), Some(1), Some(1), Some(2)];
let p2_l1: Vec<Option<u8>> = vec![Some(1), Some(2), Some(2), Some(2)];
let p2_l2: Vec<Option<u8>> = vec![Some(1), Some(1), Some(2), Some(2)];
let pop1: Vec<&[Option<u8>]> = vec![p1_l1.as_slice(), p1_l2.as_slice()];
let pop2: Vec<&[Option<u8>]> = vec![p2_l1.as_slice(), p2_l2.as_slice()];
let result = fst_weir_cockerham(&pop1, &pop2).unwrap();
assert!(result.fst >= 0.0 && result.fst <= 1.0);
}
#[test]
fn fst_three_populations() {
let l1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(1)];
let l2: Vec<Option<u8>> = vec![Some(1), Some(2), Some(2)];
let l3: Vec<Option<u8>> = vec![Some(0), Some(1), Some(2)];
let pop1: Vec<&[Option<u8>]> = vec![l1.as_slice()];
let pop2: Vec<&[Option<u8>]> = vec![l2.as_slice()];
let pop3: Vec<&[Option<u8>]> = vec![l3.as_slice()];
let pops: Vec<&[&[Option<u8>]]> = vec![pop1.as_slice(), pop2.as_slice(), pop3.as_slice()];
let result = super::fst_multi_population(&pops).unwrap();
assert!(result.fst >= 0.0 && result.fst <= 1.0);
}
#[test]
fn fst_unequal_sizes() {
let pop1_locus: Vec<Option<u8>> = vec![Some(0), Some(1)];
let pop2_locus: Vec<Option<u8>> = vec![Some(1), Some(2), Some(2), Some(2), Some(2)];
let pop1: Vec<&[Option<u8>]> = vec![pop1_locus.as_slice()];
let pop2: Vec<&[Option<u8>]> = vec![pop2_locus.as_slice()];
let result = fst_hudson(&pop1, &pop2).unwrap();
assert!(result.fst >= 0.0 && result.fst <= 1.0);
}
#[test]
fn fst_error_empty() {
let pop1: Vec<&[Option<u8>]> = vec![];
let pop2: Vec<&[Option<u8>]> = vec![];
assert!(fst_weir_cockerham(&pop1, &pop2).is_err());
}
#[test]
fn diversity_known_pi() {
let l1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(1), Some(2)]; let l2: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0), Some(0)]; let l3: Vec<Option<u8>> = vec![Some(1), Some(1), Some(1), Some(1)]; let l4: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0), Some(0)]; let matrix: Vec<&[Option<u8>]> =
vec![l1.as_slice(), l2.as_slice(), l3.as_slice(), l4.as_slice()];
let result = diversity(&matrix, 8).unwrap();
assert!(result.pi > 0.0);
assert_eq!(result.segregating_sites, 2);
assert!(result.theta_w > 0.0);
}
#[test]
fn diversity_known_theta_w() {
let result = diversity_from_counts(10, 20, 5.0).unwrap();
let h19 = harmonic(19);
assert!((result.theta_w - 10.0 / h19).abs() < TOL);
}
#[test]
fn diversity_zero_segregating() {
let l1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0)];
let matrix: Vec<&[Option<u8>]> = vec![l1.as_slice()];
let result = diversity(&matrix, 6).unwrap();
assert_eq!(result.segregating_sites, 0);
assert!((result.pi - 0.0).abs() < TOL);
}
#[test]
fn diversity_error_n_lt_2() {
assert!(diversity_from_counts(5, 1, 2.0).is_err());
}
#[test]
fn tajimas_d_neutral() {
let n = 20;
let s = 10;
let a1 = harmonic(n - 1);
let theta_w = s as f64 / a1;
let result = tajimas_d(s, n, theta_w).unwrap();
assert!(result.d.abs() < 0.01, "d={}", result.d);
}
#[test]
fn tajimas_d_positive() {
let result = tajimas_d(5, 20, 10.0).unwrap();
assert!(result.d > 0.0, "d={}", result.d);
}
#[test]
fn tajimas_d_negative() {
let result = tajimas_d(50, 20, 1.0).unwrap();
assert!(result.d < 0.0, "d={}", result.d);
}
#[test]
fn tajimas_d_textbook() {
let result = tajimas_d(12, 10, 3.5).unwrap();
assert!(result.d < 0.0, "d={}", result.d);
assert_eq!(result.segregating_sites, 12);
assert_eq!(result.n_sequences, 10);
}
#[test]
fn tajimas_d_n_lt_4_error() {
assert!(tajimas_d(5, 3, 2.0).is_err());
}
#[test]
fn tajimas_d_zero_s_error() {
assert!(tajimas_d(0, 20, 0.0).is_err());
}
#[test]
fn ld_perfect() {
let site_a: Vec<Option<u8>> = vec![Some(0), Some(0), Some(2), Some(2)];
let site_b: Vec<Option<u8>> = vec![Some(0), Some(0), Some(2), Some(2)];
let result = linkage_disequilibrium(&site_a, &site_b).unwrap();
assert!(result.r_squared > 0.9, "r²={}", result.r_squared);
assert!(result.d_prime > 0.9, "D'={}", result.d_prime);
}
#[test]
fn ld_no_ld() {
let mut site_a = Vec::new();
let mut site_b = Vec::new();
for _ in 0..25 {
site_a.push(Some(0));
site_b.push(Some(0));
}
for _ in 0..25 {
site_a.push(Some(0));
site_b.push(Some(2));
}
for _ in 0..25 {
site_a.push(Some(2));
site_b.push(Some(0));
}
for _ in 0..25 {
site_a.push(Some(2));
site_b.push(Some(2));
}
let result = linkage_disequilibrium(&site_a, &site_b).unwrap();
assert!(result.r_squared < 0.05, "r²={}", result.r_squared);
}
#[test]
fn ld_intermediate() {
let site_a: Vec<Option<u8>> =
vec![Some(0), Some(0), Some(0), Some(2), Some(2), Some(1)];
let site_b: Vec<Option<u8>> =
vec![Some(0), Some(0), Some(2), Some(2), Some(2), Some(1)];
let result = linkage_disequilibrium(&site_a, &site_b).unwrap();
assert!(result.r_squared >= 0.0 && result.r_squared <= 1.0);
assert!(result.d_prime >= 0.0 && result.d_prime <= 1.0);
}
#[test]
fn ld_d_prime_vs_r_squared() {
let site_a: Vec<Option<u8>> =
vec![Some(0), Some(0), Some(0), Some(0), Some(1), Some(1), Some(2)];
let site_b: Vec<Option<u8>> =
vec![Some(0), Some(0), Some(0), Some(1), Some(1), Some(2), Some(2)];
let result = linkage_disequilibrium(&site_a, &site_b).unwrap();
assert!(result.r_squared >= 0.0 && result.r_squared <= 1.0);
assert!(result.d_prime >= 0.0 && result.d_prime <= 1.0);
}
#[test]
fn ld_with_missing() {
let site_a: Vec<Option<u8>> = vec![Some(0), Some(0), None, Some(2), Some(2)];
let site_b: Vec<Option<u8>> = vec![Some(0), None, Some(0), Some(2), Some(2)];
let result = linkage_disequilibrium(&site_a, &site_b).unwrap();
assert!(result.n_haplotypes > 0);
assert!(result.r_squared >= 0.0);
}
#[test]
fn ld_monomorphic_error() {
let site_a: Vec<Option<u8>> = vec![Some(0), Some(0), Some(0)];
let site_b: Vec<Option<u8>> = vec![Some(0), Some(1), Some(2)];
assert!(linkage_disequilibrium(&site_a, &site_b).is_err());
}
#[test]
fn ld_length_mismatch() {
let site_a: Vec<Option<u8>> = vec![Some(0), Some(1)];
let site_b: Vec<Option<u8>> = vec![Some(0), Some(1), Some(2)];
assert!(linkage_disequilibrium(&site_a, &site_b).is_err());
}
#[test]
fn pca_basic() {
let s1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(1), Some(2)];
let s2: Vec<Option<u8>> = vec![Some(0), Some(1), Some(1), Some(2)];
let s3: Vec<Option<u8>> = vec![Some(2), Some(2), Some(1), Some(0)];
let s4: Vec<Option<u8>> = vec![Some(2), Some(1), Some(0), Some(0)];
let matrix: Vec<&[Option<u8>]> =
vec![s1.as_slice(), s2.as_slice(), s3.as_slice(), s4.as_slice()];
let result = genotype_pca(&matrix, 2).unwrap();
assert_eq!(result.components.len(), 2);
assert_eq!(result.transformed.len(), 4);
assert_eq!(result.transformed[0].len(), 2);
}
#[test]
fn pca_missing_imputation() {
let s1: Vec<Option<u8>> = vec![Some(0), None, Some(1)];
let s2: Vec<Option<u8>> = vec![Some(1), Some(1), Some(2)];
let s3: Vec<Option<u8>> = vec![Some(2), Some(2), None];
let matrix: Vec<&[Option<u8>]> = vec![s1.as_slice(), s2.as_slice(), s3.as_slice()];
let result = genotype_pca(&matrix, 2).unwrap();
assert_eq!(result.transformed.len(), 3);
}
#[test]
fn pca_variance() {
let s1: Vec<Option<u8>> = vec![Some(0), Some(0), Some(2), Some(2)];
let s2: Vec<Option<u8>> = vec![Some(0), Some(1), Some(1), Some(2)];
let s3: Vec<Option<u8>> = vec![Some(2), Some(2), Some(0), Some(0)];
let s4: Vec<Option<u8>> = vec![Some(1), Some(1), Some(1), Some(1)];
let s5: Vec<Option<u8>> = vec![Some(0), Some(2), Some(0), Some(2)];
let matrix: Vec<&[Option<u8>]> = vec![
s1.as_slice(),
s2.as_slice(),
s3.as_slice(),
s4.as_slice(),
s5.as_slice(),
];
let result = genotype_pca(&matrix, 3).unwrap();
for r in &result.explained_variance_ratio {
assert!(*r >= 0.0);
}
let total: f64 = result.explained_variance_ratio.iter().sum();
assert!(total <= 1.0 + 0.01);
}
#[test]
fn pca_population_structure() {
let mut matrix: Vec<Vec<Option<u8>>> = Vec::new();
for _ in 0..10 {
matrix.push(vec![Some(0), Some(0), Some(0), Some(0), Some(1)]);
}
for _ in 0..10 {
matrix.push(vec![Some(2), Some(2), Some(2), Some(2), Some(1)]);
}
let refs: Vec<&[Option<u8>]> = matrix.iter().map(|v| v.as_slice()).collect();
let result = genotype_pca(&refs, 2).unwrap();
assert!(result.explained_variance_ratio[0] > 0.5);
let mean_a: f64 = result.transformed[..10].iter().map(|t| t[0]).sum::<f64>() / 10.0;
let mean_b: f64 = result.transformed[10..].iter().map(|t| t[0]).sum::<f64>() / 10.0;
assert!((mean_a - mean_b).abs() > 0.1);
}
#[test]
fn pca_empty_error() {
let matrix: Vec<&[Option<u8>]> = vec![];
assert!(genotype_pca(&matrix, 1).is_err());
}
#[test]
fn u8_convenience() {
let g: Vec<u8> = vec![0, 1, 2, 255, 0];
let af = allele_frequencies_u8(&g).unwrap();
assert_eq!(af.missing_count, 1);
assert_eq!(af.allele_count, 8);
}
#[test]
fn all_missing_error() {
let g = [None, None, None];
assert!(allele_frequencies(&g).is_err());
}
#[test]
fn af_consistency() {
let g = [Some(0), Some(0), Some(1), Some(2), Some(2), Some(2)];
let af = allele_frequencies(&g).unwrap();
assert!((af.freq_ref + af.freq_alt - 1.0).abs() < TOL);
}
}