use super::{VcfReader, VcfRecord};
use crate::core::GenomicRecordIterator;
use crate::error::Result;
use crate::stats::{CategoryCounter, RunningStats};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum VariantType {
Snp,
Insertion,
Deletion,
Mnp,
Complex,
Structural,
}
impl VariantType {
pub fn classify(reference: &str, alt: &str) -> Self {
let ref_len = reference.len();
let alt_len = alt.len();
if alt.starts_with('<') && alt.ends_with('>') {
return VariantType::Structural;
}
match (ref_len, alt_len) {
(1, 1) => VariantType::Snp,
(r, a) if r < a => VariantType::Insertion,
(r, a) if r > a => VariantType::Deletion,
(r, a) if r == a && r > 1 => VariantType::Mnp,
_ => VariantType::Complex,
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TransitionType {
Transition,
Transversion,
}
impl TransitionType {
pub fn classify(reference: &str, alt: &str) -> Option<Self> {
if reference.len() != 1 || alt.len() != 1 {
return None;
}
let ref_base = reference.chars().next()?.to_ascii_uppercase();
let alt_base = alt.chars().next()?.to_ascii_uppercase();
match (ref_base, alt_base) {
('A', 'G') | ('G', 'A') | ('C', 'T') | ('T', 'C') => Some(TransitionType::Transition),
('A', 'C') | ('A', 'T') | ('G', 'C') | ('G', 'T') | ('C', 'A') | ('C', 'G')
| ('T', 'A') | ('T', 'G') => Some(TransitionType::Transversion),
_ => None,
}
}
}
#[derive(Debug, Clone)]
pub struct VcfStats {
pub total_variants: usize,
pub transitions: usize,
pub transversions: usize,
pub variant_types: CategoryCounter<VariantType>,
pub filter_status: CategoryCounter<String>,
pub chrom_distribution: CategoryCounter<String>,
pub quality_stats: RunningStats,
pub multi_allelic_sites: usize,
pub total_alleles: usize,
pub pass_count: usize,
pub allele_frequencies: Vec<f64>,
}
impl VcfStats {
pub fn new() -> Self {
Self {
total_variants: 0,
transitions: 0,
transversions: 0,
variant_types: CategoryCounter::new(),
filter_status: CategoryCounter::new(),
chrom_distribution: CategoryCounter::new(),
quality_stats: RunningStats::new(),
multi_allelic_sites: 0,
total_alleles: 0,
pass_count: 0,
allele_frequencies: Vec::new(),
}
}
pub fn update(&mut self, record: &VcfRecord) {
self.total_variants += 1;
self.chrom_distribution.increment(record.chrom.clone());
self.filter_status.increment(record.filter.clone());
if record.is_pass() {
self.pass_count += 1;
}
if let Some(qual) = record.qual {
self.quality_stats.push(qual);
}
let num_alts = record.alt.len();
self.total_alleles += num_alts;
if num_alts > 1 {
self.multi_allelic_sites += 1;
}
for alt in &record.alt {
let var_type = VariantType::classify(&record.reference, alt);
self.variant_types.increment(var_type);
if var_type == VariantType::Snp {
if let Some(tt) = TransitionType::classify(&record.reference, alt) {
match tt {
TransitionType::Transition => self.transitions += 1,
TransitionType::Transversion => self.transversions += 1,
}
}
}
}
if let Some(af) = Self::extract_af_from_info(&record.info) {
self.allele_frequencies.push(af);
}
}
pub fn compute(reader: &mut VcfReader) -> Result<Self> {
let mut stats = Self::new();
while let Some(record) = reader.next_record()? {
stats.update(&record);
}
Ok(stats)
}
pub fn ts_tv_ratio(&self) -> Option<f64> {
if self.transversions > 0 {
Some(self.transitions as f64 / self.transversions as f64)
} else {
None
}
}
pub fn total_variants(&self) -> usize {
self.total_variants
}
pub fn snp_count(&self) -> usize {
self.variant_types.get(&VariantType::Snp)
}
pub fn insertion_count(&self) -> usize {
self.variant_types.get(&VariantType::Insertion)
}
pub fn deletion_count(&self) -> usize {
self.variant_types.get(&VariantType::Deletion)
}
pub fn indel_count(&self) -> usize {
self.insertion_count() + self.deletion_count()
}
pub fn pass_rate(&self) -> f64 {
if self.total_variants == 0 {
0.0
} else {
self.pass_count as f64 / self.total_variants as f64
}
}
pub fn multi_allelic_rate(&self) -> f64 {
if self.total_variants == 0 {
0.0
} else {
self.multi_allelic_sites as f64 / self.total_variants as f64
}
}
pub fn mean_quality(&self) -> Option<f64> {
self.quality_stats.mean()
}
pub fn quality_stats(&self) -> &RunningStats {
&self.quality_stats
}
pub fn variants_per_chromosome(&self) -> &CategoryCounter<String> {
&self.chrom_distribution
}
pub fn filter_distribution(&self) -> &CategoryCounter<String> {
&self.filter_status
}
fn extract_af_from_info(info: &str) -> Option<f64> {
for field in info.split(';') {
if let Some(af_str) = field.strip_prefix("AF=") {
if let Some(first_af) = af_str.split(',').next() {
if let Ok(af) = first_af.parse::<f64>() {
return Some(af);
}
}
}
}
None
}
pub fn print_summary(&self) {
println!("=== VCF Statistics Summary ===");
println!();
println!("Total variants: {}", self.total_variants);
println!(" PASS: {} ({:.1}%)", self.pass_count, self.pass_rate() * 100.0);
println!();
println!("Variant Types:");
println!(" SNPs: {}", self.snp_count());
println!(" Insertions: {}", self.insertion_count());
println!(" Deletions: {}", self.deletion_count());
println!(" Indels: {}", self.indel_count());
println!(" MNPs: {}", self.variant_types.get(&VariantType::Mnp));
println!(
" Structural: {}",
self.variant_types.get(&VariantType::Structural)
);
println!();
if let Some(ratio) = self.ts_tv_ratio() {
println!("Ts/Tv ratio: {:.3}", ratio);
println!(" Transitions: {}", self.transitions);
println!(" Transversions: {}", self.transversions);
println!();
}
println!(
"Multi-allelic sites: {} ({:.1}%)",
self.multi_allelic_sites,
self.multi_allelic_rate() * 100.0
);
println!("Total ALT alleles: {}", self.total_alleles);
println!();
if let Some(mean) = self.quality_stats.mean() {
println!("Quality Scores:");
println!(" Mean: {:.2}", mean);
if let Some(std) = self.quality_stats.std_dev() {
println!(" Std Dev: {:.2}", std);
}
println!(" Min: {:.2}", self.quality_stats.min().unwrap_or(0.0));
println!(" Max: {:.2}", self.quality_stats.max().unwrap_or(0.0));
println!();
}
if self.chrom_distribution.num_categories() <= 30 {
println!("Variants per chromosome:");
let mut chroms: Vec<_> = self.chrom_distribution.iter().collect();
chroms.sort_by_key(|(_, count)| std::cmp::Reverse(**count));
for (chrom, count) in chroms.iter().take(10) {
println!(" {}: {}", chrom, count);
}
if chroms.len() > 10 {
println!(" ... and {} more", chroms.len() - 10);
}
}
}
}
impl Default for VcfStats {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone, Default)]
pub struct GenotypeStats {
pub hom_ref: usize,
pub het: usize,
pub hom_alt: usize,
pub missing: usize,
pub total: usize,
}
impl GenotypeStats {
pub fn new() -> Self {
Self::default()
}
pub fn update(&mut self, gt: &str) {
self.total += 1;
let gt = gt.trim();
let parts: Vec<&str> = if gt.contains('/') {
gt.split('/').collect()
} else if gt.contains('|') {
gt.split('|').collect()
} else {
vec![gt]
};
if parts.len() != 2 {
return; }
match (parts[0], parts[1]) {
("0", "0") => self.hom_ref += 1,
("1", "1") => self.hom_alt += 1,
("0", "1") | ("1", "0") => self.het += 1,
(".", ".") => self.missing += 1,
_ => {} }
}
pub fn allele_frequency(&self) -> Option<f64> {
let total_called = self.total - self.missing;
if total_called == 0 {
return None;
}
let alt_allele_count = self.het + 2 * self.hom_alt;
let total_alleles = 2 * total_called;
Some(alt_allele_count as f64 / total_alleles as f64)
}
pub fn heterozygosity(&self) -> Option<f64> {
let total_called = self.total - self.missing;
if total_called == 0 {
return None;
}
Some(self.het as f64 / total_called as f64)
}
pub fn hardy_weinberg_test(&self) -> Option<(f64, f64)> {
let total_called = self.total - self.missing;
if total_called < 10 {
return None; }
let af = self.allele_frequency()?;
let p = af;
let q = 1.0 - p;
let n = total_called as f64;
let exp_hom_ref = n * q * q;
let exp_het = n * 2.0 * p * q;
let exp_hom_alt = n * p * p;
let chi_sq = ((self.hom_ref as f64 - exp_hom_ref).powi(2) / exp_hom_ref)
+ ((self.het as f64 - exp_het).powi(2) / exp_het)
+ ((self.hom_alt as f64 - exp_hom_alt).powi(2) / exp_hom_alt);
let p_value = chi_squared_p_value(chi_sq, 1);
Some((chi_sq, p_value))
}
}
fn chi_squared_p_value(chi_sq: f64, df: u32) -> f64 {
if df == 1 {
let x = (chi_sq / 2.0).sqrt();
return (1.0 - libm::erf(x / std::f64::consts::SQRT_2)).max(1e-10);
}
1.0
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_variant_type_classification() {
assert_eq!(VariantType::classify("A", "G"), VariantType::Snp);
assert_eq!(VariantType::classify("A", "AT"), VariantType::Insertion);
assert_eq!(VariantType::classify("AT", "A"), VariantType::Deletion);
assert_eq!(VariantType::classify("AT", "GC"), VariantType::Mnp);
assert_eq!(
VariantType::classify("A", "<DEL>"),
VariantType::Structural
);
}
#[test]
fn test_transition_type() {
assert_eq!(
TransitionType::classify("A", "G"),
Some(TransitionType::Transition)
);
assert_eq!(
TransitionType::classify("C", "T"),
Some(TransitionType::Transition)
);
assert_eq!(
TransitionType::classify("A", "C"),
Some(TransitionType::Transversion)
);
assert_eq!(
TransitionType::classify("G", "T"),
Some(TransitionType::Transversion)
);
assert_eq!(TransitionType::classify("AT", "GC"), None);
}
#[test]
fn test_vcf_stats_update() {
let mut stats = VcfStats::new();
let record = VcfRecord {
chrom: "chr1".to_string(),
pos: 100,
id: ".".to_string(),
reference: "A".to_string(),
alt: vec!["G".to_string()],
qual: Some(30.0),
filter: "PASS".to_string(),
info: "DP=100".to_string(),
format: Some("GT".to_string()),
samples: vec!["0/1".to_string()],
};
stats.update(&record);
assert_eq!(stats.total_variants(), 1);
assert_eq!(stats.snp_count(), 1);
assert_eq!(stats.transitions, 1);
assert_eq!(stats.pass_count, 1);
assert_eq!(stats.mean_quality(), Some(30.0));
}
#[test]
fn test_genotype_stats() {
let mut stats = GenotypeStats::new();
stats.update("0/0");
stats.update("0/1");
stats.update("1/1");
stats.update("./.");
assert_eq!(stats.hom_ref, 1);
assert_eq!(stats.het, 1);
assert_eq!(stats.hom_alt, 1);
assert_eq!(stats.missing, 1);
assert_eq!(stats.allele_frequency(), Some(0.5));
}
#[test]
fn test_extract_af() {
let info = "DP=100;AF=0.25;AC=10";
assert_eq!(VcfStats::extract_af_from_info(info), Some(0.25));
let info_no_af = "DP=100;AC=10";
assert_eq!(VcfStats::extract_af_from_info(info_no_af), None);
}
}