1#![allow(clippy::cast_precision_loss)]
2
3use std::fs::File;
4use std::io::{BufRead, BufReader};
5use std::path::Path;
6
7use rsomics_common::{Result, RsomicsError};
8use serde::Serialize;
9
10#[derive(Debug, Default, Serialize)]
11pub struct VcfStats {
12 pub total: u64,
13 pub snps: u64,
14 pub insertions: u64,
15 pub deletions: u64,
16 pub mnps: u64,
17 pub transitions: u64,
18 pub transversions: u64,
19 pub ti_tv: f64,
20}
21
22fn is_transition(r: u8, a: u8) -> bool {
23 matches!(
24 (r, a),
25 (b'A', b'G') | (b'G', b'A') | (b'C', b'T') | (b'T', b'C')
26 )
27}
28
29pub fn stats(input: &Path) -> Result<VcfStats> {
30 let file = File::open(input)
31 .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", input.display())))?;
32 let reader = BufReader::new(file);
33 let mut s = VcfStats::default();
34
35 for line in reader.lines() {
36 let line = line.map_err(RsomicsError::Io)?;
37 if line.starts_with('#') {
38 continue;
39 }
40 let fields: Vec<&str> = line.splitn(6, '\t').collect();
41 if fields.len() < 5 {
42 continue;
43 }
44 let ref_allele = fields[3];
45 let alt_field = fields[4];
46
47 for alt in alt_field.split(',') {
48 if alt == "." || alt == "*" {
49 continue;
50 }
51 s.total += 1;
52 let rlen = ref_allele.len();
53 let alen = alt.len();
54
55 if rlen == 1 && alen == 1 {
56 s.snps += 1;
57 let rb = ref_allele.as_bytes()[0].to_ascii_uppercase();
58 let ab = alt.as_bytes()[0].to_ascii_uppercase();
59 if is_transition(rb, ab) {
60 s.transitions += 1;
61 } else {
62 s.transversions += 1;
63 }
64 } else if rlen > alen {
65 s.deletions += 1;
66 } else if alen > rlen {
67 s.insertions += 1;
68 } else {
69 s.mnps += 1;
70 }
71 }
72 }
73
74 s.ti_tv = if s.transversions > 0 {
75 s.transitions as f64 / s.transversions as f64
76 } else {
77 0.0
78 };
79
80 Ok(s)
81}