#![allow(clippy::cast_precision_loss)]
use rayon::prelude::*;
use rsomics_pgen::Pgen;
use std::io::{self, Write};
pub struct SampleMiss {
pub fid: String,
pub iid: String,
pub miss_pheno: bool,
pub n_miss: u32,
pub n_geno: u32,
}
pub struct VariantMiss {
pub chrom: String,
pub snp: String,
pub n_miss: u32,
pub n_geno: u32,
}
pub struct Missing {
pub samples: Vec<SampleMiss>,
pub variants: Vec<VariantMiss>,
}
const LANE_LO: u64 = 0x5555_5555_5555_5555;
#[inline]
fn byte_miss(b: u8) -> u8 {
b & !(b >> 1) & 0x55
}
#[inline]
fn scatter(mut m: u64, base: usize, acc: &mut [u32]) {
while m != 0 {
acc[base + (m.trailing_zeros() as usize >> 1)] += 1;
m &= m - 1;
}
}
#[inline]
fn scan_variant(row: &[u8], full_bytes: usize, last_lanes: usize, acc: &mut [u32]) -> u32 {
let words = full_bytes / 8;
let mut n = 0u32;
for c in 0..words {
let off = c * 8;
let w = u64::from_le_bytes(row[off..off + 8].try_into().unwrap());
let miss = w & !(w >> 1) & LANE_LO;
if miss == 0 {
continue;
}
n += miss.count_ones();
scatter(miss, off * 4, acc);
}
for (bi, &b) in row[..full_bytes].iter().enumerate().skip(words * 8) {
let mb = byte_miss(b);
if mb != 0 {
n += mb.count_ones();
scatter(u64::from(mb), bi * 4, acc);
}
}
if last_lanes != 0 {
let keep = (1u8 << (last_lanes * 2)) - 1;
let mb = byte_miss(row[full_bytes]) & keep;
if mb != 0 {
n += mb.count_ones();
scatter(u64::from(mb), full_bytes * 4, acc);
}
}
n
}
#[must_use]
pub fn missing(pgen: &Pgen, threads: usize) -> Missing {
let n_samp = pgen.n_samples();
let n_var = pgen.n_variants();
let bpv = pgen.bytes_per_variant();
let full_bytes = n_samp / 4;
let last_lanes = n_samp % 4;
let gt = &pgen.gt_raw;
let row = |v: usize| >[v * bpv..v * bpv + bpv];
let (per_variant, per_sample): (Vec<u32>, Vec<u32>) = if threads <= 1 {
let mut sample_acc = vec![0u32; n_samp];
let variant_counts: Vec<u32> = (0..n_var)
.map(|v| scan_variant(row(v), full_bytes, last_lanes, &mut sample_acc))
.collect();
(variant_counts, sample_acc)
} else {
let pool = rayon::ThreadPoolBuilder::new()
.num_threads(threads)
.build()
.expect("thread pool");
pool.install(|| {
let mut variant_counts = vec![0u32; n_var];
let sample_acc = variant_counts
.par_iter_mut()
.enumerate()
.fold(
|| vec![0u32; n_samp],
|mut acc, (v, vc)| {
*vc = scan_variant(row(v), full_bytes, last_lanes, &mut acc);
acc
},
)
.reduce(
|| vec![0u32; n_samp],
|mut a, b| {
for (x, y) in a.iter_mut().zip(b) {
*x += y;
}
a
},
);
(variant_counts, sample_acc)
})
};
let n_var_u32 = n_var as u32;
let samples = pgen
.samples
.iter()
.zip(&per_sample)
.map(|(s, &n_miss)| SampleMiss {
fid: s.fid.clone(),
iid: s.iid.clone(),
miss_pheno: pheno_missing(&s.phen),
n_miss,
n_geno: n_var_u32,
})
.collect();
let n_samp_u32 = n_samp as u32;
let variants = pgen
.variants
.iter()
.zip(&per_variant)
.map(|(v, &n_miss)| VariantMiss {
chrom: chrom_code(&v.chrom),
snp: v.id.clone(),
n_miss,
n_geno: n_samp_u32,
})
.collect();
Missing { samples, variants }
}
fn pheno_missing(phen: &str) -> bool {
phen == "-9" || phen == "0"
}
fn chrom_code(chrom: &str) -> String {
match chrom {
"X" | "x" => "23".to_string(),
"Y" | "y" => "24".to_string(),
"XY" | "xy" => "25".to_string(),
"MT" | "mt" | "M" | "m" => "26".to_string(),
other => other.to_string(),
}
}
fn id_width(max_id: usize) -> usize {
if max_id <= 4 { 5 } else { max_id + 3 }
}
fn fid_width(max_fid: usize) -> usize {
if max_fid <= 4 { 4 } else { max_fid + 2 }
}
pub fn write_imiss<W: Write>(samples: &[SampleMiss], out: &mut W) -> io::Result<()> {
let max_fid = samples.iter().map(|s| s.fid.len()).max().unwrap_or(0);
let max_iid = samples.iter().map(|s| s.iid.len()).max().unwrap_or(0);
let fw = fid_width(max_fid);
let iw = id_width(max_iid);
writeln!(
out,
"{:>fw$}{:>iw$}{:>11}{:>9}{:>9}{:>9}",
"FID", "IID", "MISS_PHENO", "N_MISS", "N_GENO", "F_MISS",
)?;
for s in samples {
writeln!(
out,
"{:>fw$}{:>iw$}{:>11}{:>9}{:>9}{:>9}",
s.fid,
s.iid,
if s.miss_pheno { "Y" } else { "N" },
s.n_miss,
s.n_geno,
fmt_g(f_miss(s.n_miss, s.n_geno)),
)?;
}
Ok(())
}
pub fn write_lmiss<W: Write>(variants: &[VariantMiss], out: &mut W) -> io::Result<()> {
let max_snp = variants.iter().map(|v| v.snp.len()).max().unwrap_or(0);
let sw = id_width(max_snp);
writeln!(
out,
"{:>4}{:>sw$}{:>9}{:>9}{:>9}",
"CHR", "SNP", "N_MISS", "N_GENO", "F_MISS"
)?;
for v in variants {
writeln!(
out,
"{:>4}{:>sw$}{:>9}{:>9}{:>9}",
v.chrom,
v.snp,
v.n_miss,
v.n_geno,
fmt_g(f_miss(v.n_miss, v.n_geno)),
)?;
}
Ok(())
}
fn f_miss(n_miss: u32, n_geno: u32) -> f64 {
if n_geno == 0 {
f64::NAN
} else {
f64::from(n_miss) / f64::from(n_geno)
}
}
fn fmt_g(x: f64) -> String {
const P: i32 = 4;
if x.is_nan() {
return "nan".to_string();
}
if x == 0.0 {
return "0".to_string();
}
let rounded = format!("{:.*e}", (P - 1) as usize, x);
let exp: i32 = rounded
.split('e')
.nth(1)
.and_then(|s| s.parse().ok())
.unwrap();
if !(-4..P).contains(&exp) {
let (mant, e) = rounded.split_once('e').unwrap();
let mant = strip_trailing(mant);
let e: i32 = e.parse().unwrap();
format!("{mant}e{}{:02}", if e < 0 { '-' } else { '+' }, e.abs())
} else {
let frac = (P - 1 - exp).max(0) as usize;
strip_trailing(&format!("{x:.frac$}"))
}
}
fn strip_trailing(s: &str) -> String {
if s.contains('.') {
s.trim_end_matches('0').trim_end_matches('.').to_string()
} else {
s.to_string()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn g_formatting_matches_plink_samples() {
assert_eq!(fmt_g(0.04), "0.04");
assert_eq!(fmt_g(0.015), "0.015");
assert_eq!(fmt_g(0.025), "0.025");
assert_eq!(fmt_g(1.0 / 3.0), "0.3333");
assert_eq!(fmt_g(0.5), "0.5");
assert_eq!(fmt_g(0.075), "0.075");
assert_eq!(fmt_g(0.0), "0");
}
#[test]
fn width_rules() {
assert_eq!(fid_width(4), 4);
assert_eq!(fid_width(5), 7);
assert_eq!(fid_width(12), 14);
assert_eq!(id_width(4), 5);
assert_eq!(id_width(5), 8);
assert_eq!(id_width(10), 13);
}
#[test]
fn chrom_codes() {
assert_eq!(chrom_code("X"), "23");
assert_eq!(chrom_code("Y"), "24");
assert_eq!(chrom_code("XY"), "25");
assert_eq!(chrom_code("MT"), "26");
assert_eq!(chrom_code("7"), "7");
}
#[test]
fn pheno_missing_rule() {
assert!(pheno_missing("-9"));
assert!(pheno_missing("0"));
assert!(!pheno_missing("1"));
assert!(!pheno_missing("2"));
}
}