rsomics-plink-missing 0.1.0

Per-sample and per-variant genotype missingness from a PLINK1 binary fileset (plink --missing)
Documentation
//! PLINK1 `--missing`: per-sample (`.imiss`) and per-variant (`.lmiss`)
//! genotype missingness.
//!
//! A genotype is missing iff its packed 2-bit code is `0b01`. Per sample we
//! report N_MISS / N_GENO (= variant count) / F_MISS; per variant N_MISS /
//! N_GENO (= sample count) / F_MISS. F_MISS is N_MISS / N_GENO, printed with
//! C `%g`. Unlike `--het`, every variant is reported regardless of chromosome.

#![allow(clippy::cast_precision_loss)]

use rayon::prelude::*;
use rsomics_pgen::Pgen;
use std::io::{self, Write};

pub struct SampleMiss {
    pub fid: String,
    pub iid: String,
    pub miss_pheno: bool,
    pub n_miss: u32,
    pub n_geno: u32,
}

pub struct VariantMiss {
    pub chrom: String,
    pub snp: String,
    pub n_miss: u32,
    pub n_geno: u32,
}

pub struct Missing {
    pub samples: Vec<SampleMiss>,
    pub variants: Vec<VariantMiss>,
}

/// Lo-bit of each 2-bit lane across a 64-bit word.
const LANE_LO: u64 = 0x5555_5555_5555_5555;

/// Missing-lane mask of one byte: bit `2*lane` set when that lane's code is
/// `0b01`. PLINK missing = lo-bit set, hi-bit clear.
#[inline]
fn byte_miss(b: u8) -> u8 {
    b & !(b >> 1) & 0x55
}

/// Scatter the set lanes of `m` (lo-bit-per-lane mask, lane `l` at bit `2l`)
/// into `acc`, sample index `base + l`.
#[inline]
fn scatter(mut m: u64, base: usize, acc: &mut [u32]) {
    while m != 0 {
        acc[base + (m.trailing_zeros() as usize >> 1)] += 1;
        m &= m - 1;
    }
}

/// Tally one variant row's missing genotypes into `acc` (per-sample) and return
/// its total (per-variant). Missingness is sparse, so whole words with no
/// missing lane are skipped before the per-lane scatter. Trailing lanes beyond
/// `n_samp` are zero-padded (decode HomA1, never missing).
#[inline]
fn scan_variant(row: &[u8], full_bytes: usize, last_lanes: usize, acc: &mut [u32]) -> u32 {
    let words = full_bytes / 8;
    let mut n = 0u32;
    for c in 0..words {
        let off = c * 8;
        let w = u64::from_le_bytes(row[off..off + 8].try_into().unwrap());
        let miss = w & !(w >> 1) & LANE_LO;
        if miss == 0 {
            continue;
        }
        n += miss.count_ones();
        scatter(miss, off * 4, acc);
    }
    for (bi, &b) in row[..full_bytes].iter().enumerate().skip(words * 8) {
        let mb = byte_miss(b);
        if mb != 0 {
            n += mb.count_ones();
            scatter(u64::from(mb), bi * 4, acc);
        }
    }
    if last_lanes != 0 {
        let keep = (1u8 << (last_lanes * 2)) - 1;
        let mb = byte_miss(row[full_bytes]) & keep;
        if mb != 0 {
            n += mb.count_ones();
            scatter(u64::from(mb), full_bytes * 4, acc);
        }
    }
    n
}

#[must_use]
pub fn missing(pgen: &Pgen, threads: usize) -> Missing {
    let n_samp = pgen.n_samples();
    let n_var = pgen.n_variants();
    let bpv = pgen.bytes_per_variant();
    let full_bytes = n_samp / 4;
    let last_lanes = n_samp % 4;
    let gt = &pgen.gt_raw;
    let row = |v: usize| &gt[v * bpv..v * bpv + bpv];

    let (per_variant, per_sample): (Vec<u32>, Vec<u32>) = if threads <= 1 {
        let mut sample_acc = vec![0u32; n_samp];
        let variant_counts: Vec<u32> = (0..n_var)
            .map(|v| scan_variant(row(v), full_bytes, last_lanes, &mut sample_acc))
            .collect();
        (variant_counts, sample_acc)
    } else {
        let pool = rayon::ThreadPoolBuilder::new()
            .num_threads(threads)
            .build()
            .expect("thread pool");
        pool.install(|| {
            let mut variant_counts = vec![0u32; n_var];
            let sample_acc = variant_counts
                .par_iter_mut()
                .enumerate()
                .fold(
                    || vec![0u32; n_samp],
                    |mut acc, (v, vc)| {
                        *vc = scan_variant(row(v), full_bytes, last_lanes, &mut acc);
                        acc
                    },
                )
                .reduce(
                    || vec![0u32; n_samp],
                    |mut a, b| {
                        for (x, y) in a.iter_mut().zip(b) {
                            *x += y;
                        }
                        a
                    },
                );
            (variant_counts, sample_acc)
        })
    };

    let n_var_u32 = n_var as u32;
    let samples = pgen
        .samples
        .iter()
        .zip(&per_sample)
        .map(|(s, &n_miss)| SampleMiss {
            fid: s.fid.clone(),
            iid: s.iid.clone(),
            miss_pheno: pheno_missing(&s.phen),
            n_miss,
            n_geno: n_var_u32,
        })
        .collect();

    let n_samp_u32 = n_samp as u32;
    let variants = pgen
        .variants
        .iter()
        .zip(&per_variant)
        .map(|(v, &n_miss)| VariantMiss {
            chrom: chrom_code(&v.chrom),
            snp: v.id.clone(),
            n_miss,
            n_geno: n_samp_u32,
        })
        .collect();

    Missing { samples, variants }
}

/// MISS_PHENO is "Y" when the phenotype is unset: `-9` or `0`.
fn pheno_missing(phen: &str) -> bool {
    phen == "-9" || phen == "0"
}

/// `.lmiss` reports PLINK's numeric chromosome code: X→23, Y→24, XY→25, MT→26.
/// Other contigs pass through unchanged.
fn chrom_code(chrom: &str) -> String {
    match chrom {
        "X" | "x" => "23".to_string(),
        "Y" | "y" => "24".to_string(),
        "XY" | "xy" => "25".to_string(),
        "MT" | "mt" | "M" | "m" => "26".to_string(),
        other => other.to_string(),
    }
}

fn id_width(max_id: usize) -> usize {
    if max_id <= 4 { 5 } else { max_id + 3 }
}

fn fid_width(max_fid: usize) -> usize {
    if max_fid <= 4 { 4 } else { max_fid + 2 }
}

pub fn write_imiss<W: Write>(samples: &[SampleMiss], out: &mut W) -> io::Result<()> {
    let max_fid = samples.iter().map(|s| s.fid.len()).max().unwrap_or(0);
    let max_iid = samples.iter().map(|s| s.iid.len()).max().unwrap_or(0);
    let fw = fid_width(max_fid);
    let iw = id_width(max_iid);

    writeln!(
        out,
        "{:>fw$}{:>iw$}{:>11}{:>9}{:>9}{:>9}",
        "FID", "IID", "MISS_PHENO", "N_MISS", "N_GENO", "F_MISS",
    )?;
    for s in samples {
        writeln!(
            out,
            "{:>fw$}{:>iw$}{:>11}{:>9}{:>9}{:>9}",
            s.fid,
            s.iid,
            if s.miss_pheno { "Y" } else { "N" },
            s.n_miss,
            s.n_geno,
            fmt_g(f_miss(s.n_miss, s.n_geno)),
        )?;
    }
    Ok(())
}

pub fn write_lmiss<W: Write>(variants: &[VariantMiss], out: &mut W) -> io::Result<()> {
    let max_snp = variants.iter().map(|v| v.snp.len()).max().unwrap_or(0);
    let sw = id_width(max_snp);

    writeln!(
        out,
        "{:>4}{:>sw$}{:>9}{:>9}{:>9}",
        "CHR", "SNP", "N_MISS", "N_GENO", "F_MISS"
    )?;
    for v in variants {
        writeln!(
            out,
            "{:>4}{:>sw$}{:>9}{:>9}{:>9}",
            v.chrom,
            v.snp,
            v.n_miss,
            v.n_geno,
            fmt_g(f_miss(v.n_miss, v.n_geno)),
        )?;
    }
    Ok(())
}

fn f_miss(n_miss: u32, n_geno: u32) -> f64 {
    if n_geno == 0 {
        f64::NAN
    } else {
        f64::from(n_miss) / f64::from(n_geno)
    }
}

/// Faithful C `printf("%.4g")` — PLINK's F_MISS format. Keeps 4 significant
/// figures, switches to scientific notation when the decimal exponent is `< -4`
/// or `>= 4`, and strips trailing zeros.
fn fmt_g(x: f64) -> String {
    const P: i32 = 4;
    if x.is_nan() {
        return "nan".to_string();
    }
    if x == 0.0 {
        return "0".to_string();
    }
    let rounded = format!("{:.*e}", (P - 1) as usize, x);
    let exp: i32 = rounded
        .split('e')
        .nth(1)
        .and_then(|s| s.parse().ok())
        .unwrap();

    if !(-4..P).contains(&exp) {
        let (mant, e) = rounded.split_once('e').unwrap();
        let mant = strip_trailing(mant);
        let e: i32 = e.parse().unwrap();
        format!("{mant}e{}{:02}", if e < 0 { '-' } else { '+' }, e.abs())
    } else {
        let frac = (P - 1 - exp).max(0) as usize;
        strip_trailing(&format!("{x:.frac$}"))
    }
}

fn strip_trailing(s: &str) -> String {
    if s.contains('.') {
        s.trim_end_matches('0').trim_end_matches('.').to_string()
    } else {
        s.to_string()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn g_formatting_matches_plink_samples() {
        assert_eq!(fmt_g(0.04), "0.04");
        assert_eq!(fmt_g(0.015), "0.015");
        assert_eq!(fmt_g(0.025), "0.025");
        assert_eq!(fmt_g(1.0 / 3.0), "0.3333");
        assert_eq!(fmt_g(0.5), "0.5");
        assert_eq!(fmt_g(0.075), "0.075");
        assert_eq!(fmt_g(0.0), "0");
    }

    #[test]
    fn width_rules() {
        assert_eq!(fid_width(4), 4);
        assert_eq!(fid_width(5), 7);
        assert_eq!(fid_width(12), 14);
        assert_eq!(id_width(4), 5);
        assert_eq!(id_width(5), 8);
        assert_eq!(id_width(10), 13);
    }

    #[test]
    fn chrom_codes() {
        assert_eq!(chrom_code("X"), "23");
        assert_eq!(chrom_code("Y"), "24");
        assert_eq!(chrom_code("XY"), "25");
        assert_eq!(chrom_code("MT"), "26");
        assert_eq!(chrom_code("7"), "7");
    }

    #[test]
    fn pheno_missing_rule() {
        assert!(pheno_missing("-9"));
        assert!(pheno_missing("0"));
        assert!(!pheno_missing("1"));
        assert!(!pheno_missing("2"));
    }
}