Skip to main content

rsomics_plink_missing/
lib.rs

1//! PLINK1 `--missing`: per-sample (`.imiss`) and per-variant (`.lmiss`)
2//! genotype missingness.
3//!
4//! A genotype is missing iff its packed 2-bit code is `0b01`. Per sample we
5//! report N_MISS / N_GENO (= variant count) / F_MISS; per variant N_MISS /
6//! N_GENO (= sample count) / F_MISS. F_MISS is N_MISS / N_GENO, printed with
7//! C `%g`. Unlike `--het`, every variant is reported regardless of chromosome.
8
9#![allow(clippy::cast_precision_loss)]
10
11use rayon::prelude::*;
12use rsomics_pgen::Pgen;
13use std::io::{self, Write};
14
15pub struct SampleMiss {
16    pub fid: String,
17    pub iid: String,
18    pub miss_pheno: bool,
19    pub n_miss: u32,
20    pub n_geno: u32,
21}
22
23pub struct VariantMiss {
24    pub chrom: String,
25    pub snp: String,
26    pub n_miss: u32,
27    pub n_geno: u32,
28}
29
30pub struct Missing {
31    pub samples: Vec<SampleMiss>,
32    pub variants: Vec<VariantMiss>,
33}
34
35/// Lo-bit of each 2-bit lane across a 64-bit word.
36const LANE_LO: u64 = 0x5555_5555_5555_5555;
37
38/// Missing-lane mask of one byte: bit `2*lane` set when that lane's code is
39/// `0b01`. PLINK missing = lo-bit set, hi-bit clear.
40#[inline]
41fn byte_miss(b: u8) -> u8 {
42    b & !(b >> 1) & 0x55
43}
44
45/// Scatter the set lanes of `m` (lo-bit-per-lane mask, lane `l` at bit `2l`)
46/// into `acc`, sample index `base + l`.
47#[inline]
48fn scatter(mut m: u64, base: usize, acc: &mut [u32]) {
49    while m != 0 {
50        acc[base + (m.trailing_zeros() as usize >> 1)] += 1;
51        m &= m - 1;
52    }
53}
54
55/// Tally one variant row's missing genotypes into `acc` (per-sample) and return
56/// its total (per-variant). Missingness is sparse, so whole words with no
57/// missing lane are skipped before the per-lane scatter. Trailing lanes beyond
58/// `n_samp` are zero-padded (decode HomA1, never missing).
59#[inline]
60fn scan_variant(row: &[u8], full_bytes: usize, last_lanes: usize, acc: &mut [u32]) -> u32 {
61    let words = full_bytes / 8;
62    let mut n = 0u32;
63    for c in 0..words {
64        let off = c * 8;
65        let w = u64::from_le_bytes(row[off..off + 8].try_into().unwrap());
66        let miss = w & !(w >> 1) & LANE_LO;
67        if miss == 0 {
68            continue;
69        }
70        n += miss.count_ones();
71        scatter(miss, off * 4, acc);
72    }
73    for (bi, &b) in row[..full_bytes].iter().enumerate().skip(words * 8) {
74        let mb = byte_miss(b);
75        if mb != 0 {
76            n += mb.count_ones();
77            scatter(u64::from(mb), bi * 4, acc);
78        }
79    }
80    if last_lanes != 0 {
81        let keep = (1u8 << (last_lanes * 2)) - 1;
82        let mb = byte_miss(row[full_bytes]) & keep;
83        if mb != 0 {
84            n += mb.count_ones();
85            scatter(u64::from(mb), full_bytes * 4, acc);
86        }
87    }
88    n
89}
90
91#[must_use]
92pub fn missing(pgen: &Pgen, threads: usize) -> Missing {
93    let n_samp = pgen.n_samples();
94    let n_var = pgen.n_variants();
95    let bpv = pgen.bytes_per_variant();
96    let full_bytes = n_samp / 4;
97    let last_lanes = n_samp % 4;
98    let gt = &pgen.gt_raw;
99    let row = |v: usize| &gt[v * bpv..v * bpv + bpv];
100
101    let (per_variant, per_sample): (Vec<u32>, Vec<u32>) = if threads <= 1 {
102        let mut sample_acc = vec![0u32; n_samp];
103        let variant_counts: Vec<u32> = (0..n_var)
104            .map(|v| scan_variant(row(v), full_bytes, last_lanes, &mut sample_acc))
105            .collect();
106        (variant_counts, sample_acc)
107    } else {
108        let pool = rayon::ThreadPoolBuilder::new()
109            .num_threads(threads)
110            .build()
111            .expect("thread pool");
112        pool.install(|| {
113            let mut variant_counts = vec![0u32; n_var];
114            let sample_acc = variant_counts
115                .par_iter_mut()
116                .enumerate()
117                .fold(
118                    || vec![0u32; n_samp],
119                    |mut acc, (v, vc)| {
120                        *vc = scan_variant(row(v), full_bytes, last_lanes, &mut acc);
121                        acc
122                    },
123                )
124                .reduce(
125                    || vec![0u32; n_samp],
126                    |mut a, b| {
127                        for (x, y) in a.iter_mut().zip(b) {
128                            *x += y;
129                        }
130                        a
131                    },
132                );
133            (variant_counts, sample_acc)
134        })
135    };
136
137    let n_var_u32 = n_var as u32;
138    let samples = pgen
139        .samples
140        .iter()
141        .zip(&per_sample)
142        .map(|(s, &n_miss)| SampleMiss {
143            fid: s.fid.clone(),
144            iid: s.iid.clone(),
145            miss_pheno: pheno_missing(&s.phen),
146            n_miss,
147            n_geno: n_var_u32,
148        })
149        .collect();
150
151    let n_samp_u32 = n_samp as u32;
152    let variants = pgen
153        .variants
154        .iter()
155        .zip(&per_variant)
156        .map(|(v, &n_miss)| VariantMiss {
157            chrom: chrom_code(&v.chrom),
158            snp: v.id.clone(),
159            n_miss,
160            n_geno: n_samp_u32,
161        })
162        .collect();
163
164    Missing { samples, variants }
165}
166
167/// MISS_PHENO is "Y" when the phenotype is unset: `-9` or `0`.
168fn pheno_missing(phen: &str) -> bool {
169    phen == "-9" || phen == "0"
170}
171
172/// `.lmiss` reports PLINK's numeric chromosome code: X→23, Y→24, XY→25, MT→26.
173/// Other contigs pass through unchanged.
174fn chrom_code(chrom: &str) -> String {
175    match chrom {
176        "X" | "x" => "23".to_string(),
177        "Y" | "y" => "24".to_string(),
178        "XY" | "xy" => "25".to_string(),
179        "MT" | "mt" | "M" | "m" => "26".to_string(),
180        other => other.to_string(),
181    }
182}
183
184fn id_width(max_id: usize) -> usize {
185    if max_id <= 4 { 5 } else { max_id + 3 }
186}
187
188fn fid_width(max_fid: usize) -> usize {
189    if max_fid <= 4 { 4 } else { max_fid + 2 }
190}
191
192pub fn write_imiss<W: Write>(samples: &[SampleMiss], out: &mut W) -> io::Result<()> {
193    let max_fid = samples.iter().map(|s| s.fid.len()).max().unwrap_or(0);
194    let max_iid = samples.iter().map(|s| s.iid.len()).max().unwrap_or(0);
195    let fw = fid_width(max_fid);
196    let iw = id_width(max_iid);
197
198    writeln!(
199        out,
200        "{:>fw$}{:>iw$}{:>11}{:>9}{:>9}{:>9}",
201        "FID", "IID", "MISS_PHENO", "N_MISS", "N_GENO", "F_MISS",
202    )?;
203    for s in samples {
204        writeln!(
205            out,
206            "{:>fw$}{:>iw$}{:>11}{:>9}{:>9}{:>9}",
207            s.fid,
208            s.iid,
209            if s.miss_pheno { "Y" } else { "N" },
210            s.n_miss,
211            s.n_geno,
212            fmt_g(f_miss(s.n_miss, s.n_geno)),
213        )?;
214    }
215    Ok(())
216}
217
218pub fn write_lmiss<W: Write>(variants: &[VariantMiss], out: &mut W) -> io::Result<()> {
219    let max_snp = variants.iter().map(|v| v.snp.len()).max().unwrap_or(0);
220    let sw = id_width(max_snp);
221
222    writeln!(
223        out,
224        "{:>4}{:>sw$}{:>9}{:>9}{:>9}",
225        "CHR", "SNP", "N_MISS", "N_GENO", "F_MISS"
226    )?;
227    for v in variants {
228        writeln!(
229            out,
230            "{:>4}{:>sw$}{:>9}{:>9}{:>9}",
231            v.chrom,
232            v.snp,
233            v.n_miss,
234            v.n_geno,
235            fmt_g(f_miss(v.n_miss, v.n_geno)),
236        )?;
237    }
238    Ok(())
239}
240
241fn f_miss(n_miss: u32, n_geno: u32) -> f64 {
242    if n_geno == 0 {
243        f64::NAN
244    } else {
245        f64::from(n_miss) / f64::from(n_geno)
246    }
247}
248
249/// Faithful C `printf("%.4g")` — PLINK's F_MISS format. Keeps 4 significant
250/// figures, switches to scientific notation when the decimal exponent is `< -4`
251/// or `>= 4`, and strips trailing zeros.
252fn fmt_g(x: f64) -> String {
253    const P: i32 = 4;
254    if x.is_nan() {
255        return "nan".to_string();
256    }
257    if x == 0.0 {
258        return "0".to_string();
259    }
260    let rounded = format!("{:.*e}", (P - 1) as usize, x);
261    let exp: i32 = rounded
262        .split('e')
263        .nth(1)
264        .and_then(|s| s.parse().ok())
265        .unwrap();
266
267    if !(-4..P).contains(&exp) {
268        let (mant, e) = rounded.split_once('e').unwrap();
269        let mant = strip_trailing(mant);
270        let e: i32 = e.parse().unwrap();
271        format!("{mant}e{}{:02}", if e < 0 { '-' } else { '+' }, e.abs())
272    } else {
273        let frac = (P - 1 - exp).max(0) as usize;
274        strip_trailing(&format!("{x:.frac$}"))
275    }
276}
277
278fn strip_trailing(s: &str) -> String {
279    if s.contains('.') {
280        s.trim_end_matches('0').trim_end_matches('.').to_string()
281    } else {
282        s.to_string()
283    }
284}
285
286#[cfg(test)]
287mod tests {
288    use super::*;
289
290    #[test]
291    fn g_formatting_matches_plink_samples() {
292        assert_eq!(fmt_g(0.04), "0.04");
293        assert_eq!(fmt_g(0.015), "0.015");
294        assert_eq!(fmt_g(0.025), "0.025");
295        assert_eq!(fmt_g(1.0 / 3.0), "0.3333");
296        assert_eq!(fmt_g(0.5), "0.5");
297        assert_eq!(fmt_g(0.075), "0.075");
298        assert_eq!(fmt_g(0.0), "0");
299    }
300
301    #[test]
302    fn width_rules() {
303        assert_eq!(fid_width(4), 4);
304        assert_eq!(fid_width(5), 7);
305        assert_eq!(fid_width(12), 14);
306        assert_eq!(id_width(4), 5);
307        assert_eq!(id_width(5), 8);
308        assert_eq!(id_width(10), 13);
309    }
310
311    #[test]
312    fn chrom_codes() {
313        assert_eq!(chrom_code("X"), "23");
314        assert_eq!(chrom_code("Y"), "24");
315        assert_eq!(chrom_code("XY"), "25");
316        assert_eq!(chrom_code("MT"), "26");
317        assert_eq!(chrom_code("7"), "7");
318    }
319
320    #[test]
321    fn pheno_missing_rule() {
322        assert!(pheno_missing("-9"));
323        assert!(pheno_missing("0"));
324        assert!(!pheno_missing("1"));
325        assert!(!pheno_missing("2"));
326    }
327}