1#![allow(clippy::cast_precision_loss)]
10
11use rayon::prelude::*;
12use rsomics_pgen::Pgen;
13use std::io::{self, Write};
14
15pub struct SampleMiss {
16 pub fid: String,
17 pub iid: String,
18 pub miss_pheno: bool,
19 pub n_miss: u32,
20 pub n_geno: u32,
21}
22
23pub struct VariantMiss {
24 pub chrom: String,
25 pub snp: String,
26 pub n_miss: u32,
27 pub n_geno: u32,
28}
29
30pub struct Missing {
31 pub samples: Vec<SampleMiss>,
32 pub variants: Vec<VariantMiss>,
33}
34
35const LANE_LO: u64 = 0x5555_5555_5555_5555;
37
38#[inline]
41fn byte_miss(b: u8) -> u8 {
42 b & !(b >> 1) & 0x55
43}
44
45#[inline]
48fn scatter(mut m: u64, base: usize, acc: &mut [u32]) {
49 while m != 0 {
50 acc[base + (m.trailing_zeros() as usize >> 1)] += 1;
51 m &= m - 1;
52 }
53}
54
55#[inline]
60fn scan_variant(row: &[u8], full_bytes: usize, last_lanes: usize, acc: &mut [u32]) -> u32 {
61 let words = full_bytes / 8;
62 let mut n = 0u32;
63 for c in 0..words {
64 let off = c * 8;
65 let w = u64::from_le_bytes(row[off..off + 8].try_into().unwrap());
66 let miss = w & !(w >> 1) & LANE_LO;
67 if miss == 0 {
68 continue;
69 }
70 n += miss.count_ones();
71 scatter(miss, off * 4, acc);
72 }
73 for (bi, &b) in row[..full_bytes].iter().enumerate().skip(words * 8) {
74 let mb = byte_miss(b);
75 if mb != 0 {
76 n += mb.count_ones();
77 scatter(u64::from(mb), bi * 4, acc);
78 }
79 }
80 if last_lanes != 0 {
81 let keep = (1u8 << (last_lanes * 2)) - 1;
82 let mb = byte_miss(row[full_bytes]) & keep;
83 if mb != 0 {
84 n += mb.count_ones();
85 scatter(u64::from(mb), full_bytes * 4, acc);
86 }
87 }
88 n
89}
90
91#[must_use]
92pub fn missing(pgen: &Pgen, threads: usize) -> Missing {
93 let n_samp = pgen.n_samples();
94 let n_var = pgen.n_variants();
95 let bpv = pgen.bytes_per_variant();
96 let full_bytes = n_samp / 4;
97 let last_lanes = n_samp % 4;
98 let gt = &pgen.gt_raw;
99 let row = |v: usize| >[v * bpv..v * bpv + bpv];
100
101 let (per_variant, per_sample): (Vec<u32>, Vec<u32>) = if threads <= 1 {
102 let mut sample_acc = vec![0u32; n_samp];
103 let variant_counts: Vec<u32> = (0..n_var)
104 .map(|v| scan_variant(row(v), full_bytes, last_lanes, &mut sample_acc))
105 .collect();
106 (variant_counts, sample_acc)
107 } else {
108 let pool = rayon::ThreadPoolBuilder::new()
109 .num_threads(threads)
110 .build()
111 .expect("thread pool");
112 pool.install(|| {
113 let mut variant_counts = vec![0u32; n_var];
114 let sample_acc = variant_counts
115 .par_iter_mut()
116 .enumerate()
117 .fold(
118 || vec![0u32; n_samp],
119 |mut acc, (v, vc)| {
120 *vc = scan_variant(row(v), full_bytes, last_lanes, &mut acc);
121 acc
122 },
123 )
124 .reduce(
125 || vec![0u32; n_samp],
126 |mut a, b| {
127 for (x, y) in a.iter_mut().zip(b) {
128 *x += y;
129 }
130 a
131 },
132 );
133 (variant_counts, sample_acc)
134 })
135 };
136
137 let n_var_u32 = n_var as u32;
138 let samples = pgen
139 .samples
140 .iter()
141 .zip(&per_sample)
142 .map(|(s, &n_miss)| SampleMiss {
143 fid: s.fid.clone(),
144 iid: s.iid.clone(),
145 miss_pheno: pheno_missing(&s.phen),
146 n_miss,
147 n_geno: n_var_u32,
148 })
149 .collect();
150
151 let n_samp_u32 = n_samp as u32;
152 let variants = pgen
153 .variants
154 .iter()
155 .zip(&per_variant)
156 .map(|(v, &n_miss)| VariantMiss {
157 chrom: chrom_code(&v.chrom),
158 snp: v.id.clone(),
159 n_miss,
160 n_geno: n_samp_u32,
161 })
162 .collect();
163
164 Missing { samples, variants }
165}
166
167fn pheno_missing(phen: &str) -> bool {
169 phen == "-9" || phen == "0"
170}
171
172fn chrom_code(chrom: &str) -> String {
175 match chrom {
176 "X" | "x" => "23".to_string(),
177 "Y" | "y" => "24".to_string(),
178 "XY" | "xy" => "25".to_string(),
179 "MT" | "mt" | "M" | "m" => "26".to_string(),
180 other => other.to_string(),
181 }
182}
183
184fn id_width(max_id: usize) -> usize {
185 if max_id <= 4 { 5 } else { max_id + 3 }
186}
187
188fn fid_width(max_fid: usize) -> usize {
189 if max_fid <= 4 { 4 } else { max_fid + 2 }
190}
191
192pub fn write_imiss<W: Write>(samples: &[SampleMiss], out: &mut W) -> io::Result<()> {
193 let max_fid = samples.iter().map(|s| s.fid.len()).max().unwrap_or(0);
194 let max_iid = samples.iter().map(|s| s.iid.len()).max().unwrap_or(0);
195 let fw = fid_width(max_fid);
196 let iw = id_width(max_iid);
197
198 writeln!(
199 out,
200 "{:>fw$}{:>iw$}{:>11}{:>9}{:>9}{:>9}",
201 "FID", "IID", "MISS_PHENO", "N_MISS", "N_GENO", "F_MISS",
202 )?;
203 for s in samples {
204 writeln!(
205 out,
206 "{:>fw$}{:>iw$}{:>11}{:>9}{:>9}{:>9}",
207 s.fid,
208 s.iid,
209 if s.miss_pheno { "Y" } else { "N" },
210 s.n_miss,
211 s.n_geno,
212 fmt_g(f_miss(s.n_miss, s.n_geno)),
213 )?;
214 }
215 Ok(())
216}
217
218pub fn write_lmiss<W: Write>(variants: &[VariantMiss], out: &mut W) -> io::Result<()> {
219 let max_snp = variants.iter().map(|v| v.snp.len()).max().unwrap_or(0);
220 let sw = id_width(max_snp);
221
222 writeln!(
223 out,
224 "{:>4}{:>sw$}{:>9}{:>9}{:>9}",
225 "CHR", "SNP", "N_MISS", "N_GENO", "F_MISS"
226 )?;
227 for v in variants {
228 writeln!(
229 out,
230 "{:>4}{:>sw$}{:>9}{:>9}{:>9}",
231 v.chrom,
232 v.snp,
233 v.n_miss,
234 v.n_geno,
235 fmt_g(f_miss(v.n_miss, v.n_geno)),
236 )?;
237 }
238 Ok(())
239}
240
241fn f_miss(n_miss: u32, n_geno: u32) -> f64 {
242 if n_geno == 0 {
243 f64::NAN
244 } else {
245 f64::from(n_miss) / f64::from(n_geno)
246 }
247}
248
249fn fmt_g(x: f64) -> String {
253 const P: i32 = 4;
254 if x.is_nan() {
255 return "nan".to_string();
256 }
257 if x == 0.0 {
258 return "0".to_string();
259 }
260 let rounded = format!("{:.*e}", (P - 1) as usize, x);
261 let exp: i32 = rounded
262 .split('e')
263 .nth(1)
264 .and_then(|s| s.parse().ok())
265 .unwrap();
266
267 if !(-4..P).contains(&exp) {
268 let (mant, e) = rounded.split_once('e').unwrap();
269 let mant = strip_trailing(mant);
270 let e: i32 = e.parse().unwrap();
271 format!("{mant}e{}{:02}", if e < 0 { '-' } else { '+' }, e.abs())
272 } else {
273 let frac = (P - 1 - exp).max(0) as usize;
274 strip_trailing(&format!("{x:.frac$}"))
275 }
276}
277
278fn strip_trailing(s: &str) -> String {
279 if s.contains('.') {
280 s.trim_end_matches('0').trim_end_matches('.').to_string()
281 } else {
282 s.to_string()
283 }
284}
285
286#[cfg(test)]
287mod tests {
288 use super::*;
289
290 #[test]
291 fn g_formatting_matches_plink_samples() {
292 assert_eq!(fmt_g(0.04), "0.04");
293 assert_eq!(fmt_g(0.015), "0.015");
294 assert_eq!(fmt_g(0.025), "0.025");
295 assert_eq!(fmt_g(1.0 / 3.0), "0.3333");
296 assert_eq!(fmt_g(0.5), "0.5");
297 assert_eq!(fmt_g(0.075), "0.075");
298 assert_eq!(fmt_g(0.0), "0");
299 }
300
301 #[test]
302 fn width_rules() {
303 assert_eq!(fid_width(4), 4);
304 assert_eq!(fid_width(5), 7);
305 assert_eq!(fid_width(12), 14);
306 assert_eq!(id_width(4), 5);
307 assert_eq!(id_width(5), 8);
308 assert_eq!(id_width(10), 13);
309 }
310
311 #[test]
312 fn chrom_codes() {
313 assert_eq!(chrom_code("X"), "23");
314 assert_eq!(chrom_code("Y"), "24");
315 assert_eq!(chrom_code("XY"), "25");
316 assert_eq!(chrom_code("MT"), "26");
317 assert_eq!(chrom_code("7"), "7");
318 }
319
320 #[test]
321 fn pheno_missing_rule() {
322 assert!(pheno_missing("-9"));
323 assert!(pheno_missing("0"));
324 assert!(!pheno_missing("1"));
325 assert!(!pheno_missing("2"));
326 }
327}