Skip to main content

rsomics_seqstats/
lib.rs

1//! Format-agnostic sequence-statistics primitives shared by the
2//! `rsomics-*-stats` tools. The N50/L50/quartile math is a port of
3//! `shenwei356/bio` `util/length-stats.go`; the alphabet guess mirrors
4//! seqkit's `seq.GuessAlphabet`. Sharing this verbatim (rather than
5//! re-deriving per format) is what lets `--all --tabular` byte-agree with
6//! `seqkit stats -a -T` for both FASTA and FASTQ.
7
8// u64→f64 cast: lengths/counts fit in 52-bit mantissa for any real genome;
9// cast only at final quartile/N50/percentage stage.
10#![allow(clippy::cast_precision_loss)]
11
12use serde::Serialize;
13
14#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)]
15pub enum SeqType {
16    #[serde(rename = "DNA")]
17    Dna,
18    #[serde(rename = "RNA")]
19    Rna,
20    #[serde(rename = "Protein")]
21    Protein,
22    #[serde(rename = "Other")]
23    Other,
24}
25
26impl SeqType {
27    #[must_use]
28    pub const fn as_str(self) -> &'static str {
29        match self {
30            Self::Dna => "DNA",
31            Self::Rna => "RNA",
32            Self::Protein => "Protein",
33            Self::Other => "Other",
34        }
35    }
36}
37
38/// Count every byte of `haystack` equal to any byte in `needles`, deduping
39/// `needles` so overlapping classes (e.g. `b"GCgc"`) are not double-counted.
40#[must_use]
41pub fn count_any_of(haystack: &[u8], needles: &[u8]) -> u64 {
42    let mut seen = [false; 256];
43    let mut total: u64 = 0;
44    for &n in needles {
45        if seen[n as usize] {
46            continue;
47        }
48        seen[n as usize] = true;
49        // Per-needle bytecount SIMD (4-pass over the slice) beats a scalar
50        // single-pass LUT: NEON/AVX2 recoups the extra bandwidth.
51        // (M2: scalar LUT 83 ms vs 53 ms on a chr22 fixture.)
52        total += bytecount::count(haystack, n) as u64;
53    }
54    total
55}
56
57/// seqkit's alphabet guess: any protein-only residue ⇒ Protein; else U-without-T
58/// ⇒ RNA; else DNA. Ambiguity codes and gaps do not decide the type.
59#[must_use]
60pub fn classify(sample: &[u8]) -> SeqType {
61    if sample.is_empty() {
62        return SeqType::Other;
63    }
64    let mut has_t = false;
65    let mut has_u = false;
66    let mut has_protein_only = false;
67    for &b in sample {
68        let c = b.to_ascii_uppercase();
69        match c {
70            b'T' => has_t = true,
71            b'U' => has_u = true,
72            b'E' | b'F' | b'I' | b'L' | b'P' | b'Q' | b'Z' | b'X' | b'*' => {
73                has_protein_only = true;
74            }
75            b'A' | b'C' | b'G' | b'N' | b'-' | b'.' | b' ' | b'\n' | b'\r' | b'R' | b'Y' | b'S'
76            | b'W' | b'K' | b'M' | b'B' | b'D' | b'H' | b'V' => {}
77            _ => return SeqType::Other,
78        }
79    }
80    if has_protein_only {
81        SeqType::Protein
82    } else if has_u && !has_t {
83        SeqType::Rna
84    } else {
85        SeqType::Dna
86    }
87}
88
89/// Port of `bio/util/length-stats.go`. seqkit's L50 counts unique-length
90/// buckets, not records — reproduced so `--tabular --all` agrees with seqkit.
91pub struct LengthStats {
92    counts: Vec<(u64, u64)>,
93    sum: u64,
94    count: u64,
95}
96
97impl LengthStats {
98    #[must_use]
99    pub fn new(mut lengths: Vec<u64>) -> Self {
100        let sum: u64 = lengths.iter().sum();
101        let count = lengths.len() as u64;
102        lengths.sort_unstable();
103        let mut counts: Vec<(u64, u64)> = Vec::new();
104        let mut acc: u64 = 0;
105        let mut i = 0;
106        while i < lengths.len() {
107            let v = lengths[i];
108            let mut j = i;
109            while j < lengths.len() && lengths[j] == v {
110                j += 1;
111            }
112            acc += (j - i) as u64;
113            counts.push((v, acc));
114            i = j;
115        }
116        Self { counts, sum, count }
117    }
118
119    fn get_value(&self, even: bool, i_med_l: u64, i_med_r: u64) -> f64 {
120        let mut flag = false;
121        let mut prev: u64 = 0;
122        for &(len, acc) in &self.counts {
123            if flag {
124                return (len + prev) as f64 / 2.0;
125            }
126            if acc > i_med_l {
127                if even {
128                    if acc > i_med_r {
129                        return len as f64;
130                    }
131                    flag = true;
132                    prev = len;
133                } else {
134                    return len as f64;
135                }
136            }
137        }
138        // Callers pass i_med_l < self.count, so the last bucket's acc always
139        // terminates the loop. Reaching here means a caller broke the invariant.
140        unreachable!(
141            "LengthStats::get_value: i_med_l={i_med_l} not bracketed in counts (count={}, flag={flag}, prev={prev})",
142            self.count
143        )
144    }
145
146    #[must_use]
147    pub fn q2(&self) -> f64 {
148        if self.counts.is_empty() {
149            return 0.0;
150        }
151        if self.counts.len() == 1 {
152            return self.counts[0].0 as f64;
153        }
154        let even = self.count & 1 == 0;
155        if even {
156            let l = self.count / 2 - 1;
157            let r = self.count / 2;
158            self.get_value(true, l, r)
159        } else {
160            self.get_value(false, self.count / 2, 0)
161        }
162    }
163
164    #[must_use]
165    pub fn q1(&self) -> f64 {
166        if self.counts.is_empty() {
167            return 0.0;
168        }
169        if self.counts.len() == 1 {
170            return self.counts[0].0 as f64;
171        }
172        let parent_even = self.count & 1 == 0;
173        let n = if parent_even {
174            self.count / 2
175        } else {
176            self.count.div_ceil(2)
177        };
178        let even = n & 1 == 0;
179        if even {
180            self.get_value(true, n / 2 - 1, n / 2)
181        } else {
182            self.get_value(false, n / 2, 0)
183        }
184    }
185
186    #[must_use]
187    pub fn q3(&self) -> f64 {
188        if self.counts.is_empty() {
189            return 0.0;
190        }
191        if self.counts.len() == 1 {
192            return self.counts[0].0 as f64;
193        }
194        let parent_even = self.count & 1 == 0;
195        let (n, mean) = if parent_even {
196            (self.count / 2, self.count / 2)
197        } else {
198            (self.count.div_ceil(2), self.count / 2)
199        };
200        let even = n & 1 == 0;
201        if even {
202            self.get_value(true, n / 2 - 1 + mean, n / 2 + mean)
203        } else {
204            self.get_value(false, n / 2 + mean, 0)
205        }
206    }
207
208    /// `(N50, L50)` where L50 is seqkit's `N50_num` (unique-length-bucket
209    /// count, not record count — see the struct doc).
210    #[must_use]
211    pub fn n50_l50(&self) -> (u64, u64) {
212        if self.counts.is_empty() {
213            return (0, 0);
214        }
215        if self.counts.len() == 1 {
216            return (self.counts[0].0, 1);
217        }
218        let half = self.sum as f64 / 2.0;
219        let mut sum_len: f64 = 0.0;
220        let n = self.counts.len();
221        for i in (0..n).rev() {
222            let (len, acc) = self.counts[i];
223            let prev_acc = if i == 0 { 0 } else { self.counts[i - 1].1 };
224            let per_len_count = acc - prev_acc;
225            sum_len += (len * per_len_count) as f64;
226            if sum_len >= half {
227                return (len, (n - i) as u64);
228            }
229        }
230        (0, 0)
231    }
232}
233
234#[cfg(test)]
235#[allow(clippy::float_cmp)]
236mod tests {
237    use super::*;
238
239    #[test]
240    fn quartiles_three_contigs_match_seqkit() {
241        let ls = LengthStats::new(vec![4, 6, 8]);
242        assert_eq!(ls.q1(), 5.0);
243        assert_eq!(ls.q2(), 6.0);
244        assert_eq!(ls.q3(), 7.0);
245    }
246
247    #[test]
248    fn quartiles_one_to_nine_match_seqkit() {
249        let ls = LengthStats::new((1u64..=9).collect());
250        assert_eq!(ls.q1(), 3.0);
251        assert_eq!(ls.q2(), 5.0);
252        assert_eq!(ls.q3(), 7.0);
253    }
254
255    #[test]
256    fn n50_three_contigs() {
257        let ls = LengthStats::new(vec![4, 6, 8]);
258        assert_eq!(ls.n50_l50(), (6, 2));
259    }
260
261    #[test]
262    fn single_length_bucket() {
263        let ls = LengthStats::new(vec![7, 7, 7]);
264        assert_eq!(ls.q1(), 7.0);
265        assert_eq!(ls.q2(), 7.0);
266        assert_eq!(ls.q3(), 7.0);
267        assert_eq!(ls.n50_l50(), (7, 1));
268    }
269
270    #[test]
271    fn empty_is_zero() {
272        let ls = LengthStats::new(vec![]);
273        assert_eq!(ls.q2(), 0.0);
274        assert_eq!(ls.n50_l50(), (0, 0));
275    }
276
277    #[test]
278    fn count_any_of_dedupes_overlapping_classes() {
279        assert_eq!(count_any_of(b"GCgcAT", b"GCgc"), 4);
280        assert_eq!(count_any_of(b"NNnnX", b"Nn"), 4);
281    }
282
283    #[test]
284    fn classify_rna_protein_dna_other() {
285        assert_eq!(classify(b"ACGU"), SeqType::Rna);
286        assert_eq!(classify(b"MEEPSILQRT"), SeqType::Protein);
287        assert_eq!(classify(b"ACGTN"), SeqType::Dna);
288        assert_eq!(classify(b""), SeqType::Other);
289        assert_eq!(classify(b"@#$"), SeqType::Other);
290    }
291}