Skip to main content

rsomics_seqstats/
lib.rs

1//! Format-agnostic sequence-statistics primitives shared by the
2//! `rsomics-*-stats` tools. The N50/L50/quartile math is a port of
3//! `shenwei356/bio` `util/length-stats.go`; the alphabet guess mirrors
4//! seqkit's `seq.GuessAlphabet`. Sharing this verbatim (rather than
5//! re-deriving per format) is what lets `--all --tabular` byte-agree with
6//! `seqkit stats -a -T` for both FASTA and FASTQ.
7
8// u64→f64 cast: lengths/counts fit in 52-bit mantissa for any real genome;
9// cast only at final quartile/N50/percentage stage.
10#![allow(clippy::cast_precision_loss)]
11
12use serde::Serialize;
13
14#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)]
15pub enum SeqType {
16    #[serde(rename = "DNA")]
17    Dna,
18    #[serde(rename = "RNA")]
19    Rna,
20    #[serde(rename = "Protein")]
21    Protein,
22    // seqkit's catch-all alphabet is "Unlimit" (empty or non-bio sequence);
23    // it never emits "Other", so byte-equality requires this exact string.
24    #[serde(rename = "Unlimit")]
25    Unlimit,
26}
27
28impl SeqType {
29    #[must_use]
30    pub const fn as_str(self) -> &'static str {
31        match self {
32            Self::Dna => "DNA",
33            Self::Rna => "RNA",
34            Self::Protein => "Protein",
35            Self::Unlimit => "Unlimit",
36        }
37    }
38}
39
40/// seqkit guesses the sequence type from the **first record only**, scanning
41/// at most this many bytes of it (`seq.AlphabetGuessSeqLengthThreshold`,
42/// seqkit's `--alphabet-guess-seq-length` default). Callers pass the first
43/// record's sequence truncated to this; accumulating across records diverges.
44pub const DEFAULT_ALPHABET_GUESS_LEN: usize = 10_000;
45
46/// Count every byte of `haystack` equal to any byte in `needles`, deduping
47/// `needles` so overlapping classes (e.g. `b"GCgc"`) are not double-counted.
48#[must_use]
49pub fn count_any_of(haystack: &[u8], needles: &[u8]) -> u64 {
50    let mut seen = [false; 256];
51    let mut total: u64 = 0;
52    for &n in needles {
53        if seen[n as usize] {
54            continue;
55        }
56        seen[n as usize] = true;
57        // Per-needle bytecount SIMD (4-pass over the slice) beats a scalar
58        // single-pass LUT: NEON/AVX2 recoups the extra bandwidth.
59        // (M2: scalar LUT 83 ms vs 53 ms on a chr22 fixture.)
60        total += bytecount::count(haystack, n) as u64;
61    }
62    total
63}
64
65/// seqkit's alphabet guess over one sequence (pass the first record's prefix —
66/// see [`DEFAULT_ALPHABET_GUESS_LEN`]): any protein-only residue ⇒ Protein;
67/// else U-without-T ⇒ RNA; else DNA. An empty or non-bio sequence ⇒ Unlimit.
68/// Ambiguity codes and gaps do not decide the type.
69#[must_use]
70pub fn classify(sample: &[u8]) -> SeqType {
71    if sample.is_empty() {
72        return SeqType::Unlimit;
73    }
74    let mut has_t = false;
75    let mut has_u = false;
76    let mut has_protein_only = false;
77    for &b in sample {
78        let c = b.to_ascii_uppercase();
79        match c {
80            b'T' => has_t = true,
81            b'U' => has_u = true,
82            b'E' | b'F' | b'I' | b'L' | b'P' | b'Q' | b'Z' | b'X' | b'*' => {
83                has_protein_only = true;
84            }
85            b'A' | b'C' | b'G' | b'N' | b'-' | b'.' | b' ' | b'\n' | b'\r' | b'R' | b'Y' | b'S'
86            | b'W' | b'K' | b'M' | b'B' | b'D' | b'H' | b'V' => {}
87            _ => return SeqType::Unlimit,
88        }
89    }
90    if has_protein_only {
91        SeqType::Protein
92    } else if has_u && !has_t {
93        SeqType::Rna
94    } else {
95        SeqType::Dna
96    }
97}
98
99/// Port of `bio/util/length-stats.go`. seqkit's L50 counts unique-length
100/// buckets, not records — reproduced so `--tabular --all` agrees with seqkit.
101pub struct LengthStats {
102    counts: Vec<(u64, u64)>,
103    sum: u64,
104    count: u64,
105}
106
107impl LengthStats {
108    #[must_use]
109    pub fn new(mut lengths: Vec<u64>) -> Self {
110        let sum: u64 = lengths.iter().sum();
111        let count = lengths.len() as u64;
112        lengths.sort_unstable();
113        let mut counts: Vec<(u64, u64)> = Vec::new();
114        let mut acc: u64 = 0;
115        let mut i = 0;
116        while i < lengths.len() {
117            let v = lengths[i];
118            let mut j = i;
119            while j < lengths.len() && lengths[j] == v {
120                j += 1;
121            }
122            acc += (j - i) as u64;
123            counts.push((v, acc));
124            i = j;
125        }
126        Self { counts, sum, count }
127    }
128
129    fn get_value(&self, even: bool, i_med_l: u64, i_med_r: u64) -> f64 {
130        let mut flag = false;
131        let mut prev: u64 = 0;
132        for &(len, acc) in &self.counts {
133            if flag {
134                return (len + prev) as f64 / 2.0;
135            }
136            if acc > i_med_l {
137                if even {
138                    if acc > i_med_r {
139                        return len as f64;
140                    }
141                    flag = true;
142                    prev = len;
143                } else {
144                    return len as f64;
145                }
146            }
147        }
148        // Callers pass i_med_l < self.count, so the last bucket's acc always
149        // terminates the loop. Reaching here means a caller broke the invariant.
150        unreachable!(
151            "LengthStats::get_value: i_med_l={i_med_l} not bracketed in counts (count={}, flag={flag}, prev={prev})",
152            self.count
153        )
154    }
155
156    #[must_use]
157    pub fn q2(&self) -> f64 {
158        if self.counts.is_empty() {
159            return 0.0;
160        }
161        if self.counts.len() == 1 {
162            return self.counts[0].0 as f64;
163        }
164        let even = self.count & 1 == 0;
165        if even {
166            let l = self.count / 2 - 1;
167            let r = self.count / 2;
168            self.get_value(true, l, r)
169        } else {
170            self.get_value(false, self.count / 2, 0)
171        }
172    }
173
174    #[must_use]
175    pub fn q1(&self) -> f64 {
176        if self.counts.is_empty() {
177            return 0.0;
178        }
179        if self.counts.len() == 1 {
180            return self.counts[0].0 as f64;
181        }
182        let parent_even = self.count & 1 == 0;
183        let n = if parent_even {
184            self.count / 2
185        } else {
186            self.count.div_ceil(2)
187        };
188        let even = n & 1 == 0;
189        if even {
190            self.get_value(true, n / 2 - 1, n / 2)
191        } else {
192            self.get_value(false, n / 2, 0)
193        }
194    }
195
196    #[must_use]
197    pub fn q3(&self) -> f64 {
198        if self.counts.is_empty() {
199            return 0.0;
200        }
201        if self.counts.len() == 1 {
202            return self.counts[0].0 as f64;
203        }
204        let parent_even = self.count & 1 == 0;
205        let (n, mean) = if parent_even {
206            (self.count / 2, self.count / 2)
207        } else {
208            (self.count.div_ceil(2), self.count / 2)
209        };
210        let even = n & 1 == 0;
211        if even {
212            self.get_value(true, n / 2 - 1 + mean, n / 2 + mean)
213        } else {
214            self.get_value(false, n / 2 + mean, 0)
215        }
216    }
217
218    /// `(N50, L50)` where L50 is seqkit's `N50_num` (unique-length-bucket
219    /// count, not record count — see the struct doc).
220    #[must_use]
221    pub fn n50_l50(&self) -> (u64, u64) {
222        if self.counts.is_empty() {
223            return (0, 0);
224        }
225        if self.counts.len() == 1 {
226            return (self.counts[0].0, 1);
227        }
228        let half = self.sum as f64 / 2.0;
229        let mut sum_len: f64 = 0.0;
230        let n = self.counts.len();
231        for i in (0..n).rev() {
232            let (len, acc) = self.counts[i];
233            let prev_acc = if i == 0 { 0 } else { self.counts[i - 1].1 };
234            let per_len_count = acc - prev_acc;
235            sum_len += (len * per_len_count) as f64;
236            if sum_len >= half {
237                return (len, (n - i) as u64);
238            }
239        }
240        (0, 0)
241    }
242}
243
244#[cfg(test)]
245#[allow(clippy::float_cmp)]
246mod tests {
247    use super::*;
248
249    #[test]
250    fn quartiles_three_contigs_match_seqkit() {
251        let ls = LengthStats::new(vec![4, 6, 8]);
252        assert_eq!(ls.q1(), 5.0);
253        assert_eq!(ls.q2(), 6.0);
254        assert_eq!(ls.q3(), 7.0);
255    }
256
257    #[test]
258    fn quartiles_one_to_nine_match_seqkit() {
259        let ls = LengthStats::new((1u64..=9).collect());
260        assert_eq!(ls.q1(), 3.0);
261        assert_eq!(ls.q2(), 5.0);
262        assert_eq!(ls.q3(), 7.0);
263    }
264
265    #[test]
266    fn n50_three_contigs() {
267        let ls = LengthStats::new(vec![4, 6, 8]);
268        assert_eq!(ls.n50_l50(), (6, 2));
269    }
270
271    #[test]
272    fn single_length_bucket() {
273        let ls = LengthStats::new(vec![7, 7, 7]);
274        assert_eq!(ls.q1(), 7.0);
275        assert_eq!(ls.q2(), 7.0);
276        assert_eq!(ls.q3(), 7.0);
277        assert_eq!(ls.n50_l50(), (7, 1));
278    }
279
280    #[test]
281    fn empty_is_zero() {
282        let ls = LengthStats::new(vec![]);
283        assert_eq!(ls.q2(), 0.0);
284        assert_eq!(ls.n50_l50(), (0, 0));
285    }
286
287    #[test]
288    fn count_any_of_dedupes_overlapping_classes() {
289        assert_eq!(count_any_of(b"GCgcAT", b"GCgc"), 4);
290        assert_eq!(count_any_of(b"NNnnX", b"Nn"), 4);
291    }
292
293    #[test]
294    fn classify_rna_protein_dna_unlimit() {
295        assert_eq!(classify(b"ACGU"), SeqType::Rna);
296        assert_eq!(classify(b"MEEPSILQRT"), SeqType::Protein);
297        assert_eq!(classify(b"ACGTN"), SeqType::Dna);
298        // seqkit prints "Unlimit" (not "Other") for empty / non-bio input.
299        assert_eq!(classify(b""), SeqType::Unlimit);
300        assert_eq!(classify(b"@#$"), SeqType::Unlimit);
301    }
302}