1#![allow(clippy::cast_precision_loss)]
11
12use serde::Serialize;
13
14#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)]
15pub enum SeqType {
16 #[serde(rename = "DNA")]
17 Dna,
18 #[serde(rename = "RNA")]
19 Rna,
20 #[serde(rename = "Protein")]
21 Protein,
22 #[serde(rename = "Unlimit")]
25 Unlimit,
26}
27
28impl SeqType {
29 #[must_use]
30 pub const fn as_str(self) -> &'static str {
31 match self {
32 Self::Dna => "DNA",
33 Self::Rna => "RNA",
34 Self::Protein => "Protein",
35 Self::Unlimit => "Unlimit",
36 }
37 }
38}
39
40pub const DEFAULT_ALPHABET_GUESS_LEN: usize = 10_000;
45
46#[must_use]
49pub fn count_any_of(haystack: &[u8], needles: &[u8]) -> u64 {
50 let mut seen = [false; 256];
51 let mut total: u64 = 0;
52 for &n in needles {
53 if seen[n as usize] {
54 continue;
55 }
56 seen[n as usize] = true;
57 total += bytecount::count(haystack, n) as u64;
61 }
62 total
63}
64
65#[must_use]
70pub fn classify(sample: &[u8]) -> SeqType {
71 if sample.is_empty() {
72 return SeqType::Unlimit;
73 }
74 let mut has_t = false;
75 let mut has_u = false;
76 let mut has_protein_only = false;
77 for &b in sample {
78 let c = b.to_ascii_uppercase();
79 match c {
80 b'T' => has_t = true,
81 b'U' => has_u = true,
82 b'E' | b'F' | b'I' | b'L' | b'P' | b'Q' | b'Z' | b'X' | b'*' => {
83 has_protein_only = true;
84 }
85 b'A' | b'C' | b'G' | b'N' | b'-' | b'.' | b' ' | b'\n' | b'\r' | b'R' | b'Y' | b'S'
86 | b'W' | b'K' | b'M' | b'B' | b'D' | b'H' | b'V' => {}
87 _ => return SeqType::Unlimit,
88 }
89 }
90 if has_protein_only {
91 SeqType::Protein
92 } else if has_u && !has_t {
93 SeqType::Rna
94 } else {
95 SeqType::Dna
96 }
97}
98
99pub struct LengthStats {
102 counts: Vec<(u64, u64)>,
103 sum: u64,
104 count: u64,
105}
106
107impl LengthStats {
108 #[must_use]
109 pub fn new(mut lengths: Vec<u64>) -> Self {
110 let sum: u64 = lengths.iter().sum();
111 let count = lengths.len() as u64;
112 lengths.sort_unstable();
113 let mut counts: Vec<(u64, u64)> = Vec::new();
114 let mut acc: u64 = 0;
115 let mut i = 0;
116 while i < lengths.len() {
117 let v = lengths[i];
118 let mut j = i;
119 while j < lengths.len() && lengths[j] == v {
120 j += 1;
121 }
122 acc += (j - i) as u64;
123 counts.push((v, acc));
124 i = j;
125 }
126 Self { counts, sum, count }
127 }
128
129 fn get_value(&self, even: bool, i_med_l: u64, i_med_r: u64) -> f64 {
130 let mut flag = false;
131 let mut prev: u64 = 0;
132 for &(len, acc) in &self.counts {
133 if flag {
134 return (len + prev) as f64 / 2.0;
135 }
136 if acc > i_med_l {
137 if even {
138 if acc > i_med_r {
139 return len as f64;
140 }
141 flag = true;
142 prev = len;
143 } else {
144 return len as f64;
145 }
146 }
147 }
148 unreachable!(
151 "LengthStats::get_value: i_med_l={i_med_l} not bracketed in counts (count={}, flag={flag}, prev={prev})",
152 self.count
153 )
154 }
155
156 #[must_use]
157 pub fn q2(&self) -> f64 {
158 if self.counts.is_empty() {
159 return 0.0;
160 }
161 if self.counts.len() == 1 {
162 return self.counts[0].0 as f64;
163 }
164 let even = self.count & 1 == 0;
165 if even {
166 let l = self.count / 2 - 1;
167 let r = self.count / 2;
168 self.get_value(true, l, r)
169 } else {
170 self.get_value(false, self.count / 2, 0)
171 }
172 }
173
174 #[must_use]
175 pub fn q1(&self) -> f64 {
176 if self.counts.is_empty() {
177 return 0.0;
178 }
179 if self.counts.len() == 1 {
180 return self.counts[0].0 as f64;
181 }
182 let parent_even = self.count & 1 == 0;
183 let n = if parent_even {
184 self.count / 2
185 } else {
186 self.count.div_ceil(2)
187 };
188 let even = n & 1 == 0;
189 if even {
190 self.get_value(true, n / 2 - 1, n / 2)
191 } else {
192 self.get_value(false, n / 2, 0)
193 }
194 }
195
196 #[must_use]
197 pub fn q3(&self) -> f64 {
198 if self.counts.is_empty() {
199 return 0.0;
200 }
201 if self.counts.len() == 1 {
202 return self.counts[0].0 as f64;
203 }
204 let parent_even = self.count & 1 == 0;
205 let (n, mean) = if parent_even {
206 (self.count / 2, self.count / 2)
207 } else {
208 (self.count.div_ceil(2), self.count / 2)
209 };
210 let even = n & 1 == 0;
211 if even {
212 self.get_value(true, n / 2 - 1 + mean, n / 2 + mean)
213 } else {
214 self.get_value(false, n / 2 + mean, 0)
215 }
216 }
217
218 #[must_use]
221 pub fn n50_l50(&self) -> (u64, u64) {
222 if self.counts.is_empty() {
223 return (0, 0);
224 }
225 if self.counts.len() == 1 {
226 return (self.counts[0].0, 1);
227 }
228 let half = self.sum as f64 / 2.0;
229 let mut sum_len: f64 = 0.0;
230 let n = self.counts.len();
231 for i in (0..n).rev() {
232 let (len, acc) = self.counts[i];
233 let prev_acc = if i == 0 { 0 } else { self.counts[i - 1].1 };
234 let per_len_count = acc - prev_acc;
235 sum_len += (len * per_len_count) as f64;
236 if sum_len >= half {
237 return (len, (n - i) as u64);
238 }
239 }
240 (0, 0)
241 }
242}
243
244#[cfg(test)]
245#[allow(clippy::float_cmp)]
246mod tests {
247 use super::*;
248
249 #[test]
250 fn quartiles_three_contigs_match_seqkit() {
251 let ls = LengthStats::new(vec![4, 6, 8]);
252 assert_eq!(ls.q1(), 5.0);
253 assert_eq!(ls.q2(), 6.0);
254 assert_eq!(ls.q3(), 7.0);
255 }
256
257 #[test]
258 fn quartiles_one_to_nine_match_seqkit() {
259 let ls = LengthStats::new((1u64..=9).collect());
260 assert_eq!(ls.q1(), 3.0);
261 assert_eq!(ls.q2(), 5.0);
262 assert_eq!(ls.q3(), 7.0);
263 }
264
265 #[test]
266 fn n50_three_contigs() {
267 let ls = LengthStats::new(vec![4, 6, 8]);
268 assert_eq!(ls.n50_l50(), (6, 2));
269 }
270
271 #[test]
272 fn single_length_bucket() {
273 let ls = LengthStats::new(vec![7, 7, 7]);
274 assert_eq!(ls.q1(), 7.0);
275 assert_eq!(ls.q2(), 7.0);
276 assert_eq!(ls.q3(), 7.0);
277 assert_eq!(ls.n50_l50(), (7, 1));
278 }
279
280 #[test]
281 fn empty_is_zero() {
282 let ls = LengthStats::new(vec![]);
283 assert_eq!(ls.q2(), 0.0);
284 assert_eq!(ls.n50_l50(), (0, 0));
285 }
286
287 #[test]
288 fn count_any_of_dedupes_overlapping_classes() {
289 assert_eq!(count_any_of(b"GCgcAT", b"GCgc"), 4);
290 assert_eq!(count_any_of(b"NNnnX", b"Nn"), 4);
291 }
292
293 #[test]
294 fn classify_rna_protein_dna_unlimit() {
295 assert_eq!(classify(b"ACGU"), SeqType::Rna);
296 assert_eq!(classify(b"MEEPSILQRT"), SeqType::Protein);
297 assert_eq!(classify(b"ACGTN"), SeqType::Dna);
298 assert_eq!(classify(b""), SeqType::Unlimit);
300 assert_eq!(classify(b"@#$"), SeqType::Unlimit);
301 }
302}