1#![allow(clippy::cast_precision_loss)]
11
12use serde::Serialize;
13
14#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)]
15pub enum SeqType {
16 #[serde(rename = "DNA")]
17 Dna,
18 #[serde(rename = "RNA")]
19 Rna,
20 #[serde(rename = "Protein")]
21 Protein,
22 #[serde(rename = "Other")]
23 Other,
24}
25
26impl SeqType {
27 #[must_use]
28 pub const fn as_str(self) -> &'static str {
29 match self {
30 Self::Dna => "DNA",
31 Self::Rna => "RNA",
32 Self::Protein => "Protein",
33 Self::Other => "Other",
34 }
35 }
36}
37
38#[must_use]
41pub fn count_any_of(haystack: &[u8], needles: &[u8]) -> u64 {
42 let mut seen = [false; 256];
43 let mut total: u64 = 0;
44 for &n in needles {
45 if seen[n as usize] {
46 continue;
47 }
48 seen[n as usize] = true;
49 total += bytecount::count(haystack, n) as u64;
53 }
54 total
55}
56
57#[must_use]
60pub fn classify(sample: &[u8]) -> SeqType {
61 if sample.is_empty() {
62 return SeqType::Other;
63 }
64 let mut has_t = false;
65 let mut has_u = false;
66 let mut has_protein_only = false;
67 for &b in sample {
68 let c = b.to_ascii_uppercase();
69 match c {
70 b'T' => has_t = true,
71 b'U' => has_u = true,
72 b'E' | b'F' | b'I' | b'L' | b'P' | b'Q' | b'Z' | b'X' | b'*' => {
73 has_protein_only = true;
74 }
75 b'A' | b'C' | b'G' | b'N' | b'-' | b'.' | b' ' | b'\n' | b'\r' | b'R' | b'Y' | b'S'
76 | b'W' | b'K' | b'M' | b'B' | b'D' | b'H' | b'V' => {}
77 _ => return SeqType::Other,
78 }
79 }
80 if has_protein_only {
81 SeqType::Protein
82 } else if has_u && !has_t {
83 SeqType::Rna
84 } else {
85 SeqType::Dna
86 }
87}
88
89pub struct LengthStats {
92 counts: Vec<(u64, u64)>,
93 sum: u64,
94 count: u64,
95}
96
97impl LengthStats {
98 #[must_use]
99 pub fn new(mut lengths: Vec<u64>) -> Self {
100 let sum: u64 = lengths.iter().sum();
101 let count = lengths.len() as u64;
102 lengths.sort_unstable();
103 let mut counts: Vec<(u64, u64)> = Vec::new();
104 let mut acc: u64 = 0;
105 let mut i = 0;
106 while i < lengths.len() {
107 let v = lengths[i];
108 let mut j = i;
109 while j < lengths.len() && lengths[j] == v {
110 j += 1;
111 }
112 acc += (j - i) as u64;
113 counts.push((v, acc));
114 i = j;
115 }
116 Self { counts, sum, count }
117 }
118
119 fn get_value(&self, even: bool, i_med_l: u64, i_med_r: u64) -> f64 {
120 let mut flag = false;
121 let mut prev: u64 = 0;
122 for &(len, acc) in &self.counts {
123 if flag {
124 return (len + prev) as f64 / 2.0;
125 }
126 if acc > i_med_l {
127 if even {
128 if acc > i_med_r {
129 return len as f64;
130 }
131 flag = true;
132 prev = len;
133 } else {
134 return len as f64;
135 }
136 }
137 }
138 unreachable!(
141 "LengthStats::get_value: i_med_l={i_med_l} not bracketed in counts (count={}, flag={flag}, prev={prev})",
142 self.count
143 )
144 }
145
146 #[must_use]
147 pub fn q2(&self) -> f64 {
148 if self.counts.is_empty() {
149 return 0.0;
150 }
151 if self.counts.len() == 1 {
152 return self.counts[0].0 as f64;
153 }
154 let even = self.count & 1 == 0;
155 if even {
156 let l = self.count / 2 - 1;
157 let r = self.count / 2;
158 self.get_value(true, l, r)
159 } else {
160 self.get_value(false, self.count / 2, 0)
161 }
162 }
163
164 #[must_use]
165 pub fn q1(&self) -> f64 {
166 if self.counts.is_empty() {
167 return 0.0;
168 }
169 if self.counts.len() == 1 {
170 return self.counts[0].0 as f64;
171 }
172 let parent_even = self.count & 1 == 0;
173 let n = if parent_even {
174 self.count / 2
175 } else {
176 self.count.div_ceil(2)
177 };
178 let even = n & 1 == 0;
179 if even {
180 self.get_value(true, n / 2 - 1, n / 2)
181 } else {
182 self.get_value(false, n / 2, 0)
183 }
184 }
185
186 #[must_use]
187 pub fn q3(&self) -> f64 {
188 if self.counts.is_empty() {
189 return 0.0;
190 }
191 if self.counts.len() == 1 {
192 return self.counts[0].0 as f64;
193 }
194 let parent_even = self.count & 1 == 0;
195 let (n, mean) = if parent_even {
196 (self.count / 2, self.count / 2)
197 } else {
198 (self.count.div_ceil(2), self.count / 2)
199 };
200 let even = n & 1 == 0;
201 if even {
202 self.get_value(true, n / 2 - 1 + mean, n / 2 + mean)
203 } else {
204 self.get_value(false, n / 2 + mean, 0)
205 }
206 }
207
208 #[must_use]
211 pub fn n50_l50(&self) -> (u64, u64) {
212 if self.counts.is_empty() {
213 return (0, 0);
214 }
215 if self.counts.len() == 1 {
216 return (self.counts[0].0, 1);
217 }
218 let half = self.sum as f64 / 2.0;
219 let mut sum_len: f64 = 0.0;
220 let n = self.counts.len();
221 for i in (0..n).rev() {
222 let (len, acc) = self.counts[i];
223 let prev_acc = if i == 0 { 0 } else { self.counts[i - 1].1 };
224 let per_len_count = acc - prev_acc;
225 sum_len += (len * per_len_count) as f64;
226 if sum_len >= half {
227 return (len, (n - i) as u64);
228 }
229 }
230 (0, 0)
231 }
232}
233
234#[cfg(test)]
235#[allow(clippy::float_cmp)]
236mod tests {
237 use super::*;
238
239 #[test]
240 fn quartiles_three_contigs_match_seqkit() {
241 let ls = LengthStats::new(vec![4, 6, 8]);
242 assert_eq!(ls.q1(), 5.0);
243 assert_eq!(ls.q2(), 6.0);
244 assert_eq!(ls.q3(), 7.0);
245 }
246
247 #[test]
248 fn quartiles_one_to_nine_match_seqkit() {
249 let ls = LengthStats::new((1u64..=9).collect());
250 assert_eq!(ls.q1(), 3.0);
251 assert_eq!(ls.q2(), 5.0);
252 assert_eq!(ls.q3(), 7.0);
253 }
254
255 #[test]
256 fn n50_three_contigs() {
257 let ls = LengthStats::new(vec![4, 6, 8]);
258 assert_eq!(ls.n50_l50(), (6, 2));
259 }
260
261 #[test]
262 fn single_length_bucket() {
263 let ls = LengthStats::new(vec![7, 7, 7]);
264 assert_eq!(ls.q1(), 7.0);
265 assert_eq!(ls.q2(), 7.0);
266 assert_eq!(ls.q3(), 7.0);
267 assert_eq!(ls.n50_l50(), (7, 1));
268 }
269
270 #[test]
271 fn empty_is_zero() {
272 let ls = LengthStats::new(vec![]);
273 assert_eq!(ls.q2(), 0.0);
274 assert_eq!(ls.n50_l50(), (0, 0));
275 }
276
277 #[test]
278 fn count_any_of_dedupes_overlapping_classes() {
279 assert_eq!(count_any_of(b"GCgcAT", b"GCgc"), 4);
280 assert_eq!(count_any_of(b"NNnnX", b"Nn"), 4);
281 }
282
283 #[test]
284 fn classify_rna_protein_dna_other() {
285 assert_eq!(classify(b"ACGU"), SeqType::Rna);
286 assert_eq!(classify(b"MEEPSILQRT"), SeqType::Protein);
287 assert_eq!(classify(b"ACGTN"), SeqType::Dna);
288 assert_eq!(classify(b""), SeqType::Other);
289 assert_eq!(classify(b"@#$"), SeqType::Other);
290 }
291}