1use std::collections::HashMap;
5use std::io;
6use std::sync::{Arc, Mutex};
7
8use crate::config::{Limits, LimitsExt};
9use crate::modules::QCModule;
10use crate::sequence::Sequence;
11use crate::utils::format::java_format_double;
12
13#[derive(Default)]
16pub struct OverRepresentedData {
17 pub sequences: HashMap<String, u64>,
19 pub count: u64,
21 pub count_at_unique_limit: u64,
23}
24
25impl OverRepresentedData {
26 pub fn new() -> Self {
27 Self::default()
28 }
29}
30
31struct ContaminantHit {
33 name: String,
34 length: usize,
35 percent_id: usize,
36}
37
38impl ContaminantHit {
39 fn is_better_than(&self, other: &Option<ContaminantHit>) -> bool {
43 match other {
44 None => true,
45 Some(b) => {
46 self.length > b.length
47 || (self.length == b.length && self.percent_id > b.percent_id)
48 }
49 }
50 }
51}
52
53impl std::fmt::Display for ContaminantHit {
54 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
55 write!(
57 f,
58 "{} ({}% over {}bp)",
59 self.name, self.percent_id, self.length
60 )
61 }
62}
63
64struct OverrepresentedSeq {
66 seq: String,
67 count: u64,
68 percentage: f64,
69 contaminant_hit: Option<ContaminantHit>,
70}
71
72struct Contaminant {
74 name: String,
75 forward: Vec<u8>,
76 reverse: Vec<u8>,
77}
78
79impl Contaminant {
80 fn new(name: &str, sequence: &str) -> Self {
81 let forward: Vec<u8> = sequence.to_uppercase().bytes().collect();
82 let mut reverse = vec![0u8; forward.len()];
84 for (c, &base) in forward.iter().enumerate() {
85 let rev_pos = (forward.len() - 1) - c;
86 reverse[rev_pos] = match base {
87 b'G' => b'C',
88 b'A' => b'T',
89 b'T' => b'A',
90 b'C' => b'G',
91 _ => base,
92 };
93 }
94 Contaminant {
95 name: name.to_string(),
96 forward,
97 reverse,
98 }
99 }
100
101 fn find_match(&self, query: &str) -> Option<ContaminantHit> {
105 let query_upper = query.to_uppercase();
106
107 if query_upper.len() < 20 && query_upper.len() >= 8 {
109 let forward_str = std::str::from_utf8(&self.forward).unwrap_or("");
110 let reverse_str = std::str::from_utf8(&self.reverse).unwrap_or("");
111
112 if forward_str.contains(&query_upper) {
113 return Some(ContaminantHit {
114 name: self.name.clone(),
115 length: query_upper.len(),
116 percent_id: 100,
117 });
118 }
119 if reverse_str.contains(&query_upper) {
120 return Some(ContaminantHit {
121 name: self.name.clone(),
122 length: query_upper.len(),
123 percent_id: 100,
124 });
125 }
126 }
127
128 let q: Vec<u8> = query_upper.bytes().collect();
129 let mut best_hit: Option<ContaminantHit> = None;
130
131 best_hit = Self::find_strand_match(&self.forward, &q, best_hit, &self.name);
133 best_hit = Self::find_strand_match(&self.reverse, &q, best_hit, &self.name);
134
135 best_hit
136 }
137
138 fn find_strand_match(
139 ca: &[u8],
140 cb: &[u8],
141 mut best_hit: Option<ContaminantHit>,
142 name: &str,
143 ) -> Option<ContaminantHit> {
144 let min_offset = -(ca.len() as isize - 20);
145 let max_offset = cb.len() as isize - 20;
146
147 for offset in min_offset..max_offset {
148 if let Some(hit) = Self::find_match_at_offset(ca, cb, offset, name) {
149 if hit.is_better_than(&best_hit) {
150 best_hit = Some(hit);
151 }
152 }
153 }
154 best_hit
155 }
156
157 fn find_match_at_offset(
159 ca: &[u8],
160 cb: &[u8],
161 offset: isize,
162 name: &str,
163 ) -> Option<ContaminantHit> {
164 let mut best_hit: Option<ContaminantHit> = None;
165 let mut mismatch_count: usize = 0;
166 let mut start: isize = 0;
168 let mut end: isize = 0;
169
170 for (i, &ca_byte) in ca.iter().enumerate() {
172 let j = i as isize + offset;
173 if j < 0 {
174 start = i as isize + 1;
175 continue;
176 }
177 if j >= cb.len() as isize {
178 break;
179 }
180
181 if ca_byte == cb[j as usize] {
182 end = i as isize;
183 } else {
184 mismatch_count += 1;
185 if mismatch_count > 1 {
186 if end >= start {
188 let match_len = (1 + end - start) as usize;
189 if match_len > 20 {
190 let id = ((match_len - (mismatch_count - 1)) * 100) / match_len;
191 let candidate = ContaminantHit {
192 name: name.to_string(),
193 length: match_len,
194 percent_id: id,
195 };
196 if candidate.is_better_than(&best_hit) {
197 best_hit = Some(candidate);
198 }
199 }
200 }
201 start = i as isize + 1;
202 end = i as isize + 1;
203 mismatch_count = 0;
204 }
205 }
206 }
207
208 if end < start {
210 return best_hit;
211 }
212 let match_len = (1 + end - start) as usize;
213 if match_len > 20 {
214 let id = ((match_len - mismatch_count) * 100) / match_len;
215 let candidate = ContaminantHit {
216 name: name.to_string(),
217 length: match_len,
218 percent_id: id,
219 };
220 if candidate.is_better_than(&best_hit) {
221 best_hit = Some(candidate);
222 }
223 }
224
225 best_hit
226 }
227}
228
229fn find_contaminant_hit(query: &str, contaminants: &[Contaminant]) -> Option<ContaminantHit> {
232 let mut best_hit: Option<ContaminantHit> = None;
233
234 for contaminant in contaminants {
235 if let Some(hit) = contaminant.find_match(query) {
236 if hit.is_better_than(&best_hit) {
237 best_hit = Some(hit);
238 }
239 }
240 }
241
242 best_hit
243}
244
245pub struct OverRepresentedSeqs {
246 pub shared_data: Arc<Mutex<OverRepresentedData>>,
247 unique_sequence_count: usize,
249 frozen: bool,
250 dup_length: usize,
251 contaminants: Vec<Contaminant>,
252 limits: Limits,
253 computed: Option<Vec<OverrepresentedSeq>>,
255}
256
257const OBSERVATION_CUTOFF: usize = 100_000;
259
260impl OverRepresentedSeqs {
261 pub fn new(
262 limits: &Limits,
263 dup_length: usize,
264 contaminant_entries: &[(String, String)],
265 shared_data: Arc<Mutex<OverRepresentedData>>,
266 ) -> Self {
267 let contaminants: Vec<Contaminant> = contaminant_entries
268 .iter()
269 .map(|(name, seq)| Contaminant::new(name, seq))
270 .collect();
271
272 OverRepresentedSeqs {
273 shared_data,
274 unique_sequence_count: 0,
275 frozen: false,
276 dup_length,
277 contaminants,
278 limits: limits.clone(),
279 computed: None,
280 }
281 }
282
283 fn get_overrepresented_seqs(&mut self) {
284 if self.computed.is_some() {
285 return;
286 }
287
288 let warn_threshold = self.limits.threshold("overrepresented\twarn", 0.1);
289
290 let data = self.shared_data.lock().unwrap_or_else(|e| e.into_inner());
293 let total_count = data.count;
294
295 let mut keepers: Vec<OverrepresentedSeq> = Vec::new();
296
297 for (seq, &count) in &data.sequences {
298 let percentage = (count as f64 / total_count as f64) * 100.0;
299 if percentage > warn_threshold {
300 let hit = find_contaminant_hit(seq, &self.contaminants);
301 keepers.push(OverrepresentedSeq {
302 seq: seq.clone(),
303 count,
304 percentage,
305 contaminant_hit: hit,
306 });
307 }
308 }
309
310 keepers.sort_by(|a, b| b.count.cmp(&a.count).then_with(|| a.seq.cmp(&b.seq)));
315
316 self.computed = Some(keepers);
317 }
318
319 fn ensure_calculated(&self) -> &[OverrepresentedSeq] {
320 self.computed.as_deref().unwrap_or(&[])
321 }
322}
323
324impl QCModule for OverRepresentedSeqs {
325 fn process_sequence(&mut self, sequence: &Sequence) {
326 self.computed = None;
327
328 let mut data = self.shared_data.lock().unwrap_or_else(|e| e.into_inner());
329 data.count += 1;
330
331 let seq_bytes = &sequence.sequence;
334 let truncate_len = if self.dup_length != 0 && seq_bytes.len() > self.dup_length {
335 self.dup_length
336 } else if seq_bytes.len() > 50 {
337 50
339 } else {
340 seq_bytes.len()
341 };
342 let seq = std::str::from_utf8(&seq_bytes[..truncate_len]).unwrap_or("");
343
344 if let Some(count) = data.sequences.get_mut(seq) {
345 *count += 1;
346 if !self.frozen {
348 data.count_at_unique_limit = data.count;
349 }
350 } else if !self.frozen {
351 data.sequences.insert(seq.to_string(), 1);
352 self.unique_sequence_count += 1;
353 data.count_at_unique_limit = data.count;
354 if self.unique_sequence_count == OBSERVATION_CUTOFF {
355 self.frozen = true;
356 }
357 }
358 }
359
360 fn finalize(&mut self) {
361 self.get_overrepresented_seqs();
362 }
363
364 fn name(&self) -> &str {
365 "Overrepresented sequences"
366 }
367
368 fn description(&self) -> &str {
369 "Identifies sequences which are overrepresented in the set"
370 }
371
372 fn reset(&mut self) {
373 let mut data = self.shared_data.lock().unwrap_or_else(|e| e.into_inner());
374 data.count = 0;
375 data.count_at_unique_limit = 0;
376 data.sequences.clear();
377 self.unique_sequence_count = 0;
378 self.frozen = false;
379 self.computed = None;
380 }
381
382 fn raises_error(&self) -> bool {
383 let error_threshold = self.limits.threshold("overrepresented\terror", 1.0);
384 let seqs = self.ensure_calculated();
385 seqs.first().is_some_and(|s| s.percentage > error_threshold)
387 }
388
389 fn raises_warning(&self) -> bool {
390 let seqs = self.ensure_calculated();
391 !seqs.is_empty()
393 }
394
395 fn ignore_filtered_sequences(&self) -> bool {
396 true
397 }
398
399 fn ignore_in_report(&self) -> bool {
400 self.limits.is_ignored("overrepresented")
401 }
402
403 fn write_text_report(&self, writer: &mut dyn io::Write) -> io::Result<()> {
404 let seqs = self.ensure_calculated();
405
406 if seqs.is_empty() {
410 return Ok(());
411 }
412
413 writeln!(writer, "#Sequence\tCount\tPercentage\tPossible Source")?;
414 for s in seqs {
415 let source = match &s.contaminant_hit {
416 Some(hit) => hit.to_string(),
417 None => "No Hit".to_string(),
418 };
419 writeln!(
426 writer,
427 "{}\t{}\t{}\t{}",
428 s.seq,
429 s.count,
430 java_format_double(s.percentage),
431 source
432 )?;
433 }
434
435 Ok(())
436 }
437}