Skip to main content

fastqc_rust/modules/
overrepresented_seqs.rs

1// Overrepresented Sequences module
2// Corresponds to Modules/OverRepresentedSeqs.java
3
4use std::collections::HashMap;
5use std::io;
6use std::sync::{Arc, Mutex};
7
8use crate::config::{Limits, LimitsExt};
9use crate::modules::QCModule;
10use crate::sequence::Sequence;
11use crate::utils::format::java_format_double;
12
13/// Shared data between OverRepresentedSeqs and DuplicationLevel.
14/// In Java, DuplicationLevel directly accesses OverRepresentedSeqs' fields.
15#[derive(Default)]
16pub struct OverRepresentedData {
17    /// Map of truncated sequence -> count
18    pub sequences: HashMap<String, u64>,
19    /// Total number of sequences processed
20    pub count: u64,
21    /// Total count at the point where we reached the unique sequence limit
22    pub count_at_unique_limit: u64,
23}
24
25impl OverRepresentedData {
26    pub fn new() -> Self {
27        Self::default()
28    }
29}
30
31/// Result of contaminant matching for an overrepresented sequence.
32struct ContaminantHit {
33    name: String,
34    length: usize,
35    percent_id: usize,
36}
37
38impl ContaminantHit {
39    /// Returns true if this hit is better (longer match or higher identity at same length)
40    /// than `other`, or if `other` is None.
41    /// Replicates the comparison logic used throughout Contaminant.findMatch().
42    fn is_better_than(&self, other: &Option<ContaminantHit>) -> bool {
43        match other {
44            None => true,
45            Some(b) => {
46                self.length > b.length
47                    || (self.length == b.length && self.percent_id > b.percent_id)
48            }
49        }
50    }
51}
52
53impl std::fmt::Display for ContaminantHit {
54    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
55        // Format matches ContaminantHit.toString()
56        write!(
57            f,
58            "{} ({}% over {}bp)",
59            self.name, self.percent_id, self.length
60        )
61    }
62}
63
64/// A single overrepresented sequence entry for reporting.
65struct OverrepresentedSeq {
66    seq: String,
67    count: u64,
68    percentage: f64,
69    contaminant_hit: Option<ContaminantHit>,
70}
71
72/// A contaminant entry loaded from the contaminant list.
73struct Contaminant {
74    name: String,
75    forward: Vec<u8>,
76    reverse: Vec<u8>,
77}
78
79impl Contaminant {
80    fn new(name: &str, sequence: &str) -> Self {
81        let forward: Vec<u8> = sequence.to_uppercase().bytes().collect();
82        // Reverse complement computed exactly as in Contaminant.java
83        let mut reverse = vec![0u8; forward.len()];
84        for (c, &base) in forward.iter().enumerate() {
85            let rev_pos = (forward.len() - 1) - c;
86            reverse[rev_pos] = match base {
87                b'G' => b'C',
88                b'A' => b'T',
89                b'T' => b'A',
90                b'C' => b'G',
91                _ => base,
92            };
93        }
94        Contaminant {
95            name: name.to_string(),
96            forward,
97            reverse,
98        }
99    }
100
101    /// Replicates Contaminant.findMatch() - for short sequences (<20bp, >=8bp)
102    /// checks if query is a substring of the contaminant. For longer sequences, slides with
103    /// 1 mismatch tolerance and requires >=20bp match length.
104    fn find_match(&self, query: &str) -> Option<ContaminantHit> {
105        let query_upper = query.to_uppercase();
106
107        // Special case for short queries (8-19bp) - exact substring match
108        if query_upper.len() < 20 && query_upper.len() >= 8 {
109            let forward_str = std::str::from_utf8(&self.forward).unwrap_or("");
110            let reverse_str = std::str::from_utf8(&self.reverse).unwrap_or("");
111
112            if forward_str.contains(&query_upper) {
113                return Some(ContaminantHit {
114                    name: self.name.clone(),
115                    length: query_upper.len(),
116                    percent_id: 100,
117                });
118            }
119            if reverse_str.contains(&query_upper) {
120                return Some(ContaminantHit {
121                    name: self.name.clone(),
122                    length: query_upper.len(),
123                    percent_id: 100,
124                });
125            }
126        }
127
128        let q: Vec<u8> = query_upper.bytes().collect();
129        let mut best_hit: Option<ContaminantHit> = None;
130
131        // Check forward strand with sliding window and 1 mismatch tolerance
132        best_hit = Self::find_strand_match(&self.forward, &q, best_hit, &self.name);
133        best_hit = Self::find_strand_match(&self.reverse, &q, best_hit, &self.name);
134
135        best_hit
136    }
137
138    fn find_strand_match(
139        ca: &[u8],
140        cb: &[u8],
141        mut best_hit: Option<ContaminantHit>,
142        name: &str,
143    ) -> Option<ContaminantHit> {
144        let min_offset = -(ca.len() as isize - 20);
145        let max_offset = cb.len() as isize - 20;
146
147        for offset in min_offset..max_offset {
148            if let Some(hit) = Self::find_match_at_offset(ca, cb, offset, name) {
149                if hit.is_better_than(&best_hit) {
150                    best_hit = Some(hit);
151                }
152            }
153        }
154        best_hit
155    }
156
157    /// Replicates the private findMatch(char[], char[], int, int) method.
158    fn find_match_at_offset(
159        ca: &[u8],
160        cb: &[u8],
161        offset: isize,
162        name: &str,
163    ) -> Option<ContaminantHit> {
164        let mut best_hit: Option<ContaminantHit> = None;
165        let mut mismatch_count: usize = 0;
166        // Use isize to avoid underflow when offset causes start > end
167        let mut start: isize = 0;
168        let mut end: isize = 0;
169
170        // index i used to access both ca[i] and cb[i+offset]
171        for (i, &ca_byte) in ca.iter().enumerate() {
172            let j = i as isize + offset;
173            if j < 0 {
174                start = i as isize + 1;
175                continue;
176            }
177            if j >= cb.len() as isize {
178                break;
179            }
180
181            if ca_byte == cb[j as usize] {
182                end = i as isize;
183            } else {
184                mismatch_count += 1;
185                if mismatch_count > 1 {
186                    // Check if match so far is worth recording (>20bp)
187                    if end >= start {
188                        let match_len = (1 + end - start) as usize;
189                        if match_len > 20 {
190                            let id = ((match_len - (mismatch_count - 1)) * 100) / match_len;
191                            let candidate = ContaminantHit {
192                                name: name.to_string(),
193                                length: match_len,
194                                percent_id: id,
195                            };
196                            if candidate.is_better_than(&best_hit) {
197                                best_hit = Some(candidate);
198                            }
199                        }
200                    }
201                    start = i as isize + 1;
202                    end = i as isize + 1;
203                    mismatch_count = 0;
204                }
205            }
206        }
207
208        // Check final stretch
209        if end < start {
210            return best_hit;
211        }
212        let match_len = (1 + end - start) as usize;
213        if match_len > 20 {
214            let id = ((match_len - mismatch_count) * 100) / match_len;
215            let candidate = ContaminantHit {
216                name: name.to_string(),
217                length: match_len,
218                percent_id: id,
219            };
220            if candidate.is_better_than(&best_hit) {
221                best_hit = Some(candidate);
222            }
223        }
224
225        best_hit
226    }
227}
228
229/// Find the best contaminant match for a query sequence.
230/// Replicates ContaminentFinder.findContaminantHit()
231fn find_contaminant_hit(query: &str, contaminants: &[Contaminant]) -> Option<ContaminantHit> {
232    let mut best_hit: Option<ContaminantHit> = None;
233
234    for contaminant in contaminants {
235        if let Some(hit) = contaminant.find_match(query) {
236            if hit.is_better_than(&best_hit) {
237                best_hit = Some(hit);
238            }
239        }
240    }
241
242    best_hit
243}
244
245pub struct OverRepresentedSeqs {
246    pub shared_data: Arc<Mutex<OverRepresentedData>>,
247    /// Maximum 100000 unique sequences tracked
248    unique_sequence_count: usize,
249    frozen: bool,
250    dup_length: usize,
251    contaminants: Vec<Contaminant>,
252    limits: Limits,
253    // Lazily computed
254    computed: Option<Vec<OverrepresentedSeq>>,
255}
256
257/// Maximum number of unique sequences to track
258const OBSERVATION_CUTOFF: usize = 100_000;
259
260impl OverRepresentedSeqs {
261    pub fn new(
262        limits: &Limits,
263        dup_length: usize,
264        contaminant_entries: &[(String, String)],
265        shared_data: Arc<Mutex<OverRepresentedData>>,
266    ) -> Self {
267        let contaminants: Vec<Contaminant> = contaminant_entries
268            .iter()
269            .map(|(name, seq)| Contaminant::new(name, seq))
270            .collect();
271
272        OverRepresentedSeqs {
273            shared_data,
274            unique_sequence_count: 0,
275            frozen: false,
276            dup_length,
277            contaminants,
278            limits: limits.clone(),
279            computed: None,
280        }
281    }
282
283    fn get_overrepresented_seqs(&mut self) {
284        if self.computed.is_some() {
285            return;
286        }
287
288        let warn_threshold = self.limits.threshold("overrepresented\twarn", 0.1);
289
290        // Use into_inner on PoisonError to recover the guard even after a panic in
291        // another module -- losing the analysis is worse than using slightly stale data.
292        let data = self.shared_data.lock().unwrap_or_else(|e| e.into_inner());
293        let total_count = data.count;
294
295        let mut keepers: Vec<OverrepresentedSeq> = Vec::new();
296
297        for (seq, &count) in &data.sequences {
298            let percentage = (count as f64 / total_count as f64) * 100.0;
299            if percentage > warn_threshold {
300                let hit = find_contaminant_hit(seq, &self.contaminants);
301                keepers.push(OverrepresentedSeq {
302                    seq: seq.clone(),
303                    count,
304                    percentage,
305                    contaminant_hit: hit,
306                });
307            }
308        }
309
310        // JAVA COMPAT: Sort by count descending, then by sequence ascending as tiebreaker.
311        // Java's Arrays.sort is stable and preserves HashMap iteration order for equal counts,
312        // but Rust's HashMap has different iteration order. Using sequence as tiebreaker
313        // ensures deterministic output regardless of HashMap order.
314        keepers.sort_by(|a, b| b.count.cmp(&a.count).then_with(|| a.seq.cmp(&b.seq)));
315
316        self.computed = Some(keepers);
317    }
318
319    fn ensure_calculated(&self) -> &[OverrepresentedSeq] {
320        self.computed.as_deref().unwrap_or(&[])
321    }
322}
323
324impl QCModule for OverRepresentedSeqs {
325    fn process_sequence(&mut self, sequence: &Sequence) {
326        self.computed = None;
327
328        let mut data = self.shared_data.lock().unwrap_or_else(|e| e.into_inner());
329        data.count += 1;
330
331        // Truncate sequence to dup_length or 50bp if longer than 50
332        // Safety: sequence bytes are ASCII (uppercased in Sequence::new), so slicing is valid UTF-8
333        let seq_bytes = &sequence.sequence;
334        let truncate_len = if self.dup_length != 0 && seq_bytes.len() > self.dup_length {
335            self.dup_length
336        } else if seq_bytes.len() > 50 {
337            // Default truncation to 50bp for sequences longer than 50
338            50
339        } else {
340            seq_bytes.len()
341        };
342        let seq = std::str::from_utf8(&seq_bytes[..truncate_len]).unwrap_or("");
343
344        if let Some(count) = data.sequences.get_mut(seq) {
345            *count += 1;
346            // Keep updating countAtUniqueLimit while not frozen
347            if !self.frozen {
348                data.count_at_unique_limit = data.count;
349            }
350        } else if !self.frozen {
351            data.sequences.insert(seq.to_string(), 1);
352            self.unique_sequence_count += 1;
353            data.count_at_unique_limit = data.count;
354            if self.unique_sequence_count == OBSERVATION_CUTOFF {
355                self.frozen = true;
356            }
357        }
358    }
359
360    fn finalize(&mut self) {
361        self.get_overrepresented_seqs();
362    }
363
364    fn name(&self) -> &str {
365        "Overrepresented sequences"
366    }
367
368    fn description(&self) -> &str {
369        "Identifies sequences which are overrepresented in the set"
370    }
371
372    fn reset(&mut self) {
373        let mut data = self.shared_data.lock().unwrap_or_else(|e| e.into_inner());
374        data.count = 0;
375        data.count_at_unique_limit = 0;
376        data.sequences.clear();
377        self.unique_sequence_count = 0;
378        self.frozen = false;
379        self.computed = None;
380    }
381
382    fn raises_error(&self) -> bool {
383        let error_threshold = self.limits.threshold("overrepresented\terror", 1.0);
384        let seqs = self.ensure_calculated();
385        // Check if the top sequence exceeds error threshold
386        seqs.first().is_some_and(|s| s.percentage > error_threshold)
387    }
388
389    fn raises_warning(&self) -> bool {
390        let seqs = self.ensure_calculated();
391        // Any overrepresented sequence triggers a warning
392        !seqs.is_empty()
393    }
394
395    fn ignore_filtered_sequences(&self) -> bool {
396        true
397    }
398
399    fn ignore_in_report(&self) -> bool {
400        self.limits.is_ignored("overrepresented")
401    }
402
403    fn write_text_report(&self, writer: &mut dyn io::Write) -> io::Result<()> {
404        let seqs = self.ensure_calculated();
405
406        // When there are no overrepresented sequences, Java writes nothing
407        // to the data section (no header, no rows). The header is only output when
408        // writeTable() is called, which is skipped when overrepresntedSeqs.length == 0.
409        if seqs.is_empty() {
410            return Ok(());
411        }
412
413        writeln!(writer, "#Sequence\tCount\tPercentage\tPossible Source")?;
414        for s in seqs {
415            let source = match &s.contaminant_hit {
416                Some(hit) => hit.to_string(),
417                None => "No Hit".to_string(),
418            };
419            // JAVA COMPAT: Java's OverRepresentedSeqs.OverrepresentedSeq stores
420            // the raw double percentage without rounding (see OverRepresentedSeqs.java:253),
421            // and AbstractQCModule.writeTable serializes it via String.valueOf(getValueAt(...))
422            // (AbstractQCModule.java:159), which returns Java's Double.toString() of the
423            // unrounded value (e.g. "7.160449112640348"). Pass the raw percentage to the
424            // formatter; do not round to 2 decimals.
425            writeln!(
426                writer,
427                "{}\t{}\t{}\t{}",
428                s.seq,
429                s.count,
430                java_format_double(s.percentage),
431                source
432            )?;
433        }
434
435        Ok(())
436    }
437}