Skip to main content

fastqc_rust/modules/
duplication_level.rs

1// Sequence Duplication Levels module
2// Corresponds to Modules/DuplicationLevel.java
3
4use std::collections::HashMap;
5use std::io;
6use std::sync::{Arc, Mutex};
7
8use crate::config::{Limits, LimitsExt};
9use crate::modules::overrepresented_seqs::OverRepresentedData;
10use crate::modules::QCModule;
11use crate::report::charts::line_graph::{render_line_graph, LineGraphData};
12use crate::sequence::Sequence;
13use crate::utils::format::java_format_double;
14
15pub struct DuplicationLevel {
16    shared_data: Arc<Mutex<OverRepresentedData>>,
17    limits: Limits,
18    // Lazily computed
19    computed: Option<ComputedLevels>,
20}
21
22struct ComputedLevels {
23    total_percentages: [f64; 16],
24    percent_different_seqs: f64,
25}
26
27/// The 16 duplication level labels, matching DuplicationLevel.java
28const LABELS: [&str; 16] = [
29    "1", "2", "3", "4", "5", "6", "7", "8", "9", ">10", ">50", ">100", ">500", ">1k", ">5k", ">10k",
30];
31
32impl DuplicationLevel {
33    pub fn new(shared_data: Arc<Mutex<OverRepresentedData>>, limits: &Limits) -> Self {
34        DuplicationLevel {
35            shared_data,
36            limits: limits.clone(),
37            computed: None,
38        }
39    }
40
41    /// Replicates calculateLevels() from DuplicationLevel.java.
42    /// Must be called after all sequences have been processed.
43    pub fn calculate_levels(&mut self) {
44        if self.computed.is_some() {
45            return;
46        }
47
48        let data = self.shared_data.lock().unwrap_or_else(|e| e.into_inner());
49
50        let mut total_percentages = [0.0f64; 16];
51
52        // Collate how many unique sequences have each duplication count
53        let mut collated_counts: HashMap<u64, u64> = HashMap::new();
54        for &count in data.sequences.values() {
55            *collated_counts.entry(count).or_insert(0) += 1;
56        }
57
58        // Apply statistical correction to each duplication level
59        let mut corrected_counts: HashMap<u64, f64> = HashMap::new();
60        for (&dup_level, &num_observations) in &collated_counts {
61            let corrected = get_corrected_count(
62                data.count_at_unique_limit,
63                data.count,
64                dup_level,
65                num_observations,
66            );
67            corrected_counts.insert(dup_level, corrected);
68        }
69
70        // Calculate raw and deduplicated totals from corrected counts
71        let mut dedup_total: f64 = 0.0;
72        let mut raw_total: f64 = 0.0;
73
74        for (&dup_level, &count) in &corrected_counts {
75            dedup_total += count;
76            raw_total += count * dup_level as f64;
77
78            // Map duplication level to one of 16 bins
79            let temp_dup_slot = dup_level as i64 - 1;
80
81            // The dupSlot < 0 is a kludge to handle duplication levels > 2^31
82            let dup_slot: usize = if !(0..=9999).contains(&temp_dup_slot) {
83                15
84            } else if temp_dup_slot > 4999 {
85                14
86            } else if temp_dup_slot > 999 {
87                13
88            } else if temp_dup_slot > 499 {
89                12
90            } else if temp_dup_slot > 99 {
91                11
92            } else if temp_dup_slot > 49 {
93                10
94            } else if temp_dup_slot > 9 {
95                9
96            } else {
97                temp_dup_slot as usize
98            };
99
100            total_percentages[dup_slot] += count * dup_level as f64;
101        }
102
103        // Convert to percentages
104        for tp in &mut total_percentages {
105            if raw_total > 0.0 {
106                *tp /= raw_total;
107                *tp *= 100.0;
108            }
109        }
110
111        // percentDifferentSeqs = (dedupTotal/rawTotal)*100
112        let percent_different_seqs = if raw_total == 0.0 {
113            100.0
114        } else {
115            (dedup_total / raw_total) * 100.0
116        };
117
118        self.computed = Some(ComputedLevels {
119            total_percentages,
120            percent_different_seqs,
121        });
122    }
123
124    fn ensure_calculated(&self) -> &ComputedLevels {
125        // SAFETY: finalize() must be called before any reporting method.
126        // If a caller skips finalize(), we provide a static default to avoid panicking.
127        static DEFAULT: ComputedLevels = ComputedLevels {
128            total_percentages: [0.0; 16],
129            percent_different_seqs: 100.0,
130        };
131        self.computed.as_ref().unwrap_or(&DEFAULT)
132    }
133}
134
135/// Replicates getCorrectedCount() from DuplicationLevel.java.
136/// Corrects observed counts to account for sequences that might have been missed
137/// because we stopped adding new unique sequences after the observation cutoff.
138fn get_corrected_count(
139    count_at_limit: u64,
140    total_count: u64,
141    duplication_level: u64,
142    number_of_observations: u64,
143) -> f64 {
144    // Bail out early if we saw all sequences
145    if count_at_limit == total_count {
146        return number_of_observations as f64;
147    }
148
149    // If not enough sequences left to hide another sequence with this count
150    if total_count - number_of_observations < count_at_limit {
151        return number_of_observations as f64;
152    }
153
154    // Calculate probability of NOT seeing a sequence with this duplication
155    // level within the first countAtLimit sequences
156    let mut p_not_seeing_at_limit: f64 = 1.0;
157
158    // Probability below which we stop caring (won't change count by 0.01)
159    let limit_of_caring =
160        1.0 - (number_of_observations as f64 / (number_of_observations as f64 + 0.01));
161
162    for i in 0..count_at_limit {
163        p_not_seeing_at_limit *=
164            ((total_count - i) - duplication_level) as f64 / (total_count - i) as f64;
165
166        if p_not_seeing_at_limit < limit_of_caring {
167            p_not_seeing_at_limit = 0.0;
168            break;
169        }
170    }
171
172    // Invert to get chance of seeing, then scale up
173    let p_seeing_at_limit = 1.0 - p_not_seeing_at_limit;
174    number_of_observations as f64 / p_seeing_at_limit
175}
176
177impl DuplicationLevel {
178    fn build_chart_svg(&self) -> String {
179        let computed = self.ensure_calculated();
180        // Java uses fixed maxCount=100 (percentage scale 0-100%), not dynamic
181        let max_count = 100.0_f64;
182
183        let labels: Vec<String> = LABELS.iter().map(|&l| l.to_string()).collect();
184
185        // Title uses Java's DecimalFormat("#.##") — up to 2 decimals, trailing zeros stripped
186        let pct_str = format!("{:.2}", computed.percent_different_seqs);
187        let pct_str = pct_str.trim_end_matches('0').trim_end_matches('.');
188        let title = format!("Percent of seqs remaining if deduplicated {}%", pct_str);
189
190        render_line_graph(&LineGraphData {
191            data: vec![computed.total_percentages.to_vec()],
192            min_y: 0.0,
193            max_y: max_count,
194            x_label: "Sequence Duplication Level".to_string(),
195            series_names: vec!["% Total sequences".to_string()],
196            x_categories: labels,
197            title,
198        })
199    }
200}
201
202impl QCModule for DuplicationLevel {
203    fn process_sequence(&mut self, _sequence: &Sequence) {
204        // DuplicationLevel doesn't process sequences itself;
205        // it uses the shared data from OverRepresentedSeqs.
206    }
207
208    fn finalize(&mut self) {
209        self.calculate_levels();
210    }
211
212    fn name(&self) -> &str {
213        "Sequence Duplication Levels"
214    }
215
216    fn description(&self) -> &str {
217        "Plots the number of sequences which are duplicated to different levels"
218    }
219
220    fn reset(&mut self) {
221        self.computed = None;
222    }
223
224    fn raises_error(&self) -> bool {
225        let threshold = self.limits.threshold("duplication\terror", 50.0);
226        let computed = self.ensure_calculated();
227        // Error if percent different seqs is BELOW the threshold
228        computed.percent_different_seqs < threshold
229    }
230
231    fn raises_warning(&self) -> bool {
232        let threshold = self.limits.threshold("duplication\twarn", 70.0);
233        let computed = self.ensure_calculated();
234        // Warning if percent different seqs is BELOW the threshold
235        computed.percent_different_seqs < threshold
236    }
237
238    fn ignore_filtered_sequences(&self) -> bool {
239        // The ignoreFilteredSequences in Java checks the ignore config
240        self.limits.is_ignored("duplication")
241    }
242
243    fn ignore_in_report(&self) -> bool {
244        self.limits.is_ignored("duplication")
245    }
246
247    fn write_text_report(&self, writer: &mut dyn io::Write) -> io::Result<()> {
248        let computed = self.ensure_calculated();
249
250        // First line is the total deduplicated percentage
251        writeln!(
252            writer,
253            "#Total Deduplicated Percentage\t{}",
254            java_format_double(computed.percent_different_seqs)
255        )?;
256        // Header for the duplication level data
257        writeln!(writer, "#Duplication Level\tPercentage of total")?;
258
259        // Iterate labels and percentages together
260        for (i, (label_str, tp)) in LABELS
261            .iter()
262            .zip(computed.total_percentages.iter())
263            .enumerate()
264        {
265            let label = if i == 15 {
266                format!("{}+", label_str)
267            } else {
268                label_str.to_string()
269            };
270            writeln!(writer, "{}\t{}", label, java_format_double(*tp))?;
271        }
272
273        Ok(())
274    }
275
276    // Image filename matches Java's "duplication_levels.png" in Images/
277    fn chart_image_name(&self) -> Option<&str> {
278        Some("duplication_levels")
279    }
280    fn chart_alt_text(&self) -> Option<&str> {
281        Some("Duplication level graph")
282    }
283    fn generate_chart_svg(&self) -> Option<String> {
284        Some(self.build_chart_svg())
285    }
286}