fastqc_rust/modules/
duplication_level.rs1use std::collections::HashMap;
5use std::io;
6use std::sync::{Arc, Mutex};
7
8use crate::config::{Limits, LimitsExt};
9use crate::modules::overrepresented_seqs::OverRepresentedData;
10use crate::modules::QCModule;
11use crate::report::charts::line_graph::{render_line_graph, LineGraphData};
12use crate::sequence::Sequence;
13use crate::utils::format::java_format_double;
14
15pub struct DuplicationLevel {
16 shared_data: Arc<Mutex<OverRepresentedData>>,
17 limits: Limits,
18 computed: Option<ComputedLevels>,
20}
21
22struct ComputedLevels {
23 total_percentages: [f64; 16],
24 percent_different_seqs: f64,
25}
26
27const LABELS: [&str; 16] = [
29 "1", "2", "3", "4", "5", "6", "7", "8", "9", ">10", ">50", ">100", ">500", ">1k", ">5k", ">10k",
30];
31
32impl DuplicationLevel {
33 pub fn new(shared_data: Arc<Mutex<OverRepresentedData>>, limits: &Limits) -> Self {
34 DuplicationLevel {
35 shared_data,
36 limits: limits.clone(),
37 computed: None,
38 }
39 }
40
41 pub fn calculate_levels(&mut self) {
44 if self.computed.is_some() {
45 return;
46 }
47
48 let data = self.shared_data.lock().unwrap_or_else(|e| e.into_inner());
49
50 let mut total_percentages = [0.0f64; 16];
51
52 let mut collated_counts: HashMap<u64, u64> = HashMap::new();
54 for &count in data.sequences.values() {
55 *collated_counts.entry(count).or_insert(0) += 1;
56 }
57
58 let mut corrected_counts: HashMap<u64, f64> = HashMap::new();
60 for (&dup_level, &num_observations) in &collated_counts {
61 let corrected = get_corrected_count(
62 data.count_at_unique_limit,
63 data.count,
64 dup_level,
65 num_observations,
66 );
67 corrected_counts.insert(dup_level, corrected);
68 }
69
70 let mut dedup_total: f64 = 0.0;
72 let mut raw_total: f64 = 0.0;
73
74 for (&dup_level, &count) in &corrected_counts {
75 dedup_total += count;
76 raw_total += count * dup_level as f64;
77
78 let temp_dup_slot = dup_level as i64 - 1;
80
81 let dup_slot: usize = if !(0..=9999).contains(&temp_dup_slot) {
83 15
84 } else if temp_dup_slot > 4999 {
85 14
86 } else if temp_dup_slot > 999 {
87 13
88 } else if temp_dup_slot > 499 {
89 12
90 } else if temp_dup_slot > 99 {
91 11
92 } else if temp_dup_slot > 49 {
93 10
94 } else if temp_dup_slot > 9 {
95 9
96 } else {
97 temp_dup_slot as usize
98 };
99
100 total_percentages[dup_slot] += count * dup_level as f64;
101 }
102
103 for tp in &mut total_percentages {
105 if raw_total > 0.0 {
106 *tp /= raw_total;
107 *tp *= 100.0;
108 }
109 }
110
111 let percent_different_seqs = if raw_total == 0.0 {
113 100.0
114 } else {
115 (dedup_total / raw_total) * 100.0
116 };
117
118 self.computed = Some(ComputedLevels {
119 total_percentages,
120 percent_different_seqs,
121 });
122 }
123
124 fn ensure_calculated(&self) -> &ComputedLevels {
125 static DEFAULT: ComputedLevels = ComputedLevels {
128 total_percentages: [0.0; 16],
129 percent_different_seqs: 100.0,
130 };
131 self.computed.as_ref().unwrap_or(&DEFAULT)
132 }
133}
134
135fn get_corrected_count(
139 count_at_limit: u64,
140 total_count: u64,
141 duplication_level: u64,
142 number_of_observations: u64,
143) -> f64 {
144 if count_at_limit == total_count {
146 return number_of_observations as f64;
147 }
148
149 if total_count - number_of_observations < count_at_limit {
151 return number_of_observations as f64;
152 }
153
154 let mut p_not_seeing_at_limit: f64 = 1.0;
157
158 let limit_of_caring =
160 1.0 - (number_of_observations as f64 / (number_of_observations as f64 + 0.01));
161
162 for i in 0..count_at_limit {
163 p_not_seeing_at_limit *=
164 ((total_count - i) - duplication_level) as f64 / (total_count - i) as f64;
165
166 if p_not_seeing_at_limit < limit_of_caring {
167 p_not_seeing_at_limit = 0.0;
168 break;
169 }
170 }
171
172 let p_seeing_at_limit = 1.0 - p_not_seeing_at_limit;
174 number_of_observations as f64 / p_seeing_at_limit
175}
176
177impl DuplicationLevel {
178 fn build_chart_svg(&self) -> String {
179 let computed = self.ensure_calculated();
180 let max_count = 100.0_f64;
182
183 let labels: Vec<String> = LABELS.iter().map(|&l| l.to_string()).collect();
184
185 let pct_str = format!("{:.2}", computed.percent_different_seqs);
187 let pct_str = pct_str.trim_end_matches('0').trim_end_matches('.');
188 let title = format!("Percent of seqs remaining if deduplicated {}%", pct_str);
189
190 render_line_graph(&LineGraphData {
191 data: vec![computed.total_percentages.to_vec()],
192 min_y: 0.0,
193 max_y: max_count,
194 x_label: "Sequence Duplication Level".to_string(),
195 series_names: vec!["% Total sequences".to_string()],
196 x_categories: labels,
197 title,
198 })
199 }
200}
201
202impl QCModule for DuplicationLevel {
203 fn process_sequence(&mut self, _sequence: &Sequence) {
204 }
207
208 fn finalize(&mut self) {
209 self.calculate_levels();
210 }
211
212 fn name(&self) -> &str {
213 "Sequence Duplication Levels"
214 }
215
216 fn description(&self) -> &str {
217 "Plots the number of sequences which are duplicated to different levels"
218 }
219
220 fn reset(&mut self) {
221 self.computed = None;
222 }
223
224 fn raises_error(&self) -> bool {
225 let threshold = self.limits.threshold("duplication\terror", 50.0);
226 let computed = self.ensure_calculated();
227 computed.percent_different_seqs < threshold
229 }
230
231 fn raises_warning(&self) -> bool {
232 let threshold = self.limits.threshold("duplication\twarn", 70.0);
233 let computed = self.ensure_calculated();
234 computed.percent_different_seqs < threshold
236 }
237
238 fn ignore_filtered_sequences(&self) -> bool {
239 self.limits.is_ignored("duplication")
241 }
242
243 fn ignore_in_report(&self) -> bool {
244 self.limits.is_ignored("duplication")
245 }
246
247 fn write_text_report(&self, writer: &mut dyn io::Write) -> io::Result<()> {
248 let computed = self.ensure_calculated();
249
250 writeln!(
252 writer,
253 "#Total Deduplicated Percentage\t{}",
254 java_format_double(computed.percent_different_seqs)
255 )?;
256 writeln!(writer, "#Duplication Level\tPercentage of total")?;
258
259 for (i, (label_str, tp)) in LABELS
261 .iter()
262 .zip(computed.total_percentages.iter())
263 .enumerate()
264 {
265 let label = if i == 15 {
266 format!("{}+", label_str)
267 } else {
268 label_str.to_string()
269 };
270 writeln!(writer, "{}\t{}", label, java_format_double(*tp))?;
271 }
272
273 Ok(())
274 }
275
276 fn chart_image_name(&self) -> Option<&str> {
278 Some("duplication_levels")
279 }
280 fn chart_alt_text(&self) -> Option<&str> {
281 Some("Duplication level graph")
282 }
283 fn generate_chart_svg(&self) -> Option<String> {
284 Some(self.build_chart_svg())
285 }
286}