use std::collections::HashMap;
use std::io;
use std::sync::{Arc, Mutex};
use crate::config::{Limits, LimitsExt};
use crate::modules::overrepresented_seqs::OverRepresentedData;
use crate::modules::QCModule;
use crate::report::charts::line_graph::{render_line_graph, LineGraphData};
use crate::sequence::Sequence;
use crate::utils::format::java_format_double;
pub struct DuplicationLevel {
shared_data: Arc<Mutex<OverRepresentedData>>,
limits: Limits,
computed: Option<ComputedLevels>,
}
struct ComputedLevels {
total_percentages: [f64; 16],
percent_different_seqs: f64,
}
const LABELS: [&str; 16] = [
"1", "2", "3", "4", "5", "6", "7", "8", "9", ">10", ">50", ">100", ">500", ">1k", ">5k", ">10k",
];
impl DuplicationLevel {
pub fn new(shared_data: Arc<Mutex<OverRepresentedData>>, limits: &Limits) -> Self {
DuplicationLevel {
shared_data,
limits: limits.clone(),
computed: None,
}
}
pub fn calculate_levels(&mut self) {
if self.computed.is_some() {
return;
}
let data = self.shared_data.lock().unwrap_or_else(|e| e.into_inner());
let mut total_percentages = [0.0f64; 16];
let mut collated_counts: HashMap<u64, u64> = HashMap::new();
for &count in data.sequences.values() {
*collated_counts.entry(count).or_insert(0) += 1;
}
let mut corrected_counts: HashMap<u64, f64> = HashMap::new();
for (&dup_level, &num_observations) in &collated_counts {
let corrected = get_corrected_count(
data.count_at_unique_limit,
data.count,
dup_level,
num_observations,
);
corrected_counts.insert(dup_level, corrected);
}
let mut dedup_total: f64 = 0.0;
let mut raw_total: f64 = 0.0;
for (&dup_level, &count) in &corrected_counts {
dedup_total += count;
raw_total += count * dup_level as f64;
let temp_dup_slot = dup_level as i64 - 1;
let dup_slot: usize = if !(0..=9999).contains(&temp_dup_slot) {
15
} else if temp_dup_slot > 4999 {
14
} else if temp_dup_slot > 999 {
13
} else if temp_dup_slot > 499 {
12
} else if temp_dup_slot > 99 {
11
} else if temp_dup_slot > 49 {
10
} else if temp_dup_slot > 9 {
9
} else {
temp_dup_slot as usize
};
total_percentages[dup_slot] += count * dup_level as f64;
}
for tp in &mut total_percentages {
if raw_total > 0.0 {
*tp /= raw_total;
*tp *= 100.0;
}
}
let percent_different_seqs = if raw_total == 0.0 {
100.0
} else {
(dedup_total / raw_total) * 100.0
};
self.computed = Some(ComputedLevels {
total_percentages,
percent_different_seqs,
});
}
fn ensure_calculated(&self) -> &ComputedLevels {
static DEFAULT: ComputedLevels = ComputedLevels {
total_percentages: [0.0; 16],
percent_different_seqs: 100.0,
};
self.computed.as_ref().unwrap_or(&DEFAULT)
}
}
fn get_corrected_count(
count_at_limit: u64,
total_count: u64,
duplication_level: u64,
number_of_observations: u64,
) -> f64 {
if count_at_limit == total_count {
return number_of_observations as f64;
}
if total_count - number_of_observations < count_at_limit {
return number_of_observations as f64;
}
let mut p_not_seeing_at_limit: f64 = 1.0;
let limit_of_caring =
1.0 - (number_of_observations as f64 / (number_of_observations as f64 + 0.01));
for i in 0..count_at_limit {
p_not_seeing_at_limit *=
((total_count - i) - duplication_level) as f64 / (total_count - i) as f64;
if p_not_seeing_at_limit < limit_of_caring {
p_not_seeing_at_limit = 0.0;
break;
}
}
let p_seeing_at_limit = 1.0 - p_not_seeing_at_limit;
number_of_observations as f64 / p_seeing_at_limit
}
impl DuplicationLevel {
fn build_chart_svg(&self) -> String {
let computed = self.ensure_calculated();
let max_count = 100.0_f64;
let labels: Vec<String> = LABELS.iter().map(|&l| l.to_string()).collect();
let pct_str = format!("{:.2}", computed.percent_different_seqs);
let pct_str = pct_str.trim_end_matches('0').trim_end_matches('.');
let title = format!("Percent of seqs remaining if deduplicated {}%", pct_str);
render_line_graph(&LineGraphData {
data: vec![computed.total_percentages.to_vec()],
min_y: 0.0,
max_y: max_count,
x_label: "Sequence Duplication Level".to_string(),
series_names: vec!["% Total sequences".to_string()],
x_categories: labels,
title,
})
}
}
impl QCModule for DuplicationLevel {
fn process_sequence(&mut self, _sequence: &Sequence) {
}
fn finalize(&mut self) {
self.calculate_levels();
}
fn name(&self) -> &str {
"Sequence Duplication Levels"
}
fn description(&self) -> &str {
"Plots the number of sequences which are duplicated to different levels"
}
fn reset(&mut self) {
self.computed = None;
}
fn raises_error(&self) -> bool {
let threshold = self.limits.threshold("duplication\terror", 50.0);
let computed = self.ensure_calculated();
computed.percent_different_seqs < threshold
}
fn raises_warning(&self) -> bool {
let threshold = self.limits.threshold("duplication\twarn", 70.0);
let computed = self.ensure_calculated();
computed.percent_different_seqs < threshold
}
fn ignore_filtered_sequences(&self) -> bool {
self.limits.is_ignored("duplication")
}
fn ignore_in_report(&self) -> bool {
self.limits.is_ignored("duplication")
}
fn write_text_report(&self, writer: &mut dyn io::Write) -> io::Result<()> {
let computed = self.ensure_calculated();
writeln!(
writer,
"#Total Deduplicated Percentage\t{}",
java_format_double(computed.percent_different_seqs)
)?;
writeln!(writer, "#Duplication Level\tPercentage of total")?;
for (i, (label_str, tp)) in LABELS
.iter()
.zip(computed.total_percentages.iter())
.enumerate()
{
let label = if i == 15 {
format!("{}+", label_str)
} else {
label_str.to_string()
};
writeln!(writer, "{}\t{}", label, java_format_double(*tp))?;
}
Ok(())
}
fn chart_image_name(&self) -> Option<&str> {
Some("duplication_levels")
}
fn chart_alt_text(&self) -> Option<&str> {
Some("Duplication level graph")
}
fn generate_chart_svg(&self) -> Option<String> {
Some(self.build_chart_svg())
}
}