use std::io;
use crate::config::{Limits, LimitsExt};
use crate::modules::QCModule;
use crate::report::charts::line_graph::{render_line_graph, LineGraphData};
use crate::sequence::Sequence;
use crate::utils::format::java_format_double;
struct GCModelValue {
percentage: usize,
increment: f64,
}
struct GCModel {
models: Vec<Vec<GCModelValue>>,
}
impl GCModel {
fn new(read_length: usize) -> Self {
let mut claiming_counts = vec![0usize; 101];
for pos in 0..=read_length {
let low_count = if pos == 0 { 0.0 } else { pos as f64 - 0.5 };
let high_count = (pos as f64 + 0.5).min(read_length as f64);
let low_count = low_count.max(0.0);
let low_percentage = ((low_count * 100.0) / read_length as f64).round() as usize;
let high_percentage = ((high_count * 100.0) / read_length as f64).round() as usize;
for cc in &mut claiming_counts[low_percentage..=high_percentage] {
*cc += 1;
}
}
let mut models = Vec::with_capacity(read_length + 1);
for pos in 0..=read_length {
let low_count = if pos == 0 { 0.0 } else { pos as f64 - 0.5 };
let high_count = (pos as f64 + 0.5).min(read_length as f64);
let low_count = low_count.max(0.0);
let low_percentage = ((low_count * 100.0) / read_length as f64).round() as usize;
let high_percentage = ((high_count * 100.0) / read_length as f64).round() as usize;
let mut values = Vec::with_capacity((high_percentage - low_percentage) + 1);
for (p, &cc) in (low_percentage..=high_percentage)
.zip(&claiming_counts[low_percentage..=high_percentage])
{
values.push(GCModelValue {
percentage: p,
increment: 1.0 / cc as f64,
});
}
models.push(values);
}
GCModel { models }
}
fn get_model_values(&self, gc_count: usize) -> &[GCModelValue] {
&self.models[gc_count]
}
}
pub struct PerSequenceGCContent {
gc_distribution: [f64; 101],
cached_models: Vec<Option<GCModel>>,
limits: Limits,
deviation_percent: Option<f64>,
theoretical_distribution: Option<[f64; 101]>,
}
impl PerSequenceGCContent {
pub fn new(limits: &Limits) -> Self {
PerSequenceGCContent {
gc_distribution: [0.0; 101],
cached_models: Vec::new(),
limits: limits.clone(),
deviation_percent: None,
theoretical_distribution: None,
}
}
fn truncate_length(len: usize) -> usize {
if len > 1000 {
(len / 1000) * 1000
} else if len > 100 {
(len / 100) * 100
} else {
len
}
}
fn calculate_distribution(&mut self) {
if self.deviation_percent.is_some() {
return;
}
let mut total_count: f64 = 0.0;
let mut first_mode: usize = 0;
let mut mode_count: f64 = 0.0;
for i in 0..101 {
total_count += self.gc_distribution[i];
if self.gc_distribution[i] > mode_count {
mode_count = self.gc_distribution[i];
first_mode = i;
}
}
let mut mode: f64 = 0.0;
let mut mode_duplicates: usize = 0;
let mut fell_off_top = true;
for i in first_mode..101 {
if self.gc_distribution[i]
> self.gc_distribution[first_mode] - (self.gc_distribution[first_mode] / 10.0)
{
mode += i as f64;
mode_duplicates += 1;
} else {
fell_off_top = false;
break;
}
}
let mut fell_off_bottom = true;
if first_mode > 0 {
for i in (0..first_mode).rev() {
if self.gc_distribution[i]
> self.gc_distribution[first_mode] - (self.gc_distribution[first_mode] / 10.0)
{
mode += i as f64;
mode_duplicates += 1;
} else {
fell_off_bottom = false;
break;
}
}
}
if fell_off_bottom || fell_off_top {
mode = first_mode as f64;
} else {
mode /= mode_duplicates as f64;
}
let mut stdev: f64 = 0.0;
for i in 0..101 {
stdev += (i as f64 - mode).powi(2) * self.gc_distribution[i];
}
stdev /= total_count - 1.0;
stdev = stdev.sqrt();
let mut deviation_percent: f64 = 0.0;
let mut theoretical = [0.0f64; 101];
for (i, (theo, &observed)) in theoretical
.iter_mut()
.zip(self.gc_distribution.iter())
.enumerate()
{
let lhs = 1.0 / (2.0 * std::f64::consts::PI * stdev * stdev).sqrt();
let rhs =
std::f64::consts::E.powf(-(((i as f64) - mode).powi(2)) / (2.0 * stdev * stdev));
*theo = lhs * rhs * total_count;
deviation_percent += (*theo - observed).abs();
}
deviation_percent /= total_count;
deviation_percent *= 100.0;
self.theoretical_distribution = Some(theoretical);
self.deviation_percent = Some(deviation_percent);
}
fn get_deviation_percent(&self) -> f64 {
self.deviation_percent.unwrap_or(0.0)
}
}
impl PerSequenceGCContent {
fn build_chart_svg(&self) -> String {
let gc_dist: Vec<f64> = self.gc_distribution.to_vec();
let theoretical = self.theoretical_distribution.unwrap_or([0.0; 101]).to_vec();
let max_val = gc_dist
.iter()
.chain(theoretical.iter())
.cloned()
.fold(0.0_f64, f64::max);
let max_y = max_val;
let x_categories: Vec<String> = (0..101).map(|i| format!("{}", i)).collect();
render_line_graph(&LineGraphData {
data: vec![gc_dist, theoretical],
min_y: 0.0,
max_y,
x_label: "Mean GC content (%)".to_string(),
series_names: vec![
"GC count per read".to_string(),
"Theoretical Distribution".to_string(),
],
x_categories,
title: "GC distribution over all sequences".to_string(),
})
}
}
impl QCModule for PerSequenceGCContent {
fn process_sequence(&mut self, sequence: &Sequence) {
self.deviation_percent = None;
self.theoretical_distribution = None;
let seq = &sequence.sequence;
let truncated_len = Self::truncate_length(seq.len());
if truncated_len == 0 {
return;
}
let mut gc_count: usize = 0;
for &b in &seq[..truncated_len] {
if b == b'G' || b == b'C' {
gc_count += 1;
}
}
if truncated_len >= self.cached_models.len() {
self.cached_models.resize_with(truncated_len + 1, || None);
}
if self.cached_models[truncated_len].is_none() {
self.cached_models[truncated_len] = Some(GCModel::new(truncated_len));
}
let model = self.cached_models[truncated_len].as_ref().unwrap();
let values = model.get_model_values(gc_count);
for v in values {
self.gc_distribution[v.percentage] += v.increment;
}
}
fn name(&self) -> &str {
"Per sequence GC content"
}
fn description(&self) -> &str {
"Shows the distribution of GC contents for whole sequences"
}
fn reset(&mut self) {
self.gc_distribution = [0.0; 101];
self.deviation_percent = None;
self.theoretical_distribution = None;
}
fn finalize(&mut self) {
self.calculate_distribution();
}
fn raises_error(&self) -> bool {
self.get_deviation_percent() > self.limits.threshold("gc_sequence\terror", 30.0)
}
fn raises_warning(&self) -> bool {
self.get_deviation_percent() > self.limits.threshold("gc_sequence\twarn", 15.0)
}
fn ignore_filtered_sequences(&self) -> bool {
true
}
fn ignore_in_report(&self) -> bool {
self.limits.is_ignored("gc_sequence")
}
fn write_text_report(&self, writer: &mut dyn io::Write) -> io::Result<()> {
writeln!(writer, "#GC Content\tCount")?;
for i in 0..101 {
writeln!(
writer,
"{}\t{}",
i,
java_format_double(self.gc_distribution[i])
)?;
}
Ok(())
}
fn chart_image_name(&self) -> Option<&str> {
Some("per_sequence_gc_content")
}
fn chart_alt_text(&self) -> Option<&str> {
Some("Per sequence GC content graph")
}
fn generate_chart_svg(&self) -> Option<String> {
Some(self.build_chart_svg())
}
}