extern crate serde;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
pub fn compute_total_counts(length_counts: &BTreeMap<usize, u64>) -> (u64, u64) {
let mut total_bases: u64 = 0;
let mut total_seqs: u64 = 0;
for (length, count) in length_counts.iter() {
total_bases += (*length as u64) * count;
total_seqs += count;
}
(total_bases, total_seqs)
}
pub fn compute_median_length(length_counts: &BTreeMap<usize, u64>, total_seqs: u64) -> f64 {
let middle_seq_index: u64 = total_seqs / 2;
let mut total_observed = 0;
for (seq_len, seq_count) in length_counts.iter() {
total_observed += seq_count;
if total_observed > middle_seq_index {
return *seq_len as f64;
}
}
assert!(total_seqs == 0 && length_counts.is_empty());
0.0
}
pub fn compute_n_score(length_counts: &BTreeMap<usize, u64>, total_bases: u64, target: usize) -> usize {
assert!((1..=99).contains(&target));
let target_bases: f64 = (target as u64*total_bases) as f64 / 100.0;
let mut current_bases: u64 = 0;
for (seq_len, seq_count) in length_counts.iter().rev() {
current_bases += (*seq_len as u64) * *seq_count;
if current_bases as f64 >= target_bases {
return *seq_len;
}
}
assert!(total_bases == 0 && length_counts.is_empty());
0
}
#[derive(Serialize, Deserialize, Debug, PartialEq)]
pub struct LengthStats {
pub total_bases: u64,
pub total_sequences: u64,
pub mean_length: f64,
pub median_length: f64,
pub n10: usize,
pub n25: usize,
pub n50: usize,
pub n75: usize,
pub n90: usize
}
pub fn compute_length_stats(length_counts: &BTreeMap<usize, u64>) -> LengthStats {
let (total_bases, total_seqs): (u64, u64) = compute_total_counts(length_counts);
let median_length: f64 = compute_median_length(length_counts, total_seqs);
let n10: usize = compute_n_score(length_counts, total_bases, 10);
let n25: usize = compute_n_score(length_counts, total_bases, 25);
let n50: usize = compute_n_score(length_counts, total_bases, 50);
let n75: usize = compute_n_score(length_counts, total_bases, 75);
let n90: usize = compute_n_score(length_counts, total_bases, 90);
let final_stats: LengthStats = LengthStats {
total_bases,
total_sequences: total_seqs,
mean_length: (total_bases as f64) / (total_seqs as f64),
median_length,
n10,
n25,
n50,
n75,
n90
};
final_stats
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_compute_total_counts() {
let seq_lens: BTreeMap<usize, u64> = [
(10, 100)
].iter().cloned().collect();
let expected_total_bases: u64 = 1000;
let expected_total_seqs: u64 = 100;
let computed_results = compute_total_counts(&seq_lens);
assert_eq!((expected_total_bases, expected_total_seqs), computed_results);
}
#[test]
fn test_compute_median_length() {
let seq_lens: BTreeMap<usize, u64> = [
(1, 1),
(2, 1),
(3, 1)
].iter().cloned().collect();
let (_total_bases, total_seqs) = compute_total_counts(&seq_lens);
let median: f64 = compute_median_length(&seq_lens, total_seqs);
assert_eq!(median, 2.0);
let seq_lens: BTreeMap<usize, u64> = [
(1, 1),
(2, 1),
(3, 1),
(4, 1)
].iter().cloned().collect();
let (_total_bases, total_seqs) = compute_total_counts(&seq_lens);
let median: f64 = compute_median_length(&seq_lens, total_seqs);
assert_eq!(median, 3.0);
let seq_lens: BTreeMap<usize, u64> = [
(1, 1),
(2, 2),
(3, 1)
].iter().cloned().collect();
let (_total_bases, total_seqs) = compute_total_counts(&seq_lens);
let median: f64 = compute_median_length(&seq_lens, total_seqs);
assert_eq!(median, 2.0);
let seq_lens: BTreeMap<usize, u64> = [
(2, 3),
(3, 2),
(4, 1)
].iter().cloned().collect();
let (_total_bases, total_seqs) = compute_total_counts(&seq_lens);
let median: f64 = compute_median_length(&seq_lens, total_seqs);
assert_eq!(median, 3.0);
}
#[test]
fn test_compute_n_score() {
let seq_lens: BTreeMap<usize, u64> = [
(1, 1),
(2, 1),
(3, 1)
].iter().cloned().collect();
let (total_bases, _total_seqs) = compute_total_counts(&seq_lens);
let n_score = compute_n_score(&seq_lens, total_bases, 50);
assert_eq!(n_score, 3);
let seq_lens: BTreeMap<usize, u64> = [
(1, 1),
(2, 1),
(3, 1),
(4, 1)
].iter().cloned().collect();
let (total_bases, _total_seqs) = compute_total_counts(&seq_lens);
let n_score = compute_n_score(&seq_lens, total_bases, 50);
assert_eq!(n_score, 3);
let seq_lens: BTreeMap<usize, u64> = [
(1, 1),
(2, 2),
(3, 1)
].iter().cloned().collect();
let (total_bases, _total_seqs) = compute_total_counts(&seq_lens);
let n_score = compute_n_score(&seq_lens, total_bases, 50);
assert_eq!(n_score, 2);
let seq_lens: BTreeMap<usize, u64> = [
(2, 3),
(3, 2),
(4, 1)
].iter().cloned().collect();
let (total_bases, _total_seqs) = compute_total_counts(&seq_lens);
let n_score = compute_n_score(&seq_lens, total_bases, 50);
assert_eq!(n_score, 3);
let seq_lens: BTreeMap<usize, u64> = [
(1, 1000),
(1000, 1)
].iter().cloned().collect();
let (total_bases, _total_seqs) = compute_total_counts(&seq_lens);
let n_score = compute_n_score(&seq_lens, total_bases, 50);
assert_eq!(n_score, 1000);
let n_score = compute_n_score(&seq_lens, total_bases, 51);
assert_eq!(n_score, 1);
let seq_lens: BTreeMap<usize, u64> = [
(1, 1001),
(1000, 1)
].iter().cloned().collect();
let (total_bases, _total_seqs) = compute_total_counts(&seq_lens);
let n_score = compute_n_score(&seq_lens, total_bases, 50);
assert_eq!(n_score, 1);
let n_score = compute_n_score(&seq_lens, total_bases, 49);
assert_eq!(n_score, 1000);
let mut seq_lens: BTreeMap<usize, u64> = BTreeMap::new();
for x in 1..101 {
seq_lens.insert(x, 1);
}
let (total_bases, _total_seqs) = compute_total_counts(&seq_lens);
for n_value in 1..100 {
let n_score = compute_n_score(&seq_lens, total_bases, n_value);
let target_value = (total_bases as f64) * (n_value as f64 / 100.0);
let mut total_count: u64 = 0;
for (seq_len, seq_count) in seq_lens.iter() {
if *seq_len >= n_score {
total_count += (*seq_len as u64) * *seq_count;
}
}
assert!((total_count as f64) >= target_value);
}
}
#[test]
fn test_full_all_same() {
let seq_lens: BTreeMap<usize, u64> = [
(10, 100)
].iter().cloned().collect();
let expected_stats: LengthStats = LengthStats {
total_bases: 1000,
total_sequences: 100,
mean_length: 10.0,
median_length: 10.0,
n10: 10,
n25: 10,
n50: 10,
n75: 10,
n90: 10
};
let actual_stats: LengthStats = compute_length_stats(&seq_lens);
assert_eq!(expected_stats, actual_stats);
}
}