use crate::trim::LengthConfig;
use super::index::IndexFilterConfig;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FilterDecision {
Pass,
FailQuality,
FailLength,
FailComplexity,
FailNRate,
FailIndex,
}
impl FilterDecision {
#[inline]
pub fn is_pass(&self) -> bool {
matches!(self, FilterDecision::Pass)
}
#[inline]
pub fn is_fail(&self) -> bool {
!self.is_pass()
}
}
#[derive(Debug, Clone)]
pub struct FilterConfig {
pub qualified_quality: u8,
pub unqualified_percent_limit: f64,
pub n_base_limit: usize,
pub n_percent_limit: Option<f64>,
pub min_avg_quality: Option<u8>,
pub max_n_rate: Option<f64>,
pub low_complexity_threshold: Option<f64>,
pub length_config: LengthConfig,
pub quality_filtering_enabled: bool,
pub index_filter: IndexFilterConfig,
}
impl Default for FilterConfig {
fn default() -> Self {
Self {
qualified_quality: 15,
unqualified_percent_limit: 40.0,
n_base_limit: 5,
n_percent_limit: None,
min_avg_quality: None,
max_n_rate: None,
low_complexity_threshold: None,
length_config: LengthConfig::default(),
quality_filtering_enabled: true,
index_filter: IndexFilterConfig::default(),
}
}
}
impl FilterConfig {
pub fn new() -> Self {
Self::default()
}
pub fn short_read() -> Self {
Self {
qualified_quality: 15,
unqualified_percent_limit: 40.0,
n_base_limit: 5,
n_percent_limit: None, min_avg_quality: None, max_n_rate: None, low_complexity_threshold: None, length_config: LengthConfig::short_read(),
quality_filtering_enabled: true,
index_filter: IndexFilterConfig::default(),
}
}
pub fn long_read() -> Self {
Self {
qualified_quality: 7, unqualified_percent_limit: 50.0, n_base_limit: 10, n_percent_limit: Some(10.0), min_avg_quality: None,
max_n_rate: None,
low_complexity_threshold: None, length_config: LengthConfig::long_read(),
quality_filtering_enabled: true,
index_filter: IndexFilterConfig::default(),
}
}
pub fn with_qualified_quality(mut self, quality: u8) -> Self {
self.qualified_quality = quality;
self
}
pub fn with_unqualified_percent_limit(mut self, limit: f64) -> Self {
self.unqualified_percent_limit = limit.clamp(0.0, 100.0);
self
}
pub fn with_n_base_limit(mut self, limit: usize) -> Self {
self.n_base_limit = limit;
self
}
pub fn with_n_percent_limit(mut self, percent: f64) -> Self {
self.n_percent_limit = Some(percent.clamp(0.0, 100.0));
self
}
pub fn with_min_avg_quality(mut self, quality: u8) -> Self {
self.min_avg_quality = Some(quality);
self
}
pub fn without_quality_filter(mut self) -> Self {
self.quality_filtering_enabled = false;
self.min_avg_quality = None;
self
}
pub fn with_quality_filter(mut self) -> Self {
self.quality_filtering_enabled = true;
self
}
pub fn with_max_n_rate(mut self, rate: f64) -> Self {
self.max_n_rate = Some(rate.clamp(0.0, 1.0));
self
}
pub fn with_complexity_threshold(mut self, threshold: f64) -> Self {
self.low_complexity_threshold = Some(threshold.clamp(0.0, 1.0));
self
}
pub fn without_complexity_filter(mut self) -> Self {
self.low_complexity_threshold = None;
self
}
pub fn with_length_config(mut self, config: LengthConfig) -> Self {
self.length_config = config;
self
}
pub fn with_min_length(mut self, length: usize) -> Self {
self.length_config.min_length = length;
self
}
pub fn with_max_length(mut self, length: usize) -> Self {
self.length_config.max_length = Some(length);
self
}
pub fn with_index1_filter(mut self, barcode: String, max_mismatches: usize) -> Self {
self.index_filter = self.index_filter.with_index1(barcode, max_mismatches);
self
}
pub fn with_index2_filter(mut self, barcode: String, max_mismatches: usize) -> Self {
self.index_filter = self.index_filter.with_index2(barcode, max_mismatches);
self
}
}
pub fn apply_filters(name: &[u8], seq: &[u8], qual: &[u8], config: &FilterConfig) -> FilterDecision {
if config.index_filter.is_enabled() {
if !super::index::check_index_filter(name, &config.index_filter) {
return FilterDecision::FailIndex;
}
}
if !check_length(seq.len(), config) {
return FilterDecision::FailLength;
}
if config.quality_filtering_enabled
&& !check_unqualified_percent(qual, config.qualified_quality, config.unqualified_percent_limit)
{
return FilterDecision::FailQuality;
}
if let Some(min_qual) = config.min_avg_quality {
if !check_avg_quality(qual, min_qual) {
return FilterDecision::FailQuality;
}
}
if !check_n_base_count(seq, config.n_base_limit) {
return FilterDecision::FailNRate;
}
if let Some(percent_limit) = config.n_percent_limit {
if !check_n_percent(seq, percent_limit) {
return FilterDecision::FailNRate;
}
}
if let Some(max_rate) = config.max_n_rate {
if !check_n_rate(seq, max_rate) {
return FilterDecision::FailNRate;
}
}
if let Some(threshold) = config.low_complexity_threshold {
if !check_complexity(seq, threshold) {
return FilterDecision::FailComplexity;
}
}
FilterDecision::Pass
}
#[inline]
fn check_length(len: usize, config: &FilterConfig) -> bool {
crate::trim::check_length(len, &config.length_config)
}
pub fn check_avg_quality(qual: &[u8], min: u8) -> bool {
if qual.is_empty() {
return false;
}
let sum: u64 = qual.iter().map(|&q| q.saturating_sub(33) as u64).sum();
let avg = sum / qual.len() as u64;
avg >= min as u64
}
pub fn check_n_rate(seq: &[u8], max_rate: f64) -> bool {
if seq.is_empty() {
return true;
}
let n_count = seq.iter().filter(|&&b| b == b'N' || b == b'n').count();
let rate = n_count as f64 / seq.len() as f64;
rate <= max_rate
}
#[inline]
pub fn check_unqualified_percent(qual: &[u8], qualified_quality: u8, limit: f64) -> bool {
if qual.is_empty() {
return false;
}
let threshold = qualified_quality.saturating_add(33); let low_qual_count = qual.iter().filter(|&&q| q < threshold).count();
let percent = (low_qual_count as f64 / qual.len() as f64) * 100.0;
percent <= limit
}
#[inline]
pub fn check_n_base_count(seq: &[u8], limit: usize) -> bool {
let n_count = seq.iter().filter(|&&b| b == b'N' || b == b'n').count();
n_count <= limit
}
#[inline]
pub fn check_n_percent(seq: &[u8], percent_limit: f64) -> bool {
if seq.is_empty() {
return true;
}
let n_count = seq.iter().filter(|&&b| b == b'N' || b == b'n').count();
let percent = (n_count as f64 / seq.len() as f64) * 100.0;
percent <= percent_limit
}
pub fn check_complexity(seq: &[u8], threshold: f64) -> bool {
if seq.len() < 2 {
return true; }
let mut counts = [0u32; 16];
for window in seq.windows(2) {
if let (Some(i), Some(j)) = (base_to_idx(window[0]), base_to_idx(window[1])) {
counts[i * 4 + j] += 1;
}
}
let total = (seq.len() - 1) as f64;
let mut entropy = 0.0;
for &count in &counts {
if count > 0 {
let p = count as f64 / total;
entropy -= p * p.log2();
}
}
let complexity = entropy / 4.0;
complexity >= threshold
}
#[inline]
fn base_to_idx(base: u8) -> Option<usize> {
match base {
b'A' | b'a' => Some(0),
b'C' | b'c' => Some(1),
b'G' | b'g' => Some(2),
b'T' | b't' => Some(3),
_ => None,
}
}
pub type FilterCriteria = FilterConfig;
#[cfg(test)]
mod tests {
use super::*;
fn make_qual(scores: &[u8]) -> Vec<u8> {
scores.iter().map(|&s| s + 33).collect()
}
#[test]
fn test_filter_decision_is_pass() {
assert!(FilterDecision::Pass.is_pass());
assert!(!FilterDecision::FailQuality.is_pass());
assert!(!FilterDecision::FailLength.is_pass());
assert!(!FilterDecision::FailComplexity.is_pass());
assert!(!FilterDecision::FailNRate.is_pass());
}
#[test]
fn test_filter_decision_is_fail() {
assert!(!FilterDecision::Pass.is_fail());
assert!(FilterDecision::FailQuality.is_fail());
}
#[test]
fn test_filter_config_default() {
let config = FilterConfig::default();
assert_eq!(config.qualified_quality, 15);
assert!((config.unqualified_percent_limit - 40.0).abs() < 0.001);
assert_eq!(config.n_base_limit, 5);
assert!(config.min_avg_quality.is_none());
assert!(config.max_n_rate.is_none());
assert!(config.low_complexity_threshold.is_none());
assert!(config.quality_filtering_enabled);
}
#[test]
fn test_filter_config_short_read() {
let config = FilterConfig::short_read();
assert_eq!(config.qualified_quality, 15);
assert!((config.unqualified_percent_limit - 40.0).abs() < 0.001);
assert_eq!(config.n_base_limit, 5);
assert!(config.min_avg_quality.is_none()); assert!(config.low_complexity_threshold.is_none()); assert!(config.quality_filtering_enabled);
}
#[test]
fn test_filter_config_long_read() {
let config = FilterConfig::long_read();
assert_eq!(config.qualified_quality, 7);
assert!((config.unqualified_percent_limit - 50.0).abs() < 0.001);
assert_eq!(config.n_base_limit, 10);
assert!(config.low_complexity_threshold.is_none());
assert!(config.quality_filtering_enabled);
}
#[test]
fn test_check_avg_quality_pass() {
let qual = make_qual(&[30, 30, 30, 30, 30]);
assert!(check_avg_quality(&qual, 20));
assert!(check_avg_quality(&qual, 30));
}
#[test]
fn test_check_avg_quality_fail() {
let qual = make_qual(&[10, 10, 10, 10, 10]);
assert!(!check_avg_quality(&qual, 20));
}
#[test]
fn test_check_avg_quality_empty() {
assert!(!check_avg_quality(&[], 10));
}
#[test]
fn test_check_n_rate_pass() {
let seq = b"ACGTACGTACGT";
assert!(check_n_rate(seq, 0.05));
}
#[test]
fn test_check_n_rate_fail() {
let seq = b"ACGTNNNNNNNN";
assert!(!check_n_rate(seq, 0.05));
}
#[test]
fn test_check_n_rate_boundary() {
let seq = b"ACGTACGTNN"; assert!(!check_n_rate(seq, 0.1));
assert!(check_n_rate(seq, 0.2));
assert!(check_n_rate(seq, 0.3));
}
#[test]
fn test_check_n_rate_empty() {
assert!(check_n_rate(&[], 0.05));
}
#[test]
fn test_check_complexity_pass() {
let seq = b"ACGTACGTACGTACGT";
assert!(check_complexity(seq, 0.3));
}
#[test]
fn test_check_complexity_fail() {
let seq = b"AAAAAAAAAAAAAAAA";
assert!(!check_complexity(seq, 0.3));
}
#[test]
fn test_check_complexity_short() {
assert!(check_complexity(b"A", 0.5));
assert!(check_complexity(b"", 0.5));
}
#[test]
fn test_apply_filters_pass() {
let name = b"@test_read";
let seq = b"ACGTACGTACGTACGTACGTACGTACGTACGT"; let qual = make_qual(&[30; 32]);
let config = FilterConfig::short_read();
let decision = apply_filters(name, seq, &qual, &config);
assert!(decision.is_pass());
}
#[test]
fn test_apply_filters_fail_length() {
let name = b"@test_read";
let seq = b"ACGT"; let qual = make_qual(&[30; 4]);
let config = FilterConfig::short_read();
let decision = apply_filters(name, seq, &qual, &config);
assert_eq!(decision, FilterDecision::FailLength);
}
#[test]
fn test_apply_filters_fail_quality() {
let name = b"@test_read";
let seq = b"ACGTACGTACGTACGTACGTACGTACGTACGT";
let qual = make_qual(&[5; 32]); let config = FilterConfig::short_read();
let decision = apply_filters(name, seq, &qual, &config);
assert_eq!(decision, FilterDecision::FailQuality);
}
#[test]
fn test_apply_filters_fail_n_rate() {
let name = b"@test_read";
let seq = b"NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"; let qual = make_qual(&[30; 32]);
let config = FilterConfig::short_read();
let decision = apply_filters(name, seq, &qual, &config);
assert_eq!(decision, FilterDecision::FailNRate);
}
#[test]
fn test_apply_filters_fail_complexity() {
let name = b"@test_read";
let seq = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; let qual = make_qual(&[30; 40]);
let config = FilterConfig::short_read().with_complexity_threshold(0.3);
let decision = apply_filters(name, seq, &qual, &config);
assert_eq!(decision, FilterDecision::FailComplexity);
}
#[test]
fn test_config_builder() {
let config = FilterConfig::new()
.with_qualified_quality(20)
.with_unqualified_percent_limit(30.0)
.with_n_base_limit(10)
.with_min_avg_quality(20)
.with_max_n_rate(0.1)
.with_complexity_threshold(0.5)
.with_min_length(50)
.with_max_length(500);
assert_eq!(config.qualified_quality, 20);
assert!((config.unqualified_percent_limit - 30.0).abs() < 0.001);
assert_eq!(config.n_base_limit, 10);
assert_eq!(config.min_avg_quality, Some(20));
assert_eq!(config.max_n_rate, Some(0.1));
assert_eq!(config.low_complexity_threshold, Some(0.5));
assert_eq!(config.length_config.min_length, 50);
assert_eq!(config.length_config.max_length, Some(500));
}
#[test]
fn test_config_disable_filters() {
let config = FilterConfig::short_read()
.without_quality_filter()
.without_complexity_filter();
assert!(!config.quality_filtering_enabled);
assert!(config.min_avg_quality.is_none());
assert!(config.low_complexity_threshold.is_none());
}
#[test]
fn test_n_rate_clamping() {
let config = FilterConfig::new().with_max_n_rate(1.5);
assert_eq!(config.max_n_rate, Some(1.0));
let config = FilterConfig::new().with_max_n_rate(-0.5);
assert_eq!(config.max_n_rate, Some(0.0));
}
#[test]
fn test_check_unqualified_percent_pass() {
let qual = make_qual(&[30, 30, 30, 30, 30]);
assert!(check_unqualified_percent(&qual, 15, 40.0));
}
#[test]
fn test_check_unqualified_percent_fail() {
let qual = make_qual(&[10, 10, 10, 10, 10]);
assert!(!check_unqualified_percent(&qual, 15, 40.0));
}
#[test]
fn test_check_unqualified_percent_boundary() {
let qual = make_qual(&[10, 10, 30, 30, 30]);
assert!(check_unqualified_percent(&qual, 15, 40.0));
assert!(!check_unqualified_percent(&qual, 15, 39.0)); }
#[test]
fn test_check_unqualified_percent_empty() {
assert!(!check_unqualified_percent(&[], 15, 40.0));
}
#[test]
fn test_check_n_base_count_pass() {
let seq = b"ACGTACGTNN"; assert!(check_n_base_count(seq, 5));
assert!(check_n_base_count(seq, 2)); }
#[test]
fn test_check_n_base_count_fail() {
let seq = b"ACGTNNNNNNN"; assert!(!check_n_base_count(seq, 5));
}
#[test]
fn test_check_n_base_count_empty() {
assert!(check_n_base_count(b"", 5));
}
#[test]
fn test_fastp_style_filtering() {
let name = b"@test_read";
let seq = b"ACGTACGTACGTACGTACGTACGTACGTACGT";
let good_qual = make_qual(&[30; 32]);
let config = FilterConfig::short_read();
assert!(apply_filters(name, seq, &good_qual, &config).is_pass());
let mut mixed_qual = vec![30u8 + 33; 16]; mixed_qual.extend(vec![10u8 + 33; 16]); assert_eq!(
apply_filters(name, seq, &mixed_qual, &config),
FilterDecision::FailQuality
);
}
#[test]
fn test_quality_filtering_disabled() {
let name = b"@test_read";
let seq = b"ACGTACGTACGTACGTACGTACGTACGTACGT";
let bad_qual = make_qual(&[5; 32]);
let config = FilterConfig::short_read().without_quality_filter();
assert!(apply_filters(name, seq, &bad_qual, &config).is_pass());
}
}