/// Settings for filtering by the index (barcode) sequences found in a read name.
///
/// The filter counts as enabled once at least one expected index is set.
/// A single mismatch budget is stored for the whole configuration; it is
/// applied to each configured index separately rather than to their sum.
#[derive(Debug, Clone, PartialEq)]
pub struct IndexFilterConfig {
    /// Expected first index sequence; `None` leaves the first index unchecked.
    pub index1: Option<String>,
    /// Expected second index sequence; `None` leaves the second index unchecked.
    pub index2: Option<String>,
    /// Largest number of mismatching positions tolerated per index.
    pub max_mismatches: usize,
}

impl IndexFilterConfig {
    /// Creates a disabled filter: no expected indices and a budget of zero.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the expected first index.
    ///
    /// `max_mismatches` replaces the stored budget, so when both builder
    /// methods are chained the value passed to the last call wins.
    pub fn with_index1(self, barcode: String, max_mismatches: usize) -> Self {
        Self {
            index1: Some(barcode),
            max_mismatches,
            ..self
        }
    }

    /// Sets the expected second index.
    ///
    /// `max_mismatches` replaces the stored budget, so when both builder
    /// methods are chained the value passed to the last call wins.
    pub fn with_index2(self, barcode: String, max_mismatches: usize) -> Self {
        Self {
            index2: Some(barcode),
            max_mismatches,
            ..self
        }
    }

    /// Returns `true` when at least one expected index has been configured.
    pub fn is_enabled(&self) -> bool {
        !(self.index1.is_none() && self.index2.is_none())
    }
}

impl Default for IndexFilterConfig {
    /// Produces the same disabled configuration as [`IndexFilterConfig::new`].
    fn default() -> Self {
        Self {
            index1: None,
            index2: None,
            max_mismatches: 0,
        }
    }
}
/// Extracts the index (barcode) sequences from a read name.
///
/// The name is split on whitespace and the second token is inspected. That
/// token must have at least four `:`-separated fields; the last field is
/// taken as the barcode field, e.g. `ATCACG+TTAGGC` in
/// `@SIM:1:FCX:1:1:1:1 1:N:0:ATCACG+TTAGGC`.
///
/// Returns `(index1, index2)`:
/// * `BARCODE` yields `(Some(BARCODE), None)`;
/// * `FIRST+SECOND` yields `(Some(FIRST), Some(SECOND))`;
/// * an empty barcode, or an empty half next to `+`, is reported as `None`
///   rather than as `Some` of an empty vector, so "present" always means
///   "non-empty";
/// * anything else (invalid UTF-8, no second token, fewer than four fields,
///   more than one `+`) yields `(None, None)`.
pub fn parse_index_from_name(name: &[u8]) -> (Option<Vec<u8>>, Option<Vec<u8>>) {
    // Locates the barcode field without building intermediate vectors.
    fn barcode_field(name: &[u8]) -> Option<&str> {
        let header = std::str::from_utf8(name).ok()?;
        let comment = header.split_whitespace().nth(1)?;
        let mut fields = comment.rsplit(':');
        let last = fields.next()?;
        // `last` plus at least three preceding fields makes the required four.
        if fields.count() < 3 {
            return None;
        }
        Some(last)
    }

    // Converts a barcode slice to owned bytes, treating an empty one as absent.
    fn to_index(barcode: &str) -> Option<Vec<u8>> {
        if barcode.is_empty() {
            None
        } else {
            Some(barcode.as_bytes().to_vec())
        }
    }

    let barcodes = match barcode_field(name) {
        Some(field) => field,
        None => return (None, None),
    };

    // Pull at most three pieces: a third piece means more than one `+`.
    let mut halves = barcodes.split('+');
    match (halves.next(), halves.next(), halves.next()) {
        (Some(only), None, _) => (to_index(only), None),
        (Some(first), Some(second), None) => (to_index(first), to_index(second)),
        _ => (None, None),
    }
}
/// Counts the positions at which two equal-length sequences differ.
///
/// Sequences of different lengths are not compared position by position;
/// `usize::MAX` is returned instead, so they exceed any smaller mismatch
/// limit a caller compares against.
fn count_mismatches(seq1: &[u8], seq2: &[u8]) -> usize {
    if seq1.len() == seq2.len() {
        let mut differing = 0;
        for (left, right) in seq1.iter().zip(seq2) {
            if left != right {
                differing += 1;
            }
        }
        differing
    } else {
        usize::MAX
    }
}
pub fn check_index_filter(name: &[u8], config: &IndexFilterConfig) -> bool {
if !config.is_enabled() {
return true;
}
let (read_index1, read_index2) = parse_index_from_name(name);
if let Some(ref expected_index1) = config.index1 {
match read_index1 {
Some(ref idx1) => {
let mismatches = count_mismatches(idx1, expected_index1.as_bytes());
if mismatches > config.max_mismatches {
return false;
}
}
None => {
return false;
}
}
}
if let Some(ref expected_index2) = config.index2 {
match read_index2 {
Some(ref idx2) => {
let mismatches = count_mismatches(idx2, expected_index2.as_bytes());
if mismatches > config.max_mismatches {
return false;
}
}
None => {
return false;
}
}
}
true
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Dual-indexed read name whose barcodes are `ATCACG` and `TTAGGC`.
    const DUAL_HEADER: &[u8] = b"@SIM:1:FCX:1:1:1:1 1:N:0:ATCACG+TTAGGC";
    /// Read name with no second token, hence no index information.
    const BARE_HEADER: &[u8] = b"@SIM:1:FCX:1:1:1:1";
    /// Dual-indexed read name whose first barcode has one substituted base.
    const INDEX1_OFF_BY_ONE: &[u8] = b"@SIM:1:FCX:1:1:1:1 1:N:0:ATCACA+TTAGGC";

    /// Filter expecting `ATCACG` as the first index.
    fn index1_filter(max_mismatches: usize) -> IndexFilterConfig {
        IndexFilterConfig::new().with_index1(String::from("ATCACG"), max_mismatches)
    }

    /// Filter expecting `ATCACG` then `TTAGGC`, with no mismatches allowed.
    fn dual_filter() -> IndexFilterConfig {
        index1_filter(0).with_index2(String::from("TTAGGC"), 0)
    }

    #[test]
    fn test_parse_index_dual() {
        let parsed = parse_index_from_name(DUAL_HEADER);
        assert_eq!(
            parsed,
            (Some(b"ATCACG".to_vec()), Some(b"TTAGGC".to_vec()))
        );
    }

    #[test]
    fn test_parse_index_single() {
        let parsed = parse_index_from_name(b"@SIM:1:FCX:1:1:1:1 1:N:0:ATCACG");
        assert_eq!(parsed, (Some(b"ATCACG".to_vec()), None));
    }

    #[test]
    fn test_parse_index_none() {
        let parsed = parse_index_from_name(BARE_HEADER);
        assert_eq!(parsed, (None, None));
    }

    #[test]
    fn test_count_mismatches_perfect() {
        let distance = count_mismatches(b"ATCACG", b"ATCACG");
        assert_eq!(distance, 0);
    }

    #[test]
    fn test_count_mismatches_one() {
        let distance = count_mismatches(b"ATCACG", b"ATCACA");
        assert_eq!(distance, 1);
    }

    #[test]
    fn test_count_mismatches_length_diff() {
        let distance = count_mismatches(b"ATCACG", b"ATCA");
        assert_eq!(distance, usize::MAX);
    }

    #[test]
    fn test_check_index_filter_disabled() {
        let accepted = check_index_filter(DUAL_HEADER, &IndexFilterConfig::new());
        assert!(accepted);
    }

    #[test]
    fn test_check_index_filter_index1_exact() {
        let accepted = check_index_filter(DUAL_HEADER, &index1_filter(0));
        assert!(accepted);
    }

    #[test]
    fn test_check_index_filter_index1_mismatch() {
        let accepted = check_index_filter(INDEX1_OFF_BY_ONE, &index1_filter(0));
        assert!(!accepted);
    }

    #[test]
    fn test_check_index_filter_index1_threshold() {
        let accepted = check_index_filter(INDEX1_OFF_BY_ONE, &index1_filter(1));
        assert!(accepted);
    }

    #[test]
    fn test_check_index_filter_index2_exact() {
        let filter = IndexFilterConfig::new().with_index2(String::from("TTAGGC"), 0);
        let accepted = check_index_filter(DUAL_HEADER, &filter);
        assert!(accepted);
    }

    #[test]
    fn test_check_index_filter_dual() {
        let accepted = check_index_filter(DUAL_HEADER, &dual_filter());
        assert!(accepted);
    }

    #[test]
    fn test_check_index_filter_dual_mismatch_index2() {
        let header = b"@SIM:1:FCX:1:1:1:1 1:N:0:ATCACG+TTAGGA";
        let accepted = check_index_filter(header, &dual_filter());
        assert!(!accepted);
    }

    #[test]
    fn test_check_index_filter_missing_index() {
        let accepted = check_index_filter(BARE_HEADER, &index1_filter(0));
        assert!(!accepted);
    }
}