fastars 0.1.0

Ultra-fast QC and trimming for short and long reads
Documentation
//! Index barcode filtering for Illumina reads.
//!
//! This module provides filtering based on index barcodes in Illumina header format.
//! Illumina headers typically contain index information in the format:
//! `@INSTRUMENT:RUN:FLOWCELL:LANE:TILE:X:Y INDEX1+INDEX2`
//!
//! Example:
//! `@SIM:1:FCX:1:1:1:1 1:N:0:ATCACG+TTAGGC`
//!                         ^^^^^^ ^^^^^^
//!                         index1  index2

/// Configuration for index barcode filtering.
#[derive(Debug, Clone, PartialEq)]
pub struct IndexFilterConfig {
    /// Index 1 barcode to match (None = no filtering on index1)
    pub index1: Option<String>,
    /// Index 2 barcode to match (None = no filtering on index2)
    pub index2: Option<String>,
    /// Maximum number of mismatches allowed
    pub max_mismatches: usize,
}

impl IndexFilterConfig {
    /// Create a new disabled index filter configuration.
    pub fn new() -> Self {
        Self {
            index1: None,
            index2: None,
            max_mismatches: 0,
        }
    }

    /// Set index 1 filter.
    pub fn with_index1(mut self, barcode: String, max_mismatches: usize) -> Self {
        self.index1 = Some(barcode);
        self.max_mismatches = max_mismatches;
        self
    }

    /// Set index 2 filter.
    pub fn with_index2(mut self, barcode: String, max_mismatches: usize) -> Self {
        self.index2 = Some(barcode);
        self.max_mismatches = max_mismatches;
        self
    }

    /// Check if any index filtering is enabled.
    pub fn is_enabled(&self) -> bool {
        self.index1.is_some() || self.index2.is_some()
    }
}

impl Default for IndexFilterConfig {
    fn default() -> Self {
        Self::new()
    }
}

/// Parse index barcodes from Illumina read name.
///
/// Illumina headers have the format:
/// `@INSTRUMENT:RUN:FLOWCELL:LANE:TILE:X:Y INDEX1+INDEX2`
///
/// Returns (index1, index2) if found, or (None, None) otherwise.
pub fn parse_index_from_name(name: &[u8]) -> (Option<Vec<u8>>, Option<Vec<u8>>) {
    // Find the last space or tab in the name (separator before index info)
    let name_str = match std::str::from_utf8(name) {
        Ok(s) => s,
        Err(_) => return (None, None),
    };

    // Find the last space or tab
    let index_part = match name_str.split_whitespace().nth(1) {
        Some(part) => part,
        None => return (None, None),
    };

    // The index info is typically after the last colon
    // Format: "1:N:0:ATCACG+TTAGGC"
    let parts: Vec<&str> = index_part.split(':').collect();
    if parts.len() < 4 {
        return (None, None);
    }

    // Last part should be the index sequence(s)
    let index_seq = parts[parts.len() - 1];

    // Check if there are two indices separated by '+'
    if index_seq.contains('+') {
        let indices: Vec<&str> = index_seq.split('+').collect();
        if indices.len() == 2 {
            let index1 = Some(indices[0].as_bytes().to_vec());
            let index2 = Some(indices[1].as_bytes().to_vec());
            return (index1, index2);
        }
    } else if !index_seq.is_empty() {
        // Single index
        return (Some(index_seq.as_bytes().to_vec()), None);
    }

    (None, None)
}

/// Calculate the number of mismatches between two sequences.
///
/// Returns usize::MAX if sequences have different lengths.
fn count_mismatches(seq1: &[u8], seq2: &[u8]) -> usize {
    if seq1.len() != seq2.len() {
        return usize::MAX;
    }

    seq1.iter()
        .zip(seq2.iter())
        .filter(|(a, b)| a != b)
        .count()
}

/// Check if a read passes the index barcode filter.
///
/// Returns true if:
/// - No index filtering is configured (pass-through)
/// - The read's index barcodes match the configured filters within the mismatch threshold
///
/// Returns false if:
/// - Index filtering is configured but the read name doesn't contain index info
/// - The index barcodes don't match within the mismatch threshold
pub fn check_index_filter(name: &[u8], config: &IndexFilterConfig) -> bool {
    // If no filtering is configured, pass all reads
    if !config.is_enabled() {
        return true;
    }

    // Parse indices from read name
    let (read_index1, read_index2) = parse_index_from_name(name);

    // Check index1 if configured
    if let Some(ref expected_index1) = config.index1 {
        match read_index1 {
            Some(ref idx1) => {
                let mismatches = count_mismatches(idx1, expected_index1.as_bytes());
                if mismatches > config.max_mismatches {
                    return false;
                }
            }
            None => {
                // Expected index1 but not found in read name
                return false;
            }
        }
    }

    // Check index2 if configured
    if let Some(ref expected_index2) = config.index2 {
        match read_index2 {
            Some(ref idx2) => {
                let mismatches = count_mismatches(idx2, expected_index2.as_bytes());
                if mismatches > config.max_mismatches {
                    return false;
                }
            }
            None => {
                // Expected index2 but not found in read name
                return false;
            }
        }
    }

    true
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_index_dual() {
        let name = b"@SIM:1:FCX:1:1:1:1 1:N:0:ATCACG+TTAGGC";
        let (idx1, idx2) = parse_index_from_name(name);
        assert_eq!(idx1, Some(b"ATCACG".to_vec()));
        assert_eq!(idx2, Some(b"TTAGGC".to_vec()));
    }

    #[test]
    fn test_parse_index_single() {
        let name = b"@SIM:1:FCX:1:1:1:1 1:N:0:ATCACG";
        let (idx1, idx2) = parse_index_from_name(name);
        assert_eq!(idx1, Some(b"ATCACG".to_vec()));
        assert_eq!(idx2, None);
    }

    #[test]
    fn test_parse_index_none() {
        let name = b"@SIM:1:FCX:1:1:1:1";
        let (idx1, idx2) = parse_index_from_name(name);
        assert_eq!(idx1, None);
        assert_eq!(idx2, None);
    }

    #[test]
    fn test_count_mismatches_perfect() {
        let seq1 = b"ATCACG";
        let seq2 = b"ATCACG";
        assert_eq!(count_mismatches(seq1, seq2), 0);
    }

    #[test]
    fn test_count_mismatches_one() {
        let seq1 = b"ATCACG";
        let seq2 = b"ATCACA";
        assert_eq!(count_mismatches(seq1, seq2), 1);
    }

    #[test]
    fn test_count_mismatches_length_diff() {
        let seq1 = b"ATCACG";
        let seq2 = b"ATCA";
        assert_eq!(count_mismatches(seq1, seq2), usize::MAX);
    }

    #[test]
    fn test_check_index_filter_disabled() {
        let config = IndexFilterConfig::new();
        let name = b"@SIM:1:FCX:1:1:1:1 1:N:0:ATCACG+TTAGGC";
        assert!(check_index_filter(name, &config));
    }

    #[test]
    fn test_check_index_filter_index1_exact() {
        let config = IndexFilterConfig::new()
            .with_index1("ATCACG".to_string(), 0);
        let name = b"@SIM:1:FCX:1:1:1:1 1:N:0:ATCACG+TTAGGC";
        assert!(check_index_filter(name, &config));
    }

    #[test]
    fn test_check_index_filter_index1_mismatch() {
        let config = IndexFilterConfig::new()
            .with_index1("ATCACG".to_string(), 0);
        let name = b"@SIM:1:FCX:1:1:1:1 1:N:0:ATCACA+TTAGGC";
        assert!(!check_index_filter(name, &config));
    }

    #[test]
    fn test_check_index_filter_index1_threshold() {
        let config = IndexFilterConfig::new()
            .with_index1("ATCACG".to_string(), 1);
        let name = b"@SIM:1:FCX:1:1:1:1 1:N:0:ATCACA+TTAGGC";
        assert!(check_index_filter(name, &config));
    }

    #[test]
    fn test_check_index_filter_index2_exact() {
        let config = IndexFilterConfig::new()
            .with_index2("TTAGGC".to_string(), 0);
        let name = b"@SIM:1:FCX:1:1:1:1 1:N:0:ATCACG+TTAGGC";
        assert!(check_index_filter(name, &config));
    }

    #[test]
    fn test_check_index_filter_dual() {
        let config = IndexFilterConfig::new()
            .with_index1("ATCACG".to_string(), 0)
            .with_index2("TTAGGC".to_string(), 0);
        let name = b"@SIM:1:FCX:1:1:1:1 1:N:0:ATCACG+TTAGGC";
        assert!(check_index_filter(name, &config));
    }

    #[test]
    fn test_check_index_filter_dual_mismatch_index2() {
        let config = IndexFilterConfig::new()
            .with_index1("ATCACG".to_string(), 0)
            .with_index2("TTAGGC".to_string(), 0);
        let name = b"@SIM:1:FCX:1:1:1:1 1:N:0:ATCACG+TTAGGA";
        assert!(!check_index_filter(name, &config));
    }

    #[test]
    fn test_check_index_filter_missing_index() {
        let config = IndexFilterConfig::new()
            .with_index1("ATCACG".to_string(), 0);
        let name = b"@SIM:1:FCX:1:1:1:1";
        assert!(!check_index_filter(name, &config));
    }
}