// fastars 0.1.0 - Docs.rs

//! UMI (Unique Molecular Identifier) processing module.
//!
//! This module provides UMI extraction and processing functionality matching
//! fastp's UMI handling capabilities. UMIs are short sequences used to identify
//! unique molecules in sequencing libraries, enabling PCR duplicate detection.
//!
//! ## Supported UMI Locations
//!
//! - `Read1`: Extract from the beginning of read 1
//! - `Read2`: Extract from the beginning of read 2
//! - `Index`: Extract from a separate index read
//! - `PerIndex`: Extract per-index UMIs from index reads
//!
//! ## Example
//!
//! ```no_run
//! use fastars::umi::{UmiConfig, UmiProcessor, UmiLocation};
//!
//! // Configure UMI extraction from read 1
//! let config = UmiConfig::new()
//!     .with_location(UmiLocation::Read1)
//!     .with_length(8)
//!     .with_prefix("UMI")
//!     .with_separator("_");
//!
//! let processor = UmiProcessor::new(config);
//!
//! // Extract UMI from a read
//! let seq = b"ACGTACGTNNNNNNNN";
//! if let Some(umi) = processor.extract_umi(seq, None) {
//!     println!("Extracted UMI: {}", String::from_utf8_lossy(&umi));
//! }
//! ```

/// Location from which to extract the UMI.
///
/// The textual names of the variants are `read1`, `read2`, `index` and
/// `per_index`; see `from_str` and `as_str` below.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum UmiLocation {
    /// Extract UMI from the beginning of read 1.
    #[default]
    Read1,
    /// Extract UMI from the beginning of read 2.
    Read2,
    /// Extract UMI from a separate index read.
    Index,
    /// Extract per-index UMIs from index reads.
    PerIndex,
}

impl UmiLocation {
    /// Parse a UMI location from a string.
    ///
    /// Matching ignores ASCII case. Accepts: "read1", "read2", "index",
    /// "per_index" (and the alias "perindex").
    ///
    /// # Returns
    /// The matching location, or `None` if `s` is not one of the accepted names.
    // Kept as an inherent method returning `Option` for backward compatibility;
    // the standard `FromStr` trait is implemented separately below.
    #[allow(clippy::should_implement_trait)]
    pub fn from_str(s: &str) -> Option<Self> {
        // Every accepted spelling paired with the variant it selects.
        const SPELLINGS: [(&str, UmiLocation); 5] = [
            ("read1", UmiLocation::Read1),
            ("read2", UmiLocation::Read2),
            ("index", UmiLocation::Index),
            ("per_index", UmiLocation::PerIndex),
            ("perindex", UmiLocation::PerIndex),
        ];

        // `eq_ignore_ascii_case` compares in place, so no lowercase copy of `s`
        // has to be allocated just to perform the lookup.
        SPELLINGS
            .iter()
            .find(|(name, _)| s.eq_ignore_ascii_case(name))
            .map(|&(_, location)| location)
    }

    /// Convert to string representation.
    ///
    /// The returned name is accepted by `from_str`, so the two round-trip.
    pub fn as_str(&self) -> &'static str {
        match self {
            UmiLocation::Read1 => "read1",
            UmiLocation::Read2 => "read2",
            UmiLocation::Index => "index",
            UmiLocation::PerIndex => "per_index",
        }
    }
}

/// Standard-library parsing support, so `"read1".parse::<UmiLocation>()` works.
///
/// Accepts exactly the same spellings as the inherent `UmiLocation::from_str`.
impl std::str::FromStr for UmiLocation {
    /// Human-readable message naming the rejected input and the valid options.
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Inherent associated functions take precedence over trait ones, so this
        // calls the `Option`-returning parser above rather than recursing.
        UmiLocation::from_str(s).ok_or_else(|| {
            format!(
                "Invalid UMI location '{}'. Valid options: read1, read2, index, per_index",
                s
            )
        })
    }
}

/// Configuration for UMI processing.
///
/// Start from [`UmiConfig::new`] (or [`UmiConfig::default`]) and chain the
/// `with_*` builder methods. A configuration only takes effect when it is both
/// flagged as enabled and has a non-zero UMI length; see [`UmiConfig::is_enabled`].
#[derive(Debug, Clone)]
pub struct UmiConfig {
    /// Whether UMI processing is enabled.
    pub enabled: bool,
    /// Location from which to extract the UMI.
    pub location: UmiLocation,
    /// Length of the UMI sequence.
    pub length: usize,
    /// Prefix to add before the UMI in the read name (default: empty).
    pub prefix: String,
    /// Number of bases to skip before the UMI.
    pub skip: usize,
    /// Separator between the original read name and the UMI (default: ":").
    pub separator: String,
}

impl Default for UmiConfig {
    /// Disabled configuration: read-1 location, zero length, zero skip,
    /// empty prefix and `":"` as separator.
    fn default() -> Self {
        Self {
            enabled: false,
            location: UmiLocation::Read1,
            length: 0,
            prefix: String::new(),
            skip: 0,
            separator: ":".to_string(),
        }
    }
}

impl UmiConfig {
    /// Create a new UmiConfig with default settings.
    pub fn new() -> Self {
        Self::default()
    }

    /// Create a disabled UmiConfig.
    ///
    /// Currently identical to the default configuration.
    pub fn disabled() -> Self {
        Self::default()
    }

    /// Enable UMI processing.
    ///
    /// Note that [`UmiConfig::is_enabled`] additionally requires a non-zero length.
    pub fn enabled(mut self) -> Self {
        self.enabled = true;
        self
    }

    /// Set the UMI location.
    pub fn with_location(mut self, location: UmiLocation) -> Self {
        self.location = location;
        self
    }

    /// Set the UMI length.
    ///
    /// This also overwrites the `enabled` flag: a non-zero length enables UMI
    /// processing and a zero length disables it.
    pub fn with_length(mut self, length: usize) -> Self {
        self.length = length;
        self.enabled = length > 0;
        self
    }

    /// Set the UMI prefix.
    pub fn with_prefix(mut self, prefix: impl Into<String>) -> Self {
        self.prefix = prefix.into();
        self
    }

    /// Set the number of bases to skip before the UMI.
    pub fn with_skip(mut self, skip: usize) -> Self {
        self.skip = skip;
        self
    }

    /// Set the separator between read name and UMI.
    pub fn with_separator(mut self, separator: impl Into<String>) -> Self {
        self.separator = separator.into();
        self
    }

    /// Check if UMI processing should be applied.
    ///
    /// True only when the `enabled` flag is set and `length` is non-zero.
    #[inline]
    pub fn is_enabled(&self) -> bool {
        self.enabled && self.length > 0
    }

    /// Get the total number of bases consumed by UMI (skip + length).
    ///
    /// The sum saturates at `usize::MAX` instead of overflowing, so extreme
    /// `skip`/`length` values yield an offset larger than any real read rather
    /// than panicking (debug builds) or wrapping to a small offset (release builds).
    #[inline]
    pub fn total_consumed(&self) -> usize {
        self.skip.saturating_add(self.length)
    }
}

/// UMI processor that extracts and handles UMI sequences.
///
/// Wraps a [`UmiConfig`] and applies it to read data. Names, sequences and
/// quality strings are all handled as byte slices, and every method returns
/// freshly allocated vectors rather than modifying its inputs.
#[derive(Debug, Clone)]
pub struct UmiProcessor {
    config: UmiConfig,
}

impl UmiProcessor {
    /// Create a new UmiProcessor with the given configuration.
    pub fn new(config: UmiConfig) -> Self {
        Self { config }
    }

    /// Get the configuration.
    #[inline]
    pub fn config(&self) -> &UmiConfig {
        &self.config
    }

    /// Check if UMI processing is enabled.
    #[inline]
    pub fn is_enabled(&self) -> bool {
        self.config.is_enabled()
    }

    /// Extract UMI from a sequence based on the configured location.
    ///
    /// # Arguments
    /// * `seq` - The sequence to extract UMI from (for Read1/Read2 locations)
    /// * `index_seq` - Optional index sequence (for Index/PerIndex locations)
    ///
    /// # Returns
    /// The extracted UMI sequence, or `None` if processing is disabled, the
    /// required index sequence is missing, or the source is shorter than
    /// `skip + length`.
    pub fn extract_umi(&self, seq: &[u8], index_seq: Option<&[u8]>) -> Option<Vec<u8>> {
        if !self.config.is_enabled() {
            return None;
        }

        let source = match self.config.location {
            UmiLocation::Read1 | UmiLocation::Read2 => seq,
            UmiLocation::Index | UmiLocation::PerIndex => index_seq?,
        };

        self.extract_from_sequence(source)
    }

    /// Extract UMI from a specific sequence.
    ///
    /// Applies skip and length to extract the UMI portion. Returns `None` when
    /// `skip + length` overflows `usize` or reaches past the end of `seq`.
    #[inline]
    fn extract_from_sequence(&self, seq: &[u8]) -> Option<Vec<u8>> {
        let start = self.config.skip;
        // Checked arithmetic: an overflowing sum can never fit inside a slice,
        // and must not wrap around into a bogus in-range end index.
        let end = start.checked_add(self.config.length)?;

        seq.get(start..end).map(|umi| umi.to_vec())
    }

    /// Copy `seq` and `qual` with the UMI region (skip + length) removed from the front.
    ///
    /// Returns `None` when either slice is shorter than the consumed region, or
    /// when `skip + length` overflows `usize`.
    fn trim_umi(&self, seq: &[u8], qual: &[u8]) -> Option<(Vec<u8>, Vec<u8>)> {
        let consumed = self.config.skip.checked_add(self.config.length)?;

        Some((seq.get(consumed..)?.to_vec(), qual.get(consumed..)?.to_vec()))
    }

    /// Add the extracted UMI to a read name.
    ///
    /// Format: `original_name{separator}{prefix}{umi}`
    ///
    /// # Arguments
    /// * `name` - The original read name
    /// * `umi` - The UMI sequence to add
    ///
    /// # Returns
    /// The new read name with UMI appended.
    pub fn add_umi_to_name(&self, name: &[u8], umi: &[u8]) -> Vec<u8> {
        let separator = self.config.separator.as_bytes();
        let prefix = self.config.prefix.as_bytes();

        // Reserve the exact final size up front so the appends never reallocate.
        let mut new_name =
            Vec::with_capacity(name.len() + separator.len() + prefix.len() + umi.len());

        new_name.extend_from_slice(name);
        new_name.extend_from_slice(separator);
        new_name.extend_from_slice(prefix);
        new_name.extend_from_slice(umi);

        new_name
    }

    /// Process a single read, extracting UMI and producing the modified read.
    ///
    /// For Read1/Read2 locations, this:
    /// 1. Extracts the UMI from the sequence
    /// 2. Adds the UMI to the read name
    /// 3. Trims the UMI (skip + length) from the sequence and quality
    ///
    /// For Index/PerIndex locations the UMI comes from `index_seq` and the read
    /// itself is returned untrimmed. When processing is disabled the inputs are
    /// returned unchanged.
    ///
    /// # Arguments
    /// * `name` - The read name
    /// * `seq` - The read sequence
    /// * `qual` - The quality scores
    /// * `index_seq` - Optional index sequence for Index/PerIndex locations
    ///
    /// # Returns
    /// `Some((new_name, new_seq, new_qual))` if successful, `None` if UMI extraction
    /// failed or the sequence/quality is too short to trim.
    pub fn process_read(
        &self,
        name: &[u8],
        seq: &[u8],
        qual: &[u8],
        index_seq: Option<&[u8]>,
    ) -> Option<(Vec<u8>, Vec<u8>, Vec<u8>)> {
        if !self.config.is_enabled() {
            return Some((name.to_vec(), seq.to_vec(), qual.to_vec()));
        }

        let umi = self.extract_umi(seq, index_seq)?;

        // Only reads that carried the UMI themselves lose bases.
        let (new_seq, new_qual) = match self.config.location {
            UmiLocation::Read1 | UmiLocation::Read2 => self.trim_umi(seq, qual)?,
            UmiLocation::Index | UmiLocation::PerIndex => (seq.to_vec(), qual.to_vec()),
        };

        // Build the name last so rejected reads do not pay for the allocation.
        Some((self.add_umi_to_name(name, &umi), new_seq, new_qual))
    }

    /// Process a paired-end read pair.
    ///
    /// Handles UMI extraction based on the configured location:
    /// - Read1: Extract from R1, trim R1
    /// - Read2: Extract from R2, trim R2
    /// - Index/PerIndex: Extract from index, no trimming of reads
    ///
    /// In every case the same UMI is appended to both read names. When
    /// processing is disabled both reads are returned unchanged.
    ///
    /// # Arguments
    /// * `r1_name`, `r1_seq`, `r1_qual` - Read 1 data
    /// * `r2_name`, `r2_seq`, `r2_qual` - Read 2 data
    /// * `index_seq` - Optional index sequence
    ///
    /// # Returns
    /// Tuple of ((r1_name, r1_seq, r1_qual), (r2_name, r2_seq, r2_qual)) if successful,
    /// `None` if the UMI could not be extracted or the UMI-bearing read is too
    /// short to trim.
    // The nested tuple return type is part of the public signature and is kept as is.
    #[allow(clippy::type_complexity)]
    pub fn process_paired_reads(
        &self,
        r1_name: &[u8],
        r1_seq: &[u8],
        r1_qual: &[u8],
        r2_name: &[u8],
        r2_seq: &[u8],
        r2_qual: &[u8],
        index_seq: Option<&[u8]>,
    ) -> Option<((Vec<u8>, Vec<u8>, Vec<u8>), (Vec<u8>, Vec<u8>, Vec<u8>))> {
        if !self.config.is_enabled() {
            return Some((
                (r1_name.to_vec(), r1_seq.to_vec(), r1_qual.to_vec()),
                (r2_name.to_vec(), r2_seq.to_vec(), r2_qual.to_vec()),
            ));
        }

        // Decide where the UMI comes from and which mate (if any) gets trimmed.
        let (umi, (new_r1_seq, new_r1_qual), (new_r2_seq, new_r2_qual)) =
            match self.config.location {
                UmiLocation::Read1 => (
                    self.extract_from_sequence(r1_seq)?,
                    self.trim_umi(r1_seq, r1_qual)?,
                    (r2_seq.to_vec(), r2_qual.to_vec()),
                ),
                UmiLocation::Read2 => (
                    self.extract_from_sequence(r2_seq)?,
                    (r1_seq.to_vec(), r1_qual.to_vec()),
                    self.trim_umi(r2_seq, r2_qual)?,
                ),
                UmiLocation::Index | UmiLocation::PerIndex => (
                    self.extract_from_sequence(index_seq?)?,
                    (r1_seq.to_vec(), r1_qual.to_vec()),
                    (r2_seq.to_vec(), r2_qual.to_vec()),
                ),
            };

        Some((
            (self.add_umi_to_name(r1_name, &umi), new_r1_seq, new_r1_qual),
            (self.add_umi_to_name(r2_name, &umi), new_r2_seq, new_r2_qual),
        ))
    }

    /// Get the trimmed sequence range after UMI extraction.
    ///
    /// For Read1/Read2 locations, returns the start index after UMI consumption
    /// (`skip + length`, saturating at `usize::MAX` instead of overflowing).
    /// For Index/PerIndex, or when processing is disabled, returns 0 (no trimming).
    #[inline]
    pub fn trim_start(&self) -> usize {
        if !self.config.is_enabled() {
            return 0;
        }

        match self.config.location {
            UmiLocation::Read1 | UmiLocation::Read2 => {
                self.config.skip.saturating_add(self.config.length)
            }
            UmiLocation::Index | UmiLocation::PerIndex => 0,
        }
    }
}

/// Parse a UMI location given as a command-line string.
///
/// Delegates to `UmiLocation::from_str` and, when the value is not recognised,
/// returns an error message that echoes the input and lists the valid options.
pub fn parse_umi_location(s: &str) -> Result<UmiLocation, String> {
    match UmiLocation::from_str(s) {
        Some(location) => Ok(location),
        None => Err(format!(
            "Invalid UMI location '{}'. Valid options: read1, read2, index, per_index",
            s
        )),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // Paired-end fixture shared by the `process_paired_reads` tests.
    const R1_NAME: &[u8] = b"read1/1";
    const R1_SEQ: &[u8] = b"ACGTNNNNNNNN";
    const R1_QUAL: &[u8] = b"IIIIJJJJJJJJ";
    const R2_NAME: &[u8] = b"read1/2";
    const R2_SEQ: &[u8] = b"TGCANNNNNNNN";
    const R2_QUAL: &[u8] = b"KKKKLLLLLLLL";

    /// A (name, sequence, quality) triple as returned by the processor.
    type Record = (Vec<u8>, Vec<u8>, Vec<u8>);

    /// Run `process_paired_reads` on the shared fixture with a ":" separator,
    /// failing the test if the processor rejects the pair.
    fn run_paired(
        location: UmiLocation,
        length: usize,
        index_seq: Option<&[u8]>,
    ) -> (Record, Record) {
        let processor = UmiProcessor::new(
            UmiConfig::new()
                .with_location(location)
                .with_length(length)
                .with_separator(":"),
        );

        processor
            .process_paired_reads(R1_NAME, R1_SEQ, R1_QUAL, R2_NAME, R2_SEQ, R2_QUAL, index_seq)
            .expect("paired processing should succeed")
    }

    #[test]
    fn test_umi_location_default() {
        assert_eq!(UmiLocation::default(), UmiLocation::Read1);
    }

    #[test]
    fn test_umi_location_from_str() {
        let cases = [
            ("read1", Some(UmiLocation::Read1)),
            ("Read1", Some(UmiLocation::Read1)),
            ("READ1", Some(UmiLocation::Read1)),
            ("read2", Some(UmiLocation::Read2)),
            ("index", Some(UmiLocation::Index)),
            ("per_index", Some(UmiLocation::PerIndex)),
            ("perindex", Some(UmiLocation::PerIndex)),
            ("invalid", None),
        ];

        for (input, expected) in cases {
            assert_eq!(UmiLocation::from_str(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_umi_location_as_str() {
        let cases = [
            (UmiLocation::Read1, "read1"),
            (UmiLocation::Read2, "read2"),
            (UmiLocation::Index, "index"),
            (UmiLocation::PerIndex, "per_index"),
        ];

        for (location, expected) in cases {
            assert_eq!(location.as_str(), expected);
        }
    }

    #[test]
    fn test_umi_config_default() {
        let UmiConfig {
            enabled,
            location,
            length,
            prefix,
            skip,
            separator,
        } = UmiConfig::default();

        assert!(!enabled);
        assert_eq!(location, UmiLocation::Read1);
        assert_eq!(length, 0);
        assert_eq!(prefix, "");
        assert_eq!(skip, 0);
        assert_eq!(separator, ":");
    }

    #[test]
    fn test_umi_config_builder() {
        let built = UmiConfig::new()
            .enabled()
            .with_location(UmiLocation::Read2)
            .with_length(8)
            .with_prefix("UMI")
            .with_skip(2)
            .with_separator("_");

        // skip(2) + length(8)
        assert_eq!(built.total_consumed(), 10);

        assert!(built.enabled);
        assert_eq!(built.location, UmiLocation::Read2);
        assert_eq!(built.length, 8);
        assert_eq!(built.prefix, "UMI");
        assert_eq!(built.skip, 2);
        assert_eq!(built.separator, "_");
    }

    #[test]
    fn test_umi_config_is_enabled() {
        // Default configuration is off.
        assert!(!UmiConfig::default().is_enabled());
        // The flag alone is not enough without a length.
        assert!(!UmiConfig::new().enabled().is_enabled());
        // A non-zero length switches processing on.
        assert!(UmiConfig::new().with_length(8).is_enabled());
    }

    #[test]
    fn test_umi_processor_extract_umi() {
        let processor = UmiProcessor::new(
            UmiConfig::new()
                .with_location(UmiLocation::Read1)
                .with_length(8),
        );

        let extracted = processor.extract_umi(b"ACGTACGTNNNNNNNNNNNN", None);

        assert_eq!(extracted.as_deref(), Some(&b"ACGTACGT"[..]));
    }

    #[test]
    fn test_umi_processor_extract_with_skip() {
        let processor = UmiProcessor::new(
            UmiConfig::new()
                .with_location(UmiLocation::Read1)
                .with_length(6)
                .with_skip(2),
        );

        let extracted = processor.extract_umi(b"XXACGTACNNNNNNNN", None);

        assert_eq!(extracted.as_deref(), Some(&b"ACGTAC"[..]));
    }

    #[test]
    fn test_umi_processor_extract_too_short() {
        let processor = UmiProcessor::new(
            UmiConfig::new()
                .with_location(UmiLocation::Read1)
                .with_length(20),
        );

        // The read has only 8 bases, fewer than the 20 requested.
        assert!(processor.extract_umi(b"ACGTACGT", None).is_none());
    }

    #[test]
    fn test_umi_processor_extract_from_index() {
        let processor = UmiProcessor::new(
            UmiConfig::new()
                .with_location(UmiLocation::Index)
                .with_length(6),
        );
        let main_read: &[u8] = b"MAINSEQUENCE";
        let index_read: &[u8] = b"UMIUMIEXTRA";

        let extracted = processor.extract_umi(main_read, Some(index_read));

        assert_eq!(extracted.as_deref(), Some(&b"UMIUMI"[..]));
    }

    #[test]
    fn test_umi_processor_add_umi_to_name() {
        let processor = UmiProcessor::new(
            UmiConfig::new()
                .with_length(8)
                .with_prefix("UMI")
                .with_separator("_"),
        );

        let tagged = processor.add_umi_to_name(b"read1", b"ACGTACGT");

        assert_eq!(tagged, b"read1_UMIACGTACGT");
    }

    #[test]
    fn test_umi_processor_add_umi_to_name_no_prefix() {
        let processor = UmiProcessor::new(UmiConfig::new().with_length(8).with_separator(":"));

        let tagged = processor.add_umi_to_name(b"read1", b"ACGTACGT");

        assert_eq!(tagged, b"read1:ACGTACGT");
    }

    #[test]
    fn test_umi_processor_process_read() {
        let processor = UmiProcessor::new(
            UmiConfig::new()
                .with_location(UmiLocation::Read1)
                .with_length(4)
                .with_separator(":"),
        );

        let (new_name, new_seq, new_qual) = processor
            .process_read(b"read1", b"ACGTNNNNNNNNNNNN", b"IIIIJJJJJJJJJJJJ", None)
            .expect("read should be processed");

        assert_eq!(new_name, b"read1:ACGT");
        assert_eq!(new_seq, b"NNNNNNNNNNNN");
        assert_eq!(new_qual, b"JJJJJJJJJJJJ");
    }

    #[test]
    fn test_umi_processor_process_read_with_skip() {
        let processor = UmiProcessor::new(
            UmiConfig::new()
                .with_location(UmiLocation::Read1)
                .with_length(4)
                .with_skip(2)
                .with_separator(":"),
        );

        let (new_name, new_seq, new_qual) = processor
            .process_read(b"read1", b"XXACGTNNNNNNNN", b"IIIIIIJJJJJJJJ", None)
            .expect("read should be processed");

        assert_eq!(new_name, b"read1:ACGT");
        // skip(2) + length(4) = 6 bases removed from the front.
        assert_eq!(new_seq, b"NNNNNNNN");
        assert_eq!(new_qual, b"JJJJJJJJ");
    }

    #[test]
    fn test_umi_processor_process_read_disabled() {
        let processor = UmiProcessor::new(UmiConfig::disabled());

        let (new_name, new_seq, new_qual) = processor
            .process_read(b"read1", b"ACGTNNNN", b"IIIIJJJJ", None)
            .expect("disabled processor should pass the read through");

        assert_eq!(new_name, b"read1");
        assert_eq!(new_seq, b"ACGTNNNN");
        assert_eq!(new_qual, b"IIIIJJJJ");
    }

    #[test]
    fn test_umi_processor_process_paired_reads_read1() {
        let ((r1_name, r1_seq, r1_qual), (r2_name, r2_seq, r2_qual)) =
            run_paired(UmiLocation::Read1, 4, None);

        // R1 carries the UMI and loses it.
        assert_eq!(r1_name, b"read1/1:ACGT");
        assert_eq!(r1_seq, b"NNNNNNNN");
        assert_eq!(r1_qual, b"JJJJJJJJ");

        // R2 gets the same UMI in its name but keeps every base.
        assert_eq!(r2_name, b"read1/2:ACGT");
        assert_eq!(r2_seq, R2_SEQ);
        assert_eq!(r2_qual, R2_QUAL);
    }

    #[test]
    fn test_umi_processor_process_paired_reads_read2() {
        let ((r1_name, r1_seq, r1_qual), (r2_name, r2_seq, r2_qual)) =
            run_paired(UmiLocation::Read2, 4, None);

        // R1 is tagged with the UMI taken from R2 and keeps every base.
        assert_eq!(r1_name, b"read1/1:TGCA");
        assert_eq!(r1_seq, R1_SEQ);
        assert_eq!(r1_qual, R1_QUAL);

        // R2 carries the UMI and loses it.
        assert_eq!(r2_name, b"read1/2:TGCA");
        assert_eq!(r2_seq, b"NNNNNNNN");
        assert_eq!(r2_qual, b"LLLLLLLL");
    }

    #[test]
    fn test_umi_processor_process_paired_reads_index() {
        let index_read: &[u8] = b"UMIUMIEXTRA";

        let ((r1_name, r1_seq, r1_qual), (r2_name, r2_seq, r2_qual)) =
            run_paired(UmiLocation::Index, 6, Some(index_read));

        // Both names are tagged; neither read is trimmed.
        assert_eq!(r1_name, b"read1/1:UMIUMI");
        assert_eq!(r1_seq, R1_SEQ);
        assert_eq!(r1_qual, R1_QUAL);

        assert_eq!(r2_name, b"read1/2:UMIUMI");
        assert_eq!(r2_seq, R2_SEQ);
        assert_eq!(r2_qual, R2_QUAL);
    }

    #[test]
    fn test_umi_processor_trim_start() {
        let from_read1 = UmiProcessor::new(
            UmiConfig::new()
                .with_location(UmiLocation::Read1)
                .with_length(8)
                .with_skip(2),
        );
        assert_eq!(from_read1.trim_start(), 10);

        let from_index = UmiProcessor::new(
            UmiConfig::new()
                .with_location(UmiLocation::Index)
                .with_length(8),
        );
        assert_eq!(from_index.trim_start(), 0);

        let switched_off = UmiProcessor::new(UmiConfig::disabled());
        assert_eq!(switched_off.trim_start(), 0);
    }

    #[test]
    fn test_parse_umi_location() {
        assert!(parse_umi_location("read1").is_ok());
        assert!(parse_umi_location("invalid").is_err());
    }
}