seqtkrs 0.1.1

A Rust reimplementation of seqtk, a fast and lightweight tool for processing biological sequences in FASTA/FASTQ format
Documentation
//! Mott修剪算法实现
//!
//! 使用动态规划找到质量值序列中的最佳得分区间,
//! 如果最佳区间长度小于最小长度要求,则回退到滑动窗口方法。

use crate::utils::quality::build_qual_table;

/// Mott算法修剪参数
#[derive(Debug, Clone)]
pub struct TrimmerConfig {
    /// 质量阈值(错误率)
    pub error_threshold: f64,
    /// 最小保留长度
    pub min_length: usize,
    /// 质量值偏移
    pub qual_offset: u8,
    /// 固定修剪左端碱基数
    pub trim_left: usize,
    /// 固定修剪右端碱基数
    pub trim_right: usize,
}

impl Default for TrimmerConfig {
    fn default() -> Self {
        Self {
            error_threshold: 0.05,
            min_length: 30,
            qual_offset: 33,
            trim_left: 0,
            trim_right: 0,
        }
    }
}

/// Mott算法修剪结果
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TrimResult {
    /// 保留区间的起始位置(0-based, inclusive)
    pub start: usize,
    /// 保留区间的结束位置(0-based, exclusive)
    pub end: usize,
}

/// Mott算法修剪低质量区域
///
/// # 算法说明
///
/// 1. 使用动态规划找到最大得分区间
///    - 得分 = error_threshold - error_prob(qual[i])
///    - 当累积得分为负时重置起点
///
/// 2. 如果最佳区间长度 < min_length,使用滑动窗口
///    - 找到平均质量最高的min_length长度窗口
///
/// 3. 应用固定修剪
///    - 从左端修剪trim_left个碱基
///    - 从右端修剪trim_right个碱基
///
/// # 参数
///
/// * `qual` - 质量值序列
/// * `config` - 修剪配置参数
///
/// # 返回
///
/// 返回修剪后的起始和结束位置(半开区间[start, end))
///
/// # 示例
///
/// ```
/// use seqtkrs::algorithms::trimmer::{trim_mott, TrimmerConfig};
///
/// let qual = b"IIIIIIIIII!!!!!IIIIIIIIII";  // 质量值字符串
/// let config = TrimmerConfig::default();
/// let result = trim_mott(qual, &config);
///
/// println!("保留区间: [{}..{})", result.start, result.end);
/// ```
pub fn trim_mott(qual: &[u8], config: &TrimmerConfig) -> TrimResult {
    if qual.is_empty() {
        return TrimResult { start: 0, end: 0 };
    }

    // 预计算错误概率表
    let error_table = build_qual_table(config.qual_offset);

    let mut start;
    let mut end;

    // 只有当序列长度严格大于min_length时才进行修剪
    if qual.len() > config.min_length {
        // 动态规划找最大得分区间
        let mut score = 0.0;
        let mut max_score = 0.0;
        let mut max_end = qual.len();
        let mut temp_start = 0;
        let mut best_start = 0;

        for (i, &q) in qual.iter().enumerate() {
            // 质量值钳制:与原版seqtk保持一致
            let q_clamped = q.clamp(36, 127);
            let error_prob = error_table[q_clamped as usize];
            score += config.error_threshold - error_prob;

            if score > max_score {
                max_score = score;
                max_end = i + 1;
                best_start = temp_start;
            }

            if score < 0.0 {
                score = 0.0;
                temp_start = i + 1;
            }
        }

        start = best_start;
        end = max_end;

        // 如果max_score从未更新(全是低质量),返回前min_length个碱基
        if max_score == 0.0 {
            start = 0;
            end = config.min_length;
        }

        // 如果找到的区间太短,使用滑动窗口找质量值总和最高的min_length区间
        // 注意:这里直接使用质量值,不是错误概率
        if end - start < config.min_length && qual.len() >= config.min_length {
            // 计算第一个窗口的质量值总和
            let mut window_sum: i32 = qual[0..config.min_length]
                .iter()
                .map(|&q| (q as i32) - (config.qual_offset as i32))
                .sum();

            let mut best_sum = window_sum;
            start = 0;

            // 滑动窗口
            for i in config.min_length..qual.len() {
                window_sum += (qual[i] as i32) - (config.qual_offset as i32);
                window_sum -= (qual[i - config.min_length] as i32) - (config.qual_offset as i32);

                if window_sum > best_sum {
                    best_sum = window_sum;
                    start = i - config.min_length + 1;
                }
            }
            end = start + config.min_length;
        }
    } else {
        // 序列长度 <= min_length,保留整个序列
        start = 0;
        end = qual.len();
    }

    // 应用固定修剪
    start = start.saturating_add(config.trim_left);
    end = end.saturating_sub(config.trim_right);

    // 确保start < end
    if start >= end {
        return TrimResult { start: 0, end: 0 };
    }

    TrimResult { start, end }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_trim_mott_basic() {
        // 创建一个简单的质量值序列
        // 'I' = Phred+33质量值40 (高质量)
        // '!' = Phred+33质量值0 (低质量)
        let qual = b"IIIIIIIIII!!!!!IIIIIIIIII";
        let config = TrimmerConfig {
            error_threshold: 0.05,
            min_length: 5,
            qual_offset: 33,
            trim_left: 0,
            trim_right: 0,
        };

        let result = trim_mott(qual, &config);

        // 应该保留高质量区域
        assert!(result.start < result.end);
        assert!(result.end - result.start >= config.min_length);
    }

    #[test]
    fn test_trim_mott_empty() {
        let qual = b"";
        let config = TrimmerConfig::default();
        let result = trim_mott(qual, &config);

        assert_eq!(result.start, 0);
        assert_eq!(result.end, 0);
    }

    #[test]
    fn test_trim_mott_with_fixed_trim() {
        let qual = b"IIIIIIIIIIIIIIIIIIII"; // 全高质量
        let config = TrimmerConfig {
            error_threshold: 0.05,
            min_length: 5,
            qual_offset: 33,
            trim_left: 2,
            trim_right: 3,
        };

        let result = trim_mott(qual, &config);

        // 应该从两端修剪固定数量的碱基
        assert!(result.start >= 2);
        assert!(result.end <= qual.len() - 3);
    }

    #[test]
    fn test_trim_mott_all_low_quality() {
        // 全低质量序列
        let qual = b"!!!!!!!!!!";
        let config = TrimmerConfig {
            error_threshold: 0.05,
            min_length: 20, // 比序列长度还长
            qual_offset: 33,
            trim_left: 0,
            trim_right: 0,
        };

        let result = trim_mott(qual, &config);

        // 因为序列长度(10) <= min_length(20),保留整个序列
        assert_eq!(result.start, 0);
        assert_eq!(result.end, 10);
    }
}