seqtkrs 0.1.1

A Rust reimplementation of seqtk, a fast and lightweight tool for processing biological sequences in FASTA/FASTQ format
Documentation
//! X-dropoff算法实现
//!
//! X-dropoff算法用于在序列中找到满足特定条件的最佳区间。
//! 主要应用于GC含量区域识别,当累积得分下降超过阈值时停止扩展。

/// X-dropoff算法配置
#[derive(Debug, Clone)]
pub struct XDropoffConfig {
    /// 目标得分(匹配时)
    pub match_score: f64,
    /// 非匹配惩罚
    pub mismatch_penalty: f64,
    /// X-dropoff阈值
    pub xdrop: f64,
}

impl Default for XDropoffConfig {
    fn default() -> Self {
        Self {
            match_score: 1.0,
            mismatch_penalty: 1.0,
            xdrop: 10.0,
        }
    }
}

/// X-dropoff算法结果
#[derive(Debug, Clone, PartialEq)]
pub struct XDropoffResult {
    /// 区间起始位置
    pub start: usize,
    /// 区间结束位置
    pub end: usize,
    /// 最大得分
    pub score: f64,
}

/// 使用X-dropoff算法找到最佳区间
///
/// # 算法说明
///
/// 从指定起始位置开始向右扫描,累积得分:
/// - 匹配条件满足时加上match_score
/// - 不满足时减去mismatch_penalty
/// - 当 max_score - current_score > xdrop 时停止
///
/// # 参数
///
/// * `seq` - 输入序列
/// * `start_pos` - 起始位置
/// * `is_match` - 判断碱基是否满足匹配条件的闭包
/// * `config` - X-dropoff配置
///
/// # 返回
///
/// 返回找到的最佳区间及其得分
///
/// # 示例
///
/// ```
/// use seqtkrs::algorithms::xdropoff::{xdropoff_search, XDropoffConfig};
///
/// let seq = b"ATCGATCGGGGGGATCG";
/// let config = XDropoffConfig::default();
///
/// // 查找G含量高的区域
/// let result = xdropoff_search(seq, 0, |base| base == b'G', &config);
/// println!("找到区域: [{}..{}), 得分: {}", result.start, result.end, result.score);
/// ```
pub fn xdropoff_search<F>(
    seq: &[u8],
    start_pos: usize,
    is_match: F,
    config: &XDropoffConfig,
) -> XDropoffResult
where
    F: Fn(u8) -> bool,
{
    let mut score = 0.0;
    let mut max_score = 0.0;
    let mut max_pos = start_pos;
    let start = start_pos;

    for (i, &base) in seq[start_pos..].iter().enumerate() {
        let pos = start_pos + i;

        if is_match(base) {
            score += config.match_score;
        } else {
            score -= config.mismatch_penalty;
        }

        if score > max_score {
            max_score = score;
            max_pos = pos + 1;
        }

        // X-dropoff条件
        if max_score - score > config.xdrop {
            break;
        }
    }

    XDropoffResult {
        start,
        end: max_pos,
        score: max_score,
    }
}

/// GC含量区域识别(使用X-dropoff)
///
/// 在整个序列中查找所有满足目标GC含量的区域。
///
/// # 算法说明
///
/// 1. 根据目标GC含量计算得分权重 q = (1-gc)/gc
/// 2. 从序列起点开始,使用X-dropoff算法查找高GC区域
/// 3. 跳过已找到的区域,继续查找下一个区域
/// 4. 过滤掉长度小于min_length的区域
///
/// # 参数
///
/// * `seq` - 输入序列
/// * `target_gc_frac` - 目标GC含量(0.0-1.0)
/// * `xdrop` - X-dropoff阈值
/// * `min_length` - 最小区域长度
///
/// # 返回
///
/// 返回所有找到的GC区域列表,每个元素为 (start, end, score)
///
/// # 示例
///
/// ```
/// use seqtkrs::algorithms::xdropoff::find_gc_regions;
///
/// let seq = b"ATATATATGCGCGCGCGCATATATATAT";
///
/// // 查找GC含量约50%的区域
/// let regions = find_gc_regions(seq, 0.5, 5.0, 10);
///
/// for (start, end, score) in regions {
///     println!("GC区域: [{}..{}), 得分: {:.2}", start, end, score);
/// }
/// ```
pub fn find_gc_regions(
    seq: &[u8],
    target_gc_frac: f64,
    xdrop: f64,
    min_length: usize,
) -> Vec<(usize, usize, f64)> {
    // 计算得分权重
    // 当GC碱基出现时得分为q,非GC碱基出现时扣分为1
    // 这样可以平衡GC含量接近target_gc_frac的区域
    let q = (1.0 - target_gc_frac) / target_gc_frac;

    let config = XDropoffConfig {
        match_score: q,
        mismatch_penalty: 1.0,
        xdrop,
    };

    let mut regions = Vec::new();
    let mut pos = 0;

    while pos < seq.len() {
        let result = xdropoff_search(
            seq,
            pos,
            |base| base == b'G' || base == b'g' || base == b'C' || base == b'c',
            &config,
        );

        // 只保留足够长的区域
        if result.end > result.start && result.end - result.start >= min_length {
            regions.push((result.start, result.end, result.score));
        }

        // 移动到下一个未检查的位置
        // 如果找到了区域就跳过它,否则向前移动一个位置
        pos = if result.end > result.start {
            result.end
        } else {
            pos + 1
        };
    }

    regions
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_xdropoff_search_basic() {
        // 测试简单的匹配搜索
        let seq = b"AAAAGGGGGAAAA";
        let config = XDropoffConfig::default();

        // 从头开始搜索G
        let result = xdropoff_search(seq, 0, |base| base == b'G', &config);

        // 应该找到G的聚集区域
        assert!(result.end > result.start);
        assert!(result.score > 0.0);
    }

    #[test]
    fn test_xdropoff_search_no_match() {
        // 测试没有匹配的情况
        let seq = b"AAAAAAAA";
        let config = XDropoffConfig::default();

        let result = xdropoff_search(seq, 0, |base| base == b'G', &config);

        // 没有匹配,得分应该为0或负数
        assert_eq!(result.score, 0.0);
    }

    #[test]
    fn test_xdropoff_search_with_start_pos() {
        // 测试从中间位置开始搜索
        let seq = b"AAAAGGGGGAAAA";
        let config = XDropoffConfig::default();

        let result = xdropoff_search(seq, 5, |base| base == b'G', &config);

        assert_eq!(result.start, 5);
        assert!(result.end > 5);
    }

    #[test]
    fn test_find_gc_regions_basic() {
        // 测试GC区域识别
        let seq = b"ATATATATGCGCGCGCGCATATATATAT";

        // 查找GC含量高的区域
        let regions = find_gc_regions(seq, 0.5, 5.0, 5);

        // 应该找到至少一个GC区域
        assert!(!regions.is_empty());

        // 检查找到的区域
        for (start, end, score) in regions {
            assert!(end > start);
            assert!(end - start >= 5); // 满足最小长度要求
            assert!(score > 0.0);

            // 检查该区域的GC含量确实较高
            let region = &seq[start..end];
            let gc_count = region
                .iter()
                .filter(|&&b| b == b'G' || b == b'C' || b == b'g' || b == b'c')
                .count();
            let gc_frac = gc_count as f64 / region.len() as f64;

            // GC含量应该高于0.3(这是一个宽松的阈值)
            assert!(gc_frac > 0.3, "GC含量: {}", gc_frac);
        }
    }

    #[test]
    fn test_find_gc_regions_no_gc() {
        // 测试没有GC的序列
        let seq = b"ATATATAT";

        let regions = find_gc_regions(seq, 0.5, 5.0, 5);

        // 应该找不到符合条件的GC区域
        assert!(regions.is_empty());
    }

    #[test]
    fn test_find_gc_regions_min_length_filter() {
        // 测试最小长度过滤
        let seq = b"ATGCAT"; // 只有很短的GC片段

        let regions = find_gc_regions(seq, 0.5, 5.0, 10); // 要求至少10bp

        // 因为没有满足长度要求的区域,应该返回空
        assert!(regions.is_empty());
    }

    #[test]
    fn test_xdropoff_config() {
        // 测试配置参数的影响
        let seq = b"AAAAGGGGGAAAA";

        // 高xdrop阈值应该允许更长的扩展
        let config_high = XDropoffConfig {
            match_score: 1.0,
            mismatch_penalty: 1.0,
            xdrop: 100.0,
        };

        // 低xdrop阈值应该更早停止
        let config_low = XDropoffConfig {
            match_score: 1.0,
            mismatch_penalty: 1.0,
            xdrop: 1.0,
        };

        let result_high = xdropoff_search(seq, 0, |base| base == b'G', &config_high);
        let result_low = xdropoff_search(seq, 0, |base| base == b'G', &config_low);

        // 高阈值应该找到更长或相同长度的区间
        assert!(result_high.end - result_high.start >= result_low.end - result_low.start);
    }
}