seqtkrs 0.1.1

A Rust reimplementation of seqtk, a fast and lightweight tool for processing biological sequences in FASTA/FASTQ format
Documentation
// 质量值处理模块
// 提供质量值与错误概率之间的转换、质量值格式检测和转换

/// 将Phred质量值转换为错误概率
///
/// # 参数
///
/// * `qual` - ASCII编码的质量值
/// * `offset` - 质量值偏移量(33表示Phred+33,64表示Phred+64)
///
/// # 返回
///
/// 错误概率(0到1之间)
#[inline]
pub fn qual_to_error_prob(qual: u8, offset: u8) -> f64 {
    let q = (qual.saturating_sub(offset)) as f64;
    10.0_f64.powf(-q / 10.0)
}

/// 预计算质量值到错误概率的映射表
///
/// 生成256个元素的查找表,避免重复计算
///
/// # 参数
///
/// * `offset` - 质量值偏移量
///
/// # 返回
///
/// 包含256个错误概率值的向量
pub fn build_qual_table(offset: u8) -> Vec<f64> {
    (0..=255).map(|q| qual_to_error_prob(q, offset)).collect()
}

/// 判断质量值格式 (Phred33 or Phred64)
///
/// 通过检查质量值的最小值来推断格式
///
/// # 参数
///
/// * `qual` - 质量值字节数组
///
/// # 返回
///
/// 33 表示Phred+33 (Sanger, Illumina 1.8+)
/// 64 表示Phred+64 (Illumina 1.3-1.7)
pub fn detect_quality_offset(qual: &[u8]) -> u8 {
    if qual.is_empty() {
        return 33; // 默认使用Phred+33
    }

    let min_qual = qual.iter().min().copied().unwrap_or(33);
    if min_qual < 59 {
        33 // Phred+33 (Sanger, Illumina 1.8+)
    } else {
        64 // Phred+64 (Illumina 1.3-1.7)
    }
}

/// 转换质量值偏移
///
/// 将质量值从一种编码格式转换为另一种
///
/// # 参数
///
/// * `qual` - 要转换的质量值数组(会被就地修改)
/// * `from` - 源偏移量
/// * `to` - 目标偏移量
pub fn convert_quality_offset(qual: &mut [u8], from: u8, to: u8) {
    let diff = to as i16 - from as i16;
    for q in qual {
        *q = (*q as i16 + diff).max(to as i16).min(126) as u8;
    }
}

/// 计算平均质量值
///
/// # 参数
///
/// * `qual` - 质量值数组
/// * `offset` - 质量值偏移量
///
/// # 返回
///
/// 平均质量值(Phred分数)
pub fn average_quality(qual: &[u8], offset: u8) -> f64 {
    if qual.is_empty() {
        return 0.0;
    }

    let sum: f64 = qual
        .iter()
        .map(|&q| (q.saturating_sub(offset)) as f64)
        .sum();
    sum / qual.len() as f64
}

/// 计算质量值的百分位数
///
/// # 参数
///
/// * `qual` - 质量值数组
/// * `offset` - 质量值偏移量
/// * `percentile` - 百分位数(0-100)
///
/// # 返回
///
/// 指定百分位数的质量值
pub fn quality_percentile(qual: &[u8], offset: u8, percentile: f64) -> u8 {
    if qual.is_empty() {
        return offset;
    }

    let mut sorted: Vec<u8> = qual.iter().map(|&q| q.saturating_sub(offset)).collect();
    sorted.sort_unstable();

    let idx = ((qual.len() - 1) as f64 * percentile / 100.0) as usize;
    sorted[idx]
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_qual_to_error_prob() {
        // Phred+33编码
        let offset = 33;

        // 质量值40 (ASCII 73),错误率应该是10^-4 = 0.0001
        let q40 = qual_to_error_prob(73, offset);
        assert!((q40 - 0.0001).abs() < 1e-6);

        // 质量值30 (ASCII 63),错误率应该是10^-3 = 0.001
        let q30 = qual_to_error_prob(63, offset);
        assert!((q30 - 0.001).abs() < 1e-6);

        // 质量值20 (ASCII 53),错误率应该是10^-2 = 0.01
        let q20 = qual_to_error_prob(53, offset);
        assert!((q20 - 0.01).abs() < 1e-6);
    }

    #[test]
    fn test_build_qual_table() {
        let table = build_qual_table(33);
        assert_eq!(table.len(), 256);

        // 检查几个已知值
        assert!((table[73] - 0.0001).abs() < 1e-6); // Q40
        assert!((table[63] - 0.001).abs() < 1e-6); // Q30
        assert!((table[53] - 0.01).abs() < 1e-6); // Q20
    }

    #[test]
    fn test_detect_quality_offset() {
        // Phred+33质量值范围通常是33-73
        let phred33_qual = vec![35, 40, 45, 50, 55]; // ASCII 35-55
        assert_eq!(detect_quality_offset(&phred33_qual), 33);

        // Phred+64质量值范围通常是64-104
        let phred64_qual = vec![65, 70, 75, 80, 85]; // ASCII 65-85
        assert_eq!(detect_quality_offset(&phred64_qual), 64);

        // 空数组应该返回默认值33
        assert_eq!(detect_quality_offset(&[]), 33);
    }

    #[test]
    fn test_convert_quality_offset() {
        // 从Phred+33转换到Phred+64
        let mut qual = vec![33, 40, 50, 60, 70]; // 质量值0,7,17,27,37
        convert_quality_offset(&mut qual, 33, 64);

        // 转换后应该加31
        assert_eq!(qual, vec![64, 71, 81, 91, 101]);

        // 从Phred+64转换回Phred+33
        convert_quality_offset(&mut qual, 64, 33);
        assert_eq!(qual, vec![33, 40, 50, 60, 70]);
    }

    #[test]
    fn test_convert_quality_offset_clamp() {
        // 测试边界情况
        let mut qual = vec![33, 126, 200];
        convert_quality_offset(&mut qual, 33, 64);

        // 所有值应该被限制在[64, 126]范围内
        for &q in &qual {
            assert!(q >= 64 && q <= 126);
        }
    }

    #[test]
    fn test_average_quality() {
        let qual = vec![43, 43, 43, 43]; // 全部是质量值10 (43-33=10)
        let avg = average_quality(&qual, 33);
        assert!((avg - 10.0).abs() < 1e-6);

        let qual2 = vec![33, 43, 53, 63]; // 质量值0,10,20,30
        let avg2 = average_quality(&qual2, 33);
        assert!((avg2 - 15.0).abs() < 1e-6); // 平均值是15
    }

    #[test]
    fn test_quality_percentile() {
        let qual = vec![33, 38, 43, 48, 53]; // 质量值0,5,10,15,20

        // 0百分位应该是最小值0
        assert_eq!(quality_percentile(&qual, 33, 0.0), 0);

        // 50百分位(中位数)应该是10
        assert_eq!(quality_percentile(&qual, 33, 50.0), 10);

        // 100百分位应该是最大值20
        assert_eq!(quality_percentile(&qual, 33, 100.0), 20);
    }

    #[test]
    fn test_empty_quality() {
        let qual: Vec<u8> = vec![];

        assert_eq!(average_quality(&qual, 33), 0.0);
        assert_eq!(quality_percentile(&qual, 33, 50.0), 33);
    }
}