seqtkrs 0.1.1

A Rust reimplementation of seqtk, a fast and lightweight tool for processing biological sequences in FASTA/FASTQ format
Documentation
// trimfq 命令:使用 Mott 修剪算法修剪 FASTQ 序列的低质量碱基
//
// 该命令可以:
// 1. 使用 Mott 修剪算法动态修剪低质量区域
// 2. 固定位置修剪(从左端或右端)
// 3. 限制输出序列的最大长度

use anyhow::Result;
use clap::Args;

use crate::algorithms::trimmer::{trim_mott, TrimmerConfig};
use crate::core::{SeqReader, SeqRecord, SeqWriter};

#[derive(Args, Debug)]
pub struct TrimfqArgs {
    /// 输入的 FASTQ 文件("-" 表示标准输入)
    #[arg(value_name = "in.fq")]
    pub input: String,

    /// Mott 修剪算法的错误率阈值(被 -b/-e/-L 禁用)
    #[arg(short = 'q', default_value = "0.05")]
    pub error_threshold: f64,

    /// 修剪后的最小长度(被 -b/-e/-L 禁用)
    #[arg(short = 'l', default_value = "30")]
    pub min_length: usize,

    /// 从左端(5'端)修剪固定长度(非零则禁用 -q/-l)
    #[arg(short = 'b', default_value = "0")]
    pub trim_left: usize,

    /// 从右端(3'端)修剪固定长度(非零则禁用 -q/-l)
    #[arg(short = 'e', default_value = "0")]
    pub trim_right: usize,

    /// 从5'端保留最多的碱基数(非零则禁用 -q/-l)
    #[arg(short = 'L', default_value = "0")]
    pub max_length: usize,
}

pub fn run(args: &TrimfqArgs) -> Result<()> {
    // 创建输入读取器
    let mut reader = if args.input == "-" {
        SeqReader::from_stdin()
    } else {
        SeqReader::from_path(&args.input)?
    };

    // 创建输出写入器
    let mut writer = SeqWriter::to_stdout();

    // 创建记录缓冲区
    let mut record = SeqRecord::new(Vec::new(), Vec::new());

    // 判断使用哪种修剪模式
    let use_fixed_trim = args.trim_left > 0 || args.trim_right > 0 || args.max_length > 0;

    // 处理每条序列
    while reader.read_next(&mut record)? {
        // 如果不是 FASTQ 格式,直接输出
        if !record.is_fastq() {
            writer.write_record(&record)?;
            continue;
        }

        // 计算修剪后的起始和结束位置
        let (start, end) = if use_fixed_trim {
            // 固定位置修剪模式
            fixed_trim(&record, args.trim_left, args.trim_right, args.max_length)
        } else {
            // Mott 修剪模式
            mott_trim(&record, args.error_threshold, args.min_length)?
        };

        // 如果修剪后序列为空,跳过输出
        if start >= end {
            continue;
        }

        // 创建修剪后的记录并输出
        let trimmed_record = create_trimmed_record(&record, start, end);
        writer.write_record(&trimmed_record)?;
    }

    writer.flush()?;
    Ok(())
}

/// 固定位置修剪
fn fixed_trim(
    record: &SeqRecord,
    trim_left: usize,
    trim_right: usize,
    max_length: usize,
) -> (usize, usize) {
    let seq_len = record.seq.len();

    // 计算起始和结束位置
    let mut start = trim_left;
    let mut end = seq_len.saturating_sub(trim_right);

    // 确保 start < end
    if start >= end {
        start = 0;
        end = 0;
    }

    // 应用最大长度限制
    if max_length > 0 && end - start > max_length {
        end = start + max_length;
    }

    (start, end)
}

/// Mott 修剪模式
fn mott_trim(
    record: &SeqRecord,
    error_threshold: f64,
    min_length: usize,
) -> Result<(usize, usize)> {
    // 如果质量序列长度小于等于最小长度,直接返回整个序列
    if let Some(qual) = &record.qual {
        if qual.len() <= min_length {
            return Ok((0, record.seq.len()));
        }

        // 使用 Mott 修剪算法
        let config = TrimmerConfig {
            error_threshold,
            min_length,
            qual_offset: 33,
            trim_left: 0,
            trim_right: 0,
        };
        let trim_result = trim_mott(qual, &config);

        Ok((trim_result.start, trim_result.end))
    } else {
        // 没有质量值,返回整个序列
        Ok((0, record.seq.len()))
    }
}

/// 创建修剪后的序列记录
fn create_trimmed_record(record: &SeqRecord, start: usize, end: usize) -> SeqRecord {
    let mut trimmed = SeqRecord::new(Vec::new(), Vec::new());

    // 复制名称和注释
    trimmed.name = record.name.clone();
    trimmed.comment = record.comment.clone();

    // 复制修剪后的序列
    trimmed.seq = record.seq[start..end].to_vec();

    // 如果有质量值,也要修剪
    if let Some(qual) = &record.qual {
        trimmed.qual = Some(qual[start..end].to_vec());
    }

    trimmed
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_fixed_trim() {
        let record = SeqRecord {
            name: b"test".to_vec(),
            comment: None,
            seq: b"ACGTACGTACGT".to_vec(),
            qual: Some(b"IIIIIIIIIIII".to_vec()),
        };

        // 测试左端修剪
        let (start, end) = fixed_trim(&record, 2, 0, 0);
        assert_eq!(start, 2);
        assert_eq!(end, 12);

        // 测试右端修剪
        let (start, end) = fixed_trim(&record, 0, 2, 0);
        assert_eq!(start, 0);
        assert_eq!(end, 10);

        // 测试两端修剪
        let (start, end) = fixed_trim(&record, 2, 2, 0);
        assert_eq!(start, 2);
        assert_eq!(end, 10);

        // 测试最大长度限制
        let (start, end) = fixed_trim(&record, 0, 0, 5);
        assert_eq!(start, 0);
        assert_eq!(end, 5);
    }

    #[test]
    fn test_create_trimmed_record() {
        let record = SeqRecord {
            name: b"test".to_vec(),
            comment: Some(b"comment".to_vec()),
            seq: b"ACGTACGT".to_vec(),
            qual: Some(b"IIIIIIII".to_vec()),
        };

        let trimmed = create_trimmed_record(&record, 2, 6);

        assert_eq!(trimmed.name, b"test");
        assert_eq!(trimmed.comment, Some(b"comment".to_vec()));
        assert_eq!(trimmed.seq, b"GTAC");
        assert_eq!(trimmed.qual, Some(b"IIII".to_vec()));
    }
}