// seqtkrs 0.1.1
//
// A Rust reimplementation of seqtk, a fast and lightweight tool for processing biological sequences in FASTA/FASTQ format
// Documentation
//! rename命令:重命名序列
//!
//! 该命令检测双端读段(通过名称相同判断)并重命名序列,给每对序列分配相同的编号

use anyhow::{Context, Result};
use clap::Args;

use crate::core::{SeqReader, SeqRecord, SeqWriter};

// Command-line arguments for the `rename` subcommand.
//
// NOTE(review): the `///` comments on the fields below are intentionally left untranslated;
// clap's derive macros presumably surface field doc comments as `--help` text, so editing
// them would change program output. Confirm before touching them.
#[derive(Args, Debug)]
pub struct RenameArgs {
    // Input file; the default value "-" is what `run` treats as "read from stdin".
    /// 输入文件
    #[arg(value_name = "in.fq", default_value = "-")]
    pub input: String,

    // Optional prefix placed in front of the sequence number when building each new name.
    /// 序列名称前缀(可选)
    #[arg(value_name = "prefix")]
    pub prefix: Option<String>,
}

/// Reports whether two sequence names refer to the same read.
///
/// When *both* names end in a `/<digit>` mate suffix (e.g. `/1`, `/2`) and have at least one
/// byte in front of it, only the part before the suffix is compared. In every other case the
/// names must match byte for byte.
fn names_equal(name1: &[u8], name2: &[u8]) -> bool {
    /// Returns the non-empty part of `name` preceding a trailing `/<digit>`, if present.
    fn mate_stem(name: &[u8]) -> Option<&[u8]> {
        match name {
            [stem @ .., b'/', digit] if !stem.is_empty() && digit.is_ascii_digit() => Some(stem),
            _ => None,
        }
    }

    match (mate_stem(name1), mate_stem(name2)) {
        // Both carry a mate suffix: the suffix digit itself is ignored. Stems of different
        // length compare unequal, which also covers names of different total length.
        (Some(stem1), Some(stem2)) => stem1 == stem2,
        // At most one side has a suffix: fall back to an exact comparison.
        _ => name1 == name2,
    }
}

/// Writes `record` to `writer` under a freshly generated name.
///
/// The new name is the decimal `number`, preceded by `prefix` when one is given. The comment,
/// sequence and quality fields are copied from `record` unchanged.
///
/// # Errors
///
/// Propagates any error returned by `writer.write_record`.
fn write_renamed_record<W>(
    writer: &mut SeqWriter<W>,
    record: &SeqRecord,
    prefix: Option<&str>,
    number: u64,
) -> Result<()>
where
    W: std::io::Write,
{
    // Build the replacement name as text first, then convert it to bytes once.
    let label = match prefix {
        Some(p) => format!("{}{}", p, number),
        None => number.to_string(),
    };

    // `write_record` takes a whole record, so assemble a copy that differs only in its name.
    let renamed = SeqRecord {
        name: label.into_bytes(),
        comment: record.comment.clone(),
        seq: record.seq.clone(),
        qual: record.qual.clone(),
    };

    writer.write_record(&renamed)?;
    Ok(())
}

/// Runs the `rename` subcommand.
///
/// Records are read one at a time and renamed to consecutive numbers starting at 1
/// (optionally preceded by `args.prefix`). Two adjacent records whose names match according to
/// `names_equal` are treated as a pair and receive the same number; every other record gets a
/// number of its own. Output goes through `SeqWriter::to_stdout()`.
///
/// # Errors
///
/// Returns an error if the input file cannot be opened, or if reading, writing or flushing
/// fails.
pub fn run(args: &RenameArgs) -> Result<()> {
    let prefix = args.prefix.as_deref();

    // "-" selects standard input; any other value is opened as a file path.
    let mut reader = if args.input == "-" {
        SeqReader::from_stdin()
    } else {
        SeqReader::from_path(&args.input)
            .with_context(|| format!("无法打开输入文件: {}", args.input))?
    };
    let mut writer = SeqWriter::to_stdout();

    // `pending` holds a record that has been read but not yet written, because whether it
    // forms a pair is only known once the following record has been seen.
    let mut pending: Option<SeqRecord> = None;
    let mut incoming = SeqRecord::new(Vec::new(), Vec::new());
    let mut next_id: u64 = 1;

    while reader.read_next(&mut incoming)? {
        match pending.take() {
            // Nothing buffered yet: hold on to this record and look at the next one.
            None => pending = Some(incoming.clone()),
            // Same name as the buffered record: emit both under one number and leave the
            // buffer empty so the next record starts a new comparison.
            Some(held) if names_equal(&held.name, &incoming.name) => {
                write_renamed_record(&mut writer, &held, prefix, next_id)?;
                write_renamed_record(&mut writer, &incoming, prefix, next_id)?;
                next_id += 1;
            }
            // Different name: the buffered record is unpaired; emit it and buffer the new one.
            Some(held) => {
                write_renamed_record(&mut writer, &held, prefix, next_id)?;
                next_id += 1;
                pending = Some(incoming.clone());
            }
        }
    }

    // A record still buffered at end of input never found a mate.
    if let Some(held) = pending {
        write_renamed_record(&mut writer, &held, prefix, next_id)?;
    }

    writer.flush()?;
    Ok(())
}