// seqtkrs 0.1.1
//
// A Rust reimplementation of seqtk, a fast and lightweight tool for processing biological sequences in FASTA/FASTQ format.
// Documentation
//! `split` command: splits an input FASTA/FASTQ file into multiple files.
//!
//! Sequences are distributed round-robin across the output files.

use anyhow::{Context, Result};
use clap::Args;
use std::fs::File;
use std::io::BufWriter;

use crate::core::{SeqReader, SeqRecord, SeqWriter};

// Command-line arguments for the `split` subcommand.
//
// NOTE: the `///` comments on the fields below are kept verbatim on purpose:
// clap's derive turns field doc comments into the user-facing `--help` text,
// so rewording them would change program output. English explanations are
// therefore given as plain `//` comments, which clap ignores.
#[derive(Args, Debug)]
pub struct SplitArgs {
    // Prefix of the output file names; `run` appends `.NNNNN.fa` to it
    // (NNNNN is the 1-based, zero-padded file number).
    /// 输出文件名前缀
    #[arg(value_name = "prefix")]
    pub prefix: String,

    // Path of the input file; `run` treats the value "-" as standard input.
    /// 输入文件
    #[arg(value_name = "in.fa")]
    pub input: String,

    // Number of output files to create (`-n`, default 10).
    /// 输出文件数量
    #[arg(short = 'n', default_value = "10")]
    pub num_files: usize,

    // Sequence line width for the output (`-l`, default 0). `run` only
    // configures a line width on the writers when this is greater than 0;
    // the help text describes 0 as "no line wrapping".
    /// 每行长度(0表示不折行)
    #[arg(short = 'l', default_value = "0")]
    pub line_width: usize,
}

/// Runs the `split` command: distributes the input records round-robin over
/// `args.num_files` output files.
///
/// Input is read from standard input when `args.input` is `"-"`, otherwise
/// from the file at that path. Output files are named
/// `{prefix}.{NNNNN}.fa`, where `NNNNN` is the 1-based file number zero-padded
/// to five digits. All output files are created before any record is read, so
/// files that receive no record still exist (empty) afterwards.
///
/// Record `k` (0-based, in input order) is written to file number
/// `k % num_files + 1`.
///
/// # Errors
///
/// Returns an error if
/// - `args.num_files` is 0 (there would be no file to write to),
/// - the input file cannot be opened,
/// - an output file cannot be created,
/// - reading a record, writing a record, or flushing a writer fails.
pub fn run(args: &SplitArgs) -> Result<()> {
    // With zero output files the round-robin index below would be computed
    // modulo 0 and panic on the first record. Reject the request up front,
    // before touching the filesystem.
    if args.num_files == 0 {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            "输出文件数量(-n)必须大于0",
        )
        .into());
    }

    // Open the input: "-" selects standard input.
    let mut reader = if args.input == "-" {
        SeqReader::from_stdin()
    } else {
        SeqReader::from_path(&args.input)
            .with_context(|| format!("无法打开输入文件: {}", args.input))?
    };

    // Create every output file up front.
    let mut writers: Vec<SeqWriter<BufWriter<File>>> = Vec::with_capacity(args.num_files);
    for i in 0..args.num_files {
        let filename = format!("{}.{:05}.fa", args.prefix, i + 1);
        let file =
            File::create(&filename).with_context(|| format!("无法创建输出文件: {}", filename))?;
        let mut writer = SeqWriter::new(BufWriter::new(file));

        // Only override the writer's line width when one was requested.
        if args.line_width > 0 {
            writer = writer.with_line_width(args.line_width);
        }

        writers.push(writer);
    }

    // A single record object is reused for every read.
    let mut record = SeqRecord::new(Vec::new(), Vec::new());

    // Index of the writer that receives the next record. It is kept in
    // `0..num_files` by wrapping after each write, so it cannot overflow no
    // matter how many records the input holds.
    let mut file_index = 0;
    while reader.read_next(&mut record)? {
        writers[file_index].write_record(&record)?;
        file_index = (file_index + 1) % args.num_files;
    }

    // Flush explicitly so that write errors are reported instead of being
    // silently dropped when the buffered writers go out of scope.
    for writer in &mut writers {
        writer.flush()?;
    }

    Ok(())
}