seqtkrs 0.1.1

A Rust reimplementation of seqtk, a fast and lightweight tool for processing biological sequences in FASTA/FASTQ format
Documentation
use crate::core::seq_record::SeqRecord;
use anyhow::{Context, Result};
use flate2::write::GzEncoder;
use flate2::Compression;
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::Path;

/// Sequence writer that emits records in FASTA or FASTQ format.
///
/// # Features
/// - Picks FASTA or FASTQ per record (see `write_record`)
/// - Optional gzip-compressed file output (see `to_path`)
/// - Configurable line width; when set it wraps sequence lines and, for FASTQ
///   records, quality lines as well
/// - `to_path` and `to_stdout` wrap their destination in a 16 KiB buffer;
///   `new` uses the supplied writer as-is
pub struct SeqWriter<W: Write> {
    writer: W,
    line_width: Option<usize>, // `None` means lines are never wrapped
}

impl SeqWriter<Box<dyn Write>> {
    /// 从文件路径创建写入器
    ///
    /// # 参数
    /// - `path`: 输出文件路径
    /// - `compress`: 是否启用gzip压缩
    pub fn to_path<P: AsRef<Path>>(path: P, compress: bool) -> Result<Self> {
        let path = path.as_ref();
        let file =
            File::create(path).with_context(|| format!("无法创建文件: {}", path.display()))?;

        let writer: Box<dyn Write> = if compress {
            Box::new(BufWriter::with_capacity(
                16384,
                GzEncoder::new(file, Compression::default()),
            ))
        } else {
            Box::new(BufWriter::with_capacity(16384, file))
        };

        Ok(Self::new(writer))
    }

    /// 写入到标准输出
    pub fn to_stdout() -> Self {
        let stdout = std::io::stdout();
        let writer = BufWriter::with_capacity(16384, stdout);
        Self::new(Box::new(writer))
    }
}

impl<W: Write> SeqWriter<W> {
    /// Creates a writer around `writer` with line wrapping disabled.
    ///
    /// No buffering is added here; `writer` is used exactly as given.
    pub fn new(writer: W) -> Self {
        Self {
            writer,
            line_width: None,
        }
    }

    /// Sets the maximum number of bytes emitted per sequence/quality line.
    ///
    /// # Parameters
    /// - `width`: bytes per line; `0` disables wrapping
    ///
    /// # Notes
    /// The width is applied to the sequence lines of FASTA and FASTQ records
    /// alike, and to the quality lines of FASTQ records. Wrapped FASTQ output
    /// is therefore multi-line; leave wrapping disabled when strictly
    /// four-line FASTQ is required.
    pub fn with_line_width(mut self, width: usize) -> Self {
        self.line_width = Some(width).filter(|&w| w > 0);
        self
    }

    /// Writes a single sequence record.
    ///
    /// # Parameters
    /// - `record`: the record to write
    ///
    /// # Format selection
    /// - FASTQ when `record.is_fastq()` returns `true`
    /// - FASTA otherwise
    ///
    /// # Errors
    /// Returns any I/O error reported by the underlying writer.
    pub fn write_record(&mut self, record: &SeqRecord) -> Result<()> {
        if record.is_fastq() {
            self.write_fastq(record)
        } else {
            self.write_fasta(record)
        }
    }

    /// Writes `record` in FASTA format: a `>` header line followed by the
    /// (optionally wrapped) sequence.
    fn write_fasta(&mut self, record: &SeqRecord) -> Result<()> {
        self.write_header(b">", record)?;
        self.write_wrapped(&record.seq)
    }

    /// Writes `record` in FASTQ format: an `@` header line, the sequence, a
    /// bare `+` separator line, and the quality string.
    fn write_fastq(&mut self, record: &SeqRecord) -> Result<()> {
        self.write_header(b"@", record)?;
        self.write_wrapped(&record.seq)?;

        // The separator line never repeats the record name.
        self.writer.write_all(b"+\n")?;

        // A record without quality data ends right after the separator line.
        if let Some(qual) = &record.qual {
            self.write_wrapped(qual)?;
        }

        Ok(())
    }

    /// Writes the header line: `marker`, the record name, then a space and the
    /// comment when one is present, terminated by a newline.
    fn write_header(&mut self, marker: &[u8], record: &SeqRecord) -> Result<()> {
        self.writer.write_all(marker)?;
        self.writer.write_all(&record.name)?;
        if let Some(comment) = &record.comment {
            self.writer.write_all(b" ")?;
            self.writer.write_all(comment)?;
        }
        self.writer.write_all(b"\n")?;
        Ok(())
    }

    /// Writes `data` as newline-terminated lines, honouring `line_width`.
    ///
    /// With wrapping enabled, `data` is split into chunks of at most
    /// `line_width` bytes, one per line; empty `data` therefore produces no
    /// line at all. With wrapping disabled, `data` is written as a single line
    /// (an empty line when `data` is empty).
    fn write_wrapped(&mut self, data: &[u8]) -> Result<()> {
        match self.line_width {
            Some(width) => {
                // `width` is never 0 here: `with_line_width` maps 0 to `None`,
                // so `chunks` cannot panic.
                for line in data.chunks(width) {
                    self.writer.write_all(line)?;
                    self.writer.write_all(b"\n")?;
                }
            }
            None => {
                self.writer.write_all(data)?;
                self.writer.write_all(b"\n")?;
            }
        }
        Ok(())
    }

    /// Flushes the underlying writer.
    ///
    /// # Notes
    /// Call this once all records are written. When the underlying writer is
    /// buffered, relying on drop to flush would hide any I/O error raised
    /// while the remaining bytes are written out.
    ///
    /// # Errors
    /// Returns any I/O error reported by the underlying writer's `flush`.
    pub fn flush(&mut self) -> Result<()> {
        self.writer.flush()?;
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Writes `records` through a `SeqWriter` backed by an in-memory cursor and
    /// returns everything that was emitted as a `String`.
    ///
    /// `line_width` is forwarded to `with_line_width` only when it is `Some`,
    /// so tests passing `None` exercise a writer built by `new` alone.
    fn render(records: &[SeqRecord], line_width: Option<usize>) -> String {
        let mut sink = Cursor::new(Vec::new());
        {
            let base = SeqWriter::new(&mut sink);
            let mut writer = match line_width {
                Some(width) => base.with_line_width(width),
                None => base,
            };
            for record in records {
                writer.write_record(record).unwrap();
            }
            writer.flush().unwrap();
        }
        String::from_utf8(sink.into_inner()).unwrap()
    }

    #[test]
    fn test_write_fasta() {
        let record = SeqRecord::new(b"seq1".to_vec(), b"ACGTACGT".to_vec());

        assert_eq!(render(&[record], None), ">seq1\nACGTACGT\n");
    }

    #[test]
    fn test_write_fasta_with_comment() {
        let mut record = SeqRecord::new(b"seq1".to_vec(), b"ACGT".to_vec());
        record.comment = Some(b"test comment".to_vec());

        assert_eq!(render(&[record], None), ">seq1 test comment\nACGT\n");
    }

    #[test]
    fn test_write_fasta_with_line_width() {
        let record = SeqRecord::new(b"seq1".to_vec(), b"ACGTACGT".to_vec());

        assert_eq!(render(&[record], Some(4)), ">seq1\nACGT\nACGT\n");
    }

    #[test]
    fn test_write_fastq() {
        let record = SeqRecord::with_qual(b"seq1".to_vec(), b"ACGT".to_vec(), b"IIII".to_vec());

        assert_eq!(render(&[record], None), "@seq1\nACGT\n+\nIIII\n");
    }

    #[test]
    fn test_write_multiple_records() {
        let records = [
            SeqRecord::new(b"seq1".to_vec(), b"ACGT".to_vec()),
            SeqRecord::new(b"seq2".to_vec(), b"TGCA".to_vec()),
        ];

        assert_eq!(render(&records, None), ">seq1\nACGT\n>seq2\nTGCA\n");
    }
}