seqtkrs 0.1.1

A Rust reimplementation of seqtk, a fast and lightweight tool for processing biological sequences in FASTA/FASTQ format
Documentation
use crate::core::seq_record::SeqRecord;
use anyhow::{Context, Result};
use bzip2::read::BzDecoder;
use flate2::read::GzDecoder;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

/// 序列读取器,支持FASTA/FASTQ格式,自动检测gzip/bzip2压缩
///
/// # 特性
/// - 自动识别FASTA('>')和FASTQ('@')格式
/// - 支持gzip (.gz) 和bzip2 (.bz2) 压缩文件
/// - 支持从文件或stdin读取
/// - 使用16KB缓冲区优化I/O性能
/// - 实现Iterator trait,支持函数式编程
pub struct SeqReader<R: BufRead> {
    reader: R,
    buffer: Vec<u8>,
    last_char: Option<u8>,
}

impl SeqReader<Box<dyn BufRead>> {
    /// 从文件路径创建读取器
    ///
    /// # 参数
    /// - `path`: 文件路径,根据扩展名自动选择解压方式
    ///
    /// # 支持的格式
    /// - `.gz`: gzip压缩
    /// - `.bz2`: bzip2压缩
    /// - 其他: 纯文本
    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
        let path = path.as_ref();
        let file = File::open(path).with_context(|| format!("无法打开文件: {}", path.display()))?;

        let reader: Box<dyn BufRead> = if path.extension().and_then(|s| s.to_str()) == Some("gz") {
            Box::new(BufReader::with_capacity(16384, GzDecoder::new(file)))
        } else if path.extension().and_then(|s| s.to_str()) == Some("bz2") {
            Box::new(BufReader::with_capacity(16384, BzDecoder::new(file)))
        } else {
            Box::new(BufReader::with_capacity(16384, file))
        };

        Ok(Self::new(reader))
    }

    /// 从标准输入创建读取器
    pub fn from_stdin() -> Self {
        let stdin = std::io::stdin();
        let reader = BufReader::with_capacity(16384, stdin);
        Self::new(Box::new(reader))
    }
}

impl<R: BufRead> SeqReader<R> {
    /// 创建新的序列读取器
    pub fn new(reader: R) -> Self {
        Self {
            reader,
            buffer: Vec::with_capacity(1024),
            last_char: None,
        }
    }

    /// 读取下一条序列记录
    ///
    /// # 参数
    /// - `record`: 用于存储读取结果的可变引用(会被复用以提高性能)
    ///
    /// # 返回值
    /// - `Ok(true)`: 成功读取一条记录
    /// - `Ok(false)`: 已到达文件末尾(EOF)
    /// - `Err`: 发生错误
    ///
    /// # 性能优化
    /// 在循环中复用同一个record对象,避免重复分配内存
    pub fn read_next(&mut self, record: &mut SeqRecord) -> Result<bool> {
        record.clear();

        // 跳到下一个header('>' 或 '@')
        if self.last_char.is_none() {
            loop {
                let mut ch = [0u8; 1];
                match self.reader.read(&mut ch) {
                    Ok(0) => return Ok(false), // EOF
                    Ok(_) => {
                        if ch[0] == b'>' || ch[0] == b'@' {
                            self.last_char = Some(ch[0]);
                            break;
                        }
                    }
                    Err(e) => return Err(e.into()),
                }
            }
        }

        let is_fastq = self.last_char == Some(b'@');

        // 读取名称行
        self.buffer.clear();
        let n = self.reader.read_until(b'\n', &mut self.buffer)?;
        if n == 0 {
            // EOF
            return Ok(false);
        }

        // 解析名称和注释
        if let Some(space_pos) = self.buffer.iter().position(|&b| b == b' ' || b == b'\t') {
            record.name = self.buffer[..space_pos].to_vec();
            let comment_start = space_pos + 1;
            let comment_end = if self.buffer.ends_with(b"\n") {
                self.buffer.len() - 1
            } else {
                self.buffer.len()
            };
            if comment_end > comment_start {
                record.comment = Some(self.buffer[comment_start..comment_end].to_vec());
            }
        } else {
            // 去掉换行符
            let end = if self.buffer.ends_with(b"\n") {
                self.buffer.len() - 1
            } else {
                self.buffer.len()
            };
            record.name = self.buffer[..end].to_vec();
        }

        // 读取序列(可能跨多行)
        loop {
            let buf = self.reader.fill_buf()?;
            if buf.is_empty() {
                break; // EOF
            }

            let ch = buf[0];

            // 遇到下一个记录的开始或FASTQ的质量值分隔符
            if ch == b'>' || ch == b'@' || ch == b'+' {
                self.last_char = Some(ch);
                // 消费这个字符,防止下次读取时重复读到
                self.reader.consume(1);
                break;
            }

            // 读取一行序列
            self.buffer.clear();
            self.reader.read_until(b'\n', &mut self.buffer)?;

            // 去掉换行符,添加到序列
            let end = if self.buffer.ends_with(b"\n") {
                self.buffer.len() - 1
            } else {
                self.buffer.len()
            };

            if end > 0 {
                record.seq.extend_from_slice(&self.buffer[..end]);
            }
        }

        // 如果是FASTQ,读取质量值
        if is_fastq {
            // 跳过 '+' 行
            self.buffer.clear();
            self.reader.read_until(b'\n', &mut self.buffer)?;

            // 读取质量值(长度必须与序列长度相同)
            let mut qual = Vec::with_capacity(record.seq.len());
            while qual.len() < record.seq.len() {
                self.buffer.clear();
                let n = self.reader.read_until(b'\n', &mut self.buffer)?;
                if n == 0 {
                    anyhow::bail!("质量值字符串被截断");
                }

                let end = if self.buffer.ends_with(b"\n") {
                    self.buffer.len() - 1
                } else {
                    self.buffer.len()
                };

                qual.extend_from_slice(&self.buffer[..end]);
            }

            if qual.len() != record.seq.len() {
                anyhow::bail!("质量值长度与序列长度不匹配");
            }

            record.qual = Some(qual);
            self.last_char = None;
        }

        Ok(true)
    }
}

/// 实现Iterator trait,支持函数式编程风格
///
/// # 示例
/// ```no_run
/// use seqtkrs::core::SeqReader;
///
/// let reader = SeqReader::from_stdin();
/// for result in reader {
///     let record = result.unwrap();
///     println!("序列名: {:?}", String::from_utf8_lossy(&record.name));
/// }
/// ```
impl<R: BufRead> Iterator for SeqReader<R> {
    type Item = Result<SeqRecord>;

    fn next(&mut self) -> Option<Self::Item> {
        let mut record = SeqRecord::new(Vec::new(), Vec::new());
        match self.read_next(&mut record) {
            Ok(true) => Some(Ok(record)),
            Ok(false) => None,
            Err(e) => Some(Err(e)),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn test_read_fasta() {
        let fasta_data = b">seq1 comment\nACGT\nACGT\n>seq2\nTGCA\n";
        let cursor = Cursor::new(fasta_data);
        let reader = BufReader::new(cursor);
        let mut seq_reader = SeqReader::new(reader);

        let mut record = SeqRecord::new(Vec::new(), Vec::new());

        // 读取第一条记录
        assert!(seq_reader.read_next(&mut record).unwrap());
        assert_eq!(record.name, b"seq1");
        assert_eq!(record.comment, Some(b"comment".to_vec()));
        assert_eq!(record.seq, b"ACGTACGT");
        assert!(!record.is_fastq());

        // 读取第二条记录
        assert!(seq_reader.read_next(&mut record).unwrap());
        assert_eq!(record.name, b"seq2");
        assert_eq!(record.seq, b"TGCA");

        // EOF
        assert!(!seq_reader.read_next(&mut record).unwrap());
    }

    #[test]
    fn test_read_fastq() {
        let fastq_data = b"@seq1\nACGT\n+\nIIII\n@seq2\nTGCA\n+\nJJJJ\n";
        let cursor = Cursor::new(fastq_data);
        let reader = BufReader::new(cursor);
        let mut seq_reader = SeqReader::new(reader);

        let mut record = SeqRecord::new(Vec::new(), Vec::new());

        // 读取第一条记录
        assert!(seq_reader.read_next(&mut record).unwrap());
        assert_eq!(record.name, b"seq1");
        assert_eq!(record.seq, b"ACGT");
        assert_eq!(record.qual, Some(b"IIII".to_vec()));
        assert!(record.is_fastq());

        // 读取第二条记录
        assert!(seq_reader.read_next(&mut record).unwrap());
        assert_eq!(record.name, b"seq2");
        assert_eq!(record.seq, b"TGCA");
        assert_eq!(record.qual, Some(b"JJJJ".to_vec()));
    }

    #[test]
    fn test_iterator() {
        let fasta_data = b">seq1\nACGT\n>seq2\nTGCA\n";
        let cursor = Cursor::new(fasta_data);
        let reader = BufReader::new(cursor);
        let seq_reader = SeqReader::new(reader);

        let records: Vec<_> = seq_reader.map(|r| r.unwrap()).collect();

        assert_eq!(records.len(), 2);
        assert_eq!(records[0].name, b"seq1");
        assert_eq!(records[1].name, b"seq2");
    }
}