seqtkrs 0.1.1

A Rust reimplementation of seqtk, a fast and lightweight tool for processing biological sequences in FASTA/FASTQ format
Documentation
//! subseq命令 - 提取子序列
//!
//! 根据BED文件或名称列表提取指定的序列或子序列。
//!
//! # 示例
//!
//! ```bash
//! # 根据名称列表提取序列
//! seqtkrs subseq input.fa names.txt > output.fa
//!
//! # 根据BED文件提取子序列
//! seqtkrs subseq input.fa regions.bed > output.fa
//!
//! # TAB分隔输出
//! seqtkrs subseq -t input.fa regions.bed > output.tsv
//!
//! # 链方向感知(反向互补负链序列)
//! seqtkrs subseq -s input.fa regions.bed > output.fa
//! ```

use crate::core::tables::LOOKUP_TABLES;
use crate::core::{SeqReader, SeqRecord};
use crate::utils::region::{parse_regions, Region, RegionList};
use anyhow::{Context, Result};
use clap::Args;
use std::io::Write;

#[derive(Args, Debug)]
pub struct SubseqArgs {
    /// 输入文件(FASTA/FASTQ格式,支持gzip/bzip2),使用"-"表示stdin
    #[arg(value_name = "in.fa")]
    pub input: String,

    /// BED文件或名称列表
    #[arg(value_name = "in.bed|name.list")]
    pub regions_file: String,

    /// TAB分隔输出
    #[arg(short = 't', long)]
    pub tab_delimited: bool,

    /// 链方向感知(反向互补负链序列)
    #[arg(short = 's', long)]
    pub strand_aware: bool,

    /// 序列行长度(0表示不换行)[0]
    #[arg(short = 'l', long, value_name = "INT", default_value = "0")]
    pub line_width: usize,
}

pub fn run(args: &SubseqArgs) -> Result<()> {
    // 读取区域文件
    let regions = parse_regions(&args.regions_file)
        .with_context(|| format!("无法读取区域文件: {}", args.regions_file))?;

    // 打开输入文件
    let mut reader = if args.input == "-" {
        SeqReader::from_stdin()
    } else {
        SeqReader::from_path(&args.input)
            .with_context(|| format!("无法打开输入文件: {}", args.input))?
    };

    let stdout = std::io::stdout();
    let mut writer = stdout.lock();
    let mut record = SeqRecord::new(Vec::new(), Vec::new());

    while reader.read_next(&mut record)? {
        // 检查这个序列是否在我们的区域列表中
        if let Some(region_list) = regions.get(&record.name) {
            extract_and_output(&mut writer, &record, region_list, args)?;
        }
    }

    writer.flush()?;
    Ok(())
}

/// 提取并输出子序列
fn extract_and_output(
    writer: &mut impl Write,
    record: &SeqRecord,
    region_list: &RegionList,
    args: &SubseqArgs,
) -> Result<()> {
    for region in region_list.iter() {
        let beg = region.start;
        let mut end = region.end;

        // 检查边界
        if beg >= record.seq.len() {
            eprintln!(
                "[subseq] {}: {} >= {}",
                String::from_utf8_lossy(&record.name),
                beg,
                record.seq.len()
            );
            continue;
        }

        if end > record.seq.len() {
            end = record.seq.len();
        }

        // 输出序列
        if args.tab_delimited {
            // TAB分隔格式
            output_tab_format(writer, record, beg, end, args)?;
        } else {
            // 标准FASTA/FASTQ格式
            output_fasta_format(writer, record, region, beg, end, args)?;
        }
    }

    Ok(())
}

/// 输出TAB分隔格式
fn output_tab_format(
    writer: &mut impl Write,
    record: &SeqRecord,
    beg: usize,
    end: usize,
    _args: &SubseqArgs,
) -> Result<()> {
    // 格式:名称\t起始位置\t序列\n
    write!(
        writer,
        "{}\t{}\t",
        String::from_utf8_lossy(&record.name),
        beg + 1
    )?;

    // 如果是链方向感知模式且是负链,需要反向互补
    // 但TAB格式通常不使用链方向
    writer.write_all(&record.seq[beg..end])?;
    writeln!(writer)?;

    Ok(())
}

/// 输出FASTA/FASTQ格式
fn output_fasta_format(
    writer: &mut impl Write,
    record: &SeqRecord,
    region: Region,
    beg: usize,
    end: usize,
    args: &SubseqArgs,
) -> Result<()> {
    let is_fastq = record.qual.is_some();

    // 输出序列头
    if !args.tab_delimited {
        let prefix = if is_fastq { b'@' } else { b'>' };
        write!(writer, "{}", prefix as char)?;
        writer.write_all(&record.name)?;

        // 如果不是完整序列,添加位置信息
        // 只有在beg>0或end不是最大值时才添加位置信息
        let is_full_seq = beg == 0 && end >= record.seq.len();
        if !is_full_seq {
            if end >= record.seq.len() {
                if beg > 0 {
                    write!(writer, ":{}", beg + 1)?;
                }
            } else {
                write!(writer, ":{}-{}", beg + 1, end)?;
            }
        }

        // 添加注释(如果有)
        if let Some(ref comment) = record.comment {
            write!(writer, " ")?;
            writer.write_all(comment)?;
        }
        writeln!(writer)?;
    }

    // 提取子序列
    let subseq = &record.seq[beg..end];

    // 处理链方向
    if args.strand_aware && region.strand.is_some() {
        let strand = region.strand.unwrap();
        if strand == b'-' {
            // 负链:反向互补
            let rev_comp = reverse_complement(subseq);
            output_sequence_lines(writer, &rev_comp, args.line_width)?;
        } else {
            // 正链:直接输出
            output_sequence_lines(writer, subseq, args.line_width)?;
        }
    } else {
        // 不考虑链方向,直接输出
        output_sequence_lines(writer, subseq, args.line_width)?;
    }

    // 如果是FASTQ,输出质量值
    if is_fastq && !args.tab_delimited {
        if let Some(ref qual) = record.qual {
            writeln!(writer, "+")?;
            let subqual = &qual[beg..end];
            output_sequence_lines(writer, subqual, args.line_width)?;
        }
    }

    Ok(())
}

/// 输出序列,可选换行
fn output_sequence_lines(writer: &mut impl Write, seq: &[u8], line_width: usize) -> Result<()> {
    if line_width == 0 {
        // 不换行
        writer.write_all(seq)?;
        writeln!(writer)?;
    } else {
        // 按指定宽度换行
        for (i, &base) in seq.iter().enumerate() {
            if i > 0 && i % line_width == 0 {
                writeln!(writer)?;
            }
            write!(writer, "{}", base as char)?;
        }
        writeln!(writer)?;
    }
    Ok(())
}

/// 反向互补序列
fn reverse_complement(seq: &[u8]) -> Vec<u8> {
    seq.iter()
        .rev()
        .map(|&b| LOOKUP_TABLES.comp[b as usize])
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_reverse_complement() {
        assert_eq!(reverse_complement(b"ACGT"), b"ACGT");
        assert_eq!(reverse_complement(b"AAAA"), b"TTTT");
        assert_eq!(reverse_complement(b"GCTA"), b"TAGC");
    }

    #[test]
    fn test_output_sequence_lines() -> Result<()> {
        let mut buffer = Vec::new();

        // 无换行
        output_sequence_lines(&mut buffer, b"ACGTACGT", 0)?;
        assert_eq!(buffer, b"ACGTACGT\n");

        // 换行
        buffer.clear();
        output_sequence_lines(&mut buffer, b"ACGTACGT", 4)?;
        assert_eq!(buffer, b"ACGT\nACGT\n");

        Ok(())
    }
}