seqtkrs 0.1.1

A Rust reimplementation of seqtk, a fast and lightweight tool for processing biological sequences in FASTA/FASTQ format
Documentation
use crate::core::{SeqReader, SeqRecord, LOOKUP_TABLES};
use anyhow::Result;
use clap::Args;

#[derive(Args, Debug)]
pub struct GcArgs {
    /// 输入FASTA文件
    #[arg(value_name = "in.fa")]
    pub input: String,

    /// 识别高AT区域而不是高GC区域
    #[arg(short = 'w')]
    pub at_mode: bool,

    /// 最小GC(或AT)比例
    #[arg(short = 'f', value_name = "FLOAT", default_value = "0.6")]
    pub min_frac: f64,

    /// 最小区域长度
    #[arg(short = 'l', value_name = "INT", default_value = "20")]
    pub min_length: usize,

    /// X-dropoff阈值
    #[arg(short = 'x', value_name = "FLOAT", default_value = "10.0")]
    pub xdropoff: f64,
}

pub fn run(args: &GcArgs) -> Result<()> {
    let mut reader = if args.input == "-" {
        SeqReader::from_stdin()
    } else {
        SeqReader::from_path(&args.input)?
    };

    let q = (1.0 - args.min_frac) / args.min_frac;
    let mut record = SeqRecord::new(Vec::new(), Vec::new());

    while reader.read_next(&mut record)? {
        let mut start = 0usize;
        let mut max_i = 0usize;
        let mut n_hits = 0i64;
        let mut start_hits = 0i64;
        let mut max_hits = 0i64;
        let mut sc = 0.0f64;
        let mut max = 0.0f64;

        for (i, &base) in record.seq.iter().enumerate() {
            let c = LOOKUP_TABLES.nt16[base as usize];
            let hit = if args.at_mode {
                c == 1 || c == 8 || c == 9 // A or T or W
            } else {
                c == 2 || c == 4 || c == 6 // C or G or S
            };

            if hit {
                n_hits += 1;
            }

            if hit {
                if sc == 0.0 {
                    start = i;
                    start_hits = n_hits;
                }
                sc += q;
                if sc > max {
                    max = sc;
                    max_i = i;
                    max_hits = n_hits;
                }
            } else if sc > 0.0 {
                sc -= 1.0;
                if sc < 0.0 || max - sc > args.xdropoff {
                    if max_i + 1 - start >= args.min_length {
                        println!(
                            "{}\t{}\t{}\t{}",
                            String::from_utf8_lossy(&record.name),
                            start,
                            max_i + 1,
                            max_hits - start_hits + 1
                        );
                    }
                    sc = 0.0;
                    max = 0.0;
                }
            }
        }

        if max > 0.0 && max_i + 1 - start >= args.min_length {
            println!(
                "{}\t{}\t{}\t{}",
                String::from_utf8_lossy(&record.name),
                start,
                max_i + 1,
                max_hits - start_hits + 1
            );
        }
    }

    Ok(())
}