seqtkrs 0.1.1

A Rust reimplementation of seqtk, a fast and lightweight tool for processing biological sequences in FASTA/FASTQ format
Documentation
use crate::core::{SeqReader, SeqRecord, LOOKUP_TABLES};
use anyhow::Result;
use clap::Args;
use rustc_hash::FxHashSet;

#[derive(Args, Debug)]
pub struct TeloArgs {
    /// 输入FASTA/FASTQ文件
    #[arg(value_name = "in.fq")]
    pub input: String,

    /// 端粒序列motif
    #[arg(short = 'm', value_name = "STR", default_value = "CCCTAA")]
    pub motif: String,

    /// 非匹配惩罚
    #[arg(short = 'p', value_name = "INT", default_value = "1")]
    pub penalty: i64,

    /// 最大drop值
    #[arg(short = 'd', value_name = "INT", default_value = "2000")]
    pub max_drop: i64,

    /// 最小得分
    #[arg(short = 's', value_name = "INT", default_value = "300")]
    pub min_score: i64,

    /// 打印得分信息
    #[arg(short = 'P')]
    pub show_profile: bool,
}

pub fn run(args: &TeloArgs) -> Result<()> {
    let penalty = args.penalty.abs();
    let len = args.motif.len();
    let mask = (1u64 << (2 * len)) - 1;

    // 构建所有旋转的端粒motif的哈希表
    let mut motif_set = FxHashSet::default();
    for i in 0..len {
        let mut x = 0u64;
        for j in 0..len {
            let c = LOOKUP_TABLES.nt6[args.motif.as_bytes()[(i + j) % len] as usize];
            if !(1..=4).contains(&c) {
                anyhow::bail!("Invalid motif sequence");
            }
            x = (x << 2) | ((c - 1) as u64);
        }
        motif_set.insert(x);
    }

    let mut reader = if args.input == "-" {
        SeqReader::from_stdin()
    } else {
        SeqReader::from_path(&args.input)?
    };

    let mut record = SeqRecord::new(Vec::new(), Vec::new());
    let mut sum_input = 0u64;
    let mut sum_telo = 0u64;

    while reader.read_next(&mut record)? {
        let mut x = [0u64; 2];
        let mut k = 0usize;
        let mut score = 0i64;
        let mut max_score = 0i64;
        let mut max_i = -1isize;
        let mut st = 0usize;

        for (i, &base) in record.seq.iter().enumerate() {
            let c = LOOKUP_TABLES.nt6[base as usize];
            if (1..=4).contains(&c) {
                x[0] = ((x[0] << 2) | ((c - 1) as u64)) & mask;
                x[1] = (x[1] >> 2) | (((4 - c) as u64) << (2 * (len - 1)));
                if k < len {
                    k += 1;
                }
                if k == len {
                    if motif_set.contains(&x[0]) || motif_set.contains(&x[1]) {
                        if score == 0 {
                            st = i + 1 - len;
                        }
                        score += 1;
                        if score > max_score {
                            max_score = score;
                            max_i = i as isize;
                        }
                    } else {
                        score -= penalty;
                        if score < 0 || max_score - score > args.max_drop {
                            if max_score >= args.min_score {
                                println!(
                                    "{}\t{}\t{}\t{}\t{}",
                                    String::from_utf8_lossy(&record.name),
                                    record.seq.len(),
                                    st,
                                    max_i + 1,
                                    max_score
                                );
                                sum_telo += max_i as u64 + 1 - st as u64;
                            }
                            score = 0;
                            max_score = 0;
                        }
                    }
                }
            } else {
                k = 0;
                if max_score >= args.min_score {
                    println!(
                        "{}\t{}\t{}\t{}\t{}",
                        String::from_utf8_lossy(&record.name),
                        record.seq.len(),
                        st,
                        max_i + 1,
                        max_score
                    );
                    sum_telo += max_i as u64 + 1 - st as u64;
                }
                score = 0;
                max_score = 0;
            }
        }

        if max_score >= args.min_score {
            println!(
                "{}\t{}\t{}\t{}\t{}",
                String::from_utf8_lossy(&record.name),
                record.seq.len(),
                st,
                max_i + 1,
                max_score
            );
            sum_telo += max_i as u64 + 1 - st as u64;
        }

        sum_input += record.seq.len() as u64;
    }

    eprintln!(
        "Total: {}\tTelomeric: {}\tFraction: {:.4}",
        sum_input,
        sum_telo,
        sum_telo as f64 / sum_input as f64
    );

    Ok(())
}