seqtkrs 0.1.1

A Rust reimplementation of seqtk, a fast and lightweight tool for processing biological sequences in FASTA/FASTQ format
Documentation
use crate::core::{SeqReader, SeqRecord, LOOKUP_TABLES};
use anyhow::Result;
use clap::Args;

#[derive(Args, Debug)]
pub struct KfreqArgs {
    /// k-mer序列
    #[arg(value_name = "kmer")]
    pub kmer: String,

    /// 输入FASTA文件
    #[arg(value_name = "in.fa")]
    pub input: String,
}

pub fn run(args: &KfreqArgs) -> Result<()> {
    let len = args.kmer.len();
    let mut kmer = 0u64;

    // 解析k-mer
    for &base in args.kmer.as_bytes() {
        let c = LOOKUP_TABLES.nt6[base as usize];
        if !(1..=4).contains(&c) {
            anyhow::bail!("Invalid k-mer sequence");
        }
        kmer = (kmer << 2) | ((c - 1) as u64);
    }

    let mask = (1u64 << (2 * len)) - 1;

    // 构建邻居k-mer集合
    let mut neighbors = vec![false; 1 << (2 * len)];
    for i in 0..len {
        let x = kmer & !(3u64 << (2 * i));
        for j in 0..4 {
            neighbors[(x | (j << (2 * i))) as usize] = true;
        }
    }

    let mut reader = if args.input == "-" {
        SeqReader::from_stdin()
    } else {
        SeqReader::from_path(&args.input)?
    };

    let mut record = SeqRecord::new(Vec::new(), Vec::new());

    while reader.read_next(&mut record)? {
        let mut x = [0u64; 2];
        let mut k = 0usize;
        let mut cnt = [0i64; 2];
        let mut cnt_nei = [0i64; 2];

        for &base in &record.seq {
            let c = LOOKUP_TABLES.nt6[base as usize];
            if (1..=4).contains(&c) {
                x[0] = ((x[0] << 2) | ((c - 1) as u64)) & mask;
                x[1] = (x[1] >> 2) | (((4 - c) as u64) << (2 * (len - 1)));
                if k < len {
                    k += 1;
                }
                if k == len {
                    if x[0] == kmer {
                        cnt[0] += 1;
                    } else if x[1] == kmer {
                        cnt[1] += 1;
                    }
                    if neighbors[x[0] as usize] {
                        cnt_nei[0] += 1;
                    } else if neighbors[x[1] as usize] {
                        cnt_nei[1] += 1;
                    }
                }
            } else {
                k = 0;
            }
        }

        let which = if cnt_nei[0] > cnt_nei[1] { 0 } else { 1 };
        println!(
            "{}\t{}\t{}\t{}\t{}",
            String::from_utf8_lossy(&record.name),
            record.seq.len(),
            if which == 0 { '+' } else { '-' },
            cnt_nei[which],
            cnt[which]
        );
    }

    Ok(())
}