seqtkrs 0.1.1

A Rust reimplementation of seqtk, a fast and lightweight tool for processing biological sequences in FASTA/FASTQ format
Documentation
use crate::core::{SeqReader, SeqRecord, LOOKUP_TABLES};
use anyhow::Result;
use clap::Args;

#[derive(Args, Debug)]
pub struct MergefaArgs {
    /// 第一个输入FASTA文件
    #[arg(value_name = "in1.fa")]
    pub input1: String,

    /// 第二个输入FASTA文件
    #[arg(value_name = "in2.fa")]
    pub input2: String,

    /// 质量阈值
    #[arg(short = 'q', value_name = "INT", default_value = "0")]
    pub qual_threshold: u8,

    /// 取交集
    #[arg(short = 'i')]
    pub intersect: bool,

    /// 当一个碱基为N时转为小写
    #[arg(short = 'm')]
    pub mask: bool,

    /// 从杂合位点随机选择等位基因
    #[arg(short = 'r')]
    pub rand_het: bool,

    /// 抑制输入中的杂合位点
    #[arg(short = 's')]
    pub suppress_het: bool,
}

// nt16_rev_table: 4位编码 -> ASCII
const NT16_REV: [u8; 16] = [
    b'=', b'A', b'C', b'M', b'G', b'R', b'S', b'V', b'T', b'W', b'Y', b'H', b'K', b'D', b'B', b'N',
];

pub fn run(args: &MergefaArgs) -> Result<()> {
    if args.mask && args.intersect {
        anyhow::bail!("'-i' and '-m' cannot be applied at the same time");
    }

    let mut reader1 = if args.input1 == "-" {
        SeqReader::from_stdin()
    } else {
        SeqReader::from_path(&args.input1)?
    };

    let mut reader2 = SeqReader::from_path(&args.input2)?;

    let mut record1 = SeqRecord::new(Vec::new(), Vec::new());
    let mut record2 = SeqRecord::new(Vec::new(), Vec::new());
    let mut cnt = [0u64; 5];

    while reader1.read_next(&mut record1)? {
        if !reader2.read_next(&mut record2)? {
            break;
        }

        if record1.name != record2.name {
            eprintln!(
                "不同的序列名称: {} != {}",
                String::from_utf8_lossy(&record1.name),
                String::from_utf8_lossy(&record2.name)
            );
        }

        if record1.seq.len() != record2.seq.len() {
            eprintln!(
                "不等的序列长度: {} != {}",
                record1.seq.len(),
                record2.seq.len()
            );
        }

        let min_l = record1.seq.len().min(record2.seq.len());
        print!(">{}", String::from_utf8_lossy(&record1.name));

        for i in 0..min_l {
            let c = [record1.seq[i], record2.seq[i]];
            let is_upper = if args.qual_threshold > 0 {
                // 质量值检查逻辑(如果有质量值)
                c[0].is_ascii_uppercase() && c[1].is_ascii_uppercase()
            } else {
                c[0].is_ascii_uppercase() && c[1].is_ascii_uppercase()
            };

            let mut c_nt16 = [
                LOOKUP_TABLES.nt16[c[0] as usize],
                LOOKUP_TABLES.nt16[c[1] as usize],
            ];
            if c_nt16[0] == 0 {
                c_nt16[0] = 15;
            }
            if c_nt16[1] == 0 {
                c_nt16[1] = 15;
            }

            let b = [
                LOOKUP_TABLES.bitcnt[c_nt16[0] as usize],
                LOOKUP_TABLES.bitcnt[c_nt16[1] as usize],
            ];

            let mut is_upper_final = is_upper;

            if is_upper {
                if b[0] == 1 && b[1] == 1 {
                    if c_nt16[0] == c_nt16[1] {
                        cnt[0] += 1;
                    } else {
                        cnt[1] += 1;
                    }
                } else if b[0] == 1 && b[1] == 2 {
                    cnt[2] += 1;
                } else if b[0] == 2 && b[1] == 1 {
                    cnt[3] += 1;
                } else if b[0] == 2 && b[1] == 2 {
                    cnt[4] += 1;
                }
            }

            if args.suppress_het && (b[0] > 1 || b[1] > 1) {
                is_upper_final = false;
            }

            if args.intersect {
                c_nt16[0] &= c_nt16[1];
                if c_nt16[0] == 0 {
                    is_upper_final = false;
                }
            } else if args.mask {
                if c_nt16[0] == 15 || c_nt16[1] == 15 {
                    is_upper_final = false;
                }
                c_nt16[0] &= c_nt16[1];
                if c_nt16[0] == 0 {
                    is_upper_final = false;
                }
            } else if args.rand_het {
                use rand::Rng;
                let mut rng = rand::rng();

                if b[0] == 1 && b[1] == 1 {
                    c_nt16[0] |= c_nt16[1];
                } else if ((b[0] == 1 && b[1] == 2) || (b[0] == 2 && b[1] == 1))
                    && (c_nt16[0] & c_nt16[1]) != 0
                {
                    c_nt16[0] = if rng.random::<bool>() {
                        c_nt16[0] & c_nt16[1]
                    } else {
                        c_nt16[0] | c_nt16[1]
                    };
                } else if b[0] == 2 && b[1] == 2 && c_nt16[0] == c_nt16[1] {
                    if rng.random::<bool>() {
                        if rng.random::<bool>() {
                            for bit in [8, 4, 2, 1] {
                                if c_nt16[0] & bit != 0 {
                                    c_nt16[0] &= bit;
                                    break;
                                }
                            }
                        } else {
                            for bit in [1, 2, 4, 8] {
                                if c_nt16[0] & bit != 0 {
                                    c_nt16[0] &= bit;
                                    break;
                                }
                            }
                        }
                    }
                } else {
                    is_upper_final = false;
                }
            } else {
                c_nt16[0] |= c_nt16[1];
            }

            let mut out_base = NT16_REV[c_nt16[0] as usize];
            if !is_upper_final {
                out_base = out_base.to_ascii_lowercase();
            }

            if i % 60 == 0 {
                println!();
            }
            print!("{}", out_base as char);
        }
        println!();
    }

    eprintln!(
        "(same,diff,hom-het,het-hom,het-het)=({},{},{},{},{})",
        cnt[0], cnt[1], cnt[2], cnt[3], cnt[4]
    );

    Ok(())
}