seqtkrs 0.1.1

A Rust reimplementation of seqtk, a fast and lightweight tool for processing biological sequences in FASTA/FASTQ format
Documentation
use crate::core::{SeqReader, SeqRecord, SeqWriter, LOOKUP_TABLES};
use anyhow::Result;
use clap::Args;

#[derive(Args, Debug)]
pub struct CutnArgs {
    /// 输入FASTA/FASTQ文件
    #[arg(value_name = "in.fa")]
    pub input: String,

    /// N区域的最小大小
    #[arg(short = 'n', value_name = "INT", default_value = "1000")]
    pub min_n_tract: usize,

    /// 非N碱基的惩罚值
    #[arg(short = 'p', value_name = "INT", default_value = "10")]
    pub non_n_penalty: i64,

    /// 仅打印gap,不输出序列
    #[arg(short = 'g')]
    pub gap_only: bool,
}

fn find_next_cut(
    seq: &[u8],
    k: usize,
    min_n_tract: usize,
    non_n_penalty: i64,
) -> Option<(usize, usize, usize)> {
    let mut pos = k;
    while pos < seq.len() {
        if LOOKUP_TABLES.nt16[seq[pos] as usize] == 15 {
            let mut score = 0i64;
            let mut max_score = -1i64;
            let mut end = 0usize;

            for (i, &base) in seq.iter().enumerate().skip(pos) {
                if LOOKUP_TABLES.nt16[base as usize] == 15 {
                    score += 1;
                } else {
                    score -= non_n_penalty;
                }
                if score < 0 {
                    break;
                }
                if score > max_score {
                    max_score = score;
                    end = i;
                }
            }

            let mut score = 0i64;
            let mut max_score = -1i64;
            let mut begin = 0usize;

            for i in (0..=end).rev() {
                if LOOKUP_TABLES.nt16[seq[i] as usize] == 15 {
                    score += 1;
                } else {
                    score -= non_n_penalty;
                }
                if score < 0 {
                    break;
                }
                if score > max_score {
                    max_score = score;
                    begin = i;
                }
            }

            if end + 1 >= begin && end + 1 - begin >= min_n_tract {
                return Some((begin, end + 1, end + 1));
            }
            pos = end + 1;
        } else {
            pos += 1;
        }
    }
    None
}

pub fn run(args: &CutnArgs) -> Result<()> {
    let mut reader = if args.input == "-" {
        SeqReader::from_stdin()
    } else {
        SeqReader::from_path(&args.input)?
    };

    let mut writer = SeqWriter::to_stdout().with_line_width(60);
    let mut record = SeqRecord::new(Vec::new(), Vec::new());

    while reader.read_next(&mut record)? {
        let mut k = 0;
        while let Some((begin, end, next_k)) =
            find_next_cut(&record.seq, k, args.min_n_tract, args.non_n_penalty)
        {
            if begin != 0 {
                if args.gap_only {
                    println!(
                        "{}\t{}\t{}",
                        String::from_utf8_lossy(&record.name),
                        begin,
                        end
                    );
                } else {
                    let mut sub_record = SeqRecord::new(
                        format!(
                            "{}:{}-{}",
                            String::from_utf8_lossy(&record.name),
                            k + 1,
                            begin
                        )
                        .into_bytes(),
                        record.seq[k..begin].to_vec(),
                    );
                    if let Some(qual) = &record.qual {
                        sub_record.qual = Some(qual[k..begin].to_vec());
                    }
                    writer.write_record(&sub_record)?;
                }
            }
            k = next_k;
        }
        if !args.gap_only && k < record.seq.len() {
            let mut sub_record = SeqRecord::new(
                format!(
                    "{}:{}-{}",
                    String::from_utf8_lossy(&record.name),
                    k + 1,
                    record.seq.len()
                )
                .into_bytes(),
                record.seq[k..].to_vec(),
            );
            if let Some(qual) = &record.qual {
                sub_record.qual = Some(qual[k..].to_vec());
            }
            writer.write_record(&sub_record)?;
        }
    }

    writer.flush()?;
    Ok(())
}