use crate::core::tables::LOOKUP_TABLES;
use crate::core::{SeqReader, SeqRecord, SeqWriter};
use crate::utils::random::MersenneTwister;
use crate::utils::region::{parse_regions, RegionList};
use anyhow::{Context, Result};
use clap::Args;
use rustc_hash::FxHashMap;
#[derive(Args, Debug)]
pub struct SeqArgs {
#[arg(value_name = "in.fq")]
pub input: String,
#[arg(short = 'q', long, value_name = "INT", default_value = "0")]
pub qual_threshold: u8,
#[arg(short = 'X', long, value_name = "INT", default_value = "255")]
pub max_qual: u8,
#[arg(short = 'n', long, value_name = "CHAR")]
pub mask_char: Option<char>,
#[arg(short = 'l', long, value_name = "INT", default_value = "0")]
pub line_width: usize,
#[arg(short = 'Q', long, value_name = "INT", default_value = "33")]
pub qual_shift: i32,
#[arg(short = 's', long, value_name = "INT", default_value = "11")]
pub seed: u64,
#[arg(short = 'f', long, value_name = "FLOAT", default_value = "1.0")]
pub fraction: f64,
#[arg(short = 'M', long, value_name = "FILE")]
pub mask_regions: Option<String>,
#[arg(short = 'L', long, value_name = "INT", default_value = "0")]
pub min_length: usize,
#[arg(short = 'F', long, value_name = "CHAR")]
pub fake_qual: Option<char>,
#[arg(short = 'c', long)]
pub mask_complement: bool,
#[arg(short = 'r', long)]
pub reverse_complement: bool,
#[arg(short = 'R', long)]
pub both_strands: bool,
#[arg(short = 'A', long)]
pub force_fasta: bool,
#[arg(short = 'C', long)]
pub drop_comment: bool,
#[arg(short = 'N', long)]
pub drop_ambiguous: bool,
#[arg(short = '1', long)]
pub odd_only: bool,
#[arg(short = '2', long)]
pub even_only: bool,
#[arg(short = 'V', long)]
pub shift_quality: bool,
#[arg(short = 'U', long)]
pub uppercase: bool,
#[arg(short = 'x', long)]
pub lowercase_to_n: bool,
#[arg(short = 'S', long)]
pub strip_spaces: bool,
}
pub fn run(args: &SeqArgs) -> Result<()> {
let mut rng = MersenneTwister::with_seed(args.seed);
let mask_regions = if let Some(ref path) = args.mask_regions {
Some(parse_regions(path)?)
} else {
None
};
let mut reader = if args.input == "-" {
SeqReader::from_stdin()
} else {
SeqReader::from_path(&args.input)
.with_context(|| format!("无法打开输入文件: {}", args.input))?
};
let mut writer = SeqWriter::to_stdout();
if args.line_width > 0 {
writer = writer.with_line_width(args.line_width);
}
let mut record = SeqRecord::new(Vec::new(), Vec::new());
let mut seq_count: u64 = 0;
let qual_threshold = args.qual_threshold.saturating_add(args.qual_shift as u8);
while reader.read_next(&mut record)? {
seq_count += 1;
if record.seq.len() < args.min_length {
continue;
}
if args.fraction < 1.0 && rng.random() >= args.fraction {
continue;
}
if args.odd_only && seq_count % 2 == 0 {
continue;
}
if args.even_only && seq_count % 2 == 1 {
continue;
}
if args.strip_spaces {
strip_whitespace(&mut record);
}
if record.qual.is_some() && args.qual_threshold > 0 {
apply_quality_mask(&mut record, qual_threshold, args.max_qual, args.mask_char);
}
if args.uppercase {
for base in &mut record.seq {
*base = base.to_ascii_uppercase();
}
} else if args.lowercase_to_n {
if let Some(mask_char) = args.mask_char {
for base in &mut record.seq {
if base.is_ascii_lowercase() {
*base = mask_char as u8;
}
}
}
}
if args.force_fasta {
record.qual = None;
} else if let Some(fake_qual_char) = args.fake_qual {
if record.qual.is_none() {
let qual = vec![fake_qual_char as u8; record.seq.len()];
record.qual = Some(qual);
}
}
if args.drop_comment {
record.comment = None;
}
if let Some(ref regions) = mask_regions {
mask_sequence(&mut record, regions, args.mask_complement, args.mask_char)?;
}
if args.reverse_complement || args.both_strands {
if args.both_strands {
let mut forward = record.clone();
add_strand_suffix(&mut forward, b"+");
writer.write_record(&forward)?;
}
reverse_complement_record(&mut record);
if args.both_strands {
add_strand_suffix(&mut record, b"-");
}
}
if args.shift_quality && record.qual.is_some() && args.qual_shift != 33 {
shift_quality_values(&mut record, args.qual_shift);
}
if args.drop_ambiguous && has_ambiguous_bases(&record) {
continue;
}
writer.write_record(&record)?;
}
writer.flush()?;
Ok(())
}
fn strip_whitespace(record: &mut SeqRecord) {
if let Some(ref mut qual) = record.qual {
let mut write_pos = 0;
for read_pos in 0..record.seq.len() {
if !record.seq[read_pos].is_ascii_whitespace() {
qual[write_pos] = qual[read_pos];
write_pos += 1;
}
}
qual.truncate(write_pos);
}
record.seq.retain(|&b| !b.is_ascii_whitespace());
}
fn apply_quality_mask(
record: &mut SeqRecord,
qual_threshold: u8,
max_qual: u8,
mask_char: Option<char>,
) {
if let Some(ref qual) = record.qual {
for (seq_byte, &qual_byte) in record.seq.iter_mut().zip(qual.iter()) {
if qual_byte < qual_threshold || qual_byte > max_qual {
if let Some(mask_char) = mask_char {
*seq_byte = mask_char as u8;
} else {
*seq_byte = seq_byte.to_ascii_lowercase();
}
}
}
}
}
fn mask_sequence(
record: &mut SeqRecord,
regions: &FxHashMap<Vec<u8>, RegionList>,
mask_complement: bool,
mask_char: Option<char>,
) -> Result<()> {
if let Some(region_list) = regions.get(&record.name) {
let seq_len = record.seq.len();
let mut mask = vec![mask_complement; seq_len];
for region in region_list.iter() {
let start = region.start.min(seq_len);
let end = region.end.min(seq_len);
mask[start..end].fill(!mask_complement);
}
for (i, &should_mask) in mask.iter().enumerate() {
if should_mask {
if let Some(mask_char) = mask_char {
record.seq[i] = mask_char as u8;
} else {
record.seq[i] = record.seq[i].to_ascii_lowercase();
}
}
}
}
Ok(())
}
fn reverse_complement_record(record: &mut SeqRecord) {
let len = record.seq.len();
for i in 0..len / 2 {
let c0 = LOOKUP_TABLES.comp[record.seq[i] as usize];
let c1 = LOOKUP_TABLES.comp[record.seq[len - 1 - i] as usize];
record.seq[i] = c1;
record.seq[len - 1 - i] = c0;
}
if len % 2 == 1 {
let mid = len / 2;
record.seq[mid] = LOOKUP_TABLES.comp[record.seq[mid] as usize];
}
if let Some(ref mut qual) = record.qual {
qual.reverse();
}
}
fn add_strand_suffix(record: &mut SeqRecord, suffix: &[u8]) {
record.name.extend_from_slice(suffix);
}
fn shift_quality_values(record: &mut SeqRecord, qual_shift: i32) {
if let Some(ref mut qual) = record.qual {
let shift = 33 - qual_shift;
for q in qual.iter_mut() {
*q = (*q as i32 + shift).clamp(33, 126) as u8;
}
}
}
fn has_ambiguous_bases(record: &SeqRecord) -> bool {
const NT16_TO_4: [u8; 16] = [4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4];
for &base in &record.seq {
let nt16 = LOOKUP_TABLES.nt16[base as usize];
if NT16_TO_4[nt16 as usize] > 3 {
return true;
}
}
false
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_reverse_complement() {
let mut record = SeqRecord::new(b"seq1".to_vec(), b"ACGT".to_vec());
reverse_complement_record(&mut record);
assert_eq!(record.seq, b"ACGT");
let mut record2 = SeqRecord::new(b"seq2".to_vec(), b"AAAA".to_vec());
reverse_complement_record(&mut record2);
assert_eq!(record2.seq, b"TTTT");
}
#[test]
fn test_has_ambiguous_bases() {
let record1 = SeqRecord::new(b"seq1".to_vec(), b"ACGT".to_vec());
assert!(!has_ambiguous_bases(&record1));
let record2 = SeqRecord::new(b"seq2".to_vec(), b"ACGTN".to_vec());
assert!(has_ambiguous_bases(&record2));
let record3 = SeqRecord::new(b"seq3".to_vec(), b"ACGTR".to_vec());
assert!(has_ambiguous_bases(&record3));
}
#[test]
fn test_strip_whitespace() {
let mut record = SeqRecord::new(b"seq1".to_vec(), b"A C G T".to_vec());
strip_whitespace(&mut record);
assert_eq!(record.seq, b"ACGT");
}
}