use anyhow::Result;
use clap::Args;
use rustc_hash::FxHashMap;
use crate::core::tables::LOOKUP_TABLES;
use crate::core::{SeqReader, SeqRecord};
use crate::utils::region::{parse_regions, RegionList};
#[derive(Args, Debug)]
pub struct CompArgs {
#[arg(value_name = "in.fa")]
pub input: Option<String>,
#[arg(short = 'u')]
pub upper_only: bool,
#[arg(short = 'r', value_name = "FILE")]
pub region_file: Option<String>,
}
#[derive(Debug, Default)]
struct CompStats {
bases: [u64; 4],
ambig_2: u64,
ambig_3: u64,
ambig_4: u64,
cpg: u64,
tv: u64,
ts: u64,
cpg_ts: u64,
}
pub fn run(args: &CompArgs) -> Result<()> {
let regions = if let Some(ref region_file) = args.region_file {
Some(parse_regions(region_file)?)
} else {
None
};
let input = args.input.as_deref().unwrap_or("-");
let mut reader = if input == "-" {
SeqReader::from_stdin()
} else {
SeqReader::from_path(input)?
};
let mut record = SeqRecord::new(Vec::new(), Vec::new());
while reader.read_next(&mut record)? {
if let Some(ref region_map) = regions {
process_with_regions(&record, region_map, args.upper_only)?;
} else {
let stats = compute_composition(&record.seq, 0, record.seq.len(), args.upper_only);
print_stats(&record.name, None, record.seq.len(), &stats);
}
}
Ok(())
}
fn process_with_regions(
record: &SeqRecord,
region_map: &FxHashMap<Vec<u8>, RegionList>,
upper_only: bool,
) -> Result<()> {
if let Some(region_list) = region_map.get(&record.name) {
for &(start, end) in ®ion_list.regions {
let start = start as usize;
let end = end as usize;
if start >= record.seq.len() {
continue;
}
let end = end.min(record.seq.len());
let stats = compute_composition(&record.seq, start, end, upper_only);
print_stats(&record.name, Some((start, end)), end - start, &stats);
}
}
Ok(())
}
const NT16_TO_4: [u8; 16] = [4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4];
fn compute_composition(seq: &[u8], start: usize, end: usize, upper_only: bool) -> CompStats {
let mut stats = CompStats::default();
if start >= end || end > seq.len() {
return stats;
}
let prev_base = if start > 0 {
seq[start - 1]
} else {
b'a' };
let mut prev_code = LOOKUP_TABLES.nt16[prev_base as usize];
for i in start..end {
let base = seq[i];
let code = LOOKUP_TABLES.nt16[base as usize];
let bitcnt = LOOKUP_TABLES.bitcnt[code as usize];
let mut is_cpg = false;
if code == 2 || code == 10 {
if i + 1 < end {
let next_code = LOOKUP_TABLES.nt16[seq[i + 1] as usize];
if next_code == 4 || next_code == 5 {
is_cpg = true;
}
}
} else if code == 4 || code == 5 {
if prev_code == 2 || prev_code == 10 {
is_cpg = true;
}
}
if !upper_only || base.is_ascii_uppercase() {
if bitcnt > 1 {
stats.ambig_2 += (bitcnt == 2) as u64;
stats.ambig_3 += (bitcnt == 3) as u64;
stats.ambig_4 += (bitcnt == 4) as u64;
} else if bitcnt == 1 {
let nt4 = NT16_TO_4[code as usize];
if nt4 < 4 {
stats.bases[nt4 as usize] += 1;
}
}
if code == 10 || code == 5 {
stats.ts += 1;
} else if bitcnt == 2 {
stats.tv += 1;
}
if is_cpg {
stats.cpg += 1;
if code == 10 || code == 5 {
stats.cpg_ts += 1;
}
}
}
prev_code = code;
}
stats
}
fn print_stats(name: &[u8], region: Option<(usize, usize)>, length: usize, stats: &CompStats) {
print!("{}", String::from_utf8_lossy(name));
if let Some((start, end)) = region {
print!("\t{}\t{}", start, end);
} else {
print!("\t{}", length);
}
println!(
"\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
stats.bases[0], stats.bases[1], stats.bases[2], stats.bases[3], stats.ambig_2, stats.ambig_3, stats.ambig_4, stats.cpg, stats.tv, stats.ts, stats.cpg_ts );
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_compute_composition_simple() {
let seq = b"ACGT";
let stats = compute_composition(seq, 0, seq.len(), false);
assert_eq!(stats.bases[0], 1); assert_eq!(stats.bases[1], 1); assert_eq!(stats.bases[2], 1); assert_eq!(stats.bases[3], 1); }
#[test]
fn test_compute_composition_cpg() {
let seq = b"ACGT";
let stats = compute_composition(seq, 0, seq.len(), false);
assert_eq!(stats.cpg, 2); }
#[test]
fn test_compute_composition_upper_only() {
let seq = b"ACGTacgt";
let stats = compute_composition(seq, 0, seq.len(), true);
assert_eq!(stats.bases[0], 1); assert_eq!(stats.bases[1], 1); assert_eq!(stats.bases[2], 1); assert_eq!(stats.bases[3], 1); }
#[test]
fn test_compute_composition_ambiguous() {
let seq = b"ACGTRYN"; let stats = compute_composition(seq, 0, seq.len(), false);
assert_eq!(stats.ambig_2, 2);
assert_eq!(stats.ts, 2);
}
}