use std::io::Write;
use std::str;
use hashbrown::hash_set::Entry::*;
use hashbrown::HashSet;
use rayon::prelude::*;
use noodles_vcf::{
self as vcf,
header::record::value::{map::Contig, Map},
record::{
alternate_bases::Allele,
genotypes::{keys::key, sample::Value, Keys},
reference_bases::Base,
AlternateBases, Genotypes, Position,
},
};
extern crate needletail;
use ndarray::{s, Array2, ArrayView};
use needletail::{
parse_fastx_file,
parser::{write_fasta, Format},
};
use super::QualFilter;
pub mod aln_writer;
use crate::ska_ref::aln_writer::AlnWriter;
pub mod idx_check;
use crate::ska_ref::idx_check::IdxCheck;
use crate::merge_ska_dict::MergeSkaDict;
use crate::ska_dict::bit_encoding::{UInt, RC_IUPAC};
use crate::ska_dict::split_kmer::SplitKmer;
#[derive(Debug, Clone)]
pub struct RefKmer<IntT> {
pub kmer: IntT,
pub base: u8,
pub pos: usize,
pub chrom: usize,
pub rc: bool,
}
pub struct RefSka<IntT>
where
IntT: for<'a> UInt<'a>,
{
k: usize,
split_kmer_pos: Vec<RefKmer<IntT>>,
ambig_mask: bool,
chrom_names: Vec<String>,
seq: Vec<Vec<u8>>,
repeat_coors: Vec<usize>,
mapped_pos: Vec<(usize, usize)>,
mapped_variants: Array2<u8>,
mapped_names: Vec<String>,
}
#[inline]
fn u8_to_base(ref_base: u8) -> Base {
match ref_base {
b'A' => Base::A,
b'C' => Base::C,
b'G' => Base::G,
b'T' => Base::T,
_ => Base::N,
}
}
#[inline]
fn gt_keys() -> Keys {
Keys::try_from(vec![key::GENOTYPE]).unwrap()
}
impl<IntT> RefSka<IntT>
where
IntT: for<'a> UInt<'a>,
{
fn is_mapped(&self) -> bool {
self.mapped_variants.nrows() > 0
}
pub fn new(k: usize, filename: &str, rc: bool, ambig_mask: bool, repeat_mask: bool) -> Self {
if !(5..=63).contains(&k) || k.is_multiple_of(2) {
panic!("Invalid k-mer length");
}
let mut split_kmer_pos = Vec::new();
let mut seq = Vec::new();
let mut chrom_names = Vec::new();
let mut singles = HashSet::new();
let mut repeats = HashSet::new();
let mut reader =
parse_fastx_file(filename).unwrap_or_else(|_| panic!("Invalid path/file: {filename}",));
let mut chrom = 0;
while let Some(record) = reader.next() {
let seqrec = record.expect("Invalid FASTA record");
if seqrec.format() == Format::Fastq {
panic!("Cannot create reference from FASTQ files");
}
let chrom_name = str::from_utf8(seqrec.id())
.unwrap()
.split_whitespace()
.next()
.unwrap();
chrom_names.push(chrom_name.to_string());
split_kmer_pos.reserve(seqrec.num_bases());
let kmer_opt = SplitKmer::new(
seqrec.seq(),
seqrec.num_bases(),
None,
k,
rc,
0,
QualFilter::NoFilter,
false,
);
if let Some(mut kmer_it) = kmer_opt {
let (kmer, base, rc) = kmer_it.get_curr_kmer();
let mut pos = kmer_it.get_middle_pos();
split_kmer_pos.push(RefKmer {
kmer,
base,
pos,
chrom,
rc,
});
if repeat_mask {
Self::track_repeats(kmer, &mut singles, &mut repeats);
}
while let Some((kmer, base, rc)) = kmer_it.get_next_kmer() {
pos = kmer_it.get_middle_pos();
split_kmer_pos.push(RefKmer {
kmer,
base,
pos,
chrom,
rc,
});
if repeat_mask {
Self::track_repeats(kmer, &mut singles, &mut repeats);
}
}
}
chrom += 1;
seq.push(seqrec.seq().to_vec());
}
if split_kmer_pos.is_empty() {
panic!("{filename} has no valid sequence");
}
let mut repeat_coors = Vec::new();
if repeat_mask {
let half_split_len = (k - 1) / 2;
let mut last_chrom = 0;
let mut last_end = 0;
let mut chrom_offset = 0;
for sk in &split_kmer_pos {
if sk.chrom > last_chrom {
chrom_offset += seq[last_chrom].len();
last_chrom = sk.chrom;
}
if repeats.contains(&sk.kmer) {
let start = sk.pos - half_split_len + chrom_offset;
let end = sk.pos + half_split_len + chrom_offset;
let range = if start > last_end || start == 0 {
std::ops::Range {
start,
end: end + 1,
}
} else {
std::ops::Range {
start: last_end + 1,
end: end + 1,
}
};
for pos in range {
repeat_coors.push(pos);
}
last_chrom = sk.chrom;
last_end = end;
}
}
log::info!(
"Masking {} unique split k-mer repeats spanning {} bases",
repeats.len(),
repeat_coors.len()
);
}
Self {
k,
seq,
ambig_mask,
chrom_names,
split_kmer_pos,
repeat_coors,
mapped_pos: Vec::new(),
mapped_variants: Array2::zeros((0, 0)),
mapped_names: Vec::new(),
}
}
fn track_repeats(kmer: IntT, singles: &mut HashSet<IntT>, repeats: &mut HashSet<IntT>) {
if let Vacant(rep_kmer) = repeats.entry(kmer) {
match singles.entry(kmer) {
Vacant(single_entry) => single_entry.insert(),
Occupied(_) => {
rep_kmer.insert();
}
}
}
}
pub fn map(&mut self, ska_dict: &MergeSkaDict<IntT>) {
if self.k != ska_dict.kmer_len() {
panic!(
"K-mer sizes do not match ref:{} skf:{}",
self.k,
ska_dict.ksize()
);
}
self.mapped_names = ska_dict.names().clone();
self.mapped_variants = Array2::zeros((0, ska_dict.nsamples()));
for ref_k in &self.split_kmer_pos {
if ska_dict.kmer_dict().contains_key(&ref_k.kmer) {
let seq_char: Vec<u8> = ska_dict.kmer_dict()[&ref_k.kmer]
.iter()
.map(|x| match ref_k.rc {
true => RC_IUPAC[*x as usize],
false => *x,
})
.collect();
self.mapped_variants
.push_row(ArrayView::from(&seq_char))
.unwrap();
self.mapped_pos.push((ref_k.chrom, ref_k.pos));
}
}
}
pub fn ksize(&self) -> usize {
self.split_kmer_pos.len()
}
pub fn kmer_iter(&self) -> impl Iterator<Item = IntT> + '_ {
self.split_kmer_pos.iter().map(|k| k.kmer)
}
fn pseudoalignment(&self, threads: usize) -> Vec<AlnWriter<'_>> {
if !self.is_mapped() {
panic!("No split k-mers mapped to reference");
}
if self.ambig_mask {
log::info!("Masking any ambiguous bases (non-A/C/G/T/U/N/-) with 'N'");
}
let mut seq_writers =
vec![
AlnWriter::new(&self.seq, self.k, &self.repeat_coors, self.ambig_mask);
self.mapped_names.len()
];
rayon::ThreadPoolBuilder::new()
.num_threads(threads)
.build_global()
.unwrap();
seq_writers
.par_iter_mut()
.enumerate()
.for_each(|(idx, seq)| {
let sample_vars = self.mapped_variants.slice(s![.., idx]);
for ((mapped_chrom, mapped_pos), base) in
self.mapped_pos.iter().zip(sample_vars.iter())
{
if *base != b'-' {
seq.write_split_kmer(*mapped_pos, *mapped_chrom, *base);
}
}
seq.finalise();
});
seq_writers
}
pub fn write_aln<W: Write>(
&self,
f: &mut W,
threads: usize,
) -> Result<(), needletail::errors::ParseError> {
if self.chrom_names.len() > 1 {
log::warn!(
"Reference contained multiple contigs, in the output they will be concatenated"
);
}
let alignments = self.pseudoalignment(threads);
log::info!("Writing alignment to file");
for (sample_name, mut aligned_seq) in self.mapped_names.iter().zip(alignments) {
write_fasta(
sample_name.as_bytes(),
aligned_seq.get_seq(),
f,
needletail::parser::LineEnding::Unix,
)?;
}
Ok(())
}
pub fn write_vcf<W: Write>(&self, f: &mut W, threads: usize) -> Result<(), std::io::Error> {
if !self.is_mapped() {
panic!("No split k-mers mapped to reference");
}
let alignments = self.pseudoalignment(threads);
log::info!("Converting to VCF");
let mut variants = Array2::zeros((0, alignments[0].total_size()));
for mut seq in alignments {
variants.push_row(ArrayView::from(seq.get_seq())).unwrap();
}
let var_t = variants.t();
let mut var_t_owned = Array2::zeros(var_t.raw_dim());
var_t_owned.assign(&var_t);
let mut writer = vcf::Writer::new(f);
let mut header_builder = vcf::Header::builder();
for contig in &self.chrom_names {
header_builder = header_builder.add_contig(
contig.parse().expect("Could not add contig to header"),
Map::<Contig>::new(),
);
}
for name in &self.mapped_names {
header_builder = header_builder.add_sample_name(name);
}
let header = header_builder.build();
writer.write_header(&header)?;
let keys = gt_keys();
let idx_map = IdxCheck::new(&self.seq);
for (sample_variants, (map_chrom, map_pos)) in var_t_owned.outer_iter().zip(idx_map.iter())
{
let ref_base = self.seq[map_chrom][map_pos];
let ref_allele = u8_to_base(ref_base);
let mut genotype_vec = Vec::with_capacity(var_t_owned.ncols());
let mut alt_bases: Vec<Base> = Vec::new();
let mut variant = false;
for mapped_base in sample_variants {
let gt = if *mapped_base == ref_base {
String::from("0")
} else if *mapped_base == b'-' {
variant = true;
String::from(".")
} else {
variant = true;
let alt_base = u8_to_base(*mapped_base);
if !alt_bases.contains(&alt_base) {
alt_bases.push(alt_base);
}
(alt_bases.iter().position(|&r| r == alt_base).unwrap() + 1).to_string()
};
genotype_vec.push(vec![Some(Value::String(gt))]);
}
if variant {
let genotypes = Genotypes::new(keys.clone(), genotype_vec);
let alt_alleles: Vec<Allele> =
alt_bases.iter().map(|a| Allele::Bases(vec![*a])).collect();
let record = vcf::Record::builder()
.set_chromosome(
self.chrom_names[map_chrom]
.parse()
.expect("Invalid chromosome name"),
)
.set_position(Position::from(map_pos + 1))
.add_reference_base(ref_allele)
.set_alternate_bases(AlternateBases::from(alt_alleles))
.set_genotypes(genotypes)
.build()
.expect("Could not construct record");
writer.write_record(&header, &record)?;
}
}
Ok(())
}
}