use crate::utils::SingletonKmers;
use bincode::{config, encode_into_std_write};
use clap::Parser;
use log::info;
use needletail::{parse_fastx_file, Sequence};
use rayon::prelude::*;
use std::{
collections::{HashMap, HashSet},
fs::File,
io::BufWriter,
};
const KMER_SIZE: u8 = 24;
const SINGLETON_KMERS: &str = "kmers.bc";
#[derive(Parser, Debug)]
pub struct BuildArgs {
pub fasta_files: Vec<String>,
#[clap(short, long, default_value = SINGLETON_KMERS)]
pub output_file: String,
#[clap(short, long, default_value_t = KMER_SIZE)]
pub kmer_size: u8,
}
struct SeqRecord {
pub id: String,
pub seq: Vec<u8>,
}
pub fn build(fasta_files: &Vec<String>, output_file: &str, kmer_size: u8) {
let all_records = fasta_files
.par_iter()
.flat_map(|fasta_file| get_sequences(fasta_file))
.collect::<Vec<_>>();
let all_sets = all_records
.par_iter()
.map(|seq_record| get_kmers(seq_record, kmer_size))
.collect::<Vec<_>>();
let mut kmer_counts = HashMap::new();
for kmer_set in all_sets.iter() {
for kmer in kmer_set.iter() {
*kmer_counts.entry(*kmer).or_insert(0) += 1;
}
}
info!("Total unique kmers: {}", kmer_counts.len());
let singleton_kmers = kmer_counts
.into_iter()
.filter(|(_, count)| *count == 1)
.map(|(kmer, _)| kmer)
.collect::<HashSet<_>>();
info!("Singleton kmers: {}", singleton_kmers.len());
let ids = all_records
.iter()
.map(|seq_record| seq_record.id.clone())
.collect::<Vec<_>>();
let kmers = (all_records, all_sets)
.into_par_iter()
.map(|(seq_record, kmer_set)| {
let singleton_kmers_per_file = kmer_set
.intersection(&singleton_kmers)
.cloned()
.collect::<Vec<_>>();
info!(
"{}: {} singleton kmers found",
seq_record.id,
singleton_kmers_per_file.len()
);
singleton_kmers_per_file
})
.collect::<Vec<_>>();
let singleton_kmers = SingletonKmers {
kmer_size,
ids,
kmers,
};
let mut writer = BufWriter::new(File::create(output_file).unwrap());
encode_into_std_write(&singleton_kmers, &mut writer, config::standard())
.expect("Failed to serialize");
info!("Singleton kmers written to `{}`", output_file);
}
fn get_sequences(fasta_file: &str) -> Vec<SeqRecord> {
let mut reader = parse_fastx_file(fasta_file).expect("valid FASTA file");
let mut records = Vec::new();
while let Some(record) = reader.next() {
let record = record.expect("valid record");
let id = String::from_utf8(record.id().to_vec()).expect("valid UTF-8");
let seq = record.normalize(false).to_vec();
records.push(SeqRecord { id, seq });
}
info!("{}: {} records found", fasta_file, records.len());
records
}
fn get_kmers(seq_record: &SeqRecord, kmer_size: u8) -> HashSet<u64> {
let mut kmer_set = HashSet::new();
for (_, kmer, _) in seq_record.seq.bit_kmers(kmer_size, true) {
kmer_set.insert(kmer.0);
}
info!("{}: {} kmers found", seq_record.id, kmer_set.len());
kmer_set
}
#[cfg(test)]
mod tests {
use super::*;
use bincode::{config, decode_from_std_read};
use std::fs::{self, File};
use std::io::BufReader;
use std::path::{Path, PathBuf};
use tempfile::tempdir;
fn write_fasta(dir: &Path, name: &str, contents: &str) -> PathBuf {
let p = dir.join(name);
fs::write(&p, contents).expect("write fasta");
p
}
#[test]
fn test_get_kmers_canonicalization() {
let rec = SeqRecord {
id: "x".to_string(),
seq: b"ACGCGT".to_vec(),
};
let set = get_kmers(&rec, 3);
assert_eq!(
set.len(),
2,
"canonicalized 3-mers should collapse RC pairs"
);
}
#[test]
fn test_build_finds_singletons_and_writes_bincode() {
let tmp = tempdir().expect("tmpdir");
let out = tmp.path().join("kmers.bc");
let f1 = write_fasta(
tmp.path(),
"a.fa",
r#">r1
ACGTAC
>r2
ACGTAA
"#,
);
let f2 = write_fasta(
tmp.path(),
"b.fa",
r#">r3
ACGTAC
"#,
);
let fasta_files = vec![
f1.to_string_lossy().to_string(),
f2.to_string_lossy().to_string(),
];
build(&fasta_files, out.to_str().unwrap(), 3);
assert!(out.exists(), "output bincode file should be written");
let meta = fs::metadata(&out).expect("stat out");
assert!(meta.len() > 0, "output file should be non-empty");
let mut reader = BufReader::new(File::open(&out).expect("open out"));
let decoded: SingletonKmers =
decode_from_std_read(&mut reader, config::standard()).expect("bincode decode");
assert_eq!(decoded.kmer_size, 3, "kmer_size should be what we passed");
assert_eq!(
decoded.ids.len(),
decoded.kmers.len(),
"ids and kmers should be aligned"
);
assert_eq!(
decoded.ids.len(),
3,
"we had 3 sequence records across the two files"
);
let total_singletons: usize = decoded.kmers.iter().map(|v| v.len()).sum();
assert_eq!(
total_singletons, 1,
"there should be exactly one singleton k-mer overall"
);
let mut found_r2 = false;
for (id, kmers) in decoded.ids.iter().zip(decoded.kmers.iter()) {
if id == "r2" {
found_r2 = true;
assert_eq!(kmers.len(), 1, "r2 should have the only singleton (TAA)");
} else {
assert!(
kmers.is_empty(),
"records other than r2 should have zero singletons"
);
}
}
assert!(found_r2, "decoded ids should include r2");
}
#[test]
fn test_buildargs_parse_defaults() {
let args = BuildArgs::parse_from(["prog", "a.fa", "b.fa"]);
assert_eq!(args.output_file, SINGLETON_KMERS);
assert_eq!(args.kmer_size, KMER_SIZE);
assert_eq!(args.fasta_files.len(), 2);
assert!(args.fasta_files.contains(&"a.fa".to_string()));
assert!(args.fasta_files.contains(&"b.fa".to_string()));
}
}