use anyhow::Result;
use fxread::{initialize_reader, initialize_stdin_reader, FastxRead, Record};
use spinoff::{spinners::Dots12, Color, Spinner, Streams};
use std::{collections::HashMap, io::stdin};
use super::{match_output_stream, write_output, write_output_with_invalid};
/// Records whose sequence has been seen exactly once so far, keyed by an owned
/// copy of the sequence bytes (`Record::seq`).
type UniqMap = HashMap<Vec<u8>, Record>;
/// Records whose sequence has been seen more than once, grouped under an owned
/// copy of the shared sequence bytes (`Record::seq`).
type NullMap = HashMap<Vec<u8>, Vec<Record>>;
/// Partition of a record stream into records with a unique sequence and
/// records whose sequence occurs more than once.
///
/// As built by `Unique::build`, a given sequence is a key of at most one of
/// the two maps: it starts in `map` and is moved to `null` the first time a
/// second record with the same sequence is encountered.
struct Unique {
/// Records whose sequence occurred exactly once in the input.
map: UniqMap,
/// Every record whose sequence occurred two or more times, grouped by sequence.
null: NullMap,
}
impl Unique {
pub fn from_reader(reader: Box<dyn FastxRead<Item = Record>>) -> Self {
let (map, null) = Self::build(reader);
Self { map, null }
}
pub fn passing_records(&self) -> impl Iterator<Item = &Record> {
self.map.values()
}
pub fn null_records(&self) -> impl Iterator<Item = &Record> {
self.null.values().flatten()
}
pub fn num_passing(&self) -> usize {
self.map.len()
}
pub fn num_null_records(&self) -> usize {
self.null.values().flatten().count()
}
pub fn num_null_sequences(&self) -> usize {
self.null.len()
}
fn build(reader: Box<dyn FastxRead<Item = Record>>) -> (UniqMap, NullMap) {
reader.fold(
(HashMap::new(), HashMap::new()),
|(mut map, mut null), x| {
if Self::in_null(&mut null, &x) {
Self::insert_to_null(&mut null, x);
}
else {
if Self::in_map(&mut map, &x) {
Self::nullify_existing(&mut null, &mut map, x);
}
else {
Self::insert_to_map(&mut map, x);
}
}
(map, null)
},
)
}
fn in_map(map: &mut UniqMap, record: &Record) -> bool {
map.contains_key(record.seq())
}
fn in_null(null: &mut NullMap, record: &Record) -> bool {
null.contains_key(record.seq())
}
fn nullify_existing(null: &mut NullMap, map: &mut UniqMap, record: Record) {
let duplicate = map.remove(record.seq()).expect("unexpected empty value");
Self::insert_to_null(null, duplicate);
Self::insert_to_null(null, record);
}
fn insert_to_null(null: &mut NullMap, record: Record) {
null.entry(record.seq().to_owned())
.or_default()
.push(record);
}
fn insert_to_map(map: &mut UniqMap, record: Record) {
map.insert(record.seq().to_owned(), record);
}
}
fn format_print(record: &Record) -> &str {
record.as_str()
}
/// Splits the input records into unique and duplicated sequences and writes
/// them out.
///
/// * `path` - input file; when `None`, records are read from standard input.
/// * `output` - destination for records with a unique sequence, passed to
///   `match_output_stream`.
/// * `null` - optional destination for records with a duplicated sequence;
///   when `None`, those records are not written anywhere.
/// * `compression_threads`, `compression_level` - forwarded unchanged to
///   `match_output_stream` for both outputs.
/// * `allow_invalid` - selects `write_output_with_invalid` instead of
///   `write_output` for both outputs.
///
/// A spinner on stderr reports progress and, once finished, the counts of
/// unique records, duplicated sequences, and affected records.
///
/// # Errors
///
/// Returns an error if the reader cannot be initialized or if an output
/// stream cannot be opened.
pub fn run(
    path: Option<String>,
    output: Option<String>,
    null: Option<String>,
    compression_threads: Option<usize>,
    compression_level: Option<usize>,
    allow_invalid: bool,
) -> Result<()> {
    // Pick the record source: a named file, or stdin when no path is given.
    let records = match path {
        Some(input) => initialize_reader(&input),
        None => initialize_stdin_reader(stdin().lock()),
    }?;

    let mut progress = Spinner::new_with_stream(
        Dots12,
        "Determining Unique Records".to_string(),
        Color::Green,
        Streams::Stderr,
    );
    let unique = Unique::from_reader(records);
    let summary = format!(
        "Found {} unique records, {} duplicate sequences with {} records affected",
        unique.num_passing(),
        unique.num_null_sequences(),
        unique.num_null_records()
    );
    progress.stop_and_persist("✔", &summary);

    // Records whose sequence occurred exactly once.
    let mut unique_writer = match_output_stream(output, compression_threads, compression_level)?;
    if allow_invalid {
        write_output_with_invalid(
            &mut unique_writer,
            Box::new(unique.passing_records()),
            &format_print,
        );
    } else {
        write_output(
            &mut unique_writer,
            Box::new(unique.passing_records()),
            &format_print,
        );
    }

    // Duplicated records are only written when a destination was requested.
    if let Some(null_path) = null {
        let mut null_writer =
            match_output_stream(Some(null_path), compression_threads, compression_level)?;
        if allow_invalid {
            write_output_with_invalid(
                &mut null_writer,
                Box::new(unique.null_records()),
                &format_print,
            );
        } else {
            write_output(
                &mut null_writer,
                Box::new(unique.null_records()),
                &format_print,
            );
        }
    }

    Ok(())
}
#[cfg(test)]
mod test {
    use super::Unique;
    use fxread::{FastaReader, FastqReader, FastxRead, Record};

    /// Three FASTA records; `seq.0` and `seq.2` share the sequence `ACT`.
    const FASTA: &[u8] = b">seq.0\nACT\n>seq.1\nACC\n>seq.2\nACT\n";

    /// The same three records in FASTQ form.
    const FASTQ: &[u8] = b"@seq.0\nACT\n+\n123\n@seq.1\nACC\n+\n123\n@seq.2\nACT\n+\n123\n";

    fn fasta_reader() -> Box<dyn FastxRead<Item = Record>> {
        Box::new(FastaReader::new(FASTA))
    }

    fn fastq_reader() -> Box<dyn FastxRead<Item = Record>> {
        Box::new(FastqReader::new(FASTQ))
    }

    /// Both fixtures hold one unique record and one sequence duplicated twice.
    fn assert_expected_counts(unique: &Unique) {
        assert_eq!(unique.num_null_records(), 2);
        assert_eq!(unique.num_null_sequences(), 1);
        assert_eq!(unique.num_passing(), 1);
    }

    #[test]
    fn unique_fasta() {
        let unique = Unique::from_reader(fasta_reader());
        assert_expected_counts(&unique);
    }

    #[test]
    fn unique_fastq() {
        let unique = Unique::from_reader(fastq_reader());
        assert_expected_counts(&unique);
    }
}