use std::collections::HashMap;
use std::fs::File;
use std::io;
use std::io::BufRead;
use std::io::BufReader;
use std::io::Write;
use std::path::PathBuf;
use crate::errors;
use crate::rank;
use crate::rank::Rank;
use crate::taxon;
// Command-line arguments of the `taxa2freq` subcommand.
//
// NOTE: plain `//` comments are used on purpose; structopt turns `///` doc
// comments into help text, so adding those would change the `--help` output.
#[derive(Debug, StructOpt)]
#[structopt(verbatim_doc_comment)]
pub struct TaxaToFreq {
// Rank to snap the counted taxa to (`-r` / `--rank`, defaults to "species").
// Accepted values are limited to `Rank::variants()`; `taxa2freq` additionally
// rejects `Rank::NoRank`.
#[structopt(
short = "r",
long = "rank",
default_value = "species",
possible_values = &Rank::variants()
)]
pub rank: Rank,
// Frequency threshold (`-f` / `--frequency`, defaults to "1"). `taxa2freq`
// compares the summed count of a row against this value to decide whether the
// row is printed.
#[structopt(short = "f", long = "frequency", default_value = "1")]
pub min_frequency: usize,
// Positional argument: path of the taxon file, handed to
// `taxon::read_taxa_file`.
#[structopt(parse(from_os_str))]
pub taxon_file: PathBuf,
// Remaining positional arguments: the input files to count, one output column
// each. When empty, `taxa2freq` reads standard input instead.
#[structopt(parse(from_os_str))]
pub input_files: Vec<PathBuf>,
}
/// Prints a CSV frequency table of the taxon ids found in the inputs.
///
/// Every taxon id read from the inputs is replaced by the id that the snapping
/// table (built with `filter_ancestors` for `args.rank`) holds for it, and the
/// occurrences are counted per input. The header row is `taxon id,taxon name`
/// followed by one column per input file, or a single `stdin` column when no
/// input files were given. Rows are written in order of descending total count;
/// rows with equal totals are ordered by ascending taxon id so that the output
/// is reproducible between runs.
///
/// # Errors
///
/// Returns an error when `args.rank` is `Rank::NoRank`, when the taxon file or
/// an input file cannot be read, when writing to standard output fails, or when
/// a counted taxon id is missing from the taxon list.
pub fn taxa2freq(args: TaxaToFreq) -> errors::Result<()> {
    // Validate the cheap argument first, so an invalid invocation fails before
    // the taxon file is parsed.
    if args.rank == rank::Rank::NoRank {
        return Err(errors::ErrorKind::InvalidInvocation("Snap to an actual rank.".into()).into());
    }

    let taxons = taxon::read_taxa_file(&args.taxon_file)?;
    let numfiles = args.input_files.len();

    // Per taxon id, the id to count it under: an entry is selected when its
    // taxon has the requested rank.
    let tree = taxon::TaxonTree::new(&taxons);
    let by_id = taxon::TaxonList::new(taxons);
    let snapping =
        tree.filter_ancestors(|tid| by_id.get(tid).map(|t| t.rank == args.rank).unwrap_or(false));

    let stdout = io::stdout();
    let mut stdout = stdout.lock();

    // Header row.
    write!(stdout, "taxon id,taxon name")?;
    if numfiles == 0 {
        write!(stdout, ",stdin")?;
    } else {
        for filename in args.input_files.iter() {
            write!(stdout, ",{}", filename.to_string_lossy())?;
        }
    }
    writeln!(stdout)?;

    // One row per snapped taxon id, one column per input.
    let mut counts = HashMap::new();
    if numfiles == 0 {
        let stdin = io::stdin();
        count_file(&snapping, &mut counts, 0, 1, Box::new(stdin.lock()))?;
    } else {
        for (i, file) in args.input_files.iter().enumerate() {
            count_file(
                &snapping,
                &mut counts,
                i,
                numfiles,
                Box::new(BufReader::new(File::open(file)?)),
            )?;
        }
    }

    // Compute every row total once; it is both the sort key and the value
    // tested against the threshold.
    let mut rows = counts
        .into_iter()
        .map(|(tid, row)| (row.iter().sum::<usize>(), tid, row))
        .collect::<Vec<(usize, taxon::TaxonId, Vec<usize>)>>();
    // Descending by total, ties broken by ascending taxon id. The ids are map
    // keys and therefore unique, so an unstable sort is deterministic here.
    rows.sort_unstable_by(|a, b| b.0.cmp(&a.0).then_with(|| a.1.cmp(&b.1)));

    for (total, tid, row) in rows {
        // The lookup happens before the threshold test, so an unknown id is an
        // error even when its row would not be printed.
        let taxon = by_id
            .get(tid)
            .ok_or("LCA taxon id not in taxon list. Check compatibility with index.")?;
        // NOTE(review): the comparison is strict, so a row whose total equals
        // `min_frequency` is not printed (with the default of 1, taxa seen once
        // are dropped). Confirm whether `>=` was intended.
        if total > args.min_frequency {
            write!(stdout, "{},{}", taxon.id, taxon.name)?;
            for count in row {
                write!(stdout, ",{}", count)?;
            }
            writeln!(stdout)?;
        }
    }
    Ok(())
}
/// Tallies the taxon ids read from `file` into column `index` of `counts`.
///
/// Each line that parses as a taxon id is looked up in `snapping`; the id stored
/// there (or `0` when the entry is `None`) selects the row to increment. Rows are
/// created on first use with `numfiles` zeroed columns. Lines that do not parse
/// as a taxon id are skipped.
///
/// # Errors
///
/// Returns an error when reading a line fails, or when a parsed taxon id lies
/// outside `snapping`.
///
/// # Panics
///
/// Panics if `index >= numfiles`, or if an existing row in `counts` has fewer
/// than `index + 1` columns.
fn count_file<T: BufRead>(
    snapping: &[Option<taxon::TaxonId>],
    counts: &mut HashMap<taxon::TaxonId, Vec<usize>>,
    index: usize,
    numfiles: usize,
    file: T,
) -> errors::Result<()> {
    for line in file.lines() {
        if let Ok(taxon) = line?.parse::<taxon::TaxonId>() {
            // The id comes straight from the input, so use a checked lookup:
            // an id beyond the table is reported as an error, not a panic.
            let snapped = snapping
                .get(taxon)
                .copied()
                .ok_or("Taxon id in input is out of range. Check compatibility with index.")?
                .unwrap_or(0);
            counts
                .entry(snapped)
                .or_insert_with(|| vec![0; numfiles])[index] += 1;
        }
    }
    Ok(())
}