use std::cmp;
use std::io;
use std::path::PathBuf;
use crate::errors;
use crate::io::fasta;
use crate::taxon;
use crate::taxon::TaxonId;
#[derive(Debug, StructOpt)]
#[structopt(verbatim_doc_comment)]
pub struct SeedExtend {
#[structopt(short = "s", long = "min-seed-size", default_value = "2")]
pub min_seed_size: usize,
#[structopt(short = "g", long = "max-gap-size", default_value = "0")]
pub max_gap_size: usize,
#[structopt(short = "r", long = "ranked", parse(from_os_str))]
pub ranked: Option<PathBuf>,
#[structopt(short = "p", long = "penalty", default_value = "5")]
pub penalty: usize,
}
pub fn seedextend(args: SeedExtend) -> errors::Result<()> {
let mut writer = fasta::Writer::new(io::stdout(), "\n", false);
let by_id = if let Some(ref tf) = args.ranked {
let taxa = taxon::read_taxa_file(tf)?;
Some(taxon::TaxonList::new_with_unknown(taxa, true))
} else {
None
};
for record in fasta::Reader::new(io::stdin(), false).records() {
let record = record?;
let mut taxons = record
.sequence
.iter()
.map(|s| s.parse::<TaxonId>())
.collect::<std::result::Result<Vec<TaxonId>, _>>()?;
taxons.push(0);
let mut seeds = Vec::new();
let mut start = 0;
let mut end = 1;
let mut last_tid = taxons[start];
let mut same_tid = 1;
let mut same_max = 1;
while end < taxons.len() {
if last_tid == taxons[end] {
same_tid += 1;
end += 1;
continue;
}
if last_tid == 0 && same_tid > args.max_gap_size {
if same_max >= args.min_seed_size {
seeds.push((start, end - same_tid))
}
start = end;
last_tid = taxons[end];
same_tid = 1;
same_max = 1;
end += 1;
continue;
}
if last_tid == 0 && (end - start) == same_tid {
end += 1;
start = end;
continue;
}
if last_tid != 0 {
same_max = cmp::max(same_max, same_tid);
}
last_tid = taxons[end];
same_tid = 1;
end += 1;
}
if same_max >= args.min_seed_size {
if last_tid == 0 {
end -= same_tid
}
seeds.push((start, end))
}
if let Some(ref by_id) = by_id {
seeds = seeds
.into_iter()
.max_by_key(|(s, e)| {
taxons
.iter()
.skip(*s)
.take(e - s)
.map(|t| by_id.score(*t).unwrap_or(args.penalty))
.sum::<usize>()
})
.into_iter()
.collect::<Vec<(usize, usize)>>();
}
writer
.write_record(fasta::Record {
header: record.header,
sequence: seeds
.into_iter()
.flat_map(|(start, end)| taxons[start..end].iter())
.map(|t| t.to_string())
.collect(),
})
.map_err(|err| err.to_string())?;
}
Ok(())
}