extern crate ansi_term;
extern crate csv;
extern crate suppaftp;
#[macro_use]
extern crate log;
extern crate loggerv;
extern crate md5;
extern crate rusqlite;
extern crate structopt;
extern crate xdg;
extern crate zip;
use std::collections::HashMap;
use std::error::Error;
use std::fmt;
use std::fs::remove_file;
use std::path::PathBuf;
use crate::db::DB;
/// Host name and port of the NCBI FTP server, written as a `host:port` string.
static NCBI_FTP_HOST: &str = "ftp.ncbi.nih.gov:21";
/// Path on the NCBI FTP server.
// NOTE(review): presumably the remote directory holding the taxonomy dump;
// it is not referenced in this file, so confirm its use in the `db` module.
static NCBI_FTP_PATH: &str = "/pub/taxonomy";
pub mod db;
pub mod tree;
/// Downloads the NCBI taxonomy dump into `datadir` and builds the local
/// database from it.
///
/// The steps run in this order:
/// 1. `db::download_taxdump` fetches the data (logged as coming from
///    [`NCBI_FTP_HOST`]);
/// 2. `db::check_integrity` verifies the download;
/// 3. a [`DB`] is opened on `<datadir>/taxonomy.db` and populated from
///    `<datadir>/taxdmp.zip`;
/// 4. `taxdmp.zip` and `taxdmp.zip.md5` are deleted from `datadir`.
///
/// `email` is handed unchanged to `db::download_taxdump`.
// NOTE(review): presumably used as the FTP login credential; confirm in `db`.
///
/// # Errors
///
/// Returns the error of the first step that fails; later steps are skipped,
/// so the temporary files may be left behind in that case.
pub fn populate_db(datadir: &PathBuf, email: String) -> Result<(), Box<dyn Error>> {
    // All files involved live directly inside `datadir`.
    let database_path = datadir.join("taxonomy.db");
    let archive_path = datadir.join("taxdmp.zip");
    let checksum_path = datadir.join("taxdmp.zip.md5");

    info!("Downloading data from {}...", NCBI_FTP_HOST);
    db::download_taxdump(&datadir, email)?;

    info!("Checking download integrity...");
    db::check_integrity(&datadir)?;
    info!("Everything's OK!");

    let database = DB::new(&database_path)?;
    database.populate(&archive_path)?;

    info!("Removing temporary files...");
    remove_file(&archive_path)?;
    remove_file(&checksum_path)?;

    Ok(())
}
/// Looks up the single node designated by `term`.
///
/// `term` is resolved to a taxonomy ID by `term_to_taxids` (it may be either
/// a numeric ID or a name) and the matching node is fetched from `db`.
///
/// # Errors
///
/// Returns an error if the term cannot be resolved, if the database query
/// fails, or if the query succeeds but yields no node at all (this last case
/// used to panic on an out-of-bounds index).
pub fn get_node(db: &DB, term: String) -> Result<Node, Box<dyn Error>> {
    // Borrow `term` as a one-element slice so it stays available for the
    // error message below.
    let ids = term_to_taxids(db, std::slice::from_ref(&term))?;

    // Move the first node out of the result instead of indexing and cloning.
    db.get_nodes(ids)?
        .into_iter()
        .next()
        .ok_or_else(|| Box::<dyn Error>::from(format!("no taxonomy node found for term '{}'", term)))
}
/// Looks up the nodes designated by `terms`.
///
/// Every term is first resolved to a taxonomy ID by `term_to_taxids`; the
/// resulting IDs are then passed to `DB::get_nodes` in one call.
///
/// # Errors
///
/// Propagates any error from the term resolution or from the database query.
pub fn get_nodes(db: &DB, terms: &[String]) -> Result<Vec<Node>, Box<dyn Error>> {
    term_to_taxids(db, terms).and_then(|taxids| db.get_nodes(taxids))
}
/// Fetches the lineage of every node in `nodes`.
///
/// The result has one entry per input node, in the same order, each being
/// whatever `DB::get_lineage` returns for that node's `tax_id`.
///
/// # Errors
///
/// Stops at, and returns, the first error reported by `DB::get_lineage`.
pub fn make_lineages(db: &DB, nodes: &[Node]) -> Result<Vec<Vec<Node>>, Box<dyn Error>> {
    let mut lineages = Vec::with_capacity(nodes.len());
    for node in nodes {
        lineages.push(db.get_lineage(node.tax_id)?);
    }
    Ok(lineages)
}
/// Builds a tree rooted at taxon `1` that contains the lineages of `nodes`,
/// then marks the requested nodes in it.
///
/// # Errors
///
/// Returns an error if a lineage cannot be fetched, or if `nodes` is empty
/// (there is then no lineage to seed the tree with; this used to panic).
pub fn make_tree(db: &DB, nodes: &[Node]) -> Result<tree::Tree, Box<dyn Error>> {
    let mut lineages = make_lineages(db, nodes)?;

    // Longest lineage first. `pop` takes from the back, so the tree is seeded
    // with the *shortest* lineage and the others are added longest-first.
    // NOTE(review): confirm that seeding with the shortest lineage is intended.
    lineages.sort_by(|a, b| b.len().cmp(&a.len()));
    let seed = lineages
        .pop()
        .ok_or("cannot build a tree from an empty list of nodes")?;

    let mut tree = tree::Tree::new(1, &seed);
    for lineage in &lineages {
        tree.add_nodes(lineage);
    }

    let ids: Vec<_> = nodes.iter().map(|node| node.tax_id).collect();
    tree.mark_nodes(&ids);

    Ok(tree)
}
/// Builds the tree rooted at `root` from the nodes returned by
/// `DB::get_children`.
///
/// `species` is forwarded unchanged to `DB::get_children`.
// NOTE(review): presumably restricts or extends the query to species-level
// nodes; confirm against `DB::get_children`.
///
/// # Errors
///
/// Propagates any error from `DB::get_children`.
pub fn make_subtree(db: &DB, root: Node, species: bool) -> Result<tree::Tree, Box<dyn Error>> {
    let root_id = root.tax_id;
    let descendants = db.get_children(root_id, species)?;
    Ok(tree::Tree::new(root_id, &descendants))
}
/// Finds the lowest common ancestor (LCA) of `node1` and `node2`.
///
/// A tree containing both nodes is built with [`make_tree`] and then reduced
/// with `Tree::simplify`. If the root (taxon `1`) is left with exactly two
/// children, the root itself is the LCA; otherwise the first child of the
/// root is returned.
///
/// # Errors
///
/// Returns an error if the tree cannot be built, or if the simplified tree
/// lacks the entries needed to pick the LCA (no children recorded for the
/// root, an empty child list, or a missing node). These cases used to panic.
pub fn get_lca(db: &DB, node1: &Node, node2: &Node) -> Result<Node, Box<dyn Error>> {
    let mut tree = make_tree(db, &[node1.clone(), node2.clone()])?;
    tree.simplify();

    let root_children = tree
        .children
        .get(&1)
        .ok_or("simplified tree has no children recorded for the root")?;

    let lca_id: i64 = if root_children.len() == 2 {
        // The two lineages split directly below the root.
        1
    } else {
        *root_children
            .iter()
            .next()
            .ok_or("root of the simplified tree has no child")?
    };

    let lca = tree
        .nodes
        .get(&lca_id)
        .ok_or("lowest common ancestor is missing from the tree")?;
    Ok(lca.clone())
}
/// A single entry of the NCBI taxonomy.
#[derive(Debug, Clone, Default)]
pub struct Node {
    /// NCBI Taxonomy ID of this node.
    pub tax_id: i64,
    /// Taxonomy ID of the parent node.
    parent_tax_id: i64,
    /// Rank of the node, printed next to its scientific name.
    pub rank: String,
    /// Division the node is part of.
    pub division: String,
    /// Name of the genetic code used by the node.
    pub genetic_code: String,
    /// Name of the mitochondrial genetic code, when there is one.
    pub mito_genetic_code: Option<String>,
    /// Free-form comments attached to the node, if any.
    pub comments: Option<String>,
    /// Names of the node, grouped by name class. The classes read by the
    /// `Display` implementation are `"scientific name"`, `"synonym"`,
    /// `"genbank common name"`, `"common name"` and `"authority"`.
    pub names: HashMap<String, Vec<String>>,
    /// Optional output template. When set, `Display` prints this string with
    /// `%taxid`, `%name` and `%rank` substituted instead of the full
    /// description.
    pub format_string: Option<String>,
}

impl fmt::Display for Node {
    /// Formats the node either through its `format_string` template or, when
    /// no template is set, as a multi-line human-readable description.
    ///
    /// A node without a scientific name is shown as `unknown` rather than
    /// causing a panic.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        /// Writes `heading` on its own line followed by one `* item` line per
        /// entry of `items`.
        fn write_bullets(f: &mut fmt::Formatter, heading: &str, items: &[String]) -> fmt::Result {
            writeln!(f, "{}", heading)?;
            for item in items {
                writeln!(f, "* {}", item)?;
            }
            Ok(())
        }

        // First scientific name, with a placeholder so that incomplete nodes
        // (e.g. `Node::default()`) can still be displayed.
        let sciname = self
            .names
            .get("scientific name")
            .and_then(|names| names.first())
            .map_or("unknown", String::as_str);

        if let Some(format_string) = &self.format_string {
            // Substitution order is part of the observable behaviour: later
            // placeholders are also expanded inside earlier substitutions.
            let rendered = format_string
                .replace("%taxid", &self.tax_id.to_string())
                .replace("%name", sciname)
                .replace("%rank", &self.rank);
            return f.write_str(&rendered);
        }

        // Title underlined with one dash per character (not per byte, which
        // would over-extend the underline for non-ASCII names).
        let title = format!("{} - {}", sciname, self.rank);
        writeln!(f, "{}", title)?;
        writeln!(f, "{}", "-".repeat(title.chars().count()))?;
        writeln!(f, "NCBI Taxonomy ID: {}", self.tax_id)?;

        if let Some(synonyms) = self.names.get("synonym") {
            write_bullets(f, "Same as:", synonyms)?;
        }
        if let Some(genbank) = self
            .names
            .get("genbank common name")
            .and_then(|names| names.first())
        {
            writeln!(f, "Commonly named {}.", genbank)?;
        }
        if let Some(common_names) = self.names.get("common name") {
            write_bullets(f, "Also known as:", common_names)?;
        }
        if let Some(authorities) = self.names.get("authority") {
            write_bullets(f, "First description:", authorities)?;
        }

        writeln!(f, "Part of the {}.", self.division)?;
        writeln!(f, "Uses the {} genetic code.", self.genetic_code)?;
        if let Some(mito) = &self.mito_genetic_code {
            writeln!(f, "Its mitochondria use the {} genetic code.", mito)?;
        }
        if let Some(comments) = &self.comments {
            // Deliberately no trailing newline after the comments.
            write!(f, "\nComments: {}", comments)?;
        }
        Ok(())
    }
}
/// Normalizes a user-supplied search term.
///
/// Surrounding whitespace is removed first, then every underscore that
/// remains is turned into a space (so `Homo_sapiens` becomes `Homo sapiens`).
fn clean_term(term: &str) -> String {
    term.trim()
        .chars()
        .map(|c| if c == '_' { ' ' } else { c })
        .collect()
}
/// Converts search terms into taxonomy IDs, keeping the input order.
///
/// Each term is normalized with `clean_term`. Terms that parse as an `i64`
/// are used as IDs directly; all the others are collected and resolved with
/// a single call to `DB::get_taxids`, whose results are written back to the
/// positions of the corresponding terms.
///
/// A position keeps the placeholder `-1` if `DB::get_taxids` returns fewer
/// IDs than the number of names it was given.
///
/// # Errors
///
/// Propagates any error from `DB::get_taxids`.
fn term_to_taxids(db: &DB, terms: &[String]) -> Result<Vec<i64>, Box<dyn Error>> {
    let mut taxids: Vec<i64> = Vec::with_capacity(terms.len());
    // Names still to be resolved, and the slot each of them occupies in `taxids`.
    let mut pending_names: Vec<String> = Vec::new();
    let mut pending_slots: Vec<usize> = Vec::new();

    for (slot, raw) in terms.iter().enumerate() {
        let cleaned = clean_term(raw);
        if let Ok(id) = cleaned.parse::<i64>() {
            taxids.push(id);
        } else {
            // Placeholder, overwritten below once the name is resolved.
            taxids.push(-1);
            pending_slots.push(slot);
            pending_names.push(cleaned);
        }
    }

    // The database is queried even when there is no name to resolve.
    let resolved = db.get_taxids(pending_names)?;
    for (&slot, &taxid) in pending_slots.iter().zip(resolved.iter()) {
        taxids[slot] = taxid;
    }

    Ok(taxids)
}