use std::collections::HashMap;
use std::fmt;
use std::fmt::{Debug, Display};
use std::hash::Hash;
use std::io::{Read, Write};
use std::str::FromStr;
use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
use serde_json::{json, to_value, to_writer, Value};
use crate::base::GeneralTaxonomy;
use crate::errors::{Error, ErrorKind, TaxonomyResult};
use crate::rank::TaxRank;
use crate::Taxonomy;
#[derive(Eq, PartialEq)]
pub enum JsonFormat {
NodeLink,
Tree,
}
struct DeserializeU64OrStringVisitor;
impl<'de> de::Visitor<'de> for DeserializeU64OrStringVisitor {
type Value = String;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("an integer or a string")
}
fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E>
where
E: de::Error,
{
Ok(format!("{}", v))
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: de::Error,
{
Ok(v.to_string())
}
}
fn deserialize_u64_or_string<'de, D>(deserializer: D) -> Result<String, D::Error>
where
D: Deserializer<'de>,
{
deserializer.deserialize_any(DeserializeU64OrStringVisitor)
}
fn deserialize_tax_rank<'de, D>(deserializer: D) -> Result<TaxRank, D::Error>
where
D: Deserializer<'de>,
{
let opt: Option<String> = Option::deserialize(deserializer)?;
if let Some(s) = opt {
if s.is_empty() {
return Ok(TaxRank::Unspecified);
}
TaxRank::from_str(&s).map_err(de::Error::custom)
} else {
Ok(TaxRank::Unspecified)
}
}
fn serialize_tax_rank<S>(x: &TaxRank, s: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
s.serialize_str(x.to_ncbi_rank())
}
fn default_tax_rank() -> TaxRank {
TaxRank::Unspecified
}
#[derive(Debug, Serialize, Deserialize)]
struct TaxNode {
#[serde(deserialize_with = "deserialize_u64_or_string")]
id: String,
name: String,
#[serde(deserialize_with = "deserialize_tax_rank")]
#[serde(serialize_with = "serialize_tax_rank")]
#[serde(default = "default_tax_rank")]
rank: TaxRank,
#[serde(flatten)]
extra: HashMap<String, Value>,
}
#[derive(Debug, Serialize, Deserialize)]
struct Link {
source: usize,
target: usize,
}
fn load_node_link_json(tax_json: &Value) -> TaxonomyResult<GeneralTaxonomy> {
let json_tax_nodes = tax_json["nodes"]
.as_array()
.ok_or_else(|| {
Error::new(ErrorKind::ImportError {
line: 0,
msg: "'nodes' not in JSON".to_owned(),
})
})?
.clone();
let mut tax_nodes = Vec::with_capacity(json_tax_nodes.len());
for n in json_tax_nodes {
let node: TaxNode = serde_json::from_value(n)?;
if node.id.parse::<usize>().is_err() {
return Err(Error::new(ErrorKind::ImportError {
line: 0,
msg: format!("Tax ID {} cannot be converted to an integer", node.id),
}));
}
tax_nodes.push(node);
}
#[allow(clippy::needless_collect)]
let initial_positions: Vec<_> = tax_nodes.iter().map(|n| n.id.clone()).collect();
tax_nodes.sort_unstable_by(|a, b| {
let tax_id_a = a.id.parse::<usize>().unwrap();
let tax_id_b = b.id.parse::<usize>().unwrap();
tax_id_a.cmp(&tax_id_b)
});
let after_positions: Vec<_> = tax_nodes.iter().map(|n| &n.id).collect();
let mut idx_mapping_after_sorting = HashMap::new();
for (i, id) in initial_positions.into_iter().enumerate() {
idx_mapping_after_sorting
.insert(i, after_positions.iter().position(|p| *p == &id).unwrap());
}
let tax_links = tax_json["links"]
.as_array()
.ok_or_else(|| {
Error::new(ErrorKind::ImportError {
line: 0,
msg: "'links' not in JSON".to_owned(),
})
})?
.clone();
let num_nodes = tax_nodes.len();
let mut tax_ids: Vec<String> = Vec::with_capacity(tax_nodes.len());
let mut names: Vec<String> = Vec::with_capacity(tax_nodes.len());
let mut ranks: Vec<TaxRank> = Vec::with_capacity(tax_nodes.len());
let mut data: Vec<HashMap<String, Value>> = Vec::with_capacity(tax_nodes.len());
let mut parent_ids = vec![0; tax_nodes.len()];
for node in tax_nodes {
tax_ids.push(node.id);
names.push(node.name);
ranks.push(node.rank);
data.push(node.extra);
}
for l in tax_links {
let link: Link = serde_json::from_value(l)?;
if link.source >= num_nodes {
return Err(Error::new(ErrorKind::ImportError {
line: 0,
msg: format!(
"JSON source index {} specified, but there are only {} nodes",
link.source, num_nodes
),
}));
}
if link.target >= num_nodes {
return Err(Error::new(ErrorKind::ImportError {
line: 0,
msg: format!(
"JSON target index {} specified, but there are only {} nodes",
link.target, num_nodes
),
}));
}
let fixed_source = idx_mapping_after_sorting[&link.source];
let fixed_target = idx_mapping_after_sorting[&link.target];
parent_ids[fixed_source] = fixed_target;
}
GeneralTaxonomy::from_arrays(
tax_ids,
parent_ids,
Some(names),
Some(ranks),
None,
Some(data),
)
}
#[derive(Debug, Serialize, Deserialize)]
struct TaxNodeTree {
#[serde(deserialize_with = "deserialize_u64_or_string")]
id: String,
name: String,
#[serde(deserialize_with = "deserialize_tax_rank")]
#[serde(serialize_with = "serialize_tax_rank")]
#[serde(default = "default_tax_rank")]
rank: TaxRank,
#[serde(default)]
children: Vec<TaxNodeTree>,
#[serde(flatten)]
extra: HashMap<String, Value>,
}
fn load_tree_json(tax_json: &Value) -> TaxonomyResult<GeneralTaxonomy> {
let root_node: TaxNodeTree = serde_json::from_value(tax_json.clone())?;
fn add_node(
parent_loc: usize,
node: TaxNodeTree,
tax_ids: &mut Vec<String>,
parent_ids: &mut Vec<usize>,
names: &mut Vec<String>,
ranks: &mut Vec<TaxRank>,
data: &mut Vec<HashMap<String, Value>>,
) -> TaxonomyResult<()> {
tax_ids.push(node.id);
parent_ids.push(parent_loc);
names.push(node.name);
ranks.push(node.rank);
data.push(node.extra);
let loc = tax_ids.len() - 1;
for child in node.children {
add_node(loc, child, tax_ids, parent_ids, names, ranks, data)?;
}
Ok(())
}
let mut tax_ids = Vec::new();
let mut parent_ids = Vec::new();
let mut names = Vec::new();
let mut ranks = Vec::new();
let mut data = Vec::new();
add_node(
0,
root_node,
&mut tax_ids,
&mut parent_ids,
&mut names,
&mut ranks,
&mut data,
)?;
GeneralTaxonomy::from_arrays(
tax_ids,
parent_ids,
Some(names),
Some(ranks),
None,
Some(data),
)
}
pub fn load<R: Read>(reader: R, json_pointer: Option<&str>) -> TaxonomyResult<GeneralTaxonomy> {
let tax_json: Value = serde_json::from_reader(reader)?;
let actual_tax_json = if let Some(p) = json_pointer {
tax_json.pointer(p).ok_or_else(|| {
Error::new(ErrorKind::ImportError {
line: 0,
msg: format!("JSON path {} does not correspond to a value", p),
})
})?
} else {
&tax_json
};
if actual_tax_json.get("nodes").is_some() {
load_node_link_json(actual_tax_json)
} else {
load_tree_json(actual_tax_json)
}
}
pub fn save<'t, W: Write, T: 't, X: Taxonomy<'t, T>>(
writer: W,
taxonomy: &'t X,
format: JsonFormat,
root_node: Option<T>,
) -> TaxonomyResult<()>
where
T: Clone + Debug + Display + Eq + Hash + PartialEq,
{
let root_node = root_node.unwrap_or_else(|| taxonomy.root());
let json_data = match format {
JsonFormat::NodeLink => serialize_as_node_links(taxonomy, root_node),
JsonFormat::Tree => serialize_as_tree(taxonomy, root_node),
}?;
to_writer(writer, &json_data)?;
Ok(())
}
fn serialize_as_tree<'t, T: 't>(
taxonomy: &'t impl Taxonomy<'t, T>,
tax_id: T,
) -> TaxonomyResult<Value>
where
T: Clone + Debug + Display + Eq + Hash + PartialEq,
{
fn inner<'t, T: 't>(tax: &'t impl Taxonomy<'t, T>, tax_id: T) -> TaxonomyResult<TaxNodeTree>
where
T: Clone + Debug + Display + Eq + Hash + PartialEq,
{
let mut children = Vec::new();
for child in tax.children(tax_id.clone())? {
children.push(inner(tax, child)?);
}
let node = TaxNodeTree {
id: tax_id.to_string(),
name: tax.name(tax_id.clone())?.to_string(),
rank: tax.rank(tax_id.clone())?,
children,
extra: (*tax.data(tax_id)?).clone(),
};
Ok(node)
}
Ok(to_value(inner(taxonomy, tax_id)?)?)
}
fn serialize_as_node_links<'t, T: 't>(
tax: &'t impl Taxonomy<'t, T>,
root_id: T,
) -> TaxonomyResult<Value>
where
T: Clone + Debug + Display + Eq + Hash + PartialEq,
{
let mut nodes = Vec::new();
let mut links = Vec::new();
let mut id_to_idx = HashMap::new();
for (ix, (tid, _pre)) in tax.traverse(root_id)?.filter(|x| x.1).enumerate() {
let node = TaxNode {
id: tid.to_string(),
name: tax.name(tid.clone())?.to_string(),
rank: tax.rank(tid.clone())?,
extra: (*tax.data(tid.clone())?).clone(),
};
nodes.push(to_value(&node).unwrap());
id_to_idx.insert(tid.clone(), ix);
if let Some(parent_id) = tax.parent(tid)? {
links.push(json!({
"source": ix,
"target": id_to_idx[&parent_id.0],
}));
}
}
let tax_json = json!({
"nodes": nodes,
"links": links,
"directed": true,
"multigraph": false,
"graph": [],
});
Ok(tax_json)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::taxonomy::Taxonomy;
use serde_json::from_str;
use std::io::Cursor;
#[test]
fn can_load_empty_node_link_format() {
let example = r#"{"nodes": [], "links": []}"#;
let tax = load(Cursor::new(example), None).unwrap();
assert_eq!(Taxonomy::<&str>::len(&tax), 0);
}
#[derive(Debug, Serialize, Deserialize)]
struct NodeLinkFormat {
nodes: Vec<TaxNode>,
links: Vec<Link>,
}
#[test]
fn can_load_small_node_link_format() {
let example = r#"{
"nodes": [
{"id": "1", "name": "root", "readcount": 1000},
{"id": "2", "name": "Bacteria", "rank": "no rank", "readcount": 1000},
{"id": "562", "name": "Escherichia coli", "rank": "species", "readcount": 1000}
],
"links": [
{"source": 1, "target": 0},
{"source": 2, "target": 1}
]
}"#;
let tax = load(Cursor::new(example), None).unwrap();
assert_eq!(Taxonomy::<&str>::len(&tax), 3);
assert_eq!(Taxonomy::<&str>::root(&tax), "1");
assert_eq!(Taxonomy::<&str>::children(&tax, "1").unwrap(), vec!["2"]);
assert_eq!(
Taxonomy::<&str>::lineage(&tax, "562").unwrap(),
vec!["562", "2", "1"]
);
assert_eq!(
Taxonomy::<&str>::data(&tax, "1").unwrap()["readcount"]
.as_i64()
.unwrap(),
1000
);
let serialized = serialize_as_node_links(&tax, Taxonomy::<&str>::root(&tax)).unwrap();
let tax2 = load_node_link_json(&serialized).unwrap();
assert_eq!(Taxonomy::<&str>::len(&tax2), 3);
assert_eq!(Taxonomy::<&str>::root(&tax2), "1");
assert_eq!(Taxonomy::<&str>::children(&tax2, "1").unwrap(), vec!["2"]);
assert_eq!(
Taxonomy::<&str>::lineage(&tax2, "562").unwrap(),
vec!["562", "2", "1"]
);
assert_eq!(
Taxonomy::<&str>::data(&tax2, "1").unwrap()["readcount"]
.as_i64()
.unwrap(),
1000
);
}
#[test]
fn can_load_tree_format() {
let example = r#"{
"id": "1",
"name": "root",
"rank": "no rank",
"readcount": 1000,
"children": [
{
"id": "2",
"name": "Bacteria",
"rank": "no rank",
"readcount": 2000,
"children": [
{
"id": "562",
"name": "Escherichia coli",
"rank": "species",
"readcount": 3000
}
]
}
]
}"#;
let tax = load(Cursor::new(example), None).unwrap();
assert_eq!(Taxonomy::<&str>::len(&tax), 3);
assert_eq!(Taxonomy::<&str>::root(&tax), "1");
assert_eq!(Taxonomy::<&str>::children(&tax, "1").unwrap(), vec!["2"]);
assert_eq!(
Taxonomy::<&str>::lineage(&tax, "562").unwrap(),
vec!["562", "2", "1"]
);
assert_eq!(
Taxonomy::<&str>::data(&tax, "1").unwrap()["readcount"]
.as_i64()
.unwrap(),
1000
);
let serialized = serialize_as_tree(&tax, Taxonomy::<&str>::root(&tax)).unwrap();
let input_val: TaxNodeTree = from_str(&example).unwrap();
assert_eq!(serialized, to_value(&input_val).unwrap());
}
#[test]
fn errors_on_invalid_tree_format() {
let example = r#"{}"#;
assert!(load(Cursor::new(example), None).is_err());
let example = r#"{"id": "1", "rank": 5, "name": ""}"#;
assert!(load(Cursor::new(example), None).is_err());
}
#[test]
fn can_load_node_link_and_fix_order_them() {
let example = r#"
{
"multigraph": false,
"directed": true,
"graph": [],
"nodes": [
{
"name": "root",
"rank": "no rank",
"id": 1
},
{
"name": "genus 2",
"rank": "genus",
"id": 9
},
{
"name": "superkingdom 1",
"rank": "superkingdom",
"id": 2
},
{
"name": "species 2.1",
"rank": "species",
"id": 11
},
{
"name": "genus 1",
"rank": "genus",
"id": 8
},
{
"name": "class 1",
"rank": "class",
"id": 5
},
{
"name": "kingdom 1",
"rank": "kingdom",
"id": 3
},
{
"name": "phylum 1",
"rank": "phylum",
"id": 4
},
{
"name": "order 1",
"rank": "order",
"id": 6
},
{
"name": "family 1",
"rank": "family",
"id": 7
},
{
"name": "species 1.1",
"rank": "species",
"id": 10
}
],
"links": [
{
"source": 3,
"target": 1
},
{
"source": 10,
"target": 4
},
{
"source": 1,
"target": 9
},
{
"source": 4,
"target": 9
},
{
"source": 9,
"target": 8
},
{
"source": 8,
"target": 5
},
{
"source": 5,
"target": 7
},
{
"source": 7,
"target": 6
},
{
"source": 6,
"target": 2
},
{
"source": 2,
"target": 0
}
]
} "#;
let tax = load(Cursor::new(example), None).unwrap();
assert_eq!(Taxonomy::<&str>::len(&tax), 11);
assert_eq!(Taxonomy::<&str>::root(&tax), "1");
assert_eq!(
Taxonomy::<&str>::lineage(&tax, "10").unwrap(),
vec!["10", "8", "7", "6", "5", "4", "3", "2", "1"]
);
}
#[test]
fn can_load_from_json_path() {
let example = r#"{"test": {"sub": {"nodes": [], "links": []}}}"#;
let tax = load(Cursor::new(example), Some("/test/sub")).unwrap();
assert_eq!(Taxonomy::<&str>::len(&tax), 0);
}
#[test]
fn can_handle_null_ranks() {
let example = r#"{"id": "1", "rank": null, "name": ""}"#;
assert!(load(Cursor::new(example), None).is_ok());
}
}