use bio_types::phylogeny::{Tree, TreeGraph};
use pest::iterators::Pair;
use pest::Parser;
use petgraph::graph::NodeIndex;
use std::fs;
use std::io;
use std::path::{Path, PathBuf};
use thiserror::Error;
#[derive(Debug, Error)]
pub enum Error {
#[error("Error while opening {}: {}", filename.display(), source)]
OpenFile {
filename: PathBuf,
source: std::io::Error,
},
#[error("Error while reading tree: {0}")]
Read(#[from] std::io::Error),
#[error("Tree contains invalid UTF-8: {0}")]
InvalidContent(#[from] std::str::Utf8Error),
#[error("Error while parsing tree: {0}")]
ParsingError(#[from] pest::error::Error<crate::io::newick::Rule>),
}
type Result<T, E = Error> = std::result::Result<T, E>;
#[derive(Parser)]
#[grammar = "io/newick.pest"]
#[derive(
Default, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize,
)]
pub struct NewickParser;
enum TreeValue {
Node {
name: Option<String>,
children: Option<Vec<TreeValue>>,
},
Link {
weight: f32,
node: Box<TreeValue>,
},
}
fn parse_newick_file(content: &str) -> Result<TreeValue> {
fn parse_value(pair: Pair<Rule>) -> TreeValue {
match pair.as_rule() {
Rule::Leaf => {
let name = pair.into_inner().next().unwrap().as_str();
TreeValue::Node {
name: Some(name.into()),
children: None,
}
}
Rule::Internal => {
let mut inner_rules = pair.into_inner();
let children = Some(
inner_rules
.next()
.unwrap()
.into_inner()
.map(parse_value)
.collect(),
);
let name = if let Some(clade) = inner_rules.next() {
Some(clade.as_str().into())
} else {
None
};
TreeValue::Node { children, name }
}
Rule::Branch => {
fn get_weight(mut inner: pest::iterators::Pairs<Rule>) -> f32 {
if let Some(weight) = inner.next() {
weight.as_str().parse::<f32>().unwrap()
} else {
f32::NAN
}
}
let mut inner = pair.into_inner();
let (node, weight) = if let Some(next) = inner.next() {
match next.as_rule() {
Rule::SubTree => (parse_value(next), get_weight(inner)),
_ => (
TreeValue::Node {
name: None,
children: None,
},
next.as_str().parse::<f32>().unwrap(),
),
}
} else {
(
TreeValue::Node {
name: None,
children: None,
},
get_weight(inner),
)
};
TreeValue::Link {
weight,
node: Box::new(node),
}
}
Rule::SubTree => parse_value(pair.into_inner().next().unwrap()),
Rule::EOI
| Rule::WHITESPACE
| Rule::Tree
| Rule::Length
| Rule::BranchSet
| Rule::float
| Rule::safe
| Rule::name => unreachable!(),
}
}
let root = NewickParser::parse(Rule::Tree, &content)
.map_err(Error::ParsingError)?
.next()
.unwrap();
Ok(parse_value(root))
}
fn newick_to_graph(root: TreeValue) -> Result<Tree> {
fn add_node(g: &mut TreeGraph, t: TreeValue) -> NodeIndex {
match t {
TreeValue::Node { name, children } => {
let node_id = g.add_node(name.unwrap_or("N/A".into()).into());
if let Some(children) = children {
for child in children {
match child {
TreeValue::Node { .. } => unimplemented!(),
TreeValue::Link { weight, node } => {
let child_id = add_node(g, *node);
g.add_edge(node_id, child_id, weight);
}
}
}
};
node_id
}
TreeValue::Link { .. } => unreachable!(),
}
}
let mut g = TreeGraph::new();
add_node(&mut g, root);
Ok(Tree { g })
}
pub fn from_string<S: AsRef<str>>(content: S) -> Result<Tree> {
let raw_tree = parse_newick_file(content.as_ref())?;
newick_to_graph(raw_tree)
}
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Tree> {
fs::File::open(&path)
.map(read)
.map_err(|e| Error::OpenFile {
filename: path.as_ref().to_owned(),
source: e,
})?
}
pub fn read<R: io::Read>(reader: R) -> Result<Tree> {
let content_bytes = reader
.bytes()
.collect::<Result<Vec<_>, _>>()
.map_err(Error::Read)?;
let content_str = std::str::from_utf8(&content_bytes).map_err(Error::InvalidContent)?;
from_string(&content_str)
}