use crate::{
annostorage::ValueSearch,
errors::{GraphAnnisCoreError, Result},
graph::{
update::{GraphUpdate, UpdateEvent},
Graph, ANNIS_NS, NODE_NAME, NODE_NAME_KEY, NODE_TYPE, NODE_TYPE_KEY,
},
types::{AnnoKey, Annotation, Component, ComponentType, Edge},
util::{join_qname, split_qname},
};
use quick_xml::{
events::{attributes::Attributes, BytesDecl, BytesEnd, BytesStart, BytesText, Event},
Reader, Writer,
};
use std::{
collections::{BTreeMap, BTreeSet, HashMap},
io::{BufReader, BufWriter, Read, Write},
str::FromStr,
};
fn write_annotation_keys<CT: ComponentType, W: std::io::Write>(
graph: &Graph<CT>,
has_graph_configuration: bool,
writer: &mut Writer<W>,
) -> Result<BTreeMap<AnnoKey, String>> {
let mut key_id_mapping = BTreeMap::new();
let mut id_counter = 0;
if has_graph_configuration {
let new_id = format!("k{}", id_counter);
id_counter += 1;
let mut key_start = BytesStart::borrowed_name(b"key");
key_start.push_attribute(("id", new_id.as_str()));
key_start.push_attribute(("for", "graph"));
key_start.push_attribute(("attr.name", "configuration"));
key_start.push_attribute(("attr.type", "string"));
writer.write_event(Event::Empty(key_start))?;
}
for key in graph.get_node_annos().annotation_keys()? {
if (key.ns != ANNIS_NS || key.name != NODE_NAME) && !key_id_mapping.contains_key(&key) {
let new_id = format!("k{}", id_counter);
id_counter += 1;
let qname = join_qname(&key.ns, &key.name);
let mut key_start = BytesStart::borrowed_name(b"key");
key_start.push_attribute(("id", new_id.as_str()));
key_start.push_attribute(("for", "node"));
key_start.push_attribute(("attr.name", qname.as_str()));
key_start.push_attribute(("attr.type", "string"));
writer.write_event(Event::Empty(key_start))?;
key_id_mapping.insert(key, new_id);
}
}
let autogenerated_components: BTreeSet<Component<CT>> =
CT::update_graph_index_components(graph)
.into_iter()
.collect();
for c in graph.get_all_components(None, None) {
if !autogenerated_components.contains(&c) {
if let Some(gs) = graph.get_graphstorage(&c) {
for key in gs.get_anno_storage().annotation_keys()? {
#[allow(clippy::map_entry)]
if !key_id_mapping.contains_key(&key) {
let new_id = format!("k{}", id_counter);
id_counter += 1;
let qname = join_qname(&key.ns, &key.name);
let mut key_start = BytesStart::borrowed_name(b"key");
key_start.push_attribute(("id", new_id.as_str()));
key_start.push_attribute(("for", "node"));
key_start.push_attribute(("attr.name", qname.as_str()));
key_start.push_attribute(("attr.type", "string"));
writer.write_event(Event::Empty(key_start))?;
key_id_mapping.insert(key, new_id);
}
}
}
}
}
Ok(key_id_mapping)
}
/// Serializes a single annotation as a GraphML `<data key="…">value</data>`
/// element.
///
/// The `key` attribute is the ID registered for the annotation key in
/// `key_id_mapping`; the element text is the annotation value.
///
/// # Errors
///
/// Returns `GraphAnnisCoreError::GraphMLMissingAnnotationKey` if the
/// annotation key has no entry in `key_id_mapping`, or an error if writing to
/// `writer` fails.
fn write_data<W: std::io::Write>(
    anno: Annotation,
    writer: &mut Writer<W>,
    key_id_mapping: &BTreeMap<AnnoKey, String>,
) -> Result<()> {
    // Resolve the key ID first, nothing is written for unknown keys.
    let key_id = key_id_mapping
        .get(&anno.key)
        .ok_or_else(|| GraphAnnisCoreError::GraphMLMissingAnnotationKey(anno.key.clone()))?;

    let mut opening_tag = BytesStart::borrowed_name(b"data");
    opening_tag.push_attribute(("key", key_id.as_str()));
    let value_text = BytesText::from_plain(anno.val.as_bytes());
    let closing_tag = BytesEnd::borrowed(b"data");

    writer.write_event(Event::Start(opening_tag))?;
    writer.write_event(Event::Text(value_text))?;
    writer.write_event(Event::End(closing_tag))?;
    Ok(())
}
/// Writes one GraphML `<node>` element for every item that has an annotation
/// with namespace `ANNIS_NS` and name `NODE_TYPE`.
///
/// The value stored under `NODE_NAME_KEY` becomes the `id` attribute; items
/// without such a value are skipped. All annotations of the node are written
/// as `<data>` children, except the one with namespace `ANNIS_NS` and name
/// `NODE_NAME`.
///
/// # Errors
///
/// Returns an error if the annotation storage cannot be queried, if an
/// annotation key is missing from `key_id_mapping`, or if writing fails.
fn write_nodes<CT: ComponentType, W: std::io::Write>(
    graph: &Graph<CT>,
    writer: &mut Writer<W>,
    key_id_mapping: &BTreeMap<AnnoKey, String>,
) -> Result<()> {
    let node_annos = graph.get_node_annos();

    for found in node_annos.exact_anno_search(Some(ANNIS_NS), NODE_TYPE, ValueSearch::Any) {
        let found = found?;

        // A node without a name cannot be referenced and is not exported.
        let node_name = match node_annos.get_value_for_item(&found.node, &NODE_NAME_KEY)? {
            Some(name) => name,
            None => continue,
        };

        let mut node_start = BytesStart::borrowed_name(b"node");
        node_start.push_attribute(("id", node_name.as_ref()));

        let annotations = node_annos.get_annotations_for_item(&found.node)?;
        if annotations.is_empty() {
            // No annotations at all: write a self-closing element.
            writer.write_event(Event::Empty(node_start))?;
            continue;
        }

        writer.write_event(Event::Start(node_start))?;
        for anno in annotations {
            let is_node_name = anno.key.ns == ANNIS_NS && anno.key.name == NODE_NAME;
            if !is_node_name {
                write_data(anno, writer, key_id_mapping)?;
            }
        }
        writer.write_event(Event::End(BytesEnd::borrowed(b"node")))?;
    }
    Ok(())
}
/// Writes one GraphML `<edge>` element for every edge of every component that
/// is not returned by `CT::update_graph_index_components`.
///
/// Each edge gets a running ID (`e0`, `e1`, …), the names of its source and
/// target node as `source`/`target` attributes and the string representation
/// of its component as `label` attribute. Edge annotations are written as
/// `<data>` children. Edges whose source or target node has no value for
/// `NODE_NAME_KEY` are skipped and do not consume an ID.
///
/// # Errors
///
/// Returns an error if the graph storage or annotation storage cannot be
/// queried, if an annotation key is missing from `key_id_mapping`, or if
/// writing fails.
fn write_edges<CT: ComponentType, W: std::io::Write>(
    graph: &Graph<CT>,
    writer: &mut Writer<W>,
    key_id_mapping: &BTreeMap<AnnoKey, String>,
) -> Result<()> {
    let mut edge_counter = 0;
    let node_annos = graph.get_node_annos();

    let autogenerated_components: BTreeSet<Component<CT>> =
        CT::update_graph_index_components(graph)
            .into_iter()
            .collect();

    for c in graph.get_all_components(None, None) {
        if autogenerated_components.contains(&c) {
            continue;
        }
        let gs = match graph.get_graphstorage(&c) {
            Some(gs) => gs,
            None => continue,
        };
        // The label is the same for all edges of this component.
        let component_label = c.to_string();

        for source in gs.source_nodes() {
            let source = source?;
            let source_id = match node_annos.get_value_for_item(&source, &NODE_NAME_KEY)? {
                Some(name) => name,
                None => continue,
            };

            for target in gs.get_outgoing_edges(source) {
                let target = target?;
                let target_id = match node_annos.get_value_for_item(&target, &NODE_NAME_KEY)? {
                    Some(name) => name,
                    None => continue,
                };

                let edge = Edge { source, target };
                let edge_id = format!("e{}", edge_counter);
                edge_counter += 1;

                let mut edge_start = BytesStart::borrowed_name(b"edge");
                edge_start.push_attribute(("id", edge_id.as_str()));
                edge_start.push_attribute(("source", source_id.as_ref()));
                edge_start.push_attribute(("target", target_id.as_ref()));
                edge_start.push_attribute(("label", component_label.as_str()));
                writer.write_event(Event::Start(edge_start))?;

                for anno in gs.get_anno_storage().get_annotations_for_item(&edge)? {
                    write_data(anno, writer, key_id_mapping)?;
                }
                writer.write_event(Event::End(BytesEnd::borrowed(b"edge")))?;
            }
        }
    }
    Ok(())
}
/// Exports `graph` as GraphML to `output`.
///
/// The document consists of the XML declaration, the `<key>` declarations of
/// all annotation keys, and a single directed `<graph>` containing first all
/// nodes and then all edges. If `graph_configuration` is given, it is embedded
/// as a CDATA section in a `<data key="k0">` element directly below `<graph>`.
///
/// `progress_callback` is invoked with a short message before each export
/// phase. The output is buffered internally and flushed before returning.
///
/// # Errors
///
/// Returns an error if the graph cannot be queried or if writing to `output`
/// fails.
pub fn export<CT: ComponentType, W: std::io::Write, F>(
    graph: &Graph<CT>,
    graph_configuration: Option<&str>,
    output: W,
    progress_callback: F,
) -> Result<()>
where
    F: Fn(&str),
{
    let mut writer = Writer::new_with_indent(BufWriter::new(output), b' ', 4);

    writer.write_event(Event::Decl(BytesDecl::new(b"1.0", Some(b"UTF-8"), None)))?;
    writer.write_event(Event::Start(BytesStart::borrowed_name(b"graphml")))?;

    progress_callback("exporting all available annotation keys");
    let key_id_mapping = write_annotation_keys(graph, graph_configuration.is_some(), &mut writer)?;

    let mut graph_start = BytesStart::borrowed_name(b"graph");
    for &attribute in &[
        ("edgedefault", "directed"),
        ("parse.order", "nodesfirst"),
        ("parse.nodeids", "free"),
        ("parse.edgeids", "canonical"),
    ] {
        graph_start.push_attribute(attribute);
    }
    writer.write_event(Event::Start(graph_start))?;

    if let Some(config) = graph_configuration {
        // "k0" is the ID of the configuration key, which is always declared
        // first by `write_annotation_keys` when a configuration is present.
        let mut config_start = BytesStart::borrowed_name(b"data");
        config_start.push_attribute(("key", "k0"));
        writer.write_event(Event::Start(config_start))?;
        // Pass the text as "already escaped" so it is not escaped again
        // inside the CDATA section.
        writer.write_event(Event::CData(BytesText::from_escaped_str(config)))?;
        writer.write_event(Event::End(BytesEnd::borrowed(b"data")))?;
    }

    progress_callback("exporting nodes");
    write_nodes(graph, &mut writer, &key_id_mapping)?;

    progress_callback("exporting edges");
    write_edges(graph, &mut writer, &key_id_mapping)?;

    writer.write_event(Event::End(BytesEnd::borrowed(b"graph")))?;
    writer.write_event(Event::End(BytesEnd::borrowed(b"graphml")))?;

    // Flush explicitly so that I/O errors are reported instead of being
    // swallowed when the buffered writer is dropped.
    writer.into_inner().flush()?;
    Ok(())
}
/// Registers the annotation key declared by the attributes of a GraphML
/// `<key>` element.
///
/// The `id` attribute is used as map key and the `attr.name` attribute is
/// split into namespace and name with `split_qname` (a missing namespace
/// becomes the empty string). All other attributes are ignored. Nothing is
/// inserted unless both attributes are present.
///
/// # Errors
///
/// Returns an error if an attribute cannot be parsed.
fn add_annotation_key(keys: &mut BTreeMap<String, AnnoKey>, attributes: Attributes) -> Result<()> {
    let mut key_id: Option<String> = None;
    let mut declared_key: Option<AnnoKey> = None;

    for attribute in attributes {
        let attribute = attribute?;
        if attribute.key == b"id" {
            key_id = Some(String::from_utf8_lossy(&attribute.value).into_owned());
        } else if attribute.key == b"attr.name" {
            let qname = String::from_utf8_lossy(&attribute.value);
            let (ns, name) = split_qname(qname.as_ref());
            declared_key = Some(AnnoKey {
                ns: ns.unwrap_or("").into(),
                name: name.into(),
            });
        }
    }

    if let (Some(key_id), Some(declared_key)) = (key_id, declared_key) {
        keys.insert(key_id, declared_key);
    }
    Ok(())
}
fn add_node(
node_updates: &mut GraphUpdate,
current_node_id: &Option<String>,
data: &mut HashMap<AnnoKey, String>,
) -> Result<()> {
if let Some(node_name) = current_node_id {
let node_type = data
.remove(&NODE_TYPE_KEY)
.unwrap_or_else(|| "node".to_string());
node_updates.add_event(UpdateEvent::AddNode {
node_name: node_name.clone(),
node_type,
})?;
for (key, value) in data.drain() {
node_updates.add_event(UpdateEvent::AddNodeLabel {
node_name: node_name.clone(),
anno_ns: key.ns.into(),
anno_name: key.name.into(),
anno_value: value,
})?;
}
}
Ok(())
}
fn add_edge<CT: ComponentType>(
edge_updates: &mut GraphUpdate,
current_source_id: &Option<String>,
current_target_id: &Option<String>,
current_component: &Option<String>,
data: &mut HashMap<AnnoKey, String>,
) -> Result<()> {
if let (Some(source), Some(target), Some(component)) =
(current_source_id, current_target_id, current_component)
{
if let Ok(component) = Component::<CT>::from_str(component) {
edge_updates.add_event(UpdateEvent::AddEdge {
source_node: source.clone(),
target_node: target.clone(),
layer: component.layer.clone().into(),
component_type: component.get_type().to_string(),
component_name: component.name.clone().into(),
})?;
for (key, value) in data.drain() {
edge_updates.add_event(UpdateEvent::AddEdgeLabel {
source_node: source.clone(),
target_node: target.clone(),
layer: component.layer.clone().into(),
component_type: component.get_type().to_string(),
component_name: component.name.clone().into(),
anno_ns: key.ns.into(),
anno_name: key.name.into(),
anno_value: value,
})?;
}
}
}
Ok(())
}
/// Reads a GraphML document from `input` and converts it into update events.
///
/// Node events are appended to `node_updates` and edge events to
/// `edge_updates`. The parser tracks the element depth (`level`) and only
/// evaluates elements at the expected positions:
///
/// * depth 2: `<key>` declarations and the `<graph>` element,
/// * depth 3 (inside `<graph>`): `<node>` and `<edge>` elements and the
///   `<data>` element holding the graph configuration,
/// * depth 4 (inside `<graph>`): the text of `<data>` elements belonging to a
///   node or an edge.
///
/// `progress_callback` is invoked after every 1,000,000 processed nodes and
/// edges.
///
/// Returns the content of the CDATA section of the graph-level `<data>`
/// element with the key `k0` (the graph configuration), if there is one.
///
/// # Errors
///
/// Returns an error if the XML cannot be parsed or if an event cannot be
/// added to one of the update lists.
fn read_graphml<CT: ComponentType, R: std::io::BufRead, F: Fn(&str)>(
    input: &mut R,
    node_updates: &mut GraphUpdate,
    edge_updates: &mut GraphUpdate,
    progress_callback: &F,
) -> Result<Option<String>> {
    let mut reader = Reader::from_reader(input);
    // Report `<node/>` like `<node></node>` so that the `End` handlers below
    // are also triggered for empty elements.
    reader.expand_empty_elements(true);

    // Maps the ID of a `<key>` declaration to the annotation key it declares.
    let mut keys = BTreeMap::new();

    let mut level = 0;
    let mut in_graph = false;
    let mut current_node_id: Option<String> = None;
    let mut current_data_key: Option<String> = None;
    let mut current_source_id: Option<String> = None;
    let mut current_target_id: Option<String> = None;
    let mut current_component: Option<String> = None;
    // Annotations collected for the node or edge that is currently open.
    let mut data: HashMap<AnnoKey, String> = HashMap::new();

    let mut config = None;
    let mut processed_updates = 0;

    let mut buf = Vec::new();
    loop {
        match reader.read_event(&mut buf)? {
            Event::Start(ref e) => {
                level += 1;
                match e.name() {
                    b"graph" => {
                        if level == 2 {
                            in_graph = true;
                        }
                    }
                    b"key" => {
                        if level == 2 {
                            add_annotation_key(&mut keys, e.attributes())?;
                        }
                    }
                    b"node" => {
                        if in_graph && level == 3 {
                            data.clear();
                            for att in e.attributes() {
                                let att = att?;
                                if att.key == b"id" {
                                    current_node_id =
                                        Some(String::from_utf8_lossy(&att.value).to_string());
                                }
                            }
                        }
                    }
                    b"edge" => {
                        if in_graph && level == 3 {
                            data.clear();
                            for att in e.attributes() {
                                let att = att?;
                                if att.key == b"source" {
                                    current_source_id =
                                        Some(String::from_utf8_lossy(&att.value).to_string());
                                } else if att.key == b"target" {
                                    current_target_id =
                                        Some(String::from_utf8_lossy(&att.value).to_string());
                                } else if att.key == b"label" {
                                    // The component is stored in the "label"
                                    // attribute of the edge.
                                    current_component =
                                        Some(String::from_utf8_lossy(&att.value).to_string());
                                }
                            }
                        }
                    }
                    b"data" => {
                        for att in e.attributes() {
                            let att = att?;
                            if att.key == b"key" {
                                current_data_key =
                                    Some(String::from_utf8_lossy(&att.value).to_string());
                            }
                        }
                    }
                    _ => {}
                }
            }
            Event::Text(t) => {
                if let Some(current_data_key) = &current_data_key {
                    // Text of a `<data>` element below a node or an edge;
                    // values of undeclared keys are ignored.
                    if in_graph && level == 4 {
                        if let Some(anno_key) = keys.get(current_data_key) {
                            data.insert(anno_key.clone(), t.unescape_and_decode(&reader)?);
                        }
                    }
                }
            }
            Event::CData(t) => {
                if let Some(current_data_key) = &current_data_key {
                    // The graph configuration is a direct child of `<graph>`
                    // and always uses the first key ID.
                    if in_graph && level == 3 && current_data_key == "k0" {
                        let t_unescaped = t.unescaped()?;
                        config = Some(String::from_utf8_lossy(&t_unescaped).to_string());
                    }
                }
            }
            Event::End(ref e) => {
                match e.name() {
                    b"graph" => {
                        in_graph = false;
                    }
                    b"node" => {
                        add_node(node_updates, &current_node_id, &mut data)?;
                        current_node_id = None;
                        processed_updates += 1;
                        if processed_updates % 1_000_000 == 0 {
                            progress_callback(&format!(
                                "Processed {} GraphML nodes and edges",
                                processed_updates
                            ));
                        }
                    }
                    b"edge" => {
                        add_edge::<CT>(
                            edge_updates,
                            &current_source_id,
                            &current_target_id,
                            &current_component,
                            &mut data,
                        )?;
                        current_source_id = None;
                        current_target_id = None;
                        current_component = None;
                        processed_updates += 1;
                        if processed_updates % 1_000_000 == 0 {
                            progress_callback(&format!(
                                "Processed {} GraphML nodes and edges",
                                processed_updates
                            ));
                        }
                    }
                    b"data" => {
                        current_data_key = None;
                    }
                    _ => {}
                }
                level -= 1;
            }
            Event::Eof => {
                break;
            }
            _ => {}
        }
        // The events borrow from the buffer, release it for the next one.
        buf.clear();
    }
    Ok(config)
}
/// Imports a graph from the GraphML document read from `input`.
///
/// A new graph is created (`disk_based` is passed on to `Graph::new`), all
/// node and edge events from the document are applied to it, and statistics
/// are calculated for the node annotations and for every component, whose
/// graph storage implementation is optimized as well.
///
/// `progress_callback` is invoked with a short message for each import phase.
///
/// Returns the imported graph together with the graph configuration embedded
/// in the document, if there is one.
///
/// # Errors
///
/// Returns an error if the graph cannot be created, the document cannot be
/// parsed, or the updates cannot be applied.
pub fn import<CT: ComponentType, R: Read, F>(
    input: R,
    disk_based: bool,
    progress_callback: F,
) -> Result<(Graph<CT>, Option<String>)>
where
    F: Fn(&str),
{
    let mut buffered_input = BufReader::new(input);
    let mut graph = Graph::new(disk_based)?;
    let mut all_updates = GraphUpdate::default();
    let mut edge_updates = GraphUpdate::default();

    progress_callback("reading GraphML");
    let config = read_graphml::<CT, _, _>(
        &mut buffered_input,
        &mut all_updates,
        &mut edge_updates,
        &progress_callback,
    )?;

    // Edge events are collected separately and appended after all node
    // events, independent of the order of the elements in the document.
    progress_callback("merging generated events");
    for entry in edge_updates.iter()? {
        let (_, edge_event) = entry?;
        all_updates.add_event(edge_event)?;
    }

    progress_callback("applying imported changes");
    graph.apply_update(&mut all_updates, &progress_callback)?;

    progress_callback("calculating node statistics");
    graph.get_node_annos_mut().calculate_statistics()?;

    for component in graph.get_all_components(None, None) {
        progress_callback(&format!(
            "calculating statistics for component {}",
            component
        ));
        graph.calculate_component_statistics(&component)?;
        graph.optimize_gs_impl(&component)?;
    }

    Ok((graph, config))
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        graph::{GraphUpdate, DEFAULT_NS},
        types::DefaultComponentType,
    };
    use std::borrow::Cow;

    /// Graph configuration that is embedded into the exported document. It
    /// contains characters that are special in XML.
    const TEST_CONFIG: &str = r#"[some]
key = "<value>"
[some.another]
value = "test""#;

    /// Exports a graph with two nodes, one node annotation and one edge and
    /// compares the result with the example file.
    #[test]
    fn export_graphml() {
        let events = vec![
            UpdateEvent::AddNode {
                node_name: "first_node".to_string(),
                node_type: "node".to_string(),
            },
            UpdateEvent::AddNode {
                node_name: "second_node".to_string(),
                node_type: "node".to_string(),
            },
            UpdateEvent::AddNodeLabel {
                node_name: "first_node".to_string(),
                anno_ns: DEFAULT_NS.to_string(),
                anno_name: "an_annotation".to_string(),
                anno_value: "something".to_string(),
            },
            UpdateEvent::AddEdge {
                source_node: "first_node".to_string(),
                target_node: "second_node".to_string(),
                component_type: "Edge".to_string(),
                layer: "some_ns".to_string(),
                component_name: "test_component".to_string(),
            },
        ];
        let mut update = GraphUpdate::new();
        for event in events {
            update.add_event(event).unwrap();
        }

        let mut graph: Graph<DefaultComponentType> = Graph::new(false).unwrap();
        graph.apply_update(&mut update, |_| {}).unwrap();

        let mut exported: Vec<u8> = Vec::default();
        export(&graph, Some(TEST_CONFIG), &mut exported, |_| {}).unwrap();

        let expected = include_str!("graphml_example.graphml");
        let actual = String::from_utf8(exported).unwrap();
        assert_eq!(expected, actual);
    }

    /// Imports the example file and checks nodes, annotations, the edge
    /// component and the embedded configuration.
    #[test]
    fn import_graphml() {
        let example = include_str!("graphml_example.graphml");
        let input_xml = std::io::Cursor::new(example.as_bytes().to_vec());
        let (g, config_str) = import(input_xml, false, |_| {}).unwrap();

        let node_annos = g.get_node_annos();
        let first_node_id = g.get_node_id_from_name("first_node").unwrap().unwrap();
        let second_node_id = g.get_node_id_from_name("second_node").unwrap().unwrap();

        let first_node_annos = node_annos.get_annotations_for_item(&first_node_id).unwrap();
        assert_eq!(3, first_node_annos.len());

        let annotation_key = AnnoKey {
            ns: DEFAULT_NS.into(),
            name: "an_annotation".into(),
        };
        assert_eq!(
            Some(Cow::Borrowed("something")),
            node_annos
                .get_value_for_item(&first_node_id, &annotation_key)
                .unwrap()
        );

        let second_node_annos = node_annos
            .get_annotations_for_item(&second_node_id)
            .unwrap();
        assert_eq!(2, second_node_annos.len());

        let component = g.get_all_components(Some(DefaultComponentType::Edge), None);
        assert_eq!(1, component.len());
        assert_eq!("some_ns", component[0].layer);
        assert_eq!("test_component", component[0].name);

        let test_gs = g.get_graphstorage_as_ref(&component[0]).unwrap();
        assert_eq!(
            Some(1),
            test_gs.distance(first_node_id, second_node_id).unwrap()
        );

        assert_eq!(Some(TEST_CONFIG), config_str.as_deref());
    }
}