use std::collections::BTreeSet;
use anyhow::anyhow;
use facet::Facet;
use graphannis::{
aql,
graph::NodeID,
model::{AnnotationComponent, AnnotationComponentType},
update::{GraphUpdate, UpdateEvent},
};
use graphannis_core::graph::{ANNIS_NS, NODE_NAME_KEY, NODE_TYPE_KEY};
use serde::{Deserialize, Serialize};
use crate::util::update_graph;
use super::Manipulator;
/// Manipulator that deletes nodes from an annotation graph depending on the
/// result of an AQL query.
///
/// By default the query acts as a positive filter: nodes of node type `node`
/// that are not matched by the query are deleted. With `inverse` set, the
/// nodes matched by the query are deleted instead. In both modes, nodes that
/// are connected through the unnamed ordering component are never deleted.
///
/// Unknown keys in the configuration are rejected on deserialization.
#[derive(Facet, Deserialize, Serialize, Clone, PartialEq)]
#[serde(deny_unknown_fields)]
pub struct FilterNodes {
/// The query that selects nodes; it is parsed as AQL.
query: String,
/// If `true`, the matching nodes are deleted instead of the non-matching
/// ones. Defaults to `false` when omitted from the configuration.
#[serde(default)]
inverse: bool,
}
impl Manipulator for FilterNodes {
fn manipulate_corpus(
&self,
graph: &mut graphannis::AnnotationGraph,
_workflow_directory: &std::path::Path,
step_id: crate::StepID,
tx: Option<crate::workflow::StatusSender>,
) -> Result<(), Box<dyn std::error::Error>> {
let mut update = GraphUpdate::default();
let query = aql::parse(&self.query, false)?;
let mut matching_nodes = BTreeSet::default();
let node_annos = graph.get_node_annos();
let terminals = {
let mut v = BTreeSet::default();
if let Some(storage) = graph.get_graphstorage(&AnnotationComponent::new(
AnnotationComponentType::Ordering,
ANNIS_NS.into(),
"".into(),
)) {
let roots = storage
.source_nodes()
.flatten()
.filter(|n| !storage.has_ingoing_edges(*n).unwrap_or_default());
for root in roots {
storage
.find_connected(root, 0, std::ops::Bound::Excluded(usize::MAX))
.flatten()
.for_each(|n| {
v.insert(n);
});
}
}
v
};
aql::execute_query_on_graph(graph, &query, true, None)?
.flatten()
.for_each(|group| {
for member in group {
matching_nodes.insert(member.node);
}
});
if self.inverse {
for n in matching_nodes.difference(&terminals) {
if let Some(node_name) = node_annos.get_value_for_item(n, &NODE_NAME_KEY)? {
update.add_event(UpdateEvent::DeleteNode {
node_name: node_name.to_string(),
})?;
} else {
return Err(anyhow!("Node has no name. This is invalid.").into());
}
}
} else {
let max_id = node_annos.get_largest_item()?.unwrap_or(NodeID::MAX);
for n in 0..max_id {
if let Some(node_type) = node_annos.get_value_for_item(&n, &NODE_TYPE_KEY)?
&& !matching_nodes.contains(&n)
&& !terminals.contains(&n)
&& &*node_type == "node"
{
if let Some(node_name) = node_annos.get_value_for_item(&n, &NODE_NAME_KEY)? {
update.add_event(UpdateEvent::DeleteNode {
node_name: node_name.to_string(),
})?;
} else {
return Err(anyhow!("Node has no name. This is invalid.").into());
}
}
}
}
update_graph(graph, &mut update, Some(step_id), tx)?;
Ok(())
}
fn requires_statistics(&self) -> bool {
true
}
}
#[cfg(test)]
mod tests {
    use std::{fs, path::Path};

    use graphannis::{AnnotationGraph, update::GraphUpdate};
    use insta::assert_snapshot;

    use crate::{
        StepID,
        exporter::graphml::GraphMLExporter,
        importer::{GenericImportConfiguration, Importer, exmaralda::ImportEXMARaLDA},
        manipulator::{Manipulator, filter::FilterNodes},
        test_util::export_to_string,
        util::{example_generator, update_graph_silent},
    };

    /// Creates a step identifier without a path for the given module name.
    fn step(module_name: &str) -> StepID {
        StepID {
            module_name: module_name.to_string(),
            path: None,
        }
    }

    /// Imports the EXMARaLDA example corpus into a fresh graph, runs the
    /// `pos=/PRON/` filter in the requested mode and returns the filtered graph.
    fn filter_example_corpus(inverse: bool) -> AnnotationGraph {
        let importer = ImportEXMARaLDA::default();
        let import_result = importer.import_corpus(
            Path::new("tests/data/import/exmaralda/clean/import/exmaralda/"),
            step("test_import_exb"),
            GenericImportConfiguration::new_with_default_extensions(&importer),
            None,
        );
        assert!(import_result.is_ok());
        let mut import_update = import_result.unwrap();

        let empty_graph = AnnotationGraph::with_default_graphstorages(true);
        assert!(empty_graph.is_ok());
        let mut graph = empty_graph.unwrap();
        assert!(graph.apply_update(&mut import_update, |_| {}).is_ok());

        let filter = FilterNodes {
            query: "pos=/PRON/".to_string(),
            inverse,
        };
        let outcome =
            filter.manipulate_corpus(&mut graph, Path::new("./"), step("test_filter"), None);
        assert!(outcome.is_ok());
        graph
    }

    /// The module configuration can be serialized to TOML.
    #[test]
    fn serialize_custom() {
        let module = FilterNodes {
            query: "tok _=_ pos=/NOUN|PROPN/".to_string(),
            inverse: true,
        };
        let serialization = toml::to_string(&module);
        assert!(
            serialization.is_ok(),
            "Serialization failed: {:?}",
            serialization.err()
        );
        // The expression is kept as is, it is recorded with the snapshot.
        assert_snapshot!(serialization.unwrap());
    }

    /// Validating a graph with this module makes global statistics available.
    #[test]
    fn graph_statistics() {
        let fresh_graph = AnnotationGraph::with_default_graphstorages(false);
        assert!(fresh_graph.is_ok());
        let mut graph = fresh_graph.unwrap();

        let mut corpus_update = GraphUpdate::default();
        example_generator::create_corpus_structure_simple(&mut corpus_update);
        assert!(update_graph_silent(&mut graph, &mut corpus_update).is_ok());

        let module = FilterNodes {
            query: "node".to_string(),
            inverse: false,
        };
        let validation = module.validate_graph(&mut graph, step("test"), None);
        assert!(validation.is_ok());
        assert!(graph.global_statistics.is_some());
    }

    /// Regular mode: the exported result is compared against the snapshot.
    #[test]
    fn default() {
        let graph = filter_example_corpus(false);
        let export = export_to_string(&graph, GraphMLExporter::default());
        assert!(export.is_ok(), "error: {:?}", export.err());
        assert_snapshot!(export.unwrap());
    }

    /// Inverse mode: the exported result is compared against the snapshot.
    #[test]
    fn inverse() {
        let graph = filter_example_corpus(true);
        let export = export_to_string(&graph, GraphMLExporter::default());
        assert!(export.is_ok(), "error: {:?}", export.err());
        assert_snapshot!(export.unwrap());
    }

    /// The example configuration file can be deserialized.
    #[test]
    fn deserialize() {
        let config_path = Path::new("./tests/data/graph_op/filter/deserialize.toml");
        // An unreadable file yields an empty string, which then fails to deserialize.
        let toml_str = fs::read_to_string(config_path).unwrap_or_default();
        let filter_nodes: Result<FilterNodes, _> = toml::from_str(toml_str.as_str());
        assert!(filter_nodes.is_ok(), "error: {:?}", filter_nodes.err());
    }
}