use anyhow::{Context, Result, anyhow};
use graphannis::{AnnotationGraph, graph::GraphStorage, model::AnnotationComponentType};
use graphannis_core::{
annostorage::NodeAnnotationStorage,
dfs::CycleSafeDFS,
graph::{ANNIS_NS, DEFAULT_NS, storage::union::UnionEdgeContainer},
types::{AnnoKey, Component, NodeID},
};
use itertools::Itertools;
use lazy_static::lazy_static;
use std::{
cmp::Ordering,
collections::{BTreeMap, BTreeSet, HashSet},
sync::Arc,
};
#[derive(Clone)]
pub struct TokenHelper<'a> {
node_annos: &'a dyn NodeAnnotationStorage,
cov_edges: Vec<Arc<dyn GraphStorage>>,
ordering_gs: BTreeMap<String, Arc<dyn GraphStorage>>,
part_of_gs: Arc<dyn GraphStorage>,
}
lazy_static! {
static ref COMPONENT_LEFT: Component<AnnotationComponentType> = {
Component::new(
AnnotationComponentType::LeftToken,
ANNIS_NS.into(),
"".into(),
)
};
static ref COMPONENT_RIGHT: Component<AnnotationComponentType> = {
Component::new(
AnnotationComponentType::RightToken,
ANNIS_NS.into(),
"".into(),
)
};
pub static ref TOKEN_KEY: Arc<AnnoKey> = Arc::from(AnnoKey {
ns: ANNIS_NS.into(),
name: "tok".into(),
});
}
impl<'a> TokenHelper<'a> {
pub fn new(graph: &'a AnnotationGraph) -> anyhow::Result<TokenHelper<'a>> {
let cov_edges: Vec<Arc<dyn GraphStorage>> = graph
.get_all_components(Some(AnnotationComponentType::Coverage), None)
.into_iter()
.filter_map(|c| graph.get_graphstorage(&c))
.filter(|gs| {
if let Some(stats) = gs.get_statistics() {
stats.nodes > 0
} else {
true
}
})
.collect();
let mut ordering_gs = BTreeMap::new();
for c in graph.get_all_components(Some(AnnotationComponentType::Ordering), None) {
if let Some(gs) = graph.get_graphstorage(&c) {
ordering_gs.insert(c.name.to_string(), gs);
}
}
let part_of_component =
Component::new(AnnotationComponentType::PartOf, ANNIS_NS.into(), "".into());
let part_of_gs = graph
.get_graphstorage(&part_of_component)
.ok_or_else(|| anyhow!("Missing PartOf component"))?;
Ok(TokenHelper {
node_annos: graph.get_node_annos(),
cov_edges,
ordering_gs,
part_of_gs,
})
}
pub fn is_token(&self, id: NodeID) -> anyhow::Result<bool> {
if self.node_annos.has_value_for_item(&id, &TOKEN_KEY)? {
let has_outgoing = self.has_outgoing_coverage_edges(id)?;
Ok(!has_outgoing)
} else {
Ok(false)
}
}
pub fn is_segmentation_token(&self, id: NodeID, seg: &str) -> anyhow::Result<bool> {
if self.node_annos.has_value_for_item(&id, &TOKEN_KEY)? {
let mut part_of_seg_component = false;
let mut ordering_component_is_empty = true;
if let Some(gs_ordering) = self.ordering_gs.get(seg) {
part_of_seg_component =
gs_ordering.has_outgoing_edges(id)? || gs_ordering.has_ingoing_edges(id)?;
ordering_component_is_empty = gs_ordering.source_nodes().next().is_none();
}
let has_seg_label = !self
.node_annos
.get_all_keys_for_item(&id, None, Some(seg))?
.is_empty();
return Ok(part_of_seg_component || (has_seg_label && ordering_component_is_empty));
}
Ok(false)
}
pub fn get_segmentation_name(&self, id: NodeID) -> anyhow::Result<Option<String>> {
if self.node_annos.has_value_for_item(&id, &TOKEN_KEY)? {
for (seg, gs_ordering) in self.ordering_gs.iter() {
if !seg.is_empty()
&& (gs_ordering.has_outgoing_edges(id)? || gs_ordering.has_ingoing_edges(id)?)
{
return Ok(Some(seg.to_string()));
}
}
if self.has_outgoing_coverage_edges(id)? {
let anno_keys: BTreeSet<AnnoKey> = self
.node_annos
.get_all_keys_for_item(&id, None, None)?
.into_iter()
.map(|k| k.as_ref().clone())
.collect();
let potential_seg_names: Vec<_> = anno_keys
.iter()
.filter(|key| key.ns != ANNIS_NS)
.map(|key| key.name.clone())
.collect();
if potential_seg_names.len() == 1 {
return Ok(Some(potential_seg_names[0].to_string()));
}
for seg in potential_seg_names.iter() {
let key_with_same_namespace = AnnoKey {
ns: seg.clone(),
name: seg.clone(),
};
if anno_keys.contains(&key_with_same_namespace) {
return Ok(Some(seg.to_string()));
}
}
for seg in potential_seg_names.iter() {
let key_with_annis_namespace = AnnoKey {
ns: ANNIS_NS.into(),
name: seg.clone(),
};
if anno_keys.contains(&key_with_annis_namespace) {
return Ok(Some(seg.to_string()));
}
}
for seg in potential_seg_names.iter() {
let key_with_default_namespace = AnnoKey {
ns: DEFAULT_NS.into(),
name: seg.clone(),
};
if anno_keys.contains(&key_with_default_namespace) {
return Ok(Some(seg.to_string()));
}
}
for seg in potential_seg_names.iter() {
let key_with_empty_namespace = AnnoKey {
ns: "".into(),
name: seg.clone(),
};
if anno_keys.contains(&key_with_empty_namespace) {
return Ok(Some(seg.to_string()));
}
}
}
}
Ok(None)
}
pub fn has_outgoing_coverage_edges(&self, id: NodeID) -> anyhow::Result<bool> {
for c in self.cov_edges.iter() {
if c.has_outgoing_edges(id)? {
return Ok(true);
}
}
Ok(false)
}
pub fn get_ordered_token(
&self,
parent_name: &str,
segmentation: Option<&str>,
) -> Result<Vec<NodeID>> {
let parent_id = self.node_annos.get_node_id_from_name(parent_name)?;
let segmentation = segmentation.unwrap_or("");
let ordering_gs = &self
.ordering_gs
.get(segmentation)
.ok_or_else(|| anyhow!("Missing ordering component for segmentation {segmentation}"))?;
let mut roots: HashSet<_> = HashSet::new();
for n in ordering_gs.source_nodes() {
let n = n?;
if !ordering_gs.has_ingoing_edges(n)? {
if let Some(parent_id) = parent_id {
if self
.part_of_gs
.is_connected(n, parent_id, 1, std::ops::Bound::Unbounded)?
{
roots.insert(n);
}
} else {
roots.insert(n);
}
}
}
if roots.is_empty() {
if let Some(parent_id) = parent_id {
for connected in
self.part_of_gs
.find_connected_inverse(parent_id, 1, std::ops::Bound::Unbounded)
{
let connected = connected?;
if self.is_token(connected)? {
roots.insert(connected);
}
}
} else {
for n in self.node_annos.exact_anno_search(
Some(ANNIS_NS),
"tok",
graphannis_core::annostorage::ValueSearch::Any,
) {
let n = n?.node;
if !self.has_outgoing_coverage_edges(n)? {
roots.insert(n);
}
}
}
}
let mut result = Vec::default();
for r in roots {
let mut token = Some(r);
while let Some(current_token) = token {
result.push(current_token);
if let Some(next_token) = ordering_gs.get_outgoing_edges(current_token).next() {
let next_token = next_token?;
token = Some(next_token);
} else {
token = None;
}
}
}
Ok(result)
}
#[cfg(test)]
pub fn spanned_text(&self, token_ids: &[NodeID]) -> Result<String> {
use graphannis_core::errors::GraphAnnisCoreError;
use itertools::Itertools;
let anno_values: std::result::Result<Vec<_>, GraphAnnisCoreError> = token_ids
.iter()
.map(|t| self.node_annos.get_value_for_item(t, &TOKEN_KEY))
.collect();
let anno_values = anno_values?.into_iter().flatten().collect_vec();
let result = anno_values.join(" ");
Ok(result)
}
pub fn covered_token(&self, node_id: NodeID) -> Result<Vec<NodeID>> {
let mut result = Vec::default();
let coverage = UnionEdgeContainer::new(
self.cov_edges
.iter()
.map(|gs| gs.as_edgecontainer())
.collect_vec(),
);
let it = CycleSafeDFS::new(&coverage, node_id, 1, usize::MAX);
for step in it {
let step = step?;
if self.is_token(step.node)? {
result.push(step.node);
}
}
self.sort_token(&mut result, None)?;
Ok(result)
}
pub fn get_ordering_gs(&self, segmentation: Option<&str>) -> Option<Arc<dyn GraphStorage>> {
self.ordering_gs
.get(segmentation.unwrap_or_default())
.cloned()
}
pub fn get_coverage_gs(&self) -> &[Arc<dyn GraphStorage>] {
&self.cov_edges
}
pub fn sort_token(&self, token_ids: &mut [NodeID], segmentation: Option<&str>) -> Result<()> {
if let Some(gs) = self.ordering_gs.get(segmentation.unwrap_or_default()) {
token_ids.sort_by(|a, b| {
if a == b {
Ordering::Equal
} else if let Ok(connected) = gs.is_connected(*a, *b, 1, std::ops::Bound::Unbounded)
{
if connected {
Ordering::Less
} else {
Ordering::Greater
}
} else {
Ordering::Less
}
});
}
Ok(())
}
pub fn get_token_before(
&self,
node_id: NodeID,
segmentation: Option<&str>,
) -> Result<Option<NodeID>> {
let covered_token = if self.is_token(node_id)? {
vec![node_id]
} else {
self.covered_token(node_id)?
};
if let Some(first_covered_token) = covered_token.first() {
let gs_tok = self
.ordering_gs
.get("")
.context("Missing base token graph storage component")?;
let dfs = CycleSafeDFS::new_inverse(
gs_tok.as_edgecontainer(),
*first_covered_token,
1,
usize::MAX,
);
for step in dfs {
let step = step?;
let token_before = step.node;
if let Some(segmentation) = segmentation {
let mut segmentation_nodes = Vec::new();
for gs_cov in &self.cov_edges {
for n in gs_cov.get_ingoing_edges(token_before) {
let n = n?;
if self.is_segmentation_token(n, segmentation)? {
segmentation_nodes.push(n);
}
}
}
if !segmentation_nodes.is_empty() {
self.sort_token(&mut segmentation_nodes, Some(segmentation))?;
return Ok(segmentation_nodes.last().copied());
}
} else {
return Ok(Some(token_before));
}
}
Ok(None)
} else {
Ok(None)
}
}
pub fn get_token_after(
&self,
node_id: NodeID,
segmentation: Option<&str>,
) -> Result<Option<NodeID>> {
let covered_token = if self.is_token(node_id)? {
vec![node_id]
} else {
self.covered_token(node_id)?
};
if let Some(last_covered_token) = covered_token.last() {
let gs_tok = self
.ordering_gs
.get("")
.context("Missing base token graph storage component")?;
let dfs = CycleSafeDFS::new(
gs_tok.as_edgecontainer(),
*last_covered_token,
1,
usize::MAX,
);
for step in dfs {
let step = step?;
let token_after = step.node;
if let Some(segmentation) = segmentation {
let mut segmentation_nodes = Vec::new();
for gs_cov in &self.cov_edges {
for n in gs_cov.get_ingoing_edges(token_after) {
let n = n?;
if self.is_segmentation_token(n, segmentation)? {
segmentation_nodes.push(n);
}
}
}
if !segmentation_nodes.is_empty() {
self.sort_token(&mut segmentation_nodes, Some(segmentation))?;
return Ok(segmentation_nodes.first().copied());
}
} else {
return Ok(Some(token_after));
}
}
}
Ok(None)
}
pub(crate) fn continuous_segmentation_spans(
&self,
nodes: &[NodeID],
segmentation: Option<&str>,
) -> anyhow::Result<Vec<(NodeID, NodeID)>> {
let mut token_ids = Vec::with_capacity(nodes.len());
for n in nodes {
if let Some(seg) = segmentation {
if self.is_segmentation_token(*n, seg)? {
token_ids.push(*n);
}
} else if self.is_token(*n)? {
token_ids.push(*n);
}
}
self.sort_token(&mut token_ids, segmentation)?;
let segmentation = segmentation.unwrap_or("");
let ordering_gs = &self.ordering_gs.get(segmentation).ok_or_else(|| {
anyhow!("Missing ordering component for segmentation '{segmentation}'")
})?;
let mut spans: Vec<_> = token_ids.into_iter().map(|n| (n, n)).collect();
let mut idx = 0;
while let Some(current_span) = spans.get(idx).copied()
&& let Some(next_span) = spans.get(idx + 1).copied()
{
let outgoing: Result<Vec<_>, _> =
ordering_gs.get_outgoing_edges(current_span.1).collect();
if outgoing?.into_iter().any(|o| o == next_span.0) {
spans.remove(idx);
spans[idx].0 = current_span.0;
spans[idx].1 = next_span.1;
} else {
idx += 1;
}
}
Ok(spans)
}
}
#[cfg(test)]
mod tests;