annatomic 0.4.0

use anyhow::{Context, Result, anyhow};
use graphannis::{AnnotationGraph, graph::GraphStorage, model::AnnotationComponentType};
use graphannis_core::{
    annostorage::NodeAnnotationStorage,
    dfs::CycleSafeDFS,
    graph::{ANNIS_NS, DEFAULT_NS, storage::union::UnionEdgeContainer},
    types::{AnnoKey, Component, NodeID},
};

use itertools::Itertools;
use lazy_static::lazy_static;
use std::{
    cmp::Ordering,
    collections::{BTreeMap, BTreeSet, HashSet},
    sync::Arc,
};

#[derive(Clone)]
pub struct TokenHelper<'a> {
    node_annos: &'a dyn NodeAnnotationStorage,
    cov_edges: Vec<Arc<dyn GraphStorage>>,
    ordering_gs: BTreeMap<String, Arc<dyn GraphStorage>>,
    part_of_gs: Arc<dyn GraphStorage>,
}

lazy_static! {
    static ref COMPONENT_LEFT: Component<AnnotationComponentType> = {
        Component::new(
            AnnotationComponentType::LeftToken,
            ANNIS_NS.into(),
            "".into(),
        )
    };
    static ref COMPONENT_RIGHT: Component<AnnotationComponentType> = {
        Component::new(
            AnnotationComponentType::RightToken,
            ANNIS_NS.into(),
            "".into(),
        )
    };
    pub static ref TOKEN_KEY: Arc<AnnoKey> = Arc::from(AnnoKey {
        ns: ANNIS_NS.into(),
        name: "tok".into(),
    });
}

impl<'a> TokenHelper<'a> {
    pub fn new(graph: &'a AnnotationGraph) -> anyhow::Result<TokenHelper<'a>> {
        let cov_edges: Vec<Arc<dyn GraphStorage>> = graph
            .get_all_components(Some(AnnotationComponentType::Coverage), None)
            .into_iter()
            .filter_map(|c| graph.get_graphstorage(&c))
            .filter(|gs| {
                if let Some(stats) = gs.get_statistics() {
                    stats.nodes > 0
                } else {
                    true
                }
            })
            .collect();
        let mut ordering_gs = BTreeMap::new();

        for c in graph.get_all_components(Some(AnnotationComponentType::Ordering), None) {
            if let Some(gs) = graph.get_graphstorage(&c) {
                ordering_gs.insert(c.name.to_string(), gs);
            }
        }

        let part_of_component =
            Component::new(AnnotationComponentType::PartOf, ANNIS_NS.into(), "".into());
        let part_of_gs = graph
            .get_graphstorage(&part_of_component)
            .ok_or_else(|| anyhow!("Missing PartOf component"))?;

        Ok(TokenHelper {
            node_annos: graph.get_node_annos(),
            cov_edges,
            ordering_gs,
            part_of_gs,
        })
    }

    pub fn is_token(&self, id: NodeID) -> anyhow::Result<bool> {
        if self.node_annos.has_value_for_item(&id, &TOKEN_KEY)? {
            // check if there is no outgoing edge in any of the coverage components
            let has_outgoing = self.has_outgoing_coverage_edges(id)?;
            Ok(!has_outgoing)
        } else {
            Ok(false)
        }
    }

    pub fn is_segmentation_token(&self, id: NodeID, seg: &str) -> anyhow::Result<bool> {
        if self.node_annos.has_value_for_item(&id, &TOKEN_KEY)? {
            let mut part_of_seg_component = false;
            let mut ordering_component_is_empty = true;
            if let Some(gs_ordering) = self.ordering_gs.get(seg) {
                part_of_seg_component =
                    gs_ordering.has_outgoing_edges(id)? || gs_ordering.has_ingoing_edges(id)?;
                ordering_component_is_empty = gs_ordering.source_nodes().next().is_none();
            }
            let has_seg_label = !self
                .node_annos
                .get_all_keys_for_item(&id, None, Some(seg))?
                .is_empty();
            return Ok(part_of_seg_component || (has_seg_label && ordering_component_is_empty));
        }
        Ok(false)
    }

    /// Returns the name of the segmentation component if the given node is a segmentation node.
    pub fn get_segmentation_name(&self, id: NodeID) -> anyhow::Result<Option<String>> {
        if self.node_annos.has_value_for_item(&id, &TOKEN_KEY)? {
            for (seg, gs_ordering) in self.ordering_gs.iter() {
                if !seg.is_empty()
                    && (gs_ordering.has_outgoing_edges(id)? || gs_ordering.has_ingoing_edges(id)?)
                {
                    return Ok(Some(seg.to_string()));
                }
            }
            // If the node is not part of any known ordering components, it
            // might be a single node not connected to another one.
            // Still, it needs to cover a base token to be a valid segmentation token.
            if self.has_outgoing_coverage_edges(id)? {
                let anno_keys: BTreeSet<AnnoKey> = self
                    .node_annos
                    .get_all_keys_for_item(&id, None, None)?
                    .into_iter()
                    .map(|k| k.as_ref().clone())
                    .collect();
                let potential_seg_names: Vec<_> = anno_keys
                    .iter()
                    .filter(|key| key.ns != ANNIS_NS)
                    .map(|key| key.name.clone())
                    .collect();
                // If there is only one key, directly return it.
                if potential_seg_names.len() == 1 {
                    return Ok(Some(potential_seg_names[0].to_string()));
                }
                // If there is more than one annotation for the node, it is ambigous which of these is the segmentation node.
                // There are different strategies that identify segmentation labels and wir try these in order.

                // Check if namespace matches the segmentation name.
                for seg in potential_seg_names.iter() {
                    let key_with_same_namespace = AnnoKey {
                        ns: seg.clone(),
                        name: seg.clone(),
                    };
                    if anno_keys.contains(&key_with_same_namespace) {
                        return Ok(Some(seg.to_string()));
                    }
                }
                // Check if there is a key with the segmentation name and the "annis" namespace
                for seg in potential_seg_names.iter() {
                    let key_with_annis_namespace = AnnoKey {
                        ns: ANNIS_NS.into(),
                        name: seg.clone(),
                    };
                    if anno_keys.contains(&key_with_annis_namespace) {
                        return Ok(Some(seg.to_string()));
                    }
                }
                // Check if there is a key with the segmentation name and the "default_ns" namespace
                for seg in potential_seg_names.iter() {
                    let key_with_default_namespace = AnnoKey {
                        ns: DEFAULT_NS.into(),
                        name: seg.clone(),
                    };
                    if anno_keys.contains(&key_with_default_namespace) {
                        return Ok(Some(seg.to_string()));
                    }
                }
                // Check if there is a key with the segmentation name and an empty namespace
                for seg in potential_seg_names.iter() {
                    let key_with_empty_namespace = AnnoKey {
                        ns: "".into(),
                        name: seg.clone(),
                    };
                    if anno_keys.contains(&key_with_empty_namespace) {
                        return Ok(Some(seg.to_string()));
                    }
                }
                // Do not return any value if this is still ambiguous
            }
        }
        Ok(None)
    }

    pub fn has_outgoing_coverage_edges(&self, id: NodeID) -> anyhow::Result<bool> {
        for c in self.cov_edges.iter() {
            if c.has_outgoing_edges(id)? {
                return Ok(true);
            }
        }
        Ok(false)
    }

    pub fn get_ordered_token(
        &self,
        parent_name: &str,
        segmentation: Option<&str>,
    ) -> Result<Vec<NodeID>> {
        let parent_id = self.node_annos.get_node_id_from_name(parent_name)?;
        let segmentation = segmentation.unwrap_or("");
        let ordering_gs = &self
            .ordering_gs
            .get(segmentation)
            .ok_or_else(|| anyhow!("Missing ordering component for segmentation {segmentation}"))?;

        // Find all token roots
        let mut roots: HashSet<_> = HashSet::new();
        for n in ordering_gs.source_nodes() {
            let n = n?;
            if !ordering_gs.has_ingoing_edges(n)? {
                // Filter the roots by checking the parent node in the corpus structure
                if let Some(parent_id) = parent_id {
                    if self
                        .part_of_gs
                        .is_connected(n, parent_id, 1, std::ops::Bound::Unbounded)?
                    {
                        roots.insert(n);
                    }
                } else {
                    roots.insert(n);
                }
            }
        }

        if roots.is_empty() {
            if let Some(parent_id) = parent_id {
                // Get all nodes that are part of this parent and check for the
                // annis:tok label and no outgoing coverage edges
                for connected in
                    self.part_of_gs
                        .find_connected_inverse(parent_id, 1, std::ops::Bound::Unbounded)
                {
                    let connected = connected?;
                    if self.is_token(connected)? {
                        roots.insert(connected);
                    }
                }
            } else {
                // Find all token that have the "annis:tok" label
                for n in self.node_annos.exact_anno_search(
                    Some(ANNIS_NS),
                    "tok",
                    graphannis_core::annostorage::ValueSearch::Any,
                ) {
                    let n = n?.node;
                    if !self.has_outgoing_coverage_edges(n)? {
                        roots.insert(n);
                    }
                }
            }
        }

        // Follow the ordering edges from the roots to reconstruct the token in their correct order
        let mut result = Vec::default();
        for r in roots {
            let mut token = Some(r);
            while let Some(current_token) = token {
                result.push(current_token);
                // Get next token
                if let Some(next_token) = ordering_gs.get_outgoing_edges(current_token).next() {
                    let next_token = next_token?;
                    token = Some(next_token);
                } else {
                    token = None;
                }
            }
        }

        Ok(result)
    }

    #[cfg(test)]
    pub fn spanned_text(&self, token_ids: &[NodeID]) -> Result<String> {
        use graphannis_core::errors::GraphAnnisCoreError;
        use itertools::Itertools;

        let anno_values: std::result::Result<Vec<_>, GraphAnnisCoreError> = token_ids
            .iter()
            .map(|t| self.node_annos.get_value_for_item(t, &TOKEN_KEY))
            .collect();
        // TODO: support whitespace after/before annotations
        let anno_values = anno_values?.into_iter().flatten().collect_vec();
        let result = anno_values.join(" ");
        Ok(result)
    }

    /// Find all token covered by the given node and sort the result according to the token order.
    pub fn covered_token(&self, node_id: NodeID) -> Result<Vec<NodeID>> {
        let mut result = Vec::default();
        let coverage = UnionEdgeContainer::new(
            self.cov_edges
                .iter()
                .map(|gs| gs.as_edgecontainer())
                .collect_vec(),
        );
        let it = CycleSafeDFS::new(&coverage, node_id, 1, usize::MAX);
        for step in it {
            let step = step?;
            if self.is_token(step.node)? {
                result.push(step.node);
            }
        }

        // Sort token by their order
        self.sort_token(&mut result, None)?;
        Ok(result)
    }

    pub fn get_ordering_gs(&self, segmentation: Option<&str>) -> Option<Arc<dyn GraphStorage>> {
        self.ordering_gs
            .get(segmentation.unwrap_or_default())
            .cloned()
    }

    pub fn get_coverage_gs(&self) -> &[Arc<dyn GraphStorage>] {
        &self.cov_edges
    }

    pub fn sort_token(&self, token_ids: &mut [NodeID], segmentation: Option<&str>) -> Result<()> {
        if let Some(gs) = self.ordering_gs.get(segmentation.unwrap_or_default()) {
            token_ids.sort_by(|a, b| {
                if a == b {
                    Ordering::Equal
                } else if let Ok(connected) = gs.is_connected(*a, *b, 1, std::ops::Bound::Unbounded)
                {
                    if connected {
                        Ordering::Less
                    } else {
                        Ordering::Greater
                    }
                } else {
                    Ordering::Less
                }
            });
        }
        Ok(())
    }

    /// Gets the token that comes before the given `node_id`. If a
    /// `segmentation` is given, this will not be a base token but a
    /// segmentation node that comes directly before the given node.
    pub fn get_token_before(
        &self,
        node_id: NodeID,
        segmentation: Option<&str>,
    ) -> Result<Option<NodeID>> {
        // Get all sorted covered token for this node
        let covered_token = if self.is_token(node_id)? {
            vec![node_id]
        } else {
            self.covered_token(node_id)?
        };

        // Find the token node before the left-most covered token
        if let Some(first_covered_token) = covered_token.first() {
            let gs_tok = self
                .ordering_gs
                .get("")
                .context("Missing base token graph storage component")?;
            let dfs = CycleSafeDFS::new_inverse(
                gs_tok.as_edgecontainer(),
                *first_covered_token,
                1,
                usize::MAX,
            );
            for step in dfs {
                let step = step?;
                let token_before = step.node;

                if let Some(segmentation) = segmentation {
                    // If a segmentation node is requested as result, find the one covering this token
                    let mut segmentation_nodes = Vec::new();
                    for gs_cov in &self.cov_edges {
                        for n in gs_cov.get_ingoing_edges(token_before) {
                            let n = n?;
                            if self.is_segmentation_token(n, segmentation)? {
                                segmentation_nodes.push(n);
                            }
                        }
                    }

                    if !segmentation_nodes.is_empty() {
                        self.sort_token(&mut segmentation_nodes, Some(segmentation))?;
                        return Ok(segmentation_nodes.last().copied());
                    }
                } else {
                    return Ok(Some(token_before));
                }
            }

            Ok(None)
        } else {
            Ok(None)
        }
    }

    /// Gets the token that comes after the given `node_id`. If a `segmentation`
    /// is given, this will not be a base token but a segmentation node that
    /// comes directly after the given node.
    pub fn get_token_after(
        &self,
        node_id: NodeID,
        segmentation: Option<&str>,
    ) -> Result<Option<NodeID>> {
        // Get all sorted covered token for this node
        let covered_token = if self.is_token(node_id)? {
            vec![node_id]
        } else {
            self.covered_token(node_id)?
        };

        // Find the token node after the right-most covered token
        if let Some(last_covered_token) = covered_token.last() {
            let gs_tok = self
                .ordering_gs
                .get("")
                .context("Missing base token graph storage component")?;

            let dfs = CycleSafeDFS::new(
                gs_tok.as_edgecontainer(),
                *last_covered_token,
                1,
                usize::MAX,
            );
            for step in dfs {
                let step = step?;
                let token_after = step.node;

                if let Some(segmentation) = segmentation {
                    // If a segmentation node is requested as result, find the one covering this token
                    let mut segmentation_nodes = Vec::new();
                    for gs_cov in &self.cov_edges {
                        for n in gs_cov.get_ingoing_edges(token_after) {
                            let n = n?;
                            if self.is_segmentation_token(n, segmentation)? {
                                segmentation_nodes.push(n);
                            }
                        }
                    }
                    if !segmentation_nodes.is_empty() {
                        self.sort_token(&mut segmentation_nodes, Some(segmentation))?;
                        return Ok(segmentation_nodes.first().copied());
                    }
                } else {
                    return Ok(Some(token_after));
                }
            }
        }
        Ok(None)
    }

    /// Creates a sequence of start/end pairs of token nodes that are adjacent to each other.
    pub(crate) fn continuous_segmentation_spans(
        &self,
        nodes: &[NodeID],
        segmentation: Option<&str>,
    ) -> anyhow::Result<Vec<(NodeID, NodeID)>> {
        // Filter the and sort the given node IDs, to include only the ones that belong to gigven segmentation;
        let mut token_ids = Vec::with_capacity(nodes.len());
        for n in nodes {
            if let Some(seg) = segmentation {
                if self.is_segmentation_token(*n, seg)? {
                    token_ids.push(*n);
                }
            } else if self.is_token(*n)? {
                token_ids.push(*n);
            }
        }
        self.sort_token(&mut token_ids, segmentation)?;

        // Create an inital sequence of spans, where all elements are a span item
        let segmentation = segmentation.unwrap_or("");
        let ordering_gs = &self.ordering_gs.get(segmentation).ok_or_else(|| {
            anyhow!("Missing ordering component for segmentation '{segmentation}'")
        })?;
        let mut spans: Vec<_> = token_ids.into_iter().map(|n| (n, n)).collect();
        let mut idx = 0;
        while let Some(current_span) = spans.get(idx).copied()
            && let Some(next_span) = spans.get(idx + 1).copied()
        {
            // check if the current and next span are adjacent and should be joined
            let outgoing: Result<Vec<_>, _> =
                ordering_gs.get_outgoing_edges(current_span.1).collect();
            if outgoing?.into_iter().any(|o| o == next_span.0) {
                // Merge boths spans
                spans.remove(idx);
                spans[idx].0 = current_span.0;
                spans[idx].1 = next_span.1;
            } else {
                idx += 1;
            }
        }
        Ok(spans)
    }
}

#[cfg(test)]
mod tests;