use crate::{
annis::{db::token_helper::TokenHelper, util::quicksort},
errors::GraphAnnisError,
graph::{Edge, EdgeContainer, GraphStorage, NodeID},
};
use facet::Facet;
use graphannis_core::{
annostorage::ValueSearch,
dfs::CycleSafeDFS,
errors::ComponentTypeError,
graph::{ANNIS_NS, NODE_TYPE_KEY, storage::union::UnionEdgeContainer},
types::ComponentType,
util::disk_collections::{DEFAULT_BLOCK_CACHE_CAPACITY, DiskMap, EvictionStrategy},
};
use std::{
collections::{BTreeMap, HashMap},
fmt,
};
use transient_btree_index::BtreeConfig;
use std::borrow::Cow;
use std::{str::FromStr, sync::Arc};
use strum::IntoEnumIterator;
use strum_macros::{EnumIter, EnumString};
use crate::{AnnotationGraph, update::UpdateEvent};
use rustc_hash::FxHashSet;
use crate::{
Graph,
graph::{AnnoKey, Component},
model::AnnotationComponent,
};
pub const TOK: &str = "tok";
pub const TOK_WHITESPACE_BEFORE: &str = "tok-whitespace-before";
pub const TOK_WHITESPACE_AFTER: &str = "tok-whitespace-after";
lazy_static! {
pub static ref TOKEN_KEY: Arc<AnnoKey> = Arc::from(AnnoKey {
ns: ANNIS_NS.into(),
name: TOK.into(),
});
}
#[derive(
Facet,
Serialize,
Deserialize,
Eq,
PartialEq,
PartialOrd,
Ord,
Hash,
Clone,
Debug,
EnumIter,
EnumString,
)]
#[repr(C)]
pub enum AnnotationComponentType {
Coverage,
Dominance = 2,
Pointing,
Ordering,
LeftToken,
RightToken,
PartOf,
}
impl From<AnnotationComponentType> for u16 {
fn from(t: AnnotationComponentType) -> Self {
t as u16
}
}
impl From<u16> for AnnotationComponentType {
fn from(idx: u16) -> AnnotationComponentType {
match idx {
0 => AnnotationComponentType::Coverage,
2 => AnnotationComponentType::Dominance,
3 => AnnotationComponentType::Pointing,
4 => AnnotationComponentType::Ordering,
5 => AnnotationComponentType::LeftToken,
6 => AnnotationComponentType::RightToken,
7 => AnnotationComponentType::PartOf,
_ => AnnotationComponentType::Pointing,
}
}
}
#[non_exhaustive]
#[derive(Serialize, Deserialize, Clone, Debug, Default)]
pub enum CorpusSize {
#[default]
Unknown,
Token {
base_token_count: u64,
segmentation_count: BTreeMap<String, u64>,
},
}
#[derive(Serialize, Deserialize, Clone, Debug, Default)]
pub struct AQLGlobalStatistics {
#[serde(default)]
pub all_token_in_order_component: bool,
#[serde(default)]
pub corpus_size: CorpusSize,
}
fn calculate_inherited_coverage_edges(
graph: &mut AnnotationGraph,
n: NodeID,
other_cov_gs: &[Arc<dyn GraphStorage>],
all_text_coverage_components: &[AnnotationComponent],
inherited_cov_component: &AnnotationComponent,
) -> std::result::Result<FxHashSet<NodeID>, ComponentTypeError> {
let all_text_cov_components_gs: Vec<_> = all_text_coverage_components
.iter()
.filter_map(|c| graph.get_graphstorage_as_ref(c))
.map(|gs| gs.as_edgecontainer())
.collect();
let all_text_cov_components_combined = UnionEdgeContainer::new(all_text_cov_components_gs);
let mut covered_token = FxHashSet::default();
{
let tok_helper = TokenHelper::new(graph)?;
for step in CycleSafeDFS::new(&all_text_cov_components_combined, n, 1, usize::MAX) {
let step = step?;
if tok_helper.is_token(step.node)? {
covered_token.insert(step.node);
}
}
};
let mut direct_coverage_targets = FxHashSet::default();
for gs in other_cov_gs.iter() {
for target in gs.get_outgoing_edges(n) {
direct_coverage_targets.insert(target?);
}
}
let inherited_gs_cov = graph.get_or_create_writable(inherited_cov_component)?;
for target in &covered_token {
if n != *target && !direct_coverage_targets.contains(target) {
inherited_gs_cov.add_edge(Edge {
source: n,
target: *target,
})?;
}
}
Ok(covered_token)
}
pub struct AQLUpdateGraphIndex {
cached_node_ids: DiskMap<String, NodeID>,
graph_without_nodes: bool,
invalid_nodes: DiskMap<NodeID, bool>,
text_coverage_components: FxHashSet<AnnotationComponent>,
}
impl AQLUpdateGraphIndex {
#[allow(
clippy::owned_cow,
reason = "We don't want to copy the node_name if not necessary and also have to look up &String"
)]
fn get_cached_node_id_from_name(
&mut self,
node_name: Cow<String>,
graph: &AnnotationGraph,
) -> std::result::Result<NodeID, ComponentTypeError> {
if let Some(id) = self.cached_node_ids.get(&node_name)? {
return Ok(*id);
} else if let Some(id) = graph.get_node_annos().get_node_id_from_name(&node_name)? {
self.cached_node_ids.insert(node_name.to_string(), id)?;
return Ok(id);
}
Err(ComponentTypeError(
GraphAnnisError::NoSuchNodeID(node_name.to_string()).into(),
))
}
fn calculate_invalidated_nodes_by_coverage(
&mut self,
graph: &AnnotationGraph,
node: NodeID,
) -> std::result::Result<(), ComponentTypeError> {
let containers: Vec<&dyn EdgeContainer> = self
.text_coverage_components
.iter()
.filter_map(|c| graph.get_graphstorage_as_ref(c))
.map(|gs| gs.as_edgecontainer())
.collect();
let union_container = UnionEdgeContainer::new(containers);
let dfs = CycleSafeDFS::new_inverse(&union_container, node, 0, usize::MAX);
for step in dfs {
let step = step?;
self.invalid_nodes.insert(step.node, true)?;
}
Ok(())
}
fn clear_left_right_token(
&self,
graph: &mut AnnotationGraph,
) -> std::result::Result<(), ComponentTypeError> {
let component_left = AnnotationComponent::new(
AnnotationComponentType::LeftToken,
ANNIS_NS.into(),
"".into(),
);
let component_right = AnnotationComponent::new(
AnnotationComponentType::RightToken,
ANNIS_NS.into(),
"".into(),
);
let component_cov = AnnotationComponent::new(
AnnotationComponentType::Coverage,
ANNIS_NS.into(),
"inherited-coverage".into(),
);
let gs_left = graph.get_or_create_writable(&component_left)?;
if self.graph_without_nodes {
gs_left.clear()?;
let gs_right = graph.get_or_create_writable(&component_right)?;
gs_right.clear()?;
let gs_cov = graph.get_or_create_writable(&component_cov)?;
gs_cov.clear()?;
} else {
for invalid in self.invalid_nodes.iter()? {
let (n, _) = invalid?;
gs_left.delete_node(n)?;
}
let gs_right = graph.get_or_create_writable(&component_right)?;
for invalid in self.invalid_nodes.iter()? {
let (n, _) = invalid?;
gs_right.delete_node(n)?;
}
let gs_cov = graph.get_or_create_writable(&component_cov)?;
for invalid in self.invalid_nodes.iter()? {
let (n, _) = invalid?;
gs_cov.delete_node(n)?;
}
}
Ok(())
}
fn reindex_inherited_coverage(
&self,
graph: &mut AnnotationGraph,
gs_order: Arc<dyn GraphStorage>,
) -> std::result::Result<(), ComponentTypeError> {
self.clear_left_right_token(graph)?;
let inherited_cov_component = AnnotationComponent::new(
AnnotationComponentType::Coverage,
ANNIS_NS.into(),
"inherited-coverage".into(),
);
let all_cov_components: Vec<_> = graph
.get_all_components(Some(AnnotationComponentType::Coverage), None)
.into_iter()
.filter(|c| c != &inherited_cov_component)
.collect();
let all_cov_gs: Vec<_> = all_cov_components
.iter()
.filter_map(|c| graph.get_graphstorage(c))
.collect();
let all_dom_components =
graph.get_all_components(Some(AnnotationComponentType::Dominance), None);
let all_text_coverage_components: Vec<AnnotationComponent> =
[all_cov_components, all_dom_components].concat();
for invalid in self.invalid_nodes.iter()? {
let (n, _) = invalid?;
let covered_token = calculate_inherited_coverage_edges(
graph,
n,
&all_cov_gs,
&all_text_coverage_components,
&inherited_cov_component,
)?;
self.calculate_token_alignment(
graph,
n,
AnnotationComponentType::LeftToken,
gs_order.as_ref(),
&covered_token,
)?;
self.calculate_token_alignment(
graph,
n,
AnnotationComponentType::RightToken,
gs_order.as_ref(),
&covered_token,
)?;
}
Ok(())
}
fn calculate_token_alignment(
&self,
graph: &mut AnnotationGraph,
n: NodeID,
ctype: AnnotationComponentType,
gs_order: &dyn GraphStorage,
covered_token: &FxHashSet<u64>,
) -> std::result::Result<Option<NodeID>, ComponentTypeError> {
let alignment_component =
AnnotationComponent::new(ctype.clone(), ANNIS_NS.into(), "".into());
if graph
.get_node_annos()
.get_value_for_item(&n, &TOKEN_KEY)?
.is_some()
&& covered_token.is_empty()
{
return Ok(Some(n));
}
if let Some(alignment_gs) = graph.get_graphstorage_as_ref(&alignment_component)
&& let Some(existing) = alignment_gs.get_outgoing_edges(n).next()
{
let existing = existing?;
return Ok(Some(existing));
}
let mut candidates: Vec<u64> = covered_token.iter().copied().collect();
quicksort::sort(&mut candidates, move |a, b| {
if *a == *b {
return Ok(std::cmp::Ordering::Equal);
}
if gs_order.is_connected(*a, *b, 1, std::ops::Bound::Unbounded)? {
return Ok(std::cmp::Ordering::Less);
} else if gs_order.is_connected(*b, *a, 1, std::ops::Bound::Unbounded)? {
return Ok(std::cmp::Ordering::Greater);
}
Ok(std::cmp::Ordering::Equal)
})?;
let t = if ctype == AnnotationComponentType::RightToken {
candidates.last()
} else {
candidates.first()
};
if let Some(t) = t {
let gs = graph.get_or_create_writable(&alignment_component)?;
let e = Edge {
source: n,
target: *t,
};
gs.add_edge(e)?;
Ok(Some(*t))
} else {
Ok(None)
}
}
}
impl ComponentType for AnnotationComponentType {
type UpdateGraphIndex = AQLUpdateGraphIndex;
type GlobalStatistics = AQLGlobalStatistics;
fn all_component_types() -> Vec<Self> {
AnnotationComponentType::iter().collect()
}
fn default_components() -> Vec<AnnotationComponent> {
vec![
AnnotationComponent::new(
AnnotationComponentType::Coverage,
ANNIS_NS.into(),
"".into(),
),
AnnotationComponent::new(
AnnotationComponentType::Ordering,
ANNIS_NS.into(),
"".into(),
),
AnnotationComponent::new(
AnnotationComponentType::LeftToken,
ANNIS_NS.into(),
"".into(),
),
AnnotationComponent::new(
AnnotationComponentType::RightToken,
ANNIS_NS.into(),
"".into(),
),
AnnotationComponent::new(AnnotationComponentType::PartOf, ANNIS_NS.into(), "".into()),
]
}
fn init_update_graph_index(
graph: &AnnotationGraph,
) -> std::result::Result<Self::UpdateGraphIndex, ComponentTypeError> {
let node_ids = DiskMap::new(
None,
EvictionStrategy::MaximumItems(1_000_000),
DEFAULT_BLOCK_CACHE_CAPACITY,
BtreeConfig::default().fixed_value_size(9),
)?;
let graph_without_nodes = graph.get_node_annos().is_empty()?;
let invalid_nodes: DiskMap<NodeID, bool> = DiskMap::new(
None,
EvictionStrategy::MaximumItems(1_000_000),
DEFAULT_BLOCK_CACHE_CAPACITY,
BtreeConfig::default().fixed_key_size(8).fixed_value_size(2),
)?;
let mut text_coverage_components = FxHashSet::default();
text_coverage_components
.extend(graph.get_all_components(Some(AnnotationComponentType::Dominance), Some("")));
text_coverage_components
.extend(graph.get_all_components(Some(AnnotationComponentType::Coverage), None));
Ok(AQLUpdateGraphIndex {
cached_node_ids: node_ids,
graph_without_nodes,
text_coverage_components,
invalid_nodes,
})
}
fn before_update_event(
update: &UpdateEvent,
graph: &AnnotationGraph,
index: &mut Self::UpdateGraphIndex,
) -> std::result::Result<(), ComponentTypeError> {
match update {
UpdateEvent::DeleteNode { node_name } if !index.graph_without_nodes => {
let existing_node_id =
index.get_cached_node_id_from_name(Cow::Borrowed(node_name), graph)?;
if !index.invalid_nodes.contains_key(&existing_node_id)? {
index.calculate_invalidated_nodes_by_coverage(graph, existing_node_id)?;
}
}
UpdateEvent::DeleteEdge {
source_node,
target_node,
component_type,
..
} => {
if !index.graph_without_nodes
&& let Ok(ctype) = AnnotationComponentType::from_str(component_type)
{
if ctype == AnnotationComponentType::Coverage
|| ctype == AnnotationComponentType::Dominance
|| ctype == AnnotationComponentType::Ordering
|| ctype == AnnotationComponentType::LeftToken
|| ctype == AnnotationComponentType::RightToken
{
let source = index
.get_cached_node_id_from_name(Cow::Borrowed(source_node), graph)?;
index.calculate_invalidated_nodes_by_coverage(graph, source)?;
}
if ctype == AnnotationComponentType::Ordering {
let target = index
.get_cached_node_id_from_name(Cow::Borrowed(target_node), graph)?;
index.calculate_invalidated_nodes_by_coverage(graph, target)?;
}
}
}
_ => {}
}
Ok(())
}
fn after_update_event(
update: UpdateEvent,
graph: &AnnotationGraph,
index: &mut Self::UpdateGraphIndex,
) -> std::result::Result<(), ComponentTypeError> {
if let UpdateEvent::AddEdge {
component_type,
component_name,
layer,
source_node,
target_node,
..
} = update
&& let Ok(ctype) = AnnotationComponentType::from_str(&component_type)
{
if (ctype == AnnotationComponentType::Dominance
|| ctype == AnnotationComponentType::Coverage)
&& component_name.is_empty()
{
let c = AnnotationComponent::new(ctype.clone(), layer, component_name);
index.text_coverage_components.insert(c);
}
if !index.graph_without_nodes {
if ctype == AnnotationComponentType::Coverage
|| ctype == AnnotationComponentType::Dominance
|| ctype == AnnotationComponentType::Ordering
|| ctype == AnnotationComponentType::LeftToken
|| ctype == AnnotationComponentType::RightToken
{
let source =
index.get_cached_node_id_from_name(Cow::Owned(source_node), graph)?;
index.calculate_invalidated_nodes_by_coverage(graph, source)?;
}
if ctype == AnnotationComponentType::Ordering {
let target =
index.get_cached_node_id_from_name(Cow::Owned(target_node), graph)?;
index.calculate_invalidated_nodes_by_coverage(graph, target)?;
}
}
}
Ok(())
}
fn apply_update_graph_index(
mut index: Self::UpdateGraphIndex,
graph: &mut AnnotationGraph,
) -> std::result::Result<(), ComponentTypeError> {
if index.graph_without_nodes {
let node_search = graph.get_node_annos().exact_anno_search(
Some(&NODE_TYPE_KEY.ns),
&NODE_TYPE_KEY.name,
ValueSearch::Any,
);
for m in node_search {
let m = m?;
index.invalid_nodes.insert(m.node, true)?;
}
}
index.invalid_nodes.compact()?;
let order_component = AnnotationComponent::new(
AnnotationComponentType::Ordering,
ANNIS_NS.into(),
"".into(),
);
let order_stats_exist = graph
.get_graphstorage(&order_component)
.map(|gs_order| gs_order.get_statistics().is_some())
.unwrap_or(false);
if !order_stats_exist {
graph.calculate_component_statistics(&order_component)?;
}
graph.optimize_gs_impl(&order_component)?;
if let Some(gs_order) = graph.get_graphstorage(&order_component) {
index.reindex_inherited_coverage(graph, gs_order)?;
}
Ok(())
}
fn update_graph_index_components(_graph: &Graph<Self>) -> Vec<Component<Self>> {
vec![
AnnotationComponent::new(
AnnotationComponentType::LeftToken,
ANNIS_NS.into(),
"".into(),
),
AnnotationComponent::new(
AnnotationComponentType::RightToken,
ANNIS_NS.into(),
"".into(),
),
AnnotationComponent::new(
AnnotationComponentType::Coverage,
ANNIS_NS.into(),
"inherited-coverage".into(),
),
]
}
fn calculate_global_statistics(
graph: &mut Graph<Self>,
) -> std::result::Result<(), ComponentTypeError> {
let default_ordering_component = Component::new(
AnnotationComponentType::Ordering,
ANNIS_NS.into(),
"".into(),
);
let token_helper = TokenHelper::new(graph)?;
let mut all_token_in_order_component = false;
let mut base_token_count = 0;
let mut token_count_by_ordering_component = HashMap::new();
if let Some(ordering_gs) = graph.get_graphstorage_as_ref(&default_ordering_component) {
all_token_in_order_component = true;
for m in graph.get_node_annos().exact_anno_search(
Some(&TOKEN_KEY.ns),
&TOKEN_KEY.name,
ValueSearch::Any,
) {
let n = m?.node;
if !token_helper.has_outgoing_coverage_edges(n)? {
all_token_in_order_component = all_token_in_order_component
&& ordering_gs.has_outgoing_edges(n)?
|| ordering_gs.has_ingoing_edges(n)?;
base_token_count += 1;
}
}
}
for ordering_component in
graph.get_all_components(Some(AnnotationComponentType::Ordering), None)
{
if !ordering_component.name.is_empty()
&& let Some(gs_stats) = graph
.get_graphstorage_as_ref(&ordering_component)
.and_then(|gs| gs.get_statistics())
{
token_count_by_ordering_component.insert(ordering_component, gs_stats.nodes as u64);
}
}
graph.global_statistics = Some(AQLGlobalStatistics {
all_token_in_order_component,
corpus_size: CorpusSize::Token {
base_token_count,
segmentation_count: token_count_by_ordering_component
.into_iter()
.map(|(k, v)| (k.name, v))
.collect(),
},
});
Ok(())
}
}
impl fmt::Display for AnnotationComponentType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fmt::Debug::fmt(self, f)
}
}
#[cfg(test)]
mod tests;