use anyhow::Result;
use graphannis::graph::Match;
use graphannis_core::annostorage::NodeAnnotationStorage;
use graphannis_core::graph::NODE_NAME_KEY;
use graphannis_core::{graph::storage::GraphStorage, types::NodeID};
use lru::LruCache;
use nonzero::nonzero;
use std::borrow::Cow;
use std::cmp::Ordering;
use std::sync::Arc;
use super::token_helper::TokenHelper;
pub(crate) struct SortCache {
node_name: LruCache<NodeID, String>,
left_token: LruCache<NodeID, Option<NodeID>>,
is_connected: LruCache<(NodeID, NodeID), bool>,
gs_order: Arc<dyn GraphStorage>,
}
impl SortCache {
pub fn new(gs_order: Arc<dyn GraphStorage>) -> Self {
Self {
node_name: LruCache::new(nonzero!(1000usize)),
left_token: LruCache::new(nonzero!(1000usize)),
is_connected: LruCache::new(nonzero!(1000usize)),
gs_order,
}
}
pub fn compare_matchgroup_by_text_pos(
&mut self,
m1: &[Match],
m2: &[Match],
node_annos: &dyn NodeAnnotationStorage,
token_helper: &TokenHelper,
) -> Result<Ordering> {
for i in 0..std::cmp::min(m1.len(), m2.len()) {
let element_cmp =
self.compare_match_by_text_pos(&m1[i], &m2[i], node_annos, token_helper)?;
if element_cmp != Ordering::Equal {
return Ok(element_cmp);
}
}
Ok(m2.len().cmp(&m1.len()))
}
pub fn compare_match_by_text_pos(
&mut self,
m1: &Match,
m2: &Match,
node_annos: &dyn NodeAnnotationStorage,
token_helper: &TokenHelper,
) -> Result<Ordering> {
if m1.node == m2.node {
Ok(m1.anno_key.cmp(&m2.anno_key))
} else {
let m1_anno_val = if let Some(val) = self.node_name.get(&m1.node) {
Some(Cow::Owned(val.clone()))
} else {
let val = node_annos.get_value_for_item(&m1.node, &NODE_NAME_KEY)?;
if let Some(val) = &val {
self.node_name.put(m1.node, val.to_string());
}
val
};
let m2_anno_val = if let Some(val) = self.node_name.get(&m2.node) {
Some(Cow::Borrowed(val.as_str()))
} else {
let val = node_annos.get_value_for_item(&m2.node, &NODE_NAME_KEY)?;
if let Some(val) = &val {
self.node_name.put(m2.node, val.to_string());
}
val
};
if let Some(m1_anno_val) = m1_anno_val
&& let Some(m2_anno_val) = m2_anno_val
{
let (m1_path, m1_name) = split_path_and_nodename(&m1_anno_val);
let (m2_path, m2_name) = split_path_and_nodename(&m2_anno_val);
let path_cmp = compare_document_path(m1_path, m2_path);
if path_cmp != Ordering::Equal {
return Ok(path_cmp);
}
let m1_lefttok = if let Some(lefttok) = self.left_token.get(&m1.node).copied() {
lefttok
} else {
let result = token_helper.left_token_for(m1.node)?;
self.left_token.put(m1.node, result);
result
};
let m2_lefttok = if let Some(lefttok) = self.left_token.get(&m2.node).copied() {
lefttok
} else {
let result = token_helper.left_token_for(m2.node)?;
self.left_token.put(m2.node, result);
result
};
if let Some(m1_lefttok) = m1_lefttok
&& let Some(m2_lefttok) = m2_lefttok
{
let token_are_connected =
if let Some(v) = self.is_connected.get(&(m1_lefttok, m2_lefttok)) {
*v
} else {
self.gs_order.is_connected(
m1_lefttok,
m2_lefttok,
1,
std::ops::Bound::Unbounded,
)?
};
if token_are_connected {
return Ok(Ordering::Less);
} else if self.gs_order.is_connected(
m2_lefttok,
m1_lefttok,
1,
std::ops::Bound::Unbounded,
)? {
return Ok(Ordering::Greater);
}
}
let name_cmp = m1_name.cmp(m2_name);
if name_cmp != Ordering::Equal {
return Ok(name_cmp);
}
}
Ok(m1.node.cmp(&m2.node))
}
}
}
fn split_path_and_nodename(full_node_name: &str) -> (&str, &str) {
full_node_name
.rsplit_once('#')
.unwrap_or((full_node_name, ""))
}
fn compare_document_path(p1: &str, p2: &str) -> std::cmp::Ordering {
let it1 = p1.split('/').filter(|s| !s.is_empty());
let it2 = p2.split('/').filter(|s| !s.is_empty());
for (part1, part2) in it1.zip(it2) {
let string_cmp = part1.cmp(part2);
if string_cmp != std::cmp::Ordering::Equal {
return string_cmp;
}
}
let length1 = p1.split('/').filter(|s| !s.is_empty()).count();
let length2 = p2.split('/').filter(|s| !s.is_empty()).count();
length1.cmp(&length2)
}
#[cfg(test)]
mod tests {
use std::{io::BufReader, path::Path};
use graphannis::model::{AnnotationComponent, AnnotationComponentType};
use graphannis_core::graph::{ANNIS_NS, NODE_TYPE_KEY, serialization::graphml};
use super::*;
#[test]
fn tiger_doc_name_sort() {
let p1 = "tiger2/tiger2/tiger_release_dec05_110";
let p2 = "tiger2/tiger2/tiger_release_dec05_1_1";
assert_eq!(std::cmp::Ordering::Less, compare_document_path(p1, p2));
}
#[test]
fn compare_match_for_example_graph() {
let input_file = std::fs::File::open(Path::new(
"tests/data/import/graphml/single_sentence.graphml",
))
.unwrap();
let input_file = BufReader::new(input_file);
let (graph, _) =
graphml::import::<AnnotationComponentType, _, _>(input_file, false, |_| {}).unwrap();
let gs_order = graph
.get_graphstorage(&AnnotationComponent::new(
AnnotationComponentType::Ordering,
ANNIS_NS.into(),
"".into(),
))
.unwrap();
let token_helper = TokenHelper::new(&graph).unwrap();
let t3_id = graph
.get_node_annos()
.get_node_id_from_name("single_sentence/zossen#t3")
.unwrap()
.unwrap();
let t5_id = graph
.get_node_annos()
.get_node_id_from_name("single_sentence/zossen#t5")
.unwrap()
.unwrap();
let mut sort_cache = SortCache::new(gs_order);
let match_t3 = Match {
node: t3_id,
anno_key: NODE_TYPE_KEY.clone(),
};
assert_eq!(
Ordering::Equal,
sort_cache
.compare_match_by_text_pos(
&match_t3,
&match_t3,
graph.get_node_annos(),
&token_helper
)
.unwrap()
);
let match_t5 = Match {
node: t5_id,
anno_key: NODE_TYPE_KEY.clone(),
};
assert_eq!(
Ordering::Less,
sort_cache
.compare_match_by_text_pos(
&match_t3,
&match_t5,
graph.get_node_annos(),
&token_helper
)
.unwrap()
);
assert_eq!(
Ordering::Greater,
sort_cache
.compare_match_by_text_pos(
&match_t5,
&match_t3,
graph.get_node_annos(),
&token_helper
)
.unwrap()
);
}
}