use std::{
collections::{BTreeMap, BTreeSet},
fs,
path::{Path, PathBuf},
sync::mpsc,
};
use super::Manipulator;
use crate::{
StepID,
progress::ProgressReporter,
util::{
CorpusGraphHelper,
sort_matches::SortCache,
token_helper::{TOKEN_KEY, TokenHelper},
update_graph, update_graph_silent,
},
};
use anyhow::{Context, anyhow};
use facet::Facet;
use graphannis::{
AnnotationGraph,
graph::{AnnoKey, EdgeContainer, Match},
model::{AnnotationComponent, AnnotationComponentType},
update::{GraphUpdate, UpdateEvent},
};
use graphannis_core::graph::{ANNIS_NS, DEFAULT_NS, NODE_NAME_KEY, NODE_TYPE_KEY};
use itertools::Itertools;
use regex::Regex;
use serde::Serialize;
use serde_derive::Deserialize;
/// Manipulator that adds annotations (and optionally deletes matched ones)
/// according to mapping rules.
///
/// Rules can be read from an external TOML `rule_file`, given inline as
/// `mapping`, or both; when both are set, the rules from the file are applied
/// first and the inline rules afterwards.
#[derive(Facet, Default, Deserialize, Serialize, Clone, PartialEq)]
#[serde(deny_unknown_fields)]
pub struct MapAnnos {
/// Path to a TOML file containing the mapping rules. A relative path is
/// resolved against the workflow directory.
#[serde(default)]
rule_file: Option<PathBuf>,
/// Mapping rules defined inline in the configuration of this step.
#[serde(default)]
mapping: Option<Mapping>,
/// When `true`, the mapper reports every run of the rule set and how often
/// each rule's query matched.
#[serde(default)]
debug: bool,
}
impl Manipulator for MapAnnos {
    /// Applies the mapping rules of this step to `graph`.
    ///
    /// Rules from `rule_file` are applied first, inline `mapping` rules
    /// second. A relative `rule_file` is resolved against
    /// `workflow_directory`. If neither source of rules is configured, a
    /// warning is reported and no mapper is run.
    ///
    /// # Errors
    /// Fails if the rule file cannot be read or parsed, if the graph storages
    /// cannot be loaded, or if applying a rule set fails.
    fn manipulate_corpus(
        &self,
        graph: &mut graphannis::AnnotationGraph,
        workflow_directory: &std::path::Path,
        step_id: StepID,
        tx: Option<crate::workflow::StatusSender>,
    ) -> Result<(), Box<dyn std::error::Error>> {
        let progress = ProgressReporter::new_unknown_total_work(tx.clone(), step_id.clone())?;

        // The rule file is read before any graph storage is loaded, so a
        // broken file fails the step early. `Path::join` returns the joined
        // path for a relative argument and the argument itself when it is
        // absolute.
        let config_from_file = match self.rule_file.as_deref() {
            Some(rule_path) => Some(read_config(&workflow_directory.join(rule_path))?),
            None => None,
        };

        progress.info("Ensure all graph storages are loaded.")?;
        graph.ensure_loaded_all()?;

        let has_any_rules = self.mapping.is_some() || self.rule_file.is_some();
        if !has_any_rules {
            progress.warn("Neither a rule file was provided nor are there any inline mapping definitions. This step will thus not modify the annotation graph.")?;
        }

        if let Some(config) = config_from_file {
            progress.info("Starting application of rules from rule file ...")?;
            // The reporter is only handed to the mapper in debug mode.
            let reporter = if self.debug { Some(progress) } else { None };
            let mut file_mapper = MapperImpl {
                config,
                added_spans: 0,
                progress: reporter,
            };
            file_mapper.run(graph)?;
        }

        if let Some(inline_config) = self.mapping.as_ref() {
            let inline_progress = ProgressReporter::new_unknown_total_work(tx, step_id)?;
            inline_progress.info("Starting application of inline rules ...")?;
            let reporter = if self.debug {
                Some(inline_progress)
            } else {
                None
            };
            let mut inline_mapper = MapperImpl {
                config: inline_config.clone(),
                added_spans: 0,
                progress: reporter,
            };
            inline_mapper.run(graph)?;
        }

        Ok(())
    }

    /// Rules are AQL queries, so this step asks for graph statistics.
    fn requires_statistics(&self) -> bool {
        true
    }
}
fn read_config(path: &Path) -> Result<Mapping, Box<dyn std::error::Error>> {
let config_string = fs::read_to_string(path)?;
let m: Mapping = toml::from_str(config_string.as_str())?;
Ok(m)
}
/// Controls how often the complete rule set of a mapping is applied.
#[derive(Facet, Debug, Deserialize, Clone, Serialize, PartialEq)]
#[repr(u8)]
enum RepetitionMode {
/// Apply the rule set exactly `n` times.
Fixed { n: usize },
/// Apply the rule set repeatedly until a run generates no graph updates.
UntilUnchanged,
}
impl Default for RepetitionMode {
fn default() -> Self {
Self::Fixed { n: 1 }
}
}
/// A rule set together with its repetition mode, as read from a rule file or
/// from the inline `mapping` configuration.
#[derive(Facet, Deserialize, Debug, Clone, Serialize, PartialEq)]
#[serde(deny_unknown_fields)]
struct Mapping {
/// The rules, applied in the given order in every run.
rules: Vec<Rule>,
/// How often the rule set is applied; defaults to a single run.
#[serde(default)]
repetition: RepetitionMode,
}
/// Reference to one or several nodes of a query match.
///
/// Indices are 1-based positions in the match group. Because the enum is
/// untagged, a single integer deserializes as `Node` and a list of integers
/// as `Span`.
#[derive(Facet, Clone, Deserialize, Debug, Serialize, PartialEq)]
#[serde(untagged)]
#[repr(u8)]
enum TargetRef {
/// A single query node.
Node(usize),
/// Several query nodes.
Span(Vec<usize>),
}
impl TargetRef {
    /// Collects the annotation values of the referenced match nodes and
    /// joins them with `sep`.
    ///
    /// For each referenced index (1-based) the value of the matched
    /// annotation is looked up in `graph`; a match on the node type key is
    /// resolved through the token key instead. Missing values count as empty
    /// strings. The separator is only inserted once the result is non-empty.
    ///
    /// # Errors
    /// Fails if a referenced index is `0` or exceeds the size of the match
    /// group `mg`, or if the annotation storage lookup fails.
    fn resolve_value(
        &self,
        graph: &AnnotationGraph,
        mg: &[Match],
        sep: &str,
    ) -> anyhow::Result<String> {
        // Borrow the indices instead of cloning them into a new vector.
        let targets: &[usize] = match self {
            TargetRef::Node(n) => std::slice::from_ref(n),
            TargetRef::Span(t) => t.as_slice(),
        };
        let mut result = String::new();
        for &target_node in targets {
            // `checked_sub` turns the invalid index 0 into a regular error
            // instead of an arithmetic underflow.
            let m = target_node
                .checked_sub(1)
                .and_then(|index| mg.get(index))
                .with_context(|| format!("target {target_node} does not exist in result"))?;
            let anno_key = if m.anno_key.as_ref() == NODE_TYPE_KEY.as_ref() {
                TOKEN_KEY.clone()
            } else {
                m.anno_key.clone()
            };
            let orig_val = graph
                .get_node_annos()
                .get_value_for_item(&m.node, &anno_key)?
                .unwrap_or_default();
            if !result.is_empty() {
                result.push_str(sep);
            }
            result.push_str(&orig_val);
        }
        Ok(result)
    }
}
/// Describes how the value of the new annotation is computed.
#[derive(Facet, Clone, Deserialize, Debug, Serialize, PartialEq)]
#[serde(untagged)]
#[repr(u8)]
enum Value {
/// Use the given string as it is.
Fixed(String),
/// Copy the matched annotation value(s) of the referenced query node(s).
Copy {
/// The query node(s) to copy the value from.
copy: TargetRef,
/// Separator put between the values of several nodes; defaults to a
/// single space.
#[serde(default = "default_value_delimiter")]
delimiter: String,
},
/// Copy the value(s) of the referenced node(s) and then apply regular
/// expression replacements.
Replace {
/// The query node(s) to take the original value from.
target: TargetRef,
/// Pairs of (regular expression, replacement), applied in the given
/// order; each pair operates on the result of the previous one.
replacements: Vec<(String, String)>,
/// Separator put between the values of several nodes; defaults to a
/// single space.
#[serde(default = "default_value_delimiter")]
delimiter: String,
},
/// Map the value(s) of the referenced node(s) to a number: equal values
/// get the same number, a value not seen before gets the number of values
/// seen so far.
Factorize {
/// The query node(s) whose values (joined without separator) are
/// factorized.
factorize: TargetRef,
},
}
/// Default separator between the values of several target nodes: one space.
fn default_value_delimiter() -> String {
    String::from(" ")
}
/// A single mapping rule: where to look, what to annotate and with which
/// value.
#[derive(Facet, Clone, Debug, Deserialize, Serialize, PartialEq)]
#[serde(deny_unknown_fields)]
struct Rule {
/// The AQL query whose matches trigger this rule. It is checked while
/// deserializing.
#[serde(deserialize_with = "crate::estarde::query::deserialize_and_check")]
query: String,
/// The query node(s) (1-based) that receive the annotation. A single index
/// annotates the matched node itself; a list of indices creates a new span
/// node covering the tokens of all referenced nodes.
target: TargetRef,
/// Namespace and name of the annotation to create.
#[serde(with = "crate::estarde::anno_key")]
anno: AnnoKey,
/// How the annotation value is computed.
value: Value,
/// Query nodes (1-based) whose matched annotation is deleted.
#[serde(default)]
delete: Vec<usize>,
}
impl Rule {
    /// Computes the annotation value this rule produces for the match group
    /// `mg`.
    ///
    /// `factors` is the lookup table used by `Value::Factorize`: it maps
    /// every value seen so far to its number and is extended when a new
    /// value shows up.
    ///
    /// # Errors
    /// Fails if a referenced match node cannot be resolved or if a
    /// replacement pattern is not a valid regular expression.
    fn resolve_value(
        &self,
        graph: &AnnotationGraph,
        mg: &[Match],
        factors: &mut BTreeMap<String, usize>,
    ) -> anyhow::Result<String> {
        let resolved = match &self.value {
            Value::Fixed(literal) => literal.clone(),
            Value::Copy { copy, delimiter } => copy.resolve_value(graph, mg, delimiter)?,
            Value::Replace {
                target,
                replacements,
                delimiter,
            } => {
                let original = target.resolve_value(graph, mg, delimiter)?;
                // Every replacement works on the output of the previous one.
                replacements.iter().try_fold(
                    original,
                    |current, (pattern, substitute)| -> anyhow::Result<String> {
                        let regex = Regex::new(pattern)?;
                        Ok(regex.replace_all(&current, substitute).into_owned())
                    },
                )?
            }
            Value::Factorize { factorize } => {
                let key = factorize.resolve_value(graph, mg, "")?;
                // A value not seen before gets the number of values seen so far.
                let next_factor = factors.len();
                factors.entry(key).or_insert(next_factor).to_string()
            }
        };
        Ok(resolved)
    }
}
/// Applies one [`Mapping`] to an annotation graph.
struct MapperImpl {
// The rule set and its repetition mode.
config: Mapping,
// Number of span nodes created so far; used as suffix of new node names.
added_spans: usize,
// Reporter for debug messages; `None` disables them.
progress: Option<ProgressReporter>,
}
impl MapperImpl {
/// Applies the rule set as often as the configured repetition mode demands.
///
/// Graph statistics are recalculated between two consecutive runs, but not
/// after the last one.
fn run(&mut self, graph: &mut AnnotationGraph) -> anyhow::Result<()> {
    match self.config.repetition {
        RepetitionMode::Fixed { n } => {
            for run in 1..=n {
                if let Some(reporter) = self.progress.as_ref() {
                    reporter.info(format!(
                        "Applying rule set of `map` module run {}/{n}",
                        run
                    ))?;
                }
                self.apply_ruleset(graph)?;
                let another_run_follows = run < n;
                if another_run_follows {
                    graph.calculate_all_statistics()?;
                }
            }
        }
        RepetitionMode::UntilUnchanged => {
            let mut run_nr = 1;
            loop {
                if let Some(reporter) = self.progress.as_ref() {
                    reporter.info(format!("Applying rule set of `map` module run {run_nr}"))?;
                }
                let new_update_size = self.apply_ruleset(graph)?;
                // A run without updates means the rules have nothing left to change.
                if new_update_size == 0 {
                    break;
                }
                if let Some(reporter) = self.progress.as_ref() {
                    reporter.info(format!("Added {new_update_size} updates because of rules, repeating to apply all rules until no updates are generated."))?;
                }
                run_nr += 1;
                graph.calculate_all_statistics()?;
            }
        }
    }
    Ok(())
}
/// Applies every rule once and returns the number of generated update events.
///
/// All events are collected in a single `GraphUpdate` that is applied only
/// after the last rule was processed, so the queries of all rules in one run
/// are executed on the graph as it was before this run.
///
/// # Errors
/// Fails if a query cannot be parsed or executed, if a value cannot be
/// resolved, if the default ordering component is missing for a factorize
/// rule, or if the graph update cannot be applied.
fn apply_ruleset(&mut self, graph: &mut AnnotationGraph) -> anyhow::Result<usize> {
let mut updates = GraphUpdate::default();
// The rules are cloned because `map_span` needs `&mut self` while iterating.
for rule in self.config.rules.clone() {
let query = graphannis::aql::parse(&rule.query, false)
.with_context(|| format!("could not parse query '{}'", &rule.query))?;
let result_it = graphannis::aql::execute_query_on_graph(graph, &query, true, None)?;
// Number of matches of this rule, only used for the debug message.
let mut n = 0;
// The factor table starts empty for every rule.
let mut factors = BTreeMap::default();
// Factorize rules number the values in order of first occurrence, so
// their matches are sorted by text position first.
let results = if matches!(rule.value, Value::Factorize { .. }) {
let gs_order = graph
.get_graphstorage(&AnnotationComponent::new(
AnnotationComponentType::Ordering,
ANNIS_NS.to_string(),
"".to_string(),
))
.ok_or(anyhow!("No default ordering found."))?;
let mut sort_cache = SortCache::new(gs_order);
let token_helper = TokenHelper::new(graph)?;
Box::new(result_it.sorted_by(|a, b| {
// Failed matches and failed comparisons are treated as equal here; a
// failed match is still reported by the `?` in the loop below.
if let Ok(a) = a
&& let Ok(b) = b
{
sort_cache
.compare_matchgroup_by_text_pos(
a,
b,
graph.get_node_annos(),
&token_helper,
)
.unwrap_or(std::cmp::Ordering::Equal)
} else {
std::cmp::Ordering::Equal
}
}))
} else {
result_it
};
for match_group in results {
let match_group = match_group?;
match rule.target {
TargetRef::Node(target) => {
self.map_single_node(
&rule,
target,
&match_group,
&mut factors,
graph,
&mut updates,
)?;
}
TargetRef::Span(ref all_targets) => {
self.map_span(
&rule,
all_targets,
&match_group,
&mut factors,
graph,
&mut updates,
)?;
}
}
// Deletions are queued after the new annotation of the same match.
self.delete_existing_annotations(&rule, &match_group, graph, &mut updates)?;
n += 1;
}
if let Some(p) = &self.progress {
p.info(format!(
"Rule with query `{}` matched {n} time(s).",
&rule.query
))?;
}
}
let number_of_updates = updates.len()?;
if number_of_updates > 0 {
// In debug mode the update is applied with a status sender.
// NOTE(review): the receiving end of this channel is never read here, so
// the status messages presumably go nowhere — confirm this is intended.
let tx_rx = if self.progress.is_some() {
Some(mpsc::channel())
} else {
None
};
if let Some((sender, _receiver)) = tx_rx {
update_graph(
graph,
&mut updates,
Some(StepID {
module_name: "map".to_string(),
path: None,
}),
Some(sender),
)?;
} else {
update_graph_silent(graph, &mut updates)?;
}
}
Ok(number_of_updates)
}
/// Queues the annotation of `rule` for the matched node at position `target`
/// (1-based) of `match_group`.
///
/// Nothing is queued if `target` is `0` or lies beyond the match group.
///
/// # Errors
/// Fails if the matched node has no node name, if the value cannot be
/// resolved, or if the event cannot be added to `update`.
fn map_single_node(
    &self,
    rule: &Rule,
    target: usize,
    match_group: &[Match],
    factors: &mut BTreeMap<String, usize>,
    graph: &AnnotationGraph,
    update: &mut GraphUpdate,
) -> anyhow::Result<()> {
    // `checked_sub` keeps an index of 0 from underflowing; it is handled
    // like any other index without a matching node.
    let Some(m) = target
        .checked_sub(1)
        .and_then(|index| match_group.get(index))
    else {
        return Ok(());
    };
    let match_node_name = graph
        .get_node_annos()
        .get_value_for_item(&m.node, &NODE_NAME_KEY)?
        .context("Missing node name for matched node")?;
    update.add_event(UpdateEvent::AddNodeLabel {
        node_name: match_node_name.to_string(),
        anno_ns: rule.anno.ns.to_string(),
        anno_name: rule.anno.name.to_string(),
        anno_value: rule.resolve_value(graph, match_group, factors)?,
    })?;
    Ok(())
}
/// Queues a new span node that carries the annotation of `rule` and covers
/// the tokens of all matched nodes referenced by `targets` (1-based).
///
/// The new node is named after the first target node, gets a `PartOf` edge
/// to the first node returned by the corpus graph helper for that node (if
/// any), and a `Coverage` edge to every covered token. Nothing is queued if
/// the first target is `0` or lies beyond the match group; other targets
/// without a matching node are skipped.
///
/// # Errors
/// Fails if token or node-name lookups fail, if the value cannot be
/// resolved, or if an event cannot be added to `update`.
fn map_span(
    &mut self,
    rule: &Rule,
    targets: &[usize],
    match_group: &[Match],
    factors: &mut BTreeMap<String, usize>,
    graph: &AnnotationGraph,
    update: &mut GraphUpdate,
) -> anyhow::Result<()> {
    // Looks up a 1-based target without underflowing on the invalid index 0.
    fn matched_at(match_group: &[Match], target: usize) -> Option<&Match> {
        target
            .checked_sub(1)
            .and_then(|index| match_group.get(index))
    }

    let tok_helper = TokenHelper::new(graph)?;
    let corpusgraph_helper = CorpusGraphHelper::new(graph);
    let Some(first_match) = targets
        .first()
        .and_then(|t| matched_at(match_group, *t))
    else {
        return Ok(());
    };

    // Tokens count themselves; any other node contributes its covered tokens.
    let mut covered_token = BTreeSet::new();
    for n in targets.iter().filter_map(|t| matched_at(match_group, *t)) {
        if tok_helper.is_token(n.node)? {
            covered_token.insert(n.node);
        } else {
            covered_token.extend(tok_helper.covered_token(n.node)?);
        }
    }

    let first_node_name = graph
        .get_node_annos()
        .get_value_for_item(&first_match.node, &NODE_NAME_KEY)?
        .context("Missing node name")?;
    // The running counter keeps the names of all spans of one mapper distinct.
    let new_node_name = format!("{first_node_name}_map_{}", self.added_spans);
    self.added_spans += 1;
    update.add_event(UpdateEvent::AddNode {
        node_name: new_node_name.clone(),
        node_type: "node".to_string(),
    })?;
    update.add_event(UpdateEvent::AddNodeLabel {
        node_name: new_node_name.clone(),
        anno_ns: rule.anno.ns.to_string(),
        anno_name: rule.anno.name.to_string(),
        anno_value: rule.resolve_value(graph, match_group, factors)?,
    })?;

    if let Some(parent_node) = corpusgraph_helper
        .get_outgoing_edges(first_match.node)
        .next()
    {
        let parent_node = parent_node?;
        let parent_node_name = graph
            .get_node_annos()
            .get_value_for_item(&parent_node, &NODE_NAME_KEY)?
            .context("Missing node name for parent node")?;
        update.add_event(UpdateEvent::AddEdge {
            source_node: new_node_name.clone(),
            target_node: parent_node_name.to_string(),
            layer: ANNIS_NS.to_string(),
            component_type: AnnotationComponentType::PartOf.to_string(),
            component_name: "".to_string(),
        })?;
    }

    for t in covered_token {
        let token_node_name = graph
            .get_node_annos()
            .get_value_for_item(&t, &NODE_NAME_KEY)?
            .context("Missing node name for covered token")?;
        update.add_event(UpdateEvent::AddEdge {
            source_node: new_node_name.clone(),
            target_node: token_node_name.to_string(),
            layer: DEFAULT_NS.to_string(),
            component_type: AnnotationComponentType::Coverage.to_string(),
            component_name: "".to_string(),
        })?;
    }
    Ok(())
}
/// Queues the deletion of the matched annotation for every query node listed
/// in `rule.delete` (1-based).
///
/// Indices that are `0` or lie beyond the match group are skipped.
///
/// # Errors
/// Fails if a matched node has no node name or if the event cannot be added
/// to `update`.
fn delete_existing_annotations(
    &mut self,
    rule: &Rule,
    match_group: &[Match],
    graph: &AnnotationGraph,
    update: &mut GraphUpdate,
) -> anyhow::Result<()> {
    for query_index in &rule.delete {
        // `checked_sub` keeps an index of 0 from underflowing.
        let Some(m) = query_index
            .checked_sub(1)
            .and_then(|index| match_group.get(index))
        else {
            continue;
        };
        let delete_from_node = graph
            .get_node_annos()
            .get_value_for_item(&m.node, &NODE_NAME_KEY)?
            .ok_or_else(|| anyhow!("Node has no node name."))?;
        update.add_event(UpdateEvent::DeleteNodeLabel {
            node_name: delete_from_node.to_string(),
            anno_ns: m.anno_key.ns.to_string(),
            anno_name: m.anno_key.name.to_string(),
        })?;
    }
    Ok(())
}
}
#[cfg(test)]
mod tests {
use std::sync::mpsc;
use graphannis::{
AnnotationGraph,
aql::{self, execute_query_on_graph},
model::AnnotationComponentType,
update::{GraphUpdate, UpdateEvent},
};
use graphannis_core::{annostorage::ValueSearch, graph::ANNIS_NS};
use insta::assert_snapshot;
use itertools::Itertools;
use pretty_assertions::assert_eq;
use tempfile::NamedTempFile;
use tests::test_util::export_to_string;
use crate::{
StepID,
exporter::graphml::GraphMLExporter,
importer::{Importer, treetagger::ImportTreeTagger},
manipulator::Manipulator,
test_util,
util::example_generator,
};
use super::*;
// The default configuration must serialize to TOML; the output is compared
// with the stored snapshot.
#[test]
fn serialize() {
let module = MapAnnos::default();
let serialization = toml::to_string(&module);
assert!(
serialization.is_ok(),
"Serialization failed: {:?}",
serialization.err()
);
assert_snapshot!(serialization.unwrap());
}
// A configuration with rule file, one inline rule, non-default repetition and
// debug mode must serialize to TOML; the output is compared with the stored
// snapshot.
#[test]
fn serialize_custom() {
let module = MapAnnos {
rule_file: Some(PathBuf::from("external/file.toml")),
mapping: Some(Mapping {
rules: vec![Rule {
query: "pos=/NN/".to_string(),
target: TargetRef::Span(vec![1]),
anno: AnnoKey {
name: "upos".into(),
ns: "ud".into(),
},
value: Value::Fixed("NOUN".to_string()),
delete: vec![1],
}],
repetition: RepetitionMode::UntilUnchanged,
}),
debug: true,
};
let serialization = toml::to_string(&module);
assert!(
serialization.is_ok(),
"Serialization failed: {:?}",
serialization.err()
);
assert_snapshot!(serialization.unwrap());
}
// A rule whose query is rejected by the query check must fail to
// deserialize; the error message is compared with the stored snapshot.
#[test]
fn fail_deserialization_with_bad_query() {
let rule: Result<Rule, _> = toml::from_str(
r#"
query = "annis:tok @* doc"
target = 1
anno = "new::anno"
value = "new_value"
"#,
);
assert!(rule.is_err());
assert_snapshot!(rule.err().unwrap());
}
/// Validating the graph for this module must leave global statistics behind.
#[test]
fn graph_statistics() {
    let created = AnnotationGraph::with_default_graphstorages(false);
    assert!(created.is_ok());
    let mut graph = created.unwrap();

    let mut corpus_update = GraphUpdate::default();
    example_generator::create_corpus_structure_simple(&mut corpus_update);
    assert!(update_graph_silent(&mut graph, &mut corpus_update).is_ok());

    // The rule file is never read by the validation.
    let module = MapAnnos {
        rule_file: Some(PathBuf::from("./any_file.toml")),
        mapping: None,
        debug: false,
    };
    let step_id = StepID {
        module_name: "test".to_string(),
        path: None,
    };
    let validation = module.validate_graph(&mut graph, step_id, None);
    assert!(validation.is_ok());
    assert!(graph.global_statistics.is_some());
}
/// Inline rules must deserialize, run, and resolve the copied token value.
#[test]
fn inline_rules() {
    let mut corpus_update = GraphUpdate::new();
    example_generator::create_corpus_structure_simple(&mut corpus_update);
    example_generator::create_tokens(&mut corpus_update, Some("root/doc1"));
    let mut graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
    graph.apply_update(&mut corpus_update, |_msg| {}).unwrap();

    let config = r#"
[[mapping.rules]]
query = "tok"
target = 1
anno = "test_ns::test"
value = {copy = 1}
"#;
    let parsed: Result<MapAnnos, _> = toml::from_str(config);
    assert!(
        parsed.is_ok(),
        "Error deserializing mapper: {:?}",
        parsed.err()
    );
    let mapper = parsed.unwrap();

    let step_id = StepID {
        module_name: "test_map_inline".to_string(),
        path: None,
    };
    let outcome = mapper.manipulate_corpus(&mut graph, Path::new("./"), step_id, None);
    assert!(outcome.is_ok());

    let tok_match = graph
        .get_node_annos()
        .exact_anno_search(Some("annis"), "tok", ValueSearch::Some("complicated"))
        .next()
        .unwrap()
        .unwrap();
    let mapping = mapper.mapping.unwrap();
    let resolved = mapping.rules[0]
        .resolve_value(&graph, &[tok_match], &mut BTreeMap::default())
        .unwrap();
    assert_eq!("complicated", resolved);
}
// Applies one rule that copies the matched `b` annotation to `c` and deletes
// the matched annotation; the GraphML export of the result is compared with
// the stored snapshot.
#[test]
fn test_delete() {
let mut updates = GraphUpdate::new();
example_generator::create_corpus_structure_simple(&mut updates);
example_generator::create_multiple_segmentations(&mut updates, "root/doc1");
let mut g = AnnotationGraph::with_default_graphstorages(true).unwrap();
g.apply_update(&mut updates, |_msg| {}).unwrap();
let map_with_deletion = Rule {
query: "b".to_string(),
target: super::TargetRef::Node(1),
anno: AnnoKey {
name: "c".into(),
ns: "".into(),
},
value: Value::Copy {
copy: TargetRef::Node(1),
delimiter: default_value_delimiter(),
},
delete: vec![1],
};
let mapping = Mapping {
repetition: super::RepetitionMode::Fixed { n: 1 },
rules: vec![map_with_deletion],
};
let mut mapper = super::MapperImpl {
config: mapping,
added_spans: 0,
progress: None,
};
assert!(mapper.apply_ruleset(&mut g).is_ok());
let actual = export_to_string(&g, GraphMLExporter::default());
assert!(actual.is_ok());
assert_snapshot!(actual.unwrap());
}
/// A fixed value is returned as it is, even for an empty match group.
#[test]
fn test_resolve_value_fixed() {
    let mut corpus_update = GraphUpdate::new();
    example_generator::create_corpus_structure_simple(&mut corpus_update);
    example_generator::create_tokens(&mut corpus_update, Some("root/doc1"));
    let mut graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
    graph.apply_update(&mut corpus_update, |_msg| {}).unwrap();

    let rule = Rule {
        query: "tok".to_string(),
        target: super::TargetRef::Node(1),
        anno: AnnoKey {
            ns: "test_ns".into(),
            name: "test".into(),
        },
        value: Value::Fixed("myvalue".to_string()),
        delete: vec![],
    };
    let mut factors = BTreeMap::default();
    let resolved = rule.resolve_value(&graph, &[], &mut factors).unwrap();
    assert_eq!("myvalue", resolved);
}
/// Copying from query node 1 yields the value of the matched token.
#[test]
fn test_resolve_value_copy() {
    let mut corpus_update = GraphUpdate::new();
    example_generator::create_corpus_structure_simple(&mut corpus_update);
    example_generator::create_tokens(&mut corpus_update, Some("root/doc1"));
    let mut graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
    graph.apply_update(&mut corpus_update, |_msg| {}).unwrap();

    let config = r#"
[[rules]]
query = "tok"
target = 1
anno = "test_ns::test"
value = {copy = 1}
"#;
    let mapping: Mapping = toml::from_str(config).unwrap();

    let tok_match = graph
        .get_node_annos()
        .exact_anno_search(Some("annis"), "tok", ValueSearch::Some("complicated"))
        .next()
        .unwrap()
        .unwrap();
    let mut factors = BTreeMap::default();
    let resolved = mapping.rules[0]
        .resolve_value(&graph, &[tok_match], &mut factors)
        .unwrap();
    assert_eq!("complicated", resolved);
}
/// A plain replacement rewrites the matching part of the token value.
#[test]
fn test_resolve_value_replace_simple() {
    let mut corpus_update = GraphUpdate::new();
    example_generator::create_corpus_structure_simple(&mut corpus_update);
    example_generator::create_tokens(&mut corpus_update, Some("root/doc1"));
    let mut graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
    graph.apply_update(&mut corpus_update, |_msg| {}).unwrap();

    let rule = Rule {
        query: "tok".to_string(),
        target: super::TargetRef::Node(1),
        anno: AnnoKey {
            name: "test".into(),
            ns: "test_ns".into(),
        },
        value: Value::Replace {
            target: TargetRef::Node(1),
            replacements: vec![("cat".to_string(), "dog".to_string())],
            delimiter: default_value_delimiter(),
        },
        delete: vec![],
    };

    let tok_match = graph
        .get_node_annos()
        .exact_anno_search(Some("annis"), "tok", ValueSearch::Some("complicated"))
        .next()
        .unwrap()
        .unwrap();
    let mut factors = BTreeMap::default();
    let resolved = rule
        .resolve_value(&graph, &[tok_match], &mut factors)
        .unwrap();
    assert_eq!("complidoged", resolved);
}
// Copies the values of query nodes 1 and 2, joined with "###", onto query
// node 2; the values of the resulting `test_ns:test` annotations are compared
// with the stored snapshot.
#[test]
fn test_resolve_value_copy_span_custom_delim() {
let mut updates = GraphUpdate::new();
example_generator::create_corpus_structure_simple(&mut updates);
example_generator::create_tokens(&mut updates, Some("root/doc1"));
let mut g = AnnotationGraph::with_default_graphstorages(false).unwrap();
g.apply_update(&mut updates, |_msg| {}).unwrap();
let fixed_value = Rule {
query: "tok . tok=/complicated/".to_string(),
target: super::TargetRef::Node(2),
anno: AnnoKey {
name: "test".into(),
ns: "test_ns".into(),
},
value: Value::Copy {
delimiter: "###".to_string(),
copy: TargetRef::Span(vec![1, 2]),
},
delete: vec![],
};
let mapper = MapAnnos {
rule_file: None,
mapping: Some(Mapping {
rules: vec![fixed_value],
repetition: RepetitionMode::Fixed { n: 1 },
}),
debug: false,
};
let r = mapper.manipulate_corpus(
&mut g,
Path::new("./"),
StepID {
module_name: "test".to_string(),
path: None,
},
None,
);
assert!(r.is_ok(), "Err: {:?}", r.err().unwrap());
let query = aql::parse("test_ns:test", false).unwrap();
let search = execute_query_on_graph(&g, &query, true, None).unwrap();
assert_snapshot!(
search
.map(|m| {
let m = m.unwrap();
let m = m.get(0).unwrap();
g.get_node_annos()
.get_value_for_item(&m.node, &m.anno_key)
.unwrap()
.unwrap()
})
.join("\n")
);
}
// Joins the values of query nodes 1 and 2 with "###", replaces "cat" by
// "dog" and annotates query node 2; the values of the resulting
// `test_ns:test` annotations are compared with the stored snapshot.
#[test]
fn test_resolve_value_replace_span_custom_delim() {
let mut updates = GraphUpdate::new();
example_generator::create_corpus_structure_simple(&mut updates);
example_generator::create_tokens(&mut updates, Some("root/doc1"));
let mut g = AnnotationGraph::with_default_graphstorages(false).unwrap();
g.apply_update(&mut updates, |_msg| {}).unwrap();
let fixed_value = Rule {
query: "tok . tok=/complicated/".to_string(),
target: super::TargetRef::Node(2),
anno: AnnoKey {
name: "test".into(),
ns: "test_ns".into(),
},
value: Value::Replace {
target: TargetRef::Span(vec![1, 2]),
replacements: vec![("cat".to_string(), "dog".to_string())],
delimiter: "###".to_string(),
},
delete: vec![],
};
let mapper = MapAnnos {
rule_file: None,
mapping: Some(Mapping {
rules: vec![fixed_value],
repetition: RepetitionMode::Fixed { n: 1 },
}),
debug: false,
};
let r = mapper.manipulate_corpus(
&mut g,
Path::new("./"),
StepID {
module_name: "test".to_string(),
path: None,
},
None,
);
assert!(r.is_ok(), "Err: {:?}", r.err().unwrap());
let query = aql::parse("test_ns:test", false).unwrap();
let search = execute_query_on_graph(&g, &query, true, None).unwrap();
assert_snapshot!(
search
.map(|m| {
let m = m.unwrap();
let m = m.get(0).unwrap();
g.get_node_annos()
.get_value_for_item(&m.node, &m.anno_key)
.unwrap()
.unwrap()
})
.join("\n")
);
}
/// The replacement string may refer to the whole match via `$0`.
#[test]
fn test_resolve_value_replace_with_backreference() {
    let mut corpus_update = GraphUpdate::new();
    example_generator::create_corpus_structure_simple(&mut corpus_update);
    example_generator::create_tokens(&mut corpus_update, Some("root/doc1"));
    let mut graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
    graph.apply_update(&mut corpus_update, |_msg| {}).unwrap();

    let rule = Rule {
        query: "tok".to_string(),
        target: super::TargetRef::Node(1),
        anno: AnnoKey {
            name: "test".into(),
            ns: "test_ns".into(),
        },
        value: Value::Replace {
            target: TargetRef::Node(1),
            replacements: vec![("cat.*".to_string(), "$0$0".to_string())],
            delimiter: default_value_delimiter(),
        },
        delete: vec![],
    };

    let tok_match = graph
        .get_node_annos()
        .exact_anno_search(Some("annis"), "tok", ValueSearch::Some("complicated"))
        .next()
        .unwrap()
        .unwrap();
    let mut factors = BTreeMap::default();
    let resolved = rule
        .resolve_value(&graph, &[tok_match], &mut factors)
        .unwrap();
    assert_eq!("complicatedcated", resolved);
}
/// A replacement with capture groups, read from TOML, abbreviates "New York".
#[test]
fn test_parse_complicated_replace() {
    let config = r#"
[[rules]]
query = "tok=\"New York\""
target = 1
anno = "abbr"
[rules.value]
target = 1
replacements = [["([A-Z])[a-z]+ ([A-Z])[a-z]+", "${1}${2}"]]
"#;
    let mapping: Mapping = toml::from_str(config).unwrap();
    let graph = source_graph(false).unwrap();

    let newyork_match = graph
        .get_node_annos()
        .exact_anno_search(Some("annis"), "tok", ValueSearch::Some("New York"))
        .next()
        .unwrap()
        .unwrap();
    let mut factors = BTreeMap::default();
    let abbreviation = mapping.rules[0]
        .resolve_value(&graph, &[newyork_match], &mut factors)
        .unwrap();
    assert_eq!("NY", abbreviation);
}
/// A chain of replacements expands abbreviation marks into all readings, for
/// tokens with one and with several macrons.
#[test]
fn test_ridges_clean_resolver() {
    let config = r#"
[[rules]]
query = "tok"
target = 1
anno = "test::clean"
[rules.value]
target = 1
replacements = [
['ð', 'der'],
['(.*)(.)\u0304(.*)', '$1$2/MACRON_M/$3|$1$2/MACRON_N/$3'],
['([^|]*)([^|])\u0304([^|]*)', '$1$2/MACRON_M/$3|$1$2/MACRON_N/$3'],
['/MACRON_M/', 'm'],
['/MACRON_N/', 'n'],
]
"#;
    let mapping: Mapping = toml::from_str(config).unwrap();
    let graph = tokens_with_macrons().unwrap();
    let rule = &mapping.rules[0];

    // Finds the token with the given text and resolves the rule value for it.
    let resolve_for_token = |token_text: &str| -> String {
        let token_match = graph
            .get_node_annos()
            .exact_anno_search(Some("annis"), "tok", ValueSearch::Some(token_text))
            .next()
            .unwrap()
            .unwrap();
        rule.resolve_value(&graph, &[token_match], &mut BTreeMap::default())
            .unwrap()
    };

    let single_macron = resolve_for_token("anðthalbē");
    assert_eq!("anderthalbem|anderthalben", single_macron);

    let multiple_macrons = resolve_for_token("ellēbogē");
    assert_eq!(
        "ellembogem|ellenbogem|ellembogen|ellenbogen",
        multiple_macrons
    );
}
/// Three fixed runs replace the last three characters of every token.
#[test]
fn repeat_mapping_fixed() {
    let config = r#"
repetition = {Fixed = {n = 3}}
[[rules]]
query = "tok"
target = 1
anno = "annis::tok"
[rules.value]
target = 1
# Only replace the last character of each token.
replacements = [
['(\w\u0304?)X*$', 'X'],
]
"#;
    let mut graph = tokens_with_macrons().unwrap();

    let rule_file = NamedTempFile::new().unwrap();
    std::fs::write(rule_file.path(), config).unwrap();
    let workflow_dir = rule_file.path().parent().unwrap();

    let mapper = MapAnnos {
        rule_file: Some(rule_file.path().to_path_buf()),
        mapping: None,
        debug: true,
    };
    let step_id = StepID {
        module_name: "test_map".to_string(),
        path: None,
    };
    mapper
        .manipulate_corpus(&mut graph, workflow_dir, step_id, None)
        .unwrap();

    let token_helper = TokenHelper::new(&graph).unwrap();
    let ordered_token = token_helper.get_ordered_token("doc", None).unwrap();
    let text = token_helper.spanned_text(&ordered_token).unwrap();
    assert_eq!("X krX wechX etX anðthaX ellēbX hX", text);
}
/// Repeating until nothing changes reduces every token to a single "X".
#[test]
fn repeat_mapping_until_unchanged() {
    let config = r#"
repetition = "UntilUnchanged"
[[rules]]
query = 'tok!="X"'
target = 1
anno = "annis::tok"
[rules.value]
target = 1
replacements = [
['[^X]X*$', 'X'],
]
"#;
    let mut graph = tokens_with_macrons().unwrap();

    let rule_file = NamedTempFile::new().unwrap();
    std::fs::write(rule_file.path(), config).unwrap();
    let workflow_dir = rule_file.path().parent().unwrap();

    let mapper = MapAnnos {
        rule_file: Some(rule_file.path().to_path_buf()),
        mapping: None,
        debug: true,
    };
    let step_id = StepID {
        module_name: "test_map".to_string(),
        path: None,
    };
    mapper
        .manipulate_corpus(&mut graph, workflow_dir, step_id, None)
        .unwrap();

    let token_helper = TokenHelper::new(&graph).unwrap();
    let ordered_token = token_helper.get_ordered_token("doc", None).unwrap();
    let text = token_helper.spanned_text(&ordered_token).unwrap();
    assert_eq!("X X X X X X X", text);
}
/// A rule with a list target creates one span that starts at the first and
/// ends at the second matched token.
#[test]
fn test_map_spans() {
    let mut corpus_update = GraphUpdate::new();
    example_generator::create_corpus_structure_simple(&mut corpus_update);
    example_generator::create_tokens(&mut corpus_update, Some("root/doc1"));
    let mut graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
    graph.apply_update(&mut corpus_update, |_msg| {}).unwrap();

    let config = r#"
[[rules]]
query = "tok=/more/ . tok"
target = [1,2]
anno = "mapper::form"
value = "comparison"
"#;
    let rule_file = NamedTempFile::new().unwrap();
    std::fs::write(rule_file.path(), config).unwrap();
    let mapper = MapAnnos {
        rule_file: Some(rule_file.path().to_path_buf()),
        mapping: None,
        debug: true,
    };
    let step_id = StepID {
        module_name: "test_map".to_string(),
        path: None,
    };

    let validation_step = StepID {
        module_name: "test".to_string(),
        path: None,
    };
    mapper
        .validate_graph(&mut graph, validation_step, None)
        .unwrap();
    let workflow_dir = rule_file.path().parent().unwrap();
    mapper
        .manipulate_corpus(&mut graph, workflow_dir, step_id, None)
        .unwrap();
    graph.calculate_all_statistics().unwrap();

    // The new span must be left-aligned with "more" and right-aligned with
    // "complicated".
    let query = aql::parse(
        "mapper:form=\"comparison\" & \"more\" . \"complicated\" & #1 _l_ #2 & #1 _r_ #3",
        false,
    )
    .unwrap();
    let result: Vec<_> = aql::execute_query_on_graph(&graph, &query, true, None)
        .unwrap()
        .collect();
    assert_eq!(1, result.len(), "Results are: {:?}", result);
    assert_eq!(true, result[0].is_ok());
}
/// Runs the shared mapping scenario with `on_disk = false`.
#[test]
fn test_map_annos_in_mem() {
    let outcome = main_test(false);
    assert!(outcome.is_ok(), "Error: {:?}", outcome.err());
}
/// Runs the shared mapping scenario with `on_disk = true`.
#[test]
fn test_map_annos_on_disk() {
    let outcome = main_test(true);
    assert!(outcome.is_ok(), "Error: {:?}", outcome.err());
}
/// Maps part-of-speech values onto the tokens of the source graph via a rule
/// file and compares the result with the expected target graph, both as a
/// whole and query by query.
fn main_test(on_disk: bool) -> Result<(), Box<dyn std::error::Error>> {
    let config = r#"
[[rules]]
query = "tok=/I/"
target = 1
anno = "pos"
value = "PRON"
[[rules]]
query = "tok=/am/"
target = 1
anno = "pos"
value = "VERB"
[[rules]]
query = "tok=/in/"
target = 1
anno = "pos"
value = "ADP"
[[rules]]
query = "tok=/New York/"
target = 1
anno = "pos"
value = "PROPN"
"#;
    let rule_file = NamedTempFile::new().unwrap();
    std::fs::write(rule_file.path(), config).unwrap();
    let mapper = MapAnnos {
        rule_file: Some(rule_file.path().to_path_buf()),
        mapping: None,
        debug: true,
    };

    let mut g = source_graph(on_disk)?;
    // The receiver is kept alive so that status messages can be sent.
    let (sender, _receiver) = mpsc::channel();
    let step_id = StepID {
        module_name: "test_map".to_string(),
        path: None,
    };
    let workflow_dir = rule_file.path().parent().unwrap();
    mapper
        .manipulate_corpus(&mut g, workflow_dir, step_id, Some(sender))
        .unwrap();

    let e_g = target_graph(on_disk)?;
    test_util::compare_graphs(&g, &e_g);

    let queries = [
        ("tok=/I/ _=_ pos=/PRON/", 1),
        ("tok=/am/ _=_ pos=/VERB/", 1),
        ("tok=/in/ _=_ pos=/ADP/", 1),
        ("tok=/New York/ _=_ pos=/PROPN/", 1),
    ];
    for (query_s, expected_n) in queries {
        let query = graphannis::aql::parse(query_s, false).unwrap();
        let expected_result: Result<Vec<_>, graphannis::errors::GraphAnnisError> =
            graphannis::aql::execute_query_on_graph(&e_g, &query, false, None)?.collect();
        let actual_result: Result<Vec<_>, graphannis::errors::GraphAnnisError> =
            graphannis::aql::execute_query_on_graph(&g, &query, false, None)?.collect();
        let mut matches_e = expected_result.unwrap();
        let mut matches_g = actual_result.unwrap();
        assert_eq!(
            matches_e.len(),
            expected_n,
            "Number of results for query `{}` does not match for expected graph. Expected:{} vs. Is:{}",
            query_s,
            expected_n,
            matches_e.len()
        );
        // The order of the matches is irrelevant for the comparison.
        matches_e.sort();
        matches_g.sort();
        assert_eq!(
            matches_e, matches_g,
            "Matches for query '{query_s}' are not equal. {matches_e:?} != {matches_g:?}"
        );
    }
    Ok(())
}
// Imports the TreeTagger test corpus, applies two factorize rules (one over
// the combination of `pos` and `lemma`, one over `pos` alone) and compares
// the GraphML export with the stored snapshot.
#[test]
fn factorize() {
let g = AnnotationGraph::with_default_graphstorages(false);
assert!(g.is_ok());
let mut graph = g.unwrap();
let import: Result<ImportTreeTagger, _> =
toml::from_str(r#"column_names = ["tok", "pos", "lemma"]"#);
assert!(import.is_ok());
let import = import.unwrap();
let u = import.import_corpus(
Path::new("tests/data/graph_op/map/factorize"),
StepID {
module_name: "test_import".to_string(),
path: None,
},
import.default_configuration(),
None,
);
assert!(u.is_ok());
assert!(graph.apply_update(&mut u.unwrap(), |_| {}).is_ok());
let m: Result<MapAnnos, _> = toml::from_str(
r#"
[[mapping.rules]]
query = "pos _=_ lemma"
target = 1
anno = "factors_pos_lemma"
value = { factorize = [1, 2] }
[[mapping.rules]]
query = "pos"
target = 1
anno = "factors_pos"
value = { factorize = 1 }
"#,
);
assert!(m.is_ok(), "Error deserializing: {:?}", m.err().unwrap());
let module = m.unwrap();
let r = module.manipulate_corpus(
&mut graph,
Path::new("./"),
StepID {
module_name: "test_manip".to_string(),
path: None,
},
None,
);
assert!(r.is_ok(), "Error applying map: {:?}", r.err().unwrap());
let export: Result<GraphMLExporter, _> = toml::from_str("stable_order = true");
assert!(export.is_ok());
assert_snapshot!(export_to_string(&graph, export.unwrap()).unwrap());
}
/// Builds the input graph of the main test: a corpus node `doc` and the four
/// tokens "I", "am", "in", "New York" (`doc#t1` … `doc#t4`), chained by
/// ordering edges.
fn source_graph(on_disk: bool) -> Result<AnnotationGraph, Box<dyn std::error::Error>> {
    let mut graph = AnnotationGraph::with_default_graphstorages(on_disk)?;
    let mut update = GraphUpdate::default();
    update.add_event(UpdateEvent::AddNode {
        node_name: "doc".to_string(),
        node_type: "corpus".to_string(),
    })?;
    let token_texts = ["I", "am", "in", "New York"];
    for (i, text) in token_texts.iter().enumerate() {
        // Token names are 1-based, so `doc#t{i}` is the preceding token.
        let node_name = format!("doc#t{}", i + 1);
        update.add_event(UpdateEvent::AddNode {
            node_name: node_name.clone(),
            node_type: "node".to_string(),
        })?;
        update.add_event(UpdateEvent::AddNodeLabel {
            node_name: node_name.clone(),
            anno_ns: ANNIS_NS.to_string(),
            anno_name: "tok".to_string(),
            anno_value: text.to_string(),
        })?;
        let has_predecessor = i > 0;
        if has_predecessor {
            update.add_event(UpdateEvent::AddEdge {
                source_node: format!("doc#t{i}"),
                target_node: node_name.clone(),
                layer: ANNIS_NS.to_string(),
                component_type: AnnotationComponentType::Ordering.to_string(),
                component_name: "".to_string(),
            })?;
        }
    }
    graph.apply_update(&mut update, |_| {})?;
    Ok(graph)
}
/// Builds an on-disk graph with a corpus node `doc` and seven tokens
/// (`doc#t1` … `doc#t7`), some of which contain special characters and
/// macrons. Every token is part of `doc`, and the tokens are chained by
/// ordering edges.
fn tokens_with_macrons() -> Result<AnnotationGraph, Box<dyn std::error::Error>> {
    let mut g = AnnotationGraph::with_default_graphstorages(true)?;
    let mut u = GraphUpdate::default();
    u.add_event(UpdateEvent::AddNode {
        node_name: "doc".to_string(),
        node_type: "corpus".to_string(),
    })?;
    for (i, text) in [
        "ein",
        "kraut",
        "wechſzt",
        "etwan",
        "anðthalbē",
        "ellēbogē",
        "hoch",
    ]
    .iter()
    .enumerate()
    {
        // Token names are 1-based, so `doc#t{i}` is the preceding token.
        let node_name = format!("doc#t{}", i + 1);
        u.add_event(UpdateEvent::AddNode {
            node_name: node_name.to_string(),
            node_type: "node".to_string(),
        })?;
        u.add_event(UpdateEvent::AddNodeLabel {
            node_name: node_name.to_string(),
            anno_ns: ANNIS_NS.to_string(),
            anno_name: "tok".to_string(),
            anno_value: text.to_string(),
        })?;
        // The part-of edge starts at the token itself. Using the 0-based
        // `doc#t{i}` here would name a non-existent node for the first token
        // and leave the last token without an edge to the document.
        u.add_event(UpdateEvent::AddEdge {
            source_node: node_name.to_string(),
            target_node: "doc".to_string(),
            layer: ANNIS_NS.to_string(),
            component_type: AnnotationComponentType::PartOf.to_string(),
            component_name: "".to_string(),
        })?;
        if i > 0 {
            u.add_event(UpdateEvent::AddEdge {
                source_node: format!("doc#t{i}"),
                target_node: node_name.to_string(),
                layer: ANNIS_NS.to_string(),
                component_type: AnnotationComponentType::Ordering.to_string(),
                component_name: "".to_string(),
            })?;
        }
    }
    g.apply_update(&mut u, |_| {})?;
    Ok(g)
}
/// Builds the expected graph of the main test: the source graph plus a `pos`
/// annotation (empty namespace) on each of the four tokens.
fn target_graph(on_disk: bool) -> Result<AnnotationGraph, Box<dyn std::error::Error>> {
    let mut graph = source_graph(on_disk)?;
    let mut update = GraphUpdate::default();
    let pos_values = ["PRON", "VERB", "ADP", "PROPN"];
    for (i, pos_val) in pos_values.iter().enumerate() {
        // Token names are 1-based.
        let node_name = format!("doc#t{}", i + 1);
        update.add_event(UpdateEvent::AddNode {
            node_name: node_name.clone(),
            node_type: "node".to_string(),
        })?;
        update.add_event(UpdateEvent::AddNodeLabel {
            node_name: node_name.clone(),
            anno_ns: "".to_string(),
            anno_name: "pos".to_string(),
            anno_value: pos_val.to_string(),
        })?;
    }
    graph.apply_update(&mut update, |_| {})?;
    Ok(graph)
}
}