use std::{
collections::{BTreeMap, btree_map::Entry},
fs::File,
io::{BufWriter, Write},
path::Path,
sync::Arc,
};
use anyhow::anyhow;
use facet::Facet;
use graphannis::{
AnnotationGraph,
graph::{AnnoKey, GraphStorage, NodeID},
model::{AnnotationComponent, AnnotationComponentType},
};
use graphannis_core::{
dfs::{self, CycleSafeDFS},
graph::{ANNIS_NS, DEFAULT_NS, NODE_NAME_KEY},
};
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use super::Exporter;
use crate::{
progress::ProgressReporter,
util::token_helper::{TOKEN_KEY, TokenHelper},
};
/// Strategy that determines the tag name written for a span.
///
/// Serialized with serde as an adjacently tagged enum: the variant is stored
/// in the `strategy` field (snake_case) and the payload of `Fixed` in `name`.
#[derive(Facet, Clone, Debug, Deserialize, PartialEq, Serialize, Default)]
#[repr(u8)]
#[serde(tag = "strategy", content = "name", rename_all = "snake_case")]
enum SpanName {
/// Use the name of the first annotation key (in sort order) of the span
/// that is not in the `ANNIS_NS` namespace; `span` is used if the span has
/// no such key. This is the default strategy.
#[default]
FirstAnnoName,
/// Use the namespace of the first annotation key (in sort order) of the
/// span that is not in the `ANNIS_NS` namespace; `span` is used if the
/// span has no such key.
FirstAnnoNamespace,
/// Use the given string as tag name for every span.
Fixed(String),
}
/// Exporter that writes each document of a corpus to its own text file with
/// one token per line, followed by tab-separated annotation columns.
/// Document metadata and spans are optionally written as XML-like tags
/// around the token lines.
#[derive(Facet, Deserialize, Serialize, Clone, PartialEq)]
#[serde(deny_unknown_fields)]
pub struct ExportTreeTagger {
/// Annotation keys whose values are written, in this order, as additional
/// tab-separated columns after the token value. A token without a value
/// for a key gets an empty column. Defaults to `pos` and `lemma` in the
/// `DEFAULT_NS` namespace.
#[serde(
default = "default_column_names",
with = "crate::estarde::anno_key::in_sequence"
)]
column_names: Vec<AnnoKey>,
/// Optional segmentation to export instead of the base tokens. When set,
/// a matching ordering component is used to walk the nodes (if one can be
/// found) and the segmentation name is used as annotation name for the
/// value of nodes that carry no token value.
#[serde(default)]
segmentation: Option<String>,
/// Strategy used to derive the tag name of a span.
#[serde(default)]
span_names: SpanName,
/// Annotation key that marks a node as a document; its value is used as
/// the file name (without extension) of the exported document. Defaults
/// to the key `doc` in the `ANNIS_NS` namespace.
#[serde(default = "default_doc_anno", with = "crate::estarde::anno_key")]
doc_anno: AnnoKey,
/// If `true`, the `<doc …>` header with the document's annotations and the
/// closing `</doc>` line are not written.
#[serde(default)]
skip_meta: bool,
/// If `true`, no opening or closing span tags are written.
#[serde(default)]
skip_spans: bool,
}
/// Serde default for `doc_anno`: the annotation key named `doc` in the
/// `ANNIS_NS` namespace.
fn default_doc_anno() -> AnnoKey {
    let ns = ANNIS_NS.into();
    let name = "doc".into();
    AnnoKey { name, ns }
}
/// Serde default for `column_names`: the keys `pos` and `lemma` (in that
/// order), both in the `DEFAULT_NS` namespace.
fn default_column_names() -> Vec<AnnoKey> {
    ["pos", "lemma"]
        .into_iter()
        .map(|column| AnnoKey {
            name: column.into(),
            ns: DEFAULT_NS.into(),
        })
        .collect()
}
/// Mirrors the serde defaults of the individual fields, so a default-constructed
/// exporter equals one deserialized from an empty configuration.
impl Default for ExportTreeTagger {
    fn default() -> Self {
        // Fields are listed in declaration order.
        Self {
            column_names: default_column_names(),
            segmentation: None,
            span_names: SpanName::default(),
            doc_anno: default_doc_anno(),
            skip_meta: false,
            skip_spans: false,
        }
    }
}
/// File extension (without the leading dot) appended to the name of every
/// exported document file.
const FILE_EXTENSION: &str = "tt";
impl Exporter for ExportTreeTagger {
/// Exports every document of `graph` into its own file below `output_path`.
///
/// The output directory is created if necessary. Nodes are walked along the
/// base ordering component (namespace `ANNIS_NS`, empty name) unless
/// `segmentation` is configured and a matching ordering component is found:
/// if exactly one ordering component matches the segmentation it is used,
/// otherwise the first match whose layer equals one of the candidates from
/// `possible_namespace_for_segmentation` wins. If no component matches, the
/// base ordering is kept.
///
/// For every root node of the selected ordering, the PartOf component is
/// searched for the first node carrying `doc_anno`; that node is treated as
/// the document the root node belongs to.
///
/// # Errors
///
/// Fails if the output directory cannot be created, if the ordering or
/// PartOf graph storage is unavailable, if any graph or annotation lookup
/// fails, if a document has more than one start node, or if writing a
/// document fails.
fn export_corpus(
    &self,
    graph: &graphannis::AnnotationGraph,
    output_path: &std::path::Path,
    step_id: crate::StepID,
    tx: Option<crate::workflow::StatusSender>,
) -> Result<(), Box<dyn std::error::Error>> {
    let _progress = ProgressReporter::new_unknown_total_work(tx.clone(), step_id.clone())?;
    std::fs::create_dir_all(output_path)?;

    // Start with the base ordering and only replace it when the configured
    // segmentation resolves to an existing ordering component.
    let mut selected_ordering = AnnotationComponent::new(
        AnnotationComponentType::Ordering,
        ANNIS_NS.into(),
        "".into(),
    );
    if let Some(seg) = &self.segmentation {
        let matching_components =
            graph.get_all_components(Some(AnnotationComponentType::Ordering), Some(seg));
        if matching_components.len() == 1 {
            selected_ordering = matching_components[0].clone();
        } else {
            // Ambiguous (or no) match: prefer layers in the order given by
            // the candidate list.
            for layer in self.possible_namespace_for_segmentation() {
                if let Some(matching) = matching_components
                    .iter()
                    .find(|c| c.layer.as_str() == layer)
                {
                    selected_ordering = matching.clone();
                    break;
                }
            }
        }
    }

    let gs_ordering = graph
        .get_graphstorage(&selected_ordering)
        .ok_or_else(|| anyhow!("Storage of ordering component unavailable"))?;
    let part_of_storage = graph
        .get_graphstorage(&AnnotationComponent::new(
            AnnotationComponentType::PartOf,
            ANNIS_NS.into(),
            "".into(),
        ))
        .ok_or_else(|| anyhow!("Part-of storage unavailable."))?;

    // Map each document node to the root node of its ordering chain.
    let mut doc_node_to_start = BTreeMap::new();
    for node in gs_ordering.root_nodes() {
        let node = node?;
        let dfs = CycleSafeDFS::new(
            part_of_storage.as_edgecontainer(),
            node,
            0,
            NodeID::MAX as usize,
        );
        for n in dfs {
            let n = n?.node;
            // Propagate storage errors instead of treating them as "not a
            // document node", which would silently drop documents.
            if graph
                .get_node_annos()
                .has_value_for_item(&n, &self.doc_anno)?
            {
                if let Entry::Vacant(e) = doc_node_to_start.entry(n) {
                    e.insert(node);
                    // The first document node found decides the assignment.
                    break;
                } else {
                    let doc_node_name = graph
                        .get_node_annos()
                        .get_value_for_item(&n, &NODE_NAME_KEY)?
                        .unwrap_or_default();
                    return Err(anyhow!(
                        "Document {doc_node_name} has more than one start node for base ordering."
                    )
                    .into());
                }
            }
        }
    }

    let progress = ProgressReporter::new(tx, step_id, doc_node_to_start.len())?;
    progress.info(format!("Exporting {} documents", doc_node_to_start.len()))?;
    doc_node_to_start
        .into_iter()
        .try_for_each(move |(doc, start)| -> anyhow::Result<()> {
            self.export_document(graph, output_path, doc, start, gs_ordering.clone())?;
            progress.worked(1)?;
            Ok(())
        })?;
    Ok(())
}
/// Returns the file extension used for exported files (`FILE_EXTENSION`).
fn file_extension(&self) -> &str {
FILE_EXTENSION
}
}
impl ExportTreeTagger {
/// Writes a single document to `<corpus_path>/<document name>.<extension>`.
///
/// The document name is the value of `doc_anno` on `doc_node`. Unless
/// `skip_meta` is set, the file starts with a metadata header and ends with
/// the matching footer. Between them, the nodes reachable from `start_node`
/// in `gs_ordering` are written one per line: the token value followed by
/// one tab-separated column per entry of `column_names` (empty if the node
/// has no value for that key). Unless `skip_spans` is set, opening span tags
/// precede and closing span tags follow the token line.
///
/// # Errors
///
/// Fails if the document name cannot be determined, if the file cannot be
/// created, written or flushed, or if any graph or annotation lookup fails.
fn export_document(
    &self,
    graph: &AnnotationGraph,
    corpus_path: &Path,
    doc_node: NodeID,
    start_node: NodeID,
    gs_ordering: Arc<dyn GraphStorage>,
) -> anyhow::Result<()> {
    let token_helper = TokenHelper::new(graph)?;
    let node_annos = graph.get_node_annos();
    let doc_node_name = node_annos
        .get_value_for_item(&doc_node, &self.doc_anno)?
        .ok_or_else(|| anyhow!("Could not determine document node name."))?;
    let file_path = corpus_path.join(format!("{doc_node_name}.{}", self.file_extension()));
    let mut w = BufWriter::new(File::create(file_path)?);
    let footer = if self.skip_meta {
        None
    } else {
        Some(self.write_metadata_header(graph, doc_node, &mut w)?)
    };
    let it = dfs::CycleSafeDFS::new(gs_ordering.as_edgecontainer(), start_node, 0, usize::MAX);
    for token in it {
        let token = token?.node;
        // By default the value is read from the token key. Nodes without a
        // token value are looked up under the segmentation name instead,
        // trying the candidate namespaces in order until one has a value.
        let mut matching_token_key = TOKEN_KEY.as_ref().clone();
        if !node_annos.has_value_for_item(&token, &matching_token_key)?
            && let Some(seg) = &self.segmentation
        {
            matching_token_key.name = seg.clone();
            for ns in self.possible_namespace_for_segmentation() {
                matching_token_key.ns = ns;
                if node_annos.has_value_for_item(&token, &matching_token_key)? {
                    break;
                }
            }
        }
        if !self.skip_spans {
            self.write_starting_spans(graph, token, &token_helper, &mut w)?;
        }
        let token_val = node_annos
            .get_value_for_item(&token, &matching_token_key)?
            .unwrap_or_default();
        write!(w, "{token_val}")?;
        for column in &self.column_names {
            let anno_value = node_annos
                .get_value_for_item(&token, column)?
                .unwrap_or_default();
            write!(w, "\t{anno_value}")?;
        }
        writeln!(w)?;
        if !self.skip_spans {
            self.write_ending_spans(graph, token, &token_helper, &mut w)?;
        }
    }
    if let Some(footer) = footer {
        writeln!(w, "{footer}")?;
    }
    // Flush explicitly: errors raised while `BufWriter` flushes on drop are
    // discarded, which could leave a truncated file behind without notice.
    w.flush()?;
    Ok(())
}
/// Writes the opening `<doc …>` line for `doc_node` to `w`.
///
/// Every annotation of the document node that is not in the `ANNIS_NS`
/// namespace becomes an attribute `name="value"`; name and value are
/// XML-escaped. Returns the closing tag the caller has to write after the
/// document content.
fn write_metadata_header<W: Write>(
    &self,
    graph: &AnnotationGraph,
    doc_node: NodeID,
    mut w: W,
) -> anyhow::Result<String> {
    write!(w, "<doc")?;
    let annotations = graph.get_node_annos().get_annotations_for_item(&doc_node)?;
    for anno in annotations {
        // Internal annotations are not part of the exported metadata.
        if anno.key.ns == ANNIS_NS {
            continue;
        }
        let name = quick_xml::escape::escape(&anno.key.name);
        let value = quick_xml::escape::escape(&anno.val);
        write!(w, " {}=\"{}\"", name, value)?;
    }
    writeln!(w, ">")?;
    Ok(String::from("</doc>"))
}
/// Writes one opening tag line for every span that starts at `token`.
///
/// Spans are the sources of the edges ending at the left token of `token`
/// in the left-token graph storage; segmentation spans are skipped. Each tag
/// carries the span's annotations outside the `ANNIS_NS` namespace as
/// XML-escaped attributes. Nothing is written if `token` has no left token.
fn write_starting_spans<W: Write>(
    &self,
    graph: &AnnotationGraph,
    token: NodeID,
    token_helper: &TokenHelper,
    mut w: W,
) -> anyhow::Result<()> {
    let Some(left_token) = token_helper.left_token_for(token)? else {
        return Ok(());
    };
    let node_annos = graph.get_node_annos();
    for edge_source in token_helper
        .get_gs_left_token()
        .get_ingoing_edges(left_token)
    {
        let span = edge_source?;
        if self.is_segmentation_span(span, graph, token_helper)? {
            continue;
        }
        let tag = self.tag_name_for_span(graph, span)?;
        write!(w, "<{tag}")?;
        for anno in node_annos.get_annotations_for_item(&span)? {
            if anno.key.ns == ANNIS_NS {
                continue;
            }
            let name = quick_xml::escape::escape(&anno.key.name);
            let value = quick_xml::escape::escape(&anno.val);
            write!(w, " {}=\"{}\"", name, value)?;
        }
        writeln!(w, ">")?;
    }
    Ok(())
}
/// Writes one closing tag line for every span that ends at `token`.
///
/// Spans are the sources of the edges ending at the right token of `token`
/// in the right-token graph storage; segmentation spans are skipped. Nothing
/// is written if `token` has no right token.
fn write_ending_spans<W: Write>(
    &self,
    graph: &AnnotationGraph,
    token: NodeID,
    token_helper: &TokenHelper,
    mut w: W,
) -> anyhow::Result<()> {
    let Some(right_token) = token_helper.right_token_for(token)? else {
        return Ok(());
    };
    for edge_source in token_helper
        .get_gs_right_token()
        .get_ingoing_edges(right_token)
    {
        let span = edge_source?;
        if self.is_segmentation_span(span, graph, token_helper)? {
            continue;
        }
        let tag = self.tag_name_for_span(graph, span)?;
        writeln!(w, "</{tag}>")?;
    }
    Ok(())
}
/// Tells whether `span` must be treated as a token/segmentation node rather
/// than as an annotation span.
///
/// This is the case if the node has a value for the token key or takes part
/// (as source or target of an edge) in any ordering component known to
/// `token_helper`.
fn is_segmentation_span(
    &self,
    span: NodeID,
    graph: &AnnotationGraph,
    token_helper: &TokenHelper,
) -> anyhow::Result<bool> {
    let has_token_value = graph
        .get_node_annos()
        .has_value_for_item(&span, &TOKEN_KEY)?;
    if has_token_value {
        return Ok(true);
    }
    for ordering in token_helper.get_gs_ordering().values() {
        let is_ordered =
            ordering.has_outgoing_edges(span)? || ordering.has_ingoing_edges(span)?;
        if is_ordered {
            return Ok(true);
        }
    }
    Ok(false)
}
/// Determines the tag name for `span` according to `span_names`.
///
/// For the fixed strategy the configured name is returned unchanged. For the
/// other strategies the span's annotation keys outside the `ANNIS_NS`
/// namespace are sorted and the XML-escaped name (or namespace) of the first
/// one is returned; `span` is the fallback when no such key exists.
fn tag_name_for_span(&self, graph: &AnnotationGraph, span: NodeID) -> anyhow::Result<String> {
    // The fixed strategy needs no annotation lookup at all.
    let use_namespace = match &self.span_names {
        SpanName::Fixed(name) => return Ok(name.clone()),
        SpanName::FirstAnnoName => false,
        SpanName::FirstAnnoNamespace => true,
    };
    let first_key = graph
        .get_node_annos()
        .get_all_keys_for_item(&span, None, None)?
        .into_iter()
        .filter(|key| key.ns != ANNIS_NS)
        .sorted()
        .next();
    let tag = match (&first_key, use_namespace) {
        (Some(key), true) => quick_xml::escape::escape(&key.ns).to_string(),
        (Some(key), false) => quick_xml::escape::escape(&key.name).to_string(),
        (None, _) => "span".to_string(),
    };
    Ok(tag)
}
/// Lists the namespaces (also used as component layers) that are tried, in
/// order of preference, when resolving the configured segmentation: the
/// segmentation name itself, `ANNIS_NS`, `DEFAULT_NS` and the empty string.
/// Returns an empty list if no segmentation is configured.
fn possible_namespace_for_segmentation(&self) -> Vec<String> {
    match &self.segmentation {
        Some(segmentation) => vec![
            segmentation.clone(),
            ANNIS_NS.to_string(),
            DEFAULT_NS.to_string(),
            String::new(),
        ],
        None => Vec::new(),
    }
}
}
#[cfg(test)]
mod tests;