use std::{
cmp::Ordering,
collections::{BTreeSet, HashMap, HashSet},
};
use anyhow::anyhow;
use facet::Facet;
use graphannis::{AnnotationGraph, aql, graph::GraphStorage, model::AnnotationComponentType};
use graphannis_core::{
annostorage::{NodeAnnotationStorage, ValueSearch},
graph::{ANNIS_NS, NODE_TYPE_KEY},
types::{AnnoKey, Component, NodeID},
util::join_qname,
};
use linked_hash_map::LinkedHashMap;
use serde::{Deserialize, Serialize};
use tempfile::NamedTempFile;
use umya_spreadsheet::{Cell, NumberingFormat};
use crate::{
importer::xlsx::SheetAddress,
progress::ProgressReporter,
util::token_helper::{TOKEN_KEY, TokenHelper},
};
use super::Exporter;
/// Configuration of the XLSX exporter.
///
/// Every document is written to its own workbook named `<document name>.xlsx`
/// in the output directory. Non-empty token values go to the first column
/// (header `tok`), and every span annotation gets a column of its own.
#[derive(Facet, Default, Deserialize, Serialize, Clone, PartialEq)]
#[serde(deny_unknown_fields)]
pub struct ExportXlsx {
/// If `true`, span column headers are the qualified annotation name
/// (namespace joined with the name); otherwise only the annotation name is
/// written. Defaults to `false`.
#[serde(default)]
include_namespace: bool,
/// Annotation keys whose columns should come first, in the given order.
/// A key in the graph matches an entry either exactly or, failing that, by
/// annotation name only. Keys not listed here follow, sorted by their
/// natural ordering. Defaults to an empty list.
#[serde(default, with = "crate::estarde::anno_key::in_sequence")]
annotation_order: Vec<AnnoKey>,
/// If `true` and the target file already exists, the workbook is first
/// written to a temporary file next to it and only replaces the existing
/// file when the spreadsheet diff reports sheet or cell differences.
/// Defaults to `false`.
#[serde(default)]
skip_unchanged_files: bool,
/// If set, an existing target workbook is opened instead of starting from a
/// new one, the addressed sheet is removed (numeric addresses are 1-based),
/// and the data is written to a newly created sheet named after the document
/// and the current local time. No `meta` sheet is written in this mode.
#[serde(default)]
update_datasheet: Option<SheetAddress>,
}
/// Collects the token nodes of a document that start an ordering chain.
///
/// Candidates are the first match node of every result of the AQL query
/// `tok @* annis:doc=/<doc_node_name>/`. A candidate is kept when the token
/// helper confirms it is a token and, if an ordering graph storage is given,
/// it has no ingoing edge in that storage. Without an ordering storage every
/// confirmed token is kept.
///
/// Query results that failed to evaluate are skipped, and a failing
/// ingoing-edge lookup is treated like "no ingoing edges".
///
/// # Errors
/// Fails if the query cannot be parsed or executed, or if the token check
/// itself fails.
fn find_token_roots(
    g: &graphannis::AnnotationGraph,
    doc_node_name: &str,
    token_helper: &TokenHelper,
    ordering_gs: Option<&dyn GraphStorage>,
) -> anyhow::Result<HashSet<NodeID>> {
    let query = aql::parse(&format!("tok @* annis:doc=/{doc_node_name}/"), false)?;
    let mut roots = HashSet::new();
    for match_group in aql::execute_query_on_graph(g, &query, true, None)?.flatten() {
        let Some(candidate) = match_group.first() else {
            continue;
        };
        if !token_helper.is_token(candidate.node)? {
            continue;
        }
        let starts_chain = match ordering_gs {
            None => true,
            Some(gs) => !gs.has_ingoing_edges(candidate.node).unwrap_or_default(),
        };
        if starts_chain {
            roots.insert(candidate.node);
        }
    }
    Ok(roots)
}
/// Decides whether the annotation key should be exported as a span column.
///
/// Returns `false` as soon as any node carrying the annotation is a token.
/// Otherwise returns `true` only if at least one carrying node has the node
/// type `node`.
///
/// # Errors
/// Propagates failures of the annotation search, the token check and the
/// node-type lookup.
fn is_span_column(
    anno_key: &AnnoKey,
    node_annos: &dyn NodeAnnotationStorage,
    token_helper: &TokenHelper,
) -> anyhow::Result<bool> {
    let mut found_plain_node = false;
    let carriers =
        node_annos.exact_anno_search(Some(&anno_key.ns), &anno_key.name, ValueSearch::Any);
    for carrier in carriers {
        let node = carrier?.node;
        // A single token carrying this annotation disqualifies the whole column.
        if token_helper.is_token(node)? {
            return Ok(false);
        }
        let node_type = node_annos.get_value_for_item(&node, &NODE_TYPE_KEY)?;
        if node_type.is_some_and(|value| value == "node") {
            found_plain_node = true;
        }
    }
    Ok(found_plain_node)
}
/// Looks up the configured column position for an annotation key.
///
/// An exact match on the full key wins. Otherwise entries that share only the
/// annotation name are considered, and the smallest position among them is
/// returned. Taking the minimum keeps the result independent of the hash
/// map's arbitrary iteration order when several entries share a name.
///
/// Returns `None` if neither the key nor its name is configured.
fn overwritten_position_for_key(
    anno_key: &AnnoKey,
    position_overwrite: &HashMap<AnnoKey, u32>,
) -> Option<u32> {
    position_overwrite.get(anno_key).copied().or_else(|| {
        position_overwrite
            .iter()
            .filter(|(candidate, _)| candidate.name.as_str() == anno_key.name.as_str())
            .map(|(_, position)| *position)
            .min()
    })
}
impl ExportXlsx {
/// Writes one document of the graph to `<output_path>/<doc_name>.xlsx`.
///
/// 1. If `update_datasheet` is set and the target file exists, that workbook
///    is opened; otherwise a new workbook is created.
/// 2. With `update_datasheet`, the addressed sheet is removed (numeric
///    addresses are 1-based) and a new sheet named
///    `<doc_name>-<local timestamp>` receives the data. Without it, the sheet
///    at index 0 is used.
/// 3. The token column and the span columns are written.
/// 4. Without `update_datasheet`, the document node's annotations outside the
///    `annis` namespace are written to a `meta` sheet (`Name`/`Value`).
/// 5. The workbook is saved. With `skip_unchanged_files` and an existing
///    target file, it is first written to a temporary file in the same
///    directory and only replaces the target when the diff reports changes.
///
/// # Errors
/// Fails on I/O and spreadsheet errors, on graph access errors, and when a
/// numeric `update_datasheet` address is 0 (there is no sheet before the
/// first one).
fn export_document(
    &self,
    doc_name: &str,
    doc_node_id: NodeID,
    g: &graphannis::AnnotationGraph,
    output_path: &std::path::Path,
    progress: &ProgressReporter,
) -> Result<(), anyhow::Error> {
    let output_path = output_path.join(format!("{doc_name}.xlsx"));
    let mut workbook = if output_path.exists() && self.update_datasheet.is_some() {
        umya_spreadsheet::reader::xlsx::read(&output_path)?
    } else {
        umya_spreadsheet::new_file()
    };
    let worksheet = if let Some(addr) = &self.update_datasheet {
        match addr {
            SheetAddress::Numeric(i) => {
                // Addresses are 1-based; a plain `i - 1` would underflow for 0.
                let sheet_index = i.checked_sub(1).ok_or_else(|| {
                    anyhow!("Numeric sheet addresses start at 1, but `update_datasheet` is {i}.")
                })?;
                workbook.remove_sheet(sheet_index).map_err(|e| anyhow!(e))?;
            }
            SheetAddress::Name(s) => {
                workbook.remove_sheet_by_name(s).map_err(|e| anyhow!(e))?;
            }
        }
        // The timestamp suffix keeps the new sheet name distinct from existing sheets.
        let sheet_name = format!(
            "{doc_name}-{}",
            chrono::Local::now().format("%Y-%m-%d-%H-%M-%S-%9f")
        );
        workbook
            .new_sheet(&sheet_name)
            .map_err(|e| anyhow!("Could not create new sheet with name `{sheet_name}`: {e}"))?
    } else {
        workbook
            .get_sheet_mut(&0)
            .ok_or_else(|| anyhow!("Could not obtain blank sheet."))?
    };
    let token_helper = TokenHelper::new(g)?;
    let ordering_component = Component::new(
        AnnotationComponentType::Ordering,
        ANNIS_NS.into(),
        "".into(),
    );
    let ordering_gs = g.get_graphstorage_as_ref(&ordering_component);
    let token_roots = find_token_roots(g, doc_name, &token_helper, ordering_gs)?;
    let (token_to_row, has_only_empty_token) =
        self.create_token_colum(g, &token_roots, doc_node_id, worksheet)?;
    // The token column only occupies column 1 if at least one token has a value.
    let column_offset = if !has_only_empty_token {
        set_cell_value(worksheet.get_cell_mut((1, 1)), "tok");
        1
    } else {
        0
    };
    let name_to_column = self.get_span_columns(g, &token_helper, column_offset)?;
    self.create_span_columns(
        g,
        &name_to_column,
        token_to_row,
        &token_helper,
        worksheet,
        progress,
    )?;
    let meta_annos = g.get_node_annos().get_annotations_for_item(&doc_node_id)?;
    if !meta_annos.is_empty() && self.update_datasheet.is_none() {
        let meta_sheet = workbook.new_sheet("meta").map_err(|s| anyhow!(s))?;
        set_cell_value(meta_sheet.get_cell_mut((1, 1)), "Name");
        set_cell_value(meta_sheet.get_cell_mut((2, 1)), "Value");
        let mut current_row = 2;
        for a in meta_annos {
            if a.key.ns != ANNIS_NS {
                set_cell_value(
                    meta_sheet.get_cell_mut((1, current_row)),
                    &join_qname(&a.key.ns, &a.key.name),
                );
                set_cell_value(meta_sheet.get_cell_mut((2, current_row)), &a.val);
                current_row += 1;
            }
        }
    }
    if self.skip_unchanged_files
        && output_path.is_file()
        && let Some(parent_dir) = output_path.parent()
    {
        // Write next to the target so the temporary file can be persisted in place.
        let tmp_out = NamedTempFile::with_suffix_in(".xlsx", parent_dir)?;
        umya_spreadsheet::writer::xlsx::write(&workbook, tmp_out.path())?;
        let diff = sheets_diff::core::diff::Diff::new(
            &output_path.to_string_lossy(),
            &tmp_out.path().to_string_lossy(),
        );
        let contains_changes = !diff.sheet_diff.is_empty() || !diff.cell_diffs.is_empty();
        if contains_changes {
            tmp_out.persist(output_path)?;
        }
    } else {
        umya_spreadsheet::writer::xlsx::write(&workbook, output_path)?;
    }
    Ok(())
}
/// Assigns a column index to every annotation key that is exported as a span
/// column.
///
/// All annotation keys of the graph are sorted so that keys configured in
/// `annotation_order` come first, in their configured order, followed by the
/// remaining keys in their natural ordering. Keys in the `annis` namespace
/// and keys that are not span columns are dropped. The surviving keys are
/// numbered consecutively, starting at `column_offset + 1`.
///
/// # Errors
/// Propagates failures of the annotation storage and of the span-column check.
fn get_span_columns(
    &self,
    g: &AnnotationGraph,
    token_helper: &TokenHelper,
    column_offset: u32,
) -> anyhow::Result<LinkedHashMap<AnnoKey, u32>> {
    let position_overwrite: HashMap<AnnoKey, u32> = self
        .annotation_order
        .iter()
        .enumerate()
        .map(|(idx, anno)| (anno.clone(), (1 + idx as u32) + column_offset))
        .collect();
    let node_annos = g.get_node_annos();
    let mut all_anno_keys = node_annos.annotation_keys()?;
    all_anno_keys.sort_by(|left, right| {
        let left_position = overwritten_position_for_key(left, &position_overwrite);
        let right_position = overwritten_position_for_key(right, &position_overwrite);
        match (left_position, right_position) {
            (Some(l), Some(r)) => l.cmp(&r),
            // Configured keys always precede unconfigured ones.
            (Some(_), None) => Ordering::Less,
            (None, Some(_)) => Ordering::Greater,
            (None, None) => left.cmp(right),
        }
    });
    let mut columns = LinkedHashMap::new();
    let mut next_column = column_offset + 1;
    for anno_key in all_anno_keys {
        if anno_key.ns == ANNIS_NS {
            continue;
        }
        if !is_span_column(&anno_key, node_annos, token_helper)? {
            continue;
        }
        columns.insert(anno_key, next_column);
        next_column += 1;
    }
    Ok(columns)
}
/// Writes the token values of a document into column 1, starting at row 2,
/// and returns the row of every visited token.
///
/// Of the given token roots, those connected to the document node via the
/// `PartOf` component are collected and the first of them is used as the
/// start. From there the outgoing edge of the ordering component is followed
/// token by token. Each visited token occupies one row; only values that are
/// not blank are written to the sheet.
///
/// Returns the token-to-row mapping and a flag that is `true` when no
/// non-blank token value was written.
///
/// # Errors
/// Fails with "Missing PartOf component" if the graph has no such component,
/// and propagates graph storage and annotation storage errors.
fn create_token_colum(
    &self,
    g: &AnnotationGraph,
    token_roots: &HashSet<NodeID>,
    doc_node_id: NodeID,
    worksheet: &mut umya_spreadsheet::Worksheet,
) -> anyhow::Result<(HashMap<NodeID, u32>, bool)> {
    let ordering_component = Component::new(
        AnnotationComponentType::Ordering,
        ANNIS_NS.into(),
        "".into(),
    );
    let ordering_gs = g.get_graphstorage_as_ref(&ordering_component);
    let part_of_component = Component::new(
        AnnotationComponentType::PartOf,
        ANNIS_NS.into(),
        "".into(),
    );
    let Some(gs_part_of) = g.get_graphstorage_as_ref(&part_of_component) else {
        return Err(anyhow!("Missing PartOf component"));
    };
    let mut token_roots_for_document = Vec::default();
    for t in token_roots {
        if gs_part_of.is_connected(*t, doc_node_id, 1, std::ops::Bound::Unbounded)? {
            token_roots_for_document.push(*t);
        }
    }
    let mut has_only_empty_token = true;
    let mut token_to_row = HashMap::new();
    // Only the first collected root is followed; any further roots are ignored.
    let mut token = token_roots_for_document.into_iter().next();
    // Row 1 is reserved for the column headers.
    let mut row_index = 2;
    while let Some(current_token) = token {
        if let Some(val) = g
            .get_node_annos()
            .get_value_for_item(&current_token, &TOKEN_KEY)?
            && !val.trim().is_empty()
        {
            has_only_empty_token = false;
            set_cell_value(worksheet.get_cell_mut((1, row_index)), &val);
        }
        token_to_row.insert(current_token, row_index);
        token = if let Some(ordering_gs) = ordering_gs
            && let Some(next_token) = ordering_gs.get_outgoing_edges(current_token).next()
        {
            Some(next_token?)
        } else {
            None
        };
        row_index += 1;
    }
    Ok((token_to_row, has_only_empty_token))
}
/// Writes the header and all span values of every span column.
///
/// For each annotation key the header cell in row 1 is written (qualified
/// name if `include_namespace` is set, plain name otherwise). Then every node
/// carrying the annotation is mapped to the rows of the tokens it reaches via
/// the coverage graph storages. The value goes into the first of these rows;
/// if the span reaches more than one row, the cells from the first to the
/// last row are merged.
///
/// A span is skipped with a warning if one of its rows already holds a span
/// of the same column. Spans that reach no known token row are skipped
/// silently.
///
/// # Errors
/// Propagates annotation storage and graph storage errors, and failures to
/// emit the warning.
fn create_span_columns(
    &self,
    g: &AnnotationGraph,
    name_to_column: &LinkedHashMap<AnnoKey, u32>,
    token_to_row: HashMap<NodeID, u32>,
    token_helper: &TokenHelper,
    worksheet: &mut umya_spreadsheet::Worksheet,
    progress: &ProgressReporter,
) -> anyhow::Result<()> {
    for (span_anno_key, column_index) in name_to_column.iter() {
        if self.include_namespace {
            set_cell_value(
                worksheet.get_cell_mut((*column_index, 1)),
                &join_qname(&span_anno_key.ns, &span_anno_key.name),
            );
        } else {
            set_cell_value(
                worksheet.get_cell_mut((*column_index, 1)),
                &span_anno_key.name,
            );
        }
        // Rows of this column that already hold a span value.
        let mut written_rows = BTreeSet::default();
        for span in g.get_node_annos().exact_anno_search(
            Some(&span_anno_key.ns),
            &span_anno_key.name,
            ValueSearch::Any,
        ) {
            let span = span?;
            let span_val = g
                .get_node_annos()
                .get_value_for_item(&span.node, &span.anno_key)?
                .unwrap_or_default();
            let mut spanned_rows = BTreeSet::new();
            for gs in token_helper.get_gs_coverage().iter() {
                for t in gs.get_outgoing_edges(span.node) {
                    let t = t?;
                    if let Some(row) = token_to_row.get(&t) {
                        spanned_rows.insert(*row);
                    }
                }
            }
            let (Some(&first), Some(&last)) = (spanned_rows.first(), spanned_rows.last())
            else {
                // The span reaches no token row of this document.
                continue;
            };
            let intersection_size = spanned_rows.intersection(&written_rows).count();
            if intersection_size > 0 {
                let msg = format!(
                    "Could not write span value {span_val} from row {first} to row {last} in column `{}::{}` in document {}. A span already exists in at least one of the affected rows. {intersection_size} node(s) overlap(s).",
                    span_anno_key.ns,
                    span_anno_key.name,
                    worksheet.get_name()
                );
                progress.warn(msg)?;
                continue;
            }
            set_cell_value(worksheet.get_cell_mut((*column_index, first)), &span_val);
            if last > first {
                let column_letter =
                    umya_spreadsheet::helper::coordinate::string_from_column_index(column_index);
                let range = format!("{column_letter}{first}:{column_letter}{last}");
                worksheet.add_merge_cells(range);
            }
            // NOTE(review): only the reached rows are recorded, not the whole
            // merged range, so a later span inside a gap is not detected as
            // overlapping. Confirm whether that is intended.
            written_rows.extend(spanned_rows);
        }
    }
    Ok(())
}
}
/// Stores `value` in the cell as a string and sets the cell's number format
/// to the text format.
fn set_cell_value(cell: &mut Cell, value: &str) {
    let number_format = cell.get_style_mut().get_number_format_mut();
    number_format.set_format_code(NumberingFormat::FORMAT_TEXT);
    cell.set_value_string(value);
}
impl Exporter for ExportXlsx {
fn export_corpus(
&self,
graph: &graphannis::AnnotationGraph,
output_path: &std::path::Path,
step_id: crate::StepID,
tx: Option<crate::workflow::StatusSender>,
) -> Result<(), Box<dyn std::error::Error>> {
let doc_annos = graph.get_node_annos().exact_anno_search(
Some(ANNIS_NS),
"doc",
graphannis_core::annostorage::ValueSearch::Any,
);
let mut document_names = Vec::new();
for m in doc_annos {
let m = m?;
if let Some(val) = graph
.get_node_annos()
.get_value_for_item(&m.node, &m.anno_key)?
{
document_names.push((val, m.node));
}
}
let reporter = ProgressReporter::new(tx, step_id, document_names.len())?;
std::fs::create_dir_all(output_path)?;
let results: anyhow::Result<Vec<_>> = document_names
.iter()
.map(|(doc_name, doc_node_id)| {
self.export_document(doc_name, *doc_node_id, graph, output_path, &reporter)?;
reporter.worked(1)?;
Ok(())
})
.collect();
results?;
Ok(())
}
fn file_extension(&self) -> &str {
"xlsx"
}
}
#[cfg(test)]
mod tests {
use std::{
fs::{self, File},
path::{Path, PathBuf},
};
use graphannis::update::{GraphUpdate, UpdateEvent};
use insta::assert_snapshot;
use sha2::{Digest, Sha256};
use tempfile::{TempDir, tempdir};
use crate::{
ExporterStep, ImporterStep, ReadFrom, WriteAs,
importer::{GenericImportConfiguration, Importer, xlsx::ImportSpreadsheet},
test_util::compare_graphs,
util::example_generator,
};
use super::*;
/// The default configuration serializes to TOML and matches the stored snapshot.
#[test]
fn serialize() {
    let serialization = toml::to_string(&ExportXlsx::default());
    assert!(
        serialization.is_ok(),
        "Serialization failed: {:?}",
        serialization.err()
    );
    assert_snapshot!(serialization.unwrap());
}
/// A configuration with namespaces enabled and two ordered annotation keys
/// serializes to TOML and matches the stored snapshot.
#[test]
fn serialize_custom() {
    let annotation_order = ["text", "edition"]
        .into_iter()
        .map(|label| AnnoKey {
            ns: label.into(),
            name: label.into(),
        })
        .collect();
    let module = ExportXlsx {
        include_namespace: true,
        annotation_order,
        skip_unchanged_files: false,
        update_datasheet: None,
    };
    let serialization = toml::to_string(&module);
    assert!(
        serialization.is_ok(),
        "Serialization failed: {:?}",
        serialization.err()
    );
    assert_snapshot!(serialization.unwrap());
}
/// Round trip with segmentation columns: import the fixture, export it with
/// the default exporter, re-import the export with the same importer
/// configuration and compare both graphs.
#[test]
fn with_segmentation() {
    const IMPORT_CONFIG: &str = r#"
column_map = {"dipl" = ["sentence"], "norm" = ["pos", "lemma", "seg"]}
"#;
    let first_config: ImportSpreadsheet = toml::from_str(IMPORT_CONFIG).unwrap();
    let first_import = ImporterStep {
        module: ReadFrom::Xlsx(first_config),
        path: Path::new("./tests/data/import/xlsx/clean/xlsx/").to_path_buf(),
        description: None,
        generic_config: None,
    };
    let mut original_updates = first_import.execute(None).unwrap();
    let mut original_graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
    original_graph
        .apply_update(&mut original_updates, |_| {})
        .unwrap();

    let tmp_outputdir = TempDir::new().unwrap();
    let output_dir = tmp_outputdir.path().join("xlsx");
    std::fs::create_dir(&output_dir).unwrap();
    let export_step = ExporterStep {
        module: WriteAs::Xlsx(ExportXlsx::default()),
        path: output_dir.clone(),
        description: None,
        extension: None,
    };
    export_step.execute(&original_graph, None).unwrap();

    let second_config: ImportSpreadsheet = toml::from_str(IMPORT_CONFIG).unwrap();
    let second_import = ImporterStep {
        module: ReadFrom::Xlsx(second_config),
        path: output_dir.clone(),
        description: None,
        generic_config: None,
    };
    let mut written_updates = second_import.execute(None).unwrap();
    let mut written_graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
    written_graph
        .apply_update(&mut written_updates, |_| {})
        .unwrap();
    compare_graphs(&original_graph, &written_graph);
}
/// Round trip with a token column: import, export, re-import, compare the
/// graphs, and check the number of matches for tokens and line-break spans in
/// the re-imported graph.
#[test]
fn with_token() {
    const IMPORT_CONFIG: &str = r#"
column_map = {"tok" = ["lb"]}
"#;
    let first_config: ImportSpreadsheet = toml::from_str(IMPORT_CONFIG).unwrap();
    let first_import = ImporterStep {
        module: ReadFrom::Xlsx(first_config),
        path: Path::new("./tests/data/import/xlsx/sample_sentence/").to_path_buf(),
        description: None,
        generic_config: None,
    };
    let mut original_updates = first_import.execute(None).unwrap();
    let mut original_graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
    original_graph
        .apply_update(&mut original_updates, |_| {})
        .unwrap();

    let tmp_outputdir = TempDir::new().unwrap();
    let output_dir = tmp_outputdir.path().join("sample_sentence");
    std::fs::create_dir(&output_dir).unwrap();
    let export_step = ExporterStep {
        module: WriteAs::Xlsx(ExportXlsx::default()),
        path: output_dir.clone(),
        description: None,
        extension: None,
    };
    export_step.execute(&original_graph, None).unwrap();

    let second_config: ImportSpreadsheet = toml::from_str(IMPORT_CONFIG).unwrap();
    let second_import = ImporterStep {
        module: ReadFrom::Xlsx(second_config),
        path: output_dir.clone(),
        description: None,
        generic_config: None,
    };
    let mut written_updates = second_import.execute(None).unwrap();
    let mut written_graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
    written_graph
        .apply_update(&mut written_updates, |_| {})
        .unwrap();
    compare_graphs(&original_graph, &written_graph);

    for (aql_query, expected_matches) in [("tok", 11), ("lb=\"1\"", 1), ("lb=\"2\"", 1)] {
        let parsed = graphannis::aql::parse(aql_query, false).unwrap();
        let matches =
            graphannis::aql::execute_query_on_graph(&written_graph, &parsed, false, None).unwrap();
        assert_eq!(expected_matches, matches.count());
    }
}
/// Round trip with qualified column names: the exporter writes namespaces
/// into the headers and puts `default_ns::text` first. The re-imported graph
/// must equal the original and contain the expected tokens and
/// `mynamespace:lb` spans.
#[test]
fn with_namespace() {
    const IMPORT_CONFIG: &str = r#"
column_map = {"default_ns::text" = ["mynamespace::lb"]}
"#;
    let first_config: ImportSpreadsheet = toml::from_str(IMPORT_CONFIG).unwrap();
    let exporter = ExportXlsx {
        include_namespace: true,
        annotation_order: vec![AnnoKey {
            ns: "default_ns".into(),
            name: "text".into(),
        }],
        ..Default::default()
    };
    let first_import = ImporterStep {
        module: ReadFrom::Xlsx(first_config),
        path: Path::new("./tests/data/import/xlsx/sample_sentence_with_namespace/").to_path_buf(),
        description: None,
        generic_config: None,
    };
    let mut original_updates = first_import.execute(None).unwrap();
    let mut original_graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
    original_graph
        .apply_update(&mut original_updates, |_| {})
        .unwrap();

    let tmp_outputdir = TempDir::new().unwrap();
    let output_dir = tmp_outputdir.path().join("sample_sentence_with_namespace");
    std::fs::create_dir(&output_dir).unwrap();
    let export_step = ExporterStep {
        module: WriteAs::Xlsx(exporter),
        path: output_dir.clone(),
        description: None,
        extension: None,
    };
    export_step.execute(&original_graph, None).unwrap();

    let second_config: ImportSpreadsheet = toml::from_str(IMPORT_CONFIG).unwrap();
    let second_import = ImporterStep {
        module: ReadFrom::Xlsx(second_config),
        path: output_dir.clone(),
        description: None,
        generic_config: None,
    };
    let mut written_updates = second_import.execute(None).unwrap();
    let mut written_graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
    written_graph
        .apply_update(&mut written_updates, |_| {})
        .unwrap();
    compare_graphs(&original_graph, &written_graph);

    let expectations = [
        ("tok", 11),
        ("mynamespace:lb=\"1\"", 1),
        ("mynamespace:lb=\"2\"", 1),
    ];
    for (aql_query, expected_matches) in expectations {
        let parsed = graphannis::aql::parse(aql_query, false).unwrap();
        let matches =
            graphannis::aql::execute_query_on_graph(&written_graph, &parsed, false, None).unwrap();
        assert_eq!(expected_matches, matches.count());
    }
}
/// Round trip including the meta sheet: after export and re-import the
/// document node must carry the `Author` and `Year` annotations, while the
/// header row of the meta sheet (`Name`) must not show up as an annotation.
#[test]
fn with_meta() {
    const IMPORT_CONFIG: &str = r#"
column_map = {"tok" = ["lb"]}
metasheet = "meta"
metasheet_skip_rows = 1
"#;
    let first_config: ImportSpreadsheet = toml::from_str(IMPORT_CONFIG).unwrap();
    let first_import = ImporterStep {
        module: ReadFrom::Xlsx(first_config),
        path: Path::new("./tests/data/import/xlsx/sample_sentence/").to_path_buf(),
        description: None,
        generic_config: None,
    };
    let mut original_updates = first_import.execute(None).unwrap();
    let mut original_graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
    original_graph
        .apply_update(&mut original_updates, |_| {})
        .unwrap();

    let tmp_outputdir = TempDir::new().unwrap();
    let output_dir = tmp_outputdir.path().join("sample_sentence");
    std::fs::create_dir(&output_dir).unwrap();
    let export_step = ExporterStep {
        module: WriteAs::Xlsx(ExportXlsx::default()),
        path: output_dir.clone(),
        description: None,
        extension: None,
    };
    export_step.execute(&original_graph, None).unwrap();

    let second_config: ImportSpreadsheet = toml::from_str(IMPORT_CONFIG).unwrap();
    let second_import = ImporterStep {
        module: ReadFrom::Xlsx(second_config),
        path: output_dir.clone(),
        description: None,
        generic_config: None,
    };
    let e = second_import.execute(None);
    assert!(e.is_ok(), "Error re-importing: {:?}", e.err().unwrap());
    let mut written_updates = e.unwrap();
    let mut written_graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
    written_graph
        .apply_update(&mut written_updates, |_| {})
        .unwrap();

    let expectations = [
        ("Author=\"Unknown\" _ident_ annis:doc", 1),
        ("Year=\"2024\" _ident_ annis:doc", 1),
        ("Name _ident_ annis:doc", 0),
    ];
    for (aql_query, expected_matches) in expectations {
        let parsed = graphannis::aql::parse(aql_query, false).unwrap();
        let matches =
            graphannis::aql::execute_query_on_graph(&written_graph, &parsed, false, None).unwrap();
        assert_eq!(expected_matches, matches.count());
    }
}
/// Copies the sample workbook into a fresh temporary directory as `doc1.xlsx`.
///
/// Returns the temporary directory (which must be kept alive by the caller),
/// the path of the copied file and the SHA-256 digest of its content in
/// formatted form.
fn create_corpus_folder_and_hash() -> (TempDir, PathBuf, String) {
    let corpus_dir = tempfile::TempDir::new().unwrap();
    let document_path = corpus_dir.path().join("doc1.xlsx");
    let fixture = Path::new("./tests/data/import/xlsx/sample_sentence/doc1.xlsx");
    std::fs::copy(fixture, &document_path).unwrap();
    let mut hasher = Sha256::new();
    let mut copied_document = File::open(&document_path).unwrap();
    std::io::copy(&mut copied_document, &mut hasher).unwrap();
    let hash_value = format!("{:02X?}", hasher.finalize());
    (corpus_dir, document_path, hash_value)
}
/// With `skip_unchanged_files = true` and an export that equals the existing
/// workbook, the file on disk must keep both its content hash and its
/// modification time.
#[test]
fn export_skips_unchanged() {
    const IMPORT_CONFIG: &str = r#"
column_map = {"tok" = ["lb"]}
metasheet = "meta"
metasheet_skip_rows = 1
"#;
    const EXPORT_CONFIG: &str = r#"
skip_unchanged_files = true
annotation_order = ["tok", "lb"]
"#;
    let (corpus_dir, document_path, original_hash) = create_corpus_folder_and_hash();
    let before = document_path.metadata().unwrap().modified().unwrap();

    let import_config: ImportSpreadsheet = toml::from_str(IMPORT_CONFIG).unwrap();
    let import_step = ImporterStep {
        module: ReadFrom::Xlsx(import_config),
        path: corpus_dir.path().to_path_buf(),
        description: None,
        generic_config: None,
    };
    let mut updates = import_step.execute(None).unwrap();
    let mut graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
    graph.apply_update(&mut updates, |_| {}).unwrap();

    let export_config: ExportXlsx = toml::from_str(EXPORT_CONFIG).unwrap();
    let export_step = ExporterStep {
        module: WriteAs::Xlsx(export_config),
        path: corpus_dir.path().to_path_buf(),
        description: None,
        extension: None,
    };
    export_step.execute(&graph, None).unwrap();

    let mut hasher = Sha256::new();
    let mut exported_document = File::open(&document_path).unwrap();
    std::io::copy(&mut exported_document, &mut hasher).unwrap();
    let hash_after_conversion = format!("{:02X?}", hasher.finalize());
    let after = document_path.metadata().unwrap().modified().unwrap();
    assert_eq!(original_hash, hash_after_conversion);
    assert_eq!(before, after);
}
/// With `skip_unchanged_files = true` but a different annotation order, the
/// export differs from the existing workbook, so the content hash of the file
/// on disk must change.
#[test]
fn export_overwrites_changed() {
    const IMPORT_CONFIG: &str = r#"
column_map = {"tok" = ["lb"]}
metasheet = "meta"
metasheet_skip_rows = 1
"#;
    const EXPORT_CONFIG: &str = r#"
skip_unchanged_files = true
annotation_order = ["lb", "tok"]
"#;
    let (corpus_dir, document_path, original_hash) = create_corpus_folder_and_hash();

    let import_config: ImportSpreadsheet = toml::from_str(IMPORT_CONFIG).unwrap();
    let import_step = ImporterStep {
        module: ReadFrom::Xlsx(import_config),
        path: corpus_dir.path().to_path_buf(),
        description: None,
        generic_config: None,
    };
    let mut updates = import_step.execute(None).unwrap();
    let mut graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
    graph.apply_update(&mut updates, |_| {}).unwrap();

    let export_config: ExportXlsx = toml::from_str(EXPORT_CONFIG).unwrap();
    let export_step = ExporterStep {
        module: WriteAs::Xlsx(export_config),
        path: corpus_dir.path().to_path_buf(),
        description: None,
        extension: None,
    };
    export_step.execute(&graph, None).unwrap();

    let mut hasher = Sha256::new();
    let mut exported_document = File::open(&document_path).unwrap();
    std::io::copy(&mut exported_document, &mut hasher).unwrap();
    let hash_after_conversion = format!("{:02X?}", hasher.finalize());
    assert_ne!(original_hash, hash_after_conversion);
}
/// With `skip_unchanged_files = false` the workbook is always rewritten, so
/// the modification time of the file on disk must change.
#[test]
fn export_overwrites_unchanged() {
    const IMPORT_CONFIG: &str = r#"
column_map = {"tok" = ["lb"]}
metasheet = "meta"
metasheet_skip_rows = 1
"#;
    const EXPORT_CONFIG: &str = r#"
skip_unchanged_files = false
annotation_order = ["tok", "lb"]
"#;
    let (corpus_dir, document_path, _) = create_corpus_folder_and_hash();
    let before = document_path.metadata().unwrap().modified().unwrap();

    let import_config: ImportSpreadsheet = toml::from_str(IMPORT_CONFIG).unwrap();
    let import_step = ImporterStep {
        module: ReadFrom::Xlsx(import_config),
        path: corpus_dir.path().to_path_buf(),
        description: Some("custom-xlsx-export-id".to_string()),
        generic_config: None,
    };
    let mut updates = import_step.execute(None).unwrap();
    let mut graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
    graph.apply_update(&mut updates, |_| {}).unwrap();

    let export_config: ExportXlsx = toml::from_str(EXPORT_CONFIG).unwrap();
    let export_step = ExporterStep {
        module: WriteAs::Xlsx(export_config),
        path: corpus_dir.path().to_path_buf(),
        description: None,
        extension: None,
    };
    export_step.execute(&graph, None).unwrap();

    let after = document_path.metadata().unwrap().modified().unwrap();
    assert_ne!(before, after);
}
// Exporting with `update_datasheet = 1` into a copy of an existing workbook:
// the first sheet is imported, the workbook copy is updated by the exporter,
// and afterwards the sheet found at index 0 must still contain its merged,
// styled cell.
#[test]
fn manipulate_sheet_in_existing_book() {
let test_path = Path::new("./tests/data/export/xlsx/existing/existing.xlsx");
// Import only the first sheet of the fixture workbook.
let importer: ImportSpreadsheet = toml::from_str(
r#"
column_map = { "tok" = ["sentence"] }
datasheet = 1
"#,
)
.unwrap();
// The exporter replaces the first sheet of an existing workbook.
let exporter: ExportXlsx = toml::from_str(
r#"
update_datasheet = 1
"#,
)
.unwrap();
let mut graph = AnnotationGraph::with_default_graphstorages(true).unwrap();
let mut update = importer
.import_corpus(
test_path.parent().unwrap(),
crate::StepID {
module_name: "test_import".to_string(),
path: None,
},
GenericImportConfiguration::new_with_default_extensions(&importer),
None,
)
.unwrap();
assert!(graph.apply_update(&mut update, |_| {}).is_ok());
// Work on a copy so the fixture itself is never modified.
let test_dir = tempdir().unwrap();
let test_target = &test_dir.path().join("existing.xlsx");
assert!(fs::copy(&test_path, &test_target).is_ok());
assert!(
exporter
.export_corpus(
&graph,
test_dir.path(),
crate::StepID {
module_name: "test_export".to_string(),
path: None
},
None
)
.is_ok()
);
// Re-read the updated workbook and inspect the sheet at index 0.
let wb = umya_spreadsheet::reader::xlsx::read(test_target);
assert!(wb.is_ok());
let book = wb.unwrap();
let sheet = book.get_sheet(&0).unwrap();
// Exactly one merged range is expected on that sheet.
let merge_cells = sheet.get_merge_cells();
assert_eq!(1, merge_cells.len());
let merge_cell = merge_cells.get(0);
assert!(merge_cell.is_some());
let merge_cell = merge_cell.unwrap();
assert_eq!("C4:E7", &merge_cell.get_range());
// The start cell of the merged range holds the value and the styling.
let c = merge_cell.get_coordinate_start_col().unwrap().get_num();
let r = merge_cell.get_coordinate_start_row().unwrap().get_num();
let cell = &sheet.get_cell((c, r)).unwrap();
assert_eq!("EXACTLY", cell.get_formatted_value());
let bg = cell.get_style().get_background_color().unwrap();
let fnt = cell.get_style().get_font().unwrap();
// Background colour, font colour and boldness are pinned by a snapshot.
assert_snapshot!(format!(
"{}\n{}\n{}",
bg.get_argb(),
fnt.get_color().get_argb(),
fnt.get_bold()
));
}
/// Exports a generated corpus with multiple segmentations plus one extra
/// `number` annotation on node `root/doc1#a1`, then checks that the written
/// workbook has no cell differences to the expected fixture.
#[test]
fn spans() {
    let g = AnnotationGraph::with_default_graphstorages(true);
    assert!(g.is_ok());
    let mut graph = g.unwrap();

    let mut u = GraphUpdate::default();
    example_generator::create_corpus_structure_simple(&mut u);
    example_generator::create_multiple_segmentations(&mut u, "root/doc1");
    let extra_label = UpdateEvent::AddNodeLabel {
        node_name: "root/doc1#a1".to_string(),
        anno_ns: "".to_string(),
        anno_name: "number".to_string(),
        anno_value: "1.".to_string(),
    };
    assert!(u.add_event(extra_label).is_ok());
    assert!(graph.apply_update(&mut u, |_| {}).is_ok());

    let exporter = ExportXlsx::default();
    let target_dir = tempdir().unwrap();
    let step_id = crate::StepID {
        module_name: "test_export".to_string(),
        path: None,
    };
    assert!(
        exporter
            .export_corpus(&graph, target_dir.path(), step_id, None)
            .is_ok()
    );

    let exported_file = target_dir.path().join("doc1.xlsx");
    assert!(
        sheets_diff::core::diff::Diff::new(
            "./tests/data/export/xlsx/span-target/doc1.xlsx",
            &exported_file.to_string_lossy()
        )
        .diff()
        .cell_diffs
        .is_empty()
    );
}
}