use crate::{error::AnnattoError, importer::GenericImportConfiguration};
use super::Importer;
use facet::Facet;
use graphannis::{
model::AnnotationComponentType,
update::{GraphUpdate, UpdateEvent},
};
use graphannis_core::graph::ANNIS_NS;
use normpath::PathExt;
use serde::Serialize;
use serde_derive::Deserialize;
/// Importer configuration that adds a node of type `file` for each file found
/// below the import path.
///
/// Unknown fields are rejected when this configuration is deserialized.
#[derive(Facet, Deserialize, Default, Serialize, Clone, PartialEq)]
#[serde(deny_unknown_fields)]
pub struct CreateFileNodes {
/// Optional name of a corpus node. When set, the importer creates a node of type
/// `corpus` with this name and links every file node to it with a `PartOf` edge.
/// Defaults to `None` when the field is missing.
#[serde(default)]
corpus_name: Option<String>,
}
impl Importer for CreateFileNodes {
    /// Builds the graph updates that register every regular file below `input_path`
    /// as a node of type `file`.
    ///
    /// The input path is normalized first. Node names are the file paths relative to
    /// the parent of the normalized directory, so every name starts with the
    /// directory's own name. Each file node also receives a `file` label in the
    /// `ANNIS_NS` namespace whose value equals the node name. If `corpus_name` is
    /// configured, a node of type `corpus` with that name is added and each file
    /// node is connected to it by a `PartOf` edge.
    ///
    /// `_config` and `_tx` are not used by this importer.
    ///
    /// # Errors
    ///
    /// Returns an error if the input path cannot be normalized, if the normalized
    /// path has no final component, if the glob pattern is rejected or a directory
    /// entry cannot be read, if a matched path does not lie below the base
    /// directory's parent, or if an event cannot be added to the update.
    fn import_corpus(
        &self,
        input_path: &std::path::Path,
        step_id: crate::StepID,
        _config: GenericImportConfiguration,
        _tx: Option<crate::workflow::StatusSender>,
    ) -> Result<GraphUpdate, Box<dyn std::error::Error>> {
        let mut update = GraphUpdate::default();
        let base_dir = input_path.normalize()?;
        let base_path = base_dir.as_path();
        // A base path without a final component cannot name the corpus directory;
        // the parent is only consulted when such a component exists.
        let Some(parent_dir) = base_path.file_name().and(base_path.parent()) else {
            return Err(Box::new(AnnattoError::Import {
                reason: "Could not determine base dir.".to_string(),
                importer: step_id.module_name.to_string(),
                path: input_path.to_path_buf(),
            }));
        };
        if let Some(link_target) = &self.corpus_name {
            update.add_event(UpdateEvent::AddNode {
                node_name: link_target.to_string(),
                node_type: "corpus".to_string(),
            })?;
        }
        // NOTE(review): the base path is inserted into the glob pattern verbatim, so
        // glob metacharacters (`[`, `]`, `*`, `?`) inside the directory path are
        // interpreted as pattern syntax. Escaping them needs care on platforms whose
        // path prefixes may contain such characters, so this is left as-is.
        let pattern = format!("{}/**/*", base_path.to_string_lossy());
        for entry in glob::glob(&pattern)? {
            let path = entry?;
            if !path.is_file() {
                continue;
            }
            // Strip the prefix component-wise instead of slicing the lossy string by
            // a byte offset: offsets computed from `OsStr` and from a lossily
            // converted string can disagree and make the slice panic.
            let node_name = path
                .strip_prefix(parent_dir)
                .map_err(|_| AnnattoError::Import {
                    reason: "File is not located below the base dir.".to_string(),
                    importer: step_id.module_name.to_string(),
                    path: path.clone(),
                })?
                .to_string_lossy()
                .into_owned();
            update.add_event(UpdateEvent::AddNode {
                node_name: node_name.clone(),
                node_type: "file".to_string(),
            })?;
            update.add_event(UpdateEvent::AddNodeLabel {
                node_name: node_name.clone(),
                anno_ns: ANNIS_NS.to_string(),
                anno_name: "file".to_string(),
                anno_value: node_name.clone(),
            })?;
            if let Some(link_target) = &self.corpus_name {
                update.add_event(UpdateEvent::AddEdge {
                    source_node: node_name,
                    target_node: link_target.to_string(),
                    layer: ANNIS_NS.to_string(),
                    component_type: AnnotationComponentType::PartOf.to_string(),
                    component_name: "".to_string(),
                })?;
            }
        }
        Ok(update)
    }

    /// Returns an empty list: this importer declares no default file extensions.
    fn default_file_extensions(&self) -> &[&str] {
        &[]
    }
}
#[cfg(test)]
mod tests {
use std::path::PathBuf;
use graphannis::{
AnnotationGraph,
model::{AnnotationComponent, AnnotationComponentType},
update::{GraphUpdate, UpdateEvent},
};
use graphannis_core::graph::ANNIS_NS;
use insta::assert_snapshot;
use itertools::Itertools;
use crate::ImporterStep;
use super::CreateFileNodes;
/// Snapshot test: a default-constructed `CreateFileNodes` serializes to TOML.
#[test]
fn serialize() {
    let serialization = toml::to_string(&CreateFileNodes::default());
    // Report the serializer error before the snapshot comparison is attempted.
    assert!(
        serialization.is_ok(),
        "Serialization failed: {:?}",
        serialization.err()
    );
    assert_snapshot!(serialization.unwrap());
}
/// Snapshot test: a `CreateFileNodes` with an explicit corpus name serializes to TOML.
#[test]
fn serialize_custom() {
    let corpus_name = Some(String::from("BeMaTaC"));
    let serialization = toml::to_string(&CreateFileNodes { corpus_name });
    // Report the serializer error before the snapshot comparison is attempted.
    assert!(
        serialization.is_ok(),
        "Serialization failed: {:?}",
        serialization.err()
    );
    assert_snapshot!(serialization.unwrap());
}
/// Runs the shared importer check with `on_disk` set to `false`.
#[test]
fn test_file_nodes_in_mem() {
    let outcome = test(false);
    assert!(
        outcome.is_ok(),
        "test ended with error: {:?}",
        outcome.err()
    );
}
/// Runs the shared importer check with `on_disk` set to `true`.
#[test]
fn test_files_nodes_on_disk() {
    let outcome = test(true);
    assert!(
        outcome.is_ok(),
        "test ended with error: {:?}",
        outcome.err()
    );
}
/// Shared test body: runs `CreateFileNodes` on the xlsx fixture directory and compares
/// the resulting graph with a hand-built expected graph.
///
/// `on_disk` is forwarded to `AnnotationGraph::with_default_graphstorages` for both
/// the expected graph and the graph produced by the importer.
///
/// Returns an error if building or querying either graph fails, or if the fixture
/// directory cannot be listed; mismatches are reported through assertions.
fn test(on_disk: bool) -> Result<(), Box<dyn std::error::Error>> {
    // Expected graph: a corpus node, one file node carrying its `file` label, and a
    // PartOf edge from the file node to the corpus node.
    let mut expected_g = AnnotationGraph::with_default_graphstorages(on_disk)?;
    let mut u = GraphUpdate::default();
    u.add_event(UpdateEvent::AddNode {
        node_name: "xlsx".to_string(),
        node_type: "corpus".to_string(),
    })?;
    u.add_event(UpdateEvent::AddNode {
        node_name: "xlsx/test_file.xlsx".to_string(),
        node_type: "file".to_string(),
    })?;
    u.add_event(UpdateEvent::AddNodeLabel {
        node_name: "xlsx/test_file.xlsx".to_string(),
        anno_ns: ANNIS_NS.to_string(),
        anno_name: "file".to_string(),
        anno_value: "xlsx/test_file.xlsx".to_string(),
    })?;
    u.add_event(UpdateEvent::AddEdge {
        source_node: "xlsx/test_file.xlsx".to_string(),
        target_node: "xlsx".to_string(),
        layer: ANNIS_NS.to_string(),
        component_type: AnnotationComponentType::PartOf.to_string(),
        component_name: "".to_string(),
    })?;
    // NOTE(review): both graphs also receive a self-referencing Ordering edge; the
    // reason is not visible here - presumably `apply_update` needs an Ordering
    // component to exist. Confirm before removing.
    u.add_event(UpdateEvent::AddEdge {
        source_node: "xlsx/test_file.xlsx".to_string(),
        target_node: "xlsx/test_file.xlsx".to_string(),
        layer: ANNIS_NS.to_string(),
        component_type: AnnotationComponentType::Ordering.to_string(),
        component_name: "".to_string(),
    })?;
    let eur = expected_g.apply_update(&mut u, |_| {});
    assert!(eur.is_ok());

    // Graph under test: execute the importer as a workflow step on the fixture.
    let mut test_g = AnnotationGraph::with_default_graphstorages(on_disk)?;
    let import = CreateFileNodes {
        corpus_name: Some("xlsx".to_string()),
    };
    let step = ImporterStep {
        module: crate::ReadFrom::Path(import),
        path: PathBuf::from("tests/data/import/xlsx/clean/xlsx/"),
        description: Some("Custom-id-import".to_string()),
        generic_config: None,
    };
    let mut test_u = step.execute(None)?;
    test_u.add_event(UpdateEvent::AddNode {
        node_name: "dummy_node".to_string(),
        node_type: "node".to_string(),
    })?;
    test_u.add_event(UpdateEvent::AddEdge {
        source_node: "dummy_node".to_string(),
        target_node: "dummy_node".to_string(),
        layer: ANNIS_NS.to_string(),
        component_type: AnnotationComponentType::Ordering.to_string(),
        component_name: "".to_string(),
    })?;
    let ur = test_g.apply_update(&mut test_u, |_| {});
    assert!(ur.is_ok());

    // The file node must exist in both graphs under the same internal id.
    let expected_id = expected_g
        .get_node_annos()
        .get_node_id_from_name("xlsx/test_file.xlsx")?;
    assert!(expected_id.is_some());
    let test_id = test_g
        .get_node_annos()
        .get_node_id_from_name("xlsx/test_file.xlsx")?;
    assert!(test_id.is_some());
    assert_eq!(expected_id.unwrap(), test_id.unwrap());

    // The `file` annotations must match pairwise.
    let expected_matches = expected_g
        .get_node_annos()
        .exact_anno_search(
            Some(ANNIS_NS),
            "file",
            graphannis_core::annostorage::ValueSearch::Any,
        )
        .collect_vec();
    let test_matches = test_g
        .get_node_annos()
        .exact_anno_search(
            Some(ANNIS_NS),
            "file",
            graphannis_core::annostorage::ValueSearch::Any,
        )
        .collect_vec();
    assert_eq!(expected_matches.len(), test_matches.len());
    for (me, mt) in expected_matches.into_iter().zip(test_matches) {
        assert_eq!(me?, mt?);
    }

    // Compare the number of PartOf edges that point at the corpus node.
    let test_part_of_comp = test_g.get_graphstorage(&AnnotationComponent::new(
        AnnotationComponentType::PartOf,
        ANNIS_NS.into(),
        "".into(),
    ));
    assert!(test_part_of_comp.is_some());
    let test_root_node_id = test_g.get_node_annos().get_node_id_from_name("xlsx")?;
    assert!(test_root_node_id.is_some());
    let expected_part_of_comp = expected_g.get_graphstorage_as_ref(&AnnotationComponent::new(
        AnnotationComponentType::PartOf,
        ANNIS_NS.into(),
        "".into(),
    ));
    assert!(expected_part_of_comp.is_some());
    let expected_root_node_id = expected_g.get_node_annos().get_node_id_from_name("xlsx")?;
    assert!(expected_root_node_id.is_some());
    let expected_edge_count = expected_part_of_comp
        .unwrap()
        .get_ingoing_edges(expected_root_node_id.unwrap())
        .count();
    let test_edge_count = test_part_of_comp
        .unwrap()
        .get_ingoing_edges(test_root_node_id.unwrap())
        .count();
    assert_eq!(expected_edge_count, test_edge_count);

    // `glob::glob` returns a `Result`; it has to be unwrapped before counting.
    // Iterating the `Result` itself yields at most one item, which would make this
    // comparison independent of the actual number of fixture files.
    let fixture_file_count = glob::glob("tests/data/import/xlsx/clean/xlsx/*.*")?
        .collect::<Result<Vec<_>, _>>()?
        .len();
    assert_eq!(test_edge_count, fixture_file_count);
    Ok(())
}
}