1use crate::{error::AnnattoError, importer::GenericImportConfiguration};
2
3use super::Importer;
4use facet::Facet;
5use graphannis::{
6 model::AnnotationComponentType,
7 update::{GraphUpdate, UpdateEvent},
8};
9use graphannis_core::graph::ANNIS_NS;
10use normpath::PathExt;
11use serde::Serialize;
12use serde_derive::Deserialize;
13
14#[derive(Facet, Deserialize, Default, Serialize, Clone, PartialEq)]
16#[serde(deny_unknown_fields)]
17pub struct CreateFileNodes {
18 #[serde(default)]
20 corpus_name: Option<String>,
21}
22
23impl Importer for CreateFileNodes {
24 fn import_corpus(
25 &self,
26 input_path: &std::path::Path,
27 step_id: crate::StepID,
28 _config: GenericImportConfiguration,
29 _tx: Option<crate::workflow::StatusSender>,
30 ) -> Result<GraphUpdate, Box<dyn std::error::Error>> {
31 let mut update = GraphUpdate::default();
32 let base_dir = input_path.normalize()?;
33 if let Some(base_dir_name) = base_dir.file_name() {
34 let start_index = base_dir.as_path().to_string_lossy().len() - base_dir_name.len();
35 if let Some(link_target) = &self.corpus_name {
36 update.add_event(UpdateEvent::AddNode {
37 node_name: link_target.to_string(),
38 node_type: "corpus".to_string(),
39 })?;
40 }
41 for path_r in
42 glob::glob(format!("{}/**/*", base_dir.as_path().to_string_lossy()).as_str())?
43 {
44 let path = path_r?;
45 let node_name = path.to_string_lossy()[start_index..].to_string();
46 if path.is_file() {
47 update.add_event(UpdateEvent::AddNode {
48 node_name: node_name.to_string(),
49 node_type: "file".to_string(),
50 })?;
51 update.add_event(UpdateEvent::AddNodeLabel {
52 node_name: node_name.to_string(),
53 anno_ns: ANNIS_NS.to_string(),
54 anno_name: "file".to_string(),
55 anno_value: node_name.to_string(),
56 })?;
57 if let Some(link_target) = &self.corpus_name {
58 update.add_event(UpdateEvent::AddEdge {
59 source_node: node_name,
60 target_node: link_target.to_string(),
61 layer: ANNIS_NS.to_string(),
62 component_type: AnnotationComponentType::PartOf.to_string(),
63 component_name: "".to_string(),
64 })?;
65 }
66 }
67 }
68 Ok(update)
69 } else {
70 Err(Box::new(AnnattoError::Import {
71 reason: "Could not determine base dir.".to_string(),
72 importer: step_id.module_name.to_string(),
73 path: input_path.to_path_buf(),
74 }))
75 }
76 }
77
78 fn default_file_extensions(&self) -> &[&str] {
79 &[]
80 }
81}
82
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use graphannis::{
        AnnotationGraph,
        model::{AnnotationComponent, AnnotationComponentType},
        update::{GraphUpdate, UpdateEvent},
    };
    use graphannis_core::graph::ANNIS_NS;
    use insta::assert_snapshot;
    use itertools::Itertools;

    use crate::ImporterStep;

    use super::CreateFileNodes;

    /// The default configuration serializes to TOML and matches the stored snapshot.
    #[test]
    fn serialize() {
        let module = CreateFileNodes::default();
        let serialization = toml::to_string(&module);
        assert!(
            serialization.is_ok(),
            "Serialization failed: {:?}",
            serialization.err()
        );
        assert_snapshot!(serialization.unwrap());
    }

    /// A configuration with a corpus name serializes to TOML and matches the stored snapshot.
    #[test]
    fn serialize_custom() {
        let module = CreateFileNodes {
            corpus_name: Some("BeMaTaC".to_string()),
        };
        let serialization = toml::to_string(&module);
        assert!(
            serialization.is_ok(),
            "Serialization failed: {:?}",
            serialization.err()
        );
        assert_snapshot!(serialization.unwrap());
    }

    /// Runs the shared comparison with `on_disk = false`.
    #[test]
    fn test_file_nodes_in_mem() {
        let r = test(false);
        assert!(r.is_ok(), "test ended with error: {:?}", r.err());
    }

    /// Runs the shared comparison with `on_disk = true`.
    #[test]
    fn test_files_nodes_on_disk() {
        let r = test(true);
        assert!(r.is_ok(), "test ended with error: {:?}", r.err());
    }

    /// Imports the xlsx test directory with `CreateFileNodes` and compares the
    /// resulting graph against a hand-built expected graph: node ids of the
    /// file node, `file` annotations, and the number of `PartOf` edges
    /// pointing to the corpus node.
    fn test(on_disk: bool) -> Result<(), Box<dyn std::error::Error>> {
        // Build the expected graph by hand.
        let mut expected_g = AnnotationGraph::with_default_graphstorages(on_disk)?;
        let mut u = GraphUpdate::default();
        u.add_event(UpdateEvent::AddNode {
            node_name: "xlsx".to_string(),
            node_type: "corpus".to_string(),
        })?;
        u.add_event(UpdateEvent::AddNode {
            node_name: "xlsx/test_file.xlsx".to_string(),
            node_type: "file".to_string(),
        })?;
        u.add_event(UpdateEvent::AddNodeLabel {
            node_name: "xlsx/test_file.xlsx".to_string(),
            anno_ns: ANNIS_NS.to_string(),
            anno_name: "file".to_string(),
            anno_value: "xlsx/test_file.xlsx".to_string(),
        })?;
        u.add_event(UpdateEvent::AddEdge {
            source_node: "xlsx/test_file.xlsx".to_string(),
            target_node: "xlsx".to_string(),
            layer: ANNIS_NS.to_string(),
            component_type: AnnotationComponentType::PartOf.to_string(),
            component_name: "".to_string(),
        })?;
        // NOTE(review): the Ordering self-edge here (and on `dummy_node` below)
        // presumably only ensures an Ordering component exists in both graphs
        // -- confirm.
        u.add_event(UpdateEvent::AddEdge {
            source_node: "xlsx/test_file.xlsx".to_string(),
            target_node: "xlsx/test_file.xlsx".to_string(),
            layer: ANNIS_NS.to_string(),
            component_type: AnnotationComponentType::Ordering.to_string(),
            component_name: "".to_string(),
        })?;
        let eur = expected_g.apply_update(&mut u, |_| {});
        assert!(eur.is_ok());

        // Build the actual graph by running the importer.
        let mut test_g = AnnotationGraph::with_default_graphstorages(on_disk)?;
        let import = CreateFileNodes {
            corpus_name: Some("xlsx".to_string()),
        };
        let step = ImporterStep {
            module: crate::ReadFrom::Path(import),
            path: PathBuf::from("tests/data/import/xlsx/clean/xlsx/"),
            description: Some("Custom-id-import".to_string()),
            generic_config: None,
        };
        let mut test_u = step.execute(None)?;
        test_u.add_event(UpdateEvent::AddNode {
            node_name: "dummy_node".to_string(),
            node_type: "node".to_string(),
        })?;
        test_u.add_event(UpdateEvent::AddEdge {
            source_node: "dummy_node".to_string(),
            target_node: "dummy_node".to_string(),
            layer: ANNIS_NS.to_string(),
            component_type: AnnotationComponentType::Ordering.to_string(),
            component_name: "".to_string(),
        })?;
        let ur = test_g.apply_update(&mut test_u, |_| {});
        assert!(ur.is_ok());

        // The file node must exist in both graphs with the same id.
        let expected_id = expected_g
            .get_node_annos()
            .get_node_id_from_name("xlsx/test_file.xlsx")?;
        assert!(expected_id.is_some());
        let test_id = test_g
            .get_node_annos()
            .get_node_id_from_name("xlsx/test_file.xlsx")?;
        assert!(test_id.is_some());
        assert_eq!(expected_id.unwrap(), test_id.unwrap());

        // The `file` annotations must match pairwise.
        let expected_matches = expected_g
            .get_node_annos()
            .exact_anno_search(
                Some(ANNIS_NS),
                "file",
                graphannis_core::annostorage::ValueSearch::Any,
            )
            .collect_vec();
        let test_matches = test_g
            .get_node_annos()
            .exact_anno_search(
                Some(ANNIS_NS),
                "file",
                graphannis_core::annostorage::ValueSearch::Any,
            )
            .collect_vec();
        assert_eq!(expected_matches.len(), test_matches.len());
        for (me, mt) in expected_matches.into_iter().zip(test_matches) {
            assert_eq!(me?, mt?);
        }

        // Compare the number of PartOf edges pointing to the corpus node.
        let test_part_of_comp = test_g.get_graphstorage(&AnnotationComponent::new(
            AnnotationComponentType::PartOf,
            ANNIS_NS.into(),
            "".into(),
        ));
        assert!(test_part_of_comp.is_some());
        let test_root_node_id = test_g.get_node_annos().get_node_id_from_name("xlsx")?;
        assert!(test_root_node_id.is_some());
        let expected_part_of_comp = expected_g.get_graphstorage_as_ref(&AnnotationComponent::new(
            AnnotationComponentType::PartOf,
            ANNIS_NS.into(),
            "".into(),
        ));
        assert!(expected_part_of_comp.is_some());
        let expected_root_node_id = expected_g.get_node_annos().get_node_id_from_name("xlsx")?;
        assert!(expected_root_node_id.is_some());
        let expected_edge_count = expected_part_of_comp
            .unwrap()
            .get_ingoing_edges(expected_root_node_id.unwrap())
            .count();
        let test_edge_count = test_part_of_comp
            .unwrap()
            .get_ingoing_edges(test_root_node_id.unwrap())
            .count();
        assert_eq!(expected_edge_count, test_edge_count);

        // Count the files actually present in the input directory. The glob
        // `Result` must be unwrapped first: iterating the `Result` itself
        // yields at most one item (the `Paths` iterator), not the matched
        // paths.
        let matched_paths = glob::glob("tests/data/import/xlsx/clean/xlsx/*.*")?
            .collect::<Result<Vec<_>, _>>()?;
        let file_count = matched_paths.iter().filter(|p| p.is_file()).count();
        assert_eq!(test_edge_count, file_count);
        Ok(())
    }
}