Skip to main content

annatto/importer/
file_nodes.rs

1use crate::{error::AnnattoError, importer::GenericImportConfiguration};
2
3use super::Importer;
4use facet::Facet;
5use graphannis::{
6    model::AnnotationComponentType,
7    update::{GraphUpdate, UpdateEvent},
8};
9use graphannis_core::graph::ANNIS_NS;
10use normpath::PathExt;
11use serde::Serialize;
12use serde_derive::Deserialize;
13
/// Add file nodes for all files in the imported directory.
#[derive(Facet, Deserialize, Default, Serialize, Clone, PartialEq)]
#[serde(deny_unknown_fields)]
pub struct CreateFileNodes {
    /// The name of the corpus root node. If set, a corpus node with this name
    /// is created and every file node is linked to it with a `PartOf` edge.
    /// If unset, only the file nodes are created.
    #[serde(default)]
    corpus_name: Option<String>,
}
22
23impl Importer for CreateFileNodes {
24    fn import_corpus(
25        &self,
26        input_path: &std::path::Path,
27        step_id: crate::StepID,
28        _config: GenericImportConfiguration,
29        _tx: Option<crate::workflow::StatusSender>,
30    ) -> Result<GraphUpdate, Box<dyn std::error::Error>> {
31        let mut update = GraphUpdate::default();
32        let base_dir = input_path.normalize()?;
33        if let Some(base_dir_name) = base_dir.file_name() {
34            let start_index = base_dir.as_path().to_string_lossy().len() - base_dir_name.len();
35            if let Some(link_target) = &self.corpus_name {
36                update.add_event(UpdateEvent::AddNode {
37                    node_name: link_target.to_string(),
38                    node_type: "corpus".to_string(),
39                })?;
40            }
41            for path_r in
42                glob::glob(format!("{}/**/*", base_dir.as_path().to_string_lossy()).as_str())?
43            {
44                let path = path_r?;
45                let node_name = path.to_string_lossy()[start_index..].to_string();
46                if path.is_file() {
47                    update.add_event(UpdateEvent::AddNode {
48                        node_name: node_name.to_string(),
49                        node_type: "file".to_string(),
50                    })?;
51                    update.add_event(UpdateEvent::AddNodeLabel {
52                        node_name: node_name.to_string(),
53                        anno_ns: ANNIS_NS.to_string(),
54                        anno_name: "file".to_string(),
55                        anno_value: node_name.to_string(),
56                    })?;
57                    if let Some(link_target) = &self.corpus_name {
58                        update.add_event(UpdateEvent::AddEdge {
59                            source_node: node_name,
60                            target_node: link_target.to_string(),
61                            layer: ANNIS_NS.to_string(),
62                            component_type: AnnotationComponentType::PartOf.to_string(),
63                            component_name: "".to_string(),
64                        })?;
65                    }
66                }
67            }
68            Ok(update)
69        } else {
70            Err(Box::new(AnnattoError::Import {
71                reason: "Could not determine base dir.".to_string(),
72                importer: step_id.module_name.to_string(),
73                path: input_path.to_path_buf(),
74            }))
75        }
76    }
77
78    fn default_file_extensions(&self) -> &[&str] {
79        &[]
80    }
81}
82
83#[cfg(test)]
84mod tests {
85    use std::path::PathBuf;
86
87    use graphannis::{
88        AnnotationGraph,
89        model::{AnnotationComponent, AnnotationComponentType},
90        update::{GraphUpdate, UpdateEvent},
91    };
92    use graphannis_core::graph::ANNIS_NS;
93    use insta::assert_snapshot;
94    use itertools::Itertools;
95
96    use crate::ImporterStep;
97
98    use super::CreateFileNodes;
99
100    #[test]
101    fn serialize() {
102        let module = CreateFileNodes::default();
103        let serialization = toml::to_string(&module);
104        assert!(
105            serialization.is_ok(),
106            "Serialization failed: {:?}",
107            serialization.err()
108        );
109        assert_snapshot!(serialization.unwrap());
110    }
111
112    #[test]
113    fn serialize_custom() {
114        let module = CreateFileNodes {
115            corpus_name: Some("BeMaTaC".to_string()),
116        };
117        let serialization = toml::to_string(&module);
118        assert!(
119            serialization.is_ok(),
120            "Serialization failed: {:?}",
121            serialization.err()
122        );
123        assert_snapshot!(serialization.unwrap());
124    }
125
126    #[test]
127    fn test_file_nodes_in_mem() {
128        let r = test(false);
129        assert!(r.is_ok(), "test ended with error: {:?}", r.err());
130    }
131
132    #[test]
133    fn test_files_nodes_on_disk() {
134        let r = test(true);
135        assert!(r.is_ok(), "test ended with error: {:?}", r.err());
136    }
137
138    fn test(on_disk: bool) -> Result<(), Box<dyn std::error::Error>> {
139        let mut expected_g = AnnotationGraph::with_default_graphstorages(on_disk)?;
140        let mut u = GraphUpdate::default();
141        u.add_event(UpdateEvent::AddNode {
142            node_name: "xlsx".to_string(),
143            node_type: "corpus".to_string(),
144        })?;
145        u.add_event(UpdateEvent::AddNode {
146            node_name: "xlsx/test_file.xlsx".to_string(),
147            node_type: "file".to_string(),
148        })?;
149        u.add_event(UpdateEvent::AddNodeLabel {
150            node_name: "xlsx/test_file.xlsx".to_string(),
151            anno_ns: ANNIS_NS.to_string(),
152            anno_name: "file".to_string(),
153            anno_value: "xlsx/test_file.xlsx".to_string(),
154        })?;
155        u.add_event(UpdateEvent::AddEdge {
156            source_node: "xlsx/test_file.xlsx".to_string(),
157            target_node: "xlsx".to_string(),
158            layer: ANNIS_NS.to_string(),
159            component_type: AnnotationComponentType::PartOf.to_string(),
160            component_name: "".to_string(),
161        })?;
162        u.add_event(UpdateEvent::AddEdge {
163            // dummy edge to pass model check
164            source_node: "xlsx/test_file.xlsx".to_string(),
165            target_node: "xlsx/test_file.xlsx".to_string(),
166            layer: ANNIS_NS.to_string(),
167            component_type: AnnotationComponentType::Ordering.to_string(),
168            component_name: "".to_string(),
169        })?;
170        let eur = expected_g.apply_update(&mut u, |_| {});
171        assert!(eur.is_ok()); // ordering component is missing, so this should be an error
172        let mut test_g = AnnotationGraph::with_default_graphstorages(on_disk)?;
173        let import = CreateFileNodes {
174            corpus_name: Some("xlsx".to_string()),
175        };
176        let step = ImporterStep {
177            module: crate::ReadFrom::Path(import),
178            path: PathBuf::from("tests/data/import/xlsx/clean/xlsx/"),
179            description: Some("Custom-id-import".to_string()),
180            generic_config: None,
181        };
182        let mut test_u = step.execute(None)?;
183        // add dummy node and dummy ordering edge to pass model checks when applying the update to the graph
184        test_u.add_event(UpdateEvent::AddNode {
185            node_name: "dummy_node".to_string(),
186            node_type: "node".to_string(),
187        })?;
188        test_u.add_event(UpdateEvent::AddEdge {
189            source_node: "dummy_node".to_string(),
190            target_node: "dummy_node".to_string(),
191            layer: ANNIS_NS.to_string(),
192            component_type: AnnotationComponentType::Ordering.to_string(),
193            component_name: "".to_string(),
194        })?;
195        // apply
196        let ur = test_g.apply_update(&mut test_u, |_| {});
197        assert!(ur.is_ok());
198        let expected_id = expected_g
199            .get_node_annos()
200            .get_node_id_from_name("xlsx/test_file.xlsx")?;
201        assert!(expected_id.is_some());
202        let test_id = test_g
203            .get_node_annos()
204            .get_node_id_from_name("xlsx/test_file.xlsx")?;
205        assert!(test_id.is_some());
206        assert_eq!(expected_id.unwrap(), test_id.unwrap());
207        let expected_matches = expected_g
208            .get_node_annos()
209            .exact_anno_search(
210                Some(ANNIS_NS),
211                "file",
212                graphannis_core::annostorage::ValueSearch::Any,
213            )
214            .collect_vec();
215        let test_matches = test_g
216            .get_node_annos()
217            .exact_anno_search(
218                Some(ANNIS_NS),
219                "file",
220                graphannis_core::annostorage::ValueSearch::Any,
221            )
222            .collect_vec();
223        assert_eq!(expected_matches.len(), test_matches.len());
224        for (me, mt) in expected_matches.into_iter().zip(test_matches) {
225            assert_eq!(me?, mt?);
226        }
227        let test_part_of_comp = test_g.get_graphstorage(&AnnotationComponent::new(
228            AnnotationComponentType::PartOf,
229            ANNIS_NS.into(),
230            "".into(),
231        ));
232        assert!(test_part_of_comp.is_some());
233        let test_root_node_id = test_g.get_node_annos().get_node_id_from_name("xlsx")?;
234        assert!(test_root_node_id.is_some());
235        let expected_part_of_comp = expected_g.get_graphstorage_as_ref(&AnnotationComponent::new(
236            AnnotationComponentType::PartOf,
237            ANNIS_NS.into(),
238            "".into(),
239        ));
240        assert!(expected_part_of_comp.is_some());
241        let expected_root_node_id = expected_g.get_node_annos().get_node_id_from_name("xlsx")?;
242        assert!(expected_root_node_id.is_some());
243        assert_eq!(
244            expected_part_of_comp
245                .unwrap()
246                .get_ingoing_edges(expected_root_node_id.unwrap())
247                .count(),
248            test_part_of_comp
249                .clone()
250                .unwrap()
251                .get_ingoing_edges(test_root_node_id.unwrap())
252                .count()
253        );
254        assert_eq!(
255            test_part_of_comp
256                .unwrap()
257                .get_ingoing_edges(test_root_node_id.unwrap())
258                .count(),
259            glob::glob("tests/data/import/xlsx/clean/xlsx/*.*")
260                .into_iter()
261                .count()
262        );
263        Ok(())
264    }
265}