Skip to main content

annatto/importer/
table.rs

1use std::{
2    fs::File,
3    io::{BufRead, BufReader},
4    path::Path,
5};
6
7use csv::Reader;
8use facet::Facet;
9use graphannis::{
10    graph::AnnoKey,
11    model::{AnnotationComponent, AnnotationComponentType},
12    update::{GraphUpdate, UpdateEvent},
13};
14use graphannis_core::{graph::ANNIS_NS, util::split_qname};
15
16use serde::Serialize;
17use serde_derive::Deserialize;
18
19use super::Importer;
20use crate::{
21    StepID, importer::GenericImportConfiguration, progress::ProgressReporter,
22    util::graphupdate::import_corpus_graph_from_files,
23};
24
/// Configuration of the group nodes that are created for runs of table rows
/// separated by empty lines (see `ImportTable::empty_line_group`).
#[derive(Facet, Deserialize, Serialize, Clone, PartialEq)]
#[serde(deny_unknown_fields)]
struct EmptyLineGroup {
    /// Annotation key whose namespace and name are used for the label that is
    /// added to every group node.
    #[serde(with = "crate::estarde::anno_key")]
    anno: AnnoKey,
    /// Optional component (layer, type and name) of the edges that connect a
    /// group node with its tokens. If it is not given, `Coverage` edges in the
    /// `annis` layer with an empty component name are created.
    #[serde(default, with = "crate::estarde::annotation_component::as_option")]
    component: Option<AnnotationComponent>,
}
33
/// Import CSV files with token and token annotations.
///
/// Each row of the table is imported as one token and each column value is
/// added as an annotation of that token.
#[derive(Facet, Deserialize, Serialize, Clone, PartialEq)]
#[serde(deny_unknown_fields)]
pub struct ImportTable {
    /// The fully qualified annotation name for each column, in column order.
    ///
    /// If the list is empty (the default), the first row of the file is read
    /// as header row and provides the annotation names. If the list is not
    /// empty, the file is read without a header row, i.e. the first row is
    /// already imported as a token.
    #[serde(default)]
    column_names: Vec<String>,
    /// The provided character defines the column delimiter. The default value is tab.
    ///
    /// Example:
    /// ```toml
    /// [import.config]
    /// delimiter = ";"
    /// ```
    #[serde(default = "default_delimiter")]
    delimiter: char,
    /// The provided character is used as quote character when reading values.
    /// If nothing is provided, quoting is disabled and all columns are read
    /// as bare values.
    ///
    /// Example:
    /// ```toml
    /// [import.config]
    /// quote_char = "\""
    /// ```
    #[serde(default)]
    quote_char: Option<char>,
    /// If given, treat empty lines as separator for spans of token (e.g.
    /// sentences). You need to configure the name of the annotation to create
    /// (`anno`).
    ///
    /// Example:
    /// ```toml
    /// [import.config]
    /// empty_line_group = {anno="csv::sent_id"}
    /// ```
    /// The annotation value will be a sequential number.
    ///
    /// Per default, a span is created, but you can change the `component`,
    /// e.g. to one of the type dominance.
    ///
    /// ```toml
    /// [import.config]
    /// empty_line_group = {anno = "csv::sentence", component = {ctype="Dominance", layer="syntax", name="cat"}}
    /// ```
    ///
    #[serde(default)]
    empty_line_group: Option<EmptyLineGroup>,
    /// Null entries can be skipped by providing an explicit na-value.
    ///
    /// Example:
    /// ```toml
    /// [[import]]
    /// format = "table"
    /// path = "..."
    ///
    /// [import.config]
    /// na = "_"
    /// ```
    ///
    /// If provided, no annotation is created for any value that matches the
    /// `na` value after surrounding whitespace has been removed.
    #[serde(default)]
    na: Option<String>,
}
95
/// Returns the column delimiter that is used when the configuration does not
/// set one: the tab character.
fn default_delimiter() -> char {
    const TAB: char = '\t';
    TAB
}
99
100impl Default for ImportTable {
101    fn default() -> Self {
102        Self {
103            column_names: Default::default(),
104            quote_char: Default::default(),
105            delimiter: default_delimiter(),
106            empty_line_group: Default::default(),
107            na: None,
108        }
109    }
110}
111
/// File extensions reported by `default_file_extensions` of the table importer.
const FILE_ENDINGS: [&str; 4] = ["csv", "tsv", "tab", "txt"];
113
114impl Importer for ImportTable {
115    fn import_corpus(
116        &self,
117        input_path: &std::path::Path,
118        step_id: StepID,
119        config: GenericImportConfiguration,
120        tx: Option<crate::workflow::StatusSender>,
121    ) -> Result<graphannis::update::GraphUpdate, Box<dyn std::error::Error>> {
122        let mut update = GraphUpdate::default();
123        let paths_and_node_names =
124            import_corpus_graph_from_files(&mut update, input_path, &config)?;
125        let progress =
126            ProgressReporter::new(tx.clone(), step_id.clone(), paths_and_node_names.len())?;
127        for (pathbuf, doc_node_name) in paths_and_node_names {
128            self.import_document(&mut update, pathbuf.as_path(), doc_node_name)?;
129            progress.worked(1)?;
130        }
131        Ok(update)
132    }
133
134    fn default_file_extensions(&self) -> &[&str] {
135        &FILE_ENDINGS
136    }
137}
138impl ImportTable {
139    fn import_document(
140        &self,
141        update: &mut GraphUpdate,
142        document_path: &Path,
143        document_node_name: String,
144    ) -> Result<(), Box<dyn std::error::Error>> {
145        let mut reader_builder = csv::ReaderBuilder::new();
146        reader_builder
147            .delimiter(self.delimiter as u8)
148            .quoting(false)
149            .trim(csv::Trim::All)
150            .flexible(true);
151        if let Some(c) = &self.quote_char {
152            reader_builder.quoting(true).quote(*c as u8);
153        }
154        if self.column_names.is_empty() {
155            reader_builder.has_headers(true);
156        } else {
157            reader_builder.has_headers(false);
158        }
159        let reader = reader_builder.from_path(document_path)?;
160
161        self.map_token(update, &document_node_name, reader)?;
162
163        if let Some(empty_line_group) = &self.empty_line_group {
164            // Go trough the file and find empty lines
165            let f = File::open(document_path)?;
166            let buffered_reader = BufReader::new(f);
167
168            let mut empty_line_nr = 1;
169            let mut group_start_token: u64 = 1;
170            let mut next_token_idx = 1;
171            for line in buffered_reader.lines() {
172                let line = line?;
173
174                if line.trim_ascii().is_empty() {
175                    self.map_span(
176                        update,
177                        group_start_token,
178                        next_token_idx,
179                        empty_line_group,
180                        &document_node_name,
181                        empty_line_nr.to_string(),
182                    )?;
183                    empty_line_nr += 1;
184                    group_start_token = next_token_idx;
185                } else {
186                    // Token are only added for non-empty lines
187                    next_token_idx += 1;
188                }
189            }
190            if next_token_idx > group_start_token {
191                // Map the last group as well
192                self.map_span(
193                    update,
194                    group_start_token,
195                    next_token_idx,
196                    empty_line_group,
197                    &document_node_name,
198                    empty_line_nr.to_string(),
199                )?;
200            }
201        }
202
203        Ok(())
204    }
205
206    fn map_span(
207        &self,
208        update: &mut GraphUpdate,
209        group_start_token: u64,
210        next_token_idx: u64,
211        empty_line_group: &EmptyLineGroup,
212        document_node_name: &str,
213        value: String,
214    ) -> anyhow::Result<()> {
215        let group_span_name = format!(
216            "{document_node_name}#group_span_{group_start_token}_{}",
217            next_token_idx - 1
218        );
219
220        update.add_event(UpdateEvent::AddNode {
221            node_name: group_span_name.clone(),
222            node_type: "node".to_string(),
223        })?;
224        update.add_event(UpdateEvent::AddNodeLabel {
225            node_name: group_span_name.clone(),
226            anno_ns: empty_line_group.anno.ns.to_string(),
227            anno_name: empty_line_group.anno.name.to_string(),
228            anno_value: value,
229        })?;
230        update.add_event(UpdateEvent::AddEdge {
231            source_node: group_span_name.clone(),
232            target_node: document_node_name.to_string(),
233            layer: ANNIS_NS.to_string(),
234            component_type: AnnotationComponentType::PartOf.to_string(),
235            component_name: "".to_string(),
236        })?;
237        // Add spanning relations for all covered token
238        for t in group_start_token..next_token_idx {
239            if let Some(c) = &empty_line_group.component {
240                update.add_event(UpdateEvent::AddEdge {
241                    source_node: group_span_name.clone(),
242                    target_node: format!("{document_node_name}#t{t}"),
243                    layer: c.layer.to_string(),
244                    component_type: c.get_type().to_string(),
245                    component_name: c.name.to_string(),
246                })?;
247            } else {
248                update.add_event(UpdateEvent::AddEdge {
249                    source_node: group_span_name.clone(),
250                    target_node: format!("{document_node_name}#t{t}"),
251                    layer: ANNIS_NS.to_string(),
252                    component_type: AnnotationComponentType::Coverage.to_string(),
253                    component_name: "".to_string(),
254                })?;
255            }
256        }
257        Ok(())
258    }
259
260    fn map_token<R>(
261        &self,
262        update: &mut GraphUpdate,
263        document_node_name: &str,
264        mut reader: Reader<R>,
265    ) -> Result<(), Box<dyn std::error::Error>>
266    where
267        R: std::io::Read,
268    {
269        let column_names: Vec<_> = if reader.has_headers() {
270            reader.headers()?.iter().map(|h| h.to_string()).collect()
271        } else {
272            self.column_names.clone()
273        };
274
275        let mut token_idx = 1;
276
277        for record in reader.records() {
278            let record = record?;
279
280            // Add node for token
281            let node_name = format!("{document_node_name}#t{token_idx}");
282            update.add_event(UpdateEvent::AddNode {
283                node_name: node_name.clone(),
284                node_type: "node".to_string(),
285            })?;
286            update.add_event(UpdateEvent::AddEdge {
287                source_node: node_name.clone(),
288                target_node: document_node_name.to_string(),
289                layer: ANNIS_NS.to_string(),
290                component_type: AnnotationComponentType::PartOf.to_string(),
291                component_name: "".to_string(),
292            })?;
293            if token_idx > 0 {
294                let last_token_node_name = format!("{document_node_name}#t{}", token_idx - 1);
295                update.add_event(UpdateEvent::AddEdge {
296                    source_node: last_token_node_name.clone(),
297                    target_node: node_name.clone(),
298                    layer: ANNIS_NS.to_string(),
299                    component_type: AnnotationComponentType::Ordering.to_string(),
300                    component_name: "".to_string(),
301                })?;
302            }
303
304            // Add all columns as token annotations
305            for (i, name) in column_names.iter().enumerate() {
306                if let Some(val) = record.get(i) {
307                    if let Some(na_val) = &self.na
308                        && na_val == val.trim()
309                    {
310                        continue;
311                    }
312                    let (ns, name) = split_qname(name);
313                    update.add_event(UpdateEvent::AddNodeLabel {
314                        node_name: node_name.clone(),
315                        anno_ns: ns.unwrap_or_default().to_string(),
316                        anno_name: name.to_string(),
317                        anno_value: val.to_string(),
318                    })?;
319                }
320            }
321            token_idx += 1;
322        }
323
324        Ok(())
325    }
326}
327
328#[cfg(test)]
329mod tests;