1use std::{
2 fs::File,
3 io::{BufRead, BufReader},
4 path::Path,
5};
6
7use csv::Reader;
8use facet::Facet;
9use graphannis::{
10 graph::AnnoKey,
11 model::{AnnotationComponent, AnnotationComponentType},
12 update::{GraphUpdate, UpdateEvent},
13};
14use graphannis_core::{graph::ANNIS_NS, util::split_qname};
15
16use serde::Serialize;
17use serde_derive::Deserialize;
18
19use super::Importer;
20use crate::{
21 StepID, importer::GenericImportConfiguration, progress::ProgressReporter,
22 util::graphupdate::import_corpus_graph_from_files,
23};
24
/// Configuration for grouping tokens that are separated by empty lines.
///
/// For every group a span node is created. The span carries an annotation
/// with the key given in `anno`; its value is a running number (as text)
/// that starts at 1 and is increased for each empty line that was found.
#[derive(Facet, Deserialize, Serialize, Clone, PartialEq)]
#[serde(deny_unknown_fields)]
struct EmptyLineGroup {
    /// Namespace and name of the annotation that is added to each span node.
    #[serde(with = "crate::estarde::anno_key")]
    anno: AnnoKey,
    /// Component used for the edges from the span node to the tokens of the
    /// group. If omitted, `Coverage` edges in the `ANNIS_NS` layer with an
    /// empty component name are created instead.
    #[serde(default, with = "crate::estarde::annotation_component::as_option")]
    component: Option<AnnotationComponent>,
}
33
/// Importer for delimiter-separated text tables.
///
/// Each record of a file becomes one token node of the document and each
/// column value becomes an annotation (node label) of that token.
#[derive(Facet, Deserialize, Serialize, Clone, PartialEq)]
#[serde(deny_unknown_fields)]
pub struct ImportTable {
    /// Names of the columns, used as annotation names. Each entry is passed
    /// through `split_qname` to obtain an optional namespace and the name.
    /// If the list is empty, the first row of each file is read as the header
    /// and provides the column names.
    #[serde(default)]
    column_names: Vec<String>,
    /// Character that separates the fields of a record. Defaults to a tab.
    #[serde(default = "default_delimiter")]
    delimiter: char,
    /// Character used to quote fields. Quoting is only enabled when a value
    /// is given here; otherwise it is switched off.
    #[serde(default)]
    quote_char: Option<char>,
    /// If given, lines that are empty (after trimming ASCII whitespace)
    /// separate groups of tokens and a span node is created for each group.
    #[serde(default)]
    empty_line_group: Option<EmptyLineGroup>,
    /// If given, cells whose trimmed content equals this string do not
    /// produce an annotation.
    #[serde(default)]
    na: Option<String>,
}
95
/// Serde default for the `delimiter` field of `ImportTable`: a tab character.
fn default_delimiter() -> char {
    const TAB: char = '\t';
    TAB
}
99
100impl Default for ImportTable {
101 fn default() -> Self {
102 Self {
103 column_names: Default::default(),
104 quote_char: Default::default(),
105 delimiter: default_delimiter(),
106 empty_line_group: Default::default(),
107 na: None,
108 }
109 }
110}
111
/// File extensions reported by `default_file_extensions` for this importer.
const FILE_ENDINGS: [&str; 4] = ["csv", "tsv", "tab", "txt"];
113
114impl Importer for ImportTable {
115 fn import_corpus(
116 &self,
117 input_path: &std::path::Path,
118 step_id: StepID,
119 config: GenericImportConfiguration,
120 tx: Option<crate::workflow::StatusSender>,
121 ) -> Result<graphannis::update::GraphUpdate, Box<dyn std::error::Error>> {
122 let mut update = GraphUpdate::default();
123 let paths_and_node_names =
124 import_corpus_graph_from_files(&mut update, input_path, &config)?;
125 let progress =
126 ProgressReporter::new(tx.clone(), step_id.clone(), paths_and_node_names.len())?;
127 for (pathbuf, doc_node_name) in paths_and_node_names {
128 self.import_document(&mut update, pathbuf.as_path(), doc_node_name)?;
129 progress.worked(1)?;
130 }
131 Ok(update)
132 }
133
134 fn default_file_extensions(&self) -> &[&str] {
135 &FILE_ENDINGS
136 }
137}
138impl ImportTable {
139 fn import_document(
140 &self,
141 update: &mut GraphUpdate,
142 document_path: &Path,
143 document_node_name: String,
144 ) -> Result<(), Box<dyn std::error::Error>> {
145 let mut reader_builder = csv::ReaderBuilder::new();
146 reader_builder
147 .delimiter(self.delimiter as u8)
148 .quoting(false)
149 .trim(csv::Trim::All)
150 .flexible(true);
151 if let Some(c) = &self.quote_char {
152 reader_builder.quoting(true).quote(*c as u8);
153 }
154 if self.column_names.is_empty() {
155 reader_builder.has_headers(true);
156 } else {
157 reader_builder.has_headers(false);
158 }
159 let reader = reader_builder.from_path(document_path)?;
160
161 self.map_token(update, &document_node_name, reader)?;
162
163 if let Some(empty_line_group) = &self.empty_line_group {
164 let f = File::open(document_path)?;
166 let buffered_reader = BufReader::new(f);
167
168 let mut empty_line_nr = 1;
169 let mut group_start_token: u64 = 1;
170 let mut next_token_idx = 1;
171 for line in buffered_reader.lines() {
172 let line = line?;
173
174 if line.trim_ascii().is_empty() {
175 self.map_span(
176 update,
177 group_start_token,
178 next_token_idx,
179 empty_line_group,
180 &document_node_name,
181 empty_line_nr.to_string(),
182 )?;
183 empty_line_nr += 1;
184 group_start_token = next_token_idx;
185 } else {
186 next_token_idx += 1;
188 }
189 }
190 if next_token_idx > group_start_token {
191 self.map_span(
193 update,
194 group_start_token,
195 next_token_idx,
196 empty_line_group,
197 &document_node_name,
198 empty_line_nr.to_string(),
199 )?;
200 }
201 }
202
203 Ok(())
204 }
205
206 fn map_span(
207 &self,
208 update: &mut GraphUpdate,
209 group_start_token: u64,
210 next_token_idx: u64,
211 empty_line_group: &EmptyLineGroup,
212 document_node_name: &str,
213 value: String,
214 ) -> anyhow::Result<()> {
215 let group_span_name = format!(
216 "{document_node_name}#group_span_{group_start_token}_{}",
217 next_token_idx - 1
218 );
219
220 update.add_event(UpdateEvent::AddNode {
221 node_name: group_span_name.clone(),
222 node_type: "node".to_string(),
223 })?;
224 update.add_event(UpdateEvent::AddNodeLabel {
225 node_name: group_span_name.clone(),
226 anno_ns: empty_line_group.anno.ns.to_string(),
227 anno_name: empty_line_group.anno.name.to_string(),
228 anno_value: value,
229 })?;
230 update.add_event(UpdateEvent::AddEdge {
231 source_node: group_span_name.clone(),
232 target_node: document_node_name.to_string(),
233 layer: ANNIS_NS.to_string(),
234 component_type: AnnotationComponentType::PartOf.to_string(),
235 component_name: "".to_string(),
236 })?;
237 for t in group_start_token..next_token_idx {
239 if let Some(c) = &empty_line_group.component {
240 update.add_event(UpdateEvent::AddEdge {
241 source_node: group_span_name.clone(),
242 target_node: format!("{document_node_name}#t{t}"),
243 layer: c.layer.to_string(),
244 component_type: c.get_type().to_string(),
245 component_name: c.name.to_string(),
246 })?;
247 } else {
248 update.add_event(UpdateEvent::AddEdge {
249 source_node: group_span_name.clone(),
250 target_node: format!("{document_node_name}#t{t}"),
251 layer: ANNIS_NS.to_string(),
252 component_type: AnnotationComponentType::Coverage.to_string(),
253 component_name: "".to_string(),
254 })?;
255 }
256 }
257 Ok(())
258 }
259
260 fn map_token<R>(
261 &self,
262 update: &mut GraphUpdate,
263 document_node_name: &str,
264 mut reader: Reader<R>,
265 ) -> Result<(), Box<dyn std::error::Error>>
266 where
267 R: std::io::Read,
268 {
269 let column_names: Vec<_> = if reader.has_headers() {
270 reader.headers()?.iter().map(|h| h.to_string()).collect()
271 } else {
272 self.column_names.clone()
273 };
274
275 let mut token_idx = 1;
276
277 for record in reader.records() {
278 let record = record?;
279
280 let node_name = format!("{document_node_name}#t{token_idx}");
282 update.add_event(UpdateEvent::AddNode {
283 node_name: node_name.clone(),
284 node_type: "node".to_string(),
285 })?;
286 update.add_event(UpdateEvent::AddEdge {
287 source_node: node_name.clone(),
288 target_node: document_node_name.to_string(),
289 layer: ANNIS_NS.to_string(),
290 component_type: AnnotationComponentType::PartOf.to_string(),
291 component_name: "".to_string(),
292 })?;
293 if token_idx > 0 {
294 let last_token_node_name = format!("{document_node_name}#t{}", token_idx - 1);
295 update.add_event(UpdateEvent::AddEdge {
296 source_node: last_token_node_name.clone(),
297 target_node: node_name.clone(),
298 layer: ANNIS_NS.to_string(),
299 component_type: AnnotationComponentType::Ordering.to_string(),
300 component_name: "".to_string(),
301 })?;
302 }
303
304 for (i, name) in column_names.iter().enumerate() {
306 if let Some(val) = record.get(i) {
307 if let Some(na_val) = &self.na
308 && na_val == val.trim()
309 {
310 continue;
311 }
312 let (ns, name) = split_qname(name);
313 update.add_event(UpdateEvent::AddNodeLabel {
314 node_name: node_name.clone(),
315 anno_ns: ns.unwrap_or_default().to_string(),
316 anno_name: name.to_string(),
317 anno_value: val.to_string(),
318 })?;
319 }
320 }
321 token_idx += 1;
322 }
323
324 Ok(())
325 }
326}
327
328#[cfg(test)]
329mod tests;