stamtools/
tsv.rs

1use stam::*;
2use std::borrow::Cow;
3use std::collections::HashMap;
4use std::fmt;
5use std::fs::File;
6use std::io::{BufRead, BufReader};
7
/// Represents a column in TSV output or input
#[derive(Clone, PartialEq, Eq, Debug)]
pub enum Column {
    /// Sequence number, usually a row number, but multiple rows may share the same number if
    /// hierarchical relations are expressed
    SeqNr,

    /// Variable name, as used in a STAMQL query
    VarName,

    /// Type of the result on this row
    Type,

    /// ID of the result on this row
    Id,

    /// ID of the annotation
    Annotation,

    /// ID of the text resource
    TextResource,

    /// ID of the annotation data
    AnnotationData,

    /// ID of the annotation dataset
    AnnotationDataSet,

    /// Offset in unicode points (begin-end), 0 indexed, end non-inclusive.
    Offset,

    /// Begin offset in unicode points, 0 indexed.
    BeginOffset,

    /// End offset in unicode points, 0 indexed, non-inclusive.
    EndOffset,

    /// Offset in bytes (UTF-8 encoding) (begin-end), 0 indexed, end non-inclusive.
    Utf8Offset,

    /// Begin offset in bytes (UTF-8 encoding), 0 indexed
    BeginUtf8Offset,

    /// End offset in bytes (UTF-8 encoding), 0 indexed, non-inclusive
    EndUtf8Offset,

    /// ID of the data key
    DataKey,

    /// Value
    DataValue,

    /// The text
    Text,

    /// The text selection is a combination of `TextResource` and `Offset`, separated by a `#`
    TextSelection,

    /// Ignore this column
    Ignore,

    /// Custom data column, represents the value for the given set and datakey.
    Custom {
        /// ID of the annotation dataset the key belongs to
        set: String,
        /// ID of the data key within that set
        key: String,
    },
}
73
/// Validation level, as selected by the value of the `--validate` option
/// (the option name is taken from the error message below).
#[derive(Clone, Copy, PartialEq, Debug)]
pub enum ValidationMode {
    /// Selected by `strict` or `yes`
    Strict,
    /// Selected by `loose`
    Loose,
    /// Selected by `no`
    No,
}

/// Case-insensitive conversion from the textual option value.
impl TryFrom<&str> for ValidationMode {
    type Error = String;

    /// Returns the matching mode, or a human-readable message for unknown values.
    fn try_from(val: &str) -> Result<Self, Self::Error> {
        let mode = match val.to_lowercase().as_str() {
            "strict" | "yes" => Self::Strict,
            "loose" => Self::Loose,
            "no" => Self::No,
            _ => {
                // The message echoes the value exactly as the user typed it.
                return Err(format!(
                    "Unknown value for --validate: {}, see --help for allowed values",
                    val
                ));
            }
        };
        Ok(mode)
    }
}
96
97impl Column {
98    /// Parse a column header into a type
99    pub fn parse(val: &str, setdelimiter: &str) -> Result<Self, String> {
100        if val.find(setdelimiter).is_some() {
101            let (set, key) = val.rsplit_once(setdelimiter).unwrap();
102            Ok(Self::Custom {
103                set: set.to_string(),
104                key: key.to_string(),
105            })
106        } else {
107            let val_lower = val.to_lowercase();
108            match val_lower.as_str() {
109                "type" => Ok(Self::Type),
110                "id" => Ok(Self::Id),
111                "annotationid" | "annotation" => Ok(Self::Annotation),
112                "annotationdatasetid"
113                | "annotationdataset"
114                | "set"
115                | "setid"
116                | "datasetid"
117                | "dataset" => Ok(Self::AnnotationDataSet),
118                "resource" | "resourceid" | "textresource" | "textresources" => {
119                    Ok(Self::TextResource)
120                }
121                "annotationdataid" | "dataid" => Ok(Self::AnnotationData),
122                "offset" => Ok(Self::Offset),
123                "beginoffset" | "begin" | "start" | "startoffset" => Ok(Self::BeginOffset),
124                "endoffset" | "end" => Ok(Self::EndOffset),
125                "utf8offset" => Ok(Self::Utf8Offset),
126                "beginutf8offset" | "beginutf8" | "beginbyte" | "startbyte" | "startutf8"
127                | "startutf8offset" => Ok(Self::BeginUtf8Offset),
128                "endutf8offset" | "endutf8" | "endbyte" => Ok(Self::EndUtf8Offset),
129                "datakey" | "key" | "datakeyid" | "keyid" => Ok(Self::DataKey),
130                "datavalue" | "value" => Ok(Self::DataValue),
131                "text" => Ok(Self::Text),
132                "textselections" | "textselection" => Ok(Self::TextSelection),
133                "ignore" => Ok(Self::Ignore),
134                _ => Err(format!(
135                    "Unknown column: {}, see --help for allowed values",
136                    val
137                )),
138            }
139        }
140    }
141}
142
143impl fmt::Display for Column {
144    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
145        write!(f, "{}", self.to_string())
146    }
147}
148
149#[derive(Clone)]
150struct Context<'a> {
151    id: Option<Cow<'a, str>>,
152    varname: Option<Cow<'a, str>>,
153    seqnr: usize,
154    textselections: Option<&'a Vec<ResultTextSelection<'a>>>,
155    text: Option<&'a str>,
156    annotation: Option<ResultItem<'a, Annotation>>,
157    data: Option<ResultItem<'a, AnnotationData>>,
158    resource: Option<ResultItem<'a, TextResource>>,
159    set: Option<ResultItem<'a, AnnotationDataSet>>,
160    key: Option<ResultItem<'a, DataKey>>,
161    value: Option<&'a DataValue>,
162}
163
164impl<'a> Default for Context<'a> {
165    fn default() -> Self {
166        Context {
167            id: None,
168            varname: None,
169            seqnr: 0,
170            textselections: None, //multiple
171            text: None,           //single text reference
172            annotation: None,
173            data: None,
174            resource: None,
175            set: None,
176            key: None,
177            value: None,
178        }
179    }
180}
181
182impl Column {
183    /// Output a string for this column, to be used in e.g. a TSV header
184    pub fn to_string(&self) -> String {
185        match self {
186            Self::SeqNr => "SeqNr".to_string(),
187            Self::VarName => "Variable".to_string(),
188            Self::Type => "Type".to_string(),
189            Self::Id => "Id".to_string(),
190            Self::Annotation => "Annotation".to_string(),
191            Self::TextResource => "TextResource".to_string(),
192            Self::AnnotationData => "AnnotationData".to_string(),
193            Self::AnnotationDataSet => "AnnotationDataSet".to_string(),
194            Self::Offset => "Offset".to_string(),
195            Self::BeginOffset => "BeginOffset".to_string(),
196            Self::EndOffset => "EndOffset".to_string(),
197            Self::Utf8Offset => "Utf8Offset".to_string(),
198            Self::BeginUtf8Offset => "BeginUtf8Offset".to_string(),
199            Self::EndUtf8Offset => "EndUtf8Offset".to_string(),
200            Self::DataKey => "DataKey".to_string(),
201            Self::DataValue => "DataValue".to_string(),
202            Self::Text => "Text".to_string(),
203            Self::TextSelection => "TextSelection".to_string(),
204            Self::Ignore => "Ignore".to_string(),
205            Self::Custom { set, key } => format!("{}/{}", set, key),
206        }
207    }
208
209    fn print<W: std::io::Write>(
210        &self,
211        writer: &mut W,
212        tp: Type,
213        colnr: usize,
214        col_len: usize,
215        context: &Context,
216        delimiter: &str,
217        null: &str,
218    ) -> Result<(), std::io::Error> {
219        if colnr > 0 {
220            write!(writer, "\t")?;
221        }
222        match self {
223            Column::SeqNr => write!(writer, "{}", context.seqnr)?,
224            Column::VarName => write!(
225                writer,
226                "{}",
227                context.varname.as_ref().unwrap_or(&Cow::Borrowed(null))
228            )?,
229            Column::Type => write!(writer, "{}", tp)?,
230            Column::Id => write!(
231                writer,
232                "{}",
233                context.id.as_ref().unwrap_or(&Cow::Borrowed(null))
234            )?,
235            Column::TextSelection => {
236                if let Some(textselections) = context.textselections {
237                    write!(
238                        writer,
239                        "{}",
240                        textselections
241                            .iter()
242                            .map(|textselection| {
243                                format!(
244                                    "{}#{}-{}",
245                                    textselection.resource().id().unwrap_or(""),
246                                    textselection.begin(),
247                                    textselection.end()
248                                )
249                            })
250                            .collect::<Vec<String>>()
251                            .join(delimiter)
252                    )?;
253                } else {
254                    write!(writer, "{}", null)?
255                }
256            }
257            Column::Offset => {
258                if let Some(textselections) = context.textselections {
259                    write!(
260                        writer,
261                        "{}",
262                        textselections
263                            .iter()
264                            .map(|textselection| {
265                                format!("{}-{}", textselection.begin(), textselection.end())
266                            })
267                            .collect::<Vec<String>>()
268                            .join(delimiter)
269                    )?;
270                } else {
271                    write!(writer, "{}", null)?
272                }
273            }
274            Column::BeginOffset => {
275                if let Some(textselections) = context.textselections {
276                    write!(
277                        writer,
278                        "{}",
279                        textselections
280                            .iter()
281                            .map(|textselection| { format!("{}", textselection.begin()) })
282                            .collect::<Vec<String>>()
283                            .join(delimiter)
284                    )?;
285                } else {
286                    write!(writer, "{}", null)?
287                }
288            }
289            Column::EndOffset => {
290                if let Some(textselections) = context.textselections {
291                    write!(
292                        writer,
293                        "{}",
294                        textselections
295                            .iter()
296                            .map(|textselection| { format!("{}", textselection.end()) })
297                            .collect::<Vec<String>>()
298                            .join(delimiter)
299                    )?;
300                } else {
301                    write!(writer, "{}", null)?
302                }
303            }
304            Column::Utf8Offset => {
305                if let Some(textselections) = context.textselections {
306                    write!(
307                        writer,
308                        "{}",
309                        textselections
310                            .iter()
311                            .map(|textselection| {
312                                format!(
313                                    "{}-{}",
314                                    textselection
315                                        .resource()
316                                        .utf8byte(textselection.begin())
317                                        .expect("offset must be valid"),
318                                    textselection
319                                        .resource()
320                                        .utf8byte(textselection.end())
321                                        .expect("offset must be valid"),
322                                )
323                            })
324                            .collect::<Vec<String>>()
325                            .join(delimiter)
326                    )?;
327                } else {
328                    write!(writer, "{}", null)?
329                }
330            }
331            Column::BeginUtf8Offset => {
332                if let Some(textselections) = context.textselections {
333                    write!(
334                        writer,
335                        "{}",
336                        textselections
337                            .iter()
338                            .map(|textselection| {
339                                format!(
340                                    "{}",
341                                    textselection
342                                        .resource()
343                                        .utf8byte(textselection.begin())
344                                        .expect("offset must be valid"),
345                                )
346                            })
347                            .collect::<Vec<String>>()
348                            .join(delimiter)
349                    )?;
350                } else {
351                    write!(writer, "{}", null)?
352                }
353            }
354            Column::EndUtf8Offset => {
355                if let Some(textselections) = context.textselections {
356                    write!(
357                        writer,
358                        "{}",
359                        textselections
360                            .iter()
361                            .map(|textselection| {
362                                format!(
363                                    "{}",
364                                    textselection
365                                        .resource()
366                                        .utf8byte(textselection.end())
367                                        .expect("offset must be valid"),
368                                )
369                            })
370                            .collect::<Vec<String>>()
371                            .join(delimiter)
372                    )?;
373                } else {
374                    write!(writer, "{}", null)?
375                }
376            }
377            Column::Text => {
378                if let Some(text) = context.text {
379                    write!(writer, "{}", text)?
380                } else if let Some(textselections) = context.textselections {
381                    write!(
382                        writer,
383                        "{}",
384                        textselections
385                            .iter()
386                            .map(|textselection| textselection.text().replace("\n", " "))
387                            .collect::<Vec<String>>()
388                            .join(delimiter)
389                    )?
390                } else {
391                    write!(writer, "{}", null)?
392                }
393            }
394            Column::Annotation => write!(
395                writer,
396                "{}",
397                context
398                    .annotation
399                    .as_ref()
400                    .map(|annotation| annotation
401                        .id()
402                        .map(|x| x.to_string())
403                        .unwrap_or_else(|| annotation.as_ref().temp_id().unwrap()))
404                    .unwrap_or(null.to_string())
405            )?,
406            Column::AnnotationData => write!(
407                writer,
408                "{}",
409                context
410                    .data
411                    .as_ref()
412                    .map(|data| data.id().unwrap_or(null))
413                    .unwrap_or(null)
414            )?,
415            Column::AnnotationDataSet => write!(
416                writer,
417                "{}",
418                context
419                    .set
420                    .as_ref()
421                    .map(|set| set.id().unwrap_or(null))
422                    .unwrap_or(null)
423            )?,
424            Column::TextResource => write!(
425                writer,
426                "{}",
427                context
428                    .resource
429                    .as_ref()
430                    .map(|resource| resource.id().unwrap_or(null))
431                    .unwrap_or(null)
432            )?,
433            Column::DataKey => write!(
434                writer,
435                "{}",
436                context
437                    .key
438                    .as_ref()
439                    .map(|key| key.id().unwrap_or(null))
440                    .unwrap_or(null)
441            )?,
442            Column::DataValue => write!(
443                writer,
444                "{}",
445                context
446                    .value
447                    .as_ref()
448                    .map(|value| value.to_string())
449                    .unwrap_or(null.to_string())
450            )?,
451            Column::Custom { set, key } => {
452                let mut found = false;
453                if let Some(annotation) = &context.annotation {
454                    if let Some(key) = annotation.store().key(set.as_str(), key.as_str()) {
455                        for (i, annotationdata) in annotation.data().filter_key(&key).enumerate() {
456                            found = true;
457                            write!(
458                                writer,
459                                "{}{}",
460                                if i > 0 { delimiter } else { "" },
461                                annotationdata.value()
462                            )?
463                        }
464                    }
465                }
466                if !found {
467                    write!(writer, "{}", null)?
468                }
469            }
470            _ => write!(writer, "{}", null)?,
471        }
472        if colnr == col_len - 1 {
473            write!(writer, "\n")?;
474        }
475        Ok(())
476    }
477}
478
479#[derive(Debug)]
480/// A column specification, holds one or more [`Column`] instances.
481pub struct Columns(Vec<Column>);
482
483impl Columns {
484    fn printrow<W: std::io::Write>(
485        &self,
486        writer: &mut W,
487        tp: Type,
488        context: &Context,
489        delimiter: &str,
490        null: &str,
491    ) -> Result<(), std::io::Error> {
492        for (i, column) in self.0.iter().enumerate() {
493            column.print(writer, tp, i, self.len(), context, delimiter, null)?;
494        }
495        Ok(())
496    }
497
498    fn printheader<W: std::io::Write>(&self, writer: &mut W) -> Result<(), std::io::Error> {
499        for (i, column) in self.0.iter().enumerate() {
500            if i > 0 {
501                write!(writer, "\t")?;
502            }
503            write!(writer, "{}", column)?;
504            if i == self.len() - 1 {
505                write!(writer, "\n")?;
506            }
507        }
508        Ok(())
509    }
510
511    fn index(&self, coltype: &Column) -> Option<usize> {
512        for (i, col) in self.0.iter().enumerate() {
513            if col == coltype {
514                return Some(i);
515            }
516        }
517        None
518    }
519
520    fn has(&self, coltype: &Column) -> bool {
521        self.index(coltype).is_some()
522    }
523
524    fn len(&self) -> usize {
525        self.0.len()
526    }
527
528    fn iter<'a>(&'a self) -> std::slice::Iter<'a, Column> {
529        self.0.iter()
530    }
531
532    fn add_from_query<'a>(&mut self, query: &Query<'a>) {
533        for constraint in query.iter() {
534            match constraint {
535                Constraint::KeyValue { set, key, .. } | Constraint::DataKey { set, key, .. } => {
536                    self.0.push(Column::Custom {
537                        set: set.to_string(),
538                        key: key.to_string(),
539                    })
540                }
541                _ => {}
542            }
543        }
544        for subquery in query.subqueries() {
545            self.add_from_query(subquery);
546        }
547    }
548}
549
550pub fn to_tsv<'a, W: std::io::Write>(
551    store: &'a AnnotationStore,
552    writer: &mut W,
553    query: Query<'a>,
554    columnconfig: &[&str],
555    verbose: bool,
556    delimiter: &str,
557    null: &str,
558    header: bool,
559    setdelimiter: &str,
560    autocolumns: bool,
561) -> Result<(), StamError> {
562    let mut columns = Columns(
563        columnconfig
564            .iter()
565            .map(|col| {
566                Column::parse(*col, setdelimiter)
567                    .map_err(|err| {
568                        eprintln!("[warning] {}", err);
569                    })
570                    .unwrap()
571            })
572            .collect(),
573    );
574
575    if autocolumns {
576        if (verbose || query.has_subqueries()) && !columns.0.contains(&Column::SeqNr) {
577            //output the sequence (row) number in verbose mode or if we have subqueries
578            columns.0.insert(0, Column::SeqNr);
579        }
580        if query.has_subqueries() && !columns.0.contains(&Column::VarName) {
581            //output the variable name if we have subqueries
582            columns.0.insert(1, Column::VarName);
583        }
584
585        columns.add_from_query(&query);
586    }
587
588    if header {
589        columns.printheader(writer)?;
590    }
591
592    let want_textselections =
593        columns.0.contains(&Column::TextSelection) || columns.0.contains(&Column::Text);
594
595    let iter = store.query(query)?;
596    for (seqnr, resultrow) in iter.enumerate() {
597        let seqnr = seqnr + 1; //1-indexed
598        for (result, varname) in resultrow.iter().zip(resultrow.names()) {
599            match result {
600                QueryResultItem::None | QueryResultItem::AnnotationSubStore(..) => {}
601                QueryResultItem::Annotation(annotation) => {
602                    let textselections: Option<Vec<_>> = if want_textselections {
603                        Some(annotation.textselections().collect())
604                    } else {
605                        None
606                    };
607                    let context = Context {
608                        id: if let Some(id) = annotation.id() {
609                            Some(Cow::Borrowed(id))
610                        } else {
611                            Some(Cow::Owned(annotation.as_ref().temp_id().unwrap()))
612                        },
613                        seqnr,
614                        varname: varname.map(|s| Cow::Borrowed(s)),
615                        annotation: Some(annotation.clone()), //clones only the ResultItem, cheap
616                        textselections: textselections.as_ref(),
617                        ..Context::default()
618                    };
619                    columns.printrow(writer, Type::Annotation, &context, delimiter, null)?;
620                    if verbose {
621                        for data in annotation.data() {
622                            let context = Context {
623                                id: data.id().map(|x| Cow::Borrowed(x)),
624                                seqnr,
625                                annotation: Some(annotation.clone()),
626                                key: Some(data.key()),
627                                data: Some(data.clone()),
628                                set: Some(data.set()),
629                                value: Some(data.value()),
630                                ..Context::default()
631                            };
632                            columns.printrow(
633                                writer,
634                                Type::AnnotationData,
635                                &context,
636                                delimiter,
637                                null,
638                            )?;
639                        }
640                    }
641                }
642                QueryResultItem::AnnotationData(data) => {
643                    let context = Context {
644                        id: data.id().map(|x| Cow::Borrowed(x)),
645                        seqnr,
646                        varname: varname.map(|s| Cow::Borrowed(s)),
647                        set: Some(data.set()),
648                        key: Some(data.key()),
649                        value: Some(data.value()),
650                        ..Context::default()
651                    };
652                    columns.printrow(writer, Type::AnnotationData, &context, delimiter, null)?;
653                }
654                QueryResultItem::DataKey(key) => {
655                    let context = Context {
656                        id: key.id().map(|x| Cow::Borrowed(x)),
657                        seqnr,
658                        varname: varname.map(|s| Cow::Borrowed(s)),
659                        set: Some(key.set()),
660                        key: Some(key.clone()),
661                        ..Context::default()
662                    };
663                    columns.printrow(writer, Type::DataKey, &context, delimiter, null)?;
664                }
665                QueryResultItem::AnnotationDataSet(dataset) => {
666                    let context = Context {
667                        id: dataset.id().map(|x| Cow::Borrowed(x)),
668                        seqnr,
669                        varname: varname.map(|s| Cow::Borrowed(s)),
670                        set: Some(dataset.clone()),
671                        ..Context::default()
672                    };
673                    columns.printrow(writer, Type::AnnotationDataSet, &context, delimiter, null)?;
674                    if verbose {
675                        for key in dataset.keys() {
676                            let context = Context {
677                                id: key.id().map(|x| Cow::Borrowed(x)),
678                                seqnr,
679                                set: Some(key.set()),
680                                key: Some(key.clone()),
681                                ..Context::default()
682                            };
683                            columns.printrow(writer, Type::DataKey, &context, delimiter, null)?;
684                        }
685                        for data in dataset.data() {
686                            let context = Context {
687                                id: data.id().map(|x| Cow::Borrowed(x)),
688                                seqnr,
689                                set: Some(data.set()),
690                                key: Some(data.key()),
691                                value: Some(data.value()),
692                                ..Context::default()
693                            };
694                            columns.printrow(
695                                writer,
696                                Type::AnnotationData,
697                                &context,
698                                delimiter,
699                                null,
700                            )?;
701                        }
702                    }
703                }
704                QueryResultItem::TextResource(resource) => {
705                    let context = Context {
706                        id: resource.id().map(|x| Cow::Borrowed(x)),
707                        varname: varname.map(|s| Cow::Borrowed(s)),
708                        seqnr,
709                        resource: Some(resource.clone()),
710                        ..Context::default()
711                    };
712                    columns.printrow(writer, Type::TextResource, &context, delimiter, null)?;
713                }
714                QueryResultItem::TextSelection(textselection) => {
715                    let id = format!(
716                        "{}#{}-{}",
717                        textselection.resource().id().unwrap_or(""),
718                        textselection.begin(),
719                        textselection.end()
720                    );
721                    let text = Some(textselection.text());
722                    let textselections: Vec<ResultTextSelection> = vec![textselection.clone()];
723                    let context = Context {
724                        id: Some(Cow::Owned(id)),
725                        seqnr,
726                        varname: varname.map(|s| Cow::Borrowed(s)),
727                        resource: Some(textselection.resource()),
728                        textselections: Some(&textselections),
729                        text,
730                        ..Context::default()
731                    };
732                    columns.printrow(writer, Type::TextSelection, &context, delimiter, null)?;
733                }
734            }
735        }
736    }
737    Ok(())
738}
739
740#[derive(Debug, Clone, Copy, PartialEq)]
741/// Determines how a TSV file is parsed in relation to references text files (which may or may not exist).
742pub enum ParseMode {
743    /// Normal parse mode, assumes a stand-off text file exists and alignment information is given.
744    Simple,
745    /// Align with an existing text resource. This is useful when a TSV file holds text and a stand-off file exist, but no alignment is provided. The alignment will be computed.
746    AlignWithText,
747    /// Reconstruct a text resource from scratch. This is useful when a TSV file holds the text and an stand-off file does not exist. It will be created.
748    ReconstructText,
749    /// Tag all occurrences. This is used when there is stand-off file and the TSV files applies occurrences in that stand-off text file, rather than to any single ones.
750    MultiTag,
751    /// This is used when a TSV file does not relate to a text at all.
752    Metadata,
753}
754
755impl ParseMode {
756    /// Automatically determines a ParseMode from a column configuration and presence or absence of an existing resource
757    /// `sequential`: Is the information in the TSV file sequential/ordered? (e.g. a token or line on each subsequent row)
758    pub fn new(
759        columns: &Columns,
760        existing_resource: Option<&str>,
761        sequential: bool,
762    ) -> Result<Self, &'static str> {
763        if columns.has(&Column::Text) {
764            if columns.has(&Column::Offset)
765                || (columns.has(&Column::BeginOffset) && columns.has(&Column::EndOffset))
766                || columns.has(&Column::TextSelection)
767            {
768                Ok(Self::Simple)
769            } else {
770                //no offset information
771                if columns.has(&Column::TextResource)
772                    || existing_resource.is_some()
773                    || columns.has(&Column::TextSelection)
774                {
775                    if sequential {
776                        Ok(Self::AlignWithText)
777                    } else {
778                        Ok(Self::MultiTag)
779                    }
780                } else {
781                    if sequential {
782                        Ok(Self::ReconstructText)
783                    } else {
784                        Err("Can not reconstruct a text if rows in input data are not sequential")
785                    }
786                }
787            }
788        } else if columns.has(&Column::TextResource) || existing_resource.is_some() {
789            if columns.has(&Column::Offset)
790                || (columns.has(&Column::BeginOffset) && columns.has(&Column::EndOffset))
791                || columns.has(&Column::TextSelection)
792            {
793                Ok(Self::Simple)
794            } else {
795                Err("Unable to determine how to parse this data based on the available columns. Make sure there is at least an Offset column (or BeginOffset, EndOffset columns)")
796            }
797        } else if !columns.has(&Column::Offset)
798            && !columns.has(&Column::BeginOffset)
799            && !columns.has(&Column::EndOffset)
800            && !columns.has(&Column::TextSelection)
801        {
802            if columns.has(&Column::TextResource) || existing_resource.is_some() {
803                eprintln!("Warning: Data has neither a Text nor an Offset column, interpreting data as metadata");
804                Ok(Self::Metadata)
805            } else {
806                Err("Data has neither a Text nor an Offset column")
807            }
808        } else {
809            Err("Unable to determine how to parse this data based on the available columns. Make sure there is at least an Offset, Text or Resource column (or supply --resource)")
810        }
811    }
812}
813
814/// Reads a TSV, with a flexible column configuration, into an Annotation Store
815pub fn from_tsv(
816    store: &mut AnnotationStore,
817    filename: &str,
818    columnconfig: Option<&Vec<&str>>,
819    existing_resource: Option<&str>,
820    new_resource: Option<&str>,
821    default_set: Option<&str>,
822    comments: bool,
823    sequential: bool,
824    case_sensitive: bool,
825    escape: bool,
826    nullvalue: &str,
827    subdelimiter: &str,     //input delimiter for multiple values in a cell
828    setdelimiter: &str,     //delimiter between key/set
829    outputdelimiter: &str,  //outputted after each row when reconstructing text (space)
830    outputdelimiter2: &str, //outputted after each empty line when reconstructing text (newline)
831    header: Option<bool>,   //None means autodetect
832    validation: ValidationMode,
833    verbose: bool,
834) -> Result<(), String> {
835    let f =
836        File::open(filename).map_err(|e| format!("Error opening TSV file {}: {}", filename, e))?;
837    let reader = BufReader::new(f);
838
839    let mut columns: Option<Columns> = None;
840    let mut parsemode: Option<ParseMode> = None;
841    let mut cursors: HashMap<TextResourceHandle, usize> = HashMap::new(); //used in AlignWithText mode to keep track of the begin of text offset (per resource)
842    let mut buffer: Vec<String> = Vec::new(); //used in ReconstructText mode for a second pass over the data
843    let mut bufferbegin: usize = 0; //line number where the buffer begins
844    let mut texts: HashMap<String, String> = HashMap::new(); //used in ReconstructText mode
845    let mut buffered_delimiter: Option<String> = None; // used in ReconstructText mode
846
847    for (i, line) in reader.lines().enumerate() {
848        if let Ok(line) = line {
849            if line.is_empty() {
850                buffered_delimiter = Some(outputdelimiter2.to_string()); //only affects ReconstructText mode
851            } else if comments && !line.is_empty() && &line.get(0..1) == &Some("#") {
852                //this is a comment, ignore
853                continue;
854            } else if i == 0 && columns.is_none() && header != Some(false) {
855                if verbose {
856                    eprintln!("Parsing first row as header...")
857                }
858                columns = Some(
859                    Columns(
860                        line.split("\t")
861                            .map(|col| {
862                                parse_column(col, default_set, setdelimiter).map_err(|err| {
863                                    eprintln!("[warning] Unable to parse first line of TSV file as header (please provide a column configuration explicitly if the input file has none): {}. You may consider setting --annotationset if you want to interpret this column as a key in the specified annotationset", err);
864                                }).unwrap()
865                            })
866                            .collect(),
867                    )
868                );
869                parsemode = Some(
870                    ParseMode::new(columns.as_ref().unwrap(), existing_resource, sequential)
871                        .map_err(|e| format!("Can't determine parse mode: {}", e))?,
872                );
873                if verbose {
874                    eprintln!("Columns: {:?}", columns.as_ref().unwrap());
875                    eprintln!("Parse mode: {:?}", parsemode.unwrap());
876                }
877            } else if i == 0 && columns.is_some() && header != Some(false) {
878                if verbose {
879                    eprintln!("Skipping first row (assuming to be a header)...")
880                }
881                continue; //skip header row
882            } else {
883                if columns.is_none() {
884                    if columnconfig.is_none() {
885                        return Err(format!("Please provide a configuration for the columns"));
886                    }
887                    columns = Some(Columns(
888                        columnconfig
889                            .unwrap()
890                            .iter()
891                            .map(|col| {
892                                parse_column(col, default_set, setdelimiter)
893                                    .map_err(|err| {
894                                        eprintln!(
895                                            "[warning] Unable to parse provided column: {}",
896                                            err
897                                        );
898                                    })
899                                    .unwrap()
900                            })
901                            .collect(),
902                    ));
903                    parsemode = Some(
904                        ParseMode::new(columns.as_ref().unwrap(), existing_resource, sequential)
905                            .map_err(|e| format!("Can't determine parse mode: {}", e))?,
906                    );
907                    if verbose {
908                        eprintln!("Columns: {:?}", columns.as_ref().unwrap());
909                        eprintln!("Parse mode: {:?}", parsemode.unwrap())
910                    }
911                }
912                if let (Some(columns), Some(parsemode)) = (&columns, parsemode) {
913                    if parsemode == ParseMode::ReconstructText {
914                        if let Err(e) = reconstruct_text(
915                            &line,
916                            &columns,
917                            &mut texts,
918                            existing_resource,
919                            new_resource,
920                            outputdelimiter,
921                            &mut buffered_delimiter,
922                        ) {
923                            return Err(format!(
924                                "Error reconstructing text (line {}): {}",
925                                i + 1,
926                                e
927                            ));
928                        }
929                        if buffer.is_empty() {
930                            bufferbegin = i;
931                        }
932                        buffer.push(line);
933                    } else if let Err(e) = parse_row(
934                        store,
935                        &line,
936                        &columns,
937                        parsemode,
938                        subdelimiter,
939                        existing_resource,
940                        new_resource,
941                        default_set,
942                        case_sensitive,
943                        escape,
944                        nullvalue,
945                        validation,
946                        &mut cursors,
947                    ) {
948                        return Err(format!("Error parsing tsv line {}: {}", i + 1, e));
949                    }
950                }
951            }
952        }
953    }
954
955    if parsemode == Some(ParseMode::ReconstructText) {
956        if verbose {
957            eprintln!("Creating resources...");
958        }
959        for (filename, text) in texts {
960            if verbose {
961                eprintln!("Creating resource {} (length={})", filename, text.len());
962            }
963            if let Err(e) = store.add_resource(
964                TextResourceBuilder::new()
965                    .with_text(text)
966                    .with_filename(&filename),
967            ) {
968                return Err(format!("Error loading/adding resource: {}", e));
969            }
970        }
971        if verbose {
972            eprintln!("Parsing buffered rows...");
973        }
974        let parsemode = ParseMode::AlignWithText;
975        let columns = columns.unwrap();
976        for (i, line) in buffer.iter().enumerate() {
977            if let Err(e) = parse_row(
978                store,
979                &line,
980                &columns,
981                parsemode,
982                subdelimiter,
983                existing_resource,
984                new_resource,
985                default_set,
986                case_sensitive,
987                escape,
988                nullvalue,
989                validation,
990                &mut cursors,
991            ) {
992                return Err(format!(
993                    "Error parsing tsv line {}: {}",
994                    i + bufferbegin + 1,
995                    e
996                ));
997            }
998        }
999    }
1000    Ok(())
1001}
1002
1003fn reconstruct_text(
1004    line: &str,
1005    columns: &Columns,
1006    texts: &mut HashMap<String, String>,
1007    existing_resource: Option<&str>,
1008    new_resource: Option<&str>,
1009    output_delimiter: &str,
1010    buffered_delimiter: &mut Option<String>,
1011) -> Result<(), String> {
1012    let cells: Vec<&str> = line.split("\t").collect();
1013    if cells.len() != columns.len() {
1014        return Err(format!(
1015            "Number of cells is not equal to number of columns in header ({} vs {})",
1016            cells.len(),
1017            columns.len()
1018        ));
1019    }
1020    let resource_file: &str =
1021        parse_resource_file(&cells, columns, existing_resource, new_resource)?;
1022    let textcolumn = columns.index(&Column::Text);
1023    if !texts.contains_key(resource_file) {
1024        texts.insert(resource_file.to_string(), String::new());
1025    }
1026    if let Some(text) = texts.get_mut(resource_file) {
1027        if let Some(buffered_delimiter) = buffered_delimiter {
1028            text.push_str(&buffered_delimiter);
1029        }
1030        text.push_str(&cells[textcolumn.expect("there must be a text column")]);
1031        *buffered_delimiter = Some(output_delimiter.to_string());
1032    }
1033    Ok(())
1034}
1035
1036/// Parse a row (`line`) of a TSV file (provided as string)
1037fn parse_row(
1038    store: &mut AnnotationStore,
1039    line: &str,
1040    columns: &Columns,
1041    parsemode: ParseMode,
1042    subdelimiter: &str,
1043    existing_resource: Option<&str>,
1044    new_resource: Option<&str>,
1045    default_set: Option<&str>,
1046    case_sensitive: bool,
1047    escape: bool,
1048    nullvalue: &str,
1049    validation: ValidationMode,
1050    cursors: &mut HashMap<TextResourceHandle, usize>,
1051) -> Result<(), String> {
1052    let cells: Vec<&str> = line.split("\t").collect();
1053    if cells.len() != columns.len() {
1054        return Err(format!(
1055            "Number of cells is not equal to number of columns in header ({} vs {})",
1056            cells.len(),
1057            columns.len()
1058        ));
1059    }
1060    let resource_file: &str =
1061        parse_resource_file(&cells, columns, existing_resource, new_resource)?;
1062    let resource_handle: TextResourceHandle = get_resource_handle(store, resource_file)?;
1063    let textcolumn = columns.index(&Column::Text);
1064    let selector = match parsemode {
1065        ParseMode::Simple => build_selector(&cells, columns, resource_handle)?,
1066        ParseMode::AlignWithText => align_with_text(
1067            store,
1068            resource_handle,
1069            &cells,
1070            textcolumn.expect("text column is required when parsemode is set to AlignWithText"),
1071            case_sensitive,
1072            cursors,
1073        )?,
1074        _ => return Err("Not implemented yet".to_string()),
1075    };
1076    let mut annotationbuilder = build_annotation(
1077        &cells,
1078        columns,
1079        default_set,
1080        subdelimiter,
1081        escape,
1082        nullvalue,
1083    )?;
1084    annotationbuilder = annotationbuilder.with_target(selector);
1085    match store.annotate(annotationbuilder) {
1086        Err(e) => return Err(format!("{}", e)),
1087        Ok(handle) => {
1088            if parsemode == ParseMode::Simple {
1089                if let Some(textcolumn) = textcolumn {
1090                    validate_text(store, handle, &cells, textcolumn, validation)?;
1091                }
1092            }
1093        }
1094    }
1095    Ok(())
1096}
1097
1098fn align_with_text<'a>(
1099    store: &AnnotationStore,
1100    resource_handle: TextResourceHandle,
1101    cells: &[&str],
1102    textcolumn: usize,
1103    case_sensitive: bool,
1104    cursors: &mut HashMap<TextResourceHandle, usize>,
1105) -> Result<SelectorBuilder<'a>, String> {
1106    let textfragment = cells[textcolumn];
1107    if textfragment.is_empty() {
1108        return Err("Value in text column can not be empty".to_string());
1109    }
1110    let cursor = cursors.entry(resource_handle).or_insert(0);
1111    let resource = store
1112        .resource(&BuildItem::from(resource_handle))
1113        .expect("resource must exist");
1114    let searchtext = resource
1115        .textselection(&Offset::new(
1116            Cursor::BeginAligned(*cursor),
1117            Cursor::EndAligned(0),
1118        ))
1119        .map_err(|e| format!("{}", e))?;
1120    if let Some(foundtextselection) = if case_sensitive {
1121        searchtext.find_text(textfragment).next()
1122    } else {
1123        searchtext.find_text_nocase(textfragment).next() //MAYBE TODO: this will be sub-optimal on large texts as it is lowercased each time -> use a smaller text buffer
1124    } {
1125        *cursor = foundtextselection.end();
1126        Ok(SelectorBuilder::textselector(
1127            resource_handle,
1128            Offset::simple(foundtextselection.begin(), foundtextselection.end()),
1129        ))
1130    } else {
1131        return Err(format!(
1132            "Unable to align specified text with the underlying resource: '{}' (lost track after character position {})",
1133            textfragment,
1134            *cursor
1135        ));
1136    }
1137}
1138
1139fn validate_text(
1140    store: &AnnotationStore,
1141    annotation_handle: AnnotationHandle,
1142    cells: &[&str],
1143    textcolumn: usize,
1144    validation: ValidationMode,
1145) -> Result<(), String> {
1146    if validation == ValidationMode::No {
1147        return Ok(());
1148    }
1149    if let Some(annotation) = store.annotation(annotation_handle) {
1150        let text: Vec<&str> = annotation.text().collect();
1151        if text.is_empty() {
1152            return Err("No text found".to_string());
1153        } else if text.len() == 1 {
1154            if !match validation {
1155                ValidationMode::Strict => {
1156                    &text[0] == cells.get(textcolumn).expect("cell must exist")
1157                }
1158                ValidationMode::Loose => {
1159                    text[0].to_lowercase()
1160                        == cells
1161                            .get(textcolumn)
1162                            .expect("cell must exist")
1163                            .to_lowercase()
1164                }
1165                ValidationMode::No => true,
1166            } {
1167                return Err(format!(
1168                    "Text validation failed, TSV expects '{}', data has '{}'",
1169                    cells.get(textcolumn).unwrap(),
1170                    &text[0]
1171                ));
1172            }
1173        } else {
1174            let text: String = text.join(" ");
1175            if !match validation {
1176                ValidationMode::Strict => {
1177                    &text.as_str() == cells.get(textcolumn).expect("cell must exist")
1178                }
1179                ValidationMode::Loose => {
1180                    text.to_lowercase()
1181                        == cells
1182                            .get(textcolumn)
1183                            .expect("cell must exist")
1184                            .to_lowercase()
1185                }
1186                ValidationMode::No => true,
1187            } {
1188                return Err(format!(
1189                    "Text validation failed, TSV expects '{}', data has '{}'",
1190                    cells.get(textcolumn).unwrap(),
1191                    &text.as_str()
1192                ));
1193            }
1194        }
1195    } else {
1196        return Err("Annotation not found (should never happen)".to_string());
1197    }
1198    Ok(())
1199}
1200
/// Resolves the escape sequences `\n` (newline) and `\t` (tab) in a string.
///
/// A backslash followed by any other character, including another backslash, is copied to the
/// output unchanged together with that character. The character following a backslash is never
/// interpreted as the start of a new escape sequence, so in `\\n` the `n` remains a literal `n`.
/// A single backslash at the very end of the input is copied as-is.
fn unescape(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars();
    while let Some(c) = chars.next() {
        if c != '\\' {
            result.push(c);
            continue;
        }
        //a backslash: what it means is decided by the character that follows it
        match chars.next() {
            Some('n') => result.push('\n'),
            Some('t') => result.push('\t'),
            Some(other) => {
                //not an escape sequence we know, preserve it verbatim
                result.push('\\');
                result.push(other);
            }
            None => result.push('\\'),
        }
    }
    result
}
1226
1227fn build_annotation<'a>(
1228    cells: &'a [&'a str],
1229    columns: &Columns,
1230    default_set: Option<&'a str>,
1231    subdelimiter: &str,
1232    escape: bool,
1233    nullvalue: &str,
1234) -> Result<AnnotationBuilder<'a>, String> {
1235    let mut annotationbuilder = AnnotationBuilder::new();
1236    if let Some(i) = columns.index(&Column::Id) {
1237        let id = cells.get(i).expect("cell must exist");
1238        annotationbuilder = annotationbuilder.with_id(id.to_string());
1239    } else if let Some(i) = columns.index(&Column::Annotation) {
1240        //same as above
1241        let id = cells.get(i).expect("cell must exist");
1242        annotationbuilder = annotationbuilder.with_id(id.to_string());
1243    } else if let (Some(ikey), Some(ivalue)) = (
1244        columns.index(&Column::DataKey),
1245        columns.index(&Column::DataValue),
1246    ) {
1247        let mut databuilder = AnnotationDataBuilder::new();
1248        if let Some(i) = columns.index(&Column::AnnotationData) {
1249            let id = cells.get(i).expect("cell must exist");
1250            databuilder = databuilder.with_id(BuildItem::IdRef(id));
1251        } else if let Some(default_set) = default_set {
1252            databuilder = databuilder.with_id(BuildItem::IdRef(default_set));
1253        }
1254        if let Some(i) = columns.index(&Column::AnnotationDataSet) {
1255            let set = cells.get(i).expect("cell must exist");
1256            databuilder = databuilder.with_dataset(BuildItem::Id(set.to_string()));
1257        }
1258        let key = cells.get(ikey).expect("cell must exist");
1259        let value = cells.get(ivalue).expect("cell must exist");
1260        if !value.is_empty() && *value != nullvalue {
1261            if value.find(subdelimiter).is_some() {
1262                for value in value.split(subdelimiter) {
1263                    let mut multidatabuilder = AnnotationDataBuilder::new();
1264                    if let Some(i) = columns.index(&Column::AnnotationDataSet) {
1265                        let set = cells.get(i).expect("cell must exist");
1266                        multidatabuilder =
1267                            multidatabuilder.with_dataset(BuildItem::Id(set.to_string()));
1268                    }
1269                    multidatabuilder = multidatabuilder.with_key(BuildItem::from(*key));
1270                    if escape {
1271                        multidatabuilder =
1272                            multidatabuilder.with_value(DataValue::from(unescape(value)));
1273                    } else {
1274                        multidatabuilder = multidatabuilder.with_value(DataValue::from(value));
1275                    }
1276                    annotationbuilder = annotationbuilder.with_data_builder(multidatabuilder);
1277                }
1278            } else {
1279                databuilder = databuilder.with_key(BuildItem::from(*key));
1280                if escape {
1281                    databuilder = databuilder.with_value(DataValue::from(unescape(value)));
1282                } else {
1283                    databuilder = databuilder.with_value(DataValue::from(*value));
1284                }
1285                annotationbuilder = annotationbuilder.with_data_builder(databuilder);
1286            }
1287        }
1288    }
1289    //process custom columns
1290    for (column, cell) in columns.iter().zip(cells.iter()) {
1291        if let Column::Custom { set, key } = column {
1292            if cell.find(subdelimiter).is_some() {
1293                for value in cell.split(subdelimiter) {
1294                    let value: DataValue = if escape {
1295                        unescape(value).into()
1296                    } else {
1297                        value.into()
1298                    };
1299                    let databuilder = AnnotationDataBuilder::new()
1300                        .with_dataset(BuildItem::Id(set.clone()))
1301                        .with_key(BuildItem::Id(key.clone()))
1302                        .with_value(value);
1303                    annotationbuilder = annotationbuilder.with_data_builder(databuilder);
1304                }
1305            } else {
1306                let value: DataValue = if escape {
1307                    unescape(cell).into()
1308                } else {
1309                    (*cell).into()
1310                };
1311                let databuilder = AnnotationDataBuilder::new()
1312                    .with_dataset(BuildItem::Id(set.clone()))
1313                    .with_key(BuildItem::Id(key.clone()))
1314                    .with_value(value);
1315                annotationbuilder = annotationbuilder.with_data_builder(databuilder);
1316            }
1317        }
1318    }
1319    Ok(annotationbuilder)
1320}
1321
1322fn parse_resource_file<'a>(
1323    cells: &[&'a str],
1324    columns: &Columns,
1325    existing_resource: Option<&'a str>,
1326    new_resource: Option<&'a str>,
1327) -> Result<&'a str, String> {
1328    if let Some(i) = columns.index(&Column::TextResource) {
1329        Ok(cells.get(i).expect("cell must exist"))
1330    } else if let Some(i) = columns.index(&Column::TextSelection) {
1331        let textselection = cells.get(i).expect("cell must exist");
1332        if let Some(bytepos) = textselection.find('#') {
1333            Ok(&textselection[..bytepos])
1334        } else {
1335            Err("Text selection must have format: resource#beginoffset-endoffset".to_string())
1336        }
1337    } else if let Some(existing_resource) = existing_resource {
1338        Ok(existing_resource)
1339    } else if let Some(new_resource) = new_resource {
1340        Ok(new_resource)
1341    } else {
1342        Err(
1343            "Can't find resource (data doesn't make an explicit reference to it). You may want to specify a default (existing) resource using --resource"
1344                .to_string(),
1345        )
1346    }
1347}
1348
1349fn get_resource_handle(
1350    store: &mut AnnotationStore,
1351    filename: &str,
1352) -> Result<TextResourceHandle, String> {
1353    if let Some(resource) = store.resource(filename) {
1354        return Ok(resource.handle());
1355    }
1356    store
1357        .add_resource(TextResourceBuilder::new().with_filename(filename))
1358        .map_err(|e| format!("Specified resource not found: {}: {}", filename, e))
1359}
1360
1361fn build_selector<'a>(
1362    cells: &[&str],
1363    columns: &Columns,
1364    resource_handle: TextResourceHandle,
1365) -> Result<SelectorBuilder<'a>, String> {
1366    //TODO: for now this only returns a TextSelector, should be adapted to handle multiple offsets (with subdelimiter) and return a CompositeSelector then
1367    let offset = parse_offset(cells, columns)?;
1368    Ok(SelectorBuilder::textselector(resource_handle, offset))
1369}
1370
1371fn parse_offset(cells: &[&str], columns: &Columns) -> Result<Offset, String> {
1372    if let Some(ioffset) = columns.index(&Column::Offset) {
1373        let cell = cells.get(ioffset).expect("cell must exist");
1374        if let Some(delimiterpos) = &cell[1..].find('-') {
1375            let delimiterpos = *delimiterpos + 1; //we do 1 rather than 0 to not consider an immediate hyphen after the # , that would indicate a negative begin index
1376            let begin_str = &cell[0..delimiterpos];
1377            let end_str = &cell[(delimiterpos + 1)..];
1378            let begin: Cursor = begin_str.try_into().map_err(|e| format!("{}", e))?;
1379            let end: Cursor = end_str.try_into().map_err(|e| format!("{}", e))?;
1380            return Ok(Offset::new(begin, end));
1381        }
1382        Err("Offset must have format: beginoffset-endoffset".to_string())
1383    } else if let (Some(b), Some(e)) = (
1384        columns.index(&Column::BeginOffset),
1385        columns.index(&Column::EndOffset),
1386    ) {
1387        let begin_str = cells.get(b).expect("cell must exist");
1388        let end_str = cells.get(e).expect("cell must exist");
1389        let begin: Cursor = (*begin_str).try_into().map_err(|e| format!("{}", e))?;
1390        let end: Cursor = (*end_str).try_into().map_err(|e| format!("{}", e))?;
1391        Ok(Offset::new(begin, end))
1392    } else if let Some(i) = columns.index(&Column::TextSelection) {
1393        let textselection = cells.get(i).expect("cell must exist");
1394        if let Some(bytepos) = textselection.find('#') {
1395            if let Some(delimiterpos) = &textselection[(bytepos + 2)..].find('-') {
1396                let delimiterpos = *delimiterpos + bytepos + 2; //we do 2 rather than 1 to not consider an immediate hyphen after the # , that would indicate a negative begin index
1397                let begin_str = &textselection[(bytepos + 1)..delimiterpos];
1398                let end_str = &textselection[(delimiterpos + 1)..];
1399                let begin: Cursor = (*begin_str).try_into().map_err(|e| format!("{}", e))?;
1400                let end: Cursor = (*end_str).try_into().map_err(|e| format!("{}", e))?;
1401                return Ok(Offset::new(begin, end));
1402            }
1403        }
1404        Err("Text selection must have format: resource#beginoffset-endoffset".to_string())
1405    } else {
1406        Err(format!("No offset information found"))
1407    }
1408}
1409
1410fn parse_column(
1411    column: &str,
1412    default_set: Option<&str>,
1413    setdelimiter: &str,
1414) -> Result<Column, String> {
1415    let result = Column::parse(column, setdelimiter)
1416        .map_err(|err| format!("Unable to parse provided columns: {}", err));
1417    if result.is_err() && default_set.is_some() {
1418        return Ok(Column::Custom {
1419            set: default_set.unwrap().to_string(),
1420            key: column.to_string(),
1421        });
1422    } else {
1423        result
1424    }
1425}