1use stam::*;
2use std::borrow::Cow;
3use std::collections::HashMap;
4use std::fmt;
5use std::fs::File;
6use std::io::{BufRead, BufReader};
7
/// A single column of the tabular (TSV) representation.
///
/// Most variants name a fixed piece of information about a result item; the
/// exception is [`Column::Custom`], which refers to the value(s) of
/// annotation data identified by a set and a key.
#[derive(Debug, Clone, PartialEq)]
pub enum Column {
    /// Sequence number of the result row.
    SeqNr,

    /// Name of the query variable the item was bound to.
    VarName,

    /// Type of the item in the row.
    Type,

    /// Identifier of the item in the row.
    Id,

    /// Identifier of the annotation.
    Annotation,

    /// Identifier of the text resource.
    TextResource,

    /// Identifier of the annotation data.
    AnnotationData,

    /// Identifier of the annotation data set.
    AnnotationDataSet,

    /// Offset written as `begin-end`.
    Offset,

    /// Begin offset only.
    BeginOffset,

    /// End offset only.
    EndOffset,

    /// Offset written as `begin-end`, expressed in UTF-8 bytes.
    Utf8Offset,

    /// Begin offset in UTF-8 bytes.
    BeginUtf8Offset,

    /// End offset in UTF-8 bytes.
    EndUtf8Offset,

    /// Identifier of the data key.
    DataKey,

    /// The data value.
    DataValue,

    /// The text itself.
    Text,

    /// Text selection written as `resource#begin-end`.
    TextSelection,

    /// A column whose contents are ignored (always written as the null value).
    Ignore,

    /// Value(s) of the annotation data with the given set and key.
    Custom {
        /// Identifier of the annotation data set.
        set: String,
        /// Identifier of the key within that set.
        key: String,
    },
}
73
/// How the text in a TSV row is checked against the text an annotation
/// actually selects.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum ValidationMode {
    /// The texts must be exactly equal.
    Strict,
    /// The texts must be equal after lowercasing both.
    Loose,
    /// No validation is performed.
    No,
}

impl TryFrom<&str> for ValidationMode {
    type Error = String;

    /// Parses a validation mode from its (case-insensitive) textual form.
    ///
    /// Accepted values are `strict` (or `yes`), `loose` and `no`; anything
    /// else yields an error message mentioning the offending value.
    fn try_from(val: &str) -> Result<Self, Self::Error> {
        let mode = match val.to_lowercase().as_str() {
            "strict" | "yes" => Self::Strict,
            "loose" => Self::Loose,
            "no" => Self::No,
            _ => {
                return Err(format!(
                    "Unknown value for --validate: {}, see --help for allowed values",
                    val
                ))
            }
        };
        Ok(mode)
    }
}
96
97impl Column {
98 pub fn parse(val: &str, setdelimiter: &str) -> Result<Self, String> {
100 if val.find(setdelimiter).is_some() {
101 let (set, key) = val.rsplit_once(setdelimiter).unwrap();
102 Ok(Self::Custom {
103 set: set.to_string(),
104 key: key.to_string(),
105 })
106 } else {
107 let val_lower = val.to_lowercase();
108 match val_lower.as_str() {
109 "type" => Ok(Self::Type),
110 "id" => Ok(Self::Id),
111 "annotationid" | "annotation" => Ok(Self::Annotation),
112 "annotationdatasetid"
113 | "annotationdataset"
114 | "set"
115 | "setid"
116 | "datasetid"
117 | "dataset" => Ok(Self::AnnotationDataSet),
118 "resource" | "resourceid" | "textresource" | "textresources" => {
119 Ok(Self::TextResource)
120 }
121 "annotationdataid" | "dataid" => Ok(Self::AnnotationData),
122 "offset" => Ok(Self::Offset),
123 "beginoffset" | "begin" | "start" | "startoffset" => Ok(Self::BeginOffset),
124 "endoffset" | "end" => Ok(Self::EndOffset),
125 "utf8offset" => Ok(Self::Utf8Offset),
126 "beginutf8offset" | "beginutf8" | "beginbyte" | "startbyte" | "startutf8"
127 | "startutf8offset" => Ok(Self::BeginUtf8Offset),
128 "endutf8offset" | "endutf8" | "endbyte" => Ok(Self::EndUtf8Offset),
129 "datakey" | "key" | "datakeyid" | "keyid" => Ok(Self::DataKey),
130 "datavalue" | "value" => Ok(Self::DataValue),
131 "text" => Ok(Self::Text),
132 "textselections" | "textselection" => Ok(Self::TextSelection),
133 "ignore" => Ok(Self::Ignore),
134 _ => Err(format!(
135 "Unknown column: {}, see --help for allowed values",
136 val
137 )),
138 }
139 }
140 }
141}
142
143impl fmt::Display for Column {
144 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
145 write!(f, "{}", self.to_string())
146 }
147}
148
149#[derive(Clone)]
150struct Context<'a> {
151 id: Option<Cow<'a, str>>,
152 varname: Option<Cow<'a, str>>,
153 seqnr: usize,
154 textselections: Option<&'a Vec<ResultTextSelection<'a>>>,
155 text: Option<&'a str>,
156 annotation: Option<ResultItem<'a, Annotation>>,
157 data: Option<ResultItem<'a, AnnotationData>>,
158 resource: Option<ResultItem<'a, TextResource>>,
159 set: Option<ResultItem<'a, AnnotationDataSet>>,
160 key: Option<ResultItem<'a, DataKey>>,
161 value: Option<&'a DataValue>,
162}
163
164impl<'a> Default for Context<'a> {
165 fn default() -> Self {
166 Context {
167 id: None,
168 varname: None,
169 seqnr: 0,
170 textselections: None, text: None, annotation: None,
173 data: None,
174 resource: None,
175 set: None,
176 key: None,
177 value: None,
178 }
179 }
180}
181
182impl Column {
183 pub fn to_string(&self) -> String {
185 match self {
186 Self::SeqNr => "SeqNr".to_string(),
187 Self::VarName => "Variable".to_string(),
188 Self::Type => "Type".to_string(),
189 Self::Id => "Id".to_string(),
190 Self::Annotation => "Annotation".to_string(),
191 Self::TextResource => "TextResource".to_string(),
192 Self::AnnotationData => "AnnotationData".to_string(),
193 Self::AnnotationDataSet => "AnnotationDataSet".to_string(),
194 Self::Offset => "Offset".to_string(),
195 Self::BeginOffset => "BeginOffset".to_string(),
196 Self::EndOffset => "EndOffset".to_string(),
197 Self::Utf8Offset => "Utf8Offset".to_string(),
198 Self::BeginUtf8Offset => "BeginUtf8Offset".to_string(),
199 Self::EndUtf8Offset => "EndUtf8Offset".to_string(),
200 Self::DataKey => "DataKey".to_string(),
201 Self::DataValue => "DataValue".to_string(),
202 Self::Text => "Text".to_string(),
203 Self::TextSelection => "TextSelection".to_string(),
204 Self::Ignore => "Ignore".to_string(),
205 Self::Custom { set, key } => format!("{}/{}", set, key),
206 }
207 }
208
209 fn print<W: std::io::Write>(
210 &self,
211 writer: &mut W,
212 tp: Type,
213 colnr: usize,
214 col_len: usize,
215 context: &Context,
216 delimiter: &str,
217 null: &str,
218 ) -> Result<(), std::io::Error> {
219 if colnr > 0 {
220 write!(writer, "\t")?;
221 }
222 match self {
223 Column::SeqNr => write!(writer, "{}", context.seqnr)?,
224 Column::VarName => write!(
225 writer,
226 "{}",
227 context.varname.as_ref().unwrap_or(&Cow::Borrowed(null))
228 )?,
229 Column::Type => write!(writer, "{}", tp)?,
230 Column::Id => write!(
231 writer,
232 "{}",
233 context.id.as_ref().unwrap_or(&Cow::Borrowed(null))
234 )?,
235 Column::TextSelection => {
236 if let Some(textselections) = context.textselections {
237 write!(
238 writer,
239 "{}",
240 textselections
241 .iter()
242 .map(|textselection| {
243 format!(
244 "{}#{}-{}",
245 textselection.resource().id().unwrap_or(""),
246 textselection.begin(),
247 textselection.end()
248 )
249 })
250 .collect::<Vec<String>>()
251 .join(delimiter)
252 )?;
253 } else {
254 write!(writer, "{}", null)?
255 }
256 }
257 Column::Offset => {
258 if let Some(textselections) = context.textselections {
259 write!(
260 writer,
261 "{}",
262 textselections
263 .iter()
264 .map(|textselection| {
265 format!("{}-{}", textselection.begin(), textselection.end())
266 })
267 .collect::<Vec<String>>()
268 .join(delimiter)
269 )?;
270 } else {
271 write!(writer, "{}", null)?
272 }
273 }
274 Column::BeginOffset => {
275 if let Some(textselections) = context.textselections {
276 write!(
277 writer,
278 "{}",
279 textselections
280 .iter()
281 .map(|textselection| { format!("{}", textselection.begin()) })
282 .collect::<Vec<String>>()
283 .join(delimiter)
284 )?;
285 } else {
286 write!(writer, "{}", null)?
287 }
288 }
289 Column::EndOffset => {
290 if let Some(textselections) = context.textselections {
291 write!(
292 writer,
293 "{}",
294 textselections
295 .iter()
296 .map(|textselection| { format!("{}", textselection.end()) })
297 .collect::<Vec<String>>()
298 .join(delimiter)
299 )?;
300 } else {
301 write!(writer, "{}", null)?
302 }
303 }
304 Column::Utf8Offset => {
305 if let Some(textselections) = context.textselections {
306 write!(
307 writer,
308 "{}",
309 textselections
310 .iter()
311 .map(|textselection| {
312 format!(
313 "{}-{}",
314 textselection
315 .resource()
316 .utf8byte(textselection.begin())
317 .expect("offset must be valid"),
318 textselection
319 .resource()
320 .utf8byte(textselection.end())
321 .expect("offset must be valid"),
322 )
323 })
324 .collect::<Vec<String>>()
325 .join(delimiter)
326 )?;
327 } else {
328 write!(writer, "{}", null)?
329 }
330 }
331 Column::BeginUtf8Offset => {
332 if let Some(textselections) = context.textselections {
333 write!(
334 writer,
335 "{}",
336 textselections
337 .iter()
338 .map(|textselection| {
339 format!(
340 "{}",
341 textselection
342 .resource()
343 .utf8byte(textselection.begin())
344 .expect("offset must be valid"),
345 )
346 })
347 .collect::<Vec<String>>()
348 .join(delimiter)
349 )?;
350 } else {
351 write!(writer, "{}", null)?
352 }
353 }
354 Column::EndUtf8Offset => {
355 if let Some(textselections) = context.textselections {
356 write!(
357 writer,
358 "{}",
359 textselections
360 .iter()
361 .map(|textselection| {
362 format!(
363 "{}",
364 textselection
365 .resource()
366 .utf8byte(textselection.end())
367 .expect("offset must be valid"),
368 )
369 })
370 .collect::<Vec<String>>()
371 .join(delimiter)
372 )?;
373 } else {
374 write!(writer, "{}", null)?
375 }
376 }
377 Column::Text => {
378 if let Some(text) = context.text {
379 write!(writer, "{}", text)?
380 } else if let Some(textselections) = context.textselections {
381 write!(
382 writer,
383 "{}",
384 textselections
385 .iter()
386 .map(|textselection| textselection.text().replace("\n", " "))
387 .collect::<Vec<String>>()
388 .join(delimiter)
389 )?
390 } else {
391 write!(writer, "{}", null)?
392 }
393 }
394 Column::Annotation => write!(
395 writer,
396 "{}",
397 context
398 .annotation
399 .as_ref()
400 .map(|annotation| annotation
401 .id()
402 .map(|x| x.to_string())
403 .unwrap_or_else(|| annotation.as_ref().temp_id().unwrap()))
404 .unwrap_or(null.to_string())
405 )?,
406 Column::AnnotationData => write!(
407 writer,
408 "{}",
409 context
410 .data
411 .as_ref()
412 .map(|data| data.id().unwrap_or(null))
413 .unwrap_or(null)
414 )?,
415 Column::AnnotationDataSet => write!(
416 writer,
417 "{}",
418 context
419 .set
420 .as_ref()
421 .map(|set| set.id().unwrap_or(null))
422 .unwrap_or(null)
423 )?,
424 Column::TextResource => write!(
425 writer,
426 "{}",
427 context
428 .resource
429 .as_ref()
430 .map(|resource| resource.id().unwrap_or(null))
431 .unwrap_or(null)
432 )?,
433 Column::DataKey => write!(
434 writer,
435 "{}",
436 context
437 .key
438 .as_ref()
439 .map(|key| key.id().unwrap_or(null))
440 .unwrap_or(null)
441 )?,
442 Column::DataValue => write!(
443 writer,
444 "{}",
445 context
446 .value
447 .as_ref()
448 .map(|value| value.to_string())
449 .unwrap_or(null.to_string())
450 )?,
451 Column::Custom { set, key } => {
452 let mut found = false;
453 if let Some(annotation) = &context.annotation {
454 if let Some(key) = annotation.store().key(set.as_str(), key.as_str()) {
455 for (i, annotationdata) in annotation.data().filter_key(&key).enumerate() {
456 found = true;
457 write!(
458 writer,
459 "{}{}",
460 if i > 0 { delimiter } else { "" },
461 annotationdata.value()
462 )?
463 }
464 }
465 }
466 if !found {
467 write!(writer, "{}", null)?
468 }
469 }
470 _ => write!(writer, "{}", null)?,
471 }
472 if colnr == col_len - 1 {
473 write!(writer, "\n")?;
474 }
475 Ok(())
476 }
477}
478
479#[derive(Debug)]
480pub struct Columns(Vec<Column>);
482
483impl Columns {
484 fn printrow<W: std::io::Write>(
485 &self,
486 writer: &mut W,
487 tp: Type,
488 context: &Context,
489 delimiter: &str,
490 null: &str,
491 ) -> Result<(), std::io::Error> {
492 for (i, column) in self.0.iter().enumerate() {
493 column.print(writer, tp, i, self.len(), context, delimiter, null)?;
494 }
495 Ok(())
496 }
497
498 fn printheader<W: std::io::Write>(&self, writer: &mut W) -> Result<(), std::io::Error> {
499 for (i, column) in self.0.iter().enumerate() {
500 if i > 0 {
501 write!(writer, "\t")?;
502 }
503 write!(writer, "{}", column)?;
504 if i == self.len() - 1 {
505 write!(writer, "\n")?;
506 }
507 }
508 Ok(())
509 }
510
511 fn index(&self, coltype: &Column) -> Option<usize> {
512 for (i, col) in self.0.iter().enumerate() {
513 if col == coltype {
514 return Some(i);
515 }
516 }
517 None
518 }
519
520 fn has(&self, coltype: &Column) -> bool {
521 self.index(coltype).is_some()
522 }
523
524 fn len(&self) -> usize {
525 self.0.len()
526 }
527
528 fn iter<'a>(&'a self) -> std::slice::Iter<'a, Column> {
529 self.0.iter()
530 }
531
532 fn add_from_query<'a>(&mut self, query: &Query<'a>) {
533 for constraint in query.iter() {
534 match constraint {
535 Constraint::KeyValue { set, key, .. } | Constraint::DataKey { set, key, .. } => {
536 self.0.push(Column::Custom {
537 set: set.to_string(),
538 key: key.to_string(),
539 })
540 }
541 _ => {}
542 }
543 }
544 for subquery in query.subqueries() {
545 self.add_from_query(subquery);
546 }
547 }
548}
549
550pub fn to_tsv<'a, W: std::io::Write>(
551 store: &'a AnnotationStore,
552 writer: &mut W,
553 query: Query<'a>,
554 columnconfig: &[&str],
555 verbose: bool,
556 delimiter: &str,
557 null: &str,
558 header: bool,
559 setdelimiter: &str,
560 autocolumns: bool,
561) -> Result<(), StamError> {
562 let mut columns = Columns(
563 columnconfig
564 .iter()
565 .map(|col| {
566 Column::parse(*col, setdelimiter)
567 .map_err(|err| {
568 eprintln!("[warning] {}", err);
569 })
570 .unwrap()
571 })
572 .collect(),
573 );
574
575 if autocolumns {
576 if (verbose || query.has_subqueries()) && !columns.0.contains(&Column::SeqNr) {
577 columns.0.insert(0, Column::SeqNr);
579 }
580 if query.has_subqueries() && !columns.0.contains(&Column::VarName) {
581 columns.0.insert(1, Column::VarName);
583 }
584
585 columns.add_from_query(&query);
586 }
587
588 if header {
589 columns.printheader(writer)?;
590 }
591
592 let want_textselections =
593 columns.0.contains(&Column::TextSelection) || columns.0.contains(&Column::Text);
594
595 let iter = store.query(query)?;
596 for (seqnr, resultrow) in iter.enumerate() {
597 let seqnr = seqnr + 1; for (result, varname) in resultrow.iter().zip(resultrow.names()) {
599 match result {
600 QueryResultItem::None | QueryResultItem::AnnotationSubStore(..) => {}
601 QueryResultItem::Annotation(annotation) => {
602 let textselections: Option<Vec<_>> = if want_textselections {
603 Some(annotation.textselections().collect())
604 } else {
605 None
606 };
607 let context = Context {
608 id: if let Some(id) = annotation.id() {
609 Some(Cow::Borrowed(id))
610 } else {
611 Some(Cow::Owned(annotation.as_ref().temp_id().unwrap()))
612 },
613 seqnr,
614 varname: varname.map(|s| Cow::Borrowed(s)),
615 annotation: Some(annotation.clone()), textselections: textselections.as_ref(),
617 ..Context::default()
618 };
619 columns.printrow(writer, Type::Annotation, &context, delimiter, null)?;
620 if verbose {
621 for data in annotation.data() {
622 let context = Context {
623 id: data.id().map(|x| Cow::Borrowed(x)),
624 seqnr,
625 annotation: Some(annotation.clone()),
626 key: Some(data.key()),
627 data: Some(data.clone()),
628 set: Some(data.set()),
629 value: Some(data.value()),
630 ..Context::default()
631 };
632 columns.printrow(
633 writer,
634 Type::AnnotationData,
635 &context,
636 delimiter,
637 null,
638 )?;
639 }
640 }
641 }
642 QueryResultItem::AnnotationData(data) => {
643 let context = Context {
644 id: data.id().map(|x| Cow::Borrowed(x)),
645 seqnr,
646 varname: varname.map(|s| Cow::Borrowed(s)),
647 set: Some(data.set()),
648 key: Some(data.key()),
649 value: Some(data.value()),
650 ..Context::default()
651 };
652 columns.printrow(writer, Type::AnnotationData, &context, delimiter, null)?;
653 }
654 QueryResultItem::DataKey(key) => {
655 let context = Context {
656 id: key.id().map(|x| Cow::Borrowed(x)),
657 seqnr,
658 varname: varname.map(|s| Cow::Borrowed(s)),
659 set: Some(key.set()),
660 key: Some(key.clone()),
661 ..Context::default()
662 };
663 columns.printrow(writer, Type::DataKey, &context, delimiter, null)?;
664 }
665 QueryResultItem::AnnotationDataSet(dataset) => {
666 let context = Context {
667 id: dataset.id().map(|x| Cow::Borrowed(x)),
668 seqnr,
669 varname: varname.map(|s| Cow::Borrowed(s)),
670 set: Some(dataset.clone()),
671 ..Context::default()
672 };
673 columns.printrow(writer, Type::AnnotationDataSet, &context, delimiter, null)?;
674 if verbose {
675 for key in dataset.keys() {
676 let context = Context {
677 id: key.id().map(|x| Cow::Borrowed(x)),
678 seqnr,
679 set: Some(key.set()),
680 key: Some(key.clone()),
681 ..Context::default()
682 };
683 columns.printrow(writer, Type::DataKey, &context, delimiter, null)?;
684 }
685 for data in dataset.data() {
686 let context = Context {
687 id: data.id().map(|x| Cow::Borrowed(x)),
688 seqnr,
689 set: Some(data.set()),
690 key: Some(data.key()),
691 value: Some(data.value()),
692 ..Context::default()
693 };
694 columns.printrow(
695 writer,
696 Type::AnnotationData,
697 &context,
698 delimiter,
699 null,
700 )?;
701 }
702 }
703 }
704 QueryResultItem::TextResource(resource) => {
705 let context = Context {
706 id: resource.id().map(|x| Cow::Borrowed(x)),
707 varname: varname.map(|s| Cow::Borrowed(s)),
708 seqnr,
709 resource: Some(resource.clone()),
710 ..Context::default()
711 };
712 columns.printrow(writer, Type::TextResource, &context, delimiter, null)?;
713 }
714 QueryResultItem::TextSelection(textselection) => {
715 let id = format!(
716 "{}#{}-{}",
717 textselection.resource().id().unwrap_or(""),
718 textselection.begin(),
719 textselection.end()
720 );
721 let text = Some(textselection.text());
722 let textselections: Vec<ResultTextSelection> = vec![textselection.clone()];
723 let context = Context {
724 id: Some(Cow::Owned(id)),
725 seqnr,
726 varname: varname.map(|s| Cow::Borrowed(s)),
727 resource: Some(textselection.resource()),
728 textselections: Some(&textselections),
729 text,
730 ..Context::default()
731 };
732 columns.printrow(writer, Type::TextSelection, &context, delimiter, null)?;
733 }
734 }
735 }
736 }
737 Ok(())
738}
739
/// The strategy used to turn TSV rows into annotations.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum ParseMode {
    /// Offset information is present in the data; each row is turned into an
    /// annotation on the indicated offsets.
    Simple,
    /// There is a text column and a resource, but no offsets, and rows are
    /// sequential: the text of each row is located in the resource.
    AlignWithText,
    /// There is a text column but neither offsets nor a resource, and rows
    /// are sequential: the text is first built up from the rows.
    ReconstructText,
    /// There is a text column and a resource, but no offsets, and rows are
    /// not sequential.
    MultiTag,
    /// The data has neither text nor offsets and is interpreted as metadata.
    Metadata,
}
754
755impl ParseMode {
756 pub fn new(
759 columns: &Columns,
760 existing_resource: Option<&str>,
761 sequential: bool,
762 ) -> Result<Self, &'static str> {
763 if columns.has(&Column::Text) {
764 if columns.has(&Column::Offset)
765 || (columns.has(&Column::BeginOffset) && columns.has(&Column::EndOffset))
766 || columns.has(&Column::TextSelection)
767 {
768 Ok(Self::Simple)
769 } else {
770 if columns.has(&Column::TextResource)
772 || existing_resource.is_some()
773 || columns.has(&Column::TextSelection)
774 {
775 if sequential {
776 Ok(Self::AlignWithText)
777 } else {
778 Ok(Self::MultiTag)
779 }
780 } else {
781 if sequential {
782 Ok(Self::ReconstructText)
783 } else {
784 Err("Can not reconstruct a text if rows in input data are not sequential")
785 }
786 }
787 }
788 } else if columns.has(&Column::TextResource) || existing_resource.is_some() {
789 if columns.has(&Column::Offset)
790 || (columns.has(&Column::BeginOffset) && columns.has(&Column::EndOffset))
791 || columns.has(&Column::TextSelection)
792 {
793 Ok(Self::Simple)
794 } else {
795 Err("Unable to determine how to parse this data based on the available columns. Make sure there is at least an Offset column (or BeginOffset, EndOffset columns)")
796 }
797 } else if !columns.has(&Column::Offset)
798 && !columns.has(&Column::BeginOffset)
799 && !columns.has(&Column::EndOffset)
800 && !columns.has(&Column::TextSelection)
801 {
802 if columns.has(&Column::TextResource) || existing_resource.is_some() {
803 eprintln!("Warning: Data has neither a Text nor an Offset column, interpreting data as metadata");
804 Ok(Self::Metadata)
805 } else {
806 Err("Data has neither a Text nor an Offset column")
807 }
808 } else {
809 Err("Unable to determine how to parse this data based on the available columns. Make sure there is at least an Offset, Text or Resource column (or supply --resource)")
810 }
811 }
812}
813
814pub fn from_tsv(
816 store: &mut AnnotationStore,
817 filename: &str,
818 columnconfig: Option<&Vec<&str>>,
819 existing_resource: Option<&str>,
820 new_resource: Option<&str>,
821 default_set: Option<&str>,
822 comments: bool,
823 sequential: bool,
824 case_sensitive: bool,
825 escape: bool,
826 nullvalue: &str,
827 subdelimiter: &str, setdelimiter: &str, outputdelimiter: &str, outputdelimiter2: &str, header: Option<bool>, validation: ValidationMode,
833 verbose: bool,
834) -> Result<(), String> {
835 let f =
836 File::open(filename).map_err(|e| format!("Error opening TSV file {}: {}", filename, e))?;
837 let reader = BufReader::new(f);
838
839 let mut columns: Option<Columns> = None;
840 let mut parsemode: Option<ParseMode> = None;
841 let mut cursors: HashMap<TextResourceHandle, usize> = HashMap::new(); let mut buffer: Vec<String> = Vec::new(); let mut bufferbegin: usize = 0; let mut texts: HashMap<String, String> = HashMap::new(); let mut buffered_delimiter: Option<String> = None; for (i, line) in reader.lines().enumerate() {
848 if let Ok(line) = line {
849 if line.is_empty() {
850 buffered_delimiter = Some(outputdelimiter2.to_string()); } else if comments && !line.is_empty() && &line.get(0..1) == &Some("#") {
852 continue;
854 } else if i == 0 && columns.is_none() && header != Some(false) {
855 if verbose {
856 eprintln!("Parsing first row as header...")
857 }
858 columns = Some(
859 Columns(
860 line.split("\t")
861 .map(|col| {
862 parse_column(col, default_set, setdelimiter).map_err(|err| {
863 eprintln!("[warning] Unable to parse first line of TSV file as header (please provide a column configuration explicitly if the input file has none): {}. You may consider setting --annotationset if you want to interpret this column as a key in the specified annotationset", err);
864 }).unwrap()
865 })
866 .collect(),
867 )
868 );
869 parsemode = Some(
870 ParseMode::new(columns.as_ref().unwrap(), existing_resource, sequential)
871 .map_err(|e| format!("Can't determine parse mode: {}", e))?,
872 );
873 if verbose {
874 eprintln!("Columns: {:?}", columns.as_ref().unwrap());
875 eprintln!("Parse mode: {:?}", parsemode.unwrap());
876 }
877 } else if i == 0 && columns.is_some() && header != Some(false) {
878 if verbose {
879 eprintln!("Skipping first row (assuming to be a header)...")
880 }
881 continue; } else {
883 if columns.is_none() {
884 if columnconfig.is_none() {
885 return Err(format!("Please provide a configuration for the columns"));
886 }
887 columns = Some(Columns(
888 columnconfig
889 .unwrap()
890 .iter()
891 .map(|col| {
892 parse_column(col, default_set, setdelimiter)
893 .map_err(|err| {
894 eprintln!(
895 "[warning] Unable to parse provided column: {}",
896 err
897 );
898 })
899 .unwrap()
900 })
901 .collect(),
902 ));
903 parsemode = Some(
904 ParseMode::new(columns.as_ref().unwrap(), existing_resource, sequential)
905 .map_err(|e| format!("Can't determine parse mode: {}", e))?,
906 );
907 if verbose {
908 eprintln!("Columns: {:?}", columns.as_ref().unwrap());
909 eprintln!("Parse mode: {:?}", parsemode.unwrap())
910 }
911 }
912 if let (Some(columns), Some(parsemode)) = (&columns, parsemode) {
913 if parsemode == ParseMode::ReconstructText {
914 if let Err(e) = reconstruct_text(
915 &line,
916 &columns,
917 &mut texts,
918 existing_resource,
919 new_resource,
920 outputdelimiter,
921 &mut buffered_delimiter,
922 ) {
923 return Err(format!(
924 "Error reconstructing text (line {}): {}",
925 i + 1,
926 e
927 ));
928 }
929 if buffer.is_empty() {
930 bufferbegin = i;
931 }
932 buffer.push(line);
933 } else if let Err(e) = parse_row(
934 store,
935 &line,
936 &columns,
937 parsemode,
938 subdelimiter,
939 existing_resource,
940 new_resource,
941 default_set,
942 case_sensitive,
943 escape,
944 nullvalue,
945 validation,
946 &mut cursors,
947 ) {
948 return Err(format!("Error parsing tsv line {}: {}", i + 1, e));
949 }
950 }
951 }
952 }
953 }
954
955 if parsemode == Some(ParseMode::ReconstructText) {
956 if verbose {
957 eprintln!("Creating resources...");
958 }
959 for (filename, text) in texts {
960 if verbose {
961 eprintln!("Creating resource {} (length={})", filename, text.len());
962 }
963 if let Err(e) = store.add_resource(
964 TextResourceBuilder::new()
965 .with_text(text)
966 .with_filename(&filename),
967 ) {
968 return Err(format!("Error loading/adding resource: {}", e));
969 }
970 }
971 if verbose {
972 eprintln!("Parsing buffered rows...");
973 }
974 let parsemode = ParseMode::AlignWithText;
975 let columns = columns.unwrap();
976 for (i, line) in buffer.iter().enumerate() {
977 if let Err(e) = parse_row(
978 store,
979 &line,
980 &columns,
981 parsemode,
982 subdelimiter,
983 existing_resource,
984 new_resource,
985 default_set,
986 case_sensitive,
987 escape,
988 nullvalue,
989 validation,
990 &mut cursors,
991 ) {
992 return Err(format!(
993 "Error parsing tsv line {}: {}",
994 i + bufferbegin + 1,
995 e
996 ));
997 }
998 }
999 }
1000 Ok(())
1001}
1002
1003fn reconstruct_text(
1004 line: &str,
1005 columns: &Columns,
1006 texts: &mut HashMap<String, String>,
1007 existing_resource: Option<&str>,
1008 new_resource: Option<&str>,
1009 output_delimiter: &str,
1010 buffered_delimiter: &mut Option<String>,
1011) -> Result<(), String> {
1012 let cells: Vec<&str> = line.split("\t").collect();
1013 if cells.len() != columns.len() {
1014 return Err(format!(
1015 "Number of cells is not equal to number of columns in header ({} vs {})",
1016 cells.len(),
1017 columns.len()
1018 ));
1019 }
1020 let resource_file: &str =
1021 parse_resource_file(&cells, columns, existing_resource, new_resource)?;
1022 let textcolumn = columns.index(&Column::Text);
1023 if !texts.contains_key(resource_file) {
1024 texts.insert(resource_file.to_string(), String::new());
1025 }
1026 if let Some(text) = texts.get_mut(resource_file) {
1027 if let Some(buffered_delimiter) = buffered_delimiter {
1028 text.push_str(&buffered_delimiter);
1029 }
1030 text.push_str(&cells[textcolumn.expect("there must be a text column")]);
1031 *buffered_delimiter = Some(output_delimiter.to_string());
1032 }
1033 Ok(())
1034}
1035
1036fn parse_row(
1038 store: &mut AnnotationStore,
1039 line: &str,
1040 columns: &Columns,
1041 parsemode: ParseMode,
1042 subdelimiter: &str,
1043 existing_resource: Option<&str>,
1044 new_resource: Option<&str>,
1045 default_set: Option<&str>,
1046 case_sensitive: bool,
1047 escape: bool,
1048 nullvalue: &str,
1049 validation: ValidationMode,
1050 cursors: &mut HashMap<TextResourceHandle, usize>,
1051) -> Result<(), String> {
1052 let cells: Vec<&str> = line.split("\t").collect();
1053 if cells.len() != columns.len() {
1054 return Err(format!(
1055 "Number of cells is not equal to number of columns in header ({} vs {})",
1056 cells.len(),
1057 columns.len()
1058 ));
1059 }
1060 let resource_file: &str =
1061 parse_resource_file(&cells, columns, existing_resource, new_resource)?;
1062 let resource_handle: TextResourceHandle = get_resource_handle(store, resource_file)?;
1063 let textcolumn = columns.index(&Column::Text);
1064 let selector = match parsemode {
1065 ParseMode::Simple => build_selector(&cells, columns, resource_handle)?,
1066 ParseMode::AlignWithText => align_with_text(
1067 store,
1068 resource_handle,
1069 &cells,
1070 textcolumn.expect("text column is required when parsemode is set to AlignWithText"),
1071 case_sensitive,
1072 cursors,
1073 )?,
1074 _ => return Err("Not implemented yet".to_string()),
1075 };
1076 let mut annotationbuilder = build_annotation(
1077 &cells,
1078 columns,
1079 default_set,
1080 subdelimiter,
1081 escape,
1082 nullvalue,
1083 )?;
1084 annotationbuilder = annotationbuilder.with_target(selector);
1085 match store.annotate(annotationbuilder) {
1086 Err(e) => return Err(format!("{}", e)),
1087 Ok(handle) => {
1088 if parsemode == ParseMode::Simple {
1089 if let Some(textcolumn) = textcolumn {
1090 validate_text(store, handle, &cells, textcolumn, validation)?;
1091 }
1092 }
1093 }
1094 }
1095 Ok(())
1096}
1097
1098fn align_with_text<'a>(
1099 store: &AnnotationStore,
1100 resource_handle: TextResourceHandle,
1101 cells: &[&str],
1102 textcolumn: usize,
1103 case_sensitive: bool,
1104 cursors: &mut HashMap<TextResourceHandle, usize>,
1105) -> Result<SelectorBuilder<'a>, String> {
1106 let textfragment = cells[textcolumn];
1107 if textfragment.is_empty() {
1108 return Err("Value in text column can not be empty".to_string());
1109 }
1110 let cursor = cursors.entry(resource_handle).or_insert(0);
1111 let resource = store
1112 .resource(&BuildItem::from(resource_handle))
1113 .expect("resource must exist");
1114 let searchtext = resource
1115 .textselection(&Offset::new(
1116 Cursor::BeginAligned(*cursor),
1117 Cursor::EndAligned(0),
1118 ))
1119 .map_err(|e| format!("{}", e))?;
1120 if let Some(foundtextselection) = if case_sensitive {
1121 searchtext.find_text(textfragment).next()
1122 } else {
1123 searchtext.find_text_nocase(textfragment).next() } {
1125 *cursor = foundtextselection.end();
1126 Ok(SelectorBuilder::textselector(
1127 resource_handle,
1128 Offset::simple(foundtextselection.begin(), foundtextselection.end()),
1129 ))
1130 } else {
1131 return Err(format!(
1132 "Unable to align specified text with the underlying resource: '{}' (lost track after character position {})",
1133 textfragment,
1134 *cursor
1135 ));
1136 }
1137}
1138
1139fn validate_text(
1140 store: &AnnotationStore,
1141 annotation_handle: AnnotationHandle,
1142 cells: &[&str],
1143 textcolumn: usize,
1144 validation: ValidationMode,
1145) -> Result<(), String> {
1146 if validation == ValidationMode::No {
1147 return Ok(());
1148 }
1149 if let Some(annotation) = store.annotation(annotation_handle) {
1150 let text: Vec<&str> = annotation.text().collect();
1151 if text.is_empty() {
1152 return Err("No text found".to_string());
1153 } else if text.len() == 1 {
1154 if !match validation {
1155 ValidationMode::Strict => {
1156 &text[0] == cells.get(textcolumn).expect("cell must exist")
1157 }
1158 ValidationMode::Loose => {
1159 text[0].to_lowercase()
1160 == cells
1161 .get(textcolumn)
1162 .expect("cell must exist")
1163 .to_lowercase()
1164 }
1165 ValidationMode::No => true,
1166 } {
1167 return Err(format!(
1168 "Text validation failed, TSV expects '{}', data has '{}'",
1169 cells.get(textcolumn).unwrap(),
1170 &text[0]
1171 ));
1172 }
1173 } else {
1174 let text: String = text.join(" ");
1175 if !match validation {
1176 ValidationMode::Strict => {
1177 &text.as_str() == cells.get(textcolumn).expect("cell must exist")
1178 }
1179 ValidationMode::Loose => {
1180 text.to_lowercase()
1181 == cells
1182 .get(textcolumn)
1183 .expect("cell must exist")
1184 .to_lowercase()
1185 }
1186 ValidationMode::No => true,
1187 } {
1188 return Err(format!(
1189 "Text validation failed, TSV expects '{}', data has '{}'",
1190 cells.get(textcolumn).unwrap(),
1191 &text.as_str()
1192 ));
1193 }
1194 }
1195 } else {
1196 return Err("Annotation not found (should never happen)".to_string());
1197 }
1198 Ok(())
1199}
1200
/// Resolves backslash escape sequences in a value.
///
/// * `\n` becomes a newline
/// * `\t` becomes a tab
/// * `\\` becomes a single backslash
///
/// Any other escape sequence, as well as a lone backslash at the very end of
/// the input, is kept verbatim.
fn unescape(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars();
    while let Some(c) = chars.next() {
        if c != '\\' {
            result.push(c);
            continue;
        }
        // A backslash introduces an escape; the character following it
        // decides what is emitted.
        match chars.next() {
            Some('n') => result.push('\n'),
            Some('t') => result.push('\t'),
            Some('\\') => result.push('\\'),
            Some(other) => {
                result.push('\\');
                result.push(other);
            }
            None => result.push('\\'),
        }
    }
    result
}
1226
1227fn build_annotation<'a>(
1228 cells: &'a [&'a str],
1229 columns: &Columns,
1230 default_set: Option<&'a str>,
1231 subdelimiter: &str,
1232 escape: bool,
1233 nullvalue: &str,
1234) -> Result<AnnotationBuilder<'a>, String> {
1235 let mut annotationbuilder = AnnotationBuilder::new();
1236 if let Some(i) = columns.index(&Column::Id) {
1237 let id = cells.get(i).expect("cell must exist");
1238 annotationbuilder = annotationbuilder.with_id(id.to_string());
1239 } else if let Some(i) = columns.index(&Column::Annotation) {
1240 let id = cells.get(i).expect("cell must exist");
1242 annotationbuilder = annotationbuilder.with_id(id.to_string());
1243 } else if let (Some(ikey), Some(ivalue)) = (
1244 columns.index(&Column::DataKey),
1245 columns.index(&Column::DataValue),
1246 ) {
1247 let mut databuilder = AnnotationDataBuilder::new();
1248 if let Some(i) = columns.index(&Column::AnnotationData) {
1249 let id = cells.get(i).expect("cell must exist");
1250 databuilder = databuilder.with_id(BuildItem::IdRef(id));
1251 } else if let Some(default_set) = default_set {
1252 databuilder = databuilder.with_id(BuildItem::IdRef(default_set));
1253 }
1254 if let Some(i) = columns.index(&Column::AnnotationDataSet) {
1255 let set = cells.get(i).expect("cell must exist");
1256 databuilder = databuilder.with_dataset(BuildItem::Id(set.to_string()));
1257 }
1258 let key = cells.get(ikey).expect("cell must exist");
1259 let value = cells.get(ivalue).expect("cell must exist");
1260 if !value.is_empty() && *value != nullvalue {
1261 if value.find(subdelimiter).is_some() {
1262 for value in value.split(subdelimiter) {
1263 let mut multidatabuilder = AnnotationDataBuilder::new();
1264 if let Some(i) = columns.index(&Column::AnnotationDataSet) {
1265 let set = cells.get(i).expect("cell must exist");
1266 multidatabuilder =
1267 multidatabuilder.with_dataset(BuildItem::Id(set.to_string()));
1268 }
1269 multidatabuilder = multidatabuilder.with_key(BuildItem::from(*key));
1270 if escape {
1271 multidatabuilder =
1272 multidatabuilder.with_value(DataValue::from(unescape(value)));
1273 } else {
1274 multidatabuilder = multidatabuilder.with_value(DataValue::from(value));
1275 }
1276 annotationbuilder = annotationbuilder.with_data_builder(multidatabuilder);
1277 }
1278 } else {
1279 databuilder = databuilder.with_key(BuildItem::from(*key));
1280 if escape {
1281 databuilder = databuilder.with_value(DataValue::from(unescape(value)));
1282 } else {
1283 databuilder = databuilder.with_value(DataValue::from(*value));
1284 }
1285 annotationbuilder = annotationbuilder.with_data_builder(databuilder);
1286 }
1287 }
1288 }
1289 for (column, cell) in columns.iter().zip(cells.iter()) {
1291 if let Column::Custom { set, key } = column {
1292 if cell.find(subdelimiter).is_some() {
1293 for value in cell.split(subdelimiter) {
1294 let value: DataValue = if escape {
1295 unescape(value).into()
1296 } else {
1297 value.into()
1298 };
1299 let databuilder = AnnotationDataBuilder::new()
1300 .with_dataset(BuildItem::Id(set.clone()))
1301 .with_key(BuildItem::Id(key.clone()))
1302 .with_value(value);
1303 annotationbuilder = annotationbuilder.with_data_builder(databuilder);
1304 }
1305 } else {
1306 let value: DataValue = if escape {
1307 unescape(cell).into()
1308 } else {
1309 (*cell).into()
1310 };
1311 let databuilder = AnnotationDataBuilder::new()
1312 .with_dataset(BuildItem::Id(set.clone()))
1313 .with_key(BuildItem::Id(key.clone()))
1314 .with_value(value);
1315 annotationbuilder = annotationbuilder.with_data_builder(databuilder);
1316 }
1317 }
1318 }
1319 Ok(annotationbuilder)
1320}
1321
1322fn parse_resource_file<'a>(
1323 cells: &[&'a str],
1324 columns: &Columns,
1325 existing_resource: Option<&'a str>,
1326 new_resource: Option<&'a str>,
1327) -> Result<&'a str, String> {
1328 if let Some(i) = columns.index(&Column::TextResource) {
1329 Ok(cells.get(i).expect("cell must exist"))
1330 } else if let Some(i) = columns.index(&Column::TextSelection) {
1331 let textselection = cells.get(i).expect("cell must exist");
1332 if let Some(bytepos) = textselection.find('#') {
1333 Ok(&textselection[..bytepos])
1334 } else {
1335 Err("Text selection must have format: resource#beginoffset-endoffset".to_string())
1336 }
1337 } else if let Some(existing_resource) = existing_resource {
1338 Ok(existing_resource)
1339 } else if let Some(new_resource) = new_resource {
1340 Ok(new_resource)
1341 } else {
1342 Err(
1343 "Can't find resource (data doesn't make an explicit reference to it). You may want to specify a default (existing) resource using --resource"
1344 .to_string(),
1345 )
1346 }
1347}
1348
1349fn get_resource_handle(
1350 store: &mut AnnotationStore,
1351 filename: &str,
1352) -> Result<TextResourceHandle, String> {
1353 if let Some(resource) = store.resource(filename) {
1354 return Ok(resource.handle());
1355 }
1356 store
1357 .add_resource(TextResourceBuilder::new().with_filename(filename))
1358 .map_err(|e| format!("Specified resource not found: {}: {}", filename, e))
1359}
1360
1361fn build_selector<'a>(
1362 cells: &[&str],
1363 columns: &Columns,
1364 resource_handle: TextResourceHandle,
1365) -> Result<SelectorBuilder<'a>, String> {
1366 let offset = parse_offset(cells, columns)?;
1368 Ok(SelectorBuilder::textselector(resource_handle, offset))
1369}
1370
1371fn parse_offset(cells: &[&str], columns: &Columns) -> Result<Offset, String> {
1372 if let Some(ioffset) = columns.index(&Column::Offset) {
1373 let cell = cells.get(ioffset).expect("cell must exist");
1374 if let Some(delimiterpos) = &cell[1..].find('-') {
1375 let delimiterpos = *delimiterpos + 1; let begin_str = &cell[0..delimiterpos];
1377 let end_str = &cell[(delimiterpos + 1)..];
1378 let begin: Cursor = begin_str.try_into().map_err(|e| format!("{}", e))?;
1379 let end: Cursor = end_str.try_into().map_err(|e| format!("{}", e))?;
1380 return Ok(Offset::new(begin, end));
1381 }
1382 Err("Offset must have format: beginoffset-endoffset".to_string())
1383 } else if let (Some(b), Some(e)) = (
1384 columns.index(&Column::BeginOffset),
1385 columns.index(&Column::EndOffset),
1386 ) {
1387 let begin_str = cells.get(b).expect("cell must exist");
1388 let end_str = cells.get(e).expect("cell must exist");
1389 let begin: Cursor = (*begin_str).try_into().map_err(|e| format!("{}", e))?;
1390 let end: Cursor = (*end_str).try_into().map_err(|e| format!("{}", e))?;
1391 Ok(Offset::new(begin, end))
1392 } else if let Some(i) = columns.index(&Column::TextSelection) {
1393 let textselection = cells.get(i).expect("cell must exist");
1394 if let Some(bytepos) = textselection.find('#') {
1395 if let Some(delimiterpos) = &textselection[(bytepos + 2)..].find('-') {
1396 let delimiterpos = *delimiterpos + bytepos + 2; let begin_str = &textselection[(bytepos + 1)..delimiterpos];
1398 let end_str = &textselection[(delimiterpos + 1)..];
1399 let begin: Cursor = (*begin_str).try_into().map_err(|e| format!("{}", e))?;
1400 let end: Cursor = (*end_str).try_into().map_err(|e| format!("{}", e))?;
1401 return Ok(Offset::new(begin, end));
1402 }
1403 }
1404 Err("Text selection must have format: resource#beginoffset-endoffset".to_string())
1405 } else {
1406 Err(format!("No offset information found"))
1407 }
1408}
1409
1410fn parse_column(
1411 column: &str,
1412 default_set: Option<&str>,
1413 setdelimiter: &str,
1414) -> Result<Column, String> {
1415 let result = Column::parse(column, setdelimiter)
1416 .map_err(|err| format!("Unable to parse provided columns: {}", err));
1417 if result.is_err() && default_set.is_some() {
1418 return Ok(Column::Custom {
1419 set: default_set.unwrap().to_string(),
1420 key: column.to_string(),
1421 });
1422 } else {
1423 result
1424 }
1425}