stamtools/
align.rs

1use stam::*;
2
3use seal::pair::{AlignmentSet, InMemoryAlignmentMatrix, NeedlemanWunsch, SmithWaterman, Step};
4use std::str::FromStr;
5
6const TRIM_CHARS: [char; 4] = [' ', '\n', '\t', '\r'];
7
8#[derive(Clone, Debug)]
9pub struct AlignmentConfig {
10    /// Case-insensitive matching has more performance overhead
11    pub case_sensitive: bool,
12
13    // The Alignment algorithm
14    pub algorithm: AlignmentAlgorithm,
15
16    /// Prefix to use when assigning annotation IDs. The actual ID will have a random component
17    pub annotation_id_prefix: Option<String>,
18
19    /// Strip leading and trailing whitespace/newlines from aligned text selections, keeping them as minimal as possible (default is to be as greedy as possible in selecting)
20    /// Setting this may lead to certain whitespaces not being covered even though they may align.
21    pub trim: bool,
22
23    /// Only allow for alignments that consist of one contiguous text selection on either side. This is a so-called simple transposition.
24    pub simple_only: bool,
25
26    /// The minimal number of characters that must be aligned (absolute number) for a transposition/translation to be valid
27    pub minimal_align_length: usize,
28
29    /// The maximum number of errors (max edit distance) that may occur for a transposition to be valid.
30    /// This is either an absolute integer or a relative ratio between 0.0 and 1.0, interpreted in relation to the length of the first text in the alignment.
31    /// In other words; this represents the number of characters in the search string that may be missed when matching in the larger text.
32    /// The transposition itself will only consist of fully matching parts, use `grow` if you want to include non-matching parts.
33    pub max_errors: Option<AbsoluteOrRelative>,
34
35    /// Grow aligned parts into larger alignments by incorporating non-matching parts. This will return translations rather than transpositions.
36    /// You'll want to set `max_errors` in combination with this one to prevent very low-quality alignments.
37    pub grow: bool,
38
39    /// Output alignments to standard output in a TSV format
40    pub verbose: bool,
41
42    /// Limit output
43    pub quiet: bool,
44}
45
46impl Default for AlignmentConfig {
47    fn default() -> Self {
48        Self {
49            case_sensitive: true,
50            algorithm: AlignmentAlgorithm::default(),
51            annotation_id_prefix: None,
52            minimal_align_length: 0,
53            max_errors: None,
54            trim: false,
55            simple_only: false,
56            verbose: false,
57            grow: false,
58            quiet: false,
59        }
60    }
61}
62
63#[derive(Clone, Debug)]
64pub enum AlignmentAlgorithm {
65    /// Needleman-Wunsch, global sequence alignment
66    NeedlemanWunsch {
67        equal: isize,
68        align: isize,
69        insert: isize,
70        delete: isize,
71    },
72    /// Smith-Waterman, local sequence alignment
73    SmithWaterman {
74        equal: isize,
75        align: isize,
76        insert: isize,
77        delete: isize,
78    },
79}
80
81impl Default for AlignmentAlgorithm {
82    fn default() -> Self {
83        Self::SmithWaterman {
84            equal: 2,
85            align: -1,
86            insert: -1,
87            delete: -1,
88        }
89    }
90}
91
92/// Aligns the texts of two queries
93/// and adds transposition annotations for each possible combination of the two
94/// Returns the transpositions added
95pub fn align<'store>(
96    store: &'store mut AnnotationStore,
97    query: Query<'store>,
98    queries2: Vec<Query<'store>>,
99    use_var: Option<&str>,
100    use_var2: Option<&str>,
101    config: &AlignmentConfig,
102) -> Result<Vec<AnnotationHandle>, StamError> {
103    let mut buildtranspositions = Vec::new();
104    {
105        let iter = store.query(query)?;
106        for resultrow in iter {
107            if let Ok(result) = resultrow.get_by_name_or_last(use_var) {
108                for (i, query2raw) in queries2.iter().enumerate() {
109                    //MAYBE TODO: this could be parallellized (but memory may be a problem then)
110                    if !config.quiet {
111                        eprintln!("Aligning #{}/{}...", i + 1, queries2.len());
112                    }
113                    let (text, query2) = match result {
114                        QueryResultItem::TextResource(resource) => ( resource.clone().to_textselection(), query2raw.clone().with_resourcevar(use_var.unwrap_or("resource"), resource)),
115                        QueryResultItem::Annotation(annotation) => {
116                            if let Some(tsel) = annotation.textselections().next() {
117                                (tsel, query2raw.clone().with_annotationvar(use_var.unwrap_or("annotation"), annotation))
118                            } else {
119                                return Err(StamError::OtherError("Annotation references multiple texts, this is not supported yet by stam align"));
120                            }
121                        }
122                        QueryResultItem::TextSelection(tsel) => ( tsel.clone(), query2raw.clone().with_textvar(use_var.unwrap_or("text"), tsel)),
123                        _ => return Err(StamError::OtherError("Obtained result type can not by used by stam align, expected ANNOTATION, RESOURCE or TEXT"))
124                    };
125
126                    let iter2 = store.query(query2)?;
127                    for resultrow2 in iter2 {
128                        if let Ok(result) = resultrow2.get_by_name_or_last(use_var2) {
129                            let text2 = match result {
130                            QueryResultItem::TextResource(resource) => resource.clone().to_textselection(),
131                            QueryResultItem::Annotation(annotation) => {
132                                if let Some(tsel) = annotation.textselections().next() {
133                                    tsel
134                                } else {
135                                    return Err(StamError::OtherError("Annotation references multiple texts, this is not supported yet by stam align"));
136                                }
137                            }
138                            QueryResultItem::TextSelection(tsel) => tsel.clone(),
139                            _ => return Err(StamError::OtherError("Obtained result type can not by used by stam align, expected ANNOTATION, RESOURCE or TEXT"))
140                        };
141                            let builders = align_texts(&text, &text2, config)?;
142                            buildtranspositions.extend(builders);
143                        } else if let Some(use_var2) = use_var2 {
144                            return Err(StamError::QuerySyntaxError(
145                                format!(
146                                    "No result found for variable {}, so nothing to align",
147                                    use_var2
148                                ),
149                                "(align)",
150                            ));
151                        }
152                    }
153                }
154            } else if let Some(use_var) = use_var {
155                return Err(StamError::QuerySyntaxError(
156                    format!(
157                        "No result found for variable {}, so nothing to align",
158                        use_var
159                    ),
160                    "(align)",
161                ));
162            }
163        }
164    }
165    let mut transpositions = Vec::with_capacity(buildtranspositions.len());
166    for builder in buildtranspositions {
167        transpositions.push(store.annotate(builder)?);
168    }
169    if !config.quiet {
170        eprintln!("{} annotations(s) created", transpositions.len());
171    }
172    Ok(transpositions)
173}
174
175#[derive(Clone, PartialEq, Debug)]
176struct AlignedFragment {
177    begin1: usize,
178    begin2: usize,
179    length: usize,
180}
181
182impl AlignedFragment {
183    fn to_offsets<'a>(&self) -> (Offset, Offset) {
184        (
185            Offset::simple(self.begin1, self.begin1 + self.length),
186            Offset::simple(self.begin2, self.begin2 + self.length),
187        )
188    }
189
190    /// Returns None when equal, an index number where the strings differ
191    fn strdiff(
192        &self,
193        textstring1: &str,
194        textstring2: &str,
195        config: &AlignmentConfig,
196    ) -> Option<usize> {
197        for (i, (c1, c2)) in textstring1.chars().zip(textstring2.chars()).enumerate() {
198            if c1 != c2 {
199                if config.case_sensitive {
200                    return Some(i);
201                } else {
202                    if c1.to_lowercase().to_string() != c2.to_lowercase().to_string() {
203                        return Some(i);
204                    }
205                }
206            }
207        }
208        return None;
209    }
210
211    fn publish<'store>(
212        &self,
213        select1: &mut Vec<SelectorBuilder<'static>>,
214        select2: &mut Vec<SelectorBuilder<'static>>,
215        text: &ResultTextSelection<'store>,
216        text2: &ResultTextSelection<'store>,
217        config: &AlignmentConfig,
218    ) -> Result<bool, StamError> {
219        let (offset1, offset2) = self.to_offsets(); //will get shadowed eventually
220        let mut textsel1 = text.textselection(&offset1)?;
221        let mut textsel2 = text2.textselection(&offset2)?;
222        let mut textstring1 = textsel1.text();
223        let mut textstring2 = textsel2.text();
224        if !config.grow {
225            // Due to the way the algorithm works, we can end up with non-exact alignments in the tail of the textstrings
226            // For transpositions this is unwanted, this will strip the tail:
227            if let Some(newlength) = self.strdiff(textstring1, textstring2, config) {
228                let mut shorterfragment = self.clone();
229                if newlength > 1 {
230                    shorterfragment.length = newlength;
231                    return shorterfragment.publish(select1, select2, text, text2, config);
232                } else {
233                    return Ok(false);
234                }
235            }
236        }
237        if config.trim {
238            if let Ok(trimmed) = text.textselection(&offset1)?.trim_text(&TRIM_CHARS) {
239                textsel1 = trimmed;
240                textstring1 = textsel1.text();
241            } else {
242                //nothing left to align
243                return Ok(false);
244            }
245            if let Ok(trimmed) = text2.textselection(&offset2)?.trim_text(&TRIM_CHARS) {
246                textsel2 = trimmed;
247                textstring2 = textsel2.text();
248            } else {
249                //nothing left to align
250                return Ok(false);
251            }
252        };
253        if config.verbose {
254            println!(
255                "{}\t{}-{}\t{}\t{}-{}\t\"{}\"\t\"{}\"",
256                text.resource().id().unwrap_or("-"),
257                &textsel1.begin(),
258                &textsel1.end(),
259                text2.resource().id().unwrap_or("-"),
260                &textsel2.begin(),
261                &textsel2.end(),
262                textstring1
263                    .replace("\"", "\\\"")
264                    .replace("\t", "\\t")
265                    .replace("\n", "\\n"),
266                textstring2
267                    .replace("\"", "\\\"")
268                    .replace("\t", "\\t")
269                    .replace("\n", "\\n")
270            );
271        }
272        let offset1: Offset = textsel1.inner().into();
273        let offset2: Offset = textsel2.inner().into();
274        select1.push(SelectorBuilder::TextSelector(
275            text.resource().handle().into(),
276            offset1,
277        ));
278        select2.push(SelectorBuilder::TextSelector(
279            text2.resource().handle().into(),
280            offset2,
281        ));
282        Ok(true)
283    }
284}
285
286/// Find an alignment between two texts and creates a transposition
287/// Returns builders for the transposition, you still have to add it to the store.
288pub fn align_texts<'store>(
289    text: &ResultTextSelection<'store>,
290    text2: &ResultTextSelection<'store>,
291    config: &AlignmentConfig,
292) -> Result<Vec<AnnotationBuilder<'static>>, StamError> {
293    let mut builders = Vec::with_capacity(3);
294    let seq1: Vec<char> = text.text().chars().collect();
295    let seq2: Vec<char> = text2.text().chars().collect();
296
297    let alignment_set: Result<AlignmentSet<InMemoryAlignmentMatrix>, _> = match config.algorithm {
298        AlignmentAlgorithm::SmithWaterman {
299            equal,
300            align,
301            insert,
302            delete,
303        } => {
304            let algorithm = SmithWaterman::new(equal, align, insert, delete);
305            AlignmentSet::new(seq1.len(), seq2.len(), algorithm, |x, y| {
306                if seq1[x] != seq2[y] {
307                    if !config.case_sensitive {
308                        return seq1[x].to_lowercase().to_string()
309                            == seq2[y].to_lowercase().to_string();
310                    } else {
311                        return false;
312                    }
313                }
314                true
315            })
316        }
317        AlignmentAlgorithm::NeedlemanWunsch {
318            equal,
319            align,
320            insert,
321            delete,
322        } => {
323            let algorithm = NeedlemanWunsch::new(equal, align, insert, delete);
324            AlignmentSet::new(seq1.len(), seq2.len(), algorithm, |x, y| {
325                if seq1[x] != seq2[y] {
326                    if !config.case_sensitive {
327                        return seq1[x].to_lowercase().to_string()
328                            == seq2[y].to_lowercase().to_string();
329                    } else {
330                        return false;
331                    }
332                }
333                true
334            })
335        }
336    };
337
338    match alignment_set {
339        Ok(alignment_set) => {
340            let alignment = match config.algorithm {
341                AlignmentAlgorithm::SmithWaterman { .. } => alignment_set.local_alignment(),
342                AlignmentAlgorithm::NeedlemanWunsch { .. } => alignment_set.global_alignment(),
343            };
344            let mut select1: Vec<SelectorBuilder<'static>> = Vec::new();
345            let mut select2: Vec<SelectorBuilder<'static>> = Vec::new();
346
347            let mut fragment: Option<AlignedFragment> = None;
348            let mut totalalignlength = 0;
349            let mut errors = 0;
350            let mut last = None;
351            for step in alignment.steps() {
352                match step {
353                    Step::Align { x, y } => {
354                        if let Some(fragment) = fragment.as_mut() {
355                            fragment.length += 1;
356                            last = last.map(|x| x + 1);
357                            totalalignlength += 1;
358                        } else {
359                            fragment = Some(AlignedFragment {
360                                begin1: x,
361                                begin2: y,
362                                length: 1,
363                            });
364                            if let Some(last) = last {
365                                //everything not spanned since the last alignment is an error
366                                errors += x - last;
367                            } else {
368                                //everything up until here is an error
369                                errors += x;
370                            }
371                            last = Some(x + 1);
372                            totalalignlength += 1;
373                        }
374                    }
375                    _ => {
376                        if let Some(fragment) = fragment.take() {
377                            fragment.publish(&mut select1, &mut select2, text, text2, config)?;
378                        }
379                    }
380                }
381            }
382
383            if let Some(fragment) = fragment.take() {
384                fragment.publish(&mut select1, &mut select2, text, text2, config)?;
385            }
386
387            if let Some(max_errors) = config.max_errors {
388                let max_errors = max_errors.as_absolute(seq1.len());
389                if let Some(last) = last {
390                    //everything after the last match (that was not matched, counts as an error)
391                    errors += seq1.len() - last;
392                }
393                if errors > max_errors {
394                    //alignment not good enough to return
395                    return Ok(builders);
396                }
397            }
398            if totalalignlength < config.minimal_align_length {
399                //alignment not good enough to return
400                return Ok(builders);
401            }
402
403            if select1.is_empty() || (config.simple_only && select1.len() > 1) {
404                //no alignment found
405                //MAYBE TODO: compute and constrain by score?
406                return Ok(builders);
407            }
408
409            if config.grow && select1.len() > 1 {
410                let id = if let Some(prefix) = config.annotation_id_prefix.as_ref() {
411                    generate_id(&format!("{}translation", prefix), "")
412                } else {
413                    generate_id("translation", "")
414                };
415
416                let mut newselect1 = select1.pop().expect("must have an item");
417                if let Some(s) = select1.get(0) {
418                    let mut offset = newselect1.offset().expect("must have offset").clone();
419                    offset.begin = s.offset().expect("must have offset").begin;
420                    newselect1 = SelectorBuilder::textselector(
421                        newselect1.resource().unwrap().clone(),
422                        offset,
423                    );
424                }
425                let mut newselect2 = select2.pop().expect("must have an item");
426                if let Some(s) = select2.get(0) {
427                    let mut offset = newselect2.offset().expect("must have offset").clone();
428                    offset.begin = s.offset().expect("must have offset").begin;
429                    newselect2 = SelectorBuilder::textselector(
430                        newselect2.resource().unwrap().clone(),
431                        offset,
432                    );
433                }
434
435                builders.push(
436                    AnnotationBuilder::new()
437                        .with_id(id.clone())
438                        .with_data(
439                            "https://w3id.org/stam/extensions/stam-translate/",
440                            "Translation",
441                            DataValue::Null,
442                        )
443                        .with_target(SelectorBuilder::DirectionalSelector(vec![
444                            newselect1, newselect2,
445                        ])),
446                );
447                Ok(builders)
448            } else {
449                let id = if let Some(prefix) = config.annotation_id_prefix.as_ref() {
450                    generate_id(&format!("{}transposition-", prefix), "")
451                } else {
452                    generate_id("transposition-", "")
453                };
454                if select1.len() == 1 {
455                    //simple transposition
456                    builders.push(
457                        AnnotationBuilder::new()
458                            .with_id(id.clone())
459                            .with_data(
460                                "https://w3id.org/stam/extensions/stam-transpose/",
461                                "Transposition",
462                                DataValue::Null,
463                            )
464                            .with_target(SelectorBuilder::DirectionalSelector(vec![
465                                select1.into_iter().next().unwrap(),
466                                select2.into_iter().next().unwrap(),
467                            ])),
468                    );
469                } else {
470                    //complex transposition
471                    let annotation1id = format!("{}-side1", id);
472                    builders.push(
473                        AnnotationBuilder::new()
474                            .with_id(annotation1id.clone())
475                            .with_data(
476                                "https://w3id.org/stam/extensions/stam-transpose/",
477                                "TranspositionSide",
478                                DataValue::Null,
479                            )
480                            .with_target(SelectorBuilder::DirectionalSelector(select1)),
481                    );
482                    let annotation2id = format!("{}-side2", id);
483                    builders.push(
484                        AnnotationBuilder::new()
485                            .with_id(annotation2id.clone())
486                            .with_data(
487                                "https://w3id.org/stam/extensions/stam-transpose/",
488                                "TranspositionSide",
489                                DataValue::Null,
490                            )
491                            .with_target(SelectorBuilder::DirectionalSelector(select2)),
492                    );
493                    builders.push(
494                        AnnotationBuilder::new()
495                            .with_id(id.clone())
496                            .with_data(
497                                "https://w3id.org/stam/extensions/stam-transpose/",
498                                "Transposition",
499                                DataValue::Null,
500                            )
501                            .with_target(SelectorBuilder::DirectionalSelector(vec![
502                                SelectorBuilder::AnnotationSelector(annotation1id.into(), None),
503                                SelectorBuilder::AnnotationSelector(annotation2id.into(), None),
504                            ])),
505                    );
506                }
507                Ok(builders)
508            }
509        }
510        Err(error) => {
511            eprintln!("ALIGNMENT ERROR: {:?}", error);
512            return Err(StamError::OtherError(
513                "Failed to generated alignment set due to error",
514            ));
515        }
516    }
517}
518
519pub fn alignments_tsv_out<'a>(
520    store: &'a AnnotationStore,
521    query: Query<'a>,
522    use_var: Option<&str>,
523) -> Result<(), StamError> {
524    let iter = store.query(query)?;
525    for resultrow in iter {
526        if let Ok(result) = resultrow.get_by_name_or_last(use_var) {
527            if let QueryResultItem::Annotation(annotation) = result {
528                print_transposition(annotation);
529            } else {
530                return Err(StamError::OtherError(
531                    "Only queries that return ANNOTATION are supported when outputting aligments",
532                ));
533            }
534        }
535    }
536    Ok(())
537}
538
539pub fn print_transposition<'a>(annotation: &ResultItem<'a, Annotation>) {
540    let mut annoiter = annotation.annotations_in_targets(AnnotationDepth::One);
541    if let (Some(left), Some(right)) = (annoiter.next(), annoiter.next()) {
542        //complex transposition
543        for (text1, text2) in left.textselections().zip(right.textselections()) {
544            print_alignment(annotation, &text1, &text2)
545        }
546    } else {
547        //simple transposition
548        let mut textiter = annotation.textselections();
549        if let (Some(text1), Some(text2)) = (textiter.next(), textiter.next()) {
550            print_alignment(annotation, &text1, &text2)
551        }
552    }
553}
554
555fn print_alignment<'a>(
556    annotation: &ResultItem<'a, Annotation>,
557    text1: &ResultTextSelection<'a>,
558    text2: &ResultTextSelection<'a>,
559) {
560    println!(
561        "{}\t{}\t{}-{}\t{}\t{}-{}\t\"{}\"\t\"{}\"\t{}",
562        annotation.id().unwrap_or("-"),
563        text1.resource().id().unwrap_or("-"),
564        text1.begin(),
565        text1.end(),
566        text2.resource().id().unwrap_or("-"),
567        text2.begin(),
568        text2.end(),
569        text1
570            .text()
571            .replace("\"", "\\\"")
572            .replace("\t", "\\t")
573            .replace("\n", "\\n"),
574        text2
575            .text()
576            .replace("\"", "\\\"")
577            .replace("\t", "\\t")
578            .replace("\n", "\\n"),
579        {
580            let ids: Vec<_> = annotation
581                .annotations_in_targets(AnnotationDepth::One)
582                .map(|a| a.id().unwrap_or("-"))
583                .collect();
584            ids.join("|")
585        }
586    );
587}
588
589#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
590pub enum AbsoluteOrRelative {
591    Absolute(usize),
592    Relative(f64),
593}
594
595impl AbsoluteOrRelative {
596    pub fn as_absolute(self, total: usize) -> usize {
597        match self {
598            Self::Absolute(i) => i,
599            Self::Relative(f) => (f * total as f64).round() as usize,
600        }
601    }
602}
603
604impl FromStr for AbsoluteOrRelative {
605    type Err = &'static str;
606
607    fn from_str(s: &str) -> Result<Self, Self::Err> {
608        if let Ok(i) = s.parse::<usize>() {
609            Ok(Self::Absolute(i))
610        } else if let Ok(f) = s.parse::<f64>() {
611            Ok(Self::Relative(f))
612        } else {
613            Err("Value must be either an integer (absolute) or a floating point value (relative)")
614        }
615    }
616}