multi_seq_align/
lib.rs

1/*! ![stability-experimental](https://img.shields.io/badge/stability-experimental-orange.svg)
2
3A crate to manipulate multiple sequences alignments in Rust.
4
5Instead of storing aligned sequences as multiple strings, `multi_seq_align` stores bases or residues in [`Alignment`] using a list of characters, like a matrix. This allows easy access to specific rows and columns of the alignment.
6
7# Usage
8
9```rust
10# use multi_seq_align::Alignment;
11# use std::error::Error;
12# fn main() -> Result<(), Box<dyn Error>> {
13let mut kappa_casein_fragments_alignment = Alignment::with_sequences(
14    &[
15        b"PAPISKWQSMP".to_vec(),
16        b"HAQIPQRQYLP".to_vec(),
17        b"PAQILQWQVLS".to_vec(),
18    ],
19)?;
20
21// Let's extract a column of this alignment
22assert_eq!(
23    kappa_casein_fragments_alignment.nth_position(6).unwrap(),
24    [&b'W', &b'R', &b'W']
25);
26
27// But we also have the aligned sequence for the Platypus
28// Let's add it to the original alignment
29kappa_casein_fragments_alignment.add(
30    b"EHQRP--YVLP".to_vec(),
31)?;
32
33// the new aligned sequence has a gap at the 6th position
34assert_eq!(
35    kappa_casein_fragments_alignment.nth_position(6).unwrap(),
36    [&b'W', &b'R', &b'W', &b'-']
37);
38
39// We can also loop over each position of the alignment
40for aas in kappa_casein_fragments_alignment.iter_positions() {
41    println!("{:?}", aas);
42    assert_eq!(aas.len(), 4); // 4 sequences
43}
44
45# Ok(())
46# }
47```
48
49Here I instancied an alignment using `u8`, but `Alignment` works on generics like numbers, custom or third-party structs.
50
51# Features
52
53- Create [`Alignment`] from one or multiple aligned sequences at once (see [`add()`] and [`with_sequences()`]).
54- Extract columns of the alignment (see [`iter_positions()`] and [`iter_sequences(`]).
55This crate is currently in early stage development. I wouldn't recommend using it in production but I am interested in possible ideas to further the developemt of this project. Quite some work needs toi be done to improve the API and make it easy to use in other project.
56# Ideas
57- Computation of conservation scores
58- Identification of conserved sites
59- Computation of consensus sequence
60- Collapse / trim alignment
61- Serialisation / Deserialisation of alignment files
62- Extract sub-alignments
63    - positions
64    - motifs
65
66# Optimisation
67
68My goal is to reduce the footprint of this crate, there is ome work to do to achieve it. The code will eventually be optimised to be faster and to better use memory.
69
70[`Alignment`]: struct.Alignment.html
71[`iter_positions()`]: struct.Alignment.html#method.iter_positions
72[`iter_sequences(`]: struct.Alignment.html#method.iter_sequences
73[`add()`]: struct.Alignment.html#method.add
74[`with_sequences()`]: struct.Alignment.html#method.with_sequences
75*/
76#![warn(clippy::all, clippy::pedantic, clippy::nursery, clippy::cargo)]
77
78mod errors;
79mod utils;
80
81use errors::MultiSeqAlignError;
82use std::iter::FromIterator;
83
84#[cfg(feature = "serde")]
85#[macro_use]
86extern crate serde;
87
88/// An alignment of DNA or amino acids sequences
89///
90/// Aligned sequences should all have the same length. Each sequence is stored as one vector of `char`s. This allows an easy access to columns and rows of the alignment.
91#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)] // Use Rc to implement Copy
92#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
93pub struct Alignment<T> {
94    /// Sequences (as one)
95    sequences: Vec<T>,
96    /// The number of sequences in the alignment
97    n_sequences: usize,
98    /// The length of the alignment
99    length: usize,
100}
101
102impl<T> Default for Alignment<T>
103where
104    T: Clone,
105{
106    fn default() -> Self {
107        Self {
108            sequences: Vec::<T>::default(),
109            n_sequences: 0_usize,
110            length: 0_usize,
111        }
112    }
113}
114struct AlignmentPositionIterator<'a, T> {
115    alignment: &'a Alignment<T>,
116    index: usize,
117    size_hint: usize,
118}
119
120impl<'a, T> Iterator for AlignmentPositionIterator<'a, T>
121where
122    T: Clone,
123{
124    type Item = Vec<&'a T>;
125    fn next(&mut self) -> Option<Vec<&'a T>> {
126        if self.index >= self.alignment.length {
127            return None;
128        }
129        match self.alignment.nth_position(self.index) {
130            Some(position) => {
131                self.index = self.index.saturating_add(1);
132                self.size_hint = self.size_hint.saturating_sub(1);
133                Some(position)
134            }
135            None => None,
136        }
137    }
138
139    fn size_hint(&self) -> (usize, Option<usize>) {
140        if self.size_hint < usize::max_value() {
141            // ?
142            (self.size_hint, Some(self.size_hint))
143        } else {
144            (usize::max_value(), None)
145        }
146    }
147}
148
149impl<'a, T> ExactSizeIterator for AlignmentPositionIterator<'a, T>
150where
151    T: Clone,
152{
153    fn len(&self) -> usize {
154        let (lower, upper) = self.size_hint();
155        // Note: This assertion is overly defensive, but it checks the invariant
156        // guaranteed by the trait. If this trait were rust-internal,
157        // we could use debug_assert!; assert_eq! will check all Rust user
158        // implementations too.
159        assert_eq!(upper, Some(lower));
160        lower
161    }
162}
163
164struct AlignmentSequenceIterator<'a, T> {
165    alignment: &'a Alignment<T>,
166    index: usize,
167    size_hint: usize,
168}
169
170impl<'a, T> Iterator for AlignmentSequenceIterator<'a, T>
171where
172    T: Clone,
173{
174    type Item = Vec<&'a T>;
175    fn next(&mut self) -> Option<Vec<&'a T>> {
176        if self.index >= self.alignment.n_sequences {
177            return None;
178        }
179
180        match self.alignment.nth_sequence(self.index) {
181            Some(seq) => {
182                self.index = self.index.saturating_add(1);
183                self.size_hint = self.size_hint.saturating_sub(1);
184                Some(seq)
185            }
186            None => None,
187        }
188    }
189
190    fn nth(&mut self, n: usize) -> Option<Self::Item> {
191        self.alignment.nth_sequence(n)
192    }
193
194    fn size_hint(&self) -> (usize, Option<usize>) {
195        if self.size_hint < usize::max_value() {
196            // ?
197            (self.size_hint, Some(self.size_hint))
198        } else {
199            (usize::max_value(), None)
200        }
201    }
202}
203
204impl<'a, T> ExactSizeIterator for AlignmentSequenceIterator<'a, T>
205where
206    T: Clone,
207{
208    fn len(&self) -> usize {
209        let (lower, upper) = self.size_hint();
210        // Note: This assertion is overly defensive, but it checks the invariant
211        // guaranteed by the trait. If this trait were rust-internal,
212        // we could use debug_assert!; assert_eq! will check all Rust user
213        // implementations too.
214        assert_eq!(upper, Some(lower));
215        lower
216    }
217}
218
219impl<T> Alignment<T> {
220    /// Returns the fixed `length` of the Alignment `self`
221    #[must_use]
222    pub const fn length(&self) -> &usize {
223        &self.length
224    }
225
226    /// Returns the number of sequences contained in `self`
227    #[must_use]
228    pub const fn n_sequences(&self) -> &usize {
229        &self.n_sequences
230    }
231
232    /// Returns an Iterator over the positions of the alignment
233    ///
234    /// # Examples
235    ///
236    /// ```rust
237    /// # use multi_seq_align::Alignment;
238    /// let align = Alignment::with_sequences(
239    ///     &[
240    ///         b"AVEQTPRK".to_vec(),
241    ///         b"SVEQTPRK".to_vec(),
242    ///         b"SVEQTPKK".to_vec(),
243    ///     ],
244    /// )
245    /// .unwrap();
246    ///
247    /// for position in align.iter_positions() {
248    ///     assert_eq!(position.len(), 3)
249    /// }
250    /// ```
251    pub fn iter_positions(
252        &self,
253    ) -> impl Iterator<Item = Vec<&T>> + ExactSizeIterator<Item = Vec<&T>>
254    where
255        T: Clone,
256    {
257        AlignmentPositionIterator {
258            alignment: self,
259            index: 0_usize,
260            size_hint: self.length,
261        }
262    }
263
264    /// Returns an Iterator over the sequences of the alignment
265    ///
266    /// # Examples
267    ///
268    /// ```rust
269    /// # use multi_seq_align::Alignment;
270    /// let align = Alignment::with_sequences(
271    ///     &[
272    ///         b"AVEQTPRK".to_vec(),
273    ///         b"SVEQTPRK".to_vec(),
274    ///         b"SVEQTPKK".to_vec(),
275    ///     ],
276    /// )
277    /// .unwrap();
278    ///
279    /// for sequence in align.iter_sequences() {
280    ///     assert_eq!(sequence.len(), 8)
281    /// }
282    /// ```
283    pub fn iter_sequences(
284        &self,
285    ) -> impl Iterator<Item = Vec<&T>> + ExactSizeIterator<Item = Vec<&T>>
286    where
287        T: Clone,
288    {
289        AlignmentSequenceIterator {
290            alignment: self,
291            index: 0_usize,
292            size_hint: self.n_sequences,
293        }
294    }
295
296    /// Returns an empty `Alignment` of fixed `length`
297    ///
298    ///
299    /// # Examples
300    ///
301    /// ```rust
302    ///  # use multi_seq_align::Alignment;
303    ///  let alignment = Alignment::<char>::new(42);
304    ///
305    ///  assert_eq!(*alignment.length(), 42 as usize);
306    ///  assert_eq!(*alignment.n_sequences(), 0 as usize);
307    /// ```
308    #[must_use]
309    pub const fn new(length: usize) -> Self {
310        Self {
311            sequences: Vec::new(),
312            n_sequences: 0_usize,
313            length,
314        }
315    }
316
317    /// Returns `true` if `self` doesn't contains any sequence
318    ///
319    ///
320    /// # Examples
321    ///
322    /// ```rust
323    /// # use multi_seq_align::Alignment;
324    /// let alignment = Alignment::<char>::new(42);
325    ///
326    /// assert!(alignment.is_empty())
327    ///```
328    #[must_use]
329    pub const fn is_empty(&self) -> bool {
330        self.n_sequences == 0_usize
331    }
332
333    /// Create an `Alignment` from same length vectors of names, descriptions, sequences
334    ///
335    /// # Examples
336    ///
337    /// ```rust
338    /// # use multi_seq_align::Alignment;
339    /// let align = Alignment::with_sequences(
340    ///     &[
341    ///         b"AVEQTPRK".to_vec(),
342    ///         b"SVEQTPRK".to_vec(),
343    ///         b"SVEQTPKK".to_vec(),
344    ///     ],
345    /// )
346    /// .unwrap();
347    ///
348    /// assert_eq!(*align.length(), 8);
349    /// assert_eq!(*align.n_sequences(), 3);
350    /// ```
351    ///
352    /// # Errors
353    ///
354    /// Will return an error if `names`, `descriptions` and `sequences` have different lengths, and also if the sequences have different lengths (based on the first sequence).
355    pub fn with_sequences(sequences: &[Vec<T>]) -> Result<Self, MultiSeqAlignError>
356    where
357        T: Clone,
358    {
359        let length = utils::first_sequence_length(sequences);
360        utils::check_unequal_lengths(sequences, length)?;
361
362        let n_sequences = sequences.len();
363
364        let sequences_vec = sequences.iter().flat_map(|x| x.to_vec()).collect();
365
366        Ok(Self {
367            sequences: sequences_vec,
368            n_sequences,
369            length,
370        })
371    }
372
373    /// Add a sequence to `self`
374    ///
375    /// The new sequence must have the same length than `self.length`.
376    ///
377    /// # Examples
378    ///
379    /// ```rust
380    /// # use multi_seq_align::Alignment;
381    /// let mut align = Alignment::new(8);
382    ///
383    ///  assert_eq!(*align.n_sequences(), 0);
384    ///
385    /// align
386    ///     .add(b"AVEQTPRK".to_vec())
387    ///     .unwrap();
388    ///
389    /// assert_eq!(*align.n_sequences(), 1);
390    ///
391    /// align
392    ///     .add(b"SVEQTPRK".to_vec())
393    ///     .unwrap();
394    ///
395    /// assert_eq!(*align.n_sequences(), 2);
396    /// ```
397    ///
398    /// # Errors
399    ///
400    /// Will return an error if the length of `sequence` is different from the one of the alignment.
401    pub fn add<'a>(&'a mut self, sequence: Vec<T>) -> Result<&'a mut Self, MultiSeqAlignError> {
402        if sequence.len() != self.length {
403            return Err(MultiSeqAlignError::NewSequenceOfDifferentLength {
404                expected_length: self.length,
405                // sequences_name: name,
406                found_length: sequence.len(),
407            });
408        }
409
410        self.sequences.extend(sequence);
411
412        self.n_sequences += 1;
413
414        Ok(self)
415    }
416
417    /// Returns all amino acids / bases at a `position` in the alignment `self`. The returned vector has a length equal of number of sequences in `self`.
418    ///
419    /// # Examples
420    ///
421    /// ```rust
422    /// # use multi_seq_align::Alignment;
423    /// let align = Alignment::<u8>::with_sequences(
424    ///     &[b"ELK".to_vec(), b"ILK".to_vec()],
425    /// )
426    /// .unwrap();
427    ///
428    /// assert_eq!(align.nth_position(0).unwrap(), &[&b'E', &b'I']);
429    /// ```
430    /// # Panics
431    ///
432    /// Panics if `n` is greater or equal to the `length` of the Alignment.
433    #[must_use]
434    pub fn nth_position(&self, n: usize) -> Option<Vec<&T>> {
435        assert!(n < self.length);
436        (0..self.n_sequences)
437            .map(|i| self.sequences.get(i * self.length + n))
438            .collect::<Vec<Option<&T>>>()
439            .into_iter()
440            .collect::<Option<Vec<&T>>>()
441    }
442
443    /// Returns all amino acids / bases of the sequence at the `index` of the Alignment `self`. The returned vector has a length equal to the length of the Alignment `self`.
444    ///
445    /// # Examples
446    ///
447    /// ```rust
448    /// # use multi_seq_align::Alignment;
449    /// let align = Alignment::<u8>::with_sequences(
450    ///     &[b"ELK".to_vec(), b"ILK".to_vec()],
451    /// )
452    /// .unwrap();
453    ///
454    /// assert_eq!(align.nth_sequence(1).unwrap(), &[&b'I', &b'L', &b'K']);
455    /// ```
456    /// # Panics
457    ///
458    /// Panics if `index` is greater or equal to the `n_sequences` of the Alignment.
459    #[must_use]
460    pub fn nth_sequence(&self, index: usize) -> Option<Vec<&T>> {
461        debug_assert!(index < self.n_sequences);
462
463        (0..self.length)
464            .map(|i| self.sequences.get(self.length * index + i))
465            .collect::<Vec<Option<&T>>>()
466            .into_iter()
467            .collect::<Option<Vec<&T>>>()
468    }
469}
470
471impl<A> FromIterator<Vec<A>> for Alignment<A>
472where
473    A: Clone,
474{
475    /// # Panics
476    ///
477    /// Panics if sequences are of different lengths
478    fn from_iter<I: IntoIterator<Item = Vec<A>>>(iter: I) -> Self {
479        let mut length: Option<usize> = None;
480        let mut n_sequences = 0_usize;
481
482        let sequences = iter
483            .into_iter()
484            .flat_map(|x| {
485                if length.is_none() {
486                    length = Some(x.len());
487                } else if Some(x.len()) != length {
488                    panic!("sequences of different lengths");
489                }
490
491                n_sequences += 1;
492                x.to_vec()
493            })
494            .collect::<Vec<_>>();
495
496        Self {
497            sequences,
498            n_sequences,
499            length: length.unwrap(),
500        }
501    }
502}
503
504#[cfg(test)]
505mod tests {
506    use super::*;
507    use pretty_assertions::assert_eq;
508
509    #[test]
510    fn new_align() {
511        let x = Alignment::<char>::new(5_usize);
512        assert!(x.sequences.is_empty());
513        assert_eq!(x.length, 5_usize);
514        assert_eq!(x.n_sequences, 0_usize);
515    }
516
517    #[test]
518    fn new_alignment_with_desc() {
519        let x = Alignment::<u8>::with_sequences(&[b"ELK".to_vec(), b"ILK".to_vec()]).unwrap();
520
521        assert_eq!(x.sequences, vec![b'E', b'L', b'K', b'I', b'L', b'K']);
522        assert_eq!(x.length, 3);
523        assert_eq!(x.n_sequences, 2);
524    }
525
526    #[test]
527    fn add_1_sequence() {
528        let mut align =
529            Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
530
531        align.add(b"ALRYITAT".to_vec()).unwrap();
532
533        assert_eq!(align.n_sequences, 3_usize);
534        assert_eq!(align.nth_position(3).unwrap(), vec![&b'H', &b'-', &b'Y'])
535    }
536
537    #[test]
538    fn add_1_sequence_wrong_length() {
539        let mut x = Alignment::new(3_usize);
540        let error = x.add(b"ILKAV".to_vec()).err().unwrap();
541        let expected = MultiSeqAlignError::NewSequenceOfDifferentLength {
542            expected_length: 3_usize,
543            found_length: 5_usize,
544        };
545        assert_eq!(error, expected);
546    }
547
548    #[test]
549    fn add_to_new() {
550        let mut x = Alignment::new(3_usize);
551
552        x.add(b"ELK".to_vec()).unwrap();
553        assert_eq!(x.n_sequences, 1_usize);
554        assert_eq!(x.length, 3_usize);
555        assert_eq!(x.sequences.len(), 3_usize);
556
557        x.add(b"ILK".to_vec()).unwrap();
558
559        assert_eq!(x.n_sequences, 2_usize);
560        assert_eq!(x.length, 3_usize);
561        assert_eq!(x.sequences.len(), 6_usize);
562    }
563
564    #[test]
565    fn empty_align() {
566        let mut x = Alignment::new(3_usize);
567        assert!(x.is_empty());
568        x.add(b"ILK".to_vec()).unwrap();
569        assert!(!x.is_empty());
570    }
571
572    #[test]
573    fn nth_residues_3() {
574        let align =
575            Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
576        assert_eq!(align.nth_position(3).unwrap(), vec![&b'H', &b'-'])
577    }
578
579    #[test]
580    fn nth_residues_more_seqs() {
581        let align = Alignment::with_sequences(&[
582            b"ALKHITAN".to_vec(),
583            b"VLK-ITAN".to_vec(),
584            b"ALKWITAN".to_vec(),
585            b"VLKMITAN".to_vec(),
586        ])
587        .unwrap();
588        assert_eq!(
589            align.nth_position(3).unwrap(),
590            vec![&b'H', &b'-', &b'W', &b'M']
591        )
592    }
593
594    #[test]
595    #[should_panic(expected = "assertion failed: n < self.length")]
596    fn nth_residues_out() {
597        let align =
598            Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
599        let _out_of_bonds = align.nth_position(10);
600    }
601
602    #[test]
603    fn different_seq_lengths() {
604        let error = Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN---".to_vec()])
605            .err()
606            .unwrap();
607
608        let expected = MultiSeqAlignError::MultipleSequencesOfDifferentLengths {
609            expected_length: 8,
610            found_lengths: vec![11],
611        };
612        assert_eq!(error, expected);
613    }
614
615    #[test]
616    fn for_positions() {
617        let align =
618            Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
619
620        let mut x = Vec::new();
621
622        for col in align.iter_positions() {
623            x.push(col);
624        }
625
626        assert_eq!(x.len(), 8);
627        assert_eq!(x.get(0).unwrap(), &[&b'A', &b'V']);
628        assert_eq!(x.get(3).unwrap(), &[&b'H', &b'-']);
629    }
630
631    #[test]
632    #[should_panic]
633    fn for_positions_out_of_bonds() {
634        let align =
635            Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
636        let mut x = Vec::new();
637
638        for col in align.iter_positions() {
639            x.push(col);
640        }
641
642        let _ = x.get(22).unwrap();
643    }
644
645    #[test]
646    fn for_positions_exact() {
647        let align =
648            Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
649
650        assert_eq!(align.iter_positions().len(), 8);
651        assert_eq!(align.iter_positions().next().unwrap().len(), 2);
652    }
653
654    #[test]
655    fn for_sequences() {
656        let align =
657            Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
658
659        let mut x = Vec::new();
660
661        for row in align.iter_sequences() {
662            assert_eq!(row.len(), 8);
663            x.push(row);
664        }
665
666        assert_eq!(x.len(), 2)
667    }
668
669    #[test]
670    fn for_sequences_exact() {
671        let align =
672            Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
673
674        assert_eq!(align.iter_sequences().len(), 2);
675        assert_eq!(align.iter_sequences().next().unwrap().len(), 8);
676    }
677
678    #[test]
679    fn for_sequences_collect() {
680        let align =
681            Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
682
683        assert_eq!(align.iter_sequences().len(), 2);
684        assert_eq!(align.iter_sequences().next().unwrap().len(), 8);
685    }
686}