multi_seq_align/lib.rs
1/*! 
2
3A crate to manipulate multiple sequences alignments in Rust.
4
5Instead of storing aligned sequences as multiple strings, `multi_seq_align` stores bases or residues in [`Alignment`] using a list of characters, like a matrix. This allows easy access to specific rows and columns of the alignment.
6
7# Usage
8
9```rust
10# use multi_seq_align::Alignment;
11# use std::error::Error;
12# fn main() -> Result<(), Box<dyn Error>> {
13let mut kappa_casein_fragments_alignment = Alignment::with_sequences(
14 &[
15 b"PAPISKWQSMP".to_vec(),
16 b"HAQIPQRQYLP".to_vec(),
17 b"PAQILQWQVLS".to_vec(),
18 ],
19)?;
20
21// Let's extract a column of this alignment
22assert_eq!(
23 kappa_casein_fragments_alignment.nth_position(6).unwrap(),
24 [&b'W', &b'R', &b'W']
25);
26
27// But we also have the aligned sequence for the Platypus
28// Let's add it to the original alignment
29kappa_casein_fragments_alignment.add(
30 b"EHQRP--YVLP".to_vec(),
31)?;
32
33// the new aligned sequence has a gap at the 6th position
34assert_eq!(
35 kappa_casein_fragments_alignment.nth_position(6).unwrap(),
36 [&b'W', &b'R', &b'W', &b'-']
37);
38
39// We can also loop over each position of the alignment
40for aas in kappa_casein_fragments_alignment.iter_positions() {
41 println!("{:?}", aas);
42 assert_eq!(aas.len(), 4); // 4 sequences
43}
44
45# Ok(())
46# }
47```
48
49Here I instancied an alignment using `u8`, but `Alignment` works on generics like numbers, custom or third-party structs.
50
51# Features
52
53- Create [`Alignment`] from one or multiple aligned sequences at once (see [`add()`] and [`with_sequences()`]).
54- Extract columns of the alignment (see [`iter_positions()`] and [`iter_sequences(`]).
55This crate is currently in early stage development. I wouldn't recommend using it in production but I am interested in possible ideas to further the developemt of this project. Quite some work needs toi be done to improve the API and make it easy to use in other project.
56# Ideas
57- Computation of conservation scores
58- Identification of conserved sites
59- Computation of consensus sequence
60- Collapse / trim alignment
61- Serialisation / Deserialisation of alignment files
62- Extract sub-alignments
63 - positions
64 - motifs
65
66# Optimisation
67
68My goal is to reduce the footprint of this crate, there is ome work to do to achieve it. The code will eventually be optimised to be faster and to better use memory.
69
70[`Alignment`]: struct.Alignment.html
71[`iter_positions()`]: struct.Alignment.html#method.iter_positions
72[`iter_sequences(`]: struct.Alignment.html#method.iter_sequences
73[`add()`]: struct.Alignment.html#method.add
74[`with_sequences()`]: struct.Alignment.html#method.with_sequences
75*/
76#![warn(clippy::all, clippy::pedantic, clippy::nursery, clippy::cargo)]
77
78mod errors;
79mod utils;
80
81use errors::MultiSeqAlignError;
82use std::iter::FromIterator;
83
84#[cfg(feature = "serde")]
85#[macro_use]
86extern crate serde;
87
88/// An alignment of DNA or amino acids sequences
89///
90/// Aligned sequences should all have the same length. Each sequence is stored as one vector of `char`s. This allows an easy access to columns and rows of the alignment.
91#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)] // Use Rc to implement Copy
92#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
93pub struct Alignment<T> {
94 /// Sequences (as one)
95 sequences: Vec<T>,
96 /// The number of sequences in the alignment
97 n_sequences: usize,
98 /// The length of the alignment
99 length: usize,
100}
101
102impl<T> Default for Alignment<T>
103where
104 T: Clone,
105{
106 fn default() -> Self {
107 Self {
108 sequences: Vec::<T>::default(),
109 n_sequences: 0_usize,
110 length: 0_usize,
111 }
112 }
113}
114struct AlignmentPositionIterator<'a, T> {
115 alignment: &'a Alignment<T>,
116 index: usize,
117 size_hint: usize,
118}
119
120impl<'a, T> Iterator for AlignmentPositionIterator<'a, T>
121where
122 T: Clone,
123{
124 type Item = Vec<&'a T>;
125 fn next(&mut self) -> Option<Vec<&'a T>> {
126 if self.index >= self.alignment.length {
127 return None;
128 }
129 match self.alignment.nth_position(self.index) {
130 Some(position) => {
131 self.index = self.index.saturating_add(1);
132 self.size_hint = self.size_hint.saturating_sub(1);
133 Some(position)
134 }
135 None => None,
136 }
137 }
138
139 fn size_hint(&self) -> (usize, Option<usize>) {
140 if self.size_hint < usize::max_value() {
141 // ?
142 (self.size_hint, Some(self.size_hint))
143 } else {
144 (usize::max_value(), None)
145 }
146 }
147}
148
149impl<'a, T> ExactSizeIterator for AlignmentPositionIterator<'a, T>
150where
151 T: Clone,
152{
153 fn len(&self) -> usize {
154 let (lower, upper) = self.size_hint();
155 // Note: This assertion is overly defensive, but it checks the invariant
156 // guaranteed by the trait. If this trait were rust-internal,
157 // we could use debug_assert!; assert_eq! will check all Rust user
158 // implementations too.
159 assert_eq!(upper, Some(lower));
160 lower
161 }
162}
163
164struct AlignmentSequenceIterator<'a, T> {
165 alignment: &'a Alignment<T>,
166 index: usize,
167 size_hint: usize,
168}
169
170impl<'a, T> Iterator for AlignmentSequenceIterator<'a, T>
171where
172 T: Clone,
173{
174 type Item = Vec<&'a T>;
175 fn next(&mut self) -> Option<Vec<&'a T>> {
176 if self.index >= self.alignment.n_sequences {
177 return None;
178 }
179
180 match self.alignment.nth_sequence(self.index) {
181 Some(seq) => {
182 self.index = self.index.saturating_add(1);
183 self.size_hint = self.size_hint.saturating_sub(1);
184 Some(seq)
185 }
186 None => None,
187 }
188 }
189
190 fn nth(&mut self, n: usize) -> Option<Self::Item> {
191 self.alignment.nth_sequence(n)
192 }
193
194 fn size_hint(&self) -> (usize, Option<usize>) {
195 if self.size_hint < usize::max_value() {
196 // ?
197 (self.size_hint, Some(self.size_hint))
198 } else {
199 (usize::max_value(), None)
200 }
201 }
202}
203
204impl<'a, T> ExactSizeIterator for AlignmentSequenceIterator<'a, T>
205where
206 T: Clone,
207{
208 fn len(&self) -> usize {
209 let (lower, upper) = self.size_hint();
210 // Note: This assertion is overly defensive, but it checks the invariant
211 // guaranteed by the trait. If this trait were rust-internal,
212 // we could use debug_assert!; assert_eq! will check all Rust user
213 // implementations too.
214 assert_eq!(upper, Some(lower));
215 lower
216 }
217}
218
219impl<T> Alignment<T> {
220 /// Returns the fixed `length` of the Alignment `self`
221 #[must_use]
222 pub const fn length(&self) -> &usize {
223 &self.length
224 }
225
226 /// Returns the number of sequences contained in `self`
227 #[must_use]
228 pub const fn n_sequences(&self) -> &usize {
229 &self.n_sequences
230 }
231
232 /// Returns an Iterator over the positions of the alignment
233 ///
234 /// # Examples
235 ///
236 /// ```rust
237 /// # use multi_seq_align::Alignment;
238 /// let align = Alignment::with_sequences(
239 /// &[
240 /// b"AVEQTPRK".to_vec(),
241 /// b"SVEQTPRK".to_vec(),
242 /// b"SVEQTPKK".to_vec(),
243 /// ],
244 /// )
245 /// .unwrap();
246 ///
247 /// for position in align.iter_positions() {
248 /// assert_eq!(position.len(), 3)
249 /// }
250 /// ```
251 pub fn iter_positions(
252 &self,
253 ) -> impl Iterator<Item = Vec<&T>> + ExactSizeIterator<Item = Vec<&T>>
254 where
255 T: Clone,
256 {
257 AlignmentPositionIterator {
258 alignment: self,
259 index: 0_usize,
260 size_hint: self.length,
261 }
262 }
263
264 /// Returns an Iterator over the sequences of the alignment
265 ///
266 /// # Examples
267 ///
268 /// ```rust
269 /// # use multi_seq_align::Alignment;
270 /// let align = Alignment::with_sequences(
271 /// &[
272 /// b"AVEQTPRK".to_vec(),
273 /// b"SVEQTPRK".to_vec(),
274 /// b"SVEQTPKK".to_vec(),
275 /// ],
276 /// )
277 /// .unwrap();
278 ///
279 /// for sequence in align.iter_sequences() {
280 /// assert_eq!(sequence.len(), 8)
281 /// }
282 /// ```
283 pub fn iter_sequences(
284 &self,
285 ) -> impl Iterator<Item = Vec<&T>> + ExactSizeIterator<Item = Vec<&T>>
286 where
287 T: Clone,
288 {
289 AlignmentSequenceIterator {
290 alignment: self,
291 index: 0_usize,
292 size_hint: self.n_sequences,
293 }
294 }
295
296 /// Returns an empty `Alignment` of fixed `length`
297 ///
298 ///
299 /// # Examples
300 ///
301 /// ```rust
302 /// # use multi_seq_align::Alignment;
303 /// let alignment = Alignment::<char>::new(42);
304 ///
305 /// assert_eq!(*alignment.length(), 42 as usize);
306 /// assert_eq!(*alignment.n_sequences(), 0 as usize);
307 /// ```
308 #[must_use]
309 pub const fn new(length: usize) -> Self {
310 Self {
311 sequences: Vec::new(),
312 n_sequences: 0_usize,
313 length,
314 }
315 }
316
317 /// Returns `true` if `self` doesn't contains any sequence
318 ///
319 ///
320 /// # Examples
321 ///
322 /// ```rust
323 /// # use multi_seq_align::Alignment;
324 /// let alignment = Alignment::<char>::new(42);
325 ///
326 /// assert!(alignment.is_empty())
327 ///```
328 #[must_use]
329 pub const fn is_empty(&self) -> bool {
330 self.n_sequences == 0_usize
331 }
332
333 /// Create an `Alignment` from same length vectors of names, descriptions, sequences
334 ///
335 /// # Examples
336 ///
337 /// ```rust
338 /// # use multi_seq_align::Alignment;
339 /// let align = Alignment::with_sequences(
340 /// &[
341 /// b"AVEQTPRK".to_vec(),
342 /// b"SVEQTPRK".to_vec(),
343 /// b"SVEQTPKK".to_vec(),
344 /// ],
345 /// )
346 /// .unwrap();
347 ///
348 /// assert_eq!(*align.length(), 8);
349 /// assert_eq!(*align.n_sequences(), 3);
350 /// ```
351 ///
352 /// # Errors
353 ///
354 /// Will return an error if `names`, `descriptions` and `sequences` have different lengths, and also if the sequences have different lengths (based on the first sequence).
355 pub fn with_sequences(sequences: &[Vec<T>]) -> Result<Self, MultiSeqAlignError>
356 where
357 T: Clone,
358 {
359 let length = utils::first_sequence_length(sequences);
360 utils::check_unequal_lengths(sequences, length)?;
361
362 let n_sequences = sequences.len();
363
364 let sequences_vec = sequences.iter().flat_map(|x| x.to_vec()).collect();
365
366 Ok(Self {
367 sequences: sequences_vec,
368 n_sequences,
369 length,
370 })
371 }
372
373 /// Add a sequence to `self`
374 ///
375 /// The new sequence must have the same length than `self.length`.
376 ///
377 /// # Examples
378 ///
379 /// ```rust
380 /// # use multi_seq_align::Alignment;
381 /// let mut align = Alignment::new(8);
382 ///
383 /// assert_eq!(*align.n_sequences(), 0);
384 ///
385 /// align
386 /// .add(b"AVEQTPRK".to_vec())
387 /// .unwrap();
388 ///
389 /// assert_eq!(*align.n_sequences(), 1);
390 ///
391 /// align
392 /// .add(b"SVEQTPRK".to_vec())
393 /// .unwrap();
394 ///
395 /// assert_eq!(*align.n_sequences(), 2);
396 /// ```
397 ///
398 /// # Errors
399 ///
400 /// Will return an error if the length of `sequence` is different from the one of the alignment.
401 pub fn add<'a>(&'a mut self, sequence: Vec<T>) -> Result<&'a mut Self, MultiSeqAlignError> {
402 if sequence.len() != self.length {
403 return Err(MultiSeqAlignError::NewSequenceOfDifferentLength {
404 expected_length: self.length,
405 // sequences_name: name,
406 found_length: sequence.len(),
407 });
408 }
409
410 self.sequences.extend(sequence);
411
412 self.n_sequences += 1;
413
414 Ok(self)
415 }
416
417 /// Returns all amino acids / bases at a `position` in the alignment `self`. The returned vector has a length equal of number of sequences in `self`.
418 ///
419 /// # Examples
420 ///
421 /// ```rust
422 /// # use multi_seq_align::Alignment;
423 /// let align = Alignment::<u8>::with_sequences(
424 /// &[b"ELK".to_vec(), b"ILK".to_vec()],
425 /// )
426 /// .unwrap();
427 ///
428 /// assert_eq!(align.nth_position(0).unwrap(), &[&b'E', &b'I']);
429 /// ```
430 /// # Panics
431 ///
432 /// Panics if `n` is greater or equal to the `length` of the Alignment.
433 #[must_use]
434 pub fn nth_position(&self, n: usize) -> Option<Vec<&T>> {
435 assert!(n < self.length);
436 (0..self.n_sequences)
437 .map(|i| self.sequences.get(i * self.length + n))
438 .collect::<Vec<Option<&T>>>()
439 .into_iter()
440 .collect::<Option<Vec<&T>>>()
441 }
442
443 /// Returns all amino acids / bases of the sequence at the `index` of the Alignment `self`. The returned vector has a length equal to the length of the Alignment `self`.
444 ///
445 /// # Examples
446 ///
447 /// ```rust
448 /// # use multi_seq_align::Alignment;
449 /// let align = Alignment::<u8>::with_sequences(
450 /// &[b"ELK".to_vec(), b"ILK".to_vec()],
451 /// )
452 /// .unwrap();
453 ///
454 /// assert_eq!(align.nth_sequence(1).unwrap(), &[&b'I', &b'L', &b'K']);
455 /// ```
456 /// # Panics
457 ///
458 /// Panics if `index` is greater or equal to the `n_sequences` of the Alignment.
459 #[must_use]
460 pub fn nth_sequence(&self, index: usize) -> Option<Vec<&T>> {
461 debug_assert!(index < self.n_sequences);
462
463 (0..self.length)
464 .map(|i| self.sequences.get(self.length * index + i))
465 .collect::<Vec<Option<&T>>>()
466 .into_iter()
467 .collect::<Option<Vec<&T>>>()
468 }
469}
470
471impl<A> FromIterator<Vec<A>> for Alignment<A>
472where
473 A: Clone,
474{
475 /// # Panics
476 ///
477 /// Panics if sequences are of different lengths
478 fn from_iter<I: IntoIterator<Item = Vec<A>>>(iter: I) -> Self {
479 let mut length: Option<usize> = None;
480 let mut n_sequences = 0_usize;
481
482 let sequences = iter
483 .into_iter()
484 .flat_map(|x| {
485 if length.is_none() {
486 length = Some(x.len());
487 } else if Some(x.len()) != length {
488 panic!("sequences of different lengths");
489 }
490
491 n_sequences += 1;
492 x.to_vec()
493 })
494 .collect::<Vec<_>>();
495
496 Self {
497 sequences,
498 n_sequences,
499 length: length.unwrap(),
500 }
501 }
502}
503
504#[cfg(test)]
505mod tests {
506 use super::*;
507 use pretty_assertions::assert_eq;
508
509 #[test]
510 fn new_align() {
511 let x = Alignment::<char>::new(5_usize);
512 assert!(x.sequences.is_empty());
513 assert_eq!(x.length, 5_usize);
514 assert_eq!(x.n_sequences, 0_usize);
515 }
516
517 #[test]
518 fn new_alignment_with_desc() {
519 let x = Alignment::<u8>::with_sequences(&[b"ELK".to_vec(), b"ILK".to_vec()]).unwrap();
520
521 assert_eq!(x.sequences, vec![b'E', b'L', b'K', b'I', b'L', b'K']);
522 assert_eq!(x.length, 3);
523 assert_eq!(x.n_sequences, 2);
524 }
525
526 #[test]
527 fn add_1_sequence() {
528 let mut align =
529 Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
530
531 align.add(b"ALRYITAT".to_vec()).unwrap();
532
533 assert_eq!(align.n_sequences, 3_usize);
534 assert_eq!(align.nth_position(3).unwrap(), vec![&b'H', &b'-', &b'Y'])
535 }
536
537 #[test]
538 fn add_1_sequence_wrong_length() {
539 let mut x = Alignment::new(3_usize);
540 let error = x.add(b"ILKAV".to_vec()).err().unwrap();
541 let expected = MultiSeqAlignError::NewSequenceOfDifferentLength {
542 expected_length: 3_usize,
543 found_length: 5_usize,
544 };
545 assert_eq!(error, expected);
546 }
547
548 #[test]
549 fn add_to_new() {
550 let mut x = Alignment::new(3_usize);
551
552 x.add(b"ELK".to_vec()).unwrap();
553 assert_eq!(x.n_sequences, 1_usize);
554 assert_eq!(x.length, 3_usize);
555 assert_eq!(x.sequences.len(), 3_usize);
556
557 x.add(b"ILK".to_vec()).unwrap();
558
559 assert_eq!(x.n_sequences, 2_usize);
560 assert_eq!(x.length, 3_usize);
561 assert_eq!(x.sequences.len(), 6_usize);
562 }
563
564 #[test]
565 fn empty_align() {
566 let mut x = Alignment::new(3_usize);
567 assert!(x.is_empty());
568 x.add(b"ILK".to_vec()).unwrap();
569 assert!(!x.is_empty());
570 }
571
572 #[test]
573 fn nth_residues_3() {
574 let align =
575 Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
576 assert_eq!(align.nth_position(3).unwrap(), vec![&b'H', &b'-'])
577 }
578
579 #[test]
580 fn nth_residues_more_seqs() {
581 let align = Alignment::with_sequences(&[
582 b"ALKHITAN".to_vec(),
583 b"VLK-ITAN".to_vec(),
584 b"ALKWITAN".to_vec(),
585 b"VLKMITAN".to_vec(),
586 ])
587 .unwrap();
588 assert_eq!(
589 align.nth_position(3).unwrap(),
590 vec![&b'H', &b'-', &b'W', &b'M']
591 )
592 }
593
594 #[test]
595 #[should_panic(expected = "assertion failed: n < self.length")]
596 fn nth_residues_out() {
597 let align =
598 Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
599 let _out_of_bonds = align.nth_position(10);
600 }
601
602 #[test]
603 fn different_seq_lengths() {
604 let error = Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN---".to_vec()])
605 .err()
606 .unwrap();
607
608 let expected = MultiSeqAlignError::MultipleSequencesOfDifferentLengths {
609 expected_length: 8,
610 found_lengths: vec![11],
611 };
612 assert_eq!(error, expected);
613 }
614
615 #[test]
616 fn for_positions() {
617 let align =
618 Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
619
620 let mut x = Vec::new();
621
622 for col in align.iter_positions() {
623 x.push(col);
624 }
625
626 assert_eq!(x.len(), 8);
627 assert_eq!(x.get(0).unwrap(), &[&b'A', &b'V']);
628 assert_eq!(x.get(3).unwrap(), &[&b'H', &b'-']);
629 }
630
631 #[test]
632 #[should_panic]
633 fn for_positions_out_of_bonds() {
634 let align =
635 Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
636 let mut x = Vec::new();
637
638 for col in align.iter_positions() {
639 x.push(col);
640 }
641
642 let _ = x.get(22).unwrap();
643 }
644
645 #[test]
646 fn for_positions_exact() {
647 let align =
648 Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
649
650 assert_eq!(align.iter_positions().len(), 8);
651 assert_eq!(align.iter_positions().next().unwrap().len(), 2);
652 }
653
654 #[test]
655 fn for_sequences() {
656 let align =
657 Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
658
659 let mut x = Vec::new();
660
661 for row in align.iter_sequences() {
662 assert_eq!(row.len(), 8);
663 x.push(row);
664 }
665
666 assert_eq!(x.len(), 2)
667 }
668
669 #[test]
670 fn for_sequences_exact() {
671 let align =
672 Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
673
674 assert_eq!(align.iter_sequences().len(), 2);
675 assert_eq!(align.iter_sequences().next().unwrap().len(), 8);
676 }
677
678 #[test]
679 fn for_sequences_collect() {
680 let align =
681 Alignment::with_sequences(&[b"ALKHITAN".to_vec(), b"VLK-ITAN".to_vec()]).unwrap();
682
683 assert_eq!(align.iter_sequences().len(), 2);
684 assert_eq!(align.iter_sequences().next().unwrap().len(), 8);
685 }
686}