compact_genome/interface/
sequence.rs

//! Traits for genome sequences.
2
3use crate::interface::alphabet::{Alphabet, AlphabetCharacter, AlphabetError};
4use crate::interface::k_mer::OwnedKmer;
5use std::cmp::Ordering;
6use std::iter;
7use std::iter::{FromIterator, Map, Repeat, Rev, Zip};
8use std::ops::Range;
9use traitsequence::interface::{EditableSequence, OwnedSequence, Sequence, SequenceMut};
10
11pub mod neighbor_iterators;
12
13/// An iterator over the reverse complement of a genome sequence.
14pub type ReverseComplementIterator<I, AlphabetType> = Map<
15    Rev<I>,
16    for<'c> fn(
17        &'c <AlphabetType as Alphabet>::CharacterType,
18    ) -> <AlphabetType as Alphabet>::CharacterType,
19>;
20
/// An iterator over the cloned k-mers of a genome sequence.
///
/// Every offset of a `Range<usize>` is paired with a shared reference to the source genome,
/// and a function pointer turns each `(offset, genome)` pair into an owned k-mer.
/// This is the return type of [`GenomeSequence::cloned_k_mer_iter`].
pub type OwnedKmerIterator<'a, Genome, Kmer> = Map<
    Zip<Range<usize>, Repeat<&'a Genome>>,
    fn((usize, &'a Genome)) -> Kmer,
>;
26
27/// A genome sequence.
28pub trait GenomeSequence<
29    AlphabetType: Alphabet,
30    GenomeSubsequence: GenomeSequence<AlphabetType, GenomeSubsequence> + ?Sized,
31>: Sequence<AlphabetType::CharacterType, GenomeSubsequence>
32{
33    /// Returns true if this genome is valid, i.e. it contains no invalid characters.
34    fn is_valid(&self) -> bool {
35        true
36    }
37
38    /// Copies this genome string into a `Vec`.
39    fn clone_as_vec(&self) -> Vec<u8> {
40        self.iter()
41            .cloned()
42            .map(AlphabetType::character_to_ascii)
43            .collect()
44    }
45
46    /// Get a reference to this genome as its subsequence type.
47    fn as_genome_subsequence(&self) -> &GenomeSubsequence {
48        self.index(0..self.len())
49    }
50
51    /// Returns the genome as nucleotide string.
52    fn as_string(&self) -> String {
53        String::from_utf8(self.clone_as_vec())
54            .expect("Genome contains non-utf8 characters (It should be ASCII only).")
55    }
56
57    /// Returns an iterator over the reverse complement of this genome.
58    /// Panics if the iterator his an invalid character (see [not valid](GenomeSequence::is_valid)).
59    fn reverse_complement_iter(
60        &self,
61    ) -> ReverseComplementIterator<Self::Iterator<'_>, AlphabetType> {
62        self.iter()
63            .rev()
64            .map(AlphabetType::CharacterType::complement)
65    }
66
67    /// Returns an iterator over the k-mers of this genome.
68    /// The k-mers are cloned from this genome.
69    fn cloned_k_mer_iter<
70        const K: usize,
71        KmerType: OwnedKmer<K, AlphabetType, GenomeSubsequence>,
72    >(
73        &self,
74    ) -> OwnedKmerIterator<'_, Self, KmerType> {
75        (0..self.len() - K + 1)
76            .zip(iter::repeat(self))
77            .map(|(offset, source_genome)| {
78                source_genome.iter().skip(offset).take(K).cloned().collect()
79            })
80    }
81
82    /// Returns an owned copy of the reverse complement of this genome.
83    /// Panics if this genome is [not valid](GenomeSequence::is_valid).
84    fn convert_with_reverse_complement<
85        ReverseComplementSequence: OwnedGenomeSequence<AlphabetType, ReverseComplementSubsequence>,
86        ReverseComplementSubsequence: GenomeSequence<AlphabetType, ReverseComplementSubsequence> + ?Sized,
87    >(
88        &self,
89    ) -> ReverseComplementSequence {
90        self.reverse_complement_iter().collect()
91    }
92
93    /// Returns an owned copy of this genome.
94    fn convert<
95        ResultSequence: OwnedGenomeSequence<AlphabetType, ResultSubsequence>,
96        ResultSubsequence: GenomeSequence<AlphabetType, ResultSubsequence> + ?Sized,
97    >(
98        &self,
99    ) -> ResultSequence {
100        self.iter().cloned().collect()
101    }
102
103    /// Returns true if the genome is canonical.
104    /// A canonical genome is lexicographically smaller or equal to its reverse complement.
105    fn is_canonical(&self) -> bool {
106        for (forward_character, reverse_character) in
107            self.iter().cloned().zip(self.reverse_complement_iter())
108        {
109            match forward_character.cmp(&reverse_character) {
110                Ordering::Less => return true,
111                Ordering::Greater => return false,
112                _ => {}
113            }
114        }
115        true
116    }
117
118    /// Returns true if the genome is self-complemental.
119    /// A self-complemental genome is equivalent to its reverse complement.
120    fn is_self_complemental(&self) -> bool {
121        self.iter().cloned().eq(self.reverse_complement_iter())
122    }
123}
124
125/// A genome sequence that is owned, i.e. not a reference.
126pub trait OwnedGenomeSequence<
127    AlphabetType: Alphabet,
128    GenomeSubsequence: GenomeSequence<AlphabetType, GenomeSubsequence> + ?Sized,
129>:
130    GenomeSequence<AlphabetType, GenomeSubsequence>
131    + FromIterator<AlphabetType::CharacterType>
132    + OwnedSequence<AlphabetType::CharacterType, GenomeSubsequence>
133{
134    /// Returns the reverse complement of this genome.
135    /// Panics if this genome is [not valid](GenomeSequence::is_valid).
136    fn clone_as_reverse_complement(&self) -> Self {
137        self.reverse_complement_iter().collect()
138    }
139
140    /// Constructs an owned genome sequence from an `IntoIter` over ASCII characters.
141    /// If any character is not part of the alphabet, then `None` is returned.
142    fn from_iter_u8<T: IntoIterator<Item = u8>>(iter: T) -> Result<Self, AlphabetError> {
143        iter.into_iter()
144            .map(AlphabetType::ascii_to_character)
145            .collect()
146    }
147
148    /// Constructs an owned genome sequence from a slice of ASCII characters.
149    /// If any character is not part of the alphabet, then `None` is returned.
150    fn from_slice_u8(slice: &[u8]) -> Result<Self, AlphabetError> {
151        Self::from_iter_u8(slice.iter().copied())
152    }
153}
154
155/// A mutable genome sequence.
156pub trait GenomeSequenceMut<
157    AlphabetType: Alphabet,
158    GenomeSubsequenceMut: GenomeSequenceMut<AlphabetType, GenomeSubsequenceMut> + ?Sized,
159>:
160    SequenceMut<AlphabetType::CharacterType, GenomeSubsequenceMut>
161    + GenomeSequence<AlphabetType, GenomeSubsequenceMut>
162{
163    /// Get a reference to this genome as its subsequence type.
164    fn as_genome_subsequence_mut(&mut self) -> &mut GenomeSubsequenceMut {
165        self.index_mut(0..self.len())
166    }
167}
168
169type IntoIterU8<SourceType, AlphabetType> = Map<
170    <SourceType as IntoIterator>::IntoIter,
171    fn(<AlphabetType as Alphabet>::CharacterType) -> u8,
172>;
173
174/// An editable genome sequence.
175pub trait EditableGenomeSequence<
176    AlphabetType: Alphabet,
177    GenomeSubsequence: GenomeSequence<AlphabetType, GenomeSubsequence> + ?Sized,
178>:
179    EditableSequence<AlphabetType::CharacterType, GenomeSubsequence>
180    + GenomeSequence<AlphabetType, GenomeSubsequence>
181{
182    /// Converts this genome sequence into an iterator over ASCII characters.
183    fn into_iter_u8(self) -> IntoIterU8<Self, AlphabetType> {
184        self.into_iter().map(AlphabetType::character_to_ascii)
185    }
186
187    /// Extends this genome from a sequence of ASCII characters.
188    fn extend_from_iter_u8<IteratorType: IntoIterator<Item = u8>>(
189        &mut self,
190        iter: IteratorType,
191    ) -> Result<(), AlphabetError> {
192        let original_len = self.len();
193        let iter = iter.into_iter();
194        let (size, _) = iter.size_hint();
195        self.reserve(size);
196        for item in iter {
197            match AlphabetType::ascii_to_character(item) {
198                Ok(character) => self.push(character),
199                Err(error) => {
200                    self.resize(
201                        original_len,
202                        AlphabetType::CharacterType::from_index(0).unwrap(),
203                    );
204                    return Err(error);
205                }
206            }
207        }
208
209        Ok(())
210    }
211
212    /// Extends this genome from a sequence of ASCII characters.
213    fn extend_from_slice_u8(&mut self, slice: &[u8]) -> Result<(), AlphabetError> {
214        self.extend_from_iter_u8(slice.iter().copied())
215    }
216}