1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
//! Traits for genome sequences.

use crate::interface::alphabet::{Alphabet, AlphabetCharacter, AlphabetError};
use crate::interface::k_mer::OwnedKmer;
use std::cmp::Ordering;
use std::iter;
use std::iter::{FromIterator, Map, Repeat, Rev, Zip};
use std::ops::Range;
use traitsequence::interface::{EditableSequence, OwnedSequence, Sequence, SequenceMut};

pub mod neighbor_iterators;

/// An iterator over the reverse complement of a genome sequence.
///
/// It walks the underlying iterator `I` back to front (`Rev`) and maps every borrowed
/// character through a plain function pointer that returns an owned character.
/// Using a function pointer instead of a closure type keeps the iterator type nameable,
/// so it can serve as the return type of [`GenomeSequence::reverse_complement_iter`].
pub type ReverseComplementIterator<I, AlphabetType> = Map<
    Rev<I>,
    for<'c> fn(
        &'c <AlphabetType as Alphabet>::CharacterType,
    ) -> <AlphabetType as Alphabet>::CharacterType,
>;

/// An iterator over the cloned k-mers of a genome sequence.
///
/// The inner iterator pairs each `usize` offset of a range with a reference to the genome
/// sequence, and a plain function pointer turns every such pair into a `KmerType`.
/// This is the return type of [`GenomeSequence::cloned_k_mer_iter`].
pub type OwnedKmerIterator<'a, GenomeSequenceType, KmerType> = Map<
    Zip<Range<usize>, Repeat<&'a GenomeSequenceType>>,
    fn((usize, &'a GenomeSequenceType)) -> KmerType,
>;

/// A genome sequence.
///
/// A genome sequence is a [`Sequence`] whose items are the characters of `AlphabetType`.
/// `GenomeSubsequence` is the type through which (parts of) the sequence are borrowed.
/// Every method has a default implementation that is built on top of [`Sequence`].
pub trait GenomeSequence<
    AlphabetType: Alphabet,
    GenomeSubsequence: GenomeSequence<AlphabetType, GenomeSubsequence> + ?Sized,
>: Sequence<AlphabetType::CharacterType, GenomeSubsequence>
{
    /// Returns true if this genome is valid, i.e. it contains no invalid characters.
    ///
    /// The default implementation considers every genome valid.
    fn is_valid(&self) -> bool {
        true
    }

    /// Copies this genome string into a `Vec` of ASCII characters.
    ///
    /// Every character is cloned and converted with `AlphabetType::character_to_ascii`.
    fn clone_as_vec(&self) -> Vec<u8> {
        self.iter()
            .cloned()
            .map(AlphabetType::character_to_ascii)
            .collect()
    }

    /// Get a reference to this genome as its subsequence type.
    ///
    /// The returned subsequence spans the whole genome, i.e. the range `0..self.len()`.
    fn as_genome_subsequence(&self) -> &GenomeSubsequence {
        self.index(0..self.len())
    }

    /// Returns the genome as nucleotide string.
    ///
    /// # Panics
    ///
    /// Panics if the ASCII representation produced by
    /// [`clone_as_vec`](GenomeSequence::clone_as_vec) is not valid UTF-8.
    fn as_string(&self) -> String {
        String::from_utf8(self.clone_as_vec())
            .expect("Genome contains non-utf8 characters (It should be ASCII only).")
    }

    /// Returns an iterator over the reverse complement of this genome.
    /// Panics if the iterator hits an invalid character (see [not valid](GenomeSequence::is_valid)).
    ///
    /// The characters are visited back to front and each one is replaced by its complement.
    fn reverse_complement_iter(
        &self,
    ) -> ReverseComplementIterator<Self::Iterator<'_>, AlphabetType> {
        self.iter()
            .rev()
            .map(AlphabetType::CharacterType::complement)
    }

    /// Returns an iterator over the k-mers of this genome.
    /// The k-mers are cloned from this genome.
    ///
    /// One k-mer is produced per start offset, so a genome of length `len >= K` yields
    /// `len - K + 1` k-mers. If the genome is shorter than `K`, the iterator is empty.
    fn cloned_k_mer_iter<
        const K: usize,
        KmerType: OwnedKmer<K, AlphabetType, GenomeSubsequence>,
    >(
        &self,
    ) -> OwnedKmerIterator<'_, Self, KmerType> {
        // `len - K + 1` underflows when the genome is shorter than `K`,
        // hence the number of k-mers is derived from a checked subtraction.
        let k_mer_count = self
            .len()
            .checked_sub(K)
            .map_or(0, |last_offset| last_offset + 1);

        (0..k_mer_count)
            .zip(iter::repeat(self))
            .map(|(offset, source_genome)| {
                source_genome.iter().skip(offset).take(K).cloned().collect()
            })
    }

    /// Returns an owned copy of the reverse complement of this genome.
    /// Panics if this genome is [not valid](GenomeSequence::is_valid).
    ///
    /// The result type is chosen by the caller, which allows converting between
    /// different genome sequence representations over the same alphabet.
    fn convert_with_reverse_complement<
        ReverseComplementSequence: OwnedGenomeSequence<AlphabetType, ReverseComplementSubsequence>,
        ReverseComplementSubsequence: GenomeSequence<AlphabetType, ReverseComplementSubsequence> + ?Sized,
    >(
        &self,
    ) -> ReverseComplementSequence {
        self.reverse_complement_iter().collect()
    }

    /// Returns an owned copy of this genome.
    ///
    /// The result type is chosen by the caller, which allows converting between
    /// different genome sequence representations over the same alphabet.
    fn convert<
        ResultSequence: OwnedGenomeSequence<AlphabetType, ResultSubsequence>,
        ResultSubsequence: GenomeSequence<AlphabetType, ResultSubsequence> + ?Sized,
    >(
        &self,
    ) -> ResultSequence {
        self.iter().cloned().collect()
    }

    /// Returns true if the genome is canonical.
    /// A canonical genome is lexicographically smaller or equal to its reverse complement.
    fn is_canonical(&self) -> bool {
        // The first position at which the genome and its reverse complement differ decides.
        for (forward_character, reverse_character) in
            self.iter().cloned().zip(self.reverse_complement_iter())
        {
            match forward_character.cmp(&reverse_character) {
                Ordering::Less => return true,
                Ordering::Greater => return false,
                _ => {}
            }
        }
        // No position differs, so the genome equals its reverse complement.
        true
    }

    /// Returns true if the genome is self-complemental.
    /// A self-complemental genome is equivalent to its reverse complement.
    fn is_self_complemental(&self) -> bool {
        self.iter().cloned().eq(self.reverse_complement_iter())
    }
}

/// A genome sequence that is owned, i.e. not a reference.
///
/// Owned genome sequences can be built from an iterator over alphabet characters,
/// which is what the provided constructors rely on.
pub trait OwnedGenomeSequence<
    AlphabetType: Alphabet,
    GenomeSubsequence: GenomeSequence<AlphabetType, GenomeSubsequence> + ?Sized,
>:
    GenomeSequence<AlphabetType, GenomeSubsequence>
    + FromIterator<AlphabetType::CharacterType>
    + OwnedSequence<AlphabetType::CharacterType, GenomeSubsequence>
{
    /// Builds a new genome that is the reverse complement of this one.
    /// Panics if this genome is [not valid](GenomeSequence::is_valid).
    fn clone_as_reverse_complement(&self) -> Self {
        FromIterator::from_iter(self.reverse_complement_iter())
    }

    /// Builds an owned genome sequence from anything that iterates over ASCII characters.
    ///
    /// # Errors
    ///
    /// Returns an [`AlphabetError`] if a character cannot be converted into a character
    /// of the alphabet.
    fn from_iter_u8<T: IntoIterator<Item = u8>>(iter: T) -> Result<Self, AlphabetError> {
        let characters = iter.into_iter().map(AlphabetType::ascii_to_character);
        characters.collect::<Result<Self, AlphabetError>>()
    }

    /// Builds an owned genome sequence from a slice of ASCII characters.
    ///
    /// # Errors
    ///
    /// Returns an [`AlphabetError`] if a character cannot be converted into a character
    /// of the alphabet.
    fn from_slice_u8(slice: &[u8]) -> Result<Self, AlphabetError> {
        let ascii_characters = slice.iter().copied();
        Self::from_iter_u8(ascii_characters)
    }
}

/// A mutable genome sequence.
///
/// Combines [`SequenceMut`] with [`GenomeSequence`] for the same subsequence type.
pub trait GenomeSequenceMut<
    AlphabetType: Alphabet,
    GenomeSubsequenceMut: GenomeSequenceMut<AlphabetType, GenomeSubsequenceMut> + ?Sized,
>:
    SequenceMut<AlphabetType::CharacterType, GenomeSubsequenceMut>
    + GenomeSequence<AlphabetType, GenomeSubsequenceMut>
{
    /// Get a mutable reference to this genome as its subsequence type.
    ///
    /// The returned subsequence spans the whole genome, i.e. the range `0..self.len()`.
    fn as_genome_subsequence_mut(&mut self) -> &mut GenomeSubsequenceMut {
        let whole_genome = 0..self.len();
        self.index_mut(whole_genome)
    }
}

/// The iterator type returned by [`EditableGenomeSequence::into_iter_u8`].
///
/// It is the owning iterator of `SourceType`, mapped through a plain function pointer
/// that converts a character of `AlphabetType` into a `u8`.
type IntoIterU8<SourceType, AlphabetType> = Map<
    <SourceType as IntoIterator>::IntoIter,
    fn(<AlphabetType as Alphabet>::CharacterType) -> u8,
>;

/// An editable genome sequence.
///
/// Implementors provide [`reserve`](EditableGenomeSequence::reserve),
/// [`resize`](EditableGenomeSequence::resize) and [`push`](EditableGenomeSequence::push);
/// the ASCII-based helpers are provided on top of them.
pub trait EditableGenomeSequence<
    AlphabetType: Alphabet,
    GenomeSubsequence: GenomeSequence<AlphabetType, GenomeSubsequence> + ?Sized,
>:
    EditableSequence<AlphabetType::CharacterType, GenomeSubsequence>
    + GenomeSequence<AlphabetType, GenomeSubsequence>
{
    /// Consumes this genome sequence and iterates over its characters as ASCII.
    fn into_iter_u8(self) -> IntoIterU8<Self, AlphabetType> {
        let to_ascii: fn(AlphabetType::CharacterType) -> u8 = AlphabetType::character_to_ascii;
        self.into_iter().map(to_ascii)
    }

    /// Appends the given ASCII characters to this genome.
    ///
    /// Memory is reserved up front according to the lower bound of the iterator's size hint.
    ///
    /// # Errors
    ///
    /// If a character cannot be converted into a character of the alphabet, the genome is
    /// resized back to the length it had before the call and the conversion error is returned.
    ///
    /// # Panics
    ///
    /// On the error path, panics if `from_index(0)` does not yield a character.
    fn extend_from_iter_u8<IteratorType: IntoIterator<Item = u8>>(
        &mut self,
        iter: IteratorType,
    ) -> Result<(), AlphabetError> {
        // Remember where to roll back to in case a character is rejected.
        let rollback_len = self.len();
        let ascii_characters = iter.into_iter();
        self.reserve(ascii_characters.size_hint().0);

        for ascii in ascii_characters {
            let character = match AlphabetType::ascii_to_character(ascii) {
                Ok(character) => character,
                Err(error) => {
                    // `resize` requires a fill character even though this call only
                    // goes back to the original length.
                    let filler = AlphabetType::CharacterType::from_index(0).unwrap();
                    self.resize(rollback_len, filler);
                    return Err(error);
                }
            };
            self.push(character);
        }

        Ok(())
    }

    /// Appends the ASCII characters of the given slice to this genome.
    ///
    /// Behaves like [`extend_from_iter_u8`](EditableGenomeSequence::extend_from_iter_u8).
    fn extend_from_slice_u8(&mut self, slice: &[u8]) -> Result<(), AlphabetError> {
        let ascii_characters = slice.iter().copied();
        self.extend_from_iter_u8(ascii_characters)
    }

    /// Reserve memory for at least `additional` items.
    fn reserve(&mut self, additional: usize);

    /// Resize to contain the given number of items.
    /// Empty spaces are filled with the given default item.
    fn resize(&mut self, len: usize, default: AlphabetType::CharacterType);

    /// Insert the given character at the end of the genome sequence.
    fn push(&mut self, character: AlphabetType::CharacterType);
}