packed_seq/traits.rs
1use super::u32x8;
2use mem_dbg::{MemDbg, MemSize};
3use std::ops::Range;
4
5/// A non-owned slice of characters.
6///
7/// The represented character values are expected to be in `[0, 2^b)`,
8/// but they can be encoded in various ways. E.g.:
9/// - A `&[u8]` of ASCII characters, returning 8-bit values.
10/// - An `AsciiSeq` of DNA characters `ACGT`, interpreted 2-bit values.
11/// - A `PackedSeq` of packed DNA characters (4 per byte), returning 2-bit values.
12///
13/// Each character is assumed to fit in 8 bits. Some functions take or return
14/// this 'unpacked' (ASCII) character.
15pub trait Seq<'s>: Copy + Eq + Ord {
16 /// Number of encoded characters per byte of memory of the `Seq`.
17 const BASES_PER_BYTE: usize;
18 /// Number of bits `b` to represent each character returned by `iter_bp` and variants..
19 const BITS_PER_CHAR: usize;
20
21 /// The corresponding owned sequence type.
22 type SeqVec: SeqVec;
23
24 /// Convenience function that returns `b=Self::BITS_PER_CHAR`.
25 fn bits_per_char(&self) -> usize {
26 Self::BITS_PER_CHAR
27 }
28
29 /// The length of the sequence in characters.
30 fn len(&self) -> usize;
31
32 /// Returns `true` if the sequence is empty.
33 fn is_empty(&self) -> bool;
34
35 /// Get the character at the given index.
36 fn get(&self, _index: usize) -> u8;
37
38 /// Get the ASCII character at the given index, _without_ mapping to `b`-bit values.
39 fn get_ascii(&self, _index: usize) -> u8;
40
41 /// Convert a short sequence (kmer) to a packed representation as `u64`.
42 fn as_u64(&self) -> u64;
43
44 /// Convert a short sequence (kmer) to a packed representation of its reverse complement as `u64`.
45 fn revcomp_as_u64(&self) -> u64;
46
47 /// Convert a short sequence (kmer) to a packed representation as `u128`.
48 fn as_u128(&self) -> u128;
49
50 /// Convert a short sequence (kmer) to a packed representation of its reverse complement as `u128`.
51 fn revcomp_as_u128(&self) -> u128;
52
53 /// Convert a short sequence (kmer) to a packed representation as `usize`.
54 #[deprecated = "Prefer `to_u64`."]
55 #[inline(always)]
56 fn to_word(&self) -> usize {
57 self.as_u64() as usize
58 }
59
60 /// Convert a short sequence (kmer) to a packed representation of its reverse complement as `usize`.
61 #[deprecated = "Prefer `revcomp_to_u64`."]
62 #[inline(always)]
63 fn to_word_revcomp(&self) -> usize {
64 self.revcomp_as_u64() as usize
65 }
66
67 /// Convert to an owned version.
68 fn to_vec(&self) -> Self::SeqVec;
69
70 /// Compute the reverse complement of this sequence.
71 fn to_revcomp(&self) -> Self::SeqVec;
72
73 /// Get a sub-slice of the sequence.
74 /// `range` indicates character indices.
75 fn slice(&self, range: Range<usize>) -> Self;
76
77 /// Extract a k-mer from this sequence.
78 #[inline(always)]
79 fn read_kmer(&self, k: usize, pos: usize) -> u64 {
80 self.slice(pos..pos + k).as_u64()
81 }
82
83 /// Extract a reverse complement k-mer from this sequence.
84 #[inline(always)]
85 fn read_revcomp_kmer(&self, k: usize, pos: usize) -> u64 {
86 self.slice(pos..pos + k).revcomp_as_u64()
87 }
88
89 /// Iterate over the `b`-bit characters of the sequence.
90 fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> + Clone;
91
92 /// Iterate over 8 chunks of `b`-bit characters of the sequence in parallel.
93 ///
94 /// This splits the input into 8 chunks and streams over them in parallel.
95 /// The second output returns the number of 'padding' characters that was added to get a full number of SIMD lanes.
96 /// Thus, the last `padding` number of returned elements (from the last lane(s)) should be ignored.
97 /// The context can be e.g. the k-mer size being iterated.
98 /// When `context>1`, consecutive chunks overlap by `context-1` bases.
99 ///
100 /// Expected to be implemented using SIMD instructions.
101 fn par_iter_bp(self, context: usize) -> (impl ExactSizeIterator<Item = u32x8> + Clone, usize);
102
103 /// Iterate over 8 chunks of the sequence in parallel, returning two characters offset by `delay` positions.
104 ///
105 /// Returned pairs are `(add, remove)`, and the first `delay` 'remove' characters are always `0`.
106 ///
107 /// For example, when the sequence starts as `ABCDEF...`, and `delay=2`,
108 /// the first returned tuples in the first lane are:
109 /// `(b'A', 0)`, `(b'B', 0)`, `(b'C', b'A')`, `(b'D', b'B')`.
110 ///
111 /// When `context>1`, consecutive chunks overlap by `context-1` bases:
112 /// the first `context-1` 'added' characters of the second chunk overlap
113 /// with the last `context-1` 'added' characters of the first chunk.
114 fn par_iter_bp_delayed(
115 self,
116 context: usize,
117 delay: usize,
118 ) -> (impl ExactSizeIterator<Item = (u32x8, u32x8)> + Clone, usize);
119
120 /// Iterate over 8 chunks of the sequence in parallel, returning three characters:
121 /// the char added, the one `delay` positions before, and the one `delay2` positions before.
122 ///
123 /// Requires `delay1 <= delay2`.
124 ///
125 /// Returned pairs are `(add, d1, d2)`. The first `delay1` `d1` characters and first `delay2` `d2` are always `0`.
126 ///
127 /// For example, when the sequence starts as `ABCDEF...`, and `delay1=2` and `delay2=3`,
128 /// the first returned tuples in the first lane are:
129 /// `(b'A', 0, 0)`, `(b'B', 0, 0)`, `(b'C', b'A', 0)`, `(b'D', b'B', b'A')`.
130 ///
131 /// When `context>1`, consecutive chunks overlap by `context-1` bases:
132 /// the first `context-1` 'added' characters of the second chunk overlap
133 /// with the last `context-1` 'added' characters of the first chunk.
134 fn par_iter_bp_delayed_2(
135 self,
136 context: usize,
137 delay1: usize,
138 delay2: usize,
139 ) -> (
140 impl ExactSizeIterator<Item = (u32x8, u32x8, u32x8)> + Clone,
141 usize,
142 );
143
144 /// Compare and return the LCP of the two sequences.
145 fn cmp_lcp(&self, other: &Self) -> (std::cmp::Ordering, usize);
146}
147
148// Some hacky stuff to make conditional supertraits.
149cfg_if::cfg_if! {
150 if #[cfg(feature = "epserde")] {
151 pub use epserde::{deser::DeserializeInner, ser::SerializeInner};
152 } else {
153 pub trait SerializeInner {}
154 pub trait DeserializeInner {}
155
156 impl SerializeInner for Vec<u8> {}
157 impl DeserializeInner for Vec<u8> {}
158 impl SerializeInner for crate::AsciiSeqVec {}
159 impl DeserializeInner for crate::AsciiSeqVec {}
160 impl SerializeInner for crate::PackedSeqVec {}
161 impl DeserializeInner for crate::PackedSeqVec {}
162 }
163}
164
165/// An owned sequence.
166/// Can be constructed from either ASCII input or the underlying non-owning `Seq` type.
167///
168/// Implemented for:
169/// - A `Vec<u8>` of ASCII characters, returning 8-bit values.
170/// - An `AsciiSeqVec` of DNA characters `ACGT`, interpreted as 2-bit values.
171/// - A `PackedSeqVec` of packed DNA characters (4 per byte), returning 2-bit values.
172pub trait SeqVec:
173 Default + Sync + SerializeInner + DeserializeInner + MemSize + MemDbg + Clone + 'static
174{
175 type Seq<'s>: Seq<'s>;
176
177 /// Get a non-owning slice to the underlying sequence.
178 ///
179 /// Unfortunately, `Deref` into a `Seq` can not be supported.
180 fn as_slice(&self) -> Self::Seq<'_>;
181
182 /// Get a sub-slice of the sequence. Indices are character offsets.
183 #[inline(always)]
184 fn slice(&self, range: Range<usize>) -> Self::Seq<'_> {
185 self.as_slice().slice(range)
186 }
187
188 /// Extract a k-mer from this sequence.
189 #[inline(always)]
190 fn read_kmer(&self, k: usize, pos: usize) -> u64 {
191 self.as_slice().read_kmer(k, pos)
192 }
193
194 /// Extract a k-mer from this sequence.
195 #[inline(always)]
196 fn read_revcomp_kmer(&self, k: usize, pos: usize) -> u64 {
197 self.as_slice().read_revcomp_kmer(k, pos)
198 }
199
200 /// The length of the sequence in characters.
201 fn len(&self) -> usize;
202
203 /// Returns `true` if the sequence is empty.
204 fn is_empty(&self) -> bool;
205
206 /// Empty the sequence.
207 fn clear(&mut self);
208
209 /// Convert into the underlying raw representation.
210 fn into_raw(self) -> Vec<u8>;
211
212 /// Generate a random sequence with the given number of characters.
213 #[cfg(feature = "rand")]
214 fn random(n: usize) -> Self;
215
216 /// Create a `SeqVec` from ASCII input.
217 #[inline(always)]
218 fn from_ascii(seq: &[u8]) -> Self {
219 let mut packed_vec = Self::default();
220 packed_vec.push_ascii(seq);
221 packed_vec
222 }
223
224 /// Append the given sequence to the underlying storage.
225 ///
226 /// This may leave gaps (padding) between consecutively pushed sequences to avoid re-aligning the pushed data.
227 /// Returns the range of indices corresponding to the pushed sequence.
228 /// Use `self.slice(range)` to get the corresponding slice.
229 fn push_seq(&mut self, seq: Self::Seq<'_>) -> Range<usize>;
230
231 /// Append the given ASCII sequence to the underlying storage.
232 ///
233 /// This may leave gaps (padding) between consecutively pushed sequences to avoid re-aligning the pushed data.
234 /// Returns the range of indices corresponding to the pushed sequence.
235 /// Use `self.slice(range)` to get the corresponding slice.
236 fn push_ascii(&mut self, seq: &[u8]) -> Range<usize>;
237}