packed_seq/traits.rs
1use crate::{ChunkIt, PaddedIt};
2
3use super::u32x8;
4use mem_dbg::{MemDbg, MemSize};
5use std::ops::Range;
6
7/// Strong type indicating the delay passed to [`Seq::par_iter_bp_delayed`] and [`Seq::par_iter_bp_delayed_2`].
8#[derive(Debug, Clone, Copy, PartialEq, Eq)]
9pub struct Delay(pub usize);
10
11/// A non-owned slice of characters.
12///
13/// The represented character values are expected to be in `[0, 2^b)`,
14/// but they can be encoded in various ways. E.g.:
15/// - A `&[u8]` of ASCII characters, returning 8-bit values.
16/// - An `AsciiSeq` of DNA characters `ACGT`, interpreted 2-bit values.
17/// - A `PackedSeq` of packed DNA characters (4 per byte), returning 2-bit values.
18///
19/// Each character is assumed to fit in 8 bits. Some functions take or return
20/// this 'unpacked' (ASCII) character.
21pub trait Seq<'s>: Copy + Eq + Ord {
22 /// Number of bits `b` to represent each character returned by `iter_bp` and variants..
23 const BITS_PER_CHAR: usize;
24 /// Number of encoded characters per byte of memory of the `Seq`.
25 const BASES_PER_BYTE: usize;
26
27 /// The corresponding owned sequence type.
28 type SeqVec: SeqVec;
29
30 /// Convenience function that returns `b=Self::BITS_PER_CHAR`.
31 fn bits_per_char(&self) -> usize {
32 Self::BITS_PER_CHAR
33 }
34
35 /// The length of the sequence in characters.
36 fn len(&self) -> usize;
37
38 /// Returns `true` if the sequence is empty.
39 fn is_empty(&self) -> bool;
40
41 /// Get the character at the given index.
42 fn get(&self, _index: usize) -> u8;
43
44 /// Get the ASCII character at the given index, _without_ mapping to `b`-bit values.
45 fn get_ascii(&self, _index: usize) -> u8;
46
47 /// Convert a short sequence (kmer) to a packed representation as `u64`.
48 fn as_u64(&self) -> u64;
49
50 /// Convert a short sequence (kmer) to a packed representation of its reverse complement as `u64`.
51 fn revcomp_as_u64(&self) -> u64;
52
53 /// Convert a short sequence (kmer) to a packed representation as `u128`.
54 fn as_u128(&self) -> u128;
55
56 /// Convert a short sequence (kmer) to a packed representation of its reverse complement as `u128`.
57 fn revcomp_as_u128(&self) -> u128;
58
59 /// Convert a short sequence (kmer) to a packed representation as `usize`.
60 #[deprecated = "Prefer `to_u64`."]
61 #[inline(always)]
62 fn to_word(&self) -> usize {
63 self.as_u64() as usize
64 }
65
66 /// Convert a short sequence (kmer) to a packed representation of its reverse complement as `usize`.
67 #[deprecated = "Prefer `revcomp_to_u64`."]
68 #[inline(always)]
69 fn to_word_revcomp(&self) -> usize {
70 self.revcomp_as_u64() as usize
71 }
72
73 /// Convert to an owned version.
74 fn to_vec(&self) -> Self::SeqVec;
75
76 /// Compute the reverse complement of this sequence.
77 fn to_revcomp(&self) -> Self::SeqVec;
78
79 /// Get a sub-slice of the sequence.
80 /// `range` indicates character indices.
81 fn slice(&self, range: Range<usize>) -> Self;
82
83 /// Extract a k-mer from this sequence.
84 #[inline(always)]
85 fn read_kmer(&self, k: usize, pos: usize) -> u64 {
86 self.slice(pos..pos + k).as_u64()
87 }
88
89 /// Extract a reverse complement k-mer from this sequence.
90 #[inline(always)]
91 fn read_revcomp_kmer(&self, k: usize, pos: usize) -> u64 {
92 self.slice(pos..pos + k).revcomp_as_u64()
93 }
94
95 /// Extract a k-mer from this sequence.
96 #[inline(always)]
97 fn read_kmer_u128(&self, k: usize, pos: usize) -> u128 {
98 self.slice(pos..pos + k).as_u128()
99 }
100
101 /// Extract a reverse complement k-mer from this sequence.
102 #[inline(always)]
103 fn read_revcomp_kmer_u128(&self, k: usize, pos: usize) -> u128 {
104 self.slice(pos..pos + k).revcomp_as_u128()
105 }
106
107 /// Iterate over the `b`-bit characters of the sequence.
108 fn iter_bp(self) -> impl ExactSizeIterator<Item = u8>;
109
110 /// Iterate over 8 chunks of `b`-bit characters of the sequence in parallel.
111 ///
112 /// This splits the input into 8 chunks and streams over them in parallel.
113 /// The second output returns the number of 'padding' characters that was added to get a full number of SIMD lanes.
114 /// Thus, the last `padding` number of returned elements (from the last lane(s)) should be ignored.
115 /// The context can be e.g. the k-mer size being iterated.
116 /// When `context>1`, consecutive chunks overlap by `context-1` bases.
117 ///
118 /// Expected to be implemented using SIMD instructions.
119 fn par_iter_bp(self, context: usize) -> PaddedIt<impl ChunkIt<u32x8>>;
120
121 /// Iterate over 8 chunks of the sequence in parallel, returning two characters offset by `delay` positions.
122 ///
123 /// Returned pairs are `(add, remove)`, and the first `delay` 'remove' characters are always `0`.
124 ///
125 /// For example, when the sequence starts as `ABCDEF...`, and `delay=2`,
126 /// the first returned tuples in the first lane are:
127 /// `(b'A', 0)`, `(b'B', 0)`, `(b'C', b'A')`, `(b'D', b'B')`.
128 ///
129 /// When `context>1`, consecutive chunks overlap by `context-1` bases:
130 /// the first `context-1` 'added' characters of the second chunk overlap
131 /// with the last `context-1` 'added' characters of the first chunk.
132 fn par_iter_bp_delayed(
133 self,
134 context: usize,
135 delay: Delay,
136 ) -> PaddedIt<impl ChunkIt<(u32x8, u32x8)>>;
137
138 /// Iterate over 8 chunks of the sequence in parallel, returning three characters:
139 /// the char added, the one `delay` positions before, and the one `delay2` positions before.
140 ///
141 /// Requires `delay1 <= delay2`.
142 ///
143 /// Returned pairs are `(add, d1, d2)`. The first `delay1` `d1` characters and first `delay2` `d2` are always `0`.
144 ///
145 /// For example, when the sequence starts as `ABCDEF...`, and `delay1=2` and `delay2=3`,
146 /// the first returned tuples in the first lane are:
147 /// `(b'A', 0, 0)`, `(b'B', 0, 0)`, `(b'C', b'A', 0)`, `(b'D', b'B', b'A')`.
148 ///
149 /// When `context>1`, consecutive chunks overlap by `context-1` bases:
150 /// the first `context-1` 'added' characters of the second chunk overlap
151 /// with the last `context-1` 'added' characters of the first chunk.
152 fn par_iter_bp_delayed_2(
153 self,
154 context: usize,
155 delay1: Delay,
156 delay2: Delay,
157 ) -> PaddedIt<impl ChunkIt<(u32x8, u32x8, u32x8)>>;
158
159 /// Compare and return the LCP of the two sequences.
160 fn cmp_lcp(&self, other: &Self) -> (std::cmp::Ordering, usize);
161}
162
163// Some hacky stuff to make conditional supertraits.
164cfg_if::cfg_if! {
165 if #[cfg(feature = "epserde")] {
166 pub use epserde::{deser::DeserializeInner, ser::SerializeInner};
167 } else {
168 pub trait SerializeInner {}
169 pub trait DeserializeInner {}
170
171 use crate::packed_seq::{Bits, SupportedBits, PackedSeqVecBase};
172
173 impl SerializeInner for Vec<u8> {}
174 impl DeserializeInner for Vec<u8> {}
175 impl SerializeInner for crate::AsciiSeqVec {}
176 impl DeserializeInner for crate::AsciiSeqVec {}
177 impl<const B: usize> SerializeInner for PackedSeqVecBase<B> where Bits<B>:SupportedBits {}
178 impl<const B: usize> DeserializeInner for PackedSeqVecBase<B> where Bits<B>:SupportedBits {}
179 }
180}
181
182/// An owned sequence.
183/// Can be constructed from either ASCII input or the underlying non-owning `Seq` type.
184///
185/// Implemented for:
186/// - A `Vec<u8>` of ASCII characters, returning 8-bit values.
187/// - An `AsciiSeqVec` of DNA characters `ACGT`, interpreted as 2-bit values.
188/// - A `PackedSeqVec` of packed DNA characters (4 per byte), returning 2-bit values.
189pub trait SeqVec:
190 Default + Sync + SerializeInner + DeserializeInner + MemSize + MemDbg + Clone + 'static
191{
192 type Seq<'s>: Seq<'s>;
193
194 /// Get a non-owning slice to the underlying sequence.
195 ///
196 /// Unfortunately, `Deref` into a `Seq` can not be supported.
197 fn as_slice(&self) -> Self::Seq<'_>;
198
199 /// Get a sub-slice of the sequence. Indices are character offsets.
200 #[inline(always)]
201 fn slice(&self, range: Range<usize>) -> Self::Seq<'_> {
202 self.as_slice().slice(range)
203 }
204
205 /// Extract a k-mer from this sequence.
206 #[inline(always)]
207 fn read_kmer(&self, k: usize, pos: usize) -> u64 {
208 self.as_slice().read_kmer(k, pos)
209 }
210
211 /// Extract a k-mer from this sequence.
212 #[inline(always)]
213 fn read_revcomp_kmer(&self, k: usize, pos: usize) -> u64 {
214 self.as_slice().read_revcomp_kmer(k, pos)
215 }
216
217 /// Extract a k-mer from this sequence.
218 #[inline(always)]
219 fn read_kmer_u128(&self, k: usize, pos: usize) -> u128 {
220 self.as_slice().read_kmer_u128(k, pos)
221 }
222
223 /// Extract a k-mer from this sequence.
224 #[inline(always)]
225 fn read_revcomp_kmer_u128(&self, k: usize, pos: usize) -> u128 {
226 self.as_slice().read_revcomp_kmer_u128(k, pos)
227 }
228
229 /// The length of the sequence in characters.
230 fn len(&self) -> usize;
231
232 /// Returns `true` if the sequence is empty.
233 fn is_empty(&self) -> bool;
234
235 /// Empty the sequence.
236 fn clear(&mut self);
237
238 /// Convert into the underlying raw representation.
239 fn into_raw(self) -> Vec<u8>;
240
241 /// Generate a random sequence with the given number of characters.
242 #[cfg(feature = "rand")]
243 fn random(n: usize) -> Self;
244
245 /// Create a `SeqVec` from ASCII input.
246 #[inline(always)]
247 fn from_ascii(seq: &[u8]) -> Self {
248 let mut packed_vec = Self::default();
249 packed_vec.push_ascii(seq);
250 packed_vec
251 }
252
253 /// Append the given sequence to the underlying storage.
254 ///
255 /// This may leave gaps (padding) between consecutively pushed sequences to avoid re-aligning the pushed data.
256 /// Returns the range of indices corresponding to the pushed sequence.
257 /// Use `self.slice(range)` to get the corresponding slice.
258 fn push_seq(&mut self, seq: Self::Seq<'_>) -> Range<usize>;
259
260 /// Append the given ASCII sequence to the underlying storage.
261 ///
262 /// This may leave gaps (padding) between consecutively pushed sequences to avoid re-aligning the pushed data.
263 /// Returns the range of indices corresponding to the pushed sequence.
264 /// Use `self.slice(range)` to get the corresponding slice.
265 fn push_ascii(&mut self, seq: &[u8]) -> Range<usize>;
266}