packed_seq/
traits.rs

1use super::u32x8;
2use mem_dbg::{MemDbg, MemSize};
3use std::ops::Range;
4
5/// A non-owned slice of characters.
6///
7/// The represented character values are expected to be in `[0, 2^b)`,
8/// but they can be encoded in various ways. E.g.:
9/// - A `&[u8]` of ASCII characters, returning 8-bit values.
10/// - An `AsciiSeq` of DNA characters `ACGT`, interpreted 2-bit values.
11/// - A `PackedSeq` of packed DNA characters (4 per byte), returning 2-bit values.
12///
13/// Each character is assumed to fit in 8 bits. Some functions take or return
14/// this 'unpacked' (ASCII) character.
15pub trait Seq<'s>: Copy + Eq + Ord {
16    /// Number of encoded characters per byte of memory of the `Seq`.
17    const BASES_PER_BYTE: usize;
18    /// Number of bits `b` to represent each character returned by `iter_bp` and variants..
19    const BITS_PER_CHAR: usize;
20
21    /// The corresponding owned sequence type.
22    type SeqVec: SeqVec;
23
24    /// Convenience function that returns `b=Self::BITS_PER_CHAR`.
25    fn bits_per_char(&self) -> usize {
26        Self::BITS_PER_CHAR
27    }
28
29    /// The length of the sequence in characters.
30    fn len(&self) -> usize;
31
32    /// Get the character at the given index.
33    fn get(&self, _index: usize) -> u8;
34
35    /// Get the ASCII character at the given index, _without_ mapping to `b`-bit values.
36    fn get_ascii(&self, _index: usize) -> u8;
37
38    /// Convert a short sequence (kmer) to a packed representation as `usize`.
39    fn to_word(&self) -> usize;
40
41    /// Convert to an owned version.
42    fn to_vec(&self) -> Self::SeqVec;
43
44    /// Get a sub-slice of the sequence.
45    /// `range` indicates character indices.
46    fn slice(&self, range: Range<usize>) -> Self;
47
48    /// Iterate over the `b`-bit characters of the sequence.
49    fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> + Clone;
50
51    /// Iterate over 8 chunks of `b`-bit characters of the sequence in parallel.
52    ///
53    /// This splits the input into 8 chunks and streams over them in parallel.
54    /// The second output returns the number of 'padding' characters that was added to get a full number of SIMD lanes.
55    /// Thus, the last `padding` number of returned elements (from the last lane(s)) should be ignored.
56    /// The context can be e.g. the k-mer size being iterated.
57    /// When `context>1`, consecutive chunks overlap by `context-1` bases.
58    ///
59    /// Expected to be implemented using SIMD instructions.
60    fn par_iter_bp(self, context: usize) -> (impl ExactSizeIterator<Item = u32x8> + Clone, usize);
61
62    /// Iterate over 8 chunks of the sequence in parallel, returning two characters offset by `delay` positions.
63    ///
64    /// Returned pairs are `(add, remove)`, and the first `delay` 'remove' characters are always `0`.
65    ///
66    /// For example, when the sequence starts as `ABCDEF...`, and `delay=2`,
67    /// the first returned tuples in the first lane are:
68    /// `(b'A', 0)`, `(b'B', 0)`, `(b'C', b'A')`, `(b'D', b'B')`.
69    ///
70    /// When `context>1`, consecutive chunks overlap by `context-1` bases:
71    /// the first `context-1` 'added' characters of the second chunk overlap
72    /// with the last `context-1` 'added' characters of the first chunk.
73    fn par_iter_bp_delayed(
74        self,
75        context: usize,
76        delay: usize,
77    ) -> (impl ExactSizeIterator<Item = (u32x8, u32x8)> + Clone, usize);
78
79    /// Iterate over 8 chunks of the sequence in parallel, returning three characters:
80    /// the char added, the one `delay` positions before, and the one `delay2` positions before.
81    ///
82    /// Requires `delay1 <= delay2`.
83    ///
84    /// Returned pairs are `(add, d1, d2)`. The first `delay1` `d1` characters and first `delay2` `d2` are always `0`.
85    ///
86    /// For example, when the sequence starts as `ABCDEF...`, and `delay1=2` and `delay2=3`,
87    /// the first returned tuples in the first lane are:
88    /// `(b'A', 0, 0)`, `(b'B', 0, 0)`, `(b'C', b'A', 0)`, `(b'D', b'B', b'A')`.
89    ///
90    /// When `context>1`, consecutive chunks overlap by `context-1` bases:
91    /// the first `context-1` 'added' characters of the second chunk overlap
92    /// with the last `context-1` 'added' characters of the first chunk.
93    fn par_iter_bp_delayed_2(
94        self,
95        context: usize,
96        delay1: usize,
97        delay2: usize,
98    ) -> (
99        impl ExactSizeIterator<Item = (u32x8, u32x8, u32x8)> + Clone,
100        usize,
101    );
102
103    /// Compare and return the LCP of the two sequences.
104    fn cmp_lcp(&self, other: &Self) -> (std::cmp::Ordering, usize);
105}
106
107// Some hacky stuff to make conditional supertraits.
108cfg_if::cfg_if! {
109    if #[cfg(feature = "epserde")] {
110        pub use serde::{DeserializeInner, SerializeInner};
111    } else {
112        pub trait SerializeInner {}
113        pub trait DeserializeInner {}
114
115        impl SerializeInner for Vec<u8> {}
116        impl DeserializeInner for Vec<u8> {}
117        impl SerializeInner for crate::AsciiSeqVec {}
118        impl DeserializeInner for crate::AsciiSeqVec {}
119        impl SerializeInner for crate::PackedSeqVec {}
120        impl DeserializeInner for crate::PackedSeqVec {}
121    }
122}
123
124/// An owned sequence.
125/// Can be constructed from either ASCII input or the underlying non-owning `Seq` type.
126///
127/// Implemented for:
128/// - A `Vec<u8>` of ASCII characters, returning 8-bit values.
129/// - An `AsciiSeqVec` of DNA characters `ACGT`, interpreted as 2-bit values.
130/// - A `PackedSeqVec` of packed DNA characters (4 per byte), returning 2-bit values.
131pub trait SeqVec:
132    Default + Sync + SerializeInner + DeserializeInner + MemSize + MemDbg + Clone + 'static
133{
134    type Seq<'s>: Seq<'s>;
135
136    /// Get a non-owning slice to the underlying sequence.
137    ///
138    /// Unfortunately, `Deref` into a `Seq` can not be supported.
139    fn as_slice(&self) -> Self::Seq<'_>;
140
141    /// Get a sub-slice of the sequence. Indices are character offsets.
142    #[inline(always)]
143    fn slice(&self, range: Range<usize>) -> Self::Seq<'_> {
144        self.as_slice().slice(range)
145    }
146
147    /// The length of the sequence in characters.
148    fn len(&self) -> usize {
149        self.as_slice().len()
150    }
151
152    /// Convert into the underlying raw representation.
153    fn into_raw(self) -> Vec<u8>;
154
155    /// Generate a random sequence with the given number of characters.
156    fn random(n: usize) -> Self;
157
158    /// Create a `SeqVec` from ASCII input.
159    fn from_ascii(seq: &[u8]) -> Self {
160        let mut packed_vec = Self::default();
161        packed_vec.push_ascii(seq);
162        packed_vec
163    }
164
165    /// Append the given sequence to the underlying storage.
166    ///
167    /// This may leave gaps (padding) between consecutively pushed sequences to avoid re-aligning the pushed data.
168    /// Returns the range of indices corresponding to the pushed sequence.
169    /// Use `self.slice(range)` to get the corresponding slice.
170    fn push_seq(&mut self, seq: Self::Seq<'_>) -> Range<usize>;
171
172    /// Append the given ASCII sequence to the underlying storage.
173    ///
174    /// This may leave gaps (padding) between consecutively pushed sequences to avoid re-aligning the pushed data.
175    /// Returns the range of indices corresponding to the pushed sequence.
176    /// Use `self.slice(range)` to get the corresponding slice.
177    fn push_ascii(&mut self, seq: &[u8]) -> Range<usize>;
178}