packed_seq/
traits.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
use super::u32x8;
use mem_dbg::{MemDbg, MemSize};
use std::ops::Range;

/// A non-owned slice of characters.
///
/// The represented character values are expected to be in `[0, 2^b)`,
/// but they can be encoded in various ways. E.g.:
/// - A `&[u8]` of ASCII characters, returning 8-bit values.
/// - An `AsciiSeq` of DNA characters `ACGT`, interpreted 2-bit values.
/// - A `PackedSeq` of packed DNA characters (4 per byte), returning 2-bit values.
///
/// Each character is assumed to fit in 8 bits. Some functions take or return
/// this 'unpacked' (ASCII) character.
pub trait Seq<'s>: Copy + Eq + Ord {
    /// Number of encoded characters per byte of memory of the `Seq`.
    const BASES_PER_BYTE: usize;
    /// Number of bits `b` to represent each character returned by `iter_bp` and variants..
    const BITS_PER_CHAR: usize;

    /// The corresponding owned sequence type.
    type SeqVec: SeqVec;

    /// Convenience function that returns `b=Self::BITS_PER_CHAR`.
    fn bits_per_char(&self) -> usize {
        Self::BITS_PER_CHAR
    }

    /// The length of the sequence in characters.
    fn len(&self) -> usize;

    /// Get the character at the given index.
    fn get(&self, _index: usize) -> u8;

    /// Get the ASCII character at the given index, _without_ mapping to `b`-bit values.
    fn get_ascii(&self, _index: usize) -> u8;

    /// Convert a short sequence (kmer) to a packed representation as `usize`.
    fn to_word(&self) -> usize;

    /// Convert to an owned version.
    fn to_vec(&self) -> Self::SeqVec;

    /// Get a sub-slice of the sequence.
    /// `range` indicates character indices.
    fn slice(&self, range: Range<usize>) -> Self;

    /// Iterate over the `b`-bit characters of the sequence.
    fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> + Clone;

    /// Iterate over 8 chunks of `b`-bit characters of the sequence in parallel.
    ///
    /// This splits the input into 8 chunks and streams over them in parallel.
    /// Returns a separate `tail` iterator over the remaining characters.
    /// The context can be e.g. the k-mer size being iterated.
    /// When `context>1`, consecutive chunks overlap by `context-1` bases.
    ///
    /// Expected to be implemented using SIMD instructions.
    fn par_iter_bp(self, context: usize) -> (impl ExactSizeIterator<Item = u32x8> + Clone, Self);

    /// Iterate over 8 chunks of the sequence in parallel, returning two characters offset by `delay` positions.
    ///
    /// Returned pairs are `(add, remove)`, and the first `delay` 'remove' characters are always `0`.
    ///
    /// For example, when the sequence starts as `ABCDEF...`, and `delay=2`,
    /// the first returned tuples in the first lane are:
    /// `(b'A', 0)`, `(b'B', 0)`, `(b'C', b'A')`, `(b'D', b'B')`.
    ///
    /// When `context>1`, consecutive chunks overlap by `context-1` bases:
    /// the first `context-1` 'added' characters of the second chunk overlap
    /// with the last `context-1` 'added' characters of the first chunk.
    fn par_iter_bp_delayed(
        self,
        context: usize,
        delay: usize,
    ) -> (impl ExactSizeIterator<Item = (u32x8, u32x8)> + Clone, Self);

    /// Iterate over 8 chunks of the sequence in parallel, returning three characters:
    /// the char added, the one `delay` positions before, and the one `delay2` positions before.
    ///
    /// Requires `delay1 <= delay2`.
    ///
    /// Returned pairs are `(add, d1, d2)`. The first `delay1` `d1` characters and first `delay2` `d2` are always `0`.
    ///
    /// For example, when the sequence starts as `ABCDEF...`, and `delay1=2` and `delay2=3`,
    /// the first returned tuples in the first lane are:
    /// `(b'A', 0, 0)`, `(b'B', 0, 0)`, `(b'C', b'A', 0)`, `(b'D', b'B', b'A')`.
    ///
    /// When `context>1`, consecutive chunks overlap by `context-1` bases:
    /// the first `context-1` 'added' characters of the second chunk overlap
    /// with the last `context-1` 'added' characters of the first chunk.
    fn par_iter_bp_delayed_2(
        self,
        context: usize,
        delay1: usize,
        delay2: usize,
    ) -> (
        impl ExactSizeIterator<Item = (u32x8, u32x8, u32x8)> + Clone,
        Self,
    );

    /// Compare and return the LCP of the two sequences.
    fn cmp_lcp(&self, other: &Self) -> (std::cmp::Ordering, usize);
}

// Some hacky stuff to make conditional supertraits.
cfg_if::cfg_if! {
    if #[cfg(feature = "epserde")] {
        pub use serde::{DeserializeInner, SerializeInner};
    } else {
        pub trait SerializeInner {}
        pub trait DeserializeInner {}

        impl SerializeInner for Vec<u8> {}
        impl DeserializeInner for Vec<u8> {}
        impl SerializeInner for crate::AsciiSeqVec {}
        impl DeserializeInner for crate::AsciiSeqVec {}
        impl SerializeInner for crate::PackedSeqVec {}
        impl DeserializeInner for crate::PackedSeqVec {}
    }
}

/// An owned sequence.
/// Can be constructed from either ASCII input or the underlying non-owning `Seq` type.
///
/// Implemented for:
/// - A `Vec<u8>` of ASCII characters, returning 8-bit values.
/// - An `AsciiSeqVec` of DNA characters `ACGT`, interpreted as 2-bit values.
/// - A `PackedSeqVec` of packed DNA characters (4 per byte), returning 2-bit values.
pub trait SeqVec:
    Default + Sync + SerializeInner + DeserializeInner + MemSize + MemDbg + Clone + 'static
{
    type Seq<'s>: Seq<'s>;

    /// Get a non-owning slice to the underlying sequence.
    ///
    /// Unfortunately, `Deref` into a `Seq` can not be supported.
    fn as_slice(&self) -> Self::Seq<'_>;

    /// Get a sub-slice of the sequence. Indices are character offsets.
    fn slice(&self, range: Range<usize>) -> Self::Seq<'_> {
        self.as_slice().slice(range)
    }

    /// The length of the sequence in characters.
    fn len(&self) -> usize {
        self.as_slice().len()
    }

    /// Convert into the underlying raw representation.
    fn into_raw(self) -> Vec<u8>;

    /// Generate a random sequence with the given number of characters.
    fn random(n: usize) -> Self;

    /// Create a `SeqVec` from ASCII input.
    fn from_ascii(seq: &[u8]) -> Self {
        let mut packed_vec = Self::default();
        packed_vec.push_ascii(seq);
        packed_vec
    }

    /// Append the given sequence to the underlying storage.
    ///
    /// This may leave gaps (padding) between consecutively pushed sequences to avoid re-aligning the pushed data.
    /// Returns the range of indices corresponding to the pushed sequence.
    /// Use `self.slice(range)` to get the corresponding slice.
    fn push_seq(&mut self, seq: Self::Seq<'_>) -> Range<usize>;

    /// Append the given ASCII sequence to the underlying storage.
    ///
    /// This may leave gaps (padding) between consecutively pushed sequences to avoid re-aligning the pushed data.
    /// Returns the range of indices corresponding to the pushed sequence.
    /// Use `self.slice(range)` to get the corresponding slice.
    fn push_ascii(&mut self, seq: &[u8]) -> Range<usize>;
}