packed_seq/traits.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
use super::u32x8;
use mem_dbg::{MemDbg, MemSize};
use std::ops::Range;
/// A non-owned slice of characters.
///
/// The represented character values are expected to be in `[0, 2^b)`,
/// but they can be encoded in various ways. E.g.:
/// - A `&[u8]` of ASCII characters, returning 8-bit values.
/// - An `AsciiSeq` of DNA characters `ACGT`, interpreted 2-bit values.
/// - A `PackedSeq` of packed DNA characters (4 per byte), returning 2-bit values.
///
/// Each character is assumed to fit in 8 bits. Some functions take or return
/// this 'unpacked' (ASCII) character.
pub trait Seq<'s>: Copy + Eq + Ord {
/// Number of encoded characters per byte of memory of the `Seq`.
const BASES_PER_BYTE: usize;
/// Number of bits `b` to represent each character returned by `iter_bp` and variants..
const BITS_PER_CHAR: usize;
/// The corresponding owned sequence type.
type SeqVec: SeqVec;
/// Convenience function that returns `b=Self::BITS_PER_CHAR`.
fn bits_per_char(&self) -> usize {
Self::BITS_PER_CHAR
}
/// The length of the sequence in characters.
fn len(&self) -> usize;
/// Get the character at the given index.
fn get(&self, _index: usize) -> u8;
/// Get the ASCII character at the given index, _without_ mapping to `b`-bit values.
fn get_ascii(&self, _index: usize) -> u8;
/// Convert a short sequence (kmer) to a packed representation as `usize`.
fn to_word(&self) -> usize;
/// Convert to an owned version.
fn to_vec(&self) -> Self::SeqVec;
/// Get a sub-slice of the sequence.
/// `range` indicates character indices.
fn slice(&self, range: Range<usize>) -> Self;
/// Iterate over the `b`-bit characters of the sequence.
fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> + Clone;
/// Iterate over 8 chunks of `b`-bit characters of the sequence in parallel.
///
/// This splits the input into 8 chunks and streams over them in parallel.
/// Returns a separate `tail` iterator over the remaining characters.
/// The context can be e.g. the k-mer size being iterated.
/// When `context>1`, consecutive chunks overlap by `context-1` bases.
///
/// Expected to be implemented using SIMD instructions.
fn par_iter_bp(self, context: usize) -> (impl ExactSizeIterator<Item = u32x8> + Clone, Self);
/// Iterate over 8 chunks of the sequence in parallel, returning two characters offset by `delay` positions.
///
/// Returned pairs are `(add, remove)`, and the first `delay` 'remove' characters are always `0`.
///
/// For example, when the sequence starts as `ABCDEF...`, and `delay=2`,
/// the first returned tuples in the first lane are:
/// `(b'A', 0)`, `(b'B', 0)`, `(b'C', b'A')`, `(b'D', b'B')`.
///
/// When `context>1`, consecutive chunks overlap by `context-1` bases:
/// the first `context-1` 'added' characters of the second chunk overlap
/// with the last `context-1` 'added' characters of the first chunk.
fn par_iter_bp_delayed(
self,
context: usize,
delay: usize,
) -> (impl ExactSizeIterator<Item = (u32x8, u32x8)> + Clone, Self);
/// Iterate over 8 chunks of the sequence in parallel, returning three characters:
/// the char added, the one `delay` positions before, and the one `delay2` positions before.
///
/// Requires `delay1 <= delay2`.
///
/// Returned pairs are `(add, d1, d2)`. The first `delay1` `d1` characters and first `delay2` `d2` are always `0`.
///
/// For example, when the sequence starts as `ABCDEF...`, and `delay1=2` and `delay2=3`,
/// the first returned tuples in the first lane are:
/// `(b'A', 0, 0)`, `(b'B', 0, 0)`, `(b'C', b'A', 0)`, `(b'D', b'B', b'A')`.
///
/// When `context>1`, consecutive chunks overlap by `context-1` bases:
/// the first `context-1` 'added' characters of the second chunk overlap
/// with the last `context-1` 'added' characters of the first chunk.
fn par_iter_bp_delayed_2(
self,
context: usize,
delay1: usize,
delay2: usize,
) -> (
impl ExactSizeIterator<Item = (u32x8, u32x8, u32x8)> + Clone,
Self,
);
/// Compare and return the LCP of the two sequences.
fn cmp_lcp(&self, other: &Self) -> (std::cmp::Ordering, usize);
}
// Some hacky stuff to make conditional supertraits.
cfg_if::cfg_if! {
if #[cfg(feature = "epserde")] {
pub use serde::{DeserializeInner, SerializeInner};
} else {
pub trait SerializeInner {}
pub trait DeserializeInner {}
impl SerializeInner for Vec<u8> {}
impl DeserializeInner for Vec<u8> {}
impl SerializeInner for crate::AsciiSeqVec {}
impl DeserializeInner for crate::AsciiSeqVec {}
impl SerializeInner for crate::PackedSeqVec {}
impl DeserializeInner for crate::PackedSeqVec {}
}
}
/// An owned sequence.
/// Can be constructed from either ASCII input or the underlying non-owning `Seq` type.
///
/// Implemented for:
/// - A `Vec<u8>` of ASCII characters, returning 8-bit values.
/// - An `AsciiSeqVec` of DNA characters `ACGT`, interpreted as 2-bit values.
/// - A `PackedSeqVec` of packed DNA characters (4 per byte), returning 2-bit values.
pub trait SeqVec:
Default + Sync + SerializeInner + DeserializeInner + MemSize + MemDbg + Clone + 'static
{
type Seq<'s>: Seq<'s>;
/// Get a non-owning slice to the underlying sequence.
///
/// Unfortunately, `Deref` into a `Seq` can not be supported.
fn as_slice(&self) -> Self::Seq<'_>;
/// Get a sub-slice of the sequence. Indices are character offsets.
fn slice(&self, range: Range<usize>) -> Self::Seq<'_> {
self.as_slice().slice(range)
}
/// The length of the sequence in characters.
fn len(&self) -> usize {
self.as_slice().len()
}
/// Convert into the underlying raw representation.
fn into_raw(self) -> Vec<u8>;
/// Generate a random sequence with the given number of characters.
fn random(n: usize) -> Self;
/// Create a `SeqVec` from ASCII input.
fn from_ascii(seq: &[u8]) -> Self {
let mut packed_vec = Self::default();
packed_vec.push_ascii(seq);
packed_vec
}
/// Append the given sequence to the underlying storage.
///
/// This may leave gaps (padding) between consecutively pushed sequences to avoid re-aligning the pushed data.
/// Returns the range of indices corresponding to the pushed sequence.
/// Use `self.slice(range)` to get the corresponding slice.
fn push_seq(&mut self, seq: Self::Seq<'_>) -> Range<usize>;
/// Append the given ASCII sequence to the underlying storage.
///
/// This may leave gaps (padding) between consecutively pushed sequences to avoid re-aligning the pushed data.
/// Returns the range of indices corresponding to the pushed sequence.
/// Use `self.slice(range)` to get the corresponding slice.
fn push_ascii(&mut self, seq: &[u8]) -> Range<usize>;
}