packed_seq/
lib.rs

1//! Types and traits to iterate over (packed) input data.
2//!
3//! The main type is [`PackedSeqVec`], that holds a sequence of 2-bit packed DNA bases. [`PackedSeq`] is a non-owned slice of packed data.
4//!
5//! To make libraries depending on this crate more generic, logic is encapsulated in the [`Seq`] and [`SeqVec`] traits.
6//! [`Seq`] is a non-owned slice of characters that can be iterated, while [`SeqVec`] is the corresponding owned type.
7//!
8//! These traits serve two purposes:
9//! 1. They encapsulate the packing/unpacking of characters between ASCII and the possibly different in-memory format.
10//! 2. They allow efficiently iterating over 8 _chunks_ of a sequence in parallel using SIMD instructions.
11//!
12//! ## Sequence types
13//!
14//! The traits are implemented for three types.
15//!
16//! #### Plain ASCII sequences
17//!
18//! With `&[u8]: Seq` and `Vec<u8>: SeqVec`, the ASCII characters (or arbitrary
19//! `u8` values, really) of any input slice can be iterated.
20//!
21//! #### ASCII-encoded DNA sequences
22//!
23//! The [`AsciiSeq: Seq`] and [`AsciiSeqVec: SeqVec`] types store a DNA sequence of `ACTGactg` characters.
24//! When iterated, these are returned as `0123` values, with the mapping `A=0`, `C=1`, `T=2`, `G=3`.
25//!
26//! Any other characters are silently mapped to `0123` using `(c>>1) & 3`, but this should not be relied upon.
27//!
28//! #### Packed DNA
29//!
30//! The [`PackedSeq: Seq`] and [`PackedSeqVec: SeqVec`] types store a packed DNA sequence, encoded as 4 bases per byte.
31//! Each `ACTG` base is stored as `0123` as above and four of these 2-bit values fill a byte.
32//!
33//! Use [`PackedSeqVec::from_ascii`] to construct a [`PackedSeqVec`].
34//! Currently this relies on the `pext` instruction for good performance on `x86`.
35//!
36//! ## Parallel iterators
37//!
38//! This library enables iterating 8 chunks of a sequence at the same time using SIMD instructions.
39//! The [`Seq::par_iter_bp`] functions return a `wide::u32x8` that contains the 2-bit or 8-bit values of the next character in each chunk in a `u32` for 8 SIMD lanes.
40//!
41//! This is used in the `simd-minimizers` crate, and explained in more detail in the corresponding [preprint](https://www.biorxiv.org/content/10.1101/2025.01.27.634998v1).
42//!
43//! #### Context
44//!
45//! The `context` parameter determines how much adjacent chunks overlap. When `context=1`, they are disjoint.
46//! When `context=k`, adjacent chunks overlap by `k-1` characters, so that each k-mer is present in exactly one chunk.
47//! Thus, this can be used to iterate all k-mers, where the first `k-1` characters in each chunk are used to initialize the first k-mer.
48//!
49//! #### Delayed iteration
50//!
51//! This crate also provides [`Seq::par_iter_bp_delayed`] and [`Seq::par_iter_bp_delayed_2`] functions. Like [`Seq::par_iter_bp`], these split the input into 8 chunks and stream over the chunks in parallel.
52//! But instead of just returning a single character, they also return a second (and third) character, that is `delay` positions _behind_ the new character (at index `idx - delay`).
53//! This way, k-mers can be enumerated by setting `delay=k` and then mapping e.g. `|(add, remove)| kmer = (kmer<<2) ^ add ^ (remove << (2*k))`.
54//!
55//! ## Example
56//!
57//! ```
58//! use packed_seq::{SeqVec, Seq, AsciiSeqVec, PackedSeqVec, pack_char};
59//! // Plain ASCII sequence.
60//! let seq = b"ACTGCAGCGCATATGTAGT";
61//! // ASCII DNA sequence.
62//! let ascii_seq = AsciiSeqVec::from_ascii(seq);
63//! // Packed DNA sequence.
64//! let packed_seq = PackedSeqVec::from_ascii(seq);
65//! assert_eq!(ascii_seq.len(), packed_seq.len());
66//! // Iterate the ASCII characters.
67//! let characters: Vec<u8> = seq.iter_bp().collect();
68//! assert_eq!(characters, seq);
69//!
70//! // Iterate the bases with 0..4 values.
71//! let bases: Vec<u8> = seq.iter().copied().map(pack_char).collect();
72//! assert_eq!(bases, vec![0,1,2,3,1,0,3,1,3,1,0,2,0,2,3,2,0,3,2]);
73//! let ascii_bases: Vec<u8> = ascii_seq.as_slice().iter_bp().collect();
74//! assert_eq!(ascii_bases, bases);
75//! let packed_bases: Vec<u8> = ascii_seq.as_slice().iter_bp().collect();
76//! assert_eq!(packed_bases, bases);
77//!
78//! // Iterate over 8 chunks at the same time.
79//! let seq = b"AAAACCTTGGTTACTG"; // plain ASCII sequence
80//! // chunks:  ^ ^ ^ ^ ^ ^ ^ ^
81//! let (par_iter, padding) = seq.as_slice().par_iter_bp(1);
82//! let mut par_iter_u8 = par_iter.map(|x| x.as_array_ref().map(|c| c as u8));
83//! assert_eq!(par_iter_u8.next(), Some(*b"AACTGTAT"));
84//! assert_eq!(par_iter_u8.next(), Some(*b"AACTGTCG"));
85//! assert_eq!(par_iter_u8.next(), None);
86//! ```
87//!
88//! ## Feature flags
89//! - `epserde` enables `derive(epserde::Epserde)` for `PackedSeqVec` and `AsciiSeqVec`, and adds its `SerializeInner` and `DeserializeInner` traits to `SeqVec`.
90//! - `pyo3` enables `derive(pyo3::pyclass)` for `PackedSeqVec` and `AsciiSeqVec`.
91
92/// Functions with architecture-specific implementations.
93mod intrinsics {
94    mod transpose;
95    pub use transpose::transpose;
96}
97
98mod traits;
99
100mod ascii;
101mod ascii_seq;
102mod packed_seq;
103
104#[cfg(test)]
105mod test;
106
107/// A SIMD vector containing 8 u32s.
108pub use wide::u32x8;
109/// The number of lanes in a `u32x8`.
110pub const L: usize = 8;
111
112pub use ascii_seq::{AsciiSeq, AsciiSeqVec};
113pub use packed_seq::{
114    complement_base, complement_base_simd, complement_char, pack_char, unpack_base,
115};
116pub use packed_seq::{PackedSeq, PackedSeqVec};
117pub use traits::{Seq, SeqVec};
118
119// For internal use only.
120use core::array::from_fn;
121use mem_dbg::{MemDbg, MemSize};
122use std::{hint::assert_unchecked, ops::Range};
123use wide::u32x8 as S;