packed_seq/lib.rs
1//! Types and traits to iterate over (packed) input data.
2//!
3//! The main type is [`PackedSeqVec`], that holds a sequence of 2-bit packed DNA bases. [`PackedSeq`] is a non-owned slice of packed data.
4//!
5//! To make libraries depending on this crate more generic, logic is encapsulated in the [`Seq`] and [`SeqVec`] traits.
6//! [`Seq`] is a non-owned slice of characters that can be iterated, while [`SeqVec`] is the corresponding owned type.
7//!
8//! These traits serve two purposes:
9//! 1. They encapsulate the packing/unpacking of characters between ASCII and the possibly different in-memory format.
10//! 2. They allow efficiently iterating over 8 _chunks_ of a sequence in parallel using SIMD instructions.
11//!
12//! ## Sequence types
13//!
14//! The traits are implemented for three types.
15//!
16//! #### Plain ASCII sequences
17//!
18//! With `&[u8]: Seq` and `Vec<u8>: SeqVec`, the ASCII characters (or arbitrary
19//! `u8` values, really) of any input slice can be iterated.
20//!
21//! #### ASCII-encoded DNA sequences
22//!
23//! The [`AsciiSeq: Seq`] and [`AsciiSeqVec: SeqVec`] types store a DNA sequence of `ACTGactg` characters.
24//! When iterated, these are returned as `0123` values, with the mapping `A=0`, `C=1`, `T=2`, `G=3`.
25//!
26//! Any other characters are silently mapped to `0123` using `(c>>1) & 3`, but this should not be relied upon.
27//!
28//! #### Packed DNA
29//!
30//! The [`PackedSeq: Seq`] and [`PackedSeqVec: SeqVec`] types store a packed DNA sequence, encoded as 4 bases per byte.
31//! Each `ACTG` base is stored as `0123` as above and four of these 2-bit values fill a byte.
32//!
33//! Use [`PackedSeqVec::from_ascii`] to construct a [`PackedSeqVec`].
34//! Currently this relies on the `pext` instruction for good performance on `x86`.
35//!
36//! ## Parallel iterators
37//!
38//! This library enables iterating 8 chunks of a sequence at the same time using SIMD instructions.
39//! The [`Seq::par_iter_bp`] functions return a `wide::u32x8` that contains the 2-bit or 8-bit values of the next character in each chunk in a `u32` for 8 SIMD lanes.
40//!
41//! This is used in the `simd-minimizers` crate, and explained in more detail in the corresponding [preprint](https://www.biorxiv.org/content/10.1101/2025.01.27.634998v1).
42//!
43//! #### Context
44//!
45//! The `context` parameter determines how much adjacent chunks overlap. When `context=1`, they are disjoint.
46//! When `context=k`, adjacent chunks overlap by `k-1` characters, so that each k-mer is present in exactly one chunk.
47//! Thus, this can be used to iterate all k-mers, where the first `k-1` characters in each chunk are used to initialize the first k-mer.
48//!
49//! #### Delayed iteration
50//!
51//! This crate also provides [`Seq::par_iter_bp_delayed`] and [`Seq::par_iter_bp_delayed_2`] functions. Like [`Seq::par_iter_bp`], these split the input into 8 chunks and stream over the chunks in parallel.
52//! But instead of just returning a single character, they also return a second (and third) character, that is `delay` positions _behind_ the new character (at index `idx - delay`).
53//! This way, k-mers can be enumerated by setting `delay=k` and then mapping e.g. `|(add, remove)| kmer = (kmer<<2) ^ add ^ (remove << (2*k))`.
54//!
55//! #### Collect
56//!
57//! Use [`PaddedIt::collect`] and [`PaddedIt::collect_into`] to collect the values returned by a parallel iterator over `u32x8` into a flat `Vec<u32>`.
58//!
59//! ## Example
60//!
61//! ```
62//! use packed_seq::{SeqVec, Seq, AsciiSeqVec, PackedSeqVec, pack_char};
63//! // Plain ASCII sequence.
64//! let seq = b"ACTGCAGCGCATATGTAGT";
65//! // ASCII DNA sequence.
66//! let ascii_seq = AsciiSeqVec::from_ascii(seq);
67//! // Packed DNA sequence.
68//! let packed_seq = PackedSeqVec::from_ascii(seq);
69//! assert_eq!(ascii_seq.len(), packed_seq.len());
70//! // Iterate the ASCII characters.
71//! let characters: Vec<u8> = seq.iter_bp().collect();
72//! assert_eq!(characters, seq);
73//!
74//! // Iterate the bases with 0..4 values.
75//! let bases: Vec<u8> = seq.iter().copied().map(pack_char).collect();
76//! assert_eq!(bases, vec![0,1,2,3,1,0,3,1,3,1,0,2,0,2,3,2,0,3,2]);
77//! let ascii_bases: Vec<u8> = ascii_seq.as_slice().iter_bp().collect();
78//! assert_eq!(ascii_bases, bases);
79//! let packed_bases: Vec<u8> = ascii_seq.as_slice().iter_bp().collect();
80//! assert_eq!(packed_bases, bases);
81//!
82//! // Iterate over 8 chunks at the same time.
83//! let seq = b"AAAACCTTGGTTACTG"; // plain ASCII sequence
84//! // chunks: ^ ^ ^ ^ ^ ^ ^ ^
85//! // the `1` argument indicates a 'context' length of 1,
86//! // since we're just iterating single characters.
87//! let par_iter = seq.as_slice().par_iter_bp(1);
88//! let mut par_iter_u8 = par_iter.it.map(|x| x.as_array_ref().map(|c| c as u8));
89//! assert_eq!(par_iter_u8.next(), Some(*b"AACTGTAT"));
90//! assert_eq!(par_iter_u8.next(), Some(*b"AACTGTCG"));
91//! assert_eq!(par_iter_u8.next(), None);
92//!
93//! let bases: Vec<u32> = seq.as_slice().par_iter_bp(1).collect();
94//! let bases: Vec<u8> = bases.into_iter().map(|x| x as u8).collect();
95//! assert_eq!(bases, seq);
96//!
97//! // With context=3, the chunks overlap by 2 characters,
98//! // which can be skipped using `advance`.
99//! let bases: Vec<u32> = seq.as_slice().par_iter_bp(3).advance(2).collect();
100//! let bases: Vec<u8> = bases.into_iter().map(|x| x as u8).collect();
101//! assert_eq!(bases, &seq[2..]);
102//! ```
103//!
104//! ## Feature flags
105//! - `epserde` enables `derive(epserde::Epserde)` for `PackedSeqVec` and `AsciiSeqVec`, and adds its `SerializeInner` and `DeserializeInner` traits to `SeqVec`.
106//! - `pyo3` enables `derive(pyo3::pyclass)` for `PackedSeqVec` and `AsciiSeqVec`.
107
108/// Functions with architecture-specific implementations.
109pub mod intrinsics {
110 mod transpose;
111 pub use transpose::transpose;
112}
113
114mod traits;
115
116mod ascii;
117mod ascii_seq;
118mod packed_n_seq;
119mod packed_seq;
120mod padded_it;
121
122#[cfg(test)]
123mod test;
124
125/// A SIMD vector containing 8 u32s.
126pub use wide::u32x8;
127/// The number of lanes in a `u32x8`.
128pub const L: usize = 8;
129
130pub use ascii_seq::{AsciiSeq, AsciiSeqVec};
131pub use packed_n_seq::{PackedNSeq, PackedNSeqVec};
132pub use packed_seq::{BitSeq, BitSeqVec, PackedSeq, PackedSeqVec};
133pub use packed_seq::{
134 complement_base, complement_base_simd, complement_char, pack_char, pack_kmer_lossy,
135 pack_kmer_u128_lossy, unpack_base, unpack_kmer, unpack_kmer_into_vec, unpack_kmer_to_vec,
136 unpack_kmer_u128, unpack_kmer_u128_into_vec, unpack_kmer_u128_to_vec,
137};
138pub use padded_it::{Advance, ChunkIt, PaddedIt};
139pub use traits::{Delay, Seq, SeqVec};
140
141// For internal use only.
142use core::array::from_fn;
143use mem_dbg::{MemDbg, MemSize};
144use std::{hint::assert_unchecked, ops::Range};
145use wide::u32x8 as S;