simd_minimizers/
minimizers.rs

1//! Find the (canonical) minimizers of a sequence.
2use std::iter::zip;
3
4use crate::{
5    canonical,
6    sliding_min::{Cache, sliding_lr_min_mapper_scalar, sliding_min_mapper_scalar},
7};
8
9use super::{
10    canonical::canonical_mapper_simd,
11    sliding_min::{sliding_lr_min_mapper_simd, sliding_min_mapper_simd},
12};
13use itertools::{Itertools, izip};
14use packed_seq::{Advance, ChunkIt, Delay, PaddedIt, Seq};
15use seq_hash::KmerHasher;
16use wide::u32x8;
17
18pub const SKIPPED: u32 = u32::MAX - 1;
19pub(crate) const SIMD_SKIPPED: u32x8 = u32x8::new([SKIPPED; 8]);
20
21/// Minimizer position of a single window.
22pub fn one_minimizer<'s>(seq: impl Seq<'s>, hasher: &impl KmerHasher) -> usize {
23    hasher
24        .hash_kmers_scalar(seq)
25        .map(|x| x & 0xffff_0000)
26        .position_min()
27        .unwrap()
28}
29
30// FIMXE: Add one_canonical_minimizer
31
32/// Returns an iterator over the absolute positions of the minimizers of a sequence.
33/// Returns one value for each window of size `w+k-1` in the input. Use
34/// `Itertools::dedup()` to obtain the distinct positions of the minimizers.
35///
36/// Prefer `minimizer_simd_it` that internally used SIMD, or `minimizer_par_it` if it works for you.
37#[inline(always)]
38pub fn minimizers_seq_scalar<'s>(
39    seq: impl Seq<'s>,
40    hasher: &impl KmerHasher,
41    w: usize,
42    cache: &mut Cache,
43) -> impl ExactSizeIterator<Item = u32> {
44    let kmer_hashes = hasher.hash_kmers_scalar(seq);
45    let len = kmer_hashes.len();
46    kmer_hashes
47        .map(sliding_min_mapper_scalar::<true>(w, len, cache))
48        .advance(w - 1)
49}
50
51/// Split the windows of the sequence into 8 chunks of equal length ~len/8.
52/// Then return the positions of the minimizers of each of them in parallel using SIMD,
53/// and return the remaining few using the second iterator.
54#[inline(always)]
55pub fn minimizers_seq_simd<'s>(
56    seq: impl Seq<'s>,
57    hasher: &impl KmerHasher,
58    w: usize,
59    cache: &mut Cache,
60) -> PaddedIt<impl ChunkIt<u32x8>> {
61    let kmer_hashes = hasher.hash_kmers_simd(seq, w);
62    let len = kmer_hashes.it.len();
63    kmer_hashes
64        .map(sliding_min_mapper_simd::<true>(w, len, cache))
65        .advance(w - 1)
66}
67
68///////////////////////////////////////////////////////////////////////////////////////////////////
69// TRULY CANONICAL MINIMIZERS BELOW HERE
70// The minimizers above can take a canonical hash, but do not correctly break ties.
71// Below we fix that.
72
73#[inline(always)]
74pub fn canonical_minimizers_seq_scalar<'s>(
75    seq: impl Seq<'s>,
76    hasher: &impl KmerHasher,
77    w: usize,
78    cache: &mut Cache,
79) -> impl ExactSizeIterator<Item = u32> {
80    // TODO: Change to compile-time check on `impl KmerHasher<RC=true>` once supported.
81    assert!(hasher.is_canonical());
82
83    let k = hasher.k();
84    let delay1 = hasher.delay().0;
85    let mut hash_mapper = hasher.in_out_mapper_scalar(seq);
86    let mut sliding_min_mapper = sliding_lr_min_mapper_scalar(w, seq.len(), cache);
87    let (Delay(delay2), mut canonical_mapper) = canonical::canonical_mapper_scalar(k + w - 1);
88
89    assert!(delay1 < k);
90    assert!(k - 1 <= delay2);
91    assert!(delay2 == k + w - 2);
92
93    let mut a = seq.iter_bp();
94    let mut rh = seq.iter_bp();
95    let rc = seq.iter_bp();
96
97    a.by_ref().take(delay1).for_each(|a| {
98        hash_mapper((a, 0));
99        canonical_mapper((a, 0));
100    });
101
102    zip(a.by_ref(), rh.by_ref())
103        .take((k - 1) - delay1)
104        .for_each(|(a, rh)| {
105            hash_mapper((a, rh));
106            canonical_mapper((a, 0));
107        });
108
109    zip(a.by_ref(), rh.by_ref())
110        .take(delay2 - (k - 1))
111        .for_each(|(a, rh)| {
112            let hash = hash_mapper((a, rh));
113            canonical_mapper((a, 0));
114            sliding_min_mapper(hash);
115        });
116
117    izip!(a, rh, rc).map(
118        #[inline(always)]
119        #[allow(clippy::let_and_return)]
120        move |(a, rh, rc)| {
121            let hash = hash_mapper((a, rh));
122            let canonical = canonical_mapper((a, rc));
123            let (left, right) = sliding_min_mapper(hash);
124            // Assigning to x ensures we get a cmov here.
125            let x = if canonical { left } else { right };
126            x
127        },
128    )
129}
130
131/// Use canonical NtHash, and keep both leftmost and rightmost minima.
132#[inline(always)]
133pub fn canonical_minimizers_seq_simd<'s>(
134    seq: impl Seq<'s>,
135    hasher: &impl KmerHasher,
136    w: usize,
137    cache: &mut Cache,
138) -> PaddedIt<impl ChunkIt<u32x8>> {
139    assert!(hasher.is_canonical());
140
141    let k = hasher.k();
142    let l = k + w - 1;
143    let mut hash_mapper = hasher.in_out_mapper_simd(seq);
144    let (c_delay, mut canonical_mapper) = canonical_mapper_simd(l);
145
146    let mut padded_it = seq.par_iter_bp_delayed_2(l, hasher.delay(), c_delay);
147
148    // Process first k-1 characters separately, to initialize hash values.
149    padded_it.advance_with(k - 1, |(a, rh, rc)| {
150        hash_mapper((a, rh));
151        canonical_mapper((a, rc));
152    });
153    let mut sliding_min_mapper = sliding_lr_min_mapper_simd(w, padded_it.it.len(), cache);
154    padded_it.advance_with(w - 1, |(a, rh, rc)| {
155        let hash = hash_mapper((a, rh));
156        canonical_mapper((a, rc));
157        sliding_min_mapper(hash);
158    });
159
160    padded_it.map(move |(a, rh, rc)| {
161        let hash = hash_mapper((a, rh));
162        let canonical = canonical_mapper((a, rc));
163        let (lmin, rmin) = sliding_min_mapper(hash);
164        canonical.blend(lmin, rmin)
165    })
166}
167
168#[inline(always)]
169pub fn canonical_minimizers_skip_ambiguous_windows<'s>(
170    nseq: packed_seq::PackedNSeq<'s>,
171    hasher: &impl KmerHasher,
172    w: usize,
173    cache: &'s mut (Cache, Vec<u32x8>, Vec<u32x8>),
174) -> PaddedIt<impl ChunkIt<u32x8>> {
175    assert!(hasher.is_canonical());
176
177    let k = hasher.k();
178    let l = k + w - 1;
179    let mut hash_mapper = hasher.in_out_mapper_simd(nseq.seq);
180    let (c_delay, mut canonical_mapper) = canonical_mapper_simd(l);
181
182    let mut padded_it = nseq.seq.par_iter_bp_delayed_2_with_factor_and_buf(
183        l,
184        hasher.delay(),
185        c_delay,
186        2,
187        &mut cache.1,
188    );
189
190    // Process first k-1 characters separately, to initialize hash values.
191    padded_it.advance_with(k - 1, |(a, rh, rc)| {
192        hash_mapper((a, rh));
193        canonical_mapper((a, rc));
194    });
195    let mut sliding_min_mapper = sliding_lr_min_mapper_simd(w, padded_it.it.len(), &mut cache.0);
196    padded_it.advance_with(w - 1, |(a, rh, rc)| {
197        let hash = hash_mapper((a, rh));
198        canonical_mapper((a, rc));
199        sliding_min_mapper(hash);
200    });
201
202    // jump over the l-1 first ambiguity results
203    padded_it
204        .zip(
205            nseq.ambiguous
206                .par_iter_kmer_ambiguity_with_buf(l, l, l - 1, &mut cache.2),
207        )
208        .map(move |((a, rh, rc), ambi)| {
209            let hash = hash_mapper((a, rh));
210            let canonical = canonical_mapper((a, rc));
211            let (lmin, rmin) = sliding_min_mapper(hash);
212            ambi.blend(SIMD_SKIPPED, canonical.blend(lmin, rmin))
213        })
214}