packed_seq/
packed_seq.rs

use core::cell::RefCell;
use std::ops::{Deref, DerefMut};
use traits::Seq;
use wide::u16x8;

use crate::{intrinsics::transpose, padded_it::ChunkIt};

use super::*;

type SimdBuf = [S; 8];

struct RecycledBox(Option<Box<SimdBuf>>);

thread_local! {
    static RECYCLED_BOX_CACHE: RefCell<Vec<Box<SimdBuf>>> = {
        RefCell::new(vec![Box::new(SimdBuf::default())])
    };
}

impl RecycledBox {
    #[inline(always)]
    fn take() -> Self {
        let mut buf = RECYCLED_BOX_CACHE.with_borrow_mut(|v| RecycledBox(v.pop()));
        buf.init_if_needed();
        buf
    }

    #[inline(always)]
    fn init_if_needed(&mut self) {
        if self.0.is_none() {
            self.0 = Some(Box::new(SimdBuf::default()));
        }
    }

    #[inline(always)]
    fn get(&self) -> &SimdBuf {
        unsafe { self.0.as_ref().unwrap_unchecked() }
    }

    #[inline(always)]
    fn get_mut(&mut self) -> &mut SimdBuf {
        unsafe { self.0.as_mut().unwrap_unchecked() }
    }
}

impl Deref for RecycledBox {
    type Target = SimdBuf;

    #[inline(always)]
    fn deref(&self) -> &Self::Target {
        self.get()
    }
}
impl DerefMut for RecycledBox {
    #[inline(always)]
    fn deref_mut(&mut self) -> &mut SimdBuf {
        self.get_mut()
    }
}

impl Drop for RecycledBox {
    #[inline(always)]
    fn drop(&mut self) {
        let mut x = None;
        core::mem::swap(&mut x, &mut self.0);
        RECYCLED_BOX_CACHE.with_borrow_mut(|v| v.push(unsafe { x.unwrap_unchecked() }));
    }
}

struct RecycledVec(Vec<S>);

thread_local! {
    static RECYCLED_VEC_CACHE: RefCell<Vec<Vec<S>>> = {
        RefCell::new(vec![])
    };
}

impl RecycledVec {
    #[inline(always)]
    fn take() -> Self {
        RecycledVec(RECYCLED_VEC_CACHE.with_borrow_mut(|v| v.pop().unwrap_or_default()))
    }
}

impl Deref for RecycledVec {
    type Target = Vec<S>;
    #[inline(always)]
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
impl DerefMut for RecycledVec {
    #[inline(always)]
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}
impl Drop for RecycledVec {
    #[inline(always)]
    fn drop(&mut self) {
        RECYCLED_VEC_CACHE.with_borrow_mut(|v| v.push(std::mem::take(&mut self.0)));
    }
}
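
// A minimal usage sketch (not part of the crate's test suite) of the
// thread-local recycling above: buffers are handed out via `Deref`/`DerefMut`
// and returned to the per-thread cache on drop, so repeated iterator calls on
// the same thread avoid re-allocating.
#[cfg(test)]
#[test]
fn recycled_buffers_sketch() {
    let mut boxed = RecycledBox::take();
    boxed[0] = S::ZERO; // DerefMut to `[S; 8]`
    drop(boxed); // pushed back into RECYCLED_BOX_CACHE

    let mut vec = RecycledVec::take();
    vec.push(S::ZERO); // DerefMut to `Vec<S>`
    drop(vec); // pushed back into RECYCLED_VEC_CACHE
}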

#[doc(hidden)]
pub struct Bits<const B: usize>;
#[doc(hidden)]
pub trait SupportedBits {}
impl SupportedBits for Bits<1> {}
impl SupportedBits for Bits<2> {}
impl SupportedBits for Bits<4> {}
impl SupportedBits for Bits<8> {}

/// Number of padding bytes at the end of `PackedSeqVecBase::seq`.
pub(crate) const PADDING: usize = 48;

/// A non-owned slice of DNA bases, packed with `B` bits per base.
#[doc(hidden)]
#[derive(Copy, Clone, Debug, MemSize, MemDbg)]
pub struct PackedSeqBase<'s, const B: usize>
where
    Bits<B>: SupportedBits,
{
    /// Packed data.
    seq: &'s [u8],
    /// Offset in bp from the start of `seq`.
    offset: usize,
    /// Length of the sequence in bp, starting at `offset` from the start of `seq`.
    len: usize,
}

/// An owned sequence of DNA bases, packed with `B` bits per base.
#[doc(hidden)]
#[derive(Clone, Debug, MemSize, MemDbg)]
#[cfg_attr(feature = "pyo3", pyo3::pyclass)]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
pub struct PackedSeqVecBase<const B: usize>
where
    Bits<B>: SupportedBits,
{
    /// NOTE: We maintain the invariant that this has at least 16 bytes of padding
    /// at the end, after `len` finishes.
    /// This ensures that the `read_unaligned` in `as_u64` works OK.
    pub(crate) seq: Vec<u8>,

    /// The length, in bp, of the underlying sequence. See `.len()`.
    len: usize,
}

pub type PackedSeq<'s> = PackedSeqBase<'s, 2>;
pub type PackedSeqVec = PackedSeqVecBase<2>;
pub type BitSeq<'s> = PackedSeqBase<'s, 1>;
pub type BitSeqVec = PackedSeqVecBase<1>;

/// Convenience constants.
/// B: bits per char.
impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    /// Lowest `B` bits are 1.
    const CHAR_MASK: u64 = (1 << B) - 1;
    const SIMD_B: S = S::new([B as u32; 8]);
    const SIMD_CHAR_MASK: S = S::new([(1 << B) - 1; 8]);
    /// Chars per byte.
    const C8: usize = 8 / B;
    /// Chars per u32.
    const C32: usize = 32 / B;
    /// Chars per u256.
    const C256: usize = 256 / B;
    /// Max length of a kmer that can be read as a single u64.
    const K64: usize = (64 - 8) / B + 1;
}

/// Convenience constants.
impl<const B: usize> PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    /// Chars per byte.
    const C8: usize = 8 / B;
}

impl<const B: usize> Default for PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    fn default() -> Self {
        Self {
            seq: vec![0; PADDING],
            len: 0,
        }
    }
}

// ======================================================================
// 2-BIT HELPER METHODS

/// Pack an ASCII `ACTGactg` character into its 2-bit representation, and panic for anything else.
#[inline(always)]
pub fn pack_char(base: u8) -> u8 {
    match base {
        b'a' | b'A' => 0,
        b'c' | b'C' => 1,
        b'g' | b'G' => 3,
        b't' | b'T' => 2,
        _ => panic!(
            "Unexpected character '{}' with ASCII value {base}. Expected one of ACTGactg.",
            base as char
        ),
    }
}

/// Pack an ASCII `ACTGactg` character into its 2-bit representation, and silently convert other characters into 0..4 as well.
#[inline(always)]
pub fn pack_char_lossy(base: u8) -> u8 {
    (base >> 1) & 3
}
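
// A small sketch (not part of the crate's test suite) of the lossy packing
// trick: bits 1 and 2 of the ASCII codes of `ACTGactg` already form the 2-bit
// encoding, with upper and lower case mapping identically.
#[cfg(test)]
#[test]
fn pack_char_lossy_sketch() {
    assert_eq!(pack_char_lossy(b'A'), 0);
    assert_eq!(pack_char_lossy(b'C'), 1);
    assert_eq!(pack_char_lossy(b'T'), 2);
    assert_eq!(pack_char_lossy(b'G'), 3);
    for &c in b"ACTG" {
        assert_eq!(pack_char_lossy(c), pack_char_lossy(c | 0x20)); // case-insensitive
        assert_eq!(pack_char_lossy(c), pack_char(c)); // agrees with the strict packer
    }
}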

/// Unpack a 2-bit DNA base into the corresponding `ACTG` character.
#[inline(always)]
pub fn unpack_base(base: u8) -> u8 {
    debug_assert!(base < 4, "Base {base} is not <4.");
    b"ACTG"[base as usize]
}

/// Complement an ASCII character: `A<>T` and `C<>G`.
#[inline(always)]
pub const fn complement_char(base: u8) -> u8 {
    match base {
        b'A' => b'T',
        b'C' => b'G',
        b'G' => b'C',
        b'T' => b'A',
        _ => panic!("Unexpected character. Expected one of ACTG."),
    }
}

/// Complement a 2-bit base: `0<>2` and `1<>3`.
#[inline(always)]
pub const fn complement_base(base: u8) -> u8 {
    base ^ 2
}

/// Complement 8 lanes of 2-bit bases: `0<>2` and `1<>3`.
#[inline(always)]
pub fn complement_base_simd(base: u32x8) -> u32x8 {
    const TWO: u32x8 = u32x8::new([2; 8]);
    base ^ TWO
}
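
// Sketch (not part of the test suite): in the `0132` encoding (A=0, C=1, T=2,
// G=3), complementation is a single XOR with 2, both for scalar bases and for
// the 8-lane SIMD variant.
#[cfg(test)]
#[test]
fn complement_sketch() {
    // A (0) <-> T (2), C (1) <-> G (3).
    assert_eq!(unpack_base(complement_base(pack_char(b'A'))), b'T');
    assert_eq!(unpack_base(complement_base(pack_char(b'C'))), b'G');
    // The SIMD version applies the same XOR to all 8 lanes.
    let lanes = u32x8::new([0, 1, 2, 3, 0, 1, 2, 3]);
    assert_eq!(
        complement_base_simd(lanes).to_array(),
        [2, 3, 0, 1, 2, 3, 0, 1]
    );
}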

/// Reverse complement the 2-bit pairs in the input.
#[inline(always)]
const fn revcomp_raw(word: u64) -> u64 {
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    {
        let mut res = word.reverse_bits(); // ARM can reverse bits in a single instruction
        res = ((res >> 1) & 0x5555_5555_5555_5555) | ((res & 0x5555_5555_5555_5555) << 1);
        res ^ 0xAAAA_AAAA_AAAA_AAAA
    }

    #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
    {
        let mut res = word.swap_bytes();
        res = ((res >> 4) & 0x0F0F_0F0F_0F0F_0F0F) | ((res & 0x0F0F_0F0F_0F0F_0F0F) << 4);
        res = ((res >> 2) & 0x3333_3333_3333_3333) | ((res & 0x3333_3333_3333_3333) << 2);
        res ^ 0xAAAA_AAAA_AAAA_AAAA
    }
}

/// Compute the reverse complement of a short sequence packed in a `u64`.
#[inline(always)]
pub const fn revcomp_u64(word: u64, len: usize) -> u64 {
    revcomp_raw(word) >> (u64::BITS as usize - 2 * len)
}

/// Compute the reverse complement of a short sequence packed in a `u128`.
#[inline(always)]
pub const fn revcomp_u128(word: u128, len: usize) -> u128 {
    let low = word as u64;
    let high = (word >> 64) as u64;
    let rlow = revcomp_raw(low);
    let rhigh = revcomp_raw(high);
    let out = ((rlow as u128) << 64) | rhigh as u128;
    out >> (u128::BITS as usize - 2 * len)
}
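
// Worked example (not part of the test suite): "AC" packs to 0b01_00
// (A=0 in the low pair, C=1 in the next pair); its reverse complement "GT"
// packs to 0b10_11. The u128 variant agrees on the same input.
#[cfg(test)]
#[test]
fn revcomp_sketch() {
    assert_eq!(revcomp_u64(0b01_00, 2), 0b10_11);
    assert_eq!(revcomp_u128(0b01_00, 2), 0b10_11);
}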

// ======================================================================
// 1-BIT HELPER METHODS

/// Returns 1 when a char is ambiguous, i.e., not one of `ACGTacgt`.
#[inline(always)]
pub fn char_is_ambiguous(base: u8) -> u8 {
    // (!matches!(base, b'A' | b'C'  | b'G'  | b'T' | b'a' | b'c'  | b'g'  | b't')) as u8
    let table = b"ACTG";
    let upper_mask = !(b'a' - b'A');
    (table[pack_char_lossy(base) as usize] != (base & upper_mask)) as u8
}

/// Reverse `len` bits packed in a `u64`.
#[inline(always)]
pub const fn rev_u64(word: u64, len: usize) -> u64 {
    word.reverse_bits() >> (u64::BITS as usize - len)
}

/// Reverse `len` bits packed in a `u128`.
#[inline(always)]
pub const fn rev_u128(word: u128, len: usize) -> u128 {
    word.reverse_bits() >> (u128::BITS as usize - len)
}
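
// Sketch (not part of the test suite): the 1-bit helpers mark ambiguous
// characters and reverse fixed-length bit strings.
#[cfg(test)]
#[test]
fn one_bit_helpers_sketch() {
    assert_eq!(char_is_ambiguous(b'A'), 0);
    assert_eq!(char_is_ambiguous(b'g'), 0);
    assert_eq!(char_is_ambiguous(b'N'), 1);
    // Reversing the 3-bit string 0b011 gives 0b110.
    assert_eq!(rev_u64(0b011, 3), 0b110);
    assert_eq!(rev_u128(0b011, 3), 0b110);
}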

// ======================================================================

impl<const B: usize> PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    /// Shrink `seq` to only just cover the data.
    #[inline(always)]
    pub fn normalize(&self) -> Self {
        let start_byte = self.offset / Self::C8;
        let end_byte = (self.offset + self.len).div_ceil(Self::C8);
        Self {
            seq: &self.seq[start_byte..end_byte + PADDING],
            offset: self.offset % Self::C8,
            len: self.len,
        }
    }

    /// Return a `Vec<u8>` of ASCII `ACTG` characters.
    #[inline(always)]
    pub fn unpack(&self) -> Vec<u8> {
        self.iter_bp().map(unpack_base).collect()
    }
}

/// Read 32 bytes starting at `idx`. The caller must ensure that `idx + 32 <= seq.len()`.
#[inline(always)]
pub(crate) unsafe fn read_slice_32_unchecked(seq: &[u8], idx: usize) -> u32x8 {
    unsafe {
        let src = seq.as_ptr().add(idx);
        debug_assert!(idx + 32 <= seq.len());
        std::mem::transmute::<_, *const u32x8>(src).read_unaligned()
    }
}

/// Read up to 32 bytes starting at `idx`; bytes past the end of `seq` are zero-filled.
#[inline(always)]
pub(crate) fn read_slice_32(seq: &[u8], idx: usize) -> u32x8 {
    unsafe {
        let src = seq.as_ptr().add(idx);
        if idx + 32 <= seq.len() {
            std::mem::transmute::<_, *const u32x8>(src).read_unaligned()
        } else {
            let num_bytes = seq.len().saturating_sub(idx);
            let mut result = [0u8; 32];
            std::ptr::copy_nonoverlapping(src, result.as_mut_ptr(), num_bytes);
            std::mem::transmute(result)
        }
    }
}

/// Read up to 16 bytes starting at `idx`; bytes past the end of `seq` are zero-filled.
#[allow(unused)]
#[inline(always)]
pub(crate) fn read_slice_16(seq: &[u8], idx: usize) -> u16x8 {
    unsafe {
        let src = seq.as_ptr().add(idx);
        if idx + 16 <= seq.len() {
            std::mem::transmute::<_, *const u16x8>(src).read_unaligned()
        } else {
            let num_bytes = seq.len().saturating_sub(idx);
            let mut result = [0u8; 16];
            std::ptr::copy_nonoverlapping(src, result.as_mut_ptr(), num_bytes);
            std::mem::transmute(result)
        }
    }
}
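
// Sketch (not part of the test suite): `read_slice_32` never reads past the
// end of the slice; the missing tail is filled with zero bytes instead.
#[cfg(test)]
#[test]
fn read_slice_32_sketch() {
    let seq = vec![0xFFu8; 10];
    let v = read_slice_32(&seq, 8);
    let bytes: [u8; 32] = unsafe { std::mem::transmute(v) };
    let mut expected = [0u8; 32];
    expected[..2].copy_from_slice(&[0xFF, 0xFF]);
    assert_eq!(bytes, expected);
}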

impl<'s, const B: usize> Seq<'s> for PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    const BITS_PER_CHAR: usize = B;
    const BASES_PER_BYTE: usize = Self::C8;
    type SeqVec = PackedSeqVecBase<B>;

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn get_ascii(&self, index: usize) -> u8 {
        unpack_base(self.get(index))
    }

    /// Convert a short sequence (kmer) to a packed representation as `u64`.
    /// Panics if `self` is longer than `64 / B` characters (32 for 2-bit sequences).
    #[inline(always)]
    fn as_u64(&self) -> u64 {
        assert!(self.len() <= 64 / B);

        let mask = u64::MAX >> (64 - B * self.len());

        // The unaligned read is OK, because we ensure that the underlying `PackedSeqVecBase::seq` always
        // has at least 16 bytes (the size of a u128) of padding at the end.
        if self.len() <= Self::K64 {
            let x = unsafe { (self.seq.as_ptr() as *const u64).read_unaligned() };
            (x >> (B * self.offset)) & mask
        } else {
            let x = unsafe { (self.seq.as_ptr() as *const u128).read_unaligned() };
            (x >> (B * self.offset)) as u64 & mask
        }
    }

    /// Convert a short sequence (kmer) to a packed representation of its reverse complement as `u64`.
    /// Panics if `self` is longer than `64 / B` characters (32 for 2-bit sequences).
    #[inline(always)]
    fn revcomp_as_u64(&self) -> u64 {
        match B {
            1 => rev_u64(self.as_u64(), self.len()),
            2 => revcomp_u64(self.as_u64(), self.len()),
            _ => panic!("Rev(comp) is only supported for 1-bit and 2-bit alphabets."),
        }
    }

    /// Convert a short sequence (kmer) to a packed representation as `u128`.
    /// Panics if `self` is longer than `(128 - 8) / B + 1` characters (61 for 2-bit sequences).
    #[inline(always)]
    fn as_u128(&self) -> u128 {
        assert!(
            self.len() <= (128 - 8) / B + 1,
            "Sequences >61 long cannot be read with a single unaligned u128 read."
        );

        let mask = u128::MAX >> (128 - B * self.len());

        // The unaligned read is OK, because we ensure that the underlying `PackedSeqVecBase::seq` always
        // has at least 16 bytes (the size of a u128) of padding at the end.
        let x = unsafe { (self.seq.as_ptr() as *const u128).read_unaligned() };
        (x >> (B * self.offset)) & mask
    }

    /// Convert a short sequence (kmer) to a packed representation of its reverse complement as `u128`.
    /// Panics if `self` is longer than `(128 - 8) / B + 1` characters (61 for 2-bit sequences).
    #[inline(always)]
    fn revcomp_as_u128(&self) -> u128 {
        match B {
            1 => rev_u128(self.as_u128(), self.len()),
            2 => revcomp_u128(self.as_u128(), self.len()),
            _ => panic!("Rev(comp) is only supported for 1-bit and 2-bit alphabets."),
        }
    }

    #[inline(always)]
    fn to_vec(&self) -> PackedSeqVecBase<B> {
        assert_eq!(self.offset, 0);
        PackedSeqVecBase {
            seq: self
                .seq
                .iter()
                .copied()
                .chain(std::iter::repeat_n(0u8, PADDING))
                .collect(),
            len: self.len,
        }
    }

    fn to_revcomp(&self) -> PackedSeqVecBase<B> {
        match B {
            1 | 2 => {}
            _ => panic!("Can only reverse (&complement) 1-bit and 2-bit packed sequences."),
        }

        let mut seq = self.seq[..(self.offset + self.len).div_ceil(Self::C8)]
            .iter()
            // 1. Reverse the bytes.
            .rev()
            .copied()
            .map(|mut res| {
                match B {
                    2 => {
                        // 2. Swap the bases within the byte.
                        // This is auto-vectorized.
                        res = ((res >> 4) & 0x0F) | ((res & 0x0F) << 4);
                        res = ((res >> 2) & 0x33) | ((res & 0x33) << 2);
                        // Complement the bases.
                        res ^ 0xAA
                    }
                    1 => res.reverse_bits(),
                    _ => unreachable!(),
                }
            })
            .chain(std::iter::repeat_n(0u8, PADDING))
            .collect::<Vec<u8>>();

        // 3. Shift away the offset.
        let new_offset = (Self::C8 - (self.offset + self.len) % Self::C8) % Self::C8;

        if new_offset > 0 {
            // Shift everything left by `B * new_offset` bits.
            let shift = B * new_offset;
            *seq.last_mut().unwrap() >>= shift;
            // This loop is also auto-vectorized.
            for i in 0..seq.len() - 1 {
                seq[i] = (seq[i] >> shift) | (seq[i + 1] << (8 - shift));
            }
        }

        PackedSeqVecBase { seq, len: self.len }
    }

    #[inline(always)]
    fn slice(&self, range: Range<usize>) -> Self {
        debug_assert!(
            range.end <= self.len,
            "Slice index out of bounds: {} > {}",
            range.end,
            self.len
        );
        PackedSeqBase {
            seq: self.seq,
            offset: self.offset + range.start,
            len: range.end - range.start,
        }
        .normalize()
    }

    #[inline(always)]
    fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> {
        assert!(self.len <= self.seq.len() * Self::C8);

        let this = self.normalize();

        // Read a u64 at a time?
        let mut byte = 0;
        (0..this.len + this.offset)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C8 == 0 {
                        byte = this.seq[i / Self::C8];
                    }
                    // Shift byte instead of i?
                    (byte >> (B * (i % Self::C8))) & Self::CHAR_MASK as u8
                },
            )
            .advance(this.offset)
    }

    #[inline(always)]
    fn par_iter_bp(self, context: usize) -> PaddedIt<impl ChunkIt<S>> {
        self.par_iter_bp_with_buf(context, RecycledBox::take())
    }

    #[inline(always)]
    fn par_iter_bp_delayed(self, context: usize, delay: Delay) -> PaddedIt<impl ChunkIt<(S, S)>> {
        self.par_iter_bp_delayed_with_factor(context, delay, 1)
    }

    /// NOTE: When `self` does not start at a byte boundary, the
    /// 'delayed' character is not guaranteed to be `0`.
    #[inline(always)]
    fn par_iter_bp_delayed_2(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(
            context,
            delay1,
            delay2,
            1,
            RecycledVec::take(),
        )
    }

    /// Compares `K64` characters at a time (29 for 2-bit sequences).
    fn cmp_lcp(&self, other: &Self) -> (std::cmp::Ordering, usize) {
        let mut lcp = 0;
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(Self::K64) {
            let len = (min_len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                // Unfortunately, bases are packed in little-endian order, so the default order is reversed.
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() as usize / B * B;
                lcp += t / B;
                let mask = (Self::CHAR_MASK) << t;
                return ((this_word & mask).cmp(&(other_word & mask)), lcp);
            }
            lcp += len;
        }
        (self.len.cmp(&other.len), lcp)
    }

    #[inline(always)]
    fn get(&self, index: usize) -> u8 {
        let offset = self.offset + index;
        let idx = offset / Self::C8;
        let offset = offset % Self::C8;
        (self.seq[idx] >> (B * offset)) & Self::CHAR_MASK as u8
    }
}
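
// Worked example (not part of the test suite): "ACGT" packs to 0b10_11_01_00
// (bases are stored little-endian, 2 bits each), slicing re-normalizes the
// offset, and "ACGT" is its own reverse complement.
#[cfg(test)]
#[test]
fn as_u64_sketch() {
    let mut v = PackedSeqVec::default();
    v.push_ascii(b"ACGT");
    let s = v.as_slice();
    assert_eq!(s.as_u64(), 0b10_11_01_00);
    assert_eq!(s.revcomp_as_u64(), s.as_u64());
    assert_eq!(s.get_ascii(2), b'G');
    assert_eq!(s.slice(1..3).unpack(), b"CG".to_vec());
}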

impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    #[inline(always)]
    pub fn par_iter_bp_with_buf<BUF: DerefMut<Target = [S; 8]>>(
        self,
        context: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // Without +o, since we don't need them in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut cur = S::ZERO;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        // Safety check for the `read_slice_32_unchecked`:
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            // Read a u256 for each lane containing the next 128 characters.
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            *buf = transpose(data);
                        }
                        cur = buf[(i % Self::C256) / Self::C32];
                    }
                    // Extract the last `B` bits of each character.
                    let chars = cur & Self::SIMD_CHAR_MASK;
                    // Shift remaining characters to the right.
                    cur = cur >> Self::SIMD_B;
                    chars
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_with_factor(
        self,
        context: usize,
        delay: Delay,
        factor: usize,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B>> {
        self.par_iter_bp_delayed_with_factor_and_buf(context, delay, factor, RecycledVec::take())
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_with_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        delay: Delay,
        buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B, BUF>> {
        self.par_iter_bp_delayed_with_factor_and_buf(context, delay, 1, buf)
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_with_factor_and_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        Delay(delay): Delay,
        factor: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(
            delay < usize::MAX / 2,
            "Delay={} should be >=0.",
            delay as isize
        );

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // Without +o, since we don't need them in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride
            .div_ceil(L)
            .next_multiple_of(factor * Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d = S::ZERO;

        // An even buf_len is nice, to only have the write==buf_len check once.
        // We also make it the next power of 2, for faster modulo operations.
        // delay / C32: the number of SIMD words spanned by the delay.
        // +8: some 'random' padding.
        let buf_len = (delay / Self::C32 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        if buf.capacity() < buf_len {
            // This has better codegen than `vec.clear(); vec.resize()`, since the inner `do_reserve_and_handle` of resize is not inlined.
            *buf = vec![S::ZERO; buf_len];
        } else {
            // NOTE: Buf needs to be filled with zeros to guarantee returning 0 values for out-of-bounds characters.
            buf.clear();
            buf.resize(buf_len, S::ZERO);
        }

        let mut write_idx = 0;
        // We compensate for the first delay/C32 triggers of the check below that
        // happen before the delay is actually reached.
        let mut read_idx = (buf_len - delay / Self::C32) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        // Safety check for the `read_slice_32_unchecked`:
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            // Read a u256 for each lane containing the next 128 characters.
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            unsafe {
                                *TryInto::<&mut [u32x8; 8]>::try_into(
                                    buf.get_unchecked_mut(write_idx..write_idx + 8),
                                )
                                .unwrap_unchecked() = transpose(data);
                            }
                            if i == 0 {
                                // Mask out chars before the offset.
                                let elem = !((1u32 << (B * o)) - 1);
                                let mask = S::splat(elem);
                                unsafe { assert_unchecked(write_idx < buf.len()) };
                                buf[write_idx] &= mask;
                            }
                        }
                        unsafe { assert_unchecked(write_idx < buf.len()) };
                        upcoming = buf[write_idx];
                        write_idx += 1;
                        write_idx &= buf_mask;
                    }
                    if i % Self::C32 == delay % Self::C32 {
                        unsafe { assert_unchecked(read_idx < buf.len()) };
                        upcoming_d = buf[read_idx];
                        read_idx += 1;
                        read_idx &= buf_mask;
                    }
                    // Extract the last `B` bits of each character.
                    let chars = upcoming & Self::SIMD_CHAR_MASK;
                    let chars_d = upcoming_d & Self::SIMD_CHAR_MASK;
                    // Shift remaining characters to the right.
                    upcoming = upcoming >> Self::SIMD_B;
                    upcoming_d = upcoming_d >> Self::SIMD_B;
                    (chars, chars_d)
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_factor(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
        factor: usize,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(
            context,
            delay1,
            delay2,
            factor,
            RecycledVec::take(),
        )
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
        buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B, BUF>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(context, delay1, delay2, 1, buf)
    }

    /// When iterating over 2-bit and 1-bit encoded data in parallel,
    /// one must ensure that they have the same stride.
    /// On the larger type, set `factor` to the ratio between the two types,
    /// so that the stride in bytes is a multiple of `factor` and the
    /// smaller type also gets a byte-aligned stride.
    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_factor_and_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        Delay(delay1): Delay,
        Delay(delay2): Delay,
        factor: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);
        assert!(delay1 <= delay2, "Delay1 must be at most delay2.");

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // Without +o, since we don't need them in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride
            .div_ceil(L)
            .next_multiple_of(factor * Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d1 = S::ZERO;
        let mut upcoming_d2 = S::ZERO;

        // An even buf_len is nice, to only have the write==buf_len check once.
        let buf_len = (delay2 / Self::C32 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        if buf.capacity() < buf_len {
            // This has better codegen than `vec.clear(); vec.resize()`, since the inner `do_reserve_and_handle` of resize is not inlined.
            *buf = vec![S::ZERO; buf_len];
        } else {
            // NOTE: Buf needs to be filled with zeros to guarantee returning 0 values for out-of-bounds characters.
            buf.clear();
            buf.resize(buf_len, S::ZERO);
        }

        let mut write_idx = 0;
        // We compensate for the first delay/C32 triggers of the check below that
        // happen before the delay is actually reached.
        let mut read_idx1 = (buf_len - delay1 / Self::C32) % buf_len;
        let mut read_idx2 = (buf_len - delay2 / Self::C32) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        // Safety check for the `read_slice_32_unchecked`:
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            // Read a u256 for each lane containing the next 128 characters.
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            unsafe {
                                *TryInto::<&mut [u32x8; 8]>::try_into(
                                    buf.get_unchecked_mut(write_idx..write_idx + 8),
                                )
                                .unwrap_unchecked() = transpose(data);
                            }
                            // FIXME DROP THIS?
                            if i == 0 {
                                // Mask out chars before the offset.
                                let elem = !((1u32 << (B * o)) - 1);
                                let mask = S::splat(elem);
                                buf[write_idx] &= mask;
                            }
                        }
                        upcoming = buf[write_idx];
                        write_idx += 1;
                        write_idx &= buf_mask;
                    }
                    if i % Self::C32 == delay1 % Self::C32 {
                        unsafe { assert_unchecked(read_idx1 < buf.len()) };
                        upcoming_d1 = buf[read_idx1];
                        read_idx1 += 1;
                        read_idx1 &= buf_mask;
                    }
                    if i % Self::C32 == delay2 % Self::C32 {
                        unsafe { assert_unchecked(read_idx2 < buf.len()) };
                        upcoming_d2 = buf[read_idx2];
                        read_idx2 += 1;
                        read_idx2 &= buf_mask;
                    }
                    // Extract the last `B` bits of each character.
                    let chars = upcoming & Self::SIMD_CHAR_MASK;
                    let chars_d1 = upcoming_d1 & Self::SIMD_CHAR_MASK;
                    let chars_d2 = upcoming_d2 & Self::SIMD_CHAR_MASK;
                    // Shift remaining characters to the right.
                    upcoming = upcoming >> Self::SIMD_B;
                    upcoming_d1 = upcoming_d1 >> Self::SIMD_B;
                    upcoming_d2 = upcoming_d2 >> Self::SIMD_B;
                    (chars, chars_d1, chars_d2)
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }
}
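
// Worked example (not part of the test suite) of the chunking arithmetic used
// by the `par_iter_bp*` functions above, with hypothetical numbers mirroring
// the formulas: 1000 bp with context 31, split over 8 lanes of a 2-bit
// sequence (4 chars per byte).
#[cfg(test)]
#[test]
fn stride_math_sketch() {
    let (len, context, lanes, c8) = (1000usize, 31usize, 8usize, 4usize);
    let num_kmers_stride = len - (context - 1); // 970 kmers over all lanes
    let n = num_kmers_stride.div_ceil(lanes).next_multiple_of(c8); // 124 bp per lane
    let bytes_per_chunk = n / c8; // 31 bytes per lane
    let padding = c8 * lanes * bytes_per_chunk - num_kmers_stride; // 22 padded kmers
    assert_eq!((n, bytes_per_chunk, padding), (124, 31, 22));
}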

impl<const B: usize> PartialEq for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    /// Compares `K64` characters at a time (29 for 2-bit sequences).
    fn eq(&self, other: &Self) -> bool {
        if self.len != other.len {
            return false;
        }
        for i in (0..self.len).step_by(Self::K64) {
            let len = (self.len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let that = other.slice(i..i + len);
            if this.as_u64() != that.as_u64() {
                return false;
            }
        }
        true
    }
}

impl<const B: usize> Eq for PackedSeqBase<'_, B> where Bits<B>: SupportedBits {}

impl<const B: usize> PartialOrd for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl<const B: usize> Ord for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    /// Compares `K64` characters at a time (29 for 2-bit sequences).
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(Self::K64) {
            let len = (min_len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                // Unfortunately, bases are packed in little-endian order, so the default order is reversed.
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() as usize / B * B;
                let mask = (Self::CHAR_MASK) << t;
                return (this_word & mask).cmp(&(other_word & mask));
            }
        }
        self.len.cmp(&other.len)
    }
}
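
// Sketch (not part of the test suite): ordering follows the packed 0132
// encoding, not ASCII, so 'T' (2) sorts before 'G' (3). `cmp_lcp` additionally
// reports the length of the longest common prefix.
#[cfg(test)]
#[test]
fn ordering_sketch() {
    let mut a = PackedSeqVec::default();
    a.push_ascii(b"ACGT");
    let mut b = PackedSeqVec::default();
    b.push_ascii(b"ACGG");
    let (ord, lcp) = a.as_slice().cmp_lcp(&b.as_slice());
    assert_eq!(ord, std::cmp::Ordering::Less);
    assert_eq!(lcp, 3);
    assert!(a.as_slice() < b.as_slice());
}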

impl<const B: usize> SeqVec for PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    type Seq<'s> = PackedSeqBase<'s, B>;

    #[inline(always)]
    fn into_raw(mut self) -> Vec<u8> {
        self.seq.resize(self.len.div_ceil(Self::C8), 0);
        self.seq
    }

    #[inline(always)]
    fn as_slice(&self) -> Self::Seq<'_> {
        PackedSeqBase {
            seq: &self.seq[..self.len.div_ceil(Self::C8) + PADDING],
            offset: 0,
            len: self.len,
        }
    }

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn clear(&mut self) {
        self.seq.clear();
        self.len = 0;
    }

    fn push_seq<'a>(&mut self, seq: PackedSeqBase<'_, B>) -> Range<usize> {
        let start = self.len.next_multiple_of(Self::C8) + seq.offset;
        let end = start + seq.len();

        // Shrink away the padding.
        self.seq.resize(self.len.div_ceil(Self::C8), 0);
        // Extend.
        self.seq.extend(seq.seq);
        self.len = end;
        start..end
    }

    /// For `PackedSeqVec` (2-bit encoding): map ASCII `ACGT` (and `acgt`) to DNA `0132` in that order.
    /// Other characters are silently mapped into `0..4`.
    ///
    /// Uses the BMI2 `pext` instruction when available, based on the
    /// `n_to_bits_pext` method described at
    /// <https://github.com/Daniel-Liu-c0deb0t/cute-nucleotides>.
    ///
    /// For `BitSeqVec` (1-bit encoding): map `ACGTacgt` to `0`, and everything else to `1`.
    fn push_ascii(&mut self, seq: &[u8]) -> Range<usize> {
        match B {
            1 | 2 => {}
            _ => panic!(
                "Can only use ASCII input for 2-bit DNA packing, or 1-bit ambiguous indicators."
            ),
        }

        self.seq
            .resize((self.len + seq.len()).div_ceil(Self::C8) + PADDING, 0);
        let start_aligned = self.len.next_multiple_of(Self::C8);
        let start = self.len;
        let len = seq.len();
        let mut idx = self.len / Self::C8;

        let parse_base = |base| match B {
            1 => char_is_ambiguous(base),
            2 => pack_char_lossy(base),
            _ => unreachable!(),
        };

        let unaligned = core::cmp::min(start_aligned - start, len);
        if unaligned > 0 {
            let mut packed_byte = self.seq[idx];
            for &base in &seq[..unaligned] {
                packed_byte |= parse_base(base) << ((self.len % Self::C8) * B);
                self.len += 1;
            }
            self.seq[idx] = packed_byte;
            idx += 1;
        }

        #[allow(unused)]
        let mut last = unaligned;

        if B == 2 {
            #[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
            {
                last = unaligned + (len - unaligned) / 8 * 8;

                for i in (unaligned..last).step_by(8) {
                    let chunk =
                        unsafe { seq.get_unchecked(i..i + 8).try_into().unwrap_unchecked() };
                    let ascii = u64::from_le_bytes(chunk);
                    let packed_bytes =
                        unsafe { std::arch::x86_64::_pext_u64(ascii, 0x0606060606060606) } as u16;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 2))
                            .copy_from_slice(&packed_bytes.to_le_bytes())
                    };
                    idx += 2;
                    self.len += 8;
                }
            }

            #[cfg(target_feature = "neon")]
            {
                use core::arch::aarch64::{
                    vandq_u8, vdup_n_u8, vld1q_u8, vpadd_u8, vshlq_u8, vst1_u8,
                };
                use core::mem::transmute;

                last = unaligned + (len - unaligned) / 16 * 16;

                for i in (unaligned..last).step_by(16) {
                    unsafe {
                        let ascii = vld1q_u8(seq.as_ptr().add(i));
                        let masked_bits = vandq_u8(ascii, transmute([6i8; 16]));
                        let (bits_0, bits_1) = transmute(vshlq_u8(
                            masked_bits,
                            transmute([-1i8, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5]),
                        ));
                        let half_packed = vpadd_u8(bits_0, bits_1);
                        let packed = vpadd_u8(half_packed, vdup_n_u8(0));
                        vst1_u8(self.seq.as_mut_ptr().add(idx), packed);
                        idx += Self::C8;
                        self.len += 16;
                    }
                }
            }
        }

        if B == 1 {
            #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
            {
                last = len;
                self.len += len - unaligned;

                let mut last_i = unaligned;

                for i in (unaligned..last).step_by(32) {
                    use std::mem::transmute as t;

                    use wide::CmpEq;
                    // Wide doesn't have u8x32, so this is messy here...
                    type S = wide::i8x32;
                    let chars: S = unsafe { t(read_slice_32(seq, i)) };
                    let upper_mask = !(b'a' - b'A');
                    // Make everything upper case.
                    let chars = chars & S::splat(upper_mask as i8);
                    let lossy_encoded = chars & S::splat(6);
                    let table = unsafe { S::from(t::<_, S>(*b"AxCxTxGxxxxxxxxxAxCxTxGxxxxxxxxx")) };
                    let lookup: S = unsafe {
                        t(std::arch::x86_64::_mm256_shuffle_epi8(
                            t(table),
                            t(lossy_encoded),
                        ))
                    };
                    let packed_bytes = !(chars.cmp_eq(lookup).move_mask() as u32);

                    last_i = i;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 4))
                            .copy_from_slice(&packed_bytes.to_le_bytes())
                    };
                    idx += 4;
                }

                // Fix up trailing bytes.
                if unaligned < last {
                    idx -= 4;
                    let mut val = unsafe {
                        u32::from_le_bytes(
                            self.seq
                                .get_unchecked(idx..(idx + 4))
                                .try_into()
                                .unwrap_unchecked(),
                        )
                    };
                    // Keep only the `last - last_i` low bits.
                    let keep = last - last_i;
                    val <<= 32 - keep;
                    val >>= 32 - keep;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 4))
                            .copy_from_slice(&val.to_le_bytes())
                    };
                    idx += keep.div_ceil(8);
                }
            }

            #[cfg(target_feature = "neon")]
            {
                use core::arch::aarch64::*;
                use core::mem::transmute;

                last = unaligned + (len - unaligned) / 64 * 64;

                for i in (unaligned..last).step_by(64) {
                    unsafe {
                        let ptr = seq.as_ptr().add(i);
                        let chars = vld4q_u8(ptr);

                        let upper_mask = vdupq_n_u8(!(b'a' - b'A'));
                        let chars = neon::map_8x16x4(chars, |v| vandq_u8(v, upper_mask));

                        let two_bits_mask = vdupq_n_u8(6);
                        let lossy_encoded = neon::map_8x16x4(chars, |v| vandq_u8(v, two_bits_mask));

                        let table = transmute(*b"AxCxTxGxxxxxxxxx");
                        let lookup = neon::map_8x16x4(lossy_encoded, |v| vqtbl1q_u8(table, v));

                        let mask = neon::map_two_8x16x4(chars, lookup, |v1, v2| vceqq_u8(v1, v2));
                        let packed_bytes = !neon::movemask_64(mask);

                        self.seq[idx..(idx + 8)].copy_from_slice(&packed_bytes.to_le_bytes());
                        idx += 8;
                        self.len += 64;
                    }
                }
            }
        }

        let mut packed_byte = 0;
        for &base in &seq[last..] {
            packed_byte |= parse_base(base) << ((self.len % Self::C8) * B);
            self.len += 1;
            if self.len % Self::C8 == 0 {
                self.seq[idx] = packed_byte;
                idx += 1;
                packed_byte = 0;
            }
        }
        if self.len % Self::C8 != 0 && last < len {
            self.seq[idx] = packed_byte;
            idx += 1;
        }
        assert_eq!(idx + PADDING, self.seq.len());
        start..start + len
    }

    #[cfg(feature = "rand")]
    fn random(n: usize) -> Self {
        use rand::{RngCore, SeedableRng};

        let byte_len = n.div_ceil(Self::C8);
        let mut seq = vec![0; byte_len + PADDING];
        rand::rngs::SmallRng::from_os_rng().fill_bytes(&mut seq[..byte_len]);
        // Ensure that the last byte is padded with zeros.
        if n % Self::C8 != 0 {
            seq[byte_len - 1] &= (1 << (B * (n % Self::C8))) - 1;
        }

        Self { seq, len: n }
    }
}
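
// Sketch (not part of the test suite): `push_ascii` packs lossily, so upper-
// and lower-case input become the same 2-bit sequence, and unpacking always
// yields upper-case `ACTG`.
#[cfg(test)]
#[test]
fn push_ascii_sketch() {
    let mut v = PackedSeqVec::default();
    let range = v.push_ascii(b"ACGTacgt");
    assert_eq!(range, 0..8);
    assert_eq!(v.len(), 8);
    assert_eq!(v.as_slice().unpack(), b"ACGTACGT".to_vec());
}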

impl PackedSeqVecBase<1> {
    /// An all-zeros (fully unambiguous) 1-bit sequence of length `n`.
    pub fn with_len(n: usize) -> Self {
        Self {
            seq: vec![0; n.div_ceil(Self::C8) + PADDING],
            len: n,
        }
    }

    /// A random 1-bit sequence of length `len`, with roughly an `n_frac` fraction of positions marked ambiguous.
    pub fn random(len: usize, n_frac: f32) -> Self {
        let byte_len = len.div_ceil(Self::C8);
        let mut seq = vec![0; byte_len + PADDING];

        assert!(
            (0.0..=0.3).contains(&n_frac),
            "n_frac={} should be in [0, 0.3]",
            n_frac
        );

        for _ in 0..(len as f32 * n_frac) as usize {
            let idx = rand::random::<u64>() as usize % len;
            let byte = idx / Self::C8;
            let offset = idx % Self::C8;
            seq[byte] |= 1 << offset;
        }

        Self { seq, len }
    }
}

impl<'s> PackedSeqBase<'s, 1> {
    /// An iterator indicating for each kmer whether it contains ambiguous bases.
    ///
    /// Returns n-(k-1) elements.
    #[inline(always)]
    pub fn iter_kmer_ambiguity(self, k: usize) -> impl ExactSizeIterator<Item = bool> + use<'s> {
        let this = self.normalize();
        assert!(k > 0);
        assert!(k <= Self::K64);
        (this.offset..this.offset + this.len.saturating_sub(k - 1))
            .map(move |i| self.read_kmer(k, i) != 0)
    }

    /// A parallel iterator indicating for each kmer whether it contains ambiguous bases.
    ///
    /// The first element is the 'kmer' consisting only of the first character of each chunk.
    ///
    /// `k`: length of the windows to check.
    /// `context`: number of overlapping iterations +1, to determine the stride of each lane.
    /// `skip`: set to `context-1` to skip the iterations added by the context.
    #[inline(always)]
    pub fn par_iter_kmer_ambiguity(
        self,
        k: usize,
        context: usize,
        skip: usize,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(k > 0, "par_iter_kmers requires k>0, but k={k}");
        assert!(k <= 96, "par_iter_kmers requires k<=96, but k={k}");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let delay = k - 1;

        let it = self.par_iter_bp_delayed(context, Delay(delay));

        let mut cnt = u32x8::ZERO;

        it.map(
            #[inline(always)]
            move |(a, r)| {
                cnt += a;
                let out = cnt.cmp_gt(S::ZERO);
                cnt -= r;
                out
            },
        )
        .advance(skip)
    }

    #[inline(always)]
    pub fn par_iter_kmer_ambiguity_with_buf(
        self,
        k: usize,
        context: usize,
        skip: usize,
        buf: &'s mut Vec<S>,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(k > 0, "par_iter_kmers requires k>0, but k={k}");
        assert!(k <= 96, "par_iter_kmers requires k<=96, but k={k}");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let delay = k - 1;

        let it = self.par_iter_bp_delayed_with_buf(context, Delay(delay), buf);

        let mut cnt = u32x8::ZERO;

        it.map(
            #[inline(always)]
            move |(a, r)| {
                cnt += a;
                let out = cnt.cmp_gt(S::ZERO);
                cnt -= r;
                out
            },
        )
        .advance(skip)
    }
}
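
// Sketch (not part of the test suite, assuming `read_kmer` reads the k bases
// starting at the given position): a 1-bit sequence marks ambiguous bases, and
// `iter_kmer_ambiguity` reports, per 3-mer, whether any of its bases is
// ambiguous.
#[cfg(test)]
#[test]
fn kmer_ambiguity_sketch() {
    let mut v = BitSeqVec::default();
    v.push_ascii(b"ACGNTT"); // only the 'N' is ambiguous
    let flags: Vec<bool> = v.as_slice().iter_kmer_ambiguity(3).collect();
    // 3-mers: ACG, CGN, GNT, NTT.
    assert_eq!(flags, vec![false, true, true, true]);
}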

#[cfg(target_feature = "neon")]
mod neon {
    use core::arch::aarch64::*;

    #[inline(always)]
    pub fn movemask_64(v: uint8x16x4_t) -> u64 {
        // https://stackoverflow.com/questions/74722950/convert-vector-compare-mask-into-bit-mask-in-aarch64-simd-or-arm-neon/74748402#74748402
        unsafe {
            let acc = vsriq_n_u8(vsriq_n_u8(v.3, v.2, 1), vsriq_n_u8(v.1, v.0, 1), 2);
            vget_lane_u64(
                vreinterpret_u64_u8(vshrn_n_u16(
                    vreinterpretq_u16_u8(vsriq_n_u8(acc, acc, 4)),
                    4,
                )),
                0,
            )
        }
    }

    #[inline(always)]
    pub fn map_8x16x4<F>(v: uint8x16x4_t, mut f: F) -> uint8x16x4_t
    where
        F: FnMut(uint8x16_t) -> uint8x16_t,
    {
        uint8x16x4_t(f(v.0), f(v.1), f(v.2), f(v.3))
    }

    #[inline(always)]
    pub fn map_two_8x16x4<F>(v1: uint8x16x4_t, v2: uint8x16x4_t, mut f: F) -> uint8x16x4_t
    where
        F: FnMut(uint8x16_t, uint8x16_t) -> uint8x16_t,
    {
        uint8x16x4_t(f(v1.0, v2.0), f(v1.1, v2.1), f(v1.2, v2.2), f(v1.3, v2.3))
    }
}