packed_seq/packed_seq.rs

use core::cell::RefCell;
use std::ops::{Deref, DerefMut};
use traits::Seq;
use wide::u16x8;

use crate::{intrinsics::transpose, padded_it::ChunkIt};

use super::*;

type SimdBuf = [S; 8];

struct RecycledBox(Option<Box<SimdBuf>>);

thread_local! {
    static RECYCLED_BOX_CACHE: RefCell<Vec<Box<SimdBuf>>> = {
        RefCell::new(vec![Box::new(SimdBuf::default())])
    };
}

impl RecycledBox {
    #[inline(always)]
    fn take() -> Self {
        let mut buf = RECYCLED_BOX_CACHE.with_borrow_mut(|v| RecycledBox(v.pop()));
        buf.init_if_needed();
        buf
    }

    #[inline(always)]
    fn init_if_needed(&mut self) {
        if self.0.is_none() {
            self.0 = Some(Box::new(SimdBuf::default()));
        }
    }

    #[inline(always)]
    fn get(&self) -> &SimdBuf {
        unsafe { self.0.as_ref().unwrap_unchecked() }
    }

    #[inline(always)]
    fn get_mut(&mut self) -> &mut SimdBuf {
        unsafe { self.0.as_mut().unwrap_unchecked() }
    }
}

impl Deref for RecycledBox {
    type Target = SimdBuf;

    #[inline(always)]
    fn deref(&self) -> &Self::Target {
        self.get()
    }
}
impl DerefMut for RecycledBox {
    #[inline(always)]
    fn deref_mut(&mut self) -> &mut SimdBuf {
        self.get_mut()
    }
}

impl Drop for RecycledBox {
    #[inline(always)]
    fn drop(&mut self) {
        let mut x = None;
        core::mem::swap(&mut x, &mut self.0);
        RECYCLED_BOX_CACHE.with_borrow_mut(|v| v.push(unsafe { x.unwrap_unchecked() }));
    }
}

struct RecycledVec(Vec<S>);

thread_local! {
    static RECYCLED_VEC_CACHE: RefCell<Vec<Vec<S>>> = {
        RefCell::new(vec![])
    };
}

impl RecycledVec {
    #[inline(always)]
    fn take() -> Self {
        RecycledVec(RECYCLED_VEC_CACHE.with_borrow_mut(|v| v.pop().unwrap_or_default()))
    }
}

impl Deref for RecycledVec {
    type Target = Vec<S>;
    #[inline(always)]
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
impl DerefMut for RecycledVec {
    #[inline(always)]
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}
impl Drop for RecycledVec {
    #[inline(always)]
    fn drop(&mut self) {
        RECYCLED_VEC_CACHE.with_borrow_mut(|v| v.push(std::mem::take(&mut self.0)));
    }
}

#[doc(hidden)]
pub struct Bits<const B: usize>;
#[doc(hidden)]
pub trait SupportedBits {}
impl SupportedBits for Bits<1> {}
impl SupportedBits for Bits<2> {}
impl SupportedBits for Bits<4> {}
impl SupportedBits for Bits<8> {}

/// Number of padding bytes at the end of `PackedSeqVecBase::seq`.
pub(crate) const PADDING: usize = 48;

/// A variable-bit-width packed non-owned slice of DNA bases.
#[derive(Copy, Clone, Debug, MemSize, MemDbg)]
pub struct PackedSeqBase<'s, const B: usize>
where
    Bits<B>: SupportedBits,
{
    /// Packed data.
    seq: &'s [u8],
    /// Offset in bp from the start of the `seq`.
    offset: usize,
    /// Length of the sequence in bp, starting at `offset` from the start of `seq`.
    len: usize,
}

/// A variable-bit-width packed owned sequence of DNA bases.
#[derive(Clone, Debug, MemSize, MemDbg)]
#[cfg_attr(feature = "pyo3", pyo3::pyclass)]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
pub struct PackedSeqVecBase<const B: usize>
where
    Bits<B>: SupportedBits,
{
    /// NOTE: We maintain the invariant that this has at least 48 bytes of padding
    /// at the end, after the `len` packed bases.
    /// This ensures that the `read_unaligned` in `as_u64` works OK.
    pub(crate) seq: Vec<u8>,

    /// The length, in bp, of the underlying sequence. See `.len()`.
    len: usize,
}

pub type PackedSeq<'s> = PackedSeqBase<'s, 2>;
pub type PackedSeqVec = PackedSeqVecBase<2>;
pub type BitSeq<'s> = PackedSeqBase<'s, 1>;
pub type BitSeqVec = PackedSeqVecBase<1>;

/// Convenience constants.
/// `B`: bits per char.
impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    /// Lowest `B` bits are 1.
    const CHAR_MASK: u64 = (1 << B) - 1;
    const SIMD_B: S = S::new([B as u32; 8]);
    const SIMD_CHAR_MASK: S = S::new([(1 << B) - 1; 8]);
    /// Chars per byte
    const C8: usize = 8 / B;
    /// Chars per u32
    const C32: usize = 32 / B;
    /// Chars per u256
    const C256: usize = 256 / B;
    /// Max length of a kmer that can be read as a single u64.
    const K64: usize = (64 - 8) / B + 1;
}

/// Convenience constants.
impl<const B: usize> PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    /// Chars per byte
    const C8: usize = 8 / B;
}

impl<const B: usize> Default for PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    fn default() -> Self {
        Self {
            seq: vec![0; PADDING],
            len: 0,
        }
    }
}

// ======================================================================
// 2-BIT HELPER METHODS

/// Pack an ASCII `ACTGactg` character into its 2-bit representation, and panic for anything else.
#[inline(always)]
pub fn pack_char(base: u8) -> u8 {
    match base {
        b'a' | b'A' => 0,
        b'c' | b'C' => 1,
        b'g' | b'G' => 3,
        b't' | b'T' => 2,
        _ => panic!(
            "Unexpected character '{}' with ASCII value {base}. Expected one of ACTGactg.",
            base as char
        ),
    }
}

/// Pack an ASCII `ACTGactg` character into its 2-bit representation, and silently convert other characters into 0..4 as well.
#[inline(always)]
pub fn pack_char_lossy(base: u8) -> u8 {
    (base >> 1) & 3
}
/// Pack a slice of ASCII `ACTGactg` characters into its packed 2-bit kmer representation.
/// Other characters are silently converted.
#[inline(always)]
pub fn pack_kmer_lossy(slice: &[u8]) -> u64 {
    let mut kmer = 0;
    for (i, &base) in slice.iter().enumerate() {
        kmer |= (pack_char_lossy(base) as u64) << (2 * i);
    }
    kmer
}
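
// A minimal, hypothetical example (not part of the original file) of the lossy
// 2-bit packing above: `pack_char_lossy` maps A->0, C->1, T->2, G->3 by looking
// at bits 1-2 of the ASCII code, and `pack_kmer_lossy` packs characters
// little-endian, 2 bits per base.
#[cfg(test)]
mod pack_example {
    use super::*;

    #[test]
    fn pack_chars_and_kmer() {
        assert_eq!(pack_char(b'G'), 3);
        assert_eq!(pack_char_lossy(b'g'), 3);
        // A=0, C=1<<2, G=3<<4, T=2<<6 => 0 + 4 + 48 + 128.
        assert_eq!(pack_kmer_lossy(b"ACGT"), 0b10_11_01_00);
    }
}
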
/// Pack a slice of ASCII `ACTGactg` characters into its packed 2-bit kmer representation.
#[inline(always)]
pub fn pack_kmer_u128_lossy(slice: &[u8]) -> u128 {
    let mut kmer = 0;
    for (i, &base) in slice.iter().enumerate() {
        kmer |= (pack_char_lossy(base) as u128) << (2 * i);
    }
    kmer
}

/// Unpack a 2-bit DNA base into the corresponding `ACTG` character.
#[inline(always)]
pub fn unpack_base(base: u8) -> u8 {
    debug_assert!(base < 4, "Base {base} is not <4.");
    b"ACTG"[base as usize]
}

/// Unpack a 2-bit encoded kmer into corresponding `ACTG` characters.
/// Slice the returned array to the correct length.
#[inline(always)]
pub fn unpack_kmer(kmer: u64) -> [u8; 32] {
    std::array::from_fn(|i| unpack_base(((kmer >> (2 * i)) & 3) as u8))
}
/// Unpack a 2-bit encoded kmer into corresponding `ACTG` characters.
#[inline(always)]
pub fn unpack_kmer_into_vec(kmer: u64, k: usize, out: &mut Vec<u8>) {
    out.clear();
    out.extend((0..k).map(|i| unpack_base(((kmer >> (2 * i)) & 3) as u8)));
}
/// Unpack a 2-bit encoded kmer into corresponding `ACTG` characters.
#[inline(always)]
pub fn unpack_kmer_to_vec(kmer: u64, k: usize) -> Vec<u8> {
    let mut out = vec![];
    unpack_kmer_into_vec(kmer, k, &mut out);
    out
}
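
// A small, hypothetical round-trip check (not from the original file): packing
// an ASCII kmer and unpacking it again gives back the input, as long as all
// characters are in `ACTGactg` (uppercased on the way out).
#[cfg(test)]
mod unpack_example {
    use super::*;

    #[test]
    fn pack_unpack_round_trip() {
        let kmer = pack_kmer_lossy(b"GATTACA");
        assert_eq!(unpack_kmer_to_vec(kmer, 7), b"GATTACA".to_vec());
    }
}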

/// Unpack a 2-bit encoded kmer into corresponding `ACTG` characters.
/// Slice the returned array to the correct length.
#[inline(always)]
pub fn unpack_kmer_u128(kmer: u128) -> [u8; 64] {
    std::array::from_fn(|i| unpack_base(((kmer >> (2 * i)) & 3) as u8))
}
/// Unpack a 2-bit encoded kmer into corresponding `ACTG` characters.
#[inline(always)]
pub fn unpack_kmer_u128_into_vec(kmer: u128, k: usize, out: &mut Vec<u8>) {
    out.clear();
    out.extend((0..k).map(|i| unpack_base(((kmer >> (2 * i)) & 3) as u8)));
}
/// Unpack a 2-bit encoded kmer into corresponding `ACTG` characters.
#[inline(always)]
pub fn unpack_kmer_u128_to_vec(kmer: u128, k: usize) -> Vec<u8> {
    let mut out = vec![];
    unpack_kmer_u128_into_vec(kmer, k, &mut out);
    out
}

/// Complement an ASCII character: `A<>T` and `C<>G`.
#[inline(always)]
pub const fn complement_char(base: u8) -> u8 {
    match base {
        b'A' => b'T',
        b'C' => b'G',
        b'G' => b'C',
        b'T' => b'A',
        _ => panic!("Unexpected character. Expected one of ACTG."),
    }
}

/// Complement a 2-bit base: `0<>2` and `1<>3`.
#[inline(always)]
pub const fn complement_base(base: u8) -> u8 {
    base ^ 2
}

/// Complement 8 lanes of 2-bit bases: `0<>2` and `1<>3`.
#[inline(always)]
pub fn complement_base_simd(base: u32x8) -> u32x8 {
    const TWO: u32x8 = u32x8::new([2; 8]);
    base ^ TWO
}
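
// A hypothetical example (not part of the original file): in the 2-bit encoding
// A=0, C=1, T=2, G=3, complementing is a single XOR with 2, which maps
// A(0)<->T(2) and C(1)<->G(3), matching `complement_char` on ASCII.
#[cfg(test)]
mod complement_example {
    use super::*;

    #[test]
    fn ascii_and_packed_complements_agree() {
        for &c in b"ACGT" {
            assert_eq!(complement_char(c), unpack_base(complement_base(pack_char(c))));
        }
    }
}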

/// Reverse complement the 2-bit pairs in the input.
#[inline(always)]
const fn revcomp_raw(word: u64) -> u64 {
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    {
        let mut res = word.reverse_bits(); // ARM can reverse bits in a single instruction
        res = ((res >> 1) & 0x5555_5555_5555_5555) | ((res & 0x5555_5555_5555_5555) << 1);
        res ^ 0xAAAA_AAAA_AAAA_AAAA
    }

    #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
    {
        let mut res = word.swap_bytes();
        res = ((res >> 4) & 0x0F0F_0F0F_0F0F_0F0F) | ((res & 0x0F0F_0F0F_0F0F_0F0F) << 4);
        res = ((res >> 2) & 0x3333_3333_3333_3333) | ((res & 0x3333_3333_3333_3333) << 2);
        res ^ 0xAAAA_AAAA_AAAA_AAAA
    }
}

/// Compute the reverse complement of a short sequence packed in a `u64`.
#[inline(always)]
pub const fn revcomp_u64(word: u64, len: usize) -> u64 {
    revcomp_raw(word) >> (u64::BITS as usize - 2 * len)
}

/// Compute the reverse complement of a short sequence packed in a `u128`.
#[inline(always)]
pub const fn revcomp_u128(word: u128, len: usize) -> u128 {
    let low = word as u64;
    let high = (word >> 64) as u64;
    let rlow = revcomp_raw(low);
    let rhigh = revcomp_raw(high);
    let out = ((rlow as u128) << 64) | rhigh as u128;
    out >> (u128::BITS as usize - 2 * len)
}
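
// A hypothetical worked example (not from the original file): the reverse
// complement of "AC" is "GT". Packed little-endian with A=0, C=1, T=2, G=3,
// "AC" is 0b01_00 = 4 and "GT" is 0b10_11 = 11. "ACGTACGT" is its own reverse
// complement.
#[cfg(test)]
mod revcomp_example {
    use super::*;

    #[test]
    fn revcomp_of_short_kmers() {
        assert_eq!(revcomp_u64(pack_kmer_lossy(b"AC"), 2), pack_kmer_lossy(b"GT"));
        assert_eq!(
            revcomp_u128(pack_kmer_u128_lossy(b"ACGTACGT"), 8),
            pack_kmer_u128_lossy(b"ACGTACGT")
        );
    }
}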

// ======================================================================
// 1-BIT HELPER METHODS

/// Returns 1 when a char is ambiguous, i.e. not one of `ACTGactg`, and 0 otherwise.
#[inline(always)]
pub fn char_is_ambiguous(base: u8) -> u8 {
    // (!matches!(base, b'A' | b'C'  | b'G'  | b'T' | b'a' | b'c'  | b'g'  | b't')) as u8
    let table = b"ACTG";
    let upper_mask = !(b'a' - b'A');
    (table[pack_char_lossy(base) as usize] != (base & upper_mask)) as u8
}

/// Reverse `len` bits packed in a `u64`.
#[inline(always)]
pub const fn rev_u64(word: u64, len: usize) -> u64 {
    word.reverse_bits() >> (u64::BITS as usize - len)
}

/// Reverse `len` bits packed in a `u128`.
#[inline(always)]
pub const fn rev_u128(word: u128, len: usize) -> u128 {
    word.reverse_bits() >> (u128::BITS as usize - len)
}
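
// A hypothetical example (not from the original file) of the 1-bit helpers:
// `char_is_ambiguous` flags anything outside `ACTGactg`, and `rev_u64` reverses
// the low `len` bits of a word.
#[cfg(test)]
mod one_bit_example {
    use super::*;

    #[test]
    fn ambiguity_and_bit_reversal() {
        assert_eq!(char_is_ambiguous(b'A'), 0);
        assert_eq!(char_is_ambiguous(b't'), 0);
        assert_eq!(char_is_ambiguous(b'N'), 1);
        // 0b110 reversed over 3 bits is 0b011.
        assert_eq!(rev_u64(0b110, 3), 0b011);
    }
}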

// ======================================================================

impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    /// Creates a `Seq` from a slice of packed bytes, an offset in bp and a length in bp.
    ///
    /// The slice should have at least 48 bytes of padding after `offset + len`.
    /// Otherwise, the function will panic.
    pub fn from_raw_parts(seq: &'s [u8], offset: usize, len: usize) -> Self {
        assert!(offset + len + PADDING * Self::C8 <= seq.len() * Self::C8);
        Self { seq, offset, len }
    }

    /// Shrink `seq` to only just cover the data.
    #[inline(always)]
    pub fn normalize(&self) -> Self {
        let start_byte = self.offset / Self::C8;
        let end_byte = (self.offset + self.len).div_ceil(Self::C8);
        Self {
            seq: &self.seq[start_byte..end_byte + PADDING],
            offset: self.offset % Self::C8,
            len: self.len,
        }
    }

    /// Return a `Vec<u8>` of ASCII `ACTG` characters.
    #[inline(always)]
    pub fn unpack(&self) -> Vec<u8> {
        self.iter_bp().map(unpack_base).collect()
    }
}
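
// A hypothetical usage sketch (not from the original file): building a 4 bp
// `PackedSeq` from a hand-packed byte plus the required `PADDING` bytes, and
// unpacking it back to ASCII.
#[cfg(test)]
mod from_raw_parts_example {
    use super::*;

    #[test]
    fn raw_parts_round_trip() {
        // "ACGT" packed little-endian: A=0, C=1, G=3, T=2 => 0b10_11_01_00.
        let mut raw = vec![0b1011_0100u8];
        raw.resize(1 + PADDING, 0);
        let s = PackedSeq::from_raw_parts(&raw, 0, 4);
        assert_eq!(s.unpack(), b"ACGT".to_vec());
    }
}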

/// Read 32 bytes starting at `idx`. The caller must ensure `idx + 32 <= seq.len()`.
#[inline(always)]
pub(crate) unsafe fn read_slice_32_unchecked(seq: &[u8], idx: usize) -> u32x8 {
    unsafe {
        let src = seq.as_ptr().add(idx);
        debug_assert!(idx + 32 <= seq.len());
        std::mem::transmute::<_, *const u32x8>(src).read_unaligned()
    }
}

/// Read up to 32 bytes starting at `idx`, zero-padding reads past the end of `seq`.
#[inline(always)]
pub(crate) fn read_slice_32(seq: &[u8], idx: usize) -> u32x8 {
    unsafe {
        let src = seq.as_ptr().add(idx);
        if idx + 32 <= seq.len() {
            std::mem::transmute::<_, *const u32x8>(src).read_unaligned()
        } else {
            let num_bytes = seq.len().saturating_sub(idx);
            let mut result = [0u8; 32];
            std::ptr::copy_nonoverlapping(src, result.as_mut_ptr(), num_bytes);
            std::mem::transmute(result)
        }
    }
}

/// Read up to 16 bytes starting at `idx`, zero-padding reads past the end of `seq`.
#[allow(unused)]
#[inline(always)]
pub(crate) fn read_slice_16(seq: &[u8], idx: usize) -> u16x8 {
    unsafe {
        let src = seq.as_ptr().add(idx);
        if idx + 16 <= seq.len() {
            std::mem::transmute::<_, *const u16x8>(src).read_unaligned()
        } else {
            let num_bytes = seq.len().saturating_sub(idx);
            let mut result = [0u8; 16];
            std::ptr::copy_nonoverlapping(src, result.as_mut_ptr(), num_bytes);
            std::mem::transmute(result)
        }
    }
}

impl<'s, const B: usize> Seq<'s> for PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    const BITS_PER_CHAR: usize = B;
    const BASES_PER_BYTE: usize = Self::C8;
    type SeqVec = PackedSeqVecBase<B>;

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn get_ascii(&self, index: usize) -> u8 {
        unpack_base(self.get(index))
    }

    /// Convert a short sequence (kmer) to a packed representation as `u64`.
    /// Panics if `self` is longer than `64 / B` characters (32 for 2-bit).
    #[inline(always)]
    fn as_u64(&self) -> u64 {
        assert!(self.len() <= 64 / B);

        let mask = u64::MAX >> (64 - B * self.len());

        // The unaligned read is OK, because we ensure that the underlying `PackedSeqVecBase::seq` always
        // has at least 48 bytes of padding at the end.
        if self.len() <= Self::K64 {
            let x = unsafe { (self.seq.as_ptr() as *const u64).read_unaligned() };
            (x >> (B * self.offset)) & mask
        } else {
            let x = unsafe { (self.seq.as_ptr() as *const u128).read_unaligned() };
            (x >> (B * self.offset)) as u64 & mask
        }
    }

    /// Convert a short sequence (kmer) to a packed representation of its reverse complement as `u64`.
    /// Panics if `self` is longer than `64 / B` characters (32 for 2-bit).
    #[inline(always)]
    fn revcomp_as_u64(&self) -> u64 {
        match B {
            1 => rev_u64(self.as_u64(), self.len()),
            2 => revcomp_u64(self.as_u64(), self.len()),
            _ => panic!("Rev(comp) is only supported for 1-bit and 2-bit alphabets."),
        }
    }

    /// Convert a short sequence (kmer) to a packed representation as `u128`.
    /// Panics if `self` is longer than `(128 - 8) / B + 1` characters (61 for 2-bit).
    #[inline(always)]
    fn as_u128(&self) -> u128 {
        assert!(
            self.len() <= (128 - 8) / B + 1,
            "Sequences >61 long cannot be read with a single unaligned u128 read."
        );

        let mask = u128::MAX >> (128 - B * self.len());

        // The unaligned read is OK, because we ensure that the underlying `PackedSeqVecBase::seq` always
        // has at least 48 bytes of padding at the end.
        let x = unsafe { (self.seq.as_ptr() as *const u128).read_unaligned() };
        (x >> (B * self.offset)) & mask
    }

    /// Convert a short sequence (kmer) to a packed representation of its reverse complement as `u128`.
    /// Panics if `self` is longer than `(128 - 8) / B + 1` characters (61 for 2-bit).
    #[inline(always)]
    fn revcomp_as_u128(&self) -> u128 {
        match B {
            1 => rev_u128(self.as_u128(), self.len()),
            2 => revcomp_u128(self.as_u128(), self.len()),
            _ => panic!("Rev(comp) is only supported for 1-bit and 2-bit alphabets."),
        }
    }

    #[inline(always)]
    fn to_vec(&self) -> PackedSeqVecBase<B> {
        assert_eq!(self.offset, 0);
        PackedSeqVecBase {
            seq: self
                .seq
                .iter()
                .copied()
                .chain(std::iter::repeat_n(0u8, PADDING))
                .collect(),
            len: self.len,
        }
    }

    fn to_revcomp(&self) -> PackedSeqVecBase<B> {
        match B {
            1 | 2 => {}
            _ => panic!("Can only reverse (&complement) 1-bit and 2-bit packed sequences."),
        }

        let mut seq = self.seq[..(self.offset + self.len).div_ceil(Self::C8)]
            .iter()
            // 1. reverse the bytes
            .rev()
            .copied()
            .map(|mut res| {
                match B {
                    2 => {
                        // 2. swap the bases in the byte
                        // This is auto-vectorized.
                        res = ((res >> 4) & 0x0F) | ((res & 0x0F) << 4);
                        res = ((res >> 2) & 0x33) | ((res & 0x33) << 2);
                        // Complement the bases.
                        res ^ 0xAA
                    }
                    1 => res.reverse_bits(),
                    _ => unreachable!(),
                }
            })
            .chain(std::iter::repeat_n(0u8, PADDING))
            .collect::<Vec<u8>>();

        // 3. Shift away the offset.
        let new_offset = (Self::C8 - (self.offset + self.len) % Self::C8) % Self::C8;

        if new_offset > 0 {
            // Shift everything down by `B * new_offset` bits, towards the start of the sequence.
            let shift = B * new_offset;
            *seq.last_mut().unwrap() >>= shift;
            // This loop is also auto-vectorized.
            for i in 0..seq.len() - 1 {
                seq[i] = (seq[i] >> shift) | (seq[i + 1] << (8 - shift));
            }
        }

        PackedSeqVecBase { seq, len: self.len }
    }

    #[inline(always)]
    fn slice(&self, range: Range<usize>) -> Self {
        debug_assert!(
            range.end <= self.len,
            "Slice index out of bounds: {} > {}",
            range.end,
            self.len
        );
        PackedSeqBase {
            seq: self.seq,
            offset: self.offset + range.start,
            len: range.end - range.start,
        }
        .normalize()
    }

    #[inline(always)]
    fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> {
        assert!(self.len <= self.seq.len() * Self::C8);

        let this = self.normalize();

        // read u64 at a time?
        let mut byte = 0;
        (0..this.len + this.offset)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C8 == 0 {
                        byte = this.seq[i / Self::C8];
                    }
                    // Shift byte instead of i?
                    (byte >> (B * (i % Self::C8))) & Self::CHAR_MASK as u8
                },
            )
            .advance(this.offset)
    }

    #[inline(always)]
    fn par_iter_bp(self, context: usize) -> PaddedIt<impl ChunkIt<S>> {
        self.par_iter_bp_with_buf(context, RecycledBox::take())
    }

    #[inline(always)]
    fn par_iter_bp_delayed(self, context: usize, delay: Delay) -> PaddedIt<impl ChunkIt<(S, S)>> {
        self.par_iter_bp_delayed_with_factor(context, delay, 1)
    }

    /// NOTE: When `self` does not start at a byte boundary, the
    /// 'delayed' character is not guaranteed to be `0`.
    #[inline(always)]
    fn par_iter_bp_delayed_2(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(
            context,
            delay1,
            delay2,
            1,
            RecycledVec::take(),
        )
    }

    /// Compares `K64` characters (29 for 2-bit) at a time.
    fn cmp_lcp(&self, other: &Self) -> (std::cmp::Ordering, usize) {
        let mut lcp = 0;
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(Self::K64) {
            let len = (min_len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                // Unfortunately, bases are packed in little endian order, so the default order is reversed.
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() as usize / B * B;
                lcp += t / B;
                let mask = (Self::CHAR_MASK) << t;
                return ((this_word & mask).cmp(&(other_word & mask)), lcp);
            }
            lcp += len;
        }
        (self.len.cmp(&other.len), lcp)
    }

    #[inline(always)]
    fn get(&self, index: usize) -> u8 {
        let offset = self.offset + index;
        let idx = offset / Self::C8;
        let offset = offset % Self::C8;
        (self.seq[idx] >> (B * offset)) & Self::CHAR_MASK as u8
    }
}
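
// A hypothetical example (not from the original file) tying the slice API to the
// scalar helpers above: `as_u64` returns the same packed value as
// `pack_kmer_lossy`, and `revcomp_as_u64` matches packing the reverse complement.
#[cfg(test)]
mod as_u64_example {
    use super::*;

    #[test]
    fn as_u64_matches_pack_kmer_lossy() {
        let mut v = PackedSeqVec::default();
        v.push_ascii(b"ACGTTGCA");
        let s = v.as_slice();
        assert_eq!(s.as_u64(), pack_kmer_lossy(b"ACGTTGCA"));
        assert_eq!(s.revcomp_as_u64(), pack_kmer_lossy(b"TGCAACGT"));
        assert_eq!(s.get_ascii(2), b'G');
    }
}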

impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    #[inline(always)]
    pub fn par_iter_bp_with_buf<BUF: DerefMut<Target = [S; 8]>>(
        self,
        context: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // Without `+ o`, since the offset is not needed in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;
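        // Worked example (added; not in the original): assuming L = 8 lanes and
        // B = 2 (so C8 = 4 chars per byte), a sequence of len = 1000 with
        // context = 31 gives num_kmers_stride = 970, n = ceil(970 / 8) = 122
        // rounded up to a multiple of 4 = 124, bytes_per_chunk = 31, and
        // padding = 4 * 8 * 31 - 970 = 22 padded-out positions.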

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut cur = S::ZERO;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        // Safety check for the `read_slice_32_unchecked`:
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            // Read a u256 for each lane, containing the next C256 characters.
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            *buf = transpose(data);
                        }
                        cur = buf[(i % Self::C256) / Self::C32];
                    }
                    // Extract the low B bits of each character.
                    let chars = cur & Self::SIMD_CHAR_MASK;
                    // Shift remaining characters to the right.
                    cur = cur >> Self::SIMD_B;
                    chars
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_with_factor(
        self,
        context: usize,
        delay: Delay,
        factor: usize,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B>> {
        self.par_iter_bp_delayed_with_factor_and_buf(context, delay, factor, RecycledVec::take())
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_with_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        delay: Delay,
        buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B, BUF>> {
        self.par_iter_bp_delayed_with_factor_and_buf(context, delay, 1, buf)
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_with_factor_and_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        Delay(delay): Delay,
        factor: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(
            delay < usize::MAX / 2,
            "Delay={} should be >=0.",
            delay as isize
        );

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // Without `+ o`, since the offset is not needed in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride
            .div_ceil(L)
            .next_multiple_of(factor * Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d = S::ZERO;

        // An even buf_len is nice, so the write == buf_len check is only needed once.
        // We also make it the next power of 2, for faster modulo operations.
        // delay / C32: the number of u32 words spanned by the delay (C32 bp per u32).
        // +8: some 'random' padding.
        let buf_len = (delay / Self::C32 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        if buf.capacity() < buf_len {
            // This has better codegen than `vec.clear(); vec.resize()`, since the inner `do_reserve_and_handle` of resize is not inlined.
            *buf = vec![S::ZERO; buf_len];
        } else {
            // NOTE: Buf needs to be filled with zeros to guarantee returning 0 values for out-of-bounds characters.
            buf.clear();
            buf.resize(buf_len, S::ZERO);
        }

        let mut write_idx = 0;
        // We compensate for the first delay / C32 triggers of the check below that
        // happen before the delay is actually reached.
        let mut read_idx = (buf_len - delay / Self::C32) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        // Safety check for the `read_slice_32_unchecked`:
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            // Read a u256 for each lane, containing the next C256 characters.
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            unsafe {
                                *TryInto::<&mut [u32x8; 8]>::try_into(
                                    buf.get_unchecked_mut(write_idx..write_idx + 8),
                                )
                                .unwrap_unchecked() = transpose(data);
                            }
                            if i == 0 {
                                // Mask out chars before the offset.
                                let elem = !((1u32 << (B * o)) - 1);
                                let mask = S::splat(elem);
                                unsafe { assert_unchecked(write_idx < buf.len()) };
                                buf[write_idx] &= mask;
                            }
                        }
                        unsafe { assert_unchecked(write_idx < buf.len()) };
                        upcoming = buf[write_idx];
                        write_idx += 1;
                        write_idx &= buf_mask;
                    }
                    if i % Self::C32 == delay % Self::C32 {
                        unsafe { assert_unchecked(read_idx < buf.len()) };
                        upcoming_d = buf[read_idx];
                        read_idx += 1;
                        read_idx &= buf_mask;
                    }
                    // Extract the low B bits of each character.
                    let chars = upcoming & Self::SIMD_CHAR_MASK;
                    let chars_d = upcoming_d & Self::SIMD_CHAR_MASK;
                    // Shift remaining characters to the right.
                    upcoming = upcoming >> Self::SIMD_B;
                    upcoming_d = upcoming_d >> Self::SIMD_B;
                    (chars, chars_d)
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_factor(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
        factor: usize,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(
            context,
            delay1,
            delay2,
            factor,
            RecycledVec::take(),
        )
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
        buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B, BUF>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(context, delay1, delay2, 1, buf)
    }

    /// When iterating over 2-bit and 1-bit encoded data in parallel,
    /// one must ensure that they have the same stride.
    /// On the larger type, set `factor` to the ratio between the two bit widths,
    /// so that its stride in bytes is a multiple of `factor` and the smaller type
    /// also gets a byte-aligned stride.
    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_factor_and_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        Delay(delay1): Delay,
        Delay(delay2): Delay,
        factor: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);
        assert!(delay1 <= delay2, "Delay1 must be at most delay2.");

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // Without `+ o`, since the offset is not needed in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride
            .div_ceil(L)
            .next_multiple_of(factor * Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d1 = S::ZERO;
        let mut upcoming_d2 = S::ZERO;

        // An even buf_len is nice, so the write == buf_len check is only needed once.
        let buf_len = (delay2 / Self::C32 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        if buf.capacity() < buf_len {
            // This has better codegen than `vec.clear(); vec.resize()`, since the inner `do_reserve_and_handle` of resize is not inlined.
            *buf = vec![S::ZERO; buf_len];
        } else {
            // NOTE: Buf needs to be filled with zeros to guarantee returning 0 values for out-of-bounds characters.
            buf.clear();
            buf.resize(buf_len, S::ZERO);
        }

        let mut write_idx = 0;
        // We compensate for the first delay / C32 triggers of the check below that
        // happen before the delay is actually reached.
        let mut read_idx1 = (buf_len - delay1 / Self::C32) % buf_len;
        let mut read_idx2 = (buf_len - delay2 / Self::C32) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        // Safety check for the `read_slice_32_unchecked`:
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            // Read a u256 for each lane, containing the next C256 characters.
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            unsafe {
                                *TryInto::<&mut [u32x8; 8]>::try_into(
                                    buf.get_unchecked_mut(write_idx..write_idx + 8),
                                )
                                .unwrap_unchecked() = transpose(data);
                            }
                            // FIXME DROP THIS?
                            if i == 0 {
                                // Mask out chars before the offset.
                                let elem = !((1u32 << (B * o)) - 1);
                                let mask = S::splat(elem);
                                buf[write_idx] &= mask;
                            }
                        }
                        upcoming = buf[write_idx];
                        write_idx += 1;
                        write_idx &= buf_mask;
                    }
                    if i % Self::C32 == delay1 % Self::C32 {
                        unsafe { assert_unchecked(read_idx1 < buf.len()) };
                        upcoming_d1 = buf[read_idx1];
                        read_idx1 += 1;
                        read_idx1 &= buf_mask;
                    }
                    if i % Self::C32 == delay2 % Self::C32 {
                        unsafe { assert_unchecked(read_idx2 < buf.len()) };
                        upcoming_d2 = buf[read_idx2];
                        read_idx2 += 1;
                        read_idx2 &= buf_mask;
                    }
                    // Extract the low B bits of each character.
                    let chars = upcoming & Self::SIMD_CHAR_MASK;
                    let chars_d1 = upcoming_d1 & Self::SIMD_CHAR_MASK;
                    let chars_d2 = upcoming_d2 & Self::SIMD_CHAR_MASK;
                    // Shift remaining characters to the right.
                    upcoming = upcoming >> Self::SIMD_B;
                    upcoming_d1 = upcoming_d1 >> Self::SIMD_B;
                    upcoming_d2 = upcoming_d2 >> Self::SIMD_B;
                    (chars, chars_d1, chars_d2)
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }
}

impl<const B: usize> PartialEq for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    /// Compares `K64` characters (29 for 2-bit) at a time.
    fn eq(&self, other: &Self) -> bool {
        if self.len != other.len {
            return false;
        }
        for i in (0..self.len).step_by(Self::K64) {
            let len = (self.len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let that = other.slice(i..i + len);
            if this.as_u64() != that.as_u64() {
                return false;
            }
        }
        true
    }
}

impl<const B: usize> Eq for PackedSeqBase<'_, B> where Bits<B>: SupportedBits {}

impl<const B: usize> PartialOrd for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl<const B: usize> Ord for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    /// Compares `K64` characters (29 for 2-bit) at a time.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(Self::K64) {
            let len = (min_len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                // Unfortunately, bases are packed in little endian order, so the default order is reversed.
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() as usize / B * B;
                let mask = (Self::CHAR_MASK) << t;
                return (this_word & mask).cmp(&(other_word & mask));
            }
        }
        self.len.cmp(&other.len)
    }
}
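
// A hypothetical example (not from the original file): ordering follows the
// packed 2-bit values (A=0 < C=1 < T=2 < G=3), not ASCII order, so "ACT" sorts
// before "ACG".
#[cfg(test)]
mod packed_ord_example {
    use super::*;

    #[test]
    fn order_follows_packed_values_not_ascii() {
        let mut a = PackedSeqVec::default();
        a.push_ascii(b"ACT");
        let mut b = PackedSeqVec::default();
        b.push_ascii(b"ACG");
        assert!(a.as_slice() < b.as_slice());
        // The longest common prefix of "ACT" and "ACG" is 2 characters.
        assert_eq!(a.as_slice().cmp_lcp(&b.as_slice()).1, 2);
    }
}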

impl<const B: usize> SeqVec for PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    type Seq<'s> = PackedSeqBase<'s, B>;

    #[inline(always)]
    fn into_raw(mut self) -> Vec<u8> {
        self.seq.resize(self.len.div_ceil(Self::C8), 0);
        self.seq
    }

    #[inline(always)]
    fn as_slice(&self) -> Self::Seq<'_> {
        PackedSeqBase {
            seq: &self.seq[..self.len.div_ceil(Self::C8) + PADDING],
            offset: 0,
            len: self.len,
        }
    }

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn clear(&mut self) {
        self.seq.clear();
        self.len = 0;
    }

    fn push_seq<'a>(&mut self, seq: PackedSeqBase<'_, B>) -> Range<usize> {
        let start = self.len.next_multiple_of(Self::C8) + seq.offset;
        let end = start + seq.len();

        // Shrink away the padding.
        self.seq.resize(self.len.div_ceil(Self::C8), 0);
        // Extend.
        self.seq.extend(seq.seq);
        self.len = end;
        start..end
    }

    /// For `PackedSeqVec` (2-bit encoding): map ASCII `ACGT` (and `acgt`) to DNA `0132` in that order.
    /// Other characters are silently mapped into `0..4`.
    ///
    /// Uses the BMI2 `pext` instruction when available, based on the
    /// `n_to_bits_pext` method described at
    /// <https://github.com/Daniel-Liu-c0deb0t/cute-nucleotides>.
    ///
    /// For `BitSeqVec` (1-bit encoding): map `ACGTacgt` to `0`, and everything else to `1`.
    fn push_ascii(&mut self, seq: &[u8]) -> Range<usize> {
        match B {
            1 | 2 => {}
            _ => panic!(
                "Can only use ASCII input for 2-bit DNA packing, or 1-bit ambiguous indicators."
            ),
        }

        self.seq
            .resize((self.len + seq.len()).div_ceil(Self::C8) + PADDING, 0);
        let start_aligned = self.len.next_multiple_of(Self::C8);
        let start = self.len;
        let len = seq.len();
        let mut idx = self.len / Self::C8;

        let parse_base = |base| match B {
            1 => char_is_ambiguous(base),
            2 => pack_char_lossy(base),
            _ => unreachable!(),
        };

        let unaligned = core::cmp::min(start_aligned - start, len);
        if unaligned > 0 {
            let mut packed_byte = self.seq[idx];
            for &base in &seq[..unaligned] {
                packed_byte |= parse_base(base) << ((self.len % Self::C8) * B);
                self.len += 1;
            }
            self.seq[idx] = packed_byte;
            idx += 1;
        }

        #[allow(unused)]
        let mut last = unaligned;

        if B == 2 {
            #[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
            {
                last = unaligned + (len - unaligned) / 8 * 8;

                for i in (unaligned..last).step_by(8) {
                    let chunk =
                        unsafe { seq.get_unchecked(i..i + 8).try_into().unwrap_unchecked() };
                    let ascii = u64::from_le_bytes(chunk);
                    let packed_bytes =
                        unsafe { std::arch::x86_64::_pext_u64(ascii, 0x0606060606060606) } as u16;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 2))
                            .copy_from_slice(&packed_bytes.to_le_bytes())
                    };
                    idx += 2;
                    self.len += 8;
                }
            }

            #[cfg(target_feature = "neon")]
            {
                use core::arch::aarch64::{
                    vandq_u8, vdup_n_u8, vld1q_u8, vpadd_u8, vshlq_u8, vst1_u8,
                };
                use core::mem::transmute;

                last = unaligned + (len - unaligned) / 16 * 16;

                for i in (unaligned..last).step_by(16) {
                    unsafe {
                        let ascii = vld1q_u8(seq.as_ptr().add(i));
                        let masked_bits = vandq_u8(ascii, transmute([6i8; 16]));
                        let (bits_0, bits_1) = transmute(vshlq_u8(
                            masked_bits,
                            transmute([-1i8, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5]),
                        ));
                        let half_packed = vpadd_u8(bits_0, bits_1);
                        let packed = vpadd_u8(half_packed, vdup_n_u8(0));
                        vst1_u8(self.seq.as_mut_ptr().add(idx), packed);
                        idx += Self::C8;
                        self.len += 16;
                    }
                }
            }
        }

        if B == 1 {
            #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
            {
                last = len;
                self.len += len - unaligned;

                let mut last_i = unaligned;

                for i in (unaligned..last).step_by(32) {
                    use std::mem::transmute as t;

                    use wide::CmpEq;
                    // Wide doesn't have u8x32, so this is messy here...
                    type S = wide::i8x32;
                    let chars: S = unsafe { t(read_slice_32(seq, i)) };
                    let upper_mask = !(b'a' - b'A');
                    // make everything upper case
                    let chars = chars & S::splat(upper_mask as i8);
                    let lossy_encoded = chars & S::splat(6);
                    let table = unsafe { S::from(t::<_, S>(*b"AxCxTxGxxxxxxxxxAxCxTxGxxxxxxxxx")) };
                    let lookup: S = unsafe {
                        t(std::arch::x86_64::_mm256_shuffle_epi8(
                            t(table),
                            t(lossy_encoded),
                        ))
                    };
                    let packed_bytes = !(chars.cmp_eq(lookup).move_mask() as u32);

                    last_i = i;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 4))
                            .copy_from_slice(&packed_bytes.to_le_bytes())
                    };
                    idx += 4;
                }

                // Fix up trailing bytes.
                if unaligned < last {
                    idx -= 4;
                    let mut val = unsafe {
                        u32::from_le_bytes(
                            self.seq
                                .get_unchecked(idx..(idx + 4))
                                .try_into()
                                .unwrap_unchecked(),
                        )
                    };
                    // keep only the `last - last_i` low bits.
                    let keep = last - last_i;
                    val <<= 32 - keep;
                    val >>= 32 - keep;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 4))
                            .copy_from_slice(&val.to_le_bytes())
                    };
                    idx += keep.div_ceil(8);
                }
            }

            #[cfg(target_feature = "neon")]
            {
                use core::arch::aarch64::*;
                use core::mem::transmute;

                last = unaligned + (len - unaligned) / 64 * 64;

                for i in (unaligned..last).step_by(64) {
                    unsafe {
                        let ptr = seq.as_ptr().add(i);
                        let chars = vld4q_u8(ptr);

                        let upper_mask = vdupq_n_u8(!(b'a' - b'A'));
                        let chars = neon::map_8x16x4(chars, |v| vandq_u8(v, upper_mask));

                        let two_bits_mask = vdupq_n_u8(6);
                        let lossy_encoded = neon::map_8x16x4(chars, |v| vandq_u8(v, two_bits_mask));

                        let table = transmute(*b"AxCxTxGxxxxxxxxx");
                        let lookup = neon::map_8x16x4(lossy_encoded, |v| vqtbl1q_u8(table, v));

                        let mask = neon::map_two_8x16x4(chars, lookup, |v1, v2| vceqq_u8(v1, v2));
                        let packed_bytes = !neon::movemask_64(mask);

                        self.seq[idx..(idx + 8)].copy_from_slice(&packed_bytes.to_le_bytes());
                        idx += 8;
                        self.len += 64;
                    }
                }
            }
        }

        let mut packed_byte = 0;
        for &base in &seq[last..] {
            packed_byte |= parse_base(base) << ((self.len % Self::C8) * B);
            self.len += 1;
            if self.len % Self::C8 == 0 {
                self.seq[idx] = packed_byte;
                idx += 1;
                packed_byte = 0;
            }
        }
        if self.len % Self::C8 != 0 && last < len {
            self.seq[idx] = packed_byte;
            idx += 1;
        }
        assert_eq!(idx + PADDING, self.seq.len());
        start..start + len
    }

    #[cfg(feature = "rand")]
    fn random(n: usize) -> Self {
        use rand::{RngCore, SeedableRng};

        let byte_len = n.div_ceil(Self::C8);
        let mut seq = vec![0; byte_len + PADDING];
        rand::rngs::SmallRng::from_os_rng().fill_bytes(&mut seq[..byte_len]);
        // Ensure that the last byte is padded with zeros.
        if n % Self::C8 != 0 {
            seq[byte_len - 1] &= (1 << (B * (n % Self::C8))) - 1;
        }

        Self { seq, len: n }
    }
}
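
// A hypothetical end-to-end sketch (not from the original file): pack an ASCII
// sequence into a `PackedSeqVec`, slice it, and unpack the slice again.
#[cfg(test)]
mod packed_seq_vec_example {
    use super::*;

    #[test]
    fn push_slice_unpack() {
        let mut v = PackedSeqVec::default();
        v.push_ascii(b"ACGTACGT");
        let s = v.as_slice();
        assert_eq!(s.len(), 8);
        assert_eq!(s.slice(2..6).unpack(), b"GTAC".to_vec());
    }
}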

impl<const B: usize> PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    /// Creates a `SeqVec` from a vector of packed bytes and a length in bp.
    ///
    /// The vector should have at least 48 bytes of padding after `len`.
    /// Otherwise, the vector will be resized to be padded with zeros.
    pub fn from_raw_parts(mut seq: Vec<u8>, len: usize) -> Self {
        assert!(len <= seq.len() * Self::C8);
        seq.resize(len.div_ceil(Self::C8) + PADDING, 0);
        Self { seq, len }
    }
}

impl PackedSeqVecBase<1> {
    pub fn with_len(n: usize) -> Self {
        Self {
            seq: vec![0; n.div_ceil(Self::C8) + PADDING],
            len: n,
        }
    }

    #[cfg(feature = "rand")]
    pub fn random(len: usize, n_frac: f32) -> Self {
        let byte_len = len.div_ceil(Self::C8);
        let mut seq = vec![0; byte_len + PADDING];

        assert!(
            (0.0..=0.3).contains(&n_frac),
            "n_frac={} should be in [0, 0.3]",
            n_frac
        );

        for _ in 0..(len as f32 * n_frac) as usize {
            let idx = rand::random::<u64>() as usize % len;
            let byte = idx / Self::C8;
            let offset = idx % Self::C8;
            seq[byte] |= 1 << offset;
        }

        Self { seq, len }
    }
}
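
// A hypothetical example (not from the original file) of the 1-bit encoding:
// `BitSeqVec::push_ascii` stores one ambiguity bit per character, 0 for
// `ACGTacgt` and 1 for everything else.
#[cfg(test)]
mod bitseq_example {
    use super::*;

    #[test]
    fn ambiguity_bits() {
        let mut v = BitSeqVec::default();
        v.push_ascii(b"ACGTNacgtn");
        let s = v.as_slice();
        let bits: Vec<u8> = (0..s.len()).map(|i| s.get(i)).collect();
        assert_eq!(bits, vec![0, 0, 0, 0, 1, 0, 0, 0, 0, 1]);
    }
}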

impl<'s> PackedSeqBase<'s, 1> {
    /// An iterator indicating for each kmer whether it contains ambiguous bases.
    ///
    /// Returns n-(k-1) elements.
    #[inline(always)]
    pub fn iter_kmer_ambiguity(self, k: usize) -> impl ExactSizeIterator<Item = bool> + use<'s> {
        let this = self.normalize();
        assert!(k > 0);
        assert!(k <= Self::K64);
        (this.offset..this.offset + this.len.saturating_sub(k - 1))
            .map(move |i| self.read_kmer(k, i) != 0)
    }

    /// A parallel iterator indicating for each kmer whether it contains ambiguous bases.
    ///
    /// First element is the 'kmer' consisting only of the first character of each chunk.
    ///
    /// `k`: length of windows to check
    /// `context`: number of overlapping iterations + 1; determines the stride of each lane.
    /// `skip`: Set to `context-1` to skip the iterations added by the context.
    #[inline(always)]
    pub fn par_iter_kmer_ambiguity(
        self,
        k: usize,
        context: usize,
        skip: usize,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(k > 0, "par_iter_kmers requires k>0, but k={k}");
        assert!(k <= 96, "par_iter_kmers requires k<=96, but k={k}");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let delay = k - 1;

        let it = self.par_iter_bp_delayed(context, Delay(delay));

        let mut cnt = u32x8::ZERO;

        it.map(
            #[inline(always)]
            move |(a, r)| {
                cnt += a;
                let out = cnt.cmp_gt(S::ZERO);
                cnt -= r;
                out
            },
        )
        .advance(skip)
    }

    #[inline(always)]
    pub fn par_iter_kmer_ambiguity_with_buf(
        self,
        k: usize,
        context: usize,
        skip: usize,
        buf: &'s mut Vec<S>,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(k > 0, "par_iter_kmers requires k>0, but k={k}");
        assert!(k <= 96, "par_iter_kmers requires k<=96, but k={k}");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let delay = k - 1;

        let it = self.par_iter_bp_delayed_with_buf(context, Delay(delay), buf);

        let mut cnt = u32x8::ZERO;

        it.map(
            #[inline(always)]
            move |(a, r)| {
                cnt += a;
                let out = cnt.cmp_gt(S::ZERO);
                cnt -= r;
                out
            },
        )
        .advance(skip)
    }
}

#[cfg(target_feature = "neon")]
mod neon {
    use core::arch::aarch64::*;

    #[inline(always)]
    pub fn movemask_64(v: uint8x16x4_t) -> u64 {
        // https://stackoverflow.com/questions/74722950/convert-vector-compare-mask-into-bit-mask-in-aarch64-simd-or-arm-neon/74748402#74748402
        unsafe {
            let acc = vsriq_n_u8(vsriq_n_u8(v.3, v.2, 1), vsriq_n_u8(v.1, v.0, 1), 2);
            vget_lane_u64(
                vreinterpret_u64_u8(vshrn_n_u16(
                    vreinterpretq_u16_u8(vsriq_n_u8(acc, acc, 4)),
                    4,
                )),
                0,
            )
        }
    }

    #[inline(always)]
    pub fn map_8x16x4<F>(v: uint8x16x4_t, mut f: F) -> uint8x16x4_t
    where
        F: FnMut(uint8x16_t) -> uint8x16_t,
    {
        uint8x16x4_t(f(v.0), f(v.1), f(v.2), f(v.3))
    }

    #[inline(always)]
    pub fn map_two_8x16x4<F>(v1: uint8x16x4_t, v2: uint8x16x4_t, mut f: F) -> uint8x16x4_t
    where
        F: FnMut(uint8x16_t, uint8x16_t) -> uint8x16_t,
    {
        uint8x16x4_t(f(v1.0, v2.0), f(v1.1, v2.1), f(v1.2, v2.2), f(v1.3, v2.3))
    }
}