// packed_seq/packed_seq.rs

use core::cell::RefCell;
use std::ops::{Deref, DerefMut};
use traits::Seq;
use wide::u16x8;

use crate::{intrinsics::transpose, padded_it::ChunkIt};

use super::*;

type SimdBuf = [S; 8];

struct RecycledBox(Option<Box<SimdBuf>>);

thread_local! {
    static RECYCLED_BOX_CACHE: RefCell<Vec<Box<SimdBuf>>> = {
        RefCell::new(vec![Box::new(SimdBuf::default())])
    };
}

impl RecycledBox {
    #[inline(always)]
    fn take() -> Self {
        let mut buf = RECYCLED_BOX_CACHE.with_borrow_mut(|v| RecycledBox(v.pop()));
        buf.init_if_needed();
        buf
    }

    #[inline(always)]
    fn init_if_needed(&mut self) {
        if self.0.is_none() {
            self.0 = Some(Box::new(SimdBuf::default()));
        }
    }

    #[inline(always)]
    fn get(&self) -> &SimdBuf {
        unsafe { self.0.as_ref().unwrap_unchecked() }
    }

    #[inline(always)]
    fn get_mut(&mut self) -> &mut SimdBuf {
        unsafe { self.0.as_mut().unwrap_unchecked() }
    }
}

impl Deref for RecycledBox {
    type Target = SimdBuf;

    #[inline(always)]
    fn deref(&self) -> &Self::Target {
        self.get()
    }
}
impl DerefMut for RecycledBox {
    #[inline(always)]
    fn deref_mut(&mut self) -> &mut SimdBuf {
        self.get_mut()
    }
}

impl Drop for RecycledBox {
    #[inline(always)]
    fn drop(&mut self) {
        let mut x = None;
        core::mem::swap(&mut x, &mut self.0);
        RECYCLED_BOX_CACHE.with_borrow_mut(|v| v.push(unsafe { x.unwrap_unchecked() }));
    }
}

struct RecycledVec(Vec<S>);

thread_local! {
    static RECYCLED_VEC_CACHE: RefCell<Vec<Vec<S>>> = {
        RefCell::new(vec![])
    };
}

impl RecycledVec {
    #[inline(always)]
    fn take() -> Self {
        RecycledVec(RECYCLED_VEC_CACHE.with_borrow_mut(|v| v.pop().unwrap_or_default()))
    }
}

impl Deref for RecycledVec {
    type Target = Vec<S>;
    #[inline(always)]
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
impl DerefMut for RecycledVec {
    #[inline(always)]
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}
impl Drop for RecycledVec {
    #[inline(always)]
    fn drop(&mut self) {
        RECYCLED_VEC_CACHE.with_borrow_mut(|v| v.push(std::mem::take(&mut self.0)));
    }
}

#[doc(hidden)]
pub struct Bits<const B: usize>;
#[doc(hidden)]
pub trait SupportedBits {}
impl SupportedBits for Bits<1> {}
impl SupportedBits for Bits<2> {}
impl SupportedBits for Bits<4> {}
impl SupportedBits for Bits<8> {}

/// Number of padding bytes at the end of `PackedSeqVecBase::seq`.
pub(crate) const PADDING: usize = 48;

/// A 2-bit packed non-owned slice of DNA bases.
#[doc(hidden)]
#[derive(Copy, Clone, Debug, MemSize, MemDbg)]
pub struct PackedSeqBase<'s, const B: usize>
where
    Bits<B>: SupportedBits,
{
    /// Packed data.
    seq: &'s [u8],
    /// Offset in bp from the start of the `seq`.
    offset: usize,
    /// Length of the sequence in bp, starting at `offset` from the start of `seq`.
    len: usize,
}

/// A 2-bit packed owned sequence of DNA bases.
#[doc(hidden)]
#[derive(Clone, Debug, MemSize, MemDbg)]
#[cfg_attr(feature = "pyo3", pyo3::pyclass)]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
pub struct PackedSeqVecBase<const B: usize>
where
    Bits<B>: SupportedBits,
{
    /// NOTE: We maintain the invariant that this has at least 48 bytes of padding
    /// at the end after `len` finishes.
    /// This ensures that `read_unaligned` in `as_u64` works OK.
    pub(crate) seq: Vec<u8>,

    /// The length, in bp, of the underlying sequence. See `.len()`.
    len: usize,
}

pub type PackedSeq<'s> = PackedSeqBase<'s, 2>;
pub type PackedSeqVec = PackedSeqVecBase<2>;
pub type BitSeq<'s> = PackedSeqBase<'s, 1>;
pub type BitSeqVec = PackedSeqVecBase<1>;

/// Convenience constants.
/// `B`: bits per char.
impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    /// lowest B bits are 1.
    const CHAR_MASK: u64 = (1 << B) - 1;
    const SIMD_B: S = S::new([B as u32; 8]);
    const SIMD_CHAR_MASK: S = S::new([(1 << B) - 1; 8]);
    /// Chars per byte
    const C8: usize = 8 / B;
    /// Chars per u32
    const C32: usize = 32 / B;
    /// Chars per u256
    const C256: usize = 256 / B;
    /// Max length of a kmer that can be read as a single u64.
    const K64: usize = (64 - 8) / B + 1;
}

/// Convenience constants.
impl<const B: usize> PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    /// Chars per byte
    const C8: usize = 8 / B;
}

impl<const B: usize> Default for PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    fn default() -> Self {
        Self {
            seq: vec![0; PADDING],
            len: 0,
        }
    }
}

// ======================================================================
// 2-BIT HELPER METHODS

/// Pack an ASCII `ACTGactg` character into its 2-bit representation, and panic for anything else.
#[inline(always)]
pub fn pack_char(base: u8) -> u8 {
    match base {
        b'a' | b'A' => 0,
        b'c' | b'C' => 1,
        b'g' | b'G' => 3,
        b't' | b'T' => 2,
        _ => panic!(
            "Unexpected character '{}' with ASCII value {base}. Expected one of ACTGactg.",
            base as char
        ),
    }
}

/// Pack an ASCII `ACTGactg` character into its 2-bit representation, and silently convert other characters into 0..4 as well.
#[inline(always)]
pub fn pack_char_lossy(base: u8) -> u8 {
    (base >> 1) & 3
}
/// Pack a slice of ASCII `ACTGactg` characters into its packed 2-bit kmer representation.
/// Other characters are silently converted.
#[inline(always)]
pub fn pack_kmer_lossy(slice: &[u8]) -> u64 {
    let mut kmer = 0;
    for (i, &base) in slice.iter().enumerate() {
        kmer |= (pack_char_lossy(base) as u64) << (2 * i);
    }
    kmer
}
/// Pack a slice of ASCII `ACTGactg` characters into its packed 2-bit kmer representation.
#[inline(always)]
pub fn pack_kmer_u128_lossy(slice: &[u8]) -> u128 {
    let mut kmer = 0;
    for (i, &base) in slice.iter().enumerate() {
        kmer |= (pack_char_lossy(base) as u128) << (2 * i);
    }
    kmer
}
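
// A minimal sketch (not a definitive spec) of the packing helpers above, assuming
// the usual `cargo test` setup: the encoding is A=0, C=1, T=2, G=3, with the first
// character stored in the least-significant bit pair.
#[cfg(test)]
mod pack_char_examples {
    use super::*;

    #[test]
    fn packing_uses_a0_c1_t2_g3() {
        assert_eq!(
            [pack_char(b'A'), pack_char(b'C'), pack_char(b'T'), pack_char(b'G')],
            [0, 1, 2, 3]
        );
        // Lowercase characters map to the same values.
        assert_eq!(pack_char_lossy(b'g'), 3);
        // "ACGT" packs little-endian: A in bits 0..2, T in bits 6..8.
        assert_eq!(pack_kmer_lossy(b"ACGT"), 0b10_11_01_00);
    }
}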

/// Unpack a 2-bit DNA base into the corresponding `ACTG` character.
#[inline(always)]
pub fn unpack_base(base: u8) -> u8 {
    debug_assert!(base < 4, "Base {base} is not <4.");
    b"ACTG"[base as usize]
}

/// Unpack a 2-bit encoded kmer into corresponding `ACTG` characters.
/// Slice the returned array to the correct length.
#[inline(always)]
pub fn unpack_kmer(kmer: u64) -> [u8; 32] {
    std::array::from_fn(|i| unpack_base(((kmer >> (2 * i)) & 3) as u8))
}
/// Unpack a 2-bit encoded kmer into corresponding `ACTG` character.
#[inline(always)]
pub fn unpack_kmer_into_vec(kmer: u64, k: usize, out: &mut Vec<u8>) {
    out.clear();
    out.extend((0..k).map(|i| unpack_base(((kmer >> (2 * i)) & 3) as u8)));
}
/// Unpack a 2-bit encoded kmer into corresponding `ACTG` character.
#[inline(always)]
pub fn unpack_kmer_to_vec(kmer: u64, k: usize) -> Vec<u8> {
    let mut out = vec![];
    unpack_kmer_into_vec(kmer, k, &mut out);
    out
}

/// Unpack a 2-bit encoded kmer into corresponding `ACTG` characters.
/// Slice the returned array to the correct length.
#[inline(always)]
pub fn unpack_kmer_u128(kmer: u128) -> [u8; 64] {
    std::array::from_fn(|i| unpack_base(((kmer >> (2 * i)) & 3) as u8))
}
/// Unpack a 2-bit encoded kmer into corresponding `ACTG` character.
#[inline(always)]
pub fn unpack_kmer_u128_into_vec(kmer: u128, k: usize, out: &mut Vec<u8>) {
    out.clear();
    out.extend((0..k).map(|i| unpack_base(((kmer >> (2 * i)) & 3) as u8)));
}
/// Unpack a 2-bit encoded kmer into corresponding `ACTG` character.
#[inline(always)]
pub fn unpack_kmer_u128_to_vec(kmer: u128, k: usize) -> Vec<u8> {
    let mut out = vec![];
    unpack_kmer_u128_into_vec(kmer, k, &mut out);
    out
}
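
// A minimal sketch of the unpack helpers above: unpacking is the inverse of the
// lossy packing for `ACGT`-only input.
#[cfg(test)]
mod unpack_examples {
    use super::*;

    #[test]
    fn unpack_roundtrip() {
        assert_eq!(unpack_base(pack_char(b'G')), b'G');
        assert_eq!(
            unpack_kmer_to_vec(pack_kmer_lossy(b"GATTACA"), 7),
            b"GATTACA".to_vec()
        );
        assert_eq!(
            unpack_kmer_u128_to_vec(pack_kmer_u128_lossy(b"TTAGGC"), 6),
            b"TTAGGC".to_vec()
        );
    }
}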

/// Complement an ASCII character: `A<>T` and `C<>G`.
#[inline(always)]
pub const fn complement_char(base: u8) -> u8 {
    match base {
        b'A' => b'T',
        b'C' => b'G',
        b'G' => b'C',
        b'T' => b'A',
        _ => panic!("Unexpected character. Expected one of ACTG."),
    }
}

/// Complement a 2-bit base: `0<>2` and `1<>3`.
#[inline(always)]
pub const fn complement_base(base: u8) -> u8 {
    base ^ 2
}

/// Complement 8 lanes of 2-bit bases: `0<>2` and `1<>3`.
#[inline(always)]
pub fn complement_base_simd(base: u32x8) -> u32x8 {
    const TWO: u32x8 = u32x8::new([2; 8]);
    base ^ TWO
}

/// Reverse complement the 2-bit pairs in the input.
#[inline(always)]
const fn revcomp_raw(word: u64) -> u64 {
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    {
        let mut res = word.reverse_bits(); // ARM can reverse bits in a single instruction
        res = ((res >> 1) & 0x5555_5555_5555_5555) | ((res & 0x5555_5555_5555_5555) << 1);
        res ^ 0xAAAA_AAAA_AAAA_AAAA
    }

    #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
    {
        let mut res = word.swap_bytes();
        res = ((res >> 4) & 0x0F0F_0F0F_0F0F_0F0F) | ((res & 0x0F0F_0F0F_0F0F_0F0F) << 4);
        res = ((res >> 2) & 0x3333_3333_3333_3333) | ((res & 0x3333_3333_3333_3333) << 2);
        res ^ 0xAAAA_AAAA_AAAA_AAAA
    }
}

/// Compute the reverse complement of a short sequence packed in a `u64`.
#[inline(always)]
pub const fn revcomp_u64(word: u64, len: usize) -> u64 {
    revcomp_raw(word) >> (u64::BITS as usize - 2 * len)
}

/// Compute the reverse complement of a short sequence packed in a `u128`.
#[inline(always)]
pub const fn revcomp_u128(word: u128, len: usize) -> u128 {
    let low = word as u64;
    let high = (word >> 64) as u64;
    let rlow = revcomp_raw(low);
    let rhigh = revcomp_raw(high);
    let out = ((rlow as u128) << 64) | rhigh as u128;
    out >> (u128::BITS as usize - 2 * len)
}
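
// A minimal sketch of `revcomp_u64`/`revcomp_u128` above: reverse-complementing a
// packed kmer matches packing the reverse complement of the ASCII input.
#[cfg(test)]
mod revcomp_examples {
    use super::*;

    #[test]
    fn revcomp_matches_ascii_reverse_complement() {
        // The reverse complement of ACG is CGT.
        assert_eq!(revcomp_u64(pack_kmer_lossy(b"ACG"), 3), pack_kmer_lossy(b"CGT"));
        assert_eq!(
            revcomp_u128(pack_kmer_u128_lossy(b"ACG"), 3),
            pack_kmer_u128_lossy(b"CGT")
        );
        // Complementing a single base swaps A<->T (0<->2) and C<->G (1<->3).
        assert_eq!(complement_base(pack_char(b'A')), pack_char(b'T'));
    }
}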

// ======================================================================
// 1-BIT HELPER METHODS

/// Returns 1 when the character is ambiguous (not one of `ACGTacgt`), and 0 otherwise.
#[inline(always)]
pub fn char_is_ambiguous(base: u8) -> u8 {
    // (!matches!(base, b'A' | b'C'  | b'G'  | b'T' | b'a' | b'c'  | b'g'  | b't')) as u8
    let table = b"ACTG";
    let upper_mask = !(b'a' - b'A');
    (table[pack_char_lossy(base) as usize] != (base & upper_mask)) as u8
}

/// Reverse `len` bits packed in a `u64`.
#[inline(always)]
pub const fn rev_u64(word: u64, len: usize) -> u64 {
    word.reverse_bits() >> (u64::BITS as usize - len)
}

/// Reverse `len` bits packed in a `u128`.
#[inline(always)]
pub const fn rev_u128(word: u128, len: usize) -> u128 {
    word.reverse_bits() >> (u128::BITS as usize - len)
}
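
// A small sketch of the 1-bit helpers above: `char_is_ambiguous` flags anything
// outside `ACGTacgt`, and `rev_u64` mirrors the `len` low bits.
#[cfg(test)]
mod ambiguity_helper_examples {
    use super::*;

    #[test]
    fn ambiguity_and_bit_reversal() {
        assert_eq!(char_is_ambiguous(b'A'), 0);
        assert_eq!(char_is_ambiguous(b'c'), 0);
        assert_eq!(char_is_ambiguous(b'N'), 1);
        // Reversing the 4 low bits of 0b0011 gives 0b1100.
        assert_eq!(rev_u64(0b0011, 4), 0b1100);
    }
}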

// ======================================================================

impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    /// Creates a `Seq` from a slice of packed bytes, an offset in bp and a length in bp.
    ///
    /// The slice should have at least 48 bytes of padding after `offset + len`.
    /// Otherwise, the function will panic.
    pub fn from_raw_parts(seq: &'s [u8], offset: usize, len: usize) -> Self {
        assert!(offset + len + PADDING * Self::C8 <= seq.len() * Self::C8);
        Self { seq, offset, len }
    }

    /// Shrink `seq` to only just cover the data.
    #[inline(always)]
    pub fn normalize(&self) -> Self {
        let start_byte = self.offset / Self::C8;
        let end_byte = (self.offset + self.len).div_ceil(Self::C8);
        Self {
            seq: &self.seq[start_byte..end_byte + PADDING],
            offset: self.offset % Self::C8,
            len: self.len,
        }
    }

    /// Return a `Vec<u8>` of ASCII `ACTG` characters.
    #[inline(always)]
    pub fn unpack(&self) -> Vec<u8> {
        self.iter_bp().map(unpack_base).collect()
    }
}
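
// A minimal sketch of slicing and unpacking, assuming the `Seq` and `SeqVec`
// traits are in scope via the parent module's imports.
#[cfg(test)]
mod packed_seq_examples {
    use super::*;

    #[test]
    fn slice_and_unpack() {
        let mut v = PackedSeqVec::default();
        v.push_ascii(b"ACGTACGT");
        let s = v.as_slice();
        assert_eq!(s.len(), 8);
        assert_eq!(s.get_ascii(1), b'C');
        // Slicing is in bp; the slice 2..6 covers "GTAC".
        assert_eq!(s.slice(2..6).unpack(), b"GTAC".to_vec());
        assert_eq!(s.unpack(), b"ACGTACGT".to_vec());
    }
}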

/// Read 32 bytes starting at `idx`; the caller must ensure they are in bounds.
#[inline(always)]
pub(crate) unsafe fn read_slice_32_unchecked(seq: &[u8], idx: usize) -> u32x8 {
    unsafe {
        let src = seq.as_ptr().add(idx);
        debug_assert!(idx + 32 <= seq.len());
        std::mem::transmute::<_, *const u32x8>(src).read_unaligned()
    }
}

/// Read up to 32 bytes starting at idx.
#[inline(always)]
pub(crate) fn read_slice_32(seq: &[u8], idx: usize) -> u32x8 {
    unsafe {
        let src = seq.as_ptr().add(idx);
        if idx + 32 <= seq.len() {
            std::mem::transmute::<_, *const u32x8>(src).read_unaligned()
        } else {
            let num_bytes = seq.len().saturating_sub(idx);
            let mut result = [0u8; 32];
            std::ptr::copy_nonoverlapping(src, result.as_mut_ptr(), num_bytes);
            std::mem::transmute(result)
        }
    }
}

/// Read up to 16 bytes starting at idx.
#[allow(unused)]
#[inline(always)]
pub(crate) fn read_slice_16(seq: &[u8], idx: usize) -> u16x8 {
    unsafe {
        let src = seq.as_ptr().add(idx);
        if idx + 16 <= seq.len() {
            std::mem::transmute::<_, *const u16x8>(src).read_unaligned()
        } else {
            let num_bytes = seq.len().saturating_sub(idx);
            let mut result = [0u8; 16];
            std::ptr::copy_nonoverlapping(src, result.as_mut_ptr(), num_bytes);
            std::mem::transmute(result)
        }
    }
}
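
// A small sketch of the padding behaviour of `read_slice_32` above: bytes past
// the end of the slice are filled with zeros. Lane values are shown for a
// little-endian target, and `u32x8::to_array` from the `wide` crate is assumed.
#[cfg(test)]
mod read_slice_examples {
    use super::*;

    #[test]
    fn out_of_bounds_bytes_are_zero() {
        let v = vec![1u8; 10];
        let r = read_slice_32(&v, 0);
        // Bytes 0..10 are 1, the remaining 22 bytes are zero padding.
        assert_eq!(r.to_array(), [0x01010101, 0x01010101, 0x0101, 0, 0, 0, 0, 0]);
    }
}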

impl<'s, const B: usize> Seq<'s> for PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    const BITS_PER_CHAR: usize = B;
    const BASES_PER_BYTE: usize = Self::C8;
    type SeqVec = PackedSeqVecBase<B>;

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn get_ascii(&self, index: usize) -> u8 {
        unpack_base(self.get(index))
    }

    /// Convert a short sequence (kmer) to a packed representation as `u64`.
    /// Panics if `self` is longer than `64 / B` characters (32 for 2-bit).
    #[inline(always)]
    fn as_u64(&self) -> u64 {
        assert!(self.len() <= 64 / B);

        let mask = u64::MAX >> (64 - B * self.len());

        // The unaligned read is OK, because we ensure that the underlying `PackedSeqVecBase::seq` always
        // has at least 48 bytes of padding at the end.
        if self.len() <= Self::K64 {
            let x = unsafe { (self.seq.as_ptr() as *const u64).read_unaligned() };
            (x >> (B * self.offset)) & mask
        } else {
            let x = unsafe { (self.seq.as_ptr() as *const u128).read_unaligned() };
            (x >> (B * self.offset)) as u64 & mask
        }
    }

    /// Convert a short sequence (kmer) to a packed representation of its reverse complement as `u64`.
    /// Panics if `self` is longer than `64 / B` characters (32 for 2-bit).
    #[inline(always)]
    fn revcomp_as_u64(&self) -> u64 {
        match B {
            1 => rev_u64(self.as_u64(), self.len()),
            2 => revcomp_u64(self.as_u64(), self.len()),
            _ => panic!("Rev(comp) is only supported for 1-bit and 2-bit alphabets."),
        }
    }

    /// Convert a short sequence (kmer) to a packed representation as `u128`.
    /// Panics if `self` is longer than `(128 - 8) / B + 1` characters (61 for 2-bit).
    #[inline(always)]
    fn as_u128(&self) -> u128 {
        assert!(
            self.len() <= (128 - 8) / B + 1,
            "Sequences >61 long cannot be read with a single unaligned u128 read."
        );

        let mask = u128::MAX >> (128 - B * self.len());

        // The unaligned read is OK, because we ensure that the underlying `PackedSeqVecBase::seq` always
        // has at least 48 bytes of padding at the end.
        let x = unsafe { (self.seq.as_ptr() as *const u128).read_unaligned() };
        (x >> (B * self.offset)) & mask
    }

    /// Convert a short sequence (kmer) to a packed representation of its reverse complement as `u128`.
    /// Panics if `self` is longer than `(128 - 8) / B + 1` characters (61 for 2-bit).
    #[inline(always)]
    fn revcomp_as_u128(&self) -> u128 {
        match B {
            1 => rev_u128(self.as_u128(), self.len()),
            2 => revcomp_u128(self.as_u128(), self.len()),
            _ => panic!("Rev(comp) is only supported for 1-bit and 2-bit alphabets."),
        }
    }

    #[inline(always)]
    fn to_vec(&self) -> PackedSeqVecBase<B> {
        assert_eq!(self.offset, 0);
        PackedSeqVecBase {
            seq: self
                .seq
                .iter()
                .copied()
                .chain(std::iter::repeat_n(0u8, PADDING))
                .collect(),
            len: self.len,
        }
    }

    fn to_revcomp(&self) -> PackedSeqVecBase<B> {
        match B {
            1 | 2 => {}
            _ => panic!("Can only reverse (&complement) 1-bit and 2-bit packed sequences.",),
        }

        let mut seq = self.seq[..(self.offset + self.len).div_ceil(Self::C8)]
            .iter()
            // 1. reverse the bytes
            .rev()
            .copied()
            .map(|mut res| {
                match B {
                    2 => {
                        // 2. swap the bases in the byte
                        // This is auto-vectorized.
                        res = ((res >> 4) & 0x0F) | ((res & 0x0F) << 4);
                        res = ((res >> 2) & 0x33) | ((res & 0x33) << 2);
                        // Complement the bases.
                        res ^ 0xAA
                    }
                    1 => res.reverse_bits(),
                    _ => unreachable!(),
                }
            })
            .chain(std::iter::repeat_n(0u8, PADDING))
            .collect::<Vec<u8>>();

        // 3. Shift away the offset.
        let new_offset = (Self::C8 - (self.offset + self.len) % Self::C8) % Self::C8;

        if new_offset > 0 {
            // Shift everything left by `2*new_offset` bits.
            let shift = B * new_offset;
            *seq.last_mut().unwrap() >>= shift;
            // This loop is also auto-vectorized.
            for i in 0..seq.len() - 1 {
                seq[i] = (seq[i] >> shift) | (seq[i + 1] << (8 - shift));
            }
        }

        PackedSeqVecBase { seq, len: self.len }
    }

    #[inline(always)]
    fn slice(&self, range: Range<usize>) -> Self {
        debug_assert!(
            range.end <= self.len,
            "Slice index out of bounds: {} > {}",
            range.end,
            self.len
        );
        PackedSeqBase {
            seq: self.seq,
            offset: self.offset + range.start,
            len: range.end - range.start,
        }
        .normalize()
    }

    #[inline(always)]
    fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> {
        assert!(self.len <= self.seq.len() * Self::C8);

        let this = self.normalize();

        // read u64 at a time?
        let mut byte = 0;
        (0..this.len + this.offset)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C8 == 0 {
                        byte = this.seq[i / Self::C8];
                    }
                    // Shift byte instead of i?
                    (byte >> (B * (i % Self::C8))) & Self::CHAR_MASK as u8
                },
            )
            .advance(this.offset)
    }

    #[inline(always)]
    fn par_iter_bp(self, context: usize) -> PaddedIt<impl ChunkIt<S>> {
        self.par_iter_bp_with_buf(context, RecycledBox::take())
    }

    #[inline(always)]
    fn par_iter_bp_delayed(self, context: usize, delay: Delay) -> PaddedIt<impl ChunkIt<(S, S)>> {
        self.par_iter_bp_delayed_with_factor(context, delay, 1)
    }

    /// NOTE: When `self` does not start at a byte boundary, the
    /// 'delayed' character is not guaranteed to be `0`.
    #[inline(always)]
    fn par_iter_bp_delayed_2(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(
            context,
            delay1,
            delay2,
            1,
            RecycledVec::take(),
        )
    }

    /// Compares 29 characters at a time.
    fn cmp_lcp(&self, other: &Self) -> (std::cmp::Ordering, usize) {
        let mut lcp = 0;
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(Self::K64) {
            let len = (min_len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                // Unfortunately, bases are packed in little endian order, so the default order is reversed.
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() as usize / B * B;
                lcp += t / B;
                let mask = (Self::CHAR_MASK) << t;
                return ((this_word & mask).cmp(&(other_word & mask)), lcp);
            }
            lcp += len;
        }
        (self.len.cmp(&other.len), lcp)
    }

    #[inline(always)]
    fn get(&self, index: usize) -> u8 {
        let offset = self.offset + index;
        let idx = offset / Self::C8;
        let offset = offset % Self::C8;
        (self.seq[idx] >> (B * offset)) & Self::CHAR_MASK as u8
    }
}
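
// A minimal sketch of the `Seq` methods implemented above, assuming the `Seq`
// and `SeqVec` traits are in scope via the parent module. Values follow the
// A=0, C=1, T=2, G=3 encoding used throughout this file.
#[cfg(test)]
mod seq_trait_examples {
    use super::*;

    #[test]
    fn as_u64_and_revcomp() {
        let mut v = PackedSeqVec::default();
        v.push_ascii(b"ACG");
        let s = v.as_slice();
        assert_eq!(s.as_u64(), pack_kmer_lossy(b"ACG"));
        // The reverse complement of ACG is CGT.
        assert_eq!(s.revcomp_as_u64(), pack_kmer_lossy(b"CGT"));
    }

    #[test]
    fn cmp_lcp_reports_common_prefix() {
        let mut a = PackedSeqVec::default();
        a.push_ascii(b"ACGTA");
        let mut b = PackedSeqVec::default();
        b.push_ascii(b"ACGCA");
        // The first three characters match; at index 3, T (=2) sorts after C (=1)
        // in the packed order.
        assert_eq!(
            a.as_slice().cmp_lcp(&b.as_slice()),
            (std::cmp::Ordering::Greater, 3)
        );
    }
}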

impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    #[inline(always)]
    pub fn par_iter_bp_with_buf<BUF: DerefMut<Target = [S; 8]>>(
        self,
        context: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // without +o, since we don't need them in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut cur = S::ZERO;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        // Safety check for the `read_slice_32_unchecked`:
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            // Read a u256 for each lane containing the next 128 characters.
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            *buf = transpose(data);
                        }
                        cur = buf[(i % Self::C256) / Self::C32];
                    }
                    // Extract the last 2 bits of each character.
                    let chars = cur & Self::SIMD_CHAR_MASK;
                    // Shift remaining characters to the right.
                    cur = cur >> Self::SIMD_B;
                    chars
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }
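
    // Worked example of the chunking arithmetic above, assuming `L` is the
    // 8-lane SIMD width used throughout this file and the 2-bit case
    // (`Self::C8 = 4`): for len = 1000 bp, context = 31 and offset = 0, there
    // are 1000 - 30 = 970 kmer positions. n = ceil(970 / 8) = 122, rounded up
    // to a multiple of 4 gives 124, so bytes_per_chunk = 31 and each lane
    // strides over 124 characters. padding = 4 * 8 * 31 - 970 = 22 of the
    // produced kmer positions at the end are padding.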

    #[inline(always)]
    pub fn par_iter_bp_delayed_with_factor(
        self,
        context: usize,
        delay: Delay,
        factor: usize,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B>> {
        self.par_iter_bp_delayed_with_factor_and_buf(context, delay, factor, RecycledVec::take())
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_with_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        delay: Delay,
        buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B, BUF>> {
        self.par_iter_bp_delayed_with_factor_and_buf(context, delay, 1, buf)
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_with_factor_and_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        Delay(delay): Delay,
        factor: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(
            delay < usize::MAX / 2,
            "Delay={} should be >=0.",
            delay as isize
        );

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // without +o, since we don't need them in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride
            .div_ceil(L)
            .next_multiple_of(factor * Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d = S::ZERO;

        // Even buf_len is nice to only have the write==buf_len check once.
        // We also make it the next power of 2, for faster modulo operations.
        // delay/16: number of bp in a u32.
        // +8: some 'random' padding
        let buf_len = (delay / Self::C32 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        if buf.capacity() < buf_len {
            // This has better codegen than `vec.clear(); vec.resize()`, since the inner `do_reserve_and_handle` of resize is not inlined.
            *buf = vec![S::ZERO; buf_len];
        } else {
            // NOTE: Buf needs to be filled with zeros to guarantee returning 0 values for out-of-bounds characters.
            buf.clear();
            buf.resize(buf_len, S::ZERO);
        }

        let mut write_idx = 0;
        // We compensate for the first delay/16 triggers of the check below that
        // happen before the delay is actually reached.
        let mut read_idx = (buf_len - delay / Self::C32) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        // Safety check for the `read_slice_32_unchecked`:
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            // Read a u256 for each lane containing the next 128 characters.
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            unsafe {
                                *TryInto::<&mut [u32x8; 8]>::try_into(
                                    buf.get_unchecked_mut(write_idx..write_idx + 8),
                                )
                                .unwrap_unchecked() = transpose(data);
                            }
                            if i == 0 {
                                // Mask out chars before the offset.
                                let elem = !((1u32 << (B * o)) - 1);
                                let mask = S::splat(elem);
                                unsafe { assert_unchecked(write_idx < buf.len()) };
                                buf[write_idx] &= mask;
                            }
                        }
                        unsafe { assert_unchecked(write_idx < buf.len()) };
                        upcoming = buf[write_idx];
                        write_idx += 1;
                        write_idx &= buf_mask;
                    }
                    if i % Self::C32 == delay % Self::C32 {
                        unsafe { assert_unchecked(read_idx < buf.len()) };
                        upcoming_d = buf[read_idx];
                        read_idx += 1;
                        read_idx &= buf_mask;
                    }
                    // Extract the last 2 bits of each character.
                    let chars = upcoming & Self::SIMD_CHAR_MASK;
                    let chars_d = upcoming_d & Self::SIMD_CHAR_MASK;
                    // Shift remaining characters to the right.
                    upcoming = upcoming >> Self::SIMD_B;
                    upcoming_d = upcoming_d >> Self::SIMD_B;
                    (chars, chars_d)
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_factor(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
        factor: usize,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(
            context,
            delay1,
            delay2,
            factor,
            RecycledVec::take(),
        )
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
        buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B, BUF>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(context, delay1, delay2, 1, buf)
    }

    /// When iterating over 2-bit and 1-bit encoded data in parallel,
    /// one must ensure that they have the same stride.
    /// On the larger type, set `factor` to the ratio to the smaller one,
    /// so that the stride in bytes is a multiple of `factor`
    /// and the smaller type also gets a byte-aligned stride.
    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_factor_and_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        Delay(delay1): Delay,
        Delay(delay2): Delay,
        factor: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);
        assert!(delay1 <= delay2, "Delay1 must be at most delay2.");

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // without +o, since we don't need them in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride
            .div_ceil(L)
            .next_multiple_of(factor * Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d1 = S::ZERO;
        let mut upcoming_d2 = S::ZERO;

        // Even buf_len is nice to only have the write==buf_len check once.
        let buf_len = (delay2 / Self::C32 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        if buf.capacity() < buf_len {
            // This has better codegen than `vec.clear(); vec.resize()`, since the inner `do_reserve_and_handle` of resize is not inlined.
            *buf = vec![S::ZERO; buf_len];
        } else {
            // NOTE: Buf needs to be filled with zeros to guarantee returning 0 values for out-of-bounds characters.
            buf.clear();
            buf.resize(buf_len, S::ZERO);
        }

        let mut write_idx = 0;
        // We compensate for the first delay/16 triggers of the check below that
        // happen before the delay is actually reached.
        let mut read_idx1 = (buf_len - delay1 / Self::C32) % buf_len;
        let mut read_idx2 = (buf_len - delay2 / Self::C32) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        // Safety check for the `read_slice_32_unchecked`:
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            // Read a u256 for each lane containing the next 128 characters.
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            unsafe {
                                *TryInto::<&mut [u32x8; 8]>::try_into(
                                    buf.get_unchecked_mut(write_idx..write_idx + 8),
                                )
                                .unwrap_unchecked() = transpose(data);
                            }
                            // FIXME DROP THIS?
                            if i == 0 {
                                // Mask out chars before the offset.
                                let elem = !((1u32 << (B * o)) - 1);
                                let mask = S::splat(elem);
                                buf[write_idx] &= mask;
                            }
                        }
                        upcoming = buf[write_idx];
                        write_idx += 1;
                        write_idx &= buf_mask;
                    }
                    if i % Self::C32 == delay1 % Self::C32 {
                        unsafe { assert_unchecked(read_idx1 < buf.len()) };
                        upcoming_d1 = buf[read_idx1];
                        read_idx1 += 1;
                        read_idx1 &= buf_mask;
                    }
                    if i % Self::C32 == delay2 % Self::C32 {
                        unsafe { assert_unchecked(read_idx2 < buf.len()) };
                        upcoming_d2 = buf[read_idx2];
                        read_idx2 += 1;
                        read_idx2 &= buf_mask;
                    }
                    // Extract the last 2 bits of each character.
                    let chars = upcoming & Self::SIMD_CHAR_MASK;
                    let chars_d1 = upcoming_d1 & Self::SIMD_CHAR_MASK;
                    let chars_d2 = upcoming_d2 & Self::SIMD_CHAR_MASK;
                    // Shift remaining characters to the right.
                    upcoming = upcoming >> Self::SIMD_B;
                    upcoming_d1 = upcoming_d1 >> Self::SIMD_B;
                    upcoming_d2 = upcoming_d2 >> Self::SIMD_B;
                    (chars, chars_d1, chars_d2)
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }
}

impl<const B: usize> PartialEq for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    /// Compares 29 characters at a time.
    fn eq(&self, other: &Self) -> bool {
        if self.len != other.len {
            return false;
        }
        for i in (0..self.len).step_by(Self::K64) {
            let len = (self.len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let that = other.slice(i..i + len);
            if this.as_u64() != that.as_u64() {
                return false;
            }
        }
        true
    }
}

impl<const B: usize> Eq for PackedSeqBase<'_, B> where Bits<B>: SupportedBits {}

impl<const B: usize> PartialOrd for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl<const B: usize> Ord for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    /// Compares 29 characters at a time.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(Self::K64) {
            let len = (min_len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                // Unfortunately, bases are packed in little endian order, so the default order is reversed.
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() as usize / B * B;
                let mask = (Self::CHAR_MASK) << t;
                return (this_word & mask).cmp(&(other_word & mask));
            }
        }
        self.len.cmp(&other.len)
    }
}
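
// A minimal sketch of the ordering defined above: comparisons use the packed
// values (A=0 < C=1 < T=2 < G=3), which is not the lexicographic ASCII order.
#[cfg(test)]
mod packed_order_examples {
    use super::*;

    #[test]
    fn packed_order_is_not_ascii_order() {
        let mut ag = PackedSeqVec::default();
        ag.push_ascii(b"AG");
        let mut at = PackedSeqVec::default();
        at.push_ascii(b"AT");
        // In ASCII, "AG" < "AT", but in the packed encoding G (=3) sorts after T (=2).
        assert!(ag.as_slice() > at.as_slice());
    }
}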

impl<const B: usize> SeqVec for PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    type Seq<'s> = PackedSeqBase<'s, B>;

    #[inline(always)]
    fn into_raw(mut self) -> Vec<u8> {
        self.seq.resize(self.len.div_ceil(Self::C8), 0);
        self.seq
    }

    #[inline(always)]
    fn as_slice(&self) -> Self::Seq<'_> {
        PackedSeqBase {
            seq: &self.seq[..self.len.div_ceil(Self::C8) + PADDING],
            offset: 0,
            len: self.len,
        }
    }

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn clear(&mut self) {
        self.seq.clear();
        self.len = 0;
    }

    fn push_seq<'a>(&mut self, seq: PackedSeqBase<'_, B>) -> Range<usize> {
        let start = self.len.next_multiple_of(Self::C8) + seq.offset;
        let end = start + seq.len();

        // Shrink away the padding.
        self.seq.resize(self.len.div_ceil(Self::C8), 0);
        // Extend.
        self.seq.extend(seq.seq);
        self.len = end;
        start..end
    }

    /// For `PackedSeqVec` (2-bit encoding): map ASCII `ACGT` (and `acgt`) to DNA `0132` in that order.
    /// Other characters are silently mapped into `0..4`.
    ///
    /// Uses the BMI2 `pext` instruction when available, based on the
    /// `n_to_bits_pext` method described at
    /// <https://github.com/Daniel-Liu-c0deb0t/cute-nucleotides>.
    ///
    /// For `BitSeqVec` (1-bit encoding): map `ACGTacgt` to `0`, and everything else to `1`.
    fn push_ascii(&mut self, seq: &[u8]) -> Range<usize> {
        match B {
            1 | 2 => {}
            _ => panic!(
                "Can only use ASCII input for 2-bit DNA packing, or 1-bit ambiguous indicators."
            ),
        }

        self.seq
            .resize((self.len + seq.len()).div_ceil(Self::C8) + PADDING, 0);
        let start_aligned = self.len.next_multiple_of(Self::C8);
        let start = self.len;
        let len = seq.len();
        let mut idx = self.len / Self::C8;

        let parse_base = |base| match B {
            1 => char_is_ambiguous(base),
            2 => pack_char_lossy(base),
            _ => unreachable!(),
        };

        let unaligned = core::cmp::min(start_aligned - start, len);
        if unaligned > 0 {
            let mut packed_byte = self.seq[idx];
            for &base in &seq[..unaligned] {
                packed_byte |= parse_base(base) << ((self.len % Self::C8) * B);
                self.len += 1;
            }
            self.seq[idx] = packed_byte;
            idx += 1;
        }

        #[allow(unused)]
        let mut last = unaligned;

        if B == 2 {
            #[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
            {
                last = unaligned + (len - unaligned) / 8 * 8;

                for i in (unaligned..last).step_by(8) {
                    let chunk =
                        unsafe { seq.get_unchecked(i..i + 8).try_into().unwrap_unchecked() };
                    let ascii = u64::from_le_bytes(chunk);
                    let packed_bytes =
                        unsafe { std::arch::x86_64::_pext_u64(ascii, 0x0606060606060606) } as u16;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 2))
                            .copy_from_slice(&packed_bytes.to_le_bytes())
                    };
                    idx += 2;
                    self.len += 8;
                }
            }

            #[cfg(target_feature = "neon")]
            {
                use core::arch::aarch64::{
                    vandq_u8, vdup_n_u8, vld1q_u8, vpadd_u8, vshlq_u8, vst1_u8,
                };
                use core::mem::transmute;

                last = unaligned + (len - unaligned) / 16 * 16;

                for i in (unaligned..last).step_by(16) {
                    unsafe {
                        let ascii = vld1q_u8(seq.as_ptr().add(i));
                        let masked_bits = vandq_u8(ascii, transmute([6i8; 16]));
                        let (bits_0, bits_1) = transmute(vshlq_u8(
                            masked_bits,
                            transmute([-1i8, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5]),
                        ));
                        let half_packed = vpadd_u8(bits_0, bits_1);
                        let packed = vpadd_u8(half_packed, vdup_n_u8(0));
                        vst1_u8(self.seq.as_mut_ptr().add(idx), packed);
                        idx += Self::C8;
                        self.len += 16;
                    }
                }
            }
        }

        if B == 1 {
            #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
            {
                last = len;
                self.len += len - unaligned;

                let mut last_i = unaligned;

                for i in (unaligned..last).step_by(32) {
                    use std::mem::transmute as t;

                    use wide::CmpEq;
                    // Wide doesn't have u8x32, so this is messy here...
                    type S = wide::i8x32;
                    let chars: S = unsafe { t(read_slice_32(seq, i)) };
                    let upper_mask = !(b'a' - b'A');
                    // make everything upper case
                    let chars = chars & S::splat(upper_mask as i8);
                    let lossy_encoded = chars & S::splat(6);
                    let table = unsafe { S::from(t::<_, S>(*b"AxCxTxGxxxxxxxxxAxCxTxGxxxxxxxxx")) };
                    let lookup: S = unsafe {
                        t(std::arch::x86_64::_mm256_shuffle_epi8(
                            t(table),
                            t(lossy_encoded),
                        ))
                    };
                    let packed_bytes = !(chars.cmp_eq(lookup).move_mask() as u32);

                    last_i = i;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 4))
                            .copy_from_slice(&packed_bytes.to_le_bytes())
                    };
                    idx += 4;
                }

                // Fix up trailing bytes.
                if unaligned < last {
                    idx -= 4;
                    let mut val = unsafe {
                        u32::from_le_bytes(
                            self.seq
                                .get_unchecked(idx..(idx + 4))
                                .try_into()
                                .unwrap_unchecked(),
                        )
                    };
                    // keep only the `last - last_i` low bits.
                    let keep = last - last_i;
                    val <<= 32 - keep;
                    val >>= 32 - keep;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 4))
                            .copy_from_slice(&val.to_le_bytes())
                    };
                    idx += keep.div_ceil(8);
                }
            }

            #[cfg(target_feature = "neon")]
            {
                use core::arch::aarch64::*;
                use core::mem::transmute;

                last = unaligned + (len - unaligned) / 64 * 64;

                for i in (unaligned..last).step_by(64) {
                    unsafe {
                        let ptr = seq.as_ptr().add(i);
                        let chars = vld4q_u8(ptr);

                        let upper_mask = vdupq_n_u8(!(b'a' - b'A'));
                        let chars = neon::map_8x16x4(chars, |v| vandq_u8(v, upper_mask));

                        let two_bits_mask = vdupq_n_u8(6);
                        let lossy_encoded = neon::map_8x16x4(chars, |v| vandq_u8(v, two_bits_mask));

                        let table = transmute(*b"AxCxTxGxxxxxxxxx");
                        let lookup = neon::map_8x16x4(lossy_encoded, |v| vqtbl1q_u8(table, v));

                        let mask = neon::map_two_8x16x4(chars, lookup, |v1, v2| vceqq_u8(v1, v2));
                        let packed_bytes = !neon::movemask_64(mask);

                        self.seq[idx..(idx + 8)].copy_from_slice(&packed_bytes.to_le_bytes());
                        idx += 8;
                        self.len += 64;
                    }
                }
            }
        }

        let mut packed_byte = 0;
        for &base in &seq[last..] {
            packed_byte |= parse_base(base) << ((self.len % Self::C8) * B);
            self.len += 1;
            if self.len % Self::C8 == 0 {
                self.seq[idx] = packed_byte;
                idx += 1;
                packed_byte = 0;
            }
        }
        if self.len % Self::C8 != 0 && last < len {
            self.seq[idx] = packed_byte;
            idx += 1;
        }
        assert_eq!(idx + PADDING, self.seq.len());
        start..start + len
    }

    #[cfg(feature = "rand")]
    fn random(n: usize) -> Self {
        use rand::{RngCore, SeedableRng};

        let byte_len = n.div_ceil(Self::C8);
        let mut seq = vec![0; byte_len + PADDING];
        rand::rngs::SmallRng::from_os_rng().fill_bytes(&mut seq[..byte_len]);
        // Ensure that the last byte is padded with zeros.
        if n % Self::C8 != 0 {
            seq[byte_len - 1] &= (1 << (B * (n % Self::C8))) - 1;
        }

        Self { seq, len: n }
    }
}
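
// A small sketch of the 1-bit path of `push_ascii` above: `BitSeqVec` stores one
// ambiguity bit per input character (0 for ACGTacgt, 1 otherwise). Trait methods
// are assumed to be in scope via the parent module.
#[cfg(test)]
mod bitseq_push_examples {
    use super::*;

    #[test]
    fn push_ascii_marks_ambiguous_bases() {
        let mut v = BitSeqVec::default();
        v.push_ascii(b"ACGTN");
        let s = v.as_slice();
        assert_eq!(s.len(), 5);
        assert_eq!(s.get(0), 0); // 'A' is unambiguous.
        assert_eq!(s.get(4), 1); // 'N' is ambiguous.
    }
}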

impl<const B: usize> PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    /// Creates a `SeqVec` from a vector of packed bytes and a length in bp.
    ///
    /// The vector is resized so that it has exactly 48 bytes of padding after
    /// the packed data; any missing bytes are filled with zeros.
    pub fn from_raw_parts(mut seq: Vec<u8>, len: usize) -> Self {
        assert!(len <= seq.len() * Self::C8);
        seq.resize(len.div_ceil(Self::C8) + PADDING, 0);
        Self { seq, len }
    }
}
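
// A minimal sketch of `from_raw_parts` above: the byte 0b10_11_01_00 packs "ACGT"
// (A=0, C=1, G=3, T=2, least-significant pair first); padding is added by the
// resize inside the constructor.
#[cfg(test)]
mod from_raw_parts_examples {
    use super::*;

    #[test]
    fn single_packed_byte() {
        let v = PackedSeqVec::from_raw_parts(vec![0b10_11_01_00], 4);
        assert_eq!(v.as_slice().unpack(), b"ACGT".to_vec());
    }
}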

impl PackedSeqVecBase<1> {
    pub fn with_len(n: usize) -> Self {
        Self {
            seq: vec![0; n.div_ceil(Self::C8) + PADDING],
            len: n,
        }
    }

    pub fn random(len: usize, n_frac: f32) -> Self {
        let byte_len = len.div_ceil(Self::C8);
        let mut seq = vec![0; byte_len + PADDING];

        assert!(
            (0.0..=0.3).contains(&n_frac),
            "n_frac={} should be in [0, 0.3]",
            n_frac
        );

        for _ in 0..(len as f32 * n_frac) as usize {
            let idx = rand::random::<u64>() as usize % len;
            let byte = idx / Self::C8;
            let offset = idx % Self::C8;
            seq[byte] |= 1 << offset;
        }

        Self { seq, len }
    }
}

impl<'s> PackedSeqBase<'s, 1> {
    /// An iterator indicating for each kmer whether it contains ambiguous bases.
    ///
    /// Returns n-(k-1) elements.
    #[inline(always)]
    pub fn iter_kmer_ambiguity(self, k: usize) -> impl ExactSizeIterator<Item = bool> + use<'s> {
        let this = self.normalize();
        assert!(k > 0);
        assert!(k <= Self::K64);
        (this.offset..this.offset + this.len.saturating_sub(k - 1))
            .map(move |i| self.read_kmer(k, i) != 0)
    }

    /// A parallel iterator indicating for each kmer whether it contains ambiguous bases.
    ///
    /// First element is the 'kmer' consisting only of the first character of each chunk.
    ///
    /// `k`: length of windows to check
    /// `context`: number of overlapping iterations +1. To determine stride of each lane.
    /// `skip`: Set to `context-1` to skip the iterations added by the context.
    #[inline(always)]
    pub fn par_iter_kmer_ambiguity(
        self,
        k: usize,
        context: usize,
        skip: usize,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(k > 0, "par_iter_kmers requires k>0, but k={k}");
        assert!(k <= 96, "par_iter_kmers requires k<=96, but k={k}");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let delay = k - 1;

        let it = self.par_iter_bp_delayed(context, Delay(delay));

        let mut cnt = u32x8::ZERO;

        it.map(
            #[inline(always)]
            move |(a, r)| {
                cnt += a;
                let out = cnt.cmp_gt(S::ZERO);
                cnt -= r;
                out
            },
        )
        .advance(skip)
    }

    #[inline(always)]
    pub fn par_iter_kmer_ambiguity_with_buf(
        self,
        k: usize,
        context: usize,
        skip: usize,
        buf: &'s mut Vec<S>,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(k > 0, "par_iter_kmers requires k>0, but k={k}");
        assert!(k <= 96, "par_iter_kmers requires k<=96, but k={k}");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let delay = k - 1;

        let it = self.par_iter_bp_delayed_with_buf(context, Delay(delay), buf);

        let mut cnt = u32x8::ZERO;

        it.map(
            #[inline(always)]
            move |(a, r)| {
                cnt += a;
                let out = cnt.cmp_gt(S::ZERO);
                cnt -= r;
                out
            },
        )
        .advance(skip)
    }
}
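
// A sketch of `iter_kmer_ambiguity` above, assuming `BitSeqVec::push_ascii` marks
// 'N' as ambiguous and `read_kmer` returns nonzero iff the window contains a set
// bit: with k = 3, exactly the windows covering the 'N' report `true`.
#[cfg(test)]
mod kmer_ambiguity_examples {
    use super::*;

    #[test]
    fn windows_covering_n_are_ambiguous() {
        let mut v = BitSeqVec::default();
        v.push_ascii(b"ACGTNACGT");
        let flags: Vec<bool> = v.as_slice().iter_kmer_ambiguity(3).collect();
        assert_eq!(
            flags,
            vec![false, false, true, true, true, false, false]
        );
    }
}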

#[cfg(target_feature = "neon")]
mod neon {
    use core::arch::aarch64::*;

    #[inline(always)]
    pub fn movemask_64(v: uint8x16x4_t) -> u64 {
        // https://stackoverflow.com/questions/74722950/convert-vector-compare-mask-into-bit-mask-in-aarch64-simd-or-arm-neon/74748402#74748402
        unsafe {
            let acc = vsriq_n_u8(vsriq_n_u8(v.3, v.2, 1), vsriq_n_u8(v.1, v.0, 1), 2);
            vget_lane_u64(
                vreinterpret_u64_u8(vshrn_n_u16(
                    vreinterpretq_u16_u8(vsriq_n_u8(acc, acc, 4)),
                    4,
                )),
                0,
            )
        }
    }

    #[inline(always)]
    pub fn map_8x16x4<F>(v: uint8x16x4_t, mut f: F) -> uint8x16x4_t
    where
        F: FnMut(uint8x16_t) -> uint8x16_t,
    {
        uint8x16x4_t(f(v.0), f(v.1), f(v.2), f(v.3))
    }

    #[inline(always)]
    pub fn map_two_8x16x4<F>(v1: uint8x16x4_t, v2: uint8x16x4_t, mut f: F) -> uint8x16x4_t
    where
        F: FnMut(uint8x16_t, uint8x16_t) -> uint8x16_t,
    {
        uint8x16x4_t(f(v1.0, v2.0), f(v1.1, v2.1), f(v1.2, v2.2), f(v1.3, v2.3))
    }
}