packed_seq/
packed_seq.rs

use traits::Seq;

use crate::{intrinsics::transpose, padded_it::ChunkIt};

use super::*;

#[doc(hidden)]
pub struct Bits<const B: usize>;
#[doc(hidden)]
pub trait SupportedBits {}
impl SupportedBits for Bits<1> {}
impl SupportedBits for Bits<2> {}
impl SupportedBits for Bits<4> {}
impl SupportedBits for Bits<8> {}

/// Number of padding bytes at the end of `PackedSeqVecBase::seq`.
const PADDING: usize = 16;

/// A packed non-owned slice of DNA bases, using `B` bits per base.
#[doc(hidden)]
#[derive(Copy, Clone, Debug, MemSize, MemDbg)]
pub struct PackedSeqBase<'s, const B: usize>
where
    Bits<B>: SupportedBits,
{
    /// Packed data.
    seq: &'s [u8],
    /// Offset in bp from the start of `seq`.
    offset: usize,
    /// Length of the sequence in bp, starting at `offset` from the start of `seq`.
    len: usize,
}

/// A packed owned sequence of DNA bases, using `B` bits per base.
#[doc(hidden)]
#[derive(Clone, Debug, MemSize, MemDbg)]
#[cfg_attr(feature = "pyo3", pyo3::pyclass)]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
pub struct PackedSeqVecBase<const B: usize>
where
    Bits<B>: SupportedBits,
{
    /// NOTE: We maintain the invariant that this has at least 16 bytes of padding
    /// at the end, after `len` finishes.
    /// This ensures that the `read_unaligned` reads in `as_u64` and `as_u128` work OK.
    pub(crate) seq: Vec<u8>,

    /// The length, in bp, of the underlying sequence. See `.len()`.
    len: usize,
}

pub type PackedSeq<'s> = PackedSeqBase<'s, 2>;
pub type PackedSeqVec = PackedSeqVecBase<2>;
pub type BitSeq<'s> = PackedSeqBase<'s, 1>;
pub type BitSeqVec = PackedSeqVecBase<1>;

/// Convenience constants.
/// `B`: bits per char.
impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    /// Lowest `B` bits are 1.
    const CHAR_MASK: u64 = (1 << B) - 1;
    /// Chars per byte
    const C8: usize = 8 / B;
    /// Chars per u32
    const C32: usize = 32 / B;
    /// Chars per u256
    const C256: usize = 256 / B;
    /// Max length of a kmer that can be read as a single u64.
    const K64: usize = (64 - 8) / B + 1;
}

/// Convenience constants.
impl<const B: usize> PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    /// Chars per byte
    const C8: usize = 8 / B;
}

impl<const B: usize> Default for PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    fn default() -> Self {
        Self {
            seq: vec![0; PADDING],
            len: 0,
        }
    }
}
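
// A test-only sanity sketch (relies only on items in this file): the `Default`
// value is empty but already carries the `PADDING` bytes required by the
// unaligned-read invariant documented on `PackedSeqVecBase::seq`.
#[cfg(test)]
#[test]
fn default_is_empty_and_padded() {
    let v = PackedSeqVec::default();
    assert!(v.is_empty());
    assert_eq!(v.len(), 0);
    assert_eq!(v.seq.len(), PADDING);
}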

// ======================================================================
// 2-BIT HELPER METHODS

/// Pack an ASCII `ACTGactg` character into its 2-bit representation, and panic for anything else.
#[inline(always)]
pub fn pack_char(base: u8) -> u8 {
    match base {
        b'a' | b'A' => 0,
        b'c' | b'C' => 1,
        b'g' | b'G' => 3,
        b't' | b'T' => 2,
        _ => panic!(
            "Unexpected character '{}' with ASCII value {base}. Expected one of ACTGactg.",
            base as char
        ),
    }
}

/// Pack an ASCII `ACTGactg` character into its 2-bit representation, and silently convert other characters into 0..4 as well.
#[inline(always)]
pub fn pack_char_lossy(base: u8) -> u8 {
    (base >> 1) & 3
}

/// Unpack a 2-bit DNA base into the corresponding `ACTG` character.
#[inline(always)]
pub fn unpack_base(base: u8) -> u8 {
    debug_assert!(base < 4, "Base {base} is not <4.");
    b"ACTG"[base as usize]
}
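
// A test-only sketch of the 2-bit mapping above: `pack_char` and `pack_char_lossy`
// agree on `ACTGactg`, and `unpack_base` inverts them back to the upper-case base.
#[cfg(test)]
#[test]
fn pack_unpack_round_trip() {
    for &c in b"ACTGactg" {
        assert_eq!(pack_char(c), pack_char_lossy(c));
        assert_eq!(unpack_base(pack_char(c)), c.to_ascii_uppercase());
    }
    assert_eq!(pack_char(b'A'), 0);
    assert_eq!(pack_char(b'C'), 1);
    assert_eq!(pack_char(b'T'), 2);
    assert_eq!(pack_char(b'G'), 3);
}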

/// Complement an ASCII character: `A<>T` and `C<>G`.
#[inline(always)]
pub const fn complement_char(base: u8) -> u8 {
    match base {
        b'A' => b'T',
        b'C' => b'G',
        b'G' => b'C',
        b'T' => b'A',
        _ => panic!("Unexpected character. Expected one of ACGT."),
    }
}

/// Complement a 2-bit base: `0<>2` and `1<>3`.
#[inline(always)]
pub const fn complement_base(base: u8) -> u8 {
    base ^ 2
}

/// Complement 8 lanes of 2-bit bases: `0<>2` and `1<>3`.
#[inline(always)]
pub fn complement_base_simd(base: u32x8) -> u32x8 {
    base ^ u32x8::splat(2)
}
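
// Test-only sketch: the ASCII and 2-bit complements are involutions and agree
// with each other through `pack_char`.
#[cfg(test)]
#[test]
fn complement_consistency() {
    for &c in b"ACGT" {
        assert_eq!(complement_char(complement_char(c)), c);
        assert_eq!(complement_base(pack_char(c)), pack_char(complement_char(c)));
    }
}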

/// Reverse complement the 2-bit pairs in the input.
#[inline(always)]
const fn revcomp_raw(word: u64) -> u64 {
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    {
        let mut res = word.reverse_bits(); // ARM can reverse bits in a single instruction
        res = ((res >> 1) & 0x5555_5555_5555_5555) | ((res & 0x5555_5555_5555_5555) << 1);
        res ^ 0xAAAA_AAAA_AAAA_AAAA
    }

    #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
    {
        let mut res = word.swap_bytes();
        res = ((res >> 4) & 0x0F0F_0F0F_0F0F_0F0F) | ((res & 0x0F0F_0F0F_0F0F_0F0F) << 4);
        res = ((res >> 2) & 0x3333_3333_3333_3333) | ((res & 0x3333_3333_3333_3333) << 2);
        res ^ 0xAAAA_AAAA_AAAA_AAAA
    }
}

/// Compute the reverse complement of a short sequence packed in a `u64`.
#[inline(always)]
pub const fn revcomp_u64(word: u64, len: usize) -> u64 {
    revcomp_raw(word) >> (u64::BITS as usize - 2 * len)
}

#[inline(always)]
pub const fn revcomp_u128(word: u128, len: usize) -> u128 {
    let low = word as u64;
    let high = (word >> 64) as u64;
    let rlow = revcomp_raw(low);
    let rhigh = revcomp_raw(high);
    let out = ((rlow as u128) << 64) | rhigh as u128;
    out >> (u128::BITS as usize - 2 * len)
}
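
// Test-only sketch: "AC" packs to 0b0100 (A = 0 in the low two bits, C = 1 above
// it), and its reverse complement "GT" packs to 0b1011.
#[cfg(test)]
#[test]
fn revcomp_u64_small() {
    assert_eq!(revcomp_u64(0b0100, 2), 0b1011);
    // Reverse-complementing twice is the identity.
    assert_eq!(revcomp_u64(revcomp_u64(0b0100, 2), 2), 0b0100);
    // The u128 variant agrees on short inputs.
    assert_eq!(revcomp_u128(0b0100, 2), 0b1011);
}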

// ======================================================================
// 1-BIT HELPER METHODS

/// Returns 1 when a char is ambiguous (not one of `ACGTacgt`), and 0 otherwise.
#[inline(always)]
pub fn char_is_ambiguous(base: u8) -> u8 {
    // (!matches!(base, b'A' | b'C'  | b'G'  | b'T' | b'a' | b'c'  | b'g'  | b't')) as u8
    let table = b"ACTG";
    let upper_mask = !(b'a' - b'A');
    (table[pack_char_lossy(base) as usize] != (base & upper_mask)) as u8
}
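
// Test-only sketch: only `ACGTacgt` are considered unambiguous.
#[cfg(test)]
#[test]
fn ambiguity_flags() {
    for &c in b"ACGTacgt" {
        assert_eq!(char_is_ambiguous(c), 0);
    }
    for &c in b"NnXx*-0" {
        assert_eq!(char_is_ambiguous(c), 1);
    }
}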

/// Reverse the bits in the input.
#[inline(always)]
const fn rev_raw(word: u64) -> u64 {
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    {
        // ARM can reverse bits in a single instruction
        word.reverse_bits()
    }

    #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
    {
        let mut res = word.swap_bytes();
        res = ((res >> 4) & 0x0F0F_0F0F_0F0F_0F0F) | ((res & 0x0F0F_0F0F_0F0F_0F0F) << 4);
        res = ((res >> 2) & 0x3333_3333_3333_3333) | ((res & 0x3333_3333_3333_3333) << 2);
        res = ((res >> 1) & 0x5555_5555_5555_5555) | ((res & 0x5555_5555_5555_5555) << 1);
        // Unlike `revcomp_raw`, this is a plain reversal: no complementing xor at the end.
        res
    }
}

/// Compute the reverse of a short 1-bit packed sequence in a `u64`.
#[inline(always)]
pub const fn rev_u64(word: u64, len: usize) -> u64 {
    rev_raw(word) >> (u64::BITS as usize - len)
}

#[inline(always)]
pub const fn rev_u128(word: u128, len: usize) -> u128 {
    let low = word as u64;
    let high = (word >> 64) as u64;
    let rlow = rev_raw(low);
    let rhigh = rev_raw(high);
    let out = ((rlow as u128) << 64) | rhigh as u128;
    out >> (u128::BITS as usize - len)
}
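
// Test-only sketch: `rev_u64` reverses the low `len` bits of a 1-bit packed word
// without complementing them, and `rev_u128` agrees on short inputs.
#[cfg(test)]
#[test]
fn rev_u64_small() {
    assert_eq!(rev_u64(0b001, 3), 0b100);
    assert_eq!(rev_u64(0b110, 3), 0b011);
    assert_eq!(rev_u128(0b001, 3), 0b100);
}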

// ======================================================================

impl<const B: usize> PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    /// Shrink `seq` to only just cover the data.
    #[inline(always)]
    pub fn normalize(&self) -> Self {
        let start_byte = self.offset / Self::C8;
        let end_byte = (self.offset + self.len).div_ceil(Self::C8);
        Self {
            seq: &self.seq[start_byte..end_byte],
            offset: self.offset % Self::C8,
            len: self.len,
        }
    }

    /// Return a `Vec<u8>` of ASCII `ACTG` characters.
    #[inline(always)]
    pub fn unpack(&self) -> Vec<u8> {
        self.iter_bp().map(unpack_base).collect()
    }
}
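
// Test-only sketch tying the helpers above together, using `push_ascii` from the
// `SeqVec` impl further below: pack an ASCII string, then unpack it and a slice of it.
#[cfg(test)]
#[test]
fn pack_slice_unpack() {
    let mut v = PackedSeqVec::default();
    v.push_ascii(b"ACGTACGT");
    let s = v.as_slice();
    assert_eq!(s.unpack(), b"ACGTACGT".to_vec());
    assert_eq!(s.slice(2..6).unpack(), b"GTAC".to_vec());
}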

/// Read up to 32 bytes starting at `idx`; bytes past the end of `seq` are read as zeros.
#[inline(always)]
pub(crate) fn read_slice(seq: &[u8], idx: usize) -> u32x8 {
    // assert!(idx <= seq.len());
    let mut result = [0u8; 32];
    let num_bytes = 32.min(seq.len().saturating_sub(idx));
    unsafe {
        let src = seq.as_ptr().add(idx);
        std::ptr::copy_nonoverlapping(src, result.as_mut_ptr(), num_bytes);
        std::mem::transmute(result)
    }
}

impl<'s, const B: usize> Seq<'s> for PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    const BITS_PER_CHAR: usize = B;
    const BASES_PER_BYTE: usize = Self::C8;
    type SeqVec = PackedSeqVecBase<B>;

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn get_ascii(&self, index: usize) -> u8 {
        unpack_base(self.get(index))
    }

    /// Convert a short sequence (kmer) to a packed representation as `u64`.
    /// Panics if `self` is longer than `64 / B` characters (32 for 2-bit).
    #[inline(always)]
    fn as_u64(&self) -> u64 {
        assert!(self.len() <= 64 / B);
        debug_assert!(self.seq.len() <= 9);

        let mask = u64::MAX >> (64 - B * self.len());

        // The unaligned read is OK, because we ensure that the underlying `PackedSeqVecBase::seq` always
        // has at least 16 bytes (the size of a u128) of padding at the end.
        if self.len() <= Self::K64 {
            let x = unsafe { (self.seq.as_ptr() as *const u64).read_unaligned() };
            (x >> (B * self.offset)) & mask
        } else {
            let x = unsafe { (self.seq.as_ptr() as *const u128).read_unaligned() };
            (x >> (B * self.offset)) as u64 & mask
        }
    }

    /// Convert a short sequence (kmer) to a packed representation of its reverse complement as `u64`.
    /// Panics if `self` is longer than `64 / B` characters.
    #[inline(always)]
    fn revcomp_as_u64(&self) -> u64 {
        match B {
            1 => rev_u64(self.as_u64(), self.len()),
            2 => revcomp_u64(self.as_u64(), self.len()),
            _ => panic!("Rev(comp) is only supported for 1-bit and 2-bit alphabets."),
        }
    }

    /// Convert a short sequence (kmer) to a packed representation as `u128`.
    /// Panics if `self` is longer than `(128 - 8) / B + 1` characters (61 for 2-bit).
    #[inline(always)]
    fn as_u128(&self) -> u128 {
        assert!(
            self.len() <= (128 - 8) / B + 1,
            "Sequences >61 long cannot be read with a single unaligned u128 read."
        );
        debug_assert!(self.seq.len() <= 17);

        let mask = u128::MAX >> (128 - B * self.len());

        // The unaligned read is OK, because we ensure that the underlying `PackedSeqVecBase::seq` always
        // has at least 16 bytes (the size of a u128) of padding at the end.
        let x = unsafe { (self.seq.as_ptr() as *const u128).read_unaligned() };
        (x >> (B * self.offset)) & mask
    }

    /// Convert a short sequence (kmer) to a packed representation of its reverse complement as `u128`.
    /// Panics if `self` is longer than `(128 - 8) / B + 1` characters.
    #[inline(always)]
    fn revcomp_as_u128(&self) -> u128 {
        match B {
            1 => rev_u128(self.as_u128(), self.len()),
            2 => revcomp_u128(self.as_u128(), self.len()),
            _ => panic!("Rev(comp) is only supported for 1-bit and 2-bit alphabets."),
        }
    }

    #[inline(always)]
    fn to_vec(&self) -> PackedSeqVecBase<B> {
        assert_eq!(self.offset, 0);
        PackedSeqVecBase {
            seq: self
                .seq
                .iter()
                .copied()
                .chain(std::iter::repeat_n(0u8, PADDING))
                .collect(),
            len: self.len,
        }
    }

    fn to_revcomp(&self) -> PackedSeqVecBase<B> {
        let mut seq = self.seq[..(self.offset + self.len).div_ceil(4)]
            .iter()
            // 1. reverse the bytes
            .rev()
            .copied()
            .map(|mut res| {
                // 2. swap the bases in the byte
                // This is auto-vectorized.
                res = ((res >> 4) & 0x0F) | ((res & 0x0F) << 4);
                res = ((res >> 2) & 0x33) | ((res & 0x33) << 2);
                res ^ 0xAA
            })
            .chain(std::iter::repeat_n(0u8, PADDING))
            .collect::<Vec<u8>>();

        // 3. Shift away the offset.
        let new_offset = (4 - (self.offset + self.len) % 4) % 4;

        if new_offset > 0 {
            // Shift everything left by `2*new_offset` bits.
            let shift = 2 * new_offset;
            *seq.last_mut().unwrap() >>= shift;
            // This loop is also auto-vectorized.
            for i in 0..seq.len() - 1 {
                seq[i] = (seq[i] >> shift) | (seq[i + 1] << (8 - shift));
            }
        }

        PackedSeqVecBase { seq, len: self.len }
    }

    #[inline(always)]
    fn slice(&self, range: Range<usize>) -> Self {
        debug_assert!(
            range.end <= self.len,
            "Slice index out of bounds: {} > {}",
            range.end,
            self.len
        );
        PackedSeqBase {
            seq: self.seq,
            offset: self.offset + range.start,
            len: range.end - range.start,
        }
        .normalize()
    }

    #[inline(always)]
    fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> {
        assert!(self.len <= self.seq.len() * Self::C8);

        let this = self.normalize();

        // read u64 at a time?
        let mut byte = 0;
        (0..this.len + this.offset)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C8 == 0 {
                        byte = this.seq[i / Self::C8];
                    }
                    // Shift byte instead of i?
                    (byte >> (B * (i % Self::C8))) & Self::CHAR_MASK as u8
                },
            )
            .advance(this.offset)
    }

    #[inline(always)]
    fn par_iter_bp(self, context: usize) -> PaddedIt<impl ChunkIt<S>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // without +o, since we don't need them in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut cur = S::ZERO;

        // Boxed, so it doesn't consume precious registers.
        // Without this, cur is not always inlined into a register.
        let mut buf = Box::new([S::ZERO; 8]);

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };
        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            // Read a u256 for each lane, containing the next 256/B characters.
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| read_slice(this.seq, offsets[lane] + (i / Self::C8)),
                            );
                            *buf = transpose(data);
                        }
                        cur = buf[(i % Self::C256) / Self::C32];
                    }
                    // Extract the low `B` bits: the next character in each lane.
                    let chars = cur & S::splat(Self::CHAR_MASK as u32);
                    // Shift remaining characters to the right.
                    cur = cur >> S::splat(B as u32);
                    chars
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }
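
    // Worked example of the chunking above (a sketch, assuming `L` is the 8 lanes
    // of a `u32x8`): for a 2-bit sequence of length 1000 and context 11,
    // `num_kmers_stride` = 990, `n` = 990.div_ceil(8).next_multiple_of(4) = 124,
    // `bytes_per_chunk` = 31, and `padding` = 4 * 8 * 31 - 990 = 2 extra
    // iterations are reported at the end.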

    #[inline(always)]
    fn par_iter_bp_delayed(
        self,
        context: usize,
        Delay(delay): Delay,
    ) -> PaddedIt<impl ChunkIt<(S, S)>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(
            delay < usize::MAX / 2,
            "Delay={} should be >=0.",
            delay as isize
        );

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // without +o, since we don't need them in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d = S::ZERO;

        // Even buf_len is nice to only have the write==buf_len check once.
        // We also make it the next power of 2, for faster modulo operations.
        // delay/C32: the delay, measured in u32 words.
        // +8: some 'random' padding
        let buf_len = (delay / Self::C32 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        let mut buf = vec![S::ZERO; buf_len];
        let mut write_idx = 0;
        // We compensate for the first delay/C32 triggers of the check below that
        // happen before the delay is actually reached.
        let mut read_idx = (buf_len - delay / Self::C32) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };
        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            // Read a u256 for each lane, containing the next 256/B characters.
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| read_slice(this.seq, offsets[lane] + (i / Self::C8)),
                            );
                            unsafe {
                                *TryInto::<&mut [u32x8; 8]>::try_into(
                                    buf.get_unchecked_mut(write_idx..write_idx + 8),
                                )
                                .unwrap_unchecked() = transpose(data);
                            }
                            if i == 0 {
                                // Mask out chars before the offset.
                                let elem = !((1u32 << (B * o)) - 1);
                                let mask = S::splat(elem);
                                buf[write_idx] &= mask;
                            }
                        }
                        upcoming = buf[write_idx];
                        write_idx += 1;
                        write_idx &= buf_mask;
                    }
                    if i % Self::C32 == delay % Self::C32 {
                        unsafe { assert_unchecked(read_idx < buf.len()) };
                        upcoming_d = buf[read_idx];
                        read_idx += 1;
                        read_idx &= buf_mask;
                    }
                    // Extract the low `B` bits: the next character in each lane.
                    let chars = upcoming & S::splat(Self::CHAR_MASK as u32);
                    let chars_d = upcoming_d & S::splat(Self::CHAR_MASK as u32);
                    // Shift remaining characters to the right.
                    upcoming = upcoming >> S::splat(B as u32);
                    upcoming_d = upcoming_d >> S::splat(B as u32);
                    (chars, chars_d)
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }

    /// NOTE: When `self` does not start at a byte boundary, the
    /// 'delayed' character is not guaranteed to be `0`.
    #[inline(always)]
    fn par_iter_bp_delayed_2(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)>> {
        self.par_iter_bp_delayed_2_with_factor(context, delay1, delay2, 1)
    }

    /// Compares `K64` characters (29 for 2-bit) at a time.
    fn cmp_lcp(&self, other: &Self) -> (std::cmp::Ordering, usize) {
        let mut lcp = 0;
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(Self::K64) {
            let len = (min_len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                // Unfortunately, bases are packed in little endian order, so the default order is reversed.
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() as usize / B * B;
                lcp += t / B;
                let mask = (Self::CHAR_MASK) << t;
                return ((this_word & mask).cmp(&(other_word & mask)), lcp);
            }
            lcp += len;
        }
        (self.len.cmp(&other.len), lcp)
    }

    #[inline(always)]
    fn get(&self, index: usize) -> u8 {
        let offset = self.offset + index;
        let idx = offset / Self::C8;
        let offset = offset % Self::C8;
        (self.seq[idx] >> (B * offset)) & Self::CHAR_MASK as u8
    }
}
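
// Test-only sketch for the `Seq` impl above: `as_u64`, `revcomp_as_u64`, and the
// comparison operators (note that packed values order bases as A < C < T < G).
#[cfg(test)]
#[test]
fn seq_words_and_ordering() {
    let mut v = PackedSeqVec::default();
    v.push_ascii(b"ACGT");
    let s = v.as_slice();
    // A=0, C=1, G=3, T=2, packed from the low bits upwards.
    assert_eq!(s.as_u64(), 0b10_11_01_00);
    // "ACGT" is its own reverse complement.
    assert_eq!(s.revcomp_as_u64(), s.as_u64());
    // "AC" < "CG" in the packed order.
    assert!(s.slice(0..2) < s.slice(1..3));
}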

impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    /// When iterating over 2-bit and 1-bit encoded data in parallel,
    /// one must ensure that they have the same stride.
    /// On the larger type, set `factor` to the ratio to the smaller one,
    /// so that the stride in bytes is a multiple of `factor`,
    /// and hence the smaller type also has a byte-aligned stride.
    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_factor(
        self,
        context: usize,
        Delay(delay1): Delay,
        Delay(delay2): Delay,
        factor: usize,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);
        assert!(delay1 <= delay2, "Delay1 must be at most delay2.");

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // without +o, since we don't need them in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride
            .div_ceil(L)
            .next_multiple_of(factor * Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d1 = S::ZERO;
        let mut upcoming_d2 = S::ZERO;

        // Even buf_len is nice to only have the write==buf_len check once.
        let buf_len = (delay2 / Self::C32 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        let mut buf = vec![S::ZERO; buf_len];
        let mut write_idx = 0;
        // We compensate for the first delay/C32 triggers of the check below that
        // happen before the delay is actually reached.
        let mut read_idx1 = (buf_len - delay1 / Self::C32) % buf_len;
        let mut read_idx2 = (buf_len - delay2 / Self::C32) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };
        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            // Read a u256 for each lane, containing the next 256/B characters.
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| read_slice(this.seq, offsets[lane] + (i / Self::C8)),
                            );
                            unsafe {
                                *TryInto::<&mut [u32x8; 8]>::try_into(
                                    buf.get_unchecked_mut(write_idx..write_idx + 8),
                                )
                                .unwrap_unchecked() = transpose(data);
                            }
                            if i == 0 {
                                // Mask out chars before the offset.
                                let elem = !((1u32 << (B * o)) - 1);
                                let mask = S::splat(elem);
                                buf[write_idx] &= mask;
                            }
                        }
                        upcoming = buf[write_idx];
                        write_idx += 1;
                        write_idx &= buf_mask;
                    }
                    if i % Self::C32 == delay1 % Self::C32 {
                        unsafe { assert_unchecked(read_idx1 < buf.len()) };
                        upcoming_d1 = buf[read_idx1];
                        read_idx1 += 1;
                        read_idx1 &= buf_mask;
                    }
                    if i % Self::C32 == delay2 % Self::C32 {
                        unsafe { assert_unchecked(read_idx2 < buf.len()) };
                        upcoming_d2 = buf[read_idx2];
                        read_idx2 += 1;
                        read_idx2 &= buf_mask;
                    }
                    // Extract the low `B` bits: the next character in each lane.
                    let chars = upcoming & S::splat(Self::CHAR_MASK as u32);
                    let chars_d1 = upcoming_d1 & S::splat(Self::CHAR_MASK as u32);
                    let chars_d2 = upcoming_d2 & S::splat(Self::CHAR_MASK as u32);
                    // Shift remaining characters to the right.
                    upcoming = upcoming >> S::splat(B as u32);
                    upcoming_d1 = upcoming_d1 >> S::splat(B as u32);
                    upcoming_d2 = upcoming_d2 >> S::splat(B as u32);
                    (chars, chars_d1, chars_d2)
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }
}
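
// Illustration of the `factor` parameter above (a sketch, not exercised here):
// when iterating a 2-bit `PackedSeq` in lockstep with the 1-bit `BitSeq` holding
// its ambiguity flags, the 2-bit side is the "larger" type (twice the bits per
// char), so it passes `factor = 2`. Its per-lane stride in bytes then stays a
// multiple of 2, which makes the corresponding 1-bit stride a whole number of
// bytes, keeping the lanes of the two iterators aligned.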

impl<const B: usize> PartialEq for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    /// Compares `K64` characters (29 for 2-bit) at a time.
    fn eq(&self, other: &Self) -> bool {
        if self.len != other.len {
            return false;
        }
        for i in (0..self.len).step_by(Self::K64) {
            let len = (self.len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let that = other.slice(i..i + len);
            if this.as_u64() != that.as_u64() {
                return false;
            }
        }
        true
    }
}

impl<const B: usize> Eq for PackedSeqBase<'_, B> where Bits<B>: SupportedBits {}

impl<const B: usize> PartialOrd for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl<const B: usize> Ord for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    /// Compares `K64` characters (29 for 2-bit) at a time.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(Self::K64) {
            let len = (min_len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                // Unfortunately, bases are packed in little endian order, so the default order is reversed.
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() as usize / B * B;
                let mask = (Self::CHAR_MASK) << t;
                return (this_word & mask).cmp(&(other_word & mask));
            }
        }
        self.len.cmp(&other.len)
    }
}

impl<const B: usize> SeqVec for PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    type Seq<'s> = PackedSeqBase<'s, B>;

    #[inline(always)]
    fn into_raw(mut self) -> Vec<u8> {
        self.seq.resize(self.len.div_ceil(Self::C8), 0);
        self.seq
    }

    #[inline(always)]
    fn as_slice(&self) -> Self::Seq<'_> {
        PackedSeqBase {
            seq: &self.seq[..self.len.div_ceil(Self::C8)],
            offset: 0,
            len: self.len,
        }
    }

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn clear(&mut self) {
        self.seq.clear();
        self.len = 0;
    }

    fn push_seq<'a>(&mut self, seq: PackedSeqBase<'_, B>) -> Range<usize> {
        let start = self.len.next_multiple_of(Self::C8) + seq.offset;
        let end = start + seq.len();
        // Reserve *additional* capacity.
        self.seq.reserve(seq.seq.len());
        // Shrink away the padding.
        self.seq.resize(self.len.div_ceil(Self::C8), 0);
        // Extend.
        self.seq.extend(seq.seq);
        // Push padding.
        self.seq.extend(std::iter::repeat_n(0u8, PADDING));
        self.len = end;
        start..end
    }

    /// For `PackedSeqVec` (2-bit encoding): map ASCII `ACGT` (and `acgt`) to DNA `0132` in that order.
    /// Other characters are silently mapped into `0..4`.
    ///
    /// Uses the BMI2 `pext` instruction when available, based on the
    /// `n_to_bits_pext` method described at
    /// <https://github.com/Daniel-Liu-c0deb0t/cute-nucleotides>.
    ///
    /// For `BitSeqVec` (1-bit encoding): map `ACGTacgt` to `0`, and everything else to `1`.
    fn push_ascii(&mut self, seq: &[u8]) -> Range<usize> {
        match B {
            1 | 2 => {}
            _ => panic!(
                "Can only use ASCII input for 2-bit DNA packing, or 1-bit ambiguous indicators."
            ),
        }

        self.seq
            .resize((self.len + seq.len()).div_ceil(Self::C8) + PADDING, 0);
        let start_aligned = self.len.next_multiple_of(Self::C8);
        let start = self.len;
        let len = seq.len();
        let mut idx = self.len / Self::C8;

        let parse_base = |base| match B {
            1 => char_is_ambiguous(base),
            2 => pack_char_lossy(base),
            _ => unreachable!(),
        };

        // Pack the unaligned characters at the start, until a byte boundary is reached.
        let unaligned = core::cmp::min(start_aligned - start, len);
        if unaligned > 0 {
            let mut packed_byte = self.seq[idx];
            for &base in &seq[..unaligned] {
                packed_byte |= parse_base(base) << ((self.len % Self::C8) * B);
                self.len += 1;
            }
            self.seq[idx] = packed_byte;
            idx += 1;
        }

        #[allow(unused)]
        let mut last = unaligned;

        // TODO: Vectorization for B=1?
        if B == 2 {
            #[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
            {
                last = unaligned + (len - unaligned) / 8 * 8;

                for i in (unaligned..last).step_by(8) {
                    let chunk = &seq[i..i + 8].try_into().unwrap();
                    let ascii = u64::from_ne_bytes(*chunk);
                    let packed_bytes =
                        unsafe { std::arch::x86_64::_pext_u64(ascii, 0x0606060606060606) };
                    self.seq[idx] = packed_bytes as u8;
                    idx += 1;
                    self.seq[idx] = (packed_bytes >> 8) as u8;
                    idx += 1;
                    self.len += 8;
                }
            }

            #[cfg(target_feature = "neon")]
            {
                use core::arch::aarch64::{
                    vandq_u8, vdup_n_u8, vld1q_u8, vpadd_u8, vshlq_u8, vst1_u8,
                };
                use core::mem::transmute;

                last = unaligned + (len - unaligned) / 16 * 16;

                for i in (unaligned..last).step_by(16) {
                    unsafe {
                        let ascii = vld1q_u8(seq.as_ptr().add(i));
                        let masked_bits = vandq_u8(ascii, transmute([6i8; 16]));
                        let (bits_0, bits_1) = transmute(vshlq_u8(
                            masked_bits,
                            transmute([-1i8, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5]),
                        ));
                        let half_packed = vpadd_u8(bits_0, bits_1);
                        let packed = vpadd_u8(half_packed, vdup_n_u8(0));
                        vst1_u8(self.seq.as_mut_ptr().add(idx), packed);
                        idx += Self::C8;
                        self.len += 16;
                    }
                }
            }
        }
        if B == 1 {
            // FIXME: Add NEON version.
            #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
            {
                // The SIMD loop below also packs the final partial chunk,
                // so nothing remains for the scalar tail.
                last = len;
                self.len = start + len;

                for i in (unaligned..last).step_by(32) {
                    use std::mem::transmute as t;

                    use wide::CmpEq;
                    // Wide doesn't have u8x32, so this is messy here...
                    type S = wide::i8x32;
                    let chars: S = unsafe { t(read_slice(seq, i)) };
                    let upper_mask = !(b'a' - b'A');
                    // make everything upper case
                    let chars = chars & S::splat(upper_mask as i8);
                    let lossy_encoded = chars & S::splat(6);
                    let table = unsafe { S::from(t::<_, S>(*b"AxCxTxGxxxxxxxxxAxCxTxGxxxxxxxxx")) };
                    let lookup: S = unsafe {
                        t(std::arch::x86_64::_mm256_shuffle_epi8(
                            t(table),
                            t(lossy_encoded),
                        ))
                    };
                    let packed_bytes = !(chars.cmp_eq(lookup).move_mask() as u32);

                    if i + 32 <= last {
                        self.seq[idx + 0] = packed_bytes as u8;
                        self.seq[idx + 1] = (packed_bytes >> 8) as u8;
                        self.seq[idx + 2] = (packed_bytes >> 16) as u8;
                        self.seq[idx + 3] = (packed_bytes >> 24) as u8;
                        idx += 4;
                    } else {
                        let mut b = 0;
                        while i + b < last {
                            self.seq[idx] = (packed_bytes >> b) as u8;
                            idx += 1;
                            b += 8;
                        }
                    }
                }
            }
        }

        // Scalar tail: pack any characters not handled by the vectorized paths above.
        let mut packed_byte = 0;
        for &base in &seq[last..] {
            packed_byte |= parse_base(base) << ((self.len % Self::C8) * B);
            self.len += 1;
            if self.len % Self::C8 == 0 {
                self.seq[idx] = packed_byte;
                idx += 1;
                packed_byte = 0;
            }
        }
        if self.len % Self::C8 != 0 && last < len {
            self.seq[idx] = packed_byte;
            idx += 1;
        }
        assert_eq!(idx + PADDING, self.seq.len());
        start..start + len
    }

    #[cfg(feature = "rand")]
    fn random(n: usize) -> Self {
        use rand::{RngCore, SeedableRng};

        let byte_len = n.div_ceil(Self::C8);
        let mut seq = vec![0; byte_len + PADDING];
        rand::rngs::SmallRng::from_os_rng().fill_bytes(&mut seq[..byte_len]);
        // Ensure that the last byte is padded with zeros.
        if n % Self::C8 != 0 {
            seq[byte_len - 1] &= (1 << (B * (n % Self::C8))) - 1;
        }

        Self { seq, len: n }
    }
}
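
// Test-only sketches for the `SeqVec` impl above.
#[cfg(test)]
#[test]
fn bitseq_marks_ambiguous_chars() {
    // For the 1-bit `BitSeqVec`, `push_ascii` stores one "is ambiguous" flag per
    // character, matching `char_is_ambiguous`.
    let ascii = b"ACGNTacgnt";
    let mut v = BitSeqVec::default();
    v.push_ascii(ascii);
    let s = v.as_slice();
    assert_eq!(s.len(), ascii.len());
    for (i, &c) in ascii.iter().enumerate() {
        assert_eq!(s.get(i), char_is_ambiguous(c));
    }
}

#[cfg(test)]
#[test]
fn push_ascii_appends() {
    // Two consecutive `push_ascii` calls concatenate, even when the first one
    // does not end on a byte boundary.
    let mut v = PackedSeqVec::default();
    let r1 = v.push_ascii(b"ACG");
    let r2 = v.push_ascii(b"TACGT");
    assert_eq!((r1, r2), (0..3, 3..8));
    assert_eq!(v.as_slice().unpack(), b"ACGTACGT".to_vec());
}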

impl<'s> PackedSeqBase<'s, 1> {
    /// An iterator indicating for each kmer whether it contains ambiguous bases.
    ///
    /// Returns n-(k-1) elements.
    #[inline(always)]
    pub fn iter_kmer_ambiguity(self, k: usize) -> impl ExactSizeIterator<Item = bool> + use<'s> {
        let this = self.normalize();
        assert!(k > 0);
        assert!(k <= Self::K64);
        (this.offset..this.offset + this.len.saturating_sub(k - 1))
            .map(move |i| self.read_kmer(k, i) != 0)
    }

    /// A parallel iterator indicating for each kmer whether it contains ambiguous bases.
    ///
    /// First element is the 'kmer' consisting only of the first character of each chunk.
    ///
    /// `k`: length of the windows to check.
    /// `context`: number of overlapping iterations, plus 1; determines the stride of each lane.
    /// `skip`: set to `context-1` to skip the iterations added by the context.
    #[inline(always)]
    pub fn par_iter_kmer_ambiguity(
        self,
        k: usize,
        context: usize,
        skip: usize,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(k <= 64, "par_iter_kmer_ambiguity requires k<=64, but k={k}");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // without +o, since we don't need them in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);

        //     prev2 prev    cur
        //           0..31 | 32..63
        // mask      00001111110000
        // mask      00000111111000
        // mask      00000011111100
        // mask      00000001111110
        // mask      00000000111111
        //           cur     next
        //           32..63| 64..95
        // mask      11111100000000

        // [prev2, prev, cur]
        let mut cur = [S::ZERO; 3];
        let mut mask = [S::ZERO; 3];
        if k <= 32 {
            // high k bits of cur
            mask[2] = (S::MAX) << S::splat(32 - k as u32);
        } else {
            mask[2] = S::MAX;
            mask[1] = (S::MAX) << S::splat(64 - k as u32);
        }

        #[inline(always)]
        fn rotate_mask(mask: &mut [S; 3], r: u32) {
            let carry01 = mask[0] >> S::splat(32 - r);
            let carry12 = mask[1] >> S::splat(32 - r);
            mask[0] = mask[0] << r;
            mask[1] = (mask[1] << r) | carry01;
            mask[2] = (mask[2] << r) | carry12;
        }

        // Boxed, so it doesn't consume precious registers.
        // Without this, cur is not always inlined into a register.
        let mut buf = Box::new([S::ZERO; 8]);

        // We skip the first o iterations.
        let par_len = if num_kmers == 0 { 0 } else { n + k + o - 1 };

        let mut read = {
            #[inline(always)]
            move |i: usize, cur: &mut [S; 3]| {
                if i % Self::C256 == 0 {
                    // Read a u256 for each lane, containing the next 256 characters.
                    let data: [u32x8; 8] = from_fn(
                        #[inline(always)]
                        |lane| read_slice(this.seq, offsets[lane] + (i / Self::C8)),
                    );
                    *buf = transpose(data);
                }
                cur[0] = cur[1];
                cur[1] = cur[2];
                cur[2] = buf[(i % Self::C256) / Self::C32];
            }
        };

        // Precompute the first o+skip iterations.
        let mut to_skip = o + skip;
        let mut i = 0;
        while to_skip > 0 {
            read(i, &mut cur);
            i += 32;
            if to_skip >= 32 {
                to_skip -= 32;
            } else {
                mask[0] = mask[1];
                mask[1] = mask[2];
                mask[2] = S::splat(0);
                // rotate mask by remainder
                rotate_mask(&mut mask, to_skip as u32);
                break;
            }
        }

        let it = (o + skip..par_len).map(
            #[inline(always)]
            move |i| {
                if i % Self::C32 == 0 {
                    read(i, &mut cur);
                    mask[0] = mask[1];
                    mask[1] = mask[2];
                    mask[2] = S::splat(0);
                }

                rotate_mask(&mut mask, 1);
                !((cur[0] & mask[0]) | (cur[1] & mask[1]) | (cur[2] & mask[2])).cmp_eq(S::splat(0))
            },
        );

        PaddedIt { it, padding }
    }
}