packed_seq/
packed_seq.rs

use traits::Seq;

use crate::intrinsics::transpose;

use super::*;

/// A 2-bit packed non-owned slice of DNA bases.
#[derive(Copy, Clone, Debug, MemSize, MemDbg)]
pub struct PackedSeq<'s> {
    /// Packed data.
    seq: &'s [u8],
    /// Offset in bp from the start of `seq`.
    offset: usize,
    /// Length of the sequence in bp, starting at `offset` from the start of `seq`.
    len: usize,
}

/// Number of padding bytes at the end of `PackedSeqVec::seq`.
const PADDING: usize = 16;

/// A 2-bit packed owned sequence of DNA bases.
#[derive(Clone, Debug, MemSize, MemDbg)]
#[cfg_attr(feature = "pyo3", pyo3::pyclass)]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
pub struct PackedSeqVec {
    /// NOTE: We maintain the invariant that `seq` has at least 16 bytes of
    /// padding at the end, after the bytes covered by `len`.
    /// This ensures that the `read_unaligned` calls in `as_u64` are valid.
    pub(crate) seq: Vec<u8>,

    /// The length, in bp, of the underlying sequence. See `.len()`.
    len: usize,
}

impl Default for PackedSeqVec {
    fn default() -> Self {
        Self {
            seq: vec![0; PADDING],
            len: 0,
        }
    }
}

/// Pack an ASCII `ACTGactg` character into its 2-bit representation.
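///
/// `Aa` map to `0`, `Cc` to `1`, `Tt` to `2`, and `Gg` to `3`; any other byte panics.
///
/// A minimal usage sketch (assuming `packed_seq` re-exports this function at the crate root):
/// ```
/// use packed_seq::pack_char;
/// assert_eq!(pack_char(b'A'), 0);
/// assert_eq!(pack_char(b'g'), 3);
/// ```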
#[inline(always)]
pub fn pack_char(base: u8) -> u8 {
    match base {
        b'a' | b'A' => 0,
        b'c' | b'C' => 1,
        b'g' | b'G' => 3,
        b't' | b'T' => 2,
        _ => panic!(
            "Unexpected character '{}' with ASCII value {base}. Expected one of ACTGactg.",
            base as char
        ),
    }
}

/// Unpack a 2-bit DNA base into the corresponding `ACTG` character.
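///
/// A round-trip sketch (assuming crate-root re-exports):
/// ```
/// use packed_seq::{pack_char, unpack_base};
/// assert_eq!(unpack_base(2), b'T');
/// assert_eq!(unpack_base(pack_char(b'c')), b'C');
/// ```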
#[inline(always)]
pub fn unpack_base(base: u8) -> u8 {
    debug_assert!(base < 4, "Base {base} is not <4.");
    b"ACTG"[base as usize]
}

/// Complement an ASCII character: `A<>T` and `C<>G`.
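///
/// Only uppercase `ACGT` is accepted; any other byte panics.
///
/// A sketch (assuming crate-root re-exports):
/// ```
/// use packed_seq::complement_char;
/// assert_eq!(complement_char(b'A'), b'T');
/// assert_eq!(complement_char(b'G'), b'C');
/// ```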
#[inline(always)]
pub const fn complement_char(base: u8) -> u8 {
    match base {
        b'A' => b'T',
        b'C' => b'G',
        b'G' => b'C',
        b'T' => b'A',
        _ => panic!("Unexpected character. Expected one of ACGT."),
    }
}

/// Complement a 2-bit base: `0<>2` and `1<>3`.
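///
/// In the 2-bit encoding (`A=0`, `C=1`, `T=2`, `G=3`), complementing is a single XOR.
///
/// A sketch (assuming crate-root re-exports):
/// ```
/// use packed_seq::{complement_base, pack_char, unpack_base};
/// assert_eq!(complement_base(1), 3); // C -> G
/// assert_eq!(unpack_base(complement_base(pack_char(b'A'))), b'T');
/// ```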
#[inline(always)]
pub const fn complement_base(base: u8) -> u8 {
    base ^ 2
}

/// Complement 8 lanes of 2-bit bases: `0<>2` and `1<>3`.
#[inline(always)]
pub fn complement_base_simd(base: u32x8) -> u32x8 {
    base ^ u32x8::splat(2)
}

/// Compute the reverse complement of a short sequence packed in a `u64`.
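///
/// The first base sits in the lowest 2 bits. A sketch (assuming crate-root re-exports):
/// ```
/// use packed_seq::revcomp_u64;
/// // "AC" packs to 0b01_00; its reverse complement "GT" packs to 0b10_11.
/// assert_eq!(revcomp_u64(0b01_00, 2), 0b10_11);
/// ```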
#[inline(always)]
pub const fn revcomp_u64(word: u64, len: usize) -> u64 {
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    {
        let mut res = word.reverse_bits(); // ARM can reverse bits in a single instruction
        res = ((res >> 1) & 0x5555_5555_5555_5555) | ((res & 0x5555_5555_5555_5555) << 1);
        res ^= 0xAAAA_AAAA_AAAA_AAAA;
        res >> (u64::BITS as usize - 2 * len)
    }

    #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
    {
        let mut res = word.swap_bytes();
        res = ((res >> 4) & 0x0F0F_0F0F_0F0F_0F0F) | ((res & 0x0F0F_0F0F_0F0F_0F0F) << 4);
        res = ((res >> 2) & 0x3333_3333_3333_3333) | ((res & 0x3333_3333_3333_3333) << 2);
        res ^= 0xAAAA_AAAA_AAAA_AAAA;
        res >> (u64::BITS as usize - 2 * len)
    }
}

impl PackedSeq<'_> {
    /// Shrink `seq` to just cover the data.
    #[inline(always)]
    pub fn normalize(&self) -> Self {
        let start = self.offset / 4;
        let end = (self.offset + self.len).div_ceil(4);
        Self {
            seq: &self.seq[start..end],
            offset: self.offset % 4,
            len: self.len,
        }
    }

    /// Return a `Vec<u8>` of ASCII `ACTG` characters.
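    ///
    /// A sketch (assuming crate-root re-exports):
    /// ```
    /// use packed_seq::{PackedSeqVec, SeqVec};
    /// let mut v = PackedSeqVec::default();
    /// v.push_ascii(b"ACTG");
    /// assert_eq!(v.as_slice().unpack(), b"ACTG");
    /// ```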
    #[inline(always)]
    pub fn unpack(&self) -> Vec<u8> {
        self.iter_bp().map(unpack_base).collect()
    }
}

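/// Read up to 32 bytes of `seq`, starting at byte `idx`, into a `u32x8`,
/// zero-padding when fewer than 32 bytes remain.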
#[inline(always)]
pub(crate) fn read_slice(seq: &[u8], idx: usize) -> u32x8 {
    // assert!(idx <= seq.len());
    let mut result = [0u8; 32];
    let num_bytes = 32.min(seq.len().saturating_sub(idx));
    unsafe {
        let src = seq.as_ptr().add(idx);
        std::ptr::copy_nonoverlapping(src, result.as_mut_ptr(), num_bytes);
        std::mem::transmute(result)
    }
}

impl<'s> Seq<'s> for PackedSeq<'s> {
    const BASES_PER_BYTE: usize = 4;
    const BITS_PER_CHAR: usize = 2;
    type SeqVec = PackedSeqVec;

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn get(&self, index: usize) -> u8 {
        let offset = self.offset + index;
        let idx = offset / 4;
        let offset = offset % 4;
        (self.seq[idx] >> (2 * offset)) & 3
    }

    #[inline(always)]
    fn get_ascii(&self, index: usize) -> u8 {
        unpack_base(self.get(index))
    }

    /// Convert a short sequence (kmer) to a packed representation as `u64`.
    /// Panics if `self` is longer than 32 characters.
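    ///
    /// A sketch (assuming crate-root re-exports):
    /// ```
    /// use packed_seq::{PackedSeqVec, Seq, SeqVec};
    /// let mut v = PackedSeqVec::default();
    /// v.push_ascii(b"ACT");
    /// // A=0 in the low bits, then C=1, then T=2.
    /// assert_eq!(v.as_slice().as_u64(), 0b10_01_00);
    /// ```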
    #[inline(always)]
    fn as_u64(&self) -> u64 {
        assert!(self.len() <= 32);
        debug_assert!(self.seq.len() <= 9);

        let mask = u64::MAX >> (64 - 2 * self.len());

        // The unaligned read is OK, because we ensure that the underlying `PackedSeqVec::seq` always
        // has at least 16 bytes (the size of a u128) of padding at the end.
        if self.len() <= 29 {
            let x = unsafe { (self.seq.as_ptr() as *const u64).read_unaligned() };
            (x >> (2 * self.offset)) & mask
        } else {
            let x = unsafe { (self.seq.as_ptr() as *const u128).read_unaligned() };
            (x >> (2 * self.offset)) as u64 & mask
        }
    }

    /// Convert a short sequence (kmer) to a packed representation of its reverse complement as `u64`.
    /// Panics if `self` is longer than 32 characters.
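    ///
    /// Continuing the `as_u64` sketch:
    /// ```
    /// use packed_seq::{PackedSeqVec, Seq, SeqVec};
    /// let mut v = PackedSeqVec::default();
    /// v.push_ascii(b"ACT");
    /// // revcomp("ACT") == "AGT": A=0, G=3, T=2.
    /// assert_eq!(v.as_slice().revcomp_as_u64(), 0b10_11_00);
    /// ```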
    #[inline(always)]
    fn revcomp_as_u64(&self) -> u64 {
        revcomp_u64(self.as_u64(), self.len())
    }

    #[inline(always)]
    fn to_vec(&self) -> PackedSeqVec {
        assert_eq!(self.offset, 0);
        PackedSeqVec {
            seq: self
                .seq
                .iter()
                .copied()
                .chain(std::iter::repeat_n(0u8, PADDING))
                .collect(),
            len: self.len,
        }
    }

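    /// Compute the reverse complement as a newly allocated `PackedSeqVec`.
    ///
    /// A sketch (assuming crate-root re-exports):
    /// ```
    /// use packed_seq::{PackedSeqVec, Seq, SeqVec};
    /// let mut v = PackedSeqVec::default();
    /// v.push_ascii(b"AACG");
    /// assert_eq!(v.as_slice().to_revcomp().as_slice().unpack(), b"CGTT");
    /// ```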
    fn to_revcomp(&self) -> PackedSeqVec {
        let mut seq = self.seq[..(self.offset + self.len).div_ceil(4)]
            .iter()
            // 1. reverse the bytes
            .rev()
            .copied()
            .map(|mut res| {
                // 2. reverse and complement the bases within each byte
                // This is auto-vectorized.
                res = ((res >> 4) & 0x0F) | ((res & 0x0F) << 4);
                res = ((res >> 2) & 0x33) | ((res & 0x33) << 2);
                res ^ 0xAA
            })
            .chain(std::iter::repeat_n(0u8, PADDING))
            .collect::<Vec<u8>>();

        // 3. Shift away the `new_offset` leading garbage bases.
        let new_offset = (4 - (self.offset + self.len) % 4) % 4;

        if new_offset > 0 {
            // Shift all bases down by `new_offset` positions,
            // i.e., a right shift by `2 * new_offset` bits.
            let shift = 2 * new_offset;
            *seq.last_mut().unwrap() >>= shift;
            // This loop is also auto-vectorized.
            for i in 0..seq.len() - 1 {
                seq[i] = (seq[i] >> shift) | (seq[i + 1] << (8 - shift));
            }
        }

        PackedSeqVec { seq, len: self.len }
    }

    #[inline(always)]
    fn slice(&self, range: Range<usize>) -> Self {
        debug_assert!(
            range.end <= self.len,
            "Slice index out of bounds: {} > {}",
            range.end,
            self.len
        );
        PackedSeq {
            seq: self.seq,
            offset: self.offset + range.start,
            len: range.end - range.start,
        }
        .normalize()
    }

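    /// Iterate the 2-bit bases front to back.
    ///
    /// A sketch (assuming crate-root re-exports):
    /// ```
    /// use packed_seq::{PackedSeqVec, Seq, SeqVec};
    /// let mut v = PackedSeqVec::default();
    /// v.push_ascii(b"ACTG");
    /// assert_eq!(v.as_slice().iter_bp().collect::<Vec<u8>>(), vec![0, 1, 2, 3]);
    /// ```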
    #[inline(always)]
    fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> + Clone {
        assert!(self.len <= self.seq.len() * 4);

        let this = self.normalize();

        // read u64 at a time?
        let mut byte = 0;
        let mut it = (0..this.len + this.offset).map(
            #[inline(always)]
            move |i| {
                if i % 4 == 0 {
                    byte = this.seq[i / 4];
                }
                // Shift byte instead of i?
                (byte >> (2 * (i % 4))) & 0b11
            },
        );
        it.by_ref().take(this.offset).for_each(drop);
        it
    }

    #[inline(always)]
    fn par_iter_bp(self, context: usize) -> (impl ExactSizeIterator<Item = S> + Clone, usize) {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < 4);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // Without `+ o`, since the offset is not needed in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(4);
        let bytes_per_chunk = n / 4;
        let padding = 4 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut cur = S::ZERO;

        // Boxed, so it doesn't consume precious registers.
        // Without this, `cur` is not always kept in a register.
        let mut buf = Box::new([S::ZERO; 8]);

        // We skip the first `o` iterations.
        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };
        let mut it = (0..par_len).map(
            #[inline(always)]
            move |i| {
                if i % 16 == 0 {
                    if i % 128 == 0 {
                        // Read a u256 for each lane, containing the next 128 characters.
                        let data: [u32x8; 8] = from_fn(
                            #[inline(always)]
                            |lane| read_slice(this.seq, offsets[lane] + (i / 4)),
                        );
                        *buf = transpose(data);
                    }
                    cur = buf[(i % 128) / 16];
                }
                // Extract the last 2 bits of each character.
                let chars = cur & S::splat(0x03);
                // Shift remaining characters to the right.
                cur = cur >> S::splat(2);
                chars
            },
        );
        // Drop the first few chars.
        it.by_ref().take(o).for_each(drop);

        (it, padding)
    }

    /// NOTE: When `self` does not start at a byte boundary, the 'delayed'
    /// character is not guaranteed to be `0`.
    #[inline(always)]
    fn par_iter_bp_delayed(
        self,
        context: usize,
        delay: usize,
    ) -> (impl ExactSizeIterator<Item = (S, S)> + Clone, usize) {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(
            delay < usize::MAX / 2,
            "Delay={} should be >=0.",
            delay as isize
        );

        let this = self.normalize();
        let o = this.offset;
        assert!(o < 4);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // Without `+ o`, since the offset is not needed in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(4);
        let bytes_per_chunk = n / 4;
        let padding = 4 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d = S::ZERO;

        // An even `buf_len` is nice, so the `write_idx == buf_len` check is needed only once.
        // We also round it up to the next power of two, for faster modulo operations.
        // A u32 holds 16 bp, so the delay spans `delay / 16` buffer elements.
        let buf_len = (delay / 16 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        let mut buf = vec![S::ZERO; buf_len];
        let mut write_idx = 0;
        // We compensate for the first `delay / 16` triggers of the check below that
        // happen before the delay is actually reached.
        let mut read_idx = (buf_len - delay / 16) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };
        let mut it = (0..par_len).map(
            #[inline(always)]
            move |i| {
                if i % 16 == 0 {
                    if i % 128 == 0 {
                        // Read a u256 for each lane, containing the next 128 characters.
                        let data: [u32x8; 8] = from_fn(
                            #[inline(always)]
                            |lane| read_slice(this.seq, offsets[lane] + (i / 4)),
                        );
                        unsafe {
                            *TryInto::<&mut [u32x8; 8]>::try_into(
                                buf.get_unchecked_mut(write_idx..write_idx + 8),
                            )
                            .unwrap_unchecked() = transpose(data);
                        }
                        if i == 0 {
                            // Mask out chars before the offset.
                            let elem = !((1u32 << (2 * o)) - 1);
                            let mask = S::splat(elem);
                            buf[write_idx] &= mask;
                        }
                    }
                    upcoming = buf[write_idx];
                    write_idx += 1;
                    write_idx &= buf_mask;
                }
                if i % 16 == delay % 16 {
                    unsafe { assert_unchecked(read_idx < buf.len()) };
                    upcoming_d = buf[read_idx];
                    read_idx += 1;
                    read_idx &= buf_mask;
                }
                // Extract the last 2 bits of each character.
                let chars = upcoming & S::splat(0x03);
                let chars_d = upcoming_d & S::splat(0x03);
                // Shift remaining characters to the right.
                upcoming = upcoming >> S::splat(2);
                upcoming_d = upcoming_d >> S::splat(2);
                (chars, chars_d)
            },
        );
        it.by_ref().take(o).for_each(drop);

        (it, padding)
    }

    /// NOTE: When `self` does not start at a byte boundary, the 'delayed'
    /// character is not guaranteed to be `0`.
    #[inline(always)]
    fn par_iter_bp_delayed_2(
        self,
        context: usize,
        delay1: usize,
        delay2: usize,
    ) -> (impl ExactSizeIterator<Item = (S, S, S)> + Clone, usize) {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < 4);
        assert!(delay1 <= delay2, "Delay1 must be at most delay2.");

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // Without `+ o`, since the offset is not needed in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(4);
        let bytes_per_chunk = n / 4;
        let padding = 4 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d1 = S::ZERO;
        let mut upcoming_d2 = S::ZERO;

        // An even `buf_len` is nice, so the `write_idx == buf_len` check is needed only once.
        // We also round it up to the next power of two, for faster modulo operations.
        let buf_len = (delay2 / 16 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        let mut buf = vec![S::ZERO; buf_len];
        let mut write_idx = 0;
        // We compensate for the first `delay1 / 16` resp. `delay2 / 16` triggers of the
        // checks below that happen before the corresponding delay is actually reached.
        let mut read_idx1 = (buf_len - delay1 / 16) % buf_len;
        let mut read_idx2 = (buf_len - delay2 / 16) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };
        let mut it = (0..par_len).map(
            #[inline(always)]
            move |i| {
                if i % 16 == 0 {
                    if i % 128 == 0 {
                        // Read a u256 for each lane, containing the next 128 characters.
                        let data: [u32x8; 8] = from_fn(
                            #[inline(always)]
                            |lane| read_slice(this.seq, offsets[lane] + (i / 4)),
                        );
                        unsafe {
                            *TryInto::<&mut [u32x8; 8]>::try_into(
                                buf.get_unchecked_mut(write_idx..write_idx + 8),
                            )
                            .unwrap_unchecked() = transpose(data);
                        }
                        if i == 0 {
                            // Mask out chars before the offset.
                            let elem = !((1u32 << (2 * o)) - 1);
                            let mask = S::splat(elem);
                            buf[write_idx] &= mask;
                        }
                    }
                    upcoming = buf[write_idx];
                    write_idx += 1;
                    write_idx &= buf_mask;
                }
                if i % 16 == delay1 % 16 {
                    unsafe { assert_unchecked(read_idx1 < buf.len()) };
                    upcoming_d1 = buf[read_idx1];
                    read_idx1 += 1;
                    read_idx1 &= buf_mask;
                }
                if i % 16 == delay2 % 16 {
                    unsafe { assert_unchecked(read_idx2 < buf.len()) };
                    upcoming_d2 = buf[read_idx2];
                    read_idx2 += 1;
                    read_idx2 &= buf_mask;
                }
                // Extract the last 2 bits of each character.
                let chars = upcoming & S::splat(0x03);
                let chars_d1 = upcoming_d1 & S::splat(0x03);
                let chars_d2 = upcoming_d2 & S::splat(0x03);
                // Shift remaining characters to the right.
                upcoming = upcoming >> S::splat(2);
                upcoming_d1 = upcoming_d1 >> S::splat(2);
                upcoming_d2 = upcoming_d2 >> S::splat(2);
                (chars, chars_d1, chars_d2)
            },
        );
        it.by_ref().take(o).for_each(drop);

        (it, padding)
    }

    /// Lexicographically compare `self` and `other` in packed base order, and
    /// additionally return the length of their longest common prefix.
    /// Compares 29 characters at a time.
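    ///
    /// A sketch (assuming crate-root re-exports):
    /// ```
    /// use packed_seq::{PackedSeqVec, Seq, SeqVec};
    /// let (mut a, mut b) = (PackedSeqVec::default(), PackedSeqVec::default());
    /// a.push_ascii(b"ACT");
    /// b.push_ascii(b"ACG");
    /// // The common prefix "AC" has length 2, and T=2 sorts before G=3 in packed order.
    /// assert_eq!(a.as_slice().cmp_lcp(&b.as_slice()), (std::cmp::Ordering::Less, 2));
    /// ```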
    fn cmp_lcp(&self, other: &Self) -> (std::cmp::Ordering, usize) {
        let mut lcp = 0;
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(29) {
            let len = (min_len - i).min(29);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                // Bases are packed in little-endian order, so the words cannot be
                // compared directly; find the first differing base instead.
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() / 2 * 2;
                lcp += t as usize / 2;
                let mask = 0b11 << t;
                return ((this_word & mask).cmp(&(other_word & mask)), lcp);
            }
            lcp += len;
        }
        (self.len.cmp(&other.len), lcp)
    }
}

impl PartialEq for PackedSeq<'_> {
    /// Compares 29 characters at a time.
    fn eq(&self, other: &Self) -> bool {
        if self.len != other.len {
            return false;
        }
        for i in (0..self.len).step_by(29) {
            let len = (self.len - i).min(29);
            let this = self.slice(i..i + len);
            let that = other.slice(i..i + len);
            if this.as_u64() != that.as_u64() {
                return false;
            }
        }
        true
    }
}

impl Eq for PackedSeq<'_> {}

impl PartialOrd for PackedSeq<'_> {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for PackedSeq<'_> {
    /// Compares 29 characters at a time.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(29) {
            let len = (min_len - i).min(29);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                // Bases are packed in little-endian order, so the words cannot be
                // compared directly; find the first differing base instead.
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() / 2 * 2;
                let mask = 0b11 << t;
                return (this_word & mask).cmp(&(other_word & mask));
            }
        }
        self.len.cmp(&other.len)
    }
}

impl SeqVec for PackedSeqVec {
    type Seq<'s> = PackedSeq<'s>;

    #[inline(always)]
    fn into_raw(mut self) -> Vec<u8> {
        self.seq.resize(self.len.div_ceil(4), 0);
        self.seq
    }

    #[inline(always)]
    fn as_slice(&self) -> Self::Seq<'_> {
        PackedSeq {
            seq: &self.seq[..self.len.div_ceil(4)],
            offset: 0,
            len: self.len,
        }
    }

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn clear(&mut self) {
        self.seq.clear();
        self.len = 0;
    }

    fn push_seq<'a>(&mut self, seq: PackedSeq<'_>) -> Range<usize> {
        let start = self.len.next_multiple_of(4) + seq.offset;
        let end = start + seq.len();
        // Reserve *additional* capacity.
        self.seq.reserve(seq.seq.len());
        // Shrink away the padding.
        self.seq.resize(self.len.div_ceil(4), 0);
        // Extend.
        self.seq.extend(seq.seq);
        // Push padding.
        self.seq.extend(std::iter::repeat_n(0u8, PADDING));
        self.len = end;
        start..end
    }

    /// Push an ASCII sequence onto a `PackedSeqVec`.
    /// `Aa` map to `0`, `Cc` to `1`, `Gg` to `3`, and `Tt` to `2`.
    /// Other characters may be silently mapped into `[0, 4)` or panic.
    ///
    /// Uses the BMI2 `pext` instruction when available, based on the
    /// `n_to_bits_pext` method described at
    /// <https://github.com/Daniel-Liu-c0deb0t/cute-nucleotides>.
    ///
    /// TODO: Explicitly support multiple ways of dealing with non-`ACTG` characters.
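    ///
    /// A sketch (assuming crate-root re-exports):
    /// ```
    /// use packed_seq::{PackedSeqVec, Seq, SeqVec};
    /// let mut v = PackedSeqVec::default();
    /// let range = v.push_ascii(b"ACGT");
    /// assert_eq!(range, 0..4);
    /// assert_eq!(v.as_slice().get_ascii(3), b'T');
    /// ```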
    fn push_ascii(&mut self, seq: &[u8]) -> Range<usize> {
        self.seq
            .resize((self.len + seq.len()).div_ceil(4) + PADDING, 0);
        let start_aligned = self.len.next_multiple_of(4);
        let start = self.len;
        let len = seq.len();
        let mut idx = self.len / 4;

        let unaligned = core::cmp::min(start_aligned - start, len);
        if unaligned > 0 {
            let mut packed_byte = self.seq[idx];
            for &base in &seq[..unaligned] {
                packed_byte |= pack_char(base) << ((self.len % 4) * 2);
                self.len += 1;
            }
            self.seq[idx] = packed_byte;
            idx += 1;
        }

        #[allow(unused)]
        let mut last = unaligned;

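        // For ASCII `ACGTacgt`, bits 1..=2 of each byte already equal the 2-bit
        // code (A=00, C=01, T=10, G=11), so a single `pext` with mask
        // 0x0606060606060606 packs 8 characters into 16 bits.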
        #[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
        {
            last = unaligned + (len - unaligned) / 8 * 8;

            for i in (unaligned..last).step_by(8) {
                let chunk: [u8; 8] = seq[i..i + 8].try_into().unwrap();
                let ascii = u64::from_ne_bytes(chunk);
                let packed_bytes =
                    unsafe { std::arch::x86_64::_pext_u64(ascii, 0x0606060606060606) };
                self.seq[idx] = packed_bytes as u8;
                idx += 1;
                self.seq[idx] = (packed_bytes >> 8) as u8;
                idx += 1;
                self.len += 8;
            }
        }

        #[cfg(target_feature = "neon")]
        {
            use core::arch::aarch64::{vandq_u8, vdup_n_u8, vld1q_u8, vpadd_u8, vshlq_u8, vst1_u8};
            use core::mem::transmute;

            last = unaligned + (len - unaligned) / 16 * 16;

            for i in (unaligned..last).step_by(16) {
                unsafe {
                    let ascii = vld1q_u8(seq.as_ptr().add(i));
                    let masked_bits = vandq_u8(ascii, transmute([6i8; 16]));
                    let (bits_0, bits_1) = transmute(vshlq_u8(
                        masked_bits,
                        transmute([-1i8, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5]),
                    ));
                    let half_packed = vpadd_u8(bits_0, bits_1);
                    let packed = vpadd_u8(half_packed, vdup_n_u8(0));
                    vst1_u8(self.seq.as_mut_ptr().add(idx), packed);
                    idx += 4;
                    self.len += 16;
                }
            }
        }

        let mut packed_byte = 0;
        for &base in &seq[last..] {
            packed_byte |= pack_char(base) << ((self.len % 4) * 2);
            self.len += 1;
            if self.len % 4 == 0 {
                self.seq[idx] = packed_byte;
                idx += 1;
                packed_byte = 0;
            }
        }
        if self.len % 4 != 0 && last < len {
            self.seq[idx] = packed_byte;
            idx += 1;
        }
        assert_eq!(idx + PADDING, self.seq.len());
        start..start + len
    }

    #[cfg(feature = "rand")]
    fn random(n: usize) -> Self {
        use rand::{RngCore, SeedableRng};

        let byte_len = n.div_ceil(4);
        let mut seq = vec![0; byte_len + PADDING];
        rand::rngs::SmallRng::from_os_rng().fill_bytes(&mut seq[..byte_len]);
        // Zero the unused high bits of the last data byte.
        if n % 4 != 0 {
            seq[byte_len - 1] &= (1 << (2 * (n % 4))) - 1;
        }

        Self { seq, len: n }
    }
}