packed_seq/packed_seq.rs

use traits::Seq;

use crate::intrinsics::transpose;

use super::*;

/// A 2-bit packed non-owned slice of DNA bases.
#[derive(Copy, Clone, Debug, MemSize, MemDbg)]
pub struct PackedSeq<'s> {
    /// Packed data.
    seq: &'s [u8],
    /// Offset in bp from the start of `seq`.
    offset: usize,
    /// Length of the sequence in bp, starting at `offset` from the start of `seq`.
    len: usize,
}

/// Number of padding bytes at the end of `PackedSeqVec::seq`.
const PADDING: usize = 16;

/// A 2-bit packed owned sequence of DNA bases.
#[derive(Clone, Debug, MemSize, MemDbg)]
#[cfg_attr(feature = "pyo3", pyo3::pyclass)]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
pub struct PackedSeqVec {
    /// NOTE: We maintain the invariant that this has at least 16 bytes of padding
    /// at the end, beyond the last byte covered by `len`.
    /// This ensures that the `read_unaligned` in `as_u64` stays in bounds.
    pub(crate) seq: Vec<u8>,

    /// The length, in bp, of the underlying sequence. See `.len()`.
    len: usize,
}

impl Default for PackedSeqVec {
    fn default() -> Self {
        Self {
            seq: vec![0; PADDING],
            len: 0,
        }
    }
}

/// Pack an ASCII `ACTGactg` character into its 2-bit representation, and panic for anything else.
#[inline(always)]
pub fn pack_char(base: u8) -> u8 {
    match base {
        b'a' | b'A' => 0,
        b'c' | b'C' => 1,
        b'g' | b'G' => 3,
        b't' | b'T' => 2,
        _ => panic!(
            "Unexpected character '{}' with ASCII value {base}. Expected one of ACTGactg.",
            base as char
        ),
    }
}

/// Pack an ASCII `ACTGactg` character into its 2-bit representation, and silently convert other characters into 0..4 as well.
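/// This works because bits 1 and 2 of the ASCII codes already encode the base:
/// `A`/`a` give `0b00`, `C`/`c` give `0b01`, `T`/`t` give `0b10`, and `G`/`g` give `0b11`,
/// matching `pack_char`. Any other byte is mapped to whatever its bits 1 and 2 happen to be.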
#[inline(always)]
pub fn pack_char_lossy(base: u8) -> u8 {
    (base >> 1) & 3
}

/// Unpack a 2-bit DNA base into the corresponding `ACTG` character.
#[inline(always)]
pub fn unpack_base(base: u8) -> u8 {
    debug_assert!(base < 4, "Base {base} is not <4.");
    b"ACTG"[base as usize]
}

/// Complement an ASCII character: `A<>T` and `C<>G`.
#[inline(always)]
pub const fn complement_char(base: u8) -> u8 {
    match base {
        b'A' => b'T',
        b'C' => b'G',
        b'G' => b'C',
        b'T' => b'A',
        _ => panic!("Unexpected character. Expected one of ACGT."),
    }
}

/// Complement a 2-bit base: `0<>2` and `1<>3`.
#[inline(always)]
pub const fn complement_base(base: u8) -> u8 {
    base ^ 2
}

/// Complement 8 lanes of 2-bit bases: `0<>2` and `1<>3`.
#[inline(always)]
pub fn complement_base_simd(base: u32x8) -> u32x8 {
    base ^ u32x8::splat(2)
}

/// Compute the reverse complement of a short sequence packed in a `u64`.
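///
/// For example, the 3-mer `ACT` packs (little-endian, 2 bits per base) to `0b10_01_00`,
/// and `revcomp_u64(0b10_01_00, 3)` returns `0b10_11_00`, i.e. the packed form of `AGT`.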
#[inline(always)]
pub const fn revcomp_u64(word: u64, len: usize) -> u64 {
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    {
        let mut res = word.reverse_bits(); // ARM can reverse bits in a single instruction
        res = ((res >> 1) & 0x5555_5555_5555_5555) | ((res & 0x5555_5555_5555_5555) << 1);
        res ^= 0xAAAA_AAAA_AAAA_AAAA;
        res >> (u64::BITS as usize - 2 * len)
    }

    #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
    {
        let mut res = word.swap_bytes();
        res = ((res >> 4) & 0x0F0F_0F0F_0F0F_0F0F) | ((res & 0x0F0F_0F0F_0F0F_0F0F) << 4);
        res = ((res >> 2) & 0x3333_3333_3333_3333) | ((res & 0x3333_3333_3333_3333) << 2);
        res ^= 0xAAAA_AAAA_AAAA_AAAA;
        res >> (u64::BITS as usize - 2 * len)
    }
}

impl PackedSeq<'_> {
    /// Shrink `seq` to only just cover the data.
    #[inline(always)]
    pub fn normalize(&self) -> Self {
        let start = self.offset / 4;
        let end = (self.offset + self.len).div_ceil(4);
        Self {
            seq: &self.seq[start..end],
            offset: self.offset % 4,
            len: self.len,
        }
    }

    /// Return a `Vec<u8>` of ASCII `ACTG` characters.
    #[inline(always)]
    pub fn unpack(&self) -> Vec<u8> {
        self.iter_bp().map(unpack_base).collect()
    }
}

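/// Read up to 32 bytes of `seq` starting at `idx` into a `u32x8`,
/// zero-padding the result when fewer than 32 bytes remain.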
#[inline(always)]
pub(crate) fn read_slice(seq: &[u8], idx: usize) -> u32x8 {
    // assert!(idx <= seq.len());
    let mut result = [0u8; 32];
    let num_bytes = 32.min(seq.len().saturating_sub(idx));
    unsafe {
        let src = seq.as_ptr().add(idx);
        std::ptr::copy_nonoverlapping(src, result.as_mut_ptr(), num_bytes);
        std::mem::transmute(result)
    }
}

impl<'s> Seq<'s> for PackedSeq<'s> {
    const BASES_PER_BYTE: usize = 4;
    const BITS_PER_CHAR: usize = 2;
    type SeqVec = PackedSeqVec;

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn get(&self, index: usize) -> u8 {
        let offset = self.offset + index;
        let idx = offset / 4;
        let offset = offset % 4;
        (self.seq[idx] >> (2 * offset)) & 3
    }

    #[inline(always)]
    fn get_ascii(&self, index: usize) -> u8 {
        unpack_base(self.get(index))
    }

    /// Convert a short sequence (kmer) to its packed representation as a `u64`.
    /// Panics if `self` is longer than 32 characters.
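    ///
    /// Bases are packed little-endian: the base at index 0 occupies the two least-significant bits of the result.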
    #[inline(always)]
    fn as_u64(&self) -> u64 {
        assert!(self.len() <= 32);
        debug_assert!(self.seq.len() <= 9);

        let mask = u64::MAX >> (64 - 2 * self.len());

        // The unaligned read is OK, because we ensure that the underlying `PackedSeqVec::seq` always
        // has at least 16 bytes (the size of a u128) of padding at the end.
        if self.len() <= 29 {
            let x = unsafe { (self.seq.as_ptr() as *const u64).read_unaligned() };
            (x >> (2 * self.offset)) & mask
        } else {
            let x = unsafe { (self.seq.as_ptr() as *const u128).read_unaligned() };
            (x >> (2 * self.offset)) as u64 & mask
        }
    }

    /// Convert a short sequence (kmer) to the packed representation of its reverse complement as a `u64`.
    /// Panics if `self` is longer than 32 characters.
    #[inline(always)]
    fn revcomp_as_u64(&self) -> u64 {
        revcomp_u64(self.as_u64(), self.len())
    }

    #[inline(always)]
    fn to_vec(&self) -> PackedSeqVec {
        assert_eq!(self.offset, 0);
        PackedSeqVec {
            seq: self
                .seq
                .iter()
                .copied()
                .chain(std::iter::repeat_n(0u8, PADDING))
                .collect(),
            len: self.len,
        }
    }

    fn to_revcomp(&self) -> PackedSeqVec {
        let mut seq = self.seq[..(self.offset + self.len).div_ceil(4)]
            .iter()
            // 1. reverse the bytes
            .rev()
            .copied()
            .map(|mut res| {
                // 2. swap the bases in the byte
                // This is auto-vectorized.
                res = ((res >> 4) & 0x0F) | ((res & 0x0F) << 4);
                res = ((res >> 2) & 0x33) | ((res & 0x33) << 2);
                res ^ 0xAA
            })
            .chain(std::iter::repeat_n(0u8, PADDING))
            .collect::<Vec<u8>>();

        // 3. Shift away the offset.
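        // After reversing, the bases that originally padded the last byte (past
        // `offset + len`) now sit at the front, so the whole sequence must be
        // shifted down by `new_offset` positions to start at offset 0 again.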
        let new_offset = (4 - (self.offset + self.len) % 4) % 4;

        if new_offset > 0 {
            // Shift all bases down by `new_offset` positions, i.e. each byte right by `2*new_offset` bits.
            let shift = 2 * new_offset;
            *seq.last_mut().unwrap() >>= shift;
            // This loop is also auto-vectorized.
            for i in 0..seq.len() - 1 {
                seq[i] = (seq[i] >> shift) | (seq[i + 1] << (8 - shift));
            }
        }

        PackedSeqVec { seq, len: self.len }
    }

    #[inline(always)]
    fn slice(&self, range: Range<usize>) -> Self {
        debug_assert!(
            range.end <= self.len,
            "Slice index out of bounds: {} > {}",
            range.end,
            self.len
        );
        PackedSeq {
            seq: self.seq,
            offset: self.offset + range.start,
            len: range.end - range.start,
        }
        .normalize()
    }

    #[inline(always)]
    fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> + Clone {
        assert!(self.len <= self.seq.len() * 4);

        let this = self.normalize();

        // read u64 at a time?
        let mut byte = 0;
        let mut it = (0..this.len + this.offset).map(
            #[inline(always)]
            move |i| {
                if i % 4 == 0 {
                    byte = this.seq[i / 4];
                }
                // Shift byte instead of i?
                (byte >> (2 * (i % 4))) & 0b11
            },
        );
        it.by_ref().take(this.offset).for_each(drop);
        it
    }

    #[inline(always)]
    fn par_iter_bp(self, context: usize) -> (impl ExactSizeIterator<Item = S> + Clone, usize) {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < 4);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // without +o, since we don't need them in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(4);
        let bytes_per_chunk = n / 4;
        let padding = 4 * L * bytes_per_chunk - num_kmers_stride;

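        // The sequence is split into `L` chunks of `bytes_per_chunk` bytes each;
        // chunk `l` starts at byte offset `l * bytes_per_chunk` and is streamed through
        // SIMD lane `l`. The returned `padding` counts how many trailing positions are
        // filler beyond the `num_kmers_stride` real k-mer starts.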
        let offsets: [usize; 8] = from_fn(|l| (l * bytes_per_chunk));
        let mut cur = S::ZERO;

        // Boxed, so it doesn't consume precious registers.
        // Without this, cur is not always inlined into a register.
        let mut buf = Box::new([S::ZERO; 8]);

        // We skip the first o iterations.
        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };
        let mut it = (0..par_len).map(
            #[inline(always)]
            move |i| {
                if i % 16 == 0 {
                    if i % 128 == 0 {
                        // Read a u256 for each lane containing the next 128 characters.
                        let data: [u32x8; 8] = from_fn(
                            #[inline(always)]
                            |lane| read_slice(this.seq, offsets[lane] + (i / 4)),
                        );
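                        // `transpose` turns the 8 per-lane reads into 8 vectors, where
                        // vector `j` holds the `j`-th u32 (16 bases) of every lane.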
                        *buf = transpose(data);
                    }
                    cur = buf[(i % 128) / 16];
                }
                // Extract the last 2 bits of each character.
                let chars = cur & S::splat(0x03);
                // Shift remaining characters to the right.
                cur = cur >> S::splat(2);
                chars
            },
        );
        // Drop the first few chars.
        it.by_ref().take(o).for_each(drop);

        (it, padding)
    }

    /// NOTE: When `self` does not start at a byte boundary, the
    /// 'delayed' character is not guaranteed to be `0`.
    #[inline(always)]
    fn par_iter_bp_delayed(
        self,
        context: usize,
        delay: usize,
    ) -> (impl ExactSizeIterator<Item = (S, S)> + Clone, usize) {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(
            delay < usize::MAX / 2,
            "Delay={} should be >=0.",
            delay as isize
        );

        let this = self.normalize();
        let o = this.offset;
        assert!(o < 4);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // without +o, since we don't need them in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(4);
        let bytes_per_chunk = n / 4;
        let padding = 4 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| (l * bytes_per_chunk));
        let mut upcoming = S::ZERO;
        let mut upcoming_d = S::ZERO;

        // `buf_len` is rounded up to a power of two so that the read/write indices
        // can be wrapped with a cheap mask instead of a modulo.
        // `delay / 16`: the delay measured in u32 words of 16 bp each.
        let buf_len = (delay / 16 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        let mut buf = vec![S::ZERO; buf_len];
        let mut write_idx = 0;
        // We compensate for the first delay/16 triggers of the check below that
        // happen before the delay is actually reached.
        let mut read_idx = (buf_len - delay / 16) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };
        let mut it = (0..par_len).map(
            #[inline(always)]
            move |i| {
                if i % 16 == 0 {
                    if i % 128 == 0 {
                        // Read a u256 for each lane containing the next 128 characters.
                        let data: [u32x8; 8] = from_fn(
                            #[inline(always)]
                            |lane| read_slice(this.seq, offsets[lane] + (i / 4)),
                        );
                        unsafe {
                            *TryInto::<&mut [u32x8; 8]>::try_into(
                                buf.get_unchecked_mut(write_idx..write_idx + 8),
                            )
                            .unwrap_unchecked() = transpose(data);
                        }
                        if i == 0 {
                            // Mask out chars before the offset.
                            let elem = !((1u32 << (2 * o)) - 1);
                            let mask = S::splat(elem);
                            buf[write_idx] &= mask;
                        }
                    }
                    upcoming = buf[write_idx];
                    write_idx += 1;
                    write_idx &= buf_mask;
                }
                if i % 16 == delay % 16 {
                    unsafe { assert_unchecked(read_idx < buf.len()) };
                    upcoming_d = buf[read_idx];
                    read_idx += 1;
                    read_idx &= buf_mask;
                }
                // Extract the last 2 bits of each character.
                let chars = upcoming & S::splat(0x03);
                let chars_d = upcoming_d & S::splat(0x03);
                // Shift remaining characters to the right.
                upcoming = upcoming >> S::splat(2);
                upcoming_d = upcoming_d >> S::splat(2);
                (chars, chars_d)
            },
        );
        it.by_ref().take(o).for_each(drop);

        (it, padding)
    }

    /// NOTE: When `self` does not start at a byte boundary, the
    /// 'delayed' character is not guaranteed to be `0`.
    #[inline(always)]
    fn par_iter_bp_delayed_2(
        self,
        context: usize,
        delay1: usize,
        delay2: usize,
    ) -> (impl ExactSizeIterator<Item = (S, S, S)> + Clone, usize) {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < 4);
        assert!(delay1 <= delay2, "Delay1 must be at most delay2.");

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        // without +o, since we don't need them in the stride.
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(4);
        let bytes_per_chunk = n / 4;
        let padding = 4 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| (l * bytes_per_chunk));
        let mut upcoming = S::ZERO;
        let mut upcoming_d1 = S::ZERO;
        let mut upcoming_d2 = S::ZERO;

        // `buf_len` is rounded up to a power of two so the indices can be wrapped with a mask.
        let buf_len = (delay2 / 16 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        let mut buf = vec![S::ZERO; buf_len];
        let mut write_idx = 0;
        // We compensate for the first delay/16 triggers of the check below that
        // happen before the delay is actually reached.
        let mut read_idx1 = (buf_len - delay1 / 16) % buf_len;
        let mut read_idx2 = (buf_len - delay2 / 16) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };
        let mut it = (0..par_len).map(
            #[inline(always)]
            move |i| {
                if i % 16 == 0 {
                    if i % 128 == 0 {
                        // Read a u256 for each lane containing the next 128 characters.
                        let data: [u32x8; 8] = from_fn(
                            #[inline(always)]
                            |lane| read_slice(this.seq, offsets[lane] + (i / 4)),
                        );
                        unsafe {
                            *TryInto::<&mut [u32x8; 8]>::try_into(
                                buf.get_unchecked_mut(write_idx..write_idx + 8),
                            )
                            .unwrap_unchecked() = transpose(data);
                        }
                        if i == 0 {
                            // Mask out chars before the offset.
                            let elem = !((1u32 << (2 * o)) - 1);
                            let mask = S::splat(elem);
                            buf[write_idx] &= mask;
                        }
                    }
                    upcoming = buf[write_idx];
                    write_idx += 1;
                    write_idx &= buf_mask;
                }
                if i % 16 == delay1 % 16 {
                    unsafe { assert_unchecked(read_idx1 < buf.len()) };
                    upcoming_d1 = buf[read_idx1];
                    read_idx1 += 1;
                    read_idx1 &= buf_mask;
                }
                if i % 16 == delay2 % 16 {
                    unsafe { assert_unchecked(read_idx2 < buf.len()) };
                    upcoming_d2 = buf[read_idx2];
                    read_idx2 += 1;
                    read_idx2 &= buf_mask;
                }
                // Extract the last 2 bits of each character.
                let chars = upcoming & S::splat(0x03);
                let chars_d1 = upcoming_d1 & S::splat(0x03);
                let chars_d2 = upcoming_d2 & S::splat(0x03);
                // Shift remaining characters to the right.
                upcoming = upcoming >> S::splat(2);
                upcoming_d1 = upcoming_d1 >> S::splat(2);
                upcoming_d2 = upcoming_d2 >> S::splat(2);
                (chars, chars_d1, chars_d2)
            },
        );
        it.by_ref().take(o).for_each(drop);

        (it, padding)
    }

    /// Compares 29 characters at a time.
    fn cmp_lcp(&self, other: &Self) -> (std::cmp::Ordering, usize) {
        let mut lcp = 0;
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(29) {
            let len = (min_len - i).min(29);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                // Unfortunately, bases are packed in little endian order, so the default order is reversed.
                let eq = this_word ^ other_word;
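                // Round the position of the first differing bit down to the start of
                // its 2-bit group; that group identifies the first differing base.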
                let t = eq.trailing_zeros() / 2 * 2;
                lcp += t as usize / 2;
                let mask = 0b11 << t;
                return ((this_word & mask).cmp(&(other_word & mask)), lcp);
            }
            lcp += len;
        }
        (self.len.cmp(&other.len), lcp)
    }
}

impl PartialEq for PackedSeq<'_> {
    /// Compares 29 characters at a time.
    fn eq(&self, other: &Self) -> bool {
        if self.len != other.len {
            return false;
        }
        for i in (0..self.len).step_by(29) {
            let len = (self.len - i).min(29);
            let this = self.slice(i..i + len);
            let that = other.slice(i..i + len);
            if this.as_u64() != that.as_u64() {
                return false;
            }
        }
        true
    }
}

impl Eq for PackedSeq<'_> {}

impl PartialOrd for PackedSeq<'_> {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for PackedSeq<'_> {
    /// Compares 29 characters at a time.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(29) {
            let len = (min_len - i).min(29);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                // Unfortunately, bases are packed in little endian order, so the default order is reversed.
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() / 2 * 2;
                let mask = 0b11 << t;
                return (this_word & mask).cmp(&(other_word & mask));
            }
        }
        self.len.cmp(&other.len)
    }
}

impl SeqVec for PackedSeqVec {
    type Seq<'s> = PackedSeq<'s>;

    #[inline(always)]
    fn into_raw(mut self) -> Vec<u8> {
        self.seq.resize(self.len.div_ceil(4), 0);
        self.seq
    }

    #[inline(always)]
    fn as_slice(&self) -> Self::Seq<'_> {
        PackedSeq {
            seq: &self.seq[..self.len.div_ceil(4)],
            offset: 0,
            len: self.len,
        }
    }

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn clear(&mut self) {
        self.seq.clear();
        self.len = 0;
    }

    fn push_seq<'a>(&mut self, seq: PackedSeq<'_>) -> Range<usize> {
        let start = self.len.next_multiple_of(4) + seq.offset;
        let end = start + seq.len();
        // Reserve *additional* capacity.
        self.seq.reserve(seq.seq.len());
        // Shrink away the padding.
        self.seq.resize(self.len.div_ceil(4), 0);
        // Extend.
        self.seq.extend(seq.seq);
        // Push padding.
        self.seq.extend(std::iter::repeat_n(0u8, PADDING));
        self.len = end;
        start..end
    }

    /// Push an ASCII sequence to a `PackedSeqVec`.
    /// `Aa` map to `0`, `Cc` to `1`, `Gg` to `3`, and `Tt` to `2`.
    /// Other characters are silently mapped into `0..4`.
    ///
    /// Uses the BMI2 `pext` instruction when available, based on the
    /// `n_to_bits_pext` method described at
    /// <https://github.com/Daniel-Liu-c0deb0t/cute-nucleotides>.
    ///
    /// TODO: Support multiple ways of dealing with non-`ACTG` characters:
    /// - panic on non-`ACGT`,
    /// - filter out non-`ACGT`.
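    ///
    /// Returns the range of positions (in bp) that the pushed sequence occupies in `self`.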
    fn push_ascii(&mut self, seq: &[u8]) -> Range<usize> {
        self.seq
            .resize((self.len + seq.len()).div_ceil(4) + PADDING, 0);
        let start_aligned = self.len.next_multiple_of(4);
        let start = self.len;
        let len = seq.len();
        let mut idx = self.len / 4;

        let unaligned = core::cmp::min(start_aligned - start, len);
        if unaligned > 0 {
            let mut packed_byte = self.seq[idx];
            for &base in &seq[..unaligned] {
                packed_byte |= pack_char_lossy(base) << ((self.len % 4) * 2);
                self.len += 1;
            }
            self.seq[idx] = packed_byte;
            idx += 1;
        }

        #[allow(unused)]
        let mut last = unaligned;

        #[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
        {
            last = unaligned + (len - unaligned) / 8 * 8;

            for i in (unaligned..last).step_by(8) {
                let chunk = &seq[i..i + 8].try_into().unwrap();
                let ascii = u64::from_ne_bytes(*chunk);
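                // `pext` gathers bits 1 and 2 of each of the 8 ASCII bytes (mask 0x06
                // per byte), yielding 16 bits = 8 packed 2-bit bases.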
                let packed_bytes =
                    unsafe { std::arch::x86_64::_pext_u64(ascii, 0x0606060606060606) };
                self.seq[idx] = packed_bytes as u8;
                idx += 1;
                self.seq[idx] = (packed_bytes >> 8) as u8;
                idx += 1;
                self.len += 8;
            }
        }

        #[cfg(target_feature = "neon")]
        {
            use core::arch::aarch64::{vandq_u8, vdup_n_u8, vld1q_u8, vpadd_u8, vshlq_u8, vst1_u8};
            use core::mem::transmute;

            last = unaligned + (len - unaligned) / 16 * 16;

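            // Per-byte shifts of [-1, 1, 3, 5] move each base's 2 bits to its final
            // position within a group of four bytes; two pairwise adds then collapse
            // every 4 ASCII bytes into 1 packed byte.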
            for i in (unaligned..last).step_by(16) {
                unsafe {
                    let ascii = vld1q_u8(seq.as_ptr().add(i));
                    let masked_bits = vandq_u8(ascii, transmute([6i8; 16]));
                    let (bits_0, bits_1) = transmute(vshlq_u8(
                        masked_bits,
                        transmute([-1i8, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5]),
                    ));
                    let half_packed = vpadd_u8(bits_0, bits_1);
                    let packed = vpadd_u8(half_packed, vdup_n_u8(0));
                    vst1_u8(self.seq.as_mut_ptr().add(idx), packed);
                    idx += 4;
                    self.len += 16;
                }
            }
        }

        let mut packed_byte = 0;
        for &base in &seq[last..] {
            packed_byte |= pack_char_lossy(base) << ((self.len % 4) * 2);
            self.len += 1;
            if self.len % 4 == 0 {
                self.seq[idx] = packed_byte;
                idx += 1;
                packed_byte = 0;
            }
        }
        if self.len % 4 != 0 && last < len {
            self.seq[idx] = packed_byte;
            idx += 1;
        }
        assert_eq!(idx + PADDING, self.seq.len());
        start..start + len
    }

    #[cfg(feature = "rand")]
    fn random(n: usize) -> Self {
        use rand::{RngCore, SeedableRng};

        let byte_len = n.div_ceil(4);
        let mut seq = vec![0; byte_len + PADDING];
        rand::rngs::SmallRng::from_os_rng().fill_bytes(&mut seq[..byte_len]);
        // Ensure that the last byte is padded with zeros.
        if n % 4 != 0 {
            seq[byte_len - 1] &= (1 << (2 * (n % 4))) - 1;
        }

        Self { seq, len: n }
    }
}
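
// The sketch below is not part of the original file: a minimal sanity-check module
// illustrating the round-trip between ASCII and the packed representation, and the
// consistency of `revcomp_as_u64` with `to_revcomp`. It assumes the parent module's
// items (including the `Seq`/`SeqVec` traits) are reachable via `use super::*`.
#[cfg(test)]
mod packing_sketch_tests {
    use super::*;

    #[test]
    fn pack_unpack_roundtrip() {
        let ascii = b"ACGTACGTTGCA";
        let mut v = PackedSeqVec::default();
        // Pushing into an empty vec starts at position 0.
        let range = v.push_ascii(ascii);
        assert_eq!(range, 0..ascii.len());
        // `unpack` maps each 2-bit base back to uppercase ASCII.
        assert_eq!(v.as_slice().unpack(), ascii.to_vec());
    }

    #[test]
    fn revcomp_u64_matches_to_revcomp() {
        let ascii = b"ACGTTGCA";
        let mut v = PackedSeqVec::default();
        v.push_ascii(ascii);
        let s = v.as_slice();
        // The u64-based reverse complement should agree with the byte-wise one.
        assert_eq!(s.revcomp_as_u64(), s.to_revcomp().as_slice().as_u64());
    }
}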