use core::cell::RefCell;
use std::ops::{Deref, DerefMut};
use traits::Seq;
use wide::u16x8;

use crate::{intrinsics::transpose, padded_it::ChunkIt};

use super::*;

type SimdBuf = [S; 8];

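// Thread-local buffer recycling: `RecycledBox` and `RecycledVec` pop a scratch buffer
// from a thread-local cache in `take()` and push it back on `Drop`, so the SIMD
// iterators below can obtain per-call scratch space without allocating every time.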
struct RecycledBox(Option<Box<SimdBuf>>);

thread_local! {
    static RECYCLED_BOX_CACHE: RefCell<Vec<Box<SimdBuf>>> = {
        RefCell::new(vec![Box::new(SimdBuf::default())])
    };
}

impl RecycledBox {
    #[inline(always)]
    fn take() -> Self {
        let mut buf = RECYCLED_BOX_CACHE.with_borrow_mut(|v| RecycledBox(v.pop()));
        buf.init_if_needed();
        buf
    }

    #[inline(always)]
    fn init_if_needed(&mut self) {
        if self.0.is_none() {
            self.0 = Some(Box::new(SimdBuf::default()));
        }
    }

    #[inline(always)]
    fn get(&self) -> &SimdBuf {
        unsafe { self.0.as_ref().unwrap_unchecked() }
    }

    #[inline(always)]
    fn get_mut(&mut self) -> &mut SimdBuf {
        unsafe { self.0.as_mut().unwrap_unchecked() }
    }
}

impl Deref for RecycledBox {
    type Target = SimdBuf;

    #[inline(always)]
    fn deref(&self) -> &Self::Target {
        self.get()
    }
}
impl DerefMut for RecycledBox {
    #[inline(always)]
    fn deref_mut(&mut self) -> &mut SimdBuf {
        self.get_mut()
    }
}

impl Drop for RecycledBox {
    #[inline(always)]
    fn drop(&mut self) {
        let mut x = None;
        core::mem::swap(&mut x, &mut self.0);
        RECYCLED_BOX_CACHE.with_borrow_mut(|v| v.push(unsafe { x.unwrap_unchecked() }));
    }
}

struct RecycledVec(Vec<S>);

thread_local! {
    static RECYCLED_VEC_CACHE: RefCell<Vec<Vec<S>>> = {
        RefCell::new(vec![])
    };
}

impl RecycledVec {
    #[inline(always)]
    fn take() -> Self {
        RecycledVec(RECYCLED_VEC_CACHE.with_borrow_mut(|v| v.pop().unwrap_or_default()))
    }
}

impl Deref for RecycledVec {
    type Target = Vec<S>;
    #[inline(always)]
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
impl DerefMut for RecycledVec {
    #[inline(always)]
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}
impl Drop for RecycledVec {
    #[inline(always)]
    fn drop(&mut self) {
        RECYCLED_VEC_CACHE.with_borrow_mut(|v| v.push(std::mem::take(&mut self.0)));
    }
}

#[doc(hidden)]
pub struct Bits<const B: usize>;
#[doc(hidden)]
pub trait SupportedBits {}
impl SupportedBits for Bits<1> {}
impl SupportedBits for Bits<2> {}
impl SupportedBits for Bits<4> {}
impl SupportedBits for Bits<8> {}

pub(crate) const PADDING: usize = 48;

#[derive(Copy, Clone, Debug, MemSize, MemDbg)]
pub struct PackedSeqBase<'s, const B: usize>
where
    Bits<B>: SupportedBits,
{
    seq: &'s [u8],
    offset: usize,
    len: usize,
}

#[derive(Clone, Debug, MemSize, MemDbg)]
#[cfg_attr(feature = "pyo3", pyo3::pyclass)]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
pub struct PackedSeqVecBase<const B: usize>
where
    Bits<B>: SupportedBits,
{
    pub(crate) seq: Vec<u8>,
    len: usize,
}

pub type PackedSeq<'s> = PackedSeqBase<'s, 2>;
pub type PackedSeqVec = PackedSeqVecBase<2>;
pub type BitSeq<'s> = PackedSeqBase<'s, 1>;
pub type BitSeqVec = PackedSeqVecBase<1>;

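// Character-count helpers derived from `B`, the number of bits per character:
// `C8`, `C32`, and `C256` are the number of characters packed into a byte, a 32-bit
// SIMD lane, and a 256-bit SIMD word respectively, while `K64` is the largest number
// of characters that can always be extracted from a single unaligned `u64` read,
// whatever the character offset within the first byte.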
impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    const CHAR_MASK: u64 = (1 << B) - 1;
    const SIMD_B: S = S::new([B as u32; 8]);
    const SIMD_CHAR_MASK: S = S::new([(1 << B) - 1; 8]);
    const C8: usize = 8 / B;
    const C32: usize = 32 / B;
    const C256: usize = 256 / B;
    const K64: usize = (64 - 8) / B + 1;
}

impl<const B: usize> PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    const C8: usize = 8 / B;
}

impl<const B: usize> Default for PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    fn default() -> Self {
        Self {
            seq: vec![0; PADDING],
            len: 0,
        }
    }
}

#[inline(always)]
pub fn pack_char(base: u8) -> u8 {
    match base {
        b'a' | b'A' => 0,
        b'c' | b'C' => 1,
        b'g' | b'G' => 3,
        b't' | b'T' => 2,
        _ => panic!(
            "Unexpected character '{}' with ASCII value {base}. Expected one of ACTGactg.",
            base as char
        ),
    }
}

#[inline(always)]
pub fn pack_char_lossy(base: u8) -> u8 {
    (base >> 1) & 3
}
#[inline(always)]
pub fn pack_kmer_lossy(slice: &[u8]) -> u64 {
    let mut kmer = 0;
    for (i, &base) in slice.iter().enumerate() {
        kmer |= (pack_char_lossy(base) as u64) << (2 * i);
    }
    kmer
}
#[inline(always)]
pub fn pack_kmer_u128_lossy(slice: &[u8]) -> u128 {
    let mut kmer = 0;
    for (i, &base) in slice.iter().enumerate() {
        kmer |= (pack_char_lossy(base) as u128) << (2 * i);
    }
    kmer
}

#[inline(always)]
pub fn unpack_base(base: u8) -> u8 {
    debug_assert!(base < 4, "Base {base} is not <4.");
    b"ACTG"[base as usize]
}

#[inline(always)]
pub fn unpack_kmer(kmer: u64) -> [u8; 32] {
    std::array::from_fn(|i| unpack_base(((kmer >> (2 * i)) & 3) as u8))
}
#[inline(always)]
pub fn unpack_kmer_into_vec(kmer: u64, k: usize, out: &mut Vec<u8>) {
    out.clear();
    out.extend((0..k).map(|i| unpack_base(((kmer >> (2 * i)) & 3) as u8)));
}
#[inline(always)]
pub fn unpack_kmer_to_vec(kmer: u64, k: usize) -> Vec<u8> {
    let mut out = vec![];
    unpack_kmer_into_vec(kmer, k, &mut out);
    out
}

#[inline(always)]
pub fn unpack_kmer_u128(kmer: u128) -> [u8; 64] {
    std::array::from_fn(|i| unpack_base(((kmer >> (2 * i)) & 3) as u8))
}
#[inline(always)]
pub fn unpack_kmer_u128_into_vec(kmer: u128, k: usize, out: &mut Vec<u8>) {
    out.clear();
    out.extend((0..k).map(|i| unpack_base(((kmer >> (2 * i)) & 3) as u8)));
}
#[inline(always)]
pub fn unpack_kmer_u128_to_vec(kmer: u128, k: usize) -> Vec<u8> {
    let mut out = vec![];
    unpack_kmer_u128_into_vec(kmer, k, &mut out);
    out
}
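
// A small, hypothetical round-trip check (not part of the original API or test suite)
// illustrating the 2-bit layout used above: the first base sits in the lowest two bits,
// with A=0, C=1, T=2, G=3.
#[cfg(test)]
mod pack_unpack_sketch {
    use super::*;

    #[test]
    fn round_trip_small_kmer() {
        // "ACGT" packs to 0b10_11_01_00: T=2, G=3, C=1, A=0, first base lowest.
        let kmer = pack_kmer_lossy(b"ACGT");
        assert_eq!(kmer, 0b10_11_01_00);
        // Unpacking restores the ASCII bases.
        assert_eq!(unpack_kmer_to_vec(kmer, 4), b"ACGT");
    }
}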

#[inline(always)]
pub const fn complement_char(base: u8) -> u8 {
    match base {
        b'A' => b'T',
        b'C' => b'G',
        b'G' => b'C',
        b'T' => b'A',
        _ => panic!("Unexpected character. Expected one of ACGT."),
    }
}

#[inline(always)]
pub const fn complement_base(base: u8) -> u8 {
    base ^ 2
}

#[inline(always)]
pub fn complement_base_simd(base: u32x8) -> u32x8 {
    const TWO: u32x8 = u32x8::new([2; 8]);
    base ^ TWO
}

#[inline(always)]
const fn revcomp_raw(word: u64) -> u64 {
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    {
        let mut res = word.reverse_bits();
        res = ((res >> 1) & 0x5555_5555_5555_5555) | ((res & 0x5555_5555_5555_5555) << 1);
        res ^ 0xAAAA_AAAA_AAAA_AAAA
    }

    #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
    {
        let mut res = word.swap_bytes();
        res = ((res >> 4) & 0x0F0F_0F0F_0F0F_0F0F) | ((res & 0x0F0F_0F0F_0F0F_0F0F) << 4);
        res = ((res >> 2) & 0x3333_3333_3333_3333) | ((res & 0x3333_3333_3333_3333) << 2);
        res ^ 0xAAAA_AAAA_AAAA_AAAA
    }
}

#[inline(always)]
pub const fn revcomp_u64(word: u64, len: usize) -> u64 {
    revcomp_raw(word) >> (u64::BITS as usize - 2 * len)
}

#[inline(always)]
pub const fn revcomp_u128(word: u128, len: usize) -> u128 {
    let low = word as u64;
    let high = (word >> 64) as u64;
    let rlow = revcomp_raw(low);
    let rhigh = revcomp_raw(high);
    let out = ((rlow as u128) << 64) | rhigh as u128;
    out >> (u128::BITS as usize - 2 * len)
}
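
// A hypothetical sanity check (not from the original tests) for `revcomp_u64`, using
// hand-computed values: revcomp("AACC") == "GGTT", and "ACGT" is its own reverse
// complement.
#[cfg(test)]
mod revcomp_sketch {
    use super::*;

    #[test]
    fn revcomp_u64_small() {
        let aacc = pack_kmer_lossy(b"AACC");
        let ggtt = pack_kmer_lossy(b"GGTT");
        assert_eq!(revcomp_u64(aacc, 4), ggtt);

        let acgt = pack_kmer_lossy(b"ACGT");
        assert_eq!(revcomp_u64(acgt, 4), acgt);
    }
}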

#[inline(always)]
pub fn char_is_ambiguous(base: u8) -> u8 {
    let table = b"ACTG";
    let upper_mask = !(b'a' - b'A');
    (table[pack_char_lossy(base) as usize] != (base & upper_mask)) as u8
}

#[inline(always)]
pub const fn rev_u64(word: u64, len: usize) -> u64 {
    word.reverse_bits() >> (u64::BITS as usize - len)
}

#[inline(always)]
pub const fn rev_u128(word: u128, len: usize) -> u128 {
    word.reverse_bits() >> (u128::BITS as usize - len)
}
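
// A hypothetical example (not in the original tests) of `char_is_ambiguous` and the
// 1-bit reversal helper `rev_u64`.
#[cfg(test)]
mod ambiguity_sketch {
    use super::*;

    #[test]
    fn ambiguous_chars() {
        // ACGT/acgt map to 0; anything else (e.g. 'N') maps to 1.
        assert_eq!(char_is_ambiguous(b'A'), 0);
        assert_eq!(char_is_ambiguous(b'c'), 0);
        assert_eq!(char_is_ambiguous(b'N'), 1);
    }

    #[test]
    fn rev_bits() {
        // For 1-bit sequences, `rev_u64` simply reverses the low `len` bits.
        assert_eq!(rev_u64(0b0011, 4), 0b1100);
    }
}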

impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    pub fn from_raw_parts(seq: &'s [u8], offset: usize, len: usize) -> Self {
        assert!(offset + len + PADDING * Self::C8 <= seq.len() * Self::C8);
        Self { seq, offset, len }
    }

    #[inline(always)]
    pub fn normalize(&self) -> Self {
        let start_byte = self.offset / Self::C8;
        let end_byte = (self.offset + self.len).div_ceil(Self::C8);
        Self {
            seq: &self.seq[start_byte..end_byte + PADDING],
            offset: self.offset % Self::C8,
            len: self.len,
        }
    }

    #[inline(always)]
    pub fn unpack(&self) -> Vec<u8> {
        self.iter_bp().map(unpack_base).collect()
    }
}
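
// A minimal usage sketch (hypothetical test, not part of the original suite): pack an
// ASCII sequence, slice it at base granularity, and unpack it again. Relies only on
// `SeqVec::push_ascii`, `SeqVec::as_slice`, `Seq::slice`, and `unpack` from this file.
#[cfg(test)]
mod slicing_sketch {
    use super::*;

    #[test]
    fn pack_slice_unpack() {
        let mut v = PackedSeqVec::default();
        v.push_ascii(b"ACGTACGT");
        let s = v.as_slice();
        assert_eq!(s.len(), 8);
        // Slices need not be aligned to byte boundaries.
        assert_eq!(s.slice(2..6).unpack(), b"GTAC");
    }
}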

#[inline(always)]
pub(crate) unsafe fn read_slice_32_unchecked(seq: &[u8], idx: usize) -> u32x8 {
    debug_assert!(idx + 32 <= seq.len());
    unsafe {
        let src = seq.as_ptr().add(idx);
        std::mem::transmute::<_, *const u32x8>(src).read_unaligned()
    }
}
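
// `read_slice_32_unchecked` relies on the caller guaranteeing that 32 bytes are
// readable at `idx`; the SIMD iterators below check this up front via
// `offsets[7] + (last_i / C8) + 32 <= seq.len()`, which the `PADDING` trailing bytes
// of every `PackedSeqVecBase` are there to make possible. `read_slice_32` below is
// the checked fallback that copies a partial tail instead.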

#[inline(always)]
pub(crate) fn read_slice_32(seq: &[u8], idx: usize) -> u32x8 {
    unsafe {
        let src = seq.as_ptr().add(idx);
        if idx + 32 <= seq.len() {
            std::mem::transmute::<_, *const u32x8>(src).read_unaligned()
        } else {
            let num_bytes = seq.len().saturating_sub(idx);
            let mut result = [0u8; 32];
            std::ptr::copy_nonoverlapping(src, result.as_mut_ptr(), num_bytes);
            std::mem::transmute(result)
        }
    }
}

#[allow(unused)]
#[inline(always)]
pub(crate) fn read_slice_16(seq: &[u8], idx: usize) -> u16x8 {
    unsafe {
        let src = seq.as_ptr().add(idx);
        if idx + 16 <= seq.len() {
            std::mem::transmute::<_, *const u16x8>(src).read_unaligned()
        } else {
            let num_bytes = seq.len().saturating_sub(idx);
            let mut result = [0u8; 16];
            std::ptr::copy_nonoverlapping(src, result.as_mut_ptr(), num_bytes);
            std::mem::transmute(result)
        }
    }
}

impl<'s, const B: usize> Seq<'s> for PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    const BITS_PER_CHAR: usize = B;
    const BASES_PER_BYTE: usize = Self::C8;
    type SeqVec = PackedSeqVecBase<B>;

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn get_ascii(&self, index: usize) -> u8 {
        unpack_base(self.get(index))
    }

    #[inline(always)]
    fn as_u64(&self) -> u64 {
        assert!(self.len() <= 64 / B);

        let mask = u64::MAX >> (64 - B * self.len());

        if self.len() <= Self::K64 {
            let x = unsafe { (self.seq.as_ptr() as *const u64).read_unaligned() };
            (x >> (B * self.offset)) & mask
        } else {
            let x = unsafe { (self.seq.as_ptr() as *const u128).read_unaligned() };
            (x >> (B * self.offset)) as u64 & mask
        }
    }

    #[inline(always)]
    fn revcomp_as_u64(&self) -> u64 {
        match B {
            1 => rev_u64(self.as_u64(), self.len()),
            2 => revcomp_u64(self.as_u64(), self.len()),
            _ => panic!("Rev(comp) is only supported for 1-bit and 2-bit alphabets."),
        }
    }

    #[inline(always)]
    fn as_u128(&self) -> u128 {
        assert!(
            self.len() <= (128 - 8) / B + 1,
            "Sequences longer than (128 - 8) / B + 1 characters cannot be read with a single unaligned u128 read."
        );

        let mask = u128::MAX >> (128 - B * self.len());

        let x = unsafe { (self.seq.as_ptr() as *const u128).read_unaligned() };
        (x >> (B * self.offset)) & mask
    }

    #[inline(always)]
    fn revcomp_as_u128(&self) -> u128 {
        match B {
            1 => rev_u128(self.as_u128(), self.len()),
            2 => revcomp_u128(self.as_u128(), self.len()),
            _ => panic!("Rev(comp) is only supported for 1-bit and 2-bit alphabets."),
        }
    }

    #[inline(always)]
    fn to_vec(&self) -> PackedSeqVecBase<B> {
        assert_eq!(self.offset, 0);
        PackedSeqVecBase {
            seq: self
                .seq
                .iter()
                .copied()
                .chain(std::iter::repeat_n(0u8, PADDING))
                .collect(),
            len: self.len,
        }
    }

    fn to_revcomp(&self) -> PackedSeqVecBase<B> {
        match B {
            1 | 2 => {}
            _ => panic!("Can only reverse (&complement) 1-bit and 2-bit packed sequences."),
        }

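        // Reverse the packed bytes and, within each byte, reverse (and for B=2
        // complement) the characters; afterwards shift everything down so the
        // sequence starts at offset 0 again.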
        let mut seq = self.seq[..(self.offset + self.len).div_ceil(Self::C8)]
            .iter()
            .rev()
            .copied()
            .map(|mut res| {
                match B {
                    2 => {
                        res = ((res >> 4) & 0x0F) | ((res & 0x0F) << 4);
                        res = ((res >> 2) & 0x33) | ((res & 0x33) << 2);
                        res ^ 0xAA
                    }
                    1 => res.reverse_bits(),
                    _ => unreachable!(),
                }
            })
            .chain(std::iter::repeat_n(0u8, PADDING))
            .collect::<Vec<u8>>();

        let new_offset = (Self::C8 - (self.offset + self.len) % Self::C8) % Self::C8;

        if new_offset > 0 {
            let shift = B * new_offset;
            *seq.last_mut().unwrap() >>= shift;
            for i in 0..seq.len() - 1 {
                seq[i] = (seq[i] >> shift) | (seq[i + 1] << (8 - shift));
            }
        }

        PackedSeqVecBase { seq, len: self.len }
    }

    #[inline(always)]
    fn slice(&self, range: Range<usize>) -> Self {
        debug_assert!(
            range.end <= self.len,
            "Slice index out of bounds: {} > {}",
            range.end,
            self.len
        );
        PackedSeqBase {
            seq: self.seq,
            offset: self.offset + range.start,
            len: range.end - range.start,
        }
        .normalize()
    }

    #[inline(always)]
    fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> {
        assert!(self.len <= self.seq.len() * Self::C8);

        let this = self.normalize();

        let mut byte = 0;
        (0..this.len + this.offset)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C8 == 0 {
                        byte = this.seq[i / Self::C8];
                    }
                    (byte >> (B * (i % Self::C8))) & Self::CHAR_MASK as u8
                },
            )
            .advance(this.offset)
    }

    #[inline(always)]
    fn par_iter_bp(self, context: usize) -> PaddedIt<impl ChunkIt<S>> {
        self.par_iter_bp_with_buf(context, RecycledBox::take())
    }

    #[inline(always)]
    fn par_iter_bp_delayed(self, context: usize, delay: Delay) -> PaddedIt<impl ChunkIt<(S, S)>> {
        self.par_iter_bp_delayed_with_factor(context, delay, 1)
    }

    #[inline(always)]
    fn par_iter_bp_delayed_2(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(
            context,
            delay1,
            delay2,
            1,
            RecycledVec::take(),
        )
    }

    fn cmp_lcp(&self, other: &Self) -> (std::cmp::Ordering, usize) {
        let mut lcp = 0;
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(Self::K64) {
            let len = (min_len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() as usize / B * B;
                lcp += t / B;
                let mask = (Self::CHAR_MASK) << t;
                return ((this_word & mask).cmp(&(other_word & mask)), lcp);
            }
            lcp += len;
        }
        (self.len.cmp(&other.len), lcp)
    }

    #[inline(always)]
    fn get(&self, index: usize) -> u8 {
        let offset = self.offset + index;
        let idx = offset / Self::C8;
        let offset = offset % Self::C8;
        (self.seq[idx] >> (B * offset)) & Self::CHAR_MASK as u8
    }
}

impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    #[inline(always)]
    pub fn par_iter_bp_with_buf<BUF: DerefMut<Target = [S; 8]>>(
        self,
        context: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

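        // The sequence is conceptually split into 8 equal chunks of `bytes_per_chunk`
        // bytes, one per 32-bit SIMD lane; lane `l` starts reading at `offsets[l]`.
        // Each lane also reads `context - 1` positions past its own chunk so that every
        // window of length `context` is covered, and `padding` records how many trailing
        // positions of the last lane fall outside the real sequence.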
        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut cur = S::ZERO;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            *buf = transpose(data);
                        }
                        cur = buf[(i % Self::C256) / Self::C32];
                    }
                    let chars = cur & Self::SIMD_CHAR_MASK;
                    cur = cur >> Self::SIMD_B;
                    chars
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_with_factor(
        self,
        context: usize,
        delay: Delay,
        factor: usize,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B>> {
        self.par_iter_bp_delayed_with_factor_and_buf(context, delay, factor, RecycledVec::take())
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_with_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        delay: Delay,
        buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B, BUF>> {
        self.par_iter_bp_delayed_with_factor_and_buf(context, delay, 1, buf)
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_with_factor_and_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        Delay(delay): Delay,
        factor: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(
            delay < usize::MAX / 2,
            "Delay={} should be >=0.",
            delay as isize
        );

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride
            .div_ceil(L)
            .next_multiple_of(factor * Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d = S::ZERO;

        let buf_len = (delay / Self::C32 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        if buf.capacity() < buf_len {
            *buf = vec![S::ZERO; buf_len];
        } else {
            buf.clear();
            buf.resize(buf_len, S::ZERO);
        }

        let mut write_idx = 0;
        let mut read_idx = (buf_len - delay / Self::C32) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            unsafe {
                                *TryInto::<&mut [u32x8; 8]>::try_into(
                                    buf.get_unchecked_mut(write_idx..write_idx + 8),
                                )
                                .unwrap_unchecked() = transpose(data);
                            }
                            if i == 0 {
                                let elem = !((1u32 << (B * o)) - 1);
                                let mask = S::splat(elem);
                                unsafe { assert_unchecked(write_idx < buf.len()) };
                                buf[write_idx] &= mask;
                            }
                        }
                        unsafe { assert_unchecked(write_idx < buf.len()) };
                        upcoming = buf[write_idx];
                        write_idx += 1;
                        write_idx &= buf_mask;
                    }
                    if i % Self::C32 == delay % Self::C32 {
                        unsafe { assert_unchecked(read_idx < buf.len()) };
                        upcoming_d = buf[read_idx];
                        read_idx += 1;
                        read_idx &= buf_mask;
                    }
                    let chars = upcoming & Self::SIMD_CHAR_MASK;
                    let chars_d = upcoming_d & Self::SIMD_CHAR_MASK;
                    upcoming = upcoming >> Self::SIMD_B;
                    upcoming_d = upcoming_d >> Self::SIMD_B;
                    (chars, chars_d)
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_factor(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
        factor: usize,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(
            context,
            delay1,
            delay2,
            factor,
            RecycledVec::take(),
        )
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
        buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B, BUF>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(context, delay1, delay2, 1, buf)
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_factor_and_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        Delay(delay1): Delay,
        Delay(delay2): Delay,
        factor: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);
        assert!(delay1 <= delay2, "Delay1 must be at most delay2.");

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride
            .div_ceil(L)
            .next_multiple_of(factor * Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d1 = S::ZERO;
        let mut upcoming_d2 = S::ZERO;

        let buf_len = (delay2 / Self::C32 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        if buf.capacity() < buf_len {
            *buf = vec![S::ZERO; buf_len];
        } else {
            buf.clear();
            buf.resize(buf_len, S::ZERO);
        }

        let mut write_idx = 0;
        let mut read_idx1 = (buf_len - delay1 / Self::C32) % buf_len;
        let mut read_idx2 = (buf_len - delay2 / Self::C32) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            unsafe {
                                *TryInto::<&mut [u32x8; 8]>::try_into(
                                    buf.get_unchecked_mut(write_idx..write_idx + 8),
                                )
                                .unwrap_unchecked() = transpose(data);
                            }
                            if i == 0 {
                                let elem = !((1u32 << (B * o)) - 1);
                                let mask = S::splat(elem);
                                buf[write_idx] &= mask;
                            }
                        }
                        upcoming = buf[write_idx];
                        write_idx += 1;
                        write_idx &= buf_mask;
                    }
                    if i % Self::C32 == delay1 % Self::C32 {
                        unsafe { assert_unchecked(read_idx1 < buf.len()) };
                        upcoming_d1 = buf[read_idx1];
                        read_idx1 += 1;
                        read_idx1 &= buf_mask;
                    }
                    if i % Self::C32 == delay2 % Self::C32 {
                        unsafe { assert_unchecked(read_idx2 < buf.len()) };
                        upcoming_d2 = buf[read_idx2];
                        read_idx2 += 1;
                        read_idx2 &= buf_mask;
                    }
                    let chars = upcoming & Self::SIMD_CHAR_MASK;
                    let chars_d1 = upcoming_d1 & Self::SIMD_CHAR_MASK;
                    let chars_d2 = upcoming_d2 & Self::SIMD_CHAR_MASK;
                    upcoming = upcoming >> Self::SIMD_B;
                    upcoming_d1 = upcoming_d1 >> Self::SIMD_B;
                    upcoming_d2 = upcoming_d2 >> Self::SIMD_B;
                    (chars, chars_d1, chars_d2)
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }
}

impl<const B: usize> PartialEq for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    fn eq(&self, other: &Self) -> bool {
        if self.len != other.len {
            return false;
        }
        for i in (0..self.len).step_by(Self::K64) {
            let len = (self.len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let that = other.slice(i..i + len);
            if this.as_u64() != that.as_u64() {
                return false;
            }
        }
        true
    }
}

impl<const B: usize> Eq for PackedSeqBase<'_, B> where Bits<B>: SupportedBits {}

impl<const B: usize> PartialOrd for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl<const B: usize> Ord for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(Self::K64) {
            let len = (min_len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() as usize / B * B;
                let mask = (Self::CHAR_MASK) << t;
                return (this_word & mask).cmp(&(other_word & mask));
            }
        }
        self.len.cmp(&other.len)
    }
}
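
// A hypothetical example (not part of the original test suite) of the comparison and
// `cmp_lcp`: both compare sequences by their packed codes (A < C < T < G), and
// `cmp_lcp` additionally reports the length of the longest common prefix.
#[cfg(test)]
mod cmp_sketch {
    use super::*;

    #[test]
    fn ord_and_lcp() {
        let mut av = PackedSeqVec::default();
        av.push_ascii(b"ACGTACGT");
        let mut bv = PackedSeqVec::default();
        bv.push_ascii(b"ACGTTCGT");

        let (ord, lcp) = av.as_slice().cmp_lcp(&bv.as_slice());
        assert_eq!(lcp, 4);
        assert_eq!(ord, std::cmp::Ordering::Less);
        assert!(av.as_slice() < bv.as_slice());
    }
}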

impl<const B: usize> SeqVec for PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    type Seq<'s> = PackedSeqBase<'s, B>;

    #[inline(always)]
    fn into_raw(mut self) -> Vec<u8> {
        self.seq.resize(self.len.div_ceil(Self::C8), 0);
        self.seq
    }

    #[inline(always)]
    fn as_slice(&self) -> Self::Seq<'_> {
        PackedSeqBase {
            seq: &self.seq[..self.len.div_ceil(Self::C8) + PADDING],
            offset: 0,
            len: self.len,
        }
    }

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn clear(&mut self) {
        self.seq.clear();
        self.len = 0;
    }

    fn push_seq<'a>(&mut self, seq: PackedSeqBase<'_, B>) -> Range<usize> {
        let start = self.len.next_multiple_of(Self::C8) + seq.offset;
        let end = start + seq.len();

        self.seq.resize(self.len.div_ceil(Self::C8), 0);
        self.seq.extend(seq.seq);
        self.len = end;
        start..end
    }

    fn push_ascii(&mut self, seq: &[u8]) -> Range<usize> {
        match B {
            1 | 2 => {}
            _ => panic!(
                "Can only use ASCII input for 2-bit DNA packing, or 1-bit ambiguous indicators."
            ),
        }

        self.seq
            .resize((self.len + seq.len()).div_ceil(Self::C8) + PADDING, 0);
        let start_aligned = self.len.next_multiple_of(Self::C8);
        let start = self.len;
        let len = seq.len();
        let mut idx = self.len / Self::C8;

        let parse_base = |base| match B {
            1 => char_is_ambiguous(base),
            2 => pack_char_lossy(base),
            _ => unreachable!(),
        };

        let unaligned = core::cmp::min(start_aligned - start, len);
        if unaligned > 0 {
            let mut packed_byte = self.seq[idx];
            for &base in &seq[..unaligned] {
                packed_byte |= parse_base(base) << ((self.len % Self::C8) * B);
                self.len += 1;
            }
            self.seq[idx] = packed_byte;
            idx += 1;
        }

        #[allow(unused)]
        let mut last = unaligned;

        if B == 2 {
            #[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
            {
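                // BMI2 path: `_pext_u64` gathers bits 1..3 of each of the 8 ASCII bytes
                // in one step; for ACGT/acgt these two bits are exactly the 2-bit code
                // (A=00, C=01, T=10, G=11), so 8 characters pack into 16 bits at once.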
                last = unaligned + (len - unaligned) / 8 * 8;

                for i in (unaligned..last).step_by(8) {
                    let chunk =
                        unsafe { seq.get_unchecked(i..i + 8).try_into().unwrap_unchecked() };
                    let ascii = u64::from_le_bytes(chunk);
                    let packed_bytes =
                        unsafe { std::arch::x86_64::_pext_u64(ascii, 0x0606060606060606) } as u16;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 2))
                            .copy_from_slice(&packed_bytes.to_le_bytes())
                    };
                    idx += 2;
                    self.len += 8;
                }
            }

            #[cfg(target_feature = "neon")]
            {
                use core::arch::aarch64::{
                    vandq_u8, vdup_n_u8, vld1q_u8, vpadd_u8, vshlq_u8, vst1_u8,
                };
                use core::mem::transmute;

                last = unaligned + (len - unaligned) / 16 * 16;

                for i in (unaligned..last).step_by(16) {
                    unsafe {
                        let ascii = vld1q_u8(seq.as_ptr().add(i));
                        let masked_bits = vandq_u8(ascii, transmute([6i8; 16]));
                        let (bits_0, bits_1) = transmute(vshlq_u8(
                            masked_bits,
                            transmute([-1i8, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5]),
                        ));
                        let half_packed = vpadd_u8(bits_0, bits_1);
                        let packed = vpadd_u8(half_packed, vdup_n_u8(0));
                        vst1_u8(self.seq.as_mut_ptr().add(idx), packed);
                        idx += Self::C8;
                        self.len += 16;
                    }
                }
            }
        }

        if B == 1 {
            #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
            {
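                // AVX2 path: uppercase 32 ASCII bytes at once, map each byte's two "code"
                // bits back to the canonical character via a shuffle-based lookup, and
                // compare with the input; bytes that do not round-trip (anything other
                // than ACGT/acgt) get their ambiguity bit set in the 32-bit mask.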
                last = len;
                self.len += len - unaligned;

                let mut last_i = unaligned;

                for i in (unaligned..last).step_by(32) {
                    use std::mem::transmute as t;

                    use wide::CmpEq;
                    type S = wide::i8x32;
                    let chars: S = unsafe { t(read_slice_32(seq, i)) };
                    let upper_mask = !(b'a' - b'A');
                    let chars = chars & S::splat(upper_mask as i8);
                    let lossy_encoded = chars & S::splat(6);
                    let table = unsafe { S::from(t::<_, S>(*b"AxCxTxGxxxxxxxxxAxCxTxGxxxxxxxxx")) };
                    let lookup: S = unsafe {
                        t(std::arch::x86_64::_mm256_shuffle_epi8(
                            t(table),
                            t(lossy_encoded),
                        ))
                    };
                    let packed_bytes = !(chars.cmp_eq(lookup).move_mask() as u32);

                    last_i = i;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 4))
                            .copy_from_slice(&packed_bytes.to_le_bytes())
                    };
                    idx += 4;
                }

                if unaligned < last {
                    idx -= 4;
                    let mut val = unsafe {
                        u32::from_le_bytes(
                            self.seq
                                .get_unchecked(idx..(idx + 4))
                                .try_into()
                                .unwrap_unchecked(),
                        )
                    };
                    let keep = last - last_i;
                    val <<= 32 - keep;
                    val >>= 32 - keep;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 4))
                            .copy_from_slice(&val.to_le_bytes())
                    };
                    idx += keep.div_ceil(8);
                }
            }

            #[cfg(target_feature = "neon")]
            {
                use core::arch::aarch64::*;
                use core::mem::transmute;

                last = unaligned + (len - unaligned) / 64 * 64;

                for i in (unaligned..last).step_by(64) {
                    unsafe {
                        let ptr = seq.as_ptr().add(i);
                        let chars = vld4q_u8(ptr);

                        let upper_mask = vdupq_n_u8(!(b'a' - b'A'));
                        let chars = neon::map_8x16x4(chars, |v| vandq_u8(v, upper_mask));

                        let two_bits_mask = vdupq_n_u8(6);
                        let lossy_encoded = neon::map_8x16x4(chars, |v| vandq_u8(v, two_bits_mask));

                        let table = transmute(*b"AxCxTxGxxxxxxxxx");
                        let lookup = neon::map_8x16x4(lossy_encoded, |v| vqtbl1q_u8(table, v));

                        let mask = neon::map_two_8x16x4(chars, lookup, |v1, v2| vceqq_u8(v1, v2));
                        let packed_bytes = !neon::movemask_64(mask);

                        self.seq[idx..(idx + 8)].copy_from_slice(&packed_bytes.to_le_bytes());
                        idx += 8;
                        self.len += 64;
                    }
                }
            }
        }

        let mut packed_byte = 0;
        for &base in &seq[last..] {
            packed_byte |= parse_base(base) << ((self.len % Self::C8) * B);
            self.len += 1;
            if self.len % Self::C8 == 0 {
                self.seq[idx] = packed_byte;
                idx += 1;
                packed_byte = 0;
            }
        }
        if self.len % Self::C8 != 0 && last < len {
            self.seq[idx] = packed_byte;
            idx += 1;
        }
        assert_eq!(idx + PADDING, self.seq.len());
        start..start + len
    }

    #[cfg(feature = "rand")]
    fn random(n: usize) -> Self {
        use rand::{RngCore, SeedableRng};

        let byte_len = n.div_ceil(Self::C8);
        let mut seq = vec![0; byte_len + PADDING];
        rand::rngs::SmallRng::from_os_rng().fill_bytes(&mut seq[..byte_len]);
        if n % Self::C8 != 0 {
            seq[byte_len - 1] &= (1 << (B * (n % Self::C8))) - 1;
        }

        Self { seq, len: n }
    }
}

impl<const B: usize> PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    pub fn from_raw_parts(mut seq: Vec<u8>, len: usize) -> Self {
        assert!(len <= seq.len() * Self::C8);
        seq.resize(len.div_ceil(Self::C8) + PADDING, 0);
        Self { seq, len }
    }
}

impl PackedSeqVecBase<1> {
    pub fn with_len(n: usize) -> Self {
        Self {
            seq: vec![0; n.div_ceil(Self::C8) + PADDING],
            len: n,
        }
    }

    #[cfg(feature = "rand")]
    pub fn random(len: usize, n_frac: f32) -> Self {
        let byte_len = len.div_ceil(Self::C8);
        let mut seq = vec![0; byte_len + PADDING];

        assert!(
            (0.0..=0.3).contains(&n_frac),
            "n_frac={} should be in [0, 0.3]",
            n_frac
        );

        for _ in 0..(len as f32 * n_frac) as usize {
            let idx = rand::random::<u64>() as usize % len;
            let byte = idx / Self::C8;
            let offset = idx % Self::C8;
            seq[byte] |= 1 << offset;
        }

        Self { seq, len }
    }
}
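
// A hypothetical example (not from the original tests) of the 1-bit ambiguity vector:
// `BitSeqVec::push_ascii` stores, per input character, whether it is something other
// than ACGT/acgt.
#[cfg(test)]
mod bitseq_sketch {
    use super::*;

    #[test]
    fn ambiguity_bits() {
        let mut v = BitSeqVec::default();
        v.push_ascii(b"ACGNNACGT");
        let s = v.as_slice();
        assert_eq!(s.len(), 9);
        assert_eq!(s.get(0), 0); // 'A' is unambiguous.
        assert_eq!(s.get(3), 1); // 'N' is ambiguous.
        assert_eq!(s.get(8), 0); // 'T' is unambiguous.
    }
}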

impl<'s> PackedSeqBase<'s, 1> {
    #[inline(always)]
    pub fn iter_kmer_ambiguity(self, k: usize) -> impl ExactSizeIterator<Item = bool> + use<'s> {
        let this = self.normalize();
        assert!(k > 0);
        assert!(k <= Self::K64);
        (this.offset..this.offset + this.len.saturating_sub(k - 1))
            .map(move |i| self.read_kmer(k, i) != 0)
    }

    #[inline(always)]
    pub fn par_iter_kmer_ambiguity(
        self,
        k: usize,
        context: usize,
        skip: usize,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(k > 0, "par_iter_kmers requires k>0, but k={k}");
        assert!(k <= 96, "par_iter_kmers requires k<=96, but k={k}");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let delay = k - 1;

        let it = self.par_iter_bp_delayed(context, Delay(delay));

        let mut cnt = u32x8::ZERO;

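        // Sliding-window count of ambiguous bases: the bit entering the window is added,
        // the window is reported as ambiguous while the count is non-zero, and the bit
        // that left the window (delivered `k - 1` positions later by the delayed stream)
        // is subtracted again.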
        it.map(
            #[inline(always)]
            move |(a, r)| {
                cnt += a;
                let out = cnt.cmp_gt(S::ZERO);
                cnt -= r;
                out
            },
        )
        .advance(skip)
    }

    #[inline(always)]
    pub fn par_iter_kmer_ambiguity_with_buf(
        self,
        k: usize,
        context: usize,
        skip: usize,
        buf: &'s mut Vec<S>,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(k > 0, "par_iter_kmers requires k>0, but k={k}");
        assert!(k <= 96, "par_iter_kmers requires k<=96, but k={k}");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let delay = k - 1;

        let it = self.par_iter_bp_delayed_with_buf(context, Delay(delay), buf);

        let mut cnt = u32x8::ZERO;

        it.map(
            #[inline(always)]
            move |(a, r)| {
                cnt += a;
                let out = cnt.cmp_gt(S::ZERO);
                cnt -= r;
                out
            },
        )
        .advance(skip)
    }
}

#[cfg(target_feature = "neon")]
mod neon {
    use core::arch::aarch64::*;

    #[inline(always)]
    pub fn movemask_64(v: uint8x16x4_t) -> u64 {
        unsafe {
            let acc = vsriq_n_u8(vsriq_n_u8(v.3, v.2, 1), vsriq_n_u8(v.1, v.0, 1), 2);
            vget_lane_u64(
                vreinterpret_u64_u8(vshrn_n_u16(
                    vreinterpretq_u16_u8(vsriq_n_u8(acc, acc, 4)),
                    4,
                )),
                0,
            )
        }
    }

    #[inline(always)]
    pub fn map_8x16x4<F>(v: uint8x16x4_t, mut f: F) -> uint8x16x4_t
    where
        F: FnMut(uint8x16_t) -> uint8x16_t,
    {
        uint8x16x4_t(f(v.0), f(v.1), f(v.2), f(v.3))
    }

    #[inline(always)]
    pub fn map_two_8x16x4<F>(v1: uint8x16x4_t, v2: uint8x16x4_t, mut f: F) -> uint8x16x4_t
    where
        F: FnMut(uint8x16_t, uint8x16_t) -> uint8x16_t,
    {
        uint8x16x4_t(f(v1.0, v2.0), f(v1.1, v2.1), f(v1.2, v2.2), f(v1.3, v2.3))
    }
}