use core::cell::RefCell;
use std::ops::{Deref, DerefMut};
use traits::Seq;
use wide::u16x8;

use crate::{intrinsics::transpose, padded_it::ChunkIt};

use super::*;

type SimdBuf = [S; 8];

struct RecycledBox(Option<Box<SimdBuf>>);

thread_local! {
    static RECYCLED_BOX_CACHE: RefCell<Vec<Box<SimdBuf>>> = {
        RefCell::new(vec![Box::new(SimdBuf::default())])
    };
}

impl RecycledBox {
    #[inline(always)]
    fn take() -> Self {
        let mut buf = RECYCLED_BOX_CACHE.with_borrow_mut(|v| RecycledBox(v.pop()));
        buf.init_if_needed();
        buf
    }

    #[inline(always)]
    fn init_if_needed(&mut self) {
        if self.0.is_none() {
            self.0 = Some(Box::new(SimdBuf::default()));
        }
    }

    #[inline(always)]
    fn get(&self) -> &SimdBuf {
        unsafe { self.0.as_ref().unwrap_unchecked() }
    }

    #[inline(always)]
    fn get_mut(&mut self) -> &mut SimdBuf {
        unsafe { self.0.as_mut().unwrap_unchecked() }
    }
}

impl Deref for RecycledBox {
    type Target = SimdBuf;

    #[inline(always)]
    fn deref(&self) -> &Self::Target {
        self.get()
    }
}
impl DerefMut for RecycledBox {
    #[inline(always)]
    fn deref_mut(&mut self) -> &mut SimdBuf {
        self.get_mut()
    }
}

impl Drop for RecycledBox {
    #[inline(always)]
    fn drop(&mut self) {
        let mut x = None;
        core::mem::swap(&mut x, &mut self.0);
        RECYCLED_BOX_CACHE.with_borrow_mut(|v| v.push(unsafe { x.unwrap_unchecked() }));
    }
}

struct RecycledVec(Vec<S>);

thread_local! {
    static RECYCLED_VEC_CACHE: RefCell<Vec<Vec<S>>> = {
        RefCell::new(vec![])
    };
}

impl RecycledVec {
    #[inline(always)]
    fn take() -> Self {
        RecycledVec(RECYCLED_VEC_CACHE.with_borrow_mut(|v| v.pop().unwrap_or_default()))
    }
}

impl Deref for RecycledVec {
    type Target = Vec<S>;
    #[inline(always)]
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
impl DerefMut for RecycledVec {
    #[inline(always)]
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}
impl Drop for RecycledVec {
    #[inline(always)]
    fn drop(&mut self) {
        RECYCLED_VEC_CACHE.with_borrow_mut(|v| v.push(std::mem::take(&mut self.0)));
    }
}
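
// Added illustration (not part of the original crate): the thread-local caches
// above let a dropped buffer's allocation be handed out again by the next
// `take()` on the same thread. A minimal sketch of that behavior:
#[cfg(test)]
mod recycle_tests {
    use super::*;

    #[test]
    fn recycled_buffers_are_reused() {
        // A RecycledBox dereferences to an [S; 8] scratch buffer.
        let boxed = RecycledBox::take();
        assert_eq!(boxed.len(), 8);
        drop(boxed);

        // A RecycledVec keeps its allocation across take/drop cycles.
        let mut v = RecycledVec::take();
        v.reserve(64);
        let cap = v.capacity();
        drop(v);
        let v2 = RecycledVec::take();
        assert!(v2.capacity() >= cap);
    }
}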

#[doc(hidden)]
pub struct Bits<const B: usize>;
#[doc(hidden)]
pub trait SupportedBits {}
impl SupportedBits for Bits<1> {}
impl SupportedBits for Bits<2> {}
impl SupportedBits for Bits<4> {}
impl SupportedBits for Bits<8> {}

pub(crate) const PADDING: usize = 48;

#[doc(hidden)]
#[derive(Copy, Clone, Debug, MemSize, MemDbg)]
pub struct PackedSeqBase<'s, const B: usize>
where
    Bits<B>: SupportedBits,
{
    seq: &'s [u8],
    offset: usize,
    len: usize,
}

#[doc(hidden)]
#[derive(Clone, Debug, MemSize, MemDbg)]
#[cfg_attr(feature = "pyo3", pyo3::pyclass)]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
pub struct PackedSeqVecBase<const B: usize>
where
    Bits<B>: SupportedBits,
{
    pub(crate) seq: Vec<u8>,
    len: usize,
}

pub type PackedSeq<'s> = PackedSeqBase<'s, 2>;
pub type PackedSeqVec = PackedSeqVecBase<2>;
pub type BitSeq<'s> = PackedSeqBase<'s, 1>;
pub type BitSeqVec = PackedSeqVecBase<1>;

impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    const CHAR_MASK: u64 = (1 << B) - 1;
    const SIMD_B: S = S::new([B as u32; 8]);
    const SIMD_CHAR_MASK: S = S::new([(1 << B) - 1; 8]);
    const C8: usize = 8 / B;
    const C32: usize = 32 / B;
    const C256: usize = 256 / B;
    const K64: usize = (64 - 8) / B + 1;
}
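
// Added illustration (not part of the original source): for the 2-bit DNA
// alphabet the derived constants above work out to 4 bases per byte, 16 per
// u32 lane, 128 per 256-bit register, and k-mers of up to 29 bases per
// unaligned u64 read. A small sanity check of that arithmetic:
#[cfg(test)]
mod const_tests {
    use super::*;

    #[test]
    fn two_bit_capacities() {
        assert_eq!(PackedSeq::C8, 4);
        assert_eq!(PackedSeq::C32, 16);
        assert_eq!(PackedSeq::C256, 128);
        assert_eq!(PackedSeq::K64, (64 - 8) / 2 + 1); // = 29
    }
}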

impl<const B: usize> PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    const C8: usize = 8 / B;
}

impl<const B: usize> Default for PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    fn default() -> Self {
        Self {
            seq: vec![0; PADDING],
            len: 0,
        }
    }
}

#[inline(always)]
pub fn pack_char(base: u8) -> u8 {
    match base {
        b'a' | b'A' => 0,
        b'c' | b'C' => 1,
        b'g' | b'G' => 3,
        b't' | b'T' => 2,
        _ => panic!(
            "Unexpected character '{}' with ASCII value {base}. Expected one of ACTGactg.",
            base as char
        ),
    }
}

#[inline(always)]
pub fn pack_char_lossy(base: u8) -> u8 {
    (base >> 1) & 3
}
#[inline(always)]
pub fn pack_kmer_lossy(slice: &[u8]) -> u64 {
    let mut kmer = 0;
    for (i, &base) in slice.iter().enumerate() {
        kmer |= (pack_char_lossy(base) as u64) << (2 * i);
    }
    kmer
}
#[inline(always)]
pub fn pack_kmer_u128_lossy(slice: &[u8]) -> u128 {
    let mut kmer = 0;
    for (i, &base) in slice.iter().enumerate() {
        kmer |= (pack_char_lossy(base) as u128) << (2 * i);
    }
    kmer
}
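
// Added example (not part of the original source): a quick check of the lossy
// 2-bit packing above. `pack_char_lossy` maps A/a -> 0, C/c -> 1, T/t -> 2,
// G/g -> 3 from bits 1-2 of the ASCII code, and `pack_kmer_lossy` stores base
// i in bits 2*i..2*i+2, so "ACGT" packs to 0b10_11_01_00 = 0xB4.
#[cfg(test)]
mod pack_tests {
    use super::*;

    #[test]
    fn pack_examples() {
        assert_eq!(pack_char(b'G'), 3);
        assert_eq!(pack_char_lossy(b'g'), 3);
        assert_eq!(pack_kmer_lossy(b"ACGT"), 0xB4);
        assert_eq!(pack_kmer_u128_lossy(b"ACGT"), 0xB4);
    }
}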

#[inline(always)]
pub fn unpack_base(base: u8) -> u8 {
    debug_assert!(base < 4, "Base {base} is not <4.");
    b"ACTG"[base as usize]
}

#[inline(always)]
pub fn unpack_kmer(kmer: u64) -> [u8; 32] {
    std::array::from_fn(|i| unpack_base(((kmer >> (2 * i)) & 3) as u8))
}
#[inline(always)]
pub fn unpack_kmer_into_vec(kmer: u64, k: usize, out: &mut Vec<u8>) {
    out.clear();
    out.extend((0..k).map(|i| unpack_base(((kmer >> (2 * i)) & 3) as u8)));
}
#[inline(always)]
pub fn unpack_kmer_to_vec(kmer: u64, k: usize) -> Vec<u8> {
    let mut out = vec![];
    unpack_kmer_into_vec(kmer, k, &mut out);
    out
}

#[inline(always)]
pub fn unpack_kmer_u128(kmer: u128) -> [u8; 64] {
    std::array::from_fn(|i| unpack_base(((kmer >> (2 * i)) & 3) as u8))
}
#[inline(always)]
pub fn unpack_kmer_u128_into_vec(kmer: u128, k: usize, out: &mut Vec<u8>) {
    out.clear();
    out.extend((0..k).map(|i| unpack_base(((kmer >> (2 * i)) & 3) as u8)));
}
#[inline(always)]
pub fn unpack_kmer_u128_to_vec(kmer: u128, k: usize) -> Vec<u8> {
    let mut out = vec![];
    unpack_kmer_u128_into_vec(kmer, k, &mut out);
    out
}
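
// Added example (not part of the original source): packing followed by
// unpacking returns the original uppercase sequence, for both the u64 and the
// u128 variants.
#[cfg(test)]
mod unpack_tests {
    use super::*;

    #[test]
    fn pack_unpack_roundtrip() {
        let kmer = pack_kmer_lossy(b"GATTACA");
        assert_eq!(unpack_kmer_to_vec(kmer, 7), b"GATTACA".to_vec());
        assert_eq!(&unpack_kmer(kmer)[..7], b"GATTACA");

        let kmer = pack_kmer_u128_lossy(b"GATTACA");
        assert_eq!(unpack_kmer_u128_to_vec(kmer, 7), b"GATTACA".to_vec());
    }
}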

#[inline(always)]
pub const fn complement_char(base: u8) -> u8 {
    match base {
        b'A' => b'T',
        b'C' => b'G',
        b'G' => b'C',
        b'T' => b'A',
        _ => panic!("Unexpected character. Expected one of ACTGactg."),
    }
}

#[inline(always)]
pub const fn complement_base(base: u8) -> u8 {
    base ^ 2
}

#[inline(always)]
pub fn complement_base_simd(base: u32x8) -> u32x8 {
    const TWO: u32x8 = u32x8::new([2; 8]);
    base ^ TWO
}

#[inline(always)]
const fn revcomp_raw(word: u64) -> u64 {
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    {
        let mut res = word.reverse_bits();
        res = ((res >> 1) & 0x5555_5555_5555_5555) | ((res & 0x5555_5555_5555_5555) << 1);
        res ^ 0xAAAA_AAAA_AAAA_AAAA
    }

    #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
    {
        let mut res = word.swap_bytes();
        res = ((res >> 4) & 0x0F0F_0F0F_0F0F_0F0F) | ((res & 0x0F0F_0F0F_0F0F_0F0F) << 4);
        res = ((res >> 2) & 0x3333_3333_3333_3333) | ((res & 0x3333_3333_3333_3333) << 2);
        res ^ 0xAAAA_AAAA_AAAA_AAAA
    }
}

#[inline(always)]
pub const fn revcomp_u64(word: u64, len: usize) -> u64 {
    revcomp_raw(word) >> (u64::BITS as usize - 2 * len)
}

#[inline(always)]
pub const fn revcomp_u128(word: u128, len: usize) -> u128 {
    let low = word as u64;
    let high = (word >> 64) as u64;
    let rlow = revcomp_raw(low);
    let rhigh = revcomp_raw(high);
    let out = ((rlow as u128) << 64) | rhigh as u128;
    out >> (u128::BITS as usize - 2 * len)
}
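
// Added example (not part of the original source): `complement_base` flips
// A<->T and C<->G in the 2-bit encoding, and `revcomp_u64`/`revcomp_u128`
// reverse-complement a packed k-mer. E.g. the reverse complement of AACG is
// CGTT.
#[cfg(test)]
mod revcomp_tests {
    use super::*;

    #[test]
    fn revcomp_examples() {
        assert_eq!(complement_base(pack_char_lossy(b'A')), pack_char_lossy(b'T'));
        assert_eq!(complement_base(pack_char_lossy(b'C')), pack_char_lossy(b'G'));
        assert_eq!(
            revcomp_u64(pack_kmer_lossy(b"AACG"), 4),
            pack_kmer_lossy(b"CGTT")
        );
        assert_eq!(
            revcomp_u128(pack_kmer_u128_lossy(b"AACG"), 4),
            pack_kmer_u128_lossy(b"CGTT")
        );
    }
}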

#[inline(always)]
pub fn char_is_ambiguous(base: u8) -> u8 {
    let table = b"ACTG";
    let upper_mask = !(b'a' - b'A');
    (table[pack_char_lossy(base) as usize] != (base & upper_mask)) as u8
}
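
// Added example (not part of the original source): only upper- or lowercase
// ACGT map to 0; every other byte (e.g. 'N' or 'n') is reported as ambiguous.
#[cfg(test)]
mod ambiguity_tests {
    use super::*;

    #[test]
    fn ambiguity_examples() {
        assert_eq!(char_is_ambiguous(b'A'), 0);
        assert_eq!(char_is_ambiguous(b'a'), 0);
        assert_eq!(char_is_ambiguous(b'N'), 1);
        assert_eq!(char_is_ambiguous(b'n'), 1);
    }
}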

#[inline(always)]
pub const fn rev_u64(word: u64, len: usize) -> u64 {
    word.reverse_bits() >> (u64::BITS as usize - len)
}

#[inline(always)]
pub const fn rev_u128(word: u128, len: usize) -> u128 {
    word.reverse_bits() >> (u128::BITS as usize - len)
}
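
// Added example (not part of the original source): `rev_u64`/`rev_u128`
// reverse the order of the low `len` bits of a 1-bit packed sequence.
#[cfg(test)]
mod rev_tests {
    use super::*;

    #[test]
    fn rev_examples() {
        assert_eq!(rev_u64(0b001, 3), 0b100);
        assert_eq!(rev_u64(0b110, 3), 0b011);
        assert_eq!(rev_u128(0b001, 3), 0b100);
    }
}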

impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    pub fn from_raw_parts(seq: &'s [u8], offset: usize, len: usize) -> Self {
        assert!(offset + len + PADDING * Self::C8 <= seq.len() * Self::C8);
        Self { seq, offset, len }
    }

    #[inline(always)]
    pub fn normalize(&self) -> Self {
        let start_byte = self.offset / Self::C8;
        let end_byte = (self.offset + self.len).div_ceil(Self::C8);
        Self {
            seq: &self.seq[start_byte..end_byte + PADDING],
            offset: self.offset % Self::C8,
            len: self.len,
        }
    }

    #[inline(always)]
    pub fn unpack(&self) -> Vec<u8> {
        self.iter_bp().map(unpack_base).collect()
    }
}
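
// Added example (not part of the original source): constructing a 2-bit view
// over a manually packed buffer. "ACGT" packs into the single byte 0xB4, and
// `from_raw_parts` requires PADDING extra bytes behind the data.
#[cfg(test)]
mod raw_parts_tests {
    use super::*;

    #[test]
    fn from_raw_parts_and_unpack() {
        let mut raw = vec![0xB4u8];
        raw.resize(1 + PADDING, 0);
        let s = PackedSeq::from_raw_parts(&raw, 0, 4);
        assert_eq!(s.unpack(), b"ACGT".to_vec());
        assert_eq!(s.slice(1..3).unpack(), b"CG".to_vec());
    }
}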

#[inline(always)]
pub(crate) unsafe fn read_slice_32_unchecked(seq: &[u8], idx: usize) -> u32x8 {
    unsafe {
        debug_assert!(idx + 32 <= seq.len());
        let src = seq.as_ptr().add(idx);
        std::mem::transmute::<_, *const u32x8>(src).read_unaligned()
    }
}

#[inline(always)]
pub(crate) fn read_slice_32(seq: &[u8], idx: usize) -> u32x8 {
    unsafe {
        let src = seq.as_ptr().add(idx);
        if idx + 32 <= seq.len() {
            std::mem::transmute::<_, *const u32x8>(src).read_unaligned()
        } else {
            let num_bytes = seq.len().saturating_sub(idx);
            let mut result = [0u8; 32];
            std::ptr::copy_nonoverlapping(src, result.as_mut_ptr(), num_bytes);
            std::mem::transmute(result)
        }
    }
}

#[allow(unused)]
#[inline(always)]
pub(crate) fn read_slice_16(seq: &[u8], idx: usize) -> u16x8 {
    unsafe {
        let src = seq.as_ptr().add(idx);
        if idx + 16 <= seq.len() {
            std::mem::transmute::<_, *const u16x8>(src).read_unaligned()
        } else {
            let num_bytes = seq.len().saturating_sub(idx);
            let mut result = [0u8; 16];
            std::ptr::copy_nonoverlapping(src, result.as_mut_ptr(), num_bytes);
            std::mem::transmute(result)
        }
    }
}
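
// Added example (not part of the original source): the checked readers
// zero-pad reads that would run past the end of the slice, so the first lane
// below holds the three 0xAA bytes (little-endian) and all other lanes are
// zero. Assumes `wide::u32x8::to_array` for inspecting the lanes.
#[cfg(test)]
mod read_slice_tests {
    use super::*;

    #[test]
    fn read_slice_32_zero_pads() {
        let bytes = [0xAAu8; 3];
        let lanes = read_slice_32(&bytes, 0).to_array();
        assert_eq!(lanes[0], 0x00AA_AAAA);
        assert_eq!(lanes[1..], [0u32; 7]);
    }
}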

impl<'s, const B: usize> Seq<'s> for PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    const BITS_PER_CHAR: usize = B;
    const BASES_PER_BYTE: usize = Self::C8;
    type SeqVec = PackedSeqVecBase<B>;

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn get_ascii(&self, index: usize) -> u8 {
        unpack_base(self.get(index))
    }

    #[inline(always)]
    fn as_u64(&self) -> u64 {
        assert!(self.len() <= 64 / B);

        let mask = u64::MAX >> (64 - B * self.len());

        if self.len() <= Self::K64 {
            let x = unsafe { (self.seq.as_ptr() as *const u64).read_unaligned() };
            (x >> (B * self.offset)) & mask
        } else {
            let x = unsafe { (self.seq.as_ptr() as *const u128).read_unaligned() };
            (x >> (B * self.offset)) as u64 & mask
        }
    }

    #[inline(always)]
    fn revcomp_as_u64(&self) -> u64 {
        match B {
            1 => rev_u64(self.as_u64(), self.len()),
            2 => revcomp_u64(self.as_u64(), self.len()),
            _ => panic!("Rev(comp) is only supported for 1-bit and 2-bit alphabets."),
        }
    }

    #[inline(always)]
    fn as_u128(&self) -> u128 {
        assert!(
            self.len() <= (128 - 8) / B + 1,
            "Sequences longer than {} characters cannot be read with a single unaligned u128 read.",
            (128 - 8) / B + 1
        );

        let mask = u128::MAX >> (128 - B * self.len());

        let x = unsafe { (self.seq.as_ptr() as *const u128).read_unaligned() };
        (x >> (B * self.offset)) & mask
    }

    #[inline(always)]
    fn revcomp_as_u128(&self) -> u128 {
        match B {
            1 => rev_u128(self.as_u128(), self.len()),
            2 => revcomp_u128(self.as_u128(), self.len()),
            _ => panic!("Rev(comp) is only supported for 1-bit and 2-bit alphabets."),
        }
    }

    #[inline(always)]
    fn to_vec(&self) -> PackedSeqVecBase<B> {
        assert_eq!(self.offset, 0);
        PackedSeqVecBase {
            seq: self
                .seq
                .iter()
                .copied()
                .chain(std::iter::repeat_n(0u8, PADDING))
                .collect(),
            len: self.len,
        }
    }

    fn to_revcomp(&self) -> PackedSeqVecBase<B> {
        match B {
            1 | 2 => {}
            _ => panic!("Can only reverse (&complement) 1-bit and 2-bit packed sequences."),
        }

        let mut seq = self.seq[..(self.offset + self.len).div_ceil(Self::C8)]
            .iter()
            .rev()
            .copied()
            .map(|mut res| {
                match B {
                    2 => {
                        res = ((res >> 4) & 0x0F) | ((res & 0x0F) << 4);
                        res = ((res >> 2) & 0x33) | ((res & 0x33) << 2);
                        res ^ 0xAA
                    }
                    1 => res.reverse_bits(),
                    _ => unreachable!(),
                }
            })
            .chain(std::iter::repeat_n(0u8, PADDING))
            .collect::<Vec<u8>>();

        let new_offset = (Self::C8 - (self.offset + self.len) % Self::C8) % Self::C8;

        if new_offset > 0 {
            let shift = B * new_offset;
            *seq.last_mut().unwrap() >>= shift;
            for i in 0..seq.len() - 1 {
                seq[i] = (seq[i] >> shift) | (seq[i + 1] << (8 - shift));
            }
        }

        PackedSeqVecBase { seq, len: self.len }
    }

    #[inline(always)]
    fn slice(&self, range: Range<usize>) -> Self {
        debug_assert!(
            range.end <= self.len,
            "Slice index out of bounds: {} > {}",
            range.end,
            self.len
        );
        PackedSeqBase {
            seq: self.seq,
            offset: self.offset + range.start,
            len: range.end - range.start,
        }
        .normalize()
    }

    #[inline(always)]
    fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> {
        assert!(self.len <= self.seq.len() * Self::C8);

        let this = self.normalize();

        let mut byte = 0;
        (0..this.len + this.offset)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C8 == 0 {
                        byte = this.seq[i / Self::C8];
                    }
                    (byte >> (B * (i % Self::C8))) & Self::CHAR_MASK as u8
                },
            )
            .advance(this.offset)
    }

    #[inline(always)]
    fn par_iter_bp(self, context: usize) -> PaddedIt<impl ChunkIt<S>> {
        self.par_iter_bp_with_buf(context, RecycledBox::take())
    }

    #[inline(always)]
    fn par_iter_bp_delayed(self, context: usize, delay: Delay) -> PaddedIt<impl ChunkIt<(S, S)>> {
        self.par_iter_bp_delayed_with_factor(context, delay, 1)
    }

    #[inline(always)]
    fn par_iter_bp_delayed_2(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(
            context,
            delay1,
            delay2,
            1,
            RecycledVec::take(),
        )
    }

    fn cmp_lcp(&self, other: &Self) -> (std::cmp::Ordering, usize) {
        let mut lcp = 0;
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(Self::K64) {
            let len = (min_len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() as usize / B * B;
                lcp += t / B;
                let mask = (Self::CHAR_MASK) << t;
                return ((this_word & mask).cmp(&(other_word & mask)), lcp);
            }
            lcp += len;
        }
        (self.len.cmp(&other.len), lcp)
    }

    #[inline(always)]
    fn get(&self, index: usize) -> u8 {
        let offset = self.offset + index;
        let idx = offset / Self::C8;
        let offset = offset % Self::C8;
        (self.seq[idx] >> (B * offset)) & Self::CHAR_MASK as u8
    }
}
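
// Added example (not part of the original source): a small end-to-end sketch
// of the `Seq` API on a 2-bit packed sequence built with `push_ascii` (defined
// further below in the `SeqVec` impl).
#[cfg(test)]
mod seq_api_tests {
    use super::*;

    #[test]
    fn seq_accessors() {
        let mut v = PackedSeqVec::default();
        v.push_ascii(b"AACGTT");
        let s = v.as_slice();
        assert_eq!(s.len(), 6);
        assert_eq!(s.get_ascii(2), b'C');
        // The first four bases as a packed u64, and their reverse complement.
        assert_eq!(s.slice(0..4).as_u64(), pack_kmer_lossy(b"AACG"));
        assert_eq!(s.slice(0..4).revcomp_as_u64(), pack_kmer_lossy(b"CGTT"));
        // Unpacking restores the ASCII sequence.
        assert_eq!(s.unpack(), b"AACGTT".to_vec());
    }
}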

impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    #[inline(always)]
    pub fn par_iter_bp_with_buf<BUF: DerefMut<Target = [S; 8]>>(
        self,
        context: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut cur = S::ZERO;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            *buf = transpose(data);
                        }
                        cur = buf[(i % Self::C256) / Self::C32];
                    }
                    let chars = cur & Self::SIMD_CHAR_MASK;
                    cur = cur >> Self::SIMD_B;
                    chars
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_with_factor(
        self,
        context: usize,
        delay: Delay,
        factor: usize,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B>> {
        self.par_iter_bp_delayed_with_factor_and_buf(context, delay, factor, RecycledVec::take())
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_with_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        delay: Delay,
        buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B, BUF>> {
        self.par_iter_bp_delayed_with_factor_and_buf(context, delay, 1, buf)
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_with_factor_and_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        Delay(delay): Delay,
        factor: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(
            delay < usize::MAX / 2,
            "Delay={} should be >=0.",
            delay as isize
        );

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride
            .div_ceil(L)
            .next_multiple_of(factor * Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d = S::ZERO;

        let buf_len = (delay / Self::C32 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        if buf.capacity() < buf_len {
            *buf = vec![S::ZERO; buf_len];
        } else {
            buf.clear();
            buf.resize(buf_len, S::ZERO);
        }

        let mut write_idx = 0;
        let mut read_idx = (buf_len - delay / Self::C32) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            unsafe {
                                *TryInto::<&mut [u32x8; 8]>::try_into(
                                    buf.get_unchecked_mut(write_idx..write_idx + 8),
                                )
                                .unwrap_unchecked() = transpose(data);
                            }
                            if i == 0 {
                                let elem = !((1u32 << (B * o)) - 1);
                                let mask = S::splat(elem);
                                unsafe { assert_unchecked(write_idx < buf.len()) };
                                buf[write_idx] &= mask;
                            }
                        }
                        unsafe { assert_unchecked(write_idx < buf.len()) };
                        upcoming = buf[write_idx];
                        write_idx += 1;
                        write_idx &= buf_mask;
                    }
                    if i % Self::C32 == delay % Self::C32 {
                        unsafe { assert_unchecked(read_idx < buf.len()) };
                        upcoming_d = buf[read_idx];
                        read_idx += 1;
                        read_idx &= buf_mask;
                    }
                    let chars = upcoming & Self::SIMD_CHAR_MASK;
                    let chars_d = upcoming_d & Self::SIMD_CHAR_MASK;
                    upcoming = upcoming >> Self::SIMD_B;
                    upcoming_d = upcoming_d >> Self::SIMD_B;
                    (chars, chars_d)
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_factor(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
        factor: usize,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(
            context,
            delay1,
            delay2,
            factor,
            RecycledVec::take(),
        )
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
        buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B, BUF>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(context, delay1, delay2, 1, buf)
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_factor_and_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        Delay(delay1): Delay,
        Delay(delay2): Delay,
        factor: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);
        assert!(delay1 <= delay2, "Delay1 must be at most delay2.");

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride
            .div_ceil(L)
            .next_multiple_of(factor * Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d1 = S::ZERO;
        let mut upcoming_d2 = S::ZERO;

        let buf_len = (delay2 / Self::C32 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        if buf.capacity() < buf_len {
            *buf = vec![S::ZERO; buf_len];
        } else {
            buf.clear();
            buf.resize(buf_len, S::ZERO);
        }

        let mut write_idx = 0;
        let mut read_idx1 = (buf_len - delay1 / Self::C32) % buf_len;
        let mut read_idx2 = (buf_len - delay2 / Self::C32) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            unsafe {
                                *TryInto::<&mut [u32x8; 8]>::try_into(
                                    buf.get_unchecked_mut(write_idx..write_idx + 8),
                                )
                                .unwrap_unchecked() = transpose(data);
                            }
                            if i == 0 {
                                let elem = !((1u32 << (B * o)) - 1);
                                let mask = S::splat(elem);
                                buf[write_idx] &= mask;
                            }
                        }
                        upcoming = buf[write_idx];
                        write_idx += 1;
                        write_idx &= buf_mask;
                    }
                    if i % Self::C32 == delay1 % Self::C32 {
                        unsafe { assert_unchecked(read_idx1 < buf.len()) };
                        upcoming_d1 = buf[read_idx1];
                        read_idx1 += 1;
                        read_idx1 &= buf_mask;
                    }
                    if i % Self::C32 == delay2 % Self::C32 {
                        unsafe { assert_unchecked(read_idx2 < buf.len()) };
                        upcoming_d2 = buf[read_idx2];
                        read_idx2 += 1;
                        read_idx2 &= buf_mask;
                    }
                    let chars = upcoming & Self::SIMD_CHAR_MASK;
                    let chars_d1 = upcoming_d1 & Self::SIMD_CHAR_MASK;
                    let chars_d2 = upcoming_d2 & Self::SIMD_CHAR_MASK;
                    upcoming = upcoming >> Self::SIMD_B;
                    upcoming_d1 = upcoming_d1 >> Self::SIMD_B;
                    upcoming_d2 = upcoming_d2 >> Self::SIMD_B;
                    (chars, chars_d1, chars_d2)
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }
}

impl<const B: usize> PartialEq for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    fn eq(&self, other: &Self) -> bool {
        if self.len != other.len {
            return false;
        }
        for i in (0..self.len).step_by(Self::K64) {
            let len = (self.len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let that = other.slice(i..i + len);
            if this.as_u64() != that.as_u64() {
                return false;
            }
        }
        true
    }
}

impl<const B: usize> Eq for PackedSeqBase<'_, B> where Bits<B>: SupportedBits {}

impl<const B: usize> PartialOrd for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl<const B: usize> Ord for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(Self::K64) {
            let len = (min_len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() as usize / B * B;
                let mask = (Self::CHAR_MASK) << t;
                return (this_word & mask).cmp(&(other_word & mask));
            }
        }
        self.len.cmp(&other.len)
    }
}
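
// Added example (not part of the original source): lexicographic comparison
// works directly on the packed representation, and `cmp_lcp` additionally
// reports the length of the longest common prefix.
#[cfg(test)]
mod ord_tests {
    use super::*;

    #[test]
    fn ordering_and_lcp() {
        let mut a = PackedSeqVec::default();
        a.push_ascii(b"ACGTA");
        let mut b = PackedSeqVec::default();
        b.push_ascii(b"ACGTC");

        assert!(a.as_slice() < b.as_slice());
        let (ord, lcp) = a.as_slice().cmp_lcp(&b.as_slice());
        assert_eq!(ord, std::cmp::Ordering::Less);
        assert_eq!(lcp, 4);
    }
}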

impl<const B: usize> SeqVec for PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    type Seq<'s> = PackedSeqBase<'s, B>;

    #[inline(always)]
    fn into_raw(mut self) -> Vec<u8> {
        self.seq.resize(self.len.div_ceil(Self::C8), 0);
        self.seq
    }

    #[inline(always)]
    fn as_slice(&self) -> Self::Seq<'_> {
        PackedSeqBase {
            seq: &self.seq[..self.len.div_ceil(Self::C8) + PADDING],
            offset: 0,
            len: self.len,
        }
    }

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn clear(&mut self) {
        self.seq.clear();
        self.len = 0;
    }

    fn push_seq<'a>(&mut self, seq: PackedSeqBase<'_, B>) -> Range<usize> {
        let start = self.len.next_multiple_of(Self::C8) + seq.offset;
        let end = start + seq.len();

        self.seq.resize(self.len.div_ceil(Self::C8), 0);
        self.seq.extend(seq.seq);
        self.len = end;
        start..end
    }

    fn push_ascii(&mut self, seq: &[u8]) -> Range<usize> {
        match B {
            1 | 2 => {}
            _ => panic!(
                "Can only use ASCII input for 2-bit DNA packing, or 1-bit ambiguous indicators."
            ),
        }

        self.seq
            .resize((self.len + seq.len()).div_ceil(Self::C8) + PADDING, 0);
        let start_aligned = self.len.next_multiple_of(Self::C8);
        let start = self.len;
        let len = seq.len();
        let mut idx = self.len / Self::C8;

        let parse_base = |base| match B {
            1 => char_is_ambiguous(base),
            2 => pack_char_lossy(base),
            _ => unreachable!(),
        };

        let unaligned = core::cmp::min(start_aligned - start, len);
        if unaligned > 0 {
            let mut packed_byte = self.seq[idx];
            for &base in &seq[..unaligned] {
                packed_byte |= parse_base(base) << ((self.len % Self::C8) * B);
                self.len += 1;
            }
            self.seq[idx] = packed_byte;
            idx += 1;
        }

        #[allow(unused)]
        let mut last = unaligned;

        if B == 2 {
            #[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
            {
                last = unaligned + (len - unaligned) / 8 * 8;

                for i in (unaligned..last).step_by(8) {
                    let chunk =
                        unsafe { seq.get_unchecked(i..i + 8).try_into().unwrap_unchecked() };
                    let ascii = u64::from_le_bytes(chunk);
                    let packed_bytes =
                        unsafe { std::arch::x86_64::_pext_u64(ascii, 0x0606060606060606) } as u16;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 2))
                            .copy_from_slice(&packed_bytes.to_le_bytes())
                    };
                    idx += 2;
                    self.len += 8;
                }
            }

            #[cfg(target_feature = "neon")]
            {
                use core::arch::aarch64::{
                    vandq_u8, vdup_n_u8, vld1q_u8, vpadd_u8, vshlq_u8, vst1_u8,
                };
                use core::mem::transmute;

                last = unaligned + (len - unaligned) / 16 * 16;

                for i in (unaligned..last).step_by(16) {
                    unsafe {
                        let ascii = vld1q_u8(seq.as_ptr().add(i));
                        let masked_bits = vandq_u8(ascii, transmute([6i8; 16]));
                        let (bits_0, bits_1) = transmute(vshlq_u8(
                            masked_bits,
                            transmute([-1i8, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5]),
                        ));
                        let half_packed = vpadd_u8(bits_0, bits_1);
                        let packed = vpadd_u8(half_packed, vdup_n_u8(0));
                        vst1_u8(self.seq.as_mut_ptr().add(idx), packed);
                        idx += Self::C8;
                        self.len += 16;
                    }
                }
            }
        }

        if B == 1 {
            #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
            {
                last = len;
                self.len += len - unaligned;

                let mut last_i = unaligned;

                for i in (unaligned..last).step_by(32) {
                    use std::mem::transmute as t;

                    use wide::CmpEq;
                    type S = wide::i8x32;
                    let chars: S = unsafe { t(read_slice_32(seq, i)) };
                    let upper_mask = !(b'a' - b'A');
                    let chars = chars & S::splat(upper_mask as i8);
                    let lossy_encoded = chars & S::splat(6);
                    let table = unsafe { S::from(t::<_, S>(*b"AxCxTxGxxxxxxxxxAxCxTxGxxxxxxxxx")) };
                    let lookup: S = unsafe {
                        t(std::arch::x86_64::_mm256_shuffle_epi8(
                            t(table),
                            t(lossy_encoded),
                        ))
                    };
                    let packed_bytes = !(chars.cmp_eq(lookup).move_mask() as u32);

                    last_i = i;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 4))
                            .copy_from_slice(&packed_bytes.to_le_bytes())
                    };
                    idx += 4;
                }

                if unaligned < last {
                    idx -= 4;
                    let mut val = unsafe {
                        u32::from_le_bytes(
                            self.seq
                                .get_unchecked(idx..(idx + 4))
                                .try_into()
                                .unwrap_unchecked(),
                        )
                    };
                    let keep = last - last_i;
                    val <<= 32 - keep;
                    val >>= 32 - keep;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 4))
                            .copy_from_slice(&val.to_le_bytes())
                    };
                    idx += keep.div_ceil(8);
                }
            }

            #[cfg(target_feature = "neon")]
            {
                use core::arch::aarch64::*;
                use core::mem::transmute;

                last = unaligned + (len - unaligned) / 64 * 64;

                for i in (unaligned..last).step_by(64) {
                    unsafe {
                        let ptr = seq.as_ptr().add(i);
                        let chars = vld4q_u8(ptr);

                        let upper_mask = vdupq_n_u8(!(b'a' - b'A'));
                        let chars = neon::map_8x16x4(chars, |v| vandq_u8(v, upper_mask));

                        let two_bits_mask = vdupq_n_u8(6);
                        let lossy_encoded = neon::map_8x16x4(chars, |v| vandq_u8(v, two_bits_mask));

                        let table = transmute(*b"AxCxTxGxxxxxxxxx");
                        let lookup = neon::map_8x16x4(lossy_encoded, |v| vqtbl1q_u8(table, v));

                        let mask = neon::map_two_8x16x4(chars, lookup, |v1, v2| vceqq_u8(v1, v2));
                        let packed_bytes = !neon::movemask_64(mask);

                        self.seq[idx..(idx + 8)].copy_from_slice(&packed_bytes.to_le_bytes());
                        idx += 8;
                        self.len += 64;
                    }
                }
            }
        }

        let mut packed_byte = 0;
        for &base in &seq[last..] {
            packed_byte |= parse_base(base) << ((self.len % Self::C8) * B);
            self.len += 1;
            if self.len % Self::C8 == 0 {
                self.seq[idx] = packed_byte;
                idx += 1;
                packed_byte = 0;
            }
        }
        if self.len % Self::C8 != 0 && last < len {
            self.seq[idx] = packed_byte;
            idx += 1;
        }
        assert_eq!(idx + PADDING, self.seq.len());
        start..start + len
    }

    #[cfg(feature = "rand")]
    fn random(n: usize) -> Self {
        use rand::{RngCore, SeedableRng};

        let byte_len = n.div_ceil(Self::C8);
        let mut seq = vec![0; byte_len + PADDING];
        rand::rngs::SmallRng::from_os_rng().fill_bytes(&mut seq[..byte_len]);
        if n % Self::C8 != 0 {
            seq[byte_len - 1] &= (1 << (B * (n % Self::C8))) - 1;
        }

        Self { seq, len: n }
    }
}
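
// Added example (not part of the original source): `push_ascii` packs DNA into
// 2-bit codes for `PackedSeqVec`, and into 1-bit "is ambiguous" flags for
// `BitSeqVec`.
#[cfg(test)]
mod push_ascii_tests {
    use super::*;

    #[test]
    fn push_ascii_roundtrip() {
        let mut v = PackedSeqVec::default();
        v.push_ascii(b"ACGTTGCA");
        assert_eq!(v.len(), 8);
        assert_eq!(v.as_slice().unpack(), b"ACGTTGCA".to_vec());
    }

    #[test]
    fn push_ascii_ambiguity_bits() {
        let mut v = BitSeqVec::default();
        v.push_ascii(b"ACGNNACG");
        assert_eq!(v.as_slice().get(0), 0);
        assert_eq!(v.as_slice().get(3), 1);
        assert_eq!(v.as_slice().get(4), 1);
        assert_eq!(v.as_slice().get(5), 0);
    }
}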

impl<const B: usize> PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    pub fn from_raw_parts(mut seq: Vec<u8>, len: usize) -> Self {
        assert!(len <= seq.len() * Self::C8);
        seq.resize(len.div_ceil(Self::C8) + PADDING, 0);
        Self { seq, len }
    }
}
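
// Added example (not part of the original source): `from_raw_parts` takes an
// already-packed byte buffer plus the sequence length and resizes it so the
// required padding is present.
#[cfg(test)]
mod vec_raw_parts_tests {
    use super::*;

    #[test]
    fn vec_from_raw_parts() {
        let v = PackedSeqVec::from_raw_parts(vec![0xB4], 4);
        assert_eq!(v.len(), 4);
        assert_eq!(v.as_slice().unpack(), b"ACGT".to_vec());
    }
}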

impl PackedSeqVecBase<1> {
    pub fn with_len(n: usize) -> Self {
        Self {
            seq: vec![0; n.div_ceil(Self::C8) + PADDING],
            len: n,
        }
    }

    pub fn random(len: usize, n_frac: f32) -> Self {
        let byte_len = len.div_ceil(Self::C8);
        let mut seq = vec![0; byte_len + PADDING];

        assert!(
            (0.0..=0.3).contains(&n_frac),
            "n_frac={} should be in [0, 0.3]",
            n_frac
        );

        for _ in 0..(len as f32 * n_frac) as usize {
            let idx = rand::random::<u64>() as usize % len;
            let byte = idx / Self::C8;
            let offset = idx % Self::C8;
            seq[byte] |= 1 << offset;
        }

        Self { seq, len }
    }
}

impl<'s> PackedSeqBase<'s, 1> {
    #[inline(always)]
    pub fn iter_kmer_ambiguity(self, k: usize) -> impl ExactSizeIterator<Item = bool> + use<'s> {
        let this = self.normalize();
        assert!(k > 0);
        assert!(k <= Self::K64);
        (this.offset..this.offset + this.len.saturating_sub(k - 1))
            .map(move |i| self.read_kmer(k, i) != 0)
    }

    #[inline(always)]
    pub fn par_iter_kmer_ambiguity(
        self,
        k: usize,
        context: usize,
        skip: usize,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(k > 0, "par_iter_kmers requires k>0, but k={k}");
        assert!(k <= 96, "par_iter_kmers requires k<=96, but k={k}");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let delay = k - 1;

        let it = self.par_iter_bp_delayed(context, Delay(delay));

        let mut cnt = u32x8::ZERO;

        it.map(
            #[inline(always)]
            move |(a, r)| {
                cnt += a;
                let out = cnt.cmp_gt(S::ZERO);
                cnt -= r;
                out
            },
        )
        .advance(skip)
    }

    #[inline(always)]
    pub fn par_iter_kmer_ambiguity_with_buf(
        self,
        k: usize,
        context: usize,
        skip: usize,
        buf: &'s mut Vec<S>,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(k > 0, "par_iter_kmers requires k>0, but k={k}");
        assert!(k <= 96, "par_iter_kmers requires k<=96, but k={k}");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let delay = k - 1;

        let it = self.par_iter_bp_delayed_with_buf(context, Delay(delay), buf);

        let mut cnt = u32x8::ZERO;

        it.map(
            #[inline(always)]
            move |(a, r)| {
                cnt += a;
                let out = cnt.cmp_gt(S::ZERO);
                cnt -= r;
                out
            },
        )
        .advance(skip)
    }
}

#[cfg(target_feature = "neon")]
mod neon {
    use core::arch::aarch64::*;

    #[inline(always)]
    pub fn movemask_64(v: uint8x16x4_t) -> u64 {
        unsafe {
            let acc = vsriq_n_u8(vsriq_n_u8(v.3, v.2, 1), vsriq_n_u8(v.1, v.0, 1), 2);
            vget_lane_u64(
                vreinterpret_u64_u8(vshrn_n_u16(
                    vreinterpretq_u16_u8(vsriq_n_u8(acc, acc, 4)),
                    4,
                )),
                0,
            )
        }
    }

    #[inline(always)]
    pub fn map_8x16x4<F>(v: uint8x16x4_t, mut f: F) -> uint8x16x4_t
    where
        F: FnMut(uint8x16_t) -> uint8x16_t,
    {
        uint8x16x4_t(f(v.0), f(v.1), f(v.2), f(v.3))
    }

    #[inline(always)]
    pub fn map_two_8x16x4<F>(v1: uint8x16x4_t, v2: uint8x16x4_t, mut f: F) -> uint8x16x4_t
    where
        F: FnMut(uint8x16_t, uint8x16_t) -> uint8x16_t,
    {
        uint8x16x4_t(f(v1.0, v2.0), f(v1.1, v2.1), f(v1.2, v2.2), f(v1.3, v2.3))
    }
}