use core::cell::RefCell;
use std::ops::{Deref, DerefMut};
use traits::Seq;
use wide::u16x8;

use crate::{intrinsics::transpose, padded_it::ChunkIt};

use super::*;

/// Scratch buffer of 8 SIMD words used by the parallel iterators.
type SimdBuf = [S; 8];

/// A boxed `SimdBuf` that is returned to a thread-local cache on drop,
/// so repeated iterator calls avoid re-allocating it.
struct RecycledBox(Option<Box<SimdBuf>>);

thread_local! {
    static RECYCLED_BOX_CACHE: RefCell<Vec<Box<SimdBuf>>> = {
        RefCell::new(vec![Box::new(SimdBuf::default())])
    };
}

impl RecycledBox {
    #[inline(always)]
    fn take() -> Self {
        let mut buf = RECYCLED_BOX_CACHE.with_borrow_mut(|v| RecycledBox(v.pop()));
        buf.init_if_needed();
        buf
    }

    #[inline(always)]
    fn init_if_needed(&mut self) {
        if self.0.is_none() {
            self.0 = Some(Box::new(SimdBuf::default()));
        }
    }

    #[inline(always)]
    fn get(&self) -> &SimdBuf {
        unsafe { self.0.as_ref().unwrap_unchecked() }
    }

    #[inline(always)]
    fn get_mut(&mut self) -> &mut SimdBuf {
        unsafe { self.0.as_mut().unwrap_unchecked() }
    }
}

impl Deref for RecycledBox {
    type Target = SimdBuf;

    #[inline(always)]
    fn deref(&self) -> &Self::Target {
        self.get()
    }
}
impl DerefMut for RecycledBox {
    #[inline(always)]
    fn deref_mut(&mut self) -> &mut SimdBuf {
        self.get_mut()
    }
}

impl Drop for RecycledBox {
    #[inline(always)]
    fn drop(&mut self) {
        let x = self.0.take();
        RECYCLED_BOX_CACHE.with_borrow_mut(|v| v.push(unsafe { x.unwrap_unchecked() }));
    }
}

/// A `Vec<S>` that is returned to a thread-local cache on drop; used as the
/// ring buffer of the delayed iterators.
struct RecycledVec(Vec<S>);

thread_local! {
    static RECYCLED_VEC_CACHE: RefCell<Vec<Vec<S>>> = {
        RefCell::new(vec![])
    };
}

impl RecycledVec {
    #[inline(always)]
    fn take() -> Self {
        RecycledVec(RECYCLED_VEC_CACHE.with_borrow_mut(|v| v.pop().unwrap_or_default()))
    }
}

impl Deref for RecycledVec {
    type Target = Vec<S>;
    #[inline(always)]
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
impl DerefMut for RecycledVec {
    #[inline(always)]
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}
impl Drop for RecycledVec {
    #[inline(always)]
    fn drop(&mut self) {
        RECYCLED_VEC_CACHE.with_borrow_mut(|v| v.push(std::mem::take(&mut self.0)));
    }
}

/// Marker type for the number of bits per character.
#[doc(hidden)]
pub struct Bits<const B: usize>;
/// Marker trait restricting `B` to the supported bit widths.
#[doc(hidden)]
pub trait SupportedBits {}
impl SupportedBits for Bits<1> {}
impl SupportedBits for Bits<2> {}
impl SupportedBits for Bits<4> {}
impl SupportedBits for Bits<8> {}

/// Number of zero bytes appended to packed sequences so that SIMD loads may
/// safely read past the end.
pub(crate) const PADDING: usize = 48;

#[doc(hidden)]
#[derive(Copy, Clone, Debug, MemSize, MemDbg)]
pub struct PackedSeqBase<'s, const B: usize>
where
    Bits<B>: SupportedBits,
{
    /// The packed data, including trailing padding bytes.
    seq: &'s [u8],
    /// Offset, in characters, of the first character within `seq`.
    offset: usize,
    /// Length of the sequence, in characters.
    len: usize,
}

#[doc(hidden)]
#[derive(Clone, Debug, MemSize, MemDbg)]
#[cfg_attr(feature = "pyo3", pyo3::pyclass)]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
pub struct PackedSeqVecBase<const B: usize>
where
    Bits<B>: SupportedBits,
{
    /// The packed data, followed by `PADDING` zero bytes.
    pub(crate) seq: Vec<u8>,

    /// Length of the sequence, in characters.
    len: usize,
}

/// A slice of a 2-bit packed DNA sequence.
pub type PackedSeq<'s> = PackedSeqBase<'s, 2>;
/// An owned 2-bit packed DNA sequence.
pub type PackedSeqVec = PackedSeqVecBase<2>;
/// A slice of a 1-bit packed sequence, e.g. an ambiguity mask.
pub type BitSeq<'s> = PackedSeqBase<'s, 1>;
/// An owned 1-bit packed sequence, e.g. an ambiguity mask.
pub type BitSeqVec = PackedSeqVecBase<1>;

impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    /// Mask covering the low `B` bits of a character.
    const CHAR_MASK: u64 = (1 << B) - 1;
    /// `B` broadcast to all 8 SIMD lanes.
    const SIMD_B: S = S::new([B as u32; 8]);
    /// `CHAR_MASK` broadcast to all 8 SIMD lanes.
    const SIMD_CHAR_MASK: S = S::new([(1 << B) - 1; 8]);
    /// Characters per byte.
    const C8: usize = 8 / B;
    /// Characters per `u32`.
    const C32: usize = 32 / B;
    /// Characters per 256-bit SIMD word.
    const C256: usize = 256 / B;
    /// The longest sequence that can always be read with a single unaligned
    /// `u64` read, regardless of its offset within the first byte.
    const K64: usize = (64 - 8) / B + 1;
}

impl<const B: usize> PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    /// Characters per byte.
    const C8: usize = 8 / B;
}

impl<const B: usize> Default for PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    fn default() -> Self {
        Self {
            seq: vec![0; PADDING],
            len: 0,
        }
    }
}

/// Pack an ASCII `ACTGactg` character into its 2-bit representation.
/// Panics on any other character.
#[inline(always)]
pub fn pack_char(base: u8) -> u8 {
    match base {
        b'a' | b'A' => 0,
        b'c' | b'C' => 1,
        b'g' | b'G' => 3,
        b't' | b'T' => 2,
        _ => panic!(
            "Unexpected character '{}' with ASCII value {base}. Expected one of ACTGactg.",
            base as char
        ),
    }
}

/// Pack an ASCII character into 2 bits by taking bits 1 and 2.
/// Characters other than `ACTGactg` map to arbitrary values.
#[inline(always)]
pub fn pack_char_lossy(base: u8) -> u8 {
    (base >> 1) & 3
}

/// Unpack a 2-bit base into its ASCII character (`A`, `C`, `T`, or `G`).
#[inline(always)]
pub fn unpack_base(base: u8) -> u8 {
    debug_assert!(base < 4, "Base {base} is not <4.");
    b"ACTG"[base as usize]
}

/// Complement an ASCII base: `A <-> T`, `C <-> G`.
#[inline(always)]
pub const fn complement_char(base: u8) -> u8 {
    match base {
        b'A' => b'T',
        b'C' => b'G',
        b'G' => b'C',
        b'T' => b'A',
        _ => panic!("Unexpected character. Expected one of ACGT."),
    }
}

/// Complement a packed 2-bit base.
#[inline(always)]
pub const fn complement_base(base: u8) -> u8 {
    base ^ 2
}

/// Complement 8 packed bases at once.
#[inline(always)]
pub fn complement_base_simd(base: u32x8) -> u32x8 {
    const TWO: u32x8 = u32x8::new([2; 8]);
    base ^ TWO
}
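
// A few spot-checks of the encoding defined above, as an illustrative sketch:
// A=0, C=1, T=2, G=3, so complementing a packed base is `code ^ 2`.
#[cfg(test)]
mod encoding_examples {
    use super::*;

    #[test]
    fn pack_unpack_complement() {
        for &c in b"ACTG" {
            // Packing and unpacking round-trips, and the lossy packer agrees
            // with the strict one on unambiguous characters (either case).
            assert_eq!(unpack_base(pack_char(c)), c);
            assert_eq!(pack_char_lossy(c), pack_char(c));
            assert_eq!(pack_char(c.to_ascii_lowercase()), pack_char(c));
            // Complementing in packed space matches complementing in ASCII.
            assert_eq!(unpack_base(complement_base(pack_char(c))), complement_char(c));
        }
    }
}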

/// Reverse-complement all 32 packed bases in a `u64`: the 2-bit characters are
/// reversed in order and each is complemented.
#[inline(always)]
const fn revcomp_raw(word: u64) -> u64 {
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    {
        let mut res = word.reverse_bits();
        res = ((res >> 1) & 0x5555_5555_5555_5555) | ((res & 0x5555_5555_5555_5555) << 1);
        res ^ 0xAAAA_AAAA_AAAA_AAAA
    }

    #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
    {
        let mut res = word.swap_bytes();
        res = ((res >> 4) & 0x0F0F_0F0F_0F0F_0F0F) | ((res & 0x0F0F_0F0F_0F0F_0F0F) << 4);
        res = ((res >> 2) & 0x3333_3333_3333_3333) | ((res & 0x3333_3333_3333_3333) << 2);
        res ^ 0xAAAA_AAAA_AAAA_AAAA
    }
}

/// Reverse-complement the first `len` 2-bit characters of `word`.
#[inline(always)]
pub const fn revcomp_u64(word: u64, len: usize) -> u64 {
    revcomp_raw(word) >> (u64::BITS as usize - 2 * len)
}

#[inline(always)]
pub const fn revcomp_u128(word: u128, len: usize) -> u128 {
    let low = word as u64;
    let high = (word >> 64) as u64;
    let rlow = revcomp_raw(low);
    let rhigh = revcomp_raw(high);
    let out = ((rlow as u128) << 64) | rhigh as u128;
    out >> (u128::BITS as usize - 2 * len)
}
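
// Worked example (illustrative sketch): "AC" packs to 0b0100 (A=0 in the low
// two bits, C=1 above it); its reverse complement "GT" packs to 0b1011.
#[cfg(test)]
mod revcomp_examples {
    use super::*;

    #[test]
    fn revcomp_small_kmers() {
        assert_eq!(revcomp_u64(0b0100, 2), 0b1011);
        // Reverse-complementing twice returns the original k-mer.
        assert_eq!(revcomp_u64(revcomp_u64(0b0100, 2), 2), 0b0100);
        // The u128 variant agrees with the u64 variant on short k-mers.
        assert_eq!(revcomp_u128(0b0100, 2), 0b1011);
    }
}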

/// Returns 1 when `base` is not one of `ACGTacgt`, and 0 otherwise.
#[inline(always)]
pub fn char_is_ambiguous(base: u8) -> u8 {
    let table = b"ACTG";
    let upper_mask = !(b'a' - b'A');
    (table[pack_char_lossy(base) as usize] != (base & upper_mask)) as u8
}

/// Reverse the first `len` bits of `word`.
#[inline(always)]
pub const fn rev_u64(word: u64, len: usize) -> u64 {
    word.reverse_bits() >> (u64::BITS as usize - len)
}

#[inline(always)]
pub const fn rev_u128(word: u128, len: usize) -> u128 {
    word.reverse_bits() >> (u128::BITS as usize - len)
}
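
// Illustrative sketch for the 1-bit helpers: `rev_u64` reverses a mask of
// ambiguous positions, and `char_is_ambiguous` flags anything outside ACGTacgt.
#[cfg(test)]
mod ambiguity_examples {
    use super::*;

    #[test]
    fn rev_and_ambiguity() {
        // A 3-bit mask with only the first position set, reversed.
        assert_eq!(rev_u64(0b001, 3), 0b100);
        assert_eq!(char_is_ambiguous(b'A'), 0);
        assert_eq!(char_is_ambiguous(b'g'), 0);
        assert_eq!(char_is_ambiguous(b'N'), 1);
    }
}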

impl<const B: usize> PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    /// Shrink `seq` to the smallest byte range covering the sequence (plus
    /// padding), reducing `offset` to less than one byte.
    #[inline(always)]
    pub fn normalize(&self) -> Self {
        let start_byte = self.offset / Self::C8;
        let end_byte = (self.offset + self.len).div_ceil(Self::C8);
        Self {
            seq: &self.seq[start_byte..end_byte + PADDING],
            offset: self.offset % Self::C8,
            len: self.len,
        }
    }

    /// Unpack the sequence into ASCII characters.
    #[inline(always)]
    pub fn unpack(&self) -> Vec<u8> {
        self.iter_bp().map(unpack_base).collect()
    }
}

/// Read 32 bytes starting at `idx`. The caller must guarantee they are in bounds.
#[inline(always)]
pub(crate) unsafe fn read_slice_32_unchecked(seq: &[u8], idx: usize) -> u32x8 {
    unsafe {
        let src = seq.as_ptr().add(idx);
        debug_assert!(idx + 32 <= seq.len());
        std::mem::transmute::<_, *const u32x8>(src).read_unaligned()
    }
}

/// Read 32 bytes starting at `idx`, zero-padding reads past the end of `seq`.
#[inline(always)]
pub(crate) fn read_slice_32(seq: &[u8], idx: usize) -> u32x8 {
    unsafe {
        let src = seq.as_ptr().add(idx);
        if idx + 32 <= seq.len() {
            std::mem::transmute::<_, *const u32x8>(src).read_unaligned()
        } else {
            let num_bytes = seq.len().saturating_sub(idx);
            let mut result = [0u8; 32];
            std::ptr::copy_nonoverlapping(src, result.as_mut_ptr(), num_bytes);
            std::mem::transmute(result)
        }
    }
}

/// Read 16 bytes starting at `idx`, zero-padding reads past the end of `seq`.
#[allow(unused)]
#[inline(always)]
pub(crate) fn read_slice_16(seq: &[u8], idx: usize) -> u16x8 {
    unsafe {
        let src = seq.as_ptr().add(idx);
        if idx + 16 <= seq.len() {
            std::mem::transmute::<_, *const u16x8>(src).read_unaligned()
        } else {
            let num_bytes = seq.len().saturating_sub(idx);
            let mut result = [0u8; 16];
            std::ptr::copy_nonoverlapping(src, result.as_mut_ptr(), num_bytes);
            std::mem::transmute(result)
        }
    }
}
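
// Small sketch of the read helpers above: reads past the end of the slice are
// zero-padded by the checked variant (lane values assume a little-endian target).
#[cfg(test)]
mod read_slice_examples {
    use super::*;

    #[test]
    fn read_slice_32_zero_pads() {
        let bytes = [1u8, 2, 3];
        let lanes = read_slice_32(&bytes, 0).to_array();
        assert_eq!(lanes, [0x0003_0201, 0, 0, 0, 0, 0, 0, 0]);
    }
}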

impl<'s, const B: usize> Seq<'s> for PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    const BITS_PER_CHAR: usize = B;
    const BASES_PER_BYTE: usize = Self::C8;
    type SeqVec = PackedSeqVecBase<B>;

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn get_ascii(&self, index: usize) -> u8 {
        unpack_base(self.get(index))
    }

    /// Pack the sequence into a single `u64`, with the first character in the
    /// least significant bits. Requires `len <= 64 / B`.
    #[inline(always)]
    fn as_u64(&self) -> u64 {
        assert!(self.len() <= 64 / B);

        let mask = u64::MAX >> (64 - B * self.len());

        // A single u64 read suffices unless the offset pushes the last character past 64 bits.
        if self.len() <= Self::K64 {
            let x = unsafe { (self.seq.as_ptr() as *const u64).read_unaligned() };
            (x >> (B * self.offset)) & mask
        } else {
            let x = unsafe { (self.seq.as_ptr() as *const u128).read_unaligned() };
            (x >> (B * self.offset)) as u64 & mask
        }
    }

    #[inline(always)]
    fn revcomp_as_u64(&self) -> u64 {
        match B {
            1 => rev_u64(self.as_u64(), self.len()),
            2 => revcomp_u64(self.as_u64(), self.len()),
            _ => panic!("Rev(comp) is only supported for 1-bit and 2-bit alphabets."),
        }
    }

    #[inline(always)]
    fn as_u128(&self) -> u128 {
        assert!(
            self.len() <= (128 - 8) / B + 1,
            "Sequences longer than (128-8)/B+1 characters cannot be read with a single unaligned u128 read."
        );

        let mask = u128::MAX >> (128 - B * self.len());

        let x = unsafe { (self.seq.as_ptr() as *const u128).read_unaligned() };
        (x >> (B * self.offset)) & mask
    }

    #[inline(always)]
    fn revcomp_as_u128(&self) -> u128 {
        match B {
            1 => rev_u128(self.as_u128(), self.len()),
            2 => revcomp_u128(self.as_u128(), self.len()),
            _ => panic!("Rev(comp) is only supported for 1-bit and 2-bit alphabets."),
        }
    }

    #[inline(always)]
    fn to_vec(&self) -> PackedSeqVecBase<B> {
        assert_eq!(self.offset, 0);
        PackedSeqVecBase {
            seq: self
                .seq
                .iter()
                .copied()
                .chain(std::iter::repeat_n(0u8, PADDING))
                .collect(),
            len: self.len,
        }
    }

    fn to_revcomp(&self) -> PackedSeqVecBase<B> {
        match B {
            1 | 2 => {}
            _ => panic!("Can only reverse (and complement) 1-bit and 2-bit packed sequences."),
        }

        // Reverse the data bytes, and reverse (and for 2-bit, complement) the
        // characters within each byte.
        let mut seq = self.seq[..(self.offset + self.len).div_ceil(Self::C8)]
            .iter()
            .rev()
            .copied()
            .map(|mut res| {
                match B {
                    2 => {
                        res = ((res >> 4) & 0x0F) | ((res & 0x0F) << 4);
                        res = ((res >> 2) & 0x33) | ((res & 0x33) << 2);
                        res ^ 0xAA
                    }
                    1 => res.reverse_bits(),
                    _ => unreachable!(),
                }
            })
            .chain(std::iter::repeat_n(0u8, PADDING))
            .collect::<Vec<u8>>();

        // Shift out the characters that used to pad the last byte.
        let new_offset = (Self::C8 - (self.offset + self.len) % Self::C8) % Self::C8;

        if new_offset > 0 {
            let shift = B * new_offset;
            *seq.last_mut().unwrap() >>= shift;
            for i in 0..seq.len() - 1 {
                seq[i] = (seq[i] >> shift) | (seq[i + 1] << (8 - shift));
            }
        }

        PackedSeqVecBase { seq, len: self.len }
    }

    #[inline(always)]
    fn slice(&self, range: Range<usize>) -> Self {
        debug_assert!(
            range.end <= self.len,
            "Slice index out of bounds: {} > {}",
            range.end,
            self.len
        );
        PackedSeqBase {
            seq: self.seq,
            offset: self.offset + range.start,
            len: range.end - range.start,
        }
        .normalize()
    }

    #[inline(always)]
    fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> {
        assert!(self.len <= self.seq.len() * Self::C8);

        let this = self.normalize();

        let mut byte = 0;
        (0..this.len + this.offset)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C8 == 0 {
                        byte = this.seq[i / Self::C8];
                    }
                    (byte >> (B * (i % Self::C8))) & Self::CHAR_MASK as u8
                },
            )
            .advance(this.offset)
    }

    #[inline(always)]
    fn par_iter_bp(self, context: usize) -> PaddedIt<impl ChunkIt<S>> {
        self.par_iter_bp_with_buf(context, RecycledBox::take())
    }

    #[inline(always)]
    fn par_iter_bp_delayed(self, context: usize, delay: Delay) -> PaddedIt<impl ChunkIt<(S, S)>> {
        self.par_iter_bp_delayed_with_factor(context, delay, 1)
    }

    #[inline(always)]
    fn par_iter_bp_delayed_2(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(
            context,
            delay1,
            delay2,
            1,
            RecycledVec::take(),
        )
    }

    fn cmp_lcp(&self, other: &Self) -> (std::cmp::Ordering, usize) {
        let mut lcp = 0;
        let min_len = self.len.min(other.len);
        // Compare chunks of up to `K64` characters via single u64 reads.
        for i in (0..min_len).step_by(Self::K64) {
            let len = (min_len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() as usize / B * B;
                lcp += t / B;
                let mask = (Self::CHAR_MASK) << t;
                return ((this_word & mask).cmp(&(other_word & mask)), lcp);
            }
            lcp += len;
        }
        (self.len.cmp(&other.len), lcp)
    }

    #[inline(always)]
    fn get(&self, index: usize) -> u8 {
        let offset = self.offset + index;
        let idx = offset / Self::C8;
        let offset = offset % Self::C8;
        (self.seq[idx] >> (B * offset)) & Self::CHAR_MASK as u8
    }
}
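
// Usage sketch for the `Seq` implementation above, using only APIs from this
// crate: pack "ACGT", read it back as a `u64`, and compare two sequences.
#[cfg(test)]
mod seq_examples {
    use super::*;

    #[test]
    fn as_u64_and_cmp() {
        let mut v = PackedSeqVec::default();
        v.push_ascii(b"ACGT");
        let s = v.as_slice();
        // A=0, C=1, G=3, T=2, packed starting from the low bits.
        assert_eq!(s.as_u64(), 0b10_11_01_00);
        // "ACGT" is its own reverse complement.
        assert_eq!(s.revcomp_as_u64(), s.as_u64());
        assert_eq!(s.get_ascii(2), b'G');

        let mut w = PackedSeqVec::default();
        w.push_ascii(b"ACGA");
        // The longest common prefix has length 3. Note that the ordering
        // compares 2-bit codes (A < C < T < G), not ASCII.
        assert_eq!(s.cmp_lcp(&w.as_slice()), (std::cmp::Ordering::Greater, 3));
    }
}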

impl<'s, const B: usize> PackedSeqBase<'s, B>
where
    Bits<B>: SupportedBits,
{
    /// Iterate the characters of the sequence in 8 parallel SIMD lanes, using
    /// the provided scratch buffer. The sequence is split into 8 chunks of
    /// `bytes_per_chunk` bytes and lane `l` streams chunk `l`; every 256/B
    /// characters, 32 bytes per lane are read and transposed into `buf`.
    #[inline(always)]
    pub fn par_iter_bp_with_buf<BUF: DerefMut<Target = [S; 8]>>(
        self,
        context: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut cur = S::ZERO;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            *buf = transpose(data);
                        }
                        cur = buf[(i % Self::C256) / Self::C32];
                    }
                    let chars = cur & Self::SIMD_CHAR_MASK;
                    cur = cur >> Self::SIMD_B;
                    chars
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_with_factor(
        self,
        context: usize,
        delay: Delay,
        factor: usize,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B>> {
        self.par_iter_bp_delayed_with_factor_and_buf(context, delay, factor, RecycledVec::take())
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_with_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        delay: Delay,
        buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B, BUF>> {
        self.par_iter_bp_delayed_with_factor_and_buf(context, delay, 1, buf)
    }

    /// Like `par_iter_bp_with_buf`, but additionally yields the character from
    /// `delay` positions earlier. The `Vec<S>` buffer is used as a ring buffer.
    #[inline(always)]
    pub fn par_iter_bp_delayed_with_factor_and_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        Delay(delay): Delay,
        factor: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(
            delay < usize::MAX / 2,
            "Delay={} should be >=0.",
            delay as isize
        );

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride
            .div_ceil(L)
            .next_multiple_of(factor * Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d = S::ZERO;

        // Ring buffer of SIMD words; must cover at least `delay` characters
        // plus one 256-bit block of slack.
        let buf_len = (delay / Self::C32 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        if buf.capacity() < buf_len {
            *buf = vec![S::ZERO; buf_len];
        } else {
            buf.clear();
            buf.resize(buf_len, S::ZERO);
        }

        let mut write_idx = 0;
        // The read pointer trails the write pointer by `delay / C32` blocks.
        let mut read_idx = (buf_len - delay / Self::C32) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            unsafe {
                                *TryInto::<&mut [u32x8; 8]>::try_into(
                                    buf.get_unchecked_mut(write_idx..write_idx + 8),
                                )
                                .unwrap_unchecked() = transpose(data);
                            }
                            if i == 0 {
                                // Mask out the characters before the offset.
                                let elem = !((1u32 << (B * o)) - 1);
                                let mask = S::splat(elem);
                                unsafe { assert_unchecked(write_idx < buf.len()) };
                                buf[write_idx] &= mask;
                            }
                        }
                        unsafe { assert_unchecked(write_idx < buf.len()) };
                        upcoming = buf[write_idx];
                        write_idx += 1;
                        write_idx &= buf_mask;
                    }
                    if i % Self::C32 == delay % Self::C32 {
                        unsafe { assert_unchecked(read_idx < buf.len()) };
                        upcoming_d = buf[read_idx];
                        read_idx += 1;
                        read_idx &= buf_mask;
                    }
                    let chars = upcoming & Self::SIMD_CHAR_MASK;
                    let chars_d = upcoming_d & Self::SIMD_CHAR_MASK;
                    upcoming = upcoming >> Self::SIMD_B;
                    upcoming_d = upcoming_d >> Self::SIMD_B;
                    (chars, chars_d)
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_factor(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
        factor: usize,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(
            context,
            delay1,
            delay2,
            factor,
            RecycledVec::take(),
        )
    }

    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        delay1: Delay,
        delay2: Delay,
        buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B, BUF>> {
        self.par_iter_bp_delayed_2_with_factor_and_buf(context, delay1, delay2, 1, buf)
    }

    /// Like the delayed iterator above, but with two independent delays
    /// `delay1 <= delay2`.
    #[inline(always)]
    pub fn par_iter_bp_delayed_2_with_factor_and_buf<BUF: DerefMut<Target = Vec<S>>>(
        self,
        context: usize,
        Delay(delay1): Delay,
        Delay(delay2): Delay,
        factor: usize,
        mut buf: BUF,
    ) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B, BUF>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);
        assert!(delay1 <= delay2, "Delay1 must be at most delay2.");

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride
            .div_ceil(L)
            .next_multiple_of(factor * Self::C8);
        let bytes_per_chunk = n / Self::C8;
        let padding = Self::C8 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d1 = S::ZERO;
        let mut upcoming_d2 = S::ZERO;

        let buf_len = (delay2 / Self::C32 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        if buf.capacity() < buf_len {
            *buf = vec![S::ZERO; buf_len];
        } else {
            buf.clear();
            buf.resize(buf_len, S::ZERO);
        }

        let mut write_idx = 0;
        // Each read pointer trails the write pointer by its own delay.
        let mut read_idx1 = (buf_len - delay1 / Self::C32) % buf_len;
        let mut read_idx2 = (buf_len - delay2 / Self::C32) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };

        let last_i = par_len.saturating_sub(1) / Self::C32 * Self::C32;
        assert!(offsets[7] + (last_i / Self::C8) + 32 <= this.seq.len());

        let it = (0..par_len)
            .map(
                #[inline(always)]
                move |i| {
                    if i % Self::C32 == 0 {
                        if i % Self::C256 == 0 {
                            let data: [u32x8; 8] = from_fn(
                                #[inline(always)]
                                |lane| unsafe {
                                    read_slice_32_unchecked(
                                        this.seq,
                                        offsets[lane] + (i / Self::C8),
                                    )
                                },
                            );
                            unsafe {
                                *TryInto::<&mut [u32x8; 8]>::try_into(
                                    buf.get_unchecked_mut(write_idx..write_idx + 8),
                                )
                                .unwrap_unchecked() = transpose(data);
                            }
                            if i == 0 {
                                // Mask out the characters before the offset.
                                let elem = !((1u32 << (B * o)) - 1);
                                let mask = S::splat(elem);
                                buf[write_idx] &= mask;
                            }
                        }
                        upcoming = buf[write_idx];
                        write_idx += 1;
                        write_idx &= buf_mask;
                    }
                    if i % Self::C32 == delay1 % Self::C32 {
                        unsafe { assert_unchecked(read_idx1 < buf.len()) };
                        upcoming_d1 = buf[read_idx1];
                        read_idx1 += 1;
                        read_idx1 &= buf_mask;
                    }
                    if i % Self::C32 == delay2 % Self::C32 {
                        unsafe { assert_unchecked(read_idx2 < buf.len()) };
                        upcoming_d2 = buf[read_idx2];
                        read_idx2 += 1;
                        read_idx2 &= buf_mask;
                    }
                    let chars = upcoming & Self::SIMD_CHAR_MASK;
                    let chars_d1 = upcoming_d1 & Self::SIMD_CHAR_MASK;
                    let chars_d2 = upcoming_d2 & Self::SIMD_CHAR_MASK;
                    upcoming = upcoming >> Self::SIMD_B;
                    upcoming_d1 = upcoming_d1 >> Self::SIMD_B;
                    upcoming_d2 = upcoming_d2 >> Self::SIMD_B;
                    (chars, chars_d1, chars_d2)
                },
            )
            .advance(o);

        PaddedIt { it, padding }
    }
}

impl<const B: usize> PartialEq for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    fn eq(&self, other: &Self) -> bool {
        if self.len != other.len {
            return false;
        }
        for i in (0..self.len).step_by(Self::K64) {
            let len = (self.len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let that = other.slice(i..i + len);
            if this.as_u64() != that.as_u64() {
                return false;
            }
        }
        true
    }
}

impl<const B: usize> Eq for PackedSeqBase<'_, B> where Bits<B>: SupportedBits {}

impl<const B: usize> PartialOrd for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl<const B: usize> Ord for PackedSeqBase<'_, B>
where
    Bits<B>: SupportedBits,
{
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(Self::K64) {
            let len = (min_len - i).min(Self::K64);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() as usize / B * B;
                let mask = (Self::CHAR_MASK) << t;
                return (this_word & mask).cmp(&(other_word & mask));
            }
        }
        self.len.cmp(&other.len)
    }
}

impl<const B: usize> SeqVec for PackedSeqVecBase<B>
where
    Bits<B>: SupportedBits,
{
    type Seq<'s> = PackedSeqBase<'s, B>;

    #[inline(always)]
    fn into_raw(mut self) -> Vec<u8> {
        self.seq.resize(self.len.div_ceil(Self::C8), 0);
        self.seq
    }

    #[inline(always)]
    fn as_slice(&self) -> Self::Seq<'_> {
        PackedSeqBase {
            seq: &self.seq[..self.len.div_ceil(Self::C8) + PADDING],
            offset: 0,
            len: self.len,
        }
    }

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn clear(&mut self) {
        self.seq.clear();
        self.len = 0;
    }

    /// Append a packed sequence. It is placed starting at the next byte
    /// boundary, preserving its own offset; returns the range of character
    /// positions where it ends up.
    fn push_seq<'a>(&mut self, seq: PackedSeqBase<'_, B>) -> Range<usize> {
        let start = self.len.next_multiple_of(Self::C8) + seq.offset;
        let end = start + seq.len();

        // Drop the padding of the current data, then append the new data
        // (which brings its own padding).
        self.seq.resize(self.len.div_ceil(Self::C8), 0);
        self.seq.extend(seq.seq);
        self.len = end;
        start..end
    }

    /// Pack an ASCII sequence and append it. For 2-bit sequences the
    /// characters are packed lossily (as `pack_char_lossy`); for 1-bit
    /// sequences each character is mapped to its ambiguity flag.
    /// Returns the range of character positions that was appended.
    fn push_ascii(&mut self, seq: &[u8]) -> Range<usize> {
        match B {
            1 | 2 => {}
            _ => panic!(
                "Can only use ASCII input for 2-bit DNA packing, or 1-bit ambiguous indicators."
            ),
        }

        self.seq
            .resize((self.len + seq.len()).div_ceil(Self::C8) + PADDING, 0);
        let start_aligned = self.len.next_multiple_of(Self::C8);
        let start = self.len;
        let len = seq.len();
        let mut idx = self.len / Self::C8;

        let parse_base = |base| match B {
            1 => char_is_ambiguous(base),
            2 => pack_char_lossy(base),
            _ => unreachable!(),
        };

        // Fill up the partially filled last byte, if any.
        let unaligned = core::cmp::min(start_aligned - start, len);
        if unaligned > 0 {
            let mut packed_byte = self.seq[idx];
            for &base in &seq[..unaligned] {
                packed_byte |= parse_base(base) << ((self.len % Self::C8) * B);
                self.len += 1;
            }
            self.seq[idx] = packed_byte;
            idx += 1;
        }

        #[allow(unused)]
        let mut last = unaligned;

        if B == 2 {
            #[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
            {
                last = unaligned + (len - unaligned) / 8 * 8;

                for i in (unaligned..last).step_by(8) {
                    let chunk =
                        unsafe { seq.get_unchecked(i..i + 8).try_into().unwrap_unchecked() };
                    let ascii = u64::from_le_bytes(chunk);
                    let packed_bytes =
                        unsafe { std::arch::x86_64::_pext_u64(ascii, 0x0606060606060606) } as u16;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 2))
                            .copy_from_slice(&packed_bytes.to_le_bytes())
                    };
                    idx += 2;
                    self.len += 8;
                }
            }

            #[cfg(target_feature = "neon")]
            {
                use core::arch::aarch64::{
                    vandq_u8, vdup_n_u8, vld1q_u8, vpadd_u8, vshlq_u8, vst1_u8,
                };
                use core::mem::transmute;

                last = unaligned + (len - unaligned) / 16 * 16;

                for i in (unaligned..last).step_by(16) {
                    unsafe {
                        let ascii = vld1q_u8(seq.as_ptr().add(i));
                        let masked_bits = vandq_u8(ascii, transmute([6i8; 16]));
                        let (bits_0, bits_1) = transmute(vshlq_u8(
                            masked_bits,
                            transmute([-1i8, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5]),
                        ));
                        let half_packed = vpadd_u8(bits_0, bits_1);
                        let packed = vpadd_u8(half_packed, vdup_n_u8(0));
                        vst1_u8(self.seq.as_mut_ptr().add(idx), packed);
                        idx += Self::C8;
                        self.len += 16;
                    }
                }
            }
        }

        if B == 1 {
            #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
            {
                last = len;
                self.len += len - unaligned;

                let mut last_i = unaligned;

                for i in (unaligned..last).step_by(32) {
                    use std::mem::transmute as t;

                    use wide::CmpEq;
                    type S = wide::i8x32;
                    let chars: S = unsafe { t(read_slice_32(seq, i)) };
                    let upper_mask = !(b'a' - b'A');
                    let chars = chars & S::splat(upper_mask as i8);
                    let lossy_encoded = chars & S::splat(6);
                    let table = unsafe { S::from(t::<_, S>(*b"AxCxTxGxxxxxxxxxAxCxTxGxxxxxxxxx")) };
                    let lookup: S = unsafe {
                        t(std::arch::x86_64::_mm256_shuffle_epi8(
                            t(table),
                            t(lossy_encoded),
                        ))
                    };
                    let packed_bytes = !(chars.cmp_eq(lookup).move_mask() as u32);

                    last_i = i;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 4))
                            .copy_from_slice(&packed_bytes.to_le_bytes())
                    };
                    idx += 4;
                }

                // Zero out the flag bits written past the end of the input.
                if unaligned < last {
                    idx -= 4;
                    let mut val = unsafe {
                        u32::from_le_bytes(
                            self.seq
                                .get_unchecked(idx..(idx + 4))
                                .try_into()
                                .unwrap_unchecked(),
                        )
                    };
                    let keep = last - last_i;
                    val <<= 32 - keep;
                    val >>= 32 - keep;
                    unsafe {
                        self.seq
                            .get_unchecked_mut(idx..(idx + 4))
                            .copy_from_slice(&val.to_le_bytes())
                    };
                    idx += keep.div_ceil(8);
                }
            }

            #[cfg(target_feature = "neon")]
            {
                use core::arch::aarch64::*;
                use core::mem::transmute;

                last = unaligned + (len - unaligned) / 64 * 64;

                for i in (unaligned..last).step_by(64) {
                    unsafe {
                        let ptr = seq.as_ptr().add(i);
                        let chars = vld4q_u8(ptr);

                        let upper_mask = vdupq_n_u8(!(b'a' - b'A'));
                        let chars = neon::map_8x16x4(chars, |v| vandq_u8(v, upper_mask));

                        let two_bits_mask = vdupq_n_u8(6);
                        let lossy_encoded = neon::map_8x16x4(chars, |v| vandq_u8(v, two_bits_mask));

                        let table = transmute(*b"AxCxTxGxxxxxxxxx");
                        let lookup = neon::map_8x16x4(lossy_encoded, |v| vqtbl1q_u8(table, v));

                        let mask = neon::map_two_8x16x4(chars, lookup, |v1, v2| vceqq_u8(v1, v2));
                        let packed_bytes = !neon::movemask_64(mask);

                        self.seq[idx..(idx + 8)].copy_from_slice(&packed_bytes.to_le_bytes());
                        idx += 8;
                        self.len += 64;
                    }
                }
            }
        }

        // Scalar tail: pack the remaining characters one at a time.
        let mut packed_byte = 0;
        for &base in &seq[last..] {
            packed_byte |= parse_base(base) << ((self.len % Self::C8) * B);
            self.len += 1;
            if self.len % Self::C8 == 0 {
                self.seq[idx] = packed_byte;
                idx += 1;
                packed_byte = 0;
            }
        }
        if self.len % Self::C8 != 0 && last < len {
            self.seq[idx] = packed_byte;
            idx += 1;
        }
        assert_eq!(idx + PADDING, self.seq.len());
        start..start + len
    }

    #[cfg(feature = "rand")]
    fn random(n: usize) -> Self {
        use rand::{RngCore, SeedableRng};

        let byte_len = n.div_ceil(Self::C8);
        let mut seq = vec![0; byte_len + PADDING];
        rand::rngs::SmallRng::from_os_rng().fill_bytes(&mut seq[..byte_len]);
        // Zero out the unused bits of the last byte.
        if n % Self::C8 != 0 {
            seq[byte_len - 1] &= (1 << (B * (n % Self::C8))) - 1;
        }

        Self { seq, len: n }
    }
}
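
// Usage sketch for `push_ascii`/`unpack` (illustrative): lower-case input is
// packed lossily and unpacks as upper-case ACGT.
#[cfg(test)]
mod seqvec_examples {
    use super::*;

    #[test]
    fn push_ascii_roundtrip() {
        let mut v = PackedSeqVec::default();
        let range = v.push_ascii(b"acgtACGTacgt");
        assert_eq!(range, 0..12);
        assert_eq!(v.len(), 12);
        assert_eq!(v.as_slice().unpack(), b"ACGTACGTACGT".to_vec());
    }
}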

impl PackedSeqVecBase<1> {
    /// An all-zero (unambiguous) mask of length `n`.
    pub fn with_len(n: usize) -> Self {
        Self {
            seq: vec![0; n.div_ceil(Self::C8) + PADDING],
            len: n,
        }
    }

    /// A random mask of length `len` where roughly an `n_frac` fraction of
    /// positions is set.
    pub fn random(len: usize, n_frac: f32) -> Self {
        let byte_len = len.div_ceil(Self::C8);
        let mut seq = vec![0; byte_len + PADDING];

        assert!(
            (0.0..=0.3).contains(&n_frac),
            "n_frac={} should be in [0, 0.3]",
            n_frac
        );

        for _ in 0..(len as f32 * n_frac) as usize {
            let idx = rand::random::<u64>() as usize % len;
            let byte = idx / Self::C8;
            let offset = idx % Self::C8;
            seq[byte] |= 1 << offset;
        }

        Self { seq, len }
    }
}

impl<'s> PackedSeqBase<'s, 1> {
    /// For every k-mer, return whether it contains an ambiguous character.
    #[inline(always)]
    pub fn iter_kmer_ambiguity(self, k: usize) -> impl ExactSizeIterator<Item = bool> + use<'s> {
        let this = self.normalize();
        assert!(k > 0);
        assert!(k <= Self::K64);
        (this.offset..this.offset + this.len.saturating_sub(k - 1))
            .map(move |i| self.read_kmer(k, i) != 0)
    }

    #[inline(always)]
    pub fn par_iter_kmer_ambiguity(
        self,
        k: usize,
        context: usize,
        skip: usize,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(k > 0, "par_iter_kmer_ambiguity requires k>0, but k={k}");
        assert!(k <= 96, "par_iter_kmer_ambiguity requires k<=96, but k={k}");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let delay = k - 1;

        let it = self.par_iter_bp_delayed(context, Delay(delay));

        // Running count of ambiguous characters in the current k-mer window:
        // add the incoming character, emit whether the count is positive, then
        // subtract the character that falls out of the window.
        let mut cnt = u32x8::ZERO;

        it.map(
            #[inline(always)]
            move |(a, r)| {
                cnt += a;
                let out = cnt.cmp_gt(S::ZERO);
                cnt -= r;
                out
            },
        )
        .advance(skip)
    }

    #[inline(always)]
    pub fn par_iter_kmer_ambiguity_with_buf(
        self,
        k: usize,
        context: usize,
        skip: usize,
        buf: &'s mut Vec<S>,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s>> {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(k > 0, "par_iter_kmer_ambiguity requires k>0, but k={k}");
        assert!(k <= 96, "par_iter_kmer_ambiguity requires k<=96, but k={k}");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < Self::C8);

        let delay = k - 1;

        let it = self.par_iter_bp_delayed_with_buf(context, Delay(delay), buf);

        let mut cnt = u32x8::ZERO;

        it.map(
            #[inline(always)]
            move |(a, r)| {
                cnt += a;
                let out = cnt.cmp_gt(S::ZERO);
                cnt -= r;
                out
            },
        )
        .advance(skip)
    }
}
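
// Illustrative sketch: a `BitSeqVec` built from ASCII marks ambiguous bases,
// and `iter_kmer_ambiguity` reports which k-mers contain at least one of them.
#[cfg(test)]
mod kmer_ambiguity_examples {
    use super::*;

    #[test]
    fn kmer_ambiguity_flags() {
        let mut b = BitSeqVec::default();
        b.push_ascii(b"ACGTNACGT");
        let s = b.as_slice();
        // Only position 4, the 'N', is flagged.
        assert_eq!(s.get(4), 1);
        // The two 2-mers overlapping the 'N' are ambiguous.
        let flags: Vec<bool> = s.iter_kmer_ambiguity(2).collect();
        assert_eq!(
            flags,
            vec![false, false, false, true, true, false, false, false]
        );
    }
}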

#[cfg(target_feature = "neon")]
mod neon {
    use core::arch::aarch64::*;

    #[inline(always)]
    pub fn movemask_64(v: uint8x16x4_t) -> u64 {
        unsafe {
            // Collapse the four 16-byte comparison masks (0x00/0xFF per byte)
            // into a single u64 with one bit per input byte.
            let acc = vsriq_n_u8(vsriq_n_u8(v.3, v.2, 1), vsriq_n_u8(v.1, v.0, 1), 2);
            vget_lane_u64(
                vreinterpret_u64_u8(vshrn_n_u16(
                    vreinterpretq_u16_u8(vsriq_n_u8(acc, acc, 4)),
                    4,
                )),
                0,
            )
        }
    }

    #[inline(always)]
    pub fn map_8x16x4<F>(v: uint8x16x4_t, mut f: F) -> uint8x16x4_t
    where
        F: FnMut(uint8x16_t) -> uint8x16_t,
    {
        uint8x16x4_t(f(v.0), f(v.1), f(v.2), f(v.3))
    }

    #[inline(always)]
    pub fn map_two_8x16x4<F>(v1: uint8x16x4_t, v2: uint8x16x4_t, mut f: F) -> uint8x16x4_t
    where
        F: FnMut(uint8x16_t, uint8x16_t) -> uint8x16_t,
    {
        uint8x16x4_t(f(v1.0, v2.0), f(v1.1, v2.1), f(v1.2, v2.2), f(v1.3, v2.3))
    }
}