use traits::Seq;

use crate::intrinsics::transpose;

use super::*;

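/// A borrowed, 2-bit packed DNA sequence: 4 bases per byte in `seq`, starting
/// `offset` bases into the first byte, with a total length of `len` bases.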
#[derive(Copy, Clone, Debug, MemSize, MemDbg)]
pub struct PackedSeq<'s> {
    seq: &'s [u8],
    offset: usize,
    len: usize,
}

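/// Number of trailing zero bytes kept after the packed data in `PackedSeqVec::seq`,
/// so that the unaligned 8- and 16-byte reads in `as_u64` can read past the last
/// packed byte without leaving the allocation.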
const PADDING: usize = 16;

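/// An owned, 2-bit packed DNA sequence. `seq` holds the packed bases followed by
/// `PADDING` zero bytes; `len` is the number of bases.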
#[derive(Clone, Debug, MemSize, MemDbg)]
#[cfg_attr(feature = "pyo3", pyo3::pyclass)]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
pub struct PackedSeqVec {
    pub(crate) seq: Vec<u8>,

    len: usize,
}

impl Default for PackedSeqVec {
    fn default() -> Self {
        Self {
            seq: vec![0; PADDING],
            len: 0,
        }
    }
}

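/// Packs an ASCII nucleotide into its 2-bit code: `A/a => 0`, `C/c => 1`, `T/t => 2`,
/// `G/g => 3`. Panics on any other character.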
#[inline(always)]
pub fn pack_char(base: u8) -> u8 {
    match base {
        b'a' | b'A' => 0,
        b'c' | b'C' => 1,
        b'g' | b'G' => 3,
        b't' | b'T' => 2,
        _ => panic!(
            "Unexpected character '{}' with ASCII value {base}. Expected one of ACTGactg.",
            base as char
        ),
    }
}

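/// Unpacks a 2-bit code back to its uppercase ASCII nucleotide; the inverse of `pack_char`.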
#[inline(always)]
pub fn unpack_base(base: u8) -> u8 {
    debug_assert!(base < 4, "Base {base} is not <4.");
    b"ACTG"[base as usize]
}

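/// Complements an uppercase ASCII nucleotide (`A <-> T`, `C <-> G`). Panics on other input.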
#[inline(always)]
pub const fn complement_char(base: u8) -> u8 {
    match base {
        b'A' => b'T',
        b'C' => b'G',
        b'G' => b'C',
        b'T' => b'A',
        _ => panic!("Unexpected character. Expected one of ACGT."),
    }
}

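/// Complements a 2-bit base. With the encoding `A=0, C=1, T=2, G=3`, complementing
/// is a single XOR with 2 (`0 <-> 2` and `1 <-> 3`).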
#[inline(always)]
pub const fn complement_base(base: u8) -> u8 {
    base ^ 2
}

#[inline(always)]
pub fn complement_base_simd(base: u32x8) -> u32x8 {
    base ^ u32x8::splat(2)
}

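/// Computes the reverse complement of the lowest `len` bases packed in `word`
/// (base `i` stored in bits `2*i..2*i+2`): the 2-bit groups are reversed, each base
/// is complemented by XOR-ing with `0b10`, and the result is shifted back down into
/// the low `2 * len` bits.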
#[inline(always)]
pub const fn revcomp_u64(word: u64, len: usize) -> u64 {
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    {
        let mut res = word.reverse_bits();
        res = ((res >> 1) & 0x5555_5555_5555_5555) | ((res & 0x5555_5555_5555_5555) << 1);
        res ^= 0xAAAA_AAAA_AAAA_AAAA;
        res >> (u64::BITS as usize - 2 * len)
    }

    #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
    {
        let mut res = word.swap_bytes();
        res = ((res >> 4) & 0x0F0F_0F0F_0F0F_0F0F) | ((res & 0x0F0F_0F0F_0F0F_0F0F) << 4);
        res = ((res >> 2) & 0x3333_3333_3333_3333) | ((res & 0x3333_3333_3333_3333) << 2);
        res ^= 0xAAAA_AAAA_AAAA_AAAA;
        res >> (u64::BITS as usize - 2 * len)
    }
}

impl PackedSeq<'_> {
    #[inline(always)]
    pub fn normalize(&self) -> Self {
        let start = self.offset / 4;
        let end = (self.offset + self.len).div_ceil(4);
        Self {
            seq: &self.seq[start..end],
            offset: self.offset % 4,
            len: self.len,
        }
    }

    #[inline(always)]
    pub fn unpack(&self) -> Vec<u8> {
        self.iter_bp().map(unpack_base).collect()
    }
}

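/// Reads up to 32 bytes of `seq` starting at `idx` into a `u32x8`, zero-padding
/// when fewer than 32 bytes remain.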
#[inline(always)]
pub(crate) fn read_slice(seq: &[u8], idx: usize) -> u32x8 {
    let mut result = [0u8; 32];
    let num_bytes = 32.min(seq.len().saturating_sub(idx));
    unsafe {
        let src = seq.as_ptr().add(idx);
        std::ptr::copy_nonoverlapping(src, result.as_mut_ptr(), num_bytes);
        std::mem::transmute(result)
    }
}

impl<'s> Seq<'s> for PackedSeq<'s> {
    const BASES_PER_BYTE: usize = 4;
    const BITS_PER_CHAR: usize = 2;
    type SeqVec = PackedSeqVec;

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn get(&self, index: usize) -> u8 {
        let offset = self.offset + index;
        let idx = offset / 4;
        let offset = offset % 4;
        (self.seq[idx] >> (2 * offset)) & 3
    }

    #[inline(always)]
    fn get_ascii(&self, index: usize) -> u8 {
        unpack_base(self.get(index))
    }

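    // With `offset < 4`, a sequence of up to 29 bases spans at most 8 bytes, so a single
    // unaligned u64 read suffices; longer sequences (up to 32 bases) need a u128 read.
    // Both reads may extend past the packed bytes into the trailing PADDING zeros.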
    #[inline(always)]
    fn as_u64(&self) -> u64 {
        assert!(self.len() <= 32);
        debug_assert!(self.seq.len() <= 9);

        let mask = u64::MAX >> (64 - 2 * self.len());

        if self.len() <= 29 {
            let x = unsafe { (self.seq.as_ptr() as *const u64).read_unaligned() };
            (x >> (2 * self.offset)) & mask
        } else {
            let x = unsafe { (self.seq.as_ptr() as *const u128).read_unaligned() };
            (x >> (2 * self.offset)) as u64 & mask
        }
    }

    #[inline(always)]
    fn revcomp_as_u64(&self) -> u64 {
        revcomp_u64(self.as_u64(), self.len())
    }

    #[inline(always)]
    fn to_vec(&self) -> PackedSeqVec {
        assert_eq!(self.offset, 0);
        PackedSeqVec {
            seq: self
                .seq
                .iter()
                .copied()
                .chain(std::iter::repeat_n(0u8, PADDING))
                .collect(),
            len: self.len,
        }
    }

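    // Reverse-complement by reversing the byte order, reversing and complementing the
    // 2-bit groups within each byte, and then shifting the whole byte string right to
    // realign the first base to offset 0.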
    fn to_revcomp(&self) -> PackedSeqVec {
        let mut seq = self.seq[..(self.offset + self.len).div_ceil(4)]
            .iter()
            .rev()
            .copied()
            .map(|mut res| {
                res = ((res >> 4) & 0x0F) | ((res & 0x0F) << 4);
                res = ((res >> 2) & 0x33) | ((res & 0x33) << 2);
                res ^ 0xAA
            })
            .chain(std::iter::repeat_n(0u8, PADDING))
            .collect::<Vec<u8>>();

        let new_offset = (4 - (self.offset + self.len) % 4) % 4;

        if new_offset > 0 {
            let shift = 2 * new_offset;
            *seq.last_mut().unwrap() >>= shift;
            for i in 0..seq.len() - 1 {
                seq[i] = (seq[i] >> shift) | (seq[i + 1] << (8 - shift));
            }
        }

        PackedSeqVec { seq, len: self.len }
    }

    #[inline(always)]
    fn slice(&self, range: Range<usize>) -> Self {
        debug_assert!(
            range.end <= self.len,
            "Slice index out of bounds: {} > {}",
            range.end,
            self.len
        );
        PackedSeq {
            seq: self.seq,
            offset: self.offset + range.start,
            len: range.end - range.start,
        }
        .normalize()
    }

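    // Iterate the bases one at a time, reloading a byte every 4 bases; the first
    // `offset` values are consumed up front so the iterator starts at the first base
    // of this slice.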
    #[inline(always)]
    fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> + Clone {
        assert!(self.len <= self.seq.len() * 4);

        let this = self.normalize();

        let mut byte = 0;
        let mut it = (0..this.len + this.offset).map(
            #[inline(always)]
            move |i| {
                if i % 4 == 0 {
                    byte = this.seq[i / 4];
                }
                (byte >> (2 * (i % 4))) & 0b11
            },
        );
        it.by_ref().take(this.offset).for_each(drop);
        it
    }

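    // Split the packed bytes into 8 equal chunks and stream them in parallel, one per
    // SIMD lane. Every 128 iterations, 32 bytes per lane are read via `read_slice` and
    // transposed; every 16 iterations one u32 (16 bases) per lane is loaded from that
    // buffer into `cur`, from which 2 bits are shifted out per iteration. The returned
    // `padding` is the number of excess positions produced at the end, beyond the
    // actual k-mers of the sequence.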
    #[inline(always)]
    fn par_iter_bp(self, context: usize) -> (impl ExactSizeIterator<Item = S> + Clone, usize) {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < 4);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(4);
        let bytes_per_chunk = n / 4;
        let padding = 4 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut cur = S::ZERO;

        let mut buf = Box::new([S::ZERO; 8]);

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };
        let mut it = (0..par_len).map(
            #[inline(always)]
            move |i| {
                if i % 16 == 0 {
                    if i % 128 == 0 {
                        let data: [u32x8; 8] = from_fn(
                            #[inline(always)]
                            |lane| read_slice(this.seq, offsets[lane] + (i / 4)),
                        );
                        *buf = transpose(data);
                    }
                    cur = buf[(i % 128) / 16];
                }
                let chars = cur & S::splat(0x03);
                cur = cur >> S::splat(2);
                chars
            },
        );
        it.by_ref().take(o).for_each(drop);

        (it, padding)
    }

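    // Same streaming scheme as `par_iter_bp`, but each item also carries the base value
    // from `delay` positions earlier in the stream (`S::ZERO` before the start),
    // implemented with a power-of-two ring buffer of previously transposed 16-base
    // blocks whose read index trails the write index.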
    #[inline(always)]
    fn par_iter_bp_delayed(
        self,
        context: usize,
        delay: usize,
    ) -> (impl ExactSizeIterator<Item = (S, S)> + Clone, usize) {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        assert!(
            delay < usize::MAX / 2,
            "Delay={} should be >=0.",
            delay as isize
        );

        let this = self.normalize();
        let o = this.offset;
        assert!(o < 4);

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(4);
        let bytes_per_chunk = n / 4;
        let padding = 4 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d = S::ZERO;

        let buf_len = (delay / 16 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        let mut buf = vec![S::ZERO; buf_len];
        let mut write_idx = 0;
        let mut read_idx = (buf_len - delay / 16) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };
        let mut it = (0..par_len).map(
            #[inline(always)]
            move |i| {
                if i % 16 == 0 {
                    if i % 128 == 0 {
                        let data: [u32x8; 8] = from_fn(
                            #[inline(always)]
                            |lane| read_slice(this.seq, offsets[lane] + (i / 4)),
                        );
                        unsafe {
                            *TryInto::<&mut [u32x8; 8]>::try_into(
                                buf.get_unchecked_mut(write_idx..write_idx + 8),
                            )
                            .unwrap_unchecked() = transpose(data);
                        }
                        if i == 0 {
                            let elem = !((1u32 << (2 * o)) - 1);
                            let mask = S::splat(elem);
                            buf[write_idx] &= mask;
                        }
                    }
                    upcoming = buf[write_idx];
                    write_idx += 1;
                    write_idx &= buf_mask;
                }
                if i % 16 == delay % 16 {
                    unsafe { assert_unchecked(read_idx < buf.len()) };
                    upcoming_d = buf[read_idx];
                    read_idx += 1;
                    read_idx &= buf_mask;
                }
                let chars = upcoming & S::splat(0x03);
                let chars_d = upcoming_d & S::splat(0x03);
                upcoming = upcoming >> S::splat(2);
                upcoming_d = upcoming_d >> S::splat(2);
                (chars, chars_d)
            },
        );
        it.by_ref().take(o).for_each(drop);

        (it, padding)
    }

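    // Like `par_iter_bp_delayed`, but yields two delayed streams at once, lagging the
    // current base by `delay1` and `delay2` positions respectively (`delay1 <= delay2`).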
    #[inline(always)]
    fn par_iter_bp_delayed_2(
        self,
        context: usize,
        delay1: usize,
        delay2: usize,
    ) -> (impl ExactSizeIterator<Item = (S, S, S)> + Clone, usize) {
        #[cfg(target_endian = "big")]
        panic!("Big endian architectures are not supported.");

        let this = self.normalize();
        let o = this.offset;
        assert!(o < 4);
        assert!(delay1 <= delay2, "Delay1 must be at most delay2.");

        let num_kmers = if this.len == 0 {
            0
        } else {
            (this.len + o).saturating_sub(context - 1)
        };
        let num_kmers_stride = this.len.saturating_sub(context - 1);
        let n = num_kmers_stride.div_ceil(L).next_multiple_of(4);
        let bytes_per_chunk = n / 4;
        let padding = 4 * L * bytes_per_chunk - num_kmers_stride;

        let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
        let mut upcoming = S::ZERO;
        let mut upcoming_d1 = S::ZERO;
        let mut upcoming_d2 = S::ZERO;

        let buf_len = (delay2 / 16 + 8).next_power_of_two();
        let buf_mask = buf_len - 1;
        let mut buf = vec![S::ZERO; buf_len];
        let mut write_idx = 0;
        let mut read_idx1 = (buf_len - delay1 / 16) % buf_len;
        let mut read_idx2 = (buf_len - delay2 / 16) % buf_len;

        let par_len = if num_kmers == 0 {
            0
        } else {
            n + context + o - 1
        };
        let mut it = (0..par_len).map(
            #[inline(always)]
            move |i| {
                if i % 16 == 0 {
                    if i % 128 == 0 {
                        let data: [u32x8; 8] = from_fn(
                            #[inline(always)]
                            |lane| read_slice(this.seq, offsets[lane] + (i / 4)),
                        );
                        unsafe {
                            *TryInto::<&mut [u32x8; 8]>::try_into(
                                buf.get_unchecked_mut(write_idx..write_idx + 8),
                            )
                            .unwrap_unchecked() = transpose(data);
                        }
                        if i == 0 {
                            let elem = !((1u32 << (2 * o)) - 1);
                            let mask = S::splat(elem);
                            buf[write_idx] &= mask;
                        }
                    }
                    upcoming = buf[write_idx];
                    write_idx += 1;
                    write_idx &= buf_mask;
                }
                if i % 16 == delay1 % 16 {
                    unsafe { assert_unchecked(read_idx1 < buf.len()) };
                    upcoming_d1 = buf[read_idx1];
                    read_idx1 += 1;
                    read_idx1 &= buf_mask;
                }
                if i % 16 == delay2 % 16 {
                    unsafe { assert_unchecked(read_idx2 < buf.len()) };
                    upcoming_d2 = buf[read_idx2];
                    read_idx2 += 1;
                    read_idx2 &= buf_mask;
                }
                let chars = upcoming & S::splat(0x03);
                let chars_d1 = upcoming_d1 & S::splat(0x03);
                let chars_d2 = upcoming_d2 & S::splat(0x03);
                upcoming = upcoming >> S::splat(2);
                upcoming_d1 = upcoming_d1 >> S::splat(2);
                upcoming_d2 = upcoming_d2 >> S::splat(2);
                (chars, chars_d1, chars_d2)
            },
        );
        it.by_ref().take(o).for_each(drop);

        (it, padding)
    }

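    // Compare 29 bases at a time via `as_u64`; on the first differing word, the number
    // of matching low bit-pairs gives the remaining LCP, and the differing bases (in
    // packed 2-bit encoding) decide the ordering.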
    fn cmp_lcp(&self, other: &Self) -> (std::cmp::Ordering, usize) {
        let mut lcp = 0;
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(29) {
            let len = (min_len - i).min(29);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() / 2 * 2;
                lcp += t as usize / 2;
                let mask = 0b11 << t;
                return ((this_word & mask).cmp(&(other_word & mask)), lcp);
            }
            lcp += len;
        }
        (self.len.cmp(&other.len), lcp)
    }
}

impl PartialEq for PackedSeq<'_> {
    fn eq(&self, other: &Self) -> bool {
        if self.len != other.len {
            return false;
        }
        for i in (0..self.len).step_by(29) {
            let len = (self.len - i).min(29);
            let this = self.slice(i..i + len);
            let that = other.slice(i..i + len);
            if this.as_u64() != that.as_u64() {
                return false;
            }
        }
        true
    }
}

impl Eq for PackedSeq<'_> {}

impl PartialOrd for PackedSeq<'_> {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for PackedSeq<'_> {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let min_len = self.len.min(other.len);
        for i in (0..min_len).step_by(29) {
            let len = (min_len - i).min(29);
            let this = self.slice(i..i + len);
            let other = other.slice(i..i + len);
            let this_word = this.as_u64();
            let other_word = other.as_u64();
            if this_word != other_word {
                let eq = this_word ^ other_word;
                let t = eq.trailing_zeros() / 2 * 2;
                let mask = 0b11 << t;
                return (this_word & mask).cmp(&(other_word & mask));
            }
        }
        self.len.cmp(&other.len)
    }
}

impl SeqVec for PackedSeqVec {
    type Seq<'s> = PackedSeq<'s>;

    #[inline(always)]
    fn into_raw(mut self) -> Vec<u8> {
        self.seq.resize(self.len.div_ceil(4), 0);
        self.seq
    }

    #[inline(always)]
    fn as_slice(&self) -> Self::Seq<'_> {
        PackedSeq {
            seq: &self.seq[..self.len.div_ceil(4)],
            offset: 0,
            len: self.len,
        }
    }

    #[inline(always)]
    fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    #[inline(always)]
    fn clear(&mut self) {
        self.seq.clear();
        self.len = 0;
    }

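    // Append a packed slice by copying its raw bytes at the next byte boundary (keeping
    // the slice's own sub-byte offset) and re-appending the PADDING bytes; returns the
    // range of positions at which the pushed sequence was stored.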
    fn push_seq<'a>(&mut self, seq: PackedSeq<'_>) -> Range<usize> {
        let start = self.len.next_multiple_of(4) + seq.offset;
        let end = start + seq.len();
        self.seq.reserve(seq.seq.len());
        self.seq.resize(self.len.div_ceil(4), 0);
        self.seq.extend(seq.seq);
        self.seq.extend(std::iter::repeat_n(0u8, PADDING));
        self.len = end;
        start..end
    }

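    // Append ASCII bases, packing 4 per byte. After handling the bases needed to reach a
    // byte boundary, the x86-64 BMI2 path uses `_pext_u64` with mask 0x0606060606060606
    // to extract bits 1-2 of each ASCII byte, which for A/C/G/T (and lowercase) are
    // exactly the 2-bit codes 0/1/3/2 used by `pack_char`; the NEON path shifts and
    // pairwise-adds the same two bits into position. Any remaining tail is packed one
    // character at a time.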
    fn push_ascii(&mut self, seq: &[u8]) -> Range<usize> {
        self.seq
            .resize((self.len + seq.len()).div_ceil(4) + PADDING, 0);
        let start_aligned = self.len.next_multiple_of(4);
        let start = self.len;
        let len = seq.len();
        let mut idx = self.len / 4;

        let unaligned = core::cmp::min(start_aligned - start, len);
        if unaligned > 0 {
            let mut packed_byte = self.seq[idx];
            for &base in &seq[..unaligned] {
                packed_byte |= pack_char(base) << ((self.len % 4) * 2);
                self.len += 1;
            }
            self.seq[idx] = packed_byte;
            idx += 1;
        }

        #[allow(unused)]
        let mut last = unaligned;

        #[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
        {
            last = unaligned + (len - unaligned) / 8 * 8;

            for i in (unaligned..last).step_by(8) {
                let chunk = &seq[i..i + 8].try_into().unwrap();
                let ascii = u64::from_ne_bytes(*chunk);
                let packed_bytes =
                    unsafe { std::arch::x86_64::_pext_u64(ascii, 0x0606060606060606) };
                self.seq[idx] = packed_bytes as u8;
                idx += 1;
                self.seq[idx] = (packed_bytes >> 8) as u8;
                idx += 1;
                self.len += 8;
            }
        }

        #[cfg(target_feature = "neon")]
        {
            use core::arch::aarch64::{vandq_u8, vdup_n_u8, vld1q_u8, vpadd_u8, vshlq_u8, vst1_u8};
            use core::mem::transmute;

            last = unaligned + (len - unaligned) / 16 * 16;

            for i in (unaligned..last).step_by(16) {
                unsafe {
                    let ascii = vld1q_u8(seq.as_ptr().add(i));
                    let masked_bits = vandq_u8(ascii, transmute([6i8; 16]));
                    let (bits_0, bits_1) = transmute(vshlq_u8(
                        masked_bits,
                        transmute([-1i8, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5, -1, 1, 3, 5]),
                    ));
                    let half_packed = vpadd_u8(bits_0, bits_1);
                    let packed = vpadd_u8(half_packed, vdup_n_u8(0));
                    vst1_u8(self.seq.as_mut_ptr().add(idx), packed);
                    idx += 4;
                    self.len += 16;
                }
            }
        }

        let mut packed_byte = 0;
        for &base in &seq[last..] {
            packed_byte |= pack_char(base) << ((self.len % 4) * 2);
            self.len += 1;
            if self.len % 4 == 0 {
                self.seq[idx] = packed_byte;
                idx += 1;
                packed_byte = 0;
            }
        }
        if self.len % 4 != 0 && last < len {
            self.seq[idx] = packed_byte;
            idx += 1;
        }
        assert_eq!(idx + PADDING, self.seq.len());
        start..start + len
    }

    #[cfg(feature = "rand")]
    fn random(n: usize) -> Self {
        use rand::{RngCore, SeedableRng};

        let byte_len = n.div_ceil(4);
        let mut seq = vec![0; byte_len + PADDING];
        rand::rngs::SmallRng::from_os_rng().fill_bytes(&mut seq[..byte_len]);
        if n % 4 != 0 {
            seq[byte_len - 1] &= (1 << (2 * (n % 4))) - 1;
        }

        Self { seq, len: n }
    }
}
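
// A minimal round-trip sketch, not part of the original file: it only exercises items
// defined above, assuming the `Seq`/`SeqVec` traits imported via `super::*` are in scope.
#[cfg(test)]
mod packed_seq_sketch_tests {
    use super::*;

    #[test]
    fn ascii_roundtrip_and_revcomp() {
        let mut v = PackedSeqVec::default();
        let range = v.push_ascii(b"ACGTACGT");
        assert_eq!(range, 0..8);

        let s = v.as_slice();
        // Unpacking restores the (uppercase) input.
        assert_eq!(s.unpack(), b"ACGTACGT".to_vec());

        // Complementing a packed base is XOR with 2.
        assert_eq!(complement_base(pack_char(b'A')), pack_char(b'T'));

        // ACGTACGT is its own reverse complement, and revcomp is an involution.
        let word = s.as_u64();
        assert_eq!(revcomp_u64(word, s.len()), word);
        assert_eq!(revcomp_u64(revcomp_u64(word, 8), 8), word);
    }
}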