packed_seq/
packed_n_seq.rs1use wide::{CmpLt, i8x32};
2
3use crate::packed_seq::read_slice_32;
4
5use super::*;
6
7#[derive(Clone, Copy, Debug, MemSize, MemDbg)]
8#[cfg_attr(feature = "pyo3", pyo3::pyclass)]
9#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
10pub struct PackedNSeq<'s> {
11 pub seq: PackedSeq<'s>,
12 pub ambiguous: BitSeq<'s>,
13}
14
15#[derive(Clone, Debug, MemSize, MemDbg, Default)]
16#[cfg_attr(feature = "pyo3", pyo3::pyclass)]
17#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
18pub struct PackedNSeqVec {
19 pub seq: PackedSeqVec,
20 pub ambiguous: BitSeqVec,
21}
22
23impl<'s> PackedNSeq<'s> {
25 pub fn to_revcomp(&self) -> PackedNSeqVec {
26 PackedNSeqVec {
27 seq: self.seq.to_revcomp(),
28 ambiguous: self.ambiguous.to_revcomp(),
29 }
30 }
31
32 pub fn slice(&self, range: Range<usize>) -> PackedNSeq<'s> {
33 PackedNSeq {
34 seq: self.seq.slice(range.clone()),
35 ambiguous: self.ambiguous.slice(range),
36 }
37 }
38}
39
40impl PackedNSeqVec {
42 pub fn clear(&mut self) {
43 self.seq.clear();
44 self.ambiguous.clear();
45 }
46
47 #[cfg(feature = "rand")]
48 pub fn random(len: usize, n_frac: f32) -> Self {
49 let seq = PackedSeqVec::random(len);
50 let ambiguous = BitSeqVec::random(len, n_frac);
51 Self { seq, ambiguous }
52 }
53
54 pub fn as_slice(&self) -> PackedNSeq<'_> {
55 PackedNSeq {
56 seq: self.seq.as_slice(),
57 ambiguous: self.ambiguous.as_slice(),
58 }
59 }
60
61 pub fn slice(&self, range: Range<usize>) -> PackedNSeq<'_> {
62 self.as_slice().slice(range).to_owned()
63 }
64
65 pub fn push_ascii(&mut self, seq: &[u8]) -> Range<usize> {
66 let r1 = self.seq.push_ascii(seq);
67 let r2 = self.ambiguous.push_ascii(seq);
68 assert_eq!(r1, r2);
69 r1
70 }
71
72 pub fn from_ascii(seq: &[u8]) -> Self {
73 Self {
74 seq: PackedSeqVec::from_ascii(seq),
75 ambiguous: BitSeqVec::from_ascii(seq),
76 }
77 }
78
79 pub fn from_ascii_and_quality(seq: &[u8], quality: &[u8], threshold: usize) -> Self {
81 assert_eq!(seq.len(), quality.len());
82
83 let mut ambiguous = BitSeqVec::from_ascii(seq);
84
85 {
87 let t = (b'!' + threshold as u8) as i8;
88 let t_simd = i8x32::splat(t);
89 let ambiguous = ambiguous.seq.as_chunks_mut::<4>().0;
90 for i in (0..quality.len()).step_by(32) {
91 let chunk = i8x32::from(unsafe {
92 std::mem::transmute::<_, i8x32>(read_slice_32(quality, i))
93 });
94
95 let mask = t_simd.cmp_lt(chunk).move_mask() as u32;
96 let ambi = &mut ambiguous[i / 32];
97 *ambi = (u32::from_ne_bytes(*ambi) | mask).to_ne_bytes();
98 }
99 }
100
101 Self {
102 seq: PackedSeqVec::from_ascii(seq),
103 ambiguous,
104 }
105 }
106
107 pub fn push_from_ascii_and_quality(&mut self, seq: &[u8], quality: &[u8], min_qual: u8) {
109 let r = self.seq.push_ascii(seq);
110 let r2 = self.ambiguous.push_ascii(seq);
111 assert_eq!(r, r2);
112
113 assert_eq!(seq.len(), quality.len());
114
115 let t = b'!' + min_qual;
117 let t_simd = i8x32::splat(t as i8);
118
119 let mut idx = r2.start;
120 let mut i = 0;
121 while idx % 8 != 0 {
122 self.ambiguous.seq[idx / 8] |= ((quality[i] < t) as u8) << (idx % 8);
123 idx += 1;
124 i += 1;
125 }
126 let quality = &quality[i..];
127
128 let ambiguous = self.ambiguous.seq[idx / 8..].as_chunks_mut::<4>().0;
129 for i in (0..quality.len()).step_by(32) {
130 let chunk =
131 i8x32::from(unsafe { std::mem::transmute::<_, i8x32>(read_slice_32(quality, i)) });
132
133 let mask = t_simd.cmp_lt(chunk).move_mask() as u32;
134 let ambi = &mut ambiguous[i / 32];
135 *ambi = (u32::from_ne_bytes(*ambi) | mask).to_ne_bytes();
136 }
137 }
138
139 #[cfg(feature = "file_io")]
141 pub fn from_fastx(path: &std::path::Path) -> Self {
142 let mut ascii = Vec::with_capacity(16000);
143
144 let mut seq = Self::default();
145
146 let mut reader = needletail::parse_fastx_file(&path).unwrap();
147 while let Some(record) = reader.next() {
148 let seqrec = record.expect("Invalid FASTA/Q record");
149 ascii.extend_from_slice(&seqrec.seq());
150 ascii.push(b'N');
151
152 if ascii.len() > 16000 {
154 seq.push_ascii(&ascii);
155 ascii.clear();
156 }
157 }
158
159 if ascii.len() > 0 {
160 seq.push_ascii(&ascii);
161 }
162
163 seq
164 }
165
166 #[cfg(feature = "file_io")]
169 pub fn from_fastq_with_quality(path: &std::path::Path, min_qual: u8) -> Self {
170 let mut ascii = vec![];
171 let mut qual = vec![];
172
173 let mut seq = Self::default();
174
175 let mut reader = needletail::parse_fastx_file(&path).unwrap();
176 while let Some(record) = reader.next() {
177 let seqrec = record.expect("Invalid FASTA/Q record");
178 ascii.extend_from_slice(&seqrec.seq());
179 qual.extend_from_slice(&seqrec.qual().unwrap());
180 ascii.push(b'N');
181 qual.push(0);
182
183 if ascii.len() > 16000 {
185 seq.push_from_ascii_and_quality(&ascii, &qual, min_qual);
186 ascii.clear();
187 qual.clear();
188 }
189 }
190
191 if ascii.len() > 0 {
192 seq.push_from_ascii_and_quality(&ascii, &qual, min_qual);
193 }
194
195 seq
196 }
197}