packed_seq/
packed_n_seq.rs

1use wide::{CmpLt, i8x32};
2
3use crate::packed_seq::read_slice_32;
4
5use super::*;
6
7#[derive(Clone, Copy, Debug, MemSize, MemDbg)]
8#[cfg_attr(feature = "pyo3", pyo3::pyclass)]
9#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
10pub struct PackedNSeq<'s> {
11    pub seq: PackedSeq<'s>,
12    pub ambiguous: BitSeq<'s>,
13}
14
15#[derive(Clone, Debug, MemSize, MemDbg, Default)]
16#[cfg_attr(feature = "pyo3", pyo3::pyclass)]
17#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
18pub struct PackedNSeqVec {
19    pub seq: PackedSeqVec,
20    pub ambiguous: BitSeqVec,
21}
22
23/// Implement a subset of `Seq` for `PackedNSeq`.
24impl<'s> PackedNSeq<'s> {
25    pub fn to_revcomp(&self) -> PackedNSeqVec {
26        PackedNSeqVec {
27            seq: self.seq.to_revcomp(),
28            ambiguous: self.ambiguous.to_revcomp(),
29        }
30    }
31
32    pub fn slice(&self, range: Range<usize>) -> PackedNSeq<'s> {
33        PackedNSeq {
34            seq: self.seq.slice(range.clone()),
35            ambiguous: self.ambiguous.slice(range),
36        }
37    }
38}
39
40/// Implement a subset of `SeqVec` for `PackedNSeqVec`.
41impl PackedNSeqVec {
42    pub fn clear(&mut self) {
43        self.seq.clear();
44        self.ambiguous.clear();
45    }
46
47    #[cfg(feature = "rand")]
48    pub fn random(len: usize, n_frac: f32) -> Self {
49        let seq = PackedSeqVec::random(len);
50        let ambiguous = BitSeqVec::random(len, n_frac);
51        Self { seq, ambiguous }
52    }
53
54    pub fn as_slice(&self) -> PackedNSeq<'_> {
55        PackedNSeq {
56            seq: self.seq.as_slice(),
57            ambiguous: self.ambiguous.as_slice(),
58        }
59    }
60
61    pub fn slice(&self, range: Range<usize>) -> PackedNSeq<'_> {
62        self.as_slice().slice(range).to_owned()
63    }
64
65    pub fn push_ascii(&mut self, seq: &[u8]) -> Range<usize> {
66        let r1 = self.seq.push_ascii(seq);
67        let r2 = self.ambiguous.push_ascii(seq);
68        assert_eq!(r1, r2);
69        r1
70    }
71
72    pub fn from_ascii(seq: &[u8]) -> Self {
73        Self {
74            seq: PackedSeqVec::from_ascii(seq),
75            ambiguous: BitSeqVec::from_ascii(seq),
76        }
77    }
78
79    /// Create a mask that is 1 for all non-ACGT bases and for all low-quality bases with quality `<threshold`.
80    pub fn from_ascii_and_quality(seq: &[u8], quality: &[u8], threshold: usize) -> Self {
81        assert_eq!(seq.len(), quality.len());
82
83        let mut ambiguous = BitSeqVec::from_ascii(seq);
84
85        // Low-quality bases are also ambiguous.
86        {
87            let t = (b'!' + threshold as u8) as i8;
88            let t_simd = i8x32::splat(t);
89            let ambiguous = ambiguous.seq.as_chunks_mut::<4>().0;
90            for i in (0..quality.len()).step_by(32) {
91                let chunk = i8x32::from(unsafe {
92                    std::mem::transmute::<_, i8x32>(read_slice_32(quality, i))
93                });
94
95                let mask = t_simd.cmp_lt(chunk).move_mask() as u32;
96                let ambi = &mut ambiguous[i / 32];
97                *ambi = (u32::from_ne_bytes(*ambi) | mask).to_ne_bytes();
98            }
99        }
100
101        Self {
102            seq: PackedSeqVec::from_ascii(seq),
103            ambiguous,
104        }
105    }
106
107    /// Create a mask that is 1 for all non-ACGT bases and for all low-quality bases with quality `<threshold`.
108    pub fn push_from_ascii_and_quality(&mut self, seq: &[u8], quality: &[u8], min_qual: u8) {
109        let r = self.seq.push_ascii(seq);
110        let r2 = self.ambiguous.push_ascii(seq);
111        assert_eq!(r, r2);
112
113        assert_eq!(seq.len(), quality.len());
114
115        // Low-quality bases are also ambiguous.
116        let t = b'!' + min_qual;
117        let t_simd = i8x32::splat(t as i8);
118
119        let mut idx = r2.start;
120        let mut i = 0;
121        while idx % 8 != 0 {
122            self.ambiguous.seq[idx / 8] |= ((quality[i] < t) as u8) << (idx % 8);
123            idx += 1;
124            i += 1;
125        }
126        let quality = &quality[i..];
127
128        let ambiguous = self.ambiguous.seq[idx / 8..].as_chunks_mut::<4>().0;
129        for i in (0..quality.len()).step_by(32) {
130            let chunk =
131                i8x32::from(unsafe { std::mem::transmute::<_, i8x32>(read_slice_32(quality, i)) });
132
133            let mask = t_simd.cmp_lt(chunk).move_mask() as u32;
134            let ambi = &mut ambiguous[i / 32];
135            *ambi = (u32::from_ne_bytes(*ambi) | mask).to_ne_bytes();
136        }
137    }
138
139    /// Read a fasta/fastq file at the given path into a `PackedNSeq`, separating records with `N`.
140    #[cfg(feature = "file_io")]
141    pub fn from_fastx(path: &std::path::Path) -> Self {
142        let mut ascii = Vec::with_capacity(16000);
143
144        let mut seq = Self::default();
145
146        let mut reader = needletail::parse_fastx_file(&path).unwrap();
147        while let Some(record) = reader.next() {
148            let seqrec = record.expect("Invalid FASTA/Q record");
149            ascii.extend_from_slice(&seqrec.seq());
150            ascii.push(b'N');
151
152            // Convert from ascii dna to packed dna+mask in batches of 16kbp.
153            if ascii.len() > 16000 {
154                seq.push_ascii(&ascii);
155                ascii.clear();
156            }
157        }
158
159        if ascii.len() > 0 {
160            seq.push_ascii(&ascii);
161        }
162
163        seq
164    }
165
166    /// Read a fasta/fastq file at the given path into a `PackedNSeq`, separating records with `N`.
167    /// Low-quality bases are masked out.
168    #[cfg(feature = "file_io")]
169    pub fn from_fastq_with_quality(path: &std::path::Path, min_qual: u8) -> Self {
170        let mut ascii = vec![];
171        let mut qual = vec![];
172
173        let mut seq = Self::default();
174
175        let mut reader = needletail::parse_fastx_file(&path).unwrap();
176        while let Some(record) = reader.next() {
177            let seqrec = record.expect("Invalid FASTA/Q record");
178            ascii.extend_from_slice(&seqrec.seq());
179            qual.extend_from_slice(&seqrec.qual().unwrap());
180            ascii.push(b'N');
181            qual.push(0);
182
183            // Convert from ascii dna+qual to packed dna+mask in batches of 16kbp.
184            if ascii.len() > 16000 {
185                seq.push_from_ascii_and_quality(&ascii, &qual, min_qual);
186                ascii.clear();
187                qual.clear();
188            }
189        }
190
191        if ascii.len() > 0 {
192            seq.push_from_ascii_and_quality(&ascii, &qual, min_qual);
193        }
194
195        seq
196    }
197}