1use crate::offsets::{EliasFanoOffsets, OffsetsVector};
7use crate::kmer::{Kmer, KmerBits};
8use std::io::{self, Read, Write};
9
10pub struct SpectrumPreservingStringSet {
18 strings: Vec<u8>,
20 offsets: EliasFanoOffsets,
23 k: usize,
25 m: usize,
27}
28
29impl SpectrumPreservingStringSet {
30 pub fn new(k: usize, m: usize) -> Self {
32 Self {
33 strings: Vec::new(),
34 offsets: EliasFanoOffsets::from_vec(&[0]),
35 k,
36 m,
37 }
38 }
39
40 pub fn from_parts(strings: Vec<u8>, offsets: OffsetsVector, k: usize, m: usize) -> Self {
50 Self {
51 strings,
52 offsets: EliasFanoOffsets::from_offsets_vector(offsets),
53 k,
54 m,
55 }
56 }
57
58 pub fn string_offsets(&self, string_id: u32) -> (u64, u64) {
60 let id = string_id as usize;
61 let begin = self.offsets.access(id);
62 let end = self.offsets.access(id + 1);
63 (begin, end)
64 }
65
66 pub fn num_strings(&self) -> u64 {
68 if !self.offsets.is_empty() {
69 (self.offsets.len() - 1) as u64
70 } else {
71 0
72 }
73 }
74
75 pub fn string_offset(&self, string_id: u64) -> u64 {
77 self.offsets.access(string_id as usize)
78 }
79
80 pub fn k(&self) -> usize {
82 self.k
83 }
84
85 pub fn m(&self) -> usize {
87 self.m
88 }
89
90 pub fn total_bases(&self) -> u64 {
92 if !self.offsets.is_empty() {
93 self.offsets.access(self.offsets.len() - 1)
94 } else {
95 0
96 }
97 }
98
99 #[inline]
102 pub fn locate(&self, absolute_pos: u64) -> Option<(u64, u64)> {
103 self.offsets.locate(absolute_pos)
104 }
105
106 #[inline]
110 pub fn locate_with_end(&self, absolute_pos: u64) -> Option<(u64, u64, u64)> {
111 self.offsets.locate_with_end(absolute_pos)
112 }
113
114 pub fn num_bits(&self) -> u64 {
116 (self.strings.len() as u64) * 8 + self.offsets.num_bits()
117 }
118
119 pub fn strings_bytes(&self) -> usize {
121 self.strings.len()
122 }
123
124 #[inline]
127 pub fn strings_raw(&self) -> &[u8] {
128 &self.strings
129 }
130
131 #[inline]
133 pub fn offsets_len(&self) -> usize {
134 self.offsets.len()
135 }
136
137 #[inline]
139 pub fn offset_at(&self, i: usize) -> u64 {
140 self.offsets.access(i)
141 }
142
143 pub fn offsets_bytes(&self) -> usize {
145 self.offsets.num_bytes() as usize
146 }
147
148 pub fn string_length(&self, string_id: u64) -> usize {
150 let (begin, end) = self.string_offsets(string_id as u32);
151 (end - begin) as usize
152 }
153
154 #[inline]
158 pub fn decode_kmer<const K: usize>(&self, string_id: u64, kmer_pos: usize) -> Kmer<K>
159 where
160 Kmer<K>: KmerBits,
161 {
162 let (begin, _end) = self.string_offsets(string_id as u32);
163 let start_base = (begin as usize) + kmer_pos;
164
165 let byte_offset = start_base / 4;
166 let bit_shift = (start_base % 4) * 2;
167 let needed_bits = K * 2;
168
169 let needed_bytes = (needed_bits + bit_shift).div_ceil(8);
172
173 if needed_bytes <= 8 {
174 let mut buf = [0u8; 8];
176 let avail = self.strings.len().saturating_sub(byte_offset).min(8);
177 buf[..avail].copy_from_slice(&self.strings[byte_offset..byte_offset + avail]);
178 let raw = u64::from_le_bytes(buf);
179 let shifted = raw >> bit_shift;
180 let mask = if needed_bits >= 64 { u64::MAX } else { (1u64 << needed_bits) - 1 };
181 Kmer::<K>::new(<Kmer<K> as KmerBits>::from_u64(shifted & mask))
182 } else {
183 let mut buf = [0u8; 17];
188 let avail = self.strings.len().saturating_sub(byte_offset).min(17);
189 buf[..avail].copy_from_slice(&self.strings[byte_offset..byte_offset + avail]);
190 let raw = u128::from_le_bytes(buf[..16].try_into().unwrap());
191 let shifted = if bit_shift > 0 {
192 let extra = buf[16] as u128;
193 (raw >> bit_shift) | (extra << (128 - bit_shift))
194 } else {
195 raw
196 };
197 let mask = if needed_bits >= 128 { u128::MAX } else { (1u128 << needed_bits) - 1 };
198 Kmer::<K>::new(<Kmer<K> as KmerBits>::from_u128(shifted & mask))
199 }
200 }
201
202 #[inline]
206 pub fn decode_kmer_at<const K: usize>(&self, absolute_pos: usize) -> Kmer<K>
207 where
208 Kmer<K>: KmerBits,
209 {
210 let byte_offset = absolute_pos / 4;
211 let bit_shift = (absolute_pos % 4) * 2;
212 let needed_bits = K * 2;
213 let needed_bytes = (needed_bits + bit_shift).div_ceil(8);
214
215 if needed_bytes <= 8 {
216 let raw = if byte_offset + 8 <= self.strings.len() {
222 unsafe {
223 std::ptr::read_unaligned(
224 self.strings.as_ptr().add(byte_offset) as *const u64
225 )
226 }
227 } else {
228 let mut buf = [0u8; 8];
230 let avail = self.strings.len() - byte_offset;
231 buf[..avail].copy_from_slice(&self.strings[byte_offset..byte_offset + avail]);
232 u64::from_le_bytes(buf)
233 };
234 let shifted = raw >> bit_shift;
235 let mask = if needed_bits >= 64 { u64::MAX } else { (1u64 << needed_bits) - 1 };
236 Kmer::<K>::new(<Kmer<K> as KmerBits>::from_u64(shifted & mask))
237 } else {
238 let (raw, extra_byte) = if byte_offset + 17 <= self.strings.len() {
241 let r = unsafe {
242 std::ptr::read_unaligned(
243 self.strings.as_ptr().add(byte_offset) as *const u128
244 )
245 };
246 (r, self.strings[byte_offset + 16])
247 } else if byte_offset + 16 <= self.strings.len() {
248 let r = unsafe {
249 std::ptr::read_unaligned(
250 self.strings.as_ptr().add(byte_offset) as *const u128
251 )
252 };
253 let extra = if byte_offset + 16 < self.strings.len() {
254 self.strings[byte_offset + 16]
255 } else {
256 0u8
257 };
258 (r, extra)
259 } else {
260 let mut buf = [0u8; 17];
261 let avail = self.strings.len() - byte_offset;
262 buf[..avail].copy_from_slice(&self.strings[byte_offset..byte_offset + avail]);
263 (u128::from_le_bytes(buf[..16].try_into().unwrap()), buf[16])
264 };
265 let shifted = if bit_shift > 0 {
266 (raw >> bit_shift) | ((extra_byte as u128) << (128 - bit_shift))
267 } else {
268 raw
269 };
270 let mask = if needed_bits >= 128 { u128::MAX } else { (1u128 << needed_bits) - 1 };
271 Kmer::<K>::new(<Kmer<K> as KmerBits>::from_u128(shifted & mask))
272 }
273 }
274
275 pub fn serialize_to<W: Write>(&self, writer: &mut W) -> io::Result<()> {
284 writer.write_all(&(self.k as u64).to_le_bytes())?;
285 writer.write_all(&(self.m as u64).to_le_bytes())?;
286 writer.write_all(&(self.strings.len() as u64).to_le_bytes())?;
287 writer.write_all(&self.strings)?;
288 self.offsets.write_to(writer)?;
289 Ok(())
290 }
291
292 pub fn deserialize_from<R: Read>(reader: &mut R) -> io::Result<Self> {
294 let mut buf8 = [0u8; 8];
295 reader.read_exact(&mut buf8)?;
296 let k = u64::from_le_bytes(buf8) as usize;
297 reader.read_exact(&mut buf8)?;
298 let m = u64::from_le_bytes(buf8) as usize;
299 reader.read_exact(&mut buf8)?;
300 let strings_len = u64::from_le_bytes(buf8) as usize;
301 let mut strings = vec![0u8; strings_len];
302 reader.read_exact(&mut strings)?;
303 let offsets = EliasFanoOffsets::read_from(reader)?;
304 Ok(Self { strings, offsets, k, m })
305 }
306}
307
308impl std::fmt::Debug for SpectrumPreservingStringSet {
309 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
310 f.debug_struct("SpectrumPreservingStringSet")
311 .field("k", &self.k)
312 .field("m", &self.m)
313 .field("num_strings", &self.num_strings())
314 .field("total_bases", &self.total_bases())
315 .field("num_bits", &self.num_bits())
316 .finish()
317 }
318}
319
320#[cfg(test)]
321mod tests {
322 use super::*;
323 use crate::offsets::OffsetsVector;
324
325 fn build_test_spss(k: usize, m: usize, strings: &[&str]) -> SpectrumPreservingStringSet {
328 let mut packed = Vec::new();
329 let mut offsets = OffsetsVector::new();
330 let mut total_bases: u64 = 0;
331
332 for s in strings {
333 for &b in s.as_bytes() {
334 let bits = match b {
335 b'A' | b'a' => 0u8,
336 b'C' | b'c' => 1u8,
337 b'G' | b'g' => 3u8,
338 b'T' | b't' => 2u8,
339 _ => panic!("invalid base"),
340 };
341 let byte_idx = (total_bases as usize) / 4;
342 let bit_off = ((total_bases as usize) % 4) * 2;
343 if byte_idx >= packed.len() {
344 packed.push(0u8);
345 }
346 packed[byte_idx] |= bits << bit_off;
347 total_bases += 1;
348 }
349 offsets.push(total_bases);
350 }
351
352 SpectrumPreservingStringSet::from_parts(packed, offsets, k, m)
353 }
354
355 #[test]
356 fn test_spss_creation() {
357 let spss = SpectrumPreservingStringSet::new(31, 13);
358 assert_eq!(spss.k(), 31);
359 assert_eq!(spss.m(), 13);
360 assert_eq!(spss.num_strings(), 0);
361 }
362
363 #[test]
364 fn test_spss_two_strings() {
365 let spss = build_test_spss(31, 13, &[
366 "ACGTACGTACGTACGTACGTACGTACGTACG", "TGCATGCATGCATGCATGCATGCATGCATGCA", ]);
369 assert_eq!(spss.num_strings(), 2);
370 }
371
372 #[test]
373 fn test_spss_string_offsets() {
374 let spss = build_test_spss(31, 13, &[
375 "ACGTACGTACGTACGTACGTACGTACGTACG", "TGCATGCATGCATGCATGCATGCATGCATGC", ]);
378
379 let (begin1, end1) = spss.string_offsets(0);
380 let (begin2, end2) = spss.string_offsets(1);
381
382 assert_eq!(begin1, 0);
383 assert_eq!(end1 - begin1, 31);
384 assert_eq!(begin2, 31);
385 assert_eq!(end2 - begin2, 31);
386 }
387
388 #[test]
389 fn test_spss_total_bases() {
390 let spss = build_test_spss(31, 13, &[
391 "ACGTACGTACGTACGTACGTACGTACGTACG", "TGCATGCATGCATGCATGCATGCATGCATGC", ]);
394 assert_eq!(spss.total_bases(), 62);
395 }
396
397 #[test]
398 fn test_spss_debug() {
399 let spss = build_test_spss(31, 13, &[
400 "ACGTACGTACGTACGTACGTACGTACGTACG",
401 ]);
402 let debug_str = format!("{:?}", spss);
403 assert!(debug_str.contains("SpectrumPreservingStringSet"));
404 assert!(debug_str.contains("k: 31"));
405 }
406}