Skip to main content

sshash_lib/
spectrum_preserving_string_set.rs

1//! Spectrum-Preserving String Set (SPSS)
2//!
3//! This is the core data structure that stores the strings in a compressed form.
4//! Each string is stored as a sequence of 2-bit encoded DNA bases.
5
6use crate::offsets::{EliasFanoOffsets, OffsetsVector};
7use crate::kmer::{Kmer, KmerBits};
8use std::io::{self, Read, Write};
9
10/// The spectrum-preserving string set
11///
12/// Stores all strings in a bit-packed format with offsets to each string.
13/// This allows for both memory-efficient storage and efficient access patterns.
14///
15/// Offsets are stored using Elias-Fano encoding (via sux-rs) for compact
16/// representation and O(1) `locate()` via successor queries with Cursor.
17pub struct SpectrumPreservingStringSet {
18    /// Bit-packed string data (2 bits per base)
19    strings: Vec<u8>,
20    /// Offsets into the strings vector (in terms of bases, not bytes),
21    /// encoded using Elias-Fano for ~2 bits/element vs 64 bits/element.
22    offsets: EliasFanoOffsets,
23    /// K-mer size of the strings
24    k: usize,
25    /// M (minimizer) size
26    m: usize,
27}
28
29impl SpectrumPreservingStringSet {
30    /// Create a new empty SPSS
31    pub fn new(k: usize, m: usize) -> Self {
32        Self {
33            strings: Vec::new(),
34            offsets: EliasFanoOffsets::from_vec(&[0]),
35            k,
36            m,
37        }
38    }
39    
40    /// Create a new SPSS from existing strings and offsets
41    ///
42    /// Converts the `OffsetsVector` to Elias-Fano encoding for compact storage.
43    ///
44    /// # Arguments
45    /// * `strings` - Encoded string data (2-bit packed)
46    /// * `offsets` - Offset vector for string boundaries (will be converted to EF)
47    /// * `k` - K-mer size
48    /// * `m` - Minimizer size
49    pub fn from_parts(strings: Vec<u8>, offsets: OffsetsVector, k: usize, m: usize) -> Self {
50        Self {
51            strings,
52            offsets: EliasFanoOffsets::from_offsets_vector(offsets),
53            k,
54            m,
55        }
56    }
57
58    /// Get the string offsets (begin, end) for a string ID
59    pub fn string_offsets(&self, string_id: u32) -> (u64, u64) {
60        let id = string_id as usize;
61        let begin = self.offsets.access(id);
62        let end = self.offsets.access(id + 1);
63        (begin, end)
64    }
65
66    /// Get the number of strings stored
67    pub fn num_strings(&self) -> u64 {
68        if !self.offsets.is_empty() {
69            (self.offsets.len() - 1) as u64
70        } else {
71            0
72        }
73    }
74    
75    /// Get the starting offset (in bases) of a string
76    pub fn string_offset(&self, string_id: u64) -> u64 {
77        self.offsets.access(string_id as usize)
78    }
79
80    /// Get the k-mer size
81    pub fn k(&self) -> usize {
82        self.k
83    }
84
85    /// Get the minimizer size
86    pub fn m(&self) -> usize {
87        self.m
88    }
89
90    /// Get the total number of bases stored
91    pub fn total_bases(&self) -> u64 {
92        if !self.offsets.is_empty() {
93            self.offsets.access(self.offsets.len() - 1)
94        } else {
95            0
96        }
97    }
98
99    /// Locate which string contains a given absolute position.
100    /// Returns `(string_id, string_begin)` or None if out of bounds.
101    #[inline]
102    pub fn locate(&self, absolute_pos: u64) -> Option<(u64, u64)> {
103        self.offsets.locate(absolute_pos)
104    }
105
106    /// Locate which string contains a given absolute position, returning
107    /// `(string_id, string_begin, string_end)` in a single EF traversal.
108    /// This is more efficient than calling `locate()` + `string_offsets()`.
109    #[inline]
110    pub fn locate_with_end(&self, absolute_pos: u64) -> Option<(u64, u64, u64)> {
111        self.offsets.locate_with_end(absolute_pos)
112    }
113
114    /// Get memory usage in bits
115    pub fn num_bits(&self) -> u64 {
116        (self.strings.len() as u64) * 8 + self.offsets.num_bits()
117    }
118
119    /// Get the byte size of the packed strings data
120    pub fn strings_bytes(&self) -> usize {
121        self.strings.len()
122    }
123
124    /// Access the raw 2-bit-packed strings buffer. Intended for crates that
125    /// need to clone the SPSS contents into their own representation.
126    #[inline]
127    pub fn strings_raw(&self) -> &[u8] {
128        &self.strings
129    }
130
131    /// Number of offset entries (`num_strings + 1`).
132    #[inline]
133    pub fn offsets_len(&self) -> usize {
134        self.offsets.len()
135    }
136
137    /// Get the offset at index `i` (0..=num_strings). Offsets are in bases.
138    #[inline]
139    pub fn offset_at(&self, i: usize) -> u64 {
140        self.offsets.access(i)
141    }
142
143    /// Get the byte size of the offsets vector
144    pub fn offsets_bytes(&self) -> usize {
145        self.offsets.num_bytes() as usize
146    }
147    
148    /// Get the length of a specific string in bases
149    pub fn string_length(&self, string_id: u64) -> usize {
150        let (begin, end) = self.string_offsets(string_id as u32);
151        (end - begin) as usize
152    }
153    
154    /// Decode a k-mer from a specific position in a string
155    ///
156    /// Uses word-level loads from the packed buffer for efficiency.
157    #[inline]
158    pub fn decode_kmer<const K: usize>(&self, string_id: u64, kmer_pos: usize) -> Kmer<K>
159    where
160        Kmer<K>: KmerBits,
161    {
162        let (begin, _end) = self.string_offsets(string_id as u32);
163        let start_base = (begin as usize) + kmer_pos;
164        
165        let byte_offset = start_base / 4;
166        let bit_shift = (start_base % 4) * 2;
167        let needed_bits = K * 2;
168        
169        // Read enough bytes to cover K bases + potential misalignment
170        // K bases need ceil((K*2 + bit_shift) / 8) bytes
171        let needed_bytes = (needed_bits + bit_shift).div_ceil(8);
172        
173        if needed_bytes <= 8 {
174            // Single u64 read suffices (K <= ~28-31 bases, common case)
175            let mut buf = [0u8; 8];
176            let avail = self.strings.len().saturating_sub(byte_offset).min(8);
177            buf[..avail].copy_from_slice(&self.strings[byte_offset..byte_offset + avail]);
178            let raw = u64::from_le_bytes(buf);
179            let shifted = raw >> bit_shift;
180            let mask = if needed_bits >= 64 { u64::MAX } else { (1u64 << needed_bits) - 1 };
181            Kmer::<K>::new(<Kmer<K> as KmerBits>::from_u64(shifted & mask))
182        } else {
183            // Need u128 for K > 28 or large bit_shift.
184            // For K=63 (needed_bits=126), when bit_shift > 2 we need more
185            // than 128 bits from the byte stream. Load 16 bytes as u128
186            // plus an extra byte to cover the overflow.
187            let mut buf = [0u8; 17];
188            let avail = self.strings.len().saturating_sub(byte_offset).min(17);
189            buf[..avail].copy_from_slice(&self.strings[byte_offset..byte_offset + avail]);
190            let raw = u128::from_le_bytes(buf[..16].try_into().unwrap());
191            let shifted = if bit_shift > 0 {
192                let extra = buf[16] as u128;
193                (raw >> bit_shift) | (extra << (128 - bit_shift))
194            } else {
195                raw
196            };
197            let mask = if needed_bits >= 128 { u128::MAX } else { (1u128 << needed_bits) - 1 };
198            Kmer::<K>::new(<Kmer<K> as KmerBits>::from_u128(shifted & mask))
199        }
200    }
201
202    /// Decode a k-mer at an absolute base position in the concatenated strings.
203    /// Avoids the need for string_id (no binary search needed).
204    /// This matches the C++ `util::read_kmer_at` approach with decoded_offsets.
205    #[inline]
206    pub fn decode_kmer_at<const K: usize>(&self, absolute_pos: usize) -> Kmer<K>
207    where
208        Kmer<K>: KmerBits,
209    {
210        let byte_offset = absolute_pos / 4;
211        let bit_shift = (absolute_pos % 4) * 2;
212        let needed_bits = K * 2;
213        let needed_bytes = (needed_bits + bit_shift).div_ceil(8);
214        
215        if needed_bytes <= 8 {
216            // Fast path: unaligned u64 load when we have enough bytes.
217            // Avoids bounds-checked copy_from_slice + zero-fill overhead.
218            // Note: ptr::read_unaligned produces native-endian u64, which on
219            // little-endian matches u64::from_le_bytes (the 2-bit encoding
220            // assumes LE byte order).
221            let raw = if byte_offset + 8 <= self.strings.len() {
222                unsafe {
223                    std::ptr::read_unaligned(
224                        self.strings.as_ptr().add(byte_offset) as *const u64
225                    )
226                }
227            } else {
228                // Near end of strings — rare fallback
229                let mut buf = [0u8; 8];
230                let avail = self.strings.len() - byte_offset;
231                buf[..avail].copy_from_slice(&self.strings[byte_offset..byte_offset + avail]);
232                u64::from_le_bytes(buf)
233            };
234            let shifted = raw >> bit_shift;
235            let mask = if needed_bits >= 64 { u64::MAX } else { (1u64 << needed_bits) - 1 };
236            Kmer::<K>::new(<Kmer<K> as KmerBits>::from_u64(shifted & mask))
237        } else {
238            // For K=63 (needed_bits=126), when bit_shift > 2 we need more
239            // than 128 bits from the byte stream. Load u128 + extra byte.
240            let (raw, extra_byte) = if byte_offset + 17 <= self.strings.len() {
241                let r = unsafe {
242                    std::ptr::read_unaligned(
243                        self.strings.as_ptr().add(byte_offset) as *const u128
244                    )
245                };
246                (r, self.strings[byte_offset + 16])
247            } else if byte_offset + 16 <= self.strings.len() {
248                let r = unsafe {
249                    std::ptr::read_unaligned(
250                        self.strings.as_ptr().add(byte_offset) as *const u128
251                    )
252                };
253                let extra = if byte_offset + 16 < self.strings.len() {
254                    self.strings[byte_offset + 16]
255                } else {
256                    0u8
257                };
258                (r, extra)
259            } else {
260                let mut buf = [0u8; 17];
261                let avail = self.strings.len() - byte_offset;
262                buf[..avail].copy_from_slice(&self.strings[byte_offset..byte_offset + avail]);
263                (u128::from_le_bytes(buf[..16].try_into().unwrap()), buf[16])
264            };
265            let shifted = if bit_shift > 0 {
266                (raw >> bit_shift) | ((extra_byte as u128) << (128 - bit_shift))
267            } else {
268                raw
269            };
270            let mask = if needed_bits >= 128 { u128::MAX } else { (1u128 << needed_bits) - 1 };
271            Kmer::<K>::new(<Kmer<K> as KmerBits>::from_u128(shifted & mask))
272        }
273    }
274
275    /// Serialize the SPSS to a writer using a custom binary format.
276    ///
277    /// Format:
278    /// - k: u64 (LE)
279    /// - m: u64 (LE)
280    /// - strings_len: u64 (LE)
281    /// - strings: [u8; strings_len]
282    /// - offsets: epserde Elias-Fano binary format
283    pub fn serialize_to<W: Write>(&self, writer: &mut W) -> io::Result<()> {
284        writer.write_all(&(self.k as u64).to_le_bytes())?;
285        writer.write_all(&(self.m as u64).to_le_bytes())?;
286        writer.write_all(&(self.strings.len() as u64).to_le_bytes())?;
287        writer.write_all(&self.strings)?;
288        self.offsets.write_to(writer)?;
289        Ok(())
290    }
291
292    /// Deserialize an SPSS from a reader.
293    pub fn deserialize_from<R: Read>(reader: &mut R) -> io::Result<Self> {
294        let mut buf8 = [0u8; 8];
295        reader.read_exact(&mut buf8)?;
296        let k = u64::from_le_bytes(buf8) as usize;
297        reader.read_exact(&mut buf8)?;
298        let m = u64::from_le_bytes(buf8) as usize;
299        reader.read_exact(&mut buf8)?;
300        let strings_len = u64::from_le_bytes(buf8) as usize;
301        let mut strings = vec![0u8; strings_len];
302        reader.read_exact(&mut strings)?;
303        let offsets = EliasFanoOffsets::read_from(reader)?;
304        Ok(Self { strings, offsets, k, m })
305    }
306}
307
308impl std::fmt::Debug for SpectrumPreservingStringSet {
309    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
310        f.debug_struct("SpectrumPreservingStringSet")
311            .field("k", &self.k)
312            .field("m", &self.m)
313            .field("num_strings", &self.num_strings())
314            .field("total_bases", &self.total_bases())
315            .field("num_bits", &self.num_bits())
316            .finish()
317    }
318}
319
320#[cfg(test)]
321mod tests {
322    use super::*;
323    use crate::offsets::OffsetsVector;
324
325    /// Helper: build a test SPSS from DNA strings using the same Encoder
326    /// logic (2-bit packing with offsets).
327    fn build_test_spss(k: usize, m: usize, strings: &[&str]) -> SpectrumPreservingStringSet {
328        let mut packed = Vec::new();
329        let mut offsets = OffsetsVector::new();
330        let mut total_bases: u64 = 0;
331
332        for s in strings {
333            for &b in s.as_bytes() {
334                let bits = match b {
335                    b'A' | b'a' => 0u8,
336                    b'C' | b'c' => 1u8,
337                    b'G' | b'g' => 3u8,
338                    b'T' | b't' => 2u8,
339                    _ => panic!("invalid base"),
340                };
341                let byte_idx = (total_bases as usize) / 4;
342                let bit_off = ((total_bases as usize) % 4) * 2;
343                if byte_idx >= packed.len() {
344                    packed.push(0u8);
345                }
346                packed[byte_idx] |= bits << bit_off;
347                total_bases += 1;
348            }
349            offsets.push(total_bases);
350        }
351
352        SpectrumPreservingStringSet::from_parts(packed, offsets, k, m)
353    }
354
355    #[test]
356    fn test_spss_creation() {
357        let spss = SpectrumPreservingStringSet::new(31, 13);
358        assert_eq!(spss.k(), 31);
359        assert_eq!(spss.m(), 13);
360        assert_eq!(spss.num_strings(), 0);
361    }
362
363    #[test]
364    fn test_spss_two_strings() {
365        let spss = build_test_spss(31, 13, &[
366            "ACGTACGTACGTACGTACGTACGTACGTACG",  // 31 chars
367            "TGCATGCATGCATGCATGCATGCATGCATGCA", // 32 chars
368        ]);
369        assert_eq!(spss.num_strings(), 2);
370    }
371
372    #[test]
373    fn test_spss_string_offsets() {
374        let spss = build_test_spss(31, 13, &[
375            "ACGTACGTACGTACGTACGTACGTACGTACG",  // 31 chars
376            "TGCATGCATGCATGCATGCATGCATGCATGC", // 31 chars
377        ]);
378
379        let (begin1, end1) = spss.string_offsets(0);
380        let (begin2, end2) = spss.string_offsets(1);
381
382        assert_eq!(begin1, 0);
383        assert_eq!(end1 - begin1, 31);
384        assert_eq!(begin2, 31);
385        assert_eq!(end2 - begin2, 31);
386    }
387
388    #[test]
389    fn test_spss_total_bases() {
390        let spss = build_test_spss(31, 13, &[
391            "ACGTACGTACGTACGTACGTACGTACGTACG",  // 31 chars
392            "TGCATGCATGCATGCATGCATGCATGCATGC", // 31 chars
393        ]);
394        assert_eq!(spss.total_bases(), 62);
395    }
396
397    #[test]
398    fn test_spss_debug() {
399        let spss = build_test_spss(31, 13, &[
400            "ACGTACGTACGTACGTACGTACGTACGTACG",
401        ]);
402        let debug_str = format!("{:?}", spss);
403        assert!(debug_str.contains("SpectrumPreservingStringSet"));
404        assert!(debug_str.contains("k: 31"));
405    }
406}