Skip to main content

sshash_lib/
offsets.rs

1//! Compact encoding of offsets into a bit-packed string set
2//!
3//! This module provides efficient storage of offsets using variable-length encoding,
4//! reducing the memory footprint of the index.
5//!
6//! Two representations are available:
7//! - `OffsetsVector`: Plain `Vec<u64>`, used during construction
8//! - `EliasFanoOffsets`: Elias-Fano encoding via sux-rs `EfSeqDict`, used after
9//!   construction and during queries. Provides O(1) random access and fast
10//!   `locate()` via successor queries (matches C++ endpoints_sequence).
11
12use epserde::prelude::*;
13
14/// A decoded offset with both absolute and relative information
15#[derive(Clone, Copy, Debug)]
16pub struct DecodedOffset {
17    /// Absolute byte offset into the string data
18    pub absolute_offset: u64,
19    /// Relative offset (for retrieval purposes)
20    pub relative_offset: u64,
21}
22
23impl DecodedOffset {
24    /// Create a new decoded offset
25    pub fn new(absolute_offset: u64, relative_offset: u64) -> Self {
26        Self {
27            absolute_offset,
28            relative_offset,
29        }
30    }
31}
32
33/// Compact vector of offsets
34///
35/// Stores offsets using a compact representation.
36#[derive(Clone, Debug, Epserde)]
37pub struct OffsetsVector {
38    /// Raw offset values
39    offsets: Vec<u64>,
40}
41
42impl OffsetsVector {
43    /// Create a new empty offsets vector
44    pub fn new() -> Self {
45        Self {
46            offsets: vec![0], // Start with 0 for the first offset
47        }
48    }
49
50    /// Add an offset to the vector
51    #[inline]
52    pub fn push(&mut self, offset: u64) {
53        self.offsets.push(offset);
54    }
55
56    /// Get the offset at index `i`
57    #[inline]
58    pub fn access(&self, i: usize) -> u64 {
59        assert!(i < self.offsets.len(), "Offset index {} out of bounds", i);
60        self.offsets[i]
61    }
62
63    /// Decode an offset (currently identity since we don't compress yet)
64    #[inline]
65    pub fn decode(&self, absolute_offset: u64) -> DecodedOffset {
66        DecodedOffset::new(absolute_offset, absolute_offset)
67    }
68
69    /// Get the number of offsets
70    #[inline]
71    pub fn len(&self) -> usize {
72        self.offsets.len()
73    }
74
75    /// Check if the vector is empty
76    #[inline]
77    pub fn is_empty(&self) -> bool {
78        self.offsets.is_empty()
79    }
80
81    /// Get the number of bytes used (approximation for MVP)
82    #[inline]
83    pub fn num_bytes(&self) -> u64 {
84        (self.offsets.len() * 8) as u64
85    }
86
87    /// Get the number of bits used (approximation for MVP)
88    #[inline]
89    pub fn num_bits(&self) -> u64 {
90        (self.offsets.len() * 64) as u64
91    }
92
93    /// Binary search to find which string contains a given absolute position.
94    /// Returns `(string_id, string_begin)` where `offsets[string_id] <= pos < offsets[string_id + 1]`.
95    /// This matches the C++ `decoded_offsets::offset_to_id` / Elias-Fano `locate` approach.
96    #[inline]
97    pub fn locate(&self, pos: u64) -> Option<(u64, u64)> {
98        let n = self.offsets.len();
99        if n < 2 {
100            return None;
101        }
102
103        // Use Rust's optimised binary search: partition_point returns the first
104        // index where the predicate is false, i.e. the first offset > pos.
105        let idx = self.offsets.partition_point(|&x| x <= pos);
106
107        // idx == 0 means pos < offsets[0]: out of bounds.
108        if idx == 0 {
109            return None;
110        }
111        let string_id = idx - 1;
112
113        // Validate: pos must be within [offsets[string_id], offsets[string_id + 1])
114        if string_id + 1 < n {
115            Some((string_id as u64, self.offsets[string_id]))
116        } else {
117            None
118        }
119    }
120
121    /// Branchless binary search variant for benchmarking comparison.
122    ///
123    /// Uses conditional moves instead of branches to avoid branch mispredictions.
124    /// The inner loop has no data-dependent branches - only a CMOV.
125    #[inline]
126    pub fn locate_branchless(&self, pos: u64) -> Option<(u64, u64)> {
127        let n = self.offsets.len();
128        if n < 2 {
129            return None;
130        }
131
132        let data = self.offsets.as_slice();
133
134        // Branchless binary search: find rightmost index where data[idx] <= pos
135        let mut lo: usize = 0;
136        let mut size = n;
137        while size > 1 {
138            let half = size / 2;
139            let mid = lo + half;
140            // Branchless: compiler should emit CMOV
141            // SAFETY: mid is always < n because lo + half < lo + size <= n
142            lo = if unsafe { *data.get_unchecked(mid) } <= pos { mid } else { lo };
143            size -= half;
144        }
145
146        // lo is now the rightmost index where data[lo] <= pos, or 0 if pos < data[0]
147        if unsafe { *data.get_unchecked(lo) } > pos {
148            return None;
149        }
150        if lo + 1 < n {
151            Some((lo as u64, unsafe { *data.get_unchecked(lo) }))
152        } else {
153            None
154        }
155    }
156}
157
158impl Default for OffsetsVector {
159    fn default() -> Self {
160        Self::new()
161    }
162}
163
164#[cfg(test)]
165mod tests {
166    use super::*;
167
168    #[test]
169    fn test_offsets_vector_creation() {
170        let offsets = OffsetsVector::new();
171        assert_eq!(offsets.len(), 1);
172        assert_eq!(offsets.access(0), 0);
173    }
174
175    #[test]
176    fn test_offsets_vector_push() {
177        let mut offsets = OffsetsVector::new();
178        offsets.push(100);
179        offsets.push(200);
180        offsets.push(300);
181
182        assert_eq!(offsets.len(), 4);
183        assert_eq!(offsets.access(0), 0);
184        assert_eq!(offsets.access(1), 100);
185        assert_eq!(offsets.access(2), 200);
186        assert_eq!(offsets.access(3), 300);
187    }
188
189    #[test]
190    fn test_offsets_vector_decode() {
191        let offsets = OffsetsVector::new();
192        let decoded = offsets.decode(50);
193        assert_eq!(decoded.absolute_offset, 50);
194        assert_eq!(decoded.relative_offset, 50);
195    }
196
197    #[test]
198    fn test_decoded_offset_creation() {
199        let decoded = DecodedOffset::new(1000, 500);
200        assert_eq!(decoded.absolute_offset, 1000);
201        assert_eq!(decoded.relative_offset, 500);
202    }
203}
204
205// ---------------------------------------------------------------------------
206// Elias-Fano encoded offsets (sux-rs)
207// ---------------------------------------------------------------------------
208
209use sux::dict::elias_fano::{EliasFanoBuilder, EfSeqDict};
210use sux::traits::{IndexedSeq, Succ};
211use sux::traits::iter::BidiIterator;
212use mem_dbg::{MemSize, SizeFlags};
213
214/// Elias-Fano encoded offsets for compact, fast string boundary lookups.
215///
216/// Uses sux-rs `EfSeqDict` which provides:
217/// - O(1) random access via `get_unchecked(i)` (uses Select1)
218/// - O(1) successor query via `succ(x)` (uses Select0)
219/// - Bidirectional iterator from successor via `iter_bidi_from_succ(x)`:
220///   `next()`/`prev()` use `select_in_word` (single CPU instruction) for
221///   cheap adjacent-element access without full Select1 inventory lookup.
222///
223/// This closely matches the C++ `endpoints_sequence` data structure used by SSHash.
224///
225/// Space usage is approximately `2 + log(U/N)` bits per element (Elias-Fano bound),
226/// compared to 64 bits per element for `Vec<u64>`.
227pub struct EliasFanoOffsets {
228    /// Elias-Fano sequence for compact access and locate
229    ef: EfSeqDict,
230}
231
232impl EliasFanoOffsets {
233    /// Build from a sorted vector of offsets (must start with 0).
234    pub fn from_vec(offsets: &[u64]) -> Self {
235        let n = offsets.len();
236        let u = if n > 0 { offsets[n - 1] as usize + 1 } else { 1 };
237        let mut builder = EliasFanoBuilder::new(n, u);
238        for &v in offsets {
239            builder.push(v as usize);
240        }
241        Self { ef: builder.build_with_seq_and_dict() }
242    }
243
244    /// Build from an `OffsetsVector` (consumes it).
245    pub fn from_offsets_vector(ov: OffsetsVector) -> Self {
246        Self::from_vec(&ov.offsets)
247    }
248
249    /// Get the offset at index `i`.
250    #[inline]
251    pub fn access(&self, i: usize) -> u64 {
252        // SAFETY: caller must ensure i < self.len().
253        unsafe { self.ef.get_unchecked(i) as u64 }
254    }
255
256    /// Locate which string contains a given absolute position, returning
257    /// `(string_id, string_begin, string_end)`.
258    ///
259    /// Uses sux-rs `iter_bidi_from_succ()` to find the successor, then
260    /// cheap `next()`/`prev()` calls (single-instruction bit scans) to
261    /// read adjacent elements without full Select1 inventory lookups.
262    #[inline]
263    pub fn locate_with_end(&self, pos: u64) -> Option<(u64, u64, u64)> {
264        let n = self.ef.len();
265        if n < 2 {
266            return None;
267        }
268
269        // iter_bidi_from_succ returns (index, positioned_iterator) for the
270        // first element >= pos. The first next() yields the successor value.
271        let (idx, mut iter) = self.ef.iter_bidi_from_succ(pos as usize)?;
272
273        // Get the successor value (cheap: reads from cached word).
274        let val = iter.next()?;
275
276        if val == pos as usize {
277            // Exact hit: pos is at a string boundary → string_id = idx.
278            // Need the NEXT element for string_end (cheap bit scan forward).
279            if idx + 1 < n {
280                let end = iter.next()? as u64;
281                Some((idx as u64, pos, end))
282            } else {
283                None
284            }
285        } else {
286            // val > pos → string containing pos starts at idx-1.
287            // val IS the end of this string. Need begin = offsets[idx-1].
288            // prev() undoes the next(), then prev() again gets offsets[idx-1].
289            // Both use select_in_word (single CPU instruction per call).
290            debug_assert!(idx > 0);
291            iter.prev(); // back to idx (returns val, discarded)
292            let begin = iter.prev()? as u64; // offsets[idx-1]
293            Some(((idx - 1) as u64, begin, val as u64))
294        }
295    }
296
297    /// Locate which string contains a given absolute position.
298    /// Returns `(string_id, string_begin)` where
299    /// `offsets[string_id] <= pos < offsets[string_id + 1]`.
300    ///
301    /// Uses `iter_bidi_from_succ()` + cheap `prev()` bit scans
302    /// instead of full Select1 for adjacent element access.
303    #[inline]
304    pub fn locate(&self, pos: u64) -> Option<(u64, u64)> {
305        let n = self.ef.len();
306        if n < 2 {
307            return None;
308        }
309
310        let (idx, mut iter) = self.ef.iter_bidi_from_succ(pos as usize)?;
311        let val = iter.next()?;
312
313        if val == pos as usize {
314            // Exact hit: string_id = idx, but only if there's a next element
315            if idx + 1 < n {
316                Some((idx as u64, pos))
317            } else {
318                None
319            }
320        } else {
321            // val > pos: string containing pos starts at idx - 1
322            debug_assert!(idx > 0);
323            iter.prev(); // back to idx
324            let string_begin = iter.prev()? as u64; // offsets[idx-1]
325            Some(((idx - 1) as u64, string_begin))
326        }
327    }
328
329    /// Number of offsets stored.
330    #[inline]
331    pub fn len(&self) -> usize {
332        self.ef.len()
333    }
334
335    /// Whether there are no offsets.
336    #[inline]
337    pub fn is_empty(&self) -> bool {
338        self.ef.len() == 0
339    }
340
341    /// Actual number of bytes used by the Elias-Fano index (including selection structures).
342    #[inline]
343    pub fn num_bytes(&self) -> u64 {
344        self.ef.mem_size(SizeFlags::default()) as u64
345    }
346
347    /// Actual number of bits used by the Elias-Fano index.
348    #[inline]
349    pub fn num_bits(&self) -> u64 {
350        self.num_bytes() * 8
351    }
352
353    /// Serialize the Elias-Fano offsets to a writer using epserde's binary format.
354    pub fn write_to<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<()> {
355        unsafe {
356            self.ef.serialize(writer)
357                .map_err(std::io::Error::other)?;
358        }
359        Ok(())
360    }
361
362    /// Deserialize Elias-Fano offsets from a reader using epserde's binary format.
363    pub fn read_from<R: std::io::Read>(reader: &mut R) -> std::io::Result<Self> {
364        let ef = unsafe {
365            EfSeqDict::deserialize_full(reader)
366                .map_err(std::io::Error::other)?
367        };
368        Ok(Self { ef })
369    }
370}
371
372impl std::fmt::Debug for EliasFanoOffsets {
373    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
374        f.debug_struct("EliasFanoOffsets")
375            .field("len", &self.ef.len())
376            .finish()
377    }
378}
379
380#[cfg(test)]
381mod ef_tests {
382    use super::*;
383
384    #[test]
385    fn test_ef_from_vec() {
386        let offsets = vec![0, 100, 200, 300, 400];
387        let ef = EliasFanoOffsets::from_vec(&offsets);
388        assert_eq!(ef.len(), 5);
389        assert_eq!(ef.access(0), 0);
390        assert_eq!(ef.access(1), 100);
391        assert_eq!(ef.access(2), 200);
392        assert_eq!(ef.access(3), 300);
393        assert_eq!(ef.access(4), 400);
394    }
395
396    #[test]
397    fn test_ef_locate() {
398        let offsets = vec![0, 100, 200, 300, 400];
399        let ef = EliasFanoOffsets::from_vec(&offsets);
400
401        // pos=50 → string 0 (begins at 0)
402        assert_eq!(ef.locate(50), Some((0, 0)));
403        // pos=100 → exact boundary, string 1 (begins at 100)
404        assert_eq!(ef.locate(100), Some((1, 100)));
405        // pos=199 → string 1 (begins at 100)
406        assert_eq!(ef.locate(199), Some((1, 100)));
407        // pos=300 → exact boundary, string 3 (begins at 300)
408        assert_eq!(ef.locate(300), Some((3, 300)));
409        // pos=399 → string 3 (begins at 300)
410        assert_eq!(ef.locate(399), Some((3, 300)));
411        // pos=400 → out of range (last element is universe bound)
412        assert_eq!(ef.locate(400), None);
413    }
414
415    #[test]
416    fn test_ef_locate_with_end() {
417        let offsets = vec![0, 100, 200, 300, 400];
418        let ef = EliasFanoOffsets::from_vec(&offsets);
419
420        // pos=50 → string 0 (begins at 0, ends at 100)
421        assert_eq!(ef.locate_with_end(50), Some((0, 0, 100)));
422        // pos=100 → exact boundary, string 1 (begins at 100, ends at 200)
423        assert_eq!(ef.locate_with_end(100), Some((1, 100, 200)));
424        // pos=199 → string 1 (begins at 100, ends at 200)
425        assert_eq!(ef.locate_with_end(199), Some((1, 100, 200)));
426        // pos=300 → exact boundary, string 3 (begins at 300, ends at 400)
427        assert_eq!(ef.locate_with_end(300), Some((3, 300, 400)));
428        // pos=399 → string 3 (begins at 300, ends at 400)
429        assert_eq!(ef.locate_with_end(399), Some((3, 300, 400)));
430        // pos=400 → out of range
431        assert_eq!(ef.locate_with_end(400), None);
432    }
433
434    #[test]
435    fn test_ef_serialization_roundtrip() {
436        let offsets = vec![0, 100, 200, 300, 400];
437        let ef = EliasFanoOffsets::from_vec(&offsets);
438
439        let mut buf = Vec::new();
440        ef.write_to(&mut buf).unwrap();
441
442        let ef2 = EliasFanoOffsets::read_from(&mut &buf[..]).unwrap();
443        assert_eq!(ef2.len(), 5);
444        for i in 0..5 {
445            assert_eq!(ef.access(i), ef2.access(i));
446        }
447        assert_eq!(ef2.locate(150), Some((1, 100)));
448        assert_eq!(ef2.locate_with_end(150), Some((1, 100, 200)));
449    }
450
451    /// Stress test: build EF with varying gaps and verify locate_with_end
452    /// against a brute-force reference for EVERY position in the range.
453    #[test]
454    fn test_ef_locate_with_end_stress() {
455        // Create offsets with varying gap sizes to exercise different EF bit patterns
456        let mut offsets = vec![0u64];
457        let gaps = [3, 7, 1, 15, 2, 100, 5, 31, 8, 63, 4, 127, 6, 255, 10, 50,
458                    1, 1, 1, 33, 200, 9, 17, 3, 11, 500, 2, 7, 13, 41];
459        for &g in gaps.iter() {
460            offsets.push(offsets.last().unwrap() + g);
461        }
462        let ef = EliasFanoOffsets::from_vec(&offsets);
463
464        // Verify access
465        for (i, &v) in offsets.iter().enumerate() {
466            assert_eq!(ef.access(i), v, "access({i}) mismatch");
467        }
468
469        // Reference locate_with_end via brute force
470        let universe = *offsets.last().unwrap();
471        for pos in 0..=universe {
472            let expected = {
473                // Find string containing pos: offsets[id] <= pos < offsets[id+1]
474                let mut found = None;
475                for i in 0..offsets.len() - 1 {
476                    if offsets[i] <= pos && pos < offsets[i + 1] {
477                        found = Some((i as u64, offsets[i], offsets[i + 1]));
478                        break;
479                    }
480                }
481                found
482            };
483            let got = ef.locate_with_end(pos);
484            assert_eq!(got, expected, "locate_with_end({pos}) mismatch");
485        }
486
487        // Also test past-the-end
488        assert_eq!(ef.locate_with_end(universe), None);
489        assert_eq!(ef.locate_with_end(universe + 1), None);
490    }
491
492    /// Stress test with large gaps to exercise high-bit patterns
493    #[test]
494    fn test_ef_locate_large_universe() {
495        let offsets: Vec<u64> = vec![0, 1000, 5000, 5001, 5002, 10000, 100000, 100001, 500000];
496        let ef = EliasFanoOffsets::from_vec(&offsets);
497
498        // Test all boundary positions and a few interior ones
499        let test_positions: Vec<u64> = {
500            let mut v = Vec::new();
501            for &off in &offsets {
502                if off > 0 { v.push(off - 1); }
503                v.push(off);
504                v.push(off + 1);
505            }
506            // Some random interior positions
507            v.extend_from_slice(&[500, 3000, 5000, 7500, 50000, 200000, 400000]);
508            v.sort();
509            v.dedup();
510            v
511        };
512
513        for pos in test_positions {
514            let expected = {
515                let mut found = None;
516                for i in 0..offsets.len() - 1 {
517                    if offsets[i] <= pos && pos < offsets[i + 1] {
518                        found = Some((i as u64, offsets[i], offsets[i + 1]));
519                        break;
520                    }
521                }
522                found
523            };
524            let got = ef.locate_with_end(pos);
525            assert_eq!(got, expected, "locate_with_end({pos}) mismatch");
526        }
527    }
528}