packed_seq/
packed_ef_n_seq.rs

1use mem_dbg::{MemDbg, MemSize};
2use sux::dict::{EliasFano, EliasFanoBuilder};
3
4use crate::{BitSeqVec, PackedNSeqVec, PackedSeqVec, SeqVec};
5
6/// Like the 2+1-bit encoded [`PackedNSeqVec`], but using Elias-Fano to encode the positions of the N characters.
7/// Note that the fixed-cost overhead of [`sux::dict::EliasFano`] is quite large at ~5 `usize` values, or 40 bytes (=160bp).
8#[derive(Clone, Debug, MemSize, MemDbg)]
9#[cfg_attr(feature = "pyo3", pyo3::pyclass)]
10#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
11pub struct PackedEfNSeqVec {
12    seq: PackedSeqVec,
13    ef_ambiguous: Option<EliasFano>,
14}
15
16impl PackedEfNSeqVec {
17    pub fn from_packed_n_seq_vec(n_seq: PackedNSeqVec) -> Self {
18        let mut cnt = 0;
19        let (chunks, tail) = n_seq.ambiguous.seq.as_chunks();
20        for chunk in chunks {
21            cnt += u64::from_ne_bytes(*chunk).count_ones();
22        }
23        for byte in tail {
24            cnt += byte.count_ones();
25        }
26
27        if cnt == 0 {
28            return Self {
29                seq: n_seq.seq,
30                ef_ambiguous: None,
31            };
32        }
33
34        let mut ef_builder = EliasFanoBuilder::new(cnt as usize, n_seq.seq.seq.len() as usize);
35        let mut pos = 0;
36        for mut byte in n_seq.ambiguous.seq {
37            while byte > 0 {
38                let i = byte.count_zeros();
39                ef_builder.push(pos + i as usize);
40                byte ^= 1 << i;
41            }
42            pos += 8;
43        }
44        Self {
45            seq: n_seq.seq,
46            ef_ambiguous: Some(ef_builder.build()),
47        }
48    }
49    pub fn into_packed_n_seq_vec(self) -> PackedNSeqVec {
50        let mut ambiguous = BitSeqVec::with_len(self.seq.len());
51        if let Some(ef_ambiguous) = self.ef_ambiguous {
52            for pos in ef_ambiguous.iter() {
53                ambiguous.seq[pos / 8] |= 1 << (pos % 8);
54            }
55        }
56        PackedNSeqVec {
57            seq: self.seq,
58            ambiguous,
59        }
60    }
61}
62
63#[test]
64fn ef_duplicate_values() {
65    let vals = vec![1, 2, 3, 4, 8, 10, 10, 123];
66    let mut builder = EliasFanoBuilder::new(vals.len(), *vals.last().unwrap());
67    for x in vals {
68        builder.push(x);
69    }
70    builder.build();
71}