cacheline_ef/
lib.rs

1//! # Cacheline Elias-Fano
2//!
3//! [`CachelineEf`] is an integer encoding that packs chunks of 44 sorted 40-bit values into a single
4//! cacheline, using `64/44*8 = 11.6` bits per value.
5//! Each chunk can hold increasing values in a range of length `256*84=21504`.
6//!
7//! [`CachelineEfVec`] stores a vector of [`CachelineEf`] and provides [`CachelineEfVec::index`] and [`CachelineEfVec::prefetch`].
8//!
9//! This encoding is efficient when consecutive values differ by roughly 100, where using
10//! Elias-Fano directly on the full list would use around `9` bits/value.
11//!
12//! The main benefit is that CachelineEf only requires reading a single cacheline per
13//! query, where Elias-Fano encoding usually needs 3 reads.
14//!
15//! Epserde is supported when the `epserde` feature flag is enabled.
16//!
17//! ## Layout
18//!
19//! The layout is described in detail in the PtrHash paper ([arXiv](https://arxiv.org/abs/2502.15539),
20//! [blog version](https://curiouscoding.nl/posts/ptrhash)).
21//!
22//! In summary:
23//! - First store a 4 byte offset, corresponding to the 32 high bits of the smallest value.
24//! - Then store for each of 44 values the 8 low bits.
25//! - Lastly we have 16 bytes (128 bits) to encode the high parts.
26//!   For the i'th value `x[i]`, we set the bit at position `i+(x[i]/256 - x[0]/256)` to `1`.
27
28use common_traits::SelectInWord;
29use std::cmp::min;
30
31/// Number of stored values per unit.
32const L: usize = 44;
33
34/// A vector of [`CachelineEf`].
35#[derive(Default, Clone, mem_dbg::MemSize, mem_dbg::MemDbg)]
36#[cfg_attr(feature = "epserde", derive(epserde::prelude::Epserde))]
37pub struct CachelineEfVec<E = Vec<CachelineEf>> {
38    ef: E,
39    len: usize,
40}
41
42impl CachelineEfVec<Vec<CachelineEf>> {
43    /// Construct a new `CachelineEfVec` for a list of `u64` values.
44    ///
45    /// Panics when:
46    /// - the input is not sorted,
47    /// - the input values are over 2^40,
48    /// - there is a cacheline where the values span a too large range.
49    pub fn new(vals: &[u64]) -> Self {
50        let mut p = Vec::with_capacity(vals.len().div_ceil(L));
51        for i in (0..vals.len()).step_by(L) {
52            p.push(CachelineEf::new(&vals[i..min(i + L, vals.len())]));
53        }
54
55        Self {
56            ef: p,
57            len: vals.len(),
58        }
59    }
60}
61
62impl<E: AsRef<[CachelineEf]>> CachelineEfVec<E> {
63    /// Get the value at the given index in the vector.
64    pub fn index(&self, index: usize) -> u64 {
65        assert!(
66            index < self.len,
67            "Index {index} out of bounds. Length is {}.",
68            self.len
69        );
70        // Note: This division is inlined by the compiler.
71        unsafe { self.ef.as_ref().get_unchecked(index / L).index(index % L) }
72    }
73    /// The number of values stored.
74    pub fn len(&self) -> usize {
75        self.len
76    }
77    /// Get the value at the given index in the vector, and do not check bounds.
78    pub unsafe fn index_unchecked(&self, index: usize) -> u64 {
79        // Note: This division is inlined by the compiler.
80        (*self.ef.as_ref().get_unchecked(index / L)).index(index % L)
81    }
82    /// Prefetch the cacheline containing the given element.
83    pub fn prefetch(&self, index: usize) {
84        prefetch_index(self.ef.as_ref(), index / L);
85    }
86    /// The size of the underlying vector, in bytes.
87    pub fn size_in_bytes(&self) -> usize {
88        std::mem::size_of_val(self.ef.as_ref())
89    }
90}
91
92/// A single cacheline that holds 44 Elias-Fano encoded 40-bit values in a range of size `256*84=21504`.
93// This has size 64 bytes (one cacheline) and is aligned to 64bytes as well to
94// ensure it actually occupied a single cacheline.
95// It is marked `zero_copy` to be able to use it with lazy deserialization of ep-serde.
96#[derive(Clone, Copy, mem_dbg::MemSize, mem_dbg::MemDbg)]
97#[repr(C)]
98#[repr(align(64))]
99#[cfg_attr(feature = "epserde", derive(epserde::prelude::Epserde))]
100#[cfg_attr(feature = "epserde", zero_copy)]
101#[copy_type]
102pub struct CachelineEf {
103    // 2*64 = 128 bits to indicate where 256 boundaries are crossed.
104    // There are 44 1-bits corresponding to the stored numbers, and the number
105    // of 0-bits before each number indicates the number of times 256 must be added.
106    high_boundaries: [u64; 2],
107    // The offset of the first element, divided by 256.
108    reduced_offset: u32,
109    // Last 8 bits of each number.
110    low_bits: [u8; L],
111}
112
113impl CachelineEf {
114    fn new(vals: &[u64]) -> Self {
115        assert!(!vals.is_empty(), "List of values must not be empty.");
116        assert!(
117            vals.len() <= L,
118            "Number of values must be at most {L}, but is {}",
119            vals.len()
120        );
121        let l = vals.len();
122        assert!(
123            vals[l - 1] - vals[0] <= 256 * (128 - L as u64),
124            "Range of values {} ({} to {}) is too large! Can be at most {}.",
125            vals[l - 1] - vals[0],
126            vals[0],
127            vals[l - 1],
128            256 * (128 - L as u64)
129        );
130        assert!(
131            vals[l - 1] < (1u64 << 40),
132            "Last value {} is too large! Must be less than 2^40={}",
133            vals[l - 1],
134            1u64 << 40
135        );
136
137        let offset = vals[0] >> 8;
138        assert!(
139            offset <= u32::MAX as u64,
140            "vals[0] does not fit in 40 bits."
141        );
142        let mut low_bits = [0u8; L];
143        for (i, &v) in vals.iter().enumerate() {
144            low_bits[i] = (v & 0xff) as u8;
145        }
146        let mut high_boundaries = [0u64; 2];
147        let mut last = 0;
148        for (i, &v) in vals.iter().enumerate() {
149            assert!(i >= last);
150            last = i;
151            let idx = i + ((v >> 8) - offset) as usize;
152            assert!(idx < 128, "Value {} is too large!", v - offset);
153            high_boundaries[idx / 64] |= 1 << (idx % 64);
154        }
155        Self {
156            reduced_offset: offset as u32,
157            high_boundaries,
158            low_bits,
159        }
160    }
161
162    /// Get the value a the given index.
163    ///
164    /// Panics when `idx` is out of bounds.
165    pub fn index(&self, idx: usize) -> u64 {
166        let p = self.high_boundaries[0].count_ones() as usize;
167        let one_pos = if idx < p {
168            self.high_boundaries[0].select_in_word(idx)
169        } else {
170            64 + self.high_boundaries[1].select_in_word(idx - p)
171        };
172
173        256 * self.reduced_offset as u64 + 256 * (one_pos - idx) as u64 + self.low_bits[idx] as u64
174    }
175}
176
177/// Prefetch the given cacheline into L1 cache.
178fn prefetch_index<T>(s: &[T], index: usize) {
179    let ptr = unsafe { s.as_ptr().add(index) as *const u64 };
180    #[cfg(target_arch = "x86_64")]
181    unsafe {
182        std::arch::x86_64::_mm_prefetch(ptr as *const i8, std::arch::x86_64::_MM_HINT_T0);
183    }
184    #[cfg(target_arch = "x86")]
185    unsafe {
186        std::arch::x86::_mm_prefetch(ptr as *const i8, std::arch::x86::_MM_HINT_T0);
187    }
188    #[cfg(target_arch = "aarch64")]
189    unsafe {
190        // TODO: Put this behind a feature flag.
191        // std::arch::aarch64::_prefetch(ptr as *const i8, std::arch::aarch64::_PREFETCH_LOCALITY3);
192    }
193    #[cfg(not(any(target_arch = "x86_64", target_arch = "x86", target_arch = "aarch64")))]
194    {
195        // Do nothing.
196    }
197}
198
199#[test]
200fn test() {
201    let max = (128 - L) * 256;
202    let offset = rand::random::<u64>() % (1 << 40);
203    let mut vals = [0u64; L];
204    for _ in 0..1000000 {
205        for v in &mut vals {
206            *v = offset + rand::random::<u64>() % max as u64;
207        }
208        vals.sort_unstable();
209
210        let lef = CachelineEf::new(&vals);
211        for i in 0..L {
212            assert_eq!(lef.index(i), vals[i], "error; full list: {:?}", vals);
213        }
214    }
215}