cacheline_ef/lib.rs
1//! # Cacheline Elias-Fano
2//!
3//! [`CachelineEf`] is an integer encoding that packs chunks of 44 sorted 40-bit values into a single
4//! cacheline, using `64/44*8 = 11.6` bits per value.
5//! Each chunk can hold increasing values in a range of length `256*84=21504`.
6//! If this range is exceeded, [`CachelineEfVec::new`] will panic while [`CachelineEf::try_new`] will return `None`.
7//!
8//! [`CachelineEfVec`] stores a vector of [`CachelineEf`] and provides [`CachelineEfVec::index`] and [`CachelineEfVec::prefetch`].
9//!
10//! This encoding is efficient when consecutive values differ by roughly 100, where using
11//! Elias-Fano directly on the full list would use around `9` bits/value.
12//!
13//! The main benefit is that CachelineEf only requires reading a single cacheline per
14//! query, where Elias-Fano encoding usually needs 3 reads.
15//!
16//! Epserde is supported when the `epserde` feature flag is enabled.
17//!
18//! ## Layout
19//!
20//! The layout is described in detail in the PtrHash paper ([arXiv](https://arxiv.org/abs/2502.15539),
21//! [blog version](https://curiouscoding.nl/posts/ptrhash)).
22//!
23//! In summary:
24//! - First store a 4 byte offset, corresponding to the 32 high bits of the smallest value.
25//! - Then store for each of 44 values the 8 low bits.
26//! - Lastly we have 16 bytes (128 bits) to encode the high parts.
27//! For the i'th value `x[i]`, we set the bit at position `i+(x[i]/256 - x[0]/256)` to `1`.
28
29use common_traits::SelectInWord;
30use std::cmp::min;
31
/// Number of stored values per unit, i.e. per 64-byte cacheline.
const L: usize = 44;
34
/// A vector of [`CachelineEf`].
#[derive(Default, Clone, mem_dbg::MemSize, mem_dbg::MemDbg)]
#[cfg_attr(feature = "epserde", derive(epserde::prelude::Epserde))]
pub struct CachelineEfVec<E = Vec<CachelineEf>> {
    /// The cachelines; any `E` accessed as `AsRef<[CachelineEf]>` works
    /// (e.g. an owned `Vec` or an epserde-deserialized view).
    ef: E,
    /// Total number of values stored across all cachelines.
    len: usize,
}
42
impl CachelineEfVec<Vec<CachelineEf>> {
    /// Construct a new `CachelineEfVec` for a sorted list of `u64` values.
    ///
    /// Returns `None` when there is a cacheline (chunk of 44 consecutive
    /// values) spanning a range larger than `256*84 = 21504`.
    ///
    /// Panics when:
    /// - the input is not sorted,
    /// - an input value does not fit in 40 bits.
    pub fn try_new(vals: &[u64]) -> Option<Self> {
        // Encode each chunk of up to `L` consecutive values into one cacheline.
        let mut p = Vec::with_capacity(vals.len().div_ceil(L));
        for i in (0..vals.len()).step_by(L) {
            p.push(CachelineEf::try_new(&vals[i..min(i + L, vals.len())])?);
        }

        Some(Self {
            ef: p,
            len: vals.len(),
        })
    }

    /// Construct a new `CachelineEfVec` for a sorted list of `u64` values.
    ///
    /// Panics when:
    /// - the input is not sorted,
    /// - an input value does not fit in 40 bits,
    /// - there is a cacheline where the values span a too large range.
    pub fn new(vals: &[u64]) -> Self {
        Self::try_new(vals).expect("Values are too sparse!")
    }
}
72
73impl<E: AsRef<[CachelineEf]>> CachelineEfVec<E> {
74 /// Get the value at the given index in the vector.
75 pub fn index(&self, index: usize) -> u64 {
76 assert!(
77 index < self.len,
78 "Index {index} out of bounds. Length is {}.",
79 self.len
80 );
81 // Note: This division is inlined by the compiler.
82 unsafe { self.ef.as_ref().get_unchecked(index / L).index(index % L) }
83 }
84 /// The number of values stored.
85 pub fn len(&self) -> usize {
86 self.len
87 }
88 /// Get the value at the given index in the vector, and do not check bounds.
89 pub unsafe fn index_unchecked(&self, index: usize) -> u64 {
90 // Note: This division is inlined by the compiler.
91 (*self.ef.as_ref().get_unchecked(index / L)).index(index % L)
92 }
93 /// Prefetch the cacheline containing the given element.
94 pub fn prefetch(&self, index: usize) {
95 prefetch_index(self.ef.as_ref(), index / L);
96 }
97 /// The size of the underlying vector, in bytes.
98 pub fn size_in_bytes(&self) -> usize {
99 std::mem::size_of_val(self.ef.as_ref())
100 }
101}
102
/// A single cacheline that holds 44 Elias-Fano encoded 40-bit values in a range of size `256*84=21504`.
// This has size 64 bytes (one cacheline) and is aligned to 64 bytes as well to
// ensure it actually occupies a single cacheline.
// It is marked `zero_copy` to be able to use it with lazy deserialization of ep-serde.
// NOTE(review): `copy_type` appears to be the mem_dbg helper attribute for `Copy`
// types — confirm against the mem_dbg derive docs.
#[derive(Clone, Copy, mem_dbg::MemSize, mem_dbg::MemDbg)]
#[repr(C)]
#[repr(align(64))]
#[cfg_attr(feature = "epserde", derive(epserde::prelude::Epserde))]
#[cfg_attr(feature = "epserde", zero_copy)]
#[copy_type]
pub struct CachelineEf {
    /// 2*64 = 128 bits to indicate where 256 boundaries are crossed.
    /// There are 44 1-bits corresponding to the stored numbers, and the number
    /// of 0-bits before each 1-bit indicates the number of times 256 must be
    /// added to the offset to recover that number's high part.
    high_boundaries: [u64; 2],
    /// The offset of the first element, divided by 256 (its 32 high bits).
    reduced_offset: u32,
    /// Last 8 bits of each number.
    low_bits: [u8; L],
}
123
124impl CachelineEf {
125 fn try_new(vals: &[u64]) -> Option<Self> {
126 assert!(!vals.is_empty(), "List of values must not be empty.");
127 assert!(
128 vals.len() <= L,
129 "Number of values must be at most {L}, but is {}",
130 vals.len()
131 );
132 let l = vals.len();
133 if vals[l - 1] - vals[0] > 256 * (128 - L as u64) {
134 return None;
135 }
136 // assert!(
137 // vals[l - 1] - vals[0] <= 256 * (128 - L as u64),
138 // "Range of values {} ({} to {}) is too large! Can be at most {}.",
139 // vals[l - 1] - vals[0],
140 // vals[0],
141 // vals[l - 1],
142 // 256 * (128 - L as u64)
143 // );
144 assert!(
145 vals[l - 1] < (1u64 << 40),
146 "Last value {} is too large! Must be less than 2^40={}",
147 vals[l - 1],
148 1u64 << 40
149 );
150
151 let offset = vals[0] >> 8;
152 assert!(
153 offset <= u32::MAX as u64,
154 "vals[0] does not fit in 40 bits."
155 );
156 let mut low_bits = [0u8; L];
157 for (i, &v) in vals.iter().enumerate() {
158 low_bits[i] = (v & 0xff) as u8;
159 }
160 let mut high_boundaries = [0u64; 2];
161 let mut last = 0;
162 for (i, &v) in vals.iter().enumerate() {
163 assert!(i >= last, "Values are not sorted! {last} > {i}");
164 last = i;
165 let idx = i + ((v >> 8) - offset) as usize;
166 assert!(idx < 128, "Value {} is too large!", v - offset);
167 high_boundaries[idx / 64] |= 1 << (idx % 64);
168 }
169 Some(Self {
170 reduced_offset: offset as u32,
171 high_boundaries,
172 low_bits,
173 })
174 }
175
176 /// Get the value a the given index.
177 ///
178 /// Panics when `idx` is out of bounds.
179 pub fn index(&self, idx: usize) -> u64 {
180 let p = self.high_boundaries[0].count_ones() as usize;
181 let one_pos = if idx < p {
182 self.high_boundaries[0].select_in_word(idx)
183 } else {
184 64 + self.high_boundaries[1].select_in_word(idx - p)
185 };
186
187 256 * self.reduced_offset as u64 + 256 * (one_pos - idx) as u64 + self.low_bits[idx] as u64
188 }
189}
190
/// Prefetch the cacheline containing `s[index]` into L1 cache.
///
/// On architectures without a stable prefetch intrinsic this is a no-op.
fn prefetch_index<T>(s: &[T], index: usize) {
    // SAFETY: callers pass an index within (or one past) the slice, so the
    // computed address stays inside the same allocation. The pointer is only
    // used as a prefetch hint and never dereferenced.
    let ptr = unsafe { s.as_ptr().add(index) } as *const i8;
    #[cfg(target_arch = "x86_64")]
    unsafe {
        std::arch::x86_64::_mm_prefetch(ptr, std::arch::x86_64::_MM_HINT_T0);
    }
    #[cfg(target_arch = "x86")]
    unsafe {
        std::arch::x86::_mm_prefetch(ptr, std::arch::x86::_MM_HINT_T0);
    }
    #[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
    {
        // TODO: `std::arch::aarch64::_prefetch` is unstable; enable it behind
        // a feature flag once usable. Consume `ptr` to avoid an unused warning.
        let _ = ptr;
    }
}
212
#[test]
fn test() {
    // Largest range (exclusive) a single cacheline can represent.
    let max = (128 - L) * 256;
    // Random 40-bit starting offset.
    let offset = rand::random::<u64>() % (1 << 40);
    let mut vals = [0u64; L];
    for _ in 0..1000000 {
        // Fill with random values in [offset, offset + max), then sort.
        vals.fill_with(|| offset + rand::random::<u64>() % max as u64);
        vals.sort_unstable();

        // Round-trip: every stored value must decode exactly.
        let lef = CachelineEf::try_new(&vals).unwrap();
        for (i, &v) in vals.iter().enumerate() {
            assert_eq!(lef.index(i), v, "error; full list: {:?}", vals);
        }
    }
}
230
#[test]
fn size() {
    // A `CachelineEf` must occupy exactly one 64-byte cacheline.
    let bytes = std::mem::size_of::<CachelineEf>();
    assert_eq!(bytes, 64);
}
234}