hermes_core/structures/postings/
posting_common.rs1use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
9use std::io::{self, Read, Write};
10
11use crate::DocId;
12
13pub const BLOCK_SIZE: usize = 128;
15
16#[inline]
21pub fn write_vint<W: Write>(writer: &mut W, mut value: u64) -> io::Result<()> {
22 loop {
23 let byte = (value & 0x7F) as u8;
24 value >>= 7;
25 if value == 0 {
26 writer.write_u8(byte)?;
27 return Ok(());
28 } else {
29 writer.write_u8(byte | 0x80)?;
30 }
31 }
32}
33
34#[inline]
36pub fn read_vint<R: Read>(reader: &mut R) -> io::Result<u64> {
37 let mut result = 0u64;
38 let mut shift = 0;
39
40 loop {
41 let byte = reader.read_u8()?;
42 result |= ((byte & 0x7F) as u64) << shift;
43 if byte & 0x80 == 0 {
44 return Ok(result);
45 }
46 shift += 7;
47 if shift >= 64 {
48 return Err(io::Error::new(
49 io::ErrorKind::InvalidData,
50 "varint too long",
51 ));
52 }
53 }
54}
55
56#[derive(Debug, Clone, Copy, PartialEq, Eq)]
60pub struct SkipEntry {
61 pub first_doc: DocId,
63 pub last_doc: DocId,
65 pub offset: u32,
67}
68
69impl SkipEntry {
70 pub fn new(first_doc: DocId, last_doc: DocId, offset: u32) -> Self {
71 Self {
72 first_doc,
73 last_doc,
74 offset,
75 }
76 }
77
78 pub fn write<W: Write>(&self, writer: &mut W) -> io::Result<()> {
80 writer.write_u32::<LittleEndian>(self.first_doc)?;
81 writer.write_u32::<LittleEndian>(self.last_doc)?;
82 writer.write_u32::<LittleEndian>(self.offset)?;
83 Ok(())
84 }
85
86 pub fn read<R: Read>(reader: &mut R) -> io::Result<Self> {
88 let first_doc = reader.read_u32::<LittleEndian>()?;
89 let last_doc = reader.read_u32::<LittleEndian>()?;
90 let offset = reader.read_u32::<LittleEndian>()?;
91 Ok(Self {
92 first_doc,
93 last_doc,
94 offset,
95 })
96 }
97}
98
99#[derive(Debug, Clone, Default)]
101pub struct SkipList {
102 entries: Vec<SkipEntry>,
103}
104
105impl SkipList {
106 pub fn new() -> Self {
107 Self::default()
108 }
109
110 pub fn with_capacity(capacity: usize) -> Self {
111 Self {
112 entries: Vec::with_capacity(capacity),
113 }
114 }
115
116 pub fn push(&mut self, first_doc: DocId, last_doc: DocId, offset: u32) {
118 self.entries
119 .push(SkipEntry::new(first_doc, last_doc, offset));
120 }
121
122 pub fn len(&self) -> usize {
124 self.entries.len()
125 }
126
127 pub fn is_empty(&self) -> bool {
128 self.entries.is_empty()
129 }
130
131 pub fn get(&self, index: usize) -> Option<&SkipEntry> {
133 self.entries.get(index)
134 }
135
136 pub fn find_block(&self, target: DocId) -> Option<usize> {
141 let idx = self.entries.partition_point(|e| e.last_doc < target);
142 if idx < self.entries.len() {
143 Some(idx)
144 } else {
145 None
146 }
147 }
148
149 pub fn iter(&self) -> impl Iterator<Item = &SkipEntry> {
151 self.entries.iter()
152 }
153
154 pub fn write<W: Write>(&self, writer: &mut W) -> io::Result<()> {
156 writer.write_u32::<LittleEndian>(self.entries.len() as u32)?;
157 for entry in &self.entries {
158 entry.write(writer)?;
159 }
160 Ok(())
161 }
162
163 pub fn read<R: Read>(reader: &mut R) -> io::Result<Self> {
165 let count = reader.read_u32::<LittleEndian>()? as usize;
166 let mut entries = Vec::with_capacity(count);
167 for _ in 0..count {
168 entries.push(SkipEntry::read(reader)?);
169 }
170 Ok(Self { entries })
171 }
172
173 pub fn from_tuples(tuples: &[(DocId, DocId, u32)]) -> Self {
175 Self {
176 entries: tuples
177 .iter()
178 .map(|(first, last, offset)| SkipEntry::new(*first, *last, *offset))
179 .collect(),
180 }
181 }
182
183 pub fn to_tuples(&self) -> Vec<(DocId, DocId, u32)> {
185 self.entries
186 .iter()
187 .map(|e| (e.first_doc, e.last_doc, e.offset))
188 .collect()
189 }
190}
191
192pub fn write_doc_id_block<W: Write>(writer: &mut W, doc_ids: &[DocId]) -> io::Result<DocId> {
197 if doc_ids.is_empty() {
198 return Ok(0);
199 }
200
201 write_vint(writer, doc_ids.len() as u64)?;
202
203 let mut prev = 0u32;
204 for (i, &doc_id) in doc_ids.iter().enumerate() {
205 if i == 0 {
206 write_vint(writer, doc_id as u64)?;
208 } else {
209 write_vint(writer, (doc_id - prev) as u64)?;
211 }
212 prev = doc_id;
213 }
214
215 Ok(*doc_ids.last().unwrap())
216}
217
218pub fn read_doc_id_block<R: Read>(reader: &mut R) -> io::Result<Vec<DocId>> {
222 let count = read_vint(reader)? as usize;
223 let mut doc_ids = Vec::with_capacity(count);
224
225 let mut prev = 0u32;
226 for i in 0..count {
227 let value = read_vint(reader)? as u32;
228 let doc_id = if i == 0 {
229 value } else {
231 prev + value };
233 doc_ids.push(doc_id);
234 prev = doc_id;
235 }
236
237 Ok(doc_ids)
238}
239
240use crate::structures::simd;
245
246#[derive(Debug, Clone, Copy, PartialEq, Eq)]
250#[repr(u8)]
251pub enum RoundedBitWidth {
252 Zero = 0,
254 Bits8 = 8,
256 Bits16 = 16,
258 Bits32 = 32,
260}
261
262impl RoundedBitWidth {
263 pub fn from_max_value(max_val: u32) -> Self {
265 if max_val == 0 {
266 RoundedBitWidth::Zero
267 } else if max_val <= 255 {
268 RoundedBitWidth::Bits8
269 } else if max_val <= 65535 {
270 RoundedBitWidth::Bits16
271 } else {
272 RoundedBitWidth::Bits32
273 }
274 }
275
276 pub fn bytes_per_value(&self) -> usize {
278 match self {
279 RoundedBitWidth::Zero => 0,
280 RoundedBitWidth::Bits8 => 1,
281 RoundedBitWidth::Bits16 => 2,
282 RoundedBitWidth::Bits32 => 4,
283 }
284 }
285
286 pub fn from_u8(v: u8) -> Option<Self> {
288 match v {
289 0 => Some(RoundedBitWidth::Zero),
290 8 => Some(RoundedBitWidth::Bits8),
291 16 => Some(RoundedBitWidth::Bits16),
292 32 => Some(RoundedBitWidth::Bits32),
293 _ => None,
294 }
295 }
296}
297
298pub fn pack_deltas_fixed(doc_ids: &[DocId]) -> (RoundedBitWidth, Vec<u8>) {
303 if doc_ids.len() <= 1 {
304 return (RoundedBitWidth::Zero, Vec::new());
305 }
306
307 let mut max_delta = 0u32;
309 let mut deltas = Vec::with_capacity(doc_ids.len() - 1);
310
311 for i in 1..doc_ids.len() {
312 let delta = doc_ids[i] - doc_ids[i - 1] - 1; deltas.push(delta);
314 max_delta = max_delta.max(delta);
315 }
316
317 let bit_width = RoundedBitWidth::from_max_value(max_delta);
318 let bytes_per_val = bit_width.bytes_per_value();
319
320 if bytes_per_val == 0 {
321 return (bit_width, Vec::new());
322 }
323
324 let mut packed = Vec::with_capacity(deltas.len() * bytes_per_val);
325
326 match bit_width {
327 RoundedBitWidth::Zero => {}
328 RoundedBitWidth::Bits8 => {
329 for delta in deltas {
330 packed.push(delta as u8);
331 }
332 }
333 RoundedBitWidth::Bits16 => {
334 for delta in deltas {
335 packed.extend_from_slice(&(delta as u16).to_le_bytes());
336 }
337 }
338 RoundedBitWidth::Bits32 => {
339 for delta in deltas {
340 packed.extend_from_slice(&delta.to_le_bytes());
341 }
342 }
343 }
344
345 (bit_width, packed)
346}
347
348pub fn unpack_deltas_fixed(
352 packed: &[u8],
353 bit_width: RoundedBitWidth,
354 first_doc_id: DocId,
355 count: usize,
356 output: &mut [DocId],
357) {
358 if count == 0 {
359 return;
360 }
361
362 output[0] = first_doc_id;
363
364 if count == 1 {
365 return;
366 }
367
368 match bit_width {
369 RoundedBitWidth::Zero => {
370 for (i, out) in output.iter_mut().enumerate().skip(1).take(count - 1) {
372 *out = first_doc_id + i as u32;
373 }
374 }
375 RoundedBitWidth::Bits8 => {
376 simd::unpack_8bit_delta_decode(packed, output, first_doc_id, count);
377 }
378 RoundedBitWidth::Bits16 => {
379 simd::unpack_16bit_delta_decode(packed, output, first_doc_id, count);
380 }
381 RoundedBitWidth::Bits32 => {
382 let mut carry = first_doc_id;
384 for i in 0..count - 1 {
385 let idx = i * 4;
386 let delta = u32::from_le_bytes([
387 packed[idx],
388 packed[idx + 1],
389 packed[idx + 2],
390 packed[idx + 3],
391 ]);
392 carry = carry.wrapping_add(delta).wrapping_add(1);
393 output[i + 1] = carry;
394 }
395 }
396 }
397}
398
399#[cfg(test)]
400mod tests {
401 use super::*;
402
403 #[test]
404 fn test_vint_roundtrip() {
405 let values = [
406 0u64,
407 1,
408 127,
409 128,
410 255,
411 256,
412 16383,
413 16384,
414 u32::MAX as u64,
415 u64::MAX,
416 ];
417
418 for &value in &values {
419 let mut buf = Vec::new();
420 write_vint(&mut buf, value).unwrap();
421 let read_value = read_vint(&mut buf.as_slice()).unwrap();
422 assert_eq!(value, read_value, "Failed for value {}", value);
423 }
424 }
425
426 #[test]
427 fn test_skip_list_roundtrip() {
428 let mut skip_list = SkipList::new();
429 skip_list.push(0, 127, 0);
430 skip_list.push(128, 255, 100);
431 skip_list.push(256, 500, 200);
432
433 let mut buf = Vec::new();
434 skip_list.write(&mut buf).unwrap();
435
436 let restored = SkipList::read(&mut buf.as_slice()).unwrap();
437 assert_eq!(skip_list.len(), restored.len());
438
439 for (a, b) in skip_list.iter().zip(restored.iter()) {
440 assert_eq!(a, b);
441 }
442 }
443
444 #[test]
445 fn test_skip_list_find_block() {
446 let mut skip_list = SkipList::new();
447 skip_list.push(0, 99, 0);
448 skip_list.push(100, 199, 100);
449 skip_list.push(200, 299, 200);
450
451 assert_eq!(skip_list.find_block(0), Some(0));
452 assert_eq!(skip_list.find_block(50), Some(0));
453 assert_eq!(skip_list.find_block(99), Some(0));
454 assert_eq!(skip_list.find_block(100), Some(1));
455 assert_eq!(skip_list.find_block(150), Some(1));
456 assert_eq!(skip_list.find_block(250), Some(2));
457 assert_eq!(skip_list.find_block(300), None);
458 }
459
460 #[test]
461 fn test_doc_id_block_roundtrip() {
462 let doc_ids: Vec<DocId> = vec![0, 5, 10, 100, 1000, 10000];
463
464 let mut buf = Vec::new();
465 let last = write_doc_id_block(&mut buf, &doc_ids).unwrap();
466 assert_eq!(last, 10000);
467
468 let restored = read_doc_id_block(&mut buf.as_slice()).unwrap();
469 assert_eq!(doc_ids, restored);
470 }
471
472 #[test]
473 fn test_doc_id_block_single() {
474 let doc_ids: Vec<DocId> = vec![42];
475
476 let mut buf = Vec::new();
477 write_doc_id_block(&mut buf, &doc_ids).unwrap();
478
479 let restored = read_doc_id_block(&mut buf.as_slice()).unwrap();
480 assert_eq!(doc_ids, restored);
481 }
482}