hermes_core/structures/postings/
posting_common.rs1use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
9use std::io::{self, Read, Write};
10
11use crate::DocId;
12
13pub const BLOCK_SIZE: usize = 128;
15
16#[inline]
21pub fn write_vint<W: Write>(writer: &mut W, mut value: u64) -> io::Result<()> {
22 loop {
23 let byte = (value & 0x7F) as u8;
24 value >>= 7;
25 if value == 0 {
26 writer.write_u8(byte)?;
27 return Ok(());
28 } else {
29 writer.write_u8(byte | 0x80)?;
30 }
31 }
32}
33
34#[inline]
36pub fn read_vint<R: Read>(reader: &mut R) -> io::Result<u64> {
37 let mut result = 0u64;
38 let mut shift = 0;
39
40 loop {
41 let byte = reader.read_u8()?;
42 result |= ((byte & 0x7F) as u64) << shift;
43 if byte & 0x80 == 0 {
44 return Ok(result);
45 }
46 shift += 7;
47 if shift >= 64 {
48 return Err(io::Error::new(
49 io::ErrorKind::InvalidData,
50 "varint too long",
51 ));
52 }
53 }
54}
55
56#[derive(Debug, Clone, Copy, PartialEq, Eq)]
60pub struct SkipEntry {
61 pub first_doc: DocId,
63 pub last_doc: DocId,
65 pub offset: u32,
67}
68
69impl SkipEntry {
70 pub fn new(first_doc: DocId, last_doc: DocId, offset: u32) -> Self {
71 Self {
72 first_doc,
73 last_doc,
74 offset,
75 }
76 }
77
78 pub fn write<W: Write>(&self, writer: &mut W) -> io::Result<()> {
80 writer.write_u32::<LittleEndian>(self.first_doc)?;
81 writer.write_u32::<LittleEndian>(self.last_doc)?;
82 writer.write_u32::<LittleEndian>(self.offset)?;
83 Ok(())
84 }
85
86 pub fn read<R: Read>(reader: &mut R) -> io::Result<Self> {
88 let first_doc = reader.read_u32::<LittleEndian>()?;
89 let last_doc = reader.read_u32::<LittleEndian>()?;
90 let offset = reader.read_u32::<LittleEndian>()?;
91 Ok(Self {
92 first_doc,
93 last_doc,
94 offset,
95 })
96 }
97}
98
99#[derive(Debug, Clone, Default)]
101pub struct SkipList {
102 entries: Vec<SkipEntry>,
103}
104
105impl SkipList {
106 pub fn new() -> Self {
107 Self::default()
108 }
109
110 pub fn with_capacity(capacity: usize) -> Self {
111 Self {
112 entries: Vec::with_capacity(capacity),
113 }
114 }
115
116 pub fn push(&mut self, first_doc: DocId, last_doc: DocId, offset: u32) {
118 self.entries
119 .push(SkipEntry::new(first_doc, last_doc, offset));
120 }
121
122 pub fn len(&self) -> usize {
124 self.entries.len()
125 }
126
127 pub fn is_empty(&self) -> bool {
128 self.entries.is_empty()
129 }
130
131 pub fn get(&self, index: usize) -> Option<&SkipEntry> {
133 self.entries.get(index)
134 }
135
136 pub fn find_block(&self, target: DocId) -> Option<usize> {
140 self.entries.iter().position(|e| e.last_doc >= target)
141 }
142
143 pub fn iter(&self) -> impl Iterator<Item = &SkipEntry> {
145 self.entries.iter()
146 }
147
148 pub fn write<W: Write>(&self, writer: &mut W) -> io::Result<()> {
150 writer.write_u32::<LittleEndian>(self.entries.len() as u32)?;
151 for entry in &self.entries {
152 entry.write(writer)?;
153 }
154 Ok(())
155 }
156
157 pub fn read<R: Read>(reader: &mut R) -> io::Result<Self> {
159 let count = reader.read_u32::<LittleEndian>()? as usize;
160 let mut entries = Vec::with_capacity(count);
161 for _ in 0..count {
162 entries.push(SkipEntry::read(reader)?);
163 }
164 Ok(Self { entries })
165 }
166
167 pub fn from_tuples(tuples: &[(DocId, DocId, u32)]) -> Self {
169 Self {
170 entries: tuples
171 .iter()
172 .map(|(first, last, offset)| SkipEntry::new(*first, *last, *offset))
173 .collect(),
174 }
175 }
176
177 pub fn to_tuples(&self) -> Vec<(DocId, DocId, u32)> {
179 self.entries
180 .iter()
181 .map(|e| (e.first_doc, e.last_doc, e.offset))
182 .collect()
183 }
184}
185
186pub fn write_doc_id_block<W: Write>(writer: &mut W, doc_ids: &[DocId]) -> io::Result<DocId> {
191 if doc_ids.is_empty() {
192 return Ok(0);
193 }
194
195 write_vint(writer, doc_ids.len() as u64)?;
196
197 let mut prev = 0u32;
198 for (i, &doc_id) in doc_ids.iter().enumerate() {
199 if i == 0 {
200 write_vint(writer, doc_id as u64)?;
202 } else {
203 write_vint(writer, (doc_id - prev) as u64)?;
205 }
206 prev = doc_id;
207 }
208
209 Ok(*doc_ids.last().unwrap())
210}
211
212pub fn read_doc_id_block<R: Read>(reader: &mut R) -> io::Result<Vec<DocId>> {
216 let count = read_vint(reader)? as usize;
217 let mut doc_ids = Vec::with_capacity(count);
218
219 let mut prev = 0u32;
220 for i in 0..count {
221 let value = read_vint(reader)? as u32;
222 let doc_id = if i == 0 {
223 value } else {
225 prev + value };
227 doc_ids.push(doc_id);
228 prev = doc_id;
229 }
230
231 Ok(doc_ids)
232}
233
234use crate::structures::simd;
239
240#[derive(Debug, Clone, Copy, PartialEq, Eq)]
244#[repr(u8)]
245pub enum RoundedBitWidth {
246 Zero = 0,
248 Bits8 = 8,
250 Bits16 = 16,
252 Bits32 = 32,
254}
255
256impl RoundedBitWidth {
257 pub fn from_max_value(max_val: u32) -> Self {
259 if max_val == 0 {
260 RoundedBitWidth::Zero
261 } else if max_val <= 255 {
262 RoundedBitWidth::Bits8
263 } else if max_val <= 65535 {
264 RoundedBitWidth::Bits16
265 } else {
266 RoundedBitWidth::Bits32
267 }
268 }
269
270 pub fn bytes_per_value(&self) -> usize {
272 match self {
273 RoundedBitWidth::Zero => 0,
274 RoundedBitWidth::Bits8 => 1,
275 RoundedBitWidth::Bits16 => 2,
276 RoundedBitWidth::Bits32 => 4,
277 }
278 }
279
280 pub fn from_u8(v: u8) -> Option<Self> {
282 match v {
283 0 => Some(RoundedBitWidth::Zero),
284 8 => Some(RoundedBitWidth::Bits8),
285 16 => Some(RoundedBitWidth::Bits16),
286 32 => Some(RoundedBitWidth::Bits32),
287 _ => None,
288 }
289 }
290}
291
292pub fn pack_deltas_fixed(doc_ids: &[DocId]) -> (RoundedBitWidth, Vec<u8>) {
297 if doc_ids.len() <= 1 {
298 return (RoundedBitWidth::Zero, Vec::new());
299 }
300
301 let mut max_delta = 0u32;
303 let mut deltas = Vec::with_capacity(doc_ids.len() - 1);
304
305 for i in 1..doc_ids.len() {
306 let delta = doc_ids[i] - doc_ids[i - 1] - 1; deltas.push(delta);
308 max_delta = max_delta.max(delta);
309 }
310
311 let bit_width = RoundedBitWidth::from_max_value(max_delta);
312 let bytes_per_val = bit_width.bytes_per_value();
313
314 if bytes_per_val == 0 {
315 return (bit_width, Vec::new());
316 }
317
318 let mut packed = Vec::with_capacity(deltas.len() * bytes_per_val);
319
320 match bit_width {
321 RoundedBitWidth::Zero => {}
322 RoundedBitWidth::Bits8 => {
323 for delta in deltas {
324 packed.push(delta as u8);
325 }
326 }
327 RoundedBitWidth::Bits16 => {
328 for delta in deltas {
329 packed.extend_from_slice(&(delta as u16).to_le_bytes());
330 }
331 }
332 RoundedBitWidth::Bits32 => {
333 for delta in deltas {
334 packed.extend_from_slice(&delta.to_le_bytes());
335 }
336 }
337 }
338
339 (bit_width, packed)
340}
341
342pub fn unpack_deltas_fixed(
346 packed: &[u8],
347 bit_width: RoundedBitWidth,
348 first_doc_id: DocId,
349 count: usize,
350 output: &mut [DocId],
351) {
352 if count == 0 {
353 return;
354 }
355
356 output[0] = first_doc_id;
357
358 if count == 1 {
359 return;
360 }
361
362 match bit_width {
363 RoundedBitWidth::Zero => {
364 for (i, out) in output.iter_mut().enumerate().skip(1).take(count - 1) {
366 *out = first_doc_id + i as u32;
367 }
368 }
369 RoundedBitWidth::Bits8 => {
370 simd::unpack_8bit_delta_decode(packed, output, first_doc_id, count);
371 }
372 RoundedBitWidth::Bits16 => {
373 simd::unpack_16bit_delta_decode(packed, output, first_doc_id, count);
374 }
375 RoundedBitWidth::Bits32 => {
376 let mut carry = first_doc_id;
378 for i in 0..count - 1 {
379 let idx = i * 4;
380 let delta = u32::from_le_bytes([
381 packed[idx],
382 packed[idx + 1],
383 packed[idx + 2],
384 packed[idx + 3],
385 ]);
386 carry = carry.wrapping_add(delta).wrapping_add(1);
387 output[i + 1] = carry;
388 }
389 }
390 }
391}
392
393#[cfg(test)]
394mod tests {
395 use super::*;
396
397 #[test]
398 fn test_vint_roundtrip() {
399 let values = [
400 0u64,
401 1,
402 127,
403 128,
404 255,
405 256,
406 16383,
407 16384,
408 u32::MAX as u64,
409 u64::MAX,
410 ];
411
412 for &value in &values {
413 let mut buf = Vec::new();
414 write_vint(&mut buf, value).unwrap();
415 let read_value = read_vint(&mut buf.as_slice()).unwrap();
416 assert_eq!(value, read_value, "Failed for value {}", value);
417 }
418 }
419
420 #[test]
421 fn test_skip_list_roundtrip() {
422 let mut skip_list = SkipList::new();
423 skip_list.push(0, 127, 0);
424 skip_list.push(128, 255, 100);
425 skip_list.push(256, 500, 200);
426
427 let mut buf = Vec::new();
428 skip_list.write(&mut buf).unwrap();
429
430 let restored = SkipList::read(&mut buf.as_slice()).unwrap();
431 assert_eq!(skip_list.len(), restored.len());
432
433 for (a, b) in skip_list.iter().zip(restored.iter()) {
434 assert_eq!(a, b);
435 }
436 }
437
438 #[test]
439 fn test_skip_list_find_block() {
440 let mut skip_list = SkipList::new();
441 skip_list.push(0, 99, 0);
442 skip_list.push(100, 199, 100);
443 skip_list.push(200, 299, 200);
444
445 assert_eq!(skip_list.find_block(0), Some(0));
446 assert_eq!(skip_list.find_block(50), Some(0));
447 assert_eq!(skip_list.find_block(99), Some(0));
448 assert_eq!(skip_list.find_block(100), Some(1));
449 assert_eq!(skip_list.find_block(150), Some(1));
450 assert_eq!(skip_list.find_block(250), Some(2));
451 assert_eq!(skip_list.find_block(300), None);
452 }
453
454 #[test]
455 fn test_doc_id_block_roundtrip() {
456 let doc_ids: Vec<DocId> = vec![0, 5, 10, 100, 1000, 10000];
457
458 let mut buf = Vec::new();
459 let last = write_doc_id_block(&mut buf, &doc_ids).unwrap();
460 assert_eq!(last, 10000);
461
462 let restored = read_doc_id_block(&mut buf.as_slice()).unwrap();
463 assert_eq!(doc_ids, restored);
464 }
465
466 #[test]
467 fn test_doc_id_block_single() {
468 let doc_ids: Vec<DocId> = vec![42];
469
470 let mut buf = Vec::new();
471 write_doc_id_block(&mut buf, &doc_ids).unwrap();
472
473 let restored = read_doc_id_block(&mut buf.as_slice()).unwrap();
474 assert_eq!(doc_ids, restored);
475 }
476}