hermes_core/structures/postings/
posting.rs1use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
6use std::io::{self, Read, Write};
7
8use crate::DocId;
9
10#[derive(Debug, Clone, Copy, PartialEq, Eq)]
12pub struct Posting {
13 pub doc_id: DocId,
14 pub term_freq: u32,
15}
16
17#[derive(Debug, Clone, Default)]
19pub struct PostingList {
20 postings: Vec<Posting>,
21}
22
23impl PostingList {
24 pub fn new() -> Self {
25 Self::default()
26 }
27
28 pub fn with_capacity(capacity: usize) -> Self {
29 Self {
30 postings: Vec::with_capacity(capacity),
31 }
32 }
33
34 pub fn push(&mut self, doc_id: DocId, term_freq: u32) {
36 debug_assert!(
37 self.postings.is_empty() || self.postings.last().unwrap().doc_id < doc_id,
38 "Postings must be added in sorted order"
39 );
40 self.postings.push(Posting { doc_id, term_freq });
41 }
42
43 pub fn add(&mut self, doc_id: DocId, term_freq: u32) {
45 if let Some(last) = self.postings.last_mut()
46 && last.doc_id == doc_id
47 {
48 last.term_freq += term_freq;
49 return;
50 }
51 self.postings.push(Posting { doc_id, term_freq });
52 }
53
54 pub fn doc_count(&self) -> u32 {
56 self.postings.len() as u32
57 }
58
59 pub fn len(&self) -> usize {
60 self.postings.len()
61 }
62
63 pub fn is_empty(&self) -> bool {
64 self.postings.is_empty()
65 }
66
67 pub fn iter(&self) -> impl Iterator<Item = &Posting> {
68 self.postings.iter()
69 }
70
71 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
73 write_vint(writer, self.postings.len() as u64)?;
75
76 let mut prev_doc_id = 0u32;
77 for posting in &self.postings {
78 let delta = posting.doc_id - prev_doc_id;
80 write_vint(writer, delta as u64)?;
81 write_vint(writer, posting.term_freq as u64)?;
82 prev_doc_id = posting.doc_id;
83 }
84
85 Ok(())
86 }
87
88 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
90 let count = read_vint(reader)? as usize;
91 let mut postings = Vec::with_capacity(count);
92
93 let mut prev_doc_id = 0u32;
94 for _ in 0..count {
95 let delta = read_vint(reader)? as u32;
96 let term_freq = read_vint(reader)? as u32;
97 let doc_id = prev_doc_id + delta;
98 postings.push(Posting { doc_id, term_freq });
99 prev_doc_id = doc_id;
100 }
101
102 Ok(Self { postings })
103 }
104}
105
106pub struct PostingListIterator<'a> {
108 postings: &'a [Posting],
109 position: usize,
110}
111
112impl<'a> PostingListIterator<'a> {
113 pub fn new(posting_list: &'a PostingList) -> Self {
114 Self {
115 postings: &posting_list.postings,
116 position: 0,
117 }
118 }
119
120 pub fn doc(&self) -> DocId {
122 if self.position < self.postings.len() {
123 self.postings[self.position].doc_id
124 } else {
125 TERMINATED
126 }
127 }
128
129 pub fn term_freq(&self) -> u32 {
131 if self.position < self.postings.len() {
132 self.postings[self.position].term_freq
133 } else {
134 0
135 }
136 }
137
138 pub fn advance(&mut self) -> DocId {
140 self.position += 1;
141 self.doc()
142 }
143
144 pub fn seek(&mut self, target: DocId) -> DocId {
146 while self.position < self.postings.len() {
148 if self.postings[self.position].doc_id >= target {
149 return self.postings[self.position].doc_id;
150 }
151 self.position += 1;
152 }
153 TERMINATED
154 }
155
156 pub fn size_hint(&self) -> usize {
158 self.postings.len().saturating_sub(self.position)
159 }
160}
161
162pub const TERMINATED: DocId = DocId::MAX;
164
165fn write_vint<W: Write>(writer: &mut W, mut value: u64) -> io::Result<()> {
167 loop {
168 let byte = (value & 0x7F) as u8;
169 value >>= 7;
170 if value == 0 {
171 writer.write_u8(byte)?;
172 return Ok(());
173 } else {
174 writer.write_u8(byte | 0x80)?;
175 }
176 }
177}
178
179fn read_vint<R: Read>(reader: &mut R) -> io::Result<u64> {
181 let mut result = 0u64;
182 let mut shift = 0;
183
184 loop {
185 let byte = reader.read_u8()?;
186 result |= ((byte & 0x7F) as u64) << shift;
187 if byte & 0x80 == 0 {
188 return Ok(result);
189 }
190 shift += 7;
191 if shift >= 64 {
192 return Err(io::Error::new(
193 io::ErrorKind::InvalidData,
194 "varint too long",
195 ));
196 }
197 }
198}
199
200#[allow(dead_code)]
202#[derive(Debug, Clone)]
203pub struct CompactPostingList {
204 data: Vec<u8>,
205 doc_count: u32,
206}
207
208#[allow(dead_code)]
209impl CompactPostingList {
210 pub fn from_posting_list(list: &PostingList) -> io::Result<Self> {
212 let mut data = Vec::new();
213 list.serialize(&mut data)?;
214 Ok(Self {
215 doc_count: list.len() as u32,
216 data,
217 })
218 }
219
220 pub fn as_bytes(&self) -> &[u8] {
222 &self.data
223 }
224
225 pub fn doc_count(&self) -> u32 {
227 self.doc_count
228 }
229
230 pub fn to_posting_list(&self) -> io::Result<PostingList> {
232 PostingList::deserialize(&mut &self.data[..])
233 }
234}
235
236pub const BLOCK_SIZE: usize = 128;
239
240#[derive(Debug, Clone)]
241pub struct BlockPostingList {
242 skip_list: Vec<(DocId, DocId, u32)>,
246 data: Vec<u8>,
248 doc_count: u32,
250}
251
252impl BlockPostingList {
253 pub fn from_posting_list(list: &PostingList) -> io::Result<Self> {
255 let mut skip_list = Vec::new();
256 let mut data = Vec::new();
257
258 let postings = &list.postings;
259 let mut i = 0;
260
261 while i < postings.len() {
262 let block_start = data.len() as u32;
263 let block_end = (i + BLOCK_SIZE).min(postings.len());
264 let block = &postings[i..block_end];
265
266 let base_doc_id = block.first().unwrap().doc_id;
268 let last_doc_id = block.last().unwrap().doc_id;
269 skip_list.push((base_doc_id, last_doc_id, block_start));
270
271 write_vint(&mut data, block.len() as u64)?;
273
274 let mut prev_doc_id = base_doc_id;
275 for (j, posting) in block.iter().enumerate() {
276 if j == 0 {
277 write_vint(&mut data, posting.doc_id as u64)?;
279 } else {
280 let delta = posting.doc_id - prev_doc_id;
282 write_vint(&mut data, delta as u64)?;
283 }
284 write_vint(&mut data, posting.term_freq as u64)?;
285 prev_doc_id = posting.doc_id;
286 }
287
288 i = block_end;
289 }
290
291 Ok(Self {
292 skip_list,
293 data,
294 doc_count: postings.len() as u32,
295 })
296 }
297
298 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
300 writer.write_u32::<LittleEndian>(self.doc_count)?;
302
303 writer.write_u32::<LittleEndian>(self.skip_list.len() as u32)?;
305 for (base_doc_id, last_doc_id, offset) in &self.skip_list {
306 writer.write_u32::<LittleEndian>(*base_doc_id)?;
307 writer.write_u32::<LittleEndian>(*last_doc_id)?;
308 writer.write_u32::<LittleEndian>(*offset)?;
309 }
310
311 writer.write_u32::<LittleEndian>(self.data.len() as u32)?;
313 writer.write_all(&self.data)?;
314
315 Ok(())
316 }
317
318 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
320 let doc_count = reader.read_u32::<LittleEndian>()?;
321
322 let skip_count = reader.read_u32::<LittleEndian>()? as usize;
323 let mut skip_list = Vec::with_capacity(skip_count);
324 for _ in 0..skip_count {
325 let base_doc_id = reader.read_u32::<LittleEndian>()?;
326 let last_doc_id = reader.read_u32::<LittleEndian>()?;
327 let offset = reader.read_u32::<LittleEndian>()?;
328 skip_list.push((base_doc_id, last_doc_id, offset));
329 }
330
331 let data_len = reader.read_u32::<LittleEndian>()? as usize;
332 let mut data = vec![0u8; data_len];
333 reader.read_exact(&mut data)?;
334
335 Ok(Self {
336 skip_list,
337 data,
338 doc_count,
339 })
340 }
341
342 pub fn doc_count(&self) -> u32 {
343 self.doc_count
344 }
345
346 pub fn num_blocks(&self) -> usize {
348 self.skip_list.len()
349 }
350
351 pub fn block_info(&self, block_idx: usize) -> Option<(DocId, DocId, usize, usize)> {
353 if block_idx >= self.skip_list.len() {
354 return None;
355 }
356 let (base, last, offset) = self.skip_list[block_idx];
357 let next_offset = if block_idx + 1 < self.skip_list.len() {
358 self.skip_list[block_idx + 1].2 as usize
359 } else {
360 self.data.len()
361 };
362 Some((base, last, offset as usize, next_offset - offset as usize))
363 }
364
365 pub fn block_data(&self, block_idx: usize) -> Option<&[u8]> {
367 let (_, _, offset, len) = self.block_info(block_idx)?;
368 Some(&self.data[offset..offset + len])
369 }
370
371 pub fn concatenate_blocks(sources: &[(BlockPostingList, u32)]) -> io::Result<Self> {
374 let mut skip_list = Vec::new();
375 let mut data = Vec::new();
376 let mut total_docs = 0u32;
377
378 for (source, doc_offset) in sources {
379 for block_idx in 0..source.num_blocks() {
380 if let Some((base, last, src_offset, len)) = source.block_info(block_idx) {
381 let new_base = base + doc_offset;
382 let new_last = last + doc_offset;
383 let new_offset = data.len() as u32;
384
385 let block_bytes = &source.data[src_offset..src_offset + len];
387 let mut reader = block_bytes;
388
389 let count = read_vint(&mut reader)? as usize;
391
392 write_vint(&mut data, count as u64)?;
394
395 let first_doc = read_vint(&mut reader)? as u32;
397 let first_tf = read_vint(&mut reader)?;
398 write_vint(&mut data, (first_doc + doc_offset) as u64)?;
399 write_vint(&mut data, first_tf)?;
400
401 data.extend_from_slice(reader);
403
404 skip_list.push((new_base, new_last, new_offset));
405 total_docs += count as u32;
406 }
407 }
408 }
409
410 Ok(Self {
411 skip_list,
412 data,
413 doc_count: total_docs,
414 })
415 }
416
417 pub fn iterator(&self) -> BlockPostingIterator<'_> {
419 BlockPostingIterator::new(self)
420 }
421
422 pub fn into_iterator(self) -> BlockPostingIterator<'static> {
424 BlockPostingIterator::owned(self)
425 }
426}
427
428pub struct BlockPostingIterator<'a> {
431 block_list: std::borrow::Cow<'a, BlockPostingList>,
432 current_block: usize,
433 block_postings: Vec<Posting>,
434 position_in_block: usize,
435 exhausted: bool,
436}
437
438#[allow(dead_code)]
440pub type OwnedBlockPostingIterator = BlockPostingIterator<'static>;
441
442impl<'a> BlockPostingIterator<'a> {
443 fn new(block_list: &'a BlockPostingList) -> Self {
444 let exhausted = block_list.skip_list.is_empty();
445 let mut iter = Self {
446 block_list: std::borrow::Cow::Borrowed(block_list),
447 current_block: 0,
448 block_postings: Vec::new(),
449 position_in_block: 0,
450 exhausted,
451 };
452 if !iter.exhausted {
453 iter.load_block(0);
454 }
455 iter
456 }
457
458 fn owned(block_list: BlockPostingList) -> BlockPostingIterator<'static> {
459 let exhausted = block_list.skip_list.is_empty();
460 let mut iter = BlockPostingIterator {
461 block_list: std::borrow::Cow::Owned(block_list),
462 current_block: 0,
463 block_postings: Vec::new(),
464 position_in_block: 0,
465 exhausted,
466 };
467 if !iter.exhausted {
468 iter.load_block(0);
469 }
470 iter
471 }
472
473 fn load_block(&mut self, block_idx: usize) {
474 if block_idx >= self.block_list.skip_list.len() {
475 self.exhausted = true;
476 return;
477 }
478
479 self.current_block = block_idx;
480 self.position_in_block = 0;
481
482 let offset = self.block_list.skip_list[block_idx].2 as usize;
483 let mut reader = &self.block_list.data[offset..];
484
485 let count = read_vint(&mut reader).unwrap_or(0) as usize;
486 self.block_postings.clear();
487 self.block_postings.reserve(count);
488
489 let mut prev_doc_id = 0u32;
490
491 for i in 0..count {
492 if let (Ok(value), Ok(tf)) = (read_vint(&mut reader), read_vint(&mut reader)) {
493 let doc_id = if i == 0 {
494 value as u32
496 } else {
497 prev_doc_id + value as u32
499 };
500 self.block_postings.push(Posting {
501 doc_id,
502 term_freq: tf as u32,
503 });
504 prev_doc_id = doc_id;
505 }
506 }
507 }
508
509 pub fn doc(&self) -> DocId {
510 if self.exhausted {
511 TERMINATED
512 } else if self.position_in_block < self.block_postings.len() {
513 self.block_postings[self.position_in_block].doc_id
514 } else {
515 TERMINATED
516 }
517 }
518
519 pub fn term_freq(&self) -> u32 {
520 if self.exhausted || self.position_in_block >= self.block_postings.len() {
521 0
522 } else {
523 self.block_postings[self.position_in_block].term_freq
524 }
525 }
526
527 pub fn advance(&mut self) -> DocId {
528 if self.exhausted {
529 return TERMINATED;
530 }
531
532 self.position_in_block += 1;
533 if self.position_in_block >= self.block_postings.len() {
534 self.load_block(self.current_block + 1);
535 }
536 self.doc()
537 }
538
539 pub fn seek(&mut self, target: DocId) -> DocId {
540 if self.exhausted {
541 return TERMINATED;
542 }
543
544 let target_block = self
545 .block_list
546 .skip_list
547 .iter()
548 .position(|(_, last_doc, _)| *last_doc >= target);
549
550 if let Some(block_idx) = target_block {
551 if block_idx != self.current_block {
552 self.load_block(block_idx);
553 }
554
555 while self.position_in_block < self.block_postings.len() {
556 if self.block_postings[self.position_in_block].doc_id >= target {
557 return self.doc();
558 }
559 self.position_in_block += 1;
560 }
561
562 self.load_block(self.current_block + 1);
563 self.seek(target)
564 } else {
565 self.exhausted = true;
566 TERMINATED
567 }
568 }
569}
570
571#[cfg(test)]
572mod tests {
573 use super::*;
574
575 #[test]
576 fn test_posting_list_basic() {
577 let mut list = PostingList::new();
578 list.push(1, 2);
579 list.push(5, 1);
580 list.push(10, 3);
581
582 assert_eq!(list.len(), 3);
583
584 let mut iter = PostingListIterator::new(&list);
585 assert_eq!(iter.doc(), 1);
586 assert_eq!(iter.term_freq(), 2);
587
588 assert_eq!(iter.advance(), 5);
589 assert_eq!(iter.term_freq(), 1);
590
591 assert_eq!(iter.advance(), 10);
592 assert_eq!(iter.term_freq(), 3);
593
594 assert_eq!(iter.advance(), TERMINATED);
595 }
596
597 #[test]
598 fn test_posting_list_serialization() {
599 let mut list = PostingList::new();
600 for i in 0..100 {
601 list.push(i * 3, (i % 5) + 1);
602 }
603
604 let mut buffer = Vec::new();
605 list.serialize(&mut buffer).unwrap();
606
607 let deserialized = PostingList::deserialize(&mut &buffer[..]).unwrap();
608 assert_eq!(deserialized.len(), list.len());
609
610 for (a, b) in list.iter().zip(deserialized.iter()) {
611 assert_eq!(a, b);
612 }
613 }
614
615 #[test]
616 fn test_posting_list_seek() {
617 let mut list = PostingList::new();
618 for i in 0..100 {
619 list.push(i * 2, 1);
620 }
621
622 let mut iter = PostingListIterator::new(&list);
623
624 assert_eq!(iter.seek(50), 50);
625 assert_eq!(iter.seek(51), 52);
626 assert_eq!(iter.seek(200), TERMINATED);
627 }
628
629 #[test]
630 fn test_block_posting_list() {
631 let mut list = PostingList::new();
632 for i in 0..500 {
633 list.push(i * 2, (i % 10) + 1);
634 }
635
636 let block_list = BlockPostingList::from_posting_list(&list).unwrap();
637 assert_eq!(block_list.doc_count(), 500);
638
639 let mut iter = block_list.iterator();
640 assert_eq!(iter.doc(), 0);
641 assert_eq!(iter.term_freq(), 1);
642
643 assert_eq!(iter.seek(500), 500);
645 assert_eq!(iter.seek(998), 998);
646 assert_eq!(iter.seek(1000), TERMINATED);
647 }
648
649 #[test]
650 fn test_block_posting_list_serialization() {
651 let mut list = PostingList::new();
652 for i in 0..300 {
653 list.push(i * 3, i + 1);
654 }
655
656 let block_list = BlockPostingList::from_posting_list(&list).unwrap();
657
658 let mut buffer = Vec::new();
659 block_list.serialize(&mut buffer).unwrap();
660
661 let deserialized = BlockPostingList::deserialize(&mut &buffer[..]).unwrap();
662 assert_eq!(deserialized.doc_count(), block_list.doc_count());
663
664 let mut iter1 = block_list.iterator();
666 let mut iter2 = deserialized.iterator();
667
668 while iter1.doc() != TERMINATED {
669 assert_eq!(iter1.doc(), iter2.doc());
670 assert_eq!(iter1.term_freq(), iter2.term_freq());
671 iter1.advance();
672 iter2.advance();
673 }
674 assert_eq!(iter2.doc(), TERMINATED);
675 }
676}