hermes_core/structures/postings/
posting.rs1use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
6use std::io::{self, Read, Write};
7
8use crate::DocId;
9
10#[derive(Debug, Clone, Copy, PartialEq, Eq)]
12pub struct Posting {
13 pub doc_id: DocId,
14 pub term_freq: u32,
15}
16
17#[derive(Debug, Clone, Default)]
19pub struct PostingList {
20 postings: Vec<Posting>,
21}
22
23impl PostingList {
24 pub fn new() -> Self {
25 Self::default()
26 }
27
28 pub fn with_capacity(capacity: usize) -> Self {
29 Self {
30 postings: Vec::with_capacity(capacity),
31 }
32 }
33
34 pub fn push(&mut self, doc_id: DocId, term_freq: u32) {
36 debug_assert!(
37 self.postings.is_empty() || self.postings.last().unwrap().doc_id < doc_id,
38 "Postings must be added in sorted order"
39 );
40 self.postings.push(Posting { doc_id, term_freq });
41 }
42
43 pub fn add(&mut self, doc_id: DocId, term_freq: u32) {
45 if let Some(last) = self.postings.last_mut()
46 && last.doc_id == doc_id
47 {
48 last.term_freq += term_freq;
49 return;
50 }
51 self.postings.push(Posting { doc_id, term_freq });
52 }
53
54 pub fn doc_count(&self) -> u32 {
56 self.postings.len() as u32
57 }
58
59 pub fn len(&self) -> usize {
60 self.postings.len()
61 }
62
63 pub fn is_empty(&self) -> bool {
64 self.postings.is_empty()
65 }
66
67 pub fn iter(&self) -> impl Iterator<Item = &Posting> {
68 self.postings.iter()
69 }
70
71 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
73 write_vint(writer, self.postings.len() as u64)?;
75
76 let mut prev_doc_id = 0u32;
77 for posting in &self.postings {
78 let delta = posting.doc_id - prev_doc_id;
80 write_vint(writer, delta as u64)?;
81 write_vint(writer, posting.term_freq as u64)?;
82 prev_doc_id = posting.doc_id;
83 }
84
85 Ok(())
86 }
87
88 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
90 let count = read_vint(reader)? as usize;
91 let mut postings = Vec::with_capacity(count);
92
93 let mut prev_doc_id = 0u32;
94 for _ in 0..count {
95 let delta = read_vint(reader)? as u32;
96 let term_freq = read_vint(reader)? as u32;
97 let doc_id = prev_doc_id + delta;
98 postings.push(Posting { doc_id, term_freq });
99 prev_doc_id = doc_id;
100 }
101
102 Ok(Self { postings })
103 }
104}
105
106pub struct PostingListIterator<'a> {
108 postings: &'a [Posting],
109 position: usize,
110}
111
112impl<'a> PostingListIterator<'a> {
113 pub fn new(posting_list: &'a PostingList) -> Self {
114 Self {
115 postings: &posting_list.postings,
116 position: 0,
117 }
118 }
119
120 pub fn doc(&self) -> DocId {
122 if self.position < self.postings.len() {
123 self.postings[self.position].doc_id
124 } else {
125 TERMINATED
126 }
127 }
128
129 pub fn term_freq(&self) -> u32 {
131 if self.position < self.postings.len() {
132 self.postings[self.position].term_freq
133 } else {
134 0
135 }
136 }
137
138 pub fn advance(&mut self) -> DocId {
140 self.position += 1;
141 self.doc()
142 }
143
144 pub fn seek(&mut self, target: DocId) -> DocId {
146 while self.position < self.postings.len() {
148 if self.postings[self.position].doc_id >= target {
149 return self.postings[self.position].doc_id;
150 }
151 self.position += 1;
152 }
153 TERMINATED
154 }
155
156 pub fn size_hint(&self) -> usize {
158 self.postings.len().saturating_sub(self.position)
159 }
160}
161
162pub const TERMINATED: DocId = DocId::MAX;
164
165fn write_vint<W: Write>(writer: &mut W, mut value: u64) -> io::Result<()> {
167 loop {
168 let byte = (value & 0x7F) as u8;
169 value >>= 7;
170 if value == 0 {
171 writer.write_u8(byte)?;
172 return Ok(());
173 } else {
174 writer.write_u8(byte | 0x80)?;
175 }
176 }
177}
178
179fn read_vint<R: Read>(reader: &mut R) -> io::Result<u64> {
181 let mut result = 0u64;
182 let mut shift = 0;
183
184 loop {
185 let byte = reader.read_u8()?;
186 result |= ((byte & 0x7F) as u64) << shift;
187 if byte & 0x80 == 0 {
188 return Ok(result);
189 }
190 shift += 7;
191 if shift >= 64 {
192 return Err(io::Error::new(
193 io::ErrorKind::InvalidData,
194 "varint too long",
195 ));
196 }
197 }
198}
199
200#[allow(dead_code)]
202#[derive(Debug, Clone)]
203pub struct CompactPostingList {
204 data: Vec<u8>,
205 doc_count: u32,
206}
207
208#[allow(dead_code)]
209impl CompactPostingList {
210 pub fn from_posting_list(list: &PostingList) -> io::Result<Self> {
212 let mut data = Vec::new();
213 list.serialize(&mut data)?;
214 Ok(Self {
215 doc_count: list.len() as u32,
216 data,
217 })
218 }
219
220 pub fn as_bytes(&self) -> &[u8] {
222 &self.data
223 }
224
225 pub fn doc_count(&self) -> u32 {
227 self.doc_count
228 }
229
230 pub fn to_posting_list(&self) -> io::Result<PostingList> {
232 PostingList::deserialize(&mut &self.data[..])
233 }
234}
235
236pub const BLOCK_SIZE: usize = 128;
239
240#[derive(Debug, Clone)]
241pub struct BlockPostingList {
242 skip_list: Vec<(DocId, DocId, u32, u32)>,
246 data: Vec<u8>,
248 doc_count: u32,
250 max_tf: u32,
252}
253
254impl BlockPostingList {
255 pub fn from_posting_list(list: &PostingList) -> io::Result<Self> {
257 let mut skip_list = Vec::new();
258 let mut data = Vec::new();
259 let mut max_tf = 0u32;
260
261 let postings = &list.postings;
262 let mut i = 0;
263
264 while i < postings.len() {
265 let block_start = data.len() as u32;
266 let block_end = (i + BLOCK_SIZE).min(postings.len());
267 let block = &postings[i..block_end];
268
269 let block_max_tf = block.iter().map(|p| p.term_freq).max().unwrap_or(0);
271 max_tf = max_tf.max(block_max_tf);
272
273 let base_doc_id = block.first().unwrap().doc_id;
275 let last_doc_id = block.last().unwrap().doc_id;
276 skip_list.push((base_doc_id, last_doc_id, block_start, block_max_tf));
277
278 write_vint(&mut data, block.len() as u64)?;
280
281 let mut prev_doc_id = base_doc_id;
282 for (j, posting) in block.iter().enumerate() {
283 if j == 0 {
284 write_vint(&mut data, posting.doc_id as u64)?;
286 } else {
287 let delta = posting.doc_id - prev_doc_id;
289 write_vint(&mut data, delta as u64)?;
290 }
291 write_vint(&mut data, posting.term_freq as u64)?;
292 prev_doc_id = posting.doc_id;
293 }
294
295 i = block_end;
296 }
297
298 Ok(Self {
299 skip_list,
300 data,
301 doc_count: postings.len() as u32,
302 max_tf,
303 })
304 }
305
306 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
308 writer.write_u32::<LittleEndian>(self.doc_count)?;
310 writer.write_u32::<LittleEndian>(self.max_tf)?;
311
312 writer.write_u32::<LittleEndian>(self.skip_list.len() as u32)?;
314 for (base_doc_id, last_doc_id, offset, block_max_tf) in &self.skip_list {
315 writer.write_u32::<LittleEndian>(*base_doc_id)?;
316 writer.write_u32::<LittleEndian>(*last_doc_id)?;
317 writer.write_u32::<LittleEndian>(*offset)?;
318 writer.write_u32::<LittleEndian>(*block_max_tf)?;
319 }
320
321 writer.write_u32::<LittleEndian>(self.data.len() as u32)?;
323 writer.write_all(&self.data)?;
324
325 Ok(())
326 }
327
328 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
330 let doc_count = reader.read_u32::<LittleEndian>()?;
331 let max_tf = reader.read_u32::<LittleEndian>()?;
332
333 let skip_count = reader.read_u32::<LittleEndian>()? as usize;
334 let mut skip_list = Vec::with_capacity(skip_count);
335 for _ in 0..skip_count {
336 let base_doc_id = reader.read_u32::<LittleEndian>()?;
337 let last_doc_id = reader.read_u32::<LittleEndian>()?;
338 let offset = reader.read_u32::<LittleEndian>()?;
339 let block_max_tf = reader.read_u32::<LittleEndian>()?;
340 skip_list.push((base_doc_id, last_doc_id, offset, block_max_tf));
341 }
342
343 let data_len = reader.read_u32::<LittleEndian>()? as usize;
344 let mut data = vec![0u8; data_len];
345 reader.read_exact(&mut data)?;
346
347 Ok(Self {
348 skip_list,
349 data,
350 max_tf,
351 doc_count,
352 })
353 }
354
355 pub fn doc_count(&self) -> u32 {
356 self.doc_count
357 }
358
359 pub fn max_tf(&self) -> u32 {
361 self.max_tf
362 }
363
364 pub fn num_blocks(&self) -> usize {
366 self.skip_list.len()
367 }
368
369 pub fn block_info(&self, block_idx: usize) -> Option<(DocId, DocId, usize, usize, u32)> {
371 if block_idx >= self.skip_list.len() {
372 return None;
373 }
374 let (base, last, offset, block_max_tf) = self.skip_list[block_idx];
375 let next_offset = if block_idx + 1 < self.skip_list.len() {
376 self.skip_list[block_idx + 1].2 as usize
377 } else {
378 self.data.len()
379 };
380 Some((
381 base,
382 last,
383 offset as usize,
384 next_offset - offset as usize,
385 block_max_tf,
386 ))
387 }
388
389 pub fn block_max_tf(&self, block_idx: usize) -> Option<u32> {
391 self.skip_list
392 .get(block_idx)
393 .map(|(_, _, _, max_tf)| *max_tf)
394 }
395
396 pub fn block_data(&self, block_idx: usize) -> Option<&[u8]> {
398 let (_, _, offset, len, _) = self.block_info(block_idx)?;
399 Some(&self.data[offset..offset + len])
400 }
401
402 pub fn concatenate_blocks(sources: &[(BlockPostingList, u32)]) -> io::Result<Self> {
405 let mut skip_list = Vec::new();
406 let mut data = Vec::new();
407 let mut total_docs = 0u32;
408 let mut max_tf = 0u32;
409
410 for (source, doc_offset) in sources {
411 max_tf = max_tf.max(source.max_tf);
412 for block_idx in 0..source.num_blocks() {
413 if let Some((base, last, src_offset, len, block_max_tf)) =
414 source.block_info(block_idx)
415 {
416 let new_base = base + doc_offset;
417 let new_last = last + doc_offset;
418 let new_offset = data.len() as u32;
419
420 let block_bytes = &source.data[src_offset..src_offset + len];
422 let mut reader = block_bytes;
423
424 let count = read_vint(&mut reader)? as usize;
426
427 write_vint(&mut data, count as u64)?;
429
430 let first_doc = read_vint(&mut reader)? as u32;
432 let first_tf = read_vint(&mut reader)?;
433 write_vint(&mut data, (first_doc + doc_offset) as u64)?;
434 write_vint(&mut data, first_tf)?;
435
436 data.extend_from_slice(reader);
438
439 skip_list.push((new_base, new_last, new_offset, block_max_tf));
440 total_docs += count as u32;
441 }
442 }
443 }
444
445 Ok(Self {
446 skip_list,
447 data,
448 doc_count: total_docs,
449 max_tf,
450 })
451 }
452
453 pub fn iterator(&self) -> BlockPostingIterator<'_> {
455 BlockPostingIterator::new(self)
456 }
457
458 pub fn into_iterator(self) -> BlockPostingIterator<'static> {
460 BlockPostingIterator::owned(self)
461 }
462}
463
464pub struct BlockPostingIterator<'a> {
467 block_list: std::borrow::Cow<'a, BlockPostingList>,
468 current_block: usize,
469 block_postings: Vec<Posting>,
470 position_in_block: usize,
471 exhausted: bool,
472}
473
474#[allow(dead_code)]
476pub type OwnedBlockPostingIterator = BlockPostingIterator<'static>;
477
478impl<'a> BlockPostingIterator<'a> {
479 fn new(block_list: &'a BlockPostingList) -> Self {
480 let exhausted = block_list.skip_list.is_empty();
481 let mut iter = Self {
482 block_list: std::borrow::Cow::Borrowed(block_list),
483 current_block: 0,
484 block_postings: Vec::new(),
485 position_in_block: 0,
486 exhausted,
487 };
488 if !iter.exhausted {
489 iter.load_block(0);
490 }
491 iter
492 }
493
494 fn owned(block_list: BlockPostingList) -> BlockPostingIterator<'static> {
495 let exhausted = block_list.skip_list.is_empty();
496 let mut iter = BlockPostingIterator {
497 block_list: std::borrow::Cow::Owned(block_list),
498 current_block: 0,
499 block_postings: Vec::new(),
500 position_in_block: 0,
501 exhausted,
502 };
503 if !iter.exhausted {
504 iter.load_block(0);
505 }
506 iter
507 }
508
509 fn load_block(&mut self, block_idx: usize) {
510 if block_idx >= self.block_list.skip_list.len() {
511 self.exhausted = true;
512 return;
513 }
514
515 self.current_block = block_idx;
516 self.position_in_block = 0;
517
518 let offset = self.block_list.skip_list[block_idx].2 as usize;
519 let mut reader = &self.block_list.data[offset..];
520
521 let count = read_vint(&mut reader).unwrap_or(0) as usize;
522 self.block_postings.clear();
523 self.block_postings.reserve(count);
524
525 let mut prev_doc_id = 0u32;
526
527 for i in 0..count {
528 if let (Ok(value), Ok(tf)) = (read_vint(&mut reader), read_vint(&mut reader)) {
529 let doc_id = if i == 0 {
530 value as u32
532 } else {
533 prev_doc_id + value as u32
535 };
536 self.block_postings.push(Posting {
537 doc_id,
538 term_freq: tf as u32,
539 });
540 prev_doc_id = doc_id;
541 }
542 }
543 }
544
545 pub fn doc(&self) -> DocId {
546 if self.exhausted {
547 TERMINATED
548 } else if self.position_in_block < self.block_postings.len() {
549 self.block_postings[self.position_in_block].doc_id
550 } else {
551 TERMINATED
552 }
553 }
554
555 pub fn term_freq(&self) -> u32 {
556 if self.exhausted || self.position_in_block >= self.block_postings.len() {
557 0
558 } else {
559 self.block_postings[self.position_in_block].term_freq
560 }
561 }
562
563 pub fn advance(&mut self) -> DocId {
564 if self.exhausted {
565 return TERMINATED;
566 }
567
568 self.position_in_block += 1;
569 if self.position_in_block >= self.block_postings.len() {
570 self.load_block(self.current_block + 1);
571 }
572 self.doc()
573 }
574
575 pub fn seek(&mut self, target: DocId) -> DocId {
576 if self.exhausted {
577 return TERMINATED;
578 }
579
580 let target_block = self
581 .block_list
582 .skip_list
583 .iter()
584 .position(|(_, last_doc, _, _)| *last_doc >= target);
585
586 if let Some(block_idx) = target_block {
587 if block_idx != self.current_block {
588 self.load_block(block_idx);
589 }
590
591 while self.position_in_block < self.block_postings.len() {
592 if self.block_postings[self.position_in_block].doc_id >= target {
593 return self.doc();
594 }
595 self.position_in_block += 1;
596 }
597
598 self.load_block(self.current_block + 1);
599 self.seek(target)
600 } else {
601 self.exhausted = true;
602 TERMINATED
603 }
604 }
605
606 pub fn skip_to_next_block(&mut self) -> DocId {
610 if self.exhausted {
611 return TERMINATED;
612 }
613 self.load_block(self.current_block + 1);
614 self.doc()
615 }
616
617 #[inline]
619 pub fn current_block_idx(&self) -> usize {
620 self.current_block
621 }
622
623 #[inline]
625 pub fn num_blocks(&self) -> usize {
626 self.block_list.skip_list.len()
627 }
628
629 #[inline]
631 pub fn current_block_max_tf(&self) -> u32 {
632 if self.exhausted || self.current_block >= self.block_list.skip_list.len() {
633 0
634 } else {
635 self.block_list.skip_list[self.current_block].3
636 }
637 }
638}
639
640#[cfg(test)]
641mod tests {
642 use super::*;
643
644 #[test]
645 fn test_posting_list_basic() {
646 let mut list = PostingList::new();
647 list.push(1, 2);
648 list.push(5, 1);
649 list.push(10, 3);
650
651 assert_eq!(list.len(), 3);
652
653 let mut iter = PostingListIterator::new(&list);
654 assert_eq!(iter.doc(), 1);
655 assert_eq!(iter.term_freq(), 2);
656
657 assert_eq!(iter.advance(), 5);
658 assert_eq!(iter.term_freq(), 1);
659
660 assert_eq!(iter.advance(), 10);
661 assert_eq!(iter.term_freq(), 3);
662
663 assert_eq!(iter.advance(), TERMINATED);
664 }
665
666 #[test]
667 fn test_posting_list_serialization() {
668 let mut list = PostingList::new();
669 for i in 0..100 {
670 list.push(i * 3, (i % 5) + 1);
671 }
672
673 let mut buffer = Vec::new();
674 list.serialize(&mut buffer).unwrap();
675
676 let deserialized = PostingList::deserialize(&mut &buffer[..]).unwrap();
677 assert_eq!(deserialized.len(), list.len());
678
679 for (a, b) in list.iter().zip(deserialized.iter()) {
680 assert_eq!(a, b);
681 }
682 }
683
684 #[test]
685 fn test_posting_list_seek() {
686 let mut list = PostingList::new();
687 for i in 0..100 {
688 list.push(i * 2, 1);
689 }
690
691 let mut iter = PostingListIterator::new(&list);
692
693 assert_eq!(iter.seek(50), 50);
694 assert_eq!(iter.seek(51), 52);
695 assert_eq!(iter.seek(200), TERMINATED);
696 }
697
698 #[test]
699 fn test_block_posting_list() {
700 let mut list = PostingList::new();
701 for i in 0..500 {
702 list.push(i * 2, (i % 10) + 1);
703 }
704
705 let block_list = BlockPostingList::from_posting_list(&list).unwrap();
706 assert_eq!(block_list.doc_count(), 500);
707
708 let mut iter = block_list.iterator();
709 assert_eq!(iter.doc(), 0);
710 assert_eq!(iter.term_freq(), 1);
711
712 assert_eq!(iter.seek(500), 500);
714 assert_eq!(iter.seek(998), 998);
715 assert_eq!(iter.seek(1000), TERMINATED);
716 }
717
718 #[test]
719 fn test_block_posting_list_serialization() {
720 let mut list = PostingList::new();
721 for i in 0..300 {
722 list.push(i * 3, i + 1);
723 }
724
725 let block_list = BlockPostingList::from_posting_list(&list).unwrap();
726
727 let mut buffer = Vec::new();
728 block_list.serialize(&mut buffer).unwrap();
729
730 let deserialized = BlockPostingList::deserialize(&mut &buffer[..]).unwrap();
731 assert_eq!(deserialized.doc_count(), block_list.doc_count());
732
733 let mut iter1 = block_list.iterator();
735 let mut iter2 = deserialized.iterator();
736
737 while iter1.doc() != TERMINATED {
738 assert_eq!(iter1.doc(), iter2.doc());
739 assert_eq!(iter1.term_freq(), iter2.term_freq());
740 iter1.advance();
741 iter2.advance();
742 }
743 assert_eq!(iter2.doc(), TERMINATED);
744 }
745}