1use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
26use std::io::{self, Read, Write};
27
28use crate::DocId;
29
30pub const POSITION_BLOCK_SIZE: usize = 128;
32
33pub const MAX_TOKEN_POSITION: u32 = (1 << 20) - 1;
35
36pub const MAX_ELEMENT_ORDINAL: u32 = (1 << 12) - 1;
38
39#[inline]
41pub fn encode_position(element_ordinal: u32, token_position: u32) -> u32 {
42 debug_assert!(
43 element_ordinal <= MAX_ELEMENT_ORDINAL,
44 "Element ordinal {} exceeds maximum {}",
45 element_ordinal,
46 MAX_ELEMENT_ORDINAL
47 );
48 debug_assert!(
49 token_position <= MAX_TOKEN_POSITION,
50 "Token position {} exceeds maximum {}",
51 token_position,
52 MAX_TOKEN_POSITION
53 );
54 (element_ordinal << 20) | (token_position & MAX_TOKEN_POSITION)
55}
56
57#[inline]
59pub fn decode_element_ordinal(position: u32) -> u32 {
60 position >> 20
61}
62
63#[inline]
65pub fn decode_token_position(position: u32) -> u32 {
66 position & MAX_TOKEN_POSITION
67}
68
69#[derive(Debug, Clone, PartialEq, Eq)]
71pub struct PostingWithPositions {
72 pub doc_id: DocId,
73 pub term_freq: u32,
74 pub positions: Vec<u32>,
76}
77
78#[derive(Debug, Clone)]
83pub struct PositionPostingList {
84 skip_list: Vec<(DocId, DocId, u32)>,
87 data: Vec<u8>,
89 doc_count: u32,
91}
92
93impl Default for PositionPostingList {
94 fn default() -> Self {
95 Self::new()
96 }
97}
98
99impl PositionPostingList {
100 pub fn new() -> Self {
101 Self {
102 skip_list: Vec::new(),
103 data: Vec::new(),
104 doc_count: 0,
105 }
106 }
107
108 pub fn with_capacity(capacity: usize) -> Self {
109 Self {
110 skip_list: Vec::with_capacity(capacity / POSITION_BLOCK_SIZE + 1),
111 data: Vec::with_capacity(capacity * 8), doc_count: 0,
113 }
114 }
115
116 pub fn from_postings(postings: &[PostingWithPositions]) -> io::Result<Self> {
118 if postings.is_empty() {
119 return Ok(Self::new());
120 }
121
122 let mut skip_list = Vec::new();
123 let mut data = Vec::new();
124 let mut i = 0;
125
126 while i < postings.len() {
127 let block_start = data.len() as u32;
128 let block_end = (i + POSITION_BLOCK_SIZE).min(postings.len());
129 let block = &postings[i..block_end];
130
131 let base_doc_id = block.first().unwrap().doc_id;
133 let last_doc_id = block.last().unwrap().doc_id;
134 skip_list.push((base_doc_id, last_doc_id, block_start));
135
136 write_vint(&mut data, block.len() as u64)?;
138
139 let mut prev_doc_id = base_doc_id;
140 for (j, posting) in block.iter().enumerate() {
141 if j == 0 {
142 write_vint(&mut data, posting.doc_id as u64)?;
144 } else {
145 let delta = posting.doc_id - prev_doc_id;
147 write_vint(&mut data, delta as u64)?;
148 }
149 prev_doc_id = posting.doc_id;
150
151 write_vint(&mut data, posting.positions.len() as u64)?;
153 for &pos in &posting.positions {
154 write_vint(&mut data, pos as u64)?;
155 }
156 }
157
158 i = block_end;
159 }
160
161 Ok(Self {
162 skip_list,
163 data,
164 doc_count: postings.len() as u32,
165 })
166 }
167
168 pub fn push(&mut self, doc_id: DocId, positions: Vec<u32>) {
170 let posting = PostingWithPositions {
173 doc_id,
174 term_freq: positions.len() as u32,
175 positions,
176 };
177
178 let block_start = self.data.len() as u32;
180
181 let need_new_block =
183 self.skip_list.is_empty() || self.doc_count.is_multiple_of(POSITION_BLOCK_SIZE as u32);
184
185 if need_new_block {
186 self.skip_list.push((doc_id, doc_id, block_start));
188 write_vint(&mut self.data, 1u64).unwrap();
190 write_vint(&mut self.data, doc_id as u64).unwrap();
192 } else {
193 let last_block = self.skip_list.last_mut().unwrap();
195 let prev_doc_id = last_block.1;
196 last_block.1 = doc_id; let delta = doc_id - prev_doc_id;
200 write_vint(&mut self.data, delta as u64).unwrap();
201 }
202
203 write_vint(&mut self.data, posting.positions.len() as u64).unwrap();
205 for &pos in &posting.positions {
206 write_vint(&mut self.data, pos as u64).unwrap();
207 }
208
209 self.doc_count += 1;
210 }
211
212 pub fn doc_count(&self) -> u32 {
213 self.doc_count
214 }
215
216 pub fn len(&self) -> usize {
217 self.doc_count as usize
218 }
219
220 pub fn is_empty(&self) -> bool {
221 self.doc_count == 0
222 }
223
224 pub fn get_positions(&self, target_doc_id: DocId) -> Option<Vec<u32>> {
226 if self.skip_list.is_empty() {
227 return None;
228 }
229
230 let block_idx = match self.skip_list.binary_search_by(|&(base, last, _)| {
232 if target_doc_id < base {
233 std::cmp::Ordering::Greater
234 } else if target_doc_id > last {
235 std::cmp::Ordering::Less
236 } else {
237 std::cmp::Ordering::Equal
238 }
239 }) {
240 Ok(idx) => idx,
241 Err(_) => return None, };
243
244 let offset = self.skip_list[block_idx].2 as usize;
246 let mut reader = &self.data[offset..];
247
248 let count = read_vint(&mut reader).ok()? as usize;
249 let mut prev_doc_id = 0u32;
250
251 for i in 0..count {
252 let doc_id = if i == 0 {
253 read_vint(&mut reader).ok()? as u32
254 } else {
255 let delta = read_vint(&mut reader).ok()? as u32;
256 prev_doc_id + delta
257 };
258 prev_doc_id = doc_id;
259
260 let num_positions = read_vint(&mut reader).ok()? as usize;
261
262 if doc_id == target_doc_id {
263 let mut positions = Vec::with_capacity(num_positions);
265 for _ in 0..num_positions {
266 let pos = read_vint(&mut reader).ok()? as u32;
267 positions.push(pos);
268 }
269 return Some(positions);
270 } else {
271 for _ in 0..num_positions {
273 let _ = read_vint(&mut reader);
274 }
275 }
276 }
277
278 None
279 }
280
281 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
283 writer.write_u32::<LittleEndian>(self.doc_count)?;
285
286 writer.write_u32::<LittleEndian>(self.skip_list.len() as u32)?;
288 for (base_doc_id, last_doc_id, offset) in &self.skip_list {
289 writer.write_u32::<LittleEndian>(*base_doc_id)?;
290 writer.write_u32::<LittleEndian>(*last_doc_id)?;
291 writer.write_u32::<LittleEndian>(*offset)?;
292 }
293
294 writer.write_u32::<LittleEndian>(self.data.len() as u32)?;
296 writer.write_all(&self.data)?;
297
298 Ok(())
299 }
300
301 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
303 let doc_count = reader.read_u32::<LittleEndian>()?;
304
305 let skip_count = reader.read_u32::<LittleEndian>()? as usize;
306 let mut skip_list = Vec::with_capacity(skip_count);
307 for _ in 0..skip_count {
308 let base_doc_id = reader.read_u32::<LittleEndian>()?;
309 let last_doc_id = reader.read_u32::<LittleEndian>()?;
310 let offset = reader.read_u32::<LittleEndian>()?;
311 skip_list.push((base_doc_id, last_doc_id, offset));
312 }
313
314 let data_len = reader.read_u32::<LittleEndian>()? as usize;
315 let mut data = vec![0u8; data_len];
316 reader.read_exact(&mut data)?;
317
318 Ok(Self {
319 skip_list,
320 data,
321 doc_count,
322 })
323 }
324
325 pub fn concatenate_blocks(sources: &[(PositionPostingList, u32)]) -> io::Result<Self> {
327 let mut skip_list = Vec::new();
328 let mut data = Vec::new();
329 let mut total_docs = 0u32;
330
331 for (source, doc_offset) in sources {
332 for block_idx in 0..source.skip_list.len() {
333 let (base, last, src_offset) = source.skip_list[block_idx];
334 let next_offset = if block_idx + 1 < source.skip_list.len() {
335 source.skip_list[block_idx + 1].2 as usize
336 } else {
337 source.data.len()
338 };
339
340 let new_base = base + doc_offset;
341 let new_last = last + doc_offset;
342 let new_offset = data.len() as u32;
343
344 let block_bytes = &source.data[src_offset as usize..next_offset];
346 let mut reader = block_bytes;
347
348 let count = read_vint(&mut reader)? as usize;
349 write_vint(&mut data, count as u64)?;
350
351 let first_doc = read_vint(&mut reader)? as u32;
353 write_vint(&mut data, (first_doc + doc_offset) as u64)?;
354
355 data.extend_from_slice(reader);
357
358 skip_list.push((new_base, new_last, new_offset));
359 total_docs += count as u32;
360 }
361 }
362
363 Ok(Self {
364 skip_list,
365 data,
366 doc_count: total_docs,
367 })
368 }
369
370 pub fn iter(&self) -> PositionPostingIterator<'_> {
372 PositionPostingIterator::new(self)
373 }
374}
375
376fn write_vint<W: Write>(writer: &mut W, mut value: u64) -> io::Result<()> {
378 loop {
379 let byte = (value & 0x7F) as u8;
380 value >>= 7;
381 if value == 0 {
382 writer.write_u8(byte)?;
383 break;
384 } else {
385 writer.write_u8(byte | 0x80)?;
386 }
387 }
388 Ok(())
389}
390
391fn read_vint<R: Read>(reader: &mut R) -> io::Result<u64> {
393 let mut result = 0u64;
394 let mut shift = 0;
395 loop {
396 let byte = reader.read_u8()?;
397 result |= ((byte & 0x7F) as u64) << shift;
398 if byte & 0x80 == 0 {
399 break;
400 }
401 shift += 7;
402 }
403 Ok(result)
404}
405
406pub struct PositionPostingIterator<'a> {
408 list: &'a PositionPostingList,
409 current_block: usize,
410 position_in_block: usize,
411 block_postings: Vec<PostingWithPositions>,
412 exhausted: bool,
413}
414
415impl<'a> PositionPostingIterator<'a> {
416 pub fn new(list: &'a PositionPostingList) -> Self {
417 let exhausted = list.skip_list.is_empty();
418 let mut iter = Self {
419 list,
420 current_block: 0,
421 position_in_block: 0,
422 block_postings: Vec::new(),
423 exhausted,
424 };
425 if !iter.exhausted {
426 iter.load_block(0);
427 }
428 iter
429 }
430
431 fn load_block(&mut self, block_idx: usize) {
432 if block_idx >= self.list.skip_list.len() {
433 self.exhausted = true;
434 return;
435 }
436
437 self.current_block = block_idx;
438 self.position_in_block = 0;
439
440 let offset = self.list.skip_list[block_idx].2 as usize;
441 let mut reader = &self.list.data[offset..];
442
443 let count = read_vint(&mut reader).unwrap_or(0) as usize;
444 self.block_postings.clear();
445 self.block_postings.reserve(count);
446
447 let mut prev_doc_id = 0u32;
448
449 for i in 0..count {
450 let doc_id = if i == 0 {
451 read_vint(&mut reader).unwrap_or(0) as u32
452 } else {
453 let delta = read_vint(&mut reader).unwrap_or(0) as u32;
454 prev_doc_id + delta
455 };
456 prev_doc_id = doc_id;
457
458 let num_positions = read_vint(&mut reader).unwrap_or(0) as usize;
459 let mut positions = Vec::with_capacity(num_positions);
460 for _ in 0..num_positions {
461 let pos = read_vint(&mut reader).unwrap_or(0) as u32;
462 positions.push(pos);
463 }
464
465 self.block_postings.push(PostingWithPositions {
466 doc_id,
467 term_freq: num_positions as u32,
468 positions,
469 });
470 }
471 }
472
473 pub fn doc(&self) -> DocId {
474 if self.exhausted || self.position_in_block >= self.block_postings.len() {
475 u32::MAX
476 } else {
477 self.block_postings[self.position_in_block].doc_id
478 }
479 }
480
481 pub fn term_freq(&self) -> u32 {
482 if self.exhausted || self.position_in_block >= self.block_postings.len() {
483 0
484 } else {
485 self.block_postings[self.position_in_block].term_freq
486 }
487 }
488
489 pub fn positions(&self) -> &[u32] {
490 if self.exhausted || self.position_in_block >= self.block_postings.len() {
491 &[]
492 } else {
493 &self.block_postings[self.position_in_block].positions
494 }
495 }
496
497 pub fn advance(&mut self) {
498 if self.exhausted {
499 return;
500 }
501
502 self.position_in_block += 1;
503 if self.position_in_block >= self.block_postings.len() {
504 self.load_block(self.current_block + 1);
505 }
506 }
507
508 pub fn seek(&mut self, target: DocId) {
509 if self.exhausted {
510 return;
511 }
512
513 if let Some((_, last, _)) = self.list.skip_list.get(self.current_block)
515 && target <= *last
516 {
517 while self.position_in_block < self.block_postings.len()
519 && self.block_postings[self.position_in_block].doc_id < target
520 {
521 self.position_in_block += 1;
522 }
523 if self.position_in_block >= self.block_postings.len() {
524 self.load_block(self.current_block + 1);
525 self.seek(target); }
527 return;
528 }
529
530 let block_idx = match self.list.skip_list.binary_search_by(|&(base, last, _)| {
532 if target < base {
533 std::cmp::Ordering::Greater
534 } else if target > last {
535 std::cmp::Ordering::Less
536 } else {
537 std::cmp::Ordering::Equal
538 }
539 }) {
540 Ok(idx) => idx,
541 Err(idx) => idx, };
543
544 if block_idx >= self.list.skip_list.len() {
545 self.exhausted = true;
546 return;
547 }
548
549 self.load_block(block_idx);
550
551 while self.position_in_block < self.block_postings.len()
553 && self.block_postings[self.position_in_block].doc_id < target
554 {
555 self.position_in_block += 1;
556 }
557
558 if self.position_in_block >= self.block_postings.len() {
559 self.load_block(self.current_block + 1);
560 }
561 }
562}
563
564#[cfg(test)]
565mod tests {
566 use super::*;
567
568 #[test]
569 fn test_position_encoding() {
570 let pos = encode_position(0, 5);
572 assert_eq!(decode_element_ordinal(pos), 0);
573 assert_eq!(decode_token_position(pos), 5);
574
575 let pos = encode_position(3, 100);
577 assert_eq!(decode_element_ordinal(pos), 3);
578 assert_eq!(decode_token_position(pos), 100);
579
580 let pos = encode_position(MAX_ELEMENT_ORDINAL, MAX_TOKEN_POSITION);
582 assert_eq!(decode_element_ordinal(pos), MAX_ELEMENT_ORDINAL);
583 assert_eq!(decode_token_position(pos), MAX_TOKEN_POSITION);
584 }
585
586 #[test]
587 fn test_position_posting_list_build() {
588 let postings = vec![
590 PostingWithPositions {
591 doc_id: 1,
592 term_freq: 2,
593 positions: vec![encode_position(0, 0), encode_position(0, 2)],
594 },
595 PostingWithPositions {
596 doc_id: 3,
597 term_freq: 1,
598 positions: vec![encode_position(1, 0)],
599 },
600 ];
601
602 let list = PositionPostingList::from_postings(&postings).unwrap();
603 assert_eq!(list.doc_count(), 2);
604
605 let pos = list.get_positions(1).unwrap();
607 assert_eq!(pos.len(), 2);
608
609 let pos = list.get_positions(3).unwrap();
610 assert_eq!(pos.len(), 1);
611
612 assert!(list.get_positions(2).is_none());
614 assert!(list.get_positions(99).is_none());
615 }
616
617 #[test]
618 fn test_serialization_roundtrip() {
619 let postings = vec![
620 PostingWithPositions {
621 doc_id: 1,
622 term_freq: 2,
623 positions: vec![encode_position(0, 0), encode_position(0, 5)],
624 },
625 PostingWithPositions {
626 doc_id: 3,
627 term_freq: 1,
628 positions: vec![encode_position(1, 0)],
629 },
630 PostingWithPositions {
631 doc_id: 5,
632 term_freq: 1,
633 positions: vec![encode_position(0, 10)],
634 },
635 ];
636
637 let list = PositionPostingList::from_postings(&postings).unwrap();
638
639 let mut bytes = Vec::new();
640 list.serialize(&mut bytes).unwrap();
641
642 let mut cursor = std::io::Cursor::new(&bytes);
643 let deserialized = PositionPostingList::deserialize(&mut cursor).unwrap();
644
645 assert_eq!(list.doc_count(), deserialized.doc_count());
646
647 let pos = deserialized.get_positions(1).unwrap();
649 assert_eq!(pos, vec![encode_position(0, 0), encode_position(0, 5)]);
650
651 let pos = deserialized.get_positions(3).unwrap();
652 assert_eq!(pos, vec![encode_position(1, 0)]);
653 }
654
655 #[test]
656 fn test_binary_search_many_blocks() {
657 let mut postings = Vec::new();
659 for i in 0..300 {
660 postings.push(PostingWithPositions {
661 doc_id: i * 2, term_freq: 1,
663 positions: vec![encode_position(0, i)],
664 });
665 }
666
667 let list = PositionPostingList::from_postings(&postings).unwrap();
668 assert_eq!(list.doc_count(), 300);
669
670 assert_eq!(list.skip_list.len(), 3);
672
673 let pos = list.get_positions(0).unwrap();
675 assert_eq!(pos, vec![encode_position(0, 0)]);
676
677 let pos = list.get_positions(256).unwrap(); assert_eq!(pos, vec![encode_position(0, 128)]);
679
680 let pos = list.get_positions(598).unwrap(); assert_eq!(pos, vec![encode_position(0, 299)]);
682
683 assert!(list.get_positions(1).is_none());
685 assert!(list.get_positions(257).is_none());
686 }
687
688 #[test]
689 fn test_concatenate_blocks_merge() {
690 let postings1 = vec![
692 PostingWithPositions {
693 doc_id: 0,
694 term_freq: 1,
695 positions: vec![0],
696 },
697 PostingWithPositions {
698 doc_id: 1,
699 term_freq: 1,
700 positions: vec![5],
701 },
702 PostingWithPositions {
703 doc_id: 2,
704 term_freq: 1,
705 positions: vec![10],
706 },
707 ];
708 let list1 = PositionPostingList::from_postings(&postings1).unwrap();
709
710 let postings2 = vec![
711 PostingWithPositions {
712 doc_id: 0,
713 term_freq: 1,
714 positions: vec![100],
715 },
716 PostingWithPositions {
717 doc_id: 1,
718 term_freq: 1,
719 positions: vec![105],
720 },
721 ];
722 let list2 = PositionPostingList::from_postings(&postings2).unwrap();
723
724 let combined = PositionPostingList::concatenate_blocks(&[
726 (list1, 0), (list2, 3), ])
729 .unwrap();
730
731 assert_eq!(combined.doc_count(), 5);
732
733 assert!(combined.get_positions(0).is_some());
735 assert!(combined.get_positions(1).is_some());
736 assert!(combined.get_positions(2).is_some());
737 assert!(combined.get_positions(3).is_some()); assert!(combined.get_positions(4).is_some()); }
740
741 #[test]
742 fn test_iterator() {
743 let postings = vec![
744 PostingWithPositions {
745 doc_id: 1,
746 term_freq: 2,
747 positions: vec![0, 5],
748 },
749 PostingWithPositions {
750 doc_id: 3,
751 term_freq: 1,
752 positions: vec![10],
753 },
754 PostingWithPositions {
755 doc_id: 5,
756 term_freq: 1,
757 positions: vec![15],
758 },
759 ];
760
761 let list = PositionPostingList::from_postings(&postings).unwrap();
762 let mut iter = list.iter();
763
764 assert_eq!(iter.doc(), 1);
765 assert_eq!(iter.positions(), &[0, 5]);
766
767 iter.advance();
768 assert_eq!(iter.doc(), 3);
769
770 iter.seek(5);
771 assert_eq!(iter.doc(), 5);
772 assert_eq!(iter.positions(), &[15]);
773
774 iter.advance();
775 assert_eq!(iter.doc(), u32::MAX); }
777}