1use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
26use std::io::{self, Write};
27
28use super::posting_common::{read_vint, write_vint};
29use crate::DocId;
30
/// Number of postings packed into one skip-list block.
pub const POSITION_BLOCK_SIZE: usize = 128;

/// Token position occupies the low 20 bits of a packed position value.
pub const MAX_TOKEN_POSITION: u32 = (1 << 20) - 1;

/// Element ordinal occupies the high 12 bits of a packed position value.
pub const MAX_ELEMENT_ORDINAL: u32 = (1 << 12) - 1;
39
40#[inline]
42pub fn encode_position(element_ordinal: u32, token_position: u32) -> u32 {
43 debug_assert!(
44 element_ordinal <= MAX_ELEMENT_ORDINAL,
45 "Element ordinal {} exceeds maximum {}",
46 element_ordinal,
47 MAX_ELEMENT_ORDINAL
48 );
49 debug_assert!(
50 token_position <= MAX_TOKEN_POSITION,
51 "Token position {} exceeds maximum {}",
52 token_position,
53 MAX_TOKEN_POSITION
54 );
55 (element_ordinal << 20) | (token_position & MAX_TOKEN_POSITION)
56}
57
/// Extract the element ordinal (upper 12 bits) from a packed position.
#[inline]
pub fn decode_element_ordinal(position: u32) -> u32 {
    position >> 20
}
63
/// Extract the token position (lower 20 bits) from a packed position.
#[inline]
pub fn decode_token_position(position: u32) -> u32 {
    position & MAX_TOKEN_POSITION
}
69
/// One document's posting together with its packed positions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PostingWithPositions {
    // Document this posting belongs to.
    pub doc_id: DocId,
    // Number of term occurrences; by convention equals `positions.len()`.
    pub term_freq: u32,
    // Packed positions (see `encode_position`), one per occurrence.
    pub positions: Vec<u32>,
}
78
/// Block-compressed posting list that stores per-document positions.
///
/// Postings are grouped into blocks of up to `POSITION_BLOCK_SIZE` docs.
/// Each block starts with a `u32` posting count and a `u32` absolute first
/// doc id, followed by vint-encoded doc-id deltas and position lists.
#[derive(Debug, Clone)]
pub struct PositionPostingList {
    // One entry per block: (first doc id, last doc id, byte offset into `data`).
    skip_list: Vec<(DocId, DocId, u64)>,
    // Concatenated encoded blocks.
    data: Vec<u8>,
    // Total number of documents across all blocks.
    doc_count: u32,
}
93
impl Default for PositionPostingList {
    /// Equivalent to `PositionPostingList::new()`.
    fn default() -> Self {
        Self::new()
    }
}
99
100impl PositionPostingList {
101 pub fn new() -> Self {
102 Self {
103 skip_list: Vec::new(),
104 data: Vec::new(),
105 doc_count: 0,
106 }
107 }
108
109 pub fn with_capacity(capacity: usize) -> Self {
110 Self {
111 skip_list: Vec::with_capacity(capacity / POSITION_BLOCK_SIZE + 1),
112 data: Vec::with_capacity(capacity * 8), doc_count: 0,
114 }
115 }
116
117 pub fn from_postings(postings: &[PostingWithPositions]) -> io::Result<Self> {
119 if postings.is_empty() {
120 return Ok(Self::new());
121 }
122
123 let mut skip_list = Vec::new();
124 let mut data = Vec::new();
125 let mut i = 0;
126
127 while i < postings.len() {
128 let block_start = data.len() as u64;
129 let block_end = (i + POSITION_BLOCK_SIZE).min(postings.len());
130 let block = &postings[i..block_end];
131
132 let base_doc_id = block.first().unwrap().doc_id;
134 let last_doc_id = block.last().unwrap().doc_id;
135 skip_list.push((base_doc_id, last_doc_id, block_start));
136
137 data.write_u32::<LittleEndian>(block.len() as u32)?;
139 data.write_u32::<LittleEndian>(base_doc_id)?;
140
141 let mut prev_doc_id = base_doc_id;
142 for (j, posting) in block.iter().enumerate() {
143 if j > 0 {
144 let delta = posting.doc_id - prev_doc_id;
145 write_vint(&mut data, delta as u64)?;
146 }
147 prev_doc_id = posting.doc_id;
148
149 write_vint(&mut data, posting.positions.len() as u64)?;
151 for &pos in &posting.positions {
152 write_vint(&mut data, pos as u64)?;
153 }
154 }
155
156 i = block_end;
157 }
158
159 Ok(Self {
160 skip_list,
161 data,
162 doc_count: postings.len() as u32,
163 })
164 }
165
166 pub fn push(&mut self, doc_id: DocId, positions: Vec<u32>) {
168 let posting = PostingWithPositions {
171 doc_id,
172 term_freq: positions.len() as u32,
173 positions,
174 };
175
176 let block_start = self.data.len() as u64;
178
179 let need_new_block =
181 self.skip_list.is_empty() || self.doc_count.is_multiple_of(POSITION_BLOCK_SIZE as u32);
182
183 if need_new_block {
184 self.skip_list.push((doc_id, doc_id, block_start));
186 self.data.write_u32::<LittleEndian>(1u32).unwrap();
187 self.data.write_u32::<LittleEndian>(doc_id).unwrap();
188 } else {
189 let last_block = self.skip_list.last_mut().unwrap();
191 let prev_doc_id = last_block.1;
192 last_block.1 = doc_id;
193
194 let count_offset = last_block.2 as usize;
196 let old_count = u32::from_le_bytes(
197 self.data[count_offset..count_offset + 4]
198 .try_into()
199 .unwrap(),
200 );
201 self.data[count_offset..count_offset + 4]
202 .copy_from_slice(&(old_count + 1).to_le_bytes());
203
204 let delta = doc_id - prev_doc_id;
205 write_vint(&mut self.data, delta as u64).unwrap();
206 }
207
208 write_vint(&mut self.data, posting.positions.len() as u64).unwrap();
210 for &pos in &posting.positions {
211 write_vint(&mut self.data, pos as u64).unwrap();
212 }
213
214 self.doc_count += 1;
215 }
216
217 pub fn doc_count(&self) -> u32 {
218 self.doc_count
219 }
220
221 pub fn len(&self) -> usize {
222 self.doc_count as usize
223 }
224
225 pub fn is_empty(&self) -> bool {
226 self.doc_count == 0
227 }
228
229 pub fn get_positions_into(&self, target_doc_id: DocId, out: &mut Vec<u32>) -> bool {
232 out.clear();
233 self.get_positions_impl(target_doc_id, Some(out)).is_some()
234 }
235
236 pub fn get_positions(&self, target_doc_id: DocId) -> Option<Vec<u32>> {
238 self.get_positions_impl(target_doc_id, None)
239 }
240
    /// Shared lookup: locate the block covering `target_doc_id` via the skip
    /// list, then linearly decode the block until the doc is found.
    ///
    /// When `out` is `Some`, positions are appended to it and an *empty*
    /// `Some(Vec::new())` is returned purely as a "found" signal for
    /// `get_positions_into`. When `out` is `None`, the positions are
    /// returned directly. Returns `None` if the doc id is absent or the
    /// block data is malformed.
    fn get_positions_impl(
        &self,
        target_doc_id: DocId,
        mut out: Option<&mut Vec<u32>>,
    ) -> Option<Vec<u32>> {
        if self.skip_list.is_empty() {
            return None;
        }

        // Binary search for the block whose [base, last] doc-id range
        // contains the target; the comparator orders ranges vs. the probe.
        let block_idx = match self.skip_list.binary_search_by(|&(base, last, _)| {
            if target_doc_id < base {
                std::cmp::Ordering::Greater
            } else if target_doc_id > last {
                std::cmp::Ordering::Less
            } else {
                std::cmp::Ordering::Equal
            }
        }) {
            Ok(idx) => idx,
            // No block's range covers the target.
            Err(_) => return None,
        };

        let offset = self.skip_list[block_idx].2 as usize;
        let mut reader = &self.data[offset..];

        // Block header: posting count + absolute first doc id.
        let count = reader.read_u32::<LittleEndian>().ok()? as usize;
        let first_doc = reader.read_u32::<LittleEndian>().ok()?;
        let mut prev_doc_id = first_doc;

        for i in 0..count {
            // Doc ids after the first are delta-encoded vints.
            let doc_id = if i == 0 {
                first_doc
            } else {
                let delta = read_vint(&mut reader).ok()? as u32;
                prev_doc_id + delta
            };
            prev_doc_id = doc_id;

            let num_positions = read_vint(&mut reader).ok()? as usize;

            if doc_id == target_doc_id {
                if let Some(buf) = &mut out {
                    buf.reserve(num_positions);
                    for _ in 0..num_positions {
                        let pos = read_vint(&mut reader).ok()? as u32;
                        buf.push(pos);
                    }
                    // Sentinel: positions went into `out`; the empty Vec
                    // only signals "found".
                    return Some(Vec::new());
                }
                let mut positions = Vec::with_capacity(num_positions);
                for _ in 0..num_positions {
                    let pos = read_vint(&mut reader).ok()? as u32;
                    positions.push(pos);
                }
                return Some(positions);
            } else {
                // Not the target: skip this posting's position vints.
                for _ in 0..num_positions {
                    if read_vint(&mut reader).is_err() {
                        return None;
                    }
                }
            }
        }

        None
    }
313
314 const SKIP_ENTRY_SIZE: usize = 20;
317
318 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
327 writer.write_all(&self.data)?;
329
330 for (i, (base_doc_id, last_doc_id, offset)) in self.skip_list.iter().enumerate() {
332 let next_offset = if i + 1 < self.skip_list.len() {
333 self.skip_list[i + 1].2
334 } else {
335 self.data.len() as u64
336 };
337 let length = (next_offset - offset) as u32;
338 writer.write_u32::<LittleEndian>(*base_doc_id)?;
339 writer.write_u32::<LittleEndian>(*last_doc_id)?;
340 writer.write_u64::<LittleEndian>(*offset)?;
341 writer.write_u32::<LittleEndian>(length)?;
342 }
343
344 writer.write_u64::<LittleEndian>(self.data.len() as u64)?;
346 writer.write_u32::<LittleEndian>(self.skip_list.len() as u32)?;
347 writer.write_u32::<LittleEndian>(self.doc_count)?;
348
349 Ok(())
350 }
351
352 pub fn deserialize(raw: &[u8]) -> io::Result<Self> {
354 if raw.len() < 16 {
355 return Err(io::Error::new(
356 io::ErrorKind::InvalidData,
357 "position data too short",
358 ));
359 }
360
361 let f = raw.len() - 16;
363 let data_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
364 let skip_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
365 let doc_count = u32::from_le_bytes(raw[f + 12..f + 16].try_into().unwrap());
366
367 let mut skip_list = Vec::with_capacity(skip_count);
369 let mut pos = data_len;
370 for _ in 0..skip_count {
371 let base = u32::from_le_bytes(raw[pos..pos + 4].try_into().unwrap());
372 let last = u32::from_le_bytes(raw[pos + 4..pos + 8].try_into().unwrap());
373 let offset = u64::from_le_bytes(raw[pos + 8..pos + 16].try_into().unwrap());
374 skip_list.push((base, last, offset));
376 pos += Self::SKIP_ENTRY_SIZE;
377 }
378
379 let data = raw[..data_len].to_vec();
380
381 Ok(Self {
382 skip_list,
383 data,
384 doc_count,
385 })
386 }
387
    /// Merge lists by concatenating their blocks, adding each source's
    /// `doc_offset` to every doc id.
    ///
    /// Only the block header's absolute first doc id needs rewriting: the
    /// remaining doc ids are deltas and positions are offset-independent,
    /// so the rest of each block is copied verbatim. Sources are assumed to
    /// be ordered so that rebased doc-id ranges do not overlap.
    pub fn concatenate_blocks(sources: &[(PositionPostingList, u32)]) -> io::Result<Self> {
        let mut skip_list = Vec::new();
        let mut data = Vec::new();
        let mut total_docs = 0u32;

        for (source, doc_offset) in sources {
            for block_idx in 0..source.skip_list.len() {
                let (base, last, src_offset) = source.skip_list[block_idx];
                // A block ends where the next begins (or at end of data).
                let next_offset = if block_idx + 1 < source.skip_list.len() {
                    source.skip_list[block_idx + 1].2 as usize
                } else {
                    source.data.len()
                };

                let new_base = base + doc_offset;
                let new_last = last + doc_offset;
                let new_offset = data.len() as u64;

                let block_bytes = &source.data[src_offset as usize..next_offset];

                // Header: count stays, first doc id is rebased.
                let count = u32::from_le_bytes(block_bytes[0..4].try_into().unwrap());
                let first_doc = u32::from_le_bytes(block_bytes[4..8].try_into().unwrap());

                data.write_u32::<LittleEndian>(count)?;
                data.write_u32::<LittleEndian>(first_doc + doc_offset)?;
                // Payload (deltas + positions) copied untouched.
                data.extend_from_slice(&block_bytes[8..]);

                skip_list.push((new_base, new_last, new_offset));
                total_docs += count;
            }
        }

        Ok(Self {
            skip_list,
            data,
            doc_count: total_docs,
        })
    }
430
431 pub fn concatenate_streaming<W: Write>(
442 sources: &[(&[u8], u32)],
443 writer: &mut W,
444 ) -> io::Result<(u32, usize)> {
445 struct SourceMeta {
447 data_len: usize,
448 skip_count: usize,
449 }
450
451 let mut metas: Vec<SourceMeta> = Vec::with_capacity(sources.len());
452 let mut total_docs = 0u32;
453
454 for (raw, _) in sources {
455 if raw.len() < 16 {
456 continue;
457 }
458 let f = raw.len() - 16;
459 let data_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
460 let skip_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
461 let doc_count = u32::from_le_bytes(raw[f + 12..f + 16].try_into().unwrap());
462 total_docs += doc_count;
463 metas.push(SourceMeta {
464 data_len,
465 skip_count,
466 });
467 }
468
469 let mut out_skip: Vec<u8> = Vec::new();
472 let mut out_skip_count = 0u32;
473 let mut data_written = 0u64;
474 let mut patch_buf = [0u8; 8];
475 let es = Self::SKIP_ENTRY_SIZE;
476
477 for (src_idx, meta) in metas.iter().enumerate() {
478 let (raw, doc_offset) = &sources[src_idx];
479 let skip_base = meta.data_len;
480 let data = &raw[..meta.data_len];
481
482 for i in 0..meta.skip_count {
483 let p = skip_base + i * es;
485 let base = u32::from_le_bytes(raw[p..p + 4].try_into().unwrap());
486 let last = u32::from_le_bytes(raw[p + 4..p + 8].try_into().unwrap());
487 let offset = u64::from_le_bytes(raw[p + 8..p + 16].try_into().unwrap());
488 let length = u32::from_le_bytes(raw[p + 16..p + 20].try_into().unwrap());
489
490 let block = &data[offset as usize..(offset as usize + length as usize)];
491
492 out_skip.extend_from_slice(&(base + doc_offset).to_le_bytes());
494 out_skip.extend_from_slice(&(last + doc_offset).to_le_bytes());
495 out_skip.extend_from_slice(&data_written.to_le_bytes());
496 out_skip.extend_from_slice(&length.to_le_bytes());
497 out_skip_count += 1;
498
499 patch_buf[0..4].copy_from_slice(&block[0..4]);
501 let first_doc = u32::from_le_bytes(block[4..8].try_into().unwrap());
502 patch_buf[4..8].copy_from_slice(&(first_doc + doc_offset).to_le_bytes());
503 writer.write_all(&patch_buf)?;
504 writer.write_all(&block[8..])?;
505
506 data_written += block.len() as u64;
507 }
508 }
509
510 writer.write_all(&out_skip)?;
512
513 writer.write_u64::<LittleEndian>(data_written)?;
514 writer.write_u32::<LittleEndian>(out_skip_count)?;
515 writer.write_u32::<LittleEndian>(total_docs)?;
516
517 let total_bytes = data_written as usize + out_skip.len() + 16;
518 Ok((total_docs, total_bytes))
519 }
520
    /// Create a block-decoding iterator positioned at the first posting.
    pub fn iter(&self) -> PositionPostingIterator<'_> {
        PositionPostingIterator::new(self)
    }
525}
526
/// Forward iterator over a `PositionPostingList` that decodes one block at a
/// time into flat reusable buffers.
pub struct PositionPostingIterator<'a> {
    list: &'a PositionPostingList,
    // Index of the currently decoded block in `list.skip_list`.
    current_block: usize,
    // Cursor within the decoded block, in `0..block_count`.
    position_in_block: usize,
    // Number of postings in the decoded block.
    block_count: usize,
    // Decoded doc ids for the current block.
    block_doc_ids: Vec<DocId>,
    // Position counts per doc for the current block.
    block_term_freqs: Vec<u32>,
    // All positions of the current block, concatenated.
    block_positions: Vec<u32>,
    // Prefix offsets into `block_positions`; doc i's range is
    // offsets[i]..offsets[i + 1] (one extra trailing entry).
    block_pos_offsets: Vec<usize>,
    // True once iteration has moved past the last block.
    exhausted: bool,
}
548
549impl<'a> PositionPostingIterator<'a> {
550 pub fn new(list: &'a PositionPostingList) -> Self {
551 let exhausted = list.skip_list.is_empty();
552 let mut iter = Self {
553 list,
554 current_block: 0,
555 position_in_block: 0,
556 block_count: 0,
557 block_doc_ids: Vec::with_capacity(POSITION_BLOCK_SIZE),
558 block_term_freqs: Vec::with_capacity(POSITION_BLOCK_SIZE),
559 block_positions: Vec::new(),
560 block_pos_offsets: Vec::with_capacity(POSITION_BLOCK_SIZE + 1),
561 exhausted,
562 };
563 if !iter.exhausted {
564 iter.load_block(0);
565 }
566 iter
567 }
568
    /// Decode block `block_idx` into the flat buffers, resetting the cursor.
    /// Marks the iterator exhausted when `block_idx` is past the last block.
    fn load_block(&mut self, block_idx: usize) {
        if block_idx >= self.list.skip_list.len() {
            self.exhausted = true;
            return;
        }

        self.current_block = block_idx;
        self.position_in_block = 0;

        let offset = self.list.skip_list[block_idx].2 as usize;
        let mut reader = &self.list.data[offset..];

        // Block header: posting count + absolute first doc id.
        // NOTE(review): `unwrap_or(0)` means corrupt data decodes as zeros
        // rather than surfacing an error — confirm this is intentional.
        let count = reader.read_u32::<LittleEndian>().unwrap_or(0) as usize;
        let first_doc = reader.read_u32::<LittleEndian>().unwrap_or(0);

        self.block_count = count;
        self.block_doc_ids.clear();
        self.block_term_freqs.clear();
        self.block_positions.clear();
        self.block_pos_offsets.clear();

        let mut prev_doc_id = first_doc;

        for i in 0..count {
            // Doc ids after the first are delta-encoded vints.
            let doc_id = if i == 0 {
                first_doc
            } else {
                let delta = read_vint(&mut reader).unwrap_or(0) as u32;
                prev_doc_id + delta
            };
            prev_doc_id = doc_id;

            let num_positions = read_vint(&mut reader).unwrap_or(0) as usize;
            self.block_doc_ids.push(doc_id);
            self.block_term_freqs.push(num_positions as u32);
            // Record where this doc's positions start.
            self.block_pos_offsets.push(self.block_positions.len());
            for _ in 0..num_positions {
                let pos = read_vint(&mut reader).unwrap_or(0) as u32;
                self.block_positions.push(pos);
            }
        }
        // Trailing sentinel so doc i's range is offsets[i]..offsets[i + 1].
        self.block_pos_offsets.push(self.block_positions.len());
    }
614
615 pub fn doc(&self) -> DocId {
616 if self.exhausted || self.position_in_block >= self.block_count {
617 u32::MAX
618 } else {
619 self.block_doc_ids[self.position_in_block]
620 }
621 }
622
623 pub fn term_freq(&self) -> u32 {
624 if self.exhausted || self.position_in_block >= self.block_count {
625 0
626 } else {
627 self.block_term_freqs[self.position_in_block]
628 }
629 }
630
631 pub fn positions(&self) -> &[u32] {
632 if self.exhausted || self.position_in_block >= self.block_count {
633 &[]
634 } else {
635 let start = self.block_pos_offsets[self.position_in_block];
636 let end = self.block_pos_offsets[self.position_in_block + 1];
637 &self.block_positions[start..end]
638 }
639 }
640
641 pub fn advance(&mut self) {
642 if self.exhausted {
643 return;
644 }
645
646 self.position_in_block += 1;
647 if self.position_in_block >= self.block_count {
648 self.load_block(self.current_block + 1);
649 }
650 }
651
    /// Advance to the first posting with doc id >= `target`.
    ///
    /// Fast path: if the target falls inside the currently decoded block,
    /// scan forward within it (SIMD-assisted). Otherwise binary-search the
    /// skip list for the right block, decode it, and scan. Only moves
    /// forward; seeking backwards keeps the current position.
    pub fn seek(&mut self, target: DocId) {
        if self.exhausted {
            return;
        }

        // Fast path: target is within the current block's doc-id range.
        if let Some((_, last, _)) = self.list.skip_list.get(self.current_block)
            && target <= *last
        {
            let remaining = &self.block_doc_ids[self.position_in_block..self.block_count];
            let offset = crate::structures::simd::find_first_ge_u32(remaining, target);
            self.position_in_block += offset;
            if self.position_in_block >= self.block_count {
                // Block exhausted: fall through to the next block and retry.
                self.load_block(self.current_block + 1);
                self.seek(target);
            }
            return;
        }

        // Slow path: binary-search for the block whose range covers target;
        // Err(idx) is the first block starting after target, which is where
        // the first doc id >= target lives.
        let block_idx = match self.list.skip_list.binary_search_by(|&(base, last, _)| {
            if target < base {
                std::cmp::Ordering::Greater
            } else if target > last {
                std::cmp::Ordering::Less
            } else {
                std::cmp::Ordering::Equal
            }
        }) {
            Ok(idx) => idx,
            Err(idx) => idx,
        };

        if block_idx >= self.list.skip_list.len() {
            self.exhausted = true;
            return;
        }

        self.load_block(block_idx);

        let remaining = &self.block_doc_ids[self.position_in_block..self.block_count];
        let offset = crate::structures::simd::find_first_ge_u32(remaining, target);
        self.position_in_block += offset;

        if self.position_in_block >= self.block_count {
            self.load_block(self.current_block + 1);
        }
    }
702}
703
#[cfg(test)]
mod tests {
    use super::*;

    // Round-trip the ordinal/token bit packing at zero, mid-range, and max.
    #[test]
    fn test_position_encoding() {
        let pos = encode_position(0, 5);
        assert_eq!(decode_element_ordinal(pos), 0);
        assert_eq!(decode_token_position(pos), 5);

        let pos = encode_position(3, 100);
        assert_eq!(decode_element_ordinal(pos), 3);
        assert_eq!(decode_token_position(pos), 100);

        let pos = encode_position(MAX_ELEMENT_ORDINAL, MAX_TOKEN_POSITION);
        assert_eq!(decode_element_ordinal(pos), MAX_ELEMENT_ORDINAL);
        assert_eq!(decode_token_position(pos), MAX_TOKEN_POSITION);
    }

    // Build from sorted postings and look up present/absent doc ids.
    #[test]
    fn test_position_posting_list_build() {
        let postings = vec![
            PostingWithPositions {
                doc_id: 1,
                term_freq: 2,
                positions: vec![encode_position(0, 0), encode_position(0, 2)],
            },
            PostingWithPositions {
                doc_id: 3,
                term_freq: 1,
                positions: vec![encode_position(1, 0)],
            },
        ];

        let list = PositionPostingList::from_postings(&postings).unwrap();
        assert_eq!(list.doc_count(), 2);

        let pos = list.get_positions(1).unwrap();
        assert_eq!(pos.len(), 2);

        let pos = list.get_positions(3).unwrap();
        assert_eq!(pos.len(), 1);

        // Absent ids — inside and outside the block range — return None.
        assert!(list.get_positions(2).is_none());
        assert!(list.get_positions(99).is_none());
    }

    // serialize -> deserialize preserves doc count and positions.
    #[test]
    fn test_serialization_roundtrip() {
        let postings = vec![
            PostingWithPositions {
                doc_id: 1,
                term_freq: 2,
                positions: vec![encode_position(0, 0), encode_position(0, 5)],
            },
            PostingWithPositions {
                doc_id: 3,
                term_freq: 1,
                positions: vec![encode_position(1, 0)],
            },
            PostingWithPositions {
                doc_id: 5,
                term_freq: 1,
                positions: vec![encode_position(0, 10)],
            },
        ];

        let list = PositionPostingList::from_postings(&postings).unwrap();

        let mut bytes = Vec::new();
        list.serialize(&mut bytes).unwrap();

        let deserialized = PositionPostingList::deserialize(&bytes).unwrap();

        assert_eq!(list.doc_count(), deserialized.doc_count());

        let pos = deserialized.get_positions(1).unwrap();
        assert_eq!(pos, vec![encode_position(0, 0), encode_position(0, 5)]);

        let pos = deserialized.get_positions(3).unwrap();
        assert_eq!(pos, vec![encode_position(1, 0)]);
    }

    // 300 docs -> 3 blocks of up to POSITION_BLOCK_SIZE (128) postings;
    // lookups must hit the correct block via the skip-list binary search.
    #[test]
    fn test_binary_search_many_blocks() {
        let mut postings = Vec::new();
        for i in 0..300 {
            postings.push(PostingWithPositions {
                doc_id: i * 2,
                term_freq: 1,
                positions: vec![encode_position(0, i)],
            });
        }

        let list = PositionPostingList::from_postings(&postings).unwrap();
        assert_eq!(list.doc_count(), 300);

        assert_eq!(list.skip_list.len(), 3);

        let pos = list.get_positions(0).unwrap();
        assert_eq!(pos, vec![encode_position(0, 0)]);

        let pos = list.get_positions(256).unwrap();
        assert_eq!(pos, vec![encode_position(0, 128)]);

        let pos = list.get_positions(598).unwrap();
        assert_eq!(pos, vec![encode_position(0, 299)]);

        // Odd doc ids were never inserted.
        assert!(list.get_positions(1).is_none());
        assert!(list.get_positions(257).is_none());
    }

    // Merge two lists with a doc-id offset for the second.
    #[test]
    fn test_concatenate_blocks_merge() {
        let postings1 = vec![
            PostingWithPositions {
                doc_id: 0,
                term_freq: 1,
                positions: vec![0],
            },
            PostingWithPositions {
                doc_id: 1,
                term_freq: 1,
                positions: vec![5],
            },
            PostingWithPositions {
                doc_id: 2,
                term_freq: 1,
                positions: vec![10],
            },
        ];
        let list1 = PositionPostingList::from_postings(&postings1).unwrap();

        let postings2 = vec![
            PostingWithPositions {
                doc_id: 0,
                term_freq: 1,
                positions: vec![100],
            },
            PostingWithPositions {
                doc_id: 1,
                term_freq: 1,
                positions: vec![105],
            },
        ];
        let list2 = PositionPostingList::from_postings(&postings2).unwrap();

        // list2 is rebased past list1 (offset 3).
        let combined = PositionPostingList::concatenate_blocks(&[
            (list1, 0),
            (list2, 3),
        ])
        .unwrap();

        assert_eq!(combined.doc_count(), 5);

        assert!(combined.get_positions(0).is_some());
        assert!(combined.get_positions(1).is_some());
        assert!(combined.get_positions(2).is_some());
        assert!(combined.get_positions(3).is_some());
        assert!(combined.get_positions(4).is_some());
    }

    // advance() and seek() across postings; u32::MAX marks exhaustion.
    #[test]
    fn test_iterator() {
        let postings = vec![
            PostingWithPositions {
                doc_id: 1,
                term_freq: 2,
                positions: vec![0, 5],
            },
            PostingWithPositions {
                doc_id: 3,
                term_freq: 1,
                positions: vec![10],
            },
            PostingWithPositions {
                doc_id: 5,
                term_freq: 1,
                positions: vec![15],
            },
        ];

        let list = PositionPostingList::from_postings(&postings).unwrap();
        let mut iter = list.iter();

        assert_eq!(iter.doc(), 1);
        assert_eq!(iter.positions(), &[0, 5]);

        iter.advance();
        assert_eq!(iter.doc(), 3);

        iter.seek(5);
        assert_eq!(iter.doc(), 5);
        assert_eq!(iter.positions(), &[15]);

        iter.advance();
        assert_eq!(iter.doc(), u32::MAX);
    }

    // Helper: build a list from (doc_id, positions) pairs.
    fn build_ppl(entries: &[(u32, Vec<u32>)]) -> PositionPostingList {
        let postings: Vec<PostingWithPositions> = entries
            .iter()
            .map(|(doc_id, positions)| PostingWithPositions {
                doc_id: *doc_id,
                term_freq: positions.len() as u32,
                positions: positions.clone(),
            })
            .collect();
        PositionPostingList::from_postings(&postings).unwrap()
    }

    // Helper: serialize a list to bytes.
    fn serialize_ppl(ppl: &PositionPostingList) -> Vec<u8> {
        let mut buf = Vec::new();
        ppl.serialize(&mut buf).unwrap();
        buf
    }

    // Helper: drain a list into (doc_id, positions) pairs via its iterator.
    fn collect_positions(ppl: &PositionPostingList) -> Vec<(u32, Vec<u32>)> {
        let mut result = Vec::new();
        let mut it = ppl.iter();
        while it.doc() != u32::MAX {
            result.push((it.doc(), it.positions().to_vec()));
            it.advance();
        }
        result
    }

    // Streaming merge must produce the same postings as the in-memory merge.
    #[test]
    fn test_concatenate_streaming_matches_blocks() {
        let seg_a: Vec<(u32, Vec<u32>)> = (0..150)
            .map(|i| (i * 2, vec![i * 10, i * 10 + 3]))
            .collect();
        let seg_b: Vec<(u32, Vec<u32>)> = (0..100).map(|i| (i * 5, vec![i * 7])).collect();
        let seg_c: Vec<(u32, Vec<u32>)> = (0..80).map(|i| (i * 3, vec![i, i + 1, i + 2])).collect();

        let ppl_a = build_ppl(&seg_a);
        let ppl_b = build_ppl(&seg_b);
        let ppl_c = build_ppl(&seg_c);

        let offset_b = 500u32;
        let offset_c = 1000u32;

        // Reference: in-memory block concatenation.
        let ref_merged = PositionPostingList::concatenate_blocks(&[
            (ppl_a.clone(), 0),
            (ppl_b.clone(), offset_b),
            (ppl_c.clone(), offset_c),
        ])
        .unwrap();
        let mut ref_buf = Vec::new();
        ref_merged.serialize(&mut ref_buf).unwrap();

        let bytes_a = serialize_ppl(&ppl_a);
        let bytes_b = serialize_ppl(&ppl_b);
        let bytes_c = serialize_ppl(&ppl_c);

        let sources: Vec<(&[u8], u32)> =
            vec![(&bytes_a, 0), (&bytes_b, offset_b), (&bytes_c, offset_c)];
        let mut stream_buf = Vec::new();
        let (doc_count, bytes_written) =
            PositionPostingList::concatenate_streaming(&sources, &mut stream_buf).unwrap();

        assert_eq!(doc_count, 330);
        assert_eq!(bytes_written, stream_buf.len());

        let ref_posts = collect_positions(&PositionPostingList::deserialize(&ref_buf).unwrap());
        let stream_posts =
            collect_positions(&PositionPostingList::deserialize(&stream_buf).unwrap());

        assert_eq!(ref_posts.len(), stream_posts.len());
        for (i, (r, s)) in ref_posts.iter().zip(stream_posts.iter()).enumerate() {
            assert_eq!(r.0, s.0, "doc_id mismatch at {}", i);
            assert_eq!(r.1, s.1, "positions mismatch at doc {}", r.0);
        }
    }

    // Merge pairwise, then merge the merges — output must stay consistent.
    #[test]
    fn test_positions_multi_round_merge() {
        let segments: Vec<Vec<(u32, Vec<u32>)>> = (0..4)
            .map(|seg| {
                (0..200)
                    .map(|i| {
                        let pos_count = (i % 3) + 1;
                        let positions: Vec<u32> = (0..pos_count)
                            .map(|p| (seg * 1000 + i * 10 + p) as u32)
                            .collect();
                        (i as u32 * 3, positions)
                    })
                    .collect()
            })
            .collect();

        let ppls: Vec<PositionPostingList> = segments.iter().map(|s| build_ppl(s)).collect();
        let serialized: Vec<Vec<u8>> = ppls.iter().map(serialize_ppl).collect();

        // Round 1: merge segments 0+1 and 2+3.
        let mut merged_01 = Vec::new();
        let sources_01: Vec<(&[u8], u32)> = vec![(&serialized[0], 0), (&serialized[1], 600)];
        let (dc_01, _) =
            PositionPostingList::concatenate_streaming(&sources_01, &mut merged_01).unwrap();
        assert_eq!(dc_01, 400);

        let mut merged_23 = Vec::new();
        let sources_23: Vec<(&[u8], u32)> = vec![(&serialized[2], 0), (&serialized[3], 600)];
        let (dc_23, _) =
            PositionPostingList::concatenate_streaming(&sources_23, &mut merged_23).unwrap();
        assert_eq!(dc_23, 400);

        // Round 2: merge the two intermediate results.
        let mut final_merged = Vec::new();
        let sources_final: Vec<(&[u8], u32)> = vec![(&merged_01, 0), (&merged_23, 1200)];
        let (dc_final, _) =
            PositionPostingList::concatenate_streaming(&sources_final, &mut final_merged).unwrap();
        assert_eq!(dc_final, 800);

        let final_ppl = PositionPostingList::deserialize(&final_merged).unwrap();
        let all = collect_positions(&final_ppl);
        assert_eq!(all.len(), 800);

        // Spot-check the first doc of each original segment boundary.
        assert_eq!(all[0].0, 0);
        assert_eq!(all[0].1, vec![0]);
        assert_eq!(all[200].0, 600);
        assert_eq!(all[200].1, vec![1000]);
        assert_eq!(all[400].0, 1200);
        assert_eq!(all[400].1, vec![2000]);

        let mut it = final_ppl.iter();
        it.seek(1200);
        assert_eq!(it.doc(), 1200);
        assert_eq!(it.positions(), &[2000]);

        let pos = final_ppl.get_positions(600).unwrap();
        assert_eq!(pos, vec![1000]);
    }

    // Larger merge: every doc of every segment must survive with the right
    // rebased id and original positions.
    #[test]
    fn test_positions_large_scale_merge() {
        let num_segments = 5usize;
        let docs_per_segment = 500usize;

        let segments: Vec<Vec<(u32, Vec<u32>)>> = (0..num_segments)
            .map(|seg| {
                (0..docs_per_segment)
                    .map(|i| {
                        let n_pos = (i % 4) + 1;
                        let positions: Vec<u32> =
                            (0..n_pos).map(|p| (p * 5 + seg) as u32).collect();
                        (i as u32 * 2, positions)
                    })
                    .collect()
            })
            .collect();

        let ppls: Vec<PositionPostingList> = segments.iter().map(|s| build_ppl(s)).collect();
        let serialized: Vec<Vec<u8>> = ppls.iter().map(serialize_ppl).collect();

        let max_doc = (docs_per_segment as u32 - 1) * 2;
        let offsets: Vec<u32> = (0..num_segments)
            .map(|i| i as u32 * (max_doc + 1))
            .collect();

        let sources: Vec<(&[u8], u32)> = serialized
            .iter()
            .zip(offsets.iter())
            .map(|(b, o)| (b.as_slice(), *o))
            .collect();

        let mut merged = Vec::new();
        let (doc_count, _) =
            PositionPostingList::concatenate_streaming(&sources, &mut merged).unwrap();
        assert_eq!(doc_count, (num_segments * docs_per_segment) as u32);

        let merged_ppl = PositionPostingList::deserialize(&merged).unwrap();
        let all = collect_positions(&merged_ppl);
        assert_eq!(all.len(), num_segments * docs_per_segment);

        for (seg, &offset) in offsets.iter().enumerate() {
            for i in 0..docs_per_segment {
                let idx = seg * docs_per_segment + i;
                let expected_doc = i as u32 * 2 + offset;
                assert_eq!(all[idx].0, expected_doc, "seg={} i={}", seg, i);

                let n_pos = (i % 4) + 1;
                let expected_positions: Vec<u32> =
                    (0..n_pos).map(|p| (p * 5 + seg) as u32).collect();
                assert_eq!(
                    all[idx].1, expected_positions,
                    "positions mismatch seg={} i={}",
                    seg, i
                );
            }
        }
    }

    // Streaming a single source with offset 0 must be a byte-compatible
    // identity transformation (same decoded postings).
    #[test]
    fn test_positions_streaming_single_source() {
        let entries: Vec<(u32, Vec<u32>)> =
            (0..300).map(|i| (i * 4, vec![i * 2, i * 2 + 1])).collect();
        let ppl = build_ppl(&entries);
        let direct = serialize_ppl(&ppl);

        let sources: Vec<(&[u8], u32)> = vec![(&direct, 0)];
        let mut streamed = Vec::new();
        PositionPostingList::concatenate_streaming(&sources, &mut streamed).unwrap();

        let p1 = collect_positions(&PositionPostingList::deserialize(&direct).unwrap());
        let p2 = collect_positions(&PositionPostingList::deserialize(&streamed).unwrap());
        assert_eq!(p1, p2);
    }

    // Edge case: merging two single-doc lists.
    #[test]
    fn test_positions_edge_single_doc() {
        let ppl_a = build_ppl(&[(0, vec![42, 43, 44])]);
        let ppl_b = build_ppl(&[(0, vec![100])]);

        let merged = PositionPostingList::concatenate_blocks(&[(ppl_a, 0), (ppl_b, 1)]).unwrap();

        assert_eq!(merged.doc_count(), 2);
        assert_eq!(merged.get_positions(0).unwrap(), vec![42, 43, 44]);
        assert_eq!(merged.get_positions(1).unwrap(), vec![100]);
    }
}
1158}