1use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
26use std::io::{self, Write};
27
28use super::posting_common::{read_vint, write_vint};
29use crate::DocId;
30
/// Number of documents stored per skip-list block.
pub const POSITION_BLOCK_SIZE: usize = 128;

/// Maximum token position: positions occupy the low 20 bits of an encoded value.
pub const MAX_TOKEN_POSITION: u32 = (1 << 20) - 1;

/// Maximum element ordinal: ordinals occupy the high 12 bits of an encoded value.
pub const MAX_ELEMENT_ORDINAL: u32 = (1 << 12) - 1;
39
40#[inline]
42pub fn encode_position(element_ordinal: u32, token_position: u32) -> u32 {
43 debug_assert!(
44 element_ordinal <= MAX_ELEMENT_ORDINAL,
45 "Element ordinal {} exceeds maximum {}",
46 element_ordinal,
47 MAX_ELEMENT_ORDINAL
48 );
49 debug_assert!(
50 token_position <= MAX_TOKEN_POSITION,
51 "Token position {} exceeds maximum {}",
52 token_position,
53 MAX_TOKEN_POSITION
54 );
55 (element_ordinal << 20) | (token_position & MAX_TOKEN_POSITION)
56}
57
/// Extract the element ordinal stored in the high 12 bits of an encoded position.
#[inline]
pub fn decode_element_ordinal(position: u32) -> u32 {
    // Shift amount (20) is a constant < 32, so this is a plain right shift.
    position.wrapping_shr(20)
}
63
64#[inline]
66pub fn decode_token_position(position: u32) -> u32 {
67 position & MAX_TOKEN_POSITION
68}
69
/// A single document's posting together with its encoded positions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PostingWithPositions {
    // Document this posting belongs to.
    pub doc_id: DocId,
    // Term frequency; builders in this file set it to `positions.len()`.
    pub term_freq: u32,
    // Encoded positions (see `encode_position`) for each occurrence.
    pub positions: Vec<u32>,
}
78
/// Block-compressed posting list with per-document position data.
#[derive(Debug, Clone)]
pub struct PositionPostingList {
    // Per block: (first doc id, last doc id, byte offset of the block in `data`).
    skip_list: Vec<(DocId, DocId, u64)>,
    // Concatenated block payloads. Each block: u32 doc count, u32 first doc id,
    // then vint-encoded doc-id deltas and position payloads.
    data: Vec<u8>,
    // Total number of documents across all blocks.
    doc_count: u32,
}
93
94impl Default for PositionPostingList {
95 fn default() -> Self {
96 Self::new()
97 }
98}
99
100impl PositionPostingList {
    /// Create an empty posting list with no allocated capacity.
    pub fn new() -> Self {
        Self {
            skip_list: Vec::new(),
            data: Vec::new(),
            doc_count: 0,
        }
    }
108
109 pub fn with_capacity(capacity: usize) -> Self {
110 Self {
111 skip_list: Vec::with_capacity(capacity / POSITION_BLOCK_SIZE + 1),
112 data: Vec::with_capacity(capacity * 8), doc_count: 0,
114 }
115 }
116
    /// Build a block-compressed list from postings already sorted by doc id.
    ///
    /// Chunks the input into blocks of `POSITION_BLOCK_SIZE` documents. Each
    /// block stores: u32 count, u32 first doc id, then for every document a
    /// vint doc-id delta (omitted for the first doc), a vint position count,
    /// and the vint-encoded positions.
    ///
    /// NOTE(review): assumes strictly increasing `doc_id`s — the delta
    /// subtraction underflows otherwise; confirm callers sort first.
    pub fn from_postings(postings: &[PostingWithPositions]) -> io::Result<Self> {
        if postings.is_empty() {
            return Ok(Self::new());
        }

        let mut skip_list = Vec::new();
        let mut data = Vec::new();
        let mut i = 0;

        while i < postings.len() {
            let block_start = data.len() as u64;
            let block_end = (i + POSITION_BLOCK_SIZE).min(postings.len());
            let block = &postings[i..block_end];

            // Skip entry: first/last doc id plus the block's byte offset.
            let base_doc_id = block.first().unwrap().doc_id;
            let last_doc_id = block.last().unwrap().doc_id;
            skip_list.push((base_doc_id, last_doc_id, block_start));

            // Block header: doc count and absolute first doc id.
            data.write_u32::<LittleEndian>(block.len() as u32)?;
            data.write_u32::<LittleEndian>(base_doc_id)?;

            let mut prev_doc_id = base_doc_id;
            for (j, posting) in block.iter().enumerate() {
                // First doc of the block is in the header; the rest are deltas.
                if j > 0 {
                    let delta = posting.doc_id - prev_doc_id;
                    write_vint(&mut data, delta as u64)?;
                }
                prev_doc_id = posting.doc_id;

                write_vint(&mut data, posting.positions.len() as u64)?;
                for &pos in &posting.positions {
                    write_vint(&mut data, pos as u64)?;
                }
            }

            i = block_end;
        }

        Ok(Self {
            skip_list,
            data,
            doc_count: postings.len() as u32,
        })
    }
165
166 pub fn push(&mut self, doc_id: DocId, positions: Vec<u32>) {
168 let posting = PostingWithPositions {
171 doc_id,
172 term_freq: positions.len() as u32,
173 positions,
174 };
175
176 let block_start = self.data.len() as u64;
178
179 let need_new_block =
181 self.skip_list.is_empty() || self.doc_count.is_multiple_of(POSITION_BLOCK_SIZE as u32);
182
183 if need_new_block {
184 self.skip_list.push((doc_id, doc_id, block_start));
186 self.data.write_u32::<LittleEndian>(1u32).unwrap();
187 self.data.write_u32::<LittleEndian>(doc_id).unwrap();
188 } else {
189 let last_block = self.skip_list.last_mut().unwrap();
191 let prev_doc_id = last_block.1;
192 last_block.1 = doc_id;
193
194 let count_offset = last_block.2 as usize;
196 let old_count = u32::from_le_bytes(
197 self.data[count_offset..count_offset + 4]
198 .try_into()
199 .unwrap(),
200 );
201 self.data[count_offset..count_offset + 4]
202 .copy_from_slice(&(old_count + 1).to_le_bytes());
203
204 let delta = doc_id - prev_doc_id;
205 write_vint(&mut self.data, delta as u64).unwrap();
206 }
207
208 write_vint(&mut self.data, posting.positions.len() as u64).unwrap();
210 for &pos in &posting.positions {
211 write_vint(&mut self.data, pos as u64).unwrap();
212 }
213
214 self.doc_count += 1;
215 }
216
    /// Total number of documents in the list.
    pub fn doc_count(&self) -> u32 {
        self.doc_count
    }
220
    /// Number of documents, as a `usize` (same value as `doc_count`).
    pub fn len(&self) -> usize {
        self.doc_count as usize
    }
224
    /// True when the list holds no documents.
    pub fn is_empty(&self) -> bool {
        self.doc_count == 0
    }
228
    /// Look up `target_doc_id` and append its positions into `out`
    /// (cleared first). Returns true if the document was found.
    /// Avoids allocating a fresh vector per lookup, unlike `get_positions`.
    pub fn get_positions_into(&self, target_doc_id: DocId, out: &mut Vec<u32>) -> bool {
        out.clear();
        self.get_positions_impl(target_doc_id, Some(out)).is_some()
    }
235
    /// Look up `target_doc_id` and return its encoded positions in a freshly
    /// allocated vector, or `None` if the document is absent.
    pub fn get_positions(&self, target_doc_id: DocId) -> Option<Vec<u32>> {
        self.get_positions_impl(target_doc_id, None)
    }
240
    /// Shared lookup: binary-search the skip list for the block whose
    /// [base, last] range covers `target_doc_id`, then scan that block.
    ///
    /// If `out` is `Some`, positions are appended to it and `Some(Vec::new())`
    /// is returned purely as a "found" sentinel (callers only check
    /// `is_some()`); if `out` is `None`, the positions are returned in a new
    /// vector. Returns `None` when the doc is absent or the data is malformed.
    fn get_positions_impl(
        &self,
        target_doc_id: DocId,
        mut out: Option<&mut Vec<u32>>,
    ) -> Option<Vec<u32>> {
        if self.skip_list.is_empty() {
            return None;
        }

        // Ordering is inverted because we search for the range containing the
        // target, not the target itself.
        let block_idx = match self.skip_list.binary_search_by(|&(base, last, _)| {
            if target_doc_id < base {
                std::cmp::Ordering::Greater
            } else if target_doc_id > last {
                std::cmp::Ordering::Less
            } else {
                std::cmp::Ordering::Equal
            }
        }) {
            Ok(idx) => idx,
            // No block covers the target doc id.
            Err(_) => return None, };

        let offset = self.skip_list[block_idx].2 as usize;
        let mut reader = &self.data[offset..];

        // Block header: doc count and absolute first doc id.
        let count = reader.read_u32::<LittleEndian>().ok()? as usize;
        let first_doc = reader.read_u32::<LittleEndian>().ok()?;
        let mut prev_doc_id = first_doc;

        for i in 0..count {
            // Doc ids after the first are delta-encoded.
            let doc_id = if i == 0 {
                first_doc
            } else {
                let delta = read_vint(&mut reader).ok()? as u32;
                prev_doc_id + delta
            };
            prev_doc_id = doc_id;

            let num_positions = read_vint(&mut reader).ok()? as usize;

            if doc_id == target_doc_id {
                if let Some(buf) = &mut out {
                    buf.reserve(num_positions);
                    for _ in 0..num_positions {
                        let pos = read_vint(&mut reader).ok()? as u32;
                        buf.push(pos);
                    }
                    // Empty Vec is just the "found" marker (no allocation).
                    return Some(Vec::new()); }
                let mut positions = Vec::with_capacity(num_positions);
                for _ in 0..num_positions {
                    let pos = read_vint(&mut reader).ok()? as u32;
                    positions.push(pos);
                }
                return Some(positions);
            } else {
                // Not the target: skip this doc's position payload.
                for _ in 0..num_positions {
                    if read_vint(&mut reader).is_err() {
                        return None;
                    }
                }
            }
        }

        None
    }
313
    // Serialized skip entry: base (u32) + last (u32) + offset (u64) + length (u32).
    const SKIP_ENTRY_SIZE: usize = 20;
317
    /// Write the list as: data section, one `SKIP_ENTRY_SIZE`-byte entry per
    /// block (base doc id, last doc id, offset, length), then a 16-byte footer
    /// (data length u64, skip count u32, doc count u32). All little-endian.
    /// Readable back via `deserialize` and mergeable via
    /// `concatenate_streaming`.
    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        writer.write_all(&self.data)?;

        for (i, (base_doc_id, last_doc_id, offset)) in self.skip_list.iter().enumerate() {
            // Block length is derived from the next block's offset (or the
            // end of the data section for the last block).
            let next_offset = if i + 1 < self.skip_list.len() {
                self.skip_list[i + 1].2
            } else {
                self.data.len() as u64
            };
            let length = (next_offset - offset) as u32;
            writer.write_u32::<LittleEndian>(*base_doc_id)?;
            writer.write_u32::<LittleEndian>(*last_doc_id)?;
            writer.write_u64::<LittleEndian>(*offset)?;
            writer.write_u32::<LittleEndian>(length)?;
        }

        // Footer lets readers locate the sections from the buffer's tail.
        writer.write_u64::<LittleEndian>(self.data.len() as u64)?;
        writer.write_u32::<LittleEndian>(self.skip_list.len() as u32)?;
        writer.write_u32::<LittleEndian>(self.doc_count)?;

        Ok(())
    }
351
352 pub fn deserialize(raw: &[u8]) -> io::Result<Self> {
354 if raw.len() < 16 {
355 return Err(io::Error::new(
356 io::ErrorKind::InvalidData,
357 "position data too short",
358 ));
359 }
360
361 let f = raw.len() - 16;
363 let data_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
364 let skip_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
365 let doc_count = u32::from_le_bytes(raw[f + 12..f + 16].try_into().unwrap());
366
367 let mut skip_list = Vec::with_capacity(skip_count);
369 let mut pos = data_len;
370 for _ in 0..skip_count {
371 let base = u32::from_le_bytes(raw[pos..pos + 4].try_into().unwrap());
372 let last = u32::from_le_bytes(raw[pos + 4..pos + 8].try_into().unwrap());
373 let offset = u64::from_le_bytes(raw[pos + 8..pos + 16].try_into().unwrap());
374 skip_list.push((base, last, offset));
376 pos += Self::SKIP_ENTRY_SIZE;
377 }
378
379 let data = raw[..data_len].to_vec();
380
381 Ok(Self {
382 skip_list,
383 data,
384 doc_count,
385 })
386 }
387
    /// Merge in-memory lists by concatenating their blocks in order, adding
    /// each source's doc-id offset to block headers and skip entries.
    ///
    /// Blocks are copied as-is (not re-chunked), so the result may contain
    /// blocks smaller than `POSITION_BLOCK_SIZE`.
    pub fn concatenate_blocks(sources: &[(PositionPostingList, u32)]) -> io::Result<Self> {
        let mut skip_list = Vec::new();
        let mut data = Vec::new();
        let mut total_docs = 0u32;

        for (source, doc_offset) in sources {
            for block_idx in 0..source.skip_list.len() {
                let (base, last, src_offset) = source.skip_list[block_idx];
                // Block extends to the next block's offset, or end of data.
                let next_offset = if block_idx + 1 < source.skip_list.len() {
                    source.skip_list[block_idx + 1].2 as usize
                } else {
                    source.data.len()
                };

                let new_base = base + doc_offset;
                let new_last = last + doc_offset;
                let new_offset = data.len() as u64;

                let block_bytes = &source.data[src_offset as usize..next_offset];

                let count = u32::from_le_bytes(block_bytes[0..4].try_into().unwrap());
                let first_doc = u32::from_le_bytes(block_bytes[4..8].try_into().unwrap());

                // Rewrite the header with the remapped first doc id; the
                // delta-encoded remainder is offset-invariant, so copy as-is.
                data.write_u32::<LittleEndian>(count)?;
                data.write_u32::<LittleEndian>(first_doc + doc_offset)?;
                data.extend_from_slice(&block_bytes[8..]);

                skip_list.push((new_base, new_last, new_offset));
                total_docs += count;
            }
        }

        Ok(Self {
            skip_list,
            data,
            doc_count: total_docs,
        })
    }
430
431 pub fn concatenate_streaming<W: Write>(
442 sources: &[(&[u8], u32)],
443 writer: &mut W,
444 ) -> io::Result<(u32, usize)> {
445 struct SourceMeta {
447 data_len: usize,
448 skip_count: usize,
449 }
450
451 let mut metas: Vec<SourceMeta> = Vec::with_capacity(sources.len());
452 let mut total_docs = 0u32;
453
454 for (raw, _) in sources {
455 if raw.len() < 16 {
456 continue;
457 }
458 let f = raw.len() - 16;
459 let data_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
460 let skip_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
461 let doc_count = u32::from_le_bytes(raw[f + 12..f + 16].try_into().unwrap());
462 total_docs += doc_count;
463 metas.push(SourceMeta {
464 data_len,
465 skip_count,
466 });
467 }
468
469 let mut out_skip: Vec<u8> = Vec::new();
472 let mut out_skip_count = 0u32;
473 let mut data_written = 0u64;
474 let mut patch_buf = [0u8; 8];
475 let es = Self::SKIP_ENTRY_SIZE;
476
477 for (src_idx, meta) in metas.iter().enumerate() {
478 let (raw, doc_offset) = &sources[src_idx];
479 let skip_base = meta.data_len;
480 let data = &raw[..meta.data_len];
481
482 for i in 0..meta.skip_count {
483 let p = skip_base + i * es;
485 let base = u32::from_le_bytes(raw[p..p + 4].try_into().unwrap());
486 let last = u32::from_le_bytes(raw[p + 4..p + 8].try_into().unwrap());
487 let offset = u64::from_le_bytes(raw[p + 8..p + 16].try_into().unwrap());
488 let length = u32::from_le_bytes(raw[p + 16..p + 20].try_into().unwrap());
489
490 let block = &data[offset as usize..(offset as usize + length as usize)];
491
492 out_skip.extend_from_slice(&(base + doc_offset).to_le_bytes());
494 out_skip.extend_from_slice(&(last + doc_offset).to_le_bytes());
495 out_skip.extend_from_slice(&data_written.to_le_bytes());
496 out_skip.extend_from_slice(&length.to_le_bytes());
497 out_skip_count += 1;
498
499 patch_buf[0..4].copy_from_slice(&block[0..4]);
501 let first_doc = u32::from_le_bytes(block[4..8].try_into().unwrap());
502 patch_buf[4..8].copy_from_slice(&(first_doc + doc_offset).to_le_bytes());
503 writer.write_all(&patch_buf)?;
504 writer.write_all(&block[8..])?;
505
506 data_written += block.len() as u64;
507 }
508 }
509
510 writer.write_all(&out_skip)?;
512
513 writer.write_u64::<LittleEndian>(data_written)?;
514 writer.write_u32::<LittleEndian>(out_skip_count)?;
515 writer.write_u32::<LittleEndian>(total_docs)?;
516
517 let total_bytes = data_written as usize + out_skip.len() + 16;
518 Ok((total_docs, total_bytes))
519 }
520
    /// Iterate over all postings in doc-id order.
    pub fn iter(&self) -> PositionPostingIterator<'_> {
        PositionPostingIterator::new(self)
    }
525}
526
/// Cursor over a `PositionPostingList` that decodes one block at a time.
pub struct PositionPostingIterator<'a> {
    // The list being iterated.
    list: &'a PositionPostingList,
    // Index of the currently decoded block in `list.skip_list`.
    current_block: usize,
    // Cursor within the decoded block, in `0..block_count`.
    position_in_block: usize,
    // Number of documents in the decoded block.
    block_count: usize,
    // Decoded absolute doc ids of the current block.
    block_doc_ids: Vec<DocId>,
    // Per-document position counts of the current block.
    block_term_freqs: Vec<u32>,
    // All decoded positions of the current block, back to back.
    block_positions: Vec<u32>,
    // Prefix offsets into `block_positions`; doc i's positions span
    // `block_pos_offsets[i]..block_pos_offsets[i + 1]`.
    block_pos_offsets: Vec<usize>,
    // True once iteration has moved past the last block.
    exhausted: bool,
}
548
549impl<'a> PositionPostingIterator<'a> {
    /// Create an iterator positioned at the first document (if any).
    pub fn new(list: &'a PositionPostingList) -> Self {
        let exhausted = list.skip_list.is_empty();
        let mut iter = Self {
            list,
            current_block: 0,
            position_in_block: 0,
            block_count: 0,
            block_doc_ids: Vec::with_capacity(POSITION_BLOCK_SIZE),
            block_term_freqs: Vec::with_capacity(POSITION_BLOCK_SIZE),
            block_positions: Vec::new(),
            block_pos_offsets: Vec::with_capacity(POSITION_BLOCK_SIZE + 1),
            exhausted,
        };
        if !iter.exhausted {
            // Eagerly decode the first block so doc()/positions() work
            // immediately after construction.
            iter.load_block(0);
        }
        iter
    }
568
    /// Decode block `block_idx` into the reusable per-block buffers, or mark
    /// the iterator exhausted when past the last block.
    ///
    /// Decode errors are silently treated as zeros via `unwrap_or(0)`;
    /// NOTE(review): malformed data thus degrades quietly rather than erroring.
    fn load_block(&mut self, block_idx: usize) {
        if block_idx >= self.list.skip_list.len() {
            self.exhausted = true;
            return;
        }

        self.current_block = block_idx;
        self.position_in_block = 0;

        let offset = self.list.skip_list[block_idx].2 as usize;
        let mut reader = &self.list.data[offset..];

        // Block header: doc count and absolute first doc id.
        let count = reader.read_u32::<LittleEndian>().unwrap_or(0) as usize;
        let first_doc = reader.read_u32::<LittleEndian>().unwrap_or(0);

        self.block_count = count;
        self.block_doc_ids.clear();
        self.block_term_freqs.clear();
        self.block_positions.clear();
        self.block_pos_offsets.clear();

        let mut prev_doc_id = first_doc;

        for i in 0..count {
            // Doc ids after the first are delta-encoded.
            let doc_id = if i == 0 {
                first_doc
            } else {
                let delta = read_vint(&mut reader).unwrap_or(0) as u32;
                prev_doc_id + delta
            };
            prev_doc_id = doc_id;

            let num_positions = read_vint(&mut reader).unwrap_or(0) as usize;
            self.block_doc_ids.push(doc_id);
            self.block_term_freqs.push(num_positions as u32);
            self.block_pos_offsets.push(self.block_positions.len());
            for _ in 0..num_positions {
                let pos = read_vint(&mut reader).unwrap_or(0) as u32;
                self.block_positions.push(pos);
            }
        }
        // Final sentinel so doc i's span is offsets[i]..offsets[i + 1].
        self.block_pos_offsets.push(self.block_positions.len());
    }
614
615 pub fn doc(&self) -> DocId {
616 if self.exhausted || self.position_in_block >= self.block_count {
617 u32::MAX
618 } else {
619 self.block_doc_ids[self.position_in_block]
620 }
621 }
622
623 pub fn term_freq(&self) -> u32 {
624 if self.exhausted || self.position_in_block >= self.block_count {
625 0
626 } else {
627 self.block_term_freqs[self.position_in_block]
628 }
629 }
630
631 pub fn positions(&self) -> &[u32] {
632 if self.exhausted || self.position_in_block >= self.block_count {
633 &[]
634 } else {
635 let start = self.block_pos_offsets[self.position_in_block];
636 let end = self.block_pos_offsets[self.position_in_block + 1];
637 &self.block_positions[start..end]
638 }
639 }
640
641 pub fn advance(&mut self) {
642 if self.exhausted {
643 return;
644 }
645
646 self.position_in_block += 1;
647 if self.position_in_block >= self.block_count {
648 self.load_block(self.current_block + 1);
649 }
650 }
651
    /// Position the cursor at the first document with doc id >= `target`.
    ///
    /// Forward-only: fast path scans the current block when the target still
    /// falls within it; otherwise binary-searches the skip list. Seeking
    /// backwards is not supported (earlier entries of the current block are
    /// never revisited).
    pub fn seek(&mut self, target: DocId) {
        if self.exhausted {
            return;
        }

        // Fast path: target is within (or before the end of) the block that
        // is already decoded.
        if let Some((_, last, _)) = self.list.skip_list.get(self.current_block)
            && target <= *last
        {
            while self.position_in_block < self.block_count
                && self.block_doc_ids[self.position_in_block] < target
            {
                self.position_in_block += 1;
            }
            if self.position_in_block >= self.block_count {
                // Ran off the block's end: recurse into the next block
                // (recursion depth is bounded by the remaining block count).
                self.load_block(self.current_block + 1);
                self.seek(target); }
            return;
        }

        // Slow path: binary search for a block whose [base, last] range
        // covers the target. Ordering is inverted (range containment search);
        // Err(idx) is the first block starting after the target.
        let block_idx = match self.list.skip_list.binary_search_by(|&(base, last, _)| {
            if target < base {
                std::cmp::Ordering::Greater
            } else if target > last {
                std::cmp::Ordering::Less
            } else {
                std::cmp::Ordering::Equal
            }
        }) {
            Ok(idx) => idx,
            Err(idx) => idx, };

        if block_idx >= self.list.skip_list.len() {
            self.exhausted = true;
            return;
        }

        self.load_block(block_idx);

        // Advance within the freshly decoded block to the first doc >= target.
        while self.position_in_block < self.block_count
            && self.block_doc_ids[self.position_in_block] < target
        {
            self.position_in_block += 1;
        }

        if self.position_in_block >= self.block_count {
            self.load_block(self.current_block + 1);
        }
    }
706}
707
#[cfg(test)]
mod tests {
    use super::*;

    // Bit-field packing round-trips for zero, typical, and maximal values.
    #[test]
    fn test_position_encoding() {
        let pos = encode_position(0, 5);
        assert_eq!(decode_element_ordinal(pos), 0);
        assert_eq!(decode_token_position(pos), 5);

        let pos = encode_position(3, 100);
        assert_eq!(decode_element_ordinal(pos), 3);
        assert_eq!(decode_token_position(pos), 100);

        let pos = encode_position(MAX_ELEMENT_ORDINAL, MAX_TOKEN_POSITION);
        assert_eq!(decode_element_ordinal(pos), MAX_ELEMENT_ORDINAL);
        assert_eq!(decode_token_position(pos), MAX_TOKEN_POSITION);
    }

    // Basic build + lookup, including misses between and beyond stored docs.
    #[test]
    fn test_position_posting_list_build() {
        let postings = vec![
            PostingWithPositions {
                doc_id: 1,
                term_freq: 2,
                positions: vec![encode_position(0, 0), encode_position(0, 2)],
            },
            PostingWithPositions {
                doc_id: 3,
                term_freq: 1,
                positions: vec![encode_position(1, 0)],
            },
        ];

        let list = PositionPostingList::from_postings(&postings).unwrap();
        assert_eq!(list.doc_count(), 2);

        let pos = list.get_positions(1).unwrap();
        assert_eq!(pos.len(), 2);

        let pos = list.get_positions(3).unwrap();
        assert_eq!(pos.len(), 1);

        assert!(list.get_positions(2).is_none());
        assert!(list.get_positions(99).is_none());
    }

    #[test]
    fn test_serialization_roundtrip() {
        let postings = vec![
            PostingWithPositions {
                doc_id: 1,
                term_freq: 2,
                positions: vec![encode_position(0, 0), encode_position(0, 5)],
            },
            PostingWithPositions {
                doc_id: 3,
                term_freq: 1,
                positions: vec![encode_position(1, 0)],
            },
            PostingWithPositions {
                doc_id: 5,
                term_freq: 1,
                positions: vec![encode_position(0, 10)],
            },
        ];

        let list = PositionPostingList::from_postings(&postings).unwrap();

        let mut bytes = Vec::new();
        list.serialize(&mut bytes).unwrap();

        let deserialized = PositionPostingList::deserialize(&bytes).unwrap();

        assert_eq!(list.doc_count(), deserialized.doc_count());

        let pos = deserialized.get_positions(1).unwrap();
        assert_eq!(pos, vec![encode_position(0, 0), encode_position(0, 5)]);

        let pos = deserialized.get_positions(3).unwrap();
        assert_eq!(pos, vec![encode_position(1, 0)]);
    }

    // 300 docs at 128 per block -> 3 blocks; exercises the skip-list search.
    #[test]
    fn test_binary_search_many_blocks() {
        let mut postings = Vec::new();
        for i in 0..300 {
            postings.push(PostingWithPositions {
                doc_id: i * 2, term_freq: 1,
                positions: vec![encode_position(0, i)],
            });
        }

        let list = PositionPostingList::from_postings(&postings).unwrap();
        assert_eq!(list.doc_count(), 300);

        assert_eq!(list.skip_list.len(), 3);

        let pos = list.get_positions(0).unwrap();
        assert_eq!(pos, vec![encode_position(0, 0)]);

        let pos = list.get_positions(256).unwrap(); assert_eq!(pos, vec![encode_position(0, 128)]);

        let pos = list.get_positions(598).unwrap(); assert_eq!(pos, vec![encode_position(0, 299)]);

        assert!(list.get_positions(1).is_none());
        assert!(list.get_positions(257).is_none());
    }

    // Two lists merged with doc-id offsets 0 and 3.
    #[test]
    fn test_concatenate_blocks_merge() {
        let postings1 = vec![
            PostingWithPositions {
                doc_id: 0,
                term_freq: 1,
                positions: vec![0],
            },
            PostingWithPositions {
                doc_id: 1,
                term_freq: 1,
                positions: vec![5],
            },
            PostingWithPositions {
                doc_id: 2,
                term_freq: 1,
                positions: vec![10],
            },
        ];
        let list1 = PositionPostingList::from_postings(&postings1).unwrap();

        let postings2 = vec![
            PostingWithPositions {
                doc_id: 0,
                term_freq: 1,
                positions: vec![100],
            },
            PostingWithPositions {
                doc_id: 1,
                term_freq: 1,
                positions: vec![105],
            },
        ];
        let list2 = PositionPostingList::from_postings(&postings2).unwrap();

        let combined = PositionPostingList::concatenate_blocks(&[
            (list1, 0), (list2, 3), ])
        .unwrap();

        assert_eq!(combined.doc_count(), 5);

        assert!(combined.get_positions(0).is_some());
        assert!(combined.get_positions(1).is_some());
        assert!(combined.get_positions(2).is_some());
        assert!(combined.get_positions(3).is_some()); assert!(combined.get_positions(4).is_some()); }

    #[test]
    fn test_iterator() {
        let postings = vec![
            PostingWithPositions {
                doc_id: 1,
                term_freq: 2,
                positions: vec![0, 5],
            },
            PostingWithPositions {
                doc_id: 3,
                term_freq: 1,
                positions: vec![10],
            },
            PostingWithPositions {
                doc_id: 5,
                term_freq: 1,
                positions: vec![15],
            },
        ];

        let list = PositionPostingList::from_postings(&postings).unwrap();
        let mut iter = list.iter();

        assert_eq!(iter.doc(), 1);
        assert_eq!(iter.positions(), &[0, 5]);

        iter.advance();
        assert_eq!(iter.doc(), 3);

        iter.seek(5);
        assert_eq!(iter.doc(), 5);
        assert_eq!(iter.positions(), &[15]);

        // Advancing past the last doc reports the exhausted sentinel.
        iter.advance();
        assert_eq!(iter.doc(), u32::MAX); }

    // Test helper: build a list from (doc_id, positions) pairs.
    fn build_ppl(entries: &[(u32, Vec<u32>)]) -> PositionPostingList {
        let postings: Vec<PostingWithPositions> = entries
            .iter()
            .map(|(doc_id, positions)| PostingWithPositions {
                doc_id: *doc_id,
                term_freq: positions.len() as u32,
                positions: positions.clone(),
            })
            .collect();
        PositionPostingList::from_postings(&postings).unwrap()
    }

    // Test helper: serialize a list into a fresh buffer.
    fn serialize_ppl(ppl: &PositionPostingList) -> Vec<u8> {
        let mut buf = Vec::new();
        ppl.serialize(&mut buf).unwrap();
        buf
    }

    // Test helper: drain a list through its iterator into (doc, positions) pairs.
    fn collect_positions(ppl: &PositionPostingList) -> Vec<(u32, Vec<u32>)> {
        let mut result = Vec::new();
        let mut it = ppl.iter();
        while it.doc() != u32::MAX {
            result.push((it.doc(), it.positions().to_vec()));
            it.advance();
        }
        result
    }

    // Streaming merge must produce the same postings as the in-memory merge.
    #[test]
    fn test_concatenate_streaming_matches_blocks() {
        let seg_a: Vec<(u32, Vec<u32>)> = (0..150)
            .map(|i| (i * 2, vec![i * 10, i * 10 + 3]))
            .collect();
        let seg_b: Vec<(u32, Vec<u32>)> = (0..100).map(|i| (i * 5, vec![i * 7])).collect();
        let seg_c: Vec<(u32, Vec<u32>)> = (0..80).map(|i| (i * 3, vec![i, i + 1, i + 2])).collect();

        let ppl_a = build_ppl(&seg_a);
        let ppl_b = build_ppl(&seg_b);
        let ppl_c = build_ppl(&seg_c);

        let offset_b = 500u32;
        let offset_c = 1000u32;

        let ref_merged = PositionPostingList::concatenate_blocks(&[
            (ppl_a.clone(), 0),
            (ppl_b.clone(), offset_b),
            (ppl_c.clone(), offset_c),
        ])
        .unwrap();
        let mut ref_buf = Vec::new();
        ref_merged.serialize(&mut ref_buf).unwrap();

        let bytes_a = serialize_ppl(&ppl_a);
        let bytes_b = serialize_ppl(&ppl_b);
        let bytes_c = serialize_ppl(&ppl_c);

        let sources: Vec<(&[u8], u32)> =
            vec![(&bytes_a, 0), (&bytes_b, offset_b), (&bytes_c, offset_c)];
        let mut stream_buf = Vec::new();
        let (doc_count, bytes_written) =
            PositionPostingList::concatenate_streaming(&sources, &mut stream_buf).unwrap();

        assert_eq!(doc_count, 330);
        assert_eq!(bytes_written, stream_buf.len());

        let ref_posts = collect_positions(&PositionPostingList::deserialize(&ref_buf).unwrap());
        let stream_posts =
            collect_positions(&PositionPostingList::deserialize(&stream_buf).unwrap());

        assert_eq!(ref_posts.len(), stream_posts.len());
        for (i, (r, s)) in ref_posts.iter().zip(stream_posts.iter()).enumerate() {
            assert_eq!(r.0, s.0, "doc_id mismatch at {}", i);
            assert_eq!(r.1, s.1, "positions mismatch at doc {}", r.0);
        }
    }

    // Merge in two rounds (0+1, 2+3, then both results) — the streamed output
    // must itself be a valid streaming-merge input.
    #[test]
    fn test_positions_multi_round_merge() {
        let segments: Vec<Vec<(u32, Vec<u32>)>> = (0..4)
            .map(|seg| {
                (0..200)
                    .map(|i| {
                        let pos_count = (i % 3) + 1;
                        let positions: Vec<u32> = (0..pos_count)
                            .map(|p| (seg * 1000 + i * 10 + p) as u32)
                            .collect();
                        (i as u32 * 3, positions)
                    })
                    .collect()
            })
            .collect();

        let ppls: Vec<PositionPostingList> = segments.iter().map(|s| build_ppl(s)).collect();
        let serialized: Vec<Vec<u8>> = ppls.iter().map(serialize_ppl).collect();

        let mut merged_01 = Vec::new();
        let sources_01: Vec<(&[u8], u32)> = vec![(&serialized[0], 0), (&serialized[1], 600)];
        let (dc_01, _) =
            PositionPostingList::concatenate_streaming(&sources_01, &mut merged_01).unwrap();
        assert_eq!(dc_01, 400);

        let mut merged_23 = Vec::new();
        let sources_23: Vec<(&[u8], u32)> = vec![(&serialized[2], 0), (&serialized[3], 600)];
        let (dc_23, _) =
            PositionPostingList::concatenate_streaming(&sources_23, &mut merged_23).unwrap();
        assert_eq!(dc_23, 400);

        let mut final_merged = Vec::new();
        let sources_final: Vec<(&[u8], u32)> = vec![(&merged_01, 0), (&merged_23, 1200)];
        let (dc_final, _) =
            PositionPostingList::concatenate_streaming(&sources_final, &mut final_merged).unwrap();
        assert_eq!(dc_final, 800);

        let final_ppl = PositionPostingList::deserialize(&final_merged).unwrap();
        let all = collect_positions(&final_ppl);
        assert_eq!(all.len(), 800);

        // Spot-check the first doc of each original segment boundary.
        assert_eq!(all[0].0, 0);
        assert_eq!(all[0].1, vec![0]); assert_eq!(all[200].0, 600);
        assert_eq!(all[200].1, vec![1000]); assert_eq!(all[400].0, 1200);
        assert_eq!(all[400].1, vec![2000]); let mut it = final_ppl.iter();
        it.seek(1200);
        assert_eq!(it.doc(), 1200);
        assert_eq!(it.positions(), &[2000]);

        let pos = final_ppl.get_positions(600).unwrap();
        assert_eq!(pos, vec![1000]);
    }

    // Five 500-doc segments merged in one pass; verify every doc and payload.
    #[test]
    fn test_positions_large_scale_merge() {
        let num_segments = 5usize;
        let docs_per_segment = 500usize;

        let segments: Vec<Vec<(u32, Vec<u32>)>> = (0..num_segments)
            .map(|seg| {
                (0..docs_per_segment)
                    .map(|i| {
                        let n_pos = (i % 4) + 1;
                        let positions: Vec<u32> =
                            (0..n_pos).map(|p| (p * 5 + seg) as u32).collect();
                        (i as u32 * 2, positions)
                    })
                    .collect()
            })
            .collect();

        let ppls: Vec<PositionPostingList> = segments.iter().map(|s| build_ppl(s)).collect();
        let serialized: Vec<Vec<u8>> = ppls.iter().map(serialize_ppl).collect();

        let max_doc = (docs_per_segment as u32 - 1) * 2;
        let offsets: Vec<u32> = (0..num_segments)
            .map(|i| i as u32 * (max_doc + 1))
            .collect();

        let sources: Vec<(&[u8], u32)> = serialized
            .iter()
            .zip(offsets.iter())
            .map(|(b, o)| (b.as_slice(), *o))
            .collect();

        let mut merged = Vec::new();
        let (doc_count, _) =
            PositionPostingList::concatenate_streaming(&sources, &mut merged).unwrap();
        assert_eq!(doc_count, (num_segments * docs_per_segment) as u32);

        let merged_ppl = PositionPostingList::deserialize(&merged).unwrap();
        let all = collect_positions(&merged_ppl);
        assert_eq!(all.len(), num_segments * docs_per_segment);

        for (seg, &offset) in offsets.iter().enumerate() {
            for i in 0..docs_per_segment {
                let idx = seg * docs_per_segment + i;
                let expected_doc = i as u32 * 2 + offset;
                assert_eq!(all[idx].0, expected_doc, "seg={} i={}", seg, i);

                let n_pos = (i % 4) + 1;
                let expected_positions: Vec<u32> =
                    (0..n_pos).map(|p| (p * 5 + seg) as u32).collect();
                assert_eq!(
                    all[idx].1, expected_positions,
                    "positions mismatch seg={} i={}",
                    seg, i
                );
            }
        }
    }

    // Streaming a single source with offset 0 must be byte-compatible with
    // the direct serialization (same postings on read-back).
    #[test]
    fn test_positions_streaming_single_source() {
        let entries: Vec<(u32, Vec<u32>)> =
            (0..300).map(|i| (i * 4, vec![i * 2, i * 2 + 1])).collect();
        let ppl = build_ppl(&entries);
        let direct = serialize_ppl(&ppl);

        let sources: Vec<(&[u8], u32)> = vec![(&direct, 0)];
        let mut streamed = Vec::new();
        PositionPostingList::concatenate_streaming(&sources, &mut streamed).unwrap();

        let p1 = collect_positions(&PositionPostingList::deserialize(&direct).unwrap());
        let p2 = collect_positions(&PositionPostingList::deserialize(&streamed).unwrap());
        assert_eq!(p1, p2);
    }

    // Minimal merge: two single-doc lists.
    #[test]
    fn test_positions_edge_single_doc() {
        let ppl_a = build_ppl(&[(0, vec![42, 43, 44])]);
        let ppl_b = build_ppl(&[(0, vec![100])]);

        let merged = PositionPostingList::concatenate_blocks(&[(ppl_a, 0), (ppl_b, 1)]).unwrap();

        assert_eq!(merged.doc_count(), 2);
        assert_eq!(merged.get_positions(0).unwrap(), vec![42, 43, 44]);
        assert_eq!(merged.get_positions(1).unwrap(), vec![100]);
    }
}