1use crate::core::DocId;
13
14fn encode_vbyte(mut value: u32, out: &mut Vec<u8>) {
20 loop {
21 let byte = (value & 0x7F) as u8;
22 value >>= 7;
23 if value == 0 {
24 out.push(byte); break;
26 }
27 out.push(byte | 0x80); }
29}
30
31fn decode_vbyte(data: &[u8], pos: usize) -> (u32, usize) {
33 let mut result: u32 = 0;
34 let mut shift = 0;
35 let mut i = pos;
36 loop {
37 let byte = data[i];
38 result |= ((byte & 0x7F) as u32) << shift;
39 i += 1;
40 if byte & 0x80 == 0 {
41 break;
42 }
43 shift += 7;
44 }
45 (result, i - pos)
46}
47
48pub struct PostingListWriter {
54 buf: Vec<u8>,
55 count: u32,
56 last_doc_id: u32,
57}
58
59impl PostingListWriter {
60 pub fn new() -> Self {
61 Self {
62 buf: Vec::new(),
63 count: 0,
64 last_doc_id: 0,
65 }
66 }
67
68 pub fn add(&mut self, doc_id: DocId, tf: u32) {
70 let id = doc_id.as_u32();
71 let delta = if self.count == 0 {
72 id
73 } else {
74 debug_assert!(id > self.last_doc_id, "doc IDs must be strictly increasing");
75 id - self.last_doc_id
76 };
77
78 encode_vbyte(delta, &mut self.buf);
79 encode_vbyte(tf, &mut self.buf);
80
81 self.last_doc_id = id;
82 self.count += 1;
83 }
84
85 pub fn finish(self) -> Vec<u8> {
87 let mut result = Vec::with_capacity(5 + self.buf.len());
88 result.extend_from_slice(&self.count.to_le_bytes());
89 result.push(0x00); result.extend_from_slice(&self.buf);
91 result
92 }
93}
94
95impl Default for PostingListWriter {
96 fn default() -> Self {
97 Self::new()
98 }
99}
100
101pub struct PostingListReader<'a> {
108 data: &'a [u8],
109 pos: usize,
110 remaining: u32,
111 current_doc_id: u32,
112 has_positions: bool,
113}
114
115impl<'a> PostingListReader<'a> {
116 pub fn new(data: &'a [u8]) -> Self {
123 let count = if data.len() >= 5 {
124 u32::from_le_bytes([data[0], data[1], data[2], data[3]])
125 } else {
126 0
127 };
128 let has_pos = has_positions(data);
129
130 let pos = if data.len() >= 5 && data[4] == FLAG_BLOCK_MAX {
132 let num_blocks = u16::from_le_bytes(data[5..7].try_into().unwrap()) as usize;
134 7 + num_blocks * 8 } else {
136 5 };
138
139 Self {
140 data,
141 pos,
142 remaining: count,
143 current_doc_id: 0,
144 has_positions: has_pos,
145 }
146 }
147
148 pub fn len(&self) -> u32 {
150 if self.data.len() >= 5 {
151 u32::from_le_bytes([self.data[0], self.data[1], self.data[2], self.data[3]])
152 } else {
153 0
154 }
155 }
156
157 pub fn is_empty(&self) -> bool {
159 self.len() == 0
160 }
161
162 pub fn next(&mut self) -> Option<(DocId, u32)> {
165 if self.remaining == 0 {
166 return None;
167 }
168
169 let (delta, consumed) = decode_vbyte(self.data, self.pos);
170 self.pos += consumed;
171
172 let (tf, consumed) = decode_vbyte(self.data, self.pos);
173 self.pos += consumed;
174
175 if self.has_positions {
177 for _ in 0..tf {
178 let (_, consumed) = decode_vbyte(self.data, self.pos);
179 self.pos += consumed;
180 }
181 }
182
183 self.current_doc_id += delta;
184 self.remaining -= 1;
185
186 Some((DocId(self.current_doc_id), tf))
187 }
188}
189
190pub struct PositionPostingListWriter {
202 buf: Vec<u8>,
203 count: u32,
204 last_doc_id: u32,
205}
206
207const FLAG_HAS_POSITIONS: u8 = 0x01;
209const FLAG_BLOCK_MAX: u8 = 0x02;
210
211const BLOCK_SIZE: usize = 128;
213
214impl PositionPostingListWriter {
215 pub fn new() -> Self {
216 Self {
217 buf: Vec::new(),
218 count: 0,
219 last_doc_id: 0,
220 }
221 }
222
223 pub fn add(&mut self, doc_id: DocId, positions: &[u32]) {
226 let id = doc_id.as_u32();
227 let delta = if self.count == 0 {
228 id
229 } else {
230 debug_assert!(id > self.last_doc_id);
231 id - self.last_doc_id
232 };
233
234 let tf = positions.len() as u32;
235 encode_vbyte(delta, &mut self.buf);
236 encode_vbyte(tf, &mut self.buf);
237
238 let mut last_pos = 0u32;
240 for &pos in positions {
241 encode_vbyte(pos - last_pos, &mut self.buf);
242 last_pos = pos;
243 }
244
245 self.last_doc_id = id;
246 self.count += 1;
247 }
248
249 pub fn finish(self) -> Vec<u8> {
250 let mut result = Vec::with_capacity(5 + self.buf.len());
251 result.extend_from_slice(&self.count.to_le_bytes());
252 result.push(FLAG_HAS_POSITIONS);
253 result.extend_from_slice(&self.buf);
254 result
255 }
256}
257
258impl Default for PositionPostingListWriter {
259 fn default() -> Self {
260 Self::new()
261 }
262}
263
264pub struct PositionPostingListReader<'a> {
270 data: &'a [u8],
271 pos: usize,
272 remaining: u32,
273 current_doc_id: u32,
274 position_buf: Vec<u32>,
276 cached_first_pos: u32,
278 cached_tf: u32,
279}
280
281impl<'a> PositionPostingListReader<'a> {
282 pub fn new(data: &'a [u8]) -> Self {
284 let count = if data.len() >= 5 {
285 u32::from_le_bytes([data[0], data[1], data[2], data[3]])
286 } else {
287 0
288 };
289 Self {
290 data,
291 pos: 5, remaining: count,
293 current_doc_id: 0,
294 position_buf: Vec::new(),
295 cached_first_pos: 0,
296 cached_tf: 0,
297 }
298 }
299
300 pub fn len(&self) -> u32 {
301 if self.data.len() >= 4 {
302 u32::from_le_bytes([self.data[0], self.data[1], self.data[2], self.data[3]])
303 } else {
304 0
305 }
306 }
307
308 pub fn is_empty(&self) -> bool {
309 self.len() == 0
310 }
311
312 pub fn positions(&self) -> &[u32] {
314 &self.position_buf
315 }
316
317 #[inline(always)]
321 pub fn first_position(&self) -> u32 {
322 self.cached_first_pos
323 }
324
325 #[inline(always)]
327 pub fn current_tf(&self) -> u32 {
328 self.cached_tf
329 }
330
331 fn decode_positions(&mut self, tf: u32) {
334 self.cached_tf = tf;
335 let (first_delta, consumed) = decode_vbyte(self.data, self.pos);
336 self.pos += consumed;
337 self.cached_first_pos = first_delta;
338
339 if tf == 1 {
340 self.position_buf.clear();
342 self.position_buf.push(first_delta);
343 } else {
344 self.position_buf.clear();
346 self.position_buf.push(first_delta);
347 let mut last_pos = first_delta;
348 for _ in 1..tf {
349 let (pos_delta, consumed) = decode_vbyte(self.data, self.pos);
350 self.pos += consumed;
351 last_pos += pos_delta;
352 self.position_buf.push(last_pos);
353 }
354 }
355 }
356
357 fn skip_positions(&mut self, tf: u32) {
359 for _ in 0..tf {
360 let (_, consumed) = decode_vbyte(self.data, self.pos);
361 self.pos += consumed;
362 }
363 }
364
365 pub fn next(&mut self) -> Option<(DocId, Vec<u32>)> {
367 if self.remaining == 0 {
368 return None;
369 }
370
371 let (delta, consumed) = decode_vbyte(self.data, self.pos);
372 self.pos += consumed;
373
374 let (tf, consumed) = decode_vbyte(self.data, self.pos);
375 self.pos += consumed;
376
377 self.current_doc_id += delta;
378 self.decode_positions(tf);
379 self.remaining -= 1;
380
381 Some((DocId(self.current_doc_id), self.position_buf.clone()))
382 }
383
384 pub fn advance(&mut self, target: DocId) -> Option<DocId> {
389 let target_val = target.as_u32();
390 while self.remaining > 0 {
391 let (delta, consumed) = decode_vbyte(self.data, self.pos);
392 self.pos += consumed;
393
394 let (tf, consumed) = decode_vbyte(self.data, self.pos);
395 self.pos += consumed;
396
397 self.current_doc_id += delta;
398 self.remaining -= 1;
399
400 if self.current_doc_id >= target_val {
401 self.decode_positions(tf);
402 return Some(DocId(self.current_doc_id));
403 }
404
405 self.skip_positions(tf);
407 }
408 None
409 }
410
411 #[inline(always)]
415 pub fn next_doc(&mut self) -> Option<DocId> {
416 if self.remaining == 0 {
417 return None;
418 }
419
420 let (delta, consumed) = decode_vbyte(self.data, self.pos);
421 self.pos += consumed;
422
423 let (tf, consumed) = decode_vbyte(self.data, self.pos);
424 self.pos += consumed;
425
426 self.current_doc_id += delta;
427 self.cached_tf = tf;
428
429 let (first_delta, consumed) = decode_vbyte(self.data, self.pos);
431 self.pos += consumed;
432 self.cached_first_pos = first_delta;
433
434 if tf > 1 {
435 self.position_buf.clear();
437 self.position_buf.push(first_delta);
438 let mut last_pos = first_delta;
439 for _ in 1..tf {
440 let (pos_delta, consumed) = decode_vbyte(self.data, self.pos);
441 self.pos += consumed;
442 last_pos += pos_delta;
443 self.position_buf.push(last_pos);
444 }
445 }
446 self.remaining -= 1;
449 Some(DocId(self.current_doc_id))
450 }
451
452 pub fn current_doc_id(&self) -> u32 {
454 self.current_doc_id
455 }
456}
457
458pub struct BlockMaxPostingListWriter {
471 entries: Vec<(u32, u32)>, }
473
474impl BlockMaxPostingListWriter {
475 pub fn new() -> Self {
476 Self {
477 entries: Vec::new(),
478 }
479 }
480
481 pub fn add(&mut self, doc_id: DocId, tf: u32) {
482 self.entries.push((doc_id.as_u32(), tf));
483 }
484
485 pub fn finish(self) -> Vec<u8> {
486 let num_docs = self.entries.len() as u32;
487 let num_blocks = if self.entries.is_empty() {
488 0u16
489 } else {
490 ((self.entries.len() + BLOCK_SIZE - 1) / BLOCK_SIZE) as u16
491 };
492
493 let mut block_headers: Vec<(u32, u16, u16)> = Vec::with_capacity(num_blocks as usize);
495 let mut block_data_bufs: Vec<Vec<u8>> = Vec::with_capacity(num_blocks as usize);
496
497 for block_idx in 0..num_blocks as usize {
498 let start = block_idx * BLOCK_SIZE;
499 let end = ((block_idx + 1) * BLOCK_SIZE).min(self.entries.len());
500 let block_entries = &self.entries[start..end];
501
502 let last_doc_id = block_entries.last().unwrap().0;
503 let max_tf = block_entries.iter().map(|e| e.1).max().unwrap();
504 let max_tf_u16 = if max_tf > u16::MAX as u32 {
505 u16::MAX
506 } else {
507 max_tf as u16
508 };
509
510 let mut buf = Vec::new();
512 let base_doc_id = if block_idx == 0 {
513 0u32
514 } else {
515 self.entries[start - 1].0
516 };
517 let mut prev = base_doc_id;
518 for &(doc_id, tf) in block_entries {
519 encode_vbyte(doc_id - prev, &mut buf);
520 encode_vbyte(tf, &mut buf);
521 prev = doc_id;
522 }
523
524 let data_len = buf.len() as u16;
525 block_headers.push((last_doc_id, max_tf_u16, data_len));
526 block_data_bufs.push(buf);
527 }
528
529 let header_bytes = num_blocks as usize * 8;
531 let data_bytes: usize = block_data_bufs.iter().map(|b| b.len()).sum();
532 let mut result = Vec::with_capacity(4 + 1 + 2 + header_bytes + data_bytes);
533
534 result.extend_from_slice(&num_docs.to_le_bytes());
535 result.push(FLAG_BLOCK_MAX);
536 result.extend_from_slice(&num_blocks.to_le_bytes());
537
538 for &(last_doc_id, max_tf, data_len) in &block_headers {
540 result.extend_from_slice(&last_doc_id.to_le_bytes());
541 result.extend_from_slice(&max_tf.to_le_bytes());
542 result.extend_from_slice(&data_len.to_le_bytes());
543 }
544
545 for buf in block_data_bufs {
547 result.extend_from_slice(&buf);
548 }
549
550 result
551 }
552}
553
554impl Default for BlockMaxPostingListWriter {
555 fn default() -> Self {
556 Self::new()
557 }
558}
559
560pub struct BlockMaxPostingListReader<'a> {
564 data: &'a [u8],
565 num_docs: u32,
566 num_blocks: u16,
567 headers_start: usize,
568
569 block_data_offsets: Vec<usize>,
571
572 current_block: u16,
574 pos_in_data: usize,
575 remaining_in_block: u16,
576 current_doc_id: u32,
577 total_remaining: u32,
578}
579
580impl<'a> BlockMaxPostingListReader<'a> {
581 pub fn new(data: &'a [u8]) -> Self {
582 let num_docs = u32::from_le_bytes(data[0..4].try_into().unwrap());
583 debug_assert_eq!(data[4], FLAG_BLOCK_MAX);
584 let num_blocks = u16::from_le_bytes(data[5..7].try_into().unwrap());
585
586 let headers_start = 7;
587 let block_data_start = headers_start + num_blocks as usize * 8;
588
589 let mut block_data_offsets = Vec::with_capacity(num_blocks as usize + 1);
591 let mut offset = block_data_start;
592 for i in 0..num_blocks as usize {
593 block_data_offsets.push(offset);
594 let hdr_pos = headers_start + i * 8;
595 let data_len =
596 u16::from_le_bytes(data[hdr_pos + 6..hdr_pos + 8].try_into().unwrap()) as usize;
597 offset += data_len;
598 }
599 block_data_offsets.push(offset); let first_block_docs = if num_blocks > 0 {
602 let total = num_docs as usize;
603 let _full_blocks = if num_blocks > 1 {
604 (num_blocks as usize - 1) * BLOCK_SIZE
605 } else {
606 0
607 };
608 if num_blocks == 1 {
609 total as u16
610 } else {
611 BLOCK_SIZE as u16
612 }
613 } else {
614 0
615 };
616
617 Self {
618 data,
619 num_docs,
620 num_blocks,
621 headers_start,
622 block_data_offsets,
623 current_block: 0,
624 pos_in_data: block_data_start,
625 remaining_in_block: first_block_docs,
626 current_doc_id: 0,
627 total_remaining: num_docs,
628 }
629 }
630
631 pub fn len(&self) -> u32 {
633 self.num_docs
634 }
635
636 pub fn is_empty(&self) -> bool {
637 self.num_docs == 0
638 }
639
640 pub fn num_blocks(&self) -> u16 {
641 self.num_blocks
642 }
643
644 pub fn block_max_tf(&self, block: u16) -> u16 {
646 let hdr_pos = self.headers_start + block as usize * 8;
647 u16::from_le_bytes(self.data[hdr_pos + 4..hdr_pos + 6].try_into().unwrap())
648 }
649
650 pub fn block_last_doc(&self, block: u16) -> u32 {
652 let hdr_pos = self.headers_start + block as usize * 8;
653 u32::from_le_bytes(self.data[hdr_pos..hdr_pos + 4].try_into().unwrap())
654 }
655
656 fn block_doc_count(&self, block: u16) -> u16 {
658 if (block as usize) < self.num_blocks as usize - 1 {
659 BLOCK_SIZE as u16
660 } else {
661 let full_blocks = (self.num_blocks as usize - 1) * BLOCK_SIZE;
663 (self.num_docs as usize - full_blocks) as u16
664 }
665 }
666
667 pub fn advance_shallow(&mut self, target: DocId) {
674 let target_val = target.as_u32();
675 let mut lo = self.current_block as usize;
677 let mut hi = self.num_blocks as usize;
678 while lo < hi {
679 let mid = lo + (hi - lo) / 2;
680 if self.block_last_doc(mid as u16) < target_val {
681 lo = mid + 1;
682 } else {
683 hi = mid;
684 }
685 }
686 if lo < self.num_blocks as usize {
687 self.current_block = lo as u16;
688 }
689 }
690
691 pub fn advance_to_block(&mut self, target: DocId) {
694 let target_val = target.as_u32();
695
696 let mut lo = self.current_block as usize;
698 let mut hi = self.num_blocks as usize;
699 while lo < hi {
700 let mid = lo + (hi - lo) / 2;
701 if self.block_last_doc(mid as u16) < target_val {
702 lo = mid + 1;
703 } else {
704 hi = mid;
705 }
706 }
707
708 if lo >= self.num_blocks as usize {
709 self.total_remaining = 0;
711 self.remaining_in_block = 0;
712 return;
713 }
714
715 self.seek_to_block(lo as u16);
717 }
718
719 fn seek_to_block(&mut self, block: u16) {
721 if block >= self.num_blocks {
722 self.total_remaining = 0;
723 self.remaining_in_block = 0;
724 return;
725 }
726
727 let docs_before_block = block as usize * BLOCK_SIZE;
729 self.total_remaining = self.num_docs.saturating_sub(docs_before_block as u32);
730 self.remaining_in_block = self.block_doc_count(block);
731 self.current_block = block;
732 self.pos_in_data = self.block_data_offsets[block as usize];
733
734 if block == 0 {
736 self.current_doc_id = 0;
737 } else {
738 self.current_doc_id = self.block_last_doc(block - 1);
739 }
740 }
741
742 pub fn next(&mut self) -> Option<(DocId, u32)> {
744 if self.total_remaining == 0 {
745 return None;
746 }
747
748 if self.remaining_in_block == 0 {
750 let next_block = self.current_block + 1;
751 if next_block >= self.num_blocks {
752 self.total_remaining = 0;
753 return None;
754 }
755 self.current_block = next_block;
756 self.remaining_in_block = self.block_doc_count(next_block);
757 self.pos_in_data = self.block_data_offsets[next_block as usize];
758 }
759
760 let (delta, consumed) = decode_vbyte(self.data, self.pos_in_data);
761 self.pos_in_data += consumed;
762 let (tf, consumed) = decode_vbyte(self.data, self.pos_in_data);
763 self.pos_in_data += consumed;
764
765 self.current_doc_id += delta;
766 self.remaining_in_block -= 1;
767 self.total_remaining -= 1;
768
769 Some((DocId(self.current_doc_id), tf))
770 }
771
772 pub fn current_block_idx(&self) -> u16 {
774 self.current_block
775 }
776}
777
778pub fn has_block_max(data: &[u8]) -> bool {
780 data.len() >= 5 && data[4] == FLAG_BLOCK_MAX
781}
782
783pub fn has_positions(data: &[u8]) -> bool {
785 data.len() >= 5 && data[4] == FLAG_HAS_POSITIONS
786}
787
788#[cfg(test)]
789mod tests {
790 use super::*;
791
792 #[test]
793 fn vbyte_single_byte() {
794 let mut buf = Vec::new();
796 encode_vbyte(0, &mut buf);
797 assert_eq!(buf.len(), 1);
798 let (val, consumed) = decode_vbyte(&buf, 0);
799 assert_eq!(val, 0);
800 assert_eq!(consumed, 1);
801 }
802
803 #[test]
804 fn vbyte_boundary_127() {
805 let mut buf = Vec::new();
806 encode_vbyte(127, &mut buf);
807 assert_eq!(buf.len(), 1);
808 let (val, _) = decode_vbyte(&buf, 0);
809 assert_eq!(val, 127);
810 }
811
812 #[test]
813 fn vbyte_boundary_128() {
814 let mut buf = Vec::new();
815 encode_vbyte(128, &mut buf);
816 assert_eq!(buf.len(), 2);
817 let (val, consumed) = decode_vbyte(&buf, 0);
818 assert_eq!(val, 128);
819 assert_eq!(consumed, 2);
820 }
821
822 #[test]
823 fn vbyte_16383() {
824 let mut buf = Vec::new();
825 encode_vbyte(16383, &mut buf);
826 assert_eq!(buf.len(), 2);
827 let (val, _) = decode_vbyte(&buf, 0);
828 assert_eq!(val, 16383);
829 }
830
831 #[test]
832 fn vbyte_16384() {
833 let mut buf = Vec::new();
834 encode_vbyte(16384, &mut buf);
835 assert_eq!(buf.len(), 3);
836 let (val, _) = decode_vbyte(&buf, 0);
837 assert_eq!(val, 16384);
838 }
839
840 #[test]
841 fn vbyte_large_value() {
842 let mut buf = Vec::new();
843 let val = u32::MAX - 1;
844 encode_vbyte(val, &mut buf);
845 assert_eq!(buf.len(), 5);
846 let (decoded, _) = decode_vbyte(&buf, 0);
847 assert_eq!(decoded, val);
848 }
849
850 #[test]
851 fn vbyte_max_value() {
852 let mut buf = Vec::new();
853 encode_vbyte(u32::MAX, &mut buf);
854 let (decoded, _) = decode_vbyte(&buf, 0);
855 assert_eq!(decoded, u32::MAX);
856 }
857
858 #[test]
859 fn round_trip_single_doc() {
860 let mut writer = PostingListWriter::new();
861 writer.add(DocId(5), 3);
862 let data = writer.finish();
863
864 let mut reader = PostingListReader::new(&data);
865 assert_eq!(reader.len(), 1);
866 assert_eq!(reader.next(), Some((DocId(5), 3)));
867 assert_eq!(reader.next(), None);
868 }
869
870 #[test]
871 fn round_trip_multiple_docs() {
872 let mut writer = PostingListWriter::new();
873 writer.add(DocId(1), 2);
874 writer.add(DocId(5), 1);
875 writer.add(DocId(100), 4);
876 writer.add(DocId(1000), 1);
877 let data = writer.finish();
878
879 let mut reader = PostingListReader::new(&data);
880 assert_eq!(reader.len(), 4);
881 assert_eq!(reader.next(), Some((DocId(1), 2)));
882 assert_eq!(reader.next(), Some((DocId(5), 1)));
883 assert_eq!(reader.next(), Some((DocId(100), 4)));
884 assert_eq!(reader.next(), Some((DocId(1000), 1)));
885 assert_eq!(reader.next(), None);
886 }
887
888 #[test]
889 fn round_trip_consecutive_doc_ids() {
890 let mut writer = PostingListWriter::new();
891 for i in 0..10 {
892 writer.add(DocId(i), 1);
893 }
894 let data = writer.finish();
895
896 let mut reader = PostingListReader::new(&data);
897 assert_eq!(reader.len(), 10);
898 for i in 0..10 {
899 assert_eq!(reader.next(), Some((DocId(i), 1)));
900 }
901 assert_eq!(reader.next(), None);
902 }
903
904 #[test]
905 fn round_trip_varying_tf() {
906 let mut writer = PostingListWriter::new();
907 writer.add(DocId(0), 0);
908 writer.add(DocId(1), 1);
909 writer.add(DocId(2), 127);
910 writer.add(DocId(3), 128);
911 writer.add(DocId(4), 10000);
912 let data = writer.finish();
913
914 let mut reader = PostingListReader::new(&data);
915 assert_eq!(reader.next(), Some((DocId(0), 0)));
916 assert_eq!(reader.next(), Some((DocId(1), 1)));
917 assert_eq!(reader.next(), Some((DocId(2), 127)));
918 assert_eq!(reader.next(), Some((DocId(3), 128)));
919 assert_eq!(reader.next(), Some((DocId(4), 10000)));
920 }
921
922 #[test]
923 fn empty_posting_list() {
924 let writer = PostingListWriter::new();
925 let data = writer.finish();
926
927 let mut reader = PostingListReader::new(&data);
928 assert_eq!(reader.len(), 0);
929 assert!(reader.is_empty());
930 assert_eq!(reader.next(), None);
931 }
932
933 #[test]
934 fn large_posting_list() {
935 let count = 10_000;
936 let mut writer = PostingListWriter::new();
937 for i in 0..count {
938 writer.add(DocId(i * 3), (i % 10) + 1);
939 }
940 let data = writer.finish();
941
942 let mut reader = PostingListReader::new(&data);
943 assert_eq!(reader.len(), count);
944 for i in 0..count {
945 let (doc_id, tf) = reader.next().unwrap();
946 assert_eq!(doc_id, DocId(i * 3));
947 assert_eq!(tf, (i % 10) + 1);
948 }
949 assert_eq!(reader.next(), None);
950 }
951
952 #[test]
953 fn delta_encoding_compresses() {
954 let mut writer = PostingListWriter::new();
956 for i in 0..1000 {
957 writer.add(DocId(i), 1);
958 }
959 let data = writer.finish();
960
961 assert!(data.len() < 3000);
964 }
965
966 #[test]
967 fn wide_gap_doc_ids() {
968 let mut writer = PostingListWriter::new();
969 writer.add(DocId(0), 1);
970 writer.add(DocId(1_000_000), 1);
971 writer.add(DocId(2_000_000), 1);
972 let data = writer.finish();
973
974 let mut reader = PostingListReader::new(&data);
975 assert_eq!(reader.next(), Some((DocId(0), 1)));
976 assert_eq!(reader.next(), Some((DocId(1_000_000), 1)));
977 assert_eq!(reader.next(), Some((DocId(2_000_000), 1)));
978 assert_eq!(reader.next(), None);
979 }
980
981 #[test]
982 fn iterator_exhaustion_is_stable() {
983 let mut writer = PostingListWriter::new();
984 writer.add(DocId(1), 1);
985 let data = writer.finish();
986
987 let mut reader = PostingListReader::new(&data);
988 assert!(reader.next().is_some());
989 assert_eq!(reader.next(), None);
990 assert_eq!(reader.next(), None); assert_eq!(reader.next(), None);
992 }
993
994 #[test]
995 fn doc_id_starting_at_zero() {
996 let mut writer = PostingListWriter::new();
997 writer.add(DocId(0), 5);
998 let data = writer.finish();
999
1000 let mut reader = PostingListReader::new(&data);
1001 assert_eq!(reader.next(), Some((DocId(0), 5)));
1002 }
1003
1004 #[test]
1007 fn position_round_trip_single_doc() {
1008 let mut writer = PositionPostingListWriter::new();
1009 writer.add(DocId(0), &[0, 3, 7]);
1010 let data = writer.finish();
1011
1012 assert!(has_positions(&data));
1013 let mut reader = PositionPostingListReader::new(&data);
1014 assert_eq!(reader.len(), 1);
1015 let (doc_id, positions) = reader.next().unwrap();
1016 assert_eq!(doc_id, DocId(0));
1017 assert_eq!(positions, vec![0, 3, 7]);
1018 assert!(reader.next().is_none());
1019 }
1020
1021 #[test]
1022 fn position_round_trip_multiple_docs() {
1023 let mut writer = PositionPostingListWriter::new();
1024 writer.add(DocId(1), &[0, 1]);
1025 writer.add(DocId(5), &[2, 5, 8]);
1026 writer.add(DocId(10), &[0]);
1027 let data = writer.finish();
1028
1029 let mut reader = PositionPostingListReader::new(&data);
1030 assert_eq!(reader.len(), 3);
1031
1032 let (id, pos) = reader.next().unwrap();
1033 assert_eq!(id, DocId(1));
1034 assert_eq!(pos, vec![0, 1]);
1035
1036 let (id, pos) = reader.next().unwrap();
1037 assert_eq!(id, DocId(5));
1038 assert_eq!(pos, vec![2, 5, 8]);
1039
1040 let (id, pos) = reader.next().unwrap();
1041 assert_eq!(id, DocId(10));
1042 assert_eq!(pos, vec![0]);
1043
1044 assert!(reader.next().is_none());
1045 }
1046
1047 #[test]
1048 fn position_consecutive_positions() {
1049 let mut writer = PositionPostingListWriter::new();
1050 writer.add(DocId(0), &[0, 1, 2, 3, 4]);
1051 let data = writer.finish();
1052
1053 let mut reader = PositionPostingListReader::new(&data);
1054 let (_, pos) = reader.next().unwrap();
1055 assert_eq!(pos, vec![0, 1, 2, 3, 4]);
1056 }
1057
1058 #[test]
1059 fn position_gapped_positions() {
1060 let mut writer = PositionPostingListWriter::new();
1061 writer.add(DocId(0), &[0, 100, 200, 5000]);
1062 let data = writer.finish();
1063
1064 let mut reader = PositionPostingListReader::new(&data);
1065 let (_, pos) = reader.next().unwrap();
1066 assert_eq!(pos, vec![0, 100, 200, 5000]);
1067 }
1068
1069 #[test]
1070 fn position_empty_list() {
1071 let writer = PositionPostingListWriter::new();
1072 let data = writer.finish();
1073
1074 let mut reader = PositionPostingListReader::new(&data);
1075 assert_eq!(reader.len(), 0);
1076 assert!(reader.is_empty());
1077 assert!(reader.next().is_none());
1078 }
1079
1080 #[test]
1081 fn position_single_position() {
1082 let mut writer = PositionPostingListWriter::new();
1083 writer.add(DocId(42), &[7]);
1084 let data = writer.finish();
1085
1086 let mut reader = PositionPostingListReader::new(&data);
1087 let (id, pos) = reader.next().unwrap();
1088 assert_eq!(id, DocId(42));
1089 assert_eq!(pos, vec![7]);
1090 }
1091
1092 #[test]
1093 fn position_many_docs() {
1094 let mut writer = PositionPostingListWriter::new();
1095 for i in 0..1000u32 {
1096 writer.add(DocId(i), &[i * 2, i * 2 + 1]);
1097 }
1098 let data = writer.finish();
1099
1100 let mut reader = PositionPostingListReader::new(&data);
1101 assert_eq!(reader.len(), 1000);
1102 for i in 0..1000u32 {
1103 let (id, pos) = reader.next().unwrap();
1104 assert_eq!(id, DocId(i));
1105 assert_eq!(pos, vec![i * 2, i * 2 + 1]);
1106 }
1107 assert!(reader.next().is_none());
1108 }
1109
1110 #[test]
1111 fn has_positions_flag() {
1112 let mut pw = PositionPostingListWriter::new();
1114 pw.add(DocId(0), &[0]);
1115 let pdata = pw.finish();
1116 assert!(has_positions(&pdata));
1117
1118 let mut w = PostingListWriter::new();
1120 w.add(DocId(0), 1);
1121 let data = w.finish();
1122 assert!(!has_positions(&data));
1123 }
1124
1125 #[test]
1126 fn position_exhaustion_stable() {
1127 let mut writer = PositionPostingListWriter::new();
1128 writer.add(DocId(0), &[0]);
1129 let data = writer.finish();
1130
1131 let mut reader = PositionPostingListReader::new(&data);
1132 assert!(reader.next().is_some());
1133 assert!(reader.next().is_none());
1134 assert!(reader.next().is_none());
1135 }
1136
1137 #[test]
1140 fn block_max_single_doc() {
1141 let mut writer = BlockMaxPostingListWriter::new();
1142 writer.add(DocId(5), 3);
1143 let data = writer.finish();
1144
1145 assert!(has_block_max(&data));
1146 let mut reader = BlockMaxPostingListReader::new(&data);
1147 assert_eq!(reader.len(), 1);
1148 assert_eq!(reader.num_blocks(), 1);
1149 assert_eq!(reader.block_last_doc(0), 5);
1150 assert_eq!(reader.block_max_tf(0), 3);
1151 assert_eq!(reader.next(), Some((DocId(5), 3)));
1152 assert_eq!(reader.next(), None);
1153 }
1154
1155 #[test]
1156 fn block_max_under_block_size() {
1157 let mut writer = BlockMaxPostingListWriter::new();
1158 for i in 0..50 {
1159 writer.add(DocId(i * 2), (i % 5) + 1);
1160 }
1161 let data = writer.finish();
1162
1163 let mut reader = BlockMaxPostingListReader::new(&data);
1164 assert_eq!(reader.len(), 50);
1165 assert_eq!(reader.num_blocks(), 1);
1166 assert_eq!(reader.block_last_doc(0), 98);
1167 assert_eq!(reader.block_max_tf(0), 5); for i in 0..50 {
1170 assert_eq!(reader.next(), Some((DocId(i * 2), (i % 5) + 1)));
1171 }
1172 assert_eq!(reader.next(), None);
1173 }
1174
1175 #[test]
1176 fn block_max_exact_block_size() {
1177 let mut writer = BlockMaxPostingListWriter::new();
1178 for i in 0..128 {
1179 writer.add(DocId(i), 1);
1180 }
1181 let data = writer.finish();
1182
1183 let mut reader = BlockMaxPostingListReader::new(&data);
1184 assert_eq!(reader.len(), 128);
1185 assert_eq!(reader.num_blocks(), 1);
1186 assert_eq!(reader.block_last_doc(0), 127);
1187
1188 for i in 0..128 {
1189 assert_eq!(reader.next(), Some((DocId(i), 1)));
1190 }
1191 assert_eq!(reader.next(), None);
1192 }
1193
1194 #[test]
1195 fn block_max_multi_block() {
1196 let mut writer = BlockMaxPostingListWriter::new();
1198 for i in 0..300u32 {
1199 writer.add(DocId(i), (i % 10) + 1);
1200 }
1201 let data = writer.finish();
1202
1203 let mut reader = BlockMaxPostingListReader::new(&data);
1204 assert_eq!(reader.len(), 300);
1205 assert_eq!(reader.num_blocks(), 3);
1206
1207 assert_eq!(reader.block_last_doc(0), 127);
1209 assert_eq!(reader.block_max_tf(0), 10); assert_eq!(reader.block_last_doc(1), 255);
1213
1214 assert_eq!(reader.block_last_doc(2), 299);
1216
1217 for i in 0..300u32 {
1219 assert_eq!(reader.next(), Some((DocId(i), (i % 10) + 1)));
1220 }
1221 assert_eq!(reader.next(), None);
1222 }
1223
1224 #[test]
1225 fn block_max_advance_to_block() {
1226 let mut writer = BlockMaxPostingListWriter::new();
1227 for i in 0..300u32 {
1228 writer.add(DocId(i * 3), 1);
1229 }
1230 let data = writer.finish();
1231
1232 let mut reader = BlockMaxPostingListReader::new(&data);
1233 reader.advance_to_block(DocId(400));
1239 let (doc, _) = reader.next().unwrap();
1241 assert_eq!(doc, DocId(384)); reader.advance_to_block(DocId(800));
1245 let (doc, _) = reader.next().unwrap();
1246 assert_eq!(doc, DocId(768)); }
1248
1249 #[test]
1250 fn block_max_advance_past_end() {
1251 let mut writer = BlockMaxPostingListWriter::new();
1252 for i in 0..10 {
1253 writer.add(DocId(i), 1);
1254 }
1255 let data = writer.finish();
1256
1257 let mut reader = BlockMaxPostingListReader::new(&data);
1258 reader.advance_to_block(DocId(100));
1259 assert_eq!(reader.next(), None);
1260 }
1261
1262 #[test]
1263 fn block_max_empty() {
1264 let writer = BlockMaxPostingListWriter::new();
1265 let data = writer.finish();
1266
1267 let mut reader = BlockMaxPostingListReader::new(&data);
1268 assert_eq!(reader.len(), 0);
1269 assert!(reader.is_empty());
1270 assert_eq!(reader.num_blocks(), 0);
1271 assert_eq!(reader.next(), None);
1272 }
1273
1274 #[test]
1275 fn block_max_large_tf_clamped() {
1276 let mut writer = BlockMaxPostingListWriter::new();
1277 writer.add(DocId(0), 70000); let data = writer.finish();
1279
1280 let mut reader = BlockMaxPostingListReader::new(&data);
1281 assert_eq!(reader.block_max_tf(0), u16::MAX); assert_eq!(reader.next(), Some((DocId(0), 70000)));
1284 }
1285
1286 #[test]
1287 fn block_max_flag_detected() {
1288 let mut bm = BlockMaxPostingListWriter::new();
1289 bm.add(DocId(0), 1);
1290 let bm_data = bm.finish();
1291 assert!(has_block_max(&bm_data));
1292 assert!(!has_positions(&bm_data));
1293
1294 let mut basic = PostingListWriter::new();
1295 basic.add(DocId(0), 1);
1296 let basic_data = basic.finish();
1297 assert!(!has_block_max(&basic_data));
1298 }
1299}