1use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
15use std::io::{self, Read, Write};
16
/// Minimum number of values per partition considered by the
/// partition-optimization DP in `find_optimal_partitions`.
const MIN_PARTITION_SIZE: usize = 64;

/// Maximum number of values per partition considered by the DP.
const MAX_PARTITION_SIZE: usize = 512;
/// One Elias-Fano-encoded partition of a sorted `u32` sequence.
/// Values are stored as offsets from `first_value`, split into explicit
/// low bits and a unary-coded high part.
#[derive(Debug, Clone)]
pub struct EFPartition {
    /// Packed low `lower_bit_width` bits of each relative value.
    lower_bits: Vec<u64>,
    /// Unary-coded high parts: the i-th one-bit sits at position
    /// `i + (relative_value >> lower_bit_width)`.
    upper_bits: Vec<u64>,
    /// Number of encoded values.
    len: u32,
    /// First (smallest) stored value; all offsets are relative to it.
    first_value: u32,
    /// Last (largest) stored value.
    last_value: u32,
    /// Number of low bits kept explicitly per value.
    lower_bit_width: u8,
}
39
40impl EFPartition {
41 pub fn from_sorted_slice(values: &[u32]) -> Self {
44 if values.is_empty() {
45 return Self {
46 lower_bits: Vec::new(),
47 upper_bits: Vec::new(),
48 len: 0,
49 first_value: 0,
50 last_value: 0,
51 lower_bit_width: 0,
52 };
53 }
54
55 let first_value = values[0];
56 let last_value = *values.last().unwrap();
57 let n = values.len() as u64;
58
59 let local_universe = last_value - first_value + 1;
61
62 let lower_bit_width = if n <= 1 {
64 0
65 } else {
66 let ratio = (local_universe as u64).max(1) / n.max(1);
67 if ratio <= 1 {
68 0
69 } else {
70 (64 - ratio.leading_zeros() - 1) as u8
71 }
72 };
73
74 let lower_bits_total = (n as usize) * (lower_bit_width as usize);
76 let lower_words = lower_bits_total.div_ceil(64);
77 let mut lower_bits = vec![0u64; lower_words];
78
79 let max_relative = local_universe.saturating_sub(1) as u64;
80 let upper_bound = n + (max_relative >> lower_bit_width) + 1;
81 let upper_words = (upper_bound as usize).div_ceil(64);
82 let mut upper_bits = vec![0u64; upper_words];
83
84 let lower_mask = if lower_bit_width == 0 {
85 0
86 } else {
87 (1u64 << lower_bit_width) - 1
88 };
89
90 for (i, &val) in values.iter().enumerate() {
92 let relative_val = (val - first_value) as u64;
93
94 if lower_bit_width > 0 {
96 let lower = relative_val & lower_mask;
97 let bit_pos = i * (lower_bit_width as usize);
98 let word_idx = bit_pos / 64;
99 let bit_offset = bit_pos % 64;
100
101 lower_bits[word_idx] |= lower << bit_offset;
102 if bit_offset + (lower_bit_width as usize) > 64 && word_idx + 1 < lower_bits.len() {
103 lower_bits[word_idx + 1] |= lower >> (64 - bit_offset);
104 }
105 }
106
107 let upper = relative_val >> lower_bit_width;
109 let upper_pos = (i as u64) + upper;
110 let word_idx = (upper_pos / 64) as usize;
111 let bit_offset = upper_pos % 64;
112 if word_idx < upper_bits.len() {
113 upper_bits[word_idx] |= 1u64 << bit_offset;
114 }
115 }
116
117 Self {
118 lower_bits,
119 upper_bits,
120 len: values.len() as u32,
121 first_value,
122 last_value,
123 lower_bit_width,
124 }
125 }
126
    /// Returns the `i`-th value (0-based), or `None` if `i >= len`.
    #[inline]
    pub fn get(&self, i: u32) -> Option<u32> {
        if i >= self.len {
            return None;
        }

        let i = i as usize;

        // Extract the explicit low bits; a value may straddle two u64 words.
        let lower = if self.lower_bit_width == 0 {
            0u64
        } else {
            let bit_pos = i * (self.lower_bit_width as usize);
            let word_idx = bit_pos / 64;
            let bit_offset = bit_pos % 64;
            let lower_mask = (1u64 << self.lower_bit_width) - 1;

            let mut val = (self.lower_bits[word_idx] >> bit_offset) & lower_mask;
            if bit_offset + (self.lower_bit_width as usize) > 64
                && word_idx + 1 < self.lower_bits.len()
            {
                val |= (self.lower_bits[word_idx + 1] << (64 - bit_offset)) & lower_mask;
            }
            val
        };

        // Position of the i-th set bit minus i recovers the high part.
        let select_pos = self.select1(i as u32)?;
        let upper = (select_pos as u64) - (i as u64);

        let relative_val = (upper << self.lower_bit_width) | lower;
        Some(self.first_value + relative_val as u32)
    }
162
    /// Returns the bit position of the `(i + 1)`-th set bit in `upper_bits`,
    /// or `None` if `i >= len`. Scans four-word groups with NEON popcounts
    /// on aarch64 and hardware popcnt on x86_64 (when compiled in), falling
    /// back to a scalar scan otherwise.
    fn select1(&self, i: u32) -> Option<u32> {
        if i >= self.len {
            return None;
        }

        // One-bits still to skip, and the bit position scanned so far.
        let mut remaining = i + 1;
        let mut pos = 0u32;

        #[cfg(target_arch = "aarch64")]
        {
            use std::arch::aarch64::*;

            let chunks = self.upper_bits.chunks_exact(4);
            let remainder = chunks.remainder();

            for chunk in chunks {
                let words: [u64; 4] = [chunk[0], chunk[1], chunk[2], chunk[3]];

                // SAFETY: `bytes` is a 32-byte stack array; both vld1q_u8
                // loads (offsets 0 and 16) stay within it, and NEON byte
                // loads carry no alignment requirement.
                unsafe {
                    let bytes = std::mem::transmute::<[u64; 4], [u8; 32]>(words);

                    // Popcount the 256-bit group in two 128-bit halves.
                    let v0 = vld1q_u8(bytes.as_ptr());
                    let cnt0 = vcntq_u8(v0);
                    let sum0 = vaddlvq_u8(cnt0) as u32;

                    let v1 = vld1q_u8(bytes.as_ptr().add(16));
                    let cnt1 = vcntq_u8(v1);
                    let sum1 = vaddlvq_u8(cnt1) as u32;

                    let total_popcount = sum0 + sum1;

                    // Target bit lies in this group: locate it word by word
                    // (the inner loop always returns in this case).
                    if total_popcount >= remaining {
                        for &word in chunk {
                            let popcount = word.count_ones();
                            if popcount >= remaining {
                                // Clear the lowest (remaining - 1) set bits;
                                // the target becomes the lowest set bit.
                                let mut w = word;
                                for _ in 0..remaining - 1 {
                                    w &= w - 1;
                                }
                                return Some(pos + w.trailing_zeros());
                            }
                            remaining -= popcount;
                            pos += 64;
                        }
                    }
                    remaining -= total_popcount;
                    pos += 256;
                }
            }

            // Fewer than four words left: plain scalar scan.
            for &word in remainder {
                let popcount = word.count_ones();
                if popcount >= remaining {
                    let mut w = word;
                    for _ in 0..remaining - 1 {
                        w &= w - 1;
                    }
                    return Some(pos + w.trailing_zeros());
                }
                remaining -= popcount;
                pos += 64;
            }
        }

        #[cfg(target_arch = "x86_64")]
        {
            #[cfg(target_feature = "popcnt")]
            {
                use std::arch::x86_64::*;

                let chunks = self.upper_bits.chunks_exact(4);
                let remainder = chunks.remainder();

                for chunk in chunks {
                    // SAFETY: `_popcnt64` requires the `popcnt` CPU feature,
                    // which this branch is compiled under.
                    unsafe {
                        let p0 = _popcnt64(chunk[0] as i64) as u32;
                        let p1 = _popcnt64(chunk[1] as i64) as u32;
                        let p2 = _popcnt64(chunk[2] as i64) as u32;
                        let p3 = _popcnt64(chunk[3] as i64) as u32;

                        let total_popcount = p0 + p1 + p2 + p3;

                        // Target bit lies in this 4-word group.
                        if total_popcount >= remaining {
                            for &word in chunk {
                                let popcount = word.count_ones();
                                if popcount >= remaining {
                                    let mut w = word;
                                    for _ in 0..remaining - 1 {
                                        w &= w - 1;
                                    }
                                    return Some(pos + w.trailing_zeros());
                                }
                                remaining -= popcount;
                                pos += 64;
                            }
                        }
                        remaining -= total_popcount;
                        pos += 256;
                    }
                }

                for &word in remainder {
                    let popcount = word.count_ones();
                    if popcount >= remaining {
                        let mut w = word;
                        for _ in 0..remaining - 1 {
                            w &= w - 1;
                        }
                        return Some(pos + w.trailing_zeros());
                    }
                    remaining -= popcount;
                    pos += 64;
                }
            }

            // Built without the popcnt feature: scalar scan over all words.
            #[cfg(not(target_feature = "popcnt"))]
            {
                for &word in &self.upper_bits {
                    let popcount = word.count_ones();
                    if popcount >= remaining {
                        let mut w = word;
                        for _ in 0..remaining - 1 {
                            w &= w - 1;
                        }
                        return Some(pos + w.trailing_zeros());
                    }
                    remaining -= popcount;
                    pos += 64;
                }
            }
        }

        // Architectures without a specialized path: scalar scan.
        #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
        {
            for &word in &self.upper_bits {
                let popcount = word.count_ones();
                if popcount >= remaining {
                    let mut w = word;
                    for _ in 0..remaining - 1 {
                        w &= w - 1;
                    }
                    return Some(pos + w.trailing_zeros());
                }
                remaining -= popcount;
                pos += 64;
            }
        }

        None
    }
328
329 pub fn next_geq(&self, target: u32) -> Option<(u32, u32)> {
332 if self.len == 0 || target > self.last_value {
333 return None;
334 }
335
336 if target <= self.first_value {
337 return Some((0, self.first_value));
338 }
339
340 let mut lo = 0u32;
342 let mut hi = self.len;
343
344 while lo < hi {
345 let mid = lo + (hi - lo) / 2;
346 if let Some(val) = self.get(mid) {
347 if val < target {
348 lo = mid + 1;
349 } else {
350 hi = mid;
351 }
352 } else {
353 break;
354 }
355 }
356
357 if lo < self.len {
358 self.get(lo).map(|v| (lo, v))
359 } else {
360 None
361 }
362 }
363
364 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
366 writer.write_u32::<LittleEndian>(self.len)?;
367 writer.write_u32::<LittleEndian>(self.first_value)?;
368 writer.write_u32::<LittleEndian>(self.last_value)?;
369 writer.write_u8(self.lower_bit_width)?;
371
372 writer.write_u32::<LittleEndian>(self.lower_bits.len() as u32)?;
373 for &word in &self.lower_bits {
374 writer.write_u64::<LittleEndian>(word)?;
375 }
376
377 writer.write_u32::<LittleEndian>(self.upper_bits.len() as u32)?;
378 for &word in &self.upper_bits {
379 writer.write_u64::<LittleEndian>(word)?;
380 }
381
382 Ok(())
383 }
384
    /// Reads a partition previously written by `serialize`.
    ///
    /// NOTE(review): the word counts come straight from the input and feed
    /// `Vec::with_capacity`; a corrupt or hostile stream could request a
    /// large allocation before the subsequent reads fail — confirm inputs
    /// are trusted.
    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
        let len = reader.read_u32::<LittleEndian>()?;
        let first_value = reader.read_u32::<LittleEndian>()?;
        let last_value = reader.read_u32::<LittleEndian>()?;
        let lower_bit_width = reader.read_u8()?;

        let lower_len = reader.read_u32::<LittleEndian>()? as usize;
        let mut lower_bits = Vec::with_capacity(lower_len);
        for _ in 0..lower_len {
            lower_bits.push(reader.read_u64::<LittleEndian>()?);
        }

        let upper_len = reader.read_u32::<LittleEndian>()? as usize;
        let mut upper_bits = Vec::with_capacity(upper_len);
        for _ in 0..upper_len {
            upper_bits.push(reader.read_u64::<LittleEndian>()?);
        }

        Ok(Self {
            lower_bits,
            upper_bits,
            len,
            first_value,
            last_value,
            lower_bit_width,
        })
    }
414
415 pub fn size_bytes(&self) -> usize {
417 13 + self.lower_bits.len() * 8 + self.upper_bits.len() * 8
418 }
419}
420
/// Estimates the serialized cost, in bytes, of encoding `values` as a single
/// Elias-Fano partition. Used as the edge weight by the partition DP; it is
/// a heuristic, not an exact size.
fn estimate_partition_cost(values: &[u32]) -> usize {
    if values.is_empty() {
        return 0;
    }

    let n = values.len();
    let first = values[0];
    let last = *values.last().unwrap();
    // Computed in usize: `last - first + 1` would overflow u32 when the
    // slice spans the full u32 range.
    let local_universe = (last as usize) - (first as usize) + 1;

    // Approximate low-bit cost: n * ceil(log2(universe / n)).
    let lower_bits = if n <= 1 || local_universe <= n {
        0
    } else {
        let ratio = local_universe / n;
        let l = (usize::BITS - ratio.leading_zeros()) as usize;
        n * l
    };

    // The unary upper part takes at most 2 bits per element.
    let upper_bits = 2 * n;

    // Fixed per-partition header: 13 bytes of metadata.
    let overhead = 13 * 8;

    (lower_bits + upper_bits + overhead).div_ceil(8)
}
449
450fn find_optimal_partitions(values: &[u32]) -> Vec<usize> {
453 let n = values.len();
454 if n <= MIN_PARTITION_SIZE {
455 return vec![n];
456 }
457
458 let mut dp = vec![usize::MAX; n + 1];
461 let mut parent = vec![0usize; n + 1];
462 dp[0] = 0;
463
464 for i in MIN_PARTITION_SIZE..=n {
465 let min_start = i.saturating_sub(MAX_PARTITION_SIZE);
467 let max_start = i.saturating_sub(MIN_PARTITION_SIZE);
468
469 for start in min_start..=max_start {
470 if dp[start] == usize::MAX {
471 continue;
472 }
473
474 let partition_cost = estimate_partition_cost(&values[start..i]);
475 let total_cost = dp[start].saturating_add(partition_cost);
476
477 if total_cost < dp[i] {
478 dp[i] = total_cost;
479 parent[i] = start;
480 }
481 }
482 }
483
484 if dp[n] == usize::MAX {
486 return vec![n];
487 }
488
489 let mut boundaries = Vec::new();
491 let mut pos = n;
492 while pos > 0 {
493 boundaries.push(pos);
494 pos = parent[pos];
495 }
496 boundaries.reverse();
497
498 boundaries
499}
500
/// Partitioned Elias-Fano encoding of a sorted `u32` sequence: the input is
/// split into variable-sized partitions chosen by a size-minimizing DP, and
/// each partition is compressed independently.
#[derive(Debug, Clone)]
pub struct PartitionedEliasFano {
    /// Independently encoded partitions, in value order.
    partitions: Vec<EFPartition>,
    /// Total number of stored values.
    len: u32,
    /// `cumulative_counts[i]` = number of values in partitions `0..=i`
    /// (strictly increasing; maps global indices to partitions).
    cumulative_counts: Vec<u32>,
}
511
512impl PartitionedEliasFano {
513 pub fn from_sorted_slice(values: &[u32]) -> Self {
515 if values.is_empty() {
516 return Self {
517 partitions: Vec::new(),
518 len: 0,
519 cumulative_counts: Vec::new(),
520 };
521 }
522
523 let boundaries = find_optimal_partitions(values);
524 let mut partitions = Vec::with_capacity(boundaries.len());
525 let mut cumulative_counts = Vec::with_capacity(boundaries.len());
526
527 let mut start = 0;
528 let mut cumulative = 0u32;
529
530 for &end in &boundaries {
531 let partition = EFPartition::from_sorted_slice(&values[start..end]);
532 cumulative += partition.len;
533 cumulative_counts.push(cumulative);
534 partitions.push(partition);
535 start = end;
536 }
537
538 Self {
539 partitions,
540 len: values.len() as u32,
541 cumulative_counts,
542 }
543 }
544
    /// Total number of stored values.
    #[inline]
    pub fn len(&self) -> u32 {
        self.len
    }

    /// True when the structure holds no values.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// Number of Elias-Fano partitions backing the structure.
    pub fn num_partitions(&self) -> usize {
        self.partitions.len()
    }
561
562 pub fn get(&self, pos: u32) -> Option<u32> {
564 if pos >= self.len {
565 return None;
566 }
567
568 let partition_idx = self
570 .cumulative_counts
571 .binary_search(&(pos + 1))
572 .unwrap_or_else(|x| x);
573
574 if partition_idx >= self.partitions.len() {
575 return None;
576 }
577
578 let local_pos = if partition_idx == 0 {
579 pos
580 } else {
581 pos - self.cumulative_counts[partition_idx - 1]
582 };
583
584 self.partitions[partition_idx].get(local_pos)
585 }
586
    /// Returns `(global_index, value)` of the first element `>= target`.
    ///
    /// Binary-searches for a partition whose `[first_value, last_value]`
    /// range could contain `target` (partition ranges are sorted and
    /// disjoint, so the partial comparator is consistent), then scans
    /// forward until a partition yields a match.
    pub fn next_geq(&self, target: u32) -> Option<(u32, u32)> {
        if self.partitions.is_empty() {
            return None;
        }

        // Err(x) is the first partition with first_value > target, which is
        // exactly where the forward scan should start.
        let partition_idx = self
            .partitions
            .binary_search_by(|p| {
                if p.last_value < target {
                    std::cmp::Ordering::Less
                } else if p.first_value > target {
                    std::cmp::Ordering::Greater
                } else {
                    std::cmp::Ordering::Equal
                }
            })
            .unwrap_or_else(|x| x);

        for (i, partition) in self.partitions[partition_idx..].iter().enumerate() {
            let actual_idx = partition_idx + i;

            if let Some((local_pos, val)) = partition.next_geq(target) {
                // Convert the partition-local index to a global index.
                let global_pos = if actual_idx == 0 {
                    local_pos
                } else {
                    self.cumulative_counts[actual_idx - 1] + local_pos
                };
                return Some((global_pos, val));
            }
        }

        None
    }
624
625 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
627 writer.write_u32::<LittleEndian>(self.len)?;
628 writer.write_u32::<LittleEndian>(self.partitions.len() as u32)?;
629
630 for &count in &self.cumulative_counts {
631 writer.write_u32::<LittleEndian>(count)?;
632 }
633
634 for partition in &self.partitions {
635 partition.serialize(writer)?;
636 }
637
638 Ok(())
639 }
640
    /// Reads a structure previously written by `serialize`.
    ///
    /// NOTE(review): `num_partitions` is read from the input and used for
    /// `Vec::with_capacity`; hostile input could request a large allocation
    /// before reads fail — confirm streams are trusted.
    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
        let len = reader.read_u32::<LittleEndian>()?;
        let num_partitions = reader.read_u32::<LittleEndian>()? as usize;

        let mut cumulative_counts = Vec::with_capacity(num_partitions);
        for _ in 0..num_partitions {
            cumulative_counts.push(reader.read_u32::<LittleEndian>()?);
        }

        let mut partitions = Vec::with_capacity(num_partitions);
        for _ in 0..num_partitions {
            partitions.push(EFPartition::deserialize(reader)?);
        }

        Ok(Self {
            partitions,
            len,
            cumulative_counts,
        })
    }
662
663 pub fn size_bytes(&self) -> usize {
665 let mut size = 8 + self.cumulative_counts.len() * 4;
666 for p in &self.partitions {
667 size += p.size_bytes();
668 }
669 size
670 }
671
    /// Returns an iterator over all stored values, in ascending order.
    pub fn iter(&self) -> PartitionedEFIterator<'_> {
        PartitionedEFIterator {
            pef: self,
            partition_idx: 0,
            local_pos: 0,
        }
    }
680}
681
/// Iterator over every value of a `PartitionedEliasFano`, in order.
pub struct PartitionedEFIterator<'a> {
    /// Structure being iterated.
    pef: &'a PartitionedEliasFano,
    /// Index of the partition currently being read.
    partition_idx: usize,
    /// Position within the current partition.
    local_pos: u32,
}
688
impl<'a> Iterator for PartitionedEFIterator<'a> {
    type Item = u32;

    /// Yields values in order, hopping to the next partition once the
    /// current one is fully consumed.
    fn next(&mut self) -> Option<Self::Item> {
        if self.partition_idx >= self.pef.partitions.len() {
            return None;
        }

        let partition = &self.pef.partitions[self.partition_idx];
        if let Some(val) = partition.get(self.local_pos) {
            self.local_pos += 1;
            // Advance to the next partition when this one is exhausted.
            if self.local_pos >= partition.len {
                self.partition_idx += 1;
                self.local_pos = 0;
            }
            Some(val)
        } else {
            None
        }
    }

    /// Exact size hint: total length minus the current global position.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let current_global = if self.partition_idx == 0 {
            self.local_pos
        } else if self.partition_idx < self.pef.cumulative_counts.len() {
            self.pef.cumulative_counts[self.partition_idx - 1] + self.local_pos
        } else {
            // Past the last partition: everything has been yielded.
            self.pef.len
        };
        let remaining = (self.pef.len - current_global) as usize;
        (remaining, Some(remaining))
    }
}
722
/// Skip/score metadata for one fixed-size block of postings.
#[derive(Debug, Clone)]
pub struct PEFBlockInfo {
    /// First document id in the block.
    pub first_doc_id: u32,
    /// Last document id in the block.
    pub last_doc_id: u32,
    /// Largest term frequency within the block.
    pub max_tf: u32,
    /// BM25 upper bound for any document in the block.
    pub max_block_score: f32,
    /// Index of the `EFPartition` containing the block's first posting.
    pub partition_idx: u16,
    /// Offset of that first posting within its partition.
    pub local_start: u32,
    /// Number of postings in the block (at most `PEF_BLOCK_SIZE`).
    pub num_docs: u16,
}
741
impl PEFBlockInfo {
    /// Writes all fields in declaration order, little-endian.
    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        writer.write_u32::<LittleEndian>(self.first_doc_id)?;
        writer.write_u32::<LittleEndian>(self.last_doc_id)?;
        writer.write_u32::<LittleEndian>(self.max_tf)?;
        writer.write_f32::<LittleEndian>(self.max_block_score)?;
        writer.write_u16::<LittleEndian>(self.partition_idx)?;
        writer.write_u32::<LittleEndian>(self.local_start)?;
        writer.write_u16::<LittleEndian>(self.num_docs)?;
        Ok(())
    }

    /// Reads fields in the same order `serialize` wrote them.
    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
        Ok(Self {
            first_doc_id: reader.read_u32::<LittleEndian>()?,
            last_doc_id: reader.read_u32::<LittleEndian>()?,
            max_tf: reader.read_u32::<LittleEndian>()?,
            max_block_score: reader.read_f32::<LittleEndian>()?,
            partition_idx: reader.read_u16::<LittleEndian>()?,
            local_start: reader.read_u32::<LittleEndian>()?,
            num_docs: reader.read_u16::<LittleEndian>()?,
        })
    }
}
766
/// Number of postings grouped into each block-max metadata entry.
pub const PEF_BLOCK_SIZE: usize = 128;
769
/// A complete posting list: Elias-Fano-compressed document ids, bit-packed
/// term frequencies, and per-block BM25 metadata for block-max pruning.
#[derive(Debug, Clone)]
pub struct PartitionedEFPostingList {
    /// Compressed, sorted document ids.
    pub doc_ids: PartitionedEliasFano,
    /// Term frequencies, stored as `tf - 1` packed at `tf_bits` bits each.
    pub term_freqs: Vec<u8>,
    /// Bits per packed term-frequency entry (0 when every tf is 1).
    pub tf_bits: u8,
    /// Largest term frequency in the list.
    pub max_tf: u32,
    /// Per-block skip/score metadata (one entry per `PEF_BLOCK_SIZE` docs).
    pub blocks: Vec<PEFBlockInfo>,
    /// Largest per-block BM25 upper bound across the list.
    pub max_score: f32,
}
786
787impl PartitionedEFPostingList {
    /// BM25 k1 parameter (term-frequency saturation).
    const K1: f32 = 1.2;
    /// BM25 b parameter (length-normalization strength).
    const B: f32 = 0.75;

    /// Upper bound on the BM25 contribution of a term with maximum
    /// frequency `max_tf`: uses the minimal length norm `1 - b`, which
    /// maximizes the tf component, so the result dominates any real
    /// document's score.
    #[inline]
    pub fn compute_bm25_upper_bound(max_tf: u32, idf: f32) -> f32 {
        let tf = max_tf as f32;
        let min_length_norm = 1.0 - Self::B;
        let tf_norm = (tf * (Self::K1 + 1.0)) / (tf + Self::K1 * min_length_norm);
        idf * tf_norm
    }
798
    /// Builds a posting list with a neutral idf of 1.0; see
    /// `from_postings_with_idf`.
    pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32]) -> Self {
        Self::from_postings_with_idf(doc_ids, term_freqs, 1.0)
    }
803
    /// Builds a posting list from parallel doc-id / term-frequency slices.
    ///
    /// Doc ids are Elias-Fano compressed; term frequencies are stored as
    /// `tf - 1` bit-packed at `tf_bits` bits each; postings are grouped
    /// into `PEF_BLOCK_SIZE`-sized blocks carrying BM25 upper bounds for
    /// block-max-style pruning.
    ///
    /// # Panics
    /// Panics if `doc_ids` and `term_freqs` differ in length.
    pub fn from_postings_with_idf(doc_ids: &[u32], term_freqs: &[u32], idf: f32) -> Self {
        assert_eq!(doc_ids.len(), term_freqs.len());

        if doc_ids.is_empty() {
            return Self {
                doc_ids: PartitionedEliasFano::from_sorted_slice(&[]),
                term_freqs: Vec::new(),
                tf_bits: 0,
                max_tf: 0,
                blocks: Vec::new(),
                max_score: 0.0,
            };
        }

        let pef_doc_ids = PartitionedEliasFano::from_sorted_slice(doc_ids);

        let max_tf = *term_freqs.iter().max().unwrap();
        let tf_bits = if max_tf == 0 {
            0
        } else {
            (32 - max_tf.leading_zeros()) as u8
        };

        let total_bits = doc_ids.len() * (tf_bits as usize);
        let total_bytes = total_bits.div_ceil(8);
        let mut packed_tfs = vec![0u8; total_bytes];

        if tf_bits > 0 {
            for (i, &tf) in term_freqs.iter().enumerate() {
                let bit_pos = i * (tf_bits as usize);
                let byte_idx = bit_pos / 8;
                let bit_offset = bit_pos % 8;

                // Stored as tf - 1 so that tf == 1 packs to all-zero bits.
                let val = tf.saturating_sub(1);

                // NOTE(review): at most three bytes are written per value,
                // so widths where bit_offset + tf_bits > 24 would lose
                // bits; assumes max_tf stays well below 2^17 — confirm
                // upstream callers guarantee this.
                packed_tfs[byte_idx] |= (val as u8) << bit_offset;
                if bit_offset + (tf_bits as usize) > 8 && byte_idx + 1 < packed_tfs.len() {
                    packed_tfs[byte_idx + 1] |= (val >> (8 - bit_offset)) as u8;
                }
                if bit_offset + (tf_bits as usize) > 16 && byte_idx + 2 < packed_tfs.len() {
                    packed_tfs[byte_idx + 2] |= (val >> (16 - bit_offset)) as u8;
                }
            }
        }

        let mut blocks = Vec::new();
        let mut max_score = 0.0f32;
        let mut i = 0;

        while i < doc_ids.len() {
            let block_end = (i + PEF_BLOCK_SIZE).min(doc_ids.len());
            let block_doc_ids = &doc_ids[i..block_end];
            let block_tfs = &term_freqs[i..block_end];

            let block_max_tf = *block_tfs.iter().max().unwrap_or(&1);
            let block_score = Self::compute_bm25_upper_bound(block_max_tf, idf);
            max_score = max_score.max(block_score);

            // Record where this block starts inside the partitioned
            // structure so seeks can jump straight to it.
            let (partition_idx, local_start) =
                Self::find_partition_position(&pef_doc_ids, i as u32);

            blocks.push(PEFBlockInfo {
                first_doc_id: block_doc_ids[0],
                last_doc_id: *block_doc_ids.last().unwrap(),
                max_tf: block_max_tf,
                max_block_score: block_score,
                partition_idx: partition_idx as u16,
                local_start,
                num_docs: (block_end - i) as u16,
            });

            i = block_end;
        }

        Self {
            doc_ids: pef_doc_ids,
            term_freqs: packed_tfs,
            tf_bits,
            max_tf,
            blocks,
            max_score,
        }
    }
892
893 fn find_partition_position(pef: &PartitionedEliasFano, global_pos: u32) -> (usize, u32) {
894 let partition_idx = pef
895 .cumulative_counts
896 .binary_search(&(global_pos + 1))
897 .unwrap_or_else(|x| x);
898
899 let local_pos = if partition_idx == 0 {
900 global_pos
901 } else {
902 global_pos - pef.cumulative_counts[partition_idx - 1]
903 };
904
905 (partition_idx, local_pos)
906 }
907
908 pub fn get_tf(&self, pos: u32) -> u32 {
910 if self.tf_bits == 0 || pos >= self.doc_ids.len() {
911 return 1;
912 }
913
914 let bit_pos = (pos as usize) * (self.tf_bits as usize);
915 let byte_idx = bit_pos / 8;
916 let bit_offset = bit_pos % 8;
917 let mask = (1u32 << self.tf_bits) - 1;
918
919 let mut val = (self.term_freqs[byte_idx] >> bit_offset) as u32;
920 if bit_offset + (self.tf_bits as usize) > 8 && byte_idx + 1 < self.term_freqs.len() {
921 val |= (self.term_freqs[byte_idx + 1] as u32) << (8 - bit_offset);
922 }
923 if bit_offset + (self.tf_bits as usize) > 16 && byte_idx + 2 < self.term_freqs.len() {
924 val |= (self.term_freqs[byte_idx + 2] as u32) << (16 - bit_offset);
925 }
926
927 (val & mask) + 1
928 }
929
    /// Number of postings in the list.
    pub fn len(&self) -> u32 {
        self.doc_ids.len()
    }

    /// True when the list holds no postings.
    pub fn is_empty(&self) -> bool {
        self.doc_ids.is_empty()
    }
939
940 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
942 self.doc_ids.serialize(writer)?;
943 writer.write_u8(self.tf_bits)?;
944 writer.write_u32::<LittleEndian>(self.max_tf)?;
945 writer.write_f32::<LittleEndian>(self.max_score)?;
946 writer.write_u32::<LittleEndian>(self.term_freqs.len() as u32)?;
947 writer.write_all(&self.term_freqs)?;
948
949 writer.write_u32::<LittleEndian>(self.blocks.len() as u32)?;
950 for block in &self.blocks {
951 block.serialize(writer)?;
952 }
953
954 Ok(())
955 }
956
    /// Reads a posting list previously written by `serialize`, in the same
    /// field order.
    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
        let doc_ids = PartitionedEliasFano::deserialize(reader)?;
        let tf_bits = reader.read_u8()?;
        let max_tf = reader.read_u32::<LittleEndian>()?;
        let max_score = reader.read_f32::<LittleEndian>()?;
        let tf_len = reader.read_u32::<LittleEndian>()? as usize;
        let mut term_freqs = vec![0u8; tf_len];
        reader.read_exact(&mut term_freqs)?;

        let num_blocks = reader.read_u32::<LittleEndian>()? as usize;
        let mut blocks = Vec::with_capacity(num_blocks);
        for _ in 0..num_blocks {
            blocks.push(PEFBlockInfo::deserialize(reader)?);
        }

        Ok(Self {
            doc_ids,
            term_freqs,
            tf_bits,
            max_tf,
            blocks,
            max_score,
        })
    }
982
    /// Creates a cursor positioned at the first posting.
    pub fn iterator(&self) -> PartitionedEFPostingIterator<'_> {
        PartitionedEFPostingIterator {
            list: self,
            pos: 0,
            current_block: 0,
        }
    }
991
    /// Returns `(actual_partitioned_bytes, estimated_plain_ef_bytes)` so
    /// callers can report how much partitioning saves versus a single-span
    /// Elias-Fano encoding of the same doc ids.
    pub fn compression_info(&self) -> (usize, usize) {
        let pef_size = self.doc_ids.size_bytes();
        let n = self.len() as usize;
        // Largest doc id, taken from the last block's metadata.
        let max_val = if let Some(last_block) = self.blocks.last() {
            last_block.last_doc_id
        } else {
            0
        };
        let ef_size = if n > 0 {
            // Low-bit width a non-partitioned encoding would roughly use.
            let l = if n <= 1 {
                0
            } else {
                let ratio = (max_val as usize + 1) / n;
                if ratio <= 1 {
                    0
                } else {
                    (usize::BITS - ratio.leading_zeros()) as usize
                }
            };
            // n*l low bits + ~2n upper bits, plus a small fixed header.
            (n * l + 2 * n).div_ceil(8) + 16
        } else {
            0
        };
        (pef_size, ef_size)
    }
1019}
1020
/// Cursor over a `PartitionedEFPostingList`, tracking the current posting
/// index and the block containing it.
pub struct PartitionedEFPostingIterator<'a> {
    /// Posting list being traversed.
    list: &'a PartitionedEFPostingList,
    /// Current global posting index.
    pos: u32,
    /// Index of the block containing `pos`.
    current_block: usize,
}
1027
1028impl<'a> PartitionedEFPostingIterator<'a> {
    /// Current document id, or `u32::MAX` when the cursor is exhausted.
    pub fn doc(&self) -> u32 {
        self.list.doc_ids.get(self.pos).unwrap_or(u32::MAX)
    }

    /// Term frequency of the current posting (1 when out of range).
    pub fn term_freq(&self) -> u32 {
        self.list.get_tf(self.pos)
    }
1038
1039 pub fn advance(&mut self) -> u32 {
1041 self.pos += 1;
1042 if !self.list.blocks.is_empty() {
1043 let new_block = (self.pos as usize) / PEF_BLOCK_SIZE;
1044 if new_block < self.list.blocks.len() {
1045 self.current_block = new_block;
1046 }
1047 }
1048 self.doc()
1049 }
1050
    /// Advances to the first posting with doc id `>= target` and returns it
    /// (`u32::MAX` when the list is exhausted). The cursor never moves
    /// backwards: a `target` before the current doc leaves `pos` unchanged.
    pub fn seek(&mut self, target: u32) -> u32 {
        if !self.list.blocks.is_empty() {
            // Block-level skip: find the first block (at or after the
            // current one) whose doc-id range could contain `target`.
            let block_idx = self.list.blocks[self.current_block..].binary_search_by(|b| {
                if b.last_doc_id < target {
                    std::cmp::Ordering::Less
                } else if b.first_doc_id > target {
                    std::cmp::Ordering::Greater
                } else {
                    std::cmp::Ordering::Equal
                }
            });

            let target_block = match block_idx {
                Ok(idx) => self.current_block + idx,
                Err(idx) => {
                    let abs_idx = self.current_block + idx;
                    if abs_idx >= self.list.blocks.len() {
                        // Target is beyond the last block: exhaust.
                        self.pos = self.list.len();
                        return u32::MAX;
                    }
                    abs_idx
                }
            };

            if target_block > self.current_block {
                self.current_block = target_block;
                self.pos = (target_block * PEF_BLOCK_SIZE) as u32;
            }
        }

        // Fine-grained skip via the Elias-Fano structure; accept the result
        // only if it does not move the cursor backwards.
        if let Some((pos, val)) = self.list.doc_ids.next_geq(target)
            && pos >= self.pos
        {
            self.pos = pos;
            if !self.list.blocks.is_empty() {
                self.current_block = (pos as usize) / PEF_BLOCK_SIZE;
            }
            return val;
        }

        // Fallback: linear scan from the current position.
        while self.pos < self.list.len() {
            let doc = self.doc();
            if doc >= target {
                return doc;
            }
            self.pos += 1;
        }

        u32::MAX
    }
1105
    /// True once the cursor has moved past the last posting.
    pub fn is_exhausted(&self) -> bool {
        self.pos >= self.list.len()
    }
1110
1111 pub fn current_block_max_score(&self) -> f32 {
1113 if self.is_exhausted() || self.list.blocks.is_empty() {
1114 return 0.0;
1115 }
1116 if self.current_block < self.list.blocks.len() {
1117 self.list.blocks[self.current_block].max_block_score
1118 } else {
1119 0.0
1120 }
1121 }
1122
1123 pub fn current_block_max_tf(&self) -> u32 {
1125 if self.is_exhausted() || self.list.blocks.is_empty() {
1126 return 0;
1127 }
1128 if self.current_block < self.list.blocks.len() {
1129 self.list.blocks[self.current_block].max_tf
1130 } else {
1131 0
1132 }
1133 }
1134
1135 pub fn max_remaining_score(&self) -> f32 {
1137 if self.is_exhausted() || self.list.blocks.is_empty() {
1138 return 0.0;
1139 }
1140 self.list.blocks[self.current_block..]
1141 .iter()
1142 .map(|b| b.max_block_score)
1143 .fold(0.0f32, |a, b| a.max(b))
1144 }
1145
    /// Advances block by block until one with `last_doc_id >= target` is
    /// found; returns that block's `(first_doc_id, max_block_score)` and
    /// repositions `pos` at the block start. Returns `None` (and exhausts
    /// the cursor) when no such block exists.
    pub fn skip_to_block_with_doc(&mut self, target: u32) -> Option<(u32, f32)> {
        if self.list.blocks.is_empty() {
            return None;
        }

        while self.current_block < self.list.blocks.len() {
            let block = &self.list.blocks[self.current_block];
            if block.last_doc_id >= target {
                self.pos = (self.current_block * PEF_BLOCK_SIZE) as u32;
                return Some((block.first_doc_id, block.max_block_score));
            }
            self.current_block += 1;
        }

        self.pos = self.list.len();
        None
    }
1165}
1166
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_ef_partition_basic() {
        let values = vec![10, 20, 30, 40, 50];
        let partition = EFPartition::from_sorted_slice(&values);

        assert_eq!(partition.len, 5);
        assert_eq!(partition.first_value, 10);
        assert_eq!(partition.last_value, 50);

        // Every element must round-trip through get().
        for (idx, &expected) in values.iter().enumerate() {
            assert_eq!(partition.get(idx as u32), Some(expected));
        }
    }

    #[test]
    fn test_ef_partition_next_geq() {
        let values = vec![10, 20, 30, 100, 200, 300];
        let partition = EFPartition::from_sorted_slice(&values);

        assert_eq!(partition.next_geq(5), Some((0, 10)));
        assert_eq!(partition.next_geq(10), Some((0, 10)));
        assert_eq!(partition.next_geq(15), Some((1, 20)));
        assert_eq!(partition.next_geq(100), Some((3, 100)));
        assert_eq!(partition.next_geq(301), None);
    }

    #[test]
    fn test_partitioned_ef_basic() {
        let values: Vec<u32> = (0..500).map(|i| i * 2).collect();
        let pef = PartitionedEliasFano::from_sorted_slice(&values);

        assert_eq!(pef.len(), 500);
        assert!(pef.num_partitions() >= 1);

        for (idx, &expected) in values.iter().enumerate() {
            assert_eq!(pef.get(idx as u32), Some(expected), "Mismatch at {}", idx);
        }
    }

    #[test]
    fn test_partitioned_ef_next_geq() {
        let values: Vec<u32> = (0..1000).map(|i| i * 3).collect();
        let pef = PartitionedEliasFano::from_sorted_slice(&values);

        assert_eq!(pef.next_geq(0), Some((0, 0)));
        // 102 is the first multiple of 3 at or above 100, at index 34.
        assert_eq!(pef.next_geq(100), Some((34, 102)));
        assert_eq!(pef.next_geq(1500), Some((500, 1500)));
        assert_eq!(pef.next_geq(3000), None);
    }

    #[test]
    fn test_partitioned_ef_serialization() {
        let values: Vec<u32> = (0..500).map(|i| i * 5).collect();
        let pef = PartitionedEliasFano::from_sorted_slice(&values);

        let mut buffer = Vec::new();
        pef.serialize(&mut buffer).unwrap();
        let restored = PartitionedEliasFano::deserialize(&mut &buffer[..]).unwrap();

        assert_eq!(restored.len(), pef.len());
        assert_eq!(restored.num_partitions(), pef.num_partitions());

        // Full round-trip: every position must decode identically.
        for idx in 0..pef.len() {
            assert_eq!(restored.get(idx), pef.get(idx));
        }
    }

    #[test]
    fn test_partitioned_ef_posting_list() {
        let doc_ids: Vec<u32> = (0..300).map(|i| i * 2).collect();
        let term_freqs: Vec<u32> = (0..300).map(|i| (i % 10) + 1).collect();

        let list = PartitionedEFPostingList::from_postings(&doc_ids, &term_freqs);
        assert_eq!(list.len(), 300);

        let mut iter = list.iterator();
        for (idx, (&want_doc, &want_tf)) in doc_ids.iter().zip(term_freqs.iter()).enumerate() {
            assert_eq!(iter.doc(), want_doc, "Doc mismatch at {}", idx);
            assert_eq!(iter.term_freq(), want_tf, "TF mismatch at {}", idx);
            iter.advance();
        }
    }

    #[test]
    fn test_partitioned_ef_seek() {
        let doc_ids: Vec<u32> = (0..500).map(|i| i * 3).collect();
        let term_freqs: Vec<u32> = vec![1; 500];

        let list = PartitionedEFPostingList::from_postings(&doc_ids, &term_freqs);
        let mut iter = list.iterator();

        // 102 is the first multiple of 3 at or above 100.
        assert_eq!(iter.seek(100), 102);
        assert_eq!(iter.seek(300), 300);
        assert_eq!(iter.seek(1500), u32::MAX);
    }

    #[test]
    fn test_compression_improvement() {
        // Dense first half (step 2) followed by a sparse half (step 100):
        // the optimizer should split these regimes into separate partitions.
        let values: Vec<u32> = (0..10000)
            .map(|i| {
                if i < 5000 {
                    i * 2
                } else {
                    10000 + (i - 5000) * 100
                }
            })
            .collect();

        let pef = PartitionedEliasFano::from_sorted_slice(&values);

        assert!(
            pef.num_partitions() > 1,
            "Expected multiple partitions, got {}",
            pef.num_partitions()
        );

        for (idx, &expected) in values.iter().enumerate() {
            assert_eq!(pef.get(idx as u32), Some(expected));
        }
    }

    #[test]
    fn test_partitioned_ef_block_max() {
        let doc_ids: Vec<u32> = (0..500).map(|i| i * 2).collect();
        // Four 128-doc blocks with max tf 1, 5, 10, 3 respectively.
        let term_freqs: Vec<u32> = (0..500)
            .map(|i| {
                if i < 128 {
                    1
                } else if i < 256 {
                    5
                } else if i < 384 {
                    10
                } else {
                    3
                }
            })
            .collect();

        let list = PartitionedEFPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 2.0);

        assert_eq!(list.blocks.len(), 4);
        assert_eq!(list.blocks[0].max_tf, 1);
        assert_eq!(list.blocks[1].max_tf, 5);
        assert_eq!(list.blocks[2].max_tf, 10);
        assert_eq!(list.blocks[3].max_tf, 3);

        // Block 2 holds the highest tf, hence the highest upper bound.
        assert!(list.blocks[2].max_block_score > list.blocks[0].max_block_score);
        assert!(list.blocks[2].max_block_score > list.blocks[1].max_block_score);
        assert!(list.blocks[2].max_block_score > list.blocks[3].max_block_score);
        assert_eq!(list.max_score, list.blocks[2].max_block_score);

        let mut iter = list.iterator();
        assert_eq!(iter.current_block_max_tf(), 1);
        iter.seek(256);
        assert_eq!(iter.current_block_max_tf(), 5);
        iter.seek(512);
        assert_eq!(iter.current_block_max_tf(), 10);

        let mut iter2 = list.iterator();
        let result = iter2.skip_to_block_with_doc(300);
        assert!(result.is_some());
        let (first_doc, score) = result.unwrap();
        assert!(first_doc <= 300);
        assert!(score > 0.0);

        let mut iter3 = list.iterator();
        let max_score = iter3.max_remaining_score();
        assert_eq!(max_score, list.max_score);

        iter3.seek(768);
        let remaining = iter3.max_remaining_score();
        assert!(remaining < max_score);
    }
}