1use std::ops::Range;
15mod bitmap;
19mod encoded_array;
20mod index;
21pub mod segment;
22mod serde;
23pub mod version;
24
25use deepsize::DeepSizeOf;
26pub use index::FragmentRowIdIndex;
28pub use index::RowIdIndex;
29use lance_core::{
30 utils::mask::{RowAddrMask, RowAddrTreeMap},
31 Error, Result,
32};
33use lance_io::ReadBatchParams;
34pub use serde::{read_row_ids, write_row_ids};
35
36use snafu::location;
37
38use crate::utils::LanceIteratorExtension;
39use lance_core::utils::mask::RowSetOps;
40use segment::U64Segment;
41use tracing::instrument;
42
43#[derive(Debug, Clone, DeepSizeOf, PartialEq, Eq, Default)]
56pub struct RowIdSequence(Vec<U64Segment>);
57
58impl std::fmt::Display for RowIdSequence {
59 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
60 let mut iter = self.iter();
61 let mut first_10 = Vec::new();
62 let mut last_10 = Vec::new();
63 for row_id in iter.by_ref() {
64 first_10.push(row_id);
65 if first_10.len() > 10 {
66 break;
67 }
68 }
69
70 while let Some(row_id) = iter.next_back() {
71 last_10.push(row_id);
72 if last_10.len() > 10 {
73 break;
74 }
75 }
76 last_10.reverse();
77
78 let theres_more = iter.next().is_some();
79
80 write!(f, "[")?;
81 for row_id in first_10 {
82 write!(f, "{}", row_id)?;
83 }
84 if theres_more {
85 write!(f, ", ...")?;
86 }
87 for row_id in last_10 {
88 write!(f, ", {}", row_id)?;
89 }
90 write!(f, "]")
91 }
92}
93
94impl From<Range<u64>> for RowIdSequence {
95 fn from(range: Range<u64>) -> Self {
96 Self(vec![U64Segment::Range(range)])
97 }
98}
99
100impl From<&[u64]> for RowIdSequence {
101 fn from(row_ids: &[u64]) -> Self {
102 Self(vec![U64Segment::from_slice(row_ids)])
103 }
104}
105
106impl RowIdSequence {
107 pub fn new() -> Self {
108 Self::default()
109 }
110
111 pub fn iter(&self) -> impl DoubleEndedIterator<Item = u64> + '_ {
112 self.0.iter().flat_map(|segment| segment.iter())
113 }
114
115 pub fn len(&self) -> u64 {
116 self.0.iter().map(|segment| segment.len() as u64).sum()
117 }
118
119 pub fn is_empty(&self) -> bool {
120 self.0.is_empty()
121 }
122
123 pub fn extend(&mut self, other: Self) {
125 if let (Some(U64Segment::Range(range1)), Some(U64Segment::Range(range2))) =
129 (self.0.last(), other.0.first())
130 {
131 if range1.end == range2.start {
132 let new_range = U64Segment::Range(range1.start..range2.end);
133 self.0.pop();
134 self.0.push(new_range);
135 self.0.extend(other.0.into_iter().skip(1));
136 return;
137 }
138 }
139 self.0.extend(other.0);
141 }
142
143 pub fn delete(&mut self, row_ids: impl IntoIterator<Item = u64>) {
145 let (row_ids, offsets) = self.find_ids(row_ids);
147
148 let capacity = self.0.capacity();
149 let old_segments = std::mem::replace(&mut self.0, Vec::with_capacity(capacity));
150 let mut remaining_segments = old_segments.as_slice();
151
152 for (segment_idx, range) in offsets {
153 let segments_handled = old_segments.len() - remaining_segments.len();
154 let segments_to_add = segment_idx - segments_handled;
155 self.0
156 .extend_from_slice(&remaining_segments[..segments_to_add]);
157 remaining_segments = &remaining_segments[segments_to_add..];
158
159 let segment;
160 (segment, remaining_segments) = remaining_segments.split_first().unwrap();
161
162 let segment_ids = &row_ids[range];
163 self.0.push(segment.delete(segment_ids));
164 }
165
166 self.0.extend_from_slice(remaining_segments);
168 }
169
170 pub fn mask(&mut self, positions: impl IntoIterator<Item = u32>) -> Result<()> {
172 let mut local_positions = Vec::new();
173 let mut positions_iter = positions.into_iter();
174 let mut curr_position = positions_iter.next();
175 let mut offset = 0;
176 let mut cutoff = 0;
177
178 for segment in &mut self.0 {
179 cutoff += segment.len() as u32;
181 while let Some(position) = curr_position {
182 if position >= cutoff {
183 break;
184 }
185 local_positions.push(position - offset);
186 curr_position = positions_iter.next();
187 }
188
189 if !local_positions.is_empty() {
190 segment.mask(&local_positions);
191 local_positions.clear();
192 }
193 offset = cutoff;
194 }
195
196 self.0.retain(|segment| !segment.is_empty());
197
198 Ok(())
199 }
200
201 fn find_ids(
207 &self,
208 row_ids: impl IntoIterator<Item = u64>,
209 ) -> (Vec<u64>, Vec<(usize, Range<usize>)>) {
210 let mut segment_iter = self.0.iter().enumerate().cycle();
214
215 let mut segment_matches = vec![Vec::new(); self.0.len()];
216
217 row_ids.into_iter().for_each(|row_id| {
218 let mut i = 0;
219 while i < self.0.len() {
221 let (segment_idx, segment) = segment_iter.next().unwrap();
222 if segment.range().is_some_and(|range| range.contains(&row_id)) {
223 if let Some(offset) = segment.position(row_id) {
224 segment_matches.get_mut(segment_idx).unwrap().push(offset);
225 }
226 }
228 i += 1;
229 }
230 });
231 for matches in &mut segment_matches {
232 matches.sort_unstable();
233 }
234
235 let mut offset = 0;
236 let segment_ranges = segment_matches
237 .iter()
238 .enumerate()
239 .filter(|(_, matches)| !matches.is_empty())
240 .map(|(segment_idx, matches)| {
241 let range = offset..offset + matches.len();
242 offset += matches.len();
243 (segment_idx, range)
244 })
245 .collect();
246 let row_ids = segment_matches
247 .into_iter()
248 .enumerate()
249 .flat_map(|(segment_idx, offset)| {
250 offset
251 .into_iter()
252 .map(move |offset| self.0[segment_idx].get(offset).unwrap())
253 })
254 .collect();
255
256 (row_ids, segment_ranges)
257 }
258
259 pub fn slice(&self, offset: usize, len: usize) -> RowIdSeqSlice<'_> {
260 if len == 0 {
261 return RowIdSeqSlice {
262 segments: &[],
263 offset_start: 0,
264 offset_last: 0,
265 };
266 }
267
268 let mut offset_start = offset;
270 let mut segment_offset = 0;
271 for segment in &self.0 {
272 let segment_len = segment.len();
273 if offset_start < segment_len {
274 break;
275 }
276 offset_start -= segment_len;
277 segment_offset += 1;
278 }
279
280 let mut offset_last = offset_start + len;
282 let mut segment_offset_last = segment_offset;
283 for segment in &self.0[segment_offset..] {
284 let segment_len = segment.len();
285 if offset_last <= segment_len {
286 break;
287 }
288 offset_last -= segment_len;
289 segment_offset_last += 1;
290 }
291
292 RowIdSeqSlice {
293 segments: &self.0[segment_offset..=segment_offset_last],
294 offset_start,
295 offset_last,
296 }
297 }
298
299 pub fn get(&self, index: usize) -> Option<u64> {
303 let mut offset = 0;
304 for segment in &self.0 {
305 let segment_len = segment.len();
306 if index < offset + segment_len {
307 return segment.get(index - offset);
308 }
309 offset += segment_len;
310 }
311 None
312 }
313
314 pub fn select<'a>(
322 &'a self,
323 selection: impl Iterator<Item = usize> + 'a,
324 ) -> impl Iterator<Item = u64> + 'a {
325 let mut seg_iter = self.0.iter();
326 let mut cur_seg = seg_iter.next();
327 let mut rows_passed = 0;
328 let mut cur_seg_len = cur_seg.map(|seg| seg.len()).unwrap_or(0);
329 let mut last_index = 0;
330 selection.filter_map(move |index| {
331 if index < last_index {
332 panic!("Selection is not sorted");
333 }
334 last_index = index;
335
336 cur_seg?;
337
338 while (index - rows_passed) >= cur_seg_len {
339 rows_passed += cur_seg_len;
340 cur_seg = seg_iter.next();
341 if let Some(cur_seg) = cur_seg {
342 cur_seg_len = cur_seg.len();
343 } else {
344 return None;
345 }
346 }
347
348 Some(cur_seg.unwrap().get(index - rows_passed).unwrap())
349 })
350 }
351
352 #[instrument(level = "debug", skip_all)]
367 pub fn mask_to_offset_ranges(&self, mask: &RowAddrMask) -> Vec<Range<u64>> {
368 let mut offset = 0;
369 let mut ranges = Vec::new();
370 for segment in &self.0 {
371 match segment {
372 U64Segment::Range(range) => {
373 let mut ids = RowAddrTreeMap::from(range.clone());
374 ids.mask(mask);
375 ranges.extend(GroupingIterator::new(
376 unsafe { ids.into_addr_iter() }.map(|addr| addr - range.start + offset),
377 ));
378 offset += range.end - range.start;
379 }
380 U64Segment::RangeWithHoles { range, holes } => {
381 let offset_start = offset;
382 let mut ids = RowAddrTreeMap::from(range.clone());
383 offset += range.end - range.start;
384 for hole in holes.iter() {
385 if ids.remove(hole) {
386 offset -= 1;
387 }
388 }
389 ids.mask(mask);
390
391 let mut sorted_holes = holes.clone().into_iter().collect::<Vec<_>>();
393 sorted_holes.sort_unstable();
394 let mut next_holes_iter = sorted_holes.into_iter().peekable();
395 let mut holes_passed = 0;
396 ranges.extend(GroupingIterator::new(unsafe { ids.into_addr_iter() }.map(
397 |addr| {
398 while let Some(next_hole) = next_holes_iter.peek() {
399 if *next_hole < addr {
400 next_holes_iter.next();
401 holes_passed += 1;
402 } else {
403 break;
404 }
405 }
406 addr - range.start + offset_start - holes_passed
407 },
408 )));
409 }
410 U64Segment::RangeWithBitmap { range, bitmap } => {
411 let mut ids = RowAddrTreeMap::from(range.clone());
412 let offset_start = offset;
413 offset += range.end - range.start;
414 for (i, val) in range.clone().enumerate() {
415 if !bitmap.get(i) && ids.remove(val) {
416 offset -= 1;
417 }
418 }
419 ids.mask(mask);
420 let mut bitmap_iter = bitmap.iter();
421 let mut bitmap_iter_pos = 0;
422 let mut holes_passed = 0;
423 ranges.extend(GroupingIterator::new(unsafe { ids.into_addr_iter() }.map(
424 |addr| {
425 let offset_no_holes = addr - range.start + offset_start;
426 while bitmap_iter_pos < offset_no_holes {
427 if !bitmap_iter.next().unwrap() {
428 holes_passed += 1;
429 }
430 bitmap_iter_pos += 1;
431 }
432 offset_no_holes - holes_passed
433 },
434 )));
435 }
436 U64Segment::SortedArray(array) | U64Segment::Array(array) => {
437 ranges.extend(GroupingIterator::new(array.iter().enumerate().filter_map(
439 |(off, id)| {
440 if mask.selected(id) {
441 Some(off as u64 + offset)
442 } else {
443 None
444 }
445 },
446 )));
447 offset += array.len() as u64;
448 }
449 }
450 }
451 ranges
452 }
453}
454
455struct GroupingIterator<I: Iterator<Item = u64>> {
460 iter: I,
461 cur_range: Option<Range<u64>>,
462}
463
464impl<I: Iterator<Item = u64>> GroupingIterator<I> {
465 fn new(iter: I) -> Self {
466 Self {
467 iter,
468 cur_range: None,
469 }
470 }
471}
472
473impl<I: Iterator<Item = u64>> Iterator for GroupingIterator<I> {
474 type Item = Range<u64>;
475
476 fn next(&mut self) -> Option<Self::Item> {
477 for id in self.iter.by_ref() {
478 if let Some(range) = self.cur_range.as_mut() {
479 if range.end == id {
480 range.end = id + 1;
481 } else {
482 let ret = Some(range.clone());
483 self.cur_range = Some(id..id + 1);
484 return ret;
485 }
486 } else {
487 self.cur_range = Some(id..id + 1);
488 }
489 }
490 self.cur_range.take()
491 }
492}
493
494impl From<&RowIdSequence> for RowAddrTreeMap {
495 fn from(row_ids: &RowIdSequence) -> Self {
496 let mut tree_map = Self::new();
497 for segment in &row_ids.0 {
498 match segment {
499 U64Segment::Range(range) => {
500 tree_map.insert_range(range.clone());
501 }
502 U64Segment::RangeWithBitmap { range, bitmap } => {
503 tree_map.insert_range(range.clone());
504 for (i, val) in range.clone().enumerate() {
505 if !bitmap.get(i) {
506 tree_map.remove(val);
507 }
508 }
509 }
510 U64Segment::RangeWithHoles { range, holes } => {
511 tree_map.insert_range(range.clone());
512 for hole in holes.iter() {
513 tree_map.remove(hole);
514 }
515 }
516 U64Segment::SortedArray(array) | U64Segment::Array(array) => {
517 for val in array.iter() {
518 tree_map.insert(val);
519 }
520 }
521 }
522 }
523 tree_map
524 }
525}
526
527#[derive(Debug)]
528pub struct RowIdSeqSlice<'a> {
529 segments: &'a [U64Segment],
531 offset_start: usize,
533 offset_last: usize,
535}
536
537impl RowIdSeqSlice<'_> {
538 pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
539 let mut known_size = self.segments.iter().map(|segment| segment.len()).sum();
540 known_size -= self.offset_start;
541 known_size -= self.segments.last().map(|s| s.len()).unwrap_or_default() - self.offset_last;
542
543 let end = self.segments.len().saturating_sub(1);
544 self.segments
545 .iter()
546 .enumerate()
547 .flat_map(move |(i, segment)| {
548 match i {
549 0 if self.segments.len() == 1 => {
550 let len = self.offset_last - self.offset_start;
551 Box::new(segment.iter().skip(self.offset_start).take(len))
554 as Box<dyn Iterator<Item = u64>>
555 }
556 0 => Box::new(segment.iter().skip(self.offset_start)),
557 i if i == end => Box::new(segment.iter().take(self.offset_last)),
558 _ => Box::new(segment.iter()),
559 }
560 })
561 .exact_size(known_size)
562 }
563}
564
565pub fn rechunk_sequences(
579 sequences: impl IntoIterator<Item = RowIdSequence>,
580 chunk_sizes: impl IntoIterator<Item = u64>,
581 allow_incomplete: bool,
582) -> Result<Vec<RowIdSequence>> {
583 let chunk_sizes_vec: Vec<u64> = chunk_sizes.into_iter().collect();
585 let total_chunks = chunk_sizes_vec.len();
586 let mut chunked_sequences = Vec::with_capacity(total_chunks);
587 let mut segment_iter = sequences
588 .into_iter()
589 .flat_map(|sequence| sequence.0.into_iter())
590 .peekable();
591
592 let too_few_segments_error = |chunk_index: usize, expected_chunk_size: u64, remaining: u64| {
593 Error::invalid_input(
594 format!(
595 "Got too few segments for chunk {}. Expected chunk size: {}, remaining needed: {}",
596 chunk_index, expected_chunk_size, remaining
597 ),
598 location!(),
599 )
600 };
601
602 let too_many_segments_error = |processed_chunks: usize, total_chunk_sizes: usize| {
603 Error::invalid_input(
604 format!(
605 "Got too many segments for the provided chunk lengths. Processed {} chunks out of {} expected",
606 processed_chunks, total_chunk_sizes
607 ),
608 location!(),
609 )
610 };
611
612 let mut segment_offset = 0_u64;
613
614 for (chunk_index, chunk_size) in chunk_sizes_vec.iter().enumerate() {
615 let chunk_size = *chunk_size;
616 let mut sequence = RowIdSequence(Vec::new());
617 let mut remaining = chunk_size;
618
619 while remaining > 0 {
620 let remaining_in_segment = segment_iter
621 .peek()
622 .map_or(0, |segment| segment.len() as u64 - segment_offset);
623
624 if remaining_in_segment == 0 {
626 if segment_iter.next().is_some() {
627 segment_offset = 0;
628 continue;
629 } else {
630 if allow_incomplete {
632 break;
633 } else {
634 return Err(too_few_segments_error(chunk_index, chunk_size, remaining));
635 }
636 }
637 }
638
639 match remaining_in_segment.cmp(&remaining) {
641 std::cmp::Ordering::Greater => {
642 let segment = segment_iter
644 .peek()
645 .ok_or_else(|| too_few_segments_error(chunk_index, chunk_size, remaining))?
646 .slice(segment_offset as usize, remaining as usize);
647 sequence.extend(RowIdSequence(vec![segment]));
648 segment_offset += remaining;
649 remaining = 0;
650 }
651 std::cmp::Ordering::Equal | std::cmp::Ordering::Less => {
652 let segment = segment_iter
656 .next()
657 .ok_or_else(|| too_few_segments_error(chunk_index, chunk_size, remaining))?
658 .slice(segment_offset as usize, remaining_in_segment as usize);
659 sequence.extend(RowIdSequence(vec![segment]));
660 segment_offset = 0;
661 remaining -= remaining_in_segment;
662 }
663 }
664 }
665
666 chunked_sequences.push(sequence);
667 }
668
669 if segment_iter.peek().is_some() {
670 return Err(too_many_segments_error(
671 chunked_sequences.len(),
672 total_chunks,
673 ));
674 }
675
676 Ok(chunked_sequences)
677}
678
679pub fn select_row_ids<'a>(
681 sequence: &'a RowIdSequence,
682 offsets: &'a ReadBatchParams,
683) -> Result<Vec<u64>> {
684 let out_of_bounds_err = |offset: u32| {
685 Error::invalid_input(
686 format!(
687 "Index out of bounds: {} for sequence of length {}",
688 offset,
689 sequence.len()
690 ),
691 location!(),
692 )
693 };
694
695 match offsets {
696 ReadBatchParams::Indices(indices) => indices
698 .values()
699 .iter()
700 .map(|index| {
701 sequence
702 .get(*index as usize)
703 .ok_or_else(|| out_of_bounds_err(*index))
704 })
705 .collect(),
706 ReadBatchParams::Range(range) => {
707 if range.end > sequence.len() as usize {
708 return Err(out_of_bounds_err(range.end as u32));
709 }
710 let sequence = sequence.slice(range.start, range.end - range.start);
711 Ok(sequence.iter().collect())
712 }
713 ReadBatchParams::Ranges(ranges) => {
714 let num_rows = ranges
715 .iter()
716 .map(|r| (r.end - r.start) as usize)
717 .sum::<usize>();
718 let mut result = Vec::with_capacity(num_rows);
719 for range in ranges.as_ref() {
720 if range.end > sequence.len() {
721 return Err(out_of_bounds_err(range.end as u32));
722 }
723 let sequence =
724 sequence.slice(range.start as usize, (range.end - range.start) as usize);
725 result.extend(sequence.iter());
726 }
727 Ok(result)
728 }
729
730 ReadBatchParams::RangeFull => Ok(sequence.iter().collect()),
731 ReadBatchParams::RangeTo(to) => {
732 if to.end > sequence.len() as usize {
733 return Err(out_of_bounds_err(to.end as u32));
734 }
735 let len = to.end;
736 let sequence = sequence.slice(0, len);
737 Ok(sequence.iter().collect())
738 }
739 ReadBatchParams::RangeFrom(from) => {
740 let sequence = sequence.slice(from.start, sequence.len() as usize - from.start);
741 Ok(sequence.iter().collect())
742 }
743 }
744}
745
746#[cfg(test)]
747mod test {
748 use super::*;
749
750 use pretty_assertions::assert_eq;
751 use test::bitmap::Bitmap;
752
753 #[test]
754 fn test_row_id_sequence_from_range() {
755 let sequence = RowIdSequence::from(0..10);
756 assert_eq!(sequence.len(), 10);
757 assert_eq!(sequence.is_empty(), false);
758
759 let iter = sequence.iter();
760 assert_eq!(iter.collect::<Vec<_>>(), (0..10).collect::<Vec<_>>());
761 }
762
763 #[test]
764 fn test_row_id_sequence_extend() {
765 let mut sequence = RowIdSequence::from(0..10);
766 sequence.extend(RowIdSequence::from(10..20));
767 assert_eq!(sequence.0, vec![U64Segment::Range(0..20)]);
768
769 let mut sequence = RowIdSequence::from(0..10);
770 sequence.extend(RowIdSequence::from(20..30));
771 assert_eq!(
772 sequence.0,
773 vec![U64Segment::Range(0..10), U64Segment::Range(20..30)]
774 );
775 }
776
777 #[test]
778 fn test_row_id_sequence_delete() {
779 let mut sequence = RowIdSequence::from(0..10);
780 sequence.delete(vec![1, 3, 5, 7, 9]);
781 let mut expected_bitmap = Bitmap::new_empty(9);
782 for i in [0, 2, 4, 6, 8] {
783 expected_bitmap.set(i as usize);
784 }
785 assert_eq!(
786 sequence.0,
787 vec![U64Segment::RangeWithBitmap {
788 range: 0..9,
789 bitmap: expected_bitmap
790 },]
791 );
792
793 let mut sequence = RowIdSequence::from(0..10);
794 sequence.extend(RowIdSequence::from(12..20));
795 sequence.delete(vec![0, 9, 10, 11, 12, 13]);
796 assert_eq!(
797 sequence.0,
798 vec![U64Segment::Range(1..9), U64Segment::Range(14..20),]
799 );
800
801 let mut sequence = RowIdSequence::from(0..10);
802 sequence.delete(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
803 assert_eq!(sequence.0, vec![U64Segment::Range(0..0)]);
804 }
805
806 #[test]
807 fn test_row_id_slice() {
808 let sequence = RowIdSequence(vec![
811 U64Segment::Range(30..35), U64Segment::RangeWithHoles {
813 range: 50..60,
815 holes: vec![53, 54].into(),
816 },
817 U64Segment::SortedArray(vec![7, 9].into()), U64Segment::RangeWithBitmap {
819 range: 0..5,
820 bitmap: [true, false, true, false, true].as_slice().into(),
821 },
822 U64Segment::Array(vec![35, 39].into()),
823 U64Segment::Range(40..50),
824 ]);
825
826 for offset in 0..sequence.len() as usize {
828 for len in 0..sequence.len() as usize {
829 if offset + len > sequence.len() as usize {
830 continue;
831 }
832 let slice = sequence.slice(offset, len);
833
834 let actual = slice.iter().collect::<Vec<_>>();
835 let expected = sequence.iter().skip(offset).take(len).collect::<Vec<_>>();
836 assert_eq!(
837 actual, expected,
838 "Failed for offset {} and len {}",
839 offset, len
840 );
841
842 let (claimed_size, claimed_max) = slice.iter().size_hint();
843 assert_eq!(claimed_max, Some(claimed_size)); assert_eq!(claimed_size, actual.len()); }
846 }
847 }
848
849 #[test]
850 fn test_row_id_slice_empty() {
851 let sequence = RowIdSequence::from(0..10);
852 let slice = sequence.slice(10, 0);
853 assert_eq!(slice.iter().collect::<Vec<_>>(), Vec::<u64>::new());
854 }
855
856 #[test]
857 fn test_row_id_sequence_rechunk() {
858 fn assert_rechunked(
859 input: Vec<RowIdSequence>,
860 chunk_sizes: Vec<u64>,
861 expected: Vec<RowIdSequence>,
862 ) {
863 let chunked = rechunk_sequences(input, chunk_sizes, false).unwrap();
864 assert_eq!(chunked, expected);
865 }
866
867 let many_segments = vec![
869 RowIdSequence(vec![U64Segment::Range(0..5), U64Segment::Range(35..40)]),
870 RowIdSequence::from(10..18),
871 RowIdSequence::from(18..28),
872 RowIdSequence::from(28..30),
873 ];
874 let fewer_segments = vec![
875 RowIdSequence(vec![U64Segment::Range(0..5), U64Segment::Range(35..40)]),
876 RowIdSequence::from(10..30),
877 ];
878 assert_rechunked(
879 many_segments.clone(),
880 fewer_segments.iter().map(|seq| seq.len()).collect(),
881 fewer_segments.clone(),
882 );
883
884 assert_rechunked(
886 fewer_segments,
887 many_segments.iter().map(|seq| seq.len()).collect(),
888 many_segments.clone(),
889 );
890
891 assert_rechunked(
893 many_segments.clone(),
894 many_segments.iter().map(|seq| seq.len()).collect(),
895 many_segments.clone(),
896 );
897
898 let result = rechunk_sequences(many_segments.clone(), vec![100], false);
900 assert!(result.is_err());
901
902 let result = rechunk_sequences(many_segments, vec![5], false);
904 assert!(result.is_err());
905 }
906
907 #[test]
908 fn test_select_row_ids() {
909 let offsets = [
911 ReadBatchParams::Indices(vec![1, 3, 9, 5, 7, 6].into()),
912 ReadBatchParams::Range(2..8),
913 ReadBatchParams::RangeFull,
914 ReadBatchParams::RangeTo(..5),
915 ReadBatchParams::RangeFrom(5..),
916 ReadBatchParams::Ranges(vec![2..3, 5..10].into()),
917 ];
918
919 let sequences = [
922 RowIdSequence(vec![
923 U64Segment::Range(0..5),
924 U64Segment::RangeWithHoles {
925 range: 50..60,
926 holes: vec![53, 54].into(),
927 },
928 U64Segment::SortedArray(vec![7, 9].into()),
929 ]),
930 RowIdSequence(vec![
931 U64Segment::RangeWithBitmap {
932 range: 0..5,
933 bitmap: [true, false, true, false, true].as_slice().into(),
934 },
935 U64Segment::Array(vec![30, 20, 10].into()),
936 U64Segment::Range(40..50),
937 ]),
938 ];
939
940 for params in offsets {
941 for sequence in &sequences {
942 let row_ids = select_row_ids(sequence, ¶ms).unwrap();
943 let flat_sequence = sequence.iter().collect::<Vec<_>>();
944
945 let selection: Vec<usize> = match ¶ms {
947 ReadBatchParams::RangeFull => (0..flat_sequence.len()).collect(),
948 ReadBatchParams::RangeTo(to) => (0..to.end).collect(),
949 ReadBatchParams::RangeFrom(from) => (from.start..flat_sequence.len()).collect(),
950 ReadBatchParams::Range(range) => range.clone().collect(),
951 ReadBatchParams::Ranges(ranges) => ranges
952 .iter()
953 .flat_map(|r| r.start as usize..r.end as usize)
954 .collect(),
955 ReadBatchParams::Indices(indices) => {
956 indices.values().iter().map(|i| *i as usize).collect()
957 }
958 };
959
960 let expected = selection
961 .into_iter()
962 .map(|i| flat_sequence[i])
963 .collect::<Vec<_>>();
964 assert_eq!(
965 row_ids, expected,
966 "Failed for params {:?} on the sequence {:?}",
967 ¶ms, sequence
968 );
969 }
970 }
971 }
972
973 #[test]
974 fn test_select_row_ids_out_of_bounds() {
975 let offsets = [
976 ReadBatchParams::Indices(vec![1, 1000, 4].into()),
977 ReadBatchParams::Range(2..1000),
978 ReadBatchParams::RangeTo(..1000),
979 ];
980
981 let sequence = RowIdSequence::from(0..10);
982
983 for params in offsets {
984 let result = select_row_ids(&sequence, ¶ms);
985 assert!(result.is_err());
986 assert!(matches!(result.unwrap_err(), Error::InvalidInput { .. }));
987 }
988 }
989
990 #[test]
991 fn test_row_id_sequence_to_treemap() {
992 let sequence = RowIdSequence(vec![
993 U64Segment::Range(0..5),
994 U64Segment::RangeWithHoles {
995 range: 50..60,
996 holes: vec![53, 54].into(),
997 },
998 U64Segment::SortedArray(vec![7, 9].into()),
999 U64Segment::RangeWithBitmap {
1000 range: 10..15,
1001 bitmap: [true, false, true, false, true].as_slice().into(),
1002 },
1003 U64Segment::Array(vec![35, 39].into()),
1004 U64Segment::Range(40..50),
1005 ]);
1006
1007 let tree_map = RowAddrTreeMap::from(&sequence);
1008 let expected = vec![
1009 0, 1, 2, 3, 4, 7, 9, 10, 12, 14, 35, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
1010 51, 52, 55, 56, 57, 58, 59,
1011 ]
1012 .into_iter()
1013 .collect::<RowAddrTreeMap>();
1014 assert_eq!(tree_map, expected);
1015 }
1016
1017 #[test]
1018 fn test_row_addr_mask() {
1019 let sequence = RowIdSequence(vec![
1025 U64Segment::Range(0..5),
1026 U64Segment::RangeWithHoles {
1027 range: 50..60,
1028 holes: vec![53, 54].into(),
1029 },
1030 U64Segment::SortedArray(vec![7, 9].into()),
1031 U64Segment::RangeWithBitmap {
1032 range: 10..15,
1033 bitmap: [true, false, true, false, true].as_slice().into(),
1034 },
1035 U64Segment::Array(vec![35, 39].into()),
1036 ]);
1037
1038 let values_to_remove = [4, 55, 7, 12, 39];
1040 let positions_to_remove = sequence
1041 .iter()
1042 .enumerate()
1043 .filter_map(|(i, val)| {
1044 if values_to_remove.contains(&val) {
1045 Some(i as u32)
1046 } else {
1047 None
1048 }
1049 })
1050 .collect::<Vec<_>>();
1051 let mut sequence = sequence;
1052 sequence.mask(positions_to_remove).unwrap();
1053 let expected = RowIdSequence(vec![
1054 U64Segment::Range(0..4),
1055 U64Segment::RangeWithBitmap {
1056 range: 50..60,
1057 bitmap: [
1058 true, true, true, false, false, false, true, true, true, true,
1059 ]
1060 .as_slice()
1061 .into(),
1062 },
1063 U64Segment::Range(9..10),
1064 U64Segment::RangeWithBitmap {
1065 range: 10..15,
1066 bitmap: [true, false, false, false, true].as_slice().into(),
1067 },
1068 U64Segment::Array(vec![35].into()),
1069 ]);
1070 assert_eq!(sequence, expected);
1071 }
1072
1073 #[test]
1074 fn test_row_addr_mask_everything() {
1075 let mut sequence = RowIdSequence(vec![
1076 U64Segment::Range(0..5),
1077 U64Segment::SortedArray(vec![7, 9].into()),
1078 ]);
1079 sequence.mask(0..sequence.len() as u32).unwrap();
1080 let expected = RowIdSequence(vec![]);
1081 assert_eq!(sequence, expected);
1082 }
1083
1084 #[test]
1085 fn test_selection() {
1086 let sequence = RowIdSequence(vec![
1087 U64Segment::Range(0..5),
1088 U64Segment::Range(10..15),
1089 U64Segment::Range(20..25),
1090 ]);
1091 let selection = sequence.select(vec![2, 4, 13, 14, 57].into_iter());
1092 assert_eq!(selection.collect::<Vec<_>>(), vec![2, 4, 23, 24]);
1093 }
1094
1095 #[test]
1096 #[should_panic]
1097 fn test_selection_unsorted() {
1098 let sequence = RowIdSequence(vec![
1099 U64Segment::Range(0..5),
1100 U64Segment::Range(10..15),
1101 U64Segment::Range(20..25),
1102 ]);
1103 let _ = sequence
1104 .select(vec![2, 4, 3].into_iter())
1105 .collect::<Vec<_>>();
1106 }
1107
1108 #[test]
1109 fn test_mask_to_offset_ranges() {
1110 let sequence = RowIdSequence(vec![U64Segment::Range(0..10)]);
1112 let mask = RowAddrMask::from_allowed(RowAddrTreeMap::from_iter(&[0, 2, 4, 6, 8]));
1113 let ranges = sequence.mask_to_offset_ranges(&mask);
1114 assert_eq!(ranges, vec![0..1, 2..3, 4..5, 6..7, 8..9]);
1115
1116 let sequence = RowIdSequence(vec![U64Segment::Range(40..60)]);
1117 let mask = RowAddrMask::from_allowed(RowAddrTreeMap::from_iter(&[54]));
1118 let ranges = sequence.mask_to_offset_ranges(&mask);
1119 assert_eq!(ranges, vec![14..15]);
1120
1121 let sequence = RowIdSequence(vec![U64Segment::Range(40..60)]);
1122 let mask = RowAddrMask::from_block(RowAddrTreeMap::from_iter(&[54]));
1123 let ranges = sequence.mask_to_offset_ranges(&mask);
1124 assert_eq!(ranges, vec![0..14, 15..20]);
1125
1126 let sequence = RowIdSequence(vec![U64Segment::RangeWithHoles {
1129 range: 0..10,
1130 holes: vec![2, 6].into(),
1131 }]);
1132 let mask = RowAddrMask::from_allowed(RowAddrTreeMap::from_iter(&[0, 2, 4, 6, 8]));
1133 let ranges = sequence.mask_to_offset_ranges(&mask);
1134 assert_eq!(ranges, vec![0..1, 3..4, 6..7]);
1135
1136 let sequence = RowIdSequence(vec![U64Segment::RangeWithHoles {
1137 range: 40..60,
1138 holes: vec![47, 43].into(),
1139 }]);
1140 let mask = RowAddrMask::from_allowed(RowAddrTreeMap::from_iter(&[44]));
1141 let ranges = sequence.mask_to_offset_ranges(&mask);
1142 assert_eq!(ranges, vec![3..4]);
1143
1144 let sequence = RowIdSequence(vec![U64Segment::RangeWithHoles {
1145 range: 40..60,
1146 holes: vec![47, 43].into(),
1147 }]);
1148 let mask = RowAddrMask::from_block(RowAddrTreeMap::from_iter(&[44]));
1149 let ranges = sequence.mask_to_offset_ranges(&mask);
1150 assert_eq!(ranges, vec![0..3, 4..18]);
1151
1152 let sequence = RowIdSequence(vec![U64Segment::RangeWithBitmap {
1155 range: 0..10,
1156 bitmap: [
1157 true, true, false, false, true, true, true, true, false, false,
1158 ]
1159 .as_slice()
1160 .into(),
1161 }]);
1162 let mask = RowAddrMask::from_allowed(RowAddrTreeMap::from_iter(&[0, 2, 4, 6, 8]));
1163 let ranges = sequence.mask_to_offset_ranges(&mask);
1164 assert_eq!(ranges, vec![0..1, 2..3, 4..5]);
1165
1166 let sequence = RowIdSequence(vec![U64Segment::RangeWithBitmap {
1167 range: 40..45,
1168 bitmap: [true, true, false, false, true].as_slice().into(),
1169 }]);
1170 let mask = RowAddrMask::from_allowed(RowAddrTreeMap::from_iter(&[44]));
1171 let ranges = sequence.mask_to_offset_ranges(&mask);
1172 assert_eq!(ranges, vec![2..3]);
1173
1174 let sequence = RowIdSequence(vec![U64Segment::RangeWithBitmap {
1175 range: 40..45,
1176 bitmap: [true, true, false, false, true].as_slice().into(),
1177 }]);
1178 let mask = RowAddrMask::from_block(RowAddrTreeMap::from_iter(&[44]));
1179 let ranges = sequence.mask_to_offset_ranges(&mask);
1180 assert_eq!(ranges, vec![0..2]);
1181
1182 let sequence = RowIdSequence(vec![U64Segment::SortedArray(vec![0, 2, 4, 6, 8].into())]);
1184 let mask = RowAddrMask::from_allowed(RowAddrTreeMap::from_iter(&[0, 6, 8]));
1185 let ranges = sequence.mask_to_offset_ranges(&mask);
1186 assert_eq!(ranges, vec![0..1, 3..5]);
1187
1188 let sequence = RowIdSequence(vec![U64Segment::Array(vec![8, 2, 6, 0, 4].into())]);
1189 let mask = RowAddrMask::from_allowed(RowAddrTreeMap::from_iter(&[0, 6, 8]));
1190 let ranges = sequence.mask_to_offset_ranges(&mask);
1191 assert_eq!(ranges, vec![0..1, 2..4]);
1192
1193 let sequence = RowIdSequence(vec![
1198 U64Segment::Range(0..5),
1199 U64Segment::RangeWithHoles {
1200 range: 100..105,
1201 holes: vec![103].into(),
1202 },
1203 U64Segment::SortedArray(vec![44, 46, 78].into()),
1204 ]);
1205 let mask = RowAddrMask::from_allowed(RowAddrTreeMap::from_iter(&[0, 2, 46, 100, 104]));
1206 let ranges = sequence.mask_to_offset_ranges(&mask);
1207 assert_eq!(ranges, vec![0..1, 2..3, 5..6, 8..9, 10..11]);
1208
1209 let sequence = RowIdSequence(vec![U64Segment::Range(0..10)]);
1211 let mask = RowAddrMask::default();
1212 let ranges = sequence.mask_to_offset_ranges(&mask);
1213 assert_eq!(ranges, vec![0..10]);
1214
1215 let sequence = RowIdSequence(vec![U64Segment::Range(0..10)]);
1217 let mask = RowAddrMask::allow_nothing();
1218 let ranges = sequence.mask_to_offset_ranges(&mask);
1219 assert_eq!(ranges, vec![]);
1220 }
1221
1222 #[test]
1223 fn test_row_id_sequence_rechunk_with_empty_segments() {
1224 let input_sequences = vec![
1226 RowIdSequence::from(0..2), RowIdSequence::from(20..23), ];
1229 let chunk_sizes = vec![2, 3]; let result = rechunk_sequences(input_sequences, chunk_sizes, false).unwrap();
1232 assert_eq!(result.len(), 2);
1233 assert_eq!(result[0].len(), 2);
1234 assert_eq!(result[1].len(), 3);
1235
1236 let first_chunk: Vec<u64> = result[0].iter().collect();
1237 let second_chunk: Vec<u64> = result[1].iter().collect();
1238 assert_eq!(first_chunk, vec![0, 1]);
1239 assert_eq!(second_chunk, vec![20, 21, 22]);
1240
1241 let input_sequences = vec![
1243 RowIdSequence::from(0..2), RowIdSequence::from(20..21), RowIdSequence::from(30..32), ];
1247 let chunk_sizes = vec![5]; let result = rechunk_sequences(input_sequences, chunk_sizes, false).unwrap();
1250 assert_eq!(result.len(), 1);
1251 assert_eq!(result[0].len(), 5);
1252
1253 let elements: Vec<u64> = result[0].iter().collect();
1254 assert_eq!(elements, vec![0, 1, 20, 30, 31]);
1255
1256 let input_sequences = vec![
1258 RowIdSequence::from(0..2), RowIdSequence::from(10..10), RowIdSequence::from(20..22), ];
1262 let chunk_sizes = vec![3, 1];
1263 let result = rechunk_sequences(input_sequences, chunk_sizes, false).unwrap();
1264
1265 assert_eq!(result.len(), 2);
1266 assert_eq!(result[0].len(), 3);
1267 assert_eq!(result[1].len(), 1);
1268
1269 let first_chunk_elements: Vec<u64> = result[0].iter().collect();
1270 let second_chunk_elements: Vec<u64> = result[1].iter().collect();
1271 assert_eq!(first_chunk_elements, vec![0, 1, 20]);
1272 assert_eq!(second_chunk_elements, vec![21]);
1273
1274 let input_sequences = vec![
1276 RowIdSequence::from(0..1), RowIdSequence::from(10..10), RowIdSequence::from(20..20), RowIdSequence::from(30..32), ];
1281 let chunk_sizes = vec![3];
1282 let result = rechunk_sequences(input_sequences, chunk_sizes, false).unwrap();
1283
1284 assert_eq!(result.len(), 1);
1285 assert_eq!(result[0].len(), 3);
1286
1287 let elements: Vec<u64> = result[0].iter().collect();
1288 assert_eq!(elements, vec![0, 30, 31]);
1289
1290 let input_sequences = vec![
1292 RowIdSequence::from(0..3), RowIdSequence::from(10..10), RowIdSequence::from(20..22), ];
1296 let chunk_sizes = vec![3, 2];
1297 let result = rechunk_sequences(input_sequences, chunk_sizes, false).unwrap();
1298
1299 assert_eq!(result.len(), 2);
1300 assert_eq!(result[0].len(), 3);
1301 assert_eq!(result[1].len(), 2);
1302
1303 let first_chunk_elements: Vec<u64> = result[0].iter().collect();
1304 let second_chunk_elements: Vec<u64> = result[1].iter().collect();
1305 assert_eq!(first_chunk_elements, vec![0, 1, 2]);
1306 assert_eq!(second_chunk_elements, vec![20, 21]);
1307
1308 let input_sequences = vec![
1310 RowIdSequence::from(0..2), RowIdSequence::from(10..10), ];
1313 let chunk_sizes = vec![5]; let result = rechunk_sequences(input_sequences, chunk_sizes, true).unwrap();
1315
1316 assert_eq!(result.len(), 1);
1317 assert_eq!(result[0].len(), 2);
1318
1319 let elements: Vec<u64> = result[0].iter().collect();
1320 assert_eq!(elements, vec![0, 1]);
1321 }
1322}