1use std::ops::Range;
15mod bitmap;
19mod encoded_array;
20mod index;
21pub mod segment;
22mod serde;
23
24use deepsize::DeepSizeOf;
25pub use index::FragmentRowIdIndex;
27pub use index::RowIdIndex;
28use lance_core::{
29 utils::mask::{RowIdMask, RowIdTreeMap},
30 Error, Result,
31};
32use lance_io::ReadBatchParams;
33pub use serde::{read_row_ids, write_row_ids};
34
35use snafu::location;
36
37use crate::utils::LanceIteratorExtension;
38use segment::U64Segment;
39use tracing::instrument;
40
41#[derive(Debug, Clone, DeepSizeOf, PartialEq, Eq, Default)]
54pub struct RowIdSequence(Vec<U64Segment>);
55
56impl std::fmt::Display for RowIdSequence {
57 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
58 let mut iter = self.iter();
59 let mut first_10 = Vec::new();
60 let mut last_10 = Vec::new();
61 for row_id in iter.by_ref() {
62 first_10.push(row_id);
63 if first_10.len() > 10 {
64 break;
65 }
66 }
67
68 while let Some(row_id) = iter.next_back() {
69 last_10.push(row_id);
70 if last_10.len() > 10 {
71 break;
72 }
73 }
74 last_10.reverse();
75
76 let theres_more = iter.next().is_some();
77
78 write!(f, "[")?;
79 for row_id in first_10 {
80 write!(f, "{}", row_id)?;
81 }
82 if theres_more {
83 write!(f, ", ...")?;
84 }
85 for row_id in last_10 {
86 write!(f, ", {}", row_id)?;
87 }
88 write!(f, "]")
89 }
90}
91
92impl From<Range<u64>> for RowIdSequence {
93 fn from(range: Range<u64>) -> Self {
94 Self(vec![U64Segment::Range(range)])
95 }
96}
97
98impl From<&[u64]> for RowIdSequence {
99 fn from(row_ids: &[u64]) -> Self {
100 Self(vec![U64Segment::from_slice(row_ids)])
101 }
102}
103
104impl RowIdSequence {
105 pub fn new() -> Self {
106 Self::default()
107 }
108
109 pub fn iter(&self) -> impl DoubleEndedIterator<Item = u64> + '_ {
110 self.0.iter().flat_map(|segment| segment.iter())
111 }
112
113 pub fn len(&self) -> u64 {
114 self.0.iter().map(|segment| segment.len() as u64).sum()
115 }
116
117 pub fn is_empty(&self) -> bool {
118 self.0.is_empty()
119 }
120
121 pub fn extend(&mut self, other: Self) {
123 if let (Some(U64Segment::Range(range1)), Some(U64Segment::Range(range2))) =
127 (self.0.last(), other.0.first())
128 {
129 if range1.end == range2.start {
130 let new_range = U64Segment::Range(range1.start..range2.end);
131 self.0.pop();
132 self.0.push(new_range);
133 self.0.extend(other.0.into_iter().skip(1));
134 return;
135 }
136 }
137 self.0.extend(other.0);
139 }
140
141 pub fn delete(&mut self, row_ids: impl IntoIterator<Item = u64>) {
143 let (row_ids, offsets) = self.find_ids(row_ids);
145
146 let capacity = self.0.capacity();
147 let old_segments = std::mem::replace(&mut self.0, Vec::with_capacity(capacity));
148 let mut remaining_segments = old_segments.as_slice();
149
150 for (segment_idx, range) in offsets {
151 let segments_handled = old_segments.len() - remaining_segments.len();
152 let segments_to_add = segment_idx - segments_handled;
153 self.0
154 .extend_from_slice(&remaining_segments[..segments_to_add]);
155 remaining_segments = &remaining_segments[segments_to_add..];
156
157 let segment;
158 (segment, remaining_segments) = remaining_segments.split_first().unwrap();
159
160 let segment_ids = &row_ids[range];
161 self.0.push(segment.delete(segment_ids));
162 }
163
164 self.0.extend_from_slice(remaining_segments);
166 }
167
168 pub fn mask(&mut self, positions: impl IntoIterator<Item = u32>) -> Result<()> {
170 let mut local_positions = Vec::new();
171 let mut positions_iter = positions.into_iter();
172 let mut curr_position = positions_iter.next();
173 let mut offset = 0;
174 let mut cutoff = 0;
175
176 for segment in &mut self.0 {
177 cutoff += segment.len() as u32;
179 while let Some(position) = curr_position {
180 if position >= cutoff {
181 break;
182 }
183 local_positions.push(position - offset);
184 curr_position = positions_iter.next();
185 }
186
187 if !local_positions.is_empty() {
188 segment.mask(&local_positions);
189 local_positions.clear();
190 }
191 offset = cutoff;
192 }
193
194 self.0.retain(|segment| !segment.is_empty());
195
196 Ok(())
197 }
198
199 fn find_ids(
205 &self,
206 row_ids: impl IntoIterator<Item = u64>,
207 ) -> (Vec<u64>, Vec<(usize, Range<usize>)>) {
208 let mut segment_iter = self.0.iter().enumerate().cycle();
212
213 let mut segment_matches = vec![Vec::new(); self.0.len()];
214
215 row_ids.into_iter().for_each(|row_id| {
216 let mut i = 0;
217 while i < self.0.len() {
219 let (segment_idx, segment) = segment_iter.next().unwrap();
220 if segment.range().is_some_and(|range| range.contains(&row_id)) {
221 if let Some(offset) = segment.position(row_id) {
222 segment_matches.get_mut(segment_idx).unwrap().push(offset);
223 }
224 }
226 i += 1;
227 }
228 });
229 for matches in &mut segment_matches {
230 matches.sort_unstable();
231 }
232
233 let mut offset = 0;
234 let segment_ranges = segment_matches
235 .iter()
236 .enumerate()
237 .filter(|(_, matches)| !matches.is_empty())
238 .map(|(segment_idx, matches)| {
239 let range = offset..offset + matches.len();
240 offset += matches.len();
241 (segment_idx, range)
242 })
243 .collect();
244 let row_ids = segment_matches
245 .into_iter()
246 .enumerate()
247 .flat_map(|(segment_idx, offset)| {
248 offset
249 .into_iter()
250 .map(move |offset| self.0[segment_idx].get(offset).unwrap())
251 })
252 .collect();
253
254 (row_ids, segment_ranges)
255 }
256
257 pub fn slice(&self, offset: usize, len: usize) -> RowIdSeqSlice<'_> {
258 if len == 0 {
259 return RowIdSeqSlice {
260 segments: &[],
261 offset_start: 0,
262 offset_last: 0,
263 };
264 }
265
266 let mut offset_start = offset;
268 let mut segment_offset = 0;
269 for segment in &self.0 {
270 let segment_len = segment.len();
271 if offset_start < segment_len {
272 break;
273 }
274 offset_start -= segment_len;
275 segment_offset += 1;
276 }
277
278 let mut offset_last = offset_start + len;
280 let mut segment_offset_last = segment_offset;
281 for segment in &self.0[segment_offset..] {
282 let segment_len = segment.len();
283 if offset_last <= segment_len {
284 break;
285 }
286 offset_last -= segment_len;
287 segment_offset_last += 1;
288 }
289
290 RowIdSeqSlice {
291 segments: &self.0[segment_offset..=segment_offset_last],
292 offset_start,
293 offset_last,
294 }
295 }
296
297 pub fn get(&self, index: usize) -> Option<u64> {
301 let mut offset = 0;
302 for segment in &self.0 {
303 let segment_len = segment.len();
304 if index < offset + segment_len {
305 return segment.get(index - offset);
306 }
307 offset += segment_len;
308 }
309 None
310 }
311
312 pub fn select<'a>(
320 &'a self,
321 selection: impl Iterator<Item = usize> + 'a,
322 ) -> impl Iterator<Item = u64> + 'a {
323 let mut seg_iter = self.0.iter();
324 let mut cur_seg = seg_iter.next();
325 let mut rows_passed = 0;
326 let mut cur_seg_len = cur_seg.map(|seg| seg.len()).unwrap_or(0);
327 let mut last_index = 0;
328 selection.filter_map(move |index| {
329 if index < last_index {
330 panic!("Selection is not sorted");
331 }
332 last_index = index;
333
334 cur_seg?;
335
336 while (index - rows_passed) >= cur_seg_len {
337 rows_passed += cur_seg_len;
338 cur_seg = seg_iter.next();
339 if let Some(cur_seg) = cur_seg {
340 cur_seg_len = cur_seg.len();
341 } else {
342 return None;
343 }
344 }
345
346 Some(cur_seg.unwrap().get(index - rows_passed).unwrap())
347 })
348 }
349
350 #[instrument(level = "debug", skip_all)]
365 pub fn mask_to_offset_ranges(&self, mask: &RowIdMask) -> Vec<Range<u64>> {
366 let mut offset = 0;
367 let mut ranges = Vec::new();
368 for segment in &self.0 {
369 match segment {
370 U64Segment::Range(range) => {
371 let mut ids = RowIdTreeMap::from(range.clone());
372 ids.mask(mask);
373 ranges.extend(GroupingIterator::new(
374 unsafe { ids.into_addr_iter() }.map(|addr| addr - range.start + offset),
375 ));
376 offset += range.end - range.start;
377 }
378 U64Segment::RangeWithHoles { range, holes } => {
379 let offset_start = offset;
380 let mut ids = RowIdTreeMap::from(range.clone());
381 offset += range.end - range.start;
382 for hole in holes.iter() {
383 if ids.remove(hole) {
384 offset -= 1;
385 }
386 }
387 ids.mask(mask);
388
389 let mut sorted_holes = holes.clone().into_iter().collect::<Vec<_>>();
391 sorted_holes.sort_unstable();
392 let mut next_holes_iter = sorted_holes.into_iter().peekable();
393 let mut holes_passed = 0;
394 ranges.extend(GroupingIterator::new(unsafe { ids.into_addr_iter() }.map(
395 |addr| {
396 while let Some(next_hole) = next_holes_iter.peek() {
397 if *next_hole < addr {
398 next_holes_iter.next();
399 holes_passed += 1;
400 } else {
401 break;
402 }
403 }
404 addr - range.start + offset_start - holes_passed
405 },
406 )));
407 }
408 U64Segment::RangeWithBitmap { range, bitmap } => {
409 let mut ids = RowIdTreeMap::from(range.clone());
410 let offset_start = offset;
411 offset += range.end - range.start;
412 for (i, val) in range.clone().enumerate() {
413 if !bitmap.get(i) && ids.remove(val) {
414 offset -= 1;
415 }
416 }
417 ids.mask(mask);
418 let mut bitmap_iter = bitmap.iter();
419 let mut bitmap_iter_pos = 0;
420 let mut holes_passed = 0;
421 ranges.extend(GroupingIterator::new(unsafe { ids.into_addr_iter() }.map(
422 |addr| {
423 let offset_no_holes = addr - range.start + offset_start;
424 while bitmap_iter_pos < offset_no_holes {
425 if !bitmap_iter.next().unwrap() {
426 holes_passed += 1;
427 }
428 bitmap_iter_pos += 1;
429 }
430 offset_no_holes - holes_passed
431 },
432 )));
433 }
434 U64Segment::SortedArray(array) | U64Segment::Array(array) => {
435 ranges.extend(GroupingIterator::new(array.iter().enumerate().filter_map(
437 |(off, id)| {
438 if mask.selected(id) {
439 Some(off as u64 + offset)
440 } else {
441 None
442 }
443 },
444 )));
445 offset += array.len() as u64;
446 }
447 }
448 }
449 ranges
450 }
451}
452
453struct GroupingIterator<I: Iterator<Item = u64>> {
458 iter: I,
459 cur_range: Option<Range<u64>>,
460}
461
462impl<I: Iterator<Item = u64>> GroupingIterator<I> {
463 fn new(iter: I) -> Self {
464 Self {
465 iter,
466 cur_range: None,
467 }
468 }
469}
470
471impl<I: Iterator<Item = u64>> Iterator for GroupingIterator<I> {
472 type Item = Range<u64>;
473
474 fn next(&mut self) -> Option<Self::Item> {
475 for id in self.iter.by_ref() {
476 if let Some(range) = self.cur_range.as_mut() {
477 if range.end == id {
478 range.end = id + 1;
479 } else {
480 let ret = Some(range.clone());
481 self.cur_range = Some(id..id + 1);
482 return ret;
483 }
484 } else {
485 self.cur_range = Some(id..id + 1);
486 }
487 }
488 self.cur_range.take()
489 }
490}
491
492impl From<&RowIdSequence> for RowIdTreeMap {
493 fn from(row_ids: &RowIdSequence) -> Self {
494 let mut tree_map = Self::new();
495 for segment in &row_ids.0 {
496 match segment {
497 U64Segment::Range(range) => {
498 tree_map.insert_range(range.clone());
499 }
500 U64Segment::RangeWithBitmap { range, bitmap } => {
501 tree_map.insert_range(range.clone());
502 for (i, val) in range.clone().enumerate() {
503 if !bitmap.get(i) {
504 tree_map.remove(val);
505 }
506 }
507 }
508 U64Segment::RangeWithHoles { range, holes } => {
509 tree_map.insert_range(range.clone());
510 for hole in holes.iter() {
511 tree_map.remove(hole);
512 }
513 }
514 U64Segment::SortedArray(array) | U64Segment::Array(array) => {
515 for val in array.iter() {
516 tree_map.insert(val);
517 }
518 }
519 }
520 }
521 tree_map
522 }
523}
524
525#[derive(Debug)]
526pub struct RowIdSeqSlice<'a> {
527 segments: &'a [U64Segment],
529 offset_start: usize,
531 offset_last: usize,
533}
534
535impl RowIdSeqSlice<'_> {
536 pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
537 let mut known_size = self.segments.iter().map(|segment| segment.len()).sum();
538 known_size -= self.offset_start;
539 known_size -= self.segments.last().map(|s| s.len()).unwrap_or_default() - self.offset_last;
540
541 let end = self.segments.len().saturating_sub(1);
542 self.segments
543 .iter()
544 .enumerate()
545 .flat_map(move |(i, segment)| {
546 match i {
547 0 if self.segments.len() == 1 => {
548 let len = self.offset_last - self.offset_start;
549 Box::new(segment.iter().skip(self.offset_start).take(len))
552 as Box<dyn Iterator<Item = u64>>
553 }
554 0 => Box::new(segment.iter().skip(self.offset_start)),
555 i if i == end => Box::new(segment.iter().take(self.offset_last)),
556 _ => Box::new(segment.iter()),
557 }
558 })
559 .exact_size(known_size)
560 }
561}
562
563pub fn rechunk_sequences(
577 sequences: impl IntoIterator<Item = RowIdSequence>,
578 chunk_sizes: impl IntoIterator<Item = u64>,
579 allow_incomplete: bool,
580) -> Result<Vec<RowIdSequence>> {
581 let chunk_sizes_vec: Vec<u64> = chunk_sizes.into_iter().collect();
583 let total_chunks = chunk_sizes_vec.len();
584 let mut chunked_sequences = Vec::with_capacity(total_chunks);
585 let mut segment_iter = sequences
586 .into_iter()
587 .flat_map(|sequence| sequence.0.into_iter())
588 .peekable();
589
590 let too_few_segments_error = |chunk_index: usize, expected_chunk_size: u64, remaining: u64| {
591 Error::invalid_input(
592 format!(
593 "Got too few segments for chunk {}. Expected chunk size: {}, remaining needed: {}",
594 chunk_index, expected_chunk_size, remaining
595 ),
596 location!(),
597 )
598 };
599
600 let too_many_segments_error = |processed_chunks: usize, total_chunk_sizes: usize| {
601 Error::invalid_input(
602 format!(
603 "Got too many segments for the provided chunk lengths. Processed {} chunks out of {} expected",
604 processed_chunks, total_chunk_sizes
605 ),
606 location!(),
607 )
608 };
609
610 let mut segment_offset = 0_u64;
611
612 for (chunk_index, chunk_size) in chunk_sizes_vec.iter().enumerate() {
613 let chunk_size = *chunk_size;
614 let mut sequence = RowIdSequence(Vec::new());
615 let mut remaining = chunk_size;
616
617 while remaining > 0 {
618 let remaining_in_segment = segment_iter
619 .peek()
620 .map_or(0, |segment| segment.len() as u64 - segment_offset);
621
622 if remaining_in_segment == 0 {
624 if segment_iter.next().is_some() {
625 segment_offset = 0;
626 continue;
627 } else {
628 if allow_incomplete {
630 break;
631 } else {
632 return Err(too_few_segments_error(chunk_index, chunk_size, remaining));
633 }
634 }
635 }
636
637 match remaining_in_segment.cmp(&remaining) {
639 std::cmp::Ordering::Greater => {
640 let segment = segment_iter
642 .peek()
643 .ok_or_else(|| too_few_segments_error(chunk_index, chunk_size, remaining))?
644 .slice(segment_offset as usize, remaining as usize);
645 sequence.extend(RowIdSequence(vec![segment]));
646 segment_offset += remaining;
647 remaining = 0;
648 }
649 std::cmp::Ordering::Equal | std::cmp::Ordering::Less => {
650 let segment = segment_iter
654 .next()
655 .ok_or_else(|| too_few_segments_error(chunk_index, chunk_size, remaining))?
656 .slice(segment_offset as usize, remaining_in_segment as usize);
657 sequence.extend(RowIdSequence(vec![segment]));
658 segment_offset = 0;
659 remaining -= remaining_in_segment;
660 }
661 }
662 }
663
664 chunked_sequences.push(sequence);
665 }
666
667 if segment_iter.peek().is_some() {
668 return Err(too_many_segments_error(
669 chunked_sequences.len(),
670 total_chunks,
671 ));
672 }
673
674 Ok(chunked_sequences)
675}
676
677pub fn select_row_ids<'a>(
679 sequence: &'a RowIdSequence,
680 offsets: &'a ReadBatchParams,
681) -> Result<Vec<u64>> {
682 let out_of_bounds_err = |offset: u32| {
683 Error::invalid_input(
684 format!(
685 "Index out of bounds: {} for sequence of length {}",
686 offset,
687 sequence.len()
688 ),
689 location!(),
690 )
691 };
692
693 match offsets {
694 ReadBatchParams::Indices(indices) => indices
696 .values()
697 .iter()
698 .map(|index| {
699 sequence
700 .get(*index as usize)
701 .ok_or_else(|| out_of_bounds_err(*index))
702 })
703 .collect(),
704 ReadBatchParams::Range(range) => {
705 if range.end > sequence.len() as usize {
706 return Err(out_of_bounds_err(range.end as u32));
707 }
708 let sequence = sequence.slice(range.start, range.end - range.start);
709 Ok(sequence.iter().collect())
710 }
711 ReadBatchParams::Ranges(ranges) => {
712 let num_rows = ranges
713 .iter()
714 .map(|r| (r.end - r.start) as usize)
715 .sum::<usize>();
716 let mut result = Vec::with_capacity(num_rows);
717 for range in ranges.as_ref() {
718 if range.end > sequence.len() {
719 return Err(out_of_bounds_err(range.end as u32));
720 }
721 let sequence =
722 sequence.slice(range.start as usize, (range.end - range.start) as usize);
723 result.extend(sequence.iter());
724 }
725 Ok(result)
726 }
727
728 ReadBatchParams::RangeFull => Ok(sequence.iter().collect()),
729 ReadBatchParams::RangeTo(to) => {
730 if to.end > sequence.len() as usize {
731 return Err(out_of_bounds_err(to.end as u32));
732 }
733 let len = to.end;
734 let sequence = sequence.slice(0, len);
735 Ok(sequence.iter().collect())
736 }
737 ReadBatchParams::RangeFrom(from) => {
738 let sequence = sequence.slice(from.start, sequence.len() as usize - from.start);
739 Ok(sequence.iter().collect())
740 }
741 }
742}
743
744#[cfg(test)]
745mod test {
746 use super::*;
747
748 use pretty_assertions::assert_eq;
749 use test::bitmap::Bitmap;
750
751 #[test]
752 fn test_row_id_sequence_from_range() {
753 let sequence = RowIdSequence::from(0..10);
754 assert_eq!(sequence.len(), 10);
755 assert_eq!(sequence.is_empty(), false);
756
757 let iter = sequence.iter();
758 assert_eq!(iter.collect::<Vec<_>>(), (0..10).collect::<Vec<_>>());
759 }
760
761 #[test]
762 fn test_row_id_sequence_extend() {
763 let mut sequence = RowIdSequence::from(0..10);
764 sequence.extend(RowIdSequence::from(10..20));
765 assert_eq!(sequence.0, vec![U64Segment::Range(0..20)]);
766
767 let mut sequence = RowIdSequence::from(0..10);
768 sequence.extend(RowIdSequence::from(20..30));
769 assert_eq!(
770 sequence.0,
771 vec![U64Segment::Range(0..10), U64Segment::Range(20..30)]
772 );
773 }
774
775 #[test]
776 fn test_row_id_sequence_delete() {
777 let mut sequence = RowIdSequence::from(0..10);
778 sequence.delete(vec![1, 3, 5, 7, 9]);
779 let mut expected_bitmap = Bitmap::new_empty(9);
780 for i in [0, 2, 4, 6, 8] {
781 expected_bitmap.set(i as usize);
782 }
783 assert_eq!(
784 sequence.0,
785 vec![U64Segment::RangeWithBitmap {
786 range: 0..9,
787 bitmap: expected_bitmap
788 },]
789 );
790
791 let mut sequence = RowIdSequence::from(0..10);
792 sequence.extend(RowIdSequence::from(12..20));
793 sequence.delete(vec![0, 9, 10, 11, 12, 13]);
794 assert_eq!(
795 sequence.0,
796 vec![U64Segment::Range(1..9), U64Segment::Range(14..20),]
797 );
798
799 let mut sequence = RowIdSequence::from(0..10);
800 sequence.delete(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
801 assert_eq!(sequence.0, vec![U64Segment::Range(0..0)]);
802 }
803
804 #[test]
805 fn test_row_id_slice() {
806 let sequence = RowIdSequence(vec![
809 U64Segment::Range(30..35), U64Segment::RangeWithHoles {
811 range: 50..60,
813 holes: vec![53, 54].into(),
814 },
815 U64Segment::SortedArray(vec![7, 9].into()), U64Segment::RangeWithBitmap {
817 range: 0..5,
818 bitmap: [true, false, true, false, true].as_slice().into(),
819 },
820 U64Segment::Array(vec![35, 39].into()),
821 U64Segment::Range(40..50),
822 ]);
823
824 for offset in 0..sequence.len() as usize {
826 for len in 0..sequence.len() as usize {
827 if offset + len > sequence.len() as usize {
828 continue;
829 }
830 let slice = sequence.slice(offset, len);
831
832 let actual = slice.iter().collect::<Vec<_>>();
833 let expected = sequence.iter().skip(offset).take(len).collect::<Vec<_>>();
834 assert_eq!(
835 actual, expected,
836 "Failed for offset {} and len {}",
837 offset, len
838 );
839
840 let (claimed_size, claimed_max) = slice.iter().size_hint();
841 assert_eq!(claimed_max, Some(claimed_size)); assert_eq!(claimed_size, actual.len()); }
844 }
845 }
846
847 #[test]
848 fn test_row_id_slice_empty() {
849 let sequence = RowIdSequence::from(0..10);
850 let slice = sequence.slice(10, 0);
851 assert_eq!(slice.iter().collect::<Vec<_>>(), Vec::<u64>::new());
852 }
853
854 #[test]
855 fn test_row_id_sequence_rechunk() {
856 fn assert_rechunked(
857 input: Vec<RowIdSequence>,
858 chunk_sizes: Vec<u64>,
859 expected: Vec<RowIdSequence>,
860 ) {
861 let chunked = rechunk_sequences(input, chunk_sizes, false).unwrap();
862 assert_eq!(chunked, expected);
863 }
864
865 let many_segments = vec![
867 RowIdSequence(vec![U64Segment::Range(0..5), U64Segment::Range(35..40)]),
868 RowIdSequence::from(10..18),
869 RowIdSequence::from(18..28),
870 RowIdSequence::from(28..30),
871 ];
872 let fewer_segments = vec![
873 RowIdSequence(vec![U64Segment::Range(0..5), U64Segment::Range(35..40)]),
874 RowIdSequence::from(10..30),
875 ];
876 assert_rechunked(
877 many_segments.clone(),
878 fewer_segments.iter().map(|seq| seq.len()).collect(),
879 fewer_segments.clone(),
880 );
881
882 assert_rechunked(
884 fewer_segments,
885 many_segments.iter().map(|seq| seq.len()).collect(),
886 many_segments.clone(),
887 );
888
889 assert_rechunked(
891 many_segments.clone(),
892 many_segments.iter().map(|seq| seq.len()).collect(),
893 many_segments.clone(),
894 );
895
896 let result = rechunk_sequences(many_segments.clone(), vec![100], false);
898 assert!(result.is_err());
899
900 let result = rechunk_sequences(many_segments, vec![5], false);
902 assert!(result.is_err());
903 }
904
905 #[test]
906 fn test_select_row_ids() {
907 let offsets = [
909 ReadBatchParams::Indices(vec![1, 3, 9, 5, 7, 6].into()),
910 ReadBatchParams::Range(2..8),
911 ReadBatchParams::RangeFull,
912 ReadBatchParams::RangeTo(..5),
913 ReadBatchParams::RangeFrom(5..),
914 ReadBatchParams::Ranges(vec![2..3, 5..10].into()),
915 ];
916
917 let sequences = [
920 RowIdSequence(vec![
921 U64Segment::Range(0..5),
922 U64Segment::RangeWithHoles {
923 range: 50..60,
924 holes: vec![53, 54].into(),
925 },
926 U64Segment::SortedArray(vec![7, 9].into()),
927 ]),
928 RowIdSequence(vec![
929 U64Segment::RangeWithBitmap {
930 range: 0..5,
931 bitmap: [true, false, true, false, true].as_slice().into(),
932 },
933 U64Segment::Array(vec![30, 20, 10].into()),
934 U64Segment::Range(40..50),
935 ]),
936 ];
937
938 for params in offsets {
939 for sequence in &sequences {
940 let row_ids = select_row_ids(sequence, ¶ms).unwrap();
941 let flat_sequence = sequence.iter().collect::<Vec<_>>();
942
943 let selection: Vec<usize> = match ¶ms {
945 ReadBatchParams::RangeFull => (0..flat_sequence.len()).collect(),
946 ReadBatchParams::RangeTo(to) => (0..to.end).collect(),
947 ReadBatchParams::RangeFrom(from) => (from.start..flat_sequence.len()).collect(),
948 ReadBatchParams::Range(range) => range.clone().collect(),
949 ReadBatchParams::Ranges(ranges) => ranges
950 .iter()
951 .flat_map(|r| r.start as usize..r.end as usize)
952 .collect(),
953 ReadBatchParams::Indices(indices) => {
954 indices.values().iter().map(|i| *i as usize).collect()
955 }
956 };
957
958 let expected = selection
959 .into_iter()
960 .map(|i| flat_sequence[i])
961 .collect::<Vec<_>>();
962 assert_eq!(
963 row_ids, expected,
964 "Failed for params {:?} on the sequence {:?}",
965 ¶ms, sequence
966 );
967 }
968 }
969 }
970
971 #[test]
972 fn test_select_row_ids_out_of_bounds() {
973 let offsets = [
974 ReadBatchParams::Indices(vec![1, 1000, 4].into()),
975 ReadBatchParams::Range(2..1000),
976 ReadBatchParams::RangeTo(..1000),
977 ];
978
979 let sequence = RowIdSequence::from(0..10);
980
981 for params in offsets {
982 let result = select_row_ids(&sequence, ¶ms);
983 assert!(result.is_err());
984 assert!(matches!(result.unwrap_err(), Error::InvalidInput { .. }));
985 }
986 }
987
988 #[test]
989 fn test_row_id_sequence_to_treemap() {
990 let sequence = RowIdSequence(vec![
991 U64Segment::Range(0..5),
992 U64Segment::RangeWithHoles {
993 range: 50..60,
994 holes: vec![53, 54].into(),
995 },
996 U64Segment::SortedArray(vec![7, 9].into()),
997 U64Segment::RangeWithBitmap {
998 range: 10..15,
999 bitmap: [true, false, true, false, true].as_slice().into(),
1000 },
1001 U64Segment::Array(vec![35, 39].into()),
1002 U64Segment::Range(40..50),
1003 ]);
1004
1005 let tree_map = RowIdTreeMap::from(&sequence);
1006 let expected = vec![
1007 0, 1, 2, 3, 4, 7, 9, 10, 12, 14, 35, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
1008 51, 52, 55, 56, 57, 58, 59,
1009 ]
1010 .into_iter()
1011 .collect::<RowIdTreeMap>();
1012 assert_eq!(tree_map, expected);
1013 }
1014
1015 #[test]
1016 fn test_row_id_mask() {
1017 let sequence = RowIdSequence(vec![
1023 U64Segment::Range(0..5),
1024 U64Segment::RangeWithHoles {
1025 range: 50..60,
1026 holes: vec![53, 54].into(),
1027 },
1028 U64Segment::SortedArray(vec![7, 9].into()),
1029 U64Segment::RangeWithBitmap {
1030 range: 10..15,
1031 bitmap: [true, false, true, false, true].as_slice().into(),
1032 },
1033 U64Segment::Array(vec![35, 39].into()),
1034 ]);
1035
1036 let values_to_remove = [4, 55, 7, 12, 39];
1038 let positions_to_remove = sequence
1039 .iter()
1040 .enumerate()
1041 .filter_map(|(i, val)| {
1042 if values_to_remove.contains(&val) {
1043 Some(i as u32)
1044 } else {
1045 None
1046 }
1047 })
1048 .collect::<Vec<_>>();
1049 let mut sequence = sequence;
1050 sequence.mask(positions_to_remove).unwrap();
1051 let expected = RowIdSequence(vec![
1052 U64Segment::Range(0..4),
1053 U64Segment::RangeWithBitmap {
1054 range: 50..60,
1055 bitmap: [
1056 true, true, true, false, false, false, true, true, true, true,
1057 ]
1058 .as_slice()
1059 .into(),
1060 },
1061 U64Segment::Range(9..10),
1062 U64Segment::RangeWithBitmap {
1063 range: 10..15,
1064 bitmap: [true, false, false, false, true].as_slice().into(),
1065 },
1066 U64Segment::Array(vec![35].into()),
1067 ]);
1068 assert_eq!(sequence, expected);
1069 }
1070
1071 #[test]
1072 fn test_row_id_mask_everything() {
1073 let mut sequence = RowIdSequence(vec![
1074 U64Segment::Range(0..5),
1075 U64Segment::SortedArray(vec![7, 9].into()),
1076 ]);
1077 sequence.mask(0..sequence.len() as u32).unwrap();
1078 let expected = RowIdSequence(vec![]);
1079 assert_eq!(sequence, expected);
1080 }
1081
1082 #[test]
1083 fn test_selection() {
1084 let sequence = RowIdSequence(vec![
1085 U64Segment::Range(0..5),
1086 U64Segment::Range(10..15),
1087 U64Segment::Range(20..25),
1088 ]);
1089 let selection = sequence.select(vec![2, 4, 13, 14, 57].into_iter());
1090 assert_eq!(selection.collect::<Vec<_>>(), vec![2, 4, 23, 24]);
1091 }
1092
1093 #[test]
1094 #[should_panic]
1095 fn test_selection_unsorted() {
1096 let sequence = RowIdSequence(vec![
1097 U64Segment::Range(0..5),
1098 U64Segment::Range(10..15),
1099 U64Segment::Range(20..25),
1100 ]);
1101 let _ = sequence
1102 .select(vec![2, 4, 3].into_iter())
1103 .collect::<Vec<_>>();
1104 }
1105
1106 #[test]
1107 fn test_mask_to_offset_ranges() {
1108 let sequence = RowIdSequence(vec![U64Segment::Range(0..10)]);
1110 let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[0, 2, 4, 6, 8]));
1111 let ranges = sequence.mask_to_offset_ranges(&mask);
1112 assert_eq!(ranges, vec![0..1, 2..3, 4..5, 6..7, 8..9]);
1113
1114 let sequence = RowIdSequence(vec![U64Segment::Range(40..60)]);
1115 let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[54]));
1116 let ranges = sequence.mask_to_offset_ranges(&mask);
1117 assert_eq!(ranges, vec![14..15]);
1118
1119 let sequence = RowIdSequence(vec![U64Segment::Range(40..60)]);
1120 let mask = RowIdMask::from_block(RowIdTreeMap::from_iter(&[54]));
1121 let ranges = sequence.mask_to_offset_ranges(&mask);
1122 assert_eq!(ranges, vec![0..14, 15..20]);
1123
1124 let sequence = RowIdSequence(vec![U64Segment::RangeWithHoles {
1127 range: 0..10,
1128 holes: vec![2, 6].into(),
1129 }]);
1130 let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[0, 2, 4, 6, 8]));
1131 let ranges = sequence.mask_to_offset_ranges(&mask);
1132 assert_eq!(ranges, vec![0..1, 3..4, 6..7]);
1133
1134 let sequence = RowIdSequence(vec![U64Segment::RangeWithHoles {
1135 range: 40..60,
1136 holes: vec![47, 43].into(),
1137 }]);
1138 let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[44]));
1139 let ranges = sequence.mask_to_offset_ranges(&mask);
1140 assert_eq!(ranges, vec![3..4]);
1141
1142 let sequence = RowIdSequence(vec![U64Segment::RangeWithHoles {
1143 range: 40..60,
1144 holes: vec![47, 43].into(),
1145 }]);
1146 let mask = RowIdMask::from_block(RowIdTreeMap::from_iter(&[44]));
1147 let ranges = sequence.mask_to_offset_ranges(&mask);
1148 assert_eq!(ranges, vec![0..3, 4..18]);
1149
1150 let sequence = RowIdSequence(vec![U64Segment::RangeWithBitmap {
1153 range: 0..10,
1154 bitmap: [
1155 true, true, false, false, true, true, true, true, false, false,
1156 ]
1157 .as_slice()
1158 .into(),
1159 }]);
1160 let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[0, 2, 4, 6, 8]));
1161 let ranges = sequence.mask_to_offset_ranges(&mask);
1162 assert_eq!(ranges, vec![0..1, 2..3, 4..5]);
1163
1164 let sequence = RowIdSequence(vec![U64Segment::RangeWithBitmap {
1165 range: 40..45,
1166 bitmap: [true, true, false, false, true].as_slice().into(),
1167 }]);
1168 let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[44]));
1169 let ranges = sequence.mask_to_offset_ranges(&mask);
1170 assert_eq!(ranges, vec![2..3]);
1171
1172 let sequence = RowIdSequence(vec![U64Segment::RangeWithBitmap {
1173 range: 40..45,
1174 bitmap: [true, true, false, false, true].as_slice().into(),
1175 }]);
1176 let mask = RowIdMask::from_block(RowIdTreeMap::from_iter(&[44]));
1177 let ranges = sequence.mask_to_offset_ranges(&mask);
1178 assert_eq!(ranges, vec![0..2]);
1179
1180 let sequence = RowIdSequence(vec![U64Segment::SortedArray(vec![0, 2, 4, 6, 8].into())]);
1182 let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[0, 6, 8]));
1183 let ranges = sequence.mask_to_offset_ranges(&mask);
1184 assert_eq!(ranges, vec![0..1, 3..5]);
1185
1186 let sequence = RowIdSequence(vec![U64Segment::Array(vec![8, 2, 6, 0, 4].into())]);
1187 let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[0, 6, 8]));
1188 let ranges = sequence.mask_to_offset_ranges(&mask);
1189 assert_eq!(ranges, vec![0..1, 2..4]);
1190
1191 let sequence = RowIdSequence(vec![
1196 U64Segment::Range(0..5),
1197 U64Segment::RangeWithHoles {
1198 range: 100..105,
1199 holes: vec![103].into(),
1200 },
1201 U64Segment::SortedArray(vec![44, 46, 78].into()),
1202 ]);
1203 let mask = RowIdMask::from_allowed(RowIdTreeMap::from_iter(&[0, 2, 46, 100, 104]));
1204 let ranges = sequence.mask_to_offset_ranges(&mask);
1205 assert_eq!(ranges, vec![0..1, 2..3, 5..6, 8..9, 10..11]);
1206
1207 let sequence = RowIdSequence(vec![U64Segment::Range(0..10)]);
1209 let mask = RowIdMask::default();
1210 let ranges = sequence.mask_to_offset_ranges(&mask);
1211 assert_eq!(ranges, vec![0..10]);
1212
1213 let sequence = RowIdSequence(vec![U64Segment::Range(0..10)]);
1215 let mask = RowIdMask::allow_nothing();
1216 let ranges = sequence.mask_to_offset_ranges(&mask);
1217 assert_eq!(ranges, vec![]);
1218 }
1219
1220 #[test]
1221 fn test_row_id_sequence_rechunk_with_empty_segments() {
1222 let input_sequences = vec![
1224 RowIdSequence::from(0..2), RowIdSequence::from(20..23), ];
1227 let chunk_sizes = vec![2, 3]; let result = rechunk_sequences(input_sequences, chunk_sizes, false).unwrap();
1230 assert_eq!(result.len(), 2);
1231 assert_eq!(result[0].len(), 2);
1232 assert_eq!(result[1].len(), 3);
1233
1234 let first_chunk: Vec<u64> = result[0].iter().collect();
1235 let second_chunk: Vec<u64> = result[1].iter().collect();
1236 assert_eq!(first_chunk, vec![0, 1]);
1237 assert_eq!(second_chunk, vec![20, 21, 22]);
1238
1239 let input_sequences = vec![
1241 RowIdSequence::from(0..2), RowIdSequence::from(20..21), RowIdSequence::from(30..32), ];
1245 let chunk_sizes = vec![5]; let result = rechunk_sequences(input_sequences, chunk_sizes, false).unwrap();
1248 assert_eq!(result.len(), 1);
1249 assert_eq!(result[0].len(), 5);
1250
1251 let elements: Vec<u64> = result[0].iter().collect();
1252 assert_eq!(elements, vec![0, 1, 20, 30, 31]);
1253
1254 let input_sequences = vec![
1256 RowIdSequence::from(0..2), RowIdSequence::from(10..10), RowIdSequence::from(20..22), ];
1260 let chunk_sizes = vec![3, 1];
1261 let result = rechunk_sequences(input_sequences, chunk_sizes, false).unwrap();
1262
1263 assert_eq!(result.len(), 2);
1264 assert_eq!(result[0].len(), 3);
1265 assert_eq!(result[1].len(), 1);
1266
1267 let first_chunk_elements: Vec<u64> = result[0].iter().collect();
1268 let second_chunk_elements: Vec<u64> = result[1].iter().collect();
1269 assert_eq!(first_chunk_elements, vec![0, 1, 20]);
1270 assert_eq!(second_chunk_elements, vec![21]);
1271
1272 let input_sequences = vec![
1274 RowIdSequence::from(0..1), RowIdSequence::from(10..10), RowIdSequence::from(20..20), RowIdSequence::from(30..32), ];
1279 let chunk_sizes = vec![3];
1280 let result = rechunk_sequences(input_sequences, chunk_sizes, false).unwrap();
1281
1282 assert_eq!(result.len(), 1);
1283 assert_eq!(result[0].len(), 3);
1284
1285 let elements: Vec<u64> = result[0].iter().collect();
1286 assert_eq!(elements, vec![0, 30, 31]);
1287
1288 let input_sequences = vec![
1290 RowIdSequence::from(0..3), RowIdSequence::from(10..10), RowIdSequence::from(20..22), ];
1294 let chunk_sizes = vec![3, 2];
1295 let result = rechunk_sequences(input_sequences, chunk_sizes, false).unwrap();
1296
1297 assert_eq!(result.len(), 2);
1298 assert_eq!(result[0].len(), 3);
1299 assert_eq!(result[1].len(), 2);
1300
1301 let first_chunk_elements: Vec<u64> = result[0].iter().collect();
1302 let second_chunk_elements: Vec<u64> = result[1].iter().collect();
1303 assert_eq!(first_chunk_elements, vec![0, 1, 2]);
1304 assert_eq!(second_chunk_elements, vec![20, 21]);
1305
1306 let input_sequences = vec![
1308 RowIdSequence::from(0..2), RowIdSequence::from(10..10), ];
1311 let chunk_sizes = vec![5]; let result = rechunk_sequences(input_sequences, chunk_sizes, true).unwrap();
1313
1314 assert_eq!(result.len(), 1);
1315 assert_eq!(result[0].len(), 2);
1316
1317 let elements: Vec<u64> = result[0].iter().collect();
1318 assert_eq!(elements, vec![0, 1]);
1319 }
1320}