vortex_array/arrays/varbinview/
compact.rs1use vortex_error::VortexResult;
8
9use crate::arrays::VarBinViewArray;
10use crate::builders::{ArrayBuilder, VarBinViewBuilder};
11use crate::validity::Validity;
12use crate::vtable::ValidityHelper;
13
14impl VarBinViewArray {
15 pub fn compact_buffers(&self) -> VortexResult<VarBinViewArray> {
23 if !self.should_compact() {
25 return Ok(self.clone());
26 }
27
28 match self.validity() {
30 Validity::AllInvalid => unsafe {
33 Ok(VarBinViewArray::new_unchecked(
34 self.views().clone(),
35 Default::default(),
36 self.dtype().clone(),
37 self.validity().clone(),
38 ))
39 },
40 Validity::NonNullable | Validity::AllValid => rebuild_nonnull(self),
42 Validity::Array(_) => rebuild_nullable(self),
44 }
45 }
46
47 fn should_compact(&self) -> bool {
48 if self.nbuffers() == 0 {
50 return false;
51 }
52
53 let bytes_referenced: u64 = self.count_referenced_bytes();
54 let buffer_total_bytes: u64 = self.buffers.iter().map(|buf| buf.len() as u64).sum();
55
56 bytes_referenced < buffer_total_bytes
59 }
60
61 fn count_referenced_bytes(&self) -> u64 {
64 match self.validity() {
65 Validity::AllInvalid => 0u64,
66 Validity::NonNullable | Validity::AllValid | Validity::Array(_) => self
67 .views()
68 .iter()
69 .enumerate()
70 .map(|(idx, &view)| {
71 if !self.is_valid(idx) || view.is_inlined() {
72 0u64
73 } else {
74 view.len() as u64
75 }
76 })
77 .sum(),
78 }
79 }
80}
81
82fn rebuild_nullable(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
85 let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
86 for i in 0..array.len() {
87 if !array.is_valid(i) {
88 builder.append_null();
89 } else {
90 let bytes = array.bytes_at(i);
91 builder.append_value(bytes.as_slice());
92 }
93 }
94
95 Ok(builder.finish_into_varbinview())
96}
97
98fn rebuild_nonnull(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
101 let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
102 for i in 0..array.len() {
103 builder.append_value(array.bytes_at(i).as_ref());
104 }
105 Ok(builder.finish_into_varbinview())
106}
107
108#[cfg(test)]
109mod tests {
110 use vortex_buffer::buffer;
111
112 use crate::IntoArray;
113 use crate::arrays::{VarBinViewArray, VarBinViewVTable};
114 use crate::compute::take;
115
116 #[test]
117 fn test_optimize_compacts_buffers() {
118 let original = VarBinViewArray::from_iter_nullable_str([
120 Some("short"),
121 Some("this is a longer string that will be stored in a buffer"),
122 Some("medium length string"),
123 Some("another very long string that definitely needs a buffer to store it"),
124 Some("tiny"),
125 ]);
126
127 assert!(original.nbuffers() > 0);
129 let original_buffers = original.nbuffers();
130
131 let indices = buffer![0u32, 4u32].into_array();
133 let taken = take(original.as_ref(), &indices).unwrap();
134 let taken_array = taken.as_::<VarBinViewVTable>();
135
136 assert_eq!(taken_array.nbuffers(), original_buffers);
138
139 let optimized_array = taken_array.compact_buffers().unwrap();
141
142 assert!(optimized_array.nbuffers() <= 1);
146
147 assert_eq!(optimized_array.len(), 2);
149 assert_eq!(optimized_array.scalar_at(0), "short".into());
150 assert_eq!(optimized_array.scalar_at(1), "tiny".into());
151 }
152
153 #[test]
154 fn test_optimize_with_long_strings() {
155 let long_string_1 = "this is definitely a very long string that exceeds the inline limit";
157 let long_string_2 = "another extremely long string that also needs external buffer storage";
158 let long_string_3 = "yet another long string for testing buffer compaction functionality";
159
160 let original = VarBinViewArray::from_iter_str([
161 long_string_1,
162 long_string_2,
163 long_string_3,
164 "short1",
165 "short2",
166 ]);
167
168 let indices = buffer![0u32, 2u32].into_array();
170 let taken = take(original.as_ref(), &indices).unwrap();
171 let taken_array = taken.as_::<VarBinViewVTable>();
172
173 let optimized_array = taken_array.compact_buffers().unwrap();
175
176 assert_eq!(optimized_array.nbuffers(), 1);
178
179 assert_eq!(optimized_array.len(), 2);
181 assert_eq!(optimized_array.scalar_at(0), long_string_1.into());
182 assert_eq!(optimized_array.scalar_at(1), long_string_3.into());
183 }
184
185 #[test]
186 fn test_optimize_no_buffers() {
187 let original = VarBinViewArray::from_iter_str(["a", "bb", "ccc", "dddd"]);
189
190 assert_eq!(original.nbuffers(), 0);
192
193 let optimized_array = original.compact_buffers().unwrap();
195
196 assert_eq!(optimized_array.nbuffers(), 0);
197 assert_eq!(optimized_array.len(), 4);
198
199 for i in 0..4 {
201 assert_eq!(optimized_array.scalar_at(i), original.scalar_at(i));
202 }
203 }
204
205 #[test]
206 fn test_optimize_single_buffer() {
207 let str1 = "this is a long string that goes into a buffer";
209 let str2 = "another long string in the same buffer";
210 let original = VarBinViewArray::from_iter_str([str1, str2]);
211
212 assert_eq!(original.nbuffers(), 1);
214 assert_eq!(original.buffer(0).len(), str1.len() + str2.len());
215
216 let optimized_array = original.compact_buffers().unwrap();
218
219 assert_eq!(optimized_array.nbuffers(), 1);
220 assert_eq!(optimized_array.len(), 2);
221
222 for i in 0..2 {
224 assert_eq!(optimized_array.scalar_at(i), original.scalar_at(i));
225 }
226 }
227}