vortex_array/arrays/varbinview/
compact.rs1use vortex_error::{VortexResult, VortexUnwrap};
8
9use crate::arrays::VarBinViewArray;
10use crate::builders::{ArrayBuilder, VarBinViewBuilder};
11use crate::validity::Validity;
12use crate::vtable::ValidityHelper;
13
14impl VarBinViewArray {
15 pub fn compact_buffers(&self) -> VortexResult<VarBinViewArray> {
23 if !self.should_compact() {
25 return Ok(self.clone());
26 }
27
28 match self.validity() {
30 Validity::AllInvalid => Ok(VarBinViewArray::try_new(
32 self.views().clone(),
33 Default::default(),
34 self.dtype().clone(),
35 self.validity().clone(),
36 )?),
37 Validity::NonNullable | Validity::AllValid => rebuild_nonnull(self),
39 Validity::Array(_) => rebuild_nullable(self),
41 }
42 }
43
44 fn should_compact(&self) -> bool {
45 if self.nbuffers() == 0 {
47 return false;
48 }
49
50 let bytes_referenced: u64 = self.count_referenced_bytes();
51 let buffer_total_bytes: u64 = self.buffers.iter().map(|buf| buf.len() as u64).sum();
52
53 bytes_referenced < buffer_total_bytes
56 }
57
58 fn count_referenced_bytes(&self) -> u64 {
61 match self.validity() {
62 Validity::AllInvalid => 0u64,
63 _ => self
64 .views()
65 .iter()
66 .enumerate()
67 .map(|(idx, &view)| {
68 if !self.is_valid(idx).vortex_unwrap() || view.is_inlined() {
69 0u64
70 } else {
71 view.len() as u64
72 }
73 })
74 .sum(),
75 }
76 }
77}
78
79fn rebuild_nullable(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
82 let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
83 for i in 0..array.len() {
84 if !array.is_valid(i)? {
85 builder.append_null();
86 } else {
87 let bytes = array.bytes_at(i);
88 builder.append_value(bytes.as_slice());
89 }
90 }
91
92 Ok(builder.finish_into_varbinview())
93}
94
95fn rebuild_nonnull(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
98 let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
99 for i in 0..array.len() {
100 builder.append_value(array.bytes_at(i).as_ref());
101 }
102 Ok(builder.finish_into_varbinview())
103}
104
105#[cfg(test)]
106mod tests {
107 use vortex_buffer::buffer;
108
109 use crate::IntoArray;
110 use crate::arrays::{VarBinViewArray, VarBinViewVTable};
111 use crate::compute::take;
112
113 #[test]
114 fn test_optimize_compacts_buffers() {
115 let original = VarBinViewArray::from_iter_nullable_str([
117 Some("short"),
118 Some("this is a longer string that will be stored in a buffer"),
119 Some("medium length string"),
120 Some("another very long string that definitely needs a buffer to store it"),
121 Some("tiny"),
122 ]);
123
124 assert!(original.nbuffers() > 0);
126 let original_buffers = original.nbuffers();
127
128 let indices = buffer![0u32, 4u32].into_array();
130 let taken = take(original.as_ref(), &indices).unwrap();
131 let taken_array = taken.as_::<VarBinViewVTable>();
132
133 assert_eq!(taken_array.nbuffers(), original_buffers);
135
136 let optimized_array = taken_array.compact_buffers().unwrap();
138
139 assert!(optimized_array.nbuffers() <= 1);
143
144 assert_eq!(optimized_array.len(), 2);
146 assert_eq!(optimized_array.scalar_at(0).unwrap(), "short".into());
147 assert_eq!(optimized_array.scalar_at(1).unwrap(), "tiny".into());
148 }
149
150 #[test]
151 fn test_optimize_with_long_strings() {
152 let long_string_1 = "this is definitely a very long string that exceeds the inline limit";
154 let long_string_2 = "another extremely long string that also needs external buffer storage";
155 let long_string_3 = "yet another long string for testing buffer compaction functionality";
156
157 let original = VarBinViewArray::from_iter_str([
158 long_string_1,
159 long_string_2,
160 long_string_3,
161 "short1",
162 "short2",
163 ]);
164
165 let indices = buffer![0u32, 2u32].into_array();
167 let taken = take(original.as_ref(), &indices).unwrap();
168 let taken_array = taken.as_::<VarBinViewVTable>();
169
170 let optimized_array = taken_array.compact_buffers().unwrap();
172
173 assert_eq!(optimized_array.nbuffers(), 1);
175
176 assert_eq!(optimized_array.len(), 2);
178 assert_eq!(optimized_array.scalar_at(0).unwrap(), long_string_1.into());
179 assert_eq!(optimized_array.scalar_at(1).unwrap(), long_string_3.into());
180 }
181
182 #[test]
183 fn test_optimize_no_buffers() {
184 let original = VarBinViewArray::from_iter_str(["a", "bb", "ccc", "dddd"]);
186
187 assert_eq!(original.nbuffers(), 0);
189
190 let optimized_array = original.compact_buffers().unwrap();
192
193 assert_eq!(optimized_array.nbuffers(), 0);
194 assert_eq!(optimized_array.len(), 4);
195
196 for i in 0..4 {
198 assert_eq!(
199 optimized_array.scalar_at(i).unwrap(),
200 original.scalar_at(i).unwrap()
201 );
202 }
203 }
204
205 #[test]
206 fn test_optimize_single_buffer() {
207 let str1 = "this is a long string that goes into a buffer";
209 let str2 = "another long string in the same buffer";
210 let original = VarBinViewArray::from_iter_str([str1, str2]);
211
212 assert_eq!(original.nbuffers(), 1);
214 assert_eq!(original.buffer(0).len(), str1.len() + str2.len());
215
216 let optimized_array = original.compact_buffers().unwrap();
218
219 assert_eq!(optimized_array.nbuffers(), 1);
220 assert_eq!(optimized_array.len(), 2);
221
222 for i in 0..2 {
224 assert_eq!(
225 optimized_array.scalar_at(i).unwrap(),
226 original.scalar_at(i).unwrap()
227 );
228 }
229 }
230}