vortex_array/arrays/varbinview/
compact.rs1use vortex_error::{VortexResult, VortexUnwrap};
8
9use crate::arrays::VarBinViewArray;
10use crate::builders::{ArrayBuilder, VarBinViewBuilder};
11use crate::validity::Validity;
12use crate::vtable::ValidityHelper;
13
14pub fn compact_buffers(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
22 if !should_compact(array) {
24 return Ok(array.clone());
25 }
26
27 match array.validity() {
29 Validity::AllInvalid => Ok(VarBinViewArray::try_new(
31 array.views().clone(),
32 vec![],
33 array.dtype().clone(),
34 array.validity().clone(),
35 )?),
36 Validity::NonNullable | Validity::AllValid => rebuild_nonnull(array),
38 Validity::Array(_) => rebuild_nullable(array),
40 }
41}
42
43fn should_compact(array: &VarBinViewArray) -> bool {
44 if array.nbuffers() == 0 {
46 return false;
47 }
48
49 let bytes_referenced: u64 = count_referenced_bytes(array);
50 let buffer_total_bytes: u64 = array.buffers.iter().map(|buf| buf.len() as u64).sum();
51
52 bytes_referenced < buffer_total_bytes
55}
56
57fn count_referenced_bytes(array: &VarBinViewArray) -> u64 {
60 match array.validity() {
61 Validity::AllInvalid => 0u64,
62 _ => {
63 array
64 .views()
65 .iter()
66 .enumerate()
67 .map(|(idx, &view)| {
68 if !array.is_valid(idx).vortex_unwrap() || view.is_inlined() {
69 0u64
70 } else {
71 unsafe { view._ref }.size as u64
73 }
74 })
75 .sum()
76 }
77 }
78}
79
80fn rebuild_nullable(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
83 let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
84 for i in 0..array.len() {
85 if !array.is_valid(i)? {
86 builder.append_null();
87 } else {
88 let bytes = array.bytes_at(i);
89 builder.append_value(bytes.as_slice());
90 }
91 }
92
93 Ok(builder.finish_into_varbinview())
94}
95
96fn rebuild_nonnull(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
99 let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
100 for i in 0..array.len() {
101 builder.append_value(array.bytes_at(i).as_ref());
102 }
103 Ok(builder.finish_into_varbinview())
104}
105
#[cfg(test)]
mod tests {
    use vortex_buffer::buffer;

    use crate::IntoArray;
    use crate::arrays::varbinview::compact::compact_buffers;
    use crate::arrays::{VarBinViewArray, VarBinViewVTable};
    use crate::compute::take;

    /// Taking only the short elements keeps the source buffers attached; compaction
    /// must bring the buffer count down to at most one and preserve the values.
    #[test]
    fn test_optimize_compacts_buffers() {
        let source = VarBinViewArray::from_iter_nullable_str([
            Some("short"),
            Some("this is a longer string that will be stored in a buffer"),
            Some("medium length string"),
            Some("another very long string that definitely needs a buffer to store it"),
            Some("tiny"),
        ]);

        let source_nbuffers = source.nbuffers();
        assert!(source_nbuffers > 0);

        // Select the first and last elements only.
        let picks = buffer![0u32, 4u32].into_array();
        let selected = take(source.as_ref(), &picks).unwrap();
        let selected_views = selected.as_::<VarBinViewVTable>();

        // The take result still carries as many buffers as the source.
        assert_eq!(selected_views.nbuffers(), source_nbuffers);

        let compacted = compact_buffers(selected_views).unwrap();

        assert!(compacted.nbuffers() <= 1);
        assert_eq!(compacted.len(), 2);

        for (idx, expected) in ["short", "tiny"].into_iter().enumerate() {
            assert_eq!(compacted.scalar_at(idx).unwrap(), expected.into());
        }
    }

    /// Keeping two of three buffer-backed strings must yield exactly one buffer after
    /// compaction, with both strings intact.
    #[test]
    fn test_optimize_with_long_strings() {
        let long_string_1 = "this is definitely a very long string that exceeds the inline limit";
        let long_string_2 = "another extremely long string that also needs external buffer storage";
        let long_string_3 = "yet another long string for testing buffer compaction functionality";

        let source = VarBinViewArray::from_iter_str([
            long_string_1,
            long_string_2,
            long_string_3,
            "short1",
            "short2",
        ]);

        // Keep the first and third long strings.
        let picks = buffer![0u32, 2u32].into_array();
        let selected = take(source.as_ref(), &picks).unwrap();
        let selected_views = selected.as_::<VarBinViewVTable>();

        let compacted = compact_buffers(selected_views).unwrap();

        assert_eq!(compacted.nbuffers(), 1);
        assert_eq!(compacted.len(), 2);

        for (idx, expected) in [long_string_1, long_string_3].into_iter().enumerate() {
            assert_eq!(compacted.scalar_at(idx).unwrap(), expected.into());
        }
    }

    /// An array without any buffers must come back with no buffers and equal values.
    #[test]
    fn test_optimize_no_buffers() {
        let source = VarBinViewArray::from_iter_str(["a", "bb", "ccc", "dddd"]);

        assert_eq!(source.nbuffers(), 0);

        let compacted = compact_buffers(&source).unwrap();

        assert_eq!(compacted.nbuffers(), 0);
        assert_eq!(compacted.len(), 4);

        for idx in 0..4 {
            let actual = compacted.scalar_at(idx).unwrap();
            let expected = source.scalar_at(idx).unwrap();
            assert_eq!(actual, expected);
        }
    }

    /// A single buffer that holds exactly the two strings must stay a single buffer,
    /// and the values must be unchanged.
    #[test]
    fn test_optimize_single_buffer() {
        let str1 = "this is a long string that goes into a buffer";
        let str2 = "another long string in the same buffer";
        let source = VarBinViewArray::from_iter_str([str1, str2]);

        assert_eq!(source.nbuffers(), 1);
        assert_eq!(source.buffer(0).len(), str1.len() + str2.len());

        let compacted = compact_buffers(&source).unwrap();

        assert_eq!(compacted.nbuffers(), 1);
        assert_eq!(compacted.len(), 2);

        for idx in 0..2 {
            let actual = compacted.scalar_at(idx).unwrap();
            let expected = source.scalar_at(idx).unwrap();
            assert_eq!(actual, expected);
        }
    }
}