use arrow::array::{
Array as _, ArrayRef, AsArray as _, RecordBatch, RecordBatchOptions, StringViewBuilder,
};
use std::sync::Arc;
/// Compacts the data buffers of every sparse string-view column in `batch`.
///
/// Each column is handled independently:
/// - columns that are not string-view arrays are passed through (the `Arc` is cloned, no data
///   is copied),
/// - string-view arrays without any data buffer are passed through as well,
/// - every other string-view array is rebuilt with a fresh builder, but only when the total
///   capacity of its data buffers is more than twice the number of bytes needed by the strings
///   longer than 12 bytes. Otherwise the column is passed through untouched.
///
/// The returned batch uses the same schema and the same row count as `batch`.
///
/// # Panics
///
/// Panics if the record batch cannot be re-created from the resulting columns.
pub fn garbage_collect_string_view_batch(batch: &RecordBatch) -> RecordBatch {
    let new_columns: Vec<ArrayRef> = batch
        .columns()
        .iter()
        .map(|c| {
            // Only string-view columns can be compacted; anything else is shared as-is.
            let Some(s) = c.as_string_view_opt() else {
                return Arc::clone(c);
            };

            // No data buffers means there is nothing to reclaim.
            if s.data_buffers().is_empty() {
                return Arc::clone(c);
            }

            // The low 32 bits of a view hold the string length. Only strings longer than
            // 12 bytes are counted towards the space needed in the data buffers.
            let ideal_buffer_size: usize = s
                .views()
                .iter()
                .map(|v| {
                    let len = (*v as u32) as usize;
                    if len > 12 { len } else { 0 }
                })
                .sum();
            let actual_buffer_size: usize = s.data_buffers().iter().map(|b| b.capacity()).sum();

            // Rebuilding copies every string, so only do it when at least half of the
            // buffer capacity is wasted. `saturating_mul` keeps the comparison meaningful
            // even if doubling would overflow `usize`.
            if actual_buffer_size <= ideal_buffer_size.saturating_mul(2) {
                return Arc::clone(c);
            }

            // The builder takes its block size as a `u32`. A plain `as u32` cast would wrap
            // around for sizes above `u32::MAX` and request a tiny (possibly zero-sized)
            // block, so clamp to the largest representable block size instead.
            let exact_block_size = u32::try_from(ideal_buffer_size).ok();

            let mut builder = StringViewBuilder::with_capacity(s.len());
            if ideal_buffer_size > 0 {
                builder = builder.with_fixed_block_size(exact_block_size.unwrap_or(u32::MAX));
            }
            for v in s {
                builder.append_option(v);
            }
            let gc_string = builder.finish();

            // With a block size equal to the ideal size, everything is expected to land in at
            // most one buffer (zero when all strings are short). That expectation cannot hold
            // once the block size had to be clamped, so it is only checked when it was exact.
            re_log::debug_assert!(
                exact_block_size.is_none() || gc_string.data_buffers().len() <= 1
            );

            Arc::new(gc_string) as ArrayRef
        })
        .collect();

    // The row count is passed explicitly rather than inferred from the columns
    // (presumably so that a batch without columns keeps its row count — confirm).
    let options = RecordBatchOptions::new().with_row_count(Some(batch.num_rows()));

    RecordBatch::try_new_with_options(batch.schema(), new_columns, &options)
        .expect("Failed to re-create the garbage collected record batch")
}