// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
use arrow_array::{cast::AsArray, ArrayRef};
use arrow_buffer::BooleanBufferBuilder;
use arrow_schema::DataType;
use lance_core::Result;
use crate::encoder::{BufferEncoder, EncodedBuffer};
/// Buffer encoder that passes each array's first data buffer through as-is.
///
/// No bytes are rewritten: every input array contributes exactly one part to
/// the resulting [`EncodedBuffer`], in input order.
#[derive(Debug, Default)]
pub struct FlatBufferEncoder {}

impl BufferEncoder for FlatBufferEncoder {
    /// Collects buffer 0 of every array in `arrays` into the parts of an
    /// [`EncodedBuffer`].
    ///
    /// # Panics
    ///
    /// The `[0]` index panics if an array's data exposes no buffers at all.
    fn encode(&self, arrays: &[ArrayRef]) -> Result<EncodedBuffer> {
        // One output part per input array.
        let mut parts = Vec::with_capacity(arrays.len());
        for array in arrays {
            let data = array.to_data();
            // Only the first buffer of the array data is emitted.
            let first_buffer = data.buffers()[0].clone();
            parts.push(first_buffer);
        }
        Ok(EncodedBuffer { parts })
    }
}
/// Encoder for writing boolean arrays as dense bitmaps
#[derive(Debug, Default)]
pub struct BitmapBufferEncoder {}

impl BufferEncoder for BitmapBufferEncoder {
    /// Packs the values of every boolean array in `arrays` into one contiguous
    /// bitmap.
    ///
    /// The returned [`EncodedBuffer`] always has exactly one part, holding the
    /// bits of all input arrays appended back to back in input order.
    ///
    /// In debug builds this asserts that every input array has type
    /// [`DataType::Boolean`] and that the total number of rows is non-zero.
    fn encode(&self, arrays: &[ArrayRef]) -> Result<EncodedBuffer> {
        debug_assert!(arrays
            .iter()
            .all(|arr| *arr.data_type() == DataType::Boolean));
        // Count rows in `usize`, the type both `len()` and the builder use, so
        // there is no lossy narrowing cast and no `u32` overflow on large input.
        let num_rows: usize = arrays.iter().map(|arr| arr.len()).sum();
        // Empty pages don't make sense, this should be prevented before we
        // get here
        debug_assert_ne!(num_rows, 0);
        // We can't just write the inner value buffers one after the other because
        // bitmaps can have junk padding at the end (e.g. a boolean array with 12
        // values will be 2 bytes but the last four bits of the second byte are
        // garbage). So we go ahead and pay the cost of a copy (we could avoid this
        // if we really needed to, at the expense of more complicated code and a
        // slightly larger encoded size, but writer cost generally doesn't matter
        // as much as reader cost).
        let mut builder = BooleanBufferBuilder::new(num_rows);
        for arr in arrays {
            let bool_arr = arr.as_boolean();
            builder.append_buffer(bool_arr.values());
        }
        let buffer = builder.finish().into_inner();
        Ok(EncodedBuffer {
            parts: vec![buffer],
        })
    }
}