use std::{collections::HashMap, sync::Arc};
use arrow_array::RecordBatch;
use arrow_schema::{DataType, Field, Schema};
use criterion::{Criterion, criterion_group, criterion_main};
use lance_encoding::{
encoder::{EncodingOptions, default_encoding_strategy, encode_batch},
version::LanceFileVersion,
};
fn bench_encode_compressed(c: &mut Criterion) {
let rt = tokio::runtime::Runtime::new().unwrap();
let mut group = c.benchmark_group("encode_compressed");
const NUM_ROWS: usize = 5_000_000;
const NUM_COLUMNS: usize = 10;
let array: Arc<dyn arrow_array::Array> = Arc::new(arrow_array::StringArray::from_iter_values(
(0..NUM_ROWS).map(|i| format!("prefix_that_compresses_well_{}", i)),
));
for compression in ["zstd", "lz4"] {
let mut metadata = HashMap::new();
metadata.insert(
"lance-encoding:compression".to_string(),
compression.to_string(),
);
metadata.insert(
"lance-encoding:dict-divisor".to_string(),
"100000".to_string(),
);
metadata.insert(
"lance-encoding:structural-encoding".to_string(),
"miniblock".to_string(),
);
let fields: Vec<Field> = (0..NUM_COLUMNS)
.map(|i| {
Field::new(format!("s{}", i), DataType::Utf8, false).with_metadata(metadata.clone())
})
.collect();
let columns: Vec<Arc<dyn arrow_array::Array>> =
(0..NUM_COLUMNS).map(|_| array.clone()).collect();
let schema = Arc::new(Schema::new(fields));
let data = RecordBatch::try_new(schema.clone(), columns).unwrap();
let lance_schema =
Arc::new(lance_core::datatypes::Schema::try_from(schema.as_ref()).unwrap());
let encoding_strategy = default_encoding_strategy(LanceFileVersion::V2_2);
group.throughput(criterion::Throughput::Elements(
(NUM_ROWS * NUM_COLUMNS) as u64,
));
group.bench_function(
format!("{}_strings_{}cols", compression, NUM_COLUMNS),
|b| {
b.iter(|| {
rt.block_on(encode_batch(
&data,
lance_schema.clone(),
encoding_strategy.as_ref(),
&EncodingOptions::default(),
))
.unwrap()
})
},
);
}
}
#[cfg(target_os = "linux")]
criterion_group!(
name=benches;
config = Criterion::default().significance_level(0.1).sample_size(10)
.with_profiler(pprof::criterion::PProfProfiler::new(100, pprof::criterion::Output::Flamegraph(None)));
targets = bench_encode_compressed);
#[cfg(not(target_os = "linux"))]
criterion_group!(
name=benches;
config = Criterion::default().significance_level(0.1).sample_size(10);
targets = bench_encode_compressed);
criterion_main!(benches);