use std::sync::{
Arc,
atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering},
};
use arrow_array::{Array, BooleanArray, RecordBatch, UInt64Array};
use arrow_schema::{DataType, Field, Schema};
use datafusion_common::hash_utils::create_hashes;
use futures::{StreamExt, TryStreamExt};
use lance_arrow::SchemaExt;
use lance_core::ROW_ID;
use crate::{
Error, Result,
arrow::{SendableRecordBatchStream, SimpleRecordBatchStream},
dataloader::{
permutation::shuffle::{Shuffler, ShufflerConfig},
permutation::util::TemporaryDirectory,
},
query::{Query, QueryBase, Select},
};
pub const SPLIT_ID_COLUMN: &str = "split_id";
#[derive(Debug, Clone, Default)]
pub enum SplitStrategy {
#[default]
NoSplit,
Random {
seed: Option<u64>,
sizes: SplitSizes,
},
Hash {
columns: Vec<String>,
split_weights: Vec<u64>,
discard_weight: u64,
},
Sequential { sizes: SplitSizes },
Calculated { calculation: String },
}
impl SplitStrategy {
pub fn validate(&self, num_rows: u64) -> Result<()> {
match self {
Self::NoSplit => Ok(()),
Self::Random { sizes, .. } => sizes.validate(num_rows),
Self::Hash {
split_weights,
columns,
..
} => {
if columns.is_empty() {
return Err(Error::InvalidInput {
message: "Hash strategy requires at least one column".to_string(),
});
}
if split_weights.is_empty() {
return Err(Error::InvalidInput {
message: "Hash strategy requires at least one split weight".to_string(),
});
}
if split_weights.contains(&0) {
return Err(Error::InvalidInput {
message: "Split weights must be greater than 0".to_string(),
});
}
Ok(())
}
Self::Sequential { sizes } => sizes.validate(num_rows),
Self::Calculated { .. } => Ok(()),
}
}
}
pub struct Splitter {
temp_dir: TemporaryDirectory,
strategy: SplitStrategy,
}
impl Splitter {
pub fn new(temp_dir: TemporaryDirectory, strategy: SplitStrategy) -> Self {
Self { temp_dir, strategy }
}
fn sequential_split_id(
num_rows: u64,
split_sizes: &[u64],
split_index: &AtomicUsize,
counter_in_split: &AtomicU64,
exhausted: &AtomicBool,
) -> UInt64Array {
let mut split_ids = Vec::<u64>::with_capacity(num_rows as usize);
while split_ids.len() < num_rows as usize {
let split_id = split_index.load(Ordering::Relaxed);
let counter = counter_in_split.load(Ordering::Relaxed);
let split_size = split_sizes[split_id];
let remaining_in_split = split_size - counter;
let remaining_in_batch = num_rows - split_ids.len() as u64;
let mut done = false;
let rows_to_add = if remaining_in_batch < remaining_in_split {
counter_in_split.fetch_add(remaining_in_batch, Ordering::Relaxed);
remaining_in_batch
} else {
split_index.fetch_add(1, Ordering::Relaxed);
counter_in_split.store(0, Ordering::Relaxed);
if split_id == split_sizes.len() - 1 {
exhausted.store(true, Ordering::Relaxed);
done = true;
}
remaining_in_split
};
split_ids.extend(std::iter::repeat_n(split_id as u64, rows_to_add as usize));
if done {
break;
}
}
UInt64Array::from(split_ids)
}
async fn apply_sequential(
&self,
source: SendableRecordBatchStream,
num_rows: u64,
sizes: &SplitSizes,
) -> Result<SendableRecordBatchStream> {
let split_sizes = sizes.to_counts(num_rows);
let split_index = AtomicUsize::new(0);
let counter_in_split = AtomicU64::new(0);
let exhausted = AtomicBool::new(false);
let schema = source.schema();
let new_schema = Arc::new(schema.try_with_column(Field::new(
SPLIT_ID_COLUMN,
DataType::UInt64,
false,
))?);
let new_schema_clone = new_schema.clone();
let stream = source.filter_map(move |batch| {
let batch = match batch {
Ok(batch) => batch,
Err(e) => {
return std::future::ready(Some(Err(e)));
}
};
if exhausted.load(Ordering::Relaxed) {
return std::future::ready(None);
}
let split_ids = Self::sequential_split_id(
batch.num_rows() as u64,
&split_sizes,
&split_index,
&counter_in_split,
&exhausted,
);
let mut arrays = batch.columns().to_vec();
if split_ids.len() < batch.num_rows() {
arrays = arrays
.iter()
.map(|arr| arr.slice(0, split_ids.len()))
.collect();
}
arrays.push(Arc::new(split_ids));
std::future::ready(Some(Ok(
RecordBatch::try_new(new_schema.clone(), arrays).unwrap()
)))
});
Ok(Box::pin(SimpleRecordBatchStream::new(
stream,
new_schema_clone,
)))
}
fn hash_split_id(batch: &RecordBatch, thresholds: &[u64], total_weight: u64) -> UInt64Array {
let arrays = batch
.columns()
.iter()
.take(batch.columns().len() - 1)
.cloned()
.collect::<Vec<_>>();
let mut hashes = vec![0; batch.num_rows()];
let random_state = ahash::RandomState::with_seeds(0, 0, 0, 0);
create_hashes(&arrays, &random_state, &mut hashes).unwrap();
let split_ids = hashes
.iter()
.map(|h| {
let h = h % total_weight;
let split_id = match thresholds.binary_search(&h) {
Ok(i) => (i + 1) as u64,
Err(i) => i as u64,
};
if split_id == thresholds.len() as u64 {
None
} else {
Some(split_id)
}
})
.collect::<Vec<_>>();
UInt64Array::from(split_ids)
}
async fn apply_hash(
&self,
source: SendableRecordBatchStream,
weights: &[u64],
discard_weight: u64,
) -> Result<SendableRecordBatchStream> {
let row_id_index = source.schema().fields.len() - 1;
let new_schema = Arc::new(Schema::new(vec![
source.schema().field(row_id_index).clone(),
Field::new(SPLIT_ID_COLUMN, DataType::UInt64, false),
]));
let total_weight = weights.iter().sum::<u64>() + discard_weight;
let mut offset = 0;
let thresholds = weights
.iter()
.map(|w| {
let value = offset + w;
offset = value;
value
})
.collect::<Vec<_>>();
let new_schema_clone = new_schema.clone();
let stream = source.map_ok(move |batch| {
let split_ids = Self::hash_split_id(&batch, &thresholds, total_weight);
if split_ids.null_count() > 0 {
let is_valid = split_ids.nulls().unwrap().inner();
let is_valid_mask = BooleanArray::new(is_valid.clone(), None);
let split_ids = arrow::compute::filter(&split_ids, &is_valid_mask).unwrap();
let row_ids = batch.column(row_id_index);
let row_ids = arrow::compute::filter(row_ids.as_ref(), &is_valid_mask).unwrap();
RecordBatch::try_new(new_schema.clone(), vec![row_ids, split_ids]).unwrap()
} else {
RecordBatch::try_new(
new_schema.clone(),
vec![batch.column(row_id_index).clone(), Arc::new(split_ids)],
)
.unwrap()
}
});
Ok(Box::pin(SimpleRecordBatchStream::new(
stream,
new_schema_clone,
)))
}
pub async fn apply(
&self,
source: SendableRecordBatchStream,
num_rows: u64,
) -> Result<SendableRecordBatchStream> {
self.strategy.validate(num_rows)?;
match &self.strategy {
SplitStrategy::NoSplit => {
self.apply_sequential(source, num_rows, &SplitSizes::Counts(vec![num_rows]))
.await
}
SplitStrategy::Random { seed, sizes } => {
let shuffler = Shuffler::new(ShufflerConfig {
seed: *seed,
max_rows_per_file: 10 * 1024 * 1024,
temp_dir: self.temp_dir.clone(),
clump_size: None,
});
let shuffled = shuffler.shuffle(source, num_rows).await?;
self.apply_sequential(shuffled, num_rows, sizes).await
}
SplitStrategy::Sequential { sizes } => {
self.apply_sequential(source, num_rows, sizes).await
}
SplitStrategy::Calculated { .. } => Ok(source),
SplitStrategy::Hash {
split_weights,
discard_weight,
..
} => {
self.apply_hash(source, split_weights, *discard_weight)
.await
}
}
}
pub fn project(&self, query: Query) -> Query {
match &self.strategy {
SplitStrategy::Calculated { calculation } => query.select(Select::Dynamic(vec![
(SPLIT_ID_COLUMN.to_string(), calculation.clone()),
(ROW_ID.to_string(), ROW_ID.to_string()),
])),
SplitStrategy::Hash { columns, .. } => {
let mut cols = columns.clone();
cols.push(ROW_ID.to_string());
query.select(Select::Columns(cols))
}
_ => query,
}
}
pub fn orders_by_split_id(&self) -> bool {
match &self.strategy {
SplitStrategy::Hash { .. } | SplitStrategy::Calculated { .. } => true,
SplitStrategy::NoSplit
| SplitStrategy::Sequential { .. }
| SplitStrategy::Random { .. } => false,
}
}
}
#[derive(Debug, Clone)]
pub enum SplitSizes {
Percentages(Vec<f64>),
Counts(Vec<u64>),
Fixed(u64),
}
impl Default for SplitSizes {
fn default() -> Self {
Self::Percentages(vec![1.0])
}
}
impl SplitSizes {
pub fn validate(&self, num_rows: u64) -> Result<()> {
match self {
Self::Percentages(percentages) => {
for percentage in percentages {
if *percentage < 0.0 || *percentage > 1.0 {
return Err(Error::InvalidInput {
message: "Split percentages must be between 0.0 and 1.0".to_string(),
});
}
if percentage * (num_rows as f64) < 1.0 {
return Err(Error::InvalidInput {
message: format!(
"One of the splits has {}% of {} rows which rounds to 0 rows",
percentage * 100.0,
num_rows
),
});
}
}
if percentages.iter().sum::<f64>() > 1.0 {
return Err(Error::InvalidInput {
message: "Split percentages must sum to 1.0 or less".to_string(),
});
}
}
Self::Counts(counts) => {
if counts.iter().sum::<u64>() > num_rows {
return Err(Error::InvalidInput {
message: format!(
"Split counts specified {} rows but only {} are available",
counts.iter().sum::<u64>(),
num_rows
),
});
}
if counts.contains(&0) {
return Err(Error::InvalidInput {
message: "Split counts must be greater than 0".to_string(),
});
}
}
Self::Fixed(num_splits) => {
if *num_splits > num_rows {
return Err(Error::InvalidInput {
message: format!(
"Split fixed config specified {} splits but only {} rows are available. Must have at least 1 row per split.",
*num_splits, num_rows
),
});
}
if (num_rows / num_splits) == 0 {
return Err(Error::InvalidInput {
message: format!(
"Split fixed config specified {} splits but only {} rows are available. Must have at least 1 row per split.",
*num_splits, num_rows
),
});
}
}
}
Ok(())
}
pub fn to_counts(&self, num_rows: u64) -> Vec<u64> {
let sizes = match self {
Self::Percentages(percentages) => {
let mut percentage_sum = 0.0_f64;
let mut counts = percentages
.iter()
.map(|p| {
let count = (p * (num_rows as f64)).round() as u64;
percentage_sum += p;
count
})
.collect::<Vec<_>>();
let sum = counts.iter().sum::<u64>();
let is_basically_one =
(num_rows as f64 - percentage_sum * num_rows as f64).abs() < 0.5;
let mut excess = sum as i64 - num_rows as i64;
let mut drop_idx = 0;
while excess > 0 {
if counts[drop_idx] > 0 {
counts[drop_idx] -= 1;
excess -= 1;
}
drop_idx += 1;
if drop_idx == counts.len() {
drop_idx = 0;
}
}
let mut add_idx = 0;
while is_basically_one && excess < 0 {
counts[add_idx] += 1;
add_idx += 1;
excess += 1;
if add_idx == counts.len() {
add_idx = 0;
}
}
counts
}
Self::Counts(counts) => counts.clone(),
Self::Fixed(num_splits) => {
let rows_per_split = num_rows / *num_splits;
vec![rows_per_split; *num_splits as usize]
}
};
assert!(sizes.iter().sum::<u64>() <= num_rows);
sizes
}
}
#[cfg(test)]
mod tests {
use crate::arrow::LanceDbDatagenExt;
use super::*;
use arrow::{
array::AsArray,
compute::concat_batches,
datatypes::{Int32Type, UInt64Type},
};
use arrow_array::Int32Array;
use futures::TryStreamExt;
use lance_datagen::{BatchCount, ByteCount, RowCount, Seed};
use std::sync::Arc;
const ID_COLUMN: &str = "id";
#[test]
fn test_split_sizes_percentages_validation() {
let sizes = SplitSizes::Percentages(vec![0.7, 0.3]);
assert!(sizes.validate(100).is_ok());
let sizes = SplitSizes::Percentages(vec![0.7, 0.4]);
assert!(sizes.validate(100).is_err());
let sizes = SplitSizes::Percentages(vec![-0.1, 0.5]);
assert!(sizes.validate(100).is_err());
let sizes = SplitSizes::Percentages(vec![1.5]);
assert!(sizes.validate(100).is_err());
let sizes = SplitSizes::Percentages(vec![0.001]);
assert!(sizes.validate(100).is_err());
}
#[test]
fn test_split_sizes_counts_validation() {
let sizes = SplitSizes::Counts(vec![30, 70]);
assert!(sizes.validate(100).is_ok());
let sizes = SplitSizes::Counts(vec![60, 50]);
assert!(sizes.validate(100).is_err());
let sizes = SplitSizes::Counts(vec![0, 100]);
assert!(sizes.validate(100).is_err());
}
#[test]
fn test_split_sizes_fixed_validation() {
let sizes = SplitSizes::Fixed(5);
assert!(sizes.validate(100).is_ok());
let sizes = SplitSizes::Fixed(150);
assert!(sizes.validate(100).is_err());
}
#[test]
fn test_split_sizes_to_sizes_percentages() {
let sizes = SplitSizes::Percentages(vec![0.3, 0.7]);
let result = sizes.to_counts(100);
assert_eq!(result, vec![30, 70]);
let sizes = SplitSizes::Percentages(vec![0.3, 0.41]);
let result = sizes.to_counts(70);
assert_eq!(result, vec![21, 29]);
}
#[test]
fn test_split_sizes_to_sizes_fixed() {
let sizes = SplitSizes::Fixed(4);
let result = sizes.to_counts(100);
assert_eq!(result, vec![25, 25, 25, 25]);
let sizes = SplitSizes::Fixed(3);
let result = sizes.to_counts(10);
assert_eq!(result, vec![3, 3, 3]);
}
fn test_data() -> SendableRecordBatchStream {
lance_datagen::gen_batch()
.with_seed(Seed::from(42))
.col(ID_COLUMN, lance_datagen::array::step::<Int32Type>())
.into_ldb_stream(RowCount::from(10), BatchCount::from(5))
}
async fn verify_splitter(
splitter: Splitter,
data: SendableRecordBatchStream,
num_rows: u64,
expected_split_sizes: &[u64],
row_ids_in_order: bool,
) {
let split_batches = splitter
.apply(data, num_rows)
.await
.unwrap()
.try_collect::<Vec<_>>()
.await
.unwrap();
let schema = split_batches[0].schema();
let split_batch = concat_batches(&schema, &split_batches).unwrap();
let total_split_sizes = expected_split_sizes.iter().sum::<u64>();
assert_eq!(split_batch.num_rows(), total_split_sizes as usize);
let mut expected = Vec::with_capacity(total_split_sizes as usize);
for (i, size) in expected_split_sizes.iter().enumerate() {
expected.extend(std::iter::repeat_n(i as u64, *size as usize));
}
let expected = Arc::new(UInt64Array::from(expected)) as Arc<dyn Array>;
assert_eq!(&expected, split_batch.column(1));
let expected_row_ids =
Arc::new(Int32Array::from_iter_values(0..total_split_sizes as i32)) as Arc<dyn Array>;
if row_ids_in_order {
assert_eq!(&expected_row_ids, split_batch.column(0));
} else {
assert_ne!(&expected_row_ids, split_batch.column(0));
}
}
#[tokio::test]
async fn test_fixed_sequential_split() {
let splitter = Splitter::new(
TemporaryDirectory::None,
SplitStrategy::Sequential {
sizes: SplitSizes::Fixed(3),
},
);
verify_splitter(splitter, test_data(), 50, &[16, 16, 16], true).await;
}
#[tokio::test]
async fn test_fixed_random_split() {
let splitter = Splitter::new(
TemporaryDirectory::None,
SplitStrategy::Random {
seed: Some(42),
sizes: SplitSizes::Fixed(3),
},
);
verify_splitter(splitter, test_data(), 50, &[16, 16, 16], false).await;
}
#[tokio::test]
async fn test_counts_sequential_split() {
let splitter = Splitter::new(
TemporaryDirectory::None,
SplitStrategy::Sequential {
sizes: SplitSizes::Counts(vec![5, 15, 10]),
},
);
verify_splitter(splitter, test_data(), 50, &[5, 15, 10], true).await;
}
#[tokio::test]
async fn test_counts_random_split() {
let splitter = Splitter::new(
TemporaryDirectory::None,
SplitStrategy::Random {
seed: Some(42),
sizes: SplitSizes::Counts(vec![5, 15, 10]),
},
);
verify_splitter(splitter, test_data(), 50, &[5, 15, 10], false).await;
}
#[tokio::test]
async fn test_percentages_sequential_split() {
let splitter = Splitter::new(
TemporaryDirectory::None,
SplitStrategy::Sequential {
sizes: SplitSizes::Percentages(vec![0.217, 0.168, 0.17]),
},
);
verify_splitter(splitter, test_data(), 50, &[11, 8, 9], true).await;
}
#[tokio::test]
async fn test_percentages_random_split() {
let splitter = Splitter::new(
TemporaryDirectory::None,
SplitStrategy::Random {
seed: Some(42),
sizes: SplitSizes::Percentages(vec![0.217, 0.168, 0.17]),
},
);
verify_splitter(splitter, test_data(), 50, &[11, 8, 9], false).await;
}
#[tokio::test]
async fn test_hash_split() {
let data = lance_datagen::gen_batch()
.with_seed(Seed::from(42))
.col(
"hash1",
lance_datagen::array::rand_utf8(ByteCount::from(10), false),
)
.col("hash2", lance_datagen::array::step::<Int32Type>())
.col(ID_COLUMN, lance_datagen::array::step::<Int32Type>())
.into_ldb_stream(RowCount::from(10), BatchCount::from(5));
let splitter = Splitter::new(
TemporaryDirectory::None,
SplitStrategy::Hash {
columns: vec!["hash1".to_string(), "hash2".to_string()],
split_weights: vec![1, 2],
discard_weight: 1,
},
);
let split_batches = splitter
.apply(data, 10)
.await
.unwrap()
.try_collect::<Vec<_>>()
.await
.unwrap();
let schema = split_batches[0].schema();
let split_batch = concat_batches(&schema, &split_batches).unwrap();
assert_eq!(split_batch.num_rows(), 36);
assert_eq!(split_batch.num_columns(), 2);
let split_ids = split_batch.column(1).as_primitive::<UInt64Type>().values();
let num_in_split_0 = split_ids.iter().filter(|v| **v == 0).count();
let num_in_split_1 = split_ids.iter().filter(|v| **v == 1).count();
assert_eq!(num_in_split_0, 11); assert_eq!(num_in_split_1, 25); }
}