use std::sync::{Arc, Mutex};
use arrow::compute::concat_batches;
use arrow_array::{RecordBatch, UInt64Array};
use futures::{StreamExt, TryStreamExt};
use lance::io::ObjectStore;
use lance_core::{cache::LanceCache, utils::futures::FinallyStreamExt};
use lance_encoding::decoder::DecoderPlugins;
use lance_file::{
reader::{FileReader, FileReaderOptions},
writer::{FileWriter, FileWriterOptions},
};
use lance_index::scalar::IndexReader;
use lance_io::{
scheduler::{ScanScheduler, SchedulerConfig},
utils::CachedFileSize,
};
use rand::{Rng, RngCore, seq::SliceRandom};
use crate::{
Error, Result,
arrow::{SendableRecordBatchStream, SimpleRecordBatchStream},
dataloader::permutation::util::{TemporaryDirectory, non_crypto_rng},
};
#[derive(Debug, Clone)]
pub struct ShufflerConfig {
pub seed: Option<u64>,
pub max_rows_per_file: u64,
pub temp_dir: TemporaryDirectory,
pub clump_size: Option<u64>,
}
impl Default for ShufflerConfig {
fn default() -> Self {
Self {
max_rows_per_file: 1024 * 1024,
seed: Option::default(),
temp_dir: TemporaryDirectory::default(),
clump_size: None,
}
}
}
pub struct Shuffler {
config: ShufflerConfig,
id: String,
}
impl Shuffler {
pub fn new(config: ShufflerConfig) -> Self {
let id = uuid::Uuid::new_v4().to_string();
Self { config, id }
}
fn shuffle_batch(
batch: &RecordBatch,
rng: &mut dyn RngCore,
clump_size: u64,
) -> Result<RecordBatch> {
let num_clumps = (batch.num_rows() as u64).div_ceil(clump_size);
let mut indices = (0..num_clumps).collect::<Vec<_>>();
indices.shuffle(rng);
let indices = if clump_size == 1 {
UInt64Array::from(indices)
} else {
UInt64Array::from_iter_values(indices.iter().flat_map(|&clump_index| {
if clump_index == num_clumps - 1 {
clump_index * clump_size..batch.num_rows() as u64
} else {
clump_index * clump_size..(clump_index + 1) * clump_size
}
}))
};
Ok(arrow::compute::take_record_batch(batch, &indices)?)
}
async fn in_memory_shuffle(
&self,
data: SendableRecordBatchStream,
mut rng: Box<dyn RngCore + Send>,
) -> Result<SendableRecordBatchStream> {
let schema = data.schema();
let batches = data.try_collect::<Vec<_>>().await?;
let batch = concat_batches(&schema, &batches)?;
let shuffled = Self::shuffle_batch(&batch, &mut rng, self.config.clump_size.unwrap_or(1))?;
log::debug!("Shuffle job {}: in-memory shuffle complete", self.id);
Ok(Box::pin(SimpleRecordBatchStream::new(
futures::stream::once(async move { Ok(shuffled) }),
schema,
)))
}
async fn do_shuffle(
&self,
mut data: SendableRecordBatchStream,
num_rows: u64,
mut rng: Box<dyn RngCore + Send>,
) -> Result<SendableRecordBatchStream> {
let num_files = num_rows.div_ceil(self.config.max_rows_per_file);
let temp_dir = self.config.temp_dir.create_temp_dir()?;
let tmp_dir = temp_dir.path().to_path_buf();
let clump_size = self.config.clump_size.unwrap_or(1);
if clump_size == 0 {
return Err(Error::InvalidInput {
message: "clump size must be greater than 0".to_string(),
});
}
let object_store = ObjectStore::local();
let arrow_schema = data.schema();
let schema = lance::datatypes::Schema::try_from(arrow_schema.as_ref())?;
let mut file_writers = Vec::with_capacity(num_files as usize);
for file_index in 0..num_files {
let path = tmp_dir.join(format!("shuffle_{}_{file_index}.lance", self.id));
let path =
object_store::path::Path::from_absolute_path(path).map_err(|err| Error::Other {
message: format!("Failed to create temporary file: {}", err),
source: None,
})?;
let object_writer = object_store.create(&path).await?;
let writer =
FileWriter::try_new(object_writer, schema.clone(), FileWriterOptions::default())?;
file_writers.push(writer);
}
let mut num_rows_seen = 0;
while let Some(batch) = data.try_next().await? {
num_rows_seen += batch.num_rows() as u64;
let is_last = num_rows_seen == num_rows;
if num_rows_seen > num_rows {
return Err(Error::Runtime {
message: format!("Expected {} rows but saw {} rows", num_rows, num_rows_seen),
});
}
if !is_last && !(batch.num_rows() as u64).is_multiple_of(clump_size) {
return Err(Error::Runtime {
message: format!(
"Expected batch size ({}) to be divisible by clump size ({})",
batch.num_rows(),
clump_size
),
});
}
let num_clumps = (batch.num_rows() as u64).div_ceil(clump_size);
let mut batch_offsets_for_files =
vec![Vec::<u64>::with_capacity(batch.num_rows()); num_files as usize];
for clump_offset in 0..num_clumps {
let clump_start = clump_offset * clump_size;
let num_rows_in_clump = clump_size.min(batch.num_rows() as u64 - clump_start);
let clump_end = clump_start + num_rows_in_clump;
let file_index = rng.random_range(0..num_files);
batch_offsets_for_files[file_index as usize].extend(clump_start..clump_end);
}
for (file_index, batch_offsets) in batch_offsets_for_files.into_iter().enumerate() {
if batch_offsets.is_empty() {
continue;
}
let indices = UInt64Array::from(batch_offsets);
let partition = arrow::compute::take_record_batch(&batch, &indices)?;
file_writers[file_index].write_batch(&partition).await?;
}
}
for (file_idx, mut writer) in file_writers.into_iter().enumerate() {
let num_written = writer.finish().await?;
log::debug!(
"Shuffle job {}: wrote {} rows to file {}",
self.id,
num_written,
file_idx
);
}
let scheduler_config = SchedulerConfig::max_bandwidth(&object_store);
let scan_scheduler = ScanScheduler::new(Arc::new(object_store), scheduler_config);
let job_id = self.id.clone();
let rng = Arc::new(Mutex::new(rng));
let stream = futures::stream::iter(0..num_files)
.then(move |file_index| {
let scan_scheduler = scan_scheduler.clone();
let rng = rng.clone();
let tmp_dir = tmp_dir.clone();
let job_id = job_id.clone();
async move {
let path = tmp_dir.join(format!("shuffle_{}_{file_index}.lance", job_id));
let path = object_store::path::Path::from_absolute_path(path).unwrap();
let file_scheduler = scan_scheduler
.open_file(&path, &CachedFileSize::unknown())
.await?;
let reader = FileReader::try_open(
file_scheduler,
None,
Arc::<DecoderPlugins>::default(),
&LanceCache::no_cache(),
FileReaderOptions::default(),
)
.await?;
let batch = reader.read_record_batch(0, reader.num_rows()).await?;
let mut rng = rng.lock().unwrap();
Self::shuffle_batch(&batch, &mut rng, clump_size)
}
})
.finally(move || drop(temp_dir))
.boxed();
Ok(Box::pin(SimpleRecordBatchStream::new(stream, arrow_schema)))
}
pub async fn shuffle(
self,
data: SendableRecordBatchStream,
num_rows: u64,
) -> Result<SendableRecordBatchStream> {
log::debug!(
"Shuffle job {}: shuffling {} rows and {} columns",
self.id,
num_rows,
data.schema().fields.len()
);
let rng = non_crypto_rng(&self.config.seed);
if num_rows < self.config.max_rows_per_file {
return self.in_memory_shuffle(data, rng).await;
}
self.do_shuffle(data, num_rows, rng).await
}
}
#[cfg(test)]
mod tests {
use crate::arrow::LanceDbDatagenExt;
use super::*;
use arrow::{array::AsArray, datatypes::Int32Type};
use datafusion::prelude::SessionContext;
use datafusion_expr::col;
use futures::TryStreamExt;
use lance_datagen::{BatchCount, BatchGeneratorBuilder, ByteCount, RowCount, Seed};
use rand::{SeedableRng, rngs::SmallRng};
fn test_gen() -> BatchGeneratorBuilder {
lance_datagen::gen_batch()
.with_seed(Seed::from(42))
.col("id", lance_datagen::array::step::<Int32Type>())
.col(
"name",
lance_datagen::array::rand_utf8(ByteCount::from(10), false),
)
}
fn create_test_batch(size: RowCount) -> RecordBatch {
test_gen().into_batch_rows(size).unwrap()
}
fn create_test_stream(
num_batches: BatchCount,
batch_size: RowCount,
) -> SendableRecordBatchStream {
test_gen().into_ldb_stream(batch_size, num_batches)
}
#[test]
fn test_shuffle_batch_deterministic() {
let batch = create_test_batch(RowCount::from(10));
let mut rng1 = SmallRng::seed_from_u64(42);
let mut rng2 = SmallRng::seed_from_u64(42);
let shuffled1 = Shuffler::shuffle_batch(&batch, &mut rng1, 1).unwrap();
let shuffled2 = Shuffler::shuffle_batch(&batch, &mut rng2, 1).unwrap();
assert_eq!(shuffled1, shuffled2);
}
#[test]
fn test_shuffle_with_clumps() {
let batch = create_test_batch(RowCount::from(10));
let mut rng = SmallRng::seed_from_u64(42);
let shuffled = Shuffler::shuffle_batch(&batch, &mut rng, 3).unwrap();
let values = shuffled.column(0).as_primitive::<Int32Type>();
let mut iter = values.into_iter().map(|o| o.unwrap());
let mut frag_seen = false;
let mut clumps_seen = 0;
while let Some(first) = iter.next() {
if first != 9 {
let second = iter.next().unwrap();
let third = iter.next().unwrap();
assert_eq!(first + 1, second);
assert_eq!(first + 2, third);
clumps_seen += 1;
} else {
frag_seen = true;
}
}
assert_eq!(clumps_seen, 3);
assert!(frag_seen);
}
async fn sort_batch(batch: RecordBatch) -> RecordBatch {
let ctx = SessionContext::new();
let df = ctx.read_batch(batch).unwrap();
let sorted = df.sort_by(vec![col("id")]).unwrap();
let batches = sorted.collect().await.unwrap();
let schema = batches[0].schema();
concat_batches(&schema, &batches).unwrap()
}
#[tokio::test]
async fn test_shuffle_batch_preserves_data() {
let batch = create_test_batch(RowCount::from(100));
let mut rng = SmallRng::seed_from_u64(42);
let shuffled = Shuffler::shuffle_batch(&batch, &mut rng, 1).unwrap();
assert_ne!(shuffled, batch);
let sorted = sort_batch(shuffled).await;
assert_eq!(sorted, batch);
}
#[test]
fn test_shuffle_batch_empty() {
let batch = create_test_batch(RowCount::from(0));
let mut rng = SmallRng::seed_from_u64(42);
let shuffled = Shuffler::shuffle_batch(&batch, &mut rng, 1).unwrap();
assert_eq!(shuffled.num_rows(), 0);
}
#[tokio::test]
async fn test_in_memory_shuffle() {
let config = ShufflerConfig {
temp_dir: TemporaryDirectory::None,
..Default::default()
};
let shuffler = Shuffler::new(config);
let stream = create_test_stream(BatchCount::from(5), RowCount::from(20));
let result_stream = shuffler.shuffle(stream, 100).await.unwrap();
let result_batches: Vec<RecordBatch> = result_stream.try_collect().await.unwrap();
assert_eq!(result_batches.len(), 1);
let result_batch = result_batches.into_iter().next().unwrap();
let unshuffled_batches = create_test_stream(BatchCount::from(5), RowCount::from(20))
.try_collect::<Vec<_>>()
.await
.unwrap();
let schema = unshuffled_batches[0].schema();
let unshuffled_batch = concat_batches(&schema, &unshuffled_batches).unwrap();
let sorted = sort_batch(result_batch).await;
assert_eq!(unshuffled_batch, sorted);
}
#[tokio::test]
async fn test_external_shuffle() {
let config = ShufflerConfig {
max_rows_per_file: 100,
..Default::default()
};
let shuffler = Shuffler::new(config);
let stream = create_test_stream(BatchCount::from(5), RowCount::from(1000));
let result_stream = shuffler.shuffle(stream, 5000).await.unwrap();
let result_batches: Vec<RecordBatch> = result_stream.try_collect().await.unwrap();
let unshuffled_batches = create_test_stream(BatchCount::from(5), RowCount::from(1000))
.try_collect::<Vec<_>>()
.await
.unwrap();
let schema = unshuffled_batches[0].schema();
let unshuffled_batch = concat_batches(&schema, &unshuffled_batches).unwrap();
assert_eq!(result_batches.len(), 50);
let result_batch = concat_batches(&schema, &result_batches).unwrap();
let sorted = sort_batch(result_batch).await;
assert_eq!(unshuffled_batch, sorted);
}
#[test_log::test(tokio::test)]
async fn test_external_clump_shuffle() {
let config = ShufflerConfig {
max_rows_per_file: 100,
clump_size: Some(30),
..Default::default()
};
let shuffler = Shuffler::new(config);
let stream = create_test_stream(BatchCount::from(5), RowCount::from(900));
let schema = stream.schema();
let mut batches = stream.try_collect::<Vec<_>>().await.unwrap();
let last_index = batches.len() - 1;
let sliced_last = batches[last_index].slice(0, 890);
batches[last_index] = sliced_last;
let stream = Box::pin(SimpleRecordBatchStream::new(
futures::stream::iter(batches).map(Ok).boxed(),
schema.clone(),
));
let result_stream = shuffler.shuffle(stream, 4490).await.unwrap();
let result_batches: Vec<RecordBatch> = result_stream.try_collect().await.unwrap();
let result_batch = concat_batches(&schema, &result_batches).unwrap();
let ids = result_batch.column(0).as_primitive::<Int32Type>();
let mut iter = ids.into_iter().map(|o| o.unwrap());
while let Some(first) = iter.next() {
let rows_left_in_clump = if first == 4470 { 19 } else { 29 };
let mut expected_next = first + 1;
for _ in 0..rows_left_in_clump {
let next = iter.next().unwrap();
assert_eq!(next, expected_next);
expected_next += 1;
}
}
}
}