// datafusion_datasource/write/demux.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Module containing helper methods/traits related to enabling
19//! dividing input stream into multiple output files at execution time
20
21use std::borrow::Cow;
22use std::collections::HashMap;
23use std::sync::Arc;
24
25use crate::url::ListingTableUrl;
26use crate::write::FileSinkConfig;
27use datafusion_common::error::Result;
28use datafusion_physical_plan::SendableRecordBatchStream;
29
30use arrow::array::{
31    ArrayAccessor, RecordBatch, StringArray, StructArray, builder::UInt64Builder,
32    cast::AsArray, downcast_dictionary_array,
33};
34use arrow::datatypes::{DataType, Schema};
35use datafusion_common::cast::{
36    as_boolean_array, as_date32_array, as_date64_array, as_float16_array,
37    as_float32_array, as_float64_array, as_int8_array, as_int16_array, as_int32_array,
38    as_int64_array, as_large_string_array, as_string_array, as_string_view_array,
39    as_uint8_array, as_uint16_array, as_uint32_array, as_uint64_array,
40};
41use datafusion_common::{exec_datafusion_err, internal_datafusion_err, not_impl_err};
42use datafusion_common_runtime::SpawnedTask;
43
44use chrono::NaiveDate;
45use datafusion_execution::TaskContext;
46use futures::StreamExt;
47use object_store::path::Path;
48use rand::distr::SampleString;
49use tokio::sync::mpsc::{self, Receiver, Sender, UnboundedReceiver, UnboundedSender};
50
/// Receiving end of the per-file channel: delivers the `RecordBatch`es that
/// belong in one output file.
type RecordBatchReceiver = Receiver<RecordBatch>;
/// Receiving end of the outer channel: the demuxer announces each new output
/// file as a pair of its destination `Path` and the per-file batch receiver.
pub type DemuxedStreamReceiver = UnboundedReceiver<(Path, RecordBatchReceiver)>;
53
54/// Splits a single [SendableRecordBatchStream] into a dynamically determined
55/// number of partitions at execution time.
56///
57/// The partitions are determined by factors known only at execution time, such
58/// as total number of rows and partition column values. The demuxer task
59/// communicates to the caller by sending channels over a channel. The inner
60/// channels send RecordBatches which should be contained within the same output
61/// file. The outer channel is used to send a dynamic number of inner channels,
62/// representing a dynamic number of total output files.
63///
64/// The caller is also responsible to monitor the demux task for errors and
65/// abort accordingly.
66///
67/// A path with an extension will force only a single file to
68/// be written with the extension from the path. Otherwise the default extension
69/// will be used and the output will be split into multiple files.
70///
71/// Output file guarantees:
72///  - Partitioned files: Files are created only for non-empty partitions.
73///  - Single-file output: 1 file is always written, even when the stream is empty.
74///  - Multi-file output: Depending on the number of record batches, 0 or more files are written.
75///
76/// Examples of `base_output_path`
77///  * `tmp/dataset/` -> is a folder since it ends in `/`
78///  * `tmp/dataset` -> is still a folder since it does not end in `/` but has no valid file extension
79///  * `tmp/file.parquet` -> is a file since it does not end in `/` and has a valid file extension `.parquet`
80///  * `tmp/file.parquet/` -> is a folder since it ends in `/`
81///
82/// The `partition_by` parameter will additionally split the input based on the
83/// unique values of a specific column, see
84/// <https://github.com/apache/datafusion/issues/7744>
85///
86/// ```text
87///                                                                              ┌───────────┐               ┌────────────┐    ┌─────────────┐
88///                                                                     ┌──────▶ │  batch 1  ├────▶...──────▶│   Batch a  │    │ Output File1│
89///                                                                     │        └───────────┘               └────────────┘    └─────────────┘
90///                                                                     │
91///                                                 ┌──────────┐        │        ┌───────────┐               ┌────────────┐    ┌─────────────┐
92/// ┌───────────┐               ┌────────────┐      │          │        ├──────▶ │  batch a+1├────▶...──────▶│   Batch b  │    │ Output File2│
93/// │  batch 1  ├────▶...──────▶│   Batch N  ├─────▶│  Demux   ├────────┤ ...    └───────────┘               └────────────┘    └─────────────┘
94/// └───────────┘               └────────────┘      │          │        │
95///                                                 └──────────┘        │        ┌───────────┐               ┌────────────┐    ┌─────────────┐
96///                                                                     └──────▶ │  batch d  ├────▶...──────▶│   Batch n  │    │ Output FileN│
97///                                                                              └───────────┘               └────────────┘    └─────────────┘
98/// ```
99pub(crate) fn start_demuxer_task(
100    config: &FileSinkConfig,
101    data: SendableRecordBatchStream,
102    context: &Arc<TaskContext>,
103) -> (SpawnedTask<Result<()>>, DemuxedStreamReceiver) {
104    let (tx, rx) = mpsc::unbounded_channel();
105    let context = Arc::clone(context);
106    let file_extension = config.file_extension.clone();
107    let base_output_path = config.table_paths[0].clone();
108    let task = if config.table_partition_cols.is_empty() {
109        let single_file_output = config
110            .file_output_mode
111            .single_file_output(&base_output_path);
112        SpawnedTask::spawn(async move {
113            row_count_demuxer(
114                tx,
115                data,
116                context,
117                base_output_path,
118                file_extension,
119                single_file_output,
120            )
121            .await
122        })
123    } else {
124        // There could be an arbitrarily large number of parallel hive style partitions being written to, so we cannot
125        // bound this channel without risking a deadlock.
126        let partition_by = config.table_partition_cols.clone();
127        let keep_partition_by_columns = config.keep_partition_by_columns;
128        SpawnedTask::spawn(async move {
129            hive_style_partitions_demuxer(
130                tx,
131                data,
132                context,
133                partition_by,
134                base_output_path,
135                file_extension,
136                keep_partition_by_columns,
137            )
138            .await
139        })
140    };
141
142    (task, rx)
143}
144
145/// Dynamically partitions input stream to achieve desired maximum rows per file
146async fn row_count_demuxer(
147    mut tx: UnboundedSender<(Path, Receiver<RecordBatch>)>,
148    mut input: SendableRecordBatchStream,
149    context: Arc<TaskContext>,
150    base_output_path: ListingTableUrl,
151    file_extension: String,
152    single_file_output: bool,
153) -> Result<()> {
154    let exec_options = &context.session_config().options().execution;
155
156    let max_rows_per_file = exec_options.soft_max_rows_per_output_file;
157    let max_buffered_batches = exec_options.max_buffered_batches_per_output_file;
158    let minimum_parallel_files = exec_options.minimum_parallel_output_files;
159    let mut part_idx = 0;
160    let write_id = rand::distr::Alphanumeric.sample_string(&mut rand::rng(), 16);
161
162    let mut open_file_streams = Vec::with_capacity(minimum_parallel_files);
163
164    let mut next_send_steam = 0;
165    let mut row_counts = Vec::with_capacity(minimum_parallel_files);
166
167    // Overrides if single_file_output is set
168    let minimum_parallel_files = if single_file_output {
169        1
170    } else {
171        minimum_parallel_files
172    };
173
174    let max_rows_per_file = if single_file_output {
175        usize::MAX
176    } else {
177        max_rows_per_file
178    };
179
180    if single_file_output {
181        // ensure we have one file open, even when the input stream is empty
182        open_file_streams.push(create_new_file_stream(
183            &base_output_path,
184            &write_id,
185            part_idx,
186            &file_extension,
187            single_file_output,
188            max_buffered_batches,
189            &mut tx,
190        )?);
191        row_counts.push(0);
192        part_idx += 1;
193    }
194
195    let schema = input.schema();
196    let mut is_batch_received = false;
197
198    while let Some(rb) = input.next().await.transpose()? {
199        is_batch_received = true;
200        // ensure we have at least minimum_parallel_files open
201        if open_file_streams.len() < minimum_parallel_files {
202            open_file_streams.push(create_new_file_stream(
203                &base_output_path,
204                &write_id,
205                part_idx,
206                &file_extension,
207                single_file_output,
208                max_buffered_batches,
209                &mut tx,
210            )?);
211            row_counts.push(0);
212            part_idx += 1;
213        } else if row_counts[next_send_steam] >= max_rows_per_file {
214            row_counts[next_send_steam] = 0;
215            open_file_streams[next_send_steam] = create_new_file_stream(
216                &base_output_path,
217                &write_id,
218                part_idx,
219                &file_extension,
220                single_file_output,
221                max_buffered_batches,
222                &mut tx,
223            )?;
224            part_idx += 1;
225        }
226        row_counts[next_send_steam] += rb.num_rows();
227        open_file_streams[next_send_steam]
228            .send(rb)
229            .await
230            .map_err(|_| {
231                exec_datafusion_err!("Error sending RecordBatch to file stream!")
232            })?;
233
234        next_send_steam = (next_send_steam + 1) % minimum_parallel_files;
235    }
236
237    // if there is no batch send but with a single file, send an empty batch
238    if single_file_output && !is_batch_received {
239        open_file_streams
240            .first_mut()
241            .ok_or_else(|| internal_datafusion_err!("Expected a single output file"))?
242            .send(RecordBatch::new_empty(schema))
243            .await
244            .map_err(|_| {
245                exec_datafusion_err!("Error sending empty RecordBatch to file stream!")
246            })?;
247    }
248
249    Ok(())
250}
251
252/// Helper for row count demuxer
253fn generate_file_path(
254    base_output_path: &ListingTableUrl,
255    write_id: &str,
256    part_idx: usize,
257    file_extension: &str,
258    single_file_output: bool,
259) -> Path {
260    if !single_file_output {
261        base_output_path
262            .prefix()
263            .child(format!("{write_id}_{part_idx}.{file_extension}"))
264    } else {
265        base_output_path.prefix().to_owned()
266    }
267}
268
269/// Helper for row count demuxer
270fn create_new_file_stream(
271    base_output_path: &ListingTableUrl,
272    write_id: &str,
273    part_idx: usize,
274    file_extension: &str,
275    single_file_output: bool,
276    max_buffered_batches: usize,
277    tx: &mut UnboundedSender<(Path, Receiver<RecordBatch>)>,
278) -> Result<Sender<RecordBatch>> {
279    let file_path = generate_file_path(
280        base_output_path,
281        write_id,
282        part_idx,
283        file_extension,
284        single_file_output,
285    );
286    let (tx_file, rx_file) = mpsc::channel(max_buffered_batches / 2);
287    tx.send((file_path, rx_file))
288        .map_err(|_| exec_datafusion_err!("Error sending RecordBatch to file stream!"))?;
289    Ok(tx_file)
290}
291
292/// Splits an input stream based on the distinct values of a set of columns
293/// Assumes standard hive style partition paths such as
294/// /col1=val1/col2=val2/outputfile.parquet
295async fn hive_style_partitions_demuxer(
296    tx: UnboundedSender<(Path, Receiver<RecordBatch>)>,
297    mut input: SendableRecordBatchStream,
298    context: Arc<TaskContext>,
299    partition_by: Vec<(String, DataType)>,
300    base_output_path: ListingTableUrl,
301    file_extension: String,
302    keep_partition_by_columns: bool,
303) -> Result<()> {
304    let write_id = rand::distr::Alphanumeric.sample_string(&mut rand::rng(), 16);
305
306    let exec_options = &context.session_config().options().execution;
307    let max_buffered_recordbatches = exec_options.max_buffered_batches_per_output_file;
308
309    // To support non string partition col types, cast the type to &str first
310    let mut value_map: HashMap<Vec<String>, Sender<RecordBatch>> = HashMap::new();
311
312    while let Some(rb) = input.next().await.transpose()? {
313        // First compute partition key for each row of batch, e.g. (col1=val1, col2=val2, ...)
314        let all_partition_values = compute_partition_keys_by_row(&rb, &partition_by)?;
315
316        // Next compute how the batch should be split up to take each distinct key to its own batch
317        let take_map = compute_take_arrays(&rb, &all_partition_values);
318
319        // Divide up the batch into distinct partition key batches and send each batch
320        for (part_key, mut builder) in take_map.into_iter() {
321            // Take method adapted from https://github.com/lancedb/lance/pull/1337/files
322            // TODO: upstream RecordBatch::take to arrow-rs
323            let take_indices = builder.finish();
324            let struct_array: StructArray = rb.clone().into();
325            let parted_batch = RecordBatch::from(
326                arrow::compute::take(&struct_array, &take_indices, None)?.as_struct(),
327            );
328
329            // Get or create channel for this batch
330            let part_tx = match value_map.get_mut(&part_key) {
331                Some(part_tx) => part_tx,
332                None => {
333                    // Create channel for previously unseen distinct partition key and notify consumer of new file
334                    let (part_tx, part_rx) =
335                        mpsc::channel::<RecordBatch>(max_buffered_recordbatches);
336                    let file_path = compute_hive_style_file_path(
337                        &part_key,
338                        &partition_by,
339                        &write_id,
340                        &file_extension,
341                        &base_output_path,
342                    );
343
344                    tx.send((file_path, part_rx)).map_err(|_| {
345                        exec_datafusion_err!("Error sending new file stream!")
346                    })?;
347
348                    value_map.insert(part_key.clone(), part_tx);
349                    value_map.get_mut(&part_key).ok_or_else(|| {
350                        exec_datafusion_err!("Key must exist since it was just inserted!")
351                    })?
352                }
353            };
354
355            let final_batch_to_send = if keep_partition_by_columns {
356                parted_batch
357            } else {
358                remove_partition_by_columns(&parted_batch, &partition_by)?
359            };
360
361            // Finally send the partial batch partitioned by distinct value!
362            part_tx.send(final_batch_to_send).await.map_err(|_| {
363                internal_datafusion_err!("Unexpected error sending parted batch!")
364            })?;
365        }
366    }
367
368    Ok(())
369}
370
371fn compute_partition_keys_by_row<'a>(
372    rb: &'a RecordBatch,
373    partition_by: &'a [(String, DataType)],
374) -> Result<Vec<Vec<Cow<'a, str>>>> {
375    let mut all_partition_values = vec![];
376
377    const EPOCH_DAYS_FROM_CE: i32 = 719_163;
378
379    // For the purposes of writing partitioned data, we can rely on schema inference
380    // to determine the type of the partition cols in order to provide a more ergonomic
381    // UI which does not require specifying DataTypes manually. So, we ignore the
382    // DataType within the partition_by array and infer the correct type from the
383    // batch schema instead.
384    let schema = rb.schema();
385    for (col, _) in partition_by.iter() {
386        let mut partition_values = vec![];
387
388        let dtype = schema.field_with_name(col)?.data_type();
389        let col_array = rb.column_by_name(col).ok_or(exec_datafusion_err!(
390            "PartitionBy Column {} does not exist in source data! Got schema {schema}.",
391            col
392        ))?;
393
394        match dtype {
395            DataType::Utf8 => {
396                let array = as_string_array(col_array)?;
397                for i in 0..rb.num_rows() {
398                    partition_values.push(Cow::from(array.value(i)));
399                }
400            }
401            DataType::LargeUtf8 => {
402                let array = as_large_string_array(col_array)?;
403                for i in 0..rb.num_rows() {
404                    partition_values.push(Cow::from(array.value(i)));
405                }
406            }
407            DataType::Utf8View => {
408                let array = as_string_view_array(col_array)?;
409                for i in 0..rb.num_rows() {
410                    partition_values.push(Cow::from(array.value(i)));
411                }
412            }
413            DataType::Boolean => {
414                let array = as_boolean_array(col_array)?;
415                for i in 0..rb.num_rows() {
416                    partition_values.push(Cow::from(array.value(i).to_string()));
417                }
418            }
419            DataType::Date32 => {
420                let array = as_date32_array(col_array)?;
421                // ISO-8601/RFC3339 format - yyyy-mm-dd
422                let format = "%Y-%m-%d";
423                for i in 0..rb.num_rows() {
424                    let date = NaiveDate::from_num_days_from_ce_opt(
425                        EPOCH_DAYS_FROM_CE + array.value(i),
426                    )
427                    .unwrap()
428                    .format(format)
429                    .to_string();
430                    partition_values.push(Cow::from(date));
431                }
432            }
433            DataType::Date64 => {
434                let array = as_date64_array(col_array)?;
435                // ISO-8601/RFC3339 format - yyyy-mm-dd
436                let format = "%Y-%m-%d";
437                for i in 0..rb.num_rows() {
438                    let date = NaiveDate::from_num_days_from_ce_opt(
439                        EPOCH_DAYS_FROM_CE + (array.value(i) / 86_400_000) as i32,
440                    )
441                    .unwrap()
442                    .format(format)
443                    .to_string();
444                    partition_values.push(Cow::from(date));
445                }
446            }
447            DataType::Int8 => {
448                let array = as_int8_array(col_array)?;
449                for i in 0..rb.num_rows() {
450                    partition_values.push(Cow::from(array.value(i).to_string()));
451                }
452            }
453            DataType::Int16 => {
454                let array = as_int16_array(col_array)?;
455                for i in 0..rb.num_rows() {
456                    partition_values.push(Cow::from(array.value(i).to_string()));
457                }
458            }
459            DataType::Int32 => {
460                let array = as_int32_array(col_array)?;
461                for i in 0..rb.num_rows() {
462                    partition_values.push(Cow::from(array.value(i).to_string()));
463                }
464            }
465            DataType::Int64 => {
466                let array = as_int64_array(col_array)?;
467                for i in 0..rb.num_rows() {
468                    partition_values.push(Cow::from(array.value(i).to_string()));
469                }
470            }
471            DataType::UInt8 => {
472                let array = as_uint8_array(col_array)?;
473                for i in 0..rb.num_rows() {
474                    partition_values.push(Cow::from(array.value(i).to_string()));
475                }
476            }
477            DataType::UInt16 => {
478                let array = as_uint16_array(col_array)?;
479                for i in 0..rb.num_rows() {
480                    partition_values.push(Cow::from(array.value(i).to_string()));
481                }
482            }
483            DataType::UInt32 => {
484                let array = as_uint32_array(col_array)?;
485                for i in 0..rb.num_rows() {
486                    partition_values.push(Cow::from(array.value(i).to_string()));
487                }
488            }
489            DataType::UInt64 => {
490                let array = as_uint64_array(col_array)?;
491                for i in 0..rb.num_rows() {
492                    partition_values.push(Cow::from(array.value(i).to_string()));
493                }
494            }
495            DataType::Float16 => {
496                let array = as_float16_array(col_array)?;
497                for i in 0..rb.num_rows() {
498                    partition_values.push(Cow::from(array.value(i).to_string()));
499                }
500            }
501            DataType::Float32 => {
502                let array = as_float32_array(col_array)?;
503                for i in 0..rb.num_rows() {
504                    partition_values.push(Cow::from(array.value(i).to_string()));
505                }
506            }
507            DataType::Float64 => {
508                let array = as_float64_array(col_array)?;
509                for i in 0..rb.num_rows() {
510                    partition_values.push(Cow::from(array.value(i).to_string()));
511                }
512            }
513            DataType::Dictionary(_, _) => {
514                downcast_dictionary_array!(
515                    col_array =>  {
516                        let array = col_array.downcast_dict::<StringArray>()
517                            .ok_or(exec_datafusion_err!("it is not yet supported to write to hive partitions with datatype {}",
518                            dtype))?;
519
520                        for i in 0..rb.num_rows() {
521                            partition_values.push(Cow::from(array.value(i)));
522                        }
523                    },
524                    _ => unreachable!(),
525                )
526            }
527            _ => {
528                return not_impl_err!(
529                    "it is not yet supported to write to hive partitions with datatype {}",
530                    dtype
531                );
532            }
533        }
534
535        all_partition_values.push(partition_values);
536    }
537
538    Ok(all_partition_values)
539}
540
541fn compute_take_arrays(
542    rb: &RecordBatch,
543    all_partition_values: &[Vec<Cow<str>>],
544) -> HashMap<Vec<String>, UInt64Builder> {
545    let mut take_map = HashMap::new();
546    for i in 0..rb.num_rows() {
547        let mut part_key = vec![];
548        for vals in all_partition_values.iter() {
549            part_key.push(vals[i].clone().into());
550        }
551        let builder = take_map.entry(part_key).or_insert_with(UInt64Builder::new);
552        builder.append_value(i as u64);
553    }
554    take_map
555}
556
557fn remove_partition_by_columns(
558    parted_batch: &RecordBatch,
559    partition_by: &[(String, DataType)],
560) -> Result<RecordBatch> {
561    let partition_names: Vec<_> = partition_by.iter().map(|(s, _)| s).collect();
562    let (non_part_cols, non_part_fields): (Vec<_>, Vec<_>) = parted_batch
563        .columns()
564        .iter()
565        .zip(parted_batch.schema().fields())
566        .filter_map(|(a, f)| {
567            if !partition_names.contains(&f.name()) {
568                Some((Arc::clone(a), (**f).clone()))
569            } else {
570                None
571            }
572        })
573        .unzip();
574
575    let non_part_schema = Schema::new(non_part_fields);
576    let final_batch_to_send =
577        RecordBatch::try_new(Arc::new(non_part_schema), non_part_cols)?;
578
579    Ok(final_batch_to_send)
580}
581
582fn compute_hive_style_file_path(
583    part_key: &[String],
584    partition_by: &[(String, DataType)],
585    write_id: &str,
586    file_extension: &str,
587    base_output_path: &ListingTableUrl,
588) -> Path {
589    let mut file_path = base_output_path.prefix().clone();
590    for j in 0..part_key.len() {
591        file_path = file_path.child(format!("{}={}", partition_by[j].0, part_key[j]));
592    }
593
594    file_path.child(format!("{write_id}.{file_extension}"))
595}