use std::path::PathBuf;
use anyhow::Result;
use ar_row::deserialize::{ArRowDeserialize, ArRowStruct};
use ar_row_derive::ArRowDeserialize;
use orc_rust::arrow_reader::ArrowReaderBuilder;
use orc_rust::reader::ChunkReader;
use rayon::prelude::*;
use super::orc::{get_dataset_readers, par_iter_arrow};
/// A `(fullname, sha256_fullname)` pair of owned byte strings, in that order,
/// as produced by the iterators in this file.
type ExportedFullname = (Box<[u8]>, Box<[u8]>);
/// Returns a parallel iterator over every `(fullname, sha256_fullname)` pair
/// read from the readers that `get_dataset_readers` returns for `subdirectory`
/// of `dataset_dir`.
///
/// Each reader is expanded into its rows by `iter_fullnames_from_file`, and the
/// per-reader results are flattened into a single parallel iterator. No
/// ordering of the yielded items is promised by this function.
///
/// # Errors
///
/// Propagates any error returned by `get_dataset_readers`.
pub fn iter_fullnames(
    dataset_dir: &PathBuf,
    subdirectory: &str,
) -> Result<impl ParallelIterator<Item = ExportedFullname>> {
    // Call the helper directly: the former forwarding closure and the chain
    // onto an empty array added indirection without contributing any items.
    let readers = get_dataset_readers(dataset_dir, subdirectory)?;
    Ok(readers.into_par_iter().flat_map(iter_fullnames_from_file))
}
/// Forwards `reader_builder` and `f` to `par_iter_arrow`, fixing the item type
/// of the resulting parallel iterator to `ExportedFullname`.
///
/// * `T` is the row type handed to `f`; it must be deserializable through
///   `ar_row` (`ArRowDeserialize + ArRowStruct`).
/// * `f` turns one row into zero or more `ExportedFullname` values.
///
/// NOTE(review): presumably `par_iter_arrow` decodes rows of type `T` from the
/// reader and flattens the vectors returned by `f` — confirm in `super::orc`.
fn map_fullnames<T, F, R>(
    reader_builder: ArrowReaderBuilder<R>,
    f: F,
) -> impl ParallelIterator<Item = ExportedFullname>
where
    R: ChunkReader + Send,
    T: ArRowDeserialize + ArRowStruct + Send,
    F: Fn(T) -> Vec<ExportedFullname> + Send + Sync,
{
    par_iter_arrow(reader_builder, f)
}
/// Builds a parallel iterator over the `(fullname, sha256_fullname)` pairs of
/// a single reader.
///
/// Rows are deserialized into a local `Row` struct with the two byte-string
/// fields `fullname` and `sha256_fullname`; each row yields exactly one pair.
fn iter_fullnames_from_file<R: ChunkReader + Send>(
    reader_builder: ArrowReaderBuilder<R>,
) -> impl ParallelIterator<Item = ExportedFullname> {
    #[derive(ArRowDeserialize, Default, Clone)]
    struct Row {
        fullname: Box<[u8]>,
        sha256_fullname: Box<[u8]>,
    }

    map_fullnames(reader_builder, |row: Row| {
        // Move both fields out of the row; no copy of the byte buffers is made.
        let Row {
            fullname,
            sha256_fullname,
        } = row;
        vec![(fullname, sha256_fullname)]
    })
}