use std::path::PathBuf;
use anyhow::Result;
use ar_row_derive::ArRowDeserialize;
use rayon::prelude::*;
use super::orc::{get_dataset_readers, par_iter_arrow};
/// Builds a parallel iterator over the rows of the dataset's `origin` table.
///
/// Every row is turned into one `(url, identifier)` pair, where `identifier`
/// is the row's `id` column prefixed with `swh:1:ori:`.
///
/// # Errors
///
/// Returns the error reported by `get_dataset_readers` when the readers for
/// the `origin` table under `dataset_dir` cannot be obtained.
pub fn iter_origins(
    dataset_dir: &PathBuf,
) -> Result<impl ParallelIterator<Item = (String, String)>> {
    // Columns deserialized from each row of the `origin` table.
    #[derive(ArRowDeserialize, Default, Clone)]
    struct Origin {
        id: String,
        url: String,
    }

    let reader_builders = get_dataset_readers(dataset_dir, "origin")?;

    // Each reader is processed as its own parallel stream; the streams are
    // then flattened into a single iterator of pairs.
    let pairs = reader_builders.into_par_iter().flat_map(|builder| {
        par_iter_arrow(builder, |row: Origin| {
            let Origin { id, url } = row;
            let identifier = format!("swh:1:ori:{}", id);
            // One-element array: exactly one output pair per input row.
            [(url, identifier)]
        })
    });

    Ok(pairs)
}