swh_graph/compress/
iter_origins.rs1use std::path::PathBuf;
8
9use anyhow::Result;
10use ar_row_derive::ArRowDeserialize;
11use rayon::prelude::*;
12
13use super::orc::{get_dataset_readers, par_iter_arrow};
14
15pub fn iter_origins(
16 dataset_dir: &PathBuf,
17) -> Result<impl ParallelIterator<Item = (String, String)>> {
18 #[derive(ArRowDeserialize, Default, Clone)]
19 struct Origin {
20 id: String,
21 url: String,
22 }
23
24 Ok(get_dataset_readers(dataset_dir, "origin")?
25 .into_par_iter()
26 .flat_map(|reader_builder| {
27 par_iter_arrow(reader_builder, |ori: Origin| {
28 [(ori.url, format!("swh:1:ori:{}", ori.id))]
29 })
30 }))
31}