swh_graph/compress/
iter_origins.rs

1// Copyright (C) 2023-2024  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6//! Iterator on the set of all origin URLs in an ORC dataset
7use std::path::PathBuf;
8
9use anyhow::Result;
10use ar_row_derive::ArRowDeserialize;
11use rayon::prelude::*;
12
13use super::orc::{get_dataset_readers, par_iter_arrow};
14
15pub fn iter_origins(
16    dataset_dir: &PathBuf,
17) -> Result<impl ParallelIterator<Item = (String, String)>> {
18    #[derive(ArRowDeserialize, Default, Clone)]
19    struct Origin {
20        id: String,
21        url: String,
22    }
23
24    Ok(get_dataset_readers(dataset_dir, "origin")?
25        .into_par_iter()
26        .flat_map(|reader_builder| {
27            par_iter_arrow(reader_builder, |ori: Origin| {
28                [(ori.url, format!("swh:1:ori:{}", ori.id))]
29            })
30        }))
31}