Skip to main content

webgraph_cli/dist/hyperball/
mod.rs

1/*
2 * SPDX-FileCopyrightText: 2025 Tommaso Fontana
3 * SPDX-FileCopyrightText: 2025 Sebastiano Vigna
4 *
5 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
6 */
7use crate::{FloatVectorFormat, GlobalArgs, GranularityArgs, NumThreadsArg, get_thread_pool};
8use anyhow::{Result, bail, ensure};
9use clap::{ArgGroup, Args, Parser};
10use dsi_bitstream::prelude::*;
11use dsi_progress_logger::{ProgressLog, concurrent_progress_logger};
12use epserde::deser::{Deserialize, Flags};
13use rand::SeedableRng;
14use std::path::PathBuf;
15use webgraph::{
16    graphs::bvgraph::get_endianness,
17    prelude::{BvGraph, DCF, DEG_CUMUL_EXTENSION},
18};
19use webgraph_algo::distances::hyperball::HyperBallBuilder;
20
21#[derive(Args, Debug, Clone)]
22#[clap(group = ArgGroup::new("centralities"))]
23/// Centralities that can be computed with hyperball.
24///
25/// To compress the result you can use named pipes or process substitution
26/// like `--harmonic >(zstd > harmonic.zstd)`.
27pub struct Centralities {
28    /// How all the centralities will be stored.
29    #[clap(long, value_enum, default_value_t = FloatVectorFormat::Ascii)]
30    pub fmt: FloatVectorFormat,
31    #[clap(long)]
32    /// How many decimal digits will be used to store centralities in text formats.
33    pub precision: Option<usize>,
34
35    /// Compute the approximate sum of distances and save them as at the given path.
36    #[clap(long)]
37    pub sum_of_distances: Option<PathBuf>,
38    /// Compute the approximate number of reachable nodes and save them as at the given path.
39    #[clap(long)]
40    pub reachable_nodes: Option<PathBuf>,
41    /// Compute the approximate harmonic centralities and save them as at the given path.
42    #[clap(long)]
43    pub harmonic: Option<PathBuf>,
44    /// Compute the approximate closeness centralities and save them as at the given path.
45    #[clap(long)]
46    pub closeness: Option<PathBuf>,
47    #[clap(long)]
48    /// Compute the approximate neighborhood function and save it as at the given path.
49    pub neighborhood_function: Option<PathBuf>,
50}
51
52impl Centralities {
53    pub fn should_compute_sum_of_distances(&self) -> bool {
54        self.sum_of_distances.is_some() || self.closeness.is_some()
55    }
56    pub fn should_compute_sum_of_inverse_distances(&self) -> bool {
57        self.harmonic.is_some()
58    }
59}
60
61#[derive(Parser, Debug)]
62#[command(
63    name = "hyperball",
64    about = "Use hyperball to compute centralities.",
65    long_about = None
66)]
67pub struct CliArgs {
68    /// The basename of the graph.
69    pub basename: PathBuf,
70
71    #[clap(long, default_value_t = false)]
72    /// Whether the graph is symmetric or not. If true, the algorithm will
73    /// use the graph as its transposed.
74    pub symm: bool,
75
76    /// The basename of the transposed graph. If available, HyperBall will
77    /// perform systolic iterations which will speed up the computation.
78    /// If the graph is symmetric, use the --symm option instead.
79    #[clap(short, long)]
80    pub transposed: Option<PathBuf>,
81
82    #[clap(flatten)]
83    pub centralities: Centralities,
84
85    #[clap(short = 'm', long, default_value_t = 14)]
86    /// The base-2 logarithm of the number of registers for the HyperLogLog
87    /// cardinality estimators.
88    pub log2m: usize,
89
90    #[clap(long, default_value_t = usize::MAX)]
91    /// Maximum number of iterations to run.
92    pub upper_bound: usize,
93
94    #[clap(long)]
95    /// A value that will be used to stop the computation by relative increment
96    /// if the neighborhood function is being computed. Otherwise, the
97    /// computation will stop when all estimators do not change their values.
98    pub threshold: Option<f64>,
99
100    #[clap(flatten)]
101    pub num_threads: NumThreadsArg,
102
103    #[clap(flatten)]
104    pub granularity: GranularityArgs,
105
106    #[clap(long, default_value_t = 0)]
107    /// The seed of the pseudorandom number generator used for initialization.
108    pub seed: u64,
109}
110
111pub fn main(global_args: GlobalArgs, args: CliArgs) -> Result<()> {
112    ensure!(
113        !args.symm || args.transposed.is_none(),
114        "If the graph is symmetric, you should not pass the transpose."
115    );
116
117    match get_endianness(&args.basename)?.as_str() {
118        #[cfg(feature = "be_bins")]
119        BE::NAME => hyperball::<BE>(global_args, args),
120        #[cfg(feature = "le_bins")]
121        LE::NAME => hyperball::<LE>(global_args, args),
122        e => panic!("Unknown endianness: {}", e),
123    }
124}
125
126pub fn hyperball<E: Endianness>(global_args: GlobalArgs, args: CliArgs) -> Result<()> {
127    let mut pl = concurrent_progress_logger![];
128    if let Some(log_interval) = global_args.log_interval {
129        pl.log_interval(log_interval);
130    }
131    let thread_pool = get_thread_pool(args.num_threads.num_threads);
132
133    let graph = BvGraph::with_basename(&args.basename).load()?;
134
135    log::info!("Loading DCF...");
136    if !args.basename.with_extension(DEG_CUMUL_EXTENSION).exists() {
137        bail!(
138            "Missing DCF file. Please run `webgraph build dcf {}`.",
139            args.basename.display()
140        );
141    }
142    let deg_cumul = unsafe {
143        DCF::mmap(
144            args.basename.with_extension(DEG_CUMUL_EXTENSION),
145            Flags::RANDOM_ACCESS,
146        )
147    }?;
148
149    log::info!("Loading Transposed graph...");
150    let mut transposed = None;
151    if let Some(transposed_path) = args.transposed.as_ref() {
152        transposed = Some(BvGraph::with_basename(transposed_path).load()?);
153    }
154    let mut transposed_ref = transposed.as_ref();
155    if args.symm {
156        transposed_ref = Some(&graph);
157    }
158
159    let mut hb = HyperBallBuilder::with_hyper_log_log(
160        &graph,
161        transposed_ref,
162        deg_cumul.uncase(),
163        args.log2m,
164        None,
165    )?
166    .granularity(args.granularity.into_granularity())
167    .sum_of_distances(args.centralities.should_compute_sum_of_distances())
168    .sum_of_inverse_distances(args.centralities.should_compute_sum_of_inverse_distances())
169    .build(&mut pl);
170
171    log::info!("Starting Hyperball...");
172    let rng = rand::rngs::SmallRng::seed_from_u64(args.seed);
173    thread_pool.install(|| hb.run(args.upper_bound, args.threshold, rng, &mut pl))?;
174
175    log::info!("Storing the results...");
176
177    /// here we use a macro to avoid duplicating the code, it can't be a function
178    /// because different centralities have different return types
179    macro_rules! store_centrality {
180        ($flag:ident, $method:ident, $description:expr) => {{
181            if let Some(path) = args.centralities.$flag {
182                log::info!("Saving {} to {}", $description, path.display());
183                let value = hb.$method()?;
184                args.centralities
185                    .fmt
186                    .store(path, &value, args.centralities.precision)?;
187            }
188        }};
189    }
190
191    store_centrality!(sum_of_distances, sum_of_distances, "sum of distances");
192    store_centrality!(harmonic, harmonic_centralities, "harmonic centralities");
193    store_centrality!(closeness, closeness_centrality, "closeness centralities");
194    store_centrality!(reachable_nodes, reachable_nodes, "reachable nodes");
195    store_centrality!(
196        neighborhood_function,
197        neighborhood_function,
198        "neighborhood function"
199    );
200
201    Ok(())
202}