webgraph_cli/dist/hyperball/
mod.rs

1/*
2 * SPDX-FileCopyrightText: 2025 Tommaso Fontana
3 * SPDX-FileCopyrightText: 2025 Sebastiano Vigna
4 *
5 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
6 */
7use crate::{get_thread_pool, FloatVectorFormat, GlobalArgs, GranularityArgs, NumThreadsArg};
8use anyhow::{ensure, Result};
9use clap::{ArgGroup, Args, Parser};
10use dsi_bitstream::prelude::*;
11use dsi_progress_logger::{concurrent_progress_logger, ProgressLog};
12use epserde::deser::{Deserialize, Flags};
13use rand::SeedableRng;
14use std::path::PathBuf;
15use webgraph::{
16    graphs::bvgraph::get_endianness,
17    prelude::{BvGraph, DCF, DEG_CUMUL_EXTENSION},
18};
19use webgraph_algo::distances::hyperball::HyperBallBuilder;
20
21#[derive(Args, Debug, Clone)]
22#[clap(group = ArgGroup::new("centralities"))]
23/// Centralities that can be computed with hyperball.
24///
25/// To compress the result you can use named pipes or process substitution
26/// like `--harmonic >(zstd > harmonic.zstd)`.
27pub struct Centralities {
28    /// How all the centralities will be stored.
29    #[clap(long, value_enum, default_value_t = FloatVectorFormat::Ascii)]
30    pub fmt: FloatVectorFormat,
31    #[clap(long)]
32    /// How many decimal digits will be used to store centralities in text formats.
33    pub precision: Option<usize>,
34
35    /// Compute the approximate sum of distances and save them as at the given path.
36    #[clap(long)]
37    pub sum_of_distances: Option<PathBuf>,
38    /// Compute the approximate number of reachable nodes and save them as at the given path.
39    #[clap(long)]
40    pub reachable_nodes: Option<PathBuf>,
41    /// Compute the approximate harmonic centralities and save them as at the given path.
42    #[clap(long)]
43    pub harmonic: Option<PathBuf>,
44    /// Compute the approximate closeness centralities and save them as at the given path.
45    #[clap(long)]
46    pub closeness: Option<PathBuf>,
47    #[clap(long)]
48    /// Compute the approximate neighborhood function and save it as at the given path.
49    pub neighborhood_function: Option<PathBuf>,
50}
51
52impl Centralities {
53    pub fn should_compute_sum_of_distances(&self) -> bool {
54        self.sum_of_distances.is_some() || self.closeness.is_some()
55    }
56    pub fn should_compute_sum_of_inverse_distances(&self) -> bool {
57        self.harmonic.is_some()
58    }
59}
60
61#[derive(Parser, Debug)]
62#[command(
63    name = "hyperball",
64    about = "Use hyperball to compute centralities.",
65    long_about = ""
66)]
67pub struct CliArgs {
68    /// The basename of the graph.
69    pub basename: PathBuf,
70
71    #[clap(long, default_value_t = false)]
72    /// Whether the graph is symmetric or not. If true, the algorithm will
73    /// use the graph as its transposed.
74    pub symm: bool,
75
76    /// The basename of the transposed graph. If available, HyperBall will
77    /// perform systolic iterations which will speed up the computation.
78    /// If the graph is symmetric, use the --symm option instead.
79    #[clap(short, long)]
80    pub transposed: Option<PathBuf>,
81
82    /// Compute the approximate neighborhood function, which will be
83    /// store in ASCII format as BASENAME.nf.
84    #[clap(short, long)]
85    pub neighborhood_function: bool,
86
87    #[clap(flatten)]
88    pub centralities: Centralities,
89
90    #[clap(short = 'm', long, default_value_t = 14)]
91    /// The base-2 logarithm of the number of registers for the HyperLogLog
92    /// cardinality estimators.
93    pub log2m: usize,
94
95    #[clap(long, default_value_t = usize::MAX)]
96    /// Maximum number of iterations to run.
97    pub upper_bound: usize,
98
99    #[clap(long)]
100    /// A value that will be used to stop the computation by relative increment
101    /// if the neighborhood function is being computed. Otherwise, the
102    /// computation will stop all estimators do not change their values.
103    pub threshold: Option<f64>,
104
105    #[clap(flatten)]
106    pub num_threads: NumThreadsArg,
107
108    #[clap(flatten)]
109    pub granularity: GranularityArgs,
110
111    #[clap(long, default_value_t = 0)]
112    /// The seed of the pseudorandom number generator used for initialization.
113    pub seed: u64,
114}
115
116pub fn main(global_args: GlobalArgs, args: CliArgs) -> Result<()> {
117    ensure!(
118        !args.symm || args.transposed.is_none(),
119        "If the graph is symmetric, you should not pass the transpose."
120    );
121
122    match get_endianness(&args.basename)?.as_str() {
123        #[cfg(feature = "be_bins")]
124        BE::NAME => hyperball::<BE>(global_args, args),
125        #[cfg(feature = "le_bins")]
126        LE::NAME => hyperball::<LE>(global_args, args),
127        e => panic!("Unknown endianness: {}", e),
128    }
129}
130
131pub fn hyperball<E: Endianness>(global_args: GlobalArgs, args: CliArgs) -> Result<()> {
132    let mut pl = concurrent_progress_logger![];
133    if let Some(log_interval) = global_args.log_interval {
134        pl.log_interval(log_interval);
135    }
136    let thread_pool = get_thread_pool(args.num_threads.num_threads);
137
138    let graph = BvGraph::with_basename(&args.basename).load()?;
139
140    log::info!("Loading DCF...");
141    if !args.basename.with_extension(DEG_CUMUL_EXTENSION).exists() {
142        log::error!(
143            "Missing DCF file. Please run `webgraph build dcf {}`.",
144            args.basename.display()
145        );
146    }
147    let deg_cumul = DCF::mmap(
148        args.basename.with_extension(DEG_CUMUL_EXTENSION),
149        Flags::RANDOM_ACCESS,
150    )?;
151
152    log::info!("Loading Transposed graph...");
153    let mut transposed = None;
154    if let Some(transposed_path) = args.transposed.as_ref() {
155        transposed = Some(BvGraph::with_basename(transposed_path).load()?);
156    }
157    let mut transposed_ref = transposed.as_ref();
158    if args.symm {
159        transposed_ref = Some(&graph);
160    }
161
162    let mut hb = HyperBallBuilder::with_hyper_log_log(
163        &graph,
164        transposed_ref,
165        deg_cumul.as_ref(),
166        args.log2m,
167        None,
168    )?
169    .granularity(args.granularity.into_granularity())
170    .sum_of_distances(args.centralities.should_compute_sum_of_distances())
171    .sum_of_inverse_distances(args.centralities.should_compute_sum_of_inverse_distances())
172    .build(&mut pl);
173
174    log::info!("Starting Hyperball...");
175    let rng = rand::rngs::SmallRng::seed_from_u64(args.seed);
176    hb.run(args.upper_bound, args.threshold, &thread_pool, rng, &mut pl)?;
177
178    log::info!("Storing the results...");
179
180    /// here we use a macro to avoid duplicating the code, it can't be a function
181    /// because different centralities have different return types
182    macro_rules! store_centrality {
183        ($flag:ident, $method:ident, $description:expr) => {{
184            if let Some(path) = args.centralities.$flag {
185                log::info!("Saving {} to {}", $description, path.display());
186                let value = hb.$method()?;
187                args.centralities
188                    .fmt
189                    .store(path, &value, args.centralities.precision)?;
190            }
191        }};
192    }
193
194    store_centrality!(sum_of_distances, sum_of_distances, "sum of distances");
195    store_centrality!(harmonic, harmonic_centralities, "harmonic centralities");
196    store_centrality!(closeness, closeness_centrality, "closeness centralities");
197    store_centrality!(reachable_nodes, reachable_nodes, "reachable nodes");
198    store_centrality!(
199        neighborhood_function,
200        neighborhood_function,
201        "neighborhood function"
202    );
203
204    Ok(())
205}