1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
/*
* Copyright (C) 2023-2026 The Software Heritage developers
* See the AUTHORS file at the top-level directory of this distribution
* License: GNU General Public License version 3, or any later version
* See top-level LICENSE file for more information
*/
use std::path::PathBuf;
use anyhow::{ensure, Context, Result};
use clap::{Parser, Subcommand};
use dsi_bitstream::prelude::BE;
use swh_graph::utils::suffix_path;
// Top-level command line of this binary: webgraph's global options plus one
// of the index-building subcommands declared in `Commands`.
//
// Plain `//` comments are used here on purpose: `///` doc comments on
// clap-derived items are turned into help text, and the help of this command is
// already set explicitly through `about`.
#[derive(Parser, Debug)]
#[command(about = "Commands to (re)generate `.ef` and `.offsets` files, allowing random access to BVGraph", long_about = None)]
struct Args {
// Options of `webgraph_cli::GlobalArgs`, flattened into this parser; `main`
// forwards them unchanged to whichever webgraph builder is invoked.
#[clap(flatten)]
webgraph_args: webgraph_cli::GlobalArgs,
// The subcommand selected by the user; `main` dispatches on it.
#[command(subcommand)]
command: Commands,
}
// Subcommands of this binary, one per kind of index file; `main` dispatches on
// the selected variant. The `///` comments below are user-facing: clap turns
// them into the `--help` text of each subcommand and argument.
#[derive(Subcommand, Debug)]
enum Commands {
    /// Reads a graph file linearly and produces a .offsets file which can be used
    /// by the Java backend to randomly access the graph.
    Offsets {
        /// Base path of the graph (without file extension)
        graph: PathBuf,
    },
    /// Reads either a graph file linearly or a .offsets file (generated and used
    /// by the Java backend to randomly access the graph), and produces a .ef file
    /// suitable to randomly access the graph from the Rust backend.
    ///
    /// Only suitable for unlabeled graphs.
    Ef {
        /// Base path of the graph (without file extension)
        base_path: PathBuf,
    },
    /// Reads a .labeloffsets file (generated and used by the Java backend to
    /// randomly access the graph), and produces a .ef file suitable to randomly
    /// access the graph from the Rust backend.
    ///
    /// Fails if the .labeloffsets file is missing.
    ///
    /// Only suitable for labeled graphs.
    LabelsEf {
        /// Base path of the labeled graph; the .labeloffsets suffix is appended to it
        base_path: PathBuf,
        /// The number of nodes in the graph
        num_nodes: usize,
    },
    /// Reads a graph file linearly, and produces a degree-cumulative function
    /// encoded as an Elias-Fano sequence in a .dcf file,
    /// suitable to distribute load while working on the graph.
    ///
    /// Only suitable for unlabeled graphs.
    Dcf {
        /// Base path of the graph (without file extension)
        base_path: PathBuf,
    },
    /// Reads the lengths of the full names and builds the corresponding Elias-Fano
    /// offsets file.
    FullnamesEf {
        /// Number of full names, i.e. number of lengths to read from LENGTHS_PATH
        #[arg(long)]
        num_persons: usize,
        /// File containing the full names; only its size is read, and used as
        /// the upper bound of the offsets
        fullnames_path: PathBuf,
        /// File containing the length of each full name, as a big-endian stream
        /// of gamma codes
        lengths_path: PathBuf,
        /// Where to write the Elias-Fano offsets file
        ef_path: PathBuf,
    },
}
/// Entry point: parses the command line, initializes logging (with `info` as the
/// default filter when the environment does not configure one), then runs the
/// selected index builder.
///
/// # Errors
///
/// Propagates any error returned by the selected builder. Errors raised by the
/// I/O performed in this file carry the path of the offending file as context.
pub fn main() -> Result<()> {
    let args = Args::parse();

    env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();

    match args.command {
        Commands::Offsets { graph } => {
            use webgraph_cli::build::offsets::{build_offsets, CliArgs};

            build_offsets::<BE>(args.webgraph_args, CliArgs { basename: graph })?;
        }
        Commands::Ef { base_path } => {
            use webgraph_cli::build::ef::{build_elias_fano, CliArgs};

            build_elias_fano::<BE>(
                args.webgraph_args,
                CliArgs {
                    basename: base_path,
                    number_of_nodes: None,
                },
            )?;
        }
        Commands::LabelsEf {
            base_path,
            num_nodes,
        } => {
            use webgraph_cli::build::ef::{build_elias_fano, CliArgs};

            // webgraph shows a very obscure error when the `.labeloffsets` file is
            // missing (failed `.unwrap()` when reading the `nodes=` property of the
            // `.properties` file), so check for it here and report a clear error
            // instead.
            let offsets_path = suffix_path(&base_path, ".labeloffsets");
            ensure!(
                offsets_path.exists(),
                "{} is missing",
                offsets_path.display()
            );

            build_elias_fano::<BE>(
                args.webgraph_args,
                CliArgs {
                    basename: base_path,
                    number_of_nodes: Some(num_nodes),
                },
            )?;
        }
        Commands::Dcf { base_path } => {
            use webgraph_cli::build::dcf::{build_dcf, CliArgs};

            build_dcf::<BE>(
                args.webgraph_args,
                CliArgs {
                    basename: base_path,
                },
            )?;
        }
        Commands::FullnamesEf {
            num_persons,
            fullnames_path,
            lengths_path,
            ef_path,
        } => {
            build_fullnames_ef(num_persons, &fullnames_path, &lengths_path, &ef_path)?;
        }
    }

    Ok(())
}

/// Builds the Elias-Fano sequence of full name offsets and writes it to `ef_path`.
///
/// `lengths_path` is read as a big-endian bit stream of `num_persons` gamma-coded
/// lengths. The stored sequence is made of their prefix sums, starting with 0, so
/// it holds `num_persons + 1` offsets. The size in bytes of `fullnames_path` is
/// the upper bound given to the Elias-Fano builder.
///
/// # Errors
///
/// Fails if a file cannot be opened, inspected or created, if a length cannot be
/// read or does not fit in `usize`, or if the sum of the lengths overflows or
/// exceeds the size of `fullnames_path`.
fn build_fullnames_ef(
    num_persons: usize,
    fullnames_path: &std::path::Path,
    lengths_path: &std::path::Path,
    ef_path: &std::path::Path,
) -> Result<()> {
    use dsi_bitstream::prelude::*;
    use epserde::ser::Serialize;
    use std::{fs::File, io::BufReader};
    use sux::dict::EliasFanoBuilder;

    let lengths_file = File::open(lengths_path)
        .with_context(|| format!("Could not open {}", lengths_path.display()))?;
    let mut lengths_reader = <BufBitReader<BE, _>>::new(<WordAdapter<u64, _>>::new(
        BufReader::with_capacity(1 << 20, lengths_file),
    ));

    // No offset can point past the end of the file holding the full names.
    let max_offset = std::fs::metadata(fullnames_path)
        .with_context(|| format!("Could not stat {}", fullnames_path.display()))?
        .len();
    let max_offset = usize::try_from(max_offset).context("offset overflowed usize")?;

    // One more offset than there are names: the leading 0 plus one end offset per name.
    let num_offsets = num_persons
        .checked_add(1)
        .context("Number of persons overflowed usize")?;
    let mut ef_builder = EliasFanoBuilder::new(num_offsets, max_offset);

    let mut offset = 0usize;
    ef_builder.push(offset);
    for _ in 0..num_persons {
        let delta = lengths_reader
            .read_gamma()
            .context("Could not read gamma")?;
        // Checked conversion: an `as` cast would silently truncate the length on
        // targets where `usize` is narrower than 64 bits.
        let delta = usize::try_from(delta).with_context(|| {
            format!(
                "Length {} read from {} overflowed usize",
                delta,
                lengths_path.display()
            )
        })?;
        offset = offset.checked_add(delta).with_context(|| {
            format!(
                "Sum of lengths in {} overflowed usize",
                lengths_path.display()
            )
        })?;
        ensure!(
            offset <= max_offset,
            "Sum of sizes in {} is greater than the size of {}",
            lengths_path.display(),
            fullnames_path.display(),
        );
        ef_builder.push(offset);
    }
    let ef_offsets = ef_builder.build_with_seq();

    log::info!("Writing Elias-Fano file for full names offsets...");
    let mut ef_file = File::create(ef_path)
        .with_context(|| format!("Could not create {}", ef_path.display()))?;
    // SAFETY: this might leak some internal memory into the file, but we only ship
    // this .ef alongside the data this process has access to.
    unsafe { ef_offsets.serialize(&mut ef_file) }
        .context("Could not write full names offsets elias-fano file")?;

    Ok(())
}