webgraph_cli/transform/
simplify.rs

1/*
2 * SPDX-FileCopyrightText: 2023 Inria
3 * SPDX-FileCopyrightText: 2023 Tommaso Fontana
4 *
5 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
6 */
7
8use crate::*;
9use anyhow::Result;
10use dsi_bitstream::{dispatch::factory::CodesReaderFactoryHelper, prelude::*};
11use mmap_rs::MmapFlags;
12use std::path::PathBuf;
13use tempfile::Builder;
14use webgraph::graphs::union_graph::UnionGraph;
15use webgraph::prelude::*;
16
17#[derive(Parser, Debug)]
18#[command(name = "simplify", about = "Makes a BvGraph simple (undirected and loopless) by adding missing arcs and removing loops, optionally applying a permutation.", long_about = None)]
19pub struct CliArgs {
20    /// The basename of the graph.
21    pub src: PathBuf,
22    /// The basename of the simplified graph.
23    pub dst: PathBuf,
24
25    #[arg(long)]
26    /// The basename of a pre-computed transposed version of the source graph, which
27    /// will be use to speed up the simplification.
28    pub transposed: Option<PathBuf>,
29
30    #[clap(flatten)]
31    pub num_threads: NumThreadsArg,
32
33    #[clap(flatten)]
34    pub batch_size: BatchSizeArg,
35
36    #[clap(flatten)]
37    pub ca: CompressArgs,
38
39    #[arg(long)]
40    /// The path to an optional permutation in binary big-endian format to apply to the graph.
41    pub permutation: Option<PathBuf>,
42}
43
44pub fn main(global_args: GlobalArgs, args: CliArgs) -> Result<()> {
45    create_parent_dir(&args.dst)?;
46
47    match get_endianness(&args.src)?.as_str() {
48        #[cfg(feature = "be_bins")]
49        BE::NAME => simplify::<BE>(global_args, args),
50        #[cfg(feature = "le_bins")]
51        LE::NAME => simplify::<LE>(global_args, args),
52        e => panic!("Unknown endianness: {}", e),
53    }
54}
55
56fn no_ef_warn(basepath: impl AsRef<std::path::Path>) {
57    log::warn!(
58        "The .ef file was not found so the simplification will proceed sequentially. This may be slow. To speed it up, you can use `webgraph build ef {}` which would allow us create batches in parallel",
59        basepath.as_ref().display()
60    );
61}
62
63pub fn simplify<E: Endianness>(_global_args: GlobalArgs, args: CliArgs) -> Result<()>
64where
65    MmapHelper<u32>: CodesReaderFactoryHelper<E>,
66    for<'a> LoadModeCodesReader<'a, E, Mmap>: BitSeek + Clone + Send + Sync,
67{
68    // TODO!: speed it up by using random access graph if possible
69    let thread_pool = crate::get_thread_pool(args.num_threads.num_threads);
70
71    let target_endianness = args.ca.endianness.clone().unwrap_or_else(|| E::NAME.into());
72
73    let dir = Builder::new().prefix("transform_simplify_").tempdir()?;
74
75    match (args.permutation, args.transposed) {
76        // load the transposed graph and use it to directly compress the graph
77        // without doing any sorting
78        (None, Some(t_path)) => {
79            log::info!("Transposed graph provided, using it to simplify the graph");
80
81            let has_ef_graph =
82                std::fs::metadata(args.src.with_extension("ef")).is_ok_and(|x| x.is_file());
83            let has_ef_t_graph =
84                std::fs::metadata(t_path.with_extension("ef")).is_ok_and(|x| x.is_file());
85
86            match (has_ef_graph, has_ef_t_graph) {
87                (true, true) => {
88                    log::info!("Both .ef files found, using simplify split");
89
90                    let graph =
91                        webgraph::graphs::bvgraph::random_access::BvGraph::with_basename(&args.src)
92                            .endianness::<E>()
93                            .load()?;
94                    let num_nodes = graph.num_nodes();
95                    let graph_t =
96                        webgraph::graphs::bvgraph::random_access::BvGraph::with_basename(&t_path)
97                            .endianness::<E>()
98                            .load()?;
99
100                    if graph_t.num_nodes() != num_nodes {
101                        anyhow::bail!(
102                            "The number of nodes in the graph and its transpose do not match! {} != {}",
103                            num_nodes,
104                            graph_t.num_nodes()
105                        );
106                    }
107
108                    let sorted = NoSelfLoopsGraph(UnionGraph(graph, graph_t));
109
110                    BvComp::parallel_endianness(
111                        &args.dst,
112                        &sorted,
113                        num_nodes,
114                        args.ca.into(),
115                        &thread_pool,
116                        dir,
117                        &target_endianness,
118                    )?;
119
120                    return Ok(());
121                }
122                (true, false) => {
123                    no_ef_warn(&args.src);
124                }
125                (false, true) => {
126                    no_ef_warn(&t_path);
127                }
128                (false, false) => {
129                    no_ef_warn(&args.src);
130                    no_ef_warn(&t_path);
131                }
132            }
133
134            let seq_graph =
135                webgraph::graphs::bvgraph::sequential::BvGraphSeq::with_basename(&args.src)
136                    .endianness::<E>()
137                    .load()?;
138            let num_nodes = seq_graph.num_nodes();
139            let seq_graph_t =
140                webgraph::graphs::bvgraph::sequential::BvGraphSeq::with_basename(&t_path)
141                    .endianness::<E>()
142                    .load()?;
143
144            if seq_graph_t.num_nodes() != num_nodes {
145                anyhow::bail!(
146                    "The number of nodes in the graph and its transpose do not match! {} != {}",
147                    num_nodes,
148                    seq_graph_t.num_nodes()
149                );
150            }
151
152            let sorted = NoSelfLoopsGraph(UnionGraph(seq_graph, seq_graph_t));
153
154            BvComp::parallel_endianness(
155                &args.dst,
156                &sorted,
157                num_nodes,
158                args.ca.into(),
159                &thread_pool,
160                dir,
161                &target_endianness,
162            )?;
163        }
164        // apply the permutation, don't care if the transposed graph is already computed
165        // as we cannot really exploit it
166        (Some(perm_path), None | Some(_)) => {
167            log::info!("Permutation provided, applying it to the graph");
168
169            let perm = JavaPermutation::mmap(perm_path, MmapFlags::RANDOM_ACCESS)?;
170
171            // if the .ef file exists, we can use the simplify split
172            if std::fs::metadata(args.src.with_extension("ef")).is_ok_and(|x| x.is_file()) {
173                log::info!(".ef file found, using simplify split");
174                let graph =
175                    webgraph::graphs::bvgraph::random_access::BvGraph::with_basename(&args.src)
176                        .endianness::<E>()
177                        .load()?;
178
179                let perm_graph = PermutedGraph {
180                    graph: &graph,
181                    perm: &perm,
182                };
183
184                let sorted = webgraph::transform::simplify_split(
185                    &perm_graph,
186                    args.batch_size.batch_size,
187                    &thread_pool,
188                )?;
189
190                BvComp::parallel_endianness(
191                    &args.dst,
192                    &sorted,
193                    graph.num_nodes(),
194                    args.ca.into(),
195                    &thread_pool,
196                    dir,
197                    &target_endianness,
198                )?;
199
200                return Ok(());
201            }
202
203            no_ef_warn(&args.src);
204
205            let seq_graph =
206                webgraph::graphs::bvgraph::sequential::BvGraphSeq::with_basename(&args.src)
207                    .endianness::<E>()
208                    .load()?;
209
210            let perm_graph = PermutedGraph {
211                graph: &seq_graph,
212                perm: &perm,
213            };
214
215            // simplify the graph
216            let sorted =
217                webgraph::transform::simplify(&perm_graph, args.batch_size.batch_size).unwrap();
218
219            BvComp::parallel_endianness(
220                &args.dst,
221                &sorted,
222                sorted.num_nodes(),
223                args.ca.into(),
224                &thread_pool,
225                dir,
226                &target_endianness,
227            )?;
228        }
229        // just compute the transpose on the fly
230        (None, None) => {
231            log::info!(
232                "No permutation or transposed graph provided, computing the transpose on the fly"
233            );
234            // if the .ef file exists, we can use the simplify split
235            if std::fs::metadata(args.src.with_extension("ef")).is_ok_and(|x| x.is_file()) {
236                log::info!(".ef file found, using simplify split");
237
238                let graph =
239                    webgraph::graphs::bvgraph::random_access::BvGraph::with_basename(&args.src)
240                        .endianness::<E>()
241                        .load()?;
242
243                let sorted = webgraph::transform::simplify_split(
244                    &graph,
245                    args.batch_size.batch_size,
246                    &thread_pool,
247                )?;
248
249                BvComp::parallel_endianness(
250                    &args.dst,
251                    &sorted,
252                    graph.num_nodes(),
253                    args.ca.into(),
254                    &thread_pool,
255                    dir,
256                    &target_endianness,
257                )?;
258
259                return Ok(());
260            }
261
262            no_ef_warn(&args.src);
263
264            let seq_graph =
265                webgraph::graphs::bvgraph::sequential::BvGraphSeq::with_basename(&args.src)
266                    .endianness::<E>()
267                    .load()?;
268
269            let num_nodes = seq_graph.num_nodes();
270            // transpose the graph
271            let sorted =
272                webgraph::transform::simplify_sorted(seq_graph, args.batch_size.batch_size)
273                    .unwrap();
274
275            BvComp::parallel_endianness(
276                &args.dst,
277                &sorted,
278                num_nodes,
279                args.ca.into(),
280                &thread_pool,
281                dir,
282                &target_endianness,
283            )?;
284        }
285    }
286
287    Ok(())
288}