webgraph_cli/build/
ef.rs

1/*
2 * SPDX-FileCopyrightText: 2023 Inria
3 * SPDX-FileCopyrightText: 2023 Tommaso Fontana
4 *
5 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
6 */
7
8use crate::GlobalArgs;
9use anyhow::{Context, Result};
10use clap::Parser;
11use dsi_bitstream::dispatch::factory::CodesReaderFactoryHelper;
12use dsi_bitstream::prelude::*;
13use dsi_progress_logger::prelude::*;
14use epserde::prelude::*;
15use log::info;
16use mmap_rs::MmapFlags;
17use std::fs::File;
18use std::io::{BufReader, BufWriter, Seek};
19use std::path::PathBuf;
20use sux::prelude::*;
21use webgraph::prelude::*;
22
23#[derive(Parser, Debug, Clone)]
24#[command(name = "ef", about = "Builds the Elias-Fano representation of the offsets of a graph.", long_about = None)]
25pub struct CliArgs {
26    /// The basename of the graph (or labels).
27    pub src: PathBuf,
28    /// The number of nodes of the graph. When passed, we don't need to load the
29    /// `.properties` file. This allows to build Elias-Fano from the offsets of
30    /// something that might not be a graph but that has offsets, like labels.
31    /// For this reason, if passed, we will also try to read the `.labeloffsets`
32    /// file and then fallback to the usual `.offsets` file.
33    pub number_of_nodes: Option<usize>,
34}
35
36pub fn main(global_args: GlobalArgs, args: CliArgs) -> Result<()> {
37    let ef_path = args.src.with_extension(EF_EXTENSION);
38    // check that ef_path is writable, this is the only portable way I found
39    // to check that the file is writable.
40    if ef_path.exists() && ef_path.metadata()?.permissions().readonly() {
41        return Err(anyhow::anyhow!(
42            "The file is not writable: {}",
43            ef_path.display()
44        ));
45    }
46
47    match get_endianness(&args.src)?.as_str() {
48        #[cfg(feature = "be_bins")]
49        BE::NAME => build_eliasfano::<BE>(global_args, args),
50        #[cfg(feature = "le_bins")]
51        LE::NAME => build_eliasfano::<LE>(global_args, args),
52        e => panic!("Unknown endianness: {}", e),
53    }
54}
55
56pub fn build_eliasfano<E: Endianness + 'static>(
57    global_args: GlobalArgs,
58    args: CliArgs,
59) -> Result<()>
60where
61    for<'a> BufBitReader<E, MemWordReader<u32, &'a [u32]>>: CodesRead<E> + BitSeek,
62{
63    let mut pl = ProgressLogger::default();
64    pl.display_memory(true).item_name("offset");
65    if let Some(duration) = &global_args.log_interval {
66        pl.log_interval(*duration);
67    }
68
69    let basename = args.src.clone();
70    if let Some(num_nodes) = args.number_of_nodes {
71        pl.expected_updates(Some(num_nodes));
72        // Horribly temporary duplicated code for the case of label offsets.
73        let of_file_path = basename.with_extension(LABELOFFSETS_EXTENSION);
74        if of_file_path.exists() {
75            let labels_path = basename.with_extension(LABELS_EXTENSION);
76            let mut file = File::open(&labels_path)
77                .with_context(|| format!("Could not open {}", labels_path.display()))?;
78            let file_len = 8 * file
79                .seek(std::io::SeekFrom::End(0))
80                .with_context(|| format!("Could not seek to end of {}", labels_path.display()))?;
81
82            let mut efb = EliasFanoBuilder::new(num_nodes + 1, file_len as usize);
83
84            info!("The offsets file exists, reading it to build Elias-Fano");
85
86            let of = <MmapHelper<u32>>::mmap(of_file_path, MmapFlags::SEQUENTIAL)?;
87            build_eliasfano_from_offsets(
88                &global_args,
89                &args,
90                num_nodes,
91                of.new_reader(),
92                &mut pl,
93                &mut efb,
94            )?;
95            return serialize_eliasfano(&global_args, &args, efb, &mut pl);
96        }
97    }
98
99    // Creates the offsets file
100    let of_file_path = basename.with_extension(OFFSETS_EXTENSION);
101
102    let graph_path = basename.with_extension(GRAPH_EXTENSION);
103    info!("Getting size of graph at '{}'", graph_path.display());
104    let mut file = File::open(&graph_path)
105        .with_context(|| format!("Could not open {}", graph_path.display()))?;
106    let file_len = 8 * file
107        .seek(std::io::SeekFrom::End(0))
108        .with_context(|| format!("Could not seek in {}", graph_path.display()))?;
109    info!("Graph file size: {} bits", file_len);
110
111    // if the num_of_nodes is not present, read it from the properties file
112    // otherwise use the provided value, this is so we can build the Elias-Fano
113    // for offsets of any custom format that might not use the standard
114    // properties file
115    let num_nodes = args
116        .number_of_nodes
117        .map(Ok::<_, anyhow::Error>)
118        .unwrap_or_else(|| {
119            let properties_path = basename.with_extension(PROPERTIES_EXTENSION);
120            info!(
121                "Reading num_of_nodes from properties file at '{}'",
122                properties_path.display()
123            );
124            let f = File::open(&properties_path).with_context(|| {
125                format!(
126                    "Could not open properties file: {}",
127                    properties_path.display()
128                )
129            })?;
130            let map = java_properties::read(BufReader::new(f))?;
131            Ok(map.get("nodes").unwrap().parse::<usize>()?)
132        })?;
133    pl.expected_updates(Some(num_nodes));
134
135    let mut efb = EliasFanoBuilder::new(num_nodes + 1, file_len as usize);
136
137    info!("Checking if offsets exists at '{}'", of_file_path.display());
138    // if the offset files exists, read it to build elias-fano
139    if of_file_path.exists() {
140        info!("The offsets file exists, reading it to build Elias-Fano");
141        let of = <MmapHelper<u32>>::mmap(of_file_path, MmapFlags::SEQUENTIAL)?;
142        build_eliasfano_from_offsets(
143            &global_args,
144            &args,
145            num_nodes,
146            of.new_reader(),
147            &mut pl,
148            &mut efb,
149        )?;
150    } else {
151        build_eliasfano_from_graph(&args, &mut pl, &mut efb)?;
152    }
153
154    serialize_eliasfano(&global_args, &args, efb, &mut pl)
155}
156
157pub fn build_eliasfano_from_graph(
158    args: &CliArgs,
159    pl: &mut impl ProgressLog,
160    efb: &mut EliasFanoBuilder,
161) -> Result<()> {
162    info!("The offsets file does not exists, reading the graph to build Elias-Fano");
163    match get_endianness(&args.src)?.as_str() {
164        #[cfg(feature = "be_bins")]
165        BE::NAME => build_eliasfano_from_graph_with_endianness::<BE>(args, pl, efb),
166        #[cfg(feature = "le_bins")]
167        LE::NAME => build_eliasfano_from_graph_with_endianness::<LE>(args, pl, efb),
168        e => panic!("Unknown endianness: {}", e),
169    }
170}
171
172pub fn build_eliasfano_from_offsets<E: Endianness>(
173    _global_args: &GlobalArgs,
174    _args: &CliArgs,
175    num_nodes: usize,
176    mut reader: impl GammaRead<E>,
177    pl: &mut impl ProgressLog,
178    efb: &mut EliasFanoBuilder,
179) -> Result<()> {
180    info!("Building Elias-Fano from offsets...");
181
182    // progress bar
183    pl.start("Translating offsets to EliasFano...");
184    // read the graph a write the offsets
185    let mut offset = 0;
186    for _node_id in 0..num_nodes + 1 {
187        // write where
188        offset += reader.read_gamma().context("Could not read gamma")?;
189        efb.push(offset as _);
190        // decode the next nodes so we know where the next node_id starts
191        pl.light_update();
192    }
193    pl.done();
194    Ok(())
195}
196
197pub fn build_eliasfano_from_graph_with_endianness<E: Endianness>(
198    args: &CliArgs,
199    pl: &mut impl ProgressLog,
200    efb: &mut EliasFanoBuilder,
201) -> Result<()>
202where
203    MmapHelper<u32>: CodesReaderFactoryHelper<E>,
204    for<'a> LoadModeCodesReader<'a, E, Mmap>: BitSeek,
205{
206    let seq_graph = webgraph::graphs::bvgraph::sequential::BvGraphSeq::with_basename(&args.src)
207        .endianness::<E>()
208        .load()
209        .with_context(|| format!("Could not load graph at {}", args.src.display()))?;
210    // otherwise directly read the graph
211    // progress bar
212    pl.start("Building EliasFano...");
213    // read the graph a write the offsets
214    let mut iter = seq_graph.offset_deg_iter();
215    for (new_offset, _degree) in iter.by_ref() {
216        // write where
217        efb.push(new_offset as _);
218        // decode the next nodes so we know where the next node_id starts
219        pl.light_update();
220    }
221    efb.push(iter.get_pos() as _);
222    Ok(())
223}
224
225pub fn serialize_eliasfano(
226    global_args: &GlobalArgs,
227    args: &CliArgs,
228    efb: EliasFanoBuilder,
229    pl: &mut impl ProgressLog,
230) -> Result<()> {
231    let ef = efb.build();
232    pl.done();
233
234    let mut pl = ProgressLogger::default();
235    pl.display_memory(true);
236    if let Some(duration) = &global_args.log_interval {
237        pl.log_interval(*duration);
238    }
239    pl.start("Building the Index over the ones in the high-bits...");
240    let ef: EF = unsafe { ef.map_high_bits(SelectAdaptConst::<_, _, 12, 4>::new) };
241    pl.done();
242
243    let mut pl = ProgressLogger::default();
244    pl.display_memory(true);
245    if let Some(duration) = &global_args.log_interval {
246        pl.log_interval(*duration);
247    }
248    pl.start("Writing to disk...");
249
250    let ef_path = args.src.with_extension(EF_EXTENSION);
251    info!("Creating Elias-Fano at '{}'", ef_path.display());
252    let mut ef_file = BufWriter::new(
253        File::create(&ef_path)
254            .with_context(|| format!("Could not create {}", ef_path.display()))?,
255    );
256
257    // serialize and dump the schema to disk
258    unsafe {
259        ef.serialize(&mut ef_file)
260            .with_context(|| format!("Could not serialize EliasFano to {}", ef_path.display()))
261    }?;
262
263    pl.done();
264    Ok(())
265}