webgraph_cli/analyze/codes.rs

/*
 * SPDX-FileCopyrightText: 2023 Inria
 * SPDX-FileCopyrightText: 2023 Tommaso Fontana
 *
 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
 */
7
8use crate::{GlobalArgs, GranularityArgs, NumThreadsArg};
9use anyhow::Result;
10use clap::Parser;
11use dsi_bitstream::{dispatch::factory::CodesReaderFactoryHelper, prelude::*};
12use dsi_progress_logger::prelude::*;
13use std::path::PathBuf;
14use webgraph::prelude::*;
15
// CLI arguments for `webgraph analyze codes`: the graph basename plus
// threading, granularity, and top-k options.
//
// NOTE: the `///` doc comments below are rendered by clap as `--help` text,
// so they are user-facing strings, not mere documentation.
#[derive(Parser, Debug)]
#[command(name = "codes", about = "Reads a graph and suggests the best codes to use.", long_about = None)]
pub struct CliArgs {
    /// The basename of the graph.
    pub src: PathBuf,

    // Shared thread-count flag (flattened from `NumThreadsArg`); controls the
    // size of the rayon pool used for the parallel scan.
    #[clap(flatten)]
    pub num_threads: NumThreadsArg,

    // Shared granularity flags (flattened from `GranularityArgs`); used to
    // size the per-thread chunks of nodes.
    #[clap(flatten)]
    pub granularity: GranularityArgs,

    #[clap(short = 'k', long, default_value_t = 3)]
    /// How many codes to show for each type, if k is bigger than the number of codes available
    /// all codes will be shown.
    pub top_k: usize,
}
33
34pub fn main(global_args: GlobalArgs, args: CliArgs) -> Result<()> {
35    match get_endianness(&args.src)?.as_str() {
36        #[cfg(feature = "be_bins")]
37        BE::NAME => optimize_codes::<BE>(global_args, args),
38        #[cfg(feature = "le_bins")]
39        LE::NAME => optimize_codes::<LE>(global_args, args),
40        e => panic!("Unknown endianness: {}", e),
41    }
42}
43
/// Returns ranges of nodes to process in parallel of size `chunk_size` each,
/// with the last chunk possibly being smaller.
/// The equivalent of `std::iter::Chunks` but with a `Range` instead of a `Slice`.
pub struct Chunks {
    // Remaining range still to be handed out; `start` advances as chunks are yielded.
    total: core::ops::Range<usize>,
    // Maximum length of each yielded chunk; nonzero (enforced in `new`).
    chunk_size: usize,
}

impl Chunks {
    /// Creates an iterator over chunks of `total` of at most `chunk_size`
    /// elements each.
    ///
    /// # Panics
    ///
    /// Panics if `chunk_size` is zero, which would otherwise make the
    /// iterator yield empty ranges forever without advancing.
    pub fn new(total: core::ops::Range<usize>, chunk_size: usize) -> Self {
        assert!(chunk_size > 0, "chunk_size must be nonzero");
        Self { total, chunk_size }
    }
}

impl Iterator for Chunks {
    type Item = core::ops::Range<usize>;

    fn next(&mut self) -> Option<Self::Item> {
        if self.total.is_empty() {
            return None;
        }
        // Clamp the chunk end to the end of the remaining range so the last
        // chunk may be shorter than `chunk_size`.
        let end = (self.total.start + self.chunk_size).min(self.total.end);
        let range = self.total.start..end;
        self.total.start = end;
        Some(range)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // Exact remaining count: ceil(remaining_len / chunk_size).
        // `chunk_size > 0` is guaranteed by `new`.
        let n = self.total.len().div_ceil(self.chunk_size);
        (n, Some(n))
    }
}
72
/// Scans the whole graph while wrapping the decoder in a [`StatsDecoder`],
/// then prints, for each compressed field, the best instantaneous codes found
/// and the space they would save over the codes currently in use.
///
/// If the Elias-Fano offsets file (`<basename>.ef`) is present, the scan is
/// performed in parallel over chunks of nodes; otherwise it falls back to a
/// sequential scan of the whole graph.
pub fn optimize_codes<E: Endianness>(global_args: GlobalArgs, args: CliArgs) -> Result<()>
where
    MmapHelper<u32>: CodesReaderFactoryHelper<E>,
    for<'a> LoadModeCodesReader<'a, E, Mmap>: BitSeek,
{
    // Aggregated decoding statistics; the concrete type is inferred from the
    // `stats = d.stats` assignments below.
    let mut stats = Default::default();
    // The .ef file provides the offsets index needed for random access,
    // and hence for the parallel scan.
    let has_ef = std::fs::metadata(args.src.with_extension("ef")).is_ok_and(|x| x.is_file());

    if has_ef {
        log::info!(
            "Analyzing codes in parallel using {} threads",
            args.num_threads.num_threads
        );
        let graph = BvGraph::with_basename(&args.src).endianness::<E>().load()?;

        let mut pl = concurrent_progress_logger![item_name = "node"];
        pl.display_memory(true)
            .expected_updates(Some(graph.num_nodes()));
        pl.start("Scanning...");

        if let Some(duration) = global_args.log_interval {
            pl.log_interval(duration);
        }

        let thread_pool = rayon::ThreadPoolBuilder::new()
            .num_threads(args.num_threads.num_threads)
            .build()?;

        // Number of nodes per parallel work unit, derived from the CLI
        // granularity flags and the graph size.
        let node_granularity = args
            .granularity
            .into_granularity()
            .node_granularity(graph.num_nodes(), Some(graph.num_arcs()));

        // TODO!: use FairChunks with the offsets EF to distribute the
        // work based on number of bits used, not nodes
        stats = Chunks::new(0..graph.num_nodes(), node_granularity).par_map_fold_with(
            pl.clone(),
            |pl, range| {
                // Each worker decodes its own range of nodes with a fresh
                // stats-collecting decoder, starting at the chunk's first node.
                let mut iter = graph
                    .offset_deg_iter_from(range.start)
                    .map_decoder(|d| StatsDecoder::new(d, Default::default()));

                // Drive the iterator over exactly the nodes of this chunk;
                // the statistics are gathered as a side effect of decoding.
                for _ in (&mut iter).take(range.len()) {
                    pl.light_update();
                }

                // Extract the statistics out of the decoder; `map_decoder` is
                // used here only to gain access to the wrapped decoder.
                let mut stats = Default::default();
                iter.map_decoder(|d| {
                    stats = d.stats;
                    d.codes_reader // not important but we need to return something
                });
                stats
            },
            // Fold step: merge the per-chunk statistics.
            |mut acc1, acc2| {
                acc1 += &acc2;
                acc1
            },
            &thread_pool,
        );

        pl.done();
    } else {
        if args.num_threads.num_threads != 1 {
            // The user asked for parallelism but without the .ef index only a
            // sequential scan is possible: point at the command that builds it.
            log::info!(
                "Analyzing codes sequentially, this might be faster if you build the Elias-Fano index using `webgraph build ef {}` which will generate file {}",
                args.src.display(),
                args.src.with_extension("ef").display()
            );
        }

        let graph = BvGraphSeq::with_basename(args.src)
            .endianness::<E>()
            .load()?;

        let mut pl = ProgressLogger::default();
        pl.display_memory(true)
            .item_name("node")
            .expected_updates(Some(graph.num_nodes()));

        pl.start("Scanning...");

        // add the stats wrapper to the decoder
        let mut iter = graph
            .offset_deg_iter()
            .map_decoder(|d| StatsDecoder::new(d, Default::default()));
        // iterate over the graph
        for _ in iter.by_ref() {
            pl.light_update();
        }
        pl.done();
        // extract the stats
        iter.map_decoder(|d| {
            stats = d.stats;
            d.codes_reader // not important but we need to return something
        });
    }

    // Prints the report table. The first repetition accumulates the total old
    // and new bit counts (needed for the `Weight` column); the second prints,
    // for every field, the best code and up to `top_k - 1` runner-ups.
    macro_rules! impl_best_code {
        ($new_bits:expr, $old_bits:expr, $stats:expr, $($code:ident - $old:expr),*) => {
            println!("{:>17} {:>20} {:>12} {:>10} {:>10} {:>16}",
                "Type", "Code", "Improvement", "Weight", "Bytes", "Bits",
            );
            // First pass: totals with the current codes vs. the best codes.
            $(
                let (_, new) = $stats.$code.best_code();
                $new_bits += new;
                $old_bits += $old;
            )*

            $(
                // Per-field candidate codes; codes[0] is taken as the best —
                // presumably sorted by increasing size. TODO confirm.
                let codes = $stats.$code.get_codes();
                let (best_code, best_size) = codes[0];

                let improvement = 100.0 * ($old - best_size) as f64 / $old as f64;
                // Share of the total saving attributable to this field.
                let weight = 100.0 * ($old as f64 - best_size as f64) / ($old_bits as f64 - $new_bits as f64);

                println!("{:>17} {:>20} {:>12.3}% {:>9.3}% {:>10} {:>16}",
                    stringify!($code),
                    format!("{:?}", best_code),
                    improvement,
                    weight,
                    normalize(best_size as f64 / 8.0),
                    best_size,
                );
                // Runner-up codes (weight column left empty).
                for i in 1..args.top_k.min(codes.len()).max(1) {
                    let (code, size) = codes[i];
                    let improvement = 100.0 * ($old as f64 - size as f64) / $old as f64;
                    println!("{:>17} {:>20} {:>12.3}% {:>10.3} {:>10} {:>16}",
                        stringify!($code),
                        format!("{:?}", code),
                        improvement,
                        "",
                        normalize(size as f64 / 8.0),
                        size,
                    );
                }
                print!("\n");
            )*
        };
    }

    let mut new_bits = 0;
    let mut old_bits = 0;
    // Each `field - expr` pair names a stats field and the bit count of the
    // code currently used for that field.
    impl_best_code!(
        new_bits,
        old_bits,
        stats,
        outdegrees - stats.outdegrees.gamma,
        reference_offsets - stats.reference_offsets.unary,
        block_counts - stats.block_counts.gamma,
        blocks - stats.blocks.gamma,
        interval_counts - stats.interval_counts.gamma,
        interval_starts - stats.interval_starts.gamma,
        interval_lens - stats.interval_lens.gamma,
        first_residuals - stats.first_residuals.zeta[2],
        residuals - stats.residuals.zeta[2]
    );

    println!();
    println!(" Old bit size: {:>16}", old_bits);
    println!(" New bit size: {:>16}", new_bits);
    // NOTE(review): assumes the best codes never use more bits than the
    // current ones; otherwise these unsigned subtractions would underflow.
    println!("   Saved bits: {:>16}", old_bits - new_bits);

    println!("Old byte size: {:>16}", normalize(old_bits as f64 / 8.0));
    println!("New byte size: {:>16}", normalize(new_bits as f64 / 8.0));
    println!(
        "  Saved bytes: {:>16}",
        normalize((old_bits - new_bits) as f64 / 8.0)
    );

    println!(
        "  Improvement: {:>15.3}%",
        100.0 * (old_bits - new_bits) as f64 / old_bits as f64
    );
    Ok(())
}
248
/// Formats `value` with a metric suffix, repeatedly dividing by 1000 and
/// stepping through `K`, `M`, `G`, `T` while the value exceeds 1000, then
/// rendering it with three decimal digits (a space stands for "no suffix").
fn normalize(mut value: f64) -> String {
    let mut uom = ' ';
    for unit in ['K', 'M', 'G', 'T'] {
        if value > 1000.0 {
            value /= 1000.0;
            uom = unit;
        } else {
            break;
        }
    }
    format!("{:.3}{}", value, uom)
}