1use crate::{GlobalArgs, GranularityArgs, NumThreadsArg};
9use anyhow::Result;
10use clap::Parser;
11use dsi_bitstream::{dispatch::factory::CodesReaderFactoryHelper, prelude::*};
12use dsi_progress_logger::prelude::*;
13use std::path::PathBuf;
14use webgraph::prelude::*;
15
16#[derive(Parser, Debug)]
17#[command(name = "codes", about = "Reads a graph and suggests the best codes to use.", long_about = None)]
18pub struct CliArgs {
19 pub src: PathBuf,
21
22 #[clap(flatten)]
23 pub num_threads: NumThreadsArg,
24
25 #[clap(flatten)]
26 pub granularity: GranularityArgs,
27
28 #[clap(short = 'k', long, default_value_t = 3)]
29 pub top_k: usize,
32}
33
34pub fn main(global_args: GlobalArgs, args: CliArgs) -> Result<()> {
35 match get_endianness(&args.src)?.as_str() {
36 #[cfg(feature = "be_bins")]
37 BE::NAME => optimize_codes::<BE>(global_args, args),
38 #[cfg(feature = "le_bins")]
39 LE::NAME => optimize_codes::<LE>(global_args, args),
40 e => panic!("Unknown endianness: {}", e),
41 }
42}
43
44pub struct Chunks {
48 total: core::ops::Range<usize>,
49 chunk_size: usize,
50}
51
52impl Chunks {
53 pub fn new(total: core::ops::Range<usize>, chunk_size: usize) -> Self {
54 Self { total, chunk_size }
55 }
56}
57
58impl Iterator for Chunks {
59 type Item = core::ops::Range<usize>;
60
61 fn next(&mut self) -> Option<Self::Item> {
62 if self.total.start < self.total.end {
63 let end = (self.total.start + self.chunk_size).min(self.total.end);
64 let range = self.total.start..end;
65 self.total.start = end;
66 Some(range)
67 } else {
68 None
69 }
70 }
71}
72
73pub fn optimize_codes<E: Endianness>(global_args: GlobalArgs, args: CliArgs) -> Result<()>
74where
75 MmapHelper<u32>: CodesReaderFactoryHelper<E>,
76 for<'a> LoadModeCodesReader<'a, E, Mmap>: BitSeek,
77{
78 let mut stats = Default::default();
79 let has_ef = std::fs::metadata(args.src.with_extension("ef")).is_ok_and(|x| x.is_file());
80
81 if has_ef {
82 log::info!(
83 "Analyzing codes in parallel using {} threads",
84 args.num_threads.num_threads
85 );
86 let graph = BvGraph::with_basename(&args.src).endianness::<E>().load()?;
87
88 let mut pl = concurrent_progress_logger![item_name = "node"];
89 pl.display_memory(true)
90 .expected_updates(Some(graph.num_nodes()));
91 pl.start("Scanning...");
92
93 if let Some(duration) = global_args.log_interval {
94 pl.log_interval(duration);
95 }
96
97 let thread_pool = rayon::ThreadPoolBuilder::new()
98 .num_threads(args.num_threads.num_threads)
99 .build()?;
100
101 let node_granularity = args
102 .granularity
103 .into_granularity()
104 .node_granularity(graph.num_nodes(), Some(graph.num_arcs()));
105
106 stats = Chunks::new(0..graph.num_nodes(), node_granularity).par_map_fold_with(
109 pl.clone(),
110 |pl, range| {
111 let mut iter = graph
112 .offset_deg_iter_from(range.start)
113 .map_decoder(|d| StatsDecoder::new(d, Default::default()));
114
115 for _ in (&mut iter).take(range.len()) {
116 pl.light_update();
117 }
118
119 let mut stats = Default::default();
120 iter.map_decoder(|d| {
121 stats = d.stats;
122 d.codes_reader });
124 stats
125 },
126 |mut acc1, acc2| {
127 acc1 += &acc2;
128 acc1
129 },
130 &thread_pool,
131 );
132
133 pl.done();
134 } else {
135 if args.num_threads.num_threads != 1 {
136 log::info!(
137 "Analyzing codes sequentially, this might be faster if you build the Elias-Fano index using `webgraph build ef {}` which will generate file {}",
138 args.src.display(),
139 args.src.with_extension("ef").display()
140 );
141 }
142
143 let graph = BvGraphSeq::with_basename(args.src)
144 .endianness::<E>()
145 .load()?;
146
147 let mut pl = ProgressLogger::default();
148 pl.display_memory(true)
149 .item_name("node")
150 .expected_updates(Some(graph.num_nodes()));
151
152 pl.start("Scanning...");
153
154 let mut iter = graph
156 .offset_deg_iter()
157 .map_decoder(|d| StatsDecoder::new(d, Default::default()));
158 for _ in iter.by_ref() {
160 pl.light_update();
161 }
162 pl.done();
163 iter.map_decoder(|d| {
165 stats = d.stats;
166 d.codes_reader });
168 }
169
170 macro_rules! impl_best_code {
171 ($new_bits:expr, $old_bits:expr, $stats:expr, $($code:ident - $old:expr),*) => {
172 println!("{:>17} {:>20} {:>12} {:>10} {:>10} {:>16}",
173 "Type", "Code", "Improvement", "Weight", "Bytes", "Bits",
174 );
175 $(
176 let (_, new) = $stats.$code.best_code();
177 $new_bits += new;
178 $old_bits += $old;
179 )*
180
181 $(
182 let codes = $stats.$code.get_codes();
183 let (best_code, best_size) = codes[0];
184
185 let improvement = 100.0 * ($old - best_size) as f64 / $old as f64;
186 let weight = 100.0 * ($old as f64 - best_size as f64) / ($old_bits as f64 - $new_bits as f64);
187
188 println!("{:>17} {:>20} {:>12.3}% {:>9.3}% {:>10} {:>16}",
189 stringify!($code),
190 format!("{:?}", best_code),
191 improvement,
192 weight,
193 normalize(best_size as f64 / 8.0),
194 best_size,
195 );
196 for i in 1..args.top_k.min(codes.len()).max(1) {
197 let (code, size) = codes[i];
198 let improvement = 100.0 * ($old as f64 - size as f64) / $old as f64;
199 println!("{:>17} {:>20} {:>12.3}% {:>10.3} {:>10} {:>16}",
200 stringify!($code),
201 format!("{:?}", code),
202 improvement,
203 "",
204 normalize(size as f64 / 8.0),
205 size,
206 );
207 }
208 print!("\n");
209 )*
210 };
211 }
212
213 let mut new_bits = 0;
214 let mut old_bits = 0;
215 impl_best_code!(
216 new_bits,
217 old_bits,
218 stats,
219 outdegrees - stats.outdegrees.gamma,
220 reference_offsets - stats.reference_offsets.unary,
221 block_counts - stats.block_counts.gamma,
222 blocks - stats.blocks.gamma,
223 interval_counts - stats.interval_counts.gamma,
224 interval_starts - stats.interval_starts.gamma,
225 interval_lens - stats.interval_lens.gamma,
226 first_residuals - stats.first_residuals.zeta[2],
227 residuals - stats.residuals.zeta[2]
228 );
229
230 println!();
231 println!(" Old bit size: {:>16}", old_bits);
232 println!(" New bit size: {:>16}", new_bits);
233 println!(" Saved bits: {:>16}", old_bits - new_bits);
234
235 println!("Old byte size: {:>16}", normalize(old_bits as f64 / 8.0));
236 println!("New byte size: {:>16}", normalize(new_bits as f64 / 8.0));
237 println!(
238 " Saved bytes: {:>16}",
239 normalize((old_bits - new_bits) as f64 / 8.0)
240 );
241
242 println!(
243 " Improvement: {:>15.3}%",
244 100.0 * (old_bits - new_bits) as f64 / old_bits as f64
245 );
246 Ok(())
247}
248
249fn normalize(mut value: f64) -> String {
250 let mut uom = ' ';
251 if value > 1000.0 {
252 value /= 1000.0;
253 uom = 'K';
254 }
255 if value > 1000.0 {
256 value /= 1000.0;
257 uom = 'M';
258 }
259 if value > 1000.0 {
260 value /= 1000.0;
261 uom = 'G';
262 }
263 if value > 1000.0 {
264 value /= 1000.0;
265 uom = 'T';
266 }
267 format!("{:.3}{}", value, uom)
268}