1use anyhow::{anyhow, bail, ensure, Context, Result};
15use clap::{Args, CommandFactory, Parser, Subcommand, ValueEnum};
16use common_traits::{ToBytes, UnsignedInt};
17use dsi_bitstream::dispatch::Codes;
18use epserde::ser::Serialize;
19use jiff::fmt::friendly::{Designator, Spacing, SpanPrinter};
20use jiff::SpanRound;
21use std::io::{BufWriter, Write};
22use std::path::{Path, PathBuf};
23use std::time::Duration;
24use std::time::SystemTime;
25use sux::bits::BitFieldVec;
26use sysinfo::System;
27use webgraph::prelude::CompFlags;
28use webgraph::utils::Granularity;
29
// Build-time guard: compilation is aborted with the message below unless at
// least one of the `le_bins` or `be_bins` Cargo features is enabled.
#[cfg(not(any(feature = "le_bins", feature = "be_bins")))]
compile_error!("At least one of the features `le_bins` or `be_bins` must be enabled.");
32
33pub mod build_info {
34 include!(concat!(env!("OUT_DIR"), "/built.rs"));
35
36 pub fn version_string() -> String {
37 format!(
38 "{}
39git info: {} {} {}
40build info: built on {} for {} with {}",
41 PKG_VERSION,
42 GIT_VERSION.unwrap_or(""),
43 GIT_COMMIT_HASH.unwrap_or(""),
44 match GIT_DIRTY {
45 None => "",
46 Some(true) => "(dirty)",
47 Some(false) => "(clean)",
48 },
49 BUILD_DATE,
50 TARGET,
51 RUSTC_VERSION
52 )
53 }
54}
55
// Codes that can be selected from the command line (through clap's
// `ValueEnum` derive).
//
// Each variant is turned into the corresponding `Codes` value by the
// `From<PrivCode> for Codes` implementation in this file; `ZetaN` maps to
// `Codes::Zeta { k: N }`.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
pub enum PrivCode {
    Unary,
    Gamma,
    Delta,
    Zeta1,
    Zeta2,
    Zeta3,
    Zeta4,
    Zeta5,
    Zeta6,
    Zeta7,
}
72
73impl From<PrivCode> for Codes {
74 fn from(value: PrivCode) -> Self {
75 match value {
76 PrivCode::Unary => Codes::Unary,
77 PrivCode::Gamma => Codes::Gamma,
78 PrivCode::Delta => Codes::Delta,
79 PrivCode::Zeta1 => Codes::Zeta { k: 1 },
80 PrivCode::Zeta2 => Codes::Zeta { k: 2 },
81 PrivCode::Zeta3 => Codes::Zeta { k: 3 },
82 PrivCode::Zeta4 => Codes::Zeta { k: 4 },
83 PrivCode::Zeta5 => Codes::Zeta { k: 5 },
84 PrivCode::Zeta6 => Codes::Zeta { k: 6 },
85 PrivCode::Zeta7 => Codes::Zeta { k: 7 },
86 }
87 }
88}
89
// Shared command-line options whose names suggest they describe a textual,
// column-based list of arcs.
// NOTE(review): how each option is interpreted is decided by the code that
// consumes this struct, which is not visible here — confirm before relying
// on the descriptions below.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text.
#[derive(Args, Debug)]
pub struct ArcsArgs {
    // Single character, `#` by default; presumably marks comment lines.
    #[arg(long, default_value_t = '#')]
    pub line_comment_symbol: char,

    // Defaults to 0; presumably the number of initial lines to ignore.
    #[arg(long, default_value_t = 0)]
    pub lines_to_skip: usize,

    // Optional; `None` when the flag is not given.
    #[arg(long)]
    pub max_arcs: Option<usize>,

    // Single character, a tab by default; presumably the column separator.
    #[arg(long, default_value_t = '\t')]
    pub separator: char,

    // Defaults to 0; presumably the index of the column holding the source.
    #[arg(long, default_value_t = 0)]
    pub source_column: usize,

    // Defaults to 1; presumably the index of the column holding the target.
    #[arg(long, default_value_t = 1)]
    pub target_column: usize,

    // Boolean switch, off by default.
    #[arg(long, default_value_t = false)]
    pub exact: bool,
}
122
123pub fn num_threads_parser(arg: &str) -> Result<usize> {
128 let num_threads = arg.parse::<usize>()?;
129 ensure!(num_threads > 0, "Number of threads must be greater than 0");
130 Ok(num_threads)
131}
132
// Shared command-line option selecting the number of threads.
#[derive(Args, Debug)]
pub struct NumThreadsArg {
    // `-j` / long flag. Validated by `num_threads_parser` (must be a
    // positive integer); the default is rayon's current number of threads,
    // clamped to at least 1.
    #[arg(short = 'j', long, default_value_t = rayon::current_num_threads().max(1), value_parser = num_threads_parser)]
    pub num_threads: usize,
}
140
// Shared command-line options selecting a granularity, expressed either in
// arcs or in nodes. The two options are declared as mutually conflicting, so
// at most one of them can be given; `into_granularity` turns them into a
// `Granularity` value.
#[derive(Args, Debug)]
pub struct GranularityArgs {
    // Granularity expressed as a number of arcs.
    #[arg(long, conflicts_with("node_granularity"))]
    pub arc_granularity: Option<u64>,

    // Granularity expressed as a number of nodes.
    #[arg(long, conflicts_with("arc_granularity"))]
    pub node_granularity: Option<usize>,
}
154
155impl GranularityArgs {
156 pub fn into_granularity(&self) -> Granularity {
157 match (self.arc_granularity, self.node_granularity) {
158 (Some(_), Some(_)) => unreachable!(),
159 (Some(arc_granularity), None) => Granularity::Arcs(arc_granularity),
160 (None, Some(node_granularity)) => Granularity::Nodes(node_granularity),
161 (None, None) => Granularity::default(),
162 }
163 }
164}
165
// Shared command-line option selecting a batch size.
#[derive(Args, Debug)]
pub struct BatchSizeArg {
    // `-b` / long flag, parsed by `batch_size`: a plain number, a number
    // followed by an SI or binary prefix symbol, or a percentage. A
    // percentage (such as the default `50%`) is resolved against the total
    // system memory and converted into a number of `(usize, usize)` pairs.
    #[clap(short = 'b', long, value_parser = batch_size, default_value = "50%")]
    pub batch_size: usize,
}
176
// Output formats for vectors of floating-point values; see
// `FloatVectorFormat::store` for how each one is written.
#[derive(Debug, Clone, Copy, ValueEnum)]
pub enum FloatVectorFormat {
    // Raw sequence of the big-endian bytes of each value.
    Java,
    // ε-serde serialization of the slice.
    Epserde,
    // One value per line, as text.
    Ascii,
    // A single JSON array.
    Json,
}
189
190impl FloatVectorFormat {
191 pub fn store<F>(
198 &self,
199 path: impl AsRef<Path>,
200 values: &[F],
201 precision: Option<usize>,
202 ) -> Result<()>
203 where
204 F: ToBytes + core::fmt::Display + epserde::ser::Serialize + Copy,
205 for<'a> &'a [F]: epserde::ser::Serialize,
206 {
207 let precision = precision.unwrap_or(f64::DIGITS as usize);
208 create_parent_dir(&path)?;
209 let path_display = path.as_ref().display();
210 let mut file = std::fs::File::create(&path)
211 .with_context(|| format!("Could not create vector at {}", path_display))?;
212
213 match self {
214 FloatVectorFormat::Epserde => {
215 log::info!("Storing in ε-serde format at {}", path_display);
216 values
217 .serialize(&mut file)
218 .with_context(|| format!("Could not write vector to {}", path_display))?;
219 }
220 FloatVectorFormat::Java => {
221 log::info!("Storing in Java format at {}", path_display);
222 for word in values.iter() {
223 file.write_all(word.to_be_bytes().as_ref())
224 .with_context(|| format!("Could not write vector to {}", path_display))?;
225 }
226 }
227 FloatVectorFormat::Ascii => {
228 log::info!("Storing in ASCII format at {}", path_display);
229 for word in values.iter() {
230 writeln!(file, "{word:.precision$}")
231 .with_context(|| format!("Could not write vector to {}", path_display))?;
232 }
233 }
234 FloatVectorFormat::Json => {
235 log::info!("Storing in JSON format at {}", path_display);
236 write!(file, "[")?;
237 for word in values.iter().take(values.len().saturating_sub(2)) {
238 write!(file, "{word:.precision$}, ")
239 .with_context(|| format!("Could not write vector to {}", path_display))?;
240 }
241 if let Some(last) = values.last() {
242 write!(file, "{last:.precision$}")
243 .with_context(|| format!("Could not write vector to {}", path_display))?;
244 }
245 write!(file, "]")?;
246 }
247 }
248
249 Ok(())
250 }
251}
252
// Output formats for vectors of integers; see `IntVectorFormat::store` for
// how each one is written.
#[derive(Debug, Clone, Copy, ValueEnum)]
pub enum IntVectorFormat {
    // Raw sequence of the big-endian bytes of each value.
    Java,
    // ε-serde serialization of the slice.
    Epserde,
    // A `BitFieldVec` whose bit width is derived from the maximum value.
    BitFieldVec,
    // One value per line, as text.
    Ascii,
    // A single JSON array.
    Json,
}
269
270impl IntVectorFormat {
271 pub fn store(&self, path: impl AsRef<Path>, data: &[u64], max: Option<u64>) -> Result<()> {
276 create_parent_dir(&path)?;
278
279 let mut file = std::fs::File::create(&path)
280 .with_context(|| format!("Could not create vector at {}", path.as_ref().display()))?;
281 let mut buf = BufWriter::new(&mut file);
282
283 debug_assert_eq!(
284 max,
285 max.map(|_| { data.iter().copied().max().unwrap_or(0) }),
286 "The wrong maximum value was provided for the vector"
287 );
288
289 match self {
290 IntVectorFormat::Epserde => {
291 log::info!("Storing in epserde format at {}", path.as_ref().display());
292 data.serialize(&mut buf).with_context(|| {
293 format!("Could not write vector to {}", path.as_ref().display())
294 })?;
295 }
296 IntVectorFormat::BitFieldVec => {
297 log::info!(
298 "Storing in BitFieldVec format at {}",
299 path.as_ref().display()
300 );
301 let max = max.unwrap_or_else(|| {
302 data.iter()
303 .copied()
304 .max()
305 .unwrap_or_else(|| panic!("Empty vector"))
306 });
307 let bit_width = max.len() as usize;
308 log::info!("Using {} bits per element", bit_width);
309 let mut bit_field_vec = <BitFieldVec<u64, _>>::with_capacity(bit_width, data.len());
310 bit_field_vec.extend(data.iter().copied());
311 bit_field_vec.store(&path).with_context(|| {
312 format!("Could not write vector to {}", path.as_ref().display())
313 })?;
314 }
315 IntVectorFormat::Java => {
316 log::info!("Storing in Java format at {}", path.as_ref().display());
317 for word in data.iter() {
318 buf.write_all(&word.to_be_bytes()).with_context(|| {
319 format!("Could not write vector to {}", path.as_ref().display())
320 })?;
321 }
322 }
323 IntVectorFormat::Ascii => {
324 log::info!("Storing in ASCII format at {}", path.as_ref().display());
325 for word in data.iter() {
326 writeln!(buf, "{}", word).with_context(|| {
327 format!("Could not write vector to {}", path.as_ref().display())
328 })?;
329 }
330 }
331 IntVectorFormat::Json => {
332 log::info!("Storing in JSON format at {}", path.as_ref().display());
333 write!(buf, "[")?;
334 for word in data.iter().take(data.len().saturating_sub(2)) {
335 write!(buf, "{}, ", word).with_context(|| {
336 format!("Could not write vector to {}", path.as_ref().display())
337 })?;
338 }
339 if let Some(last) = data.last() {
340 write!(buf, "{}", last).with_context(|| {
341 format!("Could not write vector to {}", path.as_ref().display())
342 })?;
343 }
344 write!(buf, "]")?;
345 }
346 };
347
348 Ok(())
349 }
350
351 #[cfg(target_pointer_width = "64")]
352 pub fn store_usizes(
359 &self,
360 path: impl AsRef<Path>,
361 data: &[usize],
362 max: Option<usize>,
363 ) -> Result<()> {
364 self.store(
365 path,
366 unsafe { core::mem::transmute::<&[usize], &[u64]>(data) },
367 max.map(|x| x as u64),
368 )
369 }
370}
371
372pub fn batch_size(arg: &str) -> anyhow::Result<usize> {
379 const PREF_SYMS: [(&str, u64); 10] = [
380 ("k", 1E3 as u64),
381 ("m", 1E6 as u64),
382 ("g", 1E9 as u64),
383 ("t", 1E12 as u64),
384 ("p", 1E15 as u64),
385 ("ki", 1 << 10),
386 ("mi", 1 << 20),
387 ("gi", 1 << 30),
388 ("ti", 1 << 40),
389 ("pi", 1 << 50),
390 ];
391 let arg = arg.trim().to_ascii_lowercase();
392 ensure!(!arg.is_empty(), "empty string");
393
394 if arg.ends_with('%') {
395 let perc = arg[..arg.len() - 1].parse::<f64>()?;
396 ensure!(perc >= 0.0 || perc <= 100.0, "percentage out of range");
397 let mut system = System::new();
398 system.refresh_memory();
399 let num_pairs: usize = (((system.total_memory() as f64) * (perc / 100.0)
400 / (std::mem::size_of::<(usize, usize)>() as f64))
401 as u64)
402 .try_into()?;
403 return Ok(num_pairs.align_to(1 << 20)); }
406
407 arg.chars().position(|c| c.is_alphabetic()).map_or_else(
408 || Ok(arg.parse::<usize>()?),
409 |pos| {
410 let (num, pref_sym) = arg.split_at(pos);
411 let multiplier = PREF_SYMS
412 .iter()
413 .find(|(x, _)| *x == pref_sym)
414 .map(|(_, m)| m)
415 .ok_or(anyhow!("invalid prefix symbol"))?;
416
417 Ok((num.parse::<u64>()? * multiplier).try_into()?)
418 },
419 )
420}
421
// Shared command-line options controlling compression; they are converted
// into a `CompFlags` value by the `From<CompressArgs> for CompFlags`
// implementation in this file (which does not use `endianness`).
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text.
#[derive(Args, Debug)]
pub struct CompressArgs {
    // `-E`: optional endianness name.
    // NOTE(review): the accepted values are not visible here — confirm
    // against the code that reads this field.
    #[clap(short = 'E', long)]
    pub endianness: Option<String>,

    // `-w`: compression window, 7 by default.
    #[clap(short = 'w', long, default_value_t = 7)]
    pub compression_window: usize,
    // `-i`: minimum interval length, 4 by default.
    #[clap(short = 'i', long, default_value_t = 4)]
    pub min_interval_length: usize,
    // `-r`: maximum reference count, 3 by default. The value -1 is mapped
    // to `usize::MAX` when converting into `CompFlags`.
    #[clap(short = 'r', long, default_value_t = 3)]
    pub max_ref_count: isize,

    // Code used for outdegrees (default: gamma).
    #[arg(value_enum)]
    #[clap(long, default_value = "gamma")]
    pub outdegrees: PrivCode,

    // Code used for references (default: unary).
    #[arg(value_enum)]
    #[clap(long, default_value = "unary")]
    pub references: PrivCode,

    // Code used for blocks (default: gamma).
    #[arg(value_enum)]
    #[clap(long, default_value = "gamma")]
    pub blocks: PrivCode,

    // Code used for residuals (default: zeta3).
    #[arg(value_enum)]
    #[clap(long, default_value = "zeta3")]
    pub residuals: PrivCode,
}
459
460impl From<CompressArgs> for CompFlags {
461 fn from(value: CompressArgs) -> Self {
462 CompFlags {
463 outdegrees: value.outdegrees.into(),
464 references: value.references.into(),
465 blocks: value.blocks.into(),
466 intervals: PrivCode::Gamma.into(),
467 residuals: value.residuals.into(),
468 min_interval_length: value.min_interval_length,
469 compression_window: value.compression_window,
470 max_ref_count: match value.max_ref_count {
471 -1 => usize::MAX,
472 _ => value.max_ref_count as usize,
473 },
474 }
475 }
476}
477
478pub fn get_thread_pool(num_threads: usize) -> rayon::ThreadPool {
480 rayon::ThreadPoolBuilder::new()
481 .num_threads(num_threads)
482 .build()
483 .expect("Failed to create thread pool")
484}
485
/// Appends the string `s` to the file name (last component) of `path`.
///
/// For example, appending `".ext"` to `dir/base` yields `dir/base.ext`.
///
/// # Panics
///
/// * If `path` has no file name (for instance, it is empty or ends in `..`).
/// * In debug builds, if `path` already has an extension.
pub fn append(path: impl AsRef<Path>, s: impl AsRef<str>) -> PathBuf {
    let path = path.as_ref();
    debug_assert!(path.extension().is_none());
    let mut filename = path
        .file_name()
        .expect("the path must have a file name")
        .to_owned();
    filename.push(s.as_ref());
    // Replace the last component rather than pushing a new one: `push`
    // would produce `dir/base/base.ext` instead of `dir/base.ext`.
    path.with_file_name(filename)
}
499
500pub fn create_parent_dir(file_path: impl AsRef<Path>) -> Result<()> {
502 if let Some(parent_dir) = file_path.as_ref().parent() {
504 std::fs::create_dir_all(parent_dir).with_context(|| {
505 format!(
506 "Failed to create the directory {:?}",
507 parent_dir.to_string_lossy()
508 )
509 })?;
510 }
511 Ok(())
512}
513
514fn parse_duration(value: &str) -> Result<Duration> {
524 if value.is_empty() {
525 bail!("Empty duration string, if you want every 0 milliseconds use `0`.");
526 }
527 let mut duration = Duration::from_secs(0);
528 let mut acc = String::new();
529 for c in value.chars() {
530 if c.is_ascii_digit() {
531 acc.push(c);
532 } else if c.is_whitespace() {
533 continue;
534 } else {
535 let dur = acc.parse::<u64>()?;
536 match c {
537 's' => duration += Duration::from_secs(dur),
538 'm' => duration += Duration::from_secs(dur * 60),
539 'h' => duration += Duration::from_secs(dur * 60 * 60),
540 'd' => duration += Duration::from_secs(dur * 60 * 60 * 24),
541 _ => return Err(anyhow!("Invalid duration suffix: {}", c)),
542 }
543 acc.clear();
544 }
545 }
546 if !acc.is_empty() {
547 let dur = acc.parse::<u64>()?;
548 duration += Duration::from_millis(dur);
549 }
550 Ok(duration)
551}
552
553pub fn init_env_logger() -> Result<()> {
554 let mut builder =
555 env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"));
556
557 let start = std::time::Instant::now();
558 let printer = SpanPrinter::new()
559 .spacing(Spacing::None)
560 .designator(Designator::Compact);
561 let span_round = SpanRound::new()
562 .largest(jiff::Unit::Day)
563 .smallest(jiff::Unit::Millisecond)
564 .days_are_24_hours();
565
566 builder.format(move |buf, record| {
567 let Ok(ts) = jiff::Timestamp::try_from(SystemTime::now()) else {
568 return Err(std::io::Error::other("Failed to get timestamp"));
569 };
570 let style = buf.default_level_style(record.level());
571 let elapsed = start.elapsed();
572 let span = jiff::Span::new()
573 .seconds(elapsed.as_secs() as i64)
574 .milliseconds(elapsed.subsec_millis() as i64);
575 let span = span.round(span_round).expect("Failed to round span");
576 writeln!(
577 buf,
578 "{} {} {style}{}{style:#} [{:?}] {} - {}",
579 ts.strftime("%F %T%.3f"),
580 printer.span_to_string(&span),
581 record.level(),
582 std::thread::current().id(),
583 record.target(),
584 record.args()
585 )
586 });
587 builder.init();
588 Ok(())
589}
590
// Options that are available on every subcommand (they are declared with
// `global = true` and flattened into `Cli`).
#[derive(Args, Debug)]
pub struct GlobalArgs {
    // Optional interval, parsed by `parse_duration`: numbers followed by
    // `s`, `m`, `h` or `d`, with a trailing bare number interpreted as
    // milliseconds. NOTE(review): presumably the interval between progress
    // log lines — confirm against the code that reads this field.
    #[arg(long, value_parser = parse_duration, global=true, display_order = 1000)]
    log_interval: Option<Duration>,
}
601
// Top-level subcommands. Each variant wraps the `SubCommands` enum of the
// module with the same (lower-case) name, and `cli_main` dispatches it to
// that module's `main` function.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text.
#[derive(Subcommand, Debug)]
pub enum SubCommands {
    #[command(subcommand)]
    Analyze(analyze::SubCommands),
    #[command(subcommand)]
    Bench(bench::SubCommands),
    #[command(subcommand)]
    Build(build::SubCommands),
    #[command(subcommand)]
    Check(check::SubCommands),
    #[command(subcommand)]
    From(from::SubCommands),
    #[command(subcommand)]
    Perm(perm::SubCommands),
    #[command(subcommand)]
    Run(run::SubCommands),
    #[command(subcommand)]
    To(to::SubCommands),
    #[command(subcommand)]
    Transform(transform::SubCommands),
}
623
// Root of the command-line interface: the command is named `webgraph` and
// its version string comes from `build_info::version_string()`.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text.
#[derive(Parser, Debug)]
#[command(name = "webgraph", version=build_info::version_string())]
pub struct Cli {
    // The selected top-level subcommand.
    #[command(subcommand)]
    command: SubCommands,
    // Options shared by all subcommands.
    #[clap(flatten)]
    args: GlobalArgs,
}
643
// Modules that are not wrapped by a variant of `SubCommands`.
pub mod dist;
pub mod sccs;

// One module per top-level subcommand: each provides the `SubCommands` enum
// wrapped by the corresponding variant above and the `main` function called
// by `cli_main`.
pub mod analyze;
pub mod bench;
pub mod build;
pub mod check;
pub mod from;
pub mod perm;
pub mod run;
pub mod to;
pub mod transform;
656
657pub fn cli_main<I, T>(args: I) -> Result<()>
659where
660 I: IntoIterator<Item = T>,
661 T: Into<std::ffi::OsString> + Clone,
662{
663 let start = std::time::Instant::now();
664 let cli = Cli::parse_from(args);
665 match cli.command {
666 SubCommands::Analyze(args) => {
667 analyze::main(cli.args, args)?;
668 }
669 SubCommands::Bench(args) => {
670 bench::main(cli.args, args)?;
671 }
672 SubCommands::Build(args) => {
673 build::main(cli.args, args, Cli::command())?;
674 }
675 SubCommands::Check(args) => {
676 check::main(cli.args, args)?;
677 }
678 SubCommands::From(args) => {
679 from::main(cli.args, args)?;
680 }
681 SubCommands::Perm(args) => {
682 perm::main(cli.args, args)?;
683 }
684 SubCommands::Run(args) => {
685 run::main(cli.args, args)?;
686 }
687 SubCommands::To(args) => {
688 to::main(cli.args, args)?;
689 }
690 SubCommands::Transform(args) => {
691 transform::main(cli.args, args)?;
692 }
693 }
694
695 log::info!(
696 "The command took {}",
697 pretty_print_elapsed(start.elapsed().as_secs_f64())
698 );
699
700 Ok(())
701}
702
/// Formats a number of seconds as a human-readable string.
///
/// Weeks, days, hours and minutes are listed only when non-zero, followed
/// by the remaining seconds with three decimals and, in parentheses, the
/// total number of seconds; for example
/// `2 hours 2 minutes 2.000 seconds (7322s)`.
fn pretty_print_elapsed(elapsed: f64) -> String {
    const MINUTE: u64 = 60;
    const HOUR: u64 = 60 * MINUTE;
    const DAY: u64 = 24 * HOUR;
    const WEEK: u64 = 7 * DAY;

    let mut out = String::new();
    let mut remaining = elapsed as u64;
    // Peel off the units from the largest to the smallest.
    for (unit_seconds, unit_name) in [
        (WEEK, "week"),
        (DAY, "day"),
        (HOUR, "hour"),
        (MINUTE, "minute"),
    ] {
        let count = remaining / unit_seconds;
        remaining %= unit_seconds;
        match count {
            0 => {}
            1 => out.push_str(&format!("1 {unit_name} ")),
            n => out.push_str(&format!("{n} {unit_name}s ")),
        }
    }

    out.push_str(&format!("{:.3} seconds ({}s)", elapsed % 60.0, elapsed));
    out
}