1#![doc = include_str!("../README.md")]
10#![allow(clippy::type_complexity)]
11
12use anyhow::{anyhow, bail, ensure, Context, Result};
13use clap::{Args, CommandFactory, Parser, Subcommand, ValueEnum};
14use common_traits::{ToBytes, UnsignedInt};
15use dsi_bitstream::dispatch::Codes;
16use epserde::ser::Serialize;
17use std::io::{BufWriter, Write};
18use std::path::{Path, PathBuf};
19use std::time::Duration;
20use std::time::SystemTime;
21use sux::bits::BitFieldVec;
22use webgraph::prelude::CompFlags;
23use webgraph::utils::{Granularity, MemoryUsage};
24
// Build-time guard: compilation fails with the message below unless at least
// one of the `le_bins` / `be_bins` Cargo features is enabled.
#[cfg(not(any(feature = "le_bins", feature = "be_bins")))]
compile_error!("At least one of the features `le_bins` or `be_bins` must be enabled.");
27
28pub mod build_info {
29 include!(concat!(env!("OUT_DIR"), "/built.rs"));
30
31 pub fn version_string() -> String {
32 format!(
33 "{}
34git info: {} {} {}
35build info: built on {} for {} with {}",
36 PKG_VERSION,
37 GIT_VERSION.unwrap_or(""),
38 GIT_COMMIT_HASH.unwrap_or(""),
39 match GIT_DIRTY {
40 None => "",
41 Some(true) => "(dirty)",
42 Some(false) => "(clean)",
43 },
44 BUILD_DATE,
45 TARGET,
46 RUSTC_VERSION
47 )
48 }
49}
50
// Codes that can be selected on the command line. Each variant is converted
// into a `dsi_bitstream` `Codes` value by the `From<PrivCode> for Codes` impl
// in this file.
//
// NOTE: plain `//` comments are used on purpose. clap's derive macros turn
// `///` doc comments into help text, so doc comments here would change the
// program's `--help` output.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
pub enum PrivCode {
    // Converted to `Codes::Unary`.
    Unary,
    // Converted to `Codes::Gamma`.
    Gamma,
    // Converted to `Codes::Delta`.
    Delta,
    // `ZetaN` is converted to `Codes::Zeta { k: N }`.
    Zeta1,
    Zeta2,
    Zeta3,
    Zeta4,
    Zeta5,
    Zeta6,
    Zeta7,
}
67
68impl From<PrivCode> for Codes {
69 fn from(value: PrivCode) -> Self {
70 match value {
71 PrivCode::Unary => Codes::Unary,
72 PrivCode::Gamma => Codes::Gamma,
73 PrivCode::Delta => Codes::Delta,
74 PrivCode::Zeta1 => Codes::Zeta { k: 1 },
75 PrivCode::Zeta2 => Codes::Zeta { k: 2 },
76 PrivCode::Zeta3 => Codes::Zeta { k: 3 },
77 PrivCode::Zeta4 => Codes::Zeta { k: 4 },
78 PrivCode::Zeta5 => Codes::Zeta { k: 5 },
79 PrivCode::Zeta6 => Codes::Zeta { k: 6 },
80 PrivCode::Zeta7 => Codes::Zeta { k: 7 },
81 }
82 }
83}
84
// Shared command-line options; judging by the field names they describe how
// arcs are read from a delimited text file (TODO confirm against the
// subcommands that flatten this struct).
//
// NOTE: plain `//` comments are used on purpose. clap's derive macros turn
// `///` doc comments into help text, so doc comments here would change the
// program's `--help` output.
#[derive(Args, Debug)]
pub struct ArcsArgs {
    // `--line-comment-symbol`, default `#`. Presumably lines starting with
    // this character are ignored by the reader — verify against the consumer.
    #[arg(long, default_value_t = '#')]
    pub line_comment_symbol: char,

    // `--lines-to-skip`, default 0. Presumably the number of leading lines
    // (e.g. a header) to skip — verify against the consumer.
    #[arg(long, default_value_t = 0)]
    pub lines_to_skip: usize,

    // `--max-arcs`, optional. Presumably an upper bound on the number of arcs
    // to read — verify against the consumer.
    #[arg(long)]
    pub max_arcs: Option<usize>,

    // `--separator`, default TAB. Presumably the column separator.
    #[arg(long, default_value_t = '\t')]
    pub separator: char,

    // `--source-column`, default 0. Presumably the index of the column holding
    // the source node of each arc.
    #[arg(long, default_value_t = 0)]
    pub source_column: usize,

    // `--target-column`, default 1. Presumably the index of the column holding
    // the target node of each arc.
    #[arg(long, default_value_t = 1)]
    pub target_column: usize,

    // `--labels` flag, default false. NOTE(review): exact meaning is not
    // visible from this file — confirm with the consumer.
    #[arg(long, default_value_t = false)]
    pub labels: bool,
}
117
118pub fn num_threads_parser(arg: &str) -> Result<usize> {
123 let num_threads = arg.parse::<usize>()?;
124 ensure!(num_threads > 0, "Number of threads must be greater than 0");
125 Ok(num_threads)
126}
127
// Shared `-j` / `--num-threads` option.
//
// NOTE: plain `//` comments are used on purpose. clap's derive macros turn
// `///` doc comments into help text, so doc comments here would change the
// program's `--help` output.
#[derive(Args, Debug)]
pub struct NumThreadsArg {
    // Defaults to rayon's current thread count (at least 1); the value is
    // validated by `num_threads_parser`, which rejects zero.
    #[arg(short = 'j', long, default_value_t = rayon::current_num_threads().max(1), value_parser = num_threads_parser)]
    pub num_threads: usize,
}
135
// Shared options selecting a granularity either in arcs or in nodes. The two
// options are declared mutually exclusive via `conflicts_with`, so at most one
// of them is set after parsing.
//
// NOTE: plain `//` comments are used on purpose. clap's derive macros turn
// `///` doc comments into help text, so doc comments here would change the
// program's `--help` output.
#[derive(Args, Debug)]
pub struct GranularityArgs {
    // `--arc-granularity`; becomes `Granularity::Arcs` in `into_granularity`.
    #[arg(long, conflicts_with("node_granularity"))]
    pub arc_granularity: Option<u64>,

    // `--node-granularity`; becomes `Granularity::Nodes` in `into_granularity`.
    #[arg(long, conflicts_with("arc_granularity"))]
    pub node_granularity: Option<usize>,
}
149
150impl GranularityArgs {
151 pub fn into_granularity(&self) -> Granularity {
152 match (self.arc_granularity, self.node_granularity) {
153 (Some(_), Some(_)) => unreachable!(),
154 (Some(arc_granularity), None) => Granularity::Arcs(arc_granularity),
155 (None, Some(node_granularity)) => Granularity::Nodes(node_granularity),
156 (None, None) => Granularity::default(),
157 }
158 }
159}
160
// Shared `-m` / `--memory-usage` option.
//
// NOTE: plain `//` comments are used on purpose. clap's derive macros turn
// `///` doc comments into help text, so doc comments here would change the
// program's `--help` output.
#[derive(Args, Debug)]
pub struct MemoryUsageArg {
    // Parsed by `memory_usage_parser` (a percentage such as `50%`, or a number
    // followed by a multiplier); defaults to `50%`.
    #[clap(short = 'm', long = "memory-usage", value_parser = memory_usage_parser, default_value = "50%")]
    pub memory_usage: MemoryUsage,
}
171
// On-disk formats supported by `FloatVectorFormat::store`.
//
// NOTE: plain `//` comments are used on purpose. clap's derive macros turn
// `///` doc comments into help text, so doc comments here would change the
// program's `--help` output.
#[derive(Debug, Clone, Copy, ValueEnum)]
pub enum FloatVectorFormat {
    // The big-endian bytes of each value, written one after the other.
    Java,
    // ε-serde serialization of the slice.
    Epserde,
    // One value per line, as text.
    Ascii,
    // A JSON array of the values.
    Json,
}
184
185impl FloatVectorFormat {
186 pub fn store<F>(
193 &self,
194 path: impl AsRef<Path>,
195 values: &[F],
196 precision: Option<usize>,
197 ) -> Result<()>
198 where
199 F: ToBytes + core::fmt::Display + epserde::ser::Serialize + Copy,
200 for<'a> &'a [F]: epserde::ser::Serialize,
201 {
202 let precision = precision.unwrap_or(f64::DIGITS as usize);
203 create_parent_dir(&path)?;
204 let path_display = path.as_ref().display();
205 let mut file = std::fs::File::create(&path)
206 .with_context(|| format!("Could not create vector at {}", path_display))?;
207
208 match self {
209 FloatVectorFormat::Epserde => {
210 log::info!("Storing in ε-serde format at {}", path_display);
211 unsafe {
212 values
213 .serialize(&mut file)
214 .with_context(|| format!("Could not write vector to {}", path_display))
215 }?;
216 }
217 FloatVectorFormat::Java => {
218 log::info!("Storing in Java format at {}", path_display);
219 for word in values.iter() {
220 file.write_all(word.to_be_bytes().as_ref())
221 .with_context(|| format!("Could not write vector to {}", path_display))?;
222 }
223 }
224 FloatVectorFormat::Ascii => {
225 log::info!("Storing in ASCII format at {}", path_display);
226 for word in values.iter() {
227 writeln!(file, "{word:.precision$}")
228 .with_context(|| format!("Could not write vector to {}", path_display))?;
229 }
230 }
231 FloatVectorFormat::Json => {
232 log::info!("Storing in JSON format at {}", path_display);
233 write!(file, "[")?;
234 for word in values.iter().take(values.len().saturating_sub(2)) {
235 write!(file, "{word:.precision$}, ")
236 .with_context(|| format!("Could not write vector to {}", path_display))?;
237 }
238 if let Some(last) = values.last() {
239 write!(file, "{last:.precision$}")
240 .with_context(|| format!("Could not write vector to {}", path_display))?;
241 }
242 write!(file, "]")?;
243 }
244 }
245
246 Ok(())
247 }
248}
249
// On-disk formats supported by `IntVectorFormat::store`.
//
// NOTE: plain `//` comments are used on purpose. clap's derive macros turn
// `///` doc comments into help text, so doc comments here would change the
// program's `--help` output.
#[derive(Debug, Clone, Copy, ValueEnum)]
pub enum IntVectorFormat {
    // The big-endian bytes of each 64-bit value, written one after the other.
    Java,
    // ε-serde serialization of the slice of `u64`.
    Epserde,
    // ε-serde serialization of a `BitFieldVec` whose bit width is derived from
    // the maximum value.
    BitFieldVec,
    // One value per line, as text.
    Ascii,
    // A JSON array of the values.
    Json,
}
266
267impl IntVectorFormat {
268 pub fn store(&self, path: impl AsRef<Path>, data: &[u64], max: Option<u64>) -> Result<()> {
273 create_parent_dir(&path)?;
275
276 let mut file = std::fs::File::create(&path)
277 .with_context(|| format!("Could not create vector at {}", path.as_ref().display()))?;
278 let mut buf = BufWriter::new(&mut file);
279
280 debug_assert_eq!(
281 max,
282 max.map(|_| { data.iter().copied().max().unwrap_or(0) }),
283 "The wrong maximum value was provided for the vector"
284 );
285
286 match self {
287 IntVectorFormat::Epserde => {
288 log::info!("Storing in epserde format at {}", path.as_ref().display());
289 unsafe {
290 data.serialize(&mut buf).with_context(|| {
291 format!("Could not write vector to {}", path.as_ref().display())
292 })
293 }?;
294 }
295 IntVectorFormat::BitFieldVec => {
296 log::info!(
297 "Storing in BitFieldVec format at {}",
298 path.as_ref().display()
299 );
300 let max = max.unwrap_or_else(|| {
301 data.iter()
302 .copied()
303 .max()
304 .unwrap_or_else(|| panic!("Empty vector"))
305 });
306 let bit_width = max.len() as usize;
307 log::info!("Using {} bits per element", bit_width);
308 let mut bit_field_vec = <BitFieldVec<u64, _>>::with_capacity(bit_width, data.len());
309 bit_field_vec.extend(data.iter().copied());
310 unsafe {
311 bit_field_vec.store(&path).with_context(|| {
312 format!("Could not write vector to {}", path.as_ref().display())
313 })
314 }?;
315 }
316 IntVectorFormat::Java => {
317 log::info!("Storing in Java format at {}", path.as_ref().display());
318 for word in data.iter() {
319 buf.write_all(&word.to_be_bytes()).with_context(|| {
320 format!("Could not write vector to {}", path.as_ref().display())
321 })?;
322 }
323 }
324 IntVectorFormat::Ascii => {
325 log::info!("Storing in ASCII format at {}", path.as_ref().display());
326 for word in data.iter() {
327 writeln!(buf, "{}", word).with_context(|| {
328 format!("Could not write vector to {}", path.as_ref().display())
329 })?;
330 }
331 }
332 IntVectorFormat::Json => {
333 log::info!("Storing in JSON format at {}", path.as_ref().display());
334 write!(buf, "[")?;
335 for word in data.iter().take(data.len().saturating_sub(2)) {
336 write!(buf, "{}, ", word).with_context(|| {
337 format!("Could not write vector to {}", path.as_ref().display())
338 })?;
339 }
340 if let Some(last) = data.last() {
341 write!(buf, "{}", last).with_context(|| {
342 format!("Could not write vector to {}", path.as_ref().display())
343 })?;
344 }
345 write!(buf, "]")?;
346 }
347 };
348
349 Ok(())
350 }
351
352 #[cfg(target_pointer_width = "64")]
353 pub fn store_usizes(
360 &self,
361 path: impl AsRef<Path>,
362 data: &[usize],
363 max: Option<usize>,
364 ) -> Result<()> {
365 self.store(
366 path,
367 unsafe { core::mem::transmute::<&[usize], &[u64]>(data) },
368 max.map(|x| x as u64),
369 )
370 }
371}
372
373pub fn memory_usage_parser(arg: &str) -> anyhow::Result<MemoryUsage> {
381 const PREF_SYMS: [(&str, u64); 10] = [
382 ("k", 1E3 as u64),
383 ("m", 1E6 as u64),
384 ("g", 1E9 as u64),
385 ("t", 1E12 as u64),
386 ("p", 1E15 as u64),
387 ("ki", 1 << 10),
388 ("mi", 1 << 20),
389 ("gi", 1 << 30),
390 ("ti", 1 << 40),
391 ("pi", 1 << 50),
392 ];
393 let arg = arg.trim().to_ascii_lowercase();
394 ensure!(!arg.is_empty(), "empty string");
395
396 if arg.ends_with('%') {
397 let perc = arg[..arg.len() - 1].parse::<f64>()?;
398 ensure!((0.0..=100.0).contains(&perc), "percentage out of range");
399 return Ok(MemoryUsage::from_perc(perc));
400 }
401
402 let num_digits = arg
403 .chars()
404 .take_while(|c| c.is_ascii_digit() || *c == '.')
405 .count();
406
407 let number = arg[..num_digits].parse::<f64>()?;
408 let suffix = &arg[num_digits..].trim();
409
410 let multiplier = PREF_SYMS
411 .iter()
412 .find(|(x, _)| suffix.starts_with(x))
413 .map(|(_, m)| m)
414 .ok_or(anyhow!("invalid prefix symbol {}", suffix))?;
415
416 let value = (number * (*multiplier as f64)) as usize;
417 ensure!(value > 0, "batch size must be greater than zero");
418
419 if suffix.ends_with('b') {
420 Ok(MemoryUsage::MemorySize(value))
421 } else {
422 Ok(MemoryUsage::BatchSize(value))
423 }
424}
425
// Shared compression options. All fields except `endianness` are converted
// into a `CompFlags` by the `From<CompressArgs> for CompFlags` impl in this
// file.
//
// NOTE: plain `//` comments are used on purpose. clap's derive macros turn
// `///` doc comments into help text, so doc comments here would change the
// program's `--help` output.
#[derive(Args, Debug)]
pub struct CompressArgs {
    // `-E` / `--endianness`, optional. Not used by the `CompFlags` conversion;
    // NOTE(review): accepted values are not visible from this file.
    #[clap(short = 'E', long)]
    pub endianness: Option<String>,

    // `-w` / `--compression-window`, default 7.
    #[clap(short = 'w', long, default_value_t = 7)]
    pub compression_window: usize,
    // `-i` / `--min-interval-length`, default 4.
    #[clap(short = 'i', long, default_value_t = 4)]
    pub min_interval_length: usize,
    // `-r` / `--max-ref-count`, default 3. The value -1 is converted to
    // `usize::MAX` by the `CompFlags` conversion.
    #[clap(short = 'r', long, default_value_t = 3)]
    pub max_ref_count: isize,

    // Code used for outdegrees, default `gamma`.
    #[arg(value_enum)]
    #[clap(long, default_value = "gamma")]
    pub outdegrees: PrivCode,

    // Code used for references, default `unary`.
    #[arg(value_enum)]
    #[clap(long, default_value = "unary")]
    pub references: PrivCode,

    // Code used for blocks, default `gamma`.
    #[arg(value_enum)]
    #[clap(long, default_value = "gamma")]
    pub blocks: PrivCode,

    // Code used for residuals, default `zeta3`.
    #[arg(value_enum)]
    #[clap(long, default_value = "zeta3")]
    pub residuals: PrivCode,
}
463
464impl From<CompressArgs> for CompFlags {
465 fn from(value: CompressArgs) -> Self {
466 CompFlags {
467 outdegrees: value.outdegrees.into(),
468 references: value.references.into(),
469 blocks: value.blocks.into(),
470 intervals: PrivCode::Gamma.into(),
471 residuals: value.residuals.into(),
472 min_interval_length: value.min_interval_length,
473 compression_window: value.compression_window,
474 max_ref_count: match value.max_ref_count {
475 -1 => usize::MAX,
476 _ => value.max_ref_count as usize,
477 },
478 }
479 }
480}
481
482pub fn get_thread_pool(num_threads: usize) -> rayon::ThreadPool {
484 rayon::ThreadPoolBuilder::new()
485 .num_threads(num_threads)
486 .build()
487 .expect("Failed to create thread pool")
488}
489
/// Appends the string `s` to the file name of `path`, returning the new path.
///
/// For example, appending `".ef"` to `dir/graph` yields `dir/graph.ef`. The
/// file name is replaced in place: no additional path component is created.
///
/// # Panics
///
/// * Panics if `path` has no file name (e.g. it is `/` or ends in `..`).
/// * Panics in builds with debug assertions if `path` has an extension.
pub fn append(path: impl AsRef<Path>, s: impl AsRef<str>) -> PathBuf {
    let path = path.as_ref();
    debug_assert!(path.extension().is_none());
    let mut file_name = path
        .file_name()
        .expect("the path must have a file name")
        .to_owned();
    file_name.push(s.as_ref());
    // `with_file_name` swaps the last component; `PathBuf::push` would instead
    // add the new name as a child of the old one (`dir/graph/graph.ef`).
    path.with_file_name(file_name)
}
503
504pub fn create_parent_dir(file_path: impl AsRef<Path>) -> Result<()> {
506 if let Some(parent_dir) = file_path.as_ref().parent() {
508 std::fs::create_dir_all(parent_dir).with_context(|| {
509 format!(
510 "Failed to create the directory {:?}",
511 parent_dir.to_string_lossy()
512 )
513 })?;
514 }
515 Ok(())
516}
517
518fn parse_duration(value: &str) -> Result<Duration> {
528 if value.is_empty() {
529 bail!("Empty duration string, if you want every 0 milliseconds use `0`.");
530 }
531 let mut duration = Duration::from_secs(0);
532 let mut acc = String::new();
533 for c in value.chars() {
534 if c.is_ascii_digit() {
535 acc.push(c);
536 } else if c.is_whitespace() {
537 continue;
538 } else {
539 let dur = acc.parse::<u64>()?;
540 match c {
541 's' => duration += Duration::from_secs(dur),
542 'm' => duration += Duration::from_secs(dur * 60),
543 'h' => duration += Duration::from_secs(dur * 60 * 60),
544 'd' => duration += Duration::from_secs(dur * 60 * 60 * 24),
545 _ => return Err(anyhow!("Invalid duration suffix: {}", c)),
546 }
547 acc.clear();
548 }
549 }
550 if !acc.is_empty() {
551 let dur = acc.parse::<u64>()?;
552 duration += Duration::from_millis(dur);
553 }
554 Ok(duration)
555}
556
557pub fn init_env_logger() -> Result<()> {
560 use jiff::fmt::friendly::{Designator, Spacing, SpanPrinter};
561 use jiff::SpanRound;
562
563 let mut builder =
564 env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"));
565
566 let start = std::time::Instant::now();
567 let printer = SpanPrinter::new()
568 .spacing(Spacing::None)
569 .designator(Designator::Compact);
570 let span_round = SpanRound::new()
571 .largest(jiff::Unit::Day)
572 .smallest(jiff::Unit::Millisecond)
573 .days_are_24_hours();
574
575 builder.format(move |buf, record| {
576 let Ok(ts) = jiff::Timestamp::try_from(SystemTime::now()) else {
577 return Err(std::io::Error::other("Failed to get timestamp"));
578 };
579 let style = buf.default_level_style(record.level());
580 let elapsed = start.elapsed();
581 let span = jiff::Span::new()
582 .seconds(elapsed.as_secs() as i64)
583 .milliseconds(elapsed.subsec_millis() as i64);
584 let span = span.round(span_round).expect("Failed to round span");
585 writeln!(
586 buf,
587 "{} {} {style}{}{style:#} [{:?}] {} - {}",
588 ts.strftime("%F %T%.3f"),
589 printer.span_to_string(&span),
590 record.level(),
591 std::thread::current().id(),
592 record.target(),
593 record.args()
594 )
595 });
596 builder.init();
597 Ok(())
598}
599
// Options accepted by every subcommand (declared `global`); `cli_main` passes
// this struct to each subcommand's `main`.
//
// NOTE: plain `//` comments are used on purpose. clap's derive macros turn
// `///` doc comments into help text, so doc comments here would change the
// program's `--help` output.
#[derive(Args, Debug)]
pub struct GlobalArgs {
    // `--log-interval`, optional; parsed by `parse_duration` (e.g. `1m30s`, or
    // a bare number of milliseconds).
    #[arg(long, value_parser = parse_duration, global=true, display_order = 1000)]
    pub log_interval: Option<Duration>,
}
610
// Top-level subcommands. Each variant wraps the `SubCommands` enum of the
// module with the same name, and `cli_main` dispatches it to that module's
// `main` function.
//
// NOTE: plain `//` comments are used on purpose. clap's derive macros turn
// `///` doc comments into help text, so doc comments here would change the
// program's `--help` output.
#[derive(Subcommand, Debug)]
pub enum SubCommands {
    #[command(subcommand)]
    Analyze(analyze::SubCommands),
    #[command(subcommand)]
    Bench(bench::SubCommands),
    #[command(subcommand)]
    Build(build::SubCommands),
    #[command(subcommand)]
    Check(check::SubCommands),
    #[command(subcommand)]
    From(from::SubCommands),
    #[command(subcommand)]
    Perm(perm::SubCommands),
    #[command(subcommand)]
    Run(run::SubCommands),
    #[command(subcommand)]
    To(to::SubCommands),
    #[command(subcommand)]
    Transform(transform::SubCommands),
}
632
// Top-level command-line parser for the `webgraph` command. The version
// string comes from `build_info::version_string()`, and the contents of
// `common_env.txt` are attached as documentation.
//
// NOTE: plain `//` comments are used on purpose. clap's derive macros turn
// `///` doc comments into help text, so doc comments here would change the
// program's `--help` output.
#[derive(Parser, Debug)]
#[command(name = "webgraph", version=build_info::version_string())]
#[doc = include_str!("common_env.txt")]
pub struct Cli {
    // The selected subcommand.
    #[command(subcommand)]
    pub command: SubCommands,
    // Options shared by all subcommands.
    #[clap(flatten)]
    pub args: GlobalArgs,
}
643
// Modules that have no matching variant in `SubCommands`.
pub mod dist;
pub mod sccs;

// One module per `SubCommands` variant; `cli_main` calls each module's `main`.
pub mod analyze;
pub mod bench;
pub mod build;
pub mod check;
pub mod from;
pub mod perm;
pub mod run;
pub mod to;
pub mod transform;
656
657pub fn cli_main<I, T>(args: I) -> Result<()>
659where
660 I: IntoIterator<Item = T>,
661 T: Into<std::ffi::OsString> + Clone,
662{
663 let start = std::time::Instant::now();
664 let cli = Cli::parse_from(args);
665 match cli.command {
666 SubCommands::Analyze(args) => {
667 analyze::main(cli.args, args)?;
668 }
669 SubCommands::Bench(args) => {
670 bench::main(cli.args, args)?;
671 }
672 SubCommands::Build(args) => {
673 build::main(cli.args, args, Cli::command())?;
674 }
675 SubCommands::Check(args) => {
676 check::main(cli.args, args)?;
677 }
678 SubCommands::From(args) => {
679 from::main(cli.args, args)?;
680 }
681 SubCommands::Perm(args) => {
682 perm::main(cli.args, args)?;
683 }
684 SubCommands::Run(args) => {
685 run::main(cli.args, args)?;
686 }
687 SubCommands::To(args) => {
688 to::main(cli.args, args)?;
689 }
690 SubCommands::Transform(args) => {
691 transform::main(cli.args, args)?;
692 }
693 }
694
695 log::info!(
696 "The command took {}",
697 pretty_print_elapsed(start.elapsed().as_secs_f64())
698 );
699
700 Ok(())
701}
702
/// Formats a number of seconds as a human-readable string.
///
/// Non-zero weeks, days, hours and minutes are listed first (with singular or
/// plural unit names), followed by the seconds within the current minute with
/// three decimals and, in parentheses, the raw total in seconds.
fn pretty_print_elapsed(elapsed: f64) -> String {
    // Units from largest to smallest: (length in seconds, singular name).
    const UNITS: [(u64, &str); 4] = [
        (60 * 60 * 24 * 7, "week"),
        (60 * 60 * 24, "day"),
        (60 * 60, "hour"),
        (60, "minute"),
    ];

    let mut remaining = elapsed as u64;
    let mut text = String::new();
    for &(unit_secs, unit_name) in UNITS.iter() {
        let count = remaining / unit_secs;
        remaining %= unit_secs;
        match count {
            0 => {}
            1 => text.push_str(&format!("1 {} ", unit_name)),
            n => text.push_str(&format!("{} {}s ", n, unit_name)),
        }
    }

    text.push_str(&format!("{:.3} seconds ({}s)", elapsed % 60.0, elapsed));
    text
}