webgraph_cli/
lib.rs

1/*
2 * SPDX-FileCopyrightText: 2023 Inria
3 * SPDX-FileCopyrightText: 2023 Tommaso Fontana
4 * SPDX-FileCopyrightText: 2025 Sebastiano Vigna
5 *
6 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
7 */
8
9#![doc = include_str!("../README.md")]
10#![deny(unstable_features)]
11#![deny(trivial_casts)]
12#![deny(unconditional_recursion)]
13#![deny(clippy::empty_loop)]
14#![deny(unreachable_code)]
15#![deny(unreachable_pub)]
16#![deny(unreachable_patterns)]
17#![deny(unused_macro_rules)]
18#![deny(unused_doc_comments)]
19#![allow(clippy::type_complexity)]
20
21use anyhow::{Context, Result, anyhow, bail, ensure};
22use clap::{Args, CommandFactory, Parser, Subcommand, ValueEnum};
23use common_traits::{ToBytes, UnsignedInt};
24use dsi_bitstream::dispatch::Codes;
25use epserde::ser::Serialize;
26use std::io::{BufWriter, Write};
27use std::path::{Path, PathBuf};
28use std::time::Duration;
29use std::time::SystemTime;
30use sux::bits::BitFieldVec;
31use webgraph::prelude::CompFlags;
32use webgraph::utils::{Granularity, MemoryUsage};
33
// Expands to the format string of the warning shown when a graph has to be
// processed sequentially. The single `{}` placeholder completes the suggested
// `webgraph build ef` command line (presumably with the graph basename —
// confirm against the call sites).
macro_rules! SEQ_PROC_WARN {
    () => {
        "Processing the graph sequentially: for parallel processing please build the Elias-Fano offsets list using 'webgraph build ef {}'"
    };
}
37
38#[cfg(not(any(feature = "le_bins", feature = "be_bins")))]
39compile_error!("At least one of the features `le_bins` or `be_bins` must be enabled.");
40
41pub mod build_info {
42    include!(concat!(env!("OUT_DIR"), "/built.rs"));
43
44    pub fn version_string() -> String {
45        format!(
46            "{}
47git info: {} {} {}
48build info: built on {} for {} with {}",
49            PKG_VERSION,
50            GIT_VERSION.unwrap_or(""),
51            GIT_COMMIT_HASH.unwrap_or(""),
52            match GIT_DIRTY {
53                None => "",
54                Some(true) => "(dirty)",
55                Some(false) => "(clean)",
56            },
57            BUILD_DATE,
58            TARGET,
59            RUSTC_VERSION
60        )
61    }
62}
63
64#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
65/// Enum for instantaneous codes.
66///
67/// It is used to implement [`ValueEnum`] here instead of in [`dsi_bitstream`].
68///
69/// For CLI ergonomics and compatibility, this codes must be the same as those
70/// appearing in [`CompFlags::code_from_str`].
71pub enum PrivCode {
72    Unary,
73    Gamma,
74    Delta,
75    Zeta1,
76    Zeta2,
77    Zeta3,
78    Zeta4,
79    Zeta5,
80    Zeta6,
81    Zeta7,
82    Pi1,
83    Pi2,
84    Pi3,
85    Pi4,
86}
87
88impl From<PrivCode> for Codes {
89    fn from(value: PrivCode) -> Self {
90        match value {
91            PrivCode::Unary => Codes::Unary,
92            PrivCode::Gamma => Codes::Gamma,
93            PrivCode::Delta => Codes::Delta,
94            PrivCode::Zeta1 => Codes::Zeta(1),
95            PrivCode::Zeta2 => Codes::Zeta(2),
96            PrivCode::Zeta3 => Codes::Zeta(3),
97            PrivCode::Zeta4 => Codes::Zeta(4),
98            PrivCode::Zeta5 => Codes::Zeta(5),
99            PrivCode::Zeta6 => Codes::Zeta(6),
100            PrivCode::Zeta7 => Codes::Zeta(7),
101            PrivCode::Pi1 => Codes::Pi(1),
102            PrivCode::Pi2 => Codes::Pi(2),
103            PrivCode::Pi3 => Codes::Pi(3),
104            PrivCode::Pi4 => Codes::Pi(4),
105        }
106    }
107}
108
109#[derive(Args, Debug)]
110/// Shared CLI arguments for reading files containing arcs.
111pub struct ArcsArgs {
112    #[arg(long, default_value_t = '#')]
113    /// Ignore lines that start with this symbol.
114    pub line_comment_symbol: char,
115
116    #[arg(long, default_value_t = 0)]
117    /// How many lines to skip, ignoring comment lines.
118    pub lines_to_skip: usize,
119
120    #[arg(long)]
121    /// How many lines to parse, after skipping the first lines_to_skip and
122    /// ignoring comment lines.
123    pub max_arcs: Option<usize>,
124
125    #[arg(long, default_value_t = '\t')]
126    /// The column separator.
127    pub separator: char,
128
129    #[arg(long, default_value_t = 0)]
130    /// The index of the column containing the source node of an arc.
131    pub source_column: usize,
132
133    #[arg(long, default_value_t = 1)]
134    /// The index of the column containing the target node of an arc.
135    pub target_column: usize,
136
137    #[arg(long, default_value_t = false)]
138    /// Source and destinations are not node identifiers starting from 0, but labels.
139    pub labels: bool,
140}
141
142/// Parses the number of threads from a string.
143///
144/// This function is meant to be used with `#[arg(...,  value_parser =
145/// num_threads_parser)]`.
146pub fn num_threads_parser(arg: &str) -> Result<usize> {
147    let num_threads = arg.parse::<usize>()?;
148    ensure!(num_threads > 0, "Number of threads must be greater than 0");
149    Ok(num_threads)
150}
151
152/// Shared CLI arguments for commands that specify a number of threads.
153#[derive(Args, Debug)]
154pub struct NumThreadsArg {
155    #[arg(short = 'j', long, default_value_t = rayon::current_num_threads().max(1), value_parser = num_threads_parser)]
156    /// The number of threads to use.
157    pub num_threads: usize,
158}
159
160/// Shared CLI arguments for commands that specify a granularity.
161#[derive(Args, Debug)]
162pub struct GranularityArgs {
163    #[arg(long, conflicts_with("node_granularity"))]
164    /// The tentative number of arcs used to define the size of a parallel job
165    /// (advanced option).
166    pub arc_granularity: Option<u64>,
167
168    #[arg(long, conflicts_with("arc_granularity"))]
169    /// The tentative number of nodes used to define the size of a parallel job
170    /// (advanced option).
171    pub node_granularity: Option<usize>,
172}
173
174impl GranularityArgs {
175    pub fn into_granularity(&self) -> Granularity {
176        match (self.arc_granularity, self.node_granularity) {
177            (Some(_), Some(_)) => unreachable!(),
178            (Some(arc_granularity), None) => Granularity::Arcs(arc_granularity),
179            (None, Some(node_granularity)) => Granularity::Nodes(node_granularity),
180            (None, None) => Granularity::default(),
181        }
182    }
183}
184
185/// Shared CLI arguments for commands that specify a memory usage.
186#[derive(Args, Debug)]
187pub struct MemoryUsageArg {
188    #[clap(short = 'm', long = "memory-usage", value_parser = memory_usage_parser, default_value = "50%")]
189    /// The number of pairs to be used in batches.
190    /// If the number ends with a "b" or "B" it is interpreted as a number of bytes, otherwise as a number of elements.
191    /// You can use the SI and NIST multipliers k, M, G, T, P, ki, Mi, Gi, Ti, and Pi.
192    /// You can also use a percentage of the available memory by appending a "%" to the number.
193    pub memory_usage: MemoryUsage,
194}
195
196#[derive(Debug, Clone, Copy, ValueEnum)]
197/// How to store vectors of floats.
198pub enum FloatVectorFormat {
199    /// Java-compatible format: a sequence of big-endian floats (32 or 64 bits).
200    Java,
201    /// A slice of floats (32 or 64 bits) serialized using ε-serde.
202    Epserde,
203    /// ASCII format, one float per line.
204    Ascii,
205    /// A JSON Array.
206    Json,
207}
208
209impl FloatVectorFormat {
210    /// Stores float values in the specified `path` using the format defined by
211    /// `self`.
212    ///
213    /// If the result is a textual format, i.e., ASCII or JSON, `precision`
214    /// will be used to truncate the float values to the specified number of
215    /// decimal digits.
216    pub fn store<F>(
217        &self,
218        path: impl AsRef<Path>,
219        values: &[F],
220        precision: Option<usize>,
221    ) -> Result<()>
222    where
223        F: ToBytes + core::fmt::Display + epserde::ser::Serialize + Copy,
224        for<'a> &'a [F]: epserde::ser::Serialize,
225    {
226        let precision = precision.unwrap_or(f64::DIGITS as usize);
227        create_parent_dir(&path)?;
228        let path_display = path.as_ref().display();
229        let mut file = std::fs::File::create(&path)
230            .with_context(|| format!("Could not create vector at {}", path_display))?;
231
232        match self {
233            FloatVectorFormat::Epserde => {
234                log::info!("Storing in ε-serde format at {}", path_display);
235                unsafe {
236                    values
237                        .serialize(&mut file)
238                        .with_context(|| format!("Could not write vector to {}", path_display))
239                }?;
240            }
241            FloatVectorFormat::Java => {
242                log::info!("Storing in Java format at {}", path_display);
243                for word in values.iter() {
244                    file.write_all(word.to_be_bytes().as_ref())
245                        .with_context(|| format!("Could not write vector to {}", path_display))?;
246                }
247            }
248            FloatVectorFormat::Ascii => {
249                log::info!("Storing in ASCII format at {}", path_display);
250                for word in values.iter() {
251                    writeln!(file, "{word:.precision$}")
252                        .with_context(|| format!("Could not write vector to {}", path_display))?;
253                }
254            }
255            FloatVectorFormat::Json => {
256                log::info!("Storing in JSON format at {}", path_display);
257                write!(file, "[")?;
258                for word in values.iter().take(values.len().saturating_sub(1)) {
259                    write!(file, "{word:.precision$}, ")
260                        .with_context(|| format!("Could not write vector to {}", path_display))?;
261                }
262                if let Some(last) = values.last() {
263                    write!(file, "{last:.precision$}")
264                        .with_context(|| format!("Could not write vector to {}", path_display))?;
265                }
266                write!(file, "]")?;
267            }
268        }
269
270        Ok(())
271    }
272}
273
274#[derive(Debug, Clone, Copy, ValueEnum)]
275/// How to store vectors of integers.
276pub enum IntVectorFormat {
277    /// Java-compatible format: a sequence of big-endian longs (64 bits).
278    Java,
279    /// A slice of usize serialized using ε-serde.
280    Epserde,
281    /// A BitFieldVec stored using ε-serde. It stores each element using
282    /// ⌊log₂(max)⌋ + 1 bits. It requires to allocate the `BitFieldVec` in RAM
283    /// before serializing it.
284    BitFieldVec,
285    /// ASCII format, one integer per line.
286    Ascii,
287    /// A JSON Array.
288    Json,
289}
290
291impl IntVectorFormat {
292    /// Stores a vector of `u64` in the specified `path` using the format defined by `self`.
293    ///
294    /// `max` is the maximum value of the vector. If it is not provided, it will
295    /// be computed from the data.
296    pub fn store(&self, path: impl AsRef<Path>, data: &[u64], max: Option<u64>) -> Result<()> {
297        // Ensure the parent directory exists
298        create_parent_dir(&path)?;
299
300        let mut file = std::fs::File::create(&path)
301            .with_context(|| format!("Could not create vector at {}", path.as_ref().display()))?;
302        let mut buf = BufWriter::new(&mut file);
303
304        debug_assert_eq!(
305            max,
306            max.map(|_| { data.iter().copied().max().unwrap_or(0) }),
307            "The wrong maximum value was provided for the vector"
308        );
309
310        match self {
311            IntVectorFormat::Epserde => {
312                log::info!("Storing in epserde format at {}", path.as_ref().display());
313                unsafe {
314                    data.serialize(&mut buf).with_context(|| {
315                        format!("Could not write vector to {}", path.as_ref().display())
316                    })
317                }?;
318            }
319            IntVectorFormat::BitFieldVec => {
320                log::info!(
321                    "Storing in BitFieldVec format at {}",
322                    path.as_ref().display()
323                );
324                let max = max.unwrap_or_else(|| {
325                    data.iter()
326                        .copied()
327                        .max()
328                        .unwrap_or_else(|| panic!("Empty vector"))
329                });
330                let bit_width = max.len() as usize;
331                log::info!("Using {} bits per element", bit_width);
332                let mut bit_field_vec = <BitFieldVec<u64, _>>::with_capacity(bit_width, data.len());
333                bit_field_vec.extend(data.iter().copied());
334                unsafe {
335                    bit_field_vec.store(&path).with_context(|| {
336                        format!("Could not write vector to {}", path.as_ref().display())
337                    })
338                }?;
339            }
340            IntVectorFormat::Java => {
341                log::info!("Storing in Java format at {}", path.as_ref().display());
342                for word in data.iter() {
343                    buf.write_all(&word.to_be_bytes()).with_context(|| {
344                        format!("Could not write vector to {}", path.as_ref().display())
345                    })?;
346                }
347            }
348            IntVectorFormat::Ascii => {
349                log::info!("Storing in ASCII format at {}", path.as_ref().display());
350                for word in data.iter() {
351                    writeln!(buf, "{}", word).with_context(|| {
352                        format!("Could not write vector to {}", path.as_ref().display())
353                    })?;
354                }
355            }
356            IntVectorFormat::Json => {
357                log::info!("Storing in JSON format at {}", path.as_ref().display());
358                write!(buf, "[")?;
359                for word in data.iter().take(data.len().saturating_sub(1)) {
360                    write!(buf, "{}, ", word).with_context(|| {
361                        format!("Could not write vector to {}", path.as_ref().display())
362                    })?;
363                }
364                if let Some(last) = data.last() {
365                    write!(buf, "{}", last).with_context(|| {
366                        format!("Could not write vector to {}", path.as_ref().display())
367                    })?;
368                }
369                write!(buf, "]")?;
370            }
371        };
372
373        Ok(())
374    }
375
376    #[cfg(target_pointer_width = "64")]
377    /// Stores a vector of `usize` in the specified `path` using the format defined by `self`.
378    /// `max` is the maximum value of the vector, if it is not provided, it will
379    /// be computed from the data.
380    ///
381    /// This helper method is available only on 64-bit architectures as Java's format
382    /// uses of 64-bit integers.
383    pub fn store_usizes(
384        &self,
385        path: impl AsRef<Path>,
386        data: &[usize],
387        max: Option<usize>,
388    ) -> Result<()> {
389        self.store(
390            path,
391            unsafe { core::mem::transmute::<&[usize], &[u64]>(data) },
392            max.map(|x| x as u64),
393        )
394    }
395}
396
397/// Parses a batch size.
398///
399/// This function accepts either a number (possibly followed by a
400/// SI or NIST multiplier k, M, G, T, P, ki, Mi, Gi, Ti, or Pi), or a percentage
401/// (followed by a `%`) that is interpreted as a percentage of the core
402/// memory. If the value ends with a `b` or `B` it is interpreted as a number of
403/// bytes, otherwise as a number of elements.
404pub fn memory_usage_parser(arg: &str) -> anyhow::Result<MemoryUsage> {
405    const PREF_SYMS: [(&str, u64); 10] = [
406        ("ki", 1 << 10),
407        ("mi", 1 << 20),
408        ("gi", 1 << 30),
409        ("ti", 1 << 40),
410        ("pi", 1 << 50),
411        ("k", 1E3 as u64),
412        ("m", 1E6 as u64),
413        ("g", 1E9 as u64),
414        ("t", 1E12 as u64),
415        ("p", 1E15 as u64),
416    ];
417    let arg = arg.trim().to_ascii_lowercase();
418    ensure!(!arg.is_empty(), "empty string");
419
420    if arg.ends_with('%') {
421        let perc = arg[..arg.len() - 1].parse::<f64>()?;
422        ensure!((0.0..=100.0).contains(&perc), "percentage out of range");
423        return Ok(MemoryUsage::from_perc(perc));
424    }
425
426    let num_digits = arg
427        .chars()
428        .take_while(|c| c.is_ascii_digit() || *c == '.')
429        .count();
430
431    let number = arg[..num_digits].parse::<f64>()?;
432    let suffix = &arg[num_digits..].trim();
433
434    let prefix = suffix.strip_suffix('b').unwrap_or(suffix);
435    let multiplier = PREF_SYMS
436        .iter()
437        .find(|(x, _)| *x == prefix)
438        .map(|(_, m)| m)
439        .ok_or(anyhow!("invalid prefix symbol {}", suffix))?;
440
441    let value = (number * (*multiplier as f64)) as usize;
442    ensure!(value > 0, "batch size must be greater than zero");
443
444    if suffix.ends_with('b') {
445        Ok(MemoryUsage::MemorySize(value))
446    } else {
447        Ok(MemoryUsage::BatchSize(value))
448    }
449}
450
451#[derive(Args, Debug, Clone)]
452/// Shared CLI arguments for compression.
453pub struct CompressArgs {
454    /// The endianness of the graph to write
455    #[clap(short = 'E', long)]
456    pub endianness: Option<String>,
457
458    /// The compression windows
459    #[clap(short = 'w', long, default_value_t = 7)]
460    pub compression_window: usize,
461    /// The minimum interval length
462    #[clap(short = 'i', long, default_value_t = 4)]
463    pub min_interval_length: usize,
464    /// The maximum recursion depth for references (-1 for infinite recursion depth)
465    #[clap(short = 'r', long, default_value_t = 3)]
466    pub max_ref_count: isize,
467
468    #[arg(value_enum)]
469    #[clap(long, default_value = "gamma")]
470    /// The code to use for the outdegree
471    pub outdegrees: PrivCode,
472
473    #[arg(value_enum)]
474    #[clap(long, default_value = "unary")]
475    /// The code to use for the reference offsets
476    pub references: PrivCode,
477
478    #[arg(value_enum)]
479    #[clap(long, default_value = "gamma")]
480    /// The code to use for the blocks
481    pub blocks: PrivCode,
482
483    #[arg(value_enum)]
484    #[clap(long, default_value = "zeta3")]
485    /// The code to use for the residuals
486    pub residuals: PrivCode,
487
488    /// Whether to use Zuckerli's reference selection algorithm. This slows down the compression
489    /// process and requires more memory, but improves compression ratio and decoding speed.
490    #[clap(long)]
491    pub bvgraphz: bool,
492
493    /// How many nodes to process in a chunk; the default (10000) is usually a good
494    /// value.
495    #[clap(long, default_value = "10000")]
496    pub chunk_size: usize,
497}
498
499impl From<CompressArgs> for CompFlags {
500    fn from(value: CompressArgs) -> Self {
501        CompFlags {
502            outdegrees: value.outdegrees.into(),
503            references: value.references.into(),
504            blocks: value.blocks.into(),
505            intervals: PrivCode::Gamma.into(),
506            residuals: value.residuals.into(),
507            min_interval_length: value.min_interval_length,
508            compression_window: value.compression_window,
509            max_ref_count: match value.max_ref_count {
510                -1 => usize::MAX,
511                max_ref_count => {
512                    assert!(
513                        max_ref_count >= 0,
514                        "max_ref_count cannot be negative, except for -1, which means infinite recursion depth, but got {}",
515                        max_ref_count
516                    );
517                    value.max_ref_count as usize
518                }
519            },
520        }
521    }
522}
523
524/// Creates a [`ThreadPool`](rayon::ThreadPool) with the given number of threads.
525pub fn get_thread_pool(num_threads: usize) -> rayon::ThreadPool {
526    rayon::ThreadPoolBuilder::new()
527        .num_threads(num_threads)
528        .build()
529        .expect("Failed to create thread pool")
530}
531
/// Returns a copy of `path` whose last component has `s` appended to it.
///
/// # Panics
/// * Will panic if there is no filename.
/// * Will panic, when debug assertions are enabled, if the path has an
///   extension.
pub fn append(path: impl AsRef<Path>, s: impl AsRef<str>) -> PathBuf {
    let path = path.as_ref();
    debug_assert!(path.extension().is_none());
    let mut file_name = path.file_name().unwrap().to_os_string();
    file_name.push(s.as_ref());
    path.with_file_name(file_name)
}
545
546/// Creates all parent directories of the given file path.
547pub fn create_parent_dir(file_path: impl AsRef<Path>) -> Result<()> {
548    // ensure that the dst directory exists
549    if let Some(parent_dir) = file_path.as_ref().parent() {
550        std::fs::create_dir_all(parent_dir).with_context(|| {
551            format!(
552                "Failed to create the directory {:?}",
553                parent_dir.to_string_lossy()
554            )
555        })?;
556    }
557    Ok(())
558}
559
560/// Parses a duration from a string.
561/// For compatibility with Java, if no suffix is given, it is assumed to be in milliseconds.
562/// You can use suffixes, the available ones are:
563/// - `s` for seconds
564/// - `m` for minutes
565/// - `h` for hours
566/// - `d` for days
567///
568/// Example: `1d2h3m4s567` this is parsed as: 1 day, 2 hours, 3 minutes, 4 seconds, and 567 milliseconds.
569fn parse_duration(value: &str) -> Result<Duration> {
570    if value.is_empty() {
571        bail!("Empty duration string, if you want every 0 milliseconds use `0`.");
572    }
573    let mut duration = Duration::from_secs(0);
574    let mut acc = String::new();
575    for c in value.chars() {
576        if c.is_ascii_digit() {
577            acc.push(c);
578        } else if c.is_whitespace() {
579            continue;
580        } else {
581            let dur = acc.parse::<u64>()?;
582            match c {
583                's' => duration += Duration::from_secs(dur),
584                'm' => duration += Duration::from_secs(dur * 60),
585                'h' => duration += Duration::from_secs(dur * 60 * 60),
586                'd' => duration += Duration::from_secs(dur * 60 * 60 * 24),
587                _ => return Err(anyhow!("Invalid duration suffix: {}", c)),
588            }
589            acc.clear();
590        }
591    }
592    if !acc.is_empty() {
593        let dur = acc.parse::<u64>()?;
594        duration += Duration::from_millis(dur);
595    }
596    Ok(duration)
597}
598
599/// Initializes the `env_logger` logger with a custom format including
600/// timestamps with elapsed time since initialization.
601pub fn init_env_logger() -> Result<()> {
602    use jiff::SpanRound;
603    use jiff::fmt::friendly::{Designator, Spacing, SpanPrinter};
604
605    let mut builder =
606        env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"));
607
608    let start = std::time::Instant::now();
609    let printer = SpanPrinter::new()
610        .spacing(Spacing::None)
611        .designator(Designator::Compact);
612    let span_round = SpanRound::new()
613        .largest(jiff::Unit::Day)
614        .smallest(jiff::Unit::Millisecond)
615        .days_are_24_hours();
616
617    builder.format(move |buf, record| {
618        let Ok(ts) = jiff::Timestamp::try_from(SystemTime::now()) else {
619            return Err(std::io::Error::other("Failed to get timestamp"));
620        };
621        let style = buf.default_level_style(record.level());
622        let elapsed = start.elapsed();
623        let span = jiff::Span::new()
624            .seconds(elapsed.as_secs() as i64)
625            .milliseconds(elapsed.subsec_millis() as i64);
626        let span = span.round(span_round).expect("Failed to round span");
627        writeln!(
628            buf,
629            "{} {} {style}{}{style:#} [{:?}] {} - {}",
630            ts.strftime("%F %T%.3f"),
631            printer.span_to_string(&span),
632            record.level(),
633            std::thread::current().id(),
634            record.target(),
635            record.args()
636        )
637    });
638    builder.init();
639    Ok(())
640}
641
642#[derive(Args, Debug)]
643pub struct GlobalArgs {
644    #[arg(long, value_parser = parse_duration, global=true, display_order = 1000)]
645    /// How often to log progress. Default is 10s. You can use the suffixes "s"
646    /// for seconds, "m" for minutes, "h" for hours, and "d" for days. If no
647    /// suffix is provided it is assumed to be in milliseconds.
648    /// Example: "1d2h3m4s567" is parsed as 1 day + 2 hours + 3 minutes + 4
649    /// seconds + 567 milliseconds = 93784567 milliseconds.
650    pub log_interval: Option<Duration>,
651}
652
653#[derive(Subcommand, Debug)]
654pub enum SubCommands {
655    #[command(subcommand)]
656    Analyze(analyze::SubCommands),
657    #[command(subcommand)]
658    Bench(bench::SubCommands),
659    #[command(subcommand)]
660    Build(build::SubCommands),
661    #[command(subcommand)]
662    Check(check::SubCommands),
663    #[command(subcommand)]
664    From(from::SubCommands),
665    #[command(subcommand)]
666    Perm(perm::SubCommands),
667    #[command(subcommand)]
668    Run(run::SubCommands),
669    #[command(subcommand)]
670    To(to::SubCommands),
671    #[command(subcommand)]
672    Transform(transform::SubCommands),
673}
674
675#[derive(Parser, Debug)]
676#[command(name = "webgraph", version=build_info::version_string())]
677/// Webgraph tools to build, convert, modify, and analyze graphs.
678#[doc = include_str!("common_env.txt")]
679pub struct Cli {
680    #[command(subcommand)]
681    pub command: SubCommands,
682    #[clap(flatten)]
683    pub args: GlobalArgs,
684}
685
686pub mod dist;
687pub mod sccs;
688
689pub mod analyze;
690pub mod bench;
691pub mod build;
692pub mod check;
693pub mod from;
694pub mod perm;
695pub mod run;
696pub mod to;
697pub mod transform;
698
699/// The entry point of the command-line interface.
700pub fn cli_main<I, T>(args: I) -> Result<()>
701where
702    I: IntoIterator<Item = T>,
703    T: Into<std::ffi::OsString> + Clone,
704{
705    let start = std::time::Instant::now();
706    let cli = Cli::parse_from(args);
707    match cli.command {
708        SubCommands::Analyze(args) => {
709            analyze::main(cli.args, args)?;
710        }
711        SubCommands::Bench(args) => {
712            bench::main(cli.args, args)?;
713        }
714        SubCommands::Build(args) => {
715            build::main(cli.args, args, Cli::command())?;
716        }
717        SubCommands::Check(args) => {
718            check::main(cli.args, args)?;
719        }
720        SubCommands::From(args) => {
721            from::main(cli.args, args)?;
722        }
723        SubCommands::Perm(args) => {
724            perm::main(cli.args, args)?;
725        }
726        SubCommands::Run(args) => {
727            run::main(cli.args, args)?;
728        }
729        SubCommands::To(args) => {
730            to::main(cli.args, args)?;
731        }
732        SubCommands::Transform(args) => {
733            transform::main(cli.args, args)?;
734        }
735    }
736
737    log::info!(
738        "The command took {}",
739        pretty_print_elapsed(start.elapsed().as_secs_f64())
740    );
741
742    Ok(())
743}
744
/// Formats a number of seconds as weeks, days, hours, minutes, and seconds.
///
/// Units whose count is zero are omitted, except for seconds, which are
/// always printed with three decimal digits and followed by the raw value in
/// parentheses.
fn pretty_print_elapsed(elapsed: f64) -> String {
    // Units larger than a second, from the largest to the smallest.
    const UNITS: [(&str, u64); 4] = [
        ("week", 60 * 60 * 24 * 7),
        ("day", 60 * 60 * 24),
        ("hour", 60 * 60),
        ("minute", 60),
    ];

    let mut remaining = elapsed as u64;
    let mut out = String::new();
    for &(name, unit_seconds) in UNITS.iter() {
        let count = remaining / unit_seconds;
        remaining %= unit_seconds;
        match count {
            0 => {}
            1 => out.push_str(&format!("1 {} ", name)),
            _ => out.push_str(&format!("{} {}s ", count, name)),
        }
    }

    out.push_str(&format!("{:.3} seconds ({}s)", elapsed % 60.0, elapsed));
    out
}
782
783#[cfg(test)]
784mod tests {
785    use super::*;
786
787    mod float_vector_format {
788        use super::*;
789
790        #[test]
791        fn test_ascii_f64() {
792            let dir = tempfile::tempdir().unwrap();
793            let path = dir.path().join("test.txt");
794            let values: Vec<f64> = vec![1.5, 2.75, 3.0];
795            FloatVectorFormat::Ascii
796                .store(&path, &values, None)
797                .unwrap();
798            let content = std::fs::read_to_string(&path).unwrap();
799            // Default precision is f64::DIGITS (15)
800            for (line, expected) in content.lines().zip(&values) {
801                let parsed: f64 = line.trim().parse().unwrap();
802                assert!((parsed - expected).abs() < 1e-10);
803            }
804            assert_eq!(content.lines().count(), 3);
805        }
806
807        #[test]
808        fn test_ascii_f32() {
809            let dir = tempfile::tempdir().unwrap();
810            let path = dir.path().join("test.txt");
811            let values: Vec<f32> = vec![1.5, 2.75, 3.0];
812            FloatVectorFormat::Ascii
813                .store(&path, &values, None)
814                .unwrap();
815            let content = std::fs::read_to_string(&path).unwrap();
816            for (line, expected) in content.lines().zip(&values) {
817                let parsed: f32 = line.trim().parse().unwrap();
818                assert!((parsed - expected).abs() < 1e-6);
819            }
820        }
821
822        #[test]
823        fn test_ascii_with_precision() {
824            let dir = tempfile::tempdir().unwrap();
825            let path = dir.path().join("test.txt");
826            let values: Vec<f64> = vec![1.123456789, 2.987654321];
827            FloatVectorFormat::Ascii
828                .store(&path, &values, Some(3))
829                .unwrap();
830            let content = std::fs::read_to_string(&path).unwrap();
831            let lines: Vec<&str> = content.lines().collect();
832            assert_eq!(lines[0], "1.123");
833            assert_eq!(lines[1], "2.988");
834        }
835
836        #[test]
837        fn test_json_f64() {
838            let dir = tempfile::tempdir().unwrap();
839            let path = dir.path().join("test.json");
840            let values: Vec<f64> = vec![1.5, 2.75, 3.0];
841            FloatVectorFormat::Json.store(&path, &values, None).unwrap();
842            let content = std::fs::read_to_string(&path).unwrap();
843            let parsed: Vec<f64> = serde_json::from_str(&content).unwrap();
844            assert_eq!(parsed, values);
845        }
846
847        #[test]
848        fn test_json_with_precision() {
849            let dir = tempfile::tempdir().unwrap();
850            let path = dir.path().join("test.json");
851            let values: Vec<f64> = vec![1.123456789, 2.987654321];
852            FloatVectorFormat::Json
853                .store(&path, &values, Some(2))
854                .unwrap();
855            let content = std::fs::read_to_string(&path).unwrap();
856            assert_eq!(content, "[1.12, 2.99]");
857        }
858
859        #[test]
860        fn test_json_empty() {
861            let dir = tempfile::tempdir().unwrap();
862            let path = dir.path().join("test.json");
863            let values: Vec<f64> = vec![];
864            FloatVectorFormat::Json.store(&path, &values, None).unwrap();
865            let content = std::fs::read_to_string(&path).unwrap();
866            assert_eq!(content, "[]");
867        }
868
869        #[test]
870        fn test_json_single_element() {
871            let dir = tempfile::tempdir().unwrap();
872            let path = dir.path().join("test.json");
873            let values: Vec<f64> = vec![42.0];
874            FloatVectorFormat::Json.store(&path, &values, None).unwrap();
875            let content = std::fs::read_to_string(&path).unwrap();
876            let parsed: Vec<f64> = serde_json::from_str(&content).unwrap();
877            assert_eq!(parsed, values);
878        }
879
880        #[test]
881        fn test_java_f64() {
882            let dir = tempfile::tempdir().unwrap();
883            let path = dir.path().join("test.bin");
884            let values: Vec<f64> = vec![1.5, 2.75, 3.0];
885            FloatVectorFormat::Java.store(&path, &values, None).unwrap();
886            let bytes = std::fs::read(&path).unwrap();
887            assert_eq!(bytes.len(), 3 * 8);
888            for (i, expected) in values.iter().enumerate() {
889                let chunk: [u8; 8] = bytes[i * 8..(i + 1) * 8].try_into().unwrap();
890                let val = f64::from_be_bytes(chunk);
891                assert_eq!(val, *expected);
892            }
893        }
894
895        #[test]
896        fn test_java_f32() {
897            let dir = tempfile::tempdir().unwrap();
898            let path = dir.path().join("test.bin");
899            let values: Vec<f32> = vec![1.5, 2.75, 3.0];
900            FloatVectorFormat::Java.store(&path, &values, None).unwrap();
901            let bytes = std::fs::read(&path).unwrap();
902            assert_eq!(bytes.len(), 3 * 4);
903            for (i, expected) in values.iter().enumerate() {
904                let chunk: [u8; 4] = bytes[i * 4..(i + 1) * 4].try_into().unwrap();
905                let val = f32::from_be_bytes(chunk);
906                assert_eq!(val, *expected);
907            }
908        }
909
910        #[test]
911        fn test_epserde_f64() {
912            let dir = tempfile::tempdir().unwrap();
913            let path = dir.path().join("test.bin");
914            let values: Vec<f64> = vec![1.5, 2.75, 3.0];
915            FloatVectorFormat::Epserde
916                .store(&path, &values, None)
917                .unwrap();
918            // Just verify the file was created and is non-empty
919            let metadata = std::fs::metadata(&path).unwrap();
920            assert!(metadata.len() > 0);
921        }
922
923        #[test]
924        fn test_ascii_empty() {
925            let dir = tempfile::tempdir().unwrap();
926            let path = dir.path().join("test.txt");
927            let values: Vec<f64> = vec![];
928            FloatVectorFormat::Ascii
929                .store(&path, &values, None)
930                .unwrap();
931            let content = std::fs::read_to_string(&path).unwrap();
932            assert!(content.is_empty());
933        }
934
935        #[test]
936        fn test_creates_parent_dirs() {
937            let dir = tempfile::tempdir().unwrap();
938            let path = dir.path().join("a").join("b").join("test.txt");
939            let values: Vec<f64> = vec![1.0];
940            FloatVectorFormat::Ascii
941                .store(&path, &values, None)
942                .unwrap();
943            assert!(path.exists());
944        }
945    }
946
947    mod int_vector_format {
948        use super::*;
949
950        #[test]
951        fn test_ascii() {
952            let dir = tempfile::tempdir().unwrap();
953            let path = dir.path().join("test.txt");
954            let data: Vec<u64> = vec![10, 20, 30];
955            IntVectorFormat::Ascii.store(&path, &data, None).unwrap();
956            let content = std::fs::read_to_string(&path).unwrap();
957            let lines: Vec<u64> = content.lines().map(|l| l.trim().parse().unwrap()).collect();
958            assert_eq!(lines, data);
959        }
960
961        #[test]
962        fn test_ascii_empty() {
963            let dir = tempfile::tempdir().unwrap();
964            let path = dir.path().join("test.txt");
965            let data: Vec<u64> = vec![];
966            IntVectorFormat::Ascii.store(&path, &data, None).unwrap();
967            let content = std::fs::read_to_string(&path).unwrap();
968            assert!(content.is_empty());
969        }
970
971        #[test]
972        fn test_json() {
973            let dir = tempfile::tempdir().unwrap();
974            let path = dir.path().join("test.json");
975            let data: Vec<u64> = vec![10, 20, 30];
976            IntVectorFormat::Json.store(&path, &data, None).unwrap();
977            let content = std::fs::read_to_string(&path).unwrap();
978            let parsed: Vec<u64> = serde_json::from_str(&content).unwrap();
979            assert_eq!(parsed, data);
980        }
981
982        #[test]
983        fn test_json_empty() {
984            let dir = tempfile::tempdir().unwrap();
985            let path = dir.path().join("test.json");
986            let data: Vec<u64> = vec![];
987            IntVectorFormat::Json.store(&path, &data, None).unwrap();
988            let content = std::fs::read_to_string(&path).unwrap();
989            assert_eq!(content, "[]");
990        }
991
992        #[test]
993        fn test_json_single_element() {
994            let dir = tempfile::tempdir().unwrap();
995            let path = dir.path().join("test.json");
996            let data: Vec<u64> = vec![42];
997            IntVectorFormat::Json.store(&path, &data, None).unwrap();
998            let content = std::fs::read_to_string(&path).unwrap();
999            let parsed: Vec<u64> = serde_json::from_str(&content).unwrap();
1000            assert_eq!(parsed, data);
1001        }
1002
1003        #[test]
1004        fn test_java() {
1005            let dir = tempfile::tempdir().unwrap();
1006            let path = dir.path().join("test.bin");
1007            let data: Vec<u64> = vec![1, 256, 65535];
1008            IntVectorFormat::Java.store(&path, &data, None).unwrap();
1009            let bytes = std::fs::read(&path).unwrap();
1010            assert_eq!(bytes.len(), 3 * 8);
1011            for (i, expected) in data.iter().enumerate() {
1012                let chunk: [u8; 8] = bytes[i * 8..(i + 1) * 8].try_into().unwrap();
1013                let val = u64::from_be_bytes(chunk);
1014                assert_eq!(val, *expected);
1015            }
1016        }
1017
1018        #[test]
1019        fn test_java_empty() {
1020            let dir = tempfile::tempdir().unwrap();
1021            let path = dir.path().join("test.bin");
1022            let data: Vec<u64> = vec![];
1023            IntVectorFormat::Java.store(&path, &data, None).unwrap();
1024            let bytes = std::fs::read(&path).unwrap();
1025            assert!(bytes.is_empty());
1026        }
1027
1028        #[test]
1029        fn test_epserde() {
1030            let dir = tempfile::tempdir().unwrap();
1031            let path = dir.path().join("test.bin");
1032            let data: Vec<u64> = vec![10, 20, 30];
1033            IntVectorFormat::Epserde.store(&path, &data, None).unwrap();
1034            let metadata = std::fs::metadata(&path).unwrap();
1035            assert!(metadata.len() > 0);
1036        }
1037
1038        #[test]
1039        fn test_bitfieldvec() {
1040            let dir = tempfile::tempdir().unwrap();
1041            let path = dir.path().join("test.bin");
1042            let data: Vec<u64> = vec![1, 3, 7, 15];
1043            IntVectorFormat::BitFieldVec
1044                .store(&path, &data, Some(15))
1045                .unwrap();
1046            let metadata = std::fs::metadata(&path).unwrap();
1047            assert!(metadata.len() > 0);
1048        }
1049
1050        #[test]
1051        fn test_bitfieldvec_max_computed() {
1052            let dir = tempfile::tempdir().unwrap();
1053            let path = dir.path().join("test.bin");
1054            let data: Vec<u64> = vec![1, 3, 7, 15];
1055            // max is None, so it should be computed from data
1056            IntVectorFormat::BitFieldVec
1057                .store(&path, &data, None)
1058                .unwrap();
1059            assert!(path.exists());
1060        }
1061
1062        #[test]
1063        fn test_creates_parent_dirs() {
1064            let dir = tempfile::tempdir().unwrap();
1065            let path = dir.path().join("a").join("b").join("test.txt");
1066            let data: Vec<u64> = vec![1];
1067            IntVectorFormat::Ascii.store(&path, &data, None).unwrap();
1068            assert!(path.exists());
1069        }
1070
1071        #[cfg(target_pointer_width = "64")]
1072        #[test]
1073        fn test_store_usizes() {
1074            let dir = tempfile::tempdir().unwrap();
1075            let path = dir.path().join("test.txt");
1076            let data: Vec<usize> = vec![10, 20, 30];
1077            IntVectorFormat::Ascii
1078                .store_usizes(&path, &data, None)
1079                .unwrap();
1080            let content = std::fs::read_to_string(&path).unwrap();
1081            let lines: Vec<usize> = content.lines().map(|l| l.trim().parse().unwrap()).collect();
1082            assert_eq!(lines, data);
1083        }
1084
1085        #[cfg(target_pointer_width = "64")]
1086        #[test]
1087        fn test_store_usizes_java() {
1088            let dir = tempfile::tempdir().unwrap();
1089            let path = dir.path().join("test.bin");
1090            let data: Vec<usize> = vec![1, 256, 65535];
1091            IntVectorFormat::Java
1092                .store_usizes(&path, &data, None)
1093                .unwrap();
1094            let bytes = std::fs::read(&path).unwrap();
1095            assert_eq!(bytes.len(), 3 * 8);
1096            for (i, expected) in data.iter().enumerate() {
1097                let chunk: [u8; 8] = bytes[i * 8..(i + 1) * 8].try_into().unwrap();
1098                let val = u64::from_be_bytes(chunk) as usize;
1099                assert_eq!(val, *expected);
1100            }
1101        }
1102    }
1103}
webgraph_cli/lib.rs

webgraph_cli/
lib.rs