// swh-graph-stdlib 13.0.0
//
// Library of algorithms and data structures for swh-graph
// Copyright (C) 2025-2026  The Software Heritage developers
// See the AUTHORS file at the top-level directory of this distribution
// License: GNU General Public License version 3, or any later version
// See top-level LICENSE file for more information

#![cfg(feature = "unstable_project_ids")]

use std::collections::{HashMap, HashSet};
use std::path::Path;

use anyhow::{Context, Result};
use dsi_bitstream::traits::BigEndian;
use epserde::deser::Deserialize;
use lender::Lender;
#[allow(unused_imports)] // to keep debug!() around
use log::{self, debug, info, warn};
use sux::array::partial_array::{PartialArray, SparseIndex};
use swh_graph::graph_builder::GraphBuilder;
use swh_graph::swhid;
use webgraph::prelude::{BvGraphSeq, MemoryFlags};
use webgraph::traits::SequentialLabeling;

use swh_graph_stdlib::project_ids::compute_project_ids;

type SparsePartialArray<T> = PartialArray<T, SparseIndex<Box<[usize]>>>;

/// Loads an epserde-serialized sparse [`PartialArray`] from `path` and
/// returns its defined entries as a `node -> project` map.
fn load_partialarray_to_map(path: &Path) -> Result<HashMap<usize, usize>> {
    let array = unsafe { SparsePartialArray::<usize>::mmap(path, Default::default()) }
        .with_context(|| format!("Could not mmap or deserialize {}", path.display()))?;
    let array = array.uncase();

    let mut map = HashMap::new();
    for node in 0..array.len() {
        // Only nodes present in the sparse array get an entry.
        if let Some(project) = array.get(node) {
            map.insert(node, *project);
        }
    }
    Ok(map)
}

/// Loads a sequential BvGraph from `path` and returns the set of all its
/// `(source, destination)` arcs.
fn load_bvgraph_to_set(path: &Path) -> anyhow::Result<HashSet<(usize, usize)>> {
    let graph = BvGraphSeq::with_basename(path)
        .endianness::<BigEndian>()
        .flags(MemoryFlags::default())
        .load()?;

    // The graph exposes a lending iterator, which cannot be flattened into a
    // regular iterator, so accumulate the arc pairs node by node.
    let mut pairs = HashSet::new();
    let mut nodes = graph.iter();
    while let Some((src, successors)) = nodes.next() {
        pairs.extend(successors.into_iter().map(|dst| (src, dst)));
    }
    Ok(pairs)
}

/// Checks project-id computation on a graph with several disjoint revision
/// histories: a merge of two root commits (a, b, c) and two branches off a
/// single root commit (d, e, f), plus some non-revision padding nodes.
#[test]
fn multiple_connected_components() -> anyhow::Result<()> {
    let mut builder = GraphBuilder::default();

    // add some nodes at the beginning, to cause rev ids to be larger than the number of revs,
    // which can trigger some bugs
    let d1 = builder
        .node(swhid!(swh:1:dir:0000000000000000000000000000000000000000))
        .unwrap()
        .done();
    let d2 = builder
        .node(swhid!(swh:1:dir:0000000000000000000000000000000000000001))
        .unwrap()
        .done();
    let d3 = builder
        .node(swhid!(swh:1:dir:0000000000000000000000000000000000000002))
        .unwrap()
        .done();
    let d4 = builder
        .node(swhid!(swh:1:dir:0000000000000000000000000000000000000003))
        .unwrap()
        .done();
    builder.arc(d1, d2);
    builder.arc(d2, d3);
    builder.arc(d3, d4);

    let a = builder
        .node(swhid!(swh:1:rev:0000000000000000000000000000000000000010))
        .unwrap()
        .done();
    let b = builder
        .node(swhid!(swh:1:rev:0000000000000000000000000000000000000020))
        .unwrap()
        .done();
    let c = builder
        .node(swhid!(swh:1:rev:0000000000000000000000000000000000000030))
        .unwrap()
        .done();
    let d = builder
        .node(swhid!(swh:1:rev:0000000000000000000000000000000000000040))
        .unwrap()
        .done();
    let e = builder
        .node(swhid!(swh:1:rev:0000000000000000000000000000000000000050))
        .unwrap()
        .done();
    let f = builder
        .node(swhid!(swh:1:rev:0000000000000000000000000000000000000060))
        .unwrap()
        .done();
    // b is a commit that merges two root commits a and c
    builder.arc(b, a);
    builder.arc(b, c);
    // e and f are commits that branch out from the same root commit d
    builder.arc(e, d);
    builder.arc(f, d);
    let graph = builder.done().unwrap();

    // Keep the TempDir guard alive for the whole test so the directory is
    // cleaned up on drop; `.into_path()` would leak it on disk.
    let output_dir = tempfile::tempdir()?;
    let bvgraph_path = output_dir.path().join("graph");
    let (report, project_ids) = compute_project_ids(&graph, 5, &bvgraph_path)?;

    let pairs = load_bvgraph_to_set(&bvgraph_path.join("rev_to_roots"))?;

    let expected_set: HashSet<(usize, usize)> =
        vec![(a, a), (b, a), (b, c), (c, c), (d, d), (e, d), (f, d)]
            .into_iter()
            .collect();
    assert_eq!(pairs, expected_set);

    let pairs = load_partialarray_to_map(&bvgraph_path.join("rev_to_project_id"))?;

    let expected_set: HashMap<usize, usize> = vec![(a, 0), (b, 1), (c, 2), (d, 3), (e, 3), (f, 3)]
        .into_iter()
        .collect();
    assert_eq!(pairs, expected_set);

    let pairs = load_bvgraph_to_set(&bvgraph_path.join("project_id_to_roots"))?;

    let expected_set: HashSet<(usize, usize)> = vec![(0, a), (1, a), (1, c), (2, c), (3, d)]
        .into_iter()
        .collect();
    assert_eq!(pairs, expected_set);

    assert_eq!(
        report.to_string(),
        "\
Statistics about root revisions\n\
----------------------------------\n\
Revisions: 6\n\
Initial revisions: 3 (50.00%)\n\
Initial revisions per reached revision:\n  \
1: 5 (83.33%)\n  \
2: 1 (16.67%)\n  \
3: 0 (0.00%)\n  \
4: 0 (0.00%)\n  \
5+: 0 (0.00%)\nMaximum number of root revisions reaching a revision: 2\n"
    );

    let expected_project_ids =
        HashMap::from([(vec![a], 0), (vec![a, c], 1), (vec![c], 2), (vec![d], 3)]);
    assert_eq!(project_ids, expected_project_ids);
    Ok(())
}

/// Tests a diamond merge: d merges b and c, both of which come from a
#[test]
fn diamond_graph() -> anyhow::Result<()> {
    let mut builder = GraphBuilder::default();
    let a = builder
        .node(swhid!(swh:1:rev:0000000000000000000000000000000000000010))
        .unwrap()
        .done();
    let b = builder
        .node(swhid!(swh:1:rev:0000000000000000000000000000000000000020))
        .unwrap()
        .done();
    let c = builder
        .node(swhid!(swh:1:rev:0000000000000000000000000000000000000030))
        .unwrap()
        .done();
    let d = builder
        .node(swhid!(swh:1:rev:0000000000000000000000000000000000000040))
        .unwrap()
        .done();
    // a is root, b and c branch from a, d merges b and c
    builder.arc(b, a);
    builder.arc(c, a);
    builder.arc(d, b);
    builder.arc(d, c);
    let graph = builder.done().unwrap();

    // Keep the TempDir guard alive for the whole test so the directory is
    // cleaned up on drop; `.into_path()` would leak it on disk.
    let output_dir = tempfile::tempdir()?;
    let bvgraph_path = output_dir.path().join("graph");
    let (_report, project_ids) = compute_project_ids(&graph, 5, &bvgraph_path)?;

    // All revisions reach the same root revision a
    let pairs = load_bvgraph_to_set(&bvgraph_path.join("rev_to_roots"))?;
    let expected: HashSet<_> = vec![(a, a), (b, a), (c, a), (d, a)].into_iter().collect();
    assert_eq!(pairs, expected);

    // All revisions share the same project id
    let pairs = load_partialarray_to_map(&bvgraph_path.join("rev_to_project_id"))?;
    let expected: HashMap<_, _> = vec![(a, 0), (b, 0), (c, 0), (d, 0)].into_iter().collect();
    assert_eq!(pairs, expected);

    let expected_project_ids = HashMap::from([(vec![a], 0)]);
    assert_eq!(project_ids, expected_project_ids);

    Ok(())
}

/// Tests merge commits pulling in many unrelated histories: 4 root revisions,
/// one commit merging 3 of them and one merging all 4 — each distinct set of
/// reachable roots must get its own project id.
#[test]
fn merge_many_unrelated_histories() -> anyhow::Result<()> {
    let mut builder = GraphBuilder::default();
    // 4 root revisions
    let a = builder
        .node(swhid!(swh:1:rev:0000000000000000000000000000000000000010))
        .unwrap()
        .done();
    let b = builder
        .node(swhid!(swh:1:rev:0000000000000000000000000000000000000020))
        .unwrap()
        .done();
    let c = builder
        .node(swhid!(swh:1:rev:0000000000000000000000000000000000000030))
        .unwrap()
        .done();
    let d = builder
        .node(swhid!(swh:1:rev:0000000000000000000000000000000000000040))
        .unwrap()
        .done();
    // merge commits
    let merge_abc = builder
        .node(swhid!(swh:1:rev:0000000000000000000000000000000000000050))
        .unwrap()
        .done();
    let merge_abcd = builder
        .node(swhid!(swh:1:rev:0000000000000000000000000000000000000060))
        .unwrap()
        .done();

    // merge_abc merges a, b, c (3 root revisions)
    builder.arc(merge_abc, a);
    builder.arc(merge_abc, b);
    builder.arc(merge_abc, c);
    // merge_abcd merges all 4 root revisions
    builder.arc(merge_abcd, a);
    builder.arc(merge_abcd, b);
    builder.arc(merge_abcd, c);
    builder.arc(merge_abcd, d);
    let graph = builder.done().unwrap();

    // Keep the TempDir guard alive for the whole test so the directory is
    // cleaned up on drop; `.into_path()` would leak it on disk.
    let output_dir = tempfile::tempdir()?;
    let bvgraph_path = output_dir.path().join("graph");
    let (_report, project_ids) = compute_project_ids(&graph, 5, &bvgraph_path)?;

    let pairs = load_bvgraph_to_set(&bvgraph_path.join("rev_to_roots"))?;
    let expected: HashSet<_> = vec![
        (a, a),
        (b, b),
        (c, c),
        (d, d),
        (merge_abc, a),
        (merge_abc, b),
        (merge_abc, c),
        (merge_abcd, a),
        (merge_abcd, b),
        (merge_abcd, c),
        (merge_abcd, d),
    ]
    .into_iter()
    .collect();
    assert_eq!(pairs, expected);

    // Each root revision has its own project id, merge_abc and merge_abcd have unique project ids
    let expected_project_ids = HashMap::from([
        (vec![a], 0),
        (vec![b], 1),
        (vec![c], 2),
        (vec![d], 3),
        (vec![a, b, c], 4),
        (vec![a, b, c, d], 5),
    ]);
    assert_eq!(project_ids, expected_project_ids);

    Ok(())
}