swh-graph-stdlib 13.0.0

Library of algorithms and data structures for swh-graph
Documentation
// Copyright (C) 2026  The Software Heritage developers
// See the AUTHORS file at the top-level directory of this distribution
// License: GNU General Public License version 3, or any later version
// See top-level LICENSE file for more information

//! Functionalities related to managing software origins in the Software Heritage graph.
//!
//! Software origins (origins for short) represent places used to develop and distribute software
//! source code, such as version control system repositories and (source) package repositories.
//! Origins are represented in the Software Heritage graph as nodes with type [`NodeType::Origin`].

use std::sync::LazyLock;

use rapidhash::RapidHashMap;
use rayon::prelude::*;
use regex::Regex;
use smallvec::SmallVec;
use swh_graph::graph::*;
use swh_graph::{NodeType, SWHID, properties};

use crate::collections::{NodeSet, SmallNodeSet};

/// Regular expression matching any URI/URL protocol prefix, e.g., "http://".
pub static PROTOCOL_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^[a-zA-Z][a-zA-Z0-9+.-]*://").unwrap());

/// Normalize an origin URL to a canonical origin identifier.
///
/// Canonical origin identifiers are lowercase, stripped of protocol prefixes (hence: they are not
/// URLs anymore), and ".git" trailers. They can be used to compare origin URLs for (lax)
/// equivalence.
pub fn normalize_origin_url(mut url: &str) -> String {
    if let Some(m) = PROTOCOL_RE.find(url) {
        url = &url[m.end()..];
    }

    if url.ends_with(".git") {
        url = &url[0..url.len() - 4];
    }

    url.to_ascii_lowercase()
}

/// Search if a given set of origins, specified by URL, exist in the graph.
///
/// Matching is lax, based on normalized origin URLs, as per [`normalize_origin_url`].
///
/// Return an iterator over `(input_url_pos, origin_node)` pairs, where `input_url_pos` is the
/// position within `urls` of the matching input URL and `origin_node` is a matching origin node
/// in the graph. Multiple matches for the same input URL are possible; there is no guarantee they
/// will be returned contiguously by the iterator.
pub fn fuzzy_find_origins<G>(
    graph: &G,
    urls: &[String],
) -> impl ParallelIterator<Item = (usize, usize)>
where
    G: SwhGraphWithProperties<Maps: properties::Maps, Strings: properties::Strings> + Sync,
{
    // Mapping from normalized origin URLs to input URLs (as positions in [`urls`]).
    let mut queried_urls: RapidHashMap<String, SmallNodeSet> =
        RapidHashMap::with_capacity_and_hasher(urls.len(), Default::default());
    urls.iter().enumerate().for_each(|(pos, url)| {
        let norm_url = normalize_origin_url(url);
        queried_urls.entry(norm_url).or_default().insert(pos);
    });

    (0..graph.num_nodes())
        .into_par_iter()
        .filter(|&node| graph.properties().node_type(node) == NodeType::Origin)
        .filter_map(move |node| {
            // See https://gitlab.softwareheritage.org/swh/devel/swh-graph/-/issues/4813 for why,
            // currently, we can have origin nodes without URLs.
            let node_url = graph.properties().message(node)?;
            // Unwrap here should not fail, because origin URLs are UTF-8 safe.
            let norm_url = normalize_origin_url(&String::from_utf8(node_url).unwrap());
            Some(
                queried_urls
                    .get(&norm_url)?
                    .iter()
                    .map(|input_url_pos| (input_url_pos, node))
                    .collect::<SmallVec<[_; 1]>>(), // Avoid allocation in the common case
            )
        })
        .flatten_iter()
}

/// Lookup a single origin by URL, resorting to fuzzy matching, as per [`fuzzy_find_origins`] if
/// (and only if) needed. In case of multiple fuzzy matches, an arbitrary match is chosen.
///
/// Return the node id of the found origin in case of success.
pub fn fuzzy_find_origin<G>(graph: &G, origin_url: &str) -> Option<usize>
where
    G: SwhGraphWithProperties<Maps: properties::Maps, Strings: properties::Strings> + Sync,
{
    // First, try origin string as is
    let ori_swhid = SWHID::from_origin_url(origin_url);
    if let Ok(node) = graph.properties().node_id(ori_swhid) {
        return Some(node);
    }

    // If that didn't work, try fuzzy search and return an arbitrary match from there
    let origins = [origin_url.to_string()];
    let matches = fuzzy_find_origins(graph, &origins);
    if let Some((_, node)) = matches.find_any(|_| true) {
        return Some(node);
    }

    // Nothing worked, fail
    None
}