Skip to main content

swh_graph_stdlib/
origins.rs

1// Copyright (C) 2026  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6//! Functionalities related to managing software origins in the Software Heritage graph.
7//!
8//! Software origins (origins for short) represent places used to develop and distribute software
9//! source code, such as version control system repositories and (source) package repositories.
10//! Origins are represented in the Software Heritage graph as nodes with type [`NodeType::Origin`].
11
12use std::sync::LazyLock;
13
14use rapidhash::RapidHashMap;
15use rayon::prelude::*;
16use regex::Regex;
17use smallvec::SmallVec;
18use swh_graph::graph::*;
19use swh_graph::{NodeType, properties};
20
21use crate::collections::{NodeSet, SmallNodeSet};
22
23/// Regular expression matching any URI/URL protocol prefix, e.g., "http://".
24pub static PROTOCOL_RE: LazyLock<Regex> =
25    LazyLock::new(|| Regex::new(r"^[a-zA-Z][a-zA-Z0-9+.-]*://").unwrap());
26
27/// Normalize an origin URL to a canonical origin identifier.
28///
29/// Canonical origin identifiers are lowercase, stripped of protocol prefixes (hence: they are not
30/// URLs anymore), and ".git" trailers. They can be used to compare origin URLs for (lax)
31/// equivalence.
32pub fn normalize_origin_url(mut url: &str) -> String {
33    if let Some(m) = PROTOCOL_RE.find(url) {
34        url = &url[m.end()..];
35    }
36
37    if url.ends_with(".git") {
38        url = &url[0..url.len() - 4];
39    }
40
41    url.to_ascii_lowercase()
42}
43
44/// Search if a given set of origins, specified by URL, exist in the graph.
45///
46/// Matching is lax, based on normalized origin URLs, as per [`normalize_origin_url`].
47///
48/// Return an iterator over `(input_url_pos, origin_node)` pairs, where `input_url_pos` is the
49/// position within `urls` of the matching input URL and `origin_node` is a matching origin node
50/// in the graph. Multiple matches for the same input URL are possible; there is no guarantee they
51/// will be returned contiguously by the iterator.
52pub fn fuzzy_find_origins<G>(
53    graph: &G,
54    urls: &[String],
55) -> impl ParallelIterator<Item = (usize, usize)>
56where
57    G: SwhGraphWithProperties<Maps: properties::Maps, Strings: properties::Strings> + Sync,
58{
59    // Mapping from normalized origin URLs to input URLs (as positions in [`urls`]).
60    let mut queried_urls: RapidHashMap<String, SmallNodeSet> =
61        RapidHashMap::with_capacity_and_hasher(urls.len(), Default::default());
62    urls.iter().enumerate().for_each(|(pos, url)| {
63        let norm_url = normalize_origin_url(url);
64        queried_urls.entry(norm_url).or_default().insert(pos);
65    });
66
67    (0..graph.num_nodes())
68        .into_par_iter()
69        .filter(|&node| graph.properties().node_type(node) == NodeType::Origin)
70        .filter_map(move |node| {
71            // See https://gitlab.softwareheritage.org/swh/devel/swh-graph/-/issues/4813 for why,
72            // currently, we can have origin nodes without URLs.
73            let node_url = graph.properties().message(node)?;
74            // Unwrap here should not fail, because origin URLs are UTF-8 safe.
75            let norm_url = normalize_origin_url(&String::from_utf8(node_url).unwrap());
76            Some(
77                queried_urls
78                    .get(&norm_url)?
79                    .iter()
80                    .map(|input_url_pos| (input_url_pos, node))
81                    .collect::<SmallVec<[_; 1]>>(), // Avoid allocation in the common case
82            )
83        })
84        .flatten_iter()
85}