swh_graph_stdlib/origins.rs
1// Copyright (C) 2026 The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6//! Functionalities related to managing software origins in the Software Heritage graph.
7//!
8//! Software origins (origins for short) represent places used to develop and distribute software
9//! source code, such as version control system repositories and (source) package repositories.
10//! Origins are represented in the Software Heritage graph as nodes with type [`NodeType::Origin`].
11
12use std::sync::LazyLock;
13
14use rapidhash::RapidHashMap;
15use rayon::prelude::*;
16use regex::Regex;
17use smallvec::SmallVec;
18use swh_graph::graph::*;
19use swh_graph::{NodeType, properties};
20
21use crate::collections::{NodeSet, SmallNodeSet};
22
23/// Regular expression matching any URI/URL protocol prefix, e.g., "http://".
24pub static PROTOCOL_RE: LazyLock<Regex> =
25 LazyLock::new(|| Regex::new(r"^[a-zA-Z][a-zA-Z0-9+.-]*://").unwrap());
26
27/// Normalize an origin URL to a canonical origin identifier.
28///
29/// Canonical origin identifiers are lowercase, stripped of protocol prefixes (hence: they are not
30/// URLs anymore), and ".git" trailers. They can be used to compare origin URLs for (lax)
31/// equivalence.
32pub fn normalize_origin_url(mut url: &str) -> String {
33 if let Some(m) = PROTOCOL_RE.find(url) {
34 url = &url[m.end()..];
35 }
36
37 if url.ends_with(".git") {
38 url = &url[0..url.len() - 4];
39 }
40
41 url.to_ascii_lowercase()
42}
43
44/// Search if a given set of origins, specified by URL, exist in the graph.
45///
46/// Matching is lax, based on normalized origin URLs, as per [`normalize_origin_url`].
47///
48/// Return an iterator over `(input_url_pos, origin_node)` pairs, where `input_url_pos` is the
49/// position within `urls` of the matching input URL and `origin_node` is a matching origin node
50/// in the graph. Multiple matches for the same input URL are possible; there is no guarantee they
51/// will be returned contiguously by the iterator.
52pub fn fuzzy_find_origins<G>(
53 graph: &G,
54 urls: &[String],
55) -> impl ParallelIterator<Item = (usize, usize)>
56where
57 G: SwhGraphWithProperties<Maps: properties::Maps, Strings: properties::Strings> + Sync,
58{
59 // Mapping from normalized origin URLs to input URLs (as positions in [`urls`]).
60 let mut queried_urls: RapidHashMap<String, SmallNodeSet> =
61 RapidHashMap::with_capacity_and_hasher(urls.len(), Default::default());
62 urls.iter().enumerate().for_each(|(pos, url)| {
63 let norm_url = normalize_origin_url(url);
64 queried_urls.entry(norm_url).or_default().insert(pos);
65 });
66
67 (0..graph.num_nodes())
68 .into_par_iter()
69 .filter(|&node| graph.properties().node_type(node) == NodeType::Origin)
70 .filter_map(move |node| {
71 // See https://gitlab.softwareheritage.org/swh/devel/swh-graph/-/issues/4813 for why,
72 // currently, we can have origin nodes without URLs.
73 let node_url = graph.properties().message(node)?;
74 // Unwrap here should not fail, because origin URLs are UTF-8 safe.
75 let norm_url = normalize_origin_url(&String::from_utf8(node_url).unwrap());
76 Some(
77 queried_urls
78 .get(&norm_url)?
79 .iter()
80 .map(|input_url_pos| (input_url_pos, node))
81 .collect::<SmallVec<[_; 1]>>(), // Avoid allocation in the common case
82 )
83 })
84 .flatten_iter()
85}