use std::sync::LazyLock;
use rapidhash::RapidHashMap;
use rayon::prelude::*;
use regex::Regex;
use smallvec::SmallVec;
use swh_graph::graph::*;
use swh_graph::{NodeType, SWHID, properties};
use crate::collections::{NodeSet, SmallNodeSet};
/// Regular expression matching a URL scheme prefix anchored at the start of a string:
/// one ASCII letter, followed by any number of ASCII letters, digits, `+`, `.` or `-`,
/// followed by `://` (for example `https://` or `git+ssh://`).
///
/// Compiled lazily on first use. [`normalize_origin_url`] uses it to strip the scheme.
pub static PROTOCOL_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"^[a-zA-Z][a-zA-Z0-9+.-]*://").unwrap());
/// Returns a normalized form of an origin URL, suitable for loose comparison.
///
/// The normalization applies, in order:
///
/// 1. removal of a leading scheme (anything matched by [`PROTOCOL_RE`], e.g. `https://`);
/// 2. removal of a single trailing `.git`;
/// 3. conversion of ASCII letters to lowercase (non-ASCII characters are left untouched).
pub fn normalize_origin_url(url: &str) -> String {
    let without_scheme = match PROTOCOL_RE.find(url) {
        Some(scheme) => &url[scheme.end()..],
        None => url,
    };
    without_scheme
        .strip_suffix(".git")
        .unwrap_or(without_scheme)
        .to_ascii_lowercase()
}
/// Finds the origin nodes whose normalized URL equals the normalized form of one of `urls`.
///
/// Both the queried URLs and the URLs read from the graph are passed through
/// [`normalize_origin_url`] before being compared.
///
/// Returns a parallel iterator of `(position in urls, node id)` pairs. A single queried
/// URL may produce zero, one, or several pairs, and a single node is reported once for
/// every position in `urls` that shares its normalized form.
///
/// Every node id in `0..graph.num_nodes()` is visited, so callers with several URLs
/// should pass them all in one call rather than calling this once per URL.
///
/// Origin nodes without a message, or whose message is not valid UTF-8, are skipped.
pub fn fuzzy_find_origins<G>(
    graph: &G,
    urls: &[String],
) -> impl ParallelIterator<Item = (usize, usize)>
where
    G: SwhGraphWithProperties<Maps: properties::Maps, Strings: properties::Strings> + Sync,
{
    // Several input URLs can collapse to the same normalized form, so each normalized URL
    // maps to the set of positions in `urls` it came from.
    let mut queried_urls: RapidHashMap<String, SmallNodeSet> =
        RapidHashMap::with_capacity_and_hasher(urls.len(), Default::default());
    for (pos, url) in urls.iter().enumerate() {
        queried_urls
            .entry(normalize_origin_url(url))
            .or_default()
            .insert(pos);
    }

    (0..graph.num_nodes())
        .into_par_iter()
        .filter(|&node| graph.properties().node_type(node) == NodeType::Origin)
        .filter_map(move |node| {
            let node_url = graph.properties().message(node)?;
            // A URL that is not valid UTF-8 can never equal a normalized query (which is
            // always a valid `String`), so skip the node instead of panicking mid-scan.
            let node_url = String::from_utf8(node_url).ok()?;
            let norm_url = normalize_origin_url(&node_url);
            Some(
                queried_urls
                    .get(&norm_url)?
                    .iter()
                    .map(|input_url_pos| (input_url_pos, node))
                    .collect::<SmallVec<[_; 1]>>(),
            )
        })
        .flatten_iter()
}
/// Looks up the node id of the origin identified by `origin_url`.
///
/// The SWHID computed from `origin_url` is looked up in the graph first. Only if that
/// lookup fails does this fall back to [`fuzzy_find_origins`], returning any one of the
/// nodes it reports (not necessarily the same one on every call when several match).
///
/// Returns `None` when neither the exact lookup nor the fuzzy search finds a node.
pub fn fuzzy_find_origin<G>(graph: &G, origin_url: &str) -> Option<usize>
where
    G: SwhGraphWithProperties<Maps: properties::Maps, Strings: properties::Strings> + Sync,
{
    match graph
        .properties()
        .node_id(SWHID::from_origin_url(origin_url))
    {
        // Exact hit: no need to search the graph.
        Ok(node) => Some(node),
        Err(_) => {
            let candidates = [origin_url.to_string()];
            let hit = fuzzy_find_origins(graph, &candidates).find_any(|_| true);
            hit.map(|(_, node)| node)
        }
    }
}