use crate::types::{CsrGraph, SimilarityScore};
use rustkernel_core::{domain::Domain, kernel::KernelMetadata, traits::GpuKernel};
use std::collections::HashSet;
#[derive(Debug, Clone)]
pub struct JaccardSimilarity {
metadata: KernelMetadata,
}
impl JaccardSimilarity {
#[must_use]
pub fn new() -> Self {
Self {
metadata: KernelMetadata::batch("graph/jaccard-similarity", Domain::GraphAnalytics)
.with_description("Jaccard similarity (neighbor set overlap)")
.with_throughput(100_000)
.with_latency_us(10.0),
}
}
pub fn compute_pair(graph: &CsrGraph, node_a: u64, node_b: u64) -> f64 {
let neighbors_a: HashSet<u64> = graph.neighbors(node_a).iter().copied().collect();
let neighbors_b: HashSet<u64> = graph.neighbors(node_b).iter().copied().collect();
let intersection = neighbors_a.intersection(&neighbors_b).count();
let union = neighbors_a.union(&neighbors_b).count();
if union == 0 {
0.0
} else {
intersection as f64 / union as f64
}
}
pub fn compute_all_pairs(
graph: &CsrGraph,
min_similarity: f64,
max_pairs: usize,
) -> Vec<SimilarityScore> {
let n = graph.num_nodes;
let mut results = Vec::new();
for i in 0..n {
for j in (i + 1)..n {
let similarity = Self::compute_pair(graph, i as u64, j as u64);
if similarity >= min_similarity {
results.push(SimilarityScore {
id_a: i as u64,
id_b: j as u64,
similarity,
});
if results.len() >= max_pairs {
return results;
}
}
}
}
results.sort_by(|a, b| {
b.similarity
.partial_cmp(&a.similarity)
.unwrap_or(std::cmp::Ordering::Equal)
});
results
}
pub fn top_k_pairs(graph: &CsrGraph, k: usize) -> Vec<SimilarityScore> {
Self::compute_all_pairs(graph, 0.0, k)
}
}
impl Default for JaccardSimilarity {
fn default() -> Self {
Self::new()
}
}
impl GpuKernel for JaccardSimilarity {
fn metadata(&self) -> &KernelMetadata {
&self.metadata
}
}
#[derive(Debug, Clone)]
pub struct CosineSimilarity {
metadata: KernelMetadata,
}
impl CosineSimilarity {
#[must_use]
pub fn new() -> Self {
Self {
metadata: KernelMetadata::batch("graph/cosine-similarity", Domain::GraphAnalytics)
.with_description("Cosine similarity (normalized dot product)")
.with_throughput(100_000)
.with_latency_us(10.0),
}
}
pub fn compute_pair(graph: &CsrGraph, node_a: u64, node_b: u64) -> f64 {
let neighbors_a: HashSet<u64> = graph.neighbors(node_a).iter().copied().collect();
let neighbors_b: HashSet<u64> = graph.neighbors(node_b).iter().copied().collect();
let intersection = neighbors_a.intersection(&neighbors_b).count() as f64;
let norm = (neighbors_a.len() as f64 * neighbors_b.len() as f64).sqrt();
if norm == 0.0 {
0.0
} else {
intersection / norm
}
}
pub fn compute_all_pairs(
graph: &CsrGraph,
min_similarity: f64,
max_pairs: usize,
) -> Vec<SimilarityScore> {
let n = graph.num_nodes;
let mut results = Vec::new();
for i in 0..n {
for j in (i + 1)..n {
let similarity = Self::compute_pair(graph, i as u64, j as u64);
if similarity >= min_similarity {
results.push(SimilarityScore {
id_a: i as u64,
id_b: j as u64,
similarity,
});
if results.len() >= max_pairs {
return results;
}
}
}
}
results.sort_by(|a, b| {
b.similarity
.partial_cmp(&a.similarity)
.unwrap_or(std::cmp::Ordering::Equal)
});
results
}
}
impl Default for CosineSimilarity {
fn default() -> Self {
Self::new()
}
}
impl GpuKernel for CosineSimilarity {
fn metadata(&self) -> &KernelMetadata {
&self.metadata
}
}
#[derive(Debug, Clone)]
pub struct AdamicAdarIndex {
metadata: KernelMetadata,
}
impl AdamicAdarIndex {
#[must_use]
pub fn new() -> Self {
Self {
metadata: KernelMetadata::batch("graph/adamic-adar", Domain::GraphAnalytics)
.with_description("Adamic-Adar index (weighted common neighbors)")
.with_throughput(100_000)
.with_latency_us(10.0),
}
}
pub fn compute_pair(graph: &CsrGraph, node_a: u64, node_b: u64) -> f64 {
let neighbors_a: HashSet<u64> = graph.neighbors(node_a).iter().copied().collect();
let neighbors_b: HashSet<u64> = graph.neighbors(node_b).iter().copied().collect();
let common_neighbors = neighbors_a.intersection(&neighbors_b);
common_neighbors
.map(|&z| {
let degree = graph.out_degree(z) as f64;
if degree > 1.0 { 1.0 / degree.ln() } else { 0.0 }
})
.sum()
}
pub fn compute_all_pairs(
graph: &CsrGraph,
min_score: f64,
max_pairs: usize,
) -> Vec<SimilarityScore> {
let n = graph.num_nodes;
let mut results = Vec::new();
for i in 0..n {
for j in (i + 1)..n {
let score = Self::compute_pair(graph, i as u64, j as u64);
if score >= min_score {
results.push(SimilarityScore {
id_a: i as u64,
id_b: j as u64,
similarity: score,
});
if results.len() >= max_pairs {
return results;
}
}
}
}
results.sort_by(|a, b| {
b.similarity
.partial_cmp(&a.similarity)
.unwrap_or(std::cmp::Ordering::Equal)
});
results
}
pub fn top_k_pairs(graph: &CsrGraph, k: usize) -> Vec<SimilarityScore> {
Self::compute_all_pairs(graph, 0.0, k)
}
}
impl Default for AdamicAdarIndex {
fn default() -> Self {
Self::new()
}
}
impl GpuKernel for AdamicAdarIndex {
fn metadata(&self) -> &KernelMetadata {
&self.metadata
}
}
#[derive(Debug, Clone)]
pub struct CommonNeighbors {
metadata: KernelMetadata,
}
impl CommonNeighbors {
#[must_use]
pub fn new() -> Self {
Self {
metadata: KernelMetadata::batch("graph/common-neighbors", Domain::GraphAnalytics)
.with_description("Common neighbors count")
.with_throughput(200_000)
.with_latency_us(5.0),
}
}
pub fn compute_pair(graph: &CsrGraph, node_a: u64, node_b: u64) -> usize {
let neighbors_a: HashSet<u64> = graph.neighbors(node_a).iter().copied().collect();
let neighbors_b: HashSet<u64> = graph.neighbors(node_b).iter().copied().collect();
neighbors_a.intersection(&neighbors_b).count()
}
pub fn compute_all_pairs(
graph: &CsrGraph,
min_count: usize,
max_pairs: usize,
) -> Vec<SimilarityScore> {
let n = graph.num_nodes;
let mut results = Vec::new();
for i in 0..n {
for j in (i + 1)..n {
let count = Self::compute_pair(graph, i as u64, j as u64);
if count >= min_count {
results.push(SimilarityScore {
id_a: i as u64,
id_b: j as u64,
similarity: count as f64,
});
if results.len() >= max_pairs {
return results;
}
}
}
}
results.sort_by(|a, b| {
b.similarity
.partial_cmp(&a.similarity)
.unwrap_or(std::cmp::Ordering::Equal)
});
results
}
}
impl Default for CommonNeighbors {
fn default() -> Self {
Self::new()
}
}
impl GpuKernel for CommonNeighbors {
fn metadata(&self) -> &KernelMetadata {
&self.metadata
}
}
#[cfg(test)]
mod tests {
use super::*;
fn create_test_graph() -> CsrGraph {
CsrGraph::from_edges(
6,
&[
(0, 1),
(1, 0),
(1, 2),
(2, 1),
(0, 3),
(3, 0),
(1, 4),
(4, 1),
(2, 5),
(5, 2),
(3, 4),
(4, 3),
(4, 5),
(5, 4),
],
)
}
#[test]
fn test_jaccard_similarity_metadata() {
let kernel = JaccardSimilarity::new();
assert_eq!(kernel.metadata().id, "graph/jaccard-similarity");
assert_eq!(kernel.metadata().domain, Domain::GraphAnalytics);
}
#[test]
fn test_jaccard_similarity_pair() {
let graph = create_test_graph();
let sim = JaccardSimilarity::compute_pair(&graph, 0, 2);
assert!(
(sim - 1.0 / 3.0).abs() < 0.01,
"Expected ~0.33, got {}",
sim
);
}
#[test]
fn test_cosine_similarity_pair() {
let graph = create_test_graph();
let sim = CosineSimilarity::compute_pair(&graph, 0, 2);
assert!((sim - 0.5).abs() < 0.01, "Expected 0.5, got {}", sim);
}
#[test]
fn test_adamic_adar_pair() {
let graph = create_test_graph();
let aa = AdamicAdarIndex::compute_pair(&graph, 0, 2);
assert!(aa > 0.0, "Expected positive Adamic-Adar score, got {}", aa);
let aa_no_common = AdamicAdarIndex::compute_pair(&graph, 0, 5);
assert_eq!(aa_no_common, 0.0);
}
#[test]
fn test_common_neighbors_pair() {
let graph = create_test_graph();
let count = CommonNeighbors::compute_pair(&graph, 0, 2);
assert_eq!(count, 1);
let count = CommonNeighbors::compute_pair(&graph, 0, 1);
assert_eq!(count, 0);
}
#[test]
fn test_jaccard_all_pairs() {
let graph = create_test_graph();
let pairs = JaccardSimilarity::compute_all_pairs(&graph, 0.0, 100);
assert!(!pairs.is_empty());
for i in 1..pairs.len() {
assert!(pairs[i - 1].similarity >= pairs[i].similarity);
}
}
}
#[derive(Debug, Clone)]
pub struct ValueDistribution {
pub node_count: usize,
pub bin_count: usize,
pub distributions: Vec<f64>,
pub bin_edges: Vec<f64>,
pub strategy: BinningStrategy,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BinningStrategy {
EqualWidth,
Logarithmic,
Quantile,
}
impl ValueDistribution {
pub fn new(node_count: usize, bin_count: usize) -> Self {
Self {
node_count,
bin_count,
distributions: vec![0.0; node_count * bin_count],
bin_edges: vec![0.0; bin_count + 1],
strategy: BinningStrategy::EqualWidth,
}
}
pub fn from_values(values: &[Vec<f64>], bin_count: usize) -> Self {
let node_count = values.len();
let (min_val, max_val) = values
.iter()
.flat_map(|v| v.iter())
.fold((f64::INFINITY, f64::NEG_INFINITY), |(min, max), &v| {
(min.min(v), max.max(v))
});
let range = max_val - min_val;
let bin_width = if range > 0.0 {
range / bin_count as f64
} else {
1.0
};
let mut dist = Self::new(node_count, bin_count);
for i in 0..=bin_count {
dist.bin_edges[i] = min_val + i as f64 * bin_width;
}
dist.bin_edges[bin_count] = max_val + 0.001;
for (node, node_values) in values.iter().enumerate() {
if node_values.is_empty() {
continue;
}
for &v in node_values {
let bin = ((v - min_val) / bin_width).floor() as usize;
let bin = bin.min(bin_count - 1);
dist.distributions[node * bin_count + bin] += 1.0;
}
let sum: f64 = dist.distributions[node * bin_count..(node + 1) * bin_count]
.iter()
.sum();
if sum > 0.0 {
for b in 0..bin_count {
dist.distributions[node * bin_count + b] /= sum;
}
}
}
dist
}
pub fn get_distribution(&self, node: usize) -> &[f64] {
let start = node * self.bin_count;
&self.distributions[start..start + self.bin_count]
}
}
#[derive(Debug, Clone)]
pub struct ValueSimilarityResult {
pub node_a: usize,
pub node_b: usize,
pub similarity: f64,
pub distance: f64,
}
#[derive(Debug, Clone)]
pub struct ValueSimilarity {
metadata: KernelMetadata,
}
impl Default for ValueSimilarity {
fn default() -> Self {
Self::new()
}
}
impl ValueSimilarity {
#[must_use]
pub fn new() -> Self {
Self {
metadata: KernelMetadata::batch("graph/value-similarity", Domain::GraphAnalytics)
.with_description("Value distribution similarity via JSD/Wasserstein")
.with_throughput(25_000)
.with_latency_us(800.0),
}
}
pub fn jensen_shannon_divergence(p: &[f64], q: &[f64]) -> f64 {
assert_eq!(p.len(), q.len(), "Distributions must have same length");
let epsilon = 1e-10;
let mut kl_pm = 0.0;
let mut kl_qm = 0.0;
for i in 0..p.len() {
let m = 0.5 * (p[i] + q[i]);
if p[i] > epsilon && m > epsilon {
kl_pm += p[i] * (p[i] / m).ln();
}
if q[i] > epsilon && m > epsilon {
kl_qm += q[i] * (q[i] / m).ln();
}
}
0.5 * kl_pm + 0.5 * kl_qm
}
pub fn jsd_similarity(p: &[f64], q: &[f64]) -> f64 {
let jsd = Self::jensen_shannon_divergence(p, q);
1.0 - (jsd / 2.0_f64.ln()).sqrt()
}
pub fn wasserstein_distance(p: &[f64], q: &[f64]) -> f64 {
assert_eq!(p.len(), q.len(), "Distributions must have same length");
let mut cdf_p = 0.0;
let mut cdf_q = 0.0;
let mut w1 = 0.0;
for i in 0..p.len() {
cdf_p += p[i];
cdf_q += q[i];
w1 += (cdf_p - cdf_q).abs();
}
w1
}
pub fn wasserstein_similarity(p: &[f64], q: &[f64]) -> f64 {
let w1 = Self::wasserstein_distance(p, q);
1.0 / (1.0 + w1)
}
pub fn compute_all_pairs_jsd(
distributions: &ValueDistribution,
min_similarity: f64,
max_pairs: usize,
) -> Vec<ValueSimilarityResult> {
let n = distributions.node_count;
let mut results = Vec::new();
for i in 0..n {
for j in (i + 1)..n {
let p = distributions.get_distribution(i);
let q = distributions.get_distribution(j);
let jsd = Self::jensen_shannon_divergence(p, q);
let similarity = 1.0 - (jsd / 2.0_f64.ln()).sqrt();
if similarity >= min_similarity {
results.push(ValueSimilarityResult {
node_a: i,
node_b: j,
similarity,
distance: jsd,
});
if results.len() >= max_pairs {
return results;
}
}
}
}
results.sort_by(|a, b| {
b.similarity
.partial_cmp(&a.similarity)
.unwrap_or(std::cmp::Ordering::Equal)
});
results
}
pub fn compute_all_pairs_wasserstein(
distributions: &ValueDistribution,
min_similarity: f64,
max_pairs: usize,
) -> Vec<ValueSimilarityResult> {
let n = distributions.node_count;
let mut results = Vec::new();
for i in 0..n {
for j in (i + 1)..n {
let p = distributions.get_distribution(i);
let q = distributions.get_distribution(j);
let w1 = Self::wasserstein_distance(p, q);
let similarity = 1.0 / (1.0 + w1);
if similarity >= min_similarity {
results.push(ValueSimilarityResult {
node_a: i,
node_b: j,
similarity,
distance: w1,
});
if results.len() >= max_pairs {
return results;
}
}
}
}
results.sort_by(|a, b| {
b.similarity
.partial_cmp(&a.similarity)
.unwrap_or(std::cmp::Ordering::Equal)
});
results
}
pub fn find_similar_nodes(
distributions: &ValueDistribution,
target_node: usize,
min_similarity: f64,
top_k: usize,
) -> Vec<ValueSimilarityResult> {
let n = distributions.node_count;
let p = distributions.get_distribution(target_node);
let mut results = Vec::new();
for i in 0..n {
if i == target_node {
continue;
}
let q = distributions.get_distribution(i);
let similarity = Self::jsd_similarity(p, q);
if similarity >= min_similarity {
results.push(ValueSimilarityResult {
node_a: target_node,
node_b: i,
similarity,
distance: Self::jensen_shannon_divergence(p, q),
});
}
}
results.sort_by(|a, b| {
b.similarity
.partial_cmp(&a.similarity)
.unwrap_or(std::cmp::Ordering::Equal)
});
results.into_iter().take(top_k).collect()
}
}
impl GpuKernel for ValueSimilarity {
fn metadata(&self) -> &KernelMetadata {
&self.metadata
}
}
#[cfg(test)]
mod value_similarity_tests {
use super::*;
#[test]
fn test_value_similarity_metadata() {
let kernel = ValueSimilarity::new();
assert_eq!(kernel.metadata().id, "graph/value-similarity");
assert_eq!(kernel.metadata().domain, Domain::GraphAnalytics);
}
#[test]
fn test_jsd_identical_distributions() {
let p = vec![0.25, 0.25, 0.25, 0.25];
let q = vec![0.25, 0.25, 0.25, 0.25];
let jsd = ValueSimilarity::jensen_shannon_divergence(&p, &q);
assert!(
jsd.abs() < 0.001,
"JSD of identical distributions should be 0"
);
}
#[test]
fn test_jsd_different_distributions() {
let p = vec![1.0, 0.0, 0.0, 0.0];
let q = vec![0.0, 0.0, 0.0, 1.0];
let jsd = ValueSimilarity::jensen_shannon_divergence(&p, &q);
assert!(
jsd > 0.6,
"JSD should be high for very different distributions"
);
}
#[test]
fn test_jsd_similarity() {
let p = vec![0.25, 0.25, 0.25, 0.25];
let q = vec![0.25, 0.25, 0.25, 0.25];
let sim = ValueSimilarity::jsd_similarity(&p, &q);
assert!(
(sim - 1.0).abs() < 0.01,
"Identical distributions should have similarity 1.0"
);
}
#[test]
fn test_wasserstein_identical() {
let p = vec![0.25, 0.25, 0.25, 0.25];
let q = vec![0.25, 0.25, 0.25, 0.25];
let w1 = ValueSimilarity::wasserstein_distance(&p, &q);
assert!(
w1.abs() < 0.001,
"Wasserstein of identical distributions should be 0"
);
}
#[test]
fn test_wasserstein_shifted() {
let p = vec![1.0, 0.0, 0.0, 0.0];
let q = vec![0.0, 1.0, 0.0, 0.0];
let w1 = ValueSimilarity::wasserstein_distance(&p, &q);
assert!((w1 - 1.0).abs() < 0.01);
}
#[test]
fn test_value_distribution_from_values() {
let values = vec![vec![1.0, 2.0, 3.0], vec![2.0, 3.0, 4.0]];
let dist = ValueDistribution::from_values(&values, 4);
assert_eq!(dist.node_count, 2);
assert_eq!(dist.bin_count, 4);
let sum0: f64 = dist.get_distribution(0).iter().sum();
let sum1: f64 = dist.get_distribution(1).iter().sum();
assert!((sum0 - 1.0).abs() < 0.01);
assert!((sum1 - 1.0).abs() < 0.01);
}
#[test]
fn test_find_similar_nodes() {
let values = vec![
vec![1.0, 2.0, 3.0],
vec![1.0, 2.0, 3.0], vec![10.0, 11.0, 12.0], ];
let dist = ValueDistribution::from_values(&values, 5);
let similar = ValueSimilarity::find_similar_nodes(&dist, 0, 0.5, 5);
assert!(!similar.is_empty());
assert_eq!(similar[0].node_b, 1);
}
}