use serde::{Deserialize, Serialize};
use crate::stats::mean;
pub fn cosine_similarity(a: &[f64], b: &[f64], center: bool) -> Option<f64> {
let n = a.len().min(b.len());
if n < 2 {
return None;
}
let (ma, mb) = if center {
(mean(&a[..n]), mean(&b[..n]))
} else {
(0.0, 0.0)
};
let mut dot = 0.0;
let mut na = 0.0;
let mut nb = 0.0;
for i in 0..n {
let da = a[i] - ma;
let db = b[i] - mb;
dot += da * db;
na += da * da;
nb += db * db;
}
if na == 0.0 || nb == 0.0 {
return None;
}
Some((dot / (na.sqrt() * nb.sqrt())).clamp(-1.0, 1.0))
}
/// Result produced by `classify_rediscovery` for one submitted series.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct RediscoveryVerdict {
/// `true` when a nearest known series was selected and `max_similarity`
/// is greater than or equal to `threshold`.
pub is_rediscovery: bool,
/// Largest absolute cosine similarity recorded against the known series;
/// `0.0` when none was recorded.
pub max_similarity: f64,
/// Position in the `known` slice of the series that produced
/// `max_similarity`, if one was selected.
pub nearest_index: Option<usize>,
/// The threshold the caller supplied, echoed back unchanged.
pub threshold: f64,
}
/// Default similarity cutoff (`0.97`) that callers can pass as the `threshold`
/// argument of `classify_rediscovery`.
pub const DEFAULT_REDISCOVERY_THRESHOLD: f64 = 0.97;
pub fn classify_rediscovery(
submitted: &[f64],
known: &[Vec<f64>],
threshold: f64,
center: bool,
) -> RediscoveryVerdict {
let mut max_similarity = 0.0_f64;
let mut nearest_index = None;
for (i, k) in known.iter().enumerate() {
if let Some(c) = cosine_similarity(submitted, k, center) {
let abs = c.abs();
if abs > max_similarity {
max_similarity = abs;
nearest_index = Some(i);
}
}
}
RediscoveryVerdict {
is_rediscovery: nearest_index.is_some() && max_similarity >= threshold,
max_similarity,
nearest_index,
threshold,
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `lhs` and `rhs` differ by less than an absolute `1e-12`.
    fn nearly_eq(lhs: f64, rhs: f64) -> bool {
        let gap = (lhs - rhs).abs();
        gap < 1e-12
    }

    /// Deterministic sine-shaped series of `len` samples; `phase` offsets the wave.
    fn make_stream(phase: f64, len: usize) -> Vec<f64> {
        let mut samples = Vec::with_capacity(len);
        for step in 0..len {
            samples.push((step as f64 * 0.37 + phase).sin() * 0.01 + 0.001);
        }
        samples
    }

    /// Copy of `xs` with every element multiplied by `factor`.
    fn scaled_by(xs: &[f64], factor: f64) -> Vec<f64> {
        let mut out = Vec::with_capacity(xs.len());
        for x in xs {
            out.push(x * factor);
        }
        out
    }

    /// Copy of `xs` with every element sign-flipped.
    fn negated(xs: &[f64]) -> Vec<f64> {
        let mut out = Vec::with_capacity(xs.len());
        for x in xs {
            out.push(-x);
        }
        out
    }

    // A series compared with itself has cosine 1.
    #[test]
    fn identical_is_cosine_one() {
        let series = make_stream(1.0, 50);
        let cos = cosine_similarity(&series, &series, false).unwrap();
        assert!(nearly_eq(cos, 1.0));
    }

    // Positive scaling does not change direction.
    #[test]
    fn scaled_stream_stays_collinear() {
        let base = make_stream(2.0, 50);
        let tripled = scaled_by(&base, 3.0);
        let cos = cosine_similarity(&base, &tripled, false).unwrap();
        assert!(nearly_eq(cos, 1.0));
    }

    // A sign-flipped copy points the opposite way.
    #[test]
    fn inverse_is_cosine_minus_one() {
        let base = make_stream(3.0, 50);
        let flipped = negated(&base);
        let cos = cosine_similarity(&base, &flipped, false).unwrap();
        assert!(nearly_eq(cos, -1.0));
    }

    // An all-zero vector has no direction, so the cosine is undefined.
    #[test]
    fn zero_norm_is_undefined() {
        let series = make_stream(1.0, 40);
        let zeros = vec![0.0; 40];
        let outcome = cosine_similarity(&series, &zeros, false);
        assert!(outcome.is_none());
    }

    // A doubled copy with tiny perturbations must still be flagged.
    #[test]
    fn near_duplicate_flags() {
        let known = make_stream(5.0, 60);
        let mut submitted = Vec::with_capacity(known.len());
        for (i, x) in known.iter().enumerate() {
            submitted.push(x * 2.0 + 1e-6 * (i as f64).cos());
        }
        let library = [known];
        let v = classify_rediscovery(&submitted, &library, DEFAULT_REDISCOVERY_THRESHOLD, false);
        assert!(v.is_rediscovery, "leveraged clone should flag: {v:?}");
        assert_eq!(v.nearest_index, Some(0));
        assert!(v.max_similarity >= DEFAULT_REDISCOVERY_THRESHOLD);
    }

    // The classifier looks at the absolute cosine, so an inverted copy flags too.
    #[test]
    fn inverse_clone_flags_on_abs() {
        let known = make_stream(7.0, 60);
        let flipped = negated(&known);
        let library = [known];
        let v = classify_rediscovery(&flipped, &library, DEFAULT_REDISCOVERY_THRESHOLD, false);
        assert!(v.is_rediscovery, "an inverse clone is non-novel too: {v:?}");
    }

    // A series with a different frequency and phase stays below the cutoff.
    #[test]
    fn novel_stream_does_not_flag() {
        let known = make_stream(1.0, 80);
        let mut novel: Vec<f64> = Vec::with_capacity(80);
        for i in 0..80 {
            novel.push((i as f64 * 0.91 + 13.0).cos() * 0.008 - 0.0004);
        }
        let library = [known];
        let v = classify_rediscovery(&novel, &library, DEFAULT_REDISCOVERY_THRESHOLD, false);
        assert!(
            !v.is_rediscovery,
            "an independent stream must not flag: {v:?}"
        );
        assert!(v.max_similarity < DEFAULT_REDISCOVERY_THRESHOLD);
    }

    // With nothing to compare against, the verdict is the neutral default.
    #[test]
    fn empty_library_never_flags() {
        let submitted = make_stream(1.0, 30);
        let nothing_known: Vec<Vec<f64>> = Vec::new();
        let v = classify_rediscovery(
            &submitted,
            &nothing_known,
            DEFAULT_REDISCOVERY_THRESHOLD,
            false,
        );
        assert!(!v.is_rediscovery);
        assert_eq!(v.nearest_index, None);
        assert_eq!(v.max_similarity, 0.0);
    }

    // With several candidates, the reported index is the closest one.
    #[test]
    fn picks_nearest_of_several() {
        let target = make_stream(4.0, 60);
        let decoy = make_stream(20.0, 60);
        let submitted = scaled_by(&target, 1.5);
        let library = [decoy, target];
        let v = classify_rediscovery(&submitted, &library, DEFAULT_REDISCOVERY_THRESHOLD, false);
        assert_eq!(
            v.nearest_index,
            Some(1),
            "should match the target, not decoy"
        );
        assert!(v.is_rediscovery);
    }
}