use serde::{Deserialize, Serialize};
use std::collections::HashSet;
/// Computes the Jaccard similarity of two strings over their character-trigram sets.
///
/// Both inputs are converted to sets of trigrams by `trigrams`, and the score is
/// the size of the intersection divided by the size of the union, so it always
/// lies in `0.0..=1.0`.
///
/// Two empty inputs are treated as identical and score `1.0`. An empty input
/// compared with a non-empty one scores `0.0`.
pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
    let set_a = trigrams(a);
    let set_b = trigrams(b);
    if set_a.is_empty() && set_b.is_empty() {
        return 1.0;
    }

    let shared = set_a.intersection(&set_b).count();
    // Inclusion-exclusion gives the union size without walking both sets a
    // second time. At least one set is non-empty past the guard above, so the
    // divisor is never zero.
    let combined = set_a.len() + set_b.len() - shared;
    shared as f64 / combined as f64
}
/// Builds the set of distinct character trigrams of `input`.
///
/// Every character contributes one window `[previous, current, next]`. At the
/// start and end of the string the missing neighbour is filled with `'\0'`.
/// Windows are formed over `char`s rather than bytes, so multi-byte characters
/// are handled as single units. An empty input yields an empty set.
fn trigrams(input: &str) -> HashSet<[char; 3]> {
    // One NUL sentinel on each side puts every real character in the middle
    // of exactly one 3-wide window.
    let padded: Vec<char> = std::iter::once('\0')
        .chain(input.chars())
        .chain(std::iter::once('\0'))
        .collect();

    // For an empty input only the two sentinels are present, which is too
    // short for any window, so the resulting set is empty.
    padded
        .windows(3)
        .map(|w| [w[0], w[1], w[2]])
        .collect()
}
/// Outcome of comparing an original text against a rewritten version of it.
///
/// Values are produced by `PreservationVerdict::evaluate`.
///
/// The serde attributes select an internally tagged representation: the
/// variant name is written in snake_case under a `verdict` key, alongside the
/// variant's own fields.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "verdict", rename_all = "snake_case")]
pub enum PreservationVerdict {
/// The texts differ, and `evaluate` found the similarity `score` to be at
/// least `threshold`.
Preserved { score: f64, threshold: f64 },
/// The texts differ, and `evaluate` found the similarity `score` to be
/// below `threshold`.
Rejected { score: f64, threshold: f64 },
/// The two texts compared equal; `byte_len` is the length of the original
/// in bytes.
Unchanged { byte_len: usize },
}
impl PreservationVerdict {
    /// Classifies `rewritten` relative to `original`.
    ///
    /// Identical strings yield `Unchanged` without computing a score.
    /// Otherwise the trigram Jaccard similarity of the two strings is compared
    /// with `threshold`, which is first clamped into `0.0..=1.0`. A score at
    /// or above the clamped threshold gives `Preserved`, and anything else
    /// gives `Rejected`. Both variants record the score and the clamped
    /// threshold.
    pub fn evaluate(original: &str, rewritten: &str, threshold: f64) -> Self {
        if original == rewritten {
            let byte_len = original.len();
            return Self::Unchanged { byte_len };
        }

        let threshold = threshold.clamp(0.0, 1.0);
        let score = jaccard_similarity(original, rewritten);

        // Keep the test phrased as `>=`: `clamp` passes a NaN threshold
        // through, and every comparison against NaN is false, so such a
        // threshold ends up rejected.
        let meets_threshold = score >= threshold;
        if !meets_threshold {
            return Self::Rejected { score, threshold };
        }
        Self::Preserved { score, threshold }
    }

    /// Returns `true` for `Preserved` and `Unchanged`, and `false` for `Rejected`.
    pub fn is_accepted(&self) -> bool {
        match self {
            Self::Preserved { .. } | Self::Unchanged { .. } => true,
            Self::Rejected { .. } => false,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A string compared with itself shares every trigram, so the score is 1.
    #[test]
    fn identical_strings_score_one() {
        let s = "the quick brown fox jumps over the lazy dog";
        assert!((jaccard_similarity(s, s) - 1.0).abs() < f64::EPSILON);
    }

    /// Strings with no characters in common share no trigrams.
    #[test]
    fn completely_different_strings_score_zero_or_near_zero() {
        let a = "aaaaaaaaaa";
        let b = "zzzzzzzzzz";
        assert!(jaccard_similarity(a, b) < 0.05);
    }

    /// Strings sharing only some words land strictly between the extremes.
    #[test]
    fn partial_overlap_scores_between_zero_and_one() {
        let a = "the quick brown fox jumps";
        let b = "the slow brown cat sleeps";
        let score = jaccard_similarity(a, b);
        assert!(score > 0.0 && score < 1.0, "got {score}");
    }

    /// Two empty strings are treated as identical.
    #[test]
    fn both_empty_score_one() {
        assert!((jaccard_similarity("", "") - 1.0).abs() < f64::EPSILON);
    }

    /// An empty string has no trigrams, so it shares nothing with a non-empty one.
    #[test]
    fn one_empty_scores_zero() {
        assert!(jaccard_similarity("hello", "").abs() < f64::EPSILON);
        assert!(jaccard_similarity("", "hello").abs() < f64::EPSILON);
    }

    /// Multi-byte characters must not cause a panic.
    #[test]
    fn unicode_strings_do_not_panic() {
        let a = "ç日本語";
        let b = "ç中文";
        let _ = jaccard_similarity(a, b);
    }

    /// A small edit above the threshold is classified as `Preserved`.
    #[test]
    fn verdict_preserved_when_above_threshold() {
        let v = PreservationVerdict::evaluate("hello world", "hello world!", 0.5);
        assert!(v.is_accepted());
        assert!(matches!(v, PreservationVerdict::Preserved { .. }));
    }

    /// Identical inputs short-circuit to `Unchanged` with the byte length recorded.
    #[test]
    fn verdict_unchanged_for_identical() {
        let v = PreservationVerdict::evaluate("same", "same", 0.9);
        assert!(v.is_accepted());
        assert!(matches!(v, PreservationVerdict::Unchanged { byte_len: 4 }));
    }

    /// Out-of-range thresholds are clamped into `0.0..=1.0` before use.
    #[test]
    fn threshold_clamped_out_of_range() {
        let v = PreservationVerdict::evaluate("abc", "abc", 99.0);
        assert!(v.is_accepted());

        let v = PreservationVerdict::evaluate("abc", "xyz", -5.0);
        assert!(v.is_accepted());
        // The recorded threshold must be the clamped value, not the raw input.
        assert!(matches!(
            v,
            PreservationVerdict::Preserved { threshold, .. } if threshold.abs() < f64::EPSILON
        ));

        // Identical inputs never reach the threshold comparison, so the upper
        // bound is checked with inputs that differ.
        let v = PreservationVerdict::evaluate("abc", "abd", 99.0);
        assert!(!v.is_accepted());
        assert!(matches!(
            v,
            PreservationVerdict::Rejected { threshold, .. }
                if (threshold - 1.0).abs() < f64::EPSILON
        ));

        let v = PreservationVerdict::evaluate("abc", "abcd", 0.0);
        assert!(
            v.is_accepted(),
            "single-char append is mostly the same body"
        );
    }

    /// A rewrite on an unrelated topic must fall below a 0.7 threshold.
    #[test]
    fn g29_repro_evaluates_rejected_when_diverges() {
        let original = "JWT token rotation strategy with 15-min expiry and refresh flow";
        let drifted = "The weather in Tokyo is sunny today with mild temperatures expected";
        let v = PreservationVerdict::evaluate(original, drifted, 0.7);
        assert!(!v.is_accepted(), "should reject hallucinated rewrite");
    }
}