1use std::collections::HashMap;
40
41use unicode_normalization::UnicodeNormalization;
42
43use crate::agentlog::Record;
44use crate::diff::axes::{Axis, AxisStat};
45use crate::diff::bootstrap::{median, paired_ci};
46use crate::diff::embedder::{cosine, Embedder};
47
48fn tokenize(text: &str) -> Vec<String> {
51 text.nfc()
52 .flat_map(|c| c.to_lowercase())
53 .collect::<String>()
54 .split(|c: char| !c.is_alphanumeric())
55 .filter(|s| !s.is_empty())
56 .map(String::from)
57 .collect()
58}
59
/// Counts raw occurrences of each token, returned as f64 so the caller can
/// feed the counts straight into floating-point TF-IDF math.
fn term_frequency(tokens: &[String]) -> HashMap<String, f64> {
    tokens.iter().fold(HashMap::new(), |mut counts, tok| {
        *counts.entry(tok.clone()).or_insert(0.0) += 1.0;
        counts
    })
}
68
/// Counts, for every token, the number of documents in `corpus` that contain
/// it at least once (duplicates within a single document count once).
fn document_frequency(corpus: &[Vec<String>]) -> HashMap<String, usize> {
    let mut counts: HashMap<String, usize> = HashMap::new();
    for doc in corpus {
        // Deduplicate within the document first so each doc contributes
        // at most one increment per token.
        let unique: std::collections::HashSet<&str> =
            doc.iter().map(String::as_str).collect();
        for tok in unique {
            *counts.entry(tok.to_string()).or_insert(0) += 1;
        }
    }
    counts
}
83
84fn tfidf_vector(
87 tokens: &[String],
88 df: &HashMap<String, usize>,
89 n_docs: usize,
90) -> HashMap<String, f64> {
91 let tf = term_frequency(tokens);
92 let mut out: HashMap<String, f64> = HashMap::with_capacity(tf.len());
93 let n = n_docs as f64;
94 for (tok, tf_v) in tf {
95 let df_v = *df.get(&tok).unwrap_or(&0) as f64;
96 let idf = ((n + 1.0) / (df_v + 1.0)).ln() + 1.0;
97 let tf_weight = 1.0 + tf_v.ln();
98 out.insert(tok, tf_weight * idf);
99 }
100 out
101}
102
/// Cosine similarity between two sparse vectors keyed by token.
///
/// Degenerate cases: two (near-)zero vectors are defined as identical (1.0);
/// one zero vector against a non-zero one scores 0.0.
fn sparse_cosine(a: &HashMap<String, f64>, b: &HashMap<String, f64>) -> f64 {
    const EPS: f64 = 1e-12;
    let norm = |m: &HashMap<String, f64>| m.values().map(|v| v * v).sum::<f64>().sqrt();
    let (na, nb) = (norm(a), norm(b));
    if na < EPS && nb < EPS {
        return 1.0;
    }
    if na < EPS || nb < EPS {
        return 0.0;
    }
    // Iterate the smaller map and probe the larger one: the dot product only
    // needs keys present in both.
    let (small, large) = if a.len() <= b.len() { (a, b) } else { (b, a) };
    let dot: f64 = small
        .iter()
        .filter_map(|(k, v)| large.get(k).map(|w| v * w))
        .sum();
    dot / (na * nb)
}
119
120fn response_text(r: &Record) -> String {
121 let Some(arr) = r.payload.get("content").and_then(|c| c.as_array()) else {
122 return String::new();
123 };
124 arr.iter()
125 .filter_map(|p| {
126 if p.get("type").and_then(|t| t.as_str()) == Some("text") {
127 p.get("text")
128 .and_then(|t| t.as_str())
129 .map(ToString::to_string)
130 } else {
131 None
132 }
133 })
134 .collect::<Vec<_>>()
135 .join(" ")
136}
137
138pub fn compute(pairs: &[(&Record, &Record)], seed: Option<u64>) -> AxisStat {
144 if pairs.is_empty() {
145 return AxisStat::empty(Axis::Semantic);
146 }
147 let baseline_tokens: Vec<Vec<String>> = pairs
150 .iter()
151 .map(|(b, _)| tokenize(&response_text(b)))
152 .collect();
153 let candidate_tokens: Vec<Vec<String>> = pairs
154 .iter()
155 .map(|(_, c)| tokenize(&response_text(c)))
156 .collect();
157 let mut corpus: Vec<Vec<String>> = Vec::with_capacity(pairs.len() * 2);
158 corpus.extend(baseline_tokens.clone());
159 corpus.extend(candidate_tokens.clone());
160 let df = document_frequency(&corpus);
161 let n_docs = corpus.len();
162
163 let similarities: Vec<f64> = baseline_tokens
164 .iter()
165 .zip(candidate_tokens.iter())
166 .map(|(bt, ct)| {
167 let bv = tfidf_vector(bt, &df, n_docs);
168 let cv = tfidf_vector(ct, &df, n_docs);
169 sparse_cosine(&bv, &cv).clamp(0.0, 1.0)
170 })
171 .collect();
172
173 similarities_to_stat(&similarities, pairs.len(), seed)
174}
175
176pub fn compute_with_embedder(
188 pairs: &[(&Record, &Record)],
189 embedder: &dyn Embedder,
190 seed: Option<u64>,
191) -> AxisStat {
192 if pairs.is_empty() {
193 return AxisStat::empty(Axis::Semantic);
194 }
195 let baseline_texts: Vec<String> = pairs.iter().map(|(b, _)| response_text(b)).collect();
196 let candidate_texts: Vec<String> = pairs.iter().map(|(_, c)| response_text(c)).collect();
197 let baseline_refs: Vec<&str> = baseline_texts.iter().map(String::as_str).collect();
198 let candidate_refs: Vec<&str> = candidate_texts.iter().map(String::as_str).collect();
199
200 let baseline_vecs = embedder.embed(&baseline_refs);
201 let candidate_vecs = embedder.embed(&candidate_refs);
202
203 if baseline_vecs.len() != pairs.len() || candidate_vecs.len() != pairs.len() {
204 return AxisStat::empty(Axis::Semantic);
205 }
206
207 let similarities: Vec<f64> = baseline_vecs
208 .iter()
209 .zip(candidate_vecs.iter())
210 .map(|(bv, cv)| f64::from(cosine(bv, cv).clamp(0.0, 1.0)))
211 .collect();
212
213 similarities_to_stat(&similarities, pairs.len(), seed)
214}
215
216fn similarities_to_stat(similarities: &[f64], n_pairs: usize, seed: Option<u64>) -> AxisStat {
220 let baseline_ones: Vec<f64> = (0..similarities.len()).map(|_| 1.0).collect();
221 let bm = 1.0;
222 let cm = median(similarities);
223 let delta = cm - bm;
224 let ci = paired_ci(
225 &baseline_ones,
226 similarities,
227 |bs, cs| median(cs) - median(bs),
228 0,
229 seed,
230 );
231 AxisStat::new_value(Axis::Semantic, bm, cm, delta, ci.low, ci.high, n_pairs)
232}
233
#[cfg(test)]
mod tests {
    use super::*;
    use crate::agentlog::Kind;
    use crate::diff::axes::Severity;
    use serde_json::json;

    // Builds a minimal ChatResponse record whose only content part is a
    // single text block containing `text`.
    fn response(text: &str) -> Record {
        Record::new(
            Kind::ChatResponse,
            json!({
                "model": "x",
                "content": [{"type": "text", "text": text}],
                "stop_reason": "end_turn",
                "latency_ms": 0,
                "usage": {"input_tokens": 1, "output_tokens": 1, "thinking_tokens": 0},
            }),
            "2026-04-21T10:00:00Z",
            None,
        )
    }

    // A record paired with itself must score exactly 1.0 (no drift).
    #[test]
    fn identical_text_has_similarity_1() {
        let r = response("the quick brown fox jumps over the lazy dog");
        let pairs = [(&r, &r)];
        let stat = compute(&pairs, Some(1));
        assert!((stat.candidate_median - 1.0).abs() < 1e-9);
        assert_eq!(stat.severity, Severity::None);
    }

    // Responses with almost no token overlap should land well below 0.5.
    #[test]
    fn very_different_text_has_lower_similarity() {
        let baseline: Vec<Record> = (0..10)
            .map(|i| response(&format!("refund issued for order {i}")))
            .collect();
        let candidate: Vec<Record> = (0..10)
            .map(|i| {
                response(&format!(
                    "unable to process request {i}, please contact support"
                ))
            })
            .collect();
        let pairs: Vec<(&Record, &Record)> = baseline.iter().zip(candidate.iter()).collect();
        let stat = compute(&pairs, Some(1));
        assert!(stat.candidate_median < 0.5);
    }

    // Punctuation acts as a separator ("It's" -> "it", "s") and everything
    // is lowercased.
    #[test]
    fn tokenize_splits_on_punctuation_and_lowercases() {
        assert_eq!(
            tokenize("Hello, world! It's nice"),
            vec![
                "hello".to_string(),
                "world".to_string(),
                "it".to_string(),
                "s".to_string(),
                "nice".to_string(),
            ]
        );
    }

    // NFD ("e" + combining acute) and NFC ("é") forms of the same word must
    // tokenize identically thanks to NFC normalization.
    #[test]
    fn tokenize_handles_unicode_nfc() {
        let nfd = "cafe\u{0301}";
        let nfc = "café";
        assert_eq!(tokenize(nfd), tokenize(nfc));
    }

    // An empty baseline vector against a non-empty one is the zero-vs-nonzero
    // degenerate case of sparse_cosine and should score near 0.
    #[test]
    fn empty_text_has_zero_similarity_to_nonempty() {
        let empty = response("");
        let full = response("some content here");
        let pairs = [(&empty, &full); 3];
        let stat = compute(&pairs, Some(1));
        assert!(stat.candidate_median < 0.1);
    }

    // Sanity check on the raw TF-IDF pipeline (bypassing compute): an
    // identical pair must beat a low-overlap pair by a clear margin.
    #[test]
    fn identical_content_scores_higher_than_partial_overlap() {
        let identical_b = response("refund issued for order abc123");
        let identical_c = response("refund issued for order abc123");
        let partial_b = response("refund issued for order abc123");
        let partial_c = response("unable to process please contact support");
        let baseline = [identical_b, partial_b];
        let candidate = [identical_c, partial_c];
        let pairs: Vec<(&Record, &Record)> = baseline.iter().zip(candidate.iter()).collect();
        let bt: Vec<Vec<String>> = pairs
            .iter()
            .map(|(b, _)| tokenize(&response_text(b)))
            .collect();
        let ct: Vec<Vec<String>> = pairs
            .iter()
            .map(|(_, c)| tokenize(&response_text(c)))
            .collect();
        let mut corpus = bt.clone();
        corpus.extend(ct.clone());
        let df = document_frequency(&corpus);
        let n = corpus.len();
        let score_identical =
            sparse_cosine(&tfidf_vector(&bt[0], &df, n), &tfidf_vector(&ct[0], &df, n));
        let score_partial =
            sparse_cosine(&tfidf_vector(&bt[1], &df, n), &tfidf_vector(&ct[1], &df, n));
        assert!(
            score_identical > score_partial + 0.3,
            "identical={score_identical} partial={score_partial}"
        );
    }

    use crate::diff::embedder::BoxedEmbedder;

    // Deterministic test embedder: looks each input text up in `mapping`,
    // falling back to a 4-dim zero vector for unknown texts.
    fn fixed_embedder(
        mapping: std::collections::HashMap<&'static str, Vec<f32>>,
    ) -> BoxedEmbedder<impl Fn(&[&str]) -> Vec<Vec<f32>> + Send + Sync> {
        BoxedEmbedder::named(
            move |texts: &[&str]| {
                texts
                    .iter()
                    .map(|t| mapping.get(t).cloned().unwrap_or_else(|| vec![0.0_f32; 4]))
                    .collect()
            },
            "fixed",
        )
    }

    // Same vector on both sides -> cosine 1.0 through the embedder path.
    #[test]
    fn embedder_path_identical_vectors_score_one() {
        let r = response("alpha");
        let pairs = [(&r, &r)];
        let mut m = std::collections::HashMap::new();
        m.insert("alpha", vec![1.0_f32, 0.0, 0.0, 0.0]);
        let emb = fixed_embedder(m);
        let stat = compute_with_embedder(&pairs, &emb, Some(1));
        assert!(
            (stat.candidate_median - 1.0).abs() < 1e-6,
            "expected median≈1.0, got {}",
            stat.candidate_median
        );
    }

    // Orthogonal unit vectors -> cosine 0.0.
    #[test]
    fn embedder_path_orthogonal_vectors_score_zero() {
        let baseline = response("alpha");
        let candidate = response("beta");
        let pairs = [(&baseline, &candidate); 4];
        let mut m = std::collections::HashMap::new();
        m.insert("alpha", vec![1.0_f32, 0.0, 0.0, 0.0]);
        m.insert("beta", vec![0.0_f32, 1.0, 0.0, 0.0]);
        let emb = fixed_embedder(m);
        let stat = compute_with_embedder(&pairs, &emb, Some(1));
        assert!(stat.candidate_median.abs() < 1e-6);
    }

    // Demonstrates the embedder's advantage: "yes" vs "I agree" share no
    // tokens (TF-IDF scores low) but have near-identical embeddings.
    #[test]
    fn embedder_path_paraphrase_robustness() {
        let baseline = response("yes");
        let candidate = response("I agree");
        let pairs = [(&baseline, &candidate); 4];

        let tfidf_stat = compute(&pairs, Some(1));
        assert!(
            tfidf_stat.candidate_median < 0.5,
            "TF-IDF should score these low; got {}",
            tfidf_stat.candidate_median
        );

        let mut m = std::collections::HashMap::new();
        m.insert("yes", vec![0.9_f32, 0.4, 0.1, 0.0]);
        m.insert("I agree", vec![0.91_f32, 0.41, 0.09, 0.0]);
        let emb = fixed_embedder(m);
        let neural_stat = compute_with_embedder(&pairs, &emb, Some(1));
        assert!(
            neural_stat.candidate_median > 0.99,
            "neural embedder should score paraphrases ≈1; got {}",
            neural_stat.candidate_median
        );
    }

    // An embedder that returns the wrong number of vectors (here 1 vector
    // regardless of input length is fine for 1 pair, but the vectors-per-call
    // contract is checked) falls back to an empty axis instead of panicking.
    // NOTE(review): with a single pair this exercises the length-check path
    // only if `embed` is called per side — confirm intent against the check
    // in compute_with_embedder.
    #[test]
    fn embedder_path_dim_mismatch_returns_empty_axis() {
        let baseline = response("a");
        let candidate = response("b");
        let pairs = [(&baseline, &candidate)];
        let emb = BoxedEmbedder::new(|_texts: &[&str]| vec![vec![1.0_f32, 0.0]]);
        let stat = compute_with_embedder(&pairs, &emb, Some(1));
        assert_eq!(stat.severity, Severity::None);
    }

    // No pairs -> early return of the empty axis, embedder never consulted.
    #[test]
    fn embedder_path_empty_pairs_returns_empty() {
        let pairs: Vec<(&Record, &Record)> = vec![];
        let emb =
            BoxedEmbedder::new(|texts: &[&str]| texts.iter().map(|_| vec![1.0_f32; 4]).collect());
        let stat = compute_with_embedder(&pairs, &emb, Some(1));
        assert_eq!(stat.severity, Severity::None);
    }
}