// shadow_core/diff/semantic.rs

//! Axis 1: final-output semantic similarity.
//!
//! Two paths are supported:
//!
//! 1. **TF-IDF cosine** (default, no extra deps) — smoothed sklearn-style
//!    TF-IDF over the corpus of response texts being compared. Lexical:
//!    word-level overlap weighted by token rarity. Fast, deterministic,
//!    blind to paraphrase ("yes" vs "I agree" score 0).
//! 2. **Pluggable [`Embedder`]** — any backend that produces dense
//!    vectors per text. Use [`compute_with_embedder`] and pass an
//!    [`Embedder`] impl. Suitable for ONNX runtimes, HF Inference API
//!    clients, OpenAI/Cohere embeddings, in-house services, or a
//!    PyO3 callback into Python `sentence-transformers`.
//!
//! Both paths use the same downstream cosine + paired-CI machinery, so
//! reports from either embedder are directly comparable.
//!
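//! ## Example
//!
//! A minimal sketch of both entry points, assuming matched
//! baseline/candidate `Record` pairs are already in hand
//! (`my_model.embed_batch` is a hypothetical stand-in for a real
//! embedding backend):
//!
//! ```ignore
//! // Default lexical path: no extra dependencies.
//! let stat = compute(&pairs, Some(42));
//!
//! // Neural path: any Embedder impl, e.g. a closure-backed BoxedEmbedder.
//! let emb = BoxedEmbedder::new(|texts: &[&str]| my_model.embed_batch(texts));
//! let stat = compute_with_embedder(&pairs, &emb, Some(42));
//! ```
//!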
//! ## Coverage cross-references
//!
//! What this axis catches:
//! - Final-text similarity drops (lexical with TF-IDF; paraphrase-
//!   robust with a neural [`Embedder`]).
//!
//! What it does NOT catch:
//! - **Wrong answer with similar words** — TF-IDF cosine measures
//!   token overlap; a numeric value flip ("$99 → $9") barely moves
//!   the cosine. The alignment module's W_ARGS component catches
//!   tool-arg value flips; numeric content drift surfaces on the
//!   v2.7+ `numeric_token_density` fingerprint dimension.
//! - **Empty-response regressions** — empty-vs-empty scores 1.0
//!   (vacuous match). The verbosity axis (axis 4) catches the
//!   collapse to empty.
//! - **Tone shifts with same content** — embeddings only carry
//!   semantic meaning; the Judge axis (axis 8) with a tone rubric
//!   is the right surface.
//!
//! [`Embedder`]: crate::diff::embedder::Embedder

use std::collections::HashMap;

use unicode_normalization::UnicodeNormalization;

use crate::agentlog::Record;
use crate::diff::axes::{Axis, AxisStat};
use crate::diff::bootstrap::{median, paired_ci};
use crate::diff::embedder::{cosine, Embedder};

/// Lowercase, NFC-normalize, split on non-alphanumeric. Empty tokens
/// are dropped; there is no minimum token length, so short tokens
/// like "ok" and "no" (and single chars like "s") are kept.
fn tokenize(text: &str) -> Vec<String> {
    text.nfc()
        .flat_map(|c| c.to_lowercase())
        .collect::<String>()
        .split(|c: char| !c.is_alphanumeric())
        .filter(|s| !s.is_empty())
        .map(String::from)
        .collect()
}

/// Term frequency: count of each token in `tokens`.
fn term_frequency(tokens: &[String]) -> HashMap<String, f64> {
    let mut out: HashMap<String, f64> = HashMap::new();
    for tok in tokens {
        *out.entry(tok.clone()).or_insert(0.0) += 1.0;
    }
    out
}

/// Document frequency: number of docs in which each term appears (at
/// least once). Input is already-tokenized text per doc.
fn document_frequency(corpus: &[Vec<String>]) -> HashMap<String, usize> {
    let mut df: HashMap<String, usize> = HashMap::new();
    for doc in corpus {
        let mut seen: std::collections::HashSet<&str> = std::collections::HashSet::new();
        for tok in doc {
            if seen.insert(tok.as_str()) {
                *df.entry(tok.clone()).or_insert(0) += 1;
            }
        }
    }
    df
}

/// Sparse TF-IDF vector: `tfidf[t] = (1 + ln(tf)) · (ln((N + 1) / (df + 1)) + 1)`.
/// In scikit-learn `TfidfVectorizer` terms this is the smoothed IDF
/// (`smooth_idf=True`) combined with sublinear TF scaling (`sublinear_tf=True`).
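///
/// Worked example (values rounded): with `N = 4` docs, a token with
/// `tf = 2` in this doc and `df = 1` overall weighs
/// `(1 + ln 2) · (ln(5/2) + 1) ≈ 1.693 · 1.916 ≈ 3.24`.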
fn tfidf_vector(
    tokens: &[String],
    df: &HashMap<String, usize>,
    n_docs: usize,
) -> HashMap<String, f64> {
    let tf = term_frequency(tokens);
    let mut out: HashMap<String, f64> = HashMap::with_capacity(tf.len());
    let n = n_docs as f64;
    for (tok, tf_v) in tf {
        let df_v = *df.get(&tok).unwrap_or(&0) as f64;
        let idf = ((n + 1.0) / (df_v + 1.0)).ln() + 1.0;
        let tf_weight = 1.0 + tf_v.ln();
        out.insert(tok, tf_weight * idf);
    }
    out
}

/// Cosine similarity over sparse TF-IDF maps. Norms below `1e-12` are
/// treated as zero: both zero scores 1.0 (vacuous match), exactly one
/// zero scores 0.0.
fn sparse_cosine(a: &HashMap<String, f64>, b: &HashMap<String, f64>) -> f64 {
    let na: f64 = a.values().map(|v| v * v).sum::<f64>().sqrt();
    let nb: f64 = b.values().map(|v| v * v).sum::<f64>().sqrt();
    if na < 1e-12 || nb < 1e-12 {
        return if na < 1e-12 && nb < 1e-12 { 1.0 } else { 0.0 };
    }
    // Walk the smaller of the two maps; only shared keys contribute.
    let (small, large) = if a.len() <= b.len() { (a, b) } else { (b, a) };
    let mut dot = 0.0;
    for (k, v) in small {
        if let Some(w) = large.get(k) {
            dot += v * w;
        }
    }
    dot / (na * nb)
}

/// Concatenate the `text` parts of a record's `content` array, joined
/// by single spaces. Returns an empty string when the payload has no
/// content array.
fn response_text(r: &Record) -> String {
    let Some(arr) = r.payload.get("content").and_then(|c| c.as_array()) else {
        return String::new();
    };
    arr.iter()
        .filter_map(|p| {
            if p.get("type").and_then(|t| t.as_str()) == Some("text") {
                p.get("text")
                    .and_then(|t| t.as_str())
                    .map(ToString::to_string)
            } else {
                None
            }
        })
        .collect::<Vec<_>>()
        .join(" ")
}

/// Compute the semantic-similarity axis using TF-IDF cosine.
///
/// This is the default path: no extra dependencies, deterministic,
/// lexical. For paraphrase-robust similarity see
/// [`compute_with_embedder`].
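///
/// # Example
///
/// A sketch reusing the payload shape from this module's tests; the
/// field values are illustrative only, not a schema reference:
///
/// ```ignore
/// let baseline = Record::new(
///     Kind::ChatResponse,
///     json!({"model": "x", "content": [{"type": "text", "text": "refund issued"}]}),
///     "2026-04-21T10:00:00Z",
///     None,
/// );
/// let candidate = /* same shape, candidate-side text */;
/// let pairs = [(&baseline, &candidate)];
/// let stat = compute(&pairs, Some(42));
/// ```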
pub fn compute(pairs: &[(&Record, &Record)], seed: Option<u64>) -> AxisStat {
    if pairs.is_empty() {
        return AxisStat::empty(Axis::Semantic);
    }
    // Build a shared corpus DF over all responses on both sides so
    // pairwise similarities are comparable.
    let baseline_tokens: Vec<Vec<String>> = pairs
        .iter()
        .map(|(b, _)| tokenize(&response_text(b)))
        .collect();
    let candidate_tokens: Vec<Vec<String>> = pairs
        .iter()
        .map(|(_, c)| tokenize(&response_text(c)))
        .collect();
    let mut corpus: Vec<Vec<String>> = Vec::with_capacity(pairs.len() * 2);
    corpus.extend(baseline_tokens.clone());
    corpus.extend(candidate_tokens.clone());
    let df = document_frequency(&corpus);
    let n_docs = corpus.len();

    let similarities: Vec<f64> = baseline_tokens
        .iter()
        .zip(candidate_tokens.iter())
        .map(|(bt, ct)| {
            let bv = tfidf_vector(bt, &df, n_docs);
            let cv = tfidf_vector(ct, &df, n_docs);
            sparse_cosine(&bv, &cv).clamp(0.0, 1.0)
        })
        .collect();

    similarities_to_stat(&similarities, pairs.len(), seed)
}

/// Compute the semantic-similarity axis using a caller-supplied
/// dense [`Embedder`].
///
/// The embedder is invoked once per side: baseline texts are embedded
/// together, then candidate texts. Pair-wise cosine similarity is
/// computed in Rust on the returned vectors, then folded into the
/// usual median + paired-CI shape.
///
/// A result with the wrong number of vectors (including a zero-length
/// result) is treated as a no-op axis (`AxisStat::empty`) so a
/// misconfigured embedder can't poison the rest of the report.
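///
/// # Example
///
/// A minimal sketch using the closure-backed `BoxedEmbedder` from the
/// embedder module; the constant vector stands in for a real model
/// call, so every pair scores 1.0:
///
/// ```ignore
/// use crate::diff::embedder::BoxedEmbedder;
///
/// let emb = BoxedEmbedder::new(|texts: &[&str]| {
///     texts.iter().map(|_| vec![1.0_f32, 0.0, 0.0, 0.0]).collect()
/// });
/// let stat = compute_with_embedder(&pairs, &emb, Some(42));
/// ```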
pub fn compute_with_embedder(
    pairs: &[(&Record, &Record)],
    embedder: &dyn Embedder,
    seed: Option<u64>,
) -> AxisStat {
    if pairs.is_empty() {
        return AxisStat::empty(Axis::Semantic);
    }
    let baseline_texts: Vec<String> = pairs.iter().map(|(b, _)| response_text(b)).collect();
    let candidate_texts: Vec<String> = pairs.iter().map(|(_, c)| response_text(c)).collect();
    let baseline_refs: Vec<&str> = baseline_texts.iter().map(String::as_str).collect();
    let candidate_refs: Vec<&str> = candidate_texts.iter().map(String::as_str).collect();

    let baseline_vecs = embedder.embed(&baseline_refs);
    let candidate_vecs = embedder.embed(&candidate_refs);

    if baseline_vecs.len() != pairs.len() || candidate_vecs.len() != pairs.len() {
        return AxisStat::empty(Axis::Semantic);
    }

    let similarities: Vec<f64> = baseline_vecs
        .iter()
        .zip(candidate_vecs.iter())
        .map(|(bv, cv)| f64::from(cosine(bv, cv).clamp(0.0, 1.0)))
        .collect();

    similarities_to_stat(&similarities, pairs.len(), seed)
}

/// Shared tail: convert a per-pair similarity vector into the
/// `AxisStat` used by the rest of the diff pipeline. The baseline side
/// is a constant 1.0 per pair (a response is always identical to
/// itself), so `delta` is simply `median(similarities) - 1.0`. Same
/// shape regardless of which embedder produced the similarities.
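///
/// E.g. similarities `[1.0, 0.9, 0.8]` fold to a candidate median of
/// `0.9` and `delta = -0.1` against the constant baseline median `1.0`.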
fn similarities_to_stat(similarities: &[f64], n_pairs: usize, seed: Option<u64>) -> AxisStat {
    let baseline_ones: Vec<f64> = vec![1.0; similarities.len()];
    let bm = 1.0;
    let cm = median(similarities);
    let delta = cm - bm;
    let ci = paired_ci(
        &baseline_ones,
        similarities,
        |bs, cs| median(cs) - median(bs),
        0,
        seed,
    );
    AxisStat::new_value(Axis::Semantic, bm, cm, delta, ci.low, ci.high, n_pairs)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::agentlog::Kind;
    use crate::diff::axes::Severity;
    use serde_json::json;

    fn response(text: &str) -> Record {
        Record::new(
            Kind::ChatResponse,
            json!({
                "model": "x",
                "content": [{"type": "text", "text": text}],
                "stop_reason": "end_turn",
                "latency_ms": 0,
                "usage": {"input_tokens": 1, "output_tokens": 1, "thinking_tokens": 0},
            }),
            "2026-04-21T10:00:00Z",
            None,
        )
    }

    #[test]
    fn identical_text_has_similarity_1() {
        let r = response("the quick brown fox jumps over the lazy dog");
        let pairs = [(&r, &r)];
        let stat = compute(&pairs, Some(1));
        assert!((stat.candidate_median - 1.0).abs() < 1e-9);
        assert_eq!(stat.severity, Severity::None);
    }

    #[test]
    fn very_different_text_has_lower_similarity() {
        let baseline: Vec<Record> = (0..10)
            .map(|i| response(&format!("refund issued for order {i}")))
            .collect();
        let candidate: Vec<Record> = (0..10)
            .map(|i| {
                response(&format!(
                    "unable to process request {i}, please contact support"
                ))
            })
            .collect();
        let pairs: Vec<(&Record, &Record)> = baseline.iter().zip(candidate.iter()).collect();
        let stat = compute(&pairs, Some(1));
        assert!(stat.candidate_median < 0.5);
    }

    #[test]
    fn tokenize_splits_on_punctuation_and_lowercases() {
        assert_eq!(
            tokenize("Hello, world!  It's nice"),
            vec![
                "hello".to_string(),
                "world".to_string(),
                "it".to_string(),
                "s".to_string(),
                "nice".to_string(),
            ]
        );
    }

    #[test]
    fn tokenize_handles_unicode_nfc() {
        // "café" in NFD vs NFC — tokenize should normalize.
        let nfd = "cafe\u{0301}";
        let nfc = "café";
        assert_eq!(tokenize(nfd), tokenize(nfc));
    }

    #[test]
    fn empty_text_has_zero_similarity_to_nonempty() {
        let empty = response("");
        let full = response("some content here");
        let pairs = [(&empty, &full); 3];
        let stat = compute(&pairs, Some(1));
        assert!(stat.candidate_median < 0.1);
    }
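
    // Sketch of the documented empty-vs-empty behavior: when both
    // sides are empty, sparse_cosine treats the pair as a vacuous
    // match (1.0), so the axis reads as unchanged.
    #[test]
    fn empty_vs_empty_scores_one() {
        let a = response("");
        let b = response("");
        let pairs = [(&a, &b); 3];
        let stat = compute(&pairs, Some(1));
        assert!((stat.candidate_median - 1.0).abs() < 1e-9);
    }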

    #[test]
    fn identical_content_scores_higher_than_partial_overlap() {
        // One pair is identical; the other shares only boilerplate.
        let identical_b = response("refund issued for order abc123");
        let identical_c = response("refund issued for order abc123");
        let partial_b = response("refund issued for order abc123");
        let partial_c = response("unable to process please contact support");
        let baseline = [identical_b, partial_b];
        let candidate = [identical_c, partial_c];
        let pairs: Vec<(&Record, &Record)> = baseline.iter().zip(candidate.iter()).collect();
        let bt: Vec<Vec<String>> = pairs
            .iter()
            .map(|(b, _)| tokenize(&response_text(b)))
            .collect();
        let ct: Vec<Vec<String>> = pairs
            .iter()
            .map(|(_, c)| tokenize(&response_text(c)))
            .collect();
        let mut corpus = bt.clone();
        corpus.extend(ct.clone());
        let df = document_frequency(&corpus);
        let n = corpus.len();
        let score_identical =
            sparse_cosine(&tfidf_vector(&bt[0], &df, n), &tfidf_vector(&ct[0], &df, n));
        let score_partial =
            sparse_cosine(&tfidf_vector(&bt[1], &df, n), &tfidf_vector(&ct[1], &df, n));
        assert!(
            score_identical > score_partial + 0.3,
            "identical={score_identical} partial={score_partial}"
        );
    }

    // ----------------------------------------------------------------
    // Pluggable Embedder integration
    // ----------------------------------------------------------------

    use crate::diff::embedder::BoxedEmbedder;

    fn fixed_embedder(
        mapping: std::collections::HashMap<&'static str, Vec<f32>>,
    ) -> BoxedEmbedder<impl Fn(&[&str]) -> Vec<Vec<f32>> + Send + Sync> {
        BoxedEmbedder::named(
            move |texts: &[&str]| {
                texts
                    .iter()
                    .map(|t| mapping.get(t).cloned().unwrap_or_else(|| vec![0.0_f32; 4]))
                    .collect()
            },
            "fixed",
        )
    }

    #[test]
    fn embedder_path_identical_vectors_score_one() {
        let r = response("alpha");
        let pairs = [(&r, &r)];
        let mut m = std::collections::HashMap::new();
        m.insert("alpha", vec![1.0_f32, 0.0, 0.0, 0.0]);
        let emb = fixed_embedder(m);
        let stat = compute_with_embedder(&pairs, &emb, Some(1));
        assert!(
            (stat.candidate_median - 1.0).abs() < 1e-6,
            "expected median≈1.0, got {}",
            stat.candidate_median
        );
    }

    #[test]
    fn embedder_path_orthogonal_vectors_score_zero() {
        let baseline = response("alpha");
        let candidate = response("beta");
        let pairs = [(&baseline, &candidate); 4];
        let mut m = std::collections::HashMap::new();
        m.insert("alpha", vec![1.0_f32, 0.0, 0.0, 0.0]);
        m.insert("beta", vec![0.0_f32, 1.0, 0.0, 0.0]);
        let emb = fixed_embedder(m);
        let stat = compute_with_embedder(&pairs, &emb, Some(1));
        assert!(stat.candidate_median.abs() < 1e-6);
    }

    #[test]
    fn embedder_path_paraphrase_robustness() {
        // TF-IDF cosine assigns 0 to disjoint-vocabulary paraphrases;
        // a neural embedder would assign ≈1. This test simulates that
        // scenario and verifies the embedder path actually surfaces it.
        let baseline = response("yes");
        let candidate = response("I agree");
        let pairs = [(&baseline, &candidate); 4];

        // TF-IDF result: low similarity (no token overlap).
        let tfidf_stat = compute(&pairs, Some(1));
        assert!(
            tfidf_stat.candidate_median < 0.5,
            "TF-IDF should score these low; got {}",
            tfidf_stat.candidate_median
        );

        // Custom embedder where both phrases map to near-identical vectors.
        let mut m = std::collections::HashMap::new();
        m.insert("yes", vec![0.9_f32, 0.4, 0.1, 0.0]);
        m.insert("I agree", vec![0.91_f32, 0.41, 0.09, 0.0]);
        let emb = fixed_embedder(m);
        let neural_stat = compute_with_embedder(&pairs, &emb, Some(1));
        assert!(
            neural_stat.candidate_median > 0.99,
            "neural embedder should score paraphrases ≈1; got {}",
            neural_stat.candidate_median
        );
    }

    #[test]
    fn embedder_path_count_mismatch_returns_empty_axis() {
        let baseline = response("a");
        let candidate = response("b");
        // Two pairs, so an embedder returning a single vector actually
        // trips the count check (with one pair it would pass trivially).
        let pairs = [(&baseline, &candidate); 2];
        // Embedder returns the wrong number of vectors → axis is empty.
        let emb = BoxedEmbedder::new(|_texts: &[&str]| vec![vec![1.0_f32, 0.0]]);
        let stat = compute_with_embedder(&pairs, &emb, Some(1));
        // Empty axis: Severity::None, n_pairs = 0 (the empty marker).
        assert_eq!(stat.severity, Severity::None);
    }

    #[test]
    fn embedder_path_empty_pairs_returns_empty() {
        let pairs: Vec<(&Record, &Record)> = vec![];
        let emb =
            BoxedEmbedder::new(|texts: &[&str]| texts.iter().map(|_| vec![1.0_f32; 4]).collect());
        let stat = compute_with_embedder(&pairs, &emb, Some(1));
        assert_eq!(stat.severity, Severity::None);
    }
}