1use serde::{Deserialize, Serialize};
9
10#[cfg(feature = "wasm")]
11use wasm_bindgen::prelude::*;
12
/// Computes the cosine similarity of two equal-length vectors.
///
/// The result lies in `[-1.0, 1.0]`: `1.0` for vectors pointing the same way,
/// `0.0` for orthogonal vectors and `-1.0` for opposite vectors.
///
/// Degenerate inputs yield `0.0` instead of an error or `NaN`:
/// - the slices differ in length, or both are empty;
/// - either vector has zero magnitude;
/// - the computation is not finite (a component is `NaN`/infinite, or the
///   squared magnitudes overflow `f64`).
pub fn cosine_similarity(a: &[f64], b: &[f64]) -> f64 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }
    let dot: f64 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    let mag_a = a.iter().map(|x| x * x).sum::<f64>().sqrt();
    let mag_b = b.iter().map(|x| x * x).sum::<f64>().sqrt();
    if mag_a == 0.0 || mag_b == 0.0 {
        return 0.0;
    }
    let similarity = dot / (mag_a * mag_b);
    // A NaN/inf score would poison every comparison downstream and is not
    // representable in JSON, so treat it like the other degenerate inputs.
    if !similarity.is_finite() {
        return 0.0;
    }
    // Rounding can push the quotient a hair outside the mathematical range.
    similarity.clamp(-1.0, 1.0)
}
35
36#[derive(Debug, Clone, Deserialize)]
40pub struct EmbeddedSection {
41 pub title: String,
43 pub content: String,
45 pub embedding: Vec<f64>,
47}
48
49#[derive(Debug, Clone, Serialize, Deserialize)]
51#[serde(rename_all = "camelCase")]
52pub enum SectionAlignment {
53 Matched,
55 Renamed,
57 Added,
59 Removed,
61}
62
63#[derive(Debug, Clone, Serialize, Deserialize)]
65#[serde(rename_all = "camelCase")]
66pub struct SectionSimilarity {
67 pub section_a: String,
69 pub section_b: String,
71 pub similarity: f64,
73 pub alignment: SectionAlignment,
75}
76
77#[derive(Debug, Clone, Serialize, Deserialize)]
79#[serde(rename_all = "camelCase")]
80pub struct SemanticChange {
81 pub change_type: String,
83 pub section: String,
85 pub similarity: f64,
87 pub description: String,
89}
90
91#[derive(Debug, Clone, Serialize, Deserialize)]
93#[serde(rename_all = "camelCase")]
94pub struct SemanticDiffResult {
95 pub overall_similarity: f64,
98 pub section_similarities: Vec<SectionSimilarity>,
100 pub semantic_changes: Vec<SemanticChange>,
102}
103
104#[cfg_attr(feature = "wasm", wasm_bindgen)]
117pub fn cosine_similarity_wasm(a_json: &str, b_json: &str) -> f64 {
118 let a: Vec<f64> = match serde_json::from_str(a_json) {
119 Ok(v) => v,
120 Err(_) => return 0.0,
121 };
122 let b: Vec<f64> = match serde_json::from_str(b_json) {
123 Ok(v) => v,
124 Err(_) => return 0.0,
125 };
126 cosine_similarity(&a, &b)
127}
128
129fn classify_change(title: &str, similarity: f64) -> SemanticChange {
133 let (change_type, description) = if similarity >= 0.95 {
134 (
135 "unchanged",
136 format!("Section '{title}' is semantically identical (similarity {similarity:.2})"),
137 )
138 } else if similarity >= 0.85 {
139 (
140 "rephrased",
141 format!(
142 "Section '{title}' expresses the same meaning with different wording (similarity {similarity:.2})"
143 ),
144 )
145 } else if similarity >= 0.70 {
146 (
147 "modified",
148 format!("Section '{title}' has been partially changed (similarity {similarity:.2})"),
149 )
150 } else {
151 (
152 "rewritten",
153 format!(
154 "Section '{title}' has been substantially rewritten (similarity {similarity:.2})"
155 ),
156 )
157 };
158 SemanticChange {
159 change_type: change_type.to_string(),
160 section: title.to_string(),
161 similarity,
162 description,
163 }
164}
165
166pub fn semantic_diff_native(
171 sections_a: &[EmbeddedSection],
172 sections_b: &[EmbeddedSection],
173) -> SemanticDiffResult {
174 let mut matched_b: Vec<bool> = vec![false; sections_b.len()];
176 let mut section_similarities: Vec<SectionSimilarity> = Vec::new();
177 let mut semantic_changes: Vec<SemanticChange> = Vec::new();
178
179 for sec_a in sections_a {
181 if sections_b.is_empty() {
182 section_similarities.push(SectionSimilarity {
183 section_a: sec_a.title.clone(),
184 section_b: String::new(),
185 similarity: 0.0,
186 alignment: SectionAlignment::Removed,
187 });
188 continue;
189 }
190
191 let (best_idx, best_sim) = sections_b
193 .iter()
194 .enumerate()
195 .map(|(i, sec_b)| (i, cosine_similarity(&sec_a.embedding, &sec_b.embedding)))
196 .max_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
197 .unwrap_or((0, 0.0));
198
199 if best_sim < 0.40 {
201 section_similarities.push(SectionSimilarity {
203 section_a: sec_a.title.clone(),
204 section_b: String::new(),
205 similarity: 0.0,
206 alignment: SectionAlignment::Removed,
207 });
208 } else {
209 let sec_b = §ions_b[best_idx];
210 matched_b[best_idx] = true;
211
212 let alignment =
213 if sec_a.title.trim().to_lowercase() == sec_b.title.trim().to_lowercase() {
214 SectionAlignment::Matched
215 } else if best_sim >= 0.85 {
216 SectionAlignment::Renamed
217 } else {
218 SectionAlignment::Matched
219 };
220
221 section_similarities.push(SectionSimilarity {
222 section_a: sec_a.title.clone(),
223 section_b: sec_b.title.clone(),
224 similarity: best_sim,
225 alignment,
226 });
227
228 semantic_changes.push(classify_change(&sec_a.title, best_sim));
229 }
230 }
231
232 for (i, sec_b) in sections_b.iter().enumerate() {
234 if !matched_b[i] {
235 section_similarities.push(SectionSimilarity {
236 section_a: String::new(),
237 section_b: sec_b.title.clone(),
238 similarity: 0.0,
239 alignment: SectionAlignment::Added,
240 });
241 }
242 }
243
244 let non_zero: Vec<f64> = section_similarities
246 .iter()
247 .filter(|s| s.similarity > 0.0)
248 .map(|s| s.similarity)
249 .collect();
250
251 let overall_similarity = if non_zero.is_empty() {
252 0.0
253 } else {
254 let sum: f64 = non_zero.iter().sum();
255 sum / non_zero.len() as f64
256 };
257
258 SemanticDiffResult {
259 overall_similarity,
260 section_similarities,
261 semantic_changes,
262 }
263}
264
265#[cfg_attr(feature = "wasm", wasm_bindgen)]
272pub fn semantic_diff(sections_a_json: &str, sections_b_json: &str) -> String {
273 let sections_a: Vec<EmbeddedSection> = match serde_json::from_str(sections_a_json) {
274 Ok(v) => v,
275 Err(e) => return format!(r#"{{"error":"Invalid sections_a JSON: {e}"}}"#),
276 };
277 let sections_b: Vec<EmbeddedSection> = match serde_json::from_str(sections_b_json) {
278 Ok(v) => v,
279 Err(e) => return format!(r#"{{"error":"Invalid sections_b JSON: {e}"}}"#),
280 };
281
282 let result = semantic_diff_native(§ions_a, §ions_b);
283 serde_json::to_string(&result)
284 .unwrap_or_else(|e| format!(r#"{{"error":"Serialization: {e}"}}"#))
285}
286
287#[derive(Debug, Clone, Deserialize)]
291#[serde(rename_all = "camelCase")]
292pub struct EmbeddedReview {
293 pub reviewer_id: String,
295 pub content: String,
297 pub embedding: Vec<f64>,
299}
300
301#[derive(Debug, Clone, Serialize, Deserialize)]
303#[serde(rename_all = "camelCase")]
304pub struct ReviewCluster {
305 pub members: Vec<String>,
307 pub avg_similarity: f64,
309}
310
311#[derive(Debug, Clone, Serialize, Deserialize)]
313#[serde(rename_all = "camelCase")]
314pub struct SemanticConsensusResult {
315 pub consensus: bool,
317 pub agreement_score: f64,
319 pub clusters: Vec<ReviewCluster>,
321 pub outliers: Vec<String>,
323}
324
325pub fn semantic_consensus_native(
330 reviews: &[EmbeddedReview],
331 threshold: f64,
332) -> SemanticConsensusResult {
333 if reviews.is_empty() {
334 return SemanticConsensusResult {
335 consensus: false,
336 agreement_score: 0.0,
337 clusters: vec![],
338 outliers: vec![],
339 };
340 }
341
342 let n = reviews.len();
343
344 let mut sims = vec![vec![0.0f64; n]; n];
347 for (i, review_i) in reviews.iter().enumerate() {
348 sims[i][i] = 1.0;
349 for j in (i + 1)..n {
350 let s = cosine_similarity(&review_i.embedding, &reviews[j].embedding);
351 sims[i][j] = s;
352 sims[j][i] = s;
353 }
354 }
355
356 let pair_count = n * (n - 1) / 2;
358 let agreement_score = if pair_count == 0 {
359 1.0 } else {
361 let total: f64 = sims
363 .iter()
364 .enumerate()
365 .flat_map(|(i, row)| row.iter().enumerate().skip(i + 1).map(|(_, &v)| v))
366 .sum();
367 total / pair_count as f64
368 };
369
370 let mut clusters: Vec<Vec<usize>> = Vec::new(); 'outer: for (i, _) in reviews.iter().enumerate() {
376 for cluster in &mut clusters {
377 if cluster.iter().all(|&j| sims[i][j] >= threshold) {
379 cluster.push(i);
380 continue 'outer;
381 }
382 }
383 clusters.push(vec![i]);
384 }
385
386 clusters.sort_by_key(|c| std::cmp::Reverse(c.len()));
388
389 let result_clusters: Vec<ReviewCluster> = clusters
391 .iter()
392 .map(|members| {
393 let ids: Vec<String> = members
394 .iter()
395 .map(|&i| reviews[i].reviewer_id.clone())
396 .collect();
397 let avg_sim = if members.len() == 1 {
398 1.0
399 } else {
400 let mut total = 0.0;
401 let mut count = 0;
402 for a in 0..members.len() {
403 for b in (a + 1)..members.len() {
404 total += sims[members[a]][members[b]];
405 count += 1;
406 }
407 }
408 if count > 0 { total / count as f64 } else { 1.0 }
409 };
410 ReviewCluster {
411 members: ids,
412 avg_similarity: avg_sim,
413 }
414 })
415 .collect();
416
417 let majority_size = result_clusters
419 .first()
420 .map(|c| c.members.len())
421 .unwrap_or(0);
422 let consensus = majority_size * 2 > n; let majority_members: std::collections::HashSet<&str> = result_clusters
426 .first()
427 .map(|c| c.members.iter().map(String::as_str).collect())
428 .unwrap_or_default();
429
430 let outliers: Vec<String> = reviews
431 .iter()
432 .filter(|r| !majority_members.contains(r.reviewer_id.as_str()))
433 .map(|r| r.reviewer_id.clone())
434 .collect();
435
436 SemanticConsensusResult {
437 consensus,
438 agreement_score,
439 clusters: result_clusters,
440 outliers,
441 }
442}
443
444#[cfg_attr(feature = "wasm", wasm_bindgen)]
451pub fn semantic_consensus(reviews_json: &str, threshold: f64) -> String {
452 let reviews: Vec<EmbeddedReview> = match serde_json::from_str(reviews_json) {
453 Ok(v) => v,
454 Err(e) => return format!(r#"{{"error":"Invalid reviews JSON: {e}"}}"#),
455 };
456
457 let result = semantic_consensus_native(&reviews, threshold);
458 serde_json::to_string(&result)
459 .unwrap_or_else(|e| format!(r#"{{"error":"Serialization: {e}"}}"#))
460}
461
#[cfg(test)]
mod tests {
    use super::*;

    // ---- cosine_similarity -------------------------------------------------

    #[test]
    fn cosine_identical_vectors() {
        let v = vec![1.0, 2.0, 3.0];
        let sim = cosine_similarity(&v, &v);
        assert!(
            (sim - 1.0).abs() < f64::EPSILON,
            "identical vectors → 1.0, got {sim}"
        );
    }

    #[test]
    fn cosine_orthogonal_vectors() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![0.0, 1.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!(
            sim.abs() < f64::EPSILON,
            "orthogonal vectors → 0.0, got {sim}"
        );
    }

    #[test]
    fn cosine_opposite_vectors() {
        let a = vec![1.0, 0.0];
        let b = vec![-1.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!(
            (sim - (-1.0)).abs() < f64::EPSILON,
            "opposite vectors → -1.0, got {sim}"
        );
    }

    #[test]
    fn cosine_different_lengths_returns_zero() {
        let a = vec![1.0, 2.0];
        let b = vec![1.0, 2.0, 3.0];
        assert_eq!(cosine_similarity(&a, &b), 0.0, "mismatched lengths → 0.0");
    }

    #[test]
    fn cosine_zero_vector_returns_zero() {
        let a = vec![0.0, 0.0, 0.0];
        let b = vec![1.0, 2.0, 3.0];
        assert_eq!(cosine_similarity(&a, &b), 0.0, "zero vector → 0.0");
    }

    #[test]
    fn cosine_empty_vectors_returns_zero() {
        assert_eq!(cosine_similarity(&[], &[]), 0.0, "empty vectors → 0.0");
    }

    // ---- semantic_diff -----------------------------------------------------

    /// Builds a section whose content simply repeats its title.
    fn make_section(title: &str, vec: Vec<f64>) -> EmbeddedSection {
        EmbeddedSection {
            title: title.to_string(),
            content: title.to_string(),
            embedding: vec,
        }
    }

    #[test]
    fn semantic_diff_identical_sections() {
        let sections = vec![make_section("Intro", vec![1.0, 0.0, 0.0])];
        let result = semantic_diff_native(&sections, &sections);
        assert!(
            (result.overall_similarity - 1.0).abs() < 1e-9,
            "identical sections → overall_similarity ≈ 1.0, got {}",
            result.overall_similarity
        );
        assert_eq!(result.section_similarities.len(), 1);
        assert_eq!(result.semantic_changes.len(), 1);
        assert_eq!(result.semantic_changes[0].change_type, "unchanged");
    }

    #[test]
    fn semantic_diff_detects_added_section() {
        let sections_a = vec![make_section("Overview", vec![1.0, 0.0])];
        let sections_b = vec![
            make_section("Overview", vec![1.0, 0.0]),
            make_section("NewSection", vec![0.0, 1.0]),
        ];
        let result = semantic_diff_native(&sections_a, &sections_b);
        let added = result
            .section_similarities
            .iter()
            .find(|s| matches!(s.alignment, SectionAlignment::Added))
            .expect("should detect Added section");
        assert_eq!(added.section_b, "NewSection");
    }

    #[test]
    fn semantic_diff_detects_removed_section() {
        let sections_a = vec![
            make_section("Overview", vec![1.0, 0.0]),
            make_section("OldSection", vec![0.0, 1.0]),
        ];
        let sections_b = vec![make_section("Overview", vec![1.0, 0.0])];
        let result = semantic_diff_native(&sections_a, &sections_b);
        let removed = result
            .section_similarities
            .iter()
            .find(|s| matches!(s.alignment, SectionAlignment::Removed))
            .expect("should detect Removed section");
        assert_eq!(removed.section_a, "OldSection");
    }

    #[test]
    fn semantic_diff_detects_renamed_section() {
        // Nearly parallel embeddings under different titles.
        let sections_a = vec![make_section("Old Title", vec![0.9, 0.1])];
        let sections_b = vec![make_section("New Title", vec![0.91, 0.09])];
        let result = semantic_diff_native(&sections_a, &sections_b);
        let renamed = result
            .section_similarities
            .iter()
            .find(|s| matches!(s.alignment, SectionAlignment::Renamed));
        assert!(renamed.is_some(), "should detect Renamed section");
    }

    #[test]
    fn semantic_diff_json_roundtrip() {
        let sections_a = r#"[{"title":"A","content":"A text","embedding":[1.0,0.0]}]"#;
        let sections_b = r#"[{"title":"A","content":"A text","embedding":[1.0,0.0]}]"#;
        let out = semantic_diff(sections_a, sections_b);
        let parsed: serde_json::Value = serde_json::from_str(&out).expect("valid JSON");
        assert!(parsed.get("error").is_none(), "should not have error field");
        assert!(parsed.get("overallSimilarity").is_some());
    }

    // ---- semantic_consensus ------------------------------------------------

    /// Builds a review whose content simply repeats the reviewer id.
    fn make_review(id: &str, vec: Vec<f64>) -> EmbeddedReview {
        EmbeddedReview {
            reviewer_id: id.to_string(),
            content: id.to_string(),
            embedding: vec,
        }
    }

    #[test]
    fn consensus_unanimous_single_cluster() {
        let reviews = vec![
            make_review("a", vec![1.0, 0.0]),
            make_review("b", vec![0.98, 0.02]),
            make_review("c", vec![0.99, 0.01]),
        ];
        let result = semantic_consensus_native(&reviews, 0.80);
        assert!(result.consensus, "3 similar reviews should reach consensus");
        assert!(result.outliers.is_empty(), "no outliers expected");
        assert_eq!(result.clusters.len(), 1, "should form one cluster");
    }

    #[test]
    fn consensus_divergent_outlier() {
        let reviews = vec![
            make_review("a", vec![1.0, 0.0]),
            make_review("b", vec![0.99, 0.01]),
            make_review("c", vec![0.0, 1.0]),
        ];
        let result = semantic_consensus_native(&reviews, 0.80);
        assert!(result.consensus, "2/3 agreement should reach consensus");
        assert!(
            result.outliers.contains(&"c".to_string()),
            "'c' should be an outlier"
        );
    }

    #[test]
    fn consensus_no_consensus_split() {
        let reviews = vec![
            make_review("a", vec![1.0, 0.0]),
            make_review("b", vec![0.0, 1.0]),
        ];
        let result = semantic_consensus_native(&reviews, 0.80);
        assert!(!result.consensus, "50/50 split should not reach consensus");
    }

    #[test]
    fn consensus_empty_reviews() {
        let result = semantic_consensus_native(&[], 0.80);
        assert!(!result.consensus);
        assert_eq!(result.agreement_score, 0.0);
        assert!(result.clusters.is_empty());
        assert!(result.outliers.is_empty());
    }

    #[test]
    fn consensus_json_roundtrip() {
        let reviews_json = r#"[
            {"reviewerId":"a","content":"test","embedding":[1.0,0.0]},
            {"reviewerId":"b","content":"test","embedding":[0.99,0.01]}
        ]"#;
        let out = semantic_consensus(reviews_json, 0.80);
        let parsed: serde_json::Value = serde_json::from_str(&out).expect("valid JSON");
        assert!(parsed.get("error").is_none(), "should not have error field");
        assert!(parsed.get("consensus").is_some());
    }
}