use std::collections::HashMap;
use crate::ts_syn::abi::SpanIR;
use crate::ts_syn::abi::ir::type_registry::{TypeDefinitionIR, TypeRegistry};
use crate::ts_syn::declarative::MacroMode;
use super::registry::DeclarativeMacroRegistry;
/// A macro invocation discovered in source, resolved to the macro it names
/// and the inferred shapes of its arguments.
#[derive(Debug, Clone)]
pub struct ResolvedCallSite {
    /// Name of the invoked macro.
    pub macro_name: String,
    /// Source span of the call expression.
    pub call_span: SpanIR,
    /// One inferred shape per argument, in call order.
    pub arg_shapes: Vec<TypeShape>,
}
/// Coarse classification of a macro argument's type, used to measure how
/// polymorphic a macro's call sites are.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum TypeShape {
    /// A named type. `fields` optionally carries a sorted, deduplicated
    /// member-name fingerprint when the type registry knows the definition.
    Named {
        name: String,
        fields: Option<Vec<String>>,
    },
    /// A literal value, tagged with its kind (e.g. "string", "number").
    Literal(String),
    /// Anything whose type could not be determined.
    Opaque,
}
impl TypeShape {
pub fn named(name: impl Into<String>) -> Self {
TypeShape::Named {
name: name.into(),
fields: None,
}
}
}
/// Per-macro summary of how many distinct argument-shape tuples were seen
/// and which expansion strategy is recommended.
#[derive(Debug, Clone)]
pub struct MacroPolymorphism {
    /// Number of distinct argument-shape tuples across all call sites.
    pub distinct_shapes: usize,
    /// Strategy derived from the shape count and clustering result.
    pub recommendation: Recommendation,
}
/// Expansion strategy suggested by the megamorphism analysis.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Recommendation {
    /// Few enough distinct shapes: share a single implementation.
    Share,
    /// Too many shapes overall, but they group into acceptable clusters.
    Cluster(Vec<TypeCluster>),
    /// Even after clustering some group stays megamorphic: expand per site.
    ForceExpand,
}
/// A group of argument-shape tuples considered similar enough to share one
/// expansion.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TypeCluster {
    /// The shape tuples assigned to this cluster.
    pub shapes: Vec<Vec<TypeShape>>,
    /// Cluster identifier ("struct_…", a letter prefix, "lit", "opaque",
    /// or "empty" — see `cluster_shapes`).
    pub id: String,
}
/// Result of the analysis: per-macro polymorphism findings, keyed by macro
/// name.
#[derive(Debug, Default, Clone)]
pub struct MegamorphReport {
    pub per_macro: HashMap<String, MacroPolymorphism>,
}
impl MegamorphReport {
    /// Look up the polymorphism analysis recorded for `macro_name`, if any
    /// Auto-mode call sites referenced it.
    pub fn lookup(&self, macro_name: &str) -> Option<&MacroPolymorphism> {
        self.per_macro.get(macro_name)
    }
}
/// Analyze macro call sites for megamorphism.
///
/// Only macros registered in [`MacroMode::Auto`] are considered. For each
/// such macro the distinct argument-shape tuples across its call sites are
/// counted and compared against the macro's own `megamorphism_threshold`
/// (falling back to the global `threshold` when the macro is unknown):
/// - at or below the threshold: recommend sharing one implementation;
/// - above it: cluster similar tuples; if any cluster itself still exceeds
///   the threshold, recommend forced per-call-site expansion.
pub fn analyze(
    registry: &DeclarativeMacroRegistry,
    call_sites: &[ResolvedCallSite],
    threshold: u8,
) -> MegamorphReport {
    use std::collections::HashSet;
    // Group call sites by macro name, keeping only Auto-mode macros.
    let mut per_macro: HashMap<String, Vec<&ResolvedCallSite>> = HashMap::new();
    for site in call_sites {
        if let Some(def) = registry.lookup(&site.macro_name)
            && def.mode == MacroMode::Auto
        {
            per_macro
                .entry(site.macro_name.clone())
                .or_default()
                .push(site);
        }
    }
    let mut report = MegamorphReport::default();
    for (name, sites) in per_macro {
        // Deduplicate shape tuples. The HashSet gives O(1) membership tests
        // (the previous Vec::contains scan was quadratic in call sites),
        // while `shape_set` keeps first-seen order because the greedy
        // structural clustering in `cluster_shapes` is order-sensitive.
        let mut seen: HashSet<&[TypeShape]> = HashSet::new();
        let mut shape_set: Vec<Vec<TypeShape>> = Vec::new();
        for site in &sites {
            if seen.insert(site.arg_shapes.as_slice()) {
                shape_set.push(site.arg_shapes.clone());
            }
        }
        let distinct_shapes = shape_set.len();
        // A macro-specific threshold overrides the global default.
        let per_macro_threshold = registry
            .lookup(&name)
            .map(|d| d.megamorphism_threshold as usize)
            .unwrap_or(threshold as usize);
        let recommendation = if distinct_shapes <= per_macro_threshold {
            Recommendation::Share
        } else {
            let clusters = cluster_shapes(&shape_set, per_macro_threshold);
            // If any single cluster still exceeds the threshold, clustering
            // did not help enough: fall back to per-call-site expansion.
            if clusters
                .iter()
                .any(|c| count_distinct_fingerprints(&c.shapes) > per_macro_threshold)
            {
                Recommendation::ForceExpand
            } else {
                Recommendation::Cluster(clusters)
            }
        };
        report.per_macro.insert(
            name,
            MacroPolymorphism {
                distinct_shapes,
                recommendation,
            },
        );
    }
    report
}
/// Minimum mean pairwise Jaccard similarity required for a shape tuple to
/// join an existing structural cluster.
const JACCARD_THRESHOLD: f64 = 0.60;
/// Partition distinct argument-shape tuples into clusters.
///
/// Tuples are routed by their FIRST element:
/// - a named shape with a non-empty field fingerprint joins a structural
///   cluster greedily: it must reach `JACCARD_THRESHOLD` mean pairwise
///   similarity against every current member, otherwise it seeds a new
///   cluster (the outcome depends on input order);
/// - other named shapes bucket by the lowercased first character of the
///   name, suffixed with the tuple length for multi-argument tuples;
/// - literal-first, opaque-first, and zero-length tuples each get one
///   shared bucket ("lit", "opaque", "empty").
///
/// NOTE(review): `_threshold` is accepted but unused, and the lit/opaque/
/// empty buckets do not key on arity, so they may mix tuple lengths —
/// confirm both are intended.
fn cluster_shapes(shapes: &[Vec<TypeShape>], _threshold: usize) -> Vec<TypeCluster> {
    let mut structural: Vec<Vec<Vec<TypeShape>>> = Vec::new();
    let mut prefix_buckets: HashMap<String, Vec<Vec<TypeShape>>> = HashMap::new();
    let mut literal_bucket: Vec<Vec<TypeShape>> = Vec::new();
    let mut opaque_bucket: Vec<Vec<TypeShape>> = Vec::new();
    let mut empty_bucket: Vec<Vec<TypeShape>> = Vec::new();
    for tuple in shapes {
        let Some(first) = tuple.first() else {
            // Zero-argument tuples have no first element to route on.
            empty_bucket.push(tuple.clone());
            continue;
        };
        match first {
            TypeShape::Named {
                fields: Some(fields),
                ..
            } if !fields.is_empty() => {
                let _ = fields; let mut joined = false;
                // Greedy structural clustering: join the first cluster in
                // which this tuple is similar to EVERY existing member.
                for cluster in structural.iter_mut() {
                    let fits = cluster.iter().all(|existing| {
                        mean_pairwise_jaccard(tuple, existing) >= JACCARD_THRESHOLD
                    });
                    if fits {
                        cluster.push(tuple.clone());
                        joined = true;
                        break;
                    }
                }
                if !joined {
                    structural.push(vec![tuple.clone()]);
                }
            }
            TypeShape::Named { name, .. } => {
                // No fingerprint available: fall back to first-letter
                // bucketing of the type name.
                let key = name
                    .chars()
                    .next()
                    .map(|c| c.to_ascii_lowercase().to_string())
                    .unwrap_or_else(|| "_".to_string());
                let keyed = if tuple.len() == 1 {
                    key
                } else {
                    // Multi-arg tuples carry arity in the bucket key so they
                    // never merge with single-arg tuples.
                    format!("{}{}", key, tuple.len())
                };
                prefix_buckets.entry(keyed).or_default().push(tuple.clone());
            }
            TypeShape::Literal(_) => literal_bucket.push(tuple.clone()),
            TypeShape::Opaque => opaque_bucket.push(tuple.clone()),
        }
    }
    let mut clusters: Vec<TypeCluster> = Vec::new();
    for group in structural {
        let id = structural_cluster_id(&group);
        clusters.push(TypeCluster { id, shapes: group });
    }
    // Prefix clusters are sorted by id so output order is deterministic
    // despite HashMap iteration.
    let mut prefix_clusters: Vec<TypeCluster> = prefix_buckets
        .into_iter()
        .map(|(id, shapes)| TypeCluster { id, shapes })
        .collect();
    prefix_clusters.sort_by(|a, b| a.id.cmp(&b.id));
    clusters.extend(prefix_clusters);
    if !empty_bucket.is_empty() {
        clusters.push(TypeCluster {
            id: "empty".to_string(),
            shapes: empty_bucket,
        });
    }
    if !literal_bucket.is_empty() {
        clusters.push(TypeCluster {
            id: "lit".to_string(),
            shapes: literal_bucket,
        });
    }
    if !opaque_bucket.is_empty() {
        clusters.push(TypeCluster {
            id: "opaque".to_string(),
            shapes: opaque_bucket,
        });
    }
    clusters
}
/// Count how many distinct argument-shape fingerprints occur in `shapes`.
///
/// A fingerprint encodes the tuple arity plus a per-position tag: the
/// field list when a non-empty structural fingerprint exists ("s:"), else
/// the type name ("n:"), the literal kind ("l:"), or an opaque marker.
fn count_distinct_fingerprints(shapes: &[Vec<TypeShape>]) -> usize {
    use std::collections::HashSet;
    let fingerprint = |tuple: &Vec<TypeShape>| -> String {
        let parts: Vec<String> = tuple
            .iter()
            .map(|shape| match shape {
                TypeShape::Named {
                    fields: Some(fs), ..
                } if !fs.is_empty() => format!("s:{}", fs.join(",")),
                TypeShape::Named { name, .. } => format!("n:{}", name),
                TypeShape::Literal(kind) => format!("l:{}", kind),
                TypeShape::Opaque => "o".to_string(),
            })
            .collect();
        format!("arity{}:{}", tuple.len(), parts.join("|"))
    };
    shapes.iter().map(fingerprint).collect::<HashSet<_>>().len()
}
/// Jaccard similarity of two sorted, deduplicated string slices.
///
/// Computed as a single linear merge over both slices, so callers must
/// pass sorted input (fingerprints are sorted where they are built). Two
/// empty sets are deliberately scored 0.0 rather than the conventional
/// 1.0, so field-less shapes never look structurally identical.
fn jaccard(a: &[String], b: &[String]) -> f64 {
    if a.is_empty() && b.is_empty() {
        return 0.0;
    }
    let mut left = a.iter().peekable();
    let mut right = b.iter().peekable();
    let mut intersection: usize = 0;
    let mut union_size: usize = 0;
    loop {
        match (left.peek(), right.peek()) {
            (None, None) => break,
            (Some(_), None) => {
                union_size += 1;
                left.next();
            }
            (None, Some(_)) => {
                union_size += 1;
                right.next();
            }
            (Some(x), Some(y)) => {
                union_size += 1;
                match x.cmp(y) {
                    std::cmp::Ordering::Equal => {
                        intersection += 1;
                        left.next();
                        right.next();
                    }
                    std::cmp::Ordering::Less => {
                        left.next();
                    }
                    std::cmp::Ordering::Greater => {
                        right.next();
                    }
                }
            }
        }
    }
    // union_size > 0 here: at least one slice was non-empty.
    intersection as f64 / union_size as f64
}
/// Mean of the per-position similarities of two argument-shape tuples.
///
/// Tuples of different arity score 0.0 (never similar); two empty tuples
/// score 1.0 (trivially identical).
fn mean_pairwise_jaccard(a: &[TypeShape], b: &[TypeShape]) -> f64 {
    if a.len() != b.len() {
        return 0.0;
    }
    if a.is_empty() {
        return 1.0;
    }
    let total: f64 = a
        .iter()
        .zip(b)
        .map(|(x, y)| position_jaccard(x, y))
        .sum();
    total / a.len() as f64
}
/// Similarity of two shapes occupying the same argument position.
///
/// When both sides carry a non-empty structural fingerprint the score is
/// the Jaccard similarity of the field lists; otherwise it is an exact
/// equality check (name vs name, literal kind vs literal kind) yielding
/// 1.0 or 0.0. Mismatched variants never match.
fn position_jaccard(a: &TypeShape, b: &TypeShape) -> f64 {
    match (a, b) {
        (
            TypeShape::Named {
                fields: Some(fa), ..
            },
            TypeShape::Named {
                fields: Some(fb), ..
            },
        ) if !fa.is_empty() && !fb.is_empty() => jaccard(fa, fb),
        (TypeShape::Named { name: na, .. }, TypeShape::Named { name: nb, .. })
        | (TypeShape::Literal(na), TypeShape::Literal(nb)) => {
            if na == nb {
                1.0
            } else {
                0.0
            }
        }
        (TypeShape::Opaque, TypeShape::Opaque) => 1.0,
        _ => 0.0,
    }
}
/// Derive a readable id for a structural cluster from the distinct
/// first-argument type names it contains (sorted and deduplicated).
fn structural_cluster_id(group: &[Vec<TypeShape>]) -> String {
    let mut names: Vec<&str> = Vec::new();
    for tuple in group {
        if let Some(TypeShape::Named { name, .. }) = tuple.first() {
            names.push(name.as_str());
        }
    }
    names.sort_unstable();
    names.dedup();
    if names.is_empty() {
        return "struct".to_string();
    }
    format!("struct_{}", names.join("_"))
}
/// Infer a [`TypeShape`] for one macro-call argument from its AST node.
///
/// Heuristics (best-effort, syntax-only):
/// - an identifier starting with an ASCII uppercase letter is treated as a
///   type-like value and resolved through `type_registry` for a field
///   fingerprint; lowercase identifiers become `Opaque`;
/// - `new Callee(...)` with a plain identifier callee uses the callee name;
/// - literals map to a kind string, with template literals treated as
///   "string";
/// - any other expression form becomes `Opaque`.
pub fn extract_type_shape(
    arg: &oxc::ast::ast::Argument<'_>,
    type_registry: Option<&TypeRegistry>,
) -> TypeShape {
    use oxc::ast::ast::Expression;
    // Non-expression arguments (presumably spread elements — confirm
    // against the oxc `Argument` variants) carry no usable shape.
    let Some(expr) = arg.as_expression() else {
        return TypeShape::Opaque;
    };
    match expr {
        Expression::Identifier(ident) => {
            let name = ident.name.as_str();
            // Uppercase-first identifiers are assumed to name a type or
            // constructor-like value.
            if name.chars().next().is_some_and(|c| c.is_ascii_uppercase()) {
                named_with_fingerprint(name, type_registry)
            } else {
                TypeShape::Opaque
            }
        }
        Expression::NewExpression(new_expr) => {
            if let Expression::Identifier(ident) = &new_expr.callee {
                named_with_fingerprint(ident.name.as_str(), type_registry)
            } else {
                TypeShape::Opaque
            }
        }
        Expression::StringLiteral(_) => TypeShape::Literal("string".into()),
        Expression::NumericLiteral(_) => TypeShape::Literal("number".into()),
        Expression::BooleanLiteral(_) => TypeShape::Literal("boolean".into()),
        Expression::NullLiteral(_) => TypeShape::Literal("null".into()),
        Expression::BigIntLiteral(_) => TypeShape::Literal("bigint".into()),
        Expression::TemplateLiteral(_) => TypeShape::Literal("string".into()),
        _ => TypeShape::Opaque,
    }
}
/// Build a [`TypeShape::Named`] for `name`, attaching a field fingerprint
/// when the registry knows the type's definition.
fn named_with_fingerprint(name: &str, type_registry: Option<&TypeRegistry>) -> TypeShape {
    let fields = match type_registry.and_then(|reg| reg.get(name)) {
        Some(entry) => extract_fingerprint_fields(&entry.definition, type_registry),
        None => None,
    };
    TypeShape::Named {
        name: name.to_owned(),
        fields,
    }
}
/// Derive a sorted, deduplicated member-name fingerprint from a type
/// definition, following at most one level of type-alias indirection.
///
/// Returns `None` when the definition yields no member names, letting the
/// caller fall back to name-based clustering.
fn extract_fingerprint_fields(
    def: &TypeDefinitionIR,
    type_registry: Option<&TypeRegistry>,
) -> Option<Vec<String>> {
    match def {
        TypeDefinitionIR::Class(class) => {
            normalize_fingerprint(class.fields.iter().map(|f| f.name.clone()).collect())
        }
        TypeDefinitionIR::Interface(iface) => {
            normalize_fingerprint(iface.fields.iter().map(|f| f.name.clone()).collect())
        }
        TypeDefinitionIR::Enum(enum_ir) => {
            normalize_fingerprint(enum_ir.variants.iter().map(|v| v.name.clone()).collect())
        }
        TypeDefinitionIR::TypeAlias(alias) => {
            // Inline object type: fingerprint its members directly.
            if let Some(members) = alias.body.as_object() {
                let fields =
                    normalize_fingerprint(members.iter().map(|m| m.name.clone()).collect());
                if fields.is_some() {
                    return fields;
                }
            }
            // Alias of another named type: resolve exactly one hop. The
            // `_direct` variant does not recurse into aliases, so alias
            // chains (and cycles) stop here.
            if let Some(target_name) = alias.body.as_alias() {
                return type_registry
                    .and_then(|reg| reg.get(target_name))
                    .and_then(|entry| extract_fingerprint_fields_direct(&entry.definition));
            }
            None
        }
    }
}

/// Sort + dedup a list of member names; an empty list becomes `None`.
fn normalize_fingerprint(mut names: Vec<String>) -> Option<Vec<String>> {
    names.sort_unstable();
    names.dedup();
    if names.is_empty() {
        None
    } else {
        Some(names)
    }
}
/// Like [`extract_fingerprint_fields`], but never follows type aliases.
/// Used as the single-hop target when an alias is resolved, so alias
/// chains (and cycles) terminate after one level.
fn extract_fingerprint_fields_direct(def: &TypeDefinitionIR) -> Option<Vec<String>> {
    // Shared finishing step: sort + dedup, empty lists become None.
    let finish = |mut names: Vec<String>| {
        names.sort_unstable();
        names.dedup();
        if names.is_empty() {
            None
        } else {
            Some(names)
        }
    };
    match def {
        TypeDefinitionIR::Class(class) => {
            finish(class.fields.iter().map(|f| f.name.clone()).collect())
        }
        TypeDefinitionIR::Interface(iface) => {
            finish(iface.fields.iter().map(|f| f.name.clone()).collect())
        }
        TypeDefinitionIR::Enum(enum_ir) => {
            finish(enum_ir.variants.iter().map(|v| v.name.clone()).collect())
        }
        TypeDefinitionIR::TypeAlias(_) => None,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ts_syn::declarative::{MacroArm, MacroDef};
    // NOTE: every `analyze(&reg, ...)` call below repairs an HTML-entity
    // mojibake in the previous revision, where `&reg` had been decoded to
    // the `®` character and broke compilation.

    /// Minimal macro definition in the given mode; runtime body and call
    /// arms are placeholders so the registry accepts it.
    fn fake_def(name: &str, mode: MacroMode) -> MacroDef {
        let mut def = MacroDef::from_arms(
            name.to_string(),
            Vec::<MacroArm>::new(),
            mode,
            SpanIR::new(0, 0),
        );
        def.runtime = Some(format!("function __{}(x) {{ return x; }}", name));
        def.call_arms = Some(Vec::new());
        def
    }
    /// Single-argument call site helper.
    fn site(macro_name: &str, shape: TypeShape) -> ResolvedCallSite {
        ResolvedCallSite {
            macro_name: macro_name.to_string(),
            call_span: SpanIR::new(0, 0),
            arg_shapes: vec![shape],
        }
    }
    /// Multi-argument call site helper.
    fn multi_arg_site(macro_name: &str, shapes: Vec<TypeShape>) -> ResolvedCallSite {
        ResolvedCallSite {
            macro_name: macro_name.to_string(),
            call_span: SpanIR::new(0, 0),
            arg_shapes: shapes,
        }
    }
    /// Named shape with a sorted field fingerprint.
    fn named_with_fields(name: &str, fields: &[&str]) -> TypeShape {
        let mut fields: Vec<String> = fields.iter().map(|s| s.to_string()).collect();
        fields.sort();
        TypeShape::Named {
            name: name.to_string(),
            fields: Some(fields),
        }
    }
    #[test]
    fn analyze_monomorphic_share() {
        let mut reg = DeclarativeMacroRegistry::new();
        reg.register(fake_def("serialize", MacroMode::Auto))
            .unwrap();
        let sites = vec![
            site("serialize", TypeShape::named("User")),
            site("serialize", TypeShape::named("User")),
            site("serialize", TypeShape::named("User")),
        ];
        let report = analyze(&reg, &sites, 4);
        let info = report.lookup("serialize").unwrap();
        assert_eq!(info.distinct_shapes, 1);
        assert_eq!(info.recommendation, Recommendation::Share);
    }
    #[test]
    fn analyze_at_threshold_still_share() {
        let mut reg = DeclarativeMacroRegistry::new();
        reg.register(fake_def("serialize", MacroMode::Auto))
            .unwrap();
        let sites = vec![
            site("serialize", TypeShape::named("User")),
            site("serialize", TypeShape::named("Admin")),
            site("serialize", TypeShape::named("Guest")),
            site("serialize", TypeShape::named("Bot")),
        ];
        let report = analyze(&reg, &sites, 4);
        let info = report.lookup("serialize").unwrap();
        assert_eq!(info.distinct_shapes, 4);
        assert_eq!(info.recommendation, Recommendation::Share);
    }
    #[test]
    fn analyze_above_threshold_clusters_by_first_letter_fallback() {
        let mut reg = DeclarativeMacroRegistry::new();
        reg.register(fake_def("serialize", MacroMode::Auto))
            .unwrap();
        let sites = vec![
            site("serialize", TypeShape::named("User")),
            site("serialize", TypeShape::named("Admin")),
            site("serialize", TypeShape::named("Alice")),
            site("serialize", TypeShape::named("Bob")),
            site("serialize", TypeShape::named("Guest")),
        ];
        let report = analyze(&reg, &sites, 4);
        let info = report.lookup("serialize").unwrap();
        assert_eq!(info.distinct_shapes, 5);
        let Recommendation::Cluster(clusters) = &info.recommendation else {
            panic!("expected Cluster, got {:?}", info.recommendation);
        };
        assert_eq!(clusters.len(), 4);
        let a = clusters.iter().find(|c| c.id == "a").unwrap();
        assert_eq!(a.shapes.len(), 2);
    }
    #[test]
    fn analyze_force_expand_when_cluster_still_megamorphic() {
        let mut reg = DeclarativeMacroRegistry::new();
        reg.register(fake_def("serialize", MacroMode::Auto))
            .unwrap();
        let sites = vec![
            site("serialize", TypeShape::named("User1")),
            site("serialize", TypeShape::named("User2")),
            site("serialize", TypeShape::named("User3")),
            site("serialize", TypeShape::named("User4")),
            site("serialize", TypeShape::named("User5")),
            site("serialize", TypeShape::named("User6")),
        ];
        let report = analyze(&reg, &sites, 4);
        let info = report.lookup("serialize").unwrap();
        assert_eq!(info.recommendation, Recommendation::ForceExpand);
    }
    #[test]
    fn analyze_respects_per_macro_threshold() {
        let mut reg = DeclarativeMacroRegistry::new();
        let mut def = fake_def("serialize", MacroMode::Auto);
        def.megamorphism_threshold = 2;
        reg.register(def).unwrap();
        let sites = vec![
            site("serialize", TypeShape::named("User")),
            site("serialize", TypeShape::named("Admin")),
            site("serialize", TypeShape::named("Guest")),
        ];
        let report = analyze(&reg, &sites, 4);
        let info = report.lookup("serialize").unwrap();
        assert!(matches!(info.recommendation, Recommendation::Cluster(_)));
    }
    #[test]
    fn analyze_ignores_non_auto_macros() {
        let mut reg = DeclarativeMacroRegistry::new();
        reg.register(fake_def("expand_only", MacroMode::ExpandOnly))
            .unwrap();
        reg.register(fake_def("share_only", MacroMode::ShareOnly))
            .unwrap();
        reg.register(fake_def("auto", MacroMode::Auto)).unwrap();
        let sites = vec![
            site("expand_only", TypeShape::named("X")),
            site("share_only", TypeShape::named("X")),
            site("auto", TypeShape::named("X")),
        ];
        let report = analyze(&reg, &sites, 4);
        assert_eq!(report.per_macro.len(), 1);
        assert!(report.lookup("auto").is_some());
    }
    #[test]
    fn jaccard_identical_sets() {
        let a = vec!["id".into(), "name".into()];
        let b = vec!["id".into(), "name".into()];
        assert!((jaccard(&a, &b) - 1.0).abs() < 1e-9);
    }
    #[test]
    fn jaccard_disjoint_sets() {
        let a = vec!["id".into(), "name".into()];
        let b = vec!["price".into(), "qty".into()];
        assert!(jaccard(&a, &b) < 1e-9);
    }
    #[test]
    fn jaccard_partial_overlap() {
        let a = vec!["email".to_string(), "id".to_string(), "name".to_string()];
        let b = vec!["id".to_string(), "name".to_string(), "phone".to_string()];
        let j = jaccard(&a, &b);
        assert!((j - 0.5).abs() < 1e-9, "expected 0.5, got {}", j);
    }
    /// Wrap a single shape as a one-element argument tuple.
    fn tuple(shape: TypeShape) -> Vec<TypeShape> {
        vec![shape]
    }
    #[test]
    fn cluster_shapes_groups_identical_fingerprints() {
        let shapes = vec![
            tuple(named_with_fields("User", &["id", "name", "email"])),
            tuple(named_with_fields("Admin", &["id", "name", "email"])),
        ];
        let clusters = cluster_shapes(&shapes, 4);
        assert_eq!(
            clusters.len(),
            1,
            "identical fields should collapse to one cluster, got: {:?}",
            clusters
        );
        assert_eq!(clusters[0].shapes.len(), 2);
    }
    #[test]
    fn cluster_shapes_groups_high_overlap() {
        let shapes = vec![
            tuple(named_with_fields(
                "User",
                &["id", "name", "email", "phone", "address"],
            )),
            tuple(named_with_fields(
                "Contact",
                &["id", "name", "email", "phone", "company"],
            )),
        ];
        let clusters = cluster_shapes(&shapes, 4);
        assert_eq!(clusters.len(), 1);
    }
    #[test]
    fn cluster_shapes_splits_low_overlap() {
        let shapes = vec![
            tuple(named_with_fields(
                "User",
                &["id", "name", "email", "phone", "address"],
            )),
            tuple(named_with_fields(
                "Order",
                &["id", "total", "status", "items", "customer"],
            )),
        ];
        let clusters = cluster_shapes(&shapes, 4);
        assert_eq!(clusters.len(), 2);
    }
    #[test]
    fn cluster_shapes_falls_back_to_prefix_without_fingerprint() {
        let shapes = vec![
            tuple(TypeShape::named("Alice")),
            tuple(TypeShape::named("Admin")),
            tuple(TypeShape::named("Bob")),
        ];
        let clusters = cluster_shapes(&shapes, 4);
        assert_eq!(clusters.len(), 2);
        let a = clusters.iter().find(|c| c.id == "a").unwrap();
        assert_eq!(a.shapes.len(), 2);
        let b = clusters.iter().find(|c| c.id == "b").unwrap();
        assert_eq!(b.shapes.len(), 1);
    }
    #[test]
    fn cluster_shapes_mixes_structural_and_prefix_paths() {
        let shapes = vec![
            tuple(named_with_fields("User", &["id", "name"])),
            tuple(named_with_fields("Person", &["id", "name"])),
            tuple(TypeShape::named("Order")),
        ];
        let clusters = cluster_shapes(&shapes, 4);
        assert_eq!(clusters.len(), 2);
        let sizes: Vec<usize> = clusters.iter().map(|c| c.shapes.len()).collect();
        assert!(sizes.contains(&2));
        assert!(sizes.contains(&1));
    }
    #[test]
    fn analyze_structurally_clusters_diverse_names_same_shape() {
        let mut reg = DeclarativeMacroRegistry::new();
        reg.register(fake_def("serialize", MacroMode::Auto))
            .unwrap();
        let sites = vec![
            site("serialize", named_with_fields("Alpha", &["id", "name"])),
            site("serialize", named_with_fields("Bravo", &["id", "name"])),
            site("serialize", named_with_fields("Charlie", &["id", "name"])),
            site("serialize", named_with_fields("Delta", &["id", "name"])),
            site("serialize", named_with_fields("Echo", &["id", "name"])),
        ];
        let report = analyze(&reg, &sites, 4);
        let info = report.lookup("serialize").unwrap();
        let Recommendation::Cluster(clusters) = &info.recommendation else {
            panic!(
                "expected Cluster (structural grouping), got {:?}",
                info.recommendation
            );
        };
        assert_eq!(
            clusters.len(),
            1,
            "identical fingerprints should collapse to one cluster: {:?}",
            clusters
        );
    }
    #[test]
    fn analyze_two_arg_monomorphic_shares() {
        let mut reg = DeclarativeMacroRegistry::new();
        reg.register(fake_def("serialize", MacroMode::Auto))
            .unwrap();
        let sites = vec![
            multi_arg_site(
                "serialize",
                vec![TypeShape::named("User"), TypeShape::named("Order")],
            ),
            multi_arg_site(
                "serialize",
                vec![TypeShape::named("User"), TypeShape::named("Order")],
            ),
            multi_arg_site(
                "serialize",
                vec![TypeShape::named("User"), TypeShape::named("Order")],
            ),
        ];
        let report = analyze(&reg, &sites, 4);
        let info = report.lookup("serialize").unwrap();
        assert_eq!(info.distinct_shapes, 1);
        assert_eq!(info.recommendation, Recommendation::Share);
    }
    #[test]
    fn analyze_two_arg_divergent_second_position_clusters() {
        let mut reg = DeclarativeMacroRegistry::new();
        let mut def = fake_def("serialize", MacroMode::Auto);
        def.megamorphism_threshold = 1;
        reg.register(def).unwrap();
        let sites = vec![
            multi_arg_site(
                "serialize",
                vec![TypeShape::named("User"), TypeShape::named("Order")],
            ),
            multi_arg_site(
                "serialize",
                vec![TypeShape::named("User"), TypeShape::named("Invoice")],
            ),
            multi_arg_site(
                "serialize",
                vec![TypeShape::named("User"), TypeShape::named("Product")],
            ),
        ];
        let report = analyze(&reg, &sites, 4);
        let info = report.lookup("serialize").unwrap();
        assert_eq!(
            info.distinct_shapes, 3,
            "three distinct tuples expected, got {}",
            info.distinct_shapes
        );
        assert_eq!(
            info.recommendation,
            Recommendation::ForceExpand,
            "divergent second-argument shapes should force expand"
        );
    }
    #[test]
    fn analyze_arity_mismatch_does_not_cluster_together() {
        let mut reg = DeclarativeMacroRegistry::new();
        let mut def = fake_def("serialize", MacroMode::Auto);
        def.megamorphism_threshold = 1;
        reg.register(def).unwrap();
        let sites = vec![
            multi_arg_site("serialize", vec![TypeShape::named("User")]),
            multi_arg_site(
                "serialize",
                vec![TypeShape::named("User"), TypeShape::named("Order")],
            ),
        ];
        let report = analyze(&reg, &sites, 4);
        let info = report.lookup("serialize").unwrap();
        assert_eq!(info.distinct_shapes, 2);
        match &info.recommendation {
            Recommendation::Cluster(clusters) => {
                for cluster in clusters {
                    let arities: std::collections::HashSet<usize> =
                        cluster.shapes.iter().map(|t| t.len()).collect();
                    assert_eq!(
                        arities.len(),
                        1,
                        "cluster `{}` mixed arities: {:?}",
                        cluster.id,
                        cluster.shapes
                    );
                }
            }
            Recommendation::ForceExpand => {}
            other => panic!("unexpected recommendation: {:?}", other),
        }
    }
    #[test]
    fn mean_pairwise_jaccard_arity_mismatch_is_zero() {
        let a = vec![named_with_fields("User", &["id", "name"])];
        let b = vec![
            named_with_fields("User", &["id", "name"]),
            named_with_fields("Order", &["id", "total"]),
        ];
        assert!((mean_pairwise_jaccard(&a, &b) - 0.0).abs() < 1e-9);
    }
    #[test]
    fn mean_pairwise_jaccard_identical_tuples_are_one() {
        let a = vec![
            named_with_fields("User", &["id", "name"]),
            named_with_fields("Order", &["id", "total"]),
        ];
        let b = a.clone();
        assert!((mean_pairwise_jaccard(&a, &b) - 1.0).abs() < 1e-9);
    }
    #[test]
    fn mean_pairwise_jaccard_partial_overlap_across_positions() {
        let a = vec![
            named_with_fields("User", &["id", "name"]),
            named_with_fields("OrderA", &["customer", "id", "items", "status", "total"]),
        ];
        let b = vec![
            named_with_fields("User", &["id", "name"]),
            named_with_fields("OrderB", &["extra", "id", "items", "status", "total"]),
        ];
        let score = mean_pairwise_jaccard(&a, &b);
        assert!(
            score >= JACCARD_THRESHOLD,
            "expected ≥ threshold, got {}",
            score
        );
    }
    #[test]
    fn mean_pairwise_jaccard_disjoint_tail_positions_fall_below_threshold() {
        let a = vec![
            named_with_fields("User", &["id", "name"]),
            named_with_fields("OrderA", &["a", "b", "c"]),
        ];
        let b = vec![
            named_with_fields("User", &["id", "name"]),
            named_with_fields("OrderB", &["x", "y", "z"]),
        ];
        let score = mean_pairwise_jaccard(&a, &b);
        assert!(
            score < JACCARD_THRESHOLD,
            "expected below threshold, got {}",
            score
        );
    }
    #[test]
    fn cluster_shapes_groups_tuples_via_mean_jaccard() {
        let shapes = vec![
            vec![
                named_with_fields("User", &["id", "name", "email"]),
                named_with_fields("OrderA", &["id", "total", "status", "customer", "items"]),
            ],
            vec![
                named_with_fields("User", &["id", "name", "email"]),
                named_with_fields("OrderB", &["id", "total", "status", "customer", "notes"]),
            ],
        ];
        let clusters = cluster_shapes(&shapes, 4);
        assert_eq!(
            clusters.len(),
            1,
            "expected pairwise-Jaccard to collapse both tuples into one cluster, got: {:?}",
            clusters
        );
    }
    #[test]
    fn cluster_shapes_splits_tuples_with_divergent_tails() {
        let shapes = vec![
            vec![
                named_with_fields("User", &["id", "name"]),
                named_with_fields("OrderA", &["a", "b", "c"]),
            ],
            vec![
                named_with_fields("User", &["id", "name"]),
                named_with_fields("OrderB", &["x", "y", "z"]),
            ],
        ];
        let clusters = cluster_shapes(&shapes, 4);
        assert_eq!(
            clusters.len(),
            2,
            "expected divergent tails to split: {:?}",
            clusters
        );
    }
}