use std::collections::HashSet;
use crate::diagnostics::{Diagnostic, Severity, C001, C002, C003};
use crate::prompt::{estimate_tokens, SkillEntry};
/// Similarity threshold that `detect_conflicts` passes on when the caller does not
/// supply one; description pairs scoring at or above it are reported as overlapping.
const DEFAULT_SIMILARITY_THRESHOLD: f64 = 0.7;
/// Summed token estimate across all entries above which the budget warning is emitted.
const TOKEN_BUDGET_THRESHOLD: usize = 4000;
/// Runs every conflict check over `entries` using [`DEFAULT_SIMILARITY_THRESHOLD`].
///
/// This is a convenience wrapper around [`detect_conflicts_with_threshold`]; the
/// returned diagnostics are exactly what that function produces for the default
/// threshold.
#[must_use]
pub fn detect_conflicts(entries: &[SkillEntry]) -> Vec<Diagnostic> {
    let threshold = DEFAULT_SIMILARITY_THRESHOLD;
    detect_conflicts_with_threshold(entries, threshold)
}
/// Runs every conflict check over `entries` with a caller-chosen similarity threshold.
///
/// Diagnostics are returned in a fixed order: name collisions first, then
/// description-similarity findings, then the token-budget finding (if any).
///
/// `similarity_threshold` is forwarded unchanged to the description-similarity
/// check, which reports a pair when its similarity is greater than or equal to it.
#[must_use]
pub fn detect_conflicts_with_threshold(
    entries: &[SkillEntry],
    similarity_threshold: f64,
) -> Vec<Diagnostic> {
    check_name_collisions(entries)
        .into_iter()
        .chain(check_description_similarity(entries, similarity_threshold))
        .chain(check_token_budget(entries))
        .collect()
}
/// Emits one `C001` warning for every entry whose name was already seen earlier
/// in `entries`.
///
/// The first occurrence of a name is never reported; each later occurrence
/// produces its own diagnostic, so a name that appears three times yields two
/// warnings.
fn check_name_collisions(entries: &[SkillEntry]) -> Vec<Diagnostic> {
    let mut known_names: HashSet<&str> = HashSet::new();
    entries
        .iter()
        // `insert` returns `false` when the name is already present, i.e. a repeat.
        .filter(|entry| !known_names.insert(entry.name.as_str()))
        .map(|entry| {
            let message = format!(
                "name collision: '{}' appears in multiple locations",
                entry.name
            );
            Diagnostic::new(Severity::Warning, C001, message)
                .with_field("name")
                .with_suggestion("Rename one of the conflicting skills")
        })
        .collect()
}
/// Emits a `C002` warning for every unordered pair of entries whose descriptions
/// have a Jaccard similarity greater than or equal to `threshold`.
///
/// Each description is tokenized once up front; pairs are then visited in
/// ascending order (earlier entry first), so the output order is deterministic.
fn check_description_similarity(entries: &[SkillEntry], threshold: f64) -> Vec<Diagnostic> {
    let word_sets: Vec<HashSet<String>> = entries
        .iter()
        .map(|entry| tokenize(&entry.description))
        .collect();

    let mut overlaps = Vec::new();
    for (first_idx, (first, first_words)) in entries.iter().zip(&word_sets).enumerate() {
        // Only compare against entries that come later, so each pair is seen once.
        let later_entries = entries.iter().zip(&word_sets).skip(first_idx + 1);
        for (second, second_words) in later_entries {
            let similarity = jaccard_from_sets(first_words, second_words);
            if similarity >= threshold {
                let percent = similarity * 100.0;
                let message = format!(
                    "description overlap ({:.0}%): '{}' and '{}'",
                    percent, first.name, second.name,
                );
                overlaps.push(
                    Diagnostic::new(Severity::Warning, C002, message)
                        .with_field("description")
                        .with_suggestion(
                            "Differentiate descriptions to avoid activation conflicts",
                        ),
                );
            }
        }
    }
    overlaps
}
/// Emits a single `C003` warning when the summed token estimate of all entries
/// is strictly greater than [`TOKEN_BUDGET_THRESHOLD`]; otherwise returns an
/// empty list.
fn check_token_budget(entries: &[SkillEntry]) -> Vec<Diagnostic> {
    let total = entries.iter().map(estimate_entry_tokens).sum::<usize>();
    if total <= TOKEN_BUDGET_THRESHOLD {
        return Vec::new();
    }

    let message = format!(
        "total estimated tokens ({total}) exceed budget threshold ({TOKEN_BUDGET_THRESHOLD})"
    );
    let warning = Diagnostic::new(Severity::Warning, C003, message)
        .with_field("collection")
        .with_suggestion("Remove or consolidate skills to reduce token usage");
    vec![warning]
}
/// Returns the token estimate for one entry: the estimate for its name plus the
/// estimate for its description, each obtained from `estimate_tokens`.
fn estimate_entry_tokens(entry: &SkillEntry) -> usize {
    let name_tokens = estimate_tokens(&entry.name);
    let description_tokens = estimate_tokens(&entry.description);
    name_tokens + description_tokens
}
/// Splits `s` into a set of normalized words.
///
/// The input is split on whitespace; each piece has leading and trailing
/// non-alphanumeric characters trimmed and is lowercased. Pieces that are empty
/// after trimming are dropped. Punctuation inside a word (for example the hyphen
/// in `re-use`) is kept, because only the ends are trimmed.
fn tokenize(s: &str) -> HashSet<String> {
    let mut words = HashSet::new();
    for raw in s.split_whitespace() {
        let core = raw.trim_matches(|ch: char| !ch.is_alphanumeric());
        if !core.is_empty() {
            words.insert(core.to_lowercase());
        }
    }
    words
}
/// Computes the Jaccard similarity of two word sets: the number of shared
/// elements divided by the number of distinct elements across both sets.
///
/// Returns `0.0` when both sets are empty, so that case never divides by zero.
fn jaccard_from_sets(a: &HashSet<String>, b: &HashSet<String>) -> f64 {
    let shared = a.iter().filter(|word| b.contains(*word)).count();
    // |a ∪ b| = |a| + |b| - |a ∩ b|
    let combined = a.len() + b.len() - shared;
    match combined {
        0 => 0.0,
        total => shared as f64 / total as f64,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes both strings and returns the Jaccard similarity of the word sets.
    fn jaccard_similarity(a: &str, b: &str) -> f64 {
        let set_a = tokenize(a);
        let set_b = tokenize(b);
        jaccard_from_sets(&set_a, &set_b)
    }

    /// Builds a `SkillEntry` whose location is derived from its name.
    fn make_entry(name: &str, description: &str) -> SkillEntry {
        SkillEntry {
            name: name.to_string(),
            description: description.to_string(),
            location: format!("skills/{name}"),
        }
    }

    #[test]
    fn c001_duplicate_names() {
        let entries = vec![
            make_entry("my-skill", "First skill"),
            make_entry("my-skill", "Second skill"),
        ];
        let diags = detect_conflicts(&entries);
        assert!(
            diags.iter().any(|d| d.code == C001),
            "expected C001 for duplicate names, got: {diags:?}",
        );
    }

    #[test]
    fn c001_unique_names_no_collision() {
        let entries = vec![
            make_entry("skill-a", "First skill"),
            make_entry("skill-b", "Second skill"),
        ];
        let diags = detect_conflicts(&entries);
        assert!(
            !diags.iter().any(|d| d.code == C001),
            "expected no C001 for unique names, got: {diags:?}",
        );
    }

    #[test]
    fn c002_similar_descriptions() {
        let entries = vec![
            make_entry(
                "skill-a",
                "Processes PDF files and generates detailed reports",
            ),
            make_entry(
                "skill-b",
                "Processes PDF files and generates detailed summaries",
            ),
        ];
        let diags = detect_conflicts(&entries);
        assert!(
            diags.iter().any(|d| d.code == C002),
            "expected C002 for similar descriptions, got: {diags:?}",
        );
    }

    #[test]
    fn c002_distinct_descriptions_no_overlap() {
        let entries = vec![
            make_entry("skill-a", "Processes PDF files"),
            make_entry("skill-b", "Manages database connections"),
        ];
        let diags = detect_conflicts(&entries);
        assert!(
            !diags.iter().any(|d| d.code == C002),
            "expected no C002 for distinct descriptions, got: {diags:?}",
        );
    }

    #[test]
    fn c002_custom_threshold() {
        // One shared word out of three distinct words gives 1/3, which clears 0.3.
        let entries = vec![
            make_entry("skill-a", "Processes files"),
            make_entry("skill-b", "Processes documents"),
        ];
        let diags = detect_conflicts_with_threshold(&entries, 0.3);
        assert!(
            diags.iter().any(|d| d.code == C002),
            "expected C002 with low threshold, got: {diags:?}",
        );
    }

    #[test]
    fn c003_exceeds_budget() {
        let large_description = "word ".repeat(10000);
        let entries = vec![
            make_entry("skill-a", &large_description),
            make_entry("skill-b", &large_description),
        ];
        let diags = detect_conflicts(&entries);
        assert!(
            diags.iter().any(|d| d.code == C003),
            "expected C003 for large token budget, got: {diags:?}",
        );
    }

    #[test]
    fn c003_within_budget_no_warning() {
        let entries = vec![
            make_entry("skill-a", "A small skill"),
            make_entry("skill-b", "Another skill"),
        ];
        let diags = detect_conflicts(&entries);
        assert!(
            !diags.iter().any(|d| d.code == C003),
            "expected no C003 for small skills, got: {diags:?}",
        );
    }

    #[test]
    fn jaccard_identical_strings() {
        let sim = jaccard_similarity("hello world", "hello world");
        assert!((sim - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_completely_different() {
        let sim = jaccard_similarity("hello world", "foo bar");
        assert!(sim < f64::EPSILON);
    }

    #[test]
    fn jaccard_partial_overlap() {
        let sim = jaccard_similarity("hello world", "hello there");
        assert!((sim - 1.0 / 3.0).abs() < 0.01);
    }

    #[test]
    fn jaccard_case_insensitive() {
        let sim = jaccard_similarity("PDF Files", "pdf files");
        assert!(
            (sim - 1.0).abs() < f64::EPSILON,
            "expected 1.0 for case-only difference, got: {sim}",
        );
    }

    #[test]
    fn jaccard_empty_strings() {
        let sim = jaccard_similarity("", "");
        assert!(sim < f64::EPSILON);
    }

    #[test]
    fn all_conflict_diagnostics_are_warnings() {
        let entries = vec![
            make_entry("my-skill", "Processes PDF files"),
            make_entry("my-skill", "Processes PDF files"),
        ];
        let diags = detect_conflicts(&entries);
        // `all` is vacuously true on an empty list, so make sure something was produced.
        assert!(
            !diags.is_empty(),
            "expected at least one diagnostic for duplicated entries",
        );
        assert!(
            diags.iter().all(|d| d.is_warning()),
            "all conflict diagnostics should be warnings: {diags:?}",
        );
    }

    #[test]
    fn diagnostics_have_fields_and_suggestions() {
        let entries = vec![
            make_entry("my-skill", "Same description"),
            make_entry("my-skill", "Same description"),
        ];
        let diags = detect_conflicts(&entries);
        // `all` is vacuously true on an empty list, so make sure something was produced.
        assert!(
            !diags.is_empty(),
            "expected at least one diagnostic for duplicated entries",
        );
        assert!(
            diags.iter().all(|d| d.field.is_some()),
            "all diagnostics should have fields: {diags:?}",
        );
        assert!(
            diags.iter().all(|d| d.suggestion.is_some()),
            "all diagnostics should have suggestions: {diags:?}",
        );
    }

    #[test]
    fn empty_collection_no_conflicts() {
        let diags = detect_conflicts(&[]);
        assert!(diags.is_empty());
    }

    #[test]
    fn single_entry_no_conflicts() {
        let entries = vec![make_entry("my-skill", "A skill")];
        let diags = detect_conflicts(&entries);
        assert!(diags.is_empty());
    }

    #[test]
    fn tokenize_produces_expected_word_set() {
        let tokens = tokenize("Hello World hello");
        let expected: HashSet<String> = ["hello", "world"]
            .iter()
            .map(|s| (*s).to_string())
            .collect();
        assert_eq!(tokens, expected);
    }

    #[test]
    fn tokenize_strips_surrounding_punctuation() {
        // Only leading/trailing non-alphanumerics are trimmed; inner ones are kept.
        let tokens = tokenize("Hello, (world)! re-use --");
        let expected: HashSet<String> = ["hello", "world", "re-use"]
            .iter()
            .map(|s| (*s).to_string())
            .collect();
        assert_eq!(tokens, expected);
    }

    #[test]
    fn tokenize_empty_string_returns_empty_set() {
        let tokens = tokenize("");
        assert!(tokens.is_empty());
    }

    /// Checks both the string helper and the set-based function against
    /// hand-computed expectations.
    ///
    /// Comparing the two against each other alone proves nothing, because the
    /// helper is itself `tokenize` followed by `jaccard_from_sets`.
    #[test]
    fn jaccard_from_sets_matches_jaccard_similarity() {
        let cases = vec![
            ("hello world", "hello world", 1.0),
            ("hello world", "foo bar", 0.0),
            ("hello world", "hello there", 1.0 / 3.0),
            ("PDF Files", "pdf files", 1.0),
            ("", "", 0.0),
        ];
        for (a, b, expected) in cases {
            let from_str = jaccard_similarity(a, b);
            let set_a = tokenize(a);
            let set_b = tokenize(b);
            let from_sets = jaccard_from_sets(&set_a, &set_b);
            assert!(
                (from_str - from_sets).abs() < f64::EPSILON,
                "mismatch for ({a:?}, {b:?}): jaccard_similarity={from_str}, jaccard_from_sets={from_sets}",
            );
            assert!(
                (from_sets - expected).abs() < 1e-12,
                "unexpected similarity for ({a:?}, {b:?}): expected {expected}, got {from_sets}",
            );
        }
    }

    #[test]
    fn jaccard_from_sets_two_empty_sets_returns_zero() {
        let empty: HashSet<String> = HashSet::new();
        assert!(jaccard_from_sets(&empty, &empty) < f64::EPSILON);
    }
}