1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
// SemanticSimilarity implementation: keyword-based similarity scoring with
// stopword filtering, weighted matching, and semantic keyword boosting.
impl SemanticSimilarity {
/// Create new similarity calculator
pub fn new() -> Self {
let stopwords = vec![
"the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with",
"by", "from", "as", "is", "was", "are", "were", "be", "been", "being", "have", "has",
"had", "do", "does", "did", "will", "would", "should", "could", "may", "might", "must",
"can", "cannot",
]
.into_iter()
.map(|s| s.to_string())
.collect();
Self { stopwords }
}
/// Calculate similarity between claim and fact (0.0 - 1.0)
///
/// Uses enhanced keyword-based similarity with:
/// - Stopword filtering
/// - Weighted matching (exact > partial)
/// - Semantic keyword boosting
pub fn calculate(&self, claim: &str, fact: &str) -> f32 {
let claim_lower = claim.to_lowercase();
let fact_lower = fact.to_lowercase();
// Extract meaningful keywords (filter stopwords)
let claim_words = self.extract_keywords(&claim_lower);
let fact_words = self.extract_keywords(&fact_lower);
if claim_words.is_empty() || fact_words.is_empty() {
return 0.0;
}
// Calculate weighted similarity
let mut score = 0.0;
let mut total_weight = 0.0;
for claim_word in &claim_words {
let weight = self.get_word_weight(claim_word);
total_weight += weight;
// Exact match
if fact_words.contains(claim_word) {
score += weight;
}
// Partial match (substring)
else if fact_words
.iter()
.any(|fw| fw.contains(claim_word.as_str()) || claim_word.contains(fw))
{
score += weight * 0.5;
}
}
if total_weight == 0.0 {
return 0.0;
}
// Normalize to 0.0-1.0 range
let base_score = score / total_weight;
// Boost score if key semantic keywords match
let boost = self.semantic_keyword_boost(&claim_lower, &fact_lower);
// Combine base score with boost (capped at 1.0)
(base_score + boost).min(1.0)
}
/// Extract meaningful keywords (filter stopwords)
fn extract_keywords(&self, text: &str) -> Vec<String> {
text.split_whitespace()
.filter(|word| !self.stopwords.contains(&word.to_string()))
.map(|s| s.to_string())
.collect()
}
/// Get weight for a word (higher weight for important words)
fn get_word_weight(&self, word: &str) -> f32 {
// Technical terms get higher weight
match word {
// Language names
"rust" | "typescript" | "javascript" | "python" | "c" | "cpp" | "go" | "java"
| "kotlin" | "ruby" | "php" | "swift" | "haskell" => 3.0,
// Action verbs (capabilities)
"analyze" | "analyzes" | "analyzing" | "analysis" => 2.5,
"compile" | "compiles" | "compiling" | "compilation" => 2.5,
"support" | "supports" | "supporting" | "supported" => 2.0,
"detect" | "detects" | "detecting" | "detection" => 2.0,
"generate" | "generates" | "generating" => 2.0,
// Technical nouns
"complexity" | "metrics" | "code" | "files" | "functions" => 1.5,
"pmat" => 1.0, // Tool name is neutral
_ => 1.0, // Default weight
}
}
/// Calculate semantic keyword boost
fn semantic_keyword_boost(&self, claim: &str, fact: &str) -> f32 {
let mut boost = 0.0;
// Check for explicit contradictions first (highest priority)
// Pattern: claim says "can X" but fact says "does not X" or "cannot X"
let action_verbs = ["compile", "compiles", "analyze", "support", "generate"];
for verb in &action_verbs {
// Claim is positive about verb, fact is negative
if claim.contains(verb)
&& !claim.contains("cannot")
&& !claim.contains("does not")
&& (fact.contains(&format!("does not {}", verb))
|| fact.contains(&format!("cannot {}", verb))
|| fact.contains(&format!("not {}", verb))
|| (fact.contains(verb) && (fact.contains("but not") || fact.contains("only"))))
{
// CONTRADICTION: claim positive, fact negative
return -0.8; // Strong negative boost
}
// Both agree on capability
if claim.contains(verb) && fact.contains(verb) {
// Check if both are positive or both are negative
let claim_negative = claim.contains("cannot") || claim.contains("does not");
let fact_negative = fact.contains("cannot")
|| fact.contains("does not")
|| fact.contains("but not");
if claim_negative == fact_negative {
boost += 0.3; // Both agree
}
}
}
// Language matching (high boost for exact match)
let languages = ["rust", "typescript", "javascript", "python", "c", "cpp"];
for lang in &languages {
if claim.contains(lang) && fact.contains(lang) {
boost += 0.4;
break;
}
}
// Complexity/metrics matching
if (claim.contains("complexity") && fact.contains("complexity"))
|| (claim.contains("metrics") && fact.contains("metrics"))
{
boost += 0.2;
}
boost
}
}
impl Default for SemanticSimilarity {
fn default() -> Self {
Self::new()
}
}