lean_ctx/core/knowledge/
ranking.rs1use chrono::Utc;
2
3use super::types::{JudgedPair, KnowledgeFact};
4
5pub(super) fn confidence_stars(confidence: f32) -> &'static str {
6 if confidence >= 0.95 {
7 "★★★★★"
8 } else if confidence >= 0.85 {
9 "★★★★"
10 } else if confidence >= 0.7 {
11 "★★★"
12 } else if confidence >= 0.5 {
13 "★★"
14 } else {
15 "★"
16 }
17}
18
19pub(super) fn string_similarity(a: &str, b: &str) -> f32 {
20 let a_lower = a.to_lowercase();
21 let b_lower = b.to_lowercase();
22 let a_words: std::collections::HashSet<&str> = a_lower.split_whitespace().collect();
23 let b_words: std::collections::HashSet<&str> = b_lower.split_whitespace().collect();
24
25 if a_words.is_empty() && b_words.is_empty() {
26 return 1.0;
27 }
28
29 let intersection = a_words.intersection(&b_words).count();
30 let union = a_words.union(&b_words).count();
31
32 if union == 0 {
33 return 0.0;
34 }
35
36 intersection as f32 / union as f32
37}
38
39pub(super) fn sort_fact_for_output(a: &KnowledgeFact, b: &KnowledgeFact) -> std::cmp::Ordering {
40 salience_score(b)
41 .cmp(&salience_score(a))
42 .then_with(|| {
43 b.quality_score()
44 .partial_cmp(&a.quality_score())
45 .unwrap_or(std::cmp::Ordering::Equal)
46 })
47 .then_with(|| {
48 b.confidence
49 .partial_cmp(&a.confidence)
50 .unwrap_or(std::cmp::Ordering::Equal)
51 })
52 .then_with(|| b.confirmation_count.cmp(&a.confirmation_count))
53 .then_with(|| b.retrieval_count.cmp(&a.retrieval_count))
54 .then_with(|| b.last_retrieved.cmp(&a.last_retrieved))
55 .then_with(|| b.last_confirmed.cmp(&a.last_confirmed))
56 .then_with(|| a.category.cmp(&b.category))
57 .then_with(|| a.key.cmp(&b.key))
58 .then_with(|| a.value.cmp(&b.value))
59}
60
61fn salience_score(f: &KnowledgeFact) -> u32 {
69 let cat = f.category.to_lowercase();
70 let base: u32 = match cat.as_str() {
71 "decision" => 70,
72 "gotcha" => 75,
73 "architecture" | "arch" => 60,
74 "security" => 65,
75 "testing" | "tests" | "deployment" | "deploy" => 55,
76 "conventions" | "convention" => 45,
77 "finding" => 40,
78 _ => 30,
79 };
80
81 let quality_bonus = (f.quality_score() * 60.0) as u32;
82
83 let recency_bonus = f.last_retrieved.map_or(0u32, |t| {
84 let days = Utc::now().signed_duration_since(t).num_days();
85 if days <= 7 {
86 10u32
87 } else if days <= 30 {
88 5u32
89 } else {
90 0u32
91 }
92 });
93
94 let archetype_bonus = f.archetype.salience_bonus();
95
96 let fidelity_bonus = f
97 .fidelity
98 .as_ref()
99 .map_or(0u32, |fi| (fi.structural * 10.0) as u32);
100
101 base + quality_bonus + recency_bonus + archetype_bonus + fidelity_bonus
102}
103
104pub(super) fn hash_project_root(root: &str) -> String {
105 crate::core::project_hash::hash_project_root(root)
106}
107
108pub(super) fn tokenize_lower(s: &str) -> impl Iterator<Item = String> + '_ {
109 s.to_lowercase()
110 .split(|c: char| c.is_whitespace() || c == '-' || c == '_' || c == '/' || c == '.')
111 .filter(|t| !t.is_empty())
112 .map(String::from)
113 .collect::<Vec<_>>()
114 .into_iter()
115}
116
117pub(super) fn build_token_index(
118 facts: &[KnowledgeFact],
119 include_session: bool,
120) -> std::collections::HashMap<String, Vec<usize>> {
121 let mut index: std::collections::HashMap<String, Vec<usize>> = std::collections::HashMap::new();
122 for (i, f) in facts.iter().enumerate() {
123 for token in tokenize_lower(&f.category) {
124 index.entry(token).or_default().push(i);
125 }
126 for token in tokenize_lower(&f.key) {
127 index.entry(token).or_default().push(i);
128 }
129 for token in tokenize_lower(&f.value) {
130 index.entry(token).or_default().push(i);
131 }
132 if include_session {
133 for token in tokenize_lower(&f.source_session) {
134 index.entry(token).or_default().push(i);
135 }
136 }
137 }
138 for indices in index.values_mut() {
139 indices.sort_unstable();
140 indices.dedup();
141 }
142 index
143}
144
/// A stored fact whose value resembles a newly proposed value, as reported by
/// `find_cross_key_similar`.
#[derive(Clone, Debug)]
pub struct SimilarFact {
    /// Category of the matching fact.
    pub category: String,
    /// Key of the matching fact.
    pub key: String,
    /// The matching fact's value, shortened with a trailing `...` when long.
    pub value_preview: String,
    /// Similarity between the new value and the matching fact's value.
    pub similarity: f32,
}
152
153pub fn find_cross_key_similar(
154 new_category: &str,
155 new_key: &str,
156 new_value: &str,
157 all_facts: &[KnowledgeFact],
158 judged_pairs: &[JudgedPair],
159 limit: usize,
160) -> Vec<SimilarFact> {
161 let composite_key = format!("{new_category}/{new_key}");
162 let mut results: Vec<SimilarFact> = Vec::new();
163
164 for f in all_facts {
165 if !f.is_current() {
166 continue;
167 }
168 let other_key = format!("{}/{}", f.category, f.key);
169 if other_key == composite_key {
170 continue;
171 }
172
173 let already_judged = judged_pairs.iter().any(|jp| {
174 (jp.key_a == composite_key && jp.key_b == other_key)
175 || (jp.key_a == other_key && jp.key_b == composite_key)
176 });
177 if already_judged {
178 continue;
179 }
180
181 let sim = string_similarity(new_value, &f.value);
182 if sim > 0.35 {
183 let preview = if f.value.len() > 60 {
184 format!("{}...", &f.value[..57])
185 } else {
186 f.value.clone()
187 };
188 results.push(SimilarFact {
189 category: f.category.clone(),
190 key: f.key.clone(),
191 value_preview: preview,
192 similarity: sim,
193 });
194 }
195 }
196
197 results.sort_by(|a, b| {
198 b.similarity
199 .partial_cmp(&a.similarity)
200 .unwrap_or(std::cmp::Ordering::Equal)
201 });
202 results.truncate(limit);
203 results
204}
205
206pub(super) fn fact_version_id_v1(f: &KnowledgeFact) -> String {
207 use md5::{Digest, Md5};
208 let mut hasher = Md5::new();
209 hasher.update(f.category.as_bytes());
210 hasher.update(b"\n");
211 hasher.update(f.key.as_bytes());
212 hasher.update(b"\n");
213 hasher.update(f.value.as_bytes());
214 hasher.update(b"\n");
215 hasher.update(f.source_session.as_bytes());
216 hasher.update(b"\n");
217 hasher.update(f.created_at.to_rfc3339().as_bytes());
218 format!("{:x}", hasher.finalize())
219}