1use std::collections::HashMap;
17
/// Aggregated statistics for one unit of the index (a file or a directory).
///
/// `PartialEq` is derived so entries can be compared directly in assertions;
/// `Eq` is not available because the keyword weights are `f64`.
#[derive(Debug, Clone, PartialEq)]
pub struct ScaleEntry {
    /// Path of the file or directory this entry summarizes.
    pub path: String,
    /// `(keyword, weight)` pairs, ordered from highest to lowest weight.
    pub tfidf_keywords: Vec<(String, f64)>,
    /// Number of chunks that were aggregated into this entry.
    pub total_chunks: usize,
    /// Total token count divided by `total_chunks` (integer division; `0`
    /// when there are no chunks).
    pub avg_chunk_tokens: usize,
}
26
27pub struct MultiScaleIndex {
29 pub micro_chunk_count: usize,
31 pub meso_files: HashMap<String, ScaleEntry>,
33 pub macro_dirs: HashMap<String, ScaleEntry>,
35}
36
37impl MultiScaleIndex {
38 pub fn new() -> Self {
39 Self {
40 micro_chunk_count: 0,
41 meso_files: HashMap::new(),
42 macro_dirs: HashMap::new(),
43 }
44 }
45
46 pub fn build_from_chunks(chunks: &[super::bm25_index::CodeChunk]) -> Self {
48 let mut meso: HashMap<String, FileAccumulator> = HashMap::new();
49
50 for chunk in chunks {
52 let acc = meso.entry(chunk.file_path.clone()).or_default();
53 acc.chunk_count += 1;
54 acc.total_tokens += chunk.token_count;
55 for token in &chunk.tokens {
56 *acc.token_freqs.entry(token.to_lowercase()).or_insert(0) += 1;
57 }
58 }
59
60 let num_files = meso.len().max(1) as f64;
62 let mut doc_freqs: HashMap<String, usize> = HashMap::new();
63 for acc in meso.values() {
64 for token in acc.token_freqs.keys() {
65 *doc_freqs.entry(token.clone()).or_insert(0) += 1;
66 }
67 }
68
69 let mut meso_files = HashMap::new();
70 for (path, acc) in &meso {
71 let mut keywords: Vec<(String, f64)> = acc
72 .token_freqs
73 .iter()
74 .map(|(token, &tf)| {
75 let df = *doc_freqs.get(token).unwrap_or(&1) as f64;
76 let idf = (num_files / df).ln() + 1.0;
77 let tfidf = tf as f64 * idf;
78 (token.clone(), tfidf)
79 })
80 .collect();
81 keywords.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
82 keywords.truncate(20); meso_files.insert(
85 path.clone(),
86 ScaleEntry {
87 path: path.clone(),
88 tfidf_keywords: keywords,
89 total_chunks: acc.chunk_count,
90 avg_chunk_tokens: acc.total_tokens.checked_div(acc.chunk_count).unwrap_or(0),
91 },
92 );
93 }
94
95 let mut macro_acc: HashMap<String, FileAccumulator> = HashMap::new();
97 for (path, entry) in &meso_files {
98 let dir = parent_dir(path);
99 let acc = macro_acc.entry(dir).or_default();
100 acc.chunk_count += entry.total_chunks;
101 acc.total_tokens += entry.avg_chunk_tokens * entry.total_chunks;
102 for (kw, score) in &entry.tfidf_keywords {
103 *acc.token_freqs.entry(kw.clone()).or_insert(0) += *score as usize;
104 }
105 }
106
107 let mut macro_dirs = HashMap::new();
108 for (dir, acc) in ¯o_acc {
109 let mut keywords: Vec<(String, f64)> = acc
110 .token_freqs
111 .iter()
112 .map(|(token, &count)| (token.clone(), count as f64))
113 .collect();
114 keywords.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
115 keywords.truncate(30); macro_dirs.insert(
118 dir.clone(),
119 ScaleEntry {
120 path: dir.clone(),
121 tfidf_keywords: keywords,
122 total_chunks: acc.chunk_count,
123 avg_chunk_tokens: acc.total_tokens.checked_div(acc.chunk_count).unwrap_or(0),
124 },
125 );
126 }
127
128 Self {
129 micro_chunk_count: chunks.len(),
130 meso_files,
131 macro_dirs,
132 }
133 }
134
135 pub fn search_meso(&self, query_tokens: &[String], top_k: usize) -> Vec<(String, f64)> {
137 let mut scores: Vec<(String, f64)> = self
138 .meso_files
139 .iter()
140 .map(|(path, entry)| {
141 let score = query_match_score(query_tokens, &entry.tfidf_keywords);
142 (path.clone(), score)
143 })
144 .filter(|(_, s)| *s > 0.0)
145 .collect();
146
147 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
148 scores.truncate(top_k);
149 scores
150 }
151
152 pub fn search_macro(&self, query_tokens: &[String], top_k: usize) -> Vec<(String, f64)> {
154 let mut scores: Vec<(String, f64)> = self
155 .macro_dirs
156 .iter()
157 .map(|(dir, entry)| {
158 let score = query_match_score(query_tokens, &entry.tfidf_keywords);
159 (dir.clone(), score)
160 })
161 .filter(|(_, s)| *s > 0.0)
162 .collect();
163
164 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
165 scores.truncate(top_k);
166 scores
167 }
168
169 pub fn entry_scale(query_type: &super::search_reranking::QueryType) -> Scale {
171 match query_type {
172 super::search_reranking::QueryType::Symbol => Scale::Micro,
173 super::search_reranking::QueryType::NaturalLanguage => Scale::Meso,
174 super::search_reranking::QueryType::Architecture => Scale::Macro,
175 }
176 }
177}
178
179impl Default for MultiScaleIndex {
180 fn default() -> Self {
181 Self::new()
182 }
183}
184
/// Granularity at which the index can be searched.
///
/// `Eq` and `Hash` are derived alongside `PartialEq` so a scale can be used as
/// a map or set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Scale {
    /// Individual code chunks.
    Micro,
    /// Whole files.
    Meso,
    /// Directories.
    Macro,
}
192
/// Running totals gathered while the index is being built.
///
/// All fields start at zero/empty via `Default`, so an accumulator can be
/// created on first use with `entry(..).or_default()`.
#[derive(Default)]
struct FileAccumulator {
    /// Number of chunks folded into this accumulator.
    chunk_count: usize,
    /// Sum of the token counts of those chunks.
    total_tokens: usize,
    /// Per-token tally, keyed by token text.
    token_freqs: HashMap<String, usize>,
}
199
/// Returns the directory that contains `path`, or `"."` when there is none.
///
/// `Path::parent` yields `Some("")` (not `None`) for a bare file name such as
/// `"main.rs"`, so an empty parent is treated the same as a missing one;
/// otherwise top-level files would be grouped under an empty directory name.
fn parent_dir(path: &str) -> String {
    match std::path::Path::new(path).parent() {
        Some(dir) if !dir.as_os_str().is_empty() => dir.to_string_lossy().into_owned(),
        _ => ".".to_string(),
    }
}
205
/// Sums the weights of all keywords that match any of the query tokens.
///
/// A query token is lower-cased and matches a keyword when either string
/// contains the other, so `"auth"` matches `"authenticate"` and vice versa.
/// A keyword's weight is added once per matching query token.
///
/// Empty query tokens and empty keywords are ignored: every string contains
/// the empty string, so they would otherwise match everything.
fn query_match_score(query_tokens: &[String], keywords: &[(String, f64)]) -> f64 {
    let mut score = 0.0;
    for query_token in query_tokens {
        let needle = query_token.to_lowercase();
        if needle.is_empty() {
            continue;
        }
        for (keyword, weight) in keywords {
            if keyword.is_empty() {
                continue;
            }
            if keyword.contains(needle.as_str()) || needle.contains(keyword.as_str()) {
                score += weight;
            }
        }
    }
    score
}
218
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::bm25_index::{ChunkKind, CodeChunk};

    /// Builds a function chunk for `path` whose token list is `tokens`; the
    /// remaining fields are fixed placeholder values.
    fn make_chunk(path: &str, content: &str, tokens: &[&str]) -> CodeChunk {
        CodeChunk {
            file_path: path.to_string(),
            symbol_name: "test".to_string(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 10,
            content: content.to_string(),
            tokens: tokens.iter().map(|t| (*t).to_string()).collect(),
            token_count: tokens.len(),
        }
    }

    /// Two chunks in one file plus one chunk in another yield two file entries.
    #[test]
    fn builds_meso_from_chunks() {
        let input = vec![
            make_chunk("src/auth.rs", "fn login() {}", &["fn", "login"]),
            make_chunk("src/auth.rs", "fn logout() {}", &["fn", "logout"]),
            make_chunk("src/db.rs", "fn query() {}", &["fn", "query", "sql"]),
        ];

        let built = MultiScaleIndex::build_from_chunks(&input);

        assert_eq!(built.meso_files.len(), 2);
        for expected in ["src/auth.rs", "src/db.rs"] {
            assert!(built.meso_files.contains_key(expected));
        }
    }

    /// Files are grouped under their parent directory at the macro scale.
    #[test]
    fn builds_macro_from_chunks() {
        let input = vec![
            make_chunk("src/auth/login.rs", "fn login() {}", &["login"]),
            make_chunk("src/auth/session.rs", "fn session() {}", &["session"]),
            make_chunk("src/db/pool.rs", "fn pool() {}", &["pool", "connection"]),
        ];

        let built = MultiScaleIndex::build_from_chunks(&input);

        for expected in ["src/auth", "src/db"] {
            assert!(built.macro_dirs.contains_key(expected));
        }
    }

    /// A query about tokens/auth ranks the auth file first.
    #[test]
    fn meso_search_returns_relevant_files() {
        let input = vec![
            make_chunk(
                "src/auth.rs",
                "fn authenticate() {}",
                &["authenticate", "token", "jwt"],
            ),
            make_chunk("src/db.rs", "fn query() {}", &["query", "sql", "database"]),
        ];
        let query = ["token".to_string(), "auth".to_string()];

        let built = MultiScaleIndex::build_from_chunks(&input);
        let hits = built.search_meso(&query, 5);

        assert!(!hits.is_empty());
        assert_eq!(hits.first().map(|(path, _)| path.as_str()), Some("src/auth.rs"));
    }

    /// Each query type maps to its own starting scale.
    #[test]
    fn entry_scale_for_query_types() {
        use crate::core::search_reranking::QueryType;

        let expectations = [
            (QueryType::Symbol, Scale::Micro),
            (QueryType::NaturalLanguage, Scale::Meso),
            (QueryType::Architecture, Scale::Macro),
        ];
        for (query_type, expected) in &expectations {
            assert_eq!(MultiScaleIndex::entry_scale(query_type), *expected);
        }
    }
}