Skip to main content

lean_ctx/core/
multiscale_index.rs

//! Renormalization-Inspired Multi-Scale Indexing.
//!
//! Scientific basis: Kenneth Wilson's renormalization group (Nobel Prize in
//! Physics, 1982), which describes a system at different scales using
//! consistent transformations. Each scale captures progressively coarser features:
//!
//! - Micro (chunk): individual code chunks — precise symbol search
//! - Meso (file): aggregated per-file representations — "which files are relevant?"
//! - Macro (directory): module-level aggregations — architecture queries
//!
//! The query-type classifier from `search_reranking` determines the entry scale:
//! - Symbol queries → Micro directly
//! - Natural-language queries → Meso → Micro refinement
//! - Architecture queries → Macro → Meso → Micro cascade
15
16use std::collections::HashMap;
17
/// One aggregated search unit (a file or a directory) at a coarse scale.
#[derive(Clone, Debug)]
pub struct ScaleEntry {
    /// Path of the file or directory this entry summarizes.
    pub path: String,
    /// Keywords paired with their relevance weight.
    pub tfidf_keywords: Vec<(String, f64)>,
    /// Number of chunks aggregated into this entry.
    pub total_chunks: usize,
    /// Token total divided by `total_chunks` (integer division).
    pub avg_chunk_tokens: usize,
}
26
27/// Multi-scale index holding representations at three granularities.
28pub struct MultiScaleIndex {
29    /// Mikro: individual chunks (delegated to BM25Index)
30    pub micro_chunk_count: usize,
31    /// Meso: per-file aggregated keywords and statistics
32    pub meso_files: HashMap<String, ScaleEntry>,
33    /// Makro: per-directory aggregated keywords
34    pub macro_dirs: HashMap<String, ScaleEntry>,
35}
36
37impl MultiScaleIndex {
38    pub fn new() -> Self {
39        Self {
40            micro_chunk_count: 0,
41            meso_files: HashMap::new(),
42            macro_dirs: HashMap::new(),
43        }
44    }
45
46    /// Build meso and macro scales from chunk-level data.
47    pub fn build_from_chunks(chunks: &[super::bm25_index::CodeChunk]) -> Self {
48        let mut meso: HashMap<String, FileAccumulator> = HashMap::new();
49
50        // Aggregate chunks into file-level entries
51        for chunk in chunks {
52            let acc = meso.entry(chunk.file_path.clone()).or_default();
53            acc.chunk_count += 1;
54            acc.total_tokens += chunk.token_count;
55            for token in &chunk.tokens {
56                *acc.token_freqs.entry(token.to_lowercase()).or_insert(0) += 1;
57            }
58        }
59
60        // Build meso-scale entries with TF-IDF-like scoring
61        let num_files = meso.len().max(1) as f64;
62        let mut doc_freqs: HashMap<String, usize> = HashMap::new();
63        for acc in meso.values() {
64            for token in acc.token_freqs.keys() {
65                *doc_freqs.entry(token.clone()).or_insert(0) += 1;
66            }
67        }
68
69        let mut meso_files = HashMap::new();
70        for (path, acc) in &meso {
71            let mut keywords: Vec<(String, f64)> = acc
72                .token_freqs
73                .iter()
74                .map(|(token, &tf)| {
75                    let df = *doc_freqs.get(token).unwrap_or(&1) as f64;
76                    let idf = (num_files / df).ln() + 1.0;
77                    let tfidf = tf as f64 * idf;
78                    (token.clone(), tfidf)
79                })
80                .collect();
81            keywords.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
82            keywords.truncate(20); // Keep top 20 keywords per file
83
84            meso_files.insert(
85                path.clone(),
86                ScaleEntry {
87                    path: path.clone(),
88                    tfidf_keywords: keywords,
89                    total_chunks: acc.chunk_count,
90                    avg_chunk_tokens: acc.total_tokens.checked_div(acc.chunk_count).unwrap_or(0),
91                },
92            );
93        }
94
95        // Build macro-scale: aggregate files into directories
96        let mut macro_acc: HashMap<String, FileAccumulator> = HashMap::new();
97        for (path, entry) in &meso_files {
98            let dir = parent_dir(path);
99            let acc = macro_acc.entry(dir).or_default();
100            acc.chunk_count += entry.total_chunks;
101            acc.total_tokens += entry.avg_chunk_tokens * entry.total_chunks;
102            for (kw, score) in &entry.tfidf_keywords {
103                *acc.token_freqs.entry(kw.clone()).or_insert(0) += *score as usize;
104            }
105        }
106
107        let mut macro_dirs = HashMap::new();
108        for (dir, acc) in &macro_acc {
109            let mut keywords: Vec<(String, f64)> = acc
110                .token_freqs
111                .iter()
112                .map(|(token, &count)| (token.clone(), count as f64))
113                .collect();
114            keywords.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
115            keywords.truncate(30); // Top 30 keywords per directory
116
117            macro_dirs.insert(
118                dir.clone(),
119                ScaleEntry {
120                    path: dir.clone(),
121                    tfidf_keywords: keywords,
122                    total_chunks: acc.chunk_count,
123                    avg_chunk_tokens: acc.total_tokens.checked_div(acc.chunk_count).unwrap_or(0),
124                },
125            );
126        }
127
128        Self {
129            micro_chunk_count: chunks.len(),
130            meso_files,
131            macro_dirs,
132        }
133    }
134
135    /// Search at the meso (file) scale. Returns file paths with relevance scores.
136    pub fn search_meso(&self, query_tokens: &[String], top_k: usize) -> Vec<(String, f64)> {
137        let mut scores: Vec<(String, f64)> = self
138            .meso_files
139            .iter()
140            .map(|(path, entry)| {
141                let score = query_match_score(query_tokens, &entry.tfidf_keywords);
142                (path.clone(), score)
143            })
144            .filter(|(_, s)| *s > 0.0)
145            .collect();
146
147        scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
148        scores.truncate(top_k);
149        scores
150    }
151
152    /// Search at the macro (directory) scale. Returns directory paths with relevance.
153    pub fn search_macro(&self, query_tokens: &[String], top_k: usize) -> Vec<(String, f64)> {
154        let mut scores: Vec<(String, f64)> = self
155            .macro_dirs
156            .iter()
157            .map(|(dir, entry)| {
158                let score = query_match_score(query_tokens, &entry.tfidf_keywords);
159                (dir.clone(), score)
160            })
161            .filter(|(_, s)| *s > 0.0)
162            .collect();
163
164        scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
165        scores.truncate(top_k);
166        scores
167    }
168
169    /// Determine which scale to start search from based on query type.
170    pub fn entry_scale(query_type: &super::search_reranking::QueryType) -> Scale {
171        match query_type {
172            super::search_reranking::QueryType::Symbol => Scale::Micro,
173            super::search_reranking::QueryType::NaturalLanguage => Scale::Meso,
174            super::search_reranking::QueryType::Architecture => Scale::Macro,
175        }
176    }
177}
178
179impl Default for MultiScaleIndex {
180    fn default() -> Self {
181        Self::new()
182    }
183}
184
/// Scale levels for the renormalization cascade, from finest to coarsest.
///
/// Derives `Eq` and `Hash` in addition to `PartialEq` so a scale can be used
/// as a `HashMap`/`HashSet` key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Scale {
    /// Individual code chunks.
    Micro,
    /// Per-file aggregates.
    Meso,
    /// Per-directory aggregates.
    Macro,
}
192
/// Running totals gathered while folding chunks into a coarser unit.
/// All fields start at zero / empty via `Default`.
#[derive(Default)]
struct FileAccumulator {
    /// Number of chunks folded in so far.
    chunk_count: usize,
    /// Sum of the token counts of those chunks.
    total_tokens: usize,
    /// Integer tally per token.
    token_freqs: HashMap<String, usize>,
}
199
/// Returns the directory containing `path`, or `"."` when there is none.
///
/// `Path::parent` yields `Some("")` for a bare file name such as `main.rs`
/// and `None` for an empty path or a root. Both cases map to `"."` so that
/// top-level files share one well-defined directory key rather than the
/// empty string.
fn parent_dir(path: &str) -> String {
    match std::path::Path::new(path).parent() {
        Some(dir) if !dir.as_os_str().is_empty() => dir.to_string_lossy().into_owned(),
        _ => ".".to_string(),
    }
}
205
/// Scores `keywords` against `query_tokens` by summing the weight of every
/// matching keyword.
///
/// A query token matches a keyword when, after lowercasing the token, either
/// string contains the other (so `auth` matches `authenticate`). A keyword's
/// weight is added once for each query token it matches.
///
/// Empty query tokens and empty keywords are skipped: `str::contains("")` is
/// always true, so an empty string would otherwise match everything and
/// inflate every score.
fn query_match_score(query_tokens: &[String], keywords: &[(String, f64)]) -> f64 {
    let mut score = 0.0;
    for token in query_tokens.iter().filter(|t| !t.is_empty()) {
        let needle = token.to_lowercase();
        for (keyword, weight) in keywords {
            if keyword.is_empty() {
                continue;
            }
            if keyword.contains(needle.as_str()) || needle.contains(keyword.as_str()) {
                score += weight;
            }
        }
    }
    score
}
218
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::bm25_index::{ChunkKind, CodeChunk};

    /// Builds a function chunk for `path` spanning lines 1-10 whose token list
    /// and token count both come from `tokens`.
    fn make_chunk(path: &str, content: &str, tokens: &[&str]) -> CodeChunk {
        CodeChunk {
            file_path: path.to_owned(),
            symbol_name: String::from("test"),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 10,
            content: content.to_owned(),
            tokens: tokens.iter().map(|t| (*t).to_string()).collect(),
            token_count: tokens.len(),
        }
    }

    /// Two chunks of one file plus one chunk of another yield two meso entries.
    #[test]
    fn builds_meso_from_chunks() {
        let chunks = [
            make_chunk("src/auth.rs", "fn login() {}", &["fn", "login"]),
            make_chunk("src/auth.rs", "fn logout() {}", &["fn", "logout"]),
            make_chunk("src/db.rs", "fn query() {}", &["fn", "query", "sql"]),
        ];

        let index = MultiScaleIndex::build_from_chunks(&chunks);

        assert_eq!(index.meso_files.len(), 2);
        for expected in ["src/auth.rs", "src/db.rs"] {
            assert!(index.meso_files.contains_key(expected));
        }
    }

    /// Files are grouped under their parent directory at the macro scale.
    #[test]
    fn builds_macro_from_chunks() {
        let chunks = [
            make_chunk("src/auth/login.rs", "fn login() {}", &["login"]),
            make_chunk("src/auth/session.rs", "fn session() {}", &["session"]),
            make_chunk("src/db/pool.rs", "fn pool() {}", &["pool", "connection"]),
        ];

        let index = MultiScaleIndex::build_from_chunks(&chunks);

        for expected in ["src/auth", "src/db"] {
            assert!(index.macro_dirs.contains_key(expected));
        }
    }

    /// A query about tokens/auth ranks the auth file first.
    #[test]
    fn meso_search_returns_relevant_files() {
        let chunks = [
            make_chunk(
                "src/auth.rs",
                "fn authenticate() {}",
                &["authenticate", "token", "jwt"],
            ),
            make_chunk("src/db.rs", "fn query() {}", &["query", "sql", "database"]),
        ];
        let query = vec!["token".to_string(), "auth".to_string()];

        let index = MultiScaleIndex::build_from_chunks(&chunks);
        let results = index.search_meso(&query, 5);

        assert!(!results.is_empty());
        assert_eq!(results[0].0, "src/auth.rs");
    }

    /// Each query type maps to its documented entry scale.
    #[test]
    fn entry_scale_for_query_types() {
        use crate::core::search_reranking::QueryType;

        let cases = [
            (QueryType::Symbol, Scale::Micro),
            (QueryType::NaturalLanguage, Scale::Meso),
            (QueryType::Architecture, Scale::Macro),
        ];
        for (query_type, expected) in cases {
            assert_eq!(MultiScaleIndex::entry_scale(&query_type), expected);
        }
    }
}