Skip to main content

codemem_engine/
search.rs

1//! Code search and summary tree domain logic.
2
3use crate::index::symbol::{symbol_from_graph_node, Symbol};
4use crate::CodememEngine;
5use codemem_core::{CodememError, GraphBackend, NodeKind, RelationshipType, VectorBackend};
6use serde::Serialize;
7
8/// A code search result returned by semantic vector search over symbols and chunks.
9#[derive(Debug, Clone, Serialize)]
10pub struct CodeSearchResult {
11    pub id: String,
12    /// "chunk" or the graph node kind string (e.g. "function", "class").
13    pub kind: String,
14    pub label: String,
15    pub similarity: f64,
16    pub file_path: Option<serde_json::Value>,
17    pub line_start: Option<serde_json::Value>,
18    pub line_end: Option<serde_json::Value>,
19    // Symbol-specific fields
20    pub qualified_name: Option<String>,
21    pub signature: Option<serde_json::Value>,
22    pub doc_comment: Option<serde_json::Value>,
23    // Chunk-specific fields
24    pub node_kind: Option<serde_json::Value>,
25    pub parent_symbol: Option<serde_json::Value>,
26    pub non_ws_chars: Option<serde_json::Value>,
27}
28
29/// A recursive tree node for the summary tree.
30#[derive(Debug, Clone, Serialize)]
31pub struct SummaryTreeNode {
32    pub id: String,
33    pub kind: String,
34    pub label: String,
35    pub centrality: f64,
36    #[serde(skip_serializing_if = "Option::is_none")]
37    pub symbol_count: Option<usize>,
38    #[serde(skip_serializing_if = "Option::is_none")]
39    pub chunk_count: Option<usize>,
40    #[serde(skip_serializing_if = "Vec::is_empty")]
41    pub children: Vec<SummaryTreeNode>,
42}
43
44/// A filtered symbol match from the in-memory index cache.
45#[derive(Debug, Clone, Serialize)]
46pub struct SymbolSearchResult {
47    pub name: String,
48    pub qualified_name: String,
49    pub kind: String,
50    pub signature: String,
51    pub file_path: String,
52    pub line_start: usize,
53    pub line_end: usize,
54    pub visibility: String,
55    pub parent: Option<String>,
56}
57
58/// Extract code references from free-form text for use in auto-linking.
59///
60/// Recognizes:
61/// - CamelCase identifiers (likely type/class names): `ProcessRequest`, `HashMap`
62/// - Backtick-wrapped code: content from \`...\`
63/// - Function-like patterns: `word()` or `word::path`
64/// - Short file paths: patterns like `src/foo.rs`, `lib/bar.py`
65///
66/// Returns deduplicated references suitable for matching against graph node labels.
67pub fn extract_code_references(text: &str) -> Vec<String> {
68    use regex::Regex;
69    use std::collections::HashSet;
70    use std::sync::OnceLock;
71
72    static RE_CAMEL: OnceLock<Regex> = OnceLock::new();
73    static RE_BACKTICK: OnceLock<Regex> = OnceLock::new();
74    static RE_FUNC: OnceLock<Regex> = OnceLock::new();
75    static RE_PATH: OnceLock<Regex> = OnceLock::new();
76
77    let re_camel = RE_CAMEL.get_or_init(|| {
78        // CamelCase: at least two words, each starting uppercase, min 4 chars total
79        Regex::new(r"\b([A-Z][a-z]+(?:[A-Z][a-z0-9]*)+)\b").unwrap()
80    });
81    let re_backtick = RE_BACKTICK.get_or_init(|| {
82        // Content inside backticks (non-greedy, at least 2 chars)
83        Regex::new(r"`([^`]{2,})`").unwrap()
84    });
85    let re_func = RE_FUNC.get_or_init(|| {
86        // Function calls like word() or qualified paths like word::path
87        Regex::new(r"\b([a-zA-Z_]\w*(?:::[a-zA-Z_]\w*)+)\b|\b([a-zA-Z_]\w*)\(\)").unwrap()
88    });
89    let re_path = RE_PATH.get_or_init(|| {
90        // File paths: word/word.ext patterns (2-4 segments, common extensions)
91        Regex::new(r"\b([a-zA-Z0-9_\-]+(?:/[a-zA-Z0-9_\-]+)*\.[a-zA-Z]{1,4})\b").unwrap()
92    });
93
94    let mut seen = HashSet::new();
95    let mut refs = Vec::new();
96
97    let mut add = |s: &str| {
98        let trimmed = s.trim();
99        if trimmed.len() >= 2 && seen.insert(trimmed.to_string()) {
100            refs.push(trimmed.to_string());
101        }
102    };
103
104    for cap in re_camel.captures_iter(text) {
105        if let Some(m) = cap.get(1) {
106            add(m.as_str());
107        }
108    }
109    for cap in re_backtick.captures_iter(text) {
110        if let Some(m) = cap.get(1) {
111            add(m.as_str());
112        }
113    }
114    for cap in re_func.captures_iter(text) {
115        // Group 1: qualified path (a::b::c), Group 2: function call (word())
116        if let Some(m) = cap.get(1) {
117            add(m.as_str());
118        }
119        if let Some(m) = cap.get(2) {
120            add(m.as_str());
121        }
122    }
123    for cap in re_path.captures_iter(text) {
124        if let Some(m) = cap.get(1) {
125            let path = m.as_str();
126            // Only include if it looks like a real file path (has a directory separator)
127            if path.contains('/') {
128                add(path);
129            }
130        }
131    }
132
133    refs
134}
135
136impl CodememEngine {
137    /// Estimate the approximate RAM usage of the in-memory graph.
138    ///
139    /// The graph engine (`GraphEngine`) keeps all nodes and edges in memory via
140    /// `HashMap`s plus a `petgraph::DiGraph`. Each node carries a `GraphNode` struct
141    /// (~200 bytes: id, kind, label, payload HashMap, centrality, optional memory_id
142    /// and namespace) plus a petgraph `NodeIndex`. Each edge carries an `Edge` struct
143    /// (~150 bytes: id, src, dst, relationship, weight, properties HashMap, timestamps)
144    /// plus petgraph edge weight storage and adjacency list entries.
145    ///
146    /// Returns an estimate in bytes. Actual usage may be higher due to HashMap overhead,
147    /// string heap allocations, and payload contents.
148    #[cfg(test)]
149    pub fn graph_memory_estimate(&self) -> usize {
150        let (node_count, edge_count) = match self.lock_graph() {
151            Ok(g) => (g.node_count(), g.edge_count()),
152            Err(_) => (0, 0),
153        };
154        node_count * 200 + edge_count * 150
155    }
156
157    /// Semantic code search: embed query, vector search, filter to sym:/chunk: IDs,
158    /// enrich with graph node data.
159    pub fn search_code(
160        &self,
161        query: &str,
162        k: usize,
163    ) -> Result<Vec<CodeSearchResult>, CodememError> {
164        let emb_guard = self
165            .lock_embeddings()?
166            .ok_or_else(|| CodememError::InvalidInput("Embedding service not available".into()))?;
167
168        let query_embedding = emb_guard
169            .embed(query)
170            .map_err(|e| CodememError::InvalidInput(format!("Embedding failed: {e}")))?;
171        drop(emb_guard);
172
173        let vec = self.lock_vector()?;
174        let raw_results: Vec<(String, f32)> = vec
175            .search(&query_embedding, k * 3)
176            .unwrap_or_default()
177            .into_iter()
178            .filter(|(id, _)| id.starts_with("sym:") || id.starts_with("chunk:"))
179            .take(k)
180            .collect();
181        drop(vec);
182
183        if raw_results.is_empty() {
184            return Ok(Vec::new());
185        }
186
187        let mut output = Vec::new();
188        for (id, similarity_value) in &raw_results {
189            let similarity = *similarity_value as f64;
190            if let Ok(Some(node)) = self.storage.get_graph_node(id) {
191                if id.starts_with("chunk:") {
192                    output.push(CodeSearchResult {
193                        id: id.clone(),
194                        kind: "chunk".to_string(),
195                        label: node.label,
196                        similarity,
197                        file_path: node.payload.get("file_path").cloned(),
198                        line_start: node.payload.get("line_start").cloned(),
199                        line_end: node.payload.get("line_end").cloned(),
200                        qualified_name: None,
201                        signature: None,
202                        doc_comment: None,
203                        node_kind: node.payload.get("node_kind").cloned(),
204                        parent_symbol: node.payload.get("parent_symbol").cloned(),
205                        non_ws_chars: node.payload.get("non_ws_chars").cloned(),
206                    });
207                } else {
208                    output.push(CodeSearchResult {
209                        id: id.clone(),
210                        kind: node.kind.to_string(),
211                        label: node.label,
212                        similarity,
213                        file_path: node.payload.get("file_path").cloned(),
214                        line_start: node.payload.get("line_start").cloned(),
215                        line_end: node.payload.get("line_end").cloned(),
216                        qualified_name: Some(id.strip_prefix("sym:").unwrap_or(id).to_string()),
217                        signature: node.payload.get("signature").cloned(),
218                        doc_comment: node.payload.get("doc_comment").cloned(),
219                        node_kind: None,
220                        parent_symbol: None,
221                        non_ws_chars: None,
222                    });
223                }
224            }
225        }
226
227        Ok(output)
228    }
229
230    /// Build a hierarchical summary tree starting from a given node.
231    /// Traverses Contains edges: packages -> files -> symbols.
232    pub fn summary_tree(
233        &self,
234        start_id: &str,
235        max_depth: usize,
236        include_chunks: bool,
237    ) -> Result<SummaryTreeNode, CodememError> {
238        let graph = self.lock_graph()?;
239
240        fn build_tree(
241            graph: &dyn GraphBackend,
242            node_id: &str,
243            depth: usize,
244            max_depth: usize,
245            include_chunks: bool,
246        ) -> Option<SummaryTreeNode> {
247            if depth > max_depth {
248                return None;
249            }
250            let node = match graph.get_node(node_id) {
251                Ok(Some(n)) => n,
252                _ => return None,
253            };
254
255            let mut children: Vec<SummaryTreeNode> = Vec::new();
256            if depth < max_depth {
257                if let Ok(edges) = graph.get_edges(node_id) {
258                    let mut child_ids: Vec<String> = edges
259                        .iter()
260                        .filter(|e| {
261                            e.src == node_id && e.relationship == RelationshipType::Contains
262                        })
263                        .map(|e| e.dst.clone())
264                        .collect();
265                    child_ids.sort();
266
267                    for child_id in &child_ids {
268                        if !include_chunks && child_id.starts_with("chunk:") {
269                            continue;
270                        }
271                        if let Some(child) =
272                            build_tree(graph, child_id, depth + 1, max_depth, include_chunks)
273                        {
274                            children.push(child);
275                        }
276                    }
277                }
278            }
279
280            let (symbol_count, chunk_count) = if node.kind == NodeKind::File {
281                let syms = children
282                    .iter()
283                    .filter(|c| c.kind != "chunk" && c.kind != "package" && c.kind != "file")
284                    .count();
285                let chunks = if include_chunks {
286                    children.iter().filter(|c| c.kind == "chunk").count()
287                } else {
288                    0
289                };
290                (Some(syms), Some(chunks))
291            } else {
292                (None, None)
293            };
294
295            Some(SummaryTreeNode {
296                id: node.id,
297                kind: node.kind.to_string(),
298                label: node.label,
299                centrality: node.centrality,
300                symbol_count,
301                chunk_count,
302                children,
303            })
304        }
305
306        build_tree(&*graph, start_id, 0, max_depth, include_chunks)
307            .ok_or_else(|| CodememError::NotFound(format!("Node not found: {start_id}")))
308    }
309
310    /// Look up a single symbol by qualified name with cache-through to the DB.
311    ///
312    /// 1. Check the in-memory `IndexCache`.
313    /// 2. On miss, query storage for `sym:{qualified_name}` graph node.
314    /// 3. Reconstruct a `Symbol`, insert into cache, return.
315    pub fn get_symbol(&self, qualified_name: &str) -> Result<Option<Symbol>, CodememError> {
316        // 1. Check cache
317        {
318            let cache = self.lock_index_cache()?;
319            if let Some(ref c) = *cache {
320                if let Some(sym) = c
321                    .symbols
322                    .iter()
323                    .find(|s| s.qualified_name == qualified_name)
324                {
325                    return Ok(Some(sym.clone()));
326                }
327            }
328        }
329
330        // 2. Cache miss — query DB
331        let node_id = format!("sym:{qualified_name}");
332        let node = match self.storage.get_graph_node(&node_id)? {
333            Some(n) => n,
334            None => return Ok(None),
335        };
336
337        // 3. Reconstruct Symbol from GraphNode
338        let sym = match symbol_from_graph_node(&node) {
339            Some(s) => s,
340            None => return Ok(None),
341        };
342
343        // 4. Insert into cache (init if None)
344        {
345            let mut cache = self.lock_index_cache()?;
346            if let Some(ref mut c) = *cache {
347                // Dedup: only insert if not already present
348                if !c.symbols.iter().any(|s| s.qualified_name == qualified_name) {
349                    c.symbols.push(sym.clone());
350                }
351            } else {
352                *cache = Some(crate::IndexCache {
353                    symbols: vec![sym.clone()],
354                    chunks: Vec::new(),
355                    root_path: String::new(),
356                });
357            }
358        }
359
360        Ok(Some(sym))
361    }
362
363    /// Search symbols by name, with optional kind filter. Falls through to the DB
364    /// when the in-memory cache is empty or has no matches.
365    pub fn search_symbols(
366        &self,
367        query: &str,
368        limit: usize,
369        kind_filter: Option<&str>,
370    ) -> Result<Vec<SymbolSearchResult>, CodememError> {
371        let query_lower = query.to_lowercase();
372        let kind_lower = kind_filter.map(|k| k.to_lowercase());
373
374        // Helper closure: filter + map a Symbol into a SymbolSearchResult
375        let matches_filter = |sym: &Symbol| -> bool {
376            let name_match = sym.name.to_lowercase().contains(&query_lower)
377                || sym.qualified_name.to_lowercase().contains(&query_lower);
378            if !name_match {
379                return false;
380            }
381            if let Some(ref kl) = kind_lower {
382                return sym.kind.to_string().to_lowercase() == *kl;
383            }
384            true
385        };
386
387        let to_result = |sym: &Symbol| -> SymbolSearchResult {
388            SymbolSearchResult {
389                name: sym.name.clone(),
390                qualified_name: sym.qualified_name.clone(),
391                kind: sym.kind.to_string(),
392                signature: sym.signature.clone(),
393                file_path: sym.file_path.clone(),
394                line_start: sym.line_start,
395                line_end: sym.line_end,
396                visibility: sym.visibility.to_string(),
397                parent: sym.parent.clone(),
398            }
399        };
400
401        // 1. Try cache first — only short-circuit if we got enough results
402        let mut results = Vec::new();
403        let mut seen_qnames = std::collections::HashSet::new();
404        {
405            let cache = self.lock_index_cache()?;
406            if let Some(ref c) = *cache {
407                for sym in c.symbols.iter().filter(|s| matches_filter(s)) {
408                    if seen_qnames.insert(sym.qualified_name.clone()) {
409                        results.push(to_result(sym));
410                        if results.len() >= limit {
411                            return Ok(results);
412                        }
413                    }
414                }
415            }
416        }
417
418        // 2. Cache empty or partial results — top up from DB.
419        // Over-fetch by 3× because search_graph_nodes returns all node kinds
420        // (files, chunks, packages…) and we filter to sym:* nodes only.
421        // NOTE: namespace=None is intentional — the in-memory cache is already
422        // cross-namespace, so the DB fallback should match that behavior.
423        let db_nodes = self
424            .storage
425            .search_graph_nodes(&query_lower, None, limit * 3)?;
426
427        let mut new_symbols = Vec::new();
428
429        for node in &db_nodes {
430            if !node.id.starts_with("sym:") {
431                continue;
432            }
433            if let Some(sym) = symbol_from_graph_node(node) {
434                if matches_filter(&sym) && seen_qnames.insert(sym.qualified_name.clone()) {
435                    results.push(to_result(&sym));
436                    new_symbols.push(sym);
437                    if results.len() >= limit {
438                        break;
439                    }
440                }
441            }
442        }
443
444        // 3. Populate cache with DB results for future queries
445        if !new_symbols.is_empty() {
446            let mut cache = self.lock_index_cache()?;
447            if let Some(ref mut c) = *cache {
448                for sym in new_symbols {
449                    if !c
450                        .symbols
451                        .iter()
452                        .any(|s| s.qualified_name == sym.qualified_name)
453                    {
454                        c.symbols.push(sym);
455                    }
456                }
457            } else {
458                *cache = Some(crate::IndexCache {
459                    symbols: new_symbols,
460                    chunks: Vec::new(),
461                    root_path: String::new(),
462                });
463            }
464        }
465
466        Ok(results)
467    }
468}