Skip to main content

codemem_engine/
search.rs

1//! Code search and summary tree domain logic.
2
3use crate::index::symbol::{symbol_from_graph_node, Symbol};
4use crate::CodememEngine;
5use codemem_core::{CodememError, GraphBackend, NodeKind, RelationshipType, VectorBackend};
6use serde::Serialize;
7
8/// A code search result returned by semantic vector search over symbols and chunks.
9#[derive(Debug, Clone, Serialize)]
10pub struct CodeSearchResult {
11    pub id: String,
12    /// "chunk" or the graph node kind string (e.g. "function", "class").
13    pub kind: String,
14    pub label: String,
15    pub similarity: f64,
16    pub file_path: Option<serde_json::Value>,
17    pub line_start: Option<serde_json::Value>,
18    pub line_end: Option<serde_json::Value>,
19    // Symbol-specific fields
20    pub qualified_name: Option<String>,
21    pub signature: Option<serde_json::Value>,
22    pub doc_comment: Option<serde_json::Value>,
23    // Chunk-specific fields
24    pub node_kind: Option<serde_json::Value>,
25    pub parent_symbol: Option<serde_json::Value>,
26    pub non_ws_chars: Option<serde_json::Value>,
27}
28
29/// A recursive tree node for the summary tree.
30#[derive(Debug, Clone, Serialize)]
31pub struct SummaryTreeNode {
32    pub id: String,
33    pub kind: String,
34    pub label: String,
35    pub centrality: f64,
36    #[serde(skip_serializing_if = "Option::is_none")]
37    pub symbol_count: Option<usize>,
38    #[serde(skip_serializing_if = "Option::is_none")]
39    pub chunk_count: Option<usize>,
40    #[serde(skip_serializing_if = "Vec::is_empty")]
41    pub children: Vec<SummaryTreeNode>,
42}
43
44/// A filtered symbol match from the in-memory index cache.
45#[derive(Debug, Clone, Serialize)]
46pub struct SymbolSearchResult {
47    pub name: String,
48    pub qualified_name: String,
49    pub kind: String,
50    pub signature: String,
51    pub file_path: String,
52    pub line_start: usize,
53    pub line_end: usize,
54    pub visibility: String,
55    pub parent: Option<String>,
56}
57
58/// Extract code references from free-form text for use in auto-linking.
59///
60/// Recognizes:
61/// - CamelCase identifiers (likely type/class names): `ProcessRequest`, `HashMap`
62/// - Backtick-wrapped code: content from \`...\`
63/// - Function-like patterns: `word()` or `word::path`
64/// - Short file paths: patterns like `src/foo.rs`, `lib/bar.py`
65///
66/// Returns deduplicated references suitable for matching against graph node labels.
67pub fn extract_code_references(text: &str) -> Vec<String> {
68    use regex::Regex;
69    use std::collections::HashSet;
70    use std::sync::OnceLock;
71
72    static RE_CAMEL: OnceLock<Regex> = OnceLock::new();
73    static RE_BACKTICK: OnceLock<Regex> = OnceLock::new();
74    static RE_FUNC: OnceLock<Regex> = OnceLock::new();
75    static RE_PATH: OnceLock<Regex> = OnceLock::new();
76
77    let re_camel = RE_CAMEL.get_or_init(|| {
78        // CamelCase: at least two words, each starting uppercase, min 4 chars total
79        Regex::new(r"\b([A-Z][a-z]+(?:[A-Z][a-z0-9]*)+)\b").unwrap()
80    });
81    let re_backtick = RE_BACKTICK.get_or_init(|| {
82        // Content inside backticks (non-greedy, at least 2 chars)
83        Regex::new(r"`([^`]{2,})`").unwrap()
84    });
85    let re_func = RE_FUNC.get_or_init(|| {
86        // Function calls like word() or qualified paths like word::path
87        Regex::new(r"\b([a-zA-Z_]\w*(?:::[a-zA-Z_]\w*)+)\b|\b([a-zA-Z_]\w*)\(\)").unwrap()
88    });
89    let re_path = RE_PATH.get_or_init(|| {
90        // File paths: word/word.ext patterns (2-4 segments, common extensions)
91        Regex::new(r"\b([a-zA-Z0-9_\-]+(?:/[a-zA-Z0-9_\-]+)*\.[a-zA-Z]{1,4})\b").unwrap()
92    });
93
94    let mut seen = HashSet::new();
95    let mut refs = Vec::new();
96
97    let mut add = |s: &str| {
98        let trimmed = s.trim();
99        if trimmed.len() >= 2 && seen.insert(trimmed.to_string()) {
100            refs.push(trimmed.to_string());
101        }
102    };
103
104    for cap in re_camel.captures_iter(text) {
105        if let Some(m) = cap.get(1) {
106            add(m.as_str());
107        }
108    }
109    for cap in re_backtick.captures_iter(text) {
110        if let Some(m) = cap.get(1) {
111            add(m.as_str());
112        }
113    }
114    for cap in re_func.captures_iter(text) {
115        // Group 1: qualified path (a::b::c), Group 2: function call (word())
116        if let Some(m) = cap.get(1) {
117            add(m.as_str());
118        }
119        if let Some(m) = cap.get(2) {
120            add(m.as_str());
121        }
122    }
123    for cap in re_path.captures_iter(text) {
124        if let Some(m) = cap.get(1) {
125            let path = m.as_str();
126            // Only include if it looks like a real file path (has a directory separator)
127            if path.contains('/') {
128                add(path);
129            }
130        }
131    }
132
133    refs
134}
135
136impl CodememEngine {
137    /// Estimate the approximate RAM usage of the in-memory graph.
138    ///
139    /// The graph engine (`GraphEngine`) keeps all nodes and edges in memory via
140    /// `HashMap`s plus a `petgraph::DiGraph`. Each node carries a `GraphNode` struct
141    /// (~200 bytes: id, kind, label, payload HashMap, centrality, optional memory_id
142    /// and namespace) plus a petgraph `NodeIndex`. Each edge carries an `Edge` struct
143    /// (~150 bytes: id, src, dst, relationship, weight, properties HashMap, timestamps)
144    /// plus petgraph edge weight storage and adjacency list entries.
145    ///
146    /// Returns an estimate in bytes. Actual usage may be higher due to HashMap overhead,
147    /// string heap allocations, and payload contents.
148    pub fn graph_memory_estimate(&self) -> usize {
149        let (node_count, edge_count) = match self.lock_graph() {
150            Ok(g) => (g.node_count(), g.edge_count()),
151            Err(_) => (0, 0),
152        };
153        node_count * 200 + edge_count * 150
154    }
155
156    /// Semantic code search: embed query, vector search, filter to sym:/chunk: IDs,
157    /// enrich with graph node data.
158    pub fn search_code(
159        &self,
160        query: &str,
161        k: usize,
162    ) -> Result<Vec<CodeSearchResult>, CodememError> {
163        let emb_guard = self
164            .lock_embeddings()?
165            .ok_or_else(|| CodememError::InvalidInput("Embedding service not available".into()))?;
166
167        let query_embedding = emb_guard
168            .embed(query)
169            .map_err(|e| CodememError::InvalidInput(format!("Embedding failed: {e}")))?;
170        drop(emb_guard);
171
172        let vec = self.lock_vector()?;
173        let raw_results: Vec<(String, f32)> = vec
174            .search(&query_embedding, k * 3)
175            .unwrap_or_default()
176            .into_iter()
177            .filter(|(id, _)| id.starts_with("sym:") || id.starts_with("chunk:"))
178            .take(k)
179            .collect();
180        drop(vec);
181
182        if raw_results.is_empty() {
183            return Ok(Vec::new());
184        }
185
186        let mut output = Vec::new();
187        for (id, distance) in &raw_results {
188            let similarity = 1.0 - *distance as f64;
189            if let Ok(Some(node)) = self.storage.get_graph_node(id) {
190                if id.starts_with("chunk:") {
191                    output.push(CodeSearchResult {
192                        id: id.clone(),
193                        kind: "chunk".to_string(),
194                        label: node.label,
195                        similarity,
196                        file_path: node.payload.get("file_path").cloned(),
197                        line_start: node.payload.get("line_start").cloned(),
198                        line_end: node.payload.get("line_end").cloned(),
199                        qualified_name: None,
200                        signature: None,
201                        doc_comment: None,
202                        node_kind: node.payload.get("node_kind").cloned(),
203                        parent_symbol: node.payload.get("parent_symbol").cloned(),
204                        non_ws_chars: node.payload.get("non_ws_chars").cloned(),
205                    });
206                } else {
207                    output.push(CodeSearchResult {
208                        id: id.clone(),
209                        kind: node.kind.to_string(),
210                        label: node.label,
211                        similarity,
212                        file_path: node.payload.get("file_path").cloned(),
213                        line_start: node.payload.get("line_start").cloned(),
214                        line_end: node.payload.get("line_end").cloned(),
215                        qualified_name: Some(id.strip_prefix("sym:").unwrap_or(id).to_string()),
216                        signature: node.payload.get("signature").cloned(),
217                        doc_comment: node.payload.get("doc_comment").cloned(),
218                        node_kind: None,
219                        parent_symbol: None,
220                        non_ws_chars: None,
221                    });
222                }
223            }
224        }
225
226        Ok(output)
227    }
228
229    /// Build a hierarchical summary tree starting from a given node.
230    /// Traverses Contains edges: packages -> files -> symbols.
231    pub fn summary_tree(
232        &self,
233        start_id: &str,
234        max_depth: usize,
235        include_chunks: bool,
236    ) -> Result<SummaryTreeNode, CodememError> {
237        let graph = self.lock_graph()?;
238
239        fn build_tree(
240            graph: &dyn GraphBackend,
241            node_id: &str,
242            depth: usize,
243            max_depth: usize,
244            include_chunks: bool,
245        ) -> Option<SummaryTreeNode> {
246            if depth > max_depth {
247                return None;
248            }
249            let node = match graph.get_node(node_id) {
250                Ok(Some(n)) => n,
251                _ => return None,
252            };
253
254            let mut children: Vec<SummaryTreeNode> = Vec::new();
255            if depth < max_depth {
256                if let Ok(edges) = graph.get_edges(node_id) {
257                    let mut child_ids: Vec<String> = edges
258                        .iter()
259                        .filter(|e| {
260                            e.src == node_id && e.relationship == RelationshipType::Contains
261                        })
262                        .map(|e| e.dst.clone())
263                        .collect();
264                    child_ids.sort();
265
266                    for child_id in &child_ids {
267                        if !include_chunks && child_id.starts_with("chunk:") {
268                            continue;
269                        }
270                        if let Some(child) =
271                            build_tree(graph, child_id, depth + 1, max_depth, include_chunks)
272                        {
273                            children.push(child);
274                        }
275                    }
276                }
277            }
278
279            let (symbol_count, chunk_count) = if node.kind == NodeKind::File {
280                let syms = children
281                    .iter()
282                    .filter(|c| c.kind != "chunk" && c.kind != "package" && c.kind != "file")
283                    .count();
284                let chunks = if include_chunks {
285                    children.iter().filter(|c| c.kind == "chunk").count()
286                } else {
287                    0
288                };
289                (Some(syms), Some(chunks))
290            } else {
291                (None, None)
292            };
293
294            Some(SummaryTreeNode {
295                id: node.id,
296                kind: node.kind.to_string(),
297                label: node.label,
298                centrality: node.centrality,
299                symbol_count,
300                chunk_count,
301                children,
302            })
303        }
304
305        build_tree(&*graph, start_id, 0, max_depth, include_chunks)
306            .ok_or_else(|| CodememError::NotFound(format!("Node not found: {start_id}")))
307    }
308
309    /// Look up a single symbol by qualified name with cache-through to the DB.
310    ///
311    /// 1. Check the in-memory `IndexCache`.
312    /// 2. On miss, query storage for `sym:{qualified_name}` graph node.
313    /// 3. Reconstruct a `Symbol`, insert into cache, return.
314    pub fn get_symbol(&self, qualified_name: &str) -> Result<Option<Symbol>, CodememError> {
315        // 1. Check cache
316        {
317            let cache = self.lock_index_cache()?;
318            if let Some(ref c) = *cache {
319                if let Some(sym) = c
320                    .symbols
321                    .iter()
322                    .find(|s| s.qualified_name == qualified_name)
323                {
324                    return Ok(Some(sym.clone()));
325                }
326            }
327        }
328
329        // 2. Cache miss — query DB
330        let node_id = format!("sym:{qualified_name}");
331        let node = match self.storage.get_graph_node(&node_id)? {
332            Some(n) => n,
333            None => return Ok(None),
334        };
335
336        // 3. Reconstruct Symbol from GraphNode
337        let sym = match symbol_from_graph_node(&node) {
338            Some(s) => s,
339            None => return Ok(None),
340        };
341
342        // 4. Insert into cache (init if None)
343        {
344            let mut cache = self.lock_index_cache()?;
345            if let Some(ref mut c) = *cache {
346                // Dedup: only insert if not already present
347                if !c.symbols.iter().any(|s| s.qualified_name == qualified_name) {
348                    c.symbols.push(sym.clone());
349                }
350            } else {
351                *cache = Some(crate::IndexCache {
352                    symbols: vec![sym.clone()],
353                    chunks: Vec::new(),
354                    root_path: String::new(),
355                });
356            }
357        }
358
359        Ok(Some(sym))
360    }
361
362    /// Search symbols by name, with optional kind filter. Falls through to the DB
363    /// when the in-memory cache is empty or has no matches.
364    pub fn search_symbols(
365        &self,
366        query: &str,
367        limit: usize,
368        kind_filter: Option<&str>,
369    ) -> Result<Vec<SymbolSearchResult>, CodememError> {
370        let query_lower = query.to_lowercase();
371        let kind_lower = kind_filter.map(|k| k.to_lowercase());
372
373        // Helper closure: filter + map a Symbol into a SymbolSearchResult
374        let matches_filter = |sym: &Symbol| -> bool {
375            let name_match = sym.name.to_lowercase().contains(&query_lower)
376                || sym.qualified_name.to_lowercase().contains(&query_lower);
377            if !name_match {
378                return false;
379            }
380            if let Some(ref kl) = kind_lower {
381                return sym.kind.to_string().to_lowercase() == *kl;
382            }
383            true
384        };
385
386        let to_result = |sym: &Symbol| -> SymbolSearchResult {
387            SymbolSearchResult {
388                name: sym.name.clone(),
389                qualified_name: sym.qualified_name.clone(),
390                kind: sym.kind.to_string(),
391                signature: sym.signature.clone(),
392                file_path: sym.file_path.clone(),
393                line_start: sym.line_start,
394                line_end: sym.line_end,
395                visibility: sym.visibility.to_string(),
396                parent: sym.parent.clone(),
397            }
398        };
399
400        // 1. Try cache first — only short-circuit if we got enough results
401        let mut results = Vec::new();
402        let mut seen_qnames = std::collections::HashSet::new();
403        {
404            let cache = self.lock_index_cache()?;
405            if let Some(ref c) = *cache {
406                for sym in c.symbols.iter().filter(|s| matches_filter(s)) {
407                    if seen_qnames.insert(sym.qualified_name.clone()) {
408                        results.push(to_result(sym));
409                        if results.len() >= limit {
410                            return Ok(results);
411                        }
412                    }
413                }
414            }
415        }
416
417        // 2. Cache empty or partial results — top up from DB.
418        // Over-fetch by 3× because search_graph_nodes returns all node kinds
419        // (files, chunks, packages…) and we filter to sym:* nodes only.
420        // NOTE: namespace=None is intentional — the in-memory cache is already
421        // cross-namespace, so the DB fallback should match that behavior.
422        let db_nodes = self
423            .storage
424            .search_graph_nodes(&query_lower, None, limit * 3)?;
425
426        let mut new_symbols = Vec::new();
427
428        for node in &db_nodes {
429            if !node.id.starts_with("sym:") {
430                continue;
431            }
432            if let Some(sym) = symbol_from_graph_node(node) {
433                if matches_filter(&sym) && seen_qnames.insert(sym.qualified_name.clone()) {
434                    results.push(to_result(&sym));
435                    new_symbols.push(sym);
436                    if results.len() >= limit {
437                        break;
438                    }
439                }
440            }
441        }
442
443        // 3. Populate cache with DB results for future queries
444        if !new_symbols.is_empty() {
445            let mut cache = self.lock_index_cache()?;
446            if let Some(ref mut c) = *cache {
447                for sym in new_symbols {
448                    if !c
449                        .symbols
450                        .iter()
451                        .any(|s| s.qualified_name == sym.qualified_name)
452                    {
453                        c.symbols.push(sym);
454                    }
455                }
456            } else {
457                *cache = Some(crate::IndexCache {
458                    symbols: new_symbols,
459                    chunks: Vec::new(),
460                    root_path: String::new(),
461                });
462            }
463        }
464
465        Ok(results)
466    }
467}