Skip to main content

codemem_engine/
search.rs

1//! Code search and summary tree domain logic.
2
3use crate::index::symbol::{symbol_from_graph_node, Symbol};
4use crate::CodememEngine;
5use codemem_core::{CodememError, GraphBackend, NodeKind, RelationshipType};
6use serde::Serialize;
7
8/// A code search result returned by semantic vector search over symbols and chunks.
9#[derive(Debug, Clone, Serialize)]
10pub struct CodeSearchResult {
11    pub id: String,
12    /// "chunk" or the graph node kind string (e.g. "function", "class").
13    pub kind: String,
14    pub label: String,
15    pub similarity: f64,
16    pub file_path: Option<serde_json::Value>,
17    pub line_start: Option<serde_json::Value>,
18    pub line_end: Option<serde_json::Value>,
19    // Symbol-specific fields
20    pub qualified_name: Option<String>,
21    pub signature: Option<serde_json::Value>,
22    pub doc_comment: Option<serde_json::Value>,
23    // Chunk-specific fields
24    pub node_kind: Option<serde_json::Value>,
25    pub parent_symbol: Option<serde_json::Value>,
26    pub non_ws_chars: Option<serde_json::Value>,
27}
28
29/// A recursive tree node for the summary tree.
30#[derive(Debug, Clone, Serialize)]
31pub struct SummaryTreeNode {
32    pub id: String,
33    pub kind: String,
34    pub label: String,
35    pub centrality: f64,
36    #[serde(skip_serializing_if = "Option::is_none")]
37    pub symbol_count: Option<usize>,
38    #[serde(skip_serializing_if = "Option::is_none")]
39    pub chunk_count: Option<usize>,
40    #[serde(skip_serializing_if = "Vec::is_empty")]
41    pub children: Vec<SummaryTreeNode>,
42}
43
44/// A filtered symbol match from the in-memory index cache.
45#[derive(Debug, Clone, Serialize)]
46pub struct SymbolSearchResult {
47    pub name: String,
48    pub qualified_name: String,
49    pub kind: String,
50    pub signature: String,
51    pub file_path: String,
52    pub line_start: usize,
53    pub line_end: usize,
54    pub visibility: String,
55    pub parent: Option<String>,
56}
57
58/// Extract code references from free-form text for use in auto-linking.
59///
60/// Recognizes:
61/// - CamelCase identifiers (likely type/class names): `ProcessRequest`, `HashMap`
62/// - Backtick-wrapped code: content from \`...\`
63/// - Function-like patterns: `word()` or `word::path`
64/// - Short file paths: patterns like `src/foo.rs`, `lib/bar.py`
65///
66/// Returns deduplicated references suitable for matching against graph node labels.
67pub fn extract_code_references(text: &str) -> Vec<String> {
68    use regex::Regex;
69    use std::collections::HashSet;
70    use std::sync::OnceLock;
71
72    static RE_CAMEL: OnceLock<Regex> = OnceLock::new();
73    static RE_BACKTICK: OnceLock<Regex> = OnceLock::new();
74    static RE_FUNC: OnceLock<Regex> = OnceLock::new();
75    static RE_PATH: OnceLock<Regex> = OnceLock::new();
76
77    let re_camel = RE_CAMEL.get_or_init(|| {
78        // CamelCase: at least two words, each starting uppercase, min 4 chars total
79        Regex::new(r"\b([A-Z][a-z]+(?:[A-Z][a-z0-9]*)+)\b").unwrap()
80    });
81    let re_backtick = RE_BACKTICK.get_or_init(|| {
82        // Content inside backticks (non-greedy, at least 2 chars)
83        Regex::new(r"`([^`]{2,})`").unwrap()
84    });
85    let re_func = RE_FUNC.get_or_init(|| {
86        // Function calls like word() or qualified paths like word::path
87        Regex::new(r"\b([a-zA-Z_]\w*(?:::[a-zA-Z_]\w*)+)\b|\b([a-zA-Z_]\w*)\(\)").unwrap()
88    });
89    let re_path = RE_PATH.get_or_init(|| {
90        // File paths: word/word.ext patterns (2-4 segments, common extensions)
91        Regex::new(r"\b([a-zA-Z0-9_\-]+(?:/[a-zA-Z0-9_\-]+)*\.[a-zA-Z]{1,4})\b").unwrap()
92    });
93
94    let mut seen = HashSet::new();
95    let mut refs = Vec::new();
96
97    let mut add = |s: &str| {
98        let trimmed = s.trim();
99        if trimmed.len() >= 2 && seen.insert(trimmed.to_string()) {
100            refs.push(trimmed.to_string());
101        }
102    };
103
104    for cap in re_camel.captures_iter(text) {
105        if let Some(m) = cap.get(1) {
106            add(m.as_str());
107        }
108    }
109    for cap in re_backtick.captures_iter(text) {
110        if let Some(m) = cap.get(1) {
111            add(m.as_str());
112        }
113    }
114    for cap in re_func.captures_iter(text) {
115        // Group 1: qualified path (a::b::c), Group 2: function call (word())
116        if let Some(m) = cap.get(1) {
117            add(m.as_str());
118        }
119        if let Some(m) = cap.get(2) {
120            add(m.as_str());
121        }
122    }
123    for cap in re_path.captures_iter(text) {
124        if let Some(m) = cap.get(1) {
125            let path = m.as_str();
126            // Only include if it looks like a real file path (has a directory separator)
127            if path.contains('/') {
128                add(path);
129            }
130        }
131    }
132
133    refs
134}
135
136impl CodememEngine {
137    /// Estimate the approximate RAM usage of the in-memory graph.
138    ///
139    /// The graph engine (`GraphEngine`) keeps all nodes and edges in memory via
140    /// `HashMap`s plus a `petgraph::DiGraph`. Each node carries a `GraphNode` struct
141    /// (~200 bytes: id, kind, label, payload HashMap, centrality, optional memory_id
142    /// and namespace) plus a petgraph `NodeIndex`. Each edge carries an `Edge` struct
143    /// (~150 bytes: id, src, dst, relationship, weight, properties HashMap, timestamps)
144    /// plus petgraph edge weight storage and adjacency list entries.
145    ///
146    /// Returns an estimate in bytes. Actual usage may be higher due to HashMap overhead,
147    /// string heap allocations, and payload contents.
148    #[cfg(test)]
149    pub fn graph_memory_estimate(&self) -> usize {
150        let (node_count, edge_count) = match self.lock_graph() {
151            Ok(g) => (g.node_count(), g.edge_count()),
152            Err(_) => (0, 0),
153        };
154        node_count * 200 + edge_count * 150
155    }
156
157    /// Semantic code search: embed query, vector search, filter to sym:/chunk: IDs,
158    /// enrich with graph node data.
159    pub fn search_code(
160        &self,
161        query: &str,
162        k: usize,
163    ) -> Result<Vec<CodeSearchResult>, CodememError> {
164        let emb_guard = self
165            .lock_embeddings()?
166            .ok_or_else(|| CodememError::InvalidInput("Embedding service not available".into()))?;
167
168        let query_embedding = emb_guard
169            .embed(query)
170            .map_err(|e| CodememError::InvalidInput(format!("Embedding failed: {e}")))?;
171        drop(emb_guard);
172
173        let vec = self.lock_vector()?;
174        let raw_results: Vec<(String, f32)> = vec
175            .search(&query_embedding, k * 3)
176            .unwrap_or_default()
177            .into_iter()
178            .filter(|(id, _)| id.starts_with("sym:") || id.starts_with("chunk:"))
179            .take(k)
180            .collect();
181        drop(vec);
182
183        if raw_results.is_empty() {
184            return Ok(Vec::new());
185        }
186
187        let mut output = Vec::new();
188        let now = chrono::Utc::now();
189        for (id, similarity_value) in &raw_results {
190            let similarity = *similarity_value as f64;
191            if let Ok(Some(node)) = self.storage.get_graph_node(id) {
192                // Skip expired nodes (deleted symbols/files)
193                if node.valid_to.is_some_and(|vt| vt <= now) {
194                    continue;
195                }
196                if id.starts_with("chunk:") {
197                    output.push(CodeSearchResult {
198                        id: id.clone(),
199                        kind: "chunk".to_string(),
200                        label: node.label,
201                        similarity,
202                        file_path: node.payload.get("file_path").cloned(),
203                        line_start: node.payload.get("line_start").cloned(),
204                        line_end: node.payload.get("line_end").cloned(),
205                        qualified_name: None,
206                        signature: None,
207                        doc_comment: None,
208                        node_kind: node.payload.get("node_kind").cloned(),
209                        parent_symbol: node.payload.get("parent_symbol").cloned(),
210                        non_ws_chars: node.payload.get("non_ws_chars").cloned(),
211                    });
212                } else {
213                    output.push(CodeSearchResult {
214                        id: id.clone(),
215                        kind: node.kind.to_string(),
216                        label: node.label,
217                        similarity,
218                        file_path: node.payload.get("file_path").cloned(),
219                        line_start: node.payload.get("line_start").cloned(),
220                        line_end: node.payload.get("line_end").cloned(),
221                        qualified_name: Some(id.strip_prefix("sym:").unwrap_or(id).to_string()),
222                        signature: node.payload.get("signature").cloned(),
223                        doc_comment: node.payload.get("doc_comment").cloned(),
224                        node_kind: None,
225                        parent_symbol: None,
226                        non_ws_chars: None,
227                    });
228                }
229            }
230        }
231
232        Ok(output)
233    }
234
235    /// Build a hierarchical summary tree starting from a given node.
236    /// Traverses Contains edges: packages -> files -> symbols.
237    pub fn summary_tree(
238        &self,
239        start_id: &str,
240        max_depth: usize,
241        include_chunks: bool,
242    ) -> Result<SummaryTreeNode, CodememError> {
243        let graph = self.lock_graph()?;
244
245        fn build_tree(
246            graph: &dyn GraphBackend,
247            node_id: &str,
248            depth: usize,
249            max_depth: usize,
250            include_chunks: bool,
251        ) -> Option<SummaryTreeNode> {
252            if depth > max_depth {
253                return None;
254            }
255            let node = match graph.get_node(node_id) {
256                Ok(Some(n)) => n,
257                _ => return None,
258            };
259
260            let mut children: Vec<SummaryTreeNode> = Vec::new();
261            if depth < max_depth {
262                if let Ok(edges) = graph.get_edges(node_id) {
263                    let mut child_ids: Vec<String> = edges
264                        .iter()
265                        .filter(|e| {
266                            e.src == node_id && e.relationship == RelationshipType::Contains
267                        })
268                        .map(|e| e.dst.clone())
269                        .collect();
270                    child_ids.sort();
271
272                    for child_id in &child_ids {
273                        if !include_chunks && child_id.starts_with("chunk:") {
274                            continue;
275                        }
276                        if let Some(child) =
277                            build_tree(graph, child_id, depth + 1, max_depth, include_chunks)
278                        {
279                            children.push(child);
280                        }
281                    }
282                }
283            }
284
285            let (symbol_count, chunk_count) = if node.kind == NodeKind::File {
286                let syms = children
287                    .iter()
288                    .filter(|c| c.kind != "chunk" && c.kind != "package" && c.kind != "file")
289                    .count();
290                let chunks = if include_chunks {
291                    children.iter().filter(|c| c.kind == "chunk").count()
292                } else {
293                    0
294                };
295                (Some(syms), Some(chunks))
296            } else {
297                (None, None)
298            };
299
300            Some(SummaryTreeNode {
301                id: node.id,
302                kind: node.kind.to_string(),
303                label: node.label,
304                centrality: node.centrality,
305                symbol_count,
306                chunk_count,
307                children,
308            })
309        }
310
311        build_tree(&**graph, start_id, 0, max_depth, include_chunks)
312            .ok_or_else(|| CodememError::NotFound(format!("Node not found: {start_id}")))
313    }
314
315    /// Look up a single symbol by qualified name with cache-through to the DB.
316    ///
317    /// 1. Check the in-memory `IndexCache`.
318    /// 2. On miss, query storage for `sym:{qualified_name}` graph node.
319    /// 3. Reconstruct a `Symbol`, insert into cache, return.
320    pub fn get_symbol(&self, qualified_name: &str) -> Result<Option<Symbol>, CodememError> {
321        // 1. Check cache
322        {
323            let cache = self.lock_index_cache()?;
324            if let Some(ref c) = *cache {
325                if let Some(sym) = c
326                    .symbols
327                    .iter()
328                    .find(|s| s.qualified_name == qualified_name)
329                {
330                    return Ok(Some(sym.clone()));
331                }
332            }
333        }
334
335        // 2. Cache miss — query DB
336        let node_id = format!("sym:{qualified_name}");
337        let node = match self.storage.get_graph_node(&node_id)? {
338            Some(n) => n,
339            None => return Ok(None),
340        };
341
342        // 3. Reconstruct Symbol from GraphNode
343        let sym = match symbol_from_graph_node(&node) {
344            Some(s) => s,
345            None => return Ok(None),
346        };
347
348        // 4. Insert into cache (init if None)
349        {
350            let mut cache = self.lock_index_cache()?;
351            if let Some(ref mut c) = *cache {
352                // Dedup: only insert if not already present
353                if !c.symbols.iter().any(|s| s.qualified_name == qualified_name) {
354                    c.symbols.push(sym.clone());
355                }
356            } else {
357                *cache = Some(crate::IndexCache {
358                    symbols: vec![sym.clone()],
359                    chunks: Vec::new(),
360                    root_path: String::new(),
361                });
362            }
363        }
364
365        Ok(Some(sym))
366    }
367
368    /// Search symbols by name, with optional kind filter. Falls through to the DB
369    /// when the in-memory cache is empty or has no matches.
370    pub fn search_symbols(
371        &self,
372        query: &str,
373        limit: usize,
374        kind_filter: Option<&str>,
375    ) -> Result<Vec<SymbolSearchResult>, CodememError> {
376        let query_lower = query.to_lowercase();
377        let kind_lower = kind_filter.map(|k| k.to_lowercase());
378
379        // Helper closure: filter + map a Symbol into a SymbolSearchResult
380        let matches_filter = |sym: &Symbol| -> bool {
381            let name_match = sym.name.to_lowercase().contains(&query_lower)
382                || sym.qualified_name.to_lowercase().contains(&query_lower);
383            if !name_match {
384                return false;
385            }
386            if let Some(ref kl) = kind_lower {
387                return sym.kind.to_string().to_lowercase() == *kl;
388            }
389            true
390        };
391
392        let to_result = |sym: &Symbol| -> SymbolSearchResult {
393            SymbolSearchResult {
394                name: sym.name.clone(),
395                qualified_name: sym.qualified_name.clone(),
396                kind: sym.kind.to_string(),
397                signature: sym.signature.clone(),
398                file_path: sym.file_path.clone(),
399                line_start: sym.line_start,
400                line_end: sym.line_end,
401                visibility: sym.visibility.to_string(),
402                parent: sym.parent.clone(),
403            }
404        };
405
406        // 1. Try cache first — only short-circuit if we got enough results
407        let mut results = Vec::new();
408        let mut seen_qnames = std::collections::HashSet::new();
409        {
410            let cache = self.lock_index_cache()?;
411            if let Some(ref c) = *cache {
412                for sym in c.symbols.iter().filter(|s| matches_filter(s)) {
413                    if seen_qnames.insert(sym.qualified_name.clone()) {
414                        results.push(to_result(sym));
415                        if results.len() >= limit {
416                            return Ok(results);
417                        }
418                    }
419                }
420            }
421        }
422
423        // 2. Cache empty or partial results — top up from DB.
424        // Over-fetch by 3× because search_graph_nodes returns all node kinds
425        // (files, chunks, packages…) and we filter to sym:* nodes only.
426        // NOTE: namespace=None is intentional — the in-memory cache is already
427        // cross-namespace, so the DB fallback should match that behavior.
428        let db_nodes = self
429            .storage
430            .search_graph_nodes(&query_lower, None, limit * 3)?;
431
432        let mut new_symbols = Vec::new();
433
434        for node in &db_nodes {
435            if !node.id.starts_with("sym:") {
436                continue;
437            }
438            if let Some(sym) = symbol_from_graph_node(node) {
439                if matches_filter(&sym) && seen_qnames.insert(sym.qualified_name.clone()) {
440                    results.push(to_result(&sym));
441                    new_symbols.push(sym);
442                    if results.len() >= limit {
443                        break;
444                    }
445                }
446            }
447        }
448
449        // 3. Populate cache with DB results for future queries
450        if !new_symbols.is_empty() {
451            let mut cache = self.lock_index_cache()?;
452            if let Some(ref mut c) = *cache {
453                for sym in new_symbols {
454                    if !c
455                        .symbols
456                        .iter()
457                        .any(|s| s.qualified_name == sym.qualified_name)
458                    {
459                        c.symbols.push(sym);
460                    }
461                }
462            } else {
463                *cache = Some(crate::IndexCache {
464                    symbols: new_symbols,
465                    chunks: Vec::new(),
466                    root_path: String::new(),
467                });
468            }
469        }
470
471        Ok(results)
472    }
473}