next_plaid_cli/
embed.rs

1use std::path::Path;
2
3use crate::parser::{CodeUnit, UnitType};
4
/// Trim a path down to its final four components at most, i.e. the filename
/// plus up to three parent folders.
///
/// Shorter paths are returned unchanged. The result is rendered with
/// `Path::display`, so any non-UTF-8 portions are converted lossily.
fn shorten_path(path: &Path) -> String {
    // Filename + 3 parent folders.
    const MAX_COMPONENTS: usize = 4;

    // Number of leading components to drop; zero when the path is already short.
    let surplus = path.components().count().saturating_sub(MAX_COMPONENTS);

    let tail: std::path::PathBuf = path.components().skip(surplus).collect();
    tail.display().to_string()
}
17
/// Normalize a path string for better embedding by separating words:
/// - Add spaces around path separators (`/` and `\`), normalizing `\` to `/`
/// - Replace underscores, hyphens, and dots with spaces
/// - Split CamelCase words (e.g., "MyClassName" -> "My Class Name")
/// - Lowercase the processed text and collapse runs of whitespace
/// - Remove the extension from the processed text (it's in the appended filename)
/// - Append the original filename, unmodified, at the end
///
/// The extension is the part of the *filename* after its last dot. Dots in
/// directory names never truncate the path, and a filename that merely starts
/// with a dot (e.g. `.gitignore`) is treated as having no extension, matching
/// the convention of `Path::extension`.
///
/// Returns `"<normalized path> <original filename>"`; for an empty input this
/// is a single space.
fn normalize_path_for_embedding(path_str: &str) -> String {
    // The filename is everything after the last separator (or the whole input).
    // Both separators are single-byte ASCII, so `sep + 1` is a char boundary.
    let name_start = path_str.rfind(['/', '\\']).map_or(0, |sep| sep + 1);
    let original_filename = &path_str[name_start..];

    // Look for the extension dot inside the filename only. Searching the whole
    // path would cut it short at a dotted directory ("v1.2/Makefile") or a
    // leading "./" whenever the filename itself has no extension.
    let stem_end = match original_filename.rfind('.') {
        Some(dot) if dot > 0 => name_start + dot,
        _ => path_str.len(),
    };
    let path_without_ext = &path_str[..stem_end];

    let mut result = String::with_capacity(path_without_ext.len() * 2);
    let mut prev: Option<char> = None;

    for c in path_without_ext.chars() {
        match c {
            '/' | '\\' => {
                // Surround separators with spaces and normalize `\` to `/`.
                if !result.is_empty() && !result.ends_with(' ') {
                    result.push(' ');
                }
                result.push_str("/ ");
            }
            '_' | '-' | '.' => {
                // Word delimiters become a single space.
                if !result.ends_with(' ') {
                    result.push(' ');
                }
            }
            _ => {
                // CamelCase boundary: an uppercase char right after a lowercase one.
                if c.is_uppercase() && matches!(prev, Some(p) if p.is_lowercase()) {
                    result.push(' ');
                }
                result.push(c);
            }
        }
        prev = Some(c);
    }

    // Collapse repeated whitespace, trim, lowercase, and append the original filename.
    let normalized = result
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
        .to_lowercase();
    format!("{} {}", normalized, original_filename)
}
78
79/// Build text representation combining all 5 analysis layers.
80/// This rich text is what gets embedded by ColBERT for semantic search.
81pub fn build_embedding_text(unit: &CodeUnit) -> String {
82    let mut parts = Vec::new();
83
84    // === Layer 1: AST (Identity + Signature) ===
85    let type_str = match unit.unit_type {
86        UnitType::Function => "Function",
87        UnitType::Method => "Method",
88        UnitType::Class => "Class",
89        UnitType::Document => "Document",
90        UnitType::Section => "Section",
91    };
92    parts.push(format!("{}: {}", type_str, unit.name));
93
94    if !unit.signature.is_empty() {
95        parts.push(format!("Signature: {}", unit.signature));
96    }
97
98    if let Some(doc) = &unit.docstring {
99        if !doc.is_empty() {
100            parts.push(format!("Description: {}", doc));
101        }
102    }
103
104    if !unit.parameters.is_empty() {
105        parts.push(format!("Parameters: {}", unit.parameters.join(", ")));
106    }
107
108    if let Some(ret) = &unit.return_type {
109        if !ret.is_empty() {
110            parts.push(format!("Returns: {}", ret));
111        }
112    }
113
114    // === Layer 2: Call Graph ===
115    if !unit.calls.is_empty() {
116        parts.push(format!("Calls: {}", unit.calls.join(", ")));
117    }
118
119    if !unit.called_by.is_empty() {
120        parts.push(format!("Called by: {}", unit.called_by.join(", ")));
121    }
122
123    // === Layer 3: Control Flow ===
124    let mut flow_info = Vec::new();
125    if unit.complexity > 1 {
126        flow_info.push(format!("complexity={}", unit.complexity));
127    }
128    if unit.has_loops {
129        flow_info.push("has_loops".to_string());
130    }
131    if unit.has_branches {
132        flow_info.push("has_branches".to_string());
133    }
134    if unit.has_error_handling {
135        flow_info.push("handles_errors".to_string());
136    }
137    if !flow_info.is_empty() {
138        parts.push(format!("Control flow: {}", flow_info.join(", ")));
139    }
140
141    // === Layer 4: Data Flow ===
142    if !unit.variables.is_empty() {
143        parts.push(format!("Variables: {}", unit.variables.join(", ")));
144    }
145
146    // === Layer 5: Dependencies ===
147    if !unit.imports.is_empty() {
148        parts.push(format!("Uses: {}", unit.imports.join(", ")));
149    }
150
151    // === Code Preview ===
152    if !unit.code_preview.is_empty() {
153        parts.push(format!("Code:\n{}", unit.code_preview));
154    }
155
156    // === File Path (shortened for better LLM encoding) ===
157    parts.push(format!(
158        "File: {}",
159        normalize_path_for_embedding(&shorten_path(&unit.file))
160    ));
161
162    parts.join("\n")
163}
164
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that normalizing `input` produces exactly `expected`.
    /// `track_caller` makes a failure point at the calling test.
    #[track_caller]
    fn expect_normalized(input: &str, expected: &str) {
        assert_eq!(normalize_path_for_embedding(input), expected);
    }

    #[test]
    fn test_normalize_path_separators() {
        expect_normalized("src/parser/mod.rs", "src / parser / mod mod.rs");
    }

    #[test]
    fn test_normalize_backslash_separators() {
        // Backslash separators come out as forward slashes.
        expect_normalized("src\\parser\\mod.rs", "src / parser / mod mod.rs");
    }

    #[test]
    fn test_normalize_underscores() {
        expect_normalized("my_file_name.py", "my file name my_file_name.py");
    }

    #[test]
    fn test_normalize_hyphens() {
        expect_normalized("my-file-name.py", "my file name my-file-name.py");
    }

    #[test]
    fn test_normalize_camel_case() {
        expect_normalized("MyClassName.ts", "my class name MyClassName.ts");
    }

    #[test]
    fn test_normalize_camel_case_lowercase_start() {
        expect_normalized("myClassName.ts", "my class name myClassName.ts");
    }

    #[test]
    fn test_normalize_combined() {
        expect_normalized(
            "src/utils/HttpClientHelper.rs",
            "src / utils / http client helper HttpClientHelper.rs",
        );
    }

    #[test]
    fn test_normalize_snake_case_path() {
        expect_normalized(
            "src/my_module/file_utils.py",
            "src / my module / file utils file_utils.py",
        );
    }

    #[test]
    fn test_normalize_mixed_separators() {
        expect_normalized("my_great-file.rs", "my great file my_great-file.rs");
    }

    #[test]
    fn test_normalize_empty_string() {
        // Empty normalized part + separator space + empty filename.
        expect_normalized("", " ");
    }

    #[test]
    fn test_normalize_simple_filename() {
        expect_normalized("main.rs", "main main.rs");
    }

    #[test]
    fn test_normalize_consecutive_separators() {
        // Runs of underscores/hyphens collapse into one space.
        expect_normalized("my__file--name.rs", "my file name my__file--name.rs");
    }

    #[test]
    fn test_normalize_no_extension() {
        expect_normalized("src/Makefile", "src / makefile Makefile");
    }
}