1use std::path::Path;
2
3use crate::parser::{CodeUnit, UnitType};
4
/// Returns the trailing portion of `path` — at most its last four
/// components — rendered as a string via [`Path::display`].
///
/// Paths with four or fewer components are returned in full.
fn shorten_path(path: &Path) -> String {
    // Number of leading components to drop so that at most four remain.
    let total = path.components().count();
    let dropped = total.saturating_sub(4);

    let tail: std::path::PathBuf = path.components().skip(dropped).collect();
    tail.display().to_string()
}
17
/// Turns a path string into space-separated, lowercase words followed by the
/// original file name.
///
/// The transformation:
/// 1. Takes the final path component (after the last `/` or `\`) as the
///    original file name; it is appended verbatim at the end of the result.
/// 2. Strips the extension from that file name only. Dots in directory
///    names (`my.dir/`) or in relative prefixes (`./`, `../`) are never
///    treated as the extension separator, and a leading dot (`.gitignore`)
///    is considered part of the name rather than the start of an extension.
/// 3. Replaces `/` and `\` with ` / `, replaces `_`, `-` and `.` with a
///    single space, and inserts a space at lowercase→uppercase boundaries
///    (`myClassName` → `my Class Name`).
/// 4. Collapses whitespace runs and lowercases the result.
///
/// For example `src/utils/HttpClientHelper.rs` becomes
/// `src / utils / http client helper HttpClientHelper.rs`.
fn normalize_path_for_embedding(path_str: &str) -> String {
    // Both separators are single-byte ASCII, so `sep + 1` is a valid char boundary.
    let filename_start = path_str.rfind(['/', '\\']).map_or(0, |sep| sep + 1);
    let original_filename = &path_str[filename_start..];

    // Look for the extension dot inside the file name only; searching the
    // whole path would truncate at dots in directory names or `./` prefixes.
    let stem_end = match original_filename.rfind('.') {
        Some(dot) if dot > 0 => filename_start + dot,
        _ => path_str.len(),
    };
    let path_without_ext = &path_str[..stem_end];

    let mut spaced = String::with_capacity(path_without_ext.len() * 2);
    let mut prev: Option<char> = None;

    for c in path_without_ext.chars() {
        match c {
            '/' | '\\' => {
                // Surround the separator with spaces without emitting a leading
                // or doubled space.
                if !spaced.is_empty() && !spaced.ends_with(' ') {
                    spaced.push(' ');
                }
                spaced.push_str("/ ");
            }
            '_' | '-' | '.' => {
                if !spaced.ends_with(' ') {
                    spaced.push(' ');
                }
            }
            _ => {
                // camelCase boundary: an uppercase char directly after a lowercase one.
                if c.is_uppercase() && matches!(prev, Some(p) if p.is_lowercase()) {
                    spaced.push(' ');
                }
                spaced.push(c);
            }
        }
        prev = Some(c);
    }

    let normalized = spaced
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
        .to_lowercase();
    format!("{} {}", normalized, original_filename)
}
78
79pub fn build_embedding_text(unit: &CodeUnit) -> String {
82 let mut parts = Vec::new();
83
84 let type_str = match unit.unit_type {
86 UnitType::Function => "Function",
87 UnitType::Method => "Method",
88 UnitType::Class => "Class",
89 UnitType::Document => "Document",
90 UnitType::Section => "Section",
91 };
92 parts.push(format!("{}: {}", type_str, unit.name));
93
94 if !unit.signature.is_empty() {
95 parts.push(format!("Signature: {}", unit.signature));
96 }
97
98 if let Some(doc) = &unit.docstring {
99 if !doc.is_empty() {
100 parts.push(format!("Description: {}", doc));
101 }
102 }
103
104 if !unit.parameters.is_empty() {
105 parts.push(format!("Parameters: {}", unit.parameters.join(", ")));
106 }
107
108 if let Some(ret) = &unit.return_type {
109 if !ret.is_empty() {
110 parts.push(format!("Returns: {}", ret));
111 }
112 }
113
114 if !unit.calls.is_empty() {
116 parts.push(format!("Calls: {}", unit.calls.join(", ")));
117 }
118
119 if !unit.called_by.is_empty() {
120 parts.push(format!("Called by: {}", unit.called_by.join(", ")));
121 }
122
123 let mut flow_info = Vec::new();
125 if unit.complexity > 1 {
126 flow_info.push(format!("complexity={}", unit.complexity));
127 }
128 if unit.has_loops {
129 flow_info.push("has_loops".to_string());
130 }
131 if unit.has_branches {
132 flow_info.push("has_branches".to_string());
133 }
134 if unit.has_error_handling {
135 flow_info.push("handles_errors".to_string());
136 }
137 if !flow_info.is_empty() {
138 parts.push(format!("Control flow: {}", flow_info.join(", ")));
139 }
140
141 if !unit.variables.is_empty() {
143 parts.push(format!("Variables: {}", unit.variables.join(", ")));
144 }
145
146 if !unit.imports.is_empty() {
148 parts.push(format!("Uses: {}", unit.imports.join(", ")));
149 }
150
151 if !unit.code_preview.is_empty() {
153 parts.push(format!("Code:\n{}", unit.code_preview));
154 }
155
156 parts.push(format!(
158 "File: {}",
159 normalize_path_for_embedding(&shorten_path(&unit.file))
160 ));
161
162 parts.join("\n")
163}
164
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that normalizing `input` yields exactly `expected`.
    #[track_caller]
    fn assert_normalized(input: &str, expected: &str) {
        assert_eq!(normalize_path_for_embedding(input), expected);
    }

    // Forward slashes become " / " and the extension is dropped from the words.
    #[test]
    fn test_normalize_path_separators() {
        assert_normalized("src/parser/mod.rs", "src / parser / mod mod.rs");
    }

    // Backslashes are treated exactly like forward slashes.
    #[test]
    fn test_normalize_backslash_separators() {
        assert_normalized("src\\parser\\mod.rs", "src / parser / mod mod.rs");
    }

    #[test]
    fn test_normalize_underscores() {
        assert_normalized("my_file_name.py", "my file name my_file_name.py");
    }

    #[test]
    fn test_normalize_hyphens() {
        assert_normalized("my-file-name.py", "my file name my-file-name.py");
    }

    // PascalCase is split at each lowercase→uppercase boundary.
    #[test]
    fn test_normalize_camel_case() {
        assert_normalized("MyClassName.ts", "my class name MyClassName.ts");
    }

    #[test]
    fn test_normalize_camel_case_lowercase_start() {
        assert_normalized("myClassName.ts", "my class name myClassName.ts");
    }

    #[test]
    fn test_normalize_combined() {
        assert_normalized(
            "src/utils/HttpClientHelper.rs",
            "src / utils / http client helper HttpClientHelper.rs",
        );
    }

    #[test]
    fn test_normalize_snake_case_path() {
        assert_normalized(
            "src/my_module/file_utils.py",
            "src / my module / file utils file_utils.py",
        );
    }

    #[test]
    fn test_normalize_mixed_separators() {
        assert_normalized("my_great-file.rs", "my great file my_great-file.rs");
    }

    // An empty input still produces the single joining space.
    #[test]
    fn test_normalize_empty_string() {
        assert_normalized("", " ");
    }

    #[test]
    fn test_normalize_simple_filename() {
        assert_normalized("main.rs", "main main.rs");
    }

    // Runs of word separators collapse into a single space.
    #[test]
    fn test_normalize_consecutive_separators() {
        assert_normalized("my__file--name.rs", "my file name my__file--name.rs");
    }

    #[test]
    fn test_normalize_no_extension() {
        assert_normalized("src/Makefile", "src / makefile Makefile");
    }
}