use std::path::Path;
use crate::parser::{CodeUnit, UnitType};
/// Renders the trailing components of `path` as a `String`.
///
/// At most the last four components are kept; a path with four or fewer
/// components is returned in full. The result is produced through
/// `Path::display`, so non-UTF-8 bytes are rendered lossily.
fn shorten_path(path: &Path) -> String {
    // Number of trailing components preserved in the shortened form.
    const KEEP: usize = 4;

    let total = path.components().count();
    let tail: std::path::PathBuf = path
        .components()
        .skip(total.saturating_sub(KEEP))
        .collect();
    tail.display().to_string()
}
/// Turns a path string into space-separated lowercase tokens followed by the
/// original file name.
///
/// The extension of the final component is dropped, then:
/// * `/` and `\` become a standalone `/` token,
/// * `_`, `-` and `.` become word breaks,
/// * a lowercase-to-uppercase transition (camelCase) becomes a word break.
///
/// The tokens are lowercased and joined with single spaces, and the untouched
/// final path component is appended after one space. For example
/// `src/utils/HttpClientHelper.rs` becomes
/// `src / utils / http client helper HttpClientHelper.rs`.
///
/// The extension is searched for only inside the final component, so a dot in
/// a directory name (`src/my.dir/Makefile`) does not truncate the path. A
/// leading dot (`.gitignore`) is treated as part of the name, not as the start
/// of an extension.
fn normalize_path_for_embedding(path_str: &str) -> String {
    // Both separators are single-byte ASCII, so `i + 1` is a valid char boundary.
    let name_start = path_str.rfind(['/', '\\']).map_or(0, |i| i + 1);
    let original_filename = &path_str[name_start..];

    // Strip the extension only when the dot belongs to the file name itself
    // and is not its first character.
    let path_without_ext = match original_filename.rfind('.') {
        Some(dot) if dot > 0 => &path_str[..name_start + dot],
        _ => path_str,
    };

    let mut result = String::with_capacity(path_without_ext.len() * 2);
    let mut prev: Option<char> = None;
    for c in path_without_ext.chars() {
        match c {
            '/' | '\\' => {
                if !result.is_empty() && !result.ends_with(' ') {
                    result.push(' ');
                }
                result.push_str("/ ");
            }
            '_' | '-' | '.' => {
                if !result.ends_with(' ') {
                    result.push(' ');
                }
            }
            _ => {
                // camelCase boundary: an uppercase letter right after a lowercase one.
                if c.is_uppercase() && prev.map_or(false, char::is_lowercase) {
                    result.push(' ');
                }
                result.push(c);
            }
        }
        prev = Some(c);
    }

    // Collapse any runs of whitespace introduced above into single spaces.
    let normalized = result
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
        .to_lowercase();
    format!("{} {}", normalized, original_filename)
}
/// Renders `unit` as a newline-separated block of labelled sections.
///
/// The first line is always `<Kind>: <name>` and the last line is always
/// `File: …`, built from the shortened and normalized file path. Every section
/// in between is emitted only when its source field is non-empty; the
/// `Control flow` section lists the complexity only when it is greater than 1,
/// plus one tag per flag that is set.
pub fn build_embedding_text(unit: &CodeUnit) -> String {
    let kind_label = match unit.unit_type {
        UnitType::Function => "Function",
        UnitType::Method => "Method",
        UnitType::Class => "Class",
        UnitType::Document => "Document",
        UnitType::Section => "Section",
    };

    let mut sections = vec![format!("{}: {}", kind_label, unit.name)];

    if !unit.signature.is_empty() {
        sections.push(format!("Signature: {}", unit.signature));
    }
    if let Some(doc) = unit.docstring.as_ref().filter(|text| !text.is_empty()) {
        sections.push(format!("Description: {}", doc));
    }
    if !unit.parameters.is_empty() {
        sections.push(format!("Parameters: {}", unit.parameters.join(", ")));
    }
    if let Some(ret) = unit.return_type.as_ref().filter(|text| !text.is_empty()) {
        sections.push(format!("Returns: {}", ret));
    }
    if !unit.calls.is_empty() {
        sections.push(format!("Calls: {}", unit.calls.join(", ")));
    }
    if !unit.called_by.is_empty() {
        sections.push(format!("Called by: {}", unit.called_by.join(", ")));
    }

    // Control-flow summary: complexity first, then the boolean flags in a
    // fixed order.
    let mut flow_tags: Vec<String> = Vec::new();
    if unit.complexity > 1 {
        flow_tags.push(format!("complexity={}", unit.complexity));
    }
    let flags = [
        (unit.has_loops, "has_loops"),
        (unit.has_branches, "has_branches"),
        (unit.has_error_handling, "handles_errors"),
    ];
    flow_tags.extend(
        flags
            .iter()
            .filter(|(set, _)| *set)
            .map(|(_, tag)| tag.to_string()),
    );
    if !flow_tags.is_empty() {
        sections.push(format!("Control flow: {}", flow_tags.join(", ")));
    }

    if !unit.variables.is_empty() {
        sections.push(format!("Variables: {}", unit.variables.join(", ")));
    }
    if !unit.imports.is_empty() {
        sections.push(format!("Uses: {}", unit.imports.join(", ")));
    }
    if !unit.code_preview.is_empty() {
        sections.push(format!("Code:\n{}", unit.code_preview));
    }

    let file_tokens = normalize_path_for_embedding(&shorten_path(&unit.file));
    sections.push(format!("File: {}", file_tokens));

    sections.join("\n")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that normalizing `input` yields exactly `expected`.
    ///
    /// `#[track_caller]` makes a failure point at the calling test rather
    /// than at this helper.
    #[track_caller]
    fn check(input: &str, expected: &str) {
        assert_eq!(normalize_path_for_embedding(input), expected);
    }

    #[test]
    fn test_normalize_path_separators() {
        check("src/parser/mod.rs", "src / parser / mod mod.rs");
    }

    #[test]
    fn test_normalize_backslash_separators() {
        check("src\\parser\\mod.rs", "src / parser / mod mod.rs");
    }

    #[test]
    fn test_normalize_underscores() {
        check("my_file_name.py", "my file name my_file_name.py");
    }

    #[test]
    fn test_normalize_hyphens() {
        check("my-file-name.py", "my file name my-file-name.py");
    }

    #[test]
    fn test_normalize_camel_case() {
        check("MyClassName.ts", "my class name MyClassName.ts");
    }

    #[test]
    fn test_normalize_camel_case_lowercase_start() {
        check("myClassName.ts", "my class name myClassName.ts");
    }

    #[test]
    fn test_normalize_combined() {
        check(
            "src/utils/HttpClientHelper.rs",
            "src / utils / http client helper HttpClientHelper.rs",
        );
    }

    #[test]
    fn test_normalize_snake_case_path() {
        check(
            "src/my_module/file_utils.py",
            "src / my module / file utils file_utils.py",
        );
    }

    #[test]
    fn test_normalize_mixed_separators() {
        check("my_great-file.rs", "my great file my_great-file.rs");
    }

    #[test]
    fn test_normalize_empty_string() {
        // With no tokens and no file name, only the joining space remains.
        check("", " ");
    }

    #[test]
    fn test_normalize_simple_filename() {
        check("main.rs", "main main.rs");
    }

    #[test]
    fn test_normalize_consecutive_separators() {
        check("my__file--name.rs", "my file name my__file--name.rs");
    }

    #[test]
    fn test_normalize_no_extension() {
        check("src/Makefile", "src / makefile Makefile");
    }
}