use super::runtime::parse_bool_env;
/// Splits a snake_case or camelCase identifier into space-separated words,
/// e.g. `parse_bool_env` -> "parse bool env", `HTTPServer` -> "HTTP Server".
/// Identifiers that yield fewer than two words are returned unchanged.
pub fn split_identifier(name: &str) -> String {
    // Fast path: nothing to split.
    if !name.contains('_') && !name.chars().any(char::is_uppercase) {
        return name.to_string();
    }
    let chars: Vec<char> = name.chars().collect();
    let mut words: Vec<String> = Vec::new();
    let mut word = String::new();
    for (idx, &c) in chars.iter().enumerate() {
        if c == '_' {
            // Underscores are separators and are dropped.
            if !word.is_empty() {
                words.push(std::mem::take(&mut word));
            }
            continue;
        }
        // An uppercase char starts a new word when it follows a lowercase
        // char, or when it begins an Upper+lower run inside an acronym
        // (the "S" in "HTTPServer").
        let prev_lower = word.chars().last().is_some_and(char::is_lowercase);
        let next_lower = chars.get(idx + 1).is_some_and(|c| c.is_lowercase());
        if c.is_uppercase() && !word.is_empty() && (prev_lower || next_lower) {
            words.push(std::mem::take(&mut word));
        }
        word.push(c);
    }
    if !word.is_empty() {
        words.push(word);
    }
    if words.len() <= 1 {
        name.to_string()
    } else {
        words.join(" ")
    }
}
/// Heuristically decides whether `sym` is test-only code, using its file
/// path, its symbol name path, and — when `source` is available — nearby
/// attributes/annotations and per-language naming conventions.
pub fn is_test_only_symbol(sym: &crate::db::SymbolWithFile, source: Option<&str>) -> bool {
    let fp = &sym.file_path;
    // Path-based checks; these need no source text.
    if fp.contains("/tests/") || fp.ends_with("_tests.rs") {
        return true;
    }
    if fp.contains("/__tests__/") || fp.contains("\\__tests__\\") {
        return true;
    }
    if fp.ends_with("_test.py") {
        return true;
    }
    if fp.ends_with("_test.go") {
        return true;
    }
    if fp.ends_with(".test.ts")
        || fp.ends_with(".test.tsx")
        || fp.ends_with(".test.js")
        || fp.ends_with(".test.jsx")
        || fp.ends_with(".spec.ts")
        || fp.ends_with(".spec.js")
    {
        return true;
    }
    if fp.contains("/src/test/") {
        return true;
    }
    if fp.ends_with("Test.java") || fp.ends_with("Tests.java") {
        return true;
    }
    if fp.ends_with("_test.rb") || fp.contains("/spec/") {
        return true;
    }
    // Rust `mod tests` / `mod test` module-path conventions.
    if sym.name_path.starts_with("tests::")
        || sym.name_path.contains("::tests::")
        || sym.name_path.starts_with("test::")
        || sym.name_path.contains("::test::")
    {
        return true;
    }
    let Some(source) = source else {
        return false;
    };
    // Clamp the symbol start into range, then snap DOWN to a char boundary so
    // the `&source[..start]` slice below cannot panic on multi-byte UTF-8.
    // (Previously only clamped, which could panic for Java/Kotlin files.)
    let mut start = usize::try_from(sym.start_byte.max(0))
        .unwrap_or(0)
        .min(source.len());
    while start > 0 && !source.is_char_boundary(start) {
        start -= 1;
    }
    // Scan up to 2 KiB of preceding bytes for Rust test attributes. Lossy
    // decoding tolerates a window that begins mid-character.
    let window_start = start.saturating_sub(2048);
    let attrs = String::from_utf8_lossy(&source.as_bytes()[window_start..start]);
    if attrs.contains("#[test]")
        || attrs.contains("#[tokio::test]")
        || attrs.contains("#[cfg(test)]")
        || attrs.contains("#[cfg(all(test")
    {
        return true;
    }
    // Python naming conventions.
    if fp.ends_with(".py") {
        if sym.name.starts_with("test_") {
            return true;
        }
        if sym.kind == "class" && sym.name.starts_with("Test") {
            return true;
        }
    }
    // Go naming convention.
    if fp.ends_with(".go") && sym.name.starts_with("Test") && sym.kind == "function" {
        return true;
    }
    // JUnit-style annotations directly above the symbol.
    if fp.ends_with(".java") || fp.ends_with(".kt") {
        let before = &source[..start];
        // Snap the 200-byte lookback start DOWN to a char boundary too;
        // `&before[before.len() - 200..]` could previously split a char.
        let mut lookback = before.len().saturating_sub(200);
        while !before.is_char_boundary(lookback) {
            lookback -= 1;
        }
        let window = &before[lookback..];
        if window.contains("@Test")
            || window.contains("@ParameterizedTest")
            || window.contains("@RepeatedTest")
        {
            return true;
        }
    }
    false
}
/// Assembles the text embedded for a symbol. The base shape is
/// `<kind> <name> (<split name>) (in <parent>) [<dir>] in <file>: <signature>`,
/// optionally followed by ` — <doc or body hint>`, ` · NL: <tokens>`, and
/// ` · API: <calls>` sections harvested from the symbol body.
pub fn build_embedding_text(sym: &crate::db::SymbolWithFile, source: Option<&str>) -> String {
    // " in <filename>" — last '/'-separated path segment only.
    // NOTE(review): '\\'-separated (Windows) paths keep the whole path here — confirm intended.
    let file_ctx = if sym.file_path.is_empty() {
        String::new()
    } else {
        let filename = sym.file_path.rsplit('/').next().unwrap_or(&sym.file_path);
        format!(" in {}", filename)
    };
    // Append a space-separated form of the identifier when it differs,
    // e.g. `parseJSON (parse JSON)`.
    let split_name = split_identifier(&sym.name);
    let name_with_split = if split_name != sym.name {
        format!("{} ({})", sym.name, split_name)
    } else {
        sym.name.clone()
    };
    // "(in <parent>)" from the '/'-separated symbol name path, when present.
    let parent_ctx = if !sym.name_path.is_empty() && sym.name_path.contains('/') {
        let parent = sym.name_path.rsplit_once('/').map(|x| x.0).unwrap_or("");
        if parent.is_empty() {
            String::new()
        } else {
            format!(" (in {})", parent)
        }
    } else {
        String::new()
    };
    // "[<dir>]" — the file's immediate parent directory, skipping the
    // uninformative "src" and "crates" names.
    let module_ctx = if sym.file_path.contains('/') {
        let parts: Vec<&str> = sym.file_path.rsplitn(3, '/').collect();
        if parts.len() >= 2 {
            let dir = parts[1];
            if dir != "src" && dir != "crates" {
                format!(" [{dir}]")
            } else {
                String::new()
            }
        } else {
            String::new()
        }
    } else {
        String::new()
    };
    let base = if sym.signature.is_empty() {
        format!(
            "{} {}{}{}{}",
            sym.kind, name_with_split, parent_ctx, module_ctx, file_ctx
        )
    } else {
        format!(
            "{} {}{}{}{}: {}",
            sym.kind, name_with_split, parent_ctx, module_ctx, file_ctx, sym.signature
        )
    };
    // Opt-out switch: only the literal values "0"/"false" disable the
    // docstring/hint enrichment below.
    let docstrings_disabled = std::env::var("CODELENS_EMBED_DOCSTRINGS")
        .map(|v| v == "0" || v == "false")
        .unwrap_or(false);
    if docstrings_disabled {
        return base;
    }
    // Prefer a leading docstring; fall back to the first meaningful body lines.
    let docstring = source
        .and_then(|src| extract_leading_doc(src, sym.start_byte as usize, sym.end_byte as usize))
        .unwrap_or_default();
    let mut text = if docstring.is_empty() {
        let body_hint = source
            .and_then(|src| extract_body_hint(src, sym.start_byte as usize, sym.end_byte as usize))
            .unwrap_or_default();
        if body_hint.is_empty() {
            base
        } else {
            format!("{} — {}", base, body_hint)
        }
    } else {
        // Keep only the first non-empty docstring lines, within the budget.
        let line_budget = hint_line_budget();
        let lines: Vec<String> = docstring
            .lines()
            .map(str::trim)
            .filter(|line| !line.is_empty())
            .take(line_budget)
            .map(str::to_string)
            .collect();
        let hint = join_hint_lines(&lines);
        if hint.is_empty() {
            base
        } else {
            format!("{} — {}", base, hint)
        }
    };
    // Optional prose tokens harvested from comments and string literals.
    if let Some(src) = source
        && let Some(nl_tokens) =
            extract_nl_tokens(src, sym.start_byte as usize, sym.end_byte as usize)
        && !nl_tokens.is_empty()
    {
        text.push_str(" · NL: ");
        text.push_str(&nl_tokens);
    }
    // Optional `Type::method` calls found in the body.
    if let Some(src) = source
        && let Some(api_calls) =
            extract_api_calls(src, sym.start_byte as usize, sym.end_byte as usize)
        && !api_calls.is_empty()
    {
        text.push_str(" · API: ");
        text.push_str(&api_calls);
    }
    text
}
/// Default cap, in characters, on a joined hint string.
const DEFAULT_HINT_TOTAL_CHAR_BUDGET: usize = 60;
/// Default number of hint lines folded into the embedding text.
const DEFAULT_HINT_LINES: usize = 1;
/// Character budget for hint text. `CODELENS_EMBED_HINT_CHARS` overrides the
/// default and is clamped to `60..=512`; unset or unparseable values fall
/// back to the default.
pub fn hint_char_budget() -> usize {
    std::env::var("CODELENS_EMBED_HINT_CHARS")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
        .map_or(DEFAULT_HINT_TOTAL_CHAR_BUDGET, |n| n.clamp(60, 512))
}
/// Line budget for hint text. `CODELENS_EMBED_HINT_LINES` overrides the
/// default and is clamped to `1..=10`.
pub fn hint_line_budget() -> usize {
    std::env::var("CODELENS_EMBED_HINT_LINES")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
        .map_or(DEFAULT_HINT_LINES, |n| n.clamp(1, 10))
}
/// Joins hint lines with " · " and truncates to the character budget,
/// appending "..." when truncation occurred.
pub fn join_hint_lines(lines: &[String]) -> String {
    if lines.is_empty() {
        return String::new();
    }
    // `[String]::join` accepts a `&str` separator directly; the previous
    // map-to-&str-then-collect round trip was redundant.
    let joined = lines.join(" · ");
    let budget = hint_char_budget();
    if joined.chars().count() > budget {
        let truncated: String = joined.chars().take(budget).collect();
        format!("{truncated}...")
    } else {
        joined
    }
}
pub fn extract_body_hint(source: &str, start: usize, end: usize) -> Option<String> {
if start >= source.len() || end > source.len() || start >= end {
return None;
}
let safe_start = if source.is_char_boundary(start) {
start
} else {
source.floor_char_boundary(start)
};
let safe_end = end.min(source.len());
let safe_end = if source.is_char_boundary(safe_end) {
safe_end
} else {
source.floor_char_boundary(safe_end)
};
let body = &source[safe_start..safe_end];
let max_lines = hint_line_budget();
let mut collected: Vec<String> = Vec::with_capacity(max_lines);
let mut past_signature = false;
for line in body.lines() {
let trimmed = line.trim();
if !past_signature {
if trimmed.ends_with('{') || trimmed.ends_with(':') || trimmed == "{" {
past_signature = true;
}
continue;
}
if trimmed.is_empty()
|| trimmed.starts_with("//")
|| trimmed.starts_with('#')
|| trimmed.starts_with("/*")
|| trimmed.starts_with('*')
|| trimmed == "}"
{
continue;
}
collected.push(trimmed.to_string());
if collected.len() >= max_lines {
break;
}
}
if collected.is_empty() {
None
} else {
Some(join_hint_lines(&collected))
}
}
pub fn nl_tokens_enabled() -> bool {
if let Some(explicit) = parse_bool_env("CODELENS_EMBED_HINT_INCLUDE_COMMENTS") {
return explicit;
}
auto_hint_should_enable()
}
/// Master switch for auto-hint heuristics; enabled unless
/// `CODELENS_EMBED_HINT_AUTO` is explicitly set falsy.
pub fn auto_hint_mode_enabled() -> bool {
    match parse_bool_env("CODELENS_EMBED_HINT_AUTO") {
        Some(explicit) => explicit,
        None => true,
    }
}
/// The language tag configured for auto-hints, trimmed and lowercased;
/// `None` when `CODELENS_EMBED_HINT_AUTO_LANG` is unset or not valid UTF-8.
pub fn auto_hint_lang() -> Option<String> {
    let raw = std::env::var("CODELENS_EMBED_HINT_AUTO_LANG").ok()?;
    Some(raw.trim().to_ascii_lowercase())
}
/// Whether the NL-token extraction stack supports the given language tag
/// (extension or full name, case/whitespace-insensitive).
pub fn language_supports_nl_stack(lang: &str) -> bool {
    const SUPPORTED: &[&str] = &[
        "rs", "rust", "cpp", "cc", "cxx", "c++", "c", "go", "golang", "java", "kt", "kotlin",
        "scala", "cs", "csharp", "ts", "typescript", "tsx", "js", "javascript", "jsx",
    ];
    let normalized = lang.trim().to_ascii_lowercase();
    SUPPORTED.contains(&normalized.as_str())
}
/// Whether sparse-term weighting is supported for the given language tag.
/// A strict subset of the NL-stack languages (no JS/TS family).
pub fn language_supports_sparse_weighting(lang: &str) -> bool {
    const SUPPORTED: &[&str] = &[
        "rs", "rust", "cpp", "cc", "cxx", "c++", "c", "go", "golang", "java", "kt", "kotlin",
        "scala", "cs", "csharp",
    ];
    let normalized = lang.trim().to_ascii_lowercase();
    SUPPORTED.contains(&normalized.as_str())
}
/// True when auto mode is on AND the configured language supports the
/// NL-token stack. No configured language means "off".
pub fn auto_hint_should_enable() -> bool {
    auto_hint_mode_enabled()
        && auto_hint_lang().is_some_and(|lang| language_supports_nl_stack(&lang))
}
/// True when auto mode is on AND the configured language supports sparse
/// weighting. No configured language means "off".
pub fn auto_sparse_should_enable() -> bool {
    auto_hint_mode_enabled()
        && auto_hint_lang().is_some_and(|lang| language_supports_sparse_weighting(&lang))
}
/// Heuristic "is this prose?": at least 4 chars after trimming, contains a
/// space, no path or module-path separators, and at least 60% alphabetic
/// characters among the non-whitespace ones.
pub fn is_nl_shaped(s: &str) -> bool {
    let text = s.trim();
    if text.chars().count() < 4 {
        return false;
    }
    // Paths and Rust paths are code, not prose.
    if ["/", "\\", "::"].iter().any(|sep| text.contains(*sep)) {
        return false;
    }
    if !text.contains(' ') {
        return false;
    }
    let non_ws = text.chars().filter(|c| !c.is_whitespace()).count();
    if non_ws == 0 {
        return false;
    }
    let alpha = text.chars().filter(|c| c.is_alphabetic()).count();
    (alpha * 100) / non_ws >= 60
}
/// Whether meta-annotation comments (TODO/FIXME/...) should be filtered out
/// of NL tokens. Truthy values of `CODELENS_EMBED_HINT_STRICT_COMMENTS`
/// ("1"/"true"/"yes"/"on", case-insensitive) enable it; default is off.
pub fn strict_comments_enabled() -> bool {
    match std::env::var("CODELENS_EMBED_HINT_STRICT_COMMENTS") {
        Ok(raw) => matches!(raw.to_ascii_lowercase().as_str(), "1" | "true" | "yes" | "on"),
        Err(_) => false,
    }
}
/// True when a comment body starts with a maintenance marker (TODO, FIXME,
/// HACK, ...), matched case-insensitively on the first ASCII-alphabetic word.
pub fn looks_like_meta_annotation(body: &str) -> bool {
    const MARKERS: &[&str] = &[
        "TODO", "FIXME", "HACK", "XXX", "BUG", "REVIEW", "REFACTOR", "TEMP", "TEMPORARY",
        "DEPRECATED",
    ];
    let trimmed = body.trim_start();
    // First word = the leading run of ASCII letters.
    let first_word: String = trimmed
        .chars()
        .take_while(|c| c.is_ascii_alphabetic())
        .collect();
    !first_word.is_empty() && MARKERS.contains(&first_word.to_ascii_uppercase().as_str())
}
/// Whether string literals that look like format strings or log messages
/// should be filtered out of NL tokens. Truthy values of
/// `CODELENS_EMBED_HINT_STRICT_LITERALS` enable it; default is off.
pub fn strict_literal_filter_enabled() -> bool {
    match std::env::var("CODELENS_EMBED_HINT_STRICT_LITERALS") {
        Ok(raw) => matches!(raw.to_ascii_lowercase().as_str(), "1" | "true" | "yes" | "on"),
        Err(_) => false,
    }
}
/// Heuristic: does `s` look like a format/template string? Detects printf
/// style `%s`/`%d`/... pairs and brace placeholders (`{}`, `{name}`, `{:>8}`).
pub fn contains_format_specifier(s: &str) -> bool {
    // printf-style: '%' immediately followed by a known conversion character.
    let is_conversion = |c: u8| matches!(c, b's' | b'd' | b'r' | b'f' | b'x' | b'o' | b'i' | b'u');
    if s.as_bytes()
        .windows(2)
        .any(|pair| pair[0] == b'%' && is_conversion(pair[1]))
    {
        return true;
    }
    // Brace-style: inspect the span after each '{' up to the next '}'.
    for segment in s.split('{').skip(1) {
        let Some(close) = segment.find('}') else {
            continue;
        };
        let inner = &segment[..close];
        if inner.is_empty() {
            return true; // bare `{}`
        }
        if inner.chars().any(char::is_whitespace) {
            continue; // whitespace inside braces looks like a code block
        }
        if inner.starts_with(':') {
            return true; // `{:spec}`
        }
        // `{ident}` or `{ident:spec}` with a simple dotted/alphanumeric name.
        let name = &inner[..inner.find(':').unwrap_or(inner.len())];
        if !name.is_empty()
            && name
                .chars()
                .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '.')
        {
            return true;
        }
    }
    false
}
/// Heuristic: does the (trimmed, lowercased) string start like an error or
/// log message? Used to keep diagnostics out of the embedding hints.
pub fn looks_like_error_or_log_prefix(s: &str) -> bool {
    let normalized = s.trim().to_lowercase();
    [
        "invalid ",
        "cannot ",
        "could not ",
        "unable to ",
        "failed to ",
        "expected ",
        "unexpected ",
        "missing ",
        "not found",
        "error: ",
        "error ",
        "warning: ",
        "warning ",
        "sending ",
        "received ",
        "starting ",
        "stopping ",
        "calling ",
        "connecting ",
        "disconnecting ",
    ]
    .into_iter()
    .any(|prefix| normalized.starts_with(prefix))
}
/// Test-only helper: a literal is rejected under strict filtering when it
/// looks like a format string or an error/log message.
#[cfg(test)]
pub fn should_reject_literal_strict(s: &str) -> bool {
    if contains_format_specifier(s) {
        return true;
    }
    looks_like_error_or_log_prefix(s)
}
/// Public entry point for NL-token extraction; returns `None` outright when
/// the feature is disabled.
pub fn extract_nl_tokens(source: &str, start: usize, end: usize) -> Option<String> {
    if nl_tokens_enabled() {
        extract_nl_tokens_inner(source, start, end)
    } else {
        None
    }
}
/// Collects natural-language material from a symbol body: comment lines that
/// read like prose plus double-quoted string literals that read like prose.
/// Strictness filters (meta annotations, format strings, log prefixes) are
/// applied according to the corresponding env switches.
pub fn extract_nl_tokens_inner(source: &str, start: usize, end: usize) -> Option<String> {
    if start >= source.len() || end > source.len() || start >= end {
        return None;
    }
    // Snap both offsets DOWN to char boundaries (stable replacement for the
    // unstable `str::floor_char_boundary`) so slicing cannot panic.
    let mut safe_start = start;
    while safe_start > 0 && !source.is_char_boundary(safe_start) {
        safe_start -= 1;
    }
    let mut safe_end = end.min(source.len());
    while safe_end > 0 && !source.is_char_boundary(safe_end) {
        safe_end -= 1;
    }
    let body = &source[safe_start..safe_end];
    let mut tokens: Vec<String> = Vec::new();
    // Pass 1: prose-shaped comment lines.
    let strict_comments = strict_comments_enabled();
    for line in body.lines() {
        let trimmed = line.trim();
        if let Some(cleaned) = extract_comment_body(trimmed)
            && is_nl_shaped(&cleaned)
            && (!strict_comments || !looks_like_meta_annotation(&cleaned))
        {
            tokens.push(cleaned);
        }
    }
    // Pass 2: prose-shaped "..." string literals. This is a lexer-free scan:
    // a quote inside a comment also toggles string state (accepted
    // inaccuracy of the heuristic).
    let strict_literals = strict_literal_filter_enabled();
    let mut chars = body.chars();
    let mut in_string = false;
    let mut current = String::new();
    while let Some(c) = chars.next() {
        if in_string {
            if c == '\\' {
                // Drop the escape pair entirely (matches prior behavior).
                let _ = chars.next();
            } else if c == '"' {
                if is_nl_shaped(&current)
                    && (!strict_literals
                        || (!contains_format_specifier(&current)
                            && !looks_like_error_or_log_prefix(&current)))
                {
                    tokens.push(current.clone());
                }
                current.clear();
                in_string = false;
            } else {
                current.push(c);
            }
        } else if c == '"' {
            in_string = true;
        }
    }
    if tokens.is_empty() {
        return None;
    }
    Some(join_hint_lines(&tokens))
}
pub fn api_calls_enabled() -> bool {
if let Some(explicit) = parse_bool_env("CODELENS_EMBED_HINT_INCLUDE_API_CALLS") {
return explicit;
}
auto_hint_should_enable()
}
/// Uppercase-initial identifiers are treated as type names (e.g. `Vec` in
/// `Vec::new`), i.e. candidates for static-call extraction.
pub fn is_static_method_ident(ident: &str) -> bool {
    matches!(ident.chars().next(), Some(first) if first.is_ascii_uppercase())
}
/// Public entry point for API-call extraction; returns `None` outright when
/// the feature is disabled.
pub fn extract_api_calls(source: &str, start: usize, end: usize) -> Option<String> {
    if api_calls_enabled() {
        extract_api_calls_inner(source, start, end)
    } else {
        None
    }
}
pub fn extract_api_calls_inner(source: &str, start: usize, end: usize) -> Option<String> {
if start >= source.len() || end > source.len() || start >= end {
return None;
}
let safe_start = if source.is_char_boundary(start) {
start
} else {
source.floor_char_boundary(start)
};
let safe_end = end.min(source.len());
let safe_end = if source.is_char_boundary(safe_end) {
safe_end
} else {
source.floor_char_boundary(safe_end)
};
if safe_start >= safe_end {
return None;
}
let body = &source[safe_start..safe_end];
let bytes = body.as_bytes();
let len = bytes.len();
let mut calls: Vec<String> = Vec::new();
let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
let mut i = 0usize;
while i < len {
let b = bytes[i];
if !(b == b'_' || b.is_ascii_alphabetic()) {
i += 1;
continue;
}
let ident_start = i;
while i < len {
let bb = bytes[i];
if bb == b'_' || bb.is_ascii_alphanumeric() {
i += 1;
} else {
break;
}
}
let ident_end = i;
if i + 1 >= len || bytes[i] != b':' || bytes[i + 1] != b':' {
continue;
}
let type_ident = &body[ident_start..ident_end];
if !is_static_method_ident(type_ident) {
i += 2;
continue;
}
let mut j = i + 2;
if j >= len || !(bytes[j] == b'_' || bytes[j].is_ascii_alphabetic()) {
i = j;
continue;
}
let method_start = j;
while j < len {
let bb = bytes[j];
if bb == b'_' || bb.is_ascii_alphanumeric() {
j += 1;
} else {
break;
}
}
let method_end = j;
let method_ident = &body[method_start..method_end];
let call = format!("{type_ident}::{method_ident}");
if seen.insert(call.clone()) {
calls.push(call);
}
i = j;
}
if calls.is_empty() {
return None;
}
Some(join_hint_lines(&calls))
}
/// Strips comment markers from a single (pre-trimmed) line and returns the
/// remaining text. Returns `None` for non-comment lines, Rust attributes
/// (`#[...]`/`#!`), and decoration-only or code-looking `*` lines.
pub fn extract_comment_body(trimmed: &str) -> Option<String> {
    if trimmed.is_empty() {
        return None;
    }
    // Line comments: doc (`///`, `//!`) before plain `//` — order matters.
    for prefix in ["///", "//!", "//"] {
        if let Some(rest) = trimmed.strip_prefix(prefix) {
            return Some(rest.trim().to_string());
        }
    }
    // Rust attributes are not comments.
    if trimmed.starts_with("#[") || trimmed.starts_with("#!") {
        return None;
    }
    // Shell/Python-style `#` comment.
    if let Some(rest) = trimmed.strip_prefix('#') {
        return Some(rest.trim().to_string());
    }
    // Block-comment openers, with any trailing `*/` removed.
    for prefix in ["/**", "/*"] {
        if let Some(rest) = trimmed.strip_prefix(prefix) {
            return Some(rest.trim_end_matches("*/").trim().to_string());
        }
    }
    // A `*` continuation line inside a block comment.
    let rest = trimmed.strip_prefix('*')?;
    let rest = rest.trim_end_matches("*/").trim();
    // Reject empty decoration and lines that look like code.
    if rest.is_empty() || rest.contains(';') || rest.contains('{') {
        return None;
    }
    Some(rest.to_string())
}
/// Extracts the documentation block that immediately follows the first line
/// (the signature) inside a symbol's byte range.
///
/// Supports Python `"""`/`'''` docstrings, Rust `///`/`//!` doc comments,
/// `/** ... */` blocks, and plain `//`/`#` comment runs. Returns `None` when
/// the range is invalid or no documentation is found.
pub fn extract_leading_doc(source: &str, start: usize, end: usize) -> Option<String> {
    if start >= source.len() || end > source.len() || start >= end {
        return None;
    }
    // Snap both offsets DOWN to char boundaries (stable replacement for the
    // unstable `str::floor_char_boundary`) so slicing cannot panic.
    let mut safe_start = start;
    while safe_start > 0 && !source.is_char_boundary(safe_start) {
        safe_start -= 1;
    }
    let mut safe_end = end.min(source.len());
    while safe_end > 0 && !source.is_char_boundary(safe_end) {
        safe_end -= 1;
    }
    if safe_start >= safe_end {
        return None;
    }
    let body = &source[safe_start..safe_end];
    // Skip the signature line itself; docs start on the next line.
    let lines: Vec<&str> = body.lines().skip(1).collect();
    if lines.is_empty() {
        return None;
    }
    let mut doc_lines = Vec::new();
    let first_trimmed = lines.first().map(|l| l.trim()).unwrap_or_default();
    if first_trimmed.starts_with("\"\"\"") || first_trimmed.starts_with("'''") {
        // Python docstring: collect until the closing triple quote.
        let quote = &first_trimmed[..3];
        for line in &lines {
            let t = line.trim();
            doc_lines.push(t.trim_start_matches(quote).trim_end_matches(quote));
            // Stop at a closing quote. The first line closes the docstring
            // only when it holds more than the opening quote itself — this
            // fixes one-line docstrings like `"""Doc."""`, which previously
            // leaked the following code lines into the result.
            if t.ends_with(quote) && (doc_lines.len() > 1 || t.len() > quote.len()) {
                break;
            }
        }
    } else if first_trimmed.starts_with("///") || first_trimmed.starts_with("//!") {
        // Rust doc comments: take the contiguous run.
        for line in &lines {
            let t = line.trim();
            if t.starts_with("///") || t.starts_with("//!") {
                doc_lines.push(t.trim_start_matches("///").trim_start_matches("//!").trim());
            } else {
                break;
            }
        }
    } else if first_trimmed.starts_with("/**") {
        // JSDoc/Javadoc block: strip decoration until the closing `*/`.
        for line in &lines {
            let t = line.trim();
            let cleaned = t
                .trim_start_matches("/**")
                .trim_start_matches('*')
                .trim_end_matches("*/")
                .trim();
            if !cleaned.is_empty() {
                doc_lines.push(cleaned);
            }
            if t.ends_with("*/") {
                break;
            }
        }
    } else {
        // Plain `//` or `#` comment run.
        for line in &lines {
            let t = line.trim();
            if t.starts_with("//") || t.starts_with('#') {
                doc_lines.push(t.trim_start_matches("//").trim_start_matches('#').trim());
            } else {
                break;
            }
        }
    }
    if doc_lines.is_empty() {
        return None;
    }
    Some(doc_lines.join(" ").trim().to_owned())
}