use super::types::SymbolInfo;
/// Reports whether `needle` occurs anywhere in `haystack`, ignoring ASCII case.
///
/// The comparison is done on raw bytes, so only ASCII letters are case-folded;
/// every other byte must match exactly. An empty `needle` always matches.
pub(crate) fn contains_ascii_ci(haystack: &str, needle: &str) -> bool {
    let (hay, pat) = (haystack.as_bytes(), needle.as_bytes());
    match pat.len() {
        // `windows(0)` would panic, and an empty pattern is contained in anything.
        0 => true,
        // A pattern longer than the haystack can never fit.
        len if len > hay.len() => false,
        len => hay
            .windows(len)
            .any(|candidate| candidate.eq_ignore_ascii_case(pat)),
    }
}
/// Returns `true` when `a` and `b` are equal after folding ASCII letter case.
///
/// Non-ASCII bytes are compared verbatim.
fn eq_ascii_ci(a: &str, b: &str) -> bool {
    let (left, right) = (a.as_bytes(), b.as_bytes());
    left.eq_ignore_ascii_case(right)
}
/// Reports whether any query token is exactly one of the known action verbs.
///
/// Matching is exact and case-sensitive; the scoring code passes tokens taken
/// from the already-lowercased query.
fn query_has_action_verb(tokens: &[&str]) -> bool {
    const ACTION_VERBS: &[&str] = &[
        "find", "get", "search", "detect", "start", "run", "read", "write", "move", "change",
        "rename", "replace", "extract", "route", "embed", "build", "create", "delete", "update",
        "compute", "calculate", "apply", "handle", "parse", "index", "watch", "listen", "fetch",
        "send", "load", "save", "open", "close", "connect", "check", "validate", "verify",
        "transform", "convert", "process", "execute", "call", "invoke", "inline", "refactor",
        "analyze", "import", "export",
    ];
    ACTION_VERBS.iter().any(|verb| tokens.contains(verb))
}
/// Scores `symbol` against `query`, deriving the lowercase and snake-joined
/// query forms that [`score_symbol_with_lower`] expects.
///
/// The snake-joined form is the lowercased query with every whitespace
/// character and every `-` replaced by `_`.
pub(crate) fn score_symbol(query: &str, symbol: &SymbolInfo) -> Option<i32> {
    let query_lower = query.to_lowercase();
    let joined_snake: String = query_lower
        .chars()
        .map(|ch| if ch == '-' || ch.is_whitespace() { '_' } else { ch })
        .collect();
    score_symbol_with_lower(query, &query_lower, &joined_snake, symbol)
}
/// Scores how well `symbol` matches a query, or returns `None` for no match.
///
/// `query_lower` and `joined_snake` are caller-precomputed forms of `query`
/// (see [`score_symbol`]). The checks run in a fixed order and the first hit
/// decides the score:
///
/// 1. name equals the query (ASCII case-insensitive): 100
/// 2. name contains the lowercased query: 60
/// 3. signature contains it: 30
/// 4. name path contains it: 20
/// 5. name equals the snake-joined query: 80
/// 6. snake-joined query contains the name, and the name has a `_`: 70
/// 7. name contains the snake-joined query, and that query has a `_`: 65
///
/// Because the order is fixed, the substring tiers (2-4) win over the snake
/// tiers (5-7) whenever both would apply.
///
/// If none of those hit, the query is split into tokens of at least two bytes
/// and the score is derived from the fraction of tokens found in the name,
/// signature, or file path, plus a small boost depending on the symbol kind.
pub(crate) fn score_symbol_with_lower(
    query: &str,
    query_lower: &str,
    joined_snake: &str,
    symbol: &SymbolInfo,
) -> Option<i32> {
    let name: &str = &symbol.name;
    let signature: &str = &symbol.signature;

    // Whole-query tiers; the order here defines precedence.
    let whole_query_tier = if name.eq_ignore_ascii_case(query) {
        Some(100)
    } else if contains_ascii_ci(name, query_lower) {
        Some(60)
    } else if contains_ascii_ci(signature, query_lower) {
        Some(30)
    } else if contains_ascii_ci(&symbol.name_path, query_lower) {
        Some(20)
    } else if eq_ascii_ci(name, joined_snake) {
        Some(80)
    } else if name.contains('_') && contains_ascii_ci(joined_snake, name) {
        Some(70)
    } else if joined_snake.contains('_') && contains_ascii_ci(name, joined_snake) {
        Some(65)
    } else {
        None
    };
    if whole_query_tier.is_some() {
        return whole_query_tier;
    }

    // Token fallback: split on whitespace, `_` and `-`, dropping 0/1-byte pieces.
    let tokens: Vec<&str> = query_lower
        .split(|c: char| matches!(c, '_' | '-') || c.is_whitespace())
        .filter(|piece| piece.len() >= 2)
        .collect();
    if tokens.is_empty() {
        return None;
    }

    let count_hits = |field: &str| -> i32 {
        tokens
            .iter()
            .filter(|&&tok| contains_ascii_ci(field, tok))
            .count() as i32
    };
    let name_hits = count_hits(name);
    let sig_hits = count_hits(signature);
    let path_hits = count_hits(&symbol.file_path);

    let total_tokens = tokens.len() as i32;
    let ratio = |hits: i32| hits as f64 / total_tokens as f64;

    // Name hits dominate, then signature hits, then file-path-only hits.
    let base_score = match (name_hits, sig_hits, path_hits) {
        (0, 0, 0) => return None,
        (0, 0, in_path) => (1.0 + ratio(in_path) * 4.0).max(1.0) as i32,
        (0, in_sig, _) => (5.0 + ratio(in_sig) * 20.0) as i32,
        (in_name, in_sig, _) => {
            let base = (15.0 + ratio(in_name) * 40.0) as i32;
            let sig_bonus = (ratio(in_sig) * 5.0) as i32;
            // Capped so the token tier stays below the substring tier (60).
            (base + sig_bonus).min(55)
        }
    };

    // Action-verb queries favour callables; other queries favour type-like symbols.
    let wants_callable = query_has_action_verb(&tokens);
    let kind_boost = match symbol.kind {
        super::types::SymbolKind::Function | super::types::SymbolKind::Method
            if wants_callable =>
        {
            8
        }
        super::types::SymbolKind::Class
        | super::types::SymbolKind::Interface
        | super::types::SymbolKind::Enum
            if !wants_callable =>
        {
            5
        }
        _ => 0,
    };

    Some(base_score + kind_boost)
}
/// Decides whether sparse term weighting is active.
///
/// When `CODELENS_RANK_SPARSE_TERM_WEIGHT` is set to a readable value, that
/// value alone decides: `1`, `true`, `yes` or `on` (trimmed, ASCII
/// case-insensitive) enable it and anything else disables it. Otherwise the
/// decision is delegated to `crate::embedding::auto_sparse_should_enable()`
/// when the `semantic` feature is compiled in, and is `false` when it is not.
pub fn sparse_weighting_enabled() -> bool {
    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];
    match std::env::var("CODELENS_RANK_SPARSE_TERM_WEIGHT") {
        Ok(raw) => {
            let flag = raw.trim().to_ascii_lowercase();
            TRUTHY.contains(&flag.as_str())
        }
        Err(_) => {
            #[cfg(feature = "semantic")]
            {
                crate::embedding::auto_sparse_should_enable()
            }
            #[cfg(not(feature = "semantic"))]
            {
                false
            }
        }
    }
}
/// Maximum bonus that sparse coverage may contribute to a score.
///
/// Read from `CODELENS_RANK_SPARSE_MAX` as an unsigned integer and clamped to
/// `5..=50`. Surrounding whitespace is ignored, matching how
/// [`sparse_weighting_enabled`] reads its variable. A missing or unparsable
/// value yields the default of `20`.
pub fn sparse_max_bonus() -> f64 {
    let points = std::env::var("CODELENS_RANK_SPARSE_MAX")
        .ok()
        // Trim first: `" 30 "` should mean 30, not silently fall back to the default.
        .and_then(|raw| raw.trim().parse::<u32>().ok())
        .map_or(20, |n| n.clamp(5, 50));
    f64::from(points)
}
/// Minimum token coverage (as a fraction) required before any sparse bonus is given.
///
/// Read from `CODELENS_RANK_SPARSE_THRESHOLD` as an integer percentage,
/// clamped to `10..=90`, and returned divided by 100. Surrounding whitespace
/// is ignored, matching how [`sparse_weighting_enabled`] reads its variable.
/// A missing or unparsable value yields the default of `60` (that is, `0.6`).
pub fn sparse_threshold() -> f64 {
    let percent = std::env::var("CODELENS_RANK_SPARSE_THRESHOLD")
        .ok()
        // Trim first: `"75\n"` should mean 75, not silently fall back to the default.
        .and_then(|raw| raw.trim().parse::<u32>().ok())
        .map_or(60, |n| n.clamp(10, 90));
    f64::from(percent) / 100.0
}
/// Lowercase filler words that [`sparse_query_tokens`] discards from a query.
///
/// Entries are compared by exact string equality, so only these precise forms
/// are dropped (for example `"gets"` and `"sets"` are listed, `"get"` is not).
const SPARSE_STOPWORDS: &[&str] = &[
"the", "for", "with", "from", "that", "this", "into", "onto", "over", "not", "and", "any",
"all", "are", "was", "were", "has", "have", "had", "how", "what", "when", "where", "which",
"who", "why", "but", "its", "can", "use", "using", "used", "gets", "set", "sets", "new", "let",
];
/// Reports whether `token` occurs in `corpus` as a whole word.
///
/// An occurrence counts when the byte before it and the byte after it are
/// each either absent (start/end of `corpus`) or not a word byte according to
/// [`is_word_byte`]. The match itself is exact and case-sensitive.
///
/// Returns `false` for an empty `token` or a `token` longer than `corpus`.
pub fn has_whole_word(corpus: &str, token: &str) -> bool {
    if token.is_empty() || corpus.len() < token.len() {
        return false;
    }
    let bytes = corpus.as_bytes();
    // Invariant: `start` is always a char boundary of `corpus`, so slicing is safe.
    let mut start = 0;
    while let Some(offset) = corpus[start..].find(token) {
        let abs = start + offset;
        let end = abs + token.len();
        let before_ok = abs == 0 || !is_word_byte(bytes[abs - 1]);
        let after_ok = bytes.get(end).map_or(true, |&next| !is_word_byte(next));
        if before_ok && after_ok {
            return true;
        }
        // Resume one *character* past the rejected match. Advancing by a single
        // byte would land inside a multi-byte UTF-8 character and make the next
        // slice panic.
        let step = corpus[abs..].chars().next().map_or(1, char::len_utf8);
        start = abs + step;
    }
    false
}

/// Classifies a byte as part of a word: ASCII letters and digits, plus every
/// byte of a non-ASCII (multi-byte UTF-8) character.
///
/// `_` and all other ASCII punctuation and whitespace are not word bytes, so
/// they act as word boundaries.
fn is_word_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || !b.is_ascii()
}
/// Extracts the distinct, meaningful tokens of an already-lowercased query.
///
/// The query is split on every non-alphanumeric character. Pieces shorter
/// than three bytes and pieces listed in `SPARSE_STOPWORDS` are dropped, and
/// duplicates are removed while keeping first-occurrence order.
pub fn sparse_query_tokens(query_lower: &str) -> Vec<String> {
    let mut already_seen: std::collections::HashSet<&str> = std::collections::HashSet::new();
    query_lower
        .split(|c: char| !c.is_alphanumeric())
        .filter(|piece| piece.len() >= 3 && !SPARSE_STOPWORDS.contains(piece))
        // `insert` returns false for a repeat, which filters the duplicate out.
        .filter(|piece| already_seen.insert(*piece))
        .map(|piece| piece.to_string())
        .collect()
}
/// Computes the sparse coverage bonus for a symbol described by its text fields.
///
/// The query is tokenised with [`sparse_query_tokens`]; queries yielding fewer
/// than two tokens get no bonus. Coverage is the fraction of tokens that occur
/// as a whole word (see [`has_whole_word`]) in the ASCII-lowercased
/// concatenation of `name`, `name_path`, `signature` and `file_path`.
///
/// Coverage below [`sparse_threshold`] yields `0.0`. Otherwise the bonus grows
/// linearly from `0.0` at the threshold to [`sparse_max_bonus`] at full coverage.
pub fn sparse_coverage_bonus_from_fields(
    query_lower: &str,
    name: &str,
    name_path: &str,
    signature: &str,
    file_path: &str,
) -> f64 {
    let tokens = sparse_query_tokens(query_lower);
    if tokens.len() < 2 {
        return 0.0;
    }

    // Build one lowercase haystack; fields are space-separated so a token
    // cannot match across a field boundary.
    let fields = [name, name_path, signature, file_path];
    let capacity = fields.iter().map(|field| field.len()).sum::<usize>() + 3;
    let mut haystack = String::with_capacity(capacity);
    for field in fields {
        if !haystack.is_empty() {
            haystack.push(' ');
        }
        haystack.extend(field.chars().map(|ch| ch.to_ascii_lowercase()));
    }

    let hits = tokens
        .iter()
        .filter(|tok| has_whole_word(&haystack, tok))
        .count();
    let coverage = hits as f64 / tokens.len() as f64;

    let threshold = sparse_threshold();
    if coverage < threshold {
        return 0.0;
    }
    // Floor the span so the division below can never be by zero.
    let span = (1.0 - threshold).max(0.01);
    (coverage - threshold) / span * sparse_max_bonus()
}
/// Test-only convenience: computes the sparse coverage bonus straight from a
/// [`SymbolInfo`] by forwarding its text fields to
/// [`sparse_coverage_bonus_from_fields`].
#[cfg(test)]
pub(crate) fn sparse_coverage_bonus(query_lower: &str, symbol: &SymbolInfo) -> f64 {
    let SymbolInfo {
        name,
        name_path,
        signature,
        file_path,
        ..
    } = symbol;
    sparse_coverage_bonus_from_fields(query_lower, name, name_path, signature, file_path)
}
#[cfg(test)]
mod tests {
    use super::super::types::{SymbolInfo, SymbolKind, SymbolProvenance};
    use super::*;
    use std::sync::{Mutex, MutexGuard};

    /// Serializes the tests in this module that mutate process environment variables.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    const EXPLICIT_VAR: &str = "CODELENS_RANK_SPARSE_TERM_WEIGHT";
    const AUTO_VAR: &str = "CODELENS_EMBED_HINT_AUTO";
    const AUTO_LANG_VAR: &str = "CODELENS_EMBED_HINT_AUTO_LANG";

    /// Holds `ENV_LOCK` and a snapshot of the sparse-gate variables.
    ///
    /// The snapshot is written back on drop, so the environment is restored
    /// even when an assertion in the test body panics.
    struct GateEnv {
        saved: [(&'static str, Option<String>); 3],
        // Declared after `saved` and released only after `Drop::drop` has
        // finished restoring, so the restore happens under the lock.
        _lock: MutexGuard<'static, ()>,
    }

    impl GateEnv {
        /// Takes the lock (recovering from poisoning) and records current values.
        fn acquire() -> Self {
            let lock = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
            let saved =
                [EXPLICIT_VAR, AUTO_VAR, AUTO_LANG_VAR].map(|key| (key, std::env::var(key).ok()));
            Self { saved, _lock: lock }
        }

        /// Sets (`Some`) or removes (`None`) each of the three gate variables.
        fn set(&self, explicit: Option<&str>, auto: Option<&str>, lang: Option<&str>) {
            for (key, value) in [
                (EXPLICIT_VAR, explicit),
                (AUTO_VAR, auto),
                (AUTO_LANG_VAR, lang),
            ] {
                Self::apply(key, value);
            }
        }

        fn apply(key: &str, value: Option<&str>) {
            // SAFETY: only reached through a live `GateEnv`, which holds
            // `ENV_LOCK`, so tests in this module never mutate the environment
            // concurrently with each other. Threads outside this module that
            // touch the environment are not covered by this lock.
            unsafe {
                match value {
                    Some(v) => std::env::set_var(key, v),
                    None => std::env::remove_var(key),
                }
            }
        }
    }

    impl Drop for GateEnv {
        fn drop(&mut self) {
            for (key, value) in &self.saved {
                Self::apply(key, value.as_deref());
            }
        }
    }

    /// Builds a minimal function symbol located in `test.rs`.
    fn mk_symbol(name: &str, signature: &str) -> SymbolInfo {
        SymbolInfo {
            name: name.to_string(),
            kind: SymbolKind::Function,
            file_path: "test.rs".into(),
            line: 1,
            column: 0,
            signature: signature.to_string(),
            name_path: name.to_string(),
            id: format!("test.rs#function:{name}"),
            body: None,
            children: Vec::new(),
            start_byte: 0,
            end_byte: 0,
            provenance: SymbolProvenance::default(),
        }
    }

    #[test]
    fn sparse_weighting_gated_off_by_default() {
        let env = GateEnv::acquire();
        env.set(None, None, None);
        assert!(!sparse_weighting_enabled(), "sparse weighting gate leaked");
    }

    #[test]
    fn sparse_weighting_auto_gate_disables_for_js_ts_but_explicit_env_still_wins() {
        let env = GateEnv::acquire();

        env.set(None, Some("1"), Some("rust"));
        assert!(
            sparse_weighting_enabled(),
            "auto+rust should enable sparse weighting"
        );

        env.set(None, Some("1"), Some("typescript"));
        assert!(
            !sparse_weighting_enabled(),
            "auto+typescript should disable sparse weighting after Phase 2m split"
        );

        env.set(Some("1"), Some("1"), Some("typescript"));
        assert!(
            sparse_weighting_enabled(),
            "explicit sparse=1 must still win over JS/TS auto-off"
        );

        env.set(Some("0"), Some("1"), Some("rust"));
        assert!(
            !sparse_weighting_enabled(),
            "explicit sparse=0 must still win over rust auto-on"
        );
    }

    #[test]
    fn sparse_query_tokens_drops_stopwords_and_short_tokens() {
        let tokens = sparse_query_tokens("find the function that opens a file");
        assert_eq!(tokens, vec!["find", "function", "opens", "file"]);
    }

    #[test]
    fn sparse_query_tokens_deduplicates() {
        let tokens = sparse_query_tokens("parse json parse xml parse");
        assert_eq!(tokens, vec!["parse", "json", "xml"]);
    }

    #[test]
    fn has_whole_word_respects_word_boundaries() {
        assert!(has_whole_word("parse_json_body", "parse"));
        assert!(!has_whole_word("parser", "parse"));
        assert!(!has_whole_word("parserequest", "parse"));
        assert!(has_whole_word("parse the file", "parse"));
        assert!(has_whole_word("open file", "file"));
        assert!(!has_whole_word("xyz", ""));
        assert!(!has_whole_word("ab", "abc"));
    }

    #[test]
    fn sparse_coverage_bonus_zero_for_single_token_query() {
        let sym = mk_symbol("parse_json", "fn parse_json(input: &str) -> Value");
        let bonus = sparse_coverage_bonus("parse", &sym);
        assert_eq!(bonus, 0.0);
    }

    #[test]
    fn sparse_coverage_bonus_zero_below_threshold() {
        let sym = mk_symbol("parse_json", "fn parse_json(input: &str) -> Value");
        let bonus = sparse_coverage_bonus("parse rename", &sym);
        assert_eq!(bonus, 0.0);
    }

    #[test]
    fn sparse_coverage_bonus_full_match_reaches_max() {
        let sym = mk_symbol(
            "parse_json_body",
            "fn parse_json_body(input: &str) -> Value",
        );
        let bonus = sparse_coverage_bonus("parse json body", &sym);
        assert!((bonus - 20.0).abs() < 0.01, "expected ~20, got {bonus}");
    }

    #[test]
    fn sparse_coverage_bonus_ignores_whole_word_false_positives() {
        let sym = mk_symbol("parser", "fn parser(input: &str) -> Json");
        let bonus = sparse_coverage_bonus("parse json", &sym);
        assert_eq!(bonus, 0.0);
    }
}