1use super::types::SymbolInfo;
2
/// Reports whether `needle` occurs anywhere in `haystack`, comparing bytes
/// with ASCII case folding only (non-ASCII bytes must match exactly).
///
/// An empty `needle` always matches; a `needle` longer than `haystack`
/// never does. No allocation is performed.
pub(crate) fn contains_ascii_ci(haystack: &str, needle: &str) -> bool {
    let (hay, pat) = (haystack.as_bytes(), needle.as_bytes());
    match pat.len() {
        // The empty pattern is contained in every string, including "".
        0 => true,
        // A pattern longer than the text cannot fit in any window.
        len if len > hay.len() => false,
        // Slide a pattern-sized window over the text and compare case-insensitively.
        len => hay
            .windows(len)
            .any(|chunk| chunk.eq_ignore_ascii_case(pat)),
    }
}
25
/// Returns `true` when `a` and `b` have the same length and are equal byte
/// for byte once ASCII letters are case-folded.
fn eq_ascii_ci(a: &str, b: &str) -> bool {
    let (lhs, rhs) = (a.as_bytes(), b.as_bytes());
    lhs.len() == rhs.len()
        && lhs
            .iter()
            .zip(rhs)
            .all(|(x, y)| x.eq_ignore_ascii_case(y))
}
30
/// Returns `true` if at least one of `tokens` is exactly one of the known
/// action verbs listed below.
///
/// The comparison is exact and case-sensitive; `score_symbol_with_lower`
/// passes tokens split from the lowered query.
fn query_has_action_verb(tokens: &[&str]) -> bool {
    /// Exact-match membership test against the action-verb vocabulary.
    fn is_action_verb(token: &str) -> bool {
        matches!(
            token,
            "find"
                | "get"
                | "search"
                | "detect"
                | "start"
                | "run"
                | "read"
                | "write"
                | "move"
                | "change"
                | "rename"
                | "replace"
                | "extract"
                | "route"
                | "embed"
                | "build"
                | "create"
                | "delete"
                | "update"
                | "compute"
                | "calculate"
                | "apply"
                | "handle"
                | "parse"
                | "index"
                | "watch"
                | "listen"
                | "fetch"
                | "send"
                | "load"
                | "save"
                | "open"
                | "close"
                | "connect"
                | "check"
                | "validate"
                | "verify"
                | "transform"
                | "convert"
                | "process"
                | "execute"
                | "call"
                | "invoke"
                | "inline"
                | "refactor"
                | "analyze"
                | "import"
                | "export"
        )
    }

    tokens.iter().copied().any(is_action_verb)
}
85
86pub(crate) fn score_symbol(query: &str, symbol: &SymbolInfo) -> Option<i32> {
92 let lower = query.to_lowercase();
93 let snake = lower.replace(|c: char| c.is_whitespace() || c == '-', "_");
94 score_symbol_with_lower(query, &lower, &snake, symbol)
95}
96
97pub(crate) fn score_symbol_with_lower(
107 query: &str,
108 query_lower: &str,
109 joined_snake: &str,
110 symbol: &SymbolInfo,
111) -> Option<i32> {
112 if symbol.name.eq_ignore_ascii_case(query) {
114 return Some(100);
115 }
116
117 if contains_ascii_ci(&symbol.name, query_lower) {
123 return Some(60);
124 }
125 if contains_ascii_ci(&symbol.signature, query_lower) {
126 return Some(30);
127 }
128 if contains_ascii_ci(&symbol.name_path, query_lower) {
129 return Some(20);
130 }
131
132 if eq_ascii_ci(&symbol.name, joined_snake) {
137 return Some(80);
138 }
139 if contains_ascii_ci(joined_snake, &symbol.name) && symbol.name.contains('_') {
142 return Some(70);
143 }
144 if contains_ascii_ci(&symbol.name, joined_snake) && joined_snake.contains('_') {
147 return Some(65);
148 }
149
150 let tokens: Vec<&str> = query_lower
152 .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
153 .filter(|t| t.len() >= 2)
154 .collect();
155 if tokens.is_empty() {
156 return None;
157 }
158
159 let mut name_hits = 0i32;
167 let mut sig_hits = 0i32;
168 let mut path_hits = 0i32;
169 for token in &tokens {
170 if contains_ascii_ci(&symbol.name, token) {
171 name_hits += 1;
172 }
173 if contains_ascii_ci(&symbol.signature, token) {
174 sig_hits += 1;
175 }
176 if contains_ascii_ci(&symbol.file_path, token) {
177 path_hits += 1;
178 }
179 }
180
181 let total_tokens = tokens.len() as i32;
182 if name_hits == 0 && sig_hits == 0 && path_hits == 0 {
183 return None;
184 }
185
186 let name_ratio = name_hits as f64 / total_tokens as f64;
190 let sig_ratio = sig_hits as f64 / total_tokens as f64;
191
192 let base_score = if name_hits > 0 {
193 let base = (15.0 + name_ratio * 40.0) as i32;
194 let sig_bonus = (sig_ratio * 5.0) as i32;
195 (base + sig_bonus).min(55)
196 } else if sig_hits > 0 {
197 (5.0 + sig_ratio * 20.0) as i32
198 } else {
199 let path_ratio = path_hits as f64 / total_tokens as f64;
201 (1.0 + path_ratio * 4.0).max(1.0) as i32
202 };
203
204 let kind_boost = if query_has_action_verb(&tokens) {
207 match symbol.kind {
208 super::types::SymbolKind::Function | super::types::SymbolKind::Method => 8,
209 _ => 0,
210 }
211 } else {
212 match symbol.kind {
213 super::types::SymbolKind::Class
214 | super::types::SymbolKind::Interface
215 | super::types::SymbolKind::Enum => 5,
216 _ => 0,
217 }
218 };
219
220 Some(base_score + kind_boost)
221}
222
223pub fn sparse_weighting_enabled() -> bool {
239 if let Ok(raw) = std::env::var("CODELENS_RANK_SPARSE_TERM_WEIGHT") {
240 let lowered = raw.trim().to_ascii_lowercase();
241 return matches!(lowered.as_str(), "1" | "true" | "yes" | "on");
242 }
243 crate::embedding::auto_sparse_should_enable()
244}
245
/// Returns the maximum sparse-coverage bonus.
///
/// Reads `CODELENS_RANK_SPARSE_MAX` as an unsigned integer and clamps it to
/// `5..=50`. Surrounding whitespace in the value is ignored, matching how
/// [`sparse_weighting_enabled`] treats its variable. If the variable is
/// unset, unreadable, or not a valid `u32`, the default of `20` is used.
pub fn sparse_max_bonus() -> f64 {
    const DEFAULT_MAX: u32 = 20;
    let configured = std::env::var("CODELENS_RANK_SPARSE_MAX")
        .ok()
        // Trim first so values like " 30 " are honoured instead of silently
        // falling back to the default.
        .and_then(|raw| raw.trim().parse::<u32>().ok())
        .map_or(DEFAULT_MAX, |n| n.clamp(5, 50));
    f64::from(configured)
}
263
/// Returns the coverage fraction a query must reach before any sparse bonus
/// is awarded.
///
/// Reads `CODELENS_RANK_SPARSE_THRESHOLD` as an unsigned integer percentage,
/// clamps it to `10..=90`, and divides by 100. Surrounding whitespace in the
/// value is ignored, matching how [`sparse_weighting_enabled`] treats its
/// variable. If the variable is unset, unreadable, or not a valid `u32`, the
/// default of `60` (i.e. `0.60`) is used.
pub fn sparse_threshold() -> f64 {
    const DEFAULT_PERCENT: u32 = 60;
    let percent = std::env::var("CODELENS_RANK_SPARSE_THRESHOLD")
        .ok()
        // Trim first so values like " 75 " are honoured instead of silently
        // falling back to the default.
        .and_then(|raw| raw.trim().parse::<u32>().ok())
        .map_or(DEFAULT_PERCENT, |n| n.clamp(10, 90));
    f64::from(percent) / 100.0
}
282
/// Lowercase words that [`sparse_query_tokens`] drops from a query before
/// coverage is measured. Matching is exact, so only the forms listed here
/// are filtered.
const SPARSE_STOPWORDS: &[&str] = &[
    "the", "for", "with", "from", "that", "this",
    "into", "onto", "over", "not", "and", "any",
    "all", "are", "was", "were", "has", "have",
    "had", "how", "what", "when", "where", "which",
    "who", "why", "but", "its", "can", "use",
    "using", "used", "gets", "set", "sets", "new",
    "let",
];
293
/// Reports whether `token` occurs in `corpus` as a whole word.
///
/// An occurrence counts only when the byte immediately before it and the
/// byte immediately after it are both non-word bytes (see [`is_word_byte`])
/// or lie outside the string. Matching is exact and case-sensitive.
/// Overlapping occurrences are all considered. Returns `false` for an empty
/// `token` or a `token` longer than `corpus`.
pub fn has_whole_word(corpus: &str, token: &str) -> bool {
    if token.is_empty() || corpus.len() < token.len() {
        return false;
    }
    let corpus_bytes = corpus.as_bytes();
    let token_len = token.len();
    // `start` is always kept on a UTF-8 character boundary of `corpus`.
    let mut start = 0;
    while start + token_len <= corpus_bytes.len() {
        let Some(local_idx) = corpus[start..].find(token) else {
            return false;
        };
        let abs = start + local_idx;
        let end = abs + token_len;
        let before_ok = abs == 0 || !is_word_byte(corpus_bytes[abs - 1]);
        let after_ok = end == corpus_bytes.len() || !is_word_byte(corpus_bytes[end]);
        if before_ok && after_ok {
            return true;
        }
        // Resume one full character past the rejected match. Stepping a
        // single byte could land inside a multi-byte character and make the
        // next `corpus[start..]` slice panic. No occurrence is skipped,
        // because a match can only begin on a character boundary.
        let step = corpus[abs..].chars().next().map_or(1, char::len_utf8);
        start = abs + step;
    }
    false
}

/// Classifies a byte as part of a word: ASCII letters and digits, plus any
/// byte with the high bit set (i.e. every byte of a non-ASCII character).
/// Everything else, including `_`, acts as a word boundary.
fn is_word_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || (b & 0x80) != 0
}
335
336pub fn sparse_query_tokens(query_lower: &str) -> Vec<String> {
347 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
348 let mut out: Vec<String> = Vec::new();
349 for raw in query_lower.split(|c: char| !c.is_alphanumeric()) {
350 if raw.len() < 3 {
351 continue;
352 }
353 if SPARSE_STOPWORDS.contains(&raw) {
354 continue;
355 }
356 if seen.insert(raw.to_string()) {
357 out.push(raw.to_string());
358 }
359 }
360 out
361}
362
363pub fn sparse_coverage_bonus_from_fields(
387 query_lower: &str,
388 name: &str,
389 name_path: &str,
390 signature: &str,
391 file_path: &str,
392) -> f64 {
393 let tokens = sparse_query_tokens(query_lower);
394 if tokens.len() < 2 {
395 return 0.0;
396 }
397 let cap = name.len() + name_path.len() + signature.len() + file_path.len() + 3;
401 let mut corpus_lower = String::with_capacity(cap);
402 for field in [name, name_path, signature, file_path] {
403 if !corpus_lower.is_empty() {
404 corpus_lower.push(' ');
405 }
406 for ch in field.chars() {
407 corpus_lower.push(ch.to_ascii_lowercase());
408 }
409 }
410
411 let matched = tokens
412 .iter()
413 .filter(|t| has_whole_word(&corpus_lower, t))
414 .count() as f64;
415 let total = tokens.len() as f64;
416 let coverage = matched / total;
417
418 let threshold = sparse_threshold();
419 if coverage < threshold {
420 return 0.0;
421 }
422 let span = (1.0 - threshold).max(0.01);
425 (coverage - threshold) / span * sparse_max_bonus()
426}
427
/// Test-only convenience wrapper: forwards the symbol's `name`, `name_path`,
/// `signature` and `file_path` to [`sparse_coverage_bonus_from_fields`].
#[cfg(test)]
pub(crate) fn sparse_coverage_bonus(query_lower: &str, symbol: &SymbolInfo) -> f64 {
    let SymbolInfo {
        name,
        name_path,
        signature,
        file_path,
        ..
    } = symbol;
    sparse_coverage_bonus_from_fields(query_lower, name, name_path, signature, file_path)
}
440
#[cfg(test)]
mod tests {
    use super::super::types::{SymbolInfo, SymbolKind, SymbolProvenance};
    use super::*;
    use std::ffi::OsString;
    use std::sync::Mutex;

    /// Serialises the tests in this module that mutate process environment
    /// variables.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    /// Environment variables that influence `sparse_weighting_enabled`.
    const GATE_ENV_KEYS: [&str; 3] = [
        "CODELENS_RANK_SPARSE_TERM_WEIGHT",
        "CODELENS_EMBED_HINT_AUTO",
        "CODELENS_EMBED_HINT_AUTO_LANG",
    ];

    /// Remembers the current values of a set of environment variables and
    /// puts them back when dropped, so a failing assertion cannot leave the
    /// process environment modified.
    struct EnvSnapshot {
        saved: Vec<(&'static str, Option<OsString>)>,
    }

    impl EnvSnapshot {
        /// Records the current value (or absence) of every key in `keys`.
        fn capture(keys: &[&'static str]) -> Self {
            Self {
                saved: keys
                    .iter()
                    .map(|&key| (key, std::env::var_os(key)))
                    .collect(),
            }
        }
    }

    impl Drop for EnvSnapshot {
        fn drop(&mut self) {
            for (key, value) in &self.saved {
                // SAFETY: snapshots are created after `ENV_LOCK` is taken and
                // dropped before it is released, so no other test in this
                // module touches the environment concurrently.
                unsafe {
                    match value {
                        Some(previous) => std::env::set_var(*key, previous),
                        None => std::env::remove_var(*key),
                    }
                }
            }
        }
    }

    /// Sets the gate variables for one scenario. `explicit == None` removes
    /// the explicit override; the auto hint is always switched on.
    fn configure_gate(explicit: Option<&str>, auto_lang: &str) {
        // SAFETY: callers hold `ENV_LOCK` for the duration of the test.
        unsafe {
            match explicit {
                Some(value) => std::env::set_var("CODELENS_RANK_SPARSE_TERM_WEIGHT", value),
                None => std::env::remove_var("CODELENS_RANK_SPARSE_TERM_WEIGHT"),
            }
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", auto_lang);
        }
    }

    /// Builds a minimal function symbol in `test.rs` with the given name and
    /// signature; the name doubles as the name path.
    fn mk_symbol(name: &str, signature: &str) -> SymbolInfo {
        SymbolInfo {
            name: name.to_string(),
            kind: SymbolKind::Function,
            file_path: "test.rs".into(),
            line: 1,
            column: 0,
            signature: signature.to_string(),
            name_path: name.to_string(),
            id: format!("test.rs#function:{name}"),
            body: None,
            children: Vec::new(),
            start_byte: 0,
            end_byte: 0,
            provenance: SymbolProvenance::default(),
        }
    }

    #[test]
    fn sparse_weighting_gated_off_by_default() {
        // Declared in this order so the snapshot is restored before the lock
        // is released (locals drop in reverse declaration order).
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let _restore = EnvSnapshot::capture(&GATE_ENV_KEYS);

        // SAFETY: `ENV_LOCK` is held for the duration of the test.
        unsafe {
            for key in GATE_ENV_KEYS {
                std::env::remove_var(key);
            }
        }
        assert!(!sparse_weighting_enabled(), "sparse weighting gate leaked");
    }

    #[test]
    fn sparse_weighting_auto_gate_disables_for_js_ts_but_explicit_env_still_wins() {
        // Declared in this order so the snapshot is restored before the lock
        // is released, even when an assertion below panics.
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let _restore = EnvSnapshot::capture(&GATE_ENV_KEYS);

        configure_gate(None, "rust");
        assert!(
            sparse_weighting_enabled(),
            "auto+rust should enable sparse weighting"
        );

        configure_gate(None, "typescript");
        assert!(
            !sparse_weighting_enabled(),
            "auto+typescript should disable sparse weighting after Phase 2m split"
        );

        configure_gate(Some("1"), "typescript");
        assert!(
            sparse_weighting_enabled(),
            "explicit sparse=1 must still win over JS/TS auto-off"
        );

        configure_gate(Some("0"), "rust");
        assert!(
            !sparse_weighting_enabled(),
            "explicit sparse=0 must still win over rust auto-on"
        );
    }

    #[test]
    fn sparse_query_tokens_drops_stopwords_and_short_tokens() {
        let tokens = sparse_query_tokens("find the function that opens a file");
        assert_eq!(tokens, vec!["find", "function", "opens", "file"]);
    }

    #[test]
    fn sparse_query_tokens_deduplicates() {
        let tokens = sparse_query_tokens("parse json parse xml parse");
        assert_eq!(tokens, vec!["parse", "json", "xml"]);
    }

    #[test]
    fn has_whole_word_respects_word_boundaries() {
        assert!(has_whole_word("parse_json_body", "parse"));
        assert!(!has_whole_word("parser", "parse"));
        assert!(!has_whole_word("parserequest", "parse"));
        assert!(has_whole_word("parse the file", "parse"));
        assert!(has_whole_word("open file", "file"));
        assert!(!has_whole_word("xyz", ""));
        assert!(!has_whole_word("ab", "abc"));
    }

    #[test]
    fn sparse_coverage_bonus_zero_for_single_token_query() {
        let sym = mk_symbol("parse_json", "fn parse_json(input: &str) -> Value");
        let bonus = sparse_coverage_bonus("parse", &sym);
        assert_eq!(bonus, 0.0);
    }

    #[test]
    fn sparse_coverage_bonus_zero_below_threshold() {
        let sym = mk_symbol("parse_json", "fn parse_json(input: &str) -> Value");
        let bonus = sparse_coverage_bonus("parse rename", &sym);
        assert_eq!(bonus, 0.0);
    }

    #[test]
    fn sparse_coverage_bonus_full_match_reaches_max() {
        let sym = mk_symbol(
            "parse_json_body",
            "fn parse_json_body(input: &str) -> Value",
        );
        let bonus = sparse_coverage_bonus("parse json body", &sym);
        assert!((bonus - 20.0).abs() < 0.01, "expected ~20, got {bonus}");
    }

    #[test]
    fn sparse_coverage_bonus_ignores_whole_word_false_positives() {
        let sym = mk_symbol("parser", "fn parser(input: &str) -> Json");
        let bonus = sparse_coverage_bonus("parse json", &sym);
        assert_eq!(bonus, 0.0);
    }
}