1use super::types::SymbolInfo;
2
/// Returns `true` when `needle` occurs somewhere in `haystack`, comparing
/// bytes with ASCII case folding only (non-ASCII bytes must match exactly).
///
/// An empty `needle` always matches; a `needle` longer than `haystack`
/// never does.
pub(crate) fn contains_ascii_ci(haystack: &str, needle: &str) -> bool {
    let (hay, pat) = (haystack.as_bytes(), needle.as_bytes());
    match pat.len() {
        // `windows(0)` would panic, so the empty needle is answered directly.
        0 => true,
        len if len > hay.len() => false,
        len => hay
            .windows(len)
            .any(|candidate| candidate.eq_ignore_ascii_case(pat)),
    }
}
25
/// Whole-string equality with ASCII-only case folding.
fn eq_ascii_ci(a: &str, b: &str) -> bool {
    let (left, right) = (a.as_bytes(), b.as_bytes());
    left.eq_ignore_ascii_case(right)
}
30
/// Returns `true` when at least one token is exactly one of the known
/// action verbs. The comparison is case-sensitive, so callers are expected
/// to pass already-lowercased tokens.
fn query_has_action_verb(tokens: &[&str]) -> bool {
    tokens.iter().any(|&token| {
        matches!(
            token,
            "find"
                | "get"
                | "search"
                | "detect"
                | "start"
                | "run"
                | "read"
                | "write"
                | "move"
                | "change"
                | "rename"
                | "replace"
                | "extract"
                | "route"
                | "embed"
                | "build"
                | "create"
                | "delete"
                | "update"
                | "compute"
                | "calculate"
                | "apply"
                | "handle"
                | "parse"
                | "index"
                | "watch"
                | "listen"
                | "fetch"
                | "send"
                | "load"
                | "save"
                | "open"
                | "close"
                | "connect"
                | "check"
                | "validate"
                | "verify"
                | "transform"
                | "convert"
                | "process"
                | "execute"
                | "call"
                | "invoke"
                | "inline"
                | "refactor"
                | "analyze"
                | "import"
                | "export"
        )
    })
}
85
86pub(crate) fn score_symbol(query: &str, symbol: &SymbolInfo) -> Option<i32> {
92 let lower = query.to_lowercase();
93 let snake = lower.replace(|c: char| c.is_whitespace() || c == '-', "_");
94 score_symbol_with_lower(query, &lower, &snake, symbol)
95}
96
97pub(crate) fn score_symbol_with_lower(
107 query: &str,
108 query_lower: &str,
109 joined_snake: &str,
110 symbol: &SymbolInfo,
111) -> Option<i32> {
112 if symbol.name.eq_ignore_ascii_case(query) {
114 return Some(100);
115 }
116
117 if contains_ascii_ci(&symbol.name, query_lower) {
123 return Some(60);
124 }
125 if contains_ascii_ci(&symbol.signature, query_lower) {
126 return Some(30);
127 }
128 if contains_ascii_ci(&symbol.name_path, query_lower) {
129 return Some(20);
130 }
131
132 if eq_ascii_ci(&symbol.name, joined_snake) {
137 return Some(80);
138 }
139 if contains_ascii_ci(joined_snake, &symbol.name) && symbol.name.contains('_') {
142 return Some(70);
143 }
144 if contains_ascii_ci(&symbol.name, joined_snake) && joined_snake.contains('_') {
147 return Some(65);
148 }
149
150 let tokens: Vec<&str> = query_lower
152 .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
153 .filter(|t| t.len() >= 2)
154 .collect();
155 if tokens.is_empty() {
156 return None;
157 }
158
159 let mut name_hits = 0i32;
167 let mut sig_hits = 0i32;
168 let mut path_hits = 0i32;
169 for token in &tokens {
170 if contains_ascii_ci(&symbol.name, token) {
171 name_hits += 1;
172 }
173 if contains_ascii_ci(&symbol.signature, token) {
174 sig_hits += 1;
175 }
176 if contains_ascii_ci(&symbol.file_path, token) {
177 path_hits += 1;
178 }
179 }
180
181 let total_tokens = tokens.len() as i32;
182 if name_hits == 0 && sig_hits == 0 && path_hits == 0 {
183 return None;
184 }
185
186 let name_ratio = name_hits as f64 / total_tokens as f64;
190 let sig_ratio = sig_hits as f64 / total_tokens as f64;
191
192 let base_score = if name_hits > 0 {
193 let base = (15.0 + name_ratio * 40.0) as i32;
194 let sig_bonus = (sig_ratio * 5.0) as i32;
195 (base + sig_bonus).min(55)
196 } else if sig_hits > 0 {
197 (5.0 + sig_ratio * 20.0) as i32
198 } else {
199 let path_ratio = path_hits as f64 / total_tokens as f64;
201 (1.0 + path_ratio * 4.0).max(1.0) as i32
202 };
203
204 let kind_boost = if query_has_action_verb(&tokens) {
207 match symbol.kind {
208 super::types::SymbolKind::Function | super::types::SymbolKind::Method => 8,
209 _ => 0,
210 }
211 } else {
212 match symbol.kind {
213 super::types::SymbolKind::Class
214 | super::types::SymbolKind::Interface
215 | super::types::SymbolKind::Enum => 5,
216 _ => 0,
217 }
218 };
219
220 Some(base_score + kind_boost)
221}
222
/// Reports whether sparse term weighting is switched on.
///
/// When `CODELENS_RANK_SPARSE_TERM_WEIGHT` is set (and valid Unicode) it
/// decides alone: after trimming, the values `1`, `true`, `yes` and `on`
/// (any ASCII case) enable the feature and anything else disables it.
/// Otherwise the answer comes from
/// `crate::embedding::auto_sparse_should_enable()` when the `semantic`
/// feature is compiled in, and is `false` when it is not.
pub fn sparse_weighting_enabled() -> bool {
    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];

    if let Ok(raw) = std::env::var("CODELENS_RANK_SPARSE_TERM_WEIGHT") {
        let flag = raw.trim();
        return TRUTHY.iter().any(|on| flag.eq_ignore_ascii_case(on));
    }
    #[cfg(feature = "semantic")]
    {
        crate::embedding::auto_sparse_should_enable()
    }
    #[cfg(not(feature = "semantic"))]
    {
        false
    }
}
256
/// Maximum bonus (in score points) that sparse coverage may add.
///
/// Read from `CODELENS_RANK_SPARSE_MAX` as an unsigned integer and clamped
/// to `5..=50`. Surrounding whitespace is ignored, matching how
/// [`sparse_weighting_enabled`] treats its variable. Falls back to `20`
/// when the variable is unset or does not parse.
pub fn sparse_max_bonus() -> f64 {
    let points = std::env::var("CODELENS_RANK_SPARSE_MAX")
        .ok()
        // Trim first: `u32::from_str` rejects padded input such as " 30".
        .and_then(|raw| raw.trim().parse::<u32>().ok())
        .map_or(20, |n| n.clamp(5, 50));
    f64::from(points)
}
274
/// Minimum token coverage (as a fraction in `0.10..=0.90`) a symbol needs
/// before any sparse bonus is awarded.
///
/// Read from `CODELENS_RANK_SPARSE_THRESHOLD` as an integer percentage and
/// clamped to `10..=90`. Surrounding whitespace is ignored, matching how
/// [`sparse_weighting_enabled`] treats its variable. Falls back to `60`
/// (i.e. `0.60`) when the variable is unset or does not parse.
pub fn sparse_threshold() -> f64 {
    let percent = std::env::var("CODELENS_RANK_SPARSE_THRESHOLD")
        .ok()
        // Trim first: `u32::from_str` rejects padded input such as " 75".
        .and_then(|raw| raw.trim().parse::<u32>().ok())
        .map_or(60, |n| n.clamp(10, 90));
    f64::from(percent) / 100.0
}
293
/// Lowercase words that `sparse_query_tokens` discards from a query before
/// coverage is measured. Matching is exact, so inflected forms that should
/// be ignored are listed individually.
const SPARSE_STOPWORDS: &[&str] = &[
    "the", "for", "with", "from", "that", "this",
    "into", "onto", "over", "not", "and", "any",
    "all", "are", "was", "were", "has", "have",
    "had", "how", "what", "when", "where", "which",
    "who", "why", "but", "its", "can", "use",
    "using", "used", "gets", "set", "sets", "new",
    "let",
];
304
305pub fn has_whole_word(corpus: &str, token: &str) -> bool {
313 if token.is_empty() || corpus.len() < token.len() {
314 return false;
315 }
316 let corpus_bytes = corpus.as_bytes();
317 let token_bytes = token.as_bytes();
318 let mut start = 0;
319 while start + token_bytes.len() <= corpus_bytes.len() {
320 let remaining = &corpus[start..];
322 let Some(local_idx) = remaining.find(token) else {
323 return false;
324 };
325 let abs = start + local_idx;
326 let end = abs + token_bytes.len();
327 let before_ok = abs == 0 || !is_word_byte(corpus_bytes[abs - 1]);
328 let after_ok = end == corpus_bytes.len() || !is_word_byte(corpus_bytes[end]);
329 if before_ok && after_ok {
330 return true;
331 }
332 start = abs + 1;
333 }
334 false
335}
336
/// Classifies a byte as part of a word: ASCII letters and digits, plus any
/// byte with the high bit set (i.e. every byte of a non-ASCII UTF-8
/// character). Everything else — including `_` — separates words.
fn is_word_byte(b: u8) -> bool {
    !b.is_ascii() || b.is_ascii_alphanumeric()
}
346
347pub fn sparse_query_tokens(query_lower: &str) -> Vec<String> {
358 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
359 let mut out: Vec<String> = Vec::new();
360 for raw in query_lower.split(|c: char| !c.is_alphanumeric()) {
361 if raw.len() < 3 {
362 continue;
363 }
364 if SPARSE_STOPWORDS.contains(&raw) {
365 continue;
366 }
367 if seen.insert(raw.to_string()) {
368 out.push(raw.to_string());
369 }
370 }
371 out
372}
373
374pub fn sparse_coverage_bonus_from_fields(
398 query_lower: &str,
399 name: &str,
400 name_path: &str,
401 signature: &str,
402 file_path: &str,
403) -> f64 {
404 let tokens = sparse_query_tokens(query_lower);
405 if tokens.len() < 2 {
406 return 0.0;
407 }
408 let cap = name.len() + name_path.len() + signature.len() + file_path.len() + 3;
412 let mut corpus_lower = String::with_capacity(cap);
413 for field in [name, name_path, signature, file_path] {
414 if !corpus_lower.is_empty() {
415 corpus_lower.push(' ');
416 }
417 for ch in field.chars() {
418 corpus_lower.push(ch.to_ascii_lowercase());
419 }
420 }
421
422 let matched = tokens
423 .iter()
424 .filter(|t| has_whole_word(&corpus_lower, t))
425 .count() as f64;
426 let total = tokens.len() as f64;
427 let coverage = matched / total;
428
429 let threshold = sparse_threshold();
430 if coverage < threshold {
431 return 0.0;
432 }
433 let span = (1.0 - threshold).max(0.01);
436 (coverage - threshold) / span * sparse_max_bonus()
437}
438
/// Test-only convenience wrapper: feeds the text fields of `symbol` to
/// [`sparse_coverage_bonus_from_fields`].
#[cfg(test)]
pub(crate) fn sparse_coverage_bonus(query_lower: &str, symbol: &SymbolInfo) -> f64 {
    let SymbolInfo {
        name,
        name_path,
        signature,
        file_path,
        ..
    } = symbol;
    sparse_coverage_bonus_from_fields(query_lower, name, name_path, signature, file_path)
}
451
#[cfg(test)]
mod tests {
    use super::super::types::{SymbolInfo, SymbolKind, SymbolProvenance};
    use super::*;
    use std::sync::Mutex;

    /// Serializes the tests in this module that mutate process environment
    /// variables.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    const SPARSE_VAR: &str = "CODELENS_RANK_SPARSE_TERM_WEIGHT";
    const AUTO_VAR: &str = "CODELENS_EMBED_HINT_AUTO";
    const AUTO_LANG_VAR: &str = "CODELENS_EMBED_HINT_AUTO_LANG";

    /// Sets `key` to `value`, or removes it when `value` is `None`.
    fn write_env(key: &str, value: Option<&str>) {
        // SAFETY: callers hold `ENV_LOCK`, so env-mutating tests in this
        // module never run concurrently with each other. Tests elsewhere
        // that touch the environment are not covered by this lock.
        unsafe {
            match value {
                Some(v) => std::env::set_var(key, v),
                None => std::env::remove_var(key),
            }
        }
    }

    /// Puts the three gate-related variables into the given state.
    fn set_gate_env(explicit: Option<&str>, auto: Option<&str>, lang: Option<&str>) {
        write_env(SPARSE_VAR, explicit);
        write_env(AUTO_VAR, auto);
        write_env(AUTO_LANG_VAR, lang);
    }

    /// Remembers the gate-related variables at construction and restores
    /// them on drop, so a failing assertion cannot leak test values into
    /// later tests.
    struct GateEnvSnapshot {
        saved: [(&'static str, Option<String>); 3],
    }

    impl GateEnvSnapshot {
        fn capture() -> Self {
            let read = |key: &'static str| (key, std::env::var(key).ok());
            Self {
                saved: [read(SPARSE_VAR), read(AUTO_VAR), read(AUTO_LANG_VAR)],
            }
        }
    }

    impl Drop for GateEnvSnapshot {
        fn drop(&mut self) {
            for (key, value) in &self.saved {
                write_env(key, value.as_deref());
            }
        }
    }

    fn mk_symbol(name: &str, signature: &str) -> SymbolInfo {
        SymbolInfo {
            name: name.to_string(),
            kind: SymbolKind::Function,
            file_path: "test.rs".into(),
            line: 1,
            column: 0,
            signature: signature.to_string(),
            name_path: name.to_string(),
            id: format!("test.rs#function:{name}"),
            body: None,
            children: Vec::new(),
            start_byte: 0,
            end_byte: 0,
            provenance: SymbolProvenance::default(),
            end_line: 0,
        }
    }

    #[test]
    fn sparse_weighting_gated_off_by_default() {
        // Declared before the snapshot so the snapshot is dropped (and the
        // environment restored) while the lock is still held.
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let _env_restore = GateEnvSnapshot::capture();

        set_gate_env(None, None, None);
        let enabled = sparse_weighting_enabled();
        assert!(!enabled, "sparse weighting gate leaked");
    }

    #[test]
    fn sparse_weighting_auto_gate_disables_for_js_ts_but_explicit_env_still_wins() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let _env_restore = GateEnvSnapshot::capture();

        set_gate_env(None, Some("1"), Some("rust"));
        assert!(
            sparse_weighting_enabled(),
            "auto+rust should enable sparse weighting"
        );

        set_gate_env(None, Some("1"), Some("typescript"));
        assert!(
            !sparse_weighting_enabled(),
            "auto+typescript should disable sparse weighting after Phase 2m split"
        );

        set_gate_env(Some("1"), Some("1"), Some("typescript"));
        assert!(
            sparse_weighting_enabled(),
            "explicit sparse=1 must still win over JS/TS auto-off"
        );

        set_gate_env(Some("0"), Some("1"), Some("rust"));
        assert!(
            !sparse_weighting_enabled(),
            "explicit sparse=0 must still win over rust auto-on"
        );
    }

    #[test]
    fn sparse_query_tokens_drops_stopwords_and_short_tokens() {
        let tokens = sparse_query_tokens("find the function that opens a file");
        // "the" and "that" are stopwords; "a" is shorter than three bytes.
        assert_eq!(tokens, vec!["find", "function", "opens", "file"]);
    }

    #[test]
    fn sparse_query_tokens_deduplicates() {
        let tokens = sparse_query_tokens("parse json parse xml parse");
        assert_eq!(tokens, vec!["parse", "json", "xml"]);
    }

    #[test]
    fn has_whole_word_respects_word_boundaries() {
        // `_` is a boundary, so snake_case parts count as words.
        assert!(has_whole_word("parse_json_body", "parse"));
        // A trailing letter means the token is only a prefix.
        assert!(!has_whole_word("parser", "parse"));
        assert!(!has_whole_word("parserequest", "parse"));
        assert!(has_whole_word("parse the file", "parse"));
        assert!(has_whole_word("open file", "file"));
        // Degenerate inputs never match.
        assert!(!has_whole_word("xyz", ""));
        assert!(!has_whole_word("ab", "abc"));
    }

    #[test]
    fn sparse_coverage_bonus_zero_for_single_token_query() {
        let sym = mk_symbol("parse_json", "fn parse_json(input: &str) -> Value");
        let bonus = sparse_coverage_bonus("parse", &sym);
        assert_eq!(bonus, 0.0);
    }

    #[test]
    fn sparse_coverage_bonus_zero_below_threshold() {
        let sym = mk_symbol("parse_json", "fn parse_json(input: &str) -> Value");
        // One of two tokens matches: coverage 0.5.
        let bonus = sparse_coverage_bonus("parse rename", &sym);
        assert_eq!(bonus, 0.0);
    }

    #[test]
    fn sparse_coverage_bonus_full_match_reaches_max() {
        let sym = mk_symbol(
            "parse_json_body",
            "fn parse_json_body(input: &str) -> Value",
        );
        let bonus = sparse_coverage_bonus("parse json body", &sym);
        assert!((bonus - 20.0).abs() < 0.01, "expected ~20, got {bonus}");
    }

    #[test]
    fn sparse_coverage_bonus_ignores_whole_word_false_positives() {
        let sym = mk_symbol("parser", "fn parser(input: &str) -> Json");
        // "parse" is only a prefix of "parser", so just "json" matches.
        let bonus = sparse_coverage_bonus("parse json", &sym);
        assert_eq!(bonus, 0.0);
    }
}