1use super::types::SymbolInfo;
2
/// Reports whether `needle` occurs anywhere in `haystack`, comparing bytes
/// without regard to ASCII letter case.
///
/// An empty `needle` always matches. Non-ASCII bytes must match exactly;
/// no Unicode case folding is applied.
pub(crate) fn contains_ascii_ci(haystack: &str, needle: &str) -> bool {
    let (hay, pat) = (haystack.as_bytes(), needle.as_bytes());
    match pat.len() {
        // `windows(0)` would panic, so the empty pattern is answered up front.
        0 => true,
        width if width > hay.len() => false,
        width => hay
            .windows(width)
            .any(|chunk| chunk.eq_ignore_ascii_case(pat)),
    }
}
25
/// Returns `true` when `a` and `b` are the same length and equal byte-for-byte
/// once ASCII letter case is ignored.
fn eq_ascii_ci(a: &str, b: &str) -> bool {
    a.as_bytes().eq_ignore_ascii_case(b.as_bytes())
}
30
/// Returns `true` if any token is exactly one of the recognised action verbs.
///
/// The comparison is case-sensitive and matches whole tokens only, so
/// `"Find"` and `"finder"` do not count.
fn query_has_action_verb(tokens: &[&str]) -> bool {
    tokens.iter().any(|&word| {
        matches!(
            word,
            "find"
                | "get"
                | "search"
                | "detect"
                | "start"
                | "run"
                | "read"
                | "write"
                | "move"
                | "change"
                | "rename"
                | "replace"
                | "extract"
                | "route"
                | "embed"
                | "build"
                | "create"
                | "delete"
                | "update"
                | "compute"
                | "calculate"
                | "apply"
                | "handle"
                | "parse"
                | "index"
                | "watch"
                | "listen"
                | "fetch"
                | "send"
                | "load"
                | "save"
                | "open"
                | "close"
                | "connect"
                | "check"
                | "validate"
                | "verify"
                | "transform"
                | "convert"
                | "process"
                | "execute"
                | "call"
                | "invoke"
                | "inline"
                | "refactor"
                | "analyze"
                | "import"
                | "export"
        )
    })
}
85
86pub(crate) fn score_symbol(query: &str, symbol: &SymbolInfo) -> Option<i32> {
92 let lower = query.to_lowercase();
93 let snake = lower.replace(|c: char| c.is_whitespace() || c == '-', "_");
94 score_symbol_with_lower(query, &lower, &snake, symbol)
95}
96
97pub(crate) fn score_symbol_with_lower(
107 query: &str,
108 query_lower: &str,
109 joined_snake: &str,
110 symbol: &SymbolInfo,
111) -> Option<i32> {
112 if symbol.name.eq_ignore_ascii_case(query) {
114 return Some(100);
115 }
116
117 if contains_ascii_ci(&symbol.name, query_lower) {
123 return Some(60);
124 }
125 if contains_ascii_ci(&symbol.signature, query_lower) {
126 return Some(30);
127 }
128 if contains_ascii_ci(&symbol.name_path, query_lower) {
129 return Some(20);
130 }
131
132 if eq_ascii_ci(&symbol.name, joined_snake) {
137 return Some(80);
138 }
139 if contains_ascii_ci(joined_snake, &symbol.name) && symbol.name.contains('_') {
142 return Some(70);
143 }
144 if contains_ascii_ci(&symbol.name, joined_snake) && joined_snake.contains('_') {
147 return Some(65);
148 }
149
150 let tokens: Vec<&str> = query_lower
152 .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
153 .filter(|t| t.len() >= 2)
154 .collect();
155 if tokens.is_empty() {
156 return None;
157 }
158
159 let mut name_hits = 0i32;
167 let mut sig_hits = 0i32;
168 let mut path_hits = 0i32;
169 for token in &tokens {
170 if contains_ascii_ci(&symbol.name, token) {
171 name_hits += 1;
172 }
173 if contains_ascii_ci(&symbol.signature, token) {
174 sig_hits += 1;
175 }
176 if contains_ascii_ci(&symbol.file_path, token) {
177 path_hits += 1;
178 }
179 }
180
181 let total_tokens = tokens.len() as i32;
182 if name_hits == 0 && sig_hits == 0 && path_hits == 0 {
183 return None;
184 }
185
186 let name_ratio = name_hits as f64 / total_tokens as f64;
190 let sig_ratio = sig_hits as f64 / total_tokens as f64;
191
192 let base_score = if name_hits > 0 {
193 let base = (15.0 + name_ratio * 40.0) as i32;
194 let sig_bonus = (sig_ratio * 5.0) as i32;
195 (base + sig_bonus).min(55)
196 } else if sig_hits > 0 {
197 (5.0 + sig_ratio * 20.0) as i32
198 } else {
199 let path_ratio = path_hits as f64 / total_tokens as f64;
201 (1.0 + path_ratio * 4.0).max(1.0) as i32
202 };
203
204 let kind_boost = if query_has_action_verb(&tokens) {
207 match symbol.kind {
208 super::types::SymbolKind::Function | super::types::SymbolKind::Method => 8,
209 _ => 0,
210 }
211 } else {
212 match symbol.kind {
213 super::types::SymbolKind::Class
214 | super::types::SymbolKind::Interface
215 | super::types::SymbolKind::Enum => 5,
216 _ => 0,
217 }
218 };
219
220 Some(base_score + kind_boost)
221}
222
/// Reports whether sparse term weighting should be applied.
///
/// When `CODELENS_RANK_SPARSE_TERM_WEIGHT` is set (and valid Unicode) it is
/// authoritative: after trimming, `1`, `true`, `yes` and `on` (ASCII
/// case-insensitive) enable the feature and any other value disables it.
/// When the variable is unset, builds with the `semantic` feature defer to
/// `crate::embedding::auto_sparse_should_enable()`; other builds return
/// `false`.
pub fn sparse_weighting_enabled() -> bool {
    match std::env::var("CODELENS_RANK_SPARSE_TERM_WEIGHT") {
        Ok(raw) => {
            let flag = raw.trim();
            ["1", "true", "yes", "on"]
                .iter()
                .any(|truthy| flag.eq_ignore_ascii_case(truthy))
        }
        Err(_) => {
            #[cfg(feature = "semantic")]
            {
                crate::embedding::auto_sparse_should_enable()
            }
            #[cfg(not(feature = "semantic"))]
            {
                false
            }
        }
    }
}
256
/// Returns the multiplier that `sparse_coverage_bonus_from_fields` applies to
/// its normalised coverage, i.e. the bonus awarded at full coverage.
///
/// Reads `CODELENS_RANK_SPARSE_MAX` as an unsigned integer and clamps it to
/// `5..=50`. Surrounding whitespace is ignored, matching how
/// `CODELENS_RANK_SPARSE_TERM_WEIGHT` is read. When the variable is unset or
/// does not parse, the default of 20 is used.
pub fn sparse_max_bonus() -> f64 {
    let bonus = std::env::var("CODELENS_RANK_SPARSE_MAX")
        .ok()
        // Trim first: `u32::from_str` rejects padded input, which would
        // otherwise silently discard the configured value.
        .and_then(|raw| raw.trim().parse::<u32>().ok())
        .map_or(20, |n| n.clamp(5, 50));
    f64::from(bonus)
}
274
/// Returns the coverage fraction below which
/// `sparse_coverage_bonus_from_fields` awards no bonus.
///
/// Reads `CODELENS_RANK_SPARSE_THRESHOLD` as an integer percentage, clamps it
/// to `10..=90` and converts it to a fraction in `0.10..=0.90`. Surrounding
/// whitespace is ignored, matching how `CODELENS_RANK_SPARSE_TERM_WEIGHT` is
/// read. When the variable is unset or does not parse, the default of 60
/// percent (`0.60`) is used.
pub fn sparse_threshold() -> f64 {
    let percent = std::env::var("CODELENS_RANK_SPARSE_THRESHOLD")
        .ok()
        // Trim first: `u32::from_str` rejects padded input, which would
        // otherwise silently discard the configured value.
        .and_then(|raw| raw.trim().parse::<u32>().ok())
        .map_or(60, |n| n.clamp(10, 90));
    f64::from(percent) / 100.0
}
293
/// Words that `sparse_query_tokens` discards when tokenising a query.
///
/// Lookup is an exact, case-sensitive comparison against each query token.
/// Every entry is at least three bytes long; shorter tokens are already
/// rejected by the length check in `sparse_query_tokens` before this list is
/// consulted.
const SPARSE_STOPWORDS: &[&str] = &[
    "the", "for", "with", "from", "that", "this", "into", "onto", "over", "not", "and", "any",
    "all", "are", "was", "were", "has", "have", "had", "how", "what", "when", "where", "which",
    "who", "why", "but", "its", "can", "use", "using", "used", "gets", "set", "sets", "new", "let",
];
304
/// Reports whether `token` occurs in `corpus` as a whole word.
///
/// An occurrence counts when the byte immediately before it (if any) and the
/// byte immediately after it (if any) are both non-word bytes as defined by
/// [`is_word_byte`]. Matching is exact (case-sensitive). An empty `token`
/// never matches.
///
/// Every occurrence is examined, including overlapping ones, until one
/// satisfies both boundaries.
pub fn has_whole_word(corpus: &str, token: &str) -> bool {
    let Some(first_char) = token.chars().next() else {
        return false;
    };
    // After a rejected match the search resumes just past the first character
    // of that match. Advancing by the character's UTF-8 width (rather than by
    // one byte) keeps `start` on a char boundary, so slicing cannot panic when
    // the token begins with a multi-byte character.
    let step = first_char.len_utf8();
    let corpus_bytes = corpus.as_bytes();
    let mut start = 0;
    while let Some(local_idx) = corpus.get(start..).and_then(|rest| rest.find(token)) {
        let abs = start + local_idx;
        let end = abs + token.len();
        let before_ok = abs == 0 || !is_word_byte(corpus_bytes[abs - 1]);
        let after_ok = end == corpus_bytes.len() || !is_word_byte(corpus_bytes[end]);
        if before_ok && after_ok {
            return true;
        }
        start = abs + step;
    }
    false
}

/// Classifies a byte as part of a word: ASCII letters and digits, plus every
/// byte with the high bit set (i.e. any byte of a non-ASCII UTF-8 sequence).
///
/// `_` and other ASCII punctuation are not word bytes, so `parse` is a whole
/// word inside `parse_json`.
fn is_word_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || (b & 0x80) != 0
}
346
347pub fn sparse_query_tokens(query_lower: &str) -> Vec<String> {
358 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
359 let mut out: Vec<String> = Vec::new();
360 for raw in query_lower.split(|c: char| !c.is_alphanumeric()) {
361 if raw.len() < 3 {
362 continue;
363 }
364 if SPARSE_STOPWORDS.contains(&raw) {
365 continue;
366 }
367 if seen.insert(raw.to_string()) {
368 out.push(raw.to_string());
369 }
370 }
371 out
372}
373
374pub fn sparse_coverage_bonus_from_fields(
398 query_lower: &str,
399 name: &str,
400 name_path: &str,
401 signature: &str,
402 file_path: &str,
403) -> f64 {
404 let tokens = sparse_query_tokens(query_lower);
405 if tokens.len() < 2 {
406 return 0.0;
407 }
408 let cap = name.len() + name_path.len() + signature.len() + file_path.len() + 3;
412 let mut corpus_lower = String::with_capacity(cap);
413 for field in [name, name_path, signature, file_path] {
414 if !corpus_lower.is_empty() {
415 corpus_lower.push(' ');
416 }
417 for ch in field.chars() {
418 corpus_lower.push(ch.to_ascii_lowercase());
419 }
420 }
421
422 let matched = tokens
423 .iter()
424 .filter(|t| has_whole_word(&corpus_lower, t))
425 .count() as f64;
426 let total = tokens.len() as f64;
427 let coverage = matched / total;
428
429 let threshold = sparse_threshold();
430 if coverage < threshold {
431 return 0.0;
432 }
433 let span = (1.0 - threshold).max(0.01);
436 (coverage - threshold) / span * sparse_max_bonus()
437}
438
/// Test-only convenience wrapper: feeds a symbol's name, name path, signature
/// and file path to `sparse_coverage_bonus_from_fields`.
#[cfg(test)]
pub(crate) fn sparse_coverage_bonus(query_lower: &str, symbol: &SymbolInfo) -> f64 {
    let name: &str = &symbol.name;
    let name_path: &str = &symbol.name_path;
    let signature: &str = &symbol.signature;
    let file_path: &str = &symbol.file_path;
    sparse_coverage_bonus_from_fields(query_lower, name, name_path, signature, file_path)
}
451
#[cfg(test)]
mod tests {
    use super::super::types::{SymbolInfo, SymbolKind, SymbolProvenance};
    use super::*;
    use std::sync::{Mutex, MutexGuard};

    /// Serialises the tests in this module that mutate process environment
    /// variables.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    const EXPLICIT_VAR: &str = "CODELENS_RANK_SPARSE_TERM_WEIGHT";
    const AUTO_VAR: &str = "CODELENS_EMBED_HINT_AUTO";
    const AUTO_LANG_VAR: &str = "CODELENS_EMBED_HINT_AUTO_LANG";

    /// Sets `key` to `value`, or removes it when `value` is `None`.
    fn write_env(key: &str, value: Option<&str>) {
        // SAFETY: only called while a `GateEnv` holds `ENV_LOCK`, so the env
        // tests in this module never mutate the environment concurrently.
        // NOTE(review): code outside this module that touches the environment
        // from other test threads is not covered by this lock.
        unsafe {
            match value {
                Some(v) => std::env::set_var(key, v),
                None => std::env::remove_var(key),
            }
        }
    }

    /// Holds `ENV_LOCK` and a snapshot of the gate variables, restoring them
    /// on drop. Restoration therefore also happens when an assertion panics,
    /// so a failing test cannot leak its environment into later tests.
    struct GateEnv {
        saved: [(&'static str, Option<String>); 3],
        // Declared last: fields drop after `Drop::drop` runs, so the lock is
        // released only once the environment has been restored.
        _lock: MutexGuard<'static, ()>,
    }

    impl GateEnv {
        /// Takes the lock (recovering from poisoning) and snapshots the
        /// current values of the three gate variables.
        fn acquire() -> Self {
            let lock = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
            let saved =
                [EXPLICIT_VAR, AUTO_VAR, AUTO_LANG_VAR].map(|key| (key, std::env::var(key).ok()));
            Self { saved, _lock: lock }
        }

        /// Sets or clears the explicit, auto and auto-language variables.
        fn set(&self, explicit: Option<&str>, auto: Option<&str>, lang: Option<&str>) {
            for (key, value) in [
                (EXPLICIT_VAR, explicit),
                (AUTO_VAR, auto),
                (AUTO_LANG_VAR, lang),
            ] {
                write_env(key, value);
            }
        }
    }

    impl Drop for GateEnv {
        fn drop(&mut self) {
            for (key, value) in &self.saved {
                write_env(key, value.as_deref());
            }
        }
    }

    /// Builds a `Function` symbol in `test.rs` with the given name and
    /// signature; the name path equals the name.
    fn mk_symbol(name: &str, signature: &str) -> SymbolInfo {
        SymbolInfo {
            name: name.to_string(),
            kind: SymbolKind::Function,
            file_path: "test.rs".into(),
            line: 1,
            column: 0,
            signature: signature.to_string(),
            name_path: name.to_string(),
            id: format!("test.rs#function:{name}"),
            body: None,
            children: Vec::new(),
            start_byte: 0,
            end_byte: 0,
            provenance: SymbolProvenance::default(),
        }
    }

    #[test]
    fn sparse_weighting_gated_off_by_default() {
        let env = GateEnv::acquire();
        env.set(None, None, None);
        assert!(!sparse_weighting_enabled(), "sparse weighting gate leaked");
    }

    #[test]
    fn sparse_weighting_auto_gate_disables_for_js_ts_but_explicit_env_still_wins() {
        let env = GateEnv::acquire();

        env.set(None, Some("1"), Some("rust"));
        // Without the `semantic` feature there is no auto gate and the
        // function always reports `false` when the explicit variable is unset.
        assert_eq!(
            sparse_weighting_enabled(),
            cfg!(feature = "semantic"),
            "auto+rust should enable sparse weighting exactly when the semantic feature is built"
        );

        env.set(None, Some("1"), Some("typescript"));
        assert!(
            !sparse_weighting_enabled(),
            "auto+typescript should disable sparse weighting after Phase 2m split"
        );

        env.set(Some("1"), Some("1"), Some("typescript"));
        assert!(
            sparse_weighting_enabled(),
            "explicit sparse=1 must still win over JS/TS auto-off"
        );

        env.set(Some("0"), Some("1"), Some("rust"));
        assert!(
            !sparse_weighting_enabled(),
            "explicit sparse=0 must still win over rust auto-on"
        );
    }

    #[test]
    fn sparse_query_tokens_drops_stopwords_and_short_tokens() {
        let tokens = sparse_query_tokens("find the function that opens a file");
        assert_eq!(tokens, vec!["find", "function", "opens", "file"]);
    }

    #[test]
    fn sparse_query_tokens_deduplicates() {
        let tokens = sparse_query_tokens("parse json parse xml parse");
        assert_eq!(tokens, vec!["parse", "json", "xml"]);
    }

    #[test]
    fn has_whole_word_respects_word_boundaries() {
        assert!(has_whole_word("parse_json_body", "parse"));
        assert!(!has_whole_word("parser", "parse"));
        assert!(!has_whole_word("parserequest", "parse"));
        assert!(has_whole_word("parse the file", "parse"));
        assert!(has_whole_word("open file", "file"));
        assert!(!has_whole_word("xyz", ""));
        assert!(!has_whole_word("ab", "abc"));
    }

    #[test]
    fn sparse_coverage_bonus_zero_for_single_token_query() {
        let sym = mk_symbol("parse_json", "fn parse_json(input: &str) -> Value");
        let bonus = sparse_coverage_bonus("parse", &sym);
        assert_eq!(bonus, 0.0);
    }

    #[test]
    fn sparse_coverage_bonus_zero_below_threshold() {
        let sym = mk_symbol("parse_json", "fn parse_json(input: &str) -> Value");
        let bonus = sparse_coverage_bonus("parse rename", &sym);
        assert_eq!(bonus, 0.0);
    }

    #[test]
    fn sparse_coverage_bonus_full_match_reaches_max() {
        let sym = mk_symbol(
            "parse_json_body",
            "fn parse_json_body(input: &str) -> Value",
        );
        let bonus = sparse_coverage_bonus("parse json body", &sym);
        assert!((bonus - 20.0).abs() < 0.01, "expected ~20, got {bonus}");
    }

    #[test]
    fn sparse_coverage_bonus_ignores_whole_word_false_positives() {
        let sym = mk_symbol("parser", "fn parser(input: &str) -> Json");
        let bonus = sparse_coverage_bonus("parse json", &sym);
        assert_eq!(bonus, 0.0);
    }
}