use std::collections::HashMap;
use std::path::Path;
use once_cell::sync::Lazy;
use regex::Regex;
use crate::model::Chunk;
/// Matches paths whose file name follows a test-file naming convention.
///
/// The alternation is anchored between `(?:^|/)` and `$`, and every alternative
/// excludes `/`, so only the final path component is inspected and only `/` is
/// recognised as a separator. Covered shapes include prefix styles
/// (`test_*.py`, `test_*.c`, `test_*.lua`, ...), suffix styles (`*_test.go`,
/// `*Test.java`, `*Spec.kt`, `*Suite.scala`, ...), infix styles
/// (`*.test.ts`, `*.spec.jsx`, ...) and `test_helper*` / `test_helpers*` files
/// with any `\w+` extension.
///
/// The pattern is compiled lazily on first use; `unwrap` only fails if the
/// literal pattern itself is invalid.
static TEST_FILE_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(concat!(
r"(?:^|/)",
r"(?:",
r"test_[^/]*\.py",
r"|[^/]*_test\.py",
r"|[^/]*_test\.go",
r"|[^/]*Tests?\.java",
r"|[^/]*Test\.php",
r"|[^/]*_spec\.rb",
r"|[^/]*_test\.rb",
r"|[^/]*\.test\.[jt]sx?",
r"|[^/]*\.spec\.[jt]sx?",
r"|[^/]*Tests?\.kt",
r"|[^/]*Spec\.kt",
r"|[^/]*Tests?\.swift",
r"|[^/]*Spec\.swift",
r"|[^/]*Tests?\.cs",
r"|test_[^/]*\.cpp",
r"|[^/]*_test\.cpp",
r"|test_[^/]*\.c",
r"|[^/]*_test\.c",
r"|[^/]*Spec\.scala",
r"|[^/]*Suite\.scala",
r"|[^/]*Test\.scala",
r"|[^/]*_test\.dart",
r"|test_[^/]*\.dart",
r"|[^/]*_spec\.lua",
r"|[^/]*_test\.lua",
r"|test_[^/]*\.lua",
r"|test_helpers?[^/]*\.\w+",
r")$",
))
.unwrap()
});
/// Matches paths containing a `/`-delimited component that is exactly `test`,
/// `tests`, `__tests__`, `spec` or `testing` (at any depth, including the last
/// component).
static TEST_DIR_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?:^|/)(?:tests?|__tests__|spec|testing)(?:/|$)").unwrap());
/// Matches paths containing a `/`-delimited component that is exactly
/// `compat`, `_compat` or `legacy`.
static COMPAT_DIR_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?:^|/)(?:compat|_compat|legacy)(?:/|$)").unwrap());
/// Matches paths containing a `/`-delimited component that is exactly
/// `example`, `examples`, `_example`, `_examples`, `doc_src` or `docs_src`.
static EXAMPLES_DIR_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?:^|/)(?:_?examples?|docs?_src)(?:/|$)").unwrap());
/// Matches fixture-like paths: either a `/`-delimited component that is exactly
/// `fixture`, `fixtures` or `testdata`, or a final component (file name) that
/// starts with `fixture`.
static FIXTURE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?:^|/)(?:fixtures?|testdata)(?:/|$)|(?:^|/)fixture[^/]*$").unwrap());
/// Matches paths that end in `.d.ts`.
static TYPE_DEFS_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\.d\.ts$").unwrap());
// Score multipliers used by `file_path_penalty`; smaller values demote harder.
// Multipliers for several matching categories are multiplied together.

/// Multiplier applied for test files/directories, compat/legacy directories,
/// example directories and fixture paths.
const STRONG_PENALTY: f64 = 0.3;
/// Multiplier applied when the file name is listed in `REEXPORT_FILENAMES`.
const MODERATE_PENALTY: f64 = 0.5;
/// Multiplier applied to paths ending in `.d.ts`.
const MILD_PENALTY: f64 = 0.7;
/// Exact file names that receive `MODERATE_PENALTY` in `file_path_penalty`.
/// NOTE(review): judging by the name, these are treated as files that mostly
/// re-export or describe a package rather than implement it — confirm intent.
const REEXPORT_FILENAMES: &[&str] = &["__init__.py", "package-info.java"];
/// Number of chunks `rerank_topk` accepts from a single file at full score;
/// chunks picked from that file after this many are decayed.
const FILE_SATURATION_THRESHOLD: usize = 1;
/// Per-pick decay factor for saturated files: a chunk's score is multiplied by
/// this value raised to the number of picks at or beyond the threshold.
const FILE_SATURATION_DECAY: f64 = 0.5;
/// Re-ranks scored chunks and returns at most `top_k` `(chunk index, score)`
/// pairs, best first.
///
/// The ranking happens in three steps:
///
/// 1. If `penalise_paths` is `true`, every score is multiplied by
///    `file_path_penalty` of the chunk's file path (computed once per distinct
///    path). Otherwise scores are used unchanged.
/// 2. Candidates are visited in descending penalised-score order. A candidate
///    from a file that has already contributed `FILE_SATURATION_THRESHOLD` or
///    more candidates has its score multiplied by `FILE_SATURATION_DECAY` once
///    per pick at or beyond the threshold. Visiting stops as soon as `top_k`
///    candidates are held and the next penalised score cannot exceed the
///    lowest effective score held so far.
/// 3. The visited candidates are sorted by effective score and truncated to
///    `top_k`. The returned score is the effective (penalised and decayed) one.
///
/// Ordering is deterministic: equal scores are ordered by ascending chunk
/// index and NaN scores rank after every other score.
///
/// # Panics
///
/// May panic if `scores` contains an index that is out of bounds for `chunks`.
pub fn rerank_topk(
    scores: &HashMap<usize, f64>,
    chunks: &[Chunk],
    top_k: usize,
    penalise_paths: bool,
) -> Vec<(usize, f64)> {
    /// Total order over `(score, index)`: descending score, NaN last, then
    /// ascending index. A total order is required by the slice sort routines
    /// and makes the result independent of `HashMap` iteration order.
    fn by_score_desc(a: (f64, usize), b: (f64, usize)) -> std::cmp::Ordering {
        b.0.partial_cmp(&a.0)
            // `partial_cmp` is `None` only when a NaN is involved.
            .unwrap_or_else(|| a.0.is_nan().cmp(&b.0.is_nan()))
            .then_with(|| a.1.cmp(&b.1))
    }

    if scores.is_empty() {
        return Vec::new();
    }

    // Step 1: apply the (cached) per-file path penalty.
    let mut penalty_cache: HashMap<&str, f64> = HashMap::new();
    let mut penalised: Vec<(usize, f64)> = scores
        .iter()
        .map(|(&idx, &score)| {
            let penalty = if penalise_paths {
                let fp = chunks[idx].file_path.as_str();
                *penalty_cache
                    .entry(fp)
                    .or_insert_with(|| file_path_penalty(fp))
            } else {
                1.0
            };
            (idx, score * penalty)
        })
        .collect();
    // Indices are unique map keys, so the comparator never reports equality
    // for distinct elements and an unstable sort is deterministic.
    penalised.sort_unstable_by(|a, b| by_score_desc((a.1, a.0), (b.1, b.0)));

    // Step 2: greedy pass with per-file saturation decay.
    let mut file_selected: HashMap<&str, usize> = HashMap::new();
    let mut selected: Vec<(f64, usize)> = Vec::with_capacity(top_k.min(penalised.len()));
    // Lowest effective score among everything selected so far. It is only
    // consulted once `top_k` items are held, so a running minimum is
    // equivalent to rescanning `selected` on every iteration.
    let mut min_selected = f64::INFINITY;
    for &(idx, pen_score) in &penalised {
        // Decay only ever lowers a score, so no later candidate can beat the
        // current minimum once its undecayed score does not.
        if selected.len() >= top_k && pen_score <= min_selected {
            break;
        }
        let fp = chunks[idx].file_path.as_str();
        let picks = file_selected.entry(fp).or_default();
        let mut eff_score = pen_score;
        if *picks >= FILE_SATURATION_THRESHOLD {
            let excess = *picks - FILE_SATURATION_THRESHOLD + 1;
            // Clamp before narrowing so an absurdly large count cannot wrap.
            eff_score *= FILE_SATURATION_DECAY.powi(excess.min(i32::MAX as usize) as i32);
        }
        // Every visited candidate counts towards its file's saturation, even
        // if it ends up outside the final `top_k`.
        *picks += 1;
        selected.push((eff_score, idx));
        min_selected = min_selected.min(eff_score);
    }

    // Step 3: decay may have reordered candidates, so sort again and trim.
    selected.sort_unstable_by(|&a, &b| by_score_desc(a, b));
    selected.truncate(top_k);
    selected
        .into_iter()
        .map(|(score, idx)| (idx, score))
        .collect()
}
/// Applies the file-path penalty to every score and returns the `top_k`
/// highest `(chunk index, penalised score)` pairs, best first.
///
/// Unlike `rerank_topk`, no per-file saturation decay is applied and the path
/// penalty is always on.
///
/// Ordering is deterministic: equal scores are ordered by ascending chunk
/// index and NaN scores rank after every other score.
///
/// # Panics
///
/// Panics if `scores` contains an index that is out of bounds for `chunks`.
pub fn rerank_topk_for_bm25_code(
    scores: &HashMap<usize, f64>,
    chunks: &[Chunk],
    top_k: usize,
) -> Vec<(usize, f64)> {
    // The penalty depends only on the path, so compute it once per file.
    let mut penalty_cache: HashMap<&str, f64> = HashMap::new();
    let mut penalised: Vec<(usize, f64)> = scores
        .iter()
        .map(|(&idx, &score)| {
            let fp = chunks[idx].file_path.as_str();
            let penalty = *penalty_cache
                .entry(fp)
                .or_insert_with(|| file_path_penalty(fp));
            (idx, score * penalty)
        })
        .collect();

    // Descending score, NaN last, ties by ascending index. This is a total
    // order (required by the sort routines) and, because indices are unique
    // map keys, makes the result independent of `HashMap` iteration order.
    penalised.sort_unstable_by(|a, b| {
        b.1.partial_cmp(&a.1)
            // `partial_cmp` is `None` only when a NaN is involved.
            .unwrap_or_else(|| a.1.is_nan().cmp(&b.1.is_nan()))
            .then_with(|| a.0.cmp(&b.0))
    });
    penalised.truncate(top_k);
    penalised
}
/// Returns the multiplicative score penalty for `file_path`.
///
/// The result is `1.0` for a path that matches no category; each matching
/// category multiplies the result by its factor, so penalties compound:
///
/// * test file name or test directory: `STRONG_PENALTY` (applied once even if
///   both match)
/// * file name listed in `REEXPORT_FILENAMES`: `MODERATE_PENALTY`
/// * compat/legacy directory: `STRONG_PENALTY`
/// * examples directory: `STRONG_PENALTY`
/// * fixture path: `STRONG_PENALTY`
/// * path ending in `.d.ts`: `MILD_PENALTY`
///
/// Backslashes are treated as path separators for every check.
pub(crate) fn file_path_penalty(file_path: &str) -> f64 {
    // All patterns only understand `/`, so fold Windows separators first.
    let normalised = file_path.replace('\\', "/");
    let mut penalty = 1.0;

    if TEST_FILE_RE.is_match(&normalised) || TEST_DIR_RE.is_match(&normalised) {
        penalty *= STRONG_PENALTY;
    }

    // Take the file name from the normalised path as well: on non-Windows
    // hosts `Path` does not split on `\`, so the raw string `pkg\__init__.py`
    // would yield itself as the file name and miss the re-export check.
    let is_reexport = Path::new(&normalised)
        .file_name()
        .and_then(|name| name.to_str())
        .map_or(false, |name| REEXPORT_FILENAMES.contains(&name));
    if is_reexport {
        penalty *= MODERATE_PENALTY;
    }

    if COMPAT_DIR_RE.is_match(&normalised) {
        penalty *= STRONG_PENALTY;
    }
    if EXAMPLES_DIR_RE.is_match(&normalised) {
        penalty *= STRONG_PENALTY;
    }
    if FIXTURE_RE.is_match(&normalised) {
        penalty *= STRONG_PENALTY;
    }
    if TYPE_DEFS_RE.is_match(&normalised) {
        penalty *= MILD_PENALTY;
    }

    penalty
}