use std::collections::BTreeSet;
use crate::model::{Chunk, MatchLine, SearchResult};
/// Reorders `candidates` so that distinct files are represented before any
/// file contributes a second chunk.
///
/// The ordering is produced in two passes over the input, both in input order:
///
/// 1. For each file, the first result whose score is at least 60% of the best
///    score (the best score is taken as at least `0.0`) is emitted.
/// 2. Every remaining result is emitted, skipping any whose
///    `(file_path, start_line, end_line)` key has already been emitted.
///
/// The returned references borrow from `candidates`; nothing is cloned.
pub(crate) fn context_selection_order(candidates: &[SearchResult]) -> Vec<&SearchResult> {
    // Share of the best score a result must reach to lead its file.
    const HIGH_CONFIDENCE_RATIO: f64 = 0.60;

    // Identity of a chunk: its file path plus its line span.
    fn chunk_key(result: &SearchResult) -> (&str, usize, usize) {
        (
            result.chunk.file_path.as_str(),
            result.chunk.start_line,
            result.chunk.end_line,
        )
    }

    // Start from 0.0 so an empty or all-negative input yields a floor of 0.0.
    let mut top_score = 0.0_f64;
    for candidate in candidates {
        top_score = top_score.max(candidate.score);
    }
    let floor = top_score * HIGH_CONFIDENCE_RATIO;

    let mut seen_files: BTreeSet<&str> = BTreeSet::new();
    let mut emitted: BTreeSet<(&str, usize, usize)> = BTreeSet::new();
    let mut ordered: Vec<&SearchResult> = Vec::with_capacity(candidates.len());

    // Pass 1: one high-confidence representative per file. A file is only
    // marked as seen once a result above the floor has been found for it.
    for candidate in candidates.iter().filter(|c| c.score >= floor) {
        if seen_files.insert(candidate.chunk.file_path.as_str()) {
            emitted.insert(chunk_key(candidate));
            ordered.push(candidate);
        }
    }

    // Pass 2: everything not emitted yet, preserving input order. `insert`
    // returning `false` filters out both pass-1 picks and duplicate chunks.
    ordered.extend(
        candidates
            .iter()
            .filter(|&candidate| emitted.insert(chunk_key(candidate))),
    );

    ordered
}
/// Chooses which lines of `chunk` to include without exceeding
/// `remaining_budget` estimated tokens.
///
/// Returns `Some((first_line, last_line, estimated_tokens))`, with line
/// numbers expressed in the chunk's own numbering (offsets from
/// `chunk.start_line`), or `None` when nothing fits:
///
/// * A zero budget yields `None`.
/// * If the whole chunk fits, its full `start_line..=end_line` range is
///   returned together with the estimate for the complete content.
/// * Otherwise a window is grown symmetrically, one line per side per step,
///   around the first entry of `match_lines` (or the chunk's first line when
///   there are no matches). The widest window that still fits is returned;
///   if even the center line alone is over budget the result is `None`.
///
/// Window estimates use the same formula as `estimate_tokens`, applied to the
/// selected lines joined by single newline bytes.
///
/// This function does not panic on a chunk whose `end_line` is smaller than
/// its `start_line`; such a chunk is centered on `start_line`.
pub(crate) fn bounded_chunk_range(
    chunk: &Chunk,
    match_lines: &[MatchLine],
    remaining_budget: usize,
) -> Option<(usize, usize, usize)> {
    if remaining_budget == 0 {
        return None;
    }

    // Fast path: the entire chunk fits, so no trimming is needed.
    let full_estimate = estimate_tokens(&chunk.content).max(1);
    if full_estimate <= remaining_budget {
        return Some((chunk.start_line, chunk.end_line, full_estimate));
    }

    let lines: Vec<&str> = chunk.content.lines().collect();
    if lines.is_empty() {
        return None;
    }
    let last_idx = lines.len() - 1;

    // `Ord::clamp` panics when its lower bound exceeds its upper bound, so an
    // inverted line range must not reach it unmodified. Raising the upper
    // bound to at least `start_line` leaves well-formed chunks unaffected.
    let last_line = chunk.end_line.max(chunk.start_line);
    let center_line = match_lines
        .first()
        .map_or(chunk.start_line, |m| m.line)
        .clamp(chunk.start_line, last_line);
    // The content may hold fewer lines than the declared range; stay in bounds.
    let center_idx = center_line.saturating_sub(chunk.start_line).min(last_idx);

    // Prefix sums (with a leading 0) make each window estimate O(1):
    // prefix[i] is the total over lines[..i].
    let byte_prefix: Vec<usize> = std::iter::once(0)
        .chain(lines.iter().scan(0usize, |acc, line| {
            *acc += line.len();
            Some(*acc)
        }))
        .collect();
    let word_prefix: Vec<usize> = std::iter::once(0)
        .chain(lines.iter().scan(0usize, |acc, line| {
            *acc += line.split_whitespace().count();
            Some(*acc)
        }))
        .collect();

    // Estimate for the inclusive index window `start..=end`. `lines()` drops
    // the separators, so one byte per interior newline is added back.
    let estimate_slice = |start: usize, end: usize| -> usize {
        let bytes = (byte_prefix[end + 1] - byte_prefix[start]) + (end - start);
        let words = word_prefix[end + 1] - word_prefix[start];
        bytes.div_ceil(4).max(words).max(1)
    };

    let mut best: Option<(usize, usize, usize)> = None;
    for radius in 0..lines.len() {
        let start_idx = center_idx.saturating_sub(radius);
        let end_idx = (center_idx + radius).min(last_idx);
        let estimate = estimate_slice(start_idx, end_idx);
        if estimate > remaining_budget {
            // Windows only grow with the radius, so no later one can fit.
            break;
        }
        best = Some((
            chunk.start_line + start_idx,
            chunk.start_line + end_idx,
            estimate,
        ));
        if start_idx == 0 && end_idx == last_idx {
            // The window already spans every line; nothing left to add.
            break;
        }
    }
    best
}
/// Rough token estimate for `text`.
///
/// Takes the larger of "one token per four bytes, rounded up" and "one token
/// per whitespace-separated word", and never reports fewer than one token,
/// even for empty or all-whitespace input.
fn estimate_tokens(text: &str) -> usize {
    let byte_estimate = text.len().div_ceil(4);
    let word_estimate = text.split_whitespace().count();
    let larger = if word_estimate > byte_estimate {
        word_estimate
    } else {
        byte_estimate
    };
    if larger == 0 {
        1
    } else {
        larger
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `SearchResult` for `path` with the given score, content and
    /// line span; the language is fixed to "rust" and there are no match lines.
    fn result_with_content(
        path: &str,
        score: f64,
        content: &str,
        start_line: usize,
        end_line: usize,
    ) -> SearchResult {
        let chunk = Chunk::new(
            String::from(content),
            String::from(path),
            start_line,
            end_line,
            Some(String::from("rust")),
        );
        SearchResult {
            chunk,
            score,
            match_lines: Vec::new(),
        }
    }

    /// A second chunk from `src/a.rs` must come after the first chunk of
    /// `src/b.rs`, even though it scores higher.
    #[test]
    fn context_selection_order_prefers_unique_files_before_second_chunk() {
        let candidates = [
            result_with_content("src/a.rs", 100.0, "fn first() {}", 1, 10),
            result_with_content("src/a.rs", 90.0, "fn second() {}", 11, 20),
            result_with_content("src/b.rs", 80.0, "fn third() {}", 1, 10),
        ];

        let paths: Vec<&str> = context_selection_order(&candidates)
            .into_iter()
            .map(|hit| hit.chunk.file_path.as_str())
            .collect();

        assert_eq!(paths, ["src/a.rs", "src/b.rs", "src/a.rs"]);
    }
}