use std::cmp::Ordering;
use std::collections::{HashMap, HashSet};
use regex::Regex;
use crate::model::{Chunk, MatchLine, SearchResult};
use crate::ranking::penalties::file_path_penalty;
const EXACT_SCORE_BASE: f64 = 10_000.0;
const MAX_MATCH_LINES: usize = 8;
const REGEX_PREFIX: &str = "re:";
type Trigram = [u8; 3];
pub(crate) struct ExactIndex {
postings: HashMap<Trigram, Vec<usize>>,
}
impl ExactIndex {
pub(crate) fn new(chunks: &[Chunk]) -> Self {
let mut postings: HashMap<Trigram, Vec<usize>> = HashMap::new();
for (idx, chunk) in chunks.iter().enumerate() {
let mut seen = HashSet::new();
for trigram in trigrams(chunk.content.as_bytes()) {
if seen.insert(trigram) {
postings.entry(trigram).or_default().push(idx);
}
}
}
for list in postings.values_mut() {
list.sort_unstable();
list.dedup();
}
Self { postings }
}
pub(crate) fn search(
&self,
query: &str,
chunks: &[Chunk],
top_k: usize,
selector: Option<&[usize]>,
) -> Vec<SearchResult> {
let query = query.trim();
if query.is_empty() || top_k == 0 {
return Vec::new();
}
if let Some(pattern) = query.strip_prefix(REGEX_PREFIX) {
return self.search_regex(pattern, chunks, top_k, selector);
}
self.search_literal(query, chunks, top_k, selector)
}
fn search_literal(
&self,
needle: &str,
chunks: &[Chunk],
top_k: usize,
selector: Option<&[usize]>,
) -> Vec<SearchResult> {
let required = unique_trigrams(needle.as_bytes());
if required.is_empty() {
return Vec::new();
}
let candidates = self.candidate_chunks(&required, selector);
let mut results = Vec::new();
for idx in candidates {
let chunk = &chunks[idx];
if !chunk.content.contains(needle) {
continue;
}
let match_lines = literal_match_lines(chunk, needle);
results.push(SearchResult {
chunk: chunk.clone(),
score: EXACT_SCORE_BASE + match_lines.len() as f64,
match_lines,
});
}
sort_and_limit(results, top_k)
}
fn search_regex(
&self,
pattern: &str,
chunks: &[Chunk],
top_k: usize,
selector: Option<&[usize]>,
) -> Vec<SearchResult> {
let literals = match mandatory_literals_for_safe_regex(pattern) {
Some(literals) if !literals.is_empty() => literals,
_ => return Vec::new(),
};
let mut required = Vec::new();
for literal in &literals {
required.extend(unique_trigrams(literal.as_bytes()));
}
required.sort_unstable();
required.dedup();
if required.is_empty() {
return Vec::new();
}
let regex = match Regex::new(pattern) {
Ok(regex) => regex,
Err(_) => return Vec::new(),
};
let candidates = self.candidate_chunks(&required, selector);
let mut results = Vec::new();
for idx in candidates {
let chunk = &chunks[idx];
if !regex.is_match(&chunk.content) {
continue;
}
let match_lines = regex_match_lines(chunk, ®ex);
results.push(SearchResult {
chunk: chunk.clone(),
score: EXACT_SCORE_BASE + 100.0 + match_lines.len() as f64,
match_lines,
});
}
sort_and_limit(results, top_k)
}
fn candidate_chunks(&self, required: &[Trigram], selector: Option<&[usize]>) -> Vec<usize> {
let mut lists: Vec<&Vec<usize>> = Vec::new();
for trigram in required {
let Some(list) = self.postings.get(trigram) else {
return Vec::new();
};
lists.push(list);
}
lists.sort_by_key(|list| list.len());
let Some(first) = lists.first() else {
return Vec::new();
};
let mut current = (*first).clone();
for list in lists.iter().skip(1) {
current = intersect_sorted(¤t, list);
if current.is_empty() {
break;
}
}
if let Some(selector) = selector {
current = intersect_sorted(¤t, selector);
}
current
}
}
fn trigrams(bytes: &[u8]) -> impl Iterator<Item = Trigram> + '_ {
bytes.windows(3).map(|w| [w[0], w[1], w[2]])
}
fn unique_trigrams(bytes: &[u8]) -> Vec<Trigram> {
if bytes.len() < 3 {
return Vec::new();
}
let mut values: Vec<Trigram> = trigrams(bytes).collect();
values.sort_unstable();
values.dedup();
values
}
fn intersect_sorted(left: &[usize], right: &[usize]) -> Vec<usize> {
let mut out = Vec::new();
let mut i = 0;
let mut j = 0;
while i < left.len() && j < right.len() {
match left[i].cmp(&right[j]) {
Ordering::Less => i += 1,
Ordering::Greater => j += 1,
Ordering::Equal => {
out.push(left[i]);
i += 1;
j += 1;
}
}
}
out
}
fn literal_match_lines(chunk: &Chunk, needle: &str) -> Vec<MatchLine> {
let mut lines = Vec::new();
for (offset, line) in chunk.content.lines().enumerate() {
if line.contains(needle) {
lines.push(MatchLine {
line: chunk.start_line + offset,
content: line.trim().to_string(),
});
if lines.len() >= MAX_MATCH_LINES {
return lines;
}
}
}
if lines.is_empty() {
if let Some(byte_idx) = chunk.content.find(needle) {
lines.push(MatchLine {
line: byte_to_line(&chunk.content, byte_idx, chunk.start_line),
content: preview_at(&chunk.content, byte_idx),
});
}
}
lines
}
fn regex_match_lines(chunk: &Chunk, regex: &Regex) -> Vec<MatchLine> {
let mut lines = Vec::new();
for mat in regex.find_iter(&chunk.content) {
lines.push(MatchLine {
line: byte_to_line(&chunk.content, mat.start(), chunk.start_line),
content: preview_at(&chunk.content, mat.start()),
});
if lines.len() >= MAX_MATCH_LINES {
break;
}
}
lines
}
fn byte_to_line(content: &str, byte_idx: usize, start_line: usize) -> usize {
start_line + content[..byte_idx].bytes().filter(|&b| b == b'\n').count()
}
fn preview_at(content: &str, byte_idx: usize) -> String {
let before = content[..byte_idx].rfind('\n').map_or(0, |idx| idx + 1);
let after = content[byte_idx..]
.find('\n')
.map_or(content.len(), |idx| byte_idx + idx);
content[before..after].trim().to_string()
}
fn mandatory_literals_for_safe_regex(pattern: &str) -> Option<Vec<String>> {
if pattern.contains('|')
|| pattern.contains('?')
|| pattern.contains('*')
|| pattern.contains('{')
{
return None;
}
let mut literals = Vec::new();
let mut current = String::new();
let mut chars = pattern.chars().peekable();
let mut in_class = false;
while let Some(ch) = chars.next() {
if in_class {
if ch == ']' {
in_class = false;
}
flush_literal(&mut literals, &mut current);
continue;
}
match ch {
'[' => {
in_class = true;
flush_literal(&mut literals, &mut current);
}
'\\' => match chars.next() {
Some(escaped) if escaped.is_ascii_alphanumeric() => {
flush_literal(&mut literals, &mut current);
}
Some(escaped) => current.push(escaped),
None => return None,
},
'.' | '^' | '$' | '(' | ')' | '+' => {
flush_literal(&mut literals, &mut current);
}
_ => current.push(ch),
}
}
if in_class {
return None;
}
flush_literal(&mut literals, &mut current);
literals.retain(|literal| literal.len() >= 3);
literals.sort();
literals.dedup();
Some(literals)
}
fn flush_literal(literals: &mut Vec<String>, current: &mut String) {
if current.len() >= 3 {
literals.push(std::mem::take(current));
} else {
current.clear();
}
}
fn sort_and_limit(mut results: Vec<SearchResult>, top_k: usize) -> Vec<SearchResult> {
results.sort_by(|a, b| {
exact_rank_score(b)
.partial_cmp(&exact_rank_score(a))
.unwrap_or(Ordering::Equal)
.then_with(|| b.score.partial_cmp(&a.score).unwrap_or(Ordering::Equal))
.then_with(|| a.chunk.file_path.cmp(&b.chunk.file_path))
.then_with(|| a.chunk.start_line.cmp(&b.chunk.start_line))
.then_with(|| a.chunk.end_line.cmp(&b.chunk.end_line))
});
results.truncate(top_k);
results
}
fn exact_rank_score(result: &SearchResult) -> f64 {
result.score * file_path_penalty(&result.chunk.file_path)
}
#[cfg(test)]
mod tests {
use super::*;
fn chunk(path: &str, content: &str) -> Chunk {
Chunk::new(
content.to_string(),
path.to_string(),
1,
content.lines().count(),
Some("rust".to_string()),
)
}
#[test]
fn literal_search_verifies_candidates() {
let chunks = vec![
chunk("src/retry.rs", "pub fn retry_backoff() -> u64 { 100 }\n"),
chunk("src/other.rs", "pub fn retry_then_wait() {}\n"),
];
let index = ExactIndex::new(&chunks);
let results = index.search("retry_backoff", &chunks, 5, None);
assert_eq!(results.len(), 1);
assert_eq!(results[0].chunk.file_path, "src/retry.rs");
assert!(results[0].match_lines[0].content.contains("retry_backoff"));
}
#[test]
fn safe_regex_uses_verified_match_truth() {
let chunks = vec![chunk(
"src/retry.rs",
"pub fn retry_backoff() -> u64 { 100 }\n",
)];
let index = ExactIndex::new(&chunks);
let results = index.search("re:retry_\\w+", &chunks, 5, None);
assert_eq!(results.len(), 1);
assert_eq!(results[0].chunk.file_path, "src/retry.rs");
}
#[test]
fn exact_symbol_search_prefers_implementation_over_script_fixture() {
let chunks = vec![
chunk(
"scripts/fixture.sh",
"cat > main.rs <<'EOF'\nfn calculate_invoice_00() {}\nEOF\n",
),
chunk("src/feature.rs", "pub fn calculate_invoice_00() {}\n"),
chunk(
"tests/feature_test.rs",
"#[test]\nfn duplicate_calculate_invoice_00_test() {}\n",
),
];
let index = ExactIndex::new(&chunks);
let results = index.search("calculate_invoice_00", &chunks, 3, None);
assert_eq!(results[0].chunk.file_path, "src/feature.rs");
}
#[test]
fn broad_or_ambiguous_regex_does_not_guess() {
let chunks = vec![chunk(
"src/retry.rs",
"pub fn retry_backoff() -> u64 { 100 }\n",
)];
let index = ExactIndex::new(&chunks);
let results = index.search("re:retry|backoff", &chunks, 5, None);
assert!(results.is_empty());
}
}