use crate::display;
use crate::message::{ConversationMessage, SearchHit};
use crate::text;
use regex::Regex;
use std::collections::HashMap;
// BM25 `k1`: used in the term-frequency component of `query`
// (`tf * (k1 + 1) / (tf + k1 * len_norm)`), where it limits how much repeated
// occurrences of a term can add to the score.
const BM25_K1: f64 = 1.5;
// BM25 `b`: weight of the document-length normalisation in `query`
// (`1 - b + b * dl / avgdl`).
const BM25_B: f64 = 0.75;
// Strength of the proximity boost in `query`: a score is multiplied by
// `1 + PROXIMITY_FACTOR / (1 + min_span)` when more than one term matched.
const PROXIMITY_FACTOR: f64 = 2.0;
/// Converts a count to `f64` so it can take part in score arithmetic.
///
/// There is no lossless `From<usize>` for `f64`, so the cast is explicit;
/// values above 2^53 are rounded to the nearest representable float.
// The precision-loss lint is silenced in this one place so callers stay clean.
#[allow(clippy::cast_precision_loss)]
fn usize_as_f64(value: usize) -> f64 {
    value as f64
}
/// Converts a signed fuzzy-match score to `f64` so it can be stored in a
/// `SearchHit` alongside the other (floating-point) score kinds.
///
/// Magnitudes above 2^53 are rounded to the nearest representable float.
// The precision-loss lint is silenced in this one place so callers stay clean.
#[allow(clippy::cast_precision_loss)]
fn fuzzy_score_as_f64(value: isize) -> f64 {
    value as f64
}
/// Fuzzy-matches `query` against the searchable text of `msg`.
///
/// Returns `None` when `sublime_fuzzy::best_match` finds no match. Otherwise
/// the hit carries the fuzzy score and a snippet built around the first
/// matched index with two lines of context on each side.
fn build_fuzzy_hit(msg: &ConversationMessage, query: &str) -> Option<SearchHit> {
    let haystack = display::searchable_text(msg);
    sublime_fuzzy::best_match(query, &haystack).map(|best| {
        let role = msg.role_label();
        let first_index = best.matched_indices().next().copied();
        let text = fuzzy_snippet(&haystack, first_index, 2);
        let files = display::message_files(msg);
        SearchHit {
            entry_id: msg.entry_id.clone(),
            score: fuzzy_score_as_f64(best.score()),
            role,
            text,
            files,
        }
    })
}
/// Builds a snippet of `text` centred on the line that contains `first_match`.
///
/// * `first_match` — index of the first matched character, counted in
///   `char`s (not bytes), as reported by the fuzzy matcher. `None` means no
///   position is known.
/// * `context_lines` — number of lines to include on each side of the
///   matched line.
///
/// Falls back to the first 200 units of `text` (via `text::clip`) when there
/// is no match position or the position lies past the end of the text.
fn fuzzy_snippet(text: &str, first_match: Option<usize>, context_lines: usize) -> String {
    let Some(pos) = first_match else {
        return text::clip(text, 200);
    };
    let lines: Vec<&str> = text.split('\n').collect();
    // Running count of characters consumed so far, including one for the
    // newline that terminated each line. This must be counted in chars, not
    // bytes, to stay in the same unit as `pos`; otherwise any multi-byte
    // character before the match would select too early a line.
    let mut chars_seen = 0usize;
    for (line_idx, line) in lines.iter().enumerate() {
        chars_seen += line.chars().count() + 1;
        if chars_seen > pos {
            return line_snippet_at(&lines, line_idx, context_lines);
        }
    }
    text::clip(text, 200)
}
/// Joins the line at `match_idx` with up to `context_lines` lines on each
/// side, adding `...(N lines above)` / `...(N lines below)` markers for the
/// parts of `lines` that were left out.
///
/// The window is clamped to `lines`, so an arbitrarily large `context_lines`
/// returns every line, and a `match_idx` past the end yields only the
/// "lines above" marker instead of panicking.
fn line_snippet_at(lines: &[&str], match_idx: usize, context_lines: usize) -> String {
    // Saturating arithmetic: `match_idx + context_lines + 1` must not overflow
    // when a caller passes a very large context.
    let end = match_idx
        .saturating_add(context_lines)
        .saturating_add(1)
        .min(lines.len());
    // Clamp so that `start..end` is always a valid range.
    let start = match_idx.saturating_sub(context_lines).min(end);

    // Window plus at most two marker lines.
    let mut out: Vec<String> = Vec::with_capacity(end - start + 2);
    if start > 0 {
        out.push(format!("...({start} lines above)"));
    }
    out.extend(lines[start..end].iter().map(|line| (*line).to_string()));
    if end < lines.len() {
        out.push(format!("...({} lines below)", lines.len() - end));
    }
    out.join("\n")
}
/// Returns the length of the shortest window of token positions that contains
/// at least one position from every list in `positions`.
///
/// The length is inclusive (`last - first + 1`), so terms on neighbouring
/// positions give 2 and a single one-element list gives 1. Returns
/// `usize::MAX` when no window covers every list (including when `positions`
/// is empty or any list is empty).
fn min_span_all(positions: &[Vec<usize>]) -> usize {
    // Flatten to (position, term index) pairs ordered by position.
    let mut tagged: Vec<(usize, usize)> = positions
        .iter()
        .enumerate()
        .flat_map(|(term, list)| list.iter().map(move |&pos| (pos, term)))
        .collect();
    tagged.sort_by_key(|&(pos, _)| pos);

    let needed = positions.len();
    // How many occurrences of each term are inside the current window.
    let mut in_window = vec![0usize; needed];
    let mut covered = 0usize;
    let mut lo = 0usize;
    let mut best = usize::MAX;

    for &(hi_pos, hi_term) in &tagged {
        in_window[hi_term] += 1;
        if in_window[hi_term] == 1 {
            covered += 1;
        }
        // While every term is present, record the span and shrink from the left.
        while covered == needed {
            let (lo_pos, lo_term) = tagged[lo];
            best = best.min(hi_pos - lo_pos + 1);
            in_window[lo_term] -= 1;
            if in_window[lo_term] == 0 {
                covered -= 1;
            }
            lo += 1;
        }
    }
    best
}
/// Looser matcher that scores each message by how many pairs of neighbouring
/// query terms (`terms[i]`, `terms[i + 1]`) it contains.
///
/// `tokenized[i]` must hold the word list of `messages[i]`. A pair counts when
/// both of its terms occur anywhere in the message; the words do not have to
/// be next to each other in the text. Messages with no matching pair are
/// skipped. The result is sorted and paged by `sort_and_page`.
fn adjacent_fallback(
    messages: &[ConversationMessage],
    tokenized: &[Vec<String>],
    terms: &[String],
    page: usize,
    page_size: usize,
) -> (Vec<SearchHit>, usize) {
    let mut hits: Vec<SearchHit> = Vec::new();
    for (idx, words) in tokenized.iter().enumerate() {
        let present: std::collections::HashSet<&str> =
            words.iter().map(std::string::String::as_str).collect();
        let pair_count = terms
            .windows(2)
            .filter(|pair| pair.iter().all(|term| present.contains(term.as_str())))
            .count();
        if pair_count == 0 {
            continue;
        }

        let msg = &messages[idx];
        let role = msg.role_label();
        let searchable = display::searchable_text(msg);
        let text = text::line_snippet_terms(&searchable, terms, 2);
        let files = display::message_files(msg);
        hits.push(SearchHit {
            entry_id: msg.entry_id.clone(),
            score: usize_as_f64(pair_count),
            role,
            text,
            files,
        });
    }
    sort_and_page(hits, page, page_size)
}
/// Matches `regex` against the searchable text of `msg`.
///
/// Returns `None` when there is no match. Otherwise the hit's score is the
/// number of non-overlapping matches and its text is a regex-based line
/// snippet with two lines of context.
fn build_regex_hit(msg: &ConversationMessage, regex: &Regex) -> Option<SearchHit> {
    let haystack = display::searchable_text(msg);
    let occurrences = regex.find_iter(&haystack).count();
    (occurrences > 0).then(|| {
        let role = msg.role_label();
        let text = text::line_snippet_regex(&haystack, regex, 2);
        let files = display::message_files(msg);
        SearchHit {
            entry_id: msg.entry_id.clone(),
            score: usize_as_f64(occurrences),
            role,
            text,
            files,
        }
    })
}
/// Sorts `hits` by descending score and returns the requested page together
/// with the total number of hits before paging.
///
/// * `page` — 1-based page number; `0` is treated like `1`.
/// * `page_size` — maximum number of hits in the returned page.
///
/// A page past the end yields an empty vector (the total is still reported).
/// Hits with equal scores keep their incoming relative order.
fn sort_and_page(
    mut hits: Vec<SearchHit>,
    page: usize,
    page_size: usize,
) -> (Vec<SearchHit>, usize) {
    // `total_cmp` is a total order even if a score is NaN; `sort_by` requires
    // one and may panic on an inconsistent comparator.
    hits.sort_by(|a, b| b.score.total_cmp(&a.score));
    let total = hits.len();
    // Saturate so that a huge page number cannot overflow the offset.
    let start = page.saturating_sub(1).saturating_mul(page_size);
    // Move the page out of the sorted vector instead of cloning a slice.
    let page_hits: Vec<SearchHit> = hits.into_iter().skip(start).take(page_size).collect();
    (page_hits, total)
}
/// Returns a hit for every message whose searchable text matches `pattern`.
///
/// Hits are returned in message order (they are not sorted by score or
/// paged). If `text::make_regex` produces no regex for `pattern`, the result
/// is empty.
pub fn grep(messages: &[ConversationMessage], pattern: &str) -> Vec<SearchHit> {
    let Some(regex) = text::make_regex(pattern) else {
        return Vec::new();
    };
    messages
        .iter()
        .filter_map(|msg| build_regex_hit(msg, &regex))
        .collect()
}
pub fn query(
messages: &[ConversationMessage],
query_str: &str,
page: usize,
page_size: usize,
) -> (Vec<SearchHit>, usize) {
if text::looks_like_regex(query_str) {
let hits: Vec<SearchHit> = messages
.iter()
.filter_map(|msg| build_fuzzy_hit(msg, query_str))
.collect();
sort_and_page(hits, page, page_size)
} else {
let words = text::split_words(query_str);
let terms: Vec<String> = words
.into_iter()
.filter(|w| w.len() > 1 && !text::is_stop_word(w))
.collect();
if terms.is_empty() {
return (Vec::new(), 0);
}
let n = usize_as_f64(messages.len());
let mut df: HashMap<String, usize> = HashMap::new();
let mut tokenized: Vec<Vec<String>> = Vec::with_capacity(messages.len());
let mut doc_lengths: Vec<usize> = Vec::with_capacity(messages.len());
let mut total_words: usize = 0;
for msg in messages {
let searchable = display::searchable_text(msg);
let msg_words: Vec<String> = text::split_words(&searchable)
.into_iter()
.map(|w| w.to_ascii_lowercase())
.collect();
let dl = msg_words.len();
doc_lengths.push(dl);
total_words += dl;
let word_set: std::collections::HashSet<&str> =
msg_words.iter().map(std::string::String::as_str).collect();
for term in &terms {
if word_set.contains(term.as_str()) {
*df.entry(term.clone()).or_insert(0) += 1;
}
}
tokenized.push(msg_words);
}
let avgdl = if n > 0.0 {
usize_as_f64(total_words) / n
} else {
0.0
};
let hits: Vec<SearchHit> = messages
.iter()
.enumerate()
.filter_map(|(i, msg)| {
let msg_words = &tokenized[i];
let dl = usize_as_f64(doc_lengths[i]);
let len_norm = 1.0 - BM25_B + BM25_B * (dl / avgdl.max(1.0));
let mut score = 0.0;
let mut term_positions: Vec<Vec<usize>> = Vec::with_capacity(terms.len());
for term in &terms {
let positions: Vec<usize> = msg_words
.iter()
.enumerate()
.filter(|(_, w)| w.as_str() == term)
.map(|(i, _)| i)
.collect();
let tf = positions.len();
if tf == 0 {
continue;
}
term_positions.push(positions);
let df_val = usize_as_f64(*df.get(term).unwrap_or(&1));
let idf = ((n - df_val + 0.5) / (df_val + 0.5)).ln() + 1.0;
let tf = usize_as_f64(tf);
let tf_score = (tf * (BM25_K1 + 1.0)) / (tf + BM25_K1 * len_norm);
score += idf * tf_score;
}
if score > 0.0 && term_positions.len() > 1 {
let min_span = min_span_all(&term_positions).max(1);
score *= 1.0 + PROXIMITY_FACTOR / (1.0 + usize_as_f64(min_span));
}
if score == 0.0 {
return None;
}
let role = msg.role_label();
let searchable = display::searchable_text(msg);
let snippet = text::line_snippet_terms(&searchable, &terms, 2);
let files = display::message_files(msg);
Some(SearchHit {
entry_id: msg.entry_id.clone(),
score,
role,
text: snippet,
files,
})
})
.collect();
let (hits, total) = sort_and_page(hits, page, page_size);
if hits.is_empty() && total == 0 && terms.len() >= 2 {
return adjacent_fallback(messages, &tokenized, &terms, page, page_size);
}
(hits, total)
}
}