use crate::display;
use crate::message::{ConversationMessage, SearchHit};
use crate::text;
use regex::Regex;
use std::collections::HashMap;
/// Builds a [`SearchHit`] for `msg` if `regex` matches its searchable text.
///
/// The hit's score is the number of matches `regex.find_iter` yields over the
/// text returned by `display::searchable_text`; the hit's text is a snippet
/// produced by `text::line_snippet_regex`. Returns `None` when there is no
/// match at all.
#[allow(clippy::cast_precision_loss)]
fn build_regex_hit(msg: &ConversationMessage, regex: &Regex) -> Option<SearchHit> {
    let haystack = display::searchable_text(msg);
    let occurrences = regex.find_iter(&haystack).count();

    // The closure only runs for matching messages, so non-matching ones never
    // pay for the role label, snippet or file list.
    (occurrences != 0).then(|| {
        let role = msg.role_label();
        let text = text::line_snippet_regex(&haystack, regex, 2);
        let files = display::message_files(msg);
        SearchHit {
            entry_id: msg.entry_id.clone(),
            score: occurrences as f64,
            role,
            text,
            files,
        }
    })
}
/// Sorts `hits` by descending score and returns one page of them together
/// with the total number of hits.
///
/// * `page` is 1-based; `0` is treated the same as `1`.
/// * `page_size` is the maximum number of hits on a page; `0` yields an
///   empty page.
///
/// A page that starts past the end of the results is returned as an empty
/// vector. The second tuple element is always the full hit count, regardless
/// of which page was requested.
fn sort_and_page(
    mut hits: Vec<SearchHit>,
    page: usize,
    page_size: usize,
) -> (Vec<SearchHit>, usize) {
    // `sort_by` is stable, so hits with equal scores keep their incoming
    // order. `total_cmp` gives the sort a genuine total order even if a NaN
    // score ever shows up; finite, non-zero scores are ordered exactly as
    // `partial_cmp` would order them.
    hits.sort_by(|a, b| b.score.total_cmp(&a.score));

    let total = hits.len();
    // Saturate instead of overflowing: an absurdly large page number simply
    // lands past the end of the results and produces an empty page.
    let start = page.saturating_sub(1).saturating_mul(page_size);

    // Move the selected hits out of the vector rather than cloning a slice.
    let page_hits: Vec<SearchHit> = hits.into_iter().skip(start).take(page_size).collect();
    (page_hits, total)
}
/// Returns one [`SearchHit`] for every message whose searchable text matches
/// `pattern`.
///
/// Hits come back in the same order as `messages`; they are neither sorted
/// by score nor paged. If `text::make_regex` cannot produce a regex for
/// `pattern`, the result is an empty vector.
pub fn grep(messages: &[ConversationMessage], pattern: &str) -> Vec<SearchHit> {
    let Some(regex) = text::make_regex(pattern) else {
        return Vec::new();
    };
    messages
        .iter()
        .filter_map(|msg| build_regex_hit(msg, &regex))
        .collect()
}
#[allow(clippy::cast_precision_loss)]
pub fn query(
messages: &[ConversationMessage],
query_str: &str,
page: usize,
page_size: usize,
) -> (Vec<SearchHit>, usize) {
if text::looks_like_regex(query_str) {
let Some(regex) = text::make_regex(query_str) else {
return (Vec::new(), 0);
};
let hits: Vec<SearchHit> = messages
.iter()
.filter_map(|msg| build_regex_hit(msg, ®ex))
.collect();
sort_and_page(hits, page, page_size)
} else {
let words = text::split_words(query_str);
let terms: Vec<String> = words
.into_iter()
.filter(|w| w.len() > 1 && !text::is_stop_word(w))
.collect();
if terms.is_empty() {
return (Vec::new(), 0);
}
let n = messages.len() as f64;
let mut df: HashMap<String, usize> = HashMap::new();
let mut tokenized: Vec<Vec<String>> = Vec::with_capacity(messages.len());
for msg in messages {
let searchable = display::searchable_text(msg);
let msg_words: Vec<String> = text::split_words(&searchable)
.into_iter()
.map(|w| w.to_ascii_lowercase())
.collect();
for term in &terms {
if msg_words.contains(term) {
*df.entry(term.clone()).or_insert(0) += 1;
}
}
tokenized.push(msg_words);
}
let hits: Vec<SearchHit> = messages
.iter()
.enumerate()
.filter_map(|(i, msg)| {
let msg_words = &tokenized[i];
let mut score = 0.0;
for term in &terms {
let tf = msg_words.iter().filter(|w| w.as_str() == term).count();
if tf == 0 {
continue;
}
let df_val = *df.get(term).unwrap_or(&1) as f64;
let idf = ((n + 1.0) / (df_val + 1.0)).ln() + 1.0;
let tf_score = 1.0 + (tf as f64).ln();
score += tf_score * idf;
}
if score == 0.0 {
return None;
}
let role = msg.role_label();
let searchable = display::searchable_text(msg);
let snippet = text::line_snippet_terms(&searchable, &terms, 2);
let files = display::message_files(msg);
Some(SearchHit {
entry_id: msg.entry_id.clone(),
score,
role,
text: snippet,
files,
})
})
.collect();
sort_and_page(hits, page, page_size)
}
}