use std::collections::{BTreeMap, BTreeSet};
use serde::Serialize;
#[derive(Clone, Debug, PartialEq)]
pub struct SearchDocument {
pub id: String,
pub kind: String,
pub title: String,
pub summary: String,
pub content: String,
}
impl SearchDocument {
#[must_use]
pub fn new(id: impl Into<String>, kind: impl Into<String>, title: impl Into<String>) -> Self {
let title = title.into();
Self {
id: id.into(),
kind: kind.into(),
summary: title.clone(),
content: title.clone(),
title,
}
}
#[must_use]
pub fn with_summary(mut self, summary: impl Into<String>) -> Self {
self.summary = summary.into();
self
}
#[must_use]
pub fn with_content(mut self, content: impl Into<String>) -> Self {
self.content = content.into();
self
}
}
#[derive(Clone, Debug, PartialEq, Serialize)]
pub struct SearchResult {
pub command: String,
pub snippet: String,
pub confidence: f64,
}
#[derive(Clone, Debug)]
pub struct SearchIndex {
docs: Vec<SearchDocument>,
tfidf: Vec<BTreeMap<String, f64>>,
idf: BTreeMap<String, f64>,
}
impl SearchIndex {
#[must_use]
pub fn new(docs: Vec<SearchDocument>) -> Self {
let mut doc_terms = Vec::with_capacity(docs.len());
let mut df = BTreeMap::<String, usize>::new();
for doc in &docs {
let tokens = tokenize(&doc.content);
let mut tf = BTreeMap::<String, f64>::new();
for token in &tokens {
*tf.entry(token.clone()).or_default() += 1.0;
}
let len = tokens.len() as f64;
if len > 0.0 {
for value in tf.values_mut() {
*value /= len;
}
}
let mut seen = BTreeSet::new();
for token in tokens {
if seen.insert(token.clone()) {
*df.entry(token).or_default() += 1;
}
}
doc_terms.push(tf);
}
let doc_count = docs.len() as f64;
let idf = df
.into_iter()
.map(|(term, count)| (term, (1.0 + doc_count / count as f64).ln()))
.collect::<BTreeMap<_, _>>();
let tfidf = doc_terms
.into_iter()
.map(|tf| {
tf.into_iter()
.map(|(term, freq)| {
let weighted = freq * idf.get(&term).copied().unwrap_or_default();
(term, weighted)
})
.collect()
})
.collect();
Self { docs, tfidf, idf }
}
#[must_use]
pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
let tokens = tokenize(query);
if tokens.is_empty() {
return Vec::new();
}
let mut qtf = BTreeMap::<String, f64>::new();
for token in &tokens {
*qtf.entry(token.clone()).or_default() += 1.0;
}
let len = tokens.len() as f64;
let qvec = qtf
.into_iter()
.filter_map(|(term, freq)| self.idf.get(&term).map(|idf| (term, (freq / len) * idf)))
.collect::<BTreeMap<_, _>>();
if qvec.is_empty() {
return Vec::new();
}
let mut results = self
.tfidf
.iter()
.enumerate()
.filter_map(|(index, dvec)| {
let score = cosine(&qvec, dvec);
(score > 0.0).then(|| SearchResult {
command: self.docs[index].title.clone(),
snippet: self.docs[index].summary.clone(),
confidence: score,
})
})
.collect::<Vec<_>>();
results.sort_by(|left, right| {
right
.confidence
.partial_cmp(&left.confidence)
.unwrap_or(std::cmp::Ordering::Equal)
});
if top_k > 0 && results.len() > top_k {
results.truncate(top_k);
}
results
}
}
#[must_use]
pub fn tokenize(text: &str) -> Vec<String> {
text.to_lowercase()
.split(|ch: char| !ch.is_alphanumeric())
.filter(|word| word.len() >= 2 && !is_stop_word(word))
.map(stem)
.collect()
}
fn cosine(a: &BTreeMap<String, f64>, b: &BTreeMap<String, f64>) -> f64 {
let dot = a
.iter()
.filter_map(|(key, a_value)| b.get(key).map(|b_value| a_value * b_value))
.sum::<f64>();
let norm_a = a.values().map(|value| value * value).sum::<f64>();
let norm_b = b.values().map(|value| value * value).sum::<f64>();
let denom = norm_a.sqrt() * norm_b.sqrt();
if denom == 0.0 { 0.0 } else { dot / denom }
}
fn stem(word: &str) -> String {
for suffix in [
"tion", "sion", "ment", "ness", "ing", "ous", "ive", "ful", "ed", "ly", "er", "es",
] {
if word.len() > suffix.len() + 2 && word.ends_with(suffix) {
return word[..word.len() - suffix.len()].to_owned();
}
}
if word.len() > 3 && word.ends_with('s') {
return word[..word.len() - 1].to_owned();
}
word.to_owned()
}
fn is_stop_word(word: &str) -> bool {
matches!(
word,
"the"
| "be"
| "to"
| "of"
| "and"
| "in"
| "that"
| "have"
| "it"
| "for"
| "not"
| "on"
| "with"
| "he"
| "as"
| "you"
| "do"
| "at"
| "this"
| "but"
| "his"
| "by"
| "from"
| "they"
| "we"
| "her"
| "she"
| "or"
| "an"
| "will"
| "my"
| "one"
| "all"
| "would"
| "there"
| "their"
| "what"
| "so"
| "up"
| "if"
| "about"
| "who"
| "which"
| "them"
| "then"
| "its"
| "also"
| "into"
| "could"
| "than"
| "other"
| "how"
| "has"
| "more"
| "these"
| "was"
| "are"
| "is"
| "am"
| "been"
)
}