1use std::collections::{BTreeMap, BTreeSet};
2
3use serde::Serialize;
4
5#[derive(Clone, Debug, PartialEq)]
7pub struct SearchDocument {
8 pub id: String,
10 pub kind: String,
12 pub title: String,
14 pub summary: String,
16 pub content: String,
18}
19
20impl SearchDocument {
21 #[must_use]
23 pub fn new(id: impl Into<String>, kind: impl Into<String>, title: impl Into<String>) -> Self {
24 let title = title.into();
25 Self {
26 id: id.into(),
27 kind: kind.into(),
28 summary: title.clone(),
29 content: title.clone(),
30 title,
31 }
32 }
33
34 #[must_use]
36 pub fn with_summary(mut self, summary: impl Into<String>) -> Self {
37 self.summary = summary.into();
38 self
39 }
40
41 #[must_use]
43 pub fn with_content(mut self, content: impl Into<String>) -> Self {
44 self.content = content.into();
45 self
46 }
47}
48
49#[derive(Clone, Debug, PartialEq, Serialize)]
51pub struct SearchResult {
52 pub command: String,
54 pub snippet: String,
56 pub confidence: f64,
58}
59
60#[derive(Clone, Debug)]
62pub struct SearchIndex {
63 docs: Vec<SearchDocument>,
64 tfidf: Vec<BTreeMap<String, f64>>,
65 idf: BTreeMap<String, f64>,
66}
67
68impl SearchIndex {
69 #[must_use]
71 pub fn new(docs: Vec<SearchDocument>) -> Self {
72 let mut doc_terms = Vec::with_capacity(docs.len());
73 let mut df = BTreeMap::<String, usize>::new();
74 for doc in &docs {
75 let tokens = tokenize(&doc.content);
76 let mut tf = BTreeMap::<String, f64>::new();
77 for token in &tokens {
78 *tf.entry(token.clone()).or_default() += 1.0;
79 }
80 let len = tokens.len() as f64;
81 if len > 0.0 {
82 for value in tf.values_mut() {
83 *value /= len;
84 }
85 }
86 let mut seen = BTreeSet::new();
87 for token in tokens {
88 if seen.insert(token.clone()) {
89 *df.entry(token).or_default() += 1;
90 }
91 }
92 doc_terms.push(tf);
93 }
94 let doc_count = docs.len() as f64;
95 let idf = df
96 .into_iter()
97 .map(|(term, count)| (term, (1.0 + doc_count / count as f64).ln()))
98 .collect::<BTreeMap<_, _>>();
99 let tfidf = doc_terms
100 .into_iter()
101 .map(|tf| {
102 tf.into_iter()
103 .map(|(term, freq)| {
104 let weighted = freq * idf.get(&term).copied().unwrap_or_default();
105 (term, weighted)
106 })
107 .collect()
108 })
109 .collect();
110 Self { docs, tfidf, idf }
111 }
112
113 #[must_use]
115 pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
116 let tokens = tokenize(query);
117 if tokens.is_empty() {
118 return Vec::new();
119 }
120 let mut qtf = BTreeMap::<String, f64>::new();
121 for token in &tokens {
122 *qtf.entry(token.clone()).or_default() += 1.0;
123 }
124 let len = tokens.len() as f64;
125 let qvec = qtf
126 .into_iter()
127 .filter_map(|(term, freq)| self.idf.get(&term).map(|idf| (term, (freq / len) * idf)))
128 .collect::<BTreeMap<_, _>>();
129 if qvec.is_empty() {
130 return Vec::new();
131 }
132 let mut results = self
133 .tfidf
134 .iter()
135 .enumerate()
136 .filter_map(|(index, dvec)| {
137 let score = cosine(&qvec, dvec);
138 (score > 0.0).then(|| SearchResult {
139 command: self.docs[index].title.clone(),
140 snippet: self.docs[index].summary.clone(),
141 confidence: score,
142 })
143 })
144 .collect::<Vec<_>>();
145 results.sort_by(|left, right| {
146 right
147 .confidence
148 .partial_cmp(&left.confidence)
149 .unwrap_or(std::cmp::Ordering::Equal)
150 });
151 if top_k > 0 && results.len() > top_k {
152 results.truncate(top_k);
153 }
154 results
155 }
156}
157
158#[must_use]
160pub fn tokenize(text: &str) -> Vec<String> {
161 text.to_lowercase()
162 .split(|ch: char| !ch.is_alphanumeric())
163 .filter(|word| word.len() >= 2 && !is_stop_word(word))
164 .map(stem)
165 .collect()
166}
167
168fn cosine(a: &BTreeMap<String, f64>, b: &BTreeMap<String, f64>) -> f64 {
169 let dot = a
170 .iter()
171 .filter_map(|(key, a_value)| b.get(key).map(|b_value| a_value * b_value))
172 .sum::<f64>();
173 let norm_a = a.values().map(|value| value * value).sum::<f64>();
174 let norm_b = b.values().map(|value| value * value).sum::<f64>();
175 let denom = norm_a.sqrt() * norm_b.sqrt();
176 if denom == 0.0 { 0.0 } else { dot / denom }
177}
178
179fn stem(word: &str) -> String {
180 for suffix in [
181 "tion", "sion", "ment", "ness", "ing", "ous", "ive", "ful", "ed", "ly", "er", "es",
182 ] {
183 if word.len() > suffix.len() + 2 && word.ends_with(suffix) {
184 return word[..word.len() - suffix.len()].to_owned();
185 }
186 }
187 if word.len() > 3 && word.ends_with('s') {
188 return word[..word.len() - 1].to_owned();
189 }
190 word.to_owned()
191}
192
193fn is_stop_word(word: &str) -> bool {
194 matches!(
195 word,
196 "the"
197 | "be"
198 | "to"
199 | "of"
200 | "and"
201 | "in"
202 | "that"
203 | "have"
204 | "it"
205 | "for"
206 | "not"
207 | "on"
208 | "with"
209 | "he"
210 | "as"
211 | "you"
212 | "do"
213 | "at"
214 | "this"
215 | "but"
216 | "his"
217 | "by"
218 | "from"
219 | "they"
220 | "we"
221 | "her"
222 | "she"
223 | "or"
224 | "an"
225 | "will"
226 | "my"
227 | "one"
228 | "all"
229 | "would"
230 | "there"
231 | "their"
232 | "what"
233 | "so"
234 | "up"
235 | "if"
236 | "about"
237 | "who"
238 | "which"
239 | "them"
240 | "then"
241 | "its"
242 | "also"
243 | "into"
244 | "could"
245 | "than"
246 | "other"
247 | "how"
248 | "has"
249 | "more"
250 | "these"
251 | "was"
252 | "are"
253 | "is"
254 | "am"
255 | "been"
256 )
257}