Skip to main content

cli_engine/
search.rs

1use std::collections::{BTreeMap, BTreeSet};
2
3use serde::Serialize;
4
5/// Searchable document contributed by commands, guides, or applications.
6#[derive(Clone, Debug, PartialEq)]
7pub struct SearchDocument {
8    /// Stable document id.
9    pub id: String,
10    /// Document kind, such as `command` or `guide`.
11    pub kind: String,
12    /// Search result title.
13    pub title: String,
14    /// Short snippet shown in search results.
15    pub summary: String,
16    /// Full searchable content.
17    pub content: String,
18}
19
20impl SearchDocument {
21    /// Creates a searchable document with title text as the default content.
22    #[must_use]
23    pub fn new(id: impl Into<String>, kind: impl Into<String>, title: impl Into<String>) -> Self {
24        let title = title.into();
25        Self {
26            id: id.into(),
27            kind: kind.into(),
28            summary: title.clone(),
29            content: title.clone(),
30            title,
31        }
32    }
33
34    /// Sets the short snippet shown in search results.
35    #[must_use]
36    pub fn with_summary(mut self, summary: impl Into<String>) -> Self {
37        self.summary = summary.into();
38        self
39    }
40
41    /// Sets the full searchable content.
42    #[must_use]
43    pub fn with_content(mut self, content: impl Into<String>) -> Self {
44        self.content = content.into();
45        self
46    }
47}
48
49/// Ranked search hit rendered by `--search`.
50#[derive(Clone, Debug, PartialEq, Serialize)]
51pub struct SearchResult {
52    /// Command path or guide title.
53    pub command: String,
54    /// Short search snippet.
55    pub snippet: String,
56    /// Cosine-similarity confidence score.
57    pub confidence: f64,
58}
59
60/// Small TF-IDF search index for command and guide discovery.
61#[derive(Clone, Debug)]
62pub struct SearchIndex {
63    docs: Vec<SearchDocument>,
64    tfidf: Vec<BTreeMap<String, f64>>,
65    idf: BTreeMap<String, f64>,
66}
67
68impl SearchIndex {
69    /// Builds a search index from documents.
70    #[must_use]
71    pub fn new(docs: Vec<SearchDocument>) -> Self {
72        let mut doc_terms = Vec::with_capacity(docs.len());
73        let mut df = BTreeMap::<String, usize>::new();
74        for doc in &docs {
75            let tokens = tokenize(&doc.content);
76            let mut tf = BTreeMap::<String, f64>::new();
77            for token in &tokens {
78                *tf.entry(token.clone()).or_default() += 1.0;
79            }
80            let len = tokens.len() as f64;
81            if len > 0.0 {
82                for value in tf.values_mut() {
83                    *value /= len;
84                }
85            }
86            let mut seen = BTreeSet::new();
87            for token in tokens {
88                if seen.insert(token.clone()) {
89                    *df.entry(token).or_default() += 1;
90                }
91            }
92            doc_terms.push(tf);
93        }
94        let doc_count = docs.len() as f64;
95        let idf = df
96            .into_iter()
97            .map(|(term, count)| (term, (1.0 + doc_count / count as f64).ln()))
98            .collect::<BTreeMap<_, _>>();
99        let tfidf = doc_terms
100            .into_iter()
101            .map(|tf| {
102                tf.into_iter()
103                    .map(|(term, freq)| {
104                        let weighted = freq * idf.get(&term).copied().unwrap_or_default();
105                        (term, weighted)
106                    })
107                    .collect()
108            })
109            .collect();
110        Self { docs, tfidf, idf }
111    }
112
113    /// Searches the index and returns up to `top_k` ranked hits.
114    #[must_use]
115    pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
116        let tokens = tokenize(query);
117        if tokens.is_empty() {
118            return Vec::new();
119        }
120        let mut qtf = BTreeMap::<String, f64>::new();
121        for token in &tokens {
122            *qtf.entry(token.clone()).or_default() += 1.0;
123        }
124        let len = tokens.len() as f64;
125        let qvec = qtf
126            .into_iter()
127            .filter_map(|(term, freq)| self.idf.get(&term).map(|idf| (term, (freq / len) * idf)))
128            .collect::<BTreeMap<_, _>>();
129        if qvec.is_empty() {
130            return Vec::new();
131        }
132        let mut results = self
133            .tfidf
134            .iter()
135            .enumerate()
136            .filter_map(|(index, dvec)| {
137                let score = cosine(&qvec, dvec);
138                (score > 0.0).then(|| SearchResult {
139                    command: self.docs[index].title.clone(),
140                    snippet: self.docs[index].summary.clone(),
141                    confidence: score,
142                })
143            })
144            .collect::<Vec<_>>();
145        results.sort_by(|left, right| {
146            right
147                .confidence
148                .partial_cmp(&left.confidence)
149                .unwrap_or(std::cmp::Ordering::Equal)
150        });
151        if top_k > 0 && results.len() > top_k {
152            results.truncate(top_k);
153        }
154        results
155    }
156}
157
158/// Tokenizes text with the same stemming and stop-word behavior used by search.
159#[must_use]
160pub fn tokenize(text: &str) -> Vec<String> {
161    text.to_lowercase()
162        .split(|ch: char| !ch.is_alphanumeric())
163        .filter(|word| word.len() >= 2 && !is_stop_word(word))
164        .map(stem)
165        .collect()
166}
167
168fn cosine(a: &BTreeMap<String, f64>, b: &BTreeMap<String, f64>) -> f64 {
169    let dot = a
170        .iter()
171        .filter_map(|(key, a_value)| b.get(key).map(|b_value| a_value * b_value))
172        .sum::<f64>();
173    let norm_a = a.values().map(|value| value * value).sum::<f64>();
174    let norm_b = b.values().map(|value| value * value).sum::<f64>();
175    let denom = norm_a.sqrt() * norm_b.sqrt();
176    if denom == 0.0 { 0.0 } else { dot / denom }
177}
178
179fn stem(word: &str) -> String {
180    for suffix in [
181        "tion", "sion", "ment", "ness", "ing", "ous", "ive", "ful", "ed", "ly", "er", "es",
182    ] {
183        if word.len() > suffix.len() + 2 && word.ends_with(suffix) {
184            return word[..word.len() - suffix.len()].to_owned();
185        }
186    }
187    if word.len() > 3 && word.ends_with('s') {
188        return word[..word.len() - 1].to_owned();
189    }
190    word.to_owned()
191}
192
193fn is_stop_word(word: &str) -> bool {
194    matches!(
195        word,
196        "the"
197            | "be"
198            | "to"
199            | "of"
200            | "and"
201            | "in"
202            | "that"
203            | "have"
204            | "it"
205            | "for"
206            | "not"
207            | "on"
208            | "with"
209            | "he"
210            | "as"
211            | "you"
212            | "do"
213            | "at"
214            | "this"
215            | "but"
216            | "his"
217            | "by"
218            | "from"
219            | "they"
220            | "we"
221            | "her"
222            | "she"
223            | "or"
224            | "an"
225            | "will"
226            | "my"
227            | "one"
228            | "all"
229            | "would"
230            | "there"
231            | "their"
232            | "what"
233            | "so"
234            | "up"
235            | "if"
236            | "about"
237            | "who"
238            | "which"
239            | "them"
240            | "then"
241            | "its"
242            | "also"
243            | "into"
244            | "could"
245            | "than"
246            | "other"
247            | "how"
248            | "has"
249            | "more"
250            | "these"
251            | "was"
252            | "are"
253            | "is"
254            | "am"
255            | "been"
256    )
257}