1use crate::search::config::{
31 DEFAULT_FUZZY_DISTANCE, DEFAULT_SEARCH_LIMIT, FUZZY_TRANSPOSE_COST_ONE, MAX_QUERY_LENGTH,
32};
33use crate::search::indexer::SearchIndexer;
34use anyhow::{Context, Result};
35use rmcp::schemars;
36use schemars::JsonSchema;
37use serde::{Deserialize, Serialize};
38use tantivy::{
39 Index, TantivyDocument, Term,
40 collector::TopDocs,
41 query::{BooleanQuery, FuzzyTermQuery, Occur, Query, QueryParser, TermQuery},
42 schema::{Field, Value},
43};
44
45pub struct FuzzySearcher {
47 index: Index,
48 query_parser: QueryParser,
49 fields: FuzzySearchFields,
50}
51
52#[derive(Debug, Clone)]
53struct FuzzySearchFields {
54 name: Field,
55 docs: Field,
56 path: Field,
57 kind: Field,
58 crate_name: Field,
59 version: Field,
60 item_id: Field,
61 visibility: Field,
62 member: Field,
63}
64
65#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
66pub struct FuzzySearchOptions {
67 #[schemars(description = "Enable fuzzy matching for typo tolerance")]
68 pub fuzzy_enabled: bool,
69 #[schemars(description = "Edit distance for fuzzy matching (0-2)")]
70 pub fuzzy_distance: u8,
71 #[schemars(description = "Maximum number of results to return")]
72 pub limit: usize,
73 #[schemars(description = "Filter by item kind")]
74 pub kind_filter: Option<String>,
75 #[schemars(description = "Filter by crate name")]
76 pub crate_filter: Option<String>,
77 #[schemars(description = "Filter by workspace member")]
78 pub member_filter: Option<String>,
79}
80
81impl Default for FuzzySearchOptions {
82 fn default() -> Self {
83 Self {
84 fuzzy_enabled: true,
85 fuzzy_distance: DEFAULT_FUZZY_DISTANCE,
86 limit: DEFAULT_SEARCH_LIMIT,
87 kind_filter: None,
88 crate_filter: None,
89 member_filter: None,
90 }
91 }
92}
93
94#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
95pub struct SearchResult {
96 #[schemars(description = "Relevance score")]
97 pub score: f32,
98 #[schemars(description = "Item ID")]
99 pub item_id: u32,
100 #[schemars(description = "Item name")]
101 pub name: String,
102 #[schemars(description = "Item path")]
103 pub path: String,
104 #[schemars(description = "Item kind")]
105 pub kind: String,
106 #[schemars(description = "Crate name")]
107 pub crate_name: String,
108 #[schemars(description = "Crate version")]
109 pub version: String,
110 #[schemars(description = "Item visibility")]
111 pub visibility: String,
112 #[schemars(description = "Workspace member name (if applicable)")]
113 pub member: Option<String>,
114}
115
116impl FuzzySearcher {
117 pub fn from_indexer(indexer: &SearchIndexer) -> Result<Self> {
119 let index = indexer.get_index().clone();
120
121 let fields = FuzzySearchFields {
122 name: indexer.get_name_field(),
123 docs: indexer.get_docs_field(),
124 path: indexer.get_path_field(),
125 kind: indexer.get_kind_field(),
126 crate_name: indexer.get_crate_name_field(),
127 version: indexer.get_version_field(),
128 item_id: indexer.get_item_id_field(),
129 visibility: indexer.get_visibility_field(),
130 member: indexer.get_member_field(),
131 };
132
133 let query_parser =
135 QueryParser::for_index(&index, vec![fields.name, fields.docs, fields.path]);
136
137 Ok(Self {
138 index,
139 query_parser,
140 fields,
141 })
142 }
143
144 pub fn search(&self, query: &str, options: &FuzzySearchOptions) -> Result<Vec<SearchResult>> {
146 if query.len() > MAX_QUERY_LENGTH {
148 return Err(anyhow::anyhow!(
149 "Query too long (max {} characters)",
150 MAX_QUERY_LENGTH
151 ));
152 }
153
154 let sanitized_query = Self::sanitize_query(query);
156
157 let reader = self.index.reader()?;
158 let searcher = reader.searcher();
159
160 let search_query = if options.fuzzy_enabled {
162 self.build_fuzzy_query(&sanitized_query, options)?
163 } else {
164 self.build_standard_query(&sanitized_query, options)?
165 };
166
167 let top_docs = searcher.search(&search_query, &TopDocs::with_limit(options.limit))?;
169
170 let mut results = Vec::new();
172 for (score, doc_address) in top_docs {
173 let doc = searcher.doc(doc_address)?;
174 if let Some(result) = self.doc_to_search_result(&doc, score)? {
175 if self.matches_filters(&result, options) {
177 results.push(result);
178 }
179 }
180 }
181
182 Ok(results)
183 }
184
185 fn build_fuzzy_query(
187 &self,
188 query: &str,
189 options: &FuzzySearchOptions,
190 ) -> Result<Box<dyn Query>> {
191 let terms: Vec<&str> = query.split_whitespace().collect();
193
194 let mut main_clauses = Vec::new();
195
196 for term in terms {
197 let mut term_clauses = Vec::new();
199
200 for field in &[self.fields.name, self.fields.docs, self.fields.path] {
202 let fuzzy_query = FuzzyTermQuery::new(
203 Term::from_field_text(*field, term),
204 options.fuzzy_distance,
205 FUZZY_TRANSPOSE_COST_ONE,
206 );
207 term_clauses.push((Occur::Should, Box::new(fuzzy_query) as Box<dyn Query>));
208 }
209
210 let term_query = BooleanQuery::new(term_clauses);
212 main_clauses.push((Occur::Should, Box::new(term_query) as Box<dyn Query>));
213 }
214
215 if let Some(crate_name) = &options.crate_filter {
217 let crate_term = Term::from_field_text(self.fields.crate_name, crate_name);
218 let crate_query = TermQuery::new(crate_term, tantivy::schema::IndexRecordOption::Basic);
219 main_clauses.push((Occur::Must, Box::new(crate_query) as Box<dyn Query>));
220 }
221
222 if let Some(member_name) = &options.member_filter {
224 let member_term = Term::from_field_text(self.fields.member, member_name);
225 let member_query =
226 TermQuery::new(member_term, tantivy::schema::IndexRecordOption::Basic);
227 main_clauses.push((Occur::Must, Box::new(member_query) as Box<dyn Query>));
228 }
229
230 let boolean_query = BooleanQuery::new(main_clauses);
231 Ok(Box::new(boolean_query))
232 }
233
234 fn build_standard_query(
236 &self,
237 query: &str,
238 options: &FuzzySearchOptions,
239 ) -> Result<Box<dyn Query>> {
240 let mut clauses = Vec::new();
241
242 let parsed_query = self
244 .query_parser
245 .parse_query(query)
246 .with_context(|| format!("Failed to parse query: {query}"))?;
247 clauses.push((Occur::Must, parsed_query));
248
249 if let Some(crate_name) = &options.crate_filter {
251 let crate_term = Term::from_field_text(self.fields.crate_name, crate_name);
252 let crate_query = TermQuery::new(crate_term, tantivy::schema::IndexRecordOption::Basic);
253 clauses.push((Occur::Must, Box::new(crate_query) as Box<dyn Query>));
254 }
255
256 if let Some(member_name) = &options.member_filter {
258 let member_term = Term::from_field_text(self.fields.member, member_name);
259 let member_query =
260 TermQuery::new(member_term, tantivy::schema::IndexRecordOption::Basic);
261 clauses.push((Occur::Must, Box::new(member_query) as Box<dyn Query>));
262 }
263
264 let boolean_query = BooleanQuery::new(clauses);
265 Ok(Box::new(boolean_query))
266 }
267
268 fn doc_to_search_result(
270 &self,
271 doc: &TantivyDocument,
272 score: f32,
273 ) -> Result<Option<SearchResult>> {
274 let get_text_field = |field: Field| -> Option<String> {
275 doc.get_first(field)?.as_str().map(|s| s.to_string())
276 };
277
278 let get_u64_field = |field: Field| -> Option<u64> { doc.get_first(field)?.as_u64() };
279
280 let item_id = get_u64_field(self.fields.item_id)
281 .ok_or_else(|| anyhow::anyhow!("Missing item_id"))? as u32;
282 let name =
283 get_text_field(self.fields.name).ok_or_else(|| anyhow::anyhow!("Missing name"))?;
284 let path =
285 get_text_field(self.fields.path).ok_or_else(|| anyhow::anyhow!("Missing path"))?;
286 let kind =
287 get_text_field(self.fields.kind).ok_or_else(|| anyhow::anyhow!("Missing kind"))?;
288 let crate_name = get_text_field(self.fields.crate_name)
289 .ok_or_else(|| anyhow::anyhow!("Missing crate_name"))?;
290 let version = get_text_field(self.fields.version)
291 .ok_or_else(|| anyhow::anyhow!("Missing version"))?;
292 let visibility = get_text_field(self.fields.visibility).unwrap_or_default();
293 let member = get_text_field(self.fields.member);
294
295 Ok(Some(SearchResult {
296 score,
297 item_id,
298 name,
299 path,
300 kind,
301 crate_name,
302 version,
303 visibility,
304 member,
305 }))
306 }
307
308 fn matches_filters(&self, result: &SearchResult, options: &FuzzySearchOptions) -> bool {
310 if let Some(kind_filter) = &options.kind_filter
311 && result.kind != *kind_filter
312 {
313 return false;
314 }
315
316 true
317 }
318
319 fn sanitize_query(query: &str) -> String {
321 query
324 .chars()
325 .map(|c| match c {
326 '+' | '-' | '!' | '(' | ')' | '{' | '}' | '[' | ']' | '^' | '"' | '~' | '*'
327 | '?' | ':' | '\\' | '/' => format!("\\{c}"),
328 _ => c.to_string(),
329 })
330 .collect()
331 }
332}
333
334#[cfg(test)]
335mod tests {
336 use super::*;
337 use crate::search::indexer::SearchIndexer;
338 use tempfile::TempDir;
339
340 #[test]
341 fn test_sanitize_query() {
342 assert_eq!(FuzzySearcher::sanitize_query("hello world"), "hello world");
343 assert_eq!(FuzzySearcher::sanitize_query("test+query"), "test\\+query");
344 assert_eq!(FuzzySearcher::sanitize_query("(test)"), "\\(test\\)");
345 assert_eq!(
346 FuzzySearcher::sanitize_query("wild*card?"),
347 "wild\\*card\\?"
348 );
349 assert_eq!(
350 FuzzySearcher::sanitize_query("path/to/file"),
351 "path\\/to\\/file"
352 );
353 }
354
355 #[test]
356 fn test_fuzzy_search_options_default() {
357 let options = FuzzySearchOptions::default();
358 assert!(options.fuzzy_enabled);
359 assert_eq!(options.fuzzy_distance, 1);
360 assert_eq!(options.limit, 50);
361 assert!(options.kind_filter.is_none());
362 assert!(options.crate_filter.is_none());
363 assert!(options.member_filter.is_none());
364 }
365
366 #[test]
367 fn test_search_query_validation() {
368 let temp_dir = TempDir::new().expect("Failed to create temporary directory for test");
369 let index_path = temp_dir.path().join("test_index");
370 let indexer = SearchIndexer::new_at_path(&index_path)
371 .expect("Failed to create search indexer for test");
372 let fuzzy_searcher = FuzzySearcher::from_indexer(&indexer)
373 .expect("Failed to create fuzzy searcher for test");
374
375 let long_query = "a".repeat(1001);
377 let options = FuzzySearchOptions::default();
378 let result = fuzzy_searcher.search(&long_query, &options);
379 assert!(result.is_err());
380 assert!(
381 result
382 .expect_err("Expected error for query length validation")
383 .to_string()
384 .contains("Query too long")
385 );
386 }
387}