rust_docs_mcp/search/
fuzzy.rs

1//! # Fuzzy Search Module
2//!
3//! Provides fuzzy search capabilities with typo tolerance using Tantivy.
4//!
5//! ## Key Components
6//! - [`FuzzySearcher`] - Main searcher with fuzzy and standard search modes
7//! - [`FuzzySearchOptions`] - Configuration for search behavior
8//! - [`SearchResult`] - Structure containing search result information
9//!
10//! ## Example
11//! ```no_run
12//! # use rust_docs_mcp::search::fuzzy::{FuzzySearcher, FuzzySearchOptions};
13//! # use rust_docs_mcp::search::indexer::SearchIndexer;
14//! # use rust_docs_mcp::cache::storage::CacheStorage;
15//! # use anyhow::Result;
16//! # fn main() -> Result<()> {
17//! let storage = CacheStorage::new(None)?;
18//! let indexer = SearchIndexer::new_for_crate("tokio", "1.35.0", &storage, None)?;
19//! let searcher = FuzzySearcher::from_indexer(&indexer)?;
20//! let options = FuzzySearchOptions {
21//!     fuzzy_enabled: true,
22//!     fuzzy_distance: 1,
23//!     ..Default::default()
24//! };
25//! let results = searcher.search("Vec", &options)?;
26//! # Ok(())
27//! # }
28//! ```
29
30use crate::search::config::{
31    DEFAULT_FUZZY_DISTANCE, DEFAULT_SEARCH_LIMIT, FUZZY_TRANSPOSE_COST_ONE, MAX_QUERY_LENGTH,
32};
33use crate::search::indexer::SearchIndexer;
34use anyhow::{Context, Result};
35use rmcp::schemars;
36use schemars::JsonSchema;
37use serde::{Deserialize, Serialize};
38use tantivy::{
39    Index, TantivyDocument, Term,
40    collector::TopDocs,
41    query::{BooleanQuery, FuzzyTermQuery, Occur, Query, QueryParser, TermQuery},
42    schema::{Field, Value},
43};
44
/// Fuzzy search implementation using Tantivy
///
/// Built from a [`SearchIndexer`] via [`FuzzySearcher::from_indexer`]. It keeps
/// its own clone of the indexer's [`Index`] handle together with everything
/// needed to build queries against it.
pub struct FuzzySearcher {
    /// Index handle cloned from the indexer; a reader is opened from it on every search.
    index: Index,
    /// Parser used by the non-fuzzy search path; its default fields are `name`, `docs` and `path`.
    query_parser: QueryParser,
    /// Schema field handles used to build queries and to read values back from hits.
    fields: FuzzySearchFields,
}
51
/// Schema field handles copied out of the [`SearchIndexer`] when a searcher is built.
///
/// `name`, `docs` and `path` are the fields the search text is matched against;
/// the remaining fields are only used for filtering and/or for populating
/// [`SearchResult`] values.
#[derive(Debug, Clone)]
struct FuzzySearchFields {
    /// Item name: matched against the search text and returned in results.
    name: Field,
    /// Item documentation: matched against the search text (not returned in results).
    docs: Field,
    /// Item path: matched against the search text and returned in results.
    path: Field,
    /// Item kind: returned in results and compared with `kind_filter` after the search.
    kind: Field,
    /// Crate name: exact-term target of `crate_filter` and returned in results.
    crate_name: Field,
    /// Crate version: returned in results.
    version: Field,
    /// Item id: read back from hits as an unsigned integer.
    item_id: Field,
    /// Item visibility: returned in results (empty string when a hit has no value).
    visibility: Field,
    /// Workspace member: exact-term target of `member_filter`; optional in results.
    member: Field,
}
64
// Options controlling a single `FuzzySearcher::search` call.
//
// NOTE(review): these are deliberately plain `//` comments. schemars can pick up
// `///` doc comments as schema text, so adding doc comments here could change the
// generated JSON schema; the per-field `#[schemars(description = ...)]` attributes
// remain the published descriptions.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct FuzzySearchOptions {
    // Selects the fuzzy term-query path (true) or the query-parser path (false).
    #[schemars(description = "Enable fuzzy matching for typo tolerance")]
    pub fuzzy_enabled: bool,
    // Edit distance handed to each fuzzy term query; only read on the fuzzy path.
    #[schemars(description = "Edit distance for fuzzy matching (0-2)")]
    pub fuzzy_distance: u8,
    // Number of top-scoring hits fetched from the index. `kind_filter` is applied
    // after this cut, so a search may return fewer than `limit` results.
    #[schemars(description = "Maximum number of results to return")]
    pub limit: usize,
    // Exact, case-sensitive comparison against each hit's kind, done after the search.
    #[schemars(description = "Filter by item kind")]
    pub kind_filter: Option<String>,
    // Added to the query as a required exact-term clause on the crate-name field.
    #[schemars(description = "Filter by crate name")]
    pub crate_filter: Option<String>,
    // Added to the query as a required exact-term clause on the member field.
    #[schemars(description = "Filter by workspace member")]
    pub member_filter: Option<String>,
}
80
81impl Default for FuzzySearchOptions {
82    fn default() -> Self {
83        Self {
84            fuzzy_enabled: true,
85            fuzzy_distance: DEFAULT_FUZZY_DISTANCE,
86            limit: DEFAULT_SEARCH_LIMIT,
87            kind_filter: None,
88            crate_filter: None,
89            member_filter: None,
90        }
91    }
92}
93
// One hit returned by `FuzzySearcher::search`, assembled from the values read
// back from the matching index document.
//
// NOTE(review): plain `//` comments on purpose. schemars can pick up `///` doc
// comments as schema text, so doc comments here could alter the generated schema.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SearchResult {
    // Score reported by the index for this hit.
    #[schemars(description = "Relevance score")]
    pub score: f32,
    // Read from the index as a u64 and narrowed to u32.
    #[schemars(description = "Item ID")]
    pub item_id: u32,
    #[schemars(description = "Item name")]
    pub name: String,
    #[schemars(description = "Item path")]
    pub path: String,
    // Compared verbatim against `FuzzySearchOptions::kind_filter`.
    #[schemars(description = "Item kind")]
    pub kind: String,
    #[schemars(description = "Crate name")]
    pub crate_name: String,
    #[schemars(description = "Crate version")]
    pub version: String,
    // Empty string when the indexed document carries no visibility value.
    #[schemars(description = "Item visibility")]
    pub visibility: String,
    // `None` when the indexed document carries no member value.
    #[schemars(description = "Workspace member name (if applicable)")]
    pub member: Option<String>,
}
115
116impl FuzzySearcher {
117    /// Create a new fuzzy searcher from an indexer
118    pub fn from_indexer(indexer: &SearchIndexer) -> Result<Self> {
119        let index = indexer.get_index().clone();
120
121        let fields = FuzzySearchFields {
122            name: indexer.get_name_field(),
123            docs: indexer.get_docs_field(),
124            path: indexer.get_path_field(),
125            kind: indexer.get_kind_field(),
126            crate_name: indexer.get_crate_name_field(),
127            version: indexer.get_version_field(),
128            item_id: indexer.get_item_id_field(),
129            visibility: indexer.get_visibility_field(),
130            member: indexer.get_member_field(),
131        };
132
133        // Create query parser for multiple fields
134        let query_parser =
135            QueryParser::for_index(&index, vec![fields.name, fields.docs, fields.path]);
136
137        Ok(Self {
138            index,
139            query_parser,
140            fields,
141        })
142    }
143
144    /// Perform fuzzy search with the given query and options
145    pub fn search(&self, query: &str, options: &FuzzySearchOptions) -> Result<Vec<SearchResult>> {
146        // Validate query length
147        if query.len() > MAX_QUERY_LENGTH {
148            return Err(anyhow::anyhow!(
149                "Query too long (max {} characters)",
150                MAX_QUERY_LENGTH
151            ));
152        }
153
154        // Sanitize query to escape special characters
155        let sanitized_query = Self::sanitize_query(query);
156
157        let reader = self.index.reader()?;
158        let searcher = reader.searcher();
159
160        // Build the query based on options
161        let search_query = if options.fuzzy_enabled {
162            self.build_fuzzy_query(&sanitized_query, options)?
163        } else {
164            self.build_standard_query(&sanitized_query, options)?
165        };
166
167        // Execute search
168        let top_docs = searcher.search(&search_query, &TopDocs::with_limit(options.limit))?;
169
170        // Convert results
171        let mut results = Vec::new();
172        for (score, doc_address) in top_docs {
173            let doc = searcher.doc(doc_address)?;
174            if let Some(result) = self.doc_to_search_result(&doc, score)? {
175                // Apply additional filters
176                if self.matches_filters(&result, options) {
177                    results.push(result);
178                }
179            }
180        }
181
182        Ok(results)
183    }
184
185    /// Build fuzzy query with typo tolerance
186    fn build_fuzzy_query(
187        &self,
188        query: &str,
189        options: &FuzzySearchOptions,
190    ) -> Result<Box<dyn Query>> {
191        // Split query into terms
192        let terms: Vec<&str> = query.split_whitespace().collect();
193
194        let mut main_clauses = Vec::new();
195
196        for term in terms {
197            // Build fuzzy queries for this term across all searchable fields
198            let mut term_clauses = Vec::new();
199
200            // Add fuzzy queries for searchable fields
201            for field in &[self.fields.name, self.fields.docs, self.fields.path] {
202                let fuzzy_query = FuzzyTermQuery::new(
203                    Term::from_field_text(*field, term),
204                    options.fuzzy_distance,
205                    FUZZY_TRANSPOSE_COST_ONE,
206                );
207                term_clauses.push((Occur::Should, Box::new(fuzzy_query) as Box<dyn Query>));
208            }
209
210            // Create a boolean query for this term
211            let term_query = BooleanQuery::new(term_clauses);
212            main_clauses.push((Occur::Should, Box::new(term_query) as Box<dyn Query>));
213        }
214
215        // Add crate filter if specified
216        if let Some(crate_name) = &options.crate_filter {
217            let crate_term = Term::from_field_text(self.fields.crate_name, crate_name);
218            let crate_query = TermQuery::new(crate_term, tantivy::schema::IndexRecordOption::Basic);
219            main_clauses.push((Occur::Must, Box::new(crate_query) as Box<dyn Query>));
220        }
221
222        // Add member filter if specified
223        if let Some(member_name) = &options.member_filter {
224            let member_term = Term::from_field_text(self.fields.member, member_name);
225            let member_query =
226                TermQuery::new(member_term, tantivy::schema::IndexRecordOption::Basic);
227            main_clauses.push((Occur::Must, Box::new(member_query) as Box<dyn Query>));
228        }
229
230        let boolean_query = BooleanQuery::new(main_clauses);
231        Ok(Box::new(boolean_query))
232    }
233
234    /// Build standard query without fuzzy matching
235    fn build_standard_query(
236        &self,
237        query: &str,
238        options: &FuzzySearchOptions,
239    ) -> Result<Box<dyn Query>> {
240        let mut clauses = Vec::new();
241
242        // Parse the query using the query parser
243        let parsed_query = self
244            .query_parser
245            .parse_query(query)
246            .with_context(|| format!("Failed to parse query: {query}"))?;
247        clauses.push((Occur::Must, parsed_query));
248
249        // Add crate filter if specified
250        if let Some(crate_name) = &options.crate_filter {
251            let crate_term = Term::from_field_text(self.fields.crate_name, crate_name);
252            let crate_query = TermQuery::new(crate_term, tantivy::schema::IndexRecordOption::Basic);
253            clauses.push((Occur::Must, Box::new(crate_query) as Box<dyn Query>));
254        }
255
256        // Add member filter if specified
257        if let Some(member_name) = &options.member_filter {
258            let member_term = Term::from_field_text(self.fields.member, member_name);
259            let member_query =
260                TermQuery::new(member_term, tantivy::schema::IndexRecordOption::Basic);
261            clauses.push((Occur::Must, Box::new(member_query) as Box<dyn Query>));
262        }
263
264        let boolean_query = BooleanQuery::new(clauses);
265        Ok(Box::new(boolean_query))
266    }
267
268    /// Convert Tantivy document to SearchResult
269    fn doc_to_search_result(
270        &self,
271        doc: &TantivyDocument,
272        score: f32,
273    ) -> Result<Option<SearchResult>> {
274        let get_text_field = |field: Field| -> Option<String> {
275            doc.get_first(field)?.as_str().map(|s| s.to_string())
276        };
277
278        let get_u64_field = |field: Field| -> Option<u64> { doc.get_first(field)?.as_u64() };
279
280        let item_id = get_u64_field(self.fields.item_id)
281            .ok_or_else(|| anyhow::anyhow!("Missing item_id"))? as u32;
282        let name =
283            get_text_field(self.fields.name).ok_or_else(|| anyhow::anyhow!("Missing name"))?;
284        let path =
285            get_text_field(self.fields.path).ok_or_else(|| anyhow::anyhow!("Missing path"))?;
286        let kind =
287            get_text_field(self.fields.kind).ok_or_else(|| anyhow::anyhow!("Missing kind"))?;
288        let crate_name = get_text_field(self.fields.crate_name)
289            .ok_or_else(|| anyhow::anyhow!("Missing crate_name"))?;
290        let version = get_text_field(self.fields.version)
291            .ok_or_else(|| anyhow::anyhow!("Missing version"))?;
292        let visibility = get_text_field(self.fields.visibility).unwrap_or_default();
293        let member = get_text_field(self.fields.member);
294
295        Ok(Some(SearchResult {
296            score,
297            item_id,
298            name,
299            path,
300            kind,
301            crate_name,
302            version,
303            visibility,
304            member,
305        }))
306    }
307
308    /// Check if result matches additional filters
309    fn matches_filters(&self, result: &SearchResult, options: &FuzzySearchOptions) -> bool {
310        if let Some(kind_filter) = &options.kind_filter
311            && result.kind != *kind_filter
312        {
313            return false;
314        }
315
316        true
317    }
318
319    /// Sanitize query to escape special Tantivy syntax characters
320    fn sanitize_query(query: &str) -> String {
321        // Escape special characters that have meaning in Tantivy query syntax
322        // These include: + - && || ! ( ) { } [ ] ^ " ~ * ? : \ /
323        query
324            .chars()
325            .map(|c| match c {
326                '+' | '-' | '!' | '(' | ')' | '{' | '}' | '[' | ']' | '^' | '"' | '~' | '*'
327                | '?' | ':' | '\\' | '/' => format!("\\{c}"),
328                _ => c.to_string(),
329            })
330            .collect()
331    }
332}
333
#[cfg(test)]
mod tests {
    use super::*;
    use crate::search::config::MAX_QUERY_LENGTH;
    use crate::search::indexer::SearchIndexer;
    use tempfile::TempDir;

    /// Reserved query-syntax characters gain a leading backslash; everything
    /// else passes through unchanged.
    #[test]
    fn test_sanitize_query() {
        assert_eq!(FuzzySearcher::sanitize_query(""), "");
        assert_eq!(FuzzySearcher::sanitize_query("hello world"), "hello world");
        assert_eq!(FuzzySearcher::sanitize_query("test+query"), "test\\+query");
        assert_eq!(FuzzySearcher::sanitize_query("(test)"), "\\(test\\)");
        assert_eq!(
            FuzzySearcher::sanitize_query("wild*card?"),
            "wild\\*card\\?"
        );
        assert_eq!(
            FuzzySearcher::sanitize_query("path/to/file"),
            "path\\/to\\/file"
        );
        assert_eq!(FuzzySearcher::sanitize_query("Vec::new"), "Vec\\:\\:new");
        assert_eq!(FuzzySearcher::sanitize_query("a\\b"), "a\\\\b");
    }

    /// Pins the documented default option values.
    #[test]
    fn test_fuzzy_search_options_default() {
        let options = FuzzySearchOptions::default();
        assert!(options.fuzzy_enabled);
        assert_eq!(options.fuzzy_distance, 1);
        assert_eq!(options.limit, 50);
        assert!(options.kind_filter.is_none());
        assert!(options.crate_filter.is_none());
        assert!(options.member_filter.is_none());
    }

    /// A query one byte over the configured maximum is rejected before any
    /// search is attempted.
    #[test]
    fn test_search_query_validation() {
        let temp_dir = TempDir::new().expect("Failed to create temporary directory for test");
        let index_path = temp_dir.path().join("test_index");
        let indexer = SearchIndexer::new_at_path(&index_path)
            .expect("Failed to create search indexer for test");
        let fuzzy_searcher = FuzzySearcher::from_indexer(&indexer)
            .expect("Failed to create fuzzy searcher for test");

        // Derive the length from the constant so the test keeps exercising the
        // limit if the configured maximum ever changes.
        let long_query = "a".repeat(MAX_QUERY_LENGTH + 1);
        let options = FuzzySearchOptions::default();
        let result = fuzzy_searcher.search(&long_query, &options);
        assert!(result.is_err());
        assert!(
            result
                .expect_err("Expected error for query length validation")
                .to_string()
                .contains("Query too long")
        );
    }
}
387}