1use crate::client::{Context7Client, SearchResult};
2use crate::rag::embeddings::EmbeddingModel;
3use anyhow::Result;
4use fuzzy_matcher::skim::SkimMatcherV2;
5use fuzzy_matcher::FuzzyMatcher;
6use std::sync::Arc;
7
/// A search query decomposed into exact-match phrases and loose terms.
#[derive(Debug, Clone)]
struct ParsedQuery {
    /// Phrases that appeared inside double quotes; matched verbatim with higher weight.
    quoted_phrases: Vec<String>,
    /// Unquoted space-separated terms; matched individually (fuzzy matching allowed).
    individual_terms: Vec<String>,
    /// The raw query string exactly as the caller supplied it.
    original_query: String,
}
14
/// Multi-pass documentation search over a Context7 backend, combining
/// keyword/fuzzy scoring with optional embedding-based semantic scoring.
pub struct SearchEngine {
    /// Backend client used to resolve libraries and fetch documentation.
    client: Context7Client,
    /// Fuzzy matcher used for keyword scoring and near-duplicate title detection.
    matcher: SkimMatcherV2,
    /// Optional shared embedding model; when present, relevance scoring is a
    /// hybrid of embedding similarity and keyword matching.
    embedding_model: Option<Arc<EmbeddingModel>>,
}
20
21impl SearchEngine {
22 pub fn new(client: Context7Client) -> Self {
24 Self {
25 client,
26 matcher: SkimMatcherV2::default(),
27 embedding_model: None,
28 }
29 }
30
31 pub fn with_shared_embeddings(
34 client: Context7Client,
35 embedding_model: Arc<EmbeddingModel>,
36 ) -> Self {
37 log::info!("🔄 Reusing shared embedding model for Context7 search");
38 Self {
39 client,
40 matcher: SkimMatcherV2::default(),
41 embedding_model: Some(embedding_model),
42 }
43 }
44
45 pub fn has_embeddings(&self) -> bool {
47 self.embedding_model.is_some()
48 }
49
    /// Resolves `library` against Context7, runs a multi-pass search for
    /// `query`, and returns `(results, library_title, library_id)`.
    ///
    /// `library` may carry a version suffix (`name@version`); the version is
    /// currently ignored during resolution. When `limit` is `Some(n)` with
    /// `n > 0`, at most `n` results are returned. As a side effect each
    /// result's excerpt is written to the "snippets" cache, best-effort:
    /// cache construction and write errors are deliberately ignored.
    pub async fn search(
        &self,
        library: &str,
        query: &str,
        limit: Option<usize>,
    ) -> Result<(Vec<SearchResult>, String, String)> {
        let (lib_name, _version) = parse_library_spec(library);

        let (library_id, library_title) = self.client.resolve_library(lib_name).await?;

        let parsed_query = self.parse_search_query(query);

        let mut results = self
            .multi_pass_search(&library_id, library, &parsed_query)
            .await?;

        // Apply the caller's cap after ranking so the best results survive.
        if let Some(limit) = limit {
            if limit > 0 && results.len() > limit {
                results.truncate(limit);
            }
        }

        // Best-effort snippet caching keyed by the original library spec plus
        // the result id; failures are intentionally swallowed.
        if let Ok(cache_manager) = crate::cache::CacheManager::new() {
            for result in &results {
                let snippet_cache_key = format!("{}_{}", library, &result.id);
                let _ = cache_manager
                    .set("snippets", &snippet_cache_key, &result.excerpt)
                    .await;
            }
        }

        Ok((results, library_title, library_id))
    }
90
    /// Runs up to two backend search passes and merges the ranked results.
    ///
    /// Pass 1 (phrases) runs only when the query has quoted phrases, using a
    /// phrase-prioritized query string. Pass 2 (terms) runs when there were
    /// no quoted phrases, or when pass 1 produced fewer than 5 results. Note
    /// that `library_id` is the resolved backend identifier while `library`
    /// is the caller's original spec, used only to label results.
    async fn multi_pass_search(
        &self,
        library_id: &str,
        library: &str,
        parsed_query: &ParsedQuery,
    ) -> Result<Vec<SearchResult>> {
        let mut all_results = Vec::new();

        // Pass 1: exact-phrase-priority search.
        if !parsed_query.quoted_phrases.is_empty() {
            let phrase_query = self.build_phrase_priority_query(parsed_query);
            let docs = self
                .client
                .get_documentation(library_id, Some(&phrase_query))
                .await?;

            let phrase_results = self
                .parse_documentation_into_results(
                    library,
                    &parsed_query.original_query,
                    &docs,
                    parsed_query,
                    true, // phrase pass: exact phrase hits weighted higher
                )
                .await?;

            all_results.extend(phrase_results);
        }

        // Pass 2: individual-term search, skipped only when the phrase pass
        // already produced enough (>= 5) results.
        let should_do_term_search = parsed_query.quoted_phrases.is_empty() || all_results.len() < 5;

        if should_do_term_search && !parsed_query.individual_terms.is_empty() {
            let term_query = parsed_query.individual_terms.join(" ");
            let docs = self
                .client
                .get_documentation(library_id, Some(&term_query))
                .await?;

            let term_results = self
                .parse_documentation_into_results(
                    library,
                    &parsed_query.original_query,
                    &docs,
                    parsed_query,
                    false, // term pass: standard weighting
                )
                .await?;

            all_results.extend(term_results);
        }

        // Dedupe near-identical titles, boost phrase hits, and rank.
        let merged_results = self.merge_and_rank_results(all_results, parsed_query);

        Ok(merged_results)
    }
148
149 fn build_phrase_priority_query(&self, parsed_query: &ParsedQuery) -> String {
150 let mut query_parts = Vec::new();
151
152 for phrase in &parsed_query.quoted_phrases {
154 query_parts.push(format!("\"{}\"", phrase));
155 }
156
157 query_parts.extend(parsed_query.individual_terms.clone());
159
160 query_parts.join(" ")
161 }
162
163 fn parse_search_query(&self, query: &str) -> ParsedQuery {
164 let mut quoted_phrases = Vec::new();
165 let mut individual_terms = Vec::new();
166 let mut current_term = String::new();
167 let mut in_quotes = false;
168
169 for ch in query.chars() {
170 match ch {
171 '"' => {
172 in_quotes = !in_quotes;
173 if !in_quotes && !current_term.is_empty() {
174 quoted_phrases.push(current_term.clone());
176 current_term.clear();
177 }
178 }
179 ' ' if !in_quotes => {
180 if !current_term.is_empty() {
181 individual_terms.push(current_term.clone());
183 current_term.clear();
184 }
185 }
186 _ => {
187 current_term.push(ch);
188 }
189 }
190 }
191
192 if !current_term.is_empty() {
194 if in_quotes {
195 quoted_phrases.push(current_term);
197 } else {
198 individual_terms.push(current_term);
199 }
200 }
201
202 if quoted_phrases.is_empty() && individual_terms.is_empty() {
204 individual_terms.push(query.to_string());
205 }
206
207 ParsedQuery {
208 quoted_phrases,
209 individual_terms,
210 original_query: query.to_string(),
211 }
212 }
213
214 async fn parse_documentation_into_results(
215 &self,
216 library: &str,
217 original_query: &str,
218 docs: &str,
219 parsed_query: &ParsedQuery,
220 is_phrase_search: bool,
221 ) -> Result<Vec<SearchResult>> {
222 let mut results = Vec::new();
223
224 let sections = self.split_into_sections(docs);
226
227 let relevance_scores = if self.embedding_model.is_some() {
229 self.calculate_embedding_relevance_batch(
230 §ions,
231 &parsed_query.original_query,
232 parsed_query,
233 is_phrase_search,
234 )
235 .await
236 .unwrap_or_else(|e| {
237 log::warn!(
238 "Batch embedding failed, falling back to keyword matching: {}",
239 e
240 );
241 sections
243 .iter()
244 .map(|section| {
245 self.calculate_enhanced_section_relevance(
246 section,
247 parsed_query,
248 is_phrase_search,
249 )
250 })
251 .collect()
252 })
253 } else {
254 sections
256 .iter()
257 .map(|section| {
258 self.calculate_enhanced_section_relevance(
259 section,
260 parsed_query,
261 is_phrase_search,
262 )
263 })
264 .collect()
265 };
266
267 for (idx, (section, &relevance)) in sections.iter().zip(relevance_scores.iter()).enumerate()
268 {
269 let relevance_threshold = if sections.len() > 1 { 0.05 } else { 0.1 };
271
272 if relevance > relevance_threshold {
273 let title = self.extract_section_title(section).unwrap_or_else(|| {
275 let first_line = section.lines().next().unwrap_or("");
277 let title_candidate = if first_line.len() > 60 {
278 format!("{}...", &first_line[..57])
279 } else if first_line.is_empty() {
280 format!("{} - Result {}", original_query, idx + 1)
281 } else {
282 first_line.to_string()
283 };
284 format!("{} ({})", title_candidate, library)
285 });
286
287 let excerpt = self.extract_section_excerpt(section);
288
289 results.push(SearchResult {
290 id: format!("{}-doc-{}", library, idx + 1),
291 library: library.to_string(),
292 title,
293 excerpt,
294 url: None,
295 relevance_score: relevance,
296 });
297 }
298 }
299
300 results.sort_by(|a, b| b.relevance_score.partial_cmp(&a.relevance_score).unwrap());
302
303 if results.is_empty() && !sections.is_empty() {
305 for (idx, section) in sections.iter().enumerate().take(10) {
306 let title = self.extract_section_title(section).unwrap_or_else(|| {
308 let lines: Vec<&str> = section.lines().take(3).collect();
310 let mut title_candidate = String::new();
311
312 for line in &lines {
314 let trimmed = line.trim();
315 if !trimmed.is_empty() && trimmed.len() > 10 {
316 title_candidate = if trimmed.len() > 60 {
317 format!("{}...", &trimmed[..57])
318 } else {
319 trimmed.to_string()
320 };
321 break;
322 }
323 }
324
325 if title_candidate.is_empty() {
326 format!("{} - Section {}", original_query, idx + 1)
327 } else {
328 title_candidate
329 }
330 });
331
332 let excerpt = self.create_unique_excerpt(section, idx);
334
335 results.push(SearchResult {
336 id: format!("doc-{}", idx + 1),
337 library: library.to_string(),
338 title,
339 excerpt,
340 url: None,
341 relevance_score: 0.5, });
343 }
344 }
345
346 Ok(results)
347 }
348
349 fn merge_and_rank_results(
350 &self,
351 mut all_results: Vec<SearchResult>,
352 parsed_query: &ParsedQuery,
353 ) -> Vec<SearchResult> {
354 all_results.sort_by(|a, b| b.relevance_score.partial_cmp(&a.relevance_score).unwrap());
356 all_results.dedup_by(|a, b| {
357 let similarity = self
359 .matcher
360 .fuzzy_match(&a.title.to_lowercase(), &b.title.to_lowercase());
361 similarity.unwrap_or(0) > 800 });
363
364 for result in all_results.iter_mut() {
366 if self.contains_quoted_phrases(&result.excerpt, &parsed_query.quoted_phrases) {
367 result.relevance_score *= 1.5; }
369 }
370
371 all_results.sort_by(|a, b| b.relevance_score.partial_cmp(&a.relevance_score).unwrap());
373 all_results
374 }
375
376 fn contains_quoted_phrases(&self, text: &str, phrases: &[String]) -> bool {
377 let text_lower = text.to_lowercase();
378 phrases
379 .iter()
380 .any(|phrase| text_lower.contains(&phrase.to_lowercase()))
381 }
382
    /// Keyword-based relevance score for one section, averaged over all
    /// query elements (phrases + terms).
    ///
    /// Per quoted phrase: 10.0 (phrase pass) or 5.0 (term pass) for an exact
    /// case-insensitive hit, plus 50% of that when the phrase also appears in
    /// the "TITLE: " line and 30% for the "DESCRIPTION: " line; when the
    /// exact phrase is absent, a word-proximity score is credited instead.
    /// Per individual term: 1.0 for a substring hit (+0.5 TITLE, +0.3
    /// DESCRIPTION), otherwise the fuzzy-match score divided by 1000.
    fn calculate_enhanced_section_relevance(
        &self,
        section: &str,
        parsed_query: &ParsedQuery,
        is_phrase_search: bool,
    ) -> f32 {
        let section_lower = section.to_lowercase();
        let mut total_score = 0.0;

        // Quoted phrases: exact containment with location bonuses.
        for phrase in &parsed_query.quoted_phrases {
            let phrase_lower = phrase.to_lowercase();

            if section_lower.contains(&phrase_lower) {
                let phrase_score = if is_phrase_search { 10.0 } else { 5.0 };
                total_score += phrase_score;

                if let Some(title_line) = section.lines().find(|line| line.starts_with("TITLE: ")) {
                    if title_line.to_lowercase().contains(&phrase_lower) {
                        total_score += phrase_score * 0.5;
                    }
                }

                if let Some(desc_line) = section
                    .lines()
                    .find(|line| line.starts_with("DESCRIPTION: "))
                {
                    if desc_line.to_lowercase().contains(&phrase_lower) {
                        total_score += phrase_score * 0.3;
                    }
                }
            } else {
                // No exact hit: credit partial word-by-word proximity.
                let proximity_score = self.calculate_phrase_proximity(section, phrase);
                total_score += proximity_score;
            }
        }

        // Individual terms: substring hit with location bonuses, else fuzzy.
        for term in &parsed_query.individual_terms {
            let term_lower = term.to_lowercase();

            if section_lower.contains(&term_lower) {
                total_score += 1.0;

                if let Some(title_line) = section.lines().find(|line| line.starts_with("TITLE: ")) {
                    if title_line.to_lowercase().contains(&term_lower) {
                        total_score += 0.5;
                    }
                }
                if let Some(desc_line) = section
                    .lines()
                    .find(|line| line.starts_with("DESCRIPTION: "))
                {
                    if desc_line.to_lowercase().contains(&term_lower) {
                        total_score += 0.3;
                    }
                }
            } else {
                // Fuzzy scores are typically in the hundreds; /1000 keeps the
                // contribution well below an exact hit.
                if let Some(score) = self.matcher.fuzzy_match(&section_lower, &term_lower) {
                    total_score += (score as f32) / 1000.0;
                }
            }
        }

        // Average so sections aren't favored merely because the query is long.
        let total_elements =
            parsed_query.quoted_phrases.len() + parsed_query.individual_terms.len();
        if total_elements > 0 {
            total_score / total_elements as f32
        } else {
            0.0
        }
    }
462
    /// Scores how closely the words of a multi-word phrase appear together in
    /// `section` when the exact phrase is absent.
    ///
    /// Slides a window of `words.len()` words over the section and fuzzy
    /// matches each window word against the corresponding phrase word (a word
    /// counts when its fuzzy score exceeds 700); the per-window score is
    /// scaled by the fraction of words found, times 2. Returns the best
    /// window's score, or 0.0 for single-word phrases.
    ///
    /// NOTE(review): the section is lowercased but the phrase words are not;
    /// SkimMatcherV2 is smart-case, so uppercase letters in the phrase may
    /// never match the lowercased text — confirm this asymmetry is intended.
    fn calculate_phrase_proximity(&self, section: &str, phrase: &str) -> f32 {
        let words: Vec<&str> = phrase.split_whitespace().collect();
        if words.len() < 2 {
            return 0.0;
        }

        let section_lower = section.to_lowercase();
        let mut max_proximity_score: f32 = 0.0;

        for window in section_lower
            .split_whitespace()
            .collect::<Vec<_>>()
            .windows(words.len())
        {
            let mut proximity_score = 0.0;
            let mut found_words = 0;

            for (i, &target_word) in words.iter().enumerate() {
                if let Some(fuzzy_score) = self.matcher.fuzzy_match(window[i], target_word) {
                    if fuzzy_score > 700 {
                        proximity_score += 1.0;
                        found_words += 1;
                    }
                }
            }

            if found_words > 0 {
                // Penalize windows that matched only part of the phrase.
                let proximity_multiplier = found_words as f32 / words.len() as f32;
                proximity_score = proximity_score * proximity_multiplier * 2.0;
                max_proximity_score = max_proximity_score.max(proximity_score);
            }
        }

        max_proximity_score
    }
500
    /// Scores all sections in one embedding batch, blended with keyword
    /// scoring: final = 0.7 * cosine(query, section) + 0.2 * keyword score
    /// (normalized by /5, capped at 1) + 0.1 * phrase bonus (normalized by
    /// /10, capped at 1). Sections missing an embedding fall back to pure
    /// keyword scoring.
    ///
    /// Errors when no embedding model is attached or an embedding call fails.
    async fn calculate_embedding_relevance_batch(
        &self,
        sections: &[String],
        query: &str,
        parsed_query: &ParsedQuery,
        is_phrase_search: bool,
    ) -> Result<Vec<f32>> {
        let embedding_model = self
            .embedding_model
            .as_ref()
            .ok_or_else(|| anyhow::anyhow!("Embedding model not available"))?;

        let query_embedding = embedding_model.embed_text(query).await?;

        // Compact each section (title/description/snippet) before embedding.
        let section_texts: Vec<String> = sections
            .iter()
            .map(|section| self.prepare_section_for_embedding(section))
            .collect();

        let section_text_refs: Vec<&str> = section_texts.iter().map(|s| s.as_str()).collect();
        let section_embeddings = embedding_model.embed_batch(&section_text_refs).await?;

        let mut scores = Vec::with_capacity(sections.len());

        for (i, section) in sections.iter().enumerate() {
            if let Some(section_embedding) = section_embeddings.get(i) {
                let embedding_score =
                    EmbeddingModel::cosine_similarity(&query_embedding, section_embedding);

                let keyword_score = self.calculate_enhanced_section_relevance(
                    section,
                    parsed_query,
                    is_phrase_search,
                );

                let phrase_bonus =
                    self.calculate_phrase_bonus(section, parsed_query, is_phrase_search);

                // Normalize the unbounded keyword/phrase scores into [0, 1]
                // before blending with cosine similarity.
                let normalized_keyword_score = (keyword_score / 5.0).min(1.0);
                let normalized_phrase_bonus = (phrase_bonus / 10.0).min(1.0);

                let final_score = (embedding_score * 0.7)
                    + (normalized_keyword_score * 0.2)
                    + (normalized_phrase_bonus * 0.1);

                scores.push(final_score);
            } else {
                // Batch returned fewer embeddings than sections; degrade to
                // keyword-only scoring for this one.
                log::warn!("Missing embedding for section {}, using keyword scoring", i);
                scores.push(self.calculate_enhanced_section_relevance(
                    section,
                    parsed_query,
                    is_phrase_search,
                ));
            }
        }

        // NOTE: with zero sections this logs NaN (0/0) — harmless, log-only.
        log::debug!(
            "Batch embedded {} sections with average score: {:.3}",
            sections.len(),
            scores.iter().sum::<f32>() / scores.len() as f32
        );

        Ok(scores)
    }
575
    /// Single-section variant of the batch scorer, using the same
    /// 0.7 / 0.2 / 0.1 embedding/keyword/phrase blend. Currently unused
    /// (the batch path embeds all sections in one call); kept for reference.
    #[allow(dead_code)]
    async fn calculate_embedding_section_relevance(
        &self,
        section: &str,
        query: &str,
        parsed_query: &ParsedQuery,
        is_phrase_search: bool,
    ) -> Result<f32> {
        let embedding_model = self
            .embedding_model
            .as_ref()
            .ok_or_else(|| anyhow::anyhow!("Embedding model not available"))?;

        let query_embedding = embedding_model.embed_text(query).await?;

        let section_text = self.prepare_section_for_embedding(section);
        let section_embedding = embedding_model.embed_text(&section_text).await?;

        let embedding_score =
            EmbeddingModel::cosine_similarity(&query_embedding, &section_embedding);

        let keyword_score =
            self.calculate_enhanced_section_relevance(section, parsed_query, is_phrase_search);

        let phrase_bonus = self.calculate_phrase_bonus(section, parsed_query, is_phrase_search);

        // Normalize unbounded scores into [0, 1] before blending.
        let normalized_keyword_score = (keyword_score / 5.0).min(1.0);
        let normalized_phrase_bonus = (phrase_bonus / 10.0).min(1.0);

        let final_score = (embedding_score * 0.7)
            + (normalized_keyword_score * 0.2)
            + (normalized_phrase_bonus * 0.1);

        log::debug!("Embedding hybrid scoring for section: Embedding={:.3}, Keywords={:.3}, Phrase={:.3}, Final={:.3}",
            embedding_score, normalized_keyword_score, normalized_phrase_bonus, final_score);

        Ok(final_score)
    }
622
623 fn prepare_section_for_embedding(&self, section: &str) -> String {
625 let lines: Vec<&str> = section.lines().collect();
627 let mut embedding_text = String::new();
628
629 if let Some(title_line) = lines.iter().find(|line| line.starts_with("TITLE: ")) {
631 embedding_text.push_str(title_line[7..].trim());
632 embedding_text.push(' ');
633 }
634
635 if let Some(desc_line) = lines.iter().find(|line| line.starts_with("DESCRIPTION: ")) {
637 embedding_text.push_str(desc_line[13..].trim());
638 embedding_text.push(' ');
639 }
640
641 let content_lines: Vec<&str> = lines
643 .iter()
644 .filter(|line| !line.starts_with("TITLE: ") && !line.starts_with("DESCRIPTION: "))
645 .take(5)
646 .copied()
647 .collect();
648
649 let content = content_lines.join(" ");
650 let content_preview = if content.len() > 200 {
651 format!("{}...", &content[..200])
652 } else {
653 content
654 };
655
656 embedding_text.push_str(&content_preview);
657 embedding_text.trim().to_string()
658 }
659
660 fn calculate_phrase_bonus(
662 &self,
663 section: &str,
664 parsed_query: &ParsedQuery,
665 is_phrase_search: bool,
666 ) -> f32 {
667 let section_lower = section.to_lowercase();
668 let mut phrase_score = 0.0;
669
670 for phrase in &parsed_query.quoted_phrases {
671 let phrase_lower = phrase.to_lowercase();
672 if section_lower.contains(&phrase_lower) {
673 phrase_score += if is_phrase_search { 10.0 } else { 5.0 };
674
675 if let Some(title_line) = section.lines().find(|line| line.starts_with("TITLE: ")) {
677 if title_line.to_lowercase().contains(&phrase_lower) {
678 phrase_score += 2.0;
679 }
680 }
681 }
682 }
683
684 phrase_score
685 }
686
687 fn split_into_sections(&self, docs: &str) -> Vec<String> {
688 let mut sections = Vec::new();
690 let lines: Vec<&str> = docs.lines().collect();
691 let mut current_section = Vec::new();
692 let mut in_section = false;
693
694 for line in lines {
695 if line.starts_with("TITLE: ") {
696 if in_section && !current_section.is_empty() {
698 let section_text = current_section.join("\n");
699 if section_text.len() > 20 {
700 sections.push(section_text);
701 }
702 }
703 current_section.clear();
705 current_section.push(line);
706 in_section = true;
707 } else if in_section {
708 current_section.push(line);
709 }
710 }
711
712 if in_section && !current_section.is_empty() {
714 let section_text = current_section.join("\n");
715 if section_text.len() > 20 {
716 sections.push(section_text);
717 }
718 }
719
720 if sections.is_empty() {
722 let paragraphs: Vec<&str> = docs.split("\n\n").collect();
724 if paragraphs.len() > 1 {
725 for paragraph in paragraphs {
726 let trimmed = paragraph.trim();
727 if trimmed.len() > 50 {
728 sections.push(trimmed.to_string());
730 }
731 }
732 }
733
734 if sections.len() < 3 {
736 sections.clear(); let chunk_size = 800; let mut start = 0;
739 let mut chunk_count = 0;
740
741 while start < docs.len() && chunk_count < 20 {
742 let end = (start + chunk_size).min(docs.len());
744 let mut actual_end = end;
746 if end < docs.len() {
747 if let Some(pos) = docs[start..end].rfind("\n\n") {
749 actual_end = start + pos;
750 } else if let Some(pos) = docs[start..end].rfind(".\n") {
751 actual_end = start + pos + 1;
752 } else if let Some(pos) = docs[start..end].rfind(". ") {
753 actual_end = start + pos + 1;
754 } else if let Some(pos) = docs[start..end].rfind('\n') {
755 actual_end = start + pos;
756 }
757 }
758
759 if actual_end <= start {
761 actual_end = end;
762 }
763
764 let chunk = docs[start..actual_end].trim();
765 if !chunk.is_empty() && chunk.len() > 50 {
766 sections.push(chunk.to_string());
767 chunk_count += 1;
768 }
769
770 start = actual_end;
771 while start < docs.len()
773 && docs.chars().nth(start).is_some_and(|c| c.is_whitespace())
774 {
775 start += 1;
776 }
777 }
778 }
779 }
780
781 if sections.is_empty() {
783 vec![docs.to_string()]
784 } else {
785 sections
786 }
787 }
788
789 fn extract_section_title(&self, section: &str) -> Option<String> {
790 section
791 .lines()
792 .find(|line| line.starts_with("TITLE: "))
793 .map(|line| line[7..].to_string())
794 }
795
796 fn extract_section_excerpt(&self, section: &str) -> String {
797 if let Some(desc_line) = section
799 .lines()
800 .find(|line| line.starts_with("DESCRIPTION: "))
801 {
802 let desc = &desc_line[13..];
803 if desc.len() > 300 {
804 format!("{}...", &desc[..300])
805 } else {
806 desc.to_string()
807 }
808 } else {
809 if section.len() > 300 {
811 format!("{}...", §ion[..300])
812 } else {
813 section.to_string()
814 }
815 }
816 }
817
818 fn create_unique_excerpt(&self, section: &str, offset: usize) -> String {
819 let lines: Vec<&str> = section.lines().collect();
820 let mut excerpt_lines = Vec::new();
821 let mut char_count = 0;
822
823 let skip_lines = offset.saturating_mul(2);
825
826 for line in lines.iter().skip(skip_lines) {
827 let trimmed = line.trim();
828 if !trimmed.is_empty() {
829 excerpt_lines.push(trimmed);
830 char_count += trimmed.len();
831
832 if char_count > 200 || excerpt_lines.len() >= 3 {
834 break;
835 }
836 }
837 }
838
839 if excerpt_lines.is_empty() {
841 for line in lines.iter().take(5) {
842 let trimmed = line.trim();
843 if !trimmed.is_empty() {
844 excerpt_lines.push(trimmed);
845 char_count += trimmed.len();
846 if char_count > 200 {
847 break;
848 }
849 }
850 }
851 }
852
853 let result = excerpt_lines.join(" ");
854 if result.len() > 300 {
855 format!("{}...", &result[..297])
856 } else if result.is_empty() {
857 if section.len() > 300 {
859 format!("{}...", §ion[..297])
860 } else {
861 section.to_string()
862 }
863 } else {
864 result
865 }
866 }
867
    /// Resolves `library` (any `@version` suffix is parsed but ignored) and
    /// fetches its raw documentation from the backend client, optionally
    /// filtered by `query`.
    pub async fn get_documentation(&self, library: &str, query: Option<&str>) -> Result<String> {
        let (lib_name, _version) = parse_library_spec(library);

        let (library_id, _library_title) = self.client.resolve_library(lib_name).await?;

        self.client.get_documentation(&library_id, query).await
    }
877}
878
/// Splits a `name@version` spec into `(name, Some(version))`; a spec without
/// an '@' yields `(spec, None)`. Splits at the *first* '@', so a leading '@'
/// (e.g. npm-style scoped names) produces an empty library name.
fn parse_library_spec(spec: &str) -> (&str, Option<&str>) {
    match spec.split_once('@') {
        Some((lib, ver)) => (lib, Some(ver)),
        None => (spec, None),
    }
}
887
888pub fn fuzzy_find_libraries(query: &str, libraries: &[String]) -> Vec<(String, i64)> {
889 let matcher = SkimMatcherV2::default();
890 let mut matches: Vec<(String, i64)> = libraries
891 .iter()
892 .filter_map(|lib| {
893 matcher
894 .fuzzy_match(lib, query)
895 .map(|score| (lib.clone(), score))
896 })
897 .collect();
898
899 matches.sort_by_key(|(_, score)| -score);
900 matches.truncate(5);
901 matches
902}