1use crate::client::{Context7Client, SearchResult};
2use crate::rag::embeddings::EmbeddingModel;
3use anyhow::Result;
4use fuzzy_matcher::skim::SkimMatcherV2;
5use fuzzy_matcher::FuzzyMatcher;
6
/// A search query decomposed into exact phrases and loose terms.
#[derive(Debug, Clone)]
struct ParsedQuery {
    // Phrases that appeared inside double quotes; matched verbatim with higher weight.
    quoted_phrases: Vec<String>,
    // Unquoted whitespace-separated terms; matched individually (fuzzy fallback allowed).
    individual_terms: Vec<String>,
    // The raw query string exactly as the caller supplied it.
    original_query: String,
}
13
/// Context7 documentation search: resolves libraries, fetches docs, and ranks
/// sections using keyword matching, optionally blended with semantic embeddings.
pub struct SearchEngine {
    // Upstream Context7 API client used to resolve libraries and fetch docs.
    client: Context7Client,
    // Fuzzy matcher used for keyword scoring and near-duplicate title detection.
    matcher: SkimMatcherV2,
    // Present only when `with_embeddings` successfully initialized a model;
    // `None` means pure text matching.
    embedding_model: Option<EmbeddingModel>,
}
19
20impl SearchEngine {
21 pub fn new(client: Context7Client) -> Self {
23 Self {
24 client,
25 matcher: SkimMatcherV2::default(),
26 embedding_model: None,
27 }
28 }
29
30 pub async fn with_embeddings(client: Context7Client) -> Result<Self> {
32 let embedding_model = match EmbeddingModel::new().await {
33 Ok(model) => {
34 log::info!("🧠Semantic embeddings initialized for Context7 search");
35 Some(model)
36 }
37 Err(e) => {
38 log::warn!(
39 "Semantic embeddings unavailable for Context7, using text matching: {}",
40 e
41 );
42 None
43 }
44 };
45
46 Ok(Self {
47 client,
48 matcher: SkimMatcherV2::default(),
49 embedding_model,
50 })
51 }
52
    /// Whether semantic-embedding scoring is active for this engine.
    pub fn has_embeddings(&self) -> bool {
        self.embedding_model.is_some()
    }
57
58 pub async fn search(
59 &self,
60 library: &str,
61 query: &str,
62 limit: Option<usize>,
63 ) -> Result<(Vec<SearchResult>, String, String)> {
64 let (lib_name, _version) = parse_library_spec(library);
66
67 let (library_id, library_title) = self.client.resolve_library(lib_name).await?;
69
70 let parsed_query = self.parse_search_query(query);
72
73 let mut results = self
75 .multi_pass_search(&library_id, library, &parsed_query)
76 .await?;
77
78 if let Some(limit) = limit {
80 if limit > 0 && results.len() > limit {
81 results.truncate(limit);
82 }
83 }
84
85 if let Ok(cache_manager) = crate::cache::CacheManager::new() {
87 for result in &results {
88 let snippet_cache_key = format!("{}_{}", library, &result.id);
89 let _ = cache_manager
91 .set("snippets", &snippet_cache_key, &result.excerpt)
92 .await;
93 }
94 }
95
96 Ok((results, library_title, library_id))
97 }
98
    /// Runs up to two documentation fetches and merges the scored results.
    ///
    /// Pass 1 (only when the query contains quoted phrases) fetches with a
    /// phrase-priority query. Pass 2 fetches with the plain terms when there
    /// were no phrases, or when pass 1 yielded fewer than 5 results. Both
    /// passes' results are then deduplicated and ranked together.
    async fn multi_pass_search(
        &self,
        library_id: &str,
        library: &str,
        parsed_query: &ParsedQuery,
    ) -> Result<Vec<SearchResult>> {
        let mut all_results = Vec::new();

        // Pass 1: phrase-priority search.
        if !parsed_query.quoted_phrases.is_empty() {
            let phrase_query = self.build_phrase_priority_query(parsed_query);
            let docs = self
                .client
                .get_documentation(library_id, Some(&phrase_query))
                .await?;

            let phrase_results = self
                .parse_documentation_into_results(
                    library,
                    &parsed_query.original_query,
                    &docs,
                    parsed_query,
                    true, // is_phrase_search: weight exact phrase hits higher
                )
                .await?;

            all_results.extend(phrase_results);
        }

        // Pass 2: term search, unless the phrase pass already found enough.
        let should_do_term_search = parsed_query.quoted_phrases.is_empty() || all_results.len() < 5;

        if should_do_term_search && !parsed_query.individual_terms.is_empty() {
            let term_query = parsed_query.individual_terms.join(" ");
            let docs = self
                .client
                .get_documentation(library_id, Some(&term_query))
                .await?;

            let term_results = self
                .parse_documentation_into_results(
                    library,
                    &parsed_query.original_query,
                    &docs,
                    parsed_query,
                    false, // term pass: phrase hits still count, at lower weight
                )
                .await?;

            all_results.extend(term_results);
        }

        // Deduplicate near-identical titles and apply the final ranking.
        let merged_results = self.merge_and_rank_results(all_results, parsed_query);

        Ok(merged_results)
    }
156
157 fn build_phrase_priority_query(&self, parsed_query: &ParsedQuery) -> String {
158 let mut query_parts = Vec::new();
159
160 for phrase in &parsed_query.quoted_phrases {
162 query_parts.push(format!("\"{}\"", phrase));
163 }
164
165 query_parts.extend(parsed_query.individual_terms.clone());
167
168 query_parts.join(" ")
169 }
170
171 fn parse_search_query(&self, query: &str) -> ParsedQuery {
172 let mut quoted_phrases = Vec::new();
173 let mut individual_terms = Vec::new();
174 let mut current_term = String::new();
175 let mut in_quotes = false;
176
177 for ch in query.chars() {
178 match ch {
179 '"' => {
180 in_quotes = !in_quotes;
181 if !in_quotes && !current_term.is_empty() {
182 quoted_phrases.push(current_term.clone());
184 current_term.clear();
185 }
186 }
187 ' ' if !in_quotes => {
188 if !current_term.is_empty() {
189 individual_terms.push(current_term.clone());
191 current_term.clear();
192 }
193 }
194 _ => {
195 current_term.push(ch);
196 }
197 }
198 }
199
200 if !current_term.is_empty() {
202 if in_quotes {
203 quoted_phrases.push(current_term);
205 } else {
206 individual_terms.push(current_term);
207 }
208 }
209
210 if quoted_phrases.is_empty() && individual_terms.is_empty() {
212 individual_terms.push(query.to_string());
213 }
214
215 ParsedQuery {
216 quoted_phrases,
217 individual_terms,
218 original_query: query.to_string(),
219 }
220 }
221
222 async fn parse_documentation_into_results(
223 &self,
224 library: &str,
225 original_query: &str,
226 docs: &str,
227 parsed_query: &ParsedQuery,
228 is_phrase_search: bool,
229 ) -> Result<Vec<SearchResult>> {
230 let mut results = Vec::new();
231
232 let sections = self.split_into_sections(docs);
234
235 for (idx, section) in sections.iter().enumerate() {
236 let relevance = if self.embedding_model.is_some() {
238 self.calculate_embedding_section_relevance(
239 section,
240 &parsed_query.original_query,
241 parsed_query,
242 is_phrase_search,
243 )
244 .await
245 .unwrap_or_else(|e| {
246 log::warn!(
247 "Embedding scoring failed, falling back to keyword matching: {}",
248 e
249 );
250 self.calculate_enhanced_section_relevance(
251 section,
252 parsed_query,
253 is_phrase_search,
254 )
255 })
256 } else {
257 self.calculate_enhanced_section_relevance(section, parsed_query, is_phrase_search)
258 };
259
260 let relevance_threshold = if sections.len() > 1 { 0.05 } else { 0.1 };
262
263 if relevance > relevance_threshold {
264 let title = self.extract_section_title(section).unwrap_or_else(|| {
266 let first_line = section.lines().next().unwrap_or("");
268 let title_candidate = if first_line.len() > 60 {
269 format!("{}...", &first_line[..57])
270 } else if first_line.is_empty() {
271 format!("{} - Result {}", original_query, idx + 1)
272 } else {
273 first_line.to_string()
274 };
275 format!("{} ({})", title_candidate, library)
276 });
277
278 let excerpt = self.extract_section_excerpt(section);
279
280 results.push(SearchResult {
281 id: format!("{}-doc-{}", library, idx + 1),
282 library: library.to_string(),
283 title,
284 excerpt,
285 url: None,
286 relevance_score: relevance,
287 });
288 }
289 }
290
291 results.sort_by(|a, b| b.relevance_score.partial_cmp(&a.relevance_score).unwrap());
293
294 if results.is_empty() && !sections.is_empty() {
296 for (idx, section) in sections.iter().enumerate().take(10) {
297 let title = self.extract_section_title(section).unwrap_or_else(|| {
299 let lines: Vec<&str> = section.lines().take(3).collect();
301 let mut title_candidate = String::new();
302
303 for line in &lines {
305 let trimmed = line.trim();
306 if !trimmed.is_empty() && trimmed.len() > 10 {
307 title_candidate = if trimmed.len() > 60 {
308 format!("{}...", &trimmed[..57])
309 } else {
310 trimmed.to_string()
311 };
312 break;
313 }
314 }
315
316 if title_candidate.is_empty() {
317 format!("{} - Section {}", original_query, idx + 1)
318 } else {
319 title_candidate
320 }
321 });
322
323 let excerpt = self.create_unique_excerpt(section, idx);
325
326 results.push(SearchResult {
327 id: format!("doc-{}", idx + 1),
328 library: library.to_string(),
329 title,
330 excerpt,
331 url: None,
332 relevance_score: 0.5, });
334 }
335 }
336
337 Ok(results)
338 }
339
340 fn merge_and_rank_results(
341 &self,
342 mut all_results: Vec<SearchResult>,
343 parsed_query: &ParsedQuery,
344 ) -> Vec<SearchResult> {
345 all_results.sort_by(|a, b| b.relevance_score.partial_cmp(&a.relevance_score).unwrap());
347 all_results.dedup_by(|a, b| {
348 let similarity = self
350 .matcher
351 .fuzzy_match(&a.title.to_lowercase(), &b.title.to_lowercase());
352 similarity.unwrap_or(0) > 800 });
354
355 for result in all_results.iter_mut() {
357 if self.contains_quoted_phrases(&result.excerpt, &parsed_query.quoted_phrases) {
358 result.relevance_score *= 1.5; }
360 }
361
362 all_results.sort_by(|a, b| b.relevance_score.partial_cmp(&a.relevance_score).unwrap());
364 all_results
365 }
366
367 fn contains_quoted_phrases(&self, text: &str, phrases: &[String]) -> bool {
368 let text_lower = text.to_lowercase();
369 phrases
370 .iter()
371 .any(|phrase| text_lower.contains(&phrase.to_lowercase()))
372 }
373
374 fn calculate_enhanced_section_relevance(
375 &self,
376 section: &str,
377 parsed_query: &ParsedQuery,
378 is_phrase_search: bool,
379 ) -> f32 {
380 let section_lower = section.to_lowercase();
381 let mut total_score = 0.0;
382
383 for phrase in &parsed_query.quoted_phrases {
385 let phrase_lower = phrase.to_lowercase();
386
387 if section_lower.contains(&phrase_lower) {
388 let phrase_score = if is_phrase_search { 10.0 } else { 5.0 };
390 total_score += phrase_score;
391
392 if let Some(title_line) = section.lines().find(|line| line.starts_with("TITLE: ")) {
394 if title_line.to_lowercase().contains(&phrase_lower) {
395 total_score += phrase_score * 0.5;
396 }
397 }
398
399 if let Some(desc_line) = section
401 .lines()
402 .find(|line| line.starts_with("DESCRIPTION: "))
403 {
404 if desc_line.to_lowercase().contains(&phrase_lower) {
405 total_score += phrase_score * 0.3;
406 }
407 }
408 } else {
409 let proximity_score = self.calculate_phrase_proximity(section, phrase);
411 total_score += proximity_score;
412 }
413 }
414
415 for term in &parsed_query.individual_terms {
417 let term_lower = term.to_lowercase();
418
419 if section_lower.contains(&term_lower) {
420 total_score += 1.0;
421
422 if let Some(title_line) = section.lines().find(|line| line.starts_with("TITLE: ")) {
424 if title_line.to_lowercase().contains(&term_lower) {
425 total_score += 0.5;
426 }
427 }
428 if let Some(desc_line) = section
429 .lines()
430 .find(|line| line.starts_with("DESCRIPTION: "))
431 {
432 if desc_line.to_lowercase().contains(&term_lower) {
433 total_score += 0.3;
434 }
435 }
436 } else {
437 if let Some(score) = self.matcher.fuzzy_match(§ion_lower, &term_lower) {
439 total_score += (score as f32) / 1000.0;
440 }
441 }
442 }
443
444 let total_elements =
446 parsed_query.quoted_phrases.len() + parsed_query.individual_terms.len();
447 if total_elements > 0 {
448 total_score / total_elements as f32
449 } else {
450 0.0
451 }
452 }
453
454 fn calculate_phrase_proximity(&self, section: &str, phrase: &str) -> f32 {
455 let words: Vec<&str> = phrase.split_whitespace().collect();
456 if words.len() < 2 {
457 return 0.0;
458 }
459
460 let section_lower = section.to_lowercase();
461 let mut max_proximity_score: f32 = 0.0;
462
463 for window in section_lower
465 .split_whitespace()
466 .collect::<Vec<_>>()
467 .windows(words.len())
468 {
469 let mut proximity_score = 0.0;
470 let mut found_words = 0;
471
472 for (i, &target_word) in words.iter().enumerate() {
473 if let Some(fuzzy_score) = self.matcher.fuzzy_match(window[i], target_word) {
474 if fuzzy_score > 700 {
475 proximity_score += 1.0;
477 found_words += 1;
478 }
479 }
480 }
481
482 if found_words > 0 {
483 let proximity_multiplier = found_words as f32 / words.len() as f32;
484 proximity_score = proximity_score * proximity_multiplier * 2.0; max_proximity_score = max_proximity_score.max(proximity_score);
486 }
487 }
488
489 max_proximity_score
490 }
491
492 async fn calculate_embedding_section_relevance(
494 &self,
495 section: &str,
496 query: &str,
497 parsed_query: &ParsedQuery,
498 is_phrase_search: bool,
499 ) -> Result<f32> {
500 let embedding_model = self
501 .embedding_model
502 .as_ref()
503 .ok_or_else(|| anyhow::anyhow!("Embedding model not available"))?;
504
505 let query_embedding = embedding_model.embed_text(query).await?;
507
508 let section_text = self.prepare_section_for_embedding(section);
510 let section_embedding = embedding_model.embed_text(§ion_text).await?;
511
512 let embedding_score =
514 EmbeddingModel::cosine_similarity(&query_embedding, §ion_embedding);
515
516 let keyword_score =
518 self.calculate_enhanced_section_relevance(section, parsed_query, is_phrase_search);
519
520 let phrase_bonus = self.calculate_phrase_bonus(section, parsed_query, is_phrase_search);
522
523 let normalized_keyword_score = (keyword_score / 5.0).min(1.0);
526 let normalized_phrase_bonus = (phrase_bonus / 10.0).min(1.0);
527
528 let final_score = (embedding_score * 0.7)
529 + (normalized_keyword_score * 0.2)
530 + (normalized_phrase_bonus * 0.1);
531
532 log::debug!("Embedding hybrid scoring for section: Embedding={:.3}, Keywords={:.3}, Phrase={:.3}, Final={:.3}",
533 embedding_score, normalized_keyword_score, normalized_phrase_bonus, final_score);
534
535 Ok(final_score)
536 }
537
538 fn prepare_section_for_embedding(&self, section: &str) -> String {
540 let lines: Vec<&str> = section.lines().collect();
542 let mut embedding_text = String::new();
543
544 if let Some(title_line) = lines.iter().find(|line| line.starts_with("TITLE: ")) {
546 embedding_text.push_str(title_line[7..].trim());
547 embedding_text.push(' ');
548 }
549
550 if let Some(desc_line) = lines.iter().find(|line| line.starts_with("DESCRIPTION: ")) {
552 embedding_text.push_str(desc_line[13..].trim());
553 embedding_text.push(' ');
554 }
555
556 let content_lines: Vec<&str> = lines
558 .iter()
559 .filter(|line| !line.starts_with("TITLE: ") && !line.starts_with("DESCRIPTION: "))
560 .take(5)
561 .copied()
562 .collect();
563
564 let content = content_lines.join(" ");
565 let content_preview = if content.len() > 200 {
566 format!("{}...", &content[..200])
567 } else {
568 content
569 };
570
571 embedding_text.push_str(&content_preview);
572 embedding_text.trim().to_string()
573 }
574
575 fn calculate_phrase_bonus(
577 &self,
578 section: &str,
579 parsed_query: &ParsedQuery,
580 is_phrase_search: bool,
581 ) -> f32 {
582 let section_lower = section.to_lowercase();
583 let mut phrase_score = 0.0;
584
585 for phrase in &parsed_query.quoted_phrases {
586 let phrase_lower = phrase.to_lowercase();
587 if section_lower.contains(&phrase_lower) {
588 phrase_score += if is_phrase_search { 10.0 } else { 5.0 };
589
590 if let Some(title_line) = section.lines().find(|line| line.starts_with("TITLE: ")) {
592 if title_line.to_lowercase().contains(&phrase_lower) {
593 phrase_score += 2.0;
594 }
595 }
596 }
597 }
598
599 phrase_score
600 }
601
602 fn split_into_sections(&self, docs: &str) -> Vec<String> {
603 let mut sections = Vec::new();
605 let lines: Vec<&str> = docs.lines().collect();
606 let mut current_section = Vec::new();
607 let mut in_section = false;
608
609 for line in lines {
610 if line.starts_with("TITLE: ") {
611 if in_section && !current_section.is_empty() {
613 let section_text = current_section.join("\n");
614 if section_text.len() > 20 {
615 sections.push(section_text);
616 }
617 }
618 current_section.clear();
620 current_section.push(line);
621 in_section = true;
622 } else if in_section {
623 current_section.push(line);
624 }
625 }
626
627 if in_section && !current_section.is_empty() {
629 let section_text = current_section.join("\n");
630 if section_text.len() > 20 {
631 sections.push(section_text);
632 }
633 }
634
635 if sections.is_empty() {
637 let paragraphs: Vec<&str> = docs.split("\n\n").collect();
639 if paragraphs.len() > 1 {
640 for paragraph in paragraphs {
641 let trimmed = paragraph.trim();
642 if trimmed.len() > 50 {
643 sections.push(trimmed.to_string());
645 }
646 }
647 }
648
649 if sections.len() < 3 {
651 sections.clear(); let chunk_size = 800; let mut start = 0;
654 let mut chunk_count = 0;
655
656 while start < docs.len() && chunk_count < 20 {
657 let end = (start + chunk_size).min(docs.len());
659 let mut actual_end = end;
661 if end < docs.len() {
662 if let Some(pos) = docs[start..end].rfind("\n\n") {
664 actual_end = start + pos;
665 } else if let Some(pos) = docs[start..end].rfind(".\n") {
666 actual_end = start + pos + 1;
667 } else if let Some(pos) = docs[start..end].rfind(". ") {
668 actual_end = start + pos + 1;
669 } else if let Some(pos) = docs[start..end].rfind('\n') {
670 actual_end = start + pos;
671 }
672 }
673
674 if actual_end <= start {
676 actual_end = end;
677 }
678
679 let chunk = docs[start..actual_end].trim();
680 if !chunk.is_empty() && chunk.len() > 50 {
681 sections.push(chunk.to_string());
682 chunk_count += 1;
683 }
684
685 start = actual_end;
686 while start < docs.len()
688 && docs.chars().nth(start).is_some_and(|c| c.is_whitespace())
689 {
690 start += 1;
691 }
692 }
693 }
694 }
695
696 if sections.is_empty() {
698 vec![docs.to_string()]
699 } else {
700 sections
701 }
702 }
703
704 fn extract_section_title(&self, section: &str) -> Option<String> {
705 section
706 .lines()
707 .find(|line| line.starts_with("TITLE: "))
708 .map(|line| line[7..].to_string())
709 }
710
711 fn extract_section_excerpt(&self, section: &str) -> String {
712 if let Some(desc_line) = section
714 .lines()
715 .find(|line| line.starts_with("DESCRIPTION: "))
716 {
717 let desc = &desc_line[13..];
718 if desc.len() > 300 {
719 format!("{}...", &desc[..300])
720 } else {
721 desc.to_string()
722 }
723 } else {
724 if section.len() > 300 {
726 format!("{}...", §ion[..300])
727 } else {
728 section.to_string()
729 }
730 }
731 }
732
733 fn create_unique_excerpt(&self, section: &str, offset: usize) -> String {
734 let lines: Vec<&str> = section.lines().collect();
735 let mut excerpt_lines = Vec::new();
736 let mut char_count = 0;
737
738 let skip_lines = offset.saturating_mul(2);
740
741 for line in lines.iter().skip(skip_lines) {
742 let trimmed = line.trim();
743 if !trimmed.is_empty() {
744 excerpt_lines.push(trimmed);
745 char_count += trimmed.len();
746
747 if char_count > 200 || excerpt_lines.len() >= 3 {
749 break;
750 }
751 }
752 }
753
754 if excerpt_lines.is_empty() {
756 for line in lines.iter().take(5) {
757 let trimmed = line.trim();
758 if !trimmed.is_empty() {
759 excerpt_lines.push(trimmed);
760 char_count += trimmed.len();
761 if char_count > 200 {
762 break;
763 }
764 }
765 }
766 }
767
768 let result = excerpt_lines.join(" ");
769 if result.len() > 300 {
770 format!("{}...", &result[..297])
771 } else if result.is_empty() {
772 if section.len() > 300 {
774 format!("{}...", §ion[..297])
775 } else {
776 section.to_string()
777 }
778 } else {
779 result
780 }
781 }
782
783 pub async fn get_documentation(&self, library: &str, query: Option<&str>) -> Result<String> {
784 let (lib_name, _version) = parse_library_spec(library);
785
786 let (library_id, _library_title) = self.client.resolve_library(lib_name).await?;
788
789 self.client.get_documentation(&library_id, query).await
791 }
792}
793
/// Splits a `name@version` spec into `(name, Some(version))`; a spec without
/// `@` yields `(spec, None)`. Splits at the *first* `@`, so the version part
/// may itself contain `@`.
fn parse_library_spec(spec: &str) -> (&str, Option<&str>) {
    // `split_once` replaces the manual `find` + `split_at` + `[1..]` dance.
    match spec.split_once('@') {
        Some((lib, ver)) => (lib, Some(ver)),
        None => (spec, None),
    }
}
802
803pub fn fuzzy_find_libraries(query: &str, libraries: &[String]) -> Vec<(String, i64)> {
804 let matcher = SkimMatcherV2::default();
805 let mut matches: Vec<(String, i64)> = libraries
806 .iter()
807 .filter_map(|lib| {
808 matcher
809 .fuzzy_match(lib, query)
810 .map(|score| (lib.clone(), score))
811 })
812 .collect();
813
814 matches.sort_by_key(|(_, score)| -score);
815 matches.truncate(5);
816 matches
817}