1use std::collections::{HashMap, HashSet};
2
3use crate::cache::{load_search_index, load_source_registry};
4use crate::config::{load_config, SourceConfig};
5use crate::normalize::normalize_language;
6use crate::search::bm25::{self, build_index_from_documents};
7use crate::search::tokenizer::{compact_identifier, tokenize};
8use crate::types::{DocEntry, SearchIndex, SkillEntry};
9
/// A registry entry (doc or skill) tagged with the source it came from.
///
/// Serializes the inner entry flattened, plus `_source` (source name) and
/// `_type` ("doc"/"skill") metadata fields; the full `SourceConfig` is kept
/// for later path resolution but skipped during serialization.
#[derive(Debug, Clone, serde::Serialize)]
pub struct TaggedEntry {
    /// The wrapped doc or skill entry, serialized inline.
    #[serde(flatten)]
    pub kind: EntryKind,
    /// Name of the source this entry was loaded from.
    #[serde(rename = "_source")]
    pub source_name: String,
    /// Either "doc" or "skill" (set in `load_merged`).
    #[serde(rename = "_type")]
    pub entry_type: &'static str,
    /// Full source config for path resolution; not serialized.
    #[serde(skip)]
    pub source_obj: SourceConfig,
}
26
/// The two kinds of registry entries; serialized untagged, so only the
/// inner entry's fields appear in the output.
#[derive(Debug, Clone, serde::Serialize)]
#[serde(untagged)]
pub enum EntryKind {
    Doc(DocEntry),
    Skill(SkillEntry),
}
33
34impl TaggedEntry {
35 pub fn id(&self) -> &str {
36 match &self.kind {
37 EntryKind::Doc(d) => &d.id,
38 EntryKind::Skill(s) => &s.id,
39 }
40 }
41
42 pub fn name(&self) -> &str {
43 match &self.kind {
44 EntryKind::Doc(d) => &d.name,
45 EntryKind::Skill(s) => &s.name,
46 }
47 }
48
49 pub fn description(&self) -> &str {
50 match &self.kind {
51 EntryKind::Doc(d) => &d.description,
52 EntryKind::Skill(s) => &s.description,
53 }
54 }
55
56 pub fn tags(&self) -> &[String] {
57 match &self.kind {
58 EntryKind::Doc(d) => &d.tags,
59 EntryKind::Skill(s) => &s.tags,
60 }
61 }
62
63 pub fn source_quality(&self) -> Option<&str> {
64 match &self.kind {
65 EntryKind::Doc(d) => Some(&d.source),
66 EntryKind::Skill(s) => Some(&s.source),
67 }
68 }
69
70 pub fn languages(&self) -> Option<&[crate::types::LanguageEntry]> {
71 match &self.kind {
72 EntryKind::Doc(d) => Some(&d.languages),
73 EntryKind::Skill(_) => None,
74 }
75 }
76
77 pub fn as_doc(&self) -> Option<&DocEntry> {
78 match &self.kind {
79 EntryKind::Doc(d) => Some(d),
80 _ => None,
81 }
82 }
83
84 pub fn as_skill(&self) -> Option<&SkillEntry> {
85 match &self.kind {
86 EntryKind::Skill(s) => Some(s),
87 _ => None,
88 }
89 }
90}
91
/// All docs and skills from every configured source, plus the combined
/// search index.
#[derive(Debug)]
pub struct MergedRegistry {
    pub docs: Vec<TaggedEntry>,
    pub skills: Vec<TaggedEntry>,
    /// Merged BM25 index across sources; `None` when no source provided one.
    pub search_index: Option<SearchIndex>,
}
103
/// Builds the namespaced lookup key `"<source>:<entry_id>"` used for search
/// index ids and dedup maps.
fn search_lookup_id(source: &str, entry_id: &str) -> String {
    let mut key = String::with_capacity(source.len() + entry_id.len() + 1);
    key.push_str(source);
    key.push(':');
    key.push_str(entry_id);
    key
}
108
/// Collapses every run of whitespace in `query` into a single space and
/// trims both ends.
fn normalize_query(query: &str) -> String {
    let mut normalized = String::with_capacity(query.len());
    for word in query.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
113
114fn namespace_search_index(mut index: SearchIndex, source_name: &str) -> SearchIndex {
116 for doc in &mut index.documents {
117 doc.id = search_lookup_id(source_name, &doc.id);
118 }
119 index.inverted_index = None;
121 index
122}
123
124pub fn load_merged() -> MergedRegistry {
126 let config = load_config();
127 let mut all_docs = Vec::new();
128 let mut all_skills = Vec::new();
129 let mut search_indexes = Vec::new();
130
131 for source in &config.sources {
132 let registry = match load_source_registry(source) {
133 Some(r) => r,
134 None => continue,
135 };
136
137 if let Some(idx) = load_search_index(source) {
138 search_indexes.push(namespace_search_index(idx, &source.name));
139 }
140
141 for doc in registry.docs {
142 all_docs.push(TaggedEntry {
143 kind: EntryKind::Doc(doc),
144 source_name: source.name.clone(),
145 entry_type: "doc",
146 source_obj: source.clone(),
147 });
148 }
149
150 for skill in registry.skills {
151 all_skills.push(TaggedEntry {
152 kind: EntryKind::Skill(skill),
153 source_name: source.name.clone(),
154 entry_type: "skill",
155 source_obj: source.clone(),
156 });
157 }
158 }
159
160 let search_index = merge_search_indexes(search_indexes);
162
163 MergedRegistry {
164 docs: all_docs,
165 skills: all_skills,
166 search_index,
167 }
168}
169
170fn merge_search_indexes(indexes: Vec<SearchIndex>) -> Option<SearchIndex> {
171 if indexes.is_empty() {
172 return None;
173 }
174 if indexes.len() == 1 {
175 let single = indexes.into_iter().next().unwrap();
176 if single.inverted_index.is_some() {
178 return Some(single);
179 }
180 return Some(build_index_from_documents(single.documents, single.params));
181 }
182
183 let params = indexes[0].params.clone();
184 let all_documents: Vec<_> = indexes.into_iter().flat_map(|idx| idx.documents).collect();
185 Some(build_index_from_documents(all_documents, params))
186}
187
188fn get_all_entries(merged: &MergedRegistry) -> Vec<&TaggedEntry> {
193 merged.docs.iter().chain(merged.skills.iter()).collect()
194}
195
196fn apply_source_filter(entries: Vec<&TaggedEntry>) -> Vec<&TaggedEntry> {
197 let config = load_config();
198 let allowed: Vec<String> = config
199 .source
200 .split(',')
201 .map(|s| s.trim().to_lowercase())
202 .collect();
203 entries
204 .into_iter()
205 .filter(|e| {
206 e.source_quality()
207 .map(|s| allowed.contains(&s.to_lowercase()))
208 .unwrap_or(true)
209 })
210 .collect()
211}
212
213fn apply_filters<'a>(
214 entries: Vec<&'a TaggedEntry>,
215 filters: &SearchFilters,
216) -> Vec<&'a TaggedEntry> {
217 let mut result = entries;
218
219 if let Some(ref tags) = filters.tags {
220 let filter_tags: Vec<String> = tags.split(',').map(|t| t.trim().to_lowercase()).collect();
221 result.retain(|e| {
222 filter_tags
223 .iter()
224 .all(|ft| e.tags().iter().any(|t| t.to_lowercase() == *ft))
225 });
226 }
227
228 if let Some(ref lang) = filters.lang {
229 let normalized = normalize_language(lang);
230 result.retain(|e| {
231 e.languages()
232 .map(|langs| langs.iter().any(|l| l.language == normalized))
233 .unwrap_or(false)
234 });
235 }
236
237 result
238}
239
/// Optional filters applied to search and list results.
#[derive(Debug, Default)]
pub struct SearchFilters {
    /// Comma-separated tags; entries must carry all of them (case-insensitive).
    pub tags: Option<String>,
    /// Language name; only doc entries offering it (after normalization) match.
    pub lang: Option<String>,
}
245
246pub fn is_multi_source() -> bool {
247 load_config().sources.len() > 1
248}
249
/// Byte-wise Levenshtein edit distance between `a` and `b` with early exit.
///
/// Returns `max_distance + 1` as soon as the distance provably exceeds
/// `max_distance` (length-difference check up front, then a per-row minimum
/// check), so callers can treat any value above `max_distance` as "too far".
/// Operates on bytes, so multi-byte UTF-8 characters count per byte.
fn levenshtein_distance(a: &str, b: &str, max_distance: usize) -> usize {
    if a == b {
        return 0;
    }
    if a.is_empty() || b.is_empty() {
        return a.len().max(b.len());
    }
    // The distance is at least the length difference.
    if a.len().max(b.len()) - a.len().min(b.len()) > max_distance {
        return max_distance + 1;
    }

    let (xs, ys) = (a.as_bytes(), b.as_bytes());
    // Two-row dynamic program: `prev` holds row i-1, `curr` holds row i.
    let mut prev: Vec<usize> = (0..=ys.len()).collect();
    let mut curr = vec![0usize; ys.len() + 1];

    for (i, &x) in xs.iter().enumerate() {
        curr[0] = i + 1;
        let mut best_in_row = curr[0];
        for (j, &y) in ys.iter().enumerate() {
            let substitution = prev[j] + usize::from(x != y);
            let insertion = curr[j] + 1;
            let deletion = prev[j + 1] + 1;
            curr[j + 1] = substitution.min(insertion).min(deletion);
            best_in_row = best_in_row.min(curr[j + 1]);
        }
        // If the entire row exceeds the cap, the final distance can only grow.
        if best_in_row > max_distance {
            return max_distance + 1;
        }
        std::mem::swap(&mut prev, &mut curr);
    }

    prev[ys.len()]
}
300
/// Score weights for the tiers of compact-identifier matching, from the
/// strongest (exact equality) down to fuzzy edit-distance matches.
struct CompactWeights {
    exact: f64,
    prefix: f64,
    contains: f64,
    fuzzy: f64,
}
307
308fn score_compact_candidate(
309 query_compact: &str,
310 candidate_compact: &str,
311 weights: &CompactWeights,
312) -> f64 {
313 if query_compact.is_empty() || candidate_compact.is_empty() {
314 return 0.0;
315 }
316 if candidate_compact == query_compact {
317 return weights.exact;
318 }
319 if query_compact.len() < 3 {
320 return 0.0;
321 }
322
323 let length_penalty = if candidate_compact.len() > query_compact.len() {
324 candidate_compact.len() - query_compact.len()
325 } else {
326 query_compact.len() - candidate_compact.len()
327 } as f64;
328
329 let min_len = candidate_compact.len().min(query_compact.len()) as f64;
330 let max_len = candidate_compact.len().max(query_compact.len()) as f64;
331 let length_ratio = min_len / max_len;
332
333 if (candidate_compact.starts_with(query_compact)
334 || query_compact.starts_with(candidate_compact))
335 && length_ratio >= 0.6
336 {
337 return (weights.prefix - length_penalty).max(0.0);
338 }
339
340 if (candidate_compact.contains(query_compact) || query_compact.contains(candidate_compact))
341 && length_ratio >= 0.75
342 {
343 return (weights.contains - length_penalty).max(0.0);
344 }
345
346 if query_compact.len() < 5 {
347 return 0.0;
348 }
349
350 let max_dist = if query_compact.len() <= 5 {
351 1
352 } else if query_compact.len() <= 8 {
353 2
354 } else {
355 3
356 };
357
358 let distance = levenshtein_distance(query_compact, candidate_compact, max_dist);
359 if distance > max_dist {
360 return 0.0;
361 }
362
363 (weights.fuzzy - (distance as f64 * 20.0) - length_penalty).max(0.0)
364}
365
366fn split_compact_segments(text: &str) -> Vec<String> {
367 let mut segments: HashSet<String> = HashSet::new();
368 for seg in text.split('/') {
369 let c = compact_identifier(seg);
370 if !c.is_empty() {
371 segments.insert(c);
372 }
373 }
374 for seg in text.split(&['/', '_', '.', ' ', '-'][..]) {
375 let c = compact_identifier(seg);
376 if !c.is_empty() {
377 segments.insert(c);
378 }
379 }
380 segments.into_iter().collect()
381}
382
/// Best compact-identifier match score for `entry` against one compacted
/// query variant.
///
/// Candidates are tried from highest to lowest weight tier: the full
/// compacted name, the full compacted id, each id segment (with positional
/// bonuses), and each name segment. Queries shorter than two characters
/// score 0.0.
fn score_entry_lexical_variant(entry: &TaggedEntry, query_compact: &str) -> f64 {
    if query_compact.len() < 2 {
        return 0.0;
    }

    let name_compact = compact_identifier(entry.name());
    let id_compact = compact_identifier(entry.id());
    // NOTE(review): the order of these segment vectors comes from
    // split_compact_segments; the idx == 0 / last-index bonuses below are
    // order-sensitive, so confirm the intended segment ordering.
    let id_segments = split_compact_segments(entry.id());
    let name_segments = split_compact_segments(entry.name());

    let mut best = 0.0f64;

    // Whole compacted name: strongest tier.
    best = best.max(score_compact_candidate(
        query_compact,
        &name_compact,
        &CompactWeights {
            exact: 620.0,
            prefix: 560.0,
            contains: 520.0,
            fuzzy: 500.0,
        },
    ));

    // Whole compacted id.
    best = best.max(score_compact_candidate(
        query_compact,
        &id_compact,
        &CompactWeights {
            exact: 600.0,
            prefix: 540.0,
            contains: 500.0,
            fuzzy: 470.0,
        },
    ));

    // Per-segment id matches, with bonuses for first/last segment position
    // and for exact equality with the first/last segment.
    for (idx, segment) in id_segments.iter().enumerate() {
        let seg_score = score_compact_candidate(
            query_compact,
            segment,
            &CompactWeights {
                exact: 580.0,
                prefix: 530.0,
                contains: 490.0,
                fuzzy: 460.0,
            },
        );
        if seg_score == 0.0 {
            continue;
        }

        let mut bonus = 0.0;
        if idx == 0 {
            bonus += 10.0;
        }
        if idx == id_segments.len() - 1 {
            bonus += 10.0;
        }
        if query_compact == id_segments[0] {
            bonus += 60.0;
        }
        if query_compact == id_segments[id_segments.len() - 1] {
            bonus += 25.0;
        }
        // Extra bonus when the query equals both the first and last segment
        // of a multi-segment id.
        if id_segments.len() > 1
            && query_compact == id_segments[0]
            && query_compact == id_segments[id_segments.len() - 1]
        {
            bonus += 40.0;
        }

        best = best.max(seg_score + bonus);
    }

    // Per-segment name matches: weakest tier, no positional bonuses.
    for segment in &name_segments {
        best = best.max(score_compact_candidate(
            query_compact,
            segment,
            &CompactWeights {
                exact: 560.0,
                prefix: 520.0,
                contains: 480.0,
                fuzzy: 450.0,
            },
        ));
    }

    best
}
470
471fn score_entry_lexical_boost(
472 entry: &TaggedEntry,
473 normalized_query: &str,
474 rescue_terms: &[String],
475) -> f64 {
476 let mut query_compacts: Vec<String> = vec![compact_identifier(normalized_query)];
477 for term in rescue_terms {
478 let c = compact_identifier(term);
479 if !query_compacts.contains(&c) {
480 query_compacts.push(c);
481 }
482 }
483 query_compacts.retain(|c| c.len() >= 2);
484
485 let mut best = 0.0f64;
486 for qc in &query_compacts {
487 best = best.max(score_entry_lexical_variant(entry, qc));
488 }
489 best
490}
491
492fn get_missing_query_terms(normalized_query: &str, index: &SearchIndex) -> Vec<String> {
493 match &index.inverted_index {
494 Some(inv) => tokenize(normalized_query)
495 .into_iter()
496 .filter(|term| !inv.contains_key(term.as_str()))
497 .collect(),
498 None => vec![],
499 }
500}
501
502fn should_run_global_lexical_scan(
503 normalized_query: &str,
504 result_count: usize,
505 index: &Option<SearchIndex>,
506) -> bool {
507 let idx = match index {
508 Some(idx) => idx,
509 None => return true,
510 };
511
512 if result_count == 0 {
513 return true;
514 }
515 if idx.inverted_index.is_none() {
516 return false;
517 }
518
519 let query_terms = tokenize(normalized_query);
520 if query_terms.len() < 2 {
521 return false;
522 }
523
524 !get_missing_query_terms(normalized_query, idx).is_empty()
525}
526
/// Searches merged entries for `query`.
///
/// Pipeline: normalize the query; source-filter and dedupe all entries;
/// score via BM25 when a search index exists, otherwise a substring
/// heuristic; add a compact-identifier lexical boost (scanning all entries
/// when BM25 coverage looks incomplete); apply tag/language filters; return
/// matches sorted by descending score.
pub fn search_entries(
    query: &str,
    filters: &SearchFilters,
    merged: &MergedRegistry,
) -> Vec<TaggedEntry> {
    let normalized_query = normalize_query(query);
    let entries = apply_source_filter(get_all_entries(merged));

    // Dedupe by "source:id", keeping the first occurrence.
    let mut seen = HashSet::new();
    let mut deduped = Vec::new();
    for entry in entries {
        let key = search_lookup_id(&entry.source_name, entry.id());
        if seen.insert(key) {
            deduped.push(entry);
        }
    }

    // Map from namespaced key back to the entry, used to resolve BM25 result
    // ids (namespaced the same way by namespace_search_index).
    let entry_by_key: HashMap<String, &TaggedEntry> = deduped
        .iter()
        .map(|e| (search_lookup_id(&e.source_name, e.id()), *e))
        .collect();

    // Empty query: no scoring, just the filtered listing.
    if normalized_query.is_empty() {
        let filtered = apply_filters(deduped, filters);
        return filtered.into_iter().cloned().collect();
    }

    // Accumulates the best-known score per namespaced key.
    let mut result_by_key: HashMap<String, (&TaggedEntry, f64)> = HashMap::new();

    if let Some(ref search_index) = merged.search_index {
        // Primary scoring: BM25 over the merged index; only positive scores
        // are kept.
        let bm25_results = bm25::search(&normalized_query, search_index, None);
        for r in &bm25_results {
            if let Some(entry) = entry_by_key.get(&r.id) {
                let key = search_lookup_id(&entry.source_name, entry.id());
                if r.score > 0.0 {
                    result_by_key.insert(key, (*entry, r.score));
                }
            }
        }
    } else {
        // Fallback scoring: case-insensitive whole-query and per-word
        // substring matches against id, name, description and tags, with
        // fixed weights.
        let q = normalized_query.to_lowercase();
        let words: Vec<&str> = q.split_whitespace().collect();

        for entry in &deduped {
            let mut score = 0.0f64;
            let id_lower = entry.id().to_lowercase();
            let name_lower = entry.name().to_lowercase();

            if id_lower == q {
                score += 100.0;
            } else if id_lower.contains(&q) {
                score += 50.0;
            }

            if name_lower == q {
                score += 80.0;
            } else if name_lower.contains(&q) {
                score += 40.0;
            }

            for word in &words {
                if id_lower.contains(word) {
                    score += 10.0;
                }
                if name_lower.contains(word) {
                    score += 10.0;
                }
                if entry.description().to_lowercase().contains(word) {
                    score += 5.0;
                }
                if entry.tags().iter().any(|t| t.to_lowercase().contains(word)) {
                    score += 15.0;
                }
            }

            if score > 0.0 {
                let key = search_lookup_id(&entry.source_name, entry.id());
                result_by_key.insert(key, (*entry, score));
            }
        }
    }

    // Lexical boost candidates: only current results when BM25 coverage
    // looks complete, otherwise every deduped entry.
    let lexical_candidates = if !should_run_global_lexical_scan(
        &normalized_query,
        result_by_key.len(),
        &merged.search_index,
    ) {
        result_by_key.values().map(|(e, _)| *e).collect::<Vec<_>>()
    } else {
        deduped.clone()
    };

    // Rescue terms: longer query terms the inverted index does not know,
    // fed to the lexical scorer so near-miss identifiers can still match.
    let rescue_terms: Vec<String> = if !result_by_key.is_empty() {
        if let Some(ref idx) = merged.search_index {
            get_missing_query_terms(&normalized_query, idx)
                .into_iter()
                .filter(|t| t.len() >= 5)
                .collect()
        } else {
            vec![]
        }
    } else {
        vec![]
    };

    // Add the lexical boost to existing scores, or create new scored
    // results for entries the first pass missed.
    for entry in &lexical_candidates {
        let boost = score_entry_lexical_boost(entry, &normalized_query, &rescue_terms);
        if boost == 0.0 {
            continue;
        }

        let key = search_lookup_id(&entry.source_name, entry.id());
        if let Some(existing) = result_by_key.get_mut(&key) {
            existing.1 += boost;
        } else {
            result_by_key.insert(key, (*entry, boost));
        }
    }

    // Apply tag/language filters, then keep only scored results whose
    // reference survived filtering — matched by pointer identity, since the
    // filter returns a subset of the same references.
    let mut results: Vec<(&TaggedEntry, f64)> = result_by_key.into_values().collect();
    let filtered_entries: Vec<&TaggedEntry> = {
        let refs: Vec<&TaggedEntry> = results.iter().map(|(e, _)| *e).collect();
        apply_filters(refs, filters)
    };
    let filtered_set: HashSet<*const TaggedEntry> =
        filtered_entries.iter().map(|e| *e as *const _).collect();
    results.retain(|(e, _)| filtered_set.contains(&(*e as *const _)));

    // Highest score first; incomparable floats fall back to Equal so the
    // comparator stays total.
    results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
    results.into_iter().map(|(e, _)| e.clone()).collect()
}
671
/// Result of looking up a single entry by id.
pub struct EntryLookup {
    /// The matched entry, when exactly one match was found.
    pub entry: Option<TaggedEntry>,
    /// True when a bare id matched entries in multiple sources.
    pub ambiguous: bool,
    /// Namespaced "source:id" candidates when the lookup was ambiguous.
    pub alternatives: Vec<String>,
}
681
682pub fn get_entry(id_or_namespaced: &str, merged: &MergedRegistry) -> EntryLookup {
683 let normalized = normalize_query(id_or_namespaced);
684 let all = apply_source_filter(get_all_entries(merged));
685
686 if let Some(colon_idx) = normalized.find(':') {
688 let source_name = &normalized[..colon_idx];
689 let id = &normalized[colon_idx + 1..];
690 let entry = all
691 .into_iter()
692 .find(|e| e.source_name == source_name && e.id() == id)
693 .cloned();
694 return EntryLookup {
695 entry,
696 ambiguous: false,
697 alternatives: vec![],
698 };
699 }
700
701 let matches: Vec<&TaggedEntry> = all.into_iter().filter(|e| e.id() == normalized).collect();
703
704 match matches.len() {
705 0 => EntryLookup {
706 entry: None,
707 ambiguous: false,
708 alternatives: vec![],
709 },
710 1 => EntryLookup {
711 entry: Some(matches[0].clone()),
712 ambiguous: false,
713 alternatives: vec![],
714 },
715 _ => EntryLookup {
716 entry: None,
717 ambiguous: true,
718 alternatives: matches
719 .iter()
720 .map(|e| format!("{}:{}", e.source_name, e.id()))
721 .collect(),
722 },
723 }
724}
725
726pub fn list_entries(filters: &SearchFilters, merged: &MergedRegistry) -> Vec<TaggedEntry> {
728 let entries = apply_source_filter(get_all_entries(merged));
729
730 let mut seen = HashSet::new();
732 let mut deduped = Vec::new();
733 for entry in entries {
734 let key = format!("{}:{}", entry.source_name, entry.id());
735 if seen.insert(key) {
736 deduped.push(entry);
737 }
738 }
739
740 let filtered = apply_filters(deduped, filters);
741 filtered.into_iter().cloned().collect()
742}
743
/// Outcome of resolving an entry to an on-disk content path.
pub enum ResolvedPath {
    /// Successfully resolved to `path` within `source`.
    Ok {
        source: SourceConfig,
        path: String,
        /// Additional files bundled with the entry.
        files: Vec<String>,
    },
    /// A doc entry needs an explicit language choice; lists the options.
    NeedsLanguage {
        available: Vec<String>,
    },
    /// The requested version does not exist for the chosen language.
    VersionNotFound {
        requested: String,
        available: Vec<String>,
    },
}
762
/// Resolves `entry` to its on-disk content path.
///
/// Skills resolve straight from their configured path (`None` when the path
/// is empty; language/version are ignored). Docs need a language: without
/// one, `NeedsLanguage` lists the available languages; an unknown
/// (normalized) language yields `None`. The version defaults to the
/// language's recommended version, falling back to the first listed; an
/// explicitly requested unknown version yields `VersionNotFound`.
pub fn resolve_doc_path(
    entry: &TaggedEntry,
    language: Option<&str>,
    version: Option<&str>,
) -> Option<ResolvedPath> {
    match &entry.kind {
        EntryKind::Skill(s) => {
            // An empty skill path means there is no content to resolve.
            if s.path.is_empty() {
                return None;
            }
            Some(ResolvedPath::Ok {
                source: entry.source_obj.clone(),
                path: s.path.clone(),
                files: s.files.clone(),
            })
        }
        EntryKind::Doc(d) => {
            let lang = language.map(normalize_language);

            let lang_obj = if let Some(ref lang) = lang {
                d.languages.iter().find(|l| l.language == *lang)
            } else {
                // No language given: the caller must pick one of these.
                return Some(ResolvedPath::NeedsLanguage {
                    available: d.languages.iter().map(|l| l.language.clone()).collect(),
                });
            };

            // Unknown language: nothing to resolve.
            let lang_obj = lang_obj?;

            let ver_obj = if let Some(version) = version {
                match lang_obj.versions.iter().find(|v| v.version == version) {
                    Some(v) => v,
                    None => {
                        return Some(ResolvedPath::VersionNotFound {
                            requested: version.to_string(),
                            available: lang_obj
                                .versions
                                .iter()
                                .map(|v| v.version.clone())
                                .collect(),
                        })
                    }
                }
            } else {
                // Default to the recommended version, else the first listed;
                // `?` bails out when the language has no versions at all.
                let rec = &lang_obj.recommended_version;
                lang_obj
                    .versions
                    .iter()
                    .find(|v| v.version == *rec)
                    .or(lang_obj.versions.first())?
            };

            // An empty version path means the content is unavailable.
            if ver_obj.path.is_empty() {
                return None;
            }

            Some(ResolvedPath::Ok {
                source: entry.source_obj.clone(),
                path: ver_obj.path.clone(),
                files: ver_obj.files.clone(),
            })
        }
    }
}
827
828pub fn resolve_entry_file(
829 resolved: &ResolvedPath,
830 entry_type: &str,
831) -> Option<(String, String, Vec<String>)> {
832 match resolved {
833 ResolvedPath::Ok { path, files, .. } => {
834 let file_name = if entry_type == "skill" {
835 "SKILL.md"
836 } else {
837 "DOC.md"
838 };
839 Some((
840 format!("{}/{}", path, file_name),
841 path.clone(),
842 files.clone(),
843 ))
844 }
845 _ => None,
846 }
847}