1use std::collections::{BTreeSet, HashMap, HashSet};
2
3use crate::cache::{load_search_index, load_source_registry};
4use crate::config::{load_config, SourceConfig};
5use crate::normalize::normalize_language;
6use crate::search::bm25::{self, build_index_from_documents};
7use crate::search::tokenizer::{compact_identifier, tokenize};
8use crate::types::{DocEntry, SearchIndex, SkillEntry};
9
/// A registry entry (doc or skill) tagged with the source it came from.
///
/// Serializes with the inner entry's own fields flattened at the top
/// level, plus `_source`/`_type` metadata; the full `SourceConfig` is
/// kept for later path resolution but never serialized.
#[derive(Debug, Clone, serde::Serialize)]
pub struct TaggedEntry {
    /// The wrapped entry, serialized inline via `#[serde(flatten)]`.
    #[serde(flatten)]
    pub kind: EntryKind,
    /// Name of the source this entry was loaded from (serialized as `_source`).
    #[serde(rename = "_source")]
    pub source_name: String,
    /// Either `"doc"` or `"skill"` (serialized as `_type`); set in `load_merged`.
    #[serde(rename = "_type")]
    pub entry_type: &'static str,
    /// Full source configuration, used by `resolve_doc_path`; not serialized.
    #[serde(skip)]
    pub source_obj: SourceConfig,
}
26
/// The two kinds of registry entries. Serialized untagged, so the inner
/// entry's fields appear directly in the output with no variant wrapper.
#[derive(Debug, Clone, serde::Serialize)]
#[serde(untagged)]
pub enum EntryKind {
    Doc(DocEntry),
    Skill(SkillEntry),
}
33
34impl TaggedEntry {
35 pub fn id(&self) -> &str {
36 match &self.kind {
37 EntryKind::Doc(d) => &d.id,
38 EntryKind::Skill(s) => &s.id,
39 }
40 }
41
42 pub fn name(&self) -> &str {
43 match &self.kind {
44 EntryKind::Doc(d) => &d.name,
45 EntryKind::Skill(s) => &s.name,
46 }
47 }
48
49 pub fn description(&self) -> &str {
50 match &self.kind {
51 EntryKind::Doc(d) => &d.description,
52 EntryKind::Skill(s) => &s.description,
53 }
54 }
55
56 pub fn tags(&self) -> &[String] {
57 match &self.kind {
58 EntryKind::Doc(d) => &d.tags,
59 EntryKind::Skill(s) => &s.tags,
60 }
61 }
62
63 pub fn source_quality(&self) -> Option<&str> {
64 match &self.kind {
65 EntryKind::Doc(d) => Some(&d.source),
66 EntryKind::Skill(s) => Some(&s.source),
67 }
68 }
69
70 pub fn languages(&self) -> Option<&[crate::types::LanguageEntry]> {
71 match &self.kind {
72 EntryKind::Doc(d) => Some(&d.languages),
73 EntryKind::Skill(_) => None,
74 }
75 }
76
77 pub fn as_doc(&self) -> Option<&DocEntry> {
78 match &self.kind {
79 EntryKind::Doc(d) => Some(d),
80 _ => None,
81 }
82 }
83
84 pub fn as_skill(&self) -> Option<&SkillEntry> {
85 match &self.kind {
86 EntryKind::Skill(s) => Some(s),
87 _ => None,
88 }
89 }
90}
91
/// All configured sources' registries merged into a single view.
#[derive(Debug)]
pub struct MergedRegistry {
    /// Doc entries from every loaded source, tagged with their origin.
    pub docs: Vec<TaggedEntry>,
    /// Skill entries from every loaded source, tagged with their origin.
    pub skills: Vec<TaggedEntry>,
    /// Combined BM25 search index, if any source shipped one.
    pub search_index: Option<SearchIndex>,
}
103
/// Builds the namespaced key `"<source>:<entry id>"` used for search-index
/// and deduplication lookups.
fn search_lookup_id(source: &str, entry_id: &str) -> String {
    let mut key = String::with_capacity(source.len() + entry_id.len() + 1);
    key.push_str(source);
    key.push(':');
    key.push_str(entry_id);
    key
}
108
/// Collapses every run of whitespace to a single space and trims both ends.
fn normalize_query(query: &str) -> String {
    let mut normalized = String::with_capacity(query.len());
    for word in query.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
113
114fn namespace_search_index(mut index: SearchIndex, source_name: &str) -> SearchIndex {
116 for doc in &mut index.documents {
117 doc.id = search_lookup_id(source_name, &doc.id);
118 }
119 index.inverted_index = None;
121 index
122}
123
124pub fn load_merged() -> MergedRegistry {
126 let config = load_config();
127 let mut all_docs = Vec::new();
128 let mut all_skills = Vec::new();
129 let mut search_indexes = Vec::new();
130
131 for source in &config.sources {
132 let registry = match load_source_registry(source) {
133 Some(r) => r,
134 None => continue,
135 };
136
137 if let Some(idx) = load_search_index(source) {
138 search_indexes.push(namespace_search_index(idx, &source.name));
139 }
140
141 for doc in registry.docs {
142 all_docs.push(TaggedEntry {
143 kind: EntryKind::Doc(doc),
144 source_name: source.name.clone(),
145 entry_type: "doc",
146 source_obj: source.clone(),
147 });
148 }
149
150 for skill in registry.skills {
151 all_skills.push(TaggedEntry {
152 kind: EntryKind::Skill(skill),
153 source_name: source.name.clone(),
154 entry_type: "skill",
155 source_obj: source.clone(),
156 });
157 }
158 }
159
160 let search_index = merge_search_indexes(search_indexes);
162
163 MergedRegistry {
164 docs: all_docs,
165 skills: all_skills,
166 search_index,
167 }
168}
169
170fn merge_search_indexes(indexes: Vec<SearchIndex>) -> Option<SearchIndex> {
171 if indexes.is_empty() {
172 return None;
173 }
174 if indexes.len() == 1 {
175 let single = indexes.into_iter().next().unwrap();
176 if single.inverted_index.is_some() {
178 return Some(single);
179 }
180 return Some(build_index_from_documents(single.documents, single.params));
181 }
182
183 let params = indexes[0].params.clone();
184 let all_documents: Vec<_> = indexes.into_iter().flat_map(|idx| idx.documents).collect();
185 Some(build_index_from_documents(all_documents, params))
186}
187
188fn get_all_entries(merged: &MergedRegistry) -> Vec<&TaggedEntry> {
193 merged.docs.iter().chain(merged.skills.iter()).collect()
194}
195
196fn apply_source_filter(entries: Vec<&TaggedEntry>) -> Vec<&TaggedEntry> {
197 let config = load_config();
198 let allowed: Vec<String> = config
199 .source
200 .split(',')
201 .map(|s| s.trim().to_lowercase())
202 .collect();
203 entries
204 .into_iter()
205 .filter(|e| {
206 e.source_quality()
207 .map(|s| allowed.contains(&s.to_lowercase()))
208 .unwrap_or(true)
209 })
210 .collect()
211}
212
213fn apply_filters<'a>(
214 entries: Vec<&'a TaggedEntry>,
215 filters: &SearchFilters,
216) -> Vec<&'a TaggedEntry> {
217 let mut result = entries;
218
219 if let Some(ref tags) = filters.tags {
220 let filter_tags: Vec<String> = tags.split(',').map(|t| t.trim().to_lowercase()).collect();
221 result.retain(|e| {
222 filter_tags
223 .iter()
224 .all(|ft| e.tags().iter().any(|t| t.to_lowercase() == *ft))
225 });
226 }
227
228 if let Some(ref lang) = filters.lang {
229 let normalized = normalize_language(lang);
230 result.retain(|e| {
231 e.languages()
232 .map(|langs| langs.iter().any(|l| l.language == normalized))
233 .unwrap_or(false)
234 });
235 }
236
237 if let Some(ref entry_type) = filters.entry_type {
238 result.retain(|e| e.entry_type == *entry_type);
239 }
240
241 result
242}
243
/// Optional filters applied by `apply_filters` to search/list results.
#[derive(Debug, Default)]
pub struct SearchFilters {
    /// Comma-separated tag list; entries must match every tag.
    pub tags: Option<String>,
    /// Language name, normalized before matching; excludes skills when set.
    pub lang: Option<String>,
    /// Entry-type filter: `"doc"` or `"skill"`.
    pub entry_type: Option<String>,
}
250
251pub fn is_multi_source() -> bool {
252 load_config().sources.len() > 1
253}
254
/// Banded Levenshtein distance over raw bytes.
///
/// Returns the exact edit distance when it is at most `max_distance`;
/// any returned value greater than `max_distance` means "too far apart"
/// and the exact distance may not have been computed. Operating on bytes
/// means distances are byte-based for non-ASCII input.
fn levenshtein_distance(a: &str, b: &str, max_distance: usize) -> usize {
    // Trivial cases first.
    if a == b {
        return 0;
    }
    if a.is_empty() {
        return b.len();
    }
    if b.is_empty() {
        return a.len();
    }

    // The distance is at least the difference in lengths; bail out when
    // even that lower bound blows the budget.
    let (longer, shorter) = if a.len() > b.len() {
        (a.len(), b.len())
    } else {
        (b.len(), a.len())
    };
    if longer - shorter > max_distance {
        return max_distance + 1;
    }

    let src = a.as_bytes();
    let dst = b.as_bytes();
    let width = dst.len() + 1;
    let mut prev_row: Vec<usize> = (0..width).collect();
    let mut curr_row = vec![0usize; width];

    for (i, &src_byte) in src.iter().enumerate() {
        curr_row[0] = i + 1;
        let mut best_in_row = curr_row[0];
        for (j, &dst_byte) in dst.iter().enumerate() {
            let substitution = prev_row[j] + usize::from(src_byte != dst_byte);
            let insertion = curr_row[j] + 1;
            let deletion = prev_row[j + 1] + 1;
            curr_row[j + 1] = substitution.min(insertion).min(deletion);
            best_in_row = best_in_row.min(curr_row[j + 1]);
        }
        // Band cutoff: once every cell in a row exceeds the budget the
        // final distance can never come back under it.
        if best_in_row > max_distance {
            return max_distance + 1;
        }
        std::mem::swap(&mut prev_row, &mut curr_row);
    }

    prev_row[dst.len()]
}
305
/// Score weights for the tiers of compact-identifier matching, from the
/// strongest match kind (exact) down to the weakest (fuzzy).
struct CompactWeights {
    /// Awarded when candidate and query are identical.
    exact: f64,
    /// Base score for a prefix relationship (either direction).
    prefix: f64,
    /// Base score for a substring relationship (either direction).
    contains: f64,
    /// Base score for Levenshtein matches, reduced per edit.
    fuzzy: f64,
}
312
313fn score_compact_candidate(
314 query_compact: &str,
315 candidate_compact: &str,
316 weights: &CompactWeights,
317) -> f64 {
318 if query_compact.is_empty() || candidate_compact.is_empty() {
319 return 0.0;
320 }
321 if candidate_compact == query_compact {
322 return weights.exact;
323 }
324 if query_compact.len() < 3 {
325 return 0.0;
326 }
327
328 let length_penalty = if candidate_compact.len() > query_compact.len() {
329 candidate_compact.len() - query_compact.len()
330 } else {
331 query_compact.len() - candidate_compact.len()
332 } as f64;
333
334 let min_len = candidate_compact.len().min(query_compact.len()) as f64;
335 let max_len = candidate_compact.len().max(query_compact.len()) as f64;
336 let length_ratio = min_len / max_len;
337
338 if (candidate_compact.starts_with(query_compact)
339 || query_compact.starts_with(candidate_compact))
340 && length_ratio >= 0.6
341 {
342 return (weights.prefix - length_penalty).max(0.0);
343 }
344
345 if (candidate_compact.contains(query_compact) || query_compact.contains(candidate_compact))
346 && length_ratio >= 0.75
347 {
348 return (weights.contains - length_penalty).max(0.0);
349 }
350
351 if query_compact.len() < 5 {
352 return 0.0;
353 }
354
355 let max_dist = if query_compact.len() <= 5 {
356 1
357 } else if query_compact.len() <= 8 {
358 2
359 } else {
360 3
361 };
362
363 let distance = levenshtein_distance(query_compact, candidate_compact, max_dist);
364 if distance > max_dist {
365 return 0.0;
366 }
367
368 (weights.fuzzy - (distance as f64 * 20.0) - length_penalty).max(0.0)
369}
370
371fn split_compact_segments(text: &str) -> Vec<String> {
372 let mut segments: BTreeSet<String> = BTreeSet::new();
373 for seg in text.split('/') {
374 let c = compact_identifier(seg);
375 if !c.is_empty() {
376 segments.insert(c);
377 }
378 }
379 for seg in text.split(&['/', '_', '.', ' ', '-'][..]) {
380 let c = compact_identifier(seg);
381 if !c.is_empty() {
382 segments.insert(c);
383 }
384 }
385 segments.into_iter().collect()
386}
387
/// Best compact-match score of `query_compact` against an entry's whole
/// name, whole id, and their individual segments.
///
/// Whole-name matches use the highest weight tier, then whole-id, then
/// id segments, then name segments, so a full match outranks a segment
/// match of the same kind. Id-segment hits also receive bonuses keyed
/// to the first/last elements of `id_segments`.
///
/// NOTE(review): `split_compact_segments` returns segments in sorted
/// (BTreeSet) order, so "first"/"last" below refer to lexicographic
/// order, not the id's original segment order — confirm that is the
/// intended behavior. Likewise, the equality bonuses compare the QUERY
/// to the first/last segment regardless of which segment produced
/// `seg_score`, so they apply to every matching segment.
fn score_entry_lexical_variant(entry: &TaggedEntry, query_compact: &str) -> f64 {
    // Single-byte queries are too ambiguous to score at all.
    if query_compact.len() < 2 {
        return 0.0;
    }

    let name_compact = compact_identifier(entry.name());
    let id_compact = compact_identifier(entry.id());
    let id_segments = split_compact_segments(entry.id());
    let name_segments = split_compact_segments(entry.name());

    let mut best = 0.0f64;

    // Whole name: strongest signal.
    best = best.max(score_compact_candidate(
        query_compact,
        &name_compact,
        &CompactWeights {
            exact: 620.0,
            prefix: 560.0,
            contains: 520.0,
            fuzzy: 500.0,
        },
    ));

    // Whole id: slightly below whole-name.
    best = best.max(score_compact_candidate(
        query_compact,
        &id_compact,
        &CompactWeights {
            exact: 600.0,
            prefix: 540.0,
            contains: 500.0,
            fuzzy: 470.0,
        },
    ));

    // Individual id segments, with positional/equality bonuses.
    for (idx, segment) in id_segments.iter().enumerate() {
        let seg_score = score_compact_candidate(
            query_compact,
            segment,
            &CompactWeights {
                exact: 580.0,
                prefix: 530.0,
                contains: 490.0,
                fuzzy: 460.0,
            },
        );
        if seg_score == 0.0 {
            continue;
        }

        let mut bonus = 0.0;
        // Small boost for the boundary segments of the (sorted) list.
        if idx == 0 {
            bonus += 10.0;
        }
        if idx == id_segments.len() - 1 {
            bonus += 10.0;
        }
        // Larger boosts when the query exactly equals a boundary segment.
        if query_compact == id_segments[0] {
            bonus += 60.0;
        }
        if query_compact == id_segments[id_segments.len() - 1] {
            bonus += 25.0;
        }
        // Extra boost when the query equals both boundary segments of a
        // multi-segment id.
        if id_segments.len() > 1
            && query_compact == id_segments[0]
            && query_compact == id_segments[id_segments.len() - 1]
        {
            bonus += 40.0;
        }

        best = best.max(seg_score + bonus);
    }

    // Name segments: lowest weight tier, no bonuses.
    for segment in &name_segments {
        best = best.max(score_compact_candidate(
            query_compact,
            segment,
            &CompactWeights {
                exact: 560.0,
                prefix: 520.0,
                contains: 480.0,
                fuzzy: 450.0,
            },
        ));
    }

    best
}
475
476fn score_entry_lexical_boost(
477 entry: &TaggedEntry,
478 normalized_query: &str,
479 rescue_terms: &[String],
480) -> f64 {
481 let mut query_compacts: Vec<String> = vec![compact_identifier(normalized_query)];
482 for term in rescue_terms {
483 let c = compact_identifier(term);
484 if !query_compacts.contains(&c) {
485 query_compacts.push(c);
486 }
487 }
488 query_compacts.retain(|c| c.len() >= 2);
489
490 let mut best = 0.0f64;
491 for qc in &query_compacts {
492 best = best.max(score_entry_lexical_variant(entry, qc));
493 }
494 best
495}
496
497fn get_missing_query_terms(normalized_query: &str, index: &SearchIndex) -> Vec<String> {
498 match &index.inverted_index {
499 Some(inv) => tokenize(normalized_query)
500 .into_iter()
501 .filter(|term| !inv.contains_key(term.as_str()))
502 .collect(),
503 None => vec![],
504 }
505}
506
507fn should_run_global_lexical_scan(
508 normalized_query: &str,
509 result_count: usize,
510 index: &Option<SearchIndex>,
511) -> bool {
512 let idx = match index {
513 Some(idx) => idx,
514 None => return true,
515 };
516
517 if result_count == 0 {
518 return true;
519 }
520 if idx.inverted_index.is_none() {
521 return false;
522 }
523
524 let query_terms = tokenize(normalized_query);
525 if query_terms.len() < 2 {
526 return false;
527 }
528
529 !get_missing_query_terms(normalized_query, idx).is_empty()
530}
531
/// Runs `query` against the merged registry and returns matches sorted
/// by descending score, after source/tag/language/type filtering.
///
/// Pipeline:
/// 1. Primary scoring — BM25 over the merged index when one exists,
///    otherwise a hand-rolled substring scorer over id/name/description/tags.
/// 2. A compact-identifier lexical pass that boosts existing hits or
///    rescues entries the primary pass missed.
/// 3. `apply_filters`, then a descending sort by score.
pub fn search_entries(
    query: &str,
    filters: &SearchFilters,
    merged: &MergedRegistry,
) -> Vec<TaggedEntry> {
    let normalized_query = normalize_query(query);
    let entries = apply_source_filter(get_all_entries(merged));

    // Deduplicate on "<source>:<id>", keeping the first occurrence.
    let mut seen = HashSet::new();
    let mut deduped = Vec::new();
    for entry in entries {
        let key = search_lookup_id(&entry.source_name, entry.id());
        if seen.insert(key) {
            deduped.push(entry);
        }
    }

    // Map namespaced id -> entry, used to resolve BM25 hits (index doc
    // ids were namespaced in `namespace_search_index`).
    let entry_by_key: HashMap<String, &TaggedEntry> = deduped
        .iter()
        .map(|e| (search_lookup_id(&e.source_name, e.id()), *e))
        .collect();

    // Empty query: plain filtered listing, no scoring.
    if normalized_query.is_empty() {
        let filtered = apply_filters(deduped, filters);
        return filtered.into_iter().cloned().collect();
    }

    let mut result_by_key: HashMap<String, (&TaggedEntry, f64)> = HashMap::new();

    if let Some(ref search_index) = merged.search_index {
        // Primary path: BM25; keep only strictly positive scores.
        let bm25_results = bm25::search(&normalized_query, search_index, None);
        for r in &bm25_results {
            if let Some(entry) = entry_by_key.get(&r.id) {
                let key = search_lookup_id(&entry.source_name, entry.id());
                if r.score > 0.0 {
                    result_by_key.insert(key, (*entry, r.score));
                }
            }
        }
    } else {
        // Fallback path: ad-hoc substring scoring when no index exists.
        let q = normalized_query.to_lowercase();
        let words: Vec<&str> = q.split_whitespace().collect();

        for entry in &deduped {
            let mut score = 0.0f64;
            let id_lower = entry.id().to_lowercase();
            let name_lower = entry.name().to_lowercase();

            // Whole-query hits on id/name outweigh the per-word hits below.
            if id_lower == q {
                score += 100.0;
            } else if id_lower.contains(&q) {
                score += 50.0;
            }

            if name_lower == q {
                score += 80.0;
            } else if name_lower.contains(&q) {
                score += 40.0;
            }

            // Per-word hits: tags weigh most, description least.
            for word in &words {
                if id_lower.contains(word) {
                    score += 10.0;
                }
                if name_lower.contains(word) {
                    score += 10.0;
                }
                if entry.description().to_lowercase().contains(word) {
                    score += 5.0;
                }
                if entry.tags().iter().any(|t| t.to_lowercase().contains(word)) {
                    score += 15.0;
                }
            }

            if score > 0.0 {
                let key = search_lookup_id(&entry.source_name, entry.id());
                result_by_key.insert(key, (*entry, score));
            }
        }
    }

    // Lexical pass scope: just the current hits, or — when a global scan
    // is warranted — every deduped entry.
    let lexical_candidates = if !should_run_global_lexical_scan(
        &normalized_query,
        result_by_key.len(),
        &merged.search_index,
    ) {
        result_by_key.values().map(|(e, _)| *e).collect::<Vec<_>>()
    } else {
        deduped.clone()
    };

    // Rescue terms: long query tokens the inverted index doesn't know;
    // only relevant once some results exist.
    let rescue_terms: Vec<String> = if !result_by_key.is_empty() {
        if let Some(ref idx) = merged.search_index {
            get_missing_query_terms(&normalized_query, idx)
                .into_iter()
                .filter(|t| t.len() >= 5)
                .collect()
        } else {
            vec![]
        }
    } else {
        vec![]
    };

    // Add the lexical boost onto existing scores, or admit new entries
    // on the strength of the boost alone.
    for entry in &lexical_candidates {
        let boost = score_entry_lexical_boost(entry, &normalized_query, &rescue_terms);
        if boost == 0.0 {
            continue;
        }

        let key = search_lookup_id(&entry.source_name, entry.id());
        if let Some(existing) = result_by_key.get_mut(&key) {
            existing.1 += boost;
        } else {
            result_by_key.insert(key, (*entry, boost));
        }
    }

    // Apply filters, then drop scored results that didn't survive them.
    // Pointer identity is sound here: both collections borrow the same
    // entries out of `merged`.
    let mut results: Vec<(&TaggedEntry, f64)> = result_by_key.into_values().collect();
    let filtered_entries: Vec<&TaggedEntry> = {
        let refs: Vec<&TaggedEntry> = results.iter().map(|(e, _)| *e).collect();
        apply_filters(refs, filters)
    };
    let filtered_set: HashSet<*const TaggedEntry> =
        filtered_entries.iter().map(|e| *e as *const _).collect();
    results.retain(|(e, _)| filtered_set.contains(&(*e as *const _)));

    // Descending by score; NaN-free scores make the fallback ordering moot.
    results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
    results.into_iter().map(|(e, _)| e.clone()).collect()
}
676
/// Result of resolving an id (optionally `"source:id"` namespaced) to a
/// single entry via `get_entry`.
pub struct EntryLookup {
    /// The matched entry, when exactly one match was found.
    pub entry: Option<TaggedEntry>,
    /// True when a bare id matched entries in several sources.
    pub ambiguous: bool,
    /// Namespaced `"source:id"` candidates, populated when ambiguous.
    pub alternatives: Vec<String>,
}
686
687pub fn get_entry(id_or_namespaced: &str, merged: &MergedRegistry) -> EntryLookup {
688 let normalized = normalize_query(id_or_namespaced);
689 let all = apply_source_filter(get_all_entries(merged));
690
691 if let Some(colon_idx) = normalized.find(':') {
693 let source_name = &normalized[..colon_idx];
694 let id = &normalized[colon_idx + 1..];
695 let entry = all
696 .into_iter()
697 .find(|e| e.source_name == source_name && e.id() == id)
698 .cloned();
699 return EntryLookup {
700 entry,
701 ambiguous: false,
702 alternatives: vec![],
703 };
704 }
705
706 let matches: Vec<&TaggedEntry> = all.into_iter().filter(|e| e.id() == normalized).collect();
708
709 match matches.len() {
710 0 => EntryLookup {
711 entry: None,
712 ambiguous: false,
713 alternatives: vec![],
714 },
715 1 => EntryLookup {
716 entry: Some(matches[0].clone()),
717 ambiguous: false,
718 alternatives: vec![],
719 },
720 _ => EntryLookup {
721 entry: None,
722 ambiguous: true,
723 alternatives: matches
724 .iter()
725 .map(|e| format!("{}:{}", e.source_name, e.id()))
726 .collect(),
727 },
728 }
729}
730
731pub fn list_entries(filters: &SearchFilters, merged: &MergedRegistry) -> Vec<TaggedEntry> {
733 let entries = apply_source_filter(get_all_entries(merged));
734
735 let mut seen = HashSet::new();
737 let mut deduped = Vec::new();
738 for entry in entries {
739 let key = format!("{}:{}", entry.source_name, entry.id());
740 if seen.insert(key) {
741 deduped.push(entry);
742 }
743 }
744
745 let filtered = apply_filters(deduped, filters);
746 filtered.into_iter().cloned().collect()
747}
748
/// Outcome of resolving an entry to its on-disk content location.
pub enum ResolvedPath {
    /// Resolution succeeded.
    Ok {
        /// Source configuration the content belongs to.
        source: SourceConfig,
        /// Base path of the resolved content (files live under it).
        path: String,
        /// Files available under `path`.
        files: Vec<String>,
        /// Content hash of the resolved entry, when recorded.
        content_hash: Option<String>,
    },
    /// The doc has several languages and none was requested; the caller
    /// must pick one of `available`.
    NeedsLanguage {
        available: Vec<String>,
    },
    /// The requested version does not exist for the selected language.
    VersionNotFound {
        requested: String,
        available: Vec<String>,
    },
}
769
/// Resolves an entry to its content path.
///
/// Skills resolve directly from their own path. Docs first select a
/// language — the requested one (normalized), the sole language if only
/// one exists, otherwise `NeedsLanguage` — then a version: the requested
/// one (else `VersionNotFound`), or the recommended version, falling
/// back to the first listed one.
///
/// Returns `None` when a requested language is unknown, a doc has no
/// languages/versions, or the resolved path is empty.
pub fn resolve_doc_path(
    entry: &TaggedEntry,
    language: Option<&str>,
    version: Option<&str>,
) -> Option<ResolvedPath> {
    match &entry.kind {
        EntryKind::Skill(s) => {
            // Skills carry a single fixed path; empty means unresolvable.
            if s.path.is_empty() {
                return None;
            }
            Some(ResolvedPath::Ok {
                source: entry.source_obj.clone(),
                path: s.path.clone(),
                files: s.files.clone(),
                content_hash: s.content_hash.clone(),
            })
        }
        EntryKind::Doc(d) => {
            let lang = language.map(normalize_language);

            // Language selection: explicit request, sole language, or
            // ask the caller to choose among the available ones.
            let lang_obj = if let Some(ref lang) = lang {
                d.languages.iter().find(|l| l.language == *lang)
            } else if d.languages.len() == 1 {
                d.languages.first()
            } else {
                return Some(ResolvedPath::NeedsLanguage {
                    available: d.languages.iter().map(|l| l.language.clone()).collect(),
                });
            };

            // Requested language absent (or doc has no languages at all).
            let lang_obj = lang_obj?;

            // Version selection: an explicit request must exist exactly;
            // otherwise prefer the recommended version, then the first.
            let ver_obj = if let Some(version) = version {
                match lang_obj.versions.iter().find(|v| v.version == version) {
                    Some(v) => v,
                    None => {
                        return Some(ResolvedPath::VersionNotFound {
                            requested: version.to_string(),
                            available: lang_obj
                                .versions
                                .iter()
                                .map(|v| v.version.clone())
                                .collect(),
                        })
                    }
                }
            } else {
                let rec = &lang_obj.recommended_version;
                lang_obj
                    .versions
                    .iter()
                    .find(|v| v.version == *rec)
                    .or(lang_obj.versions.first())?
            };

            if ver_obj.path.is_empty() {
                return None;
            }

            Some(ResolvedPath::Ok {
                source: entry.source_obj.clone(),
                path: ver_obj.path.clone(),
                files: ver_obj.files.clone(),
                content_hash: ver_obj.content_hash.clone(),
            })
        }
    }
}
838
839pub fn resolve_entry_file(
840 resolved: &ResolvedPath,
841 entry_type: &str,
842) -> Option<(String, String, Vec<String>)> {
843 match resolved {
844 ResolvedPath::Ok { path, files, .. } => {
845 let file_name = if entry_type == "skill" {
846 "SKILL.md"
847 } else {
848 "DOC.md"
849 };
850 Some((
851 format!("{}/{}", path, file_name),
852 path.clone(),
853 files.clone(),
854 ))
855 }
856 _ => None,
857 }
858}