gobby-wiki 0.2.0

Gobby wiki CLI shell
use std::collections::BTreeMap;

use crate::search::{SearchSource, SearchSourceExplanation, WikiSearchResponse, WikiSearchResult};
use gobby_core::degradation::DegradationKind;

pub fn fuse_sources(
    bm25_hits: Vec<WikiSearchResult>,
    semantic_hits: Vec<WikiSearchResult>,
    graph_hits: Vec<WikiSearchResult>,
    degradations: Vec<DegradationKind>,
    limit: usize,
) -> WikiSearchResponse {
    if limit == 0 {
        return WikiSearchResponse {
            results: Vec::new(),
            degradations,
        };
    }

    let bm25_ids = ranked_ids(&bm25_hits);
    let semantic_ids = ranked_ids(&semantic_hits);
    let graph_ids = ranked_ids(&graph_hits);

    let mut by_id = BTreeMap::new();
    for hit in bm25_hits.into_iter().chain(semantic_hits).chain(graph_hits) {
        by_id.entry(hit.id.clone()).or_insert(hit);
    }

    let mut sources = Vec::new();
    if !bm25_ids.is_empty() {
        sources.push((SearchSource::Bm25.as_str(), bm25_ids));
    }
    if !graph_ids.is_empty() {
        sources.push((SearchSource::Graph.as_str(), graph_ids));
    }
    if !semantic_ids.is_empty() {
        sources.push((SearchSource::Semantic.as_str(), semantic_ids));
    }

    let mut results = gobby_core::search::rrf_merge(sources)
        .into_iter()
        .filter_map(|fused| {
            let Some(mut result) = by_id.remove(&fused.id) else {
                log::warn!("RRF returned id absent from source hit map: {}", fused.id);
                return None;
            };
            result.score = fused.score;
            result.sources = fused
                .sources
                .iter()
                .filter_map(|source| {
                    SearchSource::from_source_name(source).or_else(|| {
                        log::warn!("RRF returned unknown source name: {source}");
                        None
                    })
                })
                .collect();
            result.explanations = fused
                .explanations
                .iter()
                .filter_map(|explanation| {
                    let source =
                        SearchSource::from_source_name(&explanation.source).or_else(|| {
                            log::warn!(
                                "RRF returned unknown explanation source: {}",
                                explanation.source
                            );
                            None
                        })?;
                    Some(SearchSourceExplanation {
                        source,
                        rank: explanation.rank,
                        score: explanation.score,
                    })
                })
                .collect();
            Some(result)
        })
        .collect::<Vec<_>>();
    results.truncate(limit);

    WikiSearchResponse {
        results,
        degradations,
    }
}

fn ranked_ids(hits: &[WikiSearchResult]) -> Vec<String> {
    hits.iter().map(|hit| hit.id.clone()).collect()
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::search::{
        ChunkProvenance, SearchHitKind, SearchProvenance, SearchScope, SearchSource,
        SearchSourceExplanation,
    };

    #[test]
    fn fusion_preserves_sources() {
        let response = fuse_sources(
            vec![
                search_result("document:wiki/topics/rust.md", SearchSource::Bm25),
                search_result("document:wiki/topics/borrow.md", SearchSource::Bm25),
            ],
            vec![search_result(
                "document:wiki/topics/rust.md",
                SearchSource::Semantic,
            )],
            vec![search_result(
                "document:wiki/topics/rust.md",
                SearchSource::Graph,
            )],
            vec![DegradationKind::PartialSearch {
                available: vec!["bm25".to_string(), "graph".to_string()],
                unavailable: vec!["semantic".to_string()],
            }],
            10,
        );

        let fused = response
            .results
            .iter()
            .find(|result| result.id == "document:wiki/topics/rust.md")
            .expect("shared document is fused");

        assert_eq!(
            fused.sources,
            vec![
                SearchSource::Bm25,
                SearchSource::Graph,
                SearchSource::Semantic
            ]
        );
        assert_eq!(
            fused
                .explanations
                .iter()
                .map(|explanation| explanation.source)
                .collect::<Vec<_>>(),
            vec![
                SearchSource::Bm25,
                SearchSource::Graph,
                SearchSource::Semantic
            ]
        );
        assert!(
            response.degradations.iter().any(|degradation| {
                matches!(
                    degradation,
                    DegradationKind::PartialSearch {
                        available,
                        unavailable
                    } if available.as_slice() == ["bm25", "graph"]
                        && unavailable.as_slice() == ["semantic"]
                )
            }),
            "fusion should preserve degradation metadata"
        );
    }

    fn search_result(id: &str, source: SearchSource) -> WikiSearchResult {
        WikiSearchResult {
            id: id.to_string(),
            title: Some("Rust".to_string()),
            scope: SearchScope::project("project-1"),
            path: "wiki/topics/rust.md".into(),
            source_path: "raw/INDEX.md".into(),
            hit_kind: SearchHitKind::Document,
            snippet: "Ownership and borrowing".to_string(),
            score: 1.0,
            sources: vec![source],
            explanations: Vec::<SearchSourceExplanation>::new(),
            chunk: Some(ChunkProvenance {
                chunk_index: 0,
                byte_start: 0,
                byte_end: 24,
                heading: Some("Rust".to_string()),
            }),
            provenance: SearchProvenance {
                document_path: "wiki/topics/rust.md".into(),
                source_path: "raw/INDEX.md".into(),
                source_kind: "topic".to_string(),
                content_hash: Some("hash".to_string()),
            },
        }
    }
}