// llm_wiki/search.rs

1use std::cmp::Reverse;
2use std::collections::HashMap;
3
4use anyhow::{Context, Result};
5use serde::{Deserialize, Serialize};
6use tantivy::{
7    DocId, Order, Score, Searcher, Term,
8    collector::{Count, TopDocs},
9    query::{AllQuery, BooleanQuery, Occur, QueryParser, TermQuery},
10    schema::{IndexRecordOption, Value},
11    snippet::{Snippet, SnippetGenerator},
12};
13
14use crate::config::SearchConfig;
15use crate::index_schema::IndexSchema;
16
17// ── Return types ──────────────────────────────────────────────────────────────
18
19/// A single search result with BM25 score and optional highlighted excerpt.
20#[derive(Debug, Clone, Serialize, Deserialize)]
21pub struct PageRef {
22    /// Page slug (repository-relative path without extension).
23    pub slug: String,
24    /// Fully-qualified `wiki://` URI for the page.
25    pub uri: String,
26    /// Page title from frontmatter.
27    pub title: String,
28    /// Adjusted BM25 score (multiplied by status and confidence).
29    pub score: f32,
30    /// Frontmatter `confidence` value in [0, 1].
31    pub confidence: f32,
32    /// HTML-highlighted body excerpt, if requested.
33    pub excerpt: Option<String>,
34    /// Frontmatter `summary` field, if present.
35    #[serde(default, skip_serializing_if = "Option::is_none")]
36    pub summary: Option<String>,
37}
38
39/// Lightweight page metadata returned by listing operations.
40#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct PageSummary {
42    /// Page slug.
43    pub slug: String,
44    /// Fully-qualified `wiki://` URI.
45    pub uri: String,
46    /// Page title from frontmatter.
47    pub title: String,
48    /// Page type from frontmatter.
49    pub r#type: String,
50    /// Page status from frontmatter.
51    pub status: String,
52    /// Tags from frontmatter.
53    pub tags: Vec<String>,
54    /// Frontmatter `confidence` value in [0, 1].
55    pub confidence: f32,
56    /// Frontmatter `summary` field, if present.
57    #[serde(default, skip_serializing_if = "Option::is_none")]
58    pub summary: Option<String>,
59}
60
61/// A paginated list of pages with facet counts.
62#[derive(Debug, Clone, Serialize, Deserialize)]
63pub struct PageList {
64    /// Pages in the current page window.
65    pub pages: Vec<PageSummary>,
66    /// Total pages matching the filter (across all pages).
67    pub total: usize,
68    /// Current 1-based page number.
69    pub page: usize,
70    /// Number of items per page.
71    pub page_size: usize,
72    /// Facet counts for type, status, and tags.
73    #[serde(default, skip_serializing_if = "FacetCounts::is_empty")]
74    pub facets: FacetCounts,
75}
76
77// ── Facets ────────────────────────────────────────────────────────────────────
78
79/// Distribution counts for type, status, and tags.
80#[derive(Debug, Clone, Default, Serialize, Deserialize)]
81pub struct FacetCounts {
82    /// Count of pages per frontmatter type.
83    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
84    pub r#type: HashMap<String, u64>,
85    /// Count of pages per frontmatter status.
86    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
87    pub status: HashMap<String, u64>,
88    /// Count of pages per tag.
89    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
90    pub tags: HashMap<String, u64>,
91}
92
93impl FacetCounts {
94    /// Return true if all three facet maps are empty.
95    pub fn is_empty(&self) -> bool {
96        self.r#type.is_empty() && self.status.is_empty() && self.tags.is_empty()
97    }
98}
99
100/// The full result of a search query including ranked results and facets.
101#[derive(Debug, Clone, Serialize, Deserialize)]
102pub struct SearchResult {
103    /// Ranked search results.
104    pub results: Vec<PageRef>,
105    /// Facet counts for the result set.
106    pub facets: FacetCounts,
107}
108
109// ── Options ───────────────────────────────────────────────────────────────────
110
111/// Options for a BM25 search query.
112pub struct SearchOptions {
113    /// Omit HTML excerpt from results when true.
114    pub no_excerpt: bool,
115    /// Include section index pages in results when true.
116    pub include_sections: bool,
117    /// Maximum number of results to return.
118    pub top_k: usize,
119    /// Optional frontmatter type filter.
120    pub r#type: Option<String>,
121    /// Maximum tag facet values to return (0 = all).
122    pub facets_top_tags: usize,
123    /// Status score multiplier config applied to BM25 scores.
124    pub search_config: SearchConfig,
125}
126
127impl Default for SearchOptions {
128    fn default() -> Self {
129        Self {
130            no_excerpt: false,
131            include_sections: false,
132            top_k: 10,
133            r#type: None,
134            facets_top_tags: 10,
135            search_config: SearchConfig::default(),
136        }
137    }
138}
139
/// Options for a paginated page list operation.
///
/// Every field is a plain standard-library value, so the common traits are
/// derived to let callers log, copy, and compare option sets.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ListOptions {
    /// Optional frontmatter type filter.
    pub r#type: Option<String>,
    /// Optional frontmatter status filter.
    pub status: Option<String>,
    /// 1-based page number.
    pub page: usize,
    /// Number of items per page.
    pub page_size: usize,
    /// Maximum tag facet values to return (0 = all).
    pub facets_top_tags: usize,
}
153
154impl Default for ListOptions {
155    fn default() -> Self {
156        Self {
157            r#type: None,
158            status: None,
159            page: 1,
160            page_size: 20,
161            facets_top_tags: 10,
162        }
163    }
164}
165
166// ── search ────────────────────────────────────────────────────────────────────
167
168/// Run a BM25 full-text search against a single wiki's index.
169pub fn search(
170    query_str: &str,
171    options: &SearchOptions,
172    searcher: &Searcher,
173    wiki_name: &str,
174    is: &IndexSchema,
175) -> Result<SearchResult> {
176    let f_slug = is.field("slug");
177    let f_title = is.field("title");
178    let f_summary = is.try_field("summary");
179    let f_body = is.field("body");
180    let f_type = is.field("type");
181
182    let index = searcher.index();
183    let mut query_fields = vec![f_title, f_body];
184    if let Some(f) = f_summary {
185        query_fields.insert(1, f);
186    }
187    let query_parser = QueryParser::for_index(index, query_fields);
188    let parsed = query_parser
189        .parse_query(query_str)
190        .with_context(|| format!("failed to parse query: {query_str}"))?;
191
192    // Build the filtered query (with type filter)
193    let final_query: Box<dyn tantivy::query::Query> = {
194        let mut clauses: Vec<(Occur, Box<dyn tantivy::query::Query>)> = Vec::new();
195        clauses.push((Occur::Must, parsed));
196
197        if !options.include_sections {
198            clauses.push((
199                Occur::MustNot,
200                Box::new(TermQuery::new(
201                    Term::from_field_text(f_type, "section"),
202                    IndexRecordOption::Basic,
203                )),
204            ));
205        }
206
207        if let Some(ref type_filter) = options.r#type {
208            clauses.push((
209                Occur::Must,
210                Box::new(TermQuery::new(
211                    Term::from_field_text(f_type, type_filter),
212                    IndexRecordOption::Basic,
213                )),
214            ));
215        }
216
217        Box::new(BooleanQuery::new(clauses))
218    };
219
220    let sc = options.search_config.clone();
221    let has_confidence = is.try_field("confidence").is_some();
222    let collector = TopDocs::with_limit(options.top_k).tweak_score(
223        move |segment_reader: &tantivy::SegmentReader| {
224            let status_col = segment_reader.fast_fields().str("status").ok().flatten();
225            let conf_col = if has_confidence {
226                segment_reader.fast_fields().f64("confidence").ok()
227            } else {
228                None
229            };
230            let status_map = sc.status.clone();
231            move |doc: DocId, score: Score| {
232                let unknown_mult = status_map.get("unknown").copied().unwrap_or(0.9);
233                let status_mult = match &status_col {
234                    Some(col) => match col.term_ords(doc).next() {
235                        Some(ord) => {
236                            let mut buf = String::new();
237                            col.ord_to_str(ord, &mut buf).ok();
238                            status_map
239                                .get(buf.as_str())
240                                .copied()
241                                .unwrap_or(unknown_mult)
242                        }
243                        None => unknown_mult,
244                    },
245                    None => unknown_mult,
246                };
247                let confidence = conf_col.as_ref().and_then(|c| c.first(doc)).unwrap_or(0.5) as f32;
248                score * status_mult * confidence
249            }
250        },
251    );
252    let top_docs = searcher.search(&final_query, &collector)?;
253
254    let snippet_gen = if !options.no_excerpt {
255        Some(SnippetGenerator::create(searcher, &final_query, f_body)?)
256    } else {
257        None
258    };
259
260    let f_confidence = is.try_field("confidence");
261
262    let mut results = Vec::new();
263    for (score, doc_addr) in top_docs {
264        let doc: tantivy::TantivyDocument = searcher.doc(doc_addr)?;
265
266        let slug = doc
267            .get_first(f_slug)
268            .and_then(|v| v.as_str())
269            .unwrap_or("")
270            .to_string();
271        let title = doc
272            .get_first(f_title)
273            .and_then(|v| v.as_str())
274            .unwrap_or("")
275            .to_string();
276        let uri = format!("wiki://{wiki_name}/{slug}");
277
278        let confidence = f_confidence
279            .and_then(|f| doc.get_first(f))
280            .and_then(|v| v.as_f64())
281            .unwrap_or(0.5) as f32;
282
283        let excerpt = snippet_gen.as_ref().map(|sg| {
284            let snippet: Snippet = sg.snippet_from_doc(&doc);
285            snippet.to_html()
286        });
287
288        let summary = f_summary
289            .and_then(|f| doc.get_first(f))
290            .and_then(|v| v.as_str())
291            .filter(|s| !s.is_empty())
292            .map(|s| s.to_string());
293
294        results.push(PageRef {
295            slug,
296            uri,
297            title,
298            score,
299            confidence,
300            excerpt,
301            summary,
302        });
303    }
304
305    // Facets: type is unfiltered, status and tags are filtered
306    // Re-parse query for the unfiltered facet query
307    let unfiltered_query: Box<dyn tantivy::query::Query> = {
308        let parsed2 = query_parser
309            .parse_query(query_str)
310            .with_context(|| format!("failed to parse query: {query_str}"))?;
311        let mut clauses: Vec<(Occur, Box<dyn tantivy::query::Query>)> = Vec::new();
312        clauses.push((Occur::Must, parsed2));
313        if !options.include_sections {
314            clauses.push((
315                Occur::MustNot,
316                Box::new(TermQuery::new(
317                    Term::from_field_text(f_type, "section"),
318                    IndexRecordOption::Basic,
319                )),
320            ));
321        }
322        Box::new(BooleanQuery::new(clauses))
323    };
324
325    let type_facet = collect_facet(searcher, &unfiltered_query, is, "type", 0)?;
326    let status_facet = collect_facet(searcher, &final_query, is, "status", 0)?;
327    let tags_facet = collect_facet(searcher, &final_query, is, "tags", options.facets_top_tags)?;
328
329    Ok(SearchResult {
330        results,
331        facets: FacetCounts {
332            r#type: type_facet,
333            status: status_facet,
334            tags: tags_facet,
335        },
336    })
337}
338
339// ── list ──────────────────────────────────────────────────────────────────────
340
341/// Return a paginated list of pages from the index, sorted alphabetically by slug.
342pub fn list(
343    options: &ListOptions,
344    searcher: &Searcher,
345    wiki_name: &str,
346    is: &IndexSchema,
347) -> Result<PageList> {
348    let f_slug = is.field("slug");
349    let f_title = is.field("title");
350    let f_type = is.field("type");
351    let f_status = is.field("status");
352    let f_tags = is.field("tags");
353    let f_confidence = is.try_field("confidence");
354    let f_summary = is.try_field("summary");
355
356    let query: Box<dyn tantivy::query::Query> = {
357        let mut clauses: Vec<(Occur, Box<dyn tantivy::query::Query>)> = Vec::new();
358
359        if let Some(ref type_filter) = options.r#type {
360            clauses.push((
361                Occur::Must,
362                Box::new(TermQuery::new(
363                    Term::from_field_text(f_type, type_filter),
364                    IndexRecordOption::Basic,
365                )),
366            ));
367        }
368
369        if let Some(ref status_filter) = options.status {
370            clauses.push((
371                Occur::Must,
372                Box::new(TermQuery::new(
373                    Term::from_field_text(f_status, status_filter),
374                    IndexRecordOption::Basic,
375                )),
376            ));
377        }
378
379        if clauses.is_empty() {
380            Box::new(AllQuery)
381        } else {
382            Box::new(BooleanQuery::new(clauses))
383        }
384    };
385
386    // Unfiltered query for type facet (no type/status filter)
387    let unfiltered_query: Box<dyn tantivy::query::Query> = Box::new(AllQuery);
388
389    // Count total matches
390    let total = searcher.search(&query, &Count)?;
391    if total == 0 {
392        // Still collect facets even with no results in the page window
393        let type_facet = collect_facet(searcher, &unfiltered_query, is, "type", 0)?;
394        let status_facet = collect_facet(searcher, &query, is, "status", 0)?;
395        let tags_facet = collect_facet(searcher, &query, is, "tags", options.facets_top_tags)?;
396        return Ok(PageList {
397            pages: Vec::new(),
398            total: 0,
399            page: options.page,
400            page_size: options.page_size,
401            facets: FacetCounts {
402                r#type: type_facet,
403                status: status_facet,
404                tags: tags_facet,
405            },
406        });
407    }
408
409    // Fetch sorted by _slug_ord, limited to offset + page_size
410    let page = options.page;
411    let page_size = options.page_size;
412    let offset = (page - 1) * page_size;
413    let limit = offset + page_size;
414
415    let sorted_docs = searcher.search(
416        &query,
417        &TopDocs::with_limit(limit).order_by_string_fast_field("slug", Order::Asc),
418    )?;
419
420    // Extract full fields only for the page window
421    let window = if offset < sorted_docs.len() {
422        &sorted_docs[offset..]
423    } else {
424        &[]
425    };
426
427    let mut summaries = Vec::with_capacity(window.len());
428    for (_slug_val, doc_addr) in window {
429        let doc: tantivy::TantivyDocument = searcher.doc(*doc_addr)?;
430
431        let slug = doc
432            .get_first(f_slug)
433            .and_then(|v| v.as_str())
434            .unwrap_or("")
435            .to_string();
436        let title = doc
437            .get_first(f_title)
438            .and_then(|v| v.as_str())
439            .unwrap_or("")
440            .to_string();
441        let page_type = doc
442            .get_first(f_type)
443            .and_then(|v| v.as_str())
444            .unwrap_or("")
445            .to_string();
446        let status = doc
447            .get_first(f_status)
448            .and_then(|v| v.as_str())
449            .unwrap_or("")
450            .to_string();
451        let tags_str = doc
452            .get_first(f_tags)
453            .and_then(|v| v.as_str())
454            .unwrap_or("")
455            .to_string();
456        let tags: Vec<String> = tags_str
457            .split_whitespace()
458            .filter(|s| !s.is_empty())
459            .map(|s| s.to_string())
460            .collect();
461
462        let confidence = f_confidence
463            .and_then(|f| doc.get_first(f))
464            .and_then(|v| v.as_f64())
465            .unwrap_or(0.5) as f32;
466
467        let summary = f_summary
468            .and_then(|f| doc.get_first(f))
469            .and_then(|v| v.as_str())
470            .filter(|s| !s.is_empty())
471            .map(|s| s.to_string());
472
473        let uri = format!("wiki://{wiki_name}/{slug}");
474
475        summaries.push(PageSummary {
476            slug,
477            uri,
478            title,
479            r#type: page_type,
480            status,
481            tags,
482            confidence,
483            summary,
484        });
485    }
486
487    Ok(PageList {
488        pages: summaries,
489        total,
490        page,
491        page_size,
492        facets: {
493            let type_facet = collect_facet(searcher, &unfiltered_query, is, "type", 0)?;
494            let status_facet = collect_facet(searcher, &query, is, "status", 0)?;
495            let tags_facet = collect_facet(searcher, &query, is, "tags", options.facets_top_tags)?;
496            FacetCounts {
497                r#type: type_facet,
498                status: status_facet,
499                tags: tags_facet,
500            }
501        },
502    })
503}
504
505// ── search_all ────────────────────────────────────────────────────────────────
506
507/// Search across multiple wikis, merge results by score, and truncate to `top_k`.
508pub fn search_all(
509    query_str: &str,
510    options: &SearchOptions,
511    wikis: &[(String, Searcher, &IndexSchema)],
512) -> Result<SearchResult> {
513    let mut all_results = Vec::new();
514    let mut merged_facets = FacetCounts::default();
515    for (name, searcher, is) in wikis {
516        match search(query_str, options, searcher, name, is) {
517            Ok(sr) => {
518                all_results.extend(sr.results);
519                for (k, v) in sr.facets.r#type {
520                    *merged_facets.r#type.entry(k).or_insert(0) += v;
521                }
522                for (k, v) in sr.facets.status {
523                    *merged_facets.status.entry(k).or_insert(0) += v;
524                }
525                for (k, v) in sr.facets.tags {
526                    *merged_facets.tags.entry(k).or_insert(0) += v;
527                }
528            }
529            Err(_) => continue,
530        }
531    }
532    all_results.sort_by(|a, b| {
533        b.score
534            .partial_cmp(&a.score)
535            .unwrap_or(std::cmp::Ordering::Equal)
536    });
537    all_results.truncate(options.top_k);
538
539    // Re-cap tags after merging
540    if options.facets_top_tags > 0 && merged_facets.tags.len() > options.facets_top_tags {
541        let mut entries: Vec<_> = merged_facets.tags.into_iter().collect();
542        entries.sort_by_key(|e| Reverse(e.1));
543        entries.truncate(options.facets_top_tags);
544        merged_facets.tags = entries.into_iter().collect();
545    }
546
547    Ok(SearchResult {
548        results: all_results,
549        facets: merged_facets,
550    })
551}
552
553// ── Facet collection ──────────────────────────────────────────────────────────
554
555/// Collect term frequency counts for a keyword FAST field across matching docs.
556/// If `top_n` is 0, return all values. Otherwise return the top N by count.
557fn collect_facet(
558    searcher: &Searcher,
559    query: &dyn tantivy::query::Query,
560    is: &IndexSchema,
561    field_name: &str,
562    top_n: usize,
563) -> Result<HashMap<String, u64>> {
564    let field = match is.try_field(field_name) {
565        Some(f) => f,
566        None => return Ok(HashMap::new()),
567    };
568
569    let doc_addrs = searcher.search(query, &tantivy::collector::DocSetCollector)?;
570    let mut counts: HashMap<String, u64> = HashMap::new();
571
572    for doc_addr in &doc_addrs {
573        let doc: tantivy::TantivyDocument = searcher.doc(*doc_addr)?;
574        for val in doc.get_all(field) {
575            if let Some(s) = val.as_str()
576                && !s.is_empty()
577            {
578                *counts.entry(s.to_string()).or_insert(0) += 1;
579            }
580        }
581    }
582
583    if top_n > 0 && counts.len() > top_n {
584        let mut entries: Vec<_> = counts.into_iter().collect();
585        entries.sort_by_key(|e| Reverse(e.1));
586        entries.truncate(top_n);
587        return Ok(entries.into_iter().collect());
588    }
589
590    Ok(counts)
591}
592
593// ── llms renderers ────────────────────────────────────────────────────────────
594
595/// Render a `PageList` as LLM-optimized markdown: pages grouped by type,
596/// one line per page with summary. Archived pages shown with strikethrough.
597pub fn render_list_llms(result: &PageList) -> String {
598    // Group by type, sorted by count desc then name asc
599    let mut by_type: std::collections::HashMap<String, Vec<&PageSummary>> =
600        std::collections::HashMap::new();
601    for page in &result.pages {
602        by_type.entry(page.r#type.clone()).or_default().push(page);
603    }
604    let mut groups: Vec<(String, Vec<&PageSummary>)> = by_type.into_iter().collect();
605    groups.sort_by(|a, b| b.1.len().cmp(&a.1.len()).then(a.0.cmp(&b.0)));
606
607    let mut out = String::new();
608    for (type_name, mut pages) in groups {
609        pages.sort_by(|a, b| {
610            b.confidence
611                .partial_cmp(&a.confidence)
612                .unwrap_or(std::cmp::Ordering::Equal)
613                .then(a.title.cmp(&b.title))
614        });
615        out.push_str(&format!("## {} ({})\n\n", type_name, pages.len()));
616        for page in pages {
617            let summary = page.summary.as_deref().unwrap_or("");
618            let line = if page.status == "archived" {
619                if summary.is_empty() {
620                    format!("- ~~[{}]({})~~\n", page.title, page.uri)
621                } else {
622                    format!("- ~~[{}]({}): {}~~\n", page.title, page.uri, summary)
623                }
624            } else if summary.is_empty() {
625                format!("- [{}]({})\n", page.title, page.uri)
626            } else {
627                format!("- [{}]({}): {}\n", page.title, page.uri, summary)
628            };
629            out.push_str(&line);
630        }
631        out.push('\n');
632    }
633
634    if result.total > result.page_size {
635        let total_pages = (result.total + result.page_size - 1) / result.page_size.max(1);
636        out.push_str(&format!(
637            "_Page {}/{} — {} total pages_\n",
638            result.page, total_pages, result.total
639        ));
640    }
641
642    out
643}
644
645/// Render a `SearchResult` as LLM-optimized markdown: one line per result
646/// with title, uri, and summary. No score, no excerpt block.
647pub fn render_search_llms(result: &SearchResult) -> String {
648    if result.results.is_empty() {
649        return "No results found.\n".to_string();
650    }
651    let mut out = String::new();
652    for r in &result.results {
653        let summary = r.summary.as_deref().unwrap_or("");
654        if summary.is_empty() {
655            out.push_str(&format!("- [{}]({})\n", r.title, r.uri));
656        } else {
657            out.push_str(&format!("- [{}]({}): {}\n", r.title, r.uri, summary));
658        }
659    }
660    out
661}