use std::cmp::Reverse;
use std::collections::HashMap;
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use tantivy::{
DocId, Order, Score, Searcher, Term,
collector::{Count, TopDocs},
query::{AllQuery, BooleanQuery, Occur, QueryParser, TermQuery},
schema::{IndexRecordOption, Value},
snippet::{Snippet, SnippetGenerator},
};
use crate::config::SearchConfig;
use crate::index_schema::IndexSchema;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageRef {
pub slug: String,
pub uri: String,
pub title: String,
pub score: f32,
pub confidence: f32,
pub excerpt: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub summary: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageSummary {
pub slug: String,
pub uri: String,
pub title: String,
pub r#type: String,
pub status: String,
pub tags: Vec<String>,
pub confidence: f32,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub summary: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageList {
pub pages: Vec<PageSummary>,
pub total: usize,
pub page: usize,
pub page_size: usize,
#[serde(default, skip_serializing_if = "FacetCounts::is_empty")]
pub facets: FacetCounts,
}
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct FacetCounts {
#[serde(default, skip_serializing_if = "HashMap::is_empty")]
pub r#type: HashMap<String, u64>,
#[serde(default, skip_serializing_if = "HashMap::is_empty")]
pub status: HashMap<String, u64>,
#[serde(default, skip_serializing_if = "HashMap::is_empty")]
pub tags: HashMap<String, u64>,
}
impl FacetCounts {
pub fn is_empty(&self) -> bool {
self.r#type.is_empty() && self.status.is_empty() && self.tags.is_empty()
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
pub results: Vec<PageRef>,
pub facets: FacetCounts,
}
pub struct SearchOptions {
pub no_excerpt: bool,
pub include_sections: bool,
pub top_k: usize,
pub r#type: Option<String>,
pub facets_top_tags: usize,
pub search_config: SearchConfig,
}
impl Default for SearchOptions {
fn default() -> Self {
Self {
no_excerpt: false,
include_sections: false,
top_k: 10,
r#type: None,
facets_top_tags: 10,
search_config: SearchConfig::default(),
}
}
}
pub struct ListOptions {
pub r#type: Option<String>,
pub status: Option<String>,
pub page: usize,
pub page_size: usize,
pub facets_top_tags: usize,
}
impl Default for ListOptions {
fn default() -> Self {
Self {
r#type: None,
status: None,
page: 1,
page_size: 20,
facets_top_tags: 10,
}
}
}
pub fn search(
query_str: &str,
options: &SearchOptions,
searcher: &Searcher,
wiki_name: &str,
is: &IndexSchema,
) -> Result<SearchResult> {
let f_slug = is.field("slug");
let f_title = is.field("title");
let f_summary = is.try_field("summary");
let f_body = is.field("body");
let f_type = is.field("type");
let index = searcher.index();
let mut query_fields = vec![f_title, f_body];
if let Some(f) = f_summary {
query_fields.insert(1, f);
}
let query_parser = QueryParser::for_index(index, query_fields);
let parsed = query_parser
.parse_query(query_str)
.with_context(|| format!("failed to parse query: {query_str}"))?;
let final_query: Box<dyn tantivy::query::Query> = {
let mut clauses: Vec<(Occur, Box<dyn tantivy::query::Query>)> = Vec::new();
clauses.push((Occur::Must, parsed));
if !options.include_sections {
clauses.push((
Occur::MustNot,
Box::new(TermQuery::new(
Term::from_field_text(f_type, "section"),
IndexRecordOption::Basic,
)),
));
}
if let Some(ref type_filter) = options.r#type {
clauses.push((
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(f_type, type_filter),
IndexRecordOption::Basic,
)),
));
}
Box::new(BooleanQuery::new(clauses))
};
let sc = options.search_config.clone();
let has_confidence = is.try_field("confidence").is_some();
let collector = TopDocs::with_limit(options.top_k).tweak_score(
move |segment_reader: &tantivy::SegmentReader| {
let status_col = segment_reader.fast_fields().str("status").ok().flatten();
let conf_col = if has_confidence {
segment_reader.fast_fields().f64("confidence").ok()
} else {
None
};
let status_map = sc.status.clone();
move |doc: DocId, score: Score| {
let unknown_mult = status_map.get("unknown").copied().unwrap_or(0.9);
let status_mult = match &status_col {
Some(col) => match col.term_ords(doc).next() {
Some(ord) => {
let mut buf = String::new();
col.ord_to_str(ord, &mut buf).ok();
status_map
.get(buf.as_str())
.copied()
.unwrap_or(unknown_mult)
}
None => unknown_mult,
},
None => unknown_mult,
};
let confidence = conf_col.as_ref().and_then(|c| c.first(doc)).unwrap_or(0.5) as f32;
score * status_mult * confidence
}
},
);
let top_docs = searcher.search(&final_query, &collector)?;
let snippet_gen = if !options.no_excerpt {
Some(SnippetGenerator::create(searcher, &final_query, f_body)?)
} else {
None
};
let f_confidence = is.try_field("confidence");
let mut results = Vec::new();
for (score, doc_addr) in top_docs {
let doc: tantivy::TantivyDocument = searcher.doc(doc_addr)?;
let slug = doc
.get_first(f_slug)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let title = doc
.get_first(f_title)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let uri = format!("wiki://{wiki_name}/{slug}");
let confidence = f_confidence
.and_then(|f| doc.get_first(f))
.and_then(|v| v.as_f64())
.unwrap_or(0.5) as f32;
let excerpt = snippet_gen.as_ref().map(|sg| {
let snippet: Snippet = sg.snippet_from_doc(&doc);
snippet.to_html()
});
let summary = f_summary
.and_then(|f| doc.get_first(f))
.and_then(|v| v.as_str())
.filter(|s| !s.is_empty())
.map(|s| s.to_string());
results.push(PageRef {
slug,
uri,
title,
score,
confidence,
excerpt,
summary,
});
}
let unfiltered_query: Box<dyn tantivy::query::Query> = {
let parsed2 = query_parser
.parse_query(query_str)
.with_context(|| format!("failed to parse query: {query_str}"))?;
let mut clauses: Vec<(Occur, Box<dyn tantivy::query::Query>)> = Vec::new();
clauses.push((Occur::Must, parsed2));
if !options.include_sections {
clauses.push((
Occur::MustNot,
Box::new(TermQuery::new(
Term::from_field_text(f_type, "section"),
IndexRecordOption::Basic,
)),
));
}
Box::new(BooleanQuery::new(clauses))
};
let type_facet = collect_facet(searcher, &unfiltered_query, is, "type", 0)?;
let status_facet = collect_facet(searcher, &final_query, is, "status", 0)?;
let tags_facet = collect_facet(searcher, &final_query, is, "tags", options.facets_top_tags)?;
Ok(SearchResult {
results,
facets: FacetCounts {
r#type: type_facet,
status: status_facet,
tags: tags_facet,
},
})
}
pub fn list(
options: &ListOptions,
searcher: &Searcher,
wiki_name: &str,
is: &IndexSchema,
) -> Result<PageList> {
let f_slug = is.field("slug");
let f_title = is.field("title");
let f_type = is.field("type");
let f_status = is.field("status");
let f_tags = is.field("tags");
let f_confidence = is.try_field("confidence");
let f_summary = is.try_field("summary");
let query: Box<dyn tantivy::query::Query> = {
let mut clauses: Vec<(Occur, Box<dyn tantivy::query::Query>)> = Vec::new();
if let Some(ref type_filter) = options.r#type {
clauses.push((
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(f_type, type_filter),
IndexRecordOption::Basic,
)),
));
}
if let Some(ref status_filter) = options.status {
clauses.push((
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(f_status, status_filter),
IndexRecordOption::Basic,
)),
));
}
if clauses.is_empty() {
Box::new(AllQuery)
} else {
Box::new(BooleanQuery::new(clauses))
}
};
let unfiltered_query: Box<dyn tantivy::query::Query> = Box::new(AllQuery);
let total = searcher.search(&query, &Count)?;
if total == 0 {
let type_facet = collect_facet(searcher, &unfiltered_query, is, "type", 0)?;
let status_facet = collect_facet(searcher, &query, is, "status", 0)?;
let tags_facet = collect_facet(searcher, &query, is, "tags", options.facets_top_tags)?;
return Ok(PageList {
pages: Vec::new(),
total: 0,
page: options.page,
page_size: options.page_size,
facets: FacetCounts {
r#type: type_facet,
status: status_facet,
tags: tags_facet,
},
});
}
let page = options.page;
let page_size = options.page_size;
let offset = (page - 1) * page_size;
let limit = offset + page_size;
let sorted_docs = searcher.search(
&query,
&TopDocs::with_limit(limit).order_by_string_fast_field("slug", Order::Asc),
)?;
let window = if offset < sorted_docs.len() {
&sorted_docs[offset..]
} else {
&[]
};
let mut summaries = Vec::with_capacity(window.len());
for (_slug_val, doc_addr) in window {
let doc: tantivy::TantivyDocument = searcher.doc(*doc_addr)?;
let slug = doc
.get_first(f_slug)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let title = doc
.get_first(f_title)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let page_type = doc
.get_first(f_type)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let status = doc
.get_first(f_status)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let tags_str = doc
.get_first(f_tags)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let tags: Vec<String> = tags_str
.split_whitespace()
.filter(|s| !s.is_empty())
.map(|s| s.to_string())
.collect();
let confidence = f_confidence
.and_then(|f| doc.get_first(f))
.and_then(|v| v.as_f64())
.unwrap_or(0.5) as f32;
let summary = f_summary
.and_then(|f| doc.get_first(f))
.and_then(|v| v.as_str())
.filter(|s| !s.is_empty())
.map(|s| s.to_string());
let uri = format!("wiki://{wiki_name}/{slug}");
summaries.push(PageSummary {
slug,
uri,
title,
r#type: page_type,
status,
tags,
confidence,
summary,
});
}
Ok(PageList {
pages: summaries,
total,
page,
page_size,
facets: {
let type_facet = collect_facet(searcher, &unfiltered_query, is, "type", 0)?;
let status_facet = collect_facet(searcher, &query, is, "status", 0)?;
let tags_facet = collect_facet(searcher, &query, is, "tags", options.facets_top_tags)?;
FacetCounts {
r#type: type_facet,
status: status_facet,
tags: tags_facet,
}
},
})
}
pub fn search_all(
query_str: &str,
options: &SearchOptions,
wikis: &[(String, Searcher, &IndexSchema)],
) -> Result<SearchResult> {
let mut all_results = Vec::new();
let mut merged_facets = FacetCounts::default();
for (name, searcher, is) in wikis {
match search(query_str, options, searcher, name, is) {
Ok(sr) => {
all_results.extend(sr.results);
for (k, v) in sr.facets.r#type {
*merged_facets.r#type.entry(k).or_insert(0) += v;
}
for (k, v) in sr.facets.status {
*merged_facets.status.entry(k).or_insert(0) += v;
}
for (k, v) in sr.facets.tags {
*merged_facets.tags.entry(k).or_insert(0) += v;
}
}
Err(_) => continue,
}
}
all_results.sort_by(|a, b| {
b.score
.partial_cmp(&a.score)
.unwrap_or(std::cmp::Ordering::Equal)
});
all_results.truncate(options.top_k);
if options.facets_top_tags > 0 && merged_facets.tags.len() > options.facets_top_tags {
let mut entries: Vec<_> = merged_facets.tags.into_iter().collect();
entries.sort_by_key(|e| Reverse(e.1));
entries.truncate(options.facets_top_tags);
merged_facets.tags = entries.into_iter().collect();
}
Ok(SearchResult {
results: all_results,
facets: merged_facets,
})
}
fn collect_facet(
searcher: &Searcher,
query: &dyn tantivy::query::Query,
is: &IndexSchema,
field_name: &str,
top_n: usize,
) -> Result<HashMap<String, u64>> {
let field = match is.try_field(field_name) {
Some(f) => f,
None => return Ok(HashMap::new()),
};
let doc_addrs = searcher.search(query, &tantivy::collector::DocSetCollector)?;
let mut counts: HashMap<String, u64> = HashMap::new();
for doc_addr in &doc_addrs {
let doc: tantivy::TantivyDocument = searcher.doc(*doc_addr)?;
for val in doc.get_all(field) {
if let Some(s) = val.as_str()
&& !s.is_empty()
{
*counts.entry(s.to_string()).or_insert(0) += 1;
}
}
}
if top_n > 0 && counts.len() > top_n {
let mut entries: Vec<_> = counts.into_iter().collect();
entries.sort_by_key(|e| Reverse(e.1));
entries.truncate(top_n);
return Ok(entries.into_iter().collect());
}
Ok(counts)
}
pub fn render_list_llms(result: &PageList) -> String {
let mut by_type: std::collections::HashMap<String, Vec<&PageSummary>> =
std::collections::HashMap::new();
for page in &result.pages {
by_type.entry(page.r#type.clone()).or_default().push(page);
}
let mut groups: Vec<(String, Vec<&PageSummary>)> = by_type.into_iter().collect();
groups.sort_by(|a, b| b.1.len().cmp(&a.1.len()).then(a.0.cmp(&b.0)));
let mut out = String::new();
for (type_name, mut pages) in groups {
pages.sort_by(|a, b| {
b.confidence
.partial_cmp(&a.confidence)
.unwrap_or(std::cmp::Ordering::Equal)
.then(a.title.cmp(&b.title))
});
out.push_str(&format!("## {} ({})\n\n", type_name, pages.len()));
for page in pages {
let summary = page.summary.as_deref().unwrap_or("");
let line = if page.status == "archived" {
if summary.is_empty() {
format!("- ~~[{}]({})~~\n", page.title, page.uri)
} else {
format!("- ~~[{}]({}): {}~~\n", page.title, page.uri, summary)
}
} else if summary.is_empty() {
format!("- [{}]({})\n", page.title, page.uri)
} else {
format!("- [{}]({}): {}\n", page.title, page.uri, summary)
};
out.push_str(&line);
}
out.push('\n');
}
if result.total > result.page_size {
let total_pages = (result.total + result.page_size - 1) / result.page_size.max(1);
out.push_str(&format!(
"_Page {}/{} — {} total pages_\n",
result.page, total_pages, result.total
));
}
out
}
pub fn render_search_llms(result: &SearchResult) -> String {
if result.results.is_empty() {
return "No results found.\n".to_string();
}
let mut out = String::new();
for r in &result.results {
let summary = r.summary.as_deref().unwrap_or("");
if summary.is_empty() {
out.push_str(&format!("- [{}]({})\n", r.title, r.uri));
} else {
out.push_str(&format!("- [{}]({}): {}\n", r.title, r.uri, summary));
}
}
out
}