1use std::cmp::Reverse;
2use std::collections::HashMap;
3
4use anyhow::{Context, Result};
5use serde::{Deserialize, Serialize};
6use tantivy::{
7 DocId, Order, Score, Searcher, Term,
8 collector::{Count, TopDocs},
9 query::{AllQuery, BooleanQuery, Occur, QueryParser, TermQuery},
10 schema::{IndexRecordOption, Value},
11 snippet::{Snippet, SnippetGenerator},
12};
13
14use crate::config::SearchConfig;
15use crate::index_schema::IndexSchema;
16
17#[derive(Debug, Clone, Serialize, Deserialize)]
21pub struct PageRef {
22 pub slug: String,
24 pub uri: String,
26 pub title: String,
28 pub score: f32,
30 pub confidence: f32,
32 pub excerpt: Option<String>,
34 #[serde(default, skip_serializing_if = "Option::is_none")]
36 pub summary: Option<String>,
37}
38
39#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct PageSummary {
42 pub slug: String,
44 pub uri: String,
46 pub title: String,
48 pub r#type: String,
50 pub status: String,
52 pub tags: Vec<String>,
54 pub confidence: f32,
56 #[serde(default, skip_serializing_if = "Option::is_none")]
58 pub summary: Option<String>,
59}
60
61#[derive(Debug, Clone, Serialize, Deserialize)]
63pub struct PageList {
64 pub pages: Vec<PageSummary>,
66 pub total: usize,
68 pub page: usize,
70 pub page_size: usize,
72 #[serde(default, skip_serializing_if = "FacetCounts::is_empty")]
74 pub facets: FacetCounts,
75}
76
77#[derive(Debug, Clone, Default, Serialize, Deserialize)]
81pub struct FacetCounts {
82 #[serde(default, skip_serializing_if = "HashMap::is_empty")]
84 pub r#type: HashMap<String, u64>,
85 #[serde(default, skip_serializing_if = "HashMap::is_empty")]
87 pub status: HashMap<String, u64>,
88 #[serde(default, skip_serializing_if = "HashMap::is_empty")]
90 pub tags: HashMap<String, u64>,
91}
92
93impl FacetCounts {
94 pub fn is_empty(&self) -> bool {
96 self.r#type.is_empty() && self.status.is_empty() && self.tags.is_empty()
97 }
98}
99
100#[derive(Debug, Clone, Serialize, Deserialize)]
102pub struct SearchResult {
103 pub results: Vec<PageRef>,
105 pub facets: FacetCounts,
107}
108
109pub struct SearchOptions {
113 pub no_excerpt: bool,
115 pub include_sections: bool,
117 pub top_k: usize,
119 pub r#type: Option<String>,
121 pub facets_top_tags: usize,
123 pub search_config: SearchConfig,
125}
126
127impl Default for SearchOptions {
128 fn default() -> Self {
129 Self {
130 no_excerpt: false,
131 include_sections: false,
132 top_k: 10,
133 r#type: None,
134 facets_top_tags: 10,
135 search_config: SearchConfig::default(),
136 }
137 }
138}
139
/// Filters and pagination settings for a page listing.
///
/// `Debug` and `Clone` are derived so the options can be logged and reused
/// across calls, in line with the other public types of this module.
#[derive(Debug, Clone)]
pub struct ListOptions {
    /// Restricts the listing to documents with exactly this `type`, when set.
    pub r#type: Option<String>,
    /// Restricts the listing to documents with exactly this `status`, when set.
    pub status: Option<String>,
    /// Page number to return; the first page is `1`.
    pub page: usize,
    /// Number of entries per page.
    pub page_size: usize,
    /// Maximum number of entries kept in the tag facet; `0` keeps all of them.
    pub facets_top_tags: usize,
}
153
154impl Default for ListOptions {
155 fn default() -> Self {
156 Self {
157 r#type: None,
158 status: None,
159 page: 1,
160 page_size: 20,
161 facets_top_tags: 10,
162 }
163 }
164}
165
166pub fn search(
170 query_str: &str,
171 options: &SearchOptions,
172 searcher: &Searcher,
173 wiki_name: &str,
174 is: &IndexSchema,
175) -> Result<SearchResult> {
176 let f_slug = is.field("slug");
177 let f_title = is.field("title");
178 let f_summary = is.try_field("summary");
179 let f_body = is.field("body");
180 let f_type = is.field("type");
181
182 let index = searcher.index();
183 let mut query_fields = vec![f_title, f_body];
184 if let Some(f) = f_summary {
185 query_fields.insert(1, f);
186 }
187 let query_parser = QueryParser::for_index(index, query_fields);
188 let parsed = query_parser
189 .parse_query(query_str)
190 .with_context(|| format!("failed to parse query: {query_str}"))?;
191
192 let final_query: Box<dyn tantivy::query::Query> = {
194 let mut clauses: Vec<(Occur, Box<dyn tantivy::query::Query>)> = Vec::new();
195 clauses.push((Occur::Must, parsed));
196
197 if !options.include_sections {
198 clauses.push((
199 Occur::MustNot,
200 Box::new(TermQuery::new(
201 Term::from_field_text(f_type, "section"),
202 IndexRecordOption::Basic,
203 )),
204 ));
205 }
206
207 if let Some(ref type_filter) = options.r#type {
208 clauses.push((
209 Occur::Must,
210 Box::new(TermQuery::new(
211 Term::from_field_text(f_type, type_filter),
212 IndexRecordOption::Basic,
213 )),
214 ));
215 }
216
217 Box::new(BooleanQuery::new(clauses))
218 };
219
220 let sc = options.search_config.clone();
221 let has_confidence = is.try_field("confidence").is_some();
222 let collector = TopDocs::with_limit(options.top_k).tweak_score(
223 move |segment_reader: &tantivy::SegmentReader| {
224 let status_col = segment_reader.fast_fields().str("status").ok().flatten();
225 let conf_col = if has_confidence {
226 segment_reader.fast_fields().f64("confidence").ok()
227 } else {
228 None
229 };
230 let status_map = sc.status.clone();
231 move |doc: DocId, score: Score| {
232 let unknown_mult = status_map.get("unknown").copied().unwrap_or(0.9);
233 let status_mult = match &status_col {
234 Some(col) => match col.term_ords(doc).next() {
235 Some(ord) => {
236 let mut buf = String::new();
237 col.ord_to_str(ord, &mut buf).ok();
238 status_map
239 .get(buf.as_str())
240 .copied()
241 .unwrap_or(unknown_mult)
242 }
243 None => unknown_mult,
244 },
245 None => unknown_mult,
246 };
247 let confidence = conf_col.as_ref().and_then(|c| c.first(doc)).unwrap_or(0.5) as f32;
248 score * status_mult * confidence
249 }
250 },
251 );
252 let top_docs = searcher.search(&final_query, &collector)?;
253
254 let snippet_gen = if !options.no_excerpt {
255 Some(SnippetGenerator::create(searcher, &final_query, f_body)?)
256 } else {
257 None
258 };
259
260 let f_confidence = is.try_field("confidence");
261
262 let mut results = Vec::new();
263 for (score, doc_addr) in top_docs {
264 let doc: tantivy::TantivyDocument = searcher.doc(doc_addr)?;
265
266 let slug = doc
267 .get_first(f_slug)
268 .and_then(|v| v.as_str())
269 .unwrap_or("")
270 .to_string();
271 let title = doc
272 .get_first(f_title)
273 .and_then(|v| v.as_str())
274 .unwrap_or("")
275 .to_string();
276 let uri = format!("wiki://{wiki_name}/{slug}");
277
278 let confidence = f_confidence
279 .and_then(|f| doc.get_first(f))
280 .and_then(|v| v.as_f64())
281 .unwrap_or(0.5) as f32;
282
283 let excerpt = snippet_gen.as_ref().map(|sg| {
284 let snippet: Snippet = sg.snippet_from_doc(&doc);
285 snippet.to_html()
286 });
287
288 let summary = f_summary
289 .and_then(|f| doc.get_first(f))
290 .and_then(|v| v.as_str())
291 .filter(|s| !s.is_empty())
292 .map(|s| s.to_string());
293
294 results.push(PageRef {
295 slug,
296 uri,
297 title,
298 score,
299 confidence,
300 excerpt,
301 summary,
302 });
303 }
304
305 let unfiltered_query: Box<dyn tantivy::query::Query> = {
308 let parsed2 = query_parser
309 .parse_query(query_str)
310 .with_context(|| format!("failed to parse query: {query_str}"))?;
311 let mut clauses: Vec<(Occur, Box<dyn tantivy::query::Query>)> = Vec::new();
312 clauses.push((Occur::Must, parsed2));
313 if !options.include_sections {
314 clauses.push((
315 Occur::MustNot,
316 Box::new(TermQuery::new(
317 Term::from_field_text(f_type, "section"),
318 IndexRecordOption::Basic,
319 )),
320 ));
321 }
322 Box::new(BooleanQuery::new(clauses))
323 };
324
325 let type_facet = collect_facet(searcher, &unfiltered_query, is, "type", 0)?;
326 let status_facet = collect_facet(searcher, &final_query, is, "status", 0)?;
327 let tags_facet = collect_facet(searcher, &final_query, is, "tags", options.facets_top_tags)?;
328
329 Ok(SearchResult {
330 results,
331 facets: FacetCounts {
332 r#type: type_facet,
333 status: status_facet,
334 tags: tags_facet,
335 },
336 })
337}
338
339pub fn list(
343 options: &ListOptions,
344 searcher: &Searcher,
345 wiki_name: &str,
346 is: &IndexSchema,
347) -> Result<PageList> {
348 let f_slug = is.field("slug");
349 let f_title = is.field("title");
350 let f_type = is.field("type");
351 let f_status = is.field("status");
352 let f_tags = is.field("tags");
353 let f_confidence = is.try_field("confidence");
354 let f_summary = is.try_field("summary");
355
356 let query: Box<dyn tantivy::query::Query> = {
357 let mut clauses: Vec<(Occur, Box<dyn tantivy::query::Query>)> = Vec::new();
358
359 if let Some(ref type_filter) = options.r#type {
360 clauses.push((
361 Occur::Must,
362 Box::new(TermQuery::new(
363 Term::from_field_text(f_type, type_filter),
364 IndexRecordOption::Basic,
365 )),
366 ));
367 }
368
369 if let Some(ref status_filter) = options.status {
370 clauses.push((
371 Occur::Must,
372 Box::new(TermQuery::new(
373 Term::from_field_text(f_status, status_filter),
374 IndexRecordOption::Basic,
375 )),
376 ));
377 }
378
379 if clauses.is_empty() {
380 Box::new(AllQuery)
381 } else {
382 Box::new(BooleanQuery::new(clauses))
383 }
384 };
385
386 let unfiltered_query: Box<dyn tantivy::query::Query> = Box::new(AllQuery);
388
389 let total = searcher.search(&query, &Count)?;
391 if total == 0 {
392 let type_facet = collect_facet(searcher, &unfiltered_query, is, "type", 0)?;
394 let status_facet = collect_facet(searcher, &query, is, "status", 0)?;
395 let tags_facet = collect_facet(searcher, &query, is, "tags", options.facets_top_tags)?;
396 return Ok(PageList {
397 pages: Vec::new(),
398 total: 0,
399 page: options.page,
400 page_size: options.page_size,
401 facets: FacetCounts {
402 r#type: type_facet,
403 status: status_facet,
404 tags: tags_facet,
405 },
406 });
407 }
408
409 let page = options.page;
411 let page_size = options.page_size;
412 let offset = (page - 1) * page_size;
413 let limit = offset + page_size;
414
415 let sorted_docs = searcher.search(
416 &query,
417 &TopDocs::with_limit(limit).order_by_string_fast_field("slug", Order::Asc),
418 )?;
419
420 let window = if offset < sorted_docs.len() {
422 &sorted_docs[offset..]
423 } else {
424 &[]
425 };
426
427 let mut summaries = Vec::with_capacity(window.len());
428 for (_slug_val, doc_addr) in window {
429 let doc: tantivy::TantivyDocument = searcher.doc(*doc_addr)?;
430
431 let slug = doc
432 .get_first(f_slug)
433 .and_then(|v| v.as_str())
434 .unwrap_or("")
435 .to_string();
436 let title = doc
437 .get_first(f_title)
438 .and_then(|v| v.as_str())
439 .unwrap_or("")
440 .to_string();
441 let page_type = doc
442 .get_first(f_type)
443 .and_then(|v| v.as_str())
444 .unwrap_or("")
445 .to_string();
446 let status = doc
447 .get_first(f_status)
448 .and_then(|v| v.as_str())
449 .unwrap_or("")
450 .to_string();
451 let tags_str = doc
452 .get_first(f_tags)
453 .and_then(|v| v.as_str())
454 .unwrap_or("")
455 .to_string();
456 let tags: Vec<String> = tags_str
457 .split_whitespace()
458 .filter(|s| !s.is_empty())
459 .map(|s| s.to_string())
460 .collect();
461
462 let confidence = f_confidence
463 .and_then(|f| doc.get_first(f))
464 .and_then(|v| v.as_f64())
465 .unwrap_or(0.5) as f32;
466
467 let summary = f_summary
468 .and_then(|f| doc.get_first(f))
469 .and_then(|v| v.as_str())
470 .filter(|s| !s.is_empty())
471 .map(|s| s.to_string());
472
473 let uri = format!("wiki://{wiki_name}/{slug}");
474
475 summaries.push(PageSummary {
476 slug,
477 uri,
478 title,
479 r#type: page_type,
480 status,
481 tags,
482 confidence,
483 summary,
484 });
485 }
486
487 Ok(PageList {
488 pages: summaries,
489 total,
490 page,
491 page_size,
492 facets: {
493 let type_facet = collect_facet(searcher, &unfiltered_query, is, "type", 0)?;
494 let status_facet = collect_facet(searcher, &query, is, "status", 0)?;
495 let tags_facet = collect_facet(searcher, &query, is, "tags", options.facets_top_tags)?;
496 FacetCounts {
497 r#type: type_facet,
498 status: status_facet,
499 tags: tags_facet,
500 }
501 },
502 })
503}
504
505pub fn search_all(
509 query_str: &str,
510 options: &SearchOptions,
511 wikis: &[(String, Searcher, &IndexSchema)],
512) -> Result<SearchResult> {
513 let mut all_results = Vec::new();
514 let mut merged_facets = FacetCounts::default();
515 for (name, searcher, is) in wikis {
516 match search(query_str, options, searcher, name, is) {
517 Ok(sr) => {
518 all_results.extend(sr.results);
519 for (k, v) in sr.facets.r#type {
520 *merged_facets.r#type.entry(k).or_insert(0) += v;
521 }
522 for (k, v) in sr.facets.status {
523 *merged_facets.status.entry(k).or_insert(0) += v;
524 }
525 for (k, v) in sr.facets.tags {
526 *merged_facets.tags.entry(k).or_insert(0) += v;
527 }
528 }
529 Err(_) => continue,
530 }
531 }
532 all_results.sort_by(|a, b| {
533 b.score
534 .partial_cmp(&a.score)
535 .unwrap_or(std::cmp::Ordering::Equal)
536 });
537 all_results.truncate(options.top_k);
538
539 if options.facets_top_tags > 0 && merged_facets.tags.len() > options.facets_top_tags {
541 let mut entries: Vec<_> = merged_facets.tags.into_iter().collect();
542 entries.sort_by_key(|e| Reverse(e.1));
543 entries.truncate(options.facets_top_tags);
544 merged_facets.tags = entries.into_iter().collect();
545 }
546
547 Ok(SearchResult {
548 results: all_results,
549 facets: merged_facets,
550 })
551}
552
553fn collect_facet(
558 searcher: &Searcher,
559 query: &dyn tantivy::query::Query,
560 is: &IndexSchema,
561 field_name: &str,
562 top_n: usize,
563) -> Result<HashMap<String, u64>> {
564 let field = match is.try_field(field_name) {
565 Some(f) => f,
566 None => return Ok(HashMap::new()),
567 };
568
569 let doc_addrs = searcher.search(query, &tantivy::collector::DocSetCollector)?;
570 let mut counts: HashMap<String, u64> = HashMap::new();
571
572 for doc_addr in &doc_addrs {
573 let doc: tantivy::TantivyDocument = searcher.doc(*doc_addr)?;
574 for val in doc.get_all(field) {
575 if let Some(s) = val.as_str()
576 && !s.is_empty()
577 {
578 *counts.entry(s.to_string()).or_insert(0) += 1;
579 }
580 }
581 }
582
583 if top_n > 0 && counts.len() > top_n {
584 let mut entries: Vec<_> = counts.into_iter().collect();
585 entries.sort_by_key(|e| Reverse(e.1));
586 entries.truncate(top_n);
587 return Ok(entries.into_iter().collect());
588 }
589
590 Ok(counts)
591}
592
593pub fn render_list_llms(result: &PageList) -> String {
598 let mut by_type: std::collections::HashMap<String, Vec<&PageSummary>> =
600 std::collections::HashMap::new();
601 for page in &result.pages {
602 by_type.entry(page.r#type.clone()).or_default().push(page);
603 }
604 let mut groups: Vec<(String, Vec<&PageSummary>)> = by_type.into_iter().collect();
605 groups.sort_by(|a, b| b.1.len().cmp(&a.1.len()).then(a.0.cmp(&b.0)));
606
607 let mut out = String::new();
608 for (type_name, mut pages) in groups {
609 pages.sort_by(|a, b| {
610 b.confidence
611 .partial_cmp(&a.confidence)
612 .unwrap_or(std::cmp::Ordering::Equal)
613 .then(a.title.cmp(&b.title))
614 });
615 out.push_str(&format!("## {} ({})\n\n", type_name, pages.len()));
616 for page in pages {
617 let summary = page.summary.as_deref().unwrap_or("");
618 let line = if page.status == "archived" {
619 if summary.is_empty() {
620 format!("- ~~[{}]({})~~\n", page.title, page.uri)
621 } else {
622 format!("- ~~[{}]({}): {}~~\n", page.title, page.uri, summary)
623 }
624 } else if summary.is_empty() {
625 format!("- [{}]({})\n", page.title, page.uri)
626 } else {
627 format!("- [{}]({}): {}\n", page.title, page.uri, summary)
628 };
629 out.push_str(&line);
630 }
631 out.push('\n');
632 }
633
634 if result.total > result.page_size {
635 let total_pages = (result.total + result.page_size - 1) / result.page_size.max(1);
636 out.push_str(&format!(
637 "_Page {}/{} — {} total pages_\n",
638 result.page, total_pages, result.total
639 ));
640 }
641
642 out
643}
644
645pub fn render_search_llms(result: &SearchResult) -> String {
648 if result.results.is_empty() {
649 return "No results found.\n".to_string();
650 }
651 let mut out = String::new();
652 for r in &result.results {
653 let summary = r.summary.as_deref().unwrap_or("");
654 if summary.is_empty() {
655 out.push_str(&format!("- [{}]({})\n", r.title, r.uri));
656 } else {
657 out.push_str(&format!("- [{}]({}): {}\n", r.title, r.uri, summary));
658 }
659 }
660 out
661}