use std::collections::HashMap;
use serde::{Deserialize, Serialize};
use crate::tokenizer::tokenize;
/// A single document handed to the search index.
///
/// The text-bearing fields (`title`, `headings`, `body`, `code`) are tokenized
/// by `SearchIndexBuilder::build`; `id` and `url` are stored as-is and are not
/// interpreted by the code in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchDocument {
/// Caller-supplied identifier of the document.
pub id: String,
/// Document title; its tokens are indexed as `Field::Title`.
pub title: String,
/// Location of the document (presumably a link target for search results — confirm with callers).
pub url: String,
/// Main text; its tokens are indexed as `Field::Body` and are the only ones
/// counted towards the average document length.
pub body: String,
/// Section headings; their tokens are indexed as `Field::Heading`.
pub headings: Vec<String>,
/// Code snippets; their tokens are indexed as `Field::Code`.
/// Falls back to an empty list when the key is missing during deserialization.
#[serde(default)]
pub code: Vec<String>,
}
/// One entry of a term's posting list: the term occurs in a particular document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Posting {
/// Position of the document in `SearchIndex::documents`.
pub doc_idx: usize,
/// Number of times the term occurs in the document, summed over all of its fields.
pub tf: u32,
/// Field in which the builder first saw the term (it visits title, headings,
/// body, then code, so this is the first of those that contains the term).
pub field: Field,
}
/// The part of a document in which a term was found.
///
/// Each variant has an associated boost factor, see `Field::boost`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Field {
/// The document title.
Title,
/// One of the document's headings.
Heading,
/// The document body text.
Body,
/// One of the document's code snippets.
Code,
}
impl Field {
#[must_use]
pub fn boost(self) -> f64 {
match self {
Self::Title => 10.0,
Self::Heading => 5.0,
Self::Body => 1.0,
Self::Code => 0.5,
}
}
}
/// A fully built, serializable inverted index over a set of documents.
///
/// Instances are produced by `SearchIndexBuilder::build` or restored from JSON
/// with `SearchIndex::from_json`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchIndex {
/// The indexed documents; `Posting::doc_idx` refers to positions in this list.
pub documents: Vec<SearchDocument>,
/// Maps each term to its posting list (one posting per document containing it).
pub index: HashMap<String, Vec<Posting>>,
/// Maps each term to the number of documents that contain it.
pub df: HashMap<String, usize>,
/// Average number of body tokens per document; `0.0` when there are no documents.
pub avg_dl: f64,
/// Number of documents that were indexed.
pub doc_count: usize,
}
impl SearchIndex {
    /// Serializes the whole index to a JSON string.
    ///
    /// A serialization failure is not reported: the result is an empty string
    /// in that case.
    #[must_use]
    pub fn to_json(&self) -> String {
        match serde_json::to_string(self) {
            Ok(json) => json,
            Err(_) => String::new(),
        }
    }

    /// Serializes the whole index to a JSON string.
    ///
    /// Produces exactly the same output as [`SearchIndex::to_json`], including
    /// the empty string on failure.
    #[must_use]
    pub fn to_json_compact(&self) -> String {
        self.to_json()
    }

    /// Restores an index from JSON text.
    ///
    /// # Errors
    ///
    /// Returns the `serde_json` error when `json` cannot be deserialized into
    /// a `SearchIndex`.
    pub fn from_json(json: &str) -> Result<Self, serde_json::Error> {
        serde_json::from_str::<Self>(json)
    }

    /// Number of documents stored in the index.
    #[must_use]
    pub fn len(&self) -> usize {
        let docs = &self.documents;
        docs.len()
    }

    /// Whether the index stores no documents at all.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        let docs = &self.documents;
        docs.is_empty()
    }
}
/// Collects documents and turns them into a `SearchIndex`.
///
/// Documents are only stored while the builder is being filled; tokenization
/// and indexing happen in `build`.
#[derive(Debug, Default)]
pub struct SearchIndexBuilder {
// Documents in insertion order; a document's position becomes its `doc_idx`.
documents: Vec<SearchDocument>,
}
impl SearchIndexBuilder {
    /// Creates a builder that holds no documents yet.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Appends `doc` to the set of documents to index and returns the builder
    /// so that further calls can be chained.
    pub fn add_document(&mut self, doc: SearchDocument) -> &mut Self {
        self.documents.push(doc);
        self
    }

    /// Appends a document that has only an id, title, URL and body.
    ///
    /// The document gets no headings and no code snippets. Returns the builder
    /// so that further calls can be chained.
    pub fn add_simple(&mut self, id: &str, title: &str, url: &str, body: &str) -> &mut Self {
        let doc = SearchDocument {
            id: id.to_owned(),
            title: title.to_owned(),
            url: url.to_owned(),
            body: body.to_owned(),
            headings: Vec::new(),
            code: Vec::new(),
        };
        self.add_document(doc)
    }

    /// Consumes the builder and produces the finished index.
    ///
    /// For every document the title, headings, body and code snippets are
    /// tokenized in that order. Each distinct term yields one posting per
    /// document: its `tf` counts occurrences across all fields, and its
    /// `field` is the field in which the term was seen first. `df` counts the
    /// documents containing each term. `avg_dl` is the total number of body
    /// tokens divided by the number of documents, or `0.0` when there are none.
    #[must_use]
    pub fn build(self) -> SearchIndex {
        let mut postings_by_term: HashMap<String, Vec<Posting>> = HashMap::new();
        let mut doc_freq: HashMap<String, usize> = HashMap::new();
        let mut body_token_total = 0usize;

        for (doc_idx, doc) in self.documents.iter().enumerate() {
            // Per-document tally: term -> (occurrences, field of first sighting).
            let mut per_doc: HashMap<String, (u32, Field)> = HashMap::new();

            Self::tally_tokens(&mut per_doc, tokenize(&doc.title), Field::Title);
            for heading in &doc.headings {
                Self::tally_tokens(&mut per_doc, tokenize(heading), Field::Heading);
            }

            // Only body tokens contribute to the document-length statistic.
            let body_tokens = tokenize(&doc.body);
            body_token_total += body_tokens.len();
            Self::tally_tokens(&mut per_doc, body_tokens, Field::Body);

            for snippet in &doc.code {
                Self::tally_tokens(&mut per_doc, tokenize(snippet), Field::Code);
            }

            // Fold the per-document tally into the global structures.
            for (term, (tf, field)) in per_doc {
                *doc_freq.entry(term.clone()).or_insert(0) += 1;
                postings_by_term.entry(term).or_default().push(Posting { doc_idx, tf, field });
            }
        }

        let doc_count = self.documents.len();
        #[allow(clippy::cast_precision_loss)]
        let avg_dl = if doc_count == 0 {
            0.0
        } else {
            body_token_total as f64 / doc_count as f64
        };

        SearchIndex {
            documents: self.documents,
            index: postings_by_term,
            df: doc_freq,
            avg_dl,
            doc_count,
        }
    }

    /// Adds every token in `tokens` to the per-document tally `terms`.
    ///
    /// A term seen for the first time is recorded with `field`; a term that is
    /// already present keeps its original field and only has its count raised.
    fn tally_tokens<I>(terms: &mut HashMap<String, (u32, Field)>, tokens: I, field: Field)
    where
        I: IntoIterator<Item = String>,
    {
        for token in tokens {
            let slot = terms.entry(token).or_insert((0, field));
            slot.0 += 1;
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Building from two simple documents keeps both of them and indexes
    /// terms taken from their titles and bodies.
    #[test]
    fn test_build_index() {
        let mut builder = SearchIndexBuilder::new();
        builder.add_simple(
            "1",
            "Getting Started",
            "/getting-started",
            "Welcome to the documentation",
        );
        builder.add_simple("2", "Installation", "/installation", "How to install the package");

        let index = builder.build();

        assert_eq!(index.len(), 2);
        for term in ["getting", "started", "install"] {
            assert!(index.index.contains_key(term));
        }
    }

    /// An index survives a round trip through its JSON representation.
    #[test]
    fn test_serialize_deserialize() {
        let mut builder = SearchIndexBuilder::new();
        builder.add_simple("1", "Test", "/test", "Test content");
        let json = builder.build().to_json();

        let restored = SearchIndex::from_json(&json).unwrap();

        assert_eq!(restored.len(), 1);
        assert_eq!(restored.documents[0].title, "Test");
    }
}