1use std::collections::HashMap;
4
5use serde::{Deserialize, Serialize};
6
7use crate::tokenizer::tokenize;
8
9#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct SearchDocument {
12 pub id: String,
14 pub title: String,
16 pub url: String,
18 pub body: String,
20 pub headings: Vec<String>,
22 #[serde(default)]
24 pub code: Vec<String>,
25}
26
27#[derive(Debug, Clone, Serialize, Deserialize)]
29pub struct Posting {
30 pub doc_idx: usize,
32 pub tf: u32,
34 pub field: Field,
36}
37
38#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
40pub enum Field {
41 Title,
43 Heading,
45 Body,
47 Code,
49}
50
51impl Field {
52 #[must_use]
54 pub fn boost(self) -> f64 {
55 match self {
56 Self::Title => 10.0,
57 Self::Heading => 5.0,
58 Self::Body => 1.0,
59 Self::Code => 0.5,
60 }
61 }
62}
63
64#[derive(Debug, Clone, Serialize, Deserialize)]
66pub struct SearchIndex {
67 pub documents: Vec<SearchDocument>,
69 pub index: HashMap<String, Vec<Posting>>,
71 pub df: HashMap<String, usize>,
73 pub avg_dl: f64,
75 pub doc_count: usize,
77}
78
79impl SearchIndex {
80 #[must_use]
82 pub fn to_json(&self) -> String {
83 serde_json::to_string(self).unwrap_or_default()
84 }
85
86 #[must_use]
88 pub fn to_json_compact(&self) -> String {
89 serde_json::to_string(self).unwrap_or_default()
90 }
91
92 pub fn from_json(json: &str) -> Result<Self, serde_json::Error> {
94 serde_json::from_str(json)
95 }
96
97 #[must_use]
99 pub fn len(&self) -> usize {
100 self.documents.len()
101 }
102
103 #[must_use]
105 pub fn is_empty(&self) -> bool {
106 self.documents.is_empty()
107 }
108}
109
110#[derive(Debug, Default)]
112pub struct SearchIndexBuilder {
113 documents: Vec<SearchDocument>,
114}
115
116impl SearchIndexBuilder {
117 #[must_use]
119 pub fn new() -> Self {
120 Self::default()
121 }
122
123 pub fn add_document(&mut self, doc: SearchDocument) -> &mut Self {
125 self.documents.push(doc);
126 self
127 }
128
129 pub fn add_simple(&mut self, id: &str, title: &str, url: &str, body: &str) -> &mut Self {
131 self.documents.push(SearchDocument {
132 id: id.to_string(),
133 title: title.to_string(),
134 url: url.to_string(),
135 body: body.to_string(),
136 headings: Vec::new(),
137 code: Vec::new(),
138 });
139 self
140 }
141
142 #[must_use]
144 pub fn build(self) -> SearchIndex {
145 let mut index: HashMap<String, Vec<Posting>> = HashMap::new();
146 let mut df: HashMap<String, usize> = HashMap::new();
147 let mut total_length = 0usize;
148
149 for (doc_idx, doc) in self.documents.iter().enumerate() {
150 let mut doc_terms: HashMap<String, (u32, Field)> = HashMap::new();
151
152 for token in tokenize(&doc.title) {
154 doc_terms
155 .entry(token)
156 .and_modify(|(count, _)| *count += 1)
157 .or_insert((1, Field::Title));
158 }
159
160 for heading in &doc.headings {
162 for token in tokenize(heading) {
163 doc_terms
164 .entry(token)
165 .and_modify(|(count, _)| *count += 1)
166 .or_insert((1, Field::Heading));
167 }
168 }
169
170 let body_tokens = tokenize(&doc.body);
172 total_length += body_tokens.len();
173 for token in body_tokens {
174 doc_terms
175 .entry(token)
176 .and_modify(|(count, _)| *count += 1)
177 .or_insert((1, Field::Body));
178 }
179
180 for code in &doc.code {
182 for token in tokenize(code) {
183 doc_terms
184 .entry(token)
185 .and_modify(|(count, _)| *count += 1)
186 .or_insert((1, Field::Code));
187 }
188 }
189
190 for (term, (tf, field)) in doc_terms {
192 *df.entry(term.clone()).or_insert(0) += 1;
193 index.entry(term).or_default().push(Posting { doc_idx, tf, field });
194 }
195 }
196
197 let doc_count = self.documents.len();
198 #[allow(clippy::cast_precision_loss)]
199 let avg_dl = if doc_count > 0 { total_length as f64 / doc_count as f64 } else { 0.0 };
200
201 SearchIndex { documents: self.documents, index, df, avg_dl, doc_count }
202 }
203}
204
#[cfg(test)]
mod tests {
    use super::*;

    /// Building from two simple documents indexes both and exposes the
    /// expected terms.
    #[test]
    fn test_build_index() {
        let mut docs = SearchIndexBuilder::new();
        docs.add_simple(
            "1",
            "Getting Started",
            "/getting-started",
            "Welcome to the documentation",
        )
        .add_simple("2", "Installation", "/installation", "How to install the package");

        let built = docs.build();

        assert_eq!(built.len(), 2);
        for term in ["getting", "started", "install"] {
            assert!(built.index.contains_key(term));
        }
    }

    /// An index survives a JSON round trip with its documents intact.
    #[test]
    fn test_serialize_deserialize() {
        let mut docs = SearchIndexBuilder::new();
        docs.add_simple("1", "Test", "/test", "Test content");

        let serialized = docs.build().to_json();
        let round_tripped = SearchIndex::from_json(&serialized).unwrap();

        assert_eq!(round_tripped.len(), 1);
        assert_eq!(round_tripped.documents[0].title, "Test");
    }
}