1use std::{collections::HashMap, fs, path::Path};
7
8use serde::{Deserialize, Serialize};
9use tracing::info;
10use typstify_core::Page;
11
12use crate::SearchError;
13
/// Recommended maximum size, in bytes, of the serialized simple index (500 KiB).
///
/// `write_to_file` only logs a warning when the JSON is larger than this; the
/// file is still written. `is_within_size_limit` compares an estimate against it.
pub const MAX_SIMPLE_INDEX_SIZE: usize = 500 * 1024;
16
/// One page as stored in the simple search index.
///
/// `None` options and an empty `tags` list are left out of the serialized
/// output; `url`, `title` and `terms` are always written.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SimpleDocument {
    /// URL of the page (copied from `Page::url`).
    pub url: String,

    /// Title of the page (copied from `Page::title`).
    pub title: String,

    /// Page description; `add_page` falls back to the page summary when the
    /// page has no description.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,

    /// Value of the page's `lang` field, if set.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lang: Option<String>,

    /// Tags of the page; may be absent in the input when deserializing.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tags: Vec<String>,

    /// Page date formatted as an RFC 3339 string, if the page has a date.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub date: Option<String>,

    /// Normalized search terms for this page, as produced by `tokenize_content`
    /// (sorted, without duplicates).
    pub terms: Vec<String>,
}
45
/// A small JSON-serializable search index: the documents plus an inverted
/// index from term to the documents containing it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SimpleSearchIndex {
    /// Version number written into the serialized index; `new` sets it to 1.
    /// NOTE(review): presumably the format version consumers check — confirm.
    pub version: u32,

    /// All indexed documents; positions in this vector are the ids used by `index`.
    pub documents: Vec<SimpleDocument>,

    /// Inverted index: term -> positions in `documents` of the documents whose
    /// `terms` contain that term.
    pub index: HashMap<String, Vec<usize>>,
}
58
59impl SimpleSearchIndex {
60 pub fn new() -> Self {
62 Self {
63 version: 1,
64 documents: Vec::new(),
65 index: HashMap::new(),
66 }
67 }
68
69 pub fn from_pages(pages: &[&Page]) -> Self {
71 let mut index = Self::new();
72
73 for page in pages {
74 index.add_page(page);
75 }
76
77 index.build_inverted_index();
78 index
79 }
80
81 pub fn add_page(&mut self, page: &Page) {
83 let terms = tokenize_content(&page.title, &page.content, &page.tags);
84
85 let doc = SimpleDocument {
86 url: page.url.clone(),
87 title: page.title.clone(),
88 description: page.description.clone().or(page.summary.clone()),
89 lang: page.lang.clone(),
90 tags: page.tags.clone(),
91 date: page.date.map(|d| d.to_rfc3339()),
92 terms,
93 };
94
95 self.documents.push(doc);
96 }
97
98 fn build_inverted_index(&mut self) {
100 self.index.clear();
101
102 for (doc_idx, doc) in self.documents.iter().enumerate() {
103 for term in &doc.terms {
104 self.index.entry(term.clone()).or_default().push(doc_idx);
105 }
106 }
107
108 for postings in self.index.values_mut() {
110 postings.sort_unstable();
111 postings.dedup();
112 }
113
114 info!(
115 documents = self.documents.len(),
116 terms = self.index.len(),
117 "Built simple search index"
118 );
119 }
120
121 pub fn search(&self, query: &str) -> Vec<&SimpleDocument> {
125 let query_terms = tokenize_query(query);
126
127 if query_terms.is_empty() {
128 return Vec::new();
129 }
130
131 let mut result_indices: Option<Vec<usize>> = None;
133
134 for term in &query_terms {
135 if let Some(postings) = self.index.get(term) {
136 match &mut result_indices {
137 None => {
138 result_indices = Some(postings.clone());
139 }
140 Some(indices) => {
141 indices.retain(|idx| postings.contains(idx));
143 }
144 }
145 } else {
146 return Vec::new();
148 }
149 }
150
151 result_indices
152 .unwrap_or_default()
153 .iter()
154 .filter_map(|&idx| self.documents.get(idx))
155 .collect()
156 }
157
158 pub fn to_json(&self) -> Result<String, SearchError> {
160 serde_json::to_string(self).map_err(|e| SearchError::Serialization(e.to_string()))
161 }
162
163 pub fn to_json_pretty(&self) -> Result<String, SearchError> {
165 serde_json::to_string_pretty(self).map_err(|e| SearchError::Serialization(e.to_string()))
166 }
167
168 pub fn from_json(json: &str) -> Result<Self, SearchError> {
170 serde_json::from_str(json).map_err(|e| SearchError::Serialization(e.to_string()))
171 }
172
173 pub fn write_to_file(&self, path: &Path) -> Result<(), SearchError> {
175 let json = self.to_json()?;
176
177 if json.len() > MAX_SIMPLE_INDEX_SIZE {
179 tracing::warn!(
180 size = json.len(),
181 max = MAX_SIMPLE_INDEX_SIZE,
182 "Simple search index exceeds recommended size"
183 );
184 }
185
186 fs::write(path, json).map_err(|e| SearchError::Io(e.to_string()))?;
187 Ok(())
188 }
189
190 pub fn estimated_size(&self) -> usize {
192 self.documents
194 .iter()
195 .map(|d| {
196 d.url.len()
197 + d.title.len()
198 + d.description.as_ref().map(|s| s.len()).unwrap_or(0)
199 + d.terms.iter().map(|t| t.len() + 3).sum::<usize>()
200 + 100 })
202 .sum()
203 }
204
205 pub fn is_within_size_limit(&self) -> bool {
207 self.estimated_size() <= MAX_SIMPLE_INDEX_SIZE
208 }
209}
210
211impl Default for SimpleSearchIndex {
212 fn default() -> Self {
213 Self::new()
214 }
215}
216
217fn tokenize_content(title: &str, content: &str, tags: &[String]) -> Vec<String> {
221 let mut terms = Vec::new();
222
223 for term in tokenize_text(title) {
225 terms.push(term);
226 }
227
228 let body_text = strip_html(content);
230 for term in tokenize_text(&body_text) {
231 terms.push(term);
232 }
233
234 for tag in tags {
236 terms.push(normalize_term(tag));
237 }
238
239 terms.sort();
241 terms.dedup();
242
243 terms
244}
245
246fn tokenize_query(query: &str) -> Vec<String> {
248 tokenize_text(query)
249}
250
251fn tokenize_text(text: &str) -> Vec<String> {
253 text.split(|c: char| !c.is_alphanumeric())
254 .filter(|s| s.len() >= 2) .map(normalize_term)
256 .collect()
257}
258
/// Returns the canonical form of a term: surrounding whitespace removed and
/// all characters lowercased.
fn normalize_term(term: &str) -> String {
    // Trim the borrowed input first so that only one `String` is allocated;
    // whitespace has no case mapping, so the result is the same as
    // lowercasing first and trimming afterwards.
    term.trim().to_lowercase()
}
263
/// Removes markup from `html` with a simple character scan.
///
/// Everything from a `<` up to the next `>` is dropped, and every `>` is
/// replaced by a single space so that words on either side of a tag stay
/// separated. Entities are not decoded, and a `<` without a matching `>`
/// swallows the rest of the input.
fn strip_html(html: &str) -> String {
    let mut text = String::with_capacity(html.len());
    let mut inside_tag = false;

    for ch in html.chars() {
        match ch {
            '<' => inside_tag = true,
            '>' => {
                inside_tag = false;
                text.push(' ');
            }
            // Characters between `<` and `>` belong to the tag itself.
            _ if inside_tag => {}
            _ => text.push(ch),
        }
    }

    text
}
282
// Unit tests for tokenization, HTML stripping, index construction, search,
// JSON round-tripping and size estimation.
#[cfg(test)]
mod tests {
    use chrono::Utc;

    use super::*;

    /// Builds a `Page` with the given URL, title, HTML content and tags; all
    /// other fields get fixed placeholder values.
    fn create_test_page(url: &str, title: &str, content: &str, tags: Vec<String>) -> Page {
        Page {
            url: url.to_string(),
            title: title.to_string(),
            description: Some(format!("Description of {}", title)),
            date: Some(Utc::now()),
            updated: None,
            draft: false,
            lang: Some("en".to_string()),
            tags,
            categories: vec![],
            content: content.to_string(),
            summary: None,
            reading_time: Some(5),
            word_count: Some(100),
            source_path: None,
            aliases: vec![],
            toc: vec![],
            custom_js: vec![],
            custom_css: vec![],
            template: None,
            weight: 0,
        }
    }

    // Words are lowercased and single-character tokens are dropped.
    #[test]
    fn test_tokenize_text() {
        let terms = tokenize_text("Hello World! This is a test.");
        assert!(terms.contains(&"hello".to_string()));
        assert!(terms.contains(&"world".to_string()));
        assert!(terms.contains(&"test".to_string()));
        assert!(!terms.contains(&"a".to_string()));
    }

    // Text content survives, tags do not.
    #[test]
    fn test_strip_html() {
        let html = "<p>Hello <strong>world</strong>!</p>";
        let text = strip_html(html);
        assert!(text.contains("Hello"));
        assert!(text.contains("world"));
        assert!(!text.contains("<p>"));
    }

    // Building from pages fills both the document list and the inverted index.
    #[test]
    fn test_simple_index_from_pages() {
        let page1 = create_test_page(
            "/post1",
            "Introduction to Rust",
            "<p>Rust is a systems programming language.</p>",
            vec!["rust".to_string(), "programming".to_string()],
        );
        let page2 = create_test_page(
            "/post2",
            "Learning Go",
            "<p>Go is a great language for servers.</p>",
            vec!["go".to_string(), "programming".to_string()],
        );

        let index = SimpleSearchIndex::from_pages(&[&page1, &page2]);

        assert_eq!(index.documents.len(), 2);
        assert!(!index.index.is_empty());

        assert!(index.index.contains_key("rust"));
        assert!(index.index.contains_key("programming"));
    }

    // Single-term queries: unique term, shared term, unknown term.
    #[test]
    fn test_simple_index_search() {
        let page1 = create_test_page(
            "/rust",
            "Learning Rust",
            "<p>Rust programming tutorial.</p>",
            vec!["rust".to_string()],
        );
        let page2 = create_test_page(
            "/go",
            "Learning Go",
            "<p>Go programming tutorial.</p>",
            vec!["go".to_string()],
        );

        let index = SimpleSearchIndex::from_pages(&[&page1, &page2]);

        let results = index.search("rust");
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].url, "/rust");

        let results = index.search("programming");
        assert_eq!(results.len(), 2);

        let results = index.search("python");
        assert!(results.is_empty());
    }

    // An index serialized to JSON can be parsed back.
    #[test]
    fn test_simple_index_serialization() {
        let page = create_test_page(
            "/test",
            "Test Page",
            "<p>Test content</p>",
            vec!["test".to_string()],
        );

        let index = SimpleSearchIndex::from_pages(&[&page]);
        let json = index.to_json().unwrap();
        let parsed = SimpleSearchIndex::from_json(&json).unwrap();

        assert_eq!(parsed.documents.len(), 1);
        assert_eq!(parsed.documents[0].url, "/test");
    }

    // Multi-term queries only match documents containing every term.
    #[test]
    fn test_simple_index_multi_term_search() {
        let page1 = create_test_page(
            "/post1",
            "Rust Programming Guide",
            "<p>Learn systems programming with Rust.</p>",
            vec!["rust".to_string()],
        );
        let page2 = create_test_page(
            "/post2",
            "Python Programming",
            "<p>Learn scripting with Python.</p>",
            vec!["python".to_string()],
        );

        let index = SimpleSearchIndex::from_pages(&[&page1, &page2]);

        let results = index.search("rust systems");
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].url, "/post1");
    }

    // A one-page index has a positive estimate well below the size limit.
    #[test]
    fn test_estimated_size() {
        let page = create_test_page(
            "/test",
            "Test Page",
            "<p>Test content</p>",
            vec!["test".to_string()],
        );

        let index = SimpleSearchIndex::from_pages(&[&page]);
        let estimated = index.estimated_size();

        assert!(estimated > 0);
        assert!(estimated < MAX_SIMPLE_INDEX_SIZE);
        assert!(index.is_within_size_limit());
    }
}