1use std::{collections::HashMap, fs, path::Path};
7
8use serde::{Deserialize, Serialize};
9use tracing::info;
10use typstify_core::Page;
11
12use crate::SearchError;
13
/// Soft cap (500 KiB) on the serialized index size. Exceeding it only
/// triggers a warning in `write_to_file`; it never fails the write.
pub const MAX_SIMPLE_INDEX_SIZE: usize = 500 * 1024;
16
/// One searchable document (a rendered page) stored in the client-side index.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SimpleDocument {
    /// Page URL, used as the search-result link.
    pub url: String,

    /// Page title.
    pub title: String,

    /// Optional description; when built via `add_page` this falls back to
    /// the page summary if no explicit description exists.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,

    /// Page language code, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lang: Option<String>,

    /// Page tags; omitted from JSON when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tags: Vec<String>,

    /// Publication date as an RFC 3339 string, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub date: Option<String>,

    /// Normalized, sorted, deduplicated search terms for this document.
    pub terms: Vec<String>,
}
45
/// Serializable search index: the document list plus an inverted
/// term → document-indices map built from it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SimpleSearchIndex {
    /// Index schema version (currently 1, set by `new`).
    pub version: u32,

    /// All indexed documents, in insertion order.
    pub documents: Vec<SimpleDocument>,

    /// Inverted index: term → sorted, deduplicated indices into `documents`.
    pub index: HashMap<String, Vec<usize>>,
}
58
59impl SimpleSearchIndex {
60 pub fn new() -> Self {
62 Self {
63 version: 1,
64 documents: Vec::new(),
65 index: HashMap::new(),
66 }
67 }
68
69 pub fn from_pages(pages: &[&Page]) -> Self {
71 let mut index = Self::new();
72
73 for page in pages {
74 index.add_page(page);
75 }
76
77 index.build_inverted_index();
78 index
79 }
80
81 pub fn add_page(&mut self, page: &Page) {
83 let terms = tokenize_content(&page.title, &page.content, &page.tags);
84
85 let doc = SimpleDocument {
86 url: page.url.clone(),
87 title: page.title.clone(),
88 description: page.description.clone().or(page.summary.clone()),
89 lang: Some(page.lang.clone()),
90 tags: page.tags.clone(),
91 date: page.date.map(|d| d.to_rfc3339()),
92 terms,
93 };
94
95 self.documents.push(doc);
96 }
97
98 fn build_inverted_index(&mut self) {
100 self.index.clear();
101
102 for (doc_idx, doc) in self.documents.iter().enumerate() {
103 for term in &doc.terms {
104 self.index.entry(term.clone()).or_default().push(doc_idx);
105 }
106 }
107
108 for postings in self.index.values_mut() {
110 postings.sort_unstable();
111 postings.dedup();
112 }
113
114 info!(
115 documents = self.documents.len(),
116 terms = self.index.len(),
117 "Built simple search index"
118 );
119 }
120
121 pub fn search(&self, query: &str) -> Vec<&SimpleDocument> {
125 let query_terms = tokenize_query(query);
126
127 if query_terms.is_empty() {
128 return Vec::new();
129 }
130
131 let mut result_indices: Option<Vec<usize>> = None;
133
134 for term in &query_terms {
135 if let Some(postings) = self.index.get(term) {
136 match &mut result_indices {
137 None => {
138 result_indices = Some(postings.clone());
139 }
140 Some(indices) => {
141 indices.retain(|idx| postings.contains(idx));
143 }
144 }
145 } else {
146 return Vec::new();
148 }
149 }
150
151 result_indices
152 .unwrap_or_default()
153 .iter()
154 .filter_map(|&idx| self.documents.get(idx))
155 .collect()
156 }
157
158 pub fn to_json(&self) -> Result<String, SearchError> {
160 serde_json::to_string(self).map_err(|e| SearchError::Serialization(e.to_string()))
161 }
162
163 pub fn to_json_pretty(&self) -> Result<String, SearchError> {
165 serde_json::to_string_pretty(self).map_err(|e| SearchError::Serialization(e.to_string()))
166 }
167
168 pub fn from_json(json: &str) -> Result<Self, SearchError> {
170 serde_json::from_str(json).map_err(|e| SearchError::Serialization(e.to_string()))
171 }
172
173 pub fn write_to_file(&self, path: &Path) -> Result<(), SearchError> {
175 let json = self.to_json()?;
176
177 if json.len() > MAX_SIMPLE_INDEX_SIZE {
179 tracing::warn!(
180 size = json.len(),
181 max = MAX_SIMPLE_INDEX_SIZE,
182 "Simple search index exceeds recommended size"
183 );
184 }
185
186 fs::write(path, json).map_err(|e| SearchError::Io(e.to_string()))?;
187 Ok(())
188 }
189
190 pub fn estimated_size(&self) -> usize {
192 self.documents
194 .iter()
195 .map(|d| {
196 d.url.len()
197 + d.title.len()
198 + d.description.as_ref().map(|s| s.len()).unwrap_or(0)
199 + d.terms.iter().map(|t| t.len() + 3).sum::<usize>()
200 + 100 })
202 .sum()
203 }
204
205 pub fn is_within_size_limit(&self) -> bool {
207 self.estimated_size() <= MAX_SIMPLE_INDEX_SIZE
208 }
209}
210
// `Default` mirrors `new` so the index works with `Default`-based
// construction (e.g. struct update syntax, `mem::take`).
impl Default for SimpleSearchIndex {
    fn default() -> Self {
        Self::new()
    }
}
216
217fn tokenize_content(title: &str, content: &str, tags: &[String]) -> Vec<String> {
221 let mut terms = Vec::new();
222
223 for term in tokenize_text(title) {
225 terms.push(term);
226 }
227
228 let body_text = strip_html(content);
230 for term in tokenize_text(&body_text) {
231 terms.push(term);
232 }
233
234 for tag in tags {
236 terms.push(normalize_term(tag));
237 }
238
239 terms.sort();
241 terms.dedup();
242
243 terms
244}
245
/// Tokenizes a query string.
///
/// Uses the same tokenizer as indexed content so query terms line up
/// exactly with the terms stored in the inverted index.
fn tokenize_query(query: &str) -> Vec<String> {
    tokenize_text(query)
}
250
/// Splits text into normalized search terms.
///
/// Alphanumeric words of at least two *characters* are kept after
/// normalization; single-character words ("a", "I", "é") are dropped.
/// CJK text, which has no word boundaries, is additionally indexed as
/// unigrams, bigrams, and (for short runs) the full run.
fn tokenize_text(text: &str) -> Vec<String> {
    let mut terms = Vec::new();

    for word in text.split(|c: char| !c.is_alphanumeric()) {
        // Count characters, not bytes: the previous byte-length check let
        // single multi-byte characters such as 'é' slip through.
        if word.chars().count() >= 2 {
            terms.push(normalize_term(word));
        }
    }

    // CJK fallback: gather all CJK characters so queries can match
    // without word segmentation.
    let cjk_chars: Vec<char> = text.chars().filter(|c| is_cjk_char(*c)).collect();

    if !cjk_chars.is_empty() {
        // Unigrams.
        for &c in &cjk_chars {
            terms.push(c.to_string());
        }

        // Bigrams.
        for pair in cjk_chars.windows(2) {
            terms.push(pair.iter().collect());
        }

        // Whole run, if short enough (byte length) to be a plausible query.
        let cjk_text: String = cjk_chars.iter().collect();
        if cjk_text.len() <= 20 && cjk_chars.len() >= 2 {
            terms.push(cjk_text.to_lowercase());
        }
    }

    terms
}

/// Returns true for characters in the major CJK Unicode blocks
/// (Han ideographs plus extensions, Hiragana, Katakana, Hangul syllables).
fn is_cjk_char(c: char) -> bool {
    matches!(c,
        '\u{4E00}'..='\u{9FFF}'     // CJK Unified Ideographs
        | '\u{3400}'..='\u{4DBF}'   // Extension A
        | '\u{20000}'..='\u{2A6DF}' // Extension B
        | '\u{2A700}'..='\u{2B73F}' // Extension C
        | '\u{2B740}'..='\u{2B81F}' // Extension D
        | '\u{2B820}'..='\u{2CEAF}' // Extension E
        | '\u{2CEB0}'..='\u{2EBEF}' // Extension F
        | '\u{30000}'..='\u{3134F}' // Extension G
        | '\u{F900}'..='\u{FAFF}'   // Compatibility Ideographs
        | '\u{2F800}'..='\u{2FA1F}' // Compatibility Supplement
        | '\u{3040}'..='\u{309F}'   // Hiragana
        | '\u{30A0}'..='\u{30FF}'   // Katakana
        | '\u{AC00}'..='\u{D7AF}'   // Hangul Syllables
    )
}

/// Lowercases and trims a term for case-insensitive matching.
fn normalize_term(term: &str) -> String {
    term.to_lowercase().trim().to_string()
}
311
/// Naive HTML tag stripper: drops everything between `<` and `>` and
/// emits a single space where each tag closed, so adjacent words from
/// different elements never fuse together.
///
/// NOTE(review): this is character-level, not a real parser — a literal
/// `<` in text swallows characters up to the next `>`; presumably the
/// rendered HTML never contains unescaped angle brackets.
fn strip_html(html: &str) -> String {
    let mut text = String::with_capacity(html.len());
    let mut inside_tag = false;

    for ch in html.chars() {
        match ch {
            '<' => inside_tag = true,
            '>' => {
                inside_tag = false;
                // Space keeps words from merging across tag boundaries.
                text.push(' ');
            }
            _ if inside_tag => {}
            _ => text.push(ch),
        }
    }

    text
}
330
#[cfg(test)]
mod tests {
    use chrono::Utc;

    use super::*;

    // Builds a fully-populated `Page` with sensible defaults so each test
    // only specifies the fields it cares about.
    fn create_test_page(url: &str, title: &str, content: &str, tags: Vec<String>) -> Page {
        Page {
            url: url.to_string(),
            title: title.to_string(),
            description: Some(format!("Description of {}", title)),
            date: Some(Utc::now()),
            updated: None,
            draft: false,
            lang: "en".to_string(),
            is_default_lang: true,
            canonical_id: url.trim_start_matches('/').to_string(),
            tags,
            categories: vec![],
            content: content.to_string(),
            summary: None,
            reading_time: Some(5),
            word_count: Some(100),
            source_path: None,
            aliases: vec![],
            toc: vec![],
            custom_js: vec![],
            custom_css: vec![],
            template: None,
            weight: 0,
        }
    }

    // Words are lowercased; single-character words are dropped.
    #[test]
    fn test_tokenize_text() {
        let terms = tokenize_text("Hello World! This is a test.");
        assert!(terms.contains(&"hello".to_string()));
        assert!(terms.contains(&"world".to_string()));
        assert!(terms.contains(&"test".to_string()));
        assert!(!terms.contains(&"a".to_string()));
    }

    // CJK input yields unigrams plus adjacent-character bigrams.
    #[test]
    fn test_tokenize_chinese() {
        let terms = tokenize_text("你好世界");
        assert!(terms.contains(&"你".to_string()));
        assert!(terms.contains(&"好".to_string()));
        assert!(terms.contains(&"世".to_string()));
        assert!(terms.contains(&"界".to_string()));
        assert!(terms.contains(&"你好".to_string()));
        assert!(terms.contains(&"世界".to_string()));
    }

    // Covers Han, Hiragana, Katakana, and Hangul ranges plus negatives.
    #[test]
    fn test_is_cjk_char() {
        assert!(is_cjk_char('你'));
        assert!(is_cjk_char('好'));
        assert!(is_cjk_char('あ')); assert!(is_cjk_char('ア')); assert!(is_cjk_char('한')); assert!(!is_cjk_char('a'));
        assert!(!is_cjk_char('1'));
    }

    // Tag text survives; tag markup does not.
    #[test]
    fn test_strip_html() {
        let html = "<p>Hello <strong>world</strong>!</p>";
        let text = strip_html(html);
        assert!(text.contains("Hello"));
        assert!(text.contains("world"));
        assert!(!text.contains("<p>"));
    }

    // `from_pages` indexes both body words and tags.
    #[test]
    fn test_simple_index_from_pages() {
        let page1 = create_test_page(
            "/post1",
            "Introduction to Rust",
            "<p>Rust is a systems programming language.</p>",
            vec!["rust".to_string(), "programming".to_string()],
        );
        let page2 = create_test_page(
            "/post2",
            "Learning Go",
            "<p>Go is a great language for servers.</p>",
            vec!["go".to_string(), "programming".to_string()],
        );

        let index = SimpleSearchIndex::from_pages(&[&page1, &page2]);

        assert_eq!(index.documents.len(), 2);
        assert!(!index.index.is_empty());

        assert!(index.index.contains_key("rust"));
        assert!(index.index.contains_key("programming"));
    }

    // Single-term queries: unique term hits one doc, shared term hits
    // both, unknown term hits none.
    #[test]
    fn test_simple_index_search() {
        let page1 = create_test_page(
            "/rust",
            "Learning Rust",
            "<p>Rust programming tutorial.</p>",
            vec!["rust".to_string()],
        );
        let page2 = create_test_page(
            "/go",
            "Learning Go",
            "<p>Go programming tutorial.</p>",
            vec!["go".to_string()],
        );

        let index = SimpleSearchIndex::from_pages(&[&page1, &page2]);

        let results = index.search("rust");
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].url, "/rust");

        let results = index.search("programming");
        assert_eq!(results.len(), 2);

        let results = index.search("python");
        assert!(results.is_empty());
    }

    // JSON round-trip preserves documents.
    #[test]
    fn test_simple_index_serialization() {
        let page = create_test_page(
            "/test",
            "Test Page",
            "<p>Test content</p>",
            vec!["test".to_string()],
        );

        let index = SimpleSearchIndex::from_pages(&[&page]);
        let json = index.to_json().unwrap();
        let parsed = SimpleSearchIndex::from_json(&json).unwrap();

        assert_eq!(parsed.documents.len(), 1);
        assert_eq!(parsed.documents[0].url, "/test");
    }

    // Multi-term queries use AND semantics: only the doc containing
    // every term matches.
    #[test]
    fn test_simple_index_multi_term_search() {
        let page1 = create_test_page(
            "/post1",
            "Rust Programming Guide",
            "<p>Learn systems programming with Rust.</p>",
            vec!["rust".to_string()],
        );
        let page2 = create_test_page(
            "/post2",
            "Python Programming",
            "<p>Learn scripting with Python.</p>",
            vec!["python".to_string()],
        );

        let index = SimpleSearchIndex::from_pages(&[&page1, &page2]);

        let results = index.search("rust systems");
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].url, "/post1");
    }

    // The size heuristic is positive and under the cap for a tiny index.
    #[test]
    fn test_estimated_size() {
        let page = create_test_page(
            "/test",
            "Test Page",
            "<p>Test content</p>",
            vec!["test".to_string()],
        );

        let index = SimpleSearchIndex::from_pages(&[&page]);
        let estimated = index.estimated_size();

        assert!(estimated > 0);
        assert!(estimated < MAX_SIMPLE_INDEX_SIZE);
        assert!(index.is_within_size_limit());
    }
}