typstify_search/
simple.rs

//! Simple JSON-based search index for small sites.
//!
//! Provides a lightweight alternative to Tantivy for sites with a small
//! number of pages. The entire index is loaded into memory in the browser.
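//!
//! A minimal usage sketch (assuming this module is exported as
//! `typstify_search::simple`; collecting the pages is up to the caller):
//!
//! ```no_run
//! use typstify_core::Page;
//! use typstify_search::simple::SimpleSearchIndex;
//!
//! // In practice the pages come from the site build.
//! let pages: Vec<&Page> = Vec::new();
//! let index = SimpleSearchIndex::from_pages(&pages);
//! for doc in index.search("rust") {
//!     println!("{} -> {}", doc.title, doc.url);
//! }
//! ```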

use std::{collections::HashMap, fs, path::Path};

use serde::{Deserialize, Serialize};
use tracing::info;
use typstify_core::Page;

use crate::SearchError;

/// Maximum recommended size for the simple index (500 KiB).
pub const MAX_SIMPLE_INDEX_SIZE: usize = 500 * 1024;

/// A simple search index document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SimpleDocument {
    /// Document URL.
    pub url: String,

    /// Document title.
    pub title: String,

    /// Document description/summary.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,

    /// Language code.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lang: Option<String>,

    /// Tags.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tags: Vec<String>,

    /// Publication date as an RFC 3339 string.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub date: Option<String>,

    /// Pre-tokenized terms from title and body.
    pub terms: Vec<String>,
}

/// A simple JSON-based search index.
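///
/// Serialized, the index has roughly this shape (illustrative only; optional
/// document fields are omitted when empty):
///
/// ```json
/// {
///   "version": 1,
///   "documents": [{ "url": "/post", "title": "Post", "terms": ["post"] }],
///   "index": { "post": [0] }
/// }
/// ```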
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SimpleSearchIndex {
    /// Index format version.
    pub version: u32,

    /// All indexed documents.
    pub documents: Vec<SimpleDocument>,

    /// Inverted index: term -> document indices.
    pub index: HashMap<String, Vec<usize>>,
}

impl SimpleSearchIndex {
    /// Create a new empty index.
    pub fn new() -> Self {
        Self {
            version: 1,
            documents: Vec::new(),
            index: HashMap::new(),
        }
    }

    /// Build an index from a collection of pages.
    pub fn from_pages(pages: &[&Page]) -> Self {
        let mut index = Self::new();

        for page in pages {
            index.add_page(page);
        }

        index.build_inverted_index();
        index
    }

    /// Add a page to the index.
    pub fn add_page(&mut self, page: &Page) {
        let terms = tokenize_content(&page.title, &page.content, &page.tags);

        let doc = SimpleDocument {
            url: page.url.clone(),
            title: page.title.clone(),
            description: page.description.clone().or(page.summary.clone()),
            lang: Some(page.lang.clone()),
            tags: page.tags.clone(),
            date: page.date.map(|d| d.to_rfc3339()),
            terms,
        };

        self.documents.push(doc);
    }

    /// Build the inverted index from documents.
    fn build_inverted_index(&mut self) {
        self.index.clear();

        for (doc_idx, doc) in self.documents.iter().enumerate() {
            for term in &doc.terms {
                self.index.entry(term.clone()).or_default().push(doc_idx);
            }
        }

        // Deduplicate posting lists
        for postings in self.index.values_mut() {
            postings.sort_unstable();
            postings.dedup();
        }

        info!(
            documents = self.documents.len(),
            terms = self.index.len(),
            "Built simple search index"
        );
    }

    /// Search the index for matching documents.
    ///
    /// Returns documents matching all query terms (AND search).
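    ///
    /// A minimal sketch (module path assumed as in the crate docs; an empty
    /// index keeps the example self-contained):
    ///
    /// ```no_run
    /// # use typstify_search::simple::SimpleSearchIndex;
    /// let index = SimpleSearchIndex::new();
    /// // Every term must be present in a document for it to match,
    /// // so an unknown term yields no results at all.
    /// assert!(index.search("rust tutorial").is_empty());
    /// ```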
    pub fn search(&self, query: &str) -> Vec<&SimpleDocument> {
        let query_terms = tokenize_query(query);

        if query_terms.is_empty() {
            return Vec::new();
        }

        // Find documents containing all query terms
        let mut result_indices: Option<Vec<usize>> = None;

        for term in &query_terms {
            if let Some(postings) = self.index.get(term) {
                match &mut result_indices {
                    None => {
                        result_indices = Some(postings.clone());
                    }
                    Some(indices) => {
                        // Intersect with existing results
                        indices.retain(|idx| postings.contains(idx));
                    }
                }
            } else {
                // Term not found, no results
                return Vec::new();
            }
        }

        result_indices
            .unwrap_or_default()
            .iter()
            .filter_map(|&idx| self.documents.get(idx))
            .collect()
    }

    /// Serialize the index to JSON.
    pub fn to_json(&self) -> Result<String, SearchError> {
        serde_json::to_string(self).map_err(|e| SearchError::Serialization(e.to_string()))
    }

    /// Serialize the index to pretty-printed JSON.
    pub fn to_json_pretty(&self) -> Result<String, SearchError> {
        serde_json::to_string_pretty(self).map_err(|e| SearchError::Serialization(e.to_string()))
    }

    /// Deserialize an index from JSON.
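    ///
    /// Together with [`to_json`](Self::to_json) this round-trips losslessly.
    /// A minimal sketch (assumes `SearchError: Debug`, as `unwrap` requires):
    ///
    /// ```no_run
    /// # use typstify_search::simple::SimpleSearchIndex;
    /// let index = SimpleSearchIndex::new();
    /// let json = index.to_json().unwrap();
    /// let restored = SimpleSearchIndex::from_json(&json).unwrap();
    /// assert_eq!(restored.version, index.version);
    /// ```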
    pub fn from_json(json: &str) -> Result<Self, SearchError> {
        serde_json::from_str(json).map_err(|e| SearchError::Serialization(e.to_string()))
    }

    /// Write the index to a file.
    pub fn write_to_file(&self, path: &Path) -> Result<(), SearchError> {
        let json = self.to_json()?;

        // Warn if the index exceeds the recommended size
        if json.len() > MAX_SIMPLE_INDEX_SIZE {
            tracing::warn!(
                size = json.len(),
                max = MAX_SIMPLE_INDEX_SIZE,
                "Simple search index exceeds recommended size"
            );
        }

        fs::write(path, json).map_err(|e| SearchError::Io(e.to_string()))?;
        Ok(())
    }

    /// Get the estimated size of the serialized index.
    pub fn estimated_size(&self) -> usize {
        // Rough estimate: JSON overhead + document data
        self.documents
            .iter()
            .map(|d| {
                d.url.len()
                    + d.title.len()
                    + d.description.as_ref().map(|s| s.len()).unwrap_or(0)
                    + d.terms.iter().map(|t| t.len() + 3).sum::<usize>()
                    + 100 // JSON overhead
            })
            .sum()
    }

    /// Check if the index is within the recommended size limit.
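    ///
    /// A sketch of the intended call site (the Tantivy fallback itself is
    /// outside this module):
    ///
    /// ```no_run
    /// # use typstify_search::simple::SimpleSearchIndex;
    /// # let index = SimpleSearchIndex::new();
    /// if !index.is_within_size_limit() {
    ///     // Site is too large for the in-browser index; build the
    ///     // Tantivy index instead (not shown here).
    /// }
    /// ```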
    pub fn is_within_size_limit(&self) -> bool {
        self.estimated_size() <= MAX_SIMPLE_INDEX_SIZE
    }
}

impl Default for SimpleSearchIndex {
    fn default() -> Self {
        Self::new()
    }
}

/// Tokenize content for indexing.
///
/// Extracts terms from title, body content, and tags.
fn tokenize_content(title: &str, content: &str, tags: &[String]) -> Vec<String> {
    let mut terms = Vec::new();

    // Tokenize the title (title terms are indexed the same as body terms)
    for term in tokenize_text(title) {
        terms.push(term);
    }

    // Tokenize body content
    let body_text = strip_html(content);
    for term in tokenize_text(&body_text) {
        terms.push(term);
    }

    // Add tags
    for tag in tags {
        terms.push(normalize_term(tag));
    }

    // Deduplicate
    terms.sort();
    terms.dedup();

    terms
}

/// Tokenize a query string.
fn tokenize_query(query: &str) -> Vec<String> {
    tokenize_text(query)
}

/// Tokenize text into normalized terms.
///
/// Supports both space-separated languages (such as English) and CJK
/// languages (Chinese, Japanese, Korean).
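///
/// For example (duplicates here are removed later by `tokenize_content`):
///
/// ```text
/// "Rust 入門" -> ["rust", "入門", "入", "門", "入門", "入門"]
/// ```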
fn tokenize_text(text: &str) -> Vec<String> {
    let mut terms = Vec::new();

    // First, extract word-based terms (for English and other space-separated languages)
    for word in text.split(|c: char| !c.is_alphanumeric()) {
        if word.len() >= 2 {
            terms.push(normalize_term(word));
        }
    }

    // Then, extract CJK characters (Chinese, Japanese, Korean)
    // CJK characters are meaningful individually or in small groups
    let cjk_text: String = text.chars().filter(|c| is_cjk_char(*c)).collect();

    if !cjk_text.is_empty() {
        // Add individual CJK characters
        for c in cjk_text.chars() {
            terms.push(c.to_string());
        }

        // Add bigrams (2-character combinations) for better matching
        let chars: Vec<char> = cjk_text.chars().collect();
        for i in 0..chars.len().saturating_sub(1) {
            terms.push(format!("{}{}", chars[i], chars[i + 1]));
        }

        // Add the full CJK text if it's short enough to be meaningful
        if cjk_text.len() <= 20 && cjk_text.chars().count() >= 2 {
            terms.push(cjk_text.to_lowercase());
        }
    }

    terms
}

/// Check if a character is a CJK (Chinese, Japanese, Korean) character.
fn is_cjk_char(c: char) -> bool {
    matches!(c,
        '\u{4E00}'..='\u{9FFF}' |      // CJK Unified Ideographs
        '\u{3400}'..='\u{4DBF}' |      // CJK Unified Ideographs Extension A
        '\u{20000}'..='\u{2A6DF}' |    // CJK Unified Ideographs Extension B
        '\u{2A700}'..='\u{2B73F}' |    // CJK Unified Ideographs Extension C
        '\u{2B740}'..='\u{2B81F}' |    // CJK Unified Ideographs Extension D
        '\u{2B820}'..='\u{2CEAF}' |    // CJK Unified Ideographs Extension E
        '\u{2CEB0}'..='\u{2EBEF}' |    // CJK Unified Ideographs Extension F
        '\u{30000}'..='\u{3134F}' |    // CJK Unified Ideographs Extension G
        '\u{F900}'..='\u{FAFF}' |      // CJK Compatibility Ideographs
        '\u{2F800}'..='\u{2FA1F}' |    // CJK Compatibility Ideographs Supplement
        '\u{3040}'..='\u{309F}' |      // Hiragana
        '\u{30A0}'..='\u{30FF}' |      // Katakana
        '\u{AC00}'..='\u{D7AF}'        // Korean Hangul Syllables
    )
}

/// Normalize a term (lowercase, trim).
fn normalize_term(term: &str) -> String {
    term.to_lowercase().trim().to_string()
}

/// Strip HTML tags from content.
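///
/// Anything between `<` and `>` is dropped, and each closing `>` becomes a
/// space so adjacent words do not fuse:
///
/// ```text
/// "<p>Hello <strong>world</strong>!</p>" -> " Hello  world ! "
/// ```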
fn strip_html(html: &str) -> String {
    let mut result = String::with_capacity(html.len());
    let mut in_tag = false;

    for c in html.chars() {
        if c == '<' {
            in_tag = true;
        } else if c == '>' {
            in_tag = false;
            result.push(' ');
        } else if !in_tag {
            result.push(c);
        }
    }

    result
}

#[cfg(test)]
mod tests {
    use chrono::Utc;

    use super::*;

    fn create_test_page(url: &str, title: &str, content: &str, tags: Vec<String>) -> Page {
        Page {
            url: url.to_string(),
            title: title.to_string(),
            description: Some(format!("Description of {}", title)),
            date: Some(Utc::now()),
            updated: None,
            draft: false,
            lang: "en".to_string(),
            is_default_lang: true,
            canonical_id: url.trim_start_matches('/').to_string(),
            tags,
            categories: vec![],
            content: content.to_string(),
            summary: None,
            reading_time: Some(5),
            word_count: Some(100),
            source_path: None,
            aliases: vec![],
            toc: vec![],
            custom_js: vec![],
            custom_css: vec![],
            template: None,
            weight: 0,
        }
    }

    #[test]
    fn test_tokenize_text() {
        let terms = tokenize_text("Hello World! This is a test.");
        assert!(terms.contains(&"hello".to_string()));
        assert!(terms.contains(&"world".to_string()));
        assert!(terms.contains(&"test".to_string()));
        // Single character "a" should be filtered out
        assert!(!terms.contains(&"a".to_string()));
    }

    #[test]
    fn test_tokenize_chinese() {
        let terms = tokenize_text("你好世界");
        // Should contain individual characters
        assert!(terms.contains(&"你".to_string()));
        assert!(terms.contains(&"好".to_string()));
        assert!(terms.contains(&"世".to_string()));
        assert!(terms.contains(&"界".to_string()));
        // Should contain bigrams
        assert!(terms.contains(&"你好".to_string()));
        assert!(terms.contains(&"世界".to_string()));
    }

    #[test]
    fn test_is_cjk_char() {
        // Chinese
        assert!(is_cjk_char('你'));
        assert!(is_cjk_char('好'));
        // Japanese
        assert!(is_cjk_char('あ')); // Hiragana
        assert!(is_cjk_char('ア')); // Katakana
        // Korean
        assert!(is_cjk_char('한')); // Hangul
        // Not CJK
        assert!(!is_cjk_char('a'));
        assert!(!is_cjk_char('1'));
    }

    #[test]
    fn test_strip_html() {
        let html = "<p>Hello <strong>world</strong>!</p>";
        let text = strip_html(html);
        assert!(text.contains("Hello"));
        assert!(text.contains("world"));
        assert!(!text.contains("<p>"));
    }

    #[test]
    fn test_simple_index_from_pages() {
        let page1 = create_test_page(
            "/post1",
            "Introduction to Rust",
            "<p>Rust is a systems programming language.</p>",
            vec!["rust".to_string(), "programming".to_string()],
        );
        let page2 = create_test_page(
            "/post2",
            "Learning Go",
            "<p>Go is a great language for servers.</p>",
            vec!["go".to_string(), "programming".to_string()],
        );

        let index = SimpleSearchIndex::from_pages(&[&page1, &page2]);

        assert_eq!(index.documents.len(), 2);
        assert!(!index.index.is_empty());

        // Check term indexing
        assert!(index.index.contains_key("rust"));
        assert!(index.index.contains_key("programming"));
    }

    #[test]
    fn test_simple_index_search() {
        let page1 = create_test_page(
            "/rust",
            "Learning Rust",
            "<p>Rust programming tutorial.</p>",
            vec!["rust".to_string()],
        );
        let page2 = create_test_page(
            "/go",
            "Learning Go",
            "<p>Go programming tutorial.</p>",
            vec!["go".to_string()],
        );

        let index = SimpleSearchIndex::from_pages(&[&page1, &page2]);

        // Search for Rust
        let results = index.search("rust");
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].url, "/rust");

        // Search for programming (should match both)
        let results = index.search("programming");
        assert_eq!(results.len(), 2);

        // Search for non-existent term
        let results = index.search("python");
        assert!(results.is_empty());
    }

    #[test]
    fn test_simple_index_serialization() {
        let page = create_test_page(
            "/test",
            "Test Page",
            "<p>Test content</p>",
            vec!["test".to_string()],
        );

        let index = SimpleSearchIndex::from_pages(&[&page]);
        let json = index.to_json().unwrap();
        let parsed = SimpleSearchIndex::from_json(&json).unwrap();

        assert_eq!(parsed.documents.len(), 1);
        assert_eq!(parsed.documents[0].url, "/test");
    }

    #[test]
    fn test_simple_index_multi_term_search() {
        let page1 = create_test_page(
            "/post1",
            "Rust Programming Guide",
            "<p>Learn systems programming with Rust.</p>",
            vec!["rust".to_string()],
        );
        let page2 = create_test_page(
            "/post2",
            "Python Programming",
            "<p>Learn scripting with Python.</p>",
            vec!["python".to_string()],
        );

        let index = SimpleSearchIndex::from_pages(&[&page1, &page2]);

        // Searching for "rust systems" should only match post1
        let results = index.search("rust systems");
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].url, "/post1");
    }

    #[test]
    fn test_estimated_size() {
        let page = create_test_page(
            "/test",
            "Test Page",
            "<p>Test content</p>",
            vec!["test".to_string()],
        );

        let index = SimpleSearchIndex::from_pages(&[&page]);
        let estimated = index.estimated_size();

        // Should have some reasonable size
        assert!(estimated > 0);
        assert!(estimated < MAX_SIMPLE_INDEX_SIZE);
        assert!(index.is_within_size_limit());
    }
}