typstify_search/
simple.rs

1//! Simple JSON-based search index for small sites.
2//!
3//! Provides a lightweight alternative to Tantivy for sites with fewer pages.
4//! The entire index is loaded into memory in the browser.
5
6use std::{collections::HashMap, fs, path::Path};
7
8use serde::{Deserialize, Serialize};
9use tracing::info;
10use typstify_core::Page;
11
12use crate::SearchError;
13
/// Recommended upper bound, in bytes, for a serialized simple index (500 KiB).
///
/// The limit is advisory: it is compared against the JSON size when the index
/// is written and against the size estimate when callers ask whether the index
/// is small enough.
pub const MAX_SIMPLE_INDEX_SIZE: usize = 500 * 1024;
16
17/// A simple search index document.
18#[derive(Debug, Clone, Serialize, Deserialize)]
19pub struct SimpleDocument {
20    /// Document URL.
21    pub url: String,
22
23    /// Document title.
24    pub title: String,
25
26    /// Document description/summary.
27    #[serde(skip_serializing_if = "Option::is_none")]
28    pub description: Option<String>,
29
30    /// Language code.
31    #[serde(skip_serializing_if = "Option::is_none")]
32    pub lang: Option<String>,
33
34    /// Tags.
35    #[serde(default, skip_serializing_if = "Vec::is_empty")]
36    pub tags: Vec<String>,
37
38    /// Publication date as ISO string.
39    #[serde(skip_serializing_if = "Option::is_none")]
40    pub date: Option<String>,
41
42    /// Pre-tokenized terms from title and body.
43    pub terms: Vec<String>,
44}
45
46/// A simple JSON-based search index.
47#[derive(Debug, Clone, Serialize, Deserialize)]
48pub struct SimpleSearchIndex {
49    /// Index format version.
50    pub version: u32,
51
52    /// All indexed documents.
53    pub documents: Vec<SimpleDocument>,
54
55    /// Inverted index: term -> document indices.
56    pub index: HashMap<String, Vec<usize>>,
57}
58
59impl SimpleSearchIndex {
60    /// Create a new empty index.
61    pub fn new() -> Self {
62        Self {
63            version: 1,
64            documents: Vec::new(),
65            index: HashMap::new(),
66        }
67    }
68
69    /// Build an index from a collection of pages.
70    pub fn from_pages(pages: &[&Page]) -> Self {
71        let mut index = Self::new();
72
73        for page in pages {
74            index.add_page(page);
75        }
76
77        index.build_inverted_index();
78        index
79    }
80
81    /// Add a page to the index.
82    pub fn add_page(&mut self, page: &Page) {
83        let terms = tokenize_content(&page.title, &page.content, &page.tags);
84
85        let doc = SimpleDocument {
86            url: page.url.clone(),
87            title: page.title.clone(),
88            description: page.description.clone().or(page.summary.clone()),
89            lang: page.lang.clone(),
90            tags: page.tags.clone(),
91            date: page.date.map(|d| d.to_rfc3339()),
92            terms,
93        };
94
95        self.documents.push(doc);
96    }
97
98    /// Build the inverted index from documents.
99    fn build_inverted_index(&mut self) {
100        self.index.clear();
101
102        for (doc_idx, doc) in self.documents.iter().enumerate() {
103            for term in &doc.terms {
104                self.index.entry(term.clone()).or_default().push(doc_idx);
105            }
106        }
107
108        // Deduplicate posting lists
109        for postings in self.index.values_mut() {
110            postings.sort_unstable();
111            postings.dedup();
112        }
113
114        info!(
115            documents = self.documents.len(),
116            terms = self.index.len(),
117            "Built simple search index"
118        );
119    }
120
121    /// Search the index for matching documents.
122    ///
123    /// Returns documents matching all query terms (AND search).
124    pub fn search(&self, query: &str) -> Vec<&SimpleDocument> {
125        let query_terms = tokenize_query(query);
126
127        if query_terms.is_empty() {
128            return Vec::new();
129        }
130
131        // Find documents containing all query terms
132        let mut result_indices: Option<Vec<usize>> = None;
133
134        for term in &query_terms {
135            if let Some(postings) = self.index.get(term) {
136                match &mut result_indices {
137                    None => {
138                        result_indices = Some(postings.clone());
139                    }
140                    Some(indices) => {
141                        // Intersect with existing results
142                        indices.retain(|idx| postings.contains(idx));
143                    }
144                }
145            } else {
146                // Term not found, no results
147                return Vec::new();
148            }
149        }
150
151        result_indices
152            .unwrap_or_default()
153            .iter()
154            .filter_map(|&idx| self.documents.get(idx))
155            .collect()
156    }
157
158    /// Serialize the index to JSON.
159    pub fn to_json(&self) -> Result<String, SearchError> {
160        serde_json::to_string(self).map_err(|e| SearchError::Serialization(e.to_string()))
161    }
162
163    /// Serialize the index to pretty-printed JSON.
164    pub fn to_json_pretty(&self) -> Result<String, SearchError> {
165        serde_json::to_string_pretty(self).map_err(|e| SearchError::Serialization(e.to_string()))
166    }
167
168    /// Deserialize an index from JSON.
169    pub fn from_json(json: &str) -> Result<Self, SearchError> {
170        serde_json::from_str(json).map_err(|e| SearchError::Serialization(e.to_string()))
171    }
172
173    /// Write the index to a file.
174    pub fn write_to_file(&self, path: &Path) -> Result<(), SearchError> {
175        let json = self.to_json()?;
176
177        // Warn if index is too large
178        if json.len() > MAX_SIMPLE_INDEX_SIZE {
179            tracing::warn!(
180                size = json.len(),
181                max = MAX_SIMPLE_INDEX_SIZE,
182                "Simple search index exceeds recommended size"
183            );
184        }
185
186        fs::write(path, json).map_err(|e| SearchError::Io(e.to_string()))?;
187        Ok(())
188    }
189
190    /// Get the estimated size of the serialized index.
191    pub fn estimated_size(&self) -> usize {
192        // Rough estimate: JSON overhead + document data
193        self.documents
194            .iter()
195            .map(|d| {
196                d.url.len()
197                    + d.title.len()
198                    + d.description.as_ref().map(|s| s.len()).unwrap_or(0)
199                    + d.terms.iter().map(|t| t.len() + 3).sum::<usize>()
200                    + 100 // JSON overhead
201            })
202            .sum()
203    }
204
205    /// Check if the index is within the recommended size limit.
206    pub fn is_within_size_limit(&self) -> bool {
207        self.estimated_size() <= MAX_SIMPLE_INDEX_SIZE
208    }
209}
210
211impl Default for SimpleSearchIndex {
212    fn default() -> Self {
213        Self::new()
214    }
215}
216
217/// Tokenize content for indexing.
218///
219/// Extracts terms from title, body content, and tags.
220fn tokenize_content(title: &str, content: &str, tags: &[String]) -> Vec<String> {
221    let mut terms = Vec::new();
222
223    // Tokenize title (higher weight, keep as-is)
224    for term in tokenize_text(title) {
225        terms.push(term);
226    }
227
228    // Tokenize body content
229    let body_text = strip_html(content);
230    for term in tokenize_text(&body_text) {
231        terms.push(term);
232    }
233
234    // Add tags
235    for tag in tags {
236        terms.push(normalize_term(tag));
237    }
238
239    // Deduplicate
240    terms.sort();
241    terms.dedup();
242
243    terms
244}
245
246/// Tokenize a query string.
247fn tokenize_query(query: &str) -> Vec<String> {
248    tokenize_text(query)
249}
250
251/// Tokenize text into normalized terms.
252fn tokenize_text(text: &str) -> Vec<String> {
253    text.split(|c: char| !c.is_alphanumeric())
254        .filter(|s| s.len() >= 2) // Skip single characters
255        .map(normalize_term)
256        .collect()
257}
258
/// Normalize a term: strip surrounding whitespace and convert to lowercase.
fn normalize_term(term: &str) -> String {
    let trimmed = term.trim();
    trimmed.to_lowercase()
}
263
/// Remove HTML tags from `html`, keeping only the text between them.
///
/// Everything from a `<` up to the next `>` is dropped, and each `>` is
/// replaced by a single space so that words separated only by markup do not
/// run together.
fn strip_html(html: &str) -> String {
    let mut text = String::with_capacity(html.len());
    let mut inside_markup = false;

    for ch in html.chars() {
        match ch {
            '<' => inside_markup = true,
            '>' => {
                inside_markup = false;
                text.push(' ');
            }
            _ if inside_markup => {}
            _ => text.push(ch),
        }
    }

    text
}
282
#[cfg(test)]
mod tests {
    use chrono::Utc;

    use super::*;

    /// Turn string literals into the owned tag list a `Page` stores.
    fn tag_list(tags: &[&str]) -> Vec<String> {
        tags.iter().map(|tag| tag.to_string()).collect()
    }

    /// Build a published English page fixture with the given URL, title,
    /// HTML content and tags; every other field gets a fixed filler value.
    fn create_test_page(url: &str, title: &str, content: &str, tags: Vec<String>) -> Page {
        let description = format!("Description of {}", title);

        Page {
            url: url.to_owned(),
            title: title.to_owned(),
            description: Some(description),
            date: Some(Utc::now()),
            updated: None,
            draft: false,
            lang: Some(String::from("en")),
            tags,
            categories: Vec::new(),
            content: content.to_owned(),
            summary: None,
            reading_time: Some(5),
            word_count: Some(100),
            source_path: None,
            aliases: Vec::new(),
            toc: Vec::new(),
            custom_js: Vec::new(),
            custom_css: Vec::new(),
            template: None,
            weight: 0,
        }
    }

    #[test]
    fn test_tokenize_text() {
        let terms = tokenize_text("Hello World! This is a test.");

        for &expected in &["hello", "world", "test"] {
            assert!(terms.iter().any(|term| term == expected));
        }

        // The one-letter word "a" must not survive tokenization.
        assert!(terms.iter().all(|term| term != "a"));
    }

    #[test]
    fn test_strip_html() {
        let text = strip_html("<p>Hello <strong>world</strong>!</p>");

        assert!(text.contains("Hello"));
        assert!(text.contains("world"));
        assert!(!text.contains("<p>"));
    }

    #[test]
    fn test_simple_index_from_pages() {
        let rust_page = create_test_page(
            "/post1",
            "Introduction to Rust",
            "<p>Rust is a systems programming language.</p>",
            tag_list(&["rust", "programming"]),
        );
        let go_page = create_test_page(
            "/post2",
            "Learning Go",
            "<p>Go is a great language for servers.</p>",
            tag_list(&["go", "programming"]),
        );

        let index = SimpleSearchIndex::from_pages(&[&rust_page, &go_page]);

        assert_eq!(index.documents.len(), 2);
        assert!(!index.index.is_empty());

        // Terms from the pages must appear as keys of the inverted index.
        for term in &["rust", "programming"] {
            assert!(index.index.contains_key(*term));
        }
    }

    #[test]
    fn test_simple_index_search() {
        let rust_page = create_test_page(
            "/rust",
            "Learning Rust",
            "<p>Rust programming tutorial.</p>",
            tag_list(&["rust"]),
        );
        let go_page = create_test_page(
            "/go",
            "Learning Go",
            "<p>Go programming tutorial.</p>",
            tag_list(&["go"]),
        );

        let index = SimpleSearchIndex::from_pages(&[&rust_page, &go_page]);

        // A term used by one page only.
        let rust_hits = index.search("rust");
        assert_eq!(rust_hits.len(), 1);
        assert_eq!(rust_hits[0].url, "/rust");

        // A term shared by both pages.
        let shared_hits = index.search("programming");
        assert_eq!(shared_hits.len(), 2);

        // A term that no page contains.
        let missing_hits = index.search("python");
        assert!(missing_hits.is_empty());
    }

    #[test]
    fn test_simple_index_serialization() {
        let page = create_test_page(
            "/test",
            "Test Page",
            "<p>Test content</p>",
            tag_list(&["test"]),
        );
        let index = SimpleSearchIndex::from_pages(&[&page]);

        // Round-trip through JSON.
        let json = index.to_json().unwrap();
        let parsed = SimpleSearchIndex::from_json(&json).unwrap();

        assert_eq!(parsed.documents.len(), 1);
        assert_eq!(parsed.documents[0].url, "/test");
    }

    #[test]
    fn test_simple_index_multi_term_search() {
        let rust_page = create_test_page(
            "/post1",
            "Rust Programming Guide",
            "<p>Learn systems programming with Rust.</p>",
            tag_list(&["rust"]),
        );
        let python_page = create_test_page(
            "/post2",
            "Python Programming",
            "<p>Learn scripting with Python.</p>",
            tag_list(&["python"]),
        );

        let index = SimpleSearchIndex::from_pages(&[&rust_page, &python_page]);

        // Both "rust" and "systems" occur only in post1, so the AND search
        // must return exactly that page.
        let results = index.search("rust systems");
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].url, "/post1");
    }

    #[test]
    fn test_estimated_size() {
        let page = create_test_page(
            "/test",
            "Test Page",
            "<p>Test content</p>",
            tag_list(&["test"]),
        );
        let index = SimpleSearchIndex::from_pages(&[&page]);

        // A one-page index is non-empty yet far below the recommended limit.
        let estimated = index.estimated_size();
        assert!(estimated > 0);
        assert!(estimated < MAX_SIMPLE_INDEX_SIZE);
        assert!(index.is_within_size_limit());
    }
}