Skip to main content

rustpress_search/
lib.rs

1use regex::Regex;
2use serde::{Deserialize, Serialize};
3use sha2::{Digest, Sha256};
4
/// Caller-supplied settings for [`build_search_index`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SearchConfig {
    /// Language identifiers for the site; the tests in this file use `"zh"` and `"en"`.
    ///
    /// [`build_search_index`] moves this list into [`SearchIndex::languages`] unchanged.
    /// Nothing visible in this file uses it to alter tokenization.
    pub languages: Vec<String>,
}
9
/// One input page handed to [`build_search_index`].
///
/// The title, headings and body are the text that gets tokenized; the URL is
/// what the page's index id is derived from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SearchPage {
    /// Page title; copied into the index and tokenized.
    pub title: String,
    /// Page URL; copied into the index and hashed to form the entry id.
    pub url: String,
    /// Heading texts of the page; copied into the index and tokenized.
    pub headings: Vec<String>,
    /// Body text of the page; copied into the index and tokenized.
    pub body: String,
}
17
18#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
19pub struct SearchIndex {
20    pub version: u8,
21    pub languages: Vec<String>,
22    pub pages: Vec<SearchIndexPage>,
23}
24
25#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
26pub struct SearchIndexPage {
27    pub id: String,
28    pub title: String,
29    pub url: String,
30    pub headings: Vec<String>,
31    pub body: String,
32    pub tokens: Vec<String>,
33}
34
35pub fn build_search_index(config: SearchConfig, pages: &[SearchPage]) -> SearchIndex {
36    SearchIndex {
37        version: 1,
38        languages: config.languages,
39        pages: pages
40            .iter()
41            .map(|page| {
42                let mut tokens = tokenize(&format!(
43                    "{} {} {}",
44                    page.title,
45                    page.headings.join(" "),
46                    page.body
47                ));
48                tokens.sort();
49                tokens.dedup();
50                SearchIndexPage {
51                    id: stable_id(&page.url),
52                    title: page.title.clone(),
53                    url: page.url.clone(),
54                    headings: page.headings.clone(),
55                    body: page.body.clone(),
56                    tokens,
57                }
58            })
59            .collect(),
60    }
61}
62
63pub fn tokenize(input: &str) -> Vec<String> {
64    let mut tokens = Vec::new();
65    let latin = Regex::new(r"[A-Za-z0-9]+").unwrap();
66    for capture in latin.find_iter(input) {
67        let token = stem_english(&capture.as_str().to_lowercase());
68        if !token.is_empty() {
69            tokens.push(token);
70        }
71    }
72
73    for ch in input.chars() {
74        if is_cjk(ch) {
75            tokens.push(ch.to_string());
76        }
77    }
78
79    tokens
80}
81
/// Returns the bytes of an empty WebAssembly module.
///
/// The module is just the 8-byte header: the `\0asm` magic followed by the
/// version field for version 1.
pub fn wasm_placeholder() -> &'static [u8] {
    // Shipping this stub keeps the static output's MVP asset contract intact
    // while JS provides the runtime fallback.
    static EMPTY_MODULE: [u8; 8] = [0x00, b'a', b's', b'm', 0x01, 0x00, 0x00, 0x00];
    &EMPTY_MODULE
}
87
/// Strips one common English suffix from `token`, if doing so leaves a stem
/// longer than two bytes.
///
/// The suffixes `"ing"`, `"ed"` and `"es"` are tried in that order and the
/// first one that qualifies is removed (`"running"` becomes `"runn"`). Tokens
/// that do not qualify are returned unchanged. No other normalization, such as
/// undoubling consonants or stripping a plain `"s"`, is attempted.
fn stem_english(token: &str) -> String {
    const SUFFIXES: [&str; 3] = ["ing", "ed", "es"];

    SUFFIXES
        .iter()
        .find_map(|&suffix| token.strip_suffix(suffix).filter(|stem| stem.len() > 2))
        .unwrap_or(token)
        .to_string()
}
96
/// Reports whether `ch` is treated as a CJK ideograph, i.e. a character that
/// is indexed as a one-character token of its own.
///
/// Besides the Basic Multilingual Plane ranges, the supplementary ideographic
/// planes are accepted so that Extension B and later ideographs (for example
/// U+20BB7) are indexed rather than silently dropped.
///
/// Kana and Hangul are not matched.
fn is_cjk(ch: char) -> bool {
    matches!(
        ch,
        // U+3400..=U+9FFF: Extension A through the main CJK Unified Ideographs block.
        '\u{3400}'..='\u{9fff}'
            // CJK Compatibility Ideographs.
            | '\u{f900}'..='\u{faff}'
            // Planes 2 and 3, which Unicode sets aside for ideographs
            // (Extension B onward and the compatibility supplement).
            | '\u{20000}'..='\u{3ffff}'
    )
}
100
101fn stable_id(input: &str) -> String {
102    let hash = Sha256::digest(input.as_bytes());
103    hash.iter()
104        .take(8)
105        .map(|byte| format!("{byte:02x}"))
106        .collect()
107}
108
#[cfg(test)]
mod tests {
    use super::*;

    /// Latin words are lowercased and stemmed; each Chinese character is a token.
    #[test]
    fn tokenizes_english_case_insensitive_and_chinese() {
        let tokens = tokenize("Running DOCS 中文搜索");
        let has = |needle: &str| tokens.iter().any(|token| token == needle);

        assert!(has("runn"));
        assert!(has("docs"));
        assert!(has("中"));
        assert!(has("文"));
    }

    /// A single page yields a single entry that keeps its URL and has title tokens.
    #[test]
    fn builds_page_index() {
        let config = SearchConfig {
            languages: vec![String::from("zh"), String::from("en")],
        };
        let page = SearchPage {
            title: String::from("Hello"),
            url: String::from("/"),
            headings: vec![String::from("Intro")],
            body: String::from("Body"),
        };

        let index = build_search_index(config, &[page]);

        assert_eq!(index.pages.len(), 1);
        let entry = &index.pages[0];
        assert_eq!(entry.url, "/");
        assert!(entry.tokens.iter().any(|token| token == "hello"));
    }
}