use std::sync::OnceLock;

use regex::Regex;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
4
/// Caller-supplied settings for [`build_search_index`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct SearchConfig {
    /// Language identifiers, copied unchanged into [`SearchIndex::languages`].
    ///
    /// NOTE(review): the tests pass codes such as `"zh"` and `"en"`; nothing in this
    /// file validates or interprets the values.
    pub languages: Vec<String>,
}
9
/// One source page handed to [`build_search_index`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct SearchPage {
    /// Page title; tokenized and also stored verbatim in the index.
    pub title: String,
    /// Page URL; stored verbatim and hashed to derive the index entry's id.
    pub url: String,
    /// Section headings; joined with spaces for tokenization and stored verbatim.
    pub headings: Vec<String>,
    /// Body text; tokenized and also stored verbatim in the index.
    pub body: String,
}
17
18#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
19pub struct SearchIndex {
20 pub version: u8,
21 pub languages: Vec<String>,
22 pub pages: Vec<SearchIndexPage>,
23}
24
25#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
26pub struct SearchIndexPage {
27 pub id: String,
28 pub title: String,
29 pub url: String,
30 pub headings: Vec<String>,
31 pub body: String,
32 pub tokens: Vec<String>,
33}
34
35pub fn build_search_index(config: SearchConfig, pages: &[SearchPage]) -> SearchIndex {
36 SearchIndex {
37 version: 1,
38 languages: config.languages,
39 pages: pages
40 .iter()
41 .map(|page| {
42 let mut tokens = tokenize(&format!(
43 "{} {} {}",
44 page.title,
45 page.headings.join(" "),
46 page.body
47 ));
48 tokens.sort();
49 tokens.dedup();
50 SearchIndexPage {
51 id: stable_id(&page.url),
52 title: page.title.clone(),
53 url: page.url.clone(),
54 headings: page.headings.clone(),
55 body: page.body.clone(),
56 tokens,
57 }
58 })
59 .collect(),
60 }
61}
62
63pub fn tokenize(input: &str) -> Vec<String> {
64 let mut tokens = Vec::new();
65 let latin = Regex::new(r"[A-Za-z0-9]+").unwrap();
66 for capture in latin.find_iter(input) {
67 let token = stem_english(&capture.as_str().to_lowercase());
68 if !token.is_empty() {
69 tokens.push(token);
70 }
71 }
72
73 for ch in input.chars() {
74 if is_cjk(ch) {
75 tokens.push(ch.to_string());
76 }
77 }
78
79 tokens
80}
81
/// Returns a fixed 8-byte placeholder WebAssembly binary.
///
/// The bytes are `\0asm` followed by `01 00 00 00`.
/// NOTE(review): this looks like the WebAssembly module preamble (magic number plus
/// version 1) with no sections — confirm against the binary-format specification.
pub fn wasm_placeholder() -> &'static [u8] {
    const PLACEHOLDER: &[u8] = b"\0asm\x01\0\0\0";
    PLACEHOLDER
}
87
/// Strips one common English suffix from `token`, if doing so leaves a usable stem.
///
/// The suffixes `ing`, `ed` and `es` are tried in that order; the first one that the
/// token ends with *and* whose removal leaves more than two bytes is stripped. Tokens
/// that match no rule are returned unchanged. This is a deliberately crude stemmer:
/// for example `running` becomes `runn`.
fn stem_english(token: &str) -> String {
    const SUFFIXES: [&str; 3] = ["ing", "ed", "es"];

    SUFFIXES
        .iter()
        .find_map(|&suffix| {
            token
                .strip_suffix(suffix)
                // Refuse to shrink short words down to one or two letters.
                .filter(|stem| stem.len() > 2)
        })
        .unwrap_or(token)
        .to_string()
}
96
/// Reports whether `ch` is an ideograph that should be indexed as a token of its own.
///
/// Accepted code-point ranges:
///
/// - `U+3400..=U+9FFF` — CJK Unified Ideographs Extension A through the main CJK
///   Unified Ideographs block (the span also takes in `U+4DC0..=U+4DFF`, kept for
///   backward compatibility).
/// - `U+F900..=U+FAFF` — CJK Compatibility Ideographs.
/// - `U+20000..=U+2FA1F` — Supplementary Ideographic Plane: Extension B onwards and
///   the Compatibility Ideographs Supplement.
/// - `U+30000..=U+323AF` — Extensions G and H.
///
/// The two supplementary ranges matter because ideographs outside the Basic
/// Multilingual Plane would otherwise be dropped from the index entirely.
/// Kana and Hangul are not matched.
fn is_cjk(ch: char) -> bool {
    matches!(
        ch,
        '\u{3400}'..='\u{9FFF}'
            | '\u{F900}'..='\u{FAFF}'
            | '\u{20000}'..='\u{2FA1F}'
            | '\u{30000}'..='\u{323AF}'
    )
}
100
101fn stable_id(input: &str) -> String {
102 let hash = Sha256::digest(input.as_bytes());
103 hash.iter()
104 .take(8)
105 .map(|byte| format!("{byte:02x}"))
106 .collect()
107}
108
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns `true` when `tokens` contains `expected` verbatim.
    fn has_token(tokens: &[String], expected: &str) -> bool {
        tokens.iter().any(|token| token == expected)
    }

    /// Latin words are lower-cased and stemmed; ideographs become one token each.
    #[test]
    fn tokenizes_english_case_insensitive_and_chinese() {
        let tokens = tokenize("Running DOCS 中文搜索");

        for expected in ["runn", "docs", "中", "文"] {
            assert!(has_token(&tokens, expected), "missing token {expected:?}");
        }
    }

    /// A single input page yields a single index entry carrying its URL and tokens.
    #[test]
    fn builds_page_index() {
        let config = SearchConfig {
            languages: vec![String::from("zh"), String::from("en")],
        };
        let home = SearchPage {
            title: String::from("Hello"),
            url: String::from("/"),
            headings: vec![String::from("Intro")],
            body: String::from("Body"),
        };

        let index = build_search_index(config, &[home]);

        assert_eq!(index.pages.len(), 1);
        let page = &index.pages[0];
        assert_eq!(page.url, "/");
        assert!(has_token(&page.tokens, "hello"));
    }
}