mdbook/renderer/html_handlebars/
search.rs1use std::borrow::Cow;
2use std::collections::{HashMap, HashSet};
3use std::path::{Path, PathBuf};
4use std::sync::LazyLock;
5
6use elasticlunr::{Index, IndexBuilder};
7use pulldown_cmark::*;
8
9use crate::book::{Book, BookItem, Chapter};
10use crate::config::{Search, SearchChapterSettings};
11use crate::errors::*;
12use crate::renderer::html_handlebars::StaticFiles;
13use crate::theme::searcher;
14use crate::utils;
15use log::{debug, warn};
16use serde::Serialize;
17
const MAX_WORD_LENGTH_TO_INDEX: usize = 80;

/// Splits `text` into lowercase search tokens, breaking on whitespace and
/// hyphens, and drops empty segments as well as over-long words (which bloat
/// the index and are almost never real search terms).
fn tokenize(text: &str) -> Vec<String> {
    text.split(|c: char| c.is_whitespace() || c == '-')
        .filter(|s| !s.is_empty())
        // No `trim()` needed: splitting on whitespace leaves no surrounding
        // whitespace on the segments.
        .map(str::to_lowercase)
        // `len()` is the byte length of the lowercased token.
        .filter(|s| s.len() <= MAX_WORD_LENGTH_TO_INDEX)
        .collect()
}
28
29pub fn create_files(
31 search_config: &Search,
32 static_files: &mut StaticFiles,
33 book: &Book,
34) -> Result<()> {
35 let mut index = IndexBuilder::new()
36 .add_field_with_tokenizer("title", Box::new(&tokenize))
37 .add_field_with_tokenizer("body", Box::new(&tokenize))
38 .add_field_with_tokenizer("breadcrumbs", Box::new(&tokenize))
39 .build();
40
41 let mut doc_urls = Vec::with_capacity(book.sections.len());
42
43 let chapter_configs = sort_search_config(&search_config.chapter);
44 validate_chapter_config(&chapter_configs, book)?;
45
46 for item in book.iter() {
47 let chapter = match item {
48 BookItem::Chapter(ch) if !ch.is_draft_chapter() => ch,
49 _ => continue,
50 };
51 if let Some(path) = settings_path(chapter) {
52 let chapter_settings = get_chapter_settings(&chapter_configs, path);
53 if !chapter_settings.enable.unwrap_or(true) {
54 continue;
55 }
56 }
57 render_item(&mut index, search_config, &mut doc_urls, chapter)?;
58 }
59
60 let index = write_to_json(index, search_config, doc_urls)?;
61 debug!("Writing search index ✓");
62 if index.len() > 10_000_000 {
63 warn!("search index is very large ({} bytes)", index.len());
64 }
65
66 if search_config.copy_js {
67 static_files.add_builtin(
68 "searchindex.js",
69 format!(
72 "window.search = Object.assign(window.search, JSON.parse('{}'));",
73 index.replace("\\", "\\\\").replace("'", "\\'")
74 )
75 .as_bytes(),
76 );
77 static_files.add_builtin("searcher.js", searcher::JS);
78 static_files.add_builtin("mark.min.js", searcher::MARK_JS);
79 static_files.add_builtin("elasticlunr.min.js", searcher::ELASTICLUNR_JS);
80 debug!("Copying search files ✓");
81 }
82
83 Ok(())
84}
85
86fn add_doc(
88 index: &mut Index,
89 doc_urls: &mut Vec<String>,
90 anchor_base: &str,
91 heading: &str,
92 id_counter: &mut HashMap<String, usize>,
93 section_id: &Option<CowStr<'_>>,
94 items: &[&str],
95) {
96 let section_id = section_id.as_ref().map(|id| id.to_string()).or_else(|| {
99 if heading.is_empty() {
100 None
102 } else {
103 Some(utils::unique_id_from_content(heading, id_counter))
104 }
105 });
106
107 let url = if let Some(id) = section_id {
108 Cow::Owned(format!("{anchor_base}#{id}"))
109 } else {
110 Cow::Borrowed(anchor_base)
111 };
112 let url = utils::collapse_whitespace(url.trim());
113 let doc_ref = doc_urls.len().to_string();
114 doc_urls.push(url.into());
115
116 let items = items.iter().map(|&x| utils::collapse_whitespace(x.trim()));
117 index.add_doc(&doc_ref, items);
118}
119
120fn render_item(
122 index: &mut Index,
123 search_config: &Search,
124 doc_urls: &mut Vec<String>,
125 chapter: &Chapter,
126) -> Result<()> {
127 let chapter_path = chapter
128 .path
129 .as_ref()
130 .expect("Checked that path exists above");
131 let filepath = Path::new(&chapter_path).with_extension("html");
132 let filepath = filepath
133 .to_str()
134 .with_context(|| "Could not convert HTML path to str")?;
135 let anchor_base = utils::fs::normalize_path(filepath);
136
137 let mut p = utils::new_cmark_parser(&chapter.content, false).peekable();
138
139 let mut in_heading = false;
140 let max_section_depth = u32::from(search_config.heading_split_level);
141 let mut section_id = None;
142 let mut heading = String::new();
143 let mut body = String::new();
144 let mut breadcrumbs = chapter.parent_names.clone();
145 let mut footnote_numbers = HashMap::new();
146
147 breadcrumbs.push(chapter.name.clone());
148
149 let mut id_counter = HashMap::new();
150 while let Some(event) = p.next() {
151 match event {
152 Event::Start(Tag::Heading { level, id, .. }) if level as u32 <= max_section_depth => {
153 if !heading.is_empty() {
154 add_doc(
157 index,
158 doc_urls,
159 &anchor_base,
160 &heading,
161 &mut id_counter,
162 §ion_id,
163 &[&heading, &body, &breadcrumbs.join(" » ")],
164 );
165 heading.clear();
166 body.clear();
167 breadcrumbs.pop();
168 }
169
170 section_id = id;
171 in_heading = true;
172 }
173 Event::End(TagEnd::Heading(level)) if level as u32 <= max_section_depth => {
174 in_heading = false;
175 breadcrumbs.push(heading.clone());
176 }
177 Event::Start(Tag::FootnoteDefinition(name)) => {
178 let number = footnote_numbers.len() + 1;
179 footnote_numbers.entry(name).or_insert(number);
180 }
181 Event::Html(html) => {
182 let mut html_block = html.into_string();
183
184 while let Some(Event::Html(html)) = p.peek() {
188 html_block.push_str(html);
189 p.next();
190 }
191 body.push_str(&clean_html(&html_block));
192 }
193 Event::InlineHtml(html) => {
194 body.push_str(&clean_html(&html));
203 }
204 Event::Start(_) | Event::End(_) | Event::Rule | Event::SoftBreak | Event::HardBreak => {
205 if in_heading {
208 heading.push(' ');
209 } else {
210 body.push(' ');
211 }
212 }
213 Event::Text(text) | Event::Code(text) => {
214 if in_heading {
215 heading.push_str(&text);
216 } else {
217 body.push_str(&text);
218 }
219 }
220 Event::FootnoteReference(name) => {
221 let len = footnote_numbers.len() + 1;
222 let number = footnote_numbers.entry(name).or_insert(len);
223 body.push_str(&format!(" [{number}] "));
224 }
225 Event::TaskListMarker(_checked) => {}
226 }
227 }
228
229 if !body.is_empty() || !heading.is_empty() {
230 let title = if heading.is_empty() {
231 if let Some(chapter) = breadcrumbs.first() {
232 chapter
233 } else {
234 ""
235 }
236 } else {
237 &heading
238 };
239 add_doc(
241 index,
242 doc_urls,
243 &anchor_base,
244 &heading,
245 &mut id_counter,
246 §ion_id,
247 &[title, &body, &breadcrumbs.join(" » ")],
248 );
249 }
250
251 Ok(())
252}
253
254fn write_to_json(index: Index, search_config: &Search, doc_urls: Vec<String>) -> Result<String> {
255 use elasticlunr::config::{SearchBool, SearchOptions, SearchOptionsField};
256 use std::collections::BTreeMap;
257
258 #[derive(Serialize)]
259 struct ResultsOptions {
260 limit_results: u32,
261 teaser_word_count: u32,
262 }
263
264 #[derive(Serialize)]
265 struct SearchindexJson {
266 results_options: ResultsOptions,
268 search_options: SearchOptions,
270 doc_urls: Vec<String>,
272 index: elasticlunr::Index,
274 }
275
276 let mut fields = BTreeMap::new();
277 let mut opt = SearchOptionsField::default();
278 let mut insert_boost = |key: &str, boost| {
279 opt.boost = Some(boost);
280 fields.insert(key.into(), opt);
281 };
282 insert_boost("title", search_config.boost_title);
283 insert_boost("body", search_config.boost_paragraph);
284 insert_boost("breadcrumbs", search_config.boost_hierarchy);
285
286 let search_options = SearchOptions {
287 bool: if search_config.use_boolean_and {
288 SearchBool::And
289 } else {
290 SearchBool::Or
291 },
292 expand: search_config.expand,
293 fields,
294 };
295
296 let results_options = ResultsOptions {
297 limit_results: search_config.limit_results,
298 teaser_word_count: search_config.teaser_word_count,
299 };
300
301 let json_contents = SearchindexJson {
302 results_options,
303 search_options,
304 doc_urls,
305 index,
306 };
307
308 let json_contents = serde_json::to_value(&json_contents)?;
311 let json_contents = serde_json::to_string(&json_contents)?;
312
313 Ok(json_contents)
314}
315
316fn clean_html(html: &str) -> String {
317 static AMMONIA: LazyLock<ammonia::Builder<'static>> = LazyLock::new(|| {
318 let mut clean_content = HashSet::new();
319 clean_content.insert("script");
320 clean_content.insert("style");
321 let mut builder = ammonia::Builder::new();
322 builder
323 .tags(HashSet::new())
324 .tag_attributes(HashMap::new())
325 .generic_attributes(HashSet::new())
326 .link_rel(None)
327 .allowed_classes(HashMap::new())
328 .clean_content_tags(clean_content);
329 builder
330 });
331 AMMONIA.clean(html).to_string()
332}
333
334fn settings_path(ch: &Chapter) -> Option<&Path> {
335 ch.source_path.as_deref().or_else(|| ch.path.as_deref())
336}
337
338fn validate_chapter_config(
339 chapter_configs: &[(PathBuf, SearchChapterSettings)],
340 book: &Book,
341) -> Result<()> {
342 for (path, _) in chapter_configs {
343 let found = book
344 .iter()
345 .filter_map(|item| match item {
346 BookItem::Chapter(ch) if !ch.is_draft_chapter() => settings_path(ch),
347 _ => None,
348 })
349 .any(|source_path| source_path.starts_with(path));
350 if !found {
351 bail!(
352 "[output.html.search.chapter] key `{}` does not match any chapter paths",
353 path.display()
354 );
355 }
356 }
357 Ok(())
358}
359
360fn sort_search_config(
361 map: &HashMap<String, SearchChapterSettings>,
362) -> Vec<(PathBuf, SearchChapterSettings)> {
363 let mut settings: Vec<_> = map
364 .iter()
365 .map(|(key, value)| (PathBuf::from(key), value.clone()))
366 .collect();
367 settings.sort_by(|a, b| a.0.cmp(&b.0));
370 settings
371}
372
373fn get_chapter_settings(
374 chapter_configs: &[(PathBuf, SearchChapterSettings)],
375 source_path: &Path,
376) -> SearchChapterSettings {
377 let mut result = SearchChapterSettings::default();
378 for (path, config) in chapter_configs {
379 if source_path.starts_with(path) {
380 result.enable = config.enable.or(result.enable);
381 }
382 }
383 result
384}
385
// Longer (more specific) path prefixes must win over shorter ones regardless
// of their order in the TOML table, because sort_search_config puts parents
// before children and get_chapter_settings lets later entries override.
#[test]
fn chapter_settings_priority() {
    let cfg = r#"
        [output.html.search.chapter]
        "cli/watch.md" = { enable = true }
        "cli" = { enable = false }
        "cli/inner/foo.md" = { enable = false }
        "cli/inner" = { enable = true }
        "foo" = {} # Just to make sure empty table is allowed.
        "#;
    let cfg: crate::Config = toml::from_str(cfg).unwrap();
    let html = cfg.html_config().unwrap();
    let chapter_configs = sort_search_config(&html.search.unwrap().chapter);
    for (path, enable) in [
        ("foo.md", None),
        ("cli/watch.md", Some(true)),
        ("cli/index.md", Some(false)),
        ("cli/inner/index.md", Some(true)),
        ("cli/inner/foo.md", Some(false)),
    ] {
        assert_eq!(
            get_chapter_settings(&chapter_configs, Path::new(path)),
            SearchChapterSettings { enable }
        );
    }
}
412
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_basic() {
        assert_eq!(tokenize("hello world"), vec!["hello", "world"]);
    }

    // Hyphens act as word separators, just like whitespace.
    #[test]
    fn test_tokenize_with_hyphens() {
        assert_eq!(
            tokenize("hello-world test-case"),
            vec!["hello", "world", "test", "case"]
        );
    }

    #[test]
    fn test_tokenize_mixed_whitespace() {
        assert_eq!(
            tokenize("hello\tworld\ntest\r\ncase"),
            vec!["hello", "world", "test", "case"]
        );
    }

    #[test]
    fn test_tokenize_empty_string() {
        assert_eq!(tokenize(""), Vec::<String>::new());
    }

    #[test]
    fn test_tokenize_only_whitespace() {
        assert_eq!(tokenize(" \t\n "), Vec::<String>::new());
    }

    // Tokens are lowercased before indexing.
    #[test]
    fn test_tokenize_case_normalization() {
        assert_eq!(tokenize("Hello WORLD Test"), vec!["hello", "world", "test"]);
    }

    #[test]
    fn test_tokenize_trim_whitespace() {
        assert_eq!(tokenize("  hello  world  "), vec!["hello", "world"]);
    }

    // Words longer than MAX_WORD_LENGTH_TO_INDEX are dropped entirely.
    #[test]
    fn test_tokenize_long_words_filtered() {
        let long_word = "a".repeat(MAX_WORD_LENGTH_TO_INDEX + 1);
        let short_word = "a".repeat(MAX_WORD_LENGTH_TO_INDEX);
        let input = format!("{} hello {}", long_word, short_word);
        assert_eq!(tokenize(&input), vec!["hello", &short_word]);
    }

    // A word exactly at the limit is kept (the bound is inclusive).
    #[test]
    fn test_tokenize_max_length_word() {
        let max_word = "a".repeat(MAX_WORD_LENGTH_TO_INDEX);
        assert_eq!(tokenize(&max_word), vec![max_word]);
    }

    // Punctuation is NOT a separator — only whitespace and hyphens are.
    #[test]
    fn test_tokenize_special_characters() {
        assert_eq!(
            tokenize("hello,world.test!case?"),
            vec!["hello,world.test!case?"]
        );
    }

    #[test]
    fn test_tokenize_unicode() {
        assert_eq!(
            tokenize("café naïve résumé"),
            vec!["café", "naïve", "résumé"]
        );
    }

    #[test]
    fn test_tokenize_unicode_rtl_hebre() {
        assert_eq!(tokenize("שלום עולם"), vec!["שלום", "עולם"]);
    }

    #[test]
    fn test_tokenize_numbers() {
        assert_eq!(
            tokenize("test123 456-789 hello"),
            vec!["test123", "456", "789", "hello"]
        );
    }
}