mdbook/renderer/html_handlebars/
search.rs

1use std::borrow::Cow;
2use std::collections::{HashMap, HashSet};
3use std::path::{Path, PathBuf};
4use std::sync::LazyLock;
5
6use elasticlunr::{Index, IndexBuilder};
7use pulldown_cmark::*;
8
9use crate::book::{Book, BookItem, Chapter};
10use crate::config::{Search, SearchChapterSettings};
11use crate::errors::*;
12use crate::renderer::html_handlebars::StaticFiles;
13use crate::theme::searcher;
14use crate::utils;
15use log::{debug, warn};
16use serde::Serialize;
17
const MAX_WORD_LENGTH_TO_INDEX: usize = 80;

/// Tokenizes in the same way as elasticlunr-rs (for English), but also drops long tokens.
fn tokenize(text: &str) -> Vec<String> {
    text.split(|c: char| c.is_whitespace() || c == '-')
        .filter(|s| !s.is_empty())
        // Splitting on whitespace characters already yields tokens with no
        // surrounding whitespace, so the former per-token `trim()` was a
        // no-op and has been removed.
        .map(str::to_lowercase)
        // `len()` is the UTF-8 byte length, not the char count — presumably
        // matching elasticlunr-rs's own length check; TODO confirm upstream.
        .filter(|s| s.len() <= MAX_WORD_LENGTH_TO_INDEX)
        .collect()
}
28
29/// Creates all files required for search.
30pub fn create_files(
31    search_config: &Search,
32    static_files: &mut StaticFiles,
33    book: &Book,
34) -> Result<()> {
35    let mut index = IndexBuilder::new()
36        .add_field_with_tokenizer("title", Box::new(&tokenize))
37        .add_field_with_tokenizer("body", Box::new(&tokenize))
38        .add_field_with_tokenizer("breadcrumbs", Box::new(&tokenize))
39        .build();
40
41    let mut doc_urls = Vec::with_capacity(book.sections.len());
42
43    let chapter_configs = sort_search_config(&search_config.chapter);
44    validate_chapter_config(&chapter_configs, book)?;
45
46    for item in book.iter() {
47        let chapter = match item {
48            BookItem::Chapter(ch) if !ch.is_draft_chapter() => ch,
49            _ => continue,
50        };
51        if let Some(path) = settings_path(chapter) {
52            let chapter_settings = get_chapter_settings(&chapter_configs, path);
53            if !chapter_settings.enable.unwrap_or(true) {
54                continue;
55            }
56        }
57        render_item(&mut index, search_config, &mut doc_urls, chapter)?;
58    }
59
60    let index = write_to_json(index, search_config, doc_urls)?;
61    debug!("Writing search index ✓");
62    if index.len() > 10_000_000 {
63        warn!("search index is very large ({} bytes)", index.len());
64    }
65
66    if search_config.copy_js {
67        static_files.add_builtin(
68            "searchindex.js",
69            // To reduce the size of the generated JSON by preventing all `"` characters to be
70            // escaped, we instead surround the string with much less common `'` character.
71            format!(
72                "window.search = Object.assign(window.search, JSON.parse('{}'));",
73                index.replace("\\", "\\\\").replace("'", "\\'")
74            )
75            .as_bytes(),
76        );
77        static_files.add_builtin("searcher.js", searcher::JS);
78        static_files.add_builtin("mark.min.js", searcher::MARK_JS);
79        static_files.add_builtin("elasticlunr.min.js", searcher::ELASTICLUNR_JS);
80        debug!("Copying search files ✓");
81    }
82
83    Ok(())
84}
85
86/// Uses the given arguments to construct a search document, then inserts it to the given index.
87fn add_doc(
88    index: &mut Index,
89    doc_urls: &mut Vec<String>,
90    anchor_base: &str,
91    heading: &str,
92    id_counter: &mut HashMap<String, usize>,
93    section_id: &Option<CowStr<'_>>,
94    items: &[&str],
95) {
96    // Either use the explicit section id the user specified, or generate one
97    // from the heading content.
98    let section_id = section_id.as_ref().map(|id| id.to_string()).or_else(|| {
99        if heading.is_empty() {
100            // In the case where a chapter has no heading, don't set a section id.
101            None
102        } else {
103            Some(utils::unique_id_from_content(heading, id_counter))
104        }
105    });
106
107    let url = if let Some(id) = section_id {
108        Cow::Owned(format!("{anchor_base}#{id}"))
109    } else {
110        Cow::Borrowed(anchor_base)
111    };
112    let url = utils::collapse_whitespace(url.trim());
113    let doc_ref = doc_urls.len().to_string();
114    doc_urls.push(url.into());
115
116    let items = items.iter().map(|&x| utils::collapse_whitespace(x.trim()));
117    index.add_doc(&doc_ref, items);
118}
119
/// Renders markdown into flat unformatted text and adds it to the search index.
///
/// The chapter is split into one search document per heading up to
/// `heading_split_level`; a final document is emitted for any trailing
/// content (or for a chapter with no headings at all).
fn render_item(
    index: &mut Index,
    search_config: &Search,
    doc_urls: &mut Vec<String>,
    chapter: &Chapter,
) -> Result<()> {
    // The anchor base is the chapter's output path (`.md` -> `.html`),
    // normalized for use in URLs; section fragments are appended to it.
    let chapter_path = chapter
        .path
        .as_ref()
        .expect("Checked that path exists above");
    let filepath = Path::new(&chapter_path).with_extension("html");
    let filepath = filepath
        .to_str()
        .with_context(|| "Could not convert HTML path to str")?;
    let anchor_base = utils::fs::normalize_path(filepath);

    let mut p = utils::new_cmark_parser(&chapter.content, false).peekable();

    // Parser state: text accumulates into `heading` while inside a heading
    // tag, otherwise into `body`; a search document is flushed each time a
    // new splitting heading starts (and once more after the loop).
    let mut in_heading = false;
    // Headings deeper than this level do not start a new search document.
    let max_section_depth = u32::from(search_config.heading_split_level);
    let mut section_id = None;
    let mut heading = String::new();
    let mut body = String::new();
    // Breadcrumb trail: parent chapter names, then this chapter's name,
    // then (while iterating) the current section heading.
    let mut breadcrumbs = chapter.parent_names.clone();
    let mut footnote_numbers = HashMap::new();

    breadcrumbs.push(chapter.name.clone());

    let mut id_counter = HashMap::new();
    while let Some(event) = p.next() {
        match event {
            Event::Start(Tag::Heading { level, id, .. }) if level as u32 <= max_section_depth => {
                if !heading.is_empty() {
                    // Section finished, the next heading is following now
                    // Write the data to the index, and clear it for the next section
                    add_doc(
                        index,
                        doc_urls,
                        &anchor_base,
                        &heading,
                        &mut id_counter,
                        &section_id,
                        &[&heading, &body, &breadcrumbs.join(" » ")],
                    );
                    heading.clear();
                    body.clear();
                    // Drop the previous section's heading from the breadcrumb
                    // trail; the new heading is pushed at its End event.
                    breadcrumbs.pop();
                }

                section_id = id;
                in_heading = true;
            }
            Event::End(TagEnd::Heading(level)) if level as u32 <= max_section_depth => {
                in_heading = false;
                breadcrumbs.push(heading.clone());
            }
            Event::Start(Tag::FootnoteDefinition(name)) => {
                // Footnotes are numbered in order of first appearance.
                let number = footnote_numbers.len() + 1;
                footnote_numbers.entry(name).or_insert(number);
            }
            Event::Html(html) => {
                let mut html_block = html.into_string();

                // As of pulldown_cmark 0.6, html events are no longer contained
                // in an HtmlBlock tag. We must collect consecutive Html events
                // into a block ourselves.
                while let Some(Event::Html(html)) = p.peek() {
                    html_block.push_str(html);
                    p.next();
                }
                body.push_str(&clean_html(&html_block));
            }
            Event::InlineHtml(html) => {
                // This is not capable of cleaning inline tags like
                // `foo <script>…</script>`. The `<script>` tags show up as
                // individual InlineHtml events, and the content inside is
                // just a regular Text event. There isn't a very good way to
                // know how to collect all the content in-between. I'm not
                // sure if this is easily fixable. It should be extremely
                // rare, since script and style tags should almost always be
                // blocks, and worse case you have some noise in the index.
                body.push_str(&clean_html(&html));
            }
            Event::Start(_) | Event::End(_) | Event::Rule | Event::SoftBreak | Event::HardBreak => {
                // Insert spaces where HTML output would usually separate text
                // to ensure words don't get merged together
                if in_heading {
                    heading.push(' ');
                } else {
                    body.push(' ');
                }
            }
            Event::Text(text) | Event::Code(text) => {
                if in_heading {
                    heading.push_str(&text);
                } else {
                    body.push_str(&text);
                }
            }
            Event::FootnoteReference(name) => {
                // Render the reference as ` [n] ` so it remains searchable text.
                let len = footnote_numbers.len() + 1;
                let number = footnote_numbers.entry(name).or_insert(len);
                body.push_str(&format!(" [{number}] "));
            }
            // Task-list checkboxes carry no indexable text.
            Event::TaskListMarker(_checked) => {}
        }
    }

    if !body.is_empty() || !heading.is_empty() {
        // A chapter with content but no heading falls back to the first
        // breadcrumb (the root chapter name) as its document title.
        let title = if heading.is_empty() {
            if let Some(chapter) = breadcrumbs.first() {
                chapter
            } else {
                ""
            }
        } else {
            &heading
        };
        // Make sure the last section is added to the index
        add_doc(
            index,
            doc_urls,
            &anchor_base,
            &heading,
            &mut id_counter,
            &section_id,
            &[title, &body, &breadcrumbs.join(" » ")],
        );
    }

    Ok(())
}
253
254fn write_to_json(index: Index, search_config: &Search, doc_urls: Vec<String>) -> Result<String> {
255    use elasticlunr::config::{SearchBool, SearchOptions, SearchOptionsField};
256    use std::collections::BTreeMap;
257
258    #[derive(Serialize)]
259    struct ResultsOptions {
260        limit_results: u32,
261        teaser_word_count: u32,
262    }
263
264    #[derive(Serialize)]
265    struct SearchindexJson {
266        /// The options used for displaying search results
267        results_options: ResultsOptions,
268        /// The searchoptions for elasticlunr.js
269        search_options: SearchOptions,
270        /// Used to lookup a document's URL from an integer document ref.
271        doc_urls: Vec<String>,
272        /// The index for elasticlunr.js
273        index: elasticlunr::Index,
274    }
275
276    let mut fields = BTreeMap::new();
277    let mut opt = SearchOptionsField::default();
278    let mut insert_boost = |key: &str, boost| {
279        opt.boost = Some(boost);
280        fields.insert(key.into(), opt);
281    };
282    insert_boost("title", search_config.boost_title);
283    insert_boost("body", search_config.boost_paragraph);
284    insert_boost("breadcrumbs", search_config.boost_hierarchy);
285
286    let search_options = SearchOptions {
287        bool: if search_config.use_boolean_and {
288            SearchBool::And
289        } else {
290            SearchBool::Or
291        },
292        expand: search_config.expand,
293        fields,
294    };
295
296    let results_options = ResultsOptions {
297        limit_results: search_config.limit_results,
298        teaser_word_count: search_config.teaser_word_count,
299    };
300
301    let json_contents = SearchindexJson {
302        results_options,
303        search_options,
304        doc_urls,
305        index,
306    };
307
308    // By converting to serde_json::Value as an intermediary, we use a
309    // BTreeMap internally and can force a stable ordering of map keys.
310    let json_contents = serde_json::to_value(&json_contents)?;
311    let json_contents = serde_json::to_string(&json_contents)?;
312
313    Ok(json_contents)
314}
315
316fn clean_html(html: &str) -> String {
317    static AMMONIA: LazyLock<ammonia::Builder<'static>> = LazyLock::new(|| {
318        let mut clean_content = HashSet::new();
319        clean_content.insert("script");
320        clean_content.insert("style");
321        let mut builder = ammonia::Builder::new();
322        builder
323            .tags(HashSet::new())
324            .tag_attributes(HashMap::new())
325            .generic_attributes(HashSet::new())
326            .link_rel(None)
327            .allowed_classes(HashMap::new())
328            .clean_content_tags(clean_content);
329        builder
330    });
331    AMMONIA.clean(html).to_string()
332}
333
334fn settings_path(ch: &Chapter) -> Option<&Path> {
335    ch.source_path.as_deref().or_else(|| ch.path.as_deref())
336}
337
338fn validate_chapter_config(
339    chapter_configs: &[(PathBuf, SearchChapterSettings)],
340    book: &Book,
341) -> Result<()> {
342    for (path, _) in chapter_configs {
343        let found = book
344            .iter()
345            .filter_map(|item| match item {
346                BookItem::Chapter(ch) if !ch.is_draft_chapter() => settings_path(ch),
347                _ => None,
348            })
349            .any(|source_path| source_path.starts_with(path));
350        if !found {
351            bail!(
352                "[output.html.search.chapter] key `{}` does not match any chapter paths",
353                path.display()
354            );
355        }
356    }
357    Ok(())
358}
359
360fn sort_search_config(
361    map: &HashMap<String, SearchChapterSettings>,
362) -> Vec<(PathBuf, SearchChapterSettings)> {
363    let mut settings: Vec<_> = map
364        .iter()
365        .map(|(key, value)| (PathBuf::from(key), value.clone()))
366        .collect();
367    // Note: This is case-sensitive, and assumes the author uses the same case
368    // as the actual filename.
369    settings.sort_by(|a, b| a.0.cmp(&b.0));
370    settings
371}
372
373fn get_chapter_settings(
374    chapter_configs: &[(PathBuf, SearchChapterSettings)],
375    source_path: &Path,
376) -> SearchChapterSettings {
377    let mut result = SearchChapterSettings::default();
378    for (path, config) in chapter_configs {
379        if source_path.starts_with(path) {
380            result.enable = config.enable.or(result.enable);
381        }
382    }
383    result
384}
385
/// Verifies that when several `[output.html.search.chapter]` keys prefix-match
/// a chapter, the most specific (longest, sorted-last) key wins.
#[test]
fn chapter_settings_priority() {
    let cfg = r#"
        [output.html.search.chapter]
        "cli/watch.md" = { enable = true }
        "cli" = { enable = false }
        "cli/inner/foo.md" = { enable = false }
        "cli/inner" = { enable = true }
        "foo" = {} # Just to make sure empty table is allowed.
    "#;
    let cfg: crate::Config = toml::from_str(cfg).unwrap();
    let html = cfg.html_config().unwrap();
    let chapter_configs = sort_search_config(&html.search.unwrap().chapter);
    // Each pair is (chapter path, expected resolved `enable` value).
    for (path, enable) in [
        ("foo.md", None),
        ("cli/watch.md", Some(true)),
        ("cli/index.md", Some(false)),
        ("cli/inner/index.md", Some(true)),
        ("cli/inner/foo.md", Some(false)),
    ] {
        assert_eq!(
            get_chapter_settings(&chapter_configs, Path::new(path)),
            SearchChapterSettings { enable }
        );
    }
}
412
#[cfg(test)]
mod tests {
    //! Unit tests for `tokenize`: splitting, normalization, and length limits.
    use super::*;

    #[test]
    fn test_tokenize_basic() {
        assert_eq!(tokenize("hello world"), vec!["hello", "world"]);
    }

    #[test]
    fn test_tokenize_with_hyphens() {
        // Hyphens split tokens just like whitespace does.
        assert_eq!(
            tokenize("hello-world test-case"),
            vec!["hello", "world", "test", "case"]
        );
    }

    #[test]
    fn test_tokenize_mixed_whitespace() {
        assert_eq!(
            tokenize("hello\tworld\ntest\r\ncase"),
            vec!["hello", "world", "test", "case"]
        );
    }

    #[test]
    fn test_tokenize_empty_string() {
        assert_eq!(tokenize(""), Vec::<String>::new());
    }

    #[test]
    fn test_tokenize_only_whitespace() {
        assert_eq!(tokenize("   \t\n  "), Vec::<String>::new());
    }

    #[test]
    fn test_tokenize_case_normalization() {
        assert_eq!(tokenize("Hello WORLD Test"), vec!["hello", "world", "test"]);
    }

    #[test]
    fn test_tokenize_trim_whitespace() {
        assert_eq!(tokenize("  hello   world  "), vec!["hello", "world"]);
    }

    #[test]
    fn test_tokenize_long_words_filtered() {
        // One byte past the limit is dropped; exactly at the limit is kept.
        let long_word = "a".repeat(MAX_WORD_LENGTH_TO_INDEX + 1);
        let short_word = "a".repeat(MAX_WORD_LENGTH_TO_INDEX);
        let input = format!("{} hello {}", long_word, short_word);
        assert_eq!(tokenize(&input), vec!["hello", &short_word]);
    }

    #[test]
    fn test_tokenize_max_length_word() {
        let max_word = "a".repeat(MAX_WORD_LENGTH_TO_INDEX);
        assert_eq!(tokenize(&max_word), vec![max_word]);
    }

    #[test]
    fn test_tokenize_special_characters() {
        // Only whitespace and `-` split tokens; punctuation is kept verbatim.
        assert_eq!(
            tokenize("hello,world.test!case?"),
            vec!["hello,world.test!case?"]
        );
    }

    #[test]
    fn test_tokenize_unicode() {
        assert_eq!(
            tokenize("café naïve résumé"),
            vec!["café", "naïve", "résumé"]
        );
    }

    // Renamed from `test_tokenize_unicode_rtl_hebre` (typo) to `…hebrew`.
    #[test]
    fn test_tokenize_unicode_rtl_hebrew() {
        assert_eq!(tokenize("שלום עולם"), vec!["שלום", "עולם"]);
    }

    #[test]
    fn test_tokenize_numbers() {
        assert_eq!(
            tokenize("test123 456-789 hello"),
            vec!["test123", "456", "789", "hello"]
        );
    }
}