mdbook/utils/
mod.rs

1#![allow(missing_docs)] // FIXME: Document this
2
3pub mod fs;
4mod string;
5pub(crate) mod toml_ext;
6use crate::errors::Error;
7use log::error;
8use once_cell::sync::Lazy;
9use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, Options, Parser, Tag};
10use regex::Regex;
11
12use std::borrow::Cow;
13use std::collections::HashMap;
14use std::fmt::Write;
15use std::path::Path;
16
17pub use self::string::{
18    take_anchored_lines, take_lines, take_rustdoc_include_anchored_lines,
19    take_rustdoc_include_lines,
20};
21
22/// Replaces multiple consecutive whitespace characters with a single space character.
23pub fn collapse_whitespace(text: &str) -> Cow<'_, str> {
24    static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s\s+").unwrap());
25    RE.replace_all(text, " ")
26}
27
/// Turns arbitrary text into a string usable as an HTML element ID.
///
/// Alphanumeric characters, `_` and `-` are kept (ASCII letters are
/// lowercased, everything else is kept as-is), each whitespace character
/// becomes a `-`, and all remaining characters are dropped.
pub fn normalize_id(content: &str) -> String {
    let mut id = String::new();
    for ch in content.chars() {
        match ch {
            '_' | '-' => id.push(ch),
            c if c.is_alphanumeric() => id.push(c.to_ascii_lowercase()),
            c if c.is_whitespace() => id.push('-'),
            // Punctuation, symbols, emoji, ... are simply left out.
            _ => {}
        }
    }
    id
}
44
45/// Generate an ID for use with anchors which is derived from a "normalised"
46/// string.
47// This function should be made private when the deprecation expires.
48#[deprecated(since = "0.4.16", note = "use unique_id_from_content instead")]
49pub fn id_from_content(content: &str) -> String {
50    let mut content = content.to_string();
51
52    // Skip any tags or html-encoded stuff
53    static HTML: Lazy<Regex> = Lazy::new(|| Regex::new(r"(<.*?>)").unwrap());
54    content = HTML.replace_all(&content, "").into();
55    const REPL_SUB: &[&str] = &["&lt;", "&gt;", "&amp;", "&#39;", "&quot;"];
56    for sub in REPL_SUB {
57        content = content.replace(sub, "");
58    }
59
60    // Remove spaces and hashes indicating a header
61    let trimmed = content.trim().trim_start_matches('#').trim();
62    normalize_id(trimmed)
63}
64
65/// Generate an ID for use with anchors which is derived from a "normalised"
66/// string.
67///
68/// Each ID returned will be unique, if the same `id_counter` is provided on
69/// each call.
70pub fn unique_id_from_content(content: &str, id_counter: &mut HashMap<String, usize>) -> String {
71    let id = {
72        #[allow(deprecated)]
73        id_from_content(content)
74    };
75
76    // If we have headers with the same normalized id, append an incrementing counter
77    let id_count = id_counter.entry(id.clone()).or_insert(0);
78    let unique_id = match *id_count {
79        0 => id,
80        id_count => format!("{}-{}", id, id_count),
81    };
82    *id_count += 1;
83    unique_id
84}
85
86/// Fix links to the correct location.
87///
88/// This adjusts links, such as turning `.md` extensions to `.html`.
89///
90/// `path` is the path to the page being rendered relative to the root of the
91/// book. This is used for the `print.html` page so that links on the print
92/// page go to the original location. Normal page rendering sets `path` to
93/// None. Ideally, print page links would link to anchors on the print page,
94/// but that is very difficult.
95fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> {
96    static SCHEME_LINK: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[a-z][a-z0-9+.-]*:").unwrap());
97    static MD_LINK: Lazy<Regex> =
98        Lazy::new(|| Regex::new(r"(?P<link>.*)\.md(?P<anchor>#.*)?").unwrap());
99
100    fn fix<'a>(dest: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> {
101        if dest.starts_with('#') {
102            // Fragment-only link.
103            if let Some(path) = path {
104                let mut base = path.display().to_string();
105                if base.ends_with(".md") {
106                    base.replace_range(base.len() - 3.., ".html");
107                }
108                return format!("{}{}", base, dest).into();
109            } else {
110                return dest;
111            }
112        }
113        // Don't modify links with schemes like `https`.
114        if !SCHEME_LINK.is_match(&dest) {
115            // This is a relative link, adjust it as necessary.
116            let mut fixed_link = String::new();
117            if let Some(path) = path {
118                let base = path
119                    .parent()
120                    .expect("path can't be empty")
121                    .to_str()
122                    .expect("utf-8 paths only");
123                if !base.is_empty() {
124                    write!(fixed_link, "{}/", base).unwrap();
125                }
126            }
127
128            if let Some(caps) = MD_LINK.captures(&dest) {
129                fixed_link.push_str(&caps["link"]);
130                fixed_link.push_str(".html");
131                if let Some(anchor) = caps.name("anchor") {
132                    fixed_link.push_str(anchor.as_str());
133                }
134            } else {
135                fixed_link.push_str(&dest);
136            };
137            return CowStr::from(fixed_link);
138        }
139        dest
140    }
141
142    fn fix_html<'a>(html: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> {
143        // This is a terrible hack, but should be reasonably reliable. Nobody
144        // should ever parse a tag with a regex. However, there isn't anything
145        // in Rust that I know of that is suitable for handling partial html
146        // fragments like those generated by pulldown_cmark.
147        //
148        // There are dozens of HTML tags/attributes that contain paths, so
149        // feel free to add more tags if desired; these are the only ones I
150        // care about right now.
151        static HTML_LINK: Lazy<Regex> =
152            Lazy::new(|| Regex::new(r#"(<(?:a|img) [^>]*?(?:src|href)=")([^"]+?)""#).unwrap());
153
154        HTML_LINK
155            .replace_all(&html, |caps: &regex::Captures<'_>| {
156                let fixed = fix(caps[2].into(), path);
157                format!("{}{}\"", &caps[1], fixed)
158            })
159            .into_owned()
160            .into()
161    }
162
163    match event {
164        Event::Start(Tag::Link(link_type, dest, title)) => {
165            Event::Start(Tag::Link(link_type, fix(dest, path), title))
166        }
167        Event::Start(Tag::Image(link_type, dest, title)) => {
168            Event::Start(Tag::Image(link_type, fix(dest, path), title))
169        }
170        Event::Html(html) => Event::Html(fix_html(html, path)),
171        _ => event,
172    }
173}
174
175/// Wrapper around the pulldown-cmark parser for rendering markdown to HTML.
176pub fn render_markdown(text: &str, curly_quotes: bool) -> String {
177    render_markdown_with_path(text, curly_quotes, None)
178}
179
180pub fn new_cmark_parser(text: &str, curly_quotes: bool) -> Parser<'_, '_> {
181    let mut opts = Options::empty();
182    opts.insert(Options::ENABLE_TABLES);
183    opts.insert(Options::ENABLE_FOOTNOTES);
184    opts.insert(Options::ENABLE_STRIKETHROUGH);
185    opts.insert(Options::ENABLE_TASKLISTS);
186    if curly_quotes {
187        opts.insert(Options::ENABLE_SMART_PUNCTUATION);
188    }
189    Parser::new_ext(text, opts)
190}
191
192pub fn render_markdown_with_path(text: &str, curly_quotes: bool, path: Option<&Path>) -> String {
193    let mut s = String::with_capacity(text.len() * 3 / 2);
194    let p = new_cmark_parser(text, curly_quotes);
195    let events = p
196        .map(clean_codeblock_headers)
197        .map(|event| adjust_links(event, path))
198        .flat_map(|event| {
199            let (a, b) = wrap_tables(event);
200            a.into_iter().chain(b)
201        });
202
203    html::push_html(&mut s, events);
204    s
205}
206
207/// Wraps tables in a `.table-wrapper` class to apply overflow-x rules to.
208fn wrap_tables(event: Event<'_>) -> (Option<Event<'_>>, Option<Event<'_>>) {
209    match event {
210        Event::Start(Tag::Table(_)) => (
211            Some(Event::Html(r#"<div class="table-wrapper">"#.into())),
212            Some(event),
213        ),
214        Event::End(Tag::Table(_)) => (Some(event), Some(Event::Html(r#"</div>"#.into()))),
215        _ => (Some(event), None),
216    }
217}
218
219fn clean_codeblock_headers(event: Event<'_>) -> Event<'_> {
220    match event {
221        Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(ref info))) => {
222            let info: String = info
223                .chars()
224                .map(|x| match x {
225                    ' ' | '\t' => ',',
226                    _ => x,
227                })
228                .filter(|ch| !ch.is_whitespace())
229                .collect();
230
231            Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(CowStr::from(info))))
232        }
233        _ => event,
234    }
235}
236
237/// Prints a "backtrace" of some `Error`.
238pub fn log_backtrace(e: &Error) {
239    error!("Error: {}", e);
240
241    for cause in e.chain().skip(1) {
242        error!("\tCaused By: {}", cause);
243    }
244}
245
/// Escapes angle brackets for use in HTML: `<` becomes `&lt;` and `>`
/// becomes `&gt;`. Every other character is copied through unchanged.
pub(crate) fn bracket_escape(mut s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    loop {
        match s.find(|c: char| c == '<' || c == '>') {
            Some(at) => {
                let (plain, rest) = s.split_at(at);
                escaped.push_str(plain);
                // `rest` starts with the bracket that was just found.
                escaped.push_str(if rest.starts_with('<') { "&lt;" } else { "&gt;" });
                // Both brackets are one byte long, so this stays on a char boundary.
                s = &rest[1..];
            }
            None => {
                escaped.push_str(s);
                return escaped;
            }
        }
    }
}
261
// Unit tests for the markdown rendering, ID generation and bracket escaping
// helpers defined in this module.
262#[cfg(test)]
263mod tests {
264    use super::bracket_escape;
265
    // Tests for `render_markdown`: link adjustment, table wrapping, quote
    // handling and the class generated from code block info strings.
266    mod render_markdown {
267        use super::super::render_markdown;
268
        // Links that carry a scheme (here `https:`) must come out untouched.
269        #[test]
270        fn preserves_external_links() {
271            assert_eq!(
272                render_markdown("[example](https://www.rust-lang.org/)", false),
273                "<p><a href=\"https://www.rust-lang.org/\">example</a></p>\n"
274            );
275        }
276
        // Relative `.md` links are rewritten to `.html`, keeping any anchor.
277        #[test]
278        fn it_can_adjust_markdown_links() {
279            assert_eq!(
280                render_markdown("[example](example.md)", false),
281                "<p><a href=\"example.html\">example</a></p>\n"
282            );
283            assert_eq!(
284                render_markdown("[example_anchor](example.md#anchor)", false),
285                "<p><a href=\"example.html#anchor\">example_anchor</a></p>\n"
286            );
287
288            // this anchor contains 'md' inside of it
289            assert_eq!(
290                render_markdown("[phantom data](foo.html#phantomdata)", false),
291                "<p><a href=\"foo.html#phantomdata\">phantom data</a></p>\n"
292            );
293        }
294
        // A table must be surrounded by `<div class="table-wrapper">…</div>`.
295        #[test]
296        fn it_can_wrap_tables() {
297            let src = r#"
298| Original        | Punycode        | Punycode + Encoding |
299|-----------------|-----------------|---------------------|
300| føø             | f-5gaa          | f_5gaa              |
301"#;
302            let out = r#"
303<div class="table-wrapper"><table><thead><tr><th>Original</th><th>Punycode</th><th>Punycode + Encoding</th></tr></thead><tbody>
304<tr><td>føø</td><td>f-5gaa</td><td>f_5gaa</td></tr>
305</tbody></table>
306</div>
307"#.trim();
308            assert_eq!(render_markdown(src, false), out);
309        }
310
        // With `curly_quotes` disabled, straight quotes stay straight.
311        #[test]
312        fn it_can_keep_quotes_straight() {
313            assert_eq!(render_markdown("'one'", false), "<p>'one'</p>\n");
314        }
315
        // With `curly_quotes` enabled, quotes in text become curly while
        // quotes inside code blocks and inline code are left alone.
316        #[test]
317        fn it_can_make_quotes_curly_except_when_they_are_in_code() {
318            let input = r#"
319'one'
320```
321'two'
322```
323`'three'` 'four'"#;
324            let expected = r#"<p>‘one’</p>
325<pre><code>'two'
326</code></pre>
327<p><code>'three'</code> ‘four’</p>
328"#;
329            assert_eq!(render_markdown(input, true), expected);
330        }
331
        // Only the code block info string is cleaned; surrounding text and
        // the code itself must be rendered as written.
332        #[test]
333        fn whitespace_outside_of_codeblock_header_is_preserved() {
334            let input = r#"
335some text with spaces
336```rust
337fn main() {
338// code inside is unchanged
339}
340```
341more text with spaces
342"#;
343
344            let expected = r#"<p>some text with spaces</p>
345<pre><code class="language-rust">fn main() {
346// code inside is unchanged
347}
348</code></pre>
349<p>more text with spaces</p>
350"#;
351            assert_eq!(render_markdown(input, false), expected);
352            assert_eq!(render_markdown(input, true), expected);
353        }
354
        // NOTE(review): despite the test name, the expected class keeps the
        // properties comma-delimited.
355        #[test]
356        fn rust_code_block_properties_are_passed_as_space_delimited_class() {
357            let input = r#"
358```rust,no_run,should_panic,property_3
359```
360"#;
361
362            let expected = r#"<pre><code class="language-rust,no_run,should_panic,property_3"></code></pre>
363"#;
364            assert_eq!(render_markdown(input, false), expected);
365            assert_eq!(render_markdown(input, true), expected);
366        }
367
        // Every space in the info string is expected to show up as a comma
        // in the generated class.
368        #[test]
369        fn rust_code_block_properties_with_whitespace_are_passed_as_space_delimited_class() {
370            let input = r#"
371```rust,    no_run,,,should_panic , ,property_3
372```
373"#;
374
375            let expected = r#"<pre><code class="language-rust,,,,,no_run,,,should_panic,,,,property_3"></code></pre>
376"#;
377            assert_eq!(render_markdown(input, false), expected);
378            assert_eq!(render_markdown(input, true), expected);
379        }
380
        // A bare `rust` info string yields exactly `language-rust`.
381        #[test]
382        fn rust_code_block_without_properties_has_proper_html_class() {
383            let input = r#"
384```rust
385```
386"#;
387
388            let expected = r#"<pre><code class="language-rust"></code></pre>
389"#;
390            assert_eq!(render_markdown(input, false), expected);
391            assert_eq!(render_markdown(input, true), expected);
392
            // NOTE(review): as it appears here this second input is identical
            // to the first one; confirm whether it was meant to differ.
393            let input = r#"
394```rust
395```
396"#;
397            assert_eq!(render_markdown(input, false), expected);
398            assert_eq!(render_markdown(input, true), expected);
399        }
400    }
401
    // Tests for the deprecated `id_from_content`; the lint is silenced so
    // that calling it here does not warn.
402    #[allow(deprecated)]
403    mod id_from_content {
404        use super::super::id_from_content;
405
        // Header markers, markdown punctuation and HTML tags do not end up
        // in the generated anchor.
406        #[test]
407        fn it_generates_anchors() {
408            assert_eq!(
409                id_from_content("## Method-call expressions"),
410                "method-call-expressions"
411            );
412            assert_eq!(id_from_content("## **Bold** title"), "bold-title");
413            assert_eq!(id_from_content("## `Code` title"), "code-title");
414            assert_eq!(
415                id_from_content("## title <span dir=rtl>foo</span>"),
416                "title-foo"
417            );
418        }
419
        // Non-ASCII letters are kept, and only ASCII letters are lowercased,
        // which is why `Ü` stays uppercase.
420        #[test]
421        fn it_generates_anchors_from_non_ascii_initial() {
422            assert_eq!(
423                id_from_content("## `--passes`: add more rustdoc passes"),
424                "--passes-add-more-rustdoc-passes"
425            );
426            assert_eq!(
427                id_from_content("## 中文標題 CJK title"),
428                "中文標題-cjk-title"
429            );
430            assert_eq!(id_from_content("## Über"), "Über");
431        }
432    }
433
    // Tests for `normalize_id` and `unique_id_from_content`.
434    mod html_munging {
435        use super::super::{normalize_id, unique_id_from_content};
436
        // Whitespace turns into `-`, symbols and emoji are dropped, and
        // alphanumeric characters of any script are kept.
437        #[test]
438        fn it_normalizes_ids() {
439            assert_eq!(
440                normalize_id("`--passes`: add more rustdoc passes"),
441                "--passes-add-more-rustdoc-passes"
442            );
443            assert_eq!(
444                normalize_id("Method-call 🐙 expressions \u{1f47c}"),
445                "method-call--expressions-"
446            );
447            assert_eq!(normalize_id("_-_12345"), "_-_12345");
448            assert_eq!(normalize_id("12345"), "12345");
449            assert_eq!(normalize_id("中文"), "中文");
450            assert_eq!(normalize_id("にほんご"), "にほんご");
451            assert_eq!(normalize_id("한국어"), "한국어");
452            assert_eq!(normalize_id(""), "");
453        }
454
        // Uniqueness depends on passing the same counter map to every call.
455        #[test]
456        fn it_generates_unique_ids_from_content() {
457            // Same id if not given shared state
458            assert_eq!(
459                unique_id_from_content("## 中文標題 CJK title", &mut Default::default()),
460                "中文標題-cjk-title"
461            );
462            assert_eq!(
463                unique_id_from_content("## 中文標題 CJK title", &mut Default::default()),
464                "中文標題-cjk-title"
465            );
466
467            // Different id if given shared state
468            let mut id_counter = Default::default();
469            assert_eq!(unique_id_from_content("## Über", &mut id_counter), "Über");
470            assert_eq!(
471                unique_id_from_content("## 中文標題 CJK title", &mut id_counter),
472                "中文標題-cjk-title"
473            );
474            assert_eq!(unique_id_from_content("## Über", &mut id_counter), "Über-1");
475            assert_eq!(unique_id_from_content("## Über", &mut id_counter), "Über-2");
476        }
477    }
478
    // `<` and `>` are replaced by their HTML entities; other text is kept.
479    #[test]
480    fn escaped_brackets() {
481        assert_eq!(bracket_escape(""), "");
482        assert_eq!(bracket_escape("<"), "&lt;");
483        assert_eq!(bracket_escape(">"), "&gt;");
484        assert_eq!(bracket_escape("<>"), "&lt;&gt;");
485        assert_eq!(bracket_escape("<test>"), "&lt;test&gt;");
486        assert_eq!(bracket_escape("a<test>b"), "a&lt;test&gt;b");
487    }
488}