markdown_it/plugins/extra/
typographer.rs

1//! Common textual replacements for dashes, ©, ™, …
2//!
3//! **Note:** Since this plugin is most useful with smart-quotes, which is not
4//! currently implemented, this plugin is _not_ enabled by default when using
5//! `plugins::extra::add`. You will have to enable it separately:
6//!
7//! ```rust
8//! let md = &mut markdown_it::MarkdownIt::new();
9//! markdown_it::plugins::cmark::add(md);
10//! markdown_it::plugins::extra::add(md);
11//! markdown_it::plugins::extra::typographer::add(md);
12//!
13//! let html = md.parse("Hello world!.... This is the Right Way(TM) to markdown!!!!!").render();
14//! assert_eq!(html.trim(), r#"<p>Hello world!.. This is the Right Way™ to markdown!!!</p>"#);
15//! ```
16//! In summary, these are the replacements that will be made when using this:
17//!
18//! ## Typography
19//!
//! - Two or more consecutive dots (e.g. `..` or `...`) to ellipsis (`…`),
//!   except after `?` or `!`: `?...` and `!...` become `?..` and `!..` respectively
22//! - `+-` to `±`
//! - Don't repeat `?` and `!` more than 3 times: `????` to `???`
24//! - De-duplicate commas
25//! - em and en dashes: `--` to `–` and `---` to `—`
26//!
27//! ## Common symbols (case insensitive)
28//!
29//! - Copyright: `(c)` to `©`
//! - Registered trademark: `(r)` to `®`
31//! - Trademark: `(tm)` to `™`
32
33use once_cell::sync::Lazy;
34use regex::Regex;
35use std::borrow::Cow;
36
37use crate::parser::core::CoreRule;
38use crate::parser::inline::Text;
39use crate::{MarkdownIt, Node};
40
41static REPLACEMENTS: Lazy<Box<[(Regex, &'static str)]>> = Lazy::new(|| {
42    Box::new([
43        (Regex::new(r"\+-").unwrap(), "±"),
44        (Regex::new(r"\.{2,}").unwrap(), "…"),
45        (Regex::new(r"([?!])…").unwrap(), "$1.."),
46        (Regex::new(r"([?!]){4,}").unwrap(), "$1$1$1"),
47        (Regex::new(r",{2,}").unwrap(), ","),
48        // These look a little different from the JS implementation because the
49        // regex crate doesn't support look-behind and look-ahead patterns
50        (
51            Regex::new(r"(?m)(?P<pre>^|[^-])(?P<dash>---)(?P<post>[^-]|$)").unwrap(),
52            "$pre\u{2014}$post",
53        ),
54        (
55            Regex::new(r"(?m)(?P<pre>^|\s)(?P<dash>--)(?P<post>\s|$)").unwrap(),
56            "$pre\u{2013}$post",
57        ),
58        (
59            Regex::new(r"(?m)(?P<pre>^|[^-\s])(?P<dash>--)(?P<post>[^-\s]|$)").unwrap(),
60            "$pre\u{2013}$post",
61        ),
62    ])
63});
64static SCOPED_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)\((c|tm|r)\)").unwrap());
65static RARE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\+-|\.\.|\?\?\?\?|!!!!|,,|--").unwrap());
66
/// Maps a matched abbreviation to its typographic symbol.
///
/// `input` is the full match including the parentheses; ASCII case is
/// ignored, so `(tm)`, `(TM)` and `(Tm)` all yield `™`. The comparison is
/// done in place, without allocating a lower-cased copy of the input.
///
/// # Panics
///
/// Panics if `input` is not one of `(c)`, `(r)` or `(tm)`. Callers are
/// expected to pass only text that was matched by the abbreviation regex.
fn replace_abbreviation(input: &str) -> &'static str {
    match input {
        s if s.eq_ignore_ascii_case("(c)") => "©",
        s if s.eq_ignore_ascii_case("(r)") => "®",
        s if s.eq_ignore_ascii_case("(tm)") => "™",
        _ => unreachable!("Got invalid abbreviation '{}'", input),
    }
}
75
/// Registers [`TypographerRule`] on the given parser.
///
/// This is the entry point for enabling the typographer plugin; see the
/// module-level documentation for the replacements it performs.
pub fn add(md: &mut MarkdownIt) {
    md.add_rule::<TypographerRule>();
}
79
/// Core rule that applies the typographic replacements to text nodes.
pub struct TypographerRule;
81
82impl CoreRule for TypographerRule {
83    fn run(root: &mut Node, _: &MarkdownIt) {
84        root.walk_mut(|node, _| {
85            let Some(text_node) = node.cast_mut::<Text>() else { return; };
86
87            if SCOPED_RE.is_match(&text_node.content) {
88                text_node.content = SCOPED_RE
89                    .replace_all(&text_node.content, |caps: &regex::Captures| {
90                        replace_abbreviation(caps.get(0).unwrap().as_str())
91                    })
92                    .to_string();
93            }
94            if RARE_RE.is_match(&text_node.content) {
95                let mut result = Cow::Borrowed(text_node.content.as_str());
96
97                for (pattern, replacement) in REPLACEMENTS.iter() {
98                    if let Cow::Owned(s) = pattern.replace_all(&result, *replacement) {
99                        result = Cow::Owned(s);
100
101                        // This is a bit unfortunate but since we can't use
102                        // look-ahead and look-behind patterns in the dash
103                        // replacements, the preceding and following
104                        // characters (pre and post in the patterns) become
105                        // part of the match. So a string like "bla-- --foo"
106                        // would create two *overlapping* matches, "a-- "
107                        // and " --f". But replace_all only replaces
108                        // non-overlapping matches. So we can't do this in
109                        // one single replacement. My only consolation here
110                        // is that this won't happen very often in practice,
111                        // and that it cost us "only" one extra call.
112                        if let Cow::Owned(s) = pattern.replace_all(&result, *replacement) {
113                            result = Cow::Owned(s);
114                        }
115                    }
116                }
117
118                if let Cow::Owned(s) = result {
119                    text_node.content = s;
120                }
121            }
122        });
123    }
124}