markdown_that/plugins/extra/
typographer.rs

1//! Common textual replacements for dashes, ©, ™, …
2//!
//! **Note:** This plugin is most useful together with smart quotes, which are
//! not currently implemented, so it is _not_ enabled by default when using
//! `plugins::extra::add`. You will have to enable it separately:
6//!
7//! ```rust
8//! let md = &mut markdown_that::MarkdownThat::new();
9//! markdown_that::plugins::cmark::add(md);
10//! markdown_that::plugins::extra::add(md);
11//! markdown_that::plugins::extra::typographer::add(md);
12//!
13//! let html = md.parse("Hello world!.... This is the Right Way(TM) to markdown!!!!!").render();
14//! assert_eq!(html.trim(), r#"<p>Hello world!.. This is the Right Way™ to markdown!!!</p>"#);
15//! ```
16//! In summary, these are the replacements that will be made when using this:
17//!
18//! ## Typography
19//!
//! - Two or more consecutive dots (`..`, `...`, and so on) to an ellipsis (`…`),
//!   except `?...` and `!...` which become `?..` and `!..` respectively
22//! - `+-` to `±`
//! - Runs of four or more `?` / `!` are shortened to three: `????` becomes `???`
24//! - De-duplicate commas
25//! - em and en dashes: `--` to `–` and `---` to `—`
26//!
27//! ## Common symbols (case insensitive)
28//!
29//! - Copyright: `(c)` to `©`
30//! - Reserved: `(r)` to `®`
31//! - Trademark: `(tm)` to `™`
32
33use crate::parser::core::CoreRule;
34use crate::parser::inline::Text;
35use crate::{MarkdownThat, Node};
36use regex::Regex;
37use std::borrow::Cow;
38use std::sync::LazyLock;
39
40static REPLACEMENTS: LazyLock<Box<[(Regex, &'static str)]>> = LazyLock::new(|| {
41    Box::new([
42        (Regex::new(r"\+-").unwrap(), "±"),
43        (Regex::new(r"\.{2,}").unwrap(), "…"),
44        (Regex::new(r"([?!])…").unwrap(), "$1.."),
45        (Regex::new(r"([?!]){4,}").unwrap(), "$1$1$1"),
46        (Regex::new(r",{2,}").unwrap(), ","),
47        // These look a little different from the JS implementation because the
48        // regex crate doesn't support look-behind and look-ahead patterns
49        (
50            Regex::new(r"(?m)(?P<pre>^|[^-])(?P<dash>---)(?P<post>[^-]|$)").unwrap(),
51            "$pre\u{2014}$post",
52        ),
53        (
54            Regex::new(r"(?m)(?P<pre>^|\s)(?P<dash>--)(?P<post>\s|$)").unwrap(),
55            "$pre\u{2013}$post",
56        ),
57        (
58            Regex::new(r"(?m)(?P<pre>^|[^-\s])(?P<dash>--)(?P<post>[^-\s]|$)").unwrap(),
59            "$pre\u{2013}$post",
60        ),
61    ])
62});
63static SCOPED_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?i)\((c|tm|r)\)").unwrap());
64static RARE_RE: LazyLock<Regex> =
65    LazyLock::new(|| Regex::new(r"\+-|\.\.|\?\?\?\?|!!!!|,,|--").unwrap());
66
/// Maps a matched abbreviation to its typographic symbol.
///
/// `input` is expected to be exactly `(c)`, `(r)` or `(tm)`; ASCII letter case
/// is ignored. The comparison is done in place, so no lowercase copy of the
/// input is allocated.
///
/// # Panics
///
/// Panics if `input` is none of the three abbreviations.
fn replace_abbreviation(input: &str) -> &'static str {
    const SYMBOLS: [(&str, &str); 3] = [("(c)", "©"), ("(r)", "®"), ("(tm)", "™")];

    SYMBOLS
        .iter()
        .find(|(abbreviation, _)| input.eq_ignore_ascii_case(abbreviation))
        .map(|&(_, symbol)| symbol)
        .unwrap_or_else(|| unreachable!("Got invalid abbreviation '{}'", input))
}
75
/// Adds the typographer to `md` by calling `add_rule::<TypographerRule>()`.
pub fn add(md: &mut MarkdownThat) {
    md.add_rule::<TypographerRule>();
}
79
/// Core rule that applies the typographic replacements to every [`Text`] node.
pub struct TypographerRule;
81
82impl CoreRule for TypographerRule {
83    fn run(root: &mut Node, _: &MarkdownThat) {
84        root.walk_mut(|node, _| {
85            let Some(text_node) = node.cast_mut::<Text>() else {
86                return;
87            };
88
89            if SCOPED_RE.is_match(&text_node.content) {
90                text_node.content = SCOPED_RE
91                    .replace_all(&text_node.content, |caps: &regex::Captures| {
92                        replace_abbreviation(caps.get(0).unwrap().as_str())
93                    })
94                    .to_string();
95            }
96            if RARE_RE.is_match(&text_node.content) {
97                let mut result = Cow::Borrowed(text_node.content.as_str());
98
99                for (pattern, replacement) in REPLACEMENTS.iter() {
100                    if let Cow::Owned(s) = pattern.replace_all(&result, *replacement) {
101                        result = Cow::Owned(s);
102
103                        // This is a bit unfortunate but since we can't use
104                        // look-ahead and look-behind patterns in the dash
105                        // replacements, the preceding and following
106                        // characters (pre and post in the patterns) become
107                        // part of the match. So a string like "bla-- --foo"
108                        // would create two *overlapping* matches, "a-- "
109                        // and " --f". But replace_all only replaces
110                        // non-overlapping matches. So we can't do this in
111                        // one single replacement. My only consolation here
112                        // is that this won't happen very often in practice,
113                        // and that it cost us "only" one extra call.
114                        if let Cow::Owned(s) = pattern.replace_all(&result, *replacement) {
115                            result = Cow::Owned(s);
116                        }
117                    }
118                }
119
120                if let Cow::Owned(s) = result {
121                    text_node.content = s;
122                }
123            }
124        });
125    }
126}