markdown_it/plugins/extra/typographer.rs
1//! Common textual replacements for dashes, ©, ™, …
2//!
//! **Note:** Since this plugin is most useful with smart-quotes, which are not
//! currently implemented, it is _not_ enabled by default when using
//! `plugins::extra::add`. You will have to enable it separately:
6//!
7//! ```rust
8//! let md = &mut markdown_it::MarkdownIt::new();
9//! markdown_it::plugins::cmark::add(md);
10//! markdown_it::plugins::extra::add(md);
11//! markdown_it::plugins::extra::typographer::add(md);
12//!
13//! let html = md.parse("Hello world!.... This is the Right Way(TM) to markdown!!!!!").render();
14//! assert_eq!(html.trim(), r#"<p>Hello world!.. This is the Right Way™ to markdown!!!</p>"#);
15//! ```
16//! In summary, these are the replacements that will be made when using this:
17//!
18//! ## Typography
19//!
//! - Two or more consecutive dots (`..`, `...`, ...) to ellipsis (`…`),
//!   except after `?` or `!`: `?...` and `!...` become `?..` and `!..` respectively
22//! - `+-` to `±`
//! - Runs of four or more `?` or `!` are shortened to three: `????` to `???`
24//! - De-duplicate commas
25//! - em and en dashes: `--` to `–` and `---` to `—`
26//!
27//! ## Common symbols (case insensitive)
28//!
29//! - Copyright: `(c)` to `©`
30//! - Reserved: `(r)` to `®`
31//! - Trademark: `(tm)` to `™`
32
33use once_cell::sync::Lazy;
34use regex::Regex;
35use std::borrow::Cow;
36
37use crate::parser::core::CoreRule;
38use crate::parser::inline::Text;
39use crate::{MarkdownIt, Node};
40
41static REPLACEMENTS: Lazy<Box<[(Regex, &'static str)]>> = Lazy::new(|| {
42 Box::new([
43 (Regex::new(r"\+-").unwrap(), "±"),
44 (Regex::new(r"\.{2,}").unwrap(), "…"),
45 (Regex::new(r"([?!])…").unwrap(), "$1.."),
46 (Regex::new(r"([?!]){4,}").unwrap(), "$1$1$1"),
47 (Regex::new(r",{2,}").unwrap(), ","),
48 // These look a little different from the JS implementation because the
49 // regex crate doesn't support look-behind and look-ahead patterns
50 (
51 Regex::new(r"(?m)(?P<pre>^|[^-])(?P<dash>---)(?P<post>[^-]|$)").unwrap(),
52 "$pre\u{2014}$post",
53 ),
54 (
55 Regex::new(r"(?m)(?P<pre>^|\s)(?P<dash>--)(?P<post>\s|$)").unwrap(),
56 "$pre\u{2013}$post",
57 ),
58 (
59 Regex::new(r"(?m)(?P<pre>^|[^-\s])(?P<dash>--)(?P<post>[^-\s]|$)").unwrap(),
60 "$pre\u{2013}$post",
61 ),
62 ])
63});
64static SCOPED_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)\((c|tm|r)\)").unwrap());
65static RARE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\+-|\.\.|\?\?\?\?|!!!!|,,|--").unwrap());
66
/// Maps a matched abbreviation (`(c)`, `(r)` or `(tm)`, in any letter case)
/// to its typographic symbol.
///
/// # Panics
///
/// Panics if `input`, once lowercased, is none of the three abbreviations
/// above. Callers are expected to pass only text matched by `SCOPED_RE`.
fn replace_abbreviation(input: &str) -> &'static str {
    const SYMBOLS: [(&str, &str); 3] = [("(c)", "©"), ("(r)", "®"), ("(tm)", "™")];

    // Fold case once, then look the abbreviation up in the table.
    let folded = input.to_lowercase();
    SYMBOLS
        .iter()
        .find(|(abbreviation, _)| *abbreviation == folded)
        .map(|&(_, symbol)| symbol)
        .unwrap_or_else(|| unreachable!("Got invalid abbreviation '{}'", input))
}
75
76pub fn add(md: &mut MarkdownIt) {
77 md.add_rule::<TypographerRule>();
78}
79
80pub struct TypographerRule;
81
82impl CoreRule for TypographerRule {
83 fn run(root: &mut Node, _: &MarkdownIt) {
84 root.walk_mut(|node, _| {
85 let Some(text_node) = node.cast_mut::<Text>() else { return; };
86
87 if SCOPED_RE.is_match(&text_node.content) {
88 text_node.content = SCOPED_RE
89 .replace_all(&text_node.content, |caps: ®ex::Captures| {
90 replace_abbreviation(caps.get(0).unwrap().as_str())
91 })
92 .to_string();
93 }
94 if RARE_RE.is_match(&text_node.content) {
95 let mut result = Cow::Borrowed(text_node.content.as_str());
96
97 for (pattern, replacement) in REPLACEMENTS.iter() {
98 if let Cow::Owned(s) = pattern.replace_all(&result, *replacement) {
99 result = Cow::Owned(s);
100
101 // This is a bit unfortunate but since we can't use
102 // look-ahead and look-behind patterns in the dash
103 // replacements, the preceding and following
104 // characters (pre and post in the patterns) become
105 // part of the match. So a string like "bla-- --foo"
106 // would create two *overlapping* matches, "a-- "
107 // and " --f". But replace_all only replaces
108 // non-overlapping matches. So we can't do this in
109 // one single replacement. My only consolation here
110 // is that this won't happen very often in practice,
111 // and that it cost us "only" one extra call.
112 if let Cow::Owned(s) = pattern.replace_all(&result, *replacement) {
113 result = Cow::Owned(s);
114 }
115 }
116 }
117
118 if let Cow::Owned(s) = result {
119 text_node.content = s;
120 }
121 }
122 });
123 }
124}