markdown_that/plugins/extra/typographer.rs
1//! Common textual replacements for dashes, ©, ™, …
2//!
3//! **Note:** Since this plugin is most useful with smart-quotes, which is not
4//! currently implemented, this plugin is _not_ enabled by default when using
5//! `plugins::extra::add`. You will have to enable it separately:
6//!
7//! ```rust
8//! let md = &mut markdown_that::MarkdownThat::new();
9//! markdown_that::plugins::cmark::add(md);
10//! markdown_that::plugins::extra::add(md);
11//! markdown_that::plugins::extra::typographer::add(md);
12//!
13//! let html = md.parse("Hello world!.... This is the Right Way(TM) to markdown!!!!!").render();
14//! assert_eq!(html.trim(), r#"<p>Hello world!.. This is the Right Way™ to markdown!!!</p>"#);
15//! ```
16//! In summary, these are the replacements that will be made when using this:
17//!
18//! ## Typography
19//!
//! - Two or more dots (`..`, `...`, and so on) to ellipsis (`…`),
//!   except after `?` or `!`: `?...` and `!...` become `?..` and `!..` respectively
22//! - `+-` to `±`
//! - Don't repeat `?` and `!` more than 3 times: `????` becomes `???`
24//! - De-duplicate commas
25//! - em and en dashes: `--` to `–` and `---` to `—`
26//!
27//! ## Common symbols (case insensitive)
28//!
29//! - Copyright: `(c)` to `©`
30//! - Reserved: `(r)` to `®`
31//! - Trademark: `(tm)` to `™`
32
33use crate::parser::core::CoreRule;
34use crate::parser::inline::Text;
35use crate::{MarkdownThat, Node};
36use regex::Regex;
37use std::borrow::Cow;
38use std::sync::LazyLock;
39
40static REPLACEMENTS: LazyLock<Box<[(Regex, &'static str)]>> = LazyLock::new(|| {
41 Box::new([
42 (Regex::new(r"\+-").unwrap(), "±"),
43 (Regex::new(r"\.{2,}").unwrap(), "…"),
44 (Regex::new(r"([?!])…").unwrap(), "$1.."),
45 (Regex::new(r"([?!]){4,}").unwrap(), "$1$1$1"),
46 (Regex::new(r",{2,}").unwrap(), ","),
47 // These look a little different from the JS implementation because the
48 // regex crate doesn't support look-behind and look-ahead patterns
49 (
50 Regex::new(r"(?m)(?P<pre>^|[^-])(?P<dash>---)(?P<post>[^-]|$)").unwrap(),
51 "$pre\u{2014}$post",
52 ),
53 (
54 Regex::new(r"(?m)(?P<pre>^|\s)(?P<dash>--)(?P<post>\s|$)").unwrap(),
55 "$pre\u{2013}$post",
56 ),
57 (
58 Regex::new(r"(?m)(?P<pre>^|[^-\s])(?P<dash>--)(?P<post>[^-\s]|$)").unwrap(),
59 "$pre\u{2013}$post",
60 ),
61 ])
62});
63static SCOPED_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?i)\((c|tm|r)\)").unwrap());
64static RARE_RE: LazyLock<Regex> =
65 LazyLock::new(|| Regex::new(r"\+-|\.\.|\?\?\?\?|!!!!|,,|--").unwrap());
66
/// Maps a matched abbreviation to its symbol, ignoring ASCII case.
///
/// `input` is expected to be the full match of `SCOPED_RE`, i.e. one of
/// `(c)`, `(r)` or `(tm)` in any letter case.
///
/// # Panics
///
/// Panics if `input` is not one of the three abbreviations above; the caller
/// only passes text that `SCOPED_RE` matched, so this indicates a bug.
fn replace_abbreviation(input: &str) -> &'static str {
    // Compare in place instead of allocating a lowercased copy per match.
    if input.eq_ignore_ascii_case("(c)") {
        "©"
    } else if input.eq_ignore_ascii_case("(r)") {
        "®"
    } else if input.eq_ignore_ascii_case("(tm)") {
        "™"
    } else {
        unreachable!("Got invalid abbreviation '{}'", input)
    }
}
75
76pub fn add(md: &mut MarkdownThat) {
77 md.add_rule::<TypographerRule>();
78}
79
80pub struct TypographerRule;
81
82impl CoreRule for TypographerRule {
83 fn run(root: &mut Node, _: &MarkdownThat) {
84 root.walk_mut(|node, _| {
85 let Some(text_node) = node.cast_mut::<Text>() else {
86 return;
87 };
88
89 if SCOPED_RE.is_match(&text_node.content) {
90 text_node.content = SCOPED_RE
91 .replace_all(&text_node.content, |caps: ®ex::Captures| {
92 replace_abbreviation(caps.get(0).unwrap().as_str())
93 })
94 .to_string();
95 }
96 if RARE_RE.is_match(&text_node.content) {
97 let mut result = Cow::Borrowed(text_node.content.as_str());
98
99 for (pattern, replacement) in REPLACEMENTS.iter() {
100 if let Cow::Owned(s) = pattern.replace_all(&result, *replacement) {
101 result = Cow::Owned(s);
102
103 // This is a bit unfortunate but since we can't use
104 // look-ahead and look-behind patterns in the dash
105 // replacements, the preceding and following
106 // characters (pre and post in the patterns) become
107 // part of the match. So a string like "bla-- --foo"
108 // would create two *overlapping* matches, "a-- "
109 // and " --f". But replace_all only replaces
110 // non-overlapping matches. So we can't do this in
111 // one single replacement. My only consolation here
112 // is that this won't happen very often in practice,
113 // and that it cost us "only" one extra call.
114 if let Cow::Owned(s) = pattern.replace_all(&result, *replacement) {
115 result = Cow::Owned(s);
116 }
117 }
118 }
119
120 if let Cow::Owned(s) = result {
121 text_node.content = s;
122 }
123 }
124 });
125 }
126}