markdown_it/plugins/extra/smartquotes.rs
1//! Replaces `"` and `'` quotes with "nicer" ones like `‘`, `’`, `“`, `”`, or
2//! with `’` for words like "isn't".
3//!
4//! This currently only supports single character quotes, which is a limitation
5//! of the Rust implementation due to the use of `const` generics.
6//!
7//! ## Implementation notes
8//!
9//! The main obstacle to implementing this was the fact that the document is
10//! necessarily represented as a tree of nodes.
//! Each node is thus necessarily referenced by its parents, which means that at
//! any given moment we cannot hold a mutable reference to a node if any other
13//! part of the code holds a reference to the document. At least that's my
14//! understanding of the problem.
15//! The smartquotes algorithm from the JS library makes heavy use of iteration
16//! backwards and forwards through a flat list of tokens. This isn't really
17//! possible in the Rust implementation. Building a flat representation of all
18//! `Node` objects is easy, but holding that list precludes us from executing a
19//! `root.walk_mut` call at the same time.
20//! On top of that, while the smartquotes algorithm iterates linearly over all
21//! nodes/tokens, looking at a specific token with index `j` can trigger
22//! replacements in any of the tokens with `0` to `j - 1`.
23//!
24//! The solution proposed here is to first compute all the replacement
25//! operations on a read-only flat view of the document, and _then_ to perform
26//! all replacements in a single call to `root.walk_mut`.
27use std::collections::HashMap;
28
29use crate::common::utils::is_punct_char;
30use crate::parser::core::CoreRule;
31use crate::parser::inline::Text;
32use crate::plugins::cmark::block::paragraph::Paragraph;
33use crate::plugins::cmark::inline::newline::{Hardbreak, Softbreak};
34use crate::plugins::html::html_inline::HtmlInline;
35use crate::{MarkdownIt, Node};
36
/// Typographic apostrophe (U+2019) used for a `'` that is not part of a pair.
const APOSTROPHE: char = '’';
/// The plain ASCII single quote searched for in text nodes.
const SINGLE_QUOTE: char = '\u{27}';
/// The plain ASCII double quote searched for in text nodes.
const DOUBLE_QUOTE: char = '\u{22}';
/// Stand-in character reported at line breaks and at the edges of the document.
const SPACE: char = '\u{20}';
41
42/// Add smartquotes with the "classic" quote set of `‘`, `’`, `“`, and `”`.
43pub fn add(md: &mut MarkdownIt) {
44 add_with::<'‘', '’', '“', '”'>(md);
45}
46
47pub fn add_with<
48 const OPEN_SINGLE_QUOTE: char,
49 const CLOSE_SINGLE_QUOTE: char,
50 const OPEN_DOUBLE_QUOTE: char,
51 const CLOSE_DOUBLE_QUOTE: char,
52>(
53 md: &mut MarkdownIt,
54) {
55 md.add_rule::<SmartQuotesRule<
56 OPEN_SINGLE_QUOTE,
57 CLOSE_SINGLE_QUOTE,
58 OPEN_DOUBLE_QUOTE,
59 CLOSE_DOUBLE_QUOTE>>();
60}
61
/// Simplified node type that only holds the information this rule needs.
///
/// To replace quotes we iterate forwards and backwards over the nodes of the
/// document tree. `Node` offers no efficient way to do that, and we only care
/// about a small part of each node anyway. Values of this enum form a flat
/// view of the document with exactly one entry per visited node, so that an
/// entry's index equals the node's position in the tree traversal.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum FlatToken<'a> {
    /// A node that terminates the search for neighbouring characters.
    LineBreak,
    /// A text node: the only kind of node in which quotes are replaced.
    Text {
        /// The node's text, borrowed from the document tree.
        content: &'a str,
        /// Depth reported by the tree walk for this node.
        nesting_level: u32,
    },
    /// Inline HTML: never modified, but its characters count as neighbours.
    HtmlInline {
        /// The raw HTML, borrowed from the document tree.
        content: &'a str,
    },
    /// Filler for every other node, keeping indexes aligned with the walk.
    Irrelevant,
}
81
/// Which of the two ASCII quote characters was encountered.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum QuoteType {
    /// The `'` character.
    Single,
    /// The `"` character.
    Double,
}
88
89/// Holds information about quotes we have encountered thus far.
90///
91/// These quotes may or may not be used to close a pair further down the line.
92/// The different fields thus hold all the information we need to a) decide
93/// whether or not to match them up with another quote we encounter, and b) to
94/// perform the correct replacement, should be indeed use this quote to close a
95/// pair.
96struct QuoteMarker {
97 /// The iteration index of the node in which this quote was found.
98 ///
99 /// This is the index at which this quote's `Node` appears in a pre-order
100 /// depth-first walk of the document tree. Since we can only _modify_ nodes
101 /// during a walk, we rely on this index to tell us which nodes to modify.
102 walk_index: usize,
103 /// The position of the quote within node's `content`
104 quote_position: usize,
105 /// Whether this is a single or a double quote
106 quote_type: QuoteType,
107 /// Nesting level of the containing token
108 ///
109 /// This is the nesting of the containing `Node` within the document tree.
110 /// It is used to decide which quotes can be matched up.
111 level: u32,
112}
113
/// Description of a single quote replacement to be executed.
///
/// Replacements are computed in a first pass that treats the whole document
/// tree as read-only; only afterwards are they applied. This struct carries
/// what is needed to swap one quote character during the mutable walk.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ReplacementOp {
    /// Index, in tree-walk order, of the node whose text is to be changed.
    walk_index: usize,
    /// Position (counted in characters) of the quote inside that node's text.
    quote_position: usize,
    /// The character to put in place of the original quote.
    quote: char,
}
125
/// Core rule that replaces plain ASCII quotes in text nodes.
///
/// The struct carries no data; the replacement characters are supplied as
/// const generic parameters: the opening and closing replacement for `'`,
/// followed by the opening and closing replacement for `"`.
#[derive(Debug, Clone, Copy)]
pub struct SmartQuotesRule<
    const OPEN_SINGLE_QUOTE: char,
    const CLOSE_SINGLE_QUOTE: char,
    const OPEN_DOUBLE_QUOTE: char,
    const CLOSE_DOUBLE_QUOTE: char,
>;
132
133impl<
134 const OPEN_SINGLE_QUOTE: char,
135 const CLOSE_SINGLE_QUOTE: char,
136 const OPEN_DOUBLE_QUOTE: char,
137 const CLOSE_DOUBLE_QUOTE: char,
138 > CoreRule
139 for SmartQuotesRule<
140 OPEN_SINGLE_QUOTE,
141 CLOSE_SINGLE_QUOTE,
142 OPEN_DOUBLE_QUOTE,
143 CLOSE_DOUBLE_QUOTE,
144 >
145{
146 fn run(root: &mut Node, _: &MarkdownIt) {
147 let text_tokens = all_text_tokens(root);
148
149 let replacement_ops = Self::compute_replacements(text_tokens);
150
151 // now that we know what we want to replace where, we go over the nodes a _third_ time to do all the actual replacements.
152 let mut current_index: usize = 0;
153
154 root.walk_mut(|node, _| {
155 if let Some(current_replacements) = replacement_ops.get(¤t_index) {
156 let text_node = node.cast_mut::<Text>()
157 .expect("Expected to find a text node at this index because we constructed our replacements HashMap accordingly.");
158 text_node.content = execute_replacements(current_replacements, &text_node.content);
159 };
160 current_index += 1;
161 });
162 }
163}
164
165impl<
166 const OPEN_SINGLE_QUOTE: char,
167 const CLOSE_SINGLE_QUOTE: char,
168 const OPEN_DOUBLE_QUOTE: char,
169 const CLOSE_DOUBLE_QUOTE: char,
170 >
171 SmartQuotesRule<OPEN_SINGLE_QUOTE, CLOSE_SINGLE_QUOTE, OPEN_DOUBLE_QUOTE, CLOSE_DOUBLE_QUOTE>
172{
173 /// Walk the list of tokens to figure out what needs replacing where. to do
174 /// this, we need to search back and forth over the nodes to find matching
175 /// quotes across nodes. The borrow checker won't let us handle the entire
176 /// set of nodes as mutable at the same time however, so all we do here is
177 /// figure out what we _want_ to replace in which node.
178 fn compute_replacements(text_tokens: Vec<FlatToken>) -> HashMap<usize, HashMap<usize, char>> {
179 let mut quote_stack: Vec<QuoteMarker> = Vec::new();
180 let mut replacement_ops: HashMap<usize, HashMap<usize, char>> = HashMap::new();
181 for (walk_index, token) in text_tokens.iter().enumerate() {
182 if let FlatToken::Text {
183 content,
184 nesting_level,
185 } = token
186 {
187 for op in Self::replace_smartquotes(
188 content,
189 walk_index,
190 *nesting_level,
191 &text_tokens,
192 &mut quote_stack,
193 ) {
194 replacement_ops
195 .entry(op.walk_index)
196 .or_default()
197 .insert(op.quote_position, op.quote);
198 }
199 }
200 }
201 replacement_ops
202 }
203
204 /// Compute quote replacements found by looking at a single text block
205 fn replace_smartquotes(
206 content: &str,
207 walk_index: usize,
208 level: u32,
209 text_tokens: &[FlatToken],
210 quote_stack: &mut Vec<QuoteMarker>,
211 ) -> Vec<ReplacementOp> {
212 truncate_stack(quote_stack, level);
213
214 let mut result: Vec<_> = Vec::new();
215 for (quote_position, quote_type) in find_quotes(content) {
216 let last_char = find_last_char_before(text_tokens, walk_index, quote_position);
217 let next_char = find_first_char_after(text_tokens, walk_index, quote_position);
218
219 let (can_open, can_close): (bool, bool) =
220 can_open_or_close("e_type, last_char, next_char);
221
222 if !can_open && !can_close {
223 // if this is a single quote then we're in the middle of a word and
224 // assume it to be an apostrophe
225 if quote_type == QuoteType::Single {
226 result.push(ReplacementOp {
227 walk_index,
228 quote_position,
229 quote: APOSTROPHE,
230 });
231 }
232 // in any case, we're done with this quote and continue searching
233 // for more quotes in this text block
234 continue;
235 }
236
237 if can_close {
238 if let Some((opening_op, closing_op, new_stack_len)) =
239 Self::try_close(quote_stack, walk_index, level, quote_type, quote_position)
240 {
241 quote_stack.truncate(new_stack_len);
242 result.push(opening_op);
243 result.push(closing_op);
244 continue;
245 }
246 }
247
248 if can_open {
249 quote_stack.push(QuoteMarker {
250 walk_index,
251 quote_position,
252 quote_type,
253 level,
254 });
255 } else if can_close && quote_type == QuoteType::Single {
256 result.push(ReplacementOp {
257 walk_index,
258 quote_position,
259 quote: APOSTROPHE,
260 });
261 }
262 }
263 result
264 }
265
266 /// Try to find a matching opening quote to the given one.
267 ///
268 /// If a match is found, returns `Some` with two `ReplacementOp`s to be
269 /// added to the result, and with the resulting length of the `quote_stack`.
270 fn try_close(
271 quote_stack: &[QuoteMarker],
272 walk_index: usize,
273 level: u32,
274 quote_type: QuoteType,
275 quote_position: usize,
276 ) -> Option<(ReplacementOp, ReplacementOp, usize)> {
277 for (j, other_item) in quote_stack.iter().enumerate().rev() {
278 if other_item.level < level {
279 return None;
280 }
281 if other_item.quote_type == quote_type && other_item.level == level {
282 return Some((
283 ReplacementOp {
284 walk_index: other_item.walk_index,
285 quote_position: other_item.quote_position,
286 quote: if quote_type == QuoteType::Single {
287 OPEN_SINGLE_QUOTE
288 } else {
289 OPEN_DOUBLE_QUOTE
290 },
291 },
292 ReplacementOp {
293 walk_index,
294 quote_position,
295 quote: if quote_type == QuoteType::Single {
296 CLOSE_SINGLE_QUOTE
297 } else {
298 CLOSE_DOUBLE_QUOTE
299 },
300 },
301 j,
302 ));
303 }
304 }
305 None
306 }
307}
308
309/// Produces a simplified flat list of all tokens, with the necessary
310/// information to do smart quote replacement.
311///
312/// This handles inline html and inline code like JS version seems to do.
313/// This list is a work-around for the fact that we can't build a flat list of
314/// all nodes for iteration back and forth, and at the same time do a mutable
315/// walk on the document tree.
316///
317/// Returns a `Vec<FlatToken<'a>>` where `<'a>` is the same lifetime as `root`.
318/// This simply reflects the fact that the `content: &str` entries of the
319/// `FlatToken` structs reference the same memory as `root`'s children.
320/// Every entry in the `Vec` will produce an entry in the result, meaning that
321/// the index of a token in the resulting `Vec` will be the same as the index it
322/// would get during a `root.walk` call.
323fn all_text_tokens(root: &Node) -> Vec<FlatToken> {
324 let mut result = Vec::new();
325 let mut walk_index = 0;
326 root.walk(|node, nesting_level| {
327 if let Some(text_node) = node.cast::<Text>() {
328 result.push(FlatToken::Text {
329 content: &text_node.content,
330 nesting_level,
331 });
332 } else if let Some(html_node) = node.cast::<HtmlInline>() {
333 result.push(FlatToken::HtmlInline {
334 content: &html_node.content,
335 });
336 } else if node.is::<Paragraph>() || node.is::<Hardbreak>() || node.is::<Softbreak>() {
337 result.push(FlatToken::LineBreak);
338 } else {
339 result.push(FlatToken::Irrelevant);
340 }
341 walk_index += 1;
342 });
343 result
344}
345
346/// Checks whether we can open or close a pair of quotes, given the quote type
347/// and the type of characters before and after the quote
348fn can_open_or_close(quote_type: &QuoteType, last_char: char, next_char: char) -> (bool, bool) {
349 // special case: 1"" -> count first quote as an inch
350 // We handle this before doing anything else to simplify the conditions
351 // below.
352 let is_double = *quote_type == QuoteType::Double;
353 let next_is_double = next_char == DOUBLE_QUOTE;
354 let last_is_digit = last_char.is_ascii_digit();
355 if next_is_double && is_double && last_is_digit {
356 return (false, false);
357 }
358
359 // using `is_ascii_punctuation` here matches the JS version exactly, but
360 // that also means we might inherit that implementation's shortcomings
361 // by ignoring unicode punctuation. `is_punct_char` however should
362 // compensate for this.
363 let is_last_punctuation = last_char.is_ascii_punctuation() || is_punct_char(last_char);
364 let is_next_punctuation = next_char.is_ascii_punctuation() || is_punct_char(next_char);
365
366 // Yet again we rely on rust's built-in character handling. The definition
367 // of `is_whitespace` according to the unicode proplist.txt shows that the
368 // difference to the JS version.
369 // https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
370 //
371 // Recognized as whitespace by Rust, but not by JS:
372 // 0x85, 0x28, 0x29
373 let is_last_whitespace = last_char.is_whitespace();
374 let is_next_whitespace = next_char.is_whitespace();
375
376 let can_open =
377 !is_next_whitespace && (!is_next_punctuation || is_last_whitespace || is_last_punctuation);
378 let can_close =
379 !is_last_whitespace && (!is_last_punctuation || is_next_whitespace || is_next_punctuation);
380
381 if can_open && can_close {
382 // Replace quotes in the middle of punctuation sequence, but not
383 // in the middle of the words, i.e.:
384 //
385 // 1. foo " bar " baz - not replaced
386 // 2. foo-"-bar-"-baz - replaced
387 // 3. foo"bar"baz - not replaced
388 return (is_last_punctuation, is_next_punctuation);
389 }
390
391 (can_open, can_close)
392}
393
/// Applies a set of character replacements to a string.
///
/// `replacement_ops` maps a character index (not a byte offset) in `content`
/// to the character that should appear at that index. Characters without an
/// entry are copied unchanged; entries beyond the end of `content` have no
/// effect.
fn execute_replacements(replacement_ops: &HashMap<usize, char>, content: &str) -> String {
    let mut rewritten = String::with_capacity(content.len());
    for (char_index, original) in content.chars().enumerate() {
        let replacement = replacement_ops.get(&char_index).copied();
        rewritten.push(replacement.unwrap_or(original));
    }
    rewritten
}
402
403/// Truncates the stack of quotes following the JS implementation.
404///
405/// This _might_ be simplified by removing the `rev` call and using
406/// `Vec::take_while` instead, but I'm not 100% sure yet that the levels on the
407/// stack are really monotonously increasing, so I'm leaving it as is for now.
408fn truncate_stack(quote_stack: &mut Vec<QuoteMarker>, level: u32) {
409 let stack_len = quote_stack
410 .iter()
411 .rev()
412 .skip_while(|qm| qm.level > level)
413 .count();
414 quote_stack.truncate(stack_len);
415}
416
417/// Finds all single or double quotes in a string, together with their positions
418///
419/// This might be replaced with a regex search, but not sure that's really worth
420/// it, given that we only check for two fixed characters.
421fn find_quotes(content: &str) -> impl Iterator<Item = (usize, QuoteType)> + '_ {
422 content.chars().enumerate().filter_map(|(p, c)| {
423 if c == SINGLE_QUOTE || c == DOUBLE_QUOTE {
424 Some((
425 p,
426 if c == SINGLE_QUOTE {
427 QuoteType::Single
428 } else {
429 QuoteType::Double
430 },
431 ))
432 } else {
433 None
434 }
435 })
436}
437
438/// Finds the next relevant character after a given position
439///
440/// This is the mirror image of `find_last_char_before`.
441///
442/// The position given is that of a quote we found. It is identified by its
443/// token/node index and the position of the quote inside that token. The full
444/// sequence of the text tokens is searched forwards from that point and the
445/// first character is returned.
446///
447/// If a line break or the end of the document is encountered during search,
448/// space (0x20) is returned.
449///
450/// This function is a bit simpler than `find_last_char_before` because Vec
451/// conveniently returns None for out-of-range indexes at the top end, while not
452/// allowing to index with negative index.
453fn find_first_char_after(
454 text_tokens: &[FlatToken],
455 token_index: usize,
456 quote_position: usize,
457) -> char {
458 for (idx_t, text_token) in text_tokens.iter().enumerate().skip(token_index) {
459 let token = match text_token {
460 FlatToken::LineBreak => return SPACE,
461 FlatToken::Text {
462 content,
463 nesting_level: _,
464 } => content,
465 FlatToken::HtmlInline {
466 content,
467 } => content,
468 FlatToken::Irrelevant => continue,
469 };
470 let start_index = if idx_t == token_index {
471 quote_position + 1
472 } else {
473 0
474 };
475 if let Some(c) = token.chars().nth(start_index) {
476 return c;
477 }
478 }
479 // this will be hit if we start searching at the last position of the last
480 // text token
481 SPACE
482}
483
484/// Finds the last relevant character before a given position
485///
486/// The position given is that of a quote we found. It is identified by its
487/// token/node index and the position of the quote inside that token. The full
488/// sequence of the text tokens is searched backwards from that point and the
489/// first character is returned.
490///
491/// If a line break or the beginning of the document is encountered during
492/// search, space (0x20) is returned.
493fn find_last_char_before(
494 text_tokens: &[FlatToken],
495 token_index: usize,
496 quote_position: usize,
497) -> char {
498 for idx_t in (0..=token_index).rev() {
499 let token = match &text_tokens[idx_t] {
500 FlatToken::LineBreak => return SPACE,
501 FlatToken::Text {
502 content,
503 nesting_level: _,
504 } => content,
505 FlatToken::HtmlInline {
506 content,
507 } => content,
508 FlatToken::Irrelevant => continue,
509 };
510
511 // this is _not_ the first index we want to look at, but rather the
512 // index just _after_ that. The reason is simply that this is `usize`
513 // and we want to first check if it's possible to still subtract 1 from
514 // it without panicking.
515 let start_index: usize = if idx_t == token_index {
516 quote_position
517 } else {
518 token.chars().count()
519 };
520 // means we can't go any further left -> try the next token (i.e. the
521 // one preceding this one)
522 if start_index == 0 {
523 continue;
524 }
525 // unwrapping is safe here, we built our index to match the length of
526 // the string, or (in the case of the token containing the quote itself)
527 // it should be indexing a _prefix_ of the string.
528 return token.chars().nth(start_index - 1).unwrap();
529 }
530 // this will be hit if we find a quote in the first position of the first token
531 SPACE
532}
533
534
#[cfg(test)]
mod tests {
    use crate::plugins::{cmark, extra, html};
    use crate::MarkdownIt;

    /// Plain single and double quotes are turned into their curly variants.
    #[test]
    fn smartquotes_basics() {
        let mut parser = MarkdownIt::new();
        cmark::add(&mut parser);
        extra::smartquotes::add(&mut parser);

        let rendered = parser.parse(r#"'hello' "world""#).render();
        assert_eq!(rendered.trim(), r#"<p>‘hello’ “world”</p>"#);
    }

    /// Quotes inside inline HTML must be left untouched.
    #[test]
    fn smartquotes_shouldnt_affect_html() {
        let mut parser = MarkdownIt::new();
        cmark::add(&mut parser);
        html::html_inline::add(&mut parser);
        extra::smartquotes::add(&mut parser);

        let rendered = parser.parse(r#"<a href="hello"></a>"#).render();
        assert_eq!(rendered.trim(), r#"<p><a href="hello"></a></p>"#);
    }

    /// Regression test for https://github.com/rlidwka/markdown-it.rs/issues/26
    #[test]
    fn smartquotes_should_work_with_typographer() {
        let mut parser = MarkdownIt::new();
        cmark::add(&mut parser);
        html::html_inline::add(&mut parser);
        extra::typographer::add(&mut parser);
        extra::smartquotes::add(&mut parser);

        let rendered = parser.parse("\"**...**\"").render();
        assert_eq!(rendered.trim(), "<p>“<strong>…</strong>”</p>");
    }
}
567}