markdown_it/plugins/extra/
smartquotes.rs

//! Replaces `"` and `'` quotes with "nicer" ones like `‘`, `’`, `“`, `”`, or
//! with `’` for words like "isn't".
//!
//! This currently only supports single character quotes, which is a limitation
//! of the Rust implementation due to the use of `const` generics.
//!
//! ## Implementation notes
//!
//! The main obstacle to implementing this was the fact that the document is
//! necessarily represented as a tree of nodes.
//! Each node is thus necessarily referenced by its parents, which means that at
//! any given moment we cannot hold a mutable reference to a node if any other
//! part of the code holds a reference to the document. At least that's my
//! understanding of the problem.
//! The smartquotes algorithm from the JS library makes heavy use of iteration
//! backwards and forwards through a flat list of tokens. This isn't really
//! possible in the Rust implementation. Building a flat representation of all
//! `Node` objects is easy, but holding that list precludes us from executing a
//! `root.walk_mut` call at the same time.
//! On top of that, while the smartquotes algorithm iterates linearly over all
//! nodes/tokens, looking at a specific token with index `j` can trigger
//! replacements in any of the tokens with indexes `0` to `j - 1`.
//!
//! The solution proposed here is to first compute all the replacement
//! operations on a read-only flat view of the document, and _then_ to perform
//! all replacements in a single call to `root.walk_mut`.
27use std::collections::HashMap;
28
29use crate::common::utils::is_punct_char;
30use crate::parser::core::CoreRule;
31use crate::parser::inline::Text;
32use crate::plugins::cmark::block::paragraph::Paragraph;
33use crate::plugins::cmark::inline::newline::{Hardbreak, Softbreak};
34use crate::plugins::html::html_inline::HtmlInline;
35use crate::{MarkdownIt, Node};
36
/// Typographic apostrophe (U+2019) substituted for a `'` that is judged to be
/// an apostrophe rather than part of a quote pair.
const APOSTROPHE: char = '’';
/// The plain ASCII single quote (U+0027) that is searched for in text nodes.
const SINGLE_QUOTE: char = '\u{27}';
/// The plain ASCII double quote (U+0022) that is searched for in text nodes.
const DOUBLE_QUOTE: char = '\u{22}';
/// Stand-in character reported when the neighbour search hits a line break or
/// runs off either end of the document.
const SPACE: char = '\u{20}';
41
42/// Add smartquotes with the "classic" quote set of `‘`, `’`, `“`, and `”`.
43pub fn add(md: &mut MarkdownIt) {
44    add_with::<'‘', '’', '“', '”'>(md);
45}
46
47pub fn add_with<
48    const OPEN_SINGLE_QUOTE: char,
49    const CLOSE_SINGLE_QUOTE: char,
50    const OPEN_DOUBLE_QUOTE: char,
51    const CLOSE_DOUBLE_QUOTE: char,
52>(
53    md: &mut MarkdownIt,
54) {
55    md.add_rule::<SmartQuotesRule<
56        OPEN_SINGLE_QUOTE,
57        CLOSE_SINGLE_QUOTE,
58        OPEN_DOUBLE_QUOTE,
59        CLOSE_DOUBLE_QUOTE>>();
60}
61
/// Reduced, read-only stand-in for a document `Node`.
///
/// Quote replacement needs to look both backwards and forwards across
/// neighbouring nodes, which the tree itself does not make convenient. The
/// document is therefore flattened into a list of these tokens, one per
/// visited node, keeping only the data the algorithm inspects. Because every
/// node yields exactly one token, a token's position in the list equals the
/// node's position in the tree walk.
enum FlatToken<'a> {
    /// A node that acts as a line boundary for the neighbour search.
    LineBreak,
    /// A text node: its content and its nesting depth in the tree.
    Text { content: &'a str, nesting_level: u32 },
    /// An inline HTML node; only its content is kept.
    HtmlInline { content: &'a str },
    /// Any other node; a filler that keeps the indexes aligned with the walk.
    Irrelevant,
}
81
/// Which of the two ASCII quote characters was encountered.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum QuoteType {
    /// The `'` character.
    Single,
    /// The `"` character.
    Double,
}
88
89/// Holds information about quotes we have encountered thus far.
90///
91/// These quotes may or may not be used to close a pair further down the line.
92/// The different fields thus hold all the information we need to a) decide
93/// whether or not to match them up with another quote we encounter, and b) to
94/// perform the correct replacement, should be indeed use this quote to close a
95/// pair.
96struct QuoteMarker {
97    /// The iteration index of the node in which this quote was found.
98    ///
99    /// This is the index at which this quote's `Node` appears in a pre-order
100    /// depth-first walk of the document tree. Since we can only _modify_ nodes
101    /// during a walk, we rely on this index to tell us which nodes to modify.
102    walk_index: usize,
103    /// The position of the quote within node's `content`
104    quote_position: usize,
105    /// Whether this is a single or a double quote
106    quote_type: QuoteType,
107    /// Nesting level of the containing token
108    ///
109    /// This is the nesting of the containing `Node` within the document tree.
110    /// It is used to decide which quotes can be matched up.
111    level: u32,
112}
113
/// One pending character substitution.
///
/// Replacements are first computed while the document tree is only read, and
/// applied afterwards during a mutable walk. Each value of this type names a
/// single quote character to swap during that walk.
struct ReplacementOp {
    /// Walk index of the node whose content is to be changed.
    walk_index: usize,
    /// Position of the quote inside that node's content.
    quote_position: usize,
    /// The character to put in place of the original quote.
    quote: char,
}
125
/// Core rule that swaps plain ASCII quotes for typographic ones.
///
/// The replacement characters are fixed at compile time through the four
/// const parameters: opening single, closing single, opening double and
/// closing double quote, in that order.
pub struct SmartQuotesRule<
    const OPEN_SINGLE_QUOTE: char,
    const CLOSE_SINGLE_QUOTE: char,
    const OPEN_DOUBLE_QUOTE: char,
    const CLOSE_DOUBLE_QUOTE: char,
>;
132
133impl<
134        const OPEN_SINGLE_QUOTE: char,
135        const CLOSE_SINGLE_QUOTE: char,
136        const OPEN_DOUBLE_QUOTE: char,
137        const CLOSE_DOUBLE_QUOTE: char,
138    > CoreRule
139    for SmartQuotesRule<
140        OPEN_SINGLE_QUOTE,
141        CLOSE_SINGLE_QUOTE,
142        OPEN_DOUBLE_QUOTE,
143        CLOSE_DOUBLE_QUOTE,
144    >
145{
146    fn run(root: &mut Node, _: &MarkdownIt) {
147        let text_tokens = all_text_tokens(root);
148
149        let replacement_ops = Self::compute_replacements(text_tokens);
150
151        // now that we know what we want to replace where, we go over the nodes a _third_ time to do all the actual replacements.
152        let mut current_index: usize = 0;
153
154        root.walk_mut(|node, _| {
155            if let Some(current_replacements) = replacement_ops.get(&current_index) {
156                let text_node = node.cast_mut::<Text>()
157                    .expect("Expected to find a text node at this index because we constructed our replacements HashMap accordingly.");
158                text_node.content = execute_replacements(current_replacements, &text_node.content);
159            };
160            current_index += 1;
161        });
162    }
163}
164
165impl<
166        const OPEN_SINGLE_QUOTE: char,
167        const CLOSE_SINGLE_QUOTE: char,
168        const OPEN_DOUBLE_QUOTE: char,
169        const CLOSE_DOUBLE_QUOTE: char,
170    >
171    SmartQuotesRule<OPEN_SINGLE_QUOTE, CLOSE_SINGLE_QUOTE, OPEN_DOUBLE_QUOTE, CLOSE_DOUBLE_QUOTE>
172{
173    /// Walk the list of tokens to figure out what needs replacing where. to do
174    /// this, we need to search back and forth over the nodes to find matching
175    /// quotes across nodes. The borrow checker won't let us handle the entire
176    /// set of nodes as mutable at the same time however, so all we do here is
177    /// figure out what we _want_ to replace in which node.
178    fn compute_replacements(text_tokens: Vec<FlatToken>) -> HashMap<usize, HashMap<usize, char>> {
179        let mut quote_stack: Vec<QuoteMarker> = Vec::new();
180        let mut replacement_ops: HashMap<usize, HashMap<usize, char>> = HashMap::new();
181        for (walk_index, token) in text_tokens.iter().enumerate() {
182            if let FlatToken::Text {
183                content,
184                nesting_level,
185            } = token
186            {
187                for op in Self::replace_smartquotes(
188                    content,
189                    walk_index,
190                    *nesting_level,
191                    &text_tokens,
192                    &mut quote_stack,
193                ) {
194                    replacement_ops
195                        .entry(op.walk_index)
196                        .or_default()
197                        .insert(op.quote_position, op.quote);
198                }
199            }
200        }
201        replacement_ops
202    }
203
204    /// Compute quote replacements found by looking at a single text block
205    fn replace_smartquotes(
206        content: &str,
207        walk_index: usize,
208        level: u32,
209        text_tokens: &[FlatToken],
210        quote_stack: &mut Vec<QuoteMarker>,
211    ) -> Vec<ReplacementOp> {
212        truncate_stack(quote_stack, level);
213
214        let mut result: Vec<_> = Vec::new();
215        for (quote_position, quote_type) in find_quotes(content) {
216            let last_char = find_last_char_before(text_tokens, walk_index, quote_position);
217            let next_char = find_first_char_after(text_tokens, walk_index, quote_position);
218
219            let (can_open, can_close): (bool, bool) =
220                can_open_or_close(&quote_type, last_char, next_char);
221
222            if !can_open && !can_close {
223                // if this is a single quote then we're in the middle of a word and
224                // assume it to be an apostrophe
225                if quote_type == QuoteType::Single {
226                    result.push(ReplacementOp {
227                        walk_index,
228                        quote_position,
229                        quote: APOSTROPHE,
230                    });
231                }
232                // in any case, we're done with this quote and continue searching
233                // for more quotes in this text block
234                continue;
235            }
236
237            if can_close {
238                if let Some((opening_op, closing_op, new_stack_len)) =
239                    Self::try_close(quote_stack, walk_index, level, quote_type, quote_position)
240                {
241                    quote_stack.truncate(new_stack_len);
242                    result.push(opening_op);
243                    result.push(closing_op);
244                    continue;
245                }
246            }
247
248            if can_open {
249                quote_stack.push(QuoteMarker {
250                    walk_index,
251                    quote_position,
252                    quote_type,
253                    level,
254                });
255            } else if can_close && quote_type == QuoteType::Single {
256                result.push(ReplacementOp {
257                    walk_index,
258                    quote_position,
259                    quote: APOSTROPHE,
260                });
261            }
262        }
263        result
264    }
265
266    /// Try to find a matching opening quote to the given one.
267    ///
268    /// If a match is found, returns `Some` with two `ReplacementOp`s to be
269    /// added to the result, and with the resulting length of the `quote_stack`.
270    fn try_close(
271        quote_stack: &[QuoteMarker],
272        walk_index: usize,
273        level: u32,
274        quote_type: QuoteType,
275        quote_position: usize,
276    ) -> Option<(ReplacementOp, ReplacementOp, usize)> {
277        for (j, other_item) in quote_stack.iter().enumerate().rev() {
278            if other_item.level < level {
279                return None;
280            }
281            if other_item.quote_type == quote_type && other_item.level == level {
282                return Some((
283                    ReplacementOp {
284                        walk_index: other_item.walk_index,
285                        quote_position: other_item.quote_position,
286                        quote: if quote_type == QuoteType::Single {
287                            OPEN_SINGLE_QUOTE
288                        } else {
289                            OPEN_DOUBLE_QUOTE
290                        },
291                    },
292                    ReplacementOp {
293                        walk_index,
294                        quote_position,
295                        quote: if quote_type == QuoteType::Single {
296                            CLOSE_SINGLE_QUOTE
297                        } else {
298                            CLOSE_DOUBLE_QUOTE
299                        },
300                    },
301                    j,
302                ));
303            }
304        }
305        None
306    }
307}
308
309/// Produces a simplified flat list of all tokens, with the necessary
310/// information to do smart quote replacement.
311///
312/// This handles inline html and inline code like JS version seems to do.
313/// This list is a work-around for the fact that we can't build a flat list of
314/// all nodes for iteration back and forth, and at the same time do a mutable
315/// walk on the document tree.
316///
317/// Returns a `Vec<FlatToken<'a>>` where `<'a>` is the same lifetime as `root`.
318/// This simply reflects the fact that the `content: &str` entries of the
319/// `FlatToken` structs reference the same memory as `root`'s children.
320/// Every entry in the `Vec` will produce an entry in the result, meaning that
321/// the index of a token in the resulting `Vec` will be the same as the index it
322/// would get during a `root.walk` call.
323fn all_text_tokens(root: &Node) -> Vec<FlatToken> {
324    let mut result = Vec::new();
325    let mut walk_index = 0;
326    root.walk(|node, nesting_level| {
327        if let Some(text_node) = node.cast::<Text>() {
328            result.push(FlatToken::Text {
329                content: &text_node.content,
330                nesting_level,
331            });
332        } else if let Some(html_node) = node.cast::<HtmlInline>() {
333            result.push(FlatToken::HtmlInline {
334                content: &html_node.content,
335            });
336        } else if node.is::<Paragraph>() || node.is::<Hardbreak>() || node.is::<Softbreak>() {
337            result.push(FlatToken::LineBreak);
338        } else {
339            result.push(FlatToken::Irrelevant);
340        }
341        walk_index += 1;
342    });
343    result
344}
345
346/// Checks whether we can open or close a pair of quotes, given the quote type
347/// and the type of characters before and after the quote
348fn can_open_or_close(quote_type: &QuoteType, last_char: char, next_char: char) -> (bool, bool) {
349    // special case: 1"" -> count first quote as an inch
350    // We handle this before doing anything else to simplify the conditions
351    // below.
352    let is_double = *quote_type == QuoteType::Double;
353    let next_is_double = next_char == DOUBLE_QUOTE;
354    let last_is_digit = last_char.is_ascii_digit();
355    if next_is_double && is_double && last_is_digit {
356        return (false, false);
357    }
358
359    // using `is_ascii_punctuation` here matches the JS version exactly, but
360    // that also means we might inherit that implementation's shortcomings
361    // by ignoring unicode punctuation. `is_punct_char` however should
362    // compensate for this.
363    let is_last_punctuation = last_char.is_ascii_punctuation() || is_punct_char(last_char);
364    let is_next_punctuation = next_char.is_ascii_punctuation() || is_punct_char(next_char);
365
366    // Yet again we rely on rust's built-in character handling. The definition
367    // of `is_whitespace` according to the unicode proplist.txt shows that the
368    // difference to the JS version.
369    // https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
370    //
371    // Recognized as whitespace by Rust, but not by JS:
372    // 0x85, 0x28, 0x29
373    let is_last_whitespace = last_char.is_whitespace();
374    let is_next_whitespace = next_char.is_whitespace();
375
376    let can_open =
377        !is_next_whitespace && (!is_next_punctuation || is_last_whitespace || is_last_punctuation);
378    let can_close =
379        !is_last_whitespace && (!is_last_punctuation || is_next_whitespace || is_next_punctuation);
380
381    if can_open && can_close {
382        // Replace quotes in the middle of punctuation sequence, but not
383        // in the middle of the words, i.e.:
384        //
385        // 1. foo " bar " baz - not replaced
386        // 2. foo-"-bar-"-baz - replaced
387        // 3. foo"bar"baz     - not replaced
388        return (is_last_punctuation, is_next_punctuation);
389    }
390
391    (can_open, can_close)
392}
393
/// Applies a set of single-character substitutions to a string.
///
/// `replacement_ops` maps a character index (counted in `char`s, not bytes)
/// to the character that should appear at that index. Characters whose index
/// is not in the map are copied unchanged; map entries beyond the end of
/// `content` have no effect.
fn execute_replacements(replacement_ops: &HashMap<usize, char>, content: &str) -> String {
    let mut rewritten = String::with_capacity(content.len());
    for (char_index, original) in content.chars().enumerate() {
        let replacement = replacement_ops.get(&char_index).copied();
        rewritten.push(replacement.unwrap_or(original));
    }
    rewritten
}
402
403/// Truncates the stack of quotes following the JS implementation.
404///
405/// This _might_ be simplified by removing the `rev` call and using
406/// `Vec::take_while` instead, but I'm not 100% sure yet that the levels on the
407/// stack are really monotonously increasing, so I'm leaving it as is for now.
408fn truncate_stack(quote_stack: &mut Vec<QuoteMarker>, level: u32) {
409    let stack_len = quote_stack
410        .iter()
411        .rev()
412        .skip_while(|qm| qm.level > level)
413        .count();
414    quote_stack.truncate(stack_len);
415}
416
417/// Finds all single or double quotes in a string, together with their positions
418///
419/// This might be replaced with a regex search, but not sure that's really worth
420/// it, given that we only check for two fixed characters.
421fn find_quotes(content: &str) -> impl Iterator<Item = (usize, QuoteType)> + '_ {
422    content.chars().enumerate().filter_map(|(p, c)| {
423        if c == SINGLE_QUOTE || c == DOUBLE_QUOTE {
424            Some((
425                p,
426                if c == SINGLE_QUOTE {
427                    QuoteType::Single
428                } else {
429                    QuoteType::Double
430                },
431            ))
432        } else {
433            None
434        }
435    })
436}
437
438/// Finds the next relevant character after a given position
439///
440/// This is the mirror image of `find_last_char_before`.
441///
442/// The position given is that of a quote we found. It is identified by its
443/// token/node index and the position of the quote inside that token. The full
444/// sequence of the text tokens is searched forwards from that point and the
445/// first character is returned.
446///
447/// If a line break or the end of the document is encountered during search,
448/// space (0x20) is returned.
449///
450/// This function is a bit simpler than `find_last_char_before` because Vec
451/// conveniently returns None for out-of-range indexes at the top end, while not
452/// allowing to index with negative index.
453fn find_first_char_after(
454    text_tokens: &[FlatToken],
455    token_index: usize,
456    quote_position: usize,
457) -> char {
458    for (idx_t, text_token) in text_tokens.iter().enumerate().skip(token_index) {
459        let token = match text_token {
460            FlatToken::LineBreak => return SPACE,
461            FlatToken::Text {
462                content,
463                nesting_level: _,
464            } => content,
465            FlatToken::HtmlInline {
466                content,
467            } => content,
468            FlatToken::Irrelevant => continue,
469        };
470        let start_index = if idx_t == token_index {
471            quote_position + 1
472        } else {
473            0
474        };
475        if let Some(c) = token.chars().nth(start_index) {
476            return c;
477        }
478    }
479    // this will be hit if we start searching at the last position of the last
480    // text token
481    SPACE
482}
483
484/// Finds the last relevant character before a given position
485///
486/// The position given is that of a quote we found. It is identified by its
487/// token/node index and the position of the quote inside that token. The full
488/// sequence of the text tokens is searched backwards from that point and the
489/// first character is returned.
490///
491/// If a line break or the beginning of the document is encountered during
492/// search, space (0x20) is returned.
493fn find_last_char_before(
494    text_tokens: &[FlatToken],
495    token_index: usize,
496    quote_position: usize,
497) -> char {
498    for idx_t in (0..=token_index).rev() {
499        let token = match &text_tokens[idx_t] {
500            FlatToken::LineBreak => return SPACE,
501            FlatToken::Text {
502                content,
503                nesting_level: _,
504            } => content,
505            FlatToken::HtmlInline {
506                content,
507            } => content,
508            FlatToken::Irrelevant => continue,
509        };
510
511        // this is _not_ the first index we want to look at, but rather the
512        // index just _after_ that.  The reason is simply that this is `usize`
513        // and we want to first check if it's possible to still subtract 1 from
514        // it without panicking.
515        let start_index: usize = if idx_t == token_index {
516            quote_position
517        } else {
518            token.chars().count()
519        };
520        // means we can't go any further left -> try the next token (i.e. the
521        // one preceding this one)
522        if start_index == 0 {
523            continue;
524        }
525        // unwrapping is safe here, we built our index to match the length of
526        // the string, or (in the case of the token containing the quote itself)
527        // it should be indexing a _prefix_ of the string.
528        return token.chars().nth(start_index - 1).unwrap();
529    }
530    // this will be hit if we find a quote in the first position of the first token
531    SPACE
532}
533
534
#[cfg(test)]
mod tests {
    use crate::plugins::{cmark, extra, html};
    use crate::MarkdownIt;

    /// Plain single and double quote pairs are turned into typographic ones.
    #[test]
    fn smartquotes_basics() {
        let mut md = MarkdownIt::new();
        cmark::add(&mut md);
        extra::smartquotes::add(&mut md);

        let rendered = md.parse(r#"'hello' "world""#).render();
        assert_eq!(rendered.trim(), r#"<p>‘hello’ “world”</p>"#);
    }

    /// Quotes inside inline HTML must be left untouched.
    #[test]
    fn smartquotes_shouldnt_affect_html() {
        let mut md = MarkdownIt::new();
        cmark::add(&mut md);
        html::html_inline::add(&mut md);
        extra::smartquotes::add(&mut md);

        let rendered = md.parse(r#"<a href="hello"></a>"#).render();
        assert_eq!(rendered.trim(), r#"<p><a href="hello"></a></p>"#);
    }

    /// Regression test for https://github.com/rlidwka/markdown-it.rs/issues/26
    #[test]
    fn smartquotes_should_work_with_typographer() {
        let mut md = MarkdownIt::new();
        cmark::add(&mut md);
        html::html_inline::add(&mut md);
        extra::typographer::add(&mut md);
        extra::smartquotes::add(&mut md);

        let rendered = md.parse("\"**...**\"").render();
        assert_eq!(rendered.trim(), "<p>“<strong>…</strong>”</p>");
    }
}