markdown_it/generics/inline/
emph_pair.rs

1//! Structure similar to `*emphasis*` with configurable markers of fixed length.
2//!
3//! There are many structures in various markdown flavors that
4//! can be implemented with this, namely:
5//!
6//!  - `*emphasis*` or `_emphasis_` -> `<em>emphasis</em>`
7//!  - `**strong**` or `__strong__` -> `<strong>strong</strong>`
8//!  - `~~strikethrough~~` -> `<s>strikethrough</s>`
9//!  - `==marked==` -> `<mark>marked</mark>`
10//!  - `++inserted++` -> `<ins>inserted</ins>`
11//!  - `~subscript~` -> `<sub>subscript</sub>`
12//!  - `^superscript^` -> `<sup>superscript</sup>`
13//!
//! You add a custom structure by using the [add_with] function, which takes the following arguments:
15//!  - `MARKER` - marker character
16//!  - `LENGTH` - length of the opening/closing marker (can be 1, 2 or 3)
17//!  - `CAN_SPLIT_WORD` - whether this structure can be found in the middle of the word
18//!    (for example, note the difference between `foo*bar*baz` and `foo_bar_baz`
19//!    in CommonMark - first one is an emphasis, second one isn't)
20//!  - `md` - parser instance
21//!  - `f` - function that should return your custom [Node]
22//!
23//! Here is an example of implementing superscript in your custom code:
24//!
25//! ```rust
26//! use markdown_it::generics::inline::emph_pair;
27//! use markdown_it::{MarkdownIt, Node, NodeValue, Renderer};
28//!
29//! #[derive(Debug)]
30//! struct Superscript;
31//! impl NodeValue for Superscript {
32//!     fn render(&self, node: &Node, fmt: &mut dyn Renderer) {
33//!         fmt.open("sup", &node.attrs);
34//!         fmt.contents(&node.children);
35//!         fmt.close("sup");
36//!     }
37//! }
38//!
39//! let md = &mut MarkdownIt::new();
40//! emph_pair::add_with::<'^', 1, true>(md, || Node::new(Superscript));
41//!
42//! let html = md.parse("e^iπ^+1=0").render();
43//! assert_eq!(html.trim(), "e<sup>iπ</sup>+1=0");
44//! ```
45//!
46//! Note that these structures have lower priority than the rest of the rules,
47//! e.g. `` *foo`bar*baz` `` is parsed as `*foo<code>bar*baz</code>`.
48//!
49use std::cmp::min;
50
51use crate::common::sourcemap::SourcePos;
52use crate::parser::core::CoreRule;
53use crate::parser::extset::{MarkdownItExt, NodeExt};
54use crate::parser::inline::builtin::InlineParserRule;
55use crate::parser::inline::{InlineRule, InlineState, Text};
56use crate::{MarkdownIt, Node, NodeValue};
57
58#[derive(Debug, Default)]
59struct PairConfig<const MARKER: char> {
60    inserted: bool,
61    fns: [Option<fn () -> Node>; 3],
62}
63impl<const MARKER: char> MarkdownItExt for PairConfig<MARKER> {}
64
65#[derive(Debug, Default)]
66struct OpenersBottom<const MARKER: char>([ usize; 6 ]);
67impl<const MARKER: char> NodeExt for OpenersBottom<MARKER> {}
68
69#[derive(Debug, Clone)]
70#[doc(hidden)]
71pub struct EmphMarker {
72    // Starting marker
73    pub marker:    char,
74
75    // Total length of these series of delimiters.
76    pub length:    usize,
77
78    // Remaining length that's not already matched to other delimiters.
79    pub remaining: usize,
80
81    // Boolean flags that determine if this delimiter could open or close
82    // an emphasis.
83    pub open:      bool,
84    pub close:     bool,
85}
86
87// this node is supposed to be replaced by actual emph or text node
88impl NodeValue for EmphMarker {}
89
90pub fn add_with<const MARKER: char, const LENGTH: u8, const CAN_SPLIT_WORD: bool>(md: &mut MarkdownIt, f: fn () -> Node) {
91    let pair_config = md.ext.get_or_insert_default::<PairConfig<MARKER>>();
92    pair_config.fns[LENGTH as usize - 1] = Some(f);
93
94    if !pair_config.inserted {
95        pair_config.inserted = true;
96        md.inline.add_rule::<EmphPairScanner<MARKER, CAN_SPLIT_WORD>>();
97    }
98
99    if !md.has_rule::<FragmentsJoin>() {
100        md.add_rule::<FragmentsJoin>()
101            .before_all()
102            .after::<InlineParserRule>();
103    }
104}
105
106#[doc(hidden)]
107pub struct EmphPairScanner<const MARKER: char, const CAN_SPLIT_WORD: bool>;
108impl<const MARKER: char, const CAN_SPLIT_WORD: bool> InlineRule for EmphPairScanner<MARKER, CAN_SPLIT_WORD> {
109    const MARKER: char = MARKER;
110
111    // this rule works on a closing marker, so for technical reasons any rules trying to skip it
112    // should see just plain text
113    fn check(_: &mut InlineState) -> Option<usize> { None }
114
115    fn run(state: &mut InlineState) -> Option<(Node, usize)> {
116        let mut chars = state.src[state.pos..state.pos_max].chars();
117        if chars.next().unwrap() != MARKER { return None; }
118
119        let scanned = state.scan_delims(state.pos, CAN_SPLIT_WORD);
120        let mut node = Node::new(EmphMarker {
121            marker:    MARKER,
122            length:    scanned.length,
123            remaining: scanned.length,
124            open:      scanned.can_open,
125            close:     scanned.can_close,
126        });
127        node.srcmap = state.get_map(state.pos, state.pos + scanned.length);
128        node = scan_and_match_delimiters::<MARKER>(state, node);
129        let map = node.srcmap.unwrap().get_byte_offsets();
130        // backtrack to keep correct source maps
131        state.pos += scanned.length;
132        let token_len = map.1 - map.0;
133        state.pos -= token_len;
134        Some((node, token_len))
135    }
136}
137
138/// Assuming last token is a closing delimiter we just inserted,
139/// try to find opener(s). If any are found, move stuff to nested emph node.
140fn scan_and_match_delimiters<const MARKER: char>(state: &mut InlineState, mut closer_token: Node) -> Node {
141    if state.node.children.is_empty() { return closer_token; } // must have at least opener and closer
142
143    let mut closer = closer_token.cast_mut::<EmphMarker>().unwrap().clone();
144    if !closer.close { return closer_token; }
145
146    // Previously calculated lower bounds (previous fails)
147    // for each marker, each delimiter length modulo 3,
148    // and for whether this closer can be an opener;
149    // https://github.com/commonmark/cmark/commit/34250e12ccebdc6372b8b49c44fab57c72443460
150    let openers_for_marker = state.node.ext.get_or_insert_default::<OpenersBottom<MARKER>>();
151    let openers_parameter = (closer.open as usize) * 3 + closer.length % 3;
152
153    let min_opener_idx = openers_for_marker.0[openers_parameter];
154
155    let mut idx = state.node.children.len() - 1;
156    let mut new_min_opener_idx = idx;
157    while idx > min_opener_idx {
158        idx -= 1;
159
160        let Some(opener) = state.node.children[idx].cast::<EmphMarker>() else { continue; };
161
162        let mut opener = opener.clone();
163        if opener.open && opener.marker == closer.marker && !is_odd_match(&opener, &closer) {
164            while closer.remaining > 0 && opener.remaining > 0 {
165                let max_marker_len = min(3, min(opener.remaining, closer.remaining));
166                let mut matched_rule = None;
167                let fns = &state.md.ext.get::<PairConfig<MARKER>>().unwrap().fns;
168                for marker_len in (1..=max_marker_len).rev() {
169                    if let Some(f) = fns[marker_len-1] {
170                        matched_rule = Some((marker_len, f));
171                        break;
172                    }
173                }
174
175                // If matched_fn isn't found, it can only mean that function is defined for larger marker
176                // than we have (e.g. function defined for **, we have *).
177                // Treat this as "marker not found".
178                if matched_rule.is_none() { break; }
179
180                let (marker_len, marker_fn) = matched_rule.unwrap();
181
182                closer.remaining -= marker_len;
183                opener.remaining -= marker_len;
184
185                let mut new_token = marker_fn();
186                new_token.children = state.node.children.split_off(idx + 1);
187
188                // cut marker_len chars from start, i.e. "12345" -> "345"
189                let mut end_map_pos = 0;
190                if let Some(map) = closer_token.srcmap {
191                    let (start, end) = map.get_byte_offsets();
192                    closer_token.srcmap = Some(SourcePos::new(start + marker_len, end));
193                    end_map_pos = start + marker_len;
194                }
195
196                // cut marker_len chars from end, i.e. "12345" -> "123"
197                let mut start_map_pos = 0;
198                let opener_token = state.node.children.last_mut().unwrap();
199                if let Some(map) = opener_token.srcmap {
200                    let (start, end) = map.get_byte_offsets();
201                    opener_token.srcmap = Some(SourcePos::new(start, end - marker_len));
202                    start_map_pos = end - marker_len;
203                }
204
205                new_token.srcmap = state.get_map(start_map_pos, end_map_pos);
206
207                // remove empty node as a small optimization so we can do less work later
208                if opener.remaining == 0 { state.node.children.pop(); }
209
210                new_min_opener_idx = 0;
211                state.node.children.push(new_token);
212
213            }
214        }
215
216        if opener.remaining > 0 {
217            state.node.children[idx].replace(opener);
218        } // otherwise node was already deleted
219    }
220
221    if new_min_opener_idx != 0 {
222        // If match for this delimiter run failed, we want to set lower bound for
223        // future lookups. This is required to make sure algorithm has linear
224        // complexity.
225        //
226        // See details here:
227        // https://github.com/commonmark/cmark/issues/178#issuecomment-270417442
228        //
229        let openers_for_marker = state.node.ext.get_or_insert_default::<OpenersBottom<MARKER>>();
230        openers_for_marker.0[openers_parameter] = new_min_opener_idx;
231    }
232
233    // remove empty node as a small optimization so we can do less work later
234    if closer.remaining > 0 {
235        closer_token.replace(closer);
236        closer_token
237    } else {
238        state.node.children.pop().unwrap()
239    }
240}
241
242
243fn is_odd_match(opener: &EmphMarker, closer: &EmphMarker) -> bool {
244    // from spec:
245    //
246    // If one of the delimiters can both open and close emphasis, then the
247    // sum of the lengths of the delimiter runs containing the opening and
248    // closing delimiters must not be a multiple of 3 unless both lengths
249    // are multiples of 3.
250    //
251    #[allow(clippy::collapsible_if)]
252    if opener.close || closer.open {
253        if (opener.length + closer.length) % 3 == 0 {
254            if opener.length % 3 != 0 || closer.length % 3 != 0 {
255                return true;
256            }
257        }
258    }
259
260    false
261}
262
263
264#[doc(hidden)]
265pub struct FragmentsJoin;
266impl CoreRule for FragmentsJoin {
267    fn run(node: &mut Node, _: &MarkdownIt) {
268        node.walk_mut(|node, _| fragments_join(node));
269    }
270}
271
272
273/// Clean up tokens after emphasis and strikethrough postprocessing:
274/// merge adjacent text nodes into one and re-calculate all token levels
275///
276/// This is necessary because initially emphasis delimiter markers (*, _, ~)
277/// are treated as their own separate text tokens. Then emphasis rule either
278/// leaves them as text (needed to merge with adjacent text) or turns them
279/// into opening/closing tags (which messes up levels inside).
280///
281fn fragments_join(node: &mut Node) {
282    // replace all emph markers with text tokens
283    for token in node.children.iter_mut() {
284        if let Some(data) = token.cast::<EmphMarker>() {
285            let content = data.marker.to_string().repeat(data.remaining);
286            token.replace(Text { content });
287        }
288    }
289
290    // collapse adjacent text tokens
291    for idx in 1..node.children.len() {
292        let ( tokens1, tokens2 ) = node.children.split_at_mut(idx);
293
294        let token1 = tokens1.last_mut().unwrap();
295        let Some(t1_data) = token1.cast_mut::<Text>() else { continue; };
296
297        let token2 = tokens2.first_mut().unwrap();
298        let Some(t2_data) = token2.cast_mut::<Text>() else { continue; };
299
300        // concat contents
301        let t2_content = std::mem::take(&mut t2_data.content);
302        t1_data.content += &t2_content;
303
304        // adjust source maps
305        if let Some(map1) = token1.srcmap {
306            if let Some(map2) = token2.srcmap {
307                token1.srcmap = Some(SourcePos::new(
308                    map1.get_byte_offsets().0,
309                    map2.get_byte_offsets().1
310                ));
311            }
312        }
313
314        node.children.swap(idx - 1, idx);
315    }
316
317    // remove all empty tokens
318    node.children.retain(|token| {
319        if let Some(data) = token.cast::<Text>() {
320            !data.content.is_empty()
321        } else {
322            true
323        }
324    });
325}