// markdown_that/generics/inline/emph_pair.rs

//! Structure similar to `*emphasis*` with configurable markers of fixed length.
//!
//! There are many structures in various Markdown flavors that
//! can be implemented with this, namely:
//!
//!  - `*emphasis*` or `_emphasis_` -> `<em>emphasis</em>`
//!  - `**strong**` or `__strong__` -> `<strong>strong</strong>`
//!  - `~~strikethrough~~` -> `<s>strikethrough</s>`
//!  - `==marked==` -> `<mark>marked</mark>`
//!  - `++inserted++` -> `<ins>inserted</ins>`
//!  - `~subscript~` -> `<sub>subscript</sub>`
//!  - `^superscript^` -> `<sup>superscript</sup>`
//!
//! You add a custom structure by using the [add_with] function, which takes the following arguments:
//!  - `MARKER` - the marker character
//!  - `LENGTH` - the length of the opening/closing marker (can be 1, 2 or 3)
//!  - `CAN_SPLIT_WORD` - whether this structure can occur in the middle of a word
//!    (for example, note the difference between `foo*bar*baz` and `foo_bar_baz`
//!    in CommonMark - the first one is an emphasis, the second one isn't)
//!  - `md` - the parser instance
//!  - `f` - a function that should return your custom [Node]
//!
//! Here is an example of implementing superscript in your custom code:
//!
//! ```rust
//! use markdown_that::generics::inline::emph_pair;
//! use markdown_that::{MarkdownThat, Node, NodeValue, Renderer};
//!
//! #[derive(Debug)]
//! struct Superscript;
//! impl NodeValue for Superscript {
//!     fn render(&self, node: &Node, fmt: &mut dyn Renderer) {
//!         fmt.open("sup", &node.attrs);
//!         fmt.contents(&node.children);
//!         fmt.close("sup");
//!     }
//! }
//!
//! let md = &mut MarkdownThat::new();
//! emph_pair::add_with::<'^', 1, true>(md, || Node::new(Superscript));
//!
//! let html = md.parse("e^iπ^+1=0").render();
//! assert_eq!(html.trim(), "e<sup>iπ</sup>+1=0");
//! ```
//!
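//! Markers longer than one character work the same way; only `LENGTH` changes.
//! As a rough sketch along the same lines (the `Strikethrough` node type below is
//! just an illustration, not something provided by this module), `~~strikethrough~~`
//! could be wired up like this:
//!
//! ```rust
//! use markdown_that::generics::inline::emph_pair;
//! use markdown_that::{MarkdownThat, Node, NodeValue, Renderer};
//!
//! #[derive(Debug)]
//! struct Strikethrough;
//! impl NodeValue for Strikethrough {
//!     fn render(&self, node: &Node, fmt: &mut dyn Renderer) {
//!         fmt.open("s", &node.attrs);
//!         fmt.contents(&node.children);
//!         fmt.close("s");
//!     }
//! }
//!
//! let md = &mut MarkdownThat::new();
//! emph_pair::add_with::<'~', 2, true>(md, || Node::new(Strikethrough));
//!
//! let html = md.parse("~~deleted~~").render();
//! assert_eq!(html.trim(), "<s>deleted</s>");
//! ```
//!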
//! Note that these structures have lower priority than the rest of the rules,
//! e.g. `` *foo`bar*baz` `` is parsed as `*foo<code>bar*baz</code>`.
//!
use std::cmp::min;

use crate::common::sourcemap::SourcePos;
use crate::parser::core::CoreRule;
use crate::parser::extset::{MarkdownThatExt, NodeExt};
use crate::parser::inline::builtin::InlineParserRule;
use crate::parser::inline::{InlineRule, InlineState, Text};
use crate::{MarkdownThat, Node, NodeValue};

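// Per-marker configuration stored in the parser's extension set: whether the
// inline rule for this marker has already been registered, plus one node
// constructor per marker length (index 0 = length 1, ..., index 2 = length 3).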
#[derive(Debug, Default)]
struct PairConfig<const MARKER: char> {
    inserted: bool,
    fns: [Option<fn() -> Node>; 3],
}
impl<const MARKER: char> MarkdownThatExt for PairConfig<MARKER> {}

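// Per-node cache for the "openers bottom" optimization (see
// scan_and_match_delimiters): lower bounds for opener lookups, indexed by
// whether the closer can also open (2 options) and by its length modulo 3
// (3 options), hence the 6 slots.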
#[derive(Debug, Default)]
struct OpenersBottom<const MARKER: char>([usize; 6]);
impl<const MARKER: char> NodeExt for OpenersBottom<MARKER> {}

#[derive(Debug, Clone)]
#[doc(hidden)]
pub struct EmphMarker {
    // Starting marker
    pub marker: char,

    // Total length of this series of delimiters.
    pub length: usize,

    // Remaining length not already matched to other delimiters.
    pub remaining: usize,

    // Boolean flags that determine if this delimiter could open or close
    // an emphasis.
    pub open: bool,
    pub close: bool,
}

// this node is supposed to be replaced by an actual emph or text node
impl NodeValue for EmphMarker {}

pub fn add_with<const MARKER: char, const LENGTH: u8, const CAN_SPLIT_WORD: bool>(
    md: &mut MarkdownThat,
    f: fn() -> Node,
) {
    let pair_config = md.ext.get_or_insert_default::<PairConfig<MARKER>>();
    pair_config.fns[LENGTH as usize - 1] = Some(f);

    if !pair_config.inserted {
        pair_config.inserted = true;
        md.inline
            .add_rule::<EmphPairScanner<MARKER, CAN_SPLIT_WORD>>();
    }

    if !md.has_rule::<FragmentsJoin>() {
        md.add_rule::<FragmentsJoin>()
            .before_all()
            .after::<InlineParserRule>();
    }
}

#[doc(hidden)]
pub struct EmphPairScanner<const MARKER: char, const CAN_SPLIT_WORD: bool>;
impl<const MARKER: char, const CAN_SPLIT_WORD: bool> InlineRule
    for EmphPairScanner<MARKER, CAN_SPLIT_WORD>
{
    const MARKER: char = MARKER;

    // this rule works on a closing marker, so for technical reasons any rules trying to skip it
    // should see just plain text
    fn check(_: &mut InlineState) -> Option<usize> {
        None
    }

    fn run(state: &mut InlineState) -> Option<(Node, usize)> {
        let mut chars = state.src[state.pos..state.pos_max].chars();
        if chars.next().unwrap() != MARKER {
            return None;
        }

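        // scan the whole run of MARKER characters at the current position;
        // the scan records its length and whether it can open and/or close emphasis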
        let scanned = state.scan_delims(state.pos, CAN_SPLIT_WORD);
        let mut node = Node::new(EmphMarker {
            marker: MARKER,
            length: scanned.length,
            remaining: scanned.length,
            open: scanned.can_open,
            close: scanned.can_close,
        });
        node.srcmap = state.get_map(state.pos, state.pos + scanned.length);
        node = scan_and_match_delimiters::<MARKER>(state, node);
        let map = node.srcmap.unwrap().get_byte_offsets();
        // backtrack to keep correct source maps
        state.pos += scanned.length;
        let token_len = map.1 - map.0;
        state.pos -= token_len;
        Some((node, token_len))
    }
}

/// Assuming the last token is a closing delimiter we just inserted,
/// try to find opener(s). If any are found, move the enclosed tokens into a nested emph node.
fn scan_and_match_delimiters<const MARKER: char>(
    state: &mut InlineState,
    mut closer_token: Node,
) -> Node {
    // must have at least an opener and a closer
    if state.node.children.is_empty() {
        return closer_token;
    }

    let mut closer = closer_token.cast_mut::<EmphMarker>().unwrap().clone();
    if !closer.close {
        return closer_token;
    }

    // Previously calculated lower bounds (previous fails)
    // for each marker, each delimiter length modulo 3,
    // and for whether this closer can be an opener;
    // https://github.com/commonmark/cmark/commit/34250e12ccebdc6372b8b49c44fab57c72443460
    let openers_for_marker = state
        .node
        .ext
        .get_or_insert_default::<OpenersBottom<MARKER>>();
    let openers_parameter = (closer.open as usize) * 3 + closer.length % 3;

    let min_opener_idx = openers_for_marker.0[openers_parameter];

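    // walk the existing children backwards looking for a matching opener,
    // never going below the cached lower bound for this marker/length class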
    let mut idx = state.node.children.len() - 1;
    let mut new_min_opener_idx = idx;
    while idx > min_opener_idx {
        idx -= 1;

        let Some(opener) = state.node.children[idx].cast::<EmphMarker>() else {
            continue;
        };

        let mut opener = opener.clone();
        if opener.open && opener.marker == closer.marker && !is_odd_match(&opener, &closer) {
            while closer.remaining > 0 && opener.remaining > 0 {
                let max_marker_len = min(3, min(opener.remaining, closer.remaining));
                let mut matched_rule = None;
                let fns = &state.md.ext.get::<PairConfig<MARKER>>().unwrap().fns;
                for marker_len in (1..=max_marker_len).rev() {
                    if let Some(f) = fns[marker_len - 1] {
                        matched_rule = Some((marker_len, f));
                        break;
                    }
                }

                // If no matched_rule is found, it can only mean that the function is defined for a larger marker
                // than we have (e.g., a function defined for **, while we have *).
                // Treat this as "marker not found".
                if matched_rule.is_none() {
                    break;
                }

                let (marker_len, marker_fn) = matched_rule.unwrap();

                closer.remaining -= marker_len;
                opener.remaining -= marker_len;

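                // create the emph node and move everything parsed between the
                // opener and this closer into it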
                let mut new_token = marker_fn();
                new_token.children = state.node.children.split_off(idx + 1);

                // cut marker_len chars from start, e.g. "12345" -> "345"
                let mut end_map_pos = 0;
                if let Some(map) = closer_token.srcmap {
                    let (start, end) = map.get_byte_offsets();
                    closer_token.srcmap = Some(SourcePos::new(start + marker_len, end));
                    end_map_pos = start + marker_len;
                }

                // cut marker_len chars from end, e.g. "12345" -> "123"
                let mut start_map_pos = 0;
                let opener_token = state.node.children.last_mut().unwrap();
                if let Some(map) = opener_token.srcmap {
                    let (start, end) = map.get_byte_offsets();
                    opener_token.srcmap = Some(SourcePos::new(start, end - marker_len));
                    start_map_pos = end - marker_len;
                }

                new_token.srcmap = state.get_map(start_map_pos, end_map_pos);

                // remove the empty node as a small optimization so we can do less work later
                if opener.remaining == 0 {
                    state.node.children.pop();
                }

                new_min_opener_idx = 0;
                state.node.children.push(new_token);
            }
        }

        if opener.remaining > 0 {
            state.node.children[idx].replace(opener);
        } // otherwise node was already deleted
    }

    if new_min_opener_idx != 0 {
        // If the match for this delimiter run failed, we want to set a lower bound for
        // future lookups. This is required to make sure the algorithm has linear
        // complexity.
        //
        // See details here:
        // https://github.com/commonmark/cmark/issues/178#issuecomment-270417442
        //
        let openers_for_marker = state
            .node
            .ext
            .get_or_insert_default::<OpenersBottom<MARKER>>();
        openers_for_marker.0[openers_parameter] = new_min_opener_idx;
    }

    // remove the empty node as a small optimization so we can do less work later
    if closer.remaining > 0 {
        closer_token.replace(closer);
        closer_token
    } else {
        state.node.children.pop().unwrap()
    }
}

fn is_odd_match(opener: &EmphMarker, closer: &EmphMarker) -> bool {
    // from spec:
    //
    // If one of the delimiters can both open and close emphasis, then the
    // sum of the lengths of the delimiter runs containing the opening and
    // closing delimiters must not be a multiple of 3 unless both lengths
    // are multiples of 3.
    //
    #[allow(clippy::collapsible_if)]
    if opener.close || closer.open {
        if (opener.length + closer.length) % 3 == 0 {
            if opener.length % 3 != 0 || closer.length % 3 != 0 {
                return true;
            }
        }
    }

    false
}

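// Core rule registered by `add_with`: runs right after the inline parser and
// turns any leftover emph markers back into text, merging adjacent text nodes
// (see `fragments_join` below).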
#[doc(hidden)]
pub struct FragmentsJoin;
impl CoreRule for FragmentsJoin {
    fn run(node: &mut Node, _: &MarkdownThat) {
        node.walk_mut(|node, _| fragments_join(node));
    }
}

/// Clean up tokens after emphasis and strikethrough postprocessing:
/// replace leftover delimiter markers with text and merge adjacent text nodes into one.
///
/// This is necessary because emphasis delimiter markers (`*`, `_`, `~`) are
/// initially inserted as their own separate marker tokens. The emphasis rule then
/// either turns them into opening/closing tags or leaves them unmatched, in which
/// case they have to be converted back to text and merged with adjacent text.
///
fn fragments_join(node: &mut Node) {
    // replace all emph markers with text tokens
    for token in node.children.iter_mut() {
        if let Some(data) = token.cast::<EmphMarker>() {
            let content = data.marker.to_string().repeat(data.remaining);
            token.replace(Text { content });
        }
    }

    // collapse adjacent text tokens
    for idx in 1..node.children.len() {
        let (tokens1, tokens2) = node.children.split_at_mut(idx);

        let token1 = tokens1.last_mut().unwrap();
        let Some(t1_data) = token1.cast_mut::<Text>() else {
            continue;
        };

        let token2 = tokens2.first_mut().unwrap();
        let Some(t2_data) = token2.cast_mut::<Text>() else {
            continue;
        };

        // concat contents
        let t2_content = std::mem::take(&mut t2_data.content);
        t1_data.content += &t2_content;

        // adjust source maps
        if let Some(map1) = token1.srcmap {
            if let Some(map2) = token2.srcmap {
                token1.srcmap = Some(SourcePos::new(
                    map1.get_byte_offsets().0,
                    map2.get_byte_offsets().1,
                ));
            }
        }

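        // move the merged text forward so the next iteration can keep absorbing
        // adjacent text tokens; the emptied token gets dropped by retain() below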
        node.children.swap(idx - 1, idx);
    }

    // remove all empty tokens
    node.children.retain(|token| {
        if let Some(data) = token.cast::<Text>() {
            !data.content.is_empty()
        } else {
            true
        }
    });
}