ferromark 0.1.3

Ultra-high-performance Markdown to HTML compiler
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
//! Emphasis and strong emphasis resolution.
//!
//! Uses the "modulo-3" stack optimization from md4c to efficiently
//! match emphasis openers and closers according to CommonMark rules.

use super::marks::{Mark, flags};

/// Result of emphasis resolution for a mark pair.
#[derive(Debug, Clone, Copy)]
pub struct EmphasisMatch {
    /// Start position of opener delimiter(s).
    pub opener_start: u32,
    /// End position of opener delimiter(s).
    pub opener_end: u32,
    /// Start position of closer delimiter(s).
    pub closer_start: u32,
    /// End position of closer delimiter(s).
    pub closer_end: u32,
    /// Number of characters matched (1 for emphasis, 2 for strong).
    pub count: u32,
}

/// Reusable emphasis stacks to avoid per-parse allocations.
#[derive(Default)]
pub struct EmphasisStacks {
    stacks: [Vec<OpenerEntry>; 6],
    order: usize,
}

impl EmphasisStacks {
    pub fn clear(&mut self) {
        for stack in &mut self.stacks {
            stack.clear();
        }
        self.order = 0;
    }

    pub fn reserve_for_marks(&mut self, marks_len: usize) {
        let target_per_stack = (marks_len / 6).max(8);
        for stack in &mut self.stacks {
            if stack.capacity() < target_per_stack {
                stack.reserve(target_per_stack - stack.capacity());
            }
        }
    }
}

/// Resolve emphasis marks using modulo-3 stacks.
/// Returns a list of matched pairs.
/// `link_boundaries` contains (start, text_end) pairs for each resolved link.
/// Emphasis delimiters cannot match if they cross a link boundary.
#[cfg(test)]
pub fn resolve_emphasis(marks: &mut [Mark], link_boundaries: &[(u32, u32)]) -> Vec<EmphasisMatch> {
    let mut stacks = EmphasisStacks::default();
    resolve_emphasis_with_stacks(marks, link_boundaries, &mut stacks)
}

#[cfg(test)]
pub fn resolve_emphasis_with_stacks(
    marks: &mut [Mark],
    link_boundaries: &[(u32, u32)],
    stacks: &mut EmphasisStacks,
) -> Vec<EmphasisMatch> {
    let mut matches = Vec::new();
    resolve_emphasis_with_stacks_into(marks, link_boundaries, stacks, &mut matches);
    matches
}

pub fn resolve_emphasis_with_stacks_into(
    marks: &mut [Mark],
    link_boundaries: &[(u32, u32)],
    stacks: &mut EmphasisStacks,
    matches: &mut Vec<EmphasisMatch>,
) {
    stacks.reserve_for_marks(marks.len());
    stacks.clear();
    matches.clear();
    let target_matches = (marks.len() / 4).max(8);
    if matches.capacity() < target_matches {
        matches.reserve(target_matches - matches.capacity());
    }
    let mut resolver = EmphasisResolver::new(link_boundaries, stacks);

    // Process marks left to right
    for i in 0..marks.len() {
        let mark = &marks[i];

        // Skip non-emphasis marks or those inside code spans
        if (mark.ch != b'*' && mark.ch != b'_') || mark.flags & flags::IN_CODE != 0 {
            continue;
        }

        if mark.can_close() {
            // Keep trying to close while we can find openers
            loop {
                let mark = &marks[i];
                if mark.is_resolved() || mark.len() == 0 {
                    break;
                }
                if !mark.can_close() {
                    break;
                }

                // Try to find a matching opener
                if let Some((opener_idx, match_count)) = resolver.find_opener(marks, i) {
                    // Record positions BEFORE modifying marks
                    let opener = &marks[opener_idx];
                    let closer = &marks[i];

                    // Opener delimiter is at the END of the opener mark (rightmost chars)
                    let opener_delim_start = opener.end - match_count;
                    let opener_delim_end = opener.end;

                    // Closer delimiter is at the START of the closer mark (leftmost chars)
                    let closer_delim_start = closer.pos;
                    let closer_delim_end = closer.pos + match_count;

                    matches.push(EmphasisMatch {
                        opener_start: opener_delim_start,
                        opener_end: opener_delim_end,
                        closer_start: closer_delim_start,
                        closer_end: closer_delim_end,
                        count: match_count,
                    });

                    // CommonMark: Remove all openers between opener and closer
                    // They can no longer form valid matches since we've closed past them
                    resolver.remove_openers_between(marks, opener_idx, i);

                    // Consume characters from both marks
                    let opener = &mut marks[opener_idx];
                    let opener_remaining = opener.len() - match_count;
                    if opener_remaining == 0 {
                        opener.resolve();
                    } else {
                        // Shrink opener from the right
                        opener.end -= match_count;
                        // Re-push opener to correct stack based on new length
                        resolver.push_opener(marks, opener_idx);
                    }

                    let closer = &mut marks[i];
                    let closer_remaining = closer.len() - match_count;
                    if closer_remaining == 0 {
                        closer.resolve();
                        break;
                    } else {
                        // Shrink closer from the left
                        closer.pos += match_count;
                        // Continue the loop to try matching more
                    }
                } else {
                    // No more openers to match, break out of the loop
                    break;
                }
            }

            // After closing, if closer still has characters and can open, push it
            let mark = &marks[i];
            if !mark.is_resolved() && mark.len() > 0 && mark.can_open() {
                resolver.push_opener(marks, i);
            }
        } else if mark.can_open() {
            resolver.push_opener(marks, i);
        }
    }
}

/// Entry in the opener stack with ordering info.
#[derive(Debug, Clone, Copy)]
struct OpenerEntry {
    /// Index into marks array.
    mark_idx: usize,
    /// Global push order for finding most recent opener.
    order: usize,
}

/// Emphasis resolver with 6 stacks (2 chars x 3 modulo classes).
struct EmphasisResolver<'a> {
    /// Stacks indexed by: (is_underscore ? 3 : 0) + (run_length % 3)
    stacks: &'a mut [Vec<OpenerEntry>; 6],
    /// Global order counter.
    order: &'a mut usize,
    /// Link boundaries (start, text_end) - emphasis can't cross these.
    link_boundaries: &'a [(u32, u32)],
}

impl<'a> EmphasisResolver<'a> {
    fn new(link_boundaries: &'a [(u32, u32)], stacks: &'a mut EmphasisStacks) -> Self {
        Self {
            stacks: &mut stacks.stacks,
            order: &mut stacks.order,
            link_boundaries,
        }
    }

    /// Find which link boundary (if any) a position is inside.
    /// Returns Some(index) if inside a link, None if outside all links.
    fn link_boundary_for(&self, pos: u32) -> Option<usize> {
        for (i, &(start, end)) in self.link_boundaries.iter().enumerate() {
            if pos >= start && pos < end {
                return Some(i);
            }
        }
        None
    }

    /// Get stack index for a mark.
    fn stack_index(ch: u8, run_len: u32) -> usize {
        let char_offset = if ch == b'_' { 3 } else { 0 };
        char_offset + (run_len as usize % 3)
    }

    /// Push an opener to the appropriate stack.
    fn push_opener(&mut self, marks: &[Mark], idx: usize) {
        let mark = &marks[idx];
        let stack_idx = Self::stack_index(mark.ch, mark.len());
        self.stacks[stack_idx].push(OpenerEntry {
            mark_idx: idx,
            order: *self.order,
        });
        *self.order += 1;
    }

    /// Find a matching opener for a closer.
    /// Returns (opener_index, match_count) if found.
    fn find_opener(&mut self, marks: &[Mark], closer_idx: usize) -> Option<(usize, u32)> {
        let closer = &marks[closer_idx];
        let closer_len = closer.len();
        let closer_can_open = closer.can_open();

        // Determine which link boundary the closer is in (if any)
        let closer_link = self.link_boundary_for(closer.pos);

        // CommonMark "rule of three": only applies when one of the delimiters
        // can BOTH open AND close. If neither can both open and close,
        // we can ignore the rule of three entirely.

        let base_idx = if closer.ch == b'_' { 3 } else { 0 };

        // Calculate closer's modulo-3 class (only needed if rule of three applies)
        let closer_mod = closer_len as usize % 3;

        // Find the most recent (highest order) opener across compatible stacks
        let mut best_opener: Option<(usize, OpenerEntry, u32)> = None; // (stack_idx, entry, match_count)

        for opener_mod in 0..3 {
            let stack_idx = base_idx + opener_mod;
            if let Some(&entry) = self.stacks[stack_idx].last() {
                let opener = &marks[entry.mark_idx];

                // Must be same character
                if opener.ch != closer.ch {
                    continue;
                }

                // Opener and closer must be in the same link boundary (or both outside)
                let opener_link = self.link_boundary_for(opener.pos);
                if opener_link != closer_link {
                    continue;
                }

                // Check rule of three: only applies if opener or closer can both open AND close
                let opener_can_close = opener.can_close();
                let rule_of_three_applies = closer_can_open || opener_can_close;

                if rule_of_three_applies {
                    // If (opener_len + closer_len) % 3 == 0, both must be multiples of 3
                    let sum_mod = (opener_mod + closer_mod) % 3;
                    if sum_mod == 0 && (opener_mod != 0 || closer_mod != 0) {
                        // Would violate rule of three
                        continue;
                    }
                }

                // Check if this is more recent than current best
                let dominated = match &best_opener {
                    Some((_, best_entry, _)) => entry.order < best_entry.order,
                    None => false,
                };
                if dominated {
                    continue;
                }

                // Determine how many to match
                let available = opener.len().min(closer_len);
                let actual_match = if available >= 2 { 2 } else { 1 };

                best_opener = Some((stack_idx, entry, actual_match));
            }
        }

        // Pop and return the best opener
        if let Some((stack_idx, entry, match_count)) = best_opener {
            self.stacks[stack_idx].pop();
            Some((entry.mark_idx, match_count))
        } else {
            None
        }
    }

    /// Remove all openers with mark indices between opener_idx and closer_idx.
    /// Per CommonMark spec: delimiters between an opener and closer can no longer
    /// form valid matches once we've closed past them.
    fn remove_openers_between(&mut self, _marks: &[Mark], opener_idx: usize, closer_idx: usize) {
        let _ = closer_idx;
        for stack in self.stacks.iter_mut() {
            while matches!(stack.last(), Some(entry) if entry.mark_idx > opener_idx) {
                stack.pop();
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::inline::code_span::resolve_code_spans;
    use crate::inline::marks::{MarkBuffer, collect_marks};

    fn get_emphasis_matches(text: &[u8]) -> Vec<EmphasisMatch> {
        let mut buffer = MarkBuffer::new();
        collect_marks(text, &mut buffer);
        resolve_code_spans(buffer.marks_mut(), text, &[]);
        resolve_emphasis(buffer.marks_mut(), &[]) // No link boundaries in basic tests
    }

    #[test]
    fn test_simple_emphasis() {
        let matches = get_emphasis_matches(b"hello *world*");
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].count, 1);
    }

    #[test]
    fn test_strong_emphasis() {
        let matches = get_emphasis_matches(b"hello **world**");
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].count, 2);
    }

    #[test]
    fn test_underscore_emphasis() {
        let matches = get_emphasis_matches(b"hello _world_");
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].count, 1);
    }

    #[test]
    fn test_nested_emphasis() {
        let matches = get_emphasis_matches(b"***bold and italic***");
        // Should produce multiple matches
        assert!(!matches.is_empty());
    }

    #[test]
    fn test_no_emphasis_in_code() {
        let matches = get_emphasis_matches(b"`*not emphasis*`");
        // Asterisks inside code should not match
        assert_eq!(matches.len(), 0);
    }

    #[test]
    fn test_mismatched_delimiters() {
        // Asterisk and underscore don't match
        let matches = get_emphasis_matches(b"*hello_");
        assert_eq!(matches.len(), 0);
    }

    #[test]
    fn test_nested_strong_in_em() {
        // *foo **bar***
        // Position: 0123456789012
        // Expected: <em>foo <strong>bar</strong></em>
        let matches = get_emphasis_matches(b"*foo **bar***");

        eprintln!("Matches:");
        for m in &matches {
            let kind = if m.count == 2 { "strong" } else { "em" };
            eprintln!(
                "  {}: opener {}-{}, closer {}-{}",
                kind, m.opener_start, m.opener_end, m.closer_start, m.closer_end
            );
        }

        // Should have 2 matches: one strong, one em
        assert_eq!(matches.len(), 2);

        // The em opener should be at position 0
        let em_match = matches.iter().find(|m| m.count == 1).expect("em match");
        assert_eq!(em_match.opener_start, 0, "em opener should start at 0");
        assert_eq!(em_match.closer_start, 12, "em closer should start at 12");

        // The strong opener should be at position 5
        let strong_match = matches.iter().find(|m| m.count == 2).expect("strong match");
        assert_eq!(
            strong_match.opener_start, 5,
            "strong opener should start at 5"
        );
        assert_eq!(
            strong_match.closer_start, 10,
            "strong closer should start at 10"
        );
    }
}