// rumdl_lib/utils/emphasis_utils.rs

use regex::Regex;
use std::sync::LazyLock;

4// Better detection of inline code with support for multiple backticks
5static INLINE_CODE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(`+)([^`]|[^`].*?[^`])(`+)").unwrap());
6
7// Inline math pattern - matches both $...$ and $$...$$ syntax
8// The pattern allows zero or more characters between delimiters to handle empty math spans
9static INLINE_MATH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\$\$[^$]*\$\$|\$[^$\n]*\$").unwrap());
10
11// Documentation style patterns
12static DOC_METADATA_PATTERN: LazyLock<Regex> =
13    LazyLock::new(|| Regex::new(r"^\s*\*?\s*\*\*(?:[^*\s][^*]*[^*\s]|[^*\s])\*\*\s*:").unwrap());
14
15// Bold text pattern (for preserving bold text in documentation) - only match valid bold without spaces
16static BOLD_TEXT_PATTERN: LazyLock<Regex> =
17    LazyLock::new(|| Regex::new(r"\*\*[^*\s][^*]*[^*\s]\*\*|\*\*[^*\s]\*\*").unwrap());
18
19// Pre-compiled patterns for quick checks
20static QUICK_DOC_CHECK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*\*\s+\*").unwrap());
21static QUICK_BOLD_CHECK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\*\*[^*\s]").unwrap());
22
23// Template/shortcode syntax pattern - {* ... *} used by documentation systems like FastAPI/MkDocs
24// These are not emphasis markers but template directives for code inclusion/highlighting
25static TEMPLATE_SHORTCODE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\{\*.*\*\}").unwrap());
26
/// A run of one or two emphasis delimiter bytes found in a line.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EmphasisMarker {
    /// Delimiter byte, `b'*'` or `b'_'` (kept as a byte for cheap comparison).
    pub marker_type: u8,
    /// Number of delimiter bytes in the run: 1 for single, 2 for double.
    pub count: u8,
    /// Byte offset of the first delimiter within the line.
    pub start_pos: usize,
}

35impl EmphasisMarker {
36    #[inline]
37    pub fn end_pos(&self) -> usize {
38        self.start_pos + self.count as usize
39    }
40
41    #[inline]
42    pub fn as_char(&self) -> char {
43        self.marker_type as char
44    }
45}
46
47/// Represents a complete emphasis span
48#[derive(Debug, Clone)]
49pub struct EmphasisSpan {
50    pub opening: EmphasisMarker,
51    pub closing: EmphasisMarker,
52    pub content: String,
53    pub has_leading_space: bool,
54    pub has_trailing_space: bool,
55}
56
57/// Enhanced inline code replacement with optimized performance
58/// Replaces inline code with 'X' characters to prevent false positives in emphasis detection
59#[inline]
60pub fn replace_inline_code(line: &str) -> String {
61    // Quick check: if no backticks, return original
62    if !line.contains('`') {
63        return line.to_string();
64    }
65
66    let mut result = line.to_string();
67    let mut offset = 0;
68
69    for cap in INLINE_CODE.captures_iter(line) {
70        if let (Some(full_match), Some(_opening), Some(_content), Some(_closing)) =
71            (cap.get(0), cap.get(1), cap.get(2), cap.get(3))
72        {
73            let match_start = full_match.start();
74            let match_end = full_match.end();
75            // Use 'X' instead of spaces to avoid false positives for "spaces in emphasis"
76            let placeholder = "X".repeat(match_end - match_start);
77
78            result.replace_range(match_start + offset..match_end + offset, &placeholder);
79            offset += placeholder.len() - (match_end - match_start);
80        }
81    }
82
83    result
84}
85
86/// Replace inline math ($...$ and $$...$$) with placeholder characters
87/// This prevents math content from being mistaken for emphasis markers
88pub fn replace_inline_math(line: &str) -> String {
89    // Quick check: if no dollar signs, return original
90    if !line.contains('$') {
91        return line.to_string();
92    }
93
94    let mut result = line.to_string();
95    let mut offset: isize = 0;
96
97    for m in INLINE_MATH.find_iter(line) {
98        let match_start = m.start();
99        let match_end = m.end();
100        // Use 'M' instead of spaces or asterisks to avoid affecting emphasis detection
101        let placeholder = "M".repeat(match_end - match_start);
102
103        let adjusted_start = (match_start as isize + offset) as usize;
104        let adjusted_end = (match_end as isize + offset) as usize;
105        result.replace_range(adjusted_start..adjusted_end, &placeholder);
106        offset += placeholder.len() as isize - (match_end - match_start) as isize;
107    }
108
109    result
110}
111
112/// Optimized emphasis marker parsing using byte iteration
113#[inline]
114pub fn find_emphasis_markers(line: &str) -> Vec<EmphasisMarker> {
115    // Early return for lines without emphasis markers
116    if !line.contains('*') && !line.contains('_') {
117        return Vec::new();
118    }
119
120    let mut markers = Vec::new();
121    let bytes = line.as_bytes();
122    let mut i = 0;
123
124    while i < bytes.len() {
125        let byte = bytes[i];
126        if byte == b'*' || byte == b'_' {
127            let start_pos = i;
128            let mut count = 1u8;
129
130            // Count consecutive markers (limit to avoid overflow)
131            while i + (count as usize) < bytes.len() && bytes[i + (count as usize)] == byte && count < 3 {
132                count += 1;
133            }
134
135            // Only consider single (*) and double (**) markers
136            if count == 1 || count == 2 {
137                markers.push(EmphasisMarker {
138                    marker_type: byte,
139                    count,
140                    start_pos,
141                });
142            }
143
144            i += count as usize;
145        } else {
146            i += 1;
147        }
148    }
149
150    markers
151}
152
153/// Find all emphasis spans in a line, excluding only single emphasis (not strong)
154pub fn find_single_emphasis_spans(line: &str, markers: &[EmphasisMarker]) -> Vec<EmphasisSpan> {
155    // Early return for insufficient markers
156    if markers.len() < 2 {
157        return Vec::new();
158    }
159
160    let mut spans = Vec::new();
161    let mut used_markers = vec![false; markers.len()];
162
163    // Process markers in pairs more efficiently
164    for i in 0..markers.len() {
165        if used_markers[i] || markers[i].count != 1 {
166            continue;
167        }
168
169        let opening = &markers[i];
170
171        // Look for the nearest matching closing marker using optimized search
172        for j in (i + 1)..markers.len() {
173            if used_markers[j] {
174                continue;
175            }
176
177            let closing = &markers[j];
178
179            // Quick type and count check - only single emphasis
180            if closing.marker_type == opening.marker_type && closing.count == 1 {
181                let content_start = opening.end_pos();
182                let content_end = closing.start_pos;
183
184                if content_end > content_start {
185                    let content = &line[content_start..content_end];
186
187                    // Optimized validation checks
188                    if is_valid_emphasis_content_fast(content) && is_valid_emphasis_span_fast(line, opening, closing) {
189                        // Quick check for crossing markers
190                        let crosses_markers = markers[i + 1..j]
191                            .iter()
192                            .any(|marker| marker.marker_type == opening.marker_type && marker.count == 1);
193
194                        if !crosses_markers {
195                            let has_leading_space = content.starts_with(' ') || content.starts_with('\t');
196                            let has_trailing_space = content.ends_with(' ') || content.ends_with('\t');
197
198                            spans.push(EmphasisSpan {
199                                opening: opening.clone(),
200                                closing: closing.clone(),
201                                content: content.to_string(),
202                                has_leading_space,
203                                has_trailing_space,
204                            });
205
206                            // Mark both markers as used
207                            used_markers[i] = true;
208                            used_markers[j] = true;
209                            break;
210                        }
211                    }
212                }
213            }
214        }
215    }
216
217    spans
218}
219
220/// Optimized emphasis span finding with reduced complexity (includes both single and strong)
221pub fn find_emphasis_spans(line: &str, markers: &[EmphasisMarker]) -> Vec<EmphasisSpan> {
222    // Early return for insufficient markers
223    if markers.len() < 2 {
224        return Vec::new();
225    }
226
227    let mut spans = Vec::new();
228    let mut used_markers = vec![false; markers.len()];
229
230    // Process markers in pairs more efficiently
231    for i in 0..markers.len() {
232        if used_markers[i] {
233            continue;
234        }
235
236        let opening = &markers[i];
237
238        // Look for the nearest matching closing marker using optimized search
239        for j in (i + 1)..markers.len() {
240            if used_markers[j] {
241                continue;
242            }
243
244            let closing = &markers[j];
245
246            // Quick type and count check
247            if closing.marker_type == opening.marker_type && closing.count == opening.count {
248                let content_start = opening.end_pos();
249                let content_end = closing.start_pos;
250
251                if content_end > content_start {
252                    let content = &line[content_start..content_end];
253
254                    // Optimized validation checks
255                    if is_valid_emphasis_content_fast(content) && is_valid_emphasis_span_fast(line, opening, closing) {
256                        // Quick check for crossing markers
257                        let crosses_markers = markers[i + 1..j]
258                            .iter()
259                            .any(|marker| marker.marker_type == opening.marker_type);
260
261                        if !crosses_markers {
262                            let has_leading_space = content.starts_with(' ') || content.starts_with('\t');
263                            let has_trailing_space = content.ends_with(' ') || content.ends_with('\t');
264
265                            spans.push(EmphasisSpan {
266                                opening: opening.clone(),
267                                closing: closing.clone(),
268                                content: content.to_string(),
269                                has_leading_space,
270                                has_trailing_space,
271                            });
272
273                            // Mark both markers as used
274                            used_markers[i] = true;
275                            used_markers[j] = true;
276                            break;
277                        }
278                    }
279                }
280            }
281        }
282    }
283
284    spans
285}
286
287/// Fast validation of emphasis span context
288#[inline]
289fn is_valid_emphasis_span_fast(line: &str, opening: &EmphasisMarker, closing: &EmphasisMarker) -> bool {
290    let content_start = opening.end_pos();
291    let content_end = closing.start_pos;
292
293    // Content must exist and not be just whitespace
294    if content_end <= content_start {
295        return false;
296    }
297
298    let content = &line[content_start..content_end];
299    if content.trim().is_empty() {
300        return false;
301    }
302
303    // Quick boundary checks using byte indexing
304    let bytes = line.as_bytes();
305
306    // Opening should be at start or after valid character
307    let valid_opening = opening.start_pos == 0
308        || matches!(
309            bytes.get(opening.start_pos.saturating_sub(1)),
310            Some(&b' ')
311                | Some(&b'\t')
312                | Some(&b'(')
313                | Some(&b'[')
314                | Some(&b'{')
315                | Some(&b'"')
316                | Some(&b'\'')
317                | Some(&b'>')
318        );
319
320    // Closing should be at end or before valid character
321    let valid_closing = closing.end_pos() >= bytes.len()
322        || matches!(
323            bytes.get(closing.end_pos()),
324            Some(&b' ')
325                | Some(&b'\t')
326                | Some(&b')')
327                | Some(&b']')
328                | Some(&b'}')
329                | Some(&b'"')
330                | Some(&b'\'')
331                | Some(&b'.')
332                | Some(&b',')
333                | Some(&b'!')
334                | Some(&b'?')
335                | Some(&b';')
336                | Some(&b':')
337                | Some(&b'<')
338        );
339
340    valid_opening && valid_closing && !content.contains('\n')
341}
342
/// Returns `true` when `content` contains at least one non-whitespace character.
#[inline]
fn is_valid_emphasis_content_fast(content: &str) -> bool {
    content.chars().any(|c| !c.is_whitespace())
}

349/// Check if line has documentation patterns that should be preserved
350pub fn has_doc_patterns(line: &str) -> bool {
351    // Check for template/shortcode syntax like {* ... *} used by FastAPI/MkDocs
352    // These contain asterisks that are not emphasis markers
353    if line.contains("{*") && TEMPLATE_SHORTCODE_PATTERN.is_match(line) {
354        return true;
355    }
356
357    (QUICK_DOC_CHECK.is_match(line) || QUICK_BOLD_CHECK.is_match(line))
358        && (DOC_METADATA_PATTERN.is_match(line) || BOLD_TEXT_PATTERN.is_match(line))
359}
360
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_emphasis_marker_parsing() {
        let markers = find_emphasis_markers("This has *single* and **double** emphasis");
        assert_eq!(markers.len(), 4); // *, *, **, **

        let markers = find_emphasis_markers("*start* and *end*");
        assert_eq!(markers.len(), 4); // *, *, *, *
    }

    #[test]
    fn test_single_emphasis_span_detection() {
        let markers = find_emphasis_markers("This has *valid* emphasis and **strong** too");
        let spans = find_single_emphasis_spans("This has *valid* emphasis and **strong** too", &markers);
        assert_eq!(spans.len(), 1); // Only the single emphasis
        assert_eq!(spans[0].content, "valid");
        assert!(!spans[0].has_leading_space);
        assert!(!spans[0].has_trailing_space);
    }

    #[test]
    fn test_emphasis_with_spaces() {
        let markers = find_emphasis_markers("This has * invalid * emphasis");
        let spans = find_emphasis_spans("This has * invalid * emphasis", &markers);
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].content, " invalid ");
        assert!(spans[0].has_leading_space);
        assert!(spans[0].has_trailing_space);
    }

    #[test]
    fn test_mixed_markers() {
        let markers = find_emphasis_markers("This has *asterisk* and _underscore_ emphasis");
        let spans = find_single_emphasis_spans("This has *asterisk* and _underscore_ emphasis", &markers);
        assert_eq!(spans.len(), 2);
        assert_eq!(spans[0].opening.as_char(), '*');
        assert_eq!(spans[1].opening.as_char(), '_');
    }

    #[test]
    fn test_replace_inline_code_preserves_length() {
        // Lines without backticks are returned unchanged
        assert_eq!(replace_inline_code("no code here"), "no code here");

        // The whole span, backticks included, is masked with one 'X' per byte
        assert_eq!(replace_inline_code("use `*args*` here"), "use XXXXXXXX here");

        // Multi-byte content is masked byte for byte, so offsets stay aligned
        assert_eq!(replace_inline_code("`é`"), "XXXX");
    }

    #[test]
    fn test_replace_inline_math_preserves_length() {
        // Lines without dollar signs are returned unchanged
        assert_eq!(replace_inline_math("plain text"), "plain text");

        // Both $...$ and $$...$$ are masked with one 'M' per byte
        assert_eq!(
            replace_inline_math("cost $a*b$ and $$c_d$$"),
            "cost MMMMM and MMMMMMM"
        );
    }

    #[test]
    fn test_template_shortcode_detection() {
        // FastAPI/MkDocs style template syntax should be detected as doc pattern
        assert!(has_doc_patterns(
            "{* ../../docs_src/cookie_param_models/tutorial001.py hl[9:12,16] *}"
        ));
        assert!(has_doc_patterns(
            "{* ../../docs_src/conditional_openapi/tutorial001.py hl[6,11] *}"
        ));
        // Simple shortcode
        assert!(has_doc_patterns("{* file.py *}"));
        // With path and options
        assert!(has_doc_patterns("{* ../path/to/file.py ln[1-10] *}"));

        // Regular emphasis should NOT match
        assert!(!has_doc_patterns("This has *emphasis* text"));
        assert!(!has_doc_patterns("This has * spaces * in emphasis"));
        // Only opening brace without closing should not match
        assert!(!has_doc_patterns("{* incomplete"));
    }

    #[test]
    fn test_doc_pattern_rejects_spaced_bold_metadata() {
        // Valid bold metadata — should be treated as doc pattern (skip MD037)
        assert!(has_doc_patterns("**Key**: value"));
        assert!(has_doc_patterns("**Name**: another value"));
        assert!(has_doc_patterns("**X**: single char"));
        assert!(has_doc_patterns("* **Key**: list item with bold key"));

        // Broken bold with internal spaces — should NOT be treated as doc pattern
        // so MD037 can flag the spacing issue
        assert!(!has_doc_patterns("** Key**: value"));
        assert!(!has_doc_patterns("**Key **: value"));
        assert!(!has_doc_patterns("** Key **: value"));
        assert!(!has_doc_patterns(
            "** Explicit Import**: Convert markdownlint configs to rumdl format:"
        ));
    }
}