rumdl_lib/utils/
emphasis_utils.rs

1use regex::Regex;
2use std::sync::LazyLock;
3
4// Better detection of inline code with support for multiple backticks
5static INLINE_CODE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(`+)([^`]|[^`].*?[^`])(`+)").unwrap());
6
7// Inline math pattern - matches both $...$ and $$...$$ syntax
8// The pattern allows zero or more characters between delimiters to handle empty math spans
9static INLINE_MATH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\$\$[^$]*\$\$|\$[^$\n]*\$").unwrap());
10
11// List markers pattern - used to avoid confusion with emphasis
12static LIST_MARKER: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*[*+-]\s+").unwrap());
13
14// Documentation style patterns
15static DOC_METADATA_PATTERN: LazyLock<Regex> =
16    LazyLock::new(|| Regex::new(r"^\s*\*?\s*\*\*(?:[^*\s][^*]*[^*\s]|[^*\s])\*\*\s*:").unwrap());
17
18// Bold text pattern (for preserving bold text in documentation) - only match valid bold without spaces
19static BOLD_TEXT_PATTERN: LazyLock<Regex> =
20    LazyLock::new(|| Regex::new(r"\*\*[^*\s][^*]*[^*\s]\*\*|\*\*[^*\s]\*\*").unwrap());
21
22// Pre-compiled patterns for quick checks
23static QUICK_DOC_CHECK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*\*\s+\*").unwrap());
24static QUICK_BOLD_CHECK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\*\*[^*\s]").unwrap());
25
26// Template/shortcode syntax pattern - {* ... *} used by documentation systems like FastAPI/MkDocs
27// These are not emphasis markers but template directives for code inclusion/highlighting
28static TEMPLATE_SHORTCODE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\{\*.*\*\}").unwrap());
29
/// A run of one or two emphasis delimiter characters found in a line.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EmphasisMarker {
    /// Delimiter byte, `b'*'` or `b'_'`; kept as a byte for cheap comparison.
    pub marker_type: u8,
    /// Number of delimiter characters in the run: 1 for single, 2 for double.
    pub count: u8,
    /// Byte offset of the first delimiter character within the line.
    pub start_pos: usize,
}
37
38impl EmphasisMarker {
39    #[inline]
40    pub fn end_pos(&self) -> usize {
41        self.start_pos + self.count as usize
42    }
43
44    #[inline]
45    pub fn as_char(&self) -> char {
46        self.marker_type as char
47    }
48}
49
50/// Represents a complete emphasis span
51#[derive(Debug, Clone)]
52pub struct EmphasisSpan {
53    pub opening: EmphasisMarker,
54    pub closing: EmphasisMarker,
55    pub content: String,
56    pub has_leading_space: bool,
57    pub has_trailing_space: bool,
58}
59
60/// Enhanced inline code replacement with optimized performance
61/// Replaces inline code with 'X' characters to prevent false positives in emphasis detection
62#[inline]
63pub fn replace_inline_code(line: &str) -> String {
64    // Quick check: if no backticks, return original
65    if !line.contains('`') {
66        return line.to_string();
67    }
68
69    let mut result = line.to_string();
70    let mut offset = 0;
71
72    for cap in INLINE_CODE.captures_iter(line) {
73        if let (Some(full_match), Some(_opening), Some(_content), Some(_closing)) =
74            (cap.get(0), cap.get(1), cap.get(2), cap.get(3))
75        {
76            let match_start = full_match.start();
77            let match_end = full_match.end();
78            // Use 'X' instead of spaces to avoid false positives for "spaces in emphasis"
79            let placeholder = "X".repeat(match_end - match_start);
80
81            result.replace_range(match_start + offset..match_end + offset, &placeholder);
82            offset += placeholder.len() - (match_end - match_start);
83        }
84    }
85
86    result
87}
88
89/// Replace inline math ($...$ and $$...$$) with placeholder characters
90/// This prevents math content from being mistaken for emphasis markers
91pub fn replace_inline_math(line: &str) -> String {
92    // Quick check: if no dollar signs, return original
93    if !line.contains('$') {
94        return line.to_string();
95    }
96
97    let mut result = line.to_string();
98    let mut offset: isize = 0;
99
100    for m in INLINE_MATH.find_iter(line) {
101        let match_start = m.start();
102        let match_end = m.end();
103        // Use 'M' instead of spaces or asterisks to avoid affecting emphasis detection
104        let placeholder = "M".repeat(match_end - match_start);
105
106        let adjusted_start = (match_start as isize + offset) as usize;
107        let adjusted_end = (match_end as isize + offset) as usize;
108        result.replace_range(adjusted_start..adjusted_end, &placeholder);
109        offset += placeholder.len() as isize - (match_end - match_start) as isize;
110    }
111
112    result
113}
114
115/// Optimized emphasis marker parsing using byte iteration
116#[inline]
117pub fn find_emphasis_markers(line: &str) -> Vec<EmphasisMarker> {
118    // Early return for lines without emphasis markers
119    if !line.contains('*') && !line.contains('_') {
120        return Vec::new();
121    }
122
123    let mut markers = Vec::new();
124    let bytes = line.as_bytes();
125    let mut i = 0;
126
127    while i < bytes.len() {
128        let byte = bytes[i];
129        if byte == b'*' || byte == b'_' {
130            let start_pos = i;
131            let mut count = 1u8;
132
133            // Count consecutive markers (limit to avoid overflow)
134            while i + (count as usize) < bytes.len() && bytes[i + (count as usize)] == byte && count < 3 {
135                count += 1;
136            }
137
138            // Only consider single (*) and double (**) markers
139            if count == 1 || count == 2 {
140                markers.push(EmphasisMarker {
141                    marker_type: byte,
142                    count,
143                    start_pos,
144                });
145            }
146
147            i += count as usize;
148        } else {
149            i += 1;
150        }
151    }
152
153    markers
154}
155
156/// Find all emphasis spans in a line, excluding only single emphasis (not strong)
157pub fn find_single_emphasis_spans(line: &str, markers: Vec<EmphasisMarker>) -> Vec<EmphasisSpan> {
158    // Early return for insufficient markers
159    if markers.len() < 2 {
160        return Vec::new();
161    }
162
163    let mut spans = Vec::new();
164    let mut used_markers = vec![false; markers.len()];
165
166    // Process markers in pairs more efficiently
167    for i in 0..markers.len() {
168        if used_markers[i] || markers[i].count != 1 {
169            continue;
170        }
171
172        let opening = &markers[i];
173
174        // Look for the nearest matching closing marker using optimized search
175        for j in (i + 1)..markers.len() {
176            if used_markers[j] {
177                continue;
178            }
179
180            let closing = &markers[j];
181
182            // Quick type and count check - only single emphasis
183            if closing.marker_type == opening.marker_type && closing.count == 1 {
184                let content_start = opening.end_pos();
185                let content_end = closing.start_pos;
186
187                if content_end > content_start {
188                    let content = &line[content_start..content_end];
189
190                    // Optimized validation checks
191                    if is_valid_emphasis_content_fast(content) && is_valid_emphasis_span_fast(line, opening, closing) {
192                        // Quick check for crossing markers
193                        let crosses_markers = markers[i + 1..j]
194                            .iter()
195                            .any(|marker| marker.marker_type == opening.marker_type && marker.count == 1);
196
197                        if !crosses_markers {
198                            let has_leading_space = content.starts_with(' ') || content.starts_with('\t');
199                            let has_trailing_space = content.ends_with(' ') || content.ends_with('\t');
200
201                            spans.push(EmphasisSpan {
202                                opening: opening.clone(),
203                                closing: closing.clone(),
204                                content: content.to_string(),
205                                has_leading_space,
206                                has_trailing_space,
207                            });
208
209                            // Mark both markers as used
210                            used_markers[i] = true;
211                            used_markers[j] = true;
212                            break;
213                        }
214                    }
215                }
216            }
217        }
218    }
219
220    spans
221}
222
223/// Optimized emphasis span finding with reduced complexity (includes both single and strong)
224pub fn find_emphasis_spans(line: &str, markers: Vec<EmphasisMarker>) -> Vec<EmphasisSpan> {
225    // Early return for insufficient markers
226    if markers.len() < 2 {
227        return Vec::new();
228    }
229
230    let mut spans = Vec::new();
231    let mut used_markers = vec![false; markers.len()];
232
233    // Process markers in pairs more efficiently
234    for i in 0..markers.len() {
235        if used_markers[i] {
236            continue;
237        }
238
239        let opening = &markers[i];
240
241        // Look for the nearest matching closing marker using optimized search
242        for j in (i + 1)..markers.len() {
243            if used_markers[j] {
244                continue;
245            }
246
247            let closing = &markers[j];
248
249            // Quick type and count check
250            if closing.marker_type == opening.marker_type && closing.count == opening.count {
251                let content_start = opening.end_pos();
252                let content_end = closing.start_pos;
253
254                if content_end > content_start {
255                    let content = &line[content_start..content_end];
256
257                    // Optimized validation checks
258                    if is_valid_emphasis_content_fast(content) && is_valid_emphasis_span_fast(line, opening, closing) {
259                        // Quick check for crossing markers
260                        let crosses_markers = markers[i + 1..j]
261                            .iter()
262                            .any(|marker| marker.marker_type == opening.marker_type);
263
264                        if !crosses_markers {
265                            let has_leading_space = content.starts_with(' ') || content.starts_with('\t');
266                            let has_trailing_space = content.ends_with(' ') || content.ends_with('\t');
267
268                            spans.push(EmphasisSpan {
269                                opening: opening.clone(),
270                                closing: closing.clone(),
271                                content: content.to_string(),
272                                has_leading_space,
273                                has_trailing_space,
274                            });
275
276                            // Mark both markers as used
277                            used_markers[i] = true;
278                            used_markers[j] = true;
279                            break;
280                        }
281                    }
282                }
283            }
284        }
285    }
286
287    spans
288}
289
290/// Fast validation of emphasis span context
291#[inline]
292pub fn is_valid_emphasis_span_fast(line: &str, opening: &EmphasisMarker, closing: &EmphasisMarker) -> bool {
293    let content_start = opening.end_pos();
294    let content_end = closing.start_pos;
295
296    // Content must exist and not be just whitespace
297    if content_end <= content_start {
298        return false;
299    }
300
301    let content = &line[content_start..content_end];
302    if content.trim().is_empty() {
303        return false;
304    }
305
306    // Quick boundary checks using byte indexing
307    let bytes = line.as_bytes();
308
309    // Opening should be at start or after valid character
310    let valid_opening = opening.start_pos == 0
311        || matches!(
312            bytes.get(opening.start_pos.saturating_sub(1)),
313            Some(&b' ')
314                | Some(&b'\t')
315                | Some(&b'(')
316                | Some(&b'[')
317                | Some(&b'{')
318                | Some(&b'"')
319                | Some(&b'\'')
320                | Some(&b'>')
321        );
322
323    // Closing should be at end or before valid character
324    let valid_closing = closing.end_pos() >= bytes.len()
325        || matches!(
326            bytes.get(closing.end_pos()),
327            Some(&b' ')
328                | Some(&b'\t')
329                | Some(&b')')
330                | Some(&b']')
331                | Some(&b'}')
332                | Some(&b'"')
333                | Some(&b'\'')
334                | Some(&b'.')
335                | Some(&b',')
336                | Some(&b'!')
337                | Some(&b'?')
338                | Some(&b';')
339                | Some(&b':')
340                | Some(&b'<')
341        );
342
343    valid_opening && valid_closing && !content.contains('\n')
344}
345
/// Returns `true` when `content` contains at least one non-whitespace character.
///
/// Empty and whitespace-only strings are not valid emphasis content.
#[inline]
pub fn is_valid_emphasis_content_fast(content: &str) -> bool {
    content.chars().any(|c| !c.is_whitespace())
}
351
352/// Check if a line should be treated as a list item vs emphasis
353pub fn is_likely_list_line(line: &str) -> bool {
354    LIST_MARKER.is_match(line)
355}
356
357/// Check if line has documentation patterns that should be preserved
358pub fn has_doc_patterns(line: &str) -> bool {
359    // Check for template/shortcode syntax like {* ... *} used by FastAPI/MkDocs
360    // These contain asterisks that are not emphasis markers
361    if line.contains("{*") && TEMPLATE_SHORTCODE_PATTERN.is_match(line) {
362        return true;
363    }
364
365    (QUICK_DOC_CHECK.is_match(line) || QUICK_BOLD_CHECK.is_match(line))
366        && (DOC_METADATA_PATTERN.is_match(line) || BOLD_TEXT_PATTERN.is_match(line))
367}
368
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_emphasis_marker_parsing() {
        let markers = find_emphasis_markers("This has *single* and **double** emphasis");
        assert_eq!(markers.len(), 4); // *, *, **, **

        let markers = find_emphasis_markers("*start* and *end*");
        assert_eq!(markers.len(), 4); // *, *, *, *
    }

    #[test]
    fn test_single_emphasis_span_detection() {
        let markers = find_emphasis_markers("This has *valid* emphasis and **strong** too");
        let spans = find_single_emphasis_spans("This has *valid* emphasis and **strong** too", markers);
        assert_eq!(spans.len(), 1); // Only the single emphasis
        assert_eq!(spans[0].content, "valid");
        assert!(!spans[0].has_leading_space);
        assert!(!spans[0].has_trailing_space);
    }

    #[test]
    fn test_emphasis_with_spaces() {
        let markers = find_emphasis_markers("This has * invalid * emphasis");
        let spans = find_emphasis_spans("This has * invalid * emphasis", markers);
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].content, " invalid ");
        assert!(spans[0].has_leading_space);
        assert!(spans[0].has_trailing_space);
    }

    #[test]
    fn test_mixed_markers() {
        let markers = find_emphasis_markers("This has *asterisk* and _underscore_ emphasis");
        let spans = find_single_emphasis_spans("This has *asterisk* and _underscore_ emphasis", markers);
        assert_eq!(spans.len(), 2);
        assert_eq!(spans[0].opening.as_char(), '*');
        assert_eq!(spans[1].opening.as_char(), '_');
    }

    #[test]
    fn test_replace_inline_code_masks_code_spans() {
        // Lines without backticks come back unchanged.
        assert_eq!(replace_inline_code("no code here"), "no code here");
        // The whole span, backticks included, is masked byte-for-byte.
        assert_eq!(replace_inline_code("use `a*b` here"), "use XXXXX here");
        assert_eq!(replace_inline_code("`a` and `b_c`").len(), "`a` and `b_c`".len());
    }

    #[test]
    fn test_replace_inline_math_masks_math_spans() {
        // Lines without dollar signs come back unchanged.
        assert_eq!(replace_inline_math("no math here"), "no math here");
        // Both `$...$` and `$$...$$` are masked, delimiters included.
        assert_eq!(replace_inline_math("cost $x*y$ ok"), "cost MMMMM ok");
        assert_eq!(replace_inline_math("$$a_b$$"), "MMMMMMM");
    }

    #[test]
    fn test_is_likely_list_line() {
        assert!(is_likely_list_line("- item"));
        assert!(is_likely_list_line("+ item"));
        assert!(is_likely_list_line("  * item"));

        // A marker glued to text, or one that is not at the start, is not a list item.
        assert!(!is_likely_list_line("*emphasis*"));
        assert!(!is_likely_list_line("text - dash"));
    }

    #[test]
    fn test_template_shortcode_detection() {
        // FastAPI/MkDocs style template syntax should be detected as doc pattern
        assert!(has_doc_patterns(
            "{* ../../docs_src/cookie_param_models/tutorial001.py hl[9:12,16] *}"
        ));
        assert!(has_doc_patterns(
            "{* ../../docs_src/conditional_openapi/tutorial001.py hl[6,11] *}"
        ));
        // Simple shortcode
        assert!(has_doc_patterns("{* file.py *}"));
        // With path and options
        assert!(has_doc_patterns("{* ../path/to/file.py ln[1-10] *}"));

        // Regular emphasis should NOT match
        assert!(!has_doc_patterns("This has *emphasis* text"));
        assert!(!has_doc_patterns("This has * spaces * in emphasis"));
        // Only opening brace without closing should not match
        assert!(!has_doc_patterns("{* incomplete"));
    }

    #[test]
    fn test_doc_pattern_rejects_spaced_bold_metadata() {
        // Valid bold metadata — should be treated as doc pattern (skip MD037)
        assert!(has_doc_patterns("**Key**: value"));
        assert!(has_doc_patterns("**Name**: another value"));
        assert!(has_doc_patterns("**X**: single char"));
        assert!(has_doc_patterns("* **Key**: list item with bold key"));

        // Broken bold with internal spaces — should NOT be treated as doc pattern
        // so MD037 can flag the spacing issue
        assert!(!has_doc_patterns("** Key**: value"));
        assert!(!has_doc_patterns("**Key **: value"));
        assert!(!has_doc_patterns("** Key **: value"));
        assert!(!has_doc_patterns(
            "** Explicit Import**: Convert markdownlint configs to rumdl format:"
        ));
    }
}
rumdl_lib/utils/emphasis_utils.rs

rumdl_lib/utils/
emphasis_utils.rs