// rumdl_lib/utils/emphasis_utils.rs

1use regex::Regex;
2use std::sync::LazyLock;
3
4// Better detection of inline code with support for multiple backticks
5static INLINE_CODE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(`+)([^`]|[^`].*?[^`])(`+)").unwrap());
6
7// Inline math pattern - matches both $...$ and $$...$$ syntax
8// The pattern allows zero or more characters between delimiters to handle empty math spans
9static INLINE_MATH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\$\$[^$]*\$\$|\$[^$\n]*\$").unwrap());
10
11// List markers pattern - used to avoid confusion with emphasis
12static LIST_MARKER: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*[*+-]\s+").unwrap());
13
14// Documentation style patterns
15static DOC_METADATA_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*\*?\s*\*\*[^*]+\*\*\s*:").unwrap());
16
17// Bold text pattern (for preserving bold text in documentation) - only match valid bold without spaces
18static BOLD_TEXT_PATTERN: LazyLock<Regex> =
19    LazyLock::new(|| Regex::new(r"\*\*[^*\s][^*]*[^*\s]\*\*|\*\*[^*\s]\*\*").unwrap());
20
21// Pre-compiled patterns for quick checks
22static QUICK_DOC_CHECK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*\*\s+\*").unwrap());
23static QUICK_BOLD_CHECK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\*\*[^*\s]").unwrap());
24
25// Template/shortcode syntax pattern - {* ... *} used by documentation systems like FastAPI/MkDocs
26// These are not emphasis markers but template directives for code inclusion/highlighting
27static TEMPLATE_SHORTCODE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\{\*.*\*\}").unwrap());
28
/// A run of emphasis delimiter characters located within a line.
#[derive(Debug, Clone, PartialEq)]
pub struct EmphasisMarker {
    /// Delimiter byte, `b'*'` or `b'_'`; kept as a byte so comparisons are cheap.
    pub marker_type: u8,
    /// Number of delimiter characters: 1 for single, 2 for double.
    pub count: u8,
    /// Offset within the line at which the marker starts.
    pub start_pos: usize,
}

impl EmphasisMarker {
    /// Offset of the first position after the marker (`start_pos + count`).
    #[inline]
    pub fn end_pos(&self) -> usize {
        usize::from(self.count) + self.start_pos
    }

    /// The delimiter byte converted to a `char`.
    #[inline]
    pub fn as_char(&self) -> char {
        char::from(self.marker_type)
    }
}
48
49/// Represents a complete emphasis span
50#[derive(Debug, Clone)]
51pub struct EmphasisSpan {
52    pub opening: EmphasisMarker,
53    pub closing: EmphasisMarker,
54    pub content: String,
55    pub has_leading_space: bool,
56    pub has_trailing_space: bool,
57}
58
/// Masks every inline code span in `line` with `X` characters.
///
/// A code span opens with a run of backticks and closes at the next run of
/// backticks of exactly the same length (the CommonMark rule), so a span such
/// as ``` ``a ` b`` ``` is masked as a whole instead of being cut short at the
/// inner backtick. A run with no equal-length partner later on the line is
/// left untouched. Backslash escapes are not interpreted.
///
/// Every byte of a span, delimiters included, becomes one `X`, so the result
/// has the same byte length as `line` and offsets computed on it stay valid
/// for the original. `X` is used rather than a space so that masked code is
/// never reported as "spaces inside emphasis".
///
/// NOTE(review): this function no longer uses the `INLINE_CODE` regex (the
/// `regex` crate has no backreferences, so it cannot express the equal-length
/// rule); the static can be dropped if nothing else references it.
#[inline]
pub fn replace_inline_code(line: &str) -> String {
    // Quick check: without backticks there is nothing to mask.
    if !line.contains('`') {
        return line.to_string();
    }

    // Collect every maximal run of backticks as (start offset, length).
    let bytes = line.as_bytes();
    let mut runs: Vec<(usize, usize)> = Vec::new();
    let mut pos = 0;
    while pos < bytes.len() {
        if bytes[pos] == b'`' {
            let start = pos;
            while pos < bytes.len() && bytes[pos] == b'`' {
                pos += 1;
            }
            runs.push((start, pos - start));
        } else {
            pos += 1;
        }
    }

    let mut masked = String::with_capacity(line.len());
    let mut copied_to = 0;
    let mut idx = 0;
    while idx < runs.len() {
        let (open_start, open_len) = runs[idx];
        let partner = runs[idx + 1..].iter().position(|&(_, len)| len == open_len);

        match partner {
            Some(rel) => {
                let close_idx = idx + 1 + rel;
                let (close_start, close_len) = runs[close_idx];
                let span_end = close_start + close_len;

                // Run boundaries sit on ASCII backticks, so these slices are
                // always on character boundaries.
                masked.push_str(&line[copied_to..open_start]);
                masked.push_str(&"X".repeat(span_end - open_start));
                copied_to = span_end;

                // Runs inside the span are code content, not delimiters.
                idx = close_idx + 1;
            }
            // Unmatched run: literal backticks, keep scanning from the next run.
            None => idx += 1,
        }
    }
    masked.push_str(&line[copied_to..]);

    masked
}
87
88/// Replace inline math ($...$ and $$...$$) with placeholder characters
89/// This prevents math content from being mistaken for emphasis markers
90pub fn replace_inline_math(line: &str) -> String {
91    // Quick check: if no dollar signs, return original
92    if !line.contains('$') {
93        return line.to_string();
94    }
95
96    let mut result = line.to_string();
97    let mut offset: isize = 0;
98
99    for m in INLINE_MATH.find_iter(line) {
100        let match_start = m.start();
101        let match_end = m.end();
102        // Use 'M' instead of spaces or asterisks to avoid affecting emphasis detection
103        let placeholder = "M".repeat(match_end - match_start);
104
105        let adjusted_start = (match_start as isize + offset) as usize;
106        let adjusted_end = (match_end as isize + offset) as usize;
107        result.replace_range(adjusted_start..adjusted_end, &placeholder);
108        offset += placeholder.len() as isize - (match_end - match_start) as isize;
109    }
110
111    result
112}
113
114/// Optimized emphasis marker parsing using byte iteration
115#[inline]
116pub fn find_emphasis_markers(line: &str) -> Vec<EmphasisMarker> {
117    // Early return for lines without emphasis markers
118    if !line.contains('*') && !line.contains('_') {
119        return Vec::new();
120    }
121
122    let mut markers = Vec::new();
123    let bytes = line.as_bytes();
124    let mut i = 0;
125
126    while i < bytes.len() {
127        let byte = bytes[i];
128        if byte == b'*' || byte == b'_' {
129            let start_pos = i;
130            let mut count = 1u8;
131
132            // Count consecutive markers (limit to avoid overflow)
133            while i + (count as usize) < bytes.len() && bytes[i + (count as usize)] == byte && count < 3 {
134                count += 1;
135            }
136
137            // Only consider single (*) and double (**) markers
138            if count == 1 || count == 2 {
139                markers.push(EmphasisMarker {
140                    marker_type: byte,
141                    count,
142                    start_pos,
143                });
144            }
145
146            i += count as usize;
147        } else {
148            i += 1;
149        }
150    }
151
152    markers
153}
154
155/// Find all emphasis spans in a line, excluding only single emphasis (not strong)
156pub fn find_single_emphasis_spans(line: &str, markers: Vec<EmphasisMarker>) -> Vec<EmphasisSpan> {
157    // Early return for insufficient markers
158    if markers.len() < 2 {
159        return Vec::new();
160    }
161
162    let mut spans = Vec::new();
163    let mut used_markers = vec![false; markers.len()];
164
165    // Process markers in pairs more efficiently
166    for i in 0..markers.len() {
167        if used_markers[i] || markers[i].count != 1 {
168            continue;
169        }
170
171        let opening = &markers[i];
172
173        // Look for the nearest matching closing marker using optimized search
174        for j in (i + 1)..markers.len() {
175            if used_markers[j] {
176                continue;
177            }
178
179            let closing = &markers[j];
180
181            // Quick type and count check - only single emphasis
182            if closing.marker_type == opening.marker_type && closing.count == 1 {
183                let content_start = opening.end_pos();
184                let content_end = closing.start_pos;
185
186                if content_end > content_start {
187                    let content = &line[content_start..content_end];
188
189                    // Optimized validation checks
190                    if is_valid_emphasis_content_fast(content) && is_valid_emphasis_span_fast(line, opening, closing) {
191                        // Quick check for crossing markers
192                        let crosses_markers = markers[i + 1..j]
193                            .iter()
194                            .any(|marker| marker.marker_type == opening.marker_type && marker.count == 1);
195
196                        if !crosses_markers {
197                            let has_leading_space = content.starts_with(' ') || content.starts_with('\t');
198                            let has_trailing_space = content.ends_with(' ') || content.ends_with('\t');
199
200                            spans.push(EmphasisSpan {
201                                opening: opening.clone(),
202                                closing: closing.clone(),
203                                content: content.to_string(),
204                                has_leading_space,
205                                has_trailing_space,
206                            });
207
208                            // Mark both markers as used
209                            used_markers[i] = true;
210                            used_markers[j] = true;
211                            break;
212                        }
213                    }
214                }
215            }
216        }
217    }
218
219    spans
220}
221
222/// Optimized emphasis span finding with reduced complexity (includes both single and strong)
223pub fn find_emphasis_spans(line: &str, markers: Vec<EmphasisMarker>) -> Vec<EmphasisSpan> {
224    // Early return for insufficient markers
225    if markers.len() < 2 {
226        return Vec::new();
227    }
228
229    let mut spans = Vec::new();
230    let mut used_markers = vec![false; markers.len()];
231
232    // Process markers in pairs more efficiently
233    for i in 0..markers.len() {
234        if used_markers[i] {
235            continue;
236        }
237
238        let opening = &markers[i];
239
240        // Look for the nearest matching closing marker using optimized search
241        for j in (i + 1)..markers.len() {
242            if used_markers[j] {
243                continue;
244            }
245
246            let closing = &markers[j];
247
248            // Quick type and count check
249            if closing.marker_type == opening.marker_type && closing.count == opening.count {
250                let content_start = opening.end_pos();
251                let content_end = closing.start_pos;
252
253                if content_end > content_start {
254                    let content = &line[content_start..content_end];
255
256                    // Optimized validation checks
257                    if is_valid_emphasis_content_fast(content) && is_valid_emphasis_span_fast(line, opening, closing) {
258                        // Quick check for crossing markers
259                        let crosses_markers = markers[i + 1..j]
260                            .iter()
261                            .any(|marker| marker.marker_type == opening.marker_type);
262
263                        if !crosses_markers {
264                            let has_leading_space = content.starts_with(' ') || content.starts_with('\t');
265                            let has_trailing_space = content.ends_with(' ') || content.ends_with('\t');
266
267                            spans.push(EmphasisSpan {
268                                opening: opening.clone(),
269                                closing: closing.clone(),
270                                content: content.to_string(),
271                                has_leading_space,
272                                has_trailing_space,
273                            });
274
275                            // Mark both markers as used
276                            used_markers[i] = true;
277                            used_markers[j] = true;
278                            break;
279                        }
280                    }
281                }
282            }
283        }
284    }
285
286    spans
287}
288
289/// Fast validation of emphasis span context
290#[inline]
291pub fn is_valid_emphasis_span_fast(line: &str, opening: &EmphasisMarker, closing: &EmphasisMarker) -> bool {
292    let content_start = opening.end_pos();
293    let content_end = closing.start_pos;
294
295    // Content must exist and not be just whitespace
296    if content_end <= content_start {
297        return false;
298    }
299
300    let content = &line[content_start..content_end];
301    if content.trim().is_empty() {
302        return false;
303    }
304
305    // Quick boundary checks using byte indexing
306    let bytes = line.as_bytes();
307
308    // Opening should be at start or after valid character
309    let valid_opening = opening.start_pos == 0
310        || matches!(
311            bytes.get(opening.start_pos.saturating_sub(1)),
312            Some(&b' ')
313                | Some(&b'\t')
314                | Some(&b'(')
315                | Some(&b'[')
316                | Some(&b'{')
317                | Some(&b'"')
318                | Some(&b'\'')
319                | Some(&b'>')
320        );
321
322    // Closing should be at end or before valid character
323    let valid_closing = closing.end_pos() >= bytes.len()
324        || matches!(
325            bytes.get(closing.end_pos()),
326            Some(&b' ')
327                | Some(&b'\t')
328                | Some(&b')')
329                | Some(&b']')
330                | Some(&b'}')
331                | Some(&b'"')
332                | Some(&b'\'')
333                | Some(&b'.')
334                | Some(&b',')
335                | Some(&b'!')
336                | Some(&b'?')
337                | Some(&b';')
338                | Some(&b':')
339                | Some(&b'<')
340        );
341
342    valid_opening && valid_closing && !content.contains('\n')
343}
344
/// Returns `true` when `content` contains at least one non-whitespace character.
#[inline]
pub fn is_valid_emphasis_content_fast(content: &str) -> bool {
    content.chars().any(|ch| !ch.is_whitespace())
}
350
351/// Check if a line should be treated as a list item vs emphasis
352pub fn is_likely_list_line(line: &str) -> bool {
353    LIST_MARKER.is_match(line)
354}
355
356/// Check if line has documentation patterns that should be preserved
357pub fn has_doc_patterns(line: &str) -> bool {
358    // Check for template/shortcode syntax like {* ... *} used by FastAPI/MkDocs
359    // These contain asterisks that are not emphasis markers
360    if line.contains("{*") && TEMPLATE_SHORTCODE_PATTERN.is_match(line) {
361        return true;
362    }
363
364    (QUICK_DOC_CHECK.is_match(line) || QUICK_BOLD_CHECK.is_match(line))
365        && (DOC_METADATA_PATTERN.is_match(line) || BOLD_TEXT_PATTERN.is_match(line))
366}
367
#[cfg(test)]
mod tests {
    use super::*;

    /// Marker scanning reports one entry per single/double delimiter run.
    #[test]
    fn test_emphasis_marker_parsing() {
        // Expected runs: *, *, **, **
        let mixed = find_emphasis_markers("This has *single* and **double** emphasis");
        assert_eq!(mixed.len(), 4);

        // Expected runs: *, *, *, *
        let singles = find_emphasis_markers("*start* and *end*");
        assert_eq!(singles.len(), 4);
    }

    /// The single-emphasis finder ignores strong emphasis on the same line.
    #[test]
    fn test_single_emphasis_span_detection() {
        let line = "This has *valid* emphasis and **strong** too";
        let spans = find_single_emphasis_spans(line, find_emphasis_markers(line));

        assert_eq!(spans.len(), 1); // Only the single emphasis
        let span = &spans[0];
        assert_eq!(span.content, "valid");
        assert!(!span.has_leading_space);
        assert!(!span.has_trailing_space);
    }

    /// Padding inside the delimiters is reported through the space flags.
    #[test]
    fn test_emphasis_with_spaces() {
        let line = "This has * invalid * emphasis";
        let spans = find_emphasis_spans(line, find_emphasis_markers(line));

        assert_eq!(spans.len(), 1);
        let span = &spans[0];
        assert_eq!(span.content, " invalid ");
        assert!(span.has_leading_space);
        assert!(span.has_trailing_space);
    }

    /// Asterisk and underscore emphasis are paired independently.
    #[test]
    fn test_mixed_markers() {
        let line = "This has *asterisk* and _underscore_ emphasis";
        let spans = find_single_emphasis_spans(line, find_emphasis_markers(line));

        assert_eq!(spans.len(), 2);
        assert_eq!(spans[0].opening.as_char(), '*');
        assert_eq!(spans[1].opening.as_char(), '_');
    }

    /// `{* ... *}` shortcodes count as doc patterns; ordinary emphasis does not.
    #[test]
    fn test_template_shortcode_detection() {
        // FastAPI/MkDocs style template syntax should be detected as doc pattern
        let shortcodes = [
            "{* ../../docs_src/cookie_param_models/tutorial001.py hl[9:12,16] *}",
            "{* ../../docs_src/conditional_openapi/tutorial001.py hl[6,11] *}",
            // Simple shortcode
            "{* file.py *}",
            // With path and options
            "{* ../path/to/file.py ln[1-10] *}",
        ];
        for line in shortcodes {
            assert!(has_doc_patterns(line));
        }

        let not_doc_patterns = [
            // Regular emphasis should NOT match
            "This has *emphasis* text",
            "This has * spaces * in emphasis",
            // Only opening brace without closing should not match
            "{* incomplete",
        ];
        for line in not_doc_patterns {
            assert!(!has_doc_patterns(line));
        }
    }
}
// rumdl_lib/utils/emphasis_utils.rs