// rumdl_lib/utils/emphasis_utils.rs

1use regex::Regex;
2use std::sync::LazyLock;
3
4// Better detection of inline code with support for multiple backticks
5static INLINE_CODE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(`+)([^`]|[^`].*?[^`])(`+)").unwrap());
6
7// List markers pattern - used to avoid confusion with emphasis
8static LIST_MARKER: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*[*+-]\s+").unwrap());
9
10// Documentation style patterns
11static DOC_METADATA_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*\*?\s*\*\*[^*]+\*\*\s*:").unwrap());
12
13// Bold text pattern (for preserving bold text in documentation) - only match valid bold without spaces
14static BOLD_TEXT_PATTERN: LazyLock<Regex> =
15    LazyLock::new(|| Regex::new(r"\*\*[^*\s][^*]*[^*\s]\*\*|\*\*[^*\s]\*\*").unwrap());
16
17// Pre-compiled patterns for quick checks
18static QUICK_DOC_CHECK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*\*\s+\*").unwrap());
19static QUICK_BOLD_CHECK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\*\*[^*\s]").unwrap());
20
21// Template/shortcode syntax pattern - {* ... *} used by documentation systems like FastAPI/MkDocs
22// These are not emphasis markers but template directives for code inclusion/highlighting
23static TEMPLATE_SHORTCODE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\{\*.*\*\}").unwrap());
24
/// A run of emphasis delimiter bytes found in a line.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EmphasisMarker {
    /// Delimiter byte, `b'*'` or `b'_'` (kept as a byte for cheap comparison).
    pub marker_type: u8,
    /// Number of delimiter bytes in the run: 1 for single, 2 for double.
    pub count: u8,
    /// Position of the first delimiter byte in the line.
    pub start_pos: usize,
}

impl EmphasisMarker {
    /// Returns the position just past the last delimiter byte of this marker.
    #[inline]
    pub fn end_pos(&self) -> usize {
        // `u8 -> usize` is lossless, so use `From` rather than an `as` cast.
        self.start_pos + usize::from(self.count)
    }

    /// Returns the delimiter as a `char`.
    #[inline]
    pub fn as_char(&self) -> char {
        char::from(self.marker_type)
    }
}
44
45/// Represents a complete emphasis span
46#[derive(Debug, Clone)]
47pub struct EmphasisSpan {
48    pub opening: EmphasisMarker,
49    pub closing: EmphasisMarker,
50    pub content: String,
51    pub has_leading_space: bool,
52    pub has_trailing_space: bool,
53}
54
55/// Enhanced inline code replacement with optimized performance
56/// Replaces inline code with 'X' characters to prevent false positives in emphasis detection
57#[inline]
58pub fn replace_inline_code(line: &str) -> String {
59    // Quick check: if no backticks, return original
60    if !line.contains('`') {
61        return line.to_string();
62    }
63
64    let mut result = line.to_string();
65    let mut offset = 0;
66
67    for cap in INLINE_CODE.captures_iter(line) {
68        if let (Some(full_match), Some(_opening), Some(_content), Some(_closing)) =
69            (cap.get(0), cap.get(1), cap.get(2), cap.get(3))
70        {
71            let match_start = full_match.start();
72            let match_end = full_match.end();
73            // Use 'X' instead of spaces to avoid false positives for "spaces in emphasis"
74            let placeholder = "X".repeat(match_end - match_start);
75
76            result.replace_range(match_start + offset..match_end + offset, &placeholder);
77            offset += placeholder.len() - (match_end - match_start);
78        }
79    }
80
81    result
82}
83
84/// Optimized emphasis marker parsing using byte iteration
85#[inline]
86pub fn find_emphasis_markers(line: &str) -> Vec<EmphasisMarker> {
87    // Early return for lines without emphasis markers
88    if !line.contains('*') && !line.contains('_') {
89        return Vec::new();
90    }
91
92    let mut markers = Vec::new();
93    let bytes = line.as_bytes();
94    let mut i = 0;
95
96    while i < bytes.len() {
97        let byte = bytes[i];
98        if byte == b'*' || byte == b'_' {
99            let start_pos = i;
100            let mut count = 1u8;
101
102            // Count consecutive markers (limit to avoid overflow)
103            while i + (count as usize) < bytes.len() && bytes[i + (count as usize)] == byte && count < 3 {
104                count += 1;
105            }
106
107            // Only consider single (*) and double (**) markers
108            if count == 1 || count == 2 {
109                markers.push(EmphasisMarker {
110                    marker_type: byte,
111                    count,
112                    start_pos,
113                });
114            }
115
116            i += count as usize;
117        } else {
118            i += 1;
119        }
120    }
121
122    markers
123}
124
125/// Find all emphasis spans in a line, excluding only single emphasis (not strong)
126pub fn find_single_emphasis_spans(line: &str, markers: Vec<EmphasisMarker>) -> Vec<EmphasisSpan> {
127    // Early return for insufficient markers
128    if markers.len() < 2 {
129        return Vec::new();
130    }
131
132    let mut spans = Vec::new();
133    let mut used_markers = vec![false; markers.len()];
134
135    // Process markers in pairs more efficiently
136    for i in 0..markers.len() {
137        if used_markers[i] || markers[i].count != 1 {
138            continue;
139        }
140
141        let opening = &markers[i];
142
143        // Look for the nearest matching closing marker using optimized search
144        for j in (i + 1)..markers.len() {
145            if used_markers[j] {
146                continue;
147            }
148
149            let closing = &markers[j];
150
151            // Quick type and count check - only single emphasis
152            if closing.marker_type == opening.marker_type && closing.count == 1 {
153                let content_start = opening.end_pos();
154                let content_end = closing.start_pos;
155
156                if content_end > content_start {
157                    let content = &line[content_start..content_end];
158
159                    // Optimized validation checks
160                    if is_valid_emphasis_content_fast(content) && is_valid_emphasis_span_fast(line, opening, closing) {
161                        // Quick check for crossing markers
162                        let crosses_markers = markers[i + 1..j]
163                            .iter()
164                            .any(|marker| marker.marker_type == opening.marker_type && marker.count == 1);
165
166                        if !crosses_markers {
167                            let has_leading_space = content.starts_with(' ') || content.starts_with('\t');
168                            let has_trailing_space = content.ends_with(' ') || content.ends_with('\t');
169
170                            spans.push(EmphasisSpan {
171                                opening: opening.clone(),
172                                closing: closing.clone(),
173                                content: content.to_string(),
174                                has_leading_space,
175                                has_trailing_space,
176                            });
177
178                            // Mark both markers as used
179                            used_markers[i] = true;
180                            used_markers[j] = true;
181                            break;
182                        }
183                    }
184                }
185            }
186        }
187    }
188
189    spans
190}
191
192/// Optimized emphasis span finding with reduced complexity (includes both single and strong)
193pub fn find_emphasis_spans(line: &str, markers: Vec<EmphasisMarker>) -> Vec<EmphasisSpan> {
194    // Early return for insufficient markers
195    if markers.len() < 2 {
196        return Vec::new();
197    }
198
199    let mut spans = Vec::new();
200    let mut used_markers = vec![false; markers.len()];
201
202    // Process markers in pairs more efficiently
203    for i in 0..markers.len() {
204        if used_markers[i] {
205            continue;
206        }
207
208        let opening = &markers[i];
209
210        // Look for the nearest matching closing marker using optimized search
211        for j in (i + 1)..markers.len() {
212            if used_markers[j] {
213                continue;
214            }
215
216            let closing = &markers[j];
217
218            // Quick type and count check
219            if closing.marker_type == opening.marker_type && closing.count == opening.count {
220                let content_start = opening.end_pos();
221                let content_end = closing.start_pos;
222
223                if content_end > content_start {
224                    let content = &line[content_start..content_end];
225
226                    // Optimized validation checks
227                    if is_valid_emphasis_content_fast(content) && is_valid_emphasis_span_fast(line, opening, closing) {
228                        // Quick check for crossing markers
229                        let crosses_markers = markers[i + 1..j]
230                            .iter()
231                            .any(|marker| marker.marker_type == opening.marker_type);
232
233                        if !crosses_markers {
234                            let has_leading_space = content.starts_with(' ') || content.starts_with('\t');
235                            let has_trailing_space = content.ends_with(' ') || content.ends_with('\t');
236
237                            spans.push(EmphasisSpan {
238                                opening: opening.clone(),
239                                closing: closing.clone(),
240                                content: content.to_string(),
241                                has_leading_space,
242                                has_trailing_space,
243                            });
244
245                            // Mark both markers as used
246                            used_markers[i] = true;
247                            used_markers[j] = true;
248                            break;
249                        }
250                    }
251                }
252            }
253        }
254    }
255
256    spans
257}
258
259/// Fast validation of emphasis span context
260#[inline]
261pub fn is_valid_emphasis_span_fast(line: &str, opening: &EmphasisMarker, closing: &EmphasisMarker) -> bool {
262    let content_start = opening.end_pos();
263    let content_end = closing.start_pos;
264
265    // Content must exist and not be just whitespace
266    if content_end <= content_start {
267        return false;
268    }
269
270    let content = &line[content_start..content_end];
271    if content.trim().is_empty() {
272        return false;
273    }
274
275    // Quick boundary checks using byte indexing
276    let bytes = line.as_bytes();
277
278    // Opening should be at start or after valid character
279    let valid_opening = opening.start_pos == 0
280        || matches!(
281            bytes.get(opening.start_pos.saturating_sub(1)),
282            Some(&b' ')
283                | Some(&b'\t')
284                | Some(&b'(')
285                | Some(&b'[')
286                | Some(&b'{')
287                | Some(&b'"')
288                | Some(&b'\'')
289                | Some(&b'>')
290        );
291
292    // Closing should be at end or before valid character
293    let valid_closing = closing.end_pos() >= bytes.len()
294        || matches!(
295            bytes.get(closing.end_pos()),
296            Some(&b' ')
297                | Some(&b'\t')
298                | Some(&b')')
299                | Some(&b']')
300                | Some(&b'}')
301                | Some(&b'"')
302                | Some(&b'\'')
303                | Some(&b'.')
304                | Some(&b',')
305                | Some(&b'!')
306                | Some(&b'?')
307                | Some(&b';')
308                | Some(&b':')
309                | Some(&b'<')
310        );
311
312    valid_opening && valid_closing && !content.contains('\n')
313}
314
/// Returns `true` when `content` holds at least one non-whitespace character.
#[inline]
pub fn is_valid_emphasis_content_fast(content: &str) -> bool {
    content.chars().any(|ch| !ch.is_whitespace())
}
320
321/// Check if a line should be treated as a list item vs emphasis
322pub fn is_likely_list_line(line: &str) -> bool {
323    LIST_MARKER.is_match(line)
324}
325
326/// Check if line has documentation patterns that should be preserved
327pub fn has_doc_patterns(line: &str) -> bool {
328    // Check for template/shortcode syntax like {* ... *} used by FastAPI/MkDocs
329    // These contain asterisks that are not emphasis markers
330    if line.contains("{*") && TEMPLATE_SHORTCODE_PATTERN.is_match(line) {
331        return true;
332    }
333
334    (QUICK_DOC_CHECK.is_match(line) || QUICK_BOLD_CHECK.is_match(line))
335        && (DOC_METADATA_PATTERN.is_match(line) || BOLD_TEXT_PATTERN.is_match(line))
336}
337
#[cfg(test)]
mod tests {
    use super::*;

    /// Detects markers in `line` and pairs them as single emphasis only.
    fn single_spans(line: &str) -> Vec<EmphasisSpan> {
        find_single_emphasis_spans(line, find_emphasis_markers(line))
    }

    /// Detects markers in `line` and pairs them as single or strong emphasis.
    fn all_spans(line: &str) -> Vec<EmphasisSpan> {
        find_emphasis_spans(line, find_emphasis_markers(line))
    }

    #[test]
    fn test_emphasis_marker_parsing() {
        // First line yields *, *, **, ** and the second *, *, *, *.
        for line in ["This has *single* and **double** emphasis", "*start* and *end*"] {
            assert_eq!(find_emphasis_markers(line).len(), 4, "line: {line}");
        }
    }

    #[test]
    fn test_single_emphasis_span_detection() {
        let spans = single_spans("This has *valid* emphasis and **strong** too");

        // Only the single emphasis is reported.
        assert_eq!(spans.len(), 1);
        let span = &spans[0];
        assert_eq!(span.content, "valid");
        assert!(!span.has_leading_space);
        assert!(!span.has_trailing_space);
    }

    #[test]
    fn test_emphasis_with_spaces() {
        let spans = all_spans("This has * invalid * emphasis");

        assert_eq!(spans.len(), 1);
        let span = &spans[0];
        assert_eq!(span.content, " invalid ");
        assert!(span.has_leading_space);
        assert!(span.has_trailing_space);
    }

    #[test]
    fn test_mixed_markers() {
        let spans = single_spans("This has *asterisk* and _underscore_ emphasis");

        let delimiters: Vec<char> = spans.iter().map(|span| span.opening.as_char()).collect();
        assert_eq!(delimiters, vec!['*', '_']);
    }

    #[test]
    fn test_template_shortcode_detection() {
        // FastAPI/MkDocs style template syntax should be detected as a doc
        // pattern: full examples, a simple shortcode, and one with a path and
        // options.
        let shortcodes = [
            "{* ../../docs_src/cookie_param_models/tutorial001.py hl[9:12,16] *}",
            "{* ../../docs_src/conditional_openapi/tutorial001.py hl[6,11] *}",
            "{* file.py *}",
            "{* ../path/to/file.py ln[1-10] *}",
        ];
        for line in shortcodes {
            assert!(has_doc_patterns(line), "expected a doc pattern in: {line}");
        }

        // Regular emphasis should NOT match, nor should an opening brace
        // without its closing counterpart.
        let non_matching = [
            "This has *emphasis* text",
            "This has * spaces * in emphasis",
            "{* incomplete",
        ];
        for line in non_matching {
            assert!(!has_doc_patterns(line), "unexpected doc pattern in: {line}");
        }
    }
}
// End of rumdl_lib/utils/emphasis_utils.rs