rumdl_lib/utils/
emphasis_utils.rs

1use regex::Regex;
2use std::sync::LazyLock;
3
// Inline code span: a run of one or more backticks, content that neither starts nor ends
// with a backtick, then another run of one or more backticks. The pattern does not require
// the opening and closing runs to have the same length.
static INLINE_CODE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(`+)([^`]|[^`].*?[^`])(`+)").unwrap());

// List marker at the start of a line: optional leading whitespace, one of `*`, `+` or `-`,
// then at least one whitespace character. Used to avoid confusing list items with emphasis.
static LIST_MARKER: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*[*+-]\s+").unwrap());

// Documentation-style metadata at the start of a line: an optional `*`, then `**label**`
// (no asterisks inside the label) followed by optional whitespace and a colon.
static DOC_METADATA_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*\*?\s*\*\*[^*]+\*\*\s*:").unwrap());

// Bold text anywhere in a line: `**text**` where the text contains no asterisk and neither
// its first nor its last character is whitespace (a single non-space character also counts).
static BOLD_TEXT_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\*\*[^*\s][^*]*[^*\s]\*\*|\*\*[^*\s]\*\*").unwrap());

// Cheap pre-checks evaluated before the two heavier patterns above.
// QUICK_DOC_CHECK: optional leading whitespace, `*`, at least one whitespace, then `*`.
// QUICK_BOLD_CHECK: `**` immediately followed by a character that is neither `*` nor whitespace.
static QUICK_DOC_CHECK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*\*\s+\*").unwrap());
static QUICK_BOLD_CHECK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\*\*[^*\s]").unwrap());
20
/// A run of emphasis delimiter characters located in a line of text.
#[derive(Debug, Clone, PartialEq)]
pub struct EmphasisMarker {
    /// Delimiter byte, `b'*'` or `b'_'`; kept as a byte so comparisons stay cheap.
    pub marker_type: u8,
    /// Length of the run: 1 for single emphasis, 2 for double.
    pub count: u8,
    /// Position in the line at which the run starts.
    pub start_pos: usize,
}

impl EmphasisMarker {
    /// Position immediately after the last delimiter character of this marker.
    #[inline]
    pub fn end_pos(&self) -> usize {
        let width = usize::from(self.count);
        self.start_pos + width
    }

    /// The delimiter byte converted to a `char`.
    #[inline]
    pub fn as_char(&self) -> char {
        char::from(self.marker_type)
    }
}
40
/// A matched pair of emphasis markers together with the text they enclose.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Marker that opens the span.
    pub opening: EmphasisMarker,
    /// Marker that closes the span.
    pub closing: EmphasisMarker,
    /// Text between the end of `opening` and the start of `closing`.
    pub content: String,
    /// `true` when `content` starts with a space or a tab.
    pub has_leading_space: bool,
    /// `true` when `content` ends with a space or a tab.
    pub has_trailing_space: bool,
}
50
51/// Enhanced inline code replacement with optimized performance
52/// Replaces inline code with 'X' characters to prevent false positives in emphasis detection
53#[inline]
54pub fn replace_inline_code(line: &str) -> String {
55    // Quick check: if no backticks, return original
56    if !line.contains('`') {
57        return line.to_string();
58    }
59
60    let mut result = line.to_string();
61    let mut offset = 0;
62
63    for cap in INLINE_CODE.captures_iter(line) {
64        if let (Some(full_match), Some(_opening), Some(_content), Some(_closing)) =
65            (cap.get(0), cap.get(1), cap.get(2), cap.get(3))
66        {
67            let match_start = full_match.start();
68            let match_end = full_match.end();
69            // Use 'X' instead of spaces to avoid false positives for "spaces in emphasis"
70            let placeholder = "X".repeat(match_end - match_start);
71
72            result.replace_range(match_start + offset..match_end + offset, &placeholder);
73            offset += placeholder.len() - (match_end - match_start);
74        }
75    }
76
77    result
78}
79
80/// Optimized emphasis marker parsing using byte iteration
81#[inline]
82pub fn find_emphasis_markers(line: &str) -> Vec<EmphasisMarker> {
83    // Early return for lines without emphasis markers
84    if !line.contains('*') && !line.contains('_') {
85        return Vec::new();
86    }
87
88    let mut markers = Vec::new();
89    let bytes = line.as_bytes();
90    let mut i = 0;
91
92    while i < bytes.len() {
93        let byte = bytes[i];
94        if byte == b'*' || byte == b'_' {
95            let start_pos = i;
96            let mut count = 1u8;
97
98            // Count consecutive markers (limit to avoid overflow)
99            while i + (count as usize) < bytes.len() && bytes[i + (count as usize)] == byte && count < 3 {
100                count += 1;
101            }
102
103            // Only consider single (*) and double (**) markers
104            if count == 1 || count == 2 {
105                markers.push(EmphasisMarker {
106                    marker_type: byte,
107                    count,
108                    start_pos,
109                });
110            }
111
112            i += count as usize;
113        } else {
114            i += 1;
115        }
116    }
117
118    markers
119}
120
121/// Find all emphasis spans in a line, excluding only single emphasis (not strong)
122pub fn find_single_emphasis_spans(line: &str, markers: Vec<EmphasisMarker>) -> Vec<EmphasisSpan> {
123    // Early return for insufficient markers
124    if markers.len() < 2 {
125        return Vec::new();
126    }
127
128    let mut spans = Vec::new();
129    let mut used_markers = vec![false; markers.len()];
130
131    // Process markers in pairs more efficiently
132    for i in 0..markers.len() {
133        if used_markers[i] || markers[i].count != 1 {
134            continue;
135        }
136
137        let opening = &markers[i];
138
139        // Look for the nearest matching closing marker using optimized search
140        for j in (i + 1)..markers.len() {
141            if used_markers[j] {
142                continue;
143            }
144
145            let closing = &markers[j];
146
147            // Quick type and count check - only single emphasis
148            if closing.marker_type == opening.marker_type && closing.count == 1 {
149                let content_start = opening.end_pos();
150                let content_end = closing.start_pos;
151
152                if content_end > content_start {
153                    let content = &line[content_start..content_end];
154
155                    // Optimized validation checks
156                    if is_valid_emphasis_content_fast(content) && is_valid_emphasis_span_fast(line, opening, closing) {
157                        // Quick check for crossing markers
158                        let crosses_markers = markers[i + 1..j]
159                            .iter()
160                            .any(|marker| marker.marker_type == opening.marker_type && marker.count == 1);
161
162                        if !crosses_markers {
163                            let has_leading_space = content.starts_with(' ') || content.starts_with('\t');
164                            let has_trailing_space = content.ends_with(' ') || content.ends_with('\t');
165
166                            spans.push(EmphasisSpan {
167                                opening: opening.clone(),
168                                closing: closing.clone(),
169                                content: content.to_string(),
170                                has_leading_space,
171                                has_trailing_space,
172                            });
173
174                            // Mark both markers as used
175                            used_markers[i] = true;
176                            used_markers[j] = true;
177                            break;
178                        }
179                    }
180                }
181            }
182        }
183    }
184
185    spans
186}
187
188/// Optimized emphasis span finding with reduced complexity (includes both single and strong)
189pub fn find_emphasis_spans(line: &str, markers: Vec<EmphasisMarker>) -> Vec<EmphasisSpan> {
190    // Early return for insufficient markers
191    if markers.len() < 2 {
192        return Vec::new();
193    }
194
195    let mut spans = Vec::new();
196    let mut used_markers = vec![false; markers.len()];
197
198    // Process markers in pairs more efficiently
199    for i in 0..markers.len() {
200        if used_markers[i] {
201            continue;
202        }
203
204        let opening = &markers[i];
205
206        // Look for the nearest matching closing marker using optimized search
207        for j in (i + 1)..markers.len() {
208            if used_markers[j] {
209                continue;
210            }
211
212            let closing = &markers[j];
213
214            // Quick type and count check
215            if closing.marker_type == opening.marker_type && closing.count == opening.count {
216                let content_start = opening.end_pos();
217                let content_end = closing.start_pos;
218
219                if content_end > content_start {
220                    let content = &line[content_start..content_end];
221
222                    // Optimized validation checks
223                    if is_valid_emphasis_content_fast(content) && is_valid_emphasis_span_fast(line, opening, closing) {
224                        // Quick check for crossing markers
225                        let crosses_markers = markers[i + 1..j]
226                            .iter()
227                            .any(|marker| marker.marker_type == opening.marker_type);
228
229                        if !crosses_markers {
230                            let has_leading_space = content.starts_with(' ') || content.starts_with('\t');
231                            let has_trailing_space = content.ends_with(' ') || content.ends_with('\t');
232
233                            spans.push(EmphasisSpan {
234                                opening: opening.clone(),
235                                closing: closing.clone(),
236                                content: content.to_string(),
237                                has_leading_space,
238                                has_trailing_space,
239                            });
240
241                            // Mark both markers as used
242                            used_markers[i] = true;
243                            used_markers[j] = true;
244                            break;
245                        }
246                    }
247                }
248            }
249        }
250    }
251
252    spans
253}
254
255/// Fast validation of emphasis span context
256#[inline]
257pub fn is_valid_emphasis_span_fast(line: &str, opening: &EmphasisMarker, closing: &EmphasisMarker) -> bool {
258    let content_start = opening.end_pos();
259    let content_end = closing.start_pos;
260
261    // Content must exist and not be just whitespace
262    if content_end <= content_start {
263        return false;
264    }
265
266    let content = &line[content_start..content_end];
267    if content.trim().is_empty() {
268        return false;
269    }
270
271    // Quick boundary checks using byte indexing
272    let bytes = line.as_bytes();
273
274    // Opening should be at start or after valid character
275    let valid_opening = opening.start_pos == 0
276        || matches!(
277            bytes.get(opening.start_pos.saturating_sub(1)),
278            Some(&b' ')
279                | Some(&b'\t')
280                | Some(&b'(')
281                | Some(&b'[')
282                | Some(&b'{')
283                | Some(&b'"')
284                | Some(&b'\'')
285                | Some(&b'>')
286        );
287
288    // Closing should be at end or before valid character
289    let valid_closing = closing.end_pos() >= bytes.len()
290        || matches!(
291            bytes.get(closing.end_pos()),
292            Some(&b' ')
293                | Some(&b'\t')
294                | Some(&b')')
295                | Some(&b']')
296                | Some(&b'}')
297                | Some(&b'"')
298                | Some(&b'\'')
299                | Some(&b'.')
300                | Some(&b',')
301                | Some(&b'!')
302                | Some(&b'?')
303                | Some(&b';')
304                | Some(&b':')
305                | Some(&b'<')
306        );
307
308    valid_opening && valid_closing && !content.contains('\n')
309}
310
/// Returns `true` when `content` contains at least one non-whitespace character.
#[inline]
pub fn is_valid_emphasis_content_fast(content: &str) -> bool {
    content.chars().any(|ch| !ch.is_whitespace())
}
316
/// Returns `true` when `line` matches `LIST_MARKER`, i.e. it begins (after optional leading
/// whitespace) with `*`, `+` or `-` followed by at least one whitespace character. Such a
/// line is treated as a list item rather than as emphasis.
pub fn is_likely_list_line(line: &str) -> bool {
    LIST_MARKER.is_match(line)
}
321
322/// Check if line has documentation patterns that should be preserved
323pub fn has_doc_patterns(line: &str) -> bool {
324    (QUICK_DOC_CHECK.is_match(line) || QUICK_BOLD_CHECK.is_match(line))
325        && (DOC_METADATA_PATTERN.is_match(line) || BOLD_TEXT_PATTERN.is_match(line))
326}
327
#[cfg(test)]
mod tests {
    use super::*;

    /// Marker scanning reports every single and double delimiter run.
    #[test]
    fn test_emphasis_marker_parsing() {
        let mixed = find_emphasis_markers("This has *single* and **double** emphasis");
        assert_eq!(mixed.len(), 4); // *, *, **, **

        let singles = find_emphasis_markers("*start* and *end*");
        assert_eq!(singles.len(), 4); // *, *, *, *
    }

    /// Single-emphasis pairing ignores strong (`**`) emphasis.
    #[test]
    fn test_single_emphasis_span_detection() {
        let line = "This has *valid* emphasis and **strong** too";
        let spans = find_single_emphasis_spans(line, find_emphasis_markers(line));

        assert_eq!(spans.len(), 1); // Only the single emphasis
        let only = &spans[0];
        assert_eq!(only.content, "valid");
        assert!(!only.has_leading_space);
        assert!(!only.has_trailing_space);
    }

    /// Padding inside the markers is kept in the content and flagged on both sides.
    #[test]
    fn test_emphasis_with_spaces() {
        let line = "This has * invalid * emphasis";
        let spans = find_emphasis_spans(line, find_emphasis_markers(line));

        assert_eq!(spans.len(), 1);
        let only = &spans[0];
        assert_eq!(only.content, " invalid ");
        assert!(only.has_leading_space);
        assert!(only.has_trailing_space);
    }

    /// Asterisk and underscore emphasis on the same line are paired independently.
    #[test]
    fn test_mixed_markers() {
        let line = "This has *asterisk* and _underscore_ emphasis";
        let spans = find_single_emphasis_spans(line, find_emphasis_markers(line));

        assert_eq!(spans.len(), 2);
        assert_eq!(spans[0].opening.as_char(), '*');
        assert_eq!(spans[1].opening.as_char(), '_');
    }
}
rumdl_lib/utils/emphasis_utils.rs

rumdl_lib/utils/
emphasis_utils.rs