rumdl_lib/utils/
emphasis_utils.rs

1use lazy_static::lazy_static;
2use regex::Regex;
3
4lazy_static! {
5    // Front matter detection
6    static ref FRONT_MATTER_DELIM: Regex = Regex::new(r"^---\s*$").unwrap();
7
8    // Better detection of inline code with support for multiple backticks
9    static ref INLINE_CODE: Regex = Regex::new(r"(`+)([^`]|[^`].*?[^`])(`+)").unwrap();
10
11    // List markers pattern - used to avoid confusion with emphasis
12    static ref LIST_MARKER: Regex = Regex::new(r"^\s*[*+-]\s+").unwrap();
13
14    // Valid emphasis at start of line that should not be treated as lists
15    static ref VALID_START_EMPHASIS: Regex = Regex::new(r"^(\*\*[^*\s]|\*[^*\s]|__[^_\s]|_[^_\s])").unwrap();
16
17    // Documentation style patterns
18    static ref DOC_METADATA_PATTERN: Regex = Regex::new(r"^\s*\*?\s*\*\*[^*]+\*\*\s*:").unwrap();
19
20    // Bold text pattern (for preserving bold text in documentation) - only match valid bold without spaces
21    static ref BOLD_TEXT_PATTERN: Regex = Regex::new(r"\*\*[^*\s][^*]*[^*\s]\*\*|\*\*[^*\s]\*\*").unwrap();
22
23    // Pre-compiled patterns for quick checks
24    static ref QUICK_DOC_CHECK: Regex = Regex::new(r"^\s*\*\s+\*").unwrap();
25    static ref QUICK_BOLD_CHECK: Regex = Regex::new(r"\*\*[^*\s]").unwrap();
26}
27
/// A run of emphasis delimiter characters found in a line of text.
#[derive(Debug, Clone, PartialEq)]
pub struct EmphasisMarker {
    /// Delimiter byte: `b'*'` or `b'_'` (stored as a byte for faster comparison).
    pub marker_type: u8,
    /// Length of the run: 1 for single, 2 for double.
    pub count: u8,
    /// Position of the run's first character in the line.
    pub start_pos: usize,
}
35
36impl EmphasisMarker {
37    #[inline]
38    pub fn end_pos(&self) -> usize {
39        self.start_pos + self.count as usize
40    }
41
42    #[inline]
43    pub fn as_char(&self) -> char {
44        self.marker_type as char
45    }
46}
47
48/// Represents a complete emphasis span
49#[derive(Debug, Clone)]
50pub struct EmphasisSpan {
51    pub opening: EmphasisMarker,
52    pub closing: EmphasisMarker,
53    pub content: String,
54    pub has_leading_space: bool,
55    pub has_trailing_space: bool,
56}
57
/// Mask inline code spans so their contents cannot be mistaken for emphasis.
///
/// Every byte of a code span, delimiters included, is replaced with `X`, so the result has the
/// same byte length as `line` and positions computed on it remain valid for `line`. `X` is used
/// rather than a space so that masked code never looks like whitespace inside emphasis.
///
/// A code span opens with a run of backticks and closes at the next run of backticks of exactly
/// the same length (so ``` ``a`b`` ``` is one span). Backtick runs without a matching closing
/// run are left untouched. Backslash escapes are not interpreted.
#[inline]
pub fn replace_inline_code(line: &str) -> String {
    // Quick exit: without backticks there is nothing to mask.
    if !line.contains('`') {
        return line.to_string();
    }

    let bytes = line.as_bytes();
    let mut masked = String::with_capacity(line.len());
    // Everything before `copied_to` has already been written to `masked`.
    let mut copied_to = 0;
    let mut pos = 0;

    while pos < bytes.len() {
        if bytes[pos] != b'`' {
            pos += 1;
            continue;
        }

        let open_len = backtick_run_len(bytes, pos);
        let open_end = pos + open_len;

        match find_closing_backtick_run(bytes, open_end, open_len) {
            Some(span_end) => {
                // `pos` and `span_end` sit next to ASCII backticks, so both slices below fall
                // on character boundaries.
                masked.push_str(&line[copied_to..pos]);
                masked.extend(std::iter::repeat('X').take(span_end - pos));
                copied_to = span_end;
                pos = span_end;
            }
            // No closing run of the same length: the backticks are literal text.
            None => pos = open_end,
        }
    }

    masked.push_str(&line[copied_to..]);
    masked
}

/// Length of the run of consecutive backticks that begins at `start`.
fn backtick_run_len(bytes: &[u8], start: usize) -> usize {
    bytes[start..].iter().take_while(|&&b| b == b'`').count()
}

/// Index just past the first backtick run of exactly `run_len` backticks at or after `from`.
fn find_closing_backtick_run(bytes: &[u8], from: usize, run_len: usize) -> Option<usize> {
    let mut pos = from;
    while pos < bytes.len() {
        if bytes[pos] == b'`' {
            let len = backtick_run_len(bytes, pos);
            if len == run_len {
                return Some(pos + len);
            }
            pos += len;
        } else {
            pos += 1;
        }
    }
    None
}
86
87/// Optimized emphasis marker parsing using byte iteration
88#[inline]
89pub fn find_emphasis_markers(line: &str) -> Vec<EmphasisMarker> {
90    // Early return for lines without emphasis markers
91    if !line.contains('*') && !line.contains('_') {
92        return Vec::new();
93    }
94
95    let mut markers = Vec::new();
96    let bytes = line.as_bytes();
97    let mut i = 0;
98
99    while i < bytes.len() {
100        let byte = bytes[i];
101        if byte == b'*' || byte == b'_' {
102            let start_pos = i;
103            let mut count = 1u8;
104
105            // Count consecutive markers (limit to avoid overflow)
106            while i + (count as usize) < bytes.len() && bytes[i + (count as usize)] == byte && count < 3 {
107                count += 1;
108            }
109
110            // Only consider single (*) and double (**) markers
111            if count == 1 || count == 2 {
112                markers.push(EmphasisMarker {
113                    marker_type: byte,
114                    count,
115                    start_pos,
116                });
117            }
118
119            i += count as usize;
120        } else {
121            i += 1;
122        }
123    }
124
125    markers
126}
127
128/// Find all emphasis spans in a line, excluding only single emphasis (not strong)
129pub fn find_single_emphasis_spans(line: &str, markers: Vec<EmphasisMarker>) -> Vec<EmphasisSpan> {
130    // Early return for insufficient markers
131    if markers.len() < 2 {
132        return Vec::new();
133    }
134
135    let mut spans = Vec::new();
136    let mut used_markers = vec![false; markers.len()];
137
138    // Process markers in pairs more efficiently
139    for i in 0..markers.len() {
140        if used_markers[i] || markers[i].count != 1 {
141            continue;
142        }
143
144        let opening = &markers[i];
145
146        // Look for the nearest matching closing marker using optimized search
147        for j in (i + 1)..markers.len() {
148            if used_markers[j] {
149                continue;
150            }
151
152            let closing = &markers[j];
153
154            // Quick type and count check - only single emphasis
155            if closing.marker_type == opening.marker_type && closing.count == 1 {
156                let content_start = opening.end_pos();
157                let content_end = closing.start_pos;
158
159                if content_end > content_start {
160                    let content = &line[content_start..content_end];
161
162                    // Optimized validation checks
163                    if is_valid_emphasis_content_fast(content) && is_valid_emphasis_span_fast(line, opening, closing) {
164                        // Quick check for crossing markers
165                        let crosses_markers = markers[i + 1..j]
166                            .iter()
167                            .any(|marker| marker.marker_type == opening.marker_type && marker.count == 1);
168
169                        if !crosses_markers {
170                            let has_leading_space = content.starts_with(' ') || content.starts_with('\t');
171                            let has_trailing_space = content.ends_with(' ') || content.ends_with('\t');
172
173                            spans.push(EmphasisSpan {
174                                opening: opening.clone(),
175                                closing: closing.clone(),
176                                content: content.to_string(),
177                                has_leading_space,
178                                has_trailing_space,
179                            });
180
181                            // Mark both markers as used
182                            used_markers[i] = true;
183                            used_markers[j] = true;
184                            break;
185                        }
186                    }
187                }
188            }
189        }
190    }
191
192    spans
193}
194
195/// Optimized emphasis span finding with reduced complexity (includes both single and strong)
196pub fn find_emphasis_spans(line: &str, markers: Vec<EmphasisMarker>) -> Vec<EmphasisSpan> {
197    // Early return for insufficient markers
198    if markers.len() < 2 {
199        return Vec::new();
200    }
201
202    let mut spans = Vec::new();
203    let mut used_markers = vec![false; markers.len()];
204
205    // Process markers in pairs more efficiently
206    for i in 0..markers.len() {
207        if used_markers[i] {
208            continue;
209        }
210
211        let opening = &markers[i];
212
213        // Look for the nearest matching closing marker using optimized search
214        for j in (i + 1)..markers.len() {
215            if used_markers[j] {
216                continue;
217            }
218
219            let closing = &markers[j];
220
221            // Quick type and count check
222            if closing.marker_type == opening.marker_type && closing.count == opening.count {
223                let content_start = opening.end_pos();
224                let content_end = closing.start_pos;
225
226                if content_end > content_start {
227                    let content = &line[content_start..content_end];
228
229                    // Optimized validation checks
230                    if is_valid_emphasis_content_fast(content) && is_valid_emphasis_span_fast(line, opening, closing) {
231                        // Quick check for crossing markers
232                        let crosses_markers = markers[i + 1..j]
233                            .iter()
234                            .any(|marker| marker.marker_type == opening.marker_type);
235
236                        if !crosses_markers {
237                            let has_leading_space = content.starts_with(' ') || content.starts_with('\t');
238                            let has_trailing_space = content.ends_with(' ') || content.ends_with('\t');
239
240                            spans.push(EmphasisSpan {
241                                opening: opening.clone(),
242                                closing: closing.clone(),
243                                content: content.to_string(),
244                                has_leading_space,
245                                has_trailing_space,
246                            });
247
248                            // Mark both markers as used
249                            used_markers[i] = true;
250                            used_markers[j] = true;
251                            break;
252                        }
253                    }
254                }
255            }
256        }
257    }
258
259    spans
260}
261
262/// Fast validation of emphasis span context
263#[inline]
264pub fn is_valid_emphasis_span_fast(line: &str, opening: &EmphasisMarker, closing: &EmphasisMarker) -> bool {
265    let content_start = opening.end_pos();
266    let content_end = closing.start_pos;
267
268    // Content must exist and not be just whitespace
269    if content_end <= content_start {
270        return false;
271    }
272
273    let content = &line[content_start..content_end];
274    if content.trim().is_empty() {
275        return false;
276    }
277
278    // Quick boundary checks using byte indexing
279    let bytes = line.as_bytes();
280
281    // Opening should be at start or after valid character
282    let valid_opening = opening.start_pos == 0
283        || matches!(
284            bytes.get(opening.start_pos.saturating_sub(1)),
285            Some(&b' ')
286                | Some(&b'\t')
287                | Some(&b'(')
288                | Some(&b'[')
289                | Some(&b'{')
290                | Some(&b'"')
291                | Some(&b'\'')
292                | Some(&b'>')
293        );
294
295    // Closing should be at end or before valid character
296    let valid_closing = closing.end_pos() >= bytes.len()
297        || matches!(
298            bytes.get(closing.end_pos()),
299            Some(&b' ')
300                | Some(&b'\t')
301                | Some(&b')')
302                | Some(&b']')
303                | Some(&b'}')
304                | Some(&b'"')
305                | Some(&b'\'')
306                | Some(&b'.')
307                | Some(&b',')
308                | Some(&b'!')
309                | Some(&b'?')
310                | Some(&b';')
311                | Some(&b':')
312                | Some(&b'<')
313        );
314
315    valid_opening && valid_closing && !content.contains('\n')
316}
317
/// Report whether `content` holds at least one non-whitespace character.
#[inline]
pub fn is_valid_emphasis_content_fast(content: &str) -> bool {
    content.chars().any(|c| !c.is_whitespace())
}
323
/// Decide whether a line looks like a list item rather than emphasis.
///
/// True when, after any leading whitespace, the line has a bullet character (`*`, `+` or `-`)
/// that is immediately followed by whitespace.
pub fn is_likely_list_line(line: &str) -> bool {
    let mut rest = line.trim_start().chars();
    match (rest.next(), rest.next()) {
        (Some(bullet), Some(after)) => matches!(bullet, '*' | '+' | '-') && after.is_whitespace(),
        _ => false,
    }
}
328
329/// Check if line has documentation patterns that should be preserved
330pub fn has_doc_patterns(line: &str) -> bool {
331    (QUICK_DOC_CHECK.is_match(line) || QUICK_BOLD_CHECK.is_match(line))
332        && (DOC_METADATA_PATTERN.is_match(line) || BOLD_TEXT_PATTERN.is_match(line))
333}
334
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_emphasis_marker_parsing() {
        // Two single markers followed by two double markers.
        let mixed = find_emphasis_markers("This has *single* and **double** emphasis");
        assert_eq!(mixed.len(), 4);

        // Four single markers.
        let singles = find_emphasis_markers("*start* and *end*");
        assert_eq!(singles.len(), 4);
    }

    #[test]
    fn test_single_emphasis_span_detection() {
        let line = "This has *valid* emphasis and **strong** too";
        let spans = find_single_emphasis_spans(line, find_emphasis_markers(line));

        // The strong span must not be reported.
        assert_eq!(spans.len(), 1);
        let span = &spans[0];
        assert_eq!(span.content, "valid");
        assert!(!span.has_leading_space);
        assert!(!span.has_trailing_space);
    }

    #[test]
    fn test_emphasis_with_spaces() {
        let line = "This has * invalid * emphasis";
        let spans = find_emphasis_spans(line, find_emphasis_markers(line));

        assert_eq!(spans.len(), 1);
        let span = &spans[0];
        assert_eq!(span.content, " invalid ");
        assert!(span.has_leading_space);
        assert!(span.has_trailing_space);
    }

    #[test]
    fn test_mixed_markers() {
        let line = "This has *asterisk* and _underscore_ emphasis";
        let spans = find_single_emphasis_spans(line, find_emphasis_markers(line));

        assert_eq!(spans.len(), 2);
        assert_eq!(spans[0].opening.as_char(), '*');
        assert_eq!(spans[1].opening.as_char(), '_');
    }
}
rumdl_lib/utils/emphasis_utils.rs

rumdl_lib/utils/
emphasis_utils.rs