rumdl_lib/utils/
skip_context.rs

1//! Utilities for determining if a position in markdown should be skipped from processing
2//!
3//! This module provides centralized context detection for various markdown constructs
4//! that should typically be skipped when processing rules.
5
6use crate::config::MarkdownFlavor;
7use crate::lint_context::LintContext;
8use crate::utils::kramdown_utils::is_math_block_delimiter;
9use crate::utils::mkdocs_admonitions;
10use crate::utils::mkdocs_critic;
11use crate::utils::mkdocs_footnotes;
12use crate::utils::mkdocs_snippets;
13use crate::utils::mkdocs_tabs;
14use crate::utils::mkdocstrings_refs;
15use crate::utils::regex_cache::HTML_COMMENT_PATTERN;
16use regex::Regex;
17use std::sync::LazyLock;
18
19/// Enhanced inline math pattern that handles both single $ and double $$ delimiters.
20/// Matches:
21/// - Display math: $$...$$ (zero or more non-$ characters)
22/// - Inline math: $...$ (zero or more non-$ non-newline characters)
23///
24/// The display math pattern is tried first to correctly handle $$content$$.
25/// Critically, both patterns allow ZERO characters between delimiters,
26/// so empty math like $$ or $ $ is consumed and won't pair with other $ signs.
27static INLINE_MATH_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\$\$[^$]*\$\$|\$[^$\n]*\$").unwrap());
28
/// A half-open span of bytes, `start..end`.
#[derive(Debug, Clone, Copy)]
pub struct ByteRange {
    /// Offset of the first byte in the span (inclusive).
    pub start: usize,
    /// Offset one past the last byte in the span (exclusive).
    pub end: usize,
}
35
36/// Pre-compute all HTML comment ranges in the content
37/// Returns a sorted vector of byte ranges for efficient lookup
38pub fn compute_html_comment_ranges(content: &str) -> Vec<ByteRange> {
39    HTML_COMMENT_PATTERN
40        .find_iter(content)
41        .map(|m| ByteRange {
42            start: m.start(),
43            end: m.end(),
44        })
45        .collect()
46}
47
48/// Check if a byte position is within any of the pre-computed HTML comment ranges
49/// Uses binary search for O(log n) complexity
50pub fn is_in_html_comment_ranges(ranges: &[ByteRange], byte_pos: usize) -> bool {
51    // Binary search to find a range that might contain byte_pos
52    ranges
53        .binary_search_by(|range| {
54            if byte_pos < range.start {
55                std::cmp::Ordering::Greater
56            } else if byte_pos >= range.end {
57                std::cmp::Ordering::Less
58            } else {
59                std::cmp::Ordering::Equal
60            }
61        })
62        .is_ok()
63}
64
65/// Check if a line is ENTIRELY within a single HTML comment
66/// Returns true only if both the line start AND end are within the same comment range
67pub fn is_line_entirely_in_html_comment(ranges: &[ByteRange], line_start: usize, line_end: usize) -> bool {
68    for range in ranges {
69        // If line start is within this range, check if line end is also within it
70        if line_start >= range.start && line_start < range.end {
71            return line_end <= range.end;
72        }
73    }
74    false
75}
76
/// Check if a line is within front matter (YAML `---` or TOML `+++`).
///
/// Front matter is only recognised when the very first line of `content` is
/// exactly `---` or `+++` and a later line repeats that same delimiter.
///
/// `line_num` is the zero-based line index. Every line from the opening
/// delimiter through the closing delimiter (inclusive) counts as front matter.
/// Returns `false` when the document does not start with a delimiter or the
/// block is never closed.
pub fn is_in_front_matter(content: &str, line_num: usize) -> bool {
    let mut lines = content.lines();

    // The opening delimiter must be the first line, with no surrounding whitespace.
    let delimiter = match lines.next() {
        Some(first @ ("---" | "+++")) => first,
        _ => return false,
    };

    // `position` counts from the line after the opener, so the closing
    // delimiter's zero-based line index is `offset + 1`. Scanning stops at the
    // first closing delimiter instead of materialising every line of the document.
    lines
        .position(|line| line == delimiter)
        .is_some_and(|offset| line_num <= offset + 1)
}
101
102/// Check if a byte position is within any context that should be skipped
103pub fn is_in_skip_context(ctx: &LintContext, byte_pos: usize) -> bool {
104    // Check standard code contexts
105    if ctx.is_in_code_block_or_span(byte_pos) {
106        return true;
107    }
108
109    // Check HTML comments
110    if is_in_html_comment(ctx.content, byte_pos) {
111        return true;
112    }
113
114    // Check math contexts
115    if is_in_math_context(ctx, byte_pos) {
116        return true;
117    }
118
119    // Check if in HTML tag
120    if is_in_html_tag(ctx, byte_pos) {
121        return true;
122    }
123
124    // Check MkDocs snippet sections and multi-line blocks
125    if ctx.flavor == MarkdownFlavor::MkDocs {
126        if mkdocs_snippets::is_within_snippet_section(ctx.content, byte_pos) {
127            return true;
128        }
129        if mkdocs_snippets::is_within_snippet_block(ctx.content, byte_pos) {
130            return true;
131        }
132    }
133
134    // Check MkDocs admonition blocks
135    if ctx.flavor == MarkdownFlavor::MkDocs && mkdocs_admonitions::is_within_admonition(ctx.content, byte_pos) {
136        return true;
137    }
138
139    // Check MkDocs footnote definitions
140    if ctx.flavor == MarkdownFlavor::MkDocs && mkdocs_footnotes::is_within_footnote_definition(ctx.content, byte_pos) {
141        return true;
142    }
143
144    // Check MkDocs content tabs
145    if ctx.flavor == MarkdownFlavor::MkDocs && mkdocs_tabs::is_within_tab_content(ctx.content, byte_pos) {
146        return true;
147    }
148
149    // Check MkDocstrings autodoc blocks
150    if ctx.flavor == MarkdownFlavor::MkDocs && mkdocstrings_refs::is_within_autodoc_block(ctx.content, byte_pos) {
151        return true;
152    }
153
154    // Check MkDocs Critic Markup
155    if ctx.flavor == MarkdownFlavor::MkDocs && mkdocs_critic::is_within_critic_markup(ctx.content, byte_pos) {
156        return true;
157    }
158
159    false
160}
161
162/// Check if a line should be skipped due to MkDocs snippet syntax
163pub fn is_mkdocs_snippet_line(line: &str, flavor: MarkdownFlavor) -> bool {
164    flavor == MarkdownFlavor::MkDocs && mkdocs_snippets::is_snippet_marker(line)
165}
166
167/// Check if a line is a MkDocs admonition marker
168pub fn is_mkdocs_admonition_line(line: &str, flavor: MarkdownFlavor) -> bool {
169    flavor == MarkdownFlavor::MkDocs && mkdocs_admonitions::is_admonition_marker(line)
170}
171
172/// Check if a line is a MkDocs footnote definition
173pub fn is_mkdocs_footnote_line(line: &str, flavor: MarkdownFlavor) -> bool {
174    flavor == MarkdownFlavor::MkDocs && mkdocs_footnotes::is_footnote_definition(line)
175}
176
177/// Check if a line is a MkDocs tab marker
178pub fn is_mkdocs_tab_line(line: &str, flavor: MarkdownFlavor) -> bool {
179    flavor == MarkdownFlavor::MkDocs && mkdocs_tabs::is_tab_marker(line)
180}
181
182/// Check if a line is a MkDocstrings autodoc marker
183pub fn is_mkdocstrings_autodoc_line(line: &str, flavor: MarkdownFlavor) -> bool {
184    flavor == MarkdownFlavor::MkDocs && mkdocstrings_refs::is_autodoc_marker(line)
185}
186
187/// Check if a line contains MkDocs Critic Markup
188pub fn is_mkdocs_critic_line(line: &str, flavor: MarkdownFlavor) -> bool {
189    flavor == MarkdownFlavor::MkDocs && mkdocs_critic::contains_critic_markup(line)
190}
191
192/// Check if a byte position is within an HTML comment
193pub fn is_in_html_comment(content: &str, byte_pos: usize) -> bool {
194    for m in HTML_COMMENT_PATTERN.find_iter(content) {
195        if m.start() <= byte_pos && byte_pos < m.end() {
196            return true;
197        }
198    }
199    false
200}
201
202/// Check if a byte position is within an HTML tag
203pub fn is_in_html_tag(ctx: &LintContext, byte_pos: usize) -> bool {
204    for html_tag in ctx.html_tags().iter() {
205        if html_tag.byte_offset <= byte_pos && byte_pos < html_tag.byte_end {
206            return true;
207        }
208    }
209    false
210}
211
212/// Check if a byte position is within a math context (block or inline)
213pub fn is_in_math_context(ctx: &LintContext, byte_pos: usize) -> bool {
214    let content = ctx.content;
215
216    // Check if we're in a math block
217    if is_in_math_block(content, byte_pos) {
218        return true;
219    }
220
221    // Check if we're in inline math
222    if is_in_inline_math(content, byte_pos) {
223        return true;
224    }
225
226    false
227}
228
229/// Check if a byte position is within a math block ($$...$$)
230pub fn is_in_math_block(content: &str, byte_pos: usize) -> bool {
231    let mut in_math_block = false;
232    let mut current_pos = 0;
233
234    for line in content.lines() {
235        let line_start = current_pos;
236        let line_end = current_pos + line.len();
237
238        // Check if this line is a math block delimiter
239        if is_math_block_delimiter(line) {
240            if byte_pos >= line_start && byte_pos <= line_end {
241                // Position is on the delimiter line itself
242                return true;
243            }
244            in_math_block = !in_math_block;
245        } else if in_math_block && byte_pos >= line_start && byte_pos <= line_end {
246            // Position is inside a math block
247            return true;
248        }
249
250        current_pos = line_end + 1; // +1 for newline
251    }
252
253    false
254}
255
256/// Check if a byte position is within inline math ($...$)
257pub fn is_in_inline_math(content: &str, byte_pos: usize) -> bool {
258    // Find all inline math spans
259    for m in INLINE_MATH_REGEX.find_iter(content) {
260        if m.start() <= byte_pos && byte_pos < m.end() {
261            return true;
262        }
263    }
264    false
265}
266
267/// Check if a position is within a table cell
268pub fn is_in_table_cell(ctx: &LintContext, line_num: usize, _col: usize) -> bool {
269    // Check if this line is part of a table
270    for table_row in ctx.table_rows().iter() {
271        if table_row.line == line_num {
272            // This line is part of a table
273            // For now, we'll skip the entire table row
274            // Future enhancement: check specific column boundaries
275            return true;
276        }
277    }
278    false
279}
280
/// Report whether `line` looks like part of a pipe table.
///
/// After trimming surrounding whitespace, a line qualifies when it is either:
/// - a separator row: only `|`, `-`, `:` and whitespace, with at least one `|`
///   and one `-`; or
/// - a content row: begins or ends with `|` and contains at least two `|`.
pub fn is_table_line(line: &str) -> bool {
    let row = line.trim();

    let only_separator_chars = row
        .chars()
        .all(|c| matches!(c, '|' | '-' | ':') || c.is_whitespace());
    let is_separator_row = only_separator_chars && row.contains('|') && row.contains('-');
    if is_separator_row {
        return true;
    }

    let has_edge_pipe = row.starts_with('|') || row.ends_with('|');
    has_edge_pipe && row.matches('|').count() >= 2
}
302
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_html_comment_detection() {
        let content = "Text <!-- comment --> more text";
        assert!(!is_in_html_comment(content, 0)); // Before comment
        assert!(is_in_html_comment(content, 10)); // Inside comment
        assert!(!is_in_html_comment(content, 25)); // After comment
    }

    #[test]
    fn test_is_line_entirely_in_html_comment() {
        // Multi-line comment whose closing line carries trailing content.
        // Lines: "<!--" (bytes 0-4), "comment" (bytes 5-12),
        // "--> Content after comment" (bytes 13-38).
        let multi_line = compute_html_comment_ranges("<!--\ncomment\n--> Content after comment");
        for (line_start, line_end, expected) in [(0, 4, true), (5, 12, true), (13, 38, false)] {
            assert!(is_line_entirely_in_html_comment(&multi_line, line_start, line_end) == expected);
        }

        // Single-line comment followed by other content: not entirely a comment
        let with_trailing = compute_html_comment_ranges("<!-- comment --> Not a comment");
        assert!(!is_line_entirely_in_html_comment(&with_trailing, 0, 30));

        // Single-line comment on its own: entirely a comment
        let alone = compute_html_comment_ranges("<!-- comment -->");
        assert!(is_line_entirely_in_html_comment(&alone, 0, 16));

        // Content before the comment: the line start is outside the comment
        let with_leading = compute_html_comment_ranges("Text before <!-- comment -->");
        assert!(!is_line_entirely_in_html_comment(&with_leading, 0, 28));
    }

    #[test]
    fn test_math_block_detection() {
        let content = "Text\n$$\nmath content\n$$\nmore text";
        assert!(!is_in_math_block(content, 0)); // Before math block
        assert!(is_in_math_block(content, 8)); // First byte of the math content
        assert!(is_in_math_block(content, 15)); // Inside math block
        assert!(!is_in_math_block(content, 30)); // After math block
    }

    #[test]
    fn test_inline_math_detection() {
        let content = "Text $x + y$ and $$a^2 + b^2$$ here";
        assert!(!is_in_inline_math(content, 0)); // Before math
        assert!(is_in_inline_math(content, 7)); // Inside first math
        assert!(is_in_inline_math(content, 20)); // Inside second math
        assert!(!is_in_inline_math(content, 35)); // After math
    }

    #[test]
    fn test_table_line_detection() {
        for table_line in ["| Header | Column |", "|--------|--------|", "| Cell 1 | Cell 2 |"] {
            assert!(is_table_line(table_line));
        }
        for plain_line in ["Regular text", "Just a pipe | here"] {
            assert!(!is_table_line(plain_line));
        }
    }

    #[test]
    fn test_is_in_front_matter() {
        // YAML front matter at the start of the document
        let yaml_content = r#"---
title: "My Post"
tags: ["test", "example"]
---

# Content"#;

        let yaml_cases = [
            (0, true, "Line 1 should be in YAML front matter"),
            (2, true, "Line 3 should be in YAML front matter"),
            (3, true, "Line 4 should be in YAML front matter"),
            (4, false, "Line 5 should NOT be in front matter"),
        ];
        for (line_num, expected, message) in yaml_cases {
            assert!(is_in_front_matter(yaml_content, line_num) == expected, "{message}");
        }

        // TOML front matter at the start of the document
        let toml_content = r#"+++
title = "My Post"
tags = ["test", "example"]
+++

# Content"#;

        let toml_cases = [
            (0, true, "Line 1 should be in TOML front matter"),
            (2, true, "Line 3 should be in TOML front matter"),
            (3, true, "Line 4 should be in TOML front matter"),
            (4, false, "Line 5 should NOT be in front matter"),
        ];
        for (line_num, expected, message) in toml_cases {
            assert!(is_in_front_matter(toml_content, line_num) == expected, "{message}");
        }

        // A TOML-style block that is not at the beginning is not front matter
        let mixed_content = r#"# Content

+++
title = "Not frontmatter"
+++

More content"#;

        for line_num in [2, 3, 4] {
            assert!(
                !is_in_front_matter(mixed_content, line_num),
                "TOML block not at beginning should NOT be front matter"
            );
        }
    }
}