Skip to main content

rumdl_lib/utils/
mkdocs_snippets.rs

1use regex::Regex;
2/// MkDocs Snippets extension detection utilities
3///
4/// The Snippets extension allows including content from external files
5/// using ASCII scissors syntax: `--8<--`
6///
7/// Common patterns:
8/// - `--8<-- "filename.md"` - Include entire file
9/// - `--8<-- "filename.md:start:end"` - Include specific lines
10/// - `<!-- --8<-- [start:section] -->` - Start marker for section
11/// - `<!-- --8<-- [end:section] -->` - End marker for section
12///
13use std::sync::LazyLock;
14
15/// Pattern to match valid snippet markers: -{1,}8<-{1,}
16/// Based on PyMdown Extensions Snippets specification
17static BARE_SNIPPET_MARKER: LazyLock<Regex> = LazyLock::new(|| {
18    Regex::new(
19        r"^;*-+8<-+$", // Optional semicolons, then dashes-8<-dashes only
20    )
21    .unwrap()
22});
23
24/// Pattern to match snippet with quoted file path
25/// Lenient: accepts unclosed quotes for detection (can warn later)
26static SNIPPET_WITH_FILE: LazyLock<Regex> = LazyLock::new(|| {
27    Regex::new(
28        r#"-+8<-+\s+["']"#, // Just check for quote after snippet marker
29    )
30    .unwrap()
31});
32
33/// Pattern to match section markers
34static SECTION_MARKER: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"-+8<-+\s*\[(start|end):[^\]]*\]").unwrap());
35
36/// Pattern to match invalid asymmetric marker
37static INVALID_ASYMMETRIC: LazyLock<Regex> = LazyLock::new(|| {
38    Regex::new(
39        r#"(?:^|\s)--8<-\s+["']"#, // --8<- followed by quote is invalid
40    )
41    .unwrap()
42});
43
44/// Check if a line contains MkDocs snippet syntax
45pub fn is_snippet_marker(line: &str) -> bool {
46    // PyMdown Snippets spec says: -{1,}8<-{1,} (symmetric dashes)
47    // We're lenient with unclosed quotes for detection (to warn later)
48
49    // Check for known invalid asymmetric patterns
50    // IMPORTANT: -8<-- as a standalone marker is invalid, but it appears
51    // as a substring in valid --8<-- markers!
52    // Only reject if -8<-- appears without a leading dash
53    if !line.contains("--8<--") && !line.contains("---8<---") {
54        // Only check for invalid patterns if we don't have valid ones
55        if line.contains("-8<-- ") || line.contains("-8<--\"") || line.contains("-8<--'") {
56            return false; // -8<-- is invalid when not part of --8<--
57        }
58    }
59    if INVALID_ASYMMETRIC.is_match(line) {
60        return false; // --8<- with file is invalid (asymmetric)
61    }
62
63    let trimmed = line.trim();
64
65    // Check for single-line snippet with file
66    // Be lenient: accept if line has valid marker and a quote anywhere
67    let has_valid_marker = line.contains("--8<--")
68        || line.contains("---8<---")
69        || (line.contains("-8<-") && !line.contains("-8<--") && !line.contains("--8<-"));
70
71    if has_valid_marker && (line.contains('"') || line.contains('\'')) {
72        return true;
73    }
74
75    // Check for section markers: --8<-- [start:name] or [end:name]
76    // Also check with comment prefixes: # -8<- [start:name]
77    if SECTION_MARKER.is_match(trimmed) {
78        return true;
79    }
80
81    // Check for comment-prefixed section markers (# -8<- [start:name])
82    let without_comment = trimmed.trim_start_matches(['#', ';', '/', '*']).trim_start();
83    if (without_comment.starts_with("-8<-")
84        || without_comment.starts_with("--8<--")
85        || without_comment.starts_with("---8<---"))
86        && (without_comment.contains("[start:") || without_comment.contains("[end:"))
87    {
88        return true;
89    }
90
91    // Block format: bare marker is valid for multi-line snippet blocks
92    // According to PyMdown Extensions spec, bare markers like --8<-- on their own line
93    // are valid when used as opening/closing delimiters for multi-file blocks:
94    // --8<--
95    // file1.md
96    // file2.md
97    // --8<--
98    // Check for trailing whitespace (space or tab) BEFORE trimming
99    let trimmed_start = line.trim_start();
100    let has_trailing_whitespace = trimmed_start.ends_with(' ') || trimmed_start.ends_with('\t');
101    if BARE_SNIPPET_MARKER.is_match(trimmed) && !has_trailing_whitespace {
102        return true; // Valid bare marker for block format
103    }
104
105    // HTML comment variations
106    if line.contains("<!--") && line.contains("-->") && line.contains("8<") {
107        // Check various patterns within comments
108        if SNIPPET_WITH_FILE.is_match(line) {
109            return true;
110        }
111        if SECTION_MARKER.is_match(line) {
112            return true;
113        }
114        // Don't accept bare snippet markers in comments without content
115        // <!-- --8<-- --> is not valid (no file or section)
116    }
117
118    false
119}
120
/// Report whether `line` is a snippet section *start* marker.
///
/// Accepted shapes include:
/// - `<!-- --8<-- [start:section_name] -->`
/// - `--8<-- [start:section_name]`
/// - `-8<- [start:section_name]`
/// - `# -8<- [start:section_name]` (comment form for source files)
/// - `; -8<- [start:section_name]` (comment form for ini files)
///
/// The line must contain `[start:` with a `]` somewhere after it, and — once an
/// enclosing HTML comment or leading `#`/`;`/`/`/`*` characters are removed — must
/// begin with `-8<-`, `--8<--` or `---8<---`. Empty section names are accepted.
pub fn is_snippet_section_start(line: &str) -> bool {
    // Bracket structure first: an opening tag that is closed later on the line.
    let Some(tag_at) = line.find("[start:") else {
        return false;
    };
    if !line[tag_at..].contains(']') {
        return false;
    }

    let stripped = line.trim();
    let wrapped_in_html = stripped.starts_with("<!--") && stripped.ends_with("-->");
    let candidate = if wrapped_in_html {
        // Peel the HTML comment delimiters off both ends.
        stripped
            .trim_start_matches("<!--")
            .trim_end_matches("-->")
            .trim()
    } else {
        // Peel leading comment characters used by other file types.
        stripped.trim_start_matches(['#', ';', '/', '*']).trim_start()
    };

    ["--8<--", "-8<-", "---8<---"]
        .iter()
        .any(|&marker| candidate.starts_with(marker))
}
158
/// Report whether `line` is a snippet section *end* marker.
///
/// Accepted shapes include:
/// - `<!-- --8<-- [end:section_name] -->`
/// - `--8<-- [end:section_name]`
/// - `-8<- [end:section_name]`
/// - `# -8<- [end:section_name]` (comment form for source files)
/// - `; -8<- [end:section_name]` (comment form for ini files)
///
/// The line must contain `[end:` with a `]` somewhere after it, and — once an
/// enclosing HTML comment or leading `#`/`;`/`/`/`*` characters are removed — must
/// begin with `-8<-`, `--8<--` or `---8<---`. Empty section names are accepted.
pub fn is_snippet_section_end(line: &str) -> bool {
    // Bracket structure first: an opening tag that is closed later on the line.
    let Some(tag_at) = line.find("[end:") else {
        return false;
    };
    if !line[tag_at..].contains(']') {
        return false;
    }

    let stripped = line.trim();
    let wrapped_in_html = stripped.starts_with("<!--") && stripped.ends_with("-->");
    let candidate = if wrapped_in_html {
        // Peel the HTML comment delimiters off both ends.
        stripped
            .trim_start_matches("<!--")
            .trim_end_matches("-->")
            .trim()
    } else {
        // Peel leading comment characters used by other file types.
        stripped.trim_start_matches(['#', ';', '/', '*']).trim_start()
    };

    ["--8<--", "-8<-", "---8<---"]
        .iter()
        .any(|&marker| candidate.starts_with(marker))
}
196
197/// Check if a position is within a snippet section
198pub fn is_within_snippet_section(content: &str, position: usize) -> bool {
199    let lines: Vec<&str> = content.lines().collect();
200    let mut byte_pos = 0;
201    let mut section_stack: Vec<String> = Vec::new();
202
203    for line in lines {
204        let line_end = byte_pos + line.len();
205
206        // Check if we're starting a snippet section
207        if is_snippet_section_start(line) {
208            // Extract section name for matching
209            if let Some(start) = line.find("[start:")
210                && let Some(end) = line[start..].find(']')
211            {
212                let section_name = line[start + 7..start + end].to_string();
213                section_stack.push(section_name);
214            }
215        }
216
217        // Check if we're ending a snippet section
218        if is_snippet_section_end(line) {
219            // Check if section names match
220            if let Some(start) = line.find("[end:")
221                && let Some(end) = line[start..].find(']')
222            {
223                let end_section_name = &line[start + 5..start + end];
224                // Pop the matching section from the stack
225                if let Some(last_section) = section_stack.last()
226                    && last_section == end_section_name
227                {
228                    section_stack.pop();
229                }
230            }
231        }
232
233        // Check if position is within this line and we're in any snippet section
234        if byte_pos <= position && position <= line_end && !section_stack.is_empty() {
235            return true;
236        }
237
238        // Account for newline character
239        byte_pos = line_end + 1;
240    }
241
242    false
243}
244
/// Report whether `text` contains a snippet marker (`--8<--` or `-8<-`) and so may be
/// a snippet reference rather than a genuinely broken link.
pub fn looks_like_snippet_reference(text: &str) -> bool {
    // Conservative substring test only; no validation of what follows the marker.
    ["--8<--", "-8<-"].iter().any(|&marker| text.contains(marker))
}
250
251/// Check if a line is a bare snippet block delimiter (for multi-line blocks)
252pub fn is_snippet_block_delimiter(line: &str) -> bool {
253    let trimmed = line.trim();
254    // Check for trailing whitespace (space or tab) BEFORE full trim
255    let trimmed_start = line.trim_start();
256    let has_trailing_whitespace = trimmed_start.ends_with(' ') || trimmed_start.ends_with('\t');
257    // Bare markers without trailing whitespace are valid block delimiters
258    BARE_SNIPPET_MARKER.is_match(trimmed) && !has_trailing_whitespace
259}
260
261/// Check if a position is within a multi-line snippet block
262/// Multi-line blocks have the format:
263/// --8<--
264/// file1.md
265/// file2.md
266/// --8<--
267pub fn is_within_snippet_block(content: &str, position: usize) -> bool {
268    let lines: Vec<&str> = content.lines().collect();
269    let mut byte_pos = 0;
270    let mut in_block = false;
271
272    for line in lines {
273        let line_end = byte_pos + line.len();
274
275        // Check if this is a block delimiter that toggles state
276        if is_snippet_block_delimiter(line) {
277            if byte_pos <= position && position <= line_end {
278                // The position is on the delimiter itself
279                return true;
280            }
281            in_block = !in_block;
282        }
283
284        // Check if position is within this line and we're in a block
285        if in_block && byte_pos <= position && position <= line_end {
286            return true;
287        }
288
289        // Move to next line (account for newline character)
290        byte_pos = line_end + 1;
291    }
292
293    false
294}
295
#[cfg(test)]
mod tests {
    use super::*;

    /// Byte offset of the first occurrence of `needle`; panics if the fixture lacks it.
    fn offset_of(haystack: &str, needle: &str) -> usize {
        haystack.find(needle).unwrap()
    }

    #[test]
    fn test_snippet_marker_detection() {
        let accepted = [
            // Snippets with file paths
            "--8<-- \"file.md\"",
            "--8<-- 'file.md'",
            "  --8<-- \"indented.md\"  ",
            "<!-- --8<-- \"file.md\" -->",
            // Bare markers delimit multi-line blocks
            "--8<--",
            "-8<-",
            "---8<---",
            // Section markers
            "--8<-- [start:section]",
            "--8<-- [end:section]",
            "<!-- --8<-- [start:test] -->",
        ];
        for line in accepted {
            assert!(is_snippet_marker(line), "should be a snippet marker: {line:?}");
        }

        let rejected = [
            "--8<-- ",         // Trailing space suggests a missing file
            "<!-- --8<-- -->", // HTML comment with a marker but no content
        ];
        for line in rejected {
            assert!(!is_snippet_marker(line), "should not be a snippet marker: {line:?}");
        }
    }

    #[test]
    fn test_section_markers() {
        let valid_starts = [
            "<!-- --8<-- [start:intro] -->",
            "--8<-- [start:code]",
            "-8<- [start:example]",
            "# -8<- [start:remote-content]", // Comment style
            "<!-- --8<-- [start:] -->",      // Empty name allowed (lenient detection)
        ];
        for line in valid_starts {
            assert!(is_snippet_section_start(line), "should be a section start: {line:?}");
        }

        let invalid_starts = [
            "--8<-- [start", // Missing closing bracket
            "[start:test]",  // Missing snippet marker
        ];
        for line in invalid_starts {
            assert!(!is_snippet_section_start(line), "should not be a section start: {line:?}");
        }

        let valid_ends = [
            "<!-- --8<-- [end:intro] -->",
            "--8<-- [end:code]",
            "<!-- --8<-- [end:] -->", // Empty name allowed (lenient detection)
        ];
        for line in valid_ends {
            assert!(is_snippet_section_end(line), "should be a section end: {line:?}");
        }

        // Missing closing bracket
        assert!(!is_snippet_section_end("--8<-- [end"));
    }

    #[test]
    fn test_within_snippet_section() {
        let content = r#"# Document

Normal content here.

<!-- --8<-- [start:example] -->
This content is within a snippet section.
It should be detected as such.
<!-- --8<-- [end:example] -->

This is outside the snippet section.

<!-- --8<-- [start:another] -->
Another snippet section.
<!-- --8<-- [end:another] -->
"#;

        // (text to locate, whether it sits inside a section)
        let cases = [
            ("within a snippet", true),
            ("outside the snippet", false),
            ("Another snippet", true),
        ];
        for (needle, expected) in cases {
            let pos = offset_of(content, needle);
            assert_eq!(is_within_snippet_section(content, pos), expected, "{needle:?}");
        }
    }

    #[test]
    fn test_nested_snippet_sections() {
        let content = r#"<!-- --8<-- [start:outer] -->
Outer content.
<!-- --8<-- [start:inner] -->
Inner content.
<!-- --8<-- [end:inner] -->
Back to outer.
<!-- --8<-- [end:outer] -->
Outside."#;

        // (text to locate, whether it sits inside a section)
        let cases = [
            ("Outer content", true),
            ("Inner content", true),
            ("Back to outer", true),
            ("Outside", false),
        ];
        for (needle, expected) in cases {
            let pos = offset_of(content, needle);
            assert_eq!(is_within_snippet_section(content, pos), expected, "{needle:?}");
        }
    }

    #[test]
    fn test_multi_line_snippet_blocks() {
        let content = r#"# Document

Some content before.

--8<--
file1.md
file2.md
https://raw.githubusercontent.com/example/repo/main/file.md
--8<--

Some content after.

-8<-
another_file.txt
-8<-

More content.
"#;

        // Content inside either block
        let inside = [
            ("file1.md", "file1.md should be in block"),
            ("file2.md", "file2.md should be in block"),
            ("https://raw.githubusercontent.com", "URL should be in block"),
            ("another_file.txt", "another_file.txt should be in block"),
        ];
        for (needle, message) in inside {
            assert!(is_within_snippet_block(content, offset_of(content, needle)), "{message}");
        }

        // The delimiters of the first block count as part of it
        let first_delimiter = offset_of(content, "--8<--");
        let second_delimiter = content.rfind("--8<--").unwrap();
        assert!(
            is_within_snippet_block(content, first_delimiter),
            "First delimiter should be detected"
        );
        assert!(
            is_within_snippet_block(content, second_delimiter),
            "Second delimiter should be detected"
        );

        // Content before, between and after the blocks
        let outside = [
            ("Some content before", "Content before block should not be detected"),
            ("Some content after", "Content between blocks should not be detected"),
            ("More content", "Content after blocks should not be detected"),
        ];
        for (needle, message) in outside {
            assert!(!is_within_snippet_block(content, offset_of(content, needle)), "{message}");
        }
    }

    #[test]
    fn test_snippet_block_delimiter() {
        let valid = [
            "--8<--",
            "-8<-",
            "---8<---",
            "  --8<--", // Leading whitespace only is OK
            "\t--8<--", // Leading tabs only is OK
        ];
        for line in valid {
            assert!(is_snippet_block_delimiter(line), "should be a delimiter: {line:?}");
        }

        let invalid = [
            "  --8<--  ",      // Trailing whitespace
            "\t-8<-\t",        // Trailing tab
            "--8<-- ",         // Trailing space
            "--8<-- file.md",  // With content
            "<!-- --8<-- -->", // In HTML comment
        ];
        for line in invalid {
            assert!(!is_snippet_block_delimiter(line), "should not be a delimiter: {line:?}");
        }
    }
}
491}