rumdl_lib/utils/
mkdocstrings_refs.rs

//! MkDocstrings cross-references detection utilities
//!
//! MkDocstrings provides automatic cross-references to documented code objects
//! using special syntax patterns for Python, JavaScript, and other languages.
//!
//! Common patterns:
//! - `::: module.Class` - Auto-doc insertion
//! - `[module.Class][]` - Cross-reference link
//! - `[text][module.Class]` - Cross-reference with custom text
//! - `::: module.Class` followed by an indented YAML options block
11use lazy_static::lazy_static;
12use regex::Regex;
13
lazy_static! {
    /// Pattern to match auto-doc insertion markers such as
    /// `::: module.path.ClassName` or `::: handler:module.path`.
    ///
    /// Shape: optional leading whitespace (capture group 1), `:::`, at least
    /// one whitespace character, then at least one non-whitespace character.
    /// Lenient on purpose: anything non-whitespace after `:::` is accepted so
    /// that potentially dangerous patterns are still detected. Security
    /// validation should happen at a different layer (e.g., a specific rule).
    static ref AUTODOC_MARKER: Regex = Regex::new(
        r"^(\s*):::\s+\S+.*$"  // Just need non-whitespace after :::
    ).unwrap();

    /// Pattern to match cross-reference links.
    ///
    /// Two forms are matched anywhere in the haystack:
    /// - `[text][module.Class]` - the text part may be empty
    /// - `[module.Class][]`
    ///
    /// The reference is one or more identifier segments joined by `.` or `:`
    /// (a single segment without any separator also matches). A bare
    /// `[module.Class]` with no second bracket pair is NOT matched.
    static ref CROSSREF_PATTERN: Regex = Regex::new(
        r"\[(?:[^\]]*)\]\[[a-zA-Z_][a-zA-Z0-9_]*(?:[:\.][a-zA-Z_][a-zA-Z0-9_]*)*\]|\[[a-zA-Z_][a-zA-Z0-9_]*(?:[:\.][a-zA-Z_][a-zA-Z0-9_]*)*\]\[\]"
    ).unwrap();

    /// Pattern to match handler options in YAML format (indented under `:::`):
    /// at least four whitespace characters (capture group 1), a word, then `:`.
    ///
    /// NOTE(review): not referenced by any function visible in this file.
    static ref HANDLER_OPTIONS: Regex = Regex::new(
        r"^(\s{4,})\w+:"
    ).unwrap();

    /// Pattern to validate module/class names: identifier segments joined by
    /// single dots, anchored to the whole string (colons are not allowed here).
    ///
    /// NOTE(review): not referenced by any function visible in this file.
    static ref VALID_MODULE_PATH: Regex = Regex::new(
        r"^[a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*$"
    ).unwrap();
}
39
40/// Check if a line is an auto-doc insertion marker
41pub fn is_autodoc_marker(line: &str) -> bool {
42    // First check with regex
43    if !AUTODOC_MARKER.is_match(line) {
44        return false;
45    }
46
47    // Additional validation: reject obviously malformed paths
48    // like consecutive dots (module..Class) which Python/JS would reject
49    let trimmed = line.trim();
50    if let Some(start) = trimmed.find(":::") {
51        let after_marker = &trimmed[start + 3..].trim();
52        // Get the module path (first non-whitespace token)
53        if let Some(module_path) = after_marker.split_whitespace().next() {
54            // Reject paths with consecutive dots/colons or starting/ending with separator
55            if module_path.starts_with('.') || module_path.starts_with(':') {
56                return false; // Can't start with separator
57            }
58            if module_path.ends_with('.') || module_path.ends_with(':') {
59                return false; // Can't end with separator
60            }
61            if module_path.contains("..")
62                || module_path.contains("::")
63                || module_path.contains(".:")
64                || module_path.contains(":.")
65            {
66                return false; // No consecutive separators
67            }
68        }
69    }
70
71    // For a linter, we want to be lenient and detect most autodoc-like syntax
72    // even if it contains dangerous or potentially invalid module paths
73    // A separate rule can validate and warn about dangerous patterns
74    true
75}
76
/// Check if a line contains cross-reference links.
///
/// Returns `true` when `CROSSREF_PATTERN` matches anywhere in `line`, i.e. the
/// line contains `[text][module.Class]` or `[module.Class][]` style links.
pub fn contains_crossref(line: &str) -> bool {
    CROSSREF_PATTERN.is_match(line)
}
81
82/// Get the indentation level of an autodoc marker
83pub fn get_autodoc_indent(line: &str) -> Option<usize> {
84    if AUTODOC_MARKER.is_match(line) {
85        // Use consistent indentation calculation (tabs = 4 spaces)
86        return Some(super::mkdocs_common::get_line_indent(line));
87    }
88    None
89}
90
91/// Check if a line is part of autodoc options (YAML format)
92pub fn is_autodoc_options(line: &str, base_indent: usize) -> bool {
93    // Options must be indented at least 4 spaces more than the ::: marker
94    let line_indent = super::mkdocs_common::get_line_indent(line);
95
96    // Check if properly indented (at least 4 spaces from base)
97    if line_indent >= base_indent + 4 {
98        // Empty lines that are properly indented are considered part of options
99        if line.trim().is_empty() {
100            return true;
101        }
102
103        // YAML key-value pairs
104        if line.contains(':') {
105            return true;
106        }
107        // YAML list items
108        let trimmed = line.trim_start();
109        if trimmed.starts_with("- ") || trimmed.starts_with("* ") {
110            return true;
111        }
112    }
113
114    false
115}
116
117/// Pre-compute all autodoc block ranges in the content
118/// Returns a sorted vector of byte ranges for efficient lookup
119pub fn detect_autodoc_block_ranges(content: &str) -> Vec<crate::utils::skip_context::ByteRange> {
120    let mut ranges = Vec::new();
121    let lines: Vec<&str> = content.lines().collect();
122    let mut byte_pos = 0;
123    let mut in_autodoc = false;
124    let mut autodoc_indent = 0;
125    let mut block_start = 0;
126
127    for line in lines {
128        let line_end = byte_pos + line.len();
129
130        // Check if we're starting an autodoc block
131        if is_autodoc_marker(line) {
132            in_autodoc = true;
133            autodoc_indent = get_autodoc_indent(line).unwrap_or(0);
134            block_start = byte_pos;
135        } else if in_autodoc {
136            // Check if we're still in autodoc options
137            if is_autodoc_options(line, autodoc_indent) {
138                // Continue in autodoc block
139            } else {
140                // Not part of options - check if this ends the block
141                // Completely empty lines (no indentation) don't end the block
142                if line.is_empty() {
143                    // Continue in autodoc
144                } else {
145                    // Non-option, non-empty line ends the autodoc block
146                    // Save the range up to the previous line
147                    ranges.push(crate::utils::skip_context::ByteRange {
148                        start: block_start,
149                        end: byte_pos.saturating_sub(1), // Don't include the newline before this line
150                    });
151                    in_autodoc = false;
152                    autodoc_indent = 0;
153                }
154            }
155        }
156
157        // Account for newline character
158        byte_pos = line_end + 1;
159    }
160
161    // If we ended while still in an autodoc block, save it
162    if in_autodoc {
163        ranges.push(crate::utils::skip_context::ByteRange {
164            start: block_start,
165            end: byte_pos.saturating_sub(1),
166        });
167    }
168
169    ranges
170}
171
/// Check if a position is within any of the pre-computed autodoc block ranges.
///
/// `ranges` is expected to be the output of `detect_autodoc_block_ranges`.
/// The lookup is delegated to `skip_context::is_in_html_comment_ranges`, which
/// is reused here as a generic "position in ranges" check; whether the `end`
/// bound is treated as inclusive or exclusive is decided by that helper.
pub fn is_within_autodoc_block_ranges(ranges: &[crate::utils::skip_context::ByteRange], position: usize) -> bool {
    crate::utils::skip_context::is_in_html_comment_ranges(ranges, position)
}
176
177/// Check if content at a byte position is within an autodoc block (DEPRECATED: use detect_autodoc_block_ranges + is_within_autodoc_block_ranges)
178pub fn is_within_autodoc_block(content: &str, position: usize) -> bool {
179    let lines: Vec<&str> = content.lines().collect();
180    let mut byte_pos = 0;
181    let mut in_autodoc = false;
182    let mut autodoc_indent = 0;
183
184    for line in lines {
185        let line_end = byte_pos + line.len();
186
187        // Check if we're starting an autodoc block
188        if is_autodoc_marker(line) {
189            in_autodoc = true;
190            autodoc_indent = get_autodoc_indent(line).unwrap_or(0);
191            // Check if position is on the autodoc marker line itself
192            if byte_pos <= position && position <= line_end {
193                return true;
194            }
195        } else if in_autodoc {
196            // Check if we're still in autodoc options
197            if is_autodoc_options(line, autodoc_indent) {
198                // This line is part of autodoc options
199                if byte_pos <= position && position <= line_end {
200                    return true;
201                }
202            } else {
203                // Not part of options - check if this ends the block
204                // Completely empty lines (no indentation) don't end the block
205                if line.is_empty() {
206                    // Continue in autodoc
207                } else {
208                    // Non-option, non-empty line ends the autodoc block
209                    in_autodoc = false;
210                    autodoc_indent = 0;
211                    // If the position is on this line, it's NOT in the autodoc block
212                    // (since we just ended the block)
213                    if byte_pos <= position && position <= line_end {
214                        return false;
215                    }
216                }
217            }
218        }
219
220        // Account for newline character
221        byte_pos = line_end + 1;
222    }
223
224    false
225}
226
/// Heuristic: should `ref_text` be treated as a cross-reference rather than a
/// broken link?
///
/// Cross-references usually look like `module.Class` or `module:function`, so
/// any text containing at least one `.` or `:` qualifies.
pub fn is_valid_crossref(ref_text: &str) -> bool {
    ref_text.contains(|c: char| c == '.' || c == ':')
}
233
#[cfg(test)]
mod tests {
    use super::*;

    /// Marker lines are recognised with or without indentation; lines with the
    /// wrong number of colons or plain text are not.
    #[test]
    fn test_autodoc_marker_detection() {
        let markers = [
            "::: mymodule.MyClass",
            "::: package.module.Class",
            "  ::: indented.Class",
            "::: module:function",
        ];
        for &line in &markers {
            assert!(is_autodoc_marker(line), "should be a marker: {:?}", line);
        }

        let non_markers = [":: Wrong number", "Regular text"];
        for &line in &non_markers {
            assert!(!is_autodoc_marker(line), "should not be a marker: {:?}", line);
        }
    }

    /// Reference-style links with dotted/colon identifiers are detected;
    /// inline links and plain text are not.
    #[test]
    fn test_crossref_detection() {
        let with_crossref = [
            "See [module.Class][]",
            "The [text][module.Class] here",
            "[package.module.Class][]",
            "[custom text][module:function]",
        ];
        for &line in &with_crossref {
            assert!(contains_crossref(line), "should contain a crossref: {:?}", line);
        }

        let without_crossref = ["Regular [link](url)", "No references here"];
        for &line in &without_crossref {
            assert!(!contains_crossref(line), "should not contain a crossref: {:?}", line);
        }
    }

    /// Option lines need at least 4 columns of indentation relative to the
    /// marker (base indent 0 here).
    #[test]
    fn test_autodoc_options() {
        let cases = [
            ("    handler: python", true),
            ("    options:", true),
            ("      show_source: true", true),
            ("", false), // Empty lines are neutral
            ("Not indented", false),
            ("  Only 2 spaces", false),
            // YAML list items
            ("            - window", true),
            ("            - app", true),
        ];
        for &(line, expected) in &cases {
            assert_eq!(is_autodoc_options(line, 0), expected, "line: {:?}", line);
        }
    }

    /// Positions on option lines are inside the block; positions on the text
    /// following a block are not.
    #[test]
    fn test_within_autodoc_block() {
        let content = r#"# API Documentation

::: mymodule.MyClass
    handler: python
    options:
      show_source: true
      show_root_heading: true

Regular text here.

::: another.Class

More text."#;

        let offset_of = |needle: &str| content.find(needle).unwrap();

        let inside = ["handler:", "show_source:"];
        for &needle in &inside {
            assert!(
                is_within_autodoc_block(content, offset_of(needle)),
                "should be inside a block: {:?}",
                needle
            );
        }

        let outside = ["Regular text", "More text"];
        for &needle in &outside {
            assert!(
                !is_within_autodoc_block(content, offset_of(needle)),
                "should be outside any block: {:?}",
                needle
            );
        }
    }

    /// Anything with a `.` or `:` counts as a cross-reference candidate.
    #[test]
    fn test_valid_crossref() {
        let cases = [
            ("module.Class", true),
            ("package.module.Class", true),
            ("module:function", true),
            ("numpy.ndarray", true),
            ("simple_word", false),
            ("no-dots-here", false),
        ];
        for &(ref_text, expected) in &cases {
            assert_eq!(is_valid_crossref(ref_text), expected, "ref: {:?}", ref_text);
        }
    }
}
307}