quickmark_core/rules/
md034.rs

1use std::rc::Rc;
2
3use linkify::{LinkFinder, LinkKind};
4use tree_sitter::Node;
5
6use crate::{
7    linter::{range_from_tree_sitter, RuleViolation},
8    rules::{Context, Rule, RuleLinter, RuleType},
9};
10
11pub(crate) struct MD034Linter {
12    context: Rc<Context>,
13    violations: Vec<RuleViolation>,
14}
15
16impl MD034Linter {
17    pub fn new(context: Rc<Context>) -> Self {
18        Self {
19            context,
20            violations: Vec::new(),
21        }
22    }
23}
24
25impl RuleLinter for MD034Linter {
26    fn feed(&mut self, node: &Node) {
27        // Process paragraph nodes to find bare URLs within them
28        if node.kind() == "paragraph" {
29            let content = self.context.document_content.borrow();
30            let text = node.utf8_text(content.as_bytes()).unwrap_or("").to_string();
31            let node_range = node.range();
32            drop(content); // Release the borrow before calling mutable methods
33
34            self.check_for_bare_urls_in_text(&text, &node_range);
35        }
36    }
37
38    fn finalize(&mut self) -> Vec<RuleViolation> {
39        std::mem::take(&mut self.violations)
40    }
41}
42
43impl MD034Linter {
44    fn check_for_bare_urls_in_text(&mut self, text: &str, paragraph_range: &tree_sitter::Range) {
45        let finder = LinkFinder::new();
46
47        for link in finder.links(text) {
48            let link_start = link.start();
49            let link_end = link.end();
50            let link_text = link.as_str();
51
52            // Skip if this link is already properly formatted
53            if !self.is_link_properly_formatted(text, link_start, link_text, link.kind()) {
54                let violation_range = tree_sitter::Range {
55                    start_byte: paragraph_range.start_byte + link_start,
56                    end_byte: paragraph_range.start_byte + link_end,
57                    start_point: tree_sitter::Point {
58                        row: paragraph_range.start_point.row,
59                        column: paragraph_range.start_point.column + link_start,
60                    },
61                    end_point: tree_sitter::Point {
62                        row: paragraph_range.start_point.row,
63                        column: paragraph_range.start_point.column + link_end,
64                    },
65                };
66
67                self.violations.push(RuleViolation::new(
68                    &MD034,
69                    format!("{} [Context: \"{}\"]", MD034.description, link_text),
70                    self.context.file_path.clone(),
71                    range_from_tree_sitter(&violation_range),
72                ));
73            }
74        }
75    }
76
77    fn is_link_properly_formatted(
78        &self,
79        text: &str,
80        link_start: usize,
81        link_text: &str,
82        link_kind: &LinkKind,
83    ) -> bool {
84        match link_kind {
85            LinkKind::Url => self.is_url_properly_formatted(text, link_start, link_text),
86            LinkKind::Email => self.is_email_properly_formatted(text, link_start, link_text),
87            _ => true, // Other link types are not handled by MD034
88        }
89    }
90
91    fn is_url_properly_formatted(&self, text: &str, url_start: usize, url_text: &str) -> bool {
92        // Check if linkify included backticks in the URL (this happens with code spans)
93        if url_text.starts_with('`') {
94            // This URL is inside a code span according to linkify
95            return true;
96        }
97
98        // Check if URL is in angle brackets: <https://example.com>
99        if url_start > 0 && text.chars().nth(url_start - 1) == Some('<') {
100            let url_end = url_start + url_text.len();
101            if url_end < text.len() && text.chars().nth(url_end) == Some('>') {
102                return true;
103            }
104        }
105
106        // Check if URL is in markdown link: [text](https://example.com)
107        if let Some(link_start) = text[..url_start].rfind("](") {
108            if url_start == link_start + 2 {
109                return true; // URL is right after ](
110            }
111            // Also check if URL is after ]( with some prefix (like mailto:, ftp:, etc.)
112            let after_paren = link_start + 2;
113            let prefix_text = &text[after_paren..url_start];
114            if prefix_text.chars().all(|c| c.is_alphabetic() || c == ':') {
115                return true; // URL is in markdown link target with scheme prefix
116            }
117        }
118
119        // Check if URL is in markdown link text: [text with https://example.com](target)
120        if let Some(bracket_start) = text[..url_start].rfind('[') {
121            // Look for closing bracket and opening paren after the URL
122            let url_end = url_start + url_text.len();
123            if let Some(_bracket_end) = text[url_end..].find("](") {
124                // Check that there's no unmatched bracket between bracket_start and url_start
125                let link_text = &text[bracket_start + 1..url_start];
126                if !link_text.contains('[') && !link_text.contains(']') {
127                    return true; // URL is in link text
128                }
129            }
130        }
131
132        // Check if URL is in HTML tag attribute
133        if let Some(attr_start) = text[..url_start].rfind("href=\"") {
134            if url_start == attr_start + 6 {
135                return true;
136            }
137        }
138        if let Some(attr_start) = text[..url_start].rfind("href='") {
139            if url_start == attr_start + 6 {
140                return true;
141            }
142        }
143
144        // Check if URL is in code span using backtick counting
145        let before_url = &text[..url_start];
146        let after_url = &text[url_start + url_text.len()..];
147
148        let backticks_before = before_url.matches('`').count();
149        if backticks_before % 2 == 1 {
150            // Odd number of backticks before means we're likely inside a code span
151            // Check if there's a closing backtick after the URL
152            if after_url.contains('`') {
153                return true;
154            }
155        }
156
157        false
158    }
159
160    fn is_email_properly_formatted(
161        &self,
162        text: &str,
163        email_start: usize,
164        email_text: &str,
165    ) -> bool {
166        // Check if linkify included backticks in the email (this happens with code spans)
167        if email_text.starts_with('`') {
168            // This email is inside a code span according to linkify
169            return true;
170        }
171
172        // Check if email is in markdown link: [text](mailto:user@example.com)
173        if let Some(link_start) = text[..email_start].rfind("](") {
174            // Check if email is right after ]( or after ]( with prefix like mailto:
175            let after_paren = link_start + 2;
176            if email_start == after_paren {
177                return true; // Email is right after ](
178            }
179            let prefix_text = &text[after_paren..email_start];
180            if prefix_text.chars().all(|c| c.is_alphabetic() || c == ':') {
181                return true; // Email is in markdown link target with scheme prefix
182            }
183        }
184
185        // Check if email is in angle brackets: <user@example.com> or <mailto:user@example.com>
186        let mut check_start = email_start;
187
188        // Look backward for opening angle bracket, potentially with "mailto:" prefix
189        while check_start > 0 {
190            let char_at = text.chars().nth(check_start - 1);
191            if char_at == Some('<') {
192                let email_end = email_start + email_text.len();
193                if email_end < text.len() && text.chars().nth(email_end) == Some('>') {
194                    return true;
195                }
196                break;
197            } else if char_at
198                .map(|c| c.is_alphabetic() || c == ':')
199                .unwrap_or(false)
200            {
201                // Continue looking backward through "mailto:" prefix
202                check_start -= 1;
203            } else {
204                break;
205            }
206        }
207
208        // Check if email is in markdown link text: [text with user@example.com](target)
209        if let Some(bracket_start) = text[..email_start].rfind('[') {
210            // Look for closing bracket and opening paren after the email
211            let email_end = email_start + email_text.len();
212            if let Some(_bracket_end) = text[email_end..].find("](") {
213                // Check that there's no unmatched bracket between bracket_start and email_start
214                let link_text = &text[bracket_start + 1..email_start];
215                if !link_text.contains('[') && !link_text.contains(']') {
216                    return true; // Email is in link text
217                }
218            }
219        }
220
221        // Check if email is in code span using backtick counting
222        let before_email = &text[..email_start];
223        let after_email = &text[email_start + email_text.len()..];
224
225        // Count backticks before email to see if we're inside a code span
226        let backticks_before = before_email.matches('`').count();
227        if backticks_before % 2 == 1 {
228            // Odd number of backticks before means we're likely inside a code span
229            // Check if there's a closing backtick after the email
230            if after_email.contains('`') {
231                return true;
232            }
233        }
234
235        false
236    }
237}
238
239pub const MD034: Rule = Rule {
240    id: "MD034",
241    alias: "no-bare-urls",
242    tags: &["links", "url"],
243    description: "Bare URL used",
244    rule_type: RuleType::Token,
245    required_nodes: &["text"], // Look for text nodes that might contain URLs
246    new_linter: |context| Box::new(MD034Linter::new(context)),
247};
248
#[cfg(test)]
mod test {
    use std::path::PathBuf;

    use crate::config::RuleSeverity;
    use crate::linter::MultiRuleLinter;
    use crate::test_utils::test_helpers::test_config_with_rules;

    /// Configuration with MD034 enabled as an error and the other listed rules switched off.
    fn test_config() -> crate::config::QuickmarkConfig {
        test_config_with_rules(vec![
            ("no-bare-urls", RuleSeverity::Error),
            ("heading-increment", RuleSeverity::Off),
            ("heading-style", RuleSeverity::Off),
            ("line-length", RuleSeverity::Off),
        ])
    }

    /// Extracts the quoted link from a `... [Context: "<link>"]` violation message.
    fn context_of(message: impl AsRef<str>) -> String {
        const OPENING: &str = "[Context: \"";

        let message = message.as_ref();
        let start = message.find(OPENING).unwrap() + OPENING.len();
        let end = message.find("\"]").unwrap();
        message[start..end].to_string()
    }

    #[test]
    fn test_bare_url_detection() {
        let input = "Visit https://example.com for more info.";

        let mut linter =
            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), test_config(), input);
        let violations = linter.analyze();

        // A single bare URL yields exactly one MD034 violation naming that URL.
        assert_eq!(violations.len(), 1);
        let only = &violations[0];
        assert_eq!(only.rule().id, "MD034");
        assert!(only.message().contains("Bare URL used"));
        assert!(only.message().contains("https://example.com"));
    }

    #[test]
    fn test_bare_email_detection() {
        let input = "Email me at user@example.com for questions.";

        let mut linter =
            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), test_config(), input);
        let violations = linter.analyze();

        assert_eq!(violations.len(), 1);
        let only = &violations[0];
        assert_eq!(only.rule().id, "MD034");
        assert!(only.message().contains("user@example.com"));
    }

    #[test]
    fn test_angle_bracket_urls_no_violation() {
        let input = "Visit <https://example.com> for more info.";

        let mut linter =
            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), test_config(), input);
        let violations = linter.analyze();

        // Autolinks in angle brackets are properly formatted.
        assert_eq!(violations.len(), 0);
    }

    #[test]
    fn test_angle_bracket_emails_no_violation() {
        let input = "Email me at <user@example.com> for questions.";

        let mut linter =
            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), test_config(), input);
        let violations = linter.analyze();

        assert_eq!(violations.len(), 0);
    }

    #[test]
    fn test_code_span_urls_no_violation() {
        let input = "Not a link: `https://example.com`";

        let mut linter =
            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), test_config(), input);
        let violations = linter.analyze();

        // URLs inside code spans are left alone.
        assert_eq!(violations.len(), 0);
    }

    #[test]
    fn test_markdown_link_urls_no_violation() {
        let input = "Visit [the site](https://example.com) for more info.";

        let mut linter =
            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), test_config(), input);
        let violations = linter.analyze();

        // A URL used as a markdown link target is properly formatted.
        assert_eq!(violations.len(), 0);
    }

    #[test]
    fn test_html_tag_urls_no_violation() {
        let input = "<a href='https://example.com'>Link text</a>";

        let mut linter =
            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), test_config(), input);
        let violations = linter.analyze();

        // URLs inside HTML tag attributes are left alone.
        assert_eq!(violations.len(), 0);
    }

    #[test]
    fn test_multiple_bare_urls() {
        let input = "Visit https://first.com and https://second.com and email admin@site.com";

        let mut linter =
            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), test_config(), input);
        let violations = linter.analyze();

        // Two bare URLs plus one bare email address.
        assert_eq!(violations.len(), 3);
    }

    #[test]
    fn test_mixed_urls_and_proper_links() {
        let input = "Visit https://bare.com and [proper link](https://proper.com) and <https://formatted.com>";

        let mut linter =
            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), test_config(), input);
        let violations = linter.analyze();

        // Only the bare URL is reported; the link target and the autolink are fine.
        assert_eq!(violations.len(), 1);
        assert!(violations[0].message().contains("https://bare.com"));
    }

    #[test]
    fn test_mailto_urls_in_markdown_links_no_violation() {
        let input = "Email [support](mailto:user@example.com) for help.";

        let mut linter =
            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), test_config(), input);
        let violations = linter.analyze();

        // An email address behind a `mailto:` link target is properly formatted.
        assert_eq!(violations.len(), 0);
    }

    #[test]
    fn test_urls_in_markdown_link_text_no_violation() {
        let input = "[link text with https://example.com in it](https://proper-target.com)";

        let mut linter =
            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), test_config(), input);
        let violations = linter.analyze();

        // A URL that is part of markdown link text is not reported.
        assert_eq!(violations.len(), 0);
    }

    #[test]
    fn test_emails_in_markdown_link_text_no_violation() {
        let input = "[contact user@example.com for support](https://contact-form.com)";

        let mut linter =
            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), test_config(), input);
        let violations = linter.analyze();

        // An email address that is part of markdown link text is not reported.
        assert_eq!(violations.len(), 0);
    }

    #[test]
    fn test_scheme_prefixes_in_markdown_links_no_violation() {
        let input = "Try [FTP site](ftp://files.example.com) and [secure site](https://secure.example.com).";

        let mut linter =
            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), test_config(), input);
        let violations = linter.analyze();

        // Link targets are accepted regardless of their scheme.
        assert_eq!(violations.len(), 0);
    }

    #[test]
    fn test_nested_markdown_scenarios() {
        let input = "Links bind to the innermost [link that https://example.com link](https://target.com) but https://bare.com should trigger.";

        let mut linter =
            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), test_config(), input);
        let violations = linter.analyze();

        // The URL inside the link text is fine; only the trailing bare URL is reported.
        assert_eq!(violations.len(), 1);
        assert!(violations[0].message().contains("https://bare.com"));
    }

    #[test]
    fn test_complex_mixed_scenarios() {
        let input = r#"
Visit https://bare.com for info.
Email [support](mailto:help@example.com) or bare.email@example.com.
Check [site with https://url-in-text.com info](https://real-target.com).
Use <https://angle-bracketed.com> or `https://code-span.com`.
"#;

        let mut linter =
            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), test_config(), input);
        let violations = linter.analyze();

        // Reported:
        //   - https://bare.com (bare URL)
        //   - bare.email@example.com (bare email)
        // Not reported:
        //   - help@example.com (mailto: link target)
        //   - https://url-in-text.com (link text)
        //   - https://real-target.com (link target)
        //   - https://angle-bracketed.com (angle brackets)
        //   - https://code-span.com (code span)
        assert_eq!(violations.len(), 2);

        let reported: Vec<String> = violations
            .iter()
            .map(|violation| context_of(violation.message()))
            .collect();

        assert!(reported.contains(&"https://bare.com".to_string()));
        assert!(reported.contains(&"bare.email@example.com".to_string()));
    }

    #[test]
    fn test_international_domains_and_emails() {
        let input = "Visit https://müller.example and email ünser@müller.example for info.";

        let mut linter =
            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), test_config(), input);
        let violations = linter.analyze();

        // Both the internationalized URL and the internationalized email are bare.
        assert_eq!(violations.len(), 2);

        let reported: Vec<String> = violations
            .iter()
            .map(|violation| context_of(violation.message()))
            .collect();

        assert!(reported.contains(&"https://müller.example".to_string()));
        assert!(reported.contains(&"ünser@müller.example".to_string()));
    }
}