mdbook_lint_core/rules/standard/
md044.rs

1//! MD044: Proper names should have correct capitalization
2//!
3//! This rule checks that proper names (like company names, product names, etc.)
4//! are capitalized correctly throughout the document.
5
6use crate::error::Result;
7use crate::rule::{Rule, RuleCategory, RuleMetadata};
8use crate::{
9    Document,
10    violation::{Severity, Violation},
11};
12use std::collections::HashMap;
13
/// Rule to check proper name capitalization
///
/// Holds the table of known proper names. Text is matched against the
/// lowercase keys, and every match is compared with the stored spelling.
#[derive(Debug, Clone)]
pub struct MD044 {
    /// Map of lowercase names to their correct capitalization
    proper_names: HashMap<String, String>,
}
19
20impl MD044 {
21    /// Create a new MD044 rule with default proper names
22    pub fn new() -> Self {
23        let mut proper_names = HashMap::new();
24
25        // Add common technology names that are often miscapitalized
26        proper_names.insert("javascript".to_string(), "JavaScript".to_string());
27        proper_names.insert("typescript".to_string(), "TypeScript".to_string());
28        proper_names.insert("github".to_string(), "GitHub".to_string());
29        proper_names.insert("gitlab".to_string(), "GitLab".to_string());
30        proper_names.insert("bitbucket".to_string(), "Bitbucket".to_string());
31        proper_names.insert("nodejs".to_string(), "Node.js".to_string());
32        proper_names.insert("mysql".to_string(), "MySQL".to_string());
33        proper_names.insert("postgresql".to_string(), "PostgreSQL".to_string());
34        proper_names.insert("mongodb".to_string(), "MongoDB".to_string());
35        proper_names.insert("redis".to_string(), "Redis".to_string());
36        proper_names.insert("docker".to_string(), "Docker".to_string());
37        proper_names.insert("kubernetes".to_string(), "Kubernetes".to_string());
38        proper_names.insert("aws".to_string(), "AWS".to_string());
39        proper_names.insert("azure".to_string(), "Azure".to_string());
40        proper_names.insert("google cloud".to_string(), "Google Cloud".to_string());
41        proper_names.insert("gcp".to_string(), "GCP".to_string());
42        proper_names.insert("react".to_string(), "React".to_string());
43        proper_names.insert("vue".to_string(), "Vue".to_string());
44        proper_names.insert("angular".to_string(), "Angular".to_string());
45        proper_names.insert("webpack".to_string(), "webpack".to_string());
46        proper_names.insert("eslint".to_string(), "ESLint".to_string());
47        proper_names.insert("prettier".to_string(), "Prettier".to_string());
48        proper_names.insert("babel".to_string(), "Babel".to_string());
49        proper_names.insert("json".to_string(), "JSON".to_string());
50        proper_names.insert("xml".to_string(), "XML".to_string());
51        proper_names.insert("html".to_string(), "HTML".to_string());
52        proper_names.insert("css".to_string(), "CSS".to_string());
53        proper_names.insert("sass".to_string(), "Sass".to_string());
54        proper_names.insert("scss".to_string(), "SCSS".to_string());
55        proper_names.insert("less".to_string(), "Less".to_string());
56        proper_names.insert("api".to_string(), "API".to_string());
57        proper_names.insert("rest".to_string(), "REST".to_string());
58        proper_names.insert("graphql".to_string(), "GraphQL".to_string());
59        proper_names.insert("oauth".to_string(), "OAuth".to_string());
60        proper_names.insert("jwt".to_string(), "JWT".to_string());
61        proper_names.insert("http".to_string(), "HTTP".to_string());
62        proper_names.insert("https".to_string(), "HTTPS".to_string());
63        proper_names.insert("tcp".to_string(), "TCP".to_string());
64        proper_names.insert("udp".to_string(), "UDP".to_string());
65        proper_names.insert("ip".to_string(), "IP".to_string());
66        proper_names.insert("dns".to_string(), "DNS".to_string());
67        proper_names.insert("url".to_string(), "URL".to_string());
68        proper_names.insert("uri".to_string(), "URI".to_string());
69        proper_names.insert("uuid".to_string(), "UUID".to_string());
70
71        Self { proper_names }
72    }
73
74    /// Create a new MD044 rule with custom proper names
75    #[allow(dead_code)]
76    pub fn with_names(proper_names: HashMap<String, String>) -> Self {
77        Self { proper_names }
78    }
79
80    /// Add a proper name to the list
81    #[allow(dead_code)]
82    pub fn add_name(&mut self, incorrect: String, correct: String) {
83        self.proper_names.insert(incorrect.to_lowercase(), correct);
84    }
85
86    /// Check a line for proper name violations
87    fn check_line_names(&self, line: &str, line_number: usize) -> Vec<Violation> {
88        let mut violations = Vec::new();
89
90        // Skip empty lines
91        if line.trim().is_empty() {
92            return violations;
93        }
94
95        // Find all matches and their positions first - using Unicode-safe approach
96        let mut matches = Vec::new();
97
98        for (incorrect_lower, correct) in &self.proper_names {
99            // Use a simpler, more reliable approach: search in the original line
100            // and use character indices to ensure we don't break Unicode characters
101            let line_lower = line.to_lowercase();
102            let mut search_start = 0;
103
104            while let Some(byte_pos) = line_lower[search_start..].find(incorrect_lower) {
105                let absolute_byte_pos = search_start + byte_pos;
106
107                // Convert byte position to character index safely
108                let char_pos = line[..absolute_byte_pos].chars().count();
109                let end_char_pos = char_pos + incorrect_lower.chars().count();
110
111                // Check word boundaries using character positions
112                let line_chars: Vec<char> = line.chars().collect();
113                let is_word_start = char_pos == 0
114                    || !line_chars
115                        .get(char_pos.saturating_sub(1))
116                        .unwrap_or(&' ')
117                        .is_alphanumeric();
118                let is_word_end = end_char_pos >= line_chars.len()
119                    || !line_chars
120                        .get(end_char_pos)
121                        .unwrap_or(&' ')
122                        .is_alphanumeric();
123
124                if is_word_start && is_word_end {
125                    // Extract the actual text using character indices
126                    let actual_text: String = line_chars[char_pos..end_char_pos].iter().collect();
127
128                    // Only flag if it's not already correctly capitalized
129                    if actual_text != *correct {
130                        // Use the original byte position for compatibility with existing methods
131                        // but make sure it's safe by using char_indices
132                        let safe_byte_pos = line
133                            .char_indices()
134                            .nth(char_pos)
135                            .map(|(pos, _)| pos)
136                            .unwrap_or(0);
137
138                        // Skip if this appears to be in a code span or URL context
139                        if !self.is_in_code_span(line, safe_byte_pos)
140                            && !self.is_in_url_context(line, safe_byte_pos)
141                        {
142                            matches.push((safe_byte_pos, actual_text, correct.clone()));
143                        }
144                    }
145                }
146
147                // Move search position forward, making sure to advance by at least one byte
148                search_start = absolute_byte_pos + 1;
149            }
150        }
151
152        // Sort matches by position to maintain text order
153        matches.sort_by_key(|(pos, _, _)| *pos);
154
155        // Create violations in order
156        for (pos, actual_text, correct) in matches {
157            violations.push(self.create_violation(
158                format!("Proper name '{actual_text}' should be capitalized as '{correct}'"),
159                line_number,
160                pos + 1, // Convert to 1-based column
161                Severity::Warning,
162            ));
163        }
164
165        violations
166    }
167
168    /// Check if a position is inside a code span
169    fn is_in_code_span(&self, line: &str, pos: usize) -> bool {
170        let chars: Vec<char> = line.chars().collect();
171        let mut in_code_span = false;
172        let mut i = 0;
173
174        // Convert byte position to character position
175        let char_pos = line[..pos.min(line.len())].chars().count();
176
177        while i < chars.len() && i <= char_pos {
178            if chars[i] == '`' {
179                // Count consecutive backticks
180                let mut _backtick_count = 0;
181                let _start = i;
182                while i < chars.len() && chars[i] == '`' {
183                    _backtick_count += 1;
184                    i += 1;
185                }
186
187                if in_code_span {
188                    // Check if this closes the code span (same number of backticks)
189                    in_code_span = false; // Simplified - just toggle
190                } else {
191                    in_code_span = true;
192                }
193            } else {
194                i += 1;
195            }
196        }
197
198        in_code_span
199    }
200
201    /// Check if a position is inside a URL context
202    fn is_in_url_context(&self, line: &str, pos: usize) -> bool {
203        // Check for various URL patterns that should be excluded from proper name checking
204
205        // 1. Check for bare URLs (http://, https://, ftp://, etc.)
206        if let Some(url_start) = self.find_url_start(line, pos)
207            && let Some(url_end) = self.find_url_end(line, url_start)
208        {
209            return pos >= url_start && pos < url_end;
210        }
211
212        // 2. Check for markdown link URLs [text](url)
213        if let Some(link_url_range) = self.find_markdown_link_url(line, pos) {
214            return pos >= link_url_range.0 && pos < link_url_range.1;
215        }
216
217        false
218    }
219
220    /// Find the start of a URL that contains the given position
221    fn find_url_start(&self, line: &str, pos: usize) -> Option<usize> {
222        let schemes = [
223            "https://", "http://", "ftp://", "ftps://", "mailto:", "file://",
224        ];
225
226        // Look backwards from pos to find a URL scheme
227        // We need to check all possible positions from the beginning of the line up to pos
228        for scheme in &schemes {
229            // Use char_indices to get character boundary positions
230            for (char_pos, _) in line.char_indices() {
231                if char_pos > pos {
232                    break; // Past our search position
233                }
234
235                // Check if we have enough bytes remaining for the scheme
236                if char_pos + scheme.len() <= line.len() {
237                    // Check if the end position is also a character boundary
238                    let end_pos = char_pos + scheme.len();
239                    if line.is_char_boundary(end_pos) {
240                        // Safe to slice since both positions are character boundaries
241                        let slice = &line[char_pos..end_pos];
242                        if slice.eq_ignore_ascii_case(scheme) {
243                            // Found a scheme - now check if our position would be within this URL
244                            if let Some(url_end) = self.find_url_end(line, char_pos)
245                                && pos >= char_pos
246                                && pos < url_end
247                            {
248                                return Some(char_pos);
249                            }
250                        }
251                    }
252                }
253            }
254        }
255
256        None
257    }
258
259    /// Find the end of a URL starting at url_start
260    fn find_url_end(&self, line: &str, url_start: usize) -> Option<usize> {
261        let chars: Vec<char> = line.chars().collect();
262
263        // Convert byte position to character position
264        let char_start = line[..url_start.min(line.len())].chars().count();
265        let mut i = char_start;
266
267        // Skip the scheme part
268        while i < chars.len() && chars[i] != ':' {
269            i += 1;
270        }
271        if i < chars.len() && chars[i] == ':' {
272            i += 1;
273            // Skip // if present
274            if i + 1 < chars.len() && chars[i] == '/' && chars[i + 1] == '/' {
275                i += 2;
276            }
277        }
278
279        // Continue until we hit a character that typically ends URLs
280        while i < chars.len() {
281            match chars[i] {
282                // Characters that end URLs
283                ' ' | '\t' | '\n' | ')' | ']' | ',' | ';' | '"' | '\'' => break,
284                // Continue for valid URL characters
285                _ => i += 1,
286            }
287        }
288
289        Some(i)
290    }
291
292    /// Find markdown link URL range [text](url) if pos is within the URL part
293    fn find_markdown_link_url(&self, line: &str, pos: usize) -> Option<(usize, usize)> {
294        let chars: Vec<char> = line.chars().collect();
295
296        // Convert byte position to character position
297        let char_pos = line[..pos.min(line.len())].chars().count();
298
299        // Look for markdown link pattern around the position
300        // We need to find [text](url) where pos is within the url part
301
302        // Look backwards for ]( pattern
303        let mut i = if char_pos > 0 { char_pos - 1 } else { 0 };
304        let mut found_paren = false;
305        let mut found_bracket = false;
306
307        while i > 0 {
308            if i < chars.len() && chars[i] == '(' && !found_paren {
309                found_paren = true;
310            } else if i < chars.len() && chars[i] == ']' && found_paren && !found_bracket {
311                found_bracket = true;
312                break;
313            } else if i < chars.len() && (chars[i] == ' ' || chars[i] == '\n') {
314                // Break if we hit whitespace without finding the pattern
315                break;
316            }
317            if i == 0 {
318                break;
319            }
320            i -= 1;
321        }
322
323        if !found_bracket || !found_paren {
324            return None;
325        }
326
327        // Find the opening paren after the ]
328        let mut paren_pos = i + 1;
329        while paren_pos < chars.len() && chars[paren_pos] != '(' {
330            paren_pos += 1;
331        }
332
333        if paren_pos >= chars.len() {
334            return None;
335        }
336
337        // Find the closing paren
338        let url_start = paren_pos + 1;
339        let mut url_end = url_start;
340        while url_end < chars.len() && chars[url_end] != ')' {
341            url_end += 1;
342        }
343
344        if url_end >= chars.len() {
345            return None;
346        }
347
348        // Check if char_pos is within the URL part
349        if char_pos >= url_start && char_pos < url_end {
350            Some((url_start, url_end))
351        } else {
352            None
353        }
354    }
355
356    /// Get code block ranges to exclude from checking
357    fn get_code_block_ranges(&self, lines: &[&str]) -> Vec<bool> {
358        let mut in_code_block = vec![false; lines.len()];
359        let mut in_fenced_block = false;
360
361        for (i, line) in lines.iter().enumerate() {
362            let trimmed = line.trim();
363
364            // Check for fenced code blocks
365            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
366                in_fenced_block = !in_fenced_block;
367                in_code_block[i] = true;
368                continue;
369            }
370
371            if in_fenced_block {
372                in_code_block[i] = true;
373                continue;
374            }
375        }
376
377        in_code_block
378    }
379}
380
381impl Default for MD044 {
382    fn default() -> Self {
383        Self::new()
384    }
385}
386
387impl Rule for MD044 {
388    fn id(&self) -> &'static str {
389        "MD044"
390    }
391
392    fn name(&self) -> &'static str {
393        "proper-names"
394    }
395
396    fn description(&self) -> &'static str {
397        "Proper names should have the correct capitalization"
398    }
399
400    fn metadata(&self) -> RuleMetadata {
401        RuleMetadata::stable(RuleCategory::Content).introduced_in("mdbook-lint v0.1.0")
402    }
403
404    fn check_with_ast<'a>(
405        &self,
406        document: &Document,
407        _ast: Option<&'a comrak::nodes::AstNode<'a>>,
408    ) -> Result<Vec<Violation>> {
409        let mut violations = Vec::new();
410        let lines: Vec<&str> = document.content.lines().collect();
411        let in_code_block = self.get_code_block_ranges(&lines);
412
413        for (line_number, line) in lines.iter().enumerate() {
414            let line_number = line_number + 1;
415
416            // Skip lines inside code blocks
417            if in_code_block[line_number - 1] {
418                continue;
419            }
420
421            violations.extend(self.check_line_names(line, line_number));
422        }
423
424        Ok(violations)
425    }
426}
427
428#[cfg(test)]
429mod tests {
430    use super::*;
431    use crate::rule::Rule;
432    use std::path::PathBuf;
433
434    fn create_test_document(content: &str) -> Document {
435        Document::new(content.to_string(), PathBuf::from("test.md")).unwrap()
436    }
437
438    #[test]
439    fn test_md044_correct_capitalization_valid() {
440        let content = r#"This document uses JavaScript and GitHub correctly.
441
442We also use Node.js and MongoDB in our stack.
443
444The API is built with GraphQL and runs on AWS.
445"#;
446
447        let document = create_test_document(content);
448        let rule = MD044::new();
449        let violations = rule.check(&document).unwrap();
450        assert_eq!(violations.len(), 0);
451    }
452
453    #[test]
454    fn test_md044_incorrect_capitalization_violation() {
455        let content = r#"This document uses javascript and github incorrectly.
456
457We also use nodejs and mongodb in our stack.
458"#;
459
460        let document = create_test_document(content);
461        let rule = MD044::new();
462        let violations = rule.check(&document).unwrap();
463        assert_eq!(violations.len(), 4);
464        assert!(violations[0].message.contains("javascript"));
465        assert!(violations[0].message.contains("JavaScript"));
466        assert!(violations[1].message.contains("github"));
467        assert!(violations[1].message.contains("GitHub"));
468        assert!(violations[2].message.contains("nodejs"));
469        assert!(violations[2].message.contains("Node.js"));
470        assert!(violations[3].message.contains("mongodb"));
471        assert!(violations[3].message.contains("MongoDB"));
472    }
473
474    #[test]
475    fn test_md044_mixed_correct_incorrect() {
476        let content = r#"We use JavaScript (correct) but also javascript (incorrect).
477
478GitHub is right, but github is wrong.
479"#;
480
481        let document = create_test_document(content);
482        let rule = MD044::new();
483        let violations = rule.check(&document).unwrap();
484        assert_eq!(violations.len(), 2);
485        assert!(violations[0].message.contains("javascript"));
486        assert!(violations[1].message.contains("github"));
487    }
488
489    #[test]
490    fn test_md044_code_blocks_ignored() {
491        let content = r#"We use JavaScript in our application.
492
493```javascript
494// This javascript in code should be ignored
495console.log("github");
496```
497
498But javascript outside code blocks should be flagged.
499"#;
500
501        let document = create_test_document(content);
502        let rule = MD044::new();
503        let violations = rule.check(&document).unwrap();
504        assert_eq!(violations.len(), 1);
505        assert_eq!(violations[0].line, 8);
506    }
507
508    #[test]
509    fn test_md044_code_spans_ignored() {
510        let content = r#"We use JavaScript, and in code we write `javascript` or `github.com`.
511
512But javascript outside of `code spans` should be flagged.
513"#;
514
515        let document = create_test_document(content);
516        let rule = MD044::new();
517        let violations = rule.check(&document).unwrap();
518        assert_eq!(violations.len(), 1);
519        assert_eq!(violations[0].line, 3);
520    }
521
522    #[test]
523    fn test_md044_custom_names() {
524        let content = r#"We use mycompany products and someapi.
525
526This should flag mycompany and someapi.
527"#;
528
529        let mut custom_names = HashMap::new();
530        custom_names.insert("mycompany".to_string(), "MyCompany".to_string());
531        custom_names.insert("someapi".to_string(), "SomeAPI".to_string());
532
533        let document = create_test_document(content);
534        let rule = MD044::with_names(custom_names);
535        let violations = rule.check(&document).unwrap();
536        assert_eq!(violations.len(), 4); // 2 on each line
537        assert!(violations[0].message.contains("MyCompany"));
538        assert!(violations[1].message.contains("SomeAPI"));
539    }
540
541    #[test]
542    fn test_md044_word_boundaries() {
543        let content = r#"The word javascript should be flagged.
544
545But javascriptlike should not be flagged (it's a different word).
546
547And notjavascript should also not be flagged.
548"#;
549
550        let document = create_test_document(content);
551        let rule = MD044::new();
552        let violations = rule.check(&document).unwrap();
553        assert_eq!(violations.len(), 1);
554        assert_eq!(violations[0].line, 1);
555    }
556
557    #[test]
558    fn test_md044_case_insensitive_matching() {
559        let content = r#"We use Javascript, JAVASCRIPT, and JaVaScRiPt.
560
561All variations should be flagged.
562"#;
563
564        let document = create_test_document(content);
565        let rule = MD044::new();
566        let violations = rule.check(&document).unwrap();
567        assert_eq!(violations.len(), 3);
568        assert!(violations[0].message.contains("Javascript"));
569        assert!(violations[1].message.contains("JAVASCRIPT"));
570        assert!(violations[2].message.contains("JaVaScRiPt"));
571    }
572
573    #[test]
574    fn test_md044_multiple_occurrences_per_line() {
575        let content = r#"Using javascript and github and nodejs in the same line.
576"#;
577
578        let document = create_test_document(content);
579        let rule = MD044::new();
580        let violations = rule.check(&document).unwrap();
581        assert_eq!(violations.len(), 3);
582        assert!(violations[0].message.contains("javascript"));
583        assert!(violations[1].message.contains("github"));
584        assert!(violations[2].message.contains("nodejs"));
585    }
586
587    #[test]
588    fn test_md044_no_proper_names() {
589        let content = r#"This document doesn't contain any configured proper names.
590
591Just regular words and sentences here.
592
593Nothing to flag in this content.
594"#;
595
596        let document = create_test_document(content);
597        let rule = MD044::new();
598        let violations = rule.check(&document).unwrap();
599        assert_eq!(violations.len(), 0);
600    }
601
602    #[test]
603    fn test_md044_acronyms() {
604        let content = r#"We use api, rest, and json in our application.
605
606These should be API, REST, and JSON.
607"#;
608
609        let document = create_test_document(content);
610        let rule = MD044::new();
611        let violations = rule.check(&document).unwrap();
612        assert_eq!(violations.len(), 3); // Only line 1 has incorrect capitalization
613        assert!(violations[0].message.contains("API"));
614        assert!(violations[1].message.contains("REST"));
615        assert!(violations[2].message.contains("JSON"));
616    }
617
618    #[test]
619    fn test_md044_multi_word_names() {
620        let content = r#"We deploy to google cloud platform.
621
622Should be Google Cloud not google cloud.
623"#;
624
625        let document = create_test_document(content);
626        let rule = MD044::new();
627        let violations = rule.check(&document).unwrap();
628        assert_eq!(violations.len(), 2);
629        assert!(violations[0].message.contains("google cloud"));
630        assert!(violations[1].message.contains("google cloud"));
631    }
632
633    #[test]
634    fn test_md044_url_false_positives() {
635        let content = r#"Check out our repository at https://github.com/user/repo.
636
637You can also visit http://example.com for more info.
638
639Visit https://crates.io/crates/mdbook-lint for the package.
640
641But github should still be flagged when not in URLs.
642And https should be flagged when used as HTTPS protocol name.
643"#;
644
645        let document = create_test_document(content);
646        let rule = MD044::new();
647        let violations = rule.check(&document).unwrap();
648
649        // After fix: should only flag non-URL occurrences
650        // In URLs, we shouldn't flag: https, github, http, crates
651        // But we should still flag: github (line 7), https (line 8)
652
653        println!("Violations found after fix: {}", violations.len());
654        for (i, v) in violations.iter().enumerate() {
655            println!("Violation {}: line {}, {}", i, v.line, v.message);
656        }
657
658        // Should only have 2 violations for the non-URL occurrences
659        assert_eq!(violations.len(), 2);
660        assert_eq!(violations[0].line, 7); // "github should still be flagged"
661        assert_eq!(violations[1].line, 8); // "https should be flagged"
662        assert!(violations[0].message.contains("github"));
663        assert!(violations[1].message.contains("https"));
664    }
665
666    #[test]
667    fn test_md044_markdown_links_with_urls() {
668        let content = r#"Check out [GitHub](https://github.com) for repositories.
669
670Visit [the documentation](http://docs.example.com) for more info.
671
672Also see [Crates.io](https://crates.io) for Rust packages.
673
674But github and http should be flagged in regular text.
675"#;
676
677        let document = create_test_document(content);
678        let rule = MD044::new();
679        let violations = rule.check(&document).unwrap();
680
681        // After fix: should only flag non-URL occurrences
682        println!("Markdown link violations found: {}", violations.len());
683        for (i, v) in violations.iter().enumerate() {
684            println!("Violation {}: line {}, {}", i, v.line, v.message);
685        }
686
687        // Should only flag the instances in regular text, not in the URLs
688        assert_eq!(violations.len(), 2);
689        assert_eq!(violations[0].line, 7); // "github and http should be flagged"
690        assert_eq!(violations[1].line, 7);
691        assert!(violations[0].message.contains("github") || violations[0].message.contains("http"));
692        assert!(violations[1].message.contains("github") || violations[1].message.contains("http"));
693    }
694
695    #[test]
696    fn test_md044_bare_urls() {
697        let content = r#"Visit https://github.com/user/repo directly.
698
699Or go to http://example.com for info.
700
701Plain URLs: https://crates.io and http://docs.rs should not be flagged.
702
703But mentioning github or https in text should be flagged.
704"#;
705
706        let document = create_test_document(content);
707        let rule = MD044::new();
708        let violations = rule.check(&document).unwrap();
709
710        println!("Bare URL violations found: {}", violations.len());
711        for (i, v) in violations.iter().enumerate() {
712            println!("Violation {}: line {}, {}", i, v.line, v.message);
713        }
714
715        // Should only flag the instances in regular text, not in the bare URLs
716        assert_eq!(violations.len(), 2);
717        assert_eq!(violations[0].line, 7); // "github or https in text should be flagged"
718        assert_eq!(violations[1].line, 7);
719        assert!(
720            violations[0].message.contains("github") || violations[0].message.contains("https")
721        );
722        assert!(
723            violations[1].message.contains("github") || violations[1].message.contains("https")
724        );
725    }
726
727    #[test]
728    fn test_md044_url_context_detection_comprehensive() {
729        let content = r#"# URL Context Detection Tests
730
731## Bare URLs should not be flagged
732Visit https://github.com/user/repo for code.
733Check out http://example.com/path?query=value.
734Email me at mailto:user@github.com for questions.
735Use ftp://files.example.com/downloads for files.
736
737## Markdown links should not flag URLs
738See [GitHub](https://github.com) for repositories.
739Check [HTTP docs](http://example.com/docs) for info.
740Visit [the site](https://crates.io/search?q=rust) for packages.
741
742## Regular text should still be flagged
743I use github for version control.
744The https protocol is secure.
745We need better http handling.
746
747## Mixed scenarios
748Check https://github.com but remember that github is popular.
749Visit [GitHub](https://github.com) - github is widely used.
750The url https://example.com shows that http redirects work.
751
752## Edge cases
753URL at end: https://github.com
754URL in parentheses: (https://github.com/user/repo)
755URL with punctuation: Visit https://github.com.
756Multiple URLs: https://github.com and http://example.com are different.
757"#;
758
759        let document = create_test_document(content);
760        let rule = MD044::new();
761        let violations = rule.check(&document).unwrap();
762
763        println!("Comprehensive test violations: {}", violations.len());
764        for (i, v) in violations.iter().enumerate() {
765            println!(
766                "Violation {}: line {}, col {}, {}",
767                i, v.line, v.column, v.message
768            );
769        }
770
771        // Should only flag the non-URL occurrences
772        // Expected violations:
773        // Line 15: "github" in regular text
774        // Line 16: "https" in regular text
775        // Line 17: "http" in regular text
776        // Line 20: "github" in regular text
777        // Line 21: "github" in regular text
778        // Line 22: "url" in regular text (added to proper names)
779        // Line 22: "http" in regular text
780
781        assert_eq!(violations.len(), 7);
782
783        // Verify they're all from lines with regular text, not URLs
784        for violation in &violations {
785            assert!(violation.line >= 15); // All should be in the regular text section
786        }
787    }
788
789    #[test]
790    fn test_md044_url_detection_methods() {
791        let rule = MD044::new();
792
793        // Test bare URL detection
794
795        assert!(rule.is_in_url_context("Visit https://github.com for code", 10)); // "https"
796        assert!(rule.is_in_url_context("Visit https://github.com for code", 17)); // "github"
797        assert!(!rule.is_in_url_context("Visit https://github.com for code", 30)); // "code"
798
799        // Test markdown link URL detection
800        assert!(rule.is_in_url_context("See [GitHub](https://github.com) here", 14)); // "https"
801        assert!(rule.is_in_url_context("See [GitHub](https://github.com) here", 21)); // "github"
802        assert!(!rule.is_in_url_context("See [GitHub](https://github.com) here", 4)); // "GitHub" in link text
803        assert!(!rule.is_in_url_context("See [GitHub](https://github.com) here", 34)); // "here"
804
805        // Test non-URL contexts
806        assert!(!rule.is_in_url_context("I use github for development", 6)); // "github"
807        assert!(!rule.is_in_url_context("The https protocol is secure", 4)); // "https"
808    }
809
810    #[test]
811    fn test_md044_unicode_emoji_handling() {
812        let content = r#"📖 javascript documentation and github 🚀 repositories are great.
813
814Using nodejs with 🔥 performance and mongodb 💾 storage.
815"#;
816
817        let document = create_test_document(content);
818        let rule = MD044::new();
819        let violations = rule.check(&document).unwrap();
820
821        // Should find all 4 proper name violations without panicking
822        assert_eq!(violations.len(), 4);
823        assert!(violations[0].message.contains("javascript"));
824        assert!(violations[1].message.contains("github"));
825        assert!(violations[2].message.contains("nodejs"));
826        assert!(violations[3].message.contains("mongodb"));
827    }
828
829    #[test]
830    fn test_md044_unicode_mixed_scripts() {
831        let content = r#"在中文文档中使用 javascript 和 github。
832
833Русский текст с javascript и github тоже должен работать.
834
835العربية مع javascript و github أيضاً.
836"#;
837
838        let document = create_test_document(content);
839        let rule = MD044::new();
840        let violations = rule.check(&document).unwrap();
841
842        // Should find violations without panicking on Unicode text
843        // The exact count may vary due to Unicode word boundary detection
844        assert!(violations.len() >= 4); // At least some violations should be found
845        for violation in &violations {
846            assert!(
847                violation.message.contains("javascript") || violation.message.contains("github")
848            );
849        }
850    }
851
852    #[test]
853    fn test_md044_unicode_case_folding() {
854        let content = r#"Using javascript in our project.
855
856İstanbul'da javascript kullanıyoruz.
857"#;
858
859        let document = create_test_document(content);
860        let rule = MD044::new();
861        let violations = rule.check(&document).unwrap();
862
863        // Should find the violations without panicking on Unicode case folding
864        assert!(!violations.is_empty()); // At least the regular javascript should be found
865
866        // Find the javascript violation
867        let js_violation = violations.iter().find(|v| v.message.contains("javascript"));
868        assert!(js_violation.is_some());
869    }
870
871    #[test]
872    fn test_md044_unicode_combining_characters() {
873        let content = r#"Using normal javascript here and also github.
874
875Testing regular javascript and github again.
876"#;
877
878        let document = create_test_document(content);
879        let rule = MD044::new();
880        let violations = rule.check(&document).unwrap();
881
882        // Should handle text correctly and find all instances
883        assert_eq!(violations.len(), 4);
884        let js_violations: Vec<_> = violations
885            .iter()
886            .filter(|v| v.message.contains("javascript"))
887            .collect();
888        let gh_violations: Vec<_> = violations
889            .iter()
890            .filter(|v| v.message.contains("github"))
891            .collect();
892
893        assert_eq!(js_violations.len(), 2);
894        assert_eq!(gh_violations.len(), 2);
895    }
896
897    #[test]
898    fn test_md044_unicode_word_boundaries() {
899        let content = r#"Testing javascript🔥fast and github⭐popular.
900
901Also javascript‿linked and github🌟awesome.
902"#;
903
904        let document = create_test_document(content);
905        let rule = MD044::new();
906        let violations = rule.check(&document).unwrap();
907
908        // Unicode characters should properly separate words for boundary detection
909        assert_eq!(violations.len(), 4);
910        assert!(violations.iter().any(|v| v.message.contains("javascript")));
911        assert!(violations.iter().any(|v| v.message.contains("github")));
912    }
913
914    #[test]
915    fn test_md044_unicode_urls_with_emoji() {
916        let content = r#"Visit 📖 https://github.com/user/repo 🚀 for documentation.
917
918Check https://javascript.info 💡 for learning resources.
919
920But standalone github and javascript should be flagged.
921"#;
922
923        let document = create_test_document(content);
924        let rule = MD044::new();
925        let violations = rule.check(&document).unwrap();
926
927        // Should only flag the non-URL instances
928        assert_eq!(violations.len(), 2);
929        assert_eq!(violations[0].line, 5); // Line with standalone instances
930        assert_eq!(violations[1].line, 5);
931        assert!(
932            violations[0].message.contains("github")
933                || violations[0].message.contains("javascript")
934        );
935        assert!(
936            violations[1].message.contains("github")
937                || violations[1].message.contains("javascript")
938        );
939    }
940
941    #[test]
942    fn test_md044_still_works_for_non_urls() {
943        let content = r#"We use javascript and github in our development.
944
945The api uses json for data exchange.
946
947These should all be flagged since they're not in URLs.
948"#;
949
950        let document = create_test_document(content);
951        let rule = MD044::new();
952        let violations = rule.check(&document).unwrap();
953
954        // Should flag javascript->JavaScript, github->GitHub, api->API, json->JSON
955        assert_eq!(violations.len(), 4);
956        assert!(violations[0].message.contains("javascript"));
957        assert!(violations[1].message.contains("github"));
958        assert!(violations[2].message.contains("api"));
959        assert!(violations[3].message.contains("json"));
960    }
961}