codebook/
regexes.rs

1use lazy_static::lazy_static;
2use regex::Regex;
3
4lazy_static! {
5    static ref DEFAULT_SKIP_PATTERNS: Vec<Regex> = vec![
6        // URLs (http/https)
7        Regex::new(r"https?://[^\s]+").expect("Valid URL regex"),
8        // Hex colors (#deadbeef, #fff, #123456)
9        Regex::new(r"#[0-9a-fA-F]{3,8}").expect("Valid hex color regex"),
10        // Email addresses
11        Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").expect("Valid email regex"),
12        // File paths (Unix-style starting with /)
13        Regex::new(r"/[^\s]*").expect("Valid Unix path regex"),
14        // File paths (Windows-style with drive letter)
15        Regex::new(r"[A-Za-z]:\\[^\s]*").expect("Valid Windows path regex"),
16        // UUID
17        Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
18            .expect("Valid UUID regex"),
19        // Base64 strings (rough pattern for long base64 sequences)
20        Regex::new(r"[A-Za-z0-9+/]{20,}={0,2}").expect("Valid Base64 regex"),
21        // Git commit hashes (7+ hex characters)
22        Regex::new(r"\b[0-9a-fA-F]{7,40}\b").expect("Valid git hash regex"),
23        // Markdown/HTML links
24        Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").expect("Valid markdown link regex"),
25    ];
26}
27
28/// Default regex patterns to skip during spell checking.
29/// These patterns match common technical strings that contain letter sequences
30/// but shouldn't be treated as words for spell checking purposes.
31pub fn get_default_skip_patterns() -> &'static Vec<Regex> {
32    &DEFAULT_SKIP_PATTERNS
33}
34
35/// Compile user-provided regex patterns from strings
36pub fn compile_user_patterns(patterns: &[String]) -> Result<Vec<Regex>, regex::Error> {
37    patterns.iter().map(|pattern| Regex::new(pattern)).collect()
38}
39
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `pattern` matches every string in `hits` and none of the
    /// strings in `misses`.
    fn assert_matches(pattern: &Regex, hits: &[&str], misses: &[&str]) {
        for text in hits {
            assert!(pattern.is_match(text));
        }
        for text in misses {
            assert!(!pattern.is_match(text));
        }
    }

    #[test]
    fn test_url_pattern() {
        // The URL pattern is expected to be the first default pattern.
        let url = &get_default_skip_patterns()[0];

        assert_matches(
            url,
            &["https://www.example.com", "http://github.com/user/repo"],
            &["not a url"],
        );
    }

    #[test]
    fn test_hex_color_pattern() {
        // The hex color pattern is expected to be the second default pattern.
        let hex = &get_default_skip_patterns()[1];

        assert_matches(
            hex,
            &["#deadbeef", "#fff", "#123456"],
            // "deadbeef" lacks the leading `#`; "#gg" has no hex digits.
            &["deadbeef", "#gg"],
        );
    }

    #[test]
    fn test_email_pattern() {
        // The email pattern is expected to be the third default pattern.
        let email = &get_default_skip_patterns()[2];

        assert_matches(
            email,
            &["user@example.com", "test.email+tag@domain.co.uk"],
            &["not an email"],
        );
    }

    #[test]
    fn test_compile_user_patterns() {
        // First: all-caps words. Second: TODO comments.
        let sources: Vec<String> = [r"\b[A-Z]{2,}\b", r"TODO:.*"]
            .iter()
            .map(|source| source.to_string())
            .collect();

        let compiled = compile_user_patterns(&sources).unwrap();
        assert_eq!(compiled.len(), 2);

        let (all_caps, todo) = (&compiled[0], &compiled[1]);
        assert!(all_caps.is_match("HTML"));
        assert!(todo.is_match("TODO: fix this"));
    }

    #[test]
    fn test_invalid_user_pattern() {
        // The character class is never closed, so compilation must fail.
        let result = compile_user_patterns(&[r"[invalid".to_string()]);

        assert!(result.is_err());
    }
}