Skip to main content

crossref_xml/
regex.rs

1use regex::Regex;
2use std::sync::LazyLock;
3
4pub static DOI_REGEX: LazyLock<Regex> =
5    LazyLock::new(|| Regex::new(r"^10\.[0-9]{4,9}/.{1,200}$").unwrap());
6pub static RESOURCE_URL_REGEX: LazyLock<Regex> =
7    LazyLock::new(|| Regex::new(r"^(?i)(https?|ftp)://.*$").unwrap());
8pub static ORCID_REGEX: LazyLock<Regex> = LazyLock::new(|| {
9    Regex::new(r"^https?://orcid.org/[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[X0-9]{1}$").unwrap()
10});
11pub static INSTITUTION_PID_REGEX: LazyLock<Regex> =
12    LazyLock::new(|| Regex::new(r"[hH][tT][tT][pP][sS]://.{1,50}$").unwrap());
13pub static NAME_REGEX: LazyLock<Regex> =
14    LazyLock::new(|| Regex::new(r"^[^\d\?\s][^\d\?]*[^\?\s]+[^\d]*").unwrap());
15
#[cfg(test)]
mod unit {
    use super::*;

    /// Asserts that `re` matches every string in `inputs`.
    ///
    /// `#[track_caller]` makes a failure point at the calling test rather than
    /// at this helper.
    #[track_caller]
    fn assert_all_match(re: &Regex, inputs: &[&str]) {
        for input in inputs {
            assert!(re.is_match(input), "should match: {}", input);
        }
    }

    /// Asserts that `re` matches none of the strings in `inputs`.
    ///
    /// `#[track_caller]` makes a failure point at the calling test rather than
    /// at this helper.
    #[track_caller]
    fn assert_none_match(re: &Regex, inputs: &[&str]) {
        for input in inputs {
            assert!(!re.is_match(input), "should not match: {}", input);
        }
    }

    /// Tests for `DOI_REGEX`.
    mod doi {
        use super::*;

        #[test]
        fn valid_dois() {
            assert_all_match(
                &DOI_REGEX,
                &[
                    "10.1000/xyz123",
                    "10.1234/abcdef",
                    "10.12345/some-article",
                    "10.123456789/a",
                    "10.1016/j.cell.2020.01.001",
                    "10.1038/nature12373",
                    "10.1109/5.771073",
                    "10.1002/(SICI)1097-4636(199905)45:2<133::AID-JBM9>3.0.CO;2-T",
                ],
            );
        }

        #[test]
        fn invalid_dois() {
            assert_none_match(
                &DOI_REGEX,
                &[
                    "11.1000/xyz123",    // does not start with `10.`
                    "10.123/xyz",        // registrant too short (3 digits)
                    "10.1234/",          // empty suffix
                    "doi:10.1234/abc",   // leading `doi:` label is rejected by the `^` anchor
                    "10.1234567890/abc", // registrant too long (10 digits)
                ],
            );
        }

        #[test]
        fn suffix_length_limit() {
            // Builds a DOI whose suffix is exactly `len` characters long.
            let doi_with_suffix = |len: usize| format!("10.1234/{}", "a".repeat(len));

            assert!(DOI_REGEX.is_match(&doi_with_suffix(200)));
            assert!(!DOI_REGEX.is_match(&doi_with_suffix(201)));
        }
    }

    /// Tests for `RESOURCE_URL_REGEX`.
    mod resource_url {
        use super::*;

        #[test]
        fn valid_urls() {
            assert_all_match(
                &RESOURCE_URL_REGEX,
                &[
                    "http://example.com",
                    "https://example.com",
                    "HTTP://EXAMPLE.COM",
                    "HTTPS://example.com/path",
                    "ftp://files.example.com",
                    "FTP://files.example.com",
                    "https://example.com/path/to/resource?query=1&other=2",
                    "http://localhost:8080",
                    "https://sub.domain.example.com",
                ],
            );
        }

        #[test]
        fn invalid_urls() {
            assert_none_match(
                &RESOURCE_URL_REGEX,
                &[
                    "example.com",
                    "mailto:test@example.com",
                    "file:///path/to/file",
                    "//example.com",
                    "httpx://example.com",
                ],
            );
        }

        #[test]
        fn rejects_non_start_anchored() {
            // The pattern begins with `^`, so a URL preceded by other text must not match.
            let embedded = "visit http://example.com";
            assert!(!RESOURCE_URL_REGEX.is_match(embedded));
        }
    }

    /// Tests for `ORCID_REGEX`.
    mod orcid {
        use super::*;

        #[test]
        fn valid_orcids() {
            assert_all_match(
                &ORCID_REGEX,
                &[
                    "https://orcid.org/0000-0002-1825-0097",
                    "http://orcid.org/0000-0002-1825-0097",
                    "https://orcid.org/0000-0001-5109-3700",
                    "https://orcid.org/0000-0002-1694-233X", // `X` as the final character
                    "http://orcid.org/1234-5678-9012-345X",
                ],
            );
        }

        #[test]
        fn invalid_orcids() {
            assert_none_match(
                &ORCID_REGEX,
                &[
                    "orcid.org/0000-0002-1825-0097",            // no scheme
                    "https://orcid.org/0000-0002-1825-009",     // last group too short
                    "https://orcid.org/0000-0002-1825-00977",   // last group too long
                    "https://orcid.org/000-0002-1825-0097",     // first group too short
                    "https://orcid.org/0000-0002-1825-009Y",    // `Y` is not an allowed final character
                    "https://notorcid.org/0000-0002-1825-0097", // different host
                    "ftp://orcid.org/0000-0002-1825-0097",      // scheme is neither http nor https
                ],
            );
        }

        #[test]
        fn checksum_x_only_in_last_position() {
            let x_last = "https://orcid.org/0000-0002-1825-009X";
            let x_second_to_last = "https://orcid.org/0000-0002-1825-00X9";

            assert!(ORCID_REGEX.is_match(x_last));
            assert!(!ORCID_REGEX.is_match(x_second_to_last));
        }
    }

    /// Tests for `INSTITUTION_PID_REGEX`.
    mod institution_pid {
        use super::*;

        #[test]
        fn valid_institution_pids() {
            assert_all_match(
                &INSTITUTION_PID_REGEX,
                &[
                    "https://example.com",
                    "HTTPS://ror.org/12345",
                    "HtTpS://isni.org/isni/0000000121032683",
                    "https://a", // a single character after `://` is the minimum
                ],
            );
        }

        #[test]
        fn invalid_institution_pids() {
            assert_none_match(
                &INSTITUTION_PID_REGEX,
                &[
                    "http://example.com", // plain http, not https
                    "ftp://example.com",
                    "https://", // nothing after `://`
                ],
            );
        }

        #[test]
        fn max_length_after_protocol() {
            // Builds an https URL with exactly `len` characters after `://`.
            let pid_with_tail = |len: usize| format!("https://{}", "a".repeat(len));

            assert!(INSTITUTION_PID_REGEX.is_match(&pid_with_tail(50)));
            assert!(!INSTITUTION_PID_REGEX.is_match(&pid_with_tail(51)));
        }
    }

    /// Tests for `NAME_REGEX`.
    mod name {
        use super::*;

        #[test]
        fn valid_names() {
            assert_all_match(
                &NAME_REGEX,
                &[
                    "John Smith",
                    "María García",
                    "Jean-Pierre",
                    "O'Connor",
                    "李明",
                    "Müller",
                    "Anne-Marie O'Brien",
                ],
            );
        }

        #[test]
        fn invalid_names() {
            assert_none_match(&NAME_REGEX, &["123", "?", "???", " ", "   "]);
        }

        #[test]
        fn names_with_some_digits_at_edges() {
            let roman_suffix = "John Smith III";
            let ordinal_inside = "John 3rd Smith";

            assert!(NAME_REGEX.is_match(roman_suffix));
            assert!(NAME_REGEX.is_match(ordinal_inside));
        }

        #[test]
        fn rejects_digit_only_or_question_mark_only() {
            let digits_only = "12345";
            let question_marks_only = "?????";

            assert!(!NAME_REGEX.is_match(digits_only));
            assert!(!NAME_REGEX.is_match(question_marks_only));
        }
    }
}