Skip to main content

scitadel_core/models/
doi.rs

/// Reports whether `c` is allowed inside a DOI suffix.
///
/// Accepted characters are ASCII letters and digits plus the punctuation
/// `-._;()/:`. Covers 99.3% of CrossRef DOIs per their regex analysis.
fn is_valid_suffix_char(c: char) -> bool {
    match c {
        '-' | '.' | '_' | ';' | '(' | ')' | '/' | ':' => true,
        other => other.is_ascii_alphanumeric(),
    }
}
6
/// Normalize a DOI string: trim whitespace, strip one leading resolver-URL
/// prefix, and lowercase the remainder.
///
/// The recognized prefixes are `https://doi.org/`, `http://doi.org/`,
/// `https://dx.doi.org/` and `http://dx.doi.org/`. They are matched
/// ASCII-case-insensitively, because URL schemes and host names are
/// case-insensitive: `HTTPS://DOI.ORG/10.1000/X` normalizes to `10.1000/x`
/// just like its lowercase spelling does.
///
/// Input that carries none of these prefixes is only trimmed and lowercased.
pub fn normalize_doi(doi: &str) -> String {
    const URL_PREFIXES: [&str; 4] = [
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
    ];

    let trimmed = doi.trim();
    let bare = URL_PREFIXES
        .iter()
        .find_map(|prefix| {
            // `get` yields `None` (instead of panicking) when the input is
            // shorter than the prefix or the cut would split a multi-byte char.
            let head = trimmed.get(..prefix.len())?;
            if head.eq_ignore_ascii_case(prefix) {
                trimmed.get(prefix.len()..)
            } else {
                None
            }
        })
        .unwrap_or(trimmed);
    bare.to_lowercase()
}
18
19/// Validate a DOI string against the standard format: `10.NNNN…/suffix`.
20///
21/// - Prefix: `10.` followed by 4–9 digits (registrant code).
22/// - Separator: `/`.
23/// - Suffix: one or more valid characters (`[a-zA-Z0-9-._;()/:]+`).
24///
25/// The input is normalized (trimmed, URL-prefix-stripped) before validation.
26pub fn validate_doi(doi: &str) -> bool {
27    let normalized = normalize_doi(doi);
28
29    // Must start with "10."
30    let rest = match normalized.strip_prefix("10.") {
31        Some(r) => r,
32        None => return false,
33    };
34
35    // Extract registrant digits (4–9 digits before the first '/')
36    let slash_pos = match rest.find('/') {
37        Some(pos) => pos,
38        None => return false,
39    };
40
41    let registrant = &rest[..slash_pos];
42    if registrant.len() < 4 || registrant.len() > 9 {
43        return false;
44    }
45    if !registrant.chars().all(|c| c.is_ascii_digit()) {
46        return false;
47    }
48
49    // Suffix must be non-empty and contain only valid characters
50    let suffix = &rest[slash_pos + 1..];
51    if suffix.is_empty() {
52        return false;
53    }
54
55    suffix.chars().all(is_valid_suffix_char)
56}
57
58/// Sanitize a DOI for use as a filename: replace `/` with `_`, keep only safe chars.
59pub fn doi_to_filename(doi: &str) -> String {
60    let normalized = normalize_doi(doi);
61    normalized
62        .chars()
63        .map(|c| {
64            if c.is_ascii_alphanumeric() || matches!(c, '-' | '.' | '_') {
65                c
66            } else {
67                '_'
68            }
69        })
70        .collect()
71}
72
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that every DOI in `cases` is accepted by `validate_doi`.
    fn assert_all_valid(cases: &[&str]) {
        for &doi in cases {
            assert!(validate_doi(doi), "expected valid DOI: {:?}", doi);
        }
    }

    /// Assert that `normalize_doi` maps each input to its expected output.
    fn assert_normalized(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(normalize_doi(input), expected);
        }
    }

    #[test]
    fn valid_dois() {
        assert_all_valid(&[
            "10.1038/s41586-020-2649-2",
            "10.1371/journal.pone.0000000",
            "10.1002/anie.200906232",
            "10.1103/PhysRevLett.116.061102",
            "10.48550/arXiv.2301.00001",
        ]);
    }

    #[test]
    fn valid_with_url_prefix() {
        assert_all_valid(&[
            "https://doi.org/10.1038/s41586-020-2649-2",
            "http://dx.doi.org/10.1002/anie.200906232",
        ]);
    }

    #[test]
    fn valid_with_special_suffix_chars() {
        assert_all_valid(&[
            "10.1000/xyz_(abc)",
            "10.1000/a:b;c.d_e-f",
            "10.1234/sub/path/deep",
        ]);
    }

    #[test]
    fn invalid_dois() {
        // Each entry pairs a rejected input with the reason it must fail.
        let cases = [
            ("", "empty input"),
            ("not-a-doi", "no `10.` prefix"),
            ("10.123/too-short-registrant", "registrant shorter than 4 digits"),
            ("10.1234/", "empty suffix"),
            ("10.1234", "no slash"),
            ("11.1234/test", "wrong prefix"),
            ("10.12345678901/too-long-registrant", ">9 digits"),
            ("10.abcd/test", "non-digit registrant"),
        ];
        for &(doi, why) in &cases {
            assert!(!validate_doi(doi), "expected invalid DOI ({}): {:?}", why, doi);
        }
    }

    #[test]
    fn normalize_strips_prefix() {
        assert_normalized(&[
            ("https://doi.org/10.1038/TEST", "10.1038/test"),
            ("http://dx.doi.org/10.1038/TEST", "10.1038/test"),
        ]);
    }

    #[test]
    fn normalize_lowercases() {
        assert_normalized(&[("10.1038/ABC", "10.1038/abc")]);
    }

    #[test]
    fn filename_sanitization() {
        let filename = doi_to_filename("10.1038/s41586-020-2649-2");
        assert_eq!(filename, "10.1038_s41586-020-2649-2");
    }
}