Skip to main content

crossref_lib/
utils.rs

1use crate::models::WorkMeta;
2
/// Citation key generation style.
///
/// The enum is a plain, fieldless selector, so it also derives `Eq` and `Hash`
/// to allow use as a map/set key and in exhaustive equality comparisons.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum KeyStyle {
    /// Author last-name(s) + year, e.g. `Smith2024`.
    AuthorYear,
    /// Significant title words + year, e.g. `MachineLearning2024`.
    ShortTitle,
}
11
12/// Generate a citation key according to the given style.
13pub fn generate_citation_key_by_style(work: &WorkMeta, style: &KeyStyle) -> String {
14    match style {
15        KeyStyle::AuthorYear => generate_citation_key(&work.authors, work.year),
16        KeyStyle::ShortTitle => generate_short_title_key(work),
17    }
18}
19
20/// Generate an author-year citation key, e.g. `Smith2024` or `SmithJones2024`.
21///
22/// Collision suffixes (`a`, `b`, …) must be resolved by the caller after checking
23/// the existing keys in a bibliography.
24pub fn generate_citation_key(authors: &[String], year: Option<i32>) -> String {
25    let author_part = authors
26        .iter()
27        .take(2)
28        .filter_map(|a| {
29            // Authors are stored as "Family, Given" or just "Family"
30            a.split(',')
31                .next()
32                .map(|family| family.trim().to_string())
33        })
34        .filter(|s| !s.is_empty())
35        .map(capitalise_first)
36        .collect::<Vec<_>>()
37        .join("");
38
39    let year_part = year
40        .map(|y| y.to_string())
41        .unwrap_or_default();
42
43    if author_part.is_empty() {
44        format!("Unknown{year_part}")
45    } else {
46        format!("{author_part}{year_part}")
47    }
48}
49
/// Return a key derived from `base_key` that does not occur in `existing_keys`.
///
/// If `base_key` itself is unused it is returned unchanged. Otherwise
/// candidates are tried in this order and the first free one wins:
///
/// 1. single-letter suffixes `a` … `z`,
/// 2. two-letter suffixes `aa`, `ab`, … `zz`,
/// 3. `{base_key}_conflict`, then `{base_key}_conflict2`, `_conflict3`, …
///
/// The last tier guarantees the result is unique even when every lettered
/// suffix (and `_conflict` itself) is already taken.
pub fn resolve_key_conflict(base_key: &str, existing_keys: &[String]) -> String {
    // Fast path: a single scan without allocating when there is no conflict.
    if !existing_keys.iter().any(|key| key == base_key) {
        return base_key.to_string();
    }

    // Many candidates may be probed below, so index the keys once.
    let taken: std::collections::HashSet<&str> =
        existing_keys.iter().map(String::as_str).collect();

    let single = ('a'..='z').map(|c| format!("{base_key}{c}"));
    let double = ('a'..='z')
        .flat_map(|c1| ('a'..='z').map(move |c2| format!("{base_key}{c1}{c2}")));
    let numbered = std::iter::once_with(|| format!("{base_key}_conflict"))
        .chain((2..=u32::MAX).map(|n| format!("{base_key}_conflict{n}")));

    single
        .chain(double)
        .chain(numbered)
        .find(|candidate| !taken.contains(candidate.as_str()))
        // Only reachable if billions of numbered keys exist; keep the legacy fallback.
        .unwrap_or_else(|| format!("{base_key}_conflict"))
}
72
/// Normalise a raw DOI string: strip resolver URL / `doi:` prefixes if present.
///
/// E.g. `https://doi.org/10.1234/test` → `10.1234/test`
///
/// Surrounding whitespace is trimmed. Prefixes are matched ASCII
/// case-insensitively (`DOI:`, `HTTPS://DOI.ORG/`), may appear in any order or
/// repeatedly, and whitespace following a prefix (`doi: 10.1234/test`) is
/// dropped. The DOI itself is returned with its original casing.
pub fn normalise_doi(doi: &str) -> String {
    const PREFIXES: &[&str] = &[
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "doi:",
    ];

    let mut rest = doi.trim();
    // Keep peeling prefixes until none matches; each pass shortens `rest`.
    while let Some(tail) = PREFIXES
        .iter()
        .find_map(move |prefix| strip_prefix_ignore_ascii_case(rest, prefix))
    {
        rest = tail.trim_start();
    }
    rest.to_string()
}

/// Strip `prefix` from the start of `text`, comparing ASCII letters case-insensitively.
fn strip_prefix_ignore_ascii_case<'a>(text: &'a str, prefix: &str) -> Option<&'a str> {
    // `get` yields `None` when `text` is too short or the cut is not on a char boundary.
    let head = text.get(..prefix.len())?;
    head.eq_ignore_ascii_case(prefix)
        .then(|| &text[prefix.len()..])
}
85
/// Upper-case the leading character of `s` and keep everything after it as-is.
///
/// An empty input yields an empty string. The leading character is mapped with
/// `char::to_uppercase`, which can expand to more than one character.
pub fn capitalise_first(s: String) -> String {
    s.chars().next().map_or_else(String::new, |initial| {
        let tail = &s[initial.len_utf8()..];
        let mut out: String = initial.to_uppercase().collect();
        out.push_str(tail);
        out
    })
}
94
95/// Generate a short-title citation key, e.g. `MachineLearning2024`.
96///
97/// Strips common stop-words, takes the first four significant words,
98/// capitalises each, and appends the year.
99fn generate_short_title_key(work: &WorkMeta) -> String {
100    const STOP_WORDS: &[&str] = &[
101        "a", "an", "the", "of", "in", "on", "at", "to", "for", "and",
102        "or", "by", "with", "is", "are", "was", "were", "from", "as",
103        "into", "that", "this", "its", "be", "has", "have", "had",
104    ];
105
106    let title_part: String = work
107        .title
108        .as_deref()
109        .unwrap_or("")
110        .split_whitespace()
111        .filter(|w| {
112            // Strip leading/trailing punctuation for the stop-word check
113            let lower: String = w
114                .chars()
115                .filter(|c| c.is_alphabetic())
116                .collect::<String>()
117                .to_lowercase();
118            !lower.is_empty() && !STOP_WORDS.contains(&lower.as_str())
119        })
120        .take(4)
121        .map(|w| {
122            // Keep only alphanumeric characters, then capitalise
123            let clean: String = w.chars().filter(|c| c.is_alphanumeric()).collect();
124            capitalise_first(clean)
125        })
126        .filter(|s| !s.is_empty())
127        .collect::<Vec<_>>()
128        .join("");
129
130    let year_part = work.year.map(|y| y.to_string()).unwrap_or_default();
131
132    if title_part.is_empty() {
133        format!("Unknown{year_part}")
134    } else {
135        format!("{title_part}{year_part}")
136    }
137}
138
#[cfg(test)]
mod tests {
    use super::*;

    /// Turn string literals into the owned `String`s the key helpers take.
    fn owned(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    /// Build a `WorkMeta` fixture with the given DOI, title, authors and year;
    /// every other field takes its default value.
    fn work(doi: &str, title: &str, authors: &[&str], year: i32) -> WorkMeta {
        WorkMeta {
            doi: doi.to_string(),
            title: Some(title.to_string()),
            authors: owned(authors),
            year: Some(year),
            ..WorkMeta::default()
        }
    }

    #[test]
    fn test_normalise_doi_strips_url() {
        let expected = "10.1234/test";
        // Resolver URL, bare DOI and `doi:` scheme all normalise to the bare DOI.
        assert_eq!(normalise_doi("https://doi.org/10.1234/test"), expected);
        assert_eq!(normalise_doi("10.1234/test"), expected);
        assert_eq!(normalise_doi("doi:10.1234/test"), expected);
    }

    #[test]
    fn test_generate_citation_key_single_author() {
        let key = generate_citation_key(&owned(&["Smith, John"]), Some(2024));
        assert_eq!(key, "Smith2024");
    }

    #[test]
    fn test_generate_citation_key_two_authors() {
        let key = generate_citation_key(&owned(&["Smith, John", "Jones, Alice"]), Some(2024));
        assert_eq!(key, "SmithJones2024");
    }

    #[test]
    fn test_resolve_key_conflict() {
        // Base key and suffix `a` are taken, so `b` is the first free suffix.
        let taken = owned(&["Smith2024", "Smith2024a"]);
        assert_eq!(resolve_key_conflict("Smith2024", &taken), "Smith2024b");
    }

    #[test]
    fn test_resolve_key_conflict_beyond_z() {
        // Occupy the base key plus every single-letter suffix.
        let taken: Vec<String> = std::iter::once("Smith2024".to_string())
            .chain((b'a'..=b'z').map(|c| format!("Smith2024{}", c as char)))
            .collect();
        // The first two-letter suffix must be chosen next.
        assert_eq!(resolve_key_conflict("Smith2024", &taken), "Smith2024aa");
    }

    #[test]
    fn test_short_title_key_style() {
        let fixture = work(
            "10.1234/ml",
            "Machine Learning in Practice",
            &["Smith, John"],
            2024,
        );
        // "in" is a stop word; "Machine", "Learning", "Practice" remain.
        assert_eq!(
            generate_citation_key_by_style(&fixture, &KeyStyle::ShortTitle),
            "MachineLearningPractice2024"
        );
    }

    #[test]
    fn test_short_title_key_strips_stop_words() {
        let fixture = work("10.1234/a", "The Role of AI in the Future", &[], 2020);
        // "The", "of", "in", "the" are dropped, leaving three significant
        // words: "Role", "AI", "Future".
        assert_eq!(
            generate_citation_key_by_style(&fixture, &KeyStyle::ShortTitle),
            "RoleAIFuture2020"
        );
    }

    #[test]
    fn test_author_year_key_style() {
        let fixture = work("10.1234/t", "Some Title", &["Smith, John"], 2024);
        assert_eq!(
            generate_citation_key_by_style(&fixture, &KeyStyle::AuthorYear),
            "Smith2024"
        );
    }
}