citum_engine/values/
text_case.rs

1/*
2SPDX-License-Identifier: MIT OR Apache-2.0
3SPDX-FileCopyrightText: © 2023-2026 Bruce D'Arcus
4*/
5
6//! Title text-case transforms.
7//!
8//! Implements structured-title-aware casing for bibliography output.
9//! All transforms operate on Djot-markup-bearing strings and respect
10//! `.nocase` span protection via the rich-text renderer.
11
12use citum_schema::NoteStartTextCase;
13use citum_schema::options::titles::TextCase;
14
15/// Apply a text-case transform to a single plain-text segment.
16///
17/// This function handles the core casing logic for a single string.
18/// For structured titles with subtitles, use [`apply_to_structured_parts`].
19///
20/// `.nocase`-protected spans are handled at the Djot rendering layer,
21/// not here — this function operates on already-resolved text segments.
22#[must_use]
23pub fn apply_text_case(text: &str, case: TextCase) -> String {
24    match case {
25        TextCase::AsIs => text.to_string(),
26        TextCase::Lowercase => text.to_lowercase(),
27        TextCase::Uppercase => text.to_uppercase(),
28        TextCase::CapitalizeFirst => capitalize_first_word(text),
29        TextCase::Sentence | TextCase::SentenceApa | TextCase::SentenceNlm => {
30            to_sentence_case(text)
31        }
32        TextCase::Title => to_title_case(text),
33    }
34}
35
36/// Apply text-case to a structured title (main + subtitles).
37///
38/// The key difference between sentence-case variants:
39/// - `SentenceApa`: capitalize first word of main title AND each subtitle
40/// - `SentenceNlm`: capitalize first word of main title only
41/// - Other variants: applied uniformly to each part
42#[must_use]
43pub fn apply_to_structured_parts(
44    main: &str,
45    subtitles: &[&str],
46    case: TextCase,
47) -> (String, Vec<String>) {
48    match case {
49        TextCase::SentenceApa => {
50            let main_cased = to_sentence_case(main);
51            let subs_cased = subtitles.iter().map(|s| to_sentence_case(s)).collect();
52            (main_cased, subs_cased)
53        }
54        TextCase::SentenceNlm => {
55            let main_cased = to_sentence_case(main);
56            // NLM: subtitles keep only explicit/protected capitals (lowercase the rest)
57            let subs_cased = subtitles.iter().map(|s| s.to_lowercase()).collect();
58            (main_cased, subs_cased)
59        }
60        _ => {
61            let main_cased = apply_text_case(main, case);
62            let subs_cased = subtitles.iter().map(|s| apply_text_case(s, case)).collect();
63            (main_cased, subs_cased)
64        }
65    }
66}
67
/// Returns true if the given language tag indicates English.
///
/// Only the primary language subtag is inspected, and the comparison is
/// ASCII case-insensitive. Both `-` (BCP 47 style, `en-US`) and `_`
/// (POSIX/ICU style, `en_US`) are accepted as subtag separators, so that an
/// underscore-separated tag is not mistaken for a non-English language.
///
/// `None` is treated as English for backward compatibility with data that
/// carries no language information.
#[must_use]
pub fn is_english_language(lang: Option<&str>) -> bool {
    let Some(tag) = lang else {
        // Default: assume English for backward compatibility
        return true;
    };
    // `split` always yields at least one item, so an empty or separator-free
    // tag is simply compared as a whole.
    tag.split(['-', '_'])
        .next()
        .is_some_and(|primary| primary.eq_ignore_ascii_case("en"))
}
80
81/// Resolve the effective text-case, applying language fallback.
82///
83/// For non-English languages without defined transforms, returns `AsIs`.
84#[must_use]
85pub fn resolve_text_case(case: TextCase, language: Option<&str>) -> TextCase {
86    if is_english_language(language) {
87        case
88    } else {
89        // Non-English: only explicit as-is, lowercase, uppercase pass through.
90        // All English-specific transforms fall back to as-is.
91        match case {
92            TextCase::AsIs | TextCase::Lowercase | TextCase::Uppercase => case,
93            _ => TextCase::AsIs,
94        }
95    }
96}
97
98/// Apply a note-start text-case transform using the same language fallback rules
99/// as other locale-backed casing behavior.
100#[must_use]
101pub(crate) fn apply_note_start_text_case(
102    value: &str,
103    text_case: NoteStartTextCase,
104    language: Option<&str>,
105) -> String {
106    let case = match text_case {
107        NoteStartTextCase::CapitalizeFirst => TextCase::CapitalizeFirst,
108        NoteStartTextCase::Lowercase => TextCase::Lowercase,
109    };
110    apply_text_case(value, resolve_text_case(case, language))
111}
112
113/// Convert text to sentence case: lowercase everything, then capitalize the first word.
114fn to_sentence_case(text: &str) -> String {
115    if text.is_empty() {
116        return String::new();
117    }
118    let lowered = text.to_lowercase();
119    capitalize_first_word(&lowered)
120}
121
/// Upper-case the first alphabetic character of `text`.
///
/// Anything before that character (whitespace, punctuation, digits) and
/// everything after it is copied through unchanged. Text with no alphabetic
/// character comes back as an identical copy.
pub(crate) fn capitalize_first_word(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut remaining = text.chars();

    // Copy the non-alphabetic prefix, stopping right after the first letter.
    for ch in remaining.by_ref() {
        if ch.is_alphabetic() {
            // Upper-casing may expand to several chars (e.g. 'ß' -> "SS").
            out.extend(ch.to_uppercase());
            break;
        }
        out.push(ch);
    }

    // Whatever the loop did not consume is appended verbatim.
    out.push_str(remaining.as_str());
    out
}
139
/// English title-case stop words (articles, short conjunctions, short
/// prepositions, plus the abbreviations "v" and "vs").
///
/// Entries are lowercase; callers compare against an already-lowercased word
/// with surrounding non-alphanumeric characters trimmed off.
const TITLE_CASE_STOP_WORDS: &[&str] = &[
    "a", "an", "and", "as", "at", "but", "by", "for", "from", "in", "nor", "of", "on", "or", "so",
    "the", "to", "up", "yet", "v", "vs",
];
145
146/// Capitalize each component of a hyphenated compound word for title case.
147///
148/// When `force_all` is true (first/last word, post-punctuation), every component
149/// is capitalized. Otherwise interior stop-word components stay lowercase.
150fn capitalize_hyphenated(word: &str, force_all: bool) -> String {
151    word.split('-')
152        .map(|part| {
153            if force_all {
154                capitalize_first_word(part)
155            } else {
156                let alpha_core = part.trim_matches(|c: char| !c.is_alphanumeric());
157                if TITLE_CASE_STOP_WORDS.contains(&alpha_core) {
158                    part.to_string()
159                } else {
160                    capitalize_first_word(part)
161                }
162            }
163        })
164        .collect::<Vec<_>>()
165        .join("-")
166}
167
/// Strip every trailing closing quote or bracket from `word`.
///
/// The removed characters are `"`, `'`, `)`, `]`, `}`, `»`, `”` and `’`.
/// Used so that punctuation such as `?` is still detected as the end of a
/// word when it is followed by a closing quote or bracket.
fn trim_trailing_closing_punctuation(word: &str) -> &str {
    word.trim_end_matches(|c: char| {
        matches!(c, '"' | '\'' | ')' | ']' | '}' | '»' | '”' | '’')
    })
}
171
172/// Convert text to English headline-style title case.
173///
174/// Capitalizes the first and last word unconditionally.
175/// Interior stop words (articles, short prepositions, conjunctions) stay lowercase.
176/// The first word after `:`, `?`, or `!` is always capitalized.
177/// Hyphenated compounds capitalize each non-stop-word component.
178fn to_title_case(text: &str) -> String {
179    if text.is_empty() {
180        return String::new();
181    }
182
183    let words: Vec<&str> = text.split_whitespace().collect();
184    if words.is_empty() {
185        return text.to_string();
186    }
187
188    let last_idx = words.len() - 1;
189    let mut parts: Vec<String> = Vec::with_capacity(words.len());
190    let mut capitalize_next = false;
191
192    for (i, word) in words.iter().enumerate() {
193        let lower = word.to_lowercase();
194        if i == 0 || i == last_idx || capitalize_next {
195            if lower.contains('-') {
196                parts.push(capitalize_hyphenated(&lower, true));
197            } else {
198                parts.push(capitalize_first_word(&lower));
199            }
200        } else {
201            // Strip leading/trailing punctuation when checking stop words so that
202            // words like "(and" or "and)" are still treated as the stop word "and".
203            let alpha_core = lower.trim_matches(|c: char| !c.is_alphanumeric());
204            if TITLE_CASE_STOP_WORDS.contains(&alpha_core) {
205                parts.push(lower);
206            } else if lower.contains('-') {
207                parts.push(capitalize_hyphenated(&lower, false));
208            } else {
209                parts.push(capitalize_first_word(&lower));
210            }
211        }
212        // Capitalize the next word after sentence-ending punctuation or a colon,
213        // even when that punctuation is followed by a closing quote or bracket.
214        let punctuation_core = trim_trailing_closing_punctuation(word);
215        capitalize_next = punctuation_core.ends_with(':')
216            || punctuation_core.ends_with('?')
217            || punctuation_core.ends_with('!');
218    }
219
220    // Rebuild with original whitespace structure
221    let mut result = String::with_capacity(text.len());
222    let mut word_iter = parts.iter();
223    let mut in_word = false;
224    let mut current_word = word_iter.next();
225
226    for ch in text.chars() {
227        if ch.is_whitespace() {
228            if in_word {
229                in_word = false;
230                current_word = word_iter.next();
231            }
232            result.push(ch);
233        } else if !in_word && let Some(word) = current_word {
234            result.push_str(word);
235            in_word = true;
236        }
237    }
238
239    result
240}
241
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing,
    clippy::todo,
    clippy::unimplemented,
    clippy::unreachable,
    clippy::get_unwrap,
    reason = "Panicking is acceptable and often desired in tests."
)]
mod tests {
    use super::*;

    /// Assert that title-casing `input` produces `expected`.
    #[track_caller]
    fn assert_title_case(input: &str, expected: &str) {
        assert_eq!(to_title_case(input), expected, "title-casing {input:?}");
    }

    // --- capitalize_first_word ---

    #[test]
    fn test_capitalize_first_word_basic() {
        let cased = capitalize_first_word("hello world");
        assert_eq!(cased, "Hello world");
    }

    #[test]
    fn test_capitalize_first_word_leading_space() {
        let cased = capitalize_first_word("  hello");
        assert_eq!(cased, "  Hello");
    }

    #[test]
    fn test_capitalize_first_word_empty() {
        let cased = capitalize_first_word("");
        assert_eq!(cased, "");
    }

    #[test]
    fn test_capitalize_first_word_already_upper() {
        let cased = capitalize_first_word("Hello");
        assert_eq!(cased, "Hello");
    }

    // --- to_sentence_case ---

    #[test]
    fn test_sentence_case_basic() {
        let cased = to_sentence_case("The Quick Brown Fox");
        assert_eq!(cased, "The quick brown fox");
    }

    #[test]
    fn test_sentence_case_all_caps() {
        let cased = to_sentence_case("DNA REPLICATION");
        assert_eq!(cased, "Dna replication");
    }

    #[test]
    fn test_sentence_case_empty() {
        let cased = to_sentence_case("");
        assert_eq!(cased, "");
    }

    // --- to_title_case ---

    #[test]
    fn test_title_case_basic() {
        assert_title_case("the quick brown fox", "The Quick Brown Fox");
    }

    #[test]
    fn test_title_case_stop_words() {
        assert_title_case("a tale of two cities", "A Tale of Two Cities");
    }

    #[test]
    fn test_title_case_last_word_capitalized() {
        assert_title_case("the world we live in", "The World We Live In");
    }

    #[test]
    fn test_title_case_after_colon() {
        assert_title_case("the title: a subtitle", "The Title: A Subtitle");
    }

    #[test]
    fn test_title_case_after_colon_stop_word() {
        // First word after colon is a stop word but must still be capitalized
        assert_title_case(
            "history of the world: a new perspective",
            "History of the World: A New Perspective",
        );
    }

    #[test]
    fn test_title_case_after_question_mark() {
        assert_title_case(
            "who's black and why? a hidden chapter",
            "Who's Black and Why? A Hidden Chapter",
        );
    }

    #[test]
    fn test_title_case_after_question_mark_with_closing_quote() {
        assert_title_case(
            "who's black and why?\" a hidden chapter",
            "Who's Black and Why?\" A Hidden Chapter",
        );
    }

    #[test]
    fn test_title_case_from_is_stop_word() {
        assert_title_case(
            "a hidden chapter from the eighteenth-century invention of race",
            "A Hidden Chapter from the Eighteenth-Century Invention of Race",
        );
    }

    #[test]
    fn test_title_case_hyphenated_compound() {
        assert_title_case("eighteenth-century studies", "Eighteenth-Century Studies");
    }

    #[test]
    fn test_title_case_hyphenated_stop_word_part() {
        // "well-to-do": "to" is a stop word → stays lowercase in interior position
        assert_title_case("a well-to-do family", "A Well-to-Do Family");
    }

    // --- apply_to_structured_parts ---

    #[test]
    fn test_sentence_apa_structured() {
        let subtitles = ["History and Practice", "A Comparative View"];
        let (main, subs) = apply_to_structured_parts(
            "Understanding Citation Systems",
            &subtitles,
            TextCase::SentenceApa,
        );
        assert_eq!(main, "Understanding citation systems");
        assert_eq!(subs, ["History and practice", "A comparative view"]);
    }

    #[test]
    fn test_sentence_nlm_structured() {
        let subtitles = ["History and Practice"];
        let (main, subs) = apply_to_structured_parts(
            "Understanding Citation Systems",
            &subtitles,
            TextCase::SentenceNlm,
        );
        assert_eq!(main, "Understanding citation systems");
        // NLM: subtitles lowercased (no first-word capitalization)
        assert_eq!(subs, ["history and practice"]);
    }

    #[test]
    fn test_title_case_structured() {
        let subtitles = ["a new perspective"];
        let (main, subs) =
            apply_to_structured_parts("the dna of empire", &subtitles, TextCase::Title);
        assert_eq!(main, "The Dna of Empire");
        assert_eq!(subs, ["A New Perspective"]);
    }

    // --- resolve_text_case ---

    #[test]
    fn test_english_language_detection() {
        for english in [Some("en"), Some("en-US"), Some("en-GB"), None] {
            assert!(
                is_english_language(english),
                "{english:?} should count as English"
            );
        }
        for other in [Some("de"), Some("fr-FR")] {
            assert!(
                !is_english_language(other),
                "{other:?} should not count as English"
            );
        }
    }

    #[test]
    fn test_resolve_non_english_falls_back() {
        let cases = [
            (TextCase::SentenceApa, "de", TextCase::AsIs),
            (TextCase::Title, "fr", TextCase::AsIs),
            // Explicit lowercase/uppercase pass through for any language
            (TextCase::Lowercase, "de", TextCase::Lowercase),
        ];
        for (requested, lang, expected) in cases {
            assert_eq!(resolve_text_case(requested, Some(lang)), expected);
        }
    }

    #[test]
    fn test_resolve_english_passes_through() {
        let cases = [
            (TextCase::SentenceApa, "en"),
            (TextCase::Title, "en-US"),
        ];
        for (requested, lang) in cases {
            assert_eq!(resolve_text_case(requested, Some(lang)), requested);
        }
    }

    #[test]
    fn test_note_start_capitalize_first_uses_english_language_rules() {
        let cased = apply_note_start_text_case(
            "edited by",
            NoteStartTextCase::CapitalizeFirst,
            Some("en-US"),
        );
        assert_eq!(cased, "Edited by");
    }

    #[test]
    fn test_note_start_capitalize_first_falls_back_to_as_is_for_non_english() {
        let cased = apply_note_start_text_case(
            "hg. von",
            NoteStartTextCase::CapitalizeFirst,
            Some("de-DE"),
        );
        assert_eq!(cased, "hg. von");
    }

    #[test]
    fn test_note_start_capitalize_first_is_no_op_for_uncased_scripts() {
        let cased =
            apply_note_start_text_case("ابن سينا", NoteStartTextCase::CapitalizeFirst, Some("ar"));
        assert_eq!(cased, "ابن سينا");
    }
}
citum_engine/values/text_case.rs

citum_engine/values/
text_case.rs