citum_engine/values/
text_case.rs

1/*
2SPDX-License-Identifier: MIT OR Apache-2.0
3SPDX-FileCopyrightText: © 2023-2026 Bruce D'Arcus and Citum contributors
4*/
5
6//! Title text-case transforms.
7//!
8//! Implements structured-title-aware casing for bibliography output.
9//! All transforms operate on Djot-markup-bearing strings and respect
10//! `.nocase` span protection via the rich-text renderer.
11
12use citum_schema::NoteStartTextCase;
13use citum_schema::options::titles::TextCase;
14
15/// Apply a text-case transform to a single plain-text segment.
16///
17/// This function handles the core casing logic for a single string.
18/// For structured titles with subtitles, use [`apply_to_structured_parts`].
19///
20/// `.nocase`-protected spans are handled at the Djot rendering layer,
21/// not here — this function operates on already-resolved text segments.
22#[must_use]
23pub fn apply_text_case(text: &str, case: TextCase) -> String {
24    match case {
25        TextCase::AsIs => text.to_string(),
26        TextCase::Lowercase => text.to_lowercase(),
27        TextCase::Uppercase => text.to_uppercase(),
28        TextCase::CapitalizeFirst => capitalize_first_word(text),
29        TextCase::Sentence | TextCase::SentenceApa | TextCase::SentenceNlm => {
30            to_sentence_case(text)
31        }
32        TextCase::Title => to_title_case(text),
33    }
34}
35
36/// Apply text-case to a structured title (main + subtitles).
37///
38/// The key difference between sentence-case variants:
39/// - `SentenceApa`: capitalize first word of main title AND each subtitle
40/// - `SentenceNlm`: capitalize first word of main title only
41/// - Other variants: applied uniformly to each part
42#[must_use]
43pub fn apply_to_structured_parts(
44    main: &str,
45    subtitles: &[&str],
46    case: TextCase,
47) -> (String, Vec<String>) {
48    match case {
49        TextCase::SentenceApa => {
50            let main_cased = to_sentence_case(main);
51            let subs_cased = subtitles.iter().map(|s| to_sentence_case(s)).collect();
52            (main_cased, subs_cased)
53        }
54        TextCase::SentenceNlm => {
55            let main_cased = to_sentence_case(main);
56            // NLM: subtitles keep only explicit/protected capitals (lowercase the rest)
57            let subs_cased = subtitles.iter().map(|s| s.to_lowercase()).collect();
58            (main_cased, subs_cased)
59        }
60        _ => {
61            let main_cased = apply_text_case(main, case);
62            let subs_cased = subtitles.iter().map(|s| apply_text_case(s, case)).collect();
63            (main_cased, subs_cased)
64        }
65    }
66}
67
/// Returns true if the given language tag indicates English.
///
/// Only the primary language subtag is inspected, and it is compared with
/// `"en"` ignoring ASCII case. The primary subtag ends at the first `-`
/// (BCP 47 style, `en-US`) or `_` (POSIX / Unicode locale style, `en_US`),
/// so both spellings of a regional English tag are recognised.
///
/// A missing tag (`None`) is treated as English for backward compatibility.
#[must_use]
pub fn is_english_language(lang: Option<&str>) -> bool {
    let Some(tag) = lang else {
        // Default: assume English for backward compatibility
        return true;
    };
    // `split` always yields at least one item, so the fallback is never taken;
    // it only avoids an `unwrap`.
    let primary = tag.split(['-', '_']).next().unwrap_or(tag);
    primary.eq_ignore_ascii_case("en")
}
80
81/// Resolve the effective text-case, applying language fallback.
82///
83/// For non-English languages without defined transforms, returns `AsIs`.
84#[must_use]
85pub fn resolve_text_case(case: TextCase, language: Option<&str>) -> TextCase {
86    if is_english_language(language) {
87        case
88    } else {
89        // Non-English: only explicit as-is, lowercase, uppercase pass through.
90        // All English-specific transforms fall back to as-is.
91        match case {
92            TextCase::AsIs | TextCase::Lowercase | TextCase::Uppercase => case,
93            _ => TextCase::AsIs,
94        }
95    }
96}
97
98/// Apply a note-start text-case transform using the same language fallback rules
99/// as other locale-backed casing behavior.
100#[must_use]
101pub(crate) fn apply_note_start_text_case(
102    value: &str,
103    text_case: NoteStartTextCase,
104    language: Option<&str>,
105) -> String {
106    let case = match text_case {
107        NoteStartTextCase::CapitalizeFirst => TextCase::CapitalizeFirst,
108        NoteStartTextCase::Lowercase => TextCase::Lowercase,
109    };
110    apply_text_case(value, resolve_text_case(case, language))
111}
112
113/// Convert text to sentence case: lowercase everything, then capitalize the first word.
114fn to_sentence_case(text: &str) -> String {
115    if text.is_empty() {
116        return String::new();
117    }
118    let lowered = text.to_lowercase();
119    capitalize_first_word(&lowered)
120}
121
/// Uppercase the first alphabetic character found in `text`.
///
/// Everything before that character (whitespace, punctuation, digits, ...)
/// and everything after it is copied through unchanged. If `text` contains no
/// alphabetic character the result is an unchanged copy.
pub(crate) fn capitalize_first_word(text: &str) -> String {
    let Some((start, first)) = text.char_indices().find(|(_, c)| c.is_alphabetic()) else {
        return text.to_string();
    };

    let (head, tail) = text.split_at(start);
    // Step past the letter being replaced; `tail` starts with it.
    let mut after_first = tail.chars();
    after_first.next();

    let mut result = String::with_capacity(text.len());
    result.push_str(head);
    // Uppercasing may expand to several chars (e.g. 'ß' -> "SS").
    result.extend(first.to_uppercase());
    result.push_str(after_first.as_str());
    result
}
139
/// Uppercase the first alphabetic character of `text` that is not part of a
/// leading markup construct.
///
/// The scan walks the string from the start and jumps over:
/// - HTML tags: `<` up to and including the next `>`;
/// - LaTeX command prefixes: `\name{` or `\name[...]{`;
/// - Typst command prefixes: `#name[`.
///
/// A `<` with no later `>`, or a `\name` / `#name` that is not followed by the
/// expected opening bracket, is not treated as markup and is scanned like any
/// other text.
///
/// Use this variant when the input may already contain rendered markup from a
/// pre-formatted component. For plain-text input the result matches
/// [`capitalize_first_word`].
pub(crate) fn capitalize_first_word_markup_aware(text: &str) -> String {
    // Byte offset of the scan position; always on a char boundary because it
    // only advances by whole chars or past ASCII markup delimiters.
    let mut pos = 0;

    while let Some(rest) = text.get(pos..).filter(|tail| !tail.is_empty()) {
        if let Some(skip) = markup_prefix_len(rest) {
            pos += skip;
            continue;
        }

        let Some(ch) = rest.chars().next() else { break };
        if ch.is_alphabetic() {
            let mut result = String::with_capacity(text.len());
            result.push_str(text.get(..pos).unwrap_or_default());
            result.extend(ch.to_uppercase());
            result.push_str(rest.get(ch.len_utf8()..).unwrap_or_default());
            return result;
        }

        pos += ch.len_utf8();
    }

    text.to_string()
}

/// Byte length of the markup prefix that `rest` starts with, or `None` when
/// `rest` does not start with a recognised HTML / LaTeX / Typst prefix.
fn markup_prefix_len(rest: &str) -> Option<usize> {
    let bytes = rest.as_bytes();
    match *bytes.first()? {
        // HTML tag: <...>
        b'<' => rest.find('>').map(|close| close + 1),
        // LaTeX command prefix: \letters{ or \letters[...]{
        b'\\' => {
            let after_name = 1 + command_name_len(bytes)?;
            // An unterminated `[` leaves the position on the `[` itself.
            let after_opt = if bytes.get(after_name) == Some(&b'[') {
                rest.get(after_name..)
                    .and_then(|tail| tail.find(']'))
                    .map_or(after_name, |close| after_name + close + 1)
            } else {
                after_name
            };
            (bytes.get(after_opt) == Some(&b'{')).then_some(after_opt + 1)
        }
        // Typst command prefix: #letters[
        b'#' => {
            let after_name = 1 + command_name_len(bytes)?;
            (bytes.get(after_name) == Some(&b'[')).then_some(after_name + 1)
        }
        _ => None,
    }
}

/// Number of ASCII letters directly after the one-byte sigil at the start of
/// `bytes`; `None` when no letter follows it.
fn command_name_len(bytes: &[u8]) -> Option<usize> {
    let len = bytes
        .iter()
        .skip(1)
        .take_while(|b| b.is_ascii_alphabetic())
        .count();
    (len > 0).then_some(len)
}
227
228/// Apply a text-case transform to a pre-formatted string that may contain
229/// rendered markup.
230///
231/// Delegates to [`capitalize_first_word_markup_aware`] for `CapitalizeFirst`;
232/// all other cases fall back to [`apply_text_case`].
233pub(crate) fn apply_text_case_markup_aware(text: &str, case: TextCase) -> String {
234    match case {
235        TextCase::CapitalizeFirst => capitalize_first_word_markup_aware(text),
236        _ => apply_text_case(text, case),
237    }
238}
239
// English title-case stop words (articles, short conjunctions, short prepositions),
// plus the abbreviations "v" and "vs".
//
// Entries are lowercase: `to_title_case` and `capitalize_hyphenated` look up the
// lowercased word with surrounding non-alphanumeric characters trimmed off.
const TITLE_CASE_STOP_WORDS: &[&str] = &[
    "a", "an", "and", "as", "at", "but", "by", "for", "from", "in", "nor", "of", "on", "or", "so",
    "the", "to", "up", "yet", "v", "vs",
];
245
246/// Capitalize each component of a hyphenated compound word for title case.
247///
248/// When `force_all` is true (first/last word, post-punctuation), every component
249/// is capitalized. Otherwise interior stop-word components stay lowercase.
250fn capitalize_hyphenated(word: &str, force_all: bool) -> String {
251    word.split('-')
252        .map(|part| {
253            if force_all {
254                capitalize_first_word(part)
255            } else {
256                let alpha_core = part.trim_matches(|c: char| !c.is_alphanumeric());
257                if TITLE_CASE_STOP_WORDS.contains(&alpha_core) {
258                    part.to_string()
259                } else {
260                    capitalize_first_word(part)
261                }
262            }
263        })
264        .collect::<Vec<_>>()
265        .join("-")
266}
267
/// Strip any run of closing quotes and brackets (`"`, `'`, `)`, `]`, `}`,
/// `»`, `”`, `’`) from the end of `word`, returning the remaining prefix.
fn trim_trailing_closing_punctuation(word: &str) -> &str {
    word.trim_end_matches(|c: char| {
        matches!(c, '"' | '\'' | ')' | ']' | '}' | '»' | '”' | '’')
    })
}
271
272/// Convert text to English headline-style title case.
273///
274/// Capitalizes the first and last word unconditionally.
275/// Interior stop words (articles, short prepositions, conjunctions) stay lowercase.
276/// The first word after `:`, `?`, or `!` is always capitalized.
277/// Hyphenated compounds capitalize each non-stop-word component.
278fn to_title_case(text: &str) -> String {
279    if text.is_empty() {
280        return String::new();
281    }
282
283    let words: Vec<&str> = text.split_whitespace().collect();
284    if words.is_empty() {
285        return text.to_string();
286    }
287
288    let last_idx = words.len() - 1;
289    let mut parts: Vec<String> = Vec::with_capacity(words.len());
290    let mut capitalize_next = false;
291
292    for (i, word) in words.iter().enumerate() {
293        let lower = word.to_lowercase();
294        if i == 0 || i == last_idx || capitalize_next {
295            if lower.contains('-') {
296                parts.push(capitalize_hyphenated(&lower, true));
297            } else {
298                parts.push(capitalize_first_word(&lower));
299            }
300        } else {
301            // Strip leading/trailing punctuation when checking stop words so that
302            // words like "(and" or "and)" are still treated as the stop word "and".
303            let alpha_core = lower.trim_matches(|c: char| !c.is_alphanumeric());
304            if TITLE_CASE_STOP_WORDS.contains(&alpha_core) {
305                parts.push(lower);
306            } else if lower.contains('-') {
307                parts.push(capitalize_hyphenated(&lower, false));
308            } else {
309                parts.push(capitalize_first_word(&lower));
310            }
311        }
312        // Capitalize the next word after sentence-ending punctuation or a colon,
313        // even when that punctuation is followed by a closing quote or bracket.
314        let punctuation_core = trim_trailing_closing_punctuation(word);
315        capitalize_next = punctuation_core.ends_with(':')
316            || punctuation_core.ends_with('?')
317            || punctuation_core.ends_with('!');
318    }
319
320    // Rebuild with original whitespace structure
321    let mut result = String::with_capacity(text.len());
322    let mut word_iter = parts.iter();
323    let mut in_word = false;
324    let mut current_word = word_iter.next();
325
326    for ch in text.chars() {
327        if ch.is_whitespace() {
328            if in_word {
329                in_word = false;
330                current_word = word_iter.next();
331            }
332            result.push(ch);
333        } else if !in_word && let Some(word) = current_word {
334            result.push_str(word);
335            in_word = true;
336        }
337    }
338
339    result
340}
341
// Unit tests for the casing helpers in this module. Each section below
// exercises one function through fixed input/expected-output pairs.
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing,
    clippy::todo,
    clippy::unimplemented,
    clippy::unreachable,
    clippy::get_unwrap,
    reason = "Panicking is acceptable and often desired in tests."
)]
mod tests {
    use super::*;

    // --- capitalize_first_word ---

    #[test]
    fn test_capitalize_first_word_basic() {
        assert_eq!(capitalize_first_word("hello world"), "Hello world");
    }

    #[test]
    fn test_capitalize_first_word_leading_space() {
        assert_eq!(capitalize_first_word("  hello"), "  Hello");
    }

    #[test]
    fn test_capitalize_first_word_empty() {
        assert_eq!(capitalize_first_word(""), "");
    }

    #[test]
    fn test_capitalize_first_word_already_upper() {
        assert_eq!(capitalize_first_word("Hello"), "Hello");
    }

    // --- capitalize_first_word_markup_aware ---
    // Covers plain text plus HTML, LaTeX, Typst and underscore-delimited input.

    #[test]
    fn test_capitalize_markup_aware_plain_text() {
        assert_eq!(
            capitalize_first_word_markup_aware("the collected essays"),
            "The collected essays"
        );
    }

    #[test]
    fn test_capitalize_markup_aware_html_tag() {
        assert_eq!(
            capitalize_first_word_markup_aware("<em>the collected essays</em>"),
            "<em>The collected essays</em>"
        );
    }

    #[test]
    fn test_capitalize_markup_aware_html_nested_tags() {
        assert_eq!(
            capitalize_first_word_markup_aware(r#"<span class="x"><em>the title</em></span>"#),
            r#"<span class="x"><em>The title</em></span>"#
        );
    }

    #[test]
    fn test_capitalize_markup_aware_latex_command() {
        assert_eq!(
            capitalize_first_word_markup_aware(r"\emph{the collected essays}"),
            r"\emph{The collected essays}"
        );
    }

    #[test]
    fn test_capitalize_markup_aware_latex_number_not_corrupted() {
        // Regression: \emph{521} must not become \Emph{521}
        assert_eq!(
            capitalize_first_word_markup_aware(r"\emph{521}"),
            r"\emph{521}"
        );
    }

    #[test]
    fn test_capitalize_markup_aware_typst_command() {
        assert_eq!(
            capitalize_first_word_markup_aware("#emph[the collected essays]"),
            "#emph[The collected essays]"
        );
    }

    #[test]
    fn test_capitalize_markup_aware_plain_underscore_delimiters() {
        // PlainText emph uses _..._; _ is non-alphabetic so this was already safe
        assert_eq!(
            capitalize_first_word_markup_aware("_the collected essays_"),
            "_The collected essays_"
        );
    }

    #[test]
    fn test_capitalize_markup_aware_empty_string() {
        assert_eq!(capitalize_first_word_markup_aware(""), "");
    }

    #[test]
    fn test_capitalize_markup_aware_all_markup_no_text() {
        assert_eq!(capitalize_first_word_markup_aware("<em></em>"), "<em></em>");
    }

    // --- to_sentence_case ---

    #[test]
    fn test_sentence_case_basic() {
        assert_eq!(
            to_sentence_case("The Quick Brown Fox"),
            "The quick brown fox"
        );
    }

    #[test]
    fn test_sentence_case_all_caps() {
        // Acronyms are not preserved: the whole input is lowercased first.
        assert_eq!(to_sentence_case("DNA REPLICATION"), "Dna replication");
    }

    #[test]
    fn test_sentence_case_empty() {
        assert_eq!(to_sentence_case(""), "");
    }

    // --- to_title_case ---

    #[test]
    fn test_title_case_basic() {
        assert_eq!(to_title_case("the quick brown fox"), "The Quick Brown Fox");
    }

    #[test]
    fn test_title_case_stop_words() {
        assert_eq!(
            to_title_case("a tale of two cities"),
            "A Tale of Two Cities"
        );
    }

    #[test]
    fn test_title_case_last_word_capitalized() {
        // "in" is a stop word but is capitalized because it is the last word.
        assert_eq!(
            to_title_case("the world we live in"),
            "The World We Live In"
        );
    }

    #[test]
    fn test_title_case_after_colon() {
        assert_eq!(
            to_title_case("the title: a subtitle"),
            "The Title: A Subtitle"
        );
    }

    #[test]
    fn test_title_case_after_colon_stop_word() {
        // First word after colon is a stop word but must still be capitalized
        assert_eq!(
            to_title_case("history of the world: a new perspective"),
            "History of the World: A New Perspective"
        );
    }

    #[test]
    fn test_title_case_after_question_mark() {
        assert_eq!(
            to_title_case("who's black and why? a hidden chapter"),
            "Who's Black and Why? A Hidden Chapter"
        );
    }

    #[test]
    fn test_title_case_after_question_mark_with_closing_quote() {
        assert_eq!(
            to_title_case("who's black and why?\" a hidden chapter"),
            "Who's Black and Why?\" A Hidden Chapter"
        );
    }

    #[test]
    fn test_title_case_from_is_stop_word() {
        assert_eq!(
            to_title_case("a hidden chapter from the eighteenth-century invention of race"),
            "A Hidden Chapter from the Eighteenth-Century Invention of Race"
        );
    }

    #[test]
    fn test_title_case_hyphenated_compound() {
        assert_eq!(
            to_title_case("eighteenth-century studies"),
            "Eighteenth-Century Studies"
        );
    }

    #[test]
    fn test_title_case_hyphenated_stop_word_part() {
        // "well-to-do": "to" is a stop word → stays lowercase in interior position
        assert_eq!(to_title_case("a well-to-do family"), "A Well-to-Do Family");
    }

    // --- apply_to_structured_parts ---

    #[test]
    fn test_sentence_apa_structured() {
        let (main, subs) = apply_to_structured_parts(
            "Understanding Citation Systems",
            &["History and Practice", "A Comparative View"],
            TextCase::SentenceApa,
        );
        assert_eq!(main, "Understanding citation systems");
        assert_eq!(subs, vec!["History and practice", "A comparative view"]);
    }

    #[test]
    fn test_sentence_nlm_structured() {
        let (main, subs) = apply_to_structured_parts(
            "Understanding Citation Systems",
            &["History and Practice"],
            TextCase::SentenceNlm,
        );
        assert_eq!(main, "Understanding citation systems");
        // NLM: subtitles lowercased (no first-word capitalization)
        assert_eq!(subs, vec!["history and practice"]);
    }

    #[test]
    fn test_title_case_structured() {
        // Each part is title-cased on its own, so the subtitle's leading "a"
        // is capitalized as a first word.
        let (main, subs) =
            apply_to_structured_parts("the dna of empire", &["a new perspective"], TextCase::Title);
        assert_eq!(main, "The Dna of Empire");
        assert_eq!(subs, vec!["A New Perspective"]);
    }

    // --- resolve_text_case ---

    #[test]
    fn test_english_language_detection() {
        assert!(is_english_language(Some("en")));
        assert!(is_english_language(Some("en-US")));
        assert!(is_english_language(Some("en-GB")));
        assert!(is_english_language(None));
        assert!(!is_english_language(Some("de")));
        assert!(!is_english_language(Some("fr-FR")));
    }

    #[test]
    fn test_resolve_non_english_falls_back() {
        assert_eq!(
            resolve_text_case(TextCase::SentenceApa, Some("de")),
            TextCase::AsIs
        );
        assert_eq!(
            resolve_text_case(TextCase::Title, Some("fr")),
            TextCase::AsIs
        );
        // Explicit lowercase/uppercase pass through for any language
        assert_eq!(
            resolve_text_case(TextCase::Lowercase, Some("de")),
            TextCase::Lowercase
        );
    }

    #[test]
    fn test_resolve_english_passes_through() {
        assert_eq!(
            resolve_text_case(TextCase::SentenceApa, Some("en")),
            TextCase::SentenceApa
        );
        assert_eq!(
            resolve_text_case(TextCase::Title, Some("en-US")),
            TextCase::Title
        );
    }

    // --- apply_note_start_text_case ---

    #[test]
    fn test_note_start_capitalize_first_uses_english_language_rules() {
        assert_eq!(
            apply_note_start_text_case(
                "edited by",
                NoteStartTextCase::CapitalizeFirst,
                Some("en-US"),
            ),
            "Edited by"
        );
    }

    #[test]
    fn test_note_start_capitalize_first_falls_back_to_as_is_for_non_english() {
        // CapitalizeFirst is not passed through for non-English tags, so the
        // text comes back unchanged.
        assert_eq!(
            apply_note_start_text_case(
                "hg. von",
                NoteStartTextCase::CapitalizeFirst,
                Some("de-DE"),
            ),
            "hg. von"
        );
    }

    #[test]
    fn test_note_start_capitalize_first_is_no_op_for_uncased_scripts() {
        assert_eq!(
            apply_note_start_text_case("ابن سينا", NoteStartTextCase::CapitalizeFirst, Some("ar"),),
            "ابن سينا"
        );
    }
}
citum_engine/values/text_case.rs

citum_engine/values/
text_case.rs