1use citum_schema::NoteStartTextCase;
13use citum_schema::options::titles::TextCase;
14
15#[must_use]
23pub fn apply_text_case(text: &str, case: TextCase) -> String {
24 match case {
25 TextCase::AsIs => text.to_string(),
26 TextCase::Lowercase => text.to_lowercase(),
27 TextCase::Uppercase => text.to_uppercase(),
28 TextCase::CapitalizeFirst => capitalize_first_word(text),
29 TextCase::Sentence | TextCase::SentenceApa | TextCase::SentenceNlm => {
30 to_sentence_case(text)
31 }
32 TextCase::Title => to_title_case(text),
33 }
34}
35
36#[must_use]
43pub fn apply_to_structured_parts(
44 main: &str,
45 subtitles: &[&str],
46 case: TextCase,
47) -> (String, Vec<String>) {
48 match case {
49 TextCase::SentenceApa => {
50 let main_cased = to_sentence_case(main);
51 let subs_cased = subtitles.iter().map(|s| to_sentence_case(s)).collect();
52 (main_cased, subs_cased)
53 }
54 TextCase::SentenceNlm => {
55 let main_cased = to_sentence_case(main);
56 let subs_cased = subtitles.iter().map(|s| s.to_lowercase()).collect();
58 (main_cased, subs_cased)
59 }
60 _ => {
61 let main_cased = apply_text_case(main, case);
62 let subs_cased = subtitles.iter().map(|s| apply_text_case(s, case)).collect();
63 (main_cased, subs_cased)
64 }
65 }
66}
67
/// Reports whether a language tag denotes English.
///
/// Only the primary subtag is inspected, and the comparison ignores ASCII
/// case. Both `-` (BCP 47 style, `en-US`) and `_` (POSIX/CLDR style, `en_US`)
/// are accepted as subtag separators so that locale identifiers from either
/// convention are recognised.
///
/// A missing language (`None`) is treated as English.
#[must_use]
pub fn is_english_language(lang: Option<&str>) -> bool {
    match lang {
        None => true,
        Some(tag) => tag
            .split(['-', '_'])
            .next()
            .is_some_and(|primary| primary.eq_ignore_ascii_case("en")),
    }
}
80
81#[must_use]
85pub fn resolve_text_case(case: TextCase, language: Option<&str>) -> TextCase {
86 if is_english_language(language) {
87 case
88 } else {
89 match case {
92 TextCase::AsIs | TextCase::Lowercase | TextCase::Uppercase => case,
93 _ => TextCase::AsIs,
94 }
95 }
96}
97
98#[must_use]
101pub(crate) fn apply_note_start_text_case(
102 value: &str,
103 text_case: NoteStartTextCase,
104 language: Option<&str>,
105) -> String {
106 let case = match text_case {
107 NoteStartTextCase::CapitalizeFirst => TextCase::CapitalizeFirst,
108 NoteStartTextCase::Lowercase => TextCase::Lowercase,
109 };
110 apply_text_case(value, resolve_text_case(case, language))
111}
112
113fn to_sentence_case(text: &str) -> String {
115 if text.is_empty() {
116 return String::new();
117 }
118 let lowered = text.to_lowercase();
119 capitalize_first_word(&lowered)
120}
121
/// Uppercases the first alphabetic character of `text` and leaves the rest untouched.
///
/// Characters that precede the first letter (whitespace, digits, punctuation)
/// are copied through as-is. When `text` contains no alphabetic character an
/// unchanged copy is returned.
pub(crate) fn capitalize_first_word(text: &str) -> String {
    let Some((at, letter)) = text.char_indices().find(|(_, c)| c.is_alphabetic()) else {
        return text.to_string();
    };

    let mut out = String::with_capacity(text.len());
    out.push_str(text.get(..at).unwrap_or_default());
    // One character may uppercase to several (for example `ß` becomes `SS`).
    out.extend(letter.to_uppercase());
    out.push_str(text.get(at + letter.len_utf8()..).unwrap_or_default());
    out
}
139
/// Uppercases the first alphabetic character of `text` while stepping over leading markup.
///
/// The scan walks `text` from the start and skips the following constructs
/// before it looks for a letter:
///
/// * `<` through the next `>` (an HTML-style tag).
/// * `\name`, an optional `[...]` group, and an opening `{` (a LaTeX-style command).
/// * `#name` and an opening `[` (a Typst-style command).
///
/// A command name (`\name` / `#name`, ASCII letters only) is always skipped as
/// a unit, even when no argument group follows it. This keeps the command
/// itself intact: `\noindent the title` becomes `\noindent The title` rather
/// than `\Noindent the title`. Parenthesised arguments are not parsed; after a
/// bare command name the scan simply resumes with the next character.
///
/// Returns an unchanged copy of `text` when no alphabetic character is found
/// outside of the skipped markup.
pub(crate) fn capitalize_first_word_markup_aware(text: &str) -> String {
    let bytes = text.as_bytes();
    // Length of the run of ASCII letters that starts at byte offset `from`.
    let name_len = |from: usize| {
        bytes
            .get(from..)
            .unwrap_or_default()
            .iter()
            .take_while(|c| c.is_ascii_alphabetic())
            .count()
    };

    // `i` only ever advances by whole characters or past ASCII delimiters, so
    // it always sits on a UTF-8 character boundary.
    let mut i = 0;
    while let Some(&b) = bytes.get(i) {
        match b {
            b'<' => {
                // Skip the whole tag. An unterminated `<` is treated as plain text.
                if let Some(end) = text.get(i..).and_then(|rest| rest.find('>')) {
                    i += end + 1;
                    continue;
                }
            }
            b'\\' => {
                let name_end = i + 1 + name_len(i + 1);
                if name_end > i + 1 {
                    // Optional `[...]` argument between the name and the brace.
                    let after_opt = if bytes.get(name_end) == Some(&b'[') {
                        text.get(name_end..)
                            .and_then(|rest| rest.find(']'))
                            .map_or(name_end, |close| name_end + close + 1)
                    } else {
                        name_end
                    };
                    // Step inside `{` when present. Otherwise resume right
                    // after the command so its name is never capitalized.
                    i = if bytes.get(after_opt) == Some(&b'{') {
                        after_opt + 1
                    } else {
                        after_opt
                    };
                    continue;
                }
            }
            b'#' => {
                let name_end = i + 1 + name_len(i + 1);
                if name_end > i + 1 {
                    // Step inside `[` when present. Otherwise resume right
                    // after the command so its name is never capitalized.
                    i = if bytes.get(name_end) == Some(&b'[') {
                        name_end + 1
                    } else {
                        name_end
                    };
                    continue;
                }
            }
            _ => {}
        }

        let Some(ch) = text.get(i..).and_then(|rest| rest.chars().next()) else {
            break;
        };
        if ch.is_alphabetic() {
            let mut result = String::with_capacity(text.len());
            result.push_str(text.get(..i).unwrap_or_default());
            // One character may uppercase to several (for example `ß` becomes `SS`).
            result.extend(ch.to_uppercase());
            result.push_str(text.get(i + ch.len_utf8()..).unwrap_or_default());
            return result;
        }

        i += ch.len_utf8();
    }

    text.to_string()
}
227
228pub(crate) fn apply_text_case_markup_aware(text: &str, case: TextCase) -> String {
234 match case {
235 TextCase::CapitalizeFirst => capitalize_first_word_markup_aware(text),
236 _ => apply_text_case(text, case),
237 }
238}
239
/// Words that title casing leaves in lowercase unless capitalization is forced.
///
/// Entries are compared against an already-lowercased word (or one
/// hyphen-separated part of it) after non-alphanumeric characters have been
/// trimmed from both ends, so every entry must itself be lowercase.
const TITLE_CASE_STOP_WORDS: &[&str] = &[
    "a", "an", "and", "as", "at", "but", "by", "for", "from", "in", "nor", "of", "on", "or", "so",
    "the", "to", "up", "yet", "v", "vs",
];
245
246fn capitalize_hyphenated(word: &str, force_all: bool) -> String {
251 word.split('-')
252 .map(|part| {
253 if force_all {
254 capitalize_first_word(part)
255 } else {
256 let alpha_core = part.trim_matches(|c: char| !c.is_alphanumeric());
257 if TITLE_CASE_STOP_WORDS.contains(&alpha_core) {
258 part.to_string()
259 } else {
260 capitalize_first_word(part)
261 }
262 }
263 })
264 .collect::<Vec<_>>()
265 .join("-")
266}
267
/// Strips closing quotes and brackets from the end of `word`.
///
/// The characters removed are `"`, `'`, `)`, `]`, `}`, `»`, `”` and `’`; any
/// run of them at the end of the word is dropped. Nothing is removed from the
/// start or the middle of the word.
fn trim_trailing_closing_punctuation(word: &str) -> &str {
    word.trim_end_matches(|c: char| {
        matches!(c, '"' | '\'' | ')' | ']' | '}' | '»' | '”' | '’')
    })
}
271
272fn to_title_case(text: &str) -> String {
279 if text.is_empty() {
280 return String::new();
281 }
282
283 let words: Vec<&str> = text.split_whitespace().collect();
284 if words.is_empty() {
285 return text.to_string();
286 }
287
288 let last_idx = words.len() - 1;
289 let mut parts: Vec<String> = Vec::with_capacity(words.len());
290 let mut capitalize_next = false;
291
292 for (i, word) in words.iter().enumerate() {
293 let lower = word.to_lowercase();
294 if i == 0 || i == last_idx || capitalize_next {
295 if lower.contains('-') {
296 parts.push(capitalize_hyphenated(&lower, true));
297 } else {
298 parts.push(capitalize_first_word(&lower));
299 }
300 } else {
301 let alpha_core = lower.trim_matches(|c: char| !c.is_alphanumeric());
304 if TITLE_CASE_STOP_WORDS.contains(&alpha_core) {
305 parts.push(lower);
306 } else if lower.contains('-') {
307 parts.push(capitalize_hyphenated(&lower, false));
308 } else {
309 parts.push(capitalize_first_word(&lower));
310 }
311 }
312 let punctuation_core = trim_trailing_closing_punctuation(word);
315 capitalize_next = punctuation_core.ends_with(':')
316 || punctuation_core.ends_with('?')
317 || punctuation_core.ends_with('!');
318 }
319
320 let mut result = String::with_capacity(text.len());
322 let mut word_iter = parts.iter();
323 let mut in_word = false;
324 let mut current_word = word_iter.next();
325
326 for ch in text.chars() {
327 if ch.is_whitespace() {
328 if in_word {
329 in_word = false;
330 current_word = word_iter.next();
331 }
332 result.push(ch);
333 } else if !in_word && let Some(word) = current_word {
334 result.push_str(word);
335 in_word = true;
336 }
337 }
338
339 result
340}
341
// Unit tests for the text-casing helpers in this module.
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing,
    clippy::todo,
    clippy::unimplemented,
    clippy::unreachable,
    clippy::get_unwrap,
    reason = "Panicking is acceptable and often desired in tests."
)]
mod tests {
    use super::*;

    // --- capitalize_first_word ---

    #[test]
    fn test_capitalize_first_word_basic() {
        assert_eq!(capitalize_first_word("hello world"), "Hello world");
    }

    // Leading whitespace is kept and the first letter after it is capitalized.
    #[test]
    fn test_capitalize_first_word_leading_space() {
        assert_eq!(capitalize_first_word(" hello"), " Hello");
    }

    #[test]
    fn test_capitalize_first_word_empty() {
        assert_eq!(capitalize_first_word(""), "");
    }

    #[test]
    fn test_capitalize_first_word_already_upper() {
        assert_eq!(capitalize_first_word("Hello"), "Hello");
    }

    // --- capitalize_first_word_markup_aware ---

    #[test]
    fn test_capitalize_markup_aware_plain_text() {
        assert_eq!(
            capitalize_first_word_markup_aware("the collected essays"),
            "The collected essays"
        );
    }

    // The opening tag is skipped; the first letter inside it is capitalized.
    #[test]
    fn test_capitalize_markup_aware_html_tag() {
        assert_eq!(
            capitalize_first_word_markup_aware("<em>the collected essays</em>"),
            "<em>The collected essays</em>"
        );
    }

    // Several consecutive opening tags, including one with attributes, are all skipped.
    #[test]
    fn test_capitalize_markup_aware_html_nested_tags() {
        assert_eq!(
            capitalize_first_word_markup_aware(r#"<span class="x"><em>the title</em></span>"#),
            r#"<span class="x"><em>The title</em></span>"#
        );
    }

    // `\emph{` is skipped; the first letter of the braced content is capitalized.
    #[test]
    fn test_capitalize_markup_aware_latex_command() {
        assert_eq!(
            capitalize_first_word_markup_aware(r"\emph{the collected essays}"),
            r"\emph{The collected essays}"
        );
    }

    // The braced content holds no letters, so the command name must not be
    // capitalized in its place.
    #[test]
    fn test_capitalize_markup_aware_latex_number_not_corrupted() {
        assert_eq!(
            capitalize_first_word_markup_aware(r"\emph{521}"),
            r"\emph{521}"
        );
    }

    // `#emph[` is skipped; the first letter of the bracketed content is capitalized.
    #[test]
    fn test_capitalize_markup_aware_typst_command() {
        assert_eq!(
            capitalize_first_word_markup_aware("#emph[the collected essays]"),
            "#emph[The collected essays]"
        );
    }

    // Underscores are not letters, so the scan passes over them without any
    // dedicated markup handling.
    #[test]
    fn test_capitalize_markup_aware_plain_underscore_delimiters() {
        assert_eq!(
            capitalize_first_word_markup_aware("_the collected essays_"),
            "_The collected essays_"
        );
    }

    #[test]
    fn test_capitalize_markup_aware_empty_string() {
        assert_eq!(capitalize_first_word_markup_aware(""), "");
    }

    // Input made only of tags has nothing to capitalize and is returned unchanged.
    #[test]
    fn test_capitalize_markup_aware_all_markup_no_text() {
        assert_eq!(capitalize_first_word_markup_aware("<em></em>"), "<em></em>");
    }

    // --- to_sentence_case ---

    #[test]
    fn test_sentence_case_basic() {
        assert_eq!(
            to_sentence_case("The Quick Brown Fox"),
            "The quick brown fox"
        );
    }

    // Acronyms are not preserved: everything after the first letter is lowercased.
    #[test]
    fn test_sentence_case_all_caps() {
        assert_eq!(to_sentence_case("DNA REPLICATION"), "Dna replication");
    }

    #[test]
    fn test_sentence_case_empty() {
        assert_eq!(to_sentence_case(""), "");
    }

    // --- to_title_case ---

    #[test]
    fn test_title_case_basic() {
        assert_eq!(to_title_case("the quick brown fox"), "The Quick Brown Fox");
    }

    // Interior stop words ("of") stay lowercase; a leading stop word ("a") is capitalized.
    #[test]
    fn test_title_case_stop_words() {
        assert_eq!(
            to_title_case("a tale of two cities"),
            "A Tale of Two Cities"
        );
    }

    // The last word is capitalized even when it is a stop word ("in").
    #[test]
    fn test_title_case_last_word_capitalized() {
        assert_eq!(
            to_title_case("the world we live in"),
            "The World We Live In"
        );
    }

    // A stop word directly after a colon is capitalized.
    #[test]
    fn test_title_case_after_colon() {
        assert_eq!(
            to_title_case("the title: a subtitle"),
            "The Title: A Subtitle"
        );
    }

    // Stop words before the colon stay lowercase; the one right after it is capitalized.
    #[test]
    fn test_title_case_after_colon_stop_word() {
        assert_eq!(
            to_title_case("history of the world: a new perspective"),
            "History of the World: A New Perspective"
        );
    }

    // A question mark starts a new segment in the same way as a colon.
    #[test]
    fn test_title_case_after_question_mark() {
        assert_eq!(
            to_title_case("who's black and why? a hidden chapter"),
            "Who's Black and Why? A Hidden Chapter"
        );
    }

    // A closing quote after the question mark does not hide the segment break.
    #[test]
    fn test_title_case_after_question_mark_with_closing_quote() {
        assert_eq!(
            to_title_case("who's black and why?\" a hidden chapter"),
            "Who's Black and Why?\" A Hidden Chapter"
        );
    }

    // "from" is treated as a stop word and stays lowercase in the interior of a title.
    #[test]
    fn test_title_case_from_is_stop_word() {
        assert_eq!(
            to_title_case("a hidden chapter from the eighteenth-century invention of race"),
            "A Hidden Chapter from the Eighteenth-Century Invention of Race"
        );
    }

    // Both parts of a hyphenated compound are capitalized.
    #[test]
    fn test_title_case_hyphenated_compound() {
        assert_eq!(
            to_title_case("eighteenth-century studies"),
            "Eighteenth-Century Studies"
        );
    }

    // A stop word in the middle of a hyphenated compound ("to") stays lowercase.
    #[test]
    fn test_title_case_hyphenated_stop_word_part() {
        assert_eq!(to_title_case("a well-to-do family"), "A Well-to-Do Family");
    }

    // --- apply_to_structured_parts ---

    // APA: the main title and every subtitle are sentence-cased independently.
    #[test]
    fn test_sentence_apa_structured() {
        let (main, subs) = apply_to_structured_parts(
            "Understanding Citation Systems",
            &["History and Practice", "A Comparative View"],
            TextCase::SentenceApa,
        );
        assert_eq!(main, "Understanding citation systems");
        assert_eq!(subs, vec!["History and practice", "A comparative view"]);
    }

    // NLM: the main title is sentence-cased, subtitles are fully lowercased.
    #[test]
    fn test_sentence_nlm_structured() {
        let (main, subs) = apply_to_structured_parts(
            "Understanding Citation Systems",
            &["History and Practice"],
            TextCase::SentenceNlm,
        );
        assert_eq!(main, "Understanding citation systems");
        assert_eq!(subs, vec!["history and practice"]);
    }

    // Title case is applied to each part on its own, so the subtitle's leading
    // stop word is capitalized. Note that "dna" becomes "Dna", not "DNA".
    #[test]
    fn test_title_case_structured() {
        let (main, subs) =
            apply_to_structured_parts("the dna of empire", &["a new perspective"], TextCase::Title);
        assert_eq!(main, "The Dna of Empire");
        assert_eq!(subs, vec!["A New Perspective"]);
    }

    // --- is_english_language / resolve_text_case ---

    // Hyphenated English tags and a missing language count as English.
    #[test]
    fn test_english_language_detection() {
        assert!(is_english_language(Some("en")));
        assert!(is_english_language(Some("en-US")));
        assert!(is_english_language(Some("en-GB")));
        assert!(is_english_language(None));
        assert!(!is_english_language(Some("de")));
        assert!(!is_english_language(Some("fr-FR")));
    }

    // Non-English text falls back to `AsIs`, except for the plain
    // lowercase transform, which is kept.
    #[test]
    fn test_resolve_non_english_falls_back() {
        assert_eq!(
            resolve_text_case(TextCase::SentenceApa, Some("de")),
            TextCase::AsIs
        );
        assert_eq!(
            resolve_text_case(TextCase::Title, Some("fr")),
            TextCase::AsIs
        );
        assert_eq!(
            resolve_text_case(TextCase::Lowercase, Some("de")),
            TextCase::Lowercase
        );
    }

    #[test]
    fn test_resolve_english_passes_through() {
        assert_eq!(
            resolve_text_case(TextCase::SentenceApa, Some("en")),
            TextCase::SentenceApa
        );
        assert_eq!(
            resolve_text_case(TextCase::Title, Some("en-US")),
            TextCase::Title
        );
    }

    // --- apply_note_start_text_case ---

    #[test]
    fn test_note_start_capitalize_first_uses_english_language_rules() {
        assert_eq!(
            apply_note_start_text_case(
                "edited by",
                NoteStartTextCase::CapitalizeFirst,
                Some("en-US"),
            ),
            "Edited by"
        );
    }

    // `CapitalizeFirst` resolves to `AsIs` for German, so the text is left untouched.
    #[test]
    fn test_note_start_capitalize_first_falls_back_to_as_is_for_non_english() {
        assert_eq!(
            apply_note_start_text_case(
                "hg. von",
                NoteStartTextCase::CapitalizeFirst,
                Some("de-DE"),
            ),
            "hg. von"
        );
    }

    // Arabic text with an `ar` tag comes back unchanged.
    #[test]
    fn test_note_start_capitalize_first_is_no_op_for_uncased_scripts() {
        assert_eq!(
            apply_note_start_text_case("ابن سينا", NoteStartTextCase::CapitalizeFirst, Some("ar"),),
            "ابن سينا"
        );
    }
}