use citum_schema::NoteStartTextCase;
use citum_schema::options::titles::TextCase;
/// Applies the text-case transformation `case` to `text`, returning a new `String`.
///
/// `AsIs` returns an unchanged copy. The three sentence-case variants
/// (`Sentence`, `SentenceApa`, `SentenceNlm`) behave identically for a single
/// string; they only differ when a title is split into main title and
/// subtitles (see [`apply_to_structured_parts`]).
#[must_use]
pub fn apply_text_case(text: &str, case: TextCase) -> String {
    match case {
        TextCase::Title => to_title_case(text),
        TextCase::Sentence | TextCase::SentenceApa | TextCase::SentenceNlm => {
            to_sentence_case(text)
        }
        TextCase::CapitalizeFirst => capitalize_first_word(text),
        TextCase::Uppercase => text.to_uppercase(),
        TextCase::Lowercase => text.to_lowercase(),
        TextCase::AsIs => text.to_owned(),
    }
}
/// Applies `case` to a title that has been split into a main title and subtitles.
///
/// Returns the cased main title and the cased subtitles, in input order.
///
/// The main title always receives the plain [`apply_text_case`] treatment.
/// Subtitles receive the same treatment, with one exception: under
/// `TextCase::SentenceNlm` each subtitle is lowercased in full, so its first
/// letter is not capitalized.
#[must_use]
pub fn apply_to_structured_parts(
    main: &str,
    subtitles: &[&str],
    case: TextCase,
) -> (String, Vec<String>) {
    // NLM sentence case is the only variant that treats subtitles differently
    // from the main title.
    let lowercase_subtitles = matches!(case, TextCase::SentenceNlm);

    let cased_main = apply_text_case(main, case);
    let cased_subtitles = subtitles
        .iter()
        .map(|subtitle| {
            if lowercase_subtitles {
                subtitle.to_lowercase()
            } else {
                apply_text_case(subtitle, case)
            }
        })
        .collect::<Vec<String>>();

    (cased_main, cased_subtitles)
}
/// Reports whether a language tag denotes English.
///
/// Only the primary subtag (everything before the first separator) is
/// inspected, and it is compared case-insensitively against `"en"`. Both the
/// BCP 47 separator `-` (`en-US`) and the POSIX/CLDR-style separator `_`
/// (`en_US`) are accepted, so locale names coming from either convention are
/// classified the same way.
///
/// A missing tag (`None`) is treated as English.
#[must_use]
pub fn is_english_language(lang: Option<&str>) -> bool {
    let Some(tag) = lang else {
        return true;
    };
    tag.split(['-', '_'])
        .next()
        .is_some_and(|primary| primary.eq_ignore_ascii_case("en"))
}
/// Picks the text case that should actually be applied for `language`.
///
/// `AsIs`, `Lowercase` and `Uppercase` are passed through for every language.
/// All other cases are only honored when [`is_english_language`] accepts
/// `language`; otherwise they are replaced by `TextCase::AsIs`.
#[must_use]
pub fn resolve_text_case(case: TextCase, language: Option<&str>) -> TextCase {
    let language_independent = matches!(
        case,
        TextCase::AsIs | TextCase::Lowercase | TextCase::Uppercase
    );
    if language_independent || is_english_language(language) {
        case
    } else {
        TextCase::AsIs
    }
}
/// Applies a note-start text case to `value`, honoring the language fallback.
///
/// The note-start setting is translated to the matching [`TextCase`] and then
/// filtered through [`resolve_text_case`]. As a result `CapitalizeFirst` leaves
/// `value` unchanged for non-English languages, while `Lowercase` is applied
/// regardless of language.
#[must_use]
pub(crate) fn apply_note_start_text_case(
    value: &str,
    text_case: NoteStartTextCase,
    language: Option<&str>,
) -> String {
    let requested = match text_case {
        NoteStartTextCase::Lowercase => TextCase::Lowercase,
        NoteStartTextCase::CapitalizeFirst => TextCase::CapitalizeFirst,
    };
    let effective = resolve_text_case(requested, language);
    apply_text_case(value, effective)
}
/// Converts `text` to sentence case: everything is lowercased, then the result
/// is passed through [`capitalize_first_word`].
///
/// Because the whole input is lowercased first, capitals inside the text
/// (acronyms, proper nouns) are not preserved.
fn to_sentence_case(text: &str) -> String {
    match text {
        "" => String::new(),
        _ => capitalize_first_word(&text.to_lowercase()),
    }
}
/// Uppercases the first letter of the first word in `text`.
///
/// Leading characters that are neither letters nor numbers (whitespace,
/// quotes, brackets, ...) are copied through unchanged. If the first
/// alphanumeric character is a letter it is replaced by its uppercase mapping
/// (which may be more than one character). If it is a number, the first word
/// does not start with a letter and the text is returned unchanged: this keeps
/// tokens such as `2nd`, `19th` or `'90s` from turning into `2Nd`, `19Th` or
/// `'90S`.
///
/// Everything after the first word's initial character is left untouched.
pub(crate) fn capitalize_first_word(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    let mut chars = text.chars();
    for ch in chars.by_ref() {
        if ch.is_alphabetic() {
            result.extend(ch.to_uppercase());
            break;
        }
        result.push(ch);
        if ch.is_numeric() {
            // The first word starts with a number; there is nothing to capitalize.
            break;
        }
    }
    // Copy whatever the loop did not consume verbatim.
    result.push_str(chars.as_str());
    result
}
/// Uppercases the first letter of the first word in `text`, skipping over
/// leading markup so that the markup itself is never modified.
///
/// The following constructs are stepped over while searching for the first word:
///
/// * HTML tags: `<` up to and including the next `>`;
/// * HTML character references: `&name;` / `&#NNN;` (so `&amp;` is not turned
///   into `&Amp;`);
/// * LaTeX commands: `\name`, an optional `[...]` argument, and the opening `{`;
/// * Typst calls: `#name` and the opening `[`.
///
/// Any other character that is neither a letter nor a number is skipped as
/// well. If the first word starts with a number (`2nd ed.`, `\emph{521}`), the
/// text is returned unchanged instead of capitalizing a later letter.
///
/// NOTE(review): a `\name` / `#name` that is not followed by `{` / `[` is not
/// recognized as markup, so its first letter is treated as ordinary text.
pub(crate) fn capitalize_first_word_markup_aware(text: &str) -> String {
    let bytes = text.as_bytes();
    let mut i = 0;
    while let Some(&b) = bytes.get(i) {
        match b {
            b'<' => {
                if let Some(end) = text.get(i..).and_then(|s| s.find('>')) {
                    i += end + 1;
                    continue;
                }
            }
            b'&' => {
                if let Some(len) = html_entity_len(bytes.get(i..).unwrap_or_default()) {
                    i += len;
                    continue;
                }
            }
            b'\\' => {
                let name_len = ascii_alpha_run_len(bytes, i + 1);
                if name_len > 0 {
                    let after_cmd = i + 1 + name_len;
                    // Step over an optional `[...]` argument; an unterminated
                    // `[` is left in place and will fail the `{` check below.
                    let after_opt = if bytes.get(after_cmd) == Some(&b'[') {
                        text.get(after_cmd..)
                            .and_then(|s| s.find(']'))
                            .map_or(after_cmd, |e| after_cmd + e + 1)
                    } else {
                        after_cmd
                    };
                    if bytes.get(after_opt) == Some(&b'{') {
                        i = after_opt + 1;
                        continue;
                    }
                }
            }
            b'#' => {
                let name_len = ascii_alpha_run_len(bytes, i + 1);
                let after_cmd = i + 1 + name_len;
                if name_len > 0 && bytes.get(after_cmd) == Some(&b'[') {
                    i = after_cmd + 1;
                    continue;
                }
            }
            _ => {}
        }

        // `i` only ever advances past whole characters, so it is always on a
        // character boundary and a character is available here.
        let Some(ch) = text.get(i..).and_then(|s| s.chars().next()) else {
            break;
        };
        if ch.is_alphabetic() {
            let mut result = String::with_capacity(text.len());
            result.push_str(text.get(..i).unwrap_or_default());
            result.extend(ch.to_uppercase());
            result.push_str(text.get(i + ch.len_utf8()..).unwrap_or_default());
            return result;
        }
        if ch.is_numeric() {
            // The first word starts with a number; there is nothing to capitalize.
            break;
        }
        i += ch.len_utf8();
    }
    text.to_owned()
}

/// Counts the ASCII letters in `bytes` starting at index `start`.
fn ascii_alpha_run_len(bytes: &[u8], start: usize) -> usize {
    bytes
        .get(start..)
        .unwrap_or_default()
        .iter()
        .take_while(|c| c.is_ascii_alphabetic())
        .count()
}

/// Returns the byte length of the HTML character reference at the start of
/// `rest` (`&`, optional `#`, one or more ASCII alphanumerics, `;`), if any.
fn html_entity_len(rest: &[u8]) -> Option<usize> {
    let body = rest.strip_prefix(b"&")?;
    let body = body.strip_prefix(b"#").unwrap_or(body);
    let name_len = body
        .iter()
        .take_while(|c| c.is_ascii_alphanumeric())
        .count();
    let terminated = name_len > 0 && body.get(name_len) == Some(&b';');
    // Prefix (`&` or `&#`) + name + closing `;`.
    terminated.then(|| rest.len() - body.len() + name_len + 1)
}
/// Applies `case` to `text` that may already contain rendered markup.
///
/// Only `TextCase::CapitalizeFirst` is markup-aware (it goes through
/// [`capitalize_first_word_markup_aware`]); every other case is delegated to
/// [`apply_text_case`] unchanged.
pub(crate) fn apply_text_case_markup_aware(text: &str, case: TextCase) -> String {
    if matches!(case, TextCase::CapitalizeFirst) {
        capitalize_first_word_markup_aware(text)
    } else {
        apply_text_case(text, case)
    }
}
/// Lowercase words that title casing leaves uncapitalized.
///
/// `to_title_case` keeps a word from this list in lowercase unless it is the
/// first or last word or follows a word ending in `:`, `?` or `!`;
/// `capitalize_hyphenated` does the same for the parts of a hyphenated
/// compound unless capitalization is forced. Lookups are exact matches against
/// already-lowercased text with surrounding non-alphanumeric characters trimmed.
const TITLE_CASE_STOP_WORDS: &[&str] = &[
"a", "an", "and", "as", "at", "but", "by", "for", "from", "in", "nor", "of", "on", "or", "so",
"the", "to", "up", "yet", "v", "vs",
];
/// Capitalizes the parts of a hyphenated `word` and re-joins them with `-`.
///
/// With `force_all` set, every part is passed through
/// [`capitalize_first_word`]. Otherwise a part whose alphanumeric core is
/// listed in [`TITLE_CASE_STOP_WORDS`] is copied unchanged and only the
/// remaining parts are capitalized.
fn capitalize_hyphenated(word: &str, force_all: bool) -> String {
    let mut joined = String::with_capacity(word.len());
    for (idx, segment) in word.split('-').enumerate() {
        if idx > 0 {
            joined.push('-');
        }
        let keep_as_is = !force_all && {
            let core = segment.trim_matches(|c: char| !c.is_alphanumeric());
            TITLE_CASE_STOP_WORDS.contains(&core)
        };
        if keep_as_is {
            joined.push_str(segment);
        } else {
            joined.push_str(&capitalize_first_word(segment));
        }
    }
    joined
}
/// Strips any run of trailing closing quotes and brackets from `word`.
///
/// The characters removed are `"`, `'`, `)`, `]`, `}`, `»`, `”` and `’`; the
/// returned slice borrows from `word`.
fn trim_trailing_closing_punctuation(word: &str) -> &str {
    word.trim_end_matches(|ch: char| {
        matches!(ch, '"' | '\'' | ')' | ']' | '}' | '»' | '”' | '’')
    })
}
fn to_title_case(text: &str) -> String {
if text.is_empty() {
return String::new();
}
let words: Vec<&str> = text.split_whitespace().collect();
if words.is_empty() {
return text.to_string();
}
let last_idx = words.len() - 1;
let mut parts: Vec<String> = Vec::with_capacity(words.len());
let mut capitalize_next = false;
for (i, word) in words.iter().enumerate() {
let lower = word.to_lowercase();
if i == 0 || i == last_idx || capitalize_next {
if lower.contains('-') {
parts.push(capitalize_hyphenated(&lower, true));
} else {
parts.push(capitalize_first_word(&lower));
}
} else {
let alpha_core = lower.trim_matches(|c: char| !c.is_alphanumeric());
if TITLE_CASE_STOP_WORDS.contains(&alpha_core) {
parts.push(lower);
} else if lower.contains('-') {
parts.push(capitalize_hyphenated(&lower, false));
} else {
parts.push(capitalize_first_word(&lower));
}
}
let punctuation_core = trim_trailing_closing_punctuation(word);
capitalize_next = punctuation_core.ends_with(':')
|| punctuation_core.ends_with('?')
|| punctuation_core.ends_with('!');
}
let mut result = String::with_capacity(text.len());
let mut word_iter = parts.iter();
let mut in_word = false;
let mut current_word = word_iter.next();
for ch in text.chars() {
if ch.is_whitespace() {
if in_word {
in_word = false;
current_word = word_iter.next();
}
result.push(ch);
} else if !in_word && let Some(word) = current_word {
result.push_str(word);
in_word = true;
}
}
result
}
// Unit tests for the text-case helpers in this module. They are grouped by the
// function under test: plain first-word capitalization, markup-aware
// capitalization, sentence case, title case, structured titles, and
// language-dependent resolution.
#[cfg(test)]
#[allow(
clippy::unwrap_used,
clippy::expect_used,
clippy::panic,
clippy::indexing_slicing,
clippy::todo,
clippy::unimplemented,
clippy::unreachable,
clippy::get_unwrap,
reason = "Panicking is acceptable and often desired in tests."
)]
mod tests {
use super::*;
// --- capitalize_first_word ---
#[test]
fn test_capitalize_first_word_basic() {
assert_eq!(capitalize_first_word("hello world"), "Hello world");
}
// Leading whitespace is preserved and skipped when looking for the first letter.
#[test]
fn test_capitalize_first_word_leading_space() {
assert_eq!(capitalize_first_word(" hello"), " Hello");
}
#[test]
fn test_capitalize_first_word_empty() {
assert_eq!(capitalize_first_word(""), "");
}
#[test]
fn test_capitalize_first_word_already_upper() {
assert_eq!(capitalize_first_word("Hello"), "Hello");
}
// --- capitalize_first_word_markup_aware ---
#[test]
fn test_capitalize_markup_aware_plain_text() {
assert_eq!(
capitalize_first_word_markup_aware("the collected essays"),
"The collected essays"
);
}
// HTML tags are skipped; the first letter inside them is capitalized.
#[test]
fn test_capitalize_markup_aware_html_tag() {
assert_eq!(
capitalize_first_word_markup_aware("<em>the collected essays</em>"),
"<em>The collected essays</em>"
);
}
// Tag attributes must not be touched, even when several tags are nested.
#[test]
fn test_capitalize_markup_aware_html_nested_tags() {
assert_eq!(
capitalize_first_word_markup_aware(r#"<span class="x"><em>the title</em></span>"#),
r#"<span class="x"><em>The title</em></span>"#
);
}
// The LaTeX command name itself stays lowercase.
#[test]
fn test_capitalize_markup_aware_latex_command() {
assert_eq!(
capitalize_first_word_markup_aware(r"\emph{the collected essays}"),
r"\emph{The collected essays}"
);
}
// A purely numeric argument leaves the whole string, command included, untouched.
#[test]
fn test_capitalize_markup_aware_latex_number_not_corrupted() {
assert_eq!(
capitalize_first_word_markup_aware(r"\emph{521}"),
r"\emph{521}"
);
}
// The Typst function name itself stays lowercase.
#[test]
fn test_capitalize_markup_aware_typst_command() {
assert_eq!(
capitalize_first_word_markup_aware("#emph[the collected essays]"),
"#emph[The collected essays]"
);
}
// Underscore delimiters are ordinary non-letters and are simply skipped.
#[test]
fn test_capitalize_markup_aware_plain_underscore_delimiters() {
assert_eq!(
capitalize_first_word_markup_aware("_the collected essays_"),
"_The collected essays_"
);
}
#[test]
fn test_capitalize_markup_aware_empty_string() {
assert_eq!(capitalize_first_word_markup_aware(""), "");
}
// Markup without any text content is returned unchanged.
#[test]
fn test_capitalize_markup_aware_all_markup_no_text() {
assert_eq!(capitalize_first_word_markup_aware("<em></em>"), "<em></em>");
}
// --- to_sentence_case ---
#[test]
fn test_sentence_case_basic() {
assert_eq!(
to_sentence_case("The Quick Brown Fox"),
"The quick brown fox"
);
}
// Sentence case lowercases everything first, so acronyms are not preserved.
#[test]
fn test_sentence_case_all_caps() {
assert_eq!(to_sentence_case("DNA REPLICATION"), "Dna replication");
}
#[test]
fn test_sentence_case_empty() {
assert_eq!(to_sentence_case(""), "");
}
// --- to_title_case ---
#[test]
fn test_title_case_basic() {
assert_eq!(to_title_case("the quick brown fox"), "The Quick Brown Fox");
}
// Stop words in the middle of the title stay lowercase.
#[test]
fn test_title_case_stop_words() {
assert_eq!(
to_title_case("a tale of two cities"),
"A Tale of Two Cities"
);
}
// The last word is capitalized even when it is a stop word.
#[test]
fn test_title_case_last_word_capitalized() {
assert_eq!(
to_title_case("the world we live in"),
"The World We Live In"
);
}
// A stop word directly after a colon is capitalized.
#[test]
fn test_title_case_after_colon() {
assert_eq!(
to_title_case("the title: a subtitle"),
"The Title: A Subtitle"
);
}
#[test]
fn test_title_case_after_colon_stop_word() {
assert_eq!(
to_title_case("history of the world: a new perspective"),
"History of the World: A New Perspective"
);
}
// A question mark starts a new segment just like a colon.
#[test]
fn test_title_case_after_question_mark() {
assert_eq!(
to_title_case("who's black and why? a hidden chapter"),
"Who's Black and Why? A Hidden Chapter"
);
}
// A closing quote after the question mark does not hide the segment break.
#[test]
fn test_title_case_after_question_mark_with_closing_quote() {
assert_eq!(
to_title_case("who's black and why?\" a hidden chapter"),
"Who's Black and Why?\" A Hidden Chapter"
);
}
#[test]
fn test_title_case_from_is_stop_word() {
assert_eq!(
to_title_case("a hidden chapter from the eighteenth-century invention of race"),
"A Hidden Chapter from the Eighteenth-Century Invention of Race"
);
}
// Both parts of a hyphenated compound are capitalized.
#[test]
fn test_title_case_hyphenated_compound() {
assert_eq!(
to_title_case("eighteenth-century studies"),
"Eighteenth-Century Studies"
);
}
// A stop word inside a hyphenated compound stays lowercase.
#[test]
fn test_title_case_hyphenated_stop_word_part() {
assert_eq!(to_title_case("a well-to-do family"), "A Well-to-Do Family");
}
// --- apply_to_structured_parts ---
// APA: main title and every subtitle each get sentence case.
#[test]
fn test_sentence_apa_structured() {
let (main, subs) = apply_to_structured_parts(
"Understanding Citation Systems",
&["History and Practice", "A Comparative View"],
TextCase::SentenceApa,
);
assert_eq!(main, "Understanding citation systems");
assert_eq!(subs, vec!["History and practice", "A comparative view"]);
}
// NLM: the main title gets sentence case, subtitles are fully lowercased.
#[test]
fn test_sentence_nlm_structured() {
let (main, subs) = apply_to_structured_parts(
"Understanding Citation Systems",
&["History and Practice"],
TextCase::SentenceNlm,
);
assert_eq!(main, "Understanding citation systems");
assert_eq!(subs, vec!["history and practice"]);
}
// Title case is applied to each part independently; note that "dna" becomes "Dna".
#[test]
fn test_title_case_structured() {
let (main, subs) =
apply_to_structured_parts("the dna of empire", &["a new perspective"], TextCase::Title);
assert_eq!(main, "The Dna of Empire");
assert_eq!(subs, vec!["A New Perspective"]);
}
// --- is_english_language / resolve_text_case ---
// A missing language tag counts as English.
#[test]
fn test_english_language_detection() {
assert!(is_english_language(Some("en")));
assert!(is_english_language(Some("en-US")));
assert!(is_english_language(Some("en-GB")));
assert!(is_english_language(None));
assert!(!is_english_language(Some("de")));
assert!(!is_english_language(Some("fr-FR")));
}
// Non-English: English-specific cases fall back to AsIs, Lowercase is kept.
#[test]
fn test_resolve_non_english_falls_back() {
assert_eq!(
resolve_text_case(TextCase::SentenceApa, Some("de")),
TextCase::AsIs
);
assert_eq!(
resolve_text_case(TextCase::Title, Some("fr")),
TextCase::AsIs
);
assert_eq!(
resolve_text_case(TextCase::Lowercase, Some("de")),
TextCase::Lowercase
);
}
#[test]
fn test_resolve_english_passes_through() {
assert_eq!(
resolve_text_case(TextCase::SentenceApa, Some("en")),
TextCase::SentenceApa
);
assert_eq!(
resolve_text_case(TextCase::Title, Some("en-US")),
TextCase::Title
);
}
// --- apply_note_start_text_case ---
#[test]
fn test_note_start_capitalize_first_uses_english_language_rules() {
assert_eq!(
apply_note_start_text_case(
"edited by",
NoteStartTextCase::CapitalizeFirst,
Some("en-US"),
),
"Edited by"
);
}
// For non-English languages CapitalizeFirst resolves to AsIs, so the text is untouched.
#[test]
fn test_note_start_capitalize_first_falls_back_to_as_is_for_non_english() {
assert_eq!(
apply_note_start_text_case(
"hg. von",
NoteStartTextCase::CapitalizeFirst,
Some("de-DE"),
),
"hg. von"
);
}
#[test]
fn test_note_start_capitalize_first_is_no_op_for_uncased_scripts() {
assert_eq!(
apply_note_start_text_case("ابن سينا", NoteStartTextCase::CapitalizeFirst, Some("ar"),),
"ابن سينا"
);
}
}