harper_core/
title_case.rs

1use std::borrow::Cow;
2
3use crate::Lrc;
4use crate::Token;
5use crate::TokenKind;
6use hashbrown::HashSet;
7use lazy_static::lazy_static;
8
9use crate::{CharStringExt, Dictionary, Document, TokenStringExt, parsers::Parser};
10
11/// A helper function for [`make_title_case`] that uses Strings instead of char buffers.
12pub fn make_title_case_str(source: &str, parser: &impl Parser, dict: &impl Dictionary) -> String {
13    let source: Vec<char> = source.chars().collect();
14
15    make_title_case_chars(Lrc::new(source), parser, dict).to_string()
16}
17
18// Make a given string [title case](https://en.wikipedia.org/wiki/Title_case) following the Chicago Manual of Style.
19pub fn make_title_case_chars(
20    source: Lrc<Vec<char>>,
21    parser: &impl Parser,
22    dict: &impl Dictionary,
23) -> Vec<char> {
24    let document = Document::new_from_vec(source.clone(), parser, dict);
25
26    make_title_case(document.get_tokens(), source.as_slice(), dict)
27}
28
29pub fn make_title_case(toks: &[Token], source: &[char], dict: &impl Dictionary) -> Vec<char> {
30    if toks.is_empty() {
31        return Vec::new();
32    }
33
34    let start_index = toks.first().unwrap().span.start;
35
36    let mut word_likes = toks.iter_word_likes().enumerate().peekable();
37    let mut output = toks.span().unwrap().get_content(source).to_vec();
38
39    while let Some((index, word)) = word_likes.next() {
40        if let Some(Some(metadata)) = word.kind.as_word() {
41            if metadata.is_proper_noun() {
42                // Replace it with the dictionary entry verbatim.
43                let orig_text = word.span.get_content(source);
44
45                if let Some(correct_caps) = dict.get_correct_capitalization_of(orig_text) {
46                    // It should match the dictionary verbatim
47                    output[word.span.start - start_index..word.span.end - start_index]
48                        .iter_mut()
49                        .enumerate()
50                        .for_each(|(idx, c)| *c = correct_caps[idx]);
51                }
52            }
53        };
54
55        let should_capitalize = should_capitalize_token(word, source, dict)
56            || index == 0
57            || word_likes.peek().is_none();
58
59        if should_capitalize {
60            output[word.span.start - start_index] =
61                output[word.span.start - start_index].to_ascii_uppercase();
62        } else {
63            // The whole word should be lowercase.
64            for i in word.span {
65                output[i - start_index] = output[i - start_index].to_ascii_lowercase();
66            }
67        }
68    }
69
70    output
71}
72
73/// Determines whether a token should be capitalized.
74/// Is not responsible for capitalization requirements that are dependent on token position.
75fn should_capitalize_token(tok: &Token, source: &[char], dict: &impl Dictionary) -> bool {
76    match &tok.kind {
77        TokenKind::Word(Some(metadata)) => {
78            // Only specific conjunctions are not capitalized.
79            lazy_static! {
80                static ref SPECIAL_CONJUNCTIONS: HashSet<Vec<char>> =
81                    ["and", "but", "for", "or", "nor"]
82                        .iter()
83                        .map(|v| v.chars().collect())
84                        .collect();
85            }
86
87            let chars = tok.span.get_content(source);
88            let chars_lower = chars.to_lower();
89
90            let mut metadata = Cow::Borrowed(metadata);
91
92            if let Some(metadata_lower) = dict.get_word_metadata(&chars_lower) {
93                metadata = Cow::Owned(metadata.clone().or(metadata_lower));
94            }
95
96            let is_short_preposition = metadata.preposition && tok.span.len() <= 4;
97
98            !is_short_preposition
99                && !metadata.determiner
100                && !SPECIAL_CONJUNCTIONS.contains(chars_lower.as_ref())
101        }
102        _ => true,
103    }
104}
105
#[cfg(test)]
mod tests {

    use quickcheck::TestResult;
    use quickcheck_macros::quickcheck;

    use super::make_title_case_str;
    use crate::{
        FstDictionary,
        parsers::{Markdown, PlainEnglish},
    };

    /// Title-cases `source` using the plain English parser and the curated dictionary.
    fn title_case_plain(source: &str) -> String {
        make_title_case_str(source, &PlainEnglish, &FstDictionary::curated())
    }

    /// Title-cases `source` using the default Markdown parser and the curated dictionary.
    fn title_case_markdown(source: &str) -> String {
        make_title_case_str(source, &Markdown::default(), &FstDictionary::curated())
    }

    /// Whether a generated string can serve as a surrounding word in the property tests:
    /// it must be non-empty and purely ASCII alphanumeric.
    fn is_filler_word(candidate: &str) -> bool {
        !candidate.is_empty() && candidate.chars().all(|c| c.is_ascii_alphanumeric())
    }

    #[test]
    fn normal() {
        assert_eq!(title_case_plain("this is a test"), "This Is a Test");
    }

    #[test]
    fn complex() {
        let input = "the first and last words should be capitalized, even if it is \"the\"";
        let expected = "The First and Last Words Should Be Capitalized, Even If It Is \"The\"";

        assert_eq!(title_case_plain(input), expected);
    }

    /// Check that "about" is capitalized when surrounded by numbers.
    #[test]
    fn about_uppercase_with_numbers() {
        assert_eq!(title_case_plain("0 about 0"), "0 About 0");
    }

    #[test]
    fn pipe_does_not_cause_crash() {
        assert_eq!(title_case_markdown("|"), "|");
    }

    #[test]
    fn a_paragraph_does_not_cause_crash() {
        assert_eq!(title_case_markdown("A\n"), "A");
    }

    #[test]
    fn tab_a_becomes_upcase() {
        assert_eq!(title_case_plain("\ta"), "\tA");
    }

    #[test]
    fn fixes_video_press() {
        assert_eq!(title_case_plain("videopress"), "VideoPress");
    }

    #[quickcheck]
    fn a_stays_lowercase(prefix: String, postfix: String) -> TestResult {
        // There must be words other than the `a`.
        if !is_filler_word(&prefix) || !is_filler_word(&postfix) {
            return TestResult::discard();
        }

        let title_case: Vec<char> = title_case_markdown(&format!("{prefix} a {postfix}"))
            .chars()
            .collect();

        // The article starts right after the prefix and the separating space.
        let article_index = prefix.chars().count() + 1;

        TestResult::from_bool(title_case[article_index] == 'a')
    }

    #[quickcheck]
    fn about_becomes_uppercase(prefix: String, postfix: String) -> TestResult {
        // There must be words other than the `about`.
        if !is_filler_word(&prefix) || !is_filler_word(&postfix) {
            return TestResult::discard();
        }

        let title_case: Vec<char> = title_case_markdown(&format!("{prefix} about {postfix}"))
            .chars()
            .collect();

        // "about" starts right after the prefix and the separating space.
        let about_index = prefix.chars().count() + 1;

        TestResult::from_bool(title_case[about_index] == 'A')
    }

    #[quickcheck]
    fn first_word_is_upcase(text: String) -> TestResult {
        // Only outputs that begin with an ASCII letter are meaningful for this property.
        match title_case_plain(&text).chars().next() {
            Some(first) if first.is_ascii_alphabetic() => {
                TestResult::from_bool(first.is_ascii_uppercase())
            }
            _ => TestResult::discard(),
        }
    }

    #[test]
    fn united_states() {
        assert_eq!(title_case_plain("united states"), "United States");
    }
}
248}