harper_core/
title_case.rs

1use std::borrow::Cow;
2
3use crate::Lrc;
4use crate::Token;
5use crate::TokenKind;
6use hashbrown::HashSet;
7use lazy_static::lazy_static;
8
9use crate::spell::Dictionary;
10use crate::{CharStringExt, Document, TokenStringExt, parsers::Parser};
11
12/// A helper function for [`make_title_case`] that uses Strings instead of char buffers.
13pub fn make_title_case_str(source: &str, parser: &impl Parser, dict: &impl Dictionary) -> String {
14    let source: Vec<char> = source.chars().collect();
15
16    make_title_case_chars(Lrc::new(source), parser, dict).to_string()
17}
18
19// Make a given string [title case](https://en.wikipedia.org/wiki/Title_case) following the Chicago Manual of Style.
20pub fn make_title_case_chars(
21    source: Lrc<Vec<char>>,
22    parser: &impl Parser,
23    dict: &impl Dictionary,
24) -> Vec<char> {
25    let document = Document::new_from_vec(source.clone(), parser, dict);
26
27    make_title_case(document.get_tokens(), source.as_slice(), dict)
28}
29
30pub fn make_title_case(toks: &[Token], source: &[char], dict: &impl Dictionary) -> Vec<char> {
31    if toks.is_empty() {
32        return Vec::new();
33    }
34
35    let start_index = toks.first().unwrap().span.start;
36
37    let mut word_likes = toks.iter_word_likes().enumerate().peekable();
38    let mut output = toks.span().unwrap().get_content(source).to_vec();
39
40    while let Some((index, word)) = word_likes.next() {
41        if let Some(Some(metadata)) = word.kind.as_word()
42            && metadata.is_proper_noun()
43        {
44            // Replace it with the dictionary entry verbatim.
45            let orig_text = word.span.get_content(source);
46
47            if let Some(correct_caps) = dict.get_correct_capitalization_of(orig_text) {
48                // It should match the dictionary verbatim
49                output[word.span.start - start_index..word.span.end - start_index]
50                    .iter_mut()
51                    .enumerate()
52                    .for_each(|(idx, c)| *c = correct_caps[idx]);
53            }
54        };
55
56        let should_capitalize = should_capitalize_token(word, source, dict)
57            || index == 0
58            || word_likes.peek().is_none();
59
60        if should_capitalize {
61            output[word.span.start - start_index] =
62                output[word.span.start - start_index].to_ascii_uppercase();
63        } else {
64            // The whole word should be lowercase.
65            for i in word.span {
66                output[i - start_index] = output[i - start_index].to_ascii_lowercase();
67            }
68        }
69    }
70
71    output
72}
73
74/// Determines whether a token should be capitalized.
75/// Is not responsible for capitalization requirements that are dependent on token position.
76fn should_capitalize_token(tok: &Token, source: &[char], dict: &impl Dictionary) -> bool {
77    match &tok.kind {
78        TokenKind::Word(Some(metadata)) => {
79            // Only specific conjunctions are not capitalized.
80            lazy_static! {
81                static ref SPECIAL_CONJUNCTIONS: HashSet<Vec<char>> =
82                    ["and", "but", "for", "or", "nor"]
83                        .iter()
84                        .map(|v| v.chars().collect())
85                        .collect();
86            }
87
88            let chars = tok.span.get_content(source);
89            let chars_lower = chars.to_lower();
90
91            let mut metadata = Cow::Borrowed(metadata);
92
93            if let Some(metadata_lower) = dict.get_word_metadata(&chars_lower) {
94                metadata = Cow::Owned(metadata.clone().or(&metadata_lower));
95            }
96
97            let is_short_preposition = metadata.preposition && tok.span.len() <= 4;
98
99            !is_short_preposition
100                && !metadata.is_determiner()
101                && !SPECIAL_CONJUNCTIONS.contains(chars_lower.as_ref())
102        }
103        _ => true,
104    }
105}
106
#[cfg(test)]
mod tests {
    use quickcheck::TestResult;
    use quickcheck_macros::quickcheck;

    use super::make_title_case_str;
    use crate::parsers::{Markdown, PlainEnglish};
    use crate::spell::FstDictionary;

    /// Title-cases `text` with the `PlainEnglish` parser and `FstDictionary::curated()`.
    fn plain(text: &str) -> String {
        make_title_case_str(text, &PlainEnglish, &FstDictionary::curated())
    }

    /// Title-cases `text` with the default `Markdown` parser and `FstDictionary::curated()`.
    fn markdown(text: &str) -> String {
        make_title_case_str(text, &Markdown::default(), &FstDictionary::curated())
    }

    /// True when `text` is non-empty and made up solely of ASCII alphanumerics.
    fn is_ascii_alnum_word(text: &str) -> bool {
        !text.is_empty() && text.chars().all(|c| c.is_ascii_alphanumeric())
    }

    #[test]
    fn normal() {
        assert_eq!(plain("this is a test"), "This Is a Test");
    }

    #[test]
    fn complex() {
        let input = "the first and last words should be capitalized, even if it is \"the\"";
        let expected = "The First and Last Words Should Be Capitalized, Even If It Is \"The\"";

        assert_eq!(plain(input), expected);
    }

    /// "about" must be capitalized even when numbers surround it.
    #[test]
    fn about_uppercase_with_numbers() {
        assert_eq!(plain("0 about 0"), "0 About 0");
    }

    #[test]
    fn pipe_does_not_cause_crash() {
        assert_eq!(markdown("|"), "|");
    }

    #[test]
    fn a_paragraph_does_not_cause_crash() {
        assert_eq!(markdown("A\n"), "A");
    }

    #[test]
    fn tab_a_becomes_upcase() {
        assert_eq!(plain("\ta"), "\tA");
    }

    #[test]
    fn fixes_video_press() {
        assert_eq!(plain("videopress"), "VideoPress");
    }

    #[quickcheck]
    fn a_stays_lowercase(prefix: String, postfix: String) -> TestResult {
        // The `a` has to sit between two other words.
        if !(is_ascii_alnum_word(&prefix) && is_ascii_alnum_word(&postfix)) {
            return TestResult::discard();
        }

        let title_case: Vec<char> = markdown(&format!("{prefix} a {postfix}"))
            .chars()
            .collect();

        // Skip the prefix and the single space that follows it.
        let article_index = prefix.chars().count() + 1;

        TestResult::from_bool(title_case[article_index] == 'a')
    }

    #[quickcheck]
    fn about_becomes_uppercase(prefix: String, postfix: String) -> TestResult {
        // The `about` has to sit between two other words.
        if !(is_ascii_alnum_word(&prefix) && is_ascii_alnum_word(&postfix)) {
            return TestResult::discard();
        }

        let title_case: Vec<char> = markdown(&format!("{prefix} about {postfix}"))
            .chars()
            .collect();

        // Skip the prefix and the single space that follows it.
        let about_index = prefix.chars().count() + 1;

        TestResult::from_bool(title_case[about_index] == 'A')
    }

    #[quickcheck]
    fn first_word_is_upcase(text: String) -> TestResult {
        // Only outputs that begin with an ASCII letter are relevant to this property.
        match plain(&text).chars().next() {
            Some(first) if first.is_ascii_alphabetic() => {
                TestResult::from_bool(first.is_ascii_uppercase())
            }
            _ => TestResult::discard(),
        }
    }

    #[test]
    fn united_states() {
        assert_eq!(plain("united states"), "United States");
    }
}