harper_core/
title_case.rs1use std::borrow::Cow;
2
3use crate::Lrc;
4use crate::Token;
5use crate::TokenKind;
6use hashbrown::HashSet;
7use lazy_static::lazy_static;
8
9use crate::spell::Dictionary;
10use crate::{CharStringExt, Document, TokenStringExt, parsers::Parser};
11
12pub fn make_title_case_str(source: &str, parser: &impl Parser, dict: &impl Dictionary) -> String {
14 let source: Vec<char> = source.chars().collect();
15
16 make_title_case_chars(Lrc::new(source), parser, dict).to_string()
17}
18
19pub fn make_title_case_chars(
21 source: Lrc<Vec<char>>,
22 parser: &impl Parser,
23 dict: &impl Dictionary,
24) -> Vec<char> {
25 let document = Document::new_from_vec(source.clone(), parser, dict);
26
27 make_title_case(document.get_tokens(), source.as_slice(), dict)
28}
29
30pub fn make_title_case(toks: &[Token], source: &[char], dict: &impl Dictionary) -> Vec<char> {
31 if toks.is_empty() {
32 return Vec::new();
33 }
34
35 let start_index = toks.first().unwrap().span.start;
36
37 let mut word_likes = toks.iter_word_likes().enumerate().peekable();
38 let mut output = toks.span().unwrap().get_content(source).to_vec();
39
40 while let Some((index, word)) = word_likes.next() {
41 if let Some(Some(metadata)) = word.kind.as_word()
42 && metadata.is_proper_noun()
43 {
44 let orig_text = word.span.get_content(source);
46
47 if let Some(correct_caps) = dict.get_correct_capitalization_of(orig_text) {
48 output[word.span.start - start_index..word.span.end - start_index]
50 .iter_mut()
51 .enumerate()
52 .for_each(|(idx, c)| *c = correct_caps[idx]);
53 }
54 };
55
56 let should_capitalize = should_capitalize_token(word, source, dict)
57 || index == 0
58 || word_likes.peek().is_none();
59
60 if should_capitalize {
61 output[word.span.start - start_index] =
62 output[word.span.start - start_index].to_ascii_uppercase();
63 } else {
64 for i in word.span {
66 output[i - start_index] = output[i - start_index].to_ascii_lowercase();
67 }
68 }
69 }
70
71 output
72}
73
74fn should_capitalize_token(tok: &Token, source: &[char], dict: &impl Dictionary) -> bool {
77 match &tok.kind {
78 TokenKind::Word(Some(metadata)) => {
79 lazy_static! {
81 static ref SPECIAL_CONJUNCTIONS: HashSet<Vec<char>> =
82 ["and", "but", "for", "or", "nor"]
83 .iter()
84 .map(|v| v.chars().collect())
85 .collect();
86 }
87
88 let chars = tok.span.get_content(source);
89 let chars_lower = chars.to_lower();
90
91 let mut metadata = Cow::Borrowed(metadata);
92
93 if let Some(metadata_lower) = dict.get_word_metadata(&chars_lower) {
94 metadata = Cow::Owned(metadata.clone().or(&metadata_lower));
95 }
96
97 let is_short_preposition = metadata.preposition && tok.span.len() <= 4;
98
99 !is_short_preposition
100 && !metadata.is_determiner()
101 && !SPECIAL_CONJUNCTIONS.contains(chars_lower.as_ref())
102 }
103 _ => true,
104 }
105}
106
#[cfg(test)]
mod tests {
    use quickcheck::TestResult;
    use quickcheck_macros::quickcheck;

    use super::make_title_case_str;
    use crate::parsers::{Markdown, PlainEnglish};
    use crate::spell::FstDictionary;

    /// Title-cases `text` with the `PlainEnglish` parser and the curated dictionary.
    fn plain(text: &str) -> String {
        make_title_case_str(text, &PlainEnglish, &FstDictionary::curated())
    }

    /// Title-cases `text` with the default `Markdown` parser and the curated dictionary.
    fn markdown(text: &str) -> String {
        make_title_case_str(text, &Markdown::default(), &FstDictionary::curated())
    }

    /// Returns `true` when `s` is non-empty and made up solely of ASCII
    /// alphanumeric characters. The property tests discard any other input.
    fn is_nonempty_ascii_alphanumeric(s: &str) -> bool {
        !s.is_empty() && s.chars().all(|c| c.is_ascii_alphanumeric())
    }

    #[test]
    fn normal() {
        assert_eq!(plain("this is a test"), "This Is a Test")
    }

    #[test]
    fn complex() {
        assert_eq!(
            plain("the first and last words should be capitalized, even if it is \"the\""),
            "The First and Last Words Should Be Capitalized, Even If It Is \"The\""
        )
    }

    #[test]
    fn about_uppercase_with_numbers() {
        assert_eq!(plain("0 about 0"), "0 About 0")
    }

    #[test]
    fn pipe_does_not_cause_crash() {
        assert_eq!(markdown("|"), "|")
    }

    #[test]
    fn a_paragraph_does_not_cause_crash() {
        assert_eq!(markdown("A\n"), "A")
    }

    #[test]
    fn tab_a_becomes_upcase() {
        assert_eq!(plain("\ta"), "\tA")
    }

    #[test]
    fn fixes_video_press() {
        assert_eq!(plain("videopress"), "VideoPress")
    }

    #[quickcheck]
    fn a_stays_lowercase(prefix: String, postfix: String) -> TestResult {
        if !(is_nonempty_ascii_alphanumeric(&prefix) && is_nonempty_ascii_alphanumeric(&postfix)) {
            return TestResult::discard();
        }

        let title_case: Vec<char> = markdown(&format!("{prefix} a {postfix}")).chars().collect();

        // Skip the prefix and the single space that follows it.
        let article_index = prefix.chars().count() + 1;

        TestResult::from_bool(title_case[article_index] == 'a')
    }

    #[quickcheck]
    fn about_becomes_uppercase(prefix: String, postfix: String) -> TestResult {
        if !(is_nonempty_ascii_alphanumeric(&prefix) && is_nonempty_ascii_alphanumeric(&postfix)) {
            return TestResult::discard();
        }

        let title_case: Vec<char> = markdown(&format!("{prefix} about {postfix}"))
            .chars()
            .collect();

        // Skip the prefix and the single space that follows it.
        let word_index = prefix.chars().count() + 1;

        TestResult::from_bool(title_case[word_index] == 'A')
    }

    #[quickcheck]
    fn first_word_is_upcase(text: String) -> TestResult {
        let title_case = plain(&text);

        // Only outputs that start with an ASCII letter say anything about casing.
        match title_case.chars().next() {
            Some(first) if first.is_ascii_alphabetic() => {
                TestResult::from_bool(first.is_ascii_uppercase())
            }
            _ => TestResult::discard(),
        }
    }

    #[test]
    fn united_states() {
        assert_eq!(plain("united states"), "United States")
    }
}