harper_core/
title_case.rs1use std::borrow::Cow;
2
3use crate::Lrc;
4use crate::Token;
5use crate::TokenKind;
6use hashbrown::HashSet;
7use lazy_static::lazy_static;
8
9use crate::{CharStringExt, Dictionary, Document, TokenStringExt, parsers::Parser};
10
11pub fn make_title_case_str(source: &str, parser: &impl Parser, dict: &impl Dictionary) -> String {
13 let source: Vec<char> = source.chars().collect();
14
15 make_title_case_chars(Lrc::new(source), parser, dict).to_string()
16}
17
18pub fn make_title_case_chars(
20 source: Lrc<Vec<char>>,
21 parser: &impl Parser,
22 dict: &impl Dictionary,
23) -> Vec<char> {
24 let document = Document::new_from_vec(source.clone(), parser, dict);
25
26 make_title_case(document.get_tokens(), source.as_slice(), dict)
27}
28
29pub fn make_title_case(toks: &[Token], source: &[char], dict: &impl Dictionary) -> Vec<char> {
30 if toks.is_empty() {
31 return Vec::new();
32 }
33
34 let start_index = toks.first().unwrap().span.start;
35
36 let mut word_likes = toks.iter_word_likes().enumerate().peekable();
37 let mut output = toks.span().unwrap().get_content(source).to_vec();
38
39 while let Some((index, word)) = word_likes.next() {
40 if let Some(Some(metadata)) = word.kind.as_word() {
41 if metadata.is_proper_noun() {
42 let orig_text = word.span.get_content(source);
44
45 if let Some(correct_caps) = dict.get_correct_capitalization_of(orig_text) {
46 output[word.span.start - start_index..word.span.end - start_index]
48 .iter_mut()
49 .enumerate()
50 .for_each(|(idx, c)| *c = correct_caps[idx]);
51 }
52 }
53 };
54
55 let should_capitalize = should_capitalize_token(word, source, dict)
56 || index == 0
57 || word_likes.peek().is_none();
58
59 if should_capitalize {
60 output[word.span.start - start_index] =
61 output[word.span.start - start_index].to_ascii_uppercase();
62 } else {
63 for i in word.span {
65 output[i - start_index] = output[i - start_index].to_ascii_lowercase();
66 }
67 }
68 }
69
70 output
71}
72
73fn should_capitalize_token(tok: &Token, source: &[char], dict: &impl Dictionary) -> bool {
76 match &tok.kind {
77 TokenKind::Word(Some(metadata)) => {
78 lazy_static! {
80 static ref SPECIAL_CONJUNCTIONS: HashSet<Vec<char>> =
81 ["and", "but", "for", "or", "nor"]
82 .iter()
83 .map(|v| v.chars().collect())
84 .collect();
85 }
86
87 let chars = tok.span.get_content(source);
88 let chars_lower = chars.to_lower();
89
90 let mut metadata = Cow::Borrowed(metadata);
91
92 if let Some(metadata_lower) = dict.get_word_metadata(&chars_lower) {
93 metadata = Cow::Owned(metadata.clone().or(metadata_lower));
94 }
95
96 let is_short_preposition = metadata.preposition && tok.span.len() <= 4;
97
98 !is_short_preposition
99 && !metadata.determiner
100 && !SPECIAL_CONJUNCTIONS.contains(chars_lower.as_ref())
101 }
102 _ => true,
103 }
104}
105
#[cfg(test)]
mod tests {
    use quickcheck::TestResult;
    use quickcheck_macros::quickcheck;

    use super::make_title_case_str;
    use crate::{
        FstDictionary,
        parsers::{Markdown, PlainEnglish},
    };

    /// Title-cases `text` with the plain-English parser and the curated dictionary.
    fn plain(text: &str) -> String {
        make_title_case_str(text, &PlainEnglish, &FstDictionary::curated())
    }

    /// Title-cases `text` with the default Markdown parser and the curated dictionary.
    fn markdown(text: &str) -> String {
        make_title_case_str(text, &Markdown::default(), &FstDictionary::curated())
    }

    /// True when `s` is non-empty and consists solely of ASCII letters and digits.
    fn is_ascii_alnum_word(s: &str) -> bool {
        !s.is_empty() && s.chars().all(|c| c.is_ascii_alphanumeric())
    }

    /// A short sentence keeps its article lowercase.
    #[test]
    fn normal() {
        assert_eq!(plain("this is a test"), "This Is a Test");
    }

    /// The first and last words are capitalized even when they are determiners.
    #[test]
    fn complex() {
        let input = "the first and last words should be capitalized, even if it is \"the\"";
        let expected = "The First and Last Words Should Be Capitalized, Even If It Is \"The\"";

        assert_eq!(plain(input), expected);
    }

    /// "about" is capitalized when surrounded by numbers.
    #[test]
    fn about_uppercase_with_numbers() {
        assert_eq!(plain("0 about 0"), "0 About 0");
    }

    /// A lone pipe must pass through the Markdown parser without panicking.
    #[test]
    fn pipe_does_not_cause_crash() {
        assert_eq!(markdown("|"), "|");
    }

    /// A one-letter paragraph with a trailing newline must not panic.
    #[test]
    fn a_paragraph_does_not_cause_crash() {
        assert_eq!(markdown("A\n"), "A");
    }

    /// Leading whitespace is preserved and the word after it is capitalized.
    #[test]
    fn tab_a_becomes_upcase() {
        assert_eq!(plain("\ta"), "\tA");
    }

    /// The dictionary's capitalization is applied to "videopress".
    #[test]
    fn fixes_video_press() {
        assert_eq!(plain("videopress"), "VideoPress");
    }

    /// An "a" between two alphanumeric words stays lowercase.
    #[quickcheck]
    fn a_stays_lowercase(prefix: String, postfix: String) -> TestResult {
        if !is_ascii_alnum_word(&prefix) || !is_ascii_alnum_word(&postfix) {
            return TestResult::discard();
        }

        let input = format!("{prefix} a {postfix}");
        let title_case: Vec<char> = markdown(&input).chars().collect();
        // Skip the prefix and the single space that follows it.
        let article_index = prefix.chars().count() + 1;

        TestResult::from_bool(title_case[article_index] == 'a')
    }

    /// An "about" between two alphanumeric words gets an uppercase first letter.
    #[quickcheck]
    fn about_becomes_uppercase(prefix: String, postfix: String) -> TestResult {
        if !is_ascii_alnum_word(&prefix) || !is_ascii_alnum_word(&postfix) {
            return TestResult::discard();
        }

        let input = format!("{prefix} about {postfix}");
        let title_case: Vec<char> = markdown(&input).chars().collect();
        // Skip the prefix and the single space that follows it.
        let word_index = prefix.chars().count() + 1;

        TestResult::from_bool(title_case[word_index] == 'A')
    }

    /// If the output starts with an ASCII letter, that letter is uppercase.
    #[quickcheck]
    fn first_word_is_upcase(text: String) -> TestResult {
        match plain(&text).chars().next() {
            Some(first) if first.is_ascii_alphabetic() => {
                TestResult::from_bool(first.is_ascii_uppercase())
            }
            _ => TestResult::discard(),
        }
    }

    /// Both words of "united states" are capitalized.
    #[test]
    fn united_states() {
        assert_eq!(plain("united states"), "United States");
    }
}