harper_core/linting/
sentence_capitalization.rs1use super::Suggestion;
2use super::{Lint, LintKind, Linter};
3use crate::document::Document;
4use crate::spell::Dictionary;
5use crate::{Token, TokenKind, TokenStringExt};
6
7pub struct SentenceCapitalization<T>
8where
9 T: Dictionary,
10{
11 dictionary: T,
12}
13
14impl<T: Dictionary> SentenceCapitalization<T> {
15 pub fn new(dictionary: T) -> Self {
16 Self { dictionary }
17 }
18}
19
20impl<T: Dictionary> Linter for SentenceCapitalization<T> {
21 fn lint(&mut self, document: &Document) -> Vec<Lint> {
24 let mut lints = Vec::new();
25
26 for paragraph in document.iter_paragraphs() {
27 if paragraph.iter_sentences().count() == 1 {
29 let only_sentence = paragraph.iter_sentences().next().unwrap();
30
31 if !only_sentence
32 .iter_chunks()
33 .map(|c| c.iter_words().count())
34 .any(|c| c > 5)
35 {
36 continue;
37 }
38 }
39
40 for sentence in paragraph.iter_sentences() {
41 if !is_full_sentence(sentence) {
42 continue;
43 }
44
45 if let Some(first_word) = sentence.first_non_whitespace() {
46 if !first_word.kind.is_word() {
47 continue;
48 }
49
50 let word_chars = document.get_span_content(&first_word.span);
51
52 if let Some(first_char) = word_chars.first()
53 && first_char.is_alphabetic()
54 && !first_char.is_uppercase()
55 {
56 if let Some(canonical_spelling) =
57 self.dictionary.get_correct_capitalization_of(word_chars)
58 {
59 if first_word.kind.is_proper_noun() {
61 continue;
62 }
63
64 if canonical_spelling
66 .iter()
67 .skip(1)
68 .take_while(|&c| !c.is_whitespace() && *c != '-' && *c != '\'')
69 .any(|&c| c.is_uppercase())
70 {
71 continue;
72 }
73 }
74
75 let target_span = first_word.span;
76 let mut replacement_chars =
77 document.get_span_content(&target_span).to_vec();
78 replacement_chars[0] = replacement_chars[0].to_ascii_uppercase();
79
80 lints.push(Lint {
81 span: target_span,
82 lint_kind: LintKind::Capitalization,
83 suggestions: vec![Suggestion::ReplaceWith(replacement_chars)],
84 priority: 31,
85 message: "This sentence does not start with a capital letter"
86 .to_string(),
87 });
88 }
89 }
90 }
91 }
92
93 lints
94 }
95
96 fn description(&self) -> &'static str {
97 "The opening word of a sentence should almost always be capitalized."
98 }
99}
100
101fn is_full_sentence(toks: &[Token]) -> bool {
102 let mut has_nominal = false;
103 let mut has_verb = false;
104
105 for tok in toks {
106 if let TokenKind::Word(Some(metadata)) = &tok.kind {
107 if metadata.is_nominal() {
108 has_nominal = true;
109 }
110
111 if metadata.is_verb() {
112 has_verb = true;
113 }
114 }
115 }
116
117 has_nominal && has_verb
118}
119
#[cfg(test)]
mod tests {
    use super::super::tests::assert_lint_count;
    use super::SentenceCapitalization;
    use crate::spell::FstDictionary;

    /// Runs a [`SentenceCapitalization`] linter backed by the curated
    /// dictionary over `text` and forwards `expected` to `assert_lint_count`
    /// as the lint count.
    #[track_caller]
    fn expect_lints(text: &str, expected: usize) {
        assert_lint_count(
            text,
            SentenceCapitalization::new(FstDictionary::curated()),
            expected,
        )
    }

    #[test]
    fn catches_basic() {
        expect_lints("there is no way she is not guilty.", 1)
    }

    #[test]
    fn no_period() {
        expect_lints("there is no way she is not guilty", 1)
    }

    #[test]
    fn two_sentence() {
        expect_lints(
            "i have complete conviction in this. she is absolutely guilty",
            2,
        )
    }

    #[test]
    fn start_with_number() {
        expect_lints("53 is the length of the longest word.", 0)
    }

    #[test]
    fn ignores_unlintable() {
        expect_lints(
            "[`misspelled_word`] is assumed to be quite small (n < 100). ",
            0,
        )
    }

    #[test]
    fn unfazed_unlintable() {
        expect_lints("the linter should not be affected by `this` unlintable.", 1)
    }

    #[test]
    fn unfazed_ellipsis() {
        expect_lints("the linter should not be affected by... that ellipsis.", 1)
    }

    #[test]
    fn unfazed_comma() {
        expect_lints("the linter should not be affected by, that comma.", 1)
    }

    // Short, label-like text must not be flagged.
    #[test]
    fn issue_228_allows_labels() {
        expect_lints("python lsp (fork of pyright)", 0)
    }

    #[test]
    fn allow_camel_case_trademarks() {
        expect_lints(
            "macOS 16 could be called something like Redwood or Shasta",
            0,
        )
    }

    #[test]
    #[ignore = "This can't work because currently hyphens are not included in tokenized words\nalthough they are now permitted in `dictionary.dict`"]
    fn uppercase_unamerican_at_start() {
        expect_lints(
            "un-American starts with a lowercase letter and contains an uppercase letter, but is not a proper noun or trademark.",
            1,
        )
    }

    #[test]
    fn allow_lowercase_proper_nouns() {
        expect_lints(
            concat!(
                "npm is the world's largest software registry. Open source developers from every ",
                "continent use npm to share and borrow packages, and many organizations use npm to ",
                "manage private development as well."
            ),
            0,
        )
    }

    #[test]
    fn allow_lower_camel_case_non_proper_nouns() {
        expect_lints(
            "mRNA is synthesized from the coding sequence of a gene during the transcriptional process.",
            0,
        )
    }
}