harper_core/linting/
sentence_capitalization.rs

1use super::Suggestion;
2use super::{Lint, LintKind, Linter};
3use crate::document::Document;
4use crate::spell::Dictionary;
5use crate::{Token, TokenKind, TokenStringExt};
6
7pub struct SentenceCapitalization<T>
8where
9    T: Dictionary,
10{
11    dictionary: T,
12}
13
14impl<T: Dictionary> SentenceCapitalization<T> {
15    pub fn new(dictionary: T) -> Self {
16        Self { dictionary }
17    }
18}
19
20impl<T: Dictionary> Linter for SentenceCapitalization<T> {
21    /// A linter that checks to make sure the first word of each sentence is
22    /// capitalized.
23    fn lint(&mut self, document: &Document) -> Vec<Lint> {
24        let mut lints = Vec::new();
25
26        for paragraph in document.iter_paragraphs() {
27            // Allows short, label-like comments in code.
28            if paragraph.iter_sentences().count() == 1 {
29                let only_sentence = paragraph.iter_sentences().next().unwrap();
30
31                if !only_sentence
32                    .iter_chunks()
33                    .map(|c| c.iter_words().count())
34                    .any(|c| c > 5)
35                {
36                    continue;
37                }
38            }
39
40            for sentence in paragraph.iter_sentences() {
41                if !is_full_sentence(sentence) {
42                    continue;
43                }
44
45                if let Some(first_word) = sentence.first_non_whitespace() {
46                    if !first_word.kind.is_word() {
47                        continue;
48                    }
49
50                    let word_chars = document.get_span_content(&first_word.span);
51
52                    if let Some(first_char) = word_chars.first()
53                        && first_char.is_alphabetic()
54                        && !first_char.is_uppercase()
55                    {
56                        if let Some(canonical_spelling) =
57                            self.dictionary.get_correct_capitalization_of(word_chars)
58                        {
59                            // Skip if it's a proper noun or contains uppercase letters before a separator
60                            if first_word.kind.is_proper_noun() {
61                                continue;
62                            }
63
64                            // Check for uppercase letters in the rest of the word before any separators
65                            if canonical_spelling
66                                .iter()
67                                .skip(1)
68                                .take_while(|&c| !c.is_whitespace() && *c != '-' && *c != '\'')
69                                .any(|&c| c.is_uppercase())
70                            {
71                                continue;
72                            }
73                        }
74
75                        let target_span = first_word.span;
76                        let mut replacement_chars =
77                            document.get_span_content(&target_span).to_vec();
78                        replacement_chars[0] = replacement_chars[0].to_ascii_uppercase();
79
80                        lints.push(Lint {
81                            span: target_span,
82                            lint_kind: LintKind::Capitalization,
83                            suggestions: vec![Suggestion::ReplaceWith(replacement_chars)],
84                            priority: 31,
85                            message: "This sentence does not start with a capital letter"
86                                .to_string(),
87                        });
88                    }
89                }
90            }
91        }
92
93        lints
94    }
95
96    fn description(&self) -> &'static str {
97        "The opening word of a sentence should almost always be capitalized."
98    }
99}
100
101fn is_full_sentence(toks: &[Token]) -> bool {
102    let mut has_nominal = false;
103    let mut has_verb = false;
104
105    for tok in toks {
106        if let TokenKind::Word(Some(metadata)) = &tok.kind {
107            if metadata.is_nominal() {
108                has_nominal = true;
109            }
110
111            if metadata.is_verb() {
112                has_verb = true;
113            }
114        }
115    }
116
117    has_nominal && has_verb
118}
119
#[cfg(test)]
mod tests {
    use super::super::tests::assert_lint_count;
    use super::SentenceCapitalization;
    use crate::spell::FstDictionary;

    /// Lints `text` with a `SentenceCapitalization` built on the curated
    /// dictionary and asserts that exactly `expected` lints are produced.
    #[track_caller]
    fn expect_lints(text: &str, expected: usize) {
        assert_lint_count(
            text,
            SentenceCapitalization::new(FstDictionary::curated()),
            expected,
        );
    }

    #[test]
    fn catches_basic() {
        expect_lints("there is no way she is not guilty.", 1);
    }

    #[test]
    fn no_period() {
        expect_lints("there is no way she is not guilty", 1);
    }

    #[test]
    fn two_sentence() {
        expect_lints(
            "i have complete conviction in this. she is absolutely guilty",
            2,
        );
    }

    #[test]
    fn start_with_number() {
        expect_lints("53 is the length of the longest word.", 0);
    }

    #[test]
    fn ignores_unlintable() {
        expect_lints(
            "[`misspelled_word`] is assumed to be quite small (n < 100). ",
            0,
        );
    }

    #[test]
    fn unfazed_unlintable() {
        expect_lints("the linter should not be affected by `this` unlintable.", 1);
    }

    #[test]
    fn unfazed_ellipsis() {
        expect_lints("the linter should not be affected by... that ellipsis.", 1);
    }

    #[test]
    fn unfazed_comma() {
        expect_lints("the linter should not be affected by, that comma.", 1);
    }

    #[test]
    fn issue_228_allows_labels() {
        expect_lints("python lsp (fork of pyright)", 0);
    }

    #[test]
    fn allow_camel_case_trademarks() {
        // Some words are marked as proper nouns in `dictionary.dict` but are lower camel case.
        expect_lints("macOS 16 could be called something like Redwood or Shasta", 0);
    }

    #[test]
    #[ignore = "This can't work because currently hyphens are not included in tokenized words\nalthough they are now permitted in `dictionary.dict`"]
    fn uppercase_unamerican_at_start() {
        expect_lints(
            "un-American starts with a lowercase letter and contains an uppercase letter, but is not a proper noun or trademark.",
            1,
        );
    }

    #[test]
    fn allow_lowercase_proper_nouns() {
        // A very few words are marked as proper nouns even though they're all lowercase.
        // https://css-tricks.com/start-sentence-npm/
        expect_lints(
            concat!(
                "npm is the world's largest software registry. Open source developers from every ",
                "continent use npm to share and borrow packages, and many organizations use npm to ",
                "manage private development as well."
            ),
            0,
        );
    }

    #[test]
    fn allow_lower_camel_case_non_proper_nouns() {
        // A very few words are not considered proper nouns, yet start with a lowercase letter
        // that must stay lowercase even at the start of a sentence.
        expect_lints(
            "mRNA is synthesized from the coding sequence of a gene during the transcriptional process.",
            0,
        );
    }
}