harper_core/linting/
repeated_words.rs1use super::{Lint, LintKind, Linter, Suggestion};
2use crate::TokenStringExt;
3use crate::char_string::char_string;
4use crate::{CharString, CharStringExt, Document, Span};
5
6#[derive(Debug, Clone)]
7pub struct RepeatedWords {
8 special_cases: Vec<CharString>,
11}
12impl RepeatedWords {
13 pub fn new() -> Self {
14 Self {
15 special_cases: vec![char_string!("this")],
16 }
17 }
18
19 fn is_special_case(&self, chars: &[char]) -> bool {
20 let lower = chars.to_lower();
21
22 self.special_cases
23 .iter()
24 .any(|v| v.as_slice() == lower.as_ref())
25 }
26}
27
28impl Default for RepeatedWords {
29 fn default() -> Self {
30 Self::new()
31 }
32}
33
34impl Linter for RepeatedWords {
35 fn lint(&mut self, document: &Document) -> Vec<Lint> {
36 let mut lints = Vec::new();
37
38 for chunk in document.iter_chunks() {
39 let mut iter = chunk.iter_word_indices().zip(chunk.iter_words()).peekable();
40
41 while let (Some((idx_a, tok_a)), Some((idx_b, tok_b))) = (iter.next(), iter.peek()) {
42 let word_a = document.get_span_content(&tok_a.span);
43 let word_b = document.get_span_content(&tok_b.span);
44
45 let prev_tok = document.get_token_offset(idx_a, -1);
46 let next_tok = document.get_token_offset(*idx_b, 1);
47
48 if prev_tok.is_some_and(|t| t.kind.is_hyphen())
49 || next_tok.is_some_and(|t| t.kind.is_hyphen())
50 {
51 continue;
52 }
53
54 if (tok_a.kind.is_preposition()
55 || tok_a.kind.is_conjunction()
56 || !tok_a.kind.is_likely_homograph()
57 || self.is_special_case(word_a)
58 || tok_a.kind.is_adverb()
59 || tok_a.kind.is_determiner())
60 && word_a.to_lower() == word_b.to_lower()
61 {
62 let intervening_tokens = &chunk[idx_a + 1..*idx_b];
63
64 if intervening_tokens.iter().any(|t| !t.kind.is_whitespace()) {
65 continue;
66 }
67
68 lints.push(Lint {
69 span: Span::new(tok_a.span.start, tok_b.span.end),
70 lint_kind: LintKind::Repetition,
71 suggestions: vec![Suggestion::ReplaceWith(
72 document.get_span_content(&tok_a.span).to_vec(),
73 )],
74 message: "Did you mean to repeat this word?".to_string(),
75 ..Default::default()
76 })
77 }
78 }
79 }
80
81 lints
82 }
83
84 fn description(&self) -> &'static str {
85 "This rule looks for repetitions of words that are not homographs."
86 }
87}
88
#[cfg(test)]
mod tests {
    use super::RepeatedWords;
    use super::super::tests::assert_lint_count;
    use crate::linting::tests::assert_suggestion_result;

    /// A doubled determiner produces exactly one lint.
    #[test]
    fn catches_basic() {
        let linter = RepeatedWords::default();
        assert_lint_count("I wanted the the banana.", linter, 1);
    }

    /// "address address" is not reported.
    #[test]
    fn does_not_lint_homographs_address() {
        let linter = RepeatedWords::default();
        assert_lint_count("To address address problems.", linter, 0);
    }

    /// "record record" is not reported.
    #[test]
    fn does_not_lint_homographs_record() {
        let linter = RepeatedWords::default();
        assert_lint_count("To record record profits.", linter, 0);
    }

    /// Regression test: "accurate accurate" inside a longer sentence is reported once.
    #[test]
    fn issue_253() {
        let text = "this paper shows that, while the method may be more accurate accurate, the turnout overestimate suggests that self-selection bias is not sufficiently reduced";
        let linter = RepeatedWords::default();
        assert_lint_count(text, linter, 1);
    }

    /// Regression test: "is is" collapses to a single "is".
    #[test]
    fn issue_333() {
        let linter = RepeatedWords::default();
        assert_suggestion_result("This is is a test", linter, "This is a test");
    }

    /// "a a" collapses to a single "a".
    #[test]
    fn double_a() {
        let linter = RepeatedWords::default();
        assert_suggestion_result("This is a a test", linter, "This is a test");
    }

    /// "And and" collapses to the first word, capitalisation included.
    #[test]
    fn double_and() {
        let linter = RepeatedWords::default();
        assert_suggestion_result(
            "And and this is also a test",
            linter,
            "And this is also a test",
        );
    }

    /// "on on" collapses to a single "on".
    #[test]
    fn on_on_github() {
        let linter = RepeatedWords::default();
        assert_suggestion_result(
            "Take a look at the project on on GitHub.",
            linter,
            "Take a look at the project on GitHub.",
        );
    }

    /// "as as" collapses to a single "as".
    #[test]
    fn as_as() {
        let linter = RepeatedWords::default();
        assert_suggestion_result("he is as as hard as nails", linter, "he is as hard as nails");
    }

    /// "opt-in in" is not reported because the first "in" follows a hyphen.
    #[test]
    fn dont_flag_first_hyphenated() {
        let text = "The driver-facing camera and microphone are only logged if you explicitly opt-in in settings.";
        let linter = RepeatedWords::default();
        assert_lint_count(text, linter, 0);
    }

    /// Repeats that touch a hyphen on either side are not reported.
    #[test]
    fn dont_flag_hyphenated_either_side() {
        let linter = RepeatedWords::default();
        assert_lint_count("foo-foo foo bar bar-bar", linter, 0);
    }
}