harper_core/linting/
repeated_words.rs

1use super::{Lint, LintKind, Linter, Suggestion};
2use crate::TokenStringExt;
3use crate::char_string::char_string;
4use crate::{CharString, CharStringExt, Document, Span};
5
6#[derive(Debug, Clone)]
7pub struct RepeatedWords {
8    /// Words that we need to make sure are detected.
9    /// We use a `Vec` since there aren't a whole lot of 'em.
10    special_cases: Vec<CharString>,
11}
12impl RepeatedWords {
13    pub fn new() -> Self {
14        Self {
15            special_cases: vec![char_string!("this")],
16        }
17    }
18
19    fn is_special_case(&self, chars: &[char]) -> bool {
20        let lower = chars.to_lower();
21
22        self.special_cases
23            .iter()
24            .any(|v| v.as_slice() == lower.as_ref())
25    }
26}
27
28impl Default for RepeatedWords {
29    fn default() -> Self {
30        Self::new()
31    }
32}
33
34impl Linter for RepeatedWords {
35    fn lint(&mut self, document: &Document) -> Vec<Lint> {
36        let mut lints = Vec::new();
37
38        for chunk in document.iter_chunks() {
39            let mut iter = chunk.iter_word_indices().zip(chunk.iter_words()).peekable();
40
41            while let (Some((idx_a, tok_a)), Some((idx_b, tok_b))) = (iter.next(), iter.peek()) {
42                let word_a = document.get_span_content(&tok_a.span);
43                let word_b = document.get_span_content(&tok_b.span);
44
45                let prev_tok = document.get_token_offset(idx_a, -1);
46                let next_tok = document.get_token_offset(*idx_b, 1);
47
48                if prev_tok.is_some_and(|t| t.kind.is_hyphen())
49                    || next_tok.is_some_and(|t| t.kind.is_hyphen())
50                {
51                    continue;
52                }
53
54                if (tok_a.kind.is_preposition()
55                    || tok_a.kind.is_conjunction()
56                    || !tok_a.kind.is_likely_homograph()
57                    || self.is_special_case(word_a)
58                    || tok_a.kind.is_adverb()
59                    || tok_a.kind.is_determiner())
60                    && word_a.to_lower() == word_b.to_lower()
61                {
62                    let intervening_tokens = &chunk[idx_a + 1..*idx_b];
63
64                    if intervening_tokens.iter().any(|t| !t.kind.is_whitespace()) {
65                        continue;
66                    }
67
68                    lints.push(Lint {
69                        span: Span::new(tok_a.span.start, tok_b.span.end),
70                        lint_kind: LintKind::Repetition,
71                        suggestions: vec![Suggestion::ReplaceWith(
72                            document.get_span_content(&tok_a.span).to_vec(),
73                        )],
74                        message: "Did you mean to repeat this word?".to_string(),
75                        ..Default::default()
76                    })
77                }
78            }
79        }
80
81        lints
82    }
83
84    fn description(&self) -> &'static str {
85        "This rule looks for repetitions of words that are not homographs."
86    }
87}
88
#[cfg(test)]
mod tests {
    use super::RepeatedWords;
    use crate::linting::tests::{assert_lint_count, assert_suggestion_result};

    /// Fresh linter instance shared by every test below.
    fn linter() -> RepeatedWords {
        RepeatedWords::default()
    }

    /// A doubled determiner is reported once.
    #[test]
    fn catches_basic() {
        assert_lint_count("I wanted the the banana.", linter(), 1);
    }

    /// "address" used as verb then noun must not be reported.
    #[test]
    fn does_not_lint_homographs_address() {
        assert_lint_count("To address address problems.", linter(), 0);
    }

    /// "record" used as verb then adjective must not be reported.
    #[test]
    fn does_not_lint_homographs_record() {
        assert_lint_count("To record record profits.", linter(), 0);
    }

    /// Regression test: a repeated word in the middle of a long sentence is reported once.
    #[test]
    fn issue_253() {
        let text = "this paper shows that, while the method may be more accurate accurate, the turnout overestimate suggests that self-selection bias is not sufficiently reduced";

        assert_lint_count(text, linter(), 1);
    }

    /// Regression test: applying the suggestion collapses "is is" into "is".
    #[test]
    fn issue_333() {
        let input = "This is is a test";
        let expected = "This is a test";

        assert_suggestion_result(input, linter(), expected);
    }

    /// Applying the suggestion collapses "a a" into "a".
    #[test]
    fn double_a() {
        let input = "This is a a test";
        let expected = "This is a test";

        assert_suggestion_result(input, linter(), expected);
    }

    /// The comparison ignores case, and the first occurrence's casing is kept.
    #[test]
    fn double_and() {
        let input = "And and this is also a test";
        let expected = "And this is also a test";

        assert_suggestion_result(input, linter(), expected);
    }

    /// Applying the suggestion collapses "on on" into "on".
    #[test]
    fn on_on_github() {
        let input = "Take a look at the project on on GitHub.";
        let expected = "Take a look at the project on GitHub.";

        assert_suggestion_result(input, linter(), expected);
    }

    /// Only the adjacent "as as" is collapsed; the later "as" is left alone.
    #[test]
    fn as_as() {
        let input = "he is as as hard as nails";
        let expected = "he is as hard as nails";

        assert_suggestion_result(input, linter(), expected);
    }

    /// "opt-in in": the first "in" belongs to a hyphenated compound, so nothing is reported.
    #[test]
    fn dont_flag_first_hyphenated() {
        let text = "The driver-facing camera and microphone are only logged if you explicitly opt-in in settings.";

        assert_lint_count(text, linter(), 0);
    }

    /// A hyphen on either outer side of a repeated pair suppresses the lint.
    #[test]
    fn dont_flag_hyphenated_either_side() {
        assert_lint_count("foo-foo foo bar bar-bar", linter(), 0);
    }
}