harper_core/patterns/
split_compound_word.rs

1use std::sync::Arc;
2
3use crate::{CharString, Dictionary, FstDictionary, Token, WordMetadata};
4
5use super::{Pattern, SequencePattern};
6
7/// A [`Pattern`] that looks for valid words accidentally split by whitespace.
8///
9/// Note that matches of this pattern are not guaranteed to be valid if merged.
10pub struct SplitCompoundWord {
11    inner: SequencePattern,
12    dict: Arc<FstDictionary>,
13    predicate: Box<dyn Fn(&WordMetadata) -> bool + Send + Sync>,
14}
15
16impl SplitCompoundWord {
17    /// Create a new instance of the linter which will only look for compound words that fit the
18    /// provided predicate.
19    pub fn new(predicate: impl Fn(&WordMetadata) -> bool + Send + Sync + 'static) -> Self {
20        Self {
21            inner: SequencePattern::default()
22                .then_any_word()
23                .then_whitespace()
24                .then_any_word(),
25            dict: FstDictionary::curated(),
26            predicate: Box::new(predicate),
27        }
28    }
29
30    /// Get the merged word from the dictionary that this pattern would match on if it was split.
31    pub fn get_merged_word(
32        &self,
33        word_a: &Token,
34        word_b: &Token,
35        source: &[char],
36    ) -> Option<CharString> {
37        let a_chars: CharString = word_a.span.get_content(source).into();
38        let b_chars: CharString = word_b.span.get_content(source).into();
39
40        let mut buffer = CharString::new();
41
42        buffer.clear();
43        buffer.extend_from_slice(&a_chars);
44        buffer.extend_from_slice(&b_chars);
45
46        if let Some(metadata) = self.dict.get_word_metadata(&buffer) {
47            if (self.predicate)(metadata) {
48                let correct = self.dict.get_correct_capitalization_of(&buffer).unwrap();
49                buffer.clear();
50                buffer.extend_from_slice(correct);
51                return Some(buffer);
52            }
53        }
54
55        None
56    }
57}
58
59impl Pattern for SplitCompoundWord {
60    fn matches(&self, tokens: &[Token], source: &[char]) -> Option<usize> {
61        let inner_match = self.inner.matches(tokens, source)?;
62
63        if inner_match != 3 {
64            return None;
65        }
66
67        let a = &tokens[0];
68        let b = &tokens[2];
69
70        if self.get_merged_word(a, b, source).is_some() {
71            return Some(inner_match);
72        }
73
74        None
75    }
76}