harper_core/patterns/
split_compound_word.rs1use std::sync::Arc;
2
3use crate::{CharString, Dictionary, FstDictionary, Token, WordMetadata};
4
5use super::{Pattern, SequencePattern};
6
7pub struct SplitCompoundWord {
11 inner: SequencePattern,
12 dict: Arc<FstDictionary>,
13 predicate: Box<dyn Fn(&WordMetadata) -> bool + Send + Sync>,
14}
15
16impl SplitCompoundWord {
17 pub fn new(predicate: impl Fn(&WordMetadata) -> bool + Send + Sync + 'static) -> Self {
20 Self {
21 inner: SequencePattern::default()
22 .then_any_word()
23 .then_whitespace()
24 .then_any_word(),
25 dict: FstDictionary::curated(),
26 predicate: Box::new(predicate),
27 }
28 }
29
30 pub fn get_merged_word(
32 &self,
33 word_a: &Token,
34 word_b: &Token,
35 source: &[char],
36 ) -> Option<CharString> {
37 let a_chars: CharString = word_a.span.get_content(source).into();
38 let b_chars: CharString = word_b.span.get_content(source).into();
39
40 let mut buffer = CharString::new();
41
42 buffer.clear();
43 buffer.extend_from_slice(&a_chars);
44 buffer.extend_from_slice(&b_chars);
45
46 if let Some(metadata) = self.dict.get_word_metadata(&buffer) {
47 if (self.predicate)(metadata) {
48 let correct = self.dict.get_correct_capitalization_of(&buffer).unwrap();
49 buffer.clear();
50 buffer.extend_from_slice(correct);
51 return Some(buffer);
52 }
53 }
54
55 None
56 }
57}
58
59impl Pattern for SplitCompoundWord {
60 fn matches(&self, tokens: &[Token], source: &[char]) -> Option<usize> {
61 let inner_match = self.inner.matches(tokens, source)?;
62
63 if inner_match != 3 {
64 return None;
65 }
66
67 let a = &tokens[0];
68 let b = &tokens[2];
69
70 if self.get_merged_word(a, b, source).is_some() {
71 return Some(inner_match);
72 }
73
74 None
75 }
76}