tantivy_analysis_contrib/phonetic/
wrapper.rs1use rphonetic::{BeiderMorseBuilder, Encoder, Phonex};
6use tantivy_tokenizer_api::{TokenStream, Tokenizer};
7
8use super::{
9 BeiderMorseTokenStream, DaitchMokotoffTokenStream, DoubleMetaphoneTokenStream,
10 EncoderAlgorithm, GenericPhoneticTokenStream,
11};
12
13struct PhonexWrapper(Phonex);
18
19impl Encoder for PhonexWrapper {
20 fn encode(&self, s: &str) -> String {
21 let result = self.0.encode(s);
22 if result.bytes().any(|b| b != b'0') {
24 result
25 } else {
26 "".to_owned()
27 }
28 }
29}
30
31#[derive(Debug, Clone)]
32pub struct PhoneticFilterWrapper<T> {
33 algorithm: EncoderAlgorithm,
34 inject: bool,
35 inner: T,
36}
37
38impl<T> PhoneticFilterWrapper<T> {
39 pub(crate) fn new(inner: T, algorithm: EncoderAlgorithm, inject: bool) -> Self {
40 Self {
41 algorithm,
42 inject,
43 inner,
44 }
45 }
46}
47
48impl<T: Tokenizer> Tokenizer for PhoneticFilterWrapper<T> {
49 type TokenStream<'a> = Box<dyn TokenStream + 'a>;
50
51 fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
52 match &self.algorithm {
53 EncoderAlgorithm::BeiderMorse(
55 config_files,
56 name_type,
57 rule_type,
58 concat,
59 max_phonemes,
60 languages_set,
61 ) => {
62 let mut builder = BeiderMorseBuilder::new(config_files);
63 if let Some(name_type) = name_type {
64 builder = builder.name_type(*name_type);
65 }
66 if let Some(rule_type) = rule_type {
67 builder = builder.rule_type(*rule_type);
68 }
69 if let Some(concat) = concat {
70 builder = builder.concat(*concat);
71 }
72 if let Some(max_phonemes) = max_phonemes {
73 builder = builder.max_phonemes(*max_phonemes);
74 }
75
76 let max_phonemes = match max_phonemes {
77 Some(max_phonemes) => *max_phonemes,
78 None => 20,
79 };
80 let encoder = builder.build();
81 Box::new(BeiderMorseTokenStream::new(
82 self.inner.token_stream(text),
83 encoder,
84 max_phonemes,
85 languages_set.clone(),
86 self.inject,
87 ))
88 }
89 EncoderAlgorithm::Caverphone1(encoder) => Box::new(GenericPhoneticTokenStream::new(
91 self.inner.token_stream(text),
92 Box::new(*encoder),
93 self.inject,
94 )),
95 EncoderAlgorithm::Caverphone2(encoder) => Box::new(GenericPhoneticTokenStream::new(
97 self.inner.token_stream(text),
98 Box::new(*encoder),
99 self.inject,
100 )),
101 EncoderAlgorithm::Cologne(encoder) => Box::new(GenericPhoneticTokenStream::new(
103 self.inner.token_stream(text),
104 Box::new(*encoder),
105 self.inject,
106 )),
107 EncoderAlgorithm::DaitchMokotoffSoundex(encoder, branching) => {
109 Box::new(DaitchMokotoffTokenStream::new(
110 self.inner.token_stream(text),
111 encoder.clone(),
112 *branching,
113 self.inject,
114 ))
115 }
116 EncoderAlgorithm::DoubleMetaphone(encoder, use_alternate) => match use_alternate {
118 true => Box::new(DoubleMetaphoneTokenStream::new(
120 self.inner.token_stream(text),
121 *encoder,
122 self.inject,
123 )),
124 false => Box::new(GenericPhoneticTokenStream::new(
125 self.inner.token_stream(text),
126 Box::new(*encoder),
127 self.inject,
128 )),
129 },
130 EncoderAlgorithm::MatchRatingApproach(encoder) => {
132 Box::new(GenericPhoneticTokenStream::new(
133 self.inner.token_stream(text),
134 Box::new(*encoder),
135 self.inject,
136 ))
137 }
138 EncoderAlgorithm::Metaphone(encoder) => Box::new(GenericPhoneticTokenStream::new(
140 self.inner.token_stream(text),
141 Box::new(*encoder),
142 self.inject,
143 )),
144 EncoderAlgorithm::Nysiis(encoder) => Box::new(GenericPhoneticTokenStream::new(
146 self.inner.token_stream(text),
147 Box::new(*encoder),
148 self.inject,
149 )),
150 EncoderAlgorithm::Phonex(encoder) => Box::new(GenericPhoneticTokenStream::new(
152 self.inner.token_stream(text),
153 Box::new(PhonexWrapper(*encoder)),
154 self.inject,
155 )),
156 EncoderAlgorithm::RefinedSoundex(encoder) => Box::new(GenericPhoneticTokenStream::new(
158 self.inner.token_stream(text),
159 Box::new(*encoder),
160 self.inject,
161 )),
162 EncoderAlgorithm::Soundex(encoder) => Box::new(GenericPhoneticTokenStream::new(
164 self.inner.token_stream(text),
165 Box::new(*encoder),
166 self.inject,
167 )),
168 }
169 }
170}