tantivy_analysis_contrib/phonetic/
wrapper.rs

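//! Module that contains the phonetic filter wrapper. The wrapper sits on
//! top of the bottom component of the analysis stack (which is a
//! [Tokenizer]): it hands that component the text to parse, then wraps the
//! resulting token stream in the phonetic token stream that matches the
//! configured algorithm.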
4
5use rphonetic::{BeiderMorseBuilder, Encoder, Phonex};
6use tantivy_tokenizer_api::{TokenStream, Tokenizer};
7
8use super::{
9    BeiderMorseTokenStream, DaitchMokotoffTokenStream, DoubleMetaphoneTokenStream,
10    EncoderAlgorithm, GenericPhoneticTokenStream,
11};
12
13/// Phonex wrapper to handle the case only '0'.
14/// This structure implements rphonetic's trait
15/// [Encoder] that delegates call to phonex encoder
16/// and then handle the specific case.
17struct PhonexWrapper(Phonex);
18
19impl Encoder for PhonexWrapper {
20    fn encode(&self, s: &str) -> String {
21        let result = self.0.encode(s);
22        // If only '0' then treat as empty string.
23        if result.bytes().any(|b| b != b'0') {
24            result
25        } else {
26            "".to_owned()
27        }
28    }
29}
30
31#[derive(Debug, Clone)]
32pub struct PhoneticFilterWrapper<T> {
33    algorithm: EncoderAlgorithm,
34    inject: bool,
35    inner: T,
36}
37
38impl<T> PhoneticFilterWrapper<T> {
39    pub(crate) fn new(inner: T, algorithm: EncoderAlgorithm, inject: bool) -> Self {
40        Self {
41            algorithm,
42            inject,
43            inner,
44        }
45    }
46}
47
48impl<T: Tokenizer> Tokenizer for PhoneticFilterWrapper<T> {
49    type TokenStream<'a> = Box<dyn TokenStream + 'a>;
50
51    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
52        match &self.algorithm {
53            // Beider Morse
54            EncoderAlgorithm::BeiderMorse(
55                config_files,
56                name_type,
57                rule_type,
58                concat,
59                max_phonemes,
60                languages_set,
61            ) => {
62                let mut builder = BeiderMorseBuilder::new(config_files);
63                if let Some(name_type) = name_type {
64                    builder = builder.name_type(*name_type);
65                }
66                if let Some(rule_type) = rule_type {
67                    builder = builder.rule_type(*rule_type);
68                }
69                if let Some(concat) = concat {
70                    builder = builder.concat(*concat);
71                }
72                if let Some(max_phonemes) = max_phonemes {
73                    builder = builder.max_phonemes(*max_phonemes);
74                }
75
76                let max_phonemes = match max_phonemes {
77                    Some(max_phonemes) => *max_phonemes,
78                    None => 20,
79                };
80                let encoder = builder.build();
81                Box::new(BeiderMorseTokenStream::new(
82                    self.inner.token_stream(text),
83                    encoder,
84                    max_phonemes,
85                    languages_set.clone(),
86                    self.inject,
87                ))
88            }
89            // Caverphone1
90            EncoderAlgorithm::Caverphone1(encoder) => Box::new(GenericPhoneticTokenStream::new(
91                self.inner.token_stream(text),
92                Box::new(*encoder),
93                self.inject,
94            )),
95            // Caverphone2
96            EncoderAlgorithm::Caverphone2(encoder) => Box::new(GenericPhoneticTokenStream::new(
97                self.inner.token_stream(text),
98                Box::new(*encoder),
99                self.inject,
100            )),
101            // Cologne
102            EncoderAlgorithm::Cologne(encoder) => Box::new(GenericPhoneticTokenStream::new(
103                self.inner.token_stream(text),
104                Box::new(*encoder),
105                self.inject,
106            )),
107            // Daitch Mokotoff
108            EncoderAlgorithm::DaitchMokotoffSoundex(encoder, branching) => {
109                Box::new(DaitchMokotoffTokenStream::new(
110                    self.inner.token_stream(text),
111                    encoder.clone(),
112                    *branching,
113                    self.inject,
114                ))
115            }
116            // Double Metaphone
117            EncoderAlgorithm::DoubleMetaphone(encoder, use_alternate) => match use_alternate {
118                // Alternate: if true, use specific token filter, otherwise, use generic
119                true => Box::new(DoubleMetaphoneTokenStream::new(
120                    self.inner.token_stream(text),
121                    *encoder,
122                    self.inject,
123                )),
124                false => Box::new(GenericPhoneticTokenStream::new(
125                    self.inner.token_stream(text),
126                    Box::new(*encoder),
127                    self.inject,
128                )),
129            },
130            // Match Rating Approach
131            EncoderAlgorithm::MatchRatingApproach(encoder) => {
132                Box::new(GenericPhoneticTokenStream::new(
133                    self.inner.token_stream(text),
134                    Box::new(*encoder),
135                    self.inject,
136                ))
137            }
138            // Metaphone
139            EncoderAlgorithm::Metaphone(encoder) => Box::new(GenericPhoneticTokenStream::new(
140                self.inner.token_stream(text),
141                Box::new(*encoder),
142                self.inject,
143            )),
144            // Nysiis
145            EncoderAlgorithm::Nysiis(encoder) => Box::new(GenericPhoneticTokenStream::new(
146                self.inner.token_stream(text),
147                Box::new(*encoder),
148                self.inject,
149            )),
150            // Phonex
151            EncoderAlgorithm::Phonex(encoder) => Box::new(GenericPhoneticTokenStream::new(
152                self.inner.token_stream(text),
153                Box::new(PhonexWrapper(*encoder)),
154                self.inject,
155            )),
156            // Refined Soundex
157            EncoderAlgorithm::RefinedSoundex(encoder) => Box::new(GenericPhoneticTokenStream::new(
158                self.inner.token_stream(text),
159                Box::new(*encoder),
160                self.inject,
161            )),
162            // Soundex
163            EncoderAlgorithm::Soundex(encoder) => Box::new(GenericPhoneticTokenStream::new(
164                self.inner.token_stream(text),
165                Box::new(*encoder),
166                self.inject,
167            )),
168        }
169    }
170}