mod error;
mod helper;
mod hybrid_jaccard;
mod jaro;
mod jaro_winkler;
mod levenshtein;
mod monge_elkan;
mod python;
mod tokenizers;
mod wrapped_strsim;

use anyhow::Result;

use pyo3::{prelude::*, types::PyList};

pub use self::hybrid_jaccard::HybridJaccard;
pub use self::jaro::Jaro;
pub use self::jaro_winkler::JaroWinkler;
pub use self::levenshtein::Levenshtein;
pub use self::monge_elkan::{MongeElkan, SymmetricMongeElkan};

pub use self::tokenizers::{
    CachedWhitespaceTokenizer, CharacterTokenizer, WhitespaceCharSeqTokenizer, WhitespaceTokenizer,
};
pub use self::wrapped_strsim::SeqStrSim;
pub use crate::error::StrSimError;
pub use crate::helper::ReturnKind;
28
29pub trait StrSim<T> {
30 fn similarity_pre_tok2(
32 &self,
33 tokenized_key: &T,
34 tokenized_query: &T,
35 ) -> Result<f64, StrSimError>;
36}
37
38pub trait ExpectTokenizerType {
39 fn get_expected_tokenizer_type(&self) -> TokenizerType;
40}
41
42pub trait StrSimWithTokenizer<T>: StrSim<T> {
43 fn similarity(&self, key: &str, query: &str) -> Result<f64, StrSimError>;
57
58 fn similarity_pre_tok1(&self, key: &str, tokenized_query: &T) -> Result<f64, StrSimError>;
62
63 fn tokenize(&self, str: &str) -> T;
67
68 fn tokenize_list(&self, strs: &[&str]) -> Vec<T>;
72}
73
74pub trait StrSimWithMutTokenizer<T>: StrSim<T> {
77 fn similarity(&mut self, key: &str, query: &str) -> Result<f64, StrSimError>;
79
80 fn similarity_pre_tok1(&mut self, key: &str, tokenized_query: &T) -> Result<f64, StrSimError>;
82
83 fn tokenize(&mut self, str: &str) -> T;
85
86 fn tokenize_list(&mut self, strs: &[&str]) -> Vec<T>;
88}
89
/// Describes a tokenizer's output as an outer kind (`Seq` or `Set`) plus an
/// optional nested [`TokenizerType`].
///
/// The payload is boxed because the type is recursive.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenizerType {
    /// Outer kind `Seq`, optionally carrying a nested type.
    Seq(Box<Option<TokenizerType>>),
    /// Outer kind `Set`, optionally carrying a nested type.
    Set(Box<Option<TokenizerType>>),
}

impl TokenizerType {
    /// Returns `true` when the outermost variant is [`TokenizerType::Seq`].
    fn is_outer_seq(&self) -> bool {
        matches!(self, TokenizerType::Seq(_))
    }

    /// Returns `true` when the outermost variant is [`TokenizerType::Set`].
    // Kept for symmetry with `is_outer_seq` even though nothing in this file calls it.
    #[allow(dead_code)]
    #[inline]
    fn is_outer_set(&self) -> bool {
        !self.is_outer_seq()
    }

    /// Returns `true` when a nested type is present, whatever the outer variant.
    fn has_nested(&self) -> bool {
        self.get_nested().is_some()
    }

    /// Borrows the nested type slot, which is `None` when nothing is nested.
    fn get_nested(&self) -> &Option<TokenizerType> {
        match self {
            // `inner` is `&Box<Option<_>>`; go through the box to borrow the `Option` itself.
            TokenizerType::Seq(inner) | TokenizerType::Set(inner) => &**inner,
        }
    }
}
124
125pub trait BaseTokenizer<T> {
126 type Return: for<'t> ReturnKind<'t, T>;
127
128 fn is_compatible(&self, tok_type: &TokenizerType) -> bool;
129}
130
131pub trait Tokenizer<T>: BaseTokenizer<T> {
132 fn tokenize<'t>(&'t self, s: &str) -> <Self::Return as ReturnKind<'t, T>>::Type;
133 fn tokenize_pair<'t>(
134 &'t self,
135 key: &str,
136 query: &str,
137 ) -> (
138 <Self::Return as ReturnKind<'t, T>>::Type,
139 <Self::Return as ReturnKind<'t, T>>::Type,
140 );
141 fn unique_tokenize<'t>(&'t self, s: &str) -> <Self::Return as ReturnKind<'t, T>>::Type;
142 fn unique_tokenize_pair<'t>(
143 &'t self,
144 key: &str,
145 query: &str,
146 ) -> (
147 <Self::Return as ReturnKind<'t, T>>::Type,
148 <Self::Return as ReturnKind<'t, T>>::Type,
149 );
150}
151
152pub trait MutTokenizer<T>: BaseTokenizer<T> {
153 fn tokenize<'t>(&'t mut self, s: &str) -> <Self::Return as ReturnKind<'t, T>>::Type;
154 fn tokenize_pair<'t>(
155 &'t mut self,
156 key: &str,
157 query: &str,
158 ) -> (
159 <Self::Return as ReturnKind<'t, T>>::Type,
160 <Self::Return as ReturnKind<'t, T>>::Type,
161 );
162 fn unique_tokenize<'t>(&'t mut self, s: &str) -> <Self::Return as ReturnKind<'t, T>>::Type;
163 fn unique_tokenize_pair<'t>(
164 &'t mut self,
165 key: &str,
166 query: &str,
167 ) -> (
168 <Self::Return as ReturnKind<'t, T>>::Type,
169 <Self::Return as ReturnKind<'t, T>>::Type,
170 );
171}
172
173impl<'t, T: Sized + 't> ReturnKind<'t, T> for Vec<char> {
174 type Type = Vec<char>;
175}
176
177impl<'t, T: Sized + 't> ReturnKind<'t, T> for Vec<Vec<char>> {
178 type Type = Vec<Vec<char>>;
179}
180
181impl<'t, T: Sized + 't> ReturnKind<'t, T> for Vec<String> {
182 type Type = Vec<String>;
183}
184
185impl<'t, T: Sized + 't> ReturnKind<'t, T> for &'t Vec<String> {
186 type Type = &'t Vec<String>;
187}
188
// Python extension-module entry point; compiled only when the
// `extension-module` feature is enabled.
//
// The Rust function is called `yass`, but `#[pyo3(name = "strsim")]` makes the
// Python-visible module name `strsim`.
//
// These notes are plain `//` comments on purpose: PyO3 uses `///` doc comments
// on a `#[pymodule]` function as the Python module docstring, so adding them
// would change what Python sees.
#[cfg(feature = "extension-module")]
#[pymodule]
#[pyo3(name = "strsim")]
pub fn yass(py: Python<'_>, m: &PyModule) -> PyResult<()> {
    // Expose the crate version taken from Cargo at compile time.
    m.add("__version__", env!("CARGO_PKG_VERSION"))?;
    // NOTE(review): an empty `__path__` presumably makes Python treat this
    // module as a package so submodule imports resolve; confirm intent.
    m.setattr("__path__", PyList::empty(py))?;

    // Classes exported to Python.
    m.add_class::<self::python::PyWhitespaceCharSeqTokenizer>()?;
    m.add_class::<self::python::PyCharacterTokenizer>()?;
    m.add_class::<self::python::VecVecChar>()?;

    // Similarity functions exported to Python.
    m.add_function(wrap_pyfunction!(self::python::levenshtein_similarity, m)?)?;
    m.add_function(wrap_pyfunction!(self::python::jaro_similarity, m)?)?;
    m.add_function(wrap_pyfunction!(self::python::jaro_winkler_similarity, m)?)?;
    m.add_function(wrap_pyfunction!(self::python::monge_elkan_similarity, m)?)?;
    m.add_function(wrap_pyfunction!(
        self::python::symmetric_monge_elkan_similarity,
        m
    )?)?;
    m.add_function(wrap_pyfunction!(
        self::python::hybrid_jaccard_similarity,
        m
    )?)?;

    Ok(())
}