// yass/lib.rs

mod error;
mod helper;
mod hybrid_jaccard;
mod jaro;
mod jaro_winkler;
mod levenshtein;
mod monge_elkan;
mod python;
mod tokenizers;
mod wrapped_strsim;

use anyhow::Result;

use pyo3::{prelude::*, types::PyList};

pub use self::hybrid_jaccard::HybridJaccard;
pub use self::jaro::Jaro;
pub use self::jaro_winkler::JaroWinkler;
pub use self::levenshtein::Levenshtein;
pub use self::monge_elkan::{MongeElkan, SymmetricMongeElkan};

pub use self::tokenizers::{
    CachedWhitespaceTokenizer, CharacterTokenizer, WhitespaceCharSeqTokenizer, WhitespaceTokenizer,
};
pub use self::wrapped_strsim::SeqStrSim;
pub use crate::error::StrSimError;
pub use crate::helper::ReturnKind;

29pub trait StrSim<T> {
30    /** Calculate the similarity with both key and query has already``` been pre-tokenized */
31    fn similarity_pre_tok2(
32        &self,
33        tokenized_key: &T,
34        tokenized_query: &T,
35    ) -> Result<f64, StrSimError>;
36}
37
38pub trait ExpectTokenizerType {
39    fn get_expected_tokenizer_type(&self) -> TokenizerType;
40}
41
42pub trait StrSimWithTokenizer<T>: StrSim<T> {
43    ///
44    /// Calculate the similarity between two strings. Usually, the similarity function is symmetric so
45    /// key and query can be swapped. However, some functions such as monge-elkan are not symmetric, so
46    /// key and query takes specific meaning: key is the value in the database and query is the search
47    /// query from the user.
48    ///
49    /// The return value is a likelihood between 0 and 1.
50    ///
51    /// # Arguments
52    ///
53    /// * `key` the value in the database (e.g., entity label)
54    /// * `query` the search query from the user (e.g., cell in the table)
55    ///
56    fn similarity(&self, key: &str, query: &str) -> Result<f64, StrSimError>;
57
58    ///
59    /// Calculate the similarity with the query's already been pre-tokenized
60    ///
61    fn similarity_pre_tok1(&self, key: &str, tokenized_query: &T) -> Result<f64, StrSimError>;
62
63    ///
64    /// Tokenize a string into a tokens used for this method.
65    ///
66    fn tokenize(&self, str: &str) -> T;
67
68    ///
69    /// Tokenize a list of strings into a list of tokens used for this method.
70    ///
71    fn tokenize_list(&self, strs: &[&str]) -> Vec<T>;
72}
73
74/// A mirror trait of StrSimWithTokenizer, but requires mutable self to invoke functions that call
75/// tokenizers' functions.
76pub trait StrSimWithMutTokenizer<T>: StrSim<T> {
77    /// See StrSimWithTokenizer::similarity
78    fn similarity(&mut self, key: &str, query: &str) -> Result<f64, StrSimError>;
79
80    /// See StrSimWithTokenizer::similarity_pre_tok1
81    fn similarity_pre_tok1(&mut self, key: &str, tokenized_query: &T) -> Result<f64, StrSimError>;
82
83    /// See StrSimWithTokenizer::tokenize
84    fn tokenize(&mut self, str: &str) -> T;
85
86    /// See StrSimWithTokenizer::tokenize_list
87    fn tokenize_list(&mut self, strs: &[&str]) -> Vec<T>;
88}
89
/// Describes the kind of output a tokenizer produces.
///
/// The variant names the outer kind (`Seq` or `Set`); the boxed payload optionally holds
/// a nested `TokenizerType`, with `None` meaning there is no nested level.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenizerType {
    /// Outer kind `Seq`, with an optional nested tokenizer type.
    Seq(Box<Option<TokenizerType>>),
    /// Outer kind `Set`, with an optional nested tokenizer type.
    Set(Box<Option<TokenizerType>>),
}

96impl TokenizerType {
97    fn is_outer_seq(&self) -> bool {
98        match self {
99            TokenizerType::Seq(_) => true,
100            TokenizerType::Set(_) => false,
101        }
102    }
103
104    #[allow(dead_code)]
105    #[inline]
106    fn is_outer_set(&self) -> bool {
107        !self.is_outer_seq()
108    }
109
110    fn has_nested(&self) -> bool {
111        match self {
112            TokenizerType::Seq(inner) => inner.is_some(),
113            TokenizerType::Set(inner) => inner.is_some(),
114        }
115    }
116
117    fn get_nested(&self) -> &Option<TokenizerType> {
118        match self {
119            TokenizerType::Seq(inner) => inner.as_ref(),
120            TokenizerType::Set(inner) => inner.as_ref(),
121        }
122    }
123}
124
125pub trait BaseTokenizer<T> {
126    type Return: for<'t> ReturnKind<'t, T>;
127
128    fn is_compatible(&self, tok_type: &TokenizerType) -> bool;
129}
130
131pub trait Tokenizer<T>: BaseTokenizer<T> {
132    fn tokenize<'t>(&'t self, s: &str) -> <Self::Return as ReturnKind<'t, T>>::Type;
133    fn tokenize_pair<'t>(
134        &'t self,
135        key: &str,
136        query: &str,
137    ) -> (
138        <Self::Return as ReturnKind<'t, T>>::Type,
139        <Self::Return as ReturnKind<'t, T>>::Type,
140    );
141    fn unique_tokenize<'t>(&'t self, s: &str) -> <Self::Return as ReturnKind<'t, T>>::Type;
142    fn unique_tokenize_pair<'t>(
143        &'t self,
144        key: &str,
145        query: &str,
146    ) -> (
147        <Self::Return as ReturnKind<'t, T>>::Type,
148        <Self::Return as ReturnKind<'t, T>>::Type,
149    );
150}
151
152pub trait MutTokenizer<T>: BaseTokenizer<T> {
153    fn tokenize<'t>(&'t mut self, s: &str) -> <Self::Return as ReturnKind<'t, T>>::Type;
154    fn tokenize_pair<'t>(
155        &'t mut self,
156        key: &str,
157        query: &str,
158    ) -> (
159        <Self::Return as ReturnKind<'t, T>>::Type,
160        <Self::Return as ReturnKind<'t, T>>::Type,
161    );
162    fn unique_tokenize<'t>(&'t mut self, s: &str) -> <Self::Return as ReturnKind<'t, T>>::Type;
163    fn unique_tokenize_pair<'t>(
164        &'t mut self,
165        key: &str,
166        query: &str,
167    ) -> (
168        <Self::Return as ReturnKind<'t, T>>::Type,
169        <Self::Return as ReturnKind<'t, T>>::Type,
170    );
171}
172
173impl<'t, T: Sized + 't> ReturnKind<'t, T> for Vec<char> {
174    type Type = Vec<char>;
175}
176
177impl<'t, T: Sized + 't> ReturnKind<'t, T> for Vec<Vec<char>> {
178    type Type = Vec<Vec<char>>;
179}
180
181impl<'t, T: Sized + 't> ReturnKind<'t, T> for Vec<String> {
182    type Type = Vec<String>;
183}
184
185impl<'t, T: Sized + 't> ReturnKind<'t, T> for &'t Vec<String> {
186    type Type = &'t Vec<String>;
187}
188
/// String similarity functions that are implemented in Rust.
// The doc line above is kept verbatim: `#[pymodule]` exposes it as the Python module
// docstring. The module is only compiled with the `extension-module` feature and is
// named `strsim` on the Python side.
#[cfg(feature = "extension-module")]
#[pymodule]
#[pyo3(name = "strsim")]
pub fn yass(py: Python<'_>, m: &PyModule) -> PyResult<()> {
    m.add("__version__", env!("CARGO_PKG_VERSION"))?;
    // NOTE(review): an empty `__path__` presumably makes Python treat the module as a
    // package; confirm against the Python-side packaging.
    m.setattr("__path__", PyList::empty(py))?;

    // Classes exposed to Python.
    m.add_class::<self::python::PyWhitespaceCharSeqTokenizer>()?;
    m.add_class::<self::python::PyCharacterTokenizer>()?;
    m.add_class::<self::python::VecVecChar>()?;

    // Similarity functions exposed to Python.
    m.add_function(wrap_pyfunction!(self::python::levenshtein_similarity, m)?)?;
    m.add_function(wrap_pyfunction!(self::python::jaro_similarity, m)?)?;
    m.add_function(wrap_pyfunction!(self::python::jaro_winkler_similarity, m)?)?;
    m.add_function(wrap_pyfunction!(self::python::monge_elkan_similarity, m)?)?;
    m.add_function(wrap_pyfunction!(
        self::python::symmetric_monge_elkan_similarity,
        m
    )?)?;
    m.add_function(wrap_pyfunction!(
        self::python::hybrid_jaccard_similarity,
        m
    )?)?;

    Ok(())
}