matcher_rs 0.4.5

A high-performance, multi-functional word matcher
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
use std::borrow::Cow;

use fancy_regex::Regex;
use rapidfuzz::distance;
use sonic_rs::{Deserialize, Serialize};

use crate::matcher::{MatchResultTrait, TextMatcherTrait};
#[cfg(feature = "serde")]
use crate::util::serde::serde_regex;

/// The similarity algorithms a [SimTable] can select.
///
/// Each variant corresponds to one `rapidfuzz::distance` module; [SimMatcher]
/// dispatches on this value for every table it evaluates.
///
/// The enum is a plain fieldless tag, so it is `Copy` and implements `Eq` and
/// `Hash` in addition to `PartialEq`; it can therefore be used as a map or set
/// key.
///
/// When serialized or deserialized, variant names are converted to snake_case
/// (for example `DamerauLevenshtein` becomes `damerau_levenshtein`).
#[derive(Serialize, Deserialize, Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[serde(rename_all = "snake_case")]
pub enum SimMatchType {
    /// Levenshtein distance: counts the single-character insertions, deletions
    /// and substitutions required to change one string into the other.
    Levenshtein,
    /// Damerau-Levenshtein distance: Levenshtein extended so that swapping two
    /// adjacent characters also counts as a single edit.
    DamerauLevenshtein,
    /// Insertion-deletion distance: only insertions and deletions are allowed
    /// as edit operations.
    Indel,
    /// Jaro similarity: based on the number and order of matching characters.
    Jaro,
    /// Jaro-Winkler similarity: a Jaro variant that rates strings sharing a
    /// common beginning more favorably.
    JaroWinkler,
}

/// A struct representing a similarity table used for matching operations.
///
/// The [SimTable] struct is the user-facing description of one table: a list of words,
/// the identifiers that are reported back when one of those words matches, the similarity
/// algorithm to use, and the score threshold a word has to reach.
///
/// The lifetime `'a` ties the table to the borrowed word list, so the word list must
/// outlive the `SimTable` instance. [SimMatcher::new] copies the words into owned
/// strings, so the borrow is only needed while the matcher is being built.
///
/// # Fields
///
/// - `table_id` ([u32]): The identifier of the similarity table.
/// - `match_id` ([u32]): An ID that serves as an identifier for the match within the table.
/// - `sim_match_type` ([SimMatchType]): The type of similarity matching algorithm to be used
///   with this table.
/// - `word_list` (`&'a Vec<&'a str>`): A reference to a vector of string slices representing
///   the words in this similarity table. These words will be used in the matching process.
/// - `threshold` ([f64]): The threshold value for similarity scoring. It is handed to the
///   similarity algorithm as its score cutoff; normalized scores typically range from
///   0.0 to 1.0, with higher values indicating higher similarity.
///
/// # Example
///
/// ```
/// use matcher_rs::{SimTable, SimMatchType};
///
/// let words = vec!["example1", "example2"];
///
/// let table = SimTable {
///     table_id: 1,
///     match_id: 1,
///     sim_match_type: SimMatchType::Levenshtein,
///     word_list: &words,
///     threshold: 0.8,
/// };
/// ```
#[derive(Debug, Clone)]
pub struct SimTable<'a> {
    /// Identifier of this table; reported in every result the table produces.
    pub table_id: u32,
    /// Match identifier of this table; reported in every result the table produces.
    pub match_id: u32,
    /// Algorithm used to score the input text against each word of this table.
    pub sim_match_type: SimMatchType,
    /// Borrowed list of words the input text is compared against.
    pub word_list: &'a Vec<&'a str>,
    /// Score cutoff a word has to reach to count as a match.
    pub threshold: f64,
}

/// A struct representing a processed similarity table.
///
/// The [SimProcessedTable] struct is the owned counterpart of [SimTable] that
/// [SimMatcher] stores internally. [SimMatcher::new] builds one instance per input
/// table by copying the identifiers, algorithm and threshold and by converting every
/// borrowed word into an owned [String], so the matcher does not borrow from the
/// caller's data.
///
/// # Fields
///
/// - `table_id` ([u32]): The identifier of the similarity table.
/// - `match_id` ([u32]): An ID that serves as an identifier for the match within the table.
/// - `sim_match_type` ([SimMatchType]): The type of similarity matching algorithm used for this table.
/// - `word_list` ([`Vec<String>`]): A vector of owned strings representing the words in this similarity table.
///   The words are stored exactly as they were supplied in the original [SimTable].
/// - `threshold` ([f64]): The threshold value for similarity scoring, passed to the similarity
///   algorithm as its score cutoff. Normalized scores typically range from 0.0 to 1.0,
///   with higher values indicating higher similarity.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
struct SimProcessedTable {
    /// Identifier copied from the originating [SimTable].
    table_id: u32,
    /// Match identifier copied from the originating [SimTable].
    match_id: u32,
    /// Algorithm used to score the input text against `word_list`.
    sim_match_type: SimMatchType,
    /// Owned copies of the words of the originating [SimTable].
    word_list: Vec<String>,
    /// Score cutoff a word has to reach to count as a match.
    threshold: f64,
}

/// A struct representing the result of a similarity match.
///
/// The `SimResult` struct captures the details of a table word that was found to be
/// similar to the input text during the similarity matching process. It includes the
/// matched word, the identifier of the table where the word was found, the match
/// identifier of that table, and the similarity score computed for the match.
///
/// The lifetime `'a` bounds the borrowed form of `word`: a result holding a borrowed
/// word cannot outlive the data the word was borrowed from.
///
/// # Fields
///
/// - `match_id` ([u32]): An ID that serves as an identifier for the match.
/// - `table_id` ([u32]): The identifier of the table where the word was found.
/// - `word` (`Cow<'a, str>`): The word that was found to be similar. It is stored as a [Cow]
///   (clone-on-write) to allow for both owned and borrowed strings.
/// - `similarity` ([f64]): The similarity score computed for the match. This score typically
///   ranges from 0.0 to 1.0, with higher values indicating greater similarity.
#[derive(Debug, Clone)]
pub struct SimResult<'a> {
    /// Match identifier of the table that contains `word`.
    pub match_id: u32,
    /// Identifier of the table that contains `word`.
    pub table_id: u32,
    /// The table word that was similar enough to the input text.
    pub word: Cow<'a, str>,
    /// Similarity score between `word` and the processed input text.
    pub similarity: f64,
}

/// Generic match-result accessors for [SimResult].
impl MatchResultTrait<'_> for SimResult<'_> {
    /// Returns the identifier of the table that contains the matched word.
    fn table_id(&self) -> u32 {
        let Self { table_id, .. } = self;
        *table_id
    }

    /// Returns the matched word as a string slice, whether it is stored
    /// borrowed or owned.
    fn word(&self) -> &str {
        &self.word
    }
}

/// A struct representing a similarity matcher.
///
/// The [SimMatcher] struct is responsible for managing and processing similarity matching
/// operations on provided textual data using predefined tables. It preprocesses the input
/// text by removing special (non-word) characters and then scores the result against every
/// word of every table, using the similarity algorithm each table selects through its
/// [SimMatchType] (Levenshtein, Damerau-Levenshtein, Indel, Jaro or Jaro-Winkler).
///
/// # Fields
///
/// - `remove_special_pattern` ([Regex]): A compiled regular expression used for removing
///   special characters from the text before processing.
/// - `sim_processed_table_list` ([`Vec<SimProcessedTable>`]): A vector containing preprocessed
///   tables, where each table consists of a list of words and identifiers ready for
///   similarity matching.
///
/// # Example
///
/// ```
/// use matcher_rs::{SimMatcher, SimTable, SimMatchType};
///
/// let word_list = vec!["example1", "example2"];
///
/// let sim_tables = vec![
///     SimTable {
///         table_id: 1,
///         match_id: 1,
///         sim_match_type: SimMatchType::Levenshtein,
///         word_list: &word_list,
///         threshold: 0.8,
///     },
///     // Add more SimTable instances as desired
/// ];
///
/// let matcher = SimMatcher::new(&sim_tables);
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct SimMatcher {
    /// Pattern whose matches are stripped from the input text before scoring.
    #[cfg_attr(feature = "serde", serde(with = "serde_regex"))]
    remove_special_pattern: Regex,
    /// Owned tables the processed input text is scored against.
    sim_processed_table_list: Vec<SimProcessedTable>,
}

impl SimMatcher {
    /// Creates a new instance of [SimMatcher] by preprocessing the provided list of [SimTable] instances.
    ///
    /// This function takes a reference to a list of [SimTable] instances provided by the user and
    /// preprocesses each table to create corresponding `SimProcessedTable` instances. The preprocessing
    /// involves compiling a regular expression for removing special characters and converting the
    /// words and match identifiers to owned [String] types.
    ///
    /// # Parameters
    ///
    /// - `sim_table_list` (&[SimTable]): A reference to a slice of [SimTable] instances to be preprocessed.
    ///
    /// # Returns
    ///
    /// - [SimMatcher]: A new instance of [SimMatcher] with preprocessed tables ready for similarity matching.
    ///
    /// # Example
    ///
    /// ```
    /// use matcher_rs::{SimMatcher, SimTable, SimMatchType};
    ///
    /// let word_list = vec!["example1", "example2"];
    ///
    /// let sim_tables = vec![
    ///     SimTable {
    ///         table_id: 1,
    ///         match_id: 1,
    ///         sim_match_type: SimMatchType::Levenshtein,
    ///         word_list: &word_list,
    ///         threshold: 0.8,
    ///     },
    ///     // Add more SimTable instances as desired
    /// ];
    ///
    /// let matcher = SimMatcher::new(&sim_tables);
    /// ```
    pub fn new(sim_table_list: &[SimTable]) -> SimMatcher {
        SimMatcher {
            remove_special_pattern: Regex::new(r"\W+").unwrap(),
            sim_processed_table_list: sim_table_list
                .iter()
                .map(|sim_table| SimProcessedTable {
                    table_id: sim_table.table_id,
                    match_id: sim_table.match_id,
                    sim_match_type: sim_table.sim_match_type,
                    word_list: sim_table
                        .word_list
                        .iter()
                        .map(|&word| word.to_owned())
                        .collect::<Vec<String>>(),
                    threshold: sim_table.threshold,
                })
                .collect(),
        }
    }
}

impl<'a> TextMatcherTrait<'a, SimResult<'a>> for SimMatcher {
    /// Checks if the given text has any similarity match within the preprocessed tables.
    ///
    /// This function processes the input text by removing special characters and then
    /// checks if the processed text has any similarity match within the preprocessed tables.
    /// Various similarity metrics are used based on the type specified in each table.
    /// The function returns `true` if there is any match that meets the threshold specified
    /// for similarity, otherwise `false`.
    ///
    /// # Parameters
    ///
    /// - `text` (&str): A reference to the text string to be processed and checked
    ///   against the preprocessed tables for similarity matches.
    ///
    /// # Returns
    ///
    /// - (bool): `true` if a similarity match is found that meets the specified threshold, otherwise `false`.
    ///
    /// # Example
    ///
    /// ```
    /// use matcher_rs::{SimMatcher, SimTable, TextMatcherTrait, SimMatchType};
    ///
    /// let word_list = vec!["example1", "example2"];
    ///
    /// let sim_tables = vec![
    ///     SimTable {
    ///         table_id: 1,
    ///         match_id: 1,
    ///         word_list: &word_list,
    ///         sim_match_type: SimMatchType::Levenshtein,
    ///         threshold: 0.8,
    ///     },
    ///     // Add more SimTable instances as desired
    /// ];
    ///
    /// let matcher = SimMatcher::new(&sim_tables);
    ///
    /// let is_matched = matcher.is_match("example3");
    ///
    /// if is_matched {
    ///     println!("The text has a similarity match in the preprocessed tables.");
    /// } else {
    ///     println!("No similarity match found.");
    /// }
    /// ```
    fn is_match(&self, text: &str) -> bool {
        let processed_text = self.remove_special_pattern.replace_all(text, "");

        self.sim_processed_table_list
            .iter()
            .any(|sim_table| match sim_table.sim_match_type {
                SimMatchType::Levenshtein => sim_table.word_list.iter().any(|text| {
                    distance::levenshtein::normalized_similarity_with_args(
                        text.chars(),
                        processed_text.chars(),
                        &distance::levenshtein::Args::default().score_cutoff(sim_table.threshold),
                    )
                    .is_some()
                }),
                SimMatchType::DamerauLevenshtein => sim_table.word_list.iter().any(|text| {
                    distance::damerau_levenshtein::normalized_similarity_with_args(
                        text.chars(),
                        processed_text.chars(),
                        &distance::damerau_levenshtein::Args::default()
                            .score_cutoff(sim_table.threshold),
                    )
                    .is_some()
                }),
                SimMatchType::Indel => sim_table.word_list.iter().any(|text| {
                    distance::indel::normalized_similarity_with_args(
                        text.chars(),
                        processed_text.chars(),
                        &distance::indel::Args::default().score_cutoff(sim_table.threshold),
                    )
                    .is_some()
                }),
                SimMatchType::Jaro => sim_table.word_list.iter().any(|text| {
                    distance::jaro::normalized_similarity_with_args(
                        text.chars(),
                        processed_text.chars(),
                        &distance::jaro::Args::default().score_cutoff(sim_table.threshold),
                    )
                    .is_some()
                }),
                SimMatchType::JaroWinkler => sim_table.word_list.iter().any(|text| {
                    distance::jaro_winkler::normalized_similarity_with_args(
                        text.chars(),
                        processed_text.chars(),
                        &distance::jaro_winkler::Args::default().score_cutoff(sim_table.threshold),
                    )
                    .is_some()
                }),
            })
    }

    /// Processes the input text and returns a list of similarity results based on the
    /// preprocessed tables and their respective similarity match types and thresholds.
    ///
    /// This function removes special characters from the input text, then iterates through
    /// each preprocessed similarity table to calculate the similarity scores between the
    /// processed input text and each word in the table's word list. The results are collected
    /// into a vector of `SimResult` instances for each word that meets the similarity threshold.
    ///
    /// # Parameters
    ///
    /// - `text` (&str): A reference to the text string to be processed and checked against
    ///   the preprocessed tables for similarity matches.
    ///
    /// # Returns
    ///
    /// - [`Vec<SimResult>`]: A vector containing `SimResult` instances for each word that meets
    ///   the similarity threshold specified in the corresponding similarity table.
    ///
    /// # Example
    ///
    /// ```
    /// use matcher_rs::{SimMatcher, SimTable, TextMatcherTrait, SimMatchType};
    ///
    /// let word_list = vec!["example1", "example2"];
    ///
    /// let sim_tables = vec![
    ///     SimTable {
    ///         table_id: 1,
    ///         match_id: 1,
    ///         word_list: &word_list,
    ///         sim_match_type: SimMatchType::Levenshtein,
    ///         threshold: 0.8,
    ///     },
    ///     // Add more SimTable instances as desired
    /// ];
    ///
    /// let matcher = SimMatcher::new(&sim_tables);
    ///
    /// let results = matcher.process("example3");
    ///
    /// for result in results {
    ///     println!(
    ///         "Found match in table {}: word={}, similarity={}",
    ///         result.table_id, result.word, result.similarity
    ///     );
    /// }
    /// ```
    fn process(&'a self, text: &str) -> Vec<SimResult<'a>> {
        let processed_text = self.remove_special_pattern.replace_all(text, "");

        let mut result_list = Vec::new();

        for sim_table in &self.sim_processed_table_list {
            match sim_table.sim_match_type {
                SimMatchType::Levenshtein => {
                    result_list.extend(sim_table.word_list.iter().filter_map(|text| {
                        distance::levenshtein::normalized_similarity_with_args(
                            text.chars(),
                            processed_text.chars(),
                            &distance::levenshtein::Args::default()
                                .score_cutoff(sim_table.threshold),
                        )
                        .map(|similarity| SimResult {
                            match_id: sim_table.match_id,
                            table_id: sim_table.table_id,
                            word: Cow::Borrowed(text),
                            similarity,
                        })
                    }))
                }
                SimMatchType::DamerauLevenshtein => {
                    result_list.extend(sim_table.word_list.iter().filter_map(|text| {
                        distance::damerau_levenshtein::normalized_similarity_with_args(
                            text.chars(),
                            processed_text.chars(),
                            &distance::damerau_levenshtein::Args::default()
                                .score_cutoff(sim_table.threshold),
                        )
                        .map(|similarity| SimResult {
                            match_id: sim_table.match_id,
                            table_id: sim_table.table_id,
                            word: Cow::Borrowed(text),
                            similarity,
                        })
                    }))
                }
                SimMatchType::Indel => {
                    result_list.extend(sim_table.word_list.iter().filter_map(|text| {
                        distance::indel::normalized_similarity_with_args(
                            text.chars(),
                            processed_text.chars(),
                            &distance::indel::Args::default().score_cutoff(sim_table.threshold),
                        )
                        .map(|similarity| SimResult {
                            match_id: sim_table.match_id,
                            table_id: sim_table.table_id,
                            word: Cow::Borrowed(text),
                            similarity,
                        })
                    }))
                }
                SimMatchType::Jaro => {
                    result_list.extend(sim_table.word_list.iter().filter_map(|text| {
                        distance::jaro::normalized_similarity_with_args(
                            text.chars(),
                            processed_text.chars(),
                            &distance::jaro::Args::default().score_cutoff(sim_table.threshold),
                        )
                        .map(|similarity| SimResult {
                            match_id: sim_table.match_id,
                            table_id: sim_table.table_id,
                            word: Cow::Borrowed(text),
                            similarity,
                        })
                    }))
                }
                SimMatchType::JaroWinkler => {
                    result_list.extend(sim_table.word_list.iter().filter_map(|text| {
                        distance::jaro_winkler::normalized_similarity_with_args(
                            text.chars(),
                            processed_text.chars(),
                            &distance::jaro_winkler::Args::default()
                                .score_cutoff(sim_table.threshold),
                        )
                        .map(|similarity| SimResult {
                            match_id: sim_table.match_id,
                            table_id: sim_table.table_id,
                            word: Cow::Borrowed(text),
                            similarity,
                        })
                    }))
                }
            }
        }

        result_list
    }
}