matcher_rs 0.7.1

A high-performance matcher designed to solve LOGICAL and TEXT VARIATIONS problems in word matching, implemented in Rust.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
use std::borrow::Cow;
use std::cell::RefCell;
use std::rc::Rc;

use id_set::IdSet;
use rapidfuzz::distance;
use serde::{Deserialize, Serialize};

use crate::{
    matcher::{MatchResultTrait, TextMatcherTrait},
    process::process_matcher::{
        build_process_type_tree, reduce_text_process_with_tree, ProcessType, ProcessTypeBitNode,
    },
};

/// Selects the similarity algorithm used to compare a text against the words of a table.
///
/// Levenshtein is the only algorithm currently available.
///
/// # Variants
///
/// * [SimMatchType::Levenshtein] - Levenshtein-based similarity, a string metric derived from the
///   edit distance between two sequences.
///
/// The enum variants are serialized and deserialized using the `snake_case` naming convention.
#[derive(Serialize, Deserialize, Clone, Copy, Debug, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum SimMatchType {
    /// Compare texts with normalized Levenshtein similarity.
    Levenshtein,
}

/// User-facing description of one similarity table.
///
/// A [SimTable] borrows its words from the caller; [SimMatcher::new] copies them into owned
/// storage, so the table does not need to outlive the matcher built from it.
///
/// # Fields
///
/// * `table_id` - Identifier of the table; copied into every result produced from this table.
/// * `match_id` - Identifier of the match rule; copied into every result produced from this table.
/// * `process_type` - The text processing applied to the input before comparison, represented by the [ProcessType] enum.
/// * `sim_match_type` - The similarity algorithm to use, represented by the [SimMatchType] enum.
/// * `word_list` - The words the processed input text is compared against.
/// * `threshold` - Score cutoff handed to the similarity computation; comparisons that do not
///   reach it produce no result. (Presumably expected to lie in `0.0..=1.0` since the similarity
///   is normalized — confirm against the rapidfuzz documentation.)
#[derive(Debug, Clone)]
pub struct SimTable<'a> {
    pub table_id: u32,
    pub match_id: u32,
    pub process_type: ProcessType,
    pub sim_match_type: SimMatchType,
    pub word_list: Vec<&'a str>,
    pub threshold: f64,
}

/// Owned, matcher-internal counterpart of [SimTable].
///
/// [SimMatcher::new] creates one of these per input [SimTable], copying every field and turning
/// the borrowed words into owned [String]s so the matcher carries no borrow of the caller's data.
/// Serialization support is only derived when the `serde` feature is enabled.
///
/// # Fields
///
/// * `table_id` - Identifier of the table; reported in every result produced from this table.
/// * `match_id` - Identifier of the match rule; reported in every result produced from this table.
/// * `process_type` - The text processing applied to the input before comparison, represented by the [ProcessType] enum.
/// * `sim_match_type` - The similarity algorithm to use, represented by the [SimMatchType] enum.
/// * `word_list` - Owned words the processed input text is compared against; a word's position
///   in this vector is reported as the `word_id` of a result.
/// * `threshold` - Score cutoff handed to the similarity computation.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
struct SimProcessedTable {
    table_id: u32,
    match_id: u32,
    process_type: ProcessType,
    sim_match_type: SimMatchType,
    word_list: Vec<String>,
    threshold: f64,
}

/// One similarity hit: a table word whose similarity to the processed input reached the table's
/// threshold.
///
/// The matched word is stored as a [Cow] so a result can either borrow the word (as the matcher
/// in this module does, borrowing from its own word lists) or own it.
///
/// # Fields
///
/// * `match_id` - Identifier of the match rule, copied from the table that produced the hit.
/// * `table_id` - Identifier of the table that produced the hit.
/// * `word_id` - Index of the matched word within that table's word list.
/// * `word` - The table word that matched, represented as a [Cow] to allow for both borrowed and owned strings.
/// * `similarity` - The similarity score computed for the match.
#[derive(Debug, Clone)]
pub struct SimResult<'a> {
    pub match_id: u32,
    pub table_id: u32,
    pub word_id: u32,
    pub word: Cow<'a, str>,
    pub similarity: f64,
}

/// Exposes the fields of a [SimResult] through the generic [MatchResultTrait] accessors.
impl MatchResultTrait<'_> for SimResult<'_> {
    /// Returns the identifier of the match rule that produced this result.
    fn match_id(&self) -> u32 {
        self.match_id
    }

    /// Returns the identifier of the table that produced this result.
    fn table_id(&self) -> u32 {
        self.table_id
    }

    /// Returns the index of the matched word within its table's word list.
    ///
    /// This reports the stored `word_id` field rather than a constant, so trait-based callers
    /// see the same value as callers reading the field directly.
    fn word_id(&self) -> u32 {
        self.word_id
    }

    /// Returns the table word that matched.
    fn word(&self) -> &str {
        &self.word
    }

    /// Returns the similarity score; always `Some` for similarity results.
    fn similarity(&self) -> Option<f64> {
        Some(self.similarity)
    }
}

/// Similarity matcher: compares processed variants of an input text against the words of a set
/// of tables and reports the words whose similarity reaches each table's threshold.
///
/// The matcher keeps a process type tree (built from the process types of all tables) that is
/// used to derive the processed text variants, plus an owned copy of every table.
/// Serialization support is only derived when the `serde` feature is enabled.
///
/// # Fields
///
/// * `process_type_tree` - A vector of `ProcessTypeBitNode`, the tree handed to
///   `reduce_text_process_with_tree` to process input texts.
/// * `sim_processed_table_list` - A vector of `SimProcessedTable`, the owned tables the
///   processed text is compared against.
///
/// # Example
///
/// ```
/// use matcher_rs::{SimMatcher, SimTable, SimMatchType, ProcessType};
///
/// // Create a list of `SimTable` with the required properties
/// let sim_table_list = vec![SimTable {
///     table_id: 1,
///     match_id: 1,
///     process_type: ProcessType::None,
///     sim_match_type: SimMatchType::Levenshtein,
///     word_list: vec!["example", "test"],
///     threshold: 0.8,
/// }];
///
/// // Instantiate a `SimMatcher` with the list of `SimTable`
/// let matcher = SimMatcher::new(&sim_table_list);
///
/// // Use `matcher` methods for performing similarity matching operations
/// ```
///
/// Matching itself is exposed through the `TextMatcherTrait` implementation: `is_match` answers
/// whether anything matches, while `process` / `process_iter` return the individual results.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct SimMatcher {
    process_type_tree: Vec<ProcessTypeBitNode>,
    sim_processed_table_list: Vec<SimProcessedTable>,
}

impl SimMatcher {
    /// Builds a [SimMatcher] from a list of borrowed [SimTable] definitions.
    ///
    /// Every input table is copied into an owned `SimProcessedTable` (its words become
    /// [String]s), and the process type bits of all tables are collected so that a single
    /// process type tree covering every table can be built.
    ///
    /// # Parameters
    ///
    /// * `sim_table_list` - The tables to match against. They are only read; the returned
    ///   matcher does not borrow from them.
    ///
    /// # Returns
    ///
    /// A [SimMatcher] holding:
    /// * `process_type_tree` - the tree built by `build_process_type_tree` from the set of
    ///   process type bits seen in `sim_table_list`.
    /// * `sim_processed_table_list` - one owned table per input table, in input order.
    pub fn new(sim_table_list: &[SimTable]) -> SimMatcher {
        let table_count = sim_table_list.len();
        let mut used_process_types = IdSet::with_capacity(table_count);
        let mut owned_tables = Vec::with_capacity(table_count);

        for table in sim_table_list {
            // Record which process types are needed so the tree covers every table.
            used_process_types.insert(table.process_type.bits() as usize);

            let owned_words: Vec<String> = table
                .word_list
                .iter()
                .map(|word| String::from(*word))
                .collect();

            owned_tables.push(SimProcessedTable {
                table_id: table.table_id,
                match_id: table.match_id,
                process_type: table.process_type,
                sim_match_type: table.sim_match_type,
                word_list: owned_words,
                threshold: table.threshold,
            });
        }

        SimMatcher {
            process_type_tree: build_process_type_tree(&used_process_types),
            sim_processed_table_list: owned_tables,
        }
    }
}

impl<'a> TextMatcherTrait<'a, SimResult<'a>> for SimMatcher {
    /// Reports whether `text` is similar enough to at least one word of any table.
    ///
    /// The text is first expanded into its processed variants with
    /// `reduce_text_process_with_tree`; the variants are then handed to
    /// `_is_match_with_processed_text_process_type_set`, which performs the comparisons.
    ///
    /// # Parameters
    ///
    /// * `text` - A string slice representing the input text to be checked for similarity matches.
    ///
    /// # Returns
    ///
    /// Returns `false` for empty input (no preprocessing is done in that case); otherwise
    /// returns `true` exactly when one of the processed variants matches a table entry.
    fn is_match(&'a self, text: &'a str) -> bool {
        // `&&` short-circuits, so empty input never reaches the preprocessing step.
        !text.is_empty()
            && self._is_match_with_processed_text_process_type_set(
                &reduce_text_process_with_tree(&self.process_type_tree, text),
            )
    }

    /// Reports whether any processed text variant matches a word of any applicable table.
    ///
    /// A table is applicable to a variant when the variant's process type set contains the
    /// table's process type bits. For applicable tables, each word is compared with the variant
    /// using the table's similarity algorithm (e.g., Levenshtein) and the table's threshold as
    /// score cutoff. The search stops at the first match.
    ///
    /// # Parameters
    ///
    /// * `processed_text_process_type_set` - A reference to a list of tuples where each tuple consists of:
    ///   - A processed text variant represented as a [`Cow<str>`].
    ///   - An [IdSet] containing the process type identifiers associated with the processed text.
    ///
    /// # Returns
    ///
    /// Returns `true` if the similarity computation yields a score (i.e. the cutoff is reached)
    /// for at least one (variant, word) pair; otherwise, returns `false`.
    fn _is_match_with_processed_text_process_type_set(
        &'a self,
        processed_text_process_type_set: &[(Cow<'a, str>, id_set::IdSet)],
    ) -> bool {
        processed_text_process_type_set
            .iter()
            .any(|(variant, variant_process_types)| {
                self.sim_processed_table_list
                    .iter()
                    // Only tables whose process type produced this variant are relevant.
                    .filter(|table| {
                        variant_process_types.contains(table.process_type.bits() as usize)
                    })
                    .any(|table| match table.sim_match_type {
                        SimMatchType::Levenshtein => table.word_list.iter().any(|word| {
                            // With a score cutoff set, a score is only returned when the
                            // cutoff is reached, so `is_some()` is the match test.
                            let score = distance::levenshtein::normalized_similarity_with_args(
                                word.chars(),
                                variant.chars(),
                                &distance::levenshtein::Args::default()
                                    .score_cutoff(table.threshold),
                            );
                            score.is_some()
                        }),
                    })
            })
    }

    /// Collects every similarity match for `text` into a vector.
    ///
    /// The text is expanded into its processed variants with `reduce_text_process_with_tree`
    /// (driven by `process_type_tree`), and the variants are passed to
    /// `_process_with_processed_text_process_type_set`, which compares them against the tables
    /// and accumulates the hits.
    ///
    /// # Parameters
    ///
    /// * `text` - A string slice representing the input text to be processed and checked for similarity matches.
    ///
    /// # Returns
    ///
    /// Returns a vector of [SimResult] instances, each carrying the `match_id`, `table_id`,
    /// `word_id`, `word`, and similarity score of one matched table entry. Empty input yields an
    /// empty vector without any preprocessing.
    fn process(&'a self, text: &'a str) -> Vec<SimResult<'a>> {
        if !text.is_empty() {
            let text_variants = reduce_text_process_with_tree(&self.process_type_tree, text);
            self._process_with_processed_text_process_type_set(&text_variants)
        } else {
            Vec::new()
        }
    }

    /// Processes the given text and returns a **lazy** iterator over [SimResult] matches.
    ///
    /// Unlike [`process`], which eagerly collects all results into a [`Vec`], this method
    /// returns a [`Box<dyn Iterator>`] that yields results on demand. Text preprocessing
    /// (`reduce_text_process_with_tree`) is performed once upfront. The similarity comparisons
    /// for a given (processed text, table) pair are only run when the caller advances the
    /// iterator to that pair; the hits of that pair are buffered in a small [`Vec`] and then
    /// yielded one by one.
    ///
    /// Each `(table_id, word index)` pair is compared at most once: the pair is recorded in a
    /// hash set captured by the iterator *before* the comparison, so a word that was already
    /// compared against an earlier variant is skipped for later variants whether or not it
    /// matched. The set is keyed by the `(table_id, index)` tuple, so its memory use depends only
    /// on the number of pairs visited, not on the magnitude of `table_id`, and no bit-packing
    /// into a `usize` (which would overflow on 32-bit targets) is involved.
    ///
    /// # Parameters
    ///
    /// * `text` - A string slice representing the input text to be processed and checked for similarity matches.
    ///
    /// # Returns
    ///
    /// * `Box<dyn Iterator<Item = SimResult<'a>> + 'a>` — a lazy iterator of similarity match
    ///   results; empty when `text` is empty.
    ///
    /// # Note — `!Send`
    ///
    /// The returned iterator captures an `Rc<RefCell<_>>` for deduplication and is therefore
    /// **not `Send`**. It cannot be sent across thread boundaries. Collect with
    /// `.collect::<Vec<_>>()` before crossing a thread boundary.
    fn process_iter(&'a self, text: &'a str) -> Box<dyn Iterator<Item = SimResult<'a>> + 'a> {
        if text.is_empty() {
            return Box::new(std::iter::empty());
        }

        let processed_text_process_type_set =
            reduce_text_process_with_tree(&self.process_type_tree, text);

        // The dedup set lives in Rc<RefCell<_>> so the nested FnMut closures can share it:
        // each level clones the handle instead of moving the set out of a captured variable.
        let seen_table_words = Rc::new(RefCell::new(
            std::collections::HashSet::<(u32, usize)>::new(),
        ));

        Box::new(processed_text_process_type_set.into_iter().flat_map(
            move |(processed_text, process_type_set)| {
                let seen_table_words = Rc::clone(&seen_table_words);
                self.sim_processed_table_list
                    .iter()
                    // Only tables whose process type produced this variant are relevant.
                    .filter(move |table| {
                        process_type_set.contains(table.process_type.bits() as usize)
                    })
                    .flat_map(move |table| {
                        // Word-level results for this (processed_text, table) pair.
                        let batch: Vec<SimResult<'a>> = match table.sim_match_type {
                            SimMatchType::Levenshtein => table
                                .word_list
                                .iter()
                                .enumerate()
                                .filter_map(|(index, word)| {
                                    // `insert` returns false when the pair was already
                                    // visited; such words are not compared again.
                                    if !seen_table_words
                                        .borrow_mut()
                                        .insert((table.table_id, index))
                                    {
                                        return None;
                                    }
                                    // With a score cutoff set, no score is returned when the
                                    // cutoff is not reached, which `?` turns into "no result".
                                    let similarity =
                                        distance::levenshtein::normalized_similarity_with_args(
                                            word.chars(),
                                            processed_text.chars(),
                                            &distance::levenshtein::Args::default()
                                                .score_cutoff(table.threshold),
                                        )?;
                                    Some(SimResult {
                                        match_id: table.match_id,
                                        table_id: table.table_id,
                                        word_id: index as u32,
                                        word: Cow::Borrowed(word),
                                        similarity,
                                    })
                                })
                                .collect(),
                        };
                        batch.into_iter()
                    })
            },
        ))
    }

    /// Compares the given processed text variants against the similarity tables and returns
    /// every match found.
    ///
    /// A table is applicable to a variant when the variant's process type set contains the
    /// table's process type bits. For applicable tables, each word is compared with the variant
    /// using the table's similarity algorithm (e.g., Levenshtein) with the table's threshold as
    /// score cutoff, and a [SimResult] is recorded whenever a score is returned.
    ///
    /// Each `(table_id, word index)` pair is compared at most once: the pair is recorded in a
    /// hash set *before* the comparison, so a word that was already compared against an earlier
    /// variant is skipped for later variants whether or not it matched. The set is keyed by the
    /// `(table_id, index)` tuple, so its memory use depends only on the number of pairs visited,
    /// not on the magnitude of `table_id`, and no bit-packing into a `usize` (which would
    /// overflow on 32-bit targets) is involved.
    ///
    /// # Parameters
    ///
    /// * `processed_text_process_type_set` - A reference to a list of tuples where each tuple consists of:
    ///   - A processed text variant represented as a [`Cow<str>`].
    ///   - An [IdSet] containing the process type identifiers associated with the processed text.
    ///
    /// # Returns
    ///
    /// Returns a vector of [SimResult] instances, in the order the matches were found, each
    /// containing:
    /// - `match_id`: The identifier for the match.
    /// - `table_id`: The identifier of the similarity table where the match was found.
    /// - `word_id`: The index of the word in the similarity table's word list.
    /// - `word`: The word from the similarity table's word list that matched the processed text.
    /// - `similarity`: The similarity score of the match.
    fn _process_with_processed_text_process_type_set(
        &'a self,
        processed_text_process_type_set: &[(Cow<'a, str>, IdSet)],
    ) -> Vec<SimResult<'a>> {
        let mut result_list = Vec::new();
        let mut seen_table_words = std::collections::HashSet::<(u32, usize)>::new();

        for (processed_text, process_type_set) in processed_text_process_type_set {
            // Only tables whose process type produced this variant are relevant.
            let applicable_tables = self
                .sim_processed_table_list
                .iter()
                .filter(|table| process_type_set.contains(table.process_type.bits() as usize));

            for table in applicable_tables {
                match table.sim_match_type {
                    SimMatchType::Levenshtein => {
                        for (index, word) in table.word_list.iter().enumerate() {
                            // `insert` returns false when the pair was already visited;
                            // such words are not compared again.
                            if !seen_table_words.insert((table.table_id, index)) {
                                continue;
                            }

                            // With a score cutoff set, a score is only returned when the
                            // cutoff is reached.
                            if let Some(similarity) =
                                distance::levenshtein::normalized_similarity_with_args(
                                    word.chars(),
                                    processed_text.chars(),
                                    &distance::levenshtein::Args::default()
                                        .score_cutoff(table.threshold),
                                )
                            {
                                result_list.push(SimResult {
                                    match_id: table.match_id,
                                    table_id: table.table_id,
                                    word_id: index as u32,
                                    word: Cow::Borrowed(word),
                                    similarity,
                                });
                            }
                        }
                    }
                }
            }
        }

        result_list
    }
}