matcher_rs 0.8.1

A high-performance matcher designed to solve LOGICAL and TEXT VARIATIONS problems in word matching, implemented in Rust.
Documentation
use std::{borrow::Cow, collections::HashSet};

use rapidfuzz::distance;
use serde::{Deserialize, Serialize};

use crate::{
    matcher::{MatchResultTrait, TextMatcherInternal, TextMatcherTrait},
    process::process_matcher::{
        ProcessType, ProcessTypeBitNode, ProcessedTextSet, build_process_type_tree,
        reduce_text_process_with_tree,
    },
};

/// Enumeration representing the types of similarity matching algorithms available.
///
/// Currently, this enum only supports the Levenshtein distance algorithm.
///
/// # Variants
/// * `Levenshtein` - Represents the Levenshtein distance algorithm, a string metric for measuring the difference between two sequences.
///
/// The enum variants are serialized and deserialized using the `snake_case` naming convention.
#[derive(Serialize, Deserialize, Clone, Copy, Debug, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum SimMatchType {
    Levenshtein,
}

/// Represents a table structure to be used in the similarity matching process.
///
/// This structure holds various properties required for similarity matching using different algorithms.
///
/// # Type Parameters
/// * `'a` - The lifetime of the borrowed strings in the word list.
///
/// # Fields
/// * `table_id` - A unique identifier for the table.
/// * `match_id` - A unique identifier for the matching process.
/// * `process_type` - The type of processing to be applied, represented by the [`ProcessType`] enum.
/// * `sim_match_type` - The type of similarity matching algorithm to be used, represented by the [`SimMatchType`] enum.
/// * `word_list` - A list of words to be used in the matching process.
/// * `threshold` - A float value representing the similarity threshold for matching.
#[derive(Debug, Clone)]
pub struct SimTable<'a> {
    pub table_id: u32,
    pub match_id: u32,
    pub process_type: ProcessType,
    pub sim_match_type: SimMatchType,
    pub word_list: Vec<&'a str>,
    pub threshold: f64,
}

/// Represents a processed table used in the similarity matching process.
///
/// This struct is a concrete version of the [`SimTable`] struct, with ownership over
/// the word list.
///
/// # Fields
///
/// * `table_id` - A unique identifier for the table.
/// * `match_id` - A unique identifier for the matching process.
/// * `process_type` - The type of processing to be applied, represented by the [`ProcessType`] enum.
/// * `sim_match_type` - The type of similarity matching algorithm to be used, represented by the [`SimMatchType`] enum.
/// * `word_list` - A list of words over which the matching operation is performed. This is an owned vector of strings.
/// * `threshold` - A float value representing the similarity threshold for a match.
#[derive(Debug, Clone)]
struct SimProcessedTable {
    table_id: u32,
    match_id: u32,
    process_type: ProcessType,
    sim_match_type: SimMatchType,
    word_list: Vec<String>,
    threshold: f64,
}

/// Represents the result of a similarity matching operation.
///
/// This struct holds information about the match including identifiers for the match and table,
/// the word that was matched, and the similarity score of the match. The word is represented as a
/// `Cow` (Clone on Write) for efficient handling of borrowed or owned strings. This allows
/// flexibility in returning either a borrowed string or an owned string.
///
/// # Type Parameters
/// * `'a` - The lifetime of the matched word content.
///
/// # Fields
/// * `match_id` - A unique identifier for the matching process.
/// * `table_id` - A unique identifier for the table.
/// * `word_id` - A unique identifier for the word within the table.
/// * `word` - The word that was matched, represented as a `Cow`.
/// * `similarity` - A float value representing the similarity score of the match.
#[derive(Debug, Clone)]
pub struct SimResult<'a> {
    pub match_id: u32,
    pub table_id: u32,
    pub word_id: u32,
    pub word: Cow<'a, str>,
    pub similarity: f64,
}

impl MatchResultTrait<'_> for SimResult<'_> {
    fn match_id(&self) -> u32 {
        self.match_id
    }
    fn table_id(&self) -> u32 {
        self.table_id
    }
    fn word_id(&self) -> u32 {
        0
    }
    fn word(&self) -> &str {
        &self.word
    }
    fn similarity(&self) -> Option<f64> {
        Some(self.similarity)
    }
}

/// The [`SimMatcher`] struct is responsible for performing similarity matching operations
/// based on different processing types and similarity algorithms.
///
/// This struct maintains a process type tree and a list of pre-processed tables that contain
/// the necessary information for performing similarity matching on texts.
///
/// # Examples
///
/// ```rust
/// use matcher_rs::{SimMatcher, SimTable, SimMatchType, ProcessType, TextMatcherTrait};
///
/// let sim_table_list = vec![SimTable {
///     table_id: 1,
///     match_id: 1,
///     process_type: ProcessType::None,
///     sim_match_type: SimMatchType::Levenshtein,
///     word_list: vec!["example", "test"],
///     threshold: 0.6, // Low threshold to allow matching "exampel"
/// }];
///
/// let matcher = SimMatcher::new(&sim_table_list);
///
/// assert!(matcher.is_match("exampel"));
/// ```
///
/// The [`SimMatcher`] struct provides methods for checking if a text matches any of the processed tables
/// and for processing texts to obtain a list of similarity results.
#[derive(Debug, Clone)]
pub struct SimMatcher {
    process_type_tree: Box<[ProcessTypeBitNode]>,
    sim_processed_table_list: Box<[SimProcessedTable]>,
}

impl SimMatcher {
    /// Creates a new instance of [`SimMatcher`] from a list of [`SimTable`].
    ///
    /// This function initializes a [`SimMatcher`] by processing each [`SimTable`] in the input list.
    /// It extracts the process types and constructs a tree structure used for processing texts.
    /// Additionally, it converts the word lists in each [`SimTable`] from borrowed strings to owned strings.
    ///
    /// # Arguments
    /// * `sim_table_list` - A slice of [`SimTable`] references to be processed.
    ///
    /// # Returns
    /// An initialized [`SimMatcher`].
    pub fn new(sim_table_list: &[SimTable]) -> SimMatcher {
        let mut process_type_set = HashSet::with_capacity(sim_table_list.len());
        let mut sim_processed_table_list = Vec::with_capacity(sim_table_list.len());

        for sim_table in sim_table_list {
            process_type_set.insert(sim_table.process_type.bits());
            sim_processed_table_list.push(SimProcessedTable {
                table_id: sim_table.table_id,
                match_id: sim_table.match_id,
                process_type: sim_table.process_type,
                sim_match_type: sim_table.sim_match_type,
                word_list: sim_table
                    .word_list
                    .iter()
                    .map(|&word| word.to_owned())
                    .collect::<Vec<String>>(),
                threshold: sim_table.threshold,
            })
        }

        let process_type_tree = build_process_type_tree(&process_type_set).into_boxed_slice();

        SimMatcher {
            process_type_tree,
            sim_processed_table_list: sim_processed_table_list.into_boxed_slice(),
        }
    }
}

impl<'a> TextMatcherTrait<'a, SimResult<'a>> for SimMatcher {
    /// Checks if the provided text matches any entry in the processed tables.
    ///
    /// This function processes the input text to generate a set of processed text variants
    /// based on the defined process types. It then delegates the actual matching logic to a
    /// helper function that checks if any of these processed text variants match the entries
    /// in the `sim_processed_table_list`.
    ///
    /// # Arguments
    /// * `text` - A string slice representing the input text to be checked for similarity matches.
    ///
    /// # Returns
    /// `true` if the processed text matches any entry in the processed tables; otherwise returns `false`.
    fn is_match(&'a self, text: &'a str) -> bool {
        if text.is_empty() {
            return false;
        }

        let processed_text_process_type_set =
            reduce_text_process_with_tree(&self.process_type_tree, text);

        self.is_match_preprocessed(&processed_text_process_type_set)
    }
    /// Returns a **lazy** iterator over [`SimResult`] matches for the given text.
    ///
    /// Text preprocessing (`reduce_text_process_with_tree`) is performed once upfront.
    /// Each similarity comparison is then driven lazily — one word at a time — as the
    /// caller advances the iterator. Early termination (e.g. `.next()`, `.take(n)`,
    /// `.find()`) avoids unnecessary similarity computations.
    ///
    /// Internally a 3-level index state machine (`processed_text` → `table` → `word`)
    /// tracks progress. A [`HashSet`] deduplicates `(table_id, word_index)` pairs across
    /// processed-text variants.
    ///
    /// # Arguments
    /// * `text` - The input text to be processed and matched.
    ///
    /// # Returns
    /// An `impl Iterator<Item = SimResult<'a>>` — a lazy iterator of similarity results.
    fn process_iter(&'a self, text: &'a str) -> impl Iterator<Item = SimResult<'a>> + 'a {
        gen move {
            if text.is_empty() {
                return;
            }

            let processed = reduce_text_process_with_tree(&self.process_type_tree, text);
            let mut table_id_index_set = HashSet::new();

            for (processed_text, process_type_set) in processed {
                for sim_processed_table in self.sim_processed_table_list.iter() {
                    if !process_type_set.contains(&sim_processed_table.process_type.bits()) {
                        continue;
                    }

                    match sim_processed_table.sim_match_type {
                        SimMatchType::Levenshtein => {
                            for (index, sim_text) in
                                sim_processed_table.word_list.iter().enumerate()
                            {
                                let table_id_index =
                                    ((sim_processed_table.table_id as usize) << 32) | index;

                                if table_id_index_set.insert(table_id_index)
                                    && let Some(similarity) =
                                        distance::levenshtein::normalized_similarity_with_args(
                                            sim_text.chars(),
                                            processed_text.chars(),
                                            &distance::levenshtein::Args::default()
                                                .score_cutoff(sim_processed_table.threshold),
                                        )
                                {
                                    yield SimResult {
                                        match_id: sim_processed_table.match_id,
                                        table_id: sim_processed_table.table_id,
                                        word_id: index as u32,
                                        word: Cow::Borrowed(sim_text),
                                        similarity,
                                    };
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}

impl<'a> TextMatcherInternal<'a, SimResult<'a>> for SimMatcher {
    /// Checks if any processed text variant matches an entry in the similarity tables.
    ///
    /// This helper function iterates through the processed text variants and their corresponding
    /// process type sets. For each variant, it checks against all entries in the similarity tables
    /// to see if there is a match based on the defined similarity match type (e.g., Levenshtein).
    ///
    /// # Parameters
    ///
    /// * `processed_text_process_type_set` - A reference to a list of tuples where each tuple consists of:
    ///   - A processed text variant represented as a [`Cow<str>`].
    ///   - An [`HashSet`] containing the process type identifiers associated with the processed text.
    ///
    /// # Returns
    ///
    /// Returns `true` if any of the processed text variants match an entry in the similarity tables
    /// according to the specified match type and similarity threshold; otherwise, returns `false`.
    fn is_match_preprocessed(
        &'a self,
        processed_text_process_type_set: &ProcessedTextSet<'a>,
    ) -> bool {
        for (processed_text, process_type_set) in processed_text_process_type_set {
            for sim_processed_table in &self.sim_processed_table_list {
                if !process_type_set.contains(&sim_processed_table.process_type.bits()) {
                    continue;
                }
                let is_match = match sim_processed_table.sim_match_type {
                    SimMatchType::Levenshtein => sim_processed_table.word_list.iter().any(|text| {
                        distance::levenshtein::normalized_similarity_with_args(
                            text.chars(),
                            processed_text.chars(),
                            &distance::levenshtein::Args::default()
                                .score_cutoff(sim_processed_table.threshold),
                        )
                        .is_some()
                    }),
                };

                if is_match {
                    return true;
                }
            }
        }

        false
    }

    /// Processes the provided set of processed text variants and their corresponding process type sets,
    /// returning a list of similarity results.
    ///
    /// This function iterates through each processed text variant and its associated process type set,
    /// comparing them against entries in the similarity tables to identify matches based on the defined
    /// similarity match type (e.g., Levenshtein). For each match found, the function accumulates the result
    /// with relevant information such as `match_id`, `table_id`, `word_id`, `word`, and the similarity score.
    ///
    /// # Parameters
    ///
    /// * `processed_text_process_type_set` - A reference to a list of tuples where each tuple consists of:
    ///   - A processed text variant represented as a [`Cow<str>`].
    ///   - An [`HashSet`] containing the process type identifiers associated with the processed text.
    ///
    /// # Returns
    ///
    /// Returns a vector of [`SimResult`] instances, each containing information about a matched entry
    /// in the similarity tables, including:
    /// - `match_id`: The identifier for the match.
    /// - `table_id`: The identifier of the similarity table where the match was found.
    /// - `word_id`: The index of the word in the similarity table's word list.
    /// - `word`: The word from the similarity table's word list that matched the processed text.
    /// - `similarity`: The similarity score of the match.
    ///
    /// The function ensures that only unique matches are included in the result list by maintaining
    /// an [`HashSet`] to track already processed table ID and word index combinations.
    fn process_preprocessed(
        &'a self,
        processed_text_process_type_set: &ProcessedTextSet<'a>,
    ) -> Vec<SimResult<'a>> {
        let mut result_list = Vec::new();
        let mut table_id_index_set = HashSet::new();

        for (processed_text, process_type_set) in processed_text_process_type_set {
            for sim_processed_table in &self.sim_processed_table_list {
                if !process_type_set.contains(&sim_processed_table.process_type.bits()) {
                    continue;
                }
                match sim_processed_table.sim_match_type {
                    SimMatchType::Levenshtein => {
                        for (index, text) in sim_processed_table.word_list.iter().enumerate() {
                            let table_id_index =
                                ((sim_processed_table.table_id as usize) << 32) | index;

                            if table_id_index_set.insert(table_id_index)
                                && let Some(similarity) =
                                    distance::levenshtein::normalized_similarity_with_args(
                                        text.chars(),
                                        processed_text.chars(),
                                        &distance::levenshtein::Args::default()
                                            .score_cutoff(sim_processed_table.threshold),
                                    )
                            {
                                result_list.push(SimResult {
                                    match_id: sim_processed_table.match_id,
                                    table_id: sim_processed_table.table_id,
                                    word_id: index as u32,
                                    word: Cow::Borrowed(text),
                                    similarity,
                                });
                            }
                        }
                    }
                }
            }
        }

        result_list
    }
}