matcher_rs 0.8.1

use std::{borrow::Cow, collections::HashSet};

use fancy_regex::{Regex, escape};
use regex::RegexSet;
use serde::{Deserialize, Serialize};

use crate::{
    matcher::{MatchResultTrait, TextMatcherInternal, TextMatcherTrait},
    process::process_matcher::{
        ProcessType, ProcessTypeBitNode, ProcessedTextSet, build_process_type_tree,
        reduce_text_process_with_tree,
    },
};

/// Strategy used to turn the entries of a [`RegexTable`]'s `word_list` into regex patterns.
///
/// The enum derives [`Serialize`] and [`Deserialize`] (variant names are written in
/// `snake_case`), as well as [`Clone`], [`Copy`], [`Debug`] and [`PartialEq`].
///
/// # Variants
/// * `SimilarChar` - The whole word list is combined into one pattern of alternation groups.
/// * `Acrostic` - Every word becomes its own case-insensitive acrostic pattern.
/// * `Regex` - Every word is compiled verbatim as a regular expression.
#[derive(Serialize, Deserialize, Clone, Copy, Debug, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum RegexMatchType {
    /// Each `word_list` entry is escaped, its commas are turned into `|`, and it is wrapped in
    /// a capture group; the groups are then joined with `.?` into a single pattern.
    SimilarChar,
    /// Each word is escaped, its commas are replaced by `.*?[\s\pP]+?`, and the result is
    /// prefixed with `(?i)(?:^|[\s\pP]+?)`.
    Acrostic,
    /// Each word is passed to the regex compiler unchanged.
    Regex,
}

/// User-facing description of one regex table: identifiers, text pre-processing and the words
/// to build patterns from.
///
/// Instances are consumed by [`RegexMatcher::new`], which compiles the `word_list` according to
/// `regex_match_type`.
///
/// # Type Parameters
/// * `'a` - The lifetime of the borrowed strings in the word list.
///
/// # Fields
/// * `table_id` - Identifier of the table; copied into every [`RegexResult`] produced from it.
/// * `match_id` - Identifier of the match; copied into every [`RegexResult`] produced from it.
/// * `process_type` - The [`ProcessType`] whose processed text variants this table is matched against.
/// * `regex_match_type` - The [`RegexMatchType`] strategy used to build patterns from `word_list`.
/// * `word_list` - The raw words; how they are interpreted depends on `regex_match_type`.
#[derive(Debug, Clone)]
pub struct RegexTable<'a> {
    /// Identifier of the table, reported back in results.
    pub table_id: u32,
    /// Identifier of the match, reported back in results.
    pub match_id: u32,
    /// Text processing type this table applies to.
    pub process_type: ProcessType,
    /// How `word_list` entries are converted into regex patterns.
    pub regex_match_type: RegexMatchType,
    /// Borrowed source words for the patterns.
    pub word_list: Vec<&'a str>,
}

/// Compiled form of a regex table, as stored inside a [`RegexPatternTable`].
///
/// The variants correspond to different storage and matching strategies:
/// - `Standard`: a single compiled `fancy_regex` pattern (built for
///   [`RegexMatchType::SimilarChar`] tables).
/// - `List`: individually compiled `fancy_regex` patterns together with their source words.
///   Used when the patterns cannot be compiled into a [`RegexSet`].
/// - `Set`: a `regex` crate [`RegexSet`] that tests all patterns in one pass, together with
///   their source words.
///
/// The type only derives [`Debug`] and [`Clone`]; it is not (de)serializable.
///
/// In `List` and `Set`, `word_list[i]` is the source word of the pattern with index `i`.
#[derive(Debug, Clone)]
enum RegexType {
    Standard {
        /// The single combined pattern.
        regex: Regex,
    },
    List {
        /// One compiled pattern per retained word.
        regex_list: Vec<Regex>,
        /// Source words, parallel to `regex_list`.
        word_list: Vec<String>,
    },
    Set {
        /// All retained patterns compiled into one set.
        regex_set: RegexSet,
        /// Source words, indexed by the pattern index reported by `regex_set`.
        word_list: Vec<String>,
    },
}

/// Internal, compiled counterpart of a [`RegexTable`].
///
/// Fields:
/// - `table_id`: Identifier copied from the originating [`RegexTable`].
/// - `match_id`: Identifier copied from the originating [`RegexTable`].
/// - `process_type`: The [`ProcessType`] copied from the originating table; a processed text
///   variant is only matched against this table when its process-type set contains
///   `process_type.bits()`.
/// - `regex_type`: The compiled pattern(s), see [`RegexType`].
#[derive(Debug, Clone)]
struct RegexPatternTable {
    table_id: u32,
    match_id: u32,
    process_type: ProcessType,
    regex_type: RegexType,
}

/// One hit reported by a [`RegexMatcher`].
///
/// # Type Parameters
/// * `'a` - The lifetime of the matched word content.
///
/// # Fields
/// * `match_id` - The `match_id` of the table that produced the hit.
/// * `table_id` - The `table_id` of the table that produced the hit.
/// * `word_id` - Index of the matching pattern within the table's compiled word list; always
///   `0` for tables compiled into a single pattern.
/// * `word` - For list/set tables, the source word of the matching pattern (borrowed from the
///   matcher). For single-pattern tables, the concatenation of all participating capture
///   groups of the match (owned).
#[derive(Debug, Clone)]
pub struct RegexResult<'a> {
    /// Match identifier of the originating table.
    pub match_id: u32,
    /// Table identifier of the originating table.
    pub table_id: u32,
    /// Index of the matching pattern inside its table.
    pub word_id: u32,
    /// Matched word, borrowed or owned depending on the table kind.
    pub word: Cow<'a, str>,
}

impl MatchResultTrait<'_> for RegexResult<'_> {
    /// Returns the `match_id` stored in this result.
    fn match_id(&self) -> u32 {
        self.match_id
    }

    /// Returns the `table_id` stored in this result.
    fn table_id(&self) -> u32 {
        self.table_id
    }

    /// Returns the `word_id` stored in this result.
    fn word_id(&self) -> u32 {
        self.word_id
    }

    /// Returns the matched word as a string slice, whether it is borrowed or owned.
    fn word(&self) -> &str {
        &*self.word
    }

    /// Regex matches carry no similarity score, so this is always `None`.
    fn similarity(&self) -> Option<f64> {
        Option::None
    }
}

/// A matcher that tests text against compiled regex tables.
///
/// The matcher holds the compiled tables together with a process-type tree. Input text is
/// first expanded into processed variants via that tree; each variant is then matched against
/// the tables whose process type applies to it.
///
/// # Examples
///
/// ```rust
/// use matcher_rs::{ProcessType, RegexTable, RegexMatchType, RegexMatcher, TextMatcherTrait};
///
/// let regex_table = RegexTable {
///     table_id: 1,
///     match_id: 1,
///     process_type: ProcessType::None,
///     regex_match_type: RegexMatchType::Regex,
///     word_list: vec!["^hello", "^world"],
/// };
///
/// let matcher = RegexMatcher::new(&[regex_table]);
///
/// assert!(matcher.is_match("hello world"));
/// ```
#[derive(Debug, Clone)]
pub struct RegexMatcher {
    /// Tree built from the process types of all tables; drives text pre-processing.
    process_type_tree: Box<[ProcessTypeBitNode]>,
    /// The compiled tables, in the order they were accepted by `new`.
    regex_pattern_table_list: Box<[RegexPatternTable]>,
}

impl RegexMatcher {
    /// Constructs a new [`RegexMatcher`] from a list of [`RegexTable`].
    ///
    /// This function initializes a [`RegexMatcher`] by processing the provided `regex_table_list`.
    /// Each [`RegexTable`] entry is transformed based on its `regex_match_type` to create the
    /// appropriate regex patterns, which are then stored in the matcher.
    ///
    /// # Arguments
    /// * `regex_table_list` - A slice of [`RegexTable`] containing the regex patterns and associated metadata.
    ///
    /// # Returns
    /// An instance of [`RegexMatcher`] initialized with the given `regex_table_list`.
    ///
    /// # Details
    /// The function handles three types of regex match types:
    /// * [`RegexMatchType::SimilarChar`]: Generates a single regex pattern that matches similar characters.
    /// * [`RegexMatchType::Acrostic`]: Generates individual regex patterns for each word in the table, recognizing them as acrostic patterns.
    /// * [`RegexMatchType::Regex`]: Directly uses the provided words as regex patterns or lists.
    pub fn new(regex_table_list: &[RegexTable]) -> RegexMatcher {
        let mut process_type_set = HashSet::with_capacity(regex_table_list.len());
        let mut regex_pattern_table_list = Vec::with_capacity(regex_table_list.len());

        for regex_table in regex_table_list {
            process_type_set.insert(regex_table.process_type.bits());

            let size = regex_table.word_list.len();

            match regex_table.regex_match_type {
                RegexMatchType::SimilarChar => {
                    let pattern = regex_table
                        .word_list
                        .iter()
                        .map(|charstr| format!("({})", escape(charstr).replace(',', "|")))
                        .collect::<Vec<String>>()
                        .join(".?");

                    if pattern.len() > 1024 {
                        eprintln!(
                            "SimilarChar pattern is too long ({}), potential ReDoS risk. Skipping.",
                            pattern.len()
                        );
                        continue;
                    }

                    regex_pattern_table_list.push(RegexPatternTable {
                        table_id: regex_table.table_id,
                        match_id: regex_table.match_id,
                        process_type: regex_table.process_type,
                        regex_type: RegexType::Standard {
                            regex: Regex::new(&pattern).unwrap(),
                        },
                    });
                }
                RegexMatchType::Acrostic => {
                    let mut word_list = Vec::with_capacity(size);
                    let mut regex_list = Vec::with_capacity(size);
                    let mut pattern_list = Vec::with_capacity(size);

                    for &word in &regex_table.word_list {
                        let pattern = format!(
                            r"(?i)(?:^|[\s\pP]+?){}",
                            escape(word).replace(',', r".*?[\s\pP]+?")
                        );
                        if pattern.len() > 1024 {
                            eprintln!("Acrostic pattern too long for word {}, skipping.", word);
                            continue;
                        }
                        match Regex::new(&pattern) {
                            Ok(regex) => {
                                regex_list.push(regex);
                                word_list.push(word.to_owned());
                                pattern_list.push(pattern);
                            }
                            Err(e) => {
                                eprintln!("Acrostic word {word} is illegal, ignored. Error: {e}");
                            }
                        }
                    }

                    let regex_type = RegexSet::new(pattern_list).map_or(
                        RegexType::List {
                            regex_list,
                            word_list: word_list.clone(),
                        },
                        |regex_set| RegexType::Set {
                            regex_set,
                            word_list,
                        },
                    );

                    regex_pattern_table_list.push(RegexPatternTable {
                        table_id: regex_table.table_id,
                        match_id: regex_table.match_id,
                        process_type: regex_table.process_type,
                        regex_type,
                    });
                }
                RegexMatchType::Regex => {
                    let mut word_list = Vec::with_capacity(size);
                    let mut regex_list = Vec::with_capacity(size);

                    for &word in &regex_table.word_list {
                        if word.len() > 1024 {
                            eprintln!("Regex pattern too long, skipping: {:.20}...", word);
                            continue;
                        }
                        match Regex::new(word) {
                            Ok(regex) => {
                                regex_list.push(regex);
                                word_list.push(word.to_owned());
                            }
                            Err(e) => {
                                eprintln!("Regex word {word} is illegal, ignored. Error: {e}");
                            }
                        }
                    }

                    let regex_type = RegexSet::new(&word_list).map_or(
                        RegexType::List {
                            regex_list,
                            word_list: word_list.clone(),
                        },
                        |regex_set| RegexType::Set {
                            regex_set,
                            word_list,
                        },
                    );

                    regex_pattern_table_list.push(RegexPatternTable {
                        table_id: regex_table.table_id,
                        match_id: regex_table.match_id,
                        process_type: regex_table.process_type,
                        regex_type,
                    });
                }
            };
        }

        let process_type_tree = build_process_type_tree(&process_type_set).into_boxed_slice();

        RegexMatcher {
            process_type_tree,
            regex_pattern_table_list: regex_pattern_table_list.into_boxed_slice(),
        }
    }
}

impl<'a> TextMatcherTrait<'a, RegexResult<'a>> for RegexMatcher {
    /// Checks if the given text matches any of the regex patterns in the [`RegexMatcher`].
    ///
    /// The text is first expanded into its processed variants using the matcher's
    /// `process_type_tree`; the variants are then handed to `is_match_preprocessed`.
    ///
    /// # Arguments
    /// * `text` - A string slice that holds the text to be checked against the regex patterns.
    ///
    /// # Returns
    /// `true` if there is a match, otherwise `false`. Empty text never matches.
    fn is_match(&'a self, text: &'a str) -> bool {
        if text.is_empty() {
            return false;
        }

        let processed_text_process_type_set =
            reduce_text_process_with_tree(&self.process_type_tree, text);

        self.is_match_preprocessed(&processed_text_process_type_set)
    }

    /// Returns a **lazy** iterator over [`RegexResult`] matches for the given text.
    ///
    /// Text preprocessing (`reduce_text_process_with_tree`) is performed once, when the
    /// iterator is first advanced. Pattern matching is then driven table-by-table as the caller
    /// advances the iterator, so callers that short-circuit (`.next()`, `.find()`, `.take(n)`)
    /// skip tables that would otherwise be evaluated unnecessarily.
    ///
    /// Results are deduplicated per `(table_id, pattern)` across processed-text variants: once
    /// a pattern has produced a hit on one variant it is not reported again for later variants.
    /// A pattern that did *not* match a variant is still tried against the remaining variants.
    ///
    /// # Arguments
    /// * `text` - The input text to be processed and matched against the regex patterns.
    ///
    /// # Returns
    /// An `impl Iterator<Item = RegexResult<'a>>` — a lazy iterator of match results. Empty
    /// text yields nothing.
    fn process_iter(&'a self, text: &'a str) -> impl Iterator<Item = RegexResult<'a>> + 'a {
        gen move {
            if text.is_empty() {
                return;
            }

            let processed_text_process_type_set =
                reduce_text_process_with_tree(&self.process_type_tree, text);

            // Dedup key: (table_id, pattern index). `None` denotes the single pattern of a
            // `Standard` table, so it can never collide with an indexed list/set pattern.
            let mut reported: HashSet<(u32, Option<usize>)> = HashSet::new();

            for (processed_text, process_type_set) in processed_text_process_type_set {
                for table in self.regex_pattern_table_list.iter() {
                    if !process_type_set.contains(&table.process_type.bits()) {
                        continue;
                    }

                    match &table.regex_type {
                        RegexType::Standard { regex } => {
                            let key = (table.table_id, None);
                            if reported.contains(&key) {
                                continue;
                            }

                            // Captures borrow the local text, so buffer the results before
                            // yielding. Match errors are treated as "no match".
                            let mut hits = Vec::new();
                            for caps in regex.captures_iter(&processed_text).flatten() {
                                hits.push(RegexResult {
                                    match_id: table.match_id,
                                    table_id: table.table_id,
                                    word_id: 0,
                                    word: Cow::Owned(
                                        caps.iter()
                                            .skip(1)
                                            .filter_map(|m| m.map(|mc| mc.as_str()))
                                            .collect::<String>(),
                                    ),
                                });
                            }

                            // Only mark the table as reported once it actually produced hits.
                            if !hits.is_empty() {
                                reported.insert(key);
                            }
                            for hit in hits {
                                yield hit;
                            }
                        }
                        RegexType::List {
                            regex_list,
                            word_list,
                        } => {
                            for (index, regex) in regex_list.iter().enumerate() {
                                let key = (table.table_id, Some(index));

                                if !reported.contains(&key)
                                    && matches!(regex.is_match(&processed_text), Ok(true))
                                {
                                    reported.insert(key);
                                    yield RegexResult {
                                        match_id: table.match_id,
                                        table_id: table.table_id,
                                        word_id: index as u32,
                                        word: Cow::Borrowed(&word_list[index]),
                                    };
                                }
                            }
                        }
                        RegexType::Set {
                            regex_set,
                            word_list,
                        } => {
                            let mut hits = Vec::new();
                            for index in regex_set.matches(&processed_text) {
                                if reported.insert((table.table_id, Some(index))) {
                                    hits.push(RegexResult {
                                        match_id: table.match_id,
                                        table_id: table.table_id,
                                        word_id: index as u32,
                                        word: Cow::Borrowed(&word_list[index]),
                                    });
                                }
                            }
                            for hit in hits {
                                yield hit;
                            }
                        }
                    }
                }
            }
        }
    }
}

impl<'a> TextMatcherInternal<'a, RegexResult<'a>> for RegexMatcher {
    /// Checks if any of the given processed texts match any of the regex patterns in the [`RegexMatcher`].
    ///
    /// For every processed text, each table whose `process_type` is contained in the text's
    /// process-type set is evaluated:
    /// - `Standard`: the single regex is tested.
    /// - `List`: the regexes are tested one by one until one matches.
    /// - `Set`: the regex set is tested.
    ///
    /// A regex that fails at match time (for example by exceeding its backtracking limit) is
    /// treated as not matching instead of panicking, consistent with `process_preprocessed`.
    ///
    /// # Arguments
    ///
    /// * `processed_text_process_type_set` - Pairs of processed text and the set of process
    ///   types associated with that text.
    ///
    /// # Returns
    ///
    /// * `bool` - `true` if at least one regex pattern matches any processed text, otherwise `false`.
    fn is_match_preprocessed(
        &'a self,
        processed_text_process_type_set: &ProcessedTextSet<'a>,
    ) -> bool {
        for (processed_text, process_type_set) in processed_text_process_type_set {
            for table in self.regex_pattern_table_list.iter() {
                if !process_type_set.contains(&table.process_type.bits()) {
                    continue;
                }

                let is_match = match &table.regex_type {
                    RegexType::Standard { regex } => {
                        regex.is_match(processed_text).unwrap_or(false)
                    }
                    RegexType::List { regex_list, .. } => regex_list
                        .iter()
                        .any(|regex| regex.is_match(processed_text).unwrap_or(false)),
                    RegexType::Set { regex_set, .. } => regex_set.is_match(processed_text),
                };

                if is_match {
                    return true;
                }
            }
        }
        false
    }

    /// Processes the `processed_text_process_type_set` to find and return regex matches.
    ///
    /// For every processed text, each table whose `process_type` is contained in the text's
    /// process-type set is evaluated:
    /// - `Standard`: every match of the single regex yields a result whose `word` is the
    ///   concatenation of the participating capture groups.
    /// - `List`: every regex that matches yields a result carrying its source word.
    /// - `Set`: every pattern index reported by the set yields a result carrying its source word.
    ///
    /// Results are deduplicated per `(table_id, pattern)` across processed texts: a pattern
    /// that already produced a hit is not reported again. A pattern that did *not* match one
    /// processed text is still tried against the remaining ones. Match-time regex errors are
    /// treated as "no match".
    ///
    /// # Arguments
    ///
    /// * `processed_text_process_type_set` - Pairs of processed text and the set of process
    ///   types associated with that text.
    ///
    /// # Returns
    ///
    /// * [`Vec<RegexResult>`] - A vector of [`RegexResult`] instances, each representing a match found in the processed text.
    fn process_preprocessed(
        &'a self,
        processed_text_process_type_set: &ProcessedTextSet<'a>,
    ) -> Vec<RegexResult<'a>> {
        let mut result_list = Vec::new();
        // Dedup key: (table_id, pattern index). `None` denotes the single pattern of a
        // `Standard` table, so it can never collide with an indexed list/set pattern.
        let mut reported: HashSet<(u32, Option<usize>)> = HashSet::new();

        for (processed_text, process_type_set) in processed_text_process_type_set {
            for table in self.regex_pattern_table_list.iter() {
                if !process_type_set.contains(&table.process_type.bits()) {
                    continue;
                }

                match &table.regex_type {
                    RegexType::Standard { regex } => {
                        let key = (table.table_id, None);
                        if reported.contains(&key) {
                            continue;
                        }

                        let len_before = result_list.len();
                        result_list.extend(regex.captures_iter(processed_text).flatten().map(
                            |caps| RegexResult {
                                match_id: table.match_id,
                                table_id: table.table_id,
                                word_id: 0,
                                word: Cow::Owned(
                                    caps.iter()
                                        .skip(1)
                                        .filter_map(|m| m.map(|group| group.as_str()))
                                        .collect::<String>(),
                                ),
                            },
                        ));

                        // Only mark the table as reported once it actually produced hits.
                        if result_list.len() > len_before {
                            reported.insert(key);
                        }
                    }
                    RegexType::List {
                        regex_list,
                        word_list,
                    } => {
                        for (index, regex) in regex_list.iter().enumerate() {
                            let key = (table.table_id, Some(index));

                            if !reported.contains(&key)
                                && matches!(regex.is_match(processed_text), Ok(true))
                            {
                                reported.insert(key);
                                result_list.push(RegexResult {
                                    match_id: table.match_id,
                                    table_id: table.table_id,
                                    word_id: index as u32,
                                    word: Cow::Borrowed(&word_list[index]),
                                });
                            }
                        }
                    }
                    RegexType::Set {
                        regex_set,
                        word_list,
                    } => {
                        for index in regex_set.matches(processed_text) {
                            if reported.insert((table.table_id, Some(index))) {
                                result_list.push(RegexResult {
                                    match_id: table.match_id,
                                    table_id: table.table_id,
                                    word_id: index as u32,
                                    word: Cow::Borrowed(&word_list[index]),
                                });
                            }
                        }
                    }
                }
            }
        }

        result_list
    }
}