summa-core 0.22.6

Summa Core library
Documentation
use std::collections::{HashMap, HashSet};

use regex::RegexSet;
use tracing::info;

use crate::components::query_parser::morphology::Morphology;

/// English-language morphology helper backed by a spelling-variant table.
///
/// The table maps a word to its alternative spelling and is consulted by
/// `derive_spelling`. Cloning copies the whole table.
#[derive(Clone, Debug)]
pub struct EnglishMorphology {
    /// Word -> alternative spelling. `new` inserts every pair in both
    /// directions, so a lookup works from either variant.
    spelling_dict: HashMap<String, String>,
}

impl EnglishMorphology {
    pub fn new() -> Self {
        let mut spelling_set = vec![];
        let mut csv_reader = csv::ReaderBuilder::new()
            .has_headers(false)
            .flexible(true)
            .from_reader(include_bytes!("../../../../resources/spelling.csv").as_slice());
        for record in csv_reader.records() {
            let record = record.expect("incorrect record");
            let v1 = record[0].to_string();
            let v2 = record[1].to_string();
            spelling_set.push((v1.to_string(), v2.to_string()));
            spelling_set.push((v2, v1));
        }
        info!(
            action = "loaded_spelling_dictionary",
            size = spelling_set.len(),
            header = ?spelling_set[..10].iter().collect::<Vec<_>>()
        );
        EnglishMorphology {
            spelling_dict: HashMap::from_iter(spelling_set),
        }
    }
}

impl Default for EnglishMorphology {
    fn default() -> Self {
        Self::new()
    }
}

impl Morphology for EnglishMorphology {
    /// Returns the opposite grammatical-number form of `word`.
    ///
    /// Yields the plural when `pluralize_rs` reports the word as singular,
    /// otherwise the singular when it reports the word as plural. Returns
    /// `None` for stop words, for words ending in a digit or in `ing`, and
    /// for words that are reported as neither singular nor plural.
    fn derive_tenses(&self, word: &str) -> Option<String> {
        thread_local! {
            // Built once per thread: the suffix patterns that rule a word out,
            // and the stop-word set.
            static NOT_A_NOUN: (RegexSet, HashSet<&'static str>) = {
                let suffix_filter = RegexSet::new([
                    r"\d$",
                    r"ing$",
                ])
                .expect("cannot compile regex");
                let excluded: HashSet<&'static str> =
                    HashSet::from_iter(crate::components::default_tokenizers::STOP_WORDS.into_iter());
                (suffix_filter, excluded)
            };
        }
        NOT_A_NOUN.with(|(suffix_filter, excluded)| {
            let skipped = excluded.contains(word) || suffix_filter.is_match(word);
            if skipped {
                return None;
            }
            let singular = pluralize_rs::is_singular(word);
            let plural = pluralize_rs::is_plural(word);
            match (singular, plural) {
                // Singular takes precedence when both checks succeed.
                (true, _) => Some(pluralize_rs::to_plural(word)),
                (false, true) => Some(pluralize_rs::to_singular(word)),
                (false, false) => None,
            }
        })
    }

    /// Looks up the alternative spelling of `word` in the spelling table,
    /// returning an owned copy, or `None` when the word is not listed.
    fn derive_spelling(&self, word: &str) -> Option<String> {
        let alternative = self.spelling_dict.get(word)?;
        Some(alternative.clone())
    }

    /// Named-entity detection is not implemented here; always returns an
    /// empty list.
    fn detect_ners(&self, _: &str) -> Vec<String> {
        Vec::new()
    }
}