use scirs2_core::ndarray::Array1;
/// Stateless namespace for element-wise operations over slices of `&str`
/// (equality, byte lengths, prefix and suffix checks).
pub struct VectorizedStringOps;
/// Stateless namespace for building character n-grams, word n-grams and
/// skip-gram word pairs from a text.
pub struct SimdNgramGenerator;
/// Stateless namespace for text similarity scores (Jaccard over words,
/// cosine over character counts, normalised Levenshtein).
pub struct SimdTextSimilarity;
/// Stateless namespace for text clean-up helpers (lowercasing and filtering,
/// diacritic replacement, whitespace collapsing).
pub struct SimdTextNormalizer;
/// Stateless namespace for applying a per-text function to many texts through
/// the parallel iterators from `scirs2_core::parallel_ops`.
pub struct SimdParallelProcessor;
impl VectorizedStringOps {
/// Compares two slices of strings position by position.
///
/// Entry `i` of the result is `true` when `strings1[i]` equals `strings2[i]`.
/// The inputs are zipped, so the result has as many entries as the shorter
/// input and any surplus items of the longer one are ignored.
pub fn vectorized_compare(strings1: &[&str], strings2: &[&str]) -> Vec<bool> {
    let paired = strings1.len().min(strings2.len());
    let mut matches = Vec::with_capacity(paired);
    for (left, right) in strings1.iter().zip(strings2) {
        matches.push(left == right);
    }
    matches
}
/// Returns the length of every string in `strings`, in input order.
///
/// Lengths are those reported by `str::len`, i.e. UTF-8 byte counts rather
/// than character counts.
pub fn vectorized_lengths(strings: &[&str]) -> Vec<usize> {
    let mut lengths = Vec::with_capacity(strings.len());
    for text in strings {
        lengths.push(text.len());
    }
    lengths
}
/// Reports, for each string in `strings`, whether it starts with `prefix`.
///
/// The result has one entry per input string, in input order.
pub fn has_prefix_vectorized(strings: &[&str], prefix: &str) -> Vec<bool> {
    let mut flags = Vec::with_capacity(strings.len());
    for candidate in strings {
        flags.push(candidate.starts_with(prefix));
    }
    flags
}
/// Reports, for each string in `strings`, whether it ends with `suffix`.
///
/// The result has one entry per input string, in input order.
pub fn has_suffix_vectorized(strings: &[&str], suffix: &str) -> Vec<bool> {
    let mut flags = vec![false; strings.len()];
    for (flag, candidate) in flags.iter_mut().zip(strings) {
        *flag = candidate.ends_with(suffix);
    }
    flags
}
}
impl SimdNgramGenerator {
/// Returns every contiguous run of `n` characters in `text`, in order.
///
/// N-grams are counted in Unicode scalar values (`char`s), not bytes.
/// An empty vector is returned when `n` is zero or when `text` contains
/// fewer than `n` characters.
pub fn char_ngrams(text: &str, n: usize) -> Vec<String> {
    // `slice::windows` panics on a zero window size, so reject it up front.
    if n == 0 {
        return Vec::new();
    }
    let chars: Vec<char> = text.chars().collect();
    // The size check must be made on the char count, not on `text.len()`
    // (bytes): multi-byte text can have enough bytes but too few chars.
    // `windows` yields nothing when the slice is shorter than `n`.
    chars
        .windows(n)
        .map(|window| window.iter().collect())
        .collect()
}
/// Returns every run of `n` consecutive whitespace-separated words in `text`.
///
/// The words of each n-gram are joined with a single space. An empty vector
/// is returned when `n` is zero or when `text` has fewer than `n` words.
pub fn word_ngrams(text: &str, n: usize) -> Vec<String> {
    // `slice::windows` panics on a zero window size, so reject it up front.
    if n == 0 {
        return Vec::new();
    }
    let tokens: Vec<&str> = text.split_whitespace().collect();
    let mut grams = Vec::new();
    // Yields nothing when there are fewer than `n` tokens.
    for window in tokens.windows(n) {
        grams.push(window.join(" "));
    }
    grams
}
/// Builds word pairs in which the second word follows the first by at most
/// `k` positions.
///
/// For every word, in order, it is paired with each of the next `k` words
/// (fewer near the end of the text); each pair is formatted as
/// `"first second"`. With `k == 0` the result is empty.
///
/// NOTE(review): `n` is accepted but never read by this implementation, so
/// the output always consists of pairs regardless of `n`.
pub fn skip_grams(text: &str, n: usize, k: usize) -> Vec<String> {
    let tokens: Vec<&str> = text.split_whitespace().collect();
    tokens
        .iter()
        .enumerate()
        .flat_map(|(idx, first)| {
            // The followers of `first`, capped at `k` of them.
            tokens
                .iter()
                .skip(idx + 1)
                .take(k)
                .map(move |second| format!("{} {}", first, second))
        })
        .collect()
}
}
impl SimdTextSimilarity {
/// Jaccard similarity of the two texts' sets of whitespace-separated words.
///
/// Computed as `|A ∩ B| / |A ∪ B|` over the distinct words of each text.
/// Returns `0.0` when neither text contains any word.
pub fn jaccard_similarity(text1: &str, text2: &str) -> f64 {
    use std::collections::HashSet;

    let left: HashSet<&str> = text1.split_whitespace().collect();
    let right: HashSet<&str> = text2.split_whitespace().collect();

    let shared = left.iter().filter(|word| right.contains(*word)).count();
    // Inclusion–exclusion: |A ∪ B| = |A| + |B| - |A ∩ B|.
    let combined = left.len() + right.len() - shared;

    if combined == 0 {
        return 0.0;
    }
    shared as f64 / combined as f64
}
/// Cosine similarity between the character-frequency vectors of two texts.
///
/// Each text is reduced to a map from `char` to its number of occurrences;
/// the score is the dot product of the two count vectors divided by the
/// product of their Euclidean norms. Returns `0.0` when either text is empty.
pub fn cosine_similarity_chars(text1: &str, text2: &str) -> f64 {
    use std::collections::HashMap;

    let mut counts_a: HashMap<char, i32> = HashMap::new();
    for ch in text1.chars() {
        *counts_a.entry(ch).or_insert(0) += 1;
    }
    let mut counts_b: HashMap<char, i32> = HashMap::new();
    for ch in text2.chars() {
        *counts_b.entry(ch).or_insert(0) += 1;
    }

    // Squared norm and dot product from the first text's point of view.
    // Characters missing from the second text contribute zero to the dot.
    let mut dot = 0.0;
    let mut sq_norm_a = 0.0;
    for (ch, &count) in &counts_a {
        let a = count as f64;
        let b = counts_b.get(ch).copied().unwrap_or(0) as f64;
        dot += a * b;
        sq_norm_a += a * a;
    }

    let mut sq_norm_b = 0.0;
    for &count in counts_b.values() {
        let b = count as f64;
        sq_norm_b += b * b;
    }

    if sq_norm_a == 0.0 || sq_norm_b == 0.0 {
        0.0
    } else {
        dot / (sq_norm_a.sqrt() * sq_norm_b.sqrt())
    }
}
/// Similarity score derived from the Levenshtein distance of two texts.
///
/// Computed as `1 - distance / max_len`, where `distance` comes from
/// `SimdEditDistance::levenshtein` and `max_len` is the larger of the two
/// texts' byte lengths. Two empty texts score `1.0`.
///
/// NOTE(review): `max_len` is a byte length; if `SimdEditDistance::levenshtein`
/// counts edits in characters rather than bytes, non-ASCII input is normalised
/// by a different unit than it is measured in — confirm against that helper.
pub fn levenshtein_similarity(text1: &str, text2: &str) -> f64 {
    use super::edit_distance::SimdEditDistance;

    let longest = std::cmp::max(text1.len(), text2.len());
    match longest {
        // Both texts are empty: treated as identical.
        0 => 1.0,
        _ => {
            let edits = SimdEditDistance::levenshtein(text1, text2);
            1.0 - (edits as f64 / longest as f64)
        }
    }
}
}
impl SimdTextNormalizer {
/// Lowercases `text`, strips everything that is neither alphanumeric nor
/// whitespace, and collapses whitespace runs into single spaces.
///
/// The result has no leading or trailing whitespace.
pub fn normalize_text(text: &str) -> String {
    let lowered = text.to_lowercase();

    // Pass 1: drop punctuation and other symbols, keep letters/digits/whitespace.
    let mut kept = String::with_capacity(lowered.len());
    for ch in lowered.chars() {
        if ch.is_alphanumeric() || ch.is_whitespace() {
            kept.push(ch);
        }
    }

    // Pass 2: re-join the remaining words with exactly one space between them.
    let mut normalized = String::with_capacity(kept.len());
    for word in kept.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
/// Replaces common precomposed accented Latin letters with their unaccented
/// base letter, preserving case.
///
/// The mapping is one `char` to one `char`, so the number of characters is
/// unchanged. Characters not listed in the table are passed through as-is;
/// in particular, decomposed sequences (a base letter followed by a combining
/// mark) are not altered.
pub fn remove_diacritics(text: &str) -> String {
    text.chars()
        .map(|c| match c {
            'á' | 'à' | 'ä' | 'â' | 'ã' | 'å' => 'a',
            'Á' | 'À' | 'Ä' | 'Â' | 'Ã' | 'Å' => 'A',
            'é' | 'è' | 'ë' | 'ê' => 'e',
            'É' | 'È' | 'Ë' | 'Ê' => 'E',
            'í' | 'ì' | 'ï' | 'î' => 'i',
            'Í' | 'Ì' | 'Ï' | 'Î' => 'I',
            'ó' | 'ò' | 'ö' | 'ô' | 'õ' => 'o',
            'Ó' | 'Ò' | 'Ö' | 'Ô' | 'Õ' => 'O',
            'ú' | 'ù' | 'ü' | 'û' => 'u',
            'Ú' | 'Ù' | 'Ü' | 'Û' => 'U',
            'ý' | 'ÿ' => 'y',
            'Ý' | 'Ÿ' => 'Y',
            'ñ' => 'n',
            'Ñ' => 'N',
            'ç' => 'c',
            'Ç' => 'C',
            _ => c,
        })
        .collect()
}
/// Collapses every run of whitespace in `text` into a single space and drops
/// leading and trailing whitespace.
pub fn standardize_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first one.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
}
impl SimdParallelProcessor {
/// Applies `processor` to every text through a parallel iterator and collects
/// the results into a vector.
///
/// `processor` must be `Sync` because it is shared between the workers, and
/// the result type must be `Send` so values can be handed back.
///
/// NOTE(review): the parallel iterator comes from `scirs2_core::parallel_ops`;
/// presumably results are returned in input order — confirm against that crate.
pub fn parallel_process<F, R>(texts: &[&str], processor: F) -> Vec<R>
where
    F: Fn(&str) -> R + Sync,
    R: Send,
{
    use scirs2_core::parallel_ops::*;

    let results: Vec<R> = texts
        .par_iter()
        .map(|text| processor(*text))
        .collect();
    results
}
/// Runs `SimdStringOps::count_chars(text, target)` on every text via
/// [`Self::parallel_process`] and returns one count per text.
pub fn parallel_char_count(texts: &[&str], target: char) -> Vec<usize> {
    use super::basic_ops::SimdStringOps;

    let count_target = |text: &str| SimdStringOps::count_chars(text, target);
    Self::parallel_process(texts, count_target)
}
/// Runs `SimdTextAnalyzer::analyze_text` on every text via
/// `Self::parallel_process` and returns one `TextAnalysisResult` per text.
///
/// The analyzer and its result type live in `super::text_analysis`; what an
/// individual analysis contains is defined there, not in this function.
pub fn parallel_text_analysis(texts: &[&str]) -> Vec<super::text_analysis::TextAnalysisResult> {
use super::text_analysis::SimdTextAnalyzer;
// The function item is passed directly as the per-text processor.
Self::parallel_process(texts, SimdTextAnalyzer::analyze_text)
}
}