scirs2_text/simd_ops/
vectorized_ops.rs1use scirs2_core::ndarray::Array1;
7
/// Stateless namespace for element-wise operations over slices of string
/// slices (equality, lengths, prefix and suffix checks).
///
/// The type carries no data; every operation is an associated function.
#[derive(Debug, Clone, Copy, Default)]
pub struct VectorizedStringOps;
10
/// Stateless namespace for n-gram generation (character n-grams, word
/// n-grams and skip-gram word pairs).
///
/// The type carries no data; every operation is an associated function.
#[derive(Debug, Clone, Copy, Default)]
pub struct SimdNgramGenerator;
13
/// Stateless namespace for text similarity measures (word-level Jaccard,
/// character-frequency cosine and edit-distance based similarity).
///
/// The type carries no data; every operation is an associated function.
#[derive(Debug, Clone, Copy, Default)]
pub struct SimdTextSimilarity;
16
/// Stateless namespace for text normalization helpers (case and punctuation
/// normalization, diacritic folding and whitespace standardization).
///
/// The type carries no data; every operation is an associated function.
#[derive(Debug, Clone, Copy, Default)]
pub struct SimdTextNormalizer;
19
/// Stateless namespace for applying a text-processing function to many
/// texts through the parallel iterator API.
///
/// The type carries no data; every operation is an associated function.
#[derive(Debug, Clone, Copy, Default)]
pub struct SimdParallelProcessor;
22
23impl VectorizedStringOps {
24 pub fn vectorized_compare(strings1: &[&str], strings2: &[&str]) -> Vec<bool> {
26 strings1
27 .iter()
28 .zip(strings2.iter())
29 .map(|(s1, s2)| s1 == s2)
30 .collect()
31 }
32
33 pub fn vectorized_lengths(strings: &[&str]) -> Vec<usize> {
35 strings.iter().map(|s| s.len()).collect()
36 }
37
38 pub fn has_prefix_vectorized(strings: &[&str], prefix: &str) -> Vec<bool> {
40 strings.iter().map(|s| s.starts_with(prefix)).collect()
41 }
42
43 pub fn has_suffix_vectorized(strings: &[&str], suffix: &str) -> Vec<bool> {
45 strings.iter().map(|s| s.ends_with(suffix)).collect()
46 }
47}
48
49impl SimdNgramGenerator {
50 pub fn char_ngrams(text: &str, n: usize) -> Vec<String> {
52 if n == 0 || text.len() < n {
53 return vec![];
54 }
55
56 let chars: Vec<char> = text.chars().collect();
57 (0..=chars.len().saturating_sub(n))
58 .map(|i| chars[i..i + n].iter().collect())
59 .collect()
60 }
61
62 pub fn word_ngrams(text: &str, n: usize) -> Vec<String> {
64 if n == 0 {
65 return vec![];
66 }
67
68 let words: Vec<&str> = text.split_whitespace().collect();
69 if words.len() < n {
70 return vec![];
71 }
72
73 (0..=words.len().saturating_sub(n))
74 .map(|i| words[i..i + n].join(" "))
75 .collect()
76 }
77
78 pub fn skip_grams(text: &str, n: usize, k: usize) -> Vec<String> {
80 let words: Vec<&str> = text.split_whitespace().collect();
81 let mut skip_grams = Vec::new();
82
83 for i in 0..words.len() {
84 for j in 1..=k {
85 if i + j < words.len() {
86 skip_grams.push(format!("{} {}", words[i], words[i + j]));
87 }
88 }
89 }
90
91 skip_grams
92 }
93}
94
95impl SimdTextSimilarity {
96 pub fn jaccard_similarity(text1: &str, text2: &str) -> f64 {
98 let words1: std::collections::HashSet<&str> = text1.split_whitespace().collect();
99 let words2: std::collections::HashSet<&str> = text2.split_whitespace().collect();
100
101 let intersection = words1.intersection(&words2).count();
102 let union = words1.union(&words2).count();
103
104 if union == 0 {
105 0.0
106 } else {
107 intersection as f64 / union as f64
108 }
109 }
110
111 pub fn cosine_similarity_chars(text1: &str, text2: &str) -> f64 {
113 use std::collections::HashMap;
114
115 let mut freq1 = HashMap::new();
116 let mut freq2 = HashMap::new();
117
118 for c in text1.chars() {
119 *freq1.entry(c).or_insert(0) += 1;
120 }
121 for c in text2.chars() {
122 *freq2.entry(c).or_insert(0) += 1;
123 }
124
125 let mut dot_product = 0.0;
126 let mut norm1 = 0.0;
127 let mut norm2 = 0.0;
128
129 let all_chars: std::collections::HashSet<char> =
130 freq1.keys().chain(freq2.keys()).copied().collect();
131
132 for c in all_chars {
133 let f1 = *freq1.get(&c).unwrap_or(&0) as f64;
134 let f2 = *freq2.get(&c).unwrap_or(&0) as f64;
135
136 dot_product += f1 * f2;
137 norm1 += f1 * f1;
138 norm2 += f2 * f2;
139 }
140
141 if norm1 == 0.0 || norm2 == 0.0 {
142 0.0
143 } else {
144 dot_product / (norm1.sqrt() * norm2.sqrt())
145 }
146 }
147
148 pub fn levenshtein_similarity(text1: &str, text2: &str) -> f64 {
150 use super::edit_distance::SimdEditDistance;
151
152 let max_len = text1.len().max(text2.len());
153 if max_len == 0 {
154 return 1.0;
155 }
156
157 let distance = SimdEditDistance::levenshtein(text1, text2);
158 1.0 - (distance as f64 / max_len as f64)
159 }
160}
161
162impl SimdTextNormalizer {
163 pub fn normalize_text(text: &str) -> String {
165 text.to_lowercase()
166 .chars()
167 .filter(|c| c.is_alphanumeric() || c.is_whitespace())
168 .collect::<String>()
169 .split_whitespace()
170 .collect::<Vec<&str>>()
171 .join(" ")
172 }
173
174 pub fn remove_diacritics(text: &str) -> String {
176 text.chars()
178 .map(|c| match c {
179 'á' | 'à' | 'ä' | 'â' => 'a',
180 'é' | 'è' | 'ë' | 'ê' => 'e',
181 'í' | 'ì' | 'ï' | 'î' => 'i',
182 'ó' | 'ò' | 'ö' | 'ô' => 'o',
183 'ú' | 'ù' | 'ü' | 'û' => 'u',
184 _ => c,
185 })
186 .collect()
187 }
188
189 pub fn standardize_whitespace(text: &str) -> String {
191 text.split_whitespace().collect::<Vec<&str>>().join(" ")
192 }
193}
194
195impl SimdParallelProcessor {
196 pub fn parallel_process<F, R>(texts: &[&str], processor: F) -> Vec<R>
198 where
199 F: Fn(&str) -> R + Sync,
200 R: Send,
201 {
202 use scirs2_core::parallel_ops::*;
203 texts.par_iter().map(|&text| processor(text)).collect()
204 }
205
206 pub fn parallel_char_count(texts: &[&str], target: char) -> Vec<usize> {
208 use super::basic_ops::SimdStringOps;
209 Self::parallel_process(texts, |text| SimdStringOps::count_chars(text, target))
210 }
211
212 pub fn parallel_text_analysis(texts: &[&str]) -> Vec<super::text_analysis::TextAnalysisResult> {
214 use super::text_analysis::SimdTextAnalyzer;
215 Self::parallel_process(texts, SimdTextAnalyzer::analyze_text)
216 }
217}