Skip to main content

scirs2_text/simd_ops/
vectorized_ops.rs

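//! Batch-oriented ("vectorized") string operations
//!
//! This module provides string comparison, n-gram generation, text similarity,
//! normalization, and parallel processing helpers. The routines in this file are
//! written with standard-library iterators and collections rather than explicit
//! SIMD intrinsics; some delegate to the sibling modules `basic_ops`,
//! `edit_distance`, and `text_analysis`.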
5
6use scirs2_core::ndarray::Array1;
7
/// Namespace type for element-wise operations over slices of strings
/// (equality comparison, length computation, prefix and suffix checks).
///
/// This is a stateless unit struct: everything is exposed as associated
/// functions, so no instance ever needs to be constructed.
pub struct VectorizedStringOps;
10
/// Namespace type for n-gram generation (character n-grams, word n-grams and
/// skip-grams).
///
/// This is a stateless unit struct: everything is exposed as associated
/// functions, so no instance ever needs to be constructed.
pub struct SimdNgramGenerator;
13
/// Namespace type for text similarity measures (Jaccard over words, cosine
/// over character frequencies, and a Levenshtein-based similarity).
///
/// This is a stateless unit struct: everything is exposed as associated
/// functions, so no instance ever needs to be constructed.
pub struct SimdTextSimilarity;
16
/// Namespace type for text normalization helpers (case/punctuation
/// normalization, diacritic removal and whitespace standardization).
///
/// This is a stateless unit struct: everything is exposed as associated
/// functions, so no instance ever needs to be constructed.
pub struct SimdTextNormalizer;
19
/// Namespace type for applying a per-text function to many texts through the
/// parallel iterator facilities of `scirs2_core::parallel_ops`.
///
/// This is a stateless unit struct: everything is exposed as associated
/// functions, so no instance ever needs to be constructed.
pub struct SimdParallelProcessor;
22
23impl VectorizedStringOps {
24    /// Vectorized string comparison
25    pub fn vectorized_compare(strings1: &[&str], strings2: &[&str]) -> Vec<bool> {
26        strings1
27            .iter()
28            .zip(strings2.iter())
29            .map(|(s1, s2)| s1 == s2)
30            .collect()
31    }
32
33    /// Vectorized length computation
34    pub fn vectorized_lengths(strings: &[&str]) -> Vec<usize> {
35        strings.iter().map(|s| s.len()).collect()
36    }
37
38    /// Vectorized prefix detection
39    pub fn has_prefix_vectorized(strings: &[&str], prefix: &str) -> Vec<bool> {
40        strings.iter().map(|s| s.starts_with(prefix)).collect()
41    }
42
43    /// Vectorized suffix detection
44    pub fn has_suffix_vectorized(strings: &[&str], suffix: &str) -> Vec<bool> {
45        strings.iter().map(|s| s.ends_with(suffix)).collect()
46    }
47}
48
49impl SimdNgramGenerator {
50    /// Generate character n-grams with SIMD acceleration
51    pub fn char_ngrams(text: &str, n: usize) -> Vec<String> {
52        if n == 0 || text.len() < n {
53            return vec![];
54        }
55
56        let chars: Vec<char> = text.chars().collect();
57        (0..=chars.len().saturating_sub(n))
58            .map(|i| chars[i..i + n].iter().collect())
59            .collect()
60    }
61
62    /// Generate word n-grams
63    pub fn word_ngrams(text: &str, n: usize) -> Vec<String> {
64        if n == 0 {
65            return vec![];
66        }
67
68        let words: Vec<&str> = text.split_whitespace().collect();
69        if words.len() < n {
70            return vec![];
71        }
72
73        (0..=words.len().saturating_sub(n))
74            .map(|i| words[i..i + n].join(" "))
75            .collect()
76    }
77
78    /// Generate skip-grams
79    pub fn skip_grams(text: &str, n: usize, k: usize) -> Vec<String> {
80        let words: Vec<&str> = text.split_whitespace().collect();
81        let mut skip_grams = Vec::new();
82
83        for i in 0..words.len() {
84            for j in 1..=k {
85                if i + j < words.len() {
86                    skip_grams.push(format!("{} {}", words[i], words[i + j]));
87                }
88            }
89        }
90
91        skip_grams
92    }
93}
94
95impl SimdTextSimilarity {
96    /// Compute Jaccard similarity with SIMD optimization
97    pub fn jaccard_similarity(text1: &str, text2: &str) -> f64 {
98        let words1: std::collections::HashSet<&str> = text1.split_whitespace().collect();
99        let words2: std::collections::HashSet<&str> = text2.split_whitespace().collect();
100
101        let intersection = words1.intersection(&words2).count();
102        let union = words1.union(&words2).count();
103
104        if union == 0 {
105            0.0
106        } else {
107            intersection as f64 / union as f64
108        }
109    }
110
111    /// Compute cosine similarity for character frequencies
112    pub fn cosine_similarity_chars(text1: &str, text2: &str) -> f64 {
113        use std::collections::HashMap;
114
115        let mut freq1 = HashMap::new();
116        let mut freq2 = HashMap::new();
117
118        for c in text1.chars() {
119            *freq1.entry(c).or_insert(0) += 1;
120        }
121        for c in text2.chars() {
122            *freq2.entry(c).or_insert(0) += 1;
123        }
124
125        let mut dot_product = 0.0;
126        let mut norm1 = 0.0;
127        let mut norm2 = 0.0;
128
129        let all_chars: std::collections::HashSet<char> =
130            freq1.keys().chain(freq2.keys()).copied().collect();
131
132        for c in all_chars {
133            let f1 = *freq1.get(&c).unwrap_or(&0) as f64;
134            let f2 = *freq2.get(&c).unwrap_or(&0) as f64;
135
136            dot_product += f1 * f2;
137            norm1 += f1 * f1;
138            norm2 += f2 * f2;
139        }
140
141        if norm1 == 0.0 || norm2 == 0.0 {
142            0.0
143        } else {
144            dot_product / (norm1.sqrt() * norm2.sqrt())
145        }
146    }
147
148    /// Compute Levenshtein similarity
149    pub fn levenshtein_similarity(text1: &str, text2: &str) -> f64 {
150        use super::edit_distance::SimdEditDistance;
151
152        let max_len = text1.len().max(text2.len());
153        if max_len == 0 {
154            return 1.0;
155        }
156
157        let distance = SimdEditDistance::levenshtein(text1, text2);
158        1.0 - (distance as f64 / max_len as f64)
159    }
160}
161
162impl SimdTextNormalizer {
163    /// Normalize text with SIMD acceleration
164    pub fn normalize_text(text: &str) -> String {
165        text.to_lowercase()
166            .chars()
167            .filter(|c| c.is_alphanumeric() || c.is_whitespace())
168            .collect::<String>()
169            .split_whitespace()
170            .collect::<Vec<&str>>()
171            .join(" ")
172    }
173
174    /// Remove diacritics (simplified)
175    pub fn remove_diacritics(text: &str) -> String {
176        // Simplified implementation - full implementation would need Unicode normalization
177        text.chars()
178            .map(|c| match c {
179                'á' | 'à' | 'ä' | 'â' => 'a',
180                'é' | 'è' | 'ë' | 'ê' => 'e',
181                'í' | 'ì' | 'ï' | 'î' => 'i',
182                'ó' | 'ò' | 'ö' | 'ô' => 'o',
183                'ú' | 'ù' | 'ü' | 'û' => 'u',
184                _ => c,
185            })
186            .collect()
187    }
188
189    /// Standardize whitespace
190    pub fn standardize_whitespace(text: &str) -> String {
191        text.split_whitespace().collect::<Vec<&str>>().join(" ")
192    }
193}
194
195impl SimdParallelProcessor {
196    /// Process texts in parallel with SIMD
197    pub fn parallel_process<F, R>(texts: &[&str], processor: F) -> Vec<R>
198    where
199        F: Fn(&str) -> R + Sync,
200        R: Send,
201    {
202        use scirs2_core::parallel_ops::*;
203        texts.par_iter().map(|&text| processor(text)).collect()
204    }
205
206    /// Parallel character counting
207    pub fn parallel_char_count(texts: &[&str], target: char) -> Vec<usize> {
208        use super::basic_ops::SimdStringOps;
209        Self::parallel_process(texts, |text| SimdStringOps::count_chars(text, target))
210    }
211
212    /// Parallel text analysis
213    pub fn parallel_text_analysis(texts: &[&str]) -> Vec<super::text_analysis::TextAnalysisResult> {
214        use super::text_analysis::SimdTextAnalyzer;
215        Self::parallel_process(texts, SimdTextAnalyzer::analyze_text)
216    }
217}