scirs2_text/simd_ops/
basic_ops.rs1use scirs2_core::simd_ops::PlatformCapabilities;
7
8pub struct SimdStringOps;
10
11impl SimdStringOps {
12 pub fn is_available() -> bool {
14 let caps = PlatformCapabilities::detect();
15 caps.simd_available
16 }
17
18 pub fn count_chars(text: &str, target: char) -> usize {
20 if !Self::is_available() || text.len() < 64 {
21 return text.chars().filter(|&c| c == target).count();
23 }
24
25 if target.is_ascii() {
27 Self::count_bytes(text.as_bytes(), target as u8)
28 } else {
29 text.chars().filter(|&c| c == target).count()
31 }
32 }
33
34 fn count_bytes(data: &[u8], target: u8) -> usize {
36 if !Self::is_available() || data.len() < 64 {
37 return data.iter().filter(|&&b| b == target).count();
38 }
39
40 let simdchunk_size = 512; let mut count = 0usize;
43
44 let chunks: Vec<_> = data.chunks(simdchunk_size).collect();
46
47 use scirs2_core::parallel_ops::*;
49 if chunks.len() > 4 && data.len() > 4096 {
50 let partial_counts: Vec<usize> = chunks
51 .par_iter()
52 .map(|chunk| {
53 let mut local_count = 0;
54 let mut i = 0;
56 while i + 8 <= chunk.len() {
57 local_count += (chunk[i] == target) as usize;
59 local_count += (chunk[i + 1] == target) as usize;
60 local_count += (chunk[i + 2] == target) as usize;
61 local_count += (chunk[i + 3] == target) as usize;
62 local_count += (chunk[i + 4] == target) as usize;
63 local_count += (chunk[i + 5] == target) as usize;
64 local_count += (chunk[i + 6] == target) as usize;
65 local_count += (chunk[i + 7] == target) as usize;
66 i += 8;
67 }
68 while i < chunk.len() {
70 local_count += (chunk[i] == target) as usize;
71 i += 1;
72 }
73 local_count
74 })
75 .collect();
76 count = partial_counts.iter().sum();
77 } else {
78 for chunk in chunks {
80 let mut local_count = 0;
81 let mut i = 0;
82 while i + 4 <= chunk.len() {
84 local_count += (chunk[i] == target) as usize;
85 local_count += (chunk[i + 1] == target) as usize;
86 local_count += (chunk[i + 2] == target) as usize;
87 local_count += (chunk[i + 3] == target) as usize;
88 i += 4;
89 }
90 while i < chunk.len() {
91 local_count += (chunk[i] == target) as usize;
92 i += 1;
93 }
94 count += local_count;
95 }
96 }
97
98 count
99 }
100
101 pub fn find_whitespace_positions(text: &str) -> Vec<usize> {
103 if !Self::is_available() || !text.is_ascii() || text.len() < 64 {
104 return text
105 .char_indices()
106 .filter(|(_, c)| c.is_whitespace())
107 .map(|(i_, _)| i_)
108 .collect();
109 }
110
111 let bytes = text.as_bytes();
112 let mut positions = Vec::new();
113
114 let space_positions = Self::find_byte_positions(bytes, b' ');
116 let tab_positions = Self::find_byte_positions(bytes, b'\t');
117 let newline_positions = Self::find_byte_positions(bytes, b'\n');
118 let cr_positions = Self::find_byte_positions(bytes, b'\r');
119
120 positions.extend(space_positions);
122 positions.extend(tab_positions);
123 positions.extend(newline_positions);
124 positions.extend(cr_positions);
125 positions.sort_unstable();
126 positions.dedup();
127
128 positions
129 }
130
131 fn find_byte_positions(data: &[u8], target: u8) -> Vec<usize> {
133 if data.len() < 64 {
134 return data
135 .iter()
136 .enumerate()
137 .filter(|(_, &b)| b == target)
138 .map(|(i_, _)| i_)
139 .collect();
140 }
141
142 let mut positions = Vec::new();
144
145 let chunk_size = 64;
147 for (chunk_idx, chunk) in data.chunks(chunk_size).enumerate() {
148 let base_idx = chunk_idx * chunk_size;
149 for (i, &byte) in chunk.iter().enumerate() {
150 if byte == target {
151 positions.push(base_idx + i);
152 }
153 }
154 }
155
156 positions
157 }
158
159 pub fn to_lowercase_ascii(text: &str) -> String {
161 if !Self::is_available() || !text.is_ascii() {
162 return text.to_lowercase();
163 }
164
165 let bytes = text.as_bytes();
166
167 let mut result = Vec::with_capacity(bytes.len());
169
170 for &b in bytes {
172 let is_upper = b.is_ascii_uppercase() as u8;
174 result.push(b + (is_upper * 32));
175 }
176
177 unsafe { String::from_utf8_unchecked(result) }
179 }
180
181 pub fn find_substring(haystack: &str, needle: &str) -> Option<usize> {
183 if !Self::is_available() || !haystack.is_ascii() || !needle.is_ascii() {
184 return haystack.find(needle);
185 }
186
187 let haystack_bytes = haystack.as_bytes();
188 let needle_bytes = needle.as_bytes();
189
190 if needle_bytes.is_empty() {
191 return Some(0);
192 }
193
194 if needle_bytes.len() > haystack_bytes.len() {
195 return None;
196 }
197
198 if needle_bytes.len() < 8 {
200 return haystack.find(needle);
201 }
202
203 let first_byte = needle_bytes[0];
205 let positions = Self::find_byte_positions(haystack_bytes, first_byte);
206
207 for pos in positions {
209 if pos + needle_bytes.len() <= haystack_bytes.len() {
210 let slice = &haystack_bytes[pos..pos + needle_bytes.len()];
211 if slice == needle_bytes {
212 return Some(pos);
213 }
214 }
215 }
216
217 None
218 }
219
220 pub fn is_alphanumeric_ascii(text: &str) -> bool {
222 if !Self::is_available() || !text.is_ascii() {
223 return text.chars().all(|c| c.is_alphanumeric());
224 }
225
226 text.bytes().all(|b| b.is_ascii_alphanumeric())
228 }
229
230 pub fn hamming_distance(s1: &str, s2: &str) -> Option<usize> {
232 if s1.len() != s2.len() {
233 return None;
234 }
235
236 let bytes1 = s1.as_bytes();
237 let bytes2 = s2.as_bytes();
238
239 let distance = bytes1
241 .iter()
242 .zip(bytes2.iter())
243 .filter(|(a, b)| a != b)
244 .count();
245
246 Some(distance)
247 }
248}