1use std::io::{self, Read, Write};
2
/// Maximum number of `IoSlice` entries submitted per `write_vectored` call
/// (an `IOV_MAX`-style cap; see `write_ioslices`).
const MAX_IOV: usize = 1024;

/// Streaming buffer size: 2 MiB per chunk.
/// NOTE(review): no consumer visible in this chunk — presumably the
/// chunked-streaming I/O path; confirm against the rest of the file.
const STREAM_BUF: usize = 2 * 1024 * 1024;

/// Upper bound (512 MiB) on a single whole-input allocation.
/// NOTE(review): consumer not visible here — presumably inputs larger than
/// this are processed in streaming mode instead; confirm against callers.
const SINGLE_ALLOC_LIMIT: usize = 512 * 1024 * 1024;
13
/// Bit-compaction lookup table (x86_64 only).
///
/// For each 8-bit `mask`, `COMPACT_LUT[mask]` lists, in ascending order, the
/// indices of the set bits of `mask`; unused trailing entries remain 0.
/// NOTE(review): no consumer is visible in this chunk — presumably used by a
/// movemask-based byte-compaction path elsewhere in the file; confirm.
#[cfg(target_arch = "x86_64")]
static COMPACT_LUT: [[u8; 8]; 256] = {
    // A `static` initializer is a const context, so plain `while` loops are
    // used here (iterators are not const-callable).
    let mut lut = [[0u8; 8]; 256];
    let mut mask: u16 = 0;
    while mask < 256 {
        let mut idx: usize = 0;
        let mut bit: u8 = 0;
        while bit < 8 {
            if (mask >> bit) & 1 != 0 {
                lut[mask as usize][idx] = bit;
                idx += 1;
            }
            bit += 1;
        }
        mask += 1;
    }
    lut
};
37
38#[inline]
41fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
42 if slices.is_empty() {
43 return Ok(());
44 }
45 for batch in slices.chunks(MAX_IOV) {
46 let total: usize = batch.iter().map(|s| s.len()).sum();
47 match writer.write_vectored(batch) {
48 Ok(n) if n >= total => continue,
49 Ok(mut written) => {
50 for slice in batch {
52 let slen = slice.len();
53 if written >= slen {
54 written -= slen;
55 continue;
56 }
57 if written > 0 {
58 writer.write_all(&slice[written..])?;
59 written = 0;
60 } else {
61 writer.write_all(slice)?;
62 }
63 }
64 }
65 Err(e) => return Err(e),
66 }
67 }
68 Ok(())
69}
70
/// Allocates a `Vec<u8>` of length `len` without zero-initializing it.
///
/// The contents are uninitialized: the caller must fully overwrite the
/// buffer before reading it (hence `#[allow(clippy::uninit_vec)]`).
#[inline]
#[allow(clippy::uninit_vec)]
fn alloc_uninit_vec(len: usize) -> Vec<u8> {
    let mut v = Vec::with_capacity(len);
    unsafe {
        // SAFETY: capacity is exactly `len`; bytes are uninitialized and the
        // caller contract requires writing them before any read.
        v.set_len(len);
    }
    #[cfg(target_os = "linux")]
    if len >= 2 * 1024 * 1024 {
        // For 2 MiB+ buffers, hint the kernel to back the range with
        // transparent huge pages to reduce TLB pressure. Best-effort: the
        // madvise return value is deliberately ignored.
        unsafe {
            libc::madvise(
                v.as_mut_ptr() as *mut libc::c_void,
                len,
                libc::MADV_HUGEPAGE,
            );
        }
    }
    v
}
94
/// Builds a 256-entry byte translation table mapping `set1[i]` to `set2[i]`.
///
/// Follows `tr` semantics: when `set2` is shorter than `set1`, its last byte
/// is repeated; when `set2` is empty, the extra `set1` bytes map to
/// themselves. Bytes not in `set1` are identity-mapped. Duplicates in `set1`
/// resolve to the last occurrence.
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    for (i, &from) in set1.iter().enumerate() {
        // Positional target, else the final byte of set2, else identity.
        let to = set2.get(i).or_else(|| set2.last()).copied().unwrap_or(from);
        table[from as usize] = to;
    }
    table
}
109
/// Builds a 256-bit membership bitset (32 bytes) from `chars`.
///
/// Bit `ch & 7` of byte `ch >> 3` is set for every byte in `chars`;
/// query with `is_member`.
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    chars.iter().fold([0u8; 32], |mut set, &ch| {
        set[(ch >> 3) as usize] |= 1 << (ch & 7);
        set
    })
}
119
/// Tests whether byte `ch` is present in a `build_member_set` bitset.
///
/// `ch >> 3` is at most 31, so indexing the `[u8; 32]` can never go out of
/// bounds; the compiler proves this and elides the bounds check, making the
/// previous `get_unchecked`-based `unsafe` block unnecessary.
#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    (set[ch as usize >> 3] & (1 << (ch & 7))) != 0
}
124
/// Cached SIMD capability level: 0 = not yet detected, 1 = scalar only,
/// 2 = SSSE3, 3 = AVX2. Written once (idempotently) by `get_simd_level`.
#[cfg(target_arch = "x86_64")]
static SIMD_LEVEL: std::sync::atomic::AtomicU8 = std::sync::atomic::AtomicU8::new(0);
129
130#[cfg(target_arch = "x86_64")]
131#[inline(always)]
132fn get_simd_level() -> u8 {
133 let level = SIMD_LEVEL.load(std::sync::atomic::Ordering::Relaxed);
134 if level != 0 {
135 return level;
136 }
137 let detected = if is_x86_feature_detected!("avx2") {
138 3
139 } else if is_x86_feature_detected!("ssse3") {
140 2
141 } else {
142 1
143 };
144 SIMD_LEVEL.store(detected, std::sync::atomic::Ordering::Relaxed);
145 detected
146}
147
/// Counts how many of the 256 table entries differ from the identity
/// mapping, i.e. how many distinct byte values the table actually remaps.
#[cfg(target_arch = "x86_64")]
#[inline]
fn count_non_identity(table: &[u8; 256]) -> usize {
    let mut count = 0;
    for (i, &v) in table.iter().enumerate() {
        if v != i as u8 {
            count += 1;
        }
    }
    count
}
158
159#[inline(always)]
165fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
166 #[cfg(target_arch = "x86_64")]
167 {
168 let level = get_simd_level();
169 if level >= 3 {
170 let non_id = count_non_identity(table);
175 if non_id > 0 && non_id <= 16 {
176 unsafe { translate_inplace_avx2_sparse(data, table) };
177 return;
178 }
179 unsafe { translate_inplace_avx2_table(data, table) };
180 return;
181 }
182 if level >= 2 {
183 unsafe { translate_inplace_ssse3_table(data, table) };
184 return;
185 }
186 }
187 translate_inplace_scalar(data, table);
188}
189
/// AVX2 in-place translation tuned for tables that remap only a few bytes.
///
/// Uses the same 16-row nibble-LUT scheme as `translate_inplace_avx2_table`,
/// but each 32-byte vector is written back only when it actually changed,
/// avoiding dirtying cache lines/pages under a mostly-identity table.
///
/// # Safety
/// The caller must ensure AVX2 is available on the running CPU.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_inplace_avx2_sparse(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // Split the 256-entry table into 16 rows of 16 bytes (one row per
        // high nibble), each broadcast to both 128-bit lanes for vpshufb.
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            // Exactly one high-nibble mask matches per byte, so OR-merging
            // the masked per-row lookups yields the translated byte.
            let mut result = _mm256_setzero_si256();
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            // Store only when some byte changed: testz returns 1 iff the
            // xor-diff is all zeros, so == 0 means "changed".
            let diff = _mm256_xor_si256(input, result);
            if _mm256_testz_si256(diff, diff) == 0 {
                _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            }
            i += 32;
        }

        // Scalar tail for the final < 32 bytes.
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}
262
/// Scalar in-place translation: maps every byte of `data` through `table`.
///
/// Processes fixed 8-byte chunks so the compiler keeps the loop unrolled
/// with no per-byte bounds checks, then handles the < 8-byte remainder.
#[cfg(not(target_arch = "aarch64"))]
#[inline(always)]
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    let mut chunks = data.chunks_exact_mut(8);
    for chunk in chunks.by_ref() {
        for byte in chunk.iter_mut() {
            *byte = table[*byte as usize];
        }
    }
    for byte in chunks.into_remainder() {
        *byte = table[*byte as usize];
    }
}
288
/// On aarch64 the "scalar" entry point forwards straight to the NEON table
/// kernel, since NEON is part of the baseline ISA.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    // SAFETY: NEON is mandatory on aarch64, so the target-feature fn is
    // always safe to call here.
    unsafe { translate_inplace_neon_table(data, table) };
}
296
/// NEON in-place byte translation via the 16-row nibble-LUT scheme: the
/// high nibble of each byte selects one of 16 table rows, `vqtbl1q_u8`
/// indexes the row by the low nibble, and per-row equality masks OR-merge
/// the results (exactly one row matches per byte).
///
/// # Safety
/// Requires NEON, which is baseline on aarch64.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_inplace_neon_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::aarch64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // Load the 256-entry table as 16 rows of 16 bytes.
        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            lut[h as usize] = vld1q_u8(table.as_ptr().add(base));
        }

        let lo_mask = vdupq_n_u8(0x0F);
        let mut i = 0;

        while i + 16 <= len {
            let input = vld1q_u8(ptr.add(i));
            let lo_nibble = vandq_u8(input, lo_mask);
            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);

            let mut result = vdupq_n_u8(0);
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = vdupq_n_u8($h);
                    let mask = vceqq_u8(hi_nibble, h_val);
                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            vst1q_u8(ptr.add(i), result);
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}
358
/// AVX2 in-place byte translation using a 16-row nibble lookup.
///
/// The 256-entry table is split into 16 rows of 16 bytes keyed by the high
/// nibble; each byte is resolved by `vpshufb` on its low nibble within the
/// row selected by a high-nibble equality mask. The main loop processes
/// 64 bytes per iteration (two vectors for more ILP), followed by 32-byte,
/// 16-byte, and scalar tails.
///
/// # Safety
/// The caller must ensure AVX2 is available on the running CPU.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // One 16-byte table row per high nibble, broadcast to both lanes so
        // vpshufb behaves identically in each 128-bit half.
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;

        // Main loop: two 32-byte vectors per iteration.
        while i + 64 <= len {
            let input0 = _mm256_loadu_si256(ptr.add(i) as *const _);
            let input1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);

            let lo0 = _mm256_and_si256(input0, lo_mask);
            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
            let lo1 = _mm256_and_si256(input1, lo_mask);
            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);

            let mut r0 = _mm256_setzero_si256();
            let mut r1 = _mm256_setzero_si256();

            // OR-merge masked row lookups; exactly one row matches per byte.
            macro_rules! do_nibble2 {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
                };
            }
            do_nibble2!(0);
            do_nibble2!(1);
            do_nibble2!(2);
            do_nibble2!(3);
            do_nibble2!(4);
            do_nibble2!(5);
            do_nibble2!(6);
            do_nibble2!(7);
            do_nibble2!(8);
            do_nibble2!(9);
            do_nibble2!(10);
            do_nibble2!(11);
            do_nibble2!(12);
            do_nibble2!(13);
            do_nibble2!(14);
            do_nibble2!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
            i += 64;
        }

        // 32-byte tail: single-vector version of the same scheme.
        if i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        // 16-byte tail: narrow the LUT to 128-bit (low lane holds the same
        // row data thanks to the broadcast) and run one SSE-width step.
        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);

            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, res);
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}
538
/// SSSE3 in-place byte translation: 128-bit version of the 16-row
/// nibble-LUT scheme (see `translate_inplace_avx2_table`), 16 bytes per
/// iteration plus a scalar tail.
///
/// # Safety
/// The caller must ensure SSSE3 is available on the running CPU.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // One 16-byte table row per high nibble.
        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            // OR-merge masked row lookups; exactly one row matches per byte.
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}
602
603#[inline(always)]
606fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
607 debug_assert!(dst.len() >= src.len());
608 #[cfg(target_arch = "x86_64")]
609 {
610 let level = get_simd_level();
611 if level >= 3 {
612 if dst.as_ptr() as usize & 31 == 0 {
614 unsafe { translate_to_avx2_table_nt(src, dst, table) };
615 } else {
616 unsafe { translate_to_avx2_table(src, dst, table) };
617 }
618 return;
619 }
620 if level >= 2 {
621 unsafe { translate_to_ssse3_table(src, dst, table) };
622 return;
623 }
624 }
625 translate_to_scalar(src, dst, table);
626}
627
/// Scalar out-of-place translation: writes `table[src[i]]` into `dst[i]`
/// for every byte of `src`. Callers guarantee `dst.len() >= src.len()`;
/// the zip naturally stops after `src.len()` bytes.
#[cfg(not(target_arch = "aarch64"))]
#[inline(always)]
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    for (out, &b) in dst.iter_mut().zip(src.iter()) {
        *out = table[b as usize];
    }
}
654
/// On aarch64 the "scalar" out-of-place path forwards to the NEON table
/// kernel, since NEON is part of the baseline ISA.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    // SAFETY: NEON is mandatory on aarch64.
    unsafe { translate_to_neon_table(src, dst, table) };
}
661
/// NEON out-of-place byte translation via the 16-row nibble-LUT scheme
/// (see `translate_inplace_neon_table`); reads from `src`, writes `dst`.
/// Callers guarantee `dst.len() >= src.len()`.
///
/// # Safety
/// Requires NEON (baseline on aarch64) and `dst.len() >= src.len()`.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_to_neon_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::aarch64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        // Load the 256-entry table as 16 rows of 16 bytes.
        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
        for h in 0u8..16 {
            lut[h as usize] = vld1q_u8(table.as_ptr().add((h as usize) * 16));
        }

        let lo_mask = vdupq_n_u8(0x0F);
        let mut i = 0;

        while i + 16 <= len {
            let input = vld1q_u8(sp.add(i));
            let lo_nibble = vandq_u8(input, lo_mask);
            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);

            let mut result = vdupq_n_u8(0);
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = vdupq_n_u8($h);
                    let mask = vceqq_u8(hi_nibble, h_val);
                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            vst1q_u8(dp.add(i), result);
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}
721
/// AVX2 out-of-place byte translation using the 16-row nibble-LUT scheme
/// (see `translate_inplace_avx2_table` for the algorithm). Reads from
/// `src`, writes `dst`; callers guarantee `dst.len() >= src.len()`.
///
/// # Safety
/// The caller must ensure AVX2 is available and `dst.len() >= src.len()`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        // One 16-byte table row per high nibble, broadcast to both lanes.
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;

        // Main loop: two 32-byte vectors per iteration.
        while i + 64 <= len {
            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);

            let lo0 = _mm256_and_si256(input0, lo_mask);
            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
            let lo1 = _mm256_and_si256(input1, lo_mask);
            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);

            let mut r0 = _mm256_setzero_si256();
            let mut r1 = _mm256_setzero_si256();

            macro_rules! do_nibble2 {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
                };
            }
            do_nibble2!(0);
            do_nibble2!(1);
            do_nibble2!(2);
            do_nibble2!(3);
            do_nibble2!(4);
            do_nibble2!(5);
            do_nibble2!(6);
            do_nibble2!(7);
            do_nibble2!(8);
            do_nibble2!(9);
            do_nibble2!(10);
            do_nibble2!(11);
            do_nibble2!(12);
            do_nibble2!(13);
            do_nibble2!(14);
            do_nibble2!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        // 32-byte tail.
        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        // 16-byte tail at SSE width using the low lane of each LUT row.
        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}
877
/// AVX2 out-of-place translation with non-temporal (streaming) stores.
///
/// Same nibble-LUT algorithm as `translate_to_avx2_table`, but the 64- and
/// 32-byte loops use `_mm256_stream_si256`, bypassing the cache for large
/// destinations that won't be re-read soon. `i` stays a multiple of 32 in
/// those loops, so a 32-byte-aligned `dst` keeps every streaming store
/// aligned; the 16-byte and scalar tails use ordinary stores. A final
/// `sfence` makes the weakly-ordered streaming stores globally visible.
///
/// # Safety
/// Caller must ensure AVX2 is available, `dst.len() >= src.len()`, and that
/// `dst` is 32-byte aligned (checked by the `translate_to` dispatcher).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_to_avx2_table_nt(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        // One 16-byte table row per high nibble, broadcast to both lanes.
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);
        let mut i = 0;

        // Main loop: two 32-byte vectors per iteration, streamed to dst.
        while i + 64 <= len {
            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);

            let lo0 = _mm256_and_si256(input0, lo_mask);
            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
            let lo1 = _mm256_and_si256(input1, lo_mask);
            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);

            let mut r0 = _mm256_setzero_si256();
            let mut r1 = _mm256_setzero_si256();

            macro_rules! do_nibble2 {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
                };
            }
            do_nibble2!(0);
            do_nibble2!(1);
            do_nibble2!(2);
            do_nibble2!(3);
            do_nibble2!(4);
            do_nibble2!(5);
            do_nibble2!(6);
            do_nibble2!(7);
            do_nibble2!(8);
            do_nibble2!(9);
            do_nibble2!(10);
            do_nibble2!(11);
            do_nibble2!(12);
            do_nibble2!(13);
            do_nibble2!(14);
            do_nibble2!(15);

            _mm256_stream_si256(dp.add(i) as *mut _, r0);
            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        // 32-byte tail, still streamed (i remains 32-aligned here).
        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_stream_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        // 16-byte tail with an ordinary (unaligned-safe) store.
        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }

        // Order the weakly-ordered streaming stores before returning.
        _mm_sfence();
    }
}
1036
/// SSSE3 out-of-place byte translation: 128-bit version of the 16-row
/// nibble-LUT scheme, 16 bytes per iteration plus a scalar tail.
///
/// # Safety
/// The caller must ensure SSSE3 is available and `dst.len() >= src.len()`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        // One 16-byte table row per high nibble.
        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}
1100
/// Detects whether `table` remaps exactly one contiguous byte range
/// `lo..=hi` by a single constant offset (e.g. lowercase → uppercase is
/// `b'a'..=b'z'` with offset -32) and is identity everywhere else.
///
/// Returns `Some((lo, hi, offset))` on a match, `None` otherwise
/// (including for a pure identity table).
#[inline]
fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
    // (lo, hi, offset) of the contiguous run seen so far.
    let mut run: Option<(u8, u8, i16)> = None;
    for (i, &v) in table.iter().enumerate() {
        if v == i as u8 {
            continue; // identity entry
        }
        let diff = v as i16 - i as i16;
        run = match run {
            None => Some((i as u8, i as u8, diff)),
            // Extend the run only if contiguous with the same offset.
            Some((lo, hi, off)) if diff == off && i as u8 == hi.wrapping_add(1) => {
                Some((lo, i as u8, off))
            }
            Some(_) => return None,
        };
    }
    run.map(|(lo, hi, off)| (lo, hi, off as i8))
}
1135
/// Detects whether `table` maps exactly one contiguous byte range `lo..=hi`
/// to a single constant byte (e.g. squashing `b'0'..=b'9'` to `b'#'`) and
/// is identity everywhere else.
///
/// Returns `Some((lo, hi, replacement))` on a match, `None` otherwise
/// (including for a pure identity table).
#[inline]
fn detect_range_to_constant(table: &[u8; 256]) -> Option<(u8, u8, u8)> {
    // (lo, hi, replacement) of the contiguous run seen so far.
    let mut run: Option<(u8, u8, u8)> = None;
    for (i, &v) in table.iter().enumerate() {
        if v == i as u8 {
            continue; // identity entry
        }
        run = match run {
            None => Some((i as u8, i as u8, v)),
            // Extend the run only if contiguous with the same target byte.
            Some((lo, hi, repl)) if v == repl && i as u8 == hi.wrapping_add(1) => {
                Some((lo, i as u8, repl))
            }
            Some(_) => return None,
        };
    }
    run
}
1166
/// Replaces every byte in `lo..=hi` with `replacement`, in place, choosing
/// AVX2 when detected and otherwise SSE2 (baseline on x86_64).
#[cfg(target_arch = "x86_64")]
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    if get_simd_level() >= 3 {
        // SAFETY: level >= 3 means AVX2 was detected at runtime.
        unsafe { translate_range_to_constant_avx2_inplace(data, lo, hi, replacement) };
    } else {
        // SAFETY: SSE2 is part of the x86_64 baseline ISA.
        unsafe { translate_range_to_constant_sse2_inplace(data, lo, hi, replacement) };
    }
}
1179
/// AVX2 kernel: replaces every byte in `lo..=hi` with `replacement`.
///
/// The unsigned range test is done with one signed compare: adding
/// `0x80 - lo` (wrapping) maps `lo..=hi` onto the signed interval
/// `[-128, -128 + (hi - lo)]`, so a byte is in range iff its biased value
/// is <= `threshold` (i.e. NOT greater-than); the `cmpeq` with zero inverts
/// the greater-than mask into an in-range mask for `blendv`.
///
/// # Safety
/// Caller must ensure AVX2 is available and `lo <= hi`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_to_constant_avx2_inplace(
    data: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm256_set1_epi8(replacement as i8);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        // Main loop: two 32-byte vectors per iteration.
        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
            i += 64;
        }

        // 32-byte tail.
        if i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let result = _mm256_blendv_epi8(input, repl_v, in_range);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        // 16-byte tail at SSE width (SSE4.1 blendv — implied by AVX2).
        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let repl_v128 = _mm_set1_epi8(replacement as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            let result = _mm_blendv_epi8(input, repl_v128, in_range);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail.
        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}
1251
/// SSE2 kernel: replaces every byte in `lo..=hi` with `replacement`.
///
/// Same signed-bias range test as the AVX2 variant (see
/// `translate_range_to_constant_avx2_inplace`); SSE2 has no `blendv`, so
/// the select is composed from and / andnot / or.
///
/// # Safety
/// SSE2 is baseline on x86_64; caller must ensure `lo <= hi`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_to_constant_sse2_inplace(
    data: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm_set1_epi8(replacement as i8);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            // Branch-free select: replacement where in_range, input elsewhere.
            let result = _mm_or_si128(
                _mm_and_si128(in_range, repl_v),
                _mm_andnot_si128(in_range, input),
            );
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail.
        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}
1295
/// aarch64 dispatch: replaces every byte in `[lo, hi]` with `replacement`,
/// in place. No runtime detection needed on this architecture.
#[cfg(target_arch = "aarch64")]
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    // SAFETY: NEON is a mandatory feature of the aarch64 baseline.
    unsafe {
        translate_range_to_constant_neon_inplace(data, lo, hi, replacement);
    }
}
1300
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
/// NEON, in place: replaces every byte in `[lo, hi]` with `replacement`.
///
/// # Safety
/// Caller must ensure NEON is available (always true on aarch64).
unsafe fn translate_range_to_constant_neon_inplace(
    data: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::aarch64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();
        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let repl_v = vdupq_n_u8(replacement);
        let mut i = 0;

        // Main loop: two 16-byte vectors per iteration. NEON has unsigned
        // byte compares, so membership is simply (b >= lo) & (b <= hi).
        while i + 32 <= len {
            let in0 = vld1q_u8(ptr.add(i));
            let in1 = vld1q_u8(ptr.add(i + 16));
            let ge0 = vcgeq_u8(in0, lo_v);
            let le0 = vcleq_u8(in0, hi_v);
            let mask0 = vandq_u8(ge0, le0);
            let ge1 = vcgeq_u8(in1, lo_v);
            let le1 = vcleq_u8(in1, hi_v);
            let mask1 = vandq_u8(ge1, le1);
            // vbslq picks `repl_v` where the mask is set, the input otherwise.
            vst1q_u8(ptr.add(i), vbslq_u8(mask0, repl_v, in0));
            vst1q_u8(ptr.add(i + 16), vbslq_u8(mask1, repl_v, in1));
            i += 32;
        }

        // At most one remaining 16-byte vector.
        if i + 16 <= len {
            let input = vld1q_u8(ptr.add(i));
            let ge = vcgeq_u8(input, lo_v);
            let le = vcleq_u8(input, hi_v);
            let mask = vandq_u8(ge, le);
            vst1q_u8(ptr.add(i), vbslq_u8(mask, repl_v, input));
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}
1350
/// Portable scalar fallback: overwrites every byte inside `[lo, hi]` with
/// `replacement`, in place.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    data.iter_mut()
        .filter(|b| (lo..=hi).contains(&**b))
        .for_each(|b| *b = replacement);
}
1359
1360#[cfg(target_arch = "x86_64")]
1363fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1364 if get_simd_level() >= 3 {
1365 unsafe { translate_range_to_constant_avx2(src, dst, lo, hi, replacement) };
1366 } else {
1367 unsafe { translate_range_to_constant_sse2(src, dst, lo, hi, replacement) };
1368 }
1369}
1370
/// aarch64 dispatch: copies `src` into `dst`, mapping bytes in `[lo, hi]`
/// to `replacement`. NEON is always present, so no runtime detection.
#[cfg(target_arch = "aarch64")]
fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    // SAFETY: NEON is a mandatory feature of the aarch64 baseline.
    unsafe {
        translate_range_to_constant_neon(src, dst, lo, hi, replacement);
    }
}
1375
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
/// NEON: copies `src` into `dst`, replacing every byte in `[lo, hi]` with
/// `replacement`.
///
/// # Safety
/// Caller must ensure NEON is available (always true on aarch64) and that
/// `dst.len() >= src.len()` — writes go through raw pointers up to
/// `src.len()` bytes into `dst`.
unsafe fn translate_range_to_constant_neon(
    src: &[u8],
    dst: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::aarch64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let repl_v = vdupq_n_u8(replacement);
        let mut i = 0;

        // Two 16-byte vectors per iteration; mask = (b >= lo) & (b <= hi),
        // vbslq selects the replacement where the mask is set.
        while i + 32 <= len {
            let in0 = vld1q_u8(sp.add(i));
            let in1 = vld1q_u8(sp.add(i + 16));
            let mask0 = vandq_u8(vcgeq_u8(in0, lo_v), vcleq_u8(in0, hi_v));
            let mask1 = vandq_u8(vcgeq_u8(in1, lo_v), vcleq_u8(in1, hi_v));
            vst1q_u8(dp.add(i), vbslq_u8(mask0, repl_v, in0));
            vst1q_u8(dp.add(i + 16), vbslq_u8(mask1, repl_v, in1));
            i += 32;
        }

        // At most one remaining 16-byte vector.
        if i + 16 <= len {
            let input = vld1q_u8(sp.add(i));
            let mask = vandq_u8(vcgeq_u8(input, lo_v), vcleq_u8(input, hi_v));
            vst1q_u8(dp.add(i), vbslq_u8(mask, repl_v, input));
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}
1420
/// Portable scalar fallback: copies `src` into `dst`, replacing every byte
/// in `[lo, hi]` with `replacement`.
///
/// The previous version used `get_unchecked_mut(i)` for every `i < src.len()`
/// with no guarantee that `dst` was that long — undefined behavior for a
/// short `dst`. The safe zip below stops at the shorter slice and compiles
/// without per-element bounds checks, since both iterator lengths are known.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    for (out, &b) in dst.iter_mut().zip(src) {
        *out = if b >= lo && b <= hi { replacement } else { b };
    }
}
1429
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
/// AVX2: copies `src` into `dst`, replacing every byte in `[lo, hi]` with
/// `replacement`.
///
/// # Safety
/// Requires AVX2 at runtime, `lo <= hi` (`hi - lo` underflows otherwise),
/// and `dst.len() >= src.len()` — writes go through raw pointers.
unsafe fn translate_range_to_constant_avx2(
    src: &[u8],
    dst: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;
    unsafe {
        let range = hi - lo;
        // Signed-compare trick: bias [lo, hi] onto the bottom of the i8
        // range so one signed greater-than classifies membership.
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm256_set1_epi8(replacement as i8);
        let zero = _mm256_setzero_si256();
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;
        // Main loop: two 32-byte vectors per iteration.
        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
            // blendv picks `repl_v` where the in-range mask is set.
            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }
        // At most one remaining 32-byte vector.
        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let result = _mm256_blendv_epi8(input, repl_v, in_range);
            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }
        // Scalar tail for the final < 32 bytes.
        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}
1481
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
/// SSE2: copies `src` into `dst`, replacing every byte in `[lo, hi]` with
/// `replacement`.
///
/// # Safety
/// Requires SSE2 (always true on x86_64), `lo <= hi`, and
/// `dst.len() >= src.len()` — writes go through raw pointers.
unsafe fn translate_range_to_constant_sse2(
    src: &[u8],
    dst: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;
    unsafe {
        let range = hi - lo;
        // Signed-compare trick: bias [lo, hi] onto the bottom of the i8
        // range so one signed greater-than classifies membership.
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm_set1_epi8(replacement as i8);
        let zero = _mm_setzero_si128();
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            // SSE2 lacks a byte blend; emulate blendv with AND/ANDNOT/OR.
            let result = _mm_or_si128(
                _mm_and_si128(in_range, repl_v),
                _mm_andnot_si128(in_range, input),
            );
            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }
        // Scalar tail for the final < 16 bytes.
        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}
1521
#[cfg(target_arch = "x86_64")]
/// x86_64 dispatch for the range+offset translate: AVX2 when the runtime
/// SIMD level allows (>= 3), otherwise SSE2. If `dst` is 32-byte aligned the
/// AVX2 path uses non-temporal (streaming) stores.
// NOTE(review): the NT path is chosen purely on alignment, not on length;
// streaming stores generally pay off only for buffers larger than the cache.
// Consider adding a size threshold — TODO confirm against callers.
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if get_simd_level() >= 3 {
        // `_mm256_stream_si256` requires 32-byte-aligned destinations.
        if dst.as_ptr() as usize & 31 == 0 {
            unsafe { translate_range_avx2_nt(src, dst, lo, hi, offset) };
        } else {
            unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
        }
    } else {
        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
    }
}
1541
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
/// AVX2: copies `src` into `dst`, adding `offset` (wrapping) to every byte
/// in `[lo, hi]`.
///
/// # Safety
/// Requires AVX2 at runtime, `lo <= hi` (`hi - lo` underflows otherwise),
/// and `dst.len() >= src.len()` — writes go through raw pointers.
unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Signed-compare trick: bias [lo, hi] onto the bottom of the i8
        // range so one signed greater-than classifies membership.
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;

        // Main loop: two 32-byte vectors per iteration. The offset is ANDed
        // with the membership mask, so out-of-range lanes add zero.
        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let m0 = _mm256_cmpeq_epi8(gt0, zero);
            let m1 = _mm256_cmpeq_epi8(gt1, zero);
            let om0 = _mm256_and_si256(m0, offset_v);
            let om1 = _mm256_and_si256(m1, offset_v);
            let r0 = _mm256_add_epi8(in0, om0);
            let r1 = _mm256_add_epi8(in1, om1);
            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        // At most one remaining 32-byte vector.
        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        // At most one remaining 16-byte block, processed with SSE.
        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
1624
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
/// AVX2 with non-temporal stores: copies `src` into `dst`, adding `offset`
/// (wrapping) to every byte in `[lo, hi]`. `_mm256_stream_si256` bypasses
/// the cache, which the dispatcher selects when `dst` is 32-byte aligned.
///
/// # Safety
/// Requires AVX2 at runtime, `lo <= hi`, `dst.len() >= src.len()`, and a
/// 32-byte-aligned `dst` — the streaming stores fault on unaligned addresses
/// (`i` advances in multiples of 32, preserving alignment).
unsafe fn translate_range_avx2_nt(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Signed-compare trick: bias [lo, hi] onto the bottom of the i8
        // range so one signed greater-than classifies membership.
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;

        // Main loop: two 32-byte vectors per iteration, streamed to `dst`.
        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let m0 = _mm256_cmpeq_epi8(gt0, zero);
            let m1 = _mm256_cmpeq_epi8(gt1, zero);
            let om0 = _mm256_and_si256(m0, offset_v);
            let om1 = _mm256_and_si256(m1, offset_v);
            let r0 = _mm256_add_epi8(in0, om0);
            let r1 = _mm256_add_epi8(in1, om1);
            _mm256_stream_si256(dp.add(i) as *mut _, r0);
            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        // At most one remaining 32-byte vector (still aligned, still streamed).
        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_stream_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        // At most one 16-byte block; uses a regular (unaligned) store since
        // the 32-byte streaming store no longer fits.
        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }

        // Make the non-temporal stores globally visible before returning.
        _mm_sfence();
    }
}
1710
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
/// SSE2: copies `src` into `dst`, adding `offset` (wrapping) to every byte
/// in `[lo, hi]`.
///
/// # Safety
/// Requires SSE2 (always true on x86_64), `lo <= hi`, and
/// `dst.len() >= src.len()` — writes go through raw pointers.
unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Signed-compare trick: bias [lo, hi] onto the bottom of the i8
        // range so one signed greater-than classifies membership.
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let mut i = 0;

        // The offset is ANDed with the membership mask, so out-of-range
        // lanes add zero.
        while i + 16 <= len {
            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
1748
/// aarch64 dispatch for the range+offset translate. NEON is always present,
/// so no runtime detection is required.
#[cfg(target_arch = "aarch64")]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    // SAFETY: NEON is a mandatory feature of the aarch64 baseline.
    unsafe {
        translate_range_neon(src, dst, lo, hi, offset);
    }
}
1755
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
/// NEON: copies `src` into `dst`, adding `offset` (wrapping) to every byte
/// in `[lo, hi]`.
///
/// # Safety
/// Caller must ensure NEON is available (always true on aarch64) and that
/// `dst.len() >= src.len()` — writes go through raw pointers.
unsafe fn translate_range_neon(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::aarch64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let offset_v = vdupq_n_s8(offset);
        let mut i = 0;

        // Two 16-byte vectors per iteration. The offset is ANDed with the
        // membership mask, so out-of-range lanes add zero; byte addition
        // wraps, matching the scalar `wrapping_add`.
        while i + 32 <= len {
            let in0 = vld1q_u8(sp.add(i));
            let in1 = vld1q_u8(sp.add(i + 16));
            let ge0 = vcgeq_u8(in0, lo_v);
            let le0 = vcleq_u8(in0, hi_v);
            let mask0 = vandq_u8(ge0, le0);
            let ge1 = vcgeq_u8(in1, lo_v);
            let le1 = vcleq_u8(in1, hi_v);
            let mask1 = vandq_u8(ge1, le1);
            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
            let r0 = vaddq_u8(in0, off0);
            let r1 = vaddq_u8(in1, off1);
            vst1q_u8(dp.add(i), r0);
            vst1q_u8(dp.add(i + 16), r1);
            i += 32;
        }

        // At most one remaining 16-byte vector.
        if i + 16 <= len {
            let input = vld1q_u8(sp.add(i));
            let ge = vcgeq_u8(input, lo_v);
            let le = vcleq_u8(input, hi_v);
            let mask = vandq_u8(ge, le);
            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
            vst1q_u8(dp.add(i), vaddq_u8(input, off));
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
1812
/// Portable scalar fallback: copies `src` into `dst`, adding `offset`
/// (wrapping) to every byte in `[lo, hi]`.
///
/// `b.wrapping_sub(lo) <= hi - lo` is a single-compare membership test that
/// is correct for any `lo <= hi`. The previous version wrote through raw
/// pointers with a hand-unrolled loop and no check that `dst` was at least
/// `src.len()` long — undefined behavior for a short `dst`. The safe zip
/// below stops at the shorter slice, cannot access out of bounds, and leaves
/// unrolling to the optimizer.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    let offset_u8 = offset as u8;
    let range = hi.wrapping_sub(lo);
    for (out, &b) in dst.iter_mut().zip(src) {
        *out = if b.wrapping_sub(lo) <= range {
            b.wrapping_add(offset_u8)
        } else {
            b
        };
    }
}
1857
1858#[cfg(target_arch = "x86_64")]
1866fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1867 if get_simd_level() >= 3 {
1868 unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
1869 } else {
1870 unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
1871 }
1872}
1873
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
/// AVX2, in place: adds `offset` (wrapping) to every byte in `[lo, hi]`.
///
/// # Safety
/// Requires AVX2 at runtime and `lo <= hi` (`hi - lo` underflows otherwise).
unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Signed-compare trick: bias [lo, hi] onto the bottom of the i8
        // range so one signed greater-than classifies membership.
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        // Main loop: two 32-byte vectors per iteration. The offset is ANDed
        // with the membership mask, so out-of-range lanes add zero.
        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let m0 = _mm256_cmpeq_epi8(gt0, zero);
            let m1 = _mm256_cmpeq_epi8(gt1, zero);
            let om0 = _mm256_and_si256(m0, offset_v);
            let om1 = _mm256_and_si256(m1, offset_v);
            let r0 = _mm256_add_epi8(in0, om0);
            let r1 = _mm256_add_epi8(in1, om1);
            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
            i += 64;
        }

        // At most one remaining 32-byte vector.
        if i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        // At most one remaining 16-byte block, processed with SSE.
        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
1948
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
/// SSE2, in place: adds `offset` (wrapping) to every byte in `[lo, hi]`.
///
/// # Safety
/// Requires SSE2 (always true on x86_64) and `lo <= hi`.
unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Signed-compare trick: bias [lo, hi] onto the bottom of the i8
        // range so one signed greater-than classifies membership.
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        // The offset is ANDed with the membership mask, so out-of-range
        // lanes add zero.
        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
1987
/// aarch64 dispatch: in-place range+offset translate. NEON is always
/// present, so no runtime detection is required.
#[cfg(target_arch = "aarch64")]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    // SAFETY: NEON is a mandatory feature of the aarch64 baseline.
    unsafe {
        translate_range_neon_inplace(data, lo, hi, offset);
    }
}
1992
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
/// NEON, in place: adds `offset` (wrapping) to every byte in `[lo, hi]`.
///
/// # Safety
/// Caller must ensure NEON is available (always true on aarch64).
unsafe fn translate_range_neon_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::aarch64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();
        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let offset_v = vdupq_n_s8(offset);
        let mut i = 0;

        // Two 16-byte vectors per iteration. The offset is ANDed with the
        // membership mask, so out-of-range lanes add zero; byte addition
        // wraps, matching the scalar `wrapping_add`.
        while i + 32 <= len {
            let in0 = vld1q_u8(ptr.add(i));
            let in1 = vld1q_u8(ptr.add(i + 16));
            let ge0 = vcgeq_u8(in0, lo_v);
            let le0 = vcleq_u8(in0, hi_v);
            let mask0 = vandq_u8(ge0, le0);
            let ge1 = vcgeq_u8(in1, lo_v);
            let le1 = vcleq_u8(in1, hi_v);
            let mask1 = vandq_u8(ge1, le1);
            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
            vst1q_u8(ptr.add(i), vaddq_u8(in0, off0));
            vst1q_u8(ptr.add(i + 16), vaddq_u8(in1, off1));
            i += 32;
        }

        // At most one remaining 16-byte vector.
        if i + 16 <= len {
            let input = vld1q_u8(ptr.add(i));
            let ge = vcgeq_u8(input, lo_v);
            let le = vcleq_u8(input, hi_v);
            let mask = vandq_u8(ge, le);
            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
            vst1q_u8(ptr.add(i), vaddq_u8(input, off));
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            let b = *ptr.add(i);
            if b >= lo && b <= hi {
                *ptr.add(i) = b.wrapping_add(offset as u8);
            }
            i += 1;
        }
    }
}
2041
/// Portable scalar fallback: adds `offset` (wrapping) to every byte in
/// `[lo, hi]`, in place. `b - lo <= hi - lo` (all wrapping) is a
/// single-compare membership test, valid for any `lo <= hi`.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    let span = hi.wrapping_sub(lo);
    let delta = offset as u8;
    for byte in data.iter_mut() {
        if byte.wrapping_sub(lo) <= span {
            *byte = byte.wrapping_add(delta);
        }
    }
}
2052
#[inline]
/// Returns `Some((lo, hi))` when the byte set of `chars` is exactly the
/// contiguous range `lo..=hi`, enabling the fast range-deletion paths;
/// `None` otherwise (including for empty input).
///
/// The previous version compared `hi - lo + 1` against `chars.len()`, which
/// miscounts duplicated inputs: e.g. `[5, 5, 7]` has span 3 and length 3 and
/// was reported as the range `5..=7` even though byte 6 is not in the set.
/// We now verify actual membership of every value in `lo..=hi`.
fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
    if chars.is_empty() {
        return None;
    }
    // One pass: record membership and track the extremes.
    let mut seen = [false; 256];
    let mut lo = chars[0];
    let mut hi = chars[0];
    for &c in chars {
        seen[c as usize] = true;
        if c < lo {
            lo = c;
        }
        if c > hi {
            hi = c;
        }
    }
    // Every byte inside [lo, hi] must actually occur (values outside the
    // extremes cannot occur by construction).
    if (lo..=hi).all(|c| seen[c as usize]) {
        Some((lo, hi))
    } else {
        None
    }
}
2085
2086#[cfg(target_arch = "x86_64")]
2090fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2091 if get_simd_level() >= 3 {
2092 unsafe { delete_range_avx2(src, dst, lo, hi) }
2093 } else {
2094 unsafe { delete_range_sse2(src, dst, lo, hi) }
2095 }
2096}
2097
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
/// AVX2 range deletion: copies the bytes of `src` that are NOT in `[lo, hi]`
/// into `dst`, returning the number of bytes kept.
///
/// # Safety
/// Requires AVX2 at runtime (which on real hardware implies the SSSE3 used
/// by `compact_8bytes_simd`), `lo <= hi`, and `dst.len() >= src.len()`: the
/// compaction may write scratch bytes up to index `src.len() - 1` even when
/// fewer bytes are kept.
unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Signed-compare trick: bias [lo, hi] onto the bottom of the i8
        // range so one signed greater-than classifies membership.
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            // One bit per lane; a set bit means "keep this byte".
            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);

            if keep_mask == 0xFFFFFFFF {
                // Nothing to delete in this vector: bulk copy.
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
                wp += 32;
            } else if keep_mask != 0 {
                // Compact 8 bytes at a time via the pshufb LUT; each group
                // advances the write cursor by its popcount.
                let m0 = keep_mask as u8;
                let m1 = (keep_mask >> 8) as u8;
                let m2 = (keep_mask >> 16) as u8;
                let m3 = (keep_mask >> 24) as u8;

                if m0 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
                } else if m0 != 0 {
                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
                }
                let c0 = m0.count_ones() as usize;

                if m1 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
                } else if m1 != 0 {
                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
                }
                let c1 = m1.count_ones() as usize;

                if m2 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
                } else if m2 != 0 {
                    compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
                }
                let c2 = m2.count_ones() as usize;

                if m3 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
                } else if m3 != 0 {
                    compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
                }
                let c3 = m3.count_ones() as usize;
                wp += c0 + c1 + c2 + c3;
            }
            // keep_mask == 0: the whole vector is deleted; nothing written.
            ri += 32;
        }

        // At most one remaining 16-byte block, processed with SSE.
        if ri + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            if keep_mask == 0xFFFF {
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
                wp += 16;
            } else if keep_mask != 0 {
                let m0 = keep_mask as u8;
                let m1 = (keep_mask >> 8) as u8;
                if m0 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
                } else if m0 != 0 {
                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
                }
                let c0 = m0.count_ones() as usize;
                if m1 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
                } else if m1 != 0 {
                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
                }
                wp += c0 + m1.count_ones() as usize;
            }
            ri += 16;
        }

        // Scalar tail: branch-free — always store, advance only on keep.
        while ri < len {
            let b = *sp.add(ri);
            *dp.add(wp) = b;
            wp += (b < lo || b > hi) as usize;
            ri += 1;
        }

        wp
    }
}
2217
#[cfg(target_arch = "x86_64")]
#[inline(always)]
/// Scalar 8-byte compaction: gathers the bytes of `src[0..8]` whose bit is
/// set in `mask` to the front of `dst`, using the precomputed index table.
///
/// Always writes all 8 bytes of `dst` — unused LUT slots are 0, so the tail
/// is filled with copies of `src[0]`. Only the first `mask.count_ones()`
/// bytes are meaningful; the caller advances its write cursor by that count.
///
/// # Safety
/// `src` must be readable for 8 bytes and `dst` writable for 8 bytes.
unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
    unsafe {
        let idx = COMPACT_LUT.get_unchecked(mask as usize);
        *dst = *src.add(*idx.get_unchecked(0) as usize);
        *dst.add(1) = *src.add(*idx.get_unchecked(1) as usize);
        *dst.add(2) = *src.add(*idx.get_unchecked(2) as usize);
        *dst.add(3) = *src.add(*idx.get_unchecked(3) as usize);
        *dst.add(4) = *src.add(*idx.get_unchecked(4) as usize);
        *dst.add(5) = *src.add(*idx.get_unchecked(5) as usize);
        *dst.add(6) = *src.add(*idx.get_unchecked(6) as usize);
        *dst.add(7) = *src.add(*idx.get_unchecked(7) as usize);
    }
}
2240
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
#[inline]
/// SSSE3 8-byte compaction: like `compact_8bytes`, but uses the LUT row as a
/// `pshufb` shuffle control, gathering the kept bytes in one instruction.
/// Always writes all 8 bytes of `dst`; only the first `mask.count_ones()`
/// are meaningful.
///
/// # Safety
/// Requires SSSE3 at runtime; `src` must be readable and `dst` writable for
/// 8 bytes each.
unsafe fn compact_8bytes_simd(src: *const u8, dst: *mut u8, mask: u8) {
    use std::arch::x86_64::*;
    unsafe {
        let src_v = _mm_loadl_epi64(src as *const _);
        let shuf = _mm_loadl_epi64(COMPACT_LUT.get_unchecked(mask as usize).as_ptr() as *const _);
        let out_v = _mm_shuffle_epi8(src_v, shuf);
        _mm_storel_epi64(dst as *mut _, out_v);
    }
}
2257
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
/// SSE2 range deletion: copies the bytes of `src` that are NOT in `[lo, hi]`
/// into `dst`, returning the number of bytes kept. Uses the scalar
/// `compact_8bytes` (not the SSSE3 variant) since only SSE2 is assumed.
///
/// # Safety
/// Requires SSE2 (always true on x86_64), `lo <= hi`, and
/// `dst.len() >= src.len()`: the compaction may write scratch bytes up to
/// index `src.len() - 1` even when fewer bytes are kept.
unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Signed-compare trick: bias [lo, hi] onto the bottom of the i8
        // range so one signed greater-than classifies membership.
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 16 <= len {
            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            // One bit per lane; a set bit means "keep this byte".
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            if keep_mask == 0xFFFF {
                // Nothing to delete in this vector: bulk copy.
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
                wp += 16;
            } else if keep_mask != 0 {
                // Compact 8 bytes at a time; each group advances the write
                // cursor by its popcount.
                let m0 = keep_mask as u8;
                let m1 = (keep_mask >> 8) as u8;
                if m0 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
                } else if m0 != 0 {
                    compact_8bytes(sp.add(ri), dp.add(wp), m0);
                }
                let c0 = m0.count_ones() as usize;
                if m1 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
                } else if m1 != 0 {
                    compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1);
                }
                wp += c0 + m1.count_ones() as usize;
            }
            ri += 16;
        }

        // Scalar tail: branch-free — always store, advance only on keep.
        while ri < len {
            let b = *sp.add(ri);
            *dp.add(wp) = b;
            wp += (b < lo || b > hi) as usize;
            ri += 1;
        }

        wp
    }
}
2316
/// Portable scalar fallback for range deletion: copies the bytes of `src`
/// that are NOT in `[lo, hi]` into `dst`, returning how many were kept.
///
/// Uses branch-free compaction: every byte is stored, but the write cursor
/// only advances for kept bytes, so deleted bytes are overwritten by the
/// next store. This writes into `dst` at indices up to `src.len() - 1`, so
/// `dst` must be at least as long as `src`. The previous version relied on
/// that silently (undefined behavior for a short `dst`); it is now asserted,
/// which also makes the remaining unchecked writes provably in bounds. The
/// manual 8x unroll is dropped — the loop body is tiny and dependency-light,
/// so unrolling is left to the optimizer.
#[cfg(not(target_arch = "x86_64"))]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    assert!(
        dst.len() >= src.len(),
        "delete_range_chunk: dst must be at least as long as src"
    );
    let mut wp: usize = 0;
    for &b in src {
        // SAFETY: `wp` never exceeds the number of bytes consumed so far,
        // which is < src.len() <= dst.len() (asserted above).
        unsafe {
            *dst.get_unchecked_mut(wp) = b;
        }
        wp += (b < lo || b > hi) as usize;
    }
    wp
}
2371
2372fn delete_range_streaming(
2377 lo: u8,
2378 hi: u8,
2379 reader: &mut impl Read,
2380 writer: &mut impl Write,
2381) -> io::Result<()> {
2382 let mut buf = alloc_uninit_vec(STREAM_BUF);
2383 loop {
2384 let n = read_full(reader, &mut buf)?;
2385 if n == 0 {
2386 break;
2387 }
2388 let wp = delete_range_inplace(&mut buf, n, lo, hi);
2389 if wp > 0 {
2390 writer.write_all(&buf[..wp])?;
2391 }
2392 }
2393 Ok(())
2394}
2395
#[inline]
/// In-place range deletion: compacts `buf[..n]` by removing every byte in
/// `[lo, hi]`, returning the new length (kept bytes end up in `buf[..ret]`).
///
/// Assumes `n <= buf.len()` — the pointer accesses below are unchecked.
/// TODO(review): confirm at call sites (`delete_range_streaming` passes the
/// byte count read into `buf`).
fn delete_range_inplace(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
    #[cfg(target_arch = "x86_64")]
    {
        let level = get_simd_level();
        if level >= 3 {
            return unsafe { delete_range_inplace_avx2(buf, n, lo, hi) };
        }
    }
    // Scalar fallback: branch-free in-place compaction. The write cursor
    // `wp` never passes the read cursor `ri`, so stores cannot clobber
    // unread input; within each iteration all 8 bytes are loaded BEFORE any
    // store, because a store at `wp` may land inside the current window.
    let ptr = buf.as_mut_ptr();
    let mut ri = 0;
    let mut wp = 0;
    unsafe {
        while ri + 8 <= n {
            let b0 = *ptr.add(ri);
            let b1 = *ptr.add(ri + 1);
            let b2 = *ptr.add(ri + 2);
            let b3 = *ptr.add(ri + 3);
            let b4 = *ptr.add(ri + 4);
            let b5 = *ptr.add(ri + 5);
            let b6 = *ptr.add(ri + 6);
            let b7 = *ptr.add(ri + 7);
            *ptr.add(wp) = b0;
            wp += (b0 < lo || b0 > hi) as usize;
            *ptr.add(wp) = b1;
            wp += (b1 < lo || b1 > hi) as usize;
            *ptr.add(wp) = b2;
            wp += (b2 < lo || b2 > hi) as usize;
            *ptr.add(wp) = b3;
            wp += (b3 < lo || b3 > hi) as usize;
            *ptr.add(wp) = b4;
            wp += (b4 < lo || b4 > hi) as usize;
            *ptr.add(wp) = b5;
            wp += (b5 < lo || b5 > hi) as usize;
            *ptr.add(wp) = b6;
            wp += (b6 < lo || b6 > hi) as usize;
            *ptr.add(wp) = b7;
            wp += (b7 < lo || b7 > hi) as usize;
            ri += 8;
        }
        // Tail: same store-then-conditionally-advance pattern, one byte at
        // a time.
        while ri < n {
            let b = *ptr.add(ri);
            *ptr.add(wp) = b;
            wp += (b < lo || b > hi) as usize;
            ri += 1;
        }
    }
    wp
}
2448
/// AVX2 in-place deletion of all bytes in the inclusive range [`lo`, `hi`]
/// from `buf[..n]`; returns the number of bytes kept.
///
/// Range membership is tested with the classic signed-compare bias trick:
/// adding `0x80 - lo` maps in-range bytes to signed values in
/// [-128, -128 + (hi - lo)], so a single signed `cmpgt` against
/// `0x80 + (hi - lo)` flags out-of-range bytes. Surviving bytes are
/// compacted 8 at a time via `COMPACT_LUT` + `pshufb`.
///
/// # Safety
/// Caller must ensure AVX2 is available and `n <= buf.len()`.
/// Assumes `lo <= hi` (callers derive the pair from a detected range).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_range_inplace_avx2(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        let ptr = buf.as_mut_ptr();
        let mut ri = 0; // read cursor
        let mut wp = 0; // write cursor; wp <= ri so in-place writes are safe

        while ri + 32 <= n {
            let input = _mm256_loadu_si256(ptr.add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            // gt is set for out-of-range (kept) bytes; invert to get a
            // bitmask of bytes that fall inside [lo, hi] (to delete).
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let del_mask = _mm256_movemask_epi8(in_range) as u32;

            if del_mask == 0 {
                // Nothing to delete in this lane: bulk move if compacted.
                if wp != ri {
                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 32);
                }
                wp += 32;
            } else if del_mask != 0xFFFFFFFF {
                // Mixed lane: compact each 8-byte group with the pshufb LUT.
                let keep_mask = !del_mask;
                let m0 = keep_mask as u8;
                let m1 = (keep_mask >> 8) as u8;
                let m2 = (keep_mask >> 16) as u8;
                let m3 = (keep_mask >> 24) as u8;

                let c0 = m0.count_ones() as usize;
                let c1 = m1.count_ones() as usize;
                let c2 = m2.count_ones() as usize;
                let c3 = m3.count_ones() as usize;

                if m0 == 0xFF {
                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 8);
                } else if m0 != 0 {
                    let src_v = _mm_loadl_epi64(ptr.add(ri) as *const _);
                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m0 as usize].as_ptr() as *const _);
                    let out_v = _mm_shuffle_epi8(src_v, shuf);
                    _mm_storel_epi64(ptr.add(wp) as *mut _, out_v);
                }

                if m1 == 0xFF {
                    std::ptr::copy(ptr.add(ri + 8), ptr.add(wp + c0), 8);
                } else if m1 != 0 {
                    let src_v = _mm_loadl_epi64(ptr.add(ri + 8) as *const _);
                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m1 as usize].as_ptr() as *const _);
                    let out_v = _mm_shuffle_epi8(src_v, shuf);
                    _mm_storel_epi64(ptr.add(wp + c0) as *mut _, out_v);
                }

                if m2 == 0xFF {
                    std::ptr::copy(ptr.add(ri + 16), ptr.add(wp + c0 + c1), 8);
                } else if m2 != 0 {
                    let src_v = _mm_loadl_epi64(ptr.add(ri + 16) as *const _);
                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m2 as usize].as_ptr() as *const _);
                    let out_v = _mm_shuffle_epi8(src_v, shuf);
                    _mm_storel_epi64(ptr.add(wp + c0 + c1) as *mut _, out_v);
                }

                if m3 == 0xFF {
                    std::ptr::copy(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), 8);
                } else if m3 != 0 {
                    let src_v = _mm_loadl_epi64(ptr.add(ri + 24) as *const _);
                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m3 as usize].as_ptr() as *const _);
                    let out_v = _mm_shuffle_epi8(src_v, shuf);
                    _mm_storel_epi64(ptr.add(wp + c0 + c1 + c2) as *mut _, out_v);
                }

                wp += c0 + c1 + c2 + c3;
            }
            // del_mask == all-ones: whole lane deleted, wp stays put.
            ri += 32;
        }

        // Scalar tail for the final < 32 bytes.
        while ri < n {
            let b = *ptr.add(ri);
            *ptr.add(wp) = b;
            wp += (b < lo || b > hi) as usize;
            ri += 1;
        }

        wp
    }
}
2552
2553pub fn translate(
2558 set1: &[u8],
2559 set2: &[u8],
2560 reader: &mut impl Read,
2561 writer: &mut impl Write,
2562) -> io::Result<()> {
2563 let table = build_translate_table(set1, set2);
2564
2565 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
2567 if is_identity {
2568 return passthrough_stream(reader, writer);
2569 }
2570
2571 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
2573 return translate_range_stream(lo, hi, offset, reader, writer);
2574 }
2575
2576 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
2579 return translate_range_to_constant_stream(lo, hi, replacement, reader, writer);
2580 }
2581
2582 let mut buf = alloc_uninit_vec(STREAM_BUF);
2586 loop {
2587 let n = read_full(reader, &mut buf)?;
2588 if n == 0 {
2589 break;
2590 }
2591 translate_and_write_table(&mut buf, n, &table, writer)?;
2592 }
2593 Ok(())
2594}
2595
2596#[inline]
2597fn translate_and_write_table(
2598 buf: &mut [u8],
2599 total: usize,
2600 table: &[u8; 256],
2601 writer: &mut impl Write,
2602) -> io::Result<()> {
2603 translate_inplace(&mut buf[..total], table);
2604 writer.write_all(&buf[..total])
2605}
2606
2607fn translate_range_stream(
2611 lo: u8,
2612 hi: u8,
2613 offset: i8,
2614 reader: &mut impl Read,
2615 writer: &mut impl Write,
2616) -> io::Result<()> {
2617 let mut buf = alloc_uninit_vec(STREAM_BUF);
2618 loop {
2619 let n = read_full(reader, &mut buf)?;
2620 if n == 0 {
2621 break;
2622 }
2623 translate_and_write_range(&mut buf, n, lo, hi, offset, writer)?;
2624 }
2625 Ok(())
2626}
2627
2628#[inline]
2629fn translate_and_write_range(
2630 buf: &mut [u8],
2631 total: usize,
2632 lo: u8,
2633 hi: u8,
2634 offset: i8,
2635 writer: &mut impl Write,
2636) -> io::Result<()> {
2637 translate_range_simd_inplace(&mut buf[..total], lo, hi, offset);
2638 writer.write_all(&buf[..total])
2639}
2640
2641fn translate_range_to_constant_stream(
2645 lo: u8,
2646 hi: u8,
2647 replacement: u8,
2648 reader: &mut impl Read,
2649 writer: &mut impl Write,
2650) -> io::Result<()> {
2651 let mut buf = alloc_uninit_vec(STREAM_BUF);
2652 loop {
2653 let n = read_full(reader, &mut buf)?;
2654 if n == 0 {
2655 break;
2656 }
2657 translate_and_write_range_const(&mut buf, n, lo, hi, replacement, writer)?;
2658 }
2659 Ok(())
2660}
2661
2662#[inline]
2663fn translate_and_write_range_const(
2664 buf: &mut [u8],
2665 total: usize,
2666 lo: u8,
2667 hi: u8,
2668 replacement: u8,
2669 writer: &mut impl Write,
2670) -> io::Result<()> {
2671 translate_range_to_constant_simd_inplace(&mut buf[..total], lo, hi, replacement);
2672 writer.write_all(&buf[..total])
2673}
2674
2675fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
2678 let mut buf = alloc_uninit_vec(STREAM_BUF);
2679 loop {
2680 let n = read_full(reader, &mut buf)?;
2681 if n == 0 {
2682 break;
2683 }
2684 writer.write_all(&buf[..n])?;
2685 }
2686 Ok(())
2687}
2688
/// Performs a single `read` call, transparently retrying on EINTR
/// (`ErrorKind::Interrupted`). Returns whatever the underlying reader
/// returns: 0 means end of stream.
#[inline]
fn read_once(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    loop {
        let attempt = reader.read(buf);
        match attempt {
            // Interrupted reads are not real errors; try again.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            other => return other,
        }
    }
}
2704
/// Reads until `buf` is completely full or the stream reaches EOF,
/// retrying on EINTR. Returns the number of bytes actually read, which is
/// less than `buf.len()` only at end of stream.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut filled = 0;
    loop {
        if filled == buf.len() {
            return Ok(filled);
        }
        match reader.read(&mut buf[filled..]) {
            Ok(0) => return Ok(filled), // EOF
            Ok(n) => filled += n,
            // Interrupted reads are retried, not surfaced.
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
            Err(e) => return Err(e),
        }
    }
}
2722
/// Streams `reader` to `writer`, translating `set1` -> `set2` and then
/// squeezing consecutive repeats of any byte in `set2` down to one
/// occurrence (tr's combined `-s` with translation).
///
/// When `set2` is effectively a single byte, dispatches to a specialized
/// substring-search implementation. Otherwise translation uses the same
/// range/constant SIMD fast paths as `translate`, detected once up front,
/// and squeezing is done chunk by chunk with `last_squeezed` carried across
/// chunk boundaries (256 = sentinel for "no active run").
pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    // set2 collapses to one distinct byte: use the single-char fast path.
    if set2.len() == 1 || (set2.len() > 1 && set2.iter().all(|&b| b == set2[0])) {
        let squeeze_ch = set2.last().copied().unwrap_or(0);
        return translate_squeeze_single_ch(&table, squeeze_ch, &squeeze_set, reader, writer);
    }

    // Detect SIMD-friendly table shapes once; the constant check is only
    // worth running when the offset check failed.
    let range_info = detect_range_offset(&table);
    let range_const_info = if range_info.is_none() {
        detect_range_to_constant(&table)
    } else {
        None
    };

    let mut buf = alloc_uninit_vec(STREAM_BUF);
    // 256 cannot equal any byte value, i.e. "no squeeze run in progress".
    let mut last_squeezed: u16 = 256;

    loop {
        // read_once (not read_full) is fine: squeeze state persists across
        // arbitrarily-sized chunks via last_squeezed.
        let n = read_once(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let wp = translate_squeeze_process(
            &mut buf,
            n,
            &table,
            &squeeze_set,
            range_info,
            range_const_info,
            &mut last_squeezed,
        );
        if wp > 0 {
            writer.write_all(&buf[..wp])?;
        }
    }
    Ok(())
}
2772
/// Translates `buf[..n]` in place (choosing the SIMD range path, the SIMD
/// constant path, or the general table path), then squeezes consecutive
/// repeats of bytes in `squeeze_set` in place; returns the compacted length.
///
/// `last_squeezed` carries the squeeze state across chunks: it holds the
/// last byte written while inside a squeezable run, or 256 otherwise.
#[inline]
fn translate_squeeze_process(
    buf: &mut [u8],
    n: usize,
    table: &[u8; 256],
    squeeze_set: &[u8; 32],
    range_info: Option<(u8, u8, i8)>,
    range_const_info: Option<(u8, u8, u8)>,
    last_squeezed: &mut u16,
) -> usize {
    // Phase 1: translation, using the fastest applicable strategy.
    if let Some((lo, hi, offset)) = range_info {
        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
    } else if let Some((lo, hi, replacement)) = range_const_info {
        translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
    } else {
        translate_inplace(&mut buf[..n], table);
    }
    // Phase 2: in-place squeeze. wp <= i always, so writes never clobber
    // unread input.
    let mut wp = 0;
    // SAFETY: all accesses are bounded by n <= buf.len(); wp <= i.
    unsafe {
        let ptr = buf.as_mut_ptr();
        let mut i = 0;
        while i + 8 <= n {
            // Emits one byte unless it repeats the current squeeze run.
            macro_rules! squeeze_byte {
                ($off:expr) => {
                    let b = *ptr.add(i + $off);
                    if is_member(squeeze_set, b) {
                        if *last_squeezed != b as u16 {
                            *last_squeezed = b as u16;
                            *ptr.add(wp) = b;
                            wp += 1;
                        }
                    } else {
                        *last_squeezed = 256;
                        *ptr.add(wp) = b;
                        wp += 1;
                    }
                };
            }
            squeeze_byte!(0);
            squeeze_byte!(1);
            squeeze_byte!(2);
            squeeze_byte!(3);
            squeeze_byte!(4);
            squeeze_byte!(5);
            squeeze_byte!(6);
            squeeze_byte!(7);
            i += 8;
        }
        // Scalar tail for the final < 8 bytes.
        while i < n {
            let b = *ptr.add(i);
            if is_member(squeeze_set, b) {
                if *last_squeezed == b as u16 {
                    i += 1;
                    continue;
                }
                *last_squeezed = b as u16;
            } else {
                *last_squeezed = 256;
            }
            *ptr.add(wp) = b;
            wp += 1;
            i += 1;
        }
    }
    wp
}
2841
/// Fast path of `translate_squeeze` for when the squeeze set is a single
/// byte: translate each chunk, then collapse runs of `squeeze_ch` using a
/// memmem search for the two-byte pattern `[ch, ch]` (a run exists iff the
/// pair occurs).
///
/// `was_squeeze_char` carries "last emitted byte was squeeze_ch" across
/// chunk boundaries so runs that straddle reads are still collapsed.
fn translate_squeeze_single_ch(
    table: &[u8; 256],
    squeeze_ch: u8,
    _squeeze_set: &[u8; 32],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    // Same one-time SIMD shape detection as the general path.
    let range_info = detect_range_offset(table);
    let range_const_info = if range_info.is_none() {
        detect_range_to_constant(table)
    } else {
        None
    };

    let pair = [squeeze_ch, squeeze_ch];
    let finder = memchr::memmem::Finder::new(&pair);
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    let mut was_squeeze_char = false;

    loop {
        let n = read_once(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let wp = translate_squeeze_single_process(
            &mut buf,
            n,
            table,
            squeeze_ch,
            &finder,
            range_info,
            range_const_info,
            &mut was_squeeze_char,
        );
        if wp > 0 {
            writer.write_all(&buf[..wp])?;
        }
    }
    Ok(())
}
2885
/// Translates `buf[..n]` in place, then collapses runs of `squeeze_ch` by
/// copying the gaps between pair occurrences; returns the compacted length.
///
/// The finder locates `[ch, ch]` pairs; everything up to and including the
/// first byte of a pair is kept, then the rest of the run is skipped.
/// `was_squeeze_char` is true on entry if the previous chunk ended in
/// `squeeze_ch`, and is set on exit for the next chunk.
#[inline]
fn translate_squeeze_single_process(
    buf: &mut [u8],
    n: usize,
    table: &[u8; 256],
    squeeze_ch: u8,
    finder: &memchr::memmem::Finder<'_>,
    range_info: Option<(u8, u8, i8)>,
    range_const_info: Option<(u8, u8, u8)>,
    was_squeeze_char: &mut bool,
) -> usize {
    // Phase 1: translation via the fastest applicable strategy.
    if let Some((lo, hi, offset)) = range_info {
        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
    } else if let Some((lo, hi, replacement)) = range_const_info {
        translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
    } else {
        translate_inplace(&mut buf[..n], table);
    }

    let mut i = 0;
    // A run may continue from the previous chunk: drop its leading repeats.
    if *was_squeeze_char {
        while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
            i += 1;
        }
        *was_squeeze_char = false;
        if i >= n {
            // Entire chunk was a continuation of the run.
            *was_squeeze_char = true;
            return 0;
        }
    }

    let ptr = buf.as_mut_ptr();
    let mut wp = 0usize;

    loop {
        match finder.find(&buf[i..n]) {
            Some(offset) => {
                // Keep up to and including the first byte of the pair...
                let seg_end = i + offset + 1;
                let gap = seg_end - i;
                if gap > 0 {
                    if wp != i {
                        // SAFETY: wp < i here, ranges may overlap -> copy.
                        unsafe {
                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
                        }
                    }
                    wp += gap;
                }
                // ...then skip the remainder of the run.
                i = seg_end;
                while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
                    i += 1;
                }
                if i >= n {
                    // Run may continue into the next chunk.
                    *was_squeeze_char = true;
                    break;
                }
            }
            None => {
                // No more pairs: keep the tail verbatim.
                let rem = n - i;
                if rem > 0 {
                    if wp != i {
                        unsafe {
                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
                        }
                    }
                    wp += rem;
                }
                // A single trailing squeeze_ch still arms the next chunk.
                *was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == squeeze_ch;
                break;
            }
        }
    }
    wp
}
2961
2962pub fn delete(
2963 delete_chars: &[u8],
2964 reader: &mut impl Read,
2965 writer: &mut impl Write,
2966) -> io::Result<()> {
2967 if delete_chars.len() == 1 {
2968 return delete_single_streaming(delete_chars[0], reader, writer);
2969 }
2970 if delete_chars.len() <= 3 {
2971 return delete_multi_streaming(delete_chars, reader, writer);
2972 }
2973
2974 if let Some((lo, hi)) = detect_delete_range(delete_chars) {
2978 return delete_range_streaming(lo, hi, reader, writer);
2979 }
2980
2981 let member = build_member_set(delete_chars);
2982 let mut buf = alloc_uninit_vec(STREAM_BUF);
2983 let mut outbuf = alloc_uninit_vec(STREAM_BUF);
2986
2987 loop {
2988 let n = read_full(reader, &mut buf)?;
2989 if n == 0 {
2990 break;
2991 }
2992 let wp = delete_bitset_dispatch(&buf[..n], &mut outbuf, &member);
2993 if wp > 0 {
2994 writer.write_all(&outbuf[..wp])?;
2995 }
2996 }
2997 Ok(())
2998}
2999
/// Copies `src` into `dst`, dropping bytes whose bit is set in the 256-bit
/// membership set `member`; returns the number of bytes kept. Dispatches to
/// the AVX2 implementation when available, otherwise the scalar one.
#[inline]
fn delete_bitset_dispatch(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
    #[cfg(target_arch = "x86_64")]
    {
        if get_simd_level() >= 3 {
            // SAFETY: level >= 3 means AVX2 is supported on this CPU.
            return unsafe { delete_bitset_avx2_stream(src, dst, member) };
        }
    }
    delete_bitset_scalar(src, dst, member)
}
3010
/// Scalar bitset deletion: copies `src` into `dst`, dropping every byte
/// that is a member of `member`; returns the number of bytes kept.
///
/// Branch-free store-then-conditionally-advance compaction, unrolled 8x.
/// `dst` must be at least `src.len()` bytes.
#[inline]
fn delete_bitset_scalar(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
    let n = src.len();
    let mut wp = 0;
    // SAFETY: reads bounded by n = src.len(); wp <= i at all times, so
    // writes stay in bounds provided dst.len() >= src.len().
    unsafe {
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;
        while i + 8 <= n {
            let b0 = *sp.add(i);
            let b1 = *sp.add(i + 1);
            let b2 = *sp.add(i + 2);
            let b3 = *sp.add(i + 3);
            let b4 = *sp.add(i + 4);
            let b5 = *sp.add(i + 5);
            let b6 = *sp.add(i + 6);
            let b7 = *sp.add(i + 7);
            // Store unconditionally; advance only for non-members (kept).
            *dp.add(wp) = b0;
            wp += !is_member(member, b0) as usize;
            *dp.add(wp) = b1;
            wp += !is_member(member, b1) as usize;
            *dp.add(wp) = b2;
            wp += !is_member(member, b2) as usize;
            *dp.add(wp) = b3;
            wp += !is_member(member, b3) as usize;
            *dp.add(wp) = b4;
            wp += !is_member(member, b4) as usize;
            *dp.add(wp) = b5;
            wp += !is_member(member, b5) as usize;
            *dp.add(wp) = b6;
            wp += !is_member(member, b6) as usize;
            *dp.add(wp) = b7;
            wp += !is_member(member, b7) as usize;
            i += 8;
        }
        // Scalar tail for the final < 8 bytes.
        while i < n {
            let b = *sp.add(i);
            *dp.add(wp) = b;
            wp += !is_member(member, b) as usize;
            i += 1;
        }
    }
    wp
}
3056
/// AVX2 bitset deletion: copies `src` into `dst`, dropping every byte that
/// is a member of the 256-bit set `member`; returns the number kept.
///
/// Membership is evaluated 32 bytes at a time: each input byte b indexes
/// the set as member[b >> 3] & (1 << (b & 7)), implemented with two pshufb
/// lookups (low/high 16 set bytes, selected by blendv on bit 4 of the byte
/// index) and a pshufb-based bit table. Kept bytes are compacted per
/// 8-byte group via `compact_8bytes_simd`.
///
/// # Safety
/// Caller must ensure AVX2 is available and `dst.len() >= src.len()`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_bitset_avx2_stream(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let n = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        let member_v = _mm256_loadu_si256(member.as_ptr() as *const _);

        let mask7 = _mm256_set1_epi8(7);
        let mask_0x1f = _mm256_set1_epi8(0x1F_u8 as i8);

        // pshufb table mapping bit position 0..7 -> single-bit byte mask,
        // replicated in both 128-bit lanes.
        let bit_table = _mm256_setr_epi8(
            1, 2, 4, 8, 16, 32, 64, -128i8, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, -128i8,
            0, 0, 0, 0, 0, 0, 0, 0,
        );

        while ri + 32 <= n {
            let input = _mm256_loadu_si256(sp.add(ri) as *const _);

            // byte_idx = b >> 3 (0..31); the 0x1F mask strips bits shifted
            // in from the neighboring byte by the 16-bit shift.
            let byte_idx = _mm256_and_si256(_mm256_srli_epi16(input, 3), mask_0x1f);
            let bit_pos = _mm256_and_si256(input, mask7);
            let bit_mask = _mm256_shuffle_epi8(bit_table, bit_pos);

            // Look up member[byte_idx]: pshufb only indexes 16 bytes, so
            // shuffle both halves of the set and blend on bit 4 of byte_idx
            // (moved to the sign bit for blendv by the left shift).
            let member_lo = _mm256_broadcastsi128_si256(_mm256_castsi256_si128(member_v));
            let member_hi = _mm256_broadcastsi128_si256(_mm256_extracti128_si256(member_v, 1));
            let lo_mask = _mm256_set1_epi8(0x0F);
            let idx_lo = _mm256_and_si256(byte_idx, lo_mask);
            let shuffled_lo = _mm256_shuffle_epi8(member_lo, idx_lo);
            let shuffled_hi = _mm256_shuffle_epi8(member_hi, idx_lo);
            let use_hi = _mm256_slli_epi16(byte_idx, 3); let member_byte = _mm256_blendv_epi8(shuffled_lo, shuffled_hi, use_hi);

            // Non-zero AND -> member (delete); keep_mask marks survivors.
            let test = _mm256_and_si256(member_byte, bit_mask);
            let is_zero = _mm256_cmpeq_epi8(test, _mm256_setzero_si256());
            let keep_mask = _mm256_movemask_epi8(is_zero) as u32;

            if keep_mask == 0xFFFFFFFF {
                // Nothing deleted: bulk copy the whole lane.
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
                wp += 32;
            } else if keep_mask != 0 {
                // Mixed lane: compact each 8-byte group.
                let m0 = keep_mask as u8;
                let m1 = (keep_mask >> 8) as u8;
                let m2 = (keep_mask >> 16) as u8;
                let m3 = (keep_mask >> 24) as u8;

                if m0 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
                } else if m0 != 0 {
                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
                }
                let c0 = m0.count_ones() as usize;

                if m1 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
                } else if m1 != 0 {
                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
                }
                let c1 = m1.count_ones() as usize;

                if m2 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
                } else if m2 != 0 {
                    compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
                }
                let c2 = m2.count_ones() as usize;

                if m3 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
                } else if m3 != 0 {
                    compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
                }
                let c3 = m3.count_ones() as usize;
                wp += c0 + c1 + c2 + c3;
            }
            // keep_mask == 0: whole lane deleted, wp stays put.
            ri += 32;
        }

        // Scalar tail for the final < 32 bytes.
        while ri < n {
            let b = *sp.add(ri);
            *dp.add(wp) = b;
            wp += !is_member(member, b) as usize;
            ri += 1;
        }

        wp
    }
}
3175
3176fn delete_single_streaming(
3177 ch: u8,
3178 reader: &mut impl Read,
3179 writer: &mut impl Write,
3180) -> io::Result<()> {
3181 let mut buf = alloc_uninit_vec(STREAM_BUF);
3182 loop {
3183 let n = read_full(reader, &mut buf)?;
3184 if n == 0 {
3185 break;
3186 }
3187 let wp = delete_single_inplace(&mut buf, n, ch);
3188 if wp > 0 {
3189 writer.write_all(&buf[..wp])?;
3190 }
3191 }
3192 Ok(())
3193}
3194
/// Removes every occurrence of `ch` from `buf[..n]` in place; returns the
/// compacted length.
///
/// Uses memchr to find each occurrence and bulk-moves the run of kept bytes
/// between occurrences, so runs with few deletions approach memmove speed.
#[inline]
fn delete_single_inplace(buf: &mut [u8], n: usize, ch: u8) -> usize {
    let mut wp = 0; // write cursor; wp <= i always
    let mut i = 0; // read cursor
    while i < n {
        match memchr::memchr(ch, &buf[i..n]) {
            Some(offset) => {
                // Move the kept run [i, i+offset) down to wp, if shifted.
                if offset > 0 {
                    if wp != i {
                        // SAFETY: both ranges lie in buf[..n]; wp < i, and
                        // ptr::copy handles the potential overlap.
                        unsafe {
                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), offset);
                        }
                    }
                    wp += offset;
                }
                // Skip the matched ch itself.
                i += offset + 1;
            }
            None => {
                // No more matches: move the final run and stop.
                let run_len = n - i;
                if run_len > 0 {
                    if wp != i {
                        unsafe {
                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), run_len);
                        }
                    }
                    wp += run_len;
                }
                break;
            }
        }
    }
    wp
}
3229
3230fn delete_multi_streaming(
3231 chars: &[u8],
3232 reader: &mut impl Read,
3233 writer: &mut impl Write,
3234) -> io::Result<()> {
3235 let mut buf = alloc_uninit_vec(STREAM_BUF);
3236 loop {
3237 let n = read_full(reader, &mut buf)?;
3238 if n == 0 {
3239 break;
3240 }
3241 let wp = delete_multi_inplace(&mut buf, n, chars);
3242 if wp > 0 {
3243 writer.write_all(&buf[..wp])?;
3244 }
3245 }
3246 Ok(())
3247}
3248
/// Removes every occurrence of the bytes in `chars` from `buf[..n]` in
/// place; returns the compacted length.
///
/// Expects `chars.len()` to be 2 or 3 (the memchr2/memchr3 fast path);
/// other lengths would panic on indexing. Kept runs between matches are
/// bulk-moved, same scheme as `delete_single_inplace`.
#[inline]
fn delete_multi_inplace(buf: &mut [u8], n: usize, chars: &[u8]) -> usize {
    let mut wp = 0; // write cursor; wp <= i always
    let mut i = 0; // read cursor
    while i < n {
        let found = if chars.len() == 2 {
            memchr::memchr2(chars[0], chars[1], &buf[i..n])
        } else {
            memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
        };
        match found {
            Some(offset) => {
                // Move the kept run [i, i+offset) down to wp, if shifted.
                if offset > 0 {
                    if wp != i {
                        // SAFETY: ranges lie in buf[..n]; wp < i; ptr::copy
                        // tolerates overlap.
                        unsafe {
                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), offset);
                        }
                    }
                    wp += offset;
                }
                // Skip the matched byte.
                i += offset + 1;
            }
            None => {
                // No more matches: move the final run and stop.
                let run_len = n - i;
                if run_len > 0 {
                    if wp != i {
                        unsafe {
                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), run_len);
                        }
                    }
                    wp += run_len;
                }
                break;
            }
        }
    }
    wp
}
3288
3289pub fn delete_squeeze(
3290 delete_chars: &[u8],
3291 squeeze_chars: &[u8],
3292 reader: &mut impl Read,
3293 writer: &mut impl Write,
3294) -> io::Result<()> {
3295 let delete_set = build_member_set(delete_chars);
3296 let squeeze_set = build_member_set(squeeze_chars);
3297 let mut buf = alloc_uninit_vec(STREAM_BUF);
3298 let mut last_squeezed: u16 = 256;
3299
3300 loop {
3301 let n = read_once(reader, &mut buf)?;
3302 if n == 0 {
3303 break;
3304 }
3305 let wp = delete_squeeze_inplace(&mut buf, n, &delete_set, &squeeze_set, &mut last_squeezed);
3306 if wp > 0 {
3307 writer.write_all(&buf[..wp])?;
3308 }
3309 }
3310 Ok(())
3311}
3312
/// Compacts `buf[..n]` in place: drops bytes in `delete_set`, then squeezes
/// consecutive repeats of bytes in `squeeze_set`; returns the new length.
///
/// `last_squeezed` holds the last emitted squeezable byte (or 256 when the
/// previous byte was not squeezable), carried across chunk boundaries.
#[inline]
fn delete_squeeze_inplace(
    buf: &mut [u8],
    n: usize,
    delete_set: &[u8; 32],
    squeeze_set: &[u8; 32],
    last_squeezed: &mut u16,
) -> usize {
    let mut wp = 0;
    // SAFETY: accesses bounded by n <= buf.len(); wp <= i so in-place
    // writes never clobber unread input.
    unsafe {
        let ptr = buf.as_mut_ptr();
        let mut i = 0;
        while i + 8 <= n {
            // Delete first; survivors are then squeeze-filtered.
            macro_rules! process_byte {
                ($off:expr) => {
                    let b = *ptr.add(i + $off);
                    if !is_member(delete_set, b) {
                        if is_member(squeeze_set, b) {
                            if *last_squeezed != b as u16 {
                                *last_squeezed = b as u16;
                                *ptr.add(wp) = b;
                                wp += 1;
                            }
                        } else {
                            *last_squeezed = 256;
                            *ptr.add(wp) = b;
                            wp += 1;
                        }
                    }
                };
            }
            process_byte!(0);
            process_byte!(1);
            process_byte!(2);
            process_byte!(3);
            process_byte!(4);
            process_byte!(5);
            process_byte!(6);
            process_byte!(7);
            i += 8;
        }
        // Scalar tail for the final < 8 bytes.
        while i < n {
            let b = *ptr.add(i);
            if !is_member(delete_set, b) {
                if is_member(squeeze_set, b) {
                    if *last_squeezed != b as u16 {
                        *last_squeezed = b as u16;
                        *ptr.add(wp) = b;
                        wp += 1;
                    }
                } else {
                    *last_squeezed = 256;
                    *ptr.add(wp) = b;
                    wp += 1;
                }
            }
            i += 1;
        }
    }
    wp
}
3374
3375pub fn squeeze(
3376 squeeze_chars: &[u8],
3377 reader: &mut impl Read,
3378 writer: &mut impl Write,
3379) -> io::Result<()> {
3380 if squeeze_chars.len() == 1 {
3381 return squeeze_single_stream(squeeze_chars[0], reader, writer);
3382 }
3383
3384 if squeeze_chars.len() <= 3 {
3387 return squeeze_multi_stream(squeeze_chars, reader, writer);
3388 }
3389
3390 let member = build_member_set(squeeze_chars);
3391 let mut buf = alloc_uninit_vec(STREAM_BUF);
3392 let mut last_squeezed: u16 = 256;
3393
3394 loop {
3395 let n = read_once(reader, &mut buf)?;
3396 if n == 0 {
3397 break;
3398 }
3399 let wp = squeeze_inplace_bitset(&mut buf, n, &member, &mut last_squeezed);
3400 if wp > 0 {
3401 writer.write_all(&buf[..wp])?;
3402 }
3403 }
3404 Ok(())
3405}
3406
3407#[inline]
3408fn squeeze_inplace_bitset(
3409 buf: &mut [u8],
3410 n: usize,
3411 member: &[u8; 32],
3412 last_squeezed: &mut u16,
3413) -> usize {
3414 let mut wp = 0;
3415 unsafe {
3416 let ptr = buf.as_mut_ptr();
3417 for i in 0..n {
3418 let b = *ptr.add(i);
3419 if is_member(member, b) {
3420 if *last_squeezed == b as u16 {
3421 continue;
3422 }
3423 *last_squeezed = b as u16;
3424 } else {
3425 *last_squeezed = 256;
3426 }
3427 *ptr.add(wp) = b;
3428 wp += 1;
3429 }
3430 }
3431 wp
3432}
3433
3434fn squeeze_multi_stream(
3438 chars: &[u8],
3439 reader: &mut impl Read,
3440 writer: &mut impl Write,
3441) -> io::Result<()> {
3442 let c0 = chars[0];
3443 let c1 = chars[1];
3444 let c2 = if chars.len() >= 3 {
3445 Some(chars[2])
3446 } else {
3447 None
3448 };
3449
3450 let mut buf = alloc_uninit_vec(STREAM_BUF);
3451 let mut last_squeezed: u16 = 256;
3452
3453 loop {
3454 let n = read_once(reader, &mut buf)?;
3455 if n == 0 {
3456 break;
3457 }
3458 let wp = squeeze_multi_compact(&mut buf, n, c0, c1, c2, &mut last_squeezed);
3459 if wp > 0 {
3460 writer.write_all(&buf[..wp])?;
3461 }
3462 }
3463 Ok(())
3464}
3465
/// Squeezes runs of `c0`, `c1` and optionally `c2` in `buf[..n]` in place;
/// returns the compacted length.
///
/// Uses memchr2/memchr3 to jump between squeezable bytes and bulk-moves the
/// non-squeezable gaps. `last_squeezed` carries run state across chunks
/// (256 = no active run).
#[inline]
fn squeeze_multi_compact(
    buf: &mut [u8],
    n: usize,
    c0: u8,
    c1: u8,
    c2: Option<u8>,
    last_squeezed: &mut u16,
) -> usize {
    let ptr = buf.as_mut_ptr();
    let mut wp = 0usize; // write cursor; wp <= cursor always
    let mut cursor = 0usize; // read cursor

    while cursor < n {
        let found = if let Some(c) = c2 {
            memchr::memchr3(c0, c1, c, &buf[cursor..n])
        } else {
            memchr::memchr2(c0, c1, &buf[cursor..n])
        };
        match found {
            Some(offset) => {
                let pos = cursor + offset;
                // SAFETY: pos < n <= buf.len().
                let b = unsafe { *ptr.add(pos) };

                // Bulk-move the non-squeezable gap before the match; any
                // gap byte ends a previous run.
                let gap = pos - cursor;
                if gap > 0 {
                    if wp != cursor {
                        // SAFETY: overlapping in-bounds ranges; ptr::copy
                        // handles overlap and wp < cursor.
                        unsafe {
                            std::ptr::copy(ptr.add(cursor), ptr.add(wp), gap);
                        }
                    }
                    wp += gap;
                    *last_squeezed = 256;
                }

                // Emit the squeezable byte only if it doesn't continue the
                // active run (possibly started in the previous chunk).
                if *last_squeezed != b as u16 {
                    unsafe { *ptr.add(wp) = b };
                    wp += 1;
                    *last_squeezed = b as u16;
                }

                // Skip the rest of the run of b.
                cursor = pos + 1;
                while cursor < n && unsafe { *ptr.add(cursor) } == b {
                    cursor += 1;
                }
            }
            None => {
                // No more squeezable bytes: move the tail and finish.
                let rem = n - cursor;
                if rem > 0 {
                    if wp != cursor {
                        unsafe {
                            std::ptr::copy(ptr.add(cursor), ptr.add(wp), rem);
                        }
                    }
                    wp += rem;
                    *last_squeezed = 256;
                }
                break;
            }
        }
    }
    wp
}
3530
/// Streams `reader` to `writer`, squeezing runs of the single byte `ch`
/// (the one-character fast path of `squeeze`).
///
/// On x86_64 with AVX2, uses a vectorized shift-and-mask squeezer;
/// otherwise falls back to a memmem search for the pair `[ch, ch]`.
/// `was_squeeze_char` carries "last emitted byte was ch" across chunks.
fn squeeze_single_stream(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    let mut was_squeeze_char = false;

    #[cfg(target_arch = "x86_64")]
    if get_simd_level() >= 3 {
        loop {
            let n = read_once(reader, &mut buf)?;
            if n == 0 {
                break;
            }
            // SAFETY: level >= 3 means AVX2 is supported on this CPU.
            let wp = unsafe { squeeze_single_avx2_inplace(&mut buf, n, ch, &mut was_squeeze_char) };
            if wp > 0 {
                writer.write_all(&buf[..wp])?;
            }
        }
        return Ok(());
    }

    // Scalar fallback: a run exists iff the two-byte pattern [ch, ch] does.
    let pair = [ch, ch];
    let finder = memchr::memmem::Finder::new(&pair);
    loop {
        let n = read_once(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let wp = squeeze_single_compact(&mut buf, n, ch, &finder, &mut was_squeeze_char);
        if wp > 0 {
            writer.write_all(&buf[..wp])?;
        }
    }
    Ok(())
}
3570
/// Squeezes runs of `ch` in `buf[..n]` in place using a memmem finder for
/// the pair `[ch, ch]`; returns the compacted length.
///
/// Everything up to and including the first byte of a found pair is kept;
/// the remainder of the run is skipped. `was_squeeze_char` is true on entry
/// if the previous chunk ended in `ch` (its leading repeats are dropped),
/// and is set on exit for the next chunk.
#[inline]
fn squeeze_single_compact(
    buf: &mut [u8],
    n: usize,
    ch: u8,
    finder: &memchr::memmem::Finder<'_>,
    was_squeeze_char: &mut bool,
) -> usize {
    let mut i = 0;

    // Continue a run from the previous chunk: drop its leading repeats.
    if *was_squeeze_char {
        while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
            i += 1;
        }
        *was_squeeze_char = false;
        if i >= n {
            // Whole chunk was the continuation of the run.
            *was_squeeze_char = true;
            return 0;
        }
    }

    let ptr = buf.as_mut_ptr();
    let mut wp = 0usize; // write cursor; wp <= i always

    loop {
        match finder.find(&buf[i..n]) {
            Some(offset) => {
                // Keep up to and including the first byte of the pair...
                let seg_end = i + offset + 1;
                let gap = seg_end - i;
                if gap > 0 {
                    if wp != i {
                        // SAFETY: in-bounds, possibly overlapping ranges;
                        // ptr::copy handles overlap and wp < i.
                        unsafe {
                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
                        }
                    }
                    wp += gap;
                }
                // ...then skip the rest of the run.
                i = seg_end;
                while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
                    i += 1;
                }
                if i >= n {
                    // Run may continue into the next chunk.
                    *was_squeeze_char = true;
                    break;
                }
            }
            None => {
                // No more pairs: keep the tail verbatim.
                let rem = n - i;
                if rem > 0 {
                    if wp != i {
                        unsafe {
                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
                        }
                    }
                    wp += rem;
                }
                // A single trailing ch still arms the next chunk.
                *was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == ch;
                break;
            }
        }
    }
    wp
}
3636
/// AVX2 in-place squeeze of runs of the single byte `ch` in `buf[..n]`;
/// returns the compacted length.
///
/// Per 32-byte lane: `sq_mask` marks positions equal to `ch`; a byte is
/// removed iff it equals `ch` AND its predecessor did too, computed as
/// `sq_mask & ((sq_mask << 1) | carry)`, with `carry` threading the
/// last-byte state across lanes and (via `was_squeeze_char`) across chunks.
/// Survivors are compacted per 8-byte group via `compact_8bytes_simd`.
///
/// # Safety
/// Caller must ensure AVX2 is available and `n <= buf.len()`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn squeeze_single_avx2_inplace(
    buf: &mut [u8],
    n: usize,
    ch: u8,
    was_squeeze_char: &mut bool,
) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let ch_v = _mm256_set1_epi8(ch as i8);
        let ptr = buf.as_mut_ptr();
        let mut ri = 0; // read cursor
        let mut wp = 0; // write cursor; wp <= ri so in-place writes are safe
        // 1 iff the byte just before the current position was ch.
        let mut carry: u32 = if *was_squeeze_char { 1 } else { 0 };

        while ri + 32 <= n {
            let input = _mm256_loadu_si256(ptr.add(ri) as *const _);
            let cmp = _mm256_cmpeq_epi8(input, ch_v);
            let sq_mask = _mm256_movemask_epi8(cmp) as u32;

            // Bit i of prev_sq_mask: was position i-1 equal to ch?
            let prev_sq_mask = (sq_mask << 1) | carry;

            // Remove a ch only when it directly follows another ch.
            let remove_mask = sq_mask & prev_sq_mask;

            carry = (sq_mask >> 31) & 1;

            if remove_mask == 0 {
                // Nothing squeezed: bulk move if already compacted.
                if wp != ri {
                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 32);
                }
                wp += 32;
            } else if remove_mask != 0xFFFFFFFF {
                // Mixed lane: compact each 8-byte group.
                let keep_mask = !remove_mask;
                let m0 = keep_mask as u8;
                let m1 = (keep_mask >> 8) as u8;
                let m2 = (keep_mask >> 16) as u8;
                let m3 = (keep_mask >> 24) as u8;

                if m0 == 0xFF {
                    std::ptr::copy_nonoverlapping(ptr.add(ri), ptr.add(wp), 8);
                } else if m0 != 0 {
                    compact_8bytes_simd(ptr.add(ri), ptr.add(wp), m0);
                }
                let c0 = m0.count_ones() as usize;

                if m1 == 0xFF {
                    std::ptr::copy_nonoverlapping(ptr.add(ri + 8), ptr.add(wp + c0), 8);
                } else if m1 != 0 {
                    compact_8bytes_simd(ptr.add(ri + 8), ptr.add(wp + c0), m1);
                }
                let c1 = m1.count_ones() as usize;

                if m2 == 0xFF {
                    std::ptr::copy_nonoverlapping(ptr.add(ri + 16), ptr.add(wp + c0 + c1), 8);
                } else if m2 != 0 {
                    compact_8bytes_simd(ptr.add(ri + 16), ptr.add(wp + c0 + c1), m2);
                }
                let c2 = m2.count_ones() as usize;

                if m3 == 0xFF {
                    std::ptr::copy_nonoverlapping(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), 8);
                } else if m3 != 0 {
                    compact_8bytes_simd(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), m3);
                }
                let c3 = m3.count_ones() as usize;
                wp += c0 + c1 + c2 + c3;
            }
            // remove_mask == all-ones: whole lane removed, wp stays put.
            ri += 32;
        }

        // One 16-byte SSE2 step for a remaining half-lane.
        if ri + 16 <= n {
            let ch_v128 = _mm_set1_epi8(ch as i8);
            let input = _mm_loadu_si128(ptr.add(ri) as *const _);
            let cmp = _mm_cmpeq_epi8(input, ch_v128);
            let sq_mask = _mm_movemask_epi8(cmp) as u32 & 0xFFFF;
            let prev_sq_mask = (sq_mask << 1) | carry;
            let remove_mask = sq_mask & prev_sq_mask;
            carry = (sq_mask >> 15) & 1;

            if remove_mask == 0 {
                if wp != ri {
                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 16);
                }
                wp += 16;
            } else if remove_mask != 0xFFFF {
                let keep_mask = !remove_mask;
                let m0 = keep_mask as u8;
                let m1 = (keep_mask >> 8) as u8;
                if m0 == 0xFF {
                    std::ptr::copy_nonoverlapping(ptr.add(ri), ptr.add(wp), 8);
                } else if m0 != 0 {
                    compact_8bytes_simd(ptr.add(ri), ptr.add(wp), m0);
                }
                let c0 = m0.count_ones() as usize;
                if m1 == 0xFF {
                    std::ptr::copy_nonoverlapping(ptr.add(ri + 8), ptr.add(wp + c0), 8);
                } else if m1 != 0 {
                    compact_8bytes_simd(ptr.add(ri + 8), ptr.add(wp + c0), m1);
                }
                wp += c0 + m1.count_ones() as usize;
            }
            ri += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while ri < n {
            let b = *ptr.add(ri);
            if b == ch && carry != 0 {
                // Repeat of ch: drop it.
            } else {
                *ptr.add(wp) = b;
                wp += 1;
            }
            carry = if b == ch { 1 } else { 0 };
            ri += 1;
        }

        // Persist run state for the next chunk.
        *was_squeeze_char = carry != 0;
        wp
    }
}
3779
3780pub fn translate_owned(
3788 set1: &[u8],
3789 set2: &[u8],
3790 data: &mut [u8],
3791 writer: &mut impl Write,
3792) -> io::Result<()> {
3793 let table = build_translate_table(set1, set2);
3794
3795 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3797 if is_identity {
3798 return writer.write_all(data);
3799 }
3800
3801 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3804 translate_range_simd_inplace(data, lo, hi, offset);
3805 return writer.write_all(data);
3806 }
3807
3808 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3810 translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3811 return writer.write_all(data);
3812 }
3813
3814 translate_inplace(data, &table);
3816 writer.write_all(data)
3817}
3818
3819pub fn translate_mmap(
3832 set1: &[u8],
3833 set2: &[u8],
3834 data: &[u8],
3835 writer: &mut impl Write,
3836) -> io::Result<()> {
3837 let table = build_translate_table(set1, set2);
3838
3839 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3841 if is_identity {
3842 return writer.write_all(data);
3843 }
3844
3845 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3847 return translate_mmap_range(data, writer, lo, hi, offset);
3848 }
3849
3850 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3852 return translate_mmap_range_to_constant(data, writer, lo, hi, replacement);
3853 }
3854
3855 translate_mmap_table(data, writer, &table)
3857}
3858
3859fn translate_mmap_range(
3862 data: &[u8],
3863 writer: &mut impl Write,
3864 lo: u8,
3865 hi: u8,
3866 offset: i8,
3867) -> io::Result<()> {
3868 const CHUNK: usize = 8 * 1024 * 1024;
3869 let buf_size = data.len().min(CHUNK);
3870 let mut buf = alloc_uninit_vec(buf_size);
3871 for chunk in data.chunks(CHUNK) {
3872 translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
3873 writer.write_all(&buf[..chunk.len()])?;
3874 }
3875 Ok(())
3876}
3877
3878fn translate_mmap_range_to_constant(
3882 data: &[u8],
3883 writer: &mut impl Write,
3884 lo: u8,
3885 hi: u8,
3886 replacement: u8,
3887) -> io::Result<()> {
3888 const CHUNK: usize = 8 * 1024 * 1024;
3889 let buf_size = data.len().min(CHUNK);
3890 let mut buf = alloc_uninit_vec(buf_size);
3891 for chunk in data.chunks(CHUNK) {
3892 buf[..chunk.len()].copy_from_slice(chunk);
3893 translate_range_to_constant_simd_inplace(&mut buf[..chunk.len()], lo, hi, replacement);
3894 writer.write_all(&buf[..chunk.len()])?;
3895 }
3896 Ok(())
3897}
3898
3899fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
3901 const CHUNK: usize = 8 * 1024 * 1024;
3902 let buf_size = data.len().min(CHUNK);
3903 let mut buf = alloc_uninit_vec(buf_size);
3904 for chunk in data.chunks(CHUNK) {
3905 translate_to(chunk, &mut buf[..chunk.len()], table);
3906 writer.write_all(&buf[..chunk.len()])?;
3907 }
3908 Ok(())
3909}
3910
3911pub fn translate_mmap_inplace(
3916 set1: &[u8],
3917 set2: &[u8],
3918 data: &mut [u8],
3919 writer: &mut impl Write,
3920) -> io::Result<()> {
3921 let table = build_translate_table(set1, set2);
3922
3923 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3925 if is_identity {
3926 return writer.write_all(data);
3927 }
3928
3929 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3931 translate_range_simd_inplace(data, lo, hi, offset);
3932 return writer.write_all(data);
3933 }
3934
3935 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3937 translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3938 return writer.write_all(data);
3939 }
3940
3941 translate_inplace(data, &table);
3943 writer.write_all(data)
3944}
3945
3946fn translate_to_separate_buf(
3950 data: &[u8],
3951 table: &[u8; 256],
3952 writer: &mut impl Write,
3953) -> io::Result<()> {
3954 let range_info = detect_range_offset(table);
3955 let const_info = if range_info.is_none() {
3956 detect_range_to_constant(table)
3957 } else {
3958 None
3959 };
3960
3961 const CHUNK: usize = 8 * 1024 * 1024;
3962 let buf_size = data.len().min(CHUNK);
3963 let mut out_buf = alloc_uninit_vec(buf_size);
3964 for chunk in data.chunks(CHUNK) {
3965 if let Some((lo, hi, offset)) = range_info {
3966 translate_range_simd(chunk, &mut out_buf[..chunk.len()], lo, hi, offset);
3967 } else if let Some((lo, hi, replacement)) = const_info {
3968 translate_range_to_constant_simd(
3969 chunk,
3970 &mut out_buf[..chunk.len()],
3971 lo,
3972 hi,
3973 replacement,
3974 );
3975 } else {
3976 translate_to(chunk, &mut out_buf[..chunk.len()], table);
3977 }
3978 writer.write_all(&out_buf[..chunk.len()])?;
3979 }
3980 Ok(())
3981}
3982
3983pub fn translate_mmap_readonly(
3987 set1: &[u8],
3988 set2: &[u8],
3989 data: &[u8],
3990 writer: &mut impl Write,
3991) -> io::Result<()> {
3992 let table = build_translate_table(set1, set2);
3993
3994 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3996 if is_identity {
3997 return writer.write_all(data);
3998 }
3999
4000 translate_to_separate_buf(data, &table, writer)
4001}
4002
/// Combined translate + squeeze (`tr -s set1 set2` semantics): translate
/// `data` through the `set1 -> set2` table, then collapse consecutive
/// repeats of bytes in `set2` down to a single occurrence, streaming the
/// result to `writer`.
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    // Squeezing applies to the *output* alphabet, i.e. the bytes of set2.
    let squeeze_set = build_member_set(set2);

    if data.len() <= SINGLE_ALLOC_LIMIT {
        // Small enough input: translate everything into one buffer, then
        // squeeze that buffer in place and emit a single write.
        let mut translated = alloc_uninit_vec(data.len());
        let range_info = detect_range_offset(&table);

        if let Some((lo, hi, offset)) = range_info {
            translate_range_simd(data, &mut translated, lo, hi, offset);
        } else if let Some((lo, hi, repl)) = detect_range_to_constant(&table) {
            translate_range_to_constant_simd(data, &mut translated, lo, hi, repl);
        } else {
            translate_to(data, &mut translated, &table);
        }

        // 256 is outside u8 range and acts as "no squeeze run in progress".
        let mut last_squeezed: u16 = 256;
        let len = translated.len();
        let mut wp = 0;
        // In-place compaction: wp (write pos) trails i (read pos), so the
        // write never overtakes the read.
        unsafe {
            let ptr = translated.as_mut_ptr();
            let mut i = 0;
            while i < len {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        // Repeat of the current squeezed byte: drop it.
                        i += 1;
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    // Non-squeezable byte terminates any active run.
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        return writer.write_all(&translated[..wp]);
    }

    // Large input: chunked translate + squeeze. `last_squeezed` carries
    // across chunk boundaries so runs spanning two chunks still collapse.
    // NOTE(review): this path always uses the general table lookup and
    // skips the range-SIMD fast paths used above — likely just a perf gap.
    const CHUNK: usize = 8 * 1024 * 1024;
    let mut last_squeezed: u16 = 256;
    let mut buf = alloc_uninit_vec(CHUNK);
    for chunk in data.chunks(CHUNK) {
        translate_to(chunk, &mut buf[..chunk.len()], &table);
        let mut wp = 0;
        for i in 0..chunk.len() {
            let b = buf[i];
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            // wp <= i always, so this in-place compaction is safe.
            buf[wp] = b;
            wp += 1;
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
4080
4081pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4084 if delete_chars.len() == 1 {
4085 return delete_single_char_mmap(delete_chars[0], data, writer);
4086 }
4087 if delete_chars.len() <= 3 {
4088 return delete_multi_memchr_mmap(delete_chars, data, writer);
4089 }
4090
4091 if let Some((lo, hi)) = detect_delete_range(delete_chars) {
4093 return delete_range_mmap(data, writer, lo, hi);
4094 }
4095
4096 let member = build_member_set(delete_chars);
4097
4098 let sample_size = data.len().min(1024);
4103 let sample_deletes = data[..sample_size]
4104 .iter()
4105 .filter(|&&b| is_member(&member, b))
4106 .count();
4107 let estimated_deletes = if sample_size > 0 {
4108 data.len() * sample_deletes / sample_size
4109 } else {
4110 data.len()
4111 };
4112
4113 if estimated_deletes < MAX_IOV / 2 {
4114 return delete_bitset_zerocopy(data, &member, writer);
4115 }
4116
4117 const COMPACT_BUF: usize = 8 * 1024 * 1024;
4119 let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
4120 for chunk in data.chunks(COMPACT_BUF) {
4121 let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
4122 if out_pos > 0 {
4123 writer.write_all(&outbuf[..out_pos])?;
4124 }
4125 }
4126 Ok(())
4127}
4128
4129fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
4134 let sample_size = data.len().min(1024);
4136 let sample_deletes = data[..sample_size]
4137 .iter()
4138 .filter(|&&b| b >= lo && b <= hi)
4139 .count();
4140 let estimated_deletes = if sample_size > 0 {
4146 data.len() * sample_deletes / sample_size
4147 } else {
4148 data.len()
4149 };
4150 if estimated_deletes < MAX_IOV / 2 {
4151 return delete_range_mmap_zerocopy(data, writer, lo, hi);
4152 }
4153
4154 const CHUNK: usize = 8 * 1024 * 1024;
4156 let mut outbuf = alloc_uninit_vec(CHUNK);
4157 for chunk in data.chunks(CHUNK) {
4158 let kept = delete_range_chunk(chunk, &mut outbuf[..chunk.len()], lo, hi);
4159 writer.write_all(&outbuf[..kept])?;
4160 }
4161 Ok(())
4162}
4163
/// Zero-copy range delete dispatcher: picks the best SIMD implementation
/// for the current CPU/arch and falls back to the scalar version.
/// All variants write kept runs as `IoSlice`s pointing into `data`.
fn delete_range_mmap_zerocopy(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    #[cfg(target_arch = "x86_64")]
    {
        // Level 3+ implies AVX2, level 2+ implies SSE2 (runtime-detected).
        if get_simd_level() >= 3 {
            return unsafe { delete_range_zerocopy_avx2(data, writer, lo, hi) };
        }
        if get_simd_level() >= 2 {
            return unsafe { delete_range_zerocopy_sse2(data, writer, lo, hi) };
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        // NEON is baseline on aarch64, so no runtime check is needed.
        return unsafe { delete_range_zerocopy_neon(data, writer, lo, hi) };
    }

    // Scalar fallback; unreachable on aarch64, hence the allow.
    #[allow(unreachable_code)]
    delete_range_zerocopy_scalar(data, writer, lo, hi)
}
4193
4194fn delete_range_zerocopy_scalar(
4197 data: &[u8],
4198 writer: &mut impl Write,
4199 lo: u8,
4200 hi: u8,
4201) -> io::Result<()> {
4202 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4203 let len = data.len();
4204 let mut run_start: usize = 0;
4205 let mut i: usize = 0;
4206
4207 while i < len {
4208 let b = unsafe { *data.get_unchecked(i) };
4209 if b >= lo && b <= hi {
4210 if i > run_start {
4211 iov.push(std::io::IoSlice::new(&data[run_start..i]));
4212 if iov.len() >= MAX_IOV {
4213 write_ioslices(writer, &iov)?;
4214 iov.clear();
4215 }
4216 }
4217 run_start = i + 1;
4218 }
4219 i += 1;
4220 }
4221 if run_start < len {
4222 iov.push(std::io::IoSlice::new(&data[run_start..]));
4223 }
4224 if !iov.is_empty() {
4225 write_ioslices(writer, &iov)?;
4226 }
4227 Ok(())
4228}
4229
/// AVX2 zero-copy range delete: scans 32 bytes per iteration, computes a
/// bitmask of bytes inside `[lo, hi]`, and queues the kept runs between
/// deleted bytes as `IoSlice`s pointing directly into `data`.
///
/// Assumes `lo <= hi` (`hi - lo` would panic/wrap otherwise) — upheld by
/// the range-detection callers; TODO confirm no other call sites.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_range_zerocopy_avx2(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    use std::arch::x86_64::*;

    unsafe {
        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
        let len = data.len();
        let mut run_start: usize = 0;
        let mut ri: usize = 0;

        // Unsigned range test via signed compares: adding (0x80 - lo)
        // maps [lo, hi] onto [-128, -128 + range] in signed i8 space, so
        // "biased > threshold" identifies bytes OUTSIDE the range.
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        while ri + 32 <= len {
            let input = _mm256_loadu_si256(data.as_ptr().add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            // Invert: lanes where gt == 0 are IN the range (to delete).
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let del_mask = _mm256_movemask_epi8(in_range) as u32;

            if del_mask == 0 {
                // Nothing deleted in this block; the kept run keeps growing.
                ri += 32;
                continue;
            }

            // Walk set bits low-to-high; each one ends the current run.
            let mut m = del_mask;
            while m != 0 {
                let bit = m.trailing_zeros() as usize;
                let abs_pos = ri + bit;
                if abs_pos > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = abs_pos + 1;
                // Clear the lowest set bit.
                m &= m - 1;
            }

            ri += 32;
        }

        // Scalar tail for the final < 32 bytes.
        while ri < len {
            let b = *data.get_unchecked(ri);
            if b >= lo && b <= hi {
                if ri > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = ri + 1;
            }
            ri += 1;
        }

        // Trailing kept run, then flush whatever is still queued.
        if run_start < len {
            iov.push(std::io::IoSlice::new(&data[run_start..]));
        }
        if !iov.is_empty() {
            write_ioslices(writer, &iov)?;
        }
        Ok(())
    }
}
4311
/// SSE2 zero-copy range delete: identical structure to the AVX2 variant
/// but processes 16 bytes per iteration. Kept runs are queued as
/// `IoSlice`s into `data`; bytes in `[lo, hi]` are deleted.
///
/// Assumes `lo <= hi` (`hi - lo` would panic/wrap otherwise) — upheld by
/// the range-detection callers; TODO confirm no other call sites.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn delete_range_zerocopy_sse2(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    use std::arch::x86_64::*;

    unsafe {
        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
        let len = data.len();
        let mut run_start: usize = 0;
        let mut ri: usize = 0;

        // Same signed-bias trick as the AVX2 path: shift [lo, hi] to
        // [-128, -128 + range] so signed compares implement an unsigned
        // range test.
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm_setzero_si128();

        while ri + 16 <= len {
            let input = _mm_loadu_si128(data.as_ptr().add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            // Lanes where gt == 0 are in range, i.e. to be deleted.
            let in_range = _mm_cmpeq_epi8(gt, zero);
            let del_mask = _mm_movemask_epi8(in_range) as u32 & 0xFFFF;

            if del_mask == 0 {
                ri += 16;
                continue;
            }

            // Each set bit terminates the current kept run.
            let mut m = del_mask;
            while m != 0 {
                let bit = m.trailing_zeros() as usize;
                let abs_pos = ri + bit;
                if abs_pos > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = abs_pos + 1;
                m &= m - 1;
            }

            ri += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while ri < len {
            let b = *data.get_unchecked(ri);
            if b >= lo && b <= hi {
                if ri > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = ri + 1;
            }
            ri += 1;
        }

        if run_start < len {
            iov.push(std::io::IoSlice::new(&data[run_start..]));
        }
        if !iov.is_empty() {
            write_ioslices(writer, &iov)?;
        }
        Ok(())
    }
}
4388
/// NEON zero-copy range delete: 16 bytes per iteration. NEON has no
/// movemask, so the per-lane comparison result is reduced to a 16-bit
/// mask with a weighted-bit + pairwise-add chain. Kept runs are queued as
/// `IoSlice`s into `data`; bytes in `[lo, hi]` are deleted.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn delete_range_zerocopy_neon(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    use std::arch::aarch64::*;

    unsafe {
        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
        let len = data.len();
        let mut run_start: usize = 0;
        let mut ri: usize = 0;

        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        // Per-lane bit weights: lane i of each 8-lane half contributes
        // bit i of that half's mask byte.
        let bit_mask: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
        let bit_mask_v = vld1q_u8(bit_mask.as_ptr());

        while ri + 16 <= len {
            let input = vld1q_u8(data.as_ptr().add(ri));
            // NEON has unsigned compares, so the range test is direct.
            let ge_lo = vcgeq_u8(input, lo_v);
            let le_hi = vcleq_u8(input, hi_v);
            let in_range = vandq_u8(ge_lo, le_hi);

            // Keep only the weight bit in matching lanes, then pairwise-
            // widen 8->16->32->64 so each u64 lane sums its 8 weights
            // into one mask byte.
            let bits = vandq_u8(in_range, bit_mask_v);
            let pair = vpaddlq_u8(bits);
            let quad = vpaddlq_u16(pair);
            let octet = vpaddlq_u32(quad);
            let mask_lo = vgetq_lane_u64::<0>(octet) as u8;
            let mask_hi = vgetq_lane_u64::<1>(octet) as u8;
            let del_mask = (mask_hi as u16) << 8 | mask_lo as u16;

            if del_mask == 0 {
                // No deletions here; the kept run keeps growing.
                ri += 16;
                continue;
            }

            // Each set bit terminates the current kept run.
            let mut m = del_mask;
            while m != 0 {
                let bit = m.trailing_zeros() as usize;
                let abs_pos = ri + bit;
                if abs_pos > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = abs_pos + 1;
                m &= m - 1;
            }

            ri += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while ri < len {
            let b = *data.get_unchecked(ri);
            if b >= lo && b <= hi {
                if ri > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = ri + 1;
            }
            ri += 1;
        }

        if run_start < len {
            iov.push(std::io::IoSlice::new(&data[run_start..]));
        }
        if !iov.is_empty() {
            write_ioslices(writer, &iov)?;
        }
        Ok(())
    }
}
4480
#[inline]
/// Compact `chunk` into `outbuf`, dropping every byte whose bit is set in
/// the 256-bit `member` set; returns the number of bytes kept.
///
/// Branchless trick: each byte is stored unconditionally at `out_pos`,
/// and `out_pos` advances only when the byte is kept — a rejected byte is
/// simply overwritten by the next store. The writes stay in bounds
/// because `out_pos <= i < chunk.len()`, provided
/// `outbuf.len() >= chunk.len()` — callers pass an 8 MiB buffer for
/// chunks of at most 8 MiB; TODO confirm any new call sites uphold this.
fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
    let len = chunk.len();
    let mut out_pos = 0;
    let mut i = 0;

    // Main loop unrolled 8x to keep the store/advance pairs pipelined.
    while i + 8 <= len {
        unsafe {
            let b0 = *chunk.get_unchecked(i);
            let b1 = *chunk.get_unchecked(i + 1);
            let b2 = *chunk.get_unchecked(i + 2);
            let b3 = *chunk.get_unchecked(i + 3);
            let b4 = *chunk.get_unchecked(i + 4);
            let b5 = *chunk.get_unchecked(i + 5);
            let b6 = *chunk.get_unchecked(i + 6);
            let b7 = *chunk.get_unchecked(i + 7);

            // Store, then advance by 0 or 1 depending on membership.
            *outbuf.get_unchecked_mut(out_pos) = b0;
            out_pos += !is_member(member, b0) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b1;
            out_pos += !is_member(member, b1) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b2;
            out_pos += !is_member(member, b2) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b3;
            out_pos += !is_member(member, b3) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b4;
            out_pos += !is_member(member, b4) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b5;
            out_pos += !is_member(member, b5) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b6;
            out_pos += !is_member(member, b6) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b7;
            out_pos += !is_member(member, b7) as usize;
        }
        i += 8;
    }

    // Tail: same store-then-conditionally-advance scheme, one byte at a time.
    while i < len {
        unsafe {
            let b = *chunk.get_unchecked(i);
            *outbuf.get_unchecked_mut(out_pos) = b;
            out_pos += !is_member(member, b) as usize;
        }
        i += 1;
    }

    out_pos
}
4531
4532fn delete_bitset_zerocopy(
4537 data: &[u8],
4538 member: &[u8; 32],
4539 writer: &mut impl Write,
4540) -> io::Result<()> {
4541 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4542 let len = data.len();
4543 let mut i = 0;
4544 let mut run_start: Option<usize> = None;
4545
4546 while i < len {
4547 let b = unsafe { *data.get_unchecked(i) };
4548 if is_member(member, b) {
4549 if let Some(rs) = run_start {
4551 iov.push(std::io::IoSlice::new(&data[rs..i]));
4552 run_start = None;
4553 if iov.len() >= MAX_IOV {
4554 write_ioslices(writer, &iov)?;
4555 iov.clear();
4556 }
4557 }
4558 } else {
4559 if run_start.is_none() {
4561 run_start = Some(i);
4562 }
4563 }
4564 i += 1;
4565 }
4566 if let Some(rs) = run_start {
4568 iov.push(std::io::IoSlice::new(&data[rs..]));
4569 }
4570 if !iov.is_empty() {
4571 write_ioslices(writer, &iov)?;
4572 }
4573 Ok(())
4574}
4575
4576fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4577 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4581 let mut last = 0;
4582 for pos in memchr::memchr_iter(ch, data) {
4583 if pos > last {
4584 iov.push(std::io::IoSlice::new(&data[last..pos]));
4585 if iov.len() >= MAX_IOV {
4586 write_ioslices(writer, &iov)?;
4587 iov.clear();
4588 }
4589 }
4590 last = pos + 1;
4591 }
4592 if last < data.len() {
4593 iov.push(std::io::IoSlice::new(&data[last..]));
4594 }
4595 if !iov.is_empty() {
4596 write_ioslices(writer, &iov)?;
4597 }
4598 Ok(())
4599}
4600
4601fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4602 let c0 = chars[0];
4603 let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
4604 let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
4605 let is_three = chars.len() >= 3;
4606
4607 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4609 let mut last = 0;
4610
4611 macro_rules! process_pos {
4612 ($pos:expr) => {
4613 if $pos > last {
4614 iov.push(std::io::IoSlice::new(&data[last..$pos]));
4615 if iov.len() >= MAX_IOV {
4616 write_ioslices(writer, &iov)?;
4617 iov.clear();
4618 }
4619 }
4620 last = $pos + 1;
4621 };
4622 }
4623
4624 if is_three {
4625 for pos in memchr::memchr3_iter(c0, c1, c2, data) {
4626 process_pos!(pos);
4627 }
4628 } else {
4629 for pos in memchr::memchr2_iter(c0, c1, data) {
4630 process_pos!(pos);
4631 }
4632 }
4633 if last < data.len() {
4634 iov.push(std::io::IoSlice::new(&data[last..]));
4635 }
4636 if !iov.is_empty() {
4637 write_ioslices(writer, &iov)?;
4638 }
4639 Ok(())
4640}
4641
4642pub fn delete_squeeze_mmap(
4647 delete_chars: &[u8],
4648 squeeze_chars: &[u8],
4649 data: &[u8],
4650 writer: &mut impl Write,
4651) -> io::Result<()> {
4652 let delete_set = build_member_set(delete_chars);
4653 let squeeze_set = build_member_set(squeeze_chars);
4654
4655 if data.len() <= SINGLE_ALLOC_LIMIT {
4656 let mut outbuf = alloc_uninit_vec(data.len());
4658 let mut last_squeezed: u16 = 256;
4659 let mut out_pos = 0;
4660
4661 for &b in data.iter() {
4662 if is_member(&delete_set, b) {
4663 continue;
4664 }
4665 if is_member(&squeeze_set, b) {
4666 if last_squeezed == b as u16 {
4667 continue;
4668 }
4669 last_squeezed = b as u16;
4670 } else {
4671 last_squeezed = 256;
4672 }
4673 unsafe {
4674 *outbuf.get_unchecked_mut(out_pos) = b;
4675 }
4676 out_pos += 1;
4677 }
4678 return writer.write_all(&outbuf[..out_pos]);
4679 }
4680
4681 const CHUNK: usize = 8 * 1024 * 1024;
4683 let mut outbuf = alloc_uninit_vec(CHUNK);
4684 let mut last_squeezed: u16 = 256;
4685 for chunk in data.chunks(CHUNK) {
4686 let mut out_pos = 0;
4687 for &b in chunk.iter() {
4688 if is_member(&delete_set, b) {
4689 continue;
4690 }
4691 if is_member(&squeeze_set, b) {
4692 if last_squeezed == b as u16 {
4693 continue;
4694 }
4695 last_squeezed = b as u16;
4696 } else {
4697 last_squeezed = 256;
4698 }
4699 outbuf[out_pos] = b;
4700 out_pos += 1;
4701 }
4702 writer.write_all(&outbuf[..out_pos])?;
4703 }
4704 Ok(())
4705}
4706
/// Squeeze: collapse consecutive runs of any byte in `squeeze_chars` down
/// to a single occurrence. Sets of 1–3 bytes get dedicated memchr-based
/// fast paths; larger sets use a bitset membership test.
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    let member = build_member_set(squeeze_chars);

    if data.len() <= SINGLE_ALLOC_LIMIT {
        // Small input: squeeze into one buffer and emit a single write.
        let mut outbuf = alloc_uninit_vec(data.len());
        let len = data.len();
        let mut wp = 0;
        let mut i = 0;
        // 256 is outside u8 range: "previous byte was not squeezable".
        let mut last_squeezed: u16 = 256;

        unsafe {
            let inp = data.as_ptr();
            let outp = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    // Emit the first byte of a run (unless it continues
                    // the previous squeezed run across the gap check).
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    i += 1;
                    // Skip the rest of the run of identical bytes.
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    // Non-member byte resets the squeeze state and is
                    // copied through verbatim.
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
        }
        return writer.write_all(&outbuf[..wp]);
    }

    // Large input: chunked squeeze; `last_squeezed` carries across chunk
    // boundaries so runs spanning chunks still collapse correctly.
    const CHUNK: usize = 8 * 1024 * 1024;
    let mut outbuf = alloc_uninit_vec(CHUNK);
    let mut last_squeezed: u16 = 256;
    for chunk in data.chunks(CHUNK) {
        let mut wp = 0;
        for &b in chunk.iter() {
            if is_member(&member, b) {
                if last_squeezed != b as u16 {
                    outbuf[wp] = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
            } else {
                last_squeezed = 256;
                outbuf[wp] = b;
                wp += 1;
            }
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
4780
4781fn squeeze_multi_mmap<const N: usize>(
4782 chars: &[u8],
4783 data: &[u8],
4784 writer: &mut impl Write,
4785) -> io::Result<()> {
4786 let single = [chars[0]; 1]; let _ = single;
4792 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(1024);
4793 let mut cursor = 0;
4794 let mut last_squeezed: u16 = 256;
4795
4796 macro_rules! find_next {
4797 ($data:expr) => {
4798 if N == 2 {
4799 memchr::memchr2(chars[0], chars[1], $data)
4800 } else {
4801 memchr::memchr3(chars[0], chars[1], chars[2], $data)
4802 }
4803 };
4804 }
4805
4806 while cursor < data.len() {
4807 match find_next!(&data[cursor..]) {
4808 Some(offset) => {
4809 let pos = cursor + offset;
4810 let b = data[pos];
4811 if pos > cursor {
4813 iov.push(std::io::IoSlice::new(&data[cursor..pos]));
4814 last_squeezed = 256;
4815 }
4816 if last_squeezed != b as u16 {
4818 iov.push(std::io::IoSlice::new(&data[pos..pos + 1]));
4820 last_squeezed = b as u16;
4821 }
4822 let mut skip = pos + 1;
4824 while skip < data.len() && data[skip] == b {
4825 skip += 1;
4826 }
4827 cursor = skip;
4828 if iov.len() >= MAX_IOV {
4830 write_ioslices(writer, &iov)?;
4831 iov.clear();
4832 }
4833 }
4834 None => {
4835 if cursor < data.len() {
4836 iov.push(std::io::IoSlice::new(&data[cursor..]));
4837 }
4838 break;
4839 }
4840 }
4841 }
4842 if !iov.is_empty() {
4843 write_ioslices(writer, &iov)?;
4844 }
4845 Ok(())
4846}
4847
4848fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4849 if data.is_empty() {
4850 return Ok(());
4851 }
4852
4853 let pair = [ch, ch];
4855 if memchr::memmem::find(data, &pair).is_none() {
4856 return writer.write_all(data);
4857 }
4858
4859 let finder = memchr::memmem::Finder::new(&pair);
4866 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(2048);
4867 let mut cursor = 0;
4868
4869 while cursor < data.len() {
4870 match finder.find(&data[cursor..]) {
4871 Some(offset) => {
4872 let pair_pos = cursor + offset;
4873 let seg_end = pair_pos + 1;
4875 if seg_end > cursor {
4876 iov.push(std::io::IoSlice::new(&data[cursor..seg_end]));
4877 }
4878 let mut skip = seg_end;
4880 while skip < data.len() && data[skip] == ch {
4881 skip += 1;
4882 }
4883 cursor = skip;
4884 if iov.len() >= MAX_IOV {
4886 write_ioslices(writer, &iov)?;
4887 iov.clear();
4888 }
4889 }
4890 None => {
4891 if cursor < data.len() {
4893 iov.push(std::io::IoSlice::new(&data[cursor..]));
4894 }
4895 break;
4896 }
4897 }
4898 }
4899
4900 if !iov.is_empty() {
4901 write_ioslices(writer, &iov)?;
4902 }
4903 Ok(())
4904}