use std::io::{self, Read, Write};

use rayon::prelude::*;

/// Maximum number of `IoSlice`s passed to one `write_vectored` call
/// (kernels cap vectored writes at `IOV_MAX`, commonly 1024).
const MAX_IOV: usize = 1024;

/// Buffer size used for streamed processing.
const STREAM_BUF: usize = 16 * 1024 * 1024;

/// Minimum input size before work is split across rayon threads.
const PARALLEL_THRESHOLD: usize = 4 * 1024 * 1024;

/// Maps an 8-bit keep-mask to the source indices of its set bits, in order.
/// Row `m` lists the bit positions set in `m`; unused entries stay 0.
#[cfg(target_arch = "x86_64")]
static COMPACT_LUT: [[u8; 8]; 256] = {
    let mut lut = [[0u8; 8]; 256];
    let mut mask: u16 = 0;
    while mask < 256 {
        let mut idx: usize = 0;
        let mut bit: u8 = 0;
        while bit < 8 {
            if (mask >> bit) & 1 != 0 {
                lut[mask as usize][idx] = bit;
                idx += 1;
            }
            bit += 1;
        }
        mask += 1;
    }
    lut
};

/// Writes all slices, falling back to `write_all` on short vectored writes.
#[inline]
fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    for batch in slices.chunks(MAX_IOV) {
        let total: usize = batch.iter().map(|s| s.len()).sum();
        match writer.write_vectored(batch) {
            Ok(n) if n >= total => continue,
            Ok(mut written) => {
                // Short write: finish the remainder of the batch slice by slice.
                for slice in batch {
                    let slen = slice.len();
                    if written >= slen {
                        written -= slen;
                        continue;
                    }
                    if written > 0 {
                        writer.write_all(&slice[written..])?;
                        written = 0;
                    } else {
                        writer.write_all(slice)?;
                    }
                }
            }
            Err(e) => return Err(e),
        }
    }
    Ok(())
}
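
// Illustrative sketch (not part of the original source): `Vec<u8>` implements
// `Write`, so the vectored path can be exercised entirely in memory.
#[cfg(test)]
mod write_ioslices_example {
    use super::*;
    use std::io::IoSlice;

    #[test]
    fn writes_every_slice_in_order() {
        let mut out = Vec::new();
        let bufs = [IoSlice::new(b"hello "), IoSlice::new(b"world")];
        write_ioslices(&mut out, &bufs).unwrap();
        assert_eq!(out, b"hello world");
    }
}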

/// Allocates a `Vec<u8>` of `len` without zero-initialization.
/// SAFETY: callers must fully overwrite the buffer before reading from it.
#[inline]
#[allow(clippy::uninit_vec)]
fn alloc_uninit_vec(len: usize) -> Vec<u8> {
    let mut v = Vec::with_capacity(len);
    unsafe {
        v.set_len(len);
    }
    // Ask the kernel for transparent huge pages on large buffers.
    #[cfg(target_os = "linux")]
    if len >= 2 * 1024 * 1024 {
        unsafe {
            libc::madvise(
                v.as_mut_ptr() as *mut libc::c_void,
                len,
                libc::MADV_HUGEPAGE,
            );
        }
    }
    v
}
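
// Illustrative use (a sketch, not from the original source): the buffer must
// be filled before any byte of it is read, e.g.
//
//     let mut buf = alloc_uninit_vec(n);
//     reader.read_exact(&mut buf)?; // hypothetical `reader`; fills all of `buf`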

/// Builds a 256-entry mapping table for `tr SET1 SET2`. If `SET2` is shorter
/// than `SET1`, its last byte is repeated (as `tr` does when extending SET2).
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}
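
// Illustrative sketch (not part of the original source): `tr abc xy` pads the
// shorter SET2 with its last byte.
#[cfg(test)]
mod translate_table_example {
    use super::*;

    #[test]
    fn pads_short_set2_with_last_byte() {
        let t = build_translate_table(b"abc", b"xy");
        assert_eq!(t[b'a' as usize], b'x');
        assert_eq!(t[b'b' as usize], b'y');
        assert_eq!(t[b'c' as usize], b'y'); // padded with last byte of SET2
        assert_eq!(t[b'z' as usize], b'z'); // identity elsewhere
    }
}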

/// Builds a 256-bit membership bitset (one bit per byte value).
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}
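
// Illustrative sketch (not part of the original source): one bit per byte
// value, so membership is a shift, a mask, and one load.
#[cfg(test)]
mod member_set_example {
    use super::*;

    #[test]
    fn bitset_membership() {
        let vowels = build_member_set(b"aeiou");
        assert!(is_member(&vowels, b'e'));
        assert!(!is_member(&vowels, b'z'));
    }
}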

/// Cached SIMD capability: 0 = not yet detected, 1 = scalar only,
/// 2 = SSSE3, 3 = AVX2.
#[cfg(target_arch = "x86_64")]
static SIMD_LEVEL: std::sync::atomic::AtomicU8 = std::sync::atomic::AtomicU8::new(0);

#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn get_simd_level() -> u8 {
    let level = SIMD_LEVEL.load(std::sync::atomic::Ordering::Relaxed);
    if level != 0 {
        return level;
    }
    let detected = if is_x86_feature_detected!("avx2") {
        3
    } else if is_x86_feature_detected!("ssse3") {
        2
    } else {
        1
    };
    SIMD_LEVEL.store(detected, std::sync::atomic::Ordering::Relaxed);
    detected
}

/// Counts table entries that differ from the identity mapping.
#[cfg(target_arch = "x86_64")]
#[inline]
fn count_non_identity(table: &[u8; 256]) -> usize {
    table
        .iter()
        .enumerate()
        .filter(|&(i, &v)| v != i as u8)
        .count()
}

/// Translates `data` through `table` in place, dispatching to the best
/// available SIMD implementation.
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    #[cfg(target_arch = "x86_64")]
    {
        let level = get_simd_level();
        if level >= 3 {
            // Sparse tables change few byte values, so the sparse kernel can
            // skip stores for vectors that come out unchanged.
            let non_id = count_non_identity(table);
            if non_id > 0 && non_id <= 16 {
                unsafe { translate_inplace_avx2_sparse(data, table) };
                return;
            }
            unsafe { translate_inplace_avx2_table(data, table) };
            return;
        }
        if level >= 2 {
            unsafe { translate_inplace_ssse3_table(data, table) };
            return;
        }
    }
    translate_inplace_scalar(data, table);
}
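
// Illustrative sketch (not part of the original source): whatever kernel the
// dispatcher picks must agree with a plain table lookup.
#[cfg(test)]
mod translate_dispatch_example {
    use super::*;

    #[test]
    fn simd_matches_scalar_lookup() {
        let table = build_translate_table(b"abc", b"xyz");
        let mut data: Vec<u8> = (0u8..=255).cycle().take(1000).collect();
        let expected: Vec<u8> = data.iter().map(|&b| table[b as usize]).collect();
        translate_inplace(&mut data, &table);
        assert_eq!(data, expected);
    }
}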

/// AVX2 translate for "sparse" tables (few non-identity entries): the result
/// is computed with 16 nibble-indexed shuffles, and the store is skipped when
/// a vector is unchanged, leaving mostly-clean pages untouched.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_inplace_avx2_sparse(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // Split the 256-entry table into 16 rows of 16, one per high nibble.
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            // Store only if at least one byte actually changed.
            let diff = _mm256_xor_si256(input, result);
            if _mm256_testz_si256(diff, diff) == 0 {
                _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            }
            i += 32;
        }

        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "aarch64"))]
#[inline(always)]
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    let len = data.len();
    let ptr = data.as_mut_ptr();
    let mut i = 0;
    unsafe {
        while i + 8 <= len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    unsafe { translate_inplace_neon_table(data, table) };
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_inplace_neon_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::aarch64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            lut[h as usize] = vld1q_u8(table.as_ptr().add(base));
        }

        let lo_mask = vdupq_n_u8(0x0F);
        let mut i = 0;

        while i + 16 <= len {
            let input = vld1q_u8(ptr.add(i));
            let lo_nibble = vandq_u8(input, lo_mask);
            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);

            let mut result = vdupq_n_u8(0);
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = vdupq_n_u8($h);
                    let mask = vceqq_u8(hi_nibble, h_val);
                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            vst1q_u8(ptr.add(i), result);
            i += 16;
        }

        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;

        while i + 64 <= len {
            let input0 = _mm256_loadu_si256(ptr.add(i) as *const _);
            let input1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);

            let lo0 = _mm256_and_si256(input0, lo_mask);
            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
            let lo1 = _mm256_and_si256(input1, lo_mask);
            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);

            let mut r0 = _mm256_setzero_si256();
            let mut r1 = _mm256_setzero_si256();

            macro_rules! do_nibble2 {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
                };
            }
            do_nibble2!(0);
            do_nibble2!(1);
            do_nibble2!(2);
            do_nibble2!(3);
            do_nibble2!(4);
            do_nibble2!(5);
            do_nibble2!(6);
            do_nibble2!(7);
            do_nibble2!(8);
            do_nibble2!(9);
            do_nibble2!(10);
            do_nibble2!(11);
            do_nibble2!(12);
            do_nibble2!(13);
            do_nibble2!(14);
            do_nibble2!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
            i += 64;
        }

        if i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);

            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, res);
            i += 16;
        }

        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

/// Translates `src` into `dst` through `table`, choosing the non-temporal
/// store variant when `dst` is 32-byte aligned.
#[inline(always)]
fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    debug_assert!(dst.len() >= src.len());
    #[cfg(target_arch = "x86_64")]
    {
        let level = get_simd_level();
        if level >= 3 {
            if dst.as_ptr() as usize & 31 == 0 {
                unsafe { translate_to_avx2_table_nt(src, dst, table) };
            } else {
                unsafe { translate_to_avx2_table(src, dst, table) };
            }
            return;
        }
        if level >= 2 {
            unsafe { translate_to_ssse3_table(src, dst, table) };
            return;
        }
    }
    translate_to_scalar(src, dst, table);
}

#[cfg(not(target_arch = "aarch64"))]
#[inline(always)]
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    unsafe {
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let len = src.len();
        let mut i = 0;
        while i + 8 <= len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    unsafe { translate_to_neon_table(src, dst, table) };
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_to_neon_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::aarch64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
        for h in 0u8..16 {
            lut[h as usize] = vld1q_u8(table.as_ptr().add((h as usize) * 16));
        }

        let lo_mask = vdupq_n_u8(0x0F);
        let mut i = 0;

        while i + 16 <= len {
            let input = vld1q_u8(sp.add(i));
            let lo_nibble = vandq_u8(input, lo_mask);
            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);

            let mut result = vdupq_n_u8(0);
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = vdupq_n_u8($h);
                    let mask = vceqq_u8(hi_nibble, h_val);
                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            vst1q_u8(dp.add(i), result);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;

        while i + 64 <= len {
            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);

            let lo0 = _mm256_and_si256(input0, lo_mask);
            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
            let lo1 = _mm256_and_si256(input1, lo_mask);
            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);

            let mut r0 = _mm256_setzero_si256();
            let mut r1 = _mm256_setzero_si256();

            macro_rules! do_nibble2 {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
                };
            }
            do_nibble2!(0);
            do_nibble2!(1);
            do_nibble2!(2);
            do_nibble2!(3);
            do_nibble2!(4);
            do_nibble2!(5);
            do_nibble2!(6);
            do_nibble2!(7);
            do_nibble2!(8);
            do_nibble2!(9);
            do_nibble2!(10);
            do_nibble2!(11);
            do_nibble2!(12);
            do_nibble2!(13);
            do_nibble2!(14);
            do_nibble2!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

/// Like `translate_to_avx2_table`, but uses non-temporal (streaming) stores
/// that bypass the cache; `dst` must be 32-byte aligned.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_to_avx2_table_nt(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);
        let mut i = 0;

        while i + 64 <= len {
            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);

            let lo0 = _mm256_and_si256(input0, lo_mask);
            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
            let lo1 = _mm256_and_si256(input1, lo_mask);
            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);

            let mut r0 = _mm256_setzero_si256();
            let mut r1 = _mm256_setzero_si256();

            macro_rules! do_nibble2 {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
                };
            }
            do_nibble2!(0);
            do_nibble2!(1);
            do_nibble2!(2);
            do_nibble2!(3);
            do_nibble2!(4);
            do_nibble2!(5);
            do_nibble2!(6);
            do_nibble2!(7);
            do_nibble2!(8);
            do_nibble2!(9);
            do_nibble2!(10);
            do_nibble2!(11);
            do_nibble2!(12);
            do_nibble2!(13);
            do_nibble2!(14);
            do_nibble2!(15);

            _mm256_stream_si256(dp.add(i) as *mut _, r0);
            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_stream_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }

        // Order the non-temporal stores before returning.
        _mm_sfence();
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

/// Detects tables whose only non-identity entries form one contiguous byte
/// range `[lo, hi]` shifted by a constant offset (e.g. ASCII case changes).
/// Returns `(lo, hi, offset)`; the offset is taken modulo 256, which is
/// exactly what the wrapping byte adds downstream need.
#[inline]
fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut offset = 0i16;

    for i in 0..256 {
        if table[i] != i as u8 {
            let diff = table[i] as i16 - i as i16;
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    offset = diff;
                }
                Some(_) => {
                    // Any gap or differing offset disqualifies the fast path.
                    if diff != offset || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, offset as i8))
}
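
// Illustrative sketch (not part of the original source): an ASCII lowercase to
// uppercase table is one contiguous range with a constant offset of -32.
#[cfg(test)]
mod range_offset_example {
    use super::*;

    #[test]
    fn detects_ascii_case_shift() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let table = build_translate_table(&set1, &set2);
        assert_eq!(detect_range_offset(&table), Some((b'a', b'z', -32)));
    }
}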

/// Detects tables that map one contiguous byte range `[lo, hi]` to a single
/// replacement byte and leave everything else untouched.
#[inline]
fn detect_range_to_constant(table: &[u8; 256]) -> Option<(u8, u8, u8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut replacement = 0u8;

    for i in 0..256 {
        if table[i] != i as u8 {
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    replacement = table[i];
                }
                Some(_) => {
                    if table[i] != replacement || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, replacement))
}
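
// Illustrative sketch (not part of the original source): `tr 0-9 '#'` maps a
// whole range to one byte, which this detector recognizes.
#[cfg(test)]
mod range_to_constant_example {
    use super::*;

    #[test]
    fn detects_digits_to_hash() {
        let digits: Vec<u8> = (b'0'..=b'9').collect();
        let table = build_translate_table(&digits, b"#");
        assert_eq!(detect_range_to_constant(&table), Some((b'0', b'9', b'#')));
    }
}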

/// Replaces every byte in `[lo, hi]` with `replacement`, in place.
#[cfg(target_arch = "x86_64")]
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    if get_simd_level() >= 3 {
        unsafe { translate_range_to_constant_avx2_inplace(data, lo, hi, replacement) };
    } else {
        unsafe { translate_range_to_constant_sse2_inplace(data, lo, hi, replacement) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_to_constant_avx2_inplace(
    data: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;

    unsafe {
        // Bias trick: adding (0x80 - lo) maps [lo, hi] onto
        // [-128, -128 + (hi - lo)] as signed bytes, so "in range" becomes a
        // single signed greater-than compare against a constant threshold.
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm256_set1_epi8(replacement as i8);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
            i += 64;
        }

        if i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let result = _mm256_blendv_epi8(input, repl_v, in_range);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let repl_v128 = _mm_set1_epi8(replacement as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            let result = _mm_blendv_epi8(input, repl_v128, in_range);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_to_constant_sse2_inplace(
    data: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm_set1_epi8(replacement as i8);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            // SSE2 has no blendv; emulate it with AND/ANDNOT/OR.
            let result = _mm_or_si128(
                _mm_and_si128(in_range, repl_v),
                _mm_andnot_si128(in_range, input),
            );
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}

#[cfg(target_arch = "aarch64")]
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    unsafe { translate_range_to_constant_neon_inplace(data, lo, hi, replacement) };
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_range_to_constant_neon_inplace(
    data: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::aarch64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();
        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let repl_v = vdupq_n_u8(replacement);
        let mut i = 0;

        while i + 32 <= len {
            let in0 = vld1q_u8(ptr.add(i));
            let in1 = vld1q_u8(ptr.add(i + 16));
            let ge0 = vcgeq_u8(in0, lo_v);
            let le0 = vcleq_u8(in0, hi_v);
            let mask0 = vandq_u8(ge0, le0);
            let ge1 = vcgeq_u8(in1, lo_v);
            let le1 = vcleq_u8(in1, hi_v);
            let mask1 = vandq_u8(ge1, le1);
            vst1q_u8(ptr.add(i), vbslq_u8(mask0, repl_v, in0));
            vst1q_u8(ptr.add(i + 16), vbslq_u8(mask1, repl_v, in1));
            i += 32;
        }

        if i + 16 <= len {
            let input = vld1q_u8(ptr.add(i));
            let ge = vcgeq_u8(input, lo_v);
            let le = vcleq_u8(input, hi_v);
            let mask = vandq_u8(ge, le);
            vst1q_u8(ptr.add(i), vbslq_u8(mask, repl_v, input));
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}

#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    for b in data.iter_mut() {
        if *b >= lo && *b <= hi {
            *b = replacement;
        }
    }
}

#[cfg(target_arch = "x86_64")]
fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    if get_simd_level() >= 3 {
        unsafe { translate_range_to_constant_avx2(src, dst, lo, hi, replacement) };
    } else {
        unsafe { translate_range_to_constant_sse2(src, dst, lo, hi, replacement) };
    }
}

#[cfg(target_arch = "aarch64")]
fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    unsafe { translate_range_to_constant_neon(src, dst, lo, hi, replacement) };
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_range_to_constant_neon(
    src: &[u8],
    dst: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::aarch64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let repl_v = vdupq_n_u8(replacement);
        let mut i = 0;

        while i + 32 <= len {
            let in0 = vld1q_u8(sp.add(i));
            let in1 = vld1q_u8(sp.add(i + 16));
            let mask0 = vandq_u8(vcgeq_u8(in0, lo_v), vcleq_u8(in0, hi_v));
            let mask1 = vandq_u8(vcgeq_u8(in1, lo_v), vcleq_u8(in1, hi_v));
            vst1q_u8(dp.add(i), vbslq_u8(mask0, repl_v, in0));
            vst1q_u8(dp.add(i + 16), vbslq_u8(mask1, repl_v, in1));
            i += 32;
        }

        if i + 16 <= len {
            let input = vld1q_u8(sp.add(i));
            let mask = vandq_u8(vcgeq_u8(input, lo_v), vcleq_u8(input, hi_v));
            vst1q_u8(dp.add(i), vbslq_u8(mask, repl_v, input));
            i += 16;
        }

        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}

#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    for (i, &b) in src.iter().enumerate() {
        unsafe {
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi { replacement } else { b };
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_to_constant_avx2(
    src: &[u8],
    dst: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;
    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm256_set1_epi8(replacement as i8);
        let zero = _mm256_setzero_si256();
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;
        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }
        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let result = _mm256_blendv_epi8(input, repl_v, in_range);
            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }
        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_to_constant_sse2(
    src: &[u8],
    dst: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;
    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm_set1_epi8(replacement as i8);
        let zero = _mm_setzero_si128();
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            let result = _mm_or_si128(
                _mm_and_si128(in_range, repl_v),
                _mm_andnot_si128(in_range, input),
            );
            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }
        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if get_simd_level() >= 3 {
        if dst.as_ptr() as usize & 31 == 0 {
            unsafe { translate_range_avx2_nt(src, dst, lo, hi, offset) };
        } else {
            unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
        }
    } else {
        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;

        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let m0 = _mm256_cmpeq_epi8(gt0, zero);
            let m1 = _mm256_cmpeq_epi8(gt1, zero);
            let om0 = _mm256_and_si256(m0, offset_v);
            let om1 = _mm256_and_si256(m1, offset_v);
            let r0 = _mm256_add_epi8(in0, om0);
            let r1 = _mm256_add_epi8(in1, om1);
            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2_nt(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;

        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let m0 = _mm256_cmpeq_epi8(gt0, zero);
            let m1 = _mm256_cmpeq_epi8(gt1, zero);
            let om0 = _mm256_and_si256(m0, offset_v);
            let om1 = _mm256_and_si256(m1, offset_v);
            let r0 = _mm256_add_epi8(in0, om0);
            let r1 = _mm256_add_epi8(in1, om1);
            _mm256_stream_si256(dp.add(i) as *mut _, r0);
            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_stream_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }

        _mm_sfence();
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "aarch64")]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    unsafe { translate_range_neon(src, dst, lo, hi, offset) };
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_range_neon(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::aarch64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let offset_v = vdupq_n_s8(offset);
        let mut i = 0;

        while i + 32 <= len {
            let in0 = vld1q_u8(sp.add(i));
            let in1 = vld1q_u8(sp.add(i + 16));
            let ge0 = vcgeq_u8(in0, lo_v);
            let le0 = vcleq_u8(in0, hi_v);
            let mask0 = vandq_u8(ge0, le0);
            let ge1 = vcgeq_u8(in1, lo_v);
            let le1 = vcleq_u8(in1, hi_v);
            let mask1 = vandq_u8(ge1, le1);
            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
            let r0 = vaddq_u8(in0, off0);
            let r1 = vaddq_u8(in1, off1);
            vst1q_u8(dp.add(i), r0);
            vst1q_u8(dp.add(i + 16), r1);
            i += 32;
        }

        if i + 16 <= len {
            let input = vld1q_u8(sp.add(i));
            let ge = vcgeq_u8(input, lo_v);
            let le = vcleq_u8(input, hi_v);
            let mask = vandq_u8(ge, le);
            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
            vst1q_u8(dp.add(i), vaddq_u8(input, off));
            i += 16;
        }

        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    let offset_u8 = offset as u8;
    let range = hi.wrapping_sub(lo);
    unsafe {
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let len = src.len();
        let mut i = 0;
        while i + 8 <= len {
            macro_rules! do_byte {
                ($off:expr) => {{
                    let b = *sp.add(i + $off);
                    let in_range = b.wrapping_sub(lo) <= range;
                    *dp.add(i + $off) = if in_range {
                        b.wrapping_add(offset_u8)
                    } else {
                        b
                    };
                }};
            }
            do_byte!(0);
            do_byte!(1);
            do_byte!(2);
            do_byte!(3);
            do_byte!(4);
            do_byte!(5);
            do_byte!(6);
            do_byte!(7);
            i += 8;
        }
        while i < len {
            let b = *sp.add(i);
            let in_range = b.wrapping_sub(lo) <= range;
            *dp.add(i) = if in_range {
                b.wrapping_add(offset_u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if get_simd_level() >= 3 {
        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let m0 = _mm256_cmpeq_epi8(gt0, zero);
            let m1 = _mm256_cmpeq_epi8(gt1, zero);
            let om0 = _mm256_and_si256(m0, offset_v);
            let om1 = _mm256_and_si256(m1, offset_v);
            let r0 = _mm256_add_epi8(in0, om0);
            let r1 = _mm256_add_epi8(in1, om1);
            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
            i += 64;
        }

        if i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "aarch64")]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    unsafe { translate_range_neon_inplace(data, lo, hi, offset) };
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_range_neon_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::aarch64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();
        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let offset_v = vdupq_n_s8(offset);
        let mut i = 0;

        while i + 32 <= len {
            let in0 = vld1q_u8(ptr.add(i));
            let in1 = vld1q_u8(ptr.add(i + 16));
            let ge0 = vcgeq_u8(in0, lo_v);
            let le0 = vcleq_u8(in0, hi_v);
            let mask0 = vandq_u8(ge0, le0);
            let ge1 = vcgeq_u8(in1, lo_v);
            let le1 = vcleq_u8(in1, hi_v);
            let mask1 = vandq_u8(ge1, le1);
            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
            vst1q_u8(ptr.add(i), vaddq_u8(in0, off0));
            vst1q_u8(ptr.add(i + 16), vaddq_u8(in1, off1));
            i += 32;
        }

        if i + 16 <= len {
            let input = vld1q_u8(ptr.add(i));
            let ge = vcgeq_u8(input, lo_v);
            let le = vcleq_u8(input, hi_v);
            let mask = vandq_u8(ge, le);
            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
            vst1q_u8(ptr.add(i), vaddq_u8(input, off));
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            if b >= lo && b <= hi {
                *ptr.add(i) = b.wrapping_add(offset as u8);
            }
            i += 1;
        }
    }
}

#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    let offset_u8 = offset as u8;
    let range = hi.wrapping_sub(lo);
    for b in data.iter_mut() {
        if b.wrapping_sub(lo) <= range {
            *b = b.wrapping_add(offset_u8);
        }
    }
}
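
// Illustrative sketch (not part of the original source): the range-offset
// kernels implement `tr a-z A-Z` without a lookup table.
#[cfg(test)]
mod range_inplace_example {
    use super::*;

    #[test]
    fn uppercases_ascii_in_place() {
        let mut data = b"Hello, World! 123".to_vec();
        translate_range_simd_inplace(&mut data, b'a', b'z', -32);
        assert_eq!(&data, b"HELLO, WORLD! 123");
    }
}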

/// Detects whether the delete set is one contiguous byte range. Assumes
/// `chars` has no duplicates; with duplicates the length check fails and the
/// function conservatively returns `None`.
#[inline]
fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
    if chars.is_empty() {
        return None;
    }
    let mut lo = chars[0];
    let mut hi = chars[0];
    for &c in &chars[1..] {
        if c < lo {
            lo = c;
        }
        if c > hi {
            hi = c;
        }
    }
    // The set is a contiguous range iff its span equals its length.
    if (hi as usize - lo as usize + 1) == chars.len() {
        Some((lo, hi))
    } else {
        None
    }
}
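
// Illustrative sketch (not part of the original source): only gap-free sets
// qualify for the range-delete fast path.
#[cfg(test)]
mod delete_range_example {
    use super::*;

    #[test]
    fn contiguous_sets_only() {
        assert_eq!(detect_delete_range(b"0123456789"), Some((b'0', b'9')));
        assert_eq!(detect_delete_range(b"02468"), None); // gaps disqualify
    }
}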
2094
2095#[cfg(target_arch = "x86_64")]
2099fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2100 if get_simd_level() >= 3 {
2101 unsafe { delete_range_avx2(src, dst, lo, hi) }
2102 } else {
2103 unsafe { delete_range_sse2(src, dst, lo, hi) }
2104 }
2105}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Signed-bias range test: AVX2 has no unsigned byte compare, so bytes
        // are re-centered around -128 and tested with one signed compare.
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            // `gt` marks bytes outside [lo, hi]; inverting it marks survivors.
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);

            if keep_mask == 0xFFFFFFFF {
                // Nothing deleted in this block: bulk copy.
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
                wp += 32;
            } else if keep_mask != 0 {
                // Left-pack the survivors 8 bytes at a time through the LUT.
                let m0 = keep_mask as u8;
                let m1 = (keep_mask >> 8) as u8;
                let m2 = (keep_mask >> 16) as u8;
                let m3 = (keep_mask >> 24) as u8;

                if m0 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
                } else if m0 != 0 {
                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
                }
                let c0 = m0.count_ones() as usize;

                if m1 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
                } else if m1 != 0 {
                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
                }
                let c1 = m1.count_ones() as usize;

                if m2 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
                } else if m2 != 0 {
                    compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
                }
                let c2 = m2.count_ones() as usize;

                if m3 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
                } else if m3 != 0 {
                    compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
                }
                let c3 = m3.count_ones() as usize;
                wp += c0 + c1 + c2 + c3;
            }
            ri += 32;
        }

        // At most one remaining 16-byte block.
        if ri + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            if keep_mask == 0xFFFF {
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
                wp += 16;
            } else if keep_mask != 0 {
                let m0 = keep_mask as u8;
                let m1 = (keep_mask >> 8) as u8;
                if m0 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
                } else if m0 != 0 {
                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
                }
                let c0 = m0.count_ones() as usize;
                if m1 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
                } else if m1 != 0 {
                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
                }
                wp += c0 + m1.count_ones() as usize;
            }
            ri += 16;
        }

        // Scalar tail.
        while ri < len {
            let b = *sp.add(ri);
            *dp.add(wp) = b;
            wp += (b < lo || b > hi) as usize;
            ri += 1;
        }

        wp
    }
}
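
// Worked example of the signed-bias range test (illustrative numbers): for
// lo = b'a' (0x61) and hi = b'z' (0x7A), range = 0x19, bias = 0x1F, and
// threshold = 0x99 (-103 as i8). Then b'a' biases to 0x80 (-128) and b'z' to
// 0x99 (-103), both <= threshold, while b'A' biases to 0x60 (+96) and fails.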

/// Scalar left-pack: copies the bytes selected by `mask` (bit i set keeps
/// `src[i]`) to the front of `dst` using the precomputed index LUT.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
    unsafe {
        let idx = COMPACT_LUT.get_unchecked(mask as usize);
        *dst = *src.add(*idx.get_unchecked(0) as usize);
        *dst.add(1) = *src.add(*idx.get_unchecked(1) as usize);
        *dst.add(2) = *src.add(*idx.get_unchecked(2) as usize);
        *dst.add(3) = *src.add(*idx.get_unchecked(3) as usize);
        *dst.add(4) = *src.add(*idx.get_unchecked(4) as usize);
        *dst.add(5) = *src.add(*idx.get_unchecked(5) as usize);
        *dst.add(6) = *src.add(*idx.get_unchecked(6) as usize);
        *dst.add(7) = *src.add(*idx.get_unchecked(7) as usize);
    }
}
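
// Worked LUT example (illustrative): mask = 0x65 = 0b0110_0101 keeps source
// bytes 0, 2, 5, and 6, so COMPACT_LUT[0x65] = [0, 2, 5, 6, 0, 0, 0, 0]. The
// trailing zero indices re-copy src[0], which is harmless because callers
// advance the write cursor by mask.count_ones(), not by 8.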

/// SSSE3 left-pack: one 8-byte shuffle through the same LUT used by the
/// scalar fallback. Always stores 8 bytes; callers advance by popcount.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
#[inline]
unsafe fn compact_8bytes_simd(src: *const u8, dst: *mut u8, mask: u8) {
    use std::arch::x86_64::*;
    unsafe {
        let src_v = _mm_loadl_epi64(src as *const _);
        let shuf = _mm_loadl_epi64(COMPACT_LUT.get_unchecked(mask as usize).as_ptr() as *const _);
        let out_v = _mm_shuffle_epi8(src_v, shuf);
        _mm_storel_epi64(dst as *mut _, out_v);
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 16 <= len {
            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            if keep_mask == 0xFFFF {
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
                wp += 16;
            } else if keep_mask != 0 {
                let m0 = keep_mask as u8;
                let m1 = (keep_mask >> 8) as u8;
                if m0 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
                } else if m0 != 0 {
                    compact_8bytes(sp.add(ri), dp.add(wp), m0);
                }
                let c0 = m0.count_ones() as usize;
                if m1 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
                } else if m1 != 0 {
                    compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1);
                }
                wp += c0 + m1.count_ones() as usize;
            }
            ri += 16;
        }

        while ri < len {
            let b = *sp.add(ri);
            *dp.add(wp) = b;
            wp += (b < lo || b > hi) as usize;
            ri += 1;
        }

        wp
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    let len = src.len();
    let sp = src.as_ptr();
    let dp = dst.as_mut_ptr();
    let mut wp: usize = 0;
    let mut i: usize = 0;

    while i + 8 <= len {
        unsafe {
            let b0 = *sp.add(i);
            *dp.add(wp) = b0;
            wp += (b0 < lo || b0 > hi) as usize;
            let b1 = *sp.add(i + 1);
            *dp.add(wp) = b1;
            wp += (b1 < lo || b1 > hi) as usize;
            let b2 = *sp.add(i + 2);
            *dp.add(wp) = b2;
            wp += (b2 < lo || b2 > hi) as usize;
            let b3 = *sp.add(i + 3);
            *dp.add(wp) = b3;
            wp += (b3 < lo || b3 > hi) as usize;
            let b4 = *sp.add(i + 4);
            *dp.add(wp) = b4;
            wp += (b4 < lo || b4 > hi) as usize;
            let b5 = *sp.add(i + 5);
            *dp.add(wp) = b5;
            wp += (b5 < lo || b5 > hi) as usize;
            let b6 = *sp.add(i + 6);
            *dp.add(wp) = b6;
            wp += (b6 < lo || b6 > hi) as usize;
            let b7 = *sp.add(i + 7);
            *dp.add(wp) = b7;
            wp += (b7 < lo || b7 > hi) as usize;
        }
        i += 8;
    }

    while i < len {
        unsafe {
            let b = *sp.add(i);
            *dp.add(wp) = b;
            wp += (b < lo || b > hi) as usize;
        }
        i += 1;
    }

    wp
}

/// Streaming delete for a contiguous byte range: reads into one buffer and
/// compacts it in place before writing.
fn delete_range_streaming(
    lo: u8,
    hi: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_once(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let wp = delete_range_inplace(&mut buf, n, lo, hi);
        if wp > 0 {
            writer.write_all(&buf[..wp])?;
        }
    }
    Ok(())
}

/// Compacts the first `n` bytes of `buf` in place, dropping bytes in
/// `[lo, hi]`, and returns the new length.
#[inline]
fn delete_range_inplace(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
    #[cfg(target_arch = "x86_64")]
    {
        let level = get_simd_level();
        if level >= 3 {
            return unsafe { delete_range_inplace_avx2(buf, n, lo, hi) };
        }
    }
    // Scalar fallback, unrolled by 8; the write cursor never passes the read
    // cursor, so in-place compaction is safe.
    let ptr = buf.as_mut_ptr();
    let mut ri = 0;
    let mut wp = 0;
    unsafe {
        while ri + 8 <= n {
            let b0 = *ptr.add(ri);
            let b1 = *ptr.add(ri + 1);
            let b2 = *ptr.add(ri + 2);
            let b3 = *ptr.add(ri + 3);
            let b4 = *ptr.add(ri + 4);
            let b5 = *ptr.add(ri + 5);
            let b6 = *ptr.add(ri + 6);
            let b7 = *ptr.add(ri + 7);
            *ptr.add(wp) = b0;
            wp += (b0 < lo || b0 > hi) as usize;
            *ptr.add(wp) = b1;
            wp += (b1 < lo || b1 > hi) as usize;
            *ptr.add(wp) = b2;
            wp += (b2 < lo || b2 > hi) as usize;
            *ptr.add(wp) = b3;
            wp += (b3 < lo || b3 > hi) as usize;
            *ptr.add(wp) = b4;
            wp += (b4 < lo || b4 > hi) as usize;
            *ptr.add(wp) = b5;
            wp += (b5 < lo || b5 > hi) as usize;
            *ptr.add(wp) = b6;
            wp += (b6 < lo || b6 > hi) as usize;
            *ptr.add(wp) = b7;
            wp += (b7 < lo || b7 > hi) as usize;
            ri += 8;
        }
        while ri < n {
            let b = *ptr.add(ri);
            *ptr.add(wp) = b;
            wp += (b < lo || b > hi) as usize;
            ri += 1;
        }
    }
    wp
}
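
// A small sketch (not from the original suite) checking in-place range
// deletion against a trivially verifiable expectation.
#[cfg(test)]
mod delete_range_inplace_sketch {
    use super::*;

    #[test]
    fn drops_only_the_range() {
        let mut buf = *b"abcdefga";
        let wp = delete_range_inplace(&mut buf, 8, b'b', b'd');
        assert_eq!(&buf[..wp], b"aefga");
    }
}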

/// AVX2 in-place variant: survivors are packed back into `buf` itself, so
/// block moves use std::ptr::copy (regions may overlap; wp never passes ri).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_range_inplace_avx2(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        let ptr = buf.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 32 <= n {
            let input = _mm256_loadu_si256(ptr.add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let del_mask = _mm256_movemask_epi8(in_range) as u32;

            if del_mask == 0 {
                // Nothing deleted: slide the block only if cursors diverged.
                if wp != ri {
                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 32);
                }
                wp += 32;
            } else if del_mask != 0xFFFFFFFF {
                // Pack each 8-byte group through the shuffle LUT; the stores
                // write 8 bytes, but only the popcount prefix is meaningful.
                let keep_mask = !del_mask;
                let m0 = keep_mask as u8;
                let m1 = (keep_mask >> 8) as u8;
                let m2 = (keep_mask >> 16) as u8;
                let m3 = (keep_mask >> 24) as u8;

                let c0 = m0.count_ones() as usize;
                let c1 = m1.count_ones() as usize;
                let c2 = m2.count_ones() as usize;
                let c3 = m3.count_ones() as usize;

                if m0 == 0xFF {
                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 8);
                } else if m0 != 0 {
                    let src_v = _mm_loadl_epi64(ptr.add(ri) as *const _);
                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m0 as usize].as_ptr() as *const _);
                    let out_v = _mm_shuffle_epi8(src_v, shuf);
                    _mm_storel_epi64(ptr.add(wp) as *mut _, out_v);
                }

                if m1 == 0xFF {
                    std::ptr::copy(ptr.add(ri + 8), ptr.add(wp + c0), 8);
                } else if m1 != 0 {
                    let src_v = _mm_loadl_epi64(ptr.add(ri + 8) as *const _);
                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m1 as usize].as_ptr() as *const _);
                    let out_v = _mm_shuffle_epi8(src_v, shuf);
                    _mm_storel_epi64(ptr.add(wp + c0) as *mut _, out_v);
                }

                if m2 == 0xFF {
                    std::ptr::copy(ptr.add(ri + 16), ptr.add(wp + c0 + c1), 8);
                } else if m2 != 0 {
                    let src_v = _mm_loadl_epi64(ptr.add(ri + 16) as *const _);
                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m2 as usize].as_ptr() as *const _);
                    let out_v = _mm_shuffle_epi8(src_v, shuf);
                    _mm_storel_epi64(ptr.add(wp + c0 + c1) as *mut _, out_v);
                }

                if m3 == 0xFF {
                    std::ptr::copy(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), 8);
                } else if m3 != 0 {
                    let src_v = _mm_loadl_epi64(ptr.add(ri + 24) as *const _);
                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m3 as usize].as_ptr() as *const _);
                    let out_v = _mm_shuffle_epi8(src_v, shuf);
                    _mm_storel_epi64(ptr.add(wp + c0 + c1 + c2) as *mut _, out_v);
                }

                wp += c0 + c1 + c2 + c3;
            }
            ri += 32;
        }

        // Scalar tail.
        while ri < n {
            let b = *ptr.add(ri);
            *ptr.add(wp) = b;
            wp += (b < lo || b > hi) as usize;
            ri += 1;
        }

        wp
    }
}

/// Streaming translate: builds the 256-byte table once, then picks the
/// cheapest strategy (identity passthrough, range offset, range-to-constant,
/// or full table lookup).
pub fn translate(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return passthrough_stream(reader, writer);
    }

    // Fast path: the whole mapping is a constant offset over one byte range.
    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_range_stream(lo, hi, offset, reader, writer);
    }

    // Fast path: one byte range maps to a single replacement byte.
    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
        return translate_range_to_constant_stream(lo, hi, replacement, reader, writer);
    }

    // General case: table lookup over each streamed block.
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_once(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_and_write_table(&mut buf, n, &table, writer)?;
    }
    Ok(())
}
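
// End-to-end sketch (not from the original suite): a `&[u8]` serves as the
// reader and a `Vec<u8>` as the writer, both via std's own impls.
#[cfg(test)]
mod translate_sketch {
    use super::*;

    #[test]
    fn maps_set1_onto_set2() {
        let mut input: &[u8] = b"abca!";
        let mut out = Vec::new();
        translate(b"abc", b"xyz", &mut input, &mut out).unwrap();
        assert_eq!(out, b"xyzx!");
    }
}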

#[inline]
fn translate_and_write_table(
    buf: &mut [u8],
    total: usize,
    table: &[u8; 256],
    writer: &mut impl Write,
) -> io::Result<()> {
    if total >= PARALLEL_THRESHOLD {
        let nt = rayon::current_num_threads().max(1);
        let cs = (total / nt).max(32 * 1024);
        buf[..total].par_chunks_mut(cs).for_each(|chunk| {
            translate_inplace(chunk, table);
        });
    } else {
        translate_inplace(&mut buf[..total], table);
    }
    writer.write_all(&buf[..total])
}

/// Streaming loop for the range-offset fast path.
fn translate_range_stream(
    lo: u8,
    hi: u8,
    offset: i8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_once(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_and_write_range(&mut buf, n, lo, hi, offset, writer)?;
    }
    Ok(())
}

#[inline]
fn translate_and_write_range(
    buf: &mut [u8],
    total: usize,
    lo: u8,
    hi: u8,
    offset: i8,
    writer: &mut impl Write,
) -> io::Result<()> {
    if total >= PARALLEL_THRESHOLD {
        let nt = rayon::current_num_threads().max(1);
        let cs = (total / nt).max(32 * 1024);
        buf[..total].par_chunks_mut(cs).for_each(|chunk| {
            translate_range_simd_inplace(chunk, lo, hi, offset);
        });
    } else {
        translate_range_simd_inplace(&mut buf[..total], lo, hi, offset);
    }
    writer.write_all(&buf[..total])
}

/// Streaming loop for the range-to-constant fast path.
fn translate_range_to_constant_stream(
    lo: u8,
    hi: u8,
    replacement: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_once(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_and_write_range_const(&mut buf, n, lo, hi, replacement, writer)?;
    }
    Ok(())
}

#[inline]
fn translate_and_write_range_const(
    buf: &mut [u8],
    total: usize,
    lo: u8,
    hi: u8,
    replacement: u8,
    writer: &mut impl Write,
) -> io::Result<()> {
    if total >= PARALLEL_THRESHOLD {
        let nt = rayon::current_num_threads().max(1);
        let cs = (total / nt).max(32 * 1024);
        buf[..total].par_chunks_mut(cs).for_each(|chunk| {
            translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
        });
    } else {
        translate_range_to_constant_simd_inplace(&mut buf[..total], lo, hi, replacement);
    }
    writer.write_all(&buf[..total])
}

/// Identity mapping: stream input to output unchanged.
fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_once(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

/// One read, retried on EINTR; a return of 0 means EOF.
#[inline]
fn read_once(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    loop {
        match reader.read(buf) {
            Ok(n) => return Ok(n),
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
}

pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    // When set2 collapses to a single distinct byte, squeezing reduces to
    // collapsing runs of that one byte, which has a much cheaper path.
    if !set2.is_empty() && set2.iter().all(|&b| b == set2[0]) {
        let squeeze_ch = set2[0];
        return translate_squeeze_single_ch(&table, squeeze_ch, &squeeze_set, reader, writer);
    }

    // Precompute fast-path info once; reused for every block.
    let range_info = detect_range_offset(&table);
    let range_const_info = if range_info.is_none() {
        detect_range_to_constant(&table)
    } else {
        None
    };

    let mut buf = alloc_uninit_vec(STREAM_BUF);
    // 256 is an out-of-band sentinel meaning "no squeezable byte pending".
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_once(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let wp = translate_squeeze_process(
            &mut buf,
            n,
            &table,
            &squeeze_set,
            range_info,
            range_const_info,
            &mut last_squeezed,
        );
        if wp > 0 {
            writer.write_all(&buf[..wp])?;
        }
    }
    Ok(())
}
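
// Behavior sketch (not from the original suite): "aa" maps to "bb", then the
// squeeze pass collapses the run of the set2 byte.
#[cfg(test)]
mod translate_squeeze_sketch {
    use super::*;

    #[test]
    fn translates_then_squeezes() {
        let mut input: &[u8] = b"aab";
        let mut out = Vec::new();
        translate_squeeze(b"a", b"b", &mut input, &mut out).unwrap();
        assert_eq!(out, b"b");
    }
}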

#[inline]
fn translate_squeeze_process(
    buf: &mut [u8],
    n: usize,
    table: &[u8; 256],
    squeeze_set: &[u8; 32],
    range_info: Option<(u8, u8, i8)>,
    range_const_info: Option<(u8, u8, u8)>,
    last_squeezed: &mut u16,
) -> usize {
    // Translate the whole block first, then squeeze in place.
    if let Some((lo, hi, offset)) = range_info {
        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
    } else if let Some((lo, hi, replacement)) = range_const_info {
        translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
    } else {
        translate_inplace(&mut buf[..n], table);
    }
    // Squeeze pass, unrolled by 8; the write cursor trails the read cursor.
    let mut wp = 0;
    unsafe {
        let ptr = buf.as_mut_ptr();
        let mut i = 0;
        while i + 8 <= n {
            macro_rules! squeeze_byte {
                ($off:expr) => {
                    let b = *ptr.add(i + $off);
                    if is_member(squeeze_set, b) {
                        if *last_squeezed != b as u16 {
                            *last_squeezed = b as u16;
                            *ptr.add(wp) = b;
                            wp += 1;
                        }
                    } else {
                        *last_squeezed = 256;
                        *ptr.add(wp) = b;
                        wp += 1;
                    }
                };
            }
            squeeze_byte!(0);
            squeeze_byte!(1);
            squeeze_byte!(2);
            squeeze_byte!(3);
            squeeze_byte!(4);
            squeeze_byte!(5);
            squeeze_byte!(6);
            squeeze_byte!(7);
            i += 8;
        }
        while i < n {
            let b = *ptr.add(i);
            if is_member(squeeze_set, b) {
                if *last_squeezed == b as u16 {
                    i += 1;
                    continue;
                }
                *last_squeezed = b as u16;
            } else {
                *last_squeezed = 256;
            }
            *ptr.add(wp) = b;
            wp += 1;
            i += 1;
        }
    }
    wp
}

/// Translate + squeeze when the squeeze set collapses to one byte: a memmem
/// finder for the doubled byte locates the runs to collapse.
fn translate_squeeze_single_ch(
    table: &[u8; 256],
    squeeze_ch: u8,
    _squeeze_set: &[u8; 32],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let range_info = detect_range_offset(table);
    let range_const_info = if range_info.is_none() {
        detect_range_to_constant(table)
    } else {
        None
    };

    let pair = [squeeze_ch, squeeze_ch];
    let finder = memchr::memmem::Finder::new(&pair);
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    let mut was_squeeze_char = false;

    loop {
        let n = read_once(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let wp = translate_squeeze_single_process(
            &mut buf,
            n,
            table,
            squeeze_ch,
            &finder,
            range_info,
            range_const_info,
            &mut was_squeeze_char,
        );
        if wp > 0 {
            writer.write_all(&buf[..wp])?;
        }
    }
    Ok(())
}

#[inline]
fn translate_squeeze_single_process(
    buf: &mut [u8],
    n: usize,
    table: &[u8; 256],
    squeeze_ch: u8,
    finder: &memchr::memmem::Finder<'_>,
    range_info: Option<(u8, u8, i8)>,
    range_const_info: Option<(u8, u8, u8)>,
    was_squeeze_char: &mut bool,
) -> usize {
    // Translate the whole block first.
    if let Some((lo, hi, offset)) = range_info {
        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
    } else if let Some((lo, hi, replacement)) = range_const_info {
        translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
    } else {
        translate_inplace(&mut buf[..n], table);
    }

    // If the previous block ended inside a squeeze run, drop its continuation.
    let mut i = 0;
    if *was_squeeze_char {
        while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
            i += 1;
        }
        *was_squeeze_char = false;
        if i >= n {
            *was_squeeze_char = true;
            return 0;
        }
    }

    let ptr = buf.as_mut_ptr();
    let mut wp = 0usize;

    loop {
        match finder.find(&buf[i..n]) {
            Some(offset) => {
                // Copy through the first byte of the doubled pair, then skip
                // the rest of the run.
                let seg_end = i + offset + 1;
                let gap = seg_end - i;
                if gap > 0 {
                    if wp != i {
                        unsafe {
                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
                        }
                    }
                    wp += gap;
                }
                i = seg_end;
                while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
                    i += 1;
                }
                if i >= n {
                    *was_squeeze_char = true;
                    break;
                }
            }
            None => {
                // No doubled pair left: copy the remainder and record whether
                // the block ends on the squeeze byte.
                let rem = n - i;
                if rem > 0 {
                    if wp != i {
                        unsafe {
                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
                        }
                    }
                    wp += rem;
                }
                *was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == squeeze_ch;
                break;
            }
        }
    }
    wp
}

pub fn delete(
    delete_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if delete_chars.is_empty() {
        // Nothing to delete; also keeps the memchr paths below from indexing
        // into an empty set.
        return passthrough_stream(reader, writer);
    }
    if delete_chars.len() == 1 {
        return delete_single_streaming(delete_chars[0], reader, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_streaming(delete_chars, reader, writer);
    }

    // Contiguous sets get the vectorized range path.
    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_streaming(lo, hi, reader, writer);
    }

    let member = build_member_set(delete_chars);
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    let mut outbuf = alloc_uninit_vec(STREAM_BUF);

    loop {
        let n = read_once(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let wp = delete_bitset_dispatch(&buf[..n], &mut outbuf, &member);
        if wp > 0 {
            writer.write_all(&outbuf[..wp])?;
        }
    }
    Ok(())
}
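
// Path sketch (not from the original suite): a single delete byte rides the
// memchr fast path.
#[cfg(test)]
mod delete_sketch {
    use super::*;

    #[test]
    fn removes_every_occurrence() {
        let mut input: &[u8] = b"hello";
        let mut out = Vec::new();
        delete(b"l", &mut input, &mut out).unwrap();
        assert_eq!(out, b"heo");
    }
}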

#[inline]
fn delete_bitset_dispatch(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
    #[cfg(target_arch = "x86_64")]
    {
        if get_simd_level() >= 3 {
            return unsafe { delete_bitset_avx2_stream(src, dst, member) };
        }
    }
    delete_bitset_scalar(src, dst, member)
}

#[inline]
fn delete_bitset_scalar(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
    let n = src.len();
    let mut wp = 0;
    unsafe {
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;
        while i + 8 <= n {
            let b0 = *sp.add(i);
            let b1 = *sp.add(i + 1);
            let b2 = *sp.add(i + 2);
            let b3 = *sp.add(i + 3);
            let b4 = *sp.add(i + 4);
            let b5 = *sp.add(i + 5);
            let b6 = *sp.add(i + 6);
            let b7 = *sp.add(i + 7);
            *dp.add(wp) = b0;
            wp += !is_member(member, b0) as usize;
            *dp.add(wp) = b1;
            wp += !is_member(member, b1) as usize;
            *dp.add(wp) = b2;
            wp += !is_member(member, b2) as usize;
            *dp.add(wp) = b3;
            wp += !is_member(member, b3) as usize;
            *dp.add(wp) = b4;
            wp += !is_member(member, b4) as usize;
            *dp.add(wp) = b5;
            wp += !is_member(member, b5) as usize;
            *dp.add(wp) = b6;
            wp += !is_member(member, b6) as usize;
            *dp.add(wp) = b7;
            wp += !is_member(member, b7) as usize;
            i += 8;
        }
        while i < n {
            let b = *sp.add(i);
            *dp.add(wp) = b;
            wp += !is_member(member, b) as usize;
            i += 1;
        }
    }
    wp
}

/// AVX2 bitset delete: tests member[b >> 3] & (1 << (b & 7)) entirely in
/// registers, 32 bytes per iteration.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_bitset_avx2_stream(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let n = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        // The 256-bit membership bitmap, loaded once.
        let member_v = _mm256_loadu_si256(member.as_ptr() as *const _);

        let mask7 = _mm256_set1_epi8(7);
        let mask_0x1f = _mm256_set1_epi8(0x1F_u8 as i8);

        // Per-lane table of 1 << (b & 7), repeated so both 128-bit halves of
        // _mm256_shuffle_epi8 see the same 16 entries.
        let bit_table = _mm256_setr_epi8(
            1, 2, 4, 8, 16, 32, 64, -128i8, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, -128i8,
            0, 0, 0, 0, 0, 0, 0, 0,
        );

        while ri + 32 <= n {
            let input = _mm256_loadu_si256(sp.add(ri) as *const _);

            // byte_idx = b >> 3. There is no 8-bit shift, so shift 16-bit
            // lanes and mask off the bits that leaked in from the neighbor.
            let byte_idx = _mm256_and_si256(_mm256_srli_epi16(input, 3), mask_0x1f);
            let bit_pos = _mm256_and_si256(input, mask7);
            let bit_mask = _mm256_shuffle_epi8(bit_table, bit_pos);

            // Select member[byte_idx] per lane: shuffle both 16-byte halves
            // of the bitmap by byte_idx & 0x0F, then blend on bit 4 of
            // byte_idx (shifted to bit 7, where blendv reads it).
            let member_lo = _mm256_broadcastsi128_si256(_mm256_castsi256_si128(member_v));
            let member_hi = _mm256_broadcastsi128_si256(_mm256_extracti128_si256(member_v, 1));
            let lo_mask = _mm256_set1_epi8(0x0F);
            let idx_lo = _mm256_and_si256(byte_idx, lo_mask);
            let shuffled_lo = _mm256_shuffle_epi8(member_lo, idx_lo);
            let shuffled_hi = _mm256_shuffle_epi8(member_hi, idx_lo);
            let use_hi = _mm256_slli_epi16(byte_idx, 3);
            let member_byte = _mm256_blendv_epi8(shuffled_lo, shuffled_hi, use_hi);

            // A zero test bit means the byte is not in the set and is kept.
            let test = _mm256_and_si256(member_byte, bit_mask);
            let is_zero = _mm256_cmpeq_epi8(test, _mm256_setzero_si256());
            let keep_mask = _mm256_movemask_epi8(is_zero) as u32;

            if keep_mask == 0xFFFFFFFF {
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
                wp += 32;
            } else if keep_mask != 0 {
                let m0 = keep_mask as u8;
                let m1 = (keep_mask >> 8) as u8;
                let m2 = (keep_mask >> 16) as u8;
                let m3 = (keep_mask >> 24) as u8;

                if m0 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
                } else if m0 != 0 {
                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
                }
                let c0 = m0.count_ones() as usize;

                if m1 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
                } else if m1 != 0 {
                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
                }
                let c1 = m1.count_ones() as usize;

                if m2 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
                } else if m2 != 0 {
                    compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
                }
                let c2 = m2.count_ones() as usize;

                if m3 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
                } else if m3 != 0 {
                    compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
                }
                let c3 = m3.count_ones() as usize;
                wp += c0 + c1 + c2 + c3;
            }
            ri += 32;
        }

        // Scalar tail.
        while ri < n {
            let b = *sp.add(ri);
            *dp.add(wp) = b;
            wp += !is_member(member, b) as usize;
            ri += 1;
        }

        wp
    }
}
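
// Worked membership example (illustrative): for b = b'A' (0x41), byte_idx is
// 0x41 >> 3 = 8 and bit_pos is 0x41 & 7 = 1, so the vector path probes
// member[8] & (1 << 1) -- the same bit is_member() reads, computed for 32
// bytes per iteration.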

fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_once(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let wp = delete_single_inplace(&mut buf, n, ch);
        if wp > 0 {
            writer.write_all(&buf[..wp])?;
        }
    }
    Ok(())
}

/// In-place single-byte delete: memchr jumps between occurrences and the
/// surviving runs are slid left with overlapping copies.
#[inline]
fn delete_single_inplace(buf: &mut [u8], n: usize, ch: u8) -> usize {
    let mut wp = 0;
    let mut i = 0;
    while i < n {
        match memchr::memchr(ch, &buf[i..n]) {
            Some(offset) => {
                if offset > 0 {
                    if wp != i {
                        unsafe {
                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), offset);
                        }
                    }
                    wp += offset;
                }
                i += offset + 1;
            }
            None => {
                let run_len = n - i;
                if run_len > 0 {
                    if wp != i {
                        unsafe {
                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), run_len);
                        }
                    }
                    wp += run_len;
                }
                break;
            }
        }
    }
    wp
}

fn delete_multi_streaming(
    chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_once(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let wp = delete_multi_inplace(&mut buf, n, chars);
        if wp > 0 {
            writer.write_all(&buf[..wp])?;
        }
    }
    Ok(())
}

/// In-place delete for exactly two or three distinct bytes, driven by
/// memchr2/memchr3 (callers guarantee `chars.len()` is 2 or 3).
#[inline]
fn delete_multi_inplace(buf: &mut [u8], n: usize, chars: &[u8]) -> usize {
    let mut wp = 0;
    let mut i = 0;
    while i < n {
        let found = if chars.len() == 2 {
            memchr::memchr2(chars[0], chars[1], &buf[i..n])
        } else {
            memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
        };
        match found {
            Some(offset) => {
                if offset > 0 {
                    if wp != i {
                        unsafe {
                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), offset);
                        }
                    }
                    wp += offset;
                }
                i += offset + 1;
            }
            None => {
                let run_len = n - i;
                if run_len > 0 {
                    if wp != i {
                        unsafe {
                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), run_len);
                        }
                    }
                    wp += run_len;
                }
                break;
            }
        }
    }
    wp
}

pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_once(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let wp = delete_squeeze_inplace(&mut buf, n, &delete_set, &squeeze_set, &mut last_squeezed);
        if wp > 0 {
            writer.write_all(&buf[..wp])?;
        }
    }
    Ok(())
}
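
// Behavior sketch (not from the original suite): deletion happens before the
// squeeze test, so deleted bytes never break a squeezed run.
#[cfg(test)]
mod delete_squeeze_sketch {
    use super::*;

    #[test]
    fn deletes_then_squeezes() {
        let mut input: &[u8] = b"he--llo";
        let mut out = Vec::new();
        delete_squeeze(b"-", b"l", &mut input, &mut out).unwrap();
        assert_eq!(out, b"helo");
    }
}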

#[inline]
fn delete_squeeze_inplace(
    buf: &mut [u8],
    n: usize,
    delete_set: &[u8; 32],
    squeeze_set: &[u8; 32],
    last_squeezed: &mut u16,
) -> usize {
    let mut wp = 0;
    unsafe {
        let ptr = buf.as_mut_ptr();
        let mut i = 0;
        while i + 8 <= n {
            macro_rules! process_byte {
                ($off:expr) => {
                    let b = *ptr.add(i + $off);
                    if !is_member(delete_set, b) {
                        if is_member(squeeze_set, b) {
                            if *last_squeezed != b as u16 {
                                *last_squeezed = b as u16;
                                *ptr.add(wp) = b;
                                wp += 1;
                            }
                        } else {
                            *last_squeezed = 256;
                            *ptr.add(wp) = b;
                            wp += 1;
                        }
                    }
                };
            }
            process_byte!(0);
            process_byte!(1);
            process_byte!(2);
            process_byte!(3);
            process_byte!(4);
            process_byte!(5);
            process_byte!(6);
            process_byte!(7);
            i += 8;
        }
        while i < n {
            let b = *ptr.add(i);
            if !is_member(delete_set, b) {
                if is_member(squeeze_set, b) {
                    if *last_squeezed != b as u16 {
                        *last_squeezed = b as u16;
                        *ptr.add(wp) = b;
                        wp += 1;
                    }
                } else {
                    *last_squeezed = 256;
                    *ptr.add(wp) = b;
                    wp += 1;
                }
            }
            i += 1;
        }
    }
    wp
}

pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if squeeze_chars.is_empty() {
        // Nothing to squeeze: avoid indexing into an empty set below.
        return passthrough_stream(reader, writer);
    }
    if squeeze_chars.len() == 1 {
        return squeeze_single_stream(squeeze_chars[0], reader, writer);
    }

    // Two or three distinct bytes can ride on memchr2/memchr3.
    if squeeze_chars.len() <= 3 {
        return squeeze_multi_stream(squeeze_chars, reader, writer);
    }

    let member = build_member_set(squeeze_chars);
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_once(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let wp = squeeze_inplace_bitset(&mut buf, n, &member, &mut last_squeezed);
        if wp > 0 {
            writer.write_all(&buf[..wp])?;
        }
    }
    Ok(())
}
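
// Behavior sketch (not from the original suite): runs of either member byte
// collapse independently.
#[cfg(test)]
mod squeeze_sketch {
    use super::*;

    #[test]
    fn collapses_runs_of_members() {
        let mut input: &[u8] = b"aabbba";
        let mut out = Vec::new();
        squeeze(b"ab", &mut input, &mut out).unwrap();
        assert_eq!(out, b"aba");
    }
}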

#[inline]
fn squeeze_inplace_bitset(
    buf: &mut [u8],
    n: usize,
    member: &[u8; 32],
    last_squeezed: &mut u16,
) -> usize {
    let mut wp = 0;
    unsafe {
        let ptr = buf.as_mut_ptr();
        for i in 0..n {
            let b = *ptr.add(i);
            if is_member(member, b) {
                if *last_squeezed == b as u16 {
                    continue;
                }
                *last_squeezed = b as u16;
            } else {
                *last_squeezed = 256;
            }
            *ptr.add(wp) = b;
            wp += 1;
        }
    }
    wp
}

/// Streaming squeeze for two or three distinct bytes, driven by
/// memchr2/memchr3.
fn squeeze_multi_stream(
    chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let c0 = chars[0];
    let c1 = chars[1];
    let c2 = chars.get(2).copied();

    let mut buf = alloc_uninit_vec(STREAM_BUF);
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_once(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let wp = squeeze_multi_compact(&mut buf, n, c0, c1, c2, &mut last_squeezed);
        if wp > 0 {
            writer.write_all(&buf[..wp])?;
        }
    }
    Ok(())
}

#[inline]
fn squeeze_multi_compact(
    buf: &mut [u8],
    n: usize,
    c0: u8,
    c1: u8,
    c2: Option<u8>,
    last_squeezed: &mut u16,
) -> usize {
    let ptr = buf.as_mut_ptr();
    let mut wp = 0usize;
    let mut cursor = 0usize;

    while cursor < n {
        let found = if let Some(c) = c2 {
            memchr::memchr3(c0, c1, c, &buf[cursor..n])
        } else {
            memchr::memchr2(c0, c1, &buf[cursor..n])
        };
        match found {
            Some(offset) => {
                let pos = cursor + offset;
                let b = unsafe { *ptr.add(pos) };

                // Bytes before the hit are all non-members: copy them through
                // and reset the squeeze state.
                let gap = pos - cursor;
                if gap > 0 {
                    if wp != cursor {
                        unsafe {
                            std::ptr::copy(ptr.add(cursor), ptr.add(wp), gap);
                        }
                    }
                    wp += gap;
                    *last_squeezed = 256;
                }

                // Emit the member byte unless it continues a squeezed run,
                // then skip the rest of its run.
                if *last_squeezed != b as u16 {
                    unsafe { *ptr.add(wp) = b };
                    wp += 1;
                    *last_squeezed = b as u16;
                }

                cursor = pos + 1;
                while cursor < n && unsafe { *ptr.add(cursor) } == b {
                    cursor += 1;
                }
            }
            None => {
                let rem = n - cursor;
                if rem > 0 {
                    if wp != cursor {
                        unsafe {
                            std::ptr::copy(ptr.add(cursor), ptr.add(wp), rem);
                        }
                    }
                    wp += rem;
                    *last_squeezed = 256;
                }
                break;
            }
        }
    }
    wp
}

fn squeeze_single_stream(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let pair = [ch, ch];
    let finder = memchr::memmem::Finder::new(&pair);
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    let mut was_squeeze_char = false;

    loop {
        let n = read_once(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let wp = squeeze_single_compact(&mut buf, n, ch, &finder, &mut was_squeeze_char);
        if wp > 0 {
            writer.write_all(&buf[..wp])?;
        }
    }
    Ok(())
}

#[inline]
fn squeeze_single_compact(
    buf: &mut [u8],
    n: usize,
    ch: u8,
    finder: &memchr::memmem::Finder<'_>,
    was_squeeze_char: &mut bool,
) -> usize {
    let mut i = 0;

    // If the previous block ended on `ch`, drop that run's continuation.
    if *was_squeeze_char {
        while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
            i += 1;
        }
        *was_squeeze_char = false;
        if i >= n {
            *was_squeeze_char = true;
            return 0;
        }
    }

    let ptr = buf.as_mut_ptr();
    let mut wp = 0usize;

    loop {
        match finder.find(&buf[i..n]) {
            Some(offset) => {
                // Copy through the first byte of the doubled pair, then skip
                // the rest of the run.
                let seg_end = i + offset + 1;
                let gap = seg_end - i;
                if gap > 0 {
                    if wp != i {
                        unsafe {
                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
                        }
                    }
                    wp += gap;
                }
                i = seg_end;
                while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
                    i += 1;
                }
                if i >= n {
                    *was_squeeze_char = true;
                    break;
                }
            }
            None => {
                let rem = n - i;
                if rem > 0 {
                    if wp != i {
                        unsafe {
                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
                        }
                    }
                    wp += rem;
                }
                // Carry the run state into the next block.
                *was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == ch;
                break;
            }
        }
    }
    wp
}

/// Translate a caller-owned buffer in place, then write it out; avoids
/// allocating a second buffer for whole-file inputs.
pub fn translate_owned(
    set1: &[u8],
    set2: &[u8],
    data: &mut [u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return writer.write_all(data);
    }

    // In-place work is cheap per byte, so parallelism only pays off for
    // inputs well above the streaming threshold.
    const OWNED_PARALLEL_MIN: usize = 32 * 1024 * 1024;

    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        if data.len() >= OWNED_PARALLEL_MIN {
            let n_threads = rayon::current_num_threads().max(1);
            let chunk_size = (data.len() / n_threads).max(32 * 1024);
            data.par_chunks_mut(chunk_size).for_each(|chunk| {
                translate_range_simd_inplace(chunk, lo, hi, offset);
            });
        } else {
            translate_range_simd_inplace(data, lo, hi, offset);
        }
        return writer.write_all(data);
    }

    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
        if data.len() >= OWNED_PARALLEL_MIN {
            let n_threads = rayon::current_num_threads().max(1);
            let chunk_size = (data.len() / n_threads).max(32 * 1024);
            data.par_chunks_mut(chunk_size).for_each(|chunk| {
                translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
            });
        } else {
            translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
        }
        return writer.write_all(data);
    }

    if data.len() >= OWNED_PARALLEL_MIN {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);
        data.par_chunks_mut(chunk_size).for_each(|chunk| {
            translate_inplace(chunk, &table);
        });
    } else {
        translate_inplace(data, &table);
    }
    writer.write_all(data)
}

/// Translate a read-only mmap'd input through a bounce buffer, picking the
/// same fast paths as the streaming version.
pub fn translate_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return writer.write_all(data);
    }

    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_mmap_range(data, writer, lo, hi, offset);
    }

    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
        return translate_mmap_range_to_constant(data, writer, lo, hi, replacement);
    }

    translate_mmap_table(data, writer, &table)
}

/// Range-offset fast path over an mmap'd input.
fn translate_mmap_range(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
    offset: i8,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
            });

        return writer.write_all(&buf);
    }

    // Sequential path: bounce through a cache-sized buffer.
    const CHUNK: usize = 256 * 1024;
    let buf_size = data.len().min(CHUNK);
    let mut buf = alloc_uninit_vec(buf_size);
    for chunk in data.chunks(CHUNK) {
        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

fn translate_mmap_range_to_constant(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
    replacement: u8,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                dst_chunk[..src_chunk.len()].copy_from_slice(src_chunk);
                translate_range_to_constant_simd_inplace(
                    &mut dst_chunk[..src_chunk.len()],
                    lo,
                    hi,
                    replacement,
                );
            });

        return writer.write_all(&buf);
    }

    const CHUNK: usize = 256 * 1024;
    let buf_size = data.len().min(CHUNK);
    let mut buf = alloc_uninit_vec(buf_size);
    for chunk in data.chunks(CHUNK) {
        buf[..chunk.len()].copy_from_slice(chunk);
        translate_range_to_constant_simd_inplace(&mut buf[..chunk.len()], lo, hi, replacement);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
            });

        return writer.write_all(&buf);
    }

    const CHUNK: usize = 256 * 1024;
    let buf_size = data.len().min(CHUNK);
    let mut buf = alloc_uninit_vec(buf_size);
    for chunk in data.chunks(CHUNK) {
        translate_to(chunk, &mut buf[..chunk.len()], table);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

/// Translate a writable mmap. Smaller inputs go through a separate output
/// buffer, which avoids dirtying the mapped pages; above the threshold the
/// in-place path avoids allocating a second input-sized buffer.
pub fn translate_mmap_inplace(
    set1: &[u8],
    set2: &[u8],
    data: &mut [u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return writer.write_all(data);
    }

    const SEPARATE_BUF_THRESHOLD: usize = 64 * 1024 * 1024;

    if data.len() < SEPARATE_BUF_THRESHOLD {
        return translate_to_separate_buf(data, &table, writer);
    }

    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        if data.len() >= PARALLEL_THRESHOLD {
            let n_threads = rayon::current_num_threads().max(1);
            let chunk_size = (data.len() / n_threads).max(32 * 1024);
            data.par_chunks_mut(chunk_size)
                .for_each(|chunk| translate_range_simd_inplace(chunk, lo, hi, offset));
        } else {
            translate_range_simd_inplace(data, lo, hi, offset);
        }
        return writer.write_all(data);
    }

    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
        if data.len() >= PARALLEL_THRESHOLD {
            let n_threads = rayon::current_num_threads().max(1);
            let chunk_size = (data.len() / n_threads).max(32 * 1024);
            data.par_chunks_mut(chunk_size).for_each(|chunk| {
                translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement)
            });
        } else {
            translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
        }
        return writer.write_all(data);
    }

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);
        data.par_chunks_mut(chunk_size)
            .for_each(|chunk| translate_inplace(chunk, &table));
    } else {
        translate_inplace(data, &table);
    }
    writer.write_all(data)
}

/// Translate `data` into a fresh output buffer and write it, reusing the
/// range fast paths when the table allows them.
fn translate_to_separate_buf(
    data: &[u8],
    table: &[u8; 256],
    writer: &mut impl Write,
) -> io::Result<()> {
    let range_info = detect_range_offset(table);
    let const_info = if range_info.is_none() {
        detect_range_to_constant(table)
    } else {
        None
    };

    if data.len() >= PARALLEL_THRESHOLD {
        let mut out_buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        if let Some((lo, hi, offset)) = range_info {
            data.par_chunks(chunk_size)
                .zip(out_buf.par_chunks_mut(chunk_size))
                .for_each(|(src, dst)| {
                    translate_range_simd(src, &mut dst[..src.len()], lo, hi, offset);
                });
        } else if let Some((lo, hi, replacement)) = const_info {
            data.par_chunks(chunk_size)
                .zip(out_buf.par_chunks_mut(chunk_size))
                .for_each(|(src, dst)| {
                    translate_range_to_constant_simd(
                        src,
                        &mut dst[..src.len()],
                        lo,
                        hi,
                        replacement,
                    );
                });
        } else {
            data.par_chunks(chunk_size)
                .zip(out_buf.par_chunks_mut(chunk_size))
                .for_each(|(src, dst)| {
                    translate_to(src, &mut dst[..src.len()], table);
                });
        }
        return writer.write_all(&out_buf);
    }

    // Single-threaded: one pass into the output buffer.
    let mut out_buf = alloc_uninit_vec(data.len());
    if let Some((lo, hi, offset)) = range_info {
        translate_range_simd(data, &mut out_buf, lo, hi, offset);
    } else if let Some((lo, hi, replacement)) = const_info {
        translate_range_to_constant_simd(data, &mut out_buf, lo, hi, replacement);
    } else {
        translate_to(data, &mut out_buf, table);
    }
    writer.write_all(&out_buf)
}

/// Read-only mmap variant: always translates through a separate buffer.
pub fn translate_mmap_readonly(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return writer.write_all(data);
    }

    translate_to_separate_buf(data, &table, writer)
}

/// Translate + squeeze over an mmap'd input. The translate pass parallelizes;
/// the squeeze pass stays sequential because run state crosses chunk bounds.
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    if data.len() >= PARALLEL_THRESHOLD {
        // Translate in parallel into a separate buffer.
        let mut translated = alloc_uninit_vec(data.len());
        let range_info = detect_range_offset(&table);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        if let Some((lo, hi, offset)) = range_info {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_range_simd(
                        src_chunk,
                        &mut dst_chunk[..src_chunk.len()],
                        lo,
                        hi,
                        offset,
                    );
                });
        } else {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
                });
        }

        // Sequential squeeze over the translated buffer.
        let mut last_squeezed: u16 = 256;
        let len = translated.len();
        let mut wp = 0;
        unsafe {
            let ptr = translated.as_mut_ptr();
            let mut i = 0;
            while i < len {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        return writer.write_all(&translated[..wp]);
    }

    // Small input: translate once, then squeeze in place.
    let mut buf = alloc_uninit_vec(data.len());
    translate_to(data, &mut buf, &table);
    let mut last_squeezed: u16 = 256;
    let mut wp = 0;
    unsafe {
        let ptr = buf.as_mut_ptr();
        for i in 0..data.len() {
            let b = *ptr.add(i);
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            *ptr.add(wp) = b;
            wp += 1;
        }
    }
    writer.write_all(&buf[..wp])
}

/// Delete over an mmap'd input: samples the first KiB to estimate deletion
/// density, then picks zero-copy gathering, parallel compaction, or a
/// buffered sequential pass.
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if delete_chars.is_empty() {
        return writer.write_all(data);
    }
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_memchr_mmap(delete_chars, data, writer);
    }

    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_mmap(data, writer, lo, hi);
    }

    let member = build_member_set(delete_chars);

    // Estimate deletion density from a small prefix sample.
    let sample_size = data.len().min(1024);
    let sample_deletes = data[..sample_size]
        .iter()
        .filter(|&&b| is_member(&member, b))
        .count();
    let estimated_deletes = if sample_size > 0 {
        data.len() * sample_deletes / sample_size
    } else {
        data.len()
    };

    // Few deletions mean few surviving runs, so vectored writes of borrowed
    // slices beat copying the survivors.
    if estimated_deletes < MAX_IOV / 2 {
        return delete_bitset_zerocopy(data, &member, writer);
    }

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let mut outbuf = alloc_uninit_vec(data.len());
        let chunk_lens: Vec<usize> = data
            .par_chunks(chunk_size)
            .zip(outbuf.par_chunks_mut(chunk_size))
            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
            .collect();

        // Each chunk was compacted into its own region; gather the filled
        // prefixes with vectored writes.
        let slices: Vec<std::io::IoSlice> = chunk_lens
            .iter()
            .enumerate()
            .filter(|&(_, &len)| len > 0)
            .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
            .collect();
        return write_ioslices(writer, &slices);
    }

    // Sequential path: compact through a cache-sized buffer.
    const COMPACT_BUF: usize = 256 * 1024;
    let mut outbuf = alloc_uninit_vec(COMPACT_BUF);

    for chunk in data.chunks(COMPACT_BUF) {
        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
        if out_pos > 0 {
            writer.write_all(&outbuf[..out_pos])?;
        }
    }
    Ok(())
}
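
// Path sketch (not from the original suite): five non-contiguous bytes force
// the general bitset path, and the tiny input keeps the estimate below the
// zero-copy cutoff.
#[cfg(test)]
mod delete_mmap_sketch {
    use super::*;

    #[test]
    fn removes_members_of_the_set() {
        let mut out = Vec::new();
        delete_mmap(b"aeiou", b"education", &mut out).unwrap();
        assert_eq!(out, b"dctn");
    }
}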

/// Range delete over an mmap'd input: zero-copy gathering when deletions are
/// rare, otherwise parallel or buffered compaction.
fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
    // Estimate deletion density from a small prefix sample. An empty input
    // degenerates to "delete everything", which harmlessly skips zero-copy.
    let sample_size = data.len().min(1024);
    let sample_deletes = data[..sample_size]
        .iter()
        .filter(|&&b| b >= lo && b <= hi)
        .count();
    let estimated_deletes = if sample_size > 0 {
        data.len() * sample_deletes / sample_size
    } else {
        data.len()
    };
    if estimated_deletes < MAX_IOV / 2 {
        return delete_range_mmap_zerocopy(data, writer, lo, hi);
    }

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let mut outbuf = alloc_uninit_vec(data.len());
        let chunk_lens: Vec<usize> = data
            .par_chunks(chunk_size)
            .zip(outbuf.par_chunks_mut(chunk_size))
            .map(|(src_chunk, dst_chunk)| delete_range_chunk(src_chunk, dst_chunk, lo, hi))
            .collect();

        // Each chunk was compacted into its own region; gather the filled
        // prefixes with vectored writes.
        let slices: Vec<std::io::IoSlice> = chunk_lens
            .iter()
            .enumerate()
            .filter(|&(_, &len)| len > 0)
            .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
            .collect();
        return write_ioslices(writer, &slices);
    }

    // Sequential path: compact through a cache-sized buffer, flushing before
    // it can overflow.
    const COMPACT_BUF: usize = 256 * 1024;
    let mut outbuf = alloc_uninit_vec(COMPACT_BUF);

    #[cfg(target_arch = "x86_64")]
    {
        let mut wp = 0;
        let level = get_simd_level();
        let len = data.len();
        let sp = data.as_ptr();
        let dp = outbuf.as_mut_ptr();
        let mut ri = 0;

        if level >= 3 {
            use std::arch::x86_64::*;
            let range = hi - lo;
            let bias_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8) };
            let threshold_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8) };
            let zero = unsafe { _mm256_setzero_si256() };

            while ri + 32 <= len {
                // Flush before the next block could overflow the buffer.
                if wp + 32 > COMPACT_BUF {
                    writer.write_all(&outbuf[..wp])?;
                    wp = 0;
                }

                let input = unsafe { _mm256_loadu_si256(sp.add(ri) as *const _) };
                let biased = unsafe { _mm256_add_epi8(input, bias_v) };
                let gt = unsafe { _mm256_cmpgt_epi8(biased, threshold_v) };
                let in_range = unsafe { _mm256_cmpeq_epi8(gt, zero) };
                let keep_mask = !(unsafe { _mm256_movemask_epi8(in_range) } as u32);

                if keep_mask == 0xFFFFFFFF {
                    unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32) };
                    wp += 32;
                } else if keep_mask != 0 {
                    let m0 = keep_mask as u8;
                    let m1 = (keep_mask >> 8) as u8;
                    let m2 = (keep_mask >> 16) as u8;
                    let m3 = (keep_mask >> 24) as u8;

                    if m0 == 0xFF {
                        unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8) };
                    } else if m0 != 0 {
                        unsafe { compact_8bytes_simd(sp.add(ri), dp.add(wp), m0) };
                    }
                    let c0 = m0.count_ones() as usize;

                    if m1 == 0xFF {
                        unsafe {
                            std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8)
                        };
                    } else if m1 != 0 {
                        unsafe { compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1) };
                    }
                    let c1 = m1.count_ones() as usize;

                    if m2 == 0xFF {
                        unsafe {
                            std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8)
                        };
                    } else if m2 != 0 {
                        unsafe { compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2) };
                    }
                    let c2 = m2.count_ones() as usize;

                    if m3 == 0xFF {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                sp.add(ri + 24),
                                dp.add(wp + c0 + c1 + c2),
                                8,
                            )
                        };
                    } else if m3 != 0 {
                        unsafe {
                            compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3)
                        };
                    }
                    let c3 = m3.count_ones() as usize;
                    wp += c0 + c1 + c2 + c3;
                }
                ri += 32;
            }
        }

        // Scalar tail (and the whole input when AVX2 is unavailable).
        while ri < len {
            if wp + 1 > COMPACT_BUF {
                writer.write_all(&outbuf[..wp])?;
                wp = 0;
            }
            let b = unsafe { *sp.add(ri) };
            unsafe { *dp.add(wp) = b };
            wp += (b < lo || b > hi) as usize;
            ri += 1;
        }

        if wp > 0 {
            writer.write_all(&outbuf[..wp])?;
        }
        return Ok(());
    }

    #[cfg(not(target_arch = "x86_64"))]
    {
        for chunk in data.chunks(COMPACT_BUF) {
            let clen = delete_range_chunk(chunk, &mut outbuf, lo, hi);
            if clen > 0 {
                writer.write_all(&outbuf[..clen])?;
            }
        }
        return Ok(());
    }

    #[allow(unreachable_code)]
    Ok(())
}

/// Zero-copy range delete dispatcher: the variants below only build a list of
/// surviving runs; no byte of `data` is copied in user space.
fn delete_range_mmap_zerocopy(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    #[cfg(target_arch = "x86_64")]
    {
        if get_simd_level() >= 3 {
            return unsafe { delete_range_zerocopy_avx2(data, writer, lo, hi) };
        }
        if get_simd_level() >= 2 {
            return unsafe { delete_range_zerocopy_sse2(data, writer, lo, hi) };
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        return unsafe { delete_range_zerocopy_neon(data, writer, lo, hi) };
    }

    #[allow(unreachable_code)]
    delete_range_zerocopy_scalar(data, writer, lo, hi)
}

/// Scalar zero-copy fallback: close a run at each deleted byte and flush in
/// batches of MAX_IOV slices.
fn delete_range_zerocopy_scalar(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let len = data.len();
    let mut run_start: usize = 0;
    let mut i: usize = 0;

    while i < len {
        let b = unsafe { *data.get_unchecked(i) };
        if b >= lo && b <= hi {
            if i > run_start {
                iov.push(std::io::IoSlice::new(&data[run_start..i]));
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
            run_start = i + 1;
        }
        i += 1;
    }
    if run_start < len {
        iov.push(std::io::IoSlice::new(&data[run_start..]));
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}
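
// Run-gathering example (illustrative): deleting b'0'..=b'9' from b"ab12cd"
// yields two borrowed slices, b"ab" and b"cd"; write_ioslices hands both to
// one write_vectored call, so the survivors are never copied in user space.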

/// AVX2 zero-copy range delete: scans 32 bytes at a time and emits borrowed
/// IoSlices for the runs between deleted bytes.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_range_zerocopy_avx2(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    use std::arch::x86_64::*;

    unsafe {
        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
        let len = data.len();
        let mut run_start: usize = 0;
        let mut ri: usize = 0;

        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        while ri + 32 <= len {
            let input = _mm256_loadu_si256(data.as_ptr().add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let del_mask = _mm256_movemask_epi8(in_range) as u32;

            if del_mask == 0 {
                // Whole block survives: the current run keeps growing.
                ri += 32;
                continue;
            }

            // Close a run at each deleted byte.
            let mut m = del_mask;
            while m != 0 {
                let bit = m.trailing_zeros() as usize;
                let abs_pos = ri + bit;
                if abs_pos > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = abs_pos + 1;
                m &= m - 1; // clear the lowest set bit
            }

            ri += 32;
        }

        // Scalar tail.
        while ri < len {
            let b = *data.get_unchecked(ri);
            if b >= lo && b <= hi {
                if ri > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = ri + 1;
            }
            ri += 1;
        }

        if run_start < len {
            iov.push(std::io::IoSlice::new(&data[run_start..]));
        }
        if !iov.is_empty() {
            write_ioslices(writer, &iov)?;
        }
        Ok(())
    }
}
4505
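/// SSE2 range delete: identical structure to the AVX2 version, but on
/// 16-byte blocks.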
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn delete_range_zerocopy_sse2(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    use std::arch::x86_64::*;

    unsafe {
        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
        let len = data.len();
        let mut run_start: usize = 0;
        let mut ri: usize = 0;

        // Same biased signed-compare trick as the AVX2 version.
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm_setzero_si128();

        while ri + 16 <= len {
            let input = _mm_loadu_si128(data.as_ptr().add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            let del_mask = _mm_movemask_epi8(in_range) as u32 & 0xFFFF;

            if del_mask == 0 {
                ri += 16;
                continue;
            }

            let mut m = del_mask;
            while m != 0 {
                let bit = m.trailing_zeros() as usize;
                let abs_pos = ri + bit;
                if abs_pos > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = abs_pos + 1;
                m &= m - 1;
            }

            ri += 16;
        }

        // Scalar tail.
        while ri < len {
            let b = *data.get_unchecked(ri);
            if b >= lo && b <= hi {
                if ri > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = ri + 1;
            }
            ri += 1;
        }

        if run_start < len {
            iov.push(std::io::IoSlice::new(&data[run_start..]));
        }
        if !iov.is_empty() {
            write_ioslices(writer, &iov)?;
        }
        Ok(())
    }
}

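/// NEON range delete: 16-byte blocks, with the x86 `movemask` operation
/// emulated via a positional bit mask and pairwise additions.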
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn delete_range_zerocopy_neon(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    use std::arch::aarch64::*;

    unsafe {
        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
        let len = data.len();
        let mut run_start: usize = 0;
        let mut ri: usize = 0;

        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        // NEON has no movemask: AND each 0xFF/0x00 comparison lane with its
        // positional bit, then pairwise-add each 8-lane half down to one byte.
        let bit_mask: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
        let bit_mask_v = vld1q_u8(bit_mask.as_ptr());

        while ri + 16 <= len {
            let input = vld1q_u8(data.as_ptr().add(ri));
            let ge_lo = vcgeq_u8(input, lo_v);
            let le_hi = vcleq_u8(input, hi_v);
            let in_range = vandq_u8(ge_lo, le_hi);

            let bits = vandq_u8(in_range, bit_mask_v);
            let pair = vpaddlq_u8(bits);
            let quad = vpaddlq_u16(pair);
            let octet = vpaddlq_u32(quad);
            let mask_lo = vgetq_lane_u64::<0>(octet) as u8;
            let mask_hi = vgetq_lane_u64::<1>(octet) as u8;
            let del_mask = (mask_hi as u16) << 8 | mask_lo as u16;

            if del_mask == 0 {
                ri += 16;
                continue;
            }

            let mut m = del_mask;
            while m != 0 {
                let bit = m.trailing_zeros() as usize;
                let abs_pos = ri + bit;
                if abs_pos > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = abs_pos + 1;
                m &= m - 1;
            }

            ri += 16;
        }

        // Scalar tail.
        while ri < len {
            let b = *data.get_unchecked(ri);
            if b >= lo && b <= hi {
                if ri > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = ri + 1;
            }
            ri += 1;
        }

        if run_start < len {
            iov.push(std::io::IoSlice::new(&data[run_start..]));
        }
        if !iov.is_empty() {
            write_ioslices(writer, &iov)?;
        }
        Ok(())
    }
}

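/// Compacts `chunk` into `outbuf`, dropping every byte present in the
/// `member` bitset, and returns the number of bytes written. `outbuf` must
/// be at least `chunk.len()` bytes long.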
#[inline]
fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
    let len = chunk.len();
    let mut out_pos = 0;
    let mut i = 0;

    // Branchless compaction, unrolled 8x: unconditionally store each byte,
    // then advance the write cursor only if the byte is kept.
    while i + 8 <= len {
        unsafe {
            let b0 = *chunk.get_unchecked(i);
            let b1 = *chunk.get_unchecked(i + 1);
            let b2 = *chunk.get_unchecked(i + 2);
            let b3 = *chunk.get_unchecked(i + 3);
            let b4 = *chunk.get_unchecked(i + 4);
            let b5 = *chunk.get_unchecked(i + 5);
            let b6 = *chunk.get_unchecked(i + 6);
            let b7 = *chunk.get_unchecked(i + 7);

            *outbuf.get_unchecked_mut(out_pos) = b0;
            out_pos += !is_member(member, b0) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b1;
            out_pos += !is_member(member, b1) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b2;
            out_pos += !is_member(member, b2) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b3;
            out_pos += !is_member(member, b3) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b4;
            out_pos += !is_member(member, b4) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b5;
            out_pos += !is_member(member, b5) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b6;
            out_pos += !is_member(member, b6) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b7;
            out_pos += !is_member(member, b7) as usize;
        }
        i += 8;
    }

    while i < len {
        unsafe {
            let b = *chunk.get_unchecked(i);
            *outbuf.get_unchecked_mut(out_pos) = b;
            out_pos += !is_member(member, b) as usize;
        }
        i += 1;
    }

    out_pos
}

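/// Zero-copy delete against an arbitrary byte set: tracks the start of the
/// current kept run in an `Option` and flushes runs as vectored writes.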
fn delete_bitset_zerocopy(
    data: &[u8],
    member: &[u8; 32],
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let len = data.len();
    let mut i = 0;
    let mut run_start: Option<usize> = None;

    while i < len {
        let b = unsafe { *data.get_unchecked(i) };
        if is_member(member, b) {
            if let Some(rs) = run_start {
                iov.push(std::io::IoSlice::new(&data[rs..i]));
                run_start = None;
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
        } else if run_start.is_none() {
            run_start = Some(i);
        }
        i += 1;
    }
    if let Some(rs) = run_start {
        iov.push(std::io::IoSlice::new(&data[rs..]));
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}

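/// Single-character delete: `memchr` locates each occurrence, and the gaps
/// between occurrences are emitted as `IoSlice`s.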
fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut last = 0;
    for pos in memchr::memchr_iter(ch, data) {
        if pos > last {
            iov.push(std::io::IoSlice::new(&data[last..pos]));
            if iov.len() >= MAX_IOV {
                write_ioslices(writer, &iov)?;
                iov.clear();
            }
        }
        last = pos + 1;
    }
    if last < data.len() {
        iov.push(std::io::IoSlice::new(&data[last..]));
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}

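/// Delete for exactly two or three characters via the `memchr2`/`memchr3`
/// iterators. Callers must pass at least two characters: unused slots
/// default to NUL and would otherwise be matched.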
fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let c0 = chars[0];
    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
    let is_three = chars.len() >= 3;

    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut last = 0;

    macro_rules! process_pos {
        ($pos:expr) => {
            if $pos > last {
                iov.push(std::io::IoSlice::new(&data[last..$pos]));
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
            last = $pos + 1;
        };
    }

    if is_three {
        for pos in memchr::memchr3_iter(c0, c1, c2, data) {
            process_pos!(pos);
        }
    } else {
        for pos in memchr::memchr2_iter(c0, c1, data) {
            process_pos!(pos);
        }
    }
    if last < data.len() {
        iov.push(std::io::IoSlice::new(&data[last..]));
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}

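/// Combined delete + squeeze over mmap'd input in a single pass: deleted
/// bytes are dropped first, then repeats of squeeze-set bytes are collapsed
/// among the survivors.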
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);

    let mut outbuf = alloc_uninit_vec(data.len());
    // 256 is an out-of-band sentinel: no u8 widened to u16 can equal it.
    let mut last_squeezed: u16 = 256;
    let mut out_pos = 0;

    for &b in data.iter() {
        if is_member(&delete_set, b) {
            continue;
        }
        if is_member(&squeeze_set, b) {
            if last_squeezed == b as u16 {
                continue;
            }
            last_squeezed = b as u16;
        } else {
            last_squeezed = 256;
        }
        unsafe {
            *outbuf.get_unchecked_mut(out_pos) = b;
        }
        out_pos += 1;
    }
    writer.write_all(&outbuf[..out_pos])
}

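/// Squeeze repeated squeeze-set bytes out of mmap'd input. Sets of one to
/// three characters take memchr-based fast paths; large inputs are squeezed
/// per chunk in parallel with a seam fix-up; everything else uses a
/// sequential bitset scan.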
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    let member = build_member_set(squeeze_chars);

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        // Repair chunk seams: if the nearest preceding non-empty chunk ended
        // with a squeezable byte, drop any leading repeat of that byte from
        // this chunk before writing.
        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let mut outbuf = alloc_uninit_vec(data.len());
    let len = data.len();
    let mut wp = 0;
    let mut i = 0;
    let mut last_squeezed: u16 = 256;

    unsafe {
        let inp = data.as_ptr();
        let outp = outbuf.as_mut_ptr();

        while i < len {
            let b = *inp.add(i);
            if is_member(&member, b) {
                if last_squeezed != b as u16 {
                    *outp.add(wp) = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                i += 1;
                // Skip the remainder of the run in one tight loop.
                while i < len && *inp.add(i) == b {
                    i += 1;
                }
            } else {
                last_squeezed = 256;
                *outp.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
    }
    writer.write_all(&outbuf[..wp])
}

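/// Squeezes a single chunk into a freshly allocated buffer; used by the
/// parallel paths, which repair the seams between chunks afterwards.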
fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
    let len = chunk.len();
    let mut out = Vec::with_capacity(len);
    let mut last_squeezed: u16 = 256;
    let mut i = 0;

    unsafe {
        // Output can never exceed the input length; write through a raw
        // pointer and fix up the final length afterwards.
        out.set_len(len);
        let inp = chunk.as_ptr();
        let outp: *mut u8 = out.as_mut_ptr();
        let mut wp = 0;

        while i < len {
            let b = *inp.add(i);
            if is_member(member, b) {
                if last_squeezed != b as u16 {
                    *outp.add(wp) = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                i += 1;
                while i < len && *inp.add(i) == b {
                    i += 1;
                }
            } else {
                last_squeezed = 256;
                *outp.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        out.set_len(wp);
    }
    out
}

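/// Squeeze for exactly `N` (2 or 3) characters: memchr2/memchr3 jump
/// between runs so untouched stretches are emitted zero-copy; large inputs
/// take the same parallel path as `squeeze_mmap`.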
fn squeeze_multi_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let member = build_member_set(chars);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        // Same seam repair as the parallel path in squeeze_mmap.
        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(1024);
    let mut cursor = 0;
    let mut last_squeezed: u16 = 256;

    macro_rules! find_next {
        ($data:expr) => {
            if N == 2 {
                memchr::memchr2(chars[0], chars[1], $data)
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], $data)
            }
        };
    }

    while cursor < data.len() {
        match find_next!(&data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let b = data[pos];
                if pos > cursor {
                    iov.push(std::io::IoSlice::new(&data[cursor..pos]));
                    last_squeezed = 256;
                }
                if last_squeezed != b as u16 {
                    // Keep exactly one byte of the run.
                    iov.push(std::io::IoSlice::new(&data[pos..pos + 1]));
                    last_squeezed = b as u16;
                }
                // Skip the remainder of the run.
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == b {
                    skip += 1;
                }
                cursor = skip;
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
            None => {
                if cursor < data.len() {
                    iov.push(std::io::IoSlice::new(&data[cursor..]));
                }
                break;
            }
        }
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}

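/// Single-character squeeze: a run needs collapsing only if the byte
/// appears doubled, so `memmem` searches for the two-byte pattern and
/// everything between matches is written verbatim.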
fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    // Fast pre-check: if the doubled byte never occurs, pass the input
    // through untouched.
    let pair = [ch, ch];
    if memchr::memmem::find(data, &pair).is_none() {
        return writer.write_all(data);
    }

    let finder = memchr::memmem::Finder::new(&pair);
    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(2048);
    let mut cursor = 0;

    while cursor < data.len() {
        match finder.find(&data[cursor..]) {
            Some(offset) => {
                let pair_pos = cursor + offset;
                // Keep everything up to and including the first byte of the pair.
                let seg_end = pair_pos + 1;
                if seg_end > cursor {
                    iov.push(std::io::IoSlice::new(&data[cursor..seg_end]));
                }
                // Skip the rest of the run.
                let mut skip = seg_end;
                while skip < data.len() && data[skip] == ch {
                    skip += 1;
                }
                cursor = skip;
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
            None => {
                if cursor < data.len() {
                    iov.push(std::io::IoSlice::new(&data[cursor..]));
                }
                break;
            }
        }
    }

    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}