1use std::io::{self, Read, Write};
2
3use rayon::prelude::*;
4
5const MAX_IOV: usize = 1024;
8
9const STREAM_BUF: usize = 4 * 1024 * 1024;
14
15const PARALLEL_THRESHOLD: usize = 64 * 1024 * 1024;
21
22#[cfg(target_arch = "x86_64")]
28static COMPACT_LUT: [[u8; 8]; 256] = {
29 let mut lut = [[0u8; 8]; 256];
30 let mut mask: u16 = 0;
31 while mask < 256 {
32 let mut idx: usize = 0;
33 let mut bit: u8 = 0;
34 while bit < 8 {
35 if (mask >> bit) & 1 != 0 {
36 lut[mask as usize][idx] = bit;
37 idx += 1;
38 }
39 bit += 1;
40 }
41 mask += 1;
42 }
43 lut
44};
45
46#[inline]
49fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
50 if slices.is_empty() {
51 return Ok(());
52 }
53 for batch in slices.chunks(MAX_IOV) {
54 let total: usize = batch.iter().map(|s| s.len()).sum();
55 match writer.write_vectored(batch) {
56 Ok(n) if n >= total => continue,
57 Ok(mut written) => {
58 for slice in batch {
60 let slen = slice.len();
61 if written >= slen {
62 written -= slen;
63 continue;
64 }
65 if written > 0 {
66 writer.write_all(&slice[written..])?;
67 written = 0;
68 } else {
69 writer.write_all(slice)?;
70 }
71 }
72 }
73 Err(e) => return Err(e),
74 }
75 }
76 Ok(())
77}
78
79#[inline]
83#[allow(clippy::uninit_vec)]
84fn alloc_uninit_vec(len: usize) -> Vec<u8> {
85 let mut v = Vec::with_capacity(len);
86 unsafe {
88 v.set_len(len);
89 }
90 #[cfg(target_os = "linux")]
91 if len >= 2 * 1024 * 1024 {
92 unsafe {
93 libc::madvise(
94 v.as_mut_ptr() as *mut libc::c_void,
95 len,
96 libc::MADV_HUGEPAGE,
97 );
98 }
99 }
100 v
101}
102
103#[inline]
105fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
106 let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
107 let last = set2.last().copied();
108 for (i, &from) in set1.iter().enumerate() {
109 table[from as usize] = if i < set2.len() {
110 set2[i]
111 } else {
112 last.unwrap_or(from)
113 };
114 }
115 table
116}
117
118#[inline]
120fn build_member_set(chars: &[u8]) -> [u8; 32] {
121 let mut set = [0u8; 32];
122 for &ch in chars {
123 set[ch as usize >> 3] |= 1 << (ch & 7);
124 }
125 set
126}
127
128#[inline(always)]
129fn is_member(set: &[u8; 32], ch: u8) -> bool {
130 unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
131}
132
133#[cfg(target_arch = "x86_64")]
136static SIMD_LEVEL: std::sync::atomic::AtomicU8 = std::sync::atomic::AtomicU8::new(0);
137
138#[cfg(target_arch = "x86_64")]
139#[inline(always)]
140fn get_simd_level() -> u8 {
141 let level = SIMD_LEVEL.load(std::sync::atomic::Ordering::Relaxed);
142 if level != 0 {
143 return level;
144 }
145 let detected = if is_x86_feature_detected!("avx2") {
146 3
147 } else if is_x86_feature_detected!("ssse3") {
148 2
149 } else {
150 1
151 };
152 SIMD_LEVEL.store(detected, std::sync::atomic::Ordering::Relaxed);
153 detected
154}
155
156#[cfg(target_arch = "x86_64")]
158#[inline]
159fn count_non_identity(table: &[u8; 256]) -> usize {
160 table
161 .iter()
162 .enumerate()
163 .filter(|&(i, &v)| v != i as u8)
164 .count()
165}
166
167#[inline(always)]
173fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
174 #[cfg(target_arch = "x86_64")]
175 {
176 let level = get_simd_level();
177 if level >= 3 {
178 let non_id = count_non_identity(table);
183 if non_id > 0 && non_id <= 16 {
184 unsafe { translate_inplace_avx2_sparse(data, table) };
185 return;
186 }
187 unsafe { translate_inplace_avx2_table(data, table) };
188 return;
189 }
190 if level >= 2 {
191 unsafe { translate_inplace_ssse3_table(data, table) };
192 return;
193 }
194 }
195 translate_inplace_scalar(data, table);
196}
197
198#[cfg(target_arch = "x86_64")]
204#[target_feature(enable = "avx2")]
205unsafe fn translate_inplace_avx2_sparse(data: &mut [u8], table: &[u8; 256]) {
206 use std::arch::x86_64::*;
207
208 unsafe {
209 let len = data.len();
210 let ptr = data.as_mut_ptr();
211
212 let mut lut = [_mm256_setzero_si256(); 16];
214 for h in 0u8..16 {
215 let base = (h as usize) * 16;
216 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
217 let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
218 lut[h as usize] = _mm256_broadcastsi128_si256(row128);
219 }
220
221 let lo_mask = _mm256_set1_epi8(0x0F);
222
223 let mut i = 0;
224 while i + 32 <= len {
225 let input = _mm256_loadu_si256(ptr.add(i) as *const _);
226 let lo_nibble = _mm256_and_si256(input, lo_mask);
227 let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
228
229 let mut result = _mm256_setzero_si256();
230 macro_rules! do_nibble {
231 ($h:expr) => {
232 let h_val = _mm256_set1_epi8($h as i8);
233 let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
234 let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
235 result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
236 };
237 }
238 do_nibble!(0);
239 do_nibble!(1);
240 do_nibble!(2);
241 do_nibble!(3);
242 do_nibble!(4);
243 do_nibble!(5);
244 do_nibble!(6);
245 do_nibble!(7);
246 do_nibble!(8);
247 do_nibble!(9);
248 do_nibble!(10);
249 do_nibble!(11);
250 do_nibble!(12);
251 do_nibble!(13);
252 do_nibble!(14);
253 do_nibble!(15);
254
255 let diff = _mm256_xor_si256(input, result);
257 if _mm256_testz_si256(diff, diff) == 0 {
258 _mm256_storeu_si256(ptr.add(i) as *mut _, result);
259 }
260 i += 32;
261 }
262
263 while i < len {
265 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
266 i += 1;
267 }
268 }
269}
270
271#[cfg(not(target_arch = "aarch64"))]
273#[inline(always)]
274fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
275 let len = data.len();
276 let ptr = data.as_mut_ptr();
277 let mut i = 0;
278 unsafe {
279 while i + 8 <= len {
280 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
281 *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
282 *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
283 *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
284 *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
285 *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
286 *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
287 *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
288 i += 8;
289 }
290 while i < len {
291 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
292 i += 1;
293 }
294 }
295}
296
297#[cfg(target_arch = "aarch64")]
300#[inline(always)]
301fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
302 unsafe { translate_inplace_neon_table(data, table) };
303}
304
305#[cfg(target_arch = "aarch64")]
306#[target_feature(enable = "neon")]
307unsafe fn translate_inplace_neon_table(data: &mut [u8], table: &[u8; 256]) {
308 use std::arch::aarch64::*;
309
310 unsafe {
311 let len = data.len();
312 let ptr = data.as_mut_ptr();
313
314 let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
316 for h in 0u8..16 {
317 let base = (h as usize) * 16;
318 lut[h as usize] = vld1q_u8(table.as_ptr().add(base));
319 }
320
321 let lo_mask = vdupq_n_u8(0x0F);
322 let mut i = 0;
323
324 while i + 16 <= len {
325 let input = vld1q_u8(ptr.add(i));
326 let lo_nibble = vandq_u8(input, lo_mask);
327 let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);
328
329 let mut result = vdupq_n_u8(0);
330 macro_rules! do_nibble {
331 ($h:expr) => {
332 let h_val = vdupq_n_u8($h);
333 let mask = vceqq_u8(hi_nibble, h_val);
334 let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
335 result = vorrq_u8(result, vandq_u8(mask, looked_up));
336 };
337 }
338 do_nibble!(0);
339 do_nibble!(1);
340 do_nibble!(2);
341 do_nibble!(3);
342 do_nibble!(4);
343 do_nibble!(5);
344 do_nibble!(6);
345 do_nibble!(7);
346 do_nibble!(8);
347 do_nibble!(9);
348 do_nibble!(10);
349 do_nibble!(11);
350 do_nibble!(12);
351 do_nibble!(13);
352 do_nibble!(14);
353 do_nibble!(15);
354
355 vst1q_u8(ptr.add(i), result);
356 i += 16;
357 }
358
359 while i < len {
361 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
362 i += 1;
363 }
364 }
365}
366
367#[cfg(target_arch = "x86_64")]
387#[target_feature(enable = "avx2")]
388unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
389 use std::arch::x86_64::*;
390
391 unsafe {
392 let len = data.len();
393 let ptr = data.as_mut_ptr();
394
395 let mut lut = [_mm256_setzero_si256(); 16];
399 for h in 0u8..16 {
400 let base = (h as usize) * 16;
401 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
402 let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
404 lut[h as usize] = _mm256_broadcastsi128_si256(row128);
405 }
406
407 let lo_mask = _mm256_set1_epi8(0x0F);
408
409 let mut i = 0;
410
411 while i + 64 <= len {
415 let input0 = _mm256_loadu_si256(ptr.add(i) as *const _);
416 let input1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
417
418 let lo0 = _mm256_and_si256(input0, lo_mask);
419 let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
420 let lo1 = _mm256_and_si256(input1, lo_mask);
421 let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
422
423 let mut r0 = _mm256_setzero_si256();
424 let mut r1 = _mm256_setzero_si256();
425
426 macro_rules! do_nibble2 {
427 ($h:expr) => {
428 let h_val = _mm256_set1_epi8($h as i8);
429 let m0 = _mm256_cmpeq_epi8(hi0, h_val);
430 let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
431 r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
432 let m1 = _mm256_cmpeq_epi8(hi1, h_val);
433 let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
434 r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
435 };
436 }
437 do_nibble2!(0);
438 do_nibble2!(1);
439 do_nibble2!(2);
440 do_nibble2!(3);
441 do_nibble2!(4);
442 do_nibble2!(5);
443 do_nibble2!(6);
444 do_nibble2!(7);
445 do_nibble2!(8);
446 do_nibble2!(9);
447 do_nibble2!(10);
448 do_nibble2!(11);
449 do_nibble2!(12);
450 do_nibble2!(13);
451 do_nibble2!(14);
452 do_nibble2!(15);
453
454 _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
455 _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
456 i += 64;
457 }
458
459 if i + 32 <= len {
461 let input = _mm256_loadu_si256(ptr.add(i) as *const _);
462 let lo_nibble = _mm256_and_si256(input, lo_mask);
463 let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
464
465 let mut result = _mm256_setzero_si256();
466
467 macro_rules! do_nibble {
468 ($h:expr) => {
469 let h_val = _mm256_set1_epi8($h as i8);
470 let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
471 let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
472 result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
473 };
474 }
475 do_nibble!(0);
476 do_nibble!(1);
477 do_nibble!(2);
478 do_nibble!(3);
479 do_nibble!(4);
480 do_nibble!(5);
481 do_nibble!(6);
482 do_nibble!(7);
483 do_nibble!(8);
484 do_nibble!(9);
485 do_nibble!(10);
486 do_nibble!(11);
487 do_nibble!(12);
488 do_nibble!(13);
489 do_nibble!(14);
490 do_nibble!(15);
491
492 _mm256_storeu_si256(ptr.add(i) as *mut _, result);
493 i += 32;
494 }
495
496 if i + 16 <= len {
498 let lo_mask128 = _mm_set1_epi8(0x0F);
499
500 let mut lut128 = [_mm_setzero_si128(); 16];
501 for h in 0u8..16 {
502 lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
503 }
504
505 let input = _mm_loadu_si128(ptr.add(i) as *const _);
506 let lo_nib = _mm_and_si128(input, lo_mask128);
507 let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
508
509 let mut res = _mm_setzero_si128();
510 macro_rules! do_nibble128 {
511 ($h:expr) => {
512 let h_val = _mm_set1_epi8($h as i8);
513 let mask = _mm_cmpeq_epi8(hi_nib, h_val);
514 let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
515 res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
516 };
517 }
518 do_nibble128!(0);
519 do_nibble128!(1);
520 do_nibble128!(2);
521 do_nibble128!(3);
522 do_nibble128!(4);
523 do_nibble128!(5);
524 do_nibble128!(6);
525 do_nibble128!(7);
526 do_nibble128!(8);
527 do_nibble128!(9);
528 do_nibble128!(10);
529 do_nibble128!(11);
530 do_nibble128!(12);
531 do_nibble128!(13);
532 do_nibble128!(14);
533 do_nibble128!(15);
534
535 _mm_storeu_si128(ptr.add(i) as *mut _, res);
536 i += 16;
537 }
538
539 while i < len {
541 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
542 i += 1;
543 }
544 }
545}
546
547#[cfg(target_arch = "x86_64")]
548#[target_feature(enable = "ssse3")]
549unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
550 use std::arch::x86_64::*;
551
552 unsafe {
553 let len = data.len();
554 let ptr = data.as_mut_ptr();
555
556 let mut lut = [_mm_setzero_si128(); 16];
558 for h in 0u8..16 {
559 let base = (h as usize) * 16;
560 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
561 lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
562 }
563
564 let lo_mask = _mm_set1_epi8(0x0F);
565
566 let mut i = 0;
567 while i + 16 <= len {
568 let input = _mm_loadu_si128(ptr.add(i) as *const _);
569 let lo_nibble = _mm_and_si128(input, lo_mask);
570 let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
571
572 let mut result = _mm_setzero_si128();
573
574 macro_rules! do_nibble {
575 ($h:expr) => {
576 let h_val = _mm_set1_epi8($h as i8);
577 let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
578 let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
579 result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
580 };
581 }
582 do_nibble!(0);
583 do_nibble!(1);
584 do_nibble!(2);
585 do_nibble!(3);
586 do_nibble!(4);
587 do_nibble!(5);
588 do_nibble!(6);
589 do_nibble!(7);
590 do_nibble!(8);
591 do_nibble!(9);
592 do_nibble!(10);
593 do_nibble!(11);
594 do_nibble!(12);
595 do_nibble!(13);
596 do_nibble!(14);
597 do_nibble!(15);
598
599 _mm_storeu_si128(ptr.add(i) as *mut _, result);
600 i += 16;
601 }
602
603 while i < len {
605 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
606 i += 1;
607 }
608 }
609}
610
611#[inline(always)]
614fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
615 debug_assert!(dst.len() >= src.len());
616 #[cfg(target_arch = "x86_64")]
617 {
618 let level = get_simd_level();
619 if level >= 3 {
620 if dst.as_ptr() as usize & 31 == 0 {
622 unsafe { translate_to_avx2_table_nt(src, dst, table) };
623 } else {
624 unsafe { translate_to_avx2_table(src, dst, table) };
625 }
626 return;
627 }
628 if level >= 2 {
629 unsafe { translate_to_ssse3_table(src, dst, table) };
630 return;
631 }
632 }
633 translate_to_scalar(src, dst, table);
634}
635
636#[cfg(not(target_arch = "aarch64"))]
638#[inline(always)]
639fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
640 unsafe {
641 let sp = src.as_ptr();
642 let dp = dst.as_mut_ptr();
643 let len = src.len();
644 let mut i = 0;
645 while i + 8 <= len {
646 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
647 *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
648 *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
649 *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
650 *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
651 *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
652 *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
653 *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
654 i += 8;
655 }
656 while i < len {
657 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
658 i += 1;
659 }
660 }
661}
662
663#[cfg(target_arch = "aarch64")]
665#[inline(always)]
666fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
667 unsafe { translate_to_neon_table(src, dst, table) };
668}
669
670#[cfg(target_arch = "aarch64")]
671#[target_feature(enable = "neon")]
672unsafe fn translate_to_neon_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
673 use std::arch::aarch64::*;
674
675 unsafe {
676 let len = src.len();
677 let sp = src.as_ptr();
678 let dp = dst.as_mut_ptr();
679
680 let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
681 for h in 0u8..16 {
682 lut[h as usize] = vld1q_u8(table.as_ptr().add((h as usize) * 16));
683 }
684
685 let lo_mask = vdupq_n_u8(0x0F);
686 let mut i = 0;
687
688 while i + 16 <= len {
689 let input = vld1q_u8(sp.add(i));
690 let lo_nibble = vandq_u8(input, lo_mask);
691 let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);
692
693 let mut result = vdupq_n_u8(0);
694 macro_rules! do_nibble {
695 ($h:expr) => {
696 let h_val = vdupq_n_u8($h);
697 let mask = vceqq_u8(hi_nibble, h_val);
698 let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
699 result = vorrq_u8(result, vandq_u8(mask, looked_up));
700 };
701 }
702 do_nibble!(0);
703 do_nibble!(1);
704 do_nibble!(2);
705 do_nibble!(3);
706 do_nibble!(4);
707 do_nibble!(5);
708 do_nibble!(6);
709 do_nibble!(7);
710 do_nibble!(8);
711 do_nibble!(9);
712 do_nibble!(10);
713 do_nibble!(11);
714 do_nibble!(12);
715 do_nibble!(13);
716 do_nibble!(14);
717 do_nibble!(15);
718
719 vst1q_u8(dp.add(i), result);
720 i += 16;
721 }
722
723 while i < len {
724 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
725 i += 1;
726 }
727 }
728}
729
730#[cfg(target_arch = "x86_64")]
731#[target_feature(enable = "avx2")]
732unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
733 use std::arch::x86_64::*;
734
735 unsafe {
736 let len = src.len();
737 let sp = src.as_ptr();
738 let dp = dst.as_mut_ptr();
739
740 let mut lut = [_mm256_setzero_si256(); 16];
742 for h in 0u8..16 {
743 let base = (h as usize) * 16;
744 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
745 let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
746 lut[h as usize] = _mm256_broadcastsi128_si256(row128);
747 }
748
749 let lo_mask = _mm256_set1_epi8(0x0F);
750
751 let mut i = 0;
752
753 while i + 64 <= len {
755 let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
756 let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
757
758 let lo0 = _mm256_and_si256(input0, lo_mask);
759 let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
760 let lo1 = _mm256_and_si256(input1, lo_mask);
761 let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
762
763 let mut r0 = _mm256_setzero_si256();
764 let mut r1 = _mm256_setzero_si256();
765
766 macro_rules! do_nibble2 {
767 ($h:expr) => {
768 let h_val = _mm256_set1_epi8($h as i8);
769 let m0 = _mm256_cmpeq_epi8(hi0, h_val);
770 let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
771 r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
772 let m1 = _mm256_cmpeq_epi8(hi1, h_val);
773 let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
774 r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
775 };
776 }
777 do_nibble2!(0);
778 do_nibble2!(1);
779 do_nibble2!(2);
780 do_nibble2!(3);
781 do_nibble2!(4);
782 do_nibble2!(5);
783 do_nibble2!(6);
784 do_nibble2!(7);
785 do_nibble2!(8);
786 do_nibble2!(9);
787 do_nibble2!(10);
788 do_nibble2!(11);
789 do_nibble2!(12);
790 do_nibble2!(13);
791 do_nibble2!(14);
792 do_nibble2!(15);
793
794 _mm256_storeu_si256(dp.add(i) as *mut _, r0);
795 _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
796 i += 64;
797 }
798
799 if i + 32 <= len {
801 let input = _mm256_loadu_si256(sp.add(i) as *const _);
802 let lo_nibble = _mm256_and_si256(input, lo_mask);
803 let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
804
805 let mut result = _mm256_setzero_si256();
806
807 macro_rules! do_nibble {
808 ($h:expr) => {
809 let h_val = _mm256_set1_epi8($h as i8);
810 let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
811 let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
812 result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
813 };
814 }
815 do_nibble!(0);
816 do_nibble!(1);
817 do_nibble!(2);
818 do_nibble!(3);
819 do_nibble!(4);
820 do_nibble!(5);
821 do_nibble!(6);
822 do_nibble!(7);
823 do_nibble!(8);
824 do_nibble!(9);
825 do_nibble!(10);
826 do_nibble!(11);
827 do_nibble!(12);
828 do_nibble!(13);
829 do_nibble!(14);
830 do_nibble!(15);
831
832 _mm256_storeu_si256(dp.add(i) as *mut _, result);
833 i += 32;
834 }
835
836 if i + 16 <= len {
838 let lo_mask128 = _mm_set1_epi8(0x0F);
839 let mut lut128 = [_mm_setzero_si128(); 16];
840 for h in 0u8..16 {
841 lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
842 }
843
844 let input = _mm_loadu_si128(sp.add(i) as *const _);
845 let lo_nib = _mm_and_si128(input, lo_mask128);
846 let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
847
848 let mut res = _mm_setzero_si128();
849 macro_rules! do_nibble128 {
850 ($h:expr) => {
851 let h_val = _mm_set1_epi8($h as i8);
852 let mask = _mm_cmpeq_epi8(hi_nib, h_val);
853 let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
854 res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
855 };
856 }
857 do_nibble128!(0);
858 do_nibble128!(1);
859 do_nibble128!(2);
860 do_nibble128!(3);
861 do_nibble128!(4);
862 do_nibble128!(5);
863 do_nibble128!(6);
864 do_nibble128!(7);
865 do_nibble128!(8);
866 do_nibble128!(9);
867 do_nibble128!(10);
868 do_nibble128!(11);
869 do_nibble128!(12);
870 do_nibble128!(13);
871 do_nibble128!(14);
872 do_nibble128!(15);
873
874 _mm_storeu_si128(dp.add(i) as *mut _, res);
875 i += 16;
876 }
877
878 while i < len {
880 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
881 i += 1;
882 }
883 }
884}
885
886#[cfg(target_arch = "x86_64")]
889#[target_feature(enable = "avx2")]
890unsafe fn translate_to_avx2_table_nt(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
891 use std::arch::x86_64::*;
892
893 unsafe {
894 let len = src.len();
895 let sp = src.as_ptr();
896 let dp = dst.as_mut_ptr();
897
898 let mut lut = [_mm256_setzero_si256(); 16];
900 for h in 0u8..16 {
901 let base = (h as usize) * 16;
902 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
903 let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
904 lut[h as usize] = _mm256_broadcastsi128_si256(row128);
905 }
906
907 let lo_mask = _mm256_set1_epi8(0x0F);
908 let mut i = 0;
909
910 while i + 64 <= len {
912 let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
913 let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
914
915 let lo0 = _mm256_and_si256(input0, lo_mask);
916 let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
917 let lo1 = _mm256_and_si256(input1, lo_mask);
918 let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
919
920 let mut r0 = _mm256_setzero_si256();
921 let mut r1 = _mm256_setzero_si256();
922
923 macro_rules! do_nibble2 {
924 ($h:expr) => {
925 let h_val = _mm256_set1_epi8($h as i8);
926 let m0 = _mm256_cmpeq_epi8(hi0, h_val);
927 let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
928 r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
929 let m1 = _mm256_cmpeq_epi8(hi1, h_val);
930 let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
931 r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
932 };
933 }
934 do_nibble2!(0);
935 do_nibble2!(1);
936 do_nibble2!(2);
937 do_nibble2!(3);
938 do_nibble2!(4);
939 do_nibble2!(5);
940 do_nibble2!(6);
941 do_nibble2!(7);
942 do_nibble2!(8);
943 do_nibble2!(9);
944 do_nibble2!(10);
945 do_nibble2!(11);
946 do_nibble2!(12);
947 do_nibble2!(13);
948 do_nibble2!(14);
949 do_nibble2!(15);
950
951 _mm256_stream_si256(dp.add(i) as *mut _, r0);
952 _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
953 i += 64;
954 }
955
956 if i + 32 <= len {
958 let input = _mm256_loadu_si256(sp.add(i) as *const _);
959 let lo_nibble = _mm256_and_si256(input, lo_mask);
960 let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
961
962 let mut result = _mm256_setzero_si256();
963 macro_rules! do_nibble {
964 ($h:expr) => {
965 let h_val = _mm256_set1_epi8($h as i8);
966 let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
967 let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
968 result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
969 };
970 }
971 do_nibble!(0);
972 do_nibble!(1);
973 do_nibble!(2);
974 do_nibble!(3);
975 do_nibble!(4);
976 do_nibble!(5);
977 do_nibble!(6);
978 do_nibble!(7);
979 do_nibble!(8);
980 do_nibble!(9);
981 do_nibble!(10);
982 do_nibble!(11);
983 do_nibble!(12);
984 do_nibble!(13);
985 do_nibble!(14);
986 do_nibble!(15);
987
988 _mm256_stream_si256(dp.add(i) as *mut _, result);
989 i += 32;
990 }
991
992 if i + 16 <= len {
994 let lo_mask128 = _mm_set1_epi8(0x0F);
995 let mut lut128 = [_mm_setzero_si128(); 16];
996 for h in 0u8..16 {
997 lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
998 }
999
1000 let input = _mm_loadu_si128(sp.add(i) as *const _);
1001 let lo_nib = _mm_and_si128(input, lo_mask128);
1002 let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
1003
1004 let mut res = _mm_setzero_si128();
1005 macro_rules! do_nibble128 {
1006 ($h:expr) => {
1007 let h_val = _mm_set1_epi8($h as i8);
1008 let mask = _mm_cmpeq_epi8(hi_nib, h_val);
1009 let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
1010 res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
1011 };
1012 }
1013 do_nibble128!(0);
1014 do_nibble128!(1);
1015 do_nibble128!(2);
1016 do_nibble128!(3);
1017 do_nibble128!(4);
1018 do_nibble128!(5);
1019 do_nibble128!(6);
1020 do_nibble128!(7);
1021 do_nibble128!(8);
1022 do_nibble128!(9);
1023 do_nibble128!(10);
1024 do_nibble128!(11);
1025 do_nibble128!(12);
1026 do_nibble128!(13);
1027 do_nibble128!(14);
1028 do_nibble128!(15);
1029
1030 _mm_storeu_si128(dp.add(i) as *mut _, res);
1031 i += 16;
1032 }
1033
1034 while i < len {
1036 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
1037 i += 1;
1038 }
1039
1040 _mm_sfence();
1042 }
1043}
1044
1045#[cfg(target_arch = "x86_64")]
1046#[target_feature(enable = "ssse3")]
1047unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
1048 use std::arch::x86_64::*;
1049
1050 unsafe {
1051 let len = src.len();
1052 let sp = src.as_ptr();
1053 let dp = dst.as_mut_ptr();
1054
1055 let mut lut = [_mm_setzero_si128(); 16];
1056 for h in 0u8..16 {
1057 let base = (h as usize) * 16;
1058 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
1059 lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
1060 }
1061
1062 let lo_mask = _mm_set1_epi8(0x0F);
1063
1064 let mut i = 0;
1065 while i + 16 <= len {
1066 let input = _mm_loadu_si128(sp.add(i) as *const _);
1067 let lo_nibble = _mm_and_si128(input, lo_mask);
1068 let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
1069
1070 let mut result = _mm_setzero_si128();
1071
1072 macro_rules! do_nibble {
1073 ($h:expr) => {
1074 let h_val = _mm_set1_epi8($h as i8);
1075 let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
1076 let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
1077 result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
1078 };
1079 }
1080 do_nibble!(0);
1081 do_nibble!(1);
1082 do_nibble!(2);
1083 do_nibble!(3);
1084 do_nibble!(4);
1085 do_nibble!(5);
1086 do_nibble!(6);
1087 do_nibble!(7);
1088 do_nibble!(8);
1089 do_nibble!(9);
1090 do_nibble!(10);
1091 do_nibble!(11);
1092 do_nibble!(12);
1093 do_nibble!(13);
1094 do_nibble!(14);
1095 do_nibble!(15);
1096
1097 _mm_storeu_si128(dp.add(i) as *mut _, result);
1098 i += 16;
1099 }
1100
1101 while i < len {
1103 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
1104 i += 1;
1105 }
1106 }
1107}
1108
1109#[inline]
1117fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
1118 let mut lo: Option<u8> = None;
1119 let mut hi = 0u8;
1120 let mut offset = 0i16;
1121
1122 for i in 0..256 {
1123 if table[i] != i as u8 {
1124 let diff = table[i] as i16 - i as i16;
1125 match lo {
1126 None => {
1127 lo = Some(i as u8);
1128 hi = i as u8;
1129 offset = diff;
1130 }
1131 Some(_) => {
1132 if diff != offset || i as u8 != hi.wrapping_add(1) {
1133 return None;
1134 }
1135 hi = i as u8;
1136 }
1137 }
1138 }
1139 }
1140
1141 lo.map(|l| (l, hi, offset as i8))
1142}
1143
1144#[inline]
1149fn detect_range_to_constant(table: &[u8; 256]) -> Option<(u8, u8, u8)> {
1150 let mut lo: Option<u8> = None;
1151 let mut hi = 0u8;
1152 let mut replacement = 0u8;
1153
1154 for i in 0..256 {
1155 if table[i] != i as u8 {
1156 match lo {
1157 None => {
1158 lo = Some(i as u8);
1159 hi = i as u8;
1160 replacement = table[i];
1161 }
1162 Some(_) => {
1163 if table[i] != replacement || i as u8 != hi.wrapping_add(1) {
1164 return None;
1165 }
1166 hi = i as u8;
1167 }
1168 }
1169 }
1170 }
1171
1172 lo.map(|l| (l, hi, replacement))
1173}
1174
1175#[cfg(target_arch = "x86_64")]
1180fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1181 if get_simd_level() >= 3 {
1182 unsafe { translate_range_to_constant_avx2_inplace(data, lo, hi, replacement) };
1183 } else {
1184 unsafe { translate_range_to_constant_sse2_inplace(data, lo, hi, replacement) };
1185 }
1186}
1187
1188#[cfg(target_arch = "x86_64")]
1189#[target_feature(enable = "avx2")]
1190unsafe fn translate_range_to_constant_avx2_inplace(
1191 data: &mut [u8],
1192 lo: u8,
1193 hi: u8,
1194 replacement: u8,
1195) {
1196 use std::arch::x86_64::*;
1197
1198 unsafe {
1199 let range = hi - lo;
1200 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1201 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1202 let repl_v = _mm256_set1_epi8(replacement as i8);
1203 let zero = _mm256_setzero_si256();
1204
1205 let len = data.len();
1206 let ptr = data.as_mut_ptr();
1207 let mut i = 0;
1208
1209 while i + 64 <= len {
1211 let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
1212 let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
1213 let bi0 = _mm256_add_epi8(in0, bias_v);
1214 let bi1 = _mm256_add_epi8(in1, bias_v);
1215 let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1216 let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1217 let ir0 = _mm256_cmpeq_epi8(gt0, zero);
1218 let ir1 = _mm256_cmpeq_epi8(gt1, zero);
1219 let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
1220 let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
1221 _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
1222 _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
1223 i += 64;
1224 }
1225
1226 if i + 32 <= len {
1228 let input = _mm256_loadu_si256(ptr.add(i) as *const _);
1229 let biased = _mm256_add_epi8(input, bias_v);
1230 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1231 let in_range = _mm256_cmpeq_epi8(gt, zero);
1232 let result = _mm256_blendv_epi8(input, repl_v, in_range);
1233 _mm256_storeu_si256(ptr.add(i) as *mut _, result);
1234 i += 32;
1235 }
1236
1237 if i + 16 <= len {
1238 let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1239 let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1240 let repl_v128 = _mm_set1_epi8(replacement as i8);
1241 let zero128 = _mm_setzero_si128();
1242
1243 let input = _mm_loadu_si128(ptr.add(i) as *const _);
1244 let biased = _mm_add_epi8(input, bias_v128);
1245 let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1246 let in_range = _mm_cmpeq_epi8(gt, zero128);
1247 let result = _mm_blendv_epi8(input, repl_v128, in_range);
1248 _mm_storeu_si128(ptr.add(i) as *mut _, result);
1249 i += 16;
1250 }
1251
1252 while i < len {
1253 let b = *ptr.add(i);
1254 *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1255 i += 1;
1256 }
1257 }
1258}
1259
1260#[cfg(target_arch = "x86_64")]
1261#[target_feature(enable = "sse2")]
1262unsafe fn translate_range_to_constant_sse2_inplace(
1263 data: &mut [u8],
1264 lo: u8,
1265 hi: u8,
1266 replacement: u8,
1267) {
1268 use std::arch::x86_64::*;
1269
1270 unsafe {
1271 let range = hi - lo;
1272 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1273 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1274 let repl_v = _mm_set1_epi8(replacement as i8);
1275 let zero = _mm_setzero_si128();
1276
1277 let len = data.len();
1278 let ptr = data.as_mut_ptr();
1279 let mut i = 0;
1280
1281 while i + 16 <= len {
1282 let input = _mm_loadu_si128(ptr.add(i) as *const _);
1283 let biased = _mm_add_epi8(input, bias_v);
1284 let gt = _mm_cmpgt_epi8(biased, threshold_v);
1285 let in_range = _mm_cmpeq_epi8(gt, zero);
1287 let result = _mm_or_si128(
1289 _mm_and_si128(in_range, repl_v),
1290 _mm_andnot_si128(in_range, input),
1291 );
1292 _mm_storeu_si128(ptr.add(i) as *mut _, result);
1293 i += 16;
1294 }
1295
1296 while i < len {
1297 let b = *ptr.add(i);
1298 *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1299 i += 1;
1300 }
1301 }
1302}
1303
1304#[cfg(target_arch = "aarch64")]
1305fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1306 unsafe { translate_range_to_constant_neon_inplace(data, lo, hi, replacement) };
1307}
1308
1309#[cfg(target_arch = "aarch64")]
1310#[target_feature(enable = "neon")]
1311unsafe fn translate_range_to_constant_neon_inplace(
1312 data: &mut [u8],
1313 lo: u8,
1314 hi: u8,
1315 replacement: u8,
1316) {
1317 use std::arch::aarch64::*;
1318
1319 unsafe {
1320 let len = data.len();
1321 let ptr = data.as_mut_ptr();
1322 let lo_v = vdupq_n_u8(lo);
1323 let hi_v = vdupq_n_u8(hi);
1324 let repl_v = vdupq_n_u8(replacement);
1325 let mut i = 0;
1326
1327 while i + 32 <= len {
1328 let in0 = vld1q_u8(ptr.add(i));
1329 let in1 = vld1q_u8(ptr.add(i + 16));
1330 let ge0 = vcgeq_u8(in0, lo_v);
1331 let le0 = vcleq_u8(in0, hi_v);
1332 let mask0 = vandq_u8(ge0, le0);
1333 let ge1 = vcgeq_u8(in1, lo_v);
1334 let le1 = vcleq_u8(in1, hi_v);
1335 let mask1 = vandq_u8(ge1, le1);
1336 vst1q_u8(ptr.add(i), vbslq_u8(mask0, repl_v, in0));
1338 vst1q_u8(ptr.add(i + 16), vbslq_u8(mask1, repl_v, in1));
1339 i += 32;
1340 }
1341
1342 if i + 16 <= len {
1343 let input = vld1q_u8(ptr.add(i));
1344 let ge = vcgeq_u8(input, lo_v);
1345 let le = vcleq_u8(input, hi_v);
1346 let mask = vandq_u8(ge, le);
1347 vst1q_u8(ptr.add(i), vbslq_u8(mask, repl_v, input));
1348 i += 16;
1349 }
1350
1351 while i < len {
1352 let b = *ptr.add(i);
1353 *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1354 i += 1;
1355 }
1356 }
1357}
1358
1359#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1360fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1361 for b in data.iter_mut() {
1362 if *b >= lo && *b <= hi {
1363 *b = replacement;
1364 }
1365 }
1366}
1367
1368#[cfg(target_arch = "x86_64")]
1371fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1372 if get_simd_level() >= 3 {
1373 unsafe { translate_range_to_constant_avx2(src, dst, lo, hi, replacement) };
1374 } else {
1375 unsafe { translate_range_to_constant_sse2(src, dst, lo, hi, replacement) };
1376 }
1377}
1378
1379#[cfg(target_arch = "aarch64")]
1380fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1381 unsafe { translate_range_to_constant_neon(src, dst, lo, hi, replacement) };
1382}
1383
1384#[cfg(target_arch = "aarch64")]
1385#[target_feature(enable = "neon")]
1386unsafe fn translate_range_to_constant_neon(
1387 src: &[u8],
1388 dst: &mut [u8],
1389 lo: u8,
1390 hi: u8,
1391 replacement: u8,
1392) {
1393 use std::arch::aarch64::*;
1394
1395 unsafe {
1396 let len = src.len();
1397 let sp = src.as_ptr();
1398 let dp = dst.as_mut_ptr();
1399 let lo_v = vdupq_n_u8(lo);
1400 let hi_v = vdupq_n_u8(hi);
1401 let repl_v = vdupq_n_u8(replacement);
1402 let mut i = 0;
1403
1404 while i + 32 <= len {
1405 let in0 = vld1q_u8(sp.add(i));
1406 let in1 = vld1q_u8(sp.add(i + 16));
1407 let mask0 = vandq_u8(vcgeq_u8(in0, lo_v), vcleq_u8(in0, hi_v));
1408 let mask1 = vandq_u8(vcgeq_u8(in1, lo_v), vcleq_u8(in1, hi_v));
1409 vst1q_u8(dp.add(i), vbslq_u8(mask0, repl_v, in0));
1410 vst1q_u8(dp.add(i + 16), vbslq_u8(mask1, repl_v, in1));
1411 i += 32;
1412 }
1413
1414 if i + 16 <= len {
1415 let input = vld1q_u8(sp.add(i));
1416 let mask = vandq_u8(vcgeq_u8(input, lo_v), vcleq_u8(input, hi_v));
1417 vst1q_u8(dp.add(i), vbslq_u8(mask, repl_v, input));
1418 i += 16;
1419 }
1420
1421 while i < len {
1422 let b = *sp.add(i);
1423 *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1424 i += 1;
1425 }
1426 }
1427}
1428
1429#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1430fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1431 for (i, &b) in src.iter().enumerate() {
1432 unsafe {
1433 *dst.get_unchecked_mut(i) = if b >= lo && b <= hi { replacement } else { b };
1434 }
1435 }
1436}
1437
1438#[cfg(target_arch = "x86_64")]
1439#[target_feature(enable = "avx2")]
1440unsafe fn translate_range_to_constant_avx2(
1441 src: &[u8],
1442 dst: &mut [u8],
1443 lo: u8,
1444 hi: u8,
1445 replacement: u8,
1446) {
1447 use std::arch::x86_64::*;
1448 unsafe {
1449 let range = hi - lo;
1450 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1451 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1452 let repl_v = _mm256_set1_epi8(replacement as i8);
1453 let zero = _mm256_setzero_si256();
1454 let len = src.len();
1455 let sp = src.as_ptr();
1456 let dp = dst.as_mut_ptr();
1457 let mut i = 0;
1458 while i + 64 <= len {
1459 let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1460 let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1461 let bi0 = _mm256_add_epi8(in0, bias_v);
1462 let bi1 = _mm256_add_epi8(in1, bias_v);
1463 let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1464 let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1465 let ir0 = _mm256_cmpeq_epi8(gt0, zero);
1466 let ir1 = _mm256_cmpeq_epi8(gt1, zero);
1467 let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
1468 let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
1469 _mm256_storeu_si256(dp.add(i) as *mut _, r0);
1470 _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
1471 i += 64;
1472 }
1473 if i + 32 <= len {
1474 let input = _mm256_loadu_si256(sp.add(i) as *const _);
1475 let biased = _mm256_add_epi8(input, bias_v);
1476 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1477 let in_range = _mm256_cmpeq_epi8(gt, zero);
1478 let result = _mm256_blendv_epi8(input, repl_v, in_range);
1479 _mm256_storeu_si256(dp.add(i) as *mut _, result);
1480 i += 32;
1481 }
1482 while i < len {
1483 let b = *sp.add(i);
1484 *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1485 i += 1;
1486 }
1487 }
1488}
1489
1490#[cfg(target_arch = "x86_64")]
1491#[target_feature(enable = "sse2")]
1492unsafe fn translate_range_to_constant_sse2(
1493 src: &[u8],
1494 dst: &mut [u8],
1495 lo: u8,
1496 hi: u8,
1497 replacement: u8,
1498) {
1499 use std::arch::x86_64::*;
1500 unsafe {
1501 let range = hi - lo;
1502 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1503 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1504 let repl_v = _mm_set1_epi8(replacement as i8);
1505 let zero = _mm_setzero_si128();
1506 let len = src.len();
1507 let sp = src.as_ptr();
1508 let dp = dst.as_mut_ptr();
1509 let mut i = 0;
1510 while i + 16 <= len {
1511 let input = _mm_loadu_si128(sp.add(i) as *const _);
1512 let biased = _mm_add_epi8(input, bias_v);
1513 let gt = _mm_cmpgt_epi8(biased, threshold_v);
1514 let in_range = _mm_cmpeq_epi8(gt, zero);
1515 let result = _mm_or_si128(
1516 _mm_and_si128(in_range, repl_v),
1517 _mm_andnot_si128(in_range, input),
1518 );
1519 _mm_storeu_si128(dp.add(i) as *mut _, result);
1520 i += 16;
1521 }
1522 while i < len {
1523 let b = *sp.add(i);
1524 *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1525 i += 1;
1526 }
1527 }
1528}
1529
1530#[cfg(target_arch = "x86_64")]
1537fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1538 if get_simd_level() >= 3 {
1539 if dst.as_ptr() as usize & 31 == 0 {
1541 unsafe { translate_range_avx2_nt(src, dst, lo, hi, offset) };
1542 } else {
1543 unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
1544 }
1545 } else {
1546 unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
1547 }
1548}
1549
1550#[cfg(target_arch = "x86_64")]
1551#[target_feature(enable = "avx2")]
1552unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1553 use std::arch::x86_64::*;
1554
1555 unsafe {
1556 let range = hi - lo;
1557 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1562 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1563 let offset_v = _mm256_set1_epi8(offset);
1564 let zero = _mm256_setzero_si256();
1565
1566 let len = src.len();
1567 let sp = src.as_ptr();
1568 let dp = dst.as_mut_ptr();
1569 let mut i = 0;
1570
1571 while i + 64 <= len {
1574 let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1575 let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1576 let bi0 = _mm256_add_epi8(in0, bias_v);
1577 let bi1 = _mm256_add_epi8(in1, bias_v);
1578 let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1579 let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1580 let m0 = _mm256_cmpeq_epi8(gt0, zero);
1581 let m1 = _mm256_cmpeq_epi8(gt1, zero);
1582 let om0 = _mm256_and_si256(m0, offset_v);
1583 let om1 = _mm256_and_si256(m1, offset_v);
1584 let r0 = _mm256_add_epi8(in0, om0);
1585 let r1 = _mm256_add_epi8(in1, om1);
1586 _mm256_storeu_si256(dp.add(i) as *mut _, r0);
1587 _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
1588 i += 64;
1589 }
1590
1591 if i + 32 <= len {
1593 let input = _mm256_loadu_si256(sp.add(i) as *const _);
1594 let biased = _mm256_add_epi8(input, bias_v);
1595 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1596 let mask = _mm256_cmpeq_epi8(gt, zero);
1597 let offset_masked = _mm256_and_si256(mask, offset_v);
1598 let result = _mm256_add_epi8(input, offset_masked);
1599 _mm256_storeu_si256(dp.add(i) as *mut _, result);
1600 i += 32;
1601 }
1602
1603 if i + 16 <= len {
1605 let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1606 let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1607 let offset_v128 = _mm_set1_epi8(offset);
1608 let zero128 = _mm_setzero_si128();
1609
1610 let input = _mm_loadu_si128(sp.add(i) as *const _);
1611 let biased = _mm_add_epi8(input, bias_v128);
1612 let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1613 let mask = _mm_cmpeq_epi8(gt, zero128);
1614 let offset_masked = _mm_and_si128(mask, offset_v128);
1615 let result = _mm_add_epi8(input, offset_masked);
1616 _mm_storeu_si128(dp.add(i) as *mut _, result);
1617 i += 16;
1618 }
1619
1620 while i < len {
1622 let b = *sp.add(i);
1623 *dp.add(i) = if b >= lo && b <= hi {
1624 b.wrapping_add(offset as u8)
1625 } else {
1626 b
1627 };
1628 i += 1;
1629 }
1630 }
1631}
1632
1633#[cfg(target_arch = "x86_64")]
1639#[target_feature(enable = "avx2")]
1640unsafe fn translate_range_avx2_nt(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1641 use std::arch::x86_64::*;
1642
1643 unsafe {
1644 let range = hi - lo;
1645 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1646 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1647 let offset_v = _mm256_set1_epi8(offset);
1648 let zero = _mm256_setzero_si256();
1649
1650 let len = src.len();
1651 let sp = src.as_ptr();
1652 let dp = dst.as_mut_ptr();
1653 let mut i = 0;
1654
1655 while i + 64 <= len {
1657 let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1658 let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1659 let bi0 = _mm256_add_epi8(in0, bias_v);
1660 let bi1 = _mm256_add_epi8(in1, bias_v);
1661 let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1662 let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1663 let m0 = _mm256_cmpeq_epi8(gt0, zero);
1664 let m1 = _mm256_cmpeq_epi8(gt1, zero);
1665 let om0 = _mm256_and_si256(m0, offset_v);
1666 let om1 = _mm256_and_si256(m1, offset_v);
1667 let r0 = _mm256_add_epi8(in0, om0);
1668 let r1 = _mm256_add_epi8(in1, om1);
1669 _mm256_stream_si256(dp.add(i) as *mut _, r0);
1670 _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
1671 i += 64;
1672 }
1673
1674 if i + 32 <= len {
1676 let input = _mm256_loadu_si256(sp.add(i) as *const _);
1677 let biased = _mm256_add_epi8(input, bias_v);
1678 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1679 let mask = _mm256_cmpeq_epi8(gt, zero);
1680 let offset_masked = _mm256_and_si256(mask, offset_v);
1681 let result = _mm256_add_epi8(input, offset_masked);
1682 _mm256_stream_si256(dp.add(i) as *mut _, result);
1683 i += 32;
1684 }
1685
1686 if i + 16 <= len {
1688 let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1689 let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1690 let offset_v128 = _mm_set1_epi8(offset);
1691 let zero128 = _mm_setzero_si128();
1692
1693 let input = _mm_loadu_si128(sp.add(i) as *const _);
1694 let biased = _mm_add_epi8(input, bias_v128);
1695 let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1696 let mask = _mm_cmpeq_epi8(gt, zero128);
1697 let offset_masked = _mm_and_si128(mask, offset_v128);
1698 let result = _mm_add_epi8(input, offset_masked);
1699 _mm_storeu_si128(dp.add(i) as *mut _, result);
1700 i += 16;
1701 }
1702
1703 while i < len {
1705 let b = *sp.add(i);
1706 *dp.add(i) = if b >= lo && b <= hi {
1707 b.wrapping_add(offset as u8)
1708 } else {
1709 b
1710 };
1711 i += 1;
1712 }
1713
1714 _mm_sfence();
1716 }
1717}
1718
1719#[cfg(target_arch = "x86_64")]
1720#[target_feature(enable = "sse2")]
1721unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1722 use std::arch::x86_64::*;
1723
1724 unsafe {
1725 let range = hi - lo;
1726 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1727 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1728 let offset_v = _mm_set1_epi8(offset);
1729 let zero = _mm_setzero_si128();
1730
1731 let len = src.len();
1732 let mut i = 0;
1733
1734 while i + 16 <= len {
1735 let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
1736 let biased = _mm_add_epi8(input, bias_v);
1737 let gt = _mm_cmpgt_epi8(biased, threshold_v);
1738 let mask = _mm_cmpeq_epi8(gt, zero);
1739 let offset_masked = _mm_and_si128(mask, offset_v);
1740 let result = _mm_add_epi8(input, offset_masked);
1741 _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
1742 i += 16;
1743 }
1744
1745 while i < len {
1746 let b = *src.get_unchecked(i);
1747 *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
1748 b.wrapping_add(offset as u8)
1749 } else {
1750 b
1751 };
1752 i += 1;
1753 }
1754 }
1755}
1756
1757#[cfg(target_arch = "aarch64")]
1760fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1761 unsafe { translate_range_neon(src, dst, lo, hi, offset) };
1762}
1763
1764#[cfg(target_arch = "aarch64")]
1765#[target_feature(enable = "neon")]
1766unsafe fn translate_range_neon(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1767 use std::arch::aarch64::*;
1768
1769 unsafe {
1770 let len = src.len();
1771 let sp = src.as_ptr();
1772 let dp = dst.as_mut_ptr();
1773 let lo_v = vdupq_n_u8(lo);
1774 let hi_v = vdupq_n_u8(hi);
1775 let offset_v = vdupq_n_s8(offset);
1776 let mut i = 0;
1777
1778 while i + 32 <= len {
1780 let in0 = vld1q_u8(sp.add(i));
1781 let in1 = vld1q_u8(sp.add(i + 16));
1782 let ge0 = vcgeq_u8(in0, lo_v);
1784 let le0 = vcleq_u8(in0, hi_v);
1785 let mask0 = vandq_u8(ge0, le0);
1786 let ge1 = vcgeq_u8(in1, lo_v);
1787 let le1 = vcleq_u8(in1, hi_v);
1788 let mask1 = vandq_u8(ge1, le1);
1789 let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
1791 let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
1792 let r0 = vaddq_u8(in0, off0);
1793 let r1 = vaddq_u8(in1, off1);
1794 vst1q_u8(dp.add(i), r0);
1795 vst1q_u8(dp.add(i + 16), r1);
1796 i += 32;
1797 }
1798
1799 if i + 16 <= len {
1800 let input = vld1q_u8(sp.add(i));
1801 let ge = vcgeq_u8(input, lo_v);
1802 let le = vcleq_u8(input, hi_v);
1803 let mask = vandq_u8(ge, le);
1804 let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
1805 vst1q_u8(dp.add(i), vaddq_u8(input, off));
1806 i += 16;
1807 }
1808
1809 while i < len {
1810 let b = *sp.add(i);
1811 *dp.add(i) = if b >= lo && b <= hi {
1812 b.wrapping_add(offset as u8)
1813 } else {
1814 b
1815 };
1816 i += 1;
1817 }
1818 }
1819}
1820
1821#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1823fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1824 let offset_u8 = offset as u8;
1825 let range = hi.wrapping_sub(lo);
1826 unsafe {
1827 let sp = src.as_ptr();
1828 let dp = dst.as_mut_ptr();
1829 let len = src.len();
1830 let mut i = 0;
1831 while i + 8 <= len {
1832 macro_rules! do_byte {
1833 ($off:expr) => {{
1834 let b = *sp.add(i + $off);
1835 let in_range = b.wrapping_sub(lo) <= range;
1836 *dp.add(i + $off) = if in_range {
1837 b.wrapping_add(offset_u8)
1838 } else {
1839 b
1840 };
1841 }};
1842 }
1843 do_byte!(0);
1844 do_byte!(1);
1845 do_byte!(2);
1846 do_byte!(3);
1847 do_byte!(4);
1848 do_byte!(5);
1849 do_byte!(6);
1850 do_byte!(7);
1851 i += 8;
1852 }
1853 while i < len {
1854 let b = *sp.add(i);
1855 let in_range = b.wrapping_sub(lo) <= range;
1856 *dp.add(i) = if in_range {
1857 b.wrapping_add(offset_u8)
1858 } else {
1859 b
1860 };
1861 i += 1;
1862 }
1863 }
1864}
1865
1866#[cfg(target_arch = "x86_64")]
1874fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1875 if get_simd_level() >= 3 {
1876 unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
1877 } else {
1878 unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
1879 }
1880}
1881
1882#[cfg(target_arch = "x86_64")]
1883#[target_feature(enable = "avx2")]
1884unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1885 use std::arch::x86_64::*;
1886
1887 unsafe {
1888 let range = hi - lo;
1889 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1890 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1891 let offset_v = _mm256_set1_epi8(offset);
1892 let zero = _mm256_setzero_si256();
1893
1894 let len = data.len();
1895 let ptr = data.as_mut_ptr();
1896 let mut i = 0;
1897
1898 while i + 64 <= len {
1900 let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
1901 let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
1902 let bi0 = _mm256_add_epi8(in0, bias_v);
1903 let bi1 = _mm256_add_epi8(in1, bias_v);
1904 let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1905 let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1906 let m0 = _mm256_cmpeq_epi8(gt0, zero);
1907 let m1 = _mm256_cmpeq_epi8(gt1, zero);
1908 let om0 = _mm256_and_si256(m0, offset_v);
1909 let om1 = _mm256_and_si256(m1, offset_v);
1910 let r0 = _mm256_add_epi8(in0, om0);
1911 let r1 = _mm256_add_epi8(in1, om1);
1912 _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
1913 _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
1914 i += 64;
1915 }
1916
1917 if i + 32 <= len {
1919 let input = _mm256_loadu_si256(ptr.add(i) as *const _);
1920 let biased = _mm256_add_epi8(input, bias_v);
1921 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1922 let mask = _mm256_cmpeq_epi8(gt, zero);
1923 let offset_masked = _mm256_and_si256(mask, offset_v);
1924 let result = _mm256_add_epi8(input, offset_masked);
1925 _mm256_storeu_si256(ptr.add(i) as *mut _, result);
1926 i += 32;
1927 }
1928
1929 if i + 16 <= len {
1930 let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1931 let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1932 let offset_v128 = _mm_set1_epi8(offset);
1933 let zero128 = _mm_setzero_si128();
1934
1935 let input = _mm_loadu_si128(ptr.add(i) as *const _);
1936 let biased = _mm_add_epi8(input, bias_v128);
1937 let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1938 let mask = _mm_cmpeq_epi8(gt, zero128);
1939 let offset_masked = _mm_and_si128(mask, offset_v128);
1940 let result = _mm_add_epi8(input, offset_masked);
1941 _mm_storeu_si128(ptr.add(i) as *mut _, result);
1942 i += 16;
1943 }
1944
1945 while i < len {
1946 let b = *ptr.add(i);
1947 *ptr.add(i) = if b >= lo && b <= hi {
1948 b.wrapping_add(offset as u8)
1949 } else {
1950 b
1951 };
1952 i += 1;
1953 }
1954 }
1955}
1956
1957#[cfg(target_arch = "x86_64")]
1958#[target_feature(enable = "sse2")]
1959unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1960 use std::arch::x86_64::*;
1961
1962 unsafe {
1963 let range = hi - lo;
1964 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1965 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1966 let offset_v = _mm_set1_epi8(offset);
1967 let zero = _mm_setzero_si128();
1968
1969 let len = data.len();
1970 let ptr = data.as_mut_ptr();
1971 let mut i = 0;
1972
1973 while i + 16 <= len {
1974 let input = _mm_loadu_si128(ptr.add(i) as *const _);
1975 let biased = _mm_add_epi8(input, bias_v);
1976 let gt = _mm_cmpgt_epi8(biased, threshold_v);
1977 let mask = _mm_cmpeq_epi8(gt, zero);
1978 let offset_masked = _mm_and_si128(mask, offset_v);
1979 let result = _mm_add_epi8(input, offset_masked);
1980 _mm_storeu_si128(ptr.add(i) as *mut _, result);
1981 i += 16;
1982 }
1983
1984 while i < len {
1985 let b = *ptr.add(i);
1986 *ptr.add(i) = if b >= lo && b <= hi {
1987 b.wrapping_add(offset as u8)
1988 } else {
1989 b
1990 };
1991 i += 1;
1992 }
1993 }
1994}
1995
1996#[cfg(target_arch = "aarch64")]
1997fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1998 unsafe { translate_range_neon_inplace(data, lo, hi, offset) };
1999}
2000
2001#[cfg(target_arch = "aarch64")]
2002#[target_feature(enable = "neon")]
2003unsafe fn translate_range_neon_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2004 use std::arch::aarch64::*;
2005
2006 unsafe {
2007 let len = data.len();
2008 let ptr = data.as_mut_ptr();
2009 let lo_v = vdupq_n_u8(lo);
2010 let hi_v = vdupq_n_u8(hi);
2011 let offset_v = vdupq_n_s8(offset);
2012 let mut i = 0;
2013
2014 while i + 32 <= len {
2015 let in0 = vld1q_u8(ptr.add(i));
2016 let in1 = vld1q_u8(ptr.add(i + 16));
2017 let ge0 = vcgeq_u8(in0, lo_v);
2018 let le0 = vcleq_u8(in0, hi_v);
2019 let mask0 = vandq_u8(ge0, le0);
2020 let ge1 = vcgeq_u8(in1, lo_v);
2021 let le1 = vcleq_u8(in1, hi_v);
2022 let mask1 = vandq_u8(ge1, le1);
2023 let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
2024 let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
2025 vst1q_u8(ptr.add(i), vaddq_u8(in0, off0));
2026 vst1q_u8(ptr.add(i + 16), vaddq_u8(in1, off1));
2027 i += 32;
2028 }
2029
2030 if i + 16 <= len {
2031 let input = vld1q_u8(ptr.add(i));
2032 let ge = vcgeq_u8(input, lo_v);
2033 let le = vcleq_u8(input, hi_v);
2034 let mask = vandq_u8(ge, le);
2035 let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
2036 vst1q_u8(ptr.add(i), vaddq_u8(input, off));
2037 i += 16;
2038 }
2039
2040 while i < len {
2041 let b = *ptr.add(i);
2042 if b >= lo && b <= hi {
2043 *ptr.add(i) = b.wrapping_add(offset as u8);
2044 }
2045 i += 1;
2046 }
2047 }
2048}
2049
2050#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
2051fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2052 let offset_u8 = offset as u8;
2053 let range = hi.wrapping_sub(lo);
2054 for b in data.iter_mut() {
2055 if b.wrapping_sub(lo) <= range {
2056 *b = b.wrapping_add(offset_u8);
2057 }
2058 }
2059}
2060
2061#[inline]
2071fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
2072 if chars.is_empty() {
2073 return None;
2074 }
2075 let mut lo = chars[0];
2076 let mut hi = chars[0];
2077 for &c in &chars[1..] {
2078 if c < lo {
2079 lo = c;
2080 }
2081 if c > hi {
2082 hi = c;
2083 }
2084 }
2085 if (hi as usize - lo as usize + 1) == chars.len() {
2088 Some((lo, hi))
2089 } else {
2090 None
2091 }
2092}
2093
2094#[cfg(target_arch = "x86_64")]
2098fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2099 if get_simd_level() >= 3 {
2100 unsafe { delete_range_avx2(src, dst, lo, hi) }
2101 } else {
2102 unsafe { delete_range_sse2(src, dst, lo, hi) }
2103 }
2104}
2105
2106#[cfg(target_arch = "x86_64")]
2107#[target_feature(enable = "avx2")]
2108unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2109 use std::arch::x86_64::*;
2110
2111 unsafe {
2112 let range = hi - lo;
2113 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2114 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2115 let zero = _mm256_setzero_si256();
2116
2117 let len = src.len();
2118 let sp = src.as_ptr();
2119 let dp = dst.as_mut_ptr();
2120 let mut ri = 0;
2121 let mut wp = 0;
2122
2123 while ri + 32 <= len {
2124 let input = _mm256_loadu_si256(sp.add(ri) as *const _);
2125 let biased = _mm256_add_epi8(input, bias_v);
2126 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2128 let in_range = _mm256_cmpeq_epi8(gt, zero);
2130 let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);
2132
2133 if keep_mask == 0xFFFFFFFF {
2134 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
2136 wp += 32;
2137 } else if keep_mask != 0 {
2138 let m0 = keep_mask as u8;
2143 let m1 = (keep_mask >> 8) as u8;
2144 let m2 = (keep_mask >> 16) as u8;
2145 let m3 = (keep_mask >> 24) as u8;
2146
2147 if m0 == 0xFF {
2148 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2149 } else if m0 != 0 {
2150 compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
2151 }
2152 let c0 = m0.count_ones() as usize;
2153
2154 if m1 == 0xFF {
2155 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2156 } else if m1 != 0 {
2157 compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
2158 }
2159 let c1 = m1.count_ones() as usize;
2160
2161 if m2 == 0xFF {
2162 std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
2163 } else if m2 != 0 {
2164 compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
2165 }
2166 let c2 = m2.count_ones() as usize;
2167
2168 if m3 == 0xFF {
2169 std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
2170 } else if m3 != 0 {
2171 compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
2172 }
2173 let c3 = m3.count_ones() as usize;
2174 wp += c0 + c1 + c2 + c3;
2175 }
2176 ri += 32;
2178 }
2179
2180 if ri + 16 <= len {
2182 let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2183 let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2184 let zero128 = _mm_setzero_si128();
2185
2186 let input = _mm_loadu_si128(sp.add(ri) as *const _);
2187 let biased = _mm_add_epi8(input, bias_v128);
2188 let gt = _mm_cmpgt_epi8(biased, threshold_v128);
2189 let in_range = _mm_cmpeq_epi8(gt, zero128);
2190 let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2191
2192 if keep_mask == 0xFFFF {
2193 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2194 wp += 16;
2195 } else if keep_mask != 0 {
2196 let m0 = keep_mask as u8;
2197 let m1 = (keep_mask >> 8) as u8;
2198 if m0 == 0xFF {
2199 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2200 } else if m0 != 0 {
2201 compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
2202 }
2203 let c0 = m0.count_ones() as usize;
2204 if m1 == 0xFF {
2205 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2206 } else if m1 != 0 {
2207 compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
2208 }
2209 wp += c0 + m1.count_ones() as usize;
2210 }
2211 ri += 16;
2212 }
2213
2214 while ri < len {
2216 let b = *sp.add(ri);
2217 *dp.add(wp) = b;
2218 wp += (b < lo || b > hi) as usize;
2219 ri += 1;
2220 }
2221
2222 wp
2223 }
2224}
2225
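// Scalar left-pack: COMPACT_LUT[mask] lists, in order, the indices of the set
// bits in `mask`, so copying through it moves the kept bytes of an 8-byte block
// to the front of `dst`. Always writes 8 bytes; only mask.count_ones() of them
// are meaningful, and the caller advances the write cursor by that amount.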
2226#[cfg(target_arch = "x86_64")]
2234#[inline(always)]
2235unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
2236 unsafe {
2237 let idx = COMPACT_LUT.get_unchecked(mask as usize);
2238 *dst = *src.add(*idx.get_unchecked(0) as usize);
2239 *dst.add(1) = *src.add(*idx.get_unchecked(1) as usize);
2240 *dst.add(2) = *src.add(*idx.get_unchecked(2) as usize);
2241 *dst.add(3) = *src.add(*idx.get_unchecked(3) as usize);
2242 *dst.add(4) = *src.add(*idx.get_unchecked(4) as usize);
2243 *dst.add(5) = *src.add(*idx.get_unchecked(5) as usize);
2244 *dst.add(6) = *src.add(*idx.get_unchecked(6) as usize);
2245 *dst.add(7) = *src.add(*idx.get_unchecked(7) as usize);
2246 }
2247}
2248
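// Same left-pack as compact_8bytes, but done in one PSHUFB using the LUT row as
// the shuffle control. Requires SSSE3; also writes a full 8 bytes.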
2249#[cfg(target_arch = "x86_64")]
2254#[target_feature(enable = "ssse3")]
2255#[inline]
2256unsafe fn compact_8bytes_simd(src: *const u8, dst: *mut u8, mask: u8) {
2257 use std::arch::x86_64::*;
2258 unsafe {
2259 let src_v = _mm_loadl_epi64(src as *const _);
2260 let shuf = _mm_loadl_epi64(COMPACT_LUT.get_unchecked(mask as usize).as_ptr() as *const _);
2261 let out_v = _mm_shuffle_epi8(src_v, shuf);
2262 _mm_storel_epi64(dst as *mut _, out_v);
2263 }
2264}
2265
2266#[cfg(target_arch = "x86_64")]
2267#[target_feature(enable = "sse2")]
2268unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2269 use std::arch::x86_64::*;
2270
2271 unsafe {
2272 let range = hi - lo;
2273 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2274 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2275 let zero = _mm_setzero_si128();
2276
2277 let len = src.len();
2278 let sp = src.as_ptr();
2279 let dp = dst.as_mut_ptr();
2280 let mut ri = 0;
2281 let mut wp = 0;
2282
2283 while ri + 16 <= len {
2284 let input = _mm_loadu_si128(sp.add(ri) as *const _);
2285 let biased = _mm_add_epi8(input, bias_v);
2286 let gt = _mm_cmpgt_epi8(biased, threshold_v);
2287 let in_range = _mm_cmpeq_epi8(gt, zero);
2288 let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2289
2290 if keep_mask == 0xFFFF {
2291 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2293 wp += 16;
2294 } else if keep_mask != 0 {
2295 let m0 = keep_mask as u8;
2296 let m1 = (keep_mask >> 8) as u8;
2297 if m0 == 0xFF {
2298 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2299 } else if m0 != 0 {
2300 compact_8bytes(sp.add(ri), dp.add(wp), m0);
2301 }
2302 let c0 = m0.count_ones() as usize;
2303 if m1 == 0xFF {
2304 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2305 } else if m1 != 0 {
2306 compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1);
2307 }
2308 wp += c0 + m1.count_ones() as usize;
2309 }
2310 ri += 16;
2311 }
2312
2313 while ri < len {
2315 let b = *sp.add(ri);
2316 *dp.add(wp) = b;
2317 wp += (b < lo || b > hi) as usize;
2318 ri += 1;
2319 }
2320
2321 wp
2322 }
2323}
2324
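// Portable fallback: branchless "store the byte, then advance the write cursor
// only if it is kept", unrolled by eight.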
2325#[cfg(not(target_arch = "x86_64"))]
2329fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2330 let len = src.len();
2331 let sp = src.as_ptr();
2332 let dp = dst.as_mut_ptr();
2333 let mut wp: usize = 0;
2334 let mut i: usize = 0;
2335
2336 while i + 8 <= len {
2338 unsafe {
2339 let b0 = *sp.add(i);
2340 *dp.add(wp) = b0;
2341 wp += (b0 < lo || b0 > hi) as usize;
2342 let b1 = *sp.add(i + 1);
2343 *dp.add(wp) = b1;
2344 wp += (b1 < lo || b1 > hi) as usize;
2345 let b2 = *sp.add(i + 2);
2346 *dp.add(wp) = b2;
2347 wp += (b2 < lo || b2 > hi) as usize;
2348 let b3 = *sp.add(i + 3);
2349 *dp.add(wp) = b3;
2350 wp += (b3 < lo || b3 > hi) as usize;
2351 let b4 = *sp.add(i + 4);
2352 *dp.add(wp) = b4;
2353 wp += (b4 < lo || b4 > hi) as usize;
2354 let b5 = *sp.add(i + 5);
2355 *dp.add(wp) = b5;
2356 wp += (b5 < lo || b5 > hi) as usize;
2357 let b6 = *sp.add(i + 6);
2358 *dp.add(wp) = b6;
2359 wp += (b6 < lo || b6 > hi) as usize;
2360 let b7 = *sp.add(i + 7);
2361 *dp.add(wp) = b7;
2362 wp += (b7 < lo || b7 > hi) as usize;
2363 }
2364 i += 8;
2365 }
2366
2367 while i < len {
2369 unsafe {
2370 let b = *sp.add(i);
2371 *dp.add(wp) = b;
2372 wp += (b < lo || b > hi) as usize;
2373 }
2374 i += 1;
2375 }
2376
2377 wp
2378}
2379
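// Streaming variant: read into a reusable buffer, compact it in place, and
// write the surviving bytes.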
2380fn delete_range_streaming(
2385 lo: u8,
2386 hi: u8,
2387 reader: &mut impl Read,
2388 writer: &mut impl Write,
2389) -> io::Result<()> {
2390 let mut buf = alloc_uninit_vec(STREAM_BUF);
2391 loop {
2392 let n = read_once(reader, &mut buf)?;
2393 if n == 0 {
2394 break;
2395 }
2396 let wp = delete_range_inplace(&mut buf, n, lo, hi);
2397 if wp > 0 {
2398 writer.write_all(&buf[..wp])?;
2399 }
2400 }
2401 Ok(())
2402}
2403
2404#[inline]
2407fn delete_range_inplace(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2408 #[cfg(target_arch = "x86_64")]
2409 {
2410 let level = get_simd_level();
2411 if level >= 3 {
2412 return unsafe { delete_range_inplace_avx2(buf, n, lo, hi) };
2413 }
2414 }
2415 let ptr = buf.as_mut_ptr();
2417 let mut ri = 0;
2418 let mut wp = 0;
2419 unsafe {
2420 while ri + 8 <= n {
2421 let b0 = *ptr.add(ri);
2422 let b1 = *ptr.add(ri + 1);
2423 let b2 = *ptr.add(ri + 2);
2424 let b3 = *ptr.add(ri + 3);
2425 let b4 = *ptr.add(ri + 4);
2426 let b5 = *ptr.add(ri + 5);
2427 let b6 = *ptr.add(ri + 6);
2428 let b7 = *ptr.add(ri + 7);
2429 *ptr.add(wp) = b0;
2430 wp += (b0 < lo || b0 > hi) as usize;
2431 *ptr.add(wp) = b1;
2432 wp += (b1 < lo || b1 > hi) as usize;
2433 *ptr.add(wp) = b2;
2434 wp += (b2 < lo || b2 > hi) as usize;
2435 *ptr.add(wp) = b3;
2436 wp += (b3 < lo || b3 > hi) as usize;
2437 *ptr.add(wp) = b4;
2438 wp += (b4 < lo || b4 > hi) as usize;
2439 *ptr.add(wp) = b5;
2440 wp += (b5 < lo || b5 > hi) as usize;
2441 *ptr.add(wp) = b6;
2442 wp += (b6 < lo || b6 > hi) as usize;
2443 *ptr.add(wp) = b7;
2444 wp += (b7 < lo || b7 > hi) as usize;
2445 ri += 8;
2446 }
2447 while ri < n {
2448 let b = *ptr.add(ri);
2449 *ptr.add(wp) = b;
2450 wp += (b < lo || b > hi) as usize;
2451 ri += 1;
2452 }
2453 }
2454 wp
2455}
2456
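// In-place AVX2 compaction. The write cursor never overtakes the read cursor,
// so the possibly overlapping block moves use ptr::copy rather than
// copy_nonoverlapping.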
2457#[cfg(target_arch = "x86_64")]
2460#[target_feature(enable = "avx2")]
2461unsafe fn delete_range_inplace_avx2(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2462 use std::arch::x86_64::*;
2463
2464 unsafe {
2465 let range = hi - lo;
2466 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2467 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2468 let zero = _mm256_setzero_si256();
2469
2470 let ptr = buf.as_mut_ptr();
2471 let mut ri = 0;
2472 let mut wp = 0;
2473
2474 while ri + 32 <= n {
2475 let input = _mm256_loadu_si256(ptr.add(ri) as *const _);
2476 let biased = _mm256_add_epi8(input, bias_v);
2477 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2478 let in_range = _mm256_cmpeq_epi8(gt, zero);
2479 let del_mask = _mm256_movemask_epi8(in_range) as u32;
2480
2481 if del_mask == 0 {
2482 if wp != ri {
2484 std::ptr::copy(ptr.add(ri), ptr.add(wp), 32);
2485 }
2486 wp += 32;
2487 } else if del_mask != 0xFFFFFFFF {
2488 let keep_mask = !del_mask;
2493 let m0 = keep_mask as u8;
2494 let m1 = (keep_mask >> 8) as u8;
2495 let m2 = (keep_mask >> 16) as u8;
2496 let m3 = (keep_mask >> 24) as u8;
2497
2498 let c0 = m0.count_ones() as usize;
2499 let c1 = m1.count_ones() as usize;
2500 let c2 = m2.count_ones() as usize;
2501 let c3 = m3.count_ones() as usize;
2502
2503 if m0 == 0xFF {
2505 std::ptr::copy(ptr.add(ri), ptr.add(wp), 8);
2506 } else if m0 != 0 {
2507 let src_v = _mm_loadl_epi64(ptr.add(ri) as *const _);
2508 let shuf = _mm_loadl_epi64(COMPACT_LUT[m0 as usize].as_ptr() as *const _);
2509 let out_v = _mm_shuffle_epi8(src_v, shuf);
2510 _mm_storel_epi64(ptr.add(wp) as *mut _, out_v);
2511 }
2512
2513 if m1 == 0xFF {
2515 std::ptr::copy(ptr.add(ri + 8), ptr.add(wp + c0), 8);
2516 } else if m1 != 0 {
2517 let src_v = _mm_loadl_epi64(ptr.add(ri + 8) as *const _);
2518 let shuf = _mm_loadl_epi64(COMPACT_LUT[m1 as usize].as_ptr() as *const _);
2519 let out_v = _mm_shuffle_epi8(src_v, shuf);
2520 _mm_storel_epi64(ptr.add(wp + c0) as *mut _, out_v);
2521 }
2522
2523 if m2 == 0xFF {
2525 std::ptr::copy(ptr.add(ri + 16), ptr.add(wp + c0 + c1), 8);
2526 } else if m2 != 0 {
2527 let src_v = _mm_loadl_epi64(ptr.add(ri + 16) as *const _);
2528 let shuf = _mm_loadl_epi64(COMPACT_LUT[m2 as usize].as_ptr() as *const _);
2529 let out_v = _mm_shuffle_epi8(src_v, shuf);
2530 _mm_storel_epi64(ptr.add(wp + c0 + c1) as *mut _, out_v);
2531 }
2532
2533 if m3 == 0xFF {
2535 std::ptr::copy(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), 8);
2536 } else if m3 != 0 {
2537 let src_v = _mm_loadl_epi64(ptr.add(ri + 24) as *const _);
2538 let shuf = _mm_loadl_epi64(COMPACT_LUT[m3 as usize].as_ptr() as *const _);
2539 let out_v = _mm_shuffle_epi8(src_v, shuf);
2540 _mm_storel_epi64(ptr.add(wp + c0 + c1 + c2) as *mut _, out_v);
2541 }
2542
2543 wp += c0 + c1 + c2 + c3;
2544 }
2545 ri += 32;
2547 }
2548
2549 while ri < n {
2551 let b = *ptr.add(ri);
2552 *ptr.add(wp) = b;
2553 wp += (b < lo || b > hi) as usize;
2554 ri += 1;
2555 }
2556
2557 wp
2558 }
2559}
2560
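/// Streaming byte translation: maps each input byte through the SET1 -> SET2
/// table and writes the result. Identity tables degrade to a plain copy, and
/// tables that amount to "shift a contiguous range by a constant" or "map a
/// contiguous range to one byte" take dedicated SIMD fast paths.
///
/// A caller might, for example (hypothetical usage), uppercase ASCII with:
/// `translate(b"abcdefghijklmnopqrstuvwxyz", b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
///            &mut io::stdin().lock(), &mut io::stdout().lock())?;`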
2561pub fn translate(
2566 set1: &[u8],
2567 set2: &[u8],
2568 reader: &mut impl Read,
2569 writer: &mut impl Write,
2570) -> io::Result<()> {
2571 let table = build_translate_table(set1, set2);
2572
2573 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
2575 if is_identity {
2576 return passthrough_stream(reader, writer);
2577 }
2578
2579 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
2581 return translate_range_stream(lo, hi, offset, reader, writer);
2582 }
2583
2584 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
2587 return translate_range_to_constant_stream(lo, hi, replacement, reader, writer);
2588 }
2589
2590 let mut buf = alloc_uninit_vec(STREAM_BUF);
2595 loop {
2596 let n = read_once(reader, &mut buf)?;
2597 if n == 0 {
2598 break;
2599 }
2600 translate_and_write_table(&mut buf, n, &table, writer)?;
2601 }
2602 Ok(())
2603}
2604
2605#[inline]
2606fn translate_and_write_table(
2607 buf: &mut [u8],
2608 total: usize,
2609 table: &[u8; 256],
2610 writer: &mut impl Write,
2611) -> io::Result<()> {
2612 if total >= PARALLEL_THRESHOLD {
2613 let nt = rayon::current_num_threads().max(1);
2614 let cs = (total / nt).max(32 * 1024);
2615 buf[..total].par_chunks_mut(cs).for_each(|chunk| {
2616 translate_inplace(chunk, table);
2617 });
2618 } else {
2619 translate_inplace(&mut buf[..total], table);
2620 }
2621 writer.write_all(&buf[..total])
2622}
2623
2624fn translate_range_stream(
2629 lo: u8,
2630 hi: u8,
2631 offset: i8,
2632 reader: &mut impl Read,
2633 writer: &mut impl Write,
2634) -> io::Result<()> {
2635 let mut buf = alloc_uninit_vec(STREAM_BUF);
2636 loop {
2637 let n = read_once(reader, &mut buf)?;
2638 if n == 0 {
2639 break;
2640 }
2641 translate_and_write_range(&mut buf, n, lo, hi, offset, writer)?;
2642 }
2643 Ok(())
2644}
2645
2646#[inline]
2647fn translate_and_write_range(
2648 buf: &mut [u8],
2649 total: usize,
2650 lo: u8,
2651 hi: u8,
2652 offset: i8,
2653 writer: &mut impl Write,
2654) -> io::Result<()> {
2655 if total >= PARALLEL_THRESHOLD {
2656 let nt = rayon::current_num_threads().max(1);
2657 let cs = (total / nt).max(32 * 1024);
2658 buf[..total].par_chunks_mut(cs).for_each(|chunk| {
2659 translate_range_simd_inplace(chunk, lo, hi, offset);
2660 });
2661 } else {
2662 translate_range_simd_inplace(&mut buf[..total], lo, hi, offset);
2663 }
2664 writer.write_all(&buf[..total])
2665}
2666
2667fn translate_range_to_constant_stream(
2671 lo: u8,
2672 hi: u8,
2673 replacement: u8,
2674 reader: &mut impl Read,
2675 writer: &mut impl Write,
2676) -> io::Result<()> {
2677 let mut buf = alloc_uninit_vec(STREAM_BUF);
2678 loop {
2679 let n = read_once(reader, &mut buf)?;
2680 if n == 0 {
2681 break;
2682 }
2683 translate_and_write_range_const(&mut buf, n, lo, hi, replacement, writer)?;
2684 }
2685 Ok(())
2686}
2687
2688#[inline]
2689fn translate_and_write_range_const(
2690 buf: &mut [u8],
2691 total: usize,
2692 lo: u8,
2693 hi: u8,
2694 replacement: u8,
2695 writer: &mut impl Write,
2696) -> io::Result<()> {
2697 if total >= PARALLEL_THRESHOLD {
2698 let nt = rayon::current_num_threads().max(1);
2699 let cs = (total / nt).max(32 * 1024);
2700 buf[..total].par_chunks_mut(cs).for_each(|chunk| {
2701 translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
2702 });
2703 } else {
2704 translate_range_to_constant_simd_inplace(&mut buf[..total], lo, hi, replacement);
2705 }
2706 writer.write_all(&buf[..total])
2707}
2708
2709fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
2712 let mut buf = alloc_uninit_vec(STREAM_BUF);
2713 loop {
2714 let n = read_once(reader, &mut buf)?;
2715 if n == 0 {
2716 break;
2717 }
2718 writer.write_all(&buf[..n])?;
2719 }
2720 Ok(())
2721}
2722
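// One read() call, retried only on EINTR. Returns Ok(0) at end of input; short
// reads are fine because every caller loops until it sees 0.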
2723#[inline]
2729fn read_once(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
2730 loop {
2731 match reader.read(buf) {
2732 Ok(n) => return Ok(n),
2733 Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
2734 Err(e) => return Err(e),
2735 }
2736 }
2737}
2738
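/// Translate SET1 -> SET2, then squeeze repeated occurrences of bytes that are
/// members of SET2 (tr-style "-s" applied after translation). `last_squeezed`
/// uses 256 as an impossible-byte sentinel so squeeze state carries across
/// buffer refills.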
2739pub fn translate_squeeze(
2740 set1: &[u8],
2741 set2: &[u8],
2742 reader: &mut impl Read,
2743 writer: &mut impl Write,
2744) -> io::Result<()> {
2745 let table = build_translate_table(set1, set2);
2746 let squeeze_set = build_member_set(set2);
2747
2748 if !set2.is_empty() && set2.iter().all(|&b| b == set2[0]) {
2751 let squeeze_ch = set2.last().copied().unwrap_or(0);
2752 return translate_squeeze_single_ch(&table, squeeze_ch, &squeeze_set, reader, writer);
2753 }
2754
2755 let range_info = detect_range_offset(&table);
2759 let range_const_info = if range_info.is_none() {
2760 detect_range_to_constant(&table)
2761 } else {
2762 None
2763 };
2764
2765 let mut buf = alloc_uninit_vec(STREAM_BUF);
2766 let mut last_squeezed: u16 = 256;
2767
2768 loop {
2769 let n = read_once(reader, &mut buf)?;
2770 if n == 0 {
2771 break;
2772 }
2773 let wp = translate_squeeze_process(
2774 &mut buf,
2775 n,
2776 &table,
2777 &squeeze_set,
2778 range_info,
2779 range_const_info,
2780 &mut last_squeezed,
2781 );
2782 if wp > 0 {
2783 writer.write_all(&buf[..wp])?;
2784 }
2785 }
2786 Ok(())
2787}
2788
2789#[inline]
2790fn translate_squeeze_process(
2791 buf: &mut [u8],
2792 n: usize,
2793 table: &[u8; 256],
2794 squeeze_set: &[u8; 32],
2795 range_info: Option<(u8, u8, i8)>,
2796 range_const_info: Option<(u8, u8, u8)>,
2797 last_squeezed: &mut u16,
2798) -> usize {
2799 if let Some((lo, hi, offset)) = range_info {
2801 translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2802 } else if let Some((lo, hi, replacement)) = range_const_info {
2803 translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2804 } else {
2805 translate_inplace(&mut buf[..n], table);
2806 }
2807 let mut wp = 0;
2809 unsafe {
2810 let ptr = buf.as_mut_ptr();
2811 let mut i = 0;
2812 while i + 8 <= n {
2813 macro_rules! squeeze_byte {
2814 ($off:expr) => {
2815 let b = *ptr.add(i + $off);
2816 if is_member(squeeze_set, b) {
2817 if *last_squeezed != b as u16 {
2818 *last_squeezed = b as u16;
2819 *ptr.add(wp) = b;
2820 wp += 1;
2821 }
2822 } else {
2823 *last_squeezed = 256;
2824 *ptr.add(wp) = b;
2825 wp += 1;
2826 }
2827 };
2828 }
2829 squeeze_byte!(0);
2830 squeeze_byte!(1);
2831 squeeze_byte!(2);
2832 squeeze_byte!(3);
2833 squeeze_byte!(4);
2834 squeeze_byte!(5);
2835 squeeze_byte!(6);
2836 squeeze_byte!(7);
2837 i += 8;
2838 }
2839 while i < n {
2840 let b = *ptr.add(i);
2841 if is_member(squeeze_set, b) {
2842 if *last_squeezed == b as u16 {
2843 i += 1;
2844 continue;
2845 }
2846 *last_squeezed = b as u16;
2847 } else {
2848 *last_squeezed = 256;
2849 }
2850 *ptr.add(wp) = b;
2851 wp += 1;
2852 i += 1;
2853 }
2854 }
2855 wp
2856}
2857
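// Special case: every byte of SET2 is the same character, so after translation
// only runs of that one byte can ever need squeezing. A memmem search for the
// two-byte pair jumps straight to those runs.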
2858fn translate_squeeze_single_ch(
2862 table: &[u8; 256],
2863 squeeze_ch: u8,
2864 _squeeze_set: &[u8; 32],
2865 reader: &mut impl Read,
2866 writer: &mut impl Write,
2867) -> io::Result<()> {
2868 let range_info = detect_range_offset(table);
2869 let range_const_info = if range_info.is_none() {
2870 detect_range_to_constant(table)
2871 } else {
2872 None
2873 };
2874
2875 let pair = [squeeze_ch, squeeze_ch];
2876 let finder = memchr::memmem::Finder::new(&pair);
2877 let mut buf = alloc_uninit_vec(STREAM_BUF);
2878 let mut was_squeeze_char = false;
2879
2880 loop {
2881 let n = read_once(reader, &mut buf)?;
2882 if n == 0 {
2883 break;
2884 }
2885 let wp = translate_squeeze_single_process(
2886 &mut buf,
2887 n,
2888 table,
2889 squeeze_ch,
2890 &finder,
2891 range_info,
2892 range_const_info,
2893 &mut was_squeeze_char,
2894 );
2895 if wp > 0 {
2896 writer.write_all(&buf[..wp])?;
2897 }
2898 }
2899 Ok(())
2900}
2901
2902#[inline]
2903fn translate_squeeze_single_process(
2904 buf: &mut [u8],
2905 n: usize,
2906 table: &[u8; 256],
2907 squeeze_ch: u8,
2908 finder: &memchr::memmem::Finder<'_>,
2909 range_info: Option<(u8, u8, i8)>,
2910 range_const_info: Option<(u8, u8, u8)>,
2911 was_squeeze_char: &mut bool,
2912) -> usize {
2913 if let Some((lo, hi, offset)) = range_info {
2915 translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2916 } else if let Some((lo, hi, replacement)) = range_const_info {
2917 translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2918 } else {
2919 translate_inplace(&mut buf[..n], table);
2920 }
2921
2922 let mut i = 0;
2924 if *was_squeeze_char {
2925 while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2926 i += 1;
2927 }
2928 *was_squeeze_char = false;
2929 if i >= n {
2930 *was_squeeze_char = true;
2931 return 0;
2932 }
2933 }
2934
2935 let ptr = buf.as_mut_ptr();
2936 let mut wp = 0usize;
2937
2938 loop {
2939 match finder.find(&buf[i..n]) {
2940 Some(offset) => {
2941 let seg_end = i + offset + 1;
2942 let gap = seg_end - i;
2943 if gap > 0 {
2944 if wp != i {
2945 unsafe {
2946 std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
2947 }
2948 }
2949 wp += gap;
2950 }
2951 i = seg_end;
2952 while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2953 i += 1;
2954 }
2955 if i >= n {
2956 *was_squeeze_char = true;
2957 break;
2958 }
2959 }
2960 None => {
2961 let rem = n - i;
2962 if rem > 0 {
2963 if wp != i {
2964 unsafe {
2965 std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
2966 }
2967 }
2968 wp += rem;
2969 }
2970 *was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == squeeze_ch;
2971 break;
2972 }
2973 }
2974 }
2975 wp
2976}
2977
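/// Streaming delete. Strategy ladder: one byte -> memchr, two or three bytes ->
/// memchr2/memchr3, a contiguous byte range -> SIMD range compaction, anything
/// else -> 256-bit membership bitset.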
2978pub fn delete(
2979 delete_chars: &[u8],
2980 reader: &mut impl Read,
2981 writer: &mut impl Write,
2982) -> io::Result<()> {
2983 if delete_chars.len() == 1 {
2984 return delete_single_streaming(delete_chars[0], reader, writer);
2985 }
2986 if delete_chars.len() <= 3 {
2987 return delete_multi_streaming(delete_chars, reader, writer);
2988 }
2989
2990 if let Some((lo, hi)) = detect_delete_range(delete_chars) {
2994 return delete_range_streaming(lo, hi, reader, writer);
2995 }
2996
2997 let member = build_member_set(delete_chars);
2998 let mut buf = alloc_uninit_vec(STREAM_BUF);
2999 let mut outbuf = alloc_uninit_vec(STREAM_BUF);
3002
3003 loop {
3004 let n = read_once(reader, &mut buf)?;
3005 if n == 0 {
3006 break;
3007 }
3008 let wp = delete_bitset_dispatch(&buf[..n], &mut outbuf, &member);
3009 if wp > 0 {
3010 writer.write_all(&outbuf[..wp])?;
3011 }
3012 }
3013 Ok(())
3014}
3015
3016#[inline]
3017fn delete_bitset_dispatch(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3018 #[cfg(target_arch = "x86_64")]
3019 {
3020 if get_simd_level() >= 3 {
3021 return unsafe { delete_bitset_avx2_stream(src, dst, member) };
3022 }
3023 }
3024 delete_bitset_scalar(src, dst, member)
3025}
3026
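// Branchless bitset delete: always store the byte, then advance the write
// cursor only when it is not a member of the delete set.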
3027#[inline]
3029fn delete_bitset_scalar(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3030 let n = src.len();
3031 let mut wp = 0;
3032 unsafe {
3033 let sp = src.as_ptr();
3034 let dp = dst.as_mut_ptr();
3035 let mut i = 0;
3036 while i + 8 <= n {
3037 let b0 = *sp.add(i);
3038 let b1 = *sp.add(i + 1);
3039 let b2 = *sp.add(i + 2);
3040 let b3 = *sp.add(i + 3);
3041 let b4 = *sp.add(i + 4);
3042 let b5 = *sp.add(i + 5);
3043 let b6 = *sp.add(i + 6);
3044 let b7 = *sp.add(i + 7);
3045 *dp.add(wp) = b0;
3046 wp += !is_member(member, b0) as usize;
3047 *dp.add(wp) = b1;
3048 wp += !is_member(member, b1) as usize;
3049 *dp.add(wp) = b2;
3050 wp += !is_member(member, b2) as usize;
3051 *dp.add(wp) = b3;
3052 wp += !is_member(member, b3) as usize;
3053 *dp.add(wp) = b4;
3054 wp += !is_member(member, b4) as usize;
3055 *dp.add(wp) = b5;
3056 wp += !is_member(member, b5) as usize;
3057 *dp.add(wp) = b6;
3058 wp += !is_member(member, b6) as usize;
3059 *dp.add(wp) = b7;
3060 wp += !is_member(member, b7) as usize;
3061 i += 8;
3062 }
3063 while i < n {
3064 let b = *sp.add(i);
3065 *dp.add(wp) = b;
3066 wp += !is_member(member, b) as usize;
3067 i += 1;
3068 }
3069 }
3070 wp
3071}
3072
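// Vectorised membership test against the 32-byte bitset: byte >> 3 selects one
// of the 32 set bytes (two PSHUFB lookups over the low/high 16 bytes, merged
// with BLENDV keyed on bit 4 of the index), and byte & 7 selects the bit via a
// small power-of-two table. Bytes whose bit is clear are kept and left-packed
// with the same COMPACT_LUT scheme as the range delete.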
3073#[cfg(target_arch = "x86_64")]
3076#[target_feature(enable = "avx2")]
3077unsafe fn delete_bitset_avx2_stream(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3078 use std::arch::x86_64::*;
3079
3080 unsafe {
3081 let n = src.len();
3082 let sp = src.as_ptr();
3083 let dp = dst.as_mut_ptr();
3084 let mut ri = 0;
3085 let mut wp = 0;
3086
3087 let member_v = _mm256_loadu_si256(member.as_ptr() as *const _);
3090
3091 let mask7 = _mm256_set1_epi8(7);
3094 let mask_0x1f = _mm256_set1_epi8(0x1F_u8 as i8);
3095
3096 let bit_table = _mm256_setr_epi8(
3099 1, 2, 4, 8, 16, 32, 64, -128i8, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, -128i8,
3100 0, 0, 0, 0, 0, 0, 0, 0,
3101 );
3102
3103 while ri + 32 <= n {
3104 let input = _mm256_loadu_si256(sp.add(ri) as *const _);
3105
3106 let byte_idx = _mm256_and_si256(_mm256_srli_epi16(input, 3), mask_0x1f);
3108 let bit_pos = _mm256_and_si256(input, mask7);
3110 let bit_mask = _mm256_shuffle_epi8(bit_table, bit_pos);
3112
3113 let member_lo = _mm256_broadcastsi128_si256(_mm256_castsi256_si128(member_v));
3121 let member_hi = _mm256_broadcastsi128_si256(_mm256_extracti128_si256(member_v, 1));
3122 let lo_mask = _mm256_set1_epi8(0x0F);
3123 let idx_lo = _mm256_and_si256(byte_idx, lo_mask);
3124 let shuffled_lo = _mm256_shuffle_epi8(member_lo, idx_lo);
3125 let shuffled_hi = _mm256_shuffle_epi8(member_hi, idx_lo);
3126 let use_hi = _mm256_slli_epi16(byte_idx, 3);
3127 let member_byte = _mm256_blendv_epi8(shuffled_lo, shuffled_hi, use_hi);
3129
3130 let test = _mm256_and_si256(member_byte, bit_mask);
3132 let is_zero = _mm256_cmpeq_epi8(test, _mm256_setzero_si256());
3133 let keep_mask = _mm256_movemask_epi8(is_zero) as u32;
3135
3136 if keep_mask == 0xFFFFFFFF {
3137 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
3139 wp += 32;
3140 } else if keep_mask != 0 {
3141 let m0 = keep_mask as u8;
3143 let m1 = (keep_mask >> 8) as u8;
3144 let m2 = (keep_mask >> 16) as u8;
3145 let m3 = (keep_mask >> 24) as u8;
3146
3147 if m0 == 0xFF {
3148 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
3149 } else if m0 != 0 {
3150 compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
3151 }
3152 let c0 = m0.count_ones() as usize;
3153
3154 if m1 == 0xFF {
3155 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
3156 } else if m1 != 0 {
3157 compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
3158 }
3159 let c1 = m1.count_ones() as usize;
3160
3161 if m2 == 0xFF {
3162 std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
3163 } else if m2 != 0 {
3164 compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
3165 }
3166 let c2 = m2.count_ones() as usize;
3167
3168 if m3 == 0xFF {
3169 std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
3170 } else if m3 != 0 {
3171 compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
3172 }
3173 let c3 = m3.count_ones() as usize;
3174 wp += c0 + c1 + c2 + c3;
3175 }
3176 ri += 32;
3178 }
3179
3180 while ri < n {
3182 let b = *sp.add(ri);
3183 *dp.add(wp) = b;
3184 wp += !is_member(member, b) as usize;
3185 ri += 1;
3186 }
3187
3188 wp
3189 }
3190}
3191
3192fn delete_single_streaming(
3193 ch: u8,
3194 reader: &mut impl Read,
3195 writer: &mut impl Write,
3196) -> io::Result<()> {
3197 let mut buf = alloc_uninit_vec(STREAM_BUF);
3198 loop {
3199 let n = read_once(reader, &mut buf)?;
3200 if n == 0 {
3201 break;
3202 }
3203 let wp = delete_single_inplace(&mut buf, n, ch);
3204 if wp > 0 {
3205 writer.write_all(&buf[..wp])?;
3206 }
3207 }
3208 Ok(())
3209}
3210
3211#[inline]
3213fn delete_single_inplace(buf: &mut [u8], n: usize, ch: u8) -> usize {
3214 let mut wp = 0;
3215 let mut i = 0;
3216 while i < n {
3217 match memchr::memchr(ch, &buf[i..n]) {
3218 Some(offset) => {
3219 if offset > 0 {
3220 if wp != i {
3221 unsafe {
3222 std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), offset);
3223 }
3224 }
3225 wp += offset;
3226 }
3227 i += offset + 1;
3228 }
3229 None => {
3230 let run_len = n - i;
3231 if run_len > 0 {
3232 if wp != i {
3233 unsafe {
3234 std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), run_len);
3235 }
3236 }
3237 wp += run_len;
3238 }
3239 break;
3240 }
3241 }
3242 }
3243 wp
3244}
3245
3246fn delete_multi_streaming(
3247 chars: &[u8],
3248 reader: &mut impl Read,
3249 writer: &mut impl Write,
3250) -> io::Result<()> {
3251 let mut buf = alloc_uninit_vec(STREAM_BUF);
3252 loop {
3253 let n = read_once(reader, &mut buf)?;
3254 if n == 0 {
3255 break;
3256 }
3257 let wp = delete_multi_inplace(&mut buf, n, chars);
3258 if wp > 0 {
3259 writer.write_all(&buf[..wp])?;
3260 }
3261 }
3262 Ok(())
3263}
3264
3265#[inline]
3267fn delete_multi_inplace(buf: &mut [u8], n: usize, chars: &[u8]) -> usize {
3268 let mut wp = 0;
3269 let mut i = 0;
3270 while i < n {
3271 let found = if chars.len() == 2 {
3272 memchr::memchr2(chars[0], chars[1], &buf[i..n])
3273 } else {
3274 memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
3275 };
3276 match found {
3277 Some(offset) => {
3278 if offset > 0 {
3279 if wp != i {
3280 unsafe {
3281 std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), offset);
3282 }
3283 }
3284 wp += offset;
3285 }
3286 i += offset + 1;
3287 }
3288 None => {
3289 let run_len = n - i;
3290 if run_len > 0 {
3291 if wp != i {
3292 unsafe {
3293 std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), run_len);
3294 }
3295 }
3296 wp += run_len;
3297 }
3298 break;
3299 }
3300 }
3301 }
3302 wp
3303}
3304
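/// Combined delete + squeeze (tr -ds style): drop members of `delete_chars`,
/// then squeeze repeats of the surviving bytes that are members of
/// `squeeze_chars`.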
3305pub fn delete_squeeze(
3306 delete_chars: &[u8],
3307 squeeze_chars: &[u8],
3308 reader: &mut impl Read,
3309 writer: &mut impl Write,
3310) -> io::Result<()> {
3311 let delete_set = build_member_set(delete_chars);
3312 let squeeze_set = build_member_set(squeeze_chars);
3313 let mut buf = alloc_uninit_vec(STREAM_BUF);
3314 let mut last_squeezed: u16 = 256;
3315
3316 loop {
3317 let n = read_once(reader, &mut buf)?;
3318 if n == 0 {
3319 break;
3320 }
3321 let wp = delete_squeeze_inplace(&mut buf, n, &delete_set, &squeeze_set, &mut last_squeezed);
3322 if wp > 0 {
3323 writer.write_all(&buf[..wp])?;
3324 }
3325 }
3326 Ok(())
3327}
3328
3329#[inline]
3330fn delete_squeeze_inplace(
3331 buf: &mut [u8],
3332 n: usize,
3333 delete_set: &[u8; 32],
3334 squeeze_set: &[u8; 32],
3335 last_squeezed: &mut u16,
3336) -> usize {
3337 let mut wp = 0;
3338 unsafe {
3339 let ptr = buf.as_mut_ptr();
3340 let mut i = 0;
3341 while i + 8 <= n {
3342 macro_rules! process_byte {
3343 ($off:expr) => {
3344 let b = *ptr.add(i + $off);
3345 if !is_member(delete_set, b) {
3346 if is_member(squeeze_set, b) {
3347 if *last_squeezed != b as u16 {
3348 *last_squeezed = b as u16;
3349 *ptr.add(wp) = b;
3350 wp += 1;
3351 }
3352 } else {
3353 *last_squeezed = 256;
3354 *ptr.add(wp) = b;
3355 wp += 1;
3356 }
3357 }
3358 };
3359 }
3360 process_byte!(0);
3361 process_byte!(1);
3362 process_byte!(2);
3363 process_byte!(3);
3364 process_byte!(4);
3365 process_byte!(5);
3366 process_byte!(6);
3367 process_byte!(7);
3368 i += 8;
3369 }
3370 while i < n {
3371 let b = *ptr.add(i);
3372 if !is_member(delete_set, b) {
3373 if is_member(squeeze_set, b) {
3374 if *last_squeezed != b as u16 {
3375 *last_squeezed = b as u16;
3376 *ptr.add(wp) = b;
3377 wp += 1;
3378 }
3379 } else {
3380 *last_squeezed = 256;
3381 *ptr.add(wp) = b;
3382 wp += 1;
3383 }
3384 }
3385 i += 1;
3386 }
3387 }
3388 wp
3389}
3390
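/// Squeeze (tr -s style): collapse runs of any byte in `squeeze_chars` to a
/// single occurrence, streaming. Small sets get memchr/memmem-driven fast
/// paths.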
3391pub fn squeeze(
3392 squeeze_chars: &[u8],
3393 reader: &mut impl Read,
3394 writer: &mut impl Write,
3395) -> io::Result<()> {
3396 if squeeze_chars.len() == 1 {
3397 return squeeze_single_stream(squeeze_chars[0], reader, writer);
3398 }
3399
3400 if squeeze_chars.len() <= 3 {
3403 return squeeze_multi_stream(squeeze_chars, reader, writer);
3404 }
3405
3406 let member = build_member_set(squeeze_chars);
3407 let mut buf = alloc_uninit_vec(STREAM_BUF);
3408 let mut last_squeezed: u16 = 256;
3409
3410 loop {
3411 let n = read_once(reader, &mut buf)?;
3412 if n == 0 {
3413 break;
3414 }
3415 let wp = squeeze_inplace_bitset(&mut buf, n, &member, &mut last_squeezed);
3416 if wp > 0 {
3417 writer.write_all(&buf[..wp])?;
3418 }
3419 }
3420 Ok(())
3421}
3422
3423#[inline]
3424fn squeeze_inplace_bitset(
3425 buf: &mut [u8],
3426 n: usize,
3427 member: &[u8; 32],
3428 last_squeezed: &mut u16,
3429) -> usize {
3430 let mut wp = 0;
3431 unsafe {
3432 let ptr = buf.as_mut_ptr();
3433 for i in 0..n {
3434 let b = *ptr.add(i);
3435 if is_member(member, b) {
3436 if *last_squeezed == b as u16 {
3437 continue;
3438 }
3439 *last_squeezed = b as u16;
3440 } else {
3441 *last_squeezed = 256;
3442 }
3443 *ptr.add(wp) = b;
3444 wp += 1;
3445 }
3446 }
3447 wp
3448}
3449
3450fn squeeze_multi_stream(
3454 chars: &[u8],
3455 reader: &mut impl Read,
3456 writer: &mut impl Write,
3457) -> io::Result<()> {
3458 let c0 = chars[0];
3459 let c1 = chars[1];
3460 let c2 = if chars.len() >= 3 {
3461 Some(chars[2])
3462 } else {
3463 None
3464 };
3465
3466 let mut buf = alloc_uninit_vec(STREAM_BUF);
3467 let mut last_squeezed: u16 = 256;
3468
3469 loop {
3470 let n = read_once(reader, &mut buf)?;
3471 if n == 0 {
3472 break;
3473 }
3474 let wp = squeeze_multi_compact(&mut buf, n, c0, c1, c2, &mut last_squeezed);
3475 if wp > 0 {
3476 writer.write_all(&buf[..wp])?;
3477 }
3478 }
3479 Ok(())
3480}
3481
3482#[inline]
3484fn squeeze_multi_compact(
3485 buf: &mut [u8],
3486 n: usize,
3487 c0: u8,
3488 c1: u8,
3489 c2: Option<u8>,
3490 last_squeezed: &mut u16,
3491) -> usize {
3492 let ptr = buf.as_mut_ptr();
3493 let mut wp = 0usize;
3494 let mut cursor = 0usize;
3495
3496 while cursor < n {
3497 let found = if let Some(c) = c2 {
3498 memchr::memchr3(c0, c1, c, &buf[cursor..n])
3499 } else {
3500 memchr::memchr2(c0, c1, &buf[cursor..n])
3501 };
3502 match found {
3503 Some(offset) => {
3504 let pos = cursor + offset;
3505 let b = unsafe { *ptr.add(pos) };
3506
3507 let gap = pos - cursor;
3508 if gap > 0 {
3509 if wp != cursor {
3510 unsafe {
3511 std::ptr::copy(ptr.add(cursor), ptr.add(wp), gap);
3512 }
3513 }
3514 wp += gap;
3515 *last_squeezed = 256;
3516 }
3517
3518 if *last_squeezed != b as u16 {
3519 unsafe { *ptr.add(wp) = b };
3520 wp += 1;
3521 *last_squeezed = b as u16;
3522 }
3523
3524 cursor = pos + 1;
3525 while cursor < n && unsafe { *ptr.add(cursor) } == b {
3526 cursor += 1;
3527 }
3528 }
3529 None => {
3530 let rem = n - cursor;
3531 if rem > 0 {
3532 if wp != cursor {
3533 unsafe {
3534 std::ptr::copy(ptr.add(cursor), ptr.add(wp), rem);
3535 }
3536 }
3537 wp += rem;
3538 *last_squeezed = 256;
3539 }
3540 break;
3541 }
3542 }
3543 }
3544 wp
3545}
3546
3547fn squeeze_single_stream(
3548 ch: u8,
3549 reader: &mut impl Read,
3550 writer: &mut impl Write,
3551) -> io::Result<()> {
3552 let pair = [ch, ch];
3553 let finder = memchr::memmem::Finder::new(&pair);
3554 let mut buf = alloc_uninit_vec(STREAM_BUF);
3555 let mut was_squeeze_char = false;
3556
3557 loop {
3558 let n = read_once(reader, &mut buf)?;
3559 if n == 0 {
3560 break;
3561 }
3562 let wp = squeeze_single_compact(&mut buf, n, ch, &finder, &mut was_squeeze_char);
3563 if wp > 0 {
3564 writer.write_all(&buf[..wp])?;
3565 }
3566 }
3567 Ok(())
3568}
3569
3570#[inline]
3572fn squeeze_single_compact(
3573 buf: &mut [u8],
3574 n: usize,
3575 ch: u8,
3576 finder: &memchr::memmem::Finder<'_>,
3577 was_squeeze_char: &mut bool,
3578) -> usize {
3579 let mut i = 0;
3580
3581 if *was_squeeze_char {
3583 while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3584 i += 1;
3585 }
3586 *was_squeeze_char = false;
3587 if i >= n {
3588 *was_squeeze_char = true;
3589 return 0;
3590 }
3591 }
3592
3593 let ptr = buf.as_mut_ptr();
3594 let mut wp = 0usize;
3595
3596 loop {
3597 match finder.find(&buf[i..n]) {
3598 Some(offset) => {
3599 let seg_end = i + offset + 1;
3600 let gap = seg_end - i;
3601 if gap > 0 {
3602 if wp != i {
3603 unsafe {
3604 std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
3605 }
3606 }
3607 wp += gap;
3608 }
3609 i = seg_end;
3610 while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3611 i += 1;
3612 }
3613 if i >= n {
3614 *was_squeeze_char = true;
3615 break;
3616 }
3617 }
3618 None => {
3619 let rem = n - i;
3620 if rem > 0 {
3621 if wp != i {
3622 unsafe {
3623 std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
3624 }
3625 }
3626 wp += rem;
3627 }
3628 *was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == ch;
3629 break;
3630 }
3631 }
3632 }
3633 wp
3634}
3635
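/// Translation for a buffer we own outright (the whole input is already in
/// memory): translate in place, parallelising across chunks above
/// OWNED_PARALLEL_MIN, then issue a single write.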
3636pub fn translate_owned(
3644 set1: &[u8],
3645 set2: &[u8],
3646 data: &mut [u8],
3647 writer: &mut impl Write,
3648) -> io::Result<()> {
3649 let table = build_translate_table(set1, set2);
3650
3651 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3653 if is_identity {
3654 return writer.write_all(data);
3655 }
3656
3657 const OWNED_PARALLEL_MIN: usize = 64 * 1024 * 1024;
3661
3662 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3664 if data.len() >= OWNED_PARALLEL_MIN {
3665 let n_threads = rayon::current_num_threads().max(1);
3666 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3667 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3668 translate_range_simd_inplace(chunk, lo, hi, offset);
3669 });
3670 } else {
3671 translate_range_simd_inplace(data, lo, hi, offset);
3672 }
3673 return writer.write_all(data);
3674 }
3675
3676 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3678 if data.len() >= OWNED_PARALLEL_MIN {
3679 let n_threads = rayon::current_num_threads().max(1);
3680 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3681 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3682 translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
3683 });
3684 } else {
3685 translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3686 }
3687 return writer.write_all(data);
3688 }
3689
3690 if data.len() >= OWNED_PARALLEL_MIN {
3692 let n_threads = rayon::current_num_threads().max(1);
3693 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3694 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3695 translate_inplace(chunk, &table);
3696 });
3697 } else {
3698 translate_inplace(data, &table);
3699 }
3700 writer.write_all(data)
3701}
3702
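/// Translation for memory-mapped input that must not be modified: output is
/// produced into a separate buffer, in parallel for large inputs, otherwise in
/// 256 KiB chunks to stay cache-friendly.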
3703pub fn translate_mmap(
3717 set1: &[u8],
3718 set2: &[u8],
3719 data: &[u8],
3720 writer: &mut impl Write,
3721) -> io::Result<()> {
3722 let table = build_translate_table(set1, set2);
3723
3724 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3726 if is_identity {
3727 return writer.write_all(data);
3728 }
3729
3730 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3732 return translate_mmap_range(data, writer, lo, hi, offset);
3733 }
3734
3735 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3737 return translate_mmap_range_to_constant(data, writer, lo, hi, replacement);
3738 }
3739
3740 translate_mmap_table(data, writer, &table)
3742}
3743
3744fn translate_mmap_range(
3746 data: &[u8],
3747 writer: &mut impl Write,
3748 lo: u8,
3749 hi: u8,
3750 offset: i8,
3751) -> io::Result<()> {
3752 if data.len() >= PARALLEL_THRESHOLD {
3754 let mut buf = alloc_uninit_vec(data.len());
3755 let n_threads = rayon::current_num_threads().max(1);
3756 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3757
3758 data.par_chunks(chunk_size)
3760 .zip(buf.par_chunks_mut(chunk_size))
3761 .for_each(|(src_chunk, dst_chunk)| {
3762 translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
3763 });
3764
3765 return writer.write_all(&buf);
3766 }
3767
3768 const CHUNK: usize = 256 * 1024;
3770 let buf_size = data.len().min(CHUNK);
3771 let mut buf = alloc_uninit_vec(buf_size);
3772 for chunk in data.chunks(CHUNK) {
3773 translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
3774 writer.write_all(&buf[..chunk.len()])?;
3775 }
3776 Ok(())
3777}
3778
3779fn translate_mmap_range_to_constant(
3782 data: &[u8],
3783 writer: &mut impl Write,
3784 lo: u8,
3785 hi: u8,
3786 replacement: u8,
3787) -> io::Result<()> {
3788 if data.len() >= PARALLEL_THRESHOLD {
3790 let mut buf = alloc_uninit_vec(data.len());
3791 let n_threads = rayon::current_num_threads().max(1);
3792 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3793
3794 data.par_chunks(chunk_size)
3796 .zip(buf.par_chunks_mut(chunk_size))
3797 .for_each(|(src_chunk, dst_chunk)| {
3798 dst_chunk[..src_chunk.len()].copy_from_slice(src_chunk);
3799 translate_range_to_constant_simd_inplace(
3800 &mut dst_chunk[..src_chunk.len()],
3801 lo,
3802 hi,
3803 replacement,
3804 );
3805 });
3806
3807 return writer.write_all(&buf);
3808 }
3809
3810 const CHUNK: usize = 256 * 1024;
3812 let buf_size = data.len().min(CHUNK);
3813 let mut buf = alloc_uninit_vec(buf_size);
3814 for chunk in data.chunks(CHUNK) {
3815 buf[..chunk.len()].copy_from_slice(chunk);
3816 translate_range_to_constant_simd_inplace(&mut buf[..chunk.len()], lo, hi, replacement);
3817 writer.write_all(&buf[..chunk.len()])?;
3818 }
3819 Ok(())
3820}
3821
3822fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
3824 if data.len() >= PARALLEL_THRESHOLD {
3826 let mut buf = alloc_uninit_vec(data.len());
3827 let n_threads = rayon::current_num_threads().max(1);
3828 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3829
3830 data.par_chunks(chunk_size)
3831 .zip(buf.par_chunks_mut(chunk_size))
3832 .for_each(|(src_chunk, dst_chunk)| {
3833 translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
3834 });
3835
3836 return writer.write_all(&buf);
3837 }
3838
3839 const CHUNK: usize = 256 * 1024;
3841 let buf_size = data.len().min(CHUNK);
3842 let mut buf = alloc_uninit_vec(buf_size);
3843 for chunk in data.chunks(CHUNK) {
3844 translate_to(chunk, &mut buf[..chunk.len()], table);
3845 writer.write_all(&buf[..chunk.len()])?;
3846 }
3847 Ok(())
3848}
3849
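/// Like `translate_mmap`, but for a mapping the caller is allowed to mutate
/// (e.g. a private mapping): translate the mapped bytes in place and write them
/// out, avoiding the extra output buffer.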
3850pub fn translate_mmap_inplace(
3857 set1: &[u8],
3858 set2: &[u8],
3859 data: &mut [u8],
3860 writer: &mut impl Write,
3861) -> io::Result<()> {
3862 let table = build_translate_table(set1, set2);
3863
3864 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3866 if is_identity {
3867 return writer.write_all(data);
3868 }
3869
3870 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3877 if data.len() >= PARALLEL_THRESHOLD {
3878 let n_threads = rayon::current_num_threads().max(1);
3879 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3880 data.par_chunks_mut(chunk_size)
3881 .for_each(|chunk| translate_range_simd_inplace(chunk, lo, hi, offset));
3882 } else {
3883 translate_range_simd_inplace(data, lo, hi, offset);
3884 }
3885 return writer.write_all(data);
3886 }
3887
3888 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3890 if data.len() >= PARALLEL_THRESHOLD {
3891 let n_threads = rayon::current_num_threads().max(1);
3892 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3893 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3894 translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement)
3895 });
3896 } else {
3897 translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3898 }
3899 return writer.write_all(data);
3900 }
3901
3902 if data.len() >= PARALLEL_THRESHOLD {
3904 let n_threads = rayon::current_num_threads().max(1);
3905 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3906 data.par_chunks_mut(chunk_size)
3907 .for_each(|chunk| translate_inplace(chunk, &table));
3908 } else {
3909 translate_inplace(data, &table);
3910 }
3911 writer.write_all(data)
3912}
3913
3914fn translate_to_separate_buf(
3922 data: &[u8],
3923 table: &[u8; 256],
3924 writer: &mut impl Write,
3925) -> io::Result<()> {
3926 let range_info = detect_range_offset(table);
3927 let const_info = if range_info.is_none() {
3928 detect_range_to_constant(table)
3929 } else {
3930 None
3931 };
3932
3933 if data.len() >= PARALLEL_THRESHOLD {
3934 let mut out_buf = alloc_uninit_vec(data.len());
3936 let n_threads = rayon::current_num_threads().max(1);
3937 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3938
3939 if let Some((lo, hi, offset)) = range_info {
3940 data.par_chunks(chunk_size)
3941 .zip(out_buf.par_chunks_mut(chunk_size))
3942 .for_each(|(src, dst)| {
3943 translate_range_simd(src, &mut dst[..src.len()], lo, hi, offset);
3944 });
3945 } else if let Some((lo, hi, replacement)) = const_info {
3946 data.par_chunks(chunk_size)
3947 .zip(out_buf.par_chunks_mut(chunk_size))
3948 .for_each(|(src, dst)| {
3949 translate_range_to_constant_simd(
3950 src,
3951 &mut dst[..src.len()],
3952 lo,
3953 hi,
3954 replacement,
3955 );
3956 });
3957 } else {
3958 data.par_chunks(chunk_size)
3959 .zip(out_buf.par_chunks_mut(chunk_size))
3960 .for_each(|(src, dst)| {
3961 translate_to(src, &mut dst[..src.len()], table);
3962 });
3963 }
3964 return writer.write_all(&out_buf);
3965 }
3966
3967 let mut out_buf = alloc_uninit_vec(data.len());
3973 if let Some((lo, hi, offset)) = range_info {
3974 translate_range_simd(data, &mut out_buf, lo, hi, offset);
3975 } else if let Some((lo, hi, replacement)) = const_info {
3976 translate_range_to_constant_simd(data, &mut out_buf, lo, hi, replacement);
3977 } else {
3978 translate_to(data, &mut out_buf, table);
3979 }
3980 writer.write_all(&out_buf)
3981}
3982
3983pub fn translate_mmap_readonly(
3987 set1: &[u8],
3988 set2: &[u8],
3989 data: &[u8],
3990 writer: &mut impl Write,
3991) -> io::Result<()> {
3992 let table = build_translate_table(set1, set2);
3993
3994 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3996 if is_identity {
3997 return writer.write_all(data);
3998 }
3999
4000 translate_to_separate_buf(data, &table, writer)
4001}
4002
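/// Translate + squeeze over memory-mapped input: translate into a scratch
/// buffer (in parallel for large inputs), then squeeze that buffer in place in
/// one sequential pass, since squeezing carries state from byte to byte.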
4003pub fn translate_squeeze_mmap(
4009 set1: &[u8],
4010 set2: &[u8],
4011 data: &[u8],
4012 writer: &mut impl Write,
4013) -> io::Result<()> {
4014 let table = build_translate_table(set1, set2);
4015 let squeeze_set = build_member_set(set2);
4016
4017 if data.len() >= PARALLEL_THRESHOLD {
4022 let mut translated = alloc_uninit_vec(data.len());
4024 let range_info = detect_range_offset(&table);
4025 let n_threads = rayon::current_num_threads().max(1);
4026 let chunk_size = (data.len() / n_threads).max(32 * 1024);
4027
4028 if let Some((lo, hi, offset)) = range_info {
4029 data.par_chunks(chunk_size)
4030 .zip(translated.par_chunks_mut(chunk_size))
4031 .for_each(|(src_chunk, dst_chunk)| {
4032 translate_range_simd(
4033 src_chunk,
4034 &mut dst_chunk[..src_chunk.len()],
4035 lo,
4036 hi,
4037 offset,
4038 );
4039 });
4040 } else {
4041 data.par_chunks(chunk_size)
4042 .zip(translated.par_chunks_mut(chunk_size))
4043 .for_each(|(src_chunk, dst_chunk)| {
4044 translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
4045 });
4046 }
4047
4048 let mut last_squeezed: u16 = 256;
4052 let len = translated.len();
4053 let mut wp = 0;
4054 unsafe {
4055 let ptr = translated.as_mut_ptr();
4056 let mut i = 0;
4057 while i < len {
4058 let b = *ptr.add(i);
4059 if is_member(&squeeze_set, b) {
4060 if last_squeezed == b as u16 {
4061 i += 1;
4062 continue;
4063 }
4064 last_squeezed = b as u16;
4065 } else {
4066 last_squeezed = 256;
4067 }
4068 *ptr.add(wp) = b;
4069 wp += 1;
4070 i += 1;
4071 }
4072 }
4073 return writer.write_all(&translated[..wp]);
4074 }
4075
4076 let mut buf = alloc_uninit_vec(data.len());
4079 translate_to(data, &mut buf, &table);
4080 let mut last_squeezed: u16 = 256;
4081 let mut wp = 0;
4082 unsafe {
4083 let ptr = buf.as_mut_ptr();
4084 for i in 0..data.len() {
4085 let b = *ptr.add(i);
4086 if is_member(&squeeze_set, b) {
4087 if last_squeezed == b as u16 {
4088 continue;
4089 }
4090 last_squeezed = b as u16;
4091 } else {
4092 last_squeezed = 256;
4093 }
4094 *ptr.add(wp) = b;
4095 wp += 1;
4096 }
4097 }
4098 writer.write_all(&buf[..wp])
4099}
4100
4101pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4107 if delete_chars.len() == 1 {
4108 return delete_single_char_mmap(delete_chars[0], data, writer);
4109 }
4110 if delete_chars.len() <= 3 {
4111 return delete_multi_memchr_mmap(delete_chars, data, writer);
4112 }
4113
4114 if let Some((lo, hi)) = detect_delete_range(delete_chars) {
4116 return delete_range_mmap(data, writer, lo, hi);
4117 }
4118
4119 let member = build_member_set(delete_chars);
4120
4121 let sample_size = data.len().min(1024);
4126 let sample_deletes = data[..sample_size]
4127 .iter()
4128 .filter(|&&b| is_member(&member, b))
4129 .count();
4130 let estimated_deletes = if sample_size > 0 {
4131 data.len() * sample_deletes / sample_size
4132 } else {
4133 data.len()
4134 };
4135
4136 if estimated_deletes < MAX_IOV / 2 {
4137 return delete_bitset_zerocopy(data, &member, writer);
4138 }
4139
4140 if data.len() >= PARALLEL_THRESHOLD {
4142 let n_threads = rayon::current_num_threads().max(1);
4143 let chunk_size = (data.len() / n_threads).max(32 * 1024);
4144
4145 let mut outbuf = alloc_uninit_vec(data.len());
4146 let chunk_lens: Vec<usize> = data
4147 .par_chunks(chunk_size)
4148 .zip(outbuf.par_chunks_mut(chunk_size))
4149 .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
4150 .collect();
4151
4152 let slices: Vec<std::io::IoSlice> = chunk_lens
4156 .iter()
4157 .enumerate()
4158 .filter(|&(_, &len)| len > 0)
4159 .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
4160 .collect();
4161 return write_ioslices(writer, &slices);
4162 }
4163
4164 const COMPACT_BUF: usize = 256 * 1024;
4167 let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
4168
4169 for chunk in data.chunks(COMPACT_BUF) {
4170 let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
4171 if out_pos > 0 {
4172 writer.write_all(&outbuf[..out_pos])?;
4173 }
4174 }
4175 Ok(())
4176}
4177
4178fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
4183 let sample_size = data.len().min(1024);
4185 let sample_deletes = data[..sample_size]
4186 .iter()
4187 .filter(|&&b| b >= lo && b <= hi)
4188 .count();
4189 let estimated_deletes = if sample_size > 0 {
4195 data.len() * sample_deletes / sample_size
4196 } else {
4197 data.len()
4198 };
4199 if estimated_deletes < MAX_IOV / 2 {
4200 return delete_range_mmap_zerocopy(data, writer, lo, hi);
4201 }
4202
4203 if data.len() >= PARALLEL_THRESHOLD {
4205 let n_threads = rayon::current_num_threads().max(1);
4206 let chunk_size = (data.len() / n_threads).max(32 * 1024);
4207
4208 let mut outbuf = alloc_uninit_vec(data.len());
4209 let chunk_lens: Vec<usize> = data
4210 .par_chunks(chunk_size)
4211 .zip(outbuf.par_chunks_mut(chunk_size))
4212 .map(|(src_chunk, dst_chunk)| delete_range_chunk(src_chunk, dst_chunk, lo, hi))
4213 .collect();
4214
4215 let slices: Vec<std::io::IoSlice> = chunk_lens
4218 .iter()
4219 .enumerate()
4220 .filter(|&(_, &len)| len > 0)
4221 .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
4222 .collect();
4223 return write_ioslices(writer, &slices);
4224 }
4225
4226 const COMPACT_BUF: usize = 256 * 1024;
4230 let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
4231
4232 #[cfg(target_arch = "x86_64")]
4233 {
4234 let mut wp = 0;
4235 let level = get_simd_level();
4236 let len = data.len();
4237 let sp = data.as_ptr();
4238 let dp = outbuf.as_mut_ptr();
4239 let mut ri = 0;
4240
4241 if level >= 3 {
4242 use std::arch::x86_64::*;
4243 let range = hi - lo;
4244 let bias_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8) };
4245 let threshold_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8) };
4246 let zero = unsafe { _mm256_setzero_si256() };
4247
4248 while ri + 32 <= len {
4249 if wp + 32 > COMPACT_BUF {
4251 writer.write_all(&outbuf[..wp])?;
4252 wp = 0;
4253 }
4254
4255 let input = unsafe { _mm256_loadu_si256(sp.add(ri) as *const _) };
4256 let biased = unsafe { _mm256_add_epi8(input, bias_v) };
4257 let gt = unsafe { _mm256_cmpgt_epi8(biased, threshold_v) };
4258 let in_range = unsafe { _mm256_cmpeq_epi8(gt, zero) };
4259 let keep_mask = !(unsafe { _mm256_movemask_epi8(in_range) } as u32);
4260
4261 if keep_mask == 0xFFFFFFFF {
4262 unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32) };
4263 wp += 32;
4264 } else if keep_mask != 0 {
4265 let m0 = keep_mask as u8;
4266 let m1 = (keep_mask >> 8) as u8;
4267 let m2 = (keep_mask >> 16) as u8;
4268 let m3 = (keep_mask >> 24) as u8;
4269
4270 if m0 == 0xFF {
4271 unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8) };
4272 } else if m0 != 0 {
4273 unsafe { compact_8bytes_simd(sp.add(ri), dp.add(wp), m0) };
4274 }
4275 let c0 = m0.count_ones() as usize;
4276
4277 if m1 == 0xFF {
4278 unsafe {
4279 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8)
4280 };
4281 } else if m1 != 0 {
4282 unsafe { compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1) };
4283 }
4284 let c1 = m1.count_ones() as usize;
4285
4286 if m2 == 0xFF {
4287 unsafe {
4288 std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8)
4289 };
4290 } else if m2 != 0 {
4291 unsafe { compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2) };
4292 }
4293 let c2 = m2.count_ones() as usize;
4294
4295 if m3 == 0xFF {
4296 unsafe {
4297 std::ptr::copy_nonoverlapping(
4298 sp.add(ri + 24),
4299 dp.add(wp + c0 + c1 + c2),
4300 8,
4301 )
4302 };
4303 } else if m3 != 0 {
4304 unsafe {
4305 compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3)
4306 };
4307 }
4308 let c3 = m3.count_ones() as usize;
4309 wp += c0 + c1 + c2 + c3;
4310 }
4311 ri += 32;
4312 }
4313 }
4314
4315 while ri < len {
4317 if wp + 1 > COMPACT_BUF {
4318 writer.write_all(&outbuf[..wp])?;
4319 wp = 0;
4320 }
4321 let b = unsafe { *sp.add(ri) };
4322 unsafe { *dp.add(wp) = b };
4323 wp += (b < lo || b > hi) as usize;
4324 ri += 1;
4325 }
4326
4327 if wp > 0 {
4328 writer.write_all(&outbuf[..wp])?;
4329 }
4330 return Ok(());
4331 }
4332
4333 #[cfg(not(target_arch = "x86_64"))]
4334 {
4335 for chunk in data.chunks(COMPACT_BUF) {
4337 let clen = delete_range_chunk(chunk, &mut outbuf, lo, hi);
4338 if clen > 0 {
4339 writer.write_all(&outbuf[..clen])?;
4340 }
4341 }
4342 return Ok(());
4343 }
4344
4345 #[allow(unreachable_code)]
4346 Ok(())
4347}
4348
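// Zero-copy range delete: instead of compacting, collect the runs of kept bytes
// as IoSlices borrowing directly from the mapping and hand them to
// write_vectored in MAX_IOV batches. Worth it only when deletions are sparse.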
4349fn delete_range_mmap_zerocopy(
4354 data: &[u8],
4355 writer: &mut impl Write,
4356 lo: u8,
4357 hi: u8,
4358) -> io::Result<()> {
4359 #[cfg(target_arch = "x86_64")]
4360 {
4361 if get_simd_level() >= 3 {
4362 return unsafe { delete_range_zerocopy_avx2(data, writer, lo, hi) };
4363 }
4364 if get_simd_level() >= 2 {
4365 return unsafe { delete_range_zerocopy_sse2(data, writer, lo, hi) };
4366 }
4367 }
4368
4369 #[cfg(target_arch = "aarch64")]
4370 {
4371 return unsafe { delete_range_zerocopy_neon(data, writer, lo, hi) };
4372 }
4373
4374 #[allow(unreachable_code)]
4376 delete_range_zerocopy_scalar(data, writer, lo, hi)
4377}
4378
4379fn delete_range_zerocopy_scalar(
4382 data: &[u8],
4383 writer: &mut impl Write,
4384 lo: u8,
4385 hi: u8,
4386) -> io::Result<()> {
4387 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4388 let len = data.len();
4389 let mut run_start: usize = 0;
4390 let mut i: usize = 0;
4391
4392 while i < len {
4393 let b = unsafe { *data.get_unchecked(i) };
4394 if b >= lo && b <= hi {
4395 if i > run_start {
4396 iov.push(std::io::IoSlice::new(&data[run_start..i]));
4397 if iov.len() >= MAX_IOV {
4398 write_ioslices(writer, &iov)?;
4399 iov.clear();
4400 }
4401 }
4402 run_start = i + 1;
4403 }
4404 i += 1;
4405 }
4406 if run_start < len {
4407 iov.push(std::io::IoSlice::new(&data[run_start..]));
4408 }
4409 if !iov.is_empty() {
4410 write_ioslices(writer, &iov)?;
4411 }
4412 Ok(())
4413}
4414
4415#[cfg(target_arch = "x86_64")]
4419#[target_feature(enable = "avx2")]
4420unsafe fn delete_range_zerocopy_avx2(
4421 data: &[u8],
4422 writer: &mut impl Write,
4423 lo: u8,
4424 hi: u8,
4425) -> io::Result<()> {
4426 use std::arch::x86_64::*;
4427
4428 unsafe {
4429 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4430 let len = data.len();
4431 let mut run_start: usize = 0;
4432 let mut ri: usize = 0;
4433
4434 let range = hi - lo;
4435 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
4436 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
4437 let zero = _mm256_setzero_si256();
4438
4439 while ri + 32 <= len {
4440 let input = _mm256_loadu_si256(data.as_ptr().add(ri) as *const _);
4441 let biased = _mm256_add_epi8(input, bias_v);
4442 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
4443 let in_range = _mm256_cmpeq_epi8(gt, zero);
4444 let del_mask = _mm256_movemask_epi8(in_range) as u32;
4445
4446 if del_mask == 0 {
4447 ri += 32;
4449 continue;
4450 }
4451
4452 let mut m = del_mask;
4454 while m != 0 {
4455 let bit = m.trailing_zeros() as usize;
4456 let abs_pos = ri + bit;
4457 if abs_pos > run_start {
4458 iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4459 if iov.len() >= MAX_IOV {
4460 write_ioslices(writer, &iov)?;
4461 iov.clear();
4462 }
4463 }
4464 run_start = abs_pos + 1;
4465 m &= m - 1;
4466 }
4467
4468 ri += 32;
4469 }
4470
4471 while ri < len {
4473 let b = *data.get_unchecked(ri);
4474 if b >= lo && b <= hi {
4475 if ri > run_start {
4476 iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4477 if iov.len() >= MAX_IOV {
4478 write_ioslices(writer, &iov)?;
4479 iov.clear();
4480 }
4481 }
4482 run_start = ri + 1;
4483 }
4484 ri += 1;
4485 }
4486
4487 if run_start < len {
4488 iov.push(std::io::IoSlice::new(&data[run_start..]));
4489 }
4490 if !iov.is_empty() {
4491 write_ioslices(writer, &iov)?;
4492 }
4493 Ok(())
4494 }
4495}
4496
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn delete_range_zerocopy_sse2(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    use std::arch::x86_64::*;

    unsafe {
        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
        let len = data.len();
        let mut run_start: usize = 0;
        let mut ri: usize = 0;

        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm_setzero_si128();

        while ri + 16 <= len {
            let input = _mm_loadu_si128(data.as_ptr().add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            let del_mask = _mm_movemask_epi8(in_range) as u32 & 0xFFFF;

            if del_mask == 0 {
                ri += 16;
                continue;
            }

            let mut m = del_mask;
            while m != 0 {
                let bit = m.trailing_zeros() as usize;
                let abs_pos = ri + bit;
                if abs_pos > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = abs_pos + 1;
                m &= m - 1;
            }

            ri += 16;
        }

        while ri < len {
            let b = *data.get_unchecked(ri);
            if b >= lo && b <= hi {
                if ri > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = ri + 1;
            }
            ri += 1;
        }

        if run_start < len {
            iov.push(std::io::IoSlice::new(&data[run_start..]));
        }
        if !iov.is_empty() {
            write_ioslices(writer, &iov)?;
        }
        Ok(())
    }
}

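/// NEON variant for aarch64. NEON has no movemask instruction, so the
/// per-lane 0xFF/0x00 compare result is ANDed with per-lane bit weights
/// (1, 2, 4, ..., 128) and folded with pairwise widening adds into one bit per
/// byte, yielding the 16-bit deletion mask that drives the shared run logic.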
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn delete_range_zerocopy_neon(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    use std::arch::aarch64::*;

    unsafe {
        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
        let len = data.len();
        let mut run_start: usize = 0;
        let mut ri: usize = 0;

        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let bit_mask: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
        let bit_mask_v = vld1q_u8(bit_mask.as_ptr());

        while ri + 16 <= len {
            let input = vld1q_u8(data.as_ptr().add(ri));
            let ge_lo = vcgeq_u8(input, lo_v);
            let le_hi = vcleq_u8(input, hi_v);
            let in_range = vandq_u8(ge_lo, le_hi);

            let bits = vandq_u8(in_range, bit_mask_v);
            let pair = vpaddlq_u8(bits);
            let quad = vpaddlq_u16(pair);
            let octet = vpaddlq_u32(quad);
            let mask_lo = vgetq_lane_u64::<0>(octet) as u8;
            let mask_hi = vgetq_lane_u64::<1>(octet) as u8;
            let del_mask = (mask_hi as u16) << 8 | mask_lo as u16;

            if del_mask == 0 {
                ri += 16;
                continue;
            }

            let mut m = del_mask;
            while m != 0 {
                let bit = m.trailing_zeros() as usize;
                let abs_pos = ri + bit;
                if abs_pos > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = abs_pos + 1;
                m &= m - 1;
            }

            ri += 16;
        }

        while ri < len {
            let b = *data.get_unchecked(ri);
            if b >= lo && b <= hi {
                if ri > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = ri + 1;
            }
            ri += 1;
        }

        if run_start < len {
            iov.push(std::io::IoSlice::new(&data[run_start..]));
        }
        if !iov.is_empty() {
            write_ioslices(writer, &iov)?;
        }
        Ok(())
    }
}

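/// Branchless bitset delete into a caller-provided buffer: every byte is
/// written to `outbuf` unconditionally and the write cursor advances only when
/// the byte is *not* in the deletion set, so kept bytes overwrite deleted ones.
/// Eight bytes are handled per iteration to shorten the dependency chain.
/// Returns the number of bytes kept; `outbuf` must be at least `chunk.len()` long.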
#[inline]
fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
    let len = chunk.len();
    let mut out_pos = 0;
    let mut i = 0;

    while i + 8 <= len {
        unsafe {
            let b0 = *chunk.get_unchecked(i);
            let b1 = *chunk.get_unchecked(i + 1);
            let b2 = *chunk.get_unchecked(i + 2);
            let b3 = *chunk.get_unchecked(i + 3);
            let b4 = *chunk.get_unchecked(i + 4);
            let b5 = *chunk.get_unchecked(i + 5);
            let b6 = *chunk.get_unchecked(i + 6);
            let b7 = *chunk.get_unchecked(i + 7);

            *outbuf.get_unchecked_mut(out_pos) = b0;
            out_pos += !is_member(member, b0) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b1;
            out_pos += !is_member(member, b1) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b2;
            out_pos += !is_member(member, b2) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b3;
            out_pos += !is_member(member, b3) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b4;
            out_pos += !is_member(member, b4) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b5;
            out_pos += !is_member(member, b5) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b6;
            out_pos += !is_member(member, b6) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b7;
            out_pos += !is_member(member, b7) as usize;
        }
        i += 8;
    }

    while i < len {
        unsafe {
            let b = *chunk.get_unchecked(i);
            *outbuf.get_unchecked_mut(out_pos) = b;
            out_pos += !is_member(member, b) as usize;
        }
        i += 1;
    }

    out_pos
}

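/// Zero-copy delete for an arbitrary character set: membership is tested
/// against the 256-bit set and contiguous runs of kept bytes are emitted as
/// borrowed `IoSlice`s instead of being copied into a temporary buffer.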
fn delete_bitset_zerocopy(
    data: &[u8],
    member: &[u8; 32],
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let len = data.len();
    let mut i = 0;
    let mut run_start: Option<usize> = None;

    while i < len {
        let b = unsafe { *data.get_unchecked(i) };
        if is_member(member, b) {
            if let Some(rs) = run_start {
                iov.push(std::io::IoSlice::new(&data[rs..i]));
                run_start = None;
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
        } else if run_start.is_none() {
            run_start = Some(i);
        }
        i += 1;
    }
    if let Some(rs) = run_start {
        iov.push(std::io::IoSlice::new(&data[rs..]));
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}

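/// Delete a single character: `memchr_iter` locates each occurrence and the
/// spans between occurrences are written out unchanged.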
fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut last = 0;
    for pos in memchr::memchr_iter(ch, data) {
        if pos > last {
            iov.push(std::io::IoSlice::new(&data[last..pos]));
            if iov.len() >= MAX_IOV {
                write_ioslices(writer, &iov)?;
                iov.clear();
            }
        }
        last = pos + 1;
    }
    if last < data.len() {
        iov.push(std::io::IoSlice::new(&data[last..]));
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}

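/// Delete two or three characters using `memchr2_iter`/`memchr3_iter`, which
/// are considerably faster than a generic membership test for such tiny sets.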
fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let c0 = chars[0];
    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
    let is_three = chars.len() >= 3;

    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut last = 0;

    macro_rules! process_pos {
        ($pos:expr) => {
            if $pos > last {
                iov.push(std::io::IoSlice::new(&data[last..$pos]));
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
            last = $pos + 1;
        };
    }

    if is_three {
        for pos in memchr::memchr3_iter(c0, c1, c2, data) {
            process_pos!(pos);
        }
    } else {
        for pos in memchr::memchr2_iter(c0, c1, data) {
            process_pos!(pos);
        }
    }
    if last < data.len() {
        iov.push(std::io::IoSlice::new(&data[last..]));
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}

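/// Combined delete + squeeze (`tr -ds`): deleted bytes are dropped first, and
/// runs of squeeze-set bytes that survive deletion collapse to one occurrence.
/// `last_squeezed` uses 256 as an out-of-band sentinel so byte value 0 is
/// handled correctly. Output is built in a single pre-sized buffer.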
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);

    let mut outbuf = alloc_uninit_vec(data.len());
    let mut last_squeezed: u16 = 256;
    let mut out_pos = 0;

    for &b in data.iter() {
        if is_member(&delete_set, b) {
            continue;
        }
        if is_member(&squeeze_set, b) {
            if last_squeezed == b as u16 {
                continue;
            }
            last_squeezed = b as u16;
        } else {
            last_squeezed = 256;
        }
        unsafe {
            *outbuf.get_unchecked_mut(out_pos) = b;
        }
        out_pos += 1;
    }
    writer.write_all(&outbuf[..out_pos])
}

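/// Squeeze repeats (`tr -s`). Sets of one to three characters dispatch to the
/// memchr-based paths; larger sets use the bitset. Inputs at or above
/// `PARALLEL_THRESHOLD` are squeezed per chunk with rayon, and a chunk whose
/// predecessor ended in a squeezable byte drops its own leading repeats of that
/// byte, so a run that straddles a chunk boundary still collapses to one copy.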
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    let member = build_member_set(squeeze_chars);

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let mut outbuf = alloc_uninit_vec(data.len());
    let len = data.len();
    let mut wp = 0;
    let mut i = 0;
    let mut last_squeezed: u16 = 256;

    unsafe {
        let inp = data.as_ptr();
        let outp = outbuf.as_mut_ptr();

        while i < len {
            let b = *inp.add(i);
            if is_member(&member, b) {
                if last_squeezed != b as u16 {
                    *outp.add(wp) = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                i += 1;
                while i < len && *inp.add(i) == b {
                    i += 1;
                }
            } else {
                last_squeezed = 256;
                *outp.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
    }
    writer.write_all(&outbuf[..wp])
}

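/// Per-chunk squeeze worker for the parallel path: the same logic as the
/// sequential bitset loop, but writing into its own output `Vec` so chunks can
/// be processed independently and stitched together afterwards.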
fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
    let len = chunk.len();
    let mut out = Vec::with_capacity(len);
    let mut last_squeezed: u16 = 256;
    let mut i = 0;

    unsafe {
        out.set_len(len);
        let inp = chunk.as_ptr();
        let outp: *mut u8 = out.as_mut_ptr();
        let mut wp = 0;

        while i < len {
            let b = *inp.add(i);
            if is_member(member, b) {
                if last_squeezed != b as u16 {
                    *outp.add(wp) = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                i += 1;
                while i < len && *inp.add(i) == b {
                    i += 1;
                }
            } else {
                last_squeezed = 256;
                *outp.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        out.set_len(wp);
    }
    out
}

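/// Squeeze for exactly two or three characters (`N` is 2 or 3). Large inputs
/// reuse the parallel bitset path; otherwise `memchr2`/`memchr3` jump straight
/// to the next candidate byte and the unchanged spans between candidates are
/// emitted zero-copy as `IoSlice`s.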
fn squeeze_multi_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let member = build_member_set(chars);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut cursor = 0;
    let mut last_squeezed: u16 = 256;

    macro_rules! find_next {
        ($data:expr) => {
            if N == 2 {
                memchr::memchr2(chars[0], chars[1], $data)
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], $data)
            }
        };
    }

    while cursor < data.len() {
        match find_next!(&data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let b = data[pos];
                if pos > cursor {
                    iov.push(std::io::IoSlice::new(&data[cursor..pos]));
                    last_squeezed = 256;
                }
                if last_squeezed != b as u16 {
                    iov.push(std::io::IoSlice::new(&data[pos..pos + 1]));
                    last_squeezed = b as u16;
                }
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == b {
                    skip += 1;
                }
                cursor = skip;
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
            None => {
                if cursor < data.len() {
                    iov.push(std::io::IoSlice::new(&data[cursor..]));
                }
                break;
            }
        }
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}

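/// Squeeze a single repeated character. A pre-check with `memmem::find` on the
/// two-byte pattern `[ch, ch]` lets inputs with no adjacent repeats be written
/// straight through; otherwise each doubled occurrence is found with a
/// `Finder`, one copy of the character is kept, and the rest of the run is skipped.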
fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    let pair = [ch, ch];
    if memchr::memmem::find(data, &pair).is_none() {
        return writer.write_all(data);
    }

    let finder = memchr::memmem::Finder::new(&pair);
    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(2048);
    let mut cursor = 0;

    while cursor < data.len() {
        match finder.find(&data[cursor..]) {
            Some(offset) => {
                let pair_pos = cursor + offset;
                let seg_end = pair_pos + 1;
                if seg_end > cursor {
                    iov.push(std::io::IoSlice::new(&data[cursor..seg_end]));
                }
                let mut skip = seg_end;
                while skip < data.len() && data[skip] == ch {
                    skip += 1;
                }
                cursor = skip;
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
            None => {
                if cursor < data.len() {
                    iov.push(std::io::IoSlice::new(&data[cursor..]));
                }
                break;
            }
        }
    }

    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}
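
// A minimal sanity-check sketch, not part of the original file: it only
// exercises behaviour that is evident from the code above, using `Vec<u8>` as
// the `Write` sink. The module and test names are illustrative.
#[cfg(test)]
mod zerocopy_sketch_tests {
    use super::*;

    #[test]
    fn squeeze_single_collapses_runs() {
        // Runs of 'l' collapse to a single 'l'; everything else passes through.
        let mut out = Vec::new();
        squeeze_mmap(b"l", b"hellllo world", &mut out).unwrap();
        assert_eq!(out, b"helo world".to_vec());
    }

    #[test]
    fn delete_then_squeeze_matches_tr_ds() {
        // Like `tr -d a -s b` on "aabbaabb": delete the a's, then squeeze the b's.
        let mut out = Vec::new();
        delete_squeeze_mmap(b"a", b"b", b"aabbaabb", &mut out).unwrap();
        assert_eq!(out, b"b".to_vec());
    }

    #[test]
    fn bitset_delete_removes_vowels() {
        // Arbitrary-set delete via the 256-bit membership table.
        let member = build_member_set(b"aeiou");
        let mut out = Vec::new();
        delete_bitset_zerocopy(b"hello world", &member, &mut out).unwrap();
        assert_eq!(out, b"hll wrld".to_vec());
    }
}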